diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4bd6274 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +MMIU-Benchmark/ diff --git a/VLMEvalKit/all.json b/VLMEvalKit/all.json new file mode 100644 index 0000000..d268617 --- /dev/null +++ b/VLMEvalKit/all.json @@ -0,0 +1,207939 @@ +[ + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_0_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_0_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_0_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_0_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_1_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_1_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_1_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_1_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_2_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_2_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_2_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_2_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_3_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_3_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_3_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_3_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_4_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_4_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_4_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_4_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_5_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_5_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_5_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_5_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_6_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_6_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_6_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_6_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_7_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_7_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_7_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_7_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_8_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_8_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_8_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_8_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_9_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_9_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_9_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_9_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_10_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_10_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_10_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_10_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_11_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_11_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_11_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_11_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_12_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_12_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_12_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_12_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_13_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_13_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_13_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_13_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_14_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_14_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_14_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_14_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_15_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_15_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_15_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_15_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_16_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_16_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_16_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_16_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_17_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_17_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_17_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_17_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_18_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_18_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_18_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_18_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_19_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_19_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_19_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_19_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_20_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_20_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_20_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_20_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_21_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_21_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_21_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_21_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_22_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_22_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_22_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_22_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_23_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_23_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_23_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_23_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_24_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_24_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_24_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_24_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_25_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_25_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_25_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_25_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_26_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_26_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_26_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_26_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_27_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_27_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_27_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_27_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_28_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_28_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_28_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_28_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_29_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_29_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_29_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_29_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_30_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_30_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_30_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_30_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_31_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_31_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_31_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_31_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_32_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_32_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_32_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_32_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_33_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_33_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_33_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_33_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_34_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_34_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_34_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_34_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_35_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_35_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_35_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_35_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_36_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_36_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_36_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_36_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_37_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_37_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_37_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_37_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_38_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_38_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_38_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_38_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_39_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_39_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_39_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_39_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_40_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_40_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_40_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_40_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_41_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_41_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_41_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_41_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_42_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_42_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_42_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_42_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_43_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_43_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_43_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_43_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_44_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_44_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_44_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_44_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_45_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_45_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_45_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_45_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_46_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_46_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_46_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_46_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_47_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_47_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_47_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_47_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_48_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_48_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_48_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_48_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_49_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_49_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_49_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_49_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_50_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_50_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_50_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_50_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_51_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_51_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_51_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_51_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_52_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_52_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_52_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_52_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_53_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_53_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_53_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_53_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_54_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_54_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_54_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_54_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_55_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_55_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_55_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_55_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_56_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_56_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_56_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_56_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_57_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_57_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_57_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_57_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_58_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_58_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_58_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_58_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_59_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_59_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_59_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_59_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_60_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_60_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_60_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_60_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_61_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_61_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_61_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_61_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_62_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_62_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_62_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_62_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_63_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_63_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_63_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_63_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_64_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_64_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_64_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_64_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_65_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_65_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_65_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_65_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_66_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_66_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_66_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_66_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_67_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_67_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_67_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_67_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_68_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_68_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_68_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_68_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_69_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_69_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_69_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_69_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_70_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_70_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_70_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_70_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_71_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_71_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_71_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_71_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_72_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_72_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_72_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_72_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_73_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_73_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_73_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_73_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_74_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_74_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_74_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_74_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_75_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_75_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_75_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_75_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_76_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_76_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_76_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_76_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_77_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_77_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_77_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_77_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_78_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_78_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_78_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_78_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_79_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_79_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_79_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_79_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_80_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_80_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_80_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_80_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_81_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_81_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_81_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_81_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_82_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_82_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_82_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_82_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_83_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_83_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_83_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_83_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_84_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_84_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_84_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_84_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_85_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_85_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_85_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_85_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_86_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_86_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_86_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_86_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_87_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_87_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_87_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_87_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_88_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_88_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_88_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_88_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_89_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_89_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_89_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_89_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_90_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_90_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_90_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_90_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_91_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_91_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_91_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_91_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_92_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_92_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_92_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_92_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_93_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_93_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_93_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_93_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_94_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_94_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_94_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_94_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_95_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_95_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_95_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_95_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_96_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_96_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_96_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_96_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_97_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_97_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_97_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_97_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_98_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_98_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_98_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_98_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_99_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_99_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_99_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_99_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_100_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_100_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_100_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_100_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_101_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_101_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_101_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_101_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_102_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_102_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_102_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_102_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_103_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_103_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_103_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_103_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_104_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_104_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_104_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_104_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_105_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_105_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_105_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_105_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_106_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_106_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_106_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_106_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_107_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_107_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_107_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_107_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_108_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_108_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_108_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_108_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_109_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_109_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_109_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_109_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_110_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_110_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_110_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_110_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_111_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_111_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_111_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_111_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_112_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_112_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_112_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_112_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_113_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_113_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_113_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_113_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_114_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_114_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_114_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_114_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_115_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_115_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_115_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_115_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_116_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_116_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_116_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_116_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_117_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_117_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_117_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_117_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_118_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_118_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_118_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_118_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_119_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_119_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_119_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_119_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_120_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_120_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_120_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_120_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_121_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_121_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_121_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_121_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_122_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_122_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_122_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_122_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_123_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_123_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_123_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_123_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_124_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_124_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_124_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_124_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_125_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_125_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_125_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_125_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_126_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_126_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_126_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_126_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_127_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_127_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_127_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_127_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_128_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_128_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_128_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_128_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_129_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_129_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_129_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_129_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_130_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_130_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_130_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_130_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_blink", + "visual_input_component": "natural image and synthetic image", + "source": "blink", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a real photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_131_0.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_131_1.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_131_2.jpg", + "../MMIU-Benchmark/forensic_detection_blink/forensic_detection_blink_131_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_0_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_0_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_0_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_0_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and 
now you are given the four images. Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_1_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_1_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_1_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_1_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_2_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_2_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_2_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_2_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_3_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_3_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_3_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_3_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_4_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_4_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_4_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_4_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_5_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_5_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_5_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_5_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_6_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_6_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_6_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_6_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_7_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_7_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_7_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_7_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_8_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_8_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_8_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_8_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_9_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_9_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_9_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_9_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_10_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_10_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_10_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_10_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_11_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_11_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_11_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_11_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_12_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_12_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_12_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_12_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_13_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_13_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_13_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_13_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_14_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_14_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_14_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_14_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_15_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_15_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_15_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_15_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_16_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_16_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_16_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_16_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_17_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_17_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_17_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_17_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_18_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_18_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_18_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_18_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_19_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_19_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_19_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_19_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_20_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_20_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_20_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_20_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_21_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_21_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_21_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_21_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_22_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_22_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_22_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_22_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_23_0.png", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_23_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_23_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_23_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_24_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_24_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_24_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_24_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_25_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_25_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_25_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_25_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_26_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_26_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_26_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_26_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_27_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_27_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_27_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_27_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_28_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_28_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_28_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_28_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_29_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_29_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_29_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_29_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_30_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_30_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_30_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_30_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_31_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_31_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_31_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_31_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_32_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_32_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_32_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_32_3.png" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_33_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_33_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_33_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_33_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_34_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_34_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_34_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_34_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_35_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_35_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_35_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_35_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_36_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_36_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_36_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_36_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_37_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_37_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_37_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_37_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_38_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_38_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_38_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_38_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_39_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_39_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_39_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_39_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_40_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_40_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_40_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_40_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_41_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_41_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_41_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_41_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_42_0.png", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_42_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_42_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_42_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_43_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_43_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_43_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_43_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_44_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_44_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_44_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_44_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_45_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_45_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_45_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_45_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_46_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_46_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_46_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_46_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_47_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_47_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_47_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_47_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_48_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_48_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_48_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_48_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_49_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_49_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_49_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_49_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_50_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_50_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_50_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_50_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_51_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_51_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_51_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_51_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_52_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_52_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_52_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_52_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_53_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_53_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_53_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_53_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_54_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_54_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_54_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_54_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_55_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_55_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_55_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_55_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_56_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_56_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_56_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_56_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_57_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_57_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_57_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_57_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_58_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_58_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_58_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_58_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_59_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_59_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_59_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_59_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_60_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_60_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_60_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_60_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_61_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_61_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_61_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_61_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_62_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_62_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_62_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_62_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_63_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_63_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_63_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_63_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_64_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_64_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_64_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_64_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_65_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_65_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_65_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_65_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_66_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_66_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_66_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_66_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_67_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_67_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_67_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_67_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_68_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_68_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_68_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_68_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_69_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_69_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_69_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_69_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_70_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_70_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_70_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_70_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_71_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_71_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_71_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_71_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_72_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_72_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_72_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_72_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_73_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_73_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_73_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_73_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_74_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_74_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_74_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_74_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_75_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_75_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_75_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_75_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_76_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_76_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_76_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_76_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_77_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_77_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_77_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_77_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_78_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_78_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_78_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_78_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_79_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_79_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_79_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_79_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_80_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_80_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_80_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_80_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_81_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_81_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_81_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_81_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_82_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_82_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_82_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_82_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_83_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_83_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_83_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_83_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_84_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_84_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_84_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_84_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_85_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_85_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_85_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_85_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_86_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_86_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_86_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_86_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_87_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_87_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_87_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_87_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_88_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_88_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_88_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_88_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_89_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_89_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_89_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_89_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_90_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_90_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_90_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_90_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_91_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_91_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_91_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_91_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_92_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_92_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_92_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_92_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_93_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_93_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_93_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_93_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_94_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_94_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_94_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_94_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_95_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_95_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_95_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_95_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_96_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_96_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_96_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_96_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_97_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_97_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_97_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_97_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_98_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_98_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_98_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_98_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_99_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_99_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_99_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_99_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_100_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_100_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_100_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_100_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_101_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_101_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_101_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_101_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_102_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_102_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_102_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_102_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_103_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_103_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_103_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_103_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_104_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_104_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_104_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_104_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_105_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_105_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_105_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_105_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_106_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_106_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_106_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_106_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_107_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_107_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_107_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_107_3.png" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_108_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_108_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_108_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_108_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_109_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_109_1.png", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_109_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_109_3.png" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_110_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_110_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_110_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_110_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_111_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_111_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_111_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_111_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_112_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_112_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_112_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_112_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_113_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_113_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_113_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_113_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_114_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_114_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_114_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_114_3.png" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_115_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_115_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_115_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_115_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_116_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_116_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_116_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_116_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_117_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_117_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_117_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_117_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_118_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_118_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_118_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_118_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_119_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_119_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_119_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_119_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_120_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_120_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_120_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_120_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_121_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_121_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_121_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_121_3.png" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_122_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_122_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_122_2.png", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_122_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_123_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_123_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_123_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_123_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_124_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_124_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_124_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_124_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_125_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_125_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_125_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_125_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_126_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_126_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_126_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_126_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_127_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_127_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_127_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_127_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_128_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_128_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_128_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_128_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_129_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_129_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_129_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_129_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_130_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_130_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_130_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_130_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_131_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_131_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_131_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_131_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_132_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_132_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_132_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_132_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_133_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_133_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_133_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_133_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_134_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_134_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_134_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_134_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_135_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_135_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_135_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_135_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_136_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_136_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_136_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_136_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_137_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_137_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_137_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_137_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_138_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_138_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_138_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_138_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_139_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_139_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_139_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_139_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_140_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_140_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_140_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_140_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_141_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_141_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_141_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_141_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_142_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_142_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_142_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_142_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_143_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_143_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_143_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_143_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_144_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_144_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_144_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_144_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_145_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_145_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_145_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_145_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_146_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_146_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_146_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_146_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_147_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_147_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_147_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_147_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_148_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_148_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_148_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_148_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_149_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_149_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_149_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_149_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_150_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_150_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_150_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_150_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_151_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_151_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_151_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_151_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_152_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_152_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_152_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_152_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_153_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_153_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_153_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_153_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_154_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_154_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_154_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_154_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_155_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_155_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_155_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_155_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_156_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_156_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_156_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_156_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_157_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_157_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_157_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_157_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_158_0.png", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_158_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_158_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_158_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_159_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_159_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_159_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_159_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_160_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_160_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_160_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_160_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_161_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_161_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_161_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_161_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_162_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_162_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_162_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_162_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_163_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_163_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_163_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_163_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_164_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_164_1.png", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_164_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_164_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_165_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_165_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_165_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_165_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_166_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_166_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_166_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_166_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_167_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_167_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_167_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_167_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_168_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_168_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_168_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_168_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_169_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_169_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_169_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_169_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_170_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_170_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_170_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_170_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_171_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_171_1.png", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_171_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_171_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_172_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_172_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_172_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_172_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_173_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_173_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_173_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_173_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_174_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_174_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_174_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_174_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_175_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_175_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_175_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_175_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_176_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_176_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_176_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_176_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_177_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_177_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_177_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_177_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_178_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_178_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_178_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_178_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_179_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_179_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_179_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_179_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_180_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_180_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_180_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_180_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_181_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_181_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_181_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_181_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_182_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_182_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_182_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_182_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_183_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_183_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_183_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_183_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_184_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_184_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_184_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_184_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_185_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_185_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_185_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_185_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_186_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_186_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_186_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_186_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_187_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_187_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_187_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_187_3.jpg" + ], + "output": "B" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_188_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_188_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_188_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_188_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_189_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_189_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_189_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_189_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_190_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_190_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_190_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_190_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_191_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_191_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_191_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_191_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_192_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_192_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_192_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_192_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_193_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_193_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_193_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_193_3.png" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_194_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_194_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_194_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_194_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_195_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_195_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_195_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_195_3.jpg" + ], + "output": "C" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_196_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_196_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_196_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_196_3.jpg" + ], + "output": "A" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_197_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_197_1.png", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_197_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_197_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_198_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_198_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_198_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_198_3.jpg" + ], + "output": "D" + }, + { + "task": "forensic_detection_forgerynet", + "visual_input_component": "natural image and synthetic image", + "source": "forgerynet", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to be a fake photograph?", + "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_199_0.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_199_1.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_199_2.jpg", + "../MMIU-Benchmark/forensic_detection_forgerynet/forensic_detection_forgerynet_199_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Are both of these images relatively realistic?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_0_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_0_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the first image sharper than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_1_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_1_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: overexposure\nB: low light\nC: noise\nD: blur\n", + "question": "Which distortion is missing in the second image compared to the first image?", + "context": "Candidates: A. overexposure B. low light C. noise D. blur", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_2_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_2_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The sky in the first image\nB: The figure's back in the second image\nC: The building in the center of the first image\nD: The shop window in the second image\n", + "question": "Which part of the two images is more affected by underexposure?", + "context": "Candidates: A. The sky in the first image B. The figure's back in the second image C. The building in the center of the first image D. 
The shop window in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_3_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_3_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The ground in the first image\nB: The dog in the first image\nC: The baby in the second image\n", + "question": "Which part below is affected by motion blur?", + "context": "Candidates: A. The ground in the first image B. The dog in the first image C. The baby in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_4_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_4_1.JPG" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Much weaker\nB: About the same\nC: Much stronger\n", + "question": "Compared to the second image, how is the lighting in the first image?", + "context": "Candidates: A. Much weaker B. About the same C. Much stronger", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_5_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_5_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Similar\nB: Slightly better\nC: Slightly worse\n", + "question": "Compared to the second image, how is the lighting situation in the first image?", + "context": "Candidates: A. Similar B. Slightly better C. 
Slightly worse", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_6_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_6_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the illumination of the second image stronger than the first image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_7_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_7_1.bmp" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the first image clearer than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_8_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_8_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Does the first image have more overexposure distortion than the second image?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_9_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_9_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The person in the first image\nB: The telephone booth in the first image\nC: The background in the second image\n", + "question": "Which part is most affected by motion blur?", + "context": "Candidates: A. The person in the first image B. The telephone booth in the first image C. The background in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_10_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_10_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the first image more realistic than the second image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_11_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_11_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the lighting of the first image stronger than the second image?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_12_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_12_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Motion blur\nB: Overexposure\nC: Out of focus\n", + "question": "What kind of distortion issue do these two images not have?", + "context": "Candidates: A. Motion blur B. Overexposure C. Out of focus", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_13_0.bmp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_13_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the first image more blurry than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_14_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_14_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Underexposure\nB: Blur\nC: Motion blur\nD: Overexposure\n", + "question": "What problem is not present in the two images?", + "context": "Candidates: A. Underexposure B. Blur C. Motion blur D. 
Overexposure", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_15_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_15_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Richer\nB: About the same\nC: Less rich\n", + "question": "Compared to the first image, how does the texture detail level of the second image look like?", + "context": "Candidates: A. Richer B. About the same C. Less rich", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_16_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_16_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The woman's face in the second image\nB: The blanket in the second image\nC: The grassland background in the first image\nD: The dog's fur in the first image\n", + "question": "Which area has clearer details and textures?", + "context": "Candidates: A. The woman's face in the second image B. The blanket in the second image C. The grassland background in the first image D. The dog's fur in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_17_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_17_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Are the colors of these two images both monotonous?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_18_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_18_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The background of the first image\nB: The apple in the first image\nC: The black and white wall of the second image\n", + "question": "Which part is most seriously affected by overexposure?", + "context": "Candidates: A. The background of the first image B. The apple in the first image C. The black and white wall of the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_19_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_19_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the composition of the first image better than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_20_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_20_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: worse\nB: similar\nC: better\n", + "question": "Compared to the second image, how is the composition of the first image?", + "context": "Candidates: A. worse B. similar C. 
better", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_21_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_21_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Blurrier\nB: Clearer\nC: About the same\n", + "question": "Relative to the first image, how clear is the second image?", + "context": "Candidates: A. Blurrier B. Clearer C. About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_22_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_22_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Less realistic\nB: More realistic\nC: About the same\n", + "question": "Compared to the first image, how would you rate the realism of the second image?", + "context": "Candidates: A. Less realistic B. More realistic C. About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_23_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_23_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The left side of the second image\nB: The dog in the second image\nC: The figures in the first image\n", + "question": "Which part below is most severely affected by overexposure?", + "context": "Candidates: A. The left side of the second image B. The dog in the second image C. 
The figures in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_24_0.webp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_24_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the color of the first image more vivid than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_25_0.JPG", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_25_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: More realistic\nB: Less realistic\nC: About the same\n", + "question": "Compared to the first image, how real is the second image?", + "context": "Candidates: A. More realistic B. Less realistic C. About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_26_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_26_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the second image sharper than the first image?", + "context": "Candidates: A. Yes B. 
No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_27_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_27_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the first image blurrier than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_28_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_28_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Much worse\nB: About the same\nC: Much better\n", + "question": "Compared to the first image, how is the composition of the second image?", + "context": "Candidates: A. Much worse B. About the same C. Much better", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_29_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_29_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Street lamp in the first image\nB: Pedestrian in the second image\nC: Ground in the first image\n", + "question": "Which part below is most severely affected by overexposure?", + "context": "Candidates: A. Street lamp in the first image B. Pedestrian in the second image C. 
Ground in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_30_0.bmp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_30_1.bmp" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Snowflake\nB: Strong light\nC: Low light\nD: Overexposure\n", + "question": "In the problem of which is more severe between the first image and the second image, which of the following is not present?", + "context": "Candidates: A. Snowflake B. Strong light C. Low light D. Overexposure", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_31_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_31_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: much worse\nB: almost the same\nC: much worse\nD: much better\n", + "question": "Compared to the first image, how is the aesthetic composition of the second image?", + "context": "Candidates: A. much worse B. almost the same C. much worse D. much better", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_32_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_32_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the first image sharper than the second image?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_33_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_33_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Second image\nB: First image\n", + "question": "Which image is affected more by overexposure?", + "context": "Candidates: A. Second image B. First image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_34_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_34_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Less rich\nB: About the same\nC: Richer\n", + "question": "Compared to the first image, how rich are the texture details in the second image?", + "context": "Candidates: A. Less rich B. About the same C. Richer", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_35_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_35_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: About the same\nB: Blurrier\nC: Clearer\n", + "question": "Compared to the first image, how is the clarity of the second image?", + "context": "Candidates: A. About the same B. Blurrier C. 
Clearer", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_36_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_36_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Clearer\nB: Blurrier\nC: About the same\n", + "question": "Compared to the first image, how is the clarity of the second image?", + "context": "Candidates: A. Clearer B. Blurrier C. About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_37_0.JPG", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_37_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Ground in the first image\nB: Dog in the first image\nC: Person in the second image\n", + "question": "Which part below is most affected by motion blur?", + "context": "Candidates: A. Ground in the first image B. Dog in the first image C. Person in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_38_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_38_1.bmp" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The sky in the second image\nB: The person in the second image\nC: The strawberry in the first image\n", + "question": "Which part below is most severely affected by overexposure?", + "context": "Candidates: A. The sky in the second image B. The person in the second image C. 
The strawberry in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_39_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_39_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Sharper\nB: Blurrier\nC: About the same\n", + "question": "Compared to the first image, how is the sharpness of the second image?", + "context": "Candidates: A. Sharper B. Blurrier C. About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_40_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_40_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The focused red flowers in the second image\nB: The flower bush background in the second image\nC: The background in the first image\nD: The man's silhouette in the first image\n", + "question": "Which area is more affected by blurring?", + "context": "Candidates: A. The focused red flowers in the second image B. The flower bush background in the second image C. The background in the first image D. 
The man's silhouette in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_41_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_41_1.JPG" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Second image\nB: First image\n", + "question": "Which of the following images has a more serious overexposure issue?", + "context": "Candidates: A. Second image B. First image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_42_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_42_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the composition of the first image better than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_43_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_43_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the second image more realistic than the first image?", + "context": "Candidates: A. Yes B. 
No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_44_0.webp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_44_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the texture detail of the first image richer than the second image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_45_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_45_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: About the same\nB: Slightly sharper\nC: Slightly more blurry\n", + "question": "Compared to the first image, how is the sharpness of the second image?", + "context": "Candidates: A. About the same B. Slightly sharper C. Slightly more blurry", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_46_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_46_1.webp" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Checkerboard ground in the first image\nB: Horse in the second image\nC: Background in the second image\n", + "question": "Which part has the most severe issue of losing texture details?", + "context": "Candidates: A. Checkerboard ground in the first image B. Horse in the second image C. 
Background in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_47_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_47_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The ground in the second image\nB: The waves in the first image\nC: The plants in the second image\n", + "question": "Which part below is most severely affected by snowflakes?", + "context": "Candidates: A. The ground in the second image B. The waves in the first image C. The plants in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_48_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_48_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Are both of these images very clear?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_49_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_49_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: House windows in the second image\nB: Banana in the first image\nC: Facial features of the person in the first image\n", + "question": "Which part below suffers the most severe underexposure problem?", + "context": "Candidates: A. House windows in the second image B. Banana in the first image C. 
Facial features of the person in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_50_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_50_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Much less\nB: About the same\nC: Much more\n", + "question": "How is the noise situation in the second image compared to the first image?", + "context": "Candidates: A. Much less B. About the same C. Much more", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_51_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_51_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: the road in the second image\nB: the background of the first image\nC: the ground of the first image\n", + "question": "Which part below is most severely affected by snowflake-like distortion?", + "context": "Candidates: A. the road in the second image B. the background of the first image C. the ground of the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_52_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_52_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The floor in the first image\nB: The ground in the second image\nC: The hand holding a gun in the second image\n", + "question": "Which part has the richest detail texture?", + "context": "Candidates: A. The floor in the first image B. 
The ground in the second image C. The hand holding a gun in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_53_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_53_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Similar\nB: More sufficient\nC: Less sufficient\n", + "question": "Compared to the first image, how is the illumination of the second image?", + "context": "Candidates: A. Similar B. More sufficient C. Less sufficient", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_54_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_54_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Window in the second image\nB: Aircraft in the first image\nC: Person in the second image\n", + "question": "Which part below is most severely affected by overexposure?", + "context": "Candidates: A. Window in the second image B. Aircraft in the first image C. Person in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_55_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_55_1.JPG" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Are there severe motion blur in both images?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_56_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_56_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The person in red in the second image\nB: The facial part of the person in the first image\nC: The sunglasses in the first image\nD: The top of the tent in the second image\n", + "question": "Which part below is most severely affected by overexposure?", + "context": "Candidates: A. The person in red in the second image B. The facial part of the person in the first image C. The sunglasses in the first image D. The top of the tent in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_57_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_57_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Background of the first image\nB: Character in the second image\nC: Character in the first image\n", + "question": "Which part is most severely affected by noise?", + "context": "Candidates: A. Background of the first image B. Character in the second image C. 
Character in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_58_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_58_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Similar\nB: More Adequate\nC: Less Adequate\n", + "question": "Compared to the first image, how is the illumination of the second image?", + "context": "Candidates: A. Similar B. More Adequate C. Less Adequate", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_59_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_59_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Similar\nB: Less realistic\nC: More realistic\n", + "question": "Compared with the first image, how does the authenticity of the second image differ?", + "context": "Candidates: A. Similar B. Less realistic C. More realistic", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_60_0.bmp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_60_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the first image more realistic than the second image?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_61_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_61_1.bmp" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: the top of the first image\nB: bird in the second image\nC: ground in the second image\n", + "question": "Which part below is most affected by overexposure?", + "context": "Candidates: A. the top of the first image B. bird in the second image C. ground in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_62_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_62_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Similar\nB: More fake\nC: More authentic\n", + "question": "Compared to the second image, how does the first image's authenticity compare?", + "context": "Candidates: A. Similar B. More fake C. More authentic", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_63_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_63_1.webp" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the first image clearer than the second image?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_64_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_64_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Are the details and textures in the first image clearer than those in the second image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_65_0.webp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_65_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: More blurry\nB: Clearer\nC: About the same\n", + "question": "Compared to the first image, how is the sharpness of the second image?", + "context": "Candidates: A. More blurry B. Clearer C. About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_66_0.bmp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_66_1.JPG" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Are the two images both quite clear?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_67_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_67_1.JPG" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the first image sharper than the second image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_68_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_68_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Are there noise issues in both of these images?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_69_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_69_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Much more severe\nB: Similar\nC: Much slighter\n", + "question": "How does the noise situation in the second image compare to the first image?", + "context": "Candidates: A. Much more severe B. Similar C. 
Much slighter", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_70_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_70_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Much worse\nB: Much better\nC: About the same\n", + "question": "Compared to the first image, how is the focusing situation of the second image?", + "context": "Candidates: A. Much worse B. Much better C. About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_71_0.JPG", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_71_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: More authentic\nB: About the same\nC: Less authentic\n", + "question": "Compared to the first image, how does the authenticity of the second image compare?", + "context": "Candidates: A. More authentic B. About the same C. Less authentic", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_72_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_72_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Compared to the second image, is the detail texture of the first image clearer?", + "context": "Candidates: A. Yes B. 
No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_73_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_73_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Similar\nB: Much worse\nC: Much better\n", + "question": "How is the focus of the second image relative to the first image?", + "context": "Candidates: A. Similar B. Much worse C. Much better", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_74_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_74_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Underexposure\nB: Motion blur\nC: Overexposure\nD: Blur\n", + "question": "Which kind of distortion is not present in the two images?", + "context": "Candidates: A. Underexposure B. Motion blur C. Overexposure D. Blur", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_75_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_75_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Overexposure\nB: Focus problem\nC: Noise\n", + "question": "What is the distortion that does not appear in the two images?", + "context": "Candidates: A. Overexposure B. Focus problem C. 
Noise", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_76_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_76_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: First image\nB: Second image\n", + "question": "Which image does not have overexposure distortion issue?", + "context": "Candidates: A. First image B. Second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_77_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_77_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the focus of the first image not as good as the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_78_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_78_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: About the same\nB: Clearer\nC: Blurrier\n", + "question": "Compared to the first image, how clear are the texture details of the subject in the second image?", + "context": "Candidates: A. About the same B. Clearer C. 
Blurrier", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_79_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_79_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: About the same\nB: Much clearer\nC: Much blurrier\n", + "question": "Compared to the first image, how is the clarity of texture details in the second image?", + "context": "Candidates: A. About the same B. Much clearer C. Much blurrier", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_80_0.bmp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_80_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The wall in the first image\nB: The large tree on the right side in the second image\nC: The street light in the second image\nD: The clothes hanger in the first image\n", + "question": "Which part below is most affected by overexposure?", + "context": "Candidates: A. The wall in the first image B. The large tree on the right side in the second image C. The street light in the second image D. 
The clothes hanger in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_81_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_81_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: About the same\nB: More blurry\nC: Clearer\n", + "question": "Compared to the first image, how is the clarity of the second image?", + "context": "Candidates: A. About the same B. More blurry C. Clearer", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_82_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_82_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Less authentic\nB: About the same\nC: More authentic\n", + "question": "Compared to the first image, how is the authenticity of the second image?", + "context": "Candidates: A. Less authentic B. About the same C. More authentic", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_83_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_83_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the first image sharper than the second image?", + "context": "Candidates: A. Yes B. 
No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_84_0.JPG", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_84_1.bmp" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Less real\nB: About the same\nC: More real\n", + "question": "Compared to the first image, how does the reality of the second image compare?", + "context": "Candidates: A. Less real B. About the same C. More real", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_85_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_85_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the noise in the first image much more obvious than in the second image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_86_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_86_1.bmp" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Similar\nB: More realistic\nC: Less realistic\n", + "question": "How does the realism of the second image compare to the first image?", + "context": "Candidates: A. Similar B. More realistic C. 
Less realistic", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_87_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_87_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Are both images not genuine?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_88_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_88_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the first image more realistic than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_89_0.webp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_89_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: More blurry\nB: Clearer\nC: About the same\n", + "question": "Compared to the first image, how is the clarity of the second image?", + "context": "Candidates: A. More blurry B. Clearer C. 
About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_90_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_90_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Much higher\nB: About the same\nC: Much lower\n", + "question": "Compared to the second image, how is the pixel quality of the first image?", + "context": "Candidates: A. Much higher B. About the same C. Much lower", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_91_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_91_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Slightly more\nB: More severe\nC: About the same\n", + "question": "Compared to the first image, how much is the second image affected by motion blur?", + "context": "Candidates: A. Slightly more B. More severe C. About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_92_0.webp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_92_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the first image clearer than the second image?", + "context": "Candidates: A. Yes B. 
No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_93_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_93_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Similar\nB: More realistic\nC: Less realistic\n", + "question": "How does the realism of the second image compare to the first image?", + "context": "Candidates: A. Similar B. More realistic C. Less realistic", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_94_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_94_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the first image sharper than the second image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_95_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_95_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the color of the first image more rich and vivid than the second image?", + "context": "Candidates: A. Yes B. 
No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_96_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_96_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the first image significantly less clear than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_97_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_97_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The sky in the upper right corner of the second image\nB: The buildings in the second image\nC: The lake surface in the first image\n", + "question": "Which part below has the most severe overexposure?", + "context": "Candidates: A. The sky in the upper right corner of the second image B. The buildings in the second image C. The lake surface in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_98_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_98_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: More abundant\nB: Less abundant\nC: About the same\n", + "question": "Compared to the first image, how is the texture detail in the second image?", + "context": "Candidates: A. More abundant B. Less abundant C. 
About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_99_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_99_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Background of the first image\nB: Stamen of the second image\nC: Person in the first image\n", + "question": "Which part below is most severely affected by out-of-focus?", + "context": "Candidates: A. Background of the first image B. Stamen of the second image C. Person in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_100_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_100_1.JPG" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Underexposed\nB: Blurry\nC: Motion blur\nD: Overexposed\n", + "question": "Compared to the second image, what kind of distortion does the first image not have?", + "context": "Candidates: A. Underexposed B. Blurry C. Motion blur D. Overexposed", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_101_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_101_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: More blurry\nB: About the same\nC: Clearer\n", + "question": "Compared to the first image, how is the clarity of the subject's details and textures in the second image?", + "context": "Candidates: A. More blurry B. About the same C. 
Clearer", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_102_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_102_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Lens flare\nB: Motion blur\nC: Overexposure\nD: Noise\n", + "question": "What kind of distortion do not appear in these two images?", + "context": "Candidates: A. Lens flare B. Motion blur C. Overexposure D. Noise", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_103_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_103_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Blur\nB: Motion blur\nC: Underexposure\nD: Overexposure\n", + "question": "What kind of distortion is not present in the two images?", + "context": "Candidates: A. Blur B. Motion blur C. Underexposure D. Overexposure", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_104_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_104_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: richer\nB: about the same\nC: more monotonous\n", + "question": "Compared to the first image, how rich is the color in the second image?", + "context": "Candidates: A. richer B. about the same C. 
more monotonous", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_105_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_105_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the color of the first image richer than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_106_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_106_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the sharpness of the first image lower than the second image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_107_0.bmp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_107_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the color of the first image richer than the second image?", + "context": "Candidates: A. Yes B. 
No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_108_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_108_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Similar\nB: Worse\nC: Better\n", + "question": "How does the composition of the second image compare to the first image?", + "context": "Candidates: A. Similar B. Worse C. Better", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_109_0.bmp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_109_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The sky in the first image\nB: The person in the first image\nC: The bus in the second image\n", + "question": "Which part below is most affected by noise?", + "context": "Candidates: A. The sky in the first image B. The person in the first image C. The bus in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_110_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_110_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Underexposure\nB: Low light\nC: Out of focus\nD: Noise\n", + "question": "Which type of distortion is more severe in the second image compared to the first image?", + "context": "Candidates: A. Underexposure B. Low light C. Out of focus D. 
Noise", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_111_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_111_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: More severe\nB: Slightly more\nC: About the same\n", + "question": "Compared to the second image, how is the first image affected by underexposure?", + "context": "Candidates: A. More severe B. Slightly more C. About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_112_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_112_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Are both of these images very realistic?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_113_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_113_1.webp" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the illumination sufficient in both of these images?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_114_0.JPG", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_114_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the first image sharper than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_115_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_115_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: similar\nB: less rich\nC: richer\n", + "question": "Compared to the first image, how is the texture detail of the second image?", + "context": "Candidates: A. similar B. less rich C. richer", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_116_0.JPG", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_116_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the color of the first image more vivid than the second image?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_117_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_117_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the first image clearer than the second image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_118_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_118_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Front building in the second image\nB: Aircraft in the first image\nC: Left sky in the second image\n", + "question": "Which part below is most affected by overexposure?", + "context": "Candidates: A. Front building in the second image B. Aircraft in the first image C. Left sky in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_119_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_119_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the noise in the first image significantly more severe than in the second image?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_120_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_120_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the first image clearer than the second image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_121_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_121_1.webp" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Characters in the first image\nB: Top right corner of the second image\nC: Ground in the first image\n", + "question": "Which part below is most affected by overexposure?", + "context": "Candidates: A. Characters in the first image B. Top right corner of the second image C. Ground in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_122_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_122_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: First image\nB: Second image\n", + "question": "Which of the following images is most affected by motion blur?", + "context": "Candidates: A. First image B. 
Second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_123_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_123_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the illumination sufficient in these two images?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_124_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_124_1.bmp" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the fidelity of the first image lower than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_125_0.webp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_125_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Are both of these images clear?", + "context": "Candidates: A. Yes B. 
No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_126_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_126_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Background of the first image\nB: Background of the second image\nC: Person in the first image\n", + "question": "Which part below is most severely affected by motion blur?", + "context": "Candidates: A. Background of the first image B. Background of the second image C. Person in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_127_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_127_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Similar\nB: More blurry\nC: Clearer\n", + "question": "Compared to the first image, how is the clarity of the second image?", + "context": "Candidates: A. Similar B. More blurry C. Clearer", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_128_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_128_1.bmp" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Similar\nB: Clearer\nC: Blurrier\n", + "question": "How does the clarity of the second image compare to the first image?", + "context": "Candidates: A. Similar B. Clearer C. 
Blurrier", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_129_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_129_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Are both images rich in color?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_130_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_130_1.webp" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Second image\nB: First image\n", + "question": "Which of the following images has a serious overexposure issue?", + "context": "Candidates: A. Second image B. First image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_131_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_131_1.webp" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Are the colors of these two images not very vivid?", + "context": "Candidates: A. Yes B. 
No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_132_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_132_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Much better\nB: Much worse\nC: About the same\n", + "question": "Compared to the first image, how rich are the colors in the second image?", + "context": "Candidates: A. Much better B. Much worse C. About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_133_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_133_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Slightly worse\nB: Slightly better\nC: Much worse\nD: About the same\n", + "question": "Compared to the lighting of the second image, how is the lighting of the first image?", + "context": "Candidates: A. Slightly worse B. Slightly better C. Much worse D. About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_134_0.JPG", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_134_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Are both of these images clear?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_135_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_135_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Ground in the first image\nB: Car in the first image\nC: Plane in the second image\nD: Background in the second image\n", + "question": "Which area is more affected by motion blur?", + "context": "Candidates: A. Ground in the first image B. Car in the first image C. Plane in the second image D. Background in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_136_0.bmp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_136_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the lighting of the first image more sufficient than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_137_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_137_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Are both images very real?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_138_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_138_1.webp" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Are both of these images relatively blurry?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_139_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_139_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The player in the first image\nB: The horse in the second image\nC: The audience in the background of the first image\nD: The background in the second image\n", + "question": "In which area of the two images is more affected by motion blur?", + "context": "Candidates: A. The player in the first image B. The horse in the second image C. The audience in the background of the first image D. The background in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_140_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_140_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Ground of the second image\nB: Sky of the second image\nC: Ground of the first image\n", + "question": "Which part has the most severe overexposure issue?", + "context": "Candidates: A. Ground of the second image B. Sky of the second image C. 
Ground of the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_141_0.JPG", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_141_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the first image more realistic than the second image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_142_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_142_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the first image sharper than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_143_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_143_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: About the same\nB: Much blurrier\nC: Much clearer\n", + "question": "Compared to the first image, how is the clarity of the second image?", + "context": "Candidates: A. About the same B. Much blurrier C. 
Much clearer", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_144_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_144_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Background of the second image\nB: Table in front of the second image\nC: Grass in the first image\nD: Snowy mountain in the first image\n", + "question": "Which area is more severely affected by blurring?", + "context": "Candidates: A. Background of the second image B. Table in front of the second image C. Grass in the first image D. Snowy mountain in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_145_0.JPG", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_145_1.webp" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The ground in the second image\nB: The person in the second image\nC: The person in the first image\n", + "question": "Which part below is most affected by motion blur?", + "context": "Candidates: A. The ground in the second image B. The person in the second image C. 
The person in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_146_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_146_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Similar\nB: Slightly smaller\nC: Significantly larger\n", + "question": "Compared to the first image, how is the second image affected by overexposure?", + "context": "Candidates: A. Similar B. Slightly smaller C. Significantly larger", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_147_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_147_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the first image sharper than the second image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_148_0.png", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_148_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Less rich\nB: About the same\nC: Richer\n", + "question": "Compared to the first image, how is the color richness of the second image?", + "context": "Candidates: A. Less rich B. About the same C. 
Richer", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_149_0.JPG", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_149_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Much poorer\nB: Much richer\nC: About the same\n", + "question": "Compared to the first image, how is the richness of colors in the second image?", + "context": "Candidates: A. Much poorer B. Much richer C. About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_150_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_150_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Similar\nB: Better\nC: Worse\n", + "question": "Compared to the first photo, how is the focus of the second photo?", + "context": "Candidates: A. Similar B. Better C. Worse", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_151_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_151_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the composition of the first image better than the second image?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_152_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_152_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: More monotonous\nB: About the same\nC: More rich\n", + "question": "Compared to the first image, how is the color richness of the second image?", + "context": "Candidates: A. More monotonous B. About the same C. More rich", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_153_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_153_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Similar\nB: Clearer\nC: Blurrier\n", + "question": "Compared to the first image, how is the clarity of the second image?", + "context": "Candidates: A. Similar B. Clearer C. Blurrier", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_154_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_154_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Ground in the first image\nB: Sky in the second image\nC: Lion in the first image\n", + "question": "Which part below is most severely affected by overexposure?", + "context": "Candidates: A. Ground in the first image B. Sky in the second image C. 
Lion in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_155_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_155_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: More authentic\nB: About the same\nC: Less authentic\n", + "question": "Compared to the first image, how would you rate the authenticity of the second image?", + "context": "Candidates: A. More authentic B. About the same C. Less authentic", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_156_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_156_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Are both of these images very clear?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_157_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_157_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Have both figures in these two images been overexposed?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_158_0.bmp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_158_1.JPG" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Area in the first image\nB: Roof of the building in the second image\nC: Athlete in the first image\nD: Sky in the second image\n", + "question": "Which area is most affected by overexposure?", + "context": "Candidates: A. Area in the first image B. Roof of the building in the second image C. Athlete in the first image D. Sky in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_159_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_159_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The moon in the second image\nB: The person in the bottom right corner of the first image\nC: The left sky in the first image\n", + "question": "Which part below is most affected by overexposure?", + "context": "Candidates: A. The moon in the second image B. The person in the bottom right corner of the first image C. 
The left sky in the first image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_160_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_160_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the texture detail of the first image less rich than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_161_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_161_1.bmp" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Less real\nB: About the same\nC: More real\n", + "question": "Compared to the first image, how real is the second image?", + "context": "Candidates: A. Less real B. About the same C. More real", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_162_0.webp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_162_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: About the same\nB: More blurry\nC: Clearer\n", + "question": "Compared to the first image, how clear is the second image?", + "context": "Candidates: A. About the same B. More blurry C. 
Clearer", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_163_0.webp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_163_1.webp" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Low light\nB: Vignetting\nC: Noise\nD: Motion blur\n", + "question": "Which type of distortion does not appear in the two images?", + "context": "Candidates: A. Low light B. Vignetting C. Noise D. Motion blur", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_164_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_164_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Out of focus\nB: Noise\nC: Overexposure\n", + "question": "What kind of distortion did not appear in these two images?", + "context": "Candidates: A. Out of focus B. Noise C. Overexposure", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_165_0.bmp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_165_1.JPG" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: overexposure\nB: motion blur\nC: out of focus\n", + "question": "Are there any distortion issues in these two images?", + "context": "Candidates: A. overexposure B. motion blur C. 
out of focus", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_166_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_166_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the noise in the first image larger than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_167_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_167_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the sky in the second image more affected by overexposure than the sky in the first image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_168_0.bmp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_168_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Less sufficient\nB: About the same\nC: More sufficient\n", + "question": "Compared to the first image, how is the lighting in the second image?", + "context": "Candidates: A. Less sufficient B. About the same C. 
More sufficient", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_169_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_169_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the first image blurrier than the second image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_170_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_170_1.JPG" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the first image sharper than the second image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_171_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_171_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Much clearer\nB: About the same\nC: Much blurrier\n", + "question": "Compared to the second image, how is the fine texture of the first image?", + "context": "Candidates: A. Much clearer B. About the same C. 
Much blurrier", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_172_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_172_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Less sufficient\nB: More sufficient\nC: About the same\n", + "question": "How does the illumination of the second image compare to the first image?", + "context": "Candidates: A. Less sufficient B. More sufficient C. About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_173_0.bmp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_173_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Low light\nB: Blur\nC: Motion blur\n", + "question": "What problems are not present in the two images?", + "context": "Candidates: A. Low light B. Blur C. Motion blur", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_174_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_174_1.JPG" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Motion blur\nB: Underexposure\nC: Overexposure\nD: Weak light\n", + "question": "Which of the following distortions does not appear in the two images?", + "context": "Candidates: A. Motion blur B. Underexposure C. Overexposure D. 
Weak light", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_175_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_175_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the illumination sufficient in both of these images?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_176_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_176_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the first image sharper than the second image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_177_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_177_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the second image more realistic than the first image?", + "context": "Candidates: A. Yes B. 
No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_178_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_178_1.webp" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Blur\nB: Overexposure\nC: Underexposure\nD: Noise\n", + "question": "Please identify what kind of distortion is not present in these two images?", + "context": "Candidates: A. Blur B. Overexposure C. Underexposure D. Noise", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_179_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_179_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: similar\nB: more realistic\nC: less realistic\n", + "question": "Compared to the first image, how is the realism of the second image?", + "context": "Candidates: A. similar B. more realistic C. less realistic", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_180_0.webp", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_180_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the first image more realistic?", + "context": "Candidates: A. Yes B. 
No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_181_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_181_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Second image\nB: First image\n", + "question": "Which image below is more severely affected by overexposure?", + "context": "Candidates: A. Second image B. First image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_182_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_182_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Similar\nB: Worse\nC: Better\n", + "question": "How does the composition of the second image compare to the first image?", + "context": "Candidates: A. Similar B. Worse C. Better", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_183_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_183_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Very dark\nB: Much darker\nC: Much brighter\nD: About the same\n", + "question": "Compared to the first image, how is the lighting in the second image?", + "context": "Candidates: A. Very dark B. Much darker C. Much brighter D. 
About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_184_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_184_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The man in front of the lens in the first picture\nB: The bus in the first picture\nC: The fish in the second picture\nD: The leaves in the background of the second picture\n", + "question": "Which area is more affected by low light?", + "context": "Candidates: A. The man in front of the lens in the first picture B. The bus in the first picture C. The fish in the second picture D. The leaves in the background of the second picture", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_185_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_185_1.bmp" + ], + "output": "D" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Much better\nB: Much worse\nC: About the same\n", + "question": "Compared to the first image, how is the sharpness of the second image?", + "context": "Candidates: A. Much better B. Much worse C. About the same", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_186_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_186_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Are both of these images very blurry?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_187_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_187_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Are both of these images not very clear?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_188_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_188_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: The person riding a bike in the first image\nB: The background of the first image\nC: The plant in the second image\n", + "question": "Which part below is most severely affected by motion blur?", + "context": "Candidates: A. The person riding a bike in the first image B. The background of the first image C. The plant in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_189_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_189_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Compared to the second image, is the first image more affected by motion blur?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_190_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_190_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: More monotonous\nB: About the same\nC: More rich\n", + "question": "Compared to the first image, what is the color vividness of the second image?", + "context": "Candidates: A. More monotonous B. About the same C. More rich", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_191_0.JPG", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_191_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the first image sharper than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_192_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_192_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: the yellow doll in the first image\nB: the street lamp in the second image\nC: the wall in the first image\nD: the vehicle in the second image\n", + "question": "Which part below is most affected by overexposure?", + "context": "Candidates: A. the yellow doll in the first image B. the street lamp in the second image C. the wall in the first image D. 
the vehicle in the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_193_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_193_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the detail texture of the second image clearer than the first image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_194_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_194_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the first image more realistic than the second image?", + "context": "Candidates: A. No B. Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_195_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_195_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the color of the first image richer than the second image?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_196_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_196_1.JPG" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: Yes\nB: No\n", + "question": "Is the first image more authentic than the second image?", + "context": "Candidates: A. Yes B. No", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_197_0.JPG", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_197_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: worse\nB: better\nC: similar\n", + "question": "How does the lighting of the second image compare to the first image?", + "context": "Candidates: A. worse B. better C. similar", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_198_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_198_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_q_bench+", + "visual_input_component": "natural image", + "source": "q bench+", + "options": "A: No\nB: Yes\n", + "question": "Is the focus of the first image better than the second image?", + "context": "Candidates: A. No B. 
Yes", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_199_0.jpg", + "../MMIU-Benchmark/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_199_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_0_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_0_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_1_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_1_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_2_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_2_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_3_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_3_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_4_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_4_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_5_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_5_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_6_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_6_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_7_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_7_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_8_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_8_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_9_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_9_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_10_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_10_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_11_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_11_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_12_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_12_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_13_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_13_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_14_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_14_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_15_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_15_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_16_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_16_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_17_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_17_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_18_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_18_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_19_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_19_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_20_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_20_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_21_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_21_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_22_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_22_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_23_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_23_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_24_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_24_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_25_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_25_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_26_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_26_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_27_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_27_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_28_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_28_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_29_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_29_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_30_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_30_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_31_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_31_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_32_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_32_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_33_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_33_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_34_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_34_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_35_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_35_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_36_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_36_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_37_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_37_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_38_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_38_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_39_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_39_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_40_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_40_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_41_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_41_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_42_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_42_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_43_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_43_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_44_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_44_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_45_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_45_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_46_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_46_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_47_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_47_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_48_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_48_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_49_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_49_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_50_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_50_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_51_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_51_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_52_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_52_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_53_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_53_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_54_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_54_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_55_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_55_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_56_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_56_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_57_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_57_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_58_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_58_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_59_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_59_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_60_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_60_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_61_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_61_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_62_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_62_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_63_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_63_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_64_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_64_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_65_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_65_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_66_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_66_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_67_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_67_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_68_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_68_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_69_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_69_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_70_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_70_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_71_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_71_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_72_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_72_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_73_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_73_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_74_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_74_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_75_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_75_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_76_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_76_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_77_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_77_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_78_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_78_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_79_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_79_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_80_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_80_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_81_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_81_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_82_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_82_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_83_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_83_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_84_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_84_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_85_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_85_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_86_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_86_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_87_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_87_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_88_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_88_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_89_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_89_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_90_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_90_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_91_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_91_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_92_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_92_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_93_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_93_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_94_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_94_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_95_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_95_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_96_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_96_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_97_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_97_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_98_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_98_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_99_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_99_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_100_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_100_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_101_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_101_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_102_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_102_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_103_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_103_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_104_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_104_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_105_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_105_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_106_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_106_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_107_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_107_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_108_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_108_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_109_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_109_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_110_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_110_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_111_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_111_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_112_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_112_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_113_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_113_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_114_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_114_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_115_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_115_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_116_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_116_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_117_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_117_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_118_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_118_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_119_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_119_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_120_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_120_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_121_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_121_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_122_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_122_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_123_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_123_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_124_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_124_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_125_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_125_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_126_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_126_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_127_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_127_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_128_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_128_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_129_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_129_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_130_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_130_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_131_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_131_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_132_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_132_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_133_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_133_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_134_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_134_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_135_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_135_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_136_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_136_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_137_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_137_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_138_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_138_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_139_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_139_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_140_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_140_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_141_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_141_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_142_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_142_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_143_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_143_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_144_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_144_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_145_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_145_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_146_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_146_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_147_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_147_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_148_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_148_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_149_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_149_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_150_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_150_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_151_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_151_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_152_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_152_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_153_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_153_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_154_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_154_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_155_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_155_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_156_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_156_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_157_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_157_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_158_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_158_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_159_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_159_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_160_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_160_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_161_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_161_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_162_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_162_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_163_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_163_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_164_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_164_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_165_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_165_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_166_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_166_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_167_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_167_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_168_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_168_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_169_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_169_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_170_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_170_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_171_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_171_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_172_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_172_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_173_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_173_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_174_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_174_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_175_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_175_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_176_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_176_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_177_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_177_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_178_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_178_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_179_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_179_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_180_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_180_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_181_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_181_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_182_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_182_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_183_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_183_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_184_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_184_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_185_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_185_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_186_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_186_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_187_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_187_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_188_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_188_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_189_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_189_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_190_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_190_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_191_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_191_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_192_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_192_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_193_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_193_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_194_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_194_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_195_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_195_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_196_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_196_1.png" + ], + "output": "B" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_197_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_197_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a lower brightness?", + "context": "Candidates: A. the first image B. the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_198_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_198_1.png" + ], + "output": "A" + }, + { + "task": "visual_quality_assessment_ve_lol_l", + "visual_input_component": "natural image", + "source": "ve_lol_l", + "options": "A: the first image\nB: the second image\n", + "question": "Which image has a higher brightness?", + "context": "Candidates: A. the first image B. 
the second image", + "input_image_path": [ + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_199_0.png", + "../MMIU-Benchmark/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_199_1.png" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: sleeps\nB: stands up and run towards the man\nC: take a few steps\nD: pour the sand out\nE: touch baby s foot", + "question": "what does the girl do after landing on the bed the first time", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: sleeps\nB: stands up and run towards the man\nC: take a few steps\nD: pour the sand out\nE: touch baby s foot\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: walking dogs on leash\nB: posing\nC: cleaning the table\nD: acting\nE: playing tablet", + "question": "what is the lady with apron doing", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: walking dogs on leash\nB: posing\nC: cleaning the table\nD: acting\nE: playing tablet\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: wearing a harness\nB: wear safety belt\nC: holding sled with both hands\nD: move steering wheel\nE: closed the doors of the sleigh", + "question": "how does the man in dark green stay sitting on the sleigh while going down", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: wearing a harness\nB: wear safety belt\nC: holding sled with both hands\nD: move steering wheel\nE: closed the doors of the sleigh\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_9.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: keeps it in his pocket\nB: walk\nC: pet dog\nD: take off helmet\nE: wipe his mouth", + "question": "what was the man in black holding a bottle doing before he walked away", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: keeps it in his pocket\nB: walk\nC: pet dog\nD: take off helmet\nE: wipe his mouth\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_9.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: the cat was below her\nB: better posture for photoshoot\nC: not make it dirty\nD: washing her legs\nE: she was stepping on mud", + "question": "why did the woman hold her dress up high", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: the cat was below her\nB: better posture for photoshoot\nC: not make it dirty\nD: washing her legs\nE: she was stepping on mud\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: on the plate\nB: on the table\nC: on the bed\nD: on the sofa\nE: on the floor", + "question": "where is the food", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: on the plate\nB: on the table\nC: on the bed\nD: on the sofa\nE: on the floor\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: walk against wind\nB: finished controlling the helicopter\nC: finished watching the peacock\nD: give them guidane\nE: guide baby forward", + "question": "why did a man in orange suddenly walk over at the end", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: walk against wind\nB: finished controlling the helicopter\nC: finished watching the peacock\nD: give them guidane\nE: guide baby forward\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_7.jpg", 
+ "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: ballet dress\nB: white costume\nC: chef apron\nD: couple outfit\nE: dancing outfit", + "question": "what are both of them wearing in the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: ballet dress\nB: white costume\nC: chef apron\nD: couple outfit\nE: dancing outfit\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_7.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: tie knots\nB: taste snow\nC: sunk\nD: can not eat too much\nE: eating food", + "question": "why did the baby eat the spagetti strand by strand", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: tie knots\nB: taste snow\nC: sunk\nD: can not eat too much\nE: eating food\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_7.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: stand up and raise his hands\nB: point to someone\nC: run towards the camera\nD: push her\nE: point at the baby", + "question": "how does the white hair man react after seeing the girl fell", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: stand up and raise his hands\nB: point to someone\nC: run towards the camera\nD: push her\nE: point at the baby\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_6.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: trying to look at slides\nB: crossing muddy fields\nC: check on dog\nD: see whether boy follows\nE: check on baby", + "question": "why does the man keep turning around while pulling the sled", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: trying to look at slides\nB: crossing muddy fields\nC: check on dog\nD: see whether boy follows\nE: check on baby\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_5.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: filming\nB: pass paper to woman\nC: singing\nD: standing and watching\nE: dance", + "question": "what does the man in white do after moving near to the microphone", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: filming\nB: pass paper to woman\nC: singing\nD: standing and watching\nE: dance\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_5.jpg", 
+ "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: unstable to move\nB: wants to win\nC: interact with robot\nD: cycle\nE: act like tugging tree", + "question": "why does the woman have to be next to the robot", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: unstable to move\nB: wants to win\nC: interact with robot\nD: cycle\nE: act like tugging tree\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_4.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: eat apple\nB: play with it\nC: to touch his face\nD: drink water\nE: he was doing an experiment", + "question": "why did the man with the cap move his hands at the start", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: eat apple\nB: play with it\nC: to touch his face\nD: drink water\nE: he was doing an experiment\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_3.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: soit food out\nB: puts more noodles\nC: walk away\nD: put his hands on his knees\nE: take away pacifier", + "question": "what does the adult do after the baby finishes the first strand", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: soit food out\nB: puts more noodles\nC: walk away\nD: put his hands on his knees\nE: take away pacifier\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_2.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: dancing hall\nB: beach\nC: boxing ring\nD: backyard\nE: living room", + "question": "where is this video taken", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: dancing hall\nB: beach\nC: boxing ring\nD: backyard\nE: living room\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_2.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: shake water off their heads\nB: cleaning itself\nC: looking for food\nD: hiding from parrot\nE: sleeping", + "question": "why did the parrot on the perch clean tuck its head in while resting on the perch", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: shake water off their heads\nB: cleaning itself\nC: looking for food\nD: hiding from parrot\nE: sleeping\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_1.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: take pictures\nB: filming the baby\nC: playing a game\nD: talk to someone in phone\nE: record for memory", + "question": "why is the bald man holding a phone to his ear", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: take pictures\nB: filming the baby\nC: playing a game\nD: talk to someone in phone\nE: record for memory\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_0.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: fencing\nB: wedding anniversary\nC: party\nD: talent perfromance\nE: public speaking event", + "question": "what event is occuring", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: fencing\nB: wedding anniversary\nC: party\nD: talent perfromance\nE: public speaking event\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_0.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: room\nB: forest sanctuary\nC: on the plane\nD: farm\nE: sofa", + "question": "where is this video taken", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: room\nB: forest sanctuary\nC: on the plane\nD: farm\nE: sofa\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_0.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: lick its paws\nB: hit the squirrel\nC: look at the cat eat\nD: stop licking\nE: jumped back", + "question": "what did the orange cat do after the brown cat found food", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: lick its paws\nB: hit the squirrel\nC: look at the cat eat\nD: stop licking\nE: jumped back\n", + "input_image_path": [ + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: stretch out hand\nB: using red ball\nC: point to book\nD: drawing\nE: shake the toy", + "question": "how did the lady in purple try attracting the children s attention", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: stretch out hand\nB: using red ball\nC: point to book\nD: drawing\nE: shake the toy\n", + 
"input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: by breaking\nB: jumping\nC: flying\nD: from a plate\nE: peck", + "question": "how do the birds eat", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: by breaking\nB: jumping\nC: flying\nD: from a plate\nE: peck\n", + "input_image_path": [ + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: resting\nB: passionately acting\nC: being fed\nD: dancing\nE: sleeping", + "question": "why are there two birds standing on the hand", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: resting\nB: passionately acting\nC: being fed\nD: dancing\nE: sleeping\n", + "input_image_path": [ + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: lies on chair\nB: move forward\nC: they laughed\nD: look down at baby\nE: hold hands", + "question": "what did the lady and man do after waving their hands in the air", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: lies on chair\nB: move forward\nC: they laughed\nD: look down at baby\nE: hold hands\n", + 
"input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: learn dancing\nB: playing the drum\nC: listen to her talk\nD: to see what show she is watching\nE: she is opening something", + "question": "why does everyone focus on the lady in white sitting on the floor", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: learn dancing\nB: playing the drum\nC: 
listen to her talk\nD: to see what show she is watching\nE: she is opening something\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: walk away\nB: ran one round\nC: walk and look around\nD: jump\nE: black dog runs it after", + "question": "what does the white dog do after the brown dog completes one round in the middle", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect 
from the following choices.\nA: walk away\nB: ran one round\nC: walk and look around\nD: jump\nE: black dog runs it after\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: stopped in front of the baby\nB: move toy around\nC: chew on a gum\nD: play with green toy\nE: hold up a cup", + "question": "what does the person do while the dog is jumping up and down", + "context": "You are given 16 images of sequential occurrences, examine the details 
and answer the given question.\nSelect from the following choices.\nA: stopped in front of the baby\nB: move toy around\nC: chew on a gum\nD: play with green toy\nE: hold up a cup\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: with toys\nB: with the fence\nC: with fans\nD: with yellow sign boards\nE: with fists", + "question": "how did both of them hit each other", + "context": "You are given 16 images of sequential occurrences, examine the 
details and answer the given question.\nSelect from the following choices.\nA: with toys\nB: with the fence\nC: with fans\nD: with yellow sign boards\nE: with fists\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: sit on stairs\nB: get down from the chair\nC: kneel down\nD: put hands over lady in blue\nE: stand still", + "question": "what did the lady in white do when she first approached the lady in blue in the middle", + "context": "You are 
given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: sit on stairs\nB: get down from the chair\nC: kneel down\nD: put hands over lady in blue\nE: stand still\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: chair\nB: table\nC: tv screen\nD: piano\nE: dance machine", + "question": "what is placed on the right to the lady on stage", + "context": "You are given 16 
images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: chair\nB: table\nC: tv screen\nD: piano\nE: dance machine\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: couch\nB: house\nC: car\nD: bus park place\nE: stage", + "question": "where did this occur", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given 
question.\nSelect from the following choices.\nA: couch\nB: house\nC: car\nD: bus park place\nE: stage\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: point to her\nB: keep camera in pocket\nC: look at his phone\nD: plays guitar\nE: plays the guitar", + "question": "what does the man in grey suit do after they have finished singing at the end", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the 
given question.\nSelect from the following choices.\nA: point to her\nB: keep camera in pocket\nC: look at his phone\nD: plays guitar\nE: plays the guitar\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: to touch the sandals\nB: to dance on the floor\nC: to play\nD: he is bored\nE: listening to music and dancing", + "question": "why did the boy punch his hand forwards in the middle of the video", + "context": "You are given 16 images of 
sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: to touch the sandals\nB: to dance on the floor\nC: to play\nD: he is bored\nE: listening to music and dancing\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: girl hit wall\nB: part of the dance routine\nC: practicing\nD: to wave\nE: pushing the rod", + "question": "why is the man raising his legs throughout the video", + 
"context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: girl hit wall\nB: part of the dance routine\nC: practicing\nD: to wave\nE: pushing the rod\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: make sound\nB: to drink\nC: enjoying the music\nD: make it more fun\nE: to direct the boy", + "question": "why did the man hit the notes in one spectrum 
and direction", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: make sound\nB: to drink\nC: enjoying the music\nD: make it more fun\nE: to direct the boy\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: it was sunny outside\nB: fashion\nC: for protection from chemicals\nD: to read the book\nE: to watch television", + "question": "why did 
the boy wear glasses", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: it was sunny outside\nB: fashion\nC: for protection from chemicals\nD: to read the book\nE: to watch television\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: show he finish chewing\nB: open the toy s mouth\nC: teething\nD: happy and laughing\nE: playing game", + 
"question": "why did the boy put his finger into his mouth", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: show he finish chewing\nB: open the toy s mouth\nC: teething\nD: happy and laughing\nE: playing game\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: drink\nB: want to play\nC: interested in it\nD: for food\nE: distracted", + 
"question": "why do the cats walk away from the carpark at the middle of the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: drink\nB: want to play\nC: interested in it\nD: for food\nE: distracted\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: dog keep blocking\nB: to adjust snowboard\nC: preparing for speech\nD: to show the 
drink\nE: to change slides", + "question": "why does the man stops multiple times in between", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: dog keep blocking\nB: to adjust snowboard\nC: preparing for speech\nD: to show the drink\nE: to change slides\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: to look at something above\nB: to bow 
to the man\nC: playing\nD: retrieve ball\nE: pick up stick", + "question": "why does the dark brown dog bend down at the end of the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: to look at something above\nB: to bow to the man\nC: playing\nD: retrieve ball\nE: pick up stick\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: hit 
the ball\nB: to get the kite handle\nC: play catch with the ball\nD: chasing the car\nE: show excitement", + "question": "why were the people running in circles at the start", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: hit the ball\nB: to get the kite handle\nC: play catch with the ball\nD: chasing the car\nE: show excitement\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural 
images", + "source": "next_qa", + "options": "A: to note something on the book\nB: pose for camera\nC: reach for the airconditioners\nD: play with toy\nE: dancing along with music", + "question": "why did the boy lift his hands up above his head nearing the end while turning", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: to note something on the book\nB: pose for camera\nC: reach for the airconditioners\nD: play with toy\nE: dancing along with music\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: fighting for stick\nB: biting on rat\nC: drink the milk\nD: play with ball\nE: follow man s instruction", + "question": "why were both dogs looking down near the middle", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: fighting for stick\nB: biting on rat\nC: drink the milk\nD: play with ball\nE: follow man s instruction\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: for safety in skiing\nB: construction work\nC: protect head from bricks\nD: photo requirement\nE: trying out new helmets", + "question": "why is the man wearing helmet", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: for safety in skiing\nB: construction work\nC: protect head from bricks\nD: photo requirement\nE: trying out new helmets\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_12.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: hot\nB: recording the scenery\nC: sunny\nD: raining\nE: to focus on cake", + "question": "why did the camera view get blurred", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: hot\nB: recording the scenery\nC: sunny\nD: raining\nE: to focus on cake\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_12.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: drinking soup\nB: to eat ice cream\nC: feeding the dog\nD: feed little girl\nE: to stir salad", + "question": "why did the lady in red picked up the spoon on the table", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: drinking soup\nB: to eat ice cream\nC: feeding the dog\nD: feed little girl\nE: to stir salad\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: look around\nB: move to right side\nC: lick hand\nD: jump around\nE: put its paw back", + "question": "what does the dog do after the person stretch his hand out", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: look around\nB: move to right side\nC: lick hand\nD: jump around\nE: put its paw back\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: blue\nB: green\nC: white\nD: black\nE: teal", + "question": "what colour shirt was the man wearing", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: blue\nB: green\nC: white\nD: black\nE: teal\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: raise his hands\nB: pretends to be an animal\nC: jumping in\nD: pail\nE: spectacles", + "question": "how did the man in the screen pretended to be swimming with the fishes int he background", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: raise his hands\nB: pretends to be an animal\nC: jumping in\nD: pail\nE: spectacles\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: drinking\nB: claps\nC: speak into microphone\nD: singing\nE: want to snatch the phone", + "question": "what is the man with cap doing while the bald man is answering a call at the beginning", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: drinking\nB: claps\nC: speak into microphone\nD: singing\nE: want to snatch the phone\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_9.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: watches dog run away\nB: excited\nC: gives them food\nD: lower its head\nE: pull it out", + "question": "how does the lady react when the dog wo nt let go of the twig", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: watches dog run away\nB: excited\nC: gives them food\nD: lower its head\nE: pull it out\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: watching the man write calligraphy\nB: need to play the drum\nC: open space\nD: looking after the child\nE: to show to people", + "question": "why is there a man standing at the start of the road", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: watching the man write calligraphy\nB: need to play the drum\nC: open space\nD: looking after the child\nE: to show to people\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_7.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: posing for photos\nB: common costume for performance\nC: formal celebration\nD: playing rugby\nE: for safety", + "question": "why do the people wear headgear", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: posing for photos\nB: common costume for performance\nC: formal celebration\nD: playing rugby\nE: for safety\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_6.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: dancing with the girl\nB: mimicking the tv show\nC: perform for the audience\nD: talking\nE: express excited", + "question": "why is there a woman dancing and moving along next to the car", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: dancing with the girl\nB: mimicking the tv show\nC: perform for the audience\nD: talking\nE: express excited\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_5.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: studio\nB: beach\nC: living area\nD: home\nE: garden", + "question": "where are the people hanging out", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: studio\nB: beach\nC: living area\nD: home\nE: garden\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_5.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: caress\nB: place the dog near obstacle\nC: tie it to a pole\nD: carry baby to chase it\nE: biting its tail", + "question": "how does the man in black vest correct the track of the white dog at the end", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: caress\nB: place the dog near obstacle\nC: tie it to a pole\nD: carry baby to chase it\nE: biting its tail\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_4.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: asking more food\nB: adjusting its leash\nC: to see who pet it\nD: playing ball games\nE: excited to change his attire", + "question": "why was the dog looking upwards in the middle", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: asking more food\nB: adjusting its leash\nC: to see who pet it\nD: playing ball games\nE: excited to change his attire\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_3.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: play with dog\nB: turn away\nC: run to the other side\nD: lick person s hand\nE: look its right", + "question": "what does the dog do after getting the twig", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: play with dog\nB: turn away\nC: run to the other side\nD: lick person s hand\nE: look its right\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_2.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: parent offspring\nB: father daughter\nC: husband wife\nD: family member\nE: trainer trainee", + "question": "what is the relationship between the man and lady", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: parent offspring\nB: father daughter\nC: husband wife\nD: family member\nE: trainer trainee\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_1.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: pen and paper\nB: phone\nC: camera\nD: computer\nE: tablet", + "question": "how does the person in orange helmet record the activity", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: pen and paper\nB: phone\nC: camera\nD: computer\nE: tablet\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_1.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: sit in a circle\nB: kneeling\nC: stand behind the baby\nD: touching the controls\nE: standing", + "question": "how is the boy positioned on the chair", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: sit in a circle\nB: kneeling\nC: stand behind the baby\nD: touching the controls\nE: standing\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_0.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: parent offspring\nB: student\nC: teacher\nD: father son\nE: slides", + "question": "what are the blue or green things some children are holding on to", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: parent offspring\nB: student\nC: teacher\nD: father son\nE: slides\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_0.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: dog snatched from the person\nB: person placed there\nC: person throw it\nD: bite toy from sofa\nE: bite it under the table", + "question": "why did the toy end up in the dog s mouth after the middle part of the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: dog snatched from the person\nB: person placed there\nC: person throw it\nD: bite toy from sofa\nE: bite it under the table\n", + 
"input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: watching television\nB: playing\nC: eating\nD: huggging the dog\nE: swimming", + "question": "why is the boy in blue sitting down on the lady in red s lap", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: watching television\nB: playing\nC: eating\nD: huggging the dog\nE: swimming\n", + 
"input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: play with the toy\nB: pick up the ball\nC: pick up hat\nD: run around the table\nE: dance", + "question": "what did the lady in pink do after the man in white missed the ball", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: play with the toy\nB: pick up the ball\nC: pick up hat\nD: run around 
the table\nE: dance\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: playing with balloon\nB: facing the baby forward\nC: lets the baby hold her fingers\nD: using strap\nE: hands support bum and back", + "question": "how does the lady in blue carry the child at the beginning", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: playing with 
balloon\nB: facing the baby forward\nC: lets the baby hold her fingers\nD: using strap\nE: hands support bum and back\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: ball dinner\nB: in hospital\nC: keep warm\nD: fencing\nE: scuba diving", + "question": "why are the two people wearing something to cover their face", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from 
the following choices.\nA: ball dinner\nB: in hospital\nC: keep warm\nD: fencing\nE: scuba diving\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: for balance while skating\nB: dancing to rhythm\nC: play game\nD: expressive\nE: playing the drum", + "question": "why was the boy s arm moving constantly", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following 
choices.\nA: for balance while skating\nB: dancing to rhythm\nC: play game\nD: expressive\nE: playing the drum\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: grab it\nB: bite it\nC: kick it\nD: push it away\nE: pass to adult", + "question": "how does the baby in purple interact with the red toy", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following 
choices.\nA: grab it\nB: bite it\nC: kick it\nD: push it away\nE: pass to adult\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: toy car\nB: clothes\nC: handbag\nD: glass bottle\nE: guitar", + "question": "what does the lady in black on the sofa hold in her hands", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: toy car\nB: clothes\nC: 
handbag\nD: glass bottle\nE: guitar\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: use hand gestures to demonstrate\nB: point towards video\nC: keep clapping\nD: playing with her hair\nE: use laser pointer", + "question": "how does the girl use body language to demonstrate what she is saying", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following 
choices.\nA: use hand gestures to demonstrate\nB: point towards video\nC: keep clapping\nD: playing with her hair\nE: use laser pointer\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: three\nB: two\nC: one\nD: four\nE: five", + "question": "how many cats are there", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: three\nB: two\nC: 
one\nD: four\nE: five\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: boundary\nB: play with snow\nC: experimenting with chemicals\nD: help patients inject needles\nE: cleaning the floor", + "question": "why do the people in headgear move on the white surface", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: boundary\nB: play with 
snow\nC: experimenting with chemicals\nD: help patients inject needles\nE: cleaning the floor\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: to prevent injuries\nB: signal end of performance\nC: let boy in black try\nD: to reset the match\nE: separated by trainer", + "question": "why do the people in the headgear adjust their positions after the person in red hits the other person with the sword", + "context": "You are given 16 images of 
sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: to prevent injuries\nB: signal end of performance\nC: let boy in black try\nD: to reset the match\nE: separated by trainer\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: keep baby from falling\nB: engage lady \nC: tease her\nD: place something on windowsill\nE: take a gift", + "question": "why did a lady in purple walk in after 
the lady carried the baby up", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: keep baby from falling\nB: engage lady \nC: tease her\nD: place something on windowsill\nE: take a gift\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: pet it\nB: play ball games\nC: remove the leash\nD: sniff the dog\nE: carry it", + "question": "how does 
the man in black interact with the dog at the end", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: pet it\nB: play ball games\nC: remove the leash\nD: sniff the dog\nE: carry it\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: go to lady in stripes\nB: unwrap something\nC: adjust the girl s jacket\nD: shuffle cards\nE: play guitar", + 
"question": "what does the lady in white on the floor do as everyone was sitting around", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: go to lady in stripes\nB: unwrap something\nC: adjust the girl s jacket\nD: shuffle cards\nE: play guitar\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: trainer trainee\nB: couple\nC: offspring\nD: 
husband and wife\nE: father and daughter", + "question": "what is the relationship between the man and the woman", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: trainer trainee\nB: couple\nC: offspring\nD: husband and wife\nE: father and daughter\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: office\nB: house\nC: train\nD: front 
porch\nE: park", + "question": "where is this video taken", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: office\nB: house\nC: train\nD: front porch\nE: park\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: working\nB: choreography\nC: dancing\nD: part of home decoration\nE: clean faster", + "question": "why does the woman brush the 
same utensil", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: working\nB: choreography\nC: dancing\nD: part of home decoration\nE: clean faster\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: pick up toy\nB: part of the play\nC: moving baby s hands\nD: feeding birds\nE: play music", + "question": "why is there a hand stretched out", + 
"context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: pick up toy\nB: part of the play\nC: moving baby s hands\nD: feeding birds\nE: play music\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: attack with sword\nB: move out of fencing area\nC: fell down\nD: threw sword away\nE: move in front", + "question": "what did the fencer in black do when 
the other fencer moved forward to him at the end", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: attack with sword\nB: move out of fencing area\nC: fell down\nD: threw sword away\nE: move in front\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: look at someone else s phone\nB: play piano\nC: dancing\nD: pointing to the tiger\nE: drink 
coffee", + "question": "what was the man doing as the lady in blue covered her face", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: look at someone else s phone\nB: play piano\nC: dancing\nD: pointing to the tiger\nE: drink coffee\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: distracted\nB: get floats\nC: threw the ball\nD: called 
by man\nE: to get out of pool", + "question": "why did the lady in purple walk away from the babies near the end of the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: distracted\nB: get floats\nC: threw the ball\nD: called by man\nE: to get out of pool\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: medical procedure\nB: making 
a model\nC: climbing rocky mountains\nD: protect baby from getting sick\nE: playing rugby", + "question": "why are the people wearing gloves", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: medical procedure\nB: making a model\nC: climbing rocky mountains\nD: protect baby from getting sick\nE: playing rugby\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": 
"next_qa", + "options": "A: in a circle\nB: using his hands to unscrew the cap\nC: next to baby\nD: hold the sides of the phone\nE: above his head", + "question": "how did the person at the end with the camera hold the umbrella", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: in a circle\nB: using his hands to unscrew the cap\nC: next to baby\nD: hold the sides of the phone\nE: above his head\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_15.jpg" + ], + "output": "E" + }, + { + "task": 
"casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: smiling\nB: laughing\nC: crying\nD: disgusted\nE: itchy and uncomfortable", + "question": "how is the boy in blue expressing himself", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: smiling\nB: laughing\nC: crying\nD: disgusted\nE: itchy and uncomfortable\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + 
"visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: bring girl closer to the tree\nB: follow her instructions to sit\nC: close the door\nD: walked away\nE: blow candle", + "question": "what does the white hair man do after picking the girl up", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: bring girl closer to the tree\nB: follow her instructions to sit\nC: close the door\nD: walked away\nE: blow candle\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_15.jpg" + ], + 
"output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: not used to paddle\nB: lady was jumping\nC: training ground\nD: comfortable\nE: mimic movement", + "question": "why are the three children in front of the lady in brown not able to balance on the surface near the start", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: not used to paddle\nB: lady was jumping\nC: training ground\nD: comfortable\nE: mimic movement\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: 2 boys\nB: phone\nC: bottle\nD: stick\nE: cosplay", + "question": "what is the person shown on screen holding throughout the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: 2 boys\nB: phone\nC: bottle\nD: stick\nE: cosplay\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: laughing\nB: to pick the girl up\nC: pick up a toy at the side\nD: microphone too short\nE: put phone", + "question": "why did the man bend and lower his head to the bed", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: laughing\nB: to pick the girl up\nC: pick up a toy at the side\nD: microphone too short\nE: put phone\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: forest\nB: bedroom\nC: dining room\nD: by a stream\nE: backyard", + "question": "where are the man and the baby hanging out", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: forest\nB: bedroom\nC: dining room\nD: by a stream\nE: backyard\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: adjust his grip\nB: to take something\nC: to help girl\nD: pass bag to lady\nE: to hold hands", + "question": "why does the man remove one of his hands from the handler in the middle", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: adjust his grip\nB: to take something\nC: to help girl\nD: pass bag to lady\nE: to hold hands\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_12.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: performance ended\nB: throw down drinks\nC: distracting others\nD: pick up something\nE: to come down", + "question": "why did the the woman with apron bend down after moving to the left of the stage", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: performance ended\nB: throw down drinks\nC: distracting others\nD: pick up something\nE: to come down\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: on table\nB: taps his leg\nC: press the yellow button\nD: show lady\nE: on bottle", + "question": "where did the boy put his right hand after he took it out from his mouth", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: on table\nB: taps his leg\nC: press the yellow button\nD: show lady\nE: on bottle\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: looked at camera\nB: sit on man and play toy\nC: direct baby away\nD: look back at her\nE: takes the spoon away", + "question": "what does the baby do after the lady changes the direction of the toy car in the middle of the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: looked at camera\nB: sit on man and play toy\nC: direct baby away\nD: look back at her\nE: takes the spoon away\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: touch nose\nB: flip\nC: roll around\nD: touch the camera\nE: cycle towards the slope", + "question": "what does the boy do after rolling over in the middle", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: touch nose\nB: flip\nC: roll around\nD: touch the camera\nE: cycle towards the slope\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: walk away\nB: thumbs up\nC: put down her club\nD: applying cream on face\nE: caressing for the dog", + "question": "what did the lady do while turning back", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: walk away\nB: thumbs up\nC: put down her club\nD: applying cream on face\nE: caressing for the dog\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_7.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: playing with baby\nB: look at scenery\nC: carry girl\nD: tap on screen\nE: look at lady in hoodie", + "question": "why did the lady slow down when she reached the top", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: playing with baby\nB: look at scenery\nC: carry girl\nD: tap on screen\nE: look at lady in hoodie\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_6.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: acknowledge something\nB: head was uncomfortable\nC: try to sing to beat\nD: posing for the camera\nE: produce higher vioce", + "question": "why did the boy nod his head when singing", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: acknowledge something\nB: head was uncomfortable\nC: try to sing to beat\nD: posing for the camera\nE: produce higher vioce\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_5.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: hod in hand\nB: beside the pink toy\nC: in baby s hand\nD: table\nE: chair", + "question": "where did the girl put her phone as she kissed it", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: hod in hand\nB: beside the pink toy\nC: in baby s hand\nD: table\nE: chair\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_5.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: remove the paper too\nB: look around her\nC: drink water\nD: walk away\nE: point at the music script", + "question": "what did the lady in polka dress do after she talked to the person in front of her", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: remove the paper too\nB: look around her\nC: drink water\nD: walk away\nE: point at the music script\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_4.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: karaoke room\nB: in the middle of the sea\nC: skate park\nD: house\nE: basketball court", + "question": "where is this happening", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: karaoke room\nB: in the middle of the sea\nC: skate park\nD: house\nE: basketball court\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_4.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: push trolley\nB: push the back of car\nC: goes to play the piano\nD: smiles\nE: stroke cat", + "question": "what did the lady in purple do after she touched the baby s head", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: push trolley\nB: push the back of car\nC: goes to play the piano\nD: smiles\nE: stroke cat\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_3.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: on a baby pram\nB: on a toy\nC: on a walker\nD: baby stroller\nE: on a sled", + "question": "how is the baby moved around", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: on a baby pram\nB: on a toy\nC: on a walker\nD: baby stroller\nE: on a sled\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_3.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: keep warm\nB: protect from sun\nC: goes well with attire\nD: sunny whether\nE: it s cold", + "question": "why did the old man wear jacket and hat outdoors", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: keep warm\nB: protect from sun\nC: goes well with attire\nD: sunny whether\nE: it s cold\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_2.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: follow direction of man s leg\nB: there is cushion behind\nC: observe\nD: another baby is in front\nE: relaxing", + "question": "why does the baby lean back as the man pulls the sled", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: follow direction of man s leg\nB: there is cushion behind\nC: observe\nD: another baby is in front\nE: relaxing\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_1.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: change something on screen\nB: swing cloth\nC: switch positions with other man\nD: taking a break\nE: looking at baby", + "question": "why does the man stop for a while in the middle of the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: change something on screen\nB: swing cloth\nC: switch positions with other man\nD: taking a break\nE: looking at baby\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_0.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: want to hug child\nB: dancing\nC: doing squats\nD: want to hug dog\nE: stretching his arms", + "question": "why did the adult squat down and opened his arm at the end of the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: want to hug child\nB: dancing\nC: doing squats\nD: want to hug dog\nE: stretching his arms\n", + "input_image_path": [ + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: ask cameraman to move\nB: testing the microphone\nC: giving presentation\nD: make voice louder\nE: to capture audience attention", + "question": "why does the man in white talk on the microphone towards the end", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: ask cameraman to move\nB: testing the 
microphone\nC: giving presentation\nD: make voice louder\nE: to capture audience attention\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: push fish s down\nB: touch his cap\nC: unlocking door\nD: dancing\nE: fist bump with woman", + "question": "why did the man raise his hand up in a punch near the end of the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given 
question.\nSelect from the following choices.\nA: push fish s down\nB: touch his cap\nC: unlocking door\nD: dancing\nE: fist bump with woman\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: raise their heads\nB: adjust her sleeves\nC: watching\nD: looking at girl in pink\nE: continue walking backwards", + "question": "what did the lady in green do after bending down to laugh in the middle", + "context": "You are given 16 images 
of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: raise their heads\nB: adjust her sleeves\nC: watching\nD: looking at girl in pink\nE: continue walking backwards\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: steps onto it\nB: get up and move away\nC: went forwards and backwards\nD: pull up her wedding dress\nE: using its legs", + "question": "how did the 
lady move herself into the house", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: steps onto it\nB: get up and move away\nC: went forwards and backwards\nD: pull up her wedding dress\nE: using its legs\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: practicing dance\nB: dancing\nC: show connection\nD: funny\nE: 
controller for dancing game", + "question": "why were both of them smilinglaughing when they started dancing", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: practicing dance\nB: dancing\nC: show connection\nD: funny\nE: controller for dancing game\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: roll around\nB: hold man 
s hand\nC: eating icecream\nD: kiss rabbit\nE: water the plants", + "question": "what is the boy doing while rabbit is eating grass", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: roll around\nB: hold man s hand\nC: eating icecream\nD: kiss rabbit\nE: water the plants\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: 
wipe with towel\nB: rubs it away\nC: snowmobile dig through\nD: shake his body\nE: spread arms out", + "question": "how does the boy in front gets rid of the snow", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: wipe with towel\nB: rubs it away\nC: snowmobile dig through\nD: shake his body\nE: spread arms out\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + 
"source": "next_qa", + "options": "A: annoyed\nB: serious\nC: happy\nD: blessed\nE: disappointed", + "question": "how does the girl feel while talking and demonstrating in front of the camera", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: annoyed\nB: serious\nC: happy\nD: blessed\nE: disappointed\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": 
"next_qa", + "options": "A: food stains\nB: lightning streaks\nC: scratches\nD: stickers\nE: roses", + "question": "what is there on the car", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: food stains\nB: lightning streaks\nC: scratches\nD: stickers\nE: roses\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: mechanism 
stop moving\nB: got earmuff\nC: leave the room\nD: man move his hand\nE: listen to the sound", + "question": "why does the person in the pink hat move his hand from his ear after the mechanism hits the bell the first time", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: mechanism stop moving\nB: got earmuff\nC: leave the room\nD: man move his hand\nE: listen to the sound\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_15.jpg" + ], + "output": "E" + }, + { + "task": 
"casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: sunny weather\nB: raining\nC: keep warm\nD: protect from cold temperature\nE: cold", + "question": "why are the people dressed in raincoats", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: sunny weather\nB: raining\nC: keep warm\nD: protect from cold temperature\nE: cold\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_15.jpg" + ], + "output": "B" + }, + { + "task": 
"casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: throw a toy\nB: point at her\nC: raise his hands\nD: clap his hand\nE: snap his fingers", + "question": "how does the man signal for the girl to stand up after she falls", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: throw a toy\nB: point at her\nC: raise his hands\nD: clap his hand\nE: snap his fingers\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_15.jpg" + ], + 
"output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: stretch\nB: push the baby\nC: tie shoelaces\nD: take balloon away from baby\nE: playing games", + "question": "why did the lady in purple bend down at the end of the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: stretch\nB: push the baby\nC: tie shoelaces\nD: take balloon away from baby\nE: playing games\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: one\nB: nine\nC: two\nD: four\nE: three", + "question": "how many dogs are there", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: one\nB: nine\nC: two\nD: four\nE: three\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_15.jpg" + ], + "output": 
"C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: drop cloth\nB: touch the cloth\nC: throw away the blue toy\nD: hit baby in grey with toy\nE: took another toy", + "question": "what does the baby in purple do after looking at the toy for a while in the middle", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: drop cloth\nB: touch the cloth\nC: throw away the blue toy\nD: hit baby in grey with toy\nE: took another toy\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: oversee\nB: watch video\nC: the person with socks walked past\nD: support baby\nE: wants to play with baby", + "question": "why does the man keep staring at the boy throughout the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: oversee\nB: watch video\nC: the person with socks walked past\nD: support baby\nE: wants to play with baby\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: gently from the pillow\nB: from man s shoulder\nC: from another bed beside\nD: thrown by woman\nE: jump from sofa", + "question": "how did the girl crash on to the bed", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: gently from the pillow\nB: from man s shoulder\nC: from another bed beside\nD: thrown by woman\nE: jump from sofa\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_12.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: play in the sea\nB: catch the branch\nC: playing\nD: playing fetch\nE: the dog bit her hand", + "question": "why does the dog run after the twig when the lady throws it", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: play in the sea\nB: catch the branch\nC: playing\nD: playing fetch\nE: the dog bit her hand\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: shake a toy\nB: bib\nC: tickle the baby\nD: wear helmet\nE: use leg to support", + "question": "how does the lady prevent the child from falling after putting the child on the ground", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: shake a toy\nB: bib\nC: tickle the baby\nD: wear helmet\nE: use leg to support\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: run\nB: playing the drums\nC: aid in his layering work\nD: sitting and listening to man speaking\nE: places pan back on stove", + "question": "what did the man on the left do after the other man filled the ground with the liquid", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: run\nB: playing the drums\nC: aid in his layering work\nD: sitting and listening to man speaking\nE: places pan back on stove\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: four\nB: five\nC: one\nD: eight\nE: three", + "question": "how many people are involve din the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: four\nB: five\nC: one\nD: eight\nE: three\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: walk behind him\nB: hit it with a toy\nC: strolls around\nD: holding hand\nE: remote control", + "question": "how did the boy showed that he was unstable while rollerblading", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: walk behind him\nB: hit it with a toy\nC: strolls around\nD: holding hand\nE: remote control\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_7.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: she wanted to feed the birds\nB: part of dance\nC: wanted to touch the tree\nD: touch adult\nE: pick up toy", + "question": "why did the girl stretch out her hand", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: she wanted to feed the birds\nB: part of dance\nC: wanted to touch the tree\nD: touch adult\nE: pick up toy\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_6.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: bring baby away from woman\nB: for boy to pull sled\nC: pass it to boy\nD: help baby sledge\nE: show to other people", + "question": "why does the man pull the baby sitting on the sled by the rope", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: bring baby away from woman\nB: for boy to pull sled\nC: pass it to boy\nD: help baby sledge\nE: show to other people\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_5.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: drove at full speed\nB: wheelers are moving fast\nC: slope\nD: sleigh is brand new\nE: pushed from behind", + "question": "why is the sleigh able to go through the snow bump", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: drove at full speed\nB: wheelers are moving fast\nC: slope\nD: sleigh is brand new\nE: pushed from behind\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_4.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: play with baby\nB: hiking outside\nC: watching the crane\nD: emcee\nE: talking to cameraman", + "question": "why is the lady wearing a white cap", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: play with baby\nB: hiking outside\nC: watching the crane\nD: emcee\nE: talking to cameraman\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_3.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: car accident\nB: snow\nC: shops mostly closed\nD: car museum\nE: parked", + "question": "why is the cars in the street not moving", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: car accident\nB: snow\nC: shops mostly closed\nD: car museum\nE: parked\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_3.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: leave the room\nB: put on earmuff\nC: press sides of hat\nD: stop mechanism\nE: stand futher away", + "question": "how does the man in pink hat show that he thinks the bell is noisy at the beginning", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: leave the room\nB: put on earmuff\nC: press sides of hat\nD: stop mechanism\nE: stand futher away\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_2.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: not experienced at swimming\nB: feed baby\nC: looking for something\nD: drinking\nE: to touch water", + "question": "why is the boy playing with a water bottle", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: not experienced at swimming\nB: feed baby\nC: looking for something\nD: drinking\nE: to touch water\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_1.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: pick up toy\nB: to apply the cream\nC: to clean his hand\nD: to eat\nE: scratch his mouth", + "question": "why does the baby keep putting his hand in his mouth", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: pick up toy\nB: to apply the cream\nC: to clean his hand\nD: to eat\nE: scratch his mouth\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_0.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: playing around\nB: wash his hands\nC: to balance\nD: dancing\nE: to play the guitar", + "question": "why did the boy rollerblading hold tightly the lady s hand", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: playing around\nB: wash his hands\nC: to balance\nD: dancing\nE: to play the guitar\n", + "input_image_path": [ + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: jumping around rabbit\nB: caress its ears\nC: feed carrot\nD: kiss it\nE: chasing the rabbit", + "question": "how does the boy show affection to the rabbit", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: jumping around rabbit\nB: caress its ears\nC: feed carrot\nD: kiss it\nE: chasing the rabbit\n", + 
"input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: her partner talks to her\nB: not to hurt the plant\nC: trying out poses\nD: stable herself before stepping down\nE: to let the dog catch up", + "question": "why did the lady turn her head to the left after walking for a while", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: her 
partner talks to her\nB: not to hurt the plant\nC: trying out poses\nD: stable herself before stepping down\nE: to let the dog catch up\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: field\nB: park\nC: bedroom\nD: sheltered area\nE: construction site", + "question": "where is this video taken", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the 
following choices.\nA: field\nB: park\nC: bedroom\nD: sheltered area\nE: construction site\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: ran in the same direction\nB: he listen and redo dancing steps\nC: singing away from the microphone\nD: dance\nE: touch his head", + "question": "what did the man do after the lady made an angry gesture in the middle of the video", + "context": "You are given 16 images of sequential 
occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: ran in the same direction\nB: he listen and redo dancing steps\nC: singing away from the microphone\nD: dance\nE: touch his head\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: five\nB: thirteen\nC: eight\nD: three\nE: four", + "question": "how many people can be seen in the video", + "context": "You are given 16 images of 
sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: five\nB: thirteen\nC: eight\nD: three\nE: four\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: adjusting it\nB: make hand gestures\nC: waiting for her turn to dance\nD: shout out to the other vocalists\nE: talking to the crowd", + "question": "why did the lady with curly hair hold onto her microphone at the end", 
+ "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: adjusting it\nB: make hand gestures\nC: waiting for her turn to dance\nD: shout out to the other vocalists\nE: talking to the crowd\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: fly\nB: use beak to pull itself\nC: walk on the ground\nD: skip\nE: roll", + "question": "how 
does the nearest parrot move across the cage", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: fly\nB: use beak to pull itself\nC: walk on the ground\nD: skip\nE: roll\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: performers costume\nB: photoshoot\nC: outfit for fencing\nD: cooking competition\nE: lab experiment", + 
"question": "why do both the player wear white costume", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: performers costume\nB: photoshoot\nC: outfit for fencing\nD: cooking competition\nE: lab experiment\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: point to his left\nB: tries to walk\nC: drink from bottle\nD: bring 
to table\nE: talking", + "question": "what does the boy do after putting the bottle flat on his mouth for a while at the start", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: point to his left\nB: tries to walk\nC: drink from bottle\nD: bring to table\nE: talking\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: run to 
the brown dog\nB: ran back into the cage\nC: pick it up\nD: retreat\nE: run away", + "question": "what did the white dog do when the brown dog turned back to run near the middle", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: run to the brown dog\nB: ran back into the cage\nC: pick it up\nD: retreat\nE: run away\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural 
images", + "source": "next_qa", + "options": "A: indoor\nB: kitchen\nC: dance studio\nD: zoo\nE: on the pavement", + "question": "where is the girl cycling", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: indoor\nB: kitchen\nC: dance studio\nD: zoo\nE: on the pavement\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: to 
stop the recording\nB: they are practicing\nC: dancing\nD: distracted by dog moving\nE: playing game on mobile", + "question": "why do the men stop playing in between", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: to stop the recording\nB: they are practicing\nC: dancing\nD: distracted by dog moving\nE: playing game on mobile\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 
natural images", + "source": "next_qa", + "options": "A: nod her head\nB: raise her hands\nC: swing her hands right and left\nD: write notes\nE: clap her hands", + "question": "how does the woman with a red lanyard signal that she is paying attention in the middle of the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: nod her head\nB: raise her hands\nC: swing her hands right and left\nD: write notes\nE: clap her hands\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_15.jpg" + 
], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: sea\nB: swimming pool\nC: outdoor\nD: lake\nE: roadside", + "question": "where is this video taken", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: sea\nB: swimming pool\nC: outdoor\nD: lake\nE: roadside\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", 
+ "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: table\nB: on his lap\nC: on the cube\nD: face\nE: crossing in front", + "question": "where did the man in the video put his right hand most of the time", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: table\nB: on his lap\nC: on the cube\nD: face\nE: crossing in front\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + 
"visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: pick up the seatbelt\nB: clean her face\nC: pass something to the man sitting\nD: row using hands\nE: turns head towards the lady", + "question": "what did the robot do after the woman turned herself towards it", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: pick up the seatbelt\nB: clean her face\nC: pass something to the man sitting\nD: row using hands\nE: turns head towards the lady\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: three\nB: six\nC: four\nD: five\nE: eight", + "question": "how many cars are parked beside the street", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: three\nB: six\nC: four\nD: five\nE: eight\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: walk\nB: put up her finger\nC: fell\nD: looks forward\nE: go back up", + "question": "what happens to the girl after walking backwards", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: walk\nB: put up her finger\nC: fell\nD: looks forward\nE: go back up\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: change colour of the frames\nB: adjust something\nC: show the frame to the ladies\nD: bored\nE: asking for their opinions", + "question": "why did the lady in flower shirt turned the frame to face the two ladies in black", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: change colour of the frames\nB: adjust something\nC: show the frame to the ladies\nD: bored\nE: asking for their opinions\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_12.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: bite back\nB: follow him\nC: run away\nD: plays with the black dog\nE: turn around", + "question": "what does the light brown dog do after the dark brown dog turned around to face the other direction in the middle", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: bite back\nB: follow him\nC: run away\nD: plays with the black dog\nE: turn around\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: the dog rolled it over\nB: dropped from the bicycle\nC: a lady threw it\nD: thrown by man\nE: the kids brought it out", + "question": "why was there a ball on the grass nearing the end", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: the dog rolled it over\nB: dropped from the bicycle\nC: a lady threw it\nD: thrown by man\nE: the kids brought it out\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: look at bird\nB: find food\nC: hold for support\nD: try to intimidate dog\nE: show the camera", + "question": "why does the nearest parrot bite the cage before moving along it", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: look at bird\nB: find food\nC: hold for support\nD: try to intimidate dog\nE: show the camera\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_9.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: uncomfortable position\nB: playing with baby\nC: crying\nD: stopped from coming out\nE: not able to balance himself", + "question": "why does the baby constantly lean its head backwards while being carried", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: uncomfortable position\nB: playing with baby\nC: crying\nD: stopped from coming out\nE: not able to balance himself\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_7.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: rides down on a bike\nB: using sled\nC: held the edge of disc\nD: swing it by the tag\nE: walked down", + "question": "how does the man in dark green go down the slope", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: rides down on a bike\nB: using sled\nC: held the edge of disc\nD: swing it by the tag\nE: walked down\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_6.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: direct others attention\nB: take more yoghurt\nC: wanted to play\nD: ask for more water\nE: pose", + "question": "why did the baby raise his hand and smile after eating some noodles", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: direct others attention\nB: take more yoghurt\nC: wanted to play\nD: ask for more water\nE: pose\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_5.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: pushed her out\nB: removed her seat belt\nC: hold her dress\nD: hold her hands\nE: lifted her off the ground", + "question": "how did the man help the lady get out of the car", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: pushed her out\nB: removed her seat belt\nC: hold her dress\nD: hold her hands\nE: lifted her off the ground\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_4.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: clean it\nB: admiring\nC: playing\nD: control the handle grip\nE: showing it to lady", + "question": "why is the person scrubbing the utensil", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: clean it\nB: admiring\nC: playing\nD: control the handle grip\nE: showing it to lady\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_4.jpg", 
+ "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: look at flowers\nB: chase after ball\nC: swing again\nD: look at the grass\nE: chases after the lady", + "question": "what did the brown dog do when the ball was thrown the second time", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: look at flowers\nB: chase after ball\nC: swing again\nD: look at the grass\nE: chases after the lady\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_3.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: fighting with other man\nB: talk to the baby\nC: ties shoelaces\nD: falls\nE: adjust his board", + "question": "why does the man end up on the ground at the end", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: fighting with other man\nB: talk to the baby\nC: ties shoelaces\nD: falls\nE: adjust his board\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_2.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: think there s more food\nB: play with baby\nC: biting finger\nD: inexperienced to use fork\nE: suck juice from fingers", + "question": "why did the baby feed himself with his hands", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: think there s more food\nB: play with baby\nC: biting finger\nD: inexperienced to use fork\nE: suck juice from fingers\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_1.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: move his feet\nB: eat food\nC: clap hands\nD: drink water\nE: sleeping", + "question": "what does the baby do while on the seat", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: move his feet\nB: eat food\nC: clap hands\nD: drink water\nE: sleeping\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_1.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: shift the wooden thing\nB: bring to the person\nC: throw the ball\nD: lays down\nE: jump towards it", + "question": "what does the person do after picking up the ball in front of the brown dog in the middle", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: shift the wooden thing\nB: bring to the person\nC: throw the ball\nD: lays down\nE: jump towards it\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_0.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: looking at cat\nB: drinking water\nC: practicing crawling\nD: to reach the sand\nE: reading", + "question": "why is the boy lying on the floor at the start", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: looking at cat\nB: drinking water\nC: practicing crawling\nD: to reach the sand\nE: reading\n", + "input_image_path": [ + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: left and right\nB: move his arms\nC: skipping and raising legs\nD: move up and down\nE: man push his swing", + "question": "how is the man dancing in the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: left and right\nB: move his arms\nC: skipping and raising legs\nD: move up and down\nE: man 
push his swing\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: the man\nB: cat\nC: dog\nD: baby\nE: lady in pink", + "question": "who is holding the camera", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: the man\nB: cat\nC: dog\nD: baby\nE: lady in pink\n", + "input_image_path": [ + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: hold on to rope\nB: lie on the floating board\nC: hold the poles at the side\nD: use swimming float\nE: hold adult", + "question": "how did the babies support themselves as they learnt to swim", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: hold on to rope\nB: lie on the floating board\nC: hold the 
poles at the side\nD: use swimming float\nE: hold adult\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: woman is playing with baby\nB: see his reaction\nC: ensure him not fall\nD: they are watching over him\nE: see something", + "question": "why did the lady keep looking at the boy", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following 
choices.\nA: woman is playing with baby\nB: see his reaction\nC: ensure him not fall\nD: they are watching over him\nE: see something\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: rubbed his nose\nB: looking at a book\nC: he cried\nD: laughing\nE: he stared at the other man", + "question": "how did the man react when the lady put her hand on his shoulder", + "context": "You are given 16 images of sequential occurrences, 
examine the details and answer the given question.\nSelect from the following choices.\nA: rubbed his nose\nB: looking at a book\nC: he cried\nD: laughing\nE: he stared at the other man\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: making a call\nB: taking photo\nC: gesturing\nD: posing for camera\nE: acting", + "question": "why did the bald man hold his phone up", + "context": "You are given 16 images of sequential 
occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: making a call\nB: taking photo\nC: gesturing\nD: posing for camera\nE: acting\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: throwing grass\nB: exercise\nC: pose for photo\nD: point at something\nE: throw the ball", + "question": "why did the boy stretch his right arm out at the start of the video", + "context": "You are 
given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: throwing grass\nB: exercise\nC: pose for photo\nD: point at something\nE: throw the ball\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: she knock the car\nB: help to move the car\nC: nod her head\nD: give thumbs up\nE: hold the car wheels", + "question": "what did the lady in pink do after the 
lady in blue pushed the car", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: she knock the car\nB: help to move the car\nC: nod her head\nD: give thumbs up\nE: hold the car wheels\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: posing with the toy\nB: role playing\nC: trying to look cool\nD: play with baby\nE: pass to 
adult", + "question": "why does the boy in black hold the red toy up", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: posing with the toy\nB: role playing\nC: trying to look cool\nD: play with baby\nE: pass to adult\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: eating dinner\nB: playing with toy on table\nC: drinking 
water\nD: resting\nE: watchign television", + "question": "why is the boy sitting down", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: eating dinner\nB: playing with toy on table\nC: drinking water\nD: resting\nE: watchign television\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: hug adult\nB: stands on the table\nC: 
look at girl in white\nD: push girl\nE: cover her face", + "question": "what did the lady in blue do after putting her phone down at the start", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: hug adult\nB: stands on the table\nC: look at girl in white\nD: push girl\nE: cover her face\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + 
"options": "A: greeting\nB: call waiter\nC: to direct traffic\nD: teach her piano\nE: showing off flag", + "question": "why did the man and lady wave their hands in the air", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: greeting\nB: call waiter\nC: to direct traffic\nD: teach her piano\nE: showing off flag\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + 
"source": "next_qa", + "options": "A: grey\nB: orange\nC: green\nD: white and red\nE: blue", + "question": "what is the colour of the lady s bag", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: grey\nB: orange\nC: green\nD: white and red\nE: blue\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: school\nB: restaurant\nC: 
in house\nD: front porch\nE: museum", + "question": "where could this be happening", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: school\nB: restaurant\nC: in house\nD: front porch\nE: museum\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: to see where they are going\nB: look out for cars\nC: to dodge baby hitting 
her\nD: see who is behind them\nE: curious", + "question": "why do the two ladies turn their heads backwards at the start", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: to see where they are going\nB: look out for cars\nC: to dodge baby hitting her\nD: see who is behind them\nE: curious\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", 
+ "options": "A: to move away\nB: chasing\nC: play with dog\nD: comfortable\nE: brush the hair away", + "question": "why does one cat nudge the other at the beginning", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: to move away\nB: chasing\nC: play with dog\nD: comfortable\nE: brush the hair away\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": 
"next_qa", + "options": "A: minion soft toy\nB: dog\nC: cup\nD: pacifier\nE: flower", + "question": "what is the boy holding in his hand while inside the glass", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: minion soft toy\nB: dog\nC: cup\nD: pacifier\nE: flower\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: man is 
controlling\nB: live video\nC: trying to open the item\nD: talk to each other\nE: dancing", + "question": "why is the screen behind the person changing", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: man is controlling\nB: live video\nC: trying to open the item\nD: talk to each other\nE: dancing\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": 
"next_qa", + "options": "A: start claping\nB: unwrapping present\nC: show baby how to steer wheel\nD: pick up something\nE: touch ball", + "question": "what does the lady do after bending down at the start of the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: start claping\nB: unwrapping present\nC: show baby how to steer wheel\nD: pick up something\nE: touch ball\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_15.jpg" + ], + "output": "C" + }, + { + "task": 
"casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: sitting\nB: holding hands\nC: causal\nD: standing\nE: lying on sofa", + "question": "how are the people positioned around the table", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: sitting\nB: holding hands\nC: causal\nD: standing\nE: lying on sofa\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_15.jpg" + ], + "output": "A" + }, + { + "task": 
"casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: jump to another place\nB: fly away\nC: jump around\nD: walk towards the camera\nE: sit down on lap", + "question": "where did one of the birds go towards the end of the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: jump to another place\nB: fly away\nC: jump around\nD: walk towards the camera\nE: sit down on lap\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: man feed\nB: wears napkin\nC: lady feed the baby\nD: eats slowly\nE: wears a bib", + "question": "how is the food prevented from spilling onto the baby s clothings", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: man feed\nB: wears napkin\nC: lady feed the baby\nD: eats slowly\nE: wears a bib\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: pose\nB: prevent car from moving\nC: crafting\nD: clean the wheels\nE: change the wheels", + "question": "why did the lady in blue shirt hold the car wheels in the middle of the video", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: pose\nB: prevent car from moving\nC: crafting\nD: clean the wheels\nE: change the wheels\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_12.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: for food\nB: get the toy\nC: man pat sofa\nD: catch the snowball\nE: to fight with cat", + "question": "why did the dog jump up very high in the middle", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: for food\nB: get the toy\nC: man pat sofa\nD: catch the snowball\nE: to fight with cat\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: raises his hands and smiles\nB: turn to look at the lady\nC: bite it\nD: dips her ladle in\nE: disappointed", + "question": "what does the baby do after finishing the noodles at the end", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: raises his hands and smiles\nB: turn to look at the lady\nC: bite it\nD: dips her ladle in\nE: disappointed\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_next_qa", + "visual_input_component": "16 natural images", + "source": "next_qa", + "options": "A: playing the guitar\nB: graduation ceremony\nC: preview of the living room\nD: sports game\nE: book launch", + "question": "what is the video about", + "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: playing the guitar\nB: graduation ceremony\nC: preview of the living room\nD: sports game\nE: book launch\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_9.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_15.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_0_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_0_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_0_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_0_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them 
has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_1_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_1_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_1_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_1_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_2_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_2_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_2_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_2_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please 
examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_3_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_3_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_3_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_3_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_4_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_4_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_4_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_4_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + 
"context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_5_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_5_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_5_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_5_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_6_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_6_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_6_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_6_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show 
a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_7_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_7_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_7_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_7_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_8_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_8_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_8_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_8_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + 
"question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_9_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_9_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_9_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_9_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_10_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_10_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_10_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_10_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second 
image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_11_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_11_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_11_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_11_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_12_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_12_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_12_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_12_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": 
"findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_13_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_13_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_13_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_13_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_14_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_14_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_14_2.png", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_14_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + 
"visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_15_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_15_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_15_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_15_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_16_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_16_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_16_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_16_3.jpg" + ], + "output": "D" 
+ }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_17_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_17_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_17_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_17_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_18_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_18_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_18_2.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_18_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_19_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_19_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_19_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_19_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_20_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_20_1.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_20_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_20_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_21_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_21_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_21_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_21_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_22_0.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_22_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_22_2.png", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_22_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_23_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_23_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_23_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_23_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_24_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_24_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_24_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_24_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_25_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_25_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_25_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_25_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third 
image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_26_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_26_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_26_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_26_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_27_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_27_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_27_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_27_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following 
choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_28_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_28_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_28_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_28_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_29_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_29_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_29_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_29_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are 
different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_30_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_30_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_30_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_30_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_31_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_31_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_31_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_31_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell 
which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_32_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_32_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_32_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_32_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_33_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_33_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_33_2.png", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_33_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are 
given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_34_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_34_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_34_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_34_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_35_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_35_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_35_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_35_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different 
emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_36_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_36_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_36_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_36_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_37_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_37_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_37_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_37_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + 
"question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_38_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_38_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_38_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_38_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_39_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_39_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_39_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_39_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the 
second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_40_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_40_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_40_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_40_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_41_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_41_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_41_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_41_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": 
"findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_42_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_42_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_42_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_42_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_43_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_43_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_43_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_43_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + 
"visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_44_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_44_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_44_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_44_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_45_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_45_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_45_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_45_3.jpg" + ], + "output": "A" 
+ }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_46_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_46_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_46_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_46_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_47_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_47_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_47_2.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_47_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_48_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_48_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_48_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_48_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_49_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_49_1.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_49_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_49_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_50_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_50_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_50_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_50_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_51_0.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_51_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_51_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_51_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_52_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_52_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_52_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_52_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_53_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_53_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_53_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_53_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_54_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_54_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_54_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_54_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third 
image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_55_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_55_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_55_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_55_3.png" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_56_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_56_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_56_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_56_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following 
choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_57_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_57_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_57_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_57_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_58_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_58_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_58_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_58_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are 
different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_59_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_59_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_59_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_59_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_60_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_60_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_60_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_60_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell 
which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_61_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_61_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_61_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_61_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_62_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_62_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_62_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_62_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are 
given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_63_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_63_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_63_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_63_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_64_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_64_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_64_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_64_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different 
emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_65_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_65_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_65_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_65_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_66_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_66_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_66_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_66_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + 
"question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_67_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_67_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_67_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_67_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_68_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_68_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_68_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_68_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the 
second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_69_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_69_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_69_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_69_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_70_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_70_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_70_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_70_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": 
"findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_71_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_71_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_71_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_71_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_72_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_72_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_72_2.png", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_72_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + 
"visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_73_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_73_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_73_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_73_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_74_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_74_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_74_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_74_3.jpg" + ], + "output": "C" 
+ }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_75_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_75_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_75_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_75_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_76_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_76_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_76_2.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_76_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_77_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_77_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_77_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_77_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_78_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_78_1.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_78_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_78_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_79_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_79_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_79_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_79_3.png" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_80_0.png", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_80_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_80_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_80_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_81_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_81_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_81_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_81_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_82_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_82_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_82_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_82_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_83_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_83_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_83_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_83_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third 
image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_84_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_84_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_84_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_84_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_85_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_85_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_85_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_85_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following 
choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_86_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_86_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_86_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_86_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_87_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_87_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_87_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_87_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are 
different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_88_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_88_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_88_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_88_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_89_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_89_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_89_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_89_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell 
which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_90_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_90_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_90_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_90_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_91_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_91_1.png", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_91_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_91_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are 
given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_92_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_92_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_92_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_92_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_93_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_93_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_93_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_93_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different 
emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_94_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_94_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_94_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_94_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_95_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_95_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_95_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_95_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + 
"question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_96_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_96_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_96_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_96_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_97_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_97_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_97_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_97_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the 
second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_98_0.png", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_98_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_98_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_98_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_99_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_99_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_99_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_99_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": 
"findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_100_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_100_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_100_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_100_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_101_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_101_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_101_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_101_3.jpg" + ], + "output": "A" + }, + { + "task": 
"emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_102_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_102_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_102_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_102_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_103_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_103_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_103_2.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_103_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_104_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_104_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_104_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_104_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_105_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_105_1.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_105_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_105_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_106_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_106_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_106_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_106_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_107_0.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_107_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_107_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_107_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_108_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_108_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_108_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_108_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_109_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_109_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_109_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_109_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_110_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_110_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_110_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_110_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the 
third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_111_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_111_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_111_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_111_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_112_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_112_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_112_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_112_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the 
following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_113_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_113_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_113_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_113_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_114_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_114_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_114_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_114_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the 
characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_115_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_115_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_115_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_115_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_116_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_116_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_116_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_116_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please 
examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_117_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_117_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_117_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_117_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_118_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_118_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_118_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_118_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other 
images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_119_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_119_1.png", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_119_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_119_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_120_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_120_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_120_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_120_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which 
image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_121_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_121_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_121_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_121_3.png" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_122_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_122_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_122_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_122_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second 
image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_123_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_123_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_123_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_123_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_124_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_124_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_124_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_124_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": 
"findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_125_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_125_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_125_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_125_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_126_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_126_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_126_2.png", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_126_3.jpg" + ], + "output": "A" + }, + { + "task": 
"emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_127_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_127_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_127_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_127_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_128_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_128_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_128_2.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_128_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_129_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_129_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_129_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_129_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_130_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_130_1.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_130_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_130_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_131_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_131_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_131_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_131_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_132_0.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_132_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_132_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_132_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_133_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_133_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_133_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_133_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_134_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_134_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_134_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_134_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_135_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_135_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_135_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_135_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the 
third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_136_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_136_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_136_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_136_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_137_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_137_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_137_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_137_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the 
following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_138_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_138_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_138_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_138_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_139_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_139_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_139_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_139_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the 
characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_140_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_140_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_140_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_140_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_141_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_141_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_141_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_141_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please 
examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_142_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_142_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_142_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_142_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_143_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_143_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_143_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_143_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other 
images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_144_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_144_1.png", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_144_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_144_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_145_0.png", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_145_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_145_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_145_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which 
image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_146_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_146_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_146_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_146_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_147_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_147_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_147_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_147_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second 
image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_148_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_148_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_148_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_148_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_149_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_149_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_149_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_149_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": 
"findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_150_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_150_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_150_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_150_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_151_0.png", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_151_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_151_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_151_3.jpg" + ], + "output": "A" + }, + { + "task": 
"emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_152_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_152_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_152_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_152_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_153_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_153_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_153_2.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_153_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_154_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_154_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_154_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_154_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_155_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_155_1.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_155_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_155_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_156_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_156_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_156_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_156_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_157_0.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_157_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_157_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_157_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_158_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_158_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_158_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_158_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_159_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_159_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_159_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_159_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_160_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_160_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_160_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_160_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the 
third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_161_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_161_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_161_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_161_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_162_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_162_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_162_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_162_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the 
following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_163_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_163_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_163_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_163_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_164_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_164_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_164_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_164_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the 
characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_165_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_165_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_165_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_165_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_166_0.png", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_166_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_166_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_166_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please 
examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_167_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_167_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_167_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_167_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_168_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_168_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_168_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_168_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other 
images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_169_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_169_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_169_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_169_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_170_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_170_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_170_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_170_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which 
image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_171_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_171_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_171_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_171_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_172_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_172_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_172_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_172_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second 
image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_173_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_173_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_173_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_173_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_174_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_174_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_174_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_174_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": 
"findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_175_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_175_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_175_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_175_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_176_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_176_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_176_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_176_3.jpg" + ], + "output": "D" + }, + { + "task": 
"emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_177_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_177_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_177_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_177_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_178_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_178_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_178_2.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_178_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_179_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_179_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_179_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_179_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_180_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_180_1.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_180_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_180_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_181_0.png", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_181_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_181_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_181_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_182_0.jpg", + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_182_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_182_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_182_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_183_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_183_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_183_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_183_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_184_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_184_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_184_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_184_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_185_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_185_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_185_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_185_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the 
third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_186_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_186_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_186_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_186_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_187_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_187_1.png", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_187_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_187_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the 
following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_188_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_188_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_188_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_188_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_189_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_189_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_189_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_189_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the 
characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_190_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_190_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_190_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_190_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_191_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_191_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_191_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_191_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please 
examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_192_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_192_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_192_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_192_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_193_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_193_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_193_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_193_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other 
images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_194_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_194_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_194_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_194_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_195_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_195_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_195_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_195_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which 
image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_196_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_196_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_196_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_196_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_197_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_197_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_197_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_197_3.png" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second 
image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_198_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_198_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_198_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_198_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_findingemo", + "visual_input_component": "4 natural images", + "source": "findingemo", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_199_0.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_199_1.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_199_2.jpg", + "../MMIU-Benchmark/emotion_recognition_findingemo/emotion_recognition_findingemo_199_3.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": 
"var", + "options": "A: An athlete is seen standing up to a circle and leads into him throwing a discuss and his face being shown afterwards.\nB: An athlete is observed sitting in a circle, sharing his experiences about discus throw, with his face filled with pride.\nC: The athlete, after finishing his discuss throw, stood in a circle for a post-game interview, his face beaming with pride.\n\nD: The athlete, previously encircled by fans, gently hands over the discus for autographs before merrily snapping selfies showcasing his radiant smile.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: An athlete is seen standing up to a circle and leads into him throwing a discuss and his face being shown afterwards.\nB: An athlete is observed sitting in a circle, sharing his experiences about discus throw, with his face filled with pride.\nC: The athlete, after finishing his discuss throw, stood in a circle for a post-game interview, his face beaming with pride.\n\nD: The athlete, previously encircled by fans, gently hands over the discus for autographs before merrily snapping selfies showcasing his radiant smile.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_6.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_0_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man waters the sections of the vine.\nB: The man admires the growth of the vine.\nC: The man cuts a few parts of the vine.\nD: The man waters the sections of the vine.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man waters the sections of the vine.\nB: The man admires the growth of the vine.\nC: The man cuts a few parts of the vine.\nD: The man waters the sections of the vine.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_1_15.jpg" + ], + "output": "F" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: They spilled pasta from the bowl onto the floor.\nB: They spilled pasta from the bowl all over the floor.\nC: They put pasta in the bowl and stir it around.\nD: They spilled pasta from the bowl onto the floor.", + "question": "What event is most likely to have occurred during the blank 
frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: They spilled pasta from the bowl onto the floor.\nB: They spilled pasta from the bowl all over the floor.\nC: They put pasta in the bowl and stir it around.\nD: They spilled pasta from the bowl onto the floor.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_2_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A bartender explains and shows how to prepare exotic alcoholic drinks in glasses using alcohol and 
juice.\nB: A bartender spills juice and alcohol, ruining the attempt to create exotic drinks in glasses.\nC: A bartender spills alcohol and juice while clumsily attempting to create exotic alcoholic drinks in glasses.\nD: A bartender spills alcohol and juice while clumsily trying to mix exotic drinks in glasses.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A bartender explains and shows how to prepare exotic alcoholic drinks in glasses using alcohol and juice.\nB: A bartender spills juice and alcohol, ruining the attempt to create exotic drinks in glasses.\nC: A bartender spills alcohol and juice while clumsily attempting to create exotic alcoholic drinks in glasses.\nD: A bartender spills alcohol and juice while clumsily trying to mix exotic drinks in glasses.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_3_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Many glance at the camera as they calmly rest their faces and arms, without any frantic movements.\nB: Many speak to the camera while doing activities and continue to rub it all over their faces and arms.\nC: Numerous individuals adjust the camera angle during their activities and proceed to display their faces and arms prominently.\nD: Several engage with the camera during tasks and persistently clean their faces and arms.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Many glance at the camera as they calmly rest their faces and arms, without any frantic movements.\nB: Many speak to the camera while doing activities and continue to rub it all over their faces and arms.\nC: Numerous individuals adjust the camera angle during their activities and proceed to display their faces and arms prominently.\nD: Several engage with the camera during tasks and persistently clean their faces and arms.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_4_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She starts to paint her 
legs.\nB: She starts to paint her legs.\nC: She starts to paint her legs.\nD: She begins to shave her legs.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She starts to paint her legs.\nB: She starts to paint her legs.\nC: She starts to paint her legs.\nD: She begins to shave her legs.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_5_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": 
"A: A woman is gently brushing her cat on the sofa.\nB: A cat is being held down in a woman's lap.\nC: A woman is gently brushing her cat on her lap.\nD: A woman is stroking a cat on her lap.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A woman is gently brushing her cat on the sofa.\nB: A cat is being held down in a woman's lap.\nC: A woman is gently brushing her cat on her lap.\nD: A woman is stroking a cat on her lap.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_6_15.jpg" + ], + 
"output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Several more women are depicted painting the same mural, followed by close-up shots of their artwork immediately afterwards.\nB: Several more women are shown making the same jump down the track followed by slow motion shots of their jump immediately afterwards.\nC: Several more women are depicted painting on the canvas, followed by close-up images of their artwork immediately afterwards.\nD: Several more women are depicted painting the same mural on the track, accompanied by slow motion footage of their artistic process immediately afterwards.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Several more women are depicted painting the same mural, followed by close-up shots of their artwork immediately afterwards.\nB: Several more women are shown making the same jump down the track followed by slow motion shots of their jump immediately afterwards.\nC: Several more women are depicted painting on the canvas, followed by close-up images of their artwork immediately afterwards.\nD: Several more women are depicted painting the same mural on the track, accompanied by slow motion footage of their artistic process immediately afterwards.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_4.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_7_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A man wearing a chef's hat is seen speaking to the camera and leads into a completed cake made and various ingredients being poured into a bowl.\nB: A man in a chef's hat silently shows a finished cake to the camera, then begins to mix various ingredients in a bowl.\nC: A man in a chef's hat is shown tasting a finished cake, adding ingredients to a bowl, and ignoring the camera.\nD: A man in a chef's hat is seen carefully arranging various ingredients in a bowl, before showcasing a beautifully finished cake to the camera.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A man wearing a chef's hat is seen speaking to the camera and leads into a completed cake made and various ingredients being poured into a bowl.\nB: A man in a chef's hat silently shows a finished cake to the camera, then begins to mix various ingredients in a bowl.\nC: A man in a chef's hat is shown tasting a finished cake, adding ingredients to a bowl, and ignoring the camera.\nD: A man in a chef's hat is seen carefully arranging various ingredients in a bowl, before showcasing a beautifully finished cake to the camera.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_8_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + 
"visual_input_component": "16 natural images", + "source": "var", + "options": "A: Sentence: A man is adjusting the wall clock by the window and glances at the halfway drawn curtains.\nB: man is walking by a halfway and puth the courtains in the wall by the window.\nC: A man is painting a wall near a window and hangs a picture next to the curtains in the hallway.\nD: The man is fixing the clock on the wall next to the window in the hallway.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Sentence: A man is adjusting the wall clock by the window and glances at the halfway drawn curtains.\nB: man is walking by a halfway and puth the courtains in the wall by the window.\nC: A man is painting a wall near a window and hangs a picture next to the curtains in the hallway.\nD: The man is fixing the clock on the wall next to the window in the hallway.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_9.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_9_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A dog is seen running through a yard and performing various frisbee tricks with a woman.\nB: A woman is observed teaching a dog how to paint in a studio.\nC: A woman is observed teaching a dog to sit and stay in a yard.\nD: A woman is seen sitting in a yard, grooming her dog and teaching it obedience commands.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A dog is seen running through a yard and performing various frisbee tricks with a woman.\nB: A woman is observed teaching a dog how to paint in a studio.\nC: A woman is observed teaching a dog to sit and stay in a yard.\nD: A woman is seen sitting in a yard, grooming her dog and teaching it obedience commands.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_10_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She is demonstrating how to unclog the sink with a toothbrush.\nB: She is demonstrating how to paint a sink with a toothbrush.\nC: 
She is showing how to clean the sink using a toothbrush.\nD: She is teaching how to paint a sink with a toothbrush.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She is demonstrating how to unclog the sink with a toothbrush.\nB: She is demonstrating how to paint a sink with a toothbrush.\nC: She is showing how to clean the sink using a toothbrush.\nD: She is teaching how to paint a sink with a toothbrush.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_11_15.jpg" + ], + 
"output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man in black is walking away from us.\nB: The man in black turns his back to us.\nC: We see a man in black from the front.\nD: The man in black turned around.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man in black is walking away from us.\nB: The man in black turns his back to us.\nC: We see a man in black from the front.\nD: The man in black turned around.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_12_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Scenes of various pole vaulters vaulting before audiences are shown.\nB: Various pole vaulters are signing autographs for audiences.\nC: Various pole vaulters are signing autographs for their audiences.\nD: Pole vaulters are signing autographs for their enthusiastic audiences.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Scenes of various pole vaulters vaulting before audiences are shown.\nB: Various pole vaulters are signing autographs for audiences.\nC: Various pole vaulters are signing autographs for their audiences.\nD: Pole vaulters are signing autographs for their enthusiastic audiences.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_13_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He carefully studies the ball, examining its texture and weight.\nB: He spins around several times with the ball.\nC: He gently tosses the ball back and forth.\nD: He gently sleeps with the ball beside him.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He carefully studies the ball, examining its texture and weight.\nB: He spins around several times with the ball.\nC: He gently tosses the ball back and forth.\nD: He gently sleeps with the ball beside him.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_14_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Bryan McBride, a man known for his calm demeanor, sits pensively in a quiet corner, engrossed in a thick book.\nB: A man named BRYAN MCBRIDE is standing and then begins his high jump where he clears it, lands and vigorously cheers as he 
runs off.\nC: Bryan McBride, a prominent figure, is seated at a conference, attentively listening and occasionally nodding in agreement.\nD: BRYAN MCBRIDE, a well-known individual, is calmly reading a book, completely absorbed in its captivating plot.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Bryan McBride, a man known for his calm demeanor, sits pensively in a quiet corner, engrossed in a thick book.\nB: A man named BRYAN MCBRIDE is standing and then begins his high jump where he clears it, lands and vigorously cheers as he runs off.\nC: Bryan McBride, a prominent figure, is seated at a conference, attentively listening and occasionally nodding in agreement.\nD: BRYAN MCBRIDE, a well-known individual, is calmly reading a book, completely absorbed in its captivating plot.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_15_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man in black is walking away from us.\nB: The man in black turns his back to us.\nC: We see a man in black from the front.\nD: The man in black turned around.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man in black is walking away from us.\nB: The man in black turns his back to us.\nC: We see a man in black from the front.\nD: The man in black turned around.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_16_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She is arguing with two men in a conference room.\nB: She is discussing a business plan with two men at a conference.\nC: She is swiming next to two men in a pool.\nD: She is discussing business with two men in a meeting.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She is arguing with two men in a conference room.\nB: She is discussing a business plan with two men at a conference.\nC: She is swiming next to two men in a pool.\nD: She is discussing business with two men in a meeting.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_17_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Several clips depict people feeding and caring for bulls.\nB: Several clips depict individuals feeding and caring for bulls.\nC: More clips are shown of people taunting bulls.\nD: People are seen in the clips, feeding the 
bulls.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Several clips depict people feeding and caring for bulls.\nB: Several clips depict individuals feeding and caring for bulls.\nC: More clips are shown of people taunting bulls.\nD: People are seen in the clips, feeding the bulls.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_18_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + 
"options": "A: The individual slips on a pair of gloves, adjusting them before sitting down comfortably.\nB: The person then puts a pair of shoes on and tying them and ending by standing up straight.\nC: The person grabs a pair of shoes, tossing them out the window, and then reclines on the couch.\nD: The person quickly slips on a pair of shoes, promptly kicks a ball, and finally assumes a defensive stance.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The individual slips on a pair of gloves, adjusting them before sitting down comfortably.\nB: The person then puts a pair of shoes on and tying them and ending by standing up straight.\nC: The person grabs a pair of shoes, tossing them out the window, and then reclines on the couch.\nD: The person quickly slips on a pair of shoes, promptly kicks a ball, and finally assumes a defensive stance.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_9.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_19_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A shirtless man is sunbathing near the pole vault area.\n\nB: A shirtless guy does a pole vault.\nC: A shirtless man lounges by the pool.\nD: A shirtless man is sunbathing by the pool.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A shirtless man is sunbathing near the pole vault area.\n\nB: A shirtless guy does a pole vault.\nC: A shirtless man lounges by the pool.\nD: A shirtless man is sunbathing by the pool.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_20_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A man dressed in a blue and black uniform is standing on top of a mat preparing to do his routine.\nB: A man in a blue and black uniform is sitting on a mat, taking a break from his training.\nC: A man in a blue and black uniform is sitting on a mat, tying 
his shoelaces.\nD: A man in a blue and black uniform is sitting on a mat, tying his shoelaces.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man dressed in a blue and black uniform is standing on top of a mat preparing to do his routine.\nB: A man in a blue and black uniform is sitting on a mat, taking a break from his training.\nC: A man in a blue and black uniform is sitting on a mat, tying his shoelaces.\nD: A man in a blue and black uniform is sitting on a mat, tying his shoelaces.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_21_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A large vegetable is being planted and watered.\nB: A large vegetable is being planted and nurtured.\nC: A large vegetable is being planted and watered.\nD: A large vegetable is being peeled and chopped.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A large vegetable is being planted and watered.\nB: A large vegetable is being planted and nurtured.\nC: A large vegetable is being planted and watered.\nD: A large vegetable is being peeled and chopped.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_22_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She pulls out a tissue paper from the bag.\nB: She removes the tissue paper from the bag.\nC: She removes some tissue paper from the bag.\nD: She adds more tissue paper to the bag.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She pulls out a tissue paper from the bag.\nB: She removes the tissue paper from the bag.\nC: She removes some tissue paper from the bag.\nD: She adds more tissue paper to the bag.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_9.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_23_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The person removes their shoe, places it under the sunlight to dry, and then puts it back on.\nB: The person then takes their shoe off to run under the water and then put on again.\nC: The person removes their shoe to shake out a pebble before putting it back on.\nD: The person removes their shoe, places it in the sunlight to dry, and then wears it again.\n", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The person removes their shoe, places it under the sunlight to dry, and then puts it back on.\nB: The person then takes their shoe off to run under the water and then put on again.\nC: The person removes their shoe to shake out a pebble before putting it back on.\nD: The person removes their shoe, places it in the sunlight to dry, and then wears it again.\n\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_24_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A person is seen walking in with a tire on a plank and painting the tire.\nB: A 
person is observed rolling a tire on a plank and polishing it.\nC: A person is spotted using a plank to roll a tire into a recycling facility.\nD: A person is observed rolling a tire on a plank and washing it.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A person is seen walking in with a tire on a plank and painting the tire.\nB: A person is observed rolling a tire on a plank and polishing it.\nC: A person is spotted using a plank to roll a tire into a recycling facility.\nD: A person is observed rolling a tire on a plank and washing it.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_25_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A man is brushing his teeth in front of the camera.\nB: A man is juggling balls for the camera's amusement.\nC: A man is posing for a selfie in front of the camera.\nD: A man is eating an apple in front of the camera.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man is brushing his teeth in front of the camera.\nB: A man is juggling balls for the camera's amusement.\nC: A man is posing for a selfie in front of the camera.\nD: A man is eating an apple in front of the camera.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_26_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The lady squeezes a lemon into oil, combines it with other ingredients, grates something over it, and mixes everything together.\nB: The lady collects the ingredients, including a lemon and oil, writes something on top of the recipe card, and rates it before tidying up.\nC: The lady paints a lemon, places it on an oil canvas, integrates other elements, inscribes a name on top, and mix the colors thoroughly.\nD: The lady juices a lemon and pours it in oil and adds other ingredients and rates something on top before stirring it up.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The lady squeezes a lemon into oil, combines it with other ingredients, grates something over it, and mixes everything together.\nB: The lady collects the ingredients, including a lemon and oil, writes something on top of the recipe card, and rates it before tidying up.\nC: The lady paints a lemon, places it on an oil canvas, integrates other elements, inscribes a name on top, and mix the colors thoroughly.\nD: The lady juices a lemon and pours it in oil and adds other ingredients and rates something on top before stirring it up.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_27_15.jpg" + ], + "output": "D" + }, + { + "task": 
"casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: In the end, he starts to relax a bit, before ultimately falling asleep.\nB: At the end he begins to struggle bit, but finally finished.\nC: He peacefully surrendered at the end, but initially put up a fight.\nD: In the end, he starts to relax a bit, but eventually falls asleep.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: In the end, he starts to relax a bit, before ultimately falling asleep.\nB: At the end he begins to struggle bit, but finally finished.\nC: He peacefully surrendered at the end, but initially put up a fight.\nD: In the end, he starts to relax a bit, but eventually falls asleep.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_28_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She removed the contact from her eye.\nB: She lost her contact from her eye.\nC: She removed the contact from her eye.\nD: She put the contact into her eye.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She removed the contact from her eye.\nB: She lost her contact from her eye.\nC: She removed the contact from her eye.\nD: She put the contact into her eye.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_9.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_29_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He sits down and stretches his body out.\nB: He jumps into the air and flips his body around.\nC: He sits on the ground and stills his body completely.\nD: He lays on the ground and stretches his body out.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He sits down and stretches his body out.\nB: He jumps into the air and flips his body around.\nC: He sits on the ground and stills his body completely.\nD: He lays on the ground and stretches his body out.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_30_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Numerous individuals are portrayed sitting around the table, ready to deal cards for a game of poker.\nB: Several people are then shown standing around the table preparing to throw the ball in the cups.\nC: Several people are shown 
sitting around the table, sharing stories over cups of coffee.\nD: Numerous individuals are displayed sitting around the table, engrossed in a conversation.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Numerous individuals are portrayed sitting around the table, ready to deal cards for a game of poker.\nB: Several people are then shown standing around the table preparing to throw the ball in the cups.\nC: Several people are shown sitting around the table, sharing stories over cups of coffee.\nD: Numerous individuals are displayed sitting around the table, engrossed in a conversation.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_12.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_31_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He struggles to hold back his tears, wipes them away, then smiles.\nB: He tries with all of his might, lifts it up then puts it down.\nC: He glances with uncertainty, sets it aside, and then walks away.\nD: He effortlessly picks it up, spins it around, then places it back down.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He struggles to hold back his tears, wipes them away, then smiles.\nB: He tries with all of his might, lifts it up then puts it down.\nC: He glances with uncertainty, sets it aside, and then walks away.\nD: He effortlessly picks it up, spins it around, then places it back down.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_7.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_32_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A boy stands on a track field.\nB: A boy sketches a track field.\nC: A boy sleeps on a track field.\nD: A boy sketches a track field.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A boy stands on a track field.\nB: A boy sketches a track field.\nC: A boy sleeps on a track field.\nD: A boy sketches a track field.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_33_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The potato rolled off the board onto the floor.\nB: The potato then gets sliced on a board.\nC: The potato was planted in the garden.\nD: The potato is planted in the garden.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of 
sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The potato rolled off the board onto the floor.\nB: The potato then gets sliced on a board.\nC: The potato was planted in the garden.\nD: The potato is planted in the garden.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_34_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The woman swept the floor, placing the shoes neatly on the rack.\nB: woman grab the shoes from the floor and wear them.\nC: Woman left the shoes on the floor and walked away 
barefoot.\nD: The woman tossed the shoes from the floor into a donation box.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman swept the floor, placing the shoes neatly on the rack.\nB: woman grab the shoes from the floor and wear them.\nC: Woman left the shoes on the floor and walked away barefoot.\nD: The woman tossed the shoes from the floor into a donation box.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_35_15.jpg" + ], + "output": "B" + }, + { + "task": 
"casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A child paints a picture as an adult frames it on the wall.\nB: A child washes and dry their hands leaving the water on which and adult turns off.\nC: An adult reads a book while a child playfully turns the pages.\nD: A child playfully chases an adult around the garden, leaving their hands dirty.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A child paints a picture as an adult frames it on the wall.\nB: A child washes and dry their hands leaving the water on which and adult turns off.\nC: An adult reads a book while a child playfully turns the pages.\nD: A child playfully chases an adult around the garden, leaving their hands dirty.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_36_15.jpg" + ], + "output": "G" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She tosses the tomato slices into a salad, then toasts the bread for a side dish.\nB: She tosses the tomato slices into the salad, then uses the bread to scoop up the remaining mayo.\nC: She cuts the tomato into slices, then spreads mayo onto the bread before applying the tomatoes.\nD: She plants tomato seeds in the garden, then bakes fresh bread, waiting for the tomatoes to ripen.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She tosses the tomato slices into a salad, then toasts the bread for a side dish.\nB: She tosses the tomato slices into the salad, then uses the bread to scoop up the remaining mayo.\nC: She cuts the tomato into slices, then spreads mayo onto the bread before applying the tomatoes.\nD: She plants tomato seeds in the garden, then bakes fresh bread, waiting for the tomatoes to ripen.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_37_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: People on the bridge gasp and watch her dive into the 
water below.\nB: People on the bridge gasp and record videos as she slips and falls into the water.\nC: People on the bridge gasp and hold their breath as she slips and falls into the water.\nD: People on the bridge smile and take pictures of her swinging back and forth over the water.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: People on the bridge gasp and watch her dive into the water below.\nB: People on the bridge gasp and record videos as she slips and falls into the water.\nC: People on the bridge gasp and hold their breath as she slips and falls into the water.\nD: People on the bridge smile and take pictures of her swinging back and forth over the water.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_38_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Suddenly, the woman hits the red shoe with a toothbrush.\nB: Then, the woman talks a toothbrush and brushes the red shoe.\nC: Suddenly, the woman throws the toothbrush at the red shoe in frustration.\nD: Next, the woman uses a toothbrush to scrub the red shoe meticulously.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Suddenly, the woman hits the red shoe with a toothbrush.\nB: Then, the woman talks a toothbrush and brushes the red shoe.\nC: Suddenly, the woman throws the toothbrush at the red shoe in frustration.\nD: Next, the woman uses a toothbrush to scrub the red shoe meticulously.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_6.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_39_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A person is seen laying plaster onto a roof and using a shovel to flatten it out.\nB: A person is observed removing plaster from a roof with a shovel.\nC: A person is seen using a shovel to remove plaster from a roof.\nD: A person is observed shoveling snow off a roof and spreading salt to prevent ice formation.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A person is seen laying plaster onto a roof and using a shovel to flatten it out.\nB: A person is observed removing plaster from a roof with a shovel.\nC: A person is seen using a shovel to remove plaster from a roof.\nD: A person is observed shoveling snow off a roof and spreading salt to prevent ice formation.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_40_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She stubs her leg against the table, mid-conversation.\nB: She chats animatedly while applying lotion to her leg.\nC: She begins 
washing her leg with the soap while talking.\nD: While chatting, she starts sketching on her leg with a marker.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She stubs her leg against the table, mid-conversation.\nB: She chats animatedly while applying lotion to her leg.\nC: She begins washing her leg with the soap while talking.\nD: While chatting, she starts sketching on her leg with a marker.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_41_15.jpg" + ], + "output": "C" 
+ }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He carefully places it beside his body for a nap.\nB: He swings it around his body several times.\nC: He carries it gently in his arms across the room.\nD: He drapes it gently over his shoulders.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He carefully places it beside his body for a nap.\nB: He swings it around his body several times.\nC: He carries it gently in his arms across the room.\nD: He drapes it gently over his shoulders.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_42_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The woman uses her front hair to test the sharpness of the new scissors.\nB: The woman uses the brush to sweep up her fallen front hair from the floor.\nC: Suddenly, the woman snips off a portion of her front hair with scissors.\nD: Then the woman takes a portion of her front hair and combs it with the brush.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman uses her front hair to test the sharpness of the new scissors.\nB: The woman uses the brush to sweep up her fallen front hair from the floor.\nC: Suddenly, the woman snips off a portion of her front hair with scissors.\nD: Then the woman takes a portion of her front hair and combs it with the brush.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_43_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Pictures depict people using their body parts to create fire-inspired artwork.\nB: More pictures are shown of fire as well as people putting their body parts over it.\nC: Pictures depict people using their body parts to paint images of fire.\nD: Pictures are displayed of people painting fire and their body parts with vibrant colors.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Pictures depict people using their body parts to create fire-inspired artwork.\nB: More pictures are shown of fire as well as people putting their body parts over it.\nC: Pictures depict people using their body parts to paint images of fire.\nD: Pictures are displayed of people painting fire and their body parts with vibrant colors.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_44_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man, after waving to the crowd, is seen picking up the same throw he previously used, now in slow 
motion.\nB: The same throw is shown again in slow motion followed by the man waving to the crowd.\nC: The man waves to the crowd before he throws, this time in fast motion.\nD: The man from the crowd is swiftly caught by the same throw, before he could wave again.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man, after waving to the crowd, is seen picking up the same throw he previously used, now in slow motion.\nB: The same throw is shown again in slow motion followed by the man waving to the crowd.\nC: The man waves to the crowd before he throws, this time in fast motion.\nD: The man from the crowd is swiftly caught by the same throw, before he could wave again.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_45_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: We see a lady sitting a table drilling holes in a pumpkin as kids watch.\nB: A woman at a table is reading a book to children, with a pumpkin sitting idly nearby.\nC: The kids observe a lady at a table, carving intricate designs into a pumpkin.\nD: A lady at a table is reading a spooky story to children, while a pumpkin sits untouched.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: We see a lady sitting a table drilling holes in a pumpkin as kids watch.\nB: A woman at a table is reading a book to children, with a pumpkin sitting idly nearby.\nC: The kids observe a lady at a table, carving intricate designs into a pumpkin.\nD: A lady at a table is reading a spooky story to children, while a pumpkin sits untouched.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_46_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The knife is carefully placed next to the block, before being propped up against a piece of kitchen 
steel.\nB: The knife is carefully placed next to the block, while the piece of kitchen steel is used to straighten a bent fork.\nC: The knife is gently placed in the block, followed by a quick wipe on a kitchen steel piece.\nD: The knife is then moved back and forth across the block and then over a piece of kitchen steel.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The knife is carefully placed next to the block, before being propped up against a piece of kitchen steel.\nB: The knife is carefully placed next to the block, while the piece of kitchen steel is used to straighten a bent fork.\nC: The knife is gently placed in the block, followed by a quick wipe on a kitchen steel piece.\nD: The knife is then moved back and forth across the block and then over a piece of kitchen steel.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_47_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A man displays a photo of a woman hoisting a white flag, scrutinizing it in the dim light.\nB: A man displays a photo of a woman waving a white flag, as he carefully adjusts his camera settings.\nC: A woman raises a white flag and the man's shot is shown again in slow motion.\nD: A man gifts a white flag to a woman who is shown cooking in a slow-motion video.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A man displays a photo of a woman hoisting a white flag, scrutinizing it in the dim light.\nB: A man displays a photo of a woman waving a white flag, as he carefully adjusts his camera settings.\nC: A woman raises a white flag and the man's shot is shown again in slow motion.\nD: A man gifts a white flag to a woman who is shown cooking in a slow-motion video.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_48_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The woman carefully cleaned the lens before placing it back in its case.\nB: The 
woman placed the lens on the table next to her eye and examined it before carefully packing it away.\nC: The woman put the lens on side of her eye and blink and then removed the lens again.\nD: The woman accidentally dropped the lens from her eye onto the table.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman carefully cleaned the lens before placing it back in its case.\nB: The woman placed the lens on the table next to her eye and examined it before carefully packing it away.\nC: The woman put the lens on side of her eye and blink and then removed the lens again.\nD: The woman accidentally dropped the lens from her eye onto the table.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_12.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_49_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The lady packs apples, nuts, and carrots, ignoring her untouched salad.\nB: The lady collects apples, nuts, and carrots, ignoring her untouched salad.\nC: The lady shows us her salad then adds apples, nuts and carrots.\nD: The lady packs apples, nuts, and carrots, ignoring the salad she initially showed us.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The lady packs apples, nuts, and carrots, ignoring her untouched salad.\nB: The lady collects apples, nuts, and carrots, ignoring her untouched salad.\nC: The lady shows us her salad then adds apples, nuts and carrots.\nD: The lady packs apples, nuts, and carrots, ignoring the salad she initially showed us.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_7.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_50_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A person is seen holding a stick and wacking a pinata in the middle of a party.\nB: A person is noticed presenting a stick as a gift at a serene pinata ceremony.\nC: A person is observed handing over a stick to a child at a peaceful birthday gathering.\nD: A person is spotted passing a stick to a child during a calm gathering.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A person is seen holding a stick and wacking a pinata in the middle of a party.\nB: A person is noticed presenting a stick as a gift at a serene pinata ceremony.\nC: A person is observed handing over a stick to a child at a peaceful birthday gathering.\nD: A person is spotted passing a stick to a child during a calm gathering.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_51_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A close up of food ingredients are shown followed by a person making a sandwich.\nB: A person is identifying food 
ingredients before sorting them out, instead of making a sandwich.\nC: Food ingredients are spread out for inspection before a person starts to bake a cake.\nD: A person discards food ingredients after accidentally burning their sandwich.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A close up of food ingredients are shown followed by a person making a sandwich.\nB: A person is identifying food ingredients before sorting them out, instead of making a sandwich.\nC: Food ingredients are spread out for inspection before a person starts to bake a cake.\nD: A person discards food ingredients after accidentally burning their sandwich.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_12.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_52_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man repairs a broken bar on the parallel bars.\nB: The man performs a routine on the parallel bars.\nC: The man repairs the parallel bars at the gym.\nD: The man repairs the parallel bars.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man repairs a broken bar on the parallel bars.\nB: The man performs a routine on the parallel bars.\nC: The man repairs the parallel bars at the gym.\nD: The man repairs the parallel bars.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_10.jpg", 
+ "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_53_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He buys a new pair of shoes, discarding his old ones with worn-out soles.\nB: He ties his shoelaces together, hanging the shoes off his backpack, the soles touching.\nC: He tries to wash his shoes by kicking them in the water, the soles coming out.\nD: He ties his shoes securely, ensuring the soles are firmly attached.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He buys a new pair of shoes, discarding his old ones with worn-out soles.\nB: He ties his shoelaces together, hanging the shoes off his backpack, the soles touching.\nC: He tries to wash his shoes by kicking them in the water, the soles coming out.\nD: He ties his shoes securely, ensuring the soles are firmly attached.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_54_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Eventually, he places an ax on a log and strolls off.\nB: Ultimately, he places a book on a log and strolls off.\nC: 
Finally he swings an ax onto a log and walks away.\nD: Eventually, he places an ax beside a log and leaves.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Eventually, he places an ax on a log and strolls off.\nB: Ultimately, he places a book on a log and strolls off.\nC: Finally he swings an ax onto a log and walks away.\nD: Eventually, he places an ax beside a log and leaves.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_55_15.jpg" + ], + "output": "C" + }, + { + "task": 
"casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A person in an orange shirt is sitting on the stairs, tying his shoelaces near the slide.\nB: A person in an orange shirt stands on the stairs next to the slide.\nC: A person in an orange shirt is fixing the slide next to the stairs.\nD: A person in an orange shirt is fixing the slide next to the stairs.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A person in an orange shirt is sitting on the stairs, tying his shoelaces near the slide.\nB: A person in an orange shirt stands on the stairs next to the slide.\nC: A person in an orange shirt is fixing the slide next to the stairs.\nD: A person in an orange shirt is fixing the slide next to the stairs.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_56_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man packs all the ingredients into jars, places the dough into a ceramic bowl, and sprinkles in extra chocolate chips.\nB: The man blends all the ingredients together and lays the dough out on a pan and adding more chocolate chips.\nC: The man gathers all the ingredients, rolls the dough into balls, stuffs them with chocolate chips, and chills them in the fridge.\nD: The man gathers all the ingredients, kneads the dough on a clean surface, and sprinkles it with raisins.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man packs all the ingredients into jars, places the dough into a ceramic bowl, and sprinkles in extra chocolate chips.\nB: The man blends all the ingredients together and lays the dough out on a pan and adding more chocolate chips.\nC: The man gathers all the ingredients, rolls the dough into balls, stuffs them with chocolate chips, and chills them in the fridge.\nD: The man gathers all the ingredients, kneads the dough on a clean surface, and sprinkles it with raisins.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_57_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", 
+ "source": "var", + "options": "A: An intro leads into several clips of people performing impressive flips off a high dive.\nB: An intro transitions into multiple snippets of individuals enjoying serene swims around a high dive.\nC: An intro transitions into various snippets of individuals engaging in intense debates from a high-rise building.\nD: An intro segues into a compilation of individuals fearlessly bungee jumping from a towering bridge.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: An intro leads into several clips of people performing impressive flips off a high dive.\nB: An intro transitions into multiple snippets of individuals enjoying serene swims around a high dive.\nC: An intro transitions into various snippets of individuals engaging in intense debates from a high-rise building.\nD: An intro segues into a compilation of individuals fearlessly bungee jumping from a towering bridge.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_58_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The girl sits down and starts sketching the bars on her canvas.\nB: The girl jumps up and begins performing a routine on the bars.\nC: The girl sits down and starts drawing sketches on the bars.\nD: The girl sits down and starts sketching the bars on her drawing pad.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The girl sits down and starts sketching the bars on her canvas.\nB: The girl jumps up and begins performing a routine on the bars.\nC: The girl sits down and starts drawing sketches on the bars.\nD: The girl sits down and starts sketching the bars on her drawing pad.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_59_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The camera captures the person diligently sketching landscapes from different angles.\nB: The person continues laying plaster down while the camera pans around him from 
various sides.\nC: The person gingerly sips his coffee, completely oblivious, as the camera stealthily captures him from different angles.\nD: The individual pauses to sip his coffee as the lens captures him from multiple angles.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The camera captures the person diligently sketching landscapes from different angles.\nB: The person continues laying plaster down while the camera pans around him from various sides.\nC: The person gingerly sips his coffee, completely oblivious, as the camera stealthily captures him from different angles.\nD: The individual pauses to sip his coffee as the lens captures him from multiple angles.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_60_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A young woman is standing at the foot of a pole vault track.\nB: A young woman is tying her shoelaces at the end of a pole vault track.\nC: A young woman is tying her shoelaces at the beginning of a pole vault runway.\nD: A young woman is tying her shoelaces at the base of a pole vault track.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A young woman is standing at the foot of a pole vault track.\nB: A young woman is tying her shoelaces at the end of a pole vault track.\nC: A young woman is tying her shoelaces at the beginning of a pole vault runway.\nD: A young woman is tying her shoelaces at the base of a pole vault track.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_6.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_61_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He slowly walks towards the tall high jump beam, admiring its structure.\nB: He then runs full speed and jumps a tall high jump beam.\nC: He leisurely walks and ducks under a low high jump beam.\nD: He leisurely strolls and stops to gaze at the tall high jump beam.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He slowly walks towards the tall high jump beam, admiring its structure.\nB: He then runs full speed and jumps a tall high jump beam.\nC: He leisurely walks and ducks under a low high jump beam.\nD: He leisurely strolls and stops to gaze at the tall high jump beam.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_62_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A camera pans around a wooden floor and shows a person walking downstairs.\nB: A person abruptly drops a camera on a wooden floor before racing upstairs.\nC: A person ascends 
upstairs, their footfalls echoing on the wooden floor, while a camera lies unused.\nD: A person picks up a fallen camera from the wooden floor at the bottom of the stairs.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A camera pans around a wooden floor and shows a person walking downstairs.\nB: A person abruptly drops a camera on a wooden floor before racing upstairs.\nC: A person ascends upstairs, their footfalls echoing on the wooden floor, while a camera lies unused.\nD: A person picks up a fallen camera from the wooden floor at the bottom of the stairs.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_63_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: man is in a living room painting a couch with purle spray.\nB: Sentence: A man in a living room is relaxing on a purple couch.\nC: Sentence: In a living room, a man is vacuuming a purple couch.\nD: Sentence: In a living room, a man is vacuuming crumbs off a purple couch.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: man is in a living room painting a couch with purle spray.\nB: Sentence: A man in a living room is relaxing on a purple couch.\nC: Sentence: In a living room, a man is vacuuming a purple couch.\nD: Sentence: In a living room, a man is vacuuming crumbs off a purple couch.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_9.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_64_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A man and is dog are seen standing in the middle of a fenced in area performing tricks with frisbees.\nB: A man and his dog are spotted relaxing in a fenced yard, enjoying a peaceful afternoon nap.\nC: A man and his dog are enjoying a quiet picnic in a fenced park, sharing sandwiches.\nD: A man and his dog are calmly watching the sunset from a fenced backyard, completely engrossed in the tranquil scene.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A man and is dog are seen standing in the middle of a fenced in area performing tricks with frisbees.\nB: A man and his dog are spotted relaxing in a fenced yard, enjoying a peaceful afternoon nap.\nC: A man and his dog are enjoying a quiet picnic in a fenced park, sharing sandwiches.\nD: A man and his dog are calmly watching the sunset from a fenced backyard, completely engrossed in the tranquil scene.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_65_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He cooks a fish, and discards the 
bones.\nB: He studies a fish, then releases it back into the water.\nC: He reels in a fish, and removes the hook.\nD: He photographs a fish, and releases it back into the water.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He cooks a fish, and discards the bones.\nB: He studies a fish, then releases it back into the water.\nC: He reels in a fish, and removes the hook.\nD: He photographs a fish, and releases it back into the water.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_66_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man dismounts the horse, kneels to pet the calf gently, then rises and strolls away.\nB: Then, the man get down the horse and kneels to tie the legs of the calf, then the man raises and walk.\nC: The man dismounts the horse, gently strokes the calf, and then strolls away leisurely.\nD: The man dismounts the horse, kneels to pet the calf, then stands up and strolls.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man dismounts the horse, kneels to pet the calf gently, then rises and strolls away.\nB: Then, the man get down the horse and kneels to tie the legs of the calf, then the man raises and walk.\nC: The man dismounts the horse, gently strokes the calf, and then strolls away leisurely.\nD: The man dismounts the horse, kneels to pet the calf, then stands up and strolls.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_7.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_67_15.jpg" + ], + "output": "J" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He carefully arranges his tools next to the kindling, ready for tomorrow's campfire.\nB: He carefully arranges his tools around the kindling, preparing for a camping demonstration.\nC: He starts striking his tools together over the kindling to start the fire.\nD: He gently places his tools beside the kindling, preparing to organize his workshop.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He carefully arranges his tools next to the kindling, ready for tomorrow's campfire.\nB: He carefully arranges his tools around the kindling, preparing for a camping demonstration.\nC: He starts striking his tools together over the kindling to start the fire.\nD: He gently places his tools beside the kindling, preparing to organize his workshop.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_68_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: They start to remove ornaments from the Christmas tree.\nB: They start to remove ornaments from 
the Christmas tree.\nC: They begin to put decorations onto the Christmas tree.\nD: They decide to chop down the Christmas tree for firewood.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: They start to remove ornaments from the Christmas tree.\nB: They start to remove ornaments from the Christmas tree.\nC: They begin to put decorations onto the Christmas tree.\nD: They decide to chop down the Christmas tree for firewood.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_69_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: woman is slicing a chocolate cake and its decorating it, put a second floor and some pink fondam.\nB: Sentence: The woman, discarding the second floor, removed the pink fondam and stopped decorating the chocolate cake.\nC: Sentence: A woman is stacking a second floor on a chocolate cake and draping it with pink fondam, without slicing or decorating it.\nD: Sentence: The woman, tired of baking, stashed the chocolate cake and pink fondam, opting to read a novel on her second floor.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: woman is slicing a chocolate cake and its decorating it, put a second floor and some pink fondam.\nB: Sentence: The woman, discarding the second floor, removed the pink fondam and stopped decorating the chocolate cake.\nC: Sentence: A woman is stacking a second floor on a chocolate cake and draping it with pink fondam, without slicing or decorating it.\nD: Sentence: The woman, tired of baking, stashed the chocolate cake and pink fondam, opting to read a novel on her second floor.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_4.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_70_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: They break the pitcher into pieces.\nB: They broke the pitcher into pieces.\nC: They pour that into a pitcher.\nD: They wash the pitcher in the sink.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: They break the pitcher into pieces.\nB: They broke the pitcher into pieces.\nC: They pour that into a pitcher.\nD: They wash the pitcher in the sink.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_71_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A man is seen sitting on the ice speaking to the camera and leads into several shots of him grabbing fish from a pole.\nB: A man is caught on camera lounging on the beach, narrating to the camera while pointing to a pole where various fish are hung.\nC: A man is caught on camera, relaxing on 
a boat while casting his fishing pole into the water.\nD: A man is spotted standing on the beach, throwing fish back into the ocean, after removing them from a pole.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man is seen sitting on the ice speaking to the camera and leads into several shots of him grabbing fish from a pole.\nB: A man is caught on camera lounging on the beach, narrating to the camera while pointing to a pole where various fish are hung.\nC: A man is caught on camera, relaxing on a boat while casting his fishing pole into the water.\nD: A man is spotted standing on the beach, throwing fish back into the ocean, after removing them from a pole.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_72_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A child climbs monkey bars until reach the others side.\nB: A child paints a picture of monkey bars on the other side of the room.\nC: A child paints a picture until the others arrive.\nD: A child draws a picture of monkey bars on the side of his notebook.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A child climbs monkey bars until reach the others side.\nB: A child paints a picture of monkey bars on the other side of the room.\nC: A child paints a picture until the others arrive.\nD: A child draws a picture of monkey bars on the side of his notebook.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_7.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_73_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A big chunk of snow is on the roof of a car.\nB: A car drives over a large mound of snow.\nC: A car drives away, shaking a large chunk of snow off its roof.\nD: A car drives over a big chunk of snow.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A big chunk of snow is on the roof of a car.\nB: A car drives over a large mound of snow.\nC: A car drives away, shaking a large chunk of snow off its roof.\nD: A car drives over a big chunk of snow.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_74_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The lady squeezes a lemon into oil, combines it with other ingredients, grates something over it, and mixes everything together.\nB: The lady collects the ingredients, including a lemon and oil, writes something on top of the recipe card, and 
rates it before tidying up.\nC: The lady paints a lemon, places it on an oil canvas, integrates other elements, inscribes a name on top, and mix the colors thoroughly.\nD: The lady juices a lemon and pours it in oil and adds other ingredients and rates something on top before stirring it up.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The lady squeezes a lemon into oil, combines it with other ingredients, grates something over it, and mixes everything together.\nB: The lady collects the ingredients, including a lemon and oil, writes something on top of the recipe card, and rates it before tidying up.\nC: The lady paints a lemon, places it on an oil canvas, integrates other elements, inscribes a name on top, and mix the colors thoroughly.\nD: The lady juices a lemon and pours it in oil and adds other ingredients and rates something on top before stirring it up.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_9.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_75_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She gently grips the lens in her hands, illustrating how to clean it thoroughly before usage.\nB: She moves her hands around while holding onto the lens and leads into her demonstrating how to put one in your eye.\nC: While holding the lens, she quickly withdraws her hands, demonstrating how to remove it from your eye.\nD: She guides her hands to carefully fix the lens on her camera, showcasing her photographic expertise.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She gently grips the lens in her hands, illustrating how to clean it thoroughly before usage.\nB: She moves her hands around while holding onto the lens and leads into her demonstrating how to put one in your eye.\nC: While holding the lens, she quickly withdraws her hands, demonstrating how to remove it from your eye.\nD: She guides her hands to carefully fix the lens on her camera, showcasing her photographic expertise.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_76_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The two exchange 
glances before diving into the container to retrieve the sunken treasure.\nB: The two examine the container, then exchange a puzzled glance.\nC: The two take a drink from the container and nod to one another.\nD: The two exchange glances across the room, the container untouched between them.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The two exchange glances before diving into the container to retrieve the sunken treasure.\nB: The two examine the container, then exchange a puzzled glance.\nC: The two take a drink from the container and nod to one another.\nD: The two exchange glances across the room, the container untouched between them.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_12.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_77_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: First, she brushes her hair, then she secures it into a neat bun.\nB: First she lets the rollers heat up and she puts them onto her hair.\nC: First, she allows the rollers to cool down before she removes them from her hair.\nD: First she collects the rollers, then she begins to neatly organize them in her drawer.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: First, she brushes her hair, then she secures it into a neat bun.\nB: First she lets the rollers heat up and she puts them onto her hair.\nC: First, she allows the rollers to cool down before she removes them from her hair.\nD: First she collects the rollers, then she begins to neatly organize them in her drawer.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_6.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_78_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He cleans his electric razor after using it on his beard.\nB: He uses an electric razor to trim and shave his beard.\nC: He charges his electric razor on the bathroom counter before leaving for work.\nD: He charges his electric razor with a portable power bank when traveling.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He cleans his electric razor after using it on his beard.\nB: He uses an electric razor to trim and shave his beard.\nC: He charges his electric razor on the bathroom counter before leaving for work.\nD: He charges his electric razor with a portable power bank when traveling.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_79_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Sentence: Amidst a serene landscape, the camera focuses on the people quietly sitting on boards at the hill's crest.\nB: The people continue to ride the boards down a 
hill while the camera pans around himself as well as the area around them.\nC: The camera focuses on the people resting on the hill, their boards beside them, capturing a panoramic view of the surrounding area.\nD: Sentence: Amidst a bustling market, the people carry boards up a hill as the camera captures their determination and the vibrant surroundings.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Sentence: Amidst a serene landscape, the camera focuses on the people quietly sitting on boards at the hill's crest.\nB: The people continue to ride the boards down a hill while the camera pans around himself as well as the area around them.\nC: The camera focuses on the people resting on the hill, their boards beside them, capturing a panoramic view of the surrounding area.\nD: Sentence: Amidst a bustling market, the people carry boards up a hill as the camera captures their determination and the vibrant surroundings.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_80_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man carefully places the weight in the middle of the bar, sits down for a while, then stands up and gently sets the bar back on the ground.\nB: The man then picks it up, squats, jumps to open his legs then quickly drops the bar and pushes the weight towards the middle of the bar and rests for a little bit.\nC: The man gently sets the bar down, stretches his legs, then strolls to the center of the bar for a brief reprieve.\nD: The man gently lifts the bar, settles into a steady stance, carefully positions the weight to the center of the bar, and takes a brief respite.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man carefully places the weight in the middle of the bar, sits down for a while, then stands up and gently sets the bar back on the ground.\nB: The man then picks it up, squats, jumps to open his legs then quickly drops the bar and pushes the weight towards the middle of the bar and rests for a little bit.\nC: The man gently sets the bar down, stretches his legs, then strolls to the center of the bar for a brief reprieve.\nD: The man gently lifts the bar, settles into a steady stance, carefully positions the weight to the center of the bar, and takes a brief respite.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_81_15.jpg" + ], + "output": 
"B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: An introduction comes onto the screen for a video about a curling game.\nB: The screen displays a curling game instead of the expected video introduction.\nC: The video screen flickers as it transitions from the curling game to an introduction on chess strategies.\nD: The screen displays a curling game interrupted by an unexpected introduction.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: An introduction comes onto the screen for a video about a curling game.\nB: The screen displays a curling game instead of the expected video introduction.\nC: The video screen flickers as it transitions from the curling game to an introduction on chess strategies.\nD: The screen displays a curling game interrupted by an unexpected introduction.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_9.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_82_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The Hispanic man shared his meal with the black man, leading to hearty laughter and mutual respect between them.\nB: The black man and the Hispanic man sat in silence after a heated argument, their smiles replaced with stern expressions.\nC: The black man and hispanic man are working together on a project, concentrating deeply and exchanging innovative ideas.\nD: The black man is in disbelief and they're laughing and having a good time and they re-do it and the hispanic man wins again and more smiles continue.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The Hispanic man shared his meal with the black man, leading to hearty laughter and mutual respect between them.\nB: The black man and the Hispanic man sat in silence after a heated argument, their smiles replaced with stern expressions.\nC: The black man and hispanic man are working together on a project, concentrating deeply and exchanging innovative ideas.\nD: The black man is in disbelief and they're laughing and having a good time and they re-do it and the hispanic man wins again and more smiles continue.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_83_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_var", + 
"visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man reads beneath the shade of the tulle tree.\nB: The man collects tulle from the tree and folds it neatly.\nC: The man cuts tulle and sticks it in the tree.\nD: The man finds tulle tangled in the tree and carefully removes it.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man reads beneath the shade of the tulle tree.\nB: The man collects tulle from the tree and folds it neatly.\nC: The man cuts tulle and sticks it in the tree.\nD: The man finds tulle tangled in the tree and carefully removes it.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_12.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_84_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The woman placed her brush and blow dryer on the counter, deciding to let her bangs air dry instead.\nB: The woman is holding a brush and blow dryer and began blow drying her bangs.\nC: The woman put down her brush and blow dryer, deciding to let her bangs air dry instead.\nD: The woman, equipped with a brush and blow dryer, decided to style her bangs into loose waves.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman placed her brush and blow dryer on the counter, deciding to let her bangs air dry instead.\nB: The woman is holding a brush and blow dryer and began blow drying her bangs.\nC: The woman put down her brush and blow dryer, deciding to let her bangs air dry instead.\nD: The woman, equipped with a brush and blow dryer, decided to style her bangs into loose waves.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_5.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_85_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He later walks to the high jump and decides to skip it.\nB: He then runs to the high jump and completes it.\nC: He then walks to the high jump and judges it.\nD: He then walks to the high jump and dismantles it.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He later walks to the high jump and decides to skip it.\nB: He then runs to the high jump and completes it.\nC: He then walks to the high jump and judges it.\nD: He then walks to the high jump and dismantles it.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_86_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: people is gathered around a table serving food.\nB: Sentence: People are debating around a table in a conference room.\nC: People are arguing around a table with papers scattered.\nD: People are debating fiercely around a table.", 
+ "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: people is gathered around a table serving food.\nB: Sentence: People are debating around a table in a conference room.\nC: People are arguing around a table with papers scattered.\nD: People are debating fiercely around a table.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_87_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": 
"A: He gently places the hefty book from his face to his stomach while lounging on the couch.\nB: He carefully places the heavy weight on his stomach, then gently rolls it off near his face.\nC: He lifts the heavy weight high up near his face and lifts it back down near his stomach.\nD: He gently cradles the newborn baby close to his face, then lowers her down to rest near his stomach.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He gently places the hefty book from his face to his stomach while lounging on the couch.\nB: He carefully places the heavy weight on his stomach, then gently rolls it off near his face.\nC: He lifts the heavy weight high up near his face and lifts it back down near his stomach.\nD: He gently cradles the newborn baby close to his face, then lowers her down to rest near his stomach.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_88_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He carefully places the pale of sticks on the ground, ensuring it doesn't topple over as the fire blazes nearby.\nB: He finally gets the pale of sticks lite and as the fire grows the pale of sticks fall.\nC: He neatly arranges the pale of sticks, which promptly topples over, scattering sticks everywhere.\nD: He carefully stacks the pale of sticks, ensuring they don't fall as he adds each one.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He carefully places the pale of sticks on the ground, ensuring it doesn't topple over as the fire blazes nearby.\nB: He finally gets the pale of sticks lite and as the fire grows the pale of sticks fall.\nC: He neatly arranges the pale of sticks, which promptly topples over, scattering sticks everywhere.\nD: He carefully stacks the pale of sticks, ensuring they don't fall as he adds each one.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_89_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She dries her hair with a towel and discards 
the tissue paper.\nB: She rolls up a towel and puts it in tissue paper.\nC: She dries her face with a towel and discards the used tissue paper.\nD: She dries her face with a towel and discards the used tissue paper.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She dries her hair with a towel and discards the tissue paper.\nB: She rolls up a towel and puts it in tissue paper.\nC: She dries her face with a towel and discards the used tissue paper.\nD: She dries her face with a towel and discards the used tissue paper.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_90_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Sentence: The man repairs a homemade snow shovel on a small road.\nB: The man uses a home made snow shovel to clear away small road.\nC: Sentence: The man accidentally breaks his homemade snow shovel while fixing a small road.\nD: The man repurposes a homemade snow shovel into a gardening tool in his small road-side garden.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Sentence: The man repairs a homemade snow shovel on a small road.\nB: The man uses a home made snow shovel to clear away small road.\nC: Sentence: The man accidentally breaks his homemade snow shovel while fixing a small road.\nD: The man repurposes a homemade snow shovel into a gardening tool in his small road-side garden.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_7.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_91_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A guy is trying to lite a pale of sticks in a round barrole.\nB: A man is organizing a pile of sticks in a cylindrical barrel.\nC: A man is sorting a bucket of twigs in a circular barrel.\nD: A man is stacking sticks into a round barrel.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A guy is trying to lite a pale of sticks in a round barrole.\nB: A man is organizing a pile of sticks in a cylindrical barrel.\nC: A man is sorting a bucket of twigs in a circular barrel.\nD: A man is stacking sticks into a round barrel.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_92_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man in the blue shirt hands over a bench stone and a knife to the other man, suggesting they try carving sculptures.\nB: In a room, two men, one in a blue shirt, examine a bench stone; the man in blue 
illustrates how to polish it with a specific lubricant, disregarding the knife he initially intended to sharpen.\nC: Two men are in a room and the man with a blue shirt takes out a bench stone and with a little lubricant on the stone takes an knife and explains how to sharpen it.\nD: In a room, a man in a blue shirt shows his companion how to use a bench stone to polish a gem, instead of sharpening a knife.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man in the blue shirt hands over a bench stone and a knife to the other man, suggesting they try carving sculptures.\nB: In a room, two men, one in a blue shirt, examine a bench stone; the man in blue illustrates how to polish it with a specific lubricant, disregarding the knife he initially intended to sharpen.\nC: Two men are in a room and the man with a blue shirt takes out a bench stone and with a little lubricant on the stone takes an knife and explains how to sharpen it.\nD: In a room, a man in a blue shirt shows his companion how to use a bench stone to polish a gem, instead of sharpening a knife.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_6.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_93_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: While chatting to her online audience, she's seen displaying an ironing board picture, unplugging the iron, and crumpling a shirt.\nB: She's shown plugging in the iron and folding up a shirt while still speaking to the camera and showing a picture of an ironing board.\nC: While chatting with her online viewers, she exhibits a photo of an ironing board before unplugging the iron and crumpling a shirt.\nD: She's seen snapping a picture of an ironing board, turning off the iron, and unfolding a shirt for the camera.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: While chatting to her online audience, she's seen displaying an ironing board picture, unplugging the iron, and crumpling a shirt.\nB: She's shown plugging in the iron and folding up a shirt while still speaking to the camera and showing a picture of an ironing board.\nC: While chatting with her online viewers, she exhibits a photo of an ironing board before unplugging the iron and crumpling a shirt.\nD: She's seen snapping a picture of an ironing board, turning off the iron, and unfolding a shirt for the camera.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_94_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + 
"visual_input_component": "16 natural images", + "source": "var", + "options": "A: People are standing around a Christmas tree.\nB: People are planting a Christmas tree.\nC: People are planting a Christmas tree.\nD: People are planting a Christmas tree.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: People are standing around a Christmas tree.\nB: People are planting a Christmas tree.\nC: People are planting a Christmas tree.\nD: People are planting a Christmas tree.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_95_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Afterwards, the young men play guitars by the river.\nB: Then, the young men splits logs in the woods.\nC: Next, the young men play soccer in the park.\nD: The young men play soccer in the woods.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Afterwards, the young men play guitars by the river.\nB: Then, the young men splits logs in the woods.\nC: Next, the young men play soccer in the park.\nD: The young men play soccer in the woods.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_96_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A young child is seen standing on the side with an older man playing shuffleboard and pushing a puck.\nB: An older man is teaching a young child how to fly a kite in the park.\nC: An older man is teaching a young child to fly a kite at the park.\nD: An elder man is observing a young child enthusiastically feeding ducks by the pond.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A young child is seen standing on the side with an older man playing shuffleboard and pushing a puck.\nB: An older man is teaching a young child how to fly a kite in the park.\nC: An older man is teaching a young child to fly a kite at the park.\nD: An elder man is observing a young child enthusiastically feeding ducks by the pond.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_97_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man in the dark hospital shirt points his finger at the contact lens, explaining the effects it could 
have on the optic health of the eye.\nB: The man in the dark hospital shirt reapplies the optic solution to his finger and proceeds to demonstrate how to insert a contact lens to the eye.\nC: The man in the dark hospital shirt points his finger towards the exit, instructing the patient on how to evacuate during an emergency.\nD: The man in the dark hospital shirt playfully flicks his finger, sending the optic solution droplets onto his contact lens before engaging in an animated conversation about eye health.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man in the dark hospital shirt points his finger at the contact lens, explaining the effects it could have on the optic health of the eye.\nB: The man in the dark hospital shirt reapplies the optic solution to his finger and proceeds to demonstrate how to insert a contact lens to the eye.\nC: The man in the dark hospital shirt points his finger towards the exit, instructing the patient on how to evacuate during an emergency.\nD: The man in the dark hospital shirt playfully flicks his finger, sending the optic solution droplets onto his contact lens before engaging in an animated conversation about eye health.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_5.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_98_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man kneels to tie his shoes, and then wipes his sweaty brow with a towel.\nB: The man leans to tie his shoelaces, then he wipes his hands with a towel.\nC: The man bends to wash his face, after he dry his face with a towel.\nD: The man kneels to tie his shoe, then wipes his hands with a towel.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man kneels to tie his shoes, and then wipes his sweaty brow with a towel.\nB: The man leans to tie his shoelaces, then he wipes his hands with a towel.\nC: The man bends to wash his face, after he dry his face with a towel.\nD: The man kneels to tie his shoe, then wipes his hands with a towel.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_99_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The lady, after applying glue, places the lash gently.\nB: The lady, while laughing, accidentally spills coffee on the lash.\nC: The lady, after 
applying glue, attaches the lash.\nD: The lady talks the puts glue on the lash.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The lady, after applying glue, places the lash gently.\nB: The lady, while laughing, accidentally spills coffee on the lash.\nC: The lady, after applying glue, attaches the lash.\nD: The lady talks the puts glue on the lash.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_100_15.jpg" + ], + "output": "E" + }, + { + "task": 
"casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A person's feet are shown around a set of shoes and then begins putting socks on his feet.\nB: A person's feet are revealed as they kick off their shoes, playfully wriggling their toes in the sand.\nC: A person's feet are displayed, barefoot and shoeless, as he digs his toes into the warm sand.\nD: A person's feet are showcased, standing barefoot in the sand, shoes discarded next to him.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A person's feet are shown around a set of shoes and then begins putting socks on his feet.\nB: A person's feet are revealed as they kick off their shoes, playfully wriggling their toes in the sand.\nC: A person's feet are displayed, barefoot and shoeless, as he digs his toes into the warm sand.\nD: A person's feet are showcased, standing barefoot in the sand, shoes discarded next to him.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_101_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Several women are in the gym, taking a yoga class to enhance their flexibility.\nB: A few women are in the gym practicing to do this pole vault, trying to do their best.\nC: A few women are in the kitchen, attempting to perfect their baking skills.\nD: A few women are in the gym, engaged in a lively yoga session, pushing their flexibility to the limit.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Several women are in the gym, taking a yoga class to enhance their flexibility.\nB: A few women are in the gym practicing to do this pole vault, trying to do their best.\nC: A few women are in the kitchen, attempting to perfect their baking skills.\nD: A few women are in the gym, engaged in a lively yoga session, pushing their flexibility to the limit.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_102_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The child watches the kayak drift away in the river.\nB: A child 
observes a kayak floating in the river.\nC: we see a child ride a kayak in a river.\nD: A child catches a fish in a river from a kayak.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The child watches the kayak drift away in the river.\nB: A child observes a kayak floating in the river.\nC: we see a child ride a kayak in a river.\nD: A child catches a fish in a river from a kayak.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_103_15.jpg" + ], + "output": "C" 
+ }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Someone is putting water into a bowl.\nB: Someone is drinking water from a bowl.\nC: Someone is drinking water from a bowl.\nD: Someone is drinking water from a bowl.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Someone is putting water into a bowl.\nB: Someone is drinking water from a bowl.\nC: Someone is drinking water from a bowl.\nD: Someone is drinking water from a bowl.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_104_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She leans on the rail of the bridge, watching the boats sail by.\nB: She leans on the rail of the bridge, admiring the view off the side.\nC: She climbs to the rail of the bridge, then bungee jumps off the side.\nD: She leans on the bridge rail, quietly watching the river flow beneath.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She leans on the rail of the bridge, watching the boats sail by.\nB: She leans on the rail of the bridge, admiring the view off the side.\nC: She climbs to the rail of the bridge, then bungee jumps off the side.\nD: She leans on the bridge rail, quietly watching the river flow beneath.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_105_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man interrupts his routine and falls.\nB: The man interrupts his routine and climbs.\nC: The man finishes his routine and dismounts.\nD: The man interrupts his routine and takes a break.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man interrupts his routine and falls.\nB: The man interrupts his routine and climbs.\nC: The man finishes his routine and dismounts.\nD: The man interrupts his routine and takes a break.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_106_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A large crowd is seen sitting around a field followed by a man running with a javelin and throwing it across the field.\nB: A man with a javelin is calmly walking across a field while a large crowd sits around him, watching 
attentively.\nC: A man is observed teaching a javelin throwing technique to a large crowd gathered in a field.\nD: A man is seen teaching javelin techniques to a large crowd gathered in a field.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A large crowd is seen sitting around a field followed by a man running with a javelin and throwing it across the field.\nB: A man with a javelin is calmly walking across a field while a large crowd sits around him, watching attentively.\nC: A man is observed teaching a javelin throwing technique to a large crowd gathered in a field.\nD: A man is seen teaching javelin techniques to a large crowd gathered in a field.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_107_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He examines the shingles scattered across the roof.\nB: He examines the shingles scattered across the roof.\nC: He sprays the shingles all over the roof.\nD: He inspects the shingles meticulously across the roof.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He examines the shingles scattered across the roof.\nB: He examines the shingles scattered across the roof.\nC: He sprays the shingles all over the roof.\nD: He inspects the shingles meticulously across the roof.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_108_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Another young athlete then writes the same song.\nB: Another young athlete subsequently breaks the same record.\nC: Another young athlete then writes a different story.\nD: Another young athlete then makes the same jump.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Another young athlete then writes the same song.\nB: Another young athlete subsequently breaks the same record.\nC: Another young athlete then writes a different story.\nD: Another young athlete then makes the same jump.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_109_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man playfully hides behind the giant cookie before tossing it like a frisbee towards the camera.\nB: The man presents the giant cookie to the camera, then decorates it meticulously instead of eating 
it.\nC: A shot of the giant cookie baked is shown and leads into the man holding it in front of the camera and eating it.\nD: The man carefully presents the giant cookie in front of the camera before deciding to share it with his friends.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man playfully hides behind the giant cookie before tossing it like a frisbee towards the camera.\nB: The man presents the giant cookie to the camera, then decorates it meticulously instead of eating it.\nC: A shot of the giant cookie baked is shown and leads into the man holding it in front of the camera and eating it.\nD: The man carefully presents the giant cookie in front of the camera before deciding to share it with his friends.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_110_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The bartender grabs a cup, fills it with ice, and places it under the espresso machine.\nB: Then, the bartender takes a cup and prepares a cocktail in a shaker, then he pours it in the cup.\nC: Suddenly, the bartender grabs a cup, then, instead of a cocktail, he graciously serves a steaming cup of coffee.\nD: The bartender grabs a cup, fills it with coffee from the brewer, and then hands it to the customer.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The bartender grabs a cup, fills it with ice, and places it under the espresso machine.\nB: Then, the bartender takes a cup and prepares a cocktail in a shaker, then he pours it in the cup.\nC: Suddenly, the bartender grabs a cup, then, instead of a cocktail, he graciously serves a steaming cup of coffee.\nD: The bartender grabs a cup, fills it with coffee from the brewer, and then hands it to the customer.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_111_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The girl paints 
a blue puck and hangs it on the wall.\nB: The girl uses a blue puck to play air hockey.\nC: The girl moves a blue puck back in place then pushes it forward.\nD: The girl paints a blue puck before tossing it into the pool.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The girl paints a blue puck and hangs it on the wall.\nB: The girl uses a blue puck to play air hockey.\nC: The girl moves a blue puck back in place then pushes it forward.\nD: The girl paints a blue puck before tossing it into the pool.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_112_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The men laugh and walk to the right.\nB: The men laugh and walk towards the right.\nC: The men turn and face the left.\nD: The men laugh and walk towards the right.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The men laugh and walk to the right.\nB: The men laugh and walk towards the right.\nC: The men turn and face the left.\nD: The men laugh and walk towards the right.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_113_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A person washes their hands.\nB: A person climbs a mountain.\nC: A person paints their nails.\nD: A person washes their hands.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A person washes their hands.\nB: A person climbs a mountain.\nC: A person paints their nails.\nD: A person washes their hands.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_114_15.jpg" + ], + "output": "I" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The boat sits idle on the water, tied to the dock.\nB: The boat slides down a ramp into the water.\nC: The boat is lifted from the water onto the ramp.\nD: The boat rests calmly on the water near the ramp.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The boat sits idle on the water, tied to the dock.\nB: The boat slides down a ramp into the water.\nC: The boat is lifted from the water onto the ramp.\nD: The boat rests calmly on the water near the ramp.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_7.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_115_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: One of the men reads a book and peacefully falls asleep on the sofa.\nB: One of the men drinks from a cup and falls down unconscious on the floor.\nC: One of the men sets down a cup and begins to energetically dance on the floor.\nD: One of the men places a cup on the floor and helps another to stand up.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: One of the men reads a book and peacefully falls asleep on the sofa.\nB: One of the men drinks from a cup and falls down unconscious on the floor.\nC: One of the men sets down a cup and begins to energetically dance on the floor.\nD: One of the men places a cup on the floor and helps another to stand up.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_116_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: We observe them carefully remove and frame the antique carpet from a different viewpoint.\nB: We observe them carefully 
arrange and expertly install a new carpet from a different perspective.\nC: We switch and see them rip up and lay new carpet from another angle.\nD: We change and observe them design and paint a mural from a different viewpoint.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: We observe them carefully remove and frame the antique carpet from a different viewpoint.\nB: We observe them carefully arrange and expertly install a new carpet from a different perspective.\nC: We switch and see them rip up and lay new carpet from another angle.\nD: We change and observe them design and paint a mural from a different viewpoint.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_12.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_117_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A gymnast is seen standing ready before uneven bars while many are watching on the sides.\nB: A gymnast, surrounded by onlookers, ties her shoelaces before a long run.\nC: A gymnast, surrounded by spectators, is signing autographs next to the uneven bars.\nD: A gymnast is signing autographs for fans beside the uneven bars after her performance.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A gymnast is seen standing ready before uneven bars while many are watching on the sides.\nB: A gymnast, surrounded by onlookers, ties her shoelaces before a long run.\nC: A gymnast, surrounded by spectators, is signing autographs next to the uneven bars.\nD: A gymnast is signing autographs for fans beside the uneven bars after her performance.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_6.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_118_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The little girl in the black blouse hands the lady a sponge, one at a time, as she scrubs the dishes.\nB: The little girl, holding a black blouse, helps the lady clean a sponge, one curler at a time.\nC: While the lady in the black blouse curls the child's hair the little girl is holding a sponge curler and hands it to the lady one at a time.\nD: The little girl in the black blouse hands a sponge to the lady, who is washing the dishes, one at a time.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The little girl in the black blouse hands the lady a sponge, one at a time, as she scrubs the dishes.\nB: The little girl, holding a black blouse, helps the lady clean a sponge, one curler at a time.\nC: While the lady in the black blouse curls the child's hair the little girl is holding a sponge curler and hands it to the lady one at a time.\nD: The little girl in the black blouse hands a sponge to the lady, who is washing the dishes, one at a time.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_119_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + 
"source": "var", + "options": "A: The person feeds the calf while other people observe from a distance.\nB: The person draws a picture of the calf as other people watch in admiration.\nC: The person captures the calf and other people run in afterwards.\nD: The person feeds the calf while other people watch from a distance.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The person feeds the calf while other people observe from a distance.\nB: The person draws a picture of the calf as other people watch in admiration.\nC: The person captures the calf and other people run in afterwards.\nD: The person feeds the calf while other people watch from a distance.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_120_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man leisurely strolls along the track before settling down in the large sand pit to sunbathe.\nB: The man then runs down the track and jumps into a large sand pit.\nC: The man strolls along the track before settling down beside a large sand pit.\nD: The man walks along the track and plants a tree near the large sand pit.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man leisurely strolls along the track before settling down in the large sand pit to sunbathe.\nB: The man then runs down the track and jumps into a large sand pit.\nC: The man strolls along the track before settling down beside a large sand pit.\nD: The man walks along the track and plants a tree near the large sand pit.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_121_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A shirtless man wearing long pants and red shoes is bent over and has his two hands gripping onto a 
barbell with two very large weights on the ends of it.\nB: A man in red shoes and long pants, without a shirt, leisurely ties his shoelaces, ignoring the barbell with hefty weights at his side.\nC: A shirtless man wearing long pants and red shoes is gracefully dancing on a stage, the spotlight highlighting his every move.\nD: A shirtless man in long pants and red shoes is balancing a barbell with two large weights on his shoulders.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A shirtless man wearing long pants and red shoes is bent over and has his two hands gripping onto a barbell with two very large weights on the ends of it.\nB: A man in red shoes and long pants, without a shirt, leisurely ties his shoelaces, ignoring the barbell with hefty weights at his side.\nC: A shirtless man wearing long pants and red shoes is gracefully dancing on a stage, the spotlight highlighting his every move.\nD: A shirtless man in long pants and red shoes is balancing a barbell with two large weights on his shoulders.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_7.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_122_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She wipes off some colors with a paintbrush and starts cleaning the stained canvas.\nB: She mixes some colors with a paintbrush and begins putting the paint on the blank canvas.\nC: With a paintbrush, she carefully cleans the colors off the finished canvas.\nD: She examines the blank canvas with a paintbrush in hand, deciding on the perfect palette of colors.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She wipes off some colors with a paintbrush and starts cleaning the stained canvas.\nB: She mixes some colors with a paintbrush and begins putting the paint on the blank canvas.\nC: With a paintbrush, she carefully cleans the colors off the finished canvas.\nD: She examines the blank canvas with a paintbrush in hand, deciding on the perfect palette of colors.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_123_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Two women are seen arguing over a parking spot.\nB: Two women are 
seen arguing over a book.\nC: Two women are seen debating about a political issue.\nD: Two women are shown talking about snorkling.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Two women are seen arguing over a parking spot.\nB: Two women are seen arguing over a book.\nC: Two women are seen debating about a political issue.\nD: Two women are shown talking about snorkling.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_124_15.jpg" + ], + "output": "D" + }, + 
{ + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: An adult writes a book.\nB: An adult walks into frame.\nC: An adult bursts into tears.\nD: The adult throws a frisbee out of frame.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: An adult writes a book.\nB: An adult walks into frame.\nC: An adult bursts into tears.\nD: The adult throws a frisbee out of frame.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_125_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A male African American gymnast is in a large arena full of people preparing for a performance.\nB: A male African American gymnast is teaching a group of children in a large park.\nC: A male African American gymnast is teaching young children in a crowded community center.\nD: A male African American gymnast is teaching a group of people in a large park.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A male African American gymnast is in a large arena full of people preparing for a performance.\nB: A male African American gymnast is teaching a group of children in a large park.\nC: A male African American gymnast is teaching young children in a crowded community center.\nD: A male African American gymnast is teaching a group of people in a large park.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_7.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_126_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He carefully cleans the tire, then places the hubcap and block safely to the side.\nB: He then puts a block down next to the tire as well as taking off the hubcap.\nC: He swiftly kicks the block aside, opting to adjust the tire pressure without removing the hubcap.\nD: He removes the block from beside the tire and then replaces the hubcap.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He carefully cleans the tire, then places the hubcap and block safely to the side.\nB: He then puts a block down next to the tire as well as taking off the hubcap.\nC: He swiftly kicks the block aside, opting to adjust the tire pressure without removing the hubcap.\nD: He removes the block from beside the tire and then replaces the hubcap.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_127_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A sofa sits in a room.\nB: A room is being measured for a new sofa.\nC: The room was 
cleared of everything except the sofa.\nD: A room is being measured to fit a sofa.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A sofa sits in a room.\nB: A room is being measured for a new sofa.\nC: The room was cleared of everything except the sofa.\nD: A room is being measured to fit a sofa.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_128_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural 
images", + "source": "var", + "options": "A: A person's feet and legs are shown followed by the person putting socks on and rolling their pants over the socks.\nB: A person's feet and legs are displayed, before they kick a soccer ball and sprint towards the goal.\nC: A person's feet and legs are displayed as they kick a soccer ball, then they sit to tie their shoelaces.\nD: A person's feet and legs are displayed as they kick a soccer ball and sprint across the field.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A person's feet and legs are shown followed by the person putting socks on and rolling their pants over the socks.\nB: A person's feet and legs are displayed, before they kick a soccer ball and sprint towards the goal.\nC: A person's feet and legs are displayed as they kick a soccer ball, then they sit to tie their shoelaces.\nD: A person's feet and legs are displayed as they kick a soccer ball and sprint across the field.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_129_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The sun is setting, casting a warm glow on the kite that lies forgotten on the ground.\nB: The sun set, and no longer could we see the kite lost in the sky.\nC: The kite is high in the sky and is seen against a bright sun.\nD: The sun sets, dimming its brightness as the kite lays forgotten on the ground.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The sun is setting, casting a warm glow on the kite that lies forgotten on the ground.\nB: The sun set, and no longer could we see the kite lost in the sky.\nC: The kite is high in the sky and is seen against a bright sun.\nD: The sun sets, dimming its brightness as the kite lays forgotten on the ground.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_130_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man is reading a book beside the white fence.\nB: The man is observed reading a book by the fence.\nC: The man is 
seen reading a book by the fence.\nD: The man is shown painting the fence white.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man is reading a book beside the white fence.\nB: The man is observed reading a book by the fence.\nC: The man is seen reading a book by the fence.\nD: The man is shown painting the fence white.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_131_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_var", + 
"visual_input_component": "16 natural images", + "source": "var", + "options": "A: Suddenly, the little girl sits on the floor and folds her arms.\nB: Then, the little girl jumps to the ground and extend her arms.\nC: Suddenly, the little girl falls asleep on the grass, her arms folded under her head.\nD: Suddenly, the little girl kneels on the ground and covers her face with her arms.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Suddenly, the little girl sits on the floor and folds her arms.\nB: Then, the little girl jumps to the ground and extend her arms.\nC: Suddenly, the little girl falls asleep on the grass, her arms folded under her head.\nD: Suddenly, the little girl kneels on the ground and covers her face with her arms.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_132_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She tosses several ingredients into a pot and stirs it gently with a spoon.\nB: She mixes several ingredients into a bowl and spreads it around with a spoon.\nC: She takes several ingredients from the bowl and scatters them with a spoon.\nD: She pours various ingredients into a bowl and stirs it gently with a spoon.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She tosses several ingredients into a pot and stirs it gently with a spoon.\nB: She mixes several ingredients into a bowl and spreads it around with a spoon.\nC: She takes several ingredients from the bowl and scatters them with a spoon.\nD: She pours various ingredients into a bowl and stirs it gently with a spoon.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_133_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She calmly sits and watches the water ripple gently.\nB: She then flips and dives in the water with a small 
splash.\nC: She then stands and gazes at the water, holding a small shell.\nD: She calmly sips her coffee, gazing at the water's tranquil surface.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She calmly sits and watches the water ripple gently.\nB: She then flips and dives in the water with a small splash.\nC: She then stands and gazes at the water, holding a small shell.\nD: She calmly sips her coffee, gazing at the water's tranquil surface.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_134_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A man walks up to parallel bars while spectators, competitors, and officials are in the background.\nB: A man avoids parallel bars as spectators, competitors, and officials witness his disqualification in the background.\nC: A man, amid spectators, competitors, and officials, declines to participate in the parallel bars event.\nD: A man sits down to a chess tournament, with spectators, competitors, and officials observing his every move.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man walks up to parallel bars while spectators, competitors, and officials are in the background.\nB: A man avoids parallel bars as spectators, competitors, and officials witness his disqualification in the background.\nC: A man, amid spectators, competitors, and officials, declines to participate in the parallel bars event.\nD: A man sits down to a chess tournament, with spectators, competitors, and officials observing his every move.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_5.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_135_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A child goes across monkey bars.\nB: A child draws pictures of monkey bars.\nC: A child draws pictures of monkey bars.\nD: A child draws a picture of monkey bars.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A child goes across monkey bars.\nB: A child draws pictures of monkey bars.\nC: A child draws pictures of monkey bars.\nD: A child draws a picture of monkey bars.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_136_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A man is lifting a large barbell in a competition.\nB: A man is cleaning a large barbell in a gym.\nC: A man is carefully polishing a large barbell for a display.\nD: A man is repairing a large barbell at a workshop.", + "question": "What event is most likely to 
have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man is lifting a large barbell in a competition.\nB: A man is cleaning a large barbell in a gym.\nC: A man is carefully polishing a large barbell for a display.\nD: A man is repairing a large barbell at a workshop.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_137_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She rinses the paint brush in a 
paint can, ending her day of painting.\nB: She dips the paint brush into a paint can and continues painting.\nC: She drops the paint brush into the paint can and stops working.\nD: She tosses the paint brush into a paint can and stops working.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She rinses the paint brush in a paint can, ending her day of painting.\nB: She dips the paint brush into a paint can and continues painting.\nC: She drops the paint brush into the paint can and stops working.\nD: She tosses the paint brush into a paint can and stops working.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_138_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He discards the items into the sink.\nB: He discards the items from the sink.\nC: He places the items on the sink.\nD: He throws the items into the trash.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He discards the items into the sink.\nB: He discards the items from the sink.\nC: He places the items on the sink.\nD: He throws the items into the trash.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_139_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A man is standing in a field in a small circle behind a green fence.\nB: A man is painting a small green fence in a field, encircling him.\nC: A man is repairing a green fence in a small field encircled by trees.\nD: A man is mending a green fence in a small circle within a field.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man is standing in a field in a small circle behind a green fence.\nB: A man is painting a small green fence in a field, encircling him.\nC: A man is repairing a green fence in a small field encircled by trees.\nD: A man is mending a green fence in a small circle within a field.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_6.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_140_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: They both read a book together.\nB: They both read a book together.\nC: They both go down the slide together.\nD: They both cook dinner together.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: They both read a book together.\nB: They both read a book together.\nC: They both go down the slide together.\nD: They both cook dinner together.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_141_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A woman in a pink jacket is walking her dog outdoors, leisurely enjoying the fresh air without engaging in any games or tricks.\nB: A woman in a pink jacket and her dog are enjoying a peaceful hike in the woods, observing the wildlife and resting by a serene lake.\nC: A woman in a 
pink jacket and her dog are outdoors and doing tricks wish discs as she throws them the dog catches, as well as the dog jumping over her, rolling over, dancing.\nD: A woman in a pink jacket and her dog are leisurely hiking outdoors, exploring nature trails and resting by a serene lake.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A woman in a pink jacket is walking her dog outdoors, leisurely enjoying the fresh air without engaging in any games or tricks.\nB: A woman in a pink jacket and her dog are enjoying a peaceful hike in the woods, observing the wildlife and resting by a serene lake.\nC: A woman in a pink jacket and her dog are outdoors and doing tricks wish discs as she throws them the dog catches, as well as the dog jumping over her, rolling over, dancing.\nD: A woman in a pink jacket and her dog are leisurely hiking outdoors, exploring nature trails and resting by a serene lake.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_142_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She picks up a towel, soaks it in vinegar, and begins to massage her aching feet with it.\nB: She picks up a towel, soaks it in vinegar, and begins to dab it on her sunburn for relief.\nC: She then grabs a towel,dips it in the vinegar and starts to wipe the table to clean it.\nD: She grabs a towel, soaks it in vinegar, and begins to pat dry the freshly washed vegetables.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She picks up a towel, soaks it in vinegar, and begins to massage her aching feet with it.\nB: She picks up a towel, soaks it in vinegar, and begins to dab it on her sunburn for relief.\nC: She then grabs a towel,dips it in the vinegar and starts to wipe the table to clean it.\nD: She grabs a towel, soaks it in vinegar, and begins to pat dry the freshly washed vegetables.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_143_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Only one player remained on one team, while the other 
team gained two players.\nB: Only one player from one team, and two from the other, attended the charity event.\nC: Then only one player left on one team and two players on the other one.\nD: One player remained on one team, while the other team gained two new players.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Only one player remained on one team, while the other team gained two players.\nB: Only one player from one team, and two from the other, attended the charity event.\nC: Then only one player left on one team and two players on the other one.\nD: One player remained on one team, while the other team gained two new players.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_12.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_144_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A man empties the contents of his backpack.\nB: A man puts the backpack on his back.\nC: The man rummages through his backpack.\nD: The man empties his backpack onto the table.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man empties the contents of his backpack.\nB: A man puts the backpack on his back.\nC: The man rummages through his backpack.\nD: The man empties his backpack onto the table.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_145_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Sentence: The woman folds the t-shirt and places the iron aside.\nB: woman is holding an iron and is ironing the t shirt.\nC: Sentence: The woman is folding the t-shirt and placing the iron back on its stand.\nD: Woman is folding a t-shirt after removing the iron.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Sentence: The woman folds the t-shirt and places the iron aside.\nB: woman is holding an iron and is ironing the t shirt.\nC: Sentence: The woman is folding the t-shirt and placing the iron back on its stand.\nD: Woman is folding a t-shirt after removing the iron.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_146_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: They line up one by one, waiting to purchase ice cream below.\nB: They jump off one by one, landing in the water below.\nC: They sit one by one, stargazing on the 
water's edge.\nD: They march in line, one by one, towards the dining hall.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: They line up one by one, waiting to purchase ice cream below.\nB: They jump off one by one, landing in the water below.\nC: They sit one by one, stargazing on the water's edge.\nD: They march in line, one by one, towards the dining hall.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_147_15.jpg" + ], + "output": "B" + }, + { + "task": 
"casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man dismounts the horse, kneels to pet the calf gently, then rises and strolls away.\nB: Then, the man get down the horse and kneels to tie the legs of the calf, then the man raises and walk.\nC: The man dismounts the horse, gently strokes the calf, and then strolls away leisurely.\nD: The man dismounts the horse, kneels to pet the calf, then stands up and strolls.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man dismounts the horse, kneels to pet the calf gently, then rises and strolls away.\nB: Then, the man get down the horse and kneels to tie the legs of the calf, then the man raises and walk.\nC: The man dismounts the horse, gently strokes the calf, and then strolls away leisurely.\nD: The man dismounts the horse, kneels to pet the calf, then stands up and strolls.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_148_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man gently takes the bar, serves an array of cocktails, then bows to the applauding crowd.\nB: The man then grabs the bar and does a series of flips and turns and then jumps off and nods to the crowd.\nC: The man calmly approaches the bar, orders a drink, then sits quietly, acknowledging the crowd with a brief nod.\nD: The man gently takes the bar, serves a series of cocktails, and then bows to the applauding crowd.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man gently takes the bar, serves an array of cocktails, then bows to the applauding crowd.\nB: The man then grabs the bar and does a series of flips and turns and then jumps off and nods to the crowd.\nC: The man calmly approaches the bar, orders a drink, then sits quietly, acknowledging the crowd with a brief nod.\nD: The man gently takes the bar, serves a series of cocktails, and then bows to the applauding crowd.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_149_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The 
box is effortlessly lifted by a pair of scissors, which then elegantly glide through a sheet of wrapping paper.\nB: Wrapping paper is seen cover the box followed by scissors cutting the paper and laying down a box.\nC: Scissors slice through wrapping paper before it's folded into a box.\nD: The box is sitting idle, flanked by unused wrapping paper and idle scissors.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The box is effortlessly lifted by a pair of scissors, which then elegantly glide through a sheet of wrapping paper.\nB: Wrapping paper is seen cover the box followed by scissors cutting the paper and laying down a box.\nC: Scissors slice through wrapping paper before it's folded into a box.\nD: The box is sitting idle, flanked by unused wrapping paper and idle scissors.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_150_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A man is seen looking around a field with audience members watching and leads into him running down a field and pole volting over a bar.\nB: A man, observed by a crowd, leisurely strolls across a field, pauses to examine a pole, and playfully attempts to limbo under it.\nC: A man is observed entertaining an audience by juggling balls, before sprinting across a field to pole vault over a high bar.\nD: A man walks through a field, observing the audience, before he begins a leisurely stroll down the lane, using a pole to navigate over a small creek.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A man is seen looking around a field with audience members watching and leads into him running down a field and pole volting over a bar.\nB: A man, observed by a crowd, leisurely strolls across a field, pauses to examine a pole, and playfully attempts to limbo under it.\nC: A man is observed entertaining an audience by juggling balls, before sprinting across a field to pole vault over a high bar.\nD: A man walks through a field, observing the audience, before he begins a leisurely stroll down the lane, using a pole to navigate over a small creek.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_151_15.jpg" + ], + "output": "A" + }, 
+ { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The woman tossed a frisbee for the dog, who sprinted a few feet to catch it.\nB: The woman raise a feet for the dog to jump over and catch a frisbee.\nC: The woman showed the dog a frisbee and made it sit at her feet.\nD: Sentence: The dog fetches the woman's slippers as she sits, feet propped up.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman tossed a frisbee for the dog, who sprinted a few feet to catch it.\nB: The woman raise a feet for the dog to jump over and catch a frisbee.\nC: The woman showed the dog a frisbee and made it sit at her feet.\nD: Sentence: The dog fetches the woman's slippers as she sits, feet propped up.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_152_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Clips from the game are analyzed and critiqued, while players sit quietly reflecting.\nB: More clips of the game are shown back to back as well as players cheering and celebrating.\nC: Players are seen discussing strategies, with game clips playing in the background.\nD: Players are seen studying the game's clips and reflecting on their moves, instead of cheering and celebrating.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Clips from the game are analyzed and critiqued, while players sit quietly reflecting.\nB: More clips of the game are shown back to back as well as players cheering and celebrating.\nC: Players are seen discussing strategies, with game clips playing in the background.\nD: Players are seen studying the game's clips and reflecting on their moves, instead of cheering and celebrating.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_153_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A crew of workers works on constructing a 
brick wall.\nB: A team of laborers enjoys a lunch break beside a finished brick wall.\nC: A crew of workers takes a break after demolishing a brick wall.\nD: A team of laborers is having lunch after laying bricks all morning.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A crew of workers works on constructing a brick wall.\nB: A team of laborers enjoys a lunch break beside a finished brick wall.\nC: A crew of workers takes a break after demolishing a brick wall.\nD: A team of laborers is having lunch after laying bricks all morning.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_154_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He asks the woman to hold his book as he scoops ice cream into a bowl.\nB: He escorts the woman out of the store, the forgotten ice cream cone melting on the counter.\nC: He hands the woman a cone and then puts ice cream on top.\nD: He takes a photo of the woman who dropped her ice cream cone.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He asks the woman to hold his book as he scoops ice cream into a bowl.\nB: He escorts the woman out of the store, the forgotten ice cream cone melting on the counter.\nC: He hands the woman a cone and then puts ice cream on top.\nD: He takes a photo of the woman who dropped her ice cream cone.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_155_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Several people guide the bull with sticks while someone heals the person injured by the bull.\nB: Several people taunt the bull with sticks while someone is hurt by the bull.\nC: Several people feed the bull with apples while someone is stroking the bull.\nD: Numerous individuals watched the bull peacefully from a distance as someone fed it a snack.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Several people guide the bull with sticks while someone heals the person injured by the bull.\nB: Several people taunt the bull with sticks while someone is hurt by the bull.\nC: Several people feed the bull with apples while someone is stroking the bull.\nD: Numerous individuals watched the bull peacefully from a distance as someone fed it a snack.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_156_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A bull is peacefully grazing in the field, while others lazily rest in the 
warm sunshine.\nB: A bull is then killed and laid in the dirt while others roam around him.\nC: A bull is pampered and fed in the grass as others peacefully graze around him.\nD: A bull peacefully grazes in the meadow as others frolic around him.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A bull is peacefully grazing in the field, while others lazily rest in the warm sunshine.\nB: A bull is then killed and laid in the dirt while others roam around him.\nC: A bull is pampered and fed in the grass as others peacefully graze around him.\nD: A bull peacefully grazes in the meadow as others frolic around him.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_12.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_157_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man and his dog collect several frisbees scattered on the side, while many people cheer them on.\nB: Many people watch on the side as the man performs tricks with the dog using several frisbees.\nC: Several frisbees lay unused as the man and his dog take a nap, with people passing by quietly.\nD: Several people notice a man feeding his dog with multiple frisbees on the grass.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man and his dog collect several frisbees scattered on the side, while many people cheer them on.\nB: Many people watch on the side as the man performs tricks with the dog using several frisbees.\nC: Several frisbees lay unused as the man and his dog take a nap, with people passing by quietly.\nD: Several people notice a man feeding his dog with multiple frisbees on the grass.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_5.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_158_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The kindling floats on water.\nB: The kindling is piled up in the shed.\nC: The kindling floats on the river.\nD: The kindling catches on fire.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The kindling floats on water.\nB: The kindling is piled up in the shed.\nC: The kindling floats on the river.\nD: The kindling catches on fire.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_159_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She diligently organizes her tools, demonstrating how she sorts her paint colors.\nB: She meticulously cleans her tools, explaining the color theory behind her palette selection.\nC: She continues to paint along the picture while showing off her tools and how she blends the 
colors.\nD: She meticulously organizes her art tools, demonstrating her unique approach to color categorization.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She diligently organizes her tools, demonstrating how she sorts her paint colors.\nB: She meticulously cleans her tools, explaining the color theory behind her palette selection.\nC: She continues to paint along the picture while showing off her tools and how she blends the colors.\nD: She meticulously organizes her art tools, demonstrating her unique approach to color categorization.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_160_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The girl feeds the cat with her left hand.\nB: The girl holds the cats paw in her left hand.\nC: The cat swipes a toy from the girl's left hand.\nD: The girl pours milk into the cat's bowl with her right hand.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The girl feeds the cat with her left hand.\nB: The girl holds the cats paw in her left hand.\nC: The cat swipes a toy from the girl's left hand.\nD: The girl pours milk into the cat's bowl with her right hand.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_161_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Eggs are poached, onion caramelized, eggs garnished and served.\nB: Onions are saut\u00e9ed, eggs cracked, and scrambled together.\nC: Eggs are boiled, onion chopped, eggs drained and chopped.\nD: Onions are caramelized, eggs scrambled, and both are mixed for a delicious breakfast.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Eggs are poached, onion caramelized, eggs garnished and served.\nB: Onions are saut\u00e9ed, eggs cracked, and scrambled together.\nC: Eggs are boiled, onion chopped, eggs drained and chopped.\nD: Onions are caramelized, eggs scrambled, and both are mixed for a delicious breakfast.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_162_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: woman is siting in a bed and its putting white socks.\nB: Sentence: A woman is dancing in a field, removing her white socks.\nC: Woman is 
standing on a porch, folding white socks.\nD: Sentence: Woman is standing in a kitchen, cooking pasta.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: woman is siting in a bed and its putting white socks.\nB: Sentence: A woman is dancing in a field, removing her white socks.\nC: Woman is standing on a porch, folding white socks.\nD: Sentence: Woman is standing in a kitchen, cooking pasta.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_163_15.jpg" + ], + 
"output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The coach invited them for a celebratory dinner after their successful match.\nB: The coach cheered them on and distributed water bottles during their exhausting practice.\nC: The coach watched and evaluates them to give them any tips and pointer.\nD: The coach organized a team dinner to build unity and camarity amongst them.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The coach invited them for a celebratory dinner after their successful match.\nB: The coach cheered them on and distributed water bottles during their exhausting practice.\nC: The coach watched and evaluates them to give them any tips and pointer.\nD: The coach organized a team dinner to build unity and camarity amongst them.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_9.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_164_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He previously nailed a metal piece onto the wall, now he is painting a new shingle to hang on it.\nB: He examines the metal piece he had previously nailed in before deciding to polish it instead of replacing it with a new shingle.\nC: He dusts off the metal piece he previously nailed in and removes the old shingle for replacement.\nD: He lays down a new shingle to replace it over the metal piece he previously nailed in.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He previously nailed a metal piece onto the wall, now he is painting a new shingle to hang on it.\nB: He examines the metal piece he had previously nailed in before deciding to polish it instead of replacing it with a new shingle.\nC: He dusts off the metal piece he previously nailed in and removes the old shingle for replacement.\nD: He lays down a new shingle to replace it over the metal piece he previously nailed in.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_165_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A 
man savors a unique dessert.\nB: A man changes his routine.\nC: A man does the same.\nD: A man alters the pattern.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man savors a unique dessert.\nB: A man changes his routine.\nC: A man does the same.\nD: A man alters the pattern.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_166_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + 
"source": "var", + "options": "A: The man begins to dismantle the piece.\nB: The man begins to disassemble the piece.\nC: The man proceeds to assemble the piece.\nD: The man decides to disassemble the piece.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man begins to dismantle the piece.\nB: The man begins to disassemble the piece.\nC: The man proceeds to assemble the piece.\nD: The man decides to disassemble the piece.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_167_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: Then, the man saw a speck of dust in the other eye.\nB: Later, the man removed the contact lens from the other eye.\nC: Suddenly, the man saw a tear forming in the other eye.\nD: Next, the man put the contact lens in the other eye.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Then, the man saw a speck of dust in the other eye.\nB: Later, the man removed the contact lens from the other eye.\nC: Suddenly, the man saw a tear forming in the other eye.\nD: Next, the man put the contact lens in the other eye.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_168_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He bought the shoe with cash.\nB: He scrubs the shoe with a brush.\nC: He trips over the shoe and drops the brush.\nD: He ties the shoe with a lace.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He bought the shoe with cash.\nB: He scrubs the shoe with a brush.\nC: He trips over the shoe and drops the brush.\nD: He ties the shoe with a lace.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_169_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She tosses a towel onto her legs and hands a lotion bottle to her friend.\nB: She tosses a towel onto her legs and hands her lotion to a friend.\nC: She pours lotion onto her hand and puts it on her legs with a towel.\nD: She hands the towel to her friend and uses the lotion to massage her tired legs.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She tosses a towel onto her legs and hands a lotion bottle to her friend.\nB: She tosses a towel onto her legs and hands her lotion to a friend.\nC: She pours lotion onto her hand and puts it on her legs with a towel.\nD: She hands the towel to her friend and uses the lotion to massage her tired legs.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_170_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A man is seen bending down in the middle of a forest.\nB: A man is spotted climbing a tree in the heart of a forest.\nC: A 
man is spotted climbing a tree in the heart of the forest.\nD: A man is spotted climbing a tree in the heart of a forest.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man is seen bending down in the middle of a forest.\nB: A man is spotted climbing a tree in the heart of a forest.\nC: A man is spotted climbing a tree in the heart of the forest.\nD: A man is spotted climbing a tree in the heart of a forest.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_171_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The woman, cradling the baby gently in a chair, watches the machines while the yarn lies untouched.\nB: Several shots of machines and yarn are shown as well as the woman still knitting in a chair and helping a baby.\nC: The woman in the chair gently sways the baby to sleep, while machines and colorful yarn are set aside, unused.\nD: The woman is bottle-feeding a baby in a chair while machines and yarn lie dormant in the background.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman, cradling the baby gently in a chair, watches the machines while the yarn lies untouched.\nB: Several shots of machines and yarn are shown as well as the woman still knitting in a chair and helping a baby.\nC: The woman in the chair gently sways the baby to sleep, while machines and colorful yarn are set aside, unused.\nD: The woman is bottle-feeding a baby in a chair while machines and yarn lie dormant in the background.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_5.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_172_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The girl reads a book while the two men engage in a heated debate.\nB: The two men continue to play with one another as the girl continues to watch on the side.\nC: The girl interrupts the heated debate between the two men, demanding their attention.\nD: The girl hands a book to the two men who stop their conversation to thank her.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The girl reads a book while the two men engage in a heated debate.\nB: The two men continue to play with one another as the girl continues to watch on the side.\nC: The girl interrupts the heated debate between the two men, demanding their attention.\nD: The girl hands a book to the two men who stop their conversation to thank her.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_173_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The person suddenly starts climbing the hill with a tube.\nB: The person then begins riding 
down the hill in a tube.\nC: The person starts climbing up the hill with a tube.\nD: The person then starts climbing up the hill with a backpack.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The person suddenly starts climbing the hill with a tube.\nB: The person then begins riding down the hill in a tube.\nC: The person starts climbing up the hill with a tube.\nD: The person then starts climbing up the hill with a backpack.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_174_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A child paints a picture as an adult frames it on the wall.\nB: A child washes and dry their hands leaving the water on which and adult turns off.\nC: An adult reads a book while a child playfully turns the pages.\nD: A child playfully chases an adult around the garden, leaving their hands dirty.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A child paints a picture as an adult frames it on the wall.\nB: A child washes and dry their hands leaving the water on which and adult turns off.\nC: An adult reads a book while a child playfully turns the pages.\nD: A child playfully chases an adult around the garden, leaving their hands dirty.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_9.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_175_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She bends down to tie her shoelaces, then gets up, laughing uncontrollably.\nB: She bends down to put socks on, then grabs her back in pain.\nC: She stoops to tie her shoelaces, then reaches back to adjust her ponytail.\nD: She stoops down to tie her shoelaces, and then clutches her necklace in joy.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She bends down to tie her shoelaces, then gets up, laughing uncontrollably.\nB: She bends down to put socks on, then grabs her back in pain.\nC: She stoops to tie her shoelaces, then reaches back to adjust her ponytail.\nD: She stoops down to tie her shoelaces, and then clutches her necklace in joy.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_176_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The person removes the ingredients from the bread as the camera captures her actions.\nB: The person puts more ingredients on 
the bread while the camera watches her movements.\nC: The camera captures the person as she removes ingredients from the bread.\nD: The camera captures the person as she cleans the bread crumbs off the table.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The person removes the ingredients from the bread as the camera captures her actions.\nB: The person puts more ingredients on the bread while the camera watches her movements.\nC: The camera captures the person as she removes ingredients from the bread.\nD: The camera captures the person as she cleans the bread crumbs off the table.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_12.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_177_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He is watching a fish swim beneath the ice through a clear hole.\nB: He is attempting to catch a fish through a hole in the ice.\nC: He is teaching a fish to swim through a hole in the ice.\nD: He is reading a book beside a hole in the ice, ignoring the fish.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He is watching a fish swim beneath the ice through a clear hole.\nB: He is attempting to catch a fish through a hole in the ice.\nC: He is teaching a fish to swim through a hole in the ice.\nD: He is reading a book beside a hole in the ice, ignoring the fish.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_178_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: She leaves the platform and approaches the other side.\nB: She lingers on the other side, refusing to ascend the platform.\nC: She stops halfway and retreats back from the platform.\nD: She reaches the other side and steps onto the platform.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She leaves the platform and approaches the other side.\nB: She lingers on the other side, refusing to ascend the platform.\nC: She stops halfway and retreats back from the platform.\nD: She reaches the other side and steps onto the platform.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_179_15.jpg" + ], + "output": "D" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man cleans the paint-soaked brush while the bare cabinet awaits its transformation.\nB: A man cleans a paintbrush before storing the unused paint and the bare cabinet.\nC: Paint is 
applied to a brush and the man puts a first coat onto the bare cabinet.\nD: The man cleans the paint off the brush after accidentally smearing it on the cabinet.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man cleans the paint-soaked brush while the bare cabinet awaits its transformation.\nB: A man cleans a paintbrush before storing the unused paint and the bare cabinet.\nC: Paint is applied to a brush and the man puts a first coat onto the bare cabinet.\nD: The man cleans the paint off the brush after accidentally smearing it on the cabinet.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_180_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A woman throws a frisbee for the dog to catch in a backyard.\nB: A woman finds a frisbee under a bush while gardening in her backyard, which her dog had lost.\nC: A woman plants a tree in the backyard while her dog watches.\nD: A woman is watering plants in a backyard while the dog watches.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A woman throws a frisbee for the dog to catch in a backyard.\nB: A woman finds a frisbee under a bush while gardening in her backyard, which her dog had lost.\nC: A woman plants a tree in the backyard while her dog watches.\nD: A woman is watering plants in a backyard while the dog watches.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_181_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: His trainer praises him for his improvement.\nB: His trainer congratulates him.\nC: His trainer comes towards him.\nD: His trainer submits his resignation to him.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: His trainer praises him for his improvement.\nB: His trainer congratulates him.\nC: His trainer comes towards him.\nD: His trainer submits his resignation to him.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_182_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The woman carefully feeds the dog as it eagerly catches the food in its mouth.\nB: The woman continues to throw the frisbee around while the dog chases after it and grabs it in it's mouth.\nC: The woman feeds the dog while it eagerly catches the kibble in its 
mouth.\nD: The woman feeds the dog as it sits patiently, gripping its favorite frisbee in its mouth.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman carefully feeds the dog as it eagerly catches the food in its mouth.\nB: The woman continues to throw the frisbee around while the dog chases after it and grabs it in it's mouth.\nC: The woman feeds the dog while it eagerly catches the kibble in its mouth.\nD: The woman feeds the dog as it sits patiently, gripping its favorite frisbee in its mouth.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_183_15.jpg" + ], + "output": "E" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man shares a joke with the cop as he polishes his shoes.\nB: A cop looks at him while the man cuts his hair and shaves it all off.\nC: A man plays chess with a cop as his hair is blown by the wind.\nD: A man trims his beard as the cop interrogates him about the missing evidence.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man shares a joke with the cop as he polishes his shoes.\nB: A cop looks at him while the man cuts his hair and shaves it all off.\nC: A man plays chess with a cop as his hair is blown by the wind.\nD: A man trims his beard as the cop interrogates him about the missing evidence.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_184_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The woman fills the box with old papers and seals it, preparing it for recycling.\nB: The woman opens the box, removes the paper, and untapes the ends to reveal a surprise gift inside.\nC: The woman covers the box with paper and tapes up the ends to create a finished, wrapped present.\nD: The woman opens the box, removes the paper, revealing a beautifully crafted present inside.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman fills the box with old papers and seals it, preparing it for recycling.\nB: The woman opens the box, removes the paper, and untapes the ends to reveal a surprise gift inside.\nC: The woman covers the box with paper and tapes up the ends to create a finished, wrapped present.\nD: The woman opens the box, removes the paper, revealing a beautifully crafted present inside.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_185_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He uses his long beard to cover the electric 
razor as a surprise gift.\nB: he begins to shave his long beard with an electric razor.\nC: He starts to comb his long beard with an electric brush.\nD: He starts to stroke his long beard with contemplative fingers.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He uses his long beard to cover the electric razor as a surprise gift.\nB: he begins to shave his long beard with an electric razor.\nC: He starts to comb his long beard with an electric brush.\nD: He starts to stroke his long beard with contemplative fingers.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_186_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A person is soaking calmly in the tub water.\nB: A person is kicking around in the tub water.\nC: A person is relaxing in the tub water.\nD: A person is reading a book beside the tub water.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A person is soaking calmly in the tub water.\nB: A person is kicking around in the tub water.\nC: A person is relaxing in the tub water.\nD: A person is reading a book beside the tub water.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_187_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The man paints on a piece of metal.\nB: The man solders on a piece of metal.\nC: The man accidentally stepped on a piece of metal.\nD: The man bends a piece of metal.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man paints on a piece of metal.\nB: The man solders on a piece of metal.\nC: The man accidentally stepped on a piece of metal.\nD: The man bends a piece of metal.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_188_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He sighs as the audience gasps and his mistake is replayed on the screens.\nB: He cheers while the audience cheers and his shot is shown again for cameras.\nC: He sighs as the audience jeers and his error is replayed on the jumbotron.\nD: He sighs as the audience gasps and his mistake is replayed for the cameras.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He sighs as the audience gasps and his mistake is replayed on the screens.\nB: He cheers while the audience cheers and his shot is shown again for cameras.\nC: He sighs as the audience jeers and his error is replayed on the jumbotron.\nD: He sighs as the audience gasps and his mistake is replayed for the cameras.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_189_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A man is seen speaking to the camera and leads into him pouring ice into a glass as well as various liquids.\nB: 
A man is captured on camera carefully placing ice into a glass, followed by mixing different fluids.\nC: A man is spotted on camera reading a book, then he starts filling a glass with sand and different types of grains.\nD: A man is observed on camera, carefully arranging ice and different liquids into a vibrant display.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man is seen speaking to the camera and leads into him pouring ice into a glass as well as various liquids.\nB: A man is captured on camera carefully placing ice into a glass, followed by mixing different fluids.\nC: A man is spotted on camera reading a book, then he starts filling a glass with sand and different types of grains.\nD: A man is observed on camera, carefully arranging ice and different liquids into a vibrant display.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_10.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_190_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: They laugh loudly as they walk.\nB: They hold hands as they walk.\nC: They turn flips as they go.\nD: They hold hands as they walk.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: They laugh loudly as they walk.\nB: They hold hands as they walk.\nC: They turn flips as they go.\nD: They hold hands as they walk.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_9.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_191_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A man holding the camera is inside a gate while wearing a helmet and talking with another man.\nB: A man in a helmet is handing a camera to another man outside a gate.\nC: A man, wearing a helmet, handed over his camera to another man inside a gate.\nD: A man in a helmet is arguing with another man over a camera, outside the gate.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A man holding the camera is inside a gate while wearing a helmet and talking with another man.\nB: A man in a helmet is handing a camera to another man outside a gate.\nC: A man, wearing a helmet, handed over his camera to another man inside a gate.\nD: A man in a helmet is arguing with another man over a camera, outside the gate.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_192_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The girl paints the red platform and climbs up the stairs.\nB: The girl paints a red platform 
and climbs up the stairs.\nC: The girl gets off on a red platform and walks down the stairs.\nD: The girl paints the red platform and climbs up the stairs.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The girl paints the red platform and climbs up the stairs.\nB: The girl paints a red platform and climbs up the stairs.\nC: The girl gets off on a red platform and walks down the stairs.\nD: The girl paints the red platform and climbs up the stairs.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_14.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_193_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: A group of kids watch a young boy throw a dart onto a glass window.\nB: A group of kids observe as a young boy gently places a dart onto a glass window for an art project.\nC: A bunch of children observe as a little boy sticks a drawing onto a glass window.\nD: A group of kids observe a young boy sketching a rainbow on a glass window.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A group of kids watch a young boy throw a dart onto a glass window.\nB: A group of kids observe as a young boy gently places a dart onto a glass window for an art project.\nC: A bunch of children observe as a little boy sticks a drawing onto a glass window.\nD: A group of kids observe a young boy sketching a rainbow on a glass window.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_8.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_194_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He frowns and roughly slaps his face multiple times.\nB: He touches his face several times and smiles.\nC: He slaps his face once and frowns.\nD: He slams his fist on the table and scowls.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He frowns and roughly slaps his face multiple times.\nB: He touches his face several times and smiles.\nC: He slaps his face once and frowns.\nD: He slams his fist on the table and scowls.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_195_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: The men cook the fish, narrate the recipe to the camera, and continue to showcase various fish dishes.\nB: The men presents the fish to the camera as well as speak to the camera and continue to grab and hold up fish.\nC: The men toss the 
fish back into the water, silently wave to the camera, and proceed to cast their lines for another catch.\nD: The men cook the fish while bantering for a cooking show, occasionally glancing at the camera.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The men cook the fish, narrate the recipe to the camera, and continue to showcase various fish dishes.\nB: The men presents the fish to the camera as well as speak to the camera and continue to grab and hold up fish.\nC: The men toss the fish back into the water, silently wave to the camera, and proceed to cast their lines for another catch.\nD: The men cook the fish while bantering for a cooking show, occasionally glancing at the camera.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_196_15.jpg" + ], + "output": "B" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: a man is on a snow covered lake with a fishing pole and fish reader.\nB: A man is studying a fish reader near a snow-covered lake, neglecting his abandoned fishing pole.\nC: A man is cleaning his fishing pole and fish reader, beside a snow covered lake.\nD: A man is inspecting his fishing pole and fish reader in the garage, eager for the lake to thaw.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: a man is on a snow covered lake with a fishing pole and fish reader.\nB: A man is studying a fish reader near a snow-covered lake, neglecting his abandoned fishing pole.\nC: A man is cleaning his fishing pole and fish reader, beside a snow covered lake.\nD: A man is inspecting his fishing pole and fish reader in the garage, eager for the lake to thaw.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_197_15.jpg" + ], + "output": "A" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: His shirt is neatly folded as he calmly steps on the stage.\nB: He is 
seen calmly folding his shirt before sitting down to read.\nC: His jump is shown again in slow motion as well as him taking his shirt off.\nD: His shirt is neatly folded as he sits calmly, refusing to jump into conclusions.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: His shirt is neatly folded as he calmly steps on the stage.\nB: He is seen calmly folding his shirt before sitting down to read.\nC: His jump is shown again in slow motion as well as him taking his shirt off.\nD: His shirt is neatly folded as he sits calmly, refusing to jump into conclusions.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_11.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_13.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_198_15.jpg" + ], + "output": "C" + }, + { + "task": "casuality_reasoning_var", + "visual_input_component": "16 natural images", + "source": "var", + "options": "A: He places the red saw on the tile.\nB: He broke the red tile by dropping a saw.\nC: He cuts the tile with a red saw.\nD: He paints the tile with a red saw.", + "question": "What event is most likely to have occurred during the blank frames?", + "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He places the red saw on the tile.\nB: He broke the red tile by dropping a saw.\nC: He cuts the tile with a red saw.\nD: He paints the tile with a red saw.\n", + "input_image_path": [ + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_0.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_1.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_2.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_3.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_4.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_5.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_6.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_7.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_8.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_9.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_10.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_11.jpg", + 
"../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_12.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_13.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_14.jpg", + "../MMIU-Benchmark/casuality_reasoning_var/casuality_reasoning_var_199_15.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_0_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_0_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_0_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_0_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_1_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_1_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_1_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_1_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_2_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_2_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_2_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_2_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_3_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_3_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_3_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_3_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_4_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_4_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_4_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_4_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_5_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_5_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_5_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_5_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_6_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_6_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_6_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_6_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_7_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_7_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_7_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_7_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_8_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_8_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_8_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_8_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_9_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_9_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_9_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_9_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_10_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_10_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_10_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_10_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_11_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_11_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_11_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_11_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_12_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_12_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_12_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_12_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_13_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_13_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_13_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_13_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_14_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_14_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_14_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_14_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_15_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_15_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_15_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_15_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_16_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_16_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_16_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_16_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_17_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_17_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_17_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_17_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_18_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_18_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_18_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_18_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_19_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_19_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_19_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_19_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_20_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_20_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_20_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_20_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_21_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_21_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_21_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_21_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_22_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_22_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_22_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_22_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_23_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_23_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_23_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_23_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_24_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_24_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_24_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_24_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_25_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_25_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_25_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_25_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_26_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_26_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_26_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_26_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_27_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_27_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_27_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_27_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_28_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_28_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_28_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_28_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_29_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_29_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_29_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_29_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_30_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_30_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_30_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_30_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_31_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_31_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_31_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_31_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_32_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_32_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_32_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_32_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_33_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_33_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_33_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_33_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_34_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_34_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_34_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_34_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_35_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_35_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_35_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_35_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_36_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_36_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_36_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_36_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_37_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_37_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_37_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_37_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_38_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_38_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_38_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_38_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_39_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_39_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_39_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_39_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_40_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_40_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_40_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_40_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_41_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_41_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_41_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_41_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_42_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_42_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_42_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_42_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_43_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_43_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_43_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_43_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_44_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_44_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_44_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_44_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_45_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_45_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_45_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_45_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_46_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_46_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_46_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_46_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_47_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_47_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_47_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_47_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_48_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_48_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_48_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_48_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_49_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_49_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_49_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_49_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_50_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_50_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_50_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_50_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_51_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_51_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_51_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_51_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_52_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_52_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_52_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_52_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_53_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_53_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_53_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_53_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_54_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_54_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_54_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_54_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_55_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_55_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_55_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_55_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_56_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_56_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_56_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_56_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_57_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_57_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_57_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_57_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_58_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_58_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_58_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_58_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_59_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_59_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_59_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_59_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_60_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_60_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_60_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_60_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_61_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_61_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_61_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_61_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_62_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_62_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_62_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_62_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_63_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_63_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_63_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_63_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_64_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_64_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_64_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_64_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_65_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_65_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_65_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_65_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_66_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_66_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_66_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_66_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_67_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_67_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_67_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_67_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_68_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_68_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_68_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_68_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_69_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_69_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_69_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_69_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_70_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_70_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_70_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_70_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_71_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_71_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_71_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_71_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_72_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_72_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_72_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_72_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_73_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_73_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_73_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_73_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_74_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_74_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_74_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_74_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_75_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_75_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_75_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_75_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_76_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_76_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_76_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_76_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_77_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_77_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_77_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_77_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_78_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_78_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_78_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_78_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_79_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_79_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_79_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_79_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_80_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_80_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_80_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_80_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_81_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_81_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_81_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_81_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_82_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_82_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_82_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_82_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_83_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_83_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_83_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_83_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_84_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_84_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_84_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_84_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_85_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_85_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_85_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_85_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_86_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_86_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_86_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_86_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_87_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_87_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_87_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_87_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_88_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_88_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_88_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_88_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_89_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_89_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_89_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_89_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_90_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_90_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_90_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_90_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_91_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_91_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_91_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_91_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_92_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_92_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_92_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_92_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_93_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_93_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_93_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_93_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_94_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_94_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_94_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_94_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_95_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_95_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_95_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_95_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_96_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_96_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_96_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_96_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_97_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_97_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_97_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_97_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_98_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_98_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_98_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_98_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_99_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_99_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_99_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_99_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_100_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_100_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_100_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_100_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_101_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_101_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_101_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_101_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_102_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_102_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_102_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_102_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_103_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_103_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_103_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_103_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_104_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_104_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_104_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_104_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_105_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_105_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_105_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_105_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_106_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_106_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_106_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_106_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_107_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_107_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_107_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_107_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_108_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_108_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_108_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_108_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_109_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_109_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_109_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_109_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_110_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_110_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_110_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_110_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_111_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_111_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_111_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_111_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_112_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_112_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_112_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_112_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_113_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_113_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_113_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_113_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_114_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_114_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_114_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_114_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_115_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_115_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_115_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_115_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_116_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_116_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_116_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_116_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_117_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_117_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_117_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_117_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_118_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_118_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_118_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_118_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_119_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_119_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_119_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_119_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_120_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_120_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_120_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_120_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_121_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_121_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_121_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_121_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_122_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_122_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_122_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_122_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_123_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_123_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_123_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_123_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_124_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_124_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_124_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_124_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_125_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_125_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_125_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_125_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_126_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_126_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_126_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_126_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_127_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_127_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_127_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_127_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_128_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_128_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_128_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_128_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_129_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_129_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_129_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_129_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_130_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_130_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_130_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_130_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_131_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_131_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_131_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_131_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_132_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_132_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_132_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_132_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_133_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_133_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_133_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_133_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_134_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_134_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_134_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_134_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_135_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_135_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_135_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_135_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_136_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_136_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_136_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_136_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_137_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_137_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_137_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_137_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_138_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_138_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_138_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_138_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_139_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_139_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_139_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_139_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_140_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_140_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_140_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_140_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_141_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_141_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_141_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_141_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_142_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_142_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_142_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_142_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_143_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_143_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_143_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_143_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_144_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_144_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_144_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_144_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_145_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_145_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_145_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_145_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_146_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_146_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_146_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_146_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_147_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_147_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_147_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_147_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_148_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_148_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_148_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_148_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_149_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_149_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_149_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_149_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_150_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_150_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_150_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_150_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_151_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_151_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_151_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_151_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_152_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_152_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_152_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_152_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_153_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_153_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_153_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_153_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_154_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_154_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_154_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_154_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_155_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_155_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_155_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_155_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_156_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_156_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_156_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_156_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_157_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_157_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_157_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_157_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_158_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_158_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_158_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_158_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_159_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_159_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_159_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_159_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_160_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_160_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_160_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_160_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_161_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_161_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_161_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_161_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_162_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_162_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_162_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_162_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_163_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_163_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_163_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_163_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_164_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_164_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_164_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_164_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_165_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_165_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_165_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_165_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_166_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_166_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_166_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_166_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_167_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_167_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_167_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_167_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_168_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_168_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_168_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_168_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_169_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_169_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_169_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_169_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_170_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_170_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_170_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_170_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_171_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_171_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_171_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_171_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_172_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_172_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_172_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_172_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_173_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_173_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_173_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_173_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_174_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_174_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_174_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_174_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_175_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_175_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_175_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_175_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_176_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_176_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_176_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_176_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_177_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_177_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_177_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_177_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_178_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_178_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_178_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_178_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_179_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_179_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_179_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_179_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_180_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_180_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_180_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_180_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_181_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_181_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_181_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_181_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_182_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_182_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_182_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_182_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_183_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_183_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_183_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_183_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_184_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_184_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_184_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_184_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_185_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_185_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_185_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_185_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_186_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_186_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_186_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_186_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_187_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_187_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_187_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_187_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_188_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_188_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_188_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_188_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_189_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_189_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_189_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_189_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_190_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_190_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_190_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_190_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_191_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_191_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_191_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_191_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_192_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_192_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_192_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_192_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_193_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_193_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_193_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_193_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_194_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_194_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_194_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_194_3.jpg" + ], + "output": "A" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_195_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_195_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_195_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_195_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_196_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_196_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_196_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_196_3.jpg" + ], + "output": "C" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_197_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_197_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_197_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_197_3.jpg" + ], + "output": "D" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_198_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_198_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_198_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_198_3.jpg" + ], + "output": "B" + }, + { + "task": "emotion_recognition_expw", + "visual_input_component": "4 natural images", + "source": "expw", + "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", + "question": "Which image is most likely to show a different emotion from the other images?", + "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", + "input_image_path": [ + 
"../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_199_0.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_199_1.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_199_2.jpg", + "../MMIU-Benchmark/emotion_recognition_expw/emotion_recognition_expw_199_3.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Man types on computer Woman talks to man while typing Woman smirks at something Winters day in england Blizzard hits england\nB: Man types on computer Woman talks to man while typing Woman smiles at something Winters day in england Blizzard hits england\nC: Man types on computer Woman talks to man while typing Woman smirks at something Winters day in england Snowfall hits england\nD: Man types on computer Woman talks to man while typing Woman smirks at something Winters day in england Blizzard hits russia", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Man types on computer Woman talks to man while typing Woman smirks at something Winters day in england Blizzard hits england\nB: Man types on computer Woman talks to man while typing Woman smiles at something Winters day in england Blizzard hits england\nC: Man types on computer Woman talks to man while typing Woman smirks at something Winters day in england Snowfall hits england\nD: Man types on computer Woman talks to man while typing Woman smirks at something Winters day in england Blizzard hits russia", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_0_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_0_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_0_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_0_3.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_0_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: [male] is excited about his new job in the city and decides to explore it on foot. He finds himself drawn to the beach and feels a deep connection to the sea.\nB: [female] is bored with her life , she wants to explore new places all by herself .she views the city and realizes she needs to get away and go away someplace different .the next morning she catches a bus that takes her out of the city .the bus drops her off at a port where she gets on a boat which will sail away .the boat takes her to the beach , which she feels heals her soul . she loves her new surroundings .\nC: [female] feels trapped in her routine and decides to take a plane to a foreign country. The new environment rejuvenates her, and she feels a sense of freedom and excitement.\nD: [male] is tired of the city life and decides to hike to the top of a mountain. The expansive view takes his breath away, and he feels a profound sense of peace and serenity.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: [male] is excited about his new job in the city and decides to explore it on foot. He finds himself drawn to the beach and feels a deep connection to the sea.\nB: [female] is bored with her life , she wants to explore new places all by herself .she views the city and realizes she needs to get away and go away someplace different .the next morning she catches a bus that takes her out of the city .the bus drops her off at a port where she gets on a boat which will sail away .the boat takes her to the beach , which she feels heals her soul . she loves her new surroundings .\nC: [female] feels trapped in her routine and decides to take a plane to a foreign country. 
The new environment rejuvenates her, and she feels a sense of freedom and excitement.\nD: [male] is tired of the city life and decides to hike to the top of a mountain. The expansive view takes his breath away, and he feels a profound sense of peace and serenity.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_1_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_1_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_1_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_1_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_1_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: we rode our atv to the mountain top ...and , suddenly , we were back on the beach and viewing a wondering opening .i asked my husband to take a picture ; however , he preferred to stand behind the camera .we then moved back to the mountain and saw a beautiful inlet..the inlet showed the strong power of the water moving in from the ocean . we took our picture and were ready for bed .\nB: we walked to the mountain top and took some pictures. then, we went to the beach and found a secluded spot. later, we observed the force of the ocean waves. finally, we slept under the stars.\nC: we drove our car to the mountain top and enjoyed the view. then, we went to the beach and saw a beautiful sunset. later, we captured the waves crashing on the shore. finally, we felt tired and went to bed.\nD: we hiked to the mountain top and felt the refreshing breeze. then, we visited the beach and admired the waves. later, we saw a natural wonder. 
finally, we fell asleep in our tent.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: we rode our atv to the mountain top ...and , suddenly , we were back on the beach and viewing a wondering opening .i asked my husband to take a picture ; however , he preferred to stand behind the camera .we then moved back to the mountain and saw a beautiful inlet..the inlet showed the strong power of the water moving in from the ocean . we took our picture and were ready for bed .\nB: we walked to the mountain top and took some pictures. then, we went to the beach and found a secluded spot. later, we observed the force of the ocean waves. finally, we slept under the stars.\nC: we drove our car to the mountain top and enjoyed the view. then, we went to the beach and saw a beautiful sunset. later, we captured the waves crashing on the shore. finally, we felt tired and went to bed.\nD: we hiked to the mountain top and felt the refreshing breeze. then, we visited the beach and admired the waves. later, we saw a natural wonder. finally, we fell asleep in our tent.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_2_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_2_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_2_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_2_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_2_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A couple rowing a boat in the lake\nB: A group of people having a picnic near the lake\nC: Some children are on top of a rock watching the fish in the lake. They have jumped into the lake and are playing in the lake. 
Some girls are diving to the bottom of the lake There are several waterfalls far from the lake where people are cooling off. Nearby there is a trail where people are walking in the middle of nature.\nD: A man fishing in the lake", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A couple rowing a boat in the lake\nB: A group of people having a picnic near the lake\nC: Some children are on top of a rock watching the fish in the lake. They have jumped into the lake and are playing in the lake. Some girls are diving to the bottom of the lake There are several waterfalls far from the lake where people are cooling off. Nearby there is a trail where people are walking in the middle of nature.\nD: A man fishing in the lake", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_3_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_3_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_3_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_3_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_3_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the football match was so exhilarating.\nB: the basketball game was thrilling.\nC: i was really excited to see my first hockey game .the players made their way to the ice .the game started and it was so amazing to see .the players went by so quickly .it was exhilarating .\nD: i was energized when i saw the soccer match.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the football match was so exhilarating.\nB: the basketball game was thrilling.\nC: i was really excited to see my first hockey game .the players made their way to the ice .the 
game started and it was so amazing to see .the players went by so quickly .it was exhilarating .\nD: i was energized when i saw the soccer match.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_4_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_4_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_4_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_4_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_4_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: we made a trip out to visit the location location bridge in location location .our friends from back home also came along with us on the trip .we had drinks on the beach .and played football and catch as well .everyone had a good time at the beach that day .\nB: we went to a zoo and saw some animals.\nC: we visited a park in location location .\nD: we had a picnic in the park.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: we made a trip out to visit the location location bridge in location location .our friends from back home also came along with us on the trip .we had drinks on the beach .and played football and catch as well .everyone had a good time at the beach that day .\nB: we went to a zoo and saw some animals.\nC: we visited a park in location location .\nD: we had a picnic in the park.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_5_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_5_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_5_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_5_3.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_5_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Many reporters are unimpressed by the lack of palm trees and bushes in the place.\nB: Some people like to visit old buildings that share the space with the green of nature. Some reporters are impressed by the amount of palm trees and bushes in the place. People also love to walk along the paths and observe all kinds of trees. There are also transports on site that can take visitors to more distant locations. People are also delighted with the rooms decorated with beautiful maps and period objects and tables\nC: People often dislike walking along the paths and observing all kinds of trees.\nD: Few people dislike visiting old buildings that are surrounded by nature.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Many reporters are unimpressed by the lack of palm trees and bushes in the place.\nB: Some people like to visit old buildings that share the space with the green of nature. Some reporters are impressed by the amount of palm trees and bushes in the place. People also love to walk along the paths and observe all kinds of trees. There are also transports on site that can take visitors to more distant locations. 
People are also delighted with the rooms decorated with beautiful maps and period objects and tables\nC: People often dislike walking along the paths and observing all kinds of trees.\nD: Few people dislike visiting old buildings that are surrounded by nature.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_6_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_6_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_6_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_6_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_6_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: everyone came to help clean up the swamp .even jethro , the most bitter participant of them all , helped out .it was a good chance to catch up with family .everyone ate hot dogs .they all worked together and had a good time .\nB: nobody came to help clean up the swamp .even jethro , the most bitter participant of them all , did not help out .it was a bad chance to catch up with family .nobody ate hot dogs .they all worked alone and had a bad time .\nC: some people came to help clean up the swamp .even jethro , the most bitter participant of them all , helped out only a little bit .it was a mediocre chance to catch up with family .some people ate hot dogs .they all worked together but had a mediocre time .\nD: only a few people came to help clean up the swamp .even jethro , the most bitter participant of them all , did not help out .it was not a chance to catch up with family .only a few people ate hot dogs .they all worked together but had a mediocre time .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: everyone came to help clean up the swamp .even 
jethro , the most bitter participant of them all , helped out .it was a good chance to catch up with family .everyone ate hot dogs .they all worked together and had a good time .\nB: nobody came to help clean up the swamp .even jethro , the most bitter participant of them all , did not help out .it was a bad chance to catch up with family .nobody ate hot dogs .they all worked alone and had a bad time .\nC: some people came to help clean up the swamp .even jethro , the most bitter participant of them all , helped out only a little bit .it was a mediocre chance to catch up with family .some people ate hot dogs .they all worked together but had a mediocre time .\nD: only a few people came to help clean up the swamp .even jethro , the most bitter participant of them all , did not help out .it was not a chance to catch up with family .only a few people ate hot dogs .they all worked together but had a mediocre time .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_7_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_7_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_7_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_7_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_7_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Midday there was a ship that anchored at a pier. At the pier, a sailor was folding a gangway. He looked inside his tool box to see if there is any tool that could help him with the operation. After a couple of hours passed by, another ship anchored. When the ship anchored, the gangway was unfolded, for the passengers\nB: Late in the evening there was a helicopter that took off from a helipad. At the helipad, a woman was unfolding a helicopter blade. 
She looked inside her tool box to see if there is any tool that could help her with the operation. After a couple of hours passed by, another helicopter took off. When the helicopter took off, the helicopter blade was folded.\nC: In the afternoon there was a train that arrived at a platform. At the platform, a conductor was folding a platform bridge. He looked inside his tool box to see if there is any tool that could help him with the operation. After a couple of hours passed by, another train arrived. When the train arrived, the platform bridge was unfolded, for the passengers\nD: Early in the morning there was an airplane the landed on an airport. On the air port, a men was folding a jet bridge. He looked inside his tool box to see if there is any tool the could help him with the operation. After a couple of hours passed by, another plane landed. When the plane landed, the jet bridge was unfolded, for the passengers", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Midday there was a ship that anchored at a pier. At the pier, a sailor was folding a gangway. He looked inside his tool box to see if there is any tool that could help him with the operation. After a couple of hours passed by, another ship anchored. When the ship anchored, the gangway was unfolded, for the passengers\nB: Late in the evening there was a helicopter that took off from a helipad. At the helipad, a woman was unfolding a helicopter blade. She looked inside her tool box to see if there is any tool that could help her with the operation. After a couple of hours passed by, another helicopter took off. When the helicopter took off, the helicopter blade was folded.\nC: In the afternoon there was a train that arrived at a platform. At the platform, a conductor was folding a platform bridge. He looked inside his tool box to see if there is any tool that could help him with the operation. 
After a couple of hours passed by, another train arrived. When the train arrived, the platform bridge was unfolded, for the passengers\nD: Early in the morning there was an airplane the landed on an airport. On the air port, a men was folding a jet bridge. He looked inside his tool box to see if there is any tool the could help him with the operation. After a couple of hours passed by, another plane landed. When the plane landed, the jet bridge was unfolded, for the passengers", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_8_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_8_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_8_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_8_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_8_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The images depict a marketplace, not a forest or hillside scene.\nB: No, I haven't heard about the Forest Hill side sales on green bananas.\nC: There are no people in the images, only bananas.\nD: Hey, have you heard about the Forest Hill side sales on green bananas? One dude is just staring at the camera. Dude, like, more peeps saw the camera. Hey, let's boost those bike sales in Banana! More people sell bananas on bikes.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The images depict a marketplace, not a forest or hillside scene.\nB: No, I haven't heard about the Forest Hill side sales on green bananas.\nC: There are no people in the images, only bananas.\nD: Hey, have you heard about the Forest Hill side sales on green bananas? One dude is just staring at the camera. Dude, like, more peeps saw the camera. 
Hey, let's boost those bike sales in Banana! More people sell bananas on bikes.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_9_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_9_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_9_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_9_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_9_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: it is a busy marketplace with colorful stalls and shops.\nB: it is an amazing colorful palace, to relax and learn about his ancient palace. The site has so many sites and gardens people enjoy the peaceful stroll thru the castle. Tourist enjoy a clean place to stroll thru the green colorful gardens of this beautiful palace. The view is magnificent and the museum is very clean place to visit. The place has entertaining maps and exhibitions, making sure you don't cross the velvet red ropes.\nC: it is a quiet and serene beach with crystal clear water.\nD: it is a modern skyscraper with a bustling city around it.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: it is a busy marketplace with colorful stalls and shops.\nB: it is an amazing colorful palace, to relax and learn about his ancient palace. The site has so many sites and gardens people enjoy the peaceful stroll thru the castle. Tourist enjoy a clean place to stroll thru the green colorful gardens of this beautiful palace. The view is magnificent and the museum is very clean place to visit. 
The place has entertaining maps and exhibitions, making sure you don't cross the velvet red ropes.\nC: it is a quiet and serene beach with crystal clear water.\nD: it is a modern skyscraper with a bustling city around it.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_10_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_10_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_10_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_10_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_10_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: we were anxious for our fun to halt .a ride on the swings was nerve-wracking .a little tension from the roller coaster was just what we feared .of course , we still had time to try out our aiming skills .we had so much pressure that we decided to leave in the evening .\nB: we were unsure about our fun from the start .a ride on the swings was boring .a little disappointment from the roller coaster was just what we didn't need .of course , we still had time to attempt our aiming skills .we had so much boredom that we left in the evening .\nC: we were ready for our fun to begin .a ride on the swings was exhilarating .a little thrill from the roller coaster was just what we needed .of course , we still had time to test out our shooting skills .we had so much fun that we made sure to stay into the evening .\nD: we were unprepared for our fun to end .a ride on the swings was terrifying .a little scare from the roller coaster was just what we dreaded .of course , we still had time to practice our aiming skills .we had so much stress that we were forced to leave in the evening .", + "question": "Describe this set of images briefly.", + "context": "Select 
from the following choices.\nA: we were anxious for our fun to halt .a ride on the swings was nerve-wracking .a little tension from the roller coaster was just what we feared .of course , we still had time to try out our aiming skills .we had so much pressure that we decided to leave in the evening .\nB: we were unsure about our fun from the start .a ride on the swings was boring .a little disappointment from the roller coaster was just what we didn't need .of course , we still had time to attempt our aiming skills .we had so much boredom that we left in the evening .\nC: we were ready for our fun to begin .a ride on the swings was exhilarating .a little thrill from the roller coaster was just what we needed .of course , we still had time to test out our shooting skills .we had so much fun that we made sure to stay into the evening .\nD: we were unprepared for our fun to end .a ride on the swings was terrifying .a little scare from the roller coaster was just what we dreaded .of course , we still had time to practice our aiming skills .we had so much stress that we were forced to leave in the evening .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_11_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_11_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_11_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_11_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_11_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: During a soccer game, a group of girls line up, getting ready to play. The referee blows the whistle, and the girls start sprinting at different speeds.\nB: The coach instructs his players on when the game will begin. 
A group of girls line up, waiting for the game to start.\nC: A group of students gather in the gym, preparing for a relay race. The students take off and run at varying speeds. One of the girls in pink falls behind and eventually stops running.\nD: In PE class, a gym teacher instructs her students on when a race will begin. A group of girls line up, waiting for the race to start. Off they go, and the girls sprint at different paces. A girl in pink runs but notices that she is falling behind. The girl stops, is out of breath, and decides to not run anymore.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: During a soccer game, a group of girls line up, getting ready to play. The referee blows the whistle, and the girls start sprinting at different speeds.\nB: The coach instructs his players on when the game will begin. A group of girls line up, waiting for the game to start.\nC: A group of students gather in the gym, preparing for a relay race. The students take off and run at varying speeds. One of the girls in pink falls behind and eventually stops running.\nD: In PE class, a gym teacher instructs her students on when a race will begin. A group of girls line up, waiting for the race to start. Off they go, and the girls sprint at different paces. A girl in pink runs but notices that she is falling behind. 
The girl stops, is out of breath, and decides to not run anymore.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_12_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_12_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_12_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_12_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_12_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the soccer player took a penalty kick .the referee checked the time on the scoreboard .the ball hit the goalpost and missed the goal . the other team's player tried to intercept the ball . the goalkeeper dived to save the ball .\nB: the basketball player made a dunk shot .the scoreboard displayed the team scores .the player was fouled and given free throws . he missed the free throws .he collided with another player and got injured .the game was delayed for medical attention .\nC: the baseball pitcher threw a curveball .the scoreboard showed the inning and outs .the batter hit a home run . the fans cheered and waved their banners .the team celebrated with high fives and hugs .\nD: the tennis player got ready to serve the ball .the board showed the score of the two players .it was the other player 's turn to serve . she was about to serve the ball .she accidentally fell down mid serve , and had to get help .the court was cleared off for the next match .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the soccer player took a penalty kick .the referee checked the time on the scoreboard .the ball hit the goalpost and missed the goal . the other team's player tried to intercept the ball . 
the goalkeeper dived to save the ball .\nB: the basketball player made a dunk shot .the scoreboard displayed the team scores .the player was fouled and given free throws . he missed the free throws .he collided with another player and got injured .the game was delayed for medical attention .\nC: the baseball pitcher threw a curveball .the scoreboard showed the inning and outs .the batter hit a home run . the fans cheered and waved their banners .the team celebrated with high fives and hugs .\nD: the tennis player got ready to serve the ball .the board showed the score of the two players .it was the other player 's turn to serve . she was about to serve the ball .she accidentally fell down mid serve , and had to get help .the court was cleared off for the next match .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_13_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_13_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_13_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_13_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_13_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: we had an outing at the theme park with [male] .[male] got to sit in his own part of the train .he was really cheerful to be there .he kept asking what ride we would go on next .when the ride was over though , he wanted to go again .\nB: we had a day at the kid park with [female] .[female] got to sit in her own part of the train .she was really happy to be there .she kept asking what ride we would go on next .when the ride was over though , she wanted to go again .\nC: we spent a day at the amusement park with [female] .[female] sat in her own part of the train .she was really happy to be there .she 
kept asking what ride we would attend next .when the ride was over though , she wanted to go again .\nD: they had a day at the water park with [male] .[male] got to sit in his own part of the train .he was really happy to be there .he kept asking what ride we would go on next .when the ride was over though , he wanted to go again .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: we had an outing at the theme park with [male] .[male] got to sit in his own part of the train .he was really cheerful to be there .he kept asking what ride we would go on next .when the ride was over though , he wanted to go again .\nB: we had a day at the kid park with [female] .[female] got to sit in her own part of the train .she was really happy to be there .she kept asking what ride we would go on next .when the ride was over though , she wanted to go again .\nC: we spent a day at the amusement park with [female] .[female] sat in her own part of the train .she was really happy to be there .she kept asking what ride we would attend next .when the ride was over though , she wanted to go again .\nD: they had a day at the water park with [male] .[male] got to sit in his own part of the train .he was really happy to be there .he kept asking what ride we would go on next .when the ride was over though , he wanted to go again .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_14_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_14_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_14_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_14_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_14_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The 
family got lost at the airport. The young girl and her mother were confused to see their dad. They wandered around and looked lost. They decided to go to the beach on a cloudy day. Someone from the park took a picture of them. They looked puzzled as the person took the picture.\nB: The family had a fight at the airport. The young girl and her mother were angry to see their dad. They argued with each other and looked mad. They decided to go to the movies on a rainy day. Someone from the park took a picture of them. They scowled as the person took the picture.\nC: The family finally met at the airport. The young girl and her mother were happy to see their dad. They hugged each other and smiled. They decided to head to the park on a beautiful sunny day. Someone from the park took a picture of them. They smiled as the person took the picture.\nD: The family missed their flight at the airport. The young girl and her mother were upset to see their dad. They cried and looked unhappy. They decided to stay home on a gloomy day. Someone from the park took a picture of them. They frowned as the person took the picture.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The family got lost at the airport. The young girl and her mother were confused to see their dad. They wandered around and looked lost. They decided to go to the beach on a cloudy day. Someone from the park took a picture of them. They looked puzzled as the person took the picture.\nB: The family had a fight at the airport. The young girl and her mother were angry to see their dad. They argued with each other and looked mad. They decided to go to the movies on a rainy day. Someone from the park took a picture of them. They scowled as the person took the picture.\nC: The family finally met at the airport. The young girl and her mother were happy to see their dad. They hugged each other and smiled. They decided to head to the park on a beautiful sunny day. 
Someone from the park took a picture of them. They smiled as the person took the picture.\nD: The family missed their flight at the airport. The young girl and her mother were upset to see their dad. They cried and looked unhappy. They decided to stay home on a gloomy day. Someone from the park took a picture of them. They frowned as the person took the picture.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_15_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_15_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_15_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_15_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_15_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: [male] was all set for the beach .he drove to the beach in his fancy car .he arrived at his fancy hotel .and he looked out the hotel window .still, the beach turned out to be quite ordinary .\nB: [male] was prepared for the beach trip in his old car at his ordinary hotel .he gazed out of his hotel window .nonetheless, the beach he intended to visit was not mundane at all .\nC: [female] was ready for the trip to the beach .he jumped in his luxury car .he made it to his luxury hotel .and he looked out his luxury window .however , the beach he planned to go to , was not luxurious at all .\nD: [female] was getting ready to go to the beach .she drove her car to her hotel .she had a view of the beach from her hotel .however, the beach looked disappointing .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: [male] was all set for the beach .he drove to the beach in his fancy car .he arrived at his fancy hotel .and he looked out the hotel window .still, the 
beach turned out to be quite ordinary .\nB: [male] was prepared for the beach trip in his old car at his ordinary hotel .he gazed out of his hotel window .nonetheless, the beach he intended to visit was not mundane at all .\nC: [female] was ready for the trip to the beach .he jumped in his luxury car .he made it to his luxury hotel .and he looked out his luxury window .however , the beach he planned to go to , was not luxurious at all .\nD: [female] was getting ready to go to the beach .she drove her car to her hotel .she had a view of the beach from her hotel .however, the beach looked disappointing .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_16_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_16_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_16_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_16_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_16_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the day at location location for the jones ' started with the tilt-a-whirl .next they went on the cyclone , a famous coaster that set the dad 's teeth on edge .the kids begged to go on the drop dive , which delivers a sheer drop from 70 feet .tired , they searched for a bathroom but all were out of order .finally , they found some public restrooms near the beach , and settled in for the afternoon .\nB: the day at location location for the smiths ' started with the merry-go-round .next they went on the giant drop, a thrilling ride that made the dad scream .the kids begged to go on the log flume, which splashes down from a great height .tired, they searched for a drink stand but all were sold out .finally, they found some lemonade stands near the beach, and refreshed 
themselves for the afternoon .\nC: the day at location location for the jones ' started with the carousel .next they went on the ferris wheel, a popular ride that made the dad feel dizzy .the kids begged to go on the roller coaster, which has loops and twists .tired, they searched for a food stall but all were closed .finally, they found some ice cream stalls near the beach, and treated themselves for the afternoon .\nD: the day at location location for the parkers ' started with the bumper cars .next they went on the haunted house, a spooky attraction that made the dad jump .the kids begged to go on the pirate ship, which swings back and forth .tired, they searched for a souvenir shop but all were closed .finally, they found some beach shops near the beach, and shopped for the afternoon .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the day at location location for the jones ' started with the tilt-a-whirl .next they went on the cyclone , a famous coaster that set the dad 's teeth on edge .the kids begged to go on the drop dive , which delivers a sheer drop from 70 feet .tired , they searched for a bathroom but all were out of order .finally , they found some public restrooms near the beach , and settled in for the afternoon .\nB: the day at location location for the smiths ' started with the merry-go-round .next they went on the giant drop, a thrilling ride that made the dad scream .the kids begged to go on the log flume, which splashes down from a great height .tired, they searched for a drink stand but all were sold out .finally, they found some lemonade stands near the beach, and refreshed themselves for the afternoon .\nC: the day at location location for the jones ' started with the carousel .next they went on the ferris wheel, a popular ride that made the dad feel dizzy .the kids begged to go on the roller coaster, which has loops and twists .tired, they searched for a food stall but all were 
closed .finally, they found some ice cream stalls near the beach, and treated themselves for the afternoon .\nD: the day at location location for the parkers ' started with the bumper cars .next they went on the haunted house, a spooky attraction that made the dad jump .the kids begged to go on the pirate ship, which swings back and forth .tired, they searched for a souvenir shop but all were closed .finally, they found some beach shops near the beach, and shopped for the afternoon .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_17_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_17_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_17_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_17_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_17_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: it was the annual meeting for pinoymac .employees celebrated the occasion .even those who could n't get away from the computer had fun .family and friends were invited .everyone got a free sweatshirt to commemorate the occasion .\nB: it was the birthday celebration for pinoymac .employees celebrated the occasion .even those who could n't get away from the computer had fun .family and friends were invited .everyone got a free sweatshirt to commemorate the occasion .\nC: it was the anniversary party for pinoymac .employees celebrated the occasion .even those who couldn't get away from the computer had fun .family and friends were invited .everyone got a free sweatshirt to commemorate the occasion .\nD: it was the farewell party for pinoymac .employees celebrated the occasion .even those who could n't get away from the computer had fun .family and friends were invited .everyone got a free 
sweatshirt to commemorate the occasion .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: it was the annual meeting for pinoymac .employees celebrated the occasion .even those who could n't get away from the computer had fun .family and friends were invited .everyone got a free sweatshirt to commemorate the occasion .\nB: it was the birthday celebration for pinoymac .employees celebrated the occasion .even those who could n't get away from the computer had fun .family and friends were invited .everyone got a free sweatshirt to commemorate the occasion .\nC: it was the anniversary party for pinoymac .employees celebrated the occasion .even those who couldn't get away from the computer had fun .family and friends were invited .everyone got a free sweatshirt to commemorate the occasion .\nD: it was the farewell party for pinoymac .employees celebrated the occasion .even those who could n't get away from the computer had fun .family and friends were invited .everyone got a free sweatshirt to commemorate the occasion .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_18_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_18_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_18_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_18_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_18_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: Everyone was optimistic about the future despite knowing the end was near.\nB: The group was filled with despair and sadness as they prepared for the end.\nC: we had all finally gathered together to make the plan come true .all of our history we talked about . 
we knew what needed to be done , but instead of somberness we found joy in the tasks .we sat together for the last time for the last meal we would have with each other .[male] was helping with the cooking tonight . his job was to make sure the special ingredient was added .after eating we all knew the end was coming . [female] touched my face , and we kissed the kiss of the damned .\nD: The group was somber and serious as they discussed their plan.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Everyone was optimistic about the future despite knowing the end was near.\nB: The group was filled with despair and sadness as they prepared for the end.\nC: we had all finally gathered together to make the plan come true .all of our history we talked about . we knew what needed to be done , but instead of somberness we found joy in the tasks .we sat together for the last time for the last meal we would have with each other .[male] was helping with the cooking tonight . his job was to make sure the special ingredient was added .after eating we all knew the end was coming . [female] touched my face , and we kissed the kiss of the damned .\nD: The group was somber and serious as they discussed their plan.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_19_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_19_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_19_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_19_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_19_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The girls\u2019 track coach gathered the team in front of her. 
A couple of the Orange Hills Crosscountry girls were not sure the meant them. Coach dismissed them, letting them both know that the training was not for their specialty. She then turned to the remaining three girls and gave them their training assignment. The three girls headed down the paved path, running together.\nB: The boys\u2019 track coach dismissed the team\nC: The girls were not interested in the training\nD: No one knew what the coach meant", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The girls\u2019 track coach gathered the team in front of her. A couple of the Orange Hills Crosscountry girls were not sure the meant them. Coach dismissed them, letting them both know that the training was not for their specialty. She then turned to the remaining three girls and gave them their training assignment. The three girls headed down the paved path, running together.\nB: The boys\u2019 track coach dismissed the team\nC: The girls were not interested in the training\nD: No one knew what the coach meant", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_20_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_20_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_20_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_20_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_20_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A quiet day at the beach. The fishermen didn't catch much initially, but later they spotted some fish. The fish were quite large in size. Let's see how the day unfolds.\nB: A busy day at the park. The fishermen were disappointed at first, but later they were able to catch some bigger fish. 
The fish were of various sizes. Let's see what happens next.\nC: A fun day at the river. The fishermen found many fish at the beginning. Later, they spotted some large fish. The fish were enormous. Let's see what happens later.\nD: Another day at the lake. There were not very many fish seen at first. Eventually, the guys started to see some activity. The fish were pretty small. They will see what the rest of the day brings.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A quiet day at the beach. The fishermen didn't catch much initially, but later they spotted some fish. The fish were quite large in size. Let's see how the day unfolds.\nB: A busy day at the park. The fishermen were disappointed at first, but later they were able to catch some bigger fish. The fish were of various sizes. Let's see what happens next.\nC: A fun day at the river. The fishermen found many fish at the beginning. Later, they spotted some large fish. The fish were enormous. Let's see what happens later.\nD: Another day at the lake. There were not very many fish seen at first. Eventually, the guys started to see some activity. The fish were pretty small. They will see what the rest of the day brings.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_21_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_21_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_21_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_21_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_21_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A look at a gorgeous lake on a cold winters day. One man decides its the perfect condition for fishing. 
The man caught a huge fish hes about to pack up and take home. After packing the fish in a cooler he looks back at it admiring his catch. The man then returns to his car and gives his dog a pet and a loving gaze before heading out.\nB: A view of a beautiful garden on a sunny afternoon. One woman decides its the perfect condition for gardening. The woman caught a butterfly and is about to release it.\nC: A view of a crowded beach on a hot summer day. One woman decides its the perfect condition for sunbathing. The woman caught a huge wave and is about to go surfing.\nD: A look at a cloudy sky on a rainy day. One man decides its the perfect condition for a hike. The man found a treasure and is about to take it home.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A look at a gorgeous lake on a cold winters day. One man decides its the perfect condition for fishing. The man caught a huge fish hes about to pack up and take home. After packing the fish in a cooler he looks back at it admiring his catch. The man then returns to his car and gives his dog a pet and a loving gaze before heading out.\nB: A view of a beautiful garden on a sunny afternoon. One woman decides its the perfect condition for gardening. The woman caught a butterfly and is about to release it.\nC: A view of a crowded beach on a hot summer day. One woman decides its the perfect condition for sunbathing. The woman caught a huge wave and is about to go surfing.\nD: A look at a cloudy sky on a rainy day. One man decides its the perfect condition for a hike. 
The man found a treasure and is about to take it home.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_22_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_22_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_22_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_22_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_22_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: a couple enjoying a picnic in the park\nB: a group of people taking a photo in front of a building\nC: a lady selfie shot with balloons and man a man see looks a ocean another selfie shot take with man a man click the photo riding boat in ocean men and women enjoying and take selfie at boat\nD: a man playing the guitar on the beach", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: a couple enjoying a picnic in the park\nB: a group of people taking a photo in front of a building\nC: a lady selfie shot with balloons and man a man see looks a ocean another selfie shot take with man a man click the photo riding boat in ocean men and women enjoying and take selfie at boat\nD: a man playing the guitar on the beach", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_23_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_23_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_23_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_23_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_23_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image 
or Natural image", + "source": "VIST", + "options": "A: [male] was performing at a large concert hall with a band. The audience was cheering and clapping loudly. He felt proud of his performance.\nB: [female] was performing at a small gathering of people . she played her guitar and sang .the people did not go near her and stayed at the far end of the room .[female] wondered was her singing bad ? she decided to just play the guitar .she knew a lot of songs and played them all .when [female] was finished , it was location 's turn to play guitar for the crowd .\nC: [female] was sitting alone in her room, playing the guitar and singing. She enjoyed the peaceful atmosphere and the sound of her music.\nD: A group of people were having a dance party in a spacious club. The DJ was playing energetic music and everyone was dancing enthusiastically.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: [male] was performing at a large concert hall with a band. The audience was cheering and clapping loudly. He felt proud of his performance.\nB: [female] was performing at a small gathering of people . she played her guitar and sang .the people did not go near her and stayed at the far end of the room .[female] wondered was her singing bad ? she decided to just play the guitar .she knew a lot of songs and played them all .when [female] was finished , it was location 's turn to play guitar for the crowd .\nC: [female] was sitting alone in her room, playing the guitar and singing. She enjoyed the peaceful atmosphere and the sound of her music.\nD: A group of people were having a dance party in a spacious club. 
The DJ was playing energetic music and everyone was dancing enthusiastically.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_24_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_24_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_24_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_24_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_24_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: They are 5 Children Ready to jump water They are swim to water They are keep going to under water They are enjoying with the water fall Finely They are going to home\nB: They are 5 Children Ready to jump water They are walk to water They are keep going to under water They are enjoying with the water fall Finely They are going to home\nC: They are 5 Adults Ready to jump water They are swim to water They are keep going to under water They are enjoying with the water fall Finely They are going to home\nD: They are 5 Children Ready to jump fire They are swim to water They are keep going to under water They are enjoying with the water fall Finely They are going to home", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: They are 5 Children Ready to jump water They are swim to water They are keep going to under water They are enjoying with the water fall Finely They are going to home\nB: They are 5 Children Ready to jump water They are walk to water They are keep going to under water They are enjoying with the water fall Finely They are going to home\nC: They are 5 Adults Ready to jump water They are swim to water They are keep going to under water They are enjoying with the water fall Finely They are going to home\nD: They are 5 Children 
Ready to jump fire They are swim to water They are keep going to under water They are enjoying with the water fall Finely They are going to home", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_25_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_25_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_25_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_25_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_25_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the fans are excited to finally see the race take place .the red race car zooms past the onlookers .not far behind was another race car .the fans cheered with excitement to see the cars go by so quickly .the crowd clears out because the race is finished .\nB: the fans are disappointed to miss the race\nC: the race cars move slowly as the fans lose interest\nD: the red race car crashes into the onlookers", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the fans are excited to finally see the race take place .the red race car zooms past the onlookers .not far behind was another race car .the fans cheered with excitement to see the cars go by so quickly .the crowd clears out because the race is finished .\nB: the fans are disappointed to miss the race\nC: the race cars move slowly as the fans lose interest\nD: the red race car crashes into the onlookers", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_26_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_26_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_26_2.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_26_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_26_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A man is delivering a package on his orange motorcycle.\nB: A man is waiting for his delivery to come.\nC: A [female] woman securely places her delivery within a burlap bag hosted on her orange moped. A husband[male] and wife[female] patiently wait for their delivery to come. The [woman] wife begins to prepare for her work while she is awaiting, lighting a cauldron. The [woman] delivery driver makes sure to follow safety precautions and puts on a white helmet. Ready to go, the [woman] delivery driver rides off on her orange moped.\nD: The husband is helping the wife with her work.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A man is delivering a package on his orange motorcycle.\nB: A man is waiting for his delivery to come.\nC: A [female] woman securely places her delivery within a burlap bag hosted on her orange moped. A husband[male] and wife[female] patiently wait for their delivery to come. The [woman] wife begins to prepare for her work while she is awaiting, lighting a cauldron. The [woman] delivery driver makes sure to follow safety precautions and puts on a white helmet. 
Ready to go, the [woman] delivery driver rides off on her orange moped.\nD: The husband is helping the wife with her work.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_27_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_27_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_27_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_27_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_27_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A family enjoying a picnic in the park.\nB: A couple taking a romantic stroll by the beach.\nC: A group of friends meet up to do some biking. Nice bike! One mentions to the kid with the red bike. And they head on down the trail. Bosco [male] stops for a selfie, It's such a nice day. At the end of their journey one does a quick trick to show off before leaving.\nD: A group of students studying for an exam in the library.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A family enjoying a picnic in the park.\nB: A couple taking a romantic stroll by the beach.\nC: A group of friends meet up to do some biking. Nice bike! One mentions to the kid with the red bike. And they head on down the trail. Bosco [male] stops for a selfie, It's such a nice day. 
At the end of their journey one does a quick trick to show off before leaving.\nD: A group of students studying for an exam in the library.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_28_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_28_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_28_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_28_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_28_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The family is struggling to push the stroller in bad weather.\nB: The family is arguing and can't decide where to go next.\nC: The family is lost and doesn't know where to go next.\nD: The family decides to visit a new location and takes a selfie. The male and female are pushing a stroller on a sunny day. The family huddle up to talk about what to do next at the chosen location. The female is happy and ready to explore. The family starts walking to the next location.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The family is struggling to push the stroller in bad weather.\nB: The family is arguing and can't decide where to go next.\nC: The family is lost and doesn't know where to go next.\nD: The family decides to visit a new location and takes a selfie. The male and female are pushing a stroller on a sunny day. The family huddle up to talk about what to do next at the chosen location. The female is happy and ready to explore. 
The family starts walking to the next location.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_29_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_29_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_29_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_29_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_29_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Jessy was in bright green clothes and was calm and relaxed.\nB: hotel heaven room no 112 was packed with journalists patiently waiting for the guest. and particularly jessy in white clothes was looked edgy, to see the three sisters elena, marina, and sabrina who were her arch rivals then the guest madam mercury entered the room no 112 looking puzzled to see a mysery woman wearing a white pearl on her ear.....she was the owner of the hotel ms gomes\nC: The hotel room was empty and no one was waiting for any guest.\nD: There were no journalists in the hotel room and no one was waiting for any guest.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Jessy was in bright green clothes and was calm and relaxed.\nB: hotel heaven room no 112 was packed with journalists patiently waiting for the guest. 
and particularly jessy in white clothes was looked edgy, to see the three sisters elena, marina, and sabrina who were her arch rivals then the guest madam mercury entered the room no 112 looking puzzled to see a mysery woman wearing a white pearl on her ear.....she was the owner of the hotel ms gomes\nC: The hotel room was empty and no one was waiting for any guest.\nD: There were no journalists in the hotel room and no one was waiting for any guest.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_30_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_30_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_30_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_30_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_30_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: they were on a business trip to a wealthy country where they experienced luxurious living\nB: they explored a developed nation with modern infrastructure and high standard of living\nC: we went on a vacation to location to explore the country .we found a lot of nice people and interesting sights .the country overall was fairly poor and we felt bad at times .the rivers were wild and mostly used for fishing .many people make their living fishing\nD: they visited an affluent country with beautiful landscapes and prosperous people", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: they were on a business trip to a wealthy country where they experienced luxurious living\nB: they explored a developed nation with modern infrastructure and high standard of living\nC: we went on a vacation to location to explore the country .we found a lot of nice people and interesting 
sights .the country overall was fairly poor and we felt bad at times .the rivers were wild and mostly used for fishing .many people make their living fishing\nD: they visited an affluent country with beautiful landscapes and prosperous people", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_31_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_31_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_31_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_31_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_31_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: tourists love to take photos of the river and old buildings .lovers come to the bridge to symbolize their love with locks .the city's local art work is popular among the locals .statues are a significant part of the city's attractions .\nB: the bridge has a rustic charm that attracts visitors .lovers often leave locks on the bridge as a symbol of commitment .many tourists enjoy posing in front of the beautiful art work .the city is known for its famous statues .\nC: the bridge is a lovely part of the city .people show their love by placing locks on the bridge .people from all over like to pose in front of local art work .statues are always a welcome sight for tourists. 
& # 13 ;ice skating is a wonderful time for many .\nD: the river is the main attraction of the city .tourists love to take photos in front of the old buildings .people like to attach love locks on the buildings .the local art work draws a lot of attention .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: tourists love to take photos of the river and old buildings .lovers come to the bridge to symbolize their love with locks .the city's local art work is popular among the locals .statues are a significant part of the city's attractions .\nB: the bridge has a rustic charm that attracts visitors .lovers often leave locks on the bridge as a symbol of commitment .many tourists enjoy posing in front of the beautiful art work .the city is known for its famous statues .\nC: the bridge is a lovely part of the city .people show their love by placing locks on the bridge .people from all over like to pose in front of local art work .statues are always a welcome sight for tourists. 
& # 13 ;ice skating is a wonderful time for many .\nD: the river is the main attraction of the city .tourists love to take photos in front of the old buildings .people like to attach love locks on the buildings .the local art work draws a lot of attention .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_32_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_32_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_32_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_32_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_32_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: a group of people are gathered around outside in a gloomy and polluted winter day\nB: the villagers skate, ski, and snowboard to get to their destination in the desert\nC: there is a group of people scattered around and specific corner of the market\nD: a community of people are gathered around outside on a nice, beautiful and green summer day The villagers walk, bike, and drive to get to their destination at the market, there is a group of people gathered around and specific corner of the market The men gather around the market square the deal was on bananas", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: a group of people are gathered around outside in a gloomy and polluted winter day\nB: the villagers skate, ski, and snowboard to get to their destination in the desert\nC: there is a group of people scattered around and specific corner of the market\nD: a community of people are gathered around outside on a nice, beautiful and green summer day The villagers walk, bike, and drive to get to their destination at the market, there is a group of people 
gathered around and specific corner of the market The men gather around the market square the deal was on bananas", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_33_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_33_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_33_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_33_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_33_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: An elderly woman is spotted at the festival wearing colorful headwear.\nB: A woman is holding a child during a festival while the child looks at the camera. An elderly woman is spotted at the festival wearing colorful headwear. Another elderly woman can also be seen at the festival wearing colorful headwear. Here is a group of performers in uniform from the festival. It is a hot and sunny day and the performers are tired.\nC: A woman is holding a child during a festival while the child looks at the camera.\nD: A group of performers in uniform from the festival are excited and energized.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: An elderly woman is spotted at the festival wearing colorful headwear.\nB: A woman is holding a child during a festival while the child looks at the camera. An elderly woman is spotted at the festival wearing colorful headwear. Another elderly woman can also be seen at the festival wearing colorful headwear. Here is a group of performers in uniform from the festival. 
It is a hot and sunny day and the performers are tired.\nC: A woman is holding a child during a festival while the child looks at the camera.\nD: A group of performers in uniform from the festival are excited and energized.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_34_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_34_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_34_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_34_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_34_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A closet has a lot of sneakers within it. There is a pair of yellow sneakers on a shelf within the closet. It appears all the shelves are filled with sneakers. A pair of red sneakers has a chevron stripe on them. Someone comes into the closet to select a pair of sneakers.\nB: A shoe store with different types of shoes on display.\nC: A pantry with various food items neatly placed inside it.\nD: A wardrobe filled with clothes and accessories.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A closet has a lot of sneakers within it. There is a pair of yellow sneakers on a shelf within the closet. It appears all the shelves are filled with sneakers. A pair of red sneakers has a chevron stripe on them. 
Someone comes into the closet to select a pair of sneakers.\nB: A shoe store with different types of shoes on display.\nC: A pantry with various food items neatly placed inside it.\nD: A wardrobe filled with clothes and accessories.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_35_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_35_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_35_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_35_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_35_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: the mountains have a snowy weather the land is filled with dense forests the hills are used for rock climbing there is a river flowing between the fields the man is hiking alone\nB: the mountains have a rainy weather the land is covered with tall buildings the hills are used for grazing cattle there is no road between the fields the man is riding a bicycle\nC: the mountains have a clear weather the land is barren without any trees the hills have no farms there is no road between the fields the man is sitting in the jeep and not showing the place to the blue shirt man\nD: the mountains have a cloudy weather the land is having cut farms with trees the hills have farms for the cultivation of rice there was a road in between the fields the man is driving a jeep and showing the place to the blue shirt man", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the mountains have a snowy weather the land is filled with dense forests the hills are used for rock climbing there is a river flowing between the fields the man is hiking alone\nB: the mountains have a rainy weather the land is 
covered with tall buildings the hills are used for grazing cattle there is no road between the fields the man is riding a bicycle\nC: the mountains have a clear weather the land is barren without any trees the hills have no farms there is no road between the fields the man is sitting in the jeep and not showing the place to the blue shirt man\nD: the mountains have a cloudy weather the land is having cut farms with trees the hills have farms for the cultivation of rice there was a road in between the fields the man is driving a jeep and showing the place to the blue shirt man", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_36_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_36_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_36_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_36_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_36_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: it was an adventurous day for tavi and his friends .they had some snacks while waiting for everyone to arrive .then came the challenges , each thrilling him more .and what adventure would be complete without a trophy !at the end of the day he reminds us he is now 8 .\nB: it was a very special birthday for tavi this year .he and his friends had some snacks while waiting for everyone to arrive .then came the gifts , each exiting him more .and what birthday would be complete without a cake !at the end of the party he reminds us he is now 4 .\nC: it was an ordinary day in the park for tavi and his friends .they had some snacks while waiting for everyone to arrive .then came the games , each exciting him more .and what day would be complete without a song !at the end of the day he reminds us he is 
now 6 .\nD: it was a very boring day for tavi this year .he and his friends had some snacks while waiting for everyone to arrive .then came the homework , each exhausting him more .and what day would be complete without a nap !at the end of the day he reminds us he is now 10 .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: it was an adventurous day for tavi and his friends .they had some snacks while waiting for everyone to arrive .then came the challenges , each thrilling him more .and what adventure would be complete without a trophy !at the end of the day he reminds us he is now 8 .\nB: it was a very special birthday for tavi this year .he and his friends had some snacks while waiting for everyone to arrive .then came the gifts , each exiting him more .and what birthday would be complete without a cake !at the end of the party he reminds us he is now 4 .\nC: it was an ordinary day in the park for tavi and his friends .they had some snacks while waiting for everyone to arrive .then came the games , each exciting him more .and what day would be complete without a song !at the end of the day he reminds us he is now 6 .\nD: it was a very boring day for tavi this year .he and his friends had some snacks while waiting for everyone to arrive .then came the homework , each exhausting him more .and what day would be complete without a nap !at the end of the day he reminds us he is now 10 .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_37_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_37_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_37_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_37_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_37_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + 
"visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: One day a family went on a road trip in their van. They set out in the early morning, took the scenic route, and made several stops at picturesque locations. As the sun set, they arrived back home, concluding their eventful road trip.\nB: One day a man took a bicycle and went on a mountain biking trip. He started his journey in the afternoon, took several turns and enjoyed the adventure. Finally, he reached a hilltop at sunset, concluding his thrilling mountain biking trip.\nC: One day a man take a car and went a trip for somewhere. He wanted to take video of his trip. Then he take his mobile phone and fix in the win shield like front glass of the car. He started his car in the morning, went on the high ways, don't take any turns. He enjoyed. In the end, he came to his car shed at night, his travel trip was the end and full of high ways.\nD: One day a woman drove her car to the beach and filmed the beautiful sunset. She enjoyed the cool breeze and the sound of the waves. As the night fell, she returned home, completing her tranquil beach trip.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: One day a family went on a road trip in their van. They set out in the early morning, took the scenic route, and made several stops at picturesque locations. As the sun set, they arrived back home, concluding their eventful road trip.\nB: One day a man took a bicycle and went on a mountain biking trip. He started his journey in the afternoon, took several turns and enjoyed the adventure. Finally, he reached a hilltop at sunset, concluding his thrilling mountain biking trip.\nC: One day a man take a car and went a trip for somewhere. He wanted to take video of his trip. Then he take his mobile phone and fix in the win shield like front glass of the car. 
He started his car in the morning, went on the high ways, don't take any turns. He enjoyed. In the end, he came to his car shed at night, his travel trip was the end and full of high ways.\nD: One day a woman drove her car to the beach and filmed the beautiful sunset. She enjoyed the cool breeze and the sound of the waves. As the night fell, she returned home, completing her tranquil beach trip.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_38_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_38_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_38_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_38_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_38_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the sisters are having drinks to celebrate .it 's our little sister 's baby shower and she 's looking at the gifts .she opened the present to find it 's a gift for a baby toy .another present is a baby 's outfit .everyone is gathered around the table to watch her open her gifts .\nB: the brothers are having a party with drinks\nC: it 's a wedding celebration with gifts and presents\nD: a group of friends are having a birthday party", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the sisters are having drinks to celebrate .it 's our little sister 's baby shower and she 's looking at the gifts .she opened the present to find it 's a gift for a baby toy .another present is a baby 's outfit .everyone is gathered around the table to watch her open her gifts .\nB: the brothers are having a party with drinks\nC: it 's a wedding celebration with gifts and presents\nD: a group of friends are having a birthday party", + 
"input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_39_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_39_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_39_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_39_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_39_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the girl has created a powerful stick creature on the atoll\nB: the stick man is merely a symbol of the girl's creation\nC: the giant stick man will conquer all the humans .he stands mightily on his mound of pixels .this forceful center is the source all of his power .all hail the giant stick creature !it is really just atoll created by this girl .\nD: the giant stick man is controlling a small group of people", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the girl has created a powerful stick creature on the atoll\nB: the stick man is merely a symbol of the girl's creation\nC: the giant stick man will conquer all the humans .he stands mightily on his mound of pixels .this forceful center is the source all of his power .all hail the giant stick creature !it is really just atoll created by this girl .\nD: the giant stick man is controlling a small group of people", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_40_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_40_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_40_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_40_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_40_4.jpg" + ], + 
"output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the family enjoyed a peaceful day at the park .they looked at a massive aquarium .they could even see a range of animals .after the sunset, they admired a stunning view of the city .they went to the market and talked about the fantastic trip they had .\nB: a group of friends spent a relaxing day at the lake .they checked out a gigantic stadium .they could even observe numerous athletes .as the night descended, they beheld the spectacular sights of the skyscrapers .they visited the stores and reminisced about the amazing adventure they had .\nC: a couple had a pleasant outing by the river .they witnessed a colossal gathering .they even noticed all the individuals .when the night came, they witnessed the lovely hues of the cityscape .they journeyed to the malls and reflected on the wonderful outing they had .\nD: the family took a nice trip to the beach .they saw an enormous organization organization .they even got to see all the people on the beach .when the sun fell , they could see the beautiful colors on the skyline .they traveled to the shops and thought about the great day they had .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the family enjoyed a peaceful day at the park .they looked at a massive aquarium .they could even see a range of animals .after the sunset, they admired a stunning view of the city .they went to the market and talked about the fantastic trip they had .\nB: a group of friends spent a relaxing day at the lake .they checked out a gigantic stadium .they could even observe numerous athletes .as the night descended, they beheld the spectacular sights of the skyscrapers .they visited the stores and reminisced about the amazing adventure they had .\nC: a couple had a pleasant outing by the river .they witnessed a colossal 
gathering .they even noticed all the individuals .when the night came, they witnessed the lovely hues of the cityscape .they journeyed to the malls and reflected on the wonderful outing they had .\nD: the family took a nice trip to the beach .they saw an enormous organization organization .they even got to see all the people on the beach .when the sun fell , they could see the beautiful colors on the skyline .they traveled to the shops and thought about the great day they had .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_41_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_41_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_41_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_41_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_41_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: the images depict a busy market with vendors selling various items\nB: the images represent a construction site with workers building a new structure\nC: the images show a city street with a person walking a dog\nD: we are in a temple with a bell under a wooden arch painted red a woman with a red backpack is walking people walk in the aisles outside a small car is parked in front of a porch a man installs things on a large table", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the images depict a busy market with vendors selling various items\nB: the images represent a construction site with workers building a new structure\nC: the images show a city street with a person walking a dog\nD: we are in a temple with a bell under a wooden arch painted red a woman with a red backpack is walking people walk in the aisles outside a small car 
is parked in front of a porch a man installs things on a large table", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_42_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_42_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_42_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_42_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_42_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A man in a red bandana is seen talking to food merchants. Here is some food being sold at the stall. The man is sitting down with his food order and talking. The man tries his food with chopsticks. Noodles and vegetables in brown sauce between the chopsticks.\nB: A man is buying groceries at a market.\nC: A man is having a conversation with friends at a restaurant.\nD: A man is sampling different dishes at a food festival.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A man in a red bandana is seen talking to food merchants. Here is some food being sold at the stall. The man is sitting down with his food order and talking. The man tries his food with chopsticks. 
Noodles and vegetables in brown sauce between the chopsticks.\nB: A man is buying groceries at a market.\nC: A man is having a conversation with friends at a restaurant.\nD: A man is sampling different dishes at a food festival.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_43_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_43_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_43_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_43_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_43_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: one of my favorite eateries from vacation this year was boudin . this bakery is not popular and not recommended.\nB: one of my favorite eateries from vacation this year was boudin . this bakery is found in many locations in the location .sculpted bread is a favorite , especially with our kids ! look at the cute turtle .the breads are made from sourdough , hearth breads , and other specialties and the prices are reasonable .we even had the opportunity to watch the bakers in action .if you visit a town with a organization organization organization organization organization , be sure to stop in !\nC: The breads are not fresh and made from low-quality ingredients.\nD: This bakery is only found in one location and it's not very good.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: one of my favorite eateries from vacation this year was boudin . this bakery is not popular and not recommended.\nB: one of my favorite eateries from vacation this year was boudin . this bakery is found in many locations in the location .sculpted bread is a favorite , especially with our kids ! 
look at the cute turtle .the breads are made from sourdough , hearth breads , and other specialties and the prices are reasonable .we even had the opportunity to watch the bakers in action .if you visit a town with a organization organization organization organization organization , be sure to stop in !\nC: The breads are not fresh and made from low-quality ingredients.\nD: This bakery is only found in one location and it's not very good.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_44_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_44_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_44_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_44_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_44_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: they all went to a fancy dinner at a restaurant\nB: we all met at the club to celebrate his big night .she was sure happy to see me there .we started to take goofy pictures after a few drinks .i think he was even happier to see his friend .it was a great time had by all .\nC: they went to the beach and played volleyball\nD: it was a quiet and boring evening at home", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: they all went to a fancy dinner at a restaurant\nB: we all met at the club to celebrate his big night .she was sure happy to see me there .we started to take goofy pictures after a few drinks .i think he was even happier to see his friend .it was a great time had by all .\nC: they went to the beach and played volleyball\nD: it was a quiet and boring evening at home", + "input_image_path": [ + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_45_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_45_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_45_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_45_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_45_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: we went to the beach for a picnic .the weather was perfect .we played games and enjoyed the sunshine .it was a great day .\nB: we gathered at the state park for the annual fireworks show .it started with a bang ! literally .the show was breathtaking . the kids loved every minute .is that a heart in there ? i think it was supposed to be .all the oohs and aahs finally came to an end . the kids wanted more !\nC: the family had a barbecue in the backyard .we grilled burgers and hot dogs .there were lots of laughs and good food .it was a memorable evening .\nD: we visited the museum and saw amazing exhibits .the kids were fascinated by the artifacts .we learned a lot and had a great time . it was a lovely experience .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: we went to the beach for a picnic .the weather was perfect .we played games and enjoyed the sunshine .it was a great day .\nB: we gathered at the state park for the annual fireworks show .it started with a bang ! literally .the show was breathtaking . the kids loved every minute .is that a heart in there ? i think it was supposed to be .all the oohs and aahs finally came to an end . 
the kids wanted more !\nC: the family had a barbecue in the backyard .we grilled burgers and hot dogs .there were lots of laughs and good food .it was a memorable evening .\nD: we visited the museum and saw amazing exhibits .the kids were fascinated by the artifacts .we learned a lot and had a great time . it was a lovely experience .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_46_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_46_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_46_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_46_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_46_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: this photo was taken on a boring day with nothing special to see\nB: i don't really like any of these pictures, they're all pretty boring\nC: these photos are not worth sharing, they're not interesting at all\nD: i had to post some of these amazing photos from vacationi love this photo of the rocks out in the oceanwe spent this day walking the beach looking for shellsthere are some amazing view of the beautiful waterthis is one of my favorites on the last day visiting", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: this photo was taken on a boring day with nothing special to see\nB: i don't really like any of these pictures, they're all pretty boring\nC: these photos are not worth sharing, they're not interesting at all\nD: i had to post some of these amazing photos from vacationi love this photo of the rocks out in the oceanwe spent this day walking the beach looking for shellsthere are some amazing view of the beautiful waterthis is one of my favorites on the last day 
visiting", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_47_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_47_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_47_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_47_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_47_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: here is the car that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . we had such a memorable , fun trip .\nB: here is the plane that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . we had such a memorable , fun trip .\nC: here is the train that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . we had such a memorable , fun trip .\nD: here is the boat that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . 
we had such a memorable , fun trip .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: here is the car that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . we had such a memorable , fun trip .\nB: here is the plane that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . we had such a memorable , fun trip .\nC: here is the train that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . we had such a memorable , fun trip .\nD: here is the boat that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . 
we had such a memorable , fun trip .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_48_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_48_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_48_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_48_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_48_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: [child] is opening presents for their birthday.\nB: [male] is pregnant and having a baby shower .her first gift looks like a big one .there 's an organization book in it .she also pulls out a onesie .and finally , some little hand booties so the baby does not scratch their cheeks .\nC: [mother] is shopping for baby clothes and accessories.\nD: [female] is attending a baby shower and receiving gifts.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: [child] is opening presents for their birthday.\nB: [male] is pregnant and having a baby shower .her first gift looks like a big one .there 's an organization book in it .she also pulls out a onesie .and finally , some little hand booties so the baby does not scratch their cheeks .\nC: [mother] is shopping for baby clothes and accessories.\nD: [female] is attending a baby shower and receiving gifts.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_49_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_49_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_49_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_49_3.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_49_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A male subject hads food on a plate and two other males are watching. Food is being prepaired on a grill with the use of tongs. Several food items are being displayed from above on a table. A group of children are gathered around a table with food located in front of them. Two females are seated on a couch on is talking and the other is eating.\nB: A female subject is cooking on a grill while others are watching.\nC: Two males are seated on a couch, one is talking and the other is eating.\nD: A group of adults are gathered around a table with food located in front of them.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A male subject hads food on a plate and two other males are watching. Food is being prepaired on a grill with the use of tongs. Several food items are being displayed from above on a table. A group of children are gathered around a table with food located in front of them. 
Two females are seated on a couch on is talking and the other is eating.\nB: A female subject is cooking on a grill while others are watching.\nC: Two males are seated on a couch, one is talking and the other is eating.\nD: A group of adults are gathered around a table with food located in front of them.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_50_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_50_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_50_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_50_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_50_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The couple was dancing on the beach. They practised some dance moves. They stopped dancing. They decided to go for a swim. The couple started to run towards the ocean.\nB: The couple was shopping in the city. They visited different stores. They bought souvenirs. They ended the day with a nice dinner.\nC: The couple was having a picnic on the beach. They enjoyed some food. They flew a kite. They relaxed on the sand.\nD: The couple was hiking in the mountains. They explored the trails. They took photos of the scenery. They enjoyed the fresh air.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The couple was dancing on the beach. They practised some dance moves. They stopped dancing. They decided to go for a swim. The couple started to run towards the ocean.\nB: The couple was shopping in the city. They visited different stores. They bought souvenirs. They ended the day with a nice dinner.\nC: The couple was having a picnic on the beach. They enjoyed some food. They flew a kite. 
They relaxed on the sand.\nD: The couple was hiking in the mountains. They explored the trails. They took photos of the scenery. They enjoyed the fresh air.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_51_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_51_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_51_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_51_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_51_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: One day I woke up really tired and stayed in bed all day. I didn't feel like doing anything. I spent the day alone and didn't have anyone to talk to. It was a really boring and uneventful day.\nB: One day I woke up really hungry and headed downtown to my favorite cafe. The food was really good. I met up with my friends and their kid and we all had some laughs. We said our goodbyes and then headed our separate ways. As I drove back home I reflected on the beautiful day and felt appreciation for my life.\nC: One day I woke up feeling unwell and decided to stay home. I watched movies all day and ordered some food delivery. I didn't feel like going out at all and just wanted to rest.\nD: One day I woke up early and went for a long walk in the park. I enjoyed the fresh air and the beautiful scenery. I saw some cute animals and took some photos. It was a peaceful and relaxing day.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: One day I woke up really tired and stayed in bed all day. I didn't feel like doing anything. I spent the day alone and didn't have anyone to talk to. 
It was a really boring and uneventful day.\nB: One day I woke up really hungry and headed downtown to my favorite cafe. The food was really good. I met up with my friends and their kid and we all had some laughs. We said our goodbyes and then headed our separate ways. As I drove back home I reflected on the beautiful day and felt appreciation for my life.\nC: One day I woke up feeling unwell and decided to stay home. I watched movies all day and ordered some food delivery. I didn't feel like going out at all and just wanted to rest.\nD: One day I woke up early and went for a long walk in the park. I enjoyed the fresh air and the beautiful scenery. I saw some cute animals and took some photos. It was a peaceful and relaxing day.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_52_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_52_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_52_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_52_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_52_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A woman holding a bag of apples\nB: A group of people walking in the city\nC: A male carrying bananas. A person riding a bike with a tree on it. A beautiful sky up in the mountains. Trail up in the mountains. Several people riding bikes with trees on their backs.\nD: A car driving through a desert", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A woman holding a bag of apples\nB: A group of people walking in the city\nC: A male carrying bananas. A person riding a bike with a tree on it. A beautiful sky up in the mountains. Trail up in the mountains. 
Several people riding bikes with trees on their backs.\nD: A car driving through a desert", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_53_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_53_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_53_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_53_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_53_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A family picnicking in a park\nB: Hiking in a snowstorm\nC: A small group of people prepare to climb a rocky hill. A person in a red jacket is the first to climb. Another person that is climbing is wearing a green jacket. The person in the red jacket is wearing black boots. One of the members of the small group is wearing a scarf to cover most of their face.\nD: Rock climbing in a desert", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A family picnicking in a park\nB: Hiking in a snowstorm\nC: A small group of people prepare to climb a rocky hill. A person in a red jacket is the first to climb. Another person that is climbing is wearing a green jacket. The person in the red jacket is wearing black boots. 
One of the members of the small group is wearing a scarf to cover most of their face.\nD: Rock climbing in a desert", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_54_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_54_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_54_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_54_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_54_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: they went for a hike in the mountains\nB: they spent the day shopping in the city\nC: it was a fun day at the beach\nD: it was time for a little bike ridingfirst it was one on onethen it become two on twoit was a great time at the racethey enjoyed dinner afterwards", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: they went for a hike in the mountains\nB: they spent the day shopping in the city\nC: it was a fun day at the beach\nD: it was time for a little bike ridingfirst it was one on onethen it become two on twoit was a great time at the racethey enjoyed dinner afterwards", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_55_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_55_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_55_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_55_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_55_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The man and woman 
were cooking dinner at home.\nB: The man and woman were attending a formal dinner event.\nC: The couple were having a picnic in the park.\nD: The woman and man were eating some street food. The food was on the stick. They dipped the stick in the broth. The man enjoyed the meal. A woman smiled at him.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The man and woman were cooking dinner at home.\nB: The man and woman were attending a formal dinner event.\nC: The couple were having a picnic in the park.\nD: The woman and man were eating some street food. The food was on the stick. They dipped the stick in the broth. The man enjoyed the meal. A woman smiled at him.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_56_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_56_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_56_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_56_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_56_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A guy is waving at a family. The family look back at him. A kid is trying to sell something. The guys buy from that family. The kid does book keeping.\nB: A guy is yelling at a family. The family ignore him. A kid is trying to sell something. The guys buy from that family. The kid does book keeping.\nC: A guy is waving at a family. The family look back at him. A kid is trying to sell something. The guys ignore the family. The kid does book keeping.\nD: A guy is waving at a family. The family look back at him. A kid is playing with toys. The guys buy from that family. 
The kid does book keeping.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A guy is waving at a family. The family look back at him. A kid is trying to sell something. The guys buy from that family. The kid does book keeping.\nB: A guy is yelling at a family. The family ignore him. A kid is trying to sell something. The guys buy from that family. The kid does book keeping.\nC: A guy is waving at a family. The family look back at him. A kid is trying to sell something. The guys ignore the family. The kid does book keeping.\nD: A guy is waving at a family. The family look back at him. A kid is playing with toys. The guys buy from that family. The kid does book keeping.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_57_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_57_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_57_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_57_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_57_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: a crowded hotel in a beautiful location with well-maintained surroundings.\nB: an upscale resort with luxurious accommodations and stunning views.\nC: a touristy area with clean streets and high-end shopping.\nD: my friend and i went traveling around location last summer .we walked through the market and met a man selling meat .we stayed in this dumpy hotel .the view from the hotel was a graffitied out abandoned building .there was an artist who set up toys on the ground outside our hotel .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: a crowded hotel in a beautiful location with 
well-maintained surroundings.\nB: an upscale resort with luxurious accommodations and stunning views.\nC: a touristy area with clean streets and high-end shopping.\nD: my friend and i went traveling around location last summer .we walked through the market and met a man selling meat .we stayed in this dumpy hotel .the view from the hotel was a graffitied out abandoned building .there was an artist who set up toys on the ground outside our hotel .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_58_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_58_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_58_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_58_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_58_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the album was in the basement\nB: there were no old photos in the album\nC: i found the album at the store\nD: i was going through an old album last week .there were so many old pieces of memorabilia in there .i had a lot of fun looking at all of the old photos .they all reminded me of good times .i was very happy to have found the album in the attic .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the album was in the basement\nB: there were no old photos in the album\nC: i found the album at the store\nD: i was going through an old album last week .there were so many old pieces of memorabilia in there .i had a lot of fun looking at all of the old photos .they all reminded me of good times .i was very happy to have found the album in the attic .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_59_0.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_59_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_59_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_59_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_59_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A male is standing eating a hamburger. The male with the hot dog has a red ball cap on.\nB: A male has answered the door and a female in a black shirt is standing on the other side.\nC: A female has answered the door and a male in a black shirt is standing on the other side. Another male is standing eating a hot dog. The male with the hot dog has a blue ball cap on. A female is peering out from behind a window from the inside of her home. Two women are working in an office.\nD: A female is peering out from behind a door from the inside of her home. Two men are working in an office.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A male is standing eating a hamburger. The male with the hot dog has a red ball cap on.\nB: A male has answered the door and a female in a black shirt is standing on the other side.\nC: A female has answered the door and a male in a black shirt is standing on the other side. Another male is standing eating a hot dog. The male with the hot dog has a blue ball cap on. A female is peering out from behind a window from the inside of her home. Two women are working in an office.\nD: A female is peering out from behind a door from the inside of her home. 
Two men are working in an office.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_60_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_60_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_60_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_60_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_60_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: They are at a music concert. They are dancing and singing along with the crowd.\nB: They are in a off road bike rally. They enjoying their weekend in this event. A small boy also running his cycle in this place. They bring their bikes for repairing in the workshop. All girls are enjoying the race and this one girl is drinking the juice.\nC: They are at a beach party. They are playing volleyball and sunbathing.\nD: They are at a car show. They are admiring the vintage cars.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: They are at a music concert. They are dancing and singing along with the crowd.\nB: They are in a off road bike rally. They enjoying their weekend in this event. A small boy also running his cycle in this place. They bring their bikes for repairing in the workshop. All girls are enjoying the race and this one girl is drinking the juice.\nC: They are at a beach party. They are playing volleyball and sunbathing.\nD: They are at a car show. 
They are admiring the vintage cars.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_61_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_61_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_61_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_61_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_61_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: The family enjoys a peaceful evening\nB: our family always does a fireworks display for the 4th of july .my brother shot one off of the balcony that nearly caught the house on fire .we finally got the hang of it by the third shot fired .they lit the sky up in magnificent beauty .i had to be careful though because the fireworks were a bit faulty and were not wanting to burn right all night long .\nC: The family enjoys a bonfire on 4th of July\nD: The family does not celebrate any festival", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The family enjoys a peaceful evening\nB: our family always does a fireworks display for the 4th of july .my brother shot one off of the balcony that nearly caught the house on fire .we finally got the hang of it by the third shot fired .they lit the sky up in magnificent beauty .i had to be careful though because the fireworks were a bit faulty and were not wanting to burn right all night long .\nC: The family enjoys a bonfire on 4th of July\nD: The family does not celebrate any festival", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_62_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_62_1.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_62_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_62_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_62_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: this area is barren with no natural beauty .the ocean water is murky and polluted .the rocks are eroded and unattractive .the beach is surrounded by unsightly buildings .\nB: this area is filled with nature 's beauty , like the clean sand and blue ocean water .the rocks carved by the waves are stunning to look at .the naturally formed arched rock feature is especially stunning to look at .the rock formation perfectly frames the ocean waves .such a beautiful beach is starkly contrasted by the asphalt road nearby .\nC: this area is a desert with no water or vegetation .the rocks are featureless and dull .the beach is littered with garbage and waste .the asphalt road is the only notable feature .\nD: this area is filled with industrial structures and pollution .the rocks are carved with graffiti and vandalism .the rock formation looks ordinary and unimpressive .the beach is crowded with litter and debris .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: this area is barren with no natural beauty .the ocean water is murky and polluted .the rocks are eroded and unattractive .the beach is surrounded by unsightly buildings .\nB: this area is filled with nature 's beauty , like the clean sand and blue ocean water .the rocks carved by the waves are stunning to look at .the naturally formed arched rock feature is especially stunning to look at .the rock formation perfectly frames the ocean waves .such a beautiful beach is starkly contrasted by the asphalt road nearby .\nC: this area is a desert with no water 
or vegetation .the rocks are featureless and dull .the beach is littered with garbage and waste .the asphalt road is the only notable feature .\nD: this area is filled with industrial structures and pollution .the rocks are carved with graffiti and vandalism .the rock formation looks ordinary and unimpressive .the beach is crowded with litter and debris .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_63_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_63_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_63_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_63_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_63_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: this weekend we went to a serene beach .the sandy beach was calm , and peaceful .the ocean was quiet and tranquil , it was soothing to swim in the water !there was a pier for fishing .the wooden and serene pier was relaxing .\nB: this weekend we went to a bustling city .the crowded streets were lively, and noisy .the market was busy and crowded , it was chaotic to walk in the crowd !there was a shopping complex for entertainment .the vibrant and colorful shops were eye-catching .\nC: this weekend we went to a peace garden .the colorful flowers were beautiful , and smelled wonderful .the gazebo was quiet and peaceful , it was relaxing to sit in the shade !there was a religious garden for meditation .the bright and colorful flowers were breathtaking .\nD: this weekend we went to a busy park .the green trees were beautiful , and smelled fresh .the playground was noisy and crowded , it was tiring to play in the sun !there was a fountain for photography .the clear and beautiful water was refreshing .", + "question": 
"Describe this set of images briefly.", + "context": "Select from the following choices.\nA: this weekend we went to a serene beach .the sandy beach was calm , and peaceful .the ocean was quiet and tranquil , it was soothing to swim in the water !there was a pier for fishing .the wooden and serene pier was relaxing .\nB: this weekend we went to a bustling city .the crowded streets were lively, and noisy .the market was busy and crowded , it was chaotic to walk in the crowd !there was a shopping complex for entertainment .the vibrant and colorful shops were eye-catching .\nC: this weekend we went to a peace garden .the colorful flowers were beautiful , and smelled wonderful .the gazebo was quiet and peaceful , it was relaxing to sit in the shade !there was a religious garden for meditation .the bright and colorful flowers were breathtaking .\nD: this weekend we went to a busy park .the green trees were beautiful , and smelled fresh .the playground was noisy and crowded , it was tiring to play in the sun !there was a fountain for photography .the clear and beautiful water was refreshing .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_64_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_64_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_64_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_64_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_64_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A peaceful village in the countryside. Children playing in the meadows. Cows grazing in the fields.\nB: A rainy city street in an Asian city. Two girls walk into a ship in an Asian City. Femals walking around a shop with lights hanging from the ceiling. 
A bunny eating grass out of a wooden box. Someone petting a hedgehog with a glove on.\nC: An office building in a busy city. Employees rushing in and out of the building. Traffic jam on the road.\nD: A sunny beach in a tropical island. Coconut trees sway in the breeze. People surfing in the clear blue water.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A peaceful village in the countryside. Children playing in the meadows. Cows grazing in the fields.\nB: A rainy city street in an Asian city. Two girls walk into a ship in an Asian City. Femals walking around a shop with lights hanging from the ceiling. A bunny eating grass out of a wooden box. Someone petting a hedgehog with a glove on.\nC: An office building in a busy city. Employees rushing in and out of the building. Traffic jam on the road.\nD: A sunny beach in a tropical island. Coconut trees sway in the breeze. People surfing in the clear blue water.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_65_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_65_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_65_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_65_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_65_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the staff members were genuinely friendly towards each other.\nB: the school function was a disaster.\nC: the kids were bored and uninterested in the activities.\nD: the bingo party was going well at the school function !the staff members even pretended to be friendly for pictures when they really all hated each other !mothers and fathers could only really cope with being at the function through 
alcohol .the kids were having fun though , even if they did n't want to be there in the first place .the spelling bee went well ! the school took first place !", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the staff members were genuinely friendly towards each other.\nB: the school function was a disaster.\nC: the kids were bored and uninterested in the activities.\nD: the bingo party was going well at the school function !the staff members even pretended to be friendly for pictures when they really all hated each other !mothers and fathers could only really cope with being at the function through alcohol .the kids were having fun though , even if they did n't want to be there in the first place .the spelling bee went well ! the school took first place !", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_66_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_66_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_66_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_66_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_66_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the family was excited for their first vacation together .first stop was to the crowded beach .next they went to the amusement park for some rest .then they headed towards the countryside for dinner .they had a common dessert to cap off their first day of relaxation .\nB: the couple was not prepared for their first vacation together .first stop was to the unpopular beach .next they went to the countryside for some rest .then they headed towards the office for dinner .they had a typical dessert to cap off their first day of work .\nC: the couple was ready 
for their first vacation together .first stop was to the landmark beach .next they went to the boardwalk for some rest .then they headed towards town for dinner .they had a unique dessert to cap off their first day of vacation .\nD: the friends were ready for their first day of work together .first stop was to the unimpressive park .next they went to the shopping mall for some rest .then they headed towards the city for dinner .they had an ordinary dessert to cap off their first day of work .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the family was excited for their first vacation together .first stop was to the crowded beach .next they went to the amusement park for some rest .then they headed towards the countryside for dinner .they had a common dessert to cap off their first day of relaxation .\nB: the couple was not prepared for their first vacation together .first stop was to the unpopular beach .next they went to the countryside for some rest .then they headed towards the office for dinner .they had a typical dessert to cap off their first day of work .\nC: the couple was ready for their first vacation together .first stop was to the landmark beach .next they went to the boardwalk for some rest .then they headed towards town for dinner .they had a unique dessert to cap off their first day of vacation .\nD: the friends were ready for their first day of work together .first stop was to the unimpressive park .next they went to the shopping mall for some rest .then they headed towards the city for dinner .they had an ordinary dessert to cap off their first day of work .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_67_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_67_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_67_2.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_67_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_67_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: everyone entered the room , ready for the presentation .there were great speakers , who provided good education .the camera man made sure to document the important event .the audience enjoyed the presentation and company of others .they were satisfied with the material presented .\nB: some people left the room , unprepared for the display .there were mediocre presenters , who delivered average training .the videographer made sure to record the insignificant event .the viewers endured the presentation and solitude of others .they were dissatisfied with the information presented .\nC: few people entered the room , prepared for the speech .there were outstanding orators , who offered excellent guidance .the photographer made sure to capture the significant occasion .the spectators relished the talk and socialization with others .they were content with the content presented .\nD: nobody arrived in the room , waiting for the discussion .there were terrible lecturers , who provided bad instruction .the photographer made sure to neglect the unimportant event .the participants disliked the presentation and absence of others .they were displeased with the material presented .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: everyone entered the room , ready for the presentation .there were great speakers , who provided good education .the camera man made sure to document the important event .the audience enjoyed the presentation and company of others .they were satisfied with the material presented .\nB: some people left the room , unprepared for the display .there were mediocre presenters 
, who delivered average training .the videographer made sure to record the insignificant event .the viewers endured the presentation and solitude of others .they were dissatisfied with the information presented .\nC: few people entered the room , prepared for the speech .there were outstanding orators , who offered excellent guidance .the photographer made sure to capture the significant occasion .the spectators relished the talk and socialization with others .they were content with the content presented .\nD: nobody arrived in the room , waiting for the discussion .there were terrible lecturers , who provided bad instruction .the photographer made sure to neglect the unimportant event .the participants disliked the presentation and absence of others .they were displeased with the material presented .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_68_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_68_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_68_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_68_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_68_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: She was smiling at the camera.\nB: She was looking down. She was curious about something. She was putting something on her food. While he looked on. She was looking down at something. She was stirring the object in the bowl.\nC: She was playing with a dog.\nD: She was walking on the beach.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: She was smiling at the camera.\nB: She was looking down. She was curious about something. She was putting something on her food. While he looked on. 
She was looking down at something. She was stirring the object in the bowl.\nC: She was playing with a dog.\nD: She was walking on the beach.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_69_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_69_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_69_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_69_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_69_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: people having a picnic in the park\nB: getting help with the kayak for a day on the water .friends and family having fun on the lake .what great exercise while skiing on the water .friends and family having fun on a large raft on the water .teaching baby how to swim while making sure he 's safe .\nC: children playing in the snow\nD: individuals riding horses on the beach", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: people having a picnic in the park\nB: getting help with the kayak for a day on the water .friends and family having fun on the lake .what great exercise while skiing on the water .friends and family having fun on a large raft on the water .teaching baby how to swim while making sure he 's safe .\nC: children playing in the snow\nD: individuals riding horses on the beach", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_70_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_70_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_70_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_70_3.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_70_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: he woke up extra early that morning , for his first day of his new job .he was very nervous , but could n't manage to wake up on his own .he had a few coffees on his way out .he had dressed his best , but the coffee had kept him jittery .at his desk , he found that he had a lot of paperwork to fill out just for his first day . what a pain .\nB: he slept in and missed his first day of work\nC: he woke up late and missed his first day of work\nD: he was calm and relaxed on his first day of work", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: he woke up extra early that morning , for his first day of his new job .he was very nervous , but could n't manage to wake up on his own .he had a few coffees on his way out .he had dressed his best , but the coffee had kept him jittery .at his desk , he found that he had a lot of paperwork to fill out just for his first day . 
what a pain .\nB: he slept in and missed his first day of work\nC: he woke up late and missed his first day of work\nD: he was calm and relaxed on his first day of work", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_71_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_71_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_71_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_71_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_71_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the boat party was just what the [male] 's family needed .the elderly women gathered in the corner to talk .mr. rodriguez sat in the corner alone for a while until his wife showed up .his wife [female] brought their son and they had fun .i took a picture of the interior of the boat just for memory 's sake .\nB: the boat party was just what the [female] 's family needed .the elderly men gathered in the corner to talk .mr. rodriguez sat in the corner alone for a while until his wife showed up .his wife [female] brought their daughter and they had fun .i took a picture of the exterior of the boat just for memory 's sake .\nC: the boat party was crowded with people .the elderly men gathered in the corner to talk .mr. rodriguez sat in the corner alone for a while until his wife showed up .his wife brought their son and they had fun .i took a picture of the sunset just for memory 's sake .\nD: the boat party was boring .the elderly men gathered in the corner to talk .mr. 
rodriguez was alone in the corner until his wife showed up .his wife brought their daughter and they had fun .i took a picture of the exterior of the boat just for memory 's sake .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the boat party was just what the [male] 's family needed .the elderly women gathered in the corner to talk .mr. rodriguez sat in the corner alone for a while until his wife showed up .his wife [female] brought their son and they had fun .i took a picture of the interior of the boat just for memory 's sake .\nB: the boat party was just what the [female] 's family needed .the elderly men gathered in the corner to talk .mr. rodriguez sat in the corner alone for a while until his wife showed up .his wife [female] brought their daughter and they had fun .i took a picture of the exterior of the boat just for memory 's sake .\nC: the boat party was crowded with people .the elderly men gathered in the corner to talk .mr. rodriguez sat in the corner alone for a while until his wife showed up .his wife brought their son and they had fun .i took a picture of the sunset just for memory 's sake .\nD: the boat party was boring .the elderly men gathered in the corner to talk .mr. 
rodriguez was alone in the corner until his wife showed up .his wife brought their daughter and they had fun .i took a picture of the exterior of the boat just for memory 's sake .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_72_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_72_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_72_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_72_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_72_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: a group of children visit an amusement park for a birthday celebration .they ride on different attractions and have a delicious lunch .the birthday child hugs one of their friends tightly after the meal .the children are thrilled to receive presents .the group rides the carousel before leaving the park .\nB: a boy rides a bike to the park .he meets his friends and they all have a picnic lunch .the boy shakes hands with his friend after the meal .the boy is eager to open his birthday presents .the boy opens his presents and thanks his friends before they leave .\nC: a family goes to the zoo for the day .they see many animals and have a tasty lunch .the family takes a group photo after their meal .the young girl is excited to see a lion .the family leaves the zoo after a fun day .\nD: a girl goes to chucky cheese 's for her birthday party .many of the girl 's friends show up to her party and enjoy a nice meal .the birthday girl hugs one of her friends tightly after dinner .the birthday girl is very excited about her cake .before leaving , the birthday girl blows out her candle and prepares to eat her cake .", + "question": "Describe this set of images briefly.", + "context": "Select from 
the following choices.\nA: a group of children visit an amusement park for a birthday celebration .they ride on different attractions and have a delicious lunch .the birthday child hugs one of their friends tightly after the meal .the children are thrilled to receive presents .the group rides the carousel before leaving the park .\nB: a boy rides a bike to the park .he meets his friends and they all have a picnic lunch .the boy shakes hands with his friend after the meal .the boy is eager to open his birthday presents .the boy opens his presents and thanks his friends before they leave .\nC: a family goes to the zoo for the day .they see many animals and have a tasty lunch .the family takes a group photo after their meal .the young girl is excited to see a lion .the family leaves the zoo after a fun day .\nD: a girl goes to chucky cheese 's for her birthday party .many of the girl 's friends show up to her party and enjoy a nice meal .the birthday girl hugs one of her friends tightly after dinner .the birthday girl is very excited about her cake .before leaving , the birthday girl blows out her candle and prepares to eat her cake .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_73_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_73_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_73_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_73_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_73_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: we got into location on july 4th .we had to ride these long escalators to get to the exit .some of the advertisements in the terminal caught our eye , in particular this one for the tourism board ...and this one advertising the beauty of 
location location .as we left , these friendly police officers wished us a good vacation !\nB: we arrived at the destination on july 4th .we had to take these long elevators to reach the exit .some of the billboards in the terminal attracted our attention , especially this one for the travel bureau ...and this one promoting the charm of destination destination .as we departed , these amiable security personnel bid us a pleasant holiday !\nC: we arrived at the venue on july 4th .we had to climb these long stairs to get to the exit .some of the banners in the terminal caught our attention , in particular this one for the travel agency ...and this one promoting the attractiveness of destination destination .as we left , these friendly security guards wished us a great holiday !\nD: we entered the place on july 4th .we had to use these tall stairs to access the exit .some of the posters in the terminal grabbed our attention , particularly this one for the sightseeing committee ...and this one marketing the allure of place place .as we departed , these affable law enforcement officers wished us a wonderful trip !", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: we got into location on july 4th .we had to ride these long escalators to get to the exit .some of the advertisements in the terminal caught our eye , in particular this one for the tourism board ...and this one advertising the beauty of location location .as we left , these friendly police officers wished us a good vacation !\nB: we arrived at the destination on july 4th .we had to take these long elevators to reach the exit .some of the billboards in the terminal attracted our attention , especially this one for the travel bureau ...and this one promoting the charm of destination destination .as we departed , these amiable security personnel bid us a pleasant holiday !\nC: we arrived at the venue on july 4th .we had to climb these long stairs to get to 
the exit .some of the banners in the terminal caught our attention , in particular this one for the travel agency ...and this one promoting the attractiveness of destination destination .as we left , these friendly security guards wished us a great holiday !\nD: we entered the place on july 4th .we had to use these tall stairs to access the exit .some of the posters in the terminal grabbed our attention , particularly this one for the sightseeing committee ...and this one marketing the allure of place place .as we departed , these affable law enforcement officers wished us a wonderful trip !", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_74_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_74_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_74_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_74_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_74_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: jen was all dressed up for her 30th birthday party .friends and coworkers joined in the celebration .[male] and [male] were their usual crazy selves .missy made this beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !\nB: jen was all dressed up for her 30th birthday party .friends and coworkers joined in the celebration .[female] and [female] were their usual crazy selves .missy made this beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !\nC: jen was all dressed up for her 25th birthday party .friends and coworkers joined in the celebration .[male] and [male] were their usual crazy selves .missy made this 
beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !\nD: jen was all dressed up for her 30th birthday party .friends and coworkers joined in the celebration .[male] and [female] were their usual crazy selves .missy made this beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: jen was all dressed up for her 30th birthday party .friends and coworkers joined in the celebration .[male] and [male] were their usual crazy selves .missy made this beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !\nB: jen was all dressed up for her 30th birthday party .friends and coworkers joined in the celebration .[female] and [female] were their usual crazy selves .missy made this beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !\nC: jen was all dressed up for her 25th birthday party .friends and coworkers joined in the celebration .[male] and [male] were their usual crazy selves .missy made this beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !\nD: jen was all dressed up for her 30th birthday party .friends and coworkers joined in the celebration .[male] and [female] were their usual crazy selves .missy made this beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_75_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_75_1.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_75_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_75_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_75_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A young boy and his dog explore a forest trail. The boy climbs over a fallen tree on the path. The boy stops to play with his dog. Then they continue their exploration and come across a hidden pond. Feeling tired, they take a rest and relax by the water.\nB: A young boy and his dog go for a walk in the park. The boy jumps over a small branch on the path. The boy stops to pick up some flowers. After that, the boy continues his walk and finds a beautiful garden. Tired from the walk, he sits down and enjoys the view.\nC: A young boy and his dog go for a run down a dirt path. The boy hops a dead log that is blocking the dirt path. Boy stops at dead log and takes a moment to reflect. After reflecting the boy continues along the path and discovers a body of water. Exhausted from the adventure, he lays down on the bank and enjoys the afternoon.\nD: A young boy and his dog take a stroll in the countryside. The boy jumps over a small obstacle on the path. The boy stops to take a selfie. Then they continue their walk and find a hidden waterfall. Feeling tired, they sit down and admire the scenery.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A young boy and his dog explore a forest trail. The boy climbs over a fallen tree on the path. The boy stops to play with his dog. Then they continue their exploration and come across a hidden pond. Feeling tired, they take a rest and relax by the water.\nB: A young boy and his dog go for a walk in the park. The boy jumps over a small branch on the path. 
The boy stops to pick up some flowers. After that, the boy continues his walk and finds a beautiful garden. Tired from the walk, he sits down and enjoys the view.\nC: A young boy and his dog go for a run down a dirt path. The boy hops a dead log that is blocking the dirt path. Boy stops at dead log and takes a moment to reflect. After reflecting the boy continues along the path and discovers a body of water. Exhausted from the adventure, he lays down on the bank and enjoys the afternoon.\nD: A young boy and his dog take a stroll in the countryside. The boy jumps over a small obstacle on the path. The boy stops to take a selfie. Then they continue their walk and find a hidden waterfall. Feeling tired, they sit down and admire the scenery.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_76_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_76_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_76_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_76_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_76_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A group of children were playing on the beach.\nB: The [female] was with her mouth wide open in shock. The [female] walked in the [location] with two other [females] Five people walked up the stairs to the airplane. The airplane started to take off down the runway. The male smiled while working.\nC: The boy was playing with his toys in the park.\nD: The man was singing loudly on stage.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A group of children were playing on the beach.\nB: The [female] was with her mouth wide open in shock. 
The [female] walked in the [location] with two other [females] Five people walked up the stairs to the airplane. The airplane started to take off down the runway. The male smiled while working.\nC: The boy was playing with his toys in the park.\nD: The man was singing loudly on stage.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_77_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_77_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_77_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_77_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_77_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A family picnic in the mountains\nB: A man is helping a child climb the mountain. A group of people climbing a snowy mountain. A group of people climbing a mountain for an adventure. A picture of a person using a stick and snowboots to climb a dangerous mountain. The face of a person exhausted from climbing a dangerous mountain.\nC: A leisurely walk in the park\nD: A group of hikers exploring a forest trail", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A family picnic in the mountains\nB: A man is helping a child climb the mountain. A group of people climbing a snowy mountain. A group of people climbing a mountain for an adventure. A picture of a person using a stick and snowboots to climb a dangerous mountain. 
The face of a person exhausted from climbing a dangerous mountain.\nC: A leisurely walk in the park\nD: A group of hikers exploring a forest trail", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_78_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_78_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_78_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_78_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_78_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: it was a terrible night full of darkness\nB: the night was dim and uneventful\nC: it was a great night full of lightsthe night shined bright throughout the citythe buildings were amazing to look atand the food was just as goodthis was the perfect night for a night out\nD: the buildings were unimpressive and dull", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: it was a terrible night full of darkness\nB: the night was dim and uneventful\nC: it was a great night full of lightsthe night shined bright throughout the citythe buildings were amazing to look atand the food was just as goodthis was the perfect night for a night out\nD: the buildings were unimpressive and dull", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_79_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_79_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_79_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_79_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_79_4.jpg" + ], + "output": "C" + }, + { + "task": 
"multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Silvia and Grace are at a party, dancing and having fun. Grace feels confident with her solo and is happy to brag about this. Silvia is excited and seems unable to hide her joy. Grace stays away from the party and watches Silvia flaunt her moves. Only time will tell, there is 6 hours till the show.\nB: Grace and Silvia are in the studio, practising for the show. Grace feels confident with her solo and is happy to brag about this. Silvia is very nervous and seems unable to hide her fear. Silvia stays away from the practice and watches Grace flaunt her moves. Only time will tell, there is 6 hours till the show.\nC: Grace and Silvia are at the beach, relaxing and enjoying the sun. Grace feels confident with her solo and is happy to brag about this. Silvia is very nervous and seems unable to hide her fear. Silvia stays away from the beach and watches Grace flaunt her moves. Only time will tell, there is 6 hours till the show.\nD: Grace and Silvia are in the studio, practising for the show. Grace is nervous and seems unable to hide her fear. Grace stays away from the practice and watches Silvia flaunt her moves. Only time will tell, there is 6 hours till the show.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Silvia and Grace are at a party, dancing and having fun. Grace feels confident with her solo and is happy to brag about this. Silvia is excited and seems unable to hide her joy. Grace stays away from the party and watches Silvia flaunt her moves. Only time will tell, there is 6 hours till the show.\nB: Grace and Silvia are in the studio, practising for the show. Grace feels confident with her solo and is happy to brag about this. Silvia is very nervous and seems unable to hide her fear. Silvia stays away from the practice and watches Grace flaunt her moves. 
Only time will tell, there is 6 hours till the show.\nC: Grace and Silvia are at the beach, relaxing and enjoying the sun. Grace feels confident with her solo and is happy to brag about this. Silvia is very nervous and seems unable to hide her fear. Silvia stays away from the beach and watches Grace flaunt her moves. Only time will tell, there is 6 hours till the show.\nD: Grace and Silvia are in the studio, practising for the show. Grace is nervous and seems unable to hide her fear. Grace stays away from the practice and watches Silvia flaunt her moves. Only time will tell, there is 6 hours till the show.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_80_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_80_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_80_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_80_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_80_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: on a camping trip in the forest . the kids were worn out from the long hike .we set up a campfire in the woods . roasted marshmallows and hot dogs with the family .we had a lot of fun fishing in the river .here i am taking a selfie by the campfire .afterwards i went hiking to the waterfall . it was a lot of fun .\nB: at an amusement park . the children were tired from the roller coaster rides .we set up a picnic area in the park . burgers and sodas with the family .we had a lot of fun playing games and riding the merry-go-round .here i am taking a selfie on the ferris wheel .afterwards i went on the giant water slide . it was a lot of fun .\nC: on our way to the beach today . the boys passed out from the long drive .we set up on the beach . 
food and drinks with the family .we had a lot of fun setting up in the sand .here i am taking a selfie in the sun .afterwards i went surfing in the water . it was a lot of fun .\nD: on a snowy day in the mountains . the kids were exhausted from the long hike .we set up a tent in the snow . hot cocoa and marshmallows with the family .we enjoyed building a snowman together .here i am skiing down the slope .afterwards i went ice skating on the frozen lake . it was a lot of fun .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: on a camping trip in the forest . the kids were worn out from the long hike .we set up a campfire in the woods . roasted marshmallows and hot dogs with the family .we had a lot of fun fishing in the river .here i am taking a selfie by the campfire .afterwards i went hiking to the waterfall . it was a lot of fun .\nB: at an amusement park . the children were tired from the roller coaster rides .we set up a picnic area in the park . burgers and sodas with the family .we had a lot of fun playing games and riding the merry-go-round .here i am taking a selfie on the ferris wheel .afterwards i went on the giant water slide . it was a lot of fun .\nC: on our way to the beach today . the boys passed out from the long drive .we set up on the beach . food and drinks with the family .we had a lot of fun setting up in the sand .here i am taking a selfie in the sun .afterwards i went surfing in the water . it was a lot of fun .\nD: on a snowy day in the mountains . the kids were exhausted from the long hike .we set up a tent in the snow . hot cocoa and marshmallows with the family .we enjoyed building a snowman together .here i am skiing down the slope .afterwards i went ice skating on the frozen lake . 
it was a lot of fun .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_81_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_81_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_81_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_81_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_81_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: I took a cab to return to the hotel\nB: the front of the mall was somewhat crowded .i ran past them and took the escalator down .after shopping for a few hours , i returned to the street .i tried to catch a cab but a bush blocked me .i decided to just walk back to my hotel .\nC: the mall was empty and I took the stairs up\nD: I quickly caught a bus to my hotel", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: I took a cab to return to the hotel\nB: the front of the mall was somewhat crowded .i ran past them and took the escalator down .after shopping for a few hours , i returned to the street .i tried to catch a cab but a bush blocked me .i decided to just walk back to my hotel .\nC: the mall was empty and I took the stairs up\nD: I quickly caught a bus to my hotel", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_82_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_82_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_82_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_82_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_82_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + 
"visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: there were many sharks in the water .we were scared .\nB: we went to the beach today .there was a lot of seals .then we saw a castle .there was a lot of cool decorations .we had a really good day .\nC: we went to a mountain and climbed to the top .\nD: we visited the zoo and saw some tigers .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: there were many sharks in the water .we were scared .\nB: we went to the beach today .there was a lot of seals .then we saw a castle .there was a lot of cool decorations .we had a really good day .\nC: we went to a mountain and climbed to the top .\nD: we visited the zoo and saw some tigers .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_83_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_83_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_83_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_83_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_83_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: on the last day of our vacation we decided to visit snow mountain .there were a lot of beautiful snow-capped peaks .we noticed as we got there there were skiers enjoying the slopes .one skier was carrying skis .he told us part of the mountain was closed due to an avalanche so we ended up leaving .\nB: on the last day of our vacation we decided to visit rock mountain .there were a lot of beautiful mountains .we noticed as we got there there were workers doing work .one worker was carrying a huge rock .he told us part of the park was closed due to the construction so we ended up leaving 
.\nC: on the first day of our vacation we decided to visit beach mountain .there were a lot of beautiful beaches .we noticed as we got there there were lifeguards on duty .one lifeguard was carrying a surfboard .he told us part of the beach was closed due to the high waves so we ended up leaving .\nD: on the last day of our vacation we decided to visit forest mountain .there were a lot of beautiful trees and wildlife .we noticed as we got there there were rangers patrolling .one ranger was carrying a backpack .he told us part of the forest was closed due to the fire risk so we ended up leaving .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: on the last day of our vacation we decided to visit snow mountain .there were a lot of beautiful snow-capped peaks .we noticed as we got there there were skiers enjoying the slopes .one skier was carrying skis .he told us part of the mountain was closed due to an avalanche so we ended up leaving .\nB: on the last day of our vacation we decided to visit rock mountain .there were a lot of beautiful mountains .we noticed as we got there there were workers doing work .one worker was carrying a huge rock .he told us part of the park was closed due to the construction so we ended up leaving .\nC: on the first day of our vacation we decided to visit beach mountain .there were a lot of beautiful beaches .we noticed as we got there there were lifeguards on duty .one lifeguard was carrying a surfboard .he told us part of the beach was closed due to the high waves so we ended up leaving .\nD: on the last day of our vacation we decided to visit forest mountain .there were a lot of beautiful trees and wildlife .we noticed as we got there there were rangers patrolling .one ranger was carrying a backpack .he told us part of the forest was closed due to the fire risk so we ended up leaving .", + "input_image_path": [ + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_84_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_84_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_84_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_84_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_84_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the family went on a snowy hike .the youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .\nB: the family went on a beach picnic .the youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .\nC: the family went on a mountain climbing trip .the youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .\nD: the family went on a tropical vacation .the youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the family went on a snowy hike .the youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .\nB: the family went on a beach picnic 
.the youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .\nC: the family went on a mountain climbing trip .the youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .\nD: the family went on a tropical vacation .the youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_85_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_85_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_85_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_85_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_85_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: We spotted a small fishing boat.\nB: The water was crystal clear and blue today.\nC: The cargo ship was brand new.\nD: We saw the coolest thing today while on the water, it was a cargo ship house! The water today resembled a murky green color. The cargo ship we saw today on the ocean was huge! Its crazy to think how it floats on the ocean. Today we saw a crane remove the accommodation from the cargo ship! 
The cargo ship seemed to have rusted over the years.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: We spotted a small fishing boat.\nB: The water was crystal clear and blue today.\nC: The cargo ship was brand new.\nD: We saw the coolest thing today while on the water, it was a cargo ship house! The water today resembled a murky green color. The cargo ship we saw today on the ocean was huge! Its crazy to think how it floats on the ocean. Today we saw a crane remove the accommodation from the cargo ship! The cargo ship seemed to have rusted over the years.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_86_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_86_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_86_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_86_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_86_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: i never ride roller coasters\nB: i am scared of amusement parks\nC: i prefer to stay at home on my birthday\nD: i always ride the brain buster roller coaster on my birthday . a broken transmission is n't going to stop me .the only other transportation available at my home is designed for a 2-year-old .nothing was going to stop me , so i walked the train tracks .it took forever but i finally made it to the amusement park .that 's me , in the second car , turning green and about to lose my lunch . 
i ca n't imagine a better way to spend my birthday .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: i never ride roller coasters\nB: i am scared of amusement parks\nC: i prefer to stay at home on my birthday\nD: i always ride the brain buster roller coaster on my birthday . a broken transmission is n't going to stop me .the only other transportation available at my home is designed for a 2-year-old .nothing was going to stop me , so i walked the train tracks .it took forever but i finally made it to the amusement park .that 's me , in the second car , turning green and about to lose my lunch . i ca n't imagine a better way to spend my birthday .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_87_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_87_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_87_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_87_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_87_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A group of friends go on a hike in the mountains. They have a picnic at a scenic spot and take photos. They spot a deer in the distance and watch it as it wanders off. Later, they recount their adventure to their families.\nB: A man loads his dogs in the car for a trip to the beach. The man has made it to the beach and has some coffee while taking a walk. He stops to look at a seal on a rock, who is surrounded by seagulls. The seal dives into the water as the birds swim around it. The man is back home, telling his viewers about his trip.\nC: A family goes on a road trip to visit a zoo. They enjoy observing the various animals and have a picnic lunch. 
They see a monkey swinging in the trees and a peacock displaying its feathers. They reminisce about their fun day during dinner.\nD: A woman takes her cats to a park for a picnic. She enjoys her lunch while sitting on a bench and watching the ducks in the pond. She notices a turtle on a log, which quickly disappears into the water. She returns home and shares her experience with her friends.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A group of friends go on a hike in the mountains. They have a picnic at a scenic spot and take photos. They spot a deer in the distance and watch it as it wanders off. Later, they recount their adventure to their families.\nB: A man loads his dogs in the car for a trip to the beach. The man has made it to the beach and has some coffee while taking a walk. He stops to look at a seal on a rock, who is surrounded by seagulls. The seal dives into the water as the birds swim around it. The man is back home, telling his viewers about his trip.\nC: A family goes on a road trip to visit a zoo. They enjoy observing the various animals and have a picnic lunch. They see a monkey swinging in the trees and a peacock displaying its feathers. They reminisce about their fun day during dinner.\nD: A woman takes her cats to a park for a picnic. She enjoys her lunch while sitting on a bench and watching the ducks in the pond. She notices a turtle on a log, which quickly disappears into the water. 
She returns home and shares her experience with her friends.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_88_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_88_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_88_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_88_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_88_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A man riding a bike in the mountains.\nB: A family having a picnic in the woods.\nC: In the beautiful sea, a man take his boat with his dog. He enjoyed to ride the boat and the end he came to sea shore for stop the boat. He had a tattoos on his full of hand. He wants to take a pictures of his tattoos. He had fun the trip with his dog.\nD: A woman walking her cat in a park.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A man riding a bike in the mountains.\nB: A family having a picnic in the woods.\nC: In the beautiful sea, a man take his boat with his dog. He enjoyed to ride the boat and the end he came to sea shore for stop the boat. He had a tattoos on his full of hand. He wants to take a pictures of his tattoos. 
He had fun the trip with his dog.\nD: A woman walking her cat in a park.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_89_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_89_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_89_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_89_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_89_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: these images capture the peaceful serenity of natural landscapes and scenic views\nB: these images illustrate a bustling city with various attractions and activities to explore\nC: we have always enjoyed travelling to far away places .with so many opportunities to enjoy unique experiences .many times , even the places we stayed provided new experiences , like sleeping in bunk beds .we also enjoyed staying in places where we were able to gather with other visitors and share stories .with so many sites to see , it always seems like we have to leave far too soon .\nD: these images depict a cozy home where we can relax and unwind after a long day of exploring", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: these images capture the peaceful serenity of natural landscapes and scenic views\nB: these images illustrate a bustling city with various attractions and activities to explore\nC: we have always enjoyed travelling to far away places .with so many opportunities to enjoy unique experiences .many times , even the places we stayed provided new experiences , like sleeping in bunk beds .we also enjoyed staying in places where we were able to gather with other visitors and share stories .with so many sites to see , it always seems like we 
have to leave far too soon .\nD: these images depict a cozy home where we can relax and unwind after a long day of exploring", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_90_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_90_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_90_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_90_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_90_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: a political rally with a famous speaker\nB: the statue was finally in place ...the crowd began to gather to hear the donation ceremony and the speech .finally , pastor smith took the stage and began to speak .the veterans that served with the colonel were all present .and , they could not have been happier with the beautiful statue that honored their friend .\nC: a group of people gathering for a wedding ceremony\nD: a memorial service for a fallen soldier", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: a political rally with a famous speaker\nB: the statue was finally in place ...the crowd began to gather to hear the donation ceremony and the speech .finally , pastor smith took the stage and began to speak .the veterans that served with the colonel were all present .and , they could not have been happier with the beautiful statue that honored their friend .\nC: a group of people gathering for a wedding ceremony\nD: a memorial service for a fallen soldier", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_91_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_91_1.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_91_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_91_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_91_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: we arrived at our destination as the moon was going down . that was fine because we were wide awake from the adventurous trip .after a restless night we went adventuring and came across a deserted pier . we meandered down the pier for a bit .when we reached the end we found ourselves on a pebbly beach where we spent some time looking for starfish .in the distance the clouds gathered and we thought we would get caught in a storm .fortunately , the clouds dispersed and we enjoyed the rest of our day at the seashore .\nB: we arrived at our destination as the sun was going down . that was fine because we were pretty tired from the long trip .after a good nights sleep we went exploring and came across a long the dock . we walked down the dock for a while .when we came to the end we found ourselves on a rocky beach where we spent some time looking for crabs .in the distance the clouds rolled in and we thought we would get caught in the rain .fortunately , the clouds passed and we enjoyed the rest of our day at the seashore .\nC: we arrived at our destination as the sun was setting . that was fine because we were exhausted from the lengthy trip .after a rough night's sleep we went exploring and came across a wooden dock . 
we trudged down the dock for a bit .when we came to the end we found ourselves on a sandy beach where we spent some time looking for shells .in the distance the clouds dissipated and we thought we would have clear skies .unfortunately , the clouds gathered and we got caught in the rain .fortunately , the clouds dispersed and we enjoyed the rest of our day at the beach .\nD: we arrived at our destination as the sun was coming up . that was fine because we were well-rested from the short trip .after a sleepless night we went wandering and came across a short pier . we strolled down the pier for a short time .when we reached the end we found ourselves on a sandy beach where we spent some time looking for seashells .in the distance the clouds dispersed and we thought we would have clear skies .unfortunately , the clouds gathered and we got caught in the rain .fortunately , the clouds passed and we enjoyed the rest of our day at the seashore .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: we arrived at our destination as the moon was going down . that was fine because we were wide awake from the adventurous trip .after a restless night we went adventuring and came across a deserted pier . we meandered down the pier for a bit .when we reached the end we found ourselves on a pebbly beach where we spent some time looking for starfish .in the distance the clouds gathered and we thought we would get caught in a storm .fortunately , the clouds dispersed and we enjoyed the rest of our day at the seashore .\nB: we arrived at our destination as the sun was going down . that was fine because we were pretty tired from the long trip .after a good nights sleep we went exploring and came across a long the dock . 
we walked down the dock for a while .when we came to the end we found ourselves on a rocky beach where we spent some time looking for crabs .in the distance the clouds rolled in and we thought we would get caught in the rain .fortunately , the clouds passed and we enjoyed the rest of our day at the seashore .\nC: we arrived at our destination as the sun was setting . that was fine because we were exhausted from the lengthy trip .after a rough night's sleep we went exploring and came across a wooden dock . we trudged down the dock for a bit .when we came to the end we found ourselves on a sandy beach where we spent some time looking for shells .in the distance the clouds dissipated and we thought we would have clear skies .unfortunately , the clouds gathered and we got caught in the rain .fortunately , the clouds dispersed and we enjoyed the rest of our day at the beach .\nD: we arrived at our destination as the sun was coming up . that was fine because we were well-rested from the short trip .after a sleepless night we went wandering and came across a short pier . 
we strolled down the pier for a short time .when we reached the end we found ourselves on a sandy beach where we spent some time looking for seashells .in the distance the clouds dispersed and we thought we would have clear skies .unfortunately , the clouds gathered and we got caught in the rain .fortunately , the clouds passed and we enjoyed the rest of our day at the seashore .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_92_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_92_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_92_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_92_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_92_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Getting in an argument\nB: Hearing a happy news\nC: Family members visiting for a small celebration in an old age home\nD: Kids being grounded", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Getting in an argument\nB: Hearing a happy news\nC: Family members visiting for a small celebration in an old age home\nD: Kids being grounded", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_93_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_93_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_93_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_93_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_93_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: 
The girls father never arrives after he found out where she was. the three friends stood calmly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger males decides against her by saying something to the father\nB: The girls mother finally arrives after he found out where she was. the three friends stood defiantly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger males decides for her by saying something to the mother\nC: The girls father finally arrives after he found out where she was. the three friends stood defiantly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger males decides for her by saying something to the father\nD: The boys father finally arrives after he found out where he was. the three friends stood defiantly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger females decides for her by saying something to the father", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The girls father never arrives after he found out where she was. 
the three friends stood calmly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger males decides against her by saying something to the father\nB: The girls mother finally arrives after he found out where she was. the three friends stood defiantly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger males decides for her by saying something to the mother\nC: The girls father finally arrives after he found out where she was. the three friends stood defiantly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger males decides for her by saying something to the father\nD: The boys father finally arrives after he found out where he was. 
the three friends stood defiantly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger females decides for her by saying something to the father", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_94_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_94_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_94_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_94_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_94_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The man is cooking in the kitchen. His son is also helping him. They bake a delicious cake!\nB: The man is swimming in the pool. His son is also swimming with him. They find a lost toy!\nC: The man is fishing on the dock. His son is also fishing with him. They catch a fish! The son is super excited to have caught his first fish. He then catches another fish right away!\nD: The man is gardening in the backyard. His son is also helping him. They plant a new tree!", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The man is cooking in the kitchen. His son is also helping him. They bake a delicious cake!\nB: The man is swimming in the pool. His son is also swimming with him. They find a lost toy!\nC: The man is fishing on the dock. His son is also fishing with him. They catch a fish! The son is super excited to have caught his first fish. He then catches another fish right away!\nD: The man is gardening in the backyard. His son is also helping him. 
They plant a new tree!", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_95_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_95_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_95_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_95_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_95_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: a rocky beach in a black and white photograph .the waves come crashing onto the beach .a bench in the middle of the beach .a couple walk hand in hand along the rocky coastline .sunset over a forgotten beach .\nB: a busy city street with tall buildings\nC: a sunny beach with palm trees and clear blue water\nD: a snowy mountain peak with skiers", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: a rocky beach in a black and white photograph .the waves come crashing onto the beach .a bench in the middle of the beach .a couple walk hand in hand along the rocky coastline .sunset over a forgotten beach .\nB: a busy city street with tall buildings\nC: a sunny beach with palm trees and clear blue water\nD: a snowy mountain peak with skiers", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_96_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_96_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_96_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_96_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_96_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural 
image", + "source": "SSID", + "options": "A: woman cooking in the living room while the children eat\nB: man playing video games in the kitchen while children watch TV\nC: man prepare a food in the kitchen and grill some bacon and meet to prepare arrange the food to serve children's are eat the food in the table with happyness and the mom and grandma watching the children's with happy ness\nD: no one in the kitchen while children prepare their own food", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: woman cooking in the living room while the children eat\nB: man playing video games in the kitchen while children watch TV\nC: man prepare a food in the kitchen and grill some bacon and meet to prepare arrange the food to serve children's are eat the food in the table with happyness and the mom and grandma watching the children's with happy ness\nD: no one in the kitchen while children prepare their own food", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_97_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_97_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_97_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_97_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_97_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the couples posed for a picture in the crowdmany of the couples gathered to celebrate achievements in the communitythey all gathered inside to socializemany pictures were taken between friendsand also solo pictures were taken\nB: pictures of friends taken at an event\nC: a gathering of people for socializing and photography\nD: the people gathered for a group photo at a partycelebrations to mark community 
successes", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the couples posed for a picture in the crowdmany of the couples gathered to celebrate achievements in the communitythey all gathered inside to socializemany pictures were taken between friendsand also solo pictures were taken\nB: pictures of friends taken at an event\nC: a gathering of people for socializing and photography\nD: the people gathered for a group photo at a partycelebrations to mark community successes", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_98_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_98_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_98_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_98_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_98_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The kid looks bery excited about the wedding. The guy looks pretty shocked about something. Shes trying to figure out what dress she would like to have for her wedding. She seem to be happy about the dress she picked out. All of her family memebers seem to like it as well.\nB: She's confused about what dress she wants for the wedding.\nC: The kid looks bored at the wedding.\nD: The guy looks amused by something.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The kid looks bery excited about the wedding. The guy looks pretty shocked about something. Shes trying to figure out what dress she would like to have for her wedding. She seem to be happy about the dress she picked out. 
All of her family memebers seem to like it as well.\nB: She's confused about what dress she wants for the wedding.\nC: The kid looks bored at the wedding.\nD: The guy looks amused by something.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_99_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_99_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_99_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_99_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_99_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: They are create new one They mom seen this They mom looking some surprice The mom angry with they The little boy afraid here\nB: The family is having a picnic\nC: The boy is happy and excited\nD: The mom is peacefully reading a book", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: They are create new one They mom seen this They mom looking some surprice The mom angry with they The little boy afraid here\nB: The family is having a picnic\nC: The boy is happy and excited\nD: The mom is peacefully reading a book", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_100_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_100_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_100_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_100_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_100_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: 
There was no river in the hill station.\nB: A man is driving in the car.\nC: The car was moving back from the hill station.\nD: A woman is driving in the car. The car was moving ahead to the hill station. There was a river in the hill station which makes the hill station beautiful. She going to the river. She is close to the river.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: There was no river in the hill station.\nB: A man is driving in the car.\nC: The car was moving back from the hill station.\nD: A woman is driving in the car. The car was moving ahead to the hill station. There was a river in the hill station which makes the hill station beautiful. She going to the river. She is close to the river.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_101_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_101_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_101_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_101_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_101_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: i have been working hard all day .time to have a nice meal to end my day .then some drinks with family .it is nice enough day to take a drive .then try on some new shoes at the store .\nB: i have been working hard all day .time to have a nice meal to end my day .then some drinks alone .it is nice enough day to take a drive .then try on some new shoes at the store .\nC: i have been working hard all day .time to have a plain meal to end my day .then some drinks with family .it is nice enough day to take a drive .then try on some old shoes at the store .\nD: i have been relaxing all day .time to have a 
nice meal to end my day .then some drinks with family .it is nice enough day to take a drive .then try on some new shoes at the store .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: i have been working hard all day .time to have a nice meal to end my day .then some drinks with family .it is nice enough day to take a drive .then try on some new shoes at the store .\nB: i have been working hard all day .time to have a nice meal to end my day .then some drinks alone .it is nice enough day to take a drive .then try on some new shoes at the store .\nC: i have been working hard all day .time to have a plain meal to end my day .then some drinks with family .it is nice enough day to take a drive .then try on some old shoes at the store .\nD: i have been relaxing all day .time to have a nice meal to end my day .then some drinks with family .it is nice enough day to take a drive .then try on some new shoes at the store .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_102_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_102_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_102_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_102_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_102_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The climbers are struggling with extreme cold and harsh weather conditions, making it difficult for them to continue their journey.\nB: The climbers are enjoying a leisurely hike on a sunny day with no obstacles.\nC: The climbers are at the base of the mountain and are just starting their ascent.\nD: The climbers seems to be almost at the zenith of the daring mountain. 
They struggle to overcome changes in elevation thru the ice filled rocky terrain. Both climbers seem exhausted from the adventure and the weight they are carrying. However, they make it to the top in an outstanding show of dedication to enjoy the view. The rest of the journey awaits as they begin to trek down the slippery slopes.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The climbers are struggling with extreme cold and harsh weather conditions, making it difficult for them to continue their journey.\nB: The climbers are enjoying a leisurely hike on a sunny day with no obstacles.\nC: The climbers are at the base of the mountain and are just starting their ascent.\nD: The climbers seems to be almost at the zenith of the daring mountain. They struggle to overcome changes in elevation thru the ice filled rocky terrain. Both climbers seem exhausted from the adventure and the weight they are carrying. However, they make it to the top in an outstanding show of dedication to enjoy the view. The rest of the journey awaits as they begin to trek down the slippery slopes.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_103_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_103_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_103_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_103_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_103_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A group of friends visited a zoo and saw various animals\nB: A family went to the beach for a picnic\nC: On a regular day, two boys went to a library\nD: On the weekend, five lovely girls went to a playground park. 
They arrived with a guide who is knowledgeable about the park. Before entering the park, she gave the playing role instructions. They had a wonderful time and participated in a variety of activities. They played golf, which was incredibly interesting to them. finally, they immensely enjoyed it.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A group of friends visited a zoo and saw various animals\nB: A family went to the beach for a picnic\nC: On a regular day, two boys went to a library\nD: On the weekend, five lovely girls went to a playground park. They arrived with a guide who is knowledgeable about the park. Before entering the park, she gave the playing role instructions. They had a wonderful time and participated in a variety of activities. They played golf, which was incredibly interesting to them. finally, they immensely enjoyed it.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_104_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_104_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_104_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_104_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_104_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A gathering takes place in the market of people hailing from numerous countries in Africa. As the number of people present increases, a growing number of them are also working alongside one another in order to make a living. Some are trying to make a living by selling food, cleaning clothing, and providing for their family through these activities. 
When it comes to discovering a wide variety of opportunities, Africa is among the most prominent regions to be located in. Those who are willing to keep pushing the limit through increasingly difficult circumstances experience severe hunger.\nB: A group of people at a concert enjoying music.\nC: A group of people sitting in a classroom studying.\nD: A group of people having a picnic in a park.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A gathering takes place in the market of people hailing from numerous countries in Africa. As the number of people present increases, a growing number of them are also working alongside one another in order to make a living. Some are trying to make a living by selling food, cleaning clothing, and providing for their family through these activities. When it comes to discovering a wide variety of opportunities, Africa is among the most prominent regions to be located in. Those who are willing to keep pushing the limit through increasingly difficult circumstances experience severe hunger.\nB: A group of people at a concert enjoying music.\nC: A group of people sitting in a classroom studying.\nD: A group of people having a picnic in a park.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_105_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_105_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_105_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_105_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_105_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A group of locals visited a foreigner and asked for recommendations on what to do in the area.\nB: A traveler visited a market 
and bought some local food to try.\nC: A foreigner visited locals ina restaurant. He asked what was good to eat. They suggested some options for them. They discussed this as a group. He decided on what he wanted.\nD: A local visited foreigners in a restaurant and asked for recommendations on what to eat.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A group of locals visited a foreigner and asked for recommendations on what to do in the area.\nB: A traveler visited a market and bought some local food to try.\nC: A foreigner visited locals ina restaurant. He asked what was good to eat. They suggested some options for them. They discussed this as a group. He decided on what he wanted.\nD: A local visited foreigners in a restaurant and asked for recommendations on what to eat.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_106_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_106_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_106_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_106_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_106_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: These people are going for a picnic in the park.\nB: These people are gathering to go ride their bikes. They took a selfie on their bikes before riding. They rode down a steep hill on their bikes. The man takes another selfie of himself with his gear on. 
Then he pops a wheelie on his bike.\nC: These people are having a barbecue in their backyard.\nD: These people are preparing to go swimming at the beach.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: These people are going for a picnic in the park.\nB: These people are gathering to go ride their bikes. They took a selfie on their bikes before riding. They rode down a steep hill on their bikes. The man takes another selfie of himself with his gear on. Then he pops a wheelie on his bike.\nC: These people are having a barbecue in their backyard.\nD: These people are preparing to go swimming at the beach.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_107_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_107_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_107_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_107_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_107_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: this is a snapshot of a busy city street.\nB: this is a photo of a crowded marketplace.\nC: this is our first preview of the new restaurant .the decorations did n't seem like much .we had heard that this restaurant would be good though .we had a view of the train out of the window .the walls were plastered with these designs .\nD: this is a collection of ancient artifacts.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: this is a snapshot of a busy city street.\nB: this is a photo of a crowded marketplace.\nC: this is our first preview of the new restaurant .the decorations did n't seem like much .we had heard that this restaurant would be 
good though .we had a view of the train out of the window .the walls were plastered with these designs .\nD: this is a collection of ancient artifacts.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_108_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_108_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_108_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_108_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_108_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: The graffiti in our city is not good. The paintings on the building are mediocre. The people and statues are not impressive.\nB: The graffiti in our city is terrible and ugly. The paintings depict ugly people and statues.\nC: our city is really a lovely place .even the graffiti is done in good taste .paintings on the building depicting beautiful people .and gorgeous statues .and who could resist the baby penguin named [female] ?\nD: The graffiti in this city is so-so. The building looks unimpressive. The people and statues are nothing special.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The graffiti in our city is not good. The paintings on the building are mediocre. The people and statues are not impressive.\nB: The graffiti in our city is terrible and ugly. The paintings depict ugly people and statues.\nC: our city is really a lovely place .even the graffiti is done in good taste .paintings on the building depicting beautiful people .and gorgeous statues .and who could resist the baby penguin named [female] ?\nD: The graffiti in this city is so-so. The building looks unimpressive. 
The people and statues are nothing special.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_109_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_109_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_109_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_109_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_109_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Woman with blonde hair in front of a purple wall. Woman with black hair in front of what seems to be a painting. White woman serving food to a hispanic man in mid-twenties. Woman with strong (greek) nose looking down. Hands pouring soup in front of water.\nB: Woman with brown hair in front of a purple wall. Woman with black hair in front of what seems to be a painting. Black man serving food to a hispanic man in mid-twenties. Woman with strong (greek) nose looking down. Hands pouring soup in front of water.\nC: Man with blonde hair in front of a purple wall. Woman with black hair in front of what seems to be a painting. White woman serving food to a hispanic man in mid-twenties. Woman with strong (greek) nose looking down. Hands pouring soup in front of water.\nD: Woman with blonde hair in front of a yellow wall. Woman with black hair in front of what seems to be a painting. White woman serving food to an asian man in mid-twenties. Woman with strong (greek) nose looking down. Hands pouring soup in front of water.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Woman with blonde hair in front of a purple wall. Woman with black hair in front of what seems to be a painting. White woman serving food to a hispanic man in mid-twenties. 
Woman with strong (greek) nose looking down. Hands pouring soup in front of water.\nB: Woman with brown hair in front of a purple wall. Woman with black hair in front of what seems to be a painting. Black man serving food to a hispanic man in mid-twenties. Woman with strong (greek) nose looking down. Hands pouring soup in front of water.\nC: Man with blonde hair in front of a purple wall. Woman with black hair in front of what seems to be a painting. White woman serving food to a hispanic man in mid-twenties. Woman with strong (greek) nose looking down. Hands pouring soup in front of water.\nD: Woman with blonde hair in front of a yellow wall. Woman with black hair in front of what seems to be a painting. White woman serving food to an asian man in mid-twenties. Woman with strong (greek) nose looking down. Hands pouring soup in front of water.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_110_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_110_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_110_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_110_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_110_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: I took my daughter out to eat for her birthday. Then we went to the arcade and we played some games.\nB: I took my son out to eat for his birthday. Then we went to the arcade and we played some games. He picked out some prizes to get with the tickets he won. After that we played some arcade video games together. We both enjoyed playing duck hunt the most!\nC: I took my son out to eat for his birthday. Then we went to the movies and we watched a movie.\nD: I took my son out to eat for his birthday. 
Then we went to the park and we played on the swings.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: I took my daughter out to eat for her birthday. Then we went to the arcade and we played some games.\nB: I took my son out to eat for his birthday. Then we went to the arcade and we played some games. He picked out some prizes to get with the tickets he won. After that we played some arcade video games together. We both enjoyed playing duck hunt the most!\nC: I took my son out to eat for his birthday. Then we went to the movies and we watched a movie.\nD: I took my son out to eat for his birthday. Then we went to the park and we played on the swings.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_111_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_111_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_111_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_111_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_111_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: The images capture a scenic mountain hike.\nB: The images depict a casual walk in the park.\nC: The images show a day at the beach with friends.\nD: today started off like an ordinary work day . little did i know that today would be different than all the rest !i walked along the crowded location location streets until i came to my office building . i went in and started to work .at the end of the day , i finally got the courage to ask my coworker out . we walked the town and talked a lot .we had a great dinner in a restaurant here . we really hit it off .we talked until dawn , and then i walked her home to her apartment . 
i ca n't wait to see her again !", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The images capture a scenic mountain hike.\nB: The images depict a casual walk in the park.\nC: The images show a day at the beach with friends.\nD: today started off like an ordinary work day . little did i know that today would be different than all the rest !i walked along the crowded location location streets until i came to my office building . i went in and started to work .at the end of the day , i finally got the courage to ask my coworker out . we walked the town and talked a lot .we had a great dinner in a restaurant here . we really hit it off .we talked until dawn , and then i walked her home to her apartment . i ca n't wait to see her again !", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_112_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_112_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_112_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_112_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_112_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: there is a birthday party celebration with vegetarian dishes being served and family members are bored\nB: there is a family party preparation going on there non veg dishes in the party like expensive party family members are enjoying kids are enjoying so much\nC: the images show a formal event with only vegetarian dishes and the guests seem uninterested\nD: the pictures depict a casual gathering with no food and kids are getting bored", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: there is a birthday party 
celebration with vegetarian dishes being served and family members are bored\nB: there is a family party preparation going on there non veg dishes in the party like expensive party family members are enjoying kids are enjoying so much\nC: the images show a formal event with only vegetarian dishes and the guests seem uninterested\nD: the pictures depict a casual gathering with no food and kids are getting bored", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_113_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_113_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_113_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_113_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_113_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the fireworks were a great addition ,to the annual celebration .[female] is holding up her vegetables on sticks ,while her uncle cooks on the grill .the view was breathtaking .\nB: the fireworks are a great addition ,to the annual celebration .[female] is holding up her meat on sticks ,while her uncle cooks on the grill .the view was breathtaking .\nC: the parade was a great addition ,to the annual celebration .[female] is holding up her meat on sticks ,while her uncle cooks on the grill .the view was breathtaking .\nD: the picnic was a great addition ,to the annual celebration .[male] is holding up his meat on sticks ,while his nephew cooks on the grill .the view was breathtaking .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the fireworks were a great addition ,to the annual celebration .[female] is holding up her vegetables on sticks ,while her uncle cooks on the grill .the view was 
breathtaking .\nB: the fireworks are a great addition ,to the annual celebration .[female] is holding up her meat on sticks ,while her uncle cooks on the grill .the view was breathtaking .\nC: the parade was a great addition ,to the annual celebration .[female] is holding up her meat on sticks ,while her uncle cooks on the grill .the view was breathtaking .\nD: the picnic was a great addition ,to the annual celebration .[male] is holding up his meat on sticks ,while his nephew cooks on the grill .the view was breathtaking .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_114_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_114_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_114_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_114_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_114_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: The guests were not happy with the food.\nB: today is a good day to invite family over .it is such a nice day today , and we finished cleaning up the yard .we should water all the grass before people come over .some of the guests made a campfire to cook some of the food .everyone is having a great time , the meal was great today .\nC: It is raining heavily today.\nD: The yard is not clean.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The guests were not happy with the food.\nB: today is a good day to invite family over .it is such a nice day today , and we finished cleaning up the yard .we should water all the grass before people come over .some of the guests made a campfire to cook some of the food .everyone is having a great time , the meal was great today .\nC: It is raining 
heavily today.\nD: The yard is not clean.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_115_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_115_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_115_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_115_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_115_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: she had a terrible time\nB: she fell only once\nC: i taught my daughter how to ride her bike today .she had a great time .she only fell four times .she was okay though .afterward i bought a cake for her .\nD: she was injured", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: she had a terrible time\nB: she fell only once\nC: i taught my daughter how to ride her bike today .she had a great time .she only fell four times .she was okay though .afterward i bought a cake for her .\nD: she was injured", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_116_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_116_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_116_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_116_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_116_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: heres a certificate for our baby , told ya mom he was real !the whole family met to celebrate , generations all under one roof .mom loves taking pictures with us , 
she cant let go of the baby .grandma too had to have some pictures . we didnt mind though , we are one big happy family .[male] [male] might be too young for his cake but we all loved it for him .\nB: A couple sharing a special moment with their newborn baby.\nC: A family reunion with multiple generations coming together.\nD: A group of friends celebrating a birthday party.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: heres a certificate for our baby , told ya mom he was real !the whole family met to celebrate , generations all under one roof .mom loves taking pictures with us , she cant let go of the baby .grandma too had to have some pictures . we didnt mind though , we are one big happy family .[male] [male] might be too young for his cake but we all loved it for him .\nB: A couple sharing a special moment with their newborn baby.\nC: A family reunion with multiple generations coming together.\nD: A group of friends celebrating a birthday party.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_117_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_117_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_117_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_117_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_117_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the evening started out relaxed and laid-back, everyone was confident. they had a short warm journey ahead of them. they had to walk a long distance to get to their destination. once there, everyone was reserved and cautious. 
the d.j.`s started to set up their booth for a casual setting and soothing music.\nB: the night started out calm and peaceful, everyone was on the same page. they had a short warm journey ahead of them. they had to drive a short distance to get to their destination. once there, everyone stayed reserved and stayed formal. the d.j.`s started to set up their booth for a calm atmosphere and slow music.\nC: the day started out a bit hectic , everyone was a little confused .they had a long cold journey ahead of them .they had to walk a bit of a distance to get to their destination .once there , everyone pretty much let loose and got comfortable .the d.j.`s started to set up their booth for a good time and good music .\nD: the morning started out exciting and energetic, everyone was highly motivated. they had a short hot journey ahead of them. they had to run a long distance to get to their destination. once there, everyone eagerly got to work and got busy. the d.j.`s started to set up their booth for an early start and hyped music.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the evening started out relaxed and laid-back, everyone was confident. they had a short warm journey ahead of them. they had to walk a long distance to get to their destination. once there, everyone was reserved and cautious. the d.j.`s started to set up their booth for a casual setting and soothing music.\nB: the night started out calm and peaceful, everyone was on the same page. they had a short warm journey ahead of them. they had to drive a short distance to get to their destination. once there, everyone stayed reserved and stayed formal. 
the d.j.`s started to set up their booth for a calm atmosphere and slow music.\nC: the day started out a bit hectic , everyone was a little confused .they had a long cold journey ahead of them .they had to walk a bit of a distance to get to their destination .once there , everyone pretty much let loose and got comfortable .the d.j.`s started to set up their booth for a good time and good music .\nD: the morning started out exciting and energetic, everyone was highly motivated. they had a short hot journey ahead of them. they had to run a long distance to get to their destination. once there, everyone eagerly got to work and got busy. the d.j.`s started to set up their booth for an early start and hyped music.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_118_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_118_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_118_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_118_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_118_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the vacation spot was empty .nobody wanted to spend memorial weekend here .the pool was dirty and unappealing .the garden was an eyesore .the canyon was a boring place to visit .\nB: the vacation spot was lively and vibrant .many people came out here to spend memorial weekend .the pool was a crowded and noisy place to relax .this garden was a beautiful and picturesque sight .a few miles away from the vacation spot there was a quiet woodland people liked to visit .\nC: the vacation spot was crowded .everyone came out here to spend memorial weekend .the pool was a refreshing way to relax .this garden was an attractive sight .a few miles away from the 
vacation spot there was a canyon people liked to visit .\nD: the vacation spot was peaceful and serene .hardly anyone came out here to spend memorial weekend .the pool was closed and inaccessible .this garden was a dull and uninteresting sight .a few miles away from the vacation spot there was a bustling city people liked to visit .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the vacation spot was empty .nobody wanted to spend memorial weekend here .the pool was dirty and unappealing .the garden was an eyesore .the canyon was a boring place to visit .\nB: the vacation spot was lively and vibrant .many people came out here to spend memorial weekend .the pool was a crowded and noisy place to relax .this garden was a beautiful and picturesque sight .a few miles away from the vacation spot there was a quiet woodland people liked to visit .\nC: the vacation spot was crowded .everyone came out here to spend memorial weekend .the pool was a refreshing way to relax .this garden was an attractive sight .a few miles away from the vacation spot there was a canyon people liked to visit .\nD: the vacation spot was peaceful and serene .hardly anyone came out here to spend memorial weekend .the pool was closed and inaccessible .this garden was a dull and uninteresting sight .a few miles away from the vacation spot there was a bustling city people liked to visit .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_119_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_119_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_119_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_119_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_119_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or 
Natural image", + "source": "VIST", + "options": "A: the family went to the beach to watch the sunrise\nB: the family went to the beach to see the sunset .it was a beautiful day !there were a lot of people sitting in the sand .our dog enjoyed the trip , too !then we came home and had a delicious dinner !\nC: the family went to the mountains for a hike\nD: the family went to the park to have a picnic", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the family went to the beach to watch the sunrise\nB: the family went to the beach to see the sunset .it was a beautiful day !there were a lot of people sitting in the sand .our dog enjoyed the trip , too !then we came home and had a delicious dinner !\nC: the family went to the mountains for a hike\nD: the family went to the park to have a picnic", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_120_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_120_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_120_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_120_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_120_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: She appeared nervous and unprepared for the tough competition, lacking the confidence to give her best swing.\nB: She looked indifferent about bowling, uninterested in the tough competition, and unwilling to give her best swing.\nC: She seemed disinterested in bowling and unprepared for the tough competition, showing no intention to give her best swing.\nD: She looked excited to be bowling, she knew the competition was tough, but she loved a challenege. 
She was about to gove her best swing, bevause we all wish for a 20 pin knock out. The anticipation as you watch the ball roll down the runway, although it only takes a second Its the competition throw now, i hope he doesnt roll as good as me! You Question how many oins can you STRIKE OUT", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: She appeared nervous and unprepared for the tough competition, lacking the confidence to give her best swing.\nB: She looked indifferent about bowling, uninterested in the tough competition, and unwilling to give her best swing.\nC: She seemed disinterested in bowling and unprepared for the tough competition, showing no intention to give her best swing.\nD: She looked excited to be bowling, she knew the competition was tough, but she loved a challenege. She was about to gove her best swing, bevause we all wish for a 20 pin knock out. The anticipation as you watch the ball roll down the runway, although it only takes a second Its the competition throw now, i hope he doesnt roll as good as me! 
You Question how many oins can you STRIKE OUT", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_121_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_121_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_121_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_121_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_121_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: a good day to fish at lake a guy is ready to fish he catches one and puts in cooler he closes the cooler he greets dog\nB: a woman is feeding birds in the park\nC: a man is dancing in the rain with his dog\nD: a group of friends having a picnic by the beach", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: a good day to fish at lake a guy is ready to fish he catches one and puts in cooler he closes the cooler he greets dog\nB: a woman is feeding birds in the park\nC: a man is dancing in the rain with his dog\nD: a group of friends having a picnic by the beach", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_122_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_122_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_122_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_122_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_122_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Shows a wooden house from the bottom of the deck a man is using a fire with logs to make 
something the smoke exits through a gap in the cieling theres a ring around the fire the house is held up by beams\nB: Features a serene beach with crystal clear water and palm trees\nC: Portrays a snow-covered mountain peak with skiers in the distance\nD: Depicts a modern cityscape with tall skyscrapers and bustling streets", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Shows a wooden house from the bottom of the deck a man is using a fire with logs to make something the smoke exits through a gap in the cieling theres a ring around the fire the house is held up by beams\nB: Features a serene beach with crystal clear water and palm trees\nC: Portrays a snow-covered mountain peak with skiers in the distance\nD: Depicts a modern cityscape with tall skyscrapers and bustling streets", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_123_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_123_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_123_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_123_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_123_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A group of friends enjoying a scooter ride in the mountains.\nB: A group of professional off-road cyclists.\nC: A family enjoying a day out in the countryside.\nD: That is the best scooterist in Poland So sick! Off roading, I didn't even think it was possible. Tandem off roading, oh my god! Kids! 
Cabbage soup is ready!", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A group of friends enjoying a scooter ride in the mountains.\nB: A group of professional off-road cyclists.\nC: A family enjoying a day out in the countryside.\nD: That is the best scooterist in Poland So sick! Off roading, I didn't even think it was possible. Tandem off roading, oh my god! Kids! Cabbage soup is ready!", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_124_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_124_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_124_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_124_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_124_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: we waited in line for a long time and it was frustrating !\nB: the field was small and unimpressive !\nC: we were on the bus and ready to have some fun !we waited in line , but only for a little while !it was time to go and enjoy ourselves !while in the stadium , we gazed at all of the awesome sites !the field was huge and beautiful ! we had fun !\nD: we were bored and didn't enjoy the stadium at all !", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: we waited in line for a long time and it was frustrating !\nB: the field was small and unimpressive !\nC: we were on the bus and ready to have some fun !we waited in line , but only for a little while !it was time to go and enjoy ourselves !while in the stadium , we gazed at all of the awesome sites !the field was huge and beautiful ! 
we had fun !\nD: we were bored and didn't enjoy the stadium at all !", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_125_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_125_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_125_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_125_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_125_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: we first went at the shores to get a closer look at the incredible view of the city .we can see the apartment we rented for the week . it has a great view .i took this pictures of the mallards . they enjoying the light breeze of the afternoon .i asked the local about this log , apparently it came from a old crusade boat that sinked here .this is where we sat and admired the vast location sea .\nB: The city view from the shores was disappointing\nC: The apartment we rented had a terrible view\nD: The mallards were scared of the light breeze", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: we first went at the shores to get a closer look at the incredible view of the city .we can see the apartment we rented for the week . it has a great view .i took this pictures of the mallards . 
they enjoying the light breeze of the afternoon .i asked the local about this log , apparently it came from a old crusade boat that sinked here .this is where we sat and admired the vast location sea .\nB: The city view from the shores was disappointing\nC: The apartment we rented had a terrible view\nD: The mallards were scared of the light breeze", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_126_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_126_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_126_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_126_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_126_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Diana is getting ready for a party. Marzia warns him about the offensive poster. Marzia approaches Diana and asks him to remove the poster. Marzia apologizes to everyone in the room. They both remove the manifest.\nB: Diana is planning for a meeting. Marzia advises him to change the poster displayed. Marzia whispers something in Diana's ear and suggests removing the poster. Marzia is pleased to reconcile with everyone in the room. They both take down the poster.\nC: Diana is preparing for a rally. Marzia approaches him to warn that the poster displayed is offensive Marzia approaches Diana's ear and suggests that he remove the poster Marzia is happy to apologize with everyone in the room They both remove the manifest.\nD: Diana is packing for a trip. Marzia informs him about an offensive poster. Marzia talks to Diana and advises him to take down the poster. Marzia is seen apologizing to everyone in the room. 
They both remove the manifesto.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Diana is getting ready for a party. Marzia warns him about the offensive poster. Marzia approaches Diana and asks him to remove the poster. Marzia apologizes to everyone in the room. They both remove the manifest.\nB: Diana is planning for a meeting. Marzia advises him to change the poster displayed. Marzia whispers something in Diana's ear and suggests removing the poster. Marzia is pleased to reconcile with everyone in the room. They both take down the poster.\nC: Diana is preparing for a rally. Marzia approaches him to warn that the poster displayed is offensive Marzia approaches Diana's ear and suggests that he remove the poster Marzia is happy to apologize with everyone in the room They both remove the manifest.\nD: Diana is packing for a trip. Marzia informs him about an offensive poster. Marzia talks to Diana and advises him to take down the poster. Marzia is seen apologizing to everyone in the room. They both remove the manifesto.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_127_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_127_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_127_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_127_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_127_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: a busy street market in Japan\nB: a fast-food restaurant serving traditional Japanese cuisine\nC: as a japanese exchange student i 'm asked quite often about what i find different .actually , we are quite the same . 
we have our version of fast-food restaurants .we also have our traditional restaurants and markets .miso soup ! our steaming hot fast-food eaten rapidly with ohashi ( chop sticks ) .my car . see , we are quite similar to you . : )\nD: a group of friends eating sushi at a traditional restaurant", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: a busy street market in Japan\nB: a fast-food restaurant serving traditional Japanese cuisine\nC: as a japanese exchange student i 'm asked quite often about what i find different .actually , we are quite the same . we have our version of fast-food restaurants .we also have our traditional restaurants and markets .miso soup ! our steaming hot fast-food eaten rapidly with ohashi ( chop sticks ) .my car . see , we are quite similar to you . : )\nD: a group of friends eating sushi at a traditional restaurant", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_128_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_128_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_128_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_128_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_128_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: we saw terrible sand sculptures\nB: we had a terrible weekend together\nC: the weather was terrible for hanging out at the beach\nD: we had a great weekend together .before hitting the beach , we stopped at a burger stand .the weather was great for hanging out at the beach .we saw awesome sand sculptures .we ca n't wait to come back here !", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: we saw terrible 
sand sculptures\nB: we had a terrible weekend together\nC: the weather was terrible for hanging out at the beach\nD: we had a great weekend together .before hitting the beach , we stopped at a burger stand .the weather was great for hanging out at the beach .we saw awesome sand sculptures .we ca n't wait to come back here !", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_129_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_129_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_129_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_129_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_129_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: At the park on a sunny day, the Thompson family enjoyed a picnic. They played games and had a great time.\nB: Late one Friday night, the Smith family gathered for dinner. The kids were well-behaved and polite.\nC: Early one Sunday morning, the Sandler family gathered for breakfast. Of course, Brody made a mess while eating his cereal. The girls were gabbing. The family planned their day. Dad suggested going to the zoo.\nD: On a quiet Saturday afternoon, the Johnson family cleaned their house together.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: At the park on a sunny day, the Thompson family enjoyed a picnic. They played games and had a great time.\nB: Late one Friday night, the Smith family gathered for dinner. The kids were well-behaved and polite.\nC: Early one Sunday morning, the Sandler family gathered for breakfast. Of course, Brody made a mess while eating his cereal. The girls were gabbing. The family planned their day. 
Dad suggested going to the zoo.\nD: On a quiet Saturday afternoon, the Johnson family cleaned their house together.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_130_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_130_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_130_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_130_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_130_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The man was selling bananas at a fruit market\nB: The man was walking in the mountains\nC: The man was riding a bicycle in the city\nD: The man was admiring the bunches of bananas he had picked. He had his load of bananas tied to his sbike as he road them on the curved road. The moutains where ahead and you could see the grey sky promising rain. The mountains went higher and higher with trees and dirt. Soon the man was joined with others on bikes with bananas as well on the road.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The man was selling bananas at a fruit market\nB: The man was walking in the mountains\nC: The man was riding a bicycle in the city\nD: The man was admiring the bunches of bananas he had picked. He had his load of bananas tied to his sbike as he road them on the curved road. The moutains where ahead and you could see the grey sky promising rain. The mountains went higher and higher with trees and dirt. 
Soon the man was joined with others on bikes with bananas as well on the road.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_131_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_131_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_131_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_131_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_131_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The male is standing out of his vehicle. They are with their partner looking happy. They head down the road to see the sites. Whilst in Canada they see boats and the lakes. They are excited for their adventures.\nB: The couple is sitting inside the car. They look tired and frustrated. They are stuck in traffic and unable to move. While in Canada they are stressed about the long journey. They worry about the delays.\nC: The female is alone in the car. She looks bored and uninterested. She drives to a deserted place. While in Canada she is lost and confused. She is afraid of the unknown.\nD: There are no people in the images. Only empty vehicles and a desolate road. The surroundings are gloomy and unwelcoming. There is no sense of excitement or adventure. The atmosphere is dull and depressing.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The male is standing out of his vehicle. They are with their partner looking happy. They head down the road to see the sites. Whilst in Canada they see boats and the lakes. They are excited for their adventures.\nB: The couple is sitting inside the car. They look tired and frustrated. They are stuck in traffic and unable to move. While in Canada they are stressed about the long journey. 
They worry about the delays.\nC: The female is alone in the car. She looks bored and uninterested. She drives to a deserted place. While in Canada she is lost and confused. She is afraid of the unknown.\nD: There are no people in the images. Only empty vehicles and a desolate road. The surroundings are gloomy and unwelcoming. There is no sense of excitement or adventure. The atmosphere is dull and depressing.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_132_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_132_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_132_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_132_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_132_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: local art gallery showcasing professional artists' work.\nB: local art museum to host children 's art ,artwork done by 5th grader that reminded her of spring .8th grade student created sculptures to capture winter 's ice .12th grade student converted moms van into a shaded oasis and a great view of hills .10th grader who was amazed by organisms in the body , drew this for health class .\nC: exhibition of historical artifacts and ancient relics.\nD: collection of artwork from famous painters around the world.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: local art gallery showcasing professional artists' work.\nB: local art museum to host children 's art ,artwork done by 5th grader that reminded her of spring .8th grade student created sculptures to capture winter 's ice .12th grade student converted moms van into a shaded oasis and a great view of hills .10th grader who was amazed by organisms 
in the body , drew this for health class .\nC: exhibition of historical artifacts and ancient relics.\nD: collection of artwork from famous painters around the world.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_133_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_133_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_133_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_133_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_133_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: these pictures showcase the experience of a day at the amusement park\nB: here we are on the first day of our trip to the beach .we were so excited that we both had to take pictures .we took a short break from the beach , but we got lost .however , we found more beach and it was more peaceful .we finally got in the water after a while .\nC: this is a collection of photos from a hiking trip in the mountains\nD: these images capture the excitement of a city tour", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: these pictures showcase the experience of a day at the amusement park\nB: here we are on the first day of our trip to the beach .we were so excited that we both had to take pictures .we took a short break from the beach , but we got lost .however , we found more beach and it was more peaceful .we finally got in the water after a while .\nC: this is a collection of photos from a hiking trip in the mountains\nD: these images capture the excitement of a city tour", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_134_0.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_134_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_134_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_134_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_134_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: Nobody enjoyed themselves\nB: Some people were sleeping\nC: The party was boring\nD: i went to my friend 's party last night .some of the guest were dressed up .i had a lot of fun talking to everyone there .we spent some time playing games in the living room .after a few hours everyone was very tired .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Nobody enjoyed themselves\nB: Some people were sleeping\nC: The party was boring\nD: i went to my friend 's party last night .some of the guest were dressed up .i had a lot of fun talking to everyone there .we spent some time playing games in the living room .after a few hours everyone was very tired .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_135_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_135_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_135_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_135_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_135_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Rain was pouring on the swim school sign. Inside the school there were swimwears lined up. The boy was practicing swimming. 
The instructor was teaching the strokes. They started to swim.\nB: Moon was shining on the gym sign. Inside the gym there were yoga mats lined up. The woman was practicing yoga. The instructor was demonstrating the poses. They started to meditate.\nC: The sun was setting on the football field. Inside the field there were football jerseys lined up. The boy was practicing football. The coach was explaining the game plan. They started to play.\nD: Sun was shining on the ballet school sign. Inside the school there was dresses lined up. The girl was practising ballet. The trainer was showing them the moves. They started to dance.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Rain was pouring on the swim school sign. Inside the school there were swimwears lined up. The boy was practicing swimming. The instructor was teaching the strokes. They started to swim.\nB: Moon was shining on the gym sign. Inside the gym there were yoga mats lined up. The woman was practicing yoga. The instructor was demonstrating the poses. They started to meditate.\nC: The sun was setting on the football field. Inside the field there were football jerseys lined up. The boy was practicing football. The coach was explaining the game plan. They started to play.\nD: Sun was shining on the ballet school sign. Inside the school there was dresses lined up. The girl was practising ballet. The trainer was showing them the moves. 
They started to dance.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_136_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_136_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_136_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_136_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_136_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The family were cleaning. They set up cleaning supplies.\nB: The family were playing games. They set up a board game.\nC: The family were gardening. They planted new flowers.\nD: The family were preparing food. They set up the grill. Food was ready to be eaten. The kids were happy that food was ready. Everyone started eating.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The family were cleaning. They set up cleaning supplies.\nB: The family were playing games. They set up a board game.\nC: The family were gardening. They planted new flowers.\nD: The family were preparing food. They set up the grill. Food was ready to be eaten. The kids were happy that food was ready. 
Everyone started eating.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_137_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_137_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_137_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_137_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_137_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The climbers are struggling with extreme cold and harsh weather conditions, making it difficult for them to continue their journey.\nB: The climbers are enjoying a leisurely hike on a sunny day with no obstacles.\nC: The climbers seems to be almost at the zenith of the daring mountain. They struggle to overcome changes in elevation thru the ice filled rocky terrain. Both climbers seem exhausted from the adventure and the weight they are carrying. However, they make it to the top in an outstanding show of dedication to enjoy the view. The rest of the journey awaits as they begin to trek down the slippery slopes.\nD: The climbers are at the base of the mountain and are just starting their ascent.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The climbers are struggling with extreme cold and harsh weather conditions, making it difficult for them to continue their journey.\nB: The climbers are enjoying a leisurely hike on a sunny day with no obstacles.\nC: The climbers seems to be almost at the zenith of the daring mountain. They struggle to overcome changes in elevation thru the ice filled rocky terrain. Both climbers seem exhausted from the adventure and the weight they are carrying. However, they make it to the top in an outstanding show of dedication to enjoy the view. 
The rest of the journey awaits as they begin to trek down the slippery slopes.\nD: The climbers are at the base of the mountain and are just starting their ascent.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_138_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_138_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_138_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_138_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_138_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: The nature walk was not enjoyable.\nB: on our nature walk we explored nature on a wooden bridge .the beautiful greenery covered a bright sunshine .fences had ivy growing up them , and were noticed surrounding the park .we had treasures to take home from our nature walk .i picked some plants to bring home and place in to my garden .\nC: There were no fences and ivy in the park.\nD: The wooden bridge was covered in snow and ice.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The nature walk was not enjoyable.\nB: on our nature walk we explored nature on a wooden bridge .the beautiful greenery covered a bright sunshine .fences had ivy growing up them , and were noticed surrounding the park .we had treasures to take home from our nature walk .i picked some plants to bring home and place in to my garden .\nC: There were no fences and ivy in the park.\nD: The wooden bridge was covered in snow and ice.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_139_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_139_1.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_139_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_139_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_139_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The view is set in front of a modern office with a red door garage. A woman is talking on her phone. She looks surprised. Enters another female figure. She's looking at the woman talking on her phone. The second female figure (working) goes to the coffee machine and makes a cup. The second female figure returns to the talking first with the cup.\nB: The view is set in front of an old home with a white door garage. A woman is reading a book. She looks relaxed. Entering another female figure. She's looking around the room. The second female figure (working) goes to the fridge and takes out a sandwich. The second female figure returns to the relaxed first with the sandwich.\nC: The view is set in front of a modern home with a brown door garage. A man is typing on his laptop. He looks focused. Enters another male figure. He's looking over at the man who is focusing on his laptop. The second male figure (nonworking) goes to the fridge and cracks it open. The second male figure returns to the working first empty handed.\nD: The view is set in front of a traditional house with a wooden door garage. A woman is painting on a canvas. She looks artistic. Enters another male figure. He's looking at the woman painting. The second male figure (working) goes to the shelf and picks up a book. The second male figure returns to the painting first with the book.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The view is set in front of a modern office with a red door garage. A woman is talking on her phone. 
She looks surprised. Enters another female figure. She's looking at the woman talking on her phone. The second female figure (working) goes to the coffee machine and makes a cup. The second female figure returns to the talking first with the cup.\nB: The view is set in front of an old home with a white door garage. A woman is reading a book. She looks relaxed. Entering another female figure. She's looking around the room. The second female figure (working) goes to the fridge and takes out a sandwich. The second female figure returns to the relaxed first with the sandwich.\nC: The view is set in front of a modern home with a brown door garage. A man is typing on his laptop. He looks focused. Enters another male figure. He's looking over at the man who is focusing on his laptop. The second male figure (nonworking) goes to the fridge and cracks it open. The second male figure returns to the working first empty handed.\nD: The view is set in front of a traditional house with a wooden door garage. A woman is painting on a canvas. She looks artistic. Enters another male figure. He's looking at the woman painting. The second male figure (working) goes to the shelf and picks up a book. 
The second male figure returns to the painting first with the book.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_140_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_140_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_140_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_140_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_140_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The little girl is unhappy with her outfit choice.\nB: I, the little girl, want to show Mom my outfit for the day so i will start with my black dress. Mom looks impressed sitting on little girls bed. Little girl likes the reaction from mom and feels good about the choice. Little girl now decides to dress up in something different and ganders in the mirror. Little girl turns to strike a pose and blow a kiss.\nC: The little girl wants to show her friend her new dress.\nD: The little girl wants to show Dad her new dress.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The little girl is unhappy with her outfit choice.\nB: I, the little girl, want to show Mom my outfit for the day so i will start with my black dress. Mom looks impressed sitting on little girls bed. Little girl likes the reaction from mom and feels good about the choice. Little girl now decides to dress up in something different and ganders in the mirror. 
Little girl turns to strike a pose and blow a kiss.\nC: The little girl wants to show her friend her new dress.\nD: The little girl wants to show Dad her new dress.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_141_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_141_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_141_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_141_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_141_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The boy jumped into the clear blue ocean. The boy who had jumped into the ocean lookrd out for his friend. His friend finally joined him on the shoreline where they both donned swim trunks. The boys realized the water was warm as theybstared at eachother in surprise. The boys took each others hands and jumped into the water as they played all day.\nB: The girl ran into the stormy ocean. The girl who had run into the ocean watched out for her friend. Her friend finally joined her on the shoreline where they both wore raincoats. The girls realized the water was cold as they looked at each other in surprise. The girls took each others hands and jumped into the water as they played all day.\nC: The boy walked away from the dull ocean. The boy who had walked away from the ocean looked out for his friend. His friend finally joined him on the shoreline where they both wore heavy jackets. The boys realized the water was cold as they stared at each other in surprise. The boys took each others hands and jumped into the water as they played all day.\nD: The boy sat by the calm blue ocean. The boy who had sat by the ocean looked out for his friend. 
His friend finally joined him on the shoreline where they both wore sandals. The boys realized the water was cold as they looked at each other in disappointment. The boys took each others hands and jumped into the water as they played all day.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The boy jumped into the clear blue ocean. The boy who had jumped into the ocean lookrd out for his friend. His friend finally joined him on the shoreline where they both donned swim trunks. The boys realized the water was warm as theybstared at eachother in surprise. The boys took each others hands and jumped into the water as they played all day.\nB: The girl ran into the stormy ocean. The girl who had run into the ocean watched out for her friend. Her friend finally joined her on the shoreline where they both wore raincoats. The girls realized the water was cold as they looked at each other in surprise. The girls took each others hands and jumped into the water as they played all day.\nC: The boy walked away from the dull ocean. The boy who had walked away from the ocean looked out for his friend. His friend finally joined him on the shoreline where they both wore heavy jackets. The boys realized the water was cold as they stared at each other in surprise. The boys took each others hands and jumped into the water as they played all day.\nD: The boy sat by the calm blue ocean. The boy who had sat by the ocean looked out for his friend. His friend finally joined him on the shoreline where they both wore sandals. The boys realized the water was cold as they looked at each other in disappointment. 
The boys took each others hands and jumped into the water as they played all day.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_142_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_142_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_142_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_142_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_142_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: we ended up in a crowded garden with modern statues and no sight of the beach or pier.\nB: they took us to a dirty garden with broken statues and no beach or pier in sight.\nC: they dropped us in an immaculate garden . filled with bronzed statues ...like the protector of the camp , a strong man with a spear .we then walked to the beach and saw the gorgeous coastline..and , the first concrete pier i had ever seen .that is where i took my final photo of the trip .\nD: we were left in a messy garden with old statues and no beach or pier nearby.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: we ended up in a crowded garden with modern statues and no sight of the beach or pier.\nB: they took us to a dirty garden with broken statues and no beach or pier in sight.\nC: they dropped us in an immaculate garden . 
filled with bronzed statues ...like the protector of the camp , a strong man with a spear .we then walked to the beach and saw the gorgeous coastline..and , the first concrete pier i had ever seen .that is where i took my final photo of the trip .\nD: we were left in a messy garden with old statues and no beach or pier nearby.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_143_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_143_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_143_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_143_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_143_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: they went to the beach .they relaxed on the shore .this was a familiar structure .the waves crashed peacefully .finally, they got a worse view of the waters .\nB: they explored the city .they admired the cityscape .this was a well-known structure .the waves were calm and serene .at last, they got a great view of the city .\nC: they went on a forest hike .they took a break to rest .this was a famous structure .the leaves rustled softly .in the end, they got an amazing view of the forest .\nD: we took a trip to the mountains .we took a break to enjoy the view .this was a structure we were unfamiliar with .the waves crashed violently .lastly , we got a better view of the waters .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: they went to the beach .they relaxed on the shore .this was a familiar structure .the waves crashed peacefully .finally, they got a worse view of the waters .\nB: they explored the city .they admired the cityscape .this was a well-known structure .the waves 
were calm and serene .at last, they got a great view of the city .\nC: they went on a forest hike .they took a break to rest .this was a famous structure .the leaves rustled softly .in the end, they got an amazing view of the forest .\nD: we took a trip to the mountains .we took a break to enjoy the view .this was a structure we were unfamiliar with .the waves crashed violently .lastly , we got a better view of the waters .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_144_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_144_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_144_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_144_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_144_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the girls got silly after a few beers .\nB: the deer head was looking at us from the wall of the bar .\nC: the deer head stared at us from the wall of the bar .after a few beers the girls got a little sillyjust a bunch of friends out for a good time .the all enjoyed the drinks and the company .this man was planning for after he and his buxom lady got home .\nD: a group of friends out for a good time .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the girls got silly after a few beers .\nB: the deer head was looking at us from the wall of the bar .\nC: the deer head stared at us from the wall of the bar .after a few beers the girls got a little sillyjust a bunch of friends out for a good time .the all enjoyed the drinks and the company .this man was planning for after he and his buxom lady got home .\nD: a group of friends out for a good time .", + "input_image_path": [ + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_145_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_145_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_145_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_145_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_145_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Group of females standing with green hills jerseys standing in front of a city\nB: Females standing happily in front of a mountain range\nC: Female standing normally with a tent behind her Female unhappy with an orange hills jersey on and a field behind her Group of females standing with orange hills jerseys standing in front of a field Same female from the first picture standing happy with a tent behind her Same female from second picture with orange jersey with field behind her\nD: Male standing with a tent behind him", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Group of females standing with green hills jerseys standing in front of a city\nB: Females standing happily in front of a mountain range\nC: Female standing normally with a tent behind her Female unhappy with an orange hills jersey on and a field behind her Group of females standing with orange hills jerseys standing in front of a field Same female from the first picture standing happy with a tent behind her Same female from second picture with orange jersey with field behind her\nD: Male standing with a tent behind him", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_146_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_146_1.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_146_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_146_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_146_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Take the moment to celebrate this special date.\nB: At this event we are pleased to introduce Nadi! This artist who has been very successful in recent years. At the end of the event, anyone who wants to take a photo will be available. Take the moment to celebrate this special date. And our dishes are delicious.\nC: This artist who has been very successful in recent years.\nD: At the end of the event, anyone who wants to take a photo will be available.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Take the moment to celebrate this special date.\nB: At this event we are pleased to introduce Nadi! This artist who has been very successful in recent years. At the end of the event, anyone who wants to take a photo will be available. Take the moment to celebrate this special date. 
And our dishes are delicious.\nC: This artist who has been very successful in recent years.\nD: At the end of the event, anyone who wants to take a photo will be available.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_147_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_147_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_147_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_147_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_147_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Asked a question by a foreigner. both of them watching the ice and raw fish. Another question asked by foreigner. Cutting the fish. Frying the fish on fire.\nB: Both of them watching a movie. Asked a question by a friend. Cutting the vegetables. Frying the vegetables on fire.\nC: Both of them watching TV. Asked a question by a local. Cutting the bread. Frying the bread on fire.\nD: Both of them watching the game. Asked a question by a tourist. Cutting the meat. Frying the meat on fire.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Asked a question by a foreigner. both of them watching the ice and raw fish. Another question asked by foreigner. Cutting the fish. Frying the fish on fire.\nB: Both of them watching a movie. Asked a question by a friend. Cutting the vegetables. Frying the vegetables on fire.\nC: Both of them watching TV. Asked a question by a local. Cutting the bread. Frying the bread on fire.\nD: Both of them watching the game. Asked a question by a tourist. Cutting the meat. 
Frying the meat on fire.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_148_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_148_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_148_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_148_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_148_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: i hate animals and they knew i would hate the dog frisbee competition as part of the entertainment for the day\nB: i thought they might have forgotten but they really surprised me when we got home with a birthday cake just for me but of course i will eat it all\nC: i thought they might have forgotten and they really did forget when we got home with a birthday cake just for me and i didn't have to share it\nD: they told me it was my day , they were taking me out for a good time and that i deserved it .there is nothing like good friends/family , carnival style food and good entertainment to really lift a girl 's spirits .i love animals and they knew i would just love the dog frisbee competition as part of the entertainment for the day .i thought they might have forgotten but they really surprised me when we got home with a birthday cake just for me but of course i will share it .this really was my day and i am so lucky to have such special family and friends to spend time with that would surprise me with this cake which is just so pretty to me that i had to take a good picture of it for my scrap book so i never forget it .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: i hate animals and they knew i would hate the dog frisbee competition as part of the entertainment for the 
day\nB: i thought they might have forgotten but they really surprised me when we got home with a birthday cake just for me but of course i will eat it all\nC: i thought they might have forgotten and they really did forget when we got home with a birthday cake just for me and i didn't have to share it\nD: they told me it was my day , they were taking me out for a good time and that i deserved it .there is nothing like good friends/family , carnival style food and good entertainment to really lift a girl 's spirits .i love animals and they knew i would just love the dog frisbee competition as part of the entertainment for the day .i thought they might have forgotten but they really surprised me when we got home with a birthday cake just for me but of course i will share it .this really was my day and i am so lucky to have such special family and friends to spend time with that would surprise me with this cake which is just so pretty to me that i had to take a good picture of it for my scrap book so i never forget it .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_149_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_149_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_149_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_149_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_149_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: it 's a fun night out and this guy just wants to have fun .he meets with his friend who looks very disinterested in hanging out .he meets someone else who shows more interest in hanging with him .he pops a cigar in his mouth and smiles .time for a group picture while continuing to smoke his cigar .\nB: it 's a casual night out and this guy just 
wants to relax .he meets with his friend who looks interested in hanging out .he meets someone else who shows more interest in hanging with him .he pops a cigar in his mouth and frowns .time for a group picture while putting away his cigar .\nC: it 's a boring night out and this guy just wants to be alone .he meets with his friend who looks very disinterested in hanging out .he meets someone else who shows no interest in hanging with him .he pops a cigar in his mouth and frowns .time for a group picture while continuing to smoke his cigar .\nD: it 's a exciting night out and this guy just wants to have fun .he meets with his friend who looks very interested in hanging out .he meets someone else who shows more interest in hanging with him .he pops a cigarette in his mouth and smiles .time for a group picture while continuing to smoke his cigar .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: it 's a fun night out and this guy just wants to have fun .he meets with his friend who looks very disinterested in hanging out .he meets someone else who shows more interest in hanging with him .he pops a cigar in his mouth and smiles .time for a group picture while continuing to smoke his cigar .\nB: it 's a casual night out and this guy just wants to relax .he meets with his friend who looks interested in hanging out .he meets someone else who shows more interest in hanging with him .he pops a cigar in his mouth and frowns .time for a group picture while putting away his cigar .\nC: it 's a boring night out and this guy just wants to be alone .he meets with his friend who looks very disinterested in hanging out .he meets someone else who shows no interest in hanging with him .he pops a cigar in his mouth and frowns .time for a group picture while continuing to smoke his cigar .\nD: it 's a exciting night out and this guy just wants to have fun .he meets with his friend who looks very interested in hanging out .he 
meets someone else who shows more interest in hanging with him .he pops a cigarette in his mouth and smiles .time for a group picture while continuing to smoke his cigar .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_150_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_150_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_150_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_150_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_150_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: we stayed at a desert with sand dunes\nB: we visited a big city with skyscrapers\nC: on our vacation we traveled through several small towns , each one of them unique .outside of the towns we often saw rolling meadows lined with flowers .the meadows , in turn , were lined with rocky mountain cliffs , such as this one .laying between the mountains and the meadows were some beautiful rivers .finally after passing over one last mountain , we arrived at the beach . our vacation destination was achieved .\nD: we explored a forest with dense foliage", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: we stayed at a desert with sand dunes\nB: we visited a big city with skyscrapers\nC: on our vacation we traveled through several small towns , each one of them unique .outside of the towns we often saw rolling meadows lined with flowers .the meadows , in turn , were lined with rocky mountain cliffs , such as this one .laying between the mountains and the meadows were some beautiful rivers .finally after passing over one last mountain , we arrived at the beach . 
our vacation destination was achieved .\nD: we explored a forest with dense foliage", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_151_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_151_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_151_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_151_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_151_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: occasionally , the whole family gets together for a camping trip .some of the family have small campers .others have huge rvs .all that really matters is that they get to spend time together .they even do activities like ride bikes through the woods .\nB: every year , the whole family gets together for a camping trip .some of the family have small campers .others have huge rvs .all that really matters is that they get to spend time together .they even do activities like ride bikes through the woods .\nC: each year , the entire family gets together for a beach trip .some of the family have small tents .others have big houses .all that really matters is that they get to spend time together .they even do activities like play volleyball on the sand .\nD: every year , the whole family gets together for a hiking trip .some of the family have small tents .others have large cabins .all that really matters is that they get to spend time together .they even do activities like go for a swim in the river .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: occasionally , the whole family gets together for a camping trip .some of the family have small campers .others have huge rvs .all that really matters is that they get to spend 
time together .they even do activities like ride bikes through the woods .\nB: every year , the whole family gets together for a camping trip .some of the family have small campers .others have huge rvs .all that really matters is that they get to spend time together .they even do activities like ride bikes through the woods .\nC: each year , the entire family gets together for a beach trip .some of the family have small tents .others have big houses .all that really matters is that they get to spend time together .they even do activities like play volleyball on the sand .\nD: every year , the whole family gets together for a hiking trip .some of the family have small tents .others have large cabins .all that really matters is that they get to spend time together .they even do activities like go for a swim in the river .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_152_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_152_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_152_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_152_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_152_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: i went for a bike ride in the suburbs .then i discovered a bustling city .all the paths in the city were quite busy .later i spotted some historic mail boxes .finally i got to a crowded playground .\nB: i went for a run in the mountains .then i found a deserted town .all the pathways in the town were extremely dusty .next i saw some modern mail boxes .finally i arrived at a big park .\nC: i went for a swim in the city .then i stumbled upon a big city .all the streets in the city were very clean .later i found some new mail boxes .finally i 
reached a large stadium .\nD: i took a walk on in the country .then i came across a small neighborhood .all the roads in the neighborhood were really muddy .after that i came across some really old mail boxes .at the end of my walk i came to a small playground .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: i went for a bike ride in the suburbs .then i discovered a bustling city .all the paths in the city were quite busy .later i spotted some historic mail boxes .finally i got to a crowded playground .\nB: i went for a run in the mountains .then i found a deserted town .all the pathways in the town were extremely dusty .next i saw some modern mail boxes .finally i arrived at a big park .\nC: i went for a swim in the city .then i stumbled upon a big city .all the streets in the city were very clean .later i found some new mail boxes .finally i reached a large stadium .\nD: i took a walk on in the country .then i came across a small neighborhood .all the roads in the neighborhood were really muddy .after that i came across some really old mail boxes .at the end of my walk i came to a small playground .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_153_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_153_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_153_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_153_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_153_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: boo , what a great place he told them .\nB: wow , what a terrible place he told them .\nC: eh , what a mediocre place he told them .\nD: wow , what a great place he told them .casino ' .live 
entertainment .excellent libraries .and exotic foods .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: boo , what a great place he told them .\nB: wow , what a terrible place he told them .\nC: eh , what a mediocre place he told them .\nD: wow , what a great place he told them .casino ' .live entertainment .excellent libraries .and exotic foods .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_154_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_154_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_154_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_154_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_154_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A black car is parked outside of the cabin and it's sunny.\nB: A white van is waiting outside of the cabin and its raining.Behind the van lots of trees can be seen A woman sitting on the floor of the cabin, making bun of her hair and light is on. The woman is listening very keenly what is going outside of the cabin. She is trying to open the door, as if she is being locked in this cabin. She is bending her head and thinking how to escape from this cabin.\nC: A man is standing outside of the cabin and the lights are off.\nD: A woman is dancing inside the cabin and there is no door.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A black car is parked outside of the cabin and it's sunny.\nB: A white van is waiting outside of the cabin and its raining.Behind the van lots of trees can be seen A woman sitting on the floor of the cabin, making bun of her hair and light is on. 
The woman is listening very keenly what is going outside of the cabin. She is trying to open the door, as if she is being locked in this cabin. She is bending her head and thinking how to escape from this cabin.\nC: A man is standing outside of the cabin and the lights are off.\nD: A woman is dancing inside the cabin and there is no door.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_155_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_155_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_155_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_155_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_155_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: This is a grocery shop there are variety of fruits here This is all fresh fruits The red apple is so nice They bought green apple\nB: This a shirt shop there are variet yof shirts here This is all second-hand shirt The red color shirt is so nice They bought blue color shirt\nC: This a shoe shop there are verity of shoe here This is all brand new shoe The red color shoe is so nice They bought black color shoe\nD: This is a furniture shop with a variety of tables and chairs This is all used furniture The red color chair is so nice They bought green color table", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: This is a grocery shop there are variety of fruits here This is all fresh fruits The red apple is so nice They bought green apple\nB: This a shirt shop there are variet yof shirts here This is all second-hand shirt The red color shirt is so nice They bought blue color shirt\nC: This a shoe shop there are verity of shoe here This is all brand new shoe The red color 
shoe is so nice They bought black color shoe\nD: This is a furniture shop with a variety of tables and chairs This is all used furniture The red color chair is so nice They bought green color table", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_156_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_156_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_156_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_156_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_156_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: a pink , purple and orange cake is the desert of choice at the gathering .guests sat at green tablecloth tables and ate their dinner .several people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .\nB: a blue , white and yellow cake is the desert of choice at the gathering .guests sat at red tablecloth tables and ate their dinner .few people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .\nC: a green , black and orange cake is the desert of choice at the gathering .guests sat at blue tablecloth tables and ate their dinner .some people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .\nD: a brown , white and purple cake is the desert of choice at the gathering .guests sat at white tablecloth tables and ate their dinner .many people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .", + "question": "Describe this set of images briefly.", + "context": 
"Select from the following choices.\nA: a pink , purple and orange cake is the desert of choice at the gathering .guests sat at green tablecloth tables and ate their dinner .several people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .\nB: a blue , white and yellow cake is the desert of choice at the gathering .guests sat at red tablecloth tables and ate their dinner .few people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .\nC: a green , black and orange cake is the desert of choice at the gathering .guests sat at blue tablecloth tables and ate their dinner .some people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .\nD: a brown , white and purple cake is the desert of choice at the gathering .guests sat at white tablecloth tables and ate their dinner .many people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_157_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_157_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_157_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_157_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_157_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A man exploring the city with his family and capturing memorable moments\nB: A photographer capturing the natural beauty of the city with stunning landscapes\nC: One woman wanted to film the city's beauty abroad. 
She hopes everyone likes her page. She started with people videos. After that, watch the movie on the buildings, paying particular attention to the architecture. Finally, she posts statue videos to her profile and becomes popular.\nD: A group of tourists enjoying the local cuisine and cultural activities", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A man exploring the city with his family and capturing memorable moments\nB: A photographer capturing the natural beauty of the city with stunning landscapes\nC: One woman wanted to film the city's beauty abroad. She hopes everyone likes her page. She started with people videos. After that, watch the movie on the buildings, paying particular attention to the architecture. Finally, she posts statue videos to her profile and becomes popular.\nD: A group of tourists enjoying the local cuisine and cultural activities", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_158_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_158_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_158_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_158_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_158_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the town square was a deserted place to shop .townspeople walked through the square to get to their boats in the morning .fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a living .this coastal city provided a way of life for the residents that kept everyone fed and productive .\nB: the town square was a vibrant place to shop .townspeople walked through the square to get to their boats in the morning 
.fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a living .this coastal city provided a way of life for the residents that kept everyone fed and productive .\nC: the town square was a crowded place to shop .townspeople walked through the square to get to their boats in the morning .fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a living .this coastal city provided a way of life for the residents that kept everyone fed and productive .\nD: the town square was a quiet place to shop .townspeople walked through the square to get to their boats in the morning .fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a living .this coastal city provided a way of life for the residents that kept everyone fed and productive .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the town square was a deserted place to shop .townspeople walked through the square to get to their boats in the morning .fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a living .this coastal city provided a way of life for the residents that kept everyone fed and productive .\nB: the town square was a vibrant place to shop .townspeople walked through the square to get to their boats in the morning .fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a living .this coastal city provided a way of life for the residents that kept everyone fed and productive .\nC: the town square was a crowded place to shop .townspeople walked through the square to get to their boats in the morning .fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a living .this coastal city provided a way of life for the residents that kept everyone fed and productive .\nD: the town square was a quiet place to shop 
.townspeople walked through the square to get to their boats in the morning .fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a living .this coastal city provided a way of life for the residents that kept everyone fed and productive .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_159_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_159_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_159_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_159_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_159_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Sitting in a park on a sunny day enjoying a picnic\nB: Walking through a forest and looking at the tall trees\nC: Sitting on the river side on a beautiful day fishing my life away. Got one hooked now to real it in nice and slow to not rip the fishes lip. Caught just a little baby fish but its okay because we catch and release. Releasing the fish because I don't see the point in keeping it when I am still fishing. Throw your line back in and just keep on catching them.\nD: Standing on the beach on a cloudy day watching the waves", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Sitting in a park on a sunny day enjoying a picnic\nB: Walking through a forest and looking at the tall trees\nC: Sitting on the river side on a beautiful day fishing my life away. Got one hooked now to real it in nice and slow to not rip the fishes lip. Caught just a little baby fish but its okay because we catch and release. Releasing the fish because I don't see the point in keeping it when I am still fishing. 
Throw your line back in and just keep on catching them.\nD: Standing on the beach on a cloudy day watching the waves", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_160_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_160_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_160_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_160_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_160_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Kat visited her best friend alone\nB: They all went to a restaurant\nC: Kat took her little sister Julie along to visit her best friend. They arrived at Michelle's house and Julie knocked on the door. Michelle opened the door and greeted them. Kat explained how Julie wanted to tag along. They all went into the living room to plan on what to do.\nD: They went to Michelle's house but no one was home", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Kat visited her best friend alone\nB: They all went to a restaurant\nC: Kat took her little sister Julie along to visit her best friend. They arrived at Michelle's house and Julie knocked on the door. Michelle opened the door and greeted them. Kat explained how Julie wanted to tag along. 
They all went into the living room to plan on what to do.\nD: They went to Michelle's house but no one was home", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_161_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_161_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_161_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_161_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_161_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Friend supporting the girl boss\nB: Girl showing respect to her elders\nC: Girl boss wants to show off that she is better even better than this old Mom person in here This other Friend of hers says something to her It seems to be something mean because the Mom is here too Then another girls shows up as being the best one\nD: Mom scolding the girl boss", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Friend supporting the girl boss\nB: Girl showing respect to her elders\nC: Girl boss wants to show off that she is better even better than this old Mom person in here This other Friend of hers says something to her It seems to be something mean because the Mom is here too Then another girls shows up as being the best one\nD: Mom scolding the girl boss", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_162_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_162_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_162_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_162_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_162_4.jpg" + ], 
+ "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The captain gives flight time as flight 997 leaves Hartfield-Jackson Airport on its way to Seattle One of the window seat passengers is amazed by birds' eye view' About mid-flight Angel, one of the attendants get ready to prepare the meals for the passengers. Angel checks with the passengers to see who is ready for a meal and explains what she has. Angel and Tam move aside in the kitchen so Charles, another attendant could get through.\nB: The passengers are preparing for a sports event.\nC: The passengers are getting ready for a music performance.\nD: The passengers are enjoying a movie on the flight.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The captain gives flight time as flight 997 leaves Hartfield-Jackson Airport on its way to Seattle One of the window seat passengers is amazed by birds' eye view' About mid-flight Angel, one of the attendants get ready to prepare the meals for the passengers. Angel checks with the passengers to see who is ready for a meal and explains what she has. 
Angel and Tam move aside in the kitchen so Charles, another attendant could get through.\nB: The passengers are preparing for a sports event.\nC: The passengers are getting ready for a music performance.\nD: The passengers are enjoying a movie on the flight.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_163_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_163_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_163_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_163_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_163_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Someone they dont know is walking up to their house. The stranger asked to take a selfie. The men took a selfie together. The man left abruptly so everyone was confused. The 2 men discussed the man leaving abruptly.\nB: A group of friends celebrating a birthday at a restaurant.\nC: A family having a picnic in the park.\nD: A couple shopping for groceries at the supermarket.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Someone they dont know is walking up to their house. The stranger asked to take a selfie. The men took a selfie together. The man left abruptly so everyone was confused. 
The 2 men discussed the man leaving abruptly.\nB: A group of friends celebrating a birthday at a restaurant.\nC: A family having a picnic in the park.\nD: A couple shopping for groceries at the supermarket.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_164_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_164_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_164_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_164_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_164_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: excited to be at the birthday party .waiting for the cake to be cut .sitting and watching the balloons on the ground .running in the hallway .ready to blow out the candles and have some cake .\nB: so happy to be at the birthday party .waiting for the cake to be cut .enjoying running through the balloons on the ground .playing hide and seek in the hallway .ready to blow out the candles and have some cake .\nC: feeling bored at the party .waiting for the cake to be cut .tripping on the balloons on the ground .hiding in the hallway .ready to blow out the candles and have some cake .\nD: thrilled to be at the birthday party .waiting for the cake to be cut .popping the balloons on the ground .standing around in the hallway .ready to blow out the candles and have some cake .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: excited to be at the birthday party .waiting for the cake to be cut .sitting and watching the balloons on the ground .running in the hallway .ready to blow out the candles and have some cake .\nB: so happy to be at the birthday party .waiting for the cake to be cut .enjoying running 
through the balloons on the ground .playing hide and seek in the hallway .ready to blow out the candles and have some cake .\nC: feeling bored at the party .waiting for the cake to be cut .tripping on the balloons on the ground .hiding in the hallway .ready to blow out the candles and have some cake .\nD: thrilled to be at the birthday party .waiting for the cake to be cut .popping the balloons on the ground .standing around in the hallway .ready to blow out the candles and have some cake .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_165_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_165_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_165_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_165_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_165_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A woman and a man are at a car dealership test driving different cars. They try out a few models before deciding on a red convertible. They then negotiate the price with the salesperson before completing the purchase.\nB: A group of friends are at an amusement park trying out different rides. They first go on the roller coaster and then the Ferris wheel. They finally end their day with some snacks and decide to go home.\nC: A mother and son are at the toys store looking for toys that the kid may enjoy. He is looking through the aisles looking at whatever he might be interested in. He takes notice of the car type toys and he's very interested in driving it. The mother wants to have a slight bit of fun so she hops in the car with him to have fun. 
They both decide on which toys to get and the proceed to the checkout to buy them.\nD: A father and daughter are at the bakery looking for a cake to buy. The daughter looks through the cakes and chooses a chocolate cake. The father wants to surprise her, so he decides to buy a strawberry cake instead.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A woman and a man are at a car dealership test driving different cars. They try out a few models before deciding on a red convertible. They then negotiate the price with the salesperson before completing the purchase.\nB: A group of friends are at an amusement park trying out different rides. They first go on the roller coaster and then the Ferris wheel. They finally end their day with some snacks and decide to go home.\nC: A mother and son are at the toys store looking for toys that the kid may enjoy. He is looking through the aisles looking at whatever he might be interested in. He takes notice of the car type toys and he's very interested in driving it. The mother wants to have a slight bit of fun so she hops in the car with him to have fun. They both decide on which toys to get and the proceed to the checkout to buy them.\nD: A father and daughter are at the bakery looking for a cake to buy. The daughter looks through the cakes and chooses a chocolate cake. 
The father wants to surprise her, so he decides to buy a strawberry cake instead.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_166_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_166_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_166_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_166_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_166_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: it was a quiet day with only a few people at the stadium\nB: the stadium was old and in poor condition\nC: it was opening day and the crowd was lining up to cheer their team onthe new stadium was a great undertaking but it came out looking really goodit was a sold out ball game for the opening day and the weather was beautifulthe crowd roared as the first pitch was thrownafter the ball game the crowd quickly dispersed to their local watering holes\nD: the game was cancelled due to bad weather", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: it was a quiet day with only a few people at the stadium\nB: the stadium was old and in poor condition\nC: it was opening day and the crowd was lining up to cheer their team onthe new stadium was a great undertaking but it came out looking really goodit was a sold out ball game for the opening day and the weather was beautifulthe crowd roared as the first pitch was thrownafter the ball game the crowd quickly dispersed to their local watering holes\nD: the game was cancelled due to bad weather", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_167_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_167_1.jpg", 
+ "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_167_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_167_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_167_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A man is talking to a man. The men are not responding. They are having a conversation. They are drinking their coffee. The guy is sitting on the chair.\nB: A man is asking the woman something. The women are not responding. They are having a gathering. They are eating their food. The guy is around the table.\nC: A woman is talking to a man. The man is not responding. They are having a meeting. They are discussing a project. The lady is sitting at the desk.\nD: A woman is talking to a woman. The women are not responding. They are having a chat. They are reading their books. The lady is standing by the window.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A man is talking to a man. The men are not responding. They are having a conversation. They are drinking their coffee. The guy is sitting on the chair.\nB: A man is asking the woman something. The women are not responding. They are having a gathering. They are eating their food. The guy is around the table.\nC: A woman is talking to a man. The man is not responding. They are having a meeting. They are discussing a project. The lady is sitting at the desk.\nD: A woman is talking to a woman. The women are not responding. They are having a chat. They are reading their books. 
The lady is standing by the window.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_168_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_168_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_168_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_168_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_168_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: he had put up the decorations for the celebration .he was looking forward to a day off .he thought he 'd go for a hike .maybe do a little fishing .if the fish were n't biting he could always go hunting .\nB: he was getting ready for a party .he wanted to spend the day with friends .he was excited to socialize .\nC: he was preparing for a quiet day at home .he was not interested in going out .he was planning to stay indoors .\nD: he was disappointed with the decorations .he was not satisfied with the arrangements .he was frustrated with the celebration .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: he had put up the decorations for the celebration .he was looking forward to a day off .he thought he 'd go for a hike .maybe do a little fishing .if the fish were n't biting he could always go hunting .\nB: he was getting ready for a party .he wanted to spend the day with friends .he was excited to socialize .\nC: he was preparing for a quiet day at home .he was not interested in going out .he was planning to stay indoors .\nD: he was disappointed with the decorations .he was not satisfied with the arrangements .he was frustrated with the celebration .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_169_0.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_169_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_169_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_169_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_169_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: if you work in television and do a good job , you 've probably been invited to our awards night .there are some real legends that show up every year .i loved meeting the cast of `` wait , wait , do n't tell me ! '' what a thrill !there 's a lot of meeting and greeting time too -- -it 's a good networking opportunity .the coveted golden television award is always the highlight of the night .\nB: if you work in radio and do a good job , you 've probably been invited to our awards night .there are some fake legends that show up every year .i hated meeting the cast of `` wait , wait , do n't tell me ! '' what a thrill !there 's not a lot of meeting and greeting time too -- -it 's not a good networking opportunity .the dreaded golden radio award is always the highlight of the night .\nC: if you work in radio and do a bad job , you 've probably been invited to our awards night .there are some real legends that show up every year .i loved meeting the cast of `` wait , wait , do n't tell me ! '' what a thrill !there 's a lot of meeting and greeting time too -- -it 's a good networking opportunity .the coveted golden radio award is always the highlight of the night .\nD: if you work in radio and do a good job , you 've probably been invited to our awards night .there are some real legends that show up every year .i loved meeting the cast of `` wait , wait , do n't tell me ! 
'' what a thrill !there 's a lot of meeting and greeting time too -- -it 's a good networking opportunity .the coveted golden radio award is always the highlight of the night .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: if you work in television and do a good job , you 've probably been invited to our awards night .there are some real legends that show up every year .i loved meeting the cast of `` wait , wait , do n't tell me ! '' what a thrill !there 's a lot of meeting and greeting time too -- -it 's a good networking opportunity .the coveted golden television award is always the highlight of the night .\nB: if you work in radio and do a good job , you 've probably been invited to our awards night .there are some fake legends that show up every year .i hated meeting the cast of `` wait , wait , do n't tell me ! '' what a thrill !there 's not a lot of meeting and greeting time too -- -it 's not a good networking opportunity .the dreaded golden radio award is always the highlight of the night .\nC: if you work in radio and do a bad job , you 've probably been invited to our awards night .there are some real legends that show up every year .i loved meeting the cast of `` wait , wait , do n't tell me ! '' what a thrill !there 's a lot of meeting and greeting time too -- -it 's a good networking opportunity .the coveted golden radio award is always the highlight of the night .\nD: if you work in radio and do a good job , you 've probably been invited to our awards night .there are some real legends that show up every year .i loved meeting the cast of `` wait , wait , do n't tell me ! 
'' what a thrill !there 's a lot of meeting and greeting time too -- -it 's a good networking opportunity .the coveted golden radio award is always the highlight of the night .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_170_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_170_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_170_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_170_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_170_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: two men talk next to a bus stop three kids play on a dock a man records with a little brown-haired girl on the sand people cross the pathway the family walks on the pavement\nB: two women discuss next to a parking meter three adults walk on a pontoon a woman makes a video with a little blonde boy on the sand people cross the street the family walks on the sidewalk\nC: two men discuss next to a parking meter three children walk on a pontoon a man makes a video with a little blonde girl on the sand people cross the road the family walks on the side of the road\nD: two men argue next to a parking meter three teenagers walk on a bridge a woman takes a photo with a little brunette girl on the sand people cross the intersection the family walks on the edge of the road", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: two men talk next to a bus stop three kids play on a dock a man records with a little brown-haired girl on the sand people cross the pathway the family walks on the pavement\nB: two women discuss next to a parking meter three adults walk on a pontoon a woman makes a video with a little blonde boy on the sand people cross 
the street the family walks on the sidewalk\nC: two men discuss next to a parking meter three children walk on a pontoon a man makes a video with a little blonde girl on the sand people cross the road the family walks on the side of the road\nD: two men argue next to a parking meter three teenagers walk on a bridge a woman takes a photo with a little brunette girl on the sand people cross the intersection the family walks on the edge of the road", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_171_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_171_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_171_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_171_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_171_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: i waited for the bus to arrive to take us to the beach .it was a beautiful day for the beach .after the beach we went for a walk .there were a lot of people on the pier walking as well .after our walk we got back in the bus . we were very tired .\nB: people waiting at a train station\nC: sunny day at a ski resort\nD: crowded city street", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: i waited for the bus to arrive to take us to the beach .it was a beautiful day for the beach .after the beach we went for a walk .there were a lot of people on the pier walking as well .after our walk we got back in the bus . 
we were very tired .\nB: people waiting at a train station\nC: sunny day at a ski resort\nD: crowded city street", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_172_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_172_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_172_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_172_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_172_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The guests at the wedding stand up from their tables. The couple makes a speech. The couple make a joke. The guests laugh at the joke. The guests await the marriage ceremony.\nB: There is no speech or joke at the wedding.\nC: The guests are not laughing at any joke at the wedding.\nD: The guests are sitting at their tables during the wedding.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The guests at the wedding stand up from their tables. The couple makes a speech. The couple make a joke. The guests laugh at the joke. 
The guests await the marriage ceremony.\nB: There is no speech or joke at the wedding.\nC: The guests are not laughing at any joke at the wedding.\nD: The guests are sitting at their tables during the wedding.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_173_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_173_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_173_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_173_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_173_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The teammates are supportive and cheering for the girl with a ponytail.\nB: The coach is happy with her team's performance.\nC: The girl with a ponytail is content with her coach.\nD: The blonde coach looks disapprovingly at her team. The girl with a ponytail is standing by herself and she is not happy with her coach. The girl's three teammates are disappointed with what the coach is saying, but they stay quiet. The blonde coach begins her dressing down of the team. The girl on her own talks back to her coach in a disrespectful manner.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The teammates are supportive and cheering for the girl with a ponytail.\nB: The coach is happy with her team's performance.\nC: The girl with a ponytail is content with her coach.\nD: The blonde coach looks disapprovingly at her team. The girl with a ponytail is standing by herself and she is not happy with her coach. The girl's three teammates are disappointed with what the coach is saying, but they stay quiet. The blonde coach begins her dressing down of the team. 
The girl on her own talks back to her coach in a disrespectful manner.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_174_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_174_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_174_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_174_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_174_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: i took a selfie when my husband and i went to the beach .he made the decision to play pool after the beach .i did n't realize how well he played and he picked a great bar to play at .we had a good time laughing and playing .i took another selfie at the end of the day because i was so happy with the time i had .\nB: we went to the movies and watched a comedy.\nC: i went to the beach with a friend and we played volleyball.\nD: my husband and i went to a restaurant and had a nice dinner.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: i took a selfie when my husband and i went to the beach .he made the decision to play pool after the beach .i did n't realize how well he played and he picked a great bar to play at .we had a good time laughing and playing .i took another selfie at the end of the day because i was so happy with the time i had .\nB: we went to the movies and watched a comedy.\nC: i went to the beach with a friend and we played volleyball.\nD: my husband and i went to a restaurant and had a nice dinner.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_175_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_175_1.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_175_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_175_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_175_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: this girl is happy and beautiful !this llama looks crazy !what a great time at the fair !this clown is so funny and entertaining !what a beautiful and lit up sight !\nB: this girl looks sad and dull !this llama looks calm !what a boring and gloomy time at the fair !this clown is serious and uninteresting !what a ugly and dark sight !\nC: this girl looks worried and plain !this llama looks normal !what a mediocre and uninteresting time at the fair !this clown is not funny and unentertaining !what a dull and unexciting sight !\nD: this girl looks angry and ugly !this llama looks peaceful !what a terrible and scary time at the fair !this clown is boring and dull !what a horrible and dull sight !", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: this girl is happy and beautiful !this llama looks crazy !what a great time at the fair !this clown is so funny and entertaining !what a beautiful and lit up sight !\nB: this girl looks sad and dull !this llama looks calm !what a boring and gloomy time at the fair !this clown is serious and uninteresting !what a ugly and dark sight !\nC: this girl looks worried and plain !this llama looks normal !what a mediocre and uninteresting time at the fair !this clown is not funny and unentertaining !what a dull and unexciting sight !\nD: this girl looks angry and ugly !this llama looks peaceful !what a terrible and scary time at the fair !this clown is boring and dull !what a horrible and dull sight !", + "input_image_path": [ + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_176_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_176_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_176_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_176_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_176_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: A woman is shown looking through a crowd. Women sitting next to each other do not face the camera. People are watching someone present. People are gather at long dining tables to eat food. A man and woman are happily looking towards the front of the room.\nB: A woman is shown looking through a crowd. Women sitting next to each other turn and face the camera. People are watching someone present. People are gather at long dining tables to eat food. A man and woman are happily looking towards the front of the room.\nC: A man is shown looking through a crowd. Women sitting next to each other turn and face the camera. People are watching someone present. People are gather at long dining tables to eat food. A woman and man are happily looking towards the front of the room.\nD: A woman is shown looking through a crowd. Men sitting next to each other turn and face the camera. People are watching someone present. People are gather at long dining tables to eat food. A man and woman are happily looking towards the front of the room.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: A woman is shown looking through a crowd. Women sitting next to each other do not face the camera. People are watching someone present. People are gather at long dining tables to eat food. 
A man and woman are happily looking towards the front of the room.\nB: A woman is shown looking through a crowd. Women sitting next to each other turn and face the camera. People are watching someone present. People are gather at long dining tables to eat food. A man and woman are happily looking towards the front of the room.\nC: A man is shown looking through a crowd. Women sitting next to each other turn and face the camera. People are watching someone present. People are gather at long dining tables to eat food. A woman and man are happily looking towards the front of the room.\nD: A woman is shown looking through a crowd. Men sitting next to each other turn and face the camera. People are watching someone present. People are gather at long dining tables to eat food. A man and woman are happily looking towards the front of the room.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_177_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_177_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_177_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_177_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_177_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The images reveal a small and unremarkable garden.\nB: The images depict a crowded and untidy garden.\nC: The images show a dreary and uninviting garden.\nD: i am very fond of garden we spand some hover over these flowers there are many beautiful places we have a very beautiful garden in our house we gather twice a week in this room", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The images reveal a small and unremarkable garden.\nB: The images depict a crowded and 
untidy garden.\nC: The images show a dreary and uninviting garden.\nD: i am very fond of garden we spand some hover over these flowers there are many beautiful places we have a very beautiful garden in our house we gather twice a week in this room", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_178_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_178_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_178_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_178_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_178_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The family was at a barbecue. They were roasting marshmallows on the fire. The dad was grilling. His kids were playing in the background. Mom joined them with a smile.\nB: The family was hiking. They were roasting marshmallows on the fire. The dad was cooking. His kids were fighting in the background. Mom joined them with a smile.\nC: The family was at a picnic. They were roasting marshmallows on the fire. The dad was cooking. His kids were reading in the background. Mom ignored them with a frown.\nD: The family was camping. They were grilling meat on the fire. The dad was eating. His kids were playing in the background. Mom joined them with a smile.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The family was at a barbecue. They were roasting marshmallows on the fire. The dad was grilling. His kids were playing in the background. Mom joined them with a smile.\nB: The family was hiking. They were roasting marshmallows on the fire. The dad was cooking. His kids were fighting in the background. Mom joined them with a smile.\nC: The family was at a picnic. 
They were roasting marshmallows on the fire. The dad was cooking. His kids were reading in the background. Mom ignored them with a frown.\nD: The family was camping. They were grilling meat on the fire. The dad was eating. His kids were playing in the background. Mom joined them with a smile.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_179_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_179_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_179_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_179_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_179_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: a group picture on the beach with beach toys and a picnic basket\nB: a scenic view of the mountains with a river flowing through\nC: a family gathering at a park with a barbecue grill\nD: our first breakfast on the cruise ship .we enjoyed a little 7up while eating .then we took a stroll on the deck .we checked out the cars in the garage of the shipwe asked a bystander to take this photo of the three of us", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: a group picture on the beach with beach toys and a picnic basket\nB: a scenic view of the mountains with a river flowing through\nC: a family gathering at a park with a barbecue grill\nD: our first breakfast on the cruise ship .we enjoyed a little 7up while eating .then we took a stroll on the deck .we checked out the cars in the garage of the shipwe asked a bystander to take this photo of the three of us", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_180_0.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_180_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_180_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_180_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_180_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Two young women in green are seen looking at the screen of their mobile devices.\nB: A young woman dressed in blue is seen looking at the screen of her mobile device. She is accompanied by a man who is dressed entirely in purple, and they are currently strolling around a city.\nC: A young man and a young woman are seen glancing at the screen of their mobile devices.\nD: A young man dressed in purple is seen glancing at the screen of his mobile device. He is accompanied by another man who is dressed entirely in purple, and they are currently strolling around a city. They are located on a street in a coastal city that is directly across from a beach. As they stroll alongside one another, they are pondering the question, \"Where are we going?\n\u201d. The first person in the pair looks through a trash can to see if there's anything they can use.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Two young women in green are seen looking at the screen of their mobile devices.\nB: A young woman dressed in blue is seen looking at the screen of her mobile device. She is accompanied by a man who is dressed entirely in purple, and they are currently strolling around a city.\nC: A young man and a young woman are seen glancing at the screen of their mobile devices.\nD: A young man dressed in purple is seen glancing at the screen of his mobile device. 
He is accompanied by another man who is dressed entirely in purple, and they are currently strolling around a city. They are located on a street in a coastal city that is directly across from a beach. As they stroll alongside one another, they are pondering the question, \"Where are we going?\n\u201d. The first person in the pair looks through a trash can to see if there's anything they can use.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_181_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_181_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_181_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_181_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_181_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Two men are sitting on a bench waiting for their friends to arrive.\nB: Three men are standing by the car chatting happily.\nC: Three men are standing in front of a building waiting for someone meeting them there. A man shows up with news and two of the men step closer to hear him better. He tells the 3 men that their other friend couldn't come with him, because his mom needed him today. One man thinks that is so funny he stops and asks Are you serious man? with a clownish face He can't b it and asks the other men if they believed their friend had to stay home with mom.\nD: A group of people are posing for a picture in front of a building.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Two men are sitting on a bench waiting for their friends to arrive.\nB: Three men are standing by the car chatting happily.\nC: Three men are standing in front of a building waiting for someone meeting them there. 
A man shows up with news and two of the men step closer to hear him better. He tells the 3 men that their other friend couldn't come with him, because his mom needed him today. One man thinks that is so funny he stops and asks Are you serious man? with a clownish face He can't b it and asks the other men if they believed their friend had to stay home with mom.\nD: A group of people are posing for a picture in front of a building.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_182_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_182_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_182_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_182_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_182_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The flight attendants are not providing any services.\nB: A flight attendant addresses someone who is out of screen shot. A flight attendant jokes with another person in the galley. A view out the window shows another jet at the airport. The airplane is crowded. Flight attendants demonstrate safety equipment.\nC: The airplane is empty and not ready for a flight.\nD: The passengers are quiet and seated comfortably.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The flight attendants are not providing any services.\nB: A flight attendant addresses someone who is out of screen shot. A flight attendant jokes with another person in the galley. A view out the window shows another jet at the airport. The airplane is crowded. 
Flight attendants demonstrate safety equipment.\nC: The airplane is empty and not ready for a flight.\nD: The passengers are quiet and seated comfortably.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_183_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_183_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_183_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_183_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_183_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The man is fixing a car.\nB: The two women are arguing about money.\nC: We are in our room. We are talking about cleaning. The woman wants to read instead. The old woman talks to us. We grow old together.\nD: They are outdoors having a picnic.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The man is fixing a car.\nB: The two women are arguing about money.\nC: We are in our room. We are talking about cleaning. The woman wants to read instead. The old woman talks to us. 
We grow old together.\nD: They are outdoors having a picnic.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_184_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_184_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_184_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_184_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_184_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the sea stretched before us in an endless ocean of blue .in the evening , the sun began to set in the west over the water .rock climbing was an interesting experience .i was afraid to fall , but took a chance and waved .the water is so blue , it almost looks like ice .\nB: the city skyline was stunning as the sun set .i was mesmerized by the beautiful colors of the sky .i couldn't take my eyes off the horizon .the buildings were silhouetted against the orange and pink sky .\nC: we climbed a huge mountain and saw a breathtaking view .the sky was clear and the sun was shining brightly .i couldn't believe the stunning scenery .i felt alive and free at the top .\nD: the sky was clear and the water was warm .it was a perfect day for swimming .i was amazed by the beautiful sunset .the water looked so inviting .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the sea stretched before us in an endless ocean of blue .in the evening , the sun began to set in the west over the water .rock climbing was an interesting experience .i was afraid to fall , but took a chance and waved .the water is so blue , it almost looks like ice .\nB: the city skyline was stunning as the sun set .i was mesmerized by the beautiful colors of the sky .i couldn't take my eyes 
off the horizon .the buildings were silhouetted against the orange and pink sky .\nC: we climbed a huge mountain and saw a breathtaking view .the sky was clear and the sun was shining brightly .i couldn't believe the stunning scenery .i felt alive and free at the top .\nD: the sky was clear and the water was warm .it was a perfect day for swimming .i was amazed by the beautiful sunset .the water looked so inviting .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_185_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_185_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_185_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_185_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_185_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: we decided to go to city on vacation this year .city was absolutely beautiful , i love how the buildings sit right on the street .the architecture was amazing , we do n't have buildings like these in the states .we took a taxi ride down the road , it was very romatic .the food was very different than i 'm used to , i never was quite sure what i was eating .\nB: we decided to go to beach on vacation this year .beach was absolutely beautiful , i love how the buildings sit right on the sand .the architecture was amazing , we do n't have buildings like these in the states .we took a boat ride down the river , it was very romatic .the food was very different than i 'm used to , i never was quite sure what i was eating .\nC: we decided to go to village on vacation this year .village was absolutely beautiful , i love how the buildings sit right on the hill .the architecture was amazing , we do n't have buildings like these in the states .we took a bicycle 
ride down the path , it was very romatic .the food was very different than i 'm used to , i never was quite sure what i was eating .\nD: we decided to go to location on vacation this year .location was absolutely beautiful , i love how the buildings sit right on the water .the architecture was amazing , we do n't have buildings like these in the states .we took a gondola down the canal , it was very romatic .the food was very different than i 'm used to , i never was quite sure what i was eating .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: we decided to go to city on vacation this year .city was absolutely beautiful , i love how the buildings sit right on the street .the architecture was amazing , we do n't have buildings like these in the states .we took a taxi ride down the road , it was very romatic .the food was very different than i 'm used to , i never was quite sure what i was eating .\nB: we decided to go to beach on vacation this year .beach was absolutely beautiful , i love how the buildings sit right on the sand .the architecture was amazing , we do n't have buildings like these in the states .we took a boat ride down the river , it was very romatic .the food was very different than i 'm used to , i never was quite sure what i was eating .\nC: we decided to go to village on vacation this year .village was absolutely beautiful , i love how the buildings sit right on the hill .the architecture was amazing , we do n't have buildings like these in the states .we took a bicycle ride down the path , it was very romatic .the food was very different than i 'm used to , i never was quite sure what i was eating .\nD: we decided to go to location on vacation this year .location was absolutely beautiful , i love how the buildings sit right on the water .the architecture was amazing , we do n't have buildings like these in the states .we took a gondola down the canal , it was very romatic .the food was 
very different than i 'm used to , i never was quite sure what i was eating .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_186_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_186_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_186_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_186_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_186_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The city was quiet one morning. The cage was barely visible. A strange man appeared with a tour guide. A cat emerged to be pet by the new strange man. The guide was in awe of the stranger petting the cat.\nB: The city was loud one morning. The cage was clearly visible. A familiar man appeared with a tour guide. A dog emerged to be pet by the new familiar man. The guide was uninterested in the stranger petting the dog.\nC: The city was empty one morning. The cage was invisible. A mysterious man appeared with a tour guide. A mouse emerged to be pet by the new mysterious man. The guide was scared of the stranger petting the mouse.\nD: The city was bustling one morning. The cage was completely hidden. A typical man appeared with a tour guide. A bird emerged to be pet by the new typical man. The guide was skeptical of the stranger petting the bird.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The city was quiet one morning. The cage was barely visible. A strange man appeared with a tour guide. A cat emerged to be pet by the new strange man. The guide was in awe of the stranger petting the cat.\nB: The city was loud one morning. The cage was clearly visible. A familiar man appeared with a tour guide. 
A dog emerged to be pet by the new familiar man. The guide was uninterested in the stranger petting the dog.\nC: The city was empty one morning. The cage was invisible. A mysterious man appeared with a tour guide. A mouse emerged to be pet by the new mysterious man. The guide was scared of the stranger petting the mouse.\nD: The city was bustling one morning. The cage was completely hidden. A typical man appeared with a tour guide. A bird emerged to be pet by the new typical man. The guide was skeptical of the stranger petting the bird.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_187_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_187_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_187_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_187_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_187_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: we worked the game that day .the crowd was so excited for the team .the show at halftime was entertaining .the marcee displayed many important players for the event .the evening ended with fireworks .\nB: they sang the anthem before the game .the crowd was disappointed with the team .the halftime show was boring .the marcee was not well-prepared for the event .the evening ended quietly .\nC: the players competed in the game .the crowd was silent during the game .there was no halftime show .the marcee was not present at the event .the evening ended with no special event .\nD: the players practiced before the game .the crowd was indifferent to the team .there was no halftime show .the marcee was not present at the event .the evening ended with no special event .", + "question": "Describe this set of images briefly.", + "context": 
"Select from the following choices.\nA: we worked the game that day .the crowd was so excited for the team .the show at halftime was entertaining .the marcee displayed many important players for the event .the evening ended with fireworks .\nB: they sang the anthem before the game .the crowd was disappointed with the team .the halftime show was boring .the marcee was not well-prepared for the event .the evening ended quietly .\nC: the players competed in the game .the crowd was silent during the game .there was no halftime show .the marcee was not present at the event .the evening ended with no special event .\nD: the players practiced before the game .the crowd was indifferent to the team .there was no halftime show .the marcee was not present at the event .the evening ended with no special event .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_188_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_188_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_188_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_188_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_188_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: it was a boring night with nothing much happening .people didn't enjoy themselves .broken relationships remained broken .no fun stories were told .and a boring speech was heard by all .\nB: it was a chaotic night with no sense of accomplishment .people had a terrible time together .broken relationships remained broken .many sad stories were told .and a terrible speech was heard by all .\nC: it was a quiet night with no excitement .people had a boring time together .lost relationships stayed lost .no stories were told .and a quiet speech was heard by all .\nD: it was 
a night to celebrate great accomplishments .people had a great time together .lost relationships were rekindled .many fun stories were told .and a great speech was heard by all .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: it was a boring night with nothing much happening .people didn't enjoy themselves .broken relationships remained broken .no fun stories were told .and a boring speech was heard by all .\nB: it was a chaotic night with no sense of accomplishment .people had a terrible time together .broken relationships remained broken .many sad stories were told .and a terrible speech was heard by all .\nC: it was a quiet night with no excitement .people had a boring time together .lost relationships stayed lost .no stories were told .and a quiet speech was heard by all .\nD: it was a night to celebrate great accomplishments .people had a great time together .lost relationships were rekindled .many fun stories were told .and a great speech was heard by all .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_189_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_189_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_189_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_189_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_189_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the karaoke bar was empty when [female] stepped in\nB: the karaoke bar was a mess when [female] stepped in .there was a fat man trying to sing a [female] [male] song .[female] decided to go up on the table and fight for the microphone .it did n't work because some other fat man won it over .[female] gave up after the man in blue decided that 
he was eminem .\nC: the karaoke bar was quiet when [female] stepped in\nD: the karaoke bar was crowded when [female] stepped in", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the karaoke bar was empty when [female] stepped in\nB: the karaoke bar was a mess when [female] stepped in .there was a fat man trying to sing a [female] [male] song .[female] decided to go up on the table and fight for the microphone .it did n't work because some other fat man won it over .[female] gave up after the man in blue decided that he was eminem .\nC: the karaoke bar was quiet when [female] stepped in\nD: the karaoke bar was crowded when [female] stepped in", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_190_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_190_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_190_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_190_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_190_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: a group of people playing beach volleyball by the sea.\nB: a person swimming in a tropical ocean with colorful fish.\nC: there are two people racing in the snow. they are enjoying the snow skating. then, they go on a mountain top to take a snowy risk at the highest peaks. The mountain peak is high and beautiful. 
This picture expresses the suffering of this person between snow and mountains\nD: a couple hiking in the desert under the hot sun.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: a group of people playing beach volleyball by the sea.\nB: a person swimming in a tropical ocean with colorful fish.\nC: there are two people racing in the snow. they are enjoying the snow skating. then, they go on a mountain top to take a snowy risk at the highest peaks. The mountain peak is high and beautiful. This picture expresses the suffering of this person between snow and mountains\nD: a couple hiking in the desert under the hot sun.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_191_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_191_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_191_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_191_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_191_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: Two boys are playing in the park.\nB: A group of students are sitting in a classroom.\nC: A girl is walking in school uniform. In a lush green walk way surrounded with green mountain also, two boys are on the way to school in uniform with a girl holding school bag and walking in school uniform.\nD: A girl is riding a bicycle in a city street.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: Two boys are playing in the park.\nB: A group of students are sitting in a classroom.\nC: A girl is walking in school uniform. 
In a lush green walk way surrounded with green mountain also, two boys are on the way to school in uniform with a girl holding school bag and walking in school uniform.\nD: A girl is riding a bicycle in a city street.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_192_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_192_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_192_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_192_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_192_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: After reaching the snow coated area, we immediately turned back\nB: During the climb, we were hit by an unexpected snowstorm\nC: At the snow covered area, we decided to turn back\nD: As i approach the snow coated area I climbed the snow covered rocks But we had to stop to put on our gloves and ensure safety measures We then reach our destination, the hot water springs After spending time there , we head home", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: After reaching the snow coated area, we immediately turned back\nB: During the climb, we were hit by an unexpected snowstorm\nC: At the snow covered area, we decided to turn back\nD: As i approach the snow coated area I climbed the snow covered rocks But we had to stop to put on our gloves and ensure safety measures We then reach our destination, the hot water springs After spending time there , we head home", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_193_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_193_1.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_193_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_193_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_193_4.jpg" + ], + "output": "D" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: The group arrived at the coffee table. The group was discussing the various drinks available. Some drinks were served into the cups. The group was chatting and enjoying their drinks.\nB: The female arrived at the dining table. The female was talking with the two males about how good the soup was. The female had served the males some soup into each one of their bowls. The males were happily enjoying their soup. The female sat down to join the males and eat her own bowl of soup.\nC: The male arrived at the dining table. The male was talking with the two females about how good the soup was. The male had served the females some soup into each one of their bowls. The females were happily enjoying their soup. The male sat down to join the females and eat his own bowl of soup.\nD: The male arrived at the dining table. The male was talking with the two females about how good the salad was. The male had served the females some salad into each one of their bowls. The females were happily enjoying their salad. The male sat down to join the females and eat his own bowl of salad.", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: The group arrived at the coffee table. The group was discussing the various drinks available. Some drinks were served into the cups. The group was chatting and enjoying their drinks.\nB: The female arrived at the dining table. The female was talking with the two males about how good the soup was. The female had served the males some soup into each one of their bowls. 
The males were happily enjoying their soup. The female sat down to join the males and eat her own bowl of soup.\nC: The male arrived at the dining table. The male was talking with the two females about how good the soup was. The male had served the females some soup into each one of their bowls. The females were happily enjoying their soup. The male sat down to join the females and eat his own bowl of soup.\nD: The male arrived at the dining table. The male was talking with the two females about how good the salad was. The male had served the females some salad into each one of their bowls. The females were happily enjoying their salad. The male sat down to join the females and eat his own bowl of salad.", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_194_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_194_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_194_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_194_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_194_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "SSID", + "options": "A: this people seem to have fun on top of these horses they are going trough these mountains they are going to have to pass through out some little weird doors it seems to be like a very long ride and at the end they just come back to these little houses\nB: these people are skiing down the mountain\nC: these people are riding bicycles in the city\nD: these people are hiking in the forest", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: this people seem to have fun on top of these horses they are going trough these mountains they are going to have to pass through out some little weird doors it seems to be 
like a very long ride and at the end they just come back to these little houses\nB: these people are skiing down the mountain\nC: these people are riding bicycles in the city\nD: these people are hiking in the forest", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_195_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_195_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_195_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_195_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_195_4.jpg" + ], + "output": "A" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the sun is rising behind the hills on a beautiful day .\nB: the boat is seen in the distance in front of the hills .the sun is attempting to peak out from the clouds .the sun is hidden by the clouds .the waterway and hillside are blended as one .the sun is setting behind the hills on a beautiful night .\nC: the boat is seen in the distance in front of the mountains .the sun is attempting to peak out from the clouds .the sun is hidden by the clouds .the waterway and mountain are blended as one .the sun is setting behind the mountains on a beautiful night .\nD: the boat is seen up close in front of the hills .the sun is hidden by the clouds .the waterway and hillside are blended as one .the sun is setting behind the hills on a beautiful night .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the sun is rising behind the hills on a beautiful day .\nB: the boat is seen in the distance in front of the hills .the sun is attempting to peak out from the clouds .the sun is hidden by the clouds .the waterway and hillside are blended as one .the sun is setting behind the hills on a beautiful night 
.\nC: the boat is seen in the distance in front of the mountains .the sun is attempting to peak out from the clouds .the sun is hidden by the clouds .the waterway and mountain are blended as one .the sun is setting behind the mountains on a beautiful night .\nD: the boat is seen up close in front of the hills .the sun is hidden by the clouds .the waterway and hillside are blended as one .the sun is setting behind the hills on a beautiful night .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_196_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_196_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_196_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_196_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_196_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: a mountain trip out to the island .lunch at the mountain top . had a terrible time , do go .the mountain restaurant had awful food .the water was polluted , and the seagulls were aggressivenext time we will go on a nature hike .\nB: a day trip out to the island .lunch at the location location location . got indigestion , do n't go .the harbor restaurant had better food .but the water smelled , and the seagulls were pestsnext time we will go on a historical battlefield tour .\nC: a night trip out to the island .dinner at the location location location . had a great time , do go .the harbor restaurant had terrible food .the water smelled , and the seagulls were a nuisancenext time we will go on a shopping spree .\nD: a beach trip out to the island .lunch at the beach bar . 
had a great time , do n't go .the beach restaurant had delicious food .but the water was clear , and the seagulls were friendlynext time we will go on a boat cruise .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: a mountain trip out to the island .lunch at the mountain top . had a terrible time , do go .the mountain restaurant had awful food .the water was polluted , and the seagulls were aggressivenext time we will go on a nature hike .\nB: a day trip out to the island .lunch at the location location location . got indigestion , do n't go .the harbor restaurant had better food .but the water smelled , and the seagulls were pestsnext time we will go on a historical battlefield tour .\nC: a night trip out to the island .dinner at the location location location . had a great time , do go .the harbor restaurant had terrible food .the water smelled , and the seagulls were a nuisancenext time we will go on a shopping spree .\nD: a beach trip out to the island .lunch at the beach bar . 
had a great time , do n't go .the beach restaurant had delicious food .but the water was clear , and the seagulls were friendlynext time we will go on a boat cruise .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_197_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_197_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_197_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_197_3.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_197_4.jpg" + ], + "output": "B" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: a child refusing to eat his meal\nB: a group of strangers in the park playing games\nC: our loving family enjoyed our pool play day !father and son enjoy the water activities !here our little guy is getting ready to eat .he is digging into his delicious meal here !he is having a great time eating the cake !\nD: a solo man enjoying some water activities", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: a child refusing to eat his meal\nB: a group of strangers in the park playing games\nC: our loving family enjoyed our pool play day !father and son enjoy the water activities !here our little guy is getting ready to eat .he is digging into his delicious meal here !he is having a great time eating the cake !\nD: a solo man enjoying some water activities", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_198_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_198_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_198_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_198_3.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_198_4.jpg" + ], + "output": "C" + }, + { + "task": "multiple_image_captioning", + "visual_input_component": "Video image or Natural image", + "source": "VIST", + "options": "A: the weather was terrible !we had to stay indoors the whole time .the food was awful and we were all sick by the end .\nB: our family trip around location was so awesome !we saw many wild animals , including big crocodiles .the historical sites were interesting .there were so many things to see , it was hard choosing !finally we went back to the airport to go home .\nC: the scenery was beautiful !we saw amazing landscapes and breathtaking views .the people were so friendly and welcoming .we had a truly wonderful time !\nD: it was a boring trip .there was nothing exciting to see or do at all .we regretted going there .", + "question": "Describe this set of images briefly.", + "context": "Select from the following choices.\nA: the weather was terrible !we had to stay indoors the whole time .the food was awful and we were all sick by the end .\nB: our family trip around location was so awesome !we saw many wild animals , including big crocodiles .the historical sites were interesting .there were so many things to see , it was hard choosing !finally we went back to the airport to go home .\nC: the scenery was beautiful !we saw amazing landscapes and breathtaking views .the people were so friendly and welcoming .we had a truly wonderful time !\nD: it was a boring trip .there was nothing exciting to see or do at all .we regretted going there .", + "input_image_path": [ + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_199_0.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_199_1.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_199_2.jpg", + "../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_199_3.jpg", + 
"../MMIU-Benchmark/multiple_image_captioning/multiple_image_captioning_199_4.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_0_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_0_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_0_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_0_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_1_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_1_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_1_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_1_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve 
the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_2_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_2_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_2_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_2_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_3_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_3_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_3_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_3_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_4_0.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_4_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_4_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_4_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_5_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_5_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_5_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_5_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_6_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_6_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_6_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_6_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 
'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_7_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_7_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_7_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_7_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_8_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_8_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_8_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_8_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third 
image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_9_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_9_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_9_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_9_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_10_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_10_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_10_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_10_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_11_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_11_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_11_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_11_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_12_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_12_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_12_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_12_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_13_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_13_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_13_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_13_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please 
retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_14_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_14_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_14_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_14_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_15_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_15_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_15_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_15_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_16_0.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_16_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_16_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_16_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_17_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_17_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_17_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_17_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_18_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_18_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_18_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_18_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 
'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_19_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_19_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_19_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_19_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_20_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_20_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_20_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_20_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_21_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_21_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_21_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_21_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_22_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_22_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_22_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_22_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_23_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_23_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_23_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_23_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_24_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_24_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_24_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_24_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_25_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_25_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_25_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_25_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please 
retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_26_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_26_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_26_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_26_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_27_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_27_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_27_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_27_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_28_0.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_28_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_28_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_28_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_29_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_29_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_29_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_29_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_30_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_30_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_30_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_30_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 
'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_31_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_31_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_31_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_31_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_32_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_32_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_32_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_32_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_33_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_33_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_33_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_33_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_34_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_34_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_34_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_34_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_35_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_35_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_35_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_35_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_36_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_36_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_36_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_36_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_37_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_37_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_37_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_37_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please 
retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_38_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_38_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_38_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_38_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_39_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_39_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_39_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_39_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_40_0.png", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_40_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_40_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_40_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_41_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_41_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_41_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_41_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_42_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_42_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_42_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_42_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 
'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_43_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_43_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_43_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_43_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_44_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_44_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_44_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_44_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_45_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_45_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_45_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_45_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_46_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_46_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_46_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_46_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_47_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_47_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_47_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_47_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_48_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_48_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_48_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_48_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_49_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_49_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_49_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_49_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please 
retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_50_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_50_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_50_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_50_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_51_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_51_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_51_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_51_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_52_0.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_52_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_52_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_52_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_53_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_53_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_53_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_53_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_54_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_54_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_54_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_54_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 
'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_55_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_55_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_55_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_55_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_56_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_56_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_56_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_56_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_57_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_57_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_57_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_57_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_58_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_58_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_58_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_58_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_59_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_59_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_59_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_59_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_60_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_60_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_60_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_60_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_61_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_61_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_61_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_61_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please 
retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_62_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_62_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_62_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_62_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_63_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_63_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_63_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_63_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_64_0.png", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_64_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_64_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_64_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_65_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_65_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_65_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_65_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_66_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_66_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_66_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_66_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 
'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_67_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_67_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_67_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_67_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_68_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_68_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_68_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_68_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_69_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_69_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_69_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_69_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_70_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_70_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_70_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_70_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_71_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_71_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_71_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_71_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_72_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_72_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_72_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_72_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_73_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_73_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_73_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_73_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please 
retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_74_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_74_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_74_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_74_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_75_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_75_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_75_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_75_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_76_0.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_76_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_76_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_76_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_77_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_77_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_77_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_77_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_78_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_78_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_78_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_78_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 
'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_79_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_79_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_79_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_79_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_80_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_80_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_80_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_80_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_81_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_81_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_81_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_81_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_82_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_82_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_82_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_82_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_83_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_83_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_83_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_83_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_84_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_84_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_84_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_84_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_85_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_85_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_85_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_85_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please 
retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_86_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_86_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_86_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_86_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_87_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_87_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_87_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_87_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_88_0.png", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_88_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_88_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_88_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_89_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_89_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_89_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_89_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_90_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_90_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_90_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_90_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 
'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_91_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_91_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_91_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_91_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_92_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_92_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_92_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_92_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_93_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_93_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_93_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_93_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_94_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_94_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_94_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_94_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_95_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_95_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_95_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_95_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_96_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_96_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_96_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_96_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_97_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_97_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_97_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_97_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please 
retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_98_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_98_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_98_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_98_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_99_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_99_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_99_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_99_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_100_0.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_100_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_100_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_100_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_101_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_101_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_101_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_101_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_102_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_102_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_102_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_102_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": 
"['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_103_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_103_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_103_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_103_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_104_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_104_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_104_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_104_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: 
The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_105_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_105_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_105_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_105_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_106_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_106_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_106_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_106_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_107_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_107_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_107_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_107_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_108_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_108_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_108_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_108_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_109_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_109_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_109_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_109_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": 
"Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_110_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_110_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_110_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_110_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_111_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_111_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_111_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_111_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_112_0.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_112_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_112_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_112_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_113_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_113_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_113_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_113_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_114_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_114_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_114_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_114_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": 
"['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_115_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_115_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_115_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_115_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_116_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_116_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_116_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_116_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: 
The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_117_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_117_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_117_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_117_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_118_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_118_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_118_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_118_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_119_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_119_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_119_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_119_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_120_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_120_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_120_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_120_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_121_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_121_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_121_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_121_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": 
"Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_122_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_122_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_122_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_122_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_123_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_123_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_123_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_123_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_124_0.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_124_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_124_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_124_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_125_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_125_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_125_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_125_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_126_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_126_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_126_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_126_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": 
"['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_127_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_127_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_127_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_127_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_128_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_128_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_128_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_128_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: 
The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_129_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_129_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_129_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_129_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_130_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_130_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_130_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_130_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_131_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_131_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_131_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_131_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_132_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_132_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_132_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_132_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_133_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_133_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_133_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_133_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": 
"Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_134_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_134_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_134_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_134_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_135_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_135_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_135_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_135_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_136_0.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_136_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_136_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_136_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_137_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_137_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_137_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_137_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_138_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_138_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_138_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_138_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": 
"['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_139_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_139_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_139_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_139_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_140_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_140_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_140_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_140_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: 
The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_141_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_141_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_141_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_141_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_142_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_142_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_142_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_142_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_143_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_143_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_143_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_143_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_144_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_144_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_144_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_144_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_145_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_145_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_145_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_145_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": 
"Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_146_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_146_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_146_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_146_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_147_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_147_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_147_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_147_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_148_0.png", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_148_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_148_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_148_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_149_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_149_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_149_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_149_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_150_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_150_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_150_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_150_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": 
"['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_151_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_151_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_151_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_151_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_152_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_152_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_152_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_152_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: 
The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_153_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_153_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_153_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_153_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_154_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_154_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_154_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_154_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_155_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_155_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_155_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_155_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_156_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_156_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_156_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_156_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_157_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_157_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_157_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_157_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": 
"Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_158_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_158_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_158_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_158_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_159_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_159_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_159_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_159_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_160_0.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_160_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_160_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_160_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_161_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_161_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_161_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_161_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_162_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_162_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_162_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_162_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": 
"['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_163_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_163_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_163_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_163_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_164_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_164_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_164_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_164_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: 
The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_165_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_165_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_165_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_165_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_166_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_166_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_166_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_166_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_167_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_167_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_167_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_167_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_168_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_168_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_168_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_168_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_169_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_169_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_169_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_169_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": 
"Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_170_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_170_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_170_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_170_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_171_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_171_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_171_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_171_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_172_0.png", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_172_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_172_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_172_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_173_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_173_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_173_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_173_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_174_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_174_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_174_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_174_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": 
"['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_175_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_175_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_175_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_175_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_176_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_176_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_176_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_176_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: 
The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_177_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_177_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_177_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_177_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_178_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_178_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_178_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_178_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_179_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_179_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_179_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_179_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_180_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_180_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_180_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_180_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_181_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_181_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_181_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_181_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": 
"Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_182_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_182_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_182_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_182_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_183_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_183_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_183_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_183_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_184_0.png", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_184_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_184_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_184_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_185_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_185_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_185_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_185_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_186_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_186_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_186_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_186_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": 
"['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_187_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_187_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_187_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_187_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_188_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_188_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_188_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_188_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: 
The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_189_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_189_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_189_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_189_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_190_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_190_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_190_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_190_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_191_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_191_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_191_2.jpg", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_191_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_192_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_192_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_192_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_192_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_193_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_193_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_193_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_193_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": 
"Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_194_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_194_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_194_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_194_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_195_0.png", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_195_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_195_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_195_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "quickdraw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_196_0.png", + 
"../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_196_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_196_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_196_3.jpg" + ], + "output": "C" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_197_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_197_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_197_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_197_3.jpg" + ], + "output": "A" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": "['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_198_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_198_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_198_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_198_3.jpg" + ], + "output": "B" + }, + { + "task": "sketch2image_retrieval", + "visual_input_component": 
"['natural_image', 'sketch_image']", + "source": "DomainNet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_199_0.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_199_1.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_199_2.jpg", + "../MMIU-Benchmark/sketch2image_retrieval/sketch2image_retrieval_199_3.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_0_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_0_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_0_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_0_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_0_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first 
image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_1_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_1_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_1_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_1_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_1_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_2_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_2_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_2_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_2_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_2_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_3_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_3_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_3_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_3_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_3_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_4_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_4_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_4_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_4_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_4_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_5_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_5_1.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_5_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_5_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_5_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_6_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_6_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_6_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_6_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_6_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_7_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_7_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_7_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_7_3.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_7_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_8_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_8_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_8_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_8_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_8_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_9_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_9_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_9_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_9_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_9_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": 
"places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_10_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_10_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_10_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_10_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_10_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_11_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_11_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_11_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_11_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_11_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query 
in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_12_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_12_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_12_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_12_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_12_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_13_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_13_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_13_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_13_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_13_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The 
fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_14_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_14_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_14_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_14_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_14_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_15_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_15_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_15_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_15_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_15_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_16_0.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_16_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_16_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_16_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_16_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_17_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_17_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_17_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_17_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_17_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_18_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_18_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_18_2.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_18_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_18_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_19_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_19_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_19_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_19_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_19_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_20_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_20_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_20_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_20_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_20_4.jpg" + ], + "output": "A" + }, + { + 
"task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_21_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_21_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_21_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_21_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_21_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_22_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_22_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_22_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_22_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_22_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The 
fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_23_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_23_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_23_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_23_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_23_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_24_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_24_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_24_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_24_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_24_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select 
from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_25_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_25_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_25_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_25_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_25_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_26_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_26_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_26_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_26_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_26_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_27_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_27_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_27_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_27_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_27_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_28_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_28_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_28_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_28_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_28_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_29_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_29_1.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_29_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_29_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_29_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_30_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_30_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_30_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_30_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_30_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_31_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_31_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_31_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_31_3.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_31_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_32_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_32_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_32_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_32_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_32_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_33_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_33_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_33_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_33_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_33_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + 
"source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_34_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_34_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_34_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_34_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_34_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_35_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_35_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_35_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_35_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_35_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar 
scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_36_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_36_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_36_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_36_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_36_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_37_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_37_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_37_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_37_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_37_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth 
image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_38_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_38_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_38_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_38_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_38_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_39_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_39_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_39_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_39_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_39_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_40_0.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_40_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_40_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_40_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_40_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_41_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_41_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_41_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_41_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_41_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_42_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_42_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_42_2.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_42_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_42_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_43_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_43_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_43_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_43_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_43_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_44_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_44_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_44_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_44_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_44_4.JPEG" + ], + "output": "C" + }, + { + 
"task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_45_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_45_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_45_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_45_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_45_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_46_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_46_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_46_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_46_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_46_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The 
fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_47_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_47_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_47_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_47_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_47_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_48_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_48_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_48_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_48_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_48_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select 
from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_49_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_49_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_49_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_49_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_49_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_50_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_50_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_50_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_50_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_50_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_51_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_51_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_51_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_51_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_51_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_52_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_52_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_52_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_52_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_52_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_53_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_53_1.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_53_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_53_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_53_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_54_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_54_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_54_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_54_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_54_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_55_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_55_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_55_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_55_3.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_55_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_56_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_56_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_56_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_56_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_56_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_57_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_57_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_57_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_57_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_57_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": 
"['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_58_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_58_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_58_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_58_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_58_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_59_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_59_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_59_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_59_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_59_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please 
retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_60_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_60_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_60_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_60_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_60_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_61_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_61_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_61_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_61_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_61_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third 
image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_62_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_62_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_62_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_62_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_62_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_63_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_63_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_63_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_63_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_63_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_64_0.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_64_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_64_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_64_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_64_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_65_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_65_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_65_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_65_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_65_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_66_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_66_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_66_2.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_66_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_66_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_67_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_67_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_67_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_67_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_67_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_68_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_68_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_68_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_68_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_68_4.JPEG" + ], + "output": "A" + }, + { + 
"task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_69_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_69_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_69_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_69_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_69_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_70_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_70_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_70_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_70_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_70_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The 
fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_71_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_71_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_71_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_71_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_71_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_72_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_72_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_72_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_72_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_72_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select 
from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_73_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_73_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_73_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_73_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_73_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_74_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_74_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_74_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_74_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_74_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_75_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_75_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_75_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_75_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_75_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_76_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_76_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_76_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_76_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_76_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_77_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_77_1.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_77_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_77_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_77_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_78_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_78_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_78_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_78_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_78_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_79_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_79_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_79_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_79_3.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_79_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_80_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_80_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_80_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_80_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_80_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_81_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_81_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_81_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_81_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_81_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": 
"['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_82_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_82_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_82_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_82_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_82_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_83_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_83_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_83_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_83_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_83_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": 
"Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_84_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_84_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_84_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_84_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_84_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_85_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_85_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_85_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_85_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_85_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_86_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_86_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_86_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_86_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_86_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_87_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_87_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_87_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_87_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_87_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_88_0.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_88_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_88_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_88_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_88_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_89_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_89_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_89_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_89_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_89_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_90_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_90_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_90_2.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_90_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_90_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_91_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_91_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_91_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_91_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_91_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_92_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_92_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_92_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_92_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_92_4.JPEG" + ], + "output": "B" + }, 
+ { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_93_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_93_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_93_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_93_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_93_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_94_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_94_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_94_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_94_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_94_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third 
image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_95_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_95_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_95_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_95_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_95_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_96_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_96_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_96_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_96_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_96_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_97_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_97_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_97_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_97_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_97_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_98_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_98_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_98_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_98_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_98_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_99_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_99_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_99_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_99_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_99_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_100_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_100_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_100_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_100_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_100_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_101_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_101_1.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_101_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_101_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_101_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_102_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_102_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_102_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_102_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_102_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_103_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_103_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_103_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_103_3.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_103_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_104_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_104_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_104_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_104_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_104_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_105_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_105_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_105_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_105_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_105_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": 
"['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_106_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_106_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_106_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_106_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_106_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_107_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_107_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_107_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_107_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_107_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": 
"Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_108_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_108_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_108_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_108_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_108_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_109_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_109_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_109_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_109_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_109_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_110_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_110_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_110_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_110_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_110_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_111_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_111_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_111_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_111_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_111_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_112_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_112_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_112_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_112_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_112_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_113_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_113_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_113_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_113_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_113_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_114_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_114_1.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_114_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_114_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_114_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_115_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_115_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_115_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_115_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_115_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_116_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_116_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_116_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_116_3.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_116_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_117_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_117_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_117_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_117_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_117_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_118_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_118_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_118_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_118_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_118_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": 
"['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_119_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_119_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_119_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_119_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_119_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_120_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_120_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_120_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_120_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_120_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": 
"Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_121_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_121_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_121_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_121_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_121_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_122_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_122_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_122_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_122_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_122_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_123_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_123_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_123_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_123_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_123_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_124_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_124_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_124_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_124_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_124_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_125_0.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_125_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_125_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_125_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_125_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_126_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_126_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_126_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_126_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_126_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_127_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_127_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_127_2.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_127_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_127_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_128_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_128_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_128_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_128_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_128_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_129_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_129_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_129_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_129_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_129_4.jpg" + ], + "output": "D" + }, + { 
+ "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_130_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_130_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_130_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_130_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_130_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_131_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_131_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_131_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_131_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_131_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_132_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_132_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_132_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_132_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_132_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_133_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_133_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_133_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_133_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_133_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first 
image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_134_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_134_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_134_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_134_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_134_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_135_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_135_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_135_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_135_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_135_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_136_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_136_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_136_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_136_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_136_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_137_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_137_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_137_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_137_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_137_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_138_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_138_1.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_138_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_138_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_138_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_139_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_139_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_139_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_139_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_139_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_140_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_140_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_140_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_140_3.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_140_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_141_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_141_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_141_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_141_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_141_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_142_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_142_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_142_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_142_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_142_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": 
"['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_143_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_143_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_143_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_143_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_143_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_144_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_144_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_144_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_144_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_144_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + 
"question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_145_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_145_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_145_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_145_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_145_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_146_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_146_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_146_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_146_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_146_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The 
second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_147_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_147_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_147_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_147_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_147_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_148_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_148_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_148_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_148_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_148_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_149_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_149_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_149_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_149_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_149_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_150_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_150_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_150_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_150_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_150_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_151_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_151_1.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_151_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_151_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_151_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_152_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_152_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_152_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_152_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_152_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_153_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_153_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_153_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_153_3.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_153_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_154_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_154_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_154_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_154_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_154_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_155_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_155_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_155_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_155_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_155_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": 
"['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_156_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_156_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_156_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_156_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_156_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_157_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_157_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_157_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_157_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_157_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": 
"Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_158_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_158_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_158_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_158_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_158_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_159_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_159_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_159_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_159_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_159_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_160_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_160_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_160_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_160_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_160_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_161_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_161_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_161_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_161_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_161_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_162_0.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_162_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_162_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_162_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_162_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_163_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_163_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_163_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_163_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_163_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_164_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_164_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_164_2.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_164_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_164_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_165_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_165_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_165_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_165_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_165_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_166_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_166_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_166_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_166_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_166_4.jpg" + ], + "output": "A" + 
}, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_167_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_167_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_167_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_167_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_167_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_168_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_168_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_168_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_168_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_168_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_169_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_169_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_169_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_169_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_169_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_170_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_170_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_170_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_170_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_170_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the 
first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_171_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_171_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_171_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_171_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_171_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_172_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_172_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_172_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_172_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_172_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_173_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_173_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_173_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_173_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_173_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_174_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_174_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_174_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_174_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_174_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_175_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_175_1.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_175_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_175_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_175_4.JPEG" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_176_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_176_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_176_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_176_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_176_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_177_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_177_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_177_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_177_3.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_177_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_178_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_178_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_178_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_178_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_178_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_179_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_179_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_179_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_179_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_179_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": 
"['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_180_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_180_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_180_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_180_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_180_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_181_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_181_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_181_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_181_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_181_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + 
"question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_182_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_182_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_182_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_182_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_182_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_183_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_183_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_183_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_183_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_183_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_184_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_184_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_184_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_184_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_184_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_185_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_185_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_185_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_185_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_185_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_186_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_186_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_186_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_186_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_186_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_187_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_187_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_187_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_187_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_187_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_188_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_188_1.JPEG", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_188_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_188_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_188_4.JPEG" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_189_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_189_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_189_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_189_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_189_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_190_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_190_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_190_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_190_3.jpg", + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_190_4.jpg" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_191_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_191_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_191_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_191_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_191_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_192_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_192_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_192_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_192_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_192_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + 
"visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_193_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_193_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_193_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_193_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_193_4.jpg" + ], + "output": "B" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_194_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_194_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_194_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_194_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_194_4.jpg" + ], + "output": "D" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", 
+ "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_195_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_195_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_195_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_195_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_195_4.JPEG" + ], + "output": "C" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_196_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_196_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_196_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_196_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_196_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: 
The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_197_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_197_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_197_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_197_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_197_4.jpg" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "tinyimagenet_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_198_0.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_198_1.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_198_2.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_198_3.JPEG", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_198_4.JPEG" + ], + "output": "A" + }, + { + "task": "image2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "places365_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_199_0.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_199_1.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_199_2.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_199_3.jpg", + "../MMIU-Benchmark/image2image_retrieval/image2image_retrieval_199_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_0_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_0_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_0_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_0_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_0_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_1_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_1_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_1_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_1_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_1_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_2_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_2_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_2_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_2_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_2_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_3_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_3_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_3_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_3_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_3_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The 
second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_4_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_4_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_4_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_4_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_4_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_5_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_5_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_5_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_5_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_5_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: 
The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_6_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_6_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_6_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_6_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_6_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_7_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_7_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_7_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_7_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_7_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_8_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_8_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_8_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_8_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_8_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_9_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_9_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_9_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_9_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_9_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_10_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_10_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_10_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_10_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_10_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: 
The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_11_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_11_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_11_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_11_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_11_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_12_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_12_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_12_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_12_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_12_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_13_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_13_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_13_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_13_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_13_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_14_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_14_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_14_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_14_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_14_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_15_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_15_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_15_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_15_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_15_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_16_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_16_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_16_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_16_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_16_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_17_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_17_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_17_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_17_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_17_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": 
"A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_18_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_18_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_18_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_18_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_18_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_19_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_19_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_19_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_19_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_19_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_20_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_20_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_20_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_20_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_20_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_21_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_21_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_21_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_21_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_21_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_22_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_22_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_22_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_22_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_22_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_23_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_23_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_23_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_23_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_23_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_24_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_24_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_24_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_24_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_24_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": 
"A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_25_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_25_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_25_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_25_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_25_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_26_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_26_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_26_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_26_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_26_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_27_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_27_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_27_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_27_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_27_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_28_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_28_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_28_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_28_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_28_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_29_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_29_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_29_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_29_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_29_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_30_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_30_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_30_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_30_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_30_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_31_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_31_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_31_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_31_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_31_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": 
"A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_32_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_32_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_32_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_32_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_32_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_33_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_33_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_33_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_33_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_33_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_34_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_34_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_34_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_34_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_34_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_35_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_35_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_35_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_35_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_35_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_36_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_36_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_36_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_36_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_36_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_37_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_37_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_37_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_37_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_37_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_38_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_38_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_38_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_38_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_38_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": 
"A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_39_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_39_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_39_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_39_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_39_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_40_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_40_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_40_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_40_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_40_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_41_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_41_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_41_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_41_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_41_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_42_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_42_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_42_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_42_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_42_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_43_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_43_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_43_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_43_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_43_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_44_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_44_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_44_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_44_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_44_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_45_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_45_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_45_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_45_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_45_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": 
"A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_46_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_46_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_46_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_46_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_46_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_47_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_47_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_47_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_47_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_47_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_48_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_48_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_48_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_48_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_48_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_49_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_49_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_49_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_49_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_49_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_50_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_50_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_50_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_50_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_50_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_51_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_51_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_51_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_51_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_51_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_52_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_52_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_52_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_52_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_52_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": 
"A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_53_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_53_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_53_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_53_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_53_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_54_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_54_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_54_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_54_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_54_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_55_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_55_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_55_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_55_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_55_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_56_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_56_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_56_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_56_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_56_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_57_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_57_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_57_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_57_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_57_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_58_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_58_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_58_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_58_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_58_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_59_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_59_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_59_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_59_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_59_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": 
"A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_60_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_60_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_60_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_60_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_60_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_61_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_61_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_61_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_61_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_61_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_62_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_62_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_62_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_62_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_62_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_63_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_63_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_63_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_63_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_63_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_64_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_64_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_64_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_64_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_64_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_65_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_65_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_65_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_65_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_65_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_66_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_66_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_66_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_66_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_66_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": 
"A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_67_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_67_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_67_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_67_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_67_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_68_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_68_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_68_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_68_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_68_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_69_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_69_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_69_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_69_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_69_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_70_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_70_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_70_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_70_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_70_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_71_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_71_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_71_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_71_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_71_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_72_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_72_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_72_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_72_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_72_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_73_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_73_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_73_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_73_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_73_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": 
"A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_74_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_74_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_74_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_74_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_74_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_75_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_75_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_75_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_75_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_75_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_76_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_76_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_76_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_76_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_76_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_77_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_77_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_77_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_77_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_77_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_78_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_78_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_78_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_78_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_78_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_79_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_79_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_79_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_79_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_79_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_80_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_80_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_80_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_80_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_80_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": 
"A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_81_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_81_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_81_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_81_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_81_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_82_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_82_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_82_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_82_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_82_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_83_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_83_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_83_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_83_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_83_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_84_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_84_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_84_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_84_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_84_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_85_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_85_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_85_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_85_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_85_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_86_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_86_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_86_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_86_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_86_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_87_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_87_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_87_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_87_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_87_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": 
"A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_88_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_88_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_88_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_88_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_88_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_89_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_89_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_89_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_89_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_89_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_90_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_90_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_90_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_90_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_90_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_91_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_91_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_91_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_91_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_91_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_92_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_92_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_92_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_92_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_92_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_93_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_93_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_93_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_93_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_93_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_94_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_94_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_94_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_94_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_94_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": 
"A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_95_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_95_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_95_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_95_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_95_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_96_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_96_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_96_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_96_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_96_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_97_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_97_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_97_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_97_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_97_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_98_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_98_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_98_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_98_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_98_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_99_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_99_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_99_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_99_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_99_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_100_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_100_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_100_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_100_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_100_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_101_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_101_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_101_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_101_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_101_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + 
"options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_102_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_102_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_102_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_102_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_102_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_103_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_103_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_103_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_103_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_103_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The 
second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_104_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_104_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_104_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_104_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_104_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_105_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_105_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_105_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_105_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_105_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_106_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_106_1.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_106_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_106_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_106_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_107_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_107_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_107_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_107_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_107_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_108_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_108_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_108_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_108_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_108_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + 
"visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_109_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_109_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_109_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_109_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_109_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_110_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_110_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_110_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_110_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_110_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image 
is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_111_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_111_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_111_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_111_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_111_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_112_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_112_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_112_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_112_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_112_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_113_0.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_113_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_113_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_113_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_113_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_114_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_114_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_114_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_114_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_114_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_115_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_115_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_115_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_115_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_115_4.jpg" + 
], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_116_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_116_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_116_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_116_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_116_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_117_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_117_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_117_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_117_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_117_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most 
similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_118_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_118_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_118_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_118_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_118_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_119_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_119_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_119_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_119_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_119_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_120_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_120_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_120_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_120_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_120_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_121_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_121_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_121_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_121_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_121_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_122_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_122_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_122_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_122_3.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_122_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_123_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_123_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_123_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_123_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_123_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_124_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_124_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_124_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_124_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_124_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth 
image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_125_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_125_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_125_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_125_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_125_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_126_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_126_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_126_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_126_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_126_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The 
fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_127_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_127_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_127_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_127_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_127_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_128_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_128_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_128_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_128_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_128_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_129_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_129_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_129_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_129_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_129_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_130_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_130_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_130_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_130_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_130_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_131_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_131_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_131_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_131_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_131_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", 
+ "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_132_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_132_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_132_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_132_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_132_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_133_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_133_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_133_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_133_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_133_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: 
The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_134_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_134_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_134_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_134_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_134_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_135_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_135_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_135_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_135_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_135_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_136_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_136_1.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_136_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_136_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_136_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_137_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_137_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_137_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_137_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_137_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_138_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_138_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_138_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_138_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_138_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + 
"visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_139_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_139_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_139_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_139_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_139_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_140_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_140_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_140_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_140_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_140_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image 
is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_141_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_141_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_141_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_141_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_141_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_142_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_142_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_142_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_142_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_142_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_143_0.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_143_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_143_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_143_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_143_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_144_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_144_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_144_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_144_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_144_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_145_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_145_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_145_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_145_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_145_4.jpg" + 
], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_146_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_146_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_146_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_146_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_146_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_147_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_147_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_147_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_147_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_147_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most 
similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_148_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_148_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_148_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_148_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_148_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_149_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_149_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_149_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_149_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_149_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_150_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_150_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_150_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_150_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_150_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_151_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_151_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_151_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_151_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_151_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_152_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_152_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_152_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_152_3.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_152_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_153_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_153_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_153_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_153_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_153_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_154_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_154_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_154_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_154_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_154_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth 
image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_155_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_155_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_155_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_155_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_155_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_156_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_156_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_156_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_156_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_156_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The 
fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_157_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_157_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_157_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_157_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_157_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_158_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_158_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_158_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_158_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_158_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_159_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_159_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_159_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_159_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_159_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_160_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_160_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_160_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_160_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_160_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_161_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_161_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_161_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_161_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_161_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", 
+ "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_162_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_162_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_162_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_162_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_162_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_163_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_163_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_163_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_163_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_163_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: 
The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_164_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_164_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_164_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_164_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_164_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_165_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_165_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_165_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_165_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_165_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_166_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_166_1.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_166_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_166_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_166_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_167_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_167_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_167_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_167_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_167_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_168_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_168_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_168_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_168_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_168_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + 
"visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_169_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_169_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_169_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_169_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_169_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_170_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_170_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_170_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_170_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_170_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image 
is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_171_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_171_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_171_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_171_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_171_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_172_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_172_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_172_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_172_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_172_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_173_0.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_173_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_173_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_173_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_173_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_174_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_174_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_174_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_174_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_174_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_175_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_175_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_175_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_175_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_175_4.jpg" + 
], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_176_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_176_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_176_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_176_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_176_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_177_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_177_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_177_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_177_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_177_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most 
similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_178_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_178_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_178_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_178_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_178_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_179_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_179_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_179_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_179_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_179_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_180_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_180_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_180_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_180_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_180_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_181_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_181_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_181_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_181_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_181_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_182_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_182_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_182_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_182_3.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_182_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_183_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_183_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_183_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_183_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_183_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_184_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_184_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_184_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_184_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_184_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth 
image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_185_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_185_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_185_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_185_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_185_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_186_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_186_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_186_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_186_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_186_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The 
fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_187_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_187_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_187_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_187_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_187_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_188_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_188_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_188_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_188_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_188_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_189_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_189_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_189_2.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_189_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_189_4.jpg" + ], + "output": "C" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_190_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_190_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_190_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_190_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_190_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_191_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_191_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_191_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_191_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_191_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", 
+ "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_192_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_192_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_192_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_192_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_192_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_193_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_193_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_193_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_193_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_193_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: 
The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_194_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_194_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_194_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_194_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_194_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_195_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_195_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_195_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_195_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_195_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_196_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_196_1.jpg", + 
"../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_196_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_196_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_196_4.jpg" + ], + "output": "A" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_197_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_197_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_197_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_197_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_197_4.jpg" + ], + "output": "D" + }, + { + "task": "vehicle_retrieval", + "visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_198_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_198_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_198_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_198_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_198_4.jpg" + ], + "output": "B" + }, + { + "task": "vehicle_retrieval", + 
"visual_input_component": "['natural_image']", + "source": "veri_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_199_0.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_199_1.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_199_2.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_199_3.jpg", + "../MMIU-Benchmark/vehicle_retrieval/vehicle_retrieval_199_4.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_0_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_0_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_1_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_1_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_2_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_2_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_3_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_3_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_4_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_4_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_5_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_5_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_6_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_6_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_7_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_7_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_8_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_8_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_9_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_9_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_10_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_10_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Poke\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_11_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_11_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_12_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_12_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_13_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_13_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_14_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_14_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_15_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_15_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_16_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_16_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_17_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_17_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_18_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_18_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_19_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_19_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_20_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_20_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_21_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_21_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_22_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_22_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_23_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_23_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_24_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_24_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_25_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_25_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_26_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_26_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_27_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_27_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_28_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_28_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_29_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_29_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_30_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_30_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_31_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_31_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Poke\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_32_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_32_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_33_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_33_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_34_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_34_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_35_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_35_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_36_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_36_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_37_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_37_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_38_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_38_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_39_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_39_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_40_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_40_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_41_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_41_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_42_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_42_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_43_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_43_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_44_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_44_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_45_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_45_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_46_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_46_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_47_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_47_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_48_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_48_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_49_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_49_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_50_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_50_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Poke\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_51_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_51_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_52_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_52_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_53_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_53_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_54_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_54_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_55_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_55_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_56_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_56_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_57_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_57_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_58_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_58_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_59_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_59_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_60_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_60_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Poke\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_61_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_61_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_62_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_62_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_63_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_63_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_64_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_64_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_65_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_65_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_66_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_66_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_67_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_67_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_68_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_68_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_69_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_69_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_70_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_70_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_71_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_71_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_72_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_72_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_73_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_73_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_74_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_74_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_75_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_75_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_76_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_76_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_77_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_77_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_78_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_78_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_79_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_79_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_80_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_80_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_81_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_81_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_82_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_82_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_83_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_83_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_84_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_84_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_85_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_85_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_86_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_86_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_87_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_87_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_88_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_88_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_89_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_89_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_90_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_90_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_91_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_91_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_92_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_92_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_93_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_93_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_94_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_94_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_95_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_95_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_96_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_96_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_97_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_97_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_98_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_98_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_99_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_99_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_100_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_100_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_101_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_101_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_102_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_102_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_103_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_103_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_104_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_104_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_105_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_105_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_106_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_106_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_107_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_107_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_108_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_108_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_109_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_109_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_110_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_110_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_111_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_111_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_112_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_112_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Poke\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_113_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_113_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_114_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_114_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_115_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_115_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Poke\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_116_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_116_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_117_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_117_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_118_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_118_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_119_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_119_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_120_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_120_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_121_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_121_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_122_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_122_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_123_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_123_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_124_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_124_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_125_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_125_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_126_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_126_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_127_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_127_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_128_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_128_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_129_0.jpg", + "../MMIU-Benchmark/functional_correspondence_blink/functional_correspondence_blink_129_1.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is entirely black and has a pointed black beak.\na black bird with slick feathers, and a black bill.\nthis is a black bird with a white eye and a large black beak.\nthis bird is black with green eyes and has a long, pointy beak.\nthis bird is almost all black with the exception of yellow eyes.\nsolid black bird with a medium beak and a yellow eye.\nthis bird has wings that are black and has yellow eyes\nthis bird is solid black, with a penetrating gaze and a sharp bill.\nthis particular bird has a belly that is black with white eye rings\nthis bird has a black bill and crown and black breast, belly, and wings.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_0_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_0_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_0_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_0_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second 
image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with a very long wing span and a long pointed beak.\nthe long-beaked bird has a white body with long brown wings.\nthis is a white bird with brown wings and a large pointy beak.\nthis large bird has long bill, a white breast, belly & head and a black back & wings.\nbird has an extremely long wingspan with a darker top and white belly and head.\nthis bird has wings that are brown and has a white belly\nthis bird has extended wings and a white head and body.\nthis bird is white and brown in color, with a long curved beak.\nthis white and grey bird has an enormous wing span.\nthis bird has wings that are brown and has a white body\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_1_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_1_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_1_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_1_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals on this flower are numerous colors: green white, purple, and brown.\nthis unusual flower has light green pointed petals with green and yellow stamens, purple sigma and filamented blue and white petals on the inside.\nthis flower has visible sepals, petals, corona filaments that are frilly and three pronged stamen at the top making it a very unique 
flower.\nthis flower is green white and blue in color, with petals that are oval shaped.\nthe petals of this flower are white and green with a long stigma\nthe many petals of this flower are white and purple and the pedicel is green\nthis flower has ten evenly spaced petals protruding from its center.\nthe petals of the flower are a solid white color, and the thin stamens are a blue color, with white at the base.\nthis flower has petals that are white and has purple stamen\nthis flower is white, purple and green in color, and has petals that are oval shaped.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_2_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_2_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_2_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_2_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this small bird has a very shiny black body and feathers, a tiny black bill and bright yellow eyes.\nthis is a blue bird with a white eye and a pointy beak.\nthis bird, with a very prominent yellow eye, is all black but with light, the fur can appear dark bluish or purple.\nthis is a medium sized bird with a black head, black beak, black wings and black feet, the eye is a bright white and the black has an iridescent sheen.\na medium sized bird that has shiny feathers and a narrow pointed bill\nthis particular bird has a black breast with metallic blue wingbars\na bird with a black bill, black crown and black 
secondaries.\nthe bird has a black beak and yellow eyes and a very color ful gradient on its body.\na very small black bird with a small black bill, it has long legs for its body which are black as well.\nblack head blue body, with black wings.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_3_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_3_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_3_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_3_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: purple flower with curvy string-like petals and a group of large yellow stamen in the center.\nthis purple flower has unusual parts and consists of perianth segmants, corona filaments, and a three pronged stigma.\nthe petals of this flower are purple with a long stigma\nreally cool squiggly lavender petals with purple stigma and white stamen.\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower has purple petals as well as a green pistil.\nthis flower has purple petals and purple and white stamen.\nthis flower is purple and white in color, and has petals that are oval shaped.\nthis flower has purple peyals that has long and stringy stamen\nthe flower on this particular picture has petals as well as a pistil.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_4_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_4_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_4_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_4_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has large green-yellow stamens and two types of purple petals.\nthe flower has medium size petals that are purple and smaller skinny white petals.\nlight purple and white petals white and dark purple middle petals green and yellow middle dark green leaves\nthis flower has petals that are pink with purple stringy stamen\nthis flower is pink and white in color, with petals that are oval shaped.\nthis group of two flowers have pink petals that bend backward exposing large light green stamen in the center.\nthe flower is so big with petals that are pink and arranged in disc like manner below the disc of stamen\nthis flower has thick purple petals under a layer of white fringe.\nthis flower is pink, white, and green in color, and has petals that are oval shaped.\nthis flower has petals that are pink and has grteen stamen\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_5_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_5_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_5_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_5_3.jpg" + ], + "output": "D" + }, + { + "task": 
"text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has protruding green stamen and several layers of distinctly-shaped light purple petals.\nthis flower is purple and white in color, and has petals that are oddly shaped and skinny.\nthe petals of the flower are purple in color and have a center made of yellow stigmas.\nthin light purple petals with frayed light purple, white and darker purple petals above them and a white, pink, green pistil.\nthe stringy petals are purple with the green pollen tubes in the middle\na flower with long and narrow pistils that are purple.\nthe long, narrow lavender petals are overlaid with spaghetti-like lavender strands, and striped white and purple in the center near the pistil.\nthis flower has petals that are purple and has stringy stamern\nthis flower has a back row of light violet petals with a second row of thin petals with purple and white stripes.\nthis flower has light purple petals and a corona of striped purple and white filaments between the petals and the stamens.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_6_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_6_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_6_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_6_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The 
fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a white bird with grey wings and a dark brown head.\nthis is a white bird with grey wings and a long dark beak.\nthis bird has a white belly and breast, with a brown crown and a long blunt bill.\nthis bird is white, brown, black in color with a curved black beak, and black eye rings.\na bird with a brown nape and a white back, with black inner rectrices.\nthis bird has a dark bill and brown head along with a pale colored body and tan to grey wings.\nthis bird has wings that are brown and has a long black bill\nthis is a gray bird with a dark brown head and webbed pink feet.\nthis bird is brown in color, with a black beak.\nthis particular bird has a belly that is white and brown\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_7_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_7_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_7_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_7_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a white body with a grey rump and a yellow bill.\nthis bird is white, with a large beak and black wings.\nthis bird is white with black wings, and has a long, orange beak.\nthis bird has a white head, the bill is long and curved, with a white belly and black wings.\na medium sized bird that is mostly white with a very large hooked bill\na 
very large bird with mostly white body, and a large beak.\nthis bird has wings that are black and has a yellow bill\nthe bird has black wings, with a white breast, a white neck, white head, and a yellow beak.\nthe bird has white feathers on its body and black wing feathers, it has a thick orange bill.\nthis bird has a snow white breast color and a long curved bill\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_8_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_8_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_8_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_8_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: outer petals are light purple in color and klarger,inner petals are needle shaped\nthis flower has a layer of white petals on bottom, a layer of light purple petals in the middle, and a layer of very thing purple petals on top.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has petals that are purple with stringy stamen\nthis is a strange flower with purple and white petals and green stamen.\nthe flower has five pale purple petals, five with petals and an oddly shaped green pistil.\nthis flower has rounded green and purple petals and a fringe of purple hairs.\nthis flower has petals that are purple and white and has stringy stamen\na flower with long and narrow petals that are purple.\nthis flower has a row of long green and purple petals under a row of long 
needle petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_9_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_9_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_9_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_9_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a grey bird with a black beak and a white eye.\nbird with goofy round head, wide curved bill and black all over its body.\nthis is a dark grey bird with a white eye and a large black beak.\nthis bird is white and grey in color with a curved black beak, and white eye rings.\na gray bird with a wide beak and webbed feet.\na small bird with a grey head and black nape, with blue and grey covering the rest of its body\nthis bird is black and gray in color, with a large curved beak.\nthis bird is gray and black in color, with a large black beak.\nthis small bird has a black flat bill, fuzzy black feathers and small feet.\nthis bird has wings that are lack and has white eyes\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_10_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_10_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_10_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_10_3.jpg" + ], + "output": "D" + }, + 
{ + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a purple flower with long purple anthers on it.\nthe petals of this flower are purple with a long stigma\nthis flower has petals that are purple with purple stamen\nthis flower is white and purple in color, with petals that are oval shaped.\nthis is a large flower with purple petals and white stigma.\nthe flower is so big with petals that are soft, smooth and separately arranged in single disc layer below the layer of blue curly stamen\nthis flower has purple petals as well as a green stamen.\nthis flower is purple and white in color, and has petals that are oval shaped.\nthis flower has petals that are purple and has stringy purple stamen\nit has very frilly leaves!\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_11_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_11_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_11_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_11_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is purple and white in color, with petals that are oval shaped.\nthe flower petals is spiked and lite purple and dark purple\nthis flower 
has long purple petals and long purple stamen in the middle\nthe flower is so big and large with disc like arrangements of petals and stamen with stamen disc on top of petals disc\nthis flower has purple petals and green pistil as its main features\nthis flower is white and purple in color, and has petals that are oval shaped.\nthe petals are purple and white and the stamens are green and yellow with brown spots.\nthis flower has very long purple filaments on top of a layer of flat purple petals with bright yellow stamens and green pistils.\nthis flower has petals that are white and has stringy stamen\nthe petals on this flower are mostly stringy purple, and yellow, green is the color of the stamen.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_12_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_12_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_12_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_12_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has a long wide beak and feathers over the eyes like lashes.\nthis bird has a yellow orange bill with white eyebrow and grey breast.\nthis bird has a black body with an orange beak\nthe bird as a grey belly, the bill is short and pointed, with black and grey covering the rest of its body\nthis waterbird features a rather large, yellow beak and small, red eyes.\nthis bird has wings that are black wtih a short yellow bill\nthis bird has wings that are black 
and has a yellow bill\na medium size bird with a short, pointed, orange beak.\ngrey chest, black head, yellow beak\nthis bird has wings that are black and has a yellow bill\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_13_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_13_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_13_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_13_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a black and brown bird with small brown feet with a small bill\na black bird with a long tail and a large gray beak\nthis is a black bird with a large pointy grey beak.\nthis crested black bird has spiky plumage, long tail feathers, and a short, thick gray beak.\nthis bird is completely black with a thick blunt bill.\nthis bird has wings that are black with long tail feathers\nthis bird is mostly black with a long tail and a larger beak.\na bird with a very large grey beak and black and white feathers.\nthis bird has wings that are black and has a short bill\nthis bird has a black crown with black belly and black sides.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_14_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_14_1.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_14_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_14_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a black bird with a yellow tarsus and a tall black bill.\nthis bird is black with a super thick and short beak.\nthis is a black bird with black feet and a large black beak.\nthis bird is black in color with a large black beak, and black eye rings.\nlarge black bird with long tail feathers and a thick stout black beak.\na black bird with a short, slightly curved beak, a somewhat elongated body and tapering, black tail feathers.\nthis bird has a large wide beak and black feathers covering the rest of its body.\na very tall black bird with a large black beak.\nlong black bird with a short and fat beak.\nthis bird has wings that are black and has a thick bill\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_15_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_15_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_15_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_15_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the 
bird has a small orange bill that is stubbed.\na black and white bird with a short blunt orange beak.\na small bird with black feathers covering its nape, back, and wings, along with a white and grey speckled throat.\nthis bird is white and black in color with a red beak, and white eye rings.\nthis is a black bird with a white belly and a red beka.\nthis bird has a white belly, black wings and crown, and a small red bill.\nthis is a black bird with a white spot on the head and a white breast with an orange beak.\nthis particular bird has a belly that is white with black spots\na bird with black wings and back, white belly and breast, the bill is short and flat\nthis bird is black and white in color, and has a bright orange beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_16_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_16_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_16_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_16_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: lower petals are white in color,and larger in size,inner petals are purple in clor\nwhite petals with blue white and purple petals purple green and yellow middle brown scam and green leaves\nthis flower has petals that are white, with purple and white filaments, and green stigma.\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has petals that are white with purple stringy stamen\nthe petals 
of this flower are white and arranged in a star formation around a blue pistil and green stamen.\nthis flower has petals that are white and has purple stringy stamen\nthis flower has rounded green petals under a purple and white fringe and thick stamen.\nthis flower is white and purple in color, and has petals that are oval shaped.\nthis flower has purple thin petals and large light green sepals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_17_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_17_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_17_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_17_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower has white sepal with purple and green pollen tubes\nthis flower has large green stamen and pistil surrounded by a purple fringe and white petals with rounded tips.\nthis flower has long white petals and long purple stamen in the middle\nthe white and purple flower has waxy leaves.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has white petals and large purple, white, and blue stamen.\nthis flower has white and purple petals and a green pedicel\nthis flower is white, blue, and green in color, and has petals that are oval shaped.\nthis flower has white petals that has long stringy and purple stamen\na flower with long and narrow petals that are white.\n", + "context": "Select from the following choices.\nA: The 
first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_18_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_18_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_18_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_18_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are purple with stringy purple stamen\nthis flower has purple petals with the inner ones very thin and stringy with purple pollen tube and a green stigma\nthe violet flower has petals that are soft, smooth and separately arranged in disc like manner that is below the disc of stamens that are curly through out their length\nthis flower is white and purple in color, with petals that are oval shaped.\nthe flower has petals that are lavender, with wavy lavender filaments and white anther.\nthis flower has flat oblong purple petals with a wavy layer of filaments and a tall stamen and pistil.\nthis flower has petals that are purple and has yellow stamen\nthis flower has purple petals as well as a yellow pistil.\nthis flower has long rounded petals under a fringe of frizzy purple hairs.\nthis flower is purple and white in color, and has petals that are oval shaped.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_19_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_19_1.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_19_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_19_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are purple with green steman\nthis flower is white and purple in color, with petals that are pointed.\nthis flower has purple petals and a lot of purple stamen coming out\nthis pale purple flower has string like petals that are above light purple rounded petals below.\nthe flower shown has green pollen tubes with stringy petals and a purple center\nthis flower has tall yellow stamen and green pistils, curvy white filaments, and white petals.\nthe pale purple flowers are long and thin, with darker purple encircling the base of each petal.\nthis flower has petals that are white and has purple stringy stamen\nthe flower has a lavendar petals with thin purple stamens around the green pollen tube\nthis flower has skinny, curly-looking light purple petals and large anther.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_20_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_20_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_20_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_20_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third 
image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is white in color, with petals that are oval shaped.\nthis flower's leaves appear to be large and pointy at the ends. a burnt orange color makes it exquisite to look at. the stamen is yellow in color, easy to see, and easy to get to for pollination. the pollen tube is green in color and long, sticking up above flower. the whole pistil is green in color and easy to see and get to.\nthis flower has protruding pollen tube and stamen surrounded by several slightly pointed white petals.\nthis unique white flower has lots of thin blue petals with a dark green center and purple lines.\nthe flower is so big and has petals that are white, separated, thick and arranged in a disc like manner below the stamen which is also arranged in a disclike manner that are blue, white and brown\nthis flower has ten white petals and a corona of fine, vivid purple appendages between the petals and the light green stamens.\nthis flower has white petals with blue and white filaments arranged in a flat disk formation.\nthis flower has thick pale green petals under thick purple and white fringe.\nthis flower has petals that are white and has a stringy stamen\nthis flower is white, green, and purple in color, with oval shaped petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_21_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_21_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_21_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_21_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": 
"A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a long black bill that is somewhat curved.\nthe bird has a long black bill that is curved as well.\nthis grey bird has an impressive wingspan, a grey bill, and a white stripe that surrounds the feathers near the bill.\nthis large bird is mostly grey with a long hooked bill.\nbird with long fat beak that is curved at tip, and the head is proportional to its body size with whole body covered in black\nlarge bird that is complete brown, with white stripes littering it's wings and a long blunted bill.\na black bird with very long wings and short tail, black beak with some white markings on the face around the beak and eyes\nthis bird is all black and has a long, pointy beak.\nthis appears to be a large bird that is almost completely black. it also has a very large black bill with slight white on the face and crown.\nthis bird has long triangular wings and a thick heavy beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_22_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_22_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_22_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_22_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bottom layer of the flower is purple and the middle is light purple made 
of strings.\na purple flower with little stands curling from the bottom and large petals on top.\nthis flower has a white pistil with green pedicel and purple petals\nthis flower has petals that are purple with purple stamen\nthis flower is purple in color, with petals that are oval shaped.\nthis flower has purple petals and a purplish-white colored stamen.\nthis flower is purple and white in color, and has petals that are oval shaped.\nthis flower has petals that are purple and has stringy stamen\na flower that has long and curly pistils that is purple.\nthis flower has long purple petals bent back from a center with many long fringes of purple and an elaborate center.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_23_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_23_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_23_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_23_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is completely black.\na medium bird with all black body, tarsus, and beak.\nthis bird is black in color, with a black beak and a black eye ring.\nall black bird with white eye ring, black tarsus and feet.\na bird with a black crown and a black body.\nthis bird is all black and has a long, pointy beak.\nthis bird is large and black with white eyes and black pupil and long black tail.\nthe bird is pitch black including its feet and beak, it has yellow eyes.\nthis muted black bird unveils 
a distinctive yellow eye behind a long pointed beak.\nthis bird has a jet black body, a short beak and bright white eyes.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_24_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_24_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_24_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_24_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: prominent purple stigma,petals are white inc olor\nthis flower has bright purple, spiky petals, and greenish sepals below them.\nthis flower has a row of white petals with the multi-colored stamens and a pistil at the centre\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has petals that are green with stringy purple stamen\nthis flower has flat elongated creamy petals around a fringe of purple white and brown petals and large stamen.\nthis flower has petals that are white and has purple stamen\nthis flower has blue petals as well as a green and purple pistil.\nthis flower is purple, blue and white in color, and has stamen that are very long and skinny.\nthis flower has white oblong petals with blue filaments, purple pistils, and green stamen.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_25_0.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_25_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_25_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_25_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: an all black bird with small beady eyes.\nbird has black body feathers, black breast feather, and black beak\nthis is a black bird with a long black tail wing and a pointy beak.\nthese two birds are black all over and have very long retrices and gray bills.\nthis bird is all black with a very long tail and the area around its eye is lacking feathers.\nthis bird has feathers that are black and has a thick black bill\na small black bird, with a flat tail, and a short bill.\na small bird with a black color covering and long tail.\nthis bird is dark black and featherless around is eyes, and has a short black beak.\nthis bird has wings that are black and has a thick bill\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_26_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_26_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_26_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_26_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the 
most relevant picture among the candidate images for this description.\nDescription: this flower is white and blue in color, with petals that are oval shaped.\ninner petals are needle shaped and are purple incolor\nthis flower has large green pistil and skinny purple and white petals as its main features\nthe petals of this flower are green with a long stigma\nthis flower has petals that are white with purple stamen\nthis flower has yellow anthers and green filaments and purple petals.\nthis purple flower has many pointed petals and a yellow and purple stamen.\nthis flower has petals that are white and has yellow stamen and\nthin, needle-like, purple petals, yellow-green anthers and a green stigma.\nthis unique flower has a lower row of white petals and an upper row of long, thin purple petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_27_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_27_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_27_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_27_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are green wiht purple stiamne\nthe center of the flower is of various colors such as purple, maroon, white and yellow.\nthe flower has many colors such as blue white purple with a green style\nthis flower has white petals with a white and purple colored stigma.\na flower with a lot of different colors with a large ovary\nflower that has 
long, skinny, fringed petals with a white stigma and dark brown anther.\nthis flower has blue petals as well as a green pistil.\nthis flower is white and blue in color, with oval shaped petals.\nthis flower has green petals with long and string purple stamen\na flower that has long skinny petals that are purple and white.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_28_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_28_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_28_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_28_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a flower with very thing purple, white, and red petals with very large red anther and filaments\nthe petals of this flower are green with a long stigma\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has petals that are yellow with stringy stamen\na star shaped flower with long multiple colored stamen surrounded by green and white flat petals.\nthis flower has petals that are white and has purple stringy stamen\nthis flower has very prominent green stamen and purple pistils that stand upright in contrast to a flat ring of blue filaments and white petals.\nthis flower is white, green, and purple in color, and has petals that are oval shaped.\na flower with long and pointed pistils that are blue and yellow.\nthis flower has large white petals with a light green and brown pistil.\n", + "context": 
"Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_29_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_29_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_29_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_29_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: leaves are green in color,outer petals are white green in color\nthis flower is white and purple in color, with petals that are oval shaped.\nthis purple flower has regular petals and noodle-shaped petals accompanied by one big pistil.\nthe petals of this flower are green with a long stigma\nthis flower has petals that are green with purple stamen\nthis flower has thick green petals surrounding a layer of thin hairlike purple petals.\nthis petal has purple and light green colors throughout its long, string-like petals.\na flower with long and narrow pedals that are white.\nan odd looking pinwheel shaped flower with wavy stringy pedals and a large center.\nthis flower has white petals that has longy stringy and purple stamen\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_30_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_30_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_30_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_30_3.jpg" + 
], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has head and beak proportional to its body and the bird is brown in color.\nthis bird has grey neck, head, wings and back, it has white around its bill, and a long tall bill that is curved and black at its tip.\nthis bird is brown in color with a long curved beak and dark eye rings.\nthis bird is grey with some white and has a long, pointy beak.\nthis is a solid brown bird with webbed feet and a long slightly hooked bill.\nthis particular bird has a brown body and brown bill\nthe brown colored albatross has white ring at the base of its beak, white undertail and white eyebrow.\nthis bird has wings that are brown and has a big bill\nthis bird is brown in color, with a large curved beak.\na large bird with a grey coloring and long beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_31_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_31_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_31_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_31_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is stark black from outer rectrices to beak, with 
bright orange wing bars and yellow secondaries.\nthis is a black bird with orange wings and a pointy black beak.\nthis is a black bird that has orange coverts and a yellow wingbar.\nthis bird is black and orange in color with a black beak, and black eye rings.\nblack bird with bright orange stripes on the wing bars and black eyes.\nthis is a bird with a black body, head and beak and it has red and yellow patches on both of it's wings.\nthis bird has wings that are black and has an orange and yellow patch\nthis bird is black with red and has a very short beak.\nthis bird is black with red and has a long, pointy beak.\na small black bird, with 1 yellow bar, and a sharp bill.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_32_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_32_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_32_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_32_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this purple flower has thin string like petals and green stamen.\nthis flower has long rounded white petals on the outer row and long, thin, wavy purple petals on the inner row.\nthis flower is purple and yellow in color with skinny wavy petals.\nthis is a flower with light purple petals and flowing flaments.\nthis flower has lavender peddles with thorns and hair like purple stigmas and white pistil.\nthis flower has petals that are purple and has stringy stamen\nthe flower has long, 
thin purple petals and long green stamens.\nthis flower has white petals, purple stamen and yellow pistil\nthis purple flower has pointed petals and pale purple sepals.\nthis flower has purple petals and purple stigma in a flat circle shape.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_33_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_33_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_33_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_33_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with a meedium sized, medium width pointed bill, all black feathers, a small head, and yellowish eyes\nthis bird has a black pointed beak, and a black body.\nthis is a black bird with black feet and a pointy black beak.\nthis bird is completely black, with a short pointed bill and yellow eyes.\nthis bird is all black, except for a region around its head that is so black it almost looks blue, and its eyes are yellow.\nthis is a black scary looking bird with beading eyes.\nthis bird has feathers that are black and has a black bill\nthis bird has wings that are black and has a black bill\nthis bird has wings that are black and has a thick bill\nthis bird has a deep black crown and a back bill and black wings.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_34_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_34_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_34_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_34_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower has a green prominent pisil and stamen that are green in color\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has petals that are white with very stringy stamen\nthe flower has five anthers with skinny purple petals.\nthis flower has white petals as well as an interior row of thin, purple petals.\nthis flower has white petals with purple stamen and green stigma in the center.\nthe flower is so big with petals that are soft, smooth and arranged separately in disc like manner below the the disc of stamen\nthe petals are white in color underneath small, thin petals that are purple and large green stigma\nthe purple and white petals of the flower, are hidden by the purple stamens surrounding the petals.\nthis flower is characterized by its white-purple outer petals, deep purple, stringy inner petals as well as green stamen and purple pistil at the center.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_35_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_35_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_35_2.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_35_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower has white petals and a flat center with purple and white filaments and green stamen.\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has the white petals arranged in the bottom and the top the purple shaded long pistils closely arranged in the circle order\nthe flower has light green petals with thin purple petals on top.\nthis flower has petals that are white with long purple steman\nthis flower has white petals with flat blue filaments and green stamen.\nthis flower has green symetrical filaments and long thin petals.\nthe stamens of the flower are of a hair like texture, and have a distinctive color pattern.\nthis flower has petals that are white and has stringy stamen\nthis flower is green, white, and purple in color, and has stamen that are very long and skinny.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_36_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_36_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_36_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_36_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find 
the most relevant picture among the candidate images for this description.\nDescription: a large bird with a mohawk, and gray feathers, and an orange beak.\nthis bird has an orange bill, with solid brown feathers covering the rest of its body except for the small cheek patch which is cream and speckled with light brown.\nthis is a brown bird with a small orange beak.\nthis is a brown and grey bird with a small orange beak.\nthis bird has a short orange pointed bill and a brown/black mottled breast & body.\nthis bird has wings that are grey and has an orange bill\nthis bird is brown with a white line coming from its eye, it has a curled feather that comes up before its orange beak.\nthis bird is grey with white and has a very short beak.\nthis bird is gray and brown in color, and has a bright orange beak.\nthis bird has a dark orange bill, with a brown back.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_37_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_37_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_37_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_37_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals of this flower are magenta with a long stigma\nthis flower has large green stamen and pollen tube, and fringed purple hairlike petals, surrounded by longer white and purple petals with rounded edges.\nthis flower has petals that are purple with many stamen\nthis flower is white and purple in 
color, with petals that are oval shaped.\na flower with lavender petals and lavender squiggly pistils showing.\nthis flower has rounded pale purple petals and a fringe of frizzy purple hairs.\nthis flower has flat purple petals with purple stringy stamen forming from the center.\nthis flower is purple in color, and has petals that are oval shaped.\nthis flower has purple petals that has stringy stamen and as yellow style\nthe petals are long and thin and purple and form a flat flower.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_38_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_38_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_38_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_38_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: flower with white long white petals and very long purple stamen\nthis flower is white and pink in color, with petals that are oval shaped.\nthis flower has petals that are green with stringy purple stamen\na flower with stringy looking purple and yellow petals and a green and yellow center.\nthis flower has long white petals beneath a row of slender white and lavender petals surrounding a large erect pistil of green filaments topped with white anthers.\nthis flower has tall green stamen on top of a layer of wavy blue filaments and oblong white petals.\nthis flower is white, purple and yellow in color, and has petals that are oval shaped.\nthe flower on this particular picture 
has petals as well as a stamen.\nthis flower has petals that are white and has stringy purple stamen\nthe petals of this flower are stringy purple and white and the pistil is green\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_39_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_39_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_39_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_39_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: flower with purple petals and very long purple stamen\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower has a large amount of long and very thin purple petals that look like they are wiggling.\nthis flower has petals that are purple with long stringy stamen\nthis flower has long stringy light purple petals , with alternating layers of white and dark purple, with a yellow receptacle.\nthe flower is so big with petals that are soft, smooth and arranged in disc like manner below the disc of curly stamens\nthis flower is purple and white in color, and has petals that are oval shaped.\nthis ornate flower has a white, wheel like center, with indigo feathers, and a very unique indigo styles that string out like spaghetti.\nthis flower has a bottom layer of oblong purple petals followed by wavy purple filaments and tall white stamens and pistil.\nthis flower has petals that are purple and has stringy stamen\n", + "context": "Select from the 
following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_40_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_40_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_40_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_40_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has grey webbed feet and a bright orange beak.\nbird has gray body feathers,white breast feather, and orange beak\na grey bird with webbed feet, a short and blunt orange bill, grey head and wings and has white eyes, a white stripe behind its eyes and white belly and breast.\nthis black and white bird has a short and fat body with a small orange beak.\nthis bird is white with black on its head and has a very short beak.\nthis bird has feathers that are black and has an orange bill\nthis bird has wings that are black and has an orange bill\nthis bird is white with black and has a very short beak.\nthis bird is white and gray in color, and has a vivid orange beak.\nthis bird has a small snubbed orange bill with a bright white eyering\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_41_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_41_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_41_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_41_3.jpg" + ], + 
"output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is purple and white in color, with petals that are oval shaped.\nthis flower has a first layer of white petals with thin purple brush like petals inside along with a large green style.\nthis flower has a lower layer of white petals with an upper layer of very long and thin purple petals\nthis flower has petals in the shape of a circle and are purple\nthere are large white petals with thin purple filaments and green stamen.\nthis flower is white and purple in color, and has petals that are oval shaped.\nthis flower has petals that are white and has purple stringy stamen\nthis unique flower has long slender purple petals with stamen and stigma standing straight up from the middle.\nthe flower is so big with petals that are so soft, smooth and arranged separately below the disc of separately arranged purple stamens\nthis flower has rounded green petals under a thick fringe colored dark purple.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_42_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_42_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_42_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_42_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": 
"Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has a completely black body and white eyes.\na fat and shiny all black bird with white eyes.\na large chested black bird with white eyes.\nthis bird is mostly black with a bright yellow eye.\nthis is a large black bird with a white eye and a pointy black beak.\nthis particular bird has a puffy black breast and belly and yellow eyering\nthis round, black bird's small head sticks out prominently, and its yellow eye rings stand out from the the rest of the head.\nthis black bird has pure white color eye ring bulged belly and a sharp pointed beak\nthis bird is black in color, and has a black beak.\nthis bird is all black and has a long, pointy beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_43_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_43_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_43_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_43_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with a very large, hooked bill with black tip and all white plumage across its body.\na large white bird with a long curved bill. 
an all white body, black eye rings, and black wing feathers.\na larger sized bird with a glowing white body and a large orange beak.\nthis large white bird has a large yellow beak which points down towards the end.\nthe bird is very large and has a white belly, breast, and head with a long orange beak.\na long beaked bird with mostly white and black feathers.\nthe bird has a white belly, long peach bill that is curved and a white crown.\nthis bird has wings that are black and has a long bill\na large bird with a white breast, throat, and head with black eye rings and a large pointed beak.\na pale beak and smokey looking eyebrows on this white breasted bird are striking.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_44_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_44_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_44_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_44_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with a short, rounded beak which ends in a point, stark white eyes, and white throat.\nthis bird has a black eyering, a bright red bill, and a white throat.\nthis bird looks black and white lines drawn on it, it's small beak is blood red and it's eyes are alert, white with a small black pupil.\nthis bird has a speckled belly and breast with a blunt orange bill.\na black and white bird with white eyes and a short beak.\nthis bird has a white throat and a short orange bill\nthe bill of the 
bird is short, puffy, and a distinctive red color.\nthis bird has a long neck and has a red bill\nthis bird has a stubby red bill and white throat with a black crown and white eyes.\nthis bird has a black and dark orange bill, with white eyes.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_45_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_45_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_45_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_45_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a black bird with orange and yellow wingbars and black eyes.\nthis bird is all-black except for a blaze of red on the coverts with a short, pointy black beak and black eyes.\nbird has black body feathers, black breast feather, and pointed beak\nthe black bird has a bright orange stripe on its wings and a large tail.\nthis bird is black in color with a black beak, and black eye rings.\nthis bird is black with red and has a long, pointy beak.\nthis bird is all black with a little bit of red and yellow on it's wings and a very sharp beak.\nthis black bird has a black bill and a red patch on its wings.\na dark black bird with a short black pointed beak.\nthis bird is black and red in color, and has a black beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_46_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_46_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_46_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_46_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are pink with stringy white stamen\nthe petals of the flower are pink in color and have green leaves.\nthe petals of this flower are pink with a long stigma\nthis flower is pink and white in color, with petals that are oval shaped.\nthe flower is big and has disc of petals that are soft, smooth and has disc of stamens in the above that are white\nthis flower has purple petals as well as a green pistil.\nthis flower has petals that are pink and has stringy stamen\nthis flower is pink and white in color, and has petals that are oval shaped.\nthe main color of the pedals are deep pink with white filaments poking out.\nthis flower has long, skinny purple petals and white and dark purple stamen.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_47_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_47_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_47_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_47_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + 
"options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with a pointed bill, red and white wingbars, and with black covering the rest of its body.\nthis bird is black with a red and white spot on its wing and has a long, pointy beak.\na bird that is covered in solid black feathers with the exception of its wings which have a few orange and light yellow feathers on it.\nthis black bird features brief areas of red and white on its wings, and it has a sharp, modest size beak.\nthis bird is nearly all black with a red and white covert and short pointy bill.\nthis black bird has a red and white patch on its wing, along with a sharply pointed small black beak.\nthis bird is black white and red in color, with a black beak.\nthe bird is almost entirely jet black with a sharp bright patch of red and yellow on the wings.\nthis bird has wings that are black and has a short bill\nthis bird has a black bill and crown and breast and black wings with a white and red wingbar.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_48_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_48_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_48_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_48_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this imposing bird is all black including 
its eyes, feet, and sharp pointed bill and it has longer tail feathers.\nthis bird is all black.\na small bird covered in black feathers from head to tail, with a sharp but short pointed beak.\nthe wholly black bird features a strong, thick beak and beady black eyes.\na black bird that is medium in size very long outer rectrices.\nthis all black bird has long rectrices compared to the rest of its body and a short black bill.\nthis medium sized bird appears to be all black.\nthis bird is all black and has a very short beak.\nthe bird is small with a pointed bill, and the belly is black.\nthe bird has a black belly, black back and a black bill.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_49_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_49_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_49_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_49_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has thick, white feet and tarsi with a striking red bill.\nthis is a bird with a white belly, black wings and a white eye.\nthis bird has a white eye ring, belly and vent along with a white with grey speckled throat and breast.\nthis bird has a white belly and breast with a black wing and crown.\na black bird with a wide orange beak, white chest, short tarsus and elongated foot.\nthis bird has wings that are black and has a orange bill\nblack back and crown with very long feet.\nthis bird has wings that are black 
and has a white belly\nthis bird has a black back, wings and head, with a white belly and speckled nape.\na small black bird, with a white belly, and webbed feet.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_50_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_50_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_50_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_50_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the gray bird has a light grey head and grey webbed feet.\nthis bird has a large, straight bill, large black feet, and a white and gray crown.\na large flying bird with and all grey body, long dark wings, webbed feet, and a long sharp bill.\na medium bird with a gray body, feet,wings and bill.\nthis bird is black with white on its tail and has a long, pointy beak.\nthis is a very large charcoal colored bird with a huge wing span and webbed feet.\nthis bird has wings that are brown and has a long bill\nthis large black bird has a long wingspan and webbed feet.\nthis bird has large feet and a broad wingspan with all grey coloration\nthis bird has large, black, webbed feet, and is covered in gray plumage.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_51_0.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_51_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_51_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_51_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are purple with long stamen\nthe pedals of this flower are purple and magenta with a long stigma\nthis flower has long and skinny purple petals followed by even thinner petals above them.\nthis vibrant purple bloom features a ring of coronal filaments over the rounded sepals and a tall pistil ringed with purple filaments, each capped with a bold yellow anther.\nthis flower is yellow and purple in color, with petals that are oval shaped.\nthis flower has long purple petals with a yellow and pink pistil.\nthe petals of this flower are purple and the pedicel is green\nthis flower has bright purple oblong petals underneath a layer of long blue filaments with curly ends and tall yellow stamen with purple pistils.\nthis flower has petals that are purple and has stringy stamen\nthis unquie flower has a lot f strange looking purple petals to it\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_52_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_52_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_52_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_52_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + 
"visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a bird with a white belly and a black back\nwhite and black bird with a short orange beak and white eyes.\nthis bird has yellow abdomen and breast, black coverts and orange bill.\nbird has orange beak white belly the rest of the bird is black.\nthis goofy looking creature has a white belly and breast, black head, and short bright orange bill.\nthis particular bird has a white belly and a black breast\nthis bird is black with white and has a very short beak.\na black bird with a white breast and black feet.\nthis bird has a black crown, black primaries, and a white belly.\na dark black bird with a white belly and flank.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_53_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_53_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_53_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_53_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has prominent green stamen surrounded by a hairlike purple fringe and long rounded white and purple petals.\nthe petals on this flower are pink with a long stigma\nthis flower is white and pink in color, with petals that are oval 
shaped.\nthis flower has petals that are purple with stringy stamen\nthe petals on this flower are pink with an elaborate pistil.\nthis flower has a double row of elongated petals under a row of needle shape petals.\nthis flower has pink and white petals and has purple stamen\nthis flower has bright green stamen, with an inner layer of dark purple petals and an outer layer of light pink petals.\nthis flower is white and pink in color, and has petals that are oval shaped.\nthis flower has purple and green petals and a hair-like purple fringe in its center.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_54_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_54_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_54_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_54_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals of the flower are pink and have short white filaments with green leaves.\nthis flower has petals that are purple with stringy stamen\nthis flower's petals are thing and long, changing from white to maroon at the edges, and they surround a pair of long, red stamens.\na pink petal flower with white filaments and yellow and green anther.\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower has petals that are pink and has stringy stamen\nthis flower has long pinkish petals with white stripes and white filaments.\nthe flower on this particular picture has petals 
as well as a sepal.\nthis flower has a flat layer of long pink petals underneath another flat layer of white filaments with tall green stamen and purple pistils at the center.\na large flower with a long purple and white pedals and a green center.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_55_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_55_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_55_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_55_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has green anther and a purple stigma on top of white and purple petals.\nthis flower has long purple petals and long purple stamen coming out of it\nthis flower has petals that are light blue with purple and stringy stamen\nthis flower is purple in color, with petals that are oval shaped.\nthis flower has petals that are lavender with wavy filaments and white anthers.\nthe petals of the flower are light purple in color with broad green leaves.\nthe petals are long and thin and purple with long pointy pistil.\na flat purple flower with purple petals in two layers surrounding many green pistils.\nthis flower is purple in color, and has petals that are oval shaped.\nthis flower has petals that are purple and has stringy stamen\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_56_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_56_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_56_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_56_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has star shaped white petals as its main feature.\nthis flower has prominent green and purple stamen and pollen tube surrounded by two layers of thin purple and wide white petals.\nthe petals of this flower are white with a long stigma\nthe petals of this flower are white with long purple stigma\nthis flower is white and blue in color, with petals that are oval shaped.\na large pinwheel shaped and gray flower with a large colorful center.\na flower with white petals and purple stamen.\nthis flower is white, purple, and green in color, and has petals that are oval shaped.\nthis flower has petals that are white and has stringy stamen\nthe flower has white long thin petals and purple anthers.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_57_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_57_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_57_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_57_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The 
first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a long curved black bill and a black eyering.\nthis a dark grey bird with a brown head and a white eye.\na small bird with a brown head and brown nape, with brown covering the rest of its body, and the head is small compared to the body.\nthis is a dark grey bird with a white eye and a large beak.\nthis bird is black with long wings and has a long, pointy beak.\na large brown bird with long wings, a long blunt beak and white around the eyes.\nthis is a bird with a long tail and beak with a reddish head and black wings.\nthis brown bird has large eyes.\nthis bird has wings that are black and has a long bill\nslate grey smooth feathered bird with a large head and a wide wingspan.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_58_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_58_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_58_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_58_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals on this flower are purple with an elaborate pistil.\nthe flower has purple petals on it and the stamen is seen.\nthis flower has long, finger like purple petals and long, white and purple stamen.\nthis flower is purple and yellow in color, with petals that are oval 
shaped.\nthis flower has petals that are purple with stringy stamen\nthis flower is purple and white in color, and has petals that are oval shaped.\na flower with long and narrow petals that are light purple.\nthe petals on this flower are mostly pink and purple, with yellow stamen.\nthis flower has thick purple petals under a fringe of purple and thick stamen.\nthis flower has petals that are pink and has stringy stamen\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_59_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_59_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_59_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_59_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are green with stringy purple stamen\nthis flower is white and blue in color, with petals that are oval shaped.\nthe petals on this flower are white with an elaborate pistil.\nthe flower is unique because the petals aren't separated and they have a round tip\nthis flower has blue petals as well as a green and purple pistil.\nthis flower has thick and pale green petals under a thick fringe of purple and white.\nthis flower has petals that are white and has stringy stamen\nthis flower has white oblong petals and white flat filaments.\na flower with long and narrow petals that are whtie.\na flower with long and narrow petals that are whtie.\n", + "context": "Select from the following choices.\nA: The first 
image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_60_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_60_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_60_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_60_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals on this flower are white with an elaborate pistil.\nthis flower has petals that are green with purple stringy stgma\ninnr petals are needle shaped and are puple inc olor\nthis flower is white and blue in color, with petals that are oval shaped.\na white pedaled flower with wavy blue pedals coming from the green stigma.\nthis flower has blue petals as well as a green and purple pistil.\nthis white, purple, and blue flower has pointed petals and light green sepals.\nthis flower has long white petals and a light green and yellow pistil.\nthis flower has petals that are white and has stringy purple stamen\na flower with many white petals and long purple and white stamen at it's core.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_61_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_61_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_61_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_61_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + 
"visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird is completely black with thick feather and a thick beak.\nthis bird has a black back, a black outer rectrices, and a short bill\nmedium sized fully black bird with thin white strips on the tail feathers, and short stubby black beak.\nthis bird is all black with a large, curved beak and long, narrow tail feathers.\nthis is a black bird with tan on the crown and wings.\nthis bird is black with white and has a very short beak.\nthis is a black bird with a long tail and a thick, short beak.\nthis bird is all black and has a long, pointy beak.\nthis bird is jet black in color with a long tail in comparison to it's body length\nthis bird has wings that are black and has a thick bill\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_62_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_62_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_62_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_62_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a dark grey bird with white eyes and a very short red beak.\na small bird with an orange bill and grey crown and breast.\nthe bird is grey and white with a white eye and white 
throat.\nthis is a black and white spotted bird with a white throat, eye and red beak.\nthis bird is black and white in color with a red beak, and black eye rings.\nthis bird is black with white and has a very short beak.\nthis bird is black with white and has a very short beak.\nthis mottled white and black bird has white eyes, a white throat, and a stubby dark orange beak.\nthis bird has black eyes with a white throat and a red beak.\nthis bird is black with white and has a very short beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_63_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_63_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_63_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_63_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is red and white in color, with petals that are oval shaped.\na flower with small crimson petals with flowing red and white stamen leading into a central manyfold pollen tube cluster.\nthis flower has red thin like petals and white and yellow pistil.\nthis flower has a large red petal and a bright white anther filament\nthe flower has red petals with yellow and white pollen tubes in the center\nthis burgundy flower has 8 prominent stamen surrounded by a layer thin filament like petals and a layer of fuller petals.\nthis flower has petals that are red and has yellow stamen\nthe flower is has petals that are maroon in color and a center that is white in 
color.\nthis flower has maroon petals underneath and then long hair-like petals on the top.\nthe flower has large red pedals with long red and white needle like pedals and bright yellow stamen\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_64_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_64_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_64_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_64_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the stamen are longer with larger brown in color anthers\nthe petals of this flower are blue with a large stigma\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower has petals that are green with stringy purple stamen\nthis is a strange, multicolored flower with long, skinny petals.\nthe thin blue petals point outward while the stigma droops over the stamen.\npurple flower with interesting center and long petals\nthis flower has large white petals and a strange shaped pistil.\nthis flower is white and purple in color, and has petals that are oval shaped.\nthis flower has petals that are green and has purple stamen\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_65_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_65_1.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_65_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_65_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has black feathers, white feet, and a long black beak.\na blue blackish large bird with a yellow striped long beak, and a yellow rings outlining the eyes.\nthis bird is an almost gray/brown on its nape, back, breast, and belly, it's wings and rump are black, and it has white a white eyering.\na large dark smoky grey body white and black eyes beak is thick\nthis bird has a really long black bill and gray feathers\ngrey bird with white eye ring and beak is black with yellow line in it, feet is white color, and tail is black color.\nthis medium-sized bird is solid dark grey, almost black, with a large, thick bill and a white ring around black eyes.\nthis hefty gray bird has white eyerings and a long black bill.\nthis bird is grey with black and has a long, pointy beak.\na bird with a white eyering, all gray chest and belly and a black and yellow bill.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_66_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_66_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_66_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_66_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + 
"options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are white with purple stringy stamen\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has green pollen tubes and purple stringy petals as its main features\nthe petals of this flower are purple with a logn stigma\nthe plants has white petals with many more violet stamens\nthis flower has petals that are white and has purple stringy stamen\nthis flower has long white petals and wavy purple filaments.\nthis flower has long purple petals with a large white pistil.\nthis flower has rounded pale green petals under a fringe of paler purple.\nthis flower has white petals with stringy purple stamen in the middle of it.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_67_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_67_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_67_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_67_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird had a white and grey speckled chest with a short orange beak.\nthis bird has a black crown and back, with small accents, as well as a black and white spotted breast.\nthe bird has a grey body and a white and grey speckled chest along with an orange beak.\nthe birds has a 
white throat, breast, belly, side, abdomen and vent area.\nthis bird is white with black on its head and has a very short beak.\na small white and black bird. it has nearly completely white eyes, and a white breast with light black spots. the throat is completely white.\nthis bird is white with black and has a very short beak.\nthis bird has a black head with a white breast that is speckled with black.\nthis bird has a small orange bill with grey crown and white and grey spotted belly.\nthe small bird has a bright white eye, a short black and orange bill, and a spotted belly.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_68_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_68_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_68_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_68_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has gray tarsus and feet with an orange beak and white eyes.\nthis mostly black bird has a bright orange bill, white eyes, a white stripe leading from his eyes down his head and a feathery lifted crown.\nthis insane looking bird has a greyish-black body, a very short, crushed orange beak, and pinhole pupils.\na bird with a short rounded orange bill, stark whtie eye with white brown, and feathered point coming off its superciliary.\nthe bird has an orange bill with feathers sticking up in front of it as well as an entirely black body.\na brown medium size bird with a short orange 
beak.\nthis bird has black primaries with a black crown and black belly.\nthe bird has black feathers and an orange bill. it also has a line of white feathers behind its eye and a large black tuft of feathers between its eyes.\nthis particular bird has a belly that is black with white eye rings\nthis bird has wings that are black and has an orange bill\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_69_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_69_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_69_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_69_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a white bird with black tipped wings and a long grey beak.\nthis bird has a white chest with a long pointed white beak.\nthis bird has a curved white bill, a white belly, and black primaries.\nthis white bird has black along the ends of its wings and a pale, long beak.\nthis medium sized to large bird has a white belly, breast, head and tail with a long, pointed beak.\nthe bird has a long white bill and long black secondaries.\nthe large bird has a long light colored bill, a white rump, and a white belly.\nthis large bird has a white head and belly, white wings with black on the ends of the feathers, and a white tail.\nthis bird has wings that are black and white and has a long bill\nthis bird is white with black and has a long, pointy beak.\n", + "context": "Select from the following choices.\nA: The first 
image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_70_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_70_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_70_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_70_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a strange flower with purple petals and a white stigma.\nthe petals of this flower are purple with a long stigma\nthis flower is purple and white in color, with petals that are oval shaped.\nthe petals on this flower are purple with an elaborate pistil.\nthis flower has petals that are purple with yellow stamen\nthis flower has flat purple petals, blue stamen, and stigma with yellow and white coloring.\nthis flower has green sepals, purple petals stamen arranged alternate pattern\nthis flower has purple petals with long purple pistils that go outwards.\nthis flower has thick purple petals under a round fringe of darker purple.\nthis flower has petals that are purple and has stringy stamen\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_71_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_71_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_71_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_71_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + 
"visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this mostly black bird has a red and white spot on its shoulder\nblack bird shaped like a cylinder with red and orange wing bars\nthe bird has a small black bill, a black eyering and black belly.\nthis small, all black bird's only coloring is on its shoulders. it has a small patch of bright red and white.\nthis bird has a black crown, a black bill, and a red spot on the wing\nblack bird with red and white strip on the shoulder of the wing. he has a black beak. he is standing on a blade of beige straw\nthis bird has wings that are black and has a red and yellow patch\nthis bird has a black crown as well as a orange wingbar\nthis bird is black with red and has a long, pointy beak.\nthis bird is black with red and white on it wing, a long tail, long black legs, a small head, and pointy beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_72_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_72_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_72_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_72_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has white petals with a green ovule and stigma.\nthis flower is 
white and purple in color, with petals that are oval shaped.\nthis flower has petals that are green with string stamen\nthis flower has long white petals and long white stamen in the middle and a green stigma\nthis flower has three green prominent stigma over a base of cream colored narrow petals with purple bases, and fine entangled stamen.\na flower with long and narrow petals that are light purple.\nthis flower is white, purple, and green in color, with oval shaped petals.\nthis flower has white petals that have long ad stringy stamen with a green style\nthe petals of the flower are light white in color with an inner ring that is purple.\nthis unique flower has purple and white petals and a prominent pistil.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_73_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_73_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_73_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_73_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the color of the flower is purple and has a stamen.\nthis flower has petals that are purple with purple stringy stamen\nthis flower is purple and yellow in color, with petals that are oval shaped.\nthe petals on this flower are purple with an elaborate pistil.\na large purple and white pedaled flower with a large green and purple stigma.\nthis flower has purple petals with purple stamen in the center of it .\nthe pointed leaves of this flower are 
white graduating to a purple outline and purple rays in the center.\nthis flower is green and purple in color, and has petals that are oval shaped.\nthis flower has petals that are purple and has stringy purple stamen\nthis flower has greenish yellow stamen, purple filaments and magenta petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_74_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_74_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_74_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_74_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: flower has larger anthers that are green in color with purple petals\nthis flower has long purple petals and long white stamen in the center\nthe sepal on this flower is purple with white stamen\nthis flower has petals that are purple with white stamen\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has light purple oblong petals with long white filaments that lay flat over the petals.\nthis flower has petals that are purple and has stringy stamen\nthe flower has a purple petals with many white stamens around the light green pollen tube\nthis flower has purple petals with many white stamen in the middle, the stigma seems to be split into three and is also white.\nthis flower has a row pf purple petals under a row of long white needle shaped petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: 
The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_75_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_75_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_75_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_75_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are yellow with purple stamen\nthe petals of the flower are white in color and have thin white filaments with a yellow center.\nthis flower has petals that are white with purple filaments and green anthers.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis white petal flower has a green stigma and purple and white filament.\nthis flower is white and purple in color, and has petals that are oval shaped.\nthe petals on this flower are white and purple and the pedicel is green\nthis flower has petals that are white and has purple stamen\nthe flower shown has a large green pollen tube with white sepal and white and purple petals.\nthe busy stamens on the white and purple petals around the green pollen tube\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_76_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_76_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_76_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_76_3.jpg" + ], + "output": "D" + 
}, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a medium sized bird that has a white belly and a very short stout bill\nthis is a black bird with a white eyering and a white belly and a orange bill\nthis is a bird with a white belly, black back and an orange beak.\na large goofy looking bird with a blunt beak and white stripe behind its eye.\nblack bird with white belly and breast. distinctive orange rounded small bill and white eyebrow.\nthis bird is black with white and has a very short beak.\nthis bird is white and black in color, with a bright colored beak.\nbird with a black back and white belly with a distinctive white superciliary.\nthis bird is white and black in color, and has a orange beak.\nthis is a large bird with a white breast and belly, black, crown, throat, wings, back, an unusual short orange bill, and white eyes.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_77_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_77_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_77_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_77_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a black bird with white feathers like 
whiskers and a white stripe on the side of its head.\nthis bird has a long beak with a black and white body.\na big rounded yellow bill, long white eyebrow, eyering, long neck.\na black bird with white eyebrows, white whiskers, and a horn-like structure at the base of its big orange beak.\nthis black headed bird has a pale yellow eye and white eyebrow and white malar stripe.\nthis particular bird has a white cheek patch and white eyebrows\nthis bird has a striped head and an orange bill\nthis bird has black plumage, with a large orange beak and white stripes on its head.\nwhite eyebrows and a bright orange bill with a black outline are a stark contrast to the black feathers on the birds head.\na bird with a black head with a white stripe on its eyebrow and cheek, it has a medium length broad orange bill.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_78_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_78_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_78_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_78_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is purple and yellow in color, with petals that are oval shaped.\nthis flower has a white pollen tube, white stigma, purple petals, and purple anther and filament.\nthe flower is so big and has petals that are soft, thick, smooth and separately arranged in a disc like manner below the disc of stamen that has curly tip\nthe flower has petals 
that are lavender with purple filaments and large center with white stigma.\nthe petals of the flower are purple with a yellow center and have thin filaments coming from the petals.\nthis flower has long white petals and a white pistil.\nthe petals of this flower are purple and stringy and the pedicel is green\nthis flower is purple and yellow in color, and has petals that are oval shaped.\nthis flower has white petals laying under lavender hairy like stamen.\nthis flower has petals that are white and has purple stringy stamen\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_79_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_79_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_79_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_79_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: small bird with jet black body and beak and bright white eyes.\nbird is totally black with small beak and long retrices.\nthis bird is almost entirely black, except for a grey patch on the breast.\nthis bird has all black feathers and yellow eyes, and black feet.\nthis bird has an all black body and black feet.\nthis is a black bird with iridescent feathers on its breast.\nthis bird is shiny black in color, with a black beak.\na large black bodied bird with a small head that is a darker obsidian.\nthis particular bird has a belly that is black and gray\nthis bird has wings that are black and has a thick bill\n", + "context": "Select from 
the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_80_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_80_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_80_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_80_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are white with purple filaments and pale green stamen.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has petals that are white with string stamen\na yellow and purple flower that has large rounded petals on the bottom and skinny purple filament.\nthe petals on this flower are white with an elaborate pistil.\nthis flower has outer cream petals and inner light purple straight line petals that become darker purple towards the center that displays enlarged yellow stamens.\nthis flower has purple petals as well as a yellow pistil.\nthis flower has petals that are green and has stringy stamen\nthe magenta filament and anthers fan out from the yellow pistil region to extend over the petals.\nthe petals of the flower are white in color with light purple inner stringy petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_81_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_81_1.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_81_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_81_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has thick star shaped maroon petals as its main feature.\na red petal flower with white filament and yellow anther.\nthis flower is white and red in color, with petals that are oval shaped.\ninteresting red petals and red filament with white anthers.\nflower has petals that are burgundy with white stamen and burgundy filaments.\nthis red flower has pointed petals, white stamen and a green pedicel.\nthis purple flower has petals as well as a stamen.\nthis flower has petals that are red and has stringy stamen\na flower with long and narrow petals that are purple and long pistils.\nthis flower is red and white in color, and has petals that are oval shaped.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_82_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_82_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_82_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_82_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this 
description.\nDescription: a small bird has a short neck with a black breast, a black crown, and black secondaries.\nsmall black bird with a short black beak and bright orange wingbars.\nthis is a black bird with an orange wing and a pointy beak.\nthe bird has a black body, wings, head and tail with a orange spot on side.\na black fat looking bird with a black beak and a fully black body and a red patch on its' upper wing.\nthis bird is black with red and has a very short beak.\na small bird black breast, crown, wing, orange patch near covert.\nthis small bird is primarily dark gray, with a short black beak, a and a bright orange spot on it's breast.\nthis bird is black and orange in color, with a black beak.\na black bird with an orange spot on its wing.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_83_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_83_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_83_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_83_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a strange flower with multicolored petals and green stamen.\nthis flower has petals that are purple with purple stringy stamen\nthis flower has different shades of purple and a large green pistil.\nthe petals on this flower are purple with an elaborate pistil.\nthis flower is white and pink in color, with petals that are oval shaped.\nthe big flower has petals that are so soft, smooth and arranged 
separately forming disc like shape below the disc of purple stamens\nthis flower has large pink and white petals with a prominent green pistil.\nthis flower has tapered lavender petals surrounding a later of dark purple petals which surround the green stamen, pistil, and ovary.\nthis flower has petals that are pink and white and has stringy stamen\nthis flower has light purple and white petals with blue filaments.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_84_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_84_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_84_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_84_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has prominent green and purple stamen and pollen tube, surrounded by two layers of thin purple and wide white petals.\nthe flower has green sepal with green anther and purple filament\nthis flower is white and purple in color, with petals that are oval shaped.\nthe flower shown has light petals and purple, white and burgundy as its main feature.\nthis flower has large yellow petals and long and blue stamen in the middle\nthis flower is blue and yellow in color, and has petals that are oval shaped.\nthis flower has petals that are green and has purple stamen\nthis flower has beautiful purple and black petals and the pistil is green\nthis unique flower has a lower row of white petals and an upper row of long, thin purple petals.\nthis 
flower has a open row of green petals under a row of lavender white and maroon needle shaped petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_85_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_85_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_85_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_85_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this colorful bird has a black head and black coverts , an orange flat beak and white body.\nthis bird has a long black neck, black crown and back and a wide, orange bill.\na small bird with a white belly and orange beak.\nthis bird has a white breast, black crown, and short bright orange bill.\na bird with a black head, white eyerings and white cheek patches. the bill is short, round and orange. the neck is black and grey and the belly is light grey. 
the coverts are brown.\nthis bird is black with white and has a long, pointy beak.\nthis bird is white and black in color, with a bright orange beak.\nthe bird has a black head with an orange beak.\nthis bird has a white belly with white eyes and an orange beak.\npretty bird with a round orange bill, a white eye and white superciliary, a white throat, chest and belly, and a black head.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_86_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_86_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_86_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_86_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals of the flowers are white and the leaves are green in color.\nthis flower is purple and white in color, with petals that are oval shaped.\na purple and white flower with white and green filaments and anther.\nthis flower has stringy purple petals and green pistil as its main features\nthis pale purple flower has very prominent stamen and stigma with long, skinny petals.\nthis flower has white petals that have stringy purple stamen\na flower with long and curly pistils that are pale purple.\nthe flower on this particular picture has petals as well as a stamen.\nthis unique flower has purple petals and a very prominent pistol and stamen.\nthis flower has blue petals as well as a green pedicel.\n", + "context": "Select from the following choices.\nA: The first 
image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_87_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_87_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_87_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_87_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this birds color is blue on his body and a gray long tail.\nsmall black bird with long tail feathers and a short beak\na black bird with relatively long tail feathers and a short but pronounced beak.\nthis small bird has a black & blue colored body, and a black bill.\nthe small black bird has a short, stout beak and beady black eyes.\nthis bird has wings that are black and has a thick bill\nthe bird has a black eyering, long black outer rectrices, and black back.\nthe long tail on the black body bird with a grey bill\nthis bird has a short, downward-curved grey bill, a long tail, and black plumage covering its body.\nthis bird has wings that are black and has a white belly\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_88_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_88_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_88_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_88_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": 
"['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a medium sized bird with black feathers and an orange beak.\nthis bird has a brown crown, a pointed bill, and a brown back.\nthis is a brown and black bird with a yellow eye and an orange beak.\na medium sized bird that has tones of dark brown with a large sized bill\nthis bird is black in color with a orange beak, and black eye rings.\nthis bird is black with white and has a very short beak.\nthis bird has a yellow bill, dark grey crown with white superciliary and cheek patch, dark primaries and secondaries\nthis bird has wings that are brown and has a yellow belly\nthis bird has an orange beak with a mostly black body.\na bird with an orange colored downward-curved beak, also has a black colored crown with a stripe of white.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_89_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_89_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_89_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_89_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower is so big and large and has petals that appears like sepals and also it has stamen and petals arranged in disc like manner one above the other\nthis flower 
has long white petals, long white stamen and a tall green stigma on it\nthis flower has petals that are white with purple and stringy stamen\nthis flower has a bottom layer of white petals with an upper layer of very thin and long petals that are dark purple at the base, white in the center and light purple at the ends.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has petals that are white and has stringy stamen\nthis flower has rounded pale green lower petals surrounding a layer of thin hair-like purple upper petals.\nthis flower is white and purple in color, with oval shaped petals.\nthis white flower has one layer of flat white petals layered under a broader row of small needle like white and purple tipped petals.\nthis flower has white petals arranged in a disk formation with purple stigma and green stamen.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_90_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_90_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_90_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_90_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has webbed feet that are pale pink as well as skinny tarsus.\na large sea bird with blue feat and a large orange bill. 
it's body and head are white and wings are dark grey to black.\nthis white bird has a long, curved-at-the-end beak and webbed feet.\nthis large sea bird is white with black wings, has a long orange bill that curves downward at the end with a black tip, and its feet are pink.\nthis bird has a white crown, a long neck, and an orange bill\nthis bird has wings that are black and has a yellow bill\nthis bird has wings that are black and has a white belly\nthis large bird is mostly white with dark grey wings, light purplish feet, and a long orange beak.\nthis large bird has white feathers and webbed feet.\nthis white and brown bird has a long bill and webbed feet.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_91_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_91_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_91_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_91_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the stamen are towering over the stigma which cannot be seen.\nthe flower is so and has disc of petals below the disc of stamens that are blue, white and violet\nthe petals of this flower are blue and white with a long stigma\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has petals that are white with purple stamen\nthere is a single layer of brilliant white long oval petals in a star configuration below a single layer of bright blue and white bristle like 
petals surrounding bright green stamen and a deep purple pistil.\nthis flower has white petals with a second row of purple and white striped needle-like petals on top.\nthis flower has blue petals as well as a green and purple pistil.\nthis flower has petals that are whgite and has purple stamen\nthis blue white and purple flower has pointed petals and lime green and purple stamen.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_92_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_92_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_92_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_92_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has a bright white head, throat, breast and belly, dark grey wings and tail, and a long beak with a hooked upper beak, the area in front of the eye is black.\na large bird with a white head, neck, nape, throat, and breast, with black feathers covering the rest of its body except for white tips on some feathers.\nthis bird is black with white on its chest and head and has a long, pointy beak.\nthis bird has a white head and breast, with black covering the rest of its body.\nthis bird has a white head and chest and a beak that curves down.\nthis bird has wings that are black with a white belly\nthis bird is white and black in color, with a large white beak.\na white and black bird with black eyes sitting on the ground.\nbird with white beak and curved at the end, crown, nape, 
throat, breast, belly and abdomen are white, primaries and secondaries are black.\nthis bird has a white breast with a long bill.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_93_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_93_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_93_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_93_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a white chest and belly and a black body with a red bill.\nthis is a black and white bird with webbed feet and a thick beak.\nthis bird has a white belly, with webbed feet, beady white eyes and a small bill.\na medium sized bird with distinctive webbed feet and small tarsi with a bright orange, short bill and white belly with black head and wings.\na black bird with webbed feet, a white belly, and a short orange bill.\nthis black bird has a white breast and belly, gray webbed feet and a short, thick pink beak.\na white bellied bird with a black body and red beak.\nthis bird has a short bright orange bill, white eyes, and is black across the top and white across the bottom.\nthis bird has a black crown and wings with a white breast and webbed feet.\nthis bird has wings that are black and has an orange bill\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_94_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_94_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_94_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_94_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals of the flower are purple in color with thin purple filaments.\nthis flower is purple and green in color, with petals that are oval shaped.\nthe petals on this flower are purple with an elaborate pistil.\nthis flower has purple medium size petals topped by tooth-picked shape petals and green stamens\nthe flower is so big and has a disc of separate petals below a disc of separate stamen\nthis flower is pink, purple, and green in color, and has petals that are oval shaped.\nthis flower has petals that are pink and has purple stamen\nthis flower's ovule has a unique and characteristic design. 
the pedals are purple and splay out in a non conformed pattern.\nthis flower has light purple and grey petals around with dark purple stamen spread in front of the petals in a sun ray shape and a green huge pistol and stigma that have a unique shape.\nthis flower has white and pink petals with bright blue filaments and yellow stamen.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_95_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_95_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_95_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_95_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals of this flower are green with a long stigma\nthis flower has petals that are green with purple filaments and green anthers.\nthe flower has petals that are green, thick and separately arranged in disc like formation and also has a disc of stamens on top of it\nthis flower is blue and green in color, with petals that are oval shaped.\nthis flower has petals that are green with purple and stringy stamen\nthis flower has petals that are green and has stringy purple stamen\nthis large flower has long, thin purple petals surrounded by wider green petals with rounded edges.\nthis flower has light green petals with light blue filaments along with green stamens.\nthick green pollen tubes sit on top of the purple, white and blue stick like petals.\nthis flower has petals as well as a pistil. 
it is green and purple\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_96_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_96_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_96_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_96_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has white eyes as well as a white breast and sides with black tarsus.\na small sized bird that has a grey belly and a very short pointed bill\na small bird with white throat, breast and belly, black small head and short bill, black feet and tarsus.\na small white and black duck with white eye and short black beak.\nwhite bellied bird with black crown and bill with webbed feet.\na bird with a white breast and a black crown and black webbed feet.\nthe bird has a white eyering, large white belly and grey back.\nthe bird has a head that is grey in color and a chest that is white.\nthis bird has wings that are black and has a white belly\nthis bird has wings that are black and has a white belly\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_97_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_97_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_97_2.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_97_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this large bird has a bright orange bill, a white colored belly, and white eyebrows and cheek patches.\nthe bird has a white side and belly, with a yellow bill and black coverts.\nthis bird has a curved orange bill, a white cheek patch, and a white breast.\nthis bird has an orange bill and a white chest.\nthis bird has a large orange bill,a white belly, a brown & black side, and a white supercilliary.\nthis bird has an orange bill, a white belly and white eyebrows.\nthis bird has wings that are brown and has a white belly\nthis bird is white with black and has a very short beak.\nthis bird has an orange beak and a white belly.\nthis bird is brown with white and has a very short beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_98_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_98_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_98_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_98_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this small bird has mostly black featrues with white 
speckles through out and a dab of red at the stem of his wings.\nthis is a black bird with white spots and a small beak.\na small sized black bird that has white spots and a short pointed bill\nthe bird has a black crown and a small black eyering.\nthis is a small, black bird with white spots on the nape and wingbars.\nthis bird has wings that are black and has a rotund body\nthis bird has wings that are black and has a small bill\nthis particular bird has a belly that is black with white patches\nthis bird has a black belly with black feathers and a yellowish black beak.\nthe bird has a black bill and a black back and crown.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_99_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_99_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_99_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_99_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: large flower with fading purple-white petals, stamens yellow-green, at the receptacle are sprouting numerous white ,long filaments\nthis pale purple flower with pointed medium sized petals and arranged in leaves.\nthe flower is large with big petals that are mostly pink and has stamen forming a disc like layer on top of the petals around the pistil\nthis flower has purple petals that are long and pointed with long white petals coming from the receptacle and yellow stamens.\nthis flower is purple and white in color, and has petals that 
are long and multi colored.\nthe petals of this flower are purple and white and the stigma is light green.\nthis flower is pink and white in color, and has petals that are oval shaped.\na flower that has purple and white petals with green stamen.\nthis flower has pink and white petals that have stringy stigma\nthis flower has white and lavender petals with white stamen in the center and a green stigma.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_100_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_100_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_100_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_100_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the large brown bird has a big bill and white throat\nthis medium sized bird is primarily black and has a large wingspan and a long black bill with a strip of white at the beginning of it.\nthis bird has crown, a black bill, and a large wingspan\nthis bird features a broad wingspan and a slightly curved, dark bill.\nthis larger bird is black and has a large black beak\nthis bird is mostly black with white around the base of the large curved bill.\nthis is a mostly black and grey bird with a spectrum of white and grey secondaries and wingbars.\nthis bird is all black and has a long, pointy beak.\na medium sized bird with a long bill and brown wings\nthis bird is black with white and has a long, pointy beak.\n", + "context": "Select from the following 
choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_101_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_101_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_101_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_101_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a white and tan belly with a brown head.\nthis small, white-bellied bird has a brown head and a red-tipped beak.\nthis bird is black with a white chest and belly and has a long neck\na medium sized bird that has a white belly that has black spots on it\na medium bird with a white chest and light eyes.\nthis particular bird has a white belly and breasts and black head and back\nthis bird has wings that are black and has a white belly\nthis bird is white and brown in color, with a stubby beak.\nthis bird has wings that are brown and has a white belly\nthis bird has a white breast and belly with grey wings and a grey crown.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_102_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_102_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_102_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_102_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + 
"source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the outer petals of the flower are white while the spiky petals are blue, white, and maroon.\nthis distinctive flower features an inner layer of purple and white petals framed by a layer of white elongated petals.\nthis flower has a very unuqie look and color\nthis plant has multiple blue and white stamens that hang just above the white petals.\nthis flower has white petals and many filaments that are colored purple, white, and blue.\nthis is a light blue flower, with thick and long petals on the outside and thin, and short ombre colored petals on the inside.\nthis flower has rounded green outer petals and pale, thin, pointed purple and white inner petals.\nthis flower has two large yellow pistils, several bright pink and green stamen, a layer of bright purple, blue, and white petals, and an outer layer of light green petals.\nthis flower has long white green petals and blue tip anthers.\nthis flower has petals that are white and has string stamen\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_103_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_103_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_103_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_103_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the 
candidate images for this description.\nDescription: this flower is white and purple in color, with petals that are oval shaped.\nthis flower has long white petals and long purple stamen in the center of it\nthis flower has petals that are white with long purple steman\nthe flower shown has white petals as its main feature with green and purple stamen\nthis strange flower has pointed petals and is white ,purple and green.\nthe flower has long oval white petals and purple and green stamen and pistil.\nthis flower has white petals with purple stamen and a big style\nthis flower has white petals as well as a green and purple pistil.\na flower with white petals and white stamen.\nthis flower has white petals with fuzzy white and yellow stamen in the center of it.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_104_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_104_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_104_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_104_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird is black with a yellow eye and long black tarsals.\nthis is an all black bird with a pointy beak and a white eye.\nthis is a black and gray bird with a yellow bill and a black crown\nthis bird is all black with a short beak and buggy yellow and black eyes.\nthe small bird is entirely colored black. it's black bill is short and pointed. 
it's tarsus and foot are also black.\nthis bird has a yellow eye ring, and black feathers covering the rest of its body.\nthis bird has wings that are black and has a short bill\nthis bird has dark black, slightly iridescent feathers all over its body, with bright yellow eyes and a thin, pointed beak.\nthis bird is black in color, and has a black beak.\nthis bird has wings that are black and has white eyes\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_105_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_105_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_105_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_105_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with a slightly downward curved bill, which also has a white point coming off of it, stark orange eyes, and gray covering its body.\nthe bird is grey and black with an orange bill and white eyes.\na black water-bird with stout orange beak and eyes, and has a white tooth-like structure at the origin of the beak.\nthe bird has an orange bill that is outlined in black.\nthis is a grey bird with an orange bill and black on the wingbars.\nthis bird has feathers that are black and has a long yellow bill\nthis is a black bird with a thick orange beak that has a white piece at the base of it.\na large black bird with a dull range curved beak.\na bird with a black wings, throat and crown and the bill is short and curved\nan all black bird with a orange short 
slightly round bill\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_106_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_106_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_106_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_106_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this dark bird has an orange beak, light orange tarsus and feet, dark gray wings, whitish breast and belly, and grey eyes.\nthis is a black bird with a white belly and a large orange beak.\nthis is a black and white bird with a orange bill and long wingbars\na bird with an orange thick bill and a black coat with white at its throat.\nthis bird has a orange beak, black throat, and a black and white belly.\nthis bird is black with grey and has a long, pointy beak.\nthe color of the bird is black with an orange beak and a grey belly.\nthe bird has a black belly that is long and two large wingbars.\nthis is a large all dark colored bird with a yellow beak\na medium-sized black bird with a white superciliary, white malar stripe, and small orange beak\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_107_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_107_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_107_2.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_107_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has black crown and nape, gray throat, breast, belly and abdomen, the tarsus and feet are light brown.\nthis bird has a black face with white long feathers sticking out of its head at random points of location, a grey throat, belly and tarsus, and black feathers covering the rest of its body.\na larger bird with a black and grey body, a curved orange beak, and yellow claws.\nthis is an all grey bird with a light grey breast and a bright orange bill.\na grey bird with an orange beak and distinctive feathering on the upper and lower face.\nthis bird has wings that are black and has a white belly\nthis bird has wings that are black and has a white bill\na large grey orange billed bird, with long beige cheek patch feathers.\nthis particular bird has a belly that is white with gray patches\na small bird with black back feathers, grey belly and throat, white malar stripe, and a small orange beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_108_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_108_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_108_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_108_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first 
image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has star shaped purple petals as its main feature.\nthis flower has petals that are white with purple stamen\nthis flower is purple and whte in color, with petals that are oval shaped.\nthe flower has several boat shaped purple petals, and pale yellow stamen\nthe flower is so big and has petals that are soft, smooth, thick and are separately arranged separately around the sepal below the disc of separate of stamen with curly tip\nthis flower is purple, and green in color, and has petals that are oval shaped.\nthis flower has white petals, purple stamen and yellow pistils\nthis purple flower has pointed petals, yellow stamen, and green sepals.\nthis flower has petals that are white and has purple stringy stamen\nthis is a purple flower with a green pistil and long curvy petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_109_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_109_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_109_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_109_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: black feathers on the top of the bird with gray feathers on the breast and underside of bird orange color on the face of bird and long gray claws\na large and tall bird that is gray and 
black in color, and an orange mouth.\nthis goofy looking, large bird has a bright orange break with a musky gray body and charcoal wing feathers.\nthis bird is grey in color with a vibrant orange beak, and white eye rings.\nthis bird has a hairy crown, a gray belly, breast, throat, tarsus & feet, and a bright orange area surrounding its bill.\na very distinctive gray bird with a black fringe on its crown, white eyes that contrast with its darker gray head coloring, and vibrant orange coloring surrounding its beak.\na gray and white body bird with a small head in comparison to the body.\nthis bird has lovely orange colored throat with white colored beak\nthis bird is great it has a feather mohock on it had it has a very bright orange face and a white colored beak\nthis bird has a black plum raising from its head, with grey crown and bright orange moon shape markings from eye to eye, with dirty grey white belly colors.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_110_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_110_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_110_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_110_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is black all over its body with orange in its beak.\nthis is a grey bird with a white eye and a large orange beak.\nthe crown of this exotic bird is made to attract eligible females.\nthis black bird has a orange bill with hair coming 
out of it, small pupils, and a white line across its face.\nthis bird resembles one in cartoons, sleek and shiny black with a white check patch to match it's large round white eyes and a very colorful beak that has a long feather patch resting upon it.\nthis bird has wings that are black and has an orange bill\nthis particular bird has a belly that is gray and black\nthis bird is distinct due to it's single hair part coming off the head then over the beak.\nthis bird has a long black crest and an orange beak.\na small black bird with white eyes, a white malar stripe, and a small orange beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_111_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_111_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_111_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_111_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has protuberant green and purple stamen and pollen tube surrounded by fringed thin purple petals, which are in turn surrounded by slightly pointed wide white petals.\nthe flower shown has white petals and purple and red anthers as its main feature.\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has a feathered purple receptacle and white leaves.\nthis flower has a circular shaped sepal as its main feature.\nthis flower has white petals with a layer of thin purple petals.\nthis flower has light green petals 
surrounding a hair-like purple fringe and prominent green stamen.\nthis flower has petals that are white and has purple stamen\nthe stamens of the flower have a distinctive color pattern, and in a circle formation.\nthe petals of this flower are white and purple and the pedicel is green\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_112_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_112_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_112_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_112_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the crown and most of the body is black with patches of white on the wings and under tail.\nthe bird has a black overall body color aside from several white patches all over it.\nthe bird has a white eyering and a black small bill.\nthis bird has beautiful black feathers with some white spots and a white undertail covets\nthis bird has a black overall body color except from several white patches.\nthis bird is black with white and has a very short beak.\nthis bird is black with white and has a very short beak.\nthis bird is a bigger bird, it has a yellow wand brown belly, the top is mainly black and brown.\nthis bird has a black crown and bill with black wings flecked in grey spots.\nthe bird has a full belly and a black and white back.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + 
"input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_113_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_113_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_113_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_113_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with stark white eyes, a rounded, snubbed orange bill, and very large feet.\nthis small bird has a curved, orange bill, a black head and back and wings and a white breast and belly.\nthe bird has white breast and abdomen with black crown and coverts as well as an orange colored bill.\nthis bird has a speckled belly and breast with a short orange bill.\nthis bird is black and white in color with a red beak, and white eye rings.\nthis strange bird has a while belly and black back with a white cheek patch and an orange bill.\na medium-sized bird with a yellow belly, blue-grey feet, a speckled grey and white breast, and black back and head.\nthis bird is white with black and has a very short beak.\na large and plump bird with an unique orange beak.\na medium sized black bird, with a white belly, and webbed feet.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_114_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_114_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_114_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_114_3.jpg" + ], + "output": "B" + }, + 
{ + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has prominent green and purple stamen and pollen tube surrounded by two layers of thin purple and wide white petals.\nthe petals of this flower are purple with a long stigma\nthis flower is white and blue in color, with petals that are oval shaped.\nthe flower has tapered white petals and tiny thin blue petals and a large three part stigma.\nthis complex flower has white petals, a purple pollen tube and purple and white filaments.\nthis flower is white, blue, purple, and yellow in color, and has petals that are oval shaped.\nthis flower has a wheel of white oblong petals underneath a layer of flat blue filaments with a purple pistil and green stamen.\nthis flower has petals that are white and has purple stamen\nthis flower is made up of long white petals under a row of lavender white and purple needle shaped petals.\nthis blue and white flower has pointed petals and green sepals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_115_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_115_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_115_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_115_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most 
relevant picture among the candidate images for this description.\nDescription: this flower has purple sepal and white pistil as its main features\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower has petals that are purple with stringy stamen\nthis flower has the row of blue petals and a row of string like structures with the prominent white stamens at the middle\nthis flower has large purple petals under a fringe of long thin purple hairs.\npurple sepal surround the large yellow pollen tubes on this flower.\na flower with long and narrow petals that round at the top.\nthis flower has petals that are purple and has stringy stamen\nthis flower has light blue petals with long and flat purple filaments with curvy ends.\nthis flower has light blue petals with long and flat purple filaments with curvy ends.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_116_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_116_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_116_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_116_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are green with purple and white stamen\nthis purple blue and white flower had larger petals underneath thin wispy petals.\nlong white petals with multicolored blue, white, and purple stamen\nthis flower has a lower layer of white petals with an upper layer of very long and thin purple 
petals that are white at the base.\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has petals that are green and has purple stamen\nthis flower has long rounded white petals with skinny white and blue petals on top of those.\nthis flower has a green ovule, green stamen , a purple pollen tube and pure white leaves.\nthis flower has white petals in a ring followed by a ring of blue filaments.\nthere are needle like blue and white petals, a purple stamen, oily leaves, and wide sepals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_117_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_117_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_117_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_117_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower shown has stringy purple petals which are its dominant feature\nthe inner petals are light purple in color and are needle shaped\nthe petals of this flower are purple with a large stigma\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has petals that are purple with purple stamen\nthis purple flower has a base of string-like leaves with a white stamen in a criss-cross pattern.\nthis flower has purple petals and a wide fringe of hair-like purple.\nthis flower has petals that are purple and has stringy stamen\nthis flower has purple petals as well as a yellow pistil.\nthis flower has 
very fine, curl ended vivid purple petals and yellow stamen.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_118_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_118_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_118_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_118_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has thick green stamen and purple fringe surrounded by wide white petals with pointed tips.\nthis flower has thin wiry petals that are dark purple from the center and white on the outer edges.\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower has petals that are green with white and purple stamen\nthis is a strange flower with white petals and purple near the ovary.\nthe flower is so big with petals that are soft, smooth and arranged in disc like manner below the disc of curly white disc layer of stamens\na flower with long and narrow pistils that are curly.\nthis flower has petals that are white and has a big green style\nthis flower has a large upright green pistils and yellow stamen with a layer of wavy white filaments and oblong petals.\na wavy flower with a green long stigma in the center.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_119_0.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_119_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_119_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_119_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird is black with a white detail on the wing and speck of orange on the wingbar.\nthis is a black bird with orange in the wingbar with black feet\na completely black bird except a white and orange wingbar.\nthis is a small bird with black fur and feathers.\na black bird with long legs and a white and orange stripe on its wing.\nthe body of the bird is black while the wingbars are white and orange.\nthis striking bird is entirely black with the exception of its orange and white wingbars.\nthis bird is black with red and has a long, pointy beak.\nblack bird with a thin, pointy bill and a distinct red and white stripe on the wing,\nthis bird has a pointed black bill with a black back.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_120_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_120_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_120_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_120_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + 
"question": "Please find the most relevant picture among the candidate images for this description.\nDescription: there are several shapes, sizes, and colors of petals on this complex flower.\nthe stamen are seen and the filaments pointing out.\nthis flower is purple and white in color, with petals that are oval shaped.\nthe petals are very slim and wavy in shape and are white and purple-striped in color.\nthis flower has petals that are green with stringy purple stamen\nthe flower has long, stringy purple petals with long green stamens and green sepals.\nthis flower has petals that are white and has a stringy stamen\nthis flower has purple petals as well as a green pistil.\nthis purple flower's petals are like thick purple threads surrounding a pale yellow center with pale green stamen.\nthis flower is white, purple and yellow in color, and has petals that are multi shaped.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_121_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_121_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_121_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_121_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a large bird is black and has a bright patch on it's coverts, a sharp black bill, and a black crown.\na black bird that has red and yellow spots on its wingbars.\na large contrast between the black body and orange/red spots on the wings with long legs and average sized bill and 
eyes.\nthis black bird has beautiful orange-yellow wingbars, with hints of orange peaking through the black primary feathers on its back.\nthis is a black bird and gray feet with a orange and yellow on the coverts\nthis bird is mostly black with a red covert and yellow wingbar.\nthis bird has wings that are black and has a orange and yellow patch\nthis bird has wings that are black and has a red and yellow patch\nthe bird is small with a pointed bill, black except for a bit of red and yellow on the covert area.\nthis bird has wings that are black and has a red and yellow patchj\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_122_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_122_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_122_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_122_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the blossom has a layer of rounded purple and white petals topped by a layer of fringed purple petals.\nthis flower has a long pink petal and a lot of blot stamen in the center\nthe flower has purple petals as well as a green stigma surrounded by purple.\nthis flower is pink, white, and purple in color, and has petals that are very skinny.\nthe flower has purple and white petals and green stigma.\nthis flower has light purple and white petals with blue filaments in a disk formation.\nthis flower has petals that are pink and white and has purple stamen\nthis flower has a purple 
fringe surrounded by white and green petals.\nthe anthers are very large, and the stigmas are very large.\nthis flower is white, pink, and purple in color, and has petals that are oval shaped.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_123_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_123_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_123_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_123_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is mostly black, shows significant head, a short, pointy black bill, and red coverts with white tipping.\nthis bird is black with red on its wing and has a very short beak.\na bird with an all black body with bright red coverts, and black tarsus.\nthis large bird is black with small accents of red and yellow on its wings.\nthis bird is mostly black and has a red and white covert.\nthis particular bird has a black body with a red patch on its coverts\na black bird with bright red on its wing, its head is small as well as its beak, the beak is pointy.\nthis is a black bird that has red and yellow coverts.\nthis bird is predominantly black but either the covert or the secondary colors on the wings has red and the tip is white or orange/yellow.\nthis bird's body, bill, and feet are entirely black, except for a splash of bright red on the wings and a yellow wingbar.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: 
The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_124_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_124_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_124_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_124_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this black bird has red and yellow on its wings and a long black beak.\nthe bird has a small black bill and black wingbar.\nthis bird is nearly all black with orange coverts, and yellow wingbars.\nblack bird with red and yellow on wing coverts\nthis bird is black with red on its wing and has a long, pointy beak.\nthis is a large black bird with a red-orange and yellow coverts.\nthis is a large mostly black bird with a red and yellow marking on its wing.\na medium size bird with a black covering and orange coverts.\nthe bird is jet black in color and has a side feather that is orange in color.\na black bird with red coverts on it's wings.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_125_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_125_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_125_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_125_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: 
The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this small bird has a white and brown speckled breast and belly with a bright orange beak.\na brown and white auklet with orange is has white colored spotted with brown on its ventral side and has brown wings.\nthis bird is brown and white with speckling on the lighter-colored belly, white eyes and a short beak.\nthis bird has a speckled belly and breast with a short pointy bill.\nthis bird has a white eye, an orange bill, and a breast that is spotted\nthis bird is brown with white and has a very short beak.\nthis particular bird has a belly that is gray and white\nthis brown bird has a white speckled belly and breast and a short orange and brown bill\na medium sized bird with a short bill, and white eyes\nthis bird has a grey crown and small orange beak with grey back.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_126_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_126_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_126_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_126_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with a curved and hooked, large bill, black eyes, white breast and brown primaries and secondaries, and a large head.\nthis large bird is white with black wings that have orange accents, along 
with an orange beak and touch of black on the tips of the tail feathers.\nthe bird is white, though the wings and tip of the tail are brown and grey.\nthis white bird features brown wings and a long, orange beak with black shading around its black eyes.\nthis bird is large with a white head and chest and brown wings and an orange beak.\nthis bird has wings that are grey and has a long orange bill\nthis bird is white with grey and has a long, pointy beak.\nthis bird has wings that are brown and has a white belly\nthis bird has a white crown as well as a orange bill\nthis bird has a white body and head, brown wings, and a long slightly curved orange beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_127_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_127_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_127_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_127_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has noodle like petals that are dull pink.\na flower with white petals and squiggly pistils accompanied by large anther filaments.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis is a strange flower with purple petals and yellow stigma.\nthis flower has petals that are white with purple stamen\nthis flower has long white petals with a wavy layer of filaments.\nthis flower has blue petals as well as a green pistil.\nthe petals are thin and string with purple 
and white stripes and the filament are green with yellow anther.\nthis flower has petals that are white and has purple string stamen\nthis flower has long white petals and a large white pistil.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_128_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_128_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_128_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_128_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are purple and very stringy\nthe petals on this flower are purple with an elaborate pistil.\nthis flower is purple and white in color, with petals that are oval shaped.\nthis is strange flower with purple petals and yellow stigma.\ninner petal;s are needle shped and are purple in color\nthe flower on this particular picture has petals as well as a sepal.\nthis flower has purple petals as well as a white stamen.\nthis flower has pink petals that have long, stringy and purple stamen\na large plant with purple pedals and a white tip and a white stigma.\na flower with long and narrow petals that are purple.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_129_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_129_1.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_129_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_129_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower is so big and has disc of petals below the disc of blue, white and violet stamens\nthis flower has petals that are white with purple filaments and green anthers.\npetals are white in color,inner petals are needle shpaed\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has petals that are green with purple and stringy stamen\nthe beautiful big flower has petals that are soft, smooth and separately arranged in single layer forming disc like shape below the layer of disc like arranged stamens\nthis flower is white and blue in color, and has petals that are oval shaped.\na multi petaled white flower with visible pistons, stamens, and numerous blue filaments in the center.\nthis flower has white oblong petals underneath a layer of bright blue filaments with upright stamen and pistils.\nthis flower has petals that are pink and has purple stringy stamen\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_130_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_130_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_130_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_130_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + 
"source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is blue and white in color, with petals that are oval shaped.\nthis is a flower with green sepal, purple and white spiky petals, purple style, and green stamen.\nthe flower has petals that are white with purple filaments.\nthis flower has an outer layer of white petals with an inner layer of blue and white petals surrounding stamen that are in alternating colors of purple and green.\nthis dramatic and complex flower displays a geometrical arrangement of purple and green stamens and pistils at the center, surrounded by many petals like filaments in stripes of purple, white and periwinkle, complemented by ten sepals whose inner surface is pale green.\nthis flower has blue petals as well as a green and purple pistil.\nthis flower has oval creamy whitish green petals with an inner layer of fringed petals of white and blue and very large stamen.\nthis flower has large white petals with a dark purple stigma.\nthis flower is white, and blue in color, and has petals that are oval shaped.\nthis flower has petals that are green and has purple stamen\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_131_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_131_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_131_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_131_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third 
image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: here we have a bird soaring above the water and it's color is dark brown, he seems to have a long dark colored bill with a white ring around it on the upper part near the bill.\nthis large bird is black all over, with a large flattened bill.\na large bird covered n green feathers except for the bits of white on the edges of its wings, and the circle around its bill.\nthis is a large grey bird with a large grey beak.\nthis bird is brown and has a black bill with a white bit surrounding it.\nthis bird has uniformly brown plumage, with a white ring around the base of its long, brown beak.\nthis bird has wings that are brown and has a thick bill\nthis dark brown bird has long, angular wings, short rectrices, and a blunt, medium length black bill.\nthis bird has a large beak and is brown with a white ring on its face.\na large brown and black bird with large thick bill has white stripe along bill.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_132_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_132_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_132_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_132_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a black bird with long tail feathers and a large curved beak.\nthis bird has a black crown, a short and thick bill and a 
black belly\nthe bird is black with a thick black hooked beak and a long black tail.\na long bodied bird that is entirely black with a large beak\nthis bird is black in color with a black beak, and black eye rings.\nthis bird is all black and has a very short beak.\nthis bird is black in color, with a black beak.\nthis bird has wings that are black and has a thick bill\nthis is a black bird with a long tail and a thick beak that curves downward.\nthis large black bird has a large, thick and blunted black beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_133_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_133_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_133_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_133_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower is so big and has stamens that are arranged in a disc like manner with curly tips above the petals that are soft, smooth and arranged separately forming a disc\nthe flower shown has yellow and green pistil with purple petals\nthe flower has petals that are purple with green stigma and wavy filaments.\nthis flower is purple in color, with petals that are oval shaped.\nthe flower has oval purple petals and skinny purple petals on top.\nthis flower has purple petals, with lavender stamen in the center.\nthe flower has elongated purple petals with purple squiggly needle shaped on top.\nthis flower has large purple petals under a thick 
purple fringe and green stamen.\nthis flower has purple petals as well as a purple sepal.\nthis flower has petals that are purple and a big green style\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_134_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_134_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_134_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_134_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower shown has green pollen tubes with stringy white petals\nwhite petals green white yellow and purple middle green and light green leaves\nthis flower has petals that are white and very stringy\nthe petals are tendril like and purple in color and stamen are visible.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has long white petals and a light green pistil.\nthis flower has a ring of oblong white petals topped with a layer of wavy filaments with tall pistils and stamen at the center.\nthis flower has a row of green petals under a row of very long curvy needle petals on top.\nthis flower has many stringy fibers sticking out from the pistil in the center.\nthis flower has petals that are white and has stringy stamen\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_135_0.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_135_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_135_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_135_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has long thin petals that are black at the base, white at the center, and blue at the tips.\nthis flower is white and blue in color, with petals that are oval shaped.\nthis blue and green flower has a distinctive stigma and what looks like fringe along the inner style.\nthis pretty flower has long thin petal that are white and blue.\nthis flower has one row of white petals and an inner row of tri-colored petals, the inner row is medium blue at the tips, white in the middle, and purple toward the center of the flower.\nthis flower is white, blue, and green in color, and has petals that are oval shaped.\nthis flower has petals that are green and has purple stringy stamen\nthis is a beautiful unique flowers with multiple colors of petals and stems that is easy on the eyes.\nthis flower is characterized by its light green petals, vibrant blue and white stamen, and ornate stigma.\nthis flower has large upright green stamen and purple pistils along with white petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_136_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_136_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_136_2.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_136_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has white petals with white anthers and a yellow and green stigma.\nthis flower has petals that are green with long stigma\nthis flower is white, purple, and yellow in color, with petals that are oval shaped.\nthis flower has the simple row of white petals at the bottom with the double colored flattened stamens in the circle order\nthis strange looking flower is green and has pedal with a point\na flower with a pinwheel shape base and a large wavy and multicolored stigma.\nthis flower has rounded pale green petals underneath a fringe of white and purple.\nthis flower has petals that are white and has green stamen\nthis flower has has a lower layer of white petals, a second layer of thin strands of wavy, white petals and thick yellow stigma.\nthis flower has white oblong petals and long white filaments with curly ends.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_137_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_137_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_137_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_137_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + 
"question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a dark grey and white bird with a wide speckled breast and webbed feet.\nthis bird has a gray speckled appearance with large gray webbed feet.\na bird with stark white eyes, webbed feet, and small orange tipped bill.\nblack back, wings, neck, nape, crown, and feet. with white spotted breast. white streaks on eyebrows and superciliary\nthis black and white bird is mostly black with flecks of white on the head, chest, throat and feet.\nmedium sized dark grey bird with white spots, webbed feet and white eyes.\nthis unique bird is mottled black and white, with striking white eyering, a large breast, and a tiny bill.\nthis web-footed bird has a short beak, white eyes, and grey and white mottled feathers.\nthe wide the feet are gray, the eye is white and black, the corvets are black and gray\na medium size bird with a black and white mixture color.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_138_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_138_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_138_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_138_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the wings are brown, long and narrow, and have white markings on the secondaries, the bill is blunt tipped and black, the head is light brown with a white thin marking between the eye and the beak.\ngrey 
bird with black flat beak with grey and white big wings\nthe dark brown bird has black eye ring and black rectrices.\nthis bird's most distinct feature is its long, flat beak as well as its large wingspan.\na bird with a large black bill with downward curve, white superciliaries and brown plumage.\nthis bird has a large black bill with a white ring around the base of the bill.\nthis bird has wings that are black and has a long black bill\nthe bird has a curved black bill and two large brown wingbars.\nthis bird has a wide wing span covered in brown, grey and white feathers with a broad, blunt beak.\nthis bird has a brown crown, brown primaries, and a brown throat.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_139_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_139_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_139_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_139_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has blue and white petals which circle the green and purple pollen tubes\nthis unique flower features a strange arrangement of white-blue petals and alien-like pistil.\nthe flower is big and disc shaped with petals and stamen forming two layers around the green pistil\na flower with little pistil and is surrounded by leaves\nthis colorful flower has star shaped petals and a bright blue and white pistil.\nthis flower is white, blue, and purple in color, and has petals that are oval 
shaped.\na flower with large shiny white oval sepal, blue and white bristle like petals, large green stamen, and a deep purple pistil.\nthis flower has petals that are white and has purple stamen\nthis flower has white petals with blue, white, and purple filaments.\na flower that has long narrow pistils that are white and blue.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_140_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_140_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_140_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_140_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: some sort of web-footed bird with an orange beak sits atop a rock.\na black bird with wide black feet, a white eye, and bright orange bill.\nthe bird has a white eyering and a black throat that is medium.\nthis bird is mostly grey with webbed feet, and blunt orange bill.\na multi-toned orange and white beak, webbed feet, and white eyeing make up the dramatic characteristics of this small bird.\nthis is a gray bird with webbed feet, white eye, an orange beak and a feather on its crown that stands straight up.\nthis is a medium sized black bird, with white yes, a short bill and webbed feet.\nthis bird is black with black webbed feet, a short tail, a pale eye, a black plume on its forehead, and a short thick beak with bright orange blotches.\na black bird with a short orange beak, a white eye, and a black feather on the top of its 
head.\na small bird with blue feet and black wings with a orange rounded beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_141_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_141_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_141_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_141_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is blue and green in color, with petals that are oval shaped.\nthe petals of the flower are bright blue with white lines, and the stamen is bright green with black dots.\nthe flower has stamen and the petals are green in color.\nthis flower has green petals and purple and green stamen.\nthe flower shown has green pollen tubes with green sepal and blue petals\na large flower with neon colors and a large green stigma.\nthis blue and white flower has pointed petals and green sepals.\nthis flower has a flat row of pointed white petals and a flat row of thin blue filament on top of that.\nthis flower has petals that are green and has purple stringy stamen\nthis flower has large green petals under a fringed set of purple and white quills.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_142_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_142_1.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_142_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_142_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a black crown, belly, wings, the color covers its entire body.\nthe bird has a black belly, wings and a black tail.\nthis bird is black in color with a broad tail longer in length compared to the rest of its body.\nthis bird is black in color with a black beak, and black eye rings.\na black bird with a large, broad tail and a broad bill.\na completly black bird, with a rounded bill, and long tail.\nthis bird is black in color, with a curved black beak.\nthis bird is entirely black with a wide retrices and large top bill.\nthis all black bird has a long tail and a medium sized black bill.\na medium sized black bird, with a tail that is large for its body.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_143_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_143_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_143_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_143_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this 
description.\nDescription: the flower has very thin pointy petals colored in blue white and purple\nthis flower is white and blue in color, with petals that are oval shaped.\nthis medium white flower has rows of thin blue petals and thick stamen.\nthis flower has petals and sepals shaped like a plate with an exposed pistil\nthis flower has prominent green and purple stamen and pollen tube surrounded by two layers of thin purple and wide white petals.\na clear pedal base flower with a white and purple stigma.\nthis flower is green, blue, and white in color, and has petals that are oval shaped.\nthe flower has a white petals with many stamen around the green pollen tube with a tan receptacle\nthis flower has light white petals with a small amount of green, shooting stamen that are white at the root and purple at the tips\nthis flower has petals that are green and has purple stringy stamen\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_144_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_144_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_144_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_144_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird is grey with a white body and small red beak.\nthis is a bird with a white belly, black back and an orange beak.\nthis is a bird with black and white feathers and a small straight beak.\na white bodied bird with black top feathers and small white eyes.\na small bird 
with a white spotted belly, black feet, black back and crown and bright yellow eyes.\nthis bird has wings that are black and white with a short orange beak\nthis bird has wings that are black and has an orange bill\nthis bird has a short orange bill, white breast and belly, and black crown and webbed feet.\nthis bird is black with a white throat, breast and belly that have some gray spots, the black feet are webbed and beak is pink with dark gray.\nthis bird has a grey crown with a small orange beak and grey feet.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_145_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_145_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_145_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_145_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a white breast and belly as well as a curved bill.\nthis is a white bird with a black wing and a large beak.\nthis is a tall white bird with a brown inner retrices and a long bill\na medium sized bird with a bill that curves downwards, and a white belly\nthe bird has a white body, with black primary and secondary wings, and black retrices.\nthis white and black bird has a long beak which curves downward.\nthis is a white bird with brown wings and a beak that curves downwards.\nthis bird has black winds and a white body with a long curved beak\na tall white bird with black wings, black eyes and a long curved bill with a black tip.\nthis 
bird is white with black and has a long, pointy beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_146_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_146_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_146_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_146_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is all black with a slight blue tinge, and has a puffed body.\nthis bird is black with blue iridescent throughout and yellow eyes.\nthis bird is black with long wings and a very short beak.\nmedium black and blue bird with short tarsus and medium black beak\nthis bird is black with flecks of deep blue throughout, round yellow eyes with black pupils, and a beak that comes to a straight point.\nthis bird is black with blue and has a very short beak.\nthis bird is mostly black with sort of iridescent green and blue to the wings and body.\nthis bird has wings that are black and has yellow eyes\nthis bird is black with brown and has a very short beak.\na small black bird, with white eyes, and a sharp bill.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_147_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_147_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_147_2.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_147_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is white and purple in color, with petals that are oval shaped.\nthe petals of the flower are mostly white while the inner layer is detailed with purple.\nthis flower has a white petal and a lot of purple anthers surrounding the petals\nthis flower has a large pistil with several stamen, purple and white frills, and white, skinny petals.\nthis medium white flower has rows of thin blue petals and thick stamen.\nthis flower has wide rounded pale petals surrounding a fringe of hair-like purple petals.\nthe purple and white petals are thin and the sepals are green and wrinkly.\nthis flower has petals that are white and has stringy stamen\nthis flower has a lower row of white petals, an upper row of long, pointed purple petals and a prominent stamen and pistil.\na small flower with thin purple and white petals surrounded by white broad leaf petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_148_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_148_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_148_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_148_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: 
The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a large bird has a stumpy bill, large tufts of black feathers on its breast, and a black crown.\na small bird with a black eye, black head, and dark bill.\na solid black bird with long tail feathers and a rounded beak that looks vey unusual.\nmedium black white and brown bird with medium black tarsus and medium black and white beak\nall black bird with a small bird and all black eyes.\nthis particular bird has all black feathers and a black bill and black eyes\nthe bird is completely black with a small head and rounded beak that blends into the head.\nthis bird has shiny black feathers and a curved, short beak.\nthis bird is all black and has a very short beak.\nthis bird has wings that are black and has a thick bill\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_149_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_149_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_149_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_149_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is pink in color, with petals that are oval shaped.\nthis light purple flower has long oval petals with light green stamen.\na purple and green flower with thick filament and anther.\nflower with star shaped purple petals and long blue stamen.\nthis flower has green pistil and purple petals as 
its main features\nthis flower has petals that are pink and has stringy stamen\nthis flower has purple petals, purple and white stamen and green anther filaments.\nthis particular flower has petals that are long and purple with a light green center\nthis flower can be characterized by its beautiful purple petals, blue stamen organzied in a circle around the flower, as well as its ornate stigma jetting out of the flower.\nthis flower has tall purple pistils, tall green stamen, purple petals and filaments.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_150_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_150_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_150_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_150_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has a gray belly and breast with darker gray head, and wings.\ndark gray bird with extremely long wings, large pointed beak, and no neck\nthis is a grey bird with a white back and a long pointed beak.\nthe bird has a white eyering and long secondaries that are dark grey.\nbird is really big with medium bill, it has dark grey and black feathers.\nthis bird has a long wingspan, and smooth feathers that are light gray tapering to dark gray on its head.\nthe bird has long black wingbars, a black eyering and curved black bill.\nthis dark grey bird has a wide wingspan with a white back and bright light eyes.\nthis bird has a black crown with a black bill 
and long wings with black secondaries.\nthis bird has very long wings that are mostly dark with some white in there. small eyes and small beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_151_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_151_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_151_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_151_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a green breast, black belly, and yellow eyering.\nthis bird is completely black and has a very short beak.\nthis bird has a large, curved, black bill, a blue throat, and a yellow eyering.\nthis bird is mostly black with a blue irdescent ring around it's neck.\nthis bird has a shiny black body and long black tail feathers, a pointy black bill and bright yellow eyes.\na small bird containing all black feathers except for the splash of teal feathers along its neck.\nthe bird has a yellow eyering, long outer rectrice, and black back.\nthis bird is mostly colored a very dark green (almost black) and has a blue throat, short bill, and black feet.\nthis bird is black and blue in color, with a black beak.\nthis bird has a blue and black breast coloration with a bright yellow eyering\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_152_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_152_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_152_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_152_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is white and purple in color, with petals that are oval shaped.\nthe petals on this flower are white with an elaborate pistil.\nthis flower has bright white petals with purple filaments and purple anthers.\nthis flower has wide rounded pale green petals and a hair-like fringe of purple.\nthis flower is white and purple in color, and has petals that are oval shaped.\nthis flower has petals that are white and has purple stringy stamen\nthis flower has white petals as well as a green pistil.\nthis flower has long light green petals under needle like lavender white and purple petals.\nthis flower has long light green petals under needle like lavender white and purple petals.\nthis flower has long light green petals under needle like lavender white and purple petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_153_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_153_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_153_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_153_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + 
"visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a larger black and grey bird with an orange beak.\nmedium black bird with medium black and orange beak and a small orange eye\nthis large bird has a large head compared to the body size, black wings, and a grey throat, belly and side.\nthis large bird is mostly black with a long blunt bill.\nmedium to large grey and black bird with medium black and orange beak\nthis bird has wings that are black and has an orange bill\na strange looking bird with a curved beak and small head in proportion to its body.\nthis bird is black with white and has a long, pointy beak.\nthis bird has wings that are black and has an orange bill\na black bird with a large orange beak and white eyebrows.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_154_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_154_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_154_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_154_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals are purple, the flower is completely open reveling the off white stamen.\nthis flower has petals that are purple with stringy purple stamen\nthe flower shown has purple petals 
with a yellow pistil\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower has a bottom layer of dark purple petals with a top layer of very long and thin purple petals that look like they are wiggling.\nthis flower has petals that are white and has stringy stamen\nthe petals of the flower has a hair like texture, and consist of various shades or purple and blue.\nthis flower has short petals and hairy anthers.\nthis flower has long purple petals and a layer of purple filaments with wavy ends.\nthis flower has long purple petals and a large pale yellow pistil.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_155_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_155_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_155_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_155_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals of this flower are green with a long stigma\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower has petals that are yellow with purple stamen\nthis flower has thick green stamen and purple fringe surrounded by wide white petals with pointed tips.\nan odd looking flower with string like purple petals over long white petals which surround large white stigma.\nthe variegated purple and white petals look like fringe.\nthis flower has petals that are white and has purple stamen\nthis flower is purple and white in 
color, and has petals that are oval shaped.,\nthis flower has white petals with stringy purple stamen overlapping it.\nthis flower has a bottom layer of white petals followed by a layer of wavy purple filaments.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_156_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_156_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_156_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_156_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: gray and charcoal bird with a large wing span and pointed beak\na medium bird with a gray body, wings, and dark gray face and bill.\nthe bird has a white eyering and a long grey bill.\nthis bird is grey with white on its chest and has a long, pointy beak.\nthe bird has a white belly and back with a black head and striped wings and tail.\nthis bird is grey with black and has a long, pointy beak.\nthis bird has a bright white eye ring, black head, and grey and white feathers covering the rest of its body.\nthis bird has no neck and a long beak and is gray.\nthis bird is gray it has a very white eye-it has a long beak and big wings\nthis bird has a black crown with a grey belly and long grey wings.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_157_0.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_157_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_157_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_157_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower has white and purple petals, with the yellow stamen clearly visible.\nthis flower is purple and white in color, with petals that are oval shaped.\nthe flower has stringy purple petals with green pollen tubes in the middle\nthis flower has petals that are purple with green stamen\nthis flower has protuberant green and purple stamen and pollen tube surrounded by fringed thin purple petals, which are in turn surrounded by slightly pointed wide white petals.\nthis flower has petals that are purple and has stringy purple stamen\nthis flower has white petals with blue filaments and light yellow pistils.\nthis flower has purple and green petals under a purple fringe and thick green stamen.\nthis flower is white and purple in color, and has petals that are multi shaped.\nthere is a bottom row of pale purple petals, and an inner row with numerous thin, dark purple spikes, and a prominent pale green pistol.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_158_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_158_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_158_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_158_3.jpg" + ], + "output": "C" + }, 
+ { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: small grey bird with white feathers, white stripe and orange beak.\nthis bird is grey in color with a orange short beak and white eye ring.\nthis bird is mainly all grey in color except for the white spots all over its body, and its white vent.\nthis small bird has an orange bill and a brown body with white spots.\nthis is a black bird with a white eye and an orange bill.\nthis bird has wings that are brown and has an orange bill\nthis bird has a short rounded orange bill, grey to dark grey crown, white cheek patch, and grey with white spots brease.\nthis bird who is swimming in the water has an orange beak, white streaks going down its head, and a dark gray body.\nthis bird has wings that are grey and has an orange vbill\nthis is a swimming bird and has an orange beck with white eyes and gray and white body.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_159_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_159_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_159_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_159_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower 
shown has purple petals with purple anther and filament and a green pollen tube\nthis unique white flower has long thin purple petals and a strange green center.\nthis strange flower has light-pink and purple petals with a ridiculous set of green stamen and pistil.\nthe petals of the flower are light purple and the stamen is pale green.\nlower fleshy pale pink petals with upper filament corolla and prominent stamen and stigma.\nthis flower has light and dark purple petals and the pedicel is green\nthis flower has flat elongated oval petals of a creamy lavender with central petals that are a spiky purple and large greenish stamen.\nthis flower has petals that are pink and has purple stamen\nthe pistal is very busy with lavendar petals at the bottom and many stamens and pollen tube\na flower with long and narrow petals that are purple.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_160_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_160_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_160_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_160_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: white petals on the outside purple white blue pedals inside purple yellow and white middle dark green leaves\nthe flower has large petals that are white and thin blue petals.\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has petals that are white with stringy purple stamen\nthis 
flower has white petals and green and purple stamen.\nthis flower is whit , blue and green in color, and has petals that are oval shaped.\nthis flower has petals that are white and has stringy purple stamen\na clear pedaled flower base with a white and blue with a long stigma.\na flower with long and narrow petals that are white.\nthis flower has rounded pale-green petals under a thick fringe of quill-like white and purple.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_161_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_161_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_161_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_161_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals of the flower are pink in color and have filaments that are pink in color.\na purple flower with stringy petals on top and large petals underneath.\nthe flower has waved petals that are lavender and very thin.\nthis flower is purple and green in color, with petals that are oval shaped.\nthe petals on this flower are purple with an elaborate pistil\nthis flower has elongated purple petals under a ring of squiggley lavender white and plumb needle like petals.\nthis flower is purple in color, and has petals that are oval shaped.\nthis flower has light lavender petals layered with purple fringe petals and a large pistil.\nthis flower has horizontal lavender petals with many squiggly stamen laying sideways.\nthis flower has 
petals that are purple and has stringy stamen\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_162_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_162_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_162_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_162_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a white eyering and a small red bill.\na small bird with black and white markings and bright white eyes.\nthis is a black bird with a white spotted belly, a white eye and a red beak.\nthis funny looking medium sized bird has a red beak and is grey and white\nthis bird has a large white and grey breast, with a red beak and round eyes.\nthis bird is black with white and has a very short beak.\nthe small bird has a large white eye, a short orange and gray bill, and dark colored secondaries.\nthis is a highly unusual bird with a white eye with a small black pupil, and a short red beak.\nthis erect bird has big, almost white eyes, a white belly spotted with black, and white spotting at the crown.\nthis colorful bird has red beak white eyes ring gray all over\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_163_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_163_1.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_163_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_163_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has light greens sepals and thin lavender petals.\nthis flower is purple and green in color, with petals that are oval shaped.\nthis flower has petals that are green with purple stamen\nthe flower has large oval white petals and thin purple petals.\nthe petals on this flower are green with an elaborate pistil.\nthis flower has wide green petals beneath a round layer of purple hairy fringe.\nthis flower has petals that are green and has stringy purple stamen\nthe stamens of the flower are in the shape of a circle, and have various shades of purple, white, and maroon throughout.\nthis flower is white, green, and purple in color, and has petals that are oval shaped.\nthis flower has large open green petals topped by needle shaped lavender white and maroon petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_164_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_164_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_164_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_164_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth 
image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a small bird which is black all over with a long tail and a very fat black bill.\nthe all black large bird has a large bill and small eyerings.\nthis is a black bird with a large crooked bill.\na small black bird with a large black bill and a spiked crown.\nthis bird is black in color with a black beak, and black eye rings.\nmedium to large black bird with large black beak and medium black eyes\nthis particular bird has a black body with a short black bill\na black body bird with a regularly sized head in comparison to the body.\na black bird with a large beak.\nthis bird is black it has a very large beak it looks to be a large bird\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_165_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_165_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_165_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_165_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: outer petals are green in color and klarger,inner petals are needle shaped\nthe flower has petals that are purple and white, with spread purple filaments, and green stamen.\nthis flower has long purple petals and long purple stamen in the middle\nthe pretty flower has large light purple petals with long thin stamen in it center.\nthis flower is pink and purple in color, with petals that are oval 
shaped.\nthis flower has flat petals that are light purple and white in color along with a layer of flat, blue filaments.\na flower with long and pointed pistils that are dark puruple.\nthis flower has pink and white petals with long stringy and purple stamen\nthis flower is pink and purple in color, and has petals that are oval shaped.\na light purple flower with thick stigma and dark purple stamen hiding under green leaves with purple veins.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_166_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_166_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_166_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_166_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: brown headed bird with a long, black, curved bill and darker tail feathers.\nthis is a medium sized grey bird with light coverts and dark primary and secondary wings.\na large grey bird with black cheek patches and a black bill.\nthis is a larger type bird with a gray body and a darker face.\nvery large bird with a long large black beak and white eyes.\nthis bird has wings that are grey and has a black bill\na brown bird with black eyebrow, the bill is long and curved and the eyebrow is black\nthis bird has brown waterproof plumage and a curved black bill.\nthis bird is grey with darker brown and black tint on the face and ends of the wings and tail feathers.\na medium size bird with grey coloring and black beak.\n", + 
"context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_167_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_167_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_167_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_167_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird is dark gray in color, has a stubby bill with some type of feather growths coming out of the top of it.\nthis bird has feathers pointing upward just before its beak, small white eyes, and brown feathers covering the rest of its body.\na grayish black bird with white eyes and orange stubby beak that has a tuft of gray feathers on it.\nthis magnificent specimen is mostly brown, with a white superciliary and large plumes on top of it's bill.\nan exotic looking brown bird with a white highlight on its face and orange beak.\nsmall bird with a black body, white eyes, orange beak, and black comb.\nthis bird has wings that are brown and has a long orange bill\nthis black colored bird has a bright orange beak and white eyes.\na medium size bird with a white eye and thick, orange beak.\nthis bird has wings that are brown and has an orange belly\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_168_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_168_1.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_168_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_168_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this unique flower has a strange set of pistil and stamen with white-pink petals.\nthis flower pale purple and has lost of petals around a green center.\nthe stamen of this purple flower appear to be a purple color as well.\nthis flower has green anther and filaments surmounting a cluster of fine purple petals and larger leaf shaped mauve petals.\nthe flower features light purple petals and dark, vibrant purple stamen surrounding many green style.\nthis flower has pale purple petals with many purple stamen and green stigma in the center.\nthis flower has petals that are pink and has stringy stamen\nthis flower has a layer of flat oblong white petals underneath a separate layer of flat blue filaments with upright stamen and pistils.\nthis flower has wide purple and green petals surrounding thick green stamen and a purple fringe.\na flower with long and narrow petals that are light purple.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_169_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_169_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_169_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_169_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + 
"source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this unique bird has webbed feed, is all-around black and, with a sharp malar stripe from its striking white eyes, leading from a bright orange bill and wispy, thin black feathers adorning the front of the face stemming from its crown.\na large bird with a white eye and orange bill, with a gray breast and gray belly.\nthis bird is black in color, with a vibrant orange beak and a white eye ring.\nthis distinct black bird has a bright orange bill and a feathered hat on its nose.\nthis is a dark gray bird with a white eye and a small orange bill.\nthis medium sized bird has a very long neck, a bright orange beak and a tall feather on it's crown.\nthis bird has a white eye ring and eye brow, a grey chest, belly and vent and black feathers on the rest of its body.\nthe bird has black feathers and a bright orange bill. 
it has white eyes and an odd black plume near it's bill.\nthis bird has a long neck covered in black feathers with white eyes and a white stripe behind his eyes and feathers standing vertical to it's short orange beak.\nthis bird has wings that are black and has an orange bill\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_170_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_170_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_170_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_170_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has a spiky black crown and white eyebrow. 
the rest of the bird is gray and it has webbed feet.\nthis bird is mostly gray, with a short orange bill and white superciliary.\nsmall black bird with large grey feet, grey breast and belly, small white cheek patch and malar stripe, black feathered crown, and short rounded orange beak.\na small bird with a grey belly and black back with a feather on its beak.\nthis sautty bird has a grey chest and breast area and white eyes.\nthis distinctive bird has white eyes, a small orange beak, a grayish black body, and black feathers on its bill.\nthis particular bird has a gray belly and breasts and a short orange bill\na black bird with a gray breast and white eyes.\nthis bird has a yellow bill, and a tuft of feathers that stand up right at the top of the bill.\nthis bird is black and gray in color, and has a bright orange beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_171_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_171_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_171_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_171_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is pink and purple in color, with petals that are oval shaped.\nthe petals on this flower are purple with green stamen.\nthis flower has petals that are pink with purple stamen\nthis flower is bright purple with purple anthers and filaments and yellow stigma.\nthis flower has large green stamen surrounded by a purple 
fringe and wide purple and white petals with rounded edges.\nthe petals are long and light purple and the stamens are purple with green anther.\nthis flower has petals that are pink and has stringy purple stamen\nthis flower has long tapered lavender petals that surround long, thin purple petals and surround yellow stamen with a dark purple stigma and pollen tube.\nthis purple and pink flower has many pointed petals with green and yellow anthers.\nthis flower is purple in color, and has petals that are oval shaped .,\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_172_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_172_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_172_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_172_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this dark grey bird has a orange bill with white eyes and a feather hanging over its bill.\nthis bird is all black, with black webbed feet, a black plume, and orange beak.\nthis bird is black with an orange, short, stubby beak.\nthis bird is mostly gray with a short orange bill and webbed feet.\nthis tall black bird has an orange bill and a long feather protruding from its face.\nthis bird has wings that are black with an orange beak\nthis bird is all black and has a very short beak.\nthis black bird has white eyes and black plumage on top of a bright orange shortened beak.\nthis bird has wings that are black and has an orange bill\nthis bird 
has wings that are black and has an orange bill\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_173_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_173_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_173_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_173_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the medium sized bird has a dark grey color, a black downward curved beak, and long wings.\nthe bird is dark grey brown with a thick curved bill and a flat shaped tail.\nbird has brown body feathers, white breast feathers and black beak\nthis bird has a dark brown overall body color, with a small white patch around the base of the bill.\nthe bird has very long and large brown wings, as well as a black body and a long black beak.\nit is a type of albatross with black wings, tail, back and beak, and has a white ring at the base of its beak.\nthis bird has brown plumage and a white ring at the base of its long, curved brown beak.\nthe entire body is dark brown, as is the bill, with a white band encircling where the bill meets the head.\nthis bird is gray in color, with a large curved beak.\na large gray bird with a long wingspan and a long black beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_174_0.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_174_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_174_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_174_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is white and purple in color, with petals that are oval shaped.\nthe flower has big and long skinny petals that are light purple.\nthe petals on this flower are purple with an elaborate pistil.\nthis flower has petals that are purple with stringy stamen\nthis flower is bright purple with purple petals and anthers and a yellow stigma.\nthis flower is purple and white in color, and has petals that are oval shaped.\nthis flower has oblong shaped purple petals covered by long and wavy purple filaments.\nthis flower has a bottom row of lavender rounded petals and a top row of hair-like curly lavender petals with white and dark purple stripes.\nthis flower has petals that are purple and has stringy stamen\nthis flower has dozens of stringy light purple petals that have alternating white and dark purple rings towards the ovule.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_175_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_175_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_175_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_175_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": 
"['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is brown with a lighter brown crest.\naquatic large bird with long hooked bill, white face, and brown body.\nbird has brown body feathers, brown breast feathers, and brown beak\nthis bird has a white superciliary and brown all around its body with a long bill\nthis is a brown bird with a white face and a long downward pointing beak.\nthis bird is brown with white and has a long, pointy beak.\nbrown duck playing on the lake making a poodle\nthis bird has wings that are brown and has a long bill\nthis bird has long brown bill, with a brown body.\nthis is a medium sized brown bird, with a long pointed bill.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_176_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_176_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_176_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_176_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the anthers are on think filaments that are curved, with tiny purple petals.\nthis flower is white and blue in color, with petals that are oval shaped.\nthis blossom has very large sepals, the pedals are long and very narrow, the ovary and pistil are very large with elaborate 
shapes.\na odd shaped flower with a center series of flower petals in the shape of a star with long purple, yellow and green stamen.\nthis flower has petals that are green with stringy stamen\nthis flower is white, yellow, and blue in color, and has petals that are oval shaped.\nthis flower has green petals as well as a purple and green pistil.\nthis flower is multicolored, with light green wedge shaped petals and blue-tipped stigma.\nthis flower has petals that are green and has purple stringy stamen\nthis blue, white, and purple flower has pointed pedals and green sepals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_177_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_177_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_177_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_177_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a bird, which is rather large and is white and black in color flying through the sky.\nthe bird has a white underbelly, black feathers in the wings, a large wingspan, and a white beak.\nthis bird is a large bird with dark gray wings with hints of light gray its body and head all all white, the beak is also white but with a black tip.\nthis bird has a white breast and crown, yellow bill and black tipped primaries.\nthis large black bird has a white throat, breast, and abdomen.\nthis bird has a long wingspan, a white belly, and a white crown\nthis bird has a white crown, throat, 
belly, and abdomen with black inner rectrices.\nthis particular bird has a white belly and breasts with a large black wingspan\nthis particular bird has a belly that is white and has black patches on it\nthis bird is black with white and has a long, pointy beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_178_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_178_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_178_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_178_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the pistil area of this flower appears similar to a venus fly trap in design.\na purple and white flower with skinny long petals and a very large pistil.\nthere are many purple stamens and one large pistil with two colored overlapiing petals that are white and purple in color\nthis flower is purple and white in color, with petals that are oval shaped.\nthe flower shown has purple and white petals with a green pollen tube\nthis flower has petals that are pink and white and has purple stamen\nthis flower has a tall green pistil, dark blue filaments, and purple and white petals.\nthis flower is white, pink, and purple in color, and has petals that are oval shaped.\nthis flower has purple petals as well as a green pistil.\nthis flower has a row of alternating purple and white petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third 
image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_179_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_179_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_179_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_179_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a unique looking flower that has multiple colors and long petals\nthe flower has white thin petals around blue stamen with green pistil in its centre\na very unique flower with white and lavender petals and purple anther filaments.\nthis flower is white and purple in color, and has petals that are round and long.\nthe stigma is purple and the stamen are purple whereas the petals are white.\nthe flower on this particular picture has petals as well as a pistil.\nthis flower is green white and purple in color, and has petals that are oval shaped.\nthis flower has petals that are white and has stringy purple stamen\nthis flower has white and purple petals and a green pedicel\nthis flower has white petals arranged in a disk type of shape and light blue filaments on top of the petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_180_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_180_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_180_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_180_3.jpg" + ], + "output": "B" + }, + 
{ + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals in the shape of a circle and are purple and stringy\nthis flower has violet petals with light green stamen and bell-shaped anthers.\nthe flower is so big and has petals that are soft, smooth, thick and separately arranged around the stamen forming a bowl like shape\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower is very unique as it has different kinds of petals and three branches of stamen and pistil.\nthis flower has petals that are purple and has stringy stamen\na flower with long and narrow petals that are white.\nthis flower has a ghostly lavender petals surrounding curly green stamen.\nthe flower on this particular picture has petals as well as a stamen.\nthis flower has elongate green and purple petals below a ring of purple and white needle like petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_181_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_181_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_181_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_181_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this 
description.\nDescription: this bird has an orange and brown bill and a grey breast.\na larger bird with a large mostly yellow beak and two toned brown feathers.\nthis is a black bird with a grey breast and an orange beak.\nthis is a water bird with an orange bill, brown feathers and yellow eyes.\nthis is a brown, swimming bird with an orange bill and tan on the breast.\nthis large bird is solid dark gray color with a bit of white mixed in on its belly.\nthis bird has wings that are black and has a big orange bill\nthis particular bird has a belly that is brown and black\nthis bird has a black crown as well as a yellow bill\nthis large bird has a predator's beak, with a big head, black body, and brown chest,\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_182_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_182_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_182_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_182_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has a dark grey color, with a large bill and long wingspan.\nthis grey bird has an impressive wingspan, a black head, and comparatively medium-sized black bill.\nthis is a grey bird with black wings and a black head and beak.\nthis bird has a gray body with a black head and very long darker gray wings.\nthis bird has large black wings and head, black bill and white abdomen.\nthis is a bird with a light grey body, darker grey wings, a black bill 
and lighter grey wingbars.\nthe large bird has a dark colored bill, long dark wings, and a gray back.\nthis bird is very small with a black crown and point black beak, but has a very expansive wingspan.\nbird has black crown and beak, grey back, long black wings and tail.\nthis bird has a wide wing span, with a long black bill.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_183_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_183_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_183_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_183_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has a long beak, white face, brown wings and white legs.\na large bird with a large wingspan, covered in brown feathers from its back, to its wings, with a white head, and rump.\na bird with a large downward curved bill, white throat and head, brown breast and white abdomen.\na brown winged bird with a white rump and head, a brown tail and a long light yellow beak with a slightly curved tip.\nthis seabird has a white head and brown wings, with a nice shapely yellow beak.\nthe head of the bird is white in color and the body is grey.\nthis bird has a long pointy bill and a white head.\nthis bird has wings that are brown and has a white belly\nthis bird has a white crown, brown primaries, and a white throat.\na cigar-shaped white bird with long brown wings and hooked long bill.\n", + "context": "Select from the following 
choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_184_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_184_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_184_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_184_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is mostly gray with a bright orange bill.\na bird with a gray body and wings, a white eye with cheek patch, and orange bill.\na large bird with an all dark grey body, grey and black wing feathers, and a curved bright orange bill.\nthis bird is mostly grey with a short bright orange bill.\nthis is a gray bird with a white eyering and a large gray wingbar\nthis bird is black with white and has a very short beak.\na black bird with white eyes and a orange bill.\nthis is a medium sized black bird, with a short yellow bill.\nthis bird has an ashy black coloring with a streak of white behind the eyes and a small pink beak.\na medium sized bird with black feet and a black breast and belly with a orange and black beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_185_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_185_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_185_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_185_3.jpg" + ], + "output": "C" + }, + { 
+ "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has green pistil and purple and white petals as its main features\nthis flower is blue and white in color, with petals that are oval shaped.\nthe petals of this flower are purple with a long stigma\nthis flower has petals that are are white with purple filaments and purple anthers.\ninner petals are purple in color and are needle shaped,outer petals are white in color\nthis blue purple and white flower has pointed petals and white sepals.\nthis flower has very thin stamen that are colored dark purple, lavender, and white.\nthis flower has petals that are white and has purple stamen\nthis flower has pale green rounded petals with a fringe of purple and white quills.\nthis flower has a single row of white oblong petals followed by a row of flat blue filaments.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_186_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_186_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_186_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_186_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flying bird has long brown wings and a black bill 
with a white stripe around it.\nbird has brown body feathers, brown breast feathers, and brown thick beak\na large brown bird with white secondaries, a black bill and yellow eyes.\nthis bird is black with brown on its stomach and has a long, pointy beak.\nthis is a brown bird with a white eye and a long and pointy bill.\nthis bird is mostly dark grey and has a white ring around its bill.\nthis is a brown bird with an incredible wingspan and an extended bill for its size.\na large long bill, wide wing span dark brown bird with light beige tipped primaries.\nthe large bird has a black bill, crown and nape and long wings that have brown secondaries and black converts together with a brown belly and tail.\nthis bird is brown with black and has a long, pointy beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_187_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_187_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_187_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_187_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: two layers of pale violet petals are present, including a lower ring of pointed petals and an upper ring of wavy, filament-like petals, surrounding a pale green stamen.\nthe flowers has needles petals around the the stigmas and stamens.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has long white stamen and petals that are white in color\nthe outer 
petals are green in color,inner peals are purple and needle shaped\nthis purple and white flower has pointed petals and green yellow stamen.\nthe flower has bright green and purple petals with green and yellow stamens.\nthis flower has hair like petals with a yellow stamen in the center of it.\nthis flower has petals that are white and has purple stringy stamen\nthis flower is purple, white, and yellow in color, and has petals that are oval shaped.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_188_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_188_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_188_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_188_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird's coloration is varying shades of gray and has dark primaries, a dark crown and a long, slender bill.\na larger bird that is gray and white with a large wingspan.\nthe bird has a white and grey body with a grey beak and grey wings.\na large gray bird with a dark gray beak and gray wings.\nthis large bird has a gray body, white eyering, and long hooked bill.\nthis bird has a white and gray back, a white eyering and a black bill; the rest of its body is varying shades of gray.\nthis particular bird has a white back and gray secondaries\nthe bird has a white eyering and two large black wingbars.\nthis bird has a very large wing span, and a long black bill.\nthis bird is gray and black in color, and has a 
long black beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_189_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_189_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_189_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_189_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has an elaborate golden stamen and two different types of purple petals.\nthis flower is purple and pink in color, with petals that are oval shaped.\nthis flower has purple petals with blue and purple filaments and a yellow and green stigma.\nthe petals of this flower are purple with a long stigma\nthis flower has petals that are pink with purple stamen\na flower with long and narrow petals that are light purple.\nthis multi-purple flower has large petals ranging in 2 shades of purple with a stigma that is the color of light green and yellow.\nthe flower has white and pink petals and the pedicel is green\nthis flower has petals that are pink and has purple stamen\nthis flower has wide purple and green petals surrounding green stamen and a purple hair-like fringe.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_190_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_190_1.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_190_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_190_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a black crown, and has long white feathers on its cheek patches.\na close up of a bird with black crown, white eyebrow, white malar stripe, a blueish throat and brast, and a yellow beak with a blakened top to the bill.\nthis is a grey bird with a black head and a pointy orange beak.\nlarge bird with a short orange beak with a curve on it; has white whiskers.\nthis bird is black with white on its feathers and has a long, pointy beak.\nthis particular bird has a gray neck and black head with white cheek patches\nthis is a large bird with a light blue body dark blue wings and a darker head with white on it and a yellow beak.\nthis bird has wings that are black and has a yellow bill\nthis gray bird has a dark gray head, white cheek patches, and an orange bill.\nthis bird is grey with black and has a long, pointy beak.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_191_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_191_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_191_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_191_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first 
image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this weird flower has a dense arrangement of blue-white petals and an ornate pistil-stamen arrangement.\na unique blue and white flower with green anther filaments.\nthis unusual flower has long thin blue and white petals and a green center.\nthe flower is big with petals and stamen formed like layers of discs with pistil sticking out in the centre\nthis flower is white and purple in color, and has petals that are light green.\nthis flower is white and purple in color, with oval shaped petals.\na unique flower with large white and green petals, long purple stamen sprouting from an ovary covered pistil.\nthis flower has petals that are green and has stringy purple stamen\na flower with long and narrow pistils that are blue with white centers.\na very beautiful yellow flower with exquisite petals and striking beauty of a plant at its finest moment\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_192_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_192_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_192_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_192_3.jpg" + ], + "output": "D" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is white and purple in color, with petals that are oval shaped.\ninner petals are needle shaped,and are purple in 
color\nthis flower has petals that are white with purple stamen\nthe petals on this flower are white with an elaborate pistil.\nthis flower has large white petals and a light green pistil.\nthis flower is purple, white, and green in color, and has petals that are oval shaped.\nthis flower has white petals and has long and stringy stamen\nthe flower has thin long white petals with green stamen.\nthis flower has a row of elongated greenish petals under a row of needle like lavender white and purple petals.\nthis flower has a row of elongated greenish petals under a row of needle like lavender white and purple petals.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_193_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_193_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_193_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_193_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this large bird has a buff colored belly, a long heavy beak on a white-fronted face , and long, dark brown wings.\nthis bird has a large, curved, gray bill, a white cheek patch, and a gray breast and belly.\na medium sized bird with a grey body and a bill that curves down wards\nthis gray bird has black wings and a white head, and a long beak.\nthis bird is black and brown in color with a curved black beak, and black eye rings.\nthe black wings have brown wingbars, the bill is short and pointed, and the head is small compared to the body.\nthis 
bird has brown and white wings, grey breast, belly and vent, and a white ring around its bill.\nthis bird has wings that are grey and has a long black bill\na large bird with large wings and bill.\nthis bird has a slight hook shaped beak and a wide wing span, it's body is a light brown color.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_194_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_194_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_194_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_194_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a gray bird with a whitish belly and a short orange bill.\nan average sized bird, its black and white body blends in with the rock it is sitting on\na large footed bird with a short, blunt, orange bill, distinct long white feathers coming from its eye, is white from breast down around to its undertail coverts, and grey from its head, on its wings and to its tail.\nbird has gray body feathers,white breast feather, and orange beak\nthis bird has a short orange bill, a white belly & breast, white tarsus & feet, and a gray crown.\nthis bird is grey with white on its chest and has a very short beak.\nthis stout bird has a white belly, a bright orange bill, a white eyering, with a gray on the side and the wings.\na small bird with a white belly, a small orange beak, and a white feather sticking out from its eye.\nthis bird has wings that are black and has a white 
belly\nthis bird has wings that are black and has a white belly\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_195_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_195_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_195_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_195_3.jpg" + ], + "output": "C" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are white with purple stringy stamen\nthis flower is blue and white in color, with petals that are oval shaped.\nthe flower is so big and has disc of petals below the disc of blue and white stamens\nseapls are green in color,petals are white in color and inner petals are purple\nthis flower has petals that are white with purple filaments and green anthers.\nthis flower has pure white petals with lavender stigma and a green stamen.\nthis flower has white petals with purple and white anthers.\nthis flower has thin white petals and stringy purple stamen\nthis flower has white petals with bright blue filaments, purple pistils, and green stamens.\nthis flower has rounded pale green petals and a layer of thick purple and white quills.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_196_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_196_1.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_196_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_196_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a white eyering and a white breast and belly.\na large bird with a long black bill, white back, breast and belly, and white and grey wings.\nthis big bird has a white belly and back, black wings and head, and a blue bill.\nthis bird has a black crown, a flat bill, and a white breast\na medium bird with a white belly and back, gray rump and a large gray head and bill.\nthis bird has wings that are black and has a white belly and chest\nthis bird is white and black in color, with a large black beak.\nthis bird has a large black and white body, a large black head, a long black beak that curves downward.\nlarge bird with a white chest and brownish grey wings and head. 
its beak is rounded and somewhat long.\nthis bird has wings that are black and has a white belly\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_197_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_197_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_197_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_197_3.jpg" + ], + "output": "B" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "Oxford_102_flower_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the pistil is white and is very noticeable, and the petals are purple.\nthis flower is purple and white in color, and has petals that are very skinny like strings.\nthe flower shown has green sepal and lots of purple and white anther\nthe rounded and notched bright green leaves of this plant surround a vibrant purple bloom that features curling lavender petals, rounded lavender sepals, and a tall, white pistil.\nthis purple flower has two different types of petals, one type is stink like and the others are oval shaped.\nthis flower has large purple petals and a white pistil.\nthis flower has many purple petals as well as some strange curly hair-like whitish-purple things.\npurple string like petals above another wider purple set. 
bright yellow pistil and stamen.\nthis flower has petals that are purple and has stringy stamen\nthis flower has stringy purple petals and a green pedicel.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_198_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_198_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_198_2.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_198_3.jpg" + ], + "output": "A" + }, + { + "task": "text2image_retrieval", + "visual_input_component": "['natural_image']", + "source": "CUB220_2011_retrieval", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this longtailed black bird has a black and white spotted breast.\nthis bird is shiny blue in color with a small black beak and black eye rings.\nthis is a dark blue bird with white eyes and a small beak.\nthis bird has solid black wings and a solid black head.\nthis bird has a rounded breast, a small bill, and a short neck\nthis is a jet black bird with mottled black and white belly and long black tail feathers.\nblack bird with long tail sitting on a rail.\nthis bird has a black pointed beak, with yellow eyes.\nthe bird has a small bill and a black back and belly.\nthis bird has a black pointed bill, with a black breast.\n", + "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_199_0.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_199_1.jpg", + "../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_199_2.jpg", + 
"../MMIU-Benchmark/text2image_retrieval/text2image_retrieval_199_3.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_0_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_0_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_0_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_0_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_0_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_1_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_1_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_1_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_1_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_1_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person 
to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_2_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_2_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_2_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_2_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_2_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_3_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_3_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_3_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_3_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_3_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_4_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_4_1.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_4_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_4_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_4_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_5_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_5_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_5_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_5_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_5_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_6_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_6_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_6_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_6_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_6_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_7_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_7_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_7_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_7_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_7_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_8_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_8_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_8_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_8_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_8_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_9_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_9_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_9_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_9_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_9_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_10_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_10_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_10_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_10_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_10_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_11_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_11_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_11_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_11_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_11_4.jpg" + ], + "output": "B" + }, + { + "task": 
"face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_12_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_12_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_12_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_12_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_12_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_13_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_13_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_13_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_13_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_13_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_14_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_14_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_14_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_14_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_14_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_15_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_15_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_15_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_15_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_15_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_16_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_16_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_16_2.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_16_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_16_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_17_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_17_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_17_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_17_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_17_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_18_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_18_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_18_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_18_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_18_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth 
image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_19_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_19_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_19_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_19_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_19_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_20_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_20_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_20_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_20_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_20_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_21_0.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_21_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_21_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_21_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_21_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_22_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_22_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_22_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_22_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_22_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_23_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_23_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_23_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_23_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_23_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": 
"['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_24_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_24_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_24_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_24_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_24_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_25_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_25_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_25_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_25_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_25_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_26_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_26_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_26_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_26_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_26_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_27_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_27_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_27_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_27_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_27_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_28_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_28_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_28_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_28_3.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_28_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_29_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_29_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_29_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_29_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_29_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_30_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_30_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_30_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_30_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_30_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to 
the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_31_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_31_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_31_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_31_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_31_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_32_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_32_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_32_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_32_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_32_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_33_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_33_1.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_33_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_33_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_33_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_34_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_34_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_34_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_34_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_34_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_35_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_35_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_35_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_35_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_35_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_36_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_36_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_36_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_36_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_36_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_37_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_37_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_37_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_37_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_37_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_38_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_38_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_38_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_38_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_38_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_39_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_39_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_39_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_39_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_39_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_40_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_40_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_40_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_40_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_40_4.jpg" + ], + "output": "C" + }, + { + "task": 
"face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_41_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_41_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_41_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_41_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_41_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_42_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_42_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_42_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_42_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_42_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_43_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_43_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_43_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_43_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_43_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_44_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_44_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_44_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_44_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_44_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_45_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_45_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_45_2.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_45_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_45_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_46_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_46_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_46_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_46_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_46_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_47_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_47_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_47_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_47_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_47_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth 
image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_48_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_48_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_48_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_48_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_48_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_49_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_49_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_49_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_49_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_49_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_50_0.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_50_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_50_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_50_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_50_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_51_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_51_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_51_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_51_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_51_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_52_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_52_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_52_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_52_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_52_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", 
+ "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_53_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_53_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_53_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_53_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_53_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_54_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_54_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_54_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_54_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_54_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The 
fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_55_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_55_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_55_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_55_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_55_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_56_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_56_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_56_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_56_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_56_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_57_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_57_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_57_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_57_3.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_57_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_58_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_58_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_58_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_58_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_58_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_59_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_59_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_59_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_59_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_59_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to 
the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_60_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_60_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_60_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_60_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_60_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_61_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_61_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_61_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_61_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_61_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_62_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_62_1.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_62_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_62_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_62_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_63_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_63_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_63_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_63_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_63_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_64_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_64_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_64_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_64_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_64_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_65_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_65_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_65_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_65_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_65_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_66_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_66_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_66_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_66_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_66_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_67_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_67_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_67_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_67_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_67_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_68_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_68_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_68_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_68_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_68_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_69_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_69_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_69_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_69_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_69_4.jpg" + ], + "output": "B" + }, + { + "task": 
"face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_70_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_70_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_70_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_70_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_70_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_71_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_71_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_71_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_71_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_71_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the 
following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_72_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_72_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_72_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_72_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_72_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_73_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_73_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_73_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_73_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_73_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_74_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_74_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_74_2.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_74_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_74_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_75_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_75_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_75_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_75_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_75_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_76_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_76_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_76_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_76_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_76_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth 
image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_77_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_77_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_77_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_77_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_77_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_78_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_78_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_78_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_78_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_78_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_79_0.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_79_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_79_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_79_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_79_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_80_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_80_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_80_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_80_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_80_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_81_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_81_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_81_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_81_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_81_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": 
"['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_82_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_82_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_82_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_82_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_82_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_83_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_83_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_83_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_83_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_83_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_84_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_84_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_84_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_84_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_84_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_85_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_85_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_85_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_85_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_85_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_86_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_86_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_86_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_86_3.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_86_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_87_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_87_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_87_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_87_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_87_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_88_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_88_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_88_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_88_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_88_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to 
the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_89_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_89_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_89_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_89_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_89_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_90_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_90_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_90_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_90_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_90_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_91_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_91_1.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_91_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_91_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_91_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_92_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_92_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_92_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_92_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_92_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_93_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_93_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_93_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_93_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_93_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_94_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_94_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_94_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_94_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_94_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_95_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_95_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_95_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_95_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_95_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_96_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_96_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_96_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_96_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_96_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_97_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_97_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_97_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_97_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_97_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_98_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_98_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_98_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_98_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_98_4.jpg" + ], + "output": "C" + }, + { + "task": 
"face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_99_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_99_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_99_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_99_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_99_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_100_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_100_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_100_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_100_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_100_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the 
following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_101_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_101_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_101_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_101_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_101_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_102_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_102_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_102_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_102_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_102_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_103_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_103_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_103_2.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_103_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_103_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_104_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_104_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_104_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_104_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_104_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_105_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_105_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_105_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_105_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_105_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The 
fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_106_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_106_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_106_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_106_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_106_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_107_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_107_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_107_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_107_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_107_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_108_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_108_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_108_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_108_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_108_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_109_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_109_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_109_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_109_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_109_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_110_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_110_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_110_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_110_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_110_4.jpg" + ], + "output": "C" + }, + { 
+ "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_111_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_111_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_111_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_111_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_111_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_112_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_112_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_112_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_112_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_112_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_113_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_113_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_113_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_113_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_113_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_114_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_114_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_114_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_114_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_114_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_115_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_115_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_115_2.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_115_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_115_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_116_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_116_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_116_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_116_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_116_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_117_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_117_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_117_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_117_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_117_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The 
fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_118_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_118_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_118_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_118_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_118_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_119_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_119_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_119_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_119_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_119_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_120_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_120_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_120_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_120_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_120_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_121_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_121_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_121_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_121_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_121_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_122_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_122_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_122_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_122_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_122_4.jpg" + ], + "output": "C" + }, + 
{ + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_123_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_123_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_123_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_123_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_123_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_124_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_124_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_124_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_124_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_124_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_125_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_125_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_125_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_125_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_125_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_126_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_126_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_126_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_126_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_126_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_127_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_127_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_127_2.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_127_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_127_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_128_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_128_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_128_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_128_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_128_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_129_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_129_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_129_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_129_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_129_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth 
image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_130_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_130_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_130_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_130_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_130_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_131_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_131_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_131_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_131_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_131_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_132_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_132_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_132_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_132_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_132_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_133_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_133_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_133_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_133_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_133_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_134_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_134_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_134_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_134_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_134_4.jpg" + ], + "output": "B" + }, + 
{ + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_135_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_135_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_135_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_135_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_135_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_136_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_136_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_136_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_136_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_136_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_137_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_137_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_137_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_137_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_137_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_138_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_138_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_138_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_138_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_138_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_139_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_139_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_139_2.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_139_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_139_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_140_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_140_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_140_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_140_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_140_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_141_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_141_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_141_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_141_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_141_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: 
The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_142_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_142_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_142_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_142_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_142_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_143_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_143_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_143_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_143_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_143_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_144_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_144_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_144_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_144_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_144_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_145_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_145_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_145_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_145_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_145_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_146_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_146_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_146_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_146_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_146_4.jpg" + ], + "output": "C" + }, + { 
+ "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_147_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_147_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_147_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_147_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_147_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_148_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_148_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_148_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_148_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_148_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_149_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_149_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_149_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_149_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_149_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_150_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_150_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_150_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_150_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_150_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_151_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_151_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_151_2.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_151_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_151_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_152_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_152_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_152_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_152_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_152_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_153_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_153_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_153_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_153_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_153_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: 
The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_154_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_154_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_154_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_154_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_154_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_155_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_155_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_155_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_155_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_155_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_156_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_156_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_156_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_156_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_156_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_157_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_157_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_157_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_157_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_157_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_158_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_158_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_158_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_158_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_158_4.jpg" + ], + "output": "A" + }, + { 
+ "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_159_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_159_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_159_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_159_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_159_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_160_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_160_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_160_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_160_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_160_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_161_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_161_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_161_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_161_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_161_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_162_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_162_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_162_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_162_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_162_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_163_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_163_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_163_2.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_163_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_163_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_164_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_164_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_164_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_164_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_164_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_165_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_165_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_165_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_165_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_165_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The 
fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_166_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_166_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_166_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_166_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_166_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_167_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_167_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_167_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_167_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_167_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_168_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_168_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_168_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_168_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_168_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_169_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_169_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_169_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_169_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_169_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_170_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_170_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_170_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_170_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_170_4.jpg" + ], + "output": "D" + }, + 
{ + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_171_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_171_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_171_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_171_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_171_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_172_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_172_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_172_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_172_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_172_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_173_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_173_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_173_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_173_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_173_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_174_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_174_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_174_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_174_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_174_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_175_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_175_1.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_175_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_175_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_175_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_176_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_176_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_176_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_176_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_176_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_177_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_177_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_177_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_177_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_177_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: 
The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_178_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_178_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_178_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_178_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_178_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_179_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_179_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_179_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_179_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_179_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + 
"input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_180_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_180_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_180_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_180_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_180_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_181_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_181_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_181_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_181_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_181_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_182_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_182_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_182_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_182_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_182_4.jpg" + 
], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_183_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_183_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_183_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_183_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_183_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_184_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_184_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_184_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_184_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_184_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the 
first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_185_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_185_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_185_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_185_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_185_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_186_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_186_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_186_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_186_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_186_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_187_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_187_1.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_187_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_187_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_187_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_188_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_188_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_188_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_188_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_188_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_189_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_189_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_189_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_189_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_189_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The 
second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_190_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_190_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_190_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_190_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_190_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_191_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_191_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_191_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_191_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_191_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + 
"input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_192_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_192_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_192_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_192_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_192_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_193_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_193_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_193_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_193_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_193_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_194_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_194_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_194_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_194_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_194_4.jpg" + ], + 
"output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_195_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_195_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_195_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_195_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_195_4.jpg" + ], + "output": "A" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_196_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_196_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_196_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_196_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_196_4.jpg" + ], + "output": "C" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first 
image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_197_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_197_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_197_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_197_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_197_4.jpg" + ], + "output": "B" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "CelebA_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_198_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_198_1.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_198_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_198_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_198_4.jpg" + ], + "output": "D" + }, + { + "task": "face_retrieval", + "visual_input_component": "['natural_image']", + "source": "lfw_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/face_retrieval/face_retrieval_199_0.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_199_1.jpg", + 
"../MMIU-Benchmark/face_retrieval/face_retrieval_199_2.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_199_3.jpg", + "../MMIU-Benchmark/face_retrieval/face_retrieval_199_4.jpg" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_0_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_0_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_0_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_0_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_0_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_1_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_1_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_1_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_1_3.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_1_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_2_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_2_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_2_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_2_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_2_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_3_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_3_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_3_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_3_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_3_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + 
"visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_4_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_4_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_4_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_4_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_4_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_5_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_5_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_5_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_5_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_5_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third 
image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_6_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_6_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_6_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_6_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_6_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_7_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_7_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_7_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_7_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_7_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the 
candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_8_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_8_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_8_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_8_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_8_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_9_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_9_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_9_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_9_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_9_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The 
fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_10_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_10_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_10_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_10_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_10_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_11_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_11_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_11_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_11_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_11_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_12_0.png", 
+ "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_12_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_12_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_12_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_12_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_13_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_13_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_13_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_13_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_13_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_14_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_14_1.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_14_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_14_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_14_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_15_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_15_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_15_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_15_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_15_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_16_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_16_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_16_2.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_16_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_16_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_17_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_17_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_17_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_17_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_17_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_18_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_18_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_18_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_18_3.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_18_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_19_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_19_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_19_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_19_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_19_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_20_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_20_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_20_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_20_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_20_4.png" + ], + "output": "B" + }, + { + "task": 
"handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_21_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_21_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_21_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_21_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_21_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_22_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_22_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_22_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_22_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_22_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: 
The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_23_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_23_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_23_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_23_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_23_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_24_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_24_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_24_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_24_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_24_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar 
handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_25_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_25_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_25_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_25_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_25_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_26_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_26_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_26_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_26_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_26_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_27_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_27_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_27_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_27_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_27_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_28_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_28_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_28_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_28_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_28_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_29_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_29_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_29_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_29_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_29_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_30_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_30_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_30_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_30_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_30_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_31_0.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_31_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_31_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_31_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_31_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_32_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_32_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_32_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_32_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_32_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_33_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_33_1.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_33_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_33_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_33_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_34_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_34_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_34_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_34_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_34_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_35_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_35_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_35_2.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_35_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_35_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_36_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_36_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_36_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_36_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_36_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_37_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_37_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_37_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_37_3.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_37_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_38_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_38_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_38_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_38_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_38_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_39_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_39_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_39_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_39_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_39_4.png" + ], + "output": "C" + }, + { + "task": 
"handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_40_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_40_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_40_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_40_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_40_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_41_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_41_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_41_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_41_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_41_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: 
The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_42_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_42_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_42_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_42_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_42_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_43_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_43_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_43_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_43_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_43_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar 
handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_44_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_44_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_44_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_44_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_44_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_45_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_45_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_45_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_45_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_45_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_46_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_46_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_46_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_46_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_46_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_47_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_47_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_47_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_47_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_47_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_48_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_48_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_48_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_48_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_48_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_49_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_49_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_49_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_49_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_49_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_50_0.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_50_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_50_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_50_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_50_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_51_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_51_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_51_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_51_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_51_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_52_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_52_1.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_52_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_52_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_52_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_53_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_53_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_53_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_53_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_53_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_54_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_54_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_54_2.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_54_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_54_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_55_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_55_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_55_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_55_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_55_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_56_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_56_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_56_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_56_3.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_56_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_57_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_57_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_57_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_57_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_57_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_58_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_58_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_58_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_58_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_58_4.png" + ], + "output": "B" + }, + { + "task": 
"handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_59_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_59_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_59_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_59_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_59_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_60_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_60_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_60_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_60_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_60_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: 
The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_61_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_61_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_61_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_61_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_61_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_62_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_62_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_62_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_62_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_62_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar 
handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_63_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_63_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_63_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_63_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_63_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_64_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_64_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_64_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_64_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_64_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_65_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_65_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_65_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_65_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_65_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_66_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_66_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_66_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_66_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_66_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_67_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_67_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_67_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_67_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_67_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_68_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_68_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_68_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_68_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_68_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_69_0.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_69_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_69_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_69_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_69_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_70_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_70_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_70_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_70_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_70_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_71_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_71_1.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_71_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_71_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_71_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_72_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_72_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_72_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_72_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_72_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_73_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_73_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_73_2.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_73_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_73_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_74_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_74_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_74_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_74_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_74_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_75_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_75_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_75_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_75_3.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_75_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_76_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_76_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_76_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_76_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_76_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_77_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_77_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_77_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_77_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_77_4.png" + ], + "output": "C" + }, + { + "task": 
"handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_78_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_78_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_78_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_78_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_78_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_79_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_79_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_79_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_79_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_79_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: 
The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_80_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_80_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_80_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_80_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_80_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_81_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_81_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_81_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_81_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_81_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar 
handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_82_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_82_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_82_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_82_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_82_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_83_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_83_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_83_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_83_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_83_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_84_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_84_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_84_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_84_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_84_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_85_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_85_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_85_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_85_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_85_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_86_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_86_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_86_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_86_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_86_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_87_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_87_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_87_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_87_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_87_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_88_0.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_88_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_88_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_88_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_88_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_89_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_89_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_89_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_89_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_89_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_90_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_90_1.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_90_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_90_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_90_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_91_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_91_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_91_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_91_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_91_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_92_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_92_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_92_2.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_92_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_92_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_93_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_93_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_93_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_93_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_93_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_94_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_94_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_94_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_94_3.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_94_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_95_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_95_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_95_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_95_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_95_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_96_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_96_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_96_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_96_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_96_4.png" + ], + "output": "C" + }, + { + "task": 
"handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_97_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_97_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_97_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_97_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_97_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_98_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_98_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_98_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_98_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_98_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: 
The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_99_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_99_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_99_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_99_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_99_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_100_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_100_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_100_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_100_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_100_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar 
handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_101_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_101_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_101_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_101_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_101_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_102_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_102_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_102_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_102_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_102_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the 
following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_103_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_103_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_103_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_103_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_103_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_104_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_104_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_104_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_104_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_104_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": 
[ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_105_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_105_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_105_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_105_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_105_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_106_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_106_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_106_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_106_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_106_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_107_0.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_107_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_107_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_107_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_107_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_108_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_108_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_108_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_108_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_108_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_109_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_109_1.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_109_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_109_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_109_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_110_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_110_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_110_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_110_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_110_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_111_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_111_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_111_2.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_111_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_111_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_112_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_112_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_112_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_112_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_112_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_113_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_113_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_113_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_113_3.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_113_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_114_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_114_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_114_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_114_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_114_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_115_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_115_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_115_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_115_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_115_4.png" + ], + "output": "C" + }, + { + "task": 
"handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_116_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_116_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_116_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_116_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_116_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_117_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_117_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_117_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_117_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_117_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + 
"options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_118_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_118_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_118_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_118_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_118_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_119_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_119_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_119_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_119_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_119_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve 
the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_120_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_120_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_120_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_120_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_120_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_121_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_121_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_121_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_121_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_121_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_122_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_122_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_122_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_122_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_122_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_123_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_123_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_123_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_123_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_123_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + 
"input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_124_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_124_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_124_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_124_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_124_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_125_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_125_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_125_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_125_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_125_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_126_0.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_126_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_126_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_126_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_126_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_127_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_127_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_127_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_127_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_127_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_128_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_128_1.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_128_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_128_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_128_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_129_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_129_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_129_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_129_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_129_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_130_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_130_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_130_2.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_130_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_130_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_131_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_131_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_131_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_131_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_131_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_132_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_132_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_132_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_132_3.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_132_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_133_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_133_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_133_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_133_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_133_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_134_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_134_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_134_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_134_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_134_4.png" + ], + "output": "D" + }, + { + "task": 
"handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_135_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_135_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_135_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_135_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_135_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_136_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_136_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_136_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_136_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_136_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + 
"options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_137_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_137_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_137_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_137_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_137_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_138_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_138_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_138_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_138_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_138_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve 
the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_139_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_139_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_139_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_139_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_139_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_140_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_140_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_140_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_140_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_140_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_141_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_141_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_141_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_141_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_141_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_142_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_142_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_142_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_142_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_142_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + 
"input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_143_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_143_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_143_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_143_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_143_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_144_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_144_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_144_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_144_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_144_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_145_0.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_145_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_145_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_145_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_145_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_146_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_146_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_146_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_146_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_146_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_147_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_147_1.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_147_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_147_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_147_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_148_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_148_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_148_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_148_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_148_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_149_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_149_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_149_2.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_149_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_149_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_150_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_150_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_150_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_150_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_150_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_151_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_151_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_151_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_151_3.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_151_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_152_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_152_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_152_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_152_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_152_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_153_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_153_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_153_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_153_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_153_4.png" + ], + "output": "D" + }, + { + "task": 
"handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_154_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_154_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_154_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_154_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_154_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_155_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_155_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_155_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_155_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_155_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + 
"options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_156_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_156_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_156_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_156_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_156_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_157_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_157_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_157_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_157_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_157_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve 
the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_158_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_158_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_158_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_158_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_158_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_159_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_159_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_159_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_159_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_159_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_160_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_160_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_160_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_160_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_160_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_161_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_161_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_161_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_161_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_161_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + 
"input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_162_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_162_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_162_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_162_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_162_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_163_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_163_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_163_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_163_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_163_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_164_0.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_164_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_164_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_164_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_164_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_165_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_165_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_165_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_165_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_165_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_166_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_166_1.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_166_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_166_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_166_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_167_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_167_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_167_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_167_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_167_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_168_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_168_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_168_2.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_168_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_168_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_169_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_169_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_169_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_169_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_169_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_170_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_170_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_170_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_170_3.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_170_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_171_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_171_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_171_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_171_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_171_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_172_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_172_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_172_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_172_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_172_4.png" + ], + "output": "C" + }, + { + "task": 
"handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_173_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_173_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_173_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_173_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_173_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_174_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_174_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_174_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_174_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_174_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + 
"options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_175_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_175_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_175_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_175_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_175_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_176_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_176_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_176_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_176_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_176_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve 
the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_177_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_177_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_177_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_177_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_177_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_178_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_178_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_178_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_178_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_178_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_179_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_179_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_179_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_179_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_179_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_180_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_180_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_180_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_180_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_180_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + 
"input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_181_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_181_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_181_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_181_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_181_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_182_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_182_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_182_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_182_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_182_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_183_0.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_183_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_183_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_183_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_183_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_184_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_184_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_184_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_184_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_184_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_185_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_185_1.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_185_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_185_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_185_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_186_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_186_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_186_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_186_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_186_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_187_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_187_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_187_2.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_187_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_187_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_188_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_188_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_188_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_188_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_188_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_189_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_189_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_189_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_189_3.png", + 
"../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_189_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_190_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_190_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_190_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_190_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_190_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_191_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_191_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_191_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_191_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_191_4.png" + ], + "output": "B" + }, + { + "task": 
"handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_192_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_192_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_192_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_192_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_192_4.png" + ], + "output": "D" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_193_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_193_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_193_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_193_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_193_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + 
"options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_194_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_194_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_194_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_194_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_194_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_195_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_195_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_195_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_195_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_195_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve 
the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_196_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_196_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_196_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_196_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_196_4.png" + ], + "output": "A" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_197_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_197_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_197_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_197_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_197_4.png" + ], + "output": "B" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_198_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_198_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_198_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_198_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_198_4.png" + ], + "output": "C" + }, + { + "task": "handwritten_retrieval", + "visual_input_component": "['text-rich_image']", + "source": "iam_handwritten_retrieval", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_199_0.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_199_1.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_199_2.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_199_3.png", + "../MMIU-Benchmark/handwritten_retrieval/handwritten_retrieval_199_4.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are three dogs. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_0_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_0_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "An image contains a sitting baboon who is holding a roundish, yellowish fruit. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_1_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_1_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are two jellyfish and they both appear to have long tails trailing below them. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_2_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_2_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "All the cars are convertible and red. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_3_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_3_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The carts have only single riders on them. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_4_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_4_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image features a multi-door scene with one tree and floating pink petal shapes, and the other image features a multi-door scene with a tree on each side. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_5_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_5_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "A man is riding between two animals. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_6_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_6_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The two dogs' bodies are pointing in opposite directions from each other. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_7_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_7_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the images contains at least three graduates with gold-colored sashes around their necks. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_8_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_8_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are at least 2 animals in the left image. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_9_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_9_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "An image contains two dingoes. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_10_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_10_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "An image shows a baby chimp and baby gorilla sitting side by side and interacting. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_11_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_11_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "An arched doorway sits under the stairway in one of the images. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_12_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_12_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are two dogs that are staring straight ahead. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_13_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_13_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Each image contains an old-fashioned TV with controls on the right of its screen, and no TV has a lit screen or picture displayed on the screen. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_14_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_14_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "An image contains a single chimp, which is eating something nut-like and holding more food in its hand. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_15_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_15_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "A dog is sitting on the grass in the image on the left. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_16_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_16_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is a dog on a pool raft in the left image. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_17_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_17_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are two ibex in the left image. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_18_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_18_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One set of lips is not glossy. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_19_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_19_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the dogs is on wood. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_20_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_20_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both staircases have vertical post designed railings. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_21_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_21_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The dumbbells in the image on the right are shown in a variety of colors. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_22_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_22_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One pizza is in a box. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_23_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_23_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is a single dog in the right image and it is wearing a red collar. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_24_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_24_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right image contains exactly three dogs. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_25_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_25_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are at least two rodents in the right image. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_26_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_26_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left image, a person is lifting a free weight. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_27_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_27_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In at least one image there is a saxophone with keys that are a different color from the base instrument. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_28_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_28_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "All the instruments are standing on their ends. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_29_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_29_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are fewer than twenty golf balls. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_30_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_30_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Multiple people are riding in a two wheeled cart pulled along a dirt path by one water buffalo. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_31_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_31_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left image contains exactly two dispensers. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_32_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_32_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right image contains two dogs wearing life vests. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_33_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_33_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image includes closed multi-compartment zipper cases shown in six solid-color options. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_34_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_34_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are two dogs on the left image, and three dogs on the right. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_35_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_35_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "All the jellyfish have long tentacles. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_36_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_36_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One elephant has long tusks. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_37_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_37_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "A man is sitting. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_38_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_38_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "All of the cheetahs are eating. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_39_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_39_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the crashed buses has at least two tires up in the air. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_40_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_40_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are more desserts in the image on the left. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_41_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_41_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are exactly two beakers in the right image. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_42_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_42_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Each image shows one vase with an open top, a short base, a tear-drop shape, and no handles. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_43_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_43_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "A cartoon cat appears once in each image, and the left image features a cartoon cat posed sitting with face forward and leg to the right. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_44_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_44_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are only two adult skunks. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_45_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_45_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is a stairway with an arched doorway under the stairs Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_46_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_46_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In at least one image there is a flag and flagpole in front of a monastery Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_47_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_47_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image features exactly two side-by-side black-and-white dogs, and the other features one reclining tri-colored dog with both front paws extended forward. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_48_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_48_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In at least one image there is a brown rectangle tv with two silver knobs on it. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_49_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_49_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "An image shows a group of safety pins arranged in the shape of a flower. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_50_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_50_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right image contains a vertical stack of two pillows. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_51_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_51_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "All kneepads are modeled by a human body, and at least one image shows only one kneepad and one bare knee. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_52_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_52_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In at least one image someone is using a kitchen utensil. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_53_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_53_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image features a squarish light-colored building with a tiered green roof and columns in front of an arch doorway. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_54_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_54_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "All dogs are posed on some outdoor structure made of wood and are gazing generally toward the camera. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_55_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_55_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the images shows two guinea pigs diving into a pool. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_56_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_56_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image shows no more than five zebras running and kicking up dust, and the other image shows a large herd of zebras running and splashing across a wet green field. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_57_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_57_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "An image shows a group of three pet rodents in a container, and all share the same fur coloration Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_58_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_58_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are exactly four birds perched on a branch in the pair of images. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_59_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_59_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left and right image contains the same number of horses pulling a cart. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_60_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_60_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Each image appears to contain only zebra-type animals, and in at least one image, the zebras are massed together so its hard to distinguish individual animals. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_61_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_61_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right image shows several dinosaur shaped balloons hung in a room with a beige sofa and a TV hanging on the wall. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_62_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_62_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one of the images you can see something that is not a towel. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_63_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_63_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In at least one image there is only a single zebra with a closed mouth look straight forward. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_64_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_64_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image contains no more than two short-haired guinea pigs posed on a blue surface, and the other image shows a single long-haired brown-and-white guinea pig. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_65_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_65_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In at least one image there is a total of three women in bikinis with at least one holding a drink. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_66_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_66_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "there are 7 pencil puches in the image pair Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_67_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_67_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "An image features a model in a pink bikini standing with her arms over her head. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_68_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_68_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left and right image contains the same number of black weights. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_69_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_69_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Two identical dining tables, each with chairs arranged for seating at least four, are placed side by side and are empty except for a centerpiece on each. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_70_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_70_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "A person can be seen holding more than one puppy. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_71_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_71_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The combined images include at least one two-wheeled cart with a wagon bed on it, exactly one man, and exactly one horse. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_72_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_72_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "An image shows a hamster clutching a snack while posed with its hind legs raised off the ground and at least one pink sole showing. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_73_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_73_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both trains are headed diagonally down towards the left. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_74_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_74_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "All of the dogs are standing. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_75_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_75_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "At least one of the bottles has a kind of loop on the lid, and the bottles on the left and right are different styles. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_76_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_76_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures includes a patch of brown rock. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_77_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_77_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Each image contains a wolf, and one image shows a black wolf and a dark doberman in a face-to-face confrontation. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_78_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_78_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "At least one cheetah is near a pool of water, and two cheetahs have their heads lowered. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_79_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_79_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right image shows one panda posed on its back with at least one front paw raised and mouth open. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_80_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_80_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are two glass becker. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_81_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_81_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image shows three side-by-side gray-and-white husky puppies in upright sitting poses, and all dogs in both images are puppies. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_82_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_82_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right image contains two black beetles. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_83_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_83_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are over a dozen pictures of women with lipstick. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_84_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_84_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the image on the right there are exactly 5 pillows. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_85_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_85_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Each image contains a dog with black spots on white fur, and the large spotted dog is in a reclining pose near a french bulldog in one image. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_86_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_86_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The carts have only single riders on them. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_87_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_87_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both beds have round top drapes. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_88_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_88_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The image on the left shows a single white dog being fed something. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_89_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_89_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image shows a stack of two pillows with pointed corners, and the other image shows flat-edged pillows, with one pillow leaning against a pillow that is lying flat. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_90_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_90_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "A tiny skunk with a thin white stripe down its forehead is sleeping on the side of its head. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_91_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_91_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are six or more vending machines that have food or drinks. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_92_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_92_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "All horned animal figures are standing facing rightward, and each image contains just one horned animal form. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_93_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_93_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In at least one image there is a dark brown staircase end facing left with dark brown railing. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_94_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_94_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image contains a trio of black pugs out of costume, and the other image includes no uncostumed dogs and includes at least one dog wearing a fur hood. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_95_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_95_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left image shows a stack of at least four round patties topped with a dollop of white cream and sprinkled with green ring shapes, all on a white dish. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_96_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_96_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Each image shows a corgi dog in a sitting pose, and the dog on the right has an open mouth while the dog on the left does not. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_97_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_97_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left and right image contains the same number empty clear glass soap dispenser. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_98_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_98_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "At least one case is pinkish and depicts the Eiffel tower on its front. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_99_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_99_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is an ibex in a wooded area with trees behind it Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_100_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_100_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "A skin product in one image is in a short fat beige jar with a brown lid the same width as the jar. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_101_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_101_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "An image shows at least six faces modeling lipstick, with eyes visible and all images displayed in side-by-side pairs. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_102_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_102_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are exactly two beakers in the right image. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_103_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_103_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Some penguins are swimming in water. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_104_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_104_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Each image shows one white dog with an open mouth, but the dog depicted in the left image also has its eyes shut. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_105_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_105_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "An image shows a workout with only women holding a weight in each hand raised in front of their bodies. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_106_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_106_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The dog in the image on the right has its mouth open. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_107_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_107_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are exactly two empty containers. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_108_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_108_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Each image includes three zebras posed in a row with their bodies parallel to one another. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_109_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_109_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is only one guinea pig in each of the images. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_110_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_110_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Each image includes a bus with a non-flat front in the foreground, and multiple buses are visible in each image. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_111_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_111_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one of the images, a dog is sleeping on their back in a belly up position. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_112_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_112_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The girl in the left image is blonde. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_113_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_113_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is fruit salad in a white bowl. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_114_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_114_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "People are posed and visible, including torsos and some legs. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_115_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_115_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In exactly one image there are sliced kiwis in a dessert bowl. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_116_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_116_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the image to the right, you can see the person's fingers. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_117_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_117_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "At least 2 giant safety pins are hanging next to a sign that has the word Laundry on it. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_118_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_118_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "A dog is laying on its back Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_119_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_119_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Two models are standing. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_120_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_120_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left and right image contains the same number of brown bookshelves. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_121_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_121_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "At least some of the zebras in the image on the left are standing on dirt. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_122_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_122_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are no more than five penguins. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_123_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_123_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are four birds in the pair of images. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_124_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_124_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image contains only whole, unpeeled lemons, while the other image contains one lemon cut in half, and at least as many unpeeled lemons as the image with only unpeeled lemons. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_125_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_125_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the nets is pink. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_126_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_126_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The weights in the right image are in use by a man, unlike the weights in the left image. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_127_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_127_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image shows multiple pandas on a structure made of wooden logs, and the other shows two pandas by something that extends from the bottom of the image. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_128_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_128_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "At least one perfume bottle cap has pink flowers. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_129_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_129_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is a bottle of pepper sauce with a gold-colored sealing band. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_130_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_130_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "white painted stripes are painted horizontally on the train engine Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_131_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_131_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "A person has a hand around the neck of a camera-facing pug in the left image, and the right image contains exactly two dogs. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_132_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_132_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are 3 phones on the left and two phones on the right. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_133_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_133_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Human legs model kneepads in both images, and at least one image contains a single kneepad. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_134_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_134_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One dog is black. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_135_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_135_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "A roll of paper towel is in a stainless steel holder. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_136_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_136_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one image, a dog has its mouth open. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_137_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_137_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "At least one image shows a room with clusters of lights suspended from an exposed beam ceiling over rectangular tables and bright orange chairs. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_138_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_138_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image in the pair shows a single pig swimming and the other shows at least two pigs swimming. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_139_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_139_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The image on the right has no more than three jellyfish. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_140_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_140_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Each image contains just one container used for drinking, and the front of at least one of the containers depicts a city skyline. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_141_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_141_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right image contains one chimpanzee that is exposing its teeth. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_142_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_142_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is a dog on a pure white background. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_143_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_143_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The combined images include at least one standing adult wild pig and at least one standing baby piglet with distinctive brown and beige striped fur. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_144_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_144_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both images a plant is sprouting out of a vase. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_145_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_145_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left and right image contains the same number of mugs. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_146_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_146_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Two pandas are eating. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_147_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_147_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is only one pillow in the right image. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_148_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_148_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is one stand that is both glass top and wider than the TV it is holding. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_149_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_149_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the image to the right, you can see the person's fingers. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_150_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_150_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is only one pillow in the right image. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_151_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_151_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Every safety pin in the images is closed. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_152_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_152_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One flower arrangement is not in a vase. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_153_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_153_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one of the images you can see something that is not a towel. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_154_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_154_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Loose rolls are sitting on a package of toilet paper. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_155_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_155_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "At least five light-colored dogs are running forward over a field of grass in the left image. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_156_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_156_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The animal in the left image has an open mouth, and the skunk of the right is standing on all fours with its body in profile. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_157_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_157_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Exactly two bowls of mashed potatoes are in round containers, only one with a spoon. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_158_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_158_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are no more than 3 monkeys. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_159_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_159_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are both gray and white section of fur on a single wolf whose body is facing right with their head tilted left forward. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_160_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_160_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In two of the images, the unbaked puffed pastry dough is folded and has finger poke marks on top. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_161_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_161_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image includes closed multi-compartment zipper cases shown in six solid-color options. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_162_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_162_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "there is at least one tube of lipstick with a silver base and gold accents Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_163_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_163_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the images shows a spider-like creature next to a blush or beige colored urchin, while the other image shows a pink urchin without the spider creature. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_164_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_164_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures shows a doctor holding a syringe on the right. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_165_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_165_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The golf balls in one of the pictures are arranged in three rows and four columns. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_166_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_166_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "the right side has bananas as dolphins Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_167_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_167_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Each image shows one vase with an open top, a short base, a tear-drop shape, and no handles. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_168_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_168_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The dog on the left has a smiling face, and the dog on the right is baring its teeth. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_169_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_169_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "All of the horses are light brown Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_170_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_170_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left image features a blond girl in a pink tank top that covers her midriff standing in front of at least one person and posing with a red dumbbell in each hand, lifted with the elbow forming a right angle. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_171_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_171_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left and right image contains the same number of horses pulling a cart. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_172_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_172_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left and right image contains a circle and square canopy. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_173_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_173_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Each image shows the bare framework of a yurt-type building under construction, and the right image shows a ladder under the framework. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_174_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_174_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is a total of eight drink bottles. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_175_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_175_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "An entire bracelet is visible in the left image. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_176_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_176_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is at least one dog that has its mouth open. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_177_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_177_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image shows a pair of finger-exposing gloves with a panda face on each glove front, and the other image shows one pair of fir-trimmed hand coverings. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_178_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_178_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right and left images each show a two-wheeled cart with one female passenger, and each cart is pulled by one horse and headed in the same direction. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_179_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_179_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are only two dogs and both of them have their tails curled over their backs. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_180_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_180_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "A slice is separated. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_181_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_181_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image contains a single whole orange and one half orange. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_182_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_182_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is a machine in the image on the right near a trash can. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_183_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_183_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One image features a clear glass with a flat bottom holding water and one variety of flowers in it, and the other image includes at least one pink flower in something with a roundish shape. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_184_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_184_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In each image, a black ring binder notebook is standing on end with open edges to the back, and the binder end visible with a label attached. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_185_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_185_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "A roll of paper towel is in a stainless steel holder. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_186_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_186_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are at most four shoes. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_187_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_187_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In at least one image tehr is a brown mother dog looking after at least one puppy. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_188_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_188_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right image, people in purple attire are lined up in front of a temple. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_189_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_189_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Three blue birds are perched outside. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_190_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_190_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "a pair of warthogs are facing each other with noses touching Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_191_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_191_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left and right image contains a total of four women in bikinis. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_192_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_192_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are books stacked flat on the floor next to the bookshelves. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_193_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_193_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The wagons in both pictures are parked in a grassy area. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_194_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_194_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Each image shows exactly one girl, who is wearing matching knitted mittens and cap, her hands pointing up towards her face, and a large pompom on her hat. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_195_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_195_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are no more than 3 people in the image on the right. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_196_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_196_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "All of the pictures have at least one dog with a baby. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_197_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_197_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "At least one of the dogs in the image on the right is wearing a Santa hat. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_198_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_198_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_nlvr2", + "visual_input_component": "natural image", + "source": "nlvr2", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Each image contains one dark gray puppy with upright ears sitting on a fabric surface and facing forward with open eyes. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_199_0.png", + "../MMIU-Benchmark/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_199_1.png" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_0_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_0_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_1_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_1_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_2_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_2_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_3_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_3_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_4_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_4_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_5_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_5_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_6_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_6_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_7_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_7_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_8_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_8_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_9_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_9_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_10_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_10_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_11_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_11_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_12_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_12_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_13_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_13_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_14_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_14_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_15_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_15_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_16_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_16_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_17_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_17_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_18_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_18_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_19_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_19_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_20_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_20_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_21_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_21_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_22_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_22_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_23_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_23_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_24_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_24_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_25_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_25_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_26_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_26_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_27_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_27_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_28_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_28_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_29_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_29_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_30_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_30_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_31_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_31_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_32_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_32_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_33_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_33_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_34_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_34_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_35_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_35_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_36_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_36_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_37_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_37_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_38_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_38_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_39_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_39_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_40_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_40_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_41_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_41_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_42_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_42_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_43_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_43_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_44_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_44_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_45_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_45_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_46_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_46_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_47_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_47_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_48_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_48_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_49_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_49_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_50_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_50_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_51_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_51_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_52_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_52_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_53_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_53_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_54_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_54_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_55_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_55_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_56_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_56_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_57_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_57_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_58_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_58_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_59_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_59_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_60_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_60_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_61_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_61_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_62_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_62_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_63_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_63_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_64_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_64_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_65_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_65_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_66_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_66_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_67_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_67_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_68_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_68_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_69_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_69_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_70_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_70_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_71_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_71_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_72_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_72_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_73_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_73_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_74_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_74_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_75_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_75_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_76_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_76_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_77_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_77_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_78_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_78_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_79_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_79_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_80_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_80_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_81_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_81_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_82_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_82_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_83_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_83_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_84_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_84_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_85_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_85_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_86_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_86_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_87_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_87_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_88_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_88_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_89_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_89_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_90_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_90_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_91_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_91_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_92_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_92_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_93_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_93_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_94_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_94_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_95_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_95_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_96_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_96_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_97_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_97_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_98_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_98_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_99_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_99_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_100_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_100_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_101_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_101_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_102_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_102_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_103_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_103_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_104_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_104_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_105_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_105_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_106_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_106_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_107_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_107_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_108_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_108_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_109_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_109_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_110_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_110_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_111_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_111_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_112_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_112_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_113_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_113_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_114_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_114_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_115_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_115_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_116_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_116_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_117_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_117_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_118_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_118_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_119_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_119_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_120_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_120_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_121_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_121_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_122_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_122_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_123_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_123_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_124_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_124_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_125_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_125_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_126_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_126_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_127_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_127_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_128_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_128_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_129_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_129_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_130_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_130_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_131_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_131_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_132_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_132_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_133_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_133_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_134_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_134_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_135_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_135_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_136_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_136_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_137_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_137_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_138_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_138_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_139_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_139_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_140_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_140_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_141_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_141_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_142_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_142_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_143_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_143_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_144_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_144_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_145_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_145_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_146_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_146_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_147_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_147_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_148_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_148_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_149_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_149_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_150_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_150_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_151_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_151_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_152_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_152_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_153_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_153_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_154_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_154_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_155_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_155_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_156_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_156_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_157_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_157_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_158_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_158_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_159_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_159_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_160_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_160_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_161_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_161_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_162_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_162_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_163_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_163_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_164_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_164_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_165_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_165_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_166_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_166_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_167_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_167_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_168_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_168_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_169_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_169_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_170_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_170_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_171_0.jpg", + "../MMIU-Benchmark/visual_correspondence_blink/visual_correspondence_blink_171_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_0_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_0_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_1_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_1_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_2_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_2_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_3_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_3_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_4_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_4_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_5_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_5_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_6_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_6_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_7_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_7_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_8_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_8_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_9_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_9_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_10_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_10_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_11_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_11_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_12_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_12_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_13_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_13_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_14_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_14_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_15_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_15_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_16_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_16_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_17_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_17_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_18_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_18_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_19_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_19_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_20_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_20_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_21_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_21_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_22_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_22_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_23_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_23_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_24_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_24_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_25_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_25_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_26_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_26_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_27_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_27_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_28_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_28_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_29_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_29_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_30_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_30_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_31_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_31_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_32_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_32_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_33_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_33_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_34_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_34_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_35_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_35_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_36_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_36_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_37_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_37_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_38_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_38_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_39_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_39_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_40_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_40_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_41_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_41_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_42_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_42_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_43_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_43_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_44_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_44_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_45_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_45_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_46_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_46_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_47_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_47_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_48_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_48_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_49_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_49_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_50_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_50_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_51_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_51_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_52_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_52_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_53_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_53_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_54_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_54_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_55_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_55_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_56_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_56_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_57_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_57_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_58_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_58_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_59_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_59_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_60_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_60_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_61_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_61_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_62_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_62_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_63_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_63_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_64_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_64_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_65_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_65_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_66_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_66_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_67_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_67_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_68_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_68_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_69_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_69_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_70_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_70_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_71_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_71_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_72_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_72_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_73_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_73_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_74_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_74_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_75_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_75_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_76_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_76_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_77_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_77_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_78_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_78_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_79_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_79_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_80_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_80_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_81_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_81_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_82_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_82_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_83_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_83_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_84_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_84_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_85_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_85_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_86_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_86_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_87_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_87_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_88_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_88_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_89_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_89_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_90_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_90_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_91_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_91_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_92_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_92_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_93_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_93_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_94_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_94_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_95_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_95_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_96_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_96_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_97_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_97_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_98_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_98_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_99_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_99_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_100_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_100_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_101_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_101_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_102_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_102_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_103_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_103_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_104_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_104_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_105_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_105_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_106_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_106_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_107_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_107_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_108_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_108_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_109_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_109_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_110_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_110_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_111_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_111_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_112_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_112_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_113_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_113_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_114_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_114_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_115_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_115_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_116_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_116_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_117_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_117_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_118_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_118_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_119_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_119_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_120_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_120_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_121_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_121_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_122_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_122_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_123_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_123_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_124_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_124_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_125_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_125_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_126_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_126_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_127_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_127_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_128_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_128_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_129_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_129_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_130_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_130_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_131_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_131_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_132_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_132_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_133_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_133_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_134_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_134_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_135_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_135_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_136_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_136_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_137_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_137_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_138_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_138_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_139_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_139_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_140_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_140_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_141_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_141_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_142_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_142_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_143_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_143_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_144_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_144_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_145_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_145_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_146_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_146_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_147_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_147_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_148_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_148_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_149_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_149_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_150_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_150_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_151_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_151_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_152_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_152_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_153_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_153_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_154_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_154_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_155_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_155_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_156_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_156_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_157_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_157_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_158_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_158_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_159_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_159_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_160_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_160_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_161_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_161_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_162_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_162_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_163_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_163_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_164_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_164_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_165_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_165_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_166_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_166_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_167_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_167_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_168_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_168_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_169_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_169_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_170_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_170_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_171_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_171_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_172_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_172_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_173_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_173_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_174_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_174_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_175_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_175_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_176_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_176_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_177_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_177_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_178_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_178_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_179_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_179_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_180_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_180_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_181_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_181_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_182_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_182_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_183_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_183_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_184_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_184_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_185_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_185_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_186_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_186_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_187_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_187_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_188_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_188_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_189_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_189_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_190_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_190_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_191_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_191_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_192_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_192_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_193_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_193_1.jpg" + ], + "output": "A" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_194_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_194_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_195_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_195_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_196_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_196_1.jpg" + ], + "output": "D" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_197_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_197_1.jpg" + ], + "output": "B" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_198_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_198_1.jpg" + ], + "output": "C" + }, + { + "task": "visual_correspondence_scannet", + "visual_input_component": "2 natural images", + "source": "scannet", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_199_0.jpg", + "../MMIU-Benchmark/visual_correspondence_scannet/visual_correspondence_scannet_199_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. 
For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_0_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_0_1.jpg" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. 
You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_1_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_1_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_2_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_2_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_3_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_3_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_4_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_4_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_5_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_5_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_6_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_6_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_7_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_7_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_8_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_8_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_9_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_9_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_10_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_10_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_11_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_11_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_12_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_12_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_13_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_13_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_14_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_14_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_15_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_15_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_16_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_16_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_17_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_17_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_18_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_18_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_19_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_19_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_20_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_20_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_21_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_21_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_22_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_22_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_23_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_23_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_24_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_24_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_25_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_25_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_26_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_26_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_27_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_27_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_28_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_28_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_29_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_29_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_30_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_30_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_31_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_31_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_32_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_32_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_33_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_33_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_34_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_34_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_35_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_35_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_36_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_36_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_37_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_37_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_38_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_38_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_39_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_39_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_40_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_40_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_41_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_41_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_42_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_42_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_43_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_43_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_44_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_44_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_45_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_45_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_46_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_46_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_47_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_47_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_48_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_48_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_49_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_49_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_50_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_50_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_51_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_51_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_52_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_52_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_53_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_53_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_54_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_54_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_55_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_55_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_56_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_56_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_57_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_57_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_58_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_58_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_59_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_59_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_60_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_60_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_61_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_61_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_62_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_62_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_63_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_63_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_64_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_64_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_65_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_65_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_66_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_66_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_67_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_67_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_68_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_68_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_69_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_69_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_70_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_70_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_71_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_71_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_72_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_72_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_73_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_73_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_74_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_74_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_75_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_75_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_76_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_76_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_77_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_77_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_78_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_78_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_79_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_79_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_80_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_80_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_81_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_81_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_82_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_82_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_83_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_83_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_84_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_84_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_85_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_85_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_86_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_86_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_87_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_87_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_88_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_88_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_89_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_89_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_90_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_90_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_91_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_91_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_92_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_92_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_93_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_93_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_94_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_94_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_95_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_95_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_96_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_96_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_97_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_97_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_98_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_98_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_99_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_99_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_100_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_100_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_101_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_101_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_102_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_102_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_103_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_103_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_104_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_104_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_105_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_105_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_106_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_106_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_107_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_107_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_108_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_108_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_109_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_109_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_110_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_110_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_111_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_111_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_112_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_112_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_113_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_113_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_114_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_114_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_115_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_115_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_116_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_116_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_117_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_117_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_118_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_118_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_119_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_119_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_120_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_120_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_121_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_121_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_122_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_122_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_123_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_123_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_124_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_124_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_125_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_125_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_126_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_126_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_127_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_127_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_128_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_128_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_129_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_129_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_130_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_130_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_131_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_131_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_132_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_132_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_133_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_133_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_134_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_134_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_135_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_135_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_136_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_136_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_137_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_137_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_138_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_138_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_139_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_139_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_140_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_140_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_141_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_141_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_142_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_142_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_143_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_143_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_144_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_144_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_145_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_145_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_146_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_146_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_147_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_147_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_148_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_148_1.jpg" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_149_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_149_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_150_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_150_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_151_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_151_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_152_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_152_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_153_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_153_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_154_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_154_1.jpg" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_155_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_155_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_156_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_156_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_157_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_157_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_158_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_158_1.jpg" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_159_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_159_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_160_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_160_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_161_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_161_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_162_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_162_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_163_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_163_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_164_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_164_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_165_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_165_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_166_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_166_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_167_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_167_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_168_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_168_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_169_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_169_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_170_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_170_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_171_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_171_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_172_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_172_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_173_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_173_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_174_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_174_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_175_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_175_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_176_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_176_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_177_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_177_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_178_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_178_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_179_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_179_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_180_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_180_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_181_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_181_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_182_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_182_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_183_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_183_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_184_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_184_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_185_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_185_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_186_0.jpg", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_186_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_187_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_187_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_188_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_188_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_189_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_189_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_190_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_190_1.JPEG" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_191_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_191_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_192_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_192_1.png" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_193_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_193_1.png" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_194_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_194_1.JPEG" + ], + "output": "B" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_195_0.JPEG", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_195_1.JPEG" + ], + "output": "A" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_196_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_196_1.png" + ], + "output": "C" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_197_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_197_1.png" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_198_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_198_1.JPEG" + ], + "output": "D" + }, + { + "task": "functional_correspondence_funk_point", + "visual_input_component": "2 natural images", + "source": "funk_point", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_199_0.png", + "../MMIU-Benchmark/functional_correspondence_funk_point/functional_correspondence_funk_point_199_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the guy is no longer by his car he is further towards the middle of the parking lot\nB: The man is now riding a bicycle in the park.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: the guy is no longer by his car he is further towards the middle of the parking lot\nB: The man is now riding a bicycle in the park.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_0_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_0_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the 2 people on the sidewalk are gone\nB: A colorful mural of a cityscape fills the entire wall, with no people in sight.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: the 2 people on the sidewalk are gone\nB: A colorful mural of a cityscape fills the entire wall, with no people in sight.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_1_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_1_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: Empty streets with no signs of activity\nB: less cars parked", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: Empty streets with no signs of activity\nB: less cars parked", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_2_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_2_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of friends enjoying a picnic in the park\nB: the silver car left the parking lot", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of friends enjoying a picnic in the park\nB: the silver car left the parking lot", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_3_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_3_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: two pepole standing by building is no longer there\nB: A colorful hot air balloon floating in the sky over a peaceful lake with a small cabin on the shore.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: two pepole standing by building is no longer there\nB: A colorful hot air balloon floating in the sky over a peaceful lake with a small cabin on the shore.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_4_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_4_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The second picture features a diverse collection of plants from various climates and regions.\nB: there are more people in the second picture", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The second picture features a diverse collection of plants from various climates and regions.\nB: there are more people in the second picture", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_5_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_5_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there are now two people by the yellow poles\nB: A colorful parrot sits on a tree branch, preening its feathers.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there are now two people by the yellow poles\nB: A colorful parrot sits on a tree branch, preening its feathers.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_6_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_6_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there s one more car in the group of cars on the far left of the parking lot and also in the group in the center of the lot\nB: A flock of birds is flying in the clear blue sky above the parking lot", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: there s one more car in the group of cars on the far left of the parking lot and also in the group in the center of the lot\nB: A flock of birds is flying in the clear blue sky above the parking lot", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_7_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_7_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The images appear to be in motion due to the blur effect, creating a sense of dynamism and energy.\nB: there are some people in the right image", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The images appear to be in motion due to the blur effect, creating a sense of dynamism and energy.\nB: there are some people in the right image", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_8_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_8_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The color of the sky is a vibrant shade of purple, and the ground is covered in a shimmering layer of silver dust.\nB: there are 6 people on the right side and none on the left", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The color of the sky is a vibrant shade of purple, and the ground is covered in a shimmering layer of silver dust.\nB: there are 6 people on the right side and none on the left", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_9_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_9_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The people in the left picture are wearing hats.\nB: there are four more people in the right picture", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The people in the left picture are wearing hats.\nB: there are four more people in the right picture", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_10_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_10_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: Both images show a beautiful landscape with colorful flowers and lush greenery. In the distance, a majestic mountain range rises against the horizon, creating a stunning backdrop for the scene.\nB: there are more people in the one on the right than on the left", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: Both images show a beautiful landscape with colorful flowers and lush greenery. In the distance, a majestic mountain range rises against the horizon, creating a stunning backdrop for the scene.\nB: there are more people in the one on the right than on the left", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_11_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_11_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The images show a difference in the weather between them.\nB: there are less people in the after image", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The images show a difference in the weather between them.\nB: there are less people in the after image", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_12_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_12_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The road is now empty with no signs of any vehicles or people.\nB: a person on a motor cycle is no longer in the image", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The road is now empty with no signs of any vehicles or people.\nB: a person on a motor cycle is no longer in the image", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_13_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_13_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: four people have appeared on the picture\nB: The image shows a colorful landscape with a beautiful sunset.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: four people have appeared on the picture\nB: The image shows a colorful landscape with a beautiful sunset.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_14_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_14_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The colors in the first image sure are vibrant! 
I love how the light hits the scenery.\nB: the picture on the right has less people than the one on the left", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The colors in the first image sure are vibrant! I love how the light hits the scenery.\nB: the picture on the right has less people than the one on the left", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_15_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_15_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The colorful mural on the wall seems to come to life as the vibrant hues blend together in a mesmerizing dance.\nB: the people on the stairs are closer", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The colorful mural on the wall seems to come to life as the vibrant hues blend together in a mesmerizing dance.\nB: the people on the stairs are closer", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_16_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_16_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: more pedestrians visiable\nB: A group of colorful hot air balloons soaring through the sky", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: more pedestrians visiable\nB: A group of colorful hot air balloons soaring through the sky", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_17_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_17_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there was six people standing around and now there are two\nB: The room was brightly lit with colorful decorations and lively music playing in the background.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: there was six people standing around and now there are two\nB: The room was brightly lit with colorful decorations and lively music playing in the background.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_18_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_18_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: no noticeable differences\nB: A red apple sits on a wooden table, next to a stack of books.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: no noticeable differences\nB: A red apple sits on a wooden table, next to a stack of books.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_19_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_19_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A colorful meadow with a blue sky and fluffy clouds.\nB: there is no difference", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A colorful meadow with a blue sky and fluffy clouds.\nB: there is no difference", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_20_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_20_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: no noticeable changes have occurred\nB: An interesting juxtaposition of colors and shadows is evident.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: no noticeable changes have occurred\nB: An interesting juxtaposition of colors and shadows is evident.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_21_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_21_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the white car is no longer on the road adjacent to the car parking lot\nB: A group of people are playing on a sunny beach near the ocean.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: the white car is no longer on the road adjacent to the car parking lot\nB: A group of people are playing on a sunny beach near the ocean.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_22_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_22_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A cat is sitting on a tree branch\nB: there are two people standing in the street", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: A cat is sitting on a tree branch\nB: there are two people standing in the street", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_23_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_23_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A small red house sits on top of a hill with a beautiful sunset in the background.\nB: the after image contains two individuals standing near the yellow poles in the upper left hand quadrant of the image", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A small red house sits on top of a hill with a beautiful sunset in the background.\nB: the after image contains two individuals standing near the yellow poles in the upper left hand quadrant of the image", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_24_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_24_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the left picture has people standing around talking\nB: The left picture shows a colorful array of geometric shapes floating in mid-air.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: the left picture has people standing around talking\nB: The left picture shows a colorful array of geometric shapes floating in mid-air.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_25_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_25_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The sky is filled with colorful hot air balloons.\nB: the man is road", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The sky is filled with colorful hot air balloons.\nB: the man is road", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_26_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_26_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: more cars in picture\nB: A group of people playing frisbee in a park.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: more cars in picture\nB: A group of people playing frisbee in a park.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_27_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_27_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: one person has left the image from the before to after image\nB: The color blue dominates the scene, with various shades blending together in an abstract pattern.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: one person has left the image from the before to after image\nB: The color blue dominates the scene, with various shades blending together in an abstract pattern.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_28_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_28_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: all the people are new\nB: The room is decorated with vintage furniture and colorful murals.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: all the people are new\nB: The room is decorated with vintage furniture and colorful murals.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_29_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_29_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The left image features a beautiful sunset, while the right image captures a bustling city at night.\nB: there are only 3 people in the left image but the right image has 7 people", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The left image features a beautiful sunset, while the right image captures a bustling city at night.\nB: there are only 3 people in the left image but the right image has 7 people", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_30_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_30_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there are less pedestrians\nB: The buildings in the background have a unique architecture that stands out in the cityscape.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there are less pedestrians\nB: The buildings in the background have a unique architecture that stands out in the cityscape.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_31_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_31_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The blue car is now the only one in the parking lot\nB: the red car is no longer there", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The blue car is now the only one in the parking lot\nB: the red car is no longer there", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_32_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_32_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the group of people has moved to the left\nB: A colorful mural adorns the side of the building, depicting various abstract shapes and patterns.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: the group of people has moved to the left\nB: A colorful mural adorns the side of the building, depicting various abstract shapes and patterns.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_33_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_33_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the white truck is not there anumore\nB: A group of birds is flying in the clear blue sky.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: the white truck is not there anumore\nB: A group of birds is flying in the clear blue sky.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_34_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_34_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The clouds overhead resemble an elaborate maze, with no clear path to navigate.\nB: the people in the lot have only moved slightly", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The clouds overhead resemble an elaborate maze, with no clear path to navigate.\nB: the people in the lot have only moved slightly", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_35_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_35_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: nothing in the second shot appears to have changed\nB: The second image features a completely different setting and subject matter than the first image", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: nothing in the second shot appears to have changed\nB: The second image features a completely different setting and subject matter than the first image", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_36_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_36_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of people are enjoying a picnic in a park\nB: the van has backed out the parking space it was in", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of people are enjoying a picnic in a park\nB: the van has backed out the parking space it was in", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_37_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_37_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the white car is no longer there\nB: An unexpected gathering of birds occurred in the vicinity.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: the white car is no longer there\nB: An unexpected gathering of birds occurred in the vicinity.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_38_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_38_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The van seems to be covered in graffiti and shows signs of wear and tear, while a squirrel is perched on top of it.\nB: there appears to be a person standing next to the van which is parked closest to the building", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The van seems to be covered in graffiti and shows signs of wear and tear, while a squirrel is perched on top of it.\nB: there appears to be a person standing next to the van which is parked closest to the building", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_39_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_39_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the group of people have changed places in the circle\nB: The sun is setting, casting an orange glow across the landscape.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: the group of people have changed places in the circle\nB: The sun is setting, casting an orange glow across the landscape.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_40_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_40_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of birds is flying over the lake between the trees.\nB: there are two fewer people standing near the nellow poles", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of birds is flying over the lake between the trees.\nB: there are two fewer people standing near the nellow poles", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_41_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_41_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: people are in the photo in after\nB: The photo features a vibrant sunset over a peaceful lake with a lone boat drifting across the water.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: people are in the photo in after\nB: The photo features a vibrant sunset over a peaceful lake with a lone boat drifting across the water.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_42_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_42_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the two people are no longer there near the dividers\nB: The empty chairs overlook the vast desert landscape.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: the two people are no longer there near the dividers\nB: The empty chairs overlook the vast desert landscape.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_43_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_43_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there are 6 people rather than 2 in the drive\nB: The picture shows a scenic mountain landscape with a lake, instead of an urban cityscape.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: there are 6 people rather than 2 in the drive\nB: The picture shows a scenic mountain landscape with a lake, instead of an urban cityscape.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_44_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_44_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the left image has two people in it and the right image has the top of one person in it\nB: The left image features a beautiful sunset over the ocean, while the right image shows a busy city street with tall buildings.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: the left image has two people in it and the right image has the top of one person in it\nB: The left image features a beautiful sunset over the ocean, while the right image shows a busy city street with tall buildings.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_45_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_45_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The birds are gathering on the roof to form a plan for world domination.\nB: a truck in the corner has dissapeared", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The birds are gathering on the roof to form a plan for world domination.\nB: a truck in the corner has dissapeared", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_46_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_46_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of friends enjoying a picnic in the park\nB: only two people outside of parking lot", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of friends enjoying a picnic in the park\nB: only two people outside of parking lot", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_47_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_47_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A flock of birds is flying high in the sky, casting shadows on the ground.\nB: a car is leaving the parking lot near the top of the picture to the left of the white box truck", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A flock of birds is flying high in the sky, casting shadows on the ground.\nB: a car is leaving the parking lot near the top of the picture to the left of the white box truck", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_48_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_48_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: in the before photo there appear to be 5 people huddled close together and in the after photo there are about 7 people a little more spread out with some beginning to walk away from the others\nB: In the first image, a butterfly is resting on a flower, while in the second image, a squirrel is scampering up a tree.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: in the before photo there appear to be 5 people huddled close together and in the after photo there are about 7 people a little more spread out with some beginning to walk away from the others\nB: In the first image, a butterfly is resting on a flower, while in the second image, a squirrel is scampering up a tree.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_49_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_49_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of colorful balloons floating in the sky\nB: the three humanis availble", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of colorful balloons floating in the sky\nB: the three humanis availble", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_50_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_50_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The second image features a vibrant and diverse collection of flowers in a garden, with a rainbow visible in the background.\nB: there are less people on 2nd photo also they are located off the photo", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The second image features a vibrant and diverse collection of flowers in a garden, with a rainbow visible in the background.\nB: there are less people on 2nd photo also they are located off the photo", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_51_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_51_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The colors of the sunset are vibrant and striking, painting the sky in a beautiful array of hues.\nB: the car driving is not there anymore", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The colors of the sunset are vibrant and striking, painting the sky in a beautiful array of hues.\nB: the car driving is not there anymore", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_52_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_52_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The colorful balloons filled the room as the clown performed magic tricks for the delighted audience.\nB: the kids appear again to play a game in the lot", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The colorful balloons filled the room as the clown performed magic tricks for the delighted audience.\nB: the kids appear again to play a game in the lot", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_53_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_53_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of people are having a picnic on the grass near the parking lot.\nB: there is a red car parked at the far end of the middle row of cars", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of people are having a picnic on the grass near the parking lot.\nB: there is a red car parked at the far end of the middle row of cars", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_54_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_54_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The yellow sun sets behind the mountains, casting a warm glow over the landscape.\nB: group is gone", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The yellow sun sets behind the mountains, casting a warm glow over the landscape.\nB: group is gone", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_55_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_55_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: before picture truck leaving after picture car entering before picture people beside red car before picture white car between blue and grey car after picture more cars after picture person walking across parking lot\nB: The photos capture the transformation of a barren landscape into a vibrant garden with colorful flowers and trees.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: before picture truck leaving after picture car entering before picture people beside red car before picture white car between blue and grey car after picture more cars after picture person walking across parking lot\nB: The photos capture the transformation of a barren landscape into a vibrant garden with colorful flowers and trees.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_56_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_56_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the cars in the intersection have moved ahead\nB: The traffic lights in the intersection are all green", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: the cars in the intersection have moved ahead\nB: The traffic lights in the intersection are all green", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_57_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_57_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A flock of colorful birds flies across the clear blue sky.\nB: there is a person walking with a red umbrella", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A flock of colorful birds flies across the clear blue sky.\nB: there is a person walking with a red umbrella", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_58_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_58_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: a silver car has pulled up near the dumpster\nB: A group of friends are enjoying a picnic in a sunny park", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: a silver car has pulled up near the dumpster\nB: A group of friends are enjoying a picnic in a sunny park", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_59_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_59_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The painting on the wall seems to be incomplete, with just one corner colored in.\nB: the people in the stairs are in different locations", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The painting on the wall seems to be incomplete, with just one corner colored in.\nB: the people in the stairs are in different locations", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_60_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_60_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: no changes were observed\nB: A swirling vortex of colorful shapes and patterns filled the frame, creating a hypnotic and mesmerizing effect.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: no changes were observed\nB: A swirling vortex of colorful shapes and patterns filled the frame, creating a hypnotic and mesmerizing effect.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_61_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_61_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: In the new image, the background has been completely transformed into a lush green forest with a waterfall in the distance.\nB: the group of people on the right hand image has changed to where the person to the far left is in a red shirt and the other image it is a blue shirt", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: In the new image, the background has been completely transformed into a lush green forest with a waterfall in the distance.\nB: the group of people on the right hand image has changed to where the person to the far left is in a red shirt and the other image it is a blue shirt", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_62_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_62_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there is more cars\nB: The sky is filled with vibrant colors and swirling patterns while birds fly in formation.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there is more cars\nB: The sky is filled with vibrant colors and swirling patterns while birds fly in formation.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_63_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_63_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the crowd of people in the parking lot have shifted slightly\nB: A flock of seagulls flew overhead, casting long shadows across the deserted beach.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: the crowd of people in the parking lot have shifted slightly\nB: A flock of seagulls flew overhead, casting long shadows across the deserted beach.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_64_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_64_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The color of the sky has changed from blue to purple in the blink of an eye\nB: the people have moved slightly", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The color of the sky has changed from blue to purple in the blink of an eye\nB: the people have moved slightly", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_65_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_65_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the black has moved slightly down the road\nB: A thick layer of fog envelops the entire city, creating an eerie and mysterious atmosphere.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: the black has moved slightly down the road\nB: A thick layer of fog envelops the entire city, creating an eerie and mysterious atmosphere.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_66_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_66_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The image does not feature a group of people playing sports on a grass field.\nB: the picture on the right does not have a dark colored vehicle backing out of a parking spot", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The image does not feature a group of people playing sports on a grass field.\nB: the picture on the right does not have a dark colored vehicle backing out of a parking spot", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_67_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_67_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of colorful hot air balloons floating in the sky.\nB: the people at the end of th elot have moved", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of colorful hot air balloons floating in the sky.\nB: the people at the end of th elot have moved", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_68_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_68_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there is three more people\nB: The room is filled with colorful balloons and streamers, creating a festive atmosphere.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there is three more people\nB: The room is filled with colorful balloons and streamers, creating a festive atmosphere.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_69_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_69_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: people have moved towards the left frame with some leaving the frame\nB: The sunlight creates interesting patterns on the ground, with shadows forming unique shapes.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: people have moved towards the left frame with some leaving the frame\nB: The sunlight creates interesting patterns on the ground, with shadows forming unique shapes.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_70_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_70_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: in the after image the people walking in the lot are in a different location than in the before image\nB: The before and after images show a comparison of different types of flowers blooming in a garden.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: in the after image the people walking in the lot are in a different location than in the before image\nB: The before and after images show a comparison of different types of flowers blooming in a garden.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_71_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_71_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there is now a person walking on the left hand side of the lot\nB: A group of birds is perched on the building's roof.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: there is now a person walking on the left hand side of the lot\nB: A group of birds is perched on the building's roof.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_72_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_72_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The second image features a vibrant display of colorful flowers and plants, with a soothing waterfall in the background.\nB: there is four more people standing around in the second photo", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The second image features a vibrant display of colorful flowers and plants, with a soothing waterfall in the background.\nB: there is four more people standing around in the second photo", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_73_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_73_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there are people now in the front\nB: The image shows a vibrant city skyline with a beautiful sunset casting colorful hues across the sky.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: there are people now in the front\nB: The image shows a vibrant city skyline with a beautiful sunset casting colorful hues across the sky.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_74_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_74_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there are two cars on the road on the right side of the screen\nB: A group of people is playing beach volleyball under a clear blue sky.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there are two cars on the road on the right side of the screen\nB: A group of people is playing beach volleyball under a clear blue sky.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_75_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_75_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A small kitten playing with a ball of yarn\nB: a group of people walking", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A small kitten playing with a ball of yarn\nB: a group of people walking", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_76_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_76_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there are now 6 people standing near the yellow poles that weren t there before\nB: A group of birds are perched on top of the yellow poles, looking out over the water below.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there are now 6 people standing near the yellow poles that weren t there before\nB: A group of birds are perched on top of the yellow poles, looking out over the water below.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_77_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_77_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there is a person walking next to a car\nB: A pair of colorful parrots are perched on a tree branch.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: there is a person walking next to a car\nB: A pair of colorful parrots are perched on a tree branch.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_78_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_78_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the picture on the right contains more people near the yellow poles\nB: The image depicts a bustling city with colorful umbrellas scattered throughout the scene, giving it a vibrant and lively atmosphere.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: the picture on the right contains more people near the yellow poles\nB: The image depicts a bustling city with colorful umbrellas scattered throughout the scene, giving it a vibrant and lively atmosphere.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_79_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_79_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there ar cars on the side road\nB: A group of people are gathering on the beach and enjoying a bonfire.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: there ar cars on the side road\nB: A group of people are gathering on the beach and enjoying a bonfire.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_80_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_80_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The left picture shows a beautiful garden with colorful flowers and a small pond in the middle.\nB: right picture has a few less people standing around", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The left picture shows a beautiful garden with colorful flowers and a small pond in the middle.\nB: right picture has a few less people standing around", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_81_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_81_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The image depicts a variety of colorful flowers arranged in an aesthetically pleasing pattern.\nB: there are people visible in different sections of the frame", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The image depicts a variety of colorful flowers arranged in an aesthetically pleasing pattern.\nB: there are people visible in different sections of the frame", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_82_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_82_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: bus is no longer there\nB: The sky is painted green in this image.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: bus is no longer there\nB: The sky is painted green in this image.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_83_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_83_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: this picture about two by the stairs and one by the red door\nB: The ravishing orange sunset illuminates the tranquil lake, casting a mesmerizing reflection in the water.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: this picture about two by the stairs and one by the red door\nB: The ravishing orange sunset illuminates the tranquil lake, casting a mesmerizing reflection in the water.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_84_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_84_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The first image captures the beautiful sunrise over a calm lake, while the second image showcases a colorful street market filled with vibrant flowers and produce.\nB: the photo on the left has 2 people and the photo on the right has 6", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The first image captures the beautiful sunrise over a calm lake, while the second image showcases a colorful street market filled with vibrant flowers and produce.\nB: the photo on the left has 2 people and the photo on the right has 6", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_85_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_85_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The first image showcases a busy city street with cars and pedestrians, while the second image features a serene beach with crashing waves and a clear blue sky.\nB: their is 6 people in the first photo and none in the other", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The first image showcases a busy city street with cars and pedestrians, while the second image features a serene beach with crashing waves and a clear blue sky.\nB: their is 6 people in the first photo and none in the other", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_86_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_86_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A cat is sleeping on a windowsill in the sunlight.\nB: the group of people have moved slightly", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: A cat is sleeping on a windowsill in the sunlight.\nB: the group of people have moved slightly", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_87_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_87_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: more people and the driver is walking away from his car\nB: A colorful hot air balloon is floating in the sky above a vast green field.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: more people and the driver is walking away from his car\nB: A colorful hot air balloon is floating in the sky above a vast green field.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_88_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_88_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of people are having a picnic in the park in the first picture\nB: there is a black car driving in the parking lot in the second picture", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of people are having a picnic in the park in the first picture\nB: there is a black car driving in the parking lot in the second picture", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_89_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_89_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The photo captured a beautiful sunset over the ocean with vibrant colors reflecting off the water.\nB: 5 people that were in photo are now gone", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The photo captured a beautiful sunset over the ocean with vibrant colors reflecting off the water.\nB: 5 people that were in photo are now gone", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_90_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_90_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the black car is backed up on the first picture and not the 2nd picture\nB: There are tall buildings in the background of both pictures, but in the first picture, they are painted with vibrant colors, while in the second picture, they are painted with neutral colors.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: the black car is backed up on the first picture and not the 2nd picture\nB: There are tall buildings in the background of both pictures, but in the first picture, they are painted with vibrant colors, while in the second picture, they are painted with neutral colors.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_91_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_91_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The artwork features an abundance of vibrant colors and textures.\nB: there is more people", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The artwork features an abundance of vibrant colors and textures.\nB: there is more people", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_92_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_92_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The after image features a vibrant array of colors and patterns that create a sense of movement and energy.\nB: in the after image there are fewer people", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The after image features a vibrant array of colors and patterns that create a sense of movement and energy.\nB: in the after image there are fewer people", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_93_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_93_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there are now three people standing walking in the parking lot\nB: The parking lot is full of colorful balloons and streamers, creating a festive atmosphere.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: there are now three people standing walking in the parking lot\nB: The parking lot is full of colorful balloons and streamers, creating a festive atmosphere.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_94_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_94_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A flock of seagulls is flying above the ocean at sunset.\nB: the men aren t in the picture anymore by the yellow concrete posts", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: A flock of seagulls is flying above the ocean at sunset.\nB: the men aren t in the picture anymore by the yellow concrete posts", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_95_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_95_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the people are not there anymore\nB: The sky is filled with bright colors, creating a dazzling display.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: the people are not there anymore\nB: The sky is filled with bright colors, creating a dazzling display.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_96_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_96_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The after image captures a vibrant sunset over a calm lake, with silhouettes of trees lining the shore.\nB: the after image shows three people further down the steps compared to the before image", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The after image captures a vibrant sunset over a calm lake, with silhouettes of trees lining the shore.\nB: the after image shows three people further down the steps compared to the before image", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_97_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_97_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of people is having a picnic in the park\nB: there is no car driving around in the parking lot", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of people is having a picnic in the park\nB: there is no car driving around in the parking lot", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_98_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_98_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there are no people in the second photo\nB: The second photo features a tranquil sunset over a calm lake.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there are no people in the second photo\nB: The second photo features a tranquil sunset over a calm lake.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_99_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_99_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A red car is parked in front of a brick building.\nB: on the stairs case there is two people walking up", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A red car is parked in front of a brick building.\nB: on the stairs case there is two people walking up", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_100_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_100_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: In the left picture, a group of animals is roaming freely in the wilderness, while in the right picture, a lone figure stands on a mountaintop, gazing into the distance.\nB: there is more people standing in the right picture then the left", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: In the left picture, a group of animals is roaming freely in the wilderness, while in the right picture, a lone figure stands on a mountaintop, gazing into the distance.\nB: there is more people standing in the right picture then the left", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_101_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_101_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A lively marketplace with vendors selling colorful wares under the bright summer sun.\nB: the number of people congregating in the group has gone down", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A lively marketplace with vendors selling colorful wares under the bright summer sun.\nB: the number of people congregating in the group has gone down", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_102_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_102_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The colorful flowers bloom in the garden, attracting bees and butterflies.\nB: the people get into a tight group to have a conversation", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The colorful flowers bloom in the garden, attracting bees and butterflies.\nB: the people get into a tight group to have a conversation", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_103_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_103_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the people walking around in the lot have left\nB: The colorful balloons are slowly deflating as they hang on the back of the chairs.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: the people walking around in the lot have left\nB: The colorful balloons are slowly deflating as they hang on the back of the chairs.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_104_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_104_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The image appears to be taken during the evening, with a reddish hue dominating the scene.\nB: there are less people and they have moved to the left", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The image appears to be taken during the evening, with a reddish hue dominating the scene.\nB: there are less people and they have moved to the left", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_105_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_105_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The sunset painted the sky with vibrant streaks of purple and orange, creating a mesmerizing backdrop for the tranquil lake.\nB: there are now two people there", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The sunset painted the sky with vibrant streaks of purple and orange, creating a mesmerizing backdrop for the tranquil lake.\nB: there are now two people there", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_106_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_106_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: in the before image there are 4 people visible 2 by the car and 2 by the cones while in the after image there are 6 people visible 3 at the cones 1 at the car and 3 leaving the scene\nB: The images show a transformation from day to night, with the sky changing from blue to vibrant purple, and the surroundings shifting from bright to dark.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: in the before image there are 4 people visible 2 by the car and 2 by the cones while in the after image there are 6 people visible 3 at the cones 1 at the car and 3 leaving the scene\nB: The images show a transformation from day to night, with the sky changing from blue to vibrant purple, and the surroundings shifting from bright to dark.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_107_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_107_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The after picture shows a small park with several benches and a fountain in the center.\nB: a group of young men has gathered in the after picture on the sidewalk", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The after picture shows a small park with several benches and a fountain in the center.\nB: a group of young men has gathered in the after picture on the sidewalk", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_108_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_108_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: right image shows three vehicles traveling in same direction while left image shows to vehicles traveling in opposite directions\nB: The images depict various patterns of traffic flow in urban and rural settings.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: right image shows three vehicles traveling in same direction while left image shows to vehicles traveling in opposite directions\nB: The images depict various patterns of traffic flow in urban and rural settings.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_109_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_109_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The image captures a surreal landscape with swirling colors and distorted shapes.\nB: there are no visible people in the frame", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The image captures a surreal landscape with swirling colors and distorted shapes.\nB: there are no visible people in the frame", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_110_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_110_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: In the first picture, there is a small child playing with a dog while in the second picture, there are several people riding bikes.\nB: in the second picture there is one big group of people as compare to two groups in the first picture", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: In the first picture, there is a small child playing with a dog while in the second picture, there are several people riding bikes.\nB: in the second picture there is one big group of people as compare to two groups in the first picture", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_111_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_111_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there are people walking towards the sidewalk\nB: The vibrant colors of the street art create a dynamic contrast against the dull cityscape.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: there are people walking towards the sidewalk\nB: The vibrant colors of the street art create a dynamic contrast against the dull cityscape.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_112_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_112_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the picture on the right has more people\nB: The composition of the image is dominated by bright colors and geometric shapes, creating a visual contrast between order and chaos.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: the picture on the right has more people\nB: The composition of the image is dominated by bright colors and geometric shapes, creating a visual contrast between order and chaos.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_113_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_113_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: people are grouped closer together\nB: A clear blue sky with white fluffy clouds", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: people are grouped closer together\nB: A clear blue sky with white fluffy clouds", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_114_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_114_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there are people now where it was empty before\nB: The sun was shining brightly and casting long shadows across the scene.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there are people now where it was empty before\nB: The sun was shining brightly and casting long shadows across the scene.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_115_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_115_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there is a person in a blue shirt and black pants walking on the left towards the bottom of the photo\nB: A colorful parrot is perched on a branch with vibrant feathers.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: there is a person in a blue shirt and black pants walking on the left towards the bottom of the photo\nB: A colorful parrot is perched on a branch with vibrant feathers.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_116_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_116_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A colorful display of art and nature intertwining.\nB: group is spreaded", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: A colorful display of art and nature intertwining.\nB: group is spreaded", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_117_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_117_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The images depict a busy city intersection with cars and buildings, and a park with a fountain, and people exercising\nB: there is one person in the first picture and in the second there is two more who are walking and then another towards the bottom of the screen and the man in the first picture has moved towards the car", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The images depict a busy city intersection with cars and buildings, and a park with a fountain, and people exercising\nB: there is one person in the first picture and in the second there is two more who are walking and then another towards the bottom of the screen and the man in the first picture has moved towards the car", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_118_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_118_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there is many more people that are in the 2nd picture\nB: The second image features a vibrant display of colorful flowers in a field, with a clear blue sky in the background.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there is many more people that are in the 2nd picture\nB: The second image features a vibrant display of colorful flowers in a field, with a clear blue sky in the background.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_119_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_119_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The image appears to have been taken in the evening, but the lighting suggests it is morning.\nB: the group near the center of the image are now moved slightly from their original positions but still in roughly the same places", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The image appears to have been taken in the evening, but the lighting suggests it is morning.\nB: the group near the center of the image are now moved slightly from their original positions but still in roughly the same places", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_120_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_120_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The truck on the left is transporting a load of colorful balloons.\nB: truck on right is now where cars were", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The truck on the left is transporting a load of colorful balloons.\nB: truck on right is now where cars were", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_121_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_121_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there are two people by the poles\nB: The sunrise casts a warm glow over the serene landscape", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: there are two people by the poles\nB: The sunrise casts a warm glow over the serene landscape", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_122_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_122_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: after image does not show three people and group by the tree are in a different position\nB: The photograph captures a serene mountain landscape with a winding river cutting through the valley.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: after image does not show three people and group by the tree are in a different position\nB: The photograph captures a serene mountain landscape with a winding river cutting through the valley.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_123_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_123_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there is a car exiting the lot on the far left\nB: A group of people are playing frisbee in the park", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: there is a car exiting the lot on the far left\nB: A group of people are playing frisbee in the park", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_124_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_124_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: three people are standing in the parking lot before and are not after\nB: A flock of seagulls is flying over a quiet beach on a sunny day.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: three people are standing in the parking lot before and are not after\nB: A flock of seagulls is flying over a quiet beach on a sunny day.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_125_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_125_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The trees are greener and taller.\nB: people are more spread out", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The trees are greener and taller.\nB: people are more spread out", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_126_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_126_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The person on the hand truck is balancing on one foot while juggling colorful balls.\nB: the person pulling a hand truck has moved position", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The person on the hand truck is balancing on one foot while juggling colorful balls.\nB: the person pulling a hand truck has moved position", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_127_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_127_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: bus is driving down lot\nB: A group of people are having a picnic near a lake", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: bus is driving down lot\nB: A group of people are having a picnic near a lake", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_128_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_128_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: to the upper right of the parking spaces the group of people seems to have moved or dispersed with only 3 people left who have moved to the left\nB: In the top left corner of the image, a flock of seagulls appears to be gathering and organizing themselves into a precise formation.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: to the upper right of the parking spaces the group of people seems to have moved or dispersed with only 3 people left who have moved to the left\nB: In the top left corner of the image, a flock of seagulls appears to be gathering and organizing themselves into a precise formation.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_129_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_129_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the position of the trolley has been changed\nB: A group of people are having a discussion near the trolley.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: the position of the trolley has been changed\nB: A group of people are having a discussion near the trolley.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_130_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_130_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The image depicts a colorful mural on the side of a building with various geometric patterns and shapes.\nB: the after image shows a group of four people on a side walk with a man in a white t shirt and jeans walking towards them", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The image depicts a colorful mural on the side of a building with various geometric patterns and shapes.\nB: the after image shows a group of four people on a side walk with a man in a white t shirt and jeans walking towards them", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_131_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_131_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: In the after picture, a colorful hot air balloon is flying over the landscape.\nB: there is a care driving towards the picture in the before picture", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: In the after picture, a colorful hot air balloon is flying over the landscape.\nB: there is a care driving towards the picture in the before picture", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_132_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_132_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: a group of people slightly spread out as another member joins the group\nB: A crowd of people gathered around a performance on a stage", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: a group of people slightly spread out as another member joins the group\nB: A crowd of people gathered around a performance on a stage", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_133_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_133_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The sky is a beautiful shade of purple, and the trees are covered in glittering fairy lights.\nB: there are more cars and one more person", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The sky is a beautiful shade of purple, and the trees are covered in glittering fairy lights.\nB: there are more cars and one more person", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_134_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_134_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of people hiking in the mountains\nB: silver care with trunk open", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of people hiking in the mountains\nB: silver care with trunk open", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_135_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_135_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the car has driven forward a little near the intersection\nB: A group of pedestrians are walking on the sidewalk near a park", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: the car has driven forward a little near the intersection\nB: A group of pedestrians are walking on the sidewalk near a park", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_136_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_136_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The images showcase a scenic countryside landscape with a beautiful river flowing through the valley.\nB: the before photograph appears to have three people whereas the after photograph appears to have two people", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The images showcase a scenic countryside landscape with a beautiful river flowing through the valley.\nB: the before photograph appears to have three people whereas the after photograph appears to have two people", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_137_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_137_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the people has change position\nB: The colors in the image are inverted", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: the people has change position\nB: The colors in the image are inverted", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_138_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_138_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The left picture features a colorful garden, while the right picture showcases a bustling city street.\nB: on the left picture there are couple more people than what s showing on the right", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The left picture features a colorful garden, while the right picture showcases a bustling city street.\nB: on the left picture there are couple more people than what s showing on the right", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_139_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_139_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: in the right hand image the group of people has moved further towards the left side of the sidewalk\nB: In the left image, there is a large bird flying in the background while the group of people is walking on the street.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: in the right hand image the group of people has moved further towards the left side of the sidewalk\nB: In the left image, there is a large bird flying in the background while the group of people is walking on the street.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_140_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_140_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: black car is gone\nB: The moon is shining brightly in the night sky", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: black car is gone\nB: The moon is shining brightly in the night sky", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_141_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_141_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: six people are gathered in the parking lot on the right image\nB: The sunset casts a warm orange glow over the city skyline in the left image.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: six people are gathered in the parking lot on the right image\nB: The sunset casts a warm orange glow over the city skyline in the left image.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_142_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_142_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: two people are shown in the after image\nB: A beautiful sunset is reflected in the still waters of a lake, creating a stunning mirror image.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: two people are shown in the after image\nB: A beautiful sunset is reflected in the still waters of a lake, creating a stunning mirror image.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_143_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_143_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The color of the sky in the before photo is different from the after photo.\nB: there is a car driving in the before photo and no car in the after photo", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The color of the sky in the before photo is different from the after photo.\nB: there is a car driving in the before photo and no car in the after photo", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_144_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_144_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there were three people standing in the street now there are two people behind the yellow poles\nB: The orange cat is sitting on the windowsill while the sun sets over the distant mountains.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there were three people standing in the street now there are two people behind the yellow poles\nB: The orange cat is sitting on the windowsill while the sun sets over the distant mountains.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_145_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_145_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A large spaceship is preparing for a launch\nB: four new people have arrived", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A large spaceship is preparing for a launch\nB: four new people have arrived", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_146_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_146_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the group of people have grouped closer together\nB: The group of people is standing in a circle, each wearing a different colored hat.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: the group of people have grouped closer together\nB: The group of people is standing in a circle, each wearing a different colored hat.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_147_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_147_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A colorful mural with abstract shapes and patterns\nB: 6 people in picture", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A colorful mural with abstract shapes and patterns\nB: 6 people in picture", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_148_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_148_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the motor cycle is in a different area now near the light blue truck\nB: In front of the fire station, a group of people are practicing tai chi in the park.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: the motor cycle is in a different area now near the light blue truck\nB: In front of the fire station, a group of people are practicing tai chi in the park.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_149_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_149_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: people have changed initial location in the group on the side walk\nB: A dog is chasing a butterfly in a sunny field.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: people have changed initial location in the group on the side walk\nB: A dog is chasing a butterfly in a sunny field.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_150_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_150_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the black van is not there anymore\nB: A group of colorful balloons is floating in the sky", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: the black van is not there anymore\nB: A group of colorful balloons is floating in the sky", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_151_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_151_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the people have shifted positions\nB: The colors have inverted and become brighter", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: the people have shifted positions\nB: The colors have inverted and become brighter", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_152_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_152_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the people in the image on the left seem to have moved positions in the right image\nB: The lighting in both images seems to have been altered using filters to create a different atmosphere.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: the people in the image on the left seem to have moved positions in the right image\nB: The lighting in both images seems to have been altered using filters to create a different atmosphere.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_153_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_153_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A colorful butterfly landed on the flower in the center of the image, showcasing its vibrant wings.\nB: a person got out of the silver car in the right foreground of the picture and opened it s trunk", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A colorful butterfly landed on the flower in the center of the image, showcasing its vibrant wings.\nB: a person got out of the silver car in the right foreground of the picture and opened it s trunk", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_154_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_154_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there are more people now\nB: The colors in the image seem to have a different hue than in real life.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there are more people now\nB: The colors in the image seem to have a different hue than in real life.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_155_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_155_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of people are having a picnic on the beach.\nB: the person is walking on the grass", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of people are having a picnic on the beach.\nB: the person is walking on the grass", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_156_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_156_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A flock of birds is perched on the yellow poles.\nB: the group of people by the yellow poles are no longer there", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: A flock of birds is perched on the yellow poles.\nB: the group of people by the yellow poles are no longer there", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_157_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_157_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of people are having a picnic near the lake.\nB: there is a car in the back that is no longer there", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of people are having a picnic near the lake.\nB: there is a car in the back that is no longer there", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_158_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_158_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: less people\nB: Abandoned amusement park", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: less people\nB: Abandoned amusement park", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_159_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_159_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there are three guys in the first image and two in the second and they moved\nB: The first image features a colorful abstract painting, and the second image shows a close-up of a flower petal.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: there are three guys in the first image and two in the second and they moved\nB: The first image features a colorful abstract painting, and the second image shows a close-up of a flower petal.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_160_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_160_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there is a red car in the back now\nB: The image features a serene lake surrounded by tall mountains and a clear blue sky overhead.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there is a red car in the back now\nB: The image features a serene lake surrounded by tall mountains and a clear blue sky overhead.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_161_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_161_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The after image features a vibrant color palette with dynamic patterns and shapes, invoking a sense of energy and movement.\nB: there are more people in the after image than the before", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The after image features a vibrant color palette with dynamic patterns and shapes, invoking a sense of energy and movement.\nB: there are more people in the after image than the before", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_162_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_162_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The image features a bustling cityscape with tall buildings and busy traffic on the streets.\nB: the picture on the right contains two people near the black suv near the bottom right corner", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The image features a bustling cityscape with tall buildings and busy traffic on the streets.\nB: the picture on the right contains two people near the black suv near the bottom right corner", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_163_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_163_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A colorful parrot is perched on a branch in a lush green jungle.\nB: the positions of the people standing in the group have changed", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A colorful parrot is perched on a branch in a lush green jungle.\nB: the positions of the people standing in the group have changed", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_164_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_164_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there appears to be no difference\nB: The image shows a serene countryside with a clear blue sky and lush green trees.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there appears to be no difference\nB: The image shows a serene countryside with a clear blue sky and lush green trees.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_165_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_165_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The building in the first image appears to have a sloped roof with large windows, and in the second image, there are several people walking by a brick building.\nB: there is not a person walking by the ivory colored building in the second picture", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The building in the first image appears to have a sloped roof with large windows, and in the second image, there are several people walking by a brick building.\nB: there is not a person walking by the ivory colored building in the second picture", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_166_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_166_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: i do not see anything different within these pictures\nB: A picturesque scene of a bustling city street filled with people and lively activity.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: i do not see anything different within these pictures\nB: A picturesque scene of a bustling city street filled with people and lively activity.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_167_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_167_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: number of kids and their location\nB: The painting depicts a colorful garden filled with vibrant flowers and exotic animals.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: number of kids and their location\nB: The painting depicts a colorful garden filled with vibrant flowers and exotic animals.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_168_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_168_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The image shows a beautiful landscape with colorful flowers and a clear blue sky.\nB: there are more visible people in the frame", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The image shows a beautiful landscape with colorful flowers and a clear blue sky.\nB: there are more visible people in the frame", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_169_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_169_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The after image features a group of people engaging in a lively discussion.\nB: a lot less guys in the after image", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The after image features a group of people engaging in a lively discussion.\nB: a lot less guys in the after image", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_170_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_170_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: no differences\nB: A colorful bouquet of flowers sitting on a table", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: no differences\nB: A colorful bouquet of flowers sitting on a table", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_171_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_171_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of bicyclists are passing by while a street musician performs on the corner.\nB: the car on the right is slightly further away and there is no pedestrian in the street", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of bicyclists are passing by while a street musician performs on the corner.\nB: the car on the right is slightly further away and there is no pedestrian in the street", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_172_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_172_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of friends enjoying a picnic in the park\nB: car driving down the street", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of friends enjoying a picnic in the park\nB: car driving down the street", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_173_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_173_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The parking lot is filled with colorful balloons floating in the air.\nB: there are no people walking in the parking lot", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The parking lot is filled with colorful balloons floating in the air.\nB: there are no people walking in the parking lot", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_174_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_174_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The people in the image are all wearing green shirts and blue jeans.\nB: there are now six people in the group instead of two", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The people in the image are all wearing green shirts and blue jeans.\nB: there are now six people in the group instead of two", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_175_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_175_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The images show different weather conditions, with one being sunny and the other being cloudy.\nB: there are more people in the right image", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The images show different weather conditions, with one being sunny and the other being cloudy.\nB: there are more people in the right image", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_176_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_176_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of birds are flying above the building\nB: the people are now next to the building", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of birds are flying above the building\nB: the people are now next to the building", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_177_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_177_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: two people not walking in front of picture\nB: A mesmerizing sunset over a calm lake with a vibrant display of colors reflecting in the water.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: two people not walking in front of picture\nB: A mesmerizing sunset over a calm lake with a vibrant display of colors reflecting in the water.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_178_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_178_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: Several colorful balloons floating in the sky\nB: group of pepole walking", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: Several colorful balloons floating in the sky\nB: group of pepole walking", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_179_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_179_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The photo shows a colorful landscape with a tranquil lake, surrounded by tall trees and a clear blue sky.\nB: there is one more person in the after photo", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The photo shows a colorful landscape with a tranquil lake, surrounded by tall trees and a clear blue sky.\nB: there is one more person in the after photo", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_180_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_180_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A colorful parrot is perched on a branch in a lush, tropical rainforest.\nB: the people are not there anymore in the front", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: A colorful parrot is perched on a branch in a lush, tropical rainforest.\nB: the people are not there anymore in the front", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_181_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_181_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there is more people\nB: The color palette is very vibrant and cheerful, with a mix of bold and pastel shades.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: there is more people\nB: The color palette is very vibrant and cheerful, with a mix of bold and pastel shades.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_182_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_182_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A colorful hot air balloon is floating in the sky.\nB: theres a group of people", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: A colorful hot air balloon is floating in the sky.\nB: theres a group of people", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_183_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_183_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: One image depicts a bustling city skyline at sunset, while the other shows a tranquil beach scene with palm trees and a vibrant ocean.\nB: these images are the same", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: One image depicts a bustling city skyline at sunset, while the other shows a tranquil beach scene with palm trees and a vibrant ocean.\nB: these images are the same", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_184_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_184_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: the group of people in the parking lot have moved out of view for the most part\nB: The colorful balloons are floating high in the sky, creating a beautiful display of color and movement.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: the group of people in the parking lot have moved out of view for the most part\nB: The colorful balloons are floating high in the sky, creating a beautiful display of color and movement.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_185_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_185_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A group of rare flowers is blooming in the field.\nB: the people are less visible", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A group of rare flowers is blooming in the field.\nB: the people are less visible", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_186_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_186_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The sun is setting over the horizon, casting a beautiful orange glow across the landscape.\nB: there are more people near each other now", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The sun is setting over the horizon, casting a beautiful orange glow across the landscape.\nB: there are more people near each other now", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_187_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_187_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: people have changed initial location\nB: The landscape has been altered by recent weather patterns.", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: people have changed initial location\nB: The landscape has been altered by recent weather patterns.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_188_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_188_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there is an extra red car in picture 2\nB: A group of people are gathered around a large bonfire in the middle of the field.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there is an extra red car in picture 2\nB: A group of people are gathered around a large bonfire in the middle of the field.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_189_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_189_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: bigger group of pepole\nB: A colorful hot air balloon floating over a mountain range", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: bigger group of pepole\nB: A colorful hot air balloon floating over a mountain range", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_190_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_190_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: In the second image, a landscape painted with bright, vibrant colors is the focal point.\nB: th there is less people in the 2nd pic", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: In the second image, a landscape painted with bright, vibrant colors is the focal point.\nB: th there is less people in the 2nd pic", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_191_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_191_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: A cat sleeping on a windowsill\nB: people on the stairs", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: A cat sleeping on a windowsill\nB: people on the stairs", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_192_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_192_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The purple and grey individuals are participating in a synchronized dance routine.\nB: the individual in purple has traded spots with the one in grey", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: The purple and grey individuals are participating in a synchronized dance routine.\nB: the individual in purple has traded spots with the one in grey", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_193_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_193_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The color of the sky is bright pink.\nB: there is more people", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The color of the sky is bright pink.\nB: there is more people", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_194_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_194_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there were about 5 boys congregated in the parking lot before and now there seems to be 7 of them\nB: The sun was setting behind the skyscrapers, casting a warm orange glow over the city streets.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there were about 5 boys congregated in the parking lot before and now there seems to be 7 of them\nB: The sun was setting behind the skyscrapers, casting a warm orange glow over the city streets.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_195_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_195_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The yellow taxi drove away at high speed\nB: red bus is gone", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The yellow taxi drove away at high speed\nB: red bus is gone", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_196_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_196_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: there are 2 people on the picture on the left but at least 6 on the right\nB: The picture features a landscape with a large body of water on the left and a dense forest on the right.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: there are 2 people on the picture on the left but at least 6 on the right\nB: The picture features a landscape with a large body of water on the left and a dense forest on the right.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_197_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_197_1.png" + ], + "output": "A" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: The scene is bathed in a warm, orange glow, with the setting sun creating long shadows and a feeling of tranquility.\nB: there is more people", + "question": "The following is a description of the differences between two pictures. Which one is incorrect?", + "context": "Select from the following choices.\nA: The scene is bathed in a warm, orange glow, with the setting sun creating long shadows and a feeling of tranquility.\nB: there is more people", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_198_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_198_1.png" + ], + "output": "B" + }, + { + "task": "spot_the_diff", + "visual_input_component": "Video image or Natural image", + "source": "spot_the_diff", + "options": "A: in the first photo you see two groups of men in the after you can only see the top of one persons head and the rest of the men are gone\nB: The photo captures a beautiful sunset over the ocean with vibrant colors reflecting in the water.", + "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", + "context": "Select from the following choices.\nA: in the first photo you see two groups of men in the after you can only see the top of one persons head and the rest of the men are gone\nB: The photo captures a beautiful sunset over the ocean with vibrant colors reflecting in the water.", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_199_0.png", + "../MMIU-Benchmark/spot_the_diff/spot_the_diff_199_1.png" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The spoon on the left is made of porcelain, the spoon on the right is made of stainless steel. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_0_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_0_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "At least one picture has fireworks in it. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_1_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_1_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture has exactly three spoons, while the left picture has no more than two. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_2_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_2_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The picture on the left shows potatoes that have not yet been cooked, while the potatoes on the right have already been fried into fries. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_3_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_3_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures only has dark green broccoli, while the other picture has both white and dark green broccoli. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_4_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_4_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one of the pictures, there is exactly one mouse, while in another picture, there are at least three or more. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_5_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_5_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both pictures contain longan, and there are people in the right picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_6_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_6_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, there is exactly one prominent pavilion, and on the pavilion in the left picture, there is a red plaque. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_7_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_7_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is no one in the living room in the left picture, while in the right picture, someone is watching TV. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_8_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_8_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture has exactly one cow, while the left picture has more than one cow and they are light-colored. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_9_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_9_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is only one egret in each of the two pictures. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_10_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_10_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, there is exactly one person using a plow to work with a cow. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_11_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_11_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, there is at least one sickle. In one of the pictures, the sickle is placed on a backpack or on the grass. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_12_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_12_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures has exactly two Peking Opera actors, while the other picture has at least four actors. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_13_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_13_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has Buddhist-related statues, while the right picture has people conducting ceremonies. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_14_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_14_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has hanging paper cuttings, while the right picture has red paper cuttings. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_15_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_15_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has hanging paper cuttings, while the right picture has red paper cuttings. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_16_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_16_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, there is exactly one panda, and neither of them is moving. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_17_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_17_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The lotus flowers in both pictures are blooming, and there is exactly one purplish-red lotus flower in the left picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_18_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_18_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture has at least two dark-colored cows, while the left picture has one or more cows. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_19_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_19_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The porridge in the right picture has added fruit, and there is no spoon in the porridge in the left picture. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_20_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_20_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, there is exactly one sparrow, while in the right picture, there are exactly two. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_21_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_21_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has fried rice, while the right picture does not have fried rice or similar dishes. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_22_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_22_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The T-shirt in the right picture is pure white, while the T-shirt in the left picture has Chinese characters on it. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_23_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_23_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both pictures contain cut open cantaloupes. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_24_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_24_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "At least one picture has milk placed in the refrigerator. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_25_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_25_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, there is exactly one woman playing the guzheng, while in the right picture, there are some guzhengs but no one. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_26_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_26_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, there are chicks that have not yet grown up, and in the right picture, there are at least three or more chickens. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_27_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_27_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture includes not only willow trees but also a lake, and the willow leaves in the left picture are a bit dark in color. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_28_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_28_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has a lot of dumplings, the right picture is some pan-fried dumplings. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_29_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_29_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, there is exactly one hummingbird, and they are both flying in the same direction. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_30_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_30_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, there is only one person wearing a suit, either a man or a woman, and the one in the right picture is a man. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_31_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_31_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has several blooming chrysanthemums, the chrysanthemum petals in the right picture are yellowish or orange. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_32_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_32_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The crows in both pictures are not flying with their wings spread, and at least one of the pictures has five crows. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_33_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_33_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture only has one obvious fish, while the right picture has many swimming fish. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_34_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_34_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, there is exactly one hummingbird, and they are both flying in the same direction. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_35_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_35_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The noodles in both pictures are placed in plates or bowls, and there are also chopsticks in the left picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_36_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_36_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one of the pictures, there is a ginkgo tree and a deep green lawn, while in the other picture, there is a ginkgo tree but the ground is not visible. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_37_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_37_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "On the left, there are several lilies planted together, and on the other side, there are two lilies planted together. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_38_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_38_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The lilies in the right picture are not just white, the lilies in the left picture are blooming. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_39_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_39_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures features both fish and non-fish animals together. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_40_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_40_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture contains lychees and an adult, while the right picture shows lychees and a price tag. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_41_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_41_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures has exactly one pair of chopsticks placed on the bowl, while the other picture has more than one pair of light-colored chopsticks. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_42_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_42_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are two adult men in suits in the left picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_43_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_43_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture shows a hand holding a ballpoint pen, while the left picture only has a ballpoint pen. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_44_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_44_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right picture, there is exactly one coffee cup, and in the left picture, the coffee cup is placed on a saucer. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_45_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_45_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right picture, besides broccoli, there are also other red fruits and vegetables, but the left picture only has broccoli. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_46_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_46_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has Buddhist-related statues, while the right picture has people conducting ceremonies. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_47_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_47_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is exactly one football in each of the two pictures, and there are exactly two football players in one of the pictures. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_48_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_48_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both pictures contain villas, but no cars or people. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_49_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_49_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has exactly two Peking Opera actors, while the right picture has at least five. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_50_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_50_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The person in the left picture is wearing a suit and holding a computer or bag, while the person in the right picture is also wearing a suit but is not holding any noticeable items. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_51_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_51_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Among the two pictures of dumplings, only one is definitely placed on a square plate. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_52_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_52_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one of the pictures, there are two people in the farmland, while in another picture, there is exactly one person working with a hoe. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_53_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_53_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture only has a blackboard with no people, while the left picture has both a blackboard and people. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_54_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_54_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The peony flower in the left picture occupies at least half of the area, while the peony flower in the right picture does not. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_55_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_55_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one picture, you can only see one person playing table tennis, while in another picture, you can see more than one person. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_56_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_56_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture precisely has a child with calligraphy, while the left picture is of a person writing calligraphy. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_57_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_57_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right picture, you can see a prominent main pine tree, while in the left picture, the pine tree has the sky as its background. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_58_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_58_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "On the left is a separate bowl of porridge, and in the porridge on the right there is a spoon. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_59_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_59_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures shows someone interacting with a hummingbird, while another picture shows a hummingbird trying to eat something red. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_60_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_60_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has people and paper cuttings, while the right picture contains paper cuttings of the Chinese character for \"spring\". Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_61_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_61_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are at least two or more people in the living room in both pictures. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_62_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_62_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left image, you can see drums with eight or more sides, while the drum surfaces in the right image are gold or brown. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_63_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_63_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right picture, there is only one blooming rose-colored orchid, while in the left picture, there are several blooming orchids. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_64_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_64_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right picture, there is exactly one hummingbird facing right, and in the left picture, there is exactly one hummingbird facing left. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_65_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_65_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, you can see many obvious coffee beans, while the right picture has a coffee cup and pot. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_66_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_66_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one of the pictures, there is a clear and complete rose, while the other picture does not have one. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_67_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_67_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one picture, the porridge is yellow, while in another picture, it is either white porridge or purple rice porridge. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_68_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_68_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture has green apples, while the left picture has red apples. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_69_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_69_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures contains an image of firecrackers, while the other picture has wine or a wine glass. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_70_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_70_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures has a deep blue sky behind the pine tree, while the other picture only has the pine tree. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_71_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_71_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, there are a farmer, a plow, and a cow, while the right picture only has a plow. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_72_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_72_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right picture, you can see decorations with the Chinese character for \"luck\", while the left picture shows a street during Chinese New Year. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_73_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_73_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has Terracotta Warriors and people, while the right picture only has Terracotta Warriors. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_74_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_74_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "On the left, a boy is playing the erhu, and on the right, a girl is playing the erhu. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_75_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_75_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the cups of coffee is regular coffee without any distinct pattern, while the other one is coffee with latte art visible. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_76_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_76_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, you can see tea in the teacup, but in the left picture, there is only a teacup and no teapot. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_77_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_77_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures presents a courtyard house from a bird's-eye view, while the other picture features a courtyard house and green trees. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_78_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_78_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures is of a newborn puppy, and another picture features a spotted dog. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_79_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_79_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, there is a statue of Buddha, while in the right picture, there are some people wearing Buddhist attire. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_80_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_80_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both bubble teas in the two pictures have pearls added, and there is exactly one cup in the left picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_81_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_81_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The total number of cows in the two pictures exceeds five. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_82_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_82_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, there are people wearing red cheongsams, and in one picture, there is more than one person. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_83_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_83_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures contains an image of firecrackers, while the other picture has wine or a wine glass. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_84_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_84_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has Terracotta Warriors and people, while the right picture only has Terracotta Warriors. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_85_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_85_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both pictures contain Fujian Tulou, and one of them happens to have only one dome-shaped Fujian Tulou. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_86_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_86_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture has green apples, while the left picture has red apples. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_87_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_87_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is no one in the living room in the left picture, while in the right picture, there are people watching TV. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_88_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_88_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one picture, the birch tree is clearly sawed off, while in the other it is not. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_89_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_89_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one picture, a yellow note appeared and another one is about the Qingming Festival activities. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_90_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_90_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture is a shadow puppet master, and the right picture is a shadow puppet stage. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_91_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_91_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right picture, you can see at least one blue or yellow orchid, while the color of the orchid in the left picture is lighter. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_92_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_92_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right picture, you can see a prominent pine tree, and in the left picture, the pine tree has the sky as a background. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_93_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_93_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The lotus flowers in both pictures are blooming, and there is exactly one purplish-red lotus flower in the left picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_94_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_94_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture has exactly one very obvious light-colored lotus, while the left picture has two obvious purple lotuses. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_95_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_95_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures shows fish and non-fish animals together. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_96_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_96_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is no one in the dining room in both pictures. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_97_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_97_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The picture on the left is a single guzheng, and the picture on the right is a guzheng sold in a music store. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_98_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_98_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, there are women wearing cheongsams, and there are at least three of them. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_99_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_99_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one picture, the birch tree is clearly sawed off, while in the other it is not. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_100_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_100_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures only has at most two prominent chrysanthemums, while the other picture has a lot of chrysanthemums. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_101_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_101_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both pictures contain stilt houses, and the left picture includes red lantern decorations. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_102_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_102_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures only has dark green broccoli, while the other picture has both white and dark green broccoli. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_103_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_103_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "On the left is a separate bowl of porridge, and in the porridge on the right, there is a spoon. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_104_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_104_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures only has at most two prominent chrysanthemums, while the other picture has a lot of chrysanthemums. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_105_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_105_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "On the left, a boy is playing the erhu, and on the right, a girl is playing the erhu. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_106_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_106_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is exactly one cup of milk tea in each of the two pictures. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_107_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_107_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both pictures contain a rake, and there are absolutely no parts of people or feet. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_108_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_108_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The picture on the left contains lanterns used during the Mid-Autumn Festival, and the picture on the right is of mooncakes eaten during the Mid-Autumn Festival. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_109_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_109_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, there is exactly one little egret, and the one in the right picture is not flying. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_110_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_110_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, there are exactly three people wearing similar Tang suits, while in the right picture, there are only one or two people. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_111_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_111_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The bok choy in one picture has already been stir-fried and served, while the one in the other picture has not. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_112_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_112_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the bedroom of the left picture, there are two people, while in the bedroom of the right picture, there is no one. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_113_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_113_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right picture, you can see a prominent pine tree, and in the left picture, the pine tree has the sky as its background. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_114_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_114_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The noodles in both pictures are placed in plates or bowls, and there are also chopsticks in the left picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_115_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_115_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one picture, the raw meat slices occupy a large area, while in the other picture, there are no obvious raw meat slices. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_116_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_116_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures shows someone interacting with a hummingbird, while the other picture is of a hummingbird attempting to eat something red. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_117_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_117_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right picture, there are many heart-shaped decorations, while the left picture has bear toys or lights. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_118_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_118_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "At least one of the two pictures contains a picture of a Mandarin Duck Pot. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_119_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_119_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both pictures are related to the bathroom, and there is a child in the right picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_120_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_120_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both of the two pictures contain at least five whole carrots, and they have not been cooked or juiced yet. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_121_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_121_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, there is exactly one woman playing the guzheng, while in the right picture, there are some guzhengs but no one. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_122_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_122_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both pictures contain stilt houses, and the picture on the left includes red lantern decorations. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_123_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_123_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has people and paper cuttings, while the right picture contains paper cuttings of the Chinese character for \"spring\". Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_124_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_124_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There are whole, uncut cantaloupes in both pictures. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_125_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_125_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The picture on the left is of exactly one person training to swim, while the other one is of multiple people swimming in a swimming pool. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_126_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_126_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, there are exactly three people wearing Tang suits, while the people in the right picture are wearing blue or red Tang suits. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_127_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_127_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, there is only one brush, while in the right picture, there are several. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_128_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_128_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture has a hand holding a ballpoint pen, while the left picture only has a ballpoint pen. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_129_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_129_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture has green apples, while the left picture has red apples. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_130_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_130_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures shows a lion dance performance during the Spring Festival, and another picture shows the character \"Fu\" hanging. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_131_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_131_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures is of a market street during the Spring Festival, and the other picture is of a Spring Festival couplet with the character \"Fu\" written on it. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_132_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_132_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right picture, there is exactly one woman playing the erhu, while in the left picture, the person playing the erhu seems to be performing. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_133_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_133_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, there is exactly one woman playing the pipa, while in the right picture, there are two or more children playing the pipa. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_134_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_134_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "At least one picture has a chick; at least one picture shows an adult chicken. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_135_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_135_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one of the pictures, there is a clear and complete rose, while the other picture does not have one. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_136_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_136_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both of the two pictures contain at least five whole carrots, and they have not been cooked or juiced yet. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_137_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_137_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right picture, there is exactly one person running, while in the left picture, there are at least two people. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_138_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_138_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The lilies in the right picture are not just white, the lilies in the left picture are blooming. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_139_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_139_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, you can see at least one real person (not a sculpture) playing the suona. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_140_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_140_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "There is no one in the bedroom in the left picture, while there is someone in the bedroom in the right picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_141_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_141_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Adding the two pictures together, there are at least five bottles of cola, and most of them are stored in the refrigerator or in cardboard packaging boxes. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_142_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_142_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has exactly one dog, while the right picture has exactly two. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_143_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_143_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both pictures clearly show green leaves and the trunk of the plane tree, not just the trunk or leaves. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_144_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_144_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one of the pictures, there is only an empty bowl, while in the other picture, the bowl is filled with things. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_145_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_145_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one picture, there is a blackboard but no people, while in the other picture, there is both a blackboard and people. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_146_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_146_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, there is a statue of Buddha, while in the right picture, there are some people wearing Buddhist robes. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_147_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_147_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both bubble teas in the two pictures have pearls added, and there is exactly one cup in the left picture. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_148_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_148_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right picture, you can see at least one blue or yellow orchid, while the color of the orchid in the left picture is lighter. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_149_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_149_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one picture, there is a blackboard but no people, while in the other picture, there is both a blackboard and people. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_150_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_150_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right picture, you can see decorations with the Chinese character for \"luck\", while the left picture shows a street during Chinese New Year. 
Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_151_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_151_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The crows in both pictures are not flying with their wings spread, and at least one of the pictures has at least five crows. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_152_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_152_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one picture, the porridge is yellow, while in another picture, it is either white porridge or purple rice porridge. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_153_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_153_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has exactly one brush, while the right picture has several brushes. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_154_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_154_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures shows a large soup spoon serving something or placed in the soup, while the other picture does not. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_155_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_155_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, there are exactly two people in the dining room, while in the right picture, the dining room is empty. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_156_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_156_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one of the pictures, there is a person playing the guzheng, while in another picture, there is at least one guzheng, but no one is playing it. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_157_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_157_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the right picture, besides the willow trees, you can also clearly see the greenery. However, there is no greenery in the left picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_158_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_158_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one picture, there is a hand holding scissors, while the other picture does not have this. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_159_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_159_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both pictures contain chalk but there are absolutely no people. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_160_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_160_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, you can see tea in the teacup, but in the left picture, there is only a teacup and no teapot. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_161_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_161_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, there is exactly one giant panda, and the giant panda in the right picture is not active. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_162_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_162_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both bubble teas in the two pictures have pearls added, and there is exactly one cup in the left picture. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_163_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_163_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both pictures feature the scene of a quadrangle courtyard, and one of them is an overhead view. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_164_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_164_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "At least one picture contains a chick; at least one picture shows an adult chicken. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_165_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_165_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, there is one or two crows with their wings folded, and they are not spreading their wings or flying. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_166_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_166_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one of the pictures, there are two people playing football together, and in another picture, there is one person playing football. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_167_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_167_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One of the pictures is of a market street during the Spring Festival, and the other picture is of a Spring Festival couplet with the character \"Fu\" written on it. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_168_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_168_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has exactly two Peking Opera actors, while the right picture has at least five. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_169_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_169_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one of the pictures, there is a ginkgo tree and a deep green lawn, while in the other picture, there is a ginkgo tree but the ground is not visible. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_170_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_170_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the picture on the right, there are chopsticks, and another picture is of noodles in a white bowl. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_171_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_171_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, there is more than one chicken, and the chickens in the left picture have different feather colors. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_172_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_172_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture includes not only willow trees but also a lake, and the leaves of the willow tree in the left picture are a darker shade. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_173_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_173_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one picture, there is exactly one pair of chopsticks placed on the bowl, while in the other picture, there is more than one pair of chopsticks. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_174_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_174_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The person in one of the pictures is serving the ball. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_175_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_175_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, there is at least one sickle. In one of the pictures, the sickle is placed on a backpack or on the grass. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_176_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_176_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both pictures contain apples, and there is no one in the right picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_177_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_177_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, there are several fish placed in a pile of ice cubes, while the right picture has many long, slender fish. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_178_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_178_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, you can see many obvious coffee beans, while the right picture has a coffee cup and pot. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_179_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_179_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture is a shadow puppet master, and the right picture is a shadow puppet stage. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_180_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_180_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In one of the pictures, the white rice is placed on a plate without any side dishes, while the rice in the other picture comes with side dishes and green vegetables. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_181_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_181_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the left picture, there is a statue of Buddha, while in the right picture, there are some people wearing Buddhist attire. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_182_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_182_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In the two pictures, only one picture has a knife on the cutting board, while the other picture has food. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_183_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_183_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has exactly one dog, while the right picture has exactly two. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_184_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_184_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The picture on the right has exactly three spoons, while the one on the left has no more than two. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_185_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_185_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "At least one picture has milk placed in the refrigerator. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_186_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_186_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "One picture shows a teapot, while another shows tea being poured into a cup. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_187_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_187_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, there is only one person blowing a suona, and the person in the right picture is facing to the left. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_188_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_188_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture has exactly two white lilies, while the left picture has exactly one obvious lily. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_189_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_189_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture shows exactly one child with calligraphy, while the left picture is of a person writing calligraphy. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_190_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_190_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture shows exactly one egret with its wings spread, while the egret in the left picture is not flying. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_191_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_191_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The left picture has a Coca-Cola can, while the Coca-Cola in the right picture is bottled. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_192_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_192_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "Both pictures contain a rake, and both pictures are completely devoid of people. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_193_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_193_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The ginkgo tree in the right picture is dark green, while the one in the left picture is brilliant colored. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_194_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_194_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "In both pictures, you can see at least one real person (not a sculpture) playing the suona. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_195_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_195_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture has at least two dark-colored cows, while the left picture has one or more cows. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_196_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_196_1.jpg" + ], + "output": "B" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The right picture has five crows, while the left picture only has two. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_197_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_197_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The scissors in the right picture are being used. Is it true or false? Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_198_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_198_1.jpg" + ], + "output": "A" + }, + { + "task": "visually_grounded_reasoning_marvl", + "visual_input_component": "natural image", + "source": "marvl", + "options": "A: true\nB: false", + "question": "Is it true or false?", + "context": "The bowl in the left picture has chopsticks on it, while the bowl in the right picture does not have chopsticks or a spoon. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", + "input_image_path": [ + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_199_0.jpg", + "../MMIU-Benchmark/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_199_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_0_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_0_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_1_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_1_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_2_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_2_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_3_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_3_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_4_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_4_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_5_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_5_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_6_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_6_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_7_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_7_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_8_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_8_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_9_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_9_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_10_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_10_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_11_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_11_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_12_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_12_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_13_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_13_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_14_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_14_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_15_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_15_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_16_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_16_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_17_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_17_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_18_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_18_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_19_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_19_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_20_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_20_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_21_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_21_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_22_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_22_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_23_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_23_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_24_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_24_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_25_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_25_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_26_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_26_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_27_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_27_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_28_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_28_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_29_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_29_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_30_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_30_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_31_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_31_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_32_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_32_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_33_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_33_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_34_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_34_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_35_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_35_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_36_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_36_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_37_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_37_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_38_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_38_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_39_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_39_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_40_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_40_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_41_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_41_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_42_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_42_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_43_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_43_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_44_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_44_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_45_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_45_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_46_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_46_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_47_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_47_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_48_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_48_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_49_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_49_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_50_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_50_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_51_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_51_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_52_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_52_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_53_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_53_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_54_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_54_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_55_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_55_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_56_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_56_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_57_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_57_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_58_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_58_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_59_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_59_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_60_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_60_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_61_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_61_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_62_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_62_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_63_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_63_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_64_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_64_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_65_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_65_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_66_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_66_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_67_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_67_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_68_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_68_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_69_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_69_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_70_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_70_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_71_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_71_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_72_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_72_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_73_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_73_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_74_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_74_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_75_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_75_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_76_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_76_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_77_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_77_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_78_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_78_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_79_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_79_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_80_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_80_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_81_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_81_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_82_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_82_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_83_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_83_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_84_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_84_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_85_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_85_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_86_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_86_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_87_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_87_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_88_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_88_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_89_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_89_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_90_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_90_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_91_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_91_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_92_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_92_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_93_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_93_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_94_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_94_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_95_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_95_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_96_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_96_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_97_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_97_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_98_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_98_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_99_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_99_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_100_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_100_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_101_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_101_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_102_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_102_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_103_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_103_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_104_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_104_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_105_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_105_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_106_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_106_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_107_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_107_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_108_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_108_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_109_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_109_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_110_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_110_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_111_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_111_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_112_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_112_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_113_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_113_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_114_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_114_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_115_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_115_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_116_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_116_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_117_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_117_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_118_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_118_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_119_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_119_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_120_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_120_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_121_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_121_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_122_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_122_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_123_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_123_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_124_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_124_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_125_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_125_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_126_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_126_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_127_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_127_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_128_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_128_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_129_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_129_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_130_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_130_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_131_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_131_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_132_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_132_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_133_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_133_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_134_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_134_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_135_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_135_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_136_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_136_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_137_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_137_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_138_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_138_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_139_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_139_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_140_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_140_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_141_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_141_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_142_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_142_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_143_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_143_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_144_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_144_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_145_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_145_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_146_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_146_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_147_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_147_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_148_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_148_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_149_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_149_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_150_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_150_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_151_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_151_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_152_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_152_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_153_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_153_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_154_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_154_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_155_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_155_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_156_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_156_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_157_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_157_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_158_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_158_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_159_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_159_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_160_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_160_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_161_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_161_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_162_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_162_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_163_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_163_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_164_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_164_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_165_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_165_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_166_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_166_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_167_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_167_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_168_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_168_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_169_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_169_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_170_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_170_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_171_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_171_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_172_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_172_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_173_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_173_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_174_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_174_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_175_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_175_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_176_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_176_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_177_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_177_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_178_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_178_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_179_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_179_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_180_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_180_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_181_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_181_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_182_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_182_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_183_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_183_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_184_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_184_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_185_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_185_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_186_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_186_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_187_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_187_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_188_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_188_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_189_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_189_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_190_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_190_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_191_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_191_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_192_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_192_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_193_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_193_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_194_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_194_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_195_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_195_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_196_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_196_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_197_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_197_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_198_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_198_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_misc210k", + "visual_input_component": "2 natural images", + "source": "misc210k", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_199_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_misc210k/semantic_correspondence_misc210k_199_1.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_0_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_0_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_0_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_0_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_0_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_1_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_1_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_1_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_1_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_1_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + 
"visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_2_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_2_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_2_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_2_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_2_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_3_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_3_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_3_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_3_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_3_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + 
"input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_4_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_4_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_4_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_4_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_4_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_5_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_5_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_5_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_5_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_5_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_6_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_6_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_6_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_6_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_6_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + 
"options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_7_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_7_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_7_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_7_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_7_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_8_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_8_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_8_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_8_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_8_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_9_0.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_9_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_9_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_9_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_9_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_10_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_10_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_10_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_10_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_10_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_11_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_11_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_11_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_11_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_11_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth 
image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_12_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_12_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_12_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_12_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_12_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_13_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_13_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_13_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_13_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_13_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_14_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_14_1.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_14_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_14_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_14_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_15_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_15_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_15_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_15_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_15_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_16_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_16_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_16_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_16_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_16_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve 
the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_17_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_17_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_17_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_17_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_17_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_18_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_18_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_18_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_18_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_18_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_19_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_19_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_19_2.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_19_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_19_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_20_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_20_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_20_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_20_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_20_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_21_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_21_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_21_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_21_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_21_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The 
query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_22_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_22_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_22_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_22_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_22_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_23_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_23_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_23_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_23_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_23_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_24_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_24_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_24_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_24_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_24_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_25_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_25_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_25_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_25_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_25_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_26_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_26_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_26_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_26_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_26_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select 
from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_27_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_27_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_27_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_27_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_27_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_28_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_28_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_28_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_28_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_28_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_29_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_29_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_29_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_29_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_29_4.jpg" + ], + 
"output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_30_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_30_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_30_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_30_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_30_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_31_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_31_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_31_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_31_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_31_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_32_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_32_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_32_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_32_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_32_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_33_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_33_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_33_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_33_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_33_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_34_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_34_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_34_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_34_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_34_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + 
"visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_35_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_35_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_35_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_35_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_35_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_36_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_36_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_36_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_36_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_36_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth 
image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_37_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_37_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_37_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_37_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_37_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_38_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_38_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_38_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_38_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_38_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_39_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_39_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_39_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_39_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_39_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + 
"source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_40_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_40_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_40_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_40_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_40_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_41_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_41_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_41_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_41_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_41_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/person_reid/person_reid_42_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_42_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_42_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_42_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_42_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_43_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_43_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_43_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_43_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_43_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_44_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_44_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_44_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_44_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_44_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: 
The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_45_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_45_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_45_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_45_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_45_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_46_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_46_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_46_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_46_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_46_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_47_0.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_47_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_47_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_47_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_47_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_48_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_48_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_48_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_48_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_48_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_49_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_49_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_49_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_49_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_49_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth 
image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_50_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_50_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_50_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_50_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_50_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_51_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_51_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_51_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_51_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_51_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_52_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_52_1.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_52_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_52_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_52_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_53_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_53_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_53_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_53_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_53_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_54_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_54_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_54_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_54_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_54_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve 
the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_55_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_55_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_55_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_55_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_55_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_56_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_56_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_56_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_56_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_56_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_57_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_57_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_57_2.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_57_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_57_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_58_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_58_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_58_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_58_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_58_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_59_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_59_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_59_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_59_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_59_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The 
query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_60_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_60_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_60_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_60_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_60_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_61_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_61_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_61_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_61_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_61_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_62_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_62_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_62_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_62_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_62_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_63_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_63_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_63_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_63_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_63_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_64_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_64_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_64_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_64_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_64_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select 
from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_65_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_65_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_65_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_65_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_65_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_66_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_66_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_66_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_66_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_66_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_67_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_67_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_67_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_67_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_67_4.jpg" + ], + 
"output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_68_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_68_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_68_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_68_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_68_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_69_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_69_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_69_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_69_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_69_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_70_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_70_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_70_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_70_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_70_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_71_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_71_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_71_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_71_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_71_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_72_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_72_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_72_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_72_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_72_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + 
"visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_73_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_73_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_73_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_73_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_73_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_74_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_74_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_74_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_74_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_74_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth 
image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_75_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_75_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_75_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_75_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_75_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_76_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_76_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_76_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_76_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_76_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_77_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_77_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_77_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_77_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_77_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + 
"source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_78_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_78_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_78_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_78_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_78_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_79_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_79_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_79_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_79_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_79_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + 
"../MMIU-Benchmark/person_reid/person_reid_80_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_80_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_80_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_80_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_80_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_81_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_81_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_81_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_81_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_81_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_82_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_82_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_82_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_82_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_82_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: 
The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_83_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_83_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_83_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_83_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_83_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_84_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_84_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_84_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_84_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_84_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_85_0.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_85_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_85_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_85_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_85_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_86_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_86_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_86_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_86_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_86_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_87_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_87_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_87_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_87_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_87_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth 
image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_88_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_88_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_88_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_88_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_88_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_89_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_89_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_89_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_89_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_89_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_90_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_90_1.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_90_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_90_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_90_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_91_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_91_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_91_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_91_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_91_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_92_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_92_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_92_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_92_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_92_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve 
the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_93_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_93_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_93_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_93_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_93_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_94_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_94_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_94_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_94_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_94_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_95_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_95_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_95_2.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_95_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_95_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_96_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_96_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_96_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_96_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_96_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_97_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_97_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_97_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_97_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_97_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The 
query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_98_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_98_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_98_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_98_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_98_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_99_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_99_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_99_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_99_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_99_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_100_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_100_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_100_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_100_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_100_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_101_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_101_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_101_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_101_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_101_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_102_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_102_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_102_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_102_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_102_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_103_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_103_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_103_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_103_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_103_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_104_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_104_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_104_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_104_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_104_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_105_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_105_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_105_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_105_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_105_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_106_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_106_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_106_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_106_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_106_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_107_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_107_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_107_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_107_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_107_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_108_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_108_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_108_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_108_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_108_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_109_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_109_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_109_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_109_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_109_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_110_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_110_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_110_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_110_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_110_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_111_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_111_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_111_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_111_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_111_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_112_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_112_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_112_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_112_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_112_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_113_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_113_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_113_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_113_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_113_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_114_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_114_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_114_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_114_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_114_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_115_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_115_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_115_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_115_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_115_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_116_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_116_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_116_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_116_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_116_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_117_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_117_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_117_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_117_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_117_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_118_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_118_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_118_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_118_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_118_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_119_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_119_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_119_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_119_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_119_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_120_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_120_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_120_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_120_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_120_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_121_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_121_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_121_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_121_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_121_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_122_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_122_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_122_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_122_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_122_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_123_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_123_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_123_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_123_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_123_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_124_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_124_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_124_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_124_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_124_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_125_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_125_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_125_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_125_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_125_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_126_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_126_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_126_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_126_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_126_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_127_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_127_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_127_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_127_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_127_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_128_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_128_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_128_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_128_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_128_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_129_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_129_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_129_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_129_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_129_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_130_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_130_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_130_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_130_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_130_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_131_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_131_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_131_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_131_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_131_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_132_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_132_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_132_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_132_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_132_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_133_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_133_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_133_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_133_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_133_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_134_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_134_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_134_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_134_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_134_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_135_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_135_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_135_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_135_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_135_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_136_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_136_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_136_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_136_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_136_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_137_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_137_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_137_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_137_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_137_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_138_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_138_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_138_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_138_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_138_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_139_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_139_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_139_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_139_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_139_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_140_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_140_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_140_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_140_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_140_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_141_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_141_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_141_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_141_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_141_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_142_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_142_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_142_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_142_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_142_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_143_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_143_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_143_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_143_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_143_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_144_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_144_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_144_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_144_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_144_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_145_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_145_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_145_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_145_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_145_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_146_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_146_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_146_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_146_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_146_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_147_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_147_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_147_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_147_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_147_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_148_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_148_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_148_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_148_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_148_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_149_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_149_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_149_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_149_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_149_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_150_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_150_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_150_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_150_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_150_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_151_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_151_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_151_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_151_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_151_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_152_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_152_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_152_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_152_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_152_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_153_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_153_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_153_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_153_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_153_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_154_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_154_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_154_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_154_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_154_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_155_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_155_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_155_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_155_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_155_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_156_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_156_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_156_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_156_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_156_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_157_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_157_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_157_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_157_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_157_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_158_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_158_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_158_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_158_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_158_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_159_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_159_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_159_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_159_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_159_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_160_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_160_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_160_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_160_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_160_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_161_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_161_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_161_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_161_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_161_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_162_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_162_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_162_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_162_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_162_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_163_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_163_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_163_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_163_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_163_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_164_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_164_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_164_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_164_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_164_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_165_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_165_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_165_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_165_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_165_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_166_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_166_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_166_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_166_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_166_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_167_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_167_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_167_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_167_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_167_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_168_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_168_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_168_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_168_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_168_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_169_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_169_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_169_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_169_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_169_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_170_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_170_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_170_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_170_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_170_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_171_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_171_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_171_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_171_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_171_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_172_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_172_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_172_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_172_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_172_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_173_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_173_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_173_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_173_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_173_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_174_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_174_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_174_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_174_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_174_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_175_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_175_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_175_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_175_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_175_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_176_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_176_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_176_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_176_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_176_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_177_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_177_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_177_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_177_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_177_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_178_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_178_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_178_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_178_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_178_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_179_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_179_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_179_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_179_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_179_4.jpg" + ], + "output": "B" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_180_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_180_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_180_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_180_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_180_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_181_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_181_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_181_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_181_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_181_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_182_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_182_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_182_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_182_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_182_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_183_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_183_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_183_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_183_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_183_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_184_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_184_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_184_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_184_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_184_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_185_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_185_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_185_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_185_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_185_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_186_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_186_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_186_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_186_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_186_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_187_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_187_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_187_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_187_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_187_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_188_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_188_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_188_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_188_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_188_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_189_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_189_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_189_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_189_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_189_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_190_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_190_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_190_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_190_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_190_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_191_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_191_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_191_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_191_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_191_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_192_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_192_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_192_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_192_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_192_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_193_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_193_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_193_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_193_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_193_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_194_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_194_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_194_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_194_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_194_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_195_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_195_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_195_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_195_3.jpg", + 
"../MMIU-Benchmark/person_reid/person_reid_195_4.jpg" + ], + "output": "A" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_196_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_196_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_196_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_196_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_196_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_197_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_197_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_197_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_197_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_197_4.jpg" + ], + "output": "D" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_198_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_198_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_198_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_198_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_198_4.jpg" + ], + "output": "C" + }, + { + "task": "person_reid", + "visual_input_component": "['natural_image']", + "source": "market_1501", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", + "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/person_reid/person_reid_199_0.jpg", + "../MMIU-Benchmark/person_reid/person_reid_199_1.jpg", + "../MMIU-Benchmark/person_reid/person_reid_199_2.jpg", + "../MMIU-Benchmark/person_reid/person_reid_199_3.jpg", + "../MMIU-Benchmark/person_reid/person_reid_199_4.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. 
You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_0_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_0_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_1_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_1_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_2_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_2_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_3_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_3_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_4_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_4_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_5_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_5_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_6_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_6_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_7_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_7_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_8_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_8_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_9_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_9_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_10_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_10_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_11_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_11_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_12_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_12_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_13_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_13_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_14_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_14_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_15_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_15_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_16_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_16_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_17_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_17_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_18_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_18_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_19_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_19_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_20_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_20_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_21_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_21_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_22_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_22_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_23_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_23_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_24_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_24_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_25_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_25_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_26_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_26_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_27_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_27_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_28_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_28_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_29_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_29_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_30_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_30_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_31_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_31_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_32_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_32_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_33_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_33_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_34_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_34_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_35_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_35_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_36_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_36_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_37_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_37_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_38_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_38_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_39_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_39_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_40_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_40_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_41_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_41_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_42_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_42_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_43_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_43_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_44_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_44_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_45_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_45_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_46_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_46_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_47_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_47_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_48_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_48_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_49_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_49_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_50_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_50_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_51_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_51_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_52_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_52_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_53_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_53_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_54_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_54_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_55_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_55_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_56_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_56_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_57_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_57_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_58_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_58_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_59_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_59_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_60_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_60_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_61_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_61_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_62_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_62_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_63_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_63_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_64_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_64_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_65_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_65_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_66_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_66_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_67_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_67_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_68_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_68_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_69_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_69_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_70_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_70_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_71_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_71_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_72_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_72_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_73_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_73_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_74_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_74_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_75_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_75_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_76_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_76_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_77_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_77_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_78_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_78_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_79_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_79_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_80_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_80_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_81_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_81_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_82_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_82_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_83_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_83_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_84_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_84_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_85_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_85_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_86_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_86_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_87_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_87_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_88_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_88_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_89_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_89_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_90_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_90_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_91_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_91_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_92_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_92_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_93_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_93_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_94_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_94_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_95_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_95_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_96_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_96_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_97_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_97_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_98_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_98_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_99_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_99_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_100_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_100_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_101_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_101_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_102_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_102_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_103_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_103_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_104_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_104_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_105_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_105_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_106_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_106_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_107_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_107_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_108_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_108_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_109_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_109_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_110_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_110_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_111_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_111_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_112_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_112_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_113_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_113_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_114_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_114_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_115_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_115_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_116_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_116_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_117_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_117_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_118_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_118_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_119_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_119_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_120_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_120_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_121_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_121_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_122_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_122_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_123_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_123_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_124_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_124_1.jpg" + ], + "output": "D" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_125_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_125_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_126_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_126_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_127_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_127_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_128_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_128_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_129_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_129_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_130_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_130_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_131_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_131_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_132_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_132_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_133_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_133_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_134_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_134_1.jpg" + ], + "output": "B" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_135_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_135_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_136_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_136_1.jpg" + ], + "output": "C" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_137_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_137_1.jpg" + ], + "output": "A" + }, + { + "task": "semantic_correspondence_blink", + "visual_input_component": "2 natural images", + "source": "blink", + "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", + "question": "Which point is corresponding to the reference point?", + "context": "Humans can find corresponding points for different objects in the same category. 
For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", + "input_image_path": [ + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_138_0.jpg", + "../MMIU-Benchmark/semantic_correspondence_blink/semantic_correspondence_blink_138_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_0_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_0_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_1_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_1_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": 
"image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_2_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_2_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_3_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_3_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_4_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_4_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_5_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_5_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between 
the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_6_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_6_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_7_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_7_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_8_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_8_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_9_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_9_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": 
[ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_10_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_10_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_11_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_11_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_12_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_12_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_13_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_13_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_14_0.jpg", + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_14_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_15_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_15_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_16_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_16_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_17_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_17_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_18_0.jpg", + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_18_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_19_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_19_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_20_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_20_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_21_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_21_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_22_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_22_1.jpg" + ], + "output": 
"B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_23_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_23_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_24_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_24_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_25_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_25_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_26_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_26_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": 
"['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_27_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_27_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_28_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_28_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_29_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_29_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_30_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_30_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + 
"question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_31_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_31_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_32_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_32_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_33_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_33_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_34_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_34_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the 
following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_35_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_35_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_36_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_36_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_37_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_37_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_38_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_38_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_39_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_39_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_40_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_40_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_41_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_41_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_42_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_42_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_43_0.jpg", + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_43_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_44_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_44_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_45_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_45_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_46_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_46_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_47_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_47_1.jpg" + ], + "output": 
"B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_48_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_48_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_49_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_49_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_50_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_50_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_51_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_51_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": 
"['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_52_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_52_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_53_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_53_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_54_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_54_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_55_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_55_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: 
No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_56_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_56_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_57_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_57_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_58_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_58_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_59_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_59_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", 
+ "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_60_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_60_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_61_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_61_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_62_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_62_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_63_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_63_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_64_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_64_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_65_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_65_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_66_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_66_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_67_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_67_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_68_0.jpg", + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_68_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_69_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_69_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_70_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_70_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_71_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_71_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_72_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_72_1.jpg" + ], + "output": 
"A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_73_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_73_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_74_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_74_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_75_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_75_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_76_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_76_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": 
"['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_77_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_77_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_78_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_78_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_79_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_79_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_80_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_80_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + 
"question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_81_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_81_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_82_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_82_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_83_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_83_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_84_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_84_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + 
"context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_85_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_85_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_86_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_86_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_87_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_87_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_88_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_88_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + 
"input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_89_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_89_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_90_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_90_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_91_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_91_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_92_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_92_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_93_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_93_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_94_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_94_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_95_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_95_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_96_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_96_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_97_0.jpg", + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_97_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_98_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_98_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_99_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_99_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_100_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_100_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_101_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_101_1.jpg" + ], + "output": "B" + }, 
+ { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_102_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_102_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_103_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_103_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_104_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_104_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_105_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_105_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": 
"['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_106_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_106_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_107_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_107_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_108_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_108_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_109_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_109_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: 
Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_110_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_110_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_111_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_111_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_112_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_112_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_113_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_113_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two 
pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_114_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_114_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_115_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_115_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_116_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_116_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_117_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_117_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + 
"input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_118_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_118_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_119_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_119_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_120_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_120_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_121_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_121_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_122_0.jpg", + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_122_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_123_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_123_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_124_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_124_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_125_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_125_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_126_0.jpg", + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_126_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_127_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_127_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_128_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_128_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_129_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_129_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_130_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_130_1.jpg" + ], + 
"output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_131_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_131_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_132_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_132_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_133_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_133_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_134_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_134_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": 
"['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_135_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_135_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_136_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_136_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_137_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_137_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_138_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_138_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + 
"question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_139_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_139_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_140_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_140_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_141_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_141_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_142_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_142_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", 
+ "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_143_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_143_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_144_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_144_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_145_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_145_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_146_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_146_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_147_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_147_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_148_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_148_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_149_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_149_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_150_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_150_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_151_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_151_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_152_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_152_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_153_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_153_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_154_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_154_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_155_0.jpg", + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_155_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_156_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_156_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_157_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_157_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_158_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_158_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_159_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_159_1.jpg" + ], + "output": "B" 
+ }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_160_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_160_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_161_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_161_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_162_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_162_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_163_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_163_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + 
"visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_164_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_164_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_165_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_165_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_166_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_166_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_167_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_167_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + 
"options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_168_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_168_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_169_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_169_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_170_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_170_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_171_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_171_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two 
pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_172_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_172_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_173_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_173_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_174_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_174_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_175_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_175_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + 
"input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_176_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_176_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_177_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_177_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_178_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_178_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_179_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_179_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_180_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_180_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_181_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_181_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_182_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_182_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_183_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_183_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_184_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_184_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_185_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_185_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_186_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_186_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_187_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_187_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_188_0.jpg", + 
"../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_188_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_189_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_189_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_190_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_190_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_191_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_191_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_192_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_192_1.jpg" + ], + "output": "B" + }, + { + 
"task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_193_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_193_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_194_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_194_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_195_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_195_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "image_alike", + "options": "A: Yes\nB: No", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: Yes\nB: No", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_196_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_196_1.jpg" + ], + "output": "A" + }, + { + "task": "spot_the_similarity", + "visual_input_component": 
"['natural_image']", + "source": "image_alike", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_197_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_197_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_198_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_198_1.jpg" + ], + "output": "B" + }, + { + "task": "spot_the_similarity", + "visual_input_component": "['natural_image']", + "source": "Totally_Looks_Like_Data", + "options": "A: No\nB: Yes", + "question": "Are there any similarities between the two pictures?", + "context": "Select from the following choices.\nA: No\nB: Yes", + "input_image_path": [ + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_199_0.jpg", + "../MMIU-Benchmark/spot_the_similarity/spot_the_similarity_199_1.jpg" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: tying knot (not on a tie)\nB: knitting\nC: ironing\nD: weaving basket", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: tying knot (not on a tie)\nB: knitting\nC: ironing\nD: weaving basket", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_1.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_0_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: trampoline jump\nB: balancing on trampoline\nC: flipping on trampoline\nD: bouncing on trampoline", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: trampoline jump\nB: balancing on trampoline\nC: flipping on trampoline\nD: bouncing on trampoline", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_2.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_1_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: dunking basketball\nB: playing kickball\nC: shooting goal (soccer)\nD: playing basketball", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: dunking basketball\nB: playing kickball\nC: shooting goal (soccer)\nD: playing basketball", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_3.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_2_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: carrying baby\nB: using segway\nC: pushing wheelchair\nD: cleaning windows", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: carrying baby\nB: using segway\nC: pushing wheelchair\nD: cleaning windows", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_4.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_3_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing kickball\nB: playing chess\nC: playing controller\nD: playing basketball", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing kickball\nB: playing chess\nC: playing controller\nD: playing basketball", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_5.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_4_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: kicking soccer ball\nB: high kick\nC: parkour\nD: dunking basketball", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: kicking soccer ball\nB: high kick\nC: parkour\nD: dunking basketball", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_7.png", 
+ "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_5_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: jumpstyle dancing\nB: swinging legs\nC: hula hooping\nD: tango dancing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: jumpstyle dancing\nB: swinging legs\nC: hula hooping\nD: tango dancing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_8.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_6_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: washing dishes\nB: cleaning pool\nC: washing hands\nD: cleaning windows", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: washing dishes\nB: cleaning pool\nC: washing hands\nD: cleaning windows", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_9.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_7_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: braiding hair\nB: shining shoes\nC: cutting watermelon\nD: tapping guitar", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: braiding hair\nB: shining shoes\nC: cutting watermelon\nD: tapping guitar", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_10.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_8_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: doing laundry\nB: cleaning pool\nC: washing dishes\nD: washing hands", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: doing laundry\nB: cleaning pool\nC: washing dishes\nD: washing hands", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_11.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_9_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: snowboarding\nB: ice climbing\nC: skiing crosscountry\nD: biking through snow", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: snowboarding\nB: ice climbing\nC: skiing crosscountry\nD: biking through snow", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_12.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_10_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing kickball\nB: frisbee catching or throwing\nC: catching or throwing frisbee\nD: biking through snow", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing kickball\nB: frisbee catching or throwing\nC: catching or throwing frisbee\nD: biking through snow", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_12.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_11_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: packing truck\nB: loading truck\nC: driving truck\nD: unloading truck", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: packing truck\nB: loading truck\nC: driving truck\nD: unloading truck", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_13.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_12_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: shaving legs\nB: surfing water\nC: swimming breast stroke\nD: diving cliff", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: shaving legs\nB: surfing water\nC: swimming breast stroke\nD: diving cliff", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_14.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_13_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: eating chips\nB: tossing salad\nC: bouncing on trampoline\nD: feeding birds", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: eating chips\nB: tossing salad\nC: bouncing on trampoline\nD: feeding birds", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_14_15.png" + ], + "output": "D" + }, + { + "task": 
"general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: spray painting\nB: baking cookies\nC: using remote controller (not gaming)\nD: blowing out candles", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: spray painting\nB: baking cookies\nC: using remote controller (not gaming)\nD: blowing out candles", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_15_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + 
"source": "kinetics400", + "options": "A: punching bag\nB: boxing\nC: wrestling\nD: kickboxing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: punching bag\nB: boxing\nC: wrestling\nD: kickboxing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_16_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: unloading truck\nB: tapping pen\nC: playing poker\nD: blowing leaves", + "question": "What is the action performed by the person in the 
video?", + "context": "Select from the following choices.\nA: unloading truck\nB: tapping pen\nC: playing poker\nD: blowing leaves", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_17_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing kickball\nB: polishing silverware\nC: sneezing\nD: shining shoes", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing kickball\nB: polishing silverware\nC: sneezing\nD: shining shoes", + 
"input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_18_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing saxophone\nB: playing flute\nC: playing trumpet\nD: playing guitar", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing saxophone\nB: playing flute\nC: playing trumpet\nD: playing guitar", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_0.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_19_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing basketball\nB: playing kickball\nC: kicking soccer ball\nD: dodgeball", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing basketball\nB: playing kickball\nC: kicking soccer ball\nD: dodgeball", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_1.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_20_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: riding scooter\nB: driving tractor\nC: pushing car\nD: cleaning windows", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: riding scooter\nB: driving tractor\nC: pushing car\nD: cleaning windows", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_2.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_21_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: snowboarding\nB: skiing crosscountry\nC: surfing water\nD: water skiing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: snowboarding\nB: skiing crosscountry\nC: surfing water\nD: water skiing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_3.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_22_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing guitar\nB: playing didgeridoo\nC: playing keyboard\nD: playing cymbals", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing guitar\nB: playing didgeridoo\nC: playing keyboard\nD: playing cymbals", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_4.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_23_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: skiing crosscountry\nB: ice fishing\nC: flying kite\nD: snowboarding", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: skiing crosscountry\nB: ice fishing\nC: flying kite\nD: snowboarding", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_5.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_24_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: folding clothes\nB: building cabinet\nC: moving furniture\nD: cleaning floor", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: folding clothes\nB: building cabinet\nC: moving furniture\nD: cleaning floor", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_6.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_25_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: bench pressing\nB: arm wrestling\nC: squat\nD: deadlifting", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: bench pressing\nB: arm wrestling\nC: squat\nD: deadlifting", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_8.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_26_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: cutting watermelon\nB: sanding floor\nC: trimming trees\nD: pruning trees", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: cutting watermelon\nB: sanding floor\nC: trimming trees\nD: pruning trees", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_9.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_27_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: riding mountain bike\nB: snowboarding\nC: biking through snow\nD: skiing crosscountry", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: riding mountain bike\nB: snowboarding\nC: biking through snow\nD: skiing crosscountry", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_10.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_28_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing guitar\nB: playing trombone\nC: playing cymbals\nD: playing harp", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing guitar\nB: playing trombone\nC: playing cymbals\nD: playing harp", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_11.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_29_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: tango dancing\nB: swing dancing\nC: dancing charleston\nD: jumpstyle dancing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: tango dancing\nB: swing dancing\nC: dancing charleston\nD: jumpstyle dancing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_12.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_30_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing harp\nB: shuffling cards\nC: tango dancing\nD: swing dancing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing harp\nB: shuffling cards\nC: tango dancing\nD: swing dancing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_13.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_31_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: cleaning pool\nB: washing windows\nC: cleaning windows\nD: shining shoes", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: cleaning pool\nB: washing windows\nC: cleaning windows\nD: shining shoes", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_14.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_32_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: swinging legs\nB: gymnastics tumbling\nC: squat\nD: stretching leg", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: swinging legs\nB: gymnastics tumbling\nC: squat\nD: stretching leg", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_33_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + 
"visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: teaching sign language\nB: communicating with sign language\nC: sign language interpreting\nD: sign language translation", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: teaching sign language\nB: communicating with sign language\nC: sign language interpreting\nD: sign language translation", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_34_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or 
Natural image", + "source": "kinetics400", + "options": "A: opening present\nB: making a cake\nC: unboxing\nD: baking cookies", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: opening present\nB: making a cake\nC: unboxing\nD: baking cookies", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_35_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: shaving head\nB: pumping fist\nC: cleaning toilet\nD: shredding paper", + "question": "What is 
the action performed by the person in the video?", + "context": "Select from the following choices.\nA: shaving head\nB: pumping fist\nC: cleaning toilet\nD: shredding paper", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_36_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: horseback riding\nB: riding mule\nC: riding mountain bike\nD: petting animal (not cat)", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: horseback riding\nB: riding 
mule\nC: riding mountain bike\nD: petting animal (not cat)", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_37_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: waxing eyebrows\nB: shaving legs\nC: trimming trees\nD: waxing chest", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: waxing eyebrows\nB: shaving legs\nC: trimming trees\nD: waxing chest", + "input_image_path": [ + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_38_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: grooming horse\nB: milking cow\nC: petting animal (not cat)\nD: feeding goats", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: grooming horse\nB: milking cow\nC: petting animal (not cat)\nD: feeding goats", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_0.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_39_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: spray painting\nB: ripping paper\nC: shredding paper\nD: filling eyebrows", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: spray painting\nB: ripping paper\nC: shredding paper\nD: filling eyebrows", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_1.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_40_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: massaging person's head\nB: massaging feet\nC: petting cat\nD: petting animal (not cat)", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: massaging person's head\nB: massaging feet\nC: petting cat\nD: petting animal (not cat)", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_2.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_41_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: marching\nB: nodding head\nC: clapping\nD: shaking head", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: marching\nB: nodding head\nC: clapping\nD: shaking head", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_4.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_42_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing kickball\nB: marching\nC: cutting watermelon\nD: blowing leaves", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing kickball\nB: marching\nC: cutting watermelon\nD: blowing leaves", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_5.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_43_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: rock scissors paper\nB: clapping\nC: bouncing on trampoline\nD: tango dancing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: rock scissors paper\nB: clapping\nC: bouncing on trampoline\nD: tango dancing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_6.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_44_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: using remote controller (not gaming)\nB: watching TV\nC: playing guitar\nD: typing on keyboard", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: using remote controller (not gaming)\nB: watching TV\nC: playing guitar\nD: typing on keyboard", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_7.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_45_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing kickball\nB: dribbling basketball\nC: playing basketball\nD: tossing coin", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing kickball\nB: dribbling basketball\nC: playing basketball\nD: tossing coin", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_8.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_46_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: skiing crosscountry\nB: snowboarding\nC: playing squash or racquetball\nD: riding mountain bike", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: skiing crosscountry\nB: snowboarding\nC: playing squash or racquetball\nD: riding mountain bike", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_9.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_47_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: slapping\nB: faceplanting\nC: back raises\nD: massaging person's head", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: slapping\nB: faceplanting\nC: back raises\nD: massaging person's head", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_10.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_48_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: rock scissors paper\nB: sword fighting\nC: fencing\nD: balloon blowing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: rock scissors paper\nB: sword fighting\nC: fencing\nD: balloon blowing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_11.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_49_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: weaving basket\nB: juggling fire\nC: cooking chicken\nD: playing badminton", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: weaving basket\nB: juggling fire\nC: cooking chicken\nD: playing badminton", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_12.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_50_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: drumming fingers\nB: shuffling cards\nC: beatboxing\nD: playing cymbals", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: drumming fingers\nB: shuffling cards\nC: beatboxing\nD: playing cymbals", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_13.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_51_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: waxing chest\nB: cutting nails\nC: shaving head\nD: trimming trees", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: waxing chest\nB: cutting nails\nC: shaving head\nD: trimming trees", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_14.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_52_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: jetskiing\nB: motorcycling\nC: riding mountain bike\nD: snowboarding", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: jetskiing\nB: motorcycling\nC: riding mountain bike\nD: snowboarding", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_53_15.png" + ], + "output": "B" + }, + { + "task": 
"general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: sneezing\nB: breading or breadcrumbing\nC: rock scissors paper\nD: spinning poi", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: sneezing\nB: breading or breadcrumbing\nC: rock scissors paper\nD: spinning poi", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_54_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": 
"A: climbing tree\nB: watering plants\nC: planting trees\nD: raking leaves", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: climbing tree\nB: watering plants\nC: planting trees\nD: raking leaves", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_55_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: riding mountain bike\nB: tango dancing\nC: playing kickball\nD: country line dancing", + "question": "What is the action performed by the 
person in the video?", + "context": "Select from the following choices.\nA: riding mountain bike\nB: tango dancing\nC: playing kickball\nD: country line dancing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_56_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing kickball\nB: playing didgeridoo\nC: playing basketball\nD: washing dishes", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing kickball\nB: playing didgeridoo\nC: 
playing basketball\nD: washing dishes", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_57_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: cleaning pool\nB: washing dishes\nC: watering plants\nD: doing laundry", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: cleaning pool\nB: washing dishes\nC: watering plants\nD: doing laundry", + "input_image_path": [ + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_58_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: bench pressing\nB: deadlifting\nC: snatch weight lifting\nD: clean and jerk", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: bench pressing\nB: deadlifting\nC: snatch weight lifting\nD: clean and jerk", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_0.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_59_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: tapping guitar\nB: tapping pen\nC: playing guitar\nD: strumming guitar", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: tapping guitar\nB: tapping pen\nC: playing guitar\nD: strumming guitar", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_1.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_60_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: frying eggs\nB: grilling meat\nC: boiling pasta\nD: cooking sausages", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: frying eggs\nB: grilling meat\nC: boiling pasta\nD: cooking sausages", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_2.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_61_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing flute\nB: playing cymbals\nC: playing guitar\nD: playing keyboard", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing flute\nB: playing cymbals\nC: playing guitar\nD: playing keyboard", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_3.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_62_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: mopping floor\nB: vacuuming\nC: cleaning windows\nD: sweeping floor", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: mopping floor\nB: vacuuming\nC: cleaning windows\nD: sweeping floor", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_4.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_63_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: gymnastics tumbling\nB: baking cookies\nC: chopping wood\nD: stretching leg", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: gymnastics tumbling\nB: baking cookies\nC: chopping wood\nD: stretching leg", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_5.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_64_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: cutting nails\nB: fixing hair\nC: shaving legs\nD: braiding hair", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: cutting nails\nB: fixing hair\nC: shaving legs\nD: braiding hair", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_6.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_65_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: sneezing\nB: auctioning\nC: testifying\nD: sign language interpreting", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: sneezing\nB: auctioning\nC: testifying\nD: sign language interpreting", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_7.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_66_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: eating ice cream\nB: licking lips\nC: tasting food\nD: baking cookies", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: eating ice cream\nB: licking lips\nC: tasting food\nD: baking cookies", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_8.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_67_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: climbing ladder\nB: krumping\nC: breakdancing\nD: robot dancing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: climbing ladder\nB: krumping\nC: breakdancing\nD: robot dancing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_9.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_68_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: skateboarding\nB: snowboarding\nC: riding mountain bike\nD: skiing crosscountry", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: skateboarding\nB: snowboarding\nC: riding mountain bike\nD: skiing crosscountry", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_10.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_69_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: riding mountain bike\nB: skiing (not slalom or crosscountry)\nC: snowboarding\nD: playing kickball", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: riding mountain bike\nB: skiing (not slalom or crosscountry)\nC: snowboarding\nD: playing kickball", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_11.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_70_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: kissing\nB: snuggling\nC: romantic dancing\nD: flirting", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: kissing\nB: snuggling\nC: romantic dancing\nD: flirting", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_13.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_71_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: washing feet\nB: playing kickball\nC: playing basketball\nD: playing squash or racquetball", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: washing feet\nB: playing kickball\nC: playing basketball\nD: playing squash or racquetball", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_14.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_72_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: waxing chest\nB: shooting goal (soccer)\nC: playing basketball\nD: riding scooter", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: waxing chest\nB: shooting goal (soccer)\nC: playing basketball\nD: riding scooter", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_73_15.png" + ], + "output": "C" + }, + { + "task": 
"general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: hurdling\nB: high jump\nC: pole vault\nD: triple jump", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: hurdling\nB: high jump\nC: pole vault\nD: triple jump", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_74_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: fixing hair\nB: curling hair\nC: braiding 
hair\nD: cutting nails", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: fixing hair\nB: curling hair\nC: braiding hair\nD: cutting nails", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_75_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: preparing salad\nB: baking cookies\nC: cooking chicken\nD: making a cake", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: 
preparing salad\nB: baking cookies\nC: cooking chicken\nD: making a cake", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_76_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: waxing chest\nB: arms wrestling\nC: shaving legs\nD: shaving head", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: waxing chest\nB: arms wrestling\nC: shaving legs\nD: shaving head", + "input_image_path": [ + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_77_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: filling eyebrows\nB: waxing eyebrows\nC: shaving legs\nD: cutting watermelon", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: filling eyebrows\nB: waxing eyebrows\nC: shaving legs\nD: cutting watermelon", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_0.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_78_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: milking cow\nB: petting animal (not cat)\nC: feeding birds\nD: holding snake", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: milking cow\nB: petting animal (not cat)\nC: feeding birds\nD: holding snake", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_1.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_79_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: shot put\nB: throwing discus\nC: hurdling\nD: discus throw", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: shot put\nB: throwing discus\nC: hurdling\nD: discus throw", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_3.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_80_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: making bed\nB: folding paper\nC: folding clothes\nD: ironing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: making bed\nB: folding paper\nC: folding clothes\nD: ironing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_4.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_81_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: stretching leg\nB: bouncing on trampoline\nC: jumpstyle dancing\nD: exercising with an exercise ball", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: stretching leg\nB: bouncing on trampoline\nC: jumpstyle dancing\nD: exercising with an exercise ball", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_5.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_82_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: washing hair\nB: combing hair\nC: styling hair\nD: brushing hair", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: washing hair\nB: combing hair\nC: styling hair\nD: brushing hair", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_6.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_83_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: swinging legs\nB: front raises\nC: bouncing on trampoline\nD: stretching leg", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: swinging legs\nB: front raises\nC: bouncing on trampoline\nD: stretching leg", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_7.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_84_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: shaving legs\nB: climbing ladder\nC: changing oil\nD: sanding floor", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: shaving legs\nB: climbing ladder\nC: changing oil\nD: sanding floor", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_8.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_85_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: laying bricks\nB: swing dancing\nC: ironing\nD: climbing tree", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: laying bricks\nB: swing dancing\nC: ironing\nD: climbing tree", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_9.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_86_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: punching bag\nB: eating burger\nC: driving tractor\nD: drinking shots", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: punching bag\nB: eating burger\nC: driving tractor\nD: drinking shots", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_10.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_87_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: headbanging\nB: playing guitar\nC: drumming fingers\nD: bouncing on trampoline", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: headbanging\nB: playing guitar\nC: drumming fingers\nD: bouncing on trampoline", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_11.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_88_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: breading or breadcrumbing\nB: shoveling snow\nC: springboard diving\nD: decorating the christmas tree", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: breading or breadcrumbing\nB: shoveling snow\nC: springboard diving\nD: decorating the christmas tree", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_12.png", 
+ "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_89_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: applauding\nB: cheering\nC: clapping\nD: snapping fingers", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: applauding\nB: cheering\nC: clapping\nD: snapping fingers", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_14.png", 
+ "../MMIU-Benchmark/general_action_recognition/general_action_recognition_90_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: climbing ladder\nB: playing sitar\nC: playing harp\nD: strumming guitar", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: climbing ladder\nB: playing sitar\nC: playing harp\nD: strumming guitar", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_91_15.png" + ], + "output": "C" + }, + { + "task": 
"general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing guitar\nB: making jewelry\nC: filming movie\nD: unboxing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing guitar\nB: making jewelry\nC: filming movie\nD: unboxing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_92_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing cello\nB: playing 
guitar\nC: bouncing on trampoline\nD: recording music", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing cello\nB: playing guitar\nC: bouncing on trampoline\nD: recording music", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_93_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: folding clothes\nB: weaving basket\nC: baking cookies\nD: making jewelry", + "question": "What is the action performed by the person in the video?", + 
"context": "Select from the following choices.\nA: folding clothes\nB: weaving basket\nC: baking cookies\nD: making jewelry", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_94_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: biking through snow\nB: paragliding\nC: riding mountain bike\nD: windsurfing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: biking through snow\nB: paragliding\nC: riding mountain bike\nD: windsurfing", + 
"input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_95_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: peeling apples\nB: eating watermelon\nC: shaving legs\nD: cutting watermelon", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: peeling apples\nB: eating watermelon\nC: shaving legs\nD: cutting watermelon", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_0.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_96_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: skiing crosscountry\nB: biking through snow\nC: riding mountain bike\nD: snowboarding", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: skiing crosscountry\nB: biking through snow\nC: riding mountain bike\nD: snowboarding", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_1.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_97_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: sailing\nB: tapping pen\nC: playing organ\nD: riding mule", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: sailing\nB: tapping pen\nC: playing organ\nD: riding mule", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_3.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_98_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing kickball\nB: dancing gangnam style\nC: playing basketball\nD: playing paintball", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing kickball\nB: dancing gangnam style\nC: playing basketball\nD: playing paintball", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_4.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_99_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: deadlifting\nB: bench pressing\nC: pull ups\nD: jogging", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: deadlifting\nB: bench pressing\nC: pull ups\nD: jogging", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_6.png", 
+ "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_100_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing didgeridoo\nB: tossing coin\nC: typing\nD: tapping pen", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing didgeridoo\nB: tossing coin\nC: typing\nD: tapping pen", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_7.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_101_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: drumming fingers\nB: playing guitar\nC: tapping pen\nD: shuffling cards", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: drumming fingers\nB: playing guitar\nC: tapping pen\nD: shuffling cards", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_8.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_102_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: sneezing\nB: blowing leaves\nC: riding mule\nD: passing American football (in game)", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: sneezing\nB: blowing leaves\nC: riding mule\nD: passing American football (in game)", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_9.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_103_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: writing\nB: playing guitar\nC: typing on keyboard\nD: playing piano", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: writing\nB: playing guitar\nC: typing on keyboard\nD: playing piano", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_10.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_104_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: riding elephant\nB: playing kickball\nC: golf putting\nD: playing golf chipping", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: riding elephant\nB: playing kickball\nC: golf putting\nD: playing golf chipping", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_11.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_105_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: assembling computer\nB: sharpening knives\nC: building cabinet\nD: making tea", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: assembling computer\nB: sharpening knives\nC: building cabinet\nD: making tea", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_12.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_106_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: shaving legs\nB: sanding floor\nC: cutting watermelon\nD: sharpening knives", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: shaving legs\nB: sanding floor\nC: cutting watermelon\nD: sharpening knives", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_13.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_107_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: front raises\nB: bending back\nC: push up\nD: situp", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: front raises\nB: bending back\nC: push up\nD: situp", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_14.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_108_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing kickball\nB: hoverboarding\nC: riding scooter\nD: using segway", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing kickball\nB: hoverboarding\nC: riding scooter\nD: using segway", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_109_15.png" + ], + "output": "D" + }, + { + "task": 
"general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: grooming horse\nB: peeling apples\nC: tickling\nD: cooking on campfire", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: grooming horse\nB: peeling apples\nC: tickling\nD: cooking on campfire", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_110_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": 
"A: eating cake\nB: eating chips\nC: tasting food\nD: eating watermelon", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: eating cake\nB: eating chips\nC: tasting food\nD: eating watermelon", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_111_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: snatch weight lifting\nB: deadlifting\nC: clean and jerk\nD: bouncing on trampoline", + "question": "What is the action performed by 
the person in the video?", + "context": "Select from the following choices.\nA: snatch weight lifting\nB: deadlifting\nC: clean and jerk\nD: bouncing on trampoline", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_112_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: peeling apples\nB: weaving basket\nC: feeding birds\nD: crossing river", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: peeling apples\nB: weaving basket\nC: 
feeding birds\nD: crossing river", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_113_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: climbing tree\nB: swing dancing\nC: rock scissors paper\nD: abseiling", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: climbing tree\nB: swing dancing\nC: rock scissors paper\nD: abseiling", + "input_image_path": [ + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_114_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: grilling fish\nB: baking cookies\nC: cooking chicken\nD: breading or breadcrumbing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: grilling fish\nB: baking cookies\nC: cooking chicken\nD: breading or breadcrumbing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_0.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_115_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: hockey stop\nB: bobsledding\nC: dribbling basketball\nD: swimming breast stroke", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: hockey stop\nB: bobsledding\nC: dribbling basketball\nD: swimming breast stroke", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_1.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_116_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: eating cake\nB: eating chips\nC: eating burger\nD: eating watermelon", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: eating cake\nB: eating chips\nC: eating burger\nD: eating watermelon", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_2.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_117_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: bouncing on trampoline\nB: collecting garbage\nC: cleaning pool\nD: garbage collecting", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: bouncing on trampoline\nB: collecting garbage\nC: cleaning pool\nD: garbage collecting", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_3.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_118_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: boiling water\nB: mixing drink\nC: brewing coffee\nD: making tea", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: boiling water\nB: mixing drink\nC: brewing coffee\nD: making tea", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_4.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_119_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing guitar\nB: playing flute\nC: strumming guitar\nD: playing saxophone", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing guitar\nB: playing flute\nC: strumming guitar\nD: playing saxophone", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_5.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_120_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: beating drum\nB: tasting food\nC: stomping grapes\nD: dancing charleston", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: beating drum\nB: tasting food\nC: stomping grapes\nD: dancing charleston", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_6.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_121_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: making a cake\nB: grilling steak\nC: cooking on campfire\nD: baking cookies", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: making a cake\nB: grilling steak\nC: cooking on campfire\nD: baking cookies", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_7.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_122_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: springboard diving\nB: bouncing on trampoline\nC: swimming breast stroke\nD: diving cliff", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: springboard diving\nB: bouncing on trampoline\nC: swimming breast stroke\nD: diving cliff", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_8.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_123_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: grinding meat\nB: sanding floor\nC: sharpening knives\nD: welding", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: grinding meat\nB: sanding floor\nC: sharpening knives\nD: welding", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_9.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_124_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: plastering\nB: driving tractor\nC: playing kickball\nD: riding mountain bike", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: plastering\nB: driving tractor\nC: playing kickball\nD: riding mountain bike", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_10.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_125_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: cleaning floor\nB: sweeping floor\nC: mopping floor\nD: washing floor", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: cleaning floor\nB: sweeping floor\nC: mopping floor\nD: washing floor", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_11.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_126_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: rock climbing\nB: parkour\nC: ice climbing\nD: free running", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: rock climbing\nB: parkour\nC: ice climbing\nD: free running", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_12.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_127_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: riding mountain bike\nB: kicking soccer ball\nC: shooting goal (soccer)\nD: playing basketball", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: riding mountain bike\nB: kicking soccer ball\nC: shooting goal (soccer)\nD: playing basketball", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_12.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_128_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing violin\nB: sneezing\nC: sailing\nD: tango dancing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing violin\nB: sneezing\nC: sailing\nD: tango dancing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_13.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_129_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: jumpstyle dancing\nB: jumping on trampoline\nC: skipping rope\nD: slacklining", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: jumpstyle dancing\nB: jumping on trampoline\nC: skipping rope\nD: slacklining", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_14.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_130_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: riding elephant\nB: petting animal (not cat)\nC: waxing chest\nD: grooming horse", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: riding elephant\nB: petting animal (not cat)\nC: waxing chest\nD: grooming horse", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_131_15.png" + ], + "output": "A" + }, 
+ { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: capoeira\nB: hoverboarding\nC: playing kickball\nD: kicking soccer ball", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: capoeira\nB: hoverboarding\nC: playing kickball\nD: kicking soccer ball", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_132_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": 
"kinetics400", + "options": "A: saut\u00e9ing vegetables\nB: scrambling eggs\nC: beating eggs\nD: cooking on campfire", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: saut\u00e9ing vegetables\nB: scrambling eggs\nC: beating eggs\nD: cooking on campfire", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_133_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing paintball\nB: doing laundry\nC: robot dancing\nD: tango 
dancing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing paintball\nB: doing laundry\nC: robot dancing\nD: tango dancing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_134_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: fishing\nB: climbing mountain\nC: rowing boat\nD: crossing river", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: 
fishing\nB: climbing mountain\nC: rowing boat\nD: crossing river", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_135_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: jumpstyle dancing\nB: jogging\nC: exercising with an exercise ball\nD: running on treadmill", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: jumpstyle dancing\nB: jogging\nC: exercising with an exercise ball\nD: running on treadmill", + "input_image_path": 
[ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_136_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: lunges\nB: leg press\nC: squat\nD: push ups", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: lunges\nB: leg press\nC: squat\nD: push ups", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_1.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_137_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: trimming trees\nB: building cabinet\nC: tossing coin\nD: folding paper", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: trimming trees\nB: building cabinet\nC: tossing coin\nD: folding paper", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_2.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_138_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: trimming trees\nB: getting a haircut\nC: barbequing\nD: shaving head", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: trimming trees\nB: getting a haircut\nC: barbequing\nD: shaving head", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_3.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_139_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: breading or breadcrumbing\nB: making tea\nC: hunting rabbits\nD: egg hunting", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: breading or breadcrumbing\nB: making tea\nC: hunting rabbits\nD: egg hunting", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_4.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_140_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: laying bricks\nB: wall painting\nC: plastering\nD: rock scissors paper", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: laying bricks\nB: wall painting\nC: plastering\nD: rock scissors paper", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_5.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_141_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: brush painting\nB: breading or breadcrumbing\nC: shining shoes\nD: grinding meat", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: brush painting\nB: breading or breadcrumbing\nC: shining shoes\nD: grinding meat", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_6.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_142_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: bending back\nB: spinning poi\nC: juggling fire\nD: parkour", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: bending back\nB: spinning poi\nC: juggling fire\nD: parkour", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_7.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_143_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: jetskiing\nB: windsurfing\nC: water skiing\nD: surfing water", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: jetskiing\nB: windsurfing\nC: water skiing\nD: surfing water", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_8.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_144_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: tightrope walking\nB: swinging legs\nC: slacklining\nD: rock scissors paper", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: tightrope walking\nB: swinging legs\nC: slacklining\nD: rock scissors paper", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_9.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_145_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: parkour\nB: bending back\nC: spray painting\nD: ice climbing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: parkour\nB: bending back\nC: spray painting\nD: ice climbing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_10.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_146_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: bee keeping\nB: watering plants\nC: baking cookies\nD: trimming trees", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: bee keeping\nB: watering plants\nC: baking cookies\nD: trimming trees", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_11.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_147_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing chess\nB: rock scissors paper\nC: playing piano\nD: balloon blowing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing chess\nB: rock scissors paper\nC: playing piano\nD: balloon blowing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_12.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_148_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: water skiing\nB: planting trees\nC: watering plants\nD: gardening", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: water skiing\nB: planting trees\nC: watering plants\nD: gardening", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_13.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_149_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: swinging legs\nB: pull ups\nC: bouncing on trampoline\nD: dunking basketball", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: swinging legs\nB: pull ups\nC: bouncing on trampoline\nD: dunking basketball", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_14.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_150_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: feeding birds\nB: riding camel\nC: riding mule\nD: sailing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: feeding birds\nB: riding camel\nC: riding mule\nD: sailing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_151_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", 
+ "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: windsurfing\nB: kayaking\nC: sailing\nD: rowing boat", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: windsurfing\nB: kayaking\nC: sailing\nD: rowing boat", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_152_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: squat\nB: yoga\nC: applauding\nD: doing nails", + "question": 
"What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: squat\nB: yoga\nC: applauding\nD: doing nails", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_153_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: tango dancing\nB: playing organ\nC: strumming guitar\nD: playing bass guitar", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: tango dancing\nB: playing organ\nC: 
strumming guitar\nD: playing bass guitar", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_154_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: kite surfing\nB: flying kite\nC: swinging legs\nD: bouncing on trampoline", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: kite surfing\nB: flying kite\nC: swinging legs\nD: bouncing on trampoline", + "input_image_path": [ + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_155_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: cooking sausages\nB: eating burger\nC: grinding meat\nD: breading or breadcrumbing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: cooking sausages\nB: eating burger\nC: grinding meat\nD: breading or breadcrumbing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_0.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_156_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: petting animal (not cat)\nB: carrying baby\nC: feeding birds\nD: changing oil", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: petting animal (not cat)\nB: carrying baby\nC: feeding birds\nD: changing oil", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_1.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_157_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: bouncing on trampoline\nB: snorkeling\nC: surfing water\nD: water skiing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: bouncing on trampoline\nB: snorkeling\nC: surfing water\nD: water skiing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_2.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_158_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: riding mule\nB: driving tractor\nC: skiing (not slalom or crosscountry)\nD: riding scooter", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: riding mule\nB: driving tractor\nC: skiing (not slalom or crosscountry)\nD: riding scooter", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_3.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_159_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: eating cake\nB: making a cake\nC: eating chips\nD: baking cookies", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: eating cake\nB: making a cake\nC: eating chips\nD: baking cookies", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_4.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_160_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: drumming fingers\nB: tapping guitar\nC: playing bass guitar\nD: strumming guitar", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: drumming fingers\nB: tapping guitar\nC: playing bass guitar\nD: strumming guitar", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_5.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_161_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing saxophone\nB: drinking beer\nC: smoking\nD: snorkeling", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing saxophone\nB: drinking beer\nC: smoking\nD: snorkeling", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_6.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_162_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: marching\nB: auctioning\nC: bouncing on trampoline\nD: checking tires", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: marching\nB: auctioning\nC: bouncing on trampoline\nD: checking tires", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_7.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_163_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: cutting watermelon\nB: shredding paper\nC: ripping paper\nD: sweeping floor", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: cutting watermelon\nB: shredding paper\nC: ripping paper\nD: sweeping floor", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_8.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_164_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing soccer\nB: dribbling basketball\nC: kicking soccer ball\nD: juggling soccer ball", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing soccer\nB: dribbling basketball\nC: kicking soccer ball\nD: juggling soccer ball", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_9.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_165_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: applying cream\nB: shaving legs\nC: washing feet\nD: massaging person's head", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: applying cream\nB: shaving legs\nC: washing feet\nD: massaging person's head", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_10.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_166_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: rock scissors paper\nB: front raises\nC: springboard diving\nD: bungee jumping", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: rock scissors paper\nB: front raises\nC: springboard diving\nD: bungee jumping", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_11.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_167_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: tire rotation\nB: changing oil\nC: inspecting engine\nD: checking tires", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: tire rotation\nB: changing oil\nC: inspecting engine\nD: checking tires", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_12.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_168_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: massaging feet\nB: milking cow\nC: petting animal (not cat)\nD: feeding birds", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: massaging feet\nB: milking cow\nC: petting animal (not cat)\nD: feeding birds", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_13.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_169_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: grilling meat\nB: eating burger\nC: cooking sausages\nD: eating chips", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: grilling meat\nB: eating burger\nC: cooking sausages\nD: eating chips", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_14.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_170_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: springboard diving\nB: hurdling\nC: zumba\nD: faceplanting", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: springboard diving\nB: hurdling\nC: zumba\nD: faceplanting", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_171_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", 
+ "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: eating chips\nB: drinking beer\nC: tasting food\nD: drinking shots", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: eating chips\nB: drinking beer\nC: tasting food\nD: drinking shots", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_172_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: eating hotdog\nB: peeling 
apples\nC: baking cookies\nD: eating watermelon", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: eating hotdog\nB: peeling apples\nC: baking cookies\nD: eating watermelon", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_173_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: throwing discus\nB: passing American football (in game)\nC: playing kickball\nD: playing basketball", + "question": "What is the action performed by 
the person in the video?", + "context": "Select from the following choices.\nA: throwing discus\nB: passing American football (in game)\nC: playing kickball\nD: playing basketball", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_174_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: dancing charleston\nB: swing dancing\nC: tango dancing\nD: bungee jumping", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: dancing 
charleston\nB: swing dancing\nC: tango dancing\nD: bungee jumping", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_175_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: deadlifting\nB: springboard diving\nC: trapezing\nD: rock scissors paper", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: deadlifting\nB: springboard diving\nC: trapezing\nD: rock scissors paper", + "input_image_path": [ + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_176_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: strumming guitar\nB: weaving basket\nC: peeling apples\nD: baking cookies", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: strumming guitar\nB: weaving basket\nC: peeling apples\nD: baking cookies", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_0.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_177_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: sailing\nB: ice climbing\nC: skiing crosscountry\nD: snowmobiling", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: sailing\nB: ice climbing\nC: skiing crosscountry\nD: snowmobiling", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_1.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_178_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: petting cat\nB: petting animal (not cat)\nC: feeding birds\nD: stroking dog", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: petting cat\nB: petting animal (not cat)\nC: feeding birds\nD: stroking dog", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_2.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_179_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: cooking on campfire\nB: smoking\nC: grilling fish\nD: barbequing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: cooking on campfire\nB: smoking\nC: grilling fish\nD: barbequing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_3.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_180_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing saxophone\nB: playing kickball\nC: playing guitar\nD: playing didgeridoo", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing saxophone\nB: playing kickball\nC: playing guitar\nD: playing didgeridoo", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_4.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_181_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing kickball\nB: driving tractor\nC: playing guitar\nD: air drumming", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing kickball\nB: driving tractor\nC: playing guitar\nD: air drumming", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_5.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_182_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: skateboarding\nB: playing kickball\nC: skiing crosscountry\nD: hoverboarding", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: skateboarding\nB: playing kickball\nC: skiing crosscountry\nD: hoverboarding", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_6.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_183_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: triple jump\nB: hurdling\nC: bouncing on trampoline\nD: high jump", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: triple jump\nB: hurdling\nC: bouncing on trampoline\nD: high jump", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_7.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_184_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: kicking soccer ball\nB: playing kickball\nC: playing basketball\nD: playing paintball", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: kicking soccer ball\nB: playing kickball\nC: playing basketball\nD: playing paintball", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_8.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_185_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing kickball\nB: tossing coin\nC: rock scissors paper\nD: throwing axe", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing kickball\nB: tossing coin\nC: rock scissors paper\nD: throwing axe", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_9.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_186_15.png" + ], + "output": "B" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing paintball\nB: shooting goal (soccer)\nC: brush painting\nD: celebrating", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing paintball\nB: shooting goal (soccer)\nC: brush painting\nD: celebrating", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_10.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_187_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: riding mule\nB: shooting goal (soccer)\nC: cutting watermelon\nD: chopping wood", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: riding mule\nB: shooting goal (soccer)\nC: cutting watermelon\nD: chopping wood", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_11.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_188_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: assembling computer\nB: grinding meat\nC: moving furniture\nD: brushing hair", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: assembling computer\nB: grinding meat\nC: moving furniture\nD: brushing hair", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_12.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_189_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: cooking chicken\nB: weaving basket\nC: making a cake\nD: baking cookies", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: cooking chicken\nB: weaving basket\nC: making a cake\nD: baking cookies", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_13.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_190_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: eating chips\nB: cooking on campfire\nC: dining\nD: playing poker", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: eating chips\nB: cooking on campfire\nC: dining\nD: playing poker", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_14.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_191_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: throwing discus\nB: cutting watermelon\nC: bending back\nD: throwing axe", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: throwing discus\nB: cutting watermelon\nC: bending back\nD: throwing axe", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_192_15.png" + ], + "output": "D" + }, + { + "task": 
"general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: surfing water\nB: hoverboarding\nC: skateboarding\nD: riding scooter", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: surfing water\nB: hoverboarding\nC: skateboarding\nD: riding scooter", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_193_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: 
playing saxophone\nB: playing drums\nC: playing guitar\nD: playing trumpet", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing saxophone\nB: playing drums\nC: playing guitar\nD: playing trumpet", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_194_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: playing kickball\nB: kicking soccer ball\nC: dribbling basketball\nD: playing basketball", + "question": "What is the 
action performed by the person in the video?", + "context": "Select from the following choices.\nA: playing kickball\nB: kicking soccer ball\nC: dribbling basketball\nD: playing basketball", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_195_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: windsurfing\nB: snorkeling\nC: surfing water\nD: sailing", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: windsurfing\nB: 
snorkeling\nC: surfing water\nD: sailing", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_196_15.png" + ], + "output": "A" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: washing feet\nB: earning a hair cut\nC: using segway\nD: cleaning toilet", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: washing feet\nB: earning a hair cut\nC: using segway\nD: cleaning toilet", + "input_image_path": [ + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_197_15.png" + ], + "output": "D" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: folding clothes\nB: stretching leg\nC: lunge\nD: situp", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: folding clothes\nB: stretching leg\nC: lunge\nD: situp", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_0.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_1.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_198_15.png" + ], + "output": "C" + }, + { + "task": "general_action_recognition", + "visual_input_component": "Video image or Natural image", + "source": "kinetics400", + "options": "A: auctioning\nB: shuffling cards\nC: rock scissors paper\nD: news anchoring", + "question": "What is the action performed by the person in the video?", + "context": "Select from the following choices.\nA: auctioning\nB: shuffling cards\nC: rock scissors paper\nD: news anchoring", + "input_image_path": [ + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_0.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_1.png", + 
"../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_2.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_3.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_4.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_5.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_6.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_7.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_8.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_9.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_10.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_11.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_12.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_13.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_14.png", + "../MMIU-Benchmark/general_action_recognition/general_action_recognition_199_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 13.72\nB: 14.35\nC: 16.28\nD: 14.93", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 13.72\nB: 14.35\nC: 16.28\nD: 14.93", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_3.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_0_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 15.26\nB: 16.11\nC: 13.57\nD: 14.42", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 15.26\nB: 16.11\nC: 13.57\nD: 14.42", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_5.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_1_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 81.6\nB: 96.6\nC: 40.54\nD: 51.19", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 81.6\nB: 96.6\nC: 40.54\nD: 51.19", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_7.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_2_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 101.25\nB: 23.09\nC: 48.72\nD: 68.8", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 101.25\nB: 23.09\nC: 48.72\nD: 68.8", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_9.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_3_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 51.2\nB: 25.68\nC: 94.09\nD: 77.42", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 51.2\nB: 25.68\nC: 94.09\nD: 77.42", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_11.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_4_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 72.9\nB: 42.15\nC: 86.68\nD: 61.07", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 72.9\nB: 42.15\nC: 86.68\nD: 61.07", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_13.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_5_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 28.0\nB: 21.54\nC: 43.89\nD: 15.37", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 28.0\nB: 21.54\nC: 43.89\nD: 15.37", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_6_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + 
"visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 63.74\nB: 50.8\nC: 91.69\nD: 47.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 63.74\nB: 50.8\nC: 91.69\nD: 47.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_7_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 92.75\nB: 27.27\nC: 74.47\nD: 42.73", + "question": "What is the most probable action quality assessment number obtained by the person in the 
video?", + "context": "Select from the following choices.\nA: 92.75\nB: 27.27\nC: 74.47\nD: 42.73", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_8_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 36.08\nB: 43.14\nC: 23.0\nD: 13.43", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 36.08\nB: 43.14\nC: 23.0\nD: 13.43", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_9_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 14.7\nB: 15.8\nC: 15.61\nD: 13.11", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.7\nB: 15.8\nC: 15.61\nD: 13.11", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_10_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 38.64\nB: 78.37\nC: 62.38\nD: 65.49", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 38.64\nB: 78.37\nC: 62.38\nD: 65.49", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_11_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 90.59\nB: 72.12\nC: 53.48\nD: 37.87", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 90.59\nB: 72.12\nC: 53.48\nD: 37.87", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_12_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 73.13\nB: 42.05\nC: 88.2\nD: 34.17", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 73.13\nB: 42.05\nC: 88.2\nD: 34.17", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_13_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 41.65\nB: 31.16\nC: 24.74\nD: 17.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 41.65\nB: 31.16\nC: 24.74\nD: 17.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_14_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 90.09\nB: 49.05\nC: 65.44\nD: 62.51", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 90.09\nB: 49.05\nC: 65.44\nD: 62.51", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_15_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 66.33\nB: 88.78\nC: 39.89\nD: 57.84", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 66.33\nB: 88.78\nC: 39.89\nD: 57.84", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_16_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 66.72\nB: 92.78\nC: 51.35\nD: 21.6", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 66.72\nB: 92.78\nC: 51.35\nD: 21.6", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_17_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + 
"source": "AQA7", + "options": "A: 13.6\nB: 14.75\nC: 16.56\nD: 15.4", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 13.6\nB: 14.75\nC: 16.56\nD: 15.4", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_18_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 47.0\nB: 35.44\nC: 19.47\nD: 14.22", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the 
following choices.\nA: 47.0\nB: 35.44\nC: 19.47\nD: 14.22", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_19_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 14.04\nB: 15.74\nC: 13.89\nD: 15.78", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.04\nB: 15.74\nC: 13.89\nD: 15.78", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_20_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 20.0\nB: 49.69\nC: 31.96\nD: 12.28", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 20.0\nB: 49.69\nC: 31.96\nD: 12.28", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_21_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 38.52\nB: 21.11\nC: 16.2\nD: 31.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 38.52\nB: 21.11\nC: 16.2\nD: 31.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_22_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 58.15\nB: 66.19\nC: 34.82\nD: 81.2", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 58.15\nB: 66.19\nC: 34.82\nD: 81.2", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_23_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 74.85\nB: 45.75\nC: 84.63\nD: 56.85", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 74.85\nB: 45.75\nC: 84.63\nD: 56.85", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_24_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 31.78\nB: 16.54\nC: 38.0\nD: 23.93", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 31.78\nB: 16.54\nC: 38.0\nD: 23.93", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_25_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 24.67\nB: 33.0\nC: 49.16\nD: 9.13", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 24.67\nB: 33.0\nC: 49.16\nD: 9.13", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_26_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 16.09\nB: 13.82\nC: 15.43\nD: 14.88", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 16.09\nB: 13.82\nC: 15.43\nD: 14.88", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_27_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 37.4\nB: 56.4\nC: 69.5\nD: 101.82", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 37.4\nB: 56.4\nC: 69.5\nD: 101.82", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_28_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": 
"AQA7", + "options": "A: 41.0\nB: 8.77\nC: 19.67\nD: 35.16", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 41.0\nB: 8.77\nC: 19.67\nD: 35.16", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_29_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 56.82\nB: 84.15\nC: 26.52\nD: 66.29", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following 
choices.\nA: 56.82\nB: 84.15\nC: 26.52\nD: 66.29", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_30_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 74.25\nB: 82.65\nC: 56.79\nD: 24.3", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 74.25\nB: 82.65\nC: 56.79\nD: 24.3", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_31_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 54.04\nB: 76.33\nC: 37.87\nD: 87.47", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 54.04\nB: 76.33\nC: 37.87\nD: 87.47", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_32_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 37.95\nB: 66.54\nC: 52.06\nD: 98.65", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 37.95\nB: 66.54\nC: 52.06\nD: 98.65", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_33_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 65.39\nB: 77.95\nC: 54.0\nD: 94.23", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 65.39\nB: 77.95\nC: 54.0\nD: 94.23", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_34_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 41.04\nB: 18.12\nC: 33.0\nD: 11.99", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 41.04\nB: 18.12\nC: 33.0\nD: 11.99", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_35_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 50.0\nB: 13.02\nC: 19.88\nD: 34.9", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 50.0\nB: 13.02\nC: 19.88\nD: 34.9", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_36_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 21.52\nB: 30.43\nC: 15.58\nD: 47.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 21.52\nB: 30.43\nC: 15.58\nD: 47.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_37_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 51.1\nB: 86.7\nC: 69.2\nD: 94.61", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 51.1\nB: 86.7\nC: 69.2\nD: 94.61", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_38_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 71.06\nB: 94.05\nC: 55.11\nD: 37.78", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 71.06\nB: 94.05\nC: 55.11\nD: 37.78", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_39_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + 
"source": "UNLV", + "options": "A: 13.81\nB: 15.16\nC: 16.04\nD: 14.68", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 13.81\nB: 15.16\nC: 16.04\nD: 14.68", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_40_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 13.67\nB: 14.93\nC: 14.61\nD: 16.1", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the 
following choices.\nA: 13.67\nB: 14.93\nC: 14.61\nD: 16.1", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_41_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 48.49\nB: 88.9\nC: 64.35\nD: 37.08", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 48.49\nB: 88.9\nC: 64.35\nD: 37.08", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_42_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 37.22\nB: 46.55\nC: 24.0\nD: 9.84", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 37.22\nB: 46.55\nC: 24.0\nD: 9.84", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_43_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 79.2\nB: 57.54\nC: 41.45\nD: 87.34", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 79.2\nB: 57.54\nC: 41.45\nD: 87.34", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_44_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 69.3\nB: 54.14\nC: 30.81\nD: 86.4", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 69.3\nB: 54.14\nC: 30.81\nD: 86.4", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_45_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 14.18\nB: 14.89\nC: 16.62\nD: 13.77", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.18\nB: 14.89\nC: 16.62\nD: 13.77", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_46_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 16.13\nB: 15.7\nC: 13.4\nD: 14.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 16.13\nB: 15.7\nC: 13.4\nD: 14.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_47_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 16.07\nB: 14.31\nC: 13.42\nD: 15.77", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 16.07\nB: 14.31\nC: 13.42\nD: 15.77", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_48_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 55.37\nB: 32.43\nC: 86.4\nD: 71.99", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 55.37\nB: 32.43\nC: 86.4\nD: 71.99", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_49_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 48.34\nB: 69.75\nC: 86.77\nD: 99.17", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 48.34\nB: 69.75\nC: 86.77\nD: 99.17", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_50_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + 
"source": "UNLV", + "options": "A: 15.1\nB: 14.74\nC: 15.78\nD: 13.81", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 15.1\nB: 14.74\nC: 15.78\nD: 13.81", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_51_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 39.77\nB: 93.48\nC: 76.5\nD: 43.86", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the 
following choices.\nA: 39.77\nB: 93.48\nC: 76.5\nD: 43.86", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_52_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 15.3\nB: 13.45\nC: 14.3\nD: 15.81", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 15.3\nB: 13.45\nC: 14.3\nD: 15.81", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_53_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 15.38\nB: 16.05\nC: 13.85\nD: 14.2", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 15.38\nB: 16.05\nC: 13.85\nD: 14.2", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_54_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 40.46\nB: 77.72\nC: 86.4\nD: 42.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 40.46\nB: 77.72\nC: 86.4\nD: 42.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_55_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 73.6\nB: 57.54\nC: 28.87\nD: 92.2", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 73.6\nB: 57.54\nC: 28.87\nD: 92.2", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_56_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 32.99\nB: 51.0\nC: 90.75\nD: 66.64", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 32.99\nB: 51.0\nC: 90.75\nD: 66.64", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_57_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 60.36\nB: 72.34\nC: 80.64\nD: 92.23", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 60.36\nB: 72.34\nC: 80.64\nD: 92.23", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_58_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 14.2\nB: 13.73\nC: 15.59\nD: 16.57", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.2\nB: 13.73\nC: 15.59\nD: 16.57", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_59_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 13.97\nB: 14.16\nC: 15.7\nD: 16.2", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 13.97\nB: 14.16\nC: 15.7\nD: 16.2", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_60_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 45.92\nB: 11.52\nC: 25.38\nD: 36.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 45.92\nB: 11.52\nC: 25.38\nD: 36.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_61_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + 
"source": "AQA7", + "options": "A: 35.36\nB: 74.12\nC: 49.39\nD: 93.6", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 35.36\nB: 74.12\nC: 49.39\nD: 93.6", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_62_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 62.18\nB: 42.9\nC: 80.12\nD: 63.81", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the 
following choices.\nA: 62.18\nB: 42.9\nC: 80.12\nD: 63.81", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_63_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 18.7\nB: 40.55\nC: 33.0\nD: 11.12", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 18.7\nB: 40.55\nC: 33.0\nD: 11.12", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_64_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 94.83\nB: 59.75\nC: 84.99\nD: 65.28", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 94.83\nB: 59.75\nC: 84.99\nD: 65.28", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_65_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 36.11\nB: 77.04\nC: 44.05\nD: 90.75", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 36.11\nB: 77.04\nC: 44.05\nD: 90.75", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_66_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 15.7\nB: 23.32\nC: 35.3\nD: 40.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 15.7\nB: 23.32\nC: 35.3\nD: 40.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_67_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 41.0\nB: 16.81\nC: 37.0\nD: 26.08", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 41.0\nB: 16.81\nC: 37.0\nD: 26.08", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_68_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 26.98\nB: 14.1\nC: 37.94\nD: 47.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 26.98\nB: 14.1\nC: 37.94\nD: 47.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_69_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 15.92\nB: 13.69\nC: 15.11\nD: 14.18", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 15.92\nB: 13.69\nC: 15.11\nD: 14.18", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_70_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 14.7\nB: 16.47\nC: 13.22\nD: 15.67", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.7\nB: 16.47\nC: 13.22\nD: 15.67", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_71_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 14.15\nB: 16.15\nC: 14.98\nD: 13.84", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.15\nB: 16.15\nC: 14.98\nD: 13.84", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_72_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + 
"source": "AQA7", + "options": "A: 22.27\nB: 31.0\nC: 16.79\nD: 42.84", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 22.27\nB: 31.0\nC: 16.79\nD: 42.84", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_73_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 35.12\nB: 8.34\nC: 19.09\nD: 41.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the 
following choices.\nA: 35.12\nB: 8.34\nC: 19.09\nD: 41.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_74_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 45.54\nB: 41.25\nC: 88.05\nD: 65.27", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 45.54\nB: 41.25\nC: 88.05\nD: 65.27", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_75_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 15.13\nB: 16.53\nC: 13.86\nD: 14.76", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 15.13\nB: 16.53\nC: 13.86\nD: 14.76", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_76_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 13.44\nB: 15.84\nC: 14.1\nD: 15.39", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 13.44\nB: 15.84\nC: 14.1\nD: 15.39", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_77_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 89.31\nB: 76.24\nC: 49.14\nD: 45.07", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 89.31\nB: 76.24\nC: 49.14\nD: 45.07", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_78_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 55.81\nB: 63.15\nC: 82.13\nD: 87.69", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 55.81\nB: 63.15\nC: 82.13\nD: 87.69", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_79_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 25.35\nB: 36.25\nC: 46.29\nD: 11.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 25.35\nB: 36.25\nC: 46.29\nD: 11.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_80_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 16.37\nB: 14.69\nC: 14.96\nD: 13.68", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 16.37\nB: 14.69\nC: 14.96\nD: 13.68", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_81_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 13.2\nB: 16.1\nC: 14.73\nD: 15.48", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 13.2\nB: 16.1\nC: 14.73\nD: 15.48", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_82_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 53.37\nB: 89.89\nC: 65.12\nD: 80.19", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 53.37\nB: 89.89\nC: 65.12\nD: 80.19", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_83_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + 
"source": "AQA7", + "options": "A: 15.87\nB: 15.68\nC: 13.35\nD: 14.7", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 15.87\nB: 15.68\nC: 13.35\nD: 14.7", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_84_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 99.08\nB: 71.08\nC: 82.07\nD: 53.4", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the 
following choices.\nA: 99.08\nB: 71.08\nC: 82.07\nD: 53.4", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_85_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 40.56\nB: 91.83\nC: 52.96\nD: 68.4", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 40.56\nB: 91.83\nC: 52.96\nD: 68.4", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_86_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 15.95\nB: 15.3\nC: 13.18\nD: 14.5", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 15.95\nB: 15.3\nC: 13.18\nD: 14.5", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_87_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 51.62\nB: 31.41\nC: 75.9\nD: 83.4", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 51.62\nB: 31.41\nC: 75.9\nD: 83.4", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_88_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 48.83\nB: 25.57\nC: 98.96\nD: 67.3", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 48.83\nB: 25.57\nC: 98.96\nD: 67.3", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_89_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 22.27\nB: 45.0\nC: 31.7\nD: 16.21", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 22.27\nB: 45.0\nC: 31.7\nD: 16.21", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_90_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 30.46\nB: 66.0\nC: 85.93\nD: 57.57", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 30.46\nB: 66.0\nC: 85.93\nD: 57.57", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_91_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 13.0\nB: 32.72\nC: 24.42\nD: 44.28", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 13.0\nB: 32.72\nC: 24.42\nD: 44.28", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_92_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 14.83\nB: 13.36\nC: 15.78\nD: 15.59", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.83\nB: 13.36\nC: 15.78\nD: 15.59", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_93_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 61.36\nB: 72.0\nC: 29.41\nD: 90.82", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 61.36\nB: 72.0\nC: 29.41\nD: 90.82", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_94_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + 
"source": "UNLV", + "options": "A: 92.8\nB: 55.03\nC: 40.44\nD: 79.92", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 92.8\nB: 55.03\nC: 40.44\nD: 79.92", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_95_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 62.61\nB: 70.36\nC: 37.92\nD: 83.27", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the 
following choices.\nA: 62.61\nB: 70.36\nC: 37.92\nD: 83.27", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_96_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 57.41\nB: 88.26\nC: 68.75\nD: 43.6", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 57.41\nB: 88.26\nC: 68.75\nD: 43.6", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_97_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 72.66\nB: 52.95\nC: 37.59\nD: 78.8", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 72.66\nB: 52.95\nC: 37.59\nD: 78.8", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_98_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 31.43\nB: 41.0\nC: 23.56\nD: 11.6", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 31.43\nB: 41.0\nC: 23.56\nD: 11.6", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_99_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 15.83\nB: 14.74\nC: 15.03\nD: 13.86", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 15.83\nB: 14.74\nC: 15.03\nD: 13.86", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_100_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 16.34\nB: 14.83\nC: 13.65\nD: 14.95", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 16.34\nB: 14.83\nC: 13.65\nD: 14.95", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_101_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 36.1\nB: 62.13\nC: 86.47\nD: 69.52", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 36.1\nB: 62.13\nC: 86.47\nD: 69.52", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_102_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 95.33\nB: 48.04\nC: 78.15\nD: 66.6", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 95.33\nB: 48.04\nC: 78.15\nD: 66.6", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_103_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 83.75\nB: 40.4\nC: 70.73\nD: 54.88", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 83.75\nB: 40.4\nC: 70.73\nD: 54.88", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_104_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 19.0\nB: 44.42\nC: 17.03\nD: 30.18", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 19.0\nB: 44.42\nC: 17.03\nD: 30.18", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_105_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural 
image", + "source": "AQA7", + "options": "A: 13.88\nB: 31.45\nC: 40.87\nD: 24.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 13.88\nB: 31.45\nC: 40.87\nD: 24.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_106_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 67.5\nB: 45.43\nC: 96.95\nD: 33.01", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + 
"context": "Select from the following choices.\nA: 67.5\nB: 45.43\nC: 96.95\nD: 33.01", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_107_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 68.32\nB: 43.26\nC: 50.74\nD: 81.04", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 68.32\nB: 43.26\nC: 50.74\nD: 81.04", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_108_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 68.16\nB: 95.11\nC: 83.52\nD: 61.72", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 68.16\nB: 95.11\nC: 83.52\nD: 61.72", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_109_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 86.14\nB: 52.58\nC: 90.32\nD: 68.4", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 86.14\nB: 52.58\nC: 90.32\nD: 68.4", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_110_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 94.54\nB: 23.49\nC: 73.6\nD: 58.19", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 94.54\nB: 23.49\nC: 73.6\nD: 58.19", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_111_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 96.12\nB: 52.2\nC: 63.19\nD: 85.65", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 96.12\nB: 52.2\nC: 63.19\nD: 85.65", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_112_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 60.04\nB: 48.45\nC: 85.05\nD: 73.29", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 60.04\nB: 48.45\nC: 85.05\nD: 73.29", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_113_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 76.5\nB: 46.5\nC: 24.38\nD: 83.3", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 76.5\nB: 46.5\nC: 24.38\nD: 83.3", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_114_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 13.96\nB: 16.01\nC: 14.28\nD: 15.62", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 13.96\nB: 16.01\nC: 14.28\nD: 15.62", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_115_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 84.03\nB: 61.73\nC: 74.46\nD: 91.2", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 84.03\nB: 61.73\nC: 74.46\nD: 91.2", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_116_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural 
image", + "source": "UNLV", + "options": "A: 37.57\nB: 89.35\nC: 60.88\nD: 71.91", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 37.57\nB: 89.35\nC: 60.88\nD: 71.91", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_117_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 81.6\nB: 32.15\nC: 50.53\nD: 102.55", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + 
"context": "Select from the following choices.\nA: 81.6\nB: 32.15\nC: 50.53\nD: 102.55", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_118_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 84.35\nB: 51.95\nC: 74.77\nD: 39.8", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 84.35\nB: 51.95\nC: 74.77\nD: 39.8", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_119_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 17.44\nB: 41.32\nC: 22.07\nD: 38.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 17.44\nB: 41.32\nC: 22.07\nD: 38.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_120_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 78.4\nB: 61.99\nC: 95.04\nD: 38.43", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 78.4\nB: 61.99\nC: 95.04\nD: 38.43", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_121_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 15.97\nB: 13.55\nC: 15.16\nD: 14.35", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 15.97\nB: 13.55\nC: 15.16\nD: 14.35", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_122_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 78.54\nB: 73.19\nC: 89.42\nD: 51.58", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 78.54\nB: 73.19\nC: 89.42\nD: 51.58", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_123_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 42.07\nB: 14.32\nC: 21.13\nD: 35.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 42.07\nB: 14.32\nC: 21.13\nD: 35.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_124_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 48.93\nB: 98.95\nC: 23.75\nD: 77.7", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 48.93\nB: 98.95\nC: 23.75\nD: 77.7", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_125_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 67.61\nB: 60.31\nC: 83.25\nD: 31.09", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 67.61\nB: 60.31\nC: 83.25\nD: 31.09", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_126_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 14.46\nB: 13.93\nC: 14.95\nD: 16.22", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.46\nB: 13.93\nC: 14.95\nD: 16.22", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_127_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or 
Natural image", + "source": "AQA7", + "options": "A: 38.86\nB: 39.64\nC: 8.48\nD: 26.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 38.86\nB: 39.64\nC: 8.48\nD: 26.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_128_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 37.25\nB: 12.0\nC: 29.63\nD: 19.34", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + 
"context": "Select from the following choices.\nA: 37.25\nB: 12.0\nC: 29.63\nD: 19.34", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_129_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 14.81\nB: 15.5\nC: 16.04\nD: 13.6", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.81\nB: 15.5\nC: 16.04\nD: 13.6", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_130_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 44.49\nB: 86.4\nC: 64.43\nD: 31.81", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 44.49\nB: 86.4\nC: 64.43\nD: 31.81", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_131_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 44.76\nB: 68.61\nC: 83.25\nD: 27.55", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 44.76\nB: 68.61\nC: 83.25\nD: 27.55", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_132_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 16.14\nB: 15.63\nC: 14.3\nD: 13.88", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 16.14\nB: 15.63\nC: 14.3\nD: 13.88", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_133_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 13.73\nB: 15.1\nC: 14.23\nD: 15.92", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 13.73\nB: 15.1\nC: 14.23\nD: 15.92", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_134_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 16.24\nB: 14.52\nC: 13.88\nD: 14.96", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 16.24\nB: 14.52\nC: 13.88\nD: 14.96", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_135_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 71.13\nB: 88.76\nC: 45.99\nD: 63.02", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 71.13\nB: 88.76\nC: 45.99\nD: 63.02", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_136_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 16.45\nB: 15.03\nC: 14.0\nD: 13.32", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 16.45\nB: 15.03\nC: 14.0\nD: 13.32", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_137_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 32.49\nB: 9.52\nC: 44.0\nD: 24.05", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 32.49\nB: 9.52\nC: 44.0\nD: 24.05", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_138_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural 
image", + "source": "AQA7", + "options": "A: 25.67\nB: 76.96\nC: 87.45\nD: 43.98", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 25.67\nB: 76.96\nC: 87.45\nD: 43.98", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_139_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 72.41\nB: 41.9\nC: 32.38\nD: 93.6", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + 
"context": "Select from the following choices.\nA: 72.41\nB: 41.9\nC: 32.38\nD: 93.6", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_140_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 35.05\nB: 79.28\nC: 46.7\nD: 84.6", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 35.05\nB: 79.28\nC: 46.7\nD: 84.6", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_141_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 101.34\nB: 42.34\nC: 36.95\nD: 77.4", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 101.34\nB: 42.34\nC: 36.95\nD: 77.4", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_142_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 39.85\nB: 23.26\nC: 11.0\nD: 29.1", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 39.85\nB: 23.26\nC: 11.0\nD: 29.1", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_143_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 69.17\nB: 82.13\nC: 88.66\nD: 54.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 69.17\nB: 82.13\nC: 88.66\nD: 54.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_144_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 16.63\nB: 14.83\nC: 13.58\nD: 15.26", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 16.63\nB: 14.83\nC: 13.58\nD: 15.26", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_145_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 91.08\nB: 67.02\nC: 86.56\nD: 51.84", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 91.08\nB: 67.02\nC: 86.56\nD: 51.84", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_146_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 93.6\nB: 34.32\nC: 45.3\nD: 80.35", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 93.6\nB: 34.32\nC: 45.3\nD: 80.35", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_147_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 92.76\nB: 67.5\nC: 29.61\nD: 57.89", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 92.76\nB: 67.5\nC: 29.61\nD: 57.89", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_148_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 54.53\nB: 83.14\nC: 70.61\nD: 37.4", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 54.53\nB: 83.14\nC: 70.61\nD: 37.4", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_149_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural 
image", + "source": "UNLV", + "options": "A: 16.27\nB: 14.39\nC: 14.98\nD: 13.46", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 16.27\nB: 14.39\nC: 14.98\nD: 13.46", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_150_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 76.5\nB: 84.69\nC: 28.49\nD: 56.71", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + 
"context": "Select from the following choices.\nA: 76.5\nB: 84.69\nC: 28.49\nD: 56.71", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_151_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 14.6\nB: 16.51\nC: 13.21\nD: 14.95", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.6\nB: 16.51\nC: 13.21\nD: 14.95", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_152_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 14.04\nB: 16.4\nC: 13.4\nD: 15.01", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.04\nB: 16.4\nC: 13.4\nD: 15.01", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_153_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 82.8\nB: 70.51\nC: 42.58\nD: 37.94", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 82.8\nB: 70.51\nC: 42.58\nD: 37.94", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_154_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 48.64\nB: 74.5\nC: 83.35\nD: 55.78", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 48.64\nB: 74.5\nC: 83.35\nD: 55.78", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_155_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 14.95\nB: 13.58\nC: 16.0\nD: 14.83", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.95\nB: 13.58\nC: 16.0\nD: 14.83", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_156_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 97.5\nB: 73.72\nC: 80.44\nD: 51.6", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 97.5\nB: 73.72\nC: 80.44\nD: 51.6", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_157_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 80.0\nB: 86.4\nC: 39.44\nD: 55.46", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 80.0\nB: 86.4\nC: 39.44\nD: 55.46", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_158_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 32.39\nB: 43.5\nC: 15.24\nD: 29.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 32.39\nB: 43.5\nC: 15.24\nD: 29.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_159_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 82.8\nB: 37.35\nC: 78.23\nD: 49.83", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 82.8\nB: 37.35\nC: 78.23\nD: 49.83", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_160_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural 
image", + "source": "AQA7", + "options": "A: 40.75\nB: 19.58\nC: 10.08\nD: 36.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 40.75\nB: 19.58\nC: 10.08\nD: 36.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_161_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 14.68\nB: 13.47\nC: 15.27\nD: 16.06", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + 
"context": "Select from the following choices.\nA: 14.68\nB: 13.47\nC: 15.27\nD: 16.06", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_162_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 16.0\nB: 14.89\nC: 14.5\nD: 13.53", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 16.0\nB: 14.89\nC: 14.5\nD: 13.53", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_163_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 15.03\nB: 14.01\nC: 13.19\nD: 16.56", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 15.03\nB: 14.01\nC: 13.19\nD: 16.56", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_164_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 14.2\nB: 16.29\nC: 13.27\nD: 15.71", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.2\nB: 16.29\nC: 13.27\nD: 15.71", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_165_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 65.74\nB: 51.2\nC: 98.5\nD: 32.59", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 65.74\nB: 51.2\nC: 98.5\nD: 32.59", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_166_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 35.99\nB: 28.0\nC: 11.27\nD: 44.98", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 35.99\nB: 28.0\nC: 11.27\nD: 44.98", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_167_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 93.6\nB: 24.93\nC: 46.53\nD: 76.42", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 93.6\nB: 24.93\nC: 46.53\nD: 76.42", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_168_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 28.75\nB: 38.43\nC: 44.48\nD: 10.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 28.75\nB: 38.43\nC: 44.48\nD: 10.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_169_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 49.6\nB: 41.25\nC: 92.75\nD: 70.1", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 49.6\nB: 41.25\nC: 92.75\nD: 70.1", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_170_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 48.18\nB: 22.07\nC: 92.38\nD: 81.6", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 48.18\nB: 22.07\nC: 92.38\nD: 81.6", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_171_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural 
image", + "source": "UNLV", + "options": "A: 87.45\nB: 34.76\nC: 77.5\nD: 41.85", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 87.45\nB: 34.76\nC: 77.5\nD: 41.85", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_172_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 16.16\nB: 34.32\nC: 45.45\nD: 29.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + 
"context": "Select from the following choices.\nA: 16.16\nB: 34.32\nC: 45.45\nD: 29.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_173_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 57.09\nB: 81.93\nC: 45.29\nD: 72.71", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 57.09\nB: 81.93\nC: 45.29\nD: 72.71", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_174_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 14.94\nB: 13.47\nC: 15.9\nD: 14.73", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.94\nB: 13.47\nC: 15.9\nD: 14.73", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_175_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 35.17\nB: 54.56\nC: 66.0\nD: 90.91", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 35.17\nB: 54.56\nC: 66.0\nD: 90.91", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_176_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 14.59\nB: 13.65\nC: 16.31\nD: 15.23", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.59\nB: 13.65\nC: 16.31\nD: 15.23", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_177_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 27.96\nB: 41.0\nC: 11.9\nD: 32.86", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 27.96\nB: 41.0\nC: 11.9\nD: 32.86", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_178_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 74.71\nB: 51.0\nC: 75.78\nD: 99.25", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 74.71\nB: 51.0\nC: 75.78\nD: 99.25", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_179_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 101.34\nB: 74.25\nC: 34.1\nD: 46.12", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 101.34\nB: 74.25\nC: 34.1\nD: 46.12", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_180_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 17.75\nB: 42.44\nC: 31.0\nD: 11.74", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 17.75\nB: 42.44\nC: 31.0\nD: 11.74", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_181_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 77.89\nB: 55.89\nC: 27.81\nD: 83.2", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 77.89\nB: 55.89\nC: 27.81\nD: 83.2", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_182_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural 
image", + "source": "AQA7", + "options": "A: 94.56\nB: 58.85\nC: 65.26\nD: 81.6", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 94.56\nB: 58.85\nC: 65.26\nD: 81.6", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_183_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 23.0\nB: 56.64\nC: 75.07\nD: 102.6", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + 
"context": "Select from the following choices.\nA: 23.0\nB: 56.64\nC: 75.07\nD: 102.6", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_184_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 14.38\nB: 13.93\nC: 16.39\nD: 15.1", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 14.38\nB: 13.93\nC: 16.39\nD: 15.1", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_185_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 43.81\nB: 73.78\nC: 84.79\nD: 57.46", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 43.81\nB: 73.78\nC: 84.79\nD: 57.46", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_186_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 26.45\nB: 78.21\nC: 52.14\nD: 100.8", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 26.45\nB: 78.21\nC: 52.14\nD: 100.8", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_187_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 94.63\nB: 70.2\nC: 25.23\nD: 54.96", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 94.63\nB: 70.2\nC: 25.23\nD: 54.96", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_188_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 62.72\nB: 91.88\nC: 64.2\nD: 40.77", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 62.72\nB: 91.88\nC: 64.2\nD: 40.77", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_8.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_189_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 47.0\nB: 9.51\nC: 38.33\nD: 20.67", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 47.0\nB: 9.51\nC: 38.33\nD: 20.67", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_10.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_190_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 78.4\nB: 24.91\nC: 86.85\nD: 60.54", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 78.4\nB: 24.91\nC: 86.85\nD: 60.54", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_12.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_191_15.png" + ], + "output": "A" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 63.76\nB: 54.54\nC: 84.48\nD: 95.09", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 63.76\nB: 54.54\nC: 84.48\nD: 95.09", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_14.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_192_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 41.0\nB: 10.89\nC: 36.0\nD: 24.91", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 41.0\nB: 10.89\nC: 36.0\nD: 24.91", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_193_15.png" + ], + "output": "C" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural 
image", + "source": "UNLV", + "options": "A: 88.27\nB: 32.81\nC: 43.39\nD: 81.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 88.27\nB: 32.81\nC: 43.39\nD: 81.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_194_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 36.57\nB: 90.89\nC: 60.88\nD: 64.62", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + 
"context": "Select from the following choices.\nA: 36.57\nB: 90.89\nC: 60.88\nD: 64.62", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_195_15.png" + ], + "output": "D" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 79.28\nB: 49.74\nC: 39.01\nD: 69.35", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 79.28\nB: 49.74\nC: 39.01\nD: 69.35", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_0.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_196_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 23.36\nB: 79.8\nC: 89.26\nD: 44.0", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 23.36\nB: 79.8\nC: 89.26\nD: 44.0", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_2.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_197_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "AQA7", + "options": "A: 60.76\nB: 79.8\nC: 90.7\nD: 57.88", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 60.76\nB: 79.8\nC: 90.7\nD: 57.88", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_4.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_6.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_198_15.png" + ], + "output": "B" + }, + { + "task": "action_quality_assessment", + "visual_input_component": "Video image or Natural image", + "source": "UNLV", + "options": "A: 16.67\nB: 14.55\nC: 13.7\nD: 14.9", + "question": "What is the most probable action quality assessment number obtained by the person in the video?", + "context": "Select from the following choices.\nA: 16.67\nB: 14.55\nC: 13.7\nD: 14.9", + "input_image_path": [ + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_0.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_1.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_2.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_3.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_4.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_5.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_6.png", + 
"../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_7.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_8.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_9.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_10.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_11.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_12.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_13.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_14.png", + "../MMIU-Benchmark/action_quality_assessment/action_quality_assessment_199_15.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_0_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_0_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_0_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_0_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_0_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_1_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_1_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_1_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_1_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_1_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_2_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_2_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_2_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_2_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_2_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_3_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_3_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_3_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_3_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_3_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_4_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_4_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_4_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_4_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_4_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_5_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_5_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_5_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_5_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_5_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_6_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_6_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_6_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_6_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_6_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_7_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_7_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_7_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_7_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_7_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_8_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_8_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_8_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_8_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_8_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_9_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_9_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_9_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_9_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_9_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_10_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_10_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_10_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_10_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_10_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_11_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_11_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_11_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_11_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_11_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_12_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_12_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_12_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_12_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_12_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_13_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_13_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_13_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_13_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_13_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_14_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_14_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_14_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_14_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_14_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_15_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_15_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_15_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_15_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_15_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_16_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_16_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_16_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_16_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_16_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_17_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_17_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_17_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_17_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_17_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_18_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_18_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_18_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_18_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_18_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_19_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_19_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_19_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_19_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_19_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_20_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_20_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_20_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_20_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_20_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_21_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_21_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_21_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_21_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_21_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_22_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_22_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_22_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_22_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_22_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_23_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_23_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_23_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_23_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_23_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_24_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_24_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_24_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_24_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_24_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_25_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_25_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_25_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_25_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_25_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_26_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_26_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_26_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_26_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_26_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_27_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_27_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_27_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_27_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_27_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_28_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_28_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_28_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_28_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_28_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_29_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_29_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_29_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_29_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_29_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_30_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_30_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_30_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_30_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_30_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_31_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_31_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_31_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_31_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_31_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_32_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_32_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_32_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_32_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_32_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_33_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_33_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_33_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_33_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_33_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_34_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_34_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_34_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_34_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_34_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_35_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_35_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_35_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_35_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_35_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_36_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_36_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_36_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_36_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_36_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_37_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_37_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_37_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_37_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_37_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_38_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_38_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_38_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_38_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_38_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_39_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_39_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_39_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_39_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_39_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_40_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_40_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_40_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_40_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_40_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_41_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_41_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_41_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_41_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_41_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_42_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_42_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_42_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_42_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_42_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_43_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_43_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_43_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_43_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_43_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_44_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_44_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_44_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_44_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_44_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_45_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_45_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_45_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_45_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_45_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_46_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_46_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_46_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_46_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_46_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_47_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_47_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_47_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_47_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_47_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_48_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_48_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_48_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_48_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_48_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_49_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_49_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_49_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_49_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_49_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_50_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_50_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_50_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_50_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_50_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_51_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_51_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_51_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_51_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_51_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_52_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_52_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_52_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_52_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_52_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_53_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_53_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_53_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_53_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_53_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_54_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_54_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_54_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_54_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_54_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_55_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_55_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_55_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_55_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_55_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_56_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_56_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_56_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_56_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_56_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_57_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_57_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_57_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_57_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_57_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_58_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_58_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_58_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_58_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_58_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_59_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_59_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_59_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_59_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_59_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_60_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_60_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_60_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_60_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_60_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_61_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_61_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_61_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_61_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_61_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_62_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_62_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_62_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_62_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_62_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_63_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_63_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_63_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_63_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_63_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_64_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_64_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_64_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_64_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_64_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_65_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_65_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_65_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_65_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_65_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_66_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_66_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_66_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_66_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_66_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_67_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_67_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_67_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_67_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_67_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_68_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_68_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_68_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_68_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_68_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_69_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_69_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_69_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_69_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_69_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_70_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_70_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_70_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_70_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_70_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_71_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_71_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_71_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_71_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_71_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_72_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_72_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_72_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_72_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_72_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_73_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_73_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_73_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_73_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_73_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_74_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_74_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_74_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_74_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_74_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_75_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_75_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_75_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_75_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_75_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_76_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_76_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_76_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_76_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_76_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_77_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_77_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_77_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_77_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_77_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_78_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_78_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_78_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_78_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_78_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_79_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_79_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_79_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_79_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_79_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_80_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_80_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_80_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_80_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_80_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_81_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_81_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_81_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_81_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_81_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_82_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_82_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_82_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_82_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_82_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_83_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_83_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_83_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_83_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_83_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_84_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_84_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_84_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_84_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_84_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_85_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_85_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_85_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_85_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_85_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_86_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_86_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_86_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_86_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_86_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_87_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_87_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_87_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_87_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_87_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_88_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_88_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_88_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_88_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_88_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_89_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_89_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_89_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_89_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_89_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_90_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_90_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_90_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_90_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_90_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_91_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_91_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_91_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_91_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_91_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_92_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_92_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_92_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_92_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_92_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_93_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_93_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_93_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_93_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_93_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_94_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_94_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_94_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_94_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_94_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_95_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_95_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_95_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_95_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_95_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_96_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_96_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_96_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_96_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_96_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_97_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_97_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_97_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_97_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_97_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_98_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_98_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_98_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_98_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_98_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_99_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_99_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_99_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_99_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_99_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_100_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_100_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_100_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_100_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_100_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_101_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_101_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_101_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_101_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_101_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_102_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_102_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_102_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_102_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_102_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_103_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_103_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_103_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_103_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_103_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_104_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_104_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_104_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_104_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_104_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_105_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_105_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_105_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_105_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_105_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_106_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_106_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_106_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_106_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_106_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_107_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_107_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_107_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_107_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_107_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_108_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_108_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_108_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_108_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_108_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_109_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_109_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_109_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_109_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_109_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_110_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_110_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_110_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_110_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_110_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_111_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_111_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_111_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_111_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_111_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_112_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_112_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_112_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_112_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_112_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_113_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_113_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_113_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_113_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_113_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_114_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_114_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_114_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_114_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_114_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_115_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_115_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_115_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_115_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_115_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_116_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_116_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_116_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_116_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_116_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_117_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_117_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_117_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_117_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_117_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_118_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_118_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_118_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_118_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_118_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_119_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_119_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_119_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_119_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_119_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_120_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_120_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_120_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_120_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_120_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_121_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_121_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_121_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_121_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_121_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_122_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_122_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_122_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_122_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_122_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_123_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_123_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_123_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_123_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_123_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_124_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_124_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_124_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_124_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_124_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_125_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_125_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_125_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_125_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_125_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_126_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_126_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_126_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_126_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_126_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_127_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_127_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_127_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_127_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_127_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_128_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_128_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_128_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_128_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_128_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_129_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_129_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_129_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_129_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_129_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_130_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_130_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_130_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_130_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_130_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_131_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_131_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_131_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_131_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_131_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_132_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_132_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_132_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_132_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_132_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_133_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_133_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_133_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_133_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_133_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_134_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_134_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_134_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_134_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_134_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_135_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_135_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_135_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_135_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_135_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_136_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_136_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_136_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_136_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_136_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_137_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_137_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_137_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_137_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_137_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_138_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_138_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_138_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_138_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_138_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_139_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_139_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_139_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_139_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_139_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_140_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_140_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_140_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_140_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_140_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_141_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_141_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_141_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_141_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_141_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_142_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_142_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_142_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_142_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_142_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_143_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_143_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_143_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_143_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_143_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_144_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_144_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_144_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_144_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_144_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_145_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_145_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_145_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_145_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_145_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_146_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_146_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_146_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_146_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_146_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_147_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_147_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_147_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_147_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_147_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_148_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_148_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_148_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_148_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_148_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_149_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_149_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_149_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_149_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_149_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_150_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_150_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_150_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_150_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_150_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_151_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_151_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_151_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_151_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_151_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_152_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_152_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_152_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_152_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_152_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_153_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_153_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_153_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_153_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_153_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_154_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_154_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_154_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_154_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_154_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_155_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_155_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_155_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_155_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_155_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_156_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_156_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_156_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_156_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_156_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_157_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_157_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_157_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_157_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_157_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_158_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_158_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_158_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_158_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_158_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_159_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_159_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_159_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_159_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_159_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_160_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_160_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_160_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_160_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_160_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_161_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_161_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_161_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_161_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_161_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_162_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_162_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_162_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_162_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_162_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_163_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_163_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_163_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_163_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_163_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_164_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_164_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_164_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_164_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_164_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_165_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_165_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_165_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_165_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_165_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_166_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_166_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_166_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_166_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_166_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_167_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_167_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_167_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_167_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_167_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_168_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_168_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_168_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_168_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_168_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_169_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_169_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_169_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_169_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_169_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_170_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_170_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_170_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_170_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_170_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_171_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_171_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_171_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_171_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_171_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_172_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_172_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_172_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_172_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_172_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_173_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_173_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_173_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_173_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_173_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_174_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_174_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_174_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_174_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_174_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_175_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_175_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_175_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_175_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_175_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_176_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_176_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_176_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_176_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_176_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_177_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_177_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_177_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_177_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_177_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_178_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_178_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_178_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_178_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_178_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_179_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_179_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_179_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_179_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_179_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_180_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_180_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_180_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_180_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_180_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_181_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_181_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_181_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_181_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_181_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_182_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_182_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_182_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_182_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_182_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_183_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_183_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_183_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_183_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_183_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_184_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_184_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_184_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_184_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_184_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_185_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_185_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_185_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_185_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_185_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_186_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_186_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_186_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_186_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_186_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_187_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_187_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_187_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_187_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_187_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_188_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_188_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_188_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_188_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_188_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_189_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_189_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_189_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_189_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_189_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_190_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_190_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_190_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_190_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_190_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_191_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_191_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_191_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_191_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_191_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_192_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_192_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_192_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_192_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_192_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_193_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_193_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_193_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_193_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_193_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_194_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_194_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_194_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_194_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_194_4.png" + ], + "output": "C" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_195_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_195_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_195_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_195_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_195_4.png" + ], + "output": "D" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_196_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_196_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_196_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_196_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_196_4.png" + ], + "output": "B" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_197_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_197_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_197_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_197_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_197_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_198_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_198_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_198_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_198_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_198_4.png" + ], + "output": "A" + }, + { + "task": "next_img_prediction", + "visual_input_component": "Video image or Natural image", + "source": "MovingMNIST", + "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", + "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_199_0.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_199_1.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_199_2.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_199_3.png", + "../MMIU-Benchmark/next_img_prediction/next_img_prediction_199_4.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Tolerable by humans, intolerable by dogs.\nB: Okay for humans, not okay for dogs.\nC: Acceptable for people, unacceptable for canines.\nD: Endurable by humans, not bearable by dogs.", + "question": "Please generate a description for this meme", + 
"context": "Select from the following choices.\nA: Tolerable by humans, intolerable by dogs.\nB: Okay for humans, not okay for dogs.\nC: Acceptable for people, unacceptable for canines.\nD: Endurable by humans, not bearable by dogs.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_0_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The woman was surprised to find a fish in the bag.\nB: A woman was feeding the fish and accidentally dropped the bag.\nC: The woman found a funny note inside the bag instead of a fish.\nD: A woman was carrying a red plastic bag and had the intention of releasing the fish 
inside. Unfortunately, when she opened the bag, the fish was already dead.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The woman was surprised to find a fish in the bag.\nB: A woman was feeding the fish and accidentally dropped the bag.\nC: The woman found a funny note inside the bag instead of a fish.\nD: A woman was carrying a red plastic bag and had the intention of releasing the fish inside. Unfortunately, when she opened the bag, the fish was already dead.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_1_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + 
"source": "fun_qa", + "options": "A: The button was strategically placed in an inconvenient location, resulting in a humorous sequence where the shoes were utilized to press it.\nB: An unexpected technical glitch caused the button to be out of reach, prompting the comical use of shoes in the video.\nC: The mishap occurred due to the person's clumsiness, leading to a comedic scene involving the use of shoes to activate the button.\nD: The distance was too great, making it impossible to press the button by hand, resulting in a funny situation where the shoes were used to press the button instead.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The button was strategically placed in an inconvenient location, resulting in a humorous sequence where the shoes were utilized to press it.\nB: An unexpected technical glitch caused the button to be out of reach, prompting the comical use of shoes in the video.\nC: The mishap occurred due to the person's clumsiness, leading to a comedic scene involving the use of shoes to activate the button.\nD: The distance was too great, making it impossible to press the button by hand, resulting in a funny situation where the shoes were used to press the button instead.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_7.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_2_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The person's hair turns neon green after the escape.\nB: The person miraculously grows a full head of hair after the escape.\nC: Following a miraculous escape, the individual is left with a completely bald head.\nD: The individual ends up with a stylish new haircut after the escape.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The person's hair turns neon green after the escape.\nB: The person miraculously grows a full head of hair after the escape.\nC: Following a miraculous escape, the individual is left with a completely bald head.\nD: The individual ends up with a stylish new haircut after the escape.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_4.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_3_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video is funny because the fish is actually a toy fish, and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing it.\nB: The video is funny because the fish is actually a fake fish, and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing it.\nC: The video is funny because the fish is actually a dead fish, and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing it.\nD: The video is funny because the fish is actually a live fish, and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing it.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video is funny because the fish is actually a toy fish, and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing it.\nB: The video is funny because the fish is actually a fake fish, 
and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing it.\nC: The video is funny because the fish is actually a dead fish, and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing it.\nD: The video is funny because the fish is actually a live fish, and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing it.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_4_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The shot reveals a room occupied by people dressed uniformly in red apparel. 
A female stands at the rear exit while a young man rises to his feet, placing his right hand on his head. One of the males seated at the front turns his head before twisting it back to its original position. The young man who stood up, pats the male seated in front with his hand.\nB: The scene captures a group of people wearing different colored outfits, engaged in various activities as they move around the room.\nC: In the video, a man and a woman engage in a serious conversation while others around them are busy with their activities.\nD: The video features a group of individuals in casual clothing, standing in a room filled with colorful decorations.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The shot reveals a room occupied by people dressed uniformly in red apparel. A female stands at the rear exit while a young man rises to his feet, placing his right hand on his head. One of the males seated at the front turns his head before twisting it back to its original position. 
The young man who stood up, pats the male seated in front with his hand.\nB: The scene captures a group of people wearing different colored outfits, engaged in various activities as they move around the room.\nC: In the video, a man and a woman engage in a serious conversation while others around them are busy with their activities.\nD: The video features a group of individuals in casual clothing, standing in a room filled with colorful decorations.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_5_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The comedic nature of the video stems from the 
individual's portrayal of a character from a well-known game, \"League of Legends,\" in an economical way.\nB: The video is humorous because the individual is mimicking a character from League of Legends called \"Master Yi\" in a cost-effective manner.\nC: The humor in the video comes from the individual's imitation of a character from a game known as \"Master Yi\" in a cost-efficient manner.\nD: The video is funny because the person is impersonating a character from a popular video game called \"League of Legends\" in a budget-friendly way.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The comedic nature of the video stems from the individual's portrayal of a character from a well-known game, \"League of Legends,\" in an economical way.\nB: The video is humorous because the individual is mimicking a character from League of Legends called \"Master Yi\" in a cost-effective manner.\nC: The humor in the video comes from the individual's imitation of a character from a game known as \"Master Yi\" in a cost-efficient manner.\nD: The video is funny because the person is impersonating a character from a popular video game called \"League of Legends\" in a budget-friendly way.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_7.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_6_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A person in a yellow coat and brown hat stood near a flock of sheep. The person made a sudden gesture, causing the sheep to disperse in haste.\nB: A woman in a blue dress and white hat was surrounded by zebras. She shouted, causing the zebras to scatter in fear.\nC: A man wearing a green shirt and yellow cap approached a group of chickens. The man made a sudden movement, causing the chickens to run away in panic.\nD: A lady donning a red dress and a black hat is standing in front of the camera, accompanied by a few cows. The woman swiftly turns her face and scowls, resulting in the cows next to her hastily fleeing.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A person in a yellow coat and brown hat stood near a flock of sheep. The person made a sudden gesture, causing the sheep to disperse in haste.\nB: A woman in a blue dress and white hat was surrounded by zebras. She shouted, causing the zebras to scatter in fear.\nC: A man wearing a green shirt and yellow cap approached a group of chickens. 
The man made a sudden movement, causing the chickens to run away in panic.\nD: A lady donning a red dress and a black hat is standing in front of the camera, accompanied by a few cows. The woman swiftly turns her face and scowls, resulting in the cows next to her hastily fleeing.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_7_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A person enjoying a leisurely run.\nB: A person running under compulsion.\nC: A person sprinting in a race for exercise.\nD: A person participating in a marathon for fun.", + "question": "Please generate a description for 
this meme", + "context": "Select from the following choices.\nA: A person enjoying a leisurely run.\nB: A person running under compulsion.\nC: A person sprinting in a race for exercise.\nD: A person participating in a marathon for fun.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_8_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Fizzy drink challenge\nB: Comparing home-made vs store-bought cola\nC: Real versus artificial cola\nD: Spot the difference: natural vs synthetic soda", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: 
Fizzy drink challenge\nB: Comparing home-made vs store-bought cola\nC: Real versus artificial cola\nD: Spot the difference: natural vs synthetic soda", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_9_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The hairdresser realizes the hairdryer is not working and tries to fix it, but accidentally sucks the customer's hair into the dryer.\nB: The hairdresser accidentally sets the customer's hair on fire with the hairdryer.\nC: With the help of a hairdryer, the hairdresser blows the customer's hair, causing all of it to be blown off.\nD: The hairdresser 
uses the hairdryer to blow the customer's wig off.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The hairdresser realizes the hairdryer is not working and tries to fix it, but accidentally sucks the customer's hair into the dryer.\nB: The hairdresser accidentally sets the customer's hair on fire with the hairdryer.\nC: With the help of a hairdryer, the hairdresser blows the customer's hair, causing all of it to be blown off.\nD: The hairdresser uses the hairdryer to blow the customer's wig off.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_10_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + 
"visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: What's the big deal with the belt?\nB: Feeling stressed about a belt?\nC: Why are you feeling anxious over picking up a simple belt?\nD: Why all the fuss over a belt?", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: What's the big deal with the belt?\nB: Feeling stressed about a belt?\nC: Why are you feeling anxious over picking up a simple belt?\nD: Why all the fuss over a belt?", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_11_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image 
or Natural image", + "source": "fun_qa", + "options": "A: The upbeat music and flashing lights create an energetic atmosphere, and the child's constant nodding gives the impression of a dance party.\nB: The vibrant music and flickering lights create an atmosphere reminiscent of a jumping rave, and the child's constant nodding gives the impression of an adult enjoying themselves. Therefore, in my opinion, this video is quite captivating.\nC: The slow music and steady lights create a serene atmosphere, and the child's constant nodding gives the impression of boredom.\nD: The dim music and flickering lights create a spooky atmosphere, and the child's constant nodding gives the impression of being hypnotized.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The upbeat music and flashing lights create an energetic atmosphere, and the child's constant nodding gives the impression of a dance party.\nB: The vibrant music and flickering lights create an atmosphere reminiscent of a jumping rave, and the child's constant nodding gives the impression of an adult enjoying themselves. 
Therefore, in my opinion, this video is quite captivating.\nC: The slow music and steady lights create a serene atmosphere, and the child's constant nodding gives the impression of boredom.\nD: The dim music and flickering lights create a spooky atmosphere, and the child's constant nodding gives the impression of being hypnotized.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_12_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The comical struggle of oversized head and chair\nB: The unexpected snare of the oversized head\nC: The chair's revenge on the oversized head\nD: The 
oversized head ensnared by the chair's grip.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The comical struggle of oversized head and chair\nB: The unexpected snare of the oversized head\nC: The chair's revenge on the oversized head\nD: The oversized head ensnared by the chair's grip.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_13_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Ping pong played with imaginary ball and paddle in the air.\nB: Hilarious attempt at playing table tennis with invisible 
opponents.\nC: Playing ping pong in an alternate dimension with invisible equipment.\nD: A funny game of table tennis without the table or the actual ball.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Ping pong played with imaginary ball and paddle in the air.\nB: Hilarious attempt at playing table tennis with invisible opponents.\nC: Playing ping pong in an alternate dimension with invisible equipment.\nD: A funny game of table tennis without the table or the actual ball.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_14_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video 
image or Natural image", + "source": "fun_qa", + "options": "A: As the man stripped, he revealed a funny hat instead of his bald head, leading to laughter from the woman.\nB: The man tripped and fell, causing the woman to burst into laughter and help him up.\nC: While dancing, the man's pants fell down, leading to laughter from the woman who then helped him cover up.\nD: While removing his clothes, the man accidentally lifted his wig, exposing his baldness, which caused the woman to burst into laughter. Without hesitation, she stood up and helped him secure the wig back in place.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: As the man stripped, he revealed a funny hat instead of his bald head, leading to laughter from the woman.\nB: The man tripped and fell, causing the woman to burst into laughter and help him up.\nC: While dancing, the man's pants fell down, leading to laughter from the woman who then helped him cover up.\nD: While removing his clothes, the man accidentally lifted his wig, exposing his baldness, which caused the woman to burst into laughter. 
Without hesitation, she stood up and helped him secure the wig back in place.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_15_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A Golden Retriever is running away from its owner who is trying to give it a bath.\nB: A Golden Retriever is sitting in the bathtub with its owner playing music in the background.\nC: A Golden Retriever is barking at its owner in the bathroom.\nD: A Golden Retriever is lying in front of the toilet bowl, making a sound, and its owner is patting its back from behind.", + "question": "Please generate a 
description for this meme", + "context": "Select from the following choices.\nA: A Golden Retriever is running away from its owner who is trying to give it a bath.\nB: A Golden Retriever is sitting in the bathtub with its owner playing music in the background.\nC: A Golden Retriever is barking at its owner in the bathroom.\nD: A Golden Retriever is lying in front of the toilet bowl, making a sound, and its owner is patting its back from behind.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_16_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A person slipped and slid down the icy 
stairs to the very bottom, and the second person, who observed the first person's fall, also slipped and fell.\nB: The first person managed to avoid slipping and falling down the icy stairs.\nC: The second person smoothly maneuvered around the icy stairs without any accidents.\nD: A person gracefully descended the icy stairs without any mishaps.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A person slipped and slid down the icy stairs to the very bottom, and the second person, who observed the first person's fall, also slipped and fell.\nB: The first person managed to avoid slipping and falling down the icy stairs.\nC: The second person smoothly maneuvered around the icy stairs without any accidents.\nD: A person gracefully descended the icy stairs without any mishaps.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_12.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_17_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The humorous part is that the tap is triggered by a sensor with a small time lag, and turning it on manually is too quick to wash thoroughly. This scenario appears rather amusing.\nB: The humorous aspect is the delay in the water flow caused by the sensor, leading to a comical situation.\nC: The humor comes from the unexpected water flow timing which catches the person off-guard.\nD: The comedic effect is achieved by the water faucet turning on automatically while the person's hand is still nearby.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The humorous part is that the tap is triggered by a sensor with a small time lag, and turning it on manually is too quick to wash thoroughly. 
This scenario appears rather amusing.\nB: The humorous aspect is the delay in the water flow caused by the sensor, leading to a comical situation.\nC: The humor comes from the unexpected water flow timing which catches the person off-guard.\nD: The comedic effect is achieved by the water faucet turning on automatically while the person's hand is still nearby.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_18_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A dog ran across the screen, completely ignoring the boy's laughter.\nB: The little boy started crying instead of laughing, 
which made everyone around him start crying too.\nC: A group of adults were standing in the background, looking bored and uninterested.\nD: A little boy in a suit stood in the crowd, laughing so hard that he fell to the ground and tore his pants.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A dog ran across the screen, completely ignoring the boy's laughter.\nB: The little boy started crying instead of laughing, which made everyone around him start crying too.\nC: A group of adults were standing in the background, looking bored and uninterested.\nD: A little boy in a suit stood in the crowd, laughing so hard that he fell to the ground and tore his pants.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_14.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_19_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The man's attempt to place the final tile was a failure, and he became frustrated as the tiles fell one by one.\nB: The man's attempt to place the final tile was successful, but soon after, the tiles began to fall one by one until all of them had fallen. The man found it tragic and cried.\nC: The man's attempt to place the final tile was successful, but soon after, the tiles began to fall one by one until all of them had fallen. The man found it amusing and laughed.\nD: The man's attempt to place the final tile was successful, but soon after, the tiles began to fall one by one until all of them had fallen. The man found it confusing and became silent.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The man's attempt to place the final tile was a failure, and he became frustrated as the tiles fell one by one.\nB: The man's attempt to place the final tile was successful, but soon after, the tiles began to fall one by one until all of them had fallen. The man found it tragic and cried.\nC: The man's attempt to place the final tile was successful, but soon after, the tiles began to fall one by one until all of them had fallen. The man found it amusing and laughed.\nD: The man's attempt to place the final tile was successful, but soon after, the tiles began to fall one by one until all of them had fallen. 
The man found it confusing and became silent.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_20_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video shows a cat wearing a hat with a funny expression on its face.\nB: In the frame, there are two fingers with a piece of fabric resembling eyes and a mouth, cut into three openings. 
Subsequently, the fabric is placed on the cat's head.\nC: In the video, the cat is shown playing with a toy that resembles a human face.\nD: The cat in the video has been digitally altered to look like it is talking.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video shows a cat wearing a hat with a funny expression on its face.\nB: In the frame, there are two fingers with a piece of fabric resembling eyes and a mouth, cut into three openings. Subsequently, the fabric is placed on the cat's head.\nC: In the video, the cat is shown playing with a toy that resembles a human face.\nD: The cat in the video has been digitally altered to look like it is talking.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_13.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_21_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The humor in the video relied too heavily on offensive jokes.\nB: The video failed to capture the serious nature of the wedding event.\nC: The comical element of the video was overshadowed by awkwardness and discomfort.\nD: The groomsman's inquiry was inappropriate for the occasion of the wedding, but it stirred up thoughts of a comical relationship with a friend, resulting in amusement.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The humor in the video relied too heavily on offensive jokes.\nB: The video failed to capture the serious nature of the wedding event.\nC: The comical element of the video was overshadowed by awkwardness and discomfort.\nD: The groomsman's inquiry was inappropriate for the occasion of the wedding, but it stirred up thoughts of a comical relationship with a friend, resulting in amusement.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_8.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_22_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Is Ultraman looking for a new mode of transportation?\nB: Is Ultraman practicing cow wrangling techniques?\nC: Is Ultraman a professional cow herder?\nD: Does Ultraman also require the services of a cow to be led?", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Is Ultraman looking for a new mode of transportation?\nB: Is Ultraman practicing cow wrangling techniques?\nC: Is Ultraman a professional cow herder?\nD: Does Ultraman also require the services of a cow to be led?", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_7.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_23_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A man closed the door of the bathroom.\nB: A man pushed open the door of the bathroom.\nC: A man opened the door of the bedroom.\nD: A woman opened the door of the bathroom.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A man closed the door of the bathroom.\nB: A man pushed open the door of the bathroom.\nC: A man opened the door of the bedroom.\nD: A woman opened the door of the bathroom.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_7.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_24_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The first person's hilarious fall down the stairs, one step at a time, was topped by the second person who repeated the same and fell down as well.\nB: The unexpected nature of the falls\nC: The synchronized falling of two people\nD: The well-choreographed tumble down the stairs", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The first person's hilarious fall down the stairs, one step at a time, was topped by the second person who repeated the same and fell down as well.\nB: The unexpected nature of the falls\nC: The synchronized falling of two people\nD: The well-choreographed tumble down the stairs", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_4.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_25_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: What's the point of this game?\nB: Can we just stop playing?\nC: Are we done with the game yet?\nD: Why aren't we continuing the game?", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: What's the point of this game?\nB: Can we just stop playing?\nC: Are we done with the game yet?\nD: Why aren't we continuing the game?", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_5.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_26_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Singing dog.\nB: Resting fish.\nC: Flying elephant.\nD: Dancing cat.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Singing dog.\nB: Resting fish.\nC: Flying elephant.\nD: Dancing cat.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_7.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_27_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Footwear, come and lend a hand.\nB: Boots, come and support me.\nC: Shoes, come and help me.\nD: Sneakers, come and assist me.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Footwear, come and lend a hand.\nB: Boots, come and support me.\nC: Shoes, come and help me.\nD: Sneakers, come and assist me.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_8.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_28_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Uncontrollable eruption of chuckles\nB: I had a sudden laugh of release.\nC: Unexpected burst of laughter\nD: Spontaneous outburst of giggles", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Uncontrollable eruption of chuckles\nB: I had a sudden laugh of release.\nC: Unexpected burst of laughter\nD: Spontaneous outburst of giggles", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_9.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_29_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The child's reaction to realizing the cola was different from what he expected, resulting in the humor of the video.\nB: The child mistakenly took a bite of the girl's hand instead of the cola, which caused the humor.\nC: The girl intentionally tricked the child into biting her hand instead of the cola, leading to the comedic effect.\nD: Believing that the cola in the girl's hand was the same as the one she handed him, the child took a bite only to realize it wasn't, and the situation was humorous.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The child's reaction to realizing the cola was different from what he expected, resulting in the humor of the video.\nB: The child mistakenly took a bite of the girl's hand instead of the cola, which caused the humor.\nC: The girl intentionally tricked the child into biting her hand instead of the cola, leading to the comedic effect.\nD: Believing that the cola in the girl's hand was the same as the one she handed him, the child took a bite only to realize it wasn't, and the situation was humorous.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_1.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_30_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Liberation? No, it's murder!\nB: Emancipation? No, it's carnage!\nC: Release? No, it's chaos!\nD: Freedom? No, it's mayhem!", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Liberation? No, it's murder!\nB: Emancipation? No, it's carnage!\nC: Release? No, it's chaos!\nD: Freedom? 
No, it's mayhem!", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_31_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Swift Sword Savior\nB: Rapid Dagger Martyr\nC: Quick Knife Sinner\nD: Fast Blade Saint", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Swift Sword Savior\nB: Rapid Dagger Martyr\nC: Quick Knife Sinner\nD: Fast Blade Saint", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_0.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_32_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The fast movement of the dog.\nB: The unexpected appearance of a panda in the video.\nC: The funny dance moves of the person in the panda costume.\nD: The person at the back dressed in a panda costume cannot manage the dog in front, and hence, has to run alongside the dog.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The fast movement of the dog.\nB: The unexpected appearance of a panda in the video.\nC: The funny dance moves of the person in the panda costume.\nD: The person at the back dressed in a panda costume cannot 
manage the dog in front, and hence, has to run alongside the dog.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_33_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The best man tripped and fell between the groom and the bride while carrying the rings.\nB: The best man was standing next to the groom and the bride and made a serious expression.\nC: The best man stands between the groom and the bride and asks the groom, \"What about me?\"\nD: The best man asked the bride instead of the groom, \"What about me?\"", + "question": "Please generate a description for this meme", + 
"context": "Select from the following choices.\nA: The best man tripped and fell between the groom and the bride while carrying the rings.\nB: The best man was standing next to the groom and the bride and made a serious expression.\nC: The best man stands between the groom and the bride and asks the groom, \"What about me?\"\nD: The best man asked the bride instead of the groom, \"What about me?\"", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_34_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A man trying to dance but failing miserably.\nB: A young kid who is enthusiastically 
bopping to the rhythm of hip-hop music.\nC: A group of elderly people doing a slow dance to classical music.\nD: A cat enthusiastically bopping to the rhythm of heavy metal music.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A man trying to dance but failing miserably.\nB: A young kid who is enthusiastically bopping to the rhythm of hip-hop music.\nC: A group of elderly people doing a slow dance to classical music.\nD: A cat enthusiastically bopping to the rhythm of heavy metal music.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_35_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": 
"Video image or Natural image", + "source": "fun_qa", + "options": "A: When you wash your hands at the sink, if you forget to turn off the tap, the water might keep running, leading to an unexpected wet surprise.\nB: When you wash your hands at the sink, if you select the correct faucet, the water might spill out of the adjacent faucet, causing you to overlook cleaning your hands.\nC: When you wash your hands at the sink, if you select the toilet instead of the sink, the water might spill out, causing a funny mix-up.\nD: When you wash your hands at the sink, if you select the incorrect faucet, the water might spill out of the adjacent faucet, causing you to overlook cleaning your hands.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: When you wash your hands at the sink, if you forget to turn off the tap, the water might keep running, leading to an unexpected wet surprise.\nB: When you wash your hands at the sink, if you select the correct faucet, the water might spill out of the adjacent faucet, causing you to overlook cleaning your hands.\nC: When you wash your hands at the sink, if you select the toilet instead of the sink, the water might spill out, causing a funny mix-up.\nD: When you wash your hands at the sink, if you select the incorrect faucet, the water might spill out of the adjacent faucet, causing you to overlook cleaning your hands.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_5.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_36_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A person gets their head stuck in a chair and struggles to get free\nB: The man tries to perform a stunt but fails miserably\nC: With the intention of squeezing through the gap between the chair's seats, the man finds his head tightly trapped, leading to a series of unsuccessful spins without achieving freedom.\nD: The man accidentally falls off the chair while attempting a funny move", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A person gets their head stuck in a chair and struggles to get free\nB: The man tries to perform a stunt but fails miserably\nC: With the intention of squeezing through the gap between the chair's seats, the man finds his head tightly trapped, leading to a series of unsuccessful spins without achieving freedom.\nD: The man accidentally falls off the chair while attempting a funny move", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_0.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_37_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The funny part is the woman's fear of the cow, which is both surprising and amusing.\nB: The humor in this comes from the cow's reaction, which is unexpected and hilarious.\nC: The humor in this lies in the woman's attempt to scare the cow, which is an absurd and comical act.\nD: The video's humor stems from the cow's confusion, creating a lighthearted and entertaining moment.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The funny part is the woman's fear of the cow, which is both surprising and amusing.\nB: The humor in 
this comes from the cow's reaction, which is unexpected and hilarious.\nC: The humor in this lies in the woman's attempt to scare the cow, which is an absurd and comical act.\nD: The video's humor stems from the cow's confusion, creating a lighthearted and entertaining moment.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_38_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: When your outfit doesn't impress the cat\nB: A cat's opinion on your fashion choices\nC: The cat is not a fan of your mask\nD: Your mask might not be appreciated by cats.", + "question": "Please generate a 
description for this meme", + "context": "Select from the following choices.\nA: When your outfit doesn't impress the cat\nB: A cat's opinion on your fashion choices\nC: The cat is not a fan of your mask\nD: Your mask might not be appreciated by cats.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_39_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Transporting wig parts to their end location.\nB: Shipping wig pieces to their destination.\nC: Sending wig components to their ultimate destination.\nD: Delivering hairpieces to their final stop.", + "question": "Please generate a 
description for this meme", + "context": "Select from the following choices.\nA: Transporting wig parts to their end location.\nB: Shipping wig pieces to their destination.\nC: Sending wig components to their ultimate destination.\nD: Delivering hairpieces to their final stop.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_40_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A child and two dogs are playing with a ball on the beach.\nB: The dogs are barking at the child while playing on the beach.\nC: The child is chasing the dogs on the beach.\nD: A child and two dogs are lying on 
the beach. The child kicks one of the dogs with their foot, and the dog gets up and retaliates by digging sand back at the child.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A child and two dogs are playing with a ball on the beach.\nB: The dogs are barking at the child while playing on the beach.\nC: The child is chasing the dogs on the beach.\nD: A child and two dogs are lying on the beach. The child kicks one of the dogs with their foot, and the dog gets up and retaliates by digging sand back at the child.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_41_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + 
"visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The use of dramatic lighting and sound effects contributes to the humor.\nB: The unexpected appearance of a superhero in a cowboy role creates the humor.\nC: The camera captures a rural setting, where a man is seen dressed up as Ultraman and performing the actions of a cowboy. This contrast in the environment and the character of Ultraman makes the scene quite amusing.\nD: The use of futuristic technology in a traditional rural setting adds to the humor.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The use of dramatic lighting and sound effects contributes to the humor.\nB: The unexpected appearance of a superhero in a cowboy role creates the humor.\nC: The camera captures a rural setting, where a man is seen dressed up as Ultraman and performing the actions of a cowboy. This contrast in the environment and the character of Ultraman makes the scene quite amusing.\nD: The use of futuristic technology in a traditional rural setting adds to the humor.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_9.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_42_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Makeover for a bald head.\nB: Transformation for a bald head.\nC: Haircut for a bald head.\nD: Shaving day for a bald head.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Makeover for a bald head.\nB: Transformation for a bald head.\nC: Haircut for a bald head.\nD: Shaving day for a bald head.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_10.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_43_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Pants: Allow me to begin with a hearty laugh.\nB: Skirt: Let's kick off with a bored yawn.\nC: Shorts: Let's kick off with a hearty cry.\nD: Shirt: Let's start with a sad sigh.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Pants: Allow me to begin with a hearty laugh.\nB: Skirt: Let's kick off with a bored yawn.\nC: Shorts: Let's kick off with a hearty cry.\nD: Shirt: Let's start with a sad sigh.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_10.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_44_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The tense music in the background adds to the suspense and makes the situation funnier.\nB: The first two people thought that the bathroom door needed to be pulled open, and they pulled hard but couldn't open it. Then a man who came later easily pushed the door open and went into the bathroom to wash his hands. This contrast is very funny.\nC: The unexpected twist at the end, where the man effortlessly opens the door, creates the comedic effect.\nD: The exaggerated facial expressions of the characters make the situation comical.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The tense music in the background adds to the suspense and makes the situation funnier.\nB: The first two people thought that the bathroom door needed to be pulled open, and they pulled hard but couldn't open it. Then a man who came later easily pushed the door open and went into the bathroom to wash his hands. 
This contrast is very funny.\nC: The unexpected twist at the end, where the man effortlessly opens the door, creates the comedic effect.\nD: The exaggerated facial expressions of the characters make the situation comical.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_45_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The humor in the video arises from the Golden Retriever's thoughtful decision-making process before consuming the beer.\nB: The humor is derived from the Golden Retriever's graceful and elegant behavior while drinking beer, showcasing his refined taste and 
manners.\nC: The video's humor comes from the Golden Retriever's responsible drinking habits and his ability to handle alcohol well.\nD: The Golden Retriever started by sneakily drinking beer, but his limited capacity for alcohol led to him clutching the toilet bowl and vomiting. It was quite a comical sight to see the Golden Retriever holding onto the toilet bowl while throwing up.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The humor in the video arises from the Golden Retriever's thoughtful decision-making process before consuming the beer.\nB: The humor is derived from the Golden Retriever's graceful and elegant behavior while drinking beer, showcasing his refined taste and manners.\nC: The video's humor comes from the Golden Retriever's responsible drinking habits and his ability to handle alcohol well.\nD: The Golden Retriever started by sneakily drinking beer, but his limited capacity for alcohol led to him clutching the toilet bowl and vomiting. 
It was quite a comical sight to see the Golden Retriever holding onto the toilet bowl while throwing up.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_46_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The person's initial approach seemed like they were catching a dangerous animal, but the final outcome of catching a belt was mundane.\nB: The person's initial careful approach seemed like they were catching a live fish, but the final outcome of catching a belt was amusing.\nC: The person's initial careful approach seemed like they were catching a snake, but the final outcome of 
catching a belt was comical.\nD: The person's initial careful approach seemed like they were catching a valuable item, but the final outcome of catching a belt was disappointing.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The person's initial approach seemed like they were catching a dangerous animal, but the final outcome of catching a belt was mundane.\nB: The person's initial careful approach seemed like they were catching a live fish, but the final outcome of catching a belt was amusing.\nC: The person's initial careful approach seemed like they were catching a snake, but the final outcome of catching a belt was comical.\nD: The person's initial careful approach seemed like they were catching a valuable item, but the final outcome of catching a belt was disappointing.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_12.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_47_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The comedic effect is due to the impressive special effects used to create the illusion of a bouncing ball.\nB: The humor comes from the intense and competitive facial expressions of the people playing table tennis.\nC: The video is not comedic; it is a serious demonstration of table tennis skills.\nD: Two people were pretending to play table tennis, but there was no ball involved. The sound of the ball hitting the paddle was actually the person next to them patting their stomach, which was hilarious.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The comedic effect is due to the impressive special effects used to create the illusion of a bouncing ball.\nB: The humor comes from the intense and competitive facial expressions of the people playing table tennis.\nC: The video is not comedic; it is a serious demonstration of table tennis skills.\nD: Two people were pretending to play table tennis, but there was no ball involved. 
The sound of the ball hitting the paddle was actually the person next to them patting their stomach, which was hilarious.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_48_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A red-haired woman looked in the mirror and combed her long hair elegantly.\nB: A red-haired woman with long hair approached the mirror and was surprised to find a different hairstyle.\nC: A red-haired woman with long hair approached the mirror and lowered her head, only to find a bald patch on top of her head.\nD: A red-haired woman with long hair approached the 
mirror and laughed at her reflection.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A red-haired woman looked in the mirror and combed her long hair elegantly.\nB: A red-haired woman with long hair approached the mirror and was surprised to find a different hairstyle.\nC: A red-haired woman with long hair approached the mirror and lowered her head, only to find a bald patch on top of her head.\nD: A red-haired woman with long hair approached the mirror and laughed at her reflection.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_49_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": 
"Video image or Natural image", + "source": "fun_qa", + "options": "A: As the lid of the wooden board is lifted, a cat suddenly pops out and starts meowing loudly before darting away.\nB: After lifting the lid of the wooden board, a loud horn noise is heard, and a clown pops out with confetti before disappearing.\nC: When one person lifts the lid of a wooden board, the other person immediately appears, sticking their head out. The second person shakes slightly, and then turns around, leaving the scene.\nD: One person opens the lid of a plastic container, and the other person jumps out with a scary expression before quickly running off.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: As the lid of the wooden board is lifted, a cat suddenly pops out and starts meowing loudly before darting away.\nB: After lifting the lid of the wooden board, a loud horn noise is heard, and a clown pops out with confetti before disappearing.\nC: When one person lifts the lid of a wooden board, the other person immediately appears, sticking their head out. 
The second person shakes slightly, and then turns around, leaving the scene.\nD: One person opens the lid of a plastic container, and the other person jumps out with a scary expression before quickly running off.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_50_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A man is seated on a ledge near the water, dressed in only a single pair of pants. He extends his arms, and a small dog leaps and unintentionally crashes into the man's groin area. 
Subsequently, the man doubles over, grimacing in pain as he holds his groin.\nB: The man is practicing yoga on the edge of the water and suddenly loses balance.\nC: A man is sitting calmly by the water, enjoying the peaceful scenery.\nD: The man stands up and starts dancing to the music playing in the background.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A man is seated on a ledge near the water, dressed in only a single pair of pants. He extends his arms, and a small dog leaps and unintentionally crashes into the man's groin area. Subsequently, the man doubles over, grimacing in pain as he holds his groin.\nB: The man is practicing yoga on the edge of the water and suddenly loses balance.\nC: A man is sitting calmly by the water, enjoying the peaceful scenery.\nD: The man stands up and starts dancing to the music playing in the background.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_11.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_51_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video's humor arises from the slow-motion replay of the husky's fall, exaggerating its clumsiness.\nB: The video's humor is derived from the realistic CGI effects used to make the husky's fall look convincing.\nC: The video's amusement factor stems from the husky's carefree gait causing it to tumble off the bridge, which is undoubtedly humorous.\nD: The video's humor comes from the serious music in the background creating a contrasting effect with the husky's actions.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video's humor arises from the slow-motion replay of the husky's fall, exaggerating its clumsiness.\nB: The video's humor is derived from the realistic CGI effects used to make the husky's fall look convincing.\nC: The video's amusement factor stems from the husky's carefree gait causing it to tumble off the bridge, which is undoubtedly humorous.\nD: The video's humor comes from the serious music in the background creating a contrasting effect with the husky's actions.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_3.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_52_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video lacks any humor or comedic elements, and is intended to be informative.\nB: The video is serious and educational, aimed at teaching important life lessons.\nC: The video is a heartwarming and emotional portrayal of the student-coach relationship, without any comedic elements.\nD: The coach's perplexing behavior in order to teach the student was amusing.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video lacks any humor or comedic elements, and is intended to be informative.\nB: The video is serious and educational, aimed at teaching important life lessons.\nC: The video is a heartwarming and emotional portrayal of the student-coach relationship, without any comedic elements.\nD: The coach's perplexing behavior in order to teach the student was 
amusing.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_53_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A dog is carrying a red balloon on its back while running happily.\nB: A plastic bottle emitting blue smoke is being carried on the back of a dog while it walks ahead.\nC: A cat is pulling a cart with a red bottle on it while walking leisurely.\nD: A squirrel is carrying a yellow bag on its back while scurrying around.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A dog is carrying a red balloon on its back 
while running happily.\nB: A plastic bottle emitting blue smoke is being carried on the back of a dog while it walks ahead.\nC: A cat is pulling a cart with a red bottle on it while walking leisurely.\nD: A squirrel is carrying a yellow bag on its back while scurrying around.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_54_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Hardcore Sit-ups to Heavy Metal\nB: Rock and Roll Yoga Session\nC: Dance Party Workout\nD: A sit-up exercise routine inspired by rock and roll movements.", + "question": "Please generate a description for this 
meme", + "context": "Select from the following choices.\nA: Hardcore Sit-ups to Heavy Metal\nB: Rock and Roll Yoga Session\nC: Dance Party Workout\nD: A sit-up exercise routine inspired by rock and roll movements.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_55_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The man's frustration with the hand sanitizer dispenser is understandable and not intended to be humorous.\nB: The man's reaction to the malfunctioning hand sanitizer dispenser is distressing and not funny at all.\nC: The man finds the malfunctioning hand sanitizer dispenser 
annoying but not in a humorous way.\nD: The man becomes infuriated as the hand sanitizer dispenser keeps dispensing the liquid after he's done cleaning his hands, causing him to toss the cloth he was using, which is quite comical.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The man's frustration with the hand sanitizer dispenser is understandable and not intended to be humorous.\nB: The man's reaction to the malfunctioning hand sanitizer dispenser is distressing and not funny at all.\nC: The man finds the malfunctioning hand sanitizer dispenser annoying but not in a humorous way.\nD: The man becomes infuriated as the hand sanitizer dispenser keeps dispensing the liquid after he's done cleaning his hands, causing him to toss the cloth he was using, which is quite comical.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_12.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_56_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The perfectly executed somersault by the child\nB: The child's impeccable balance and gracefulness\nC: The child fell down because his left hand did not support him, which he did not anticipate. This accident seemed comical to the onlookers.\nD: The child's careful and calculated movement", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The perfectly executed somersault by the child\nB: The child's impeccable balance and gracefulness\nC: The child fell down because his left hand did not support him, which he did not anticipate. 
This accident seemed comical to the onlookers.\nD: The child's careful and calculated movement", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_57_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A man in a chicken costume playing the guitar in front of a crowd.\nB: A black and white cow-patterned feline is crammed inside a water glass and wildly shaking its head.\nC: A small brown dog wearing a hat and sunglasses is riding a skateboard in the park.\nD: A group of colorful parrots singing and dancing on a tree branch.", + "question": "Please generate a description for this meme", + 
"context": "Select from the following choices.\nA: A man in a chicken costume playing the guitar in front of a crowd.\nB: A black and white cow-patterned feline is crammed inside a water glass and wildly shaking its head.\nC: A small brown dog wearing a hat and sunglasses is riding a skateboard in the park.\nD: A group of colorful parrots singing and dancing on a tree branch.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_58_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The man's never-ending fall is a source of laughter.\nB: The background music adds to the comedic effect.\nC: 
The sudden change in lighting creates a hilarious atmosphere.\nD: The unexpected appearance of a dancing dog steals the show.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The man's never-ending fall is a source of laughter.\nB: The background music adds to the comedic effect.\nC: The sudden change in lighting creates a hilarious atmosphere.\nD: The unexpected appearance of a dancing dog steals the show.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_59_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Pretending 
to recognize and be familiar with strangers.\nB: Pretending to know the lyrics to a song at a karaoke night.\nC: Acting like a professional chef in a cooking show.\nD: Attempting to blend in with a group of tourists.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Pretending to recognize and be familiar with strangers.\nB: Pretending to know the lyrics to a song at a karaoke night.\nC: Acting like a professional chef in a cooking show.\nD: Attempting to blend in with a group of tourists.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_60_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": 
"Video image or Natural image", + "source": "fun_qa", + "options": "A: The secret to clean hands revealed.\nB: The problem of washing hands.\nC: The art of washing hands.\nD: A guide to perfect hand hygiene.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The secret to clean hands revealed.\nB: The problem of washing hands.\nC: The art of washing hands.\nD: A guide to perfect hand hygiene.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_61_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The chair is about to 
collapse due to someone sitting on it. A special effect is used to show a person struggling to hold the legs of two chairs with a rope, which looks a bit funny.\nB: The video is comedic because the chairs are perfectly stable and nothing interesting happens.\nC: The video is comedic because the person sitting on the chair is not struggling at all and everything seems normal.\nD: The video is comedic because the special effect makes the chair collapse in a serious and dangerous way.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The chair is about to collapse due to someone sitting on it. A special effect is used to show a person struggling to hold the legs of two chairs with a rope, which looks a bit funny.\nB: The video is comedic because the chairs are perfectly stable and nothing interesting happens.\nC: The video is comedic because the person sitting on the chair is not struggling at all and everything seems normal.\nD: The video is comedic because the special effect makes the chair collapse in a serious and dangerous way.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_9.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_62_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A man is standing on a pedestal with one foot on a brick, while another man attempts to break the brick with his hand. The brick unexpectedly shifts, causing the man on the pedestal to lose his balance and fall.\nB: A man is attempting a dangerous stunt on a tall pedestal\nC: Two men are engaging in a physical fight on a platform\nD: A man is performing a risky acrobatic maneuver with another person", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A man is standing on a pedestal with one foot on a brick, while another man attempts to break the brick with his hand. 
The brick unexpectedly shifts, causing the man on the pedestal to lose his balance and fall.\nB: A man is attempting a dangerous stunt on a tall pedestal\nC: Two men are engaging in a physical fight on a platform\nD: A man is performing a risky acrobatic maneuver with another person", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_63_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The umbrella was carried away gently by the wind, without causing any trouble.\nB: Both individuals were laughing and having a great time even after the umbrella was taken away.\nC: The two individuals 
were enjoying a pleasant walk under the umbrella.\nD: Two individuals were finding it difficult to walk together under one umbrella, but suddenly a strong wind carried the umbrella away. One of them gazed sadly at the disappearing umbrella.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The umbrella was carried away gently by the wind, without causing any trouble.\nB: Both individuals were laughing and having a great time even after the umbrella was taken away.\nC: The two individuals were enjoying a pleasant walk under the umbrella.\nD: Two individuals were finding it difficult to walk together under one umbrella, but suddenly a strong wind carried the umbrella away. One of them gazed sadly at the disappearing umbrella.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_13.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_64_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Old high school buddies meeting again after a lifetime.\nB: Two best friends caught in a hilarious mix-up.\nC: Long-lost cousins reunited after a decade apart.\nD: Brothers who had lost each other's whereabouts for many years.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Old high school buddies meeting again after a lifetime.\nB: Two best friends caught in a hilarious mix-up.\nC: Long-lost cousins reunited after a decade apart.\nD: Brothers who had lost each other's whereabouts for many years.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_12.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_65_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video had no humor, it was just a serious demonstration of drink balancing\nB: Every drink was oddly askew, but the cups managed to catch not even a single spill.\nC: The way the drinks were perfectly aligned and the cups caught every spill\nD: The drinks were spilled all over the place and the cups were knocked over", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video had no humor, it was just a serious demonstration of drink balancing\nB: Every drink was oddly askew, but the cups managed to catch not even a single spill.\nC: The way the drinks were perfectly aligned and the cups caught every spill\nD: The drinks were spilled all over the place and the cups were knocked over", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_8.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_66_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video's humor comes from Miller's impressive magic skills displayed in the video.\nB: The video humorously depicts Miller successfully pulling off the magic trick.\nC: The video's comical element stems from its revelation of Miller's magic trick from a different perspective, making the entire process appear foolish and amusing.\nD: The video is funny because of Miller's serious demeanor during the magic trick.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video's humor comes from Miller's impressive magic skills displayed in the video.\nB: The video humorously depicts Miller successfully pulling off the magic trick.\nC: The video's comical element stems from its revelation of Miller's magic trick from a different perspective, making the entire process appear foolish and amusing.\nD: The video is funny because of Miller's serious demeanor during the magic trick.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_2.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_67_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The male protagonist inserted the key of the tractor into the mainframe and turned it hard. The sound of the engine igniting was accompanied by the background, and then the computer was turned on.\nB: The male protagonist accidentally started the tractor while trying to turn on the computer.\nC: The male protagonist tried to start the tractor using his phone, but it didn't work.\nD: The female protagonist struggled to start the tractor, but eventually managed to get it running.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The male protagonist inserted the key of the tractor into the mainframe and turned it hard. 
The sound of the engine igniting was accompanied by the background, and then the computer was turned on.\nB: The male protagonist accidentally started the tractor while trying to turn on the computer.\nC: The male protagonist tried to start the tractor using his phone, but it didn't work.\nD: The female protagonist struggled to start the tractor, but eventually managed to get it running.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_68_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The best way to cure baldness is using the right shampoo.\nB: The usage of shampoo is not 
necessary for individuals who have a bald head.\nC: Using shampoo for bald people helps in growing hair back.\nD: Shampoo can make bald people look even more bald.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The best way to cure baldness is using the right shampoo.\nB: The usage of shampoo is not necessary for individuals who have a bald head.\nC: Using shampoo for bald people helps in growing hair back.\nD: Shampoo can make bald people look even more bald.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_69_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural 
image", + "source": "fun_qa", + "options": "A: We are distinct.\nB: We are indistinct.\nC: They are identical.\nD: Their differences are unclear.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: We are distinct.\nB: We are indistinct.\nC: They are identical.\nD: Their differences are unclear.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_70_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The man was standing still when the tomato sauce explosion happened, and he did not react at all.\nB: The man was anxiously 
eating his burger and then he spilled the ketchup.\nC: The man was peacefully eating his burger and nothing interesting happened.\nD: The man was comfortably munching on his burger when he was suddenly hit with a tomato sauce explosion, and his expression of disbelief was absolutely hysterical.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The man was standing still when the tomato sauce explosion happened, and he did not react at all.\nB: The man was anxiously eating his burger and then he spilled the ketchup.\nC: The man was peacefully eating his burger and nothing interesting happened.\nD: The man was comfortably munching on his burger when he was suddenly hit with a tomato sauce explosion, and his expression of disbelief was absolutely hysterical.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_12.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_71_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: When the video started, only the legs were visible, but as the camera turned, a man was seen engrossed in playing games on his mobile phone.\nB: Initially, it looked like a serious interview, but then the interviewee started singing a popular song loudly.\nC: At the beginning of the video, it seemed like a cooking show, but suddenly, a cat appeared and knocked down all the ingredients.\nD: In the opening scene, it appeared to be a wildlife documentary, but then the camera zoomed out to reveal it was just a pet cat acting as if it was in the wild.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: When the video started, only the legs were visible, but as the camera turned, a man was seen engrossed in playing games on his mobile phone.\nB: Initially, it looked like a serious interview, but then the interviewee started singing a popular song loudly.\nC: At the beginning of the video, it seemed like a cooking show, but suddenly, a cat appeared and knocked down all the ingredients.\nD: In the opening scene, it appeared to be a wildlife documentary, but then the camera zoomed out to reveal it was just a pet cat acting as if it was in the wild.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_3.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_72_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A canine is present on the couch while a man carrying a backpack and a sound system tied to his hand is hopping and jumping from left to right, mimicking a bird's movements, and the dog's head is tracking his actions.\nB: The man and the dog are both sleeping on the couch, and the sound system is playing music in the background.\nC: A man is sitting on the couch with the dog, while a bird is hopping and jumping from left to right and the man's head is tracking its movements.\nD: The man is standing still, while the dog is barking and running around the room, trying to catch the bird.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A canine is present on the couch while a man carrying a backpack and a sound system tied to his hand is hopping and jumping from left to 
right, mimicking a bird's movements, and the dog's head is tracking his actions.\nB: The man and the dog are both sleeping on the couch, and the sound system is playing music in the background.\nC: A man is sitting on the couch with the dog, while a bird is hopping and jumping from left to right and the man's head is tracking its movements.\nD: The man is standing still, while the dog is barking and running around the room, trying to catch the bird.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_73_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The unexpected appearance of a 
young woman\nB: The use of fast-paced music in the background\nC: The presence of a colorful, tropical backdrop\nD: Initially, it may be assumed that these legs are of a female, but surprisingly they belong to a bald-headed old man, which creates a comical contrast.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The unexpected appearance of a young woman\nB: The use of fast-paced music in the background\nC: The presence of a colorful, tropical backdrop\nD: Initially, it may be assumed that these legs are of a female, but surprisingly they belong to a bald-headed old man, which creates a comical contrast.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_14.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_74_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Life as a coach is a breeze!\nB: Being a coach is a challenging task.\nC: Being a coach is a walk in the park.\nD: Coaching is an easy job.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Life as a coach is a breeze!\nB: Being a coach is a challenging task.\nC: Being a coach is a walk in the park.\nD: Coaching is an easy job.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_75_15.png" + ], + 
"output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A person is holding a rope with both hands, each hand grabbing one end of the rope.\nB: A person is using a rope to tie a knot in the video.\nC: A person is holding a snake with both hands in the video.\nD: A person is holding a hose with both hands, spraying water in the video.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A person is holding a rope with both hands, each hand grabbing one end of the rope.\nB: A person is using a rope to tie a knot in the video.\nC: A person is holding a snake with both hands in the video.\nD: A person is holding a hose with both hands, spraying water in the video.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_12.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_76_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video is comedic because of the serious soundtrack playing in the background, creating a contrast with the funny visuals.\nB: The sight of dogs pushing forward as if they are wearing a jetpack on their back is amusing.\nC: The comedic effect of the video is due to the slow-motion footage of the dogs, making their movements comically exaggerated.\nD: The video is comedic because of the unexpected plot twist at the end, catching the viewers by surprise.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video is comedic because of the serious soundtrack playing in the background, creating a contrast with the funny visuals.\nB: The sight of dogs pushing forward as if they are wearing a jetpack on their back is amusing.\nC: The comedic effect of the video is due to the slow-motion footage of the dogs, making their movements comically exaggerated.\nD: The video is comedic because of the unexpected plot twist at the end, catching the viewers by surprise.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_5.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_77_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: In Front of the Trickery\nB: Beneath the Enchantment\nC: Underneath the Illusion\nD: Behind the Magic", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: In Front of the Trickery\nB: Beneath the Enchantment\nC: Underneath the Illusion\nD: Behind the Magic", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_7.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_78_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: When washing a bald head, the shampoo applied slips down easily.\nB: The video features a person using a soap dispenser only to have the soap squirt directly onto their face.\nC: The video depicts a person attempting to sit on a wet chair and sliding off.\nD: The video shows a person slipping on a banana peel.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: When washing a bald head, the shampoo applied slips down easily.\nB: The video features a person using a soap dispenser only to have the soap squirt directly onto their face.\nC: The video depicts a person attempting to sit on a wet chair and sliding off.\nD: The video shows a person slipping on a banana peel.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_3.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_79_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A man in a white shirt is standing on a skateboard and doing stunts.\nB: A child in a green shirt is playing with a ball and jumping on a trampoline.\nC: A person wearing a blue jacket is dancing in the rain with an umbrella.\nD: A child wearing an orange shirt is holding a stick in their right hand, extending their left hand, and then falling to the left on the ground.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A man in a white shirt is standing on a skateboard and doing stunts.\nB: A child in a green shirt is playing with a ball and jumping on a trampoline.\nC: A person wearing a blue jacket is dancing in the rain with an umbrella.\nD: A child wearing an orange shirt is holding a stick in their right hand, extending their left hand, and then falling to the 
left on the ground.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_80_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A blue car carrying a large bird with a striped fur pattern, sleeping on the car roof.\nB: A white car carrying a big cat with a rainbow fur pattern, hiding inside the car.\nC: A red car carrying a medium-sized rabbit with a polka dot fur pattern, playing with the car window.\nD: A black car carries a small dog with a white and black fur pattern, peering out from the window. 
The scene then shifts to a person standing on a nearby street with a white and black pattern painted on their face.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A blue car carrying a large bird with a striped fur pattern, sleeping on the car roof.\nB: A white car carrying a big cat with a rainbow fur pattern, hiding inside the car.\nC: A red car carrying a medium-sized rabbit with a polka dot fur pattern, playing with the car window.\nD: A black car carries a small dog with a white and black fur pattern, peering out from the window. The scene then shifts to a person standing on a nearby street with a white and black pattern painted on their face.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_14.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_81_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The sound of a female exercising can be heard, her hair falling over her face as she does sit-ups.\nB: A male voice is heard, while a person sits on the couch watching TV.\nC: There is loud music playing in the background as a person struggles to open a jar.\nD: The noise of a dog barking can be heard as a person takes a nap.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The sound of a female exercising can be heard, her hair falling over her face as she does sit-ups.\nB: A male voice is heard, while a person sits on the couch watching TV.\nC: There is loud music playing in the background as a person struggles to open a jar.\nD: The noise of a dog barking can be heard as a person takes a nap.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_10.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_82_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Beverage: Roaming through the cup's entire world.\nB: Liquid travel: Exploring the cup's universe.\nC: Drink: A journey across the cup's universe.\nD: Beverage: Passing through the whole world of the cup.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Beverage: Roaming through the cup's entire world.\nB: Liquid travel: Exploring the cup's universe.\nC: Drink: A journey across the cup's universe.\nD: Beverage: Passing through the whole world of the cup.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_9.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_83_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The dizzying camera work creates a sense of confusion that leads to laughter\nB: It is as if humans are incapable of washing themselves entirely clean, as foam keeps reappearing in different places. The juxtaposition of the before and after states is rather amusing.\nC: The use of unrealistic special effects adds to the comedic effect\nD: The exaggerated facial expressions of the characters evoke laughter", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The dizzying camera work creates a sense of confusion that leads to laughter\nB: It is as if humans are incapable of washing themselves entirely clean, as foam keeps reappearing in different places. 
The juxtaposition of the before and after states is rather amusing.\nC: The use of unrealistic special effects adds to the comedic effect\nD: The exaggerated facial expressions of the characters evoke laughter", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_84_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Does it look like I am angry?\nB: Is it evident that I am happy?\nC: Do you think I am sad?\nD: Am I clearly upset?", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Does it look like I am angry?\nB: Is it evident 
that I am happy?\nC: Do you think I am sad?\nD: Am I clearly upset?", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_85_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The dog's action was intentional and not a mistake, making it funny.\nB: The man extended his arms, hoping to catch the small dog, but the dog misinterpreted his gesture and directly hit his groin area. 
It seems that the dog didn't grasp the man's intention, creating this hilarious scene.\nC: The man's painful expression added to the comedic effect.\nD: The scene was staged, and the man and dog were actually friends, which made it less funny.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The dog's action was intentional and not a mistake, making it funny.\nB: The man extended his arms, hoping to catch the small dog, but the dog misinterpreted his gesture and directly hit his groin area. It seems that the dog didn't grasp the man's intention, creating this hilarious scene.\nC: The man's painful expression added to the comedic effect.\nD: The scene was staged, and the man and dog were actually friends, which made it less funny.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_13.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_86_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The hidden person was actually a friend playing a prank, so it was not unexpected.\nB: The person was not startled, it was all part of a planned act.\nC: The person was startled by the person hidden under the cover, which was a bit unexpected.\nD: The person was not surprised by the person hidden under the cover, it was all staged.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The hidden person was actually a friend playing a prank, so it was not unexpected.\nB: The person was not startled, it was all part of a planned act.\nC: The person was startled by the person hidden under the cover, which was a bit unexpected.\nD: The person was not surprised by the person hidden under the cover, it was all staged.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_9.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_87_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A road-ignorant Husky.\nB: A confused snow dog.\nC: A lost Husky on the highway.\nD: A bewildered canine traveler.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A road-ignorant Husky.\nB: A confused snow dog.\nC: A lost Husky on the highway.\nD: A bewildered canine traveler.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_10.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_88_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Instead of splitting the brick, breakdance on it.\nB: If the brick cannot be split, try juggling with it.\nC: If the brick cannot be split, do a backflip instead.\nD: When the brick is unbreakable, start a magic show with it.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Instead of splitting the brick, breakdance on it.\nB: If the brick cannot be split, try juggling with it.\nC: If the brick cannot be split, do a backflip instead.\nD: When the brick is unbreakable, start a magic show with it.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_9.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_89_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The \"Chicken\" is starting up its primary engine.\nB: The cow is starting up its primary engine.\nC: The cat is starting up its primary engine.\nD: The dog is starting up its primary engine.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The \"Chicken\" is starting up its primary engine.\nB: The cow is starting up its primary engine.\nC: The cat is starting up its primary engine.\nD: The dog is starting up its primary engine.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_8.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_90_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The cat is trying to steal the human's cup.\nB: The cat is not interested in the water inside a human's cup.\nC: The cat prefers tea over water from a human's cup.\nD: According to the cat, the water inside a human's cup is the most preferable to quench its thirst.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The cat is trying to steal the human's cup.\nB: The cat is not interested in the water inside a human's cup.\nC: The cat prefers tea over water from a human's cup.\nD: According to the cat, the water inside a human's cup is the most preferable to quench its thirst.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_6.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_91_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Infinite staircase with no end in sight\nB: Staircase that goes on endlessly without any conclusion.\nC: Endless staircase with no resolution in sight\nD: Never-ending stairs leading to nowhere", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Infinite staircase with no end in sight\nB: Staircase that goes on endlessly without any conclusion.\nC: Endless staircase with no resolution in sight\nD: Never-ending stairs leading to nowhere", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_5.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_92_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A man said hi to a group of people who welcomed him warmly, and then they all hugged each other.\nB: A man walked past a group of people who were waving at him, and then he stopped to take a selfie with them.\nC: A man said hi to a group of people who welcomed him warmly, but he decided to bypass them and greeted the people after them.\nD: A man greeted a group of people and received a lot of high-fives from them before dancing with them.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A man said hi to a group of people who welcomed him warmly, and then they all hugged each other.\nB: A man walked past a group of people who were waving at him, and then he stopped to take a selfie with them.\nC: A man said hi to a group of people who welcomed him warmly, but he decided to bypass them and greeted the people after them.\nD: A man greeted a group of people and received a lot of high-fives from them before dancing with them.", + 
"input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_93_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The ashamed master and the collected pup.\nB: The confident master and the disobedient pup.\nC: The disappointed master and the scattered pup.\nD: The proud master and the chaotic pup.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The ashamed master and the collected pup.\nB: The confident master and the disobedient pup.\nC: The disappointed master and the scattered pup.\nD: The proud master and the chaotic pup.", + 
"input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_94_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A person is trying to catch something, but it slips and hits them on the face.\nB: Someone leans their face towards an object, shakes it, and the object tumbles down, hitting them on the face.\nC: Someone tries to balance an object on their head, but it falls and hits them on the face.\nD: An individual is attempting to juggle objects, but they accidentally drop one, and it hits them on the face.", + "question": "Please generate a description for this meme", + "context": "Select from the 
following choices.\nA: A person is trying to catch something, but it slips and hits them on the face.\nB: Someone leans their face towards an object, shakes it, and the object tumbles down, hitting them on the face.\nC: Someone tries to balance an object on their head, but it falls and hits them on the face.\nD: An individual is attempting to juggle objects, but they accidentally drop one, and it hits them on the face.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_95_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A strong gust of wind tore the umbrella's fabric from the holders' 
grip, leading to a hilarious situation.\nB: The fabric of the umbrella got caught in the wind, causing a comical struggle for the individuals trying to hold onto it.\nC: The umbrella fabric got tangled in the wind and caused chaos among the people holding it.\nD: The wind disrupted the umbrella fabric, which was held by two people, and one of them watched the departing fabric.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A strong gust of wind tore the umbrella's fabric from the holders' grip, leading to a hilarious situation.\nB: The fabric of the umbrella got caught in the wind, causing a comical struggle for the individuals trying to hold onto it.\nC: The umbrella fabric got tangled in the wind and caused chaos among the people holding it.\nD: The wind disrupted the umbrella fabric, which was held by two people, and one of them watched the departing fabric.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_11.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_96_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The person in question mistakes the soap dispenser for a hand sanitizer and tries to use it.\nB: The person in question accidentally sprays themselves with the cleaning liquid.\nC: The person in question hurls the cleaning cloth they are holding towards the soap dispenser.\nD: The person in question slips on the wet floor and falls down.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The person in question mistakes the soap dispenser for a hand sanitizer and tries to use it.\nB: The person in question accidentally sprays themselves with the cleaning liquid.\nC: The person in question hurls the cleaning cloth they are holding towards the soap dispenser.\nD: The person in question slips on the wet floor and falls down.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_7.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_97_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video is comedic because it shows a serious battle between characters with no humor\nB: The video is comedic due to the intense and dramatic music in the background\nC: Ultraman is significantly less tall than the monster and appears a bit overweight while riding a dinosaur, which adds to the comical and amusing sight of him chasing the monster from behind.\nD: The video is comedic because it depicts a realistic and serious situation", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video is comedic because it shows a serious battle between characters with no humor\nB: The video is comedic due to the intense and dramatic music in the background\nC: Ultraman is significantly less tall than the monster and appears a bit overweight while riding a dinosaur, which adds to the comical and amusing sight of him chasing the monster from behind.\nD: The video is comedic because it depicts a realistic and serious situation", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_0.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_98_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The humor in the video comes from the customer accidentally hitting the owner instead of the targets, which leads to a series of comical mishaps.\nB: The humor in the video stems from the customer's exceptional marksmanship, hitting every target with precision. 
While this may be a tragedy for the owner, it is a source of amusement for the viewers.\nC: The video's humor is derived from the customers' lack of aim, resulting in a failed and uneventful experience for the owner.\nD: The humor arises from the customer's inability to hit any target, causing chaos and frustration for the owner.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The humor in the video comes from the customer accidentally hitting the owner instead of the targets, which leads to a series of comical mishaps.\nB: The humor in the video stems from the customer's exceptional marksmanship, hitting every target with precision. While this may be a tragedy for the owner, it is a source of amusement for the viewers.\nC: The video's humor is derived from the customers' lack of aim, resulting in a failed and uneventful experience for the owner.\nD: The humor arises from the customer's inability to hit any target, causing chaos and frustration for the owner.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_10.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_99_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Creative approach to staying dry in the rain.\nB: Innovative rain protection technique.\nC: Unexpected rain shelter solution.\nD: New way of sheltering from the rain.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Creative approach to staying dry in the rain.\nB: Innovative rain protection technique.\nC: Unexpected rain shelter solution.\nD: New way of sheltering from the rain.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_10.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_100_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The beginning occurrence was completely unplanned.\nB: Provide proof that the first instance happened coincidentally.\nC: A hilarious twist of fate led to the initial occurrence.\nD: Proof that the initial incident was a random stroke of luck.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The beginning occurrence was completely unplanned.\nB: Provide proof that the first instance happened coincidentally.\nC: A hilarious twist of fate led to the initial occurrence.\nD: Proof that the initial incident was a random stroke of luck.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_8.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_101_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A bird flying through a window.\nB: A cat being thrown by a slingshot.\nC: The dog launched into the air through a catapult mechanism.\nD: A dog jumping over a fence.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A bird flying through a window.\nB: A cat being thrown by a slingshot.\nC: The dog launched into the air through a catapult mechanism.\nD: A dog jumping over a fence.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_8.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_102_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The electric bike hit the car, causing the car to shake and lose control.\nB: A car on the road hit an electric bike, and the nearby electric bike shook and overturned.\nC: A police car arrived and stopped the car from hitting the electric bike.\nD: The electric bike swerved to avoid the car, causing it to crash into a nearby wall.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The electric bike hit the car, causing the car to shake and lose control.\nB: A car on the road hit an electric bike, and the nearby electric bike shook and overturned.\nC: A police car arrived and stopped the car from hitting the electric bike.\nD: The electric bike swerved to avoid the car, causing it to crash into a nearby wall.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_4.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_103_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The use of vibrant colors adds humor to the video\nB: The unexpected use of sound effects makes it funny\nC: The comedic element is present in two unforeseen events. Firstly, the adult finishes the strawberry, and instead of being upset, the child fakes a smile. Secondly, the child swiftly goes from laughing to crying, providing another twist in the story.\nD: The slow-motion effect on the adult's reaction creates the humor", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The use of vibrant colors adds humor to the video\nB: The unexpected use of sound effects makes it funny\nC: The comedic element is present in two unforeseen events. Firstly, the adult finishes the strawberry, and instead of being upset, the child fakes a smile. 
Secondly, the child swiftly goes from laughing to crying, providing another twist in the story.\nD: The slow-motion effect on the adult's reaction creates the humor", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_104_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: High jumping by humans.\nB: Alien invasion rehearsal\nC: A new world record in jumping\nD: Extreme sports competition", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: High jumping by humans.\nB: Alien invasion rehearsal\nC: A new world 
record in jumping\nD: Extreme sports competition", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_105_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The man positioned on the left kept on hitting, whereas the man on the right showed no reaction. 
After the man on the right gave a blow with his stick, the man on the left stumbled.\nB: The man on the left was actually the aggressor, but the man on the right surprised him with a swift blow, causing the man on the left to lose his balance.\nC: The man on the left was the one who didn't show any reaction while the man on the right continued to hit. Later, the man on the left unexpectedly retaliated and the man on the right stumbled.\nD: Initially, both men were engaged in hitting each other, but the man on the right suddenly stopped, causing the man on the left to lose balance and stumble.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The man positioned on the left kept on hitting, whereas the man on the right showed no reaction. After the man on the right gave a blow with his stick, the man on the left stumbled.\nB: The man on the left was actually the aggressor, but the man on the right surprised him with a swift blow, causing the man on the left to lose his balance.\nC: The man on the left was the one who didn't show any reaction while the man on the right continued to hit. 
Later, the man on the left unexpectedly retaliated and the man on the right stumbled.\nD: Initially, both men were engaged in hitting each other, but the man on the right suddenly stopped, causing the man on the left to lose balance and stumble.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_106_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video is funny because of the smooth and effortless running of the student without any mishaps.\nB: The video is funny because it shows a student responsibly avoiding the utility pole and reaching the destination 
safely.\nC: The video is hilarious as a student running didn't pay attention to the front and comically crashed into a roadside utility pole, and most importantly, even knocked down the pole.\nD: The video is funny because of the serious and dangerous accident involving a student and a utility pole.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video is funny because of the smooth and effortless running of the student without any mishaps.\nB: The video is funny because it shows a student responsibly avoiding the utility pole and reaching the destination safely.\nC: The video is hilarious as a student running didn't pay attention to the front and comically crashed into a roadside utility pole, and most importantly, even knocked down the pole.\nD: The video is funny because of the serious and dangerous accident involving a student and a utility pole.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_11.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_107_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The man played a game of catch with the bags, skillfully avoiding getting hit.\nB: While disposing of his trash, the man tossed a bag upwards and it landed on him by accident. He proceeded to kick another bag down to the ground, but unfortunately missed his target.\nC: The man threw the bag in the air and it landed perfectly in the garbage can.\nD: The man picked up the bags and placed them neatly by the curb.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The man played a game of catch with the bags, skillfully avoiding getting hit.\nB: While disposing of his trash, the man tossed a bag upwards and it landed on him by accident. 
He proceeded to kick another bag down to the ground, but unfortunately missed his target.\nC: The man threw the bag in the air and it landed perfectly in the garbage can.\nD: The man picked up the bags and placed them neatly by the curb.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_108_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The owner accidentally backs the car into the roadside bushes despite the reverse camera system.\nB: The car's reverse camera system fails to beep as the car approaches the roadside bushes, leading to a collision.\nC: The car's reverse 
camera system beeps erratically, causing confusion for the owner.\nD: The car's reverse camera system is constantly beeping as the car approaches the roadside bushes, but upon the owner's exit from the driver's seat and inspection, it is found that the bushes are still a considerable distance away.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The owner accidentally backs the car into the roadside bushes despite the reverse camera system.\nB: The car's reverse camera system fails to beep as the car approaches the roadside bushes, leading to a collision.\nC: The car's reverse camera system beeps erratically, causing confusion for the owner.\nD: The car's reverse camera system is constantly beeping as the car approaches the roadside bushes, but upon the owner's exit from the driver's seat and inspection, it is found that the bushes are still a considerable distance away.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_11.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_109_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video's funny moment arises from the man's overacting, as he attempts to catch a soda can with his hand but instead gets hit on the head, resulting in a tragicomic scene.\nB: The video's humor comes from the unexpected failure of the man to open the soda can, resulting in a frustrating and disappointing moment.\nC: The comedic aspect of the video is derived from the man's serious and reserved demeanor as he avoids the soda can, creating a tension-filled scene.\nD: The humor in the video is due to the man being successful in catching the soda can with his hand, leading to a heartwarming moment.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video's funny moment arises from the man's overacting, as he attempts to catch a soda can with his hand but instead gets hit on the head, resulting in a tragicomic scene.\nB: The video's humor comes from the unexpected failure of the man to open the soda can, resulting in a frustrating and disappointing moment.\nC: The comedic aspect of the video is derived from the man's serious and reserved demeanor as he avoids the soda can, creating a tension-filled scene.\nD: The humor in the video is due to the man being successful in catching the soda can with his hand, leading to a heartwarming moment.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_0.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_110_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Abrupt shock.\nB: Instant shock.\nC: Sudden surprise.\nD: Unexpected shock.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Abrupt shock.\nB: Instant shock.\nC: Sudden surprise.\nD: Unexpected shock.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_2.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_111_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: From behind the glass, a spotted leopard and a young child are seen batting at each other playfully.\nB: A zebra and a little boy are engaging in a dangerous game of tag.\nC: A lion and a toddler are having a serious fight inside a cage.\nD: A monkey and a kid are seen throwing items at each other from behind a glass enclosure.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: From behind the glass, a spotted leopard and a young child are seen batting at each other playfully.\nB: A zebra and a little boy are engaging in a dangerous game of tag.\nC: A lion and a toddler are having a serious fight inside a cage.\nD: A monkey and a kid are seen throwing items at each other from 
behind a glass enclosure.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_112_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Oh no, I lost my pet rock again, where did it go?\nB: My cellular device appears to be missing, any idea where it could be?\nC: Has anyone seen my imaginary friend? 
I think they ran away again.\nD: I seem to have misplaced my invisible hat, any thoughts on its whereabouts?", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Oh no, I lost my pet rock again, where did it go?\nB: My cellular device appears to be missing, any idea where it could be?\nC: Has anyone seen my imaginary friend? I think they ran away again.\nD: I seem to have misplaced my invisible hat, any thoughts on its whereabouts?", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_113_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": 
"fun_qa", + "options": "A: What makes the video funny is the dog using the person as a marker and peeing directly on the woman. The woman's unfortunate experience adds an element of tragicomedy to the scene.\nB: The video's humor comes from the unexpected interaction between the dog and the woman.\nC: The funny aspect of the video is the woman's unexpected encounter with the dog.\nD: The humor is created by the dog's unusual behavior towards the woman.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: What makes the video funny is the dog using the person as a marker and peeing directly on the woman. The woman's unfortunate experience adds an element of tragicomedy to the scene.\nB: The video's humor comes from the unexpected interaction between the dog and the woman.\nC: The funny aspect of the video is the woman's unexpected encounter with the dog.\nD: The humor is created by the dog's unusual behavior towards the woman.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_10.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_114_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Where was this video filmed?\nB: What time is the lunch break?\nC: How many dogs are in the video?\nD: Who is the designated security personnel?", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Where was this video filmed?\nB: What time is the lunch break?\nC: How many dogs are in the video?\nD: Who is the designated security personnel?", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_10.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_115_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: From beneath the toilet door panel, a hand is reaching out with an upward-facing palm to receive chopsticks and a spoon from someone outside.\nB: The hand is asking for help to get out of the bathroom.\nC: The hand is actually reaching out for a handshake.\nD: A person is handing over toilet paper instead of chopsticks and a spoon.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: From beneath the toilet door panel, a hand is reaching out with an upward-facing palm to receive chopsticks and a spoon from someone outside.\nB: The hand is asking for help to get out of the bathroom.\nC: The hand is actually reaching out for a handshake.\nD: A person is handing over toilet paper instead of chopsticks and a spoon.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_6.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_116_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A peaceful resolution with positive outcome.\nB: A successful attack with unexpected results.\nC: A well-executed plan that achieves the desired goal.\nD: A failed attack that yields no results.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A peaceful resolution with positive outcome.\nB: A successful attack with unexpected results.\nC: A well-executed plan that achieves the desired goal.\nD: A failed attack that yields no results.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_5.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_117_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A lady is reclining on a lounger when she sways twice, and tips over, falling to the ground.\nB: The lady jumps from the lounger and starts dancing.\nC: The lounger breaks and the lady falls abruptly.\nD: A lady is sitting calmly on a lounger and gets up gracefully.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A lady is reclining on a lounger when she sways twice, and tips over, falling to the ground.\nB: The lady jumps from the lounger and starts dancing.\nC: The lounger breaks and the lady falls abruptly.\nD: A lady is sitting calmly on a lounger and gets up gracefully.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_2.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_118_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The cat exclaimed in frustration, \"Why is it always me who has to face unlucky incidents?\"\nB: The Cat's Fortune\nC: Unlucky Cat Adventures\nD: Feline Frustration", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The cat exclaimed in frustration, \"Why is it always me who has to face unlucky incidents?\"\nB: The Cat's Fortune\nC: Unlucky Cat Adventures\nD: Feline Frustration", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_2.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_119_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video is entertaining because it shows a person teasing a small dog and the dog responds by tugging on the person's clothing, creating a humorous moment.\nB: The video is hilarious because a person is teasing a small dog by shaking their buttocks and the dog bites off their pants, which is very funny.\nC: The video is funny because a person is playing with a small dog and the dog tugs on their clothes, resulting in a humorous situation.\nD: The video is amusing because it depicts a person dancing with a small dog and the dog reacts by pulling on the person's clothes, which is comical.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video is entertaining because it 
shows a person teasing a small dog and the dog responds by tugging on the person's clothing, creating a humorous moment.\nB: The video is hilarious because a person is teasing a small dog by shaking their buttocks and the dog bites off their pants, which is very funny.\nC: The video is funny because a person is playing with a small dog and the dog tugs on their clothes, resulting in a humorous situation.\nD: The video is amusing because it depicts a person dancing with a small dog and the dog reacts by pulling on the person's clothes, which is comical.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_120_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + 
"visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Three guys are dancing on a bench to a popular song, trying to impress their friends.\nB: A group of people are struggling to move a heavy bench across the room.\nC: Three friends are sitting on a bench and watching a funny video on their phone.\nD: Inside a KTV, three males are sitting on a long bench, touching each other's backs, and simultaneously making rowing movements, causing the bench to move backwards.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Three guys are dancing on a bench to a popular song, trying to impress their friends.\nB: A group of people are struggling to move a heavy bench across the room.\nC: Three friends are sitting on a bench and watching a funny video on their phone.\nD: Inside a KTV, three males are sitting on a long bench, touching each other's backs, and simultaneously making rowing movements, causing the bench to move backwards.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_10.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_121_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: After drinking, the black dog is telling others not to drink anymore.\nB: The black dog is warning others about the dangers of overeating.\nC: The black dog is encouraging others to drink more alcohol.\nD: The black dog is enjoying a drinking spree with its friends.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: After drinking, the black dog is telling others not to drink anymore.\nB: The black dog is warning others about the dangers of overeating.\nC: The black dog is encouraging others to drink more alcohol.\nD: The black dog is enjoying a drinking spree with its friends.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_7.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_122_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The entire video is not funny, it's just plain awkward\nB: The behavior doesn't strike me as funny, but for some people, the fact that the man on top was unexpectedly awoken could be a source of amusement.\nC: The humor comes from the man's startled reaction\nD: The funny part is the loud noise that wakes up the man", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The entire video is not funny, it's just plain awkward\nB: The behavior doesn't strike me as funny, but for some people, the fact that the man on top was unexpectedly awoken could be a source of amusement.\nC: The humor comes from the man's startled reaction\nD: The funny part is the loud noise that wakes up the man", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_3.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_123_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The security personnel lifted their hands and conducted a check on the individual coming towards them.\nB: The security personnel danced with the individual instead of conducting a check.\nC: The security personnel ignored the individual and continued chatting with each other.\nD: The security personnel mistook the individual for someone else and waved them through without a check.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The security personnel lifted their hands and conducted a check on the individual coming towards them.\nB: The security personnel danced with the individual instead of conducting a check.\nC: The security personnel ignored the individual and continued chatting with each other.\nD: The security personnel mistook the individual for 
someone else and waved them through without a check.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_124_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The dog's determination to succeed\nB: It is often said that success breeds success, but it was hilarious to see this dog hit the glass on its second try after succeeding once.\nC: The glass representing a barrier for the dog\nD: The unexpected outcome of the dog hitting the glass on its second try", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The 
dog's determination to succeed\nB: It is often said that success breeds success, but it was hilarious to see this dog hit the glass on its second try after succeeding once.\nC: The glass representing a barrier for the dog\nD: The unexpected outcome of the dog hitting the glass on its second try", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_125_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video is comedic because of the dull and uninteresting tapping between the flower leopard and the child through the glass.\nB: The video is comedic because of the serious 
interaction between the flower leopard and the child through the glass.\nC: The video is comedic because of the aggressive behavior of the flower leopard towards the child through the glass.\nD: The playful tapping between the flower leopard and the child through the glass gives the impression that the leopard has transformed into an adorable and humorous child.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video is comedic because of the dull and uninteresting tapping between the flower leopard and the child through the glass.\nB: The video is comedic because of the serious interaction between the flower leopard and the child through the glass.\nC: The video is comedic because of the aggressive behavior of the flower leopard towards the child through the glass.\nD: The playful tapping between the flower leopard and the child through the glass gives the impression that the leopard has transformed into an adorable and humorous child.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_10.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_126_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The audience's expectations were completely met, and there was nothing unexpected or humorous about the situation.\nB: The child was holding a basin of snow with a serious expression, intending to pour it towards her father in a swift and standard manner. As a result, the full basin of snow, with the child's full force, fell back onto her own face, greatly disappointing the audience's expectations, which was really hard to hold back the laughter.\nC: The child's serious expression and intention to pour snow towards her father were not funny at all.\nD: The fall of snow onto the child's face was a tragic and painful event, not a comedic one.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The audience's expectations were completely met, and there was nothing unexpected or humorous about the situation.\nB: The child was holding a basin of snow with a serious expression, intending to pour it towards her father in a swift and standard manner. 
As a result, the full basin of snow, with the child's full force, fell back onto her own face, greatly disappointing the audience's expectations, which was really hard to hold back the laughter.\nC: The child's serious expression and intention to pour snow towards her father were not funny at all.\nD: The fall of snow onto the child's face was a tragic and painful event, not a comedic one.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_127_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A sudden gust of wind blows away the umbrella, leaving everyone drenched and 
shocked.\nB: The scene of a group of people huddled under a large umbrella to escape the rain, which has a capacity for many individuals, is quite humorous.\nC: The unexpected appearance of a clown riding a unicycle in the background adds to the comedic effect.\nD: The group of people suddenly break into a spontaneous dance party, much to the amusement of onlookers.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A sudden gust of wind blows away the umbrella, leaving everyone drenched and shocked.\nB: The scene of a group of people huddled under a large umbrella to escape the rain, which has a capacity for many individuals, is quite humorous.\nC: The unexpected appearance of a clown riding a unicycle in the background adds to the comedic effect.\nD: The group of people suddenly break into a spontaneous dance party, much to the amusement of onlookers.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_11.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_128_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Feely flip caution\nB: Touchy reverse alert\nC: Sensitive retro warning\nD: Tactile backward alarm", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Feely flip caution\nB: Touchy reverse alert\nC: Sensitive retro warning\nD: Tactile backward alarm", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_12.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_129_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The man on skis falls down after hitting a rock.\nB: The men are playing a friendly game of snow football.\nC: Two men are skiing together on the snow.\nD: Two men are pulling a squatting man on skis through the snow. The squatting man sticks out his right foot and trips the man on his right.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The man on skis falls down after hitting a rock.\nB: The men are playing a friendly game of snow football.\nC: Two men are skiing together on the snow.\nD: Two men are pulling a squatting man on skis through the snow. 
The squatting man sticks out his right foot and trips the man on his right.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_130_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The strength difference between the two men is evident. 
The man on the right exudes a Dragon Aotian-like protagonist vibe, and even a slight move from him renders the other party defenseless.\nB: The video maintains its humorous tone by incorporating slapstick comedy and clever visual effects.\nC: The video maintains its humorous tone through exaggerated facial expressions and dramatic music cues.\nD: The humorous tone is achieved through witty dialogue and unexpected plot twists.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The strength difference between the two men is evident. The man on the right exudes a Dragon Aotian-like protagonist vibe, and even a slight move from him renders the other party defenseless.\nB: The video maintains its humorous tone by incorporating slapstick comedy and clever visual effects.\nC: The video maintains its humorous tone through exaggerated facial expressions and dramatic music cues.\nD: The humorous tone is achieved through witty dialogue and unexpected plot twists.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_10.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_131_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: While walking, a man accidentally kicked a soda can which hit him in the face.\nB: A man jumped on a wooden plank and a soda can flew and hit him on the head.\nC: A man slipped on a piece of wood, causing a soda can to hit him in the head.\nD: Stepping on a wooden board, a man caused a soda can to be flung into the air from the other end. The man tried to catch it with his hand, but the can missed his grip and struck him in the head.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: While walking, a man accidentally kicked a soda can which hit him in the face.\nB: A man jumped on a wooden plank and a soda can flew and hit him on the head.\nC: A man slipped on a piece of wood, causing a soda can to hit him in the head.\nD: Stepping on a wooden board, a man caused a soda can to be flung into the air from the other end. 
The man tried to catch it with his hand, but the can missed his grip and struck him in the head.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_132_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The king of accuracy.\nB: The champion of sharpshooting.\nC: The master of archery.\nD: The ruler of marksmanship.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The king of accuracy.\nB: The champion of sharpshooting.\nC: The master of archery.\nD: The ruler of marksmanship.", + "input_image_path": [ + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_133_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Colossal monster is leading the way for Tiny Ultraman.\nB: Giant Ultraman is running from a tiny monster.\nC: Tiny Ultraman is in pursuit of a colossal monster.\nD: Tiny Ultraman is relaxing with the colossal monster.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Colossal monster is leading the way for Tiny Ultraman.\nB: Giant Ultraman is running from a tiny monster.\nC: Tiny Ultraman is in pursuit of a colossal monster.\nD: 
Tiny Ultraman is relaxing with the colossal monster.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_134_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: An adult is holding a child wearing a white hat. The adult holds a strawberry and reaches towards the child's mouth, then puts it into their own mouth. 
The child initially smiles but then shows a crying face.\nB: The adult and the child are sharing a strawberry happily.\nC: A child is feeding a strawberry to an adult wearing a white hat.\nD: The child is teasing the adult with a strawberry, causing the adult to make a funny face.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: An adult is holding a child wearing a white hat. The adult holds a strawberry and reaches towards the child's mouth, then puts it into their own mouth. The child initially smiles but then shows a crying face.\nB: The adult and the child are sharing a strawberry happily.\nC: A child is feeding a strawberry to an adult wearing a white hat.\nD: The child is teasing the adult with a strawberry, causing the adult to make a funny face.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_12.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_135_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Your trousers have been gnawed away.\nB: Seems like your pants were devoured.\nC: Looks like your trousers got eaten.\nD: Your pants have been chewed up.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Your trousers have been gnawed away.\nB: Seems like your pants were devoured.\nC: Looks like your trousers got eaten.\nD: Your pants have been chewed up.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_12.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_136_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Two men are walking on a sturdy bridge over a field. The last man starts jumping on the bridge, causing it to sway. The first man cannot maintain his balance, and the last man falls down, dropping his phone into the water.\nB: Four men are walking on a suspension bridge over water. The last man starts jumping on the bridge, causing it to sway. The first three men cannot maintain their balance, and the last man falls down, dropping his hat into the water.\nC: Three women are walking on a suspension bridge over water. The last two women start singing, causing the bridge to sway. The first woman cannot maintain her balance, and all women fall down, dropping their phones into the water.\nD: Three women are walking on a suspension bridge over water. The last woman starts jumping on the bridge, causing it to sway. The first two women cannot maintain their balance, and the last woman falls down, dropping her phone into the water.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Two men are walking on a sturdy bridge over a field. The last man starts jumping on the bridge, causing it to sway. The first man cannot maintain his balance, and the last man falls down, dropping his phone into the water.\nB: Four men are walking on a suspension bridge over water. The last man starts jumping on the bridge, causing it to sway. The first three men cannot maintain their balance, and the last man falls down, dropping his hat into the water.\nC: Three women are walking on a suspension bridge over water. 
The last two women start singing, causing the bridge to sway. The first woman cannot maintain her balance, and all women fall down, dropping their phones into the water.\nD: Three women are walking on a suspension bridge over water. The last woman starts jumping on the bridge, causing it to sway. The first two women cannot maintain their balance, and the last woman falls down, dropping her phone into the water.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_137_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video's humor is based on the use of chopsticks and 
spoons, implying that the person in the video is consuming feces with these tools, which are typically used for eating.\nB: The video is comedic due to the use of advanced special effects that make the scene look realistic\nC: The video is comedic because it depicts a serious situation with dramatic music, creating a suspenseful atmosphere\nD: The video is comedic because it features famous celebrity cameos, adding a touch of glamour and sophistication", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video's humor is based on the use of chopsticks and spoons, implying that the person in the video is consuming feces with these tools, which are typically used for eating.\nB: The video is comedic due to the use of advanced special effects that make the scene look realistic\nC: The video is comedic because it depicts a serious situation with dramatic music, creating a suspenseful atmosphere\nD: The video is comedic because it features famous celebrity cameos, adding a touch of glamour and sophistication", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_9.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_138_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Unexpected collision with a roadside pole.\nB: To hit a utility pole head-on with great impact.\nC: The moment of impact with a utility pole.\nD: Car crash into a pole at full speed.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Unexpected collision with a roadside pole.\nB: To hit a utility pole head-on with great impact.\nC: The moment of impact with a utility pole.\nD: Car crash into a pole at full speed.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_8.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_139_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Four dogs are surrounding a water source, drinking. However, once the black dog finishes, it begins to dig at the water bowl with its paws, hindering the other dogs from getting a drink.\nB: One dog is drinking while the other dogs are playing around it.\nC: The dogs are enjoying a peaceful drink together without any disruptions.\nD: The dogs are fighting over a treat in the water bowl.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Four dogs are surrounding a water source, drinking. 
However, once the black dog finishes, it begins to dig at the water bowl with its paws, hindering the other dogs from getting a drink.\nB: One dog is drinking while the other dogs are playing around it.\nC: The dogs are enjoying a peaceful drink together without any disruptions.\nD: The dogs are fighting over a treat in the water bowl.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_140_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Marking is a beloved activity of dogs to indicate their territory.\nB: Dogs have a unique way of expressing their love for grass.\nC: 
Territorial disputes among dogs can be quite colorful.\nD: Dogs take pride in leaving their mark everywhere they go.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Marking is a beloved activity of dogs to indicate their territory.\nB: Dogs have a unique way of expressing their love for grass.\nC: Territorial disputes among dogs can be quite colorful.\nD: Dogs take pride in leaving their mark everywhere they go.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_141_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + 
"options": "A: A dog stretched out its paws and scratched towards a snake. Suddenly, when the dog pulled the snake over, it jumped into the air. After falling to the ground, the dog quickly rolled over and stood up to stare at the snake.\nB: A cat meowed loudly and scared the dog away.\nC: A bird flew down and perched on the dog's nose, making it sneeze.\nD: A squirrel ran past the dog, causing it to chase after the squirrel.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A dog stretched out its paws and scratched towards a snake. Suddenly, when the dog pulled the snake over, it jumped into the air. After falling to the ground, the dog quickly rolled over and stood up to stare at the snake.\nB: A cat meowed loudly and scared the dog away.\nC: A bird flew down and perched on the dog's nose, making it sneeze.\nD: A squirrel ran past the dog, causing it to chase after the squirrel.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_11.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_142_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Two cats gracefully danced down the slope in perfect synchronization.\nB: The cats climbed the slope skillfully and reached the top without any mishaps.\nC: The cats peacefully enjoyed the view while sitting on the slope.\nD: On a smooth incline, one cat lost its balance and knocked down another cat, resulting in both cats falling off the slope while intertwined.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Two cats gracefully danced down the slope in perfect synchronization.\nB: The cats climbed the slope skillfully and reached the top without any mishaps.\nC: The cats peacefully enjoyed the view while sitting on the slope.\nD: On a smooth incline, one cat lost its balance and knocked down another cat, resulting in both cats falling off the slope while intertwined.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_6.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_143_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The audience did not find the video amusing.\nB: In an attempt to jump with height, the person collided with the pole and fell onto the mat. The series of events that followed were smooth and entertaining.\nC: The video was a serious depiction of a dangerous stunt.\nD: The person effortlessly executed the jump without any mishaps.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The audience did not find the video amusing.\nB: In an attempt to jump with height, the person collided with the pole and fell onto the mat. 
The series of events that followed were smooth and entertaining.\nC: The video was a serious depiction of a dangerous stunt.\nD: The person effortlessly executed the jump without any mishaps.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_144_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Hidden damage.\nB: Secretive injury\nC: Concealed harm\nD: Unexpected destruction", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Hidden damage.\nB: Secretive injury\nC: Concealed harm\nD: Unexpected destruction", 
+ "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_145_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video is funny because it shows three men skydiving from a plane with serious expressions.\nB: The video is funny because three men were having a great time in KTV, pretending that the chairs were boats and paddling with funny movements.\nC: The video is funny because it captures a serious business meeting with three men sitting in an office.\nD: The video is funny because it features three men participating in a cooking competition with intense concentration.", + 
"question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video is funny because it shows three men skydiving from a plane with serious expressions.\nB: The video is funny because three men were having a great time in KTV, pretending that the chairs were boats and paddling with funny movements.\nC: The video is funny because it captures a serious business meeting with three men sitting in an office.\nD: The video is funny because it features three men participating in a cooking competition with intense concentration.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_146_15.png" + ], + "output": "B" + }, + { + "task": 
"meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The part where he successfully kicked the garbage was the funniest moment.\nB: The video was not funny at all.\nC: The moment when the garbage that he had thrown out hit him was not funny at all.\nD: The moment when the garbage that he had thrown out hit him was amusing enough. However, the situation turned even more hilarious when he missed kicking it.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The part where he successfully kicked the garbage was the funniest moment.\nB: The video was not funny at all.\nC: The moment when the garbage that he had thrown out hit him was not funny at all.\nD: The moment when the garbage that he had thrown out hit him was amusing enough. However, the situation turned even more hilarious when he missed kicking it.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_11.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_147_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Two people on the lower bunk are kicking the bed board of the upper bunk with their feet, causing the person on top to be shaken awake.\nB: The person on the lower bunk is quietly reading a book while the person on the upper bunk is peacefully sleeping.\nC: The person on the lower bunk is playing a guitar, creating a calming atmosphere, while the person on the upper bunk is enjoying the music.\nD: Three people are having a pillow fight on the lower bunk, while the person on the upper bunk is fast asleep.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Two people on the lower bunk are kicking the bed board of the upper bunk with their feet, causing the person on top to be shaken awake.\nB: The person on the lower bunk is quietly reading a book while the person on the upper bunk is peacefully sleeping.\nC: The person on the lower bunk is playing a guitar, creating a calming atmosphere, while the person on the upper bunk is enjoying the music.\nD: Three people are having a pillow fight on the lower bunk, while the person on the upper bunk is fast asleep.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_3.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_148_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The woman's action is actually quite dangerous and not suitable for a humorous video.\nB: The woman's movement is ordinary and predictable, which makes the video boring.\nC: It's entertaining to observe the woman's unconventional way of descending to the ground, with her head facing directly downward.\nD: The video lacks any unexpected or unconventional elements, making it uninteresting.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The woman's action is actually quite dangerous and not suitable for a humorous video.\nB: The woman's movement is ordinary and predictable, which makes the video boring.\nC: It's entertaining to observe the woman's unconventional way of descending to the ground, with her head facing directly downward.\nD: The video lacks any 
unexpected or unconventional elements, making it uninteresting.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_149_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Library study session\nB: Funeral procession\nC: Political debate\nD: Social dance party.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Library study session\nB: Funeral procession\nC: Political debate\nD: Social dance party.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_0.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_150_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The child and the adult break into a choreographed dance routine instead of playing the prank.\nB: The child accidentally trips and falls while trying to switch off the light.\nC: The computer screen suddenly freezes, causing confusion for the child and the adult.\nD: The child who was seated in front of the computer rapidly turns it off and switches off the light. 
When the adult opens the door and finds nothing amiss, the child promptly switches on the lights and spots the adult standing behind them.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The child and the adult break into a choreographed dance routine instead of playing the prank.\nB: The child accidentally trips and falls while trying to switch off the light.\nC: The computer screen suddenly freezes, causing confusion for the child and the adult.\nD: The child who was seated in front of the computer rapidly turns it off and switches off the light. When the adult opens the door and finds nothing amiss, the child promptly switches on the lights and spots the adult standing behind them.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_13.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_151_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Upon hearing the staff member boldly remark, \"Slow down, it's a big fat guy,\" directly in front of the man, he instantly turned his head in utter shock and fixed his gaze on the staff member. His wide-eyed stare revealed a mixture of astonishment, perplexity, and dazedness. Without giving the man a chance to respond, the staff member ruthlessly pushed him and initiated the cable car ride.\nB: The video maintains its humorous tone through the use of dark and offensive humor, which may not be suitable for everyone.\nC: The video maintains its humorous tone by incorporating slapstick comedy and exaggerated physical gestures.\nD: The video maintains its humorous tone by emphasizing awkward social interactions and uncomfortable situations.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Upon hearing the staff member boldly remark, \"Slow down, it's a big fat guy,\" directly in front of the man, he instantly turned his head in utter shock and fixed his gaze on the staff member. His wide-eyed stare revealed a mixture of astonishment, perplexity, and dazedness. 
Without giving the man a chance to respond, the staff member ruthlessly pushed him and initiated the cable car ride.\nB: The video maintains its humorous tone through the use of dark and offensive humor, which may not be suitable for everyone.\nC: The video maintains its humorous tone by incorporating slapstick comedy and exaggerated physical gestures.\nD: The video maintains its humorous tone by emphasizing awkward social interactions and uncomfortable situations.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_152_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The 
Dance Floor Champion\nB: The Air Guitar Virtuoso\nC: The Lip-Sync Sensation\nD: The Karaoke Master", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The Dance Floor Champion\nB: The Air Guitar Virtuoso\nC: The Lip-Sync Sensation\nD: The Karaoke Master", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_153_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video maintains its humorous tone by using dark and gloomy lighting to set a serious mood.\nB: The video maintains its humorous tone by adding intense 
music to create tension and suspense.\nC: The small canine was hesitant to swim, so the bigger dog lifted it up and took it along, creating a humorous scene.\nD: The video maintains its humorous tone by including sad and heartwrenching soundtrack to evoke emotional response.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video maintains its humorous tone by using dark and gloomy lighting to set a serious mood.\nB: The video maintains its humorous tone by adding intense music to create tension and suspense.\nC: The small canine was hesitant to swim, so the bigger dog lifted it up and took it along, creating a humorous scene.\nD: The video maintains its humorous tone by including sad and heartwrenching soundtrack to evoke emotional response.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_12.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_154_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A blue cake is being given to two young cats by a person, but one of the cats unexpectedly reaches out and bats the cake away with its claws.\nB: A red cake is being given to two dogs by a person, but one of the dogs unexpectedly reaches out and barks at the cake.\nC: A yellow cake is being given to two kittens by a person, but one of the kittens unexpectedly reaches out and licks the cake.\nD: A green cake is being given to two puppies by a person, but one of the puppies unexpectedly reaches out and eats the cake.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A blue cake is being given to two young cats by a person, but one of the cats unexpectedly reaches out and bats the cake away with its claws.\nB: A red cake is being given to two dogs by a person, but one of the dogs unexpectedly reaches out and barks at the cake.\nC: A yellow cake is being given to two kittens by a person, but one of the kittens unexpectedly reaches out and licks the cake.\nD: A green cake is being given to two puppies by a person, but one of the puppies unexpectedly reaches out and eats the cake.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_3.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_155_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Reflection of hands in the mirror\nB: Mystical hand movements in the reflected image\nC: Shadowy figures reflected in the glass\nD: Ghostly apparition in the mirror", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Reflection of hands in the mirror\nB: Mystical hand movements in the reflected image\nC: Shadowy figures reflected in the glass\nD: Ghostly apparition in the mirror", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_3.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_156_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Following the cleaning session, the individual tucked the broom beneath their armpit and subsequently couldn't locate it.\nB: The person comically mistook a mop for a broom and started using it as a microphone instead.\nC: After finishing the cleaning, the person accidentally tripped over the broom and fell comically.\nD: Once done with cleaning, the person used the broom as a guitar and performed a funny air guitar solo.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Following the cleaning session, the individual tucked the broom beneath their armpit and subsequently couldn't locate it.\nB: The person comically mistook a mop for a broom and started using it as a microphone instead.\nC: After finishing the cleaning, the person accidentally tripped over 
the broom and fell comically.\nD: Once done with cleaning, the person used the broom as a guitar and performed a funny air guitar solo.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_157_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A high-speed chase\nB: A discarded vehicle.\nC: A luxury car showroom\nD: A traffic jam on the highway", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A high-speed chase\nB: A discarded vehicle.\nC: A luxury car showroom\nD: A traffic jam on the highway", + 
"input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_158_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Dogs scared of needles during vet visit\nB: Canine vaccination anxiety revealed\nC: Canine pups are also frightened of getting vaccinated.\nD: Adorable puppies facing their worst fear", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Dogs scared of needles during vet visit\nB: Canine vaccination anxiety revealed\nC: Canine pups are also frightened of getting vaccinated.\nD: Adorable puppies facing their 
worst fear", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_159_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The unexpected twist in the storyline creates a funny outcome.\nB: The use of exaggerated facial expressions makes the video amusing.\nC: The humorous background music adds to the comedic effect.\nD: The funny thing is the expression on the lady's face, as if she is putting in a lot of effort, which is quite comical.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The unexpected twist in the 
storyline creates a funny outcome.\nB: The use of exaggerated facial expressions makes the video amusing.\nC: The humorous background music adds to the comedic effect.\nD: The funny thing is the expression on the lady's face, as if she is putting in a lot of effort, which is quite comical.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_160_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: At a crowded market, a monkey was laughing loudly while stealing fruits.\nB: In a library, a cat was meowing loudly while knocking down books.\nC: On the coastal pathway, one bird 
was screeching with an open beak, followed by another bird's call.\nD: In a classroom, a dog was howling loudly while running around the desk.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: At a crowded market, a monkey was laughing loudly while stealing fruits.\nB: In a library, a cat was meowing loudly while knocking down books.\nC: On the coastal pathway, one bird was screeching with an open beak, followed by another bird's call.\nD: In a classroom, a dog was howling loudly while running around the desk.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_161_15.png" + ], + "output": "C" + }, + { + "task": 
"meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Discovering the hidden secrets of the universe through music.\nB: A funny look at pet care techniques.\nC: Mastering the art of cooking without a recipe.\nD: The ideal applications of a stethoscope.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Discovering the hidden secrets of the universe through music.\nB: A funny look at pet care techniques.\nC: Mastering the art of cooking without a recipe.\nD: The ideal applications of a stethoscope.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_162_15.png" 
+ ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The Joker and Batman are dancing in perfect sync.\nB: The head of Kung Fu Panda is perfectly aligned with Mickey Mouse's body, with the addition of a pair of human legs.\nC: Minnie Mouse and Donald Duck are having a tea party together.\nD: SpongeBob SquarePants is playing the piano with all four arms.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The Joker and Batman are dancing in perfect sync.\nB: The head of Kung Fu Panda is perfectly aligned with Mickey Mouse's body, with the addition of a pair of human legs.\nC: Minnie Mouse and Donald Duck are having a tea party together.\nD: SpongeBob SquarePants is playing the piano with all four arms.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_12.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_163_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Three friends jumping on a trampoline.\nB: Two women are trying to balance on a seesaw.\nC: Sisters falling together from a swing.\nD: If they are sisters, they will fall together on level ground.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Three friends jumping on a trampoline.\nB: Two women are trying to balance on a seesaw.\nC: Sisters falling together from a swing.\nD: If they are sisters, they will fall together on level ground.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_11.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_164_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The playful music in the background\nB: The comical facial expressions of the onlookers\nC: It was a funny sight to see the woman drop from the swing.\nD: The unexpectedness of the woman dropping from the swing", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The playful music in the background\nB: The comical facial expressions of the onlookers\nC: It was a funny sight to see the woman drop from the swing.\nD: The unexpectedness of the woman dropping from the swing", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_10.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_165_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A dog suddenly ran onto the mat, causing the high jumper to lose balance and fall.\nB: The high jump bar fell due to a strong wind, causing chaos in the competition.\nC: Someone jumped high and landed on the edge of the mat, resulting in the mat being overturned and the high jump apparatus being knocked down.\nD: The person attempted a somersault but failed and crashed into the audience.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A dog suddenly ran onto the mat, causing the high jumper to lose balance and fall.\nB: The high jump bar fell due to a strong wind, causing chaos in the competition.\nC: Someone jumped high and landed on the edge of the mat, resulting in the mat being overturned and the high jump apparatus being knocked down.\nD: The person attempted a somersault but failed and crashed into the audience.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_4.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_166_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Boring Haircut.\nB: Flaming Hairstyle.\nC: Soggy Hairstyle.\nD: Plain Hairstyle.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Boring Haircut.\nB: Flaming Hairstyle.\nC: Soggy Hairstyle.\nD: Plain Hairstyle.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_6.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_167_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: With its eyes covered, a donkey is being walked around a millstone, which causes it to eventually start nibbling on the grains on the plate.\nB: The millstone is spinning while the donkey watches from a distance.\nC: A donkey is leisurely walking around a millstone, observing its surroundings.\nD: The donkey is standing still, not showing any reaction to the millstone.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: With its eyes covered, a donkey is being walked around a millstone, which causes it to eventually start nibbling on the grains on the plate.\nB: The millstone is spinning while the donkey watches from a distance.\nC: A donkey is leisurely walking around a millstone, observing its surroundings.\nD: The donkey is standing still, not showing any reaction to the millstone.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_1.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_168_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A small dog jumping through a hula hoop.\nB: A little dog leaping over a bungee cord.\nC: A tiny pup skipping over an elastic cord.\nD: A miniature puppy hopping over a rubber band.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A small dog jumping through a hula hoop.\nB: A little dog leaping over a bungee cord.\nC: A tiny pup skipping over an elastic cord.\nD: A miniature puppy hopping over a rubber band.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_0.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_169_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A dump truck is driving on a highway with multiple lanes. The sun is setting in the background, casting long shadows from the surrounding trees.\nB: A delivery van is parked outside a cafe, with people passing by and clouds drifting slowly across the sky.\nC: A cement mixer truck is moving through a tunnel with a series of lights along both sides of the tunnel. The ladder at the rear of the truck produces shadows in varying locations as it moves beneath the different lights.\nD: A bicycle is traversing a bridge with colorful graffiti on the walls. 
The river below reflects the bridge's arches in a mesmerizing pattern.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A dump truck is driving on a highway with multiple lanes. The sun is setting in the background, casting long shadows from the surrounding trees.\nB: A delivery van is parked outside a cafe, with people passing by and clouds drifting slowly across the sky.\nC: A cement mixer truck is moving through a tunnel with a series of lights along both sides of the tunnel. The ladder at the rear of the truck produces shadows in varying locations as it moves beneath the different lights.\nD: A bicycle is traversing a bridge with colorful graffiti on the walls. The river below reflects the bridge's arches in a mesmerizing pattern.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_12.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_170_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Way off target.\nB: Bullseye!\nC: Right on target.\nD: Missed the mark.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Way off target.\nB: Bullseye!\nC: Right on target.\nD: Missed the mark.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_14.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_171_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video is funny because the man singing is very passionate, while the man sitting next to him appears to be bored and uninterested, reducing the humor.\nB: The video is funny because the man singing is very passionate, while the man sitting next to him looks very uncomfortable and in pain after hearing it. The contrast between the two is amusing, showing that the man's singing is actually quite good, which makes people laugh.\nC: The video is funny because the man singing is very passionate, while the man sitting next to him is also enjoying the singing, which adds to the fun atmosphere.\nD: The video is funny because the man singing is very passionate, while the man sitting next to him looks very uncomfortable and in pain after hearing it. The contrast between the two is amusing, showing that the man's singing is so bad yet he is still so into it, which makes people laugh.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video is funny because the man singing is very passionate, while the man sitting next to him appears to be bored and uninterested, reducing the humor.\nB: The video is funny because the man singing is very passionate, while the man sitting next to him looks very uncomfortable and in pain after hearing it. 
The contrast between the two is amusing, showing that the man's singing is actually quite good, which makes people laugh.\nC: The video is funny because the man singing is very passionate, while the man sitting next to him is also enjoying the singing, which adds to the fun atmosphere.\nD: The video is funny because the man singing is very passionate, while the man sitting next to him looks very uncomfortable and in pain after hearing it. The contrast between the two is amusing, showing that the man's singing is so bad yet he is still so into it, which makes people laugh.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_172_15.png" + ], + "output": "D" + }, + { + "task": 
"meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video maintains its humorous tone by showing a person and a cat dance in unison, which is a rare sight, and the cat's happy and excited expression adds a touch of humor to the situation.\nB: Seeing a person and a dog dance in unison is a rare sight, and the little pup's confused and disoriented expression adds a touch of humor to the situation.\nC: The video maintains its humorous tone by showing a cat and a dog dancing in unison, which is a rare sight, and the cat's confused and disoriented expression adds a touch of humor to the situation.\nD: The video maintains its humorous tone by showing a person and a dog dance in unison, which is a common sight, and the little dog's confident and focused expression adds a touch of seriousness to the situation.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video maintains its humorous tone by showing a person and a cat dance in unison, which is a rare sight, and the cat's happy and excited expression adds a touch of humor to the situation.\nB: Seeing a person and a dog dance in unison is a rare sight, and the little pup's confused and disoriented expression adds a touch of humor to the situation.\nC: The video maintains its humorous tone by showing a cat and a dog dancing in unison, which is a rare sight, and the cat's confused and disoriented expression adds a touch of humor to the situation.\nD: The video maintains its humorous tone by showing a person and a dog dance in unison, which is a common sight, and the little dog's confident and focused expression adds a touch of seriousness to the situation.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_1.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_173_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: There are no other puppies in the video, it's just the dog shaking up and down.\nB: The dog is shaking up and down in mid-air, while its feet keep stepping on another puppy's head repeatedly. 
It looks like it's doing pull-ups, and the movement is very strange.\nC: The dog is actually flying and not shaking up and down.\nD: The dog is calmly sitting on the ground and not moving at all.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: There are no other puppies in the video, it's just the dog shaking up and down.\nB: The dog is shaking up and down in mid-air, while its feet keep stepping on another puppy's head repeatedly. It looks like it's doing pull-ups, and the movement is very strange.\nC: The dog is actually flying and not shaking up and down.\nD: The dog is calmly sitting on the ground and not moving at all.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_14.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_174_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Amidst the background noise of a person's loud laughter, two birds begin to cackle with their beaks wide open, much like human laughter. It appears as if they are responding to the human's merriment, making the scene quite comical.\nB: The video is humorous because the birds are actually speaking in human language and telling jokes.\nC: The comical element comes from the birds nervously laughing in response to the person's aggressive behavior.\nD: The humor arises from the birds mimicking the laughter of the person, creating an eerie and unsettling atmosphere.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Amidst the background noise of a person's loud laughter, two birds begin to cackle with their beaks wide open, much like human laughter. 
It appears as if they are responding to the human's merriment, making the scene quite comical.\nB: The video is humorous because the birds are actually speaking in human language and telling jokes.\nC: The comical element comes from the birds nervously laughing in response to the person's aggressive behavior.\nD: The humor arises from the birds mimicking the laughter of the person, creating an eerie and unsettling atmosphere.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_175_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The kitty is very hungry.\nB: This kitty 
won't eat.\nC: The cat is eagerly devouring its food.\nD: This cat is enjoying a meal.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The kitty is very hungry.\nB: This kitty won't eat.\nC: The cat is eagerly devouring its food.\nD: This cat is enjoying a meal.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_176_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The ladder's shadows looked scary and ominous in the tunnel.\nB: In the tunnel, the cement truck kept moving forward. 
The shadows of the ladder under different lights seemed to be dancing, making the ladder very interesting.\nC: The cement truck's movement was unpredictable and chaotic.\nD: The video was boring and lacked any comedic elements.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The ladder's shadows looked scary and ominous in the tunnel.\nB: In the tunnel, the cement truck kept moving forward. The shadows of the ladder under different lights seemed to be dancing, making the ladder very interesting.\nC: The cement truck's movement was unpredictable and chaotic.\nD: The video was boring and lacked any comedic elements.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_14.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_177_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A random spectator enjoying the tug-of-war match.\nB: The most exceptional player in the audience during a tug-of-war event.\nC: The unexpected hero of a sports event.\nD: The surprising winner of a competitive game.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A random spectator enjoying the tug-of-war match.\nB: The most exceptional player in the audience during a tug-of-war event.\nC: The unexpected hero of a sports event.\nD: The surprising winner of a competitive game.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_13.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_178_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The old man tries to use the stethoscope as a microphone.\nB: The woman uses the stethoscope to listen to the old man's heartbeat.\nC: The old man uses the stethoscope to listen to music.\nD: The old man puts the stethoscope on his ears, and the woman speaks into the other end.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The old man tries to use the stethoscope as a microphone.\nB: The woman uses the stethoscope to listen to the old man's heartbeat.\nC: The old man uses the stethoscope to listen to music.\nD: The old man puts the stethoscope on his ears, and the woman speaks into the other end.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_10.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_179_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Geospatial disturbance.\nB: Dimensional shift.\nC: Temporal distortion.\nD: Spatial displacement.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Geospatial disturbance.\nB: Dimensional shift.\nC: Temporal distortion.\nD: Spatial displacement.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_11.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_180_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video is comedic because the person fails to jump onto the cushion and falls down clumsily.\nB: After jumping onto a cushion, the person manages to flip the entire cushion over, which is an unforeseen turn of events.\nC: The video is comedic because the person successfully jumps onto the cushion but then just walks away.\nD: The video is comedic because the cushion remains unaffected and the person looks disappointed.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video is comedic because the person fails to jump onto the cushion and falls down clumsily.\nB: After jumping onto a cushion, the person manages to flip the entire cushion over, which is an unforeseen turn of events.\nC: The video is comedic because the person successfully jumps onto the cushion but then just walks away.\nD: The video is comedic because the cushion remains unaffected and the person looks disappointed.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_5.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_181_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The video is not funny because the man's failed stunt results in a serious injury and is actually quite disturbing.\nB: The video is funny because the man successfully performs a cool stunt by leaping over a garbage bin with elegance and grace.\nC: The video is humorous because the man attempts to pull off a cool stunt by leaping over a garbage bin, but his efforts result in failure and a comical groin injury.\nD: The video is funny because the man is not attempting any stunt and is just walking casually.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The video is not funny because the man's failed stunt results in a serious injury and is actually quite disturbing.\nB: The video is funny because the man successfully performs a cool stunt by leaping over a garbage bin with elegance and grace.\nC: The video is humorous because the man attempts to pull off a cool stunt by leaping over a garbage bin, but his efforts 
result in failure and a comical groin injury.\nD: The video is funny because the man is not attempting any stunt and is just walking casually.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_182_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A dog ran through and caused chaos by knocking over a table.\nB: A man suddenly tripped and knocked over a stack of chairs.\nC: When a woman fell, she inadvertently caused another woman to lose her balance.\nD: Two people collided while trying to catch a flying object.", + "question": "Please generate a description for this 
meme", + "context": "Select from the following choices.\nA: A dog ran through and caused chaos by knocking over a table.\nB: A man suddenly tripped and knocked over a stack of chairs.\nC: When a woman fell, she inadvertently caused another woman to lose her balance.\nD: Two people collided while trying to catch a flying object.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_183_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The sound of the puppy's bark is uncannily similar to that of a baby's weeping. 
Standard puppy barks sound like \"woof woof woof,\" but this one's barks are \"ah ah ah\" with a piercing sound.\nB: The video maintains its humorous tone by juxtaposing the adorable visuals of the puppy with the unexpected high-pitched barks, creating a humorous and surprising experience for the viewers.\nC: The video maintains its humorous tone by incorporating funny captions that highlight the unusual sounds of the puppy's barks, enhancing the comedic effect of the meme.\nD: The video maintains its humorous tone through the clever use of unexpected sound effects, creating a comical contrast between the puppy's appearance and its unique barks.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The sound of the puppy's bark is uncannily similar to that of a baby's weeping. Standard puppy barks sound like \"woof woof woof,\" but this one's barks are \"ah ah ah\" with a piercing sound.\nB: The video maintains its humorous tone by juxtaposing the adorable visuals of the puppy with the unexpected high-pitched barks, creating a humorous and surprising experience for the viewers.\nC: The video maintains its humorous tone by incorporating funny captions that highlight the unusual sounds of the puppy's barks, enhancing the comedic effect of the meme.\nD: The video maintains its humorous tone through the clever use of unexpected sound effects, creating a comical contrast between the puppy's appearance and its unique barks.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_4.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_184_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A man is playing guitar at a concert with a deep, soothing voice and standing still.\nB: A man is reciting poetry in a library with a soft, calm voice and remaining seated.\nC: A man is rapping on stage with a loud, energetic voice and dancing wildly.\nD: A man is singing at KTV with a sharp, high-pitched voice and performing a forward-leaning arrow step stance.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A man is playing guitar at a concert with a deep, soothing voice and standing still.\nB: A man is reciting poetry in a library with a soft, calm voice and remaining seated.\nC: A man is rapping on stage with a loud, energetic voice and dancing wildly.\nD: A man is singing at KTV with a sharp, high-pitched voice and performing a forward-leaning arrow step stance.", + "input_image_path": [ + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_185_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The man was too scared to start the zip-line.\nB: The staff member pushed the man without any reason.\nC: The man started the zip-line without looking around.\nD: Upon hearing the words of the staff member nearby, the man who was about to start the zip-line turned to stare at them before being swiftly pushed to begin the course.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The man was too scared to start the zip-line.\nB: The 
staff member pushed the man without any reason.\nC: The man started the zip-line without looking around.\nD: Upon hearing the words of the staff member nearby, the man who was about to start the zip-line turned to stare at them before being swiftly pushed to begin the course.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_186_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: At a birthday party, a man is painting with a brush in his hand, while another person is playing the guitar with intense focus.\nB: Inside a library, a person is quietly reading a book, while 
another individual is loudly discussing a topic on their phone.\nC: In a classroom, a teacher is scolding a student for not paying attention, while a group of students is laughing and having fun.\nD: Inside a KTV, a man is singing with a microphone in his grasp, keeping both hands open. On the adjacent sofa, a man is sitting with a fierce expression, unable to look straight ahead, with a downturned mouth.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: At a birthday party, a man is painting with a brush in his hand, while another person is playing the guitar with intense focus.\nB: Inside a library, a person is quietly reading a book, while another individual is loudly discussing a topic on their phone.\nC: In a classroom, a teacher is scolding a student for not paying attention, while a group of students is laughing and having fun.\nD: Inside a KTV, a man is singing with a microphone in his grasp, keeping both hands open. 
On the adjacent sofa, a man is sitting with a fierce expression, unable to look straight ahead, with a downturned mouth.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_187_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A little dog doing pull-ups.\nB: A young bird learning to fly for the first time.\nC: A small cat napping on a tree branch.\nD: A tiny hamster running on a miniature treadmill.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A little dog doing pull-ups.\nB: A young bird learning to 
fly for the first time.\nC: A small cat napping on a tree branch.\nD: A tiny hamster running on a miniature treadmill.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_188_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The well-dressed main character exudes sophistication.\nB: The run-down look of the car appears comical.\nC: The high-quality production value enhances the realism.\nD: The dramatic music in the background adds to the serious tone.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The 
well-dressed main character exudes sophistication.\nB: The run-down look of the car appears comical.\nC: The high-quality production value enhances the realism.\nD: The dramatic music in the background adds to the serious tone.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_189_15.png" + ], + "output": "B" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The person secured a rubber band to the puppy and initiated a game of jump rope.\nB: The puppy started dancing to a classical music piece.\nC: The puppy started speaking in fluent English.\nD: The person tried to ride the puppy like a bull.", + 
"question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The person secured a rubber band to the puppy and initiated a game of jump rope.\nB: The puppy started dancing to a classical music piece.\nC: The puppy started speaking in fluent English.\nD: The person tried to ride the puppy like a bull.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_190_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The use of dramatic music in the background adds to the comical effect.\nB: The slow motion effect used in the video 
enhances the comedic timing.\nC: The sudden appearance of a clown is unexpected and causes laughter.\nD: The situation of someone being unable to locate the broom despite placing it there themselves is humorous.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The use of dramatic music in the background adds to the comical effect.\nB: The slow motion effect used in the video enhances the comedic timing.\nC: The sudden appearance of a clown is unexpected and causes laughter.\nD: The situation of someone being unable to locate the broom despite placing it there themselves is humorous.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_14.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_191_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: The man and woman were about to leave the dance floor when the male bystander stopped them.\nB: The man and woman were arguing until the male bystander interrupted them.\nC: A man and woman were holding each other and dancing at a ball. A male bystander came over and tapped the man on his back, and soon after, the two men began dancing together.\nD: The man and woman were performing a traditional dance until the male bystander joined in.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: The man and woman were about to leave the dance floor when the male bystander stopped them.\nB: The man and woman were arguing until the male bystander interrupted them.\nC: A man and woman were holding each other and dancing at a ball. 
A male bystander came over and tapped the man on his back, and soon after, the two men began dancing together.\nD: The man and woman were performing a traditional dance until the male bystander joined in.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_192_15.png" + ], + "output": "C" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Using flames to style hair is an uncommon sight, and the man's baldness on the upper portion of his head adds a comical element.\nB: The use of water to style hair is a common sight, and the man's baldness on the upper portion of his head adds a serious element.\nC: 
The use of feathers to style hair is an uncommon sight, and the man's full head of hair adds a comical element.\nD: The use of confetti to style hair is a common sight, and the man's baldness on the upper portion of his head adds a dramatic element.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Using flames to style hair is an uncommon sight, and the man's baldness on the upper portion of his head adds a comical element.\nB: The use of water to style hair is a common sight, and the man's baldness on the upper portion of his head adds a serious element.\nC: The use of feathers to style hair is an uncommon sight, and the man's full head of hair adds a comical element.\nD: The use of confetti to style hair is a common sight, and the man's baldness on the upper portion of his head adds a dramatic element.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_12.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_193_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Using his hand to tap the mirror, the man then linked his hands together, only to reveal that he had a phone gripped in his mouth.\nB: The man used his hand to tap the mirror, then he linked his hands together, and finally he showed that he had a wallet stuck in his mouth.\nC: Initially tapping the mirror, the man then interlocked his fingers, ultimately exposing that he was holding a toy car in his mouth.\nD: After tapping the mirror with his hand, the man formed a heart shape with his fingers, leading to the reveal of a sandwich he was holding in his mouth.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Using his hand to tap the mirror, the man then linked his hands together, only to reveal that he had a phone gripped in his mouth.\nB: The man used his hand to tap the mirror, then he linked his hands together, and finally he showed that he had a wallet stuck in his mouth.\nC: Initially tapping the mirror, the man then interlocked his fingers, ultimately exposing that he was holding a toy car in his mouth.\nD: After tapping the mirror with his hand, the man formed a heart shape with his fingers, leading to the reveal of a sandwich he was holding in his mouth.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_2.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_194_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: A dog is resting atop another dog.\nB: Two dogs playing together in the park.\nC: A cat is sleeping on a dog's back.\nD: A dog is jumping over another dog.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: A dog is resting atop another dog.\nB: Two dogs playing together in the park.\nC: A cat is sleeping on a dog's back.\nD: A dog is jumping over another dog.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_2.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_195_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Martial Arts Mickey Mouse.\nB: Ninja SpongeBob SquarePants.\nC: Karate Bugs Bunny.\nD: Kung Fu Donald Duck.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Martial Arts Mickey Mouse.\nB: Ninja SpongeBob SquarePants.\nC: Karate Bugs Bunny.\nD: Kung Fu Donald Duck.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_3.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_196_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: There is no music playing in the video, it is silent.\nB: The men in the video are not dancing, but are standing still.\nC: The dog is not involved in the dance, but is just sitting in the background.\nD: A dog is part of the dance as two men groove to the music.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: There is no music playing in the video, it is silent.\nB: The men in the video are not dancing, but are standing still.\nC: The dog is not involved in the dance, but is just sitting in the background.\nD: A dog is part of the dance as two men groove to the music.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_0.png", + 
"../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_197_15.png" + ], + "output": "D" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Sitting on a swing, the woman couldn't maintain stability and ended up falling down.\nB: The woman gracefully mastered the art of swinging and looked like a professional gymnast.\nC: The swing gently rocked back and forth as the woman enjoyed a peaceful moment.\nD: The woman effortlessly balanced on the swing, demonstrating impressive acrobatic skills.", + "question": "Please generate a description for this meme", + "context": "Select from the following choices.\nA: Sitting on a swing, the woman couldn't maintain stability and ended up falling down.\nB: The woman gracefully 
mastered the art of swinging and looked like a professional gymnast.\nC: The swing gently rocked back and forth as the woman enjoyed a peaceful moment.\nD: The woman effortlessly balanced on the swing, demonstrating impressive acrobatic skills.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_198_15.png" + ], + "output": "A" + }, + { + "task": "meme_vedio_understanding", + "visual_input_component": "Video image or Natural image", + "source": "fun_qa", + "options": "A: Blindfolded cooking experiment\nB: Unexpected blindfold challenge\nC: Blindfolded taste test gone wrong!\nD: The love for food persists even when blindfolded.", + "question": "Please generate a description for this meme", + 
"context": "Select from the following choices.\nA: Blindfolded cooking experiment\nB: Unexpected blindfold challenge\nC: Blindfolded taste test gone wrong!\nD: The love for food persists even when blindfolded.", + "input_image_path": [ + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_0.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_1.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_2.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_3.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_4.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_5.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_6.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_7.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_8.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_9.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_10.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_11.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_12.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_13.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_14.png", + "../MMIU-Benchmark/meme_vedio_understanding/meme_vedio_understanding_199_15.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: BaseballPitch.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: 
Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_0_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_0_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_0_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_0_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_1_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_1_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_1_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_1_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_2_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_2_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_2_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_2_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + 
"options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HighJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_3_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_3_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_3_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_3_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_4_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_4_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_4_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_4_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HighJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_5_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_5_1.png", + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_5_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_5_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_6_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_6_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_6_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_6_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: BaseballPitch.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_7_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_7_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_7_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_7_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: 
CricketShot.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_8_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_8_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_8_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_8_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: JavelinThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_9_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_9_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_9_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_9_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_10_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_10_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_10_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_10_3.png" + ], + "output": "C" + }, + { + "task": 
"temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_11_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_11_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_11_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_11_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: LongJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_12_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_12_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_12_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_12_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HighJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_13_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_13_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_13_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_13_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_14_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_14_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_14_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_14_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_15_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_15_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_15_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_15_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 
0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_16_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_16_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_16_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_16_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: JavelinThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_17_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_17_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_17_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_17_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_18_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_18_1.png", + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_18_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_18_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_19_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_19_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_19_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_19_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_20_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_20_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_20_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_20_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: 
CricketShot.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_21_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_21_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_21_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_21_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_22_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_22_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_22_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_22_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_23_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_23_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_23_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_23_3.png" + ], + "output": "D" + }, + { + 
"task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_24_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_24_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_24_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_24_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CliffDiving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_25_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_25_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_25_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_25_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_26_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_26_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_26_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_26_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_27_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_27_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_27_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_27_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: LongJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_28_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_28_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_28_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_28_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: 
Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: JavelinThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_29_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_29_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_29_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_29_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_30_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_30_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_30_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_30_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_31_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_31_1.png", + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_31_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_31_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: LongJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_32_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_32_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_32_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_32_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_33_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_33_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_33_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_33_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: 
HammerThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_34_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_34_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_34_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_34_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_35_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_35_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_35_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_35_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_36_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_36_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_36_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_36_3.png" + ], + "output": "C" + }, + { + "task": 
"temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HighJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_37_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_37_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_37_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_37_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: BaseballPitch.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_38_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_38_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_38_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_38_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_39_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_39_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_39_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_39_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_40_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_40_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_40_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_40_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_41_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_41_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_41_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_41_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 
0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_42_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_42_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_42_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_42_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_43_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_43_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_43_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_43_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CliffDiving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_44_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_44_1.png", + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_44_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_44_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_45_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_45_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_45_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_45_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_46_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_46_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_46_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_46_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: 
PoleVault.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_47_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_47_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_47_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_47_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_48_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_48_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_48_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_48_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CliffDiving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_49_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_49_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_49_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_49_3.png" + ], + "output": "C" + }, + { + 
"task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: FrisbeeCatch.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_50_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_50_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_50_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_50_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: FrisbeeCatch.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_51_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_51_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_51_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_51_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_52_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_52_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_52_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_52_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_53_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_53_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_53_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_53_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_54_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_54_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_54_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_54_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 
0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CliffDiving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_55_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_55_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_55_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_55_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_56_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_56_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_56_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_56_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_57_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_57_1.png", + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_57_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_57_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_58_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_58_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_58_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_58_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_59_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_59_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_59_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_59_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text 
description: BaseballPitch.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_60_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_60_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_60_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_60_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_61_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_61_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_61_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_61_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_62_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_62_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_62_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_62_3.png" + ], + "output": 
"B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_63_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_63_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_63_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_63_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_64_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_64_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_64_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_64_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: LongJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_65_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_65_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_65_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_65_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HighJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_66_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_66_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_66_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_66_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: LongJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_67_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_67_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_67_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_67_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: 
Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_68_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_68_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_68_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_68_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_69_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_69_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_69_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_69_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: JavelinThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_70_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_70_1.png", + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_70_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_70_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_71_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_71_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_71_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_71_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_72_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_72_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_72_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_72_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: 
HighJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_73_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_73_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_73_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_73_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_74_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_74_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_74_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_74_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: BaseballPitch.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_75_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_75_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_75_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_75_3.png" + ], + "output": "A" + }, + { 
+ "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_76_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_76_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_76_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_76_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_77_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_77_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_77_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_77_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_78_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_78_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_78_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_78_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_79_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_79_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_79_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_79_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: FrisbeeCatch.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_80_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_80_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_80_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_80_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 
0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_81_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_81_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_81_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_81_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_82_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_82_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_82_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_82_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_83_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_83_1.png", + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_83_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_83_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_84_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_84_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_84_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_84_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_85_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_85_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_85_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_85_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", 
+ "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_86_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_86_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_86_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_86_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_87_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_87_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_87_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_87_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_88_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_88_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_88_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_88_3.png" + ], + "output": "C" + }, + { + "task": 
"temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_89_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_89_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_89_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_89_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_90_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_90_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_90_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_90_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_91_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_91_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_91_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_91_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_92_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_92_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_92_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_92_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_93_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_93_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_93_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_93_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: 
Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_94_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_94_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_94_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_94_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_95_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_95_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_95_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_95_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_96_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_96_1.png", + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_96_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_96_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_97_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_97_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_97_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_97_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_98_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_98_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_98_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_98_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: 
VolleyballSpiking.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_99_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_99_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_99_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_99_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_100_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_100_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_100_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_100_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HighJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_101_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_101_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_101_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_101_3.png" + ], + "output": "D" + 
}, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_102_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_102_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_102_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_102_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_103_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_103_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_103_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_103_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_104_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_104_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_104_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_104_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_105_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_105_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_105_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_105_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_106_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_106_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_106_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_106_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + 
"options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_107_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_107_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_107_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_107_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_108_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_108_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_108_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_108_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_109_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_109_1.png", + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_109_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_109_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_110_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_110_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_110_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_110_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_111_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_111_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_111_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_111_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text 
description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_112_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_112_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_112_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_112_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_113_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_113_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_113_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_113_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_114_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_114_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_114_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_114_3.png" + ], + 
"output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_115_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_115_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_115_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_115_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: JavelinThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_116_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_116_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_116_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_116_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CliffDiving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_117_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_117_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_117_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_117_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CliffDiving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_118_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_118_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_118_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_118_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_119_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_119_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_119_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_119_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: 
Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_120_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_120_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_120_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_120_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: JavelinThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_121_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_121_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_121_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_121_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_122_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_122_1.png", + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_122_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_122_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_123_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_123_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_123_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_123_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_124_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_124_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_124_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_124_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text 
description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_125_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_125_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_125_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_125_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_126_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_126_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_126_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_126_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: BaseballPitch.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_127_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_127_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_127_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_127_3.png" + ], 
+ "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_128_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_128_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_128_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_128_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: BaseballPitch.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_129_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_129_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_129_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_129_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_130_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_130_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_130_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_130_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_131_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_131_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_131_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_131_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_132_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_132_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_132_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_132_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: 
Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: FrisbeeCatch.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_133_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_133_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_133_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_133_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_134_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_134_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_134_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_134_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_135_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_135_1.png", + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_135_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_135_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_136_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_136_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_136_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_136_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: LongJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_137_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_137_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_137_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_137_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: 
BaseballPitch.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_138_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_138_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_138_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_138_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_139_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_139_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_139_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_139_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_140_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_140_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_140_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_140_3.png" + ], + "output": "C" + 
}, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_141_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_141_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_141_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_141_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_142_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_142_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_142_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_142_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_143_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_143_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_143_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_143_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_144_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_144_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_144_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_144_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_145_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_145_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_145_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_145_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": 
"A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: FrisbeeCatch.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_146_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_146_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_146_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_146_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_147_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_147_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_147_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_147_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_148_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_148_1.png", + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_148_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_148_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_149_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_149_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_149_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_149_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_150_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_150_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_150_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_150_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text 
description: CliffDiving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_151_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_151_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_151_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_151_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_152_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_152_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_152_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_152_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_153_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_153_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_153_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_153_3.png" + ], + 
"output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: BaseballPitch.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_154_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_154_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_154_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_154_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_155_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_155_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_155_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_155_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_156_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_156_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_156_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_156_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_157_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_157_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_157_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_157_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_158_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_158_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_158_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_158_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + 
"options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_159_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_159_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_159_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_159_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_160_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_160_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_160_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_160_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_161_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_161_1.png", + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_161_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_161_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_162_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_162_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_162_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_162_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_163_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_163_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_163_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_163_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text 
description: JavelinThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_164_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_164_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_164_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_164_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_165_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_165_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_165_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_165_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_166_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_166_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_166_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_166_3.png" + ], + 
"output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_167_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_167_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_167_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_167_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: FrisbeeCatch.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_168_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_168_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_168_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_168_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_169_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_169_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_169_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_169_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_170_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_170_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_170_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_170_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_171_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_171_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_171_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_171_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": 
"A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_172_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_172_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_172_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_172_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_173_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_173_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_173_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_173_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HighJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_174_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_174_1.png", + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_174_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_174_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_175_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_175_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_175_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_175_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: JavelinThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_176_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_176_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_176_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_176_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text 
description: CliffDiving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_177_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_177_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_177_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_177_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_178_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_178_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_178_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_178_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_179_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_179_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_179_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_179_3.png" + ], + 
"output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_180_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_180_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_180_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_180_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_181_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_181_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_181_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_181_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_182_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_182_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_182_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_182_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_183_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_183_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_183_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_183_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_184_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_184_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_184_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_184_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + 
"options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_185_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_185_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_185_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_185_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HighJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_186_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_186_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_186_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_186_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_187_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_187_1.png", + 
"../MMIU-Benchmark/temporal_localization/temporal_localization_187_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_187_3.png" + ], + "output": "D" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_188_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_188_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_188_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_188_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_189_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_189_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_189_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_189_3.png" + ], + "output": "B" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text 
description: LongJump.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_190_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_190_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_190_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_190_3.png" + ], + "output": "C" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_191_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_191_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_191_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_191_3.png" + ], + "output": "A" + }, + { + "task": "temporal_localization", + "visual_input_component": "Video image or Natural image", + "source": "THUMOS14", + "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", + "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", + "input_image_path": [ + "../MMIU-Benchmark/temporal_localization/temporal_localization_192_0.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_192_1.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_192_2.png", + "../MMIU-Benchmark/temporal_localization/temporal_localization_192_3.png" + ], + 
"output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.108, 0.0, 0.425, 0.999]\nB: [0.092, 0.001, 0.409, 1.0]\nC: [0.108, 0.0, 0.383, 1.167]\nD: [0.64, 0.324, 0.772, 0.771]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. Note that the width of the input image is 1460 and the height is 864.\nCAPTION: Man kneeling and feeding pandas then standing up and moving around", + "context": "Select from the following choices.\nA: [0.108, 0.0, 0.425, 0.999]\nB: [0.092, 0.001, 0.409, 1.0]\nC: [0.108, 0.0, 0.383, 1.167]\nD: [0.64, 0.324, 0.772, 0.771]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_0_0.jpg", + "../MMIU-Benchmark/mevis/mevis_0_1.jpg", + "../MMIU-Benchmark/mevis/mevis_0_2.jpg", + "../MMIU-Benchmark/mevis/mevis_0_3.jpg", + "../MMIU-Benchmark/mevis/mevis_0_4.jpg", + "../MMIU-Benchmark/mevis/mevis_0_5.jpg", + "../MMIU-Benchmark/mevis/mevis_0_6.jpg", + "../MMIU-Benchmark/mevis/mevis_0_7.jpg", + "../MMIU-Benchmark/mevis/mevis_0_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.459, 0.453, 0.692, 0.601]\nB: [0.499, 0.224, 0.658, 0.74]\nC: [0.546, 0.275, 0.855, 0.561]\nD: [0.648, 0.603, 0.942, 0.669]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. 
Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. Note that the width of the input image is 1280 and the height is 720.\nCAPTION: Monkey jumping to the right", + "context": "Select from the following choices.\nA: [0.459, 0.453, 0.692, 0.601]\nB: [0.499, 0.224, 0.658, 0.74]\nC: [0.546, 0.275, 0.855, 0.561]\nD: [0.648, 0.603, 0.942, 0.669]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_1_0.jpg", + "../MMIU-Benchmark/mevis/mevis_1_1.jpg", + "../MMIU-Benchmark/mevis/mevis_1_2.jpg", + "../MMIU-Benchmark/mevis/mevis_1_3.jpg", + "../MMIU-Benchmark/mevis/mevis_1_4.jpg", + "../MMIU-Benchmark/mevis/mevis_1_5.jpg", + "../MMIU-Benchmark/mevis/mevis_1_6.jpg", + "../MMIU-Benchmark/mevis/mevis_1_7.jpg", + "../MMIU-Benchmark/mevis/mevis_1_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.471, 0.597, 0.96, 0.716]\nB: [0.104, 0.011, 0.433, 0.256]\nC: [0.204, 0.0, 0.999, 0.792]\nD: [0.0, 0.081, 0.795, 0.873]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1766 and the height is 945.\nCAPTION: Airplane moving from left to right", + "context": "Select from the following choices.\nA: [0.471, 0.597, 0.96, 0.716]\nB: [0.104, 0.011, 0.433, 0.256]\nC: [0.204, 0.0, 0.999, 0.792]\nD: [0.0, 0.081, 0.795, 0.873]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_2_0.jpg", + "../MMIU-Benchmark/mevis/mevis_2_1.jpg", + "../MMIU-Benchmark/mevis/mevis_2_2.jpg", + "../MMIU-Benchmark/mevis/mevis_2_3.jpg", + "../MMIU-Benchmark/mevis/mevis_2_4.jpg", + "../MMIU-Benchmark/mevis/mevis_2_5.jpg", + "../MMIU-Benchmark/mevis/mevis_2_6.jpg", + "../MMIU-Benchmark/mevis/mevis_2_7.jpg", + "../MMIU-Benchmark/mevis/mevis_2_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.751, 0.411, 0.953, 0.67]\nB: [0.404, 0.633, 0.585, 0.895]\nC: [0.331, 0.387, 0.736, 0.809]\nD: [0.734, 0.306, 0.936, 0.565]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: moving bike", + "context": "Select from the following choices.\nA: [0.751, 0.411, 0.953, 0.67]\nB: [0.404, 0.633, 0.585, 0.895]\nC: [0.331, 0.387, 0.736, 0.809]\nD: [0.734, 0.306, 0.936, 0.565]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_3_0.jpg", + "../MMIU-Benchmark/mevis/mevis_3_1.jpg", + "../MMIU-Benchmark/mevis/mevis_3_2.jpg", + "../MMIU-Benchmark/mevis/mevis_3_3.jpg", + "../MMIU-Benchmark/mevis/mevis_3_4.jpg", + "../MMIU-Benchmark/mevis/mevis_3_5.jpg", + "../MMIU-Benchmark/mevis/mevis_3_6.jpg", + "../MMIU-Benchmark/mevis/mevis_3_7.jpg", + "../MMIU-Benchmark/mevis/mevis_3_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.515, 0.0, 0.668, 0.425]\nB: [0.489, 0.0, 0.646, 0.49]\nC: [0.515, 0.0, 0.672, 0.49]\nD: [0.818, 0.565, 0.854, 0.915]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: person holding a rope", + "context": "Select from the following choices.\nA: [0.515, 0.0, 0.668, 0.425]\nB: [0.489, 0.0, 0.646, 0.49]\nC: [0.515, 0.0, 0.672, 0.49]\nD: [0.818, 0.565, 0.854, 0.915]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_4_0.jpg", + "../MMIU-Benchmark/mevis/mevis_4_1.jpg", + "../MMIU-Benchmark/mevis/mevis_4_2.jpg", + "../MMIU-Benchmark/mevis/mevis_4_3.jpg", + "../MMIU-Benchmark/mevis/mevis_4_4.jpg", + "../MMIU-Benchmark/mevis/mevis_4_5.jpg", + "../MMIU-Benchmark/mevis/mevis_4_6.jpg", + "../MMIU-Benchmark/mevis/mevis_4_7.jpg", + "../MMIU-Benchmark/mevis/mevis_4_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.229, 0.095, 0.414, 0.583]\nB: [0.6, 0.377, 0.902, 0.706]\nC: [0.34, 0.421, 0.4, 0.718]\nD: [0.34, 0.421, 0.4, 0.663]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 512 and the height is 252.\nCAPTION: The man who pulls the horse and runs in circles", + "context": "Select from the following choices.\nA: [0.229, 0.095, 0.414, 0.583]\nB: [0.6, 0.377, 0.902, 0.706]\nC: [0.34, 0.421, 0.4, 0.718]\nD: [0.34, 0.421, 0.4, 0.663]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_5_0.jpg", + "../MMIU-Benchmark/mevis/mevis_5_1.jpg", + "../MMIU-Benchmark/mevis/mevis_5_2.jpg", + "../MMIU-Benchmark/mevis/mevis_5_3.jpg", + "../MMIU-Benchmark/mevis/mevis_5_4.jpg", + "../MMIU-Benchmark/mevis/mevis_5_5.jpg", + "../MMIU-Benchmark/mevis/mevis_5_6.jpg", + "../MMIU-Benchmark/mevis/mevis_5_7.jpg", + "../MMIU-Benchmark/mevis/mevis_5_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.938, 0.165, 0.965, 0.472]\nB: [0.002, 0.107, 0.289, 0.46]\nC: [0.243, 0.441, 0.277, 0.48]\nD: [0.243, 0.441, 0.28, 0.48]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: Stationary truck", + "context": "Select from the following choices.\nA: [0.938, 0.165, 0.965, 0.472]\nB: [0.002, 0.107, 0.289, 0.46]\nC: [0.243, 0.441, 0.277, 0.48]\nD: [0.243, 0.441, 0.28, 0.48]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_6_0.jpg", + "../MMIU-Benchmark/mevis/mevis_6_1.jpg", + "../MMIU-Benchmark/mevis/mevis_6_2.jpg", + "../MMIU-Benchmark/mevis/mevis_6_3.jpg", + "../MMIU-Benchmark/mevis/mevis_6_4.jpg", + "../MMIU-Benchmark/mevis/mevis_6_5.jpg", + "../MMIU-Benchmark/mevis/mevis_6_6.jpg", + "../MMIU-Benchmark/mevis/mevis_6_7.jpg", + "../MMIU-Benchmark/mevis/mevis_6_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.36, 0.405, 0.87, 0.536]\nB: [0.36, 0.405, 0.802, 0.554]\nC: [0.627, 0.3, 0.791, 0.407]\nD: [0.36, 0.405, 0.799, 0.534]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 496.\nCAPTION: plane move faster", + "context": "Select from the following choices.\nA: [0.36, 0.405, 0.87, 0.536]\nB: [0.36, 0.405, 0.802, 0.554]\nC: [0.627, 0.3, 0.791, 0.407]\nD: [0.36, 0.405, 0.799, 0.534]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_7_0.jpg", + "../MMIU-Benchmark/mevis/mevis_7_1.jpg", + "../MMIU-Benchmark/mevis/mevis_7_2.jpg", + "../MMIU-Benchmark/mevis/mevis_7_3.jpg", + "../MMIU-Benchmark/mevis/mevis_7_4.jpg", + "../MMIU-Benchmark/mevis/mevis_7_5.jpg", + "../MMIU-Benchmark/mevis/mevis_7_6.jpg", + "../MMIU-Benchmark/mevis/mevis_7_7.jpg", + "../MMIU-Benchmark/mevis/mevis_7_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.312, 0.268, 0.537, 0.651]\nB: [0.189, 0.408, 0.598, 0.683]\nC: [0.266, 0.106, 0.491, 0.488]\nD: [0.313, 0.153, 0.539, 0.535]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 596.\nCAPTION: panda sit and eat, then walking to the leftmost", + "context": "Select from the following choices.\nA: [0.312, 0.268, 0.537, 0.651]\nB: [0.189, 0.408, 0.598, 0.683]\nC: [0.266, 0.106, 0.491, 0.488]\nD: [0.313, 0.153, 0.539, 0.535]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_8_0.jpg", + "../MMIU-Benchmark/mevis/mevis_8_1.jpg", + "../MMIU-Benchmark/mevis/mevis_8_2.jpg", + "../MMIU-Benchmark/mevis/mevis_8_3.jpg", + "../MMIU-Benchmark/mevis/mevis_8_4.jpg", + "../MMIU-Benchmark/mevis/mevis_8_5.jpg", + "../MMIU-Benchmark/mevis/mevis_8_6.jpg", + "../MMIU-Benchmark/mevis/mevis_8_7.jpg", + "../MMIU-Benchmark/mevis/mevis_8_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.517, 0.399, 0.793, 0.95]\nB: [0.672, 0.449, 0.947, 1.0]\nC: [0.64, 0.192, 0.916, 0.743]\nD: [0.56, 0.448, 0.835, 0.999]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: fish stay still and is the final to move, then swim around", + "context": "Select from the following choices.\nA: [0.517, 0.399, 0.793, 0.95]\nB: [0.672, 0.449, 0.947, 1.0]\nC: [0.64, 0.192, 0.916, 0.743]\nD: [0.56, 0.448, 0.835, 0.999]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_9_0.jpg", + "../MMIU-Benchmark/mevis/mevis_9_1.jpg", + "../MMIU-Benchmark/mevis/mevis_9_2.jpg", + "../MMIU-Benchmark/mevis/mevis_9_3.jpg", + "../MMIU-Benchmark/mevis/mevis_9_4.jpg", + "../MMIU-Benchmark/mevis/mevis_9_5.jpg", + "../MMIU-Benchmark/mevis/mevis_9_6.jpg", + "../MMIU-Benchmark/mevis/mevis_9_7.jpg", + "../MMIU-Benchmark/mevis/mevis_9_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.098, 0.465, 0.303, 0.585]\nB: [0.121, 0.454, 0.326, 0.574]\nC: [0.121, 0.454, 0.324, 0.579]\nD: [0.121, 0.454, 0.326, 0.594]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: black one that turns and goes left", + "context": "Select from the following choices.\nA: [0.098, 0.465, 0.303, 0.585]\nB: [0.121, 0.454, 0.326, 0.574]\nC: [0.121, 0.454, 0.324, 0.579]\nD: [0.121, 0.454, 0.326, 0.594]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_10_0.jpg", + "../MMIU-Benchmark/mevis/mevis_10_1.jpg", + "../MMIU-Benchmark/mevis/mevis_10_2.jpg", + "../MMIU-Benchmark/mevis/mevis_10_3.jpg", + "../MMIU-Benchmark/mevis/mevis_10_4.jpg", + "../MMIU-Benchmark/mevis/mevis_10_5.jpg", + "../MMIU-Benchmark/mevis/mevis_10_6.jpg", + "../MMIU-Benchmark/mevis/mevis_10_7.jpg", + "../MMIU-Benchmark/mevis/mevis_10_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.075, 0.629, 0.129, 0.875]\nB: [0.714, 0.632, 0.746, 0.732]\nC: [0.356, 0.713, 0.678, 0.908]\nD: [0.714, 0.632, 0.74, 0.728]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: The horse running alongside the white railing.", + "context": "Select from the following choices.\nA: [0.075, 0.629, 0.129, 0.875]\nB: [0.714, 0.632, 0.746, 0.732]\nC: [0.356, 0.713, 0.678, 0.908]\nD: [0.714, 0.632, 0.74, 0.728]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_11_0.jpg", + "../MMIU-Benchmark/mevis/mevis_11_1.jpg", + "../MMIU-Benchmark/mevis/mevis_11_2.jpg", + "../MMIU-Benchmark/mevis/mevis_11_3.jpg", + "../MMIU-Benchmark/mevis/mevis_11_4.jpg", + "../MMIU-Benchmark/mevis/mevis_11_5.jpg", + "../MMIU-Benchmark/mevis/mevis_11_6.jpg", + "../MMIU-Benchmark/mevis/mevis_11_7.jpg", + "../MMIU-Benchmark/mevis/mevis_11_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.0, 0.458, 0.149, 0.824]\nB: [0.309, 0.0, 0.473, 0.107]\nC: [0.274, 0.0, 0.438, 0.107]\nD: [0.274, 0.0, 0.436, 0.1]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 720.\nCAPTION: people standing", + "context": "Select from the following choices.\nA: [0.0, 0.458, 0.149, 0.824]\nB: [0.309, 0.0, 0.473, 0.107]\nC: [0.274, 0.0, 0.438, 0.107]\nD: [0.274, 0.0, 0.436, 0.1]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_12_0.jpg", + "../MMIU-Benchmark/mevis/mevis_12_1.jpg", + "../MMIU-Benchmark/mevis/mevis_12_2.jpg", + "../MMIU-Benchmark/mevis/mevis_12_3.jpg", + "../MMIU-Benchmark/mevis/mevis_12_4.jpg", + "../MMIU-Benchmark/mevis/mevis_12_5.jpg", + "../MMIU-Benchmark/mevis/mevis_12_6.jpg", + "../MMIU-Benchmark/mevis/mevis_12_7.jpg", + "../MMIU-Benchmark/mevis/mevis_12_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.221, 0.11, 0.562, 0.656]\nB: [0.137, 0.273, 0.503, 0.921]\nC: [0.137, 0.273, 0.426, 0.741]\nD: [0.137, 0.273, 0.478, 0.819]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: front elephant walking to backwards", + "context": "Select from the following choices.\nA: [0.221, 0.11, 0.562, 0.656]\nB: [0.137, 0.273, 0.503, 0.921]\nC: [0.137, 0.273, 0.426, 0.741]\nD: [0.137, 0.273, 0.478, 0.819]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_13_0.jpg", + "../MMIU-Benchmark/mevis/mevis_13_1.jpg", + "../MMIU-Benchmark/mevis/mevis_13_2.jpg", + "../MMIU-Benchmark/mevis/mevis_13_3.jpg", + "../MMIU-Benchmark/mevis/mevis_13_4.jpg", + "../MMIU-Benchmark/mevis/mevis_13_5.jpg", + "../MMIU-Benchmark/mevis/mevis_13_6.jpg", + "../MMIU-Benchmark/mevis/mevis_13_7.jpg", + "../MMIU-Benchmark/mevis/mevis_13_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.443, 0.427, 0.702, 0.837]\nB: [0.443, 0.427, 0.732, 0.895]\nC: [0.532, 0.417, 0.86, 0.458]\nD: [0.221, 0.395, 0.5, 0.881]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1460 and the height is 864.\nCAPTION: sit down and eat, then walk and stand up using the back legs", + "context": "Select from the following choices.\nA: [0.443, 0.427, 0.702, 0.837]\nB: [0.443, 0.427, 0.732, 0.895]\nC: [0.532, 0.417, 0.86, 0.458]\nD: [0.221, 0.395, 0.5, 0.881]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_14_0.jpg", + "../MMIU-Benchmark/mevis/mevis_14_1.jpg", + "../MMIU-Benchmark/mevis/mevis_14_2.jpg", + "../MMIU-Benchmark/mevis/mevis_14_3.jpg", + "../MMIU-Benchmark/mevis/mevis_14_4.jpg", + "../MMIU-Benchmark/mevis/mevis_14_5.jpg", + "../MMIU-Benchmark/mevis/mevis_14_6.jpg", + "../MMIU-Benchmark/mevis/mevis_14_7.jpg", + "../MMIU-Benchmark/mevis/mevis_14_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.034, 0.165, 0.329, 0.683]\nB: [0.084, 0.018, 0.378, 0.536]\nC: [0.177, 0.224, 0.471, 0.742]\nD: [0.198, 0.454, 0.492, 0.972]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: The yellow truck in motion.", + "context": "Select from the following choices.\nA: [0.034, 0.165, 0.329, 0.683]\nB: [0.084, 0.018, 0.378, 0.536]\nC: [0.177, 0.224, 0.471, 0.742]\nD: [0.198, 0.454, 0.492, 0.972]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_15_0.jpg", + "../MMIU-Benchmark/mevis/mevis_15_1.jpg", + "../MMIU-Benchmark/mevis/mevis_15_2.jpg", + "../MMIU-Benchmark/mevis/mevis_15_3.jpg", + "../MMIU-Benchmark/mevis/mevis_15_4.jpg", + "../MMIU-Benchmark/mevis/mevis_15_5.jpg", + "../MMIU-Benchmark/mevis/mevis_15_6.jpg", + "../MMIU-Benchmark/mevis/mevis_15_7.jpg", + "../MMIU-Benchmark/mevis/mevis_15_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.484, 0.053, 0.529, 0.188]\nB: [0.394, 0.024, 0.831, 0.342]\nC: [0.481, 0.01, 0.526, 0.145]\nD: [0.503, 0.0, 0.548, 0.135]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: the right one of the two sitting people in the distance", + "context": "Select from the following choices.\nA: [0.484, 0.053, 0.529, 0.188]\nB: [0.394, 0.024, 0.831, 0.342]\nC: [0.481, 0.01, 0.526, 0.145]\nD: [0.503, 0.0, 0.548, 0.135]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_16_0.jpg", + "../MMIU-Benchmark/mevis/mevis_16_1.jpg", + "../MMIU-Benchmark/mevis/mevis_16_2.jpg", + "../MMIU-Benchmark/mevis/mevis_16_3.jpg", + "../MMIU-Benchmark/mevis/mevis_16_4.jpg", + "../MMIU-Benchmark/mevis/mevis_16_5.jpg", + "../MMIU-Benchmark/mevis/mevis_16_6.jpg", + "../MMIU-Benchmark/mevis/mevis_16_7.jpg", + "../MMIU-Benchmark/mevis/mevis_16_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.546, 0.484, 0.821, 0.866]\nB: [0.354, 0.538, 0.431, 0.933]\nC: [0.105, 0.437, 0.253, 0.914]\nD: [0.373, 0.403, 0.451, 0.797]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1567 and the height is 730.\nCAPTION: standing still eating the grass without changing position", + "context": "Select from the following choices.\nA: [0.546, 0.484, 0.821, 0.866]\nB: [0.354, 0.538, 0.431, 0.933]\nC: [0.105, 0.437, 0.253, 0.914]\nD: [0.373, 0.403, 0.451, 0.797]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_17_0.jpg", + "../MMIU-Benchmark/mevis/mevis_17_1.jpg", + "../MMIU-Benchmark/mevis/mevis_17_2.jpg", + "../MMIU-Benchmark/mevis/mevis_17_3.jpg", + "../MMIU-Benchmark/mevis/mevis_17_4.jpg", + "../MMIU-Benchmark/mevis/mevis_17_5.jpg", + "../MMIU-Benchmark/mevis/mevis_17_6.jpg", + "../MMIU-Benchmark/mevis/mevis_17_7.jpg", + "../MMIU-Benchmark/mevis/mevis_17_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.077, 0.644, 0.468, 1.0]\nB: [0.0, 0.644, 0.391, 1.0]\nC: [0.123, 0.642, 0.514, 0.999]\nD: [0.123, 0.642, 0.574, 0.992]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 794.\nCAPTION: The bear that was pinned to the ground during the fight", + "context": "Select from the following choices.\nA: [0.077, 0.644, 0.468, 1.0]\nB: [0.0, 0.644, 0.391, 1.0]\nC: [0.123, 0.642, 0.514, 0.999]\nD: [0.123, 0.642, 0.574, 0.992]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_18_0.jpg", + "../MMIU-Benchmark/mevis/mevis_18_1.jpg", + "../MMIU-Benchmark/mevis/mevis_18_2.jpg", + "../MMIU-Benchmark/mevis/mevis_18_3.jpg", + "../MMIU-Benchmark/mevis/mevis_18_4.jpg", + "../MMIU-Benchmark/mevis/mevis_18_5.jpg", + "../MMIU-Benchmark/mevis/mevis_18_6.jpg", + "../MMIU-Benchmark/mevis/mevis_18_7.jpg", + "../MMIU-Benchmark/mevis/mevis_18_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.899, 0.384, 1.0, 0.721]\nB: [0.865, 0.253, 0.966, 0.59]\nC: [0.865, 0.253, 0.982, 0.587]\nD: [0.815, 0.165, 0.916, 0.502]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Girl riding bicycle in a circle", + "context": "Select from the following choices.\nA: [0.899, 0.384, 1.0, 0.721]\nB: [0.865, 0.253, 0.966, 0.59]\nC: [0.865, 0.253, 0.982, 0.587]\nD: [0.815, 0.165, 0.916, 0.502]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_19_0.jpg", + "../MMIU-Benchmark/mevis/mevis_19_1.jpg", + "../MMIU-Benchmark/mevis/mevis_19_2.jpg", + "../MMIU-Benchmark/mevis/mevis_19_3.jpg", + "../MMIU-Benchmark/mevis/mevis_19_4.jpg", + "../MMIU-Benchmark/mevis/mevis_19_5.jpg", + "../MMIU-Benchmark/mevis/mevis_19_6.jpg", + "../MMIU-Benchmark/mevis/mevis_19_7.jpg", + "../MMIU-Benchmark/mevis/mevis_19_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.387, 0.396, 0.516, 0.804]\nB: [0.384, 0.324, 0.498, 0.683]\nC: [0.384, 0.324, 0.505, 0.725]\nD: [0.384, 0.324, 0.514, 0.732]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: Elephant putting its trunk on another elephant's back", + "context": "Select from the following choices.\nA: [0.387, 0.396, 0.516, 0.804]\nB: [0.384, 0.324, 0.498, 0.683]\nC: [0.384, 0.324, 0.505, 0.725]\nD: [0.384, 0.324, 0.514, 0.732]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_20_0.jpg", + "../MMIU-Benchmark/mevis/mevis_20_1.jpg", + "../MMIU-Benchmark/mevis/mevis_20_2.jpg", + "../MMIU-Benchmark/mevis/mevis_20_3.jpg", + "../MMIU-Benchmark/mevis/mevis_20_4.jpg", + "../MMIU-Benchmark/mevis/mevis_20_5.jpg", + "../MMIU-Benchmark/mevis/mevis_20_6.jpg", + "../MMIU-Benchmark/mevis/mevis_20_7.jpg", + "../MMIU-Benchmark/mevis/mevis_20_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.42, 0.233, 0.816, 0.666]\nB: [0.494, 0.229, 0.558, 0.489]\nC: [0.758, 0.261, 0.86, 0.384]\nD: [0.494, 0.229, 0.55, 0.444]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The monkey pulling the cart", + "context": "Select from the following choices.\nA: [0.42, 0.233, 0.816, 0.666]\nB: [0.494, 0.229, 0.558, 0.489]\nC: [0.758, 0.261, 0.86, 0.384]\nD: [0.494, 0.229, 0.55, 0.444]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_21_0.jpg", + "../MMIU-Benchmark/mevis/mevis_21_1.jpg", + "../MMIU-Benchmark/mevis/mevis_21_2.jpg", + "../MMIU-Benchmark/mevis/mevis_21_3.jpg", + "../MMIU-Benchmark/mevis/mevis_21_4.jpg", + "../MMIU-Benchmark/mevis/mevis_21_5.jpg", + "../MMIU-Benchmark/mevis/mevis_21_6.jpg", + "../MMIU-Benchmark/mevis/mevis_21_7.jpg", + "../MMIU-Benchmark/mevis/mevis_21_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.041, 0.394, 0.379, 0.856]\nB: [0.217, 0.306, 0.414, 0.613]\nC: [0.217, 0.306, 0.447, 0.583]\nD: [0.06, 0.273, 0.247, 0.523]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 636 and the height is 480.\nCAPTION: tiger walking from distance and lying down on the ground to drink water", + "context": "Select from the following choices.\nA: [0.041, 0.394, 0.379, 0.856]\nB: [0.217, 0.306, 0.414, 0.613]\nC: [0.217, 0.306, 0.447, 0.583]\nD: [0.06, 0.273, 0.247, 0.523]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_22_0.jpg", + "../MMIU-Benchmark/mevis/mevis_22_1.jpg", + "../MMIU-Benchmark/mevis/mevis_22_2.jpg", + "../MMIU-Benchmark/mevis/mevis_22_3.jpg", + "../MMIU-Benchmark/mevis/mevis_22_4.jpg", + "../MMIU-Benchmark/mevis/mevis_22_5.jpg", + "../MMIU-Benchmark/mevis/mevis_22_6.jpg", + "../MMIU-Benchmark/mevis/mevis_22_7.jpg", + "../MMIU-Benchmark/mevis/mevis_22_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.095, 0.373, 0.885, 1.12]\nB: [0.096, 0.374, 1.0, 1.0]\nC: [0.455, 0.355, 0.526, 0.388]\nD: [0.095, 0.373, 0.999, 0.999]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The person with the lizard in their hand.", + "context": "Select from the following choices.\nA: [0.095, 0.373, 0.885, 1.12]\nB: [0.096, 0.374, 1.0, 1.0]\nC: [0.455, 0.355, 0.526, 0.388]\nD: [0.095, 0.373, 0.999, 0.999]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_23_0.jpg", + "../MMIU-Benchmark/mevis/mevis_23_1.jpg", + "../MMIU-Benchmark/mevis/mevis_23_2.jpg", + "../MMIU-Benchmark/mevis/mevis_23_3.jpg", + "../MMIU-Benchmark/mevis/mevis_23_4.jpg", + "../MMIU-Benchmark/mevis/mevis_23_5.jpg", + "../MMIU-Benchmark/mevis/mevis_23_6.jpg", + "../MMIU-Benchmark/mevis/mevis_23_7.jpg", + "../MMIU-Benchmark/mevis/mevis_23_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.376, 0.443, 0.562, 0.608]\nB: [0.402, 0.431, 0.588, 0.596]\nC: [0.2, 0.171, 0.318, 0.419]\nD: [0.376, 0.443, 0.567, 0.621]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: eat the food on the man's hand then walk away to the food box", + "context": "Select from the following choices.\nA: [0.376, 0.443, 0.562, 0.608]\nB: [0.402, 0.431, 0.588, 0.596]\nC: [0.2, 0.171, 0.318, 0.419]\nD: [0.376, 0.443, 0.567, 0.621]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_24_0.jpg", + "../MMIU-Benchmark/mevis/mevis_24_1.jpg", + "../MMIU-Benchmark/mevis/mevis_24_2.jpg", + "../MMIU-Benchmark/mevis/mevis_24_3.jpg", + "../MMIU-Benchmark/mevis/mevis_24_4.jpg", + "../MMIU-Benchmark/mevis/mevis_24_5.jpg", + "../MMIU-Benchmark/mevis/mevis_24_6.jpg", + "../MMIU-Benchmark/mevis/mevis_24_7.jpg", + "../MMIU-Benchmark/mevis/mevis_24_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.0, 0.194, 0.136, 0.298]\nB: [0.0, 0.194, 0.162, 0.31]\nC: [0.26, 0.81, 0.487, 0.87]\nD: [0.0, 0.194, 0.122, 0.303]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1080 and the height is 1920.\nCAPTION: The fish in the top most.", + "context": "Select from the following choices.\nA: [0.0, 0.194, 0.136, 0.298]\nB: [0.0, 0.194, 0.162, 0.31]\nC: [0.26, 0.81, 0.487, 0.87]\nD: [0.0, 0.194, 0.122, 0.303]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_25_0.jpg", + "../MMIU-Benchmark/mevis/mevis_25_1.jpg", + "../MMIU-Benchmark/mevis/mevis_25_2.jpg", + "../MMIU-Benchmark/mevis/mevis_25_3.jpg", + "../MMIU-Benchmark/mevis/mevis_25_4.jpg", + "../MMIU-Benchmark/mevis/mevis_25_5.jpg", + "../MMIU-Benchmark/mevis/mevis_25_6.jpg", + "../MMIU-Benchmark/mevis/mevis_25_7.jpg", + "../MMIU-Benchmark/mevis/mevis_25_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.416, 0.166, 0.608, 0.431]\nB: [0.142, 0.332, 0.275, 0.644]\nC: [0.148, 0.279, 0.225, 0.488]\nD: [0.331, 0.507, 0.795, 0.96]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 596.\nCAPTION: The giant panda that has been sitting and eating without moving its position", + "context": "Select from the following choices.\nA: [0.416, 0.166, 0.608, 0.431]\nB: [0.142, 0.332, 0.275, 0.644]\nC: [0.148, 0.279, 0.225, 0.488]\nD: [0.331, 0.507, 0.795, 0.96]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_26_0.jpg", + "../MMIU-Benchmark/mevis/mevis_26_1.jpg", + "../MMIU-Benchmark/mevis/mevis_26_2.jpg", + "../MMIU-Benchmark/mevis/mevis_26_3.jpg", + "../MMIU-Benchmark/mevis/mevis_26_4.jpg", + "../MMIU-Benchmark/mevis/mevis_26_5.jpg", + "../MMIU-Benchmark/mevis/mevis_26_6.jpg", + "../MMIU-Benchmark/mevis/mevis_26_7.jpg", + "../MMIU-Benchmark/mevis/mevis_26_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.0, 0.0, 0.449, 0.46]\nB: [0.0, 0.0, 0.493, 0.471]\nC: [0.0, 0.0, 0.522, 0.436]\nD: [0.065, 0.0, 0.514, 0.46]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 945.\nCAPTION: turtle swimming to the left", + "context": "Select from the following choices.\nA: [0.0, 0.0, 0.449, 0.46]\nB: [0.0, 0.0, 0.493, 0.471]\nC: [0.0, 0.0, 0.522, 0.436]\nD: [0.065, 0.0, 0.514, 0.46]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_27_0.jpg", + "../MMIU-Benchmark/mevis/mevis_27_1.jpg", + "../MMIU-Benchmark/mevis/mevis_27_2.jpg", + "../MMIU-Benchmark/mevis/mevis_27_3.jpg", + "../MMIU-Benchmark/mevis/mevis_27_4.jpg", + "../MMIU-Benchmark/mevis/mevis_27_5.jpg", + "../MMIU-Benchmark/mevis/mevis_27_6.jpg", + "../MMIU-Benchmark/mevis/mevis_27_7.jpg", + "../MMIU-Benchmark/mevis/mevis_27_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.397, 0.443, 0.607, 0.575]\nB: [0.499, 0.579, 0.999, 0.86]\nC: [0.397, 0.443, 0.599, 0.557]\nD: [0.208, 0.079, 0.706, 0.226]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: black car move and turn left", + "context": "Select from the following choices.\nA: [0.397, 0.443, 0.607, 0.575]\nB: [0.499, 0.579, 0.999, 0.86]\nC: [0.397, 0.443, 0.599, 0.557]\nD: [0.208, 0.079, 0.706, 0.226]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_28_0.jpg", + "../MMIU-Benchmark/mevis/mevis_28_1.jpg", + "../MMIU-Benchmark/mevis/mevis_28_2.jpg", + "../MMIU-Benchmark/mevis/mevis_28_3.jpg", + "../MMIU-Benchmark/mevis/mevis_28_4.jpg", + "../MMIU-Benchmark/mevis/mevis_28_5.jpg", + "../MMIU-Benchmark/mevis/mevis_28_6.jpg", + "../MMIU-Benchmark/mevis/mevis_28_7.jpg", + "../MMIU-Benchmark/mevis/mevis_28_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.287, 0.225, 0.303, 0.372]\nB: [0.292, 0.285, 0.309, 0.432]\nC: [0.292, 0.285, 0.31, 0.431]\nD: [0.642, 0.415, 0.974, 0.576]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: man coming out and leaning against the door", + "context": "Select from the following choices.\nA: [0.287, 0.225, 0.303, 0.372]\nB: [0.292, 0.285, 0.309, 0.432]\nC: [0.292, 0.285, 0.31, 0.431]\nD: [0.642, 0.415, 0.974, 0.576]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_29_0.jpg", + "../MMIU-Benchmark/mevis/mevis_29_1.jpg", + "../MMIU-Benchmark/mevis/mevis_29_2.jpg", + "../MMIU-Benchmark/mevis/mevis_29_3.jpg", + "../MMIU-Benchmark/mevis/mevis_29_4.jpg", + "../MMIU-Benchmark/mevis/mevis_29_5.jpg", + "../MMIU-Benchmark/mevis/mevis_29_6.jpg", + "../MMIU-Benchmark/mevis/mevis_29_7.jpg", + "../MMIU-Benchmark/mevis/mevis_29_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.0, 0.27, 0.367, 0.342]\nB: [0.0, 0.27, 0.415, 0.372]\nC: [0.0, 0.27, 0.35, 0.359]\nD: [0.0, 0.27, 0.318, 0.343]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1080 and the height is 1920.\nCAPTION: most top long fish", + "context": "Select from the following choices.\nA: [0.0, 0.27, 0.367, 0.342]\nB: [0.0, 0.27, 0.415, 0.372]\nC: [0.0, 0.27, 0.35, 0.359]\nD: [0.0, 0.27, 0.318, 0.343]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_30_0.jpg", + "../MMIU-Benchmark/mevis/mevis_30_1.jpg", + "../MMIU-Benchmark/mevis/mevis_30_2.jpg", + "../MMIU-Benchmark/mevis/mevis_30_3.jpg", + "../MMIU-Benchmark/mevis/mevis_30_4.jpg", + "../MMIU-Benchmark/mevis/mevis_30_5.jpg", + "../MMIU-Benchmark/mevis/mevis_30_6.jpg", + "../MMIU-Benchmark/mevis/mevis_30_7.jpg", + "../MMIU-Benchmark/mevis/mevis_30_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.36, 0.0, 0.58, 0.605]\nB: [0.36, 0.0, 0.627, 0.794]\nC: [0.36, 0.0, 0.575, 0.559]\nD: [0.36, 0.0, 0.588, 0.686]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: moving boy", + "context": "Select from the following choices.\nA: [0.36, 0.0, 0.58, 0.605]\nB: [0.36, 0.0, 0.627, 0.794]\nC: [0.36, 0.0, 0.575, 0.559]\nD: [0.36, 0.0, 0.588, 0.686]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_31_0.jpg", + "../MMIU-Benchmark/mevis/mevis_31_1.jpg", + "../MMIU-Benchmark/mevis/mevis_31_2.jpg", + "../MMIU-Benchmark/mevis/mevis_31_3.jpg", + "../MMIU-Benchmark/mevis/mevis_31_4.jpg", + "../MMIU-Benchmark/mevis/mevis_31_5.jpg", + "../MMIU-Benchmark/mevis/mevis_31_6.jpg", + "../MMIU-Benchmark/mevis/mevis_31_7.jpg", + "../MMIU-Benchmark/mevis/mevis_31_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.121, 0.024, 0.819, 0.716]\nB: [0.302, 0.163, 0.999, 0.856]\nC: [0.302, 0.163, 1.105, 0.84]\nD: [0.0, 0.0, 0.697, 0.692]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1004.\nCAPTION: The turtle that descended from above and reached the pool's depths.", + "context": "Select from the following choices.\nA: [0.121, 0.024, 0.819, 0.716]\nB: [0.302, 0.163, 0.999, 0.856]\nC: [0.302, 0.163, 1.105, 0.84]\nD: [0.0, 0.0, 0.697, 0.692]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_32_0.jpg", + "../MMIU-Benchmark/mevis/mevis_32_1.jpg", + "../MMIU-Benchmark/mevis/mevis_32_2.jpg", + "../MMIU-Benchmark/mevis/mevis_32_3.jpg", + "../MMIU-Benchmark/mevis/mevis_32_4.jpg", + "../MMIU-Benchmark/mevis/mevis_32_5.jpg", + "../MMIU-Benchmark/mevis/mevis_32_6.jpg", + "../MMIU-Benchmark/mevis/mevis_32_7.jpg", + "../MMIU-Benchmark/mevis/mevis_32_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.8, 0.246, 0.854, 0.596]\nB: [0.8, 0.246, 0.849, 0.552]\nC: [0.168, 0.408, 0.398, 0.881]\nD: [0.8, 0.246, 0.846, 0.541]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 2340 and the height is 1080.\nCAPTION: Man clapping his hands", + "context": "Select from the following choices.\nA: [0.8, 0.246, 0.854, 0.596]\nB: [0.8, 0.246, 0.849, 0.552]\nC: [0.168, 0.408, 0.398, 0.881]\nD: [0.8, 0.246, 0.846, 0.541]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_33_0.jpg", + "../MMIU-Benchmark/mevis/mevis_33_1.jpg", + "../MMIU-Benchmark/mevis/mevis_33_2.jpg", + "../MMIU-Benchmark/mevis/mevis_33_3.jpg", + "../MMIU-Benchmark/mevis/mevis_33_4.jpg", + "../MMIU-Benchmark/mevis/mevis_33_5.jpg", + "../MMIU-Benchmark/mevis/mevis_33_6.jpg", + "../MMIU-Benchmark/mevis/mevis_33_7.jpg", + "../MMIU-Benchmark/mevis/mevis_33_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.306, 0.732, 0.552, 0.961]\nB: [0.317, 0.557, 0.562, 0.787]\nC: [0.258, 0.651, 0.504, 0.881]\nD: [0.258, 0.651, 0.507, 0.851]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: The turtle moving to the left.", + "context": "Select from the following choices.\nA: [0.306, 0.732, 0.552, 0.961]\nB: [0.317, 0.557, 0.562, 0.787]\nC: [0.258, 0.651, 0.504, 0.881]\nD: [0.258, 0.651, 0.507, 0.851]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_34_0.jpg", + "../MMIU-Benchmark/mevis/mevis_34_1.jpg", + "../MMIU-Benchmark/mevis/mevis_34_2.jpg", + "../MMIU-Benchmark/mevis/mevis_34_3.jpg", + "../MMIU-Benchmark/mevis/mevis_34_4.jpg", + "../MMIU-Benchmark/mevis/mevis_34_5.jpg", + "../MMIU-Benchmark/mevis/mevis_34_6.jpg", + "../MMIU-Benchmark/mevis/mevis_34_7.jpg", + "../MMIU-Benchmark/mevis/mevis_34_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.706, 0.375, 0.747, 0.484]\nB: [0.703, 0.348, 0.743, 0.456]\nC: [0.706, 0.375, 0.745, 0.492]\nD: [0.68, 0.46, 0.774, 0.86]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1567 and the height is 730.\nCAPTION: The leading cow among the group.", + "context": "Select from the following choices.\nA: [0.706, 0.375, 0.747, 0.484]\nB: [0.703, 0.348, 0.743, 0.456]\nC: [0.706, 0.375, 0.745, 0.492]\nD: [0.68, 0.46, 0.774, 0.86]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_35_0.jpg", + "../MMIU-Benchmark/mevis/mevis_35_1.jpg", + "../MMIU-Benchmark/mevis/mevis_35_2.jpg", + "../MMIU-Benchmark/mevis/mevis_35_3.jpg", + "../MMIU-Benchmark/mevis/mevis_35_4.jpg", + "../MMIU-Benchmark/mevis/mevis_35_5.jpg", + "../MMIU-Benchmark/mevis/mevis_35_6.jpg", + "../MMIU-Benchmark/mevis/mevis_35_7.jpg", + "../MMIU-Benchmark/mevis/mevis_35_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.133, 0.533, 0.149, 0.608]\nB: [0.417, 0.292, 0.566, 0.682]\nC: [0.368, 0.36, 0.517, 0.75]\nD: [0.472, 0.233, 0.621, 0.623]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: little dog running", + "context": "Select from the following choices.\nA: [0.133, 0.533, 0.149, 0.608]\nB: [0.417, 0.292, 0.566, 0.682]\nC: [0.368, 0.36, 0.517, 0.75]\nD: [0.472, 0.233, 0.621, 0.623]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_36_0.jpg", + "../MMIU-Benchmark/mevis/mevis_36_1.jpg", + "../MMIU-Benchmark/mevis/mevis_36_2.jpg", + "../MMIU-Benchmark/mevis/mevis_36_3.jpg", + "../MMIU-Benchmark/mevis/mevis_36_4.jpg", + "../MMIU-Benchmark/mevis/mevis_36_5.jpg", + "../MMIU-Benchmark/mevis/mevis_36_6.jpg", + "../MMIU-Benchmark/mevis/mevis_36_7.jpg", + "../MMIU-Benchmark/mevis/mevis_36_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.265, 0.246, 0.601, 0.517]\nB: [0.243, 0.441, 0.282, 0.498]\nC: [0.255, 0.417, 0.289, 0.471]\nD: [0.243, 0.441, 0.276, 0.494]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: truck park", + "context": "Select from the following choices.\nA: [0.265, 0.246, 0.601, 0.517]\nB: [0.243, 0.441, 0.282, 0.498]\nC: [0.255, 0.417, 0.289, 0.471]\nD: [0.243, 0.441, 0.276, 0.494]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_37_0.jpg", + "../MMIU-Benchmark/mevis/mevis_37_1.jpg", + "../MMIU-Benchmark/mevis/mevis_37_2.jpg", + "../MMIU-Benchmark/mevis/mevis_37_3.jpg", + "../MMIU-Benchmark/mevis/mevis_37_4.jpg", + "../MMIU-Benchmark/mevis/mevis_37_5.jpg", + "../MMIU-Benchmark/mevis/mevis_37_6.jpg", + "../MMIU-Benchmark/mevis/mevis_37_7.jpg", + "../MMIU-Benchmark/mevis/mevis_37_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.501, 0.24, 0.833, 0.356]\nB: [0.408, 0.508, 0.505, 0.782]\nC: [0.408, 0.508, 0.521, 0.744]\nD: [0.395, 0.518, 0.492, 0.792]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: monkey sit still", + "context": "Select from the following choices.\nA: [0.501, 0.24, 0.833, 0.356]\nB: [0.408, 0.508, 0.505, 0.782]\nC: [0.408, 0.508, 0.521, 0.744]\nD: [0.395, 0.518, 0.492, 0.792]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_38_0.jpg", + "../MMIU-Benchmark/mevis/mevis_38_1.jpg", + "../MMIU-Benchmark/mevis/mevis_38_2.jpg", + "../MMIU-Benchmark/mevis/mevis_38_3.jpg", + "../MMIU-Benchmark/mevis/mevis_38_4.jpg", + "../MMIU-Benchmark/mevis/mevis_38_5.jpg", + "../MMIU-Benchmark/mevis/mevis_38_6.jpg", + "../MMIU-Benchmark/mevis/mevis_38_7.jpg", + "../MMIU-Benchmark/mevis/mevis_38_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.352, 0.431, 0.601, 0.832]\nB: [0.435, 0.428, 0.684, 0.83]\nC: [0.532, 0.471, 0.78, 0.873]\nD: [0.435, 0.428, 0.724, 0.907]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1460 and the height is 864.\nCAPTION: The panda standing up", + "context": "Select from the following choices.\nA: [0.352, 0.431, 0.601, 0.832]\nB: [0.435, 0.428, 0.684, 0.83]\nC: [0.532, 0.471, 0.78, 0.873]\nD: [0.435, 0.428, 0.724, 0.907]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_39_0.jpg", + "../MMIU-Benchmark/mevis/mevis_39_1.jpg", + "../MMIU-Benchmark/mevis/mevis_39_2.jpg", + "../MMIU-Benchmark/mevis/mevis_39_3.jpg", + "../MMIU-Benchmark/mevis/mevis_39_4.jpg", + "../MMIU-Benchmark/mevis/mevis_39_5.jpg", + "../MMIU-Benchmark/mevis/mevis_39_6.jpg", + "../MMIU-Benchmark/mevis/mevis_39_7.jpg", + "../MMIU-Benchmark/mevis/mevis_39_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.626, 0.397, 0.948, 0.649]\nB: [0.191, 0.181, 0.532, 0.751]\nC: [0.191, 0.181, 0.465, 0.806]\nD: [0.608, 0.556, 0.637, 0.939]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: yellow truck move forward", + "context": "Select from the following choices.\nA: [0.626, 0.397, 0.948, 0.649]\nB: [0.191, 0.181, 0.532, 0.751]\nC: [0.191, 0.181, 0.465, 0.806]\nD: [0.608, 0.556, 0.637, 0.939]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_40_0.jpg", + "../MMIU-Benchmark/mevis/mevis_40_1.jpg", + "../MMIU-Benchmark/mevis/mevis_40_2.jpg", + "../MMIU-Benchmark/mevis/mevis_40_3.jpg", + "../MMIU-Benchmark/mevis/mevis_40_4.jpg", + "../MMIU-Benchmark/mevis/mevis_40_5.jpg", + "../MMIU-Benchmark/mevis/mevis_40_6.jpg", + "../MMIU-Benchmark/mevis/mevis_40_7.jpg", + "../MMIU-Benchmark/mevis/mevis_40_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.0, 0.191, 0.63, 0.999]\nB: [0.0, 0.192, 0.63, 1.0]\nC: [0.248, 0.0, 0.878, 0.808]\nD: [0.353, 0.572, 0.852, 0.749]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: A black horse always facing the right", + "context": "Select from the following choices.\nA: [0.0, 0.191, 0.63, 0.999]\nB: [0.0, 0.192, 0.63, 1.0]\nC: [0.248, 0.0, 0.878, 0.808]\nD: [0.353, 0.572, 0.852, 0.749]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_41_0.jpg", + "../MMIU-Benchmark/mevis/mevis_41_1.jpg", + "../MMIU-Benchmark/mevis/mevis_41_2.jpg", + "../MMIU-Benchmark/mevis/mevis_41_3.jpg", + "../MMIU-Benchmark/mevis/mevis_41_4.jpg", + "../MMIU-Benchmark/mevis/mevis_41_5.jpg", + "../MMIU-Benchmark/mevis/mevis_41_6.jpg", + "../MMIU-Benchmark/mevis/mevis_41_7.jpg", + "../MMIU-Benchmark/mevis/mevis_41_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.377, 0.561, 0.715, 0.654]\nB: [0.125, 0.545, 0.651, 1.0]\nC: [0.13, 0.544, 0.655, 0.999]\nD: [0.158, 0.519, 0.683, 0.974]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: rabbit jumping over another rabbit", + "context": "Select from the following choices.\nA: [0.377, 0.561, 0.715, 0.654]\nB: [0.125, 0.545, 0.651, 1.0]\nC: [0.13, 0.544, 0.655, 0.999]\nD: [0.158, 0.519, 0.683, 0.974]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_42_0.jpg", + "../MMIU-Benchmark/mevis/mevis_42_1.jpg", + "../MMIU-Benchmark/mevis/mevis_42_2.jpg", + "../MMIU-Benchmark/mevis/mevis_42_3.jpg", + "../MMIU-Benchmark/mevis/mevis_42_4.jpg", + "../MMIU-Benchmark/mevis/mevis_42_5.jpg", + "../MMIU-Benchmark/mevis/mevis_42_6.jpg", + "../MMIU-Benchmark/mevis/mevis_42_7.jpg", + "../MMIU-Benchmark/mevis/mevis_42_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.366, 0.147, 0.625, 0.316]\nB: [0.316, 0.168, 0.999, 0.911]\nC: [0.316, 0.168, 1.068, 1.012]\nD: [0.316, 0.168, 0.984, 0.77]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1004.\nCAPTION: The sea turtle that swam down from the surface and reached the bottom of the pool.", + "context": "Select from the following choices.\nA: [0.366, 0.147, 0.625, 0.316]\nB: [0.316, 0.168, 0.999, 0.911]\nC: [0.316, 0.168, 1.068, 1.012]\nD: [0.316, 0.168, 0.984, 0.77]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_43_0.jpg", + "../MMIU-Benchmark/mevis/mevis_43_1.jpg", + "../MMIU-Benchmark/mevis/mevis_43_2.jpg", + "../MMIU-Benchmark/mevis/mevis_43_3.jpg", + "../MMIU-Benchmark/mevis/mevis_43_4.jpg", + "../MMIU-Benchmark/mevis/mevis_43_5.jpg", + "../MMIU-Benchmark/mevis/mevis_43_6.jpg", + "../MMIU-Benchmark/mevis/mevis_43_7.jpg", + "../MMIU-Benchmark/mevis/mevis_43_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.245, 0.485, 0.546, 0.903]\nB: [0.186, 0.281, 0.36, 0.562]\nC: [0.378, 0.582, 0.679, 1.0]\nD: [0.447, 0.47, 0.62, 0.658]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The first lizard to be taken and gripped by hand.", + "context": "Select from the following choices.\nA: [0.245, 0.485, 0.546, 0.903]\nB: [0.186, 0.281, 0.36, 0.562]\nC: [0.378, 0.582, 0.679, 1.0]\nD: [0.447, 0.47, 0.62, 0.658]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_44_0.jpg", + "../MMIU-Benchmark/mevis/mevis_44_1.jpg", + "../MMIU-Benchmark/mevis/mevis_44_2.jpg", + "../MMIU-Benchmark/mevis/mevis_44_3.jpg", + "../MMIU-Benchmark/mevis/mevis_44_4.jpg", + "../MMIU-Benchmark/mevis/mevis_44_5.jpg", + "../MMIU-Benchmark/mevis/mevis_44_6.jpg", + "../MMIU-Benchmark/mevis/mevis_44_7.jpg", + "../MMIU-Benchmark/mevis/mevis_44_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.348, 0.596, 0.573, 0.807]\nB: [0.314, 0.491, 0.537, 0.721]\nC: [0.348, 0.596, 0.572, 0.825]\nD: [0.41, 0.34, 0.572, 0.592]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: Turtle turning without changing position", + "context": "Select from the following choices.\nA: [0.348, 0.596, 0.573, 0.807]\nB: [0.314, 0.491, 0.537, 0.721]\nC: [0.348, 0.596, 0.572, 0.825]\nD: [0.41, 0.34, 0.572, 0.592]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_45_0.jpg", + "../MMIU-Benchmark/mevis/mevis_45_1.jpg", + "../MMIU-Benchmark/mevis/mevis_45_2.jpg", + "../MMIU-Benchmark/mevis/mevis_45_3.jpg", + "../MMIU-Benchmark/mevis/mevis_45_4.jpg", + "../MMIU-Benchmark/mevis/mevis_45_5.jpg", + "../MMIU-Benchmark/mevis/mevis_45_6.jpg", + "../MMIU-Benchmark/mevis/mevis_45_7.jpg", + "../MMIU-Benchmark/mevis/mevis_45_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.059, 0.252, 0.199, 0.493]\nB: [0.287, 0.193, 0.524, 0.522]\nC: [0.344, 0.539, 0.829, 0.673]\nD: [0.287, 0.193, 0.568, 0.468]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 596.\nCAPTION: panda sit and eat without any moving", + "context": "Select from the following choices.\nA: [0.059, 0.252, 0.199, 0.493]\nB: [0.287, 0.193, 0.524, 0.522]\nC: [0.344, 0.539, 0.829, 0.673]\nD: [0.287, 0.193, 0.568, 0.468]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_46_0.jpg", + "../MMIU-Benchmark/mevis/mevis_46_1.jpg", + "../MMIU-Benchmark/mevis/mevis_46_2.jpg", + "../MMIU-Benchmark/mevis/mevis_46_3.jpg", + "../MMIU-Benchmark/mevis/mevis_46_4.jpg", + "../MMIU-Benchmark/mevis/mevis_46_5.jpg", + "../MMIU-Benchmark/mevis/mevis_46_6.jpg", + "../MMIU-Benchmark/mevis/mevis_46_7.jpg", + "../MMIU-Benchmark/mevis/mevis_46_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.76, 0.422, 0.815, 0.712]\nB: [0.76, 0.422, 0.813, 0.716]\nC: [0.764, 0.456, 0.818, 0.746]\nD: [0.76, 0.422, 0.81, 0.669]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 2340 and the height is 1080.\nCAPTION: The bear on the right hula hooping with its neck", + "context": "Select from the following choices.\nA: [0.76, 0.422, 0.815, 0.712]\nB: [0.76, 0.422, 0.813, 0.716]\nC: [0.764, 0.456, 0.818, 0.746]\nD: [0.76, 0.422, 0.81, 0.669]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_47_0.jpg", + "../MMIU-Benchmark/mevis/mevis_47_1.jpg", + "../MMIU-Benchmark/mevis/mevis_47_2.jpg", + "../MMIU-Benchmark/mevis/mevis_47_3.jpg", + "../MMIU-Benchmark/mevis/mevis_47_4.jpg", + "../MMIU-Benchmark/mevis/mevis_47_5.jpg", + "../MMIU-Benchmark/mevis/mevis_47_6.jpg", + "../MMIU-Benchmark/mevis/mevis_47_7.jpg", + "../MMIU-Benchmark/mevis/mevis_47_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.567, 0.246, 0.699, 0.561]\nB: [0.567, 0.246, 0.697, 0.532]\nC: [0.567, 0.246, 0.723, 0.553]\nD: [0.567, 0.246, 0.678, 0.553]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: black dog play with the other dog", + "context": "Select from the following choices.\nA: [0.567, 0.246, 0.699, 0.561]\nB: [0.567, 0.246, 0.697, 0.532]\nC: [0.567, 0.246, 0.723, 0.553]\nD: [0.567, 0.246, 0.678, 0.553]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_48_0.jpg", + "../MMIU-Benchmark/mevis/mevis_48_1.jpg", + "../MMIU-Benchmark/mevis/mevis_48_2.jpg", + "../MMIU-Benchmark/mevis/mevis_48_3.jpg", + "../MMIU-Benchmark/mevis/mevis_48_4.jpg", + "../MMIU-Benchmark/mevis/mevis_48_5.jpg", + "../MMIU-Benchmark/mevis/mevis_48_6.jpg", + "../MMIU-Benchmark/mevis/mevis_48_7.jpg", + "../MMIU-Benchmark/mevis/mevis_48_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.024, 0.223, 0.053, 0.419]\nB: [0.014, 0.286, 0.119, 0.484]\nC: [0.433, 0.522, 0.598, 0.968]\nD: [0.044, 0.236, 0.148, 0.434]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Fish swimming to the left then right", + "context": "Select from the following choices.\nA: [0.024, 0.223, 0.053, 0.419]\nB: [0.014, 0.286, 0.119, 0.484]\nC: [0.433, 0.522, 0.598, 0.968]\nD: [0.044, 0.236, 0.148, 0.434]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_49_0.jpg", + "../MMIU-Benchmark/mevis/mevis_49_1.jpg", + "../MMIU-Benchmark/mevis/mevis_49_2.jpg", + "../MMIU-Benchmark/mevis/mevis_49_3.jpg", + "../MMIU-Benchmark/mevis/mevis_49_4.jpg", + "../MMIU-Benchmark/mevis/mevis_49_5.jpg", + "../MMIU-Benchmark/mevis/mevis_49_6.jpg", + "../MMIU-Benchmark/mevis/mevis_49_7.jpg", + "../MMIU-Benchmark/mevis/mevis_49_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.362, 0.256, 0.483, 0.664]\nB: [0.301, 0.582, 0.714, 0.651]\nC: [0.417, 0.501, 0.538, 0.91]\nD: [0.389, 0.314, 0.51, 0.722]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: elephant walking behind and putting its trunk on others ", + "context": "Select from the following choices.\nA: [0.362, 0.256, 0.483, 0.664]\nB: [0.301, 0.582, 0.714, 0.651]\nC: [0.417, 0.501, 0.538, 0.91]\nD: [0.389, 0.314, 0.51, 0.722]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_50_0.jpg", + "../MMIU-Benchmark/mevis/mevis_50_1.jpg", + "../MMIU-Benchmark/mevis/mevis_50_2.jpg", + "../MMIU-Benchmark/mevis/mevis_50_3.jpg", + "../MMIU-Benchmark/mevis/mevis_50_4.jpg", + "../MMIU-Benchmark/mevis/mevis_50_5.jpg", + "../MMIU-Benchmark/mevis/mevis_50_6.jpg", + "../MMIU-Benchmark/mevis/mevis_50_7.jpg", + "../MMIU-Benchmark/mevis/mevis_50_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.553, 0.399, 0.586, 0.565]\nB: [0.553, 0.399, 0.588, 0.567]\nC: [0.553, 0.399, 0.582, 0.542]\nD: [0.553, 0.399, 0.588, 0.559]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: man standing near bicycles", + "context": "Select from the following choices.\nA: [0.553, 0.399, 0.586, 0.565]\nB: [0.553, 0.399, 0.588, 0.567]\nC: [0.553, 0.399, 0.582, 0.542]\nD: [0.553, 0.399, 0.588, 0.559]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_51_0.jpg", + "../MMIU-Benchmark/mevis/mevis_51_1.jpg", + "../MMIU-Benchmark/mevis/mevis_51_2.jpg", + "../MMIU-Benchmark/mevis/mevis_51_3.jpg", + "../MMIU-Benchmark/mevis/mevis_51_4.jpg", + "../MMIU-Benchmark/mevis/mevis_51_5.jpg", + "../MMIU-Benchmark/mevis/mevis_51_6.jpg", + "../MMIU-Benchmark/mevis/mevis_51_7.jpg", + "../MMIU-Benchmark/mevis/mevis_51_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.461, 0.381, 0.531, 0.663]\nB: [0.449, 0.417, 0.527, 0.714]\nC: [0.461, 0.381, 0.539, 0.679]\nD: [0.395, 0.484, 0.838, 0.81]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 512 and the height is 252.\nCAPTION: person pull a horse", + "context": "Select from the following choices.\nA: [0.461, 0.381, 0.531, 0.663]\nB: [0.449, 0.417, 0.527, 0.714]\nC: [0.461, 0.381, 0.539, 0.679]\nD: [0.395, 0.484, 0.838, 0.81]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_52_0.jpg", + "../MMIU-Benchmark/mevis/mevis_52_1.jpg", + "../MMIU-Benchmark/mevis/mevis_52_2.jpg", + "../MMIU-Benchmark/mevis/mevis_52_3.jpg", + "../MMIU-Benchmark/mevis/mevis_52_4.jpg", + "../MMIU-Benchmark/mevis/mevis_52_5.jpg", + "../MMIU-Benchmark/mevis/mevis_52_6.jpg", + "../MMIU-Benchmark/mevis/mevis_52_7.jpg", + "../MMIU-Benchmark/mevis/mevis_52_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.683, 0.564, 0.694, 0.605]\nB: [0.688, 0.575, 0.7, 0.612]\nC: [0.683, 0.564, 0.696, 0.601]\nD: [0.683, 0.564, 0.697, 0.607]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: The rider on the horse running beside the white railing.", + "context": "Select from the following choices.\nA: [0.683, 0.564, 0.694, 0.605]\nB: [0.688, 0.575, 0.7, 0.612]\nC: [0.683, 0.564, 0.696, 0.601]\nD: [0.683, 0.564, 0.697, 0.607]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_53_0.jpg", + "../MMIU-Benchmark/mevis/mevis_53_1.jpg", + "../MMIU-Benchmark/mevis/mevis_53_2.jpg", + "../MMIU-Benchmark/mevis/mevis_53_3.jpg", + "../MMIU-Benchmark/mevis/mevis_53_4.jpg", + "../MMIU-Benchmark/mevis/mevis_53_5.jpg", + "../MMIU-Benchmark/mevis/mevis_53_6.jpg", + "../MMIU-Benchmark/mevis/mevis_53_7.jpg", + "../MMIU-Benchmark/mevis/mevis_53_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.588, 0.919, 0.692, 0.999]\nB: [0.588, 0.919, 0.673, 1.015]\nC: [0.573, 0.92, 0.677, 1.0]\nD: [0.346, 0.162, 0.584, 0.436]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: The rabbit that is having food in the lower right corner.", + "context": "Select from the following choices.\nA: [0.588, 0.919, 0.692, 0.999]\nB: [0.588, 0.919, 0.673, 1.015]\nC: [0.573, 0.92, 0.677, 1.0]\nD: [0.346, 0.162, 0.584, 0.436]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_54_0.jpg", + "../MMIU-Benchmark/mevis/mevis_54_1.jpg", + "../MMIU-Benchmark/mevis/mevis_54_2.jpg", + "../MMIU-Benchmark/mevis/mevis_54_3.jpg", + "../MMIU-Benchmark/mevis/mevis_54_4.jpg", + "../MMIU-Benchmark/mevis/mevis_54_5.jpg", + "../MMIU-Benchmark/mevis/mevis_54_6.jpg", + "../MMIU-Benchmark/mevis/mevis_54_7.jpg", + "../MMIU-Benchmark/mevis/mevis_54_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.148, 0.223, 0.467, 0.894]\nB: [0.743, 0.431, 0.787, 0.731]\nC: [0.148, 0.223, 0.43, 0.912]\nD: [0.002, 0.119, 0.32, 0.79]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Kitten looking around without moving position", + "context": "Select from the following choices.\nA: [0.148, 0.223, 0.467, 0.894]\nB: [0.743, 0.431, 0.787, 0.731]\nC: [0.148, 0.223, 0.43, 0.912]\nD: [0.002, 0.119, 0.32, 0.79]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_55_0.jpg", + "../MMIU-Benchmark/mevis/mevis_55_1.jpg", + "../MMIU-Benchmark/mevis/mevis_55_2.jpg", + "../MMIU-Benchmark/mevis/mevis_55_3.jpg", + "../MMIU-Benchmark/mevis/mevis_55_4.jpg", + "../MMIU-Benchmark/mevis/mevis_55_5.jpg", + "../MMIU-Benchmark/mevis/mevis_55_6.jpg", + "../MMIU-Benchmark/mevis/mevis_55_7.jpg", + "../MMIU-Benchmark/mevis/mevis_55_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.317, 0.46, 0.443, 0.741]\nB: [0.34, 0.559, 0.466, 0.84]\nC: [0.317, 0.46, 0.456, 0.772]\nD: [0.059, 0.752, 0.133, 0.831]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: bike move around", + "context": "Select from the following choices.\nA: [0.317, 0.46, 0.443, 0.741]\nB: [0.34, 0.559, 0.466, 0.84]\nC: [0.317, 0.46, 0.456, 0.772]\nD: [0.059, 0.752, 0.133, 0.831]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_56_0.jpg", + "../MMIU-Benchmark/mevis/mevis_56_1.jpg", + "../MMIU-Benchmark/mevis/mevis_56_2.jpg", + "../MMIU-Benchmark/mevis/mevis_56_3.jpg", + "../MMIU-Benchmark/mevis/mevis_56_4.jpg", + "../MMIU-Benchmark/mevis/mevis_56_5.jpg", + "../MMIU-Benchmark/mevis/mevis_56_6.jpg", + "../MMIU-Benchmark/mevis/mevis_56_7.jpg", + "../MMIU-Benchmark/mevis/mevis_56_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.424, 0.525, 0.676, 0.927]\nB: [0.424, 0.525, 0.701, 1.007]\nC: [0.424, 0.525, 0.631, 0.91]\nD: [0.166, 0.488, 0.56, 0.691]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1460 and the height is 864.\nCAPTION: The panda lying down and eating", + "context": "Select from the following choices.\nA: [0.424, 0.525, 0.676, 0.927]\nB: [0.424, 0.525, 0.701, 1.007]\nC: [0.424, 0.525, 0.631, 0.91]\nD: [0.166, 0.488, 0.56, 0.691]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_57_0.jpg", + "../MMIU-Benchmark/mevis/mevis_57_1.jpg", + "../MMIU-Benchmark/mevis/mevis_57_2.jpg", + "../MMIU-Benchmark/mevis/mevis_57_3.jpg", + "../MMIU-Benchmark/mevis/mevis_57_4.jpg", + "../MMIU-Benchmark/mevis/mevis_57_5.jpg", + "../MMIU-Benchmark/mevis/mevis_57_6.jpg", + "../MMIU-Benchmark/mevis/mevis_57_7.jpg", + "../MMIU-Benchmark/mevis/mevis_57_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.245, 0.404, 0.611, 0.725]\nB: [0.167, 0.029, 0.418, 0.199]\nC: [0.378, 0.483, 0.745, 0.804]\nD: [0.378, 0.483, 0.699, 0.8]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: the elephant that was attacked", + "context": "Select from the following choices.\nA: [0.245, 0.404, 0.611, 0.725]\nB: [0.167, 0.029, 0.418, 0.199]\nC: [0.378, 0.483, 0.745, 0.804]\nD: [0.378, 0.483, 0.699, 0.8]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_58_0.jpg", + "../MMIU-Benchmark/mevis/mevis_58_1.jpg", + "../MMIU-Benchmark/mevis/mevis_58_2.jpg", + "../MMIU-Benchmark/mevis/mevis_58_3.jpg", + "../MMIU-Benchmark/mevis/mevis_58_4.jpg", + "../MMIU-Benchmark/mevis/mevis_58_5.jpg", + "../MMIU-Benchmark/mevis/mevis_58_6.jpg", + "../MMIU-Benchmark/mevis/mevis_58_7.jpg", + "../MMIU-Benchmark/mevis/mevis_58_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.559, 0.398, 0.82, 1.094]\nB: [0.559, 0.398, 0.867, 0.998]\nC: [0.559, 0.398, 0.923, 0.881]\nD: [0.559, 0.398, 0.924, 0.884]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 596.\nCAPTION: Panda sitting down to eat then moving to right to eat again", + "context": "Select from the following choices.\nA: [0.559, 0.398, 0.82, 1.094]\nB: [0.559, 0.398, 0.867, 0.998]\nC: [0.559, 0.398, 0.923, 0.881]\nD: [0.559, 0.398, 0.924, 0.884]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_59_0.jpg", + "../MMIU-Benchmark/mevis/mevis_59_1.jpg", + "../MMIU-Benchmark/mevis/mevis_59_2.jpg", + "../MMIU-Benchmark/mevis/mevis_59_3.jpg", + "../MMIU-Benchmark/mevis/mevis_59_4.jpg", + "../MMIU-Benchmark/mevis/mevis_59_5.jpg", + "../MMIU-Benchmark/mevis/mevis_59_6.jpg", + "../MMIU-Benchmark/mevis/mevis_59_7.jpg", + "../MMIU-Benchmark/mevis/mevis_59_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.593, 0.415, 0.788, 0.656]\nB: [0.65, 0.351, 0.891, 0.636]\nC: [0.593, 0.415, 0.833, 0.7]\nD: [0.698, 0.435, 0.939, 0.72]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Sheep with the black head facing down to eat then walking forward", + "context": "Select from the following choices.\nA: [0.593, 0.415, 0.788, 0.656]\nB: [0.65, 0.351, 0.891, 0.636]\nC: [0.593, 0.415, 0.833, 0.7]\nD: [0.698, 0.435, 0.939, 0.72]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_60_0.jpg", + "../MMIU-Benchmark/mevis/mevis_60_1.jpg", + "../MMIU-Benchmark/mevis/mevis_60_2.jpg", + "../MMIU-Benchmark/mevis/mevis_60_3.jpg", + "../MMIU-Benchmark/mevis/mevis_60_4.jpg", + "../MMIU-Benchmark/mevis/mevis_60_5.jpg", + "../MMIU-Benchmark/mevis/mevis_60_6.jpg", + "../MMIU-Benchmark/mevis/mevis_60_7.jpg", + "../MMIU-Benchmark/mevis/mevis_60_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.648, 0.0, 1.052, 0.953]\nB: [0.648, 0.0, 0.999, 0.929]\nC: [0.559, 0.708, 0.731, 0.883]\nD: [0.648, 0.0, 0.965, 0.952]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: cat stand and climb at right", + "context": "Select from the following choices.\nA: [0.648, 0.0, 1.052, 0.953]\nB: [0.648, 0.0, 0.999, 0.929]\nC: [0.559, 0.708, 0.731, 0.883]\nD: [0.648, 0.0, 0.965, 0.952]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_61_0.jpg", + "../MMIU-Benchmark/mevis/mevis_61_1.jpg", + "../MMIU-Benchmark/mevis/mevis_61_2.jpg", + "../MMIU-Benchmark/mevis/mevis_61_3.jpg", + "../MMIU-Benchmark/mevis/mevis_61_4.jpg", + "../MMIU-Benchmark/mevis/mevis_61_5.jpg", + "../MMIU-Benchmark/mevis/mevis_61_6.jpg", + "../MMIU-Benchmark/mevis/mevis_61_7.jpg", + "../MMIU-Benchmark/mevis/mevis_61_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.474, 0.155, 0.787, 0.23]\nB: [0.528, 0.295, 0.707, 0.66]\nC: [0.653, 0.775, 0.872, 0.809]\nD: [0.528, 0.295, 0.729, 0.73]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The last lizard to be taken and held in hand.", + "context": "Select from the following choices.\nA: [0.474, 0.155, 0.787, 0.23]\nB: [0.528, 0.295, 0.707, 0.66]\nC: [0.653, 0.775, 0.872, 0.809]\nD: [0.528, 0.295, 0.729, 0.73]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_62_0.jpg", + "../MMIU-Benchmark/mevis/mevis_62_1.jpg", + "../MMIU-Benchmark/mevis/mevis_62_2.jpg", + "../MMIU-Benchmark/mevis/mevis_62_3.jpg", + "../MMIU-Benchmark/mevis/mevis_62_4.jpg", + "../MMIU-Benchmark/mevis/mevis_62_5.jpg", + "../MMIU-Benchmark/mevis/mevis_62_6.jpg", + "../MMIU-Benchmark/mevis/mevis_62_7.jpg", + "../MMIU-Benchmark/mevis/mevis_62_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.0, 0.0, 0.155, 0.999]\nB: [0.058, 0.001, 0.214, 1.0]\nC: [0.0, 0.0, 0.127, 1.029]\nD: [0.01, 0.001, 0.165, 1.0]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 945.\nCAPTION: person standing behind little girl feeding rabbit", + "context": "Select from the following choices.\nA: [0.0, 0.0, 0.155, 0.999]\nB: [0.058, 0.001, 0.214, 1.0]\nC: [0.0, 0.0, 0.127, 1.029]\nD: [0.01, 0.001, 0.165, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_63_0.jpg", + "../MMIU-Benchmark/mevis/mevis_63_1.jpg", + "../MMIU-Benchmark/mevis/mevis_63_2.jpg", + "../MMIU-Benchmark/mevis/mevis_63_3.jpg", + "../MMIU-Benchmark/mevis/mevis_63_4.jpg", + "../MMIU-Benchmark/mevis/mevis_63_5.jpg", + "../MMIU-Benchmark/mevis/mevis_63_6.jpg", + "../MMIU-Benchmark/mevis/mevis_63_7.jpg", + "../MMIU-Benchmark/mevis/mevis_63_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.399, 0.593, 0.583, 0.76]\nB: [0.399, 0.593, 0.573, 0.762]\nC: [0.399, 0.593, 0.547, 0.764]\nD: [0.398, 0.662, 0.572, 0.832]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1080 and the height is 1152.\nCAPTION: Bear walking forward and jumping over a barricade", + "context": "Select from the following choices.\nA: [0.399, 0.593, 0.583, 0.76]\nB: [0.399, 0.593, 0.573, 0.762]\nC: [0.399, 0.593, 0.547, 0.764]\nD: [0.398, 0.662, 0.572, 0.832]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_64_0.jpg", + "../MMIU-Benchmark/mevis/mevis_64_1.jpg", + "../MMIU-Benchmark/mevis/mevis_64_2.jpg", + "../MMIU-Benchmark/mevis/mevis_64_3.jpg", + "../MMIU-Benchmark/mevis/mevis_64_4.jpg", + "../MMIU-Benchmark/mevis/mevis_64_5.jpg", + "../MMIU-Benchmark/mevis/mevis_64_6.jpg", + "../MMIU-Benchmark/mevis/mevis_64_7.jpg", + "../MMIU-Benchmark/mevis/mevis_64_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.098, 0.372, 0.469, 0.908]\nB: [0.0, 0.428, 0.323, 1.0]\nC: [0.098, 0.372, 0.421, 0.944]\nD: [0.098, 0.372, 0.402, 1.019]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: rabbit eating and walking", + "context": "Select from the following choices.\nA: [0.098, 0.372, 0.469, 0.908]\nB: [0.0, 0.428, 0.323, 1.0]\nC: [0.098, 0.372, 0.421, 0.944]\nD: [0.098, 0.372, 0.402, 1.019]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_65_0.jpg", + "../MMIU-Benchmark/mevis/mevis_65_1.jpg", + "../MMIU-Benchmark/mevis/mevis_65_2.jpg", + "../MMIU-Benchmark/mevis/mevis_65_3.jpg", + "../MMIU-Benchmark/mevis/mevis_65_4.jpg", + "../MMIU-Benchmark/mevis/mevis_65_5.jpg", + "../MMIU-Benchmark/mevis/mevis_65_6.jpg", + "../MMIU-Benchmark/mevis/mevis_65_7.jpg", + "../MMIU-Benchmark/mevis/mevis_65_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.434, 0.53, 0.716, 0.826]\nB: [0.419, 0.025, 0.783, 0.21]\nC: [0.45, 0.638, 0.695, 1.0]\nD: [0.434, 0.53, 0.68, 0.892]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 362.\nCAPTION: The final cow that stepped forward.", + "context": "Select from the following choices.\nA: [0.434, 0.53, 0.716, 0.826]\nB: [0.419, 0.025, 0.783, 0.21]\nC: [0.45, 0.638, 0.695, 1.0]\nD: [0.434, 0.53, 0.68, 0.892]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_66_0.jpg", + "../MMIU-Benchmark/mevis/mevis_66_1.jpg", + "../MMIU-Benchmark/mevis/mevis_66_2.jpg", + "../MMIU-Benchmark/mevis/mevis_66_3.jpg", + "../MMIU-Benchmark/mevis/mevis_66_4.jpg", + "../MMIU-Benchmark/mevis/mevis_66_5.jpg", + "../MMIU-Benchmark/mevis/mevis_66_6.jpg", + "../MMIU-Benchmark/mevis/mevis_66_7.jpg", + "../MMIU-Benchmark/mevis/mevis_66_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.433, 0.508, 0.731, 0.844]\nB: [0.375, 0.594, 0.627, 1.0]\nC: [0.433, 0.508, 0.684, 0.914]\nD: [0.315, 0.626, 0.408, 0.671]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1460 and the height is 864.\nCAPTION: sit on the ground and eat then lay down and turn over", + "context": "Select from the following choices.\nA: [0.433, 0.508, 0.731, 0.844]\nB: [0.375, 0.594, 0.627, 1.0]\nC: [0.433, 0.508, 0.684, 0.914]\nD: [0.315, 0.626, 0.408, 0.671]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_67_0.jpg", + "../MMIU-Benchmark/mevis/mevis_67_1.jpg", + "../MMIU-Benchmark/mevis/mevis_67_2.jpg", + "../MMIU-Benchmark/mevis/mevis_67_3.jpg", + "../MMIU-Benchmark/mevis/mevis_67_4.jpg", + "../MMIU-Benchmark/mevis/mevis_67_5.jpg", + "../MMIU-Benchmark/mevis/mevis_67_6.jpg", + "../MMIU-Benchmark/mevis/mevis_67_7.jpg", + "../MMIU-Benchmark/mevis/mevis_67_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.036, 0.0, 0.554, 1.066]\nB: [0.106, 0.0, 0.632, 0.999]\nC: [0.036, 0.0, 0.563, 0.999]\nD: [0.102, 0.04, 0.442, 0.333]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 945.\nCAPTION: little girl feeding rabbit", + "context": "Select from the following choices.\nA: [0.036, 0.0, 0.554, 1.066]\nB: [0.106, 0.0, 0.632, 0.999]\nC: [0.036, 0.0, 0.563, 0.999]\nD: [0.102, 0.04, 0.442, 0.333]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_68_0.jpg", + "../MMIU-Benchmark/mevis/mevis_68_1.jpg", + "../MMIU-Benchmark/mevis/mevis_68_2.jpg", + "../MMIU-Benchmark/mevis/mevis_68_3.jpg", + "../MMIU-Benchmark/mevis/mevis_68_4.jpg", + "../MMIU-Benchmark/mevis/mevis_68_5.jpg", + "../MMIU-Benchmark/mevis/mevis_68_6.jpg", + "../MMIU-Benchmark/mevis/mevis_68_7.jpg", + "../MMIU-Benchmark/mevis/mevis_68_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.411, 0.501, 0.508, 0.771]\nB: [0.288, 0.192, 0.605, 0.634]\nC: [0.397, 0.621, 0.494, 0.892]\nD: [0.411, 0.501, 0.521, 0.733]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The monkey who has been sitting", + "context": "Select from the following choices.\nA: [0.411, 0.501, 0.508, 0.771]\nB: [0.288, 0.192, 0.605, 0.634]\nC: [0.397, 0.621, 0.494, 0.892]\nD: [0.411, 0.501, 0.521, 0.733]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_69_0.jpg", + "../MMIU-Benchmark/mevis/mevis_69_1.jpg", + "../MMIU-Benchmark/mevis/mevis_69_2.jpg", + "../MMIU-Benchmark/mevis/mevis_69_3.jpg", + "../MMIU-Benchmark/mevis/mevis_69_4.jpg", + "../MMIU-Benchmark/mevis/mevis_69_5.jpg", + "../MMIU-Benchmark/mevis/mevis_69_6.jpg", + "../MMIU-Benchmark/mevis/mevis_69_7.jpg", + "../MMIU-Benchmark/mevis/mevis_69_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.642, 0.553, 0.662, 0.59]\nB: [0.642, 0.553, 0.66, 0.597]\nC: [0.642, 0.553, 0.66, 0.601]\nD: [0.644, 0.559, 0.662, 0.603]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: The equestrian on the horse running alongside the white fence.", + "context": "Select from the following choices.\nA: [0.642, 0.553, 0.662, 0.59]\nB: [0.642, 0.553, 0.66, 0.597]\nC: [0.642, 0.553, 0.66, 0.601]\nD: [0.644, 0.559, 0.662, 0.603]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_70_0.jpg", + "../MMIU-Benchmark/mevis/mevis_70_1.jpg", + "../MMIU-Benchmark/mevis/mevis_70_2.jpg", + "../MMIU-Benchmark/mevis/mevis_70_3.jpg", + "../MMIU-Benchmark/mevis/mevis_70_4.jpg", + "../MMIU-Benchmark/mevis/mevis_70_5.jpg", + "../MMIU-Benchmark/mevis/mevis_70_6.jpg", + "../MMIU-Benchmark/mevis/mevis_70_7.jpg", + "../MMIU-Benchmark/mevis/mevis_70_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.0, 0.481, 0.272, 0.998]\nB: [0.0, 0.483, 0.272, 1.0]\nC: [0.502, 0.515, 0.584, 0.925]\nD: [0.0, 0.481, 0.32, 1.033]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 480.\nCAPTION: the one that was picked up by hand", + "context": "Select from the following choices.\nA: [0.0, 0.481, 0.272, 0.998]\nB: [0.0, 0.483, 0.272, 1.0]\nC: [0.502, 0.515, 0.584, 0.925]\nD: [0.0, 0.481, 0.32, 1.033]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_71_0.jpg", + "../MMIU-Benchmark/mevis/mevis_71_1.jpg", + "../MMIU-Benchmark/mevis/mevis_71_2.jpg", + "../MMIU-Benchmark/mevis/mevis_71_3.jpg", + "../MMIU-Benchmark/mevis/mevis_71_4.jpg", + "../MMIU-Benchmark/mevis/mevis_71_5.jpg", + "../MMIU-Benchmark/mevis/mevis_71_6.jpg", + "../MMIU-Benchmark/mevis/mevis_71_7.jpg", + "../MMIU-Benchmark/mevis/mevis_71_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.176, 0.451, 0.526, 0.801]\nB: [0.176, 0.451, 0.481, 0.763]\nC: [0.176, 0.451, 0.493, 0.833]\nD: [0.24, 0.387, 0.557, 0.769]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The lizard that was captured and immediately held in hand.", + "context": "Select from the following choices.\nA: [0.176, 0.451, 0.526, 0.801]\nB: [0.176, 0.451, 0.481, 0.763]\nC: [0.176, 0.451, 0.493, 0.833]\nD: [0.24, 0.387, 0.557, 0.769]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_72_0.jpg", + "../MMIU-Benchmark/mevis/mevis_72_1.jpg", + "../MMIU-Benchmark/mevis/mevis_72_2.jpg", + "../MMIU-Benchmark/mevis/mevis_72_3.jpg", + "../MMIU-Benchmark/mevis/mevis_72_4.jpg", + "../MMIU-Benchmark/mevis/mevis_72_5.jpg", + "../MMIU-Benchmark/mevis/mevis_72_6.jpg", + "../MMIU-Benchmark/mevis/mevis_72_7.jpg", + "../MMIU-Benchmark/mevis/mevis_72_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.152, 0.0, 1.001, 0.211]\nB: [0.67, 0.739, 0.865, 0.901]\nC: [0.152, 0.0, 0.948, 0.25]\nD: [0.152, 0.0, 0.999, 0.26]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1766 and the height is 945.\nCAPTION: move forward towards us", + "context": "Select from the following choices.\nA: [0.152, 0.0, 1.001, 0.211]\nB: [0.67, 0.739, 0.865, 0.901]\nC: [0.152, 0.0, 0.948, 0.25]\nD: [0.152, 0.0, 0.999, 0.26]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_73_0.jpg", + "../MMIU-Benchmark/mevis/mevis_73_1.jpg", + "../MMIU-Benchmark/mevis/mevis_73_2.jpg", + "../MMIU-Benchmark/mevis/mevis_73_3.jpg", + "../MMIU-Benchmark/mevis/mevis_73_4.jpg", + "../MMIU-Benchmark/mevis/mevis_73_5.jpg", + "../MMIU-Benchmark/mevis/mevis_73_6.jpg", + "../MMIU-Benchmark/mevis/mevis_73_7.jpg", + "../MMIU-Benchmark/mevis/mevis_73_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.65, 0.397, 0.915, 0.586]\nB: [0.65, 0.397, 0.921, 0.608]\nC: [0.735, 0.314, 1.0, 0.504]\nD: [0.662, 0.298, 0.681, 0.551]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: silver turning and driving to left", + "context": "Select from the following choices.\nA: [0.65, 0.397, 0.915, 0.586]\nB: [0.65, 0.397, 0.921, 0.608]\nC: [0.735, 0.314, 1.0, 0.504]\nD: [0.662, 0.298, 0.681, 0.551]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_74_0.jpg", + "../MMIU-Benchmark/mevis/mevis_74_1.jpg", + "../MMIU-Benchmark/mevis/mevis_74_2.jpg", + "../MMIU-Benchmark/mevis/mevis_74_3.jpg", + "../MMIU-Benchmark/mevis/mevis_74_4.jpg", + "../MMIU-Benchmark/mevis/mevis_74_5.jpg", + "../MMIU-Benchmark/mevis/mevis_74_6.jpg", + "../MMIU-Benchmark/mevis/mevis_74_7.jpg", + "../MMIU-Benchmark/mevis/mevis_74_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.942, 0.121, 1.0, 0.271]\nB: [0.306, 0.752, 0.469, 0.964]\nC: [0.941, 0.179, 0.999, 0.329]\nD: [0.941, 0.179, 1.006, 0.32]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The motionless black car parked.", + "context": "Select from the following choices.\nA: [0.942, 0.121, 1.0, 0.271]\nB: [0.306, 0.752, 0.469, 0.964]\nC: [0.941, 0.179, 0.999, 0.329]\nD: [0.941, 0.179, 1.006, 0.32]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_75_0.jpg", + "../MMIU-Benchmark/mevis/mevis_75_1.jpg", + "../MMIU-Benchmark/mevis/mevis_75_2.jpg", + "../MMIU-Benchmark/mevis/mevis_75_3.jpg", + "../MMIU-Benchmark/mevis/mevis_75_4.jpg", + "../MMIU-Benchmark/mevis/mevis_75_5.jpg", + "../MMIU-Benchmark/mevis/mevis_75_6.jpg", + "../MMIU-Benchmark/mevis/mevis_75_7.jpg", + "../MMIU-Benchmark/mevis/mevis_75_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.111, 0.693, 0.399, 0.884]\nB: [0.233, 0.673, 0.51, 0.825]\nC: [0.233, 0.673, 0.521, 0.864]\nD: [0.479, 0.156, 0.73, 0.526]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: moving from middle to left", + "context": "Select from the following choices.\nA: [0.111, 0.693, 0.399, 0.884]\nB: [0.233, 0.673, 0.51, 0.825]\nC: [0.233, 0.673, 0.521, 0.864]\nD: [0.479, 0.156, 0.73, 0.526]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_76_0.jpg", + "../MMIU-Benchmark/mevis/mevis_76_1.jpg", + "../MMIU-Benchmark/mevis/mevis_76_2.jpg", + "../MMIU-Benchmark/mevis/mevis_76_3.jpg", + "../MMIU-Benchmark/mevis/mevis_76_4.jpg", + "../MMIU-Benchmark/mevis/mevis_76_5.jpg", + "../MMIU-Benchmark/mevis/mevis_76_6.jpg", + "../MMIU-Benchmark/mevis/mevis_76_7.jpg", + "../MMIU-Benchmark/mevis/mevis_76_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.328, 0.296, 0.403, 0.708]\nB: [0.328, 0.296, 0.414, 0.676]\nC: [0.321, 0.452, 0.406, 0.833]\nD: [0.31, 0.485, 0.396, 0.866]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: people move around", + "context": "Select from the following choices.\nA: [0.328, 0.296, 0.403, 0.708]\nB: [0.328, 0.296, 0.414, 0.676]\nC: [0.321, 0.452, 0.406, 0.833]\nD: [0.31, 0.485, 0.396, 0.866]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_77_0.jpg", + "../MMIU-Benchmark/mevis/mevis_77_1.jpg", + "../MMIU-Benchmark/mevis/mevis_77_2.jpg", + "../MMIU-Benchmark/mevis/mevis_77_3.jpg", + "../MMIU-Benchmark/mevis/mevis_77_4.jpg", + "../MMIU-Benchmark/mevis/mevis_77_5.jpg", + "../MMIU-Benchmark/mevis/mevis_77_6.jpg", + "../MMIU-Benchmark/mevis/mevis_77_7.jpg", + "../MMIU-Benchmark/mevis/mevis_77_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.329, 0.896, 0.467, 0.965]\nB: [0.0, 0.333, 0.02, 0.431]\nC: [0.0, 0.381, 0.02, 0.479]\nD: [0.0, 0.333, 0.023, 0.423]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 480.\nCAPTION: bird standing still on cage", + "context": "Select from the following choices.\nA: [0.329, 0.896, 0.467, 0.965]\nB: [0.0, 0.333, 0.02, 0.431]\nC: [0.0, 0.381, 0.02, 0.479]\nD: [0.0, 0.333, 0.023, 0.423]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_78_0.jpg", + "../MMIU-Benchmark/mevis/mevis_78_1.jpg", + "../MMIU-Benchmark/mevis/mevis_78_2.jpg", + "../MMIU-Benchmark/mevis/mevis_78_3.jpg", + "../MMIU-Benchmark/mevis/mevis_78_4.jpg", + "../MMIU-Benchmark/mevis/mevis_78_5.jpg", + "../MMIU-Benchmark/mevis/mevis_78_6.jpg", + "../MMIU-Benchmark/mevis/mevis_78_7.jpg", + "../MMIU-Benchmark/mevis/mevis_78_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.418, 0.227, 0.47, 0.406]\nB: [0.396, 0.311, 0.448, 0.49]\nC: [0.414, 0.231, 0.466, 0.41]\nD: [0.414, 0.231, 0.456, 0.437]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Monkey stand on a cart, then walk around", + "context": "Select from the following choices.\nA: [0.418, 0.227, 0.47, 0.406]\nB: [0.396, 0.311, 0.448, 0.49]\nC: [0.414, 0.231, 0.466, 0.41]\nD: [0.414, 0.231, 0.456, 0.437]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_79_0.jpg", + "../MMIU-Benchmark/mevis/mevis_79_1.jpg", + "../MMIU-Benchmark/mevis/mevis_79_2.jpg", + "../MMIU-Benchmark/mevis/mevis_79_3.jpg", + "../MMIU-Benchmark/mevis/mevis_79_4.jpg", + "../MMIU-Benchmark/mevis/mevis_79_5.jpg", + "../MMIU-Benchmark/mevis/mevis_79_6.jpg", + "../MMIU-Benchmark/mevis/mevis_79_7.jpg", + "../MMIU-Benchmark/mevis/mevis_79_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.113, 0.338, 0.473, 0.76]\nB: [0.49, 0.271, 0.523, 0.461]\nC: [0.505, 0.233, 0.538, 0.424]\nD: [0.505, 0.233, 0.544, 0.429]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: The monkey sitting crouched at the center of the hole.", + "context": "Select from the following choices.\nA: [0.113, 0.338, 0.473, 0.76]\nB: [0.49, 0.271, 0.523, 0.461]\nC: [0.505, 0.233, 0.538, 0.424]\nD: [0.505, 0.233, 0.544, 0.429]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_80_0.jpg", + "../MMIU-Benchmark/mevis/mevis_80_1.jpg", + "../MMIU-Benchmark/mevis/mevis_80_2.jpg", + "../MMIU-Benchmark/mevis/mevis_80_3.jpg", + "../MMIU-Benchmark/mevis/mevis_80_4.jpg", + "../MMIU-Benchmark/mevis/mevis_80_5.jpg", + "../MMIU-Benchmark/mevis/mevis_80_6.jpg", + "../MMIU-Benchmark/mevis/mevis_80_7.jpg", + "../MMIU-Benchmark/mevis/mevis_80_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.407, 0.359, 0.537, 0.715]\nB: [0.407, 0.359, 0.534, 0.764]\nC: [0.407, 0.359, 0.517, 0.71]\nD: [0.407, 0.359, 0.539, 0.777]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1567 and the height is 730.\nCAPTION: the cow eating without moving position", + "context": "Select from the following choices.\nA: [0.407, 0.359, 0.537, 0.715]\nB: [0.407, 0.359, 0.534, 0.764]\nC: [0.407, 0.359, 0.517, 0.71]\nD: [0.407, 0.359, 0.539, 0.777]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_81_0.jpg", + "../MMIU-Benchmark/mevis/mevis_81_1.jpg", + "../MMIU-Benchmark/mevis/mevis_81_2.jpg", + "../MMIU-Benchmark/mevis/mevis_81_3.jpg", + "../MMIU-Benchmark/mevis/mevis_81_4.jpg", + "../MMIU-Benchmark/mevis/mevis_81_5.jpg", + "../MMIU-Benchmark/mevis/mevis_81_6.jpg", + "../MMIU-Benchmark/mevis/mevis_81_7.jpg", + "../MMIU-Benchmark/mevis/mevis_81_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.809, 0.0, 0.998, 0.577]\nB: [0.58, 0.129, 0.956, 0.394]\nC: [0.811, 0.092, 1.0, 0.669]\nD: [0.777, 0.058, 0.966, 0.635]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 480.\nCAPTION: Hand of human holding food and pulling lizard up", + "context": "Select from the following choices.\nA: [0.809, 0.0, 0.998, 0.577]\nB: [0.58, 0.129, 0.956, 0.394]\nC: [0.811, 0.092, 1.0, 0.669]\nD: [0.777, 0.058, 0.966, 0.635]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_82_0.jpg", + "../MMIU-Benchmark/mevis/mevis_82_1.jpg", + "../MMIU-Benchmark/mevis/mevis_82_2.jpg", + "../MMIU-Benchmark/mevis/mevis_82_3.jpg", + "../MMIU-Benchmark/mevis/mevis_82_4.jpg", + "../MMIU-Benchmark/mevis/mevis_82_5.jpg", + "../MMIU-Benchmark/mevis/mevis_82_6.jpg", + "../MMIU-Benchmark/mevis/mevis_82_7.jpg", + "../MMIU-Benchmark/mevis/mevis_82_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.236, 0.463, 0.27, 0.502]\nB: [0.214, 0.235, 0.507, 0.347]\nC: [0.341, 0.844, 0.493, 0.93]\nD: [0.236, 0.463, 0.275, 0.502]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: The first car moving in a straight line.", + "context": "Select from the following choices.\nA: [0.236, 0.463, 0.27, 0.502]\nB: [0.214, 0.235, 0.507, 0.347]\nC: [0.341, 0.844, 0.493, 0.93]\nD: [0.236, 0.463, 0.275, 0.502]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_83_0.jpg", + "../MMIU-Benchmark/mevis/mevis_83_1.jpg", + "../MMIU-Benchmark/mevis/mevis_83_2.jpg", + "../MMIU-Benchmark/mevis/mevis_83_3.jpg", + "../MMIU-Benchmark/mevis/mevis_83_4.jpg", + "../MMIU-Benchmark/mevis/mevis_83_5.jpg", + "../MMIU-Benchmark/mevis/mevis_83_6.jpg", + "../MMIU-Benchmark/mevis/mevis_83_7.jpg", + "../MMIU-Benchmark/mevis/mevis_83_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.31, 0.168, 0.824, 0.958]\nB: [0.07, 0.769, 0.559, 0.911]\nC: [0.321, 0.693, 0.776, 0.816]\nD: [0.266, 0.163, 0.781, 0.954]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The horse facing right then turning and facing left", + "context": "Select from the following choices.\nA: [0.31, 0.168, 0.824, 0.958]\nB: [0.07, 0.769, 0.559, 0.911]\nC: [0.321, 0.693, 0.776, 0.816]\nD: [0.266, 0.163, 0.781, 0.954]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_84_0.jpg", + "../MMIU-Benchmark/mevis/mevis_84_1.jpg", + "../MMIU-Benchmark/mevis/mevis_84_2.jpg", + "../MMIU-Benchmark/mevis/mevis_84_3.jpg", + "../MMIU-Benchmark/mevis/mevis_84_4.jpg", + "../MMIU-Benchmark/mevis/mevis_84_5.jpg", + "../MMIU-Benchmark/mevis/mevis_84_6.jpg", + "../MMIU-Benchmark/mevis/mevis_84_7.jpg", + "../MMIU-Benchmark/mevis/mevis_84_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.687, 0.43, 0.892, 0.709]\nB: [0.389, 0.685, 0.827, 0.74]\nC: [0.686, 0.508, 0.87, 0.754]\nD: [0.687, 0.43, 0.871, 0.675]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Bicycle moving in a circle", + "context": "Select from the following choices.\nA: [0.687, 0.43, 0.892, 0.709]\nB: [0.389, 0.685, 0.827, 0.74]\nC: [0.686, 0.508, 0.87, 0.754]\nD: [0.687, 0.43, 0.871, 0.675]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_85_0.jpg", + "../MMIU-Benchmark/mevis/mevis_85_1.jpg", + "../MMIU-Benchmark/mevis/mevis_85_2.jpg", + "../MMIU-Benchmark/mevis/mevis_85_3.jpg", + "../MMIU-Benchmark/mevis/mevis_85_4.jpg", + "../MMIU-Benchmark/mevis/mevis_85_5.jpg", + "../MMIU-Benchmark/mevis/mevis_85_6.jpg", + "../MMIU-Benchmark/mevis/mevis_85_7.jpg", + "../MMIU-Benchmark/mevis/mevis_85_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.267, 0.388, 0.508, 0.823]\nB: [0.115, 0.0, 0.357, 0.435]\nC: [0.169, 0.184, 0.41, 0.619]\nD: [0.122, 0.181, 0.364, 0.617]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The second bird to reach the bottom of the cage.", + "context": "Select from the following choices.\nA: [0.267, 0.388, 0.508, 0.823]\nB: [0.115, 0.0, 0.357, 0.435]\nC: [0.169, 0.184, 0.41, 0.619]\nD: [0.122, 0.181, 0.364, 0.617]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_86_0.jpg", + "../MMIU-Benchmark/mevis/mevis_86_1.jpg", + "../MMIU-Benchmark/mevis/mevis_86_2.jpg", + "../MMIU-Benchmark/mevis/mevis_86_3.jpg", + "../MMIU-Benchmark/mevis/mevis_86_4.jpg", + "../MMIU-Benchmark/mevis/mevis_86_5.jpg", + "../MMIU-Benchmark/mevis/mevis_86_6.jpg", + "../MMIU-Benchmark/mevis/mevis_86_7.jpg", + "../MMIU-Benchmark/mevis/mevis_86_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.324, 0.576, 0.95, 0.734]\nB: [0.369, 0.517, 0.994, 0.676]\nC: [0.35, 0.59, 0.976, 0.748]\nD: [0.333, 0.072, 0.674, 0.306]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1080 and the height is 1920.\nCAPTION: white fish swiming and moving a bit", + "context": "Select from the following choices.\nA: [0.324, 0.576, 0.95, 0.734]\nB: [0.369, 0.517, 0.994, 0.676]\nC: [0.35, 0.59, 0.976, 0.748]\nD: [0.333, 0.072, 0.674, 0.306]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_87_0.jpg", + "../MMIU-Benchmark/mevis/mevis_87_1.jpg", + "../MMIU-Benchmark/mevis/mevis_87_2.jpg", + "../MMIU-Benchmark/mevis/mevis_87_3.jpg", + "../MMIU-Benchmark/mevis/mevis_87_4.jpg", + "../MMIU-Benchmark/mevis/mevis_87_5.jpg", + "../MMIU-Benchmark/mevis/mevis_87_6.jpg", + "../MMIU-Benchmark/mevis/mevis_87_7.jpg", + "../MMIU-Benchmark/mevis/mevis_87_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.324, 0.006, 0.572, 0.471]\nB: [0.765, 0.374, 0.807, 0.671]\nC: [0.765, 0.374, 0.806, 0.701]\nD: [0.026, 0.234, 0.297, 0.251]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 2340 and the height is 1080.\nCAPTION: A bear standing upright, twirling a hula hoop with its neck.", + "context": "Select from the following choices.\nA: [0.324, 0.006, 0.572, 0.471]\nB: [0.765, 0.374, 0.807, 0.671]\nC: [0.765, 0.374, 0.806, 0.701]\nD: [0.026, 0.234, 0.297, 0.251]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_88_0.jpg", + "../MMIU-Benchmark/mevis/mevis_88_1.jpg", + "../MMIU-Benchmark/mevis/mevis_88_2.jpg", + "../MMIU-Benchmark/mevis/mevis_88_3.jpg", + "../MMIU-Benchmark/mevis/mevis_88_4.jpg", + "../MMIU-Benchmark/mevis/mevis_88_5.jpg", + "../MMIU-Benchmark/mevis/mevis_88_6.jpg", + "../MMIU-Benchmark/mevis/mevis_88_7.jpg", + "../MMIU-Benchmark/mevis/mevis_88_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.038, 0.141, 0.311, 0.554]\nB: [0.352, 0.368, 0.706, 0.46]\nC: [0.16, 0.219, 0.481, 0.745]\nD: [0.229, 0.09, 0.551, 0.617]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: The white rabbit that hasn't moved from its position.", + "context": "Select from the following choices.\nA: [0.038, 0.141, 0.311, 0.554]\nB: [0.352, 0.368, 0.706, 0.46]\nC: [0.16, 0.219, 0.481, 0.745]\nD: [0.229, 0.09, 0.551, 0.617]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_89_0.jpg", + "../MMIU-Benchmark/mevis/mevis_89_1.jpg", + "../MMIU-Benchmark/mevis/mevis_89_2.jpg", + "../MMIU-Benchmark/mevis/mevis_89_3.jpg", + "../MMIU-Benchmark/mevis/mevis_89_4.jpg", + "../MMIU-Benchmark/mevis/mevis_89_5.jpg", + "../MMIU-Benchmark/mevis/mevis_89_6.jpg", + "../MMIU-Benchmark/mevis/mevis_89_7.jpg", + "../MMIU-Benchmark/mevis/mevis_89_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.286, 0.38, 0.393, 0.766]\nB: [0.283, 0.341, 0.39, 0.727]\nC: [0.217, 0.722, 0.513, 0.913]\nD: [0.254, 0.176, 0.36, 0.562]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: turn and walk away from us", + "context": "Select from the following choices.\nA: [0.286, 0.38, 0.393, 0.766]\nB: [0.283, 0.341, 0.39, 0.727]\nC: [0.217, 0.722, 0.513, 0.913]\nD: [0.254, 0.176, 0.36, 0.562]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_90_0.jpg", + "../MMIU-Benchmark/mevis/mevis_90_1.jpg", + "../MMIU-Benchmark/mevis/mevis_90_2.jpg", + "../MMIU-Benchmark/mevis/mevis_90_3.jpg", + "../MMIU-Benchmark/mevis/mevis_90_4.jpg", + "../MMIU-Benchmark/mevis/mevis_90_5.jpg", + "../MMIU-Benchmark/mevis/mevis_90_6.jpg", + "../MMIU-Benchmark/mevis/mevis_90_7.jpg", + "../MMIU-Benchmark/mevis/mevis_90_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.111, 0.251, 0.29, 0.589]\nB: [0.051, 0.513, 0.218, 0.995]\nC: [0.041, 0.149, 0.22, 0.487]\nD: [0.078, 0.387, 0.43, 0.512]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The bird that arrived at the cage bottom in second place.", + "context": "Select from the following choices.\nA: [0.111, 0.251, 0.29, 0.589]\nB: [0.051, 0.513, 0.218, 0.995]\nC: [0.041, 0.149, 0.22, 0.487]\nD: [0.078, 0.387, 0.43, 0.512]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_91_0.jpg", + "../MMIU-Benchmark/mevis/mevis_91_1.jpg", + "../MMIU-Benchmark/mevis/mevis_91_2.jpg", + "../MMIU-Benchmark/mevis/mevis_91_3.jpg", + "../MMIU-Benchmark/mevis/mevis_91_4.jpg", + "../MMIU-Benchmark/mevis/mevis_91_5.jpg", + "../MMIU-Benchmark/mevis/mevis_91_6.jpg", + "../MMIU-Benchmark/mevis/mevis_91_7.jpg", + "../MMIU-Benchmark/mevis/mevis_91_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.561, 0.561, 0.783, 0.887]\nB: [0.456, 0.659, 0.679, 0.985]\nC: [0.561, 0.561, 0.76, 0.854]\nD: [0.561, 0.561, 0.783, 0.858]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: sheep eating food from bowl then eating food from human hand", + "context": "Select from the following choices.\nA: [0.561, 0.561, 0.783, 0.887]\nB: [0.456, 0.659, 0.679, 0.985]\nC: [0.561, 0.561, 0.76, 0.854]\nD: [0.561, 0.561, 0.783, 0.858]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_92_0.jpg", + "../MMIU-Benchmark/mevis/mevis_92_1.jpg", + "../MMIU-Benchmark/mevis/mevis_92_2.jpg", + "../MMIU-Benchmark/mevis/mevis_92_3.jpg", + "../MMIU-Benchmark/mevis/mevis_92_4.jpg", + "../MMIU-Benchmark/mevis/mevis_92_5.jpg", + "../MMIU-Benchmark/mevis/mevis_92_6.jpg", + "../MMIU-Benchmark/mevis/mevis_92_7.jpg", + "../MMIU-Benchmark/mevis/mevis_92_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.027, 0.0, 0.999, 0.999]\nB: [0.349, 0.066, 0.418, 0.457]\nC: [0.002, 0.0, 0.974, 0.999]\nD: [0.027, 0.0, 0.853, 1.096]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The individual gripping the lizard.", + "context": "Select from the following choices.\nA: [0.027, 0.0, 0.999, 0.999]\nB: [0.349, 0.066, 0.418, 0.457]\nC: [0.002, 0.0, 0.974, 0.999]\nD: [0.027, 0.0, 0.853, 1.096]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_93_0.jpg", + "../MMIU-Benchmark/mevis/mevis_93_1.jpg", + "../MMIU-Benchmark/mevis/mevis_93_2.jpg", + "../MMIU-Benchmark/mevis/mevis_93_3.jpg", + "../MMIU-Benchmark/mevis/mevis_93_4.jpg", + "../MMIU-Benchmark/mevis/mevis_93_5.jpg", + "../MMIU-Benchmark/mevis/mevis_93_6.jpg", + "../MMIU-Benchmark/mevis/mevis_93_7.jpg", + "../MMIU-Benchmark/mevis/mevis_93_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.223, 0.608, 0.363, 1.0]\nB: [0.233, 0.388, 0.373, 0.779]\nC: [0.27, 0.458, 0.41, 0.85]\nD: [0.27, 0.458, 0.384, 0.852]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 636 and the height is 480.\nCAPTION: The tiger that moved from the right to the left.", + "context": "Select from the following choices.\nA: [0.223, 0.608, 0.363, 1.0]\nB: [0.233, 0.388, 0.373, 0.779]\nC: [0.27, 0.458, 0.41, 0.85]\nD: [0.27, 0.458, 0.384, 0.852]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_94_0.jpg", + "../MMIU-Benchmark/mevis/mevis_94_1.jpg", + "../MMIU-Benchmark/mevis/mevis_94_2.jpg", + "../MMIU-Benchmark/mevis/mevis_94_3.jpg", + "../MMIU-Benchmark/mevis/mevis_94_4.jpg", + "../MMIU-Benchmark/mevis/mevis_94_5.jpg", + "../MMIU-Benchmark/mevis/mevis_94_6.jpg", + "../MMIU-Benchmark/mevis/mevis_94_7.jpg", + "../MMIU-Benchmark/mevis/mevis_94_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.398, 0.841, 0.564, 1.0]\nB: [0.493, 0.807, 0.658, 0.967]\nC: [0.465, 0.84, 0.661, 1.019]\nD: [0.465, 0.84, 0.63, 0.999]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: fish stay still without moving position", + "context": "Select from the following choices.\nA: [0.398, 0.841, 0.564, 1.0]\nB: [0.493, 0.807, 0.658, 0.967]\nC: [0.465, 0.84, 0.661, 1.019]\nD: [0.465, 0.84, 0.63, 0.999]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_95_0.jpg", + "../MMIU-Benchmark/mevis/mevis_95_1.jpg", + "../MMIU-Benchmark/mevis/mevis_95_2.jpg", + "../MMIU-Benchmark/mevis/mevis_95_3.jpg", + "../MMIU-Benchmark/mevis/mevis_95_4.jpg", + "../MMIU-Benchmark/mevis/mevis_95_5.jpg", + "../MMIU-Benchmark/mevis/mevis_95_6.jpg", + "../MMIU-Benchmark/mevis/mevis_95_7.jpg", + "../MMIU-Benchmark/mevis/mevis_95_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.226, 0.56, 0.609, 0.968]\nB: [0.144, 0.591, 0.527, 0.998]\nC: [0.313, 0.592, 0.697, 1.0]\nD: [0.144, 0.591, 0.553, 0.941]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 596.\nCAPTION: Panda turning around and moving from leftmost to the middle then lying down to eat", + "context": "Select from the following choices.\nA: [0.226, 0.56, 0.609, 0.968]\nB: [0.144, 0.591, 0.527, 0.998]\nC: [0.313, 0.592, 0.697, 1.0]\nD: [0.144, 0.591, 0.553, 0.941]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_96_0.jpg", + "../MMIU-Benchmark/mevis/mevis_96_1.jpg", + "../MMIU-Benchmark/mevis/mevis_96_2.jpg", + "../MMIU-Benchmark/mevis/mevis_96_3.jpg", + "../MMIU-Benchmark/mevis/mevis_96_4.jpg", + "../MMIU-Benchmark/mevis/mevis_96_5.jpg", + "../MMIU-Benchmark/mevis/mevis_96_6.jpg", + "../MMIU-Benchmark/mevis/mevis_96_7.jpg", + "../MMIU-Benchmark/mevis/mevis_96_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.078, 0.251, 0.358, 0.994]\nB: [0.028, 0.251, 0.308, 0.994]\nC: [0.869, 0.508, 0.948, 0.934]\nD: [0.078, 0.251, 0.383, 1.017]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 362.\nCAPTION: The walking cow that was the first to approach.", + "context": "Select from the following choices.\nA: [0.078, 0.251, 0.358, 0.994]\nB: [0.028, 0.251, 0.308, 0.994]\nC: [0.869, 0.508, 0.948, 0.934]\nD: [0.078, 0.251, 0.383, 1.017]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_97_0.jpg", + "../MMIU-Benchmark/mevis/mevis_97_1.jpg", + "../MMIU-Benchmark/mevis/mevis_97_2.jpg", + "../MMIU-Benchmark/mevis/mevis_97_3.jpg", + "../MMIU-Benchmark/mevis/mevis_97_4.jpg", + "../MMIU-Benchmark/mevis/mevis_97_5.jpg", + "../MMIU-Benchmark/mevis/mevis_97_6.jpg", + "../MMIU-Benchmark/mevis/mevis_97_7.jpg", + "../MMIU-Benchmark/mevis/mevis_97_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.327, 0.329, 0.722, 0.667]\nB: [0.278, 0.179, 0.715, 0.517]\nC: [0.234, 0.444, 0.671, 0.781]\nD: [0.327, 0.329, 0.764, 0.667]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 636 and the height is 480.\nCAPTION: Tiger walking from right to left", + "context": "Select from the following choices.\nA: [0.327, 0.329, 0.722, 0.667]\nB: [0.278, 0.179, 0.715, 0.517]\nC: [0.234, 0.444, 0.671, 0.781]\nD: [0.327, 0.329, 0.764, 0.667]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_98_0.jpg", + "../MMIU-Benchmark/mevis/mevis_98_1.jpg", + "../MMIU-Benchmark/mevis/mevis_98_2.jpg", + "../MMIU-Benchmark/mevis/mevis_98_3.jpg", + "../MMIU-Benchmark/mevis/mevis_98_4.jpg", + "../MMIU-Benchmark/mevis/mevis_98_5.jpg", + "../MMIU-Benchmark/mevis/mevis_98_6.jpg", + "../MMIU-Benchmark/mevis/mevis_98_7.jpg", + "../MMIU-Benchmark/mevis/mevis_98_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.595, 0.534, 0.818, 0.857]\nB: [0.297, 0.076, 0.501, 1.018]\nC: [0.259, 0.125, 0.494, 0.987]\nD: [0.297, 0.076, 0.532, 0.938]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: cat does not change position but lowered head", + "context": "Select from the following choices.\nA: [0.595, 0.534, 0.818, 0.857]\nB: [0.297, 0.076, 0.501, 1.018]\nC: [0.259, 0.125, 0.494, 0.987]\nD: [0.297, 0.076, 0.532, 0.938]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_99_0.jpg", + "../MMIU-Benchmark/mevis/mevis_99_1.jpg", + "../MMIU-Benchmark/mevis/mevis_99_2.jpg", + "../MMIU-Benchmark/mevis/mevis_99_3.jpg", + "../MMIU-Benchmark/mevis/mevis_99_4.jpg", + "../MMIU-Benchmark/mevis/mevis_99_5.jpg", + "../MMIU-Benchmark/mevis/mevis_99_6.jpg", + "../MMIU-Benchmark/mevis/mevis_99_7.jpg", + "../MMIU-Benchmark/mevis/mevis_99_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.123, 0.256, 0.179, 0.609]\nB: [0.123, 0.256, 0.182, 0.572]\nC: [0.112, 0.396, 0.179, 0.707]\nD: [0.123, 0.256, 0.19, 0.567]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 2340 and the height is 1080.\nCAPTION: The trainer on the left, guiding the bear to perform the hula hoop trick using its mouth.", + "context": "Select from the following choices.\nA: [0.123, 0.256, 0.179, 0.609]\nB: [0.123, 0.256, 0.182, 0.572]\nC: [0.112, 0.396, 0.179, 0.707]\nD: [0.123, 0.256, 0.19, 0.567]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_100_0.jpg", + "../MMIU-Benchmark/mevis/mevis_100_1.jpg", + "../MMIU-Benchmark/mevis/mevis_100_2.jpg", + "../MMIU-Benchmark/mevis/mevis_100_3.jpg", + "../MMIU-Benchmark/mevis/mevis_100_4.jpg", + "../MMIU-Benchmark/mevis/mevis_100_5.jpg", + "../MMIU-Benchmark/mevis/mevis_100_6.jpg", + "../MMIU-Benchmark/mevis/mevis_100_7.jpg", + "../MMIU-Benchmark/mevis/mevis_100_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.667, 0.375, 0.726, 0.688]\nB: [0.667, 0.375, 0.742, 0.667]\nC: [0.667, 0.375, 0.74, 0.696]\nD: [0.672, 0.296, 0.745, 0.617]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 480.\nCAPTION: The bird that is standing still on the wooden pole in the right cage.", + "context": "Select from the following choices.\nA: [0.667, 0.375, 0.726, 0.688]\nB: [0.667, 0.375, 0.742, 0.667]\nC: [0.667, 0.375, 0.74, 0.696]\nD: [0.672, 0.296, 0.745, 0.617]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_101_0.jpg", + "../MMIU-Benchmark/mevis/mevis_101_1.jpg", + "../MMIU-Benchmark/mevis/mevis_101_2.jpg", + "../MMIU-Benchmark/mevis/mevis_101_3.jpg", + "../MMIU-Benchmark/mevis/mevis_101_4.jpg", + "../MMIU-Benchmark/mevis/mevis_101_5.jpg", + "../MMIU-Benchmark/mevis/mevis_101_6.jpg", + "../MMIU-Benchmark/mevis/mevis_101_7.jpg", + "../MMIU-Benchmark/mevis/mevis_101_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.338, 0.908, 0.401, 0.919]\nB: [0.232, 0.002, 1.0, 1.0]\nC: [0.232, 0.0, 1.0, 0.998]\nD: [0.231, 0.0, 0.999, 0.998]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: turtle does not chang position, eating sands then look up", + "context": "Select from the following choices.\nA: [0.338, 0.908, 0.401, 0.919]\nB: [0.232, 0.002, 1.0, 1.0]\nC: [0.232, 0.0, 1.0, 0.998]\nD: [0.231, 0.0, 0.999, 0.998]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_102_0.jpg", + "../MMIU-Benchmark/mevis/mevis_102_1.jpg", + "../MMIU-Benchmark/mevis/mevis_102_2.jpg", + "../MMIU-Benchmark/mevis/mevis_102_3.jpg", + "../MMIU-Benchmark/mevis/mevis_102_4.jpg", + "../MMIU-Benchmark/mevis/mevis_102_5.jpg", + "../MMIU-Benchmark/mevis/mevis_102_6.jpg", + "../MMIU-Benchmark/mevis/mevis_102_7.jpg", + "../MMIU-Benchmark/mevis/mevis_102_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.318, 0.467, 0.571, 0.866]\nB: [0.435, 0.299, 0.688, 0.698]\nC: [0.122, 0.596, 0.486, 0.725]\nD: [0.435, 0.299, 0.668, 0.681]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The lizard that was finally picked up by hand.", + "context": "Select from the following choices.\nA: [0.318, 0.467, 0.571, 0.866]\nB: [0.435, 0.299, 0.688, 0.698]\nC: [0.122, 0.596, 0.486, 0.725]\nD: [0.435, 0.299, 0.668, 0.681]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_103_0.jpg", + "../MMIU-Benchmark/mevis/mevis_103_1.jpg", + "../MMIU-Benchmark/mevis/mevis_103_2.jpg", + "../MMIU-Benchmark/mevis/mevis_103_3.jpg", + "../MMIU-Benchmark/mevis/mevis_103_4.jpg", + "../MMIU-Benchmark/mevis/mevis_103_5.jpg", + "../MMIU-Benchmark/mevis/mevis_103_6.jpg", + "../MMIU-Benchmark/mevis/mevis_103_7.jpg", + "../MMIU-Benchmark/mevis/mevis_103_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.122, 0.013, 0.556, 1.099]\nB: [0.239, 0.853, 0.337, 0.969]\nC: [0.377, 0.331, 0.759, 0.392]\nD: [0.122, 0.013, 0.617, 0.943]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The cat that first climbed the cat tree.", + "context": "Select from the following choices.\nA: [0.122, 0.013, 0.556, 1.099]\nB: [0.239, 0.853, 0.337, 0.969]\nC: [0.377, 0.331, 0.759, 0.392]\nD: [0.122, 0.013, 0.617, 0.943]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_104_0.jpg", + "../MMIU-Benchmark/mevis/mevis_104_1.jpg", + "../MMIU-Benchmark/mevis/mevis_104_2.jpg", + "../MMIU-Benchmark/mevis/mevis_104_3.jpg", + "../MMIU-Benchmark/mevis/mevis_104_4.jpg", + "../MMIU-Benchmark/mevis/mevis_104_5.jpg", + "../MMIU-Benchmark/mevis/mevis_104_6.jpg", + "../MMIU-Benchmark/mevis/mevis_104_7.jpg", + "../MMIU-Benchmark/mevis/mevis_104_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.085, 0.445, 0.406, 1.039]\nB: [0.085, 0.445, 0.37, 0.998]\nC: [0.086, 0.446, 0.37, 1.0]\nD: [0.185, 0.292, 0.47, 0.846]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 596.\nCAPTION: Panda turning around and moving forward from leftmost to rightmost", + "context": "Select from the following choices.\nA: [0.085, 0.445, 0.406, 1.039]\nB: [0.085, 0.445, 0.37, 0.998]\nC: [0.086, 0.446, 0.37, 1.0]\nD: [0.185, 0.292, 0.47, 0.846]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_105_0.jpg", + "../MMIU-Benchmark/mevis/mevis_105_1.jpg", + "../MMIU-Benchmark/mevis/mevis_105_2.jpg", + "../MMIU-Benchmark/mevis/mevis_105_3.jpg", + "../MMIU-Benchmark/mevis/mevis_105_4.jpg", + "../MMIU-Benchmark/mevis/mevis_105_5.jpg", + "../MMIU-Benchmark/mevis/mevis_105_6.jpg", + "../MMIU-Benchmark/mevis/mevis_105_7.jpg", + "../MMIU-Benchmark/mevis/mevis_105_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.216, 0.701, 0.324, 1.0]\nB: [0.214, 0.693, 0.322, 0.992]\nC: [0.214, 0.571, 0.322, 0.87]\nD: [0.24, 0.699, 0.348, 0.998]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The bird that reached the cage bottom before others.", + "context": "Select from the following choices.\nA: [0.216, 0.701, 0.324, 1.0]\nB: [0.214, 0.693, 0.322, 0.992]\nC: [0.214, 0.571, 0.322, 0.87]\nD: [0.24, 0.699, 0.348, 0.998]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_106_0.jpg", + "../MMIU-Benchmark/mevis/mevis_106_1.jpg", + "../MMIU-Benchmark/mevis/mevis_106_2.jpg", + "../MMIU-Benchmark/mevis/mevis_106_3.jpg", + "../MMIU-Benchmark/mevis/mevis_106_4.jpg", + "../MMIU-Benchmark/mevis/mevis_106_5.jpg", + "../MMIU-Benchmark/mevis/mevis_106_6.jpg", + "../MMIU-Benchmark/mevis/mevis_106_7.jpg", + "../MMIU-Benchmark/mevis/mevis_106_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.587, 0.322, 0.825, 0.853]\nB: [0.272, 0.179, 0.45, 0.271]\nC: [0.63, 0.44, 0.868, 0.971]\nD: [0.587, 0.322, 0.799, 0.957]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: monkey crawling around on rocks then jumping to the left", + "context": "Select from the following choices.\nA: [0.587, 0.322, 0.825, 0.853]\nB: [0.272, 0.179, 0.45, 0.271]\nC: [0.63, 0.44, 0.868, 0.971]\nD: [0.587, 0.322, 0.799, 0.957]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_107_0.jpg", + "../MMIU-Benchmark/mevis/mevis_107_1.jpg", + "../MMIU-Benchmark/mevis/mevis_107_2.jpg", + "../MMIU-Benchmark/mevis/mevis_107_3.jpg", + "../MMIU-Benchmark/mevis/mevis_107_4.jpg", + "../MMIU-Benchmark/mevis/mevis_107_5.jpg", + "../MMIU-Benchmark/mevis/mevis_107_6.jpg", + "../MMIU-Benchmark/mevis/mevis_107_7.jpg", + "../MMIU-Benchmark/mevis/mevis_107_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.196, 0.0, 0.481, 0.836]\nB: [0.092, 0.0, 0.377, 0.836]\nC: [0.196, 0.0, 0.463, 0.77]\nD: [0.156, 0.0, 0.442, 0.836]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: Child walking and holding dog", + "context": "Select from the following choices.\nA: [0.196, 0.0, 0.481, 0.836]\nB: [0.092, 0.0, 0.377, 0.836]\nC: [0.196, 0.0, 0.463, 0.77]\nD: [0.156, 0.0, 0.442, 0.836]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_108_0.jpg", + "../MMIU-Benchmark/mevis/mevis_108_1.jpg", + "../MMIU-Benchmark/mevis/mevis_108_2.jpg", + "../MMIU-Benchmark/mevis/mevis_108_3.jpg", + "../MMIU-Benchmark/mevis/mevis_108_4.jpg", + "../MMIU-Benchmark/mevis/mevis_108_5.jpg", + "../MMIU-Benchmark/mevis/mevis_108_6.jpg", + "../MMIU-Benchmark/mevis/mevis_108_7.jpg", + "../MMIU-Benchmark/mevis/mevis_108_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.613, 0.489, 0.919, 0.603]\nB: [0.47, 0.017, 0.69, 0.178]\nC: [0.613, 0.489, 0.871, 0.598]\nD: [0.613, 0.489, 0.891, 0.598]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 604 and the height is 1280.\nCAPTION: dog playing with monkey", + "context": "Select from the following choices.\nA: [0.613, 0.489, 0.919, 0.603]\nB: [0.47, 0.017, 0.69, 0.178]\nC: [0.613, 0.489, 0.871, 0.598]\nD: [0.613, 0.489, 0.891, 0.598]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_109_0.jpg", + "../MMIU-Benchmark/mevis/mevis_109_1.jpg", + "../MMIU-Benchmark/mevis/mevis_109_2.jpg", + "../MMIU-Benchmark/mevis/mevis_109_3.jpg", + "../MMIU-Benchmark/mevis/mevis_109_4.jpg", + "../MMIU-Benchmark/mevis/mevis_109_5.jpg", + "../MMIU-Benchmark/mevis/mevis_109_6.jpg", + "../MMIU-Benchmark/mevis/mevis_109_7.jpg", + "../MMIU-Benchmark/mevis/mevis_109_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.53, 0.183, 0.836, 0.911]\nB: [0.53, 0.183, 0.866, 0.99]\nC: [0.384, 0.272, 0.69, 1.0]\nD: [0.396, 0.0, 0.702, 0.728]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The darker-colored one among the two dogs playing together.", + "context": "Select from the following choices.\nA: [0.53, 0.183, 0.836, 0.911]\nB: [0.53, 0.183, 0.866, 0.99]\nC: [0.384, 0.272, 0.69, 1.0]\nD: [0.396, 0.0, 0.702, 0.728]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_110_0.jpg", + "../MMIU-Benchmark/mevis/mevis_110_1.jpg", + "../MMIU-Benchmark/mevis/mevis_110_2.jpg", + "../MMIU-Benchmark/mevis/mevis_110_3.jpg", + "../MMIU-Benchmark/mevis/mevis_110_4.jpg", + "../MMIU-Benchmark/mevis/mevis_110_5.jpg", + "../MMIU-Benchmark/mevis/mevis_110_6.jpg", + "../MMIU-Benchmark/mevis/mevis_110_7.jpg", + "../MMIU-Benchmark/mevis/mevis_110_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.067, 0.082, 0.288, 0.254]\nB: [0.72, 0.243, 0.741, 0.544]\nC: [0.961, 0.199, 1.001, 0.396]\nD: [0.961, 0.199, 0.999, 0.386]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The black car parked without moving.", + "context": "Select from the following choices.\nA: [0.067, 0.082, 0.288, 0.254]\nB: [0.72, 0.243, 0.741, 0.544]\nC: [0.961, 0.199, 1.001, 0.396]\nD: [0.961, 0.199, 0.999, 0.386]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_111_0.jpg", + "../MMIU-Benchmark/mevis/mevis_111_1.jpg", + "../MMIU-Benchmark/mevis/mevis_111_2.jpg", + "../MMIU-Benchmark/mevis/mevis_111_3.jpg", + "../MMIU-Benchmark/mevis/mevis_111_4.jpg", + "../MMIU-Benchmark/mevis/mevis_111_5.jpg", + "../MMIU-Benchmark/mevis/mevis_111_6.jpg", + "../MMIU-Benchmark/mevis/mevis_111_7.jpg", + "../MMIU-Benchmark/mevis/mevis_111_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.177, 0.391, 0.631, 0.549]\nB: [0.291, 0.486, 0.47, 0.681]\nC: [0.263, 0.063, 0.436, 0.385]\nD: [0.243, 0.53, 0.422, 0.724]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1080 and the height is 1152.\nCAPTION: The big bear is moving with three small bear cubs in tow across the road.", + "context": "Select from the following choices.\nA: [0.177, 0.391, 0.631, 0.549]\nB: [0.291, 0.486, 0.47, 0.681]\nC: [0.263, 0.063, 0.436, 0.385]\nD: [0.243, 0.53, 0.422, 0.724]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_112_0.jpg", + "../MMIU-Benchmark/mevis/mevis_112_1.jpg", + "../MMIU-Benchmark/mevis/mevis_112_2.jpg", + "../MMIU-Benchmark/mevis/mevis_112_3.jpg", + "../MMIU-Benchmark/mevis/mevis_112_4.jpg", + "../MMIU-Benchmark/mevis/mevis_112_5.jpg", + "../MMIU-Benchmark/mevis/mevis_112_6.jpg", + "../MMIU-Benchmark/mevis/mevis_112_7.jpg", + "../MMIU-Benchmark/mevis/mevis_112_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.826, 0.279, 0.853, 0.622]\nB: [0.826, 0.279, 0.857, 0.632]\nC: [0.391, 0.739, 0.438, 0.837]\nD: [0.826, 0.279, 0.861, 0.641]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 2340 and the height is 1080.\nCAPTION: A man clapping and stepping back while next to the hula-hooping bear.", + "context": "Select from the following choices.\nA: [0.826, 0.279, 0.853, 0.622]\nB: [0.826, 0.279, 0.857, 0.632]\nC: [0.391, 0.739, 0.438, 0.837]\nD: [0.826, 0.279, 0.861, 0.641]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_113_0.jpg", + "../MMIU-Benchmark/mevis/mevis_113_1.jpg", + "../MMIU-Benchmark/mevis/mevis_113_2.jpg", + "../MMIU-Benchmark/mevis/mevis_113_3.jpg", + "../MMIU-Benchmark/mevis/mevis_113_4.jpg", + "../MMIU-Benchmark/mevis/mevis_113_5.jpg", + "../MMIU-Benchmark/mevis/mevis_113_6.jpg", + "../MMIU-Benchmark/mevis/mevis_113_7.jpg", + "../MMIU-Benchmark/mevis/mevis_113_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.479, 0.202, 0.684, 0.675]\nB: [0.285, 0.381, 0.652, 0.762]\nC: [0.416, 0.167, 0.621, 0.639]\nD: [0.479, 0.202, 0.705, 0.627]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 512 and the height is 252.\nCAPTION: Horse moving around", + "context": "Select from the following choices.\nA: [0.479, 0.202, 0.684, 0.675]\nB: [0.285, 0.381, 0.652, 0.762]\nC: [0.416, 0.167, 0.621, 0.639]\nD: [0.479, 0.202, 0.705, 0.627]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_114_0.jpg", + "../MMIU-Benchmark/mevis/mevis_114_1.jpg", + "../MMIU-Benchmark/mevis/mevis_114_2.jpg", + "../MMIU-Benchmark/mevis/mevis_114_3.jpg", + "../MMIU-Benchmark/mevis/mevis_114_4.jpg", + "../MMIU-Benchmark/mevis/mevis_114_5.jpg", + "../MMIU-Benchmark/mevis/mevis_114_6.jpg", + "../MMIU-Benchmark/mevis/mevis_114_7.jpg", + "../MMIU-Benchmark/mevis/mevis_114_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.127, 0.294, 0.187, 0.575]\nB: [0.101, 0.391, 0.16, 0.672]\nC: [0.113, 0.282, 0.162, 0.612]\nD: [0.113, 0.282, 0.173, 0.564]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 2340 and the height is 1080.\nCAPTION: Man walking backwards and taking a hoop from a bear", + "context": "Select from the following choices.\nA: [0.127, 0.294, 0.187, 0.575]\nB: [0.101, 0.391, 0.16, 0.672]\nC: [0.113, 0.282, 0.162, 0.612]\nD: [0.113, 0.282, 0.173, 0.564]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_115_0.jpg", + "../MMIU-Benchmark/mevis/mevis_115_1.jpg", + "../MMIU-Benchmark/mevis/mevis_115_2.jpg", + "../MMIU-Benchmark/mevis/mevis_115_3.jpg", + "../MMIU-Benchmark/mevis/mevis_115_4.jpg", + "../MMIU-Benchmark/mevis/mevis_115_5.jpg", + "../MMIU-Benchmark/mevis/mevis_115_6.jpg", + "../MMIU-Benchmark/mevis/mevis_115_7.jpg", + "../MMIU-Benchmark/mevis/mevis_115_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.714, 0.632, 0.748, 0.746]\nB: [0.714, 0.632, 0.746, 0.732]\nC: [0.714, 0.632, 0.74, 0.733]\nD: [0.714, 0.632, 0.751, 0.748]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: The running horse next to the white fence.", + "context": "Select from the following choices.\nA: [0.714, 0.632, 0.748, 0.746]\nB: [0.714, 0.632, 0.746, 0.732]\nC: [0.714, 0.632, 0.74, 0.733]\nD: [0.714, 0.632, 0.751, 0.748]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_116_0.jpg", + "../MMIU-Benchmark/mevis/mevis_116_1.jpg", + "../MMIU-Benchmark/mevis/mevis_116_2.jpg", + "../MMIU-Benchmark/mevis/mevis_116_3.jpg", + "../MMIU-Benchmark/mevis/mevis_116_4.jpg", + "../MMIU-Benchmark/mevis/mevis_116_5.jpg", + "../MMIU-Benchmark/mevis/mevis_116_6.jpg", + "../MMIU-Benchmark/mevis/mevis_116_7.jpg", + "../MMIU-Benchmark/mevis/mevis_116_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.032, 0.599, 0.339, 0.829]\nB: [0.376, 0.424, 0.446, 0.621]\nC: [0.359, 0.484, 0.43, 0.681]\nD: [0.477, 0.795, 0.566, 0.988]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: elephant in the distance moving to left", + "context": "Select from the following choices.\nA: [0.032, 0.599, 0.339, 0.829]\nB: [0.376, 0.424, 0.446, 0.621]\nC: [0.359, 0.484, 0.43, 0.681]\nD: [0.477, 0.795, 0.566, 0.988]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_117_0.jpg", + "../MMIU-Benchmark/mevis/mevis_117_1.jpg", + "../MMIU-Benchmark/mevis/mevis_117_2.jpg", + "../MMIU-Benchmark/mevis/mevis_117_3.jpg", + "../MMIU-Benchmark/mevis/mevis_117_4.jpg", + "../MMIU-Benchmark/mevis/mevis_117_5.jpg", + "../MMIU-Benchmark/mevis/mevis_117_6.jpg", + "../MMIU-Benchmark/mevis/mevis_117_7.jpg", + "../MMIU-Benchmark/mevis/mevis_117_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.692, 0.905, 0.802, 0.999]\nB: [0.712, 0.881, 0.822, 0.976]\nC: [0.709, 0.906, 0.819, 1.0]\nD: [0.139, 0.554, 0.377, 0.619]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: black eating rabbit on the rightmost", + "context": "Select from the following choices.\nA: [0.692, 0.905, 0.802, 0.999]\nB: [0.712, 0.881, 0.822, 0.976]\nC: [0.709, 0.906, 0.819, 1.0]\nD: [0.139, 0.554, 0.377, 0.619]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_118_0.jpg", + "../MMIU-Benchmark/mevis/mevis_118_1.jpg", + "../MMIU-Benchmark/mevis/mevis_118_2.jpg", + "../MMIU-Benchmark/mevis/mevis_118_3.jpg", + "../MMIU-Benchmark/mevis/mevis_118_4.jpg", + "../MMIU-Benchmark/mevis/mevis_118_5.jpg", + "../MMIU-Benchmark/mevis/mevis_118_6.jpg", + "../MMIU-Benchmark/mevis/mevis_118_7.jpg", + "../MMIU-Benchmark/mevis/mevis_118_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.024, 0.528, 0.244, 0.761]\nB: [0.589, 0.096, 0.719, 0.403]\nC: [0.108, 0.634, 0.328, 0.868]\nD: [0.631, 0.408, 0.876, 0.443]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: turtle swimming right", + "context": "Select from the following choices.\nA: [0.024, 0.528, 0.244, 0.761]\nB: [0.589, 0.096, 0.719, 0.403]\nC: [0.108, 0.634, 0.328, 0.868]\nD: [0.631, 0.408, 0.876, 0.443]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_119_0.jpg", + "../MMIU-Benchmark/mevis/mevis_119_1.jpg", + "../MMIU-Benchmark/mevis/mevis_119_2.jpg", + "../MMIU-Benchmark/mevis/mevis_119_3.jpg", + "../MMIU-Benchmark/mevis/mevis_119_4.jpg", + "../MMIU-Benchmark/mevis/mevis_119_5.jpg", + "../MMIU-Benchmark/mevis/mevis_119_6.jpg", + "../MMIU-Benchmark/mevis/mevis_119_7.jpg", + "../MMIU-Benchmark/mevis/mevis_119_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.434, 0.53, 0.636, 0.854]\nB: [0.342, 0.586, 0.588, 0.948]\nC: [0.472, 0.373, 0.717, 0.735]\nD: [0.434, 0.53, 0.68, 0.892]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 362.\nCAPTION: Small cow walking to the front", + "context": "Select from the following choices.\nA: [0.434, 0.53, 0.636, 0.854]\nB: [0.342, 0.586, 0.588, 0.948]\nC: [0.472, 0.373, 0.717, 0.735]\nD: [0.434, 0.53, 0.68, 0.892]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_120_0.jpg", + "../MMIU-Benchmark/mevis/mevis_120_1.jpg", + "../MMIU-Benchmark/mevis/mevis_120_2.jpg", + "../MMIU-Benchmark/mevis/mevis_120_3.jpg", + "../MMIU-Benchmark/mevis/mevis_120_4.jpg", + "../MMIU-Benchmark/mevis/mevis_120_5.jpg", + "../MMIU-Benchmark/mevis/mevis_120_6.jpg", + "../MMIU-Benchmark/mevis/mevis_120_7.jpg", + "../MMIU-Benchmark/mevis/mevis_120_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.0, 0.001, 0.165, 1.0]\nB: [0.61, 0.789, 0.838, 0.937]\nC: [0.03, 0.0, 0.195, 0.999]\nD: [0.0, 0.0, 0.165, 0.999]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 945.\nCAPTION: Person showing up in the final and extending arm forward", + "context": "Select from the following choices.\nA: [0.0, 0.001, 0.165, 1.0]\nB: [0.61, 0.789, 0.838, 0.937]\nC: [0.03, 0.0, 0.195, 0.999]\nD: [0.0, 0.0, 0.165, 0.999]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_121_0.jpg", + "../MMIU-Benchmark/mevis/mevis_121_1.jpg", + "../MMIU-Benchmark/mevis/mevis_121_2.jpg", + "../MMIU-Benchmark/mevis/mevis_121_3.jpg", + "../MMIU-Benchmark/mevis/mevis_121_4.jpg", + "../MMIU-Benchmark/mevis/mevis_121_5.jpg", + "../MMIU-Benchmark/mevis/mevis_121_6.jpg", + "../MMIU-Benchmark/mevis/mevis_121_7.jpg", + "../MMIU-Benchmark/mevis/mevis_121_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.349, 0.14, 0.56, 0.604]\nB: [0.188, 0.331, 0.589, 0.633]\nC: [0.665, 0.365, 0.74, 0.694]\nD: [0.665, 0.365, 0.747, 0.71]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 480.\nCAPTION: bird standing still on bamboo pole", + "context": "Select from the following choices.\nA: [0.349, 0.14, 0.56, 0.604]\nB: [0.188, 0.331, 0.589, 0.633]\nC: [0.665, 0.365, 0.74, 0.694]\nD: [0.665, 0.365, 0.747, 0.71]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_122_0.jpg", + "../MMIU-Benchmark/mevis/mevis_122_1.jpg", + "../MMIU-Benchmark/mevis/mevis_122_2.jpg", + "../MMIU-Benchmark/mevis/mevis_122_3.jpg", + "../MMIU-Benchmark/mevis/mevis_122_4.jpg", + "../MMIU-Benchmark/mevis/mevis_122_5.jpg", + "../MMIU-Benchmark/mevis/mevis_122_6.jpg", + "../MMIU-Benchmark/mevis/mevis_122_7.jpg", + "../MMIU-Benchmark/mevis/mevis_122_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.122, 0.308, 0.178, 0.597]\nB: [0.122, 0.308, 0.188, 0.552]\nC: [0.12, 0.26, 0.176, 0.549]\nD: [0.122, 0.308, 0.174, 0.604]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 2340 and the height is 1080.\nCAPTION: The bear on the left turning the hula hoop with its head", + "context": "Select from the following choices.\nA: [0.122, 0.308, 0.178, 0.597]\nB: [0.122, 0.308, 0.188, 0.552]\nC: [0.12, 0.26, 0.176, 0.549]\nD: [0.122, 0.308, 0.174, 0.604]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_123_0.jpg", + "../MMIU-Benchmark/mevis/mevis_123_1.jpg", + "../MMIU-Benchmark/mevis/mevis_123_2.jpg", + "../MMIU-Benchmark/mevis/mevis_123_3.jpg", + "../MMIU-Benchmark/mevis/mevis_123_4.jpg", + "../MMIU-Benchmark/mevis/mevis_123_5.jpg", + "../MMIU-Benchmark/mevis/mevis_123_6.jpg", + "../MMIU-Benchmark/mevis/mevis_123_7.jpg", + "../MMIU-Benchmark/mevis/mevis_123_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.348, 0.444, 0.829, 0.892]\nB: [0.008, 0.375, 0.028, 0.477]\nC: [0.009, 0.29, 0.029, 0.392]\nD: [0.0, 0.329, 0.02, 0.431]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 480.\nCAPTION: The bird that is standing still on the wooden perch inside the cage on the left side.", + "context": "Select from the following choices.\nA: [0.348, 0.444, 0.829, 0.892]\nB: [0.008, 0.375, 0.028, 0.477]\nC: [0.009, 0.29, 0.029, 0.392]\nD: [0.0, 0.329, 0.02, 0.431]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_124_0.jpg", + "../MMIU-Benchmark/mevis/mevis_124_1.jpg", + "../MMIU-Benchmark/mevis/mevis_124_2.jpg", + "../MMIU-Benchmark/mevis/mevis_124_3.jpg", + "../MMIU-Benchmark/mevis/mevis_124_4.jpg", + "../MMIU-Benchmark/mevis/mevis_124_5.jpg", + "../MMIU-Benchmark/mevis/mevis_124_6.jpg", + "../MMIU-Benchmark/mevis/mevis_124_7.jpg", + "../MMIU-Benchmark/mevis/mevis_124_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.435, 0.353, 0.534, 0.581]\nB: [0.385, 0.399, 0.498, 0.685]\nC: [0.435, 0.353, 0.548, 0.639]\nD: [0.435, 0.353, 0.545, 0.657]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: elephant walking ahead", + "context": "Select from the following choices.\nA: [0.435, 0.353, 0.534, 0.581]\nB: [0.385, 0.399, 0.498, 0.685]\nC: [0.435, 0.353, 0.548, 0.639]\nD: [0.435, 0.353, 0.545, 0.657]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_125_0.jpg", + "../MMIU-Benchmark/mevis/mevis_125_1.jpg", + "../MMIU-Benchmark/mevis/mevis_125_2.jpg", + "../MMIU-Benchmark/mevis/mevis_125_3.jpg", + "../MMIU-Benchmark/mevis/mevis_125_4.jpg", + "../MMIU-Benchmark/mevis/mevis_125_5.jpg", + "../MMIU-Benchmark/mevis/mevis_125_6.jpg", + "../MMIU-Benchmark/mevis/mevis_125_7.jpg", + "../MMIU-Benchmark/mevis/mevis_125_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.706, 0.289, 0.802, 0.651]\nB: [0.716, 0.293, 0.812, 0.655]\nC: [0.675, 0.248, 0.771, 0.61]\nD: [0.716, 0.293, 0.802, 0.694]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: little girl riding a bicycle", + "context": "Select from the following choices.\nA: [0.706, 0.289, 0.802, 0.651]\nB: [0.716, 0.293, 0.812, 0.655]\nC: [0.675, 0.248, 0.771, 0.61]\nD: [0.716, 0.293, 0.802, 0.694]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_126_0.jpg", + "../MMIU-Benchmark/mevis/mevis_126_1.jpg", + "../MMIU-Benchmark/mevis/mevis_126_2.jpg", + "../MMIU-Benchmark/mevis/mevis_126_3.jpg", + "../MMIU-Benchmark/mevis/mevis_126_4.jpg", + "../MMIU-Benchmark/mevis/mevis_126_5.jpg", + "../MMIU-Benchmark/mevis/mevis_126_6.jpg", + "../MMIU-Benchmark/mevis/mevis_126_7.jpg", + "../MMIU-Benchmark/mevis/mevis_126_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.139, 0.183, 0.458, 0.93]\nB: [0.421, 0.427, 0.828, 0.869]\nC: [0.139, 0.183, 0.51, 1.015]\nD: [0.19, 0.254, 0.509, 1.0]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: cat sitting on the leftmost without moving position", + "context": "Select from the following choices.\nA: [0.139, 0.183, 0.458, 0.93]\nB: [0.421, 0.427, 0.828, 0.869]\nC: [0.139, 0.183, 0.51, 1.015]\nD: [0.19, 0.254, 0.509, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_127_0.jpg", + "../MMIU-Benchmark/mevis/mevis_127_1.jpg", + "../MMIU-Benchmark/mevis/mevis_127_2.jpg", + "../MMIU-Benchmark/mevis/mevis_127_3.jpg", + "../MMIU-Benchmark/mevis/mevis_127_4.jpg", + "../MMIU-Benchmark/mevis/mevis_127_5.jpg", + "../MMIU-Benchmark/mevis/mevis_127_6.jpg", + "../MMIU-Benchmark/mevis/mevis_127_7.jpg", + "../MMIU-Benchmark/mevis/mevis_127_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.477, 0.429, 0.921, 0.702]\nB: [0.386, 0.437, 0.622, 0.575]\nC: [0.386, 0.437, 0.594, 0.569]\nD: [0.055, 0.35, 0.388, 0.759]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The first sheep that is eating food from the man's hand.", + "context": "Select from the following choices.\nA: [0.477, 0.429, 0.921, 0.702]\nB: [0.386, 0.437, 0.622, 0.575]\nC: [0.386, 0.437, 0.594, 0.569]\nD: [0.055, 0.35, 0.388, 0.759]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_128_0.jpg", + "../MMIU-Benchmark/mevis/mevis_128_1.jpg", + "../MMIU-Benchmark/mevis/mevis_128_2.jpg", + "../MMIU-Benchmark/mevis/mevis_128_3.jpg", + "../MMIU-Benchmark/mevis/mevis_128_4.jpg", + "../MMIU-Benchmark/mevis/mevis_128_5.jpg", + "../MMIU-Benchmark/mevis/mevis_128_6.jpg", + "../MMIU-Benchmark/mevis/mevis_128_7.jpg", + "../MMIU-Benchmark/mevis/mevis_128_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.497, 0.478, 0.571, 0.551]\nB: [0.472, 0.45, 0.546, 0.524]\nC: [0.068, 0.333, 0.529, 0.676]\nD: [0.45, 0.46, 0.524, 0.533]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: black SUV moving directly forward across the road", + "context": "Select from the following choices.\nA: [0.497, 0.478, 0.571, 0.551]\nB: [0.472, 0.45, 0.546, 0.524]\nC: [0.068, 0.333, 0.529, 0.676]\nD: [0.45, 0.46, 0.524, 0.533]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_129_0.jpg", + "../MMIU-Benchmark/mevis/mevis_129_1.jpg", + "../MMIU-Benchmark/mevis/mevis_129_2.jpg", + "../MMIU-Benchmark/mevis/mevis_129_3.jpg", + "../MMIU-Benchmark/mevis/mevis_129_4.jpg", + "../MMIU-Benchmark/mevis/mevis_129_5.jpg", + "../MMIU-Benchmark/mevis/mevis_129_6.jpg", + "../MMIU-Benchmark/mevis/mevis_129_7.jpg", + "../MMIU-Benchmark/mevis/mevis_129_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.692, 0.905, 0.802, 0.999]\nB: [0.692, 0.905, 0.806, 1.003]\nC: [0.482, 0.202, 0.765, 0.613]\nD: [0.692, 0.905, 0.793, 0.99]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: black rabbit eating", + "context": "Select from the following choices.\nA: [0.692, 0.905, 0.802, 0.999]\nB: [0.692, 0.905, 0.806, 1.003]\nC: [0.482, 0.202, 0.765, 0.613]\nD: [0.692, 0.905, 0.793, 0.99]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_130_0.jpg", + "../MMIU-Benchmark/mevis/mevis_130_1.jpg", + "../MMIU-Benchmark/mevis/mevis_130_2.jpg", + "../MMIU-Benchmark/mevis/mevis_130_3.jpg", + "../MMIU-Benchmark/mevis/mevis_130_4.jpg", + "../MMIU-Benchmark/mevis/mevis_130_5.jpg", + "../MMIU-Benchmark/mevis/mevis_130_6.jpg", + "../MMIU-Benchmark/mevis/mevis_130_7.jpg", + "../MMIU-Benchmark/mevis/mevis_130_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.449, 0.012, 1.0, 0.501]\nB: [0.448, 0.0, 0.942, 0.505]\nC: [0.449, 0.0, 1.0, 0.489]\nD: [0.448, 0.0, 0.999, 0.489]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1440 and the height is 1080.\nCAPTION: puppy that overwhelms another puppy", + "context": "Select from the following choices.\nA: [0.449, 0.012, 1.0, 0.501]\nB: [0.448, 0.0, 0.942, 0.505]\nC: [0.449, 0.0, 1.0, 0.489]\nD: [0.448, 0.0, 0.999, 0.489]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_131_0.jpg", + "../MMIU-Benchmark/mevis/mevis_131_1.jpg", + "../MMIU-Benchmark/mevis/mevis_131_2.jpg", + "../MMIU-Benchmark/mevis/mevis_131_3.jpg", + "../MMIU-Benchmark/mevis/mevis_131_4.jpg", + "../MMIU-Benchmark/mevis/mevis_131_5.jpg", + "../MMIU-Benchmark/mevis/mevis_131_6.jpg", + "../MMIU-Benchmark/mevis/mevis_131_7.jpg", + "../MMIU-Benchmark/mevis/mevis_131_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.152, 0.0, 0.999, 0.26]\nB: [0.152, 0.0, 0.941, 0.298]\nC: [0.0, 0.0, 0.848, 0.26]\nD: [0.152, 0.0, 1.129, 0.251]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1766 and the height is 945.\nCAPTION: Airplane moving forward", + "context": "Select from the following choices.\nA: [0.152, 0.0, 0.999, 0.26]\nB: [0.152, 0.0, 0.941, 0.298]\nC: [0.0, 0.0, 0.848, 0.26]\nD: [0.152, 0.0, 1.129, 0.251]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_132_0.jpg", + "../MMIU-Benchmark/mevis/mevis_132_1.jpg", + "../MMIU-Benchmark/mevis/mevis_132_2.jpg", + "../MMIU-Benchmark/mevis/mevis_132_3.jpg", + "../MMIU-Benchmark/mevis/mevis_132_4.jpg", + "../MMIU-Benchmark/mevis/mevis_132_5.jpg", + "../MMIU-Benchmark/mevis/mevis_132_6.jpg", + "../MMIU-Benchmark/mevis/mevis_132_7.jpg", + "../MMIU-Benchmark/mevis/mevis_132_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.061, 0.222, 0.382, 0.782]\nB: [0.061, 0.222, 0.411, 0.733]\nC: [0.061, 0.222, 0.393, 0.89]\nD: [0.061, 0.222, 0.441, 0.809]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 528.\nCAPTION: lighter one of two fighting yaks", + "context": "Select from the following choices.\nA: [0.061, 0.222, 0.382, 0.782]\nB: [0.061, 0.222, 0.411, 0.733]\nC: [0.061, 0.222, 0.393, 0.89]\nD: [0.061, 0.222, 0.441, 0.809]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_133_0.jpg", + "../MMIU-Benchmark/mevis/mevis_133_1.jpg", + "../MMIU-Benchmark/mevis/mevis_133_2.jpg", + "../MMIU-Benchmark/mevis/mevis_133_3.jpg", + "../MMIU-Benchmark/mevis/mevis_133_4.jpg", + "../MMIU-Benchmark/mevis/mevis_133_5.jpg", + "../MMIU-Benchmark/mevis/mevis_133_6.jpg", + "../MMIU-Benchmark/mevis/mevis_133_7.jpg", + "../MMIU-Benchmark/mevis/mevis_133_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.367, 0.479, 0.452, 0.642]\nB: [0.417, 0.868, 0.645, 0.895]\nC: [0.126, 0.444, 0.287, 1.0]\nD: [0.149, 0.442, 0.31, 0.999]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1567 and the height is 730.\nCAPTION: cow shaking head and looking us", + "context": "Select from the following choices.\nA: [0.367, 0.479, 0.452, 0.642]\nB: [0.417, 0.868, 0.645, 0.895]\nC: [0.126, 0.444, 0.287, 1.0]\nD: [0.149, 0.442, 0.31, 0.999]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_134_0.jpg", + "../MMIU-Benchmark/mevis/mevis_134_1.jpg", + "../MMIU-Benchmark/mevis/mevis_134_2.jpg", + "../MMIU-Benchmark/mevis/mevis_134_3.jpg", + "../MMIU-Benchmark/mevis/mevis_134_4.jpg", + "../MMIU-Benchmark/mevis/mevis_134_5.jpg", + "../MMIU-Benchmark/mevis/mevis_134_6.jpg", + "../MMIU-Benchmark/mevis/mevis_134_7.jpg", + "../MMIU-Benchmark/mevis/mevis_134_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.473, 0.165, 0.517, 0.544]\nB: [0.311, 0.415, 0.404, 0.794]\nC: [0.35, 0.307, 0.443, 0.686]\nD: [0.36, 0.165, 0.453, 0.544]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: Man pushing bicycle around in a circle", + "context": "Select from the following choices.\nA: [0.473, 0.165, 0.517, 0.544]\nB: [0.311, 0.415, 0.404, 0.794]\nC: [0.35, 0.307, 0.443, 0.686]\nD: [0.36, 0.165, 0.453, 0.544]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_135_0.jpg", + "../MMIU-Benchmark/mevis/mevis_135_1.jpg", + "../MMIU-Benchmark/mevis/mevis_135_2.jpg", + "../MMIU-Benchmark/mevis/mevis_135_3.jpg", + "../MMIU-Benchmark/mevis/mevis_135_4.jpg", + "../MMIU-Benchmark/mevis/mevis_135_5.jpg", + "../MMIU-Benchmark/mevis/mevis_135_6.jpg", + "../MMIU-Benchmark/mevis/mevis_135_7.jpg", + "../MMIU-Benchmark/mevis/mevis_135_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.587, 0.235, 0.684, 0.65]\nB: [0.129, 0.113, 0.32, 0.35]\nC: [0.447, 0.354, 0.567, 0.633]\nD: [0.447, 0.354, 0.592, 0.637]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: Elephant crushed by another elephant's trunk", + "context": "Select from the following choices.\nA: [0.587, 0.235, 0.684, 0.65]\nB: [0.129, 0.113, 0.32, 0.35]\nC: [0.447, 0.354, 0.567, 0.633]\nD: [0.447, 0.354, 0.592, 0.637]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_136_0.jpg", + "../MMIU-Benchmark/mevis/mevis_136_1.jpg", + "../MMIU-Benchmark/mevis/mevis_136_2.jpg", + "../MMIU-Benchmark/mevis/mevis_136_3.jpg", + "../MMIU-Benchmark/mevis/mevis_136_4.jpg", + "../MMIU-Benchmark/mevis/mevis_136_5.jpg", + "../MMIU-Benchmark/mevis/mevis_136_6.jpg", + "../MMIU-Benchmark/mevis/mevis_136_7.jpg", + "../MMIU-Benchmark/mevis/mevis_136_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.487, 0.379, 0.941, 0.53]\nB: [0.323, 0.292, 0.682, 0.742]\nC: [0.295, 0.403, 0.71, 0.556]\nD: [0.295, 0.403, 0.748, 0.554]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 496.\nCAPTION: The model airplane with a faster moving speed.", + "context": "Select from the following choices.\nA: [0.487, 0.379, 0.941, 0.53]\nB: [0.323, 0.292, 0.682, 0.742]\nC: [0.295, 0.403, 0.71, 0.556]\nD: [0.295, 0.403, 0.748, 0.554]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_137_0.jpg", + "../MMIU-Benchmark/mevis/mevis_137_1.jpg", + "../MMIU-Benchmark/mevis/mevis_137_2.jpg", + "../MMIU-Benchmark/mevis/mevis_137_3.jpg", + "../MMIU-Benchmark/mevis/mevis_137_4.jpg", + "../MMIU-Benchmark/mevis/mevis_137_5.jpg", + "../MMIU-Benchmark/mevis/mevis_137_6.jpg", + "../MMIU-Benchmark/mevis/mevis_137_7.jpg", + "../MMIU-Benchmark/mevis/mevis_137_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.435, 0.128, 0.633, 0.545]\nB: [0.435, 0.128, 0.597, 0.566]\nC: [0.234, 0.647, 0.36, 0.912]\nD: [0.065, 0.056, 0.176, 0.237]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: white dog play with the other dog", + "context": "Select from the following choices.\nA: [0.435, 0.128, 0.633, 0.545]\nB: [0.435, 0.128, 0.597, 0.566]\nC: [0.234, 0.647, 0.36, 0.912]\nD: [0.065, 0.056, 0.176, 0.237]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_138_0.jpg", + "../MMIU-Benchmark/mevis/mevis_138_1.jpg", + "../MMIU-Benchmark/mevis/mevis_138_2.jpg", + "../MMIU-Benchmark/mevis/mevis_138_3.jpg", + "../MMIU-Benchmark/mevis/mevis_138_4.jpg", + "../MMIU-Benchmark/mevis/mevis_138_5.jpg", + "../MMIU-Benchmark/mevis/mevis_138_6.jpg", + "../MMIU-Benchmark/mevis/mevis_138_7.jpg", + "../MMIU-Benchmark/mevis/mevis_138_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.457, 0.498, 0.581, 0.721]\nB: [0.429, 0.533, 0.553, 0.756]\nC: [0.431, 0.441, 0.555, 0.664]\nD: [0.431, 0.441, 0.562, 0.669]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: parking bike", + "context": "Select from the following choices.\nA: [0.457, 0.498, 0.581, 0.721]\nB: [0.429, 0.533, 0.553, 0.756]\nC: [0.431, 0.441, 0.555, 0.664]\nD: [0.431, 0.441, 0.562, 0.669]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_139_0.jpg", + "../MMIU-Benchmark/mevis/mevis_139_1.jpg", + "../MMIU-Benchmark/mevis/mevis_139_2.jpg", + "../MMIU-Benchmark/mevis/mevis_139_3.jpg", + "../MMIU-Benchmark/mevis/mevis_139_4.jpg", + "../MMIU-Benchmark/mevis/mevis_139_5.jpg", + "../MMIU-Benchmark/mevis/mevis_139_6.jpg", + "../MMIU-Benchmark/mevis/mevis_139_7.jpg", + "../MMIU-Benchmark/mevis/mevis_139_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.0, 0.105, 0.665, 0.999]\nB: [0.0, 0.105, 0.54, 1.054]\nC: [0.264, 0.106, 0.929, 1.0]\nD: [0.094, 0.106, 0.759, 1.0]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The horse whose head is being struck by the tail of another horse", + "context": "Select from the following choices.\nA: [0.0, 0.105, 0.665, 0.999]\nB: [0.0, 0.105, 0.54, 1.054]\nC: [0.264, 0.106, 0.929, 1.0]\nD: [0.094, 0.106, 0.759, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_140_0.jpg", + "../MMIU-Benchmark/mevis/mevis_140_1.jpg", + "../MMIU-Benchmark/mevis/mevis_140_2.jpg", + "../MMIU-Benchmark/mevis/mevis_140_3.jpg", + "../MMIU-Benchmark/mevis/mevis_140_4.jpg", + "../MMIU-Benchmark/mevis/mevis_140_5.jpg", + "../MMIU-Benchmark/mevis/mevis_140_6.jpg", + "../MMIU-Benchmark/mevis/mevis_140_7.jpg", + "../MMIU-Benchmark/mevis/mevis_140_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.263, 0.682, 0.502, 0.997]\nB: [0.246, 0.685, 0.486, 1.0]\nC: [0.263, 0.682, 0.533, 0.971]\nD: [0.423, 0.255, 0.673, 0.324]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 648.\nCAPTION: Bear chasing another bear around by walking in circle", + "context": "Select from the following choices.\nA: [0.263, 0.682, 0.502, 0.997]\nB: [0.246, 0.685, 0.486, 1.0]\nC: [0.263, 0.682, 0.533, 0.971]\nD: [0.423, 0.255, 0.673, 0.324]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_141_0.jpg", + "../MMIU-Benchmark/mevis/mevis_141_1.jpg", + "../MMIU-Benchmark/mevis/mevis_141_2.jpg", + "../MMIU-Benchmark/mevis/mevis_141_3.jpg", + "../MMIU-Benchmark/mevis/mevis_141_4.jpg", + "../MMIU-Benchmark/mevis/mevis_141_5.jpg", + "../MMIU-Benchmark/mevis/mevis_141_6.jpg", + "../MMIU-Benchmark/mevis/mevis_141_7.jpg", + "../MMIU-Benchmark/mevis/mevis_141_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.356, 0.0, 0.485, 0.056]\nB: [0.516, 0.864, 0.689, 0.976]\nC: [0.433, 0.097, 0.614, 0.318]\nD: [0.008, 0.274, 0.424, 0.511]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 720.\nCAPTION: The person standing beside, observing the fight between the two dogs.", + "context": "Select from the following choices.\nA: [0.356, 0.0, 0.485, 0.056]\nB: [0.516, 0.864, 0.689, 0.976]\nC: [0.433, 0.097, 0.614, 0.318]\nD: [0.008, 0.274, 0.424, 0.511]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_142_0.jpg", + "../MMIU-Benchmark/mevis/mevis_142_1.jpg", + "../MMIU-Benchmark/mevis/mevis_142_2.jpg", + "../MMIU-Benchmark/mevis/mevis_142_3.jpg", + "../MMIU-Benchmark/mevis/mevis_142_4.jpg", + "../MMIU-Benchmark/mevis/mevis_142_5.jpg", + "../MMIU-Benchmark/mevis/mevis_142_6.jpg", + "../MMIU-Benchmark/mevis/mevis_142_7.jpg", + "../MMIU-Benchmark/mevis/mevis_142_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.067, 0.447, 0.347, 0.492]\nB: [0.107, 0.317, 0.526, 0.785]\nC: [0.107, 0.317, 0.539, 0.871]\nD: [0.285, 0.191, 0.769, 0.504]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: Rabbit moving around and eating leaves", + "context": "Select from the following choices.\nA: [0.067, 0.447, 0.347, 0.492]\nB: [0.107, 0.317, 0.526, 0.785]\nC: [0.107, 0.317, 0.539, 0.871]\nD: [0.285, 0.191, 0.769, 0.504]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_143_0.jpg", + "../MMIU-Benchmark/mevis/mevis_143_1.jpg", + "../MMIU-Benchmark/mevis/mevis_143_2.jpg", + "../MMIU-Benchmark/mevis/mevis_143_3.jpg", + "../MMIU-Benchmark/mevis/mevis_143_4.jpg", + "../MMIU-Benchmark/mevis/mevis_143_5.jpg", + "../MMIU-Benchmark/mevis/mevis_143_6.jpg", + "../MMIU-Benchmark/mevis/mevis_143_7.jpg", + "../MMIU-Benchmark/mevis/mevis_143_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.315, 0.147, 1.082, 0.965]\nB: [0.315, 0.147, 1.085, 0.92]\nC: [0.315, 0.147, 1.037, 1.001]\nD: [0.315, 0.147, 0.969, 0.947]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The horse, initially facing right, then turns to face left", + "context": "Select from the following choices.\nA: [0.315, 0.147, 1.082, 0.965]\nB: [0.315, 0.147, 1.085, 0.92]\nC: [0.315, 0.147, 1.037, 1.001]\nD: [0.315, 0.147, 0.969, 0.947]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_144_0.jpg", + "../MMIU-Benchmark/mevis/mevis_144_1.jpg", + "../MMIU-Benchmark/mevis/mevis_144_2.jpg", + "../MMIU-Benchmark/mevis/mevis_144_3.jpg", + "../MMIU-Benchmark/mevis/mevis_144_4.jpg", + "../MMIU-Benchmark/mevis/mevis_144_5.jpg", + "../MMIU-Benchmark/mevis/mevis_144_6.jpg", + "../MMIU-Benchmark/mevis/mevis_144_7.jpg", + "../MMIU-Benchmark/mevis/mevis_144_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.872, 0.209, 0.98, 0.326]\nB: [0.852, 0.23, 0.959, 0.346]\nC: [0.852, 0.23, 0.947, 0.324]\nD: [0.807, 0.175, 0.914, 0.292]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: parking white car", + "context": "Select from the following choices.\nA: [0.872, 0.209, 0.98, 0.326]\nB: [0.852, 0.23, 0.959, 0.346]\nC: [0.852, 0.23, 0.947, 0.324]\nD: [0.807, 0.175, 0.914, 0.292]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_145_0.jpg", + "../MMIU-Benchmark/mevis/mevis_145_1.jpg", + "../MMIU-Benchmark/mevis/mevis_145_2.jpg", + "../MMIU-Benchmark/mevis/mevis_145_3.jpg", + "../MMIU-Benchmark/mevis/mevis_145_4.jpg", + "../MMIU-Benchmark/mevis/mevis_145_5.jpg", + "../MMIU-Benchmark/mevis/mevis_145_6.jpg", + "../MMIU-Benchmark/mevis/mevis_145_7.jpg", + "../MMIU-Benchmark/mevis/mevis_145_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.003, 0.086, 0.25, 0.3]\nB: [0.085, 0.061, 0.332, 0.275]\nC: [0.096, 0.09, 0.343, 0.304]\nD: [0.003, 0.086, 0.254, 0.325]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The bird that arrived at the bottom of the cage first.", + "context": "Select from the following choices.\nA: [0.003, 0.086, 0.25, 0.3]\nB: [0.085, 0.061, 0.332, 0.275]\nC: [0.096, 0.09, 0.343, 0.304]\nD: [0.003, 0.086, 0.254, 0.325]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_146_0.jpg", + "../MMIU-Benchmark/mevis/mevis_146_1.jpg", + "../MMIU-Benchmark/mevis/mevis_146_2.jpg", + "../MMIU-Benchmark/mevis/mevis_146_3.jpg", + "../MMIU-Benchmark/mevis/mevis_146_4.jpg", + "../MMIU-Benchmark/mevis/mevis_146_5.jpg", + "../MMIU-Benchmark/mevis/mevis_146_6.jpg", + "../MMIU-Benchmark/mevis/mevis_146_7.jpg", + "../MMIU-Benchmark/mevis/mevis_146_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.584, 0.354, 0.932, 0.634]\nB: [0.364, 0.439, 0.436, 0.622]\nC: [0.34, 0.475, 0.412, 0.658]\nD: [0.364, 0.439, 0.432, 0.608]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The elephant walking to the left.", + "context": "Select from the following choices.\nA: [0.584, 0.354, 0.932, 0.634]\nB: [0.364, 0.439, 0.436, 0.622]\nC: [0.34, 0.475, 0.412, 0.658]\nD: [0.364, 0.439, 0.432, 0.608]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_147_0.jpg", + "../MMIU-Benchmark/mevis/mevis_147_1.jpg", + "../MMIU-Benchmark/mevis/mevis_147_2.jpg", + "../MMIU-Benchmark/mevis/mevis_147_3.jpg", + "../MMIU-Benchmark/mevis/mevis_147_4.jpg", + "../MMIU-Benchmark/mevis/mevis_147_5.jpg", + "../MMIU-Benchmark/mevis/mevis_147_6.jpg", + "../MMIU-Benchmark/mevis/mevis_147_7.jpg", + "../MMIU-Benchmark/mevis/mevis_147_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.31, 0.426, 0.433, 0.642]\nB: [0.31, 0.426, 0.423, 0.608]\nC: [0.31, 0.426, 0.45, 0.673]\nD: [0.309, 0.434, 0.432, 0.649]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: Bicycle being pushed around in a circle", + "context": "Select from the following choices.\nA: [0.31, 0.426, 0.433, 0.642]\nB: [0.31, 0.426, 0.423, 0.608]\nC: [0.31, 0.426, 0.45, 0.673]\nD: [0.309, 0.434, 0.432, 0.649]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_148_0.jpg", + "../MMIU-Benchmark/mevis/mevis_148_1.jpg", + "../MMIU-Benchmark/mevis/mevis_148_2.jpg", + "../MMIU-Benchmark/mevis/mevis_148_3.jpg", + "../MMIU-Benchmark/mevis/mevis_148_4.jpg", + "../MMIU-Benchmark/mevis/mevis_148_5.jpg", + "../MMIU-Benchmark/mevis/mevis_148_6.jpg", + "../MMIU-Benchmark/mevis/mevis_148_7.jpg", + "../MMIU-Benchmark/mevis/mevis_148_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.321, 0.44, 0.514, 0.715]\nB: [0.599, 0.537, 0.865, 0.898]\nC: [0.248, 0.394, 0.442, 0.669]\nD: [0.341, 0.617, 0.824, 0.998]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 636 and the height is 480.\nCAPTION: The tiger that came to drink water last.", + "context": "Select from the following choices.\nA: [0.321, 0.44, 0.514, 0.715]\nB: [0.599, 0.537, 0.865, 0.898]\nC: [0.248, 0.394, 0.442, 0.669]\nD: [0.341, 0.617, 0.824, 0.998]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_149_0.jpg", + "../MMIU-Benchmark/mevis/mevis_149_1.jpg", + "../MMIU-Benchmark/mevis/mevis_149_2.jpg", + "../MMIU-Benchmark/mevis/mevis_149_3.jpg", + "../MMIU-Benchmark/mevis/mevis_149_4.jpg", + "../MMIU-Benchmark/mevis/mevis_149_5.jpg", + "../MMIU-Benchmark/mevis/mevis_149_6.jpg", + "../MMIU-Benchmark/mevis/mevis_149_7.jpg", + "../MMIU-Benchmark/mevis/mevis_149_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.317, 0.473, 0.775, 0.875]\nB: [0.317, 0.473, 0.744, 0.815]\nC: [0.453, 0.381, 0.88, 0.723]\nD: [0.317, 0.473, 0.662, 0.806]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 480.\nCAPTION: The lizard that first ate the food.", + "context": "Select from the following choices.\nA: [0.317, 0.473, 0.775, 0.875]\nB: [0.317, 0.473, 0.744, 0.815]\nC: [0.453, 0.381, 0.88, 0.723]\nD: [0.317, 0.473, 0.662, 0.806]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_150_0.jpg", + "../MMIU-Benchmark/mevis/mevis_150_1.jpg", + "../MMIU-Benchmark/mevis/mevis_150_2.jpg", + "../MMIU-Benchmark/mevis/mevis_150_3.jpg", + "../MMIU-Benchmark/mevis/mevis_150_4.jpg", + "../MMIU-Benchmark/mevis/mevis_150_5.jpg", + "../MMIU-Benchmark/mevis/mevis_150_6.jpg", + "../MMIU-Benchmark/mevis/mevis_150_7.jpg", + "../MMIU-Benchmark/mevis/mevis_150_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.0, 0.0, 0.708, 0.978]\nB: [0.291, 0.0, 0.999, 0.978]\nC: [0.265, 0.0, 0.973, 0.978]\nD: [0.291, 0.0, 0.882, 1.101]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Turtle biting at the ground then looking up", + "context": "Select from the following choices.\nA: [0.0, 0.0, 0.708, 0.978]\nB: [0.291, 0.0, 0.999, 0.978]\nC: [0.265, 0.0, 0.973, 0.978]\nD: [0.291, 0.0, 0.882, 1.101]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_151_0.jpg", + "../MMIU-Benchmark/mevis/mevis_151_1.jpg", + "../MMIU-Benchmark/mevis/mevis_151_2.jpg", + "../MMIU-Benchmark/mevis/mevis_151_3.jpg", + "../MMIU-Benchmark/mevis/mevis_151_4.jpg", + "../MMIU-Benchmark/mevis/mevis_151_5.jpg", + "../MMIU-Benchmark/mevis/mevis_151_6.jpg", + "../MMIU-Benchmark/mevis/mevis_151_7.jpg", + "../MMIU-Benchmark/mevis/mevis_151_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.178, 0.354, 0.671, 0.415]\nB: [0.529, 0.094, 0.652, 0.226]\nC: [0.52, 0.228, 0.9, 0.739]\nD: [0.376, 0.194, 0.494, 0.64]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: The red truck in motion.", + "context": "Select from the following choices.\nA: [0.178, 0.354, 0.671, 0.415]\nB: [0.529, 0.094, 0.652, 0.226]\nC: [0.52, 0.228, 0.9, 0.739]\nD: [0.376, 0.194, 0.494, 0.64]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_152_0.jpg", + "../MMIU-Benchmark/mevis/mevis_152_1.jpg", + "../MMIU-Benchmark/mevis/mevis_152_2.jpg", + "../MMIU-Benchmark/mevis/mevis_152_3.jpg", + "../MMIU-Benchmark/mevis/mevis_152_4.jpg", + "../MMIU-Benchmark/mevis/mevis_152_5.jpg", + "../MMIU-Benchmark/mevis/mevis_152_6.jpg", + "../MMIU-Benchmark/mevis/mevis_152_7.jpg", + "../MMIU-Benchmark/mevis/mevis_152_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.351, 0.528, 0.504, 0.777]\nB: [0.321, 0.242, 0.374, 0.512]\nC: [0.195, 0.202, 0.392, 0.886]\nD: [0.215, 0.316, 0.411, 1.0]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Cat walking forward", + "context": "Select from the following choices.\nA: [0.351, 0.528, 0.504, 0.777]\nB: [0.321, 0.242, 0.374, 0.512]\nC: [0.195, 0.202, 0.392, 0.886]\nD: [0.215, 0.316, 0.411, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_153_0.jpg", + "../MMIU-Benchmark/mevis/mevis_153_1.jpg", + "../MMIU-Benchmark/mevis/mevis_153_2.jpg", + "../MMIU-Benchmark/mevis/mevis_153_3.jpg", + "../MMIU-Benchmark/mevis/mevis_153_4.jpg", + "../MMIU-Benchmark/mevis/mevis_153_5.jpg", + "../MMIU-Benchmark/mevis/mevis_153_6.jpg", + "../MMIU-Benchmark/mevis/mevis_153_7.jpg", + "../MMIU-Benchmark/mevis/mevis_153_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.709, 0.471, 0.932, 0.591]\nB: [0.227, 0.343, 0.601, 0.803]\nC: [0.194, 0.624, 0.248, 0.951]\nD: [0.709, 0.471, 0.955, 0.591]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 604 and the height is 1280.\nCAPTION: Puppy running around ", + "context": "Select from the following choices.\nA: [0.709, 0.471, 0.932, 0.591]\nB: [0.227, 0.343, 0.601, 0.803]\nC: [0.194, 0.624, 0.248, 0.951]\nD: [0.709, 0.471, 0.955, 0.591]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_154_0.jpg", + "../MMIU-Benchmark/mevis/mevis_154_1.jpg", + "../MMIU-Benchmark/mevis/mevis_154_2.jpg", + "../MMIU-Benchmark/mevis/mevis_154_3.jpg", + "../MMIU-Benchmark/mevis/mevis_154_4.jpg", + "../MMIU-Benchmark/mevis/mevis_154_5.jpg", + "../MMIU-Benchmark/mevis/mevis_154_6.jpg", + "../MMIU-Benchmark/mevis/mevis_154_7.jpg", + "../MMIU-Benchmark/mevis/mevis_154_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.374, 0.222, 0.81, 0.677]\nB: [0.374, 0.222, 0.764, 0.631]\nC: [0.464, 0.105, 0.854, 0.513]\nD: [0.374, 0.222, 0.692, 0.671]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1440 and the height is 1080.\nCAPTION: Puppy lying on the ground with its belly leaking", + "context": "Select from the following choices.\nA: [0.374, 0.222, 0.81, 0.677]\nB: [0.374, 0.222, 0.764, 0.631]\nC: [0.464, 0.105, 0.854, 0.513]\nD: [0.374, 0.222, 0.692, 0.671]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_155_0.jpg", + "../MMIU-Benchmark/mevis/mevis_155_1.jpg", + "../MMIU-Benchmark/mevis/mevis_155_2.jpg", + "../MMIU-Benchmark/mevis/mevis_155_3.jpg", + "../MMIU-Benchmark/mevis/mevis_155_4.jpg", + "../MMIU-Benchmark/mevis/mevis_155_5.jpg", + "../MMIU-Benchmark/mevis/mevis_155_6.jpg", + "../MMIU-Benchmark/mevis/mevis_155_7.jpg", + "../MMIU-Benchmark/mevis/mevis_155_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.544, 0.351, 0.595, 0.461]\nB: [0.544, 0.351, 0.588, 0.456]\nC: [0.562, 0.326, 0.613, 0.436]\nD: [0.523, 0.379, 0.573, 0.489]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: turtle is moved by monkeys on stone", + "context": "Select from the following choices.\nA: [0.544, 0.351, 0.595, 0.461]\nB: [0.544, 0.351, 0.588, 0.456]\nC: [0.562, 0.326, 0.613, 0.436]\nD: [0.523, 0.379, 0.573, 0.489]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_156_0.jpg", + "../MMIU-Benchmark/mevis/mevis_156_1.jpg", + "../MMIU-Benchmark/mevis/mevis_156_2.jpg", + "../MMIU-Benchmark/mevis/mevis_156_3.jpg", + "../MMIU-Benchmark/mevis/mevis_156_4.jpg", + "../MMIU-Benchmark/mevis/mevis_156_5.jpg", + "../MMIU-Benchmark/mevis/mevis_156_6.jpg", + "../MMIU-Benchmark/mevis/mevis_156_7.jpg", + "../MMIU-Benchmark/mevis/mevis_156_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.199, 0.282, 0.295, 0.529]\nB: [0.392, 0.079, 0.849, 0.268]\nC: [0.199, 0.282, 0.276, 0.526]\nD: [0.044, 0.596, 0.499, 0.908]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The bird that has not reached the bottom of the cage yet.", + "context": "Select from the following choices.\nA: [0.199, 0.282, 0.295, 0.529]\nB: [0.392, 0.079, 0.849, 0.268]\nC: [0.199, 0.282, 0.276, 0.526]\nD: [0.044, 0.596, 0.499, 0.908]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_157_0.jpg", + "../MMIU-Benchmark/mevis/mevis_157_1.jpg", + "../MMIU-Benchmark/mevis/mevis_157_2.jpg", + "../MMIU-Benchmark/mevis/mevis_157_3.jpg", + "../MMIU-Benchmark/mevis/mevis_157_4.jpg", + "../MMIU-Benchmark/mevis/mevis_157_5.jpg", + "../MMIU-Benchmark/mevis/mevis_157_6.jpg", + "../MMIU-Benchmark/mevis/mevis_157_7.jpg", + "../MMIU-Benchmark/mevis/mevis_157_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.209, 0.29, 0.602, 0.59]\nB: [0.449, 0.438, 0.581, 0.682]\nC: [0.699, 0.158, 0.923, 0.535]\nD: [0.449, 0.438, 0.566, 0.653]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: Stationary bicycle", + "context": "Select from the following choices.\nA: [0.209, 0.29, 0.602, 0.59]\nB: [0.449, 0.438, 0.581, 0.682]\nC: [0.699, 0.158, 0.923, 0.535]\nD: [0.449, 0.438, 0.566, 0.653]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_158_0.jpg", + "../MMIU-Benchmark/mevis/mevis_158_1.jpg", + "../MMIU-Benchmark/mevis/mevis_158_2.jpg", + "../MMIU-Benchmark/mevis/mevis_158_3.jpg", + "../MMIU-Benchmark/mevis/mevis_158_4.jpg", + "../MMIU-Benchmark/mevis/mevis_158_5.jpg", + "../MMIU-Benchmark/mevis/mevis_158_6.jpg", + "../MMIU-Benchmark/mevis/mevis_158_7.jpg", + "../MMIU-Benchmark/mevis/mevis_158_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.466, 0.0, 0.511, 0.153]\nB: [0.466, 0.0, 0.506, 0.154]\nC: [0.466, 0.0, 0.505, 0.136]\nD: [0.73, 0.279, 0.812, 0.482]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: the left one of the two sitting people in the distance", + "context": "Select from the following choices.\nA: [0.466, 0.0, 0.511, 0.153]\nB: [0.466, 0.0, 0.506, 0.154]\nC: [0.466, 0.0, 0.505, 0.136]\nD: [0.73, 0.279, 0.812, 0.482]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_159_0.jpg", + "../MMIU-Benchmark/mevis/mevis_159_1.jpg", + "../MMIU-Benchmark/mevis/mevis_159_2.jpg", + "../MMIU-Benchmark/mevis/mevis_159_3.jpg", + "../MMIU-Benchmark/mevis/mevis_159_4.jpg", + "../MMIU-Benchmark/mevis/mevis_159_5.jpg", + "../MMIU-Benchmark/mevis/mevis_159_6.jpg", + "../MMIU-Benchmark/mevis/mevis_159_7.jpg", + "../MMIU-Benchmark/mevis/mevis_159_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.33, 0.105, 0.536, 0.932]\nB: [0.385, 0.088, 0.592, 0.916]\nC: [0.33, 0.105, 0.545, 0.894]\nD: [0.017, 0.121, 0.509, 0.611]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: cat with a yellow ring around its neck", + "context": "Select from the following choices.\nA: [0.33, 0.105, 0.536, 0.932]\nB: [0.385, 0.088, 0.592, 0.916]\nC: [0.33, 0.105, 0.545, 0.894]\nD: [0.017, 0.121, 0.509, 0.611]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_160_0.jpg", + "../MMIU-Benchmark/mevis/mevis_160_1.jpg", + "../MMIU-Benchmark/mevis/mevis_160_2.jpg", + "../MMIU-Benchmark/mevis/mevis_160_3.jpg", + "../MMIU-Benchmark/mevis/mevis_160_4.jpg", + "../MMIU-Benchmark/mevis/mevis_160_5.jpg", + "../MMIU-Benchmark/mevis/mevis_160_6.jpg", + "../MMIU-Benchmark/mevis/mevis_160_7.jpg", + "../MMIU-Benchmark/mevis/mevis_160_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.221, 0.155, 0.509, 0.54]\nB: [0.221, 0.155, 0.524, 0.703]\nC: [0.107, 0.125, 0.37, 0.606]\nD: [0.221, 0.155, 0.483, 0.636]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 528.\nCAPTION: darker one of two fighting yaks", + "context": "Select from the following choices.\nA: [0.221, 0.155, 0.509, 0.54]\nB: [0.221, 0.155, 0.524, 0.703]\nC: [0.107, 0.125, 0.37, 0.606]\nD: [0.221, 0.155, 0.483, 0.636]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_161_0.jpg", + "../MMIU-Benchmark/mevis/mevis_161_1.jpg", + "../MMIU-Benchmark/mevis/mevis_161_2.jpg", + "../MMIU-Benchmark/mevis/mevis_161_3.jpg", + "../MMIU-Benchmark/mevis/mevis_161_4.jpg", + "../MMIU-Benchmark/mevis/mevis_161_5.jpg", + "../MMIU-Benchmark/mevis/mevis_161_6.jpg", + "../MMIU-Benchmark/mevis/mevis_161_7.jpg", + "../MMIU-Benchmark/mevis/mevis_161_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.031, 0.234, 0.411, 0.404]\nB: [0.847, 0.229, 0.953, 0.344]\nC: [0.842, 0.222, 0.947, 0.337]\nD: [0.826, 0.244, 0.932, 0.359]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: parking black car", + "context": "Select from the following choices.\nA: [0.031, 0.234, 0.411, 0.404]\nB: [0.847, 0.229, 0.953, 0.344]\nC: [0.842, 0.222, 0.947, 0.337]\nD: [0.826, 0.244, 0.932, 0.359]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_162_0.jpg", + "../MMIU-Benchmark/mevis/mevis_162_1.jpg", + "../MMIU-Benchmark/mevis/mevis_162_2.jpg", + "../MMIU-Benchmark/mevis/mevis_162_3.jpg", + "../MMIU-Benchmark/mevis/mevis_162_4.jpg", + "../MMIU-Benchmark/mevis/mevis_162_5.jpg", + "../MMIU-Benchmark/mevis/mevis_162_6.jpg", + "../MMIU-Benchmark/mevis/mevis_162_7.jpg", + "../MMIU-Benchmark/mevis/mevis_162_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.164, 0.276, 0.317, 0.447]\nB: [0.164, 0.276, 0.3, 0.456]\nC: [0.412, 0.111, 0.527, 0.517]\nD: [0.164, 0.276, 0.277, 0.46]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: Monkey running out of a hole to the left then right", + "context": "Select from the following choices.\nA: [0.164, 0.276, 0.317, 0.447]\nB: [0.164, 0.276, 0.3, 0.456]\nC: [0.412, 0.111, 0.527, 0.517]\nD: [0.164, 0.276, 0.277, 0.46]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_163_0.jpg", + "../MMIU-Benchmark/mevis/mevis_163_1.jpg", + "../MMIU-Benchmark/mevis/mevis_163_2.jpg", + "../MMIU-Benchmark/mevis/mevis_163_3.jpg", + "../MMIU-Benchmark/mevis/mevis_163_4.jpg", + "../MMIU-Benchmark/mevis/mevis_163_5.jpg", + "../MMIU-Benchmark/mevis/mevis_163_6.jpg", + "../MMIU-Benchmark/mevis/mevis_163_7.jpg", + "../MMIU-Benchmark/mevis/mevis_163_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.172, 0.283, 0.258, 0.559]\nB: [0.548, 0.524, 0.785, 0.948]\nC: [0.172, 0.283, 0.271, 0.557]\nD: [0.131, 0.323, 0.23, 0.597]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The bird that hasn't touched the cage floor so far.", + "context": "Select from the following choices.\nA: [0.172, 0.283, 0.258, 0.559]\nB: [0.548, 0.524, 0.785, 0.948]\nC: [0.172, 0.283, 0.271, 0.557]\nD: [0.131, 0.323, 0.23, 0.597]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_164_0.jpg", + "../MMIU-Benchmark/mevis/mevis_164_1.jpg", + "../MMIU-Benchmark/mevis/mevis_164_2.jpg", + "../MMIU-Benchmark/mevis/mevis_164_3.jpg", + "../MMIU-Benchmark/mevis/mevis_164_4.jpg", + "../MMIU-Benchmark/mevis/mevis_164_5.jpg", + "../MMIU-Benchmark/mevis/mevis_164_6.jpg", + "../MMIU-Benchmark/mevis/mevis_164_7.jpg", + "../MMIU-Benchmark/mevis/mevis_164_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.214, 0.404, 0.521, 0.765]\nB: [0.322, 0.46, 0.629, 0.821]\nC: [0.257, 0.354, 0.564, 0.715]\nD: [0.3, 0.597, 0.607, 0.958]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: The elephant being assaulted and harassed by its companions.", + "context": "Select from the following choices.\nA: [0.214, 0.404, 0.521, 0.765]\nB: [0.322, 0.46, 0.629, 0.821]\nC: [0.257, 0.354, 0.564, 0.715]\nD: [0.3, 0.597, 0.607, 0.958]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_165_0.jpg", + "../MMIU-Benchmark/mevis/mevis_165_1.jpg", + "../MMIU-Benchmark/mevis/mevis_165_2.jpg", + "../MMIU-Benchmark/mevis/mevis_165_3.jpg", + "../MMIU-Benchmark/mevis/mevis_165_4.jpg", + "../MMIU-Benchmark/mevis/mevis_165_5.jpg", + "../MMIU-Benchmark/mevis/mevis_165_6.jpg", + "../MMIU-Benchmark/mevis/mevis_165_7.jpg", + "../MMIU-Benchmark/mevis/mevis_165_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.173, 0.303, 0.32, 0.724]\nB: [0.18, 0.397, 0.327, 0.819]\nC: [0.276, 0.579, 0.423, 1.0]\nD: [0.203, 0.44, 0.349, 0.861]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Man squatting down and opening his palm", + "context": "Select from the following choices.\nA: [0.173, 0.303, 0.32, 0.724]\nB: [0.18, 0.397, 0.327, 0.819]\nC: [0.276, 0.579, 0.423, 1.0]\nD: [0.203, 0.44, 0.349, 0.861]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_166_0.jpg", + "../MMIU-Benchmark/mevis/mevis_166_1.jpg", + "../MMIU-Benchmark/mevis/mevis_166_2.jpg", + "../MMIU-Benchmark/mevis/mevis_166_3.jpg", + "../MMIU-Benchmark/mevis/mevis_166_4.jpg", + "../MMIU-Benchmark/mevis/mevis_166_5.jpg", + "../MMIU-Benchmark/mevis/mevis_166_6.jpg", + "../MMIU-Benchmark/mevis/mevis_166_7.jpg", + "../MMIU-Benchmark/mevis/mevis_166_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.185, 0.433, 0.398, 0.844]\nB: [0.192, 0.389, 0.377, 0.817]\nC: [0.185, 0.433, 0.369, 0.861]\nD: [0.278, 0.1, 0.616, 0.4]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: person feeding sheeps", + "context": "Select from the following choices.\nA: [0.185, 0.433, 0.398, 0.844]\nB: [0.192, 0.389, 0.377, 0.817]\nC: [0.185, 0.433, 0.369, 0.861]\nD: [0.278, 0.1, 0.616, 0.4]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_167_0.jpg", + "../MMIU-Benchmark/mevis/mevis_167_1.jpg", + "../MMIU-Benchmark/mevis/mevis_167_2.jpg", + "../MMIU-Benchmark/mevis/mevis_167_3.jpg", + "../MMIU-Benchmark/mevis/mevis_167_4.jpg", + "../MMIU-Benchmark/mevis/mevis_167_5.jpg", + "../MMIU-Benchmark/mevis/mevis_167_6.jpg", + "../MMIU-Benchmark/mevis/mevis_167_7.jpg", + "../MMIU-Benchmark/mevis/mevis_167_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.182, 0.029, 0.661, 0.508]\nB: [0.184, 0.168, 0.605, 0.803]\nC: [0.216, 0.331, 0.577, 0.948]\nD: [0.216, 0.331, 0.636, 0.967]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: cat chasing cat teaser", + "context": "Select from the following choices.\nA: [0.182, 0.029, 0.661, 0.508]\nB: [0.184, 0.168, 0.605, 0.803]\nC: [0.216, 0.331, 0.577, 0.948]\nD: [0.216, 0.331, 0.636, 0.967]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_168_0.jpg", + "../MMIU-Benchmark/mevis/mevis_168_1.jpg", + "../MMIU-Benchmark/mevis/mevis_168_2.jpg", + "../MMIU-Benchmark/mevis/mevis_168_3.jpg", + "../MMIU-Benchmark/mevis/mevis_168_4.jpg", + "../MMIU-Benchmark/mevis/mevis_168_5.jpg", + "../MMIU-Benchmark/mevis/mevis_168_6.jpg", + "../MMIU-Benchmark/mevis/mevis_168_7.jpg", + "../MMIU-Benchmark/mevis/mevis_168_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.022, 0.206, 0.13, 0.376]\nB: [0.02, 0.272, 0.128, 0.443]\nC: [0.0, 0.228, 0.108, 0.399]\nD: [0.38, 0.124, 0.665, 0.614]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: monkey walking in and out of the gate", + "context": "Select from the following choices.\nA: [0.022, 0.206, 0.13, 0.376]\nB: [0.02, 0.272, 0.128, 0.443]\nC: [0.0, 0.228, 0.108, 0.399]\nD: [0.38, 0.124, 0.665, 0.614]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_169_0.jpg", + "../MMIU-Benchmark/mevis/mevis_169_1.jpg", + "../MMIU-Benchmark/mevis/mevis_169_2.jpg", + "../MMIU-Benchmark/mevis/mevis_169_3.jpg", + "../MMIU-Benchmark/mevis/mevis_169_4.jpg", + "../MMIU-Benchmark/mevis/mevis_169_5.jpg", + "../MMIU-Benchmark/mevis/mevis_169_6.jpg", + "../MMIU-Benchmark/mevis/mevis_169_7.jpg", + "../MMIU-Benchmark/mevis/mevis_169_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.591, 0.321, 0.819, 0.879]\nB: [0.591, 0.321, 0.816, 0.849]\nC: [0.591, 0.321, 0.797, 0.938]\nD: [0.591, 0.321, 0.828, 0.908]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: Monkey moving around and jumping to the left", + "context": "Select from the following choices.\nA: [0.591, 0.321, 0.819, 0.879]\nB: [0.591, 0.321, 0.816, 0.849]\nC: [0.591, 0.321, 0.797, 0.938]\nD: [0.591, 0.321, 0.828, 0.908]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_170_0.jpg", + "../MMIU-Benchmark/mevis/mevis_170_1.jpg", + "../MMIU-Benchmark/mevis/mevis_170_2.jpg", + "../MMIU-Benchmark/mevis/mevis_170_3.jpg", + "../MMIU-Benchmark/mevis/mevis_170_4.jpg", + "../MMIU-Benchmark/mevis/mevis_170_5.jpg", + "../MMIU-Benchmark/mevis/mevis_170_6.jpg", + "../MMIU-Benchmark/mevis/mevis_170_7.jpg", + "../MMIU-Benchmark/mevis/mevis_170_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.824, 0.389, 0.91, 0.531]\nB: [0.447, 0.617, 0.93, 0.662]\nC: [0.85, 0.361, 0.936, 0.503]\nD: [0.814, 0.372, 0.9, 0.514]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: The one that ran away from the right first", + "context": "Select from the following choices.\nA: [0.824, 0.389, 0.91, 0.531]\nB: [0.447, 0.617, 0.93, 0.662]\nC: [0.85, 0.361, 0.936, 0.503]\nD: [0.814, 0.372, 0.9, 0.514]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_171_0.jpg", + "../MMIU-Benchmark/mevis/mevis_171_1.jpg", + "../MMIU-Benchmark/mevis/mevis_171_2.jpg", + "../MMIU-Benchmark/mevis/mevis_171_3.jpg", + "../MMIU-Benchmark/mevis/mevis_171_4.jpg", + "../MMIU-Benchmark/mevis/mevis_171_5.jpg", + "../MMIU-Benchmark/mevis/mevis_171_6.jpg", + "../MMIU-Benchmark/mevis/mevis_171_7.jpg", + "../MMIU-Benchmark/mevis/mevis_171_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.089, 0.316, 0.117, 0.569]\nB: [0.0, 0.0, 0.938, 0.907]\nC: [0.0, 0.0, 0.999, 0.817]\nD: [0.001, 0.006, 1.0, 0.823]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1766 and the height is 945.\nCAPTION: Aircraft moving rightward", + "context": "Select from the following choices.\nA: [0.089, 0.316, 0.117, 0.569]\nB: [0.0, 0.0, 0.938, 0.907]\nC: [0.0, 0.0, 0.999, 0.817]\nD: [0.001, 0.006, 1.0, 0.823]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_172_0.jpg", + "../MMIU-Benchmark/mevis/mevis_172_1.jpg", + "../MMIU-Benchmark/mevis/mevis_172_2.jpg", + "../MMIU-Benchmark/mevis/mevis_172_3.jpg", + "../MMIU-Benchmark/mevis/mevis_172_4.jpg", + "../MMIU-Benchmark/mevis/mevis_172_5.jpg", + "../MMIU-Benchmark/mevis/mevis_172_6.jpg", + "../MMIU-Benchmark/mevis/mevis_172_7.jpg", + "../MMIU-Benchmark/mevis/mevis_172_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.348, 0.861, 0.555, 0.968]\nB: [0.286, 0.694, 0.507, 0.998]\nC: [0.286, 0.694, 0.47, 1.019]\nD: [0.243, 0.59, 0.464, 0.894]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 648.\nCAPTION: bear fight then sit", + "context": "Select from the following choices.\nA: [0.348, 0.861, 0.555, 0.968]\nB: [0.286, 0.694, 0.507, 0.998]\nC: [0.286, 0.694, 0.47, 1.019]\nD: [0.243, 0.59, 0.464, 0.894]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_173_0.jpg", + "../MMIU-Benchmark/mevis/mevis_173_1.jpg", + "../MMIU-Benchmark/mevis/mevis_173_2.jpg", + "../MMIU-Benchmark/mevis/mevis_173_3.jpg", + "../MMIU-Benchmark/mevis/mevis_173_4.jpg", + "../MMIU-Benchmark/mevis/mevis_173_5.jpg", + "../MMIU-Benchmark/mevis/mevis_173_6.jpg", + "../MMIU-Benchmark/mevis/mevis_173_7.jpg", + "../MMIU-Benchmark/mevis/mevis_173_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.061, 0.194, 0.192, 0.585]\nB: [0.219, 0.111, 0.325, 0.363]\nC: [0.165, 0.65, 0.391, 0.915]\nD: [0.09, 0.733, 0.316, 0.998]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 648.\nCAPTION: The bear moving in reverse.", + "context": "Select from the following choices.\nA: [0.061, 0.194, 0.192, 0.585]\nB: [0.219, 0.111, 0.325, 0.363]\nC: [0.165, 0.65, 0.391, 0.915]\nD: [0.09, 0.733, 0.316, 0.998]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_174_0.jpg", + "../MMIU-Benchmark/mevis/mevis_174_1.jpg", + "../MMIU-Benchmark/mevis/mevis_174_2.jpg", + "../MMIU-Benchmark/mevis/mevis_174_3.jpg", + "../MMIU-Benchmark/mevis/mevis_174_4.jpg", + "../MMIU-Benchmark/mevis/mevis_174_5.jpg", + "../MMIU-Benchmark/mevis/mevis_174_6.jpg", + "../MMIU-Benchmark/mevis/mevis_174_7.jpg", + "../MMIU-Benchmark/mevis/mevis_174_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.432, 0.037, 0.81, 0.448]\nB: [0.119, 0.0, 0.913, 0.438]\nC: [0.0, 0.0, 0.794, 0.438]\nD: [0.119, 0.0, 0.864, 0.437]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1766 and the height is 945.\nCAPTION: aircraft moving leftward", + "context": "Select from the following choices.\nA: [0.432, 0.037, 0.81, 0.448]\nB: [0.119, 0.0, 0.913, 0.438]\nC: [0.0, 0.0, 0.794, 0.438]\nD: [0.119, 0.0, 0.864, 0.437]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_175_0.jpg", + "../MMIU-Benchmark/mevis/mevis_175_1.jpg", + "../MMIU-Benchmark/mevis/mevis_175_2.jpg", + "../MMIU-Benchmark/mevis/mevis_175_3.jpg", + "../MMIU-Benchmark/mevis/mevis_175_4.jpg", + "../MMIU-Benchmark/mevis/mevis_175_5.jpg", + "../MMIU-Benchmark/mevis/mevis_175_6.jpg", + "../MMIU-Benchmark/mevis/mevis_175_7.jpg", + "../MMIU-Benchmark/mevis/mevis_175_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.393, 0.381, 0.514, 0.996]\nB: [0.414, 0.385, 0.535, 1.0]\nC: [0.096, 0.222, 0.52, 0.615]\nD: [0.861, 0.472, 0.906, 0.813]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 512 and the height is 252.\nCAPTION: man moving to right and watching the horse running in circles", + "context": "Select from the following choices.\nA: [0.393, 0.381, 0.514, 0.996]\nB: [0.414, 0.385, 0.535, 1.0]\nC: [0.096, 0.222, 0.52, 0.615]\nD: [0.861, 0.472, 0.906, 0.813]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_176_0.jpg", + "../MMIU-Benchmark/mevis/mevis_176_1.jpg", + "../MMIU-Benchmark/mevis/mevis_176_2.jpg", + "../MMIU-Benchmark/mevis/mevis_176_3.jpg", + "../MMIU-Benchmark/mevis/mevis_176_4.jpg", + "../MMIU-Benchmark/mevis/mevis_176_5.jpg", + "../MMIU-Benchmark/mevis/mevis_176_6.jpg", + "../MMIU-Benchmark/mevis/mevis_176_7.jpg", + "../MMIU-Benchmark/mevis/mevis_176_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.366, 0.448, 0.75, 0.929]\nB: [0.361, 0.037, 0.847, 0.281]\nC: [0.366, 0.448, 0.784, 0.86]\nD: [0.366, 0.448, 0.861, 0.896]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 480.\nCAPTION: the one who eats the food first", + "context": "Select from the following choices.\nA: [0.366, 0.448, 0.75, 0.929]\nB: [0.361, 0.037, 0.847, 0.281]\nC: [0.366, 0.448, 0.784, 0.86]\nD: [0.366, 0.448, 0.861, 0.896]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_177_0.jpg", + "../MMIU-Benchmark/mevis/mevis_177_1.jpg", + "../MMIU-Benchmark/mevis/mevis_177_2.jpg", + "../MMIU-Benchmark/mevis/mevis_177_3.jpg", + "../MMIU-Benchmark/mevis/mevis_177_4.jpg", + "../MMIU-Benchmark/mevis/mevis_177_5.jpg", + "../MMIU-Benchmark/mevis/mevis_177_6.jpg", + "../MMIU-Benchmark/mevis/mevis_177_7.jpg", + "../MMIU-Benchmark/mevis/mevis_177_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.796, 0.477, 0.999, 0.998]\nB: [0.504, 0.03, 0.769, 0.436]\nC: [0.797, 0.478, 1.0, 1.0]\nD: [0.484, 0.49, 0.883, 0.916]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 596.\nCAPTION: panda sit down and eat, then move, then sit down and eat", + "context": "Select from the following choices.\nA: [0.796, 0.477, 0.999, 0.998]\nB: [0.504, 0.03, 0.769, 0.436]\nC: [0.797, 0.478, 1.0, 1.0]\nD: [0.484, 0.49, 0.883, 0.916]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_178_0.jpg", + "../MMIU-Benchmark/mevis/mevis_178_1.jpg", + "../MMIU-Benchmark/mevis/mevis_178_2.jpg", + "../MMIU-Benchmark/mevis/mevis_178_3.jpg", + "../MMIU-Benchmark/mevis/mevis_178_4.jpg", + "../MMIU-Benchmark/mevis/mevis_178_5.jpg", + "../MMIU-Benchmark/mevis/mevis_178_6.jpg", + "../MMIU-Benchmark/mevis/mevis_178_7.jpg", + "../MMIU-Benchmark/mevis/mevis_178_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.0, 0.0, 0.536, 0.817]\nB: [0.527, 0.349, 0.567, 0.619]\nC: [0.028, 0.459, 0.397, 0.856]\nD: [0.0, 0.0, 0.456, 0.998]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 945.\nCAPTION: Girl holding leaf with right hand and pulling away then standing up", + "context": "Select from the following choices.\nA: [0.0, 0.0, 0.536, 0.817]\nB: [0.527, 0.349, 0.567, 0.619]\nC: [0.028, 0.459, 0.397, 0.856]\nD: [0.0, 0.0, 0.456, 0.998]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_179_0.jpg", + "../MMIU-Benchmark/mevis/mevis_179_1.jpg", + "../MMIU-Benchmark/mevis/mevis_179_2.jpg", + "../MMIU-Benchmark/mevis/mevis_179_3.jpg", + "../MMIU-Benchmark/mevis/mevis_179_4.jpg", + "../MMIU-Benchmark/mevis/mevis_179_5.jpg", + "../MMIU-Benchmark/mevis/mevis_179_6.jpg", + "../MMIU-Benchmark/mevis/mevis_179_7.jpg", + "../MMIU-Benchmark/mevis/mevis_179_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.0, 0.0, 0.394, 0.32]\nB: [0.0, 0.0, 0.369, 0.36]\nC: [0.0, 0.0, 0.421, 0.277]\nD: [0.315, 0.304, 0.751, 0.604]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1766 and the height is 945.\nCAPTION: Airplane moving from right to left", + "context": "Select from the following choices.\nA: [0.0, 0.0, 0.394, 0.32]\nB: [0.0, 0.0, 0.369, 0.36]\nC: [0.0, 0.0, 0.421, 0.277]\nD: [0.315, 0.304, 0.751, 0.604]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_180_0.jpg", + "../MMIU-Benchmark/mevis/mevis_180_1.jpg", + "../MMIU-Benchmark/mevis/mevis_180_2.jpg", + "../MMIU-Benchmark/mevis/mevis_180_3.jpg", + "../MMIU-Benchmark/mevis/mevis_180_4.jpg", + "../MMIU-Benchmark/mevis/mevis_180_5.jpg", + "../MMIU-Benchmark/mevis/mevis_180_6.jpg", + "../MMIU-Benchmark/mevis/mevis_180_7.jpg", + "../MMIU-Benchmark/mevis/mevis_180_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.406, 0.49, 0.778, 0.986]\nB: [0.176, 0.182, 0.362, 0.224]\nC: [0.307, 0.268, 0.679, 0.764]\nD: [0.307, 0.268, 0.648, 0.8]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: monkey jumping right to the rock then crawling around", + "context": "Select from the following choices.\nA: [0.406, 0.49, 0.778, 0.986]\nB: [0.176, 0.182, 0.362, 0.224]\nC: [0.307, 0.268, 0.679, 0.764]\nD: [0.307, 0.268, 0.648, 0.8]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_181_0.jpg", + "../MMIU-Benchmark/mevis/mevis_181_1.jpg", + "../MMIU-Benchmark/mevis/mevis_181_2.jpg", + "../MMIU-Benchmark/mevis/mevis_181_3.jpg", + "../MMIU-Benchmark/mevis/mevis_181_4.jpg", + "../MMIU-Benchmark/mevis/mevis_181_5.jpg", + "../MMIU-Benchmark/mevis/mevis_181_6.jpg", + "../MMIU-Benchmark/mevis/mevis_181_7.jpg", + "../MMIU-Benchmark/mevis/mevis_181_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.544, 0.447, 0.603, 0.518]\nB: [0.527, 0.443, 0.586, 0.515]\nC: [0.527, 0.443, 0.584, 0.5]\nD: [0.527, 0.443, 0.583, 0.506]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: The second vehicle going straight at the crossroads.", + "context": "Select from the following choices.\nA: [0.544, 0.447, 0.603, 0.518]\nB: [0.527, 0.443, 0.586, 0.515]\nC: [0.527, 0.443, 0.584, 0.5]\nD: [0.527, 0.443, 0.583, 0.506]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_182_0.jpg", + "../MMIU-Benchmark/mevis/mevis_182_1.jpg", + "../MMIU-Benchmark/mevis/mevis_182_2.jpg", + "../MMIU-Benchmark/mevis/mevis_182_3.jpg", + "../MMIU-Benchmark/mevis/mevis_182_4.jpg", + "../MMIU-Benchmark/mevis/mevis_182_5.jpg", + "../MMIU-Benchmark/mevis/mevis_182_6.jpg", + "../MMIU-Benchmark/mevis/mevis_182_7.jpg", + "../MMIU-Benchmark/mevis/mevis_182_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.659, 0.6, 0.999, 0.999]\nB: [0.267, 0.368, 0.319, 0.865]\nC: [0.213, 0.071, 0.666, 0.468]\nD: [0.609, 0.601, 0.949, 1.0]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 945.\nCAPTION: Rabbit on the left bending forward then hopping in a round to rightmost", + "context": "Select from the following choices.\nA: [0.659, 0.6, 0.999, 0.999]\nB: [0.267, 0.368, 0.319, 0.865]\nC: [0.213, 0.071, 0.666, 0.468]\nD: [0.609, 0.601, 0.949, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_183_0.jpg", + "../MMIU-Benchmark/mevis/mevis_183_1.jpg", + "../MMIU-Benchmark/mevis/mevis_183_2.jpg", + "../MMIU-Benchmark/mevis/mevis_183_3.jpg", + "../MMIU-Benchmark/mevis/mevis_183_4.jpg", + "../MMIU-Benchmark/mevis/mevis_183_5.jpg", + "../MMIU-Benchmark/mevis/mevis_183_6.jpg", + "../MMIU-Benchmark/mevis/mevis_183_7.jpg", + "../MMIU-Benchmark/mevis/mevis_183_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.122, 0.699, 0.352, 1.026]\nB: [0.077, 0.701, 0.33, 1.0]\nC: [0.122, 0.699, 0.341, 1.006]\nD: [0.122, 0.699, 0.376, 0.998]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 648.\nCAPTION: Bear being chased and walking backward", + "context": "Select from the following choices.\nA: [0.122, 0.699, 0.352, 1.026]\nB: [0.077, 0.701, 0.33, 1.0]\nC: [0.122, 0.699, 0.341, 1.006]\nD: [0.122, 0.699, 0.376, 0.998]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_184_0.jpg", + "../MMIU-Benchmark/mevis/mevis_184_1.jpg", + "../MMIU-Benchmark/mevis/mevis_184_2.jpg", + "../MMIU-Benchmark/mevis/mevis_184_3.jpg", + "../MMIU-Benchmark/mevis/mevis_184_4.jpg", + "../MMIU-Benchmark/mevis/mevis_184_5.jpg", + "../MMIU-Benchmark/mevis/mevis_184_6.jpg", + "../MMIU-Benchmark/mevis/mevis_184_7.jpg", + "../MMIU-Benchmark/mevis/mevis_184_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.43, 0.075, 1.022, 0.917]\nB: [0.439, 0.323, 0.531, 0.685]\nC: [0.134, 0.398, 0.588, 0.696]\nD: [0.43, 0.075, 0.942, 0.878]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 362.\nCAPTION: cow waving head without moving position", + "context": "Select from the following choices.\nA: [0.43, 0.075, 1.022, 0.917]\nB: [0.439, 0.323, 0.531, 0.685]\nC: [0.134, 0.398, 0.588, 0.696]\nD: [0.43, 0.075, 0.942, 0.878]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_185_0.jpg", + "../MMIU-Benchmark/mevis/mevis_185_1.jpg", + "../MMIU-Benchmark/mevis/mevis_185_2.jpg", + "../MMIU-Benchmark/mevis/mevis_185_3.jpg", + "../MMIU-Benchmark/mevis/mevis_185_4.jpg", + "../MMIU-Benchmark/mevis/mevis_185_5.jpg", + "../MMIU-Benchmark/mevis/mevis_185_6.jpg", + "../MMIU-Benchmark/mevis/mevis_185_7.jpg", + "../MMIU-Benchmark/mevis/mevis_185_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.381, 0.381, 0.479, 0.889]\nB: [0.381, 0.381, 0.484, 0.996]\nC: [0.762, 0.214, 0.949, 0.595]\nD: [0.41, 0.226, 0.514, 0.841]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 512 and the height is 252.\nCAPTION: person walk forward", + "context": "Select from the following choices.\nA: [0.381, 0.381, 0.479, 0.889]\nB: [0.381, 0.381, 0.484, 0.996]\nC: [0.762, 0.214, 0.949, 0.595]\nD: [0.41, 0.226, 0.514, 0.841]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_186_0.jpg", + "../MMIU-Benchmark/mevis/mevis_186_1.jpg", + "../MMIU-Benchmark/mevis/mevis_186_2.jpg", + "../MMIU-Benchmark/mevis/mevis_186_3.jpg", + "../MMIU-Benchmark/mevis/mevis_186_4.jpg", + "../MMIU-Benchmark/mevis/mevis_186_5.jpg", + "../MMIU-Benchmark/mevis/mevis_186_6.jpg", + "../MMIU-Benchmark/mevis/mevis_186_7.jpg", + "../MMIU-Benchmark/mevis/mevis_186_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.505, 0.235, 0.539, 0.421]\nB: [0.502, 0.315, 0.537, 0.501]\nC: [0.505, 0.235, 0.534, 0.415]\nD: [0.505, 0.235, 0.534, 0.406]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: The monkey squatting in the middle of the cave entrance.", + "context": "Select from the following choices.\nA: [0.505, 0.235, 0.539, 0.421]\nB: [0.502, 0.315, 0.537, 0.501]\nC: [0.505, 0.235, 0.534, 0.415]\nD: [0.505, 0.235, 0.534, 0.406]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_187_0.jpg", + "../MMIU-Benchmark/mevis/mevis_187_1.jpg", + "../MMIU-Benchmark/mevis/mevis_187_2.jpg", + "../MMIU-Benchmark/mevis/mevis_187_3.jpg", + "../MMIU-Benchmark/mevis/mevis_187_4.jpg", + "../MMIU-Benchmark/mevis/mevis_187_5.jpg", + "../MMIU-Benchmark/mevis/mevis_187_6.jpg", + "../MMIU-Benchmark/mevis/mevis_187_7.jpg", + "../MMIU-Benchmark/mevis/mevis_187_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.531, 0.29, 0.714, 0.687]\nB: [0.527, 0.434, 0.703, 0.857]\nC: [0.449, 0.314, 0.633, 0.711]\nD: [0.527, 0.434, 0.71, 0.831]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: still bike", + "context": "Select from the following choices.\nA: [0.531, 0.29, 0.714, 0.687]\nB: [0.527, 0.434, 0.703, 0.857]\nC: [0.449, 0.314, 0.633, 0.711]\nD: [0.527, 0.434, 0.71, 0.831]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_188_0.jpg", + "../MMIU-Benchmark/mevis/mevis_188_1.jpg", + "../MMIU-Benchmark/mevis/mevis_188_2.jpg", + "../MMIU-Benchmark/mevis/mevis_188_3.jpg", + "../MMIU-Benchmark/mevis/mevis_188_4.jpg", + "../MMIU-Benchmark/mevis/mevis_188_5.jpg", + "../MMIU-Benchmark/mevis/mevis_188_6.jpg", + "../MMIU-Benchmark/mevis/mevis_188_7.jpg", + "../MMIU-Benchmark/mevis/mevis_188_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.156, 0.229, 0.402, 0.967]\nB: [0.295, 0.05, 0.639, 0.536]\nC: [0.156, 0.229, 0.389, 0.956]\nD: [0.156, 0.229, 0.366, 0.89]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 362.\nCAPTION: Cow walking to the front first", + "context": "Select from the following choices.\nA: [0.156, 0.229, 0.402, 0.967]\nB: [0.295, 0.05, 0.639, 0.536]\nC: [0.156, 0.229, 0.389, 0.956]\nD: [0.156, 0.229, 0.366, 0.89]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_189_0.jpg", + "../MMIU-Benchmark/mevis/mevis_189_1.jpg", + "../MMIU-Benchmark/mevis/mevis_189_2.jpg", + "../MMIU-Benchmark/mevis/mevis_189_3.jpg", + "../MMIU-Benchmark/mevis/mevis_189_4.jpg", + "../MMIU-Benchmark/mevis/mevis_189_5.jpg", + "../MMIU-Benchmark/mevis/mevis_189_6.jpg", + "../MMIU-Benchmark/mevis/mevis_189_7.jpg", + "../MMIU-Benchmark/mevis/mevis_189_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.577, 0.806, 0.624, 0.838]\nB: [0.39, 0.0, 0.999, 0.999]\nC: [0.391, 0.0, 1.0, 0.999]\nD: [0.208, 0.041, 0.483, 0.497]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: cat climbing on cat tree", + "context": "Select from the following choices.\nA: [0.577, 0.806, 0.624, 0.838]\nB: [0.39, 0.0, 0.999, 0.999]\nC: [0.391, 0.0, 1.0, 0.999]\nD: [0.208, 0.041, 0.483, 0.497]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_190_0.jpg", + "../MMIU-Benchmark/mevis/mevis_190_1.jpg", + "../MMIU-Benchmark/mevis/mevis_190_2.jpg", + "../MMIU-Benchmark/mevis/mevis_190_3.jpg", + "../MMIU-Benchmark/mevis/mevis_190_4.jpg", + "../MMIU-Benchmark/mevis/mevis_190_5.jpg", + "../MMIU-Benchmark/mevis/mevis_190_6.jpg", + "../MMIU-Benchmark/mevis/mevis_190_7.jpg", + "../MMIU-Benchmark/mevis/mevis_190_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.576, 0.004, 0.609, 0.46]\nB: [0.141, 0.397, 0.406, 0.802]\nC: [0.199, 0.353, 0.465, 0.758]\nD: [0.199, 0.353, 0.412, 0.679]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 512 and the height is 252.\nCAPTION: horse run in circles", + "context": "Select from the following choices.\nA: [0.576, 0.004, 0.609, 0.46]\nB: [0.141, 0.397, 0.406, 0.802]\nC: [0.199, 0.353, 0.465, 0.758]\nD: [0.199, 0.353, 0.412, 0.679]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_191_0.jpg", + "../MMIU-Benchmark/mevis/mevis_191_1.jpg", + "../MMIU-Benchmark/mevis/mevis_191_2.jpg", + "../MMIU-Benchmark/mevis/mevis_191_3.jpg", + "../MMIU-Benchmark/mevis/mevis_191_4.jpg", + "../MMIU-Benchmark/mevis/mevis_191_5.jpg", + "../MMIU-Benchmark/mevis/mevis_191_6.jpg", + "../MMIU-Benchmark/mevis/mevis_191_7.jpg", + "../MMIU-Benchmark/mevis/mevis_191_8.jpg" + ], + "output": "C" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.459, 0.0, 0.998, 0.925]\nB: [0.459, 0.0, 1.07, 1.094]\nC: [0.291, 0.075, 0.83, 1.0]\nD: [0.459, 0.0, 0.994, 1.006]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 362.\nCAPTION: Cow looking around without moving", + "context": "Select from the following choices.\nA: [0.459, 0.0, 0.998, 0.925]\nB: [0.459, 0.0, 1.07, 1.094]\nC: [0.291, 0.075, 0.83, 1.0]\nD: [0.459, 0.0, 0.994, 1.006]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_192_0.jpg", + "../MMIU-Benchmark/mevis/mevis_192_1.jpg", + "../MMIU-Benchmark/mevis/mevis_192_2.jpg", + "../MMIU-Benchmark/mevis/mevis_192_3.jpg", + "../MMIU-Benchmark/mevis/mevis_192_4.jpg", + "../MMIU-Benchmark/mevis/mevis_192_5.jpg", + "../MMIU-Benchmark/mevis/mevis_192_6.jpg", + "../MMIU-Benchmark/mevis/mevis_192_7.jpg", + "../MMIU-Benchmark/mevis/mevis_192_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.334, 0.0, 0.448, 0.665]\nB: [0.334, 0.0, 0.444, 0.569]\nC: [0.334, 0.0, 0.444, 0.528]\nD: [0.334, 0.0, 0.46, 0.55]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Person moving around holding leashes", + "context": "Select from the following choices.\nA: [0.334, 0.0, 0.448, 0.665]\nB: [0.334, 0.0, 0.444, 0.569]\nC: [0.334, 0.0, 0.444, 0.528]\nD: [0.334, 0.0, 0.46, 0.55]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_193_0.jpg", + "../MMIU-Benchmark/mevis/mevis_193_1.jpg", + "../MMIU-Benchmark/mevis/mevis_193_2.jpg", + "../MMIU-Benchmark/mevis/mevis_193_3.jpg", + "../MMIU-Benchmark/mevis/mevis_193_4.jpg", + "../MMIU-Benchmark/mevis/mevis_193_5.jpg", + "../MMIU-Benchmark/mevis/mevis_193_6.jpg", + "../MMIU-Benchmark/mevis/mevis_193_7.jpg", + "../MMIU-Benchmark/mevis/mevis_193_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.662, 0.542, 0.828, 0.8]\nB: [0.455, 0.368, 0.599, 0.715]\nC: [0.455, 0.368, 0.595, 0.741]\nD: [0.455, 0.368, 0.585, 0.759]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: Dog walking then turn around", + "context": "Select from the following choices.\nA: [0.662, 0.542, 0.828, 0.8]\nB: [0.455, 0.368, 0.599, 0.715]\nC: [0.455, 0.368, 0.595, 0.741]\nD: [0.455, 0.368, 0.585, 0.759]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_194_0.jpg", + "../MMIU-Benchmark/mevis/mevis_194_1.jpg", + "../MMIU-Benchmark/mevis/mevis_194_2.jpg", + "../MMIU-Benchmark/mevis/mevis_194_3.jpg", + "../MMIU-Benchmark/mevis/mevis_194_4.jpg", + "../MMIU-Benchmark/mevis/mevis_194_5.jpg", + "../MMIU-Benchmark/mevis/mevis_194_6.jpg", + "../MMIU-Benchmark/mevis/mevis_194_7.jpg", + "../MMIU-Benchmark/mevis/mevis_194_8.jpg" + ], + "output": "B" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.542, 0.445, 0.727, 0.843]\nB: [0.478, 0.457, 0.662, 0.855]\nC: [0.505, 0.394, 0.689, 0.792]\nD: [0.465, 0.516, 0.649, 0.913]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Stationary bicycle", + "context": "Select from the following choices.\nA: [0.542, 0.445, 0.727, 0.843]\nB: [0.478, 0.457, 0.662, 0.855]\nC: [0.505, 0.394, 0.689, 0.792]\nD: [0.465, 0.516, 0.649, 0.913]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_195_0.jpg", + "../MMIU-Benchmark/mevis/mevis_195_1.jpg", + "../MMIU-Benchmark/mevis/mevis_195_2.jpg", + "../MMIU-Benchmark/mevis/mevis_195_3.jpg", + "../MMIU-Benchmark/mevis/mevis_195_4.jpg", + "../MMIU-Benchmark/mevis/mevis_195_5.jpg", + "../MMIU-Benchmark/mevis/mevis_195_6.jpg", + "../MMIU-Benchmark/mevis/mevis_195_7.jpg", + "../MMIU-Benchmark/mevis/mevis_195_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.866, 0.0, 0.99, 0.22]\nB: [0.722, 0.25, 0.802, 0.602]\nC: [0.866, 0.0, 0.973, 0.201]\nD: [0.51, 0.383, 0.97, 0.458]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 528.\nCAPTION: yak standing near two fighting yaks", + "context": "Select from the following choices.\nA: [0.866, 0.0, 0.99, 0.22]\nB: [0.722, 0.25, 0.802, 0.602]\nC: [0.866, 0.0, 0.973, 0.201]\nD: [0.51, 0.383, 0.97, 0.458]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_196_0.jpg", + "../MMIU-Benchmark/mevis/mevis_196_1.jpg", + "../MMIU-Benchmark/mevis/mevis_196_2.jpg", + "../MMIU-Benchmark/mevis/mevis_196_3.jpg", + "../MMIU-Benchmark/mevis/mevis_196_4.jpg", + "../MMIU-Benchmark/mevis/mevis_196_5.jpg", + "../MMIU-Benchmark/mevis/mevis_196_6.jpg", + "../MMIU-Benchmark/mevis/mevis_196_7.jpg", + "../MMIU-Benchmark/mevis/mevis_196_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.747, 0.274, 0.786, 0.366]\nB: [0.762, 0.793, 0.96, 0.873]\nC: [0.743, 0.214, 0.783, 0.305]\nD: [0.753, 0.242, 0.793, 0.334]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1567 and the height is 730.\nCAPTION: The cow taking the lead in the herd.", + "context": "Select from the following choices.\nA: [0.747, 0.274, 0.786, 0.366]\nB: [0.762, 0.793, 0.96, 0.873]\nC: [0.743, 0.214, 0.783, 0.305]\nD: [0.753, 0.242, 0.793, 0.334]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_197_0.jpg", + "../MMIU-Benchmark/mevis/mevis_197_1.jpg", + "../MMIU-Benchmark/mevis/mevis_197_2.jpg", + "../MMIU-Benchmark/mevis/mevis_197_3.jpg", + "../MMIU-Benchmark/mevis/mevis_197_4.jpg", + "../MMIU-Benchmark/mevis/mevis_197_5.jpg", + "../MMIU-Benchmark/mevis/mevis_197_6.jpg", + "../MMIU-Benchmark/mevis/mevis_197_7.jpg", + "../MMIU-Benchmark/mevis/mevis_197_8.jpg" + ], + "output": "D" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.225, 0.104, 0.586, 0.619]\nB: [0.401, 0.539, 0.679, 0.902]\nC: [0.479, 0.144, 0.7, 0.563]\nD: [0.103, 0.0, 0.464, 0.515]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: standing still and eating white rabbit", + "context": "Select from the following choices.\nA: [0.225, 0.104, 0.586, 0.619]\nB: [0.401, 0.539, 0.679, 0.902]\nC: [0.479, 0.144, 0.7, 0.563]\nD: [0.103, 0.0, 0.464, 0.515]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_198_0.jpg", + "../MMIU-Benchmark/mevis/mevis_198_1.jpg", + "../MMIU-Benchmark/mevis/mevis_198_2.jpg", + "../MMIU-Benchmark/mevis/mevis_198_3.jpg", + "../MMIU-Benchmark/mevis/mevis_198_4.jpg", + "../MMIU-Benchmark/mevis/mevis_198_5.jpg", + "../MMIU-Benchmark/mevis/mevis_198_6.jpg", + "../MMIU-Benchmark/mevis/mevis_198_7.jpg", + "../MMIU-Benchmark/mevis/mevis_198_8.jpg" + ], + "output": "A" + }, + { + "task": "mevis", + "visual_input_component": "Video image or Natural image", + "source": "MeViS", + "options": "A: [0.223, 0.415, 0.498, 0.772]\nB: [0.223, 0.415, 0.496, 0.722]\nC: [0.223, 0.415, 0.53, 0.769]\nD: [0.12, 0.554, 0.443, 0.631]", + "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: rabbit being jumped by another rabbit", + "context": "Select from the following choices.\nA: [0.223, 0.415, 0.498, 0.772]\nB: [0.223, 0.415, 0.496, 0.722]\nC: [0.223, 0.415, 0.53, 0.769]\nD: [0.12, 0.554, 0.443, 0.631]", + "input_image_path": [ + "../MMIU-Benchmark/mevis/mevis_199_0.jpg", + "../MMIU-Benchmark/mevis/mevis_199_1.jpg", + "../MMIU-Benchmark/mevis/mevis_199_2.jpg", + "../MMIU-Benchmark/mevis/mevis_199_3.jpg", + "../MMIU-Benchmark/mevis/mevis_199_4.jpg", + "../MMIU-Benchmark/mevis/mevis_199_5.jpg", + "../MMIU-Benchmark/mevis/mevis_199_6.jpg", + "../MMIU-Benchmark/mevis/mevis_199_7.jpg", + "../MMIU-Benchmark/mevis/mevis_199_8.jpg" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman with a guitar sings on a farm\nB: a man with a guitar plays in a park\nC: a man with a guitar sings on a farm\nD: a man with a guitar sings in a city", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman with a guitar sings on a farm\nB: a man with a guitar plays in a park\nC: a man with a guitar sings on a farm\nD: a man with a guitar sings in a city", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_0_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_0_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_0_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_0_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_0_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_0_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_0_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_0_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_0_8.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_0_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_0_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_0_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_0_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_0_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_0_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_0_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman is performing on a talent show\nB: a girl is singing in a karaoke competition\nC: a man is auditioning for a role in a musical\nD: a boy is trying out for a part on the voice kids", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman is performing on a talent show\nB: a girl is singing in a karaoke competition\nC: a man is auditioning for a role in a musical\nD: a boy is trying out for a part on the voice kids", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_1_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_1_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_1_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_1_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_1_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_1_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_1_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_1_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_1_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_1_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_1_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_1_11.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_1_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_1_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_1_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_1_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a girl is riding a bicycle\nB: a man is painting a house\nC: a woman is playing the piano\nD: a boy is eating pizza", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a girl is riding a bicycle\nB: a man is painting a house\nC: a woman is playing the piano\nD: a boy is eating pizza", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_2_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_2_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_2_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_2_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_2_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_2_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_2_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_2_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_2_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_2_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_2_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_2_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_2_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_2_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_2_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_2_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": 
"source", + "options": "A: a football match in progress\nB: a cat drinking milk from a bowl\nC: a chef preparing a gourmet meal\nD: the actor playing thor talking about the new movie", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a football match in progress\nB: a cat drinking milk from a bowl\nC: a chef preparing a gourmet meal\nD: the actor playing thor talking about the new movie", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_3_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_3_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_3_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_3_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_3_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_3_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_3_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_3_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_3_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_3_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_3_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_3_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_3_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_3_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_3_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_3_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a female reporter is interviewing a man on the beach when she falls into a sand castle\nB: a group of people playing volleyball on the beach\nC: a male reporter is interviewing a woman in a restaurant when he spills his coffee\nD: a man building a sand castle while talking on the phone", 
+ "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a female reporter is interviewing a man on the beach when she falls into a sand castle\nB: a group of people playing volleyball on the beach\nC: a male reporter is interviewing a woman in a restaurant when he spills his coffee\nD: a man building a sand castle while talking on the phone", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_4_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_4_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_4_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_4_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_4_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_4_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_4_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_4_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_4_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_4_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_4_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_4_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_4_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_4_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_4_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_4_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a dog is jumping on a trampoline\nB: a cat is sleeping on a sofa\nC: a car is driving on the highway\nD: a bird is flying in the sky", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a dog is jumping on a trampoline\nB: a cat is sleeping on a sofa\nC: 
a car is driving on the highway\nD: a bird is flying in the sky", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_5_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_5_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_5_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_5_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_5_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_5_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_5_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_5_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_5_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_5_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_5_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_5_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_5_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_5_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_5_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_5_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman is planting flowers in the garden\nB: a group of people is having a picnic in the park\nC: a man is opening a box and showing a machine gun\nD: a child is playing with a toy car", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman is planting flowers in the garden\nB: a group of people is having a picnic in the park\nC: a man is opening a box and showing a machine gun\nD: a child is playing with a toy car", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_6_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_6_1.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_6_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_6_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_6_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_6_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_6_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_6_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_6_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_6_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_6_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_6_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_6_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_6_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_6_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_6_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman eating an orange substance\nB: a woman cooking an orange substance\nC: a woman peeling an orange substance\nD: a woman cutting an orange substance", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman eating an orange substance\nB: a woman cooking an orange substance\nC: a woman peeling an orange substance\nD: a woman cutting an orange substance", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_7_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_7_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_7_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_7_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_7_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_7_5.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_7_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_7_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_7_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_7_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_7_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_7_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_7_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_7_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_7_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_7_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the chef is slicing tomatoes\nB: the woman is breading pork chop\nC: the man is grilling vegetables\nD: the girl is painting a landscape", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: the chef is slicing tomatoes\nB: the woman is breading pork chop\nC: the man is grilling vegetables\nD: the girl is painting a landscape", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_8_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_8_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_8_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_8_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_8_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_8_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_8_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_8_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_8_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_8_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_8_10.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_8_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_8_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_8_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_8_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_8_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the camera focuses on a flower blooming in slow motion\nB: the screen shows a person sleeping in bed\nC: there is no sound while the screen shows a person playing a computer game\nD: there is sound of birds chirping in the background", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: the camera focuses on a flower blooming in slow motion\nB: the screen shows a person sleeping in bed\nC: there is no sound while the screen shows a person playing a computer game\nD: there is sound of birds chirping in the background", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_9_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_9_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_9_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_9_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_9_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_9_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_9_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_9_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_9_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_9_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_9_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_9_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_9_12.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_9_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_9_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_9_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: someone stirring soup\nB: someone writing on a chalkboard\nC: someone watering plants\nD: someone slicing vegetable", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: someone stirring soup\nB: someone writing on a chalkboard\nC: someone watering plants\nD: someone slicing vegetable", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_10_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_10_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_10_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_10_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_10_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_10_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_10_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_10_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_10_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_10_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_10_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_10_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_10_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_10_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_10_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_10_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a person in a red top is 
holding a yellow drink\nB: a woman in a yellow top is holding a red drink\nC: a woman in a blue top is holding a yellow drink\nD: a man in a green shirt is holding a blue drink", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a person in a red top is holding a yellow drink\nB: a woman in a yellow top is holding a red drink\nC: a woman in a blue top is holding a yellow drink\nD: a man in a green shirt is holding a blue drink", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_11_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_11_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_11_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_11_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_11_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_11_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_11_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_11_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_11_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_11_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_11_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_11_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_11_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_11_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_11_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_11_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: an artist is sketching the face of a cartoon woman on paper with her mouth wide open\nB: a doctor is performing surgery in the operating room\nC: a child is playing with a toy car on the floor\nD: a chef is cooking a meal in the 
kitchen", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: an artist is sketching the face of a cartoon woman on paper with her mouth wide open\nB: a doctor is performing surgery in the operating room\nC: a child is playing with a toy car on the floor\nD: a chef is cooking a meal in the kitchen", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_12_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_12_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_12_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_12_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_12_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_12_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_12_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_12_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_12_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_12_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_12_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_12_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_12_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_12_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_12_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_12_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a warrior is cooking a meal\nB: a warrior is gardening\nC: a warrior is fighting a battle\nD: a warrior is reading a book", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a warrior is cooking a meal\nB: a warrior is gardening\nC: a warrior is fighting a battle\nD: a warrior 
is reading a book", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_13_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_13_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_13_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_13_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_13_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_13_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_13_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_13_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_13_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_13_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_13_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_13_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_13_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_13_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_13_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_13_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: chefs are cooking in a restaurant kitchen\nB: spectators are watching a football match in a stadium\nC: dancers are performing on a stage\nD: models are walking the runway as part of a fashion show", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: chefs are cooking in a restaurant kitchen\nB: spectators are watching a football match in a stadium\nC: dancers are performing on a stage\nD: models are walking the runway as part of a fashion show", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_14_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_14_1.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_14_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_14_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_14_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_14_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_14_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_14_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_14_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_14_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_14_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_14_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_14_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_14_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_14_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_14_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman works as a chef in a restaurant\nB: a man teaches piano lessons in his music studio\nC: a woman looks after abandoned children for free in her home\nD: a group of children play in a water park", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman works as a chef in a restaurant\nB: a man teaches piano lessons in his music studio\nC: a woman looks after abandoned children for free in her home\nD: a group of children play in a water park", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_15_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_15_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_15_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_15_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_15_4.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_15_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_15_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_15_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_15_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_15_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_15_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_15_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_15_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_15_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_15_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_15_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: two players are playing chess\nB: two players are playing badminton\nC: two players are playing volleyball\nD: two players are playing table tennis", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: two players are playing chess\nB: two players are playing badminton\nC: two players are playing volleyball\nD: two players are playing table tennis", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_16_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_16_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_16_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_16_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_16_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_16_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_16_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_16_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_16_8.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_16_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_16_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_16_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_16_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_16_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_16_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_16_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a person is cutting a pineapple\nB: a chef is grilling a steak\nC: someone is sorting vegetables\nD: someone is peeling a prawn", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a person is cutting a pineapple\nB: a chef is grilling a steak\nC: someone is sorting vegetables\nD: someone is peeling a prawn", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_17_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_17_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_17_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_17_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_17_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_17_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_17_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_17_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_17_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_17_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_17_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_17_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_17_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_17_13.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_17_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_17_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a child is riding a bike\nB: a woman is watering plants\nC: a dog is fetching a ball\nD: a man is tilling a field", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a child is riding a bike\nB: a woman is watering plants\nC: a dog is fetching a ball\nD: a man is tilling a field", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_18_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_18_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_18_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_18_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_18_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_18_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_18_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_18_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_18_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_18_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_18_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_18_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_18_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_18_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_18_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_18_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a hamster is eating broccoli\nB: a cat is sitting on a chair\nC: a bird is flying in the 
sky\nD: a dog is playing with a ball", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a hamster is eating broccoli\nB: a cat is sitting on a chair\nC: a bird is flying in the sky\nD: a dog is playing with a ball", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_19_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_19_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_19_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_19_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_19_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_19_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_19_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_19_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_19_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_19_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_19_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_19_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_19_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_19_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_19_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_19_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man effortlessly jumps over a railing and continues walking without any problems\nB: a man slides down a railing and falls and hurts himself badly\nC: a man gracefully slides down a railing and lands safely at the bottom\nD: a man climbs up a railing and maintains perfect balance", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man effortlessly jumps 
over a railing and continues walking without any problems\nB: a man slides down a railing and falls and hurts himself badly\nC: a man gracefully slides down a railing and lands safely at the bottom\nD: a man climbs up a railing and maintains perfect balance", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_20_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_20_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_20_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_20_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_20_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_20_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_20_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_20_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_20_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_20_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_20_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_20_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_20_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_20_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_20_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_20_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the car is driving on the highway\nB: the car is parked in a parking lot\nC: the car is crashed in the road\nD: the car is parked in the garage", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: the car is driving on the highway\nB: the car is parked in a parking lot\nC: the car is crashed in the road\nD: the car is parked in the garage", + "input_image_path": [ + 
"../MMIU-Benchmark/video_captioning/video_captioning_21_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_21_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_21_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_21_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_21_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_21_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_21_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_21_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_21_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_21_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_21_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_21_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_21_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_21_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_21_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_21_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a person is dancing in a disco club\nB: a child is playing a drum set\nC: a woman is playing a guitar\nD: a man is playing an electric keyboard", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a person is dancing in a disco club\nB: a child is playing a drum set\nC: a woman is playing a guitar\nD: a man is playing an electric keyboard", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_22_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_22_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_22_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_22_3.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_22_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_22_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_22_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_22_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_22_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_22_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_22_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_22_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_22_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_22_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_22_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_22_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: advertisement for home decor\nB: display of sports shoes\nC: promotion of kitchen utensils\nD: advertisement of seat basket", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: advertisement for home decor\nB: display of sports shoes\nC: promotion of kitchen utensils\nD: advertisement of seat basket", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_23_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_23_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_23_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_23_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_23_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_23_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_23_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_23_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_23_8.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_23_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_23_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_23_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_23_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_23_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_23_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_23_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a girl using her smartphone\nB: a boy playing with a toy car\nC: a man cooking in the kitchen\nD: a woman reading a book", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a girl using her smartphone\nB: a boy playing with a toy car\nC: a man cooking in the kitchen\nD: a woman reading a book", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_24_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_24_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_24_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_24_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_24_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_24_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_24_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_24_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_24_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_24_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_24_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_24_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_24_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_24_13.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_24_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_24_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man is driving\nB: a man is running\nC: a man is standing\nD: a man is shooting", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man is driving\nB: a man is running\nC: a man is standing\nD: a man is shooting", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_25_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_25_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_25_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_25_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_25_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_25_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_25_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_25_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_25_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_25_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_25_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_25_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_25_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_25_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_25_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_25_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man clean another man s shirt\nB: a man folding a shirt\nC: a man ironing a shirt\nD: a man drying a shirt", + "question": "Please generate textual 
descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man clean another man s shirt\nB: a man folding a shirt\nC: a man ironing a shirt\nD: a man drying a shirt", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_26_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_26_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_26_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_26_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_26_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_26_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_26_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_26_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_26_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_26_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_26_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_26_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_26_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_26_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_26_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_26_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a chef cooking a meal in the kitchen\nB: a child playing with toys\nC: a woman exercising in the park\nD: a man describing how to do something in windows", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a chef cooking a meal in the kitchen\nB: a child playing with toys\nC: a woman exercising in the park\nD: a man describing how to do something in windows", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_27_0.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_27_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_27_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_27_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_27_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_27_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_27_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_27_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_27_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_27_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_27_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_27_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_27_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_27_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_27_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_27_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man sings while playing the guitar\nB: a woman dances in a ballet performance\nC: a child rides a bicycle in the park\nD: a woman explains playing violin", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man sings while playing the guitar\nB: a woman dances in a ballet performance\nC: a child rides a bicycle in the park\nD: a woman explains playing violin", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_28_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_28_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_28_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_28_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_28_4.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_28_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_28_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_28_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_28_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_28_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_28_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_28_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_28_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_28_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_28_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_28_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: two little girls playing outside\nB: a man sitting on a chair\nC: a dog running in the park\nD: one little girl is sleeping on bed", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: two little girls playing outside\nB: a man sitting on a chair\nC: a dog running in the park\nD: one little girl is sleeping on bed", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_29_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_29_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_29_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_29_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_29_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_29_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_29_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_29_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_29_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_29_9.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_29_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_29_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_29_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_29_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_29_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_29_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a tv channel named how to cook great foodcom is telling how to prepare a dish\nB: a cooking show featuring recipes from famous chefs\nC: a baking tutorial on a popular cooking website\nD: a video showing the steps to create a delicious meal", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a tv channel named how to cook great foodcom is telling how to prepare a dish\nB: a cooking show featuring recipes from famous chefs\nC: a baking tutorial on a popular cooking website\nD: a video showing the steps to create a delicious meal", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_30_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_30_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_30_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_30_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_30_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_30_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_30_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_30_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_30_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_30_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_30_10.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_30_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_30_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_30_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_30_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_30_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a child plays with a toy robot\nB: a man talks about dna force\nC: an old man reads a newspaper\nD: a woman discusses environmental conservation", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a child plays with a toy robot\nB: a man talks about dna force\nC: an old man reads a newspaper\nD: a woman discusses environmental conservation", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_31_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_31_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_31_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_31_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_31_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_31_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_31_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_31_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_31_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_31_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_31_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_31_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_31_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_31_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_31_14.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_31_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a boy is playing the violin\nB: a girl is painting a picture\nC: a man is cooking in the kitchen\nD: a woman is riding a bicycle", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a boy is playing the violin\nB: a girl is painting a picture\nC: a man is cooking in the kitchen\nD: a woman is riding a bicycle", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_32_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_32_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_32_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_32_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_32_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_32_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_32_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_32_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_32_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_32_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_32_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_32_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_32_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_32_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_32_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_32_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man is riding the brown horse\nB: a person is taking pictures of the white horse\nC: the brown horse is playing with a child\nD: 
person is recording the brown horse which is having fun", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man is riding the brown horse\nB: a person is taking pictures of the white horse\nC: the brown horse is playing with a child\nD: person is recording the brown horse which is having fun", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_33_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_33_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_33_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_33_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_33_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_33_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_33_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_33_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_33_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_33_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_33_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_33_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_33_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_33_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_33_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_33_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a boy in green jacket talking about sofa cover\nB: a man in blue shirt playing with the pillow\nC: a woman in red dress explaining about cushion seat\nD: a girl in yellow dress sitting on the couch", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a boy in green jacket talking 
about sofa cover\nB: a man in blue shirt playing with the pillow\nC: a woman in red dress explaining about cushion seat\nD: a girl in yellow dress sitting on the couch", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_34_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_34_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_34_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_34_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_34_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_34_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_34_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_34_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_34_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_34_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_34_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_34_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_34_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_34_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_34_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_34_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a bird is chirping at the wall\nB: a dog is barking at the wall\nC: a cat is sleeping on the wall\nD: a cat is meowing at the wall", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a bird is chirping at the wall\nB: a dog is barking at the wall\nC: a cat is sleeping on the wall\nD: a cat is meowing at the wall", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_35_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_35_1.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_35_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_35_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_35_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_35_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_35_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_35_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_35_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_35_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_35_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_35_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_35_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_35_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_35_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_35_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a group of people dancing in a park\nB: a chef cooking in a restaurant kitchen\nC: someone speaking about a violent act regarding the police\nD: a cat sleeping on a windowsill", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a group of people dancing in a park\nB: a chef cooking in a restaurant kitchen\nC: someone speaking about a violent act regarding the police\nD: a cat sleeping on a windowsill", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_36_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_36_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_36_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_36_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_36_4.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_36_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_36_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_36_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_36_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_36_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_36_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_36_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_36_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_36_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_36_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_36_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man is playing a large guitar on the stage\nB: a woman is palying a small guitar or a ukulele on the street\nC: a person is sitting and holding a violin in the concert\nD: a girl is dancing with a small guitar in the park", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man is playing a large guitar on the stage\nB: a woman is palying a small guitar or a ukulele on the street\nC: a person is sitting and holding a violin in the concert\nD: a girl is dancing with a small guitar in the park", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_37_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_37_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_37_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_37_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_37_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_37_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_37_6.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_37_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_37_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_37_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_37_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_37_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_37_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_37_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_37_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_37_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a peaceful sunset at the beach\nB: a cute puppy playing in the park\nC: a fatality from mortal kombat is shown\nD: a group of friends laughing and having fun", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a peaceful sunset at the beach\nB: a cute puppy playing in the park\nC: a fatality from mortal kombat is shown\nD: a group of friends laughing and having fun", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_38_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_38_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_38_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_38_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_38_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_38_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_38_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_38_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_38_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_38_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_38_10.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_38_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_38_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_38_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_38_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_38_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the boy is riding the cycle\nB: the boy is driving a car\nC: the boy is walking the dog\nD: the girl is riding the cycle", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: the boy is riding the cycle\nB: the boy is driving a car\nC: the boy is walking the dog\nD: the girl is riding the cycle", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_39_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_39_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_39_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_39_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_39_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_39_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_39_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_39_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_39_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_39_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_39_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_39_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_39_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_39_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_39_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_39_15.png" + ], + "output": "A" + }, + { + 
"task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: people talking about their trip and how they are taken care of\nB: people sitting silently and looking bored\nC: a person alone and lost in a foreign land\nD: a group of people arguing and fighting", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: people talking about their trip and how they are taken care of\nB: people sitting silently and looking bored\nC: a person alone and lost in a foreign land\nD: a group of people arguing and fighting", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_40_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_40_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_40_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_40_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_40_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_40_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_40_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_40_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_40_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_40_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_40_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_40_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_40_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_40_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_40_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_40_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man and a woman painting a picture\nB: a man and a woman playing basketball\nC: a man 
and a woman dancing at a party\nD: a man a woman cooking on a cooking show", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man and a woman painting a picture\nB: a man and a woman playing basketball\nC: a man and a woman dancing at a party\nD: a man a woman cooking on a cooking show", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_41_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_41_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_41_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_41_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_41_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_41_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_41_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_41_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_41_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_41_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_41_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_41_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_41_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_41_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_41_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_41_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman shopping for groceries in a supermarket\nB: a group of friends having a picnic in a park\nC: a man explains the condition of someone in the hospital to the press outside of a building\nD: a child playing in a playground", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a 
woman shopping for groceries in a supermarket\nB: a group of friends having a picnic in a park\nC: a man explains the condition of someone in the hospital to the press outside of a building\nD: a child playing in a playground", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_42_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_42_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_42_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_42_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_42_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_42_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_42_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_42_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_42_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_42_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_42_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_42_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_42_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_42_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_42_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_42_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the beautiful scene on the screen\nB: an unattractive scene on the screen\nC: a dull landscape on the screen\nD: a boring view on the screen", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: the beautiful scene on the screen\nB: an unattractive scene on the screen\nC: a dull landscape on the screen\nD: a boring view on the screen", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_43_0.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_43_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_43_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_43_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_43_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_43_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_43_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_43_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_43_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_43_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_43_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_43_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_43_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_43_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_43_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_43_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a thin guy with a tie is looking at a woman\nB: a fat guy with a tie is looking at a man\nC: an overweight man with a bowtie is staring at a person\nD: a chubby man with a necktie is observing a gentleman", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a thin guy with a tie is looking at a woman\nB: a fat guy with a tie is looking at a man\nC: an overweight man with a bowtie is staring at a person\nD: a chubby man with a necktie is observing a gentleman", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_44_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_44_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_44_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_44_3.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_44_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_44_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_44_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_44_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_44_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_44_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_44_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_44_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_44_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_44_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_44_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_44_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a cat is taking a nap\nB: a dog is fetching a ball\nC: a man is riding a bicycle\nD: a woman is showing nail polish", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a cat is taking a nap\nB: a dog is fetching a ball\nC: a man is riding a bicycle\nD: a woman is showing nail polish", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_45_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_45_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_45_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_45_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_45_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_45_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_45_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_45_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_45_8.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_45_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_45_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_45_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_45_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_45_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_45_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_45_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a child wearing a yellow t-shirt running in the park\nB: a man in a blue shirt standing in front of the shelves\nC: a woman in a red dress sitting at the table\nD: the woman in the purple blouse talk as the shelves are behind her", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a child wearing a yellow t-shirt running in the park\nB: a man in a blue shirt standing in front of the shelves\nC: a woman in a red dress sitting at the table\nD: the woman in the purple blouse talk as the shelves are behind her", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_46_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_46_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_46_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_46_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_46_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_46_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_46_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_46_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_46_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_46_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_46_10.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_46_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_46_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_46_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_46_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_46_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: an old man shakes hands with another man and then they hug each other\nB: a young woman dances alone in a dark room\nC: a person cooks food in a kitchen\nD: a group of children play soccer in a field", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: an old man shakes hands with another man and then they hug each other\nB: a young woman dances alone in a dark room\nC: a person cooks food in a kitchen\nD: a group of children play soccer in a field", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_47_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_47_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_47_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_47_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_47_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_47_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_47_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_47_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_47_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_47_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_47_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_47_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_47_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_47_13.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_47_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_47_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: two different men pretending to be one person\nB: woman pretends to be two different people\nC: man pretends to be two different people\nD: man changes his appearance", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: two different men pretending to be one person\nB: woman pretends to be two different people\nC: man pretends to be two different people\nD: man changes his appearance", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_48_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_48_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_48_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_48_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_48_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_48_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_48_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_48_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_48_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_48_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_48_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_48_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_48_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_48_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_48_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_48_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + 
"options": "A: everyone can strive for mediocrity acting drums or planets\nB: some people may just be average playing pianos or moons\nC: nobody can reach excellence dancing violins or galaxies\nD: we can all be overachievers playing guitars or stars", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: everyone can strive for mediocrity acting drums or planets\nB: some people may just be average playing pianos or moons\nC: nobody can reach excellence dancing violins or galaxies\nD: we can all be overachievers playing guitars or stars", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_49_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_49_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_49_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_49_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_49_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_49_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_49_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_49_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_49_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_49_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_49_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_49_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_49_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_49_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_49_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_49_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a group of cartoon faces are singing and dancing\nB: a bunch of cartoon faces are chomping their teeth and making eating 
gestures\nC: a collection of cartoon faces are laughing and clapping\nD: several cartoon faces are looking sad and crying", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a group of cartoon faces are singing and dancing\nB: a bunch of cartoon faces are chomping their teeth and making eating gestures\nC: a collection of cartoon faces are laughing and clapping\nD: several cartoon faces are looking sad and crying", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_50_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_50_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_50_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_50_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_50_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_50_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_50_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_50_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_50_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_50_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_50_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_50_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_50_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_50_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_50_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_50_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: an elderly man is gardening\nB: a little girl is dancing\nC: a little boy is playing basketball\nD: two cats are fighting", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the 
following choices.\nA: an elderly man is gardening\nB: a little girl is dancing\nC: a little boy is playing basketball\nD: two cats are fighting", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_51_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_51_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_51_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_51_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_51_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_51_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_51_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_51_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_51_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_51_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_51_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_51_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_51_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_51_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_51_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_51_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman and man are talking with each other\nB: a woman and man are laughing with each other\nC: a woman and man are staring at each other\nD: a woman and man are arguing with each other", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman and man are talking with each other\nB: a woman and man are laughing with each other\nC: a woman and man are staring at each other\nD: a woman and man are arguing with each other", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_52_0.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_52_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_52_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_52_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_52_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_52_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_52_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_52_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_52_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_52_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_52_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_52_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_52_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_52_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_52_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_52_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a monkey is eating a banana\nB: a dog is sniffing a baby duck\nC: a bird is swimming in the water\nD: a cat is playing with a ball", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a monkey is eating a banana\nB: a dog is sniffing a baby duck\nC: a bird is swimming in the water\nD: a cat is playing with a ball", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_53_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_53_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_53_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_53_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_53_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_53_5.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_53_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_53_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_53_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_53_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_53_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_53_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_53_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_53_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_53_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_53_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a person dancing at a party\nB: a person giving his opinion on how crowded the world is\nC: a person painting a landscape\nD: a person cooking in the kitchen", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a person dancing at a party\nB: a person giving his opinion on how crowded the world is\nC: a person painting a landscape\nD: a person cooking in the kitchen", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_54_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_54_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_54_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_54_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_54_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_54_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_54_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_54_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_54_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_54_9.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_54_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_54_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_54_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_54_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_54_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_54_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: someone tore a triangular red paper\nB: someone unfolded a round blue paper\nC: someone folded a square yellow paper\nD: someone crumpled a rectangular green paper", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: someone tore a triangular red paper\nB: someone unfolded a round blue paper\nC: someone folded a square yellow paper\nD: someone crumpled a rectangular green paper", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_55_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_55_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_55_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_55_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_55_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_55_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_55_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_55_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_55_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_55_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_55_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_55_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_55_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_55_13.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_55_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_55_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the man is playing the guitar\nB: a woman is dancing\nC: a child is riding a bicycle\nD: a cat is playing the piano", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: the man is playing the guitar\nB: a woman is dancing\nC: a child is riding a bicycle\nD: a cat is playing the piano", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_56_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_56_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_56_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_56_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_56_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_56_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_56_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_56_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_56_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_56_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_56_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_56_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_56_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_56_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_56_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_56_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: two women are embracing\nB: a man and a woman are dancing\nC: a woman is crying alone\nD: 
two men are fighting", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: two women are embracing\nB: a man and a woman are dancing\nC: a woman is crying alone\nD: two men are fighting", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_57_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_57_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_57_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_57_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_57_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_57_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_57_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_57_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_57_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_57_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_57_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_57_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_57_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_57_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_57_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_57_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a bird is swimming in a lake\nB: a dog is driving a car\nC: a cat is sitting on a bicycle\nD: a monkey is riding a bus", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a bird is swimming in a lake\nB: a dog is driving a car\nC: a cat is sitting on a bicycle\nD: a monkey is riding a bus", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_58_0.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_58_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_58_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_58_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_58_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_58_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_58_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_58_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_58_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_58_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_58_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_58_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_58_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_58_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_58_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_58_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man demonstrating how to assemble a bicycle\nB: a group of people doing yoga in a park\nC: two ladies in a cookery show explain how to marinate chicken already cleaned and ready with salt and cilantro sprigs\nD: a person painting a landscape with a brush and palette", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man demonstrating how to assemble a bicycle\nB: a group of people doing yoga in a park\nC: two ladies in a cookery show explain how to marinate chicken already cleaned and ready with salt and cilantro sprigs\nD: a person painting a landscape with a brush and palette", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_59_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_59_1.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_59_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_59_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_59_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_59_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_59_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_59_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_59_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_59_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_59_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_59_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_59_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_59_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_59_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_59_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a car is driving along a busy street\nB: a computer is displaying a software code\nC: several people are talking in a room\nD: a flower and other natural scenes are displaying", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a car is driving along a busy street\nB: a computer is displaying a software code\nC: several people are talking in a room\nD: a flower and other natural scenes are displaying", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_60_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_60_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_60_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_60_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_60_4.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_60_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_60_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_60_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_60_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_60_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_60_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_60_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_60_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_60_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_60_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_60_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a player is putting a basket ball into the basket from distance\nB: a player is shooting a soccer ball into the goal from close range\nC: a player is hitting a tennis ball across the net with a racket\nD: a player is throwing a football into the end zone from a distance", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a player is putting a basket ball into the basket from distance\nB: a player is shooting a soccer ball into the goal from close range\nC: a player is hitting a tennis ball across the net with a racket\nD: a player is throwing a football into the end zone from a distance", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_61_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_61_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_61_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_61_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_61_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_61_5.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_61_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_61_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_61_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_61_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_61_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_61_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_61_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_61_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_61_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_61_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a video of animals running in a jungle\nB: a video is shown showing different cars\nC: a video of a chef cooking in a restaurant\nD: a video of people shopping in a mall", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a video of animals running in a jungle\nB: a video is shown showing different cars\nC: a video of a chef cooking in a restaurant\nD: a video of people shopping in a mall", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_62_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_62_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_62_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_62_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_62_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_62_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_62_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_62_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_62_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_62_9.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_62_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_62_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_62_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_62_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_62_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_62_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a child is riding a bicycle in the park\nB: a man is swimming in a pool\nC: a man is playing baseball\nD: a woman is cooking in the kitchen", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a child is riding a bicycle in the park\nB: a man is swimming in a pool\nC: a man is playing baseball\nD: a woman is cooking in the kitchen", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_63_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_63_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_63_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_63_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_63_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_63_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_63_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_63_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_63_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_63_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_63_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_63_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_63_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_63_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_63_14.png", 
+ "../MMIU-Benchmark/video_captioning/video_captioning_63_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man throwing a ball and a man missing the catch\nB: a man kicking a ball and a man dropping the ball\nC: a man hitting a ball and man catching the ball in the field\nD: a man hitting a ball and the ball rolling away", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man throwing a ball and a man missing the catch\nB: a man kicking a ball and a man dropping the ball\nC: a man hitting a ball and man catching the ball in the field\nD: a man hitting a ball and the ball rolling away", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_64_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_64_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_64_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_64_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_64_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_64_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_64_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_64_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_64_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_64_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_64_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_64_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_64_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_64_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_64_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_64_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or 
Natural image", + "source": "source", + "options": "A: the chef is grilling chicken\nB: the man is chopping tomatoes\nC: the man is cooking onions\nD: the woman is frying eggs", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: the chef is grilling chicken\nB: the man is chopping tomatoes\nC: the man is cooking onions\nD: the woman is frying eggs", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_65_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_65_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_65_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_65_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_65_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_65_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_65_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_65_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_65_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_65_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_65_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_65_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_65_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_65_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_65_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_65_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: young man in a black jacket playing the guitar\nB: blonde woman in red dress dancing in the kitchen\nC: elderly woman in purple sweater knitting a scarf\nD: bearded guy in grey tshirt talking to the camera", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select 
from the following choices.\nA: young man in a black jacket playing the guitar\nB: blonde woman in red dress dancing in the kitchen\nC: elderly woman in purple sweater knitting a scarf\nD: bearded guy in grey tshirt talking to the camera", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_66_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_66_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_66_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_66_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_66_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_66_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_66_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_66_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_66_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_66_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_66_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_66_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_66_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_66_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_66_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_66_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the couple walked arm in arm down the path\nB: two people ran together along the sidewalk\nC: a single person strolled casually along the road\nD: the group skipped happily through the field", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: the couple walked arm in arm down the path\nB: two people ran together along the sidewalk\nC: a single person strolled casually along the road\nD: the group skipped happily through 
the field", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_67_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_67_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_67_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_67_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_67_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_67_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_67_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_67_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_67_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_67_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_67_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_67_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_67_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_67_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_67_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_67_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a chef prepares a delicious meal\nB: a woman opens a door to find a surprise party\nC: a cat plays with a ball of yarn\nD: a man punches a faucet to show how much better bottled water is", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a chef prepares a delicious meal\nB: a woman opens a door to find a surprise party\nC: a cat plays with a ball of yarn\nD: a man punches a faucet to show how much better bottled water is", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_68_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_68_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_68_2.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_68_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_68_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_68_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_68_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_68_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_68_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_68_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_68_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_68_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_68_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_68_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_68_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_68_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a yellow bird is swimming in the ocean\nB: a black poodle is giving a man a highfive\nC: a brown dog is playing with a ball\nD: a white cat is riding a bicycle", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a yellow bird is swimming in the ocean\nB: a black poodle is giving a man a highfive\nC: a brown dog is playing with a ball\nD: a white cat is riding a bicycle", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_69_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_69_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_69_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_69_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_69_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_69_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_69_6.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_69_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_69_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_69_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_69_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_69_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_69_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_69_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_69_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_69_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a dog is chasing a squirrel\nB: a bird is building a nest\nC: someone is holding a skunk\nD: a cat is playing with a ball", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a dog is chasing a squirrel\nB: a bird is building a nest\nC: someone is holding a skunk\nD: a cat is playing with a ball", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_70_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_70_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_70_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_70_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_70_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_70_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_70_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_70_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_70_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_70_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_70_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_70_11.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_70_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_70_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_70_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_70_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman wearing a red dress dances in front of a mirror\nB: two dogs playing in the park\nC: a chef cooking in the kitchen\nD: a guy wearing a black shirt talks and shows a chart on the tv screen", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman wearing a red dress dances in front of a mirror\nB: two dogs playing in the park\nC: a chef cooking in the kitchen\nD: a guy wearing a black shirt talks and shows a chart on the tv screen", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_71_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_71_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_71_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_71_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_71_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_71_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_71_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_71_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_71_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_71_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_71_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_71_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_71_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_71_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_71_14.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_71_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a panda is searching something\nB: a tiger is hunting for prey\nC: a bear is swimming in the river\nD: a monkey is swinging from tree to tree", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a panda is searching something\nB: a tiger is hunting for prey\nC: a bear is swimming in the river\nD: a monkey is swinging from tree to tree", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_72_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_72_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_72_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_72_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_72_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_72_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_72_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_72_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_72_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_72_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_72_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_72_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_72_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_72_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_72_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_72_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a chef prepares a meal\nB: an artist explains a sketch\nC: a teacher solves a math problem\nD: a 
construction worker operates heavy machinery", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a chef prepares a meal\nB: an artist explains a sketch\nC: a teacher solves a math problem\nD: a construction worker operates heavy machinery", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_73_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_73_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_73_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_73_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_73_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_73_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_73_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_73_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_73_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_73_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_73_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_73_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_73_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_73_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_73_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_73_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman is standing on a table\nB: a man is sitting on a chair\nC: a cat is jumping on the bed\nD: a dog is lying on the floor", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman is standing on a table\nB: a man is sitting on a chair\nC: a cat is jumping on the bed\nD: a dog is lying on the floor", + "input_image_path": [ + 
"../MMIU-Benchmark/video_captioning/video_captioning_74_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_74_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_74_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_74_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_74_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_74_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_74_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_74_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_74_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_74_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_74_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_74_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_74_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_74_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_74_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_74_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman discusses her favorite movie and its impact on her life\nB: a man talks about a new diet plan he follows every day\nC: a woman talks about a skin care treatment she takes with her everwhere\nD: a group of people participate in a cooking competition and share their recipes with the audience", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman discusses her favorite movie and its impact on her life\nB: a man talks about a new diet plan he follows every day\nC: a woman talks about a skin care treatment she takes with her everwhere\nD: a group of people participate in a cooking competition and share their recipes with the audience", + "input_image_path": [ + 
"../MMIU-Benchmark/video_captioning/video_captioning_75_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_75_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_75_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_75_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_75_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_75_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_75_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_75_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_75_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_75_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_75_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_75_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_75_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_75_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_75_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_75_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: women are getting into the taxi\nB: men are walking away from the taxi\nC: girls are getting down from the taxi\nD: boys are getting down from the bus", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: women are getting into the taxi\nB: men are walking away from the taxi\nC: girls are getting down from the taxi\nD: boys are getting down from the bus", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_76_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_76_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_76_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_76_3.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_76_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_76_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_76_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_76_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_76_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_76_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_76_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_76_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_76_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_76_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_76_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_76_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a chef slices a tomato with a kitchen knife\nB: a man chops a chicken in two with an axe\nC: a woman cuts a watermelon with a knife\nD: a man cooks scrambled eggs in a pan", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a chef slices a tomato with a kitchen knife\nB: a man chops a chicken in two with an axe\nC: a woman cuts a watermelon with a knife\nD: a man cooks scrambled eggs in a pan", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_77_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_77_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_77_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_77_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_77_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_77_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_77_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_77_7.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_77_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_77_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_77_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_77_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_77_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_77_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_77_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_77_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: film critics interpreting the meaning behind movie moments\nB: couples describing the logic behind movie scenes\nC: individuals analyzing the emotions in movie clips\nD: actors discussing their favorite movie scenes", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: film critics interpreting the meaning behind movie moments\nB: couples describing the logic behind movie scenes\nC: individuals analyzing the emotions in movie clips\nD: actors discussing their favorite movie scenes", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_78_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_78_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_78_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_78_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_78_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_78_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_78_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_78_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_78_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_78_9.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_78_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_78_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_78_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_78_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_78_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_78_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a person rides a bike downhill\nB: a person drives a car on the highway\nC: a person walks a dog in the park\nD: a person flies a kite in the field", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a person rides a bike downhill\nB: a person drives a car on the highway\nC: a person walks a dog in the park\nD: a person flies a kite in the field", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_79_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_79_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_79_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_79_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_79_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_79_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_79_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_79_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_79_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_79_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_79_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_79_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_79_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_79_13.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_79_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_79_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a child is playing with his toy\nB: a man is drinking his drink\nC: a person is sleeping on the bed\nD: a woman is eating her food", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a child is playing with his toy\nB: a man is drinking his drink\nC: a person is sleeping on the bed\nD: a woman is eating her food", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_80_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_80_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_80_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_80_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_80_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_80_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_80_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_80_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_80_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_80_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_80_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_80_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_80_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_80_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_80_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_80_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a child is playing with toys\nB: a man is shaving\nC: a 
woman is putting on makeup\nD: a woman is cooking a meal", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a child is playing with toys\nB: a man is shaving\nC: a woman is putting on makeup\nD: a woman is cooking a meal", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_81_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_81_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_81_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_81_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_81_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_81_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_81_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_81_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_81_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_81_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_81_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_81_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_81_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_81_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_81_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_81_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: two animals are fighting over a bowl of fruit\nB: a single animal is eating from a plate\nC: two animals are eating what appears to be apple slices from a pan within a cage\nD: two animals are playing with a ball inside a cage", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: two animals are fighting over a bowl of fruit\nB: a single animal is 
eating from a plate\nC: two animals are eating what appears to be apple slices from a pan within a cage\nD: two animals are playing with a ball inside a cage", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_82_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_82_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_82_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_82_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_82_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_82_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_82_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_82_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_82_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_82_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_82_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_82_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_82_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_82_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_82_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_82_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a person is riding a bicycle on a mountain trail\nB: a person is swimming in a pool\nC: a person is being pushed on a stretcher\nD: a person is playing with a dog in a park", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a person is riding a bicycle on a mountain trail\nB: a person is swimming in a pool\nC: a person is being pushed on a stretcher\nD: a person is playing with a dog in a park", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_83_0.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_83_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_83_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_83_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_83_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_83_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_83_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_83_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_83_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_83_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_83_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_83_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_83_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_83_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_83_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_83_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a bird flies across the room\nB: a dog jumps over a table\nC: a toddler walks by pushing a chair\nD: a kitten runs by pulling a cart", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a bird flies across the room\nB: a dog jumps over a table\nC: a toddler walks by pushing a chair\nD: a kitten runs by pulling a cart", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_84_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_84_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_84_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_84_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_84_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_84_5.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_84_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_84_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_84_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_84_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_84_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_84_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_84_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_84_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_84_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_84_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a bird is flying in the sky\nB: a dog is chasing a ball\nC: a person is reading a book\nD: a cat is eating food", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a bird is flying in the sky\nB: a dog is chasing a ball\nC: a person is reading a book\nD: a cat is eating food", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_85_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_85_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_85_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_85_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_85_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_85_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_85_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_85_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_85_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_85_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_85_10.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_85_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_85_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_85_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_85_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_85_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man with a cap and walking\nB: a boy wearing a helmet and playing\nC: a woman with a scarf and standing still\nD: a girl with a hat on and dancing", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man with a cap and walking\nB: a boy wearing a helmet and playing\nC: a woman with a scarf and standing still\nD: a girl with a hat on and dancing", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_86_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_86_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_86_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_86_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_86_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_86_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_86_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_86_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_86_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_86_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_86_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_86_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_86_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_86_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_86_14.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_86_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a female tennis player takes a break while the audience watches in silence\nB: a male basketball player receives a penalty while being booed by the crowd\nC: a female soccer player accepts a reward while being cheered on by the crowd\nD: a male football player scores a goal and celebrates alone on the field", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a female tennis player takes a break while the audience watches in silence\nB: a male basketball player receives a penalty while being booed by the crowd\nC: a female soccer player accepts a reward while being cheered on by the crowd\nD: a male football player scores a goal and celebrates alone on the field", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_87_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_87_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_87_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_87_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_87_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_87_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_87_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_87_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_87_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_87_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_87_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_87_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_87_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_87_13.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_87_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_87_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a clip from fox news on the shelby north carolina shooting\nB: a clip from msnbc on the shelby north carolina shooting\nC: a clip from abc news on the shelby north carolina shooting\nD: a clip from cnn on the shelby north carolina shooting", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a clip from fox news on the shelby north carolina shooting\nB: a clip from msnbc on the shelby north carolina shooting\nC: a clip from abc news on the shelby north carolina shooting\nD: a clip from cnn on the shelby north carolina shooting", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_88_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_88_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_88_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_88_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_88_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_88_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_88_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_88_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_88_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_88_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_88_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_88_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_88_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_88_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_88_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_88_15.png" + 
], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman cooking in the kitchen\nB: a man shows how a video game works\nC: a child playing with a pet in the park\nD: a group of students studying in a library", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman cooking in the kitchen\nB: a man shows how a video game works\nC: a child playing with a pet in the park\nD: a group of students studying in a library", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_89_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_89_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_89_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_89_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_89_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_89_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_89_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_89_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_89_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_89_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_89_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_89_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_89_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_89_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_89_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_89_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: people are setting up for a cooking show\nB: a group of individuals are rehearsing for a theater performance\nC: men are getting ready 
for a music program\nD: women are preparing for a dance competition", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: people are setting up for a cooking show\nB: a group of individuals are rehearsing for a theater performance\nC: men are getting ready for a music program\nD: women are preparing for a dance competition", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_90_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_90_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_90_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_90_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_90_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_90_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_90_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_90_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_90_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_90_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_90_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_90_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_90_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_90_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_90_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_90_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman is cutting a vegetable into slices with a knife\nB: a chef is grilling a steak on a barbecue\nC: a man is slicing bread with a fork\nD: a person is peeling an apple with a spoon", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman is 
cutting a vegetable into slices with a knife\nB: a chef is grilling a steak on a barbecue\nC: a man is slicing bread with a fork\nD: a person is peeling an apple with a spoon", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_91_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_91_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_91_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_91_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_91_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_91_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_91_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_91_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_91_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_91_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_91_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_91_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_91_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_91_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_91_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_91_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the cat is sleeping on the sofa\nB: the man is cooking in the kitchen\nC: the child is playing with a toy\nD: the woman has a baby monitor", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: the cat is sleeping on the sofa\nB: the man is cooking in the kitchen\nC: the child is playing with a toy\nD: the woman has a baby monitor", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_92_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_92_1.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_92_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_92_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_92_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_92_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_92_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_92_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_92_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_92_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_92_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_92_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_92_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_92_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_92_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_92_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a person is cooking\nB: a person is sleeping\nC: a person is eating\nD: a person is swimming", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a person is cooking\nB: a person is sleeping\nC: a person is eating\nD: a person is swimming", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_93_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_93_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_93_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_93_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_93_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_93_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_93_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_93_7.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_93_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_93_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_93_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_93_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_93_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_93_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_93_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_93_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man is doing stunts on a motorcycle\nB: a girl is swimming in the pool\nC: a boy is playing basketball\nD: a woman is riding a bicycle", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man is doing stunts on a motorcycle\nB: a girl is swimming in the pool\nC: a boy is playing basketball\nD: a woman is riding a bicycle", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_94_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_94_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_94_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_94_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_94_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_94_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_94_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_94_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_94_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_94_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_94_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_94_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_94_12.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_94_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_94_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_94_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: making coffee in the microwave for flavor\nB: preparing tea in the oven for taste\nC: boiling water on the stove for fragrance\nD: brewing hot chocolate in the kettle for aroma", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: making coffee in the microwave for flavor\nB: preparing tea in the oven for taste\nC: boiling water on the stove for fragrance\nD: brewing hot chocolate in the kettle for aroma", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_95_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_95_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_95_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_95_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_95_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_95_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_95_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_95_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_95_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_95_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_95_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_95_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_95_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_95_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_95_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_95_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", 
+ "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: lego stormtroppers are in a facility\nB: lego stormtroppers in a spaceship\nC: lego stormtroppers at a beach\nD: lego stormtroppers in a forest", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: lego stormtroppers are in a facility\nB: lego stormtroppers in a spaceship\nC: lego stormtroppers at a beach\nD: lego stormtroppers in a forest", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_96_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_96_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_96_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_96_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_96_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_96_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_96_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_96_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_96_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_96_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_96_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_96_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_96_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_96_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_96_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_96_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a green SUV with a group of people admiring the SUV\nB: a red pickup truck with a woman speaking about the truck\nC: a blue sedan with a person walking around the sedan\nD: a yellow sports car with a guy speaking about the 
car", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a green SUV with a group of people admiring the SUV\nB: a red pickup truck with a woman speaking about the truck\nC: a blue sedan with a person walking around the sedan\nD: a yellow sports car with a guy speaking about the car", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_97_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_97_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_97_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_97_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_97_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_97_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_97_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_97_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_97_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_97_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_97_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_97_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_97_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_97_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_97_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_97_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the children are walking\nB: the women are walking\nC: the men are walking\nD: the women are running", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: the children are walking\nB: the women are walking\nC: the men are walking\nD: the women are running", + "input_image_path": [ + 
"../MMIU-Benchmark/video_captioning/video_captioning_98_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_98_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_98_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_98_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_98_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_98_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_98_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_98_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_98_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_98_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_98_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_98_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_98_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_98_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_98_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_98_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a cat is sitting on a tree\nB: one micky mouse is talking to other\nC: three cars are racing on a track\nD: two dogs are playing with a ball", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a cat is sitting on a tree\nB: one micky mouse is talking to other\nC: three cars are racing on a track\nD: two dogs are playing with a ball", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_99_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_99_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_99_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_99_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_99_4.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_99_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_99_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_99_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_99_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_99_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_99_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_99_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_99_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_99_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_99_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_99_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a child is playing in the sand\nB: a woman is pulled into the water\nC: a woman is walking on the beach\nD: a man is pulled into the water", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a child is playing in the sand\nB: a woman is pulled into the water\nC: a woman is walking on the beach\nD: a man is pulled into the water", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_100_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_100_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_100_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_100_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_100_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_100_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_100_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_100_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_100_8.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_100_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_100_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_100_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_100_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_100_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_100_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_100_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a helicopter is flying over a peaceful city\nB: a helicopter is shown flying in what seems to be a war zone in syria\nC: a helicopter is delivering supplies to a humanitarian mission in a conflict zone\nD: a helicopter is performing aerial acrobatics in a deserted area", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a helicopter is flying over a peaceful city\nB: a helicopter is shown flying in what seems to be a war zone in syria\nC: a helicopter is delivering supplies to a humanitarian mission in a conflict zone\nD: a helicopter is performing aerial acrobatics in a deserted area", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_101_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_101_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_101_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_101_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_101_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_101_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_101_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_101_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_101_8.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_101_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_101_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_101_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_101_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_101_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_101_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_101_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man rolls down a flight of stairs\nB: a man climbs up a flight of stairs\nC: a man rides a bicycle down a hill\nD: a man walks across a bridge", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man rolls down a flight of stairs\nB: a man climbs up a flight of stairs\nC: a man rides a bicycle down a hill\nD: a man walks across a bridge", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_102_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_102_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_102_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_102_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_102_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_102_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_102_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_102_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_102_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_102_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_102_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_102_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_102_12.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_102_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_102_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_102_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the joggers walked slowly on the path\nB: the athletes dashed through the stadium\nC: the runners jogged leisurely on the course\nD: the racers sprinted down the track", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: the joggers walked slowly on the path\nB: the athletes dashed through the stadium\nC: the runners jogged leisurely on the course\nD: the racers sprinted down the track", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_103_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_103_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_103_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_103_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_103_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_103_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_103_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_103_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_103_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_103_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_103_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_103_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_103_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_103_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_103_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_103_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", 
+ "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman is demonstrating a nail painting technique\nB: a child is playing with a toy car\nC: a chef is preparing sushi\nD: a man is fixing a bicycle tire", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman is demonstrating a nail painting technique\nB: a child is playing with a toy car\nC: a chef is preparing sushi\nD: a man is fixing a bicycle tire", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_104_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_104_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_104_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_104_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_104_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_104_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_104_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_104_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_104_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_104_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_104_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_104_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_104_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_104_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_104_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_104_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a group of people are dancing\nB: a child is playing with toys\nC: a man is cooking on the stove\nD: a woman is mixing ingrediants", + "question": "Please generate textual descriptions for 
a sequence of video frames.", + "context": "Select from the following choices.\nA: a group of people are dancing\nB: a child is playing with toys\nC: a man is cooking on the stove\nD: a woman is mixing ingrediants", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_105_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_105_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_105_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_105_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_105_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_105_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_105_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_105_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_105_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_105_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_105_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_105_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_105_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_105_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_105_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_105_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: vest of sports vines\nB: collection of outdoor flowers\nC: jacket of workout branches\nD: shirt of athletic trees", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: vest of sports vines\nB: collection of outdoor flowers\nC: jacket of workout branches\nD: shirt of athletic trees", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_106_0.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_106_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_106_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_106_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_106_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_106_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_106_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_106_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_106_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_106_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_106_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_106_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_106_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_106_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_106_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_106_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a bird is building a nest in a tree\nB: two guinea pigs are eating leaves\nC: a cat is sleeping on a chair\nD: two dogs are chasing a ball", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a bird is building a nest in a tree\nB: two guinea pigs are eating leaves\nC: a cat is sleeping on a chair\nD: two dogs are chasing a ball", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_107_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_107_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_107_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_107_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_107_4.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_107_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_107_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_107_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_107_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_107_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_107_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_107_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_107_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_107_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_107_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_107_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: two lions are hunting for food in the jungle\nB: a group of monkeys are playing in the snow\nC: two elphants are cleaning themselves in some water\nD: a herd of zebras are grazing in the savanna", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: two lions are hunting for food in the jungle\nB: a group of monkeys are playing in the snow\nC: two elphants are cleaning themselves in some water\nD: a herd of zebras are grazing in the savanna", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_108_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_108_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_108_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_108_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_108_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_108_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_108_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_108_7.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_108_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_108_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_108_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_108_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_108_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_108_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_108_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_108_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman is cooking in the kitchen\nB: a child is playing with toys\nC: a dog is barking at strangers\nD: a man is talking about appliances", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman is cooking in the kitchen\nB: a child is playing with toys\nC: a dog is barking at strangers\nD: a man is talking about appliances", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_109_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_109_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_109_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_109_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_109_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_109_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_109_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_109_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_109_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_109_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_109_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_109_11.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_109_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_109_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_109_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_109_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: threee kids sing together on the voice\nB: two kids sing alone on the voice\nC: four kids dance together on the voice\nD: three kids talk together on the voice", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: threee kids sing together on the voice\nB: two kids sing alone on the voice\nC: four kids dance together on the voice\nD: three kids talk together on the voice", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_110_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_110_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_110_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_110_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_110_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_110_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_110_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_110_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_110_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_110_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_110_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_110_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_110_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_110_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_110_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_110_15.png" + ], + 
"output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a boy is getting off a bicycle\nB: a guy is getting out of a plane\nC: a woman is getting out of a car\nD: a man is swimming in a pool", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a boy is getting off a bicycle\nB: a guy is getting out of a plane\nC: a woman is getting out of a car\nD: a man is swimming in a pool", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_111_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_111_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_111_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_111_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_111_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_111_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_111_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_111_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_111_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_111_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_111_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_111_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_111_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_111_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_111_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_111_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a child is riding a bike\nB: a man is playing guitar\nC: a dog is chasing a ball\nD: a lady is cutting onion", + "question": "Please generate textual descriptions for a 
sequence of video frames.", + "context": "Select from the following choices.\nA: a child is riding a bike\nB: a man is playing guitar\nC: a dog is chasing a ball\nD: a lady is cutting onion", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_112_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_112_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_112_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_112_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_112_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_112_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_112_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_112_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_112_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_112_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_112_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_112_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_112_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_112_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_112_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_112_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a cat is licking a toy\nB: a dog is licking a baby\nC: a cat is licking a baby\nD: a baby is licking a cat", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a cat is licking a toy\nB: a dog is licking a baby\nC: a cat is licking a baby\nD: a baby is licking a cat", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_113_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_113_1.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_113_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_113_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_113_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_113_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_113_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_113_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_113_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_113_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_113_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_113_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_113_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_113_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_113_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_113_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a group of babies are sitting in the stage\nB: a group of adults are standing in the stage\nC: a group of babies are standing in the stage\nD: a crowd of children are running on the stage", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a group of babies are sitting in the stage\nB: a group of adults are standing in the stage\nC: a group of babies are standing in the stage\nD: a crowd of children are running on the stage", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_114_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_114_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_114_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_114_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_114_4.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_114_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_114_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_114_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_114_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_114_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_114_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_114_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_114_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_114_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_114_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_114_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man is washing a car with a hose\nB: a dog is chasing a ball in the park\nC: a woman is sitting in a chair reading a book\nD: a man is lifting the back end of a small pickup up off the ground", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man is washing a car with a hose\nB: a dog is chasing a ball in the park\nC: a woman is sitting in a chair reading a book\nD: a man is lifting the back end of a small pickup up off the ground", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_115_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_115_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_115_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_115_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_115_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_115_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_115_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_115_7.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_115_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_115_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_115_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_115_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_115_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_115_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_115_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_115_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a person is driving a car\nB: a bird is flying in the sky\nC: a dog is playing with a ball in the yard\nD: a cat is washing its head under a tap", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a person is driving a car\nB: a bird is flying in the sky\nC: a dog is playing with a ball in the yard\nD: a cat is washing its head under a tap", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_116_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_116_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_116_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_116_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_116_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_116_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_116_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_116_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_116_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_116_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_116_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_116_11.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_116_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_116_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_116_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_116_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: two fairy with green hair is sitting on a cloud\nB: a mermaid swimming with blue hair\nC: two mermaid with red hair is sitting on a rock\nD: a mermaid with purple hair sitting on a rock", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: two fairy with green hair is sitting on a cloud\nB: a mermaid swimming with blue hair\nC: two mermaid with red hair is sitting on a rock\nD: a mermaid with purple hair sitting on a rock", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_117_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_117_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_117_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_117_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_117_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_117_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_117_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_117_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_117_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_117_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_117_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_117_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_117_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_117_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_117_14.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_117_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: characters from video games are dancing to old mc donald had a farm\nB: characters from video games are singing old mc donald had a farm\nC: real-life people playing old mc donald had a farm on musical instruments\nD: animated animals dancing to old mc donald had a farm", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: characters from video games are dancing to old mc donald had a farm\nB: characters from video games are singing old mc donald had a farm\nC: real-life people playing old mc donald had a farm on musical instruments\nD: animated animals dancing to old mc donald had a farm", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_118_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_118_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_118_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_118_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_118_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_118_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_118_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_118_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_118_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_118_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_118_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_118_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_118_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_118_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_118_14.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_118_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the animals are fighting over food\nB: the animals are ignoring each other and not eating\nC: the animals are having nice time together and eating food\nD: the animals are alone and not interacting with each other", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: the animals are fighting over food\nB: the animals are ignoring each other and not eating\nC: the animals are having nice time together and eating food\nD: the animals are alone and not interacting with each other", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_119_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_119_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_119_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_119_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_119_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_119_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_119_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_119_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_119_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_119_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_119_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_119_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_119_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_119_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_119_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_119_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or 
Natural image", + "source": "source", + "options": "A: a group of people are running\nB: a man is singing while walking\nC: a woman is playing guitar\nD: a man is dancing alone", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a group of people are running\nB: a man is singing while walking\nC: a woman is playing guitar\nD: a man is dancing alone", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_120_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_120_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_120_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_120_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_120_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_120_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_120_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_120_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_120_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_120_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_120_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_120_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_120_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_120_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_120_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_120_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man is playing a guitar\nB: a woman is watering plants in the garden\nC: a man is typing on a computer keyboard\nD: a child is riding a bicycle", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man is 
playing a guitar\nB: a woman is watering plants in the garden\nC: a man is typing on a computer keyboard\nD: a child is riding a bicycle", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_121_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_121_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_121_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_121_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_121_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_121_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_121_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_121_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_121_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_121_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_121_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_121_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_121_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_121_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_121_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_121_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a girl and a man are talking to each other\nB: a boy and a girl are playing a game\nC: a group of people are having a picnic\nD: a woman and a man are arguing with each other", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a girl and a man are talking to each other\nB: a boy and a girl are playing a game\nC: a group of people are having a picnic\nD: a woman and a man are arguing with each other", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_122_0.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_122_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_122_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_122_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_122_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_122_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_122_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_122_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_122_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_122_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_122_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_122_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_122_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_122_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_122_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_122_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man is fixing a car\nB: a child is playing with toys\nC: a woman is putting on makeup\nD: a chef is cooking in the kitchen", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man is fixing a car\nB: a child is playing with toys\nC: a woman is putting on makeup\nD: a chef is cooking in the kitchen", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_123_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_123_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_123_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_123_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_123_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_123_5.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_123_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_123_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_123_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_123_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_123_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_123_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_123_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_123_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_123_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_123_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: couple walking in a park\nB: children playing in the snow\nC: people swimming in a pool\nD: boys and girls dancing and singing on beach", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: couple walking in a park\nB: children playing in the snow\nC: people swimming in a pool\nD: boys and girls dancing and singing on beach", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_124_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_124_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_124_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_124_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_124_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_124_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_124_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_124_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_124_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_124_9.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_124_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_124_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_124_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_124_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_124_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_124_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: how to open a banana properly\nB: methods for slicing a cucumber\nC: techniques for cutting an apple\nD: ways to peel a potato", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: how to open a banana properly\nB: methods for slicing a cucumber\nC: techniques for cutting an apple\nD: ways to peel a potato", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_125_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_125_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_125_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_125_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_125_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_125_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_125_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_125_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_125_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_125_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_125_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_125_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_125_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_125_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_125_14.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_125_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: two birds in a nest one white chick and one green adult\nB: two pigeons in a nest one white baby and one green parent\nC: three parrots in a bird cage one white chick and two green adults\nD: two parrots in a bird cage one white chick and on green adult", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: two birds in a nest one white chick and one green adult\nB: two pigeons in a nest one white baby and one green parent\nC: three parrots in a bird cage one white chick and two green adults\nD: two parrots in a bird cage one white chick and on green adult", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_126_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_126_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_126_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_126_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_126_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_126_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_126_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_126_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_126_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_126_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_126_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_126_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_126_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_126_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_126_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_126_15.png" + ], + "output": "D" + 
}, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a group of people in winter clothing skiing down a snowy slope\nB: a man in striped collared shirt discusses jobs in news room of bloomberg\nC: a woman in a floral dress gardening in her backyard\nD: a child in a superhero costume playing in a playground", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a group of people in winter clothing skiing down a snowy slope\nB: a man in striped collared shirt discusses jobs in news room of bloomberg\nC: a woman in a floral dress gardening in her backyard\nD: a child in a superhero costume playing in a playground", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_127_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_127_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_127_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_127_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_127_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_127_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_127_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_127_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_127_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_127_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_127_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_127_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_127_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_127_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_127_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_127_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or 
Natural image", + "source": "source", + "options": "A: people are dancing but there are no advertisements\nB: no music is playing and there are no advertisements\nC: the video is silent and there are no advertisements\nD: music is playing and advertisements was showing", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: people are dancing but there are no advertisements\nB: no music is playing and there are no advertisements\nC: the video is silent and there are no advertisements\nD: music is playing and advertisements was showing", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_128_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_128_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_128_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_128_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_128_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_128_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_128_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_128_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_128_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_128_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_128_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_128_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_128_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_128_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_128_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_128_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: audience watches a magic show\nB: street performer entertains a small group\nC: boy band performs for a 
crowd\nD: solo artist sings in an empty room", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: audience watches a magic show\nB: street performer entertains a small group\nC: boy band performs for a crowd\nD: solo artist sings in an empty room", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_129_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_129_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_129_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_129_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_129_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_129_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_129_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_129_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_129_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_129_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_129_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_129_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_129_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_129_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_129_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_129_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a person is knitting a sweater\nB: a person is brushing a cat\nC: a person is cutting a cake\nD: a person is washing a car", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a person is knitting a sweater\nB: a person is brushing a cat\nC: a person is cutting a cake\nD: a person is washing a car", + 
"input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_130_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_130_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_130_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_130_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_130_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_130_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_130_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_130_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_130_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_130_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_130_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_130_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_130_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_130_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_130_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_130_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a baby throws a phone receiver\nB: a baby plays with a phone receiver\nC: a baby puts down a phone receiver\nD: a baby picks up a phone receiver", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a baby throws a phone receiver\nB: a baby plays with a phone receiver\nC: a baby puts down a phone receiver\nD: a baby picks up a phone receiver", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_131_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_131_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_131_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_131_3.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_131_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_131_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_131_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_131_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_131_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_131_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_131_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_131_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_131_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_131_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_131_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_131_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: many people are doing a native dance\nB: a group of people are singing a popular song\nC: a couple of people are playing a traditional game\nD: a few people are quietly watching", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: many people are doing a native dance\nB: a group of people are singing a popular song\nC: a couple of people are playing a traditional game\nD: a few people are quietly watching", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_132_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_132_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_132_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_132_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_132_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_132_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_132_6.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_132_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_132_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_132_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_132_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_132_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_132_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_132_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_132_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_132_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man is kicking and punching water filled jars\nB: a man is cooking a meal in the kitchen\nC: a man is swimming in a pool\nD: a man is gardening in the backyard", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man is kicking and punching water filled jars\nB: a man is cooking a meal in the kitchen\nC: a man is swimming in a pool\nD: a man is gardening in the backyard", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_133_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_133_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_133_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_133_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_133_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_133_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_133_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_133_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_133_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_133_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_133_10.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_133_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_133_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_133_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_133_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_133_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man is discussing his favorite movies and the top one is Jurassic Park\nB: a young girl is talking about her favorite books and the top one is Harry Potter\nC: a woman is describing her preferred TV shows and the second one is Game of Thrones\nD: a women is talking about the books she likes and the second favourite one is the amc the walking dead", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man is discussing his favorite movies and the top one is Jurassic Park\nB: a young girl is talking about her favorite books and the top one is Harry Potter\nC: a woman is describing her preferred TV shows and the second one is Game of Thrones\nD: a women is talking about the books she likes and the second favourite one is the amc the walking dead", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_134_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_134_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_134_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_134_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_134_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_134_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_134_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_134_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_134_8.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_134_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_134_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_134_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_134_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_134_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_134_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_134_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: there is a man walking alone in a park\nB: there is a man shooting other people in a corridor\nC: a woman shopping for groceries in a supermarket\nD: two kids playing with a ball in a playground", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: there is a man walking alone in a park\nB: there is a man shooting other people in a corridor\nC: a woman shopping for groceries in a supermarket\nD: two kids playing with a ball in a playground", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_135_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_135_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_135_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_135_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_135_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_135_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_135_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_135_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_135_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_135_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_135_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_135_11.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_135_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_135_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_135_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_135_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: cartoon of a dolphin on a scooter looking up at a treehouse\nB: cartoon of a squid on a bike looking up at a treehouse\nC: drawing of a cat on a skateboard looking up at a treehouse\nD: animated picture of a monkey on a motorcycle looking up at a treehouse", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: cartoon of a dolphin on a scooter looking up at a treehouse\nB: cartoon of a squid on a bike looking up at a treehouse\nC: drawing of a cat on a skateboard looking up at a treehouse\nD: animated picture of a monkey on a motorcycle looking up at a treehouse", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_136_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_136_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_136_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_136_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_136_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_136_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_136_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_136_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_136_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_136_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_136_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_136_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_136_12.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_136_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_136_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_136_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a dog sleeping on the couch\nB: a child playing in the park\nC: a woman cleaning the bathroom\nD: a man cooking his kichen", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a dog sleeping on the couch\nB: a child playing in the park\nC: a woman cleaning the bathroom\nD: a man cooking his kichen", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_137_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_137_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_137_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_137_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_137_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_137_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_137_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_137_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_137_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_137_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_137_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_137_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_137_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_137_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_137_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_137_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + 
"options": "A: a black cat sleeps on a sunny window sill\nB: two children play in a park with a ball\nC: a brunette woman stands in the kitchen cooking dinner\nD: a blonde man lies on a bed with a little baby", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a black cat sleeps on a sunny window sill\nB: two children play in a park with a ball\nC: a brunette woman stands in the kitchen cooking dinner\nD: a blonde man lies on a bed with a little baby", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_138_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_138_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_138_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_138_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_138_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_138_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_138_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_138_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_138_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_138_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_138_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_138_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_138_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_138_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_138_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_138_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the man rode a mechanical bull\nB: the man rode a real bull\nC: the man rode a bicycle\nD: the man rode a horse", + "question": "Please generate textual descriptions for a sequence of video 
frames.", + "context": "Select from the following choices.\nA: the man rode a mechanical bull\nB: the man rode a real bull\nC: the man rode a bicycle\nD: the man rode a horse", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_139_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_139_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_139_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_139_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_139_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_139_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_139_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_139_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_139_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_139_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_139_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_139_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_139_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_139_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_139_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_139_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a group of people in formal attire sitting at a table and having a business meeting\nB: two men in casual attire sitting in a car and discussing about a project\nC: a woman in traditional dress dancing on stage with a group of musicians playing instruments\nD: two girls in design dress wearing cloth standing holding mic in hand on street and person walking beside discusing on topic", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a group of people in formal attire 
sitting at a table and having a business meeting\nB: two men in casual attire sitting in a car and discussing about a project\nC: a woman in traditional dress dancing on stage with a group of musicians playing instruments\nD: two girls in design dress wearing cloth standing holding mic in hand on street and person walking beside discusing on topic", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_140_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_140_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_140_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_140_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_140_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_140_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_140_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_140_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_140_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_140_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_140_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_140_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_140_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_140_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_140_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_140_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man cutting carrots with a knife in a slow manner\nB: a person chopping onions with a fork slowly\nC: a woman peeling potatoes with a peeler quickly\nD: a man slicing tomatoes with a spoon in a fast manner", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man cutting carrots with a knife in 
a slow manner\nB: a person chopping onions with a fork slowly\nC: a woman peeling potatoes with a peeler quickly\nD: a man slicing tomatoes with a spoon in a fast manner", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_141_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_141_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_141_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_141_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_141_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_141_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_141_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_141_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_141_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_141_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_141_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_141_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_141_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_141_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_141_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_141_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a cat is sleeping on the windowsill\nB: an indian woman is applying makeup between her hair\nC: a chef is cooking in the kitchen\nD: a young man is playing guitar on the beach", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a cat is sleeping on the windowsill\nB: an indian woman is applying makeup between her hair\nC: a chef is cooking in the kitchen\nD: a young man is playing guitar on the beach", + "input_image_path": [ + 
"../MMIU-Benchmark/video_captioning/video_captioning_142_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_142_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_142_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_142_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_142_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_142_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_142_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_142_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_142_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_142_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_142_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_142_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_142_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_142_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_142_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_142_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man is sitting on a bench\nB: a man is asking for lift\nC: a man is talking on the phone\nD: a man is walking in the park", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man is sitting on a bench\nB: a man is asking for lift\nC: a man is talking on the phone\nD: a man is walking in the park", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_143_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_143_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_143_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_143_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_143_4.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_143_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_143_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_143_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_143_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_143_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_143_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_143_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_143_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_143_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_143_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_143_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman is speaking into a microphone\nB: a man is playing guitar on stage\nC: a man is speaking into a microphone\nD: a man is typing on a computer keyboard", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman is speaking into a microphone\nB: a man is playing guitar on stage\nC: a man is speaking into a microphone\nD: a man is typing on a computer keyboard", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_144_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_144_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_144_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_144_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_144_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_144_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_144_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_144_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_144_8.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_144_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_144_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_144_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_144_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_144_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_144_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_144_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: some kittens are eating\nB: a cat is sleeping\nC: dogs are playing\nD: birds are flying", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: some kittens are eating\nB: a cat is sleeping\nC: dogs are playing\nD: birds are flying", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_145_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_145_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_145_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_145_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_145_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_145_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_145_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_145_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_145_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_145_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_145_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_145_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_145_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_145_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_145_14.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_145_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the football players practicing on the field in the evening\nB: two people running on the track in the morning\nC: the tennis players wearing blue and red t shirts and play the tennis in the tennis court at the night time\nD: a group of people playing basketball in the park during daytime", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: the football players practicing on the field in the evening\nB: two people running on the track in the morning\nC: the tennis players wearing blue and red t shirts and play the tennis in the tennis court at the night time\nD: a group of people playing basketball in the park during daytime", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_146_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_146_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_146_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_146_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_146_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_146_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_146_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_146_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_146_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_146_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_146_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_146_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_146_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_146_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_146_14.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_146_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman walked by\nB: a child sat on a swing\nC: a man jumped high\nD: a man fell down", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman walked by\nB: a child sat on a swing\nC: a man jumped high\nD: a man fell down", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_147_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_147_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_147_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_147_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_147_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_147_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_147_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_147_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_147_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_147_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_147_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_147_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_147_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_147_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_147_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_147_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a group of people are riding on a raft in a body of water\nB: a person is standing alone on a boat in a lake\nC: a canoe with people rowing in a river\nD: a group of people are swimming in a pool", 
+ "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a group of people are riding on a raft in a body of water\nB: a person is standing alone on a boat in a lake\nC: a canoe with people rowing in a river\nD: a group of people are swimming in a pool", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_148_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_148_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_148_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_148_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_148_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_148_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_148_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_148_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_148_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_148_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_148_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_148_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_148_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_148_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_148_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_148_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: someone pours a pot of milk into a larger pot\nB: someone sprinkles salt and pepper onto a plate of food\nC: someone pours a pot of tomato sauce into a larger pot\nD: someone mixes flour and water in a bowl", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: someone pours a pot of milk into a larger pot\nB: 
someone sprinkles salt and pepper onto a plate of food\nC: someone pours a pot of tomato sauce into a larger pot\nD: someone mixes flour and water in a bowl", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_149_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_149_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_149_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_149_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_149_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_149_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_149_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_149_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_149_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_149_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_149_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_149_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_149_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_149_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_149_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_149_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a person is singing\nB: a person is running\nC: a person is sleeping\nD: a person is cooking", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a person is singing\nB: a person is running\nC: a person is sleeping\nD: a person is cooking", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_150_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_150_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_150_2.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_150_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_150_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_150_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_150_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_150_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_150_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_150_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_150_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_150_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_150_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_150_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_150_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_150_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: three baseballs are being hit by a bat\nB: three basketballs are bouncing on a court\nC: three soccer balls are laying in a field and then three men in black athletic cloths attempt to shoot a goal\nD: three tennis balls are being served by players", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: three baseballs are being hit by a bat\nB: three basketballs are bouncing on a court\nC: three soccer balls are laying in a field and then three men in black athletic cloths attempt to shoot a goal\nD: three tennis balls are being served by players", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_151_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_151_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_151_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_151_3.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_151_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_151_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_151_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_151_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_151_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_151_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_151_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_151_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_151_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_151_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_151_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_151_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: some people making a human pyramid\nB: a cat sleeping on a chair\nC: a chef cooking in a restaurant kitchen\nD: a basketball player shooting a three-pointer", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: some people making a human pyramid\nB: a cat sleeping on a chair\nC: a chef cooking in a restaurant kitchen\nD: a basketball player shooting a three-pointer", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_152_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_152_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_152_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_152_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_152_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_152_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_152_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_152_7.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_152_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_152_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_152_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_152_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_152_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_152_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_152_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_152_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: depicting a group of people sitting\nB: featuring some persons running\nC: showing people walking slowly\nD: displaying individuals standing still", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: depicting a group of people sitting\nB: featuring some persons running\nC: showing people walking slowly\nD: displaying individuals standing still", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_153_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_153_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_153_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_153_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_153_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_153_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_153_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_153_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_153_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_153_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_153_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_153_11.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_153_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_153_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_153_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_153_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: two women are wrestling\nB: two men playing tennis\nC: a man and a woman dancing\nD: a group of people doing yoga", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: two women are wrestling\nB: two men playing tennis\nC: a man and a woman dancing\nD: a group of people doing yoga", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_154_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_154_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_154_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_154_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_154_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_154_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_154_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_154_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_154_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_154_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_154_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_154_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_154_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_154_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_154_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_154_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video 
image or Natural image", + "source": "source", + "options": "A: a transvestite shows what she bought for her dog including shampoo and conditioner made by martha stuart\nB: a woman presents her latest DIY home renovation tools and supplies\nC: a man showcases his new car detailing products from a popular brand\nD: a pet owner displays a range of organic treats and toys for their cat", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a transvestite shows what she bought for her dog including shampoo and conditioner made by martha stuart\nB: a woman presents her latest DIY home renovation tools and supplies\nC: a man showcases his new car detailing products from a popular brand\nD: a pet owner displays a range of organic treats and toys for their cat", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_155_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_155_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_155_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_155_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_155_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_155_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_155_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_155_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_155_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_155_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_155_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_155_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_155_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_155_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_155_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_155_15.png" + ], + "output": "A" + }, + { + "task": 
"video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a dog is sitting on a couch\nB: a fish is swimming in the ocean\nC: a cat is drinking water from a faucet\nD: a bird is flying in the sky", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a dog is sitting on a couch\nB: a fish is swimming in the ocean\nC: a cat is drinking water from a faucet\nD: a bird is flying in the sky", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_156_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_156_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_156_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_156_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_156_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_156_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_156_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_156_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_156_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_156_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_156_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_156_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_156_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_156_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_156_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_156_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the rabbit played with a pink stuffed rabbit\nB: a bird perched on a tree branch\nC: the dog barked at a squirrel\nD: a cat chased a red ball", + "question": "Please generate textual descriptions for a 
sequence of video frames.", + "context": "Select from the following choices.\nA: the rabbit played with a pink stuffed rabbit\nB: a bird perched on a tree branch\nC: the dog barked at a squirrel\nD: a cat chased a red ball", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_157_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_157_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_157_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_157_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_157_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_157_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_157_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_157_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_157_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_157_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_157_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_157_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_157_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_157_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_157_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_157_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the man is sitting on the something\nB: the woman is cooking the something\nC: the girl is playing with the something\nD: the boy is eating the something", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: the man is sitting on the something\nB: the woman is cooking the something\nC: the girl is playing with the something\nD: the boy is eating the something", + "input_image_path": [ + 
"../MMIU-Benchmark/video_captioning/video_captioning_158_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_158_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_158_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_158_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_158_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_158_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_158_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_158_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_158_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_158_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_158_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_158_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_158_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_158_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_158_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_158_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: sleeping and watching TV with 4 boys and 1 girl\nB: standing and staring at 3 kids and 1 adult\nC: sitting and converstion 2 lady and 2 gents\nD: walking and talking to 2 men and 3 women", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: sleeping and watching TV with 4 boys and 1 girl\nB: standing and staring at 3 kids and 1 adult\nC: sitting and converstion 2 lady and 2 gents\nD: walking and talking to 2 men and 3 women", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_159_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_159_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_159_2.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_159_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_159_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_159_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_159_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_159_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_159_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_159_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_159_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_159_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_159_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_159_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_159_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_159_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman is painting on a canvas\nB: a man is cooking in the kitchen\nC: a girl is riding a bicycle\nD: a boy is playing a piano", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman is painting on a canvas\nB: a man is cooking in the kitchen\nC: a girl is riding a bicycle\nD: a boy is playing a piano", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_160_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_160_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_160_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_160_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_160_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_160_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_160_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_160_7.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_160_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_160_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_160_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_160_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_160_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_160_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_160_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_160_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: stunning acting is too good\nB: mohan acting is not impressive\nC: vintage acting is too good\nD: mohan acting is too good", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: stunning acting is too good\nB: mohan acting is not impressive\nC: vintage acting is too good\nD: mohan acting is too good", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_161_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_161_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_161_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_161_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_161_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_161_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_161_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_161_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_161_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_161_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_161_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_161_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_161_12.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_161_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_161_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_161_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man calmly watches the snakes slither around the room\nB: the man dances with the snakes in a friendly manner\nC: the snakes attack the man and force him to leave the room\nD: a man grabs at snakes and throws them around the room", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man calmly watches the snakes slither around the room\nB: the man dances with the snakes in a friendly manner\nC: the snakes attack the man and force him to leave the room\nD: a man grabs at snakes and throws them around the room", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_162_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_162_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_162_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_162_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_162_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_162_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_162_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_162_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_162_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_162_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_162_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_162_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_162_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_162_13.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_162_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_162_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a person plays a popular song on a musical instrument\nB: a person dances to a popular song\nC: a person covers a popular song\nD: a person sings an original song", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a person plays a popular song on a musical instrument\nB: a person dances to a popular song\nC: a person covers a popular song\nD: a person sings an original song", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_163_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_163_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_163_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_163_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_163_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_163_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_163_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_163_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_163_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_163_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_163_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_163_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_163_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_163_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_163_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_163_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": 
"source", + "options": "A: a baby is playing with a bull dog\nB: a baby is playing with a kitten\nC: a baby is playing with a teddy bear\nD: a dog is playing with a baby", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a baby is playing with a bull dog\nB: a baby is playing with a kitten\nC: a baby is playing with a teddy bear\nD: a dog is playing with a baby", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_164_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_164_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_164_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_164_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_164_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_164_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_164_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_164_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_164_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_164_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_164_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_164_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_164_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_164_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_164_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_164_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: waiters walking calmly in a professional kitchen\nB: behind the scenes in a professional kitchen as the chefs work and the waiters run food can be a very noisy experience\nC: a calm and quiet atmosphere in a professional kitchen\nD: chefs leisurely preparing food in a professional 
kitchen", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: waiters walking calmly in a professional kitchen\nB: behind the scenes in a professional kitchen as the chefs work and the waiters run food can be a very noisy experience\nC: a calm and quiet atmosphere in a professional kitchen\nD: chefs leisurely preparing food in a professional kitchen", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_165_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_165_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_165_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_165_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_165_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_165_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_165_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_165_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_165_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_165_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_165_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_165_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_165_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_165_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_165_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_165_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man is washing his car\nB: a child is playing with a toy\nC: a woman is making an eyeshadow\nD: a chef is cooking a dessert", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man is washing his car\nB: a 
child is playing with a toy\nC: a woman is making an eyeshadow\nD: a chef is cooking a dessert", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_166_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_166_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_166_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_166_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_166_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_166_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_166_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_166_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_166_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_166_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_166_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_166_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_166_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_166_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_166_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_166_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a tv shows review program hosts discuss about the performance and staying on air of star trek\nB: a group of friends are shown enjoying a beach vacation in Thailand\nC: a cooking show features the preparation of traditional Italian cuisine\nD: a documentary film depicts the history and significance of ancient Egyptian pyramids", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a tv shows review program hosts discuss about the performance and staying on air of star trek\nB: a group of friends are shown enjoying a beach vacation in Thailand\nC: a 
cooking show features the preparation of traditional Italian cuisine\nD: a documentary film depicts the history and significance of ancient Egyptian pyramids", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_167_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_167_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_167_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_167_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_167_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_167_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_167_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_167_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_167_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_167_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_167_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_167_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_167_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_167_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_167_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_167_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: people are waiting for a bus at a bus stop\nB: passengers are boarding a train at a station\nC: passersby are crossing a road at a traffic signal\nD: peoples are disembarking from a train in a station", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: people are waiting for a bus at a bus stop\nB: passengers are boarding a train at a station\nC: passersby are crossing a road at a traffic signal\nD: peoples are disembarking from a train in a station", + "input_image_path": [ + 
"../MMIU-Benchmark/video_captioning/video_captioning_168_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_168_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_168_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_168_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_168_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_168_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_168_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_168_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_168_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_168_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_168_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_168_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_168_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_168_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_168_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_168_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a child is playing with a toy car\nB: a chef is cutting vegetables in the kitchen\nC: a woman is mixing food in a mixing bowl\nD: a man is painting a wall with a brush", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a child is playing with a toy car\nB: a chef is cutting vegetables in the kitchen\nC: a woman is mixing food in a mixing bowl\nD: a man is painting a wall with a brush", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_169_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_169_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_169_2.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_169_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_169_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_169_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_169_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_169_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_169_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_169_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_169_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_169_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_169_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_169_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_169_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_169_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a dense forest with a winding dirt path\nB: cabins on a sandy beach have walkways going up to their porches\nC: modern skyscrapers overlooking a busy city street\nD: colorful umbrellas on a rocky beach with no buildings in sight", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a dense forest with a winding dirt path\nB: cabins on a sandy beach have walkways going up to their porches\nC: modern skyscrapers overlooking a busy city street\nD: colorful umbrellas on a rocky beach with no buildings in sight", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_170_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_170_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_170_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_170_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_170_4.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_170_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_170_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_170_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_170_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_170_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_170_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_170_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_170_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_170_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_170_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_170_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a panda bear climbs on a tree trunk\nB: a koala climbs on a tree trunk\nC: a raccoon climbs on a tree trunk\nD: a bear sits on a tree trunk", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a panda bear climbs on a tree trunk\nB: a koala climbs on a tree trunk\nC: a raccoon climbs on a tree trunk\nD: a bear sits on a tree trunk", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_171_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_171_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_171_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_171_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_171_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_171_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_171_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_171_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_171_8.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_171_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_171_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_171_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_171_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_171_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_171_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_171_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man is mowing the lawn\nB: a dog is barking at the mailman\nC: a woman is chopping herbs\nD: a child is playing with a toy", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man is mowing the lawn\nB: a dog is barking at the mailman\nC: a woman is chopping herbs\nD: a child is playing with a toy", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_172_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_172_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_172_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_172_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_172_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_172_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_172_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_172_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_172_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_172_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_172_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_172_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_172_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_172_13.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_172_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_172_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a woman discusses a peace treaty between two countries\nB: a teacher gives a lecture on ancient civilizations\nC: a man talks about a war between two generals one of which became king\nD: a child narrates a story about a magical kingdom", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a woman discusses a peace treaty between two countries\nB: a teacher gives a lecture on ancient civilizations\nC: a man talks about a war between two generals one of which became king\nD: a child narrates a story about a magical kingdom", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_173_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_173_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_173_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_173_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_173_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_173_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_173_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_173_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_173_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_173_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_173_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_173_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_173_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_173_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_173_14.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_173_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a hockey player scores a goal during a hockey game\nB: a basketball player dunks a basketball during a basketball game\nC: a baseball player hits a home run during a baseball game\nD: a soccer player shoots a goal during a soccer game", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a hockey player scores a goal during a hockey game\nB: a basketball player dunks a basketball during a basketball game\nC: a baseball player hits a home run during a baseball game\nD: a soccer player shoots a goal during a soccer game", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_174_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_174_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_174_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_174_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_174_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_174_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_174_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_174_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_174_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_174_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_174_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_174_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_174_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_174_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_174_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_174_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + 
"visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a child is playing in the park\nB: a woman is cooking in the kitchen\nC: a dog is barking at the door\nD: a man is reading a book on a couch", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a child is playing in the park\nB: a woman is cooking in the kitchen\nC: a dog is barking at the door\nD: a man is reading a book on a couch", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_175_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_175_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_175_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_175_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_175_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_175_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_175_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_175_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_175_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_175_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_175_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_175_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_175_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_175_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_175_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_175_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: men pushing a car up assembly line\nB: men pushing a car down assembly line\nC: men pulling a car down assembly line\nD: women pushing a car down assembly line", + "question": "Please generate textual descriptions for 
a sequence of video frames.", + "context": "Select from the following choices.\nA: men pushing a car up assembly line\nB: men pushing a car down assembly line\nC: men pulling a car down assembly line\nD: women pushing a car down assembly line", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_176_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_176_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_176_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_176_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_176_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_176_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_176_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_176_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_176_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_176_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_176_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_176_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_176_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_176_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_176_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_176_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: two people are dancing\nB: a man and woman is eating\nC: a man is cooking alone\nD: a woman is eating alone", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: two people are dancing\nB: a man and woman is eating\nC: a man is cooking alone\nD: a woman is eating alone", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_177_0.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_177_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_177_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_177_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_177_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_177_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_177_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_177_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_177_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_177_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_177_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_177_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_177_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_177_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_177_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_177_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: butter melting in a frying pan\nB: soup being boiled in a pot\nC: chocolate melting in hot water\nD: cheese melting in hot sauce", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: butter melting in a frying pan\nB: soup being boiled in a pot\nC: chocolate melting in hot water\nD: cheese melting in hot sauce", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_178_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_178_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_178_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_178_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_178_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_178_5.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_178_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_178_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_178_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_178_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_178_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_178_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_178_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_178_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_178_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_178_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: two boys are playing in the park with a dog and laughing\nB: two girls are sitting in the bed with a cat and talking\nC: a man is cooking in the kitchen with a dog and dancing\nD: two girls are sitting on the bench with a cat and reading", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: two boys are playing in the park with a dog and laughing\nB: two girls are sitting in the bed with a cat and talking\nC: a man is cooking in the kitchen with a dog and dancing\nD: two girls are sitting on the bench with a cat and reading", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_179_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_179_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_179_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_179_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_179_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_179_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_179_6.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_179_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_179_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_179_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_179_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_179_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_179_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_179_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_179_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_179_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: employees are working in an office\nB: players are playing a basketball match\nC: spectators are watching a tennis match\nD: dancers are performing on a stage", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: employees are working in an office\nB: players are playing a basketball match\nC: spectators are watching a tennis match\nD: dancers are performing on a stage", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_180_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_180_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_180_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_180_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_180_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_180_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_180_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_180_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_180_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_180_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_180_10.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_180_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_180_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_180_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_180_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_180_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a girl getting dressed\nB: a boy playing basketball\nC: a woman reading a book\nD: a man cooking in the kitchen", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a girl getting dressed\nB: a boy playing basketball\nC: a woman reading a book\nD: a man cooking in the kitchen", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_181_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_181_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_181_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_181_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_181_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_181_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_181_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_181_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_181_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_181_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_181_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_181_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_181_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_181_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_181_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_181_15.png" + ], + "output": "A" + }, + { + 
"task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a group of people are eating together\nB: women are dancing in silence\nC: men are singing a song\nD: children are playing a game", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a group of people are eating together\nB: women are dancing in silence\nC: men are singing a song\nD: children are playing a game", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_182_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_182_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_182_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_182_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_182_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_182_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_182_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_182_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_182_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_182_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_182_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_182_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_182_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_182_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_182_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_182_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: an elderly man cooking in the kitchen\nB: a group of children riding bicycles in the park\nC: a middle aged woman giving another woman a message\nD: a young girl playing with a dog", + "question": "Please 
generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: an elderly man cooking in the kitchen\nB: a group of children riding bicycles in the park\nC: a middle aged woman giving another woman a message\nD: a young girl playing with a dog", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_183_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_183_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_183_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_183_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_183_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_183_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_183_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_183_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_183_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_183_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_183_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_183_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_183_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_183_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_183_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_183_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man explains the details of a historical event\nB: a man talks about the publication of a nasa technical report\nC: a person describes the process of creating a new recipe\nD: a woman discusses the release of a new music album", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man explains the details of a historical event\nB: a man talks 
about the publication of a nasa technical report\nC: a person describes the process of creating a new recipe\nD: a woman discusses the release of a new music album", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_184_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_184_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_184_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_184_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_184_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_184_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_184_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_184_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_184_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_184_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_184_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_184_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_184_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_184_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_184_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_184_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: the girl danced in the street\nB: the girl played with her hair\nC: the girl applied stickers to her face\nD: the girl put on a hat", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: the girl danced in the street\nB: the girl played with her hair\nC: the girl applied stickers to her face\nD: the girl put on a hat", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_185_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_185_1.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_185_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_185_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_185_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_185_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_185_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_185_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_185_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_185_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_185_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_185_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_185_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_185_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_185_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_185_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a slideshow of landscape photographs\nB: a video showing footage from sporting events\nC: a documentary about wildlife conservation efforts\nD: a tutorial on cooking techniques", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a slideshow of landscape photographs\nB: a video showing footage from sporting events\nC: a documentary about wildlife conservation efforts\nD: a tutorial on cooking techniques", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_186_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_186_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_186_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_186_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_186_4.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_186_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_186_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_186_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_186_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_186_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_186_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_186_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_186_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_186_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_186_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_186_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man drives his car down the road\nB: a group of people having a picnic in the garden\nC: a child riding a bicycle on the sidewalk\nD: a woman walking her dog in the park", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man drives his car down the road\nB: a group of people having a picnic in the garden\nC: a child riding a bicycle on the sidewalk\nD: a woman walking her dog in the park", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_187_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_187_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_187_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_187_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_187_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_187_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_187_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_187_7.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_187_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_187_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_187_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_187_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_187_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_187_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_187_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_187_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man cries uncontrollably\nB: a woman laughs until she chokes\nC: a child laughs with joy\nD: a group of people stare blankly", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man cries uncontrollably\nB: a woman laughs until she chokes\nC: a child laughs with joy\nD: a group of people stare blankly", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_188_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_188_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_188_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_188_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_188_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_188_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_188_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_188_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_188_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_188_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_188_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_188_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_188_12.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_188_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_188_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_188_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a man jumps off a diving board\nB: a woman swims in a pool\nC: a cat walks on a treadmill\nD: a child plays on a swing", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a man jumps off a diving board\nB: a woman swims in a pool\nC: a cat walks on a treadmill\nD: a child plays on a swing", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_189_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_189_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_189_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_189_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_189_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_189_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_189_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_189_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_189_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_189_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_189_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_189_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_189_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_189_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_189_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_189_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": 
"A: a female inside a white themed bathroom while someone else makes her makeup\nB: a female outside a white themed bathroom while making her own makeup\nC: a female inside a colorful themed bathroom while someone else makes her makeup\nD: a male inside a white themed bathroom while someone else makes his makeup", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a female inside a white themed bathroom while someone else makes her makeup\nB: a female outside a white themed bathroom while making her own makeup\nC: a female inside a colorful themed bathroom while someone else makes her makeup\nD: a male inside a white themed bathroom while someone else makes his makeup", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_190_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_190_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_190_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_190_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_190_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_190_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_190_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_190_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_190_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_190_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_190_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_190_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_190_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_190_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_190_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_190_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": 
"source", + "options": "A: a documentary about lions in the wild\nB: a commercial for the website called eharmony\nC: a tutorial on how to bake a cake\nD: an advertisement for a new mobile phone", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a documentary about lions in the wild\nB: a commercial for the website called eharmony\nC: a tutorial on how to bake a cake\nD: an advertisement for a new mobile phone", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_191_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_191_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_191_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_191_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_191_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_191_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_191_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_191_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_191_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_191_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_191_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_191_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_191_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_191_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_191_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_191_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: smart poultry cop\nB: astute fowl officer\nC: intelligent chicken law enforcement\nD: very cleaver chicken police", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the 
following choices.\nA: smart poultry cop\nB: astute fowl officer\nC: intelligent chicken law enforcement\nD: very cleaver chicken police", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_192_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_192_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_192_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_192_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_192_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_192_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_192_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_192_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_192_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_192_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_192_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_192_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_192_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_192_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_192_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_192_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a scene from spongebob squarepants where the townspeople are dancing in a parade\nB: a scene from spongebob squarepants where the townspeople are carrying torches and chasing a giant squidward\nC: a scene from spongebob squarepants where the townspeople are peacefully enjoying a picnic\nD: a scene from spongebob squarepants where the townspeople are having a friendly conversation with a giant squidward", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a scene from spongebob squarepants where the 
townspeople are dancing in a parade\nB: a scene from spongebob squarepants where the townspeople are carrying torches and chasing a giant squidward\nC: a scene from spongebob squarepants where the townspeople are peacefully enjoying a picnic\nD: a scene from spongebob squarepants where the townspeople are having a friendly conversation with a giant squidward", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_193_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_193_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_193_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_193_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_193_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_193_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_193_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_193_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_193_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_193_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_193_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_193_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_193_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_193_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_193_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_193_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: patrick is destroying memories of him with mr\nB: sandy is hiding memories of her with mr\nC: squidward is ignoring memories of him with mr\nD: spongebob is showing memories of him with mr", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: patrick is destroying memories of him with 
mr\nB: sandy is hiding memories of her with mr\nC: squidward is ignoring memories of him with mr\nD: spongebob is showing memories of him with mr", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_194_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_194_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_194_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_194_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_194_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_194_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_194_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_194_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_194_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_194_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_194_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_194_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_194_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_194_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_194_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_194_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: valencia and hokit participating in a boxing match\nB: valencia and hokit competing in a tennis match\nC: valencia and hokit facing off in a chess tournament\nD: valencia vesus hokit in a wrestling match", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: valencia and hokit participating in a boxing match\nB: valencia and hokit competing in a tennis match\nC: valencia and hokit facing off in a chess tournament\nD: valencia vesus hokit in a wrestling match", + "input_image_path": [ + 
"../MMIU-Benchmark/video_captioning/video_captioning_195_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_195_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_195_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_195_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_195_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_195_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_195_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_195_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_195_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_195_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_195_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_195_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_195_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_195_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_195_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_195_15.png" + ], + "output": "D" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a child is playing with a toy car\nB: a man is driving a motorcycle\nC: a woman is riding a bicycle\nD: a person is walking a dog", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a child is playing with a toy car\nB: a man is driving a motorcycle\nC: a woman is riding a bicycle\nD: a person is walking a dog", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_196_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_196_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_196_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_196_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_196_4.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_196_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_196_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_196_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_196_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_196_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_196_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_196_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_196_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_196_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_196_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_196_15.png" + ], + "output": "B" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a child plays with a ball in the park\nB: a man peels a potato with a spoon\nC: a woman cuts an onion in half with a knife\nD: a chef grills a steak on a barbecue", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a child plays with a ball in the park\nB: a man peels a potato with a spoon\nC: a woman cuts an onion in half with a knife\nD: a chef grills a steak on a barbecue", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_197_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_197_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_197_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_197_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_197_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_197_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_197_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_197_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_197_8.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_197_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_197_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_197_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_197_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_197_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_197_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_197_15.png" + ], + "output": "C" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: a cartoon shows two dogs talking to a bird\nB: an animated film depicting a family of rabbits playing with a turtle\nC: a sci-fi movie featuring robots communicating with aliens\nD: a documentary about cats hunting in the wild", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: a cartoon shows two dogs talking to a bird\nB: an animated film depicting a family of rabbits playing with a turtle\nC: a sci-fi movie featuring robots communicating with aliens\nD: a documentary about cats hunting in the wild", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_198_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_198_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_198_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_198_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_198_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_198_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_198_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_198_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_198_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_198_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_198_10.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_198_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_198_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_198_13.png", + "../MMIU-Benchmark/video_captioning/video_captioning_198_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_198_15.png" + ], + "output": "A" + }, + { + "task": "video_captioning", + "visual_input_component": "Video image or Natural image", + "source": "source", + "options": "A: there is a vehicle riding dangerously through forest\nB: a boat sailing calmly in a lake\nC: a person walking peacefully in a garden\nD: a group of animals playing in a zoo", + "question": "Please generate textual descriptions for a sequence of video frames.", + "context": "Select from the following choices.\nA: there is a vehicle riding dangerously through forest\nB: a boat sailing calmly in a lake\nC: a person walking peacefully in a garden\nD: a group of animals playing in a zoo", + "input_image_path": [ + "../MMIU-Benchmark/video_captioning/video_captioning_199_0.png", + "../MMIU-Benchmark/video_captioning/video_captioning_199_1.png", + "../MMIU-Benchmark/video_captioning/video_captioning_199_2.png", + "../MMIU-Benchmark/video_captioning/video_captioning_199_3.png", + "../MMIU-Benchmark/video_captioning/video_captioning_199_4.png", + "../MMIU-Benchmark/video_captioning/video_captioning_199_5.png", + "../MMIU-Benchmark/video_captioning/video_captioning_199_6.png", + "../MMIU-Benchmark/video_captioning/video_captioning_199_7.png", + "../MMIU-Benchmark/video_captioning/video_captioning_199_8.png", + "../MMIU-Benchmark/video_captioning/video_captioning_199_9.png", + "../MMIU-Benchmark/video_captioning/video_captioning_199_10.png", + "../MMIU-Benchmark/video_captioning/video_captioning_199_11.png", + "../MMIU-Benchmark/video_captioning/video_captioning_199_12.png", + "../MMIU-Benchmark/video_captioning/video_captioning_199_13.png", + 
"../MMIU-Benchmark/video_captioning/video_captioning_199_14.png", + "../MMIU-Benchmark/video_captioning/video_captioning_199_15.png" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 0, 1, 3]\nB: [1, 0, 3, 2]\nC: [3, 1, 0, 2]\nD: [0, 2, 1, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 0, 1, 3]\nB: [1, 0, 3, 2]\nC: [3, 1, 0, 2]\nD: [0, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_0_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_0_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_0_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_0_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [10, 0, 8, 2, 3, 11, 7, 5, 6, 1, 4, 9]\nB: [1, 8, 6, 5, 3, 2, 10, 9, 0, 4, 11, 7]\nC: [4, 6, 10, 8, 1, 3, 2, 5, 0, 9, 7, 11]\nD: [10, 9, 0, 5, 8, 11, 6, 3, 1, 7, 4, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [10, 0, 8, 2, 3, 11, 7, 5, 6, 1, 4, 9]\nB: [1, 8, 6, 5, 3, 2, 10, 9, 0, 4, 11, 7]\nC: [4, 6, 10, 8, 1, 3, 2, 5, 0, 9, 7, 11]\nD: [10, 9, 0, 5, 8, 11, 6, 3, 1, 7, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_1_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_1_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_1_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_1_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_1_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_1_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_1_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_1_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_1_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_1_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_1_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_1_11.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 0, 3, 1]\nB: [2, 1, 0, 3]\nC: [2, 1, 3, 0]\nD: [1, 2, 3, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 0, 3, 1]\nB: [2, 1, 0, 3]\nC: [2, 1, 3, 0]\nD: [1, 2, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_2_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_2_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_2_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_2_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 5, 4, 3, 8, 2, 1, 6, 7]\nB: [3, 6, 4, 5, 2, 8, 0, 1, 7]\nC: [6, 1, 0, 3, 8, 5, 7, 4, 2]\nD: [2, 8, 0, 7, 5, 6, 3, 4, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 5, 4, 3, 8, 2, 1, 6, 7]\nB: [3, 6, 4, 5, 2, 8, 0, 1, 7]\nC: [6, 1, 0, 3, 8, 5, 7, 4, 2]\nD: [2, 8, 0, 7, 5, 6, 3, 4, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_3_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_3_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_3_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_3_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_3_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_3_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_3_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_3_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_3_8.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 1, 3, 2]\nB: [0, 3, 2, 1]\nC: [2, 1, 0, 3]\nD: [2, 3, 1, 0]", + "question": "Please predict the 
order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 1, 3, 2]\nB: [0, 3, 2, 1]\nC: [2, 1, 0, 3]\nD: [2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_4_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_4_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_4_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_4_3.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 1, 0, 3]\nB: [1, 0, 3, 2]\nC: [3, 1, 2, 0]\nD: [2, 0, 1, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 1, 0, 3]\nB: [1, 0, 3, 2]\nC: [3, 1, 2, 0]\nD: [2, 0, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_5_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_5_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_5_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_5_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 1, 0, 2]\nB: [1, 2, 0, 3]\nC: [1, 3, 0, 2]\nD: [0, 2, 3, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 1, 0, 2]\nB: [1, 2, 0, 3]\nC: [1, 3, 0, 2]\nD: [0, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_6_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_6_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_6_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_6_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [15, 11, 17, 9, 12, 10, 4, 7, 0, 8, 5, 1, 14, 3, 2, 16, 13, 6]\nB: [5, 4, 13, 11, 1, 17, 16, 12, 0, 10, 2, 3, 9, 15, 8, 14, 6, 7]\nC: [4, 5, 11, 1, 16, 12, 10, 13, 8, 7, 15, 3, 9, 14, 17, 2, 6, 0]\nD: [7, 9, 0, 3, 2, 12, 1, 17, 4, 10, 13, 6, 14, 8, 16, 15, 5, 11]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [15, 11, 17, 9, 12, 10, 4, 7, 0, 8, 5, 1, 14, 3, 2, 16, 13, 6]\nB: [5, 4, 13, 11, 1, 17, 16, 12, 0, 10, 2, 3, 9, 15, 8, 14, 6, 7]\nC: [4, 5, 11, 1, 16, 12, 10, 13, 8, 7, 15, 3, 9, 14, 17, 2, 6, 0]\nD: [7, 9, 0, 3, 2, 12, 1, 17, 4, 10, 13, 6, 14, 8, 16, 15, 5, 11]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_11.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_12.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_13.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_14.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_15.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_16.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_7_17.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 0, 5, 2, 1, 4]\nB: [0, 2, 1, 4, 5, 3]\nC: [1, 2, 4, 5, 0, 3]\nD: [0, 3, 1, 4, 2, 5]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 0, 5, 2, 1, 4]\nB: [0, 2, 1, 4, 5, 3]\nC: [1, 2, 4, 5, 0, 3]\nD: [0, 3, 1, 4, 2, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_8_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_8_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_8_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_8_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_8_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_8_5.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 3, 1, 2]\nB: [2, 1, 3, 0]\nC: [0, 1, 3, 2]\nD: [0, 3, 2, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 3, 1, 2]\nB: [2, 1, 3, 0]\nC: [0, 1, 3, 2]\nD: [0, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_9_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_9_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_9_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_9_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 5, 1, 2, 0, 4]\nB: [4, 5, 0, 2, 3, 1]\nC: [1, 2, 0, 4, 5, 3]\nD: [2, 1, 0, 3, 5, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 5, 1, 2, 0, 4]\nB: [4, 5, 0, 2, 3, 1]\nC: [1, 2, 0, 4, 5, 3]\nD: [2, 1, 0, 3, 5, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_10_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_10_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_10_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_10_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_10_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_10_5.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 1, 0, 3]\nB: [0, 2, 1, 3]\nC: [1, 2, 3, 0]\nD: [3, 2, 0, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 1, 0, 3]\nB: [0, 2, 1, 3]\nC: [1, 2, 3, 0]\nD: [3, 2, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_11_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_11_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_11_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_11_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [4, 2, 0, 1, 3]\nB: [3, 0, 2, 1, 4]\nC: [4, 0, 2, 1, 3]\nD: [4, 3, 2, 1, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 2, 0, 1, 3]\nB: [3, 0, 2, 1, 4]\nC: [4, 0, 2, 1, 3]\nD: [4, 3, 2, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_12_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_12_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_12_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_12_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_12_4.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 1, 2, 0]\nB: [1, 2, 3, 0]\nC: [2, 0, 3, 1]\nD: [0, 3, 1, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 1, 2, 0]\nB: [1, 2, 3, 0]\nC: [2, 0, 3, 1]\nD: [0, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_13_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_13_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_13_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_13_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 4, 2, 0, 3]\nB: [0, 3, 4, 2, 1]\nC: [3, 0, 4, 1, 2]\nD: [1, 2, 0, 3, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 4, 2, 0, 3]\nB: [0, 3, 4, 2, 1]\nC: [3, 0, 4, 1, 2]\nD: [1, 2, 0, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_14_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_14_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_14_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_14_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_14_4.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [1, 3, 0, 2]\nD: [1, 3, 2, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [1, 3, 0, 2]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_15_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_15_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_15_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_15_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 1, 3, 0, 4]\nB: [3, 4, 0, 1, 2]\nC: [1, 4, 3, 0, 2]\nD: [0, 3, 2, 1, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 1, 3, 0, 4]\nB: [3, 4, 0, 1, 2]\nC: [1, 4, 3, 0, 2]\nD: [0, 3, 2, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_16_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_16_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_16_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_16_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_16_4.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [3, 2, 0, 1]\nD: [2, 0, 3, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [3, 2, 0, 1]\nD: [2, 0, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_17_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_17_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_17_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_17_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 3, 0, 1]\nB: [1, 0, 3, 2]\nC: [3, 2, 1, 0]\nD: [1, 3, 2, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 3, 0, 1]\nB: [1, 0, 3, 2]\nC: [3, 2, 1, 0]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_18_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_18_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_18_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_18_3.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 2, 1, 0]\nB: [0, 2, 3, 1]\nC: [2, 0, 1, 3]\nD: [1, 2, 0, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 2, 1, 0]\nB: [0, 2, 3, 1]\nC: [2, 0, 1, 3]\nD: [1, 2, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_19_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_19_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_19_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_19_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 3, 2, 0]\nB: [0, 2, 1, 3]\nC: [3, 2, 0, 1]\nD: [3, 1, 0, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 3, 2, 0]\nB: [0, 2, 1, 3]\nC: [3, 2, 0, 1]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_20_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_20_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_20_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_20_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 10, 3, 17, 4, 6, 8, 7, 1, 12, 2, 14, 5, 19, 15, 20, 9, 23, 11, 24, 22, 16, 13, 21, 18]\nB: [23, 3, 10, 16, 20, 2, 17, 19, 21, 7, 15, 11, 0, 9, 12, 24, 4, 13, 18, 5, 1, 22, 6, 8, 14]\nC: [15, 23, 6, 13, 17, 18, 7, 1, 16, 10, 8, 11, 20, 0, 21, 2, 9, 22, 14, 5, 24, 3, 4, 12, 19]\nD: [14, 7, 3, 5, 4, 6, 10, 21, 23, 18, 16, 22, 0, 19, 1, 12, 17, 8, 24, 11, 13, 9, 20, 15, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 10, 3, 17, 4, 6, 8, 7, 1, 12, 2, 14, 5, 19, 15, 20, 9, 23, 11, 24, 22, 16, 13, 21, 18]\nB: [23, 3, 10, 16, 20, 2, 17, 19, 21, 7, 15, 11, 0, 9, 12, 24, 4, 13, 18, 5, 1, 22, 6, 8, 14]\nC: [15, 23, 6, 13, 17, 18, 7, 1, 16, 10, 8, 11, 20, 0, 21, 2, 9, 22, 14, 5, 24, 3, 4, 12, 19]\nD: [14, 7, 3, 5, 4, 6, 10, 21, 23, 18, 16, 22, 0, 19, 1, 12, 17, 8, 24, 11, 13, 9, 20, 15, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_11.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_12.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_13.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_14.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_15.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_16.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_17.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_18.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_19.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_20.jpg", + 
"../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_21.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_22.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_23.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_21_24.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 5, 6, 0, 3, 2, 4]\nB: [6, 4, 5, 0, 2, 3, 1]\nC: [4, 6, 1, 3, 2, 0, 5]\nD: [6, 3, 1, 0, 2, 5, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 5, 6, 0, 3, 2, 4]\nB: [6, 4, 5, 0, 2, 3, 1]\nC: [4, 6, 1, 3, 2, 0, 5]\nD: [6, 3, 1, 0, 2, 5, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_22_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_22_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_22_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_22_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_22_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_22_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_22_6.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 5, 0, 2, 1, 4, 6]\nB: [0, 5, 1, 2, 4, 6, 3]\nC: [0, 1, 3, 2, 5, 6, 4]\nD: [3, 1, 0, 5, 4, 2, 6]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 5, 0, 2, 1, 4, 6]\nB: [0, 5, 1, 2, 4, 6, 3]\nC: [0, 1, 3, 2, 5, 6, 4]\nD: [3, 1, 0, 5, 4, 2, 6]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_23_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_23_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_23_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_23_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_23_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_23_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_23_6.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [7, 2, 1, 9, 8, 3, 5, 4, 6, 0]\nB: [8, 0, 2, 1, 6, 5, 9, 7, 4, 3]\nC: [7, 9, 5, 4, 3, 6, 8, 1, 0, 2]\nD: [5, 6, 8, 4, 0, 9, 1, 3, 7, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [7, 2, 1, 9, 8, 3, 5, 4, 6, 0]\nB: [8, 0, 2, 1, 6, 5, 9, 7, 4, 3]\nC: [7, 9, 5, 4, 3, 6, 8, 1, 0, 2]\nD: [5, 6, 8, 4, 0, 9, 1, 3, 7, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_24_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_24_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_24_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_24_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_24_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_24_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_24_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_24_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_24_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_24_9.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 3, 4, 1, 0]\nB: [1, 3, 2, 0, 4]\nC: [2, 1, 0, 4, 3]\nD: [4, 2, 3, 1, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 3, 4, 1, 0]\nB: [1, 3, 2, 0, 4]\nC: [2, 1, 0, 4, 3]\nD: [4, 2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_25_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_25_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_25_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_25_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_25_4.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [11, 14, 9, 7, 0, 6, 2, 3, 13, 1, 8, 10, 5, 4, 12]\nB: [5, 6, 10, 2, 9, 0, 14, 8, 13, 3, 1, 12, 7, 4, 11]\nC: [13, 9, 14, 8, 6, 10, 11, 4, 1, 2, 7, 12, 3, 0, 5]\nD: [10, 7, 13, 12, 3, 5, 1, 9, 6, 14, 8, 11, 0, 4, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [11, 14, 9, 7, 0, 6, 2, 3, 13, 1, 8, 10, 5, 4, 12]\nB: [5, 6, 10, 2, 9, 0, 14, 8, 13, 3, 1, 12, 7, 4, 11]\nC: [13, 9, 14, 8, 6, 10, 11, 4, 1, 2, 7, 12, 3, 0, 5]\nD: [10, 7, 13, 12, 3, 5, 1, 9, 6, 14, 8, 11, 0, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_26_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_26_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_26_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_26_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_26_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_26_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_26_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_26_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_26_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_26_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_26_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_26_11.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_26_12.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_26_13.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_26_14.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [6, 5, 3, 1, 2, 4, 0]\nB: [4, 6, 1, 5, 0, 3, 2]\nC: [3, 0, 6, 5, 1, 4, 2]\nD: [5, 2, 6, 1, 4, 3, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [6, 5, 3, 1, 2, 4, 0]\nB: [4, 6, 1, 5, 0, 3, 2]\nC: [3, 0, 6, 5, 1, 4, 2]\nD: [5, 2, 6, 1, 4, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_27_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_27_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_27_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_27_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_27_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_27_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_27_6.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 1, 2, 0]\nB: [1, 2, 3, 0]\nC: [0, 2, 1, 3]\nD: [2, 1, 0, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 1, 2, 0]\nB: [1, 2, 3, 0]\nC: [0, 2, 1, 3]\nD: [2, 1, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_28_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_28_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_28_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_28_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [7, 8, 2, 3, 1, 0, 5, 4, 6]\nB: [1, 6, 2, 4, 5, 7, 0, 8, 3]\nC: [8, 0, 6, 3, 7, 1, 4, 2, 5]\nD: [8, 6, 2, 7, 0, 4, 1, 3, 5]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [7, 8, 2, 3, 1, 0, 5, 4, 6]\nB: [1, 6, 2, 4, 5, 7, 0, 8, 3]\nC: [8, 0, 6, 3, 7, 1, 4, 2, 5]\nD: [8, 6, 2, 7, 0, 4, 1, 3, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_29_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_29_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_29_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_29_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_29_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_29_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_29_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_29_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_29_8.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 5, 0, 4, 1, 3]\nB: [0, 3, 5, 4, 2, 1]\nC: [5, 1, 0, 3, 4, 2]\nD: [5, 4, 3, 1, 2, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 5, 0, 4, 1, 3]\nB: [0, 3, 5, 4, 2, 1]\nC: [5, 1, 0, 3, 4, 2]\nD: [5, 4, 3, 1, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_30_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_30_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_30_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_30_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_30_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_30_5.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 0, 4, 2, 5, 6, 1]\nB: [0, 6, 3, 1, 2, 5, 4]\nC: [6, 5, 0, 3, 2, 4, 1]\nD: [3, 6, 5, 0, 4, 1, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 0, 4, 2, 5, 6, 1]\nB: [0, 6, 3, 1, 2, 5, 4]\nC: [6, 5, 0, 3, 2, 4, 1]\nD: [3, 6, 5, 0, 4, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_31_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_31_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_31_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_31_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_31_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_31_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_31_6.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 0, 1, 3]\nB: [3, 0, 2, 1]\nC: [0, 1, 3, 2]\nD: [2, 1, 3, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 0, 1, 3]\nB: [3, 0, 2, 1]\nC: [0, 1, 3, 2]\nD: [2, 1, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_32_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_32_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_32_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_32_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 1, 4, 2, 3]\nB: [3, 0, 1, 2, 4]\nC: [4, 2, 0, 3, 1]\nD: [1, 0, 4, 3, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 1, 4, 2, 3]\nB: [3, 0, 1, 2, 4]\nC: [4, 2, 0, 3, 1]\nD: [1, 0, 4, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_33_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_33_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_33_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_33_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_33_4.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 2, 1, 0, 4, 5]\nB: [5, 0, 4, 1, 2, 3]\nC: [5, 0, 1, 3, 4, 2]\nD: [1, 0, 5, 4, 3, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 2, 1, 0, 4, 5]\nB: [5, 0, 4, 1, 2, 3]\nC: [5, 0, 1, 3, 4, 2]\nD: [1, 0, 5, 4, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_34_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_34_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_34_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_34_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_34_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_34_5.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 0, 2, 1]\nB: [0, 2, 1, 3]\nC: [0, 3, 2, 1]\nD: [1, 3, 0, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 0, 2, 1]\nB: [0, 2, 1, 3]\nC: [0, 3, 2, 1]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_35_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_35_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_35_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_35_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 2, 0, 1]\nB: [1, 3, 0, 2]\nC: [0, 2, 1, 3]\nD: [1, 3, 0, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 2, 0, 1]\nB: [1, 3, 0, 2]\nC: [0, 2, 1, 3]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_36_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_36_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_36_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_36_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [4, 1, 2, 0, 3]\nB: [0, 3, 2, 1, 4]\nC: [3, 2, 4, 0, 1]\nD: [1, 3, 4, 2, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 1, 2, 0, 3]\nB: [0, 3, 2, 1, 4]\nC: [3, 2, 4, 0, 1]\nD: [1, 3, 4, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_37_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_37_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_37_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_37_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_37_4.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 2, 3, 0]\nB: [3, 0, 2, 1]\nC: [1, 3, 0, 2]\nD: [1, 0, 2, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 2, 3, 0]\nB: [3, 0, 2, 1]\nC: [1, 3, 0, 2]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_38_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_38_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_38_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_38_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 4, 3, 2, 1]\nB: [1, 0, 3, 4, 2]\nC: [4, 3, 2, 0, 1]\nD: [3, 1, 2, 0, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 4, 3, 2, 1]\nB: [1, 0, 3, 4, 2]\nC: [4, 3, 2, 0, 1]\nD: [3, 1, 2, 0, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_39_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_39_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_39_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_39_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_39_4.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [7, 1, 6, 0, 2, 5, 3, 4]\nB: [4, 2, 7, 6, 0, 3, 5, 1]\nC: [2, 0, 5, 1, 4, 3, 7, 6]\nD: [5, 3, 1, 2, 4, 7, 6, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [7, 1, 6, 0, 2, 5, 3, 4]\nB: [4, 2, 7, 6, 0, 3, 5, 1]\nC: [2, 0, 5, 1, 4, 3, 7, 6]\nD: [5, 3, 1, 2, 4, 7, 6, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_40_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_40_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_40_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_40_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_40_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_40_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_40_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_40_7.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 2, 3, 0]\nB: [1, 2, 3, 0]\nC: [2, 0, 3, 1]\nD: [3, 2, 0, 1]", + "question": "Please predict the 
order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 2, 3, 0]\nB: [1, 2, 3, 0]\nC: [2, 0, 3, 1]\nD: [3, 2, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_41_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_41_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_41_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_41_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 1, 0, 4, 2]\nB: [4, 2, 1, 0, 3]\nC: [2, 3, 0, 1, 4]\nD: [2, 3, 4, 1, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 1, 0, 4, 2]\nB: [4, 2, 1, 0, 3]\nC: [2, 3, 0, 1, 4]\nD: [2, 3, 4, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_42_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_42_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_42_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_42_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_42_4.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 0, 1, 2]\nB: [0, 1, 2, 3]\nC: [1, 3, 2, 0]\nD: [1, 3, 2, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 0, 1, 2]\nB: [0, 1, 2, 3]\nC: [1, 3, 2, 0]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_43_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_43_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_43_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_43_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [9, 0, 2, 5, 8, 3, 1, 6, 10, 4, 7]\nB: [9, 2, 1, 4, 7, 8, 3, 0, 10, 5, 6]\nC: [2, 5, 0, 9, 6, 4, 10, 7, 3, 1, 8]\nD: [6, 0, 2, 7, 5, 10, 3, 1, 9, 8, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [9, 0, 2, 5, 8, 3, 1, 6, 10, 4, 7]\nB: [9, 2, 1, 4, 7, 8, 3, 0, 10, 5, 6]\nC: [2, 5, 0, 9, 6, 4, 10, 7, 3, 1, 8]\nD: [6, 0, 2, 7, 5, 10, 3, 1, 9, 8, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_44_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_44_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_44_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_44_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_44_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_44_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_44_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_44_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_44_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_44_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_44_10.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + 
"visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [4, 2, 1, 0, 3]\nB: [3, 4, 0, 1, 2]\nC: [0, 1, 4, 3, 2]\nD: [3, 0, 4, 1, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 2, 1, 0, 3]\nB: [3, 4, 0, 1, 2]\nC: [0, 1, 4, 3, 2]\nD: [3, 0, 4, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_45_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_45_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_45_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_45_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_45_4.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [4, 2, 3, 1, 5, 0]\nB: [1, 5, 0, 2, 4, 3]\nC: [3, 1, 0, 4, 5, 2]\nD: [2, 3, 1, 4, 0, 5]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 2, 3, 1, 5, 0]\nB: [1, 5, 0, 2, 4, 3]\nC: [3, 1, 0, 4, 5, 2]\nD: [2, 3, 1, 4, 0, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_46_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_46_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_46_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_46_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_46_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_46_5.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 2, 3, 1]\nB: [2, 3, 0, 1]\nC: [3, 2, 1, 0]\nD: [1, 2, 3, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 2, 3, 1]\nB: [2, 3, 0, 1]\nC: [3, 2, 1, 0]\nD: [1, 2, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_47_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_47_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_47_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_47_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [8, 3, 1, 4, 9, 5, 6, 10, 7, 0, 2]\nB: [8, 6, 3, 0, 9, 4, 7, 5, 10, 2, 1]\nC: [0, 2, 9, 10, 6, 7, 8, 3, 4, 1, 5]\nD: [7, 3, 1, 10, 6, 2, 0, 8, 5, 9, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [8, 3, 1, 4, 9, 5, 6, 10, 7, 0, 2]\nB: [8, 6, 3, 0, 9, 4, 7, 5, 10, 2, 1]\nC: [0, 2, 9, 10, 6, 7, 8, 3, 4, 1, 5]\nD: [7, 3, 1, 10, 6, 2, 0, 8, 5, 9, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_48_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_48_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_48_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_48_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_48_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_48_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_48_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_48_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_48_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_48_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_48_10.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [4, 6, 12, 5, 11, 23, 16, 21, 9, 25, 0, 8, 20, 13, 1, 10, 7, 3, 24, 14, 19, 17, 22, 15, 18, 2]\nB: [7, 1, 14, 9, 13, 20, 0, 3, 10, 21, 6, 16, 17, 22, 25, 2, 15, 24, 5, 23, 4, 11, 12, 18, 8, 19]\nC: [11, 7, 8, 12, 2, 13, 25, 21, 17, 15, 10, 1, 9, 23, 4, 19, 3, 6, 20, 16, 22, 14, 24, 5, 18, 0]\nD: [0, 3, 15, 18, 12, 4, 5, 24, 19, 1, 16, 9, 25, 22, 8, 2, 13, 14, 20, 6, 21, 23, 17, 10, 7, 11]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 6, 12, 5, 11, 23, 16, 21, 9, 25, 0, 8, 20, 13, 1, 10, 7, 3, 24, 14, 19, 17, 22, 15, 18, 2]\nB: [7, 1, 14, 9, 13, 20, 0, 3, 10, 21, 6, 16, 17, 22, 25, 2, 15, 24, 5, 23, 4, 11, 12, 18, 8, 19]\nC: [11, 7, 8, 12, 2, 13, 25, 21, 17, 15, 10, 1, 9, 23, 4, 19, 3, 6, 20, 16, 22, 14, 24, 5, 18, 0]\nD: [0, 3, 15, 18, 12, 4, 5, 24, 19, 1, 16, 9, 25, 22, 8, 2, 13, 14, 20, 6, 21, 23, 17, 10, 7, 11]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_11.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_12.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_13.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_14.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_15.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_16.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_17.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_18.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_19.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_20.jpg", + 
"../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_21.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_22.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_23.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_24.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_49_25.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 8, 2, 6, 5, 0, 9, 7, 4, 3]\nB: [9, 5, 4, 2, 3, 7, 1, 8, 0, 6]\nC: [5, 0, 1, 2, 7, 8, 3, 4, 6, 9]\nD: [1, 4, 3, 5, 6, 8, 9, 0, 7, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 8, 2, 6, 5, 0, 9, 7, 4, 3]\nB: [9, 5, 4, 2, 3, 7, 1, 8, 0, 6]\nC: [5, 0, 1, 2, 7, 8, 3, 4, 6, 9]\nD: [1, 4, 3, 5, 6, 8, 9, 0, 7, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_50_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_50_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_50_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_50_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_50_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_50_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_50_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_50_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_50_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_50_9.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [6, 1, 0, 5, 2, 3, 4]\nB: [5, 4, 0, 3, 6, 2, 1]\nC: [2, 4, 3, 1, 5, 0, 6]\nD: [5, 4, 6, 1, 3, 0, 2]", + "question": 
"Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [6, 1, 0, 5, 2, 3, 4]\nB: [5, 4, 0, 3, 6, 2, 1]\nC: [2, 4, 3, 1, 5, 0, 6]\nD: [5, 4, 6, 1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_51_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_51_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_51_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_51_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_51_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_51_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_51_6.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 3, 2, 1, 4]\nB: [4, 1, 0, 3, 2]\nC: [3, 4, 2, 1, 0]\nD: [4, 0, 1, 3, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 3, 2, 1, 4]\nB: [4, 1, 0, 3, 2]\nC: [3, 4, 2, 1, 0]\nD: [4, 0, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_52_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_52_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_52_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_52_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_52_4.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 1, 0, 2]\nB: [1, 0, 2, 3]\nC: [0, 1, 2, 3]\nD: [1, 3, 2, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 1, 0, 2]\nB: [1, 0, 2, 3]\nC: [0, 1, 2, 3]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_53_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_53_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_53_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_53_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [7, 6, 0, 2, 1, 3, 4, 5]\nB: [6, 7, 2, 4, 1, 3, 0, 5]\nC: [3, 4, 6, 7, 1, 2, 5, 0]\nD: [2, 7, 5, 0, 6, 1, 4, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [7, 6, 0, 2, 1, 3, 4, 5]\nB: [6, 7, 2, 4, 1, 3, 0, 5]\nC: [3, 4, 6, 7, 1, 2, 5, 0]\nD: [2, 7, 5, 0, 6, 1, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_54_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_54_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_54_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_54_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_54_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_54_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_54_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_54_7.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 3, 0, 5, 6, 1, 8, 4, 7]\nB: [2, 5, 8, 3, 0, 6, 1, 7, 4]\nC: [1, 0, 6, 7, 5, 4, 2, 3, 8]\nD: [5, 8, 0, 3, 4, 1, 6, 7, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 3, 0, 5, 6, 1, 8, 4, 7]\nB: [2, 5, 8, 3, 0, 6, 1, 7, 4]\nC: [1, 0, 6, 7, 5, 4, 2, 3, 8]\nD: [5, 8, 0, 3, 4, 1, 6, 7, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_55_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_55_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_55_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_55_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_55_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_55_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_55_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_55_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_55_8.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [4, 1, 3, 5, 2, 0]\nB: [5, 2, 4, 3, 0, 1]\nC: [5, 3, 2, 0, 1, 4]\nD: [4, 1, 2, 3, 5, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 1, 3, 5, 2, 0]\nB: [5, 2, 4, 3, 0, 1]\nC: [5, 3, 2, 0, 1, 4]\nD: [4, 1, 2, 3, 5, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_56_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_56_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_56_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_56_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_56_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_56_5.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 3, 4, 0, 2, 5]\nB: [5, 2, 0, 1, 3, 4]\nC: [3, 1, 2, 4, 5, 0]\nD: [0, 1, 5, 3, 2, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 3, 4, 0, 2, 5]\nB: [5, 2, 0, 1, 3, 4]\nC: [3, 1, 2, 4, 5, 0]\nD: [0, 1, 5, 3, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_57_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_57_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_57_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_57_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_57_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_57_5.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 3, 1, 0]\nB: [0, 2, 3, 1]\nC: [0, 2, 3, 1]\nD: [0, 2, 3, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 3, 1, 0]\nB: [0, 2, 3, 1]\nC: [0, 2, 3, 1]\nD: [0, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_58_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_58_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_58_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_58_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 3, 5, 15, 12, 11, 2, 13, 1, 10, 6, 14, 9, 4, 8, 7]\nB: [9, 3, 12, 11, 6, 0, 5, 14, 4, 10, 15, 8, 7, 13, 2, 1]\nC: [5, 1, 2, 3, 14, 7, 0, 13, 11, 15, 8, 9, 12, 4, 10, 6]\nD: [13, 6, 14, 10, 2, 3, 5, 12, 7, 15, 9, 8, 4, 1, 0, 11]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 3, 5, 15, 12, 11, 2, 13, 1, 10, 6, 14, 9, 4, 8, 7]\nB: [9, 3, 12, 11, 6, 0, 5, 14, 4, 10, 15, 8, 7, 13, 2, 1]\nC: [5, 1, 2, 3, 14, 7, 0, 13, 11, 15, 8, 9, 12, 4, 10, 6]\nD: [13, 6, 14, 10, 2, 3, 5, 12, 7, 15, 9, 8, 4, 1, 0, 11]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_8.jpg", + 
"../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_11.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_12.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_13.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_14.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_59_15.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [9, 8, 4, 6, 2, 5, 1, 7, 0, 10, 3, 11]\nB: [11, 4, 2, 5, 3, 6, 8, 10, 9, 7, 1, 0]\nC: [7, 8, 1, 6, 11, 5, 4, 3, 0, 2, 9, 10]\nD: [4, 6, 8, 2, 5, 0, 3, 1, 10, 7, 9, 11]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [9, 8, 4, 6, 2, 5, 1, 7, 0, 10, 3, 11]\nB: [11, 4, 2, 5, 3, 6, 8, 10, 9, 7, 1, 0]\nC: [7, 8, 1, 6, 11, 5, 4, 3, 0, 2, 9, 10]\nD: [4, 6, 8, 2, 5, 0, 3, 1, 10, 7, 9, 11]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_60_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_60_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_60_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_60_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_60_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_60_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_60_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_60_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_60_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_60_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_60_10.jpg", + 
"../MMIU-Benchmark/temporal_ordering/temporal_ordering_60_11.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 4, 1, 2, 5, 3]\nB: [4, 2, 5, 0, 3, 1]\nC: [3, 1, 0, 2, 5, 4]\nD: [0, 2, 3, 1, 4, 5]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 4, 1, 2, 5, 3]\nB: [4, 2, 5, 0, 3, 1]\nC: [3, 1, 0, 2, 5, 4]\nD: [0, 2, 3, 1, 4, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_61_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_61_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_61_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_61_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_61_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_61_5.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 2, 0, 1]\nB: [0, 3, 2, 1]\nC: [1, 2, 0, 3]\nD: [1, 3, 2, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 2, 0, 1]\nB: [0, 3, 2, 1]\nC: [1, 2, 0, 3]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_62_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_62_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_62_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_62_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "pouring", + "options": "A: [8, 1, 0, 6, 5, 7, 2, 3, 4]\nB: [2, 6, 8, 7, 4, 0, 1, 5, 3]\nC: [5, 8, 2, 7, 1, 3, 0, 4, 6]\nD: [1, 3, 2, 6, 5, 8, 0, 7, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [8, 1, 0, 6, 5, 7, 2, 3, 4]\nB: [2, 6, 8, 7, 4, 0, 1, 5, 3]\nC: [5, 8, 2, 7, 1, 3, 0, 4, 6]\nD: [1, 3, 2, 6, 5, 8, 0, 7, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_63_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_63_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_63_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_63_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_63_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_63_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_63_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_63_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_63_8.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 4, 2, 5, 0, 1]\nB: [4, 5, 1, 2, 0, 3]\nC: [4, 2, 0, 1, 3, 5]\nD: [0, 4, 3, 1, 2, 5]", + 
"question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 4, 2, 5, 0, 1]\nB: [4, 5, 1, 2, 0, 3]\nC: [4, 2, 0, 1, 3, 5]\nD: [0, 4, 3, 1, 2, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_64_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_64_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_64_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_64_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_64_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_64_5.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 4, 0, 2, 1]\nB: [4, 1, 0, 2, 3]\nC: [1, 0, 2, 3, 4]\nD: [0, 1, 3, 4, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 4, 0, 2, 1]\nB: [4, 1, 0, 2, 3]\nC: [1, 0, 2, 3, 4]\nD: [0, 1, 3, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_65_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_65_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_65_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_65_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_65_4.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 5, 4, 3, 2, 1]\nB: [1, 3, 2, 4, 5, 0]\nC: [4, 2, 5, 1, 0, 3]\nD: [4, 0, 3, 5, 2, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 5, 4, 3, 2, 1]\nB: [1, 3, 2, 4, 5, 0]\nC: [4, 2, 5, 1, 0, 3]\nD: [4, 0, 3, 5, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_66_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_66_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_66_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_66_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_66_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_66_5.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 1, 5, 4, 2, 3]\nB: [0, 5, 1, 2, 4, 3]\nC: [1, 3, 0, 4, 5, 2]\nD: [4, 0, 3, 1, 2, 5]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 1, 5, 4, 2, 3]\nB: [0, 5, 1, 2, 4, 3]\nC: [1, 3, 0, 4, 5, 2]\nD: [4, 0, 3, 1, 2, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_67_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_67_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_67_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_67_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_67_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_67_5.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 6, 2, 5, 7, 3, 1, 4]\nB: [5, 1, 0, 6, 4, 7, 3, 2]\nC: [6, 4, 2, 5, 7, 1, 0, 3]\nD: [0, 2, 4, 7, 3, 5, 6, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 6, 2, 5, 7, 3, 1, 4]\nB: [5, 1, 0, 6, 4, 7, 3, 2]\nC: [6, 4, 2, 5, 7, 1, 0, 3]\nD: [0, 2, 4, 7, 3, 5, 6, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_68_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_68_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_68_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_68_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_68_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_68_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_68_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_68_7.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 1, 2, 0]\nB: [3, 1, 2, 0]\nC: [0, 1, 2, 3]\nD: [1, 3, 0, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 1, 2, 0]\nB: [3, 1, 2, 0]\nC: [0, 1, 2, 3]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_69_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_69_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_69_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_69_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 2, 0, 3]\nB: [1, 3, 0, 2]\nC: [2, 0, 1, 3]\nD: [0, 1, 3, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 2, 0, 3]\nB: [1, 3, 0, 2]\nC: [2, 0, 1, 3]\nD: [0, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_70_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_70_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_70_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_70_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 0, 2, 1]\nB: [3, 0, 2, 1]\nC: [3, 2, 0, 1]\nD: [2, 1, 3, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 0, 2, 1]\nB: [3, 0, 2, 1]\nC: [3, 2, 0, 1]\nD: [2, 1, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_71_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_71_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_71_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_71_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [5, 3, 1, 0, 2, 4]\nB: [5, 4, 0, 2, 1, 3]\nC: [3, 5, 4, 0, 2, 1]\nD: [2, 0, 3, 5, 1, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [5, 3, 1, 0, 2, 4]\nB: [5, 4, 0, 2, 1, 3]\nC: [3, 5, 4, 0, 2, 1]\nD: [2, 0, 3, 5, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_72_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_72_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_72_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_72_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_72_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_72_5.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 1, 3, 2]\nB: [3, 2, 0, 1]\nC: [3, 1, 2, 0]\nD: [3, 0, 2, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 1, 3, 2]\nB: [3, 2, 0, 1]\nC: [3, 1, 2, 0]\nD: [3, 0, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_73_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_73_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_73_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_73_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "pouring", + "options": "A: [3, 4, 5, 6, 2, 1, 0]\nB: [2, 6, 3, 0, 1, 5, 4]\nC: [5, 2, 4, 6, 3, 1, 0]\nD: [4, 0, 6, 1, 2, 5, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 4, 5, 6, 2, 1, 0]\nB: [2, 6, 3, 0, 1, 5, 4]\nC: [5, 2, 4, 6, 3, 1, 0]\nD: [4, 0, 6, 1, 2, 5, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_74_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_74_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_74_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_74_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_74_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_74_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_74_6.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 5, 3, 4, 1, 0, 6]\nB: [5, 6, 1, 0, 4, 2, 3]\nC: [0, 4, 2, 5, 1, 6, 3]\nD: [4, 3, 2, 6, 1, 5, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 5, 3, 4, 1, 0, 6]\nB: [5, 6, 1, 0, 4, 2, 3]\nC: [0, 4, 2, 5, 1, 6, 3]\nD: [4, 3, 2, 6, 1, 5, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_75_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_75_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_75_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_75_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_75_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_75_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_75_6.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [5, 2, 0, 6, 3, 4, 1]\nB: [5, 4, 3, 6, 2, 0, 1]\nC: [4, 6, 3, 2, 5, 0, 1]\nD: [3, 4, 2, 5, 0, 1, 6]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [5, 2, 0, 6, 3, 4, 1]\nB: [5, 4, 3, 6, 2, 0, 1]\nC: [4, 6, 3, 2, 5, 0, 1]\nD: [3, 4, 2, 5, 0, 1, 6]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_76_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_76_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_76_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_76_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_76_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_76_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_76_6.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 3, 2, 1]\nB: [2, 1, 0, 3]\nC: [1, 0, 2, 3]\nD: [1, 0, 2, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 3, 2, 1]\nB: [2, 1, 0, 3]\nC: [1, 0, 2, 3]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_77_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_77_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_77_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_77_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "pouring", + "options": "A: [7, 9, 0, 8, 4, 2, 5, 6, 1, 3]\nB: [3, 4, 8, 9, 2, 1, 6, 7, 0, 5]\nC: [4, 0, 9, 1, 6, 7, 3, 5, 8, 2]\nD: [7, 2, 1, 9, 4, 6, 5, 3, 8, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [7, 9, 0, 8, 4, 2, 5, 6, 1, 3]\nB: [3, 4, 8, 9, 2, 1, 6, 7, 0, 5]\nC: [4, 0, 9, 1, 6, 7, 3, 5, 8, 2]\nD: [7, 2, 1, 9, 4, 6, 5, 3, 8, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_78_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_78_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_78_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_78_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_78_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_78_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_78_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_78_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_78_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_78_9.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 2, 3, 0]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [3, 2, 1, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 2, 3, 0]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [3, 2, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_79_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_79_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_79_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_79_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [12, 19, 11, 1, 15, 16, 5, 3, 7, 6, 4, 14, 9, 17, 2, 10, 0, 8, 18, 13]\nB: [8, 19, 1, 11, 9, 14, 10, 4, 0, 7, 6, 12, 15, 13, 17, 2, 5, 16, 18, 3]\nC: [13, 2, 0, 15, 10, 17, 11, 7, 4, 1, 19, 16, 5, 18, 9, 8, 6, 14, 12, 3]\nD: [19, 11, 3, 18, 7, 8, 12, 16, 17, 2, 0, 10, 6, 15, 1, 13, 14, 5, 9, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [12, 19, 11, 1, 15, 16, 5, 3, 7, 6, 4, 14, 9, 17, 2, 10, 0, 8, 18, 13]\nB: [8, 19, 1, 11, 9, 14, 10, 4, 0, 7, 6, 12, 15, 13, 17, 2, 5, 16, 18, 3]\nC: [13, 2, 0, 15, 10, 17, 11, 7, 4, 1, 19, 16, 5, 18, 9, 8, 6, 14, 12, 3]\nD: [19, 11, 3, 18, 7, 8, 12, 16, 17, 2, 0, 10, 6, 15, 1, 13, 14, 5, 9, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_11.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_12.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_13.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_14.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_15.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_16.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_17.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_18.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_80_19.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 3, 2, 1]\nB: [1, 2, 3, 0]\nC: [2, 3, 1, 
0]\nD: [1, 3, 0, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 3, 2, 1]\nB: [1, 2, 3, 0]\nC: [2, 3, 1, 0]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_81_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_81_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_81_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_81_3.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "pouring", + "options": "A: [3, 5, 1, 9, 10, 2, 4, 6, 0, 8, 7]\nB: [3, 7, 6, 8, 2, 4, 9, 5, 0, 1, 10]\nC: [0, 9, 6, 7, 10, 1, 2, 3, 4, 5, 8]\nD: [3, 9, 10, 0, 4, 8, 1, 5, 7, 2, 6]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 5, 1, 9, 10, 2, 4, 6, 0, 8, 7]\nB: [3, 7, 6, 8, 2, 4, 9, 5, 0, 1, 10]\nC: [0, 9, 6, 7, 10, 1, 2, 3, 4, 5, 8]\nD: [3, 9, 10, 0, 4, 8, 1, 5, 7, 2, 6]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_82_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_82_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_82_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_82_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_82_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_82_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_82_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_82_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_82_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_82_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_82_10.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "pouring", + "options": "A: [8, 6, 1, 7, 0, 5, 3, 4, 2, 9]\nB: [2, 0, 9, 8, 6, 3, 4, 5, 7, 1]\nC: [4, 7, 3, 0, 1, 2, 5, 6, 8, 9]\nD: [9, 5, 2, 3, 6, 0, 1, 7, 4, 8]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [8, 6, 1, 7, 0, 5, 3, 4, 2, 9]\nB: [2, 0, 9, 8, 6, 3, 4, 5, 7, 1]\nC: [4, 7, 3, 0, 1, 2, 5, 6, 8, 9]\nD: [9, 5, 2, 3, 6, 0, 1, 7, 4, 8]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_83_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_83_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_83_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_83_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_83_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_83_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_83_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_83_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_83_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_83_9.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [4, 1, 6, 2, 5, 0, 3]\nB: [6, 5, 3, 0, 4, 2, 1]\nC: [3, 5, 6, 2, 4, 0, 1]\nD: [5, 3, 4, 1, 0, 6, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 1, 6, 2, 5, 0, 3]\nB: [6, 5, 3, 0, 4, 2, 1]\nC: [3, 5, 6, 2, 4, 0, 1]\nD: [5, 3, 4, 1, 0, 6, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_84_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_84_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_84_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_84_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_84_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_84_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_84_6.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [7, 0, 5, 6, 3, 4, 1, 2]\nB: [2, 6, 7, 0, 5, 1, 3, 4]\nC: [0, 2, 3, 5, 4, 6, 7, 1]\nD: [0, 4, 2, 5, 3, 7, 1, 6]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [7, 0, 5, 6, 3, 4, 1, 2]\nB: [2, 6, 7, 0, 5, 1, 3, 4]\nC: [0, 2, 3, 5, 4, 6, 7, 1]\nD: [0, 4, 2, 5, 3, 7, 1, 6]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_85_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_85_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_85_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_85_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_85_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_85_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_85_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_85_7.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [5, 1, 6, 2, 0, 7, 3, 4]\nB: [6, 0, 3, 1, 5, 2, 7, 4]\nC: [1, 5, 6, 0, 4, 7, 3, 2]\nD: [0, 4, 7, 3, 2, 6, 1, 5]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [5, 1, 6, 2, 0, 7, 3, 4]\nB: [6, 0, 3, 1, 5, 2, 7, 4]\nC: [1, 5, 6, 0, 4, 7, 3, 2]\nD: [0, 4, 7, 3, 2, 6, 1, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_86_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_86_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_86_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_86_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_86_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_86_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_86_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_86_7.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 5, 0, 4, 3, 2, 6]\nB: [3, 6, 1, 5, 0, 2, 4]\nC: [5, 6, 4, 1, 0, 2, 3]\nD: [6, 2, 1, 5, 4, 0, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 5, 0, 4, 3, 2, 6]\nB: [3, 6, 1, 5, 0, 2, 4]\nC: [5, 6, 4, 1, 0, 2, 3]\nD: [6, 2, 1, 5, 4, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_87_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_87_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_87_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_87_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_87_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_87_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_87_6.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 13, 28, 14, 21, 5, 27, 1, 24, 23, 2, 12, 18, 22, 29, 9, 17, 30, 6, 11, 10, 3, 16, 8, 25, 7, 19, 15, 20, 4, 26]\nB: [5, 2, 10, 30, 8, 15, 29, 22, 27, 12, 13, 3, 24, 11, 0, 25, 19, 16, 23, 17, 18, 21, 7, 20, 6, 1, 28, 14, 26, 9, 4]\nC: [1, 2, 29, 6, 0, 22, 19, 21, 9, 13, 12, 18, 23, 3, 30, 5, 7, 20, 8, 25, 16, 28, 10, 11, 26, 15, 14, 4, 24, 27, 17]\nD: [21, 24, 8, 10, 5, 29, 13, 19, 26, 17, 28, 20, 7, 16, 14, 25, 15, 0, 6, 9, 3, 2, 4, 1, 22, 12, 18, 30, 23, 27, 11]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 13, 28, 14, 21, 5, 27, 1, 24, 23, 2, 12, 18, 22, 29, 9, 17, 30, 6, 11, 10, 3, 16, 8, 25, 7, 19, 15, 20, 4, 26]\nB: [5, 2, 10, 30, 8, 15, 29, 22, 27, 12, 13, 3, 24, 11, 0, 25, 19, 16, 23, 17, 18, 21, 7, 20, 6, 1, 28, 14, 26, 9, 4]\nC: [1, 2, 29, 6, 0, 22, 19, 21, 9, 13, 12, 18, 23, 3, 30, 5, 7, 20, 8, 25, 16, 28, 10, 11, 26, 15, 14, 4, 24, 27, 17]\nD: [21, 24, 8, 10, 5, 29, 13, 19, 26, 17, 28, 20, 7, 16, 14, 25, 15, 0, 6, 9, 3, 2, 4, 1, 22, 12, 18, 30, 23, 27, 11]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_11.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_12.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_13.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_14.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_15.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_16.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_17.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_18.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_19.jpg", + 
"../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_20.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_21.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_22.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_23.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_24.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_25.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_26.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_27.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_28.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_29.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_88_30.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 2, 1, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 1, 0]\nD: [2, 3, 0, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 2, 1, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 1, 0]\nD: [2, 3, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_89_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_89_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_89_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_89_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 1, 3, 2]\nB: [1, 0, 2, 3]\nC: [3, 0, 1, 2]\nD: [0, 2, 1, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 1, 3, 2]\nB: [1, 0, 2, 3]\nC: [3, 0, 1, 2]\nD: [0, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_90_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_90_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_90_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_90_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 1, 3, 0]\nB: [1, 0, 3, 2]\nC: [0, 1, 3, 2]\nD: [0, 1, 3, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 1, 3, 0]\nB: [1, 0, 3, 2]\nC: [0, 1, 3, 2]\nD: [0, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_91_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_91_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_91_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_91_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 2, 0, 3]\nB: [3, 2, 0, 1]\nC: [0, 2, 3, 1]\nD: [2, 0, 1, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 2, 0, 3]\nB: [3, 2, 0, 1]\nC: [0, 2, 3, 1]\nD: [2, 0, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_92_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_92_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_92_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_92_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [4, 2, 1, 0, 3]\nB: [2, 3, 4, 1, 0]\nC: [3, 1, 4, 0, 2]\nD: [4, 3, 0, 2, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 2, 1, 0, 3]\nB: [2, 3, 4, 1, 0]\nC: [3, 1, 4, 0, 2]\nD: [4, 3, 0, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_93_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_93_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_93_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_93_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_93_4.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 4, 1, 5, 2, 0]\nB: [0, 2, 4, 3, 1, 5]\nC: [5, 1, 2, 0, 3, 4]\nD: [0, 3, 4, 5, 1, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 4, 1, 5, 2, 0]\nB: [0, 2, 4, 3, 1, 5]\nC: [5, 1, 2, 0, 3, 4]\nD: [0, 3, 4, 5, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_94_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_94_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_94_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_94_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_94_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_94_5.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 2, 1, 3]\nB: [0, 2, 3, 1]\nC: [0, 1, 3, 2]\nD: [3, 1, 0, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 2, 1, 3]\nB: [0, 2, 3, 1]\nC: [0, 1, 3, 2]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_95_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_95_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_95_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_95_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [4, 1, 0, 3, 2]\nB: [0, 1, 3, 4, 2]\nC: [1, 4, 3, 2, 0]\nD: [2, 4, 0, 3, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 1, 0, 3, 2]\nB: [0, 1, 3, 4, 2]\nC: [1, 4, 3, 2, 0]\nD: [2, 4, 0, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_96_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_96_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_96_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_96_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_96_4.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 0, 4, 2, 6, 1, 5]\nB: [1, 6, 4, 0, 2, 5, 3]\nC: [4, 6, 3, 1, 0, 2, 5]\nD: [3, 1, 5, 2, 6, 0, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 0, 4, 2, 6, 1, 5]\nB: [1, 6, 4, 0, 2, 5, 3]\nC: [4, 6, 3, 1, 0, 2, 5]\nD: [3, 1, 5, 2, 6, 0, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_97_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_97_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_97_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_97_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_97_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_97_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_97_6.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "pouring", + "options": "A: [5, 3, 6, 0, 2, 1, 4]\nB: [3, 0, 2, 4, 5, 6, 1]\nC: [3, 6, 5, 1, 0, 2, 4]\nD: [3, 1, 2, 0, 4, 6, 5]", + "question": "Please predict the order of the following pictures, and give each picture a 
sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [5, 3, 6, 0, 2, 1, 4]\nB: [3, 0, 2, 4, 5, 6, 1]\nC: [3, 6, 5, 1, 0, 2, 4]\nD: [3, 1, 2, 0, 4, 6, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_98_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_98_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_98_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_98_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_98_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_98_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_98_6.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 3, 4, 2, 1]\nB: [1, 0, 4, 2, 3]\nC: [2, 1, 4, 3, 0]\nD: [2, 0, 4, 1, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 3, 4, 2, 1]\nB: [1, 0, 4, 2, 3]\nC: [2, 1, 4, 3, 0]\nD: [2, 0, 4, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_99_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_99_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_99_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_99_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_99_4.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [5, 2, 3, 0, 4, 1]\nB: [3, 5, 4, 1, 2, 0]\nC: [1, 0, 4, 5, 2, 3]\nD: [0, 1, 5, 3, 4, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [5, 2, 3, 0, 4, 1]\nB: [3, 5, 4, 1, 2, 0]\nC: [1, 0, 4, 5, 2, 3]\nD: [0, 1, 5, 3, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_100_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_100_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_100_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_100_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_100_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_100_5.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 1, 3, 0]\nB: [3, 0, 2, 1]\nC: [0, 1, 2, 3]\nD: [3, 1, 0, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 1, 3, 0]\nB: [3, 0, 2, 1]\nC: [0, 1, 2, 3]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_101_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_101_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_101_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_101_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 2, 1, 0]\nB: [3, 2, 1, 0]\nC: [1, 2, 3, 0]\nD: [1, 3, 0, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 2, 1, 0]\nB: [3, 2, 1, 0]\nC: [1, 2, 3, 0]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_102_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_102_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_102_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_102_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "pouring", + "options": "A: [3, 1, 2, 0, 6, 4, 5]\nB: [6, 2, 4, 3, 5, 1, 0]\nC: [3, 4, 2, 6, 5, 1, 0]\nD: [5, 0, 1, 6, 2, 4, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 1, 2, 0, 6, 4, 5]\nB: [6, 2, 4, 3, 5, 1, 0]\nC: [3, 4, 2, 6, 5, 1, 0]\nD: [5, 0, 1, 6, 2, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_103_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_103_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_103_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_103_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_103_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_103_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_103_6.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [8, 2, 0, 5, 7, 4, 6, 3, 1]\nB: [7, 8, 1, 0, 6, 4, 5, 3, 2]\nC: [7, 1, 6, 5, 4, 2, 8, 0, 3]\nD: [0, 4, 5, 1, 2, 6, 3, 7, 8]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [8, 2, 0, 5, 7, 4, 6, 3, 1]\nB: [7, 8, 1, 0, 6, 4, 5, 3, 2]\nC: [7, 1, 6, 5, 4, 2, 8, 0, 3]\nD: [0, 4, 5, 1, 2, 6, 3, 7, 8]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_104_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_104_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_104_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_104_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_104_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_104_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_104_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_104_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_104_8.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 2, 3, 0]\nB: [2, 0, 3, 1]\nC: [1, 2, 3, 0]\nD: [0, 3, 2, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 2, 3, 0]\nB: [2, 0, 3, 1]\nC: [1, 2, 3, 0]\nD: [0, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_105_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_105_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_105_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_105_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 3, 1, 2]\nB: [2, 3, 0, 1]\nC: [1, 0, 3, 2]\nD: [1, 0, 2, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 3, 1, 2]\nB: [2, 3, 0, 1]\nC: [1, 0, 3, 2]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_106_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_106_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_106_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_106_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 4, 7, 10, 2, 6, 8, 11, 0, 9, 12, 13, 5, 1]\nB: [8, 1, 12, 0, 9, 2, 11, 13, 5, 6, 4, 3, 10, 7]\nC: [5, 11, 12, 4, 13, 6, 3, 0, 9, 10, 1, 2, 8, 7]\nD: [4, 13, 1, 11, 2, 8, 10, 9, 6, 5, 0, 7, 3, 12]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 4, 7, 10, 2, 6, 8, 11, 0, 9, 12, 13, 5, 1]\nB: [8, 1, 12, 0, 9, 2, 11, 13, 5, 6, 4, 3, 10, 7]\nC: [5, 11, 12, 4, 13, 6, 3, 0, 9, 10, 1, 2, 8, 7]\nD: [4, 13, 1, 11, 2, 8, 10, 9, 6, 5, 0, 7, 3, 12]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_107_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_107_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_107_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_107_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_107_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_107_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_107_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_107_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_107_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_107_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_107_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_107_11.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_107_12.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_107_13.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 2, 9, 10, 6, 11, 5, 3, 8, 13, 4, 7, 1, 12]\nB: [1, 6, 10, 7, 9, 13, 2, 0, 5, 3, 4, 8, 12, 11]\nC: [6, 12, 3, 8, 5, 11, 4, 10, 1, 0, 13, 2, 7, 9]\nD: [2, 7, 9, 4, 10, 11, 3, 8, 0, 13, 6, 12, 1, 5]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 2, 9, 10, 6, 11, 5, 3, 8, 13, 4, 7, 1, 12]\nB: [1, 6, 10, 7, 9, 13, 2, 0, 5, 3, 4, 8, 12, 11]\nC: [6, 12, 3, 8, 5, 11, 4, 10, 1, 0, 13, 2, 7, 9]\nD: [2, 7, 9, 4, 10, 11, 3, 8, 0, 13, 6, 12, 1, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_108_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_108_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_108_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_108_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_108_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_108_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_108_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_108_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_108_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_108_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_108_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_108_11.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_108_12.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_108_13.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [7, 5, 6, 2, 0, 1, 3, 4]\nB: [7, 6, 4, 0, 1, 3, 5, 2]\nC: [4, 5, 2, 1, 6, 7, 0, 3]\nD: [4, 5, 6, 2, 7, 1, 0, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [7, 5, 6, 2, 0, 1, 3, 4]\nB: [7, 6, 4, 0, 1, 3, 5, 2]\nC: [4, 5, 2, 1, 6, 7, 0, 3]\nD: [4, 5, 6, 2, 7, 1, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_109_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_109_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_109_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_109_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_109_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_109_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_109_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_109_7.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 2, 1, 0]\nB: [3, 0, 2, 1]\nC: [0, 1, 3, 2]\nD: [3, 0, 2, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 2, 1, 0]\nB: [3, 0, 2, 1]\nC: [0, 1, 3, 2]\nD: [3, 0, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_110_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_110_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_110_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_110_3.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 4, 0, 3, 2]\nB: [2, 3, 1, 0, 4]\nC: [4, 1, 0, 3, 2]\nD: [3, 0, 2, 4, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 4, 0, 3, 2]\nB: [2, 3, 1, 0, 4]\nC: [4, 1, 0, 3, 2]\nD: [3, 0, 2, 4, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_111_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_111_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_111_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_111_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_111_4.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "pouring", + "options": "A: [4, 0, 1, 7, 5, 2, 3, 8, 6]\nB: [4, 2, 3, 1, 7, 8, 0, 5, 6]\nC: [5, 6, 7, 1, 4, 0, 3, 2, 8]\nD: [3, 1, 8, 7, 5, 6, 0, 4, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 0, 1, 7, 5, 2, 3, 8, 6]\nB: [4, 2, 3, 1, 7, 8, 0, 5, 6]\nC: [5, 6, 7, 1, 4, 0, 3, 2, 8]\nD: [3, 1, 8, 7, 5, 6, 0, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_112_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_112_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_112_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_112_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_112_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_112_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_112_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_112_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_112_8.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: 
[2, 1, 3, 0, 4]\nB: [4, 2, 0, 1, 3]\nC: [2, 0, 4, 3, 1]\nD: [1, 3, 2, 0, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 1, 3, 0, 4]\nB: [4, 2, 0, 1, 3]\nC: [2, 0, 4, 3, 1]\nD: [1, 3, 2, 0, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_113_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_113_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_113_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_113_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_113_4.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 2, 0, 1, 4, 5]\nB: [3, 0, 1, 5, 2, 4]\nC: [5, 3, 0, 4, 2, 1]\nD: [4, 1, 5, 2, 3, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 2, 0, 1, 4, 5]\nB: [3, 0, 1, 5, 2, 4]\nC: [5, 3, 0, 4, 2, 1]\nD: [4, 1, 5, 2, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_114_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_114_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_114_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_114_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_114_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_114_5.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 0, 3, 2]\nB: [0, 3, 2, 1]\nC: [2, 0, 3, 1]\nD: [1, 3, 0, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 0, 3, 2]\nB: [0, 3, 2, 1]\nC: [2, 0, 3, 1]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_115_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_115_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_115_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_115_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 0, 2, 4, 3]\nB: [4, 3, 1, 0, 2]\nC: [0, 2, 4, 1, 3]\nD: [4, 3, 2, 1, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 0, 2, 4, 3]\nB: [4, 3, 1, 0, 2]\nC: [0, 2, 4, 1, 3]\nD: [4, 3, 2, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_116_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_116_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_116_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_116_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_116_4.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 4, 3, 0, 1]\nB: [1, 0, 3, 2, 4]\nC: [1, 2, 4, 0, 3]\nD: [2, 4, 3, 1, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 4, 3, 0, 1]\nB: [1, 0, 3, 2, 4]\nC: [1, 2, 4, 0, 3]\nD: [2, 4, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_117_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_117_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_117_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_117_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_117_4.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 3, 0, 1, 4, 6, 8, 5, 7]\nB: [3, 1, 7, 4, 0, 8, 2, 5, 6]\nC: [3, 5, 8, 1, 7, 0, 2, 4, 6]\nD: [5, 7, 0, 1, 4, 6, 3, 8, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 3, 0, 1, 4, 6, 8, 5, 7]\nB: [3, 1, 7, 4, 0, 8, 2, 5, 6]\nC: [3, 5, 8, 1, 7, 0, 2, 4, 6]\nD: [5, 7, 0, 1, 4, 6, 3, 8, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_118_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_118_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_118_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_118_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_118_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_118_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_118_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_118_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_118_8.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 4, 0, 3, 1]\nB: [2, 4, 3, 1, 0]\nC: [0, 3, 4, 1, 2]\nD: [1, 2, 4, 3, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 4, 0, 3, 1]\nB: [2, 4, 3, 1, 0]\nC: [0, 3, 4, 1, 2]\nD: [1, 2, 4, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_119_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_119_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_119_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_119_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_119_4.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 0, 3, 2]\nB: [1, 3, 2, 0]\nC: [1, 2, 3, 0]\nD: [2, 0, 1, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 0, 3, 2]\nB: [1, 3, 2, 0]\nC: [1, 2, 3, 0]\nD: [2, 0, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_120_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_120_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_120_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_120_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 2, 3, 0]\nB: [0, 1, 3, 2]\nC: [1, 2, 3, 0]\nD: [3, 1, 0, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 2, 3, 0]\nB: [0, 1, 3, 2]\nC: [1, 2, 3, 0]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_121_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_121_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_121_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_121_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 2, 4, 1, 0]\nB: [1, 3, 0, 4, 2]\nC: [4, 0, 1, 2, 3]\nD: [1, 4, 2, 0, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 2, 4, 1, 0]\nB: [1, 3, 0, 4, 2]\nC: [4, 0, 1, 2, 3]\nD: [1, 4, 2, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_122_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_122_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_122_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_122_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_122_4.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [5, 3, 4, 1, 7, 12, 13, 10, 2, 6, 0, 9, 11, 8]\nB: [11, 10, 2, 4, 13, 8, 5, 3, 0, 6, 7, 9, 12, 1]\nC: [4, 6, 12, 1, 5, 11, 2, 8, 13, 3, 0, 9, 10, 7]\nD: [6, 2, 5, 7, 12, 10, 8, 0, 3, 13, 9, 11, 1, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [5, 3, 4, 1, 7, 12, 13, 10, 2, 6, 0, 9, 11, 8]\nB: [11, 10, 2, 4, 13, 8, 5, 3, 0, 6, 7, 9, 12, 1]\nC: [4, 6, 12, 1, 5, 11, 2, 8, 13, 3, 0, 9, 10, 7]\nD: [6, 2, 5, 7, 12, 10, 8, 0, 3, 13, 9, 11, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_123_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_123_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_123_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_123_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_123_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_123_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_123_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_123_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_123_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_123_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_123_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_123_11.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_123_12.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_123_13.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 2, 3, 0]\nB: [2, 3, 1, 0]\nC: [3, 1, 2, 0]\nD: [1, 2, 0, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 2, 3, 0]\nB: [2, 3, 1, 0]\nC: [3, 1, 2, 0]\nD: [1, 2, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_124_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_124_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_124_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_124_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 5, 3, 1, 2, 10, 19, 16, 8, 7, 21, 15, 22, 18, 17, 4, 14, 13, 20, 11, 9, 6, 12]\nB: [19, 2, 1, 20, 8, 14, 18, 7, 15, 12, 13, 17, 10, 5, 9, 3, 21, 4, 6, 11, 16, 22, 0]\nC: [3, 0, 7, 10, 15, 21, 14, 6, 22, 11, 1, 8, 18, 5, 17, 2, 4, 12, 19, 13, 20, 9, 16]\nD: [3, 11, 7, 6, 22, 15, 1, 9, 20, 19, 4, 18, 10, 21, 12, 2, 13, 17, 16, 0, 8, 14, 5]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 5, 3, 1, 2, 10, 19, 16, 8, 7, 21, 15, 22, 18, 17, 4, 14, 13, 20, 11, 9, 6, 12]\nB: [19, 2, 1, 20, 8, 14, 18, 7, 15, 12, 13, 17, 10, 5, 9, 3, 21, 4, 6, 11, 16, 22, 0]\nC: [3, 0, 7, 10, 15, 21, 14, 6, 22, 11, 1, 8, 18, 5, 17, 2, 4, 12, 19, 13, 20, 9, 16]\nD: [3, 11, 7, 6, 22, 15, 1, 9, 20, 19, 4, 18, 10, 21, 12, 2, 13, 17, 16, 0, 8, 14, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_11.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_12.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_13.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_14.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_15.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_16.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_17.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_18.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_19.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_20.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_21.jpg", + 
"../MMIU-Benchmark/temporal_ordering/temporal_ordering_125_22.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 2, 1, 0, 4]\nB: [4, 1, 3, 0, 2]\nC: [1, 4, 2, 3, 0]\nD: [1, 3, 4, 0, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 2, 1, 0, 4]\nB: [4, 1, 3, 0, 2]\nC: [1, 4, 2, 3, 0]\nD: [1, 3, 4, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_126_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_126_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_126_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_126_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_126_4.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 7, 6, 9, 5, 8, 3, 0, 4, 2]\nB: [4, 2, 5, 1, 0, 3, 7, 6, 8, 9]\nC: [0, 9, 2, 7, 1, 3, 5, 8, 4, 6]\nD: [6, 7, 4, 2, 9, 0, 5, 1, 8, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 7, 6, 9, 5, 8, 3, 0, 4, 2]\nB: [4, 2, 5, 1, 0, 3, 7, 6, 8, 9]\nC: [0, 9, 2, 7, 1, 3, 5, 8, 4, 6]\nD: [6, 7, 4, 2, 9, 0, 5, 1, 8, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_127_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_127_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_127_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_127_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_127_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_127_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_127_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_127_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_127_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_127_9.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [0, 2, 3, 1]\nD: [3, 1, 0, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [0, 2, 3, 1]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_128_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_128_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_128_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_128_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [4, 5, 0, 7, 1, 3, 6, 2]\nB: [2, 6, 3, 1, 0, 4, 5, 7]\nC: [7, 1, 6, 0, 5, 2, 4, 3]\nD: [4, 3, 7, 6, 2, 0, 5, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 5, 0, 7, 1, 3, 6, 2]\nB: [2, 6, 3, 1, 0, 4, 5, 7]\nC: [7, 1, 6, 0, 5, 2, 4, 3]\nD: [4, 3, 7, 6, 2, 0, 5, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_129_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_129_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_129_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_129_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_129_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_129_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_129_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_129_7.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 2, 3, 0]\nB: [2, 3, 1, 0]\nC: [0, 1, 2, 3]\nD: [1, 3, 0, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential 
index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 2, 3, 0]\nB: [2, 3, 1, 0]\nC: [0, 1, 2, 3]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_130_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_130_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_130_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_130_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 2, 0, 3]\nB: [3, 0, 1, 2]\nC: [1, 3, 0, 2]\nD: [0, 2, 1, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 2, 0, 3]\nB: [3, 0, 1, 2]\nC: [1, 3, 0, 2]\nD: [0, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_131_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_131_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_131_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_131_3.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 3, 2, 0]\nB: [2, 3, 0, 1]\nC: [3, 0, 2, 1]\nD: [1, 2, 0, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 3, 2, 0]\nB: [2, 3, 0, 1]\nC: [3, 0, 2, 1]\nD: [1, 2, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_132_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_132_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_132_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_132_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 4, 2, 5, 1, 0, 6]\nB: [4, 6, 3, 2, 5, 0, 1]\nC: [4, 5, 3, 1, 6, 2, 0]\nD: [5, 0, 2, 6, 3, 1, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 4, 2, 5, 1, 0, 6]\nB: [4, 6, 3, 2, 5, 0, 1]\nC: [4, 5, 3, 1, 6, 2, 0]\nD: [5, 0, 2, 6, 3, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_133_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_133_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_133_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_133_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_133_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_133_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_133_6.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 1, 0, 2]\nB: [1, 3, 0, 2]\nC: [0, 2, 3, 1]\nD: [3, 1, 0, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 1, 0, 2]\nB: [1, 3, 0, 2]\nC: [0, 2, 3, 1]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_134_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_134_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_134_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_134_3.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [7, 6, 4, 8, 3, 2, 0, 1, 5]\nB: [0, 1, 7, 8, 3, 4, 5, 2, 6]\nC: [3, 2, 7, 4, 1, 8, 6, 0, 5]\nD: [5, 7, 2, 1, 6, 0, 8, 4, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [7, 6, 4, 8, 3, 2, 0, 1, 5]\nB: [0, 1, 7, 8, 3, 4, 5, 2, 6]\nC: [3, 2, 7, 4, 1, 8, 6, 0, 5]\nD: [5, 7, 2, 1, 6, 0, 8, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_135_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_135_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_135_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_135_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_135_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_135_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_135_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_135_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_135_8.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 2, 1, 3]\nB: [0, 1, 2, 3]\nC: [3, 0, 2, 1]\nD: [3, 1, 0, 2]", + 
"question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 2, 1, 3]\nB: [0, 1, 2, 3]\nC: [3, 0, 2, 1]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_136_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_136_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_136_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_136_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [20, 10, 5, 3, 21, 12, 19, 16, 9, 7, 2, 8, 11, 6, 1, 13, 17, 0, 18, 15, 14, 4]\nB: [19, 10, 12, 21, 3, 17, 11, 1, 20, 18, 9, 14, 5, 8, 6, 13, 15, 2, 4, 0, 16, 7]\nC: [19, 6, 4, 2, 11, 13, 17, 5, 10, 7, 20, 8, 1, 15, 16, 21, 18, 9, 12, 3, 0, 14]\nD: [5, 15, 18, 21, 1, 19, 14, 16, 9, 3, 12, 2, 8, 20, 7, 4, 10, 13, 11, 17, 0, 6]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [20, 10, 5, 3, 21, 12, 19, 16, 9, 7, 2, 8, 11, 6, 1, 13, 17, 0, 18, 15, 14, 4]\nB: [19, 10, 12, 21, 3, 17, 11, 1, 20, 18, 9, 14, 5, 8, 6, 13, 15, 2, 4, 0, 16, 7]\nC: [19, 6, 4, 2, 11, 13, 17, 5, 10, 7, 20, 8, 1, 15, 16, 21, 18, 9, 12, 3, 0, 14]\nD: [5, 15, 18, 21, 1, 19, 14, 16, 9, 3, 12, 2, 8, 20, 7, 4, 10, 13, 11, 17, 0, 6]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_11.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_12.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_13.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_14.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_15.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_16.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_17.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_18.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_19.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_20.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_137_21.jpg" + ], + "output": 
"B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [4, 3, 1, 6, 2, 0, 5]\nB: [5, 1, 2, 0, 6, 3, 4]\nC: [1, 6, 2, 4, 0, 5, 3]\nD: [0, 5, 3, 1, 2, 4, 6]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 3, 1, 6, 2, 0, 5]\nB: [5, 1, 2, 0, 6, 3, 4]\nC: [1, 6, 2, 4, 0, 5, 3]\nD: [0, 5, 3, 1, 2, 4, 6]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_138_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_138_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_138_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_138_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_138_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_138_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_138_6.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 3, 0, 1]\nB: [2, 1, 0, 3]\nC: [0, 1, 2, 3]\nD: [2, 3, 1, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 3, 0, 1]\nB: [2, 1, 0, 3]\nC: [0, 1, 2, 3]\nD: [2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_139_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_139_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_139_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_139_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 0, 1, 2]\nB: [0, 1, 2, 3]\nC: [2, 0, 1, 3]\nD: [1, 3, 2, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 0, 1, 2]\nB: [0, 1, 2, 3]\nC: [2, 0, 1, 3]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_140_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_140_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_140_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_140_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [6, 4, 1, 5, 3, 2, 0]\nB: [3, 6, 5, 0, 2, 4, 1]\nC: [5, 3, 2, 1, 6, 0, 4]\nD: [0, 4, 6, 2, 1, 3, 5]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [6, 4, 1, 5, 3, 2, 0]\nB: [3, 6, 5, 0, 2, 4, 1]\nC: [5, 3, 2, 1, 6, 0, 4]\nD: [0, 4, 6, 2, 1, 3, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_141_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_141_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_141_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_141_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_141_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_141_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_141_6.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [5, 2, 7, 3, 1, 6, 0, 4]\nB: [3, 5, 6, 0, 7, 4, 2, 1]\nC: [0, 1, 5, 4, 3, 7, 6, 2]\nD: [1, 0, 3, 5, 7, 6, 2, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [5, 2, 7, 3, 1, 6, 0, 4]\nB: [3, 5, 6, 0, 7, 4, 2, 1]\nC: [0, 1, 5, 4, 3, 7, 6, 2]\nD: [1, 0, 3, 5, 7, 6, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_142_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_142_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_142_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_142_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_142_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_142_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_142_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_142_7.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 3, 2, 0]\nB: [2, 0, 3, 1]\nC: [2, 3, 1, 0]\nD: [0, 2, 1, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 3, 2, 0]\nB: [2, 0, 3, 1]\nC: [2, 3, 1, 0]\nD: [0, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_143_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_143_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_143_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_143_3.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "pouring", + "options": "A: [0, 4, 2, 5, 1, 3]\nB: [3, 2, 0, 5, 1, 4]\nC: [1, 3, 0, 4, 5, 2]\nD: [3, 5, 4, 2, 0, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. 
This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 4, 2, 5, 1, 3]\nB: [3, 2, 0, 5, 1, 4]\nC: [1, 3, 0, 4, 5, 2]\nD: [3, 5, 4, 2, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_144_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_144_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_144_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_144_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_144_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_144_5.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "pouring", + "options": "A: [7, 6, 4, 2, 5, 1, 3, 9, 8, 0]\nB: [5, 8, 6, 1, 9, 3, 4, 0, 2, 7]\nC: [7, 8, 5, 1, 0, 4, 9, 6, 3, 2]\nD: [7, 9, 6, 3, 5, 0, 8, 2, 1, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [7, 6, 4, 2, 5, 1, 3, 9, 8, 0]\nB: [5, 8, 6, 1, 9, 3, 4, 0, 2, 7]\nC: [7, 8, 5, 1, 0, 4, 9, 6, 3, 2]\nD: [7, 9, 6, 3, 5, 0, 8, 2, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_145_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_145_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_145_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_145_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_145_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_145_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_145_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_145_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_145_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_145_9.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 0, 1, 2]\nB: [1, 3, 2, 0]\nC: [0, 1, 3, 2]\nD: [3, 2, 1, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 0, 1, 2]\nB: [1, 3, 2, 0]\nC: [0, 1, 3, 2]\nD: [3, 2, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_146_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_146_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_146_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_146_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 3, 1, 0]\nB: [1, 0, 2, 3]\nC: [1, 2, 0, 3]\nD: [1, 3, 2, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 3, 1, 0]\nB: [1, 0, 2, 3]\nC: [1, 2, 0, 3]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_147_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_147_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_147_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_147_3.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 0, 3, 4, 2]\nB: [1, 3, 4, 2, 0]\nC: [0, 3, 4, 1, 2]\nD: [1, 3, 4, 0, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 0, 3, 4, 2]\nB: [1, 3, 4, 2, 0]\nC: [0, 3, 4, 1, 2]\nD: [1, 3, 4, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_148_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_148_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_148_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_148_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_148_4.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 3, 2, 1]\nB: [3, 2, 0, 1]\nC: [2, 1, 3, 0]\nD: [2, 3, 0, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 3, 2, 1]\nB: [3, 2, 0, 1]\nC: [2, 1, 3, 0]\nD: [2, 3, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_149_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_149_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_149_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_149_3.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [5, 3, 1, 2, 4, 0, 6]\nB: [1, 3, 0, 6, 2, 5, 4]\nC: [4, 5, 3, 0, 1, 6, 2]\nD: [2, 1, 0, 6, 4, 5, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [5, 3, 1, 2, 4, 0, 6]\nB: [1, 3, 0, 6, 2, 5, 4]\nC: [4, 5, 3, 0, 1, 6, 2]\nD: [2, 1, 0, 6, 4, 5, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_150_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_150_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_150_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_150_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_150_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_150_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_150_6.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [4, 6, 0, 1, 2, 3, 5]\nB: [5, 1, 0, 6, 2, 4, 3]\nC: [2, 6, 5, 4, 0, 1, 3]\nD: [0, 4, 1, 3, 6, 2, 5]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 6, 0, 1, 2, 3, 5]\nB: [5, 1, 0, 6, 2, 4, 3]\nC: [2, 6, 5, 4, 0, 1, 3]\nD: [0, 4, 1, 3, 6, 2, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_151_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_151_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_151_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_151_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_151_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_151_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_151_6.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 0, 2, 1]\nB: [0, 3, 2, 1]\nC: [1, 2, 3, 0]\nD: [1, 0, 2, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 0, 2, 1]\nB: [0, 3, 2, 1]\nC: [1, 2, 3, 0]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_152_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_152_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_152_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_152_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 7, 12, 10, 2, 5, 3, 9, 6, 4, 13, 8, 11, 1]\nB: [11, 13, 7, 3, 1, 8, 5, 0, 4, 6, 12, 9, 10, 2]\nC: [0, 2, 11, 13, 12, 9, 10, 1, 8, 4, 3, 6, 7, 5]\nD: [10, 5, 4, 13, 11, 12, 0, 6, 8, 7, 2, 9, 1, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 7, 12, 10, 2, 5, 3, 9, 6, 4, 13, 8, 11, 1]\nB: [11, 13, 7, 3, 1, 8, 5, 0, 4, 6, 12, 9, 10, 2]\nC: [0, 2, 11, 13, 12, 9, 10, 1, 8, 4, 3, 6, 7, 5]\nD: [10, 5, 4, 13, 11, 12, 0, 6, 8, 7, 2, 9, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_153_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_153_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_153_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_153_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_153_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_153_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_153_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_153_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_153_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_153_9.jpg", + 
"../MMIU-Benchmark/temporal_ordering/temporal_ordering_153_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_153_11.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_153_12.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_153_13.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 3, 0, 1, 4]\nB: [1, 0, 2, 3, 4]\nC: [4, 2, 3, 1, 0]\nD: [0, 1, 2, 4, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 3, 0, 1, 4]\nB: [1, 0, 2, 3, 4]\nC: [4, 2, 3, 1, 0]\nD: [0, 1, 2, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_154_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_154_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_154_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_154_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_154_4.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 3, 1, 2]\nB: [0, 1, 2, 3]\nC: [1, 0, 3, 2]\nD: [0, 2, 1, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 3, 1, 2]\nB: [0, 1, 2, 3]\nC: [1, 0, 3, 2]\nD: [0, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_155_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_155_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_155_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_155_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 3, 0, 1]\nB: [0, 1, 2, 3]\nC: [0, 1, 2, 3]\nD: [0, 2, 1, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 3, 0, 1]\nB: [0, 1, 2, 3]\nC: [0, 1, 2, 3]\nD: [0, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_156_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_156_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_156_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_156_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 4, 3, 0, 2]\nB: [2, 4, 0, 3, 1]\nC: [1, 4, 2, 0, 3]\nD: [4, 1, 0, 2, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 4, 3, 0, 2]\nB: [2, 4, 0, 3, 1]\nC: [1, 4, 2, 0, 3]\nD: [4, 1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_157_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_157_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_157_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_157_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_157_4.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 4, 1, 2, 3, 5]\nB: [1, 5, 2, 4, 3, 0]\nC: [3, 0, 1, 2, 4, 5]\nD: [3, 2, 1, 4, 0, 5]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 4, 1, 2, 3, 5]\nB: [1, 5, 2, 4, 3, 0]\nC: [3, 0, 1, 2, 4, 5]\nD: [3, 2, 1, 4, 0, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_158_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_158_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_158_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_158_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_158_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_158_5.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [5, 1, 0, 8, 9, 4, 2, 6, 3, 7]\nB: [5, 1, 7, 6, 0, 4, 9, 3, 8, 2]\nC: [2, 1, 8, 6, 0, 3, 7, 9, 4, 5]\nD: [5, 7, 9, 8, 2, 0, 6, 4, 3, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [5, 1, 0, 8, 9, 4, 2, 6, 3, 7]\nB: [5, 1, 7, 6, 0, 4, 9, 3, 8, 2]\nC: [2, 1, 8, 6, 0, 3, 7, 9, 4, 5]\nD: [5, 7, 9, 8, 2, 0, 6, 4, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_159_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_159_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_159_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_159_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_159_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_159_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_159_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_159_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_159_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_159_9.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 2, 6, 7, 4, 5, 10, 1, 8, 3, 9]\nB: [8, 4, 9, 2, 5, 0, 6, 3, 1, 10, 7]\nC: [6, 8, 3, 10, 2, 7, 5, 4, 9, 0, 1]\nD: [5, 7, 10, 0, 9, 4, 3, 6, 1, 2, 8]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 2, 6, 7, 4, 5, 10, 1, 8, 3, 9]\nB: [8, 4, 9, 2, 5, 0, 6, 3, 1, 10, 7]\nC: [6, 8, 3, 10, 2, 7, 5, 4, 9, 0, 1]\nD: [5, 7, 10, 0, 9, 4, 3, 6, 1, 2, 8]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_160_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_160_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_160_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_160_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_160_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_160_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_160_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_160_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_160_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_160_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_160_10.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 0, 2, 1]\nB: [2, 0, 1, 3]\nC: [0, 2, 1, 3]\nD: [1, 3, 2, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 0, 2, 1]\nB: [2, 0, 1, 3]\nC: [0, 2, 1, 3]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_161_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_161_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_161_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_161_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 3, 1, 4, 5, 2]\nB: [3, 2, 4, 0, 5, 1]\nC: [3, 2, 5, 0, 1, 4]\nD: [5, 1, 2, 3, 4, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 3, 1, 4, 5, 2]\nB: [3, 2, 4, 0, 5, 1]\nC: [3, 2, 5, 0, 1, 4]\nD: [5, 1, 2, 3, 4, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_162_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_162_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_162_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_162_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_162_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_162_5.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 1, 5, 2, 4, 3]\nB: [3, 1, 5, 4, 0, 2]\nC: [5, 0, 4, 3, 2, 1]\nD: [1, 3, 5, 2, 0, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 1, 5, 2, 4, 3]\nB: [3, 1, 5, 4, 0, 2]\nC: [5, 0, 4, 3, 2, 1]\nD: [1, 3, 5, 2, 0, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_163_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_163_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_163_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_163_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_163_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_163_5.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 1, 2, 9, 4, 7, 6, 5, 10, 8, 3]\nB: [5, 3, 2, 1, 0, 4, 6, 8, 9, 10, 7]\nC: [9, 5, 8, 6, 10, 2, 1, 3, 0, 4, 7]\nD: [4, 10, 3, 2, 8, 7, 0, 9, 5, 6, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 1, 2, 9, 4, 7, 6, 5, 10, 8, 3]\nB: [5, 3, 2, 1, 0, 4, 6, 8, 9, 10, 7]\nC: [9, 5, 8, 6, 10, 2, 1, 3, 0, 4, 7]\nD: [4, 10, 3, 2, 8, 7, 0, 9, 5, 6, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_164_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_164_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_164_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_164_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_164_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_164_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_164_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_164_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_164_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_164_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_164_10.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 2, 1, 4, 0]\nB: [1, 4, 3, 2, 0]\nC: [1, 4, 3, 0, 2]\nD: [2, 4, 1, 3, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 2, 1, 4, 0]\nB: [1, 4, 3, 2, 0]\nC: [1, 4, 3, 0, 2]\nD: [2, 4, 1, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_165_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_165_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_165_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_165_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_165_4.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "pouring", + "options": "A: [4, 0, 6, 2, 7, 3, 5, 1]\nB: [4, 1, 0, 7, 3, 6, 5, 2]\nC: [5, 7, 4, 6, 2, 0, 3, 1]\nD: [7, 2, 5, 4, 1, 0, 6, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 0, 6, 2, 7, 3, 5, 1]\nB: [4, 1, 0, 7, 3, 6, 5, 2]\nC: [5, 7, 4, 6, 2, 0, 3, 1]\nD: [7, 2, 5, 4, 1, 0, 6, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_166_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_166_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_166_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_166_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_166_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_166_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_166_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_166_7.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 10, 6, 0, 12, 1, 13, 4, 9, 8, 3, 7, 5, 11, 14]\nB: [2, 7, 5, 6, 11, 3, 1, 14, 8, 4, 12, 
10, 0, 13, 9]\nC: [4, 6, 7, 1, 5, 10, 12, 2, 0, 14, 13, 8, 9, 3, 11]\nD: [6, 2, 5, 14, 9, 4, 11, 1, 10, 12, 7, 13, 8, 3, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 10, 6, 0, 12, 1, 13, 4, 9, 8, 3, 7, 5, 11, 14]\nB: [2, 7, 5, 6, 11, 3, 1, 14, 8, 4, 12, 10, 0, 13, 9]\nC: [4, 6, 7, 1, 5, 10, 12, 2, 0, 14, 13, 8, 9, 3, 11]\nD: [6, 2, 5, 14, 9, 4, 11, 1, 10, 12, 7, 13, 8, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_167_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_167_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_167_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_167_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_167_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_167_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_167_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_167_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_167_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_167_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_167_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_167_11.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_167_12.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_167_13.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_167_14.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 2, 3, 1]\nB: [1, 2, 0, 3]\nC: [2, 0, 1, 3]\nD: [1, 0, 3, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. 
This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 2, 3, 1]\nB: [1, 2, 0, 3]\nC: [2, 0, 1, 3]\nD: [1, 0, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_168_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_168_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_168_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_168_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 5, 1, 3, 4, 2]\nB: [0, 5, 1, 3, 2, 4]\nC: [3, 0, 4, 1, 2, 5]\nD: [2, 4, 1, 0, 5, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 5, 1, 3, 4, 2]\nB: [0, 5, 1, 3, 2, 4]\nC: [3, 0, 4, 1, 2, 5]\nD: [2, 4, 1, 0, 5, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_169_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_169_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_169_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_169_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_169_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_169_5.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "pouring", + "options": "A: [8, 0, 7, 5, 1, 2, 3, 4, 6]\nB: [0, 7, 8, 4, 1, 6, 5, 3, 2]\nC: [1, 5, 3, 0, 4, 2, 6, 7, 8]\nD: [7, 2, 4, 1, 5, 0, 3, 6, 8]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [8, 0, 7, 5, 1, 2, 3, 4, 6]\nB: [0, 7, 8, 4, 1, 6, 5, 3, 2]\nC: [1, 5, 3, 0, 4, 2, 6, 7, 8]\nD: [7, 2, 4, 1, 5, 0, 3, 6, 8]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_170_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_170_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_170_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_170_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_170_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_170_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_170_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_170_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_170_8.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 4, 3, 2, 1, 5]\nB: [3, 5, 0, 1, 4, 2]\nC: [2, 5, 3, 1, 4, 0]\nD: [4, 2, 0, 5, 1, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 4, 3, 2, 1, 5]\nB: [3, 5, 0, 1, 4, 2]\nC: [2, 5, 3, 1, 4, 0]\nD: [4, 2, 0, 5, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_171_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_171_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_171_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_171_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_171_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_171_5.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [4, 5, 2, 3, 1, 0]\nB: [2, 1, 5, 0, 4, 3]\nC: [0, 5, 2, 1, 3, 4]\nD: [1, 0, 3, 4, 2, 5]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 5, 2, 3, 1, 0]\nB: [2, 1, 5, 0, 4, 3]\nC: [0, 5, 2, 1, 3, 4]\nD: [1, 0, 3, 4, 2, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_172_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_172_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_172_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_172_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_172_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_172_5.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 1, 0, 2]\nB: [3, 2, 1, 0]\nC: [2, 1, 3, 0]\nD: [3, 2, 0, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. 
This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 1, 0, 2]\nB: [3, 2, 1, 0]\nC: [2, 1, 3, 0]\nD: [3, 2, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_173_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_173_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_173_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_173_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [6, 2, 1, 0, 4, 5, 3]\nB: [5, 3, 1, 0, 2, 4, 6]\nC: [1, 0, 5, 3, 4, 6, 2]\nD: [4, 6, 5, 0, 1, 2, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [6, 2, 1, 0, 4, 5, 3]\nB: [5, 3, 1, 0, 2, 4, 6]\nC: [1, 0, 5, 3, 4, 6, 2]\nD: [4, 6, 5, 0, 1, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_174_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_174_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_174_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_174_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_174_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_174_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_174_6.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 4, 5, 2, 3, 0]\nB: [5, 0, 1, 2, 4, 3]\nC: [2, 4, 3, 0, 1, 5]\nD: [5, 2, 0, 4, 3, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 4, 5, 2, 3, 0]\nB: [5, 0, 1, 2, 4, 3]\nC: [2, 4, 3, 0, 1, 5]\nD: [5, 2, 0, 4, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_175_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_175_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_175_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_175_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_175_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_175_5.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 1, 0, 3]\nB: [0, 1, 3, 2]\nC: [2, 1, 3, 0]\nD: [3, 2, 0, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 1, 0, 3]\nB: [0, 1, 3, 2]\nC: [2, 1, 3, 0]\nD: [3, 2, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_176_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_176_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_176_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_176_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 1, 4, 5, 0, 2]\nB: [2, 4, 3, 5, 1, 0]\nC: [5, 3, 4, 0, 2, 1]\nD: [2, 3, 0, 4, 1, 5]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 1, 4, 5, 0, 2]\nB: [2, 4, 3, 5, 1, 0]\nC: [5, 3, 4, 0, 2, 1]\nD: [2, 3, 0, 4, 1, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_177_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_177_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_177_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_177_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_177_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_177_5.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 0, 3, 2]\nB: [3, 2, 1, 0]\nC: [1, 2, 0, 3]\nD: [2, 3, 0, 1]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 0, 3, 2]\nB: [3, 2, 1, 0]\nC: [1, 2, 0, 3]\nD: [2, 3, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_178_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_178_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_178_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_178_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [5, 0, 3, 2, 1, 4]\nB: [4, 1, 5, 3, 2, 0]\nC: [0, 3, 4, 2, 5, 1]\nD: [5, 4, 2, 0, 1, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [5, 0, 3, 2, 1, 4]\nB: [4, 1, 5, 3, 2, 0]\nC: [0, 3, 4, 2, 5, 1]\nD: [5, 4, 2, 0, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_179_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_179_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_179_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_179_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_179_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_179_5.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 1, 2, 0]\nB: [0, 2, 1, 3]\nC: [3, 1, 0, 2]\nD: [2, 3, 1, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 1, 2, 0]\nB: [0, 2, 1, 3]\nC: [3, 1, 0, 2]\nD: [2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_180_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_180_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_180_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_180_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 1, 3, 0]\nB: [1, 3, 2, 0]\nC: [3, 2, 1, 0]\nD: [0, 1, 3, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 1, 3, 0]\nB: [1, 3, 2, 0]\nC: [3, 2, 1, 0]\nD: [0, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_181_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_181_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_181_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_181_3.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [11, 1, 2, 5, 15, 3, 0, 12, 6, 10, 8, 9, 14, 7, 13, 4]\nB: [9, 5, 6, 7, 2, 3, 4, 8, 0, 11, 12, 10, 15, 13, 14, 1]\nC: [10, 5, 6, 4, 15, 14, 12, 9, 11, 3, 13, 2, 0, 1, 8, 7]\nD: [10, 11, 7, 3, 0, 14, 4, 12, 8, 2, 1, 13, 5, 6, 15, 9]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [11, 1, 2, 5, 15, 3, 0, 12, 6, 10, 8, 9, 14, 7, 13, 4]\nB: [9, 5, 6, 7, 2, 3, 4, 8, 0, 11, 12, 10, 15, 13, 14, 1]\nC: [10, 5, 6, 4, 15, 14, 12, 9, 11, 3, 13, 2, 0, 1, 8, 7]\nD: [10, 11, 7, 3, 0, 14, 4, 12, 8, 2, 1, 13, 5, 6, 15, 9]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_8.jpg", + 
"../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_11.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_12.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_13.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_14.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_182_15.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 5, 0, 7, 4, 2, 3, 6]\nB: [6, 4, 7, 1, 3, 0, 5, 2]\nC: [0, 3, 6, 7, 1, 4, 2, 5]\nD: [5, 0, 3, 7, 2, 1, 6, 4]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 5, 0, 7, 4, 2, 3, 6]\nB: [6, 4, 7, 1, 3, 0, 5, 2]\nC: [0, 3, 6, 7, 1, 4, 2, 5]\nD: [5, 0, 3, 7, 2, 1, 6, 4]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_183_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_183_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_183_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_183_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_183_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_183_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_183_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_183_7.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [2, 4, 0, 3, 1]\nB: [1, 0, 4, 3, 2]\nC: [0, 3, 1, 2, 4]\nD: [4, 0, 3, 2, 1]", + "question": "Please predict the order of the following pictures, 
and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [2, 4, 0, 3, 1]\nB: [1, 0, 4, 3, 2]\nC: [0, 3, 1, 2, 4]\nD: [4, 0, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_184_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_184_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_184_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_184_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_184_4.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 1, 9, 4, 10, 2, 7, 8, 5, 6, 3]\nB: [7, 1, 10, 6, 8, 3, 0, 9, 4, 2, 5]\nC: [9, 10, 8, 0, 2, 1, 4, 3, 7, 5, 6]\nD: [10, 5, 0, 3, 8, 9, 4, 2, 1, 7, 6]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 1, 9, 4, 10, 2, 7, 8, 5, 6, 3]\nB: [7, 1, 10, 6, 8, 3, 0, 9, 4, 2, 5]\nC: [9, 10, 8, 0, 2, 1, 4, 3, 7, 5, 6]\nD: [10, 5, 0, 3, 8, 9, 4, 2, 1, 7, 6]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_185_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_185_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_185_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_185_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_185_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_185_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_185_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_185_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_185_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_185_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_185_10.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [6, 0, 2, 7, 8, 4, 1, 3, 5, 9, 10]\nB: [9, 5, 1, 3, 10, 4, 2, 0, 7, 6, 8]\nC: [5, 10, 1, 8, 4, 2, 3, 6, 9, 7, 0]\nD: [5, 7, 6, 0, 4, 3, 1, 2, 9, 8, 10]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [6, 0, 2, 7, 8, 4, 1, 3, 5, 9, 10]\nB: [9, 5, 1, 3, 10, 4, 2, 0, 7, 6, 8]\nC: [5, 10, 1, 8, 4, 2, 3, 6, 9, 7, 0]\nD: [5, 7, 6, 0, 4, 3, 1, 2, 9, 8, 10]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_186_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_186_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_186_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_186_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_186_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_186_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_186_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_186_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_186_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_186_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_186_10.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [4, 0, 1, 2, 3]\nB: [4, 3, 2, 1, 0]\nC: [1, 0, 3, 2, 4]\nD: [0, 4, 1, 2, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 0, 1, 2, 3]\nB: [4, 3, 2, 1, 0]\nC: [1, 0, 3, 2, 4]\nD: [0, 4, 1, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_187_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_187_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_187_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_187_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_187_4.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 0, 1, 2]\nB: [1, 3, 0, 2]\nC: [2, 3, 0, 1]\nD: [3, 1, 2, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 0, 1, 2]\nB: [1, 3, 0, 2]\nC: [2, 3, 0, 1]\nD: [3, 1, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_188_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_188_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_188_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_188_3.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "pouring", + "options": "A: [5, 4, 2, 6, 3, 1, 0]\nB: [5, 2, 3, 0, 6, 4, 1]\nC: [5, 1, 4, 2, 3, 0, 6]\nD: [2, 4, 0, 5, 6, 1, 3]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [5, 4, 2, 6, 3, 1, 0]\nB: [5, 2, 3, 0, 6, 4, 1]\nC: [5, 1, 4, 2, 3, 0, 6]\nD: [2, 4, 0, 5, 6, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_189_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_189_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_189_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_189_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_189_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_189_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_189_6.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 2, 0, 1]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 3, 1, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 2, 0, 1]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_190_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_190_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_190_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_190_3.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [10, 0, 8, 5, 11, 4, 1, 7, 3, 2, 9, 6]\nB: [10, 5, 7, 2, 3, 4, 1, 6, 11, 8, 0, 9]\nC: [11, 9, 4, 10, 7, 6, 8, 1, 5, 2, 0, 3]\nD: [2, 0, 1, 4, 9, 10, 5, 6, 8, 3, 11, 7]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. 
This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [10, 0, 8, 5, 11, 4, 1, 7, 3, 2, 9, 6]\nB: [10, 5, 7, 2, 3, 4, 1, 6, 11, 8, 0, 9]\nC: [11, 9, 4, 10, 7, 6, 8, 1, 5, 2, 0, 3]\nD: [2, 0, 1, 4, 9, 10, 5, 6, 8, 3, 11, 7]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_191_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_191_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_191_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_191_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_191_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_191_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_191_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_191_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_191_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_191_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_191_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_191_11.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [5, 2, 4, 0, 6, 1, 3]\nB: [1, 4, 6, 0, 3, 2, 5]\nC: [6, 2, 3, 4, 0, 5, 1]\nD: [6, 2, 3, 4, 0, 1, 5]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [5, 2, 4, 0, 6, 1, 3]\nB: [1, 4, 6, 0, 3, 2, 5]\nC: [6, 2, 3, 4, 0, 5, 1]\nD: [6, 2, 3, 4, 0, 1, 5]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_192_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_192_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_192_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_192_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_192_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_192_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_192_6.jpg" + ], + "output": "C" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [3, 0, 2, 1]\nB: [0, 2, 1, 3]\nC: [0, 3, 1, 2]\nD: [2, 3, 1, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [3, 0, 2, 1]\nB: [0, 2, 1, 3]\nC: [0, 3, 1, 2]\nD: [2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_193_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_193_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_193_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_193_3.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [6, 5, 3, 1, 2, 4, 0]\nB: [3, 0, 6, 5, 1, 4, 2]\nC: [4, 6, 1, 5, 0, 3, 2]\nD: [5, 2, 6, 1, 4, 3, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [6, 5, 3, 1, 2, 4, 0]\nB: [3, 0, 6, 5, 1, 4, 2]\nC: [4, 6, 1, 5, 0, 3, 2]\nD: [5, 2, 6, 1, 4, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_194_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_194_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_194_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_194_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_194_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_194_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_194_6.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [2, 3, 1, 0]\nD: [2, 3, 1, 0]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [2, 3, 1, 0]\nD: [2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_195_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_195_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_195_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_195_3.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [4, 3, 2, 0, 1]\nB: [3, 0, 1, 2, 4]\nC: [0, 2, 4, 1, 3]\nD: [0, 3, 4, 1, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [4, 3, 2, 0, 1]\nB: [3, 0, 1, 2, 4]\nC: [0, 2, 4, 1, 3]\nD: [0, 3, 4, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_196_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_196_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_196_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_196_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_196_4.jpg" + ], + "output": "D" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [6, 0, 1, 7, 5, 4, 3, 2]\nB: [5, 0, 1, 6, 4, 7, 3, 2]\nC: [2, 1, 6, 7, 4, 3, 0, 5]\nD: [3, 6, 0, 1, 7, 4, 5, 2]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [6, 0, 1, 7, 5, 4, 3, 2]\nB: [5, 0, 1, 6, 4, 7, 3, 2]\nC: [2, 1, 6, 7, 4, 3, 0, 5]\nD: [3, 6, 0, 1, 7, 4, 5, 2]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_197_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_197_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_197_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_197_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_197_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_197_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_197_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_197_7.jpg" + ], + "output": "A" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [1, 2, 3, 4, 0]\nB: [3, 2, 1, 0, 4]\nC: [4, 1, 0, 2, 3]\nD: [0, 3, 2, 4, 1]", + 
"question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [1, 2, 3, 4, 0]\nB: [3, 2, 1, 0, 4]\nC: [4, 1, 0, 2, 3]\nD: [0, 3, 2, 4, 1]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_198_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_198_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_198_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_198_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_198_4.jpg" + ], + "output": "B" + }, + { + "task": "temporal_ordering", + "visual_input_component": "Video image or Natural image", + "source": "penn_action", + "options": "A: [0, 8, 2, 6, 5, 4, 7, 3, 9, 1, 11, 10]\nB: [8, 7, 1, 2, 11, 10, 5, 9, 4, 6, 3, 0]\nC: [1, 4, 11, 9, 3, 0, 10, 5, 7, 6, 2, 8]\nD: [3, 8, 2, 11, 1, 7, 5, 10, 0, 4, 9, 6]", + "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", + "context": "Select from the following choices.\nA: [0, 8, 2, 6, 5, 4, 7, 3, 9, 1, 11, 10]\nB: [8, 7, 1, 2, 11, 10, 5, 9, 4, 6, 3, 0]\nC: [1, 4, 11, 9, 3, 0, 10, 5, 7, 6, 2, 8]\nD: [3, 8, 2, 11, 1, 7, 5, 10, 0, 4, 9, 6]", + "input_image_path": [ + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_199_0.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_199_1.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_199_2.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_199_3.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_199_4.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_199_5.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_199_6.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_199_7.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_199_8.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_199_9.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_199_10.jpg", + "../MMIU-Benchmark/temporal_ordering/temporal_ordering_199_11.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Baconpancakes\nB: 7. Con't Sweet & Sour Sauce\nC: 8. Thicken the Sauce\nD: Ingredients", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Garlic & Quorn', 'Bake', 'Freezing and Serving']", + "context": "Here is the context of these images:\n. Quorn 600gr - or ground meat or just veggies .. perhaps fish could work?\nOnions, garlic, corn, baked beans, paprika powder and everything else you need to make a filling. \nSet the oven for medium heat. I use 220'C. . Add oil and butter at medium heat to a big pot. Do not burn the butter. . Add chilli flakes to the pot and remove from heat. Chop some onions and add to the pot. Put the heat on again and stir. Do not burn. 
\nI find that adding the chilli in this way releases the flavour more evenly. . Add chopped garlic and quorn to the pot and stir. Let it fry a bit. . Add tomato sauce , taco sauce, water , spices, paprika powder and top it off with some mustard. Stir and let simmer for a while. . Make a basic sweet white dough. I'm doing a double load in the bread maker , enough for two loaves. My basic recepie is ;\nFlour 1,8 liter\nWater 0,6 liter\nOil 0,2 liter\nSalt 1tablespoon\nSugar 8 tablespoons\nDry jeast 1 packet\nYou'll need twice this for this much sauce. . Divide you dough into 16 pieces. Roll a piece round and flatten it to a disc. Put the disc on a bakingplate with a cookie sheet on. \nAdd a bit of filling, cover with a slice of cheese and brush the edges with water. Fold the edges up to make a small package. \nRepeat. . Bake for 10-15 minutes at 220'C or until slightly brown. . If you made them to freeze like I do make the sauce a bit spicier since it looses some after freezing. \nHeat from frozen to edible in a microwave at full power for three minutes. Serve with a salad for a complete meal. \nHeat on a BBQ by letting them thaw out to room temperature first. \nRead the question below and select from the following choices.\nA: Baconpancakes\nB: 7. Con't Sweet & Sour Sauce\nC: 8. 
Thicken the Sauce\nD: Ingredients", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_0_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_0_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_0_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_0_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_0_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_0_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_0_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_0_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_0_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_0_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_0_10.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Outback \"Copy Cat Recipe\" From Aunt Jo\nB: Next, Make a Bath\nC: Fold Down Retention Band\nD: Fold and Crimp", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Cut Top Off', 'Cut the Retention Band', 'Measure and Cut Flaps', '@placeholder']", + "context": "Here is the context of these images:\n. You'll need one standard milk cartonbox cutterpencil or permanent markerruler. Measure 1 cm down from the top edge of the carton body. Mark an horizontal line on each side. Be careful, knives are sharp! Cut along these lines through all four sides. Remove the top and discard.. Measure down 2.5 cm from the new top edge. Mark an horizontal line at this point on three sides. Cut along line through three sides. Leave band attached to one side of carton body.. Measure up 9 cm from bottom of carton, make a mark at corner junction. Make a cut from this mark, through the corner up to the band.. There should be three flaps and one flap with an attached band.. Fold three side flaps down evenly into the cavity of the carton.. Finally fold down the last flap with the retention band. 
Work the band down around the side of the carton to form the closure. And Voila!This is my very first instructable! Please comment! And I hope you will enjoy!\nRead the question below and select from the following choices.\nA: Outback \"Copy Cat Recipe\" From Aunt Jo\nB: Next, Make a Bath\nC: Fold Down Retention Band\nD: Fold and Crimp", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_1_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_1_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_1_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_1_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_1_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_1_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_1_6.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: How to Make Marmalade\nB: How to Make Brioche\nC: BONUS\nD: Store", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather Walnuts From Ground', 'There Are Two Methods to Remove the Green Skin of Walnuts', 'Protect Your Hands With Gloves Or...', '@placeholder']", + "context": "Here is the context of these images:\n. After we hit the walnuts from walnut tree, walnuts dropt to the ground, and we filled our sacks with walnuts. . There are two methods to remove the green skin of walnuts:1. Use knife to cut directly.2. Use crevises to divide the green shell with your hands with gloves. (Protect your hands not to get coloured)While my father is cutting walnuts, I prefered to use second option to peel walnuts. . Use your hands to extract all walnuts one by one to peel.. Although I used gloves to protect my hands, still I get colour changes on my skins... I can't understand how my gloves unable to protect my hands from walnut's shell.. Next time, I will use two gloves each to my hands.. 
Check for subtitles for your native language. Video is Turkish. .That's all !. This year, we collected a little bit early, but I think just in time because next week it rained, and made tree and walnuts wet. We protected walnuts from wet to prevent walnuts become moldy. I used three gloves on my hand, but still got the reddish color on my hands :) \nRead the question below and select from the following choices.\nA: How to Make Marmalade\nB: How to Make Brioche\nC: BONUS\nD: Store", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_2_16.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Homemade in the Jar Pickles\nB: Homemade Pumpkin Spice\nC: Screw on Blade\nD: You Will Need", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather Your Ingredients', 'Limes', '@placeholder', 'Eat']", + "context": "Here is the context of these images:\n. 
The food:\n1 14.5 oz can of peeled whole tomatoes\n1 can of chipotle peppers in adobo sauce (can size doesn\u2019t matter as you need less than one pepper)\nLimes (you\u2019ll need \u00bc cup of freshly squeezed lime juice, so you\u2019ll need either 1 large lime or 2 small limes)\n1 clove of garlic\n\u00bc teaspoon of salt\n\u00bc cup (overflowing) of cilantroThe tools:\nBlender\nKnife\nCan opener\nCutting board\nWhile I use a Magic Bullet when I make my salsa, any blender will work. This recipe was just designed to perfectly fill a Magic Bullet cup.. Using your can opener, open your can of tomatoes and pour the can\u2019s contents into the blender.Warning: Be very careful on this step as the can lid will be very sharp after you cut it.Holly\u2019s Helpful Hints:\nI recommend washing the top of the can before you cut it. As you cut the can, some tomato juice will get on the lid, and who knows where that lid has been.. Open your can of chipotle peppers and take out one pepper. On your cutting board, cut the pepper in half and put one of the halves in the blender with the tomatoes.Warning:\u00a0 Be very careful again on this step because, like the previous step, after you open the can, the lid will be very sharp.Holly\u2019s Helpful Hints:\nBe sure to wash this lid too because you never know who could have touched it before you.\nAlso, the point of the chipotle pepper is to add some spice your salsa. If you can handle the heat, add more than half. If you prefer your food milder, add a little less or no pepper at all.\nYou can usually find chipotle peppers in the same aisle in the grocery store as the canned tomatoes.. On your cutting board, slice your lime into quarters. Then squeeze each quarter into your \u00bc cup. Keep squeezing your limes until you have a full \u00bc cup. 
Then pour the lime juice into your blender.Holly\u2019s Helpful Hints:\nCutting the limes into quarters makes it easier to squeeze and get the most juice out of every lime.\nAlso, I recommend washing your lime before cutting it to make sure all of the pesticides and dirt are off it.. Take your garlic and peel off the outer, papery layer. Then choose one of the smaller cloves, cut both ends off the clove, and peel off its outer layer until you reach the smooth, shiny skin of the clove. Put this clove in the blender.Holly\u2019s Helpful Hints:\nThe easiest way to peel garlic is with a knife. Use the sharp edge of the knife to help you fray the edges of the papery skin, and then pinch the skin with the blade and your finger and pull off the skin.\nGarlic cloves have green roots inside them. They can\u2019t hurt you, but I always think it is better to remove them. Cut your clove in half, and with the blade of the knife pull out the root.. Grab your salt and fill up \u00bc teaspoon. Then pour the salt into your blender container.Holly\u2019s Helpful Hints:\nIf you\u2019re trying to watch your weight, use less salt. Salt makes your body store more water, which increases your weight.\nSea salt or regular salt are equally acceptable in this recipe. Use whichever you prefer!. Grab a bushel of cilantro and wash it thoroughly. Cilantro commonly has dirt and (if it\u2019s not organic) pesticides on it, so you need to make sure your cilantro is thoroughly washed. Then start pulling off the cilantro leaves. You don\u2019t want the stalk of the cilantro, so be sure to pull off the leaves with as little stalk as possible. Fill up an overflowing \u00bc cup of cilantro, and I mean really overflowing. Then empty the cup into the blender.Holly\u2019s Helpful Hints:\nEven if you get the freshest cilantro at the store, it\u2019s still going to have a few leaves that are black or yellow. Just discard those. You only want leaves that are a beautiful, healthy green color.. 
Screw the blade onto the blender container. Make sure it\u2019s nice and tight as to not lose any of your delicious salsa!. Start blending. With my blender, I let it blend for anywhere from 5 to 10 seconds. It all depends on the power of your blender.Holly\u2019s Helpful Hints:\nIf you like chunky salsa, make it chunky! If you like smoother salsa, make it smooth. The less time you blend your salsa, the chunkier it is going to be.. Take off the blender blade, grab a bag of tortilla chips, and enjoy!. This step is completely optional. I just wanted to suggest some ingredients to make this salsa even more your own.Holly\u2019s Helpful Hints:\nTo make this salsa even more personalized, add some different flavors to it. My favorite extra ingredient to add is corn. Cut the corn right off the husk and add it to your already-blended salsa. The corn gives the salsa an added sweetness that makes this salsa even more irresistible. Frozen corn works well too. Take the bag out of the freezer, and without letting it defrost, toss some kernels in your salsa, just makes sure you don\u2019t add a bunch of ice crystals too. I don\u2019t recommend blending the corn because blended corn really dilutes the corn flavor.Other ingredients to try:\nMango \u2013 Cut up fresh mango into tiny chunks and add them to your blended salsa. Or try blending the mango with the salsa. It will add that sweet, savory taste to the entire batch of salsa.\nRaspberries or Blackberries \u2013 Another way to add some sweet to your spice! With either of these berries, I\u2019d put them in before you blend your salsa. If you add them after blending, the whole berry might be too chunky since neither slices well.\nJalapenos - Try substituting jalapenos for chipotle peppers. Jalapenos are the more classic ingredient used to add spice to salsa, but they have a much stronger taste than chipotle peppers. 
But like I've said, it's all about the flavor you want.\nRead the question below and select from the following choices.\nA: Homemade in the Jar Pickles\nB: Homemade Pumpkin Spice\nC: Screw on Blade\nD: You Will Need", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_3_27.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": 
"Natural image", + "source": "RecipeQA", + "options": "A: Strawberries & Lemon Cheesecake\nB: Materials Needed\nC: Dissolve Gelatin\nD: Recipe", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Click to Watch Video Instructions', 'List of Ingredient', '@placeholder', 'Serve Chilled']", + "context": "Here is the context of these images:\n. This cheesecake needs at least 8 hours to set in the refrigerator, plan ahead.. Ingredients:1 oz (28 g) powdered gelatin 1 cup water (at room temperature) 1 lb (454 g) cream cheese (at room temperature) 8 sachets sugar substitute (or to taste) Lemon juice & zest (of 1 lemon, or to taste) Pinch of saltFor garnishing:Some lemon slices & shredded lemon zest. Soak and gently heat the gelatin until dissolved.. Combine all ingredients and beat until well blended.. Use loose base cake pan for easy removal. Keep refrigerated for at least 8 hours or overnight.. \nRead the question below and select from the following choices.\nA: Strawberries & Lemon Cheesecake\nB: Materials Needed\nC: Dissolve Gelatin\nD: Recipe", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_4_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_4_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_4_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_4_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_4_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_4_5.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Slutty Brownies\nB: Making Dough\nC: Ingradients\nD: Take Brownies Out of Oven", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Get Ingredients', 'Mix Together', 'Cook Brownies', '@placeholder']", + "context": "Here is the context of these images:\n. 1) 3 Tablespoons of Water 2) 1/2 Cup Vegetable Oil 3) 2 Eggs . 
Mix all ingredients together and poor into 9x13 pan . Cook for 26-28 minutes . Let cool. Cut brownies into desired size and Enjoy! \nRead the question below and select from the following choices.\nA: Slutty Brownies\nB: Making Dough\nC: Ingradients\nD: Take Brownies Out of Oven", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_5_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_5_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_5_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_5_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_5_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_5_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_5_6.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Ingredients\nB: Perfect Pizza Dough Recipe\nC: \u200bTools\nD: Mix", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Add the Flour', 'Mix in the Milk', 'Fold in the Chocolate Chips + Additional Mix in Ideas']", + "context": "Here is the context of these images:\n. 1/2 cup (1 stick) unsalted butter, softened3/4 cup brown sugar (light or dark - whatever you prefer)1 teaspoon vanilla extract1/2 teaspoon salt1 cup all purpose flour2 tablespoons milk1 cup chocolate chips of choiceBoom. Easy peasy!This amount of ingredients will make 25-30 cookie dough truffles. Recipe adapted from CenterCutCook.com.. Cream the butter and sugar together until nice and fluffy.Then add in the vanilla and salt and mix until well combined.. Mix the flour in until you can't see any dry spots and it's well incoporated.The dough will be very crumbly at this point and that's okay - we'll fix it on the next step.. Now add the two tablespoons of milk and mix again.The dough will get nice and creamy and look just like a typical cookie dough at this point. :). 
Once the dough is completed, you can add chocolate chips or whatever else you desire!This dough can accept 1 to 1 1/2 cups mix-ins - it all just depends on what it is. Quick cook oats, chopped nuts, shredded coconut or dried fruit would all be lovely. :D. I recommend rolling them into about 1.5 inch balls - you can freeze them on a cookie sheet and then transfer them to a freezer bag. That way you'll have bite size cookie dough whenever you want. They'll keep in the freezer about 3 months. You can also roll them in cocoa powder, coconut, or whatever other topping you like. Yay quick truffles! :D\nRead the question below and select from the following choices.\nA: Ingredients\nB: Perfect Pizza Dough Recipe\nC: \u200bTools\nD: Mix", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_6_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_6_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_6_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_6_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_6_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_6_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_6_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_6_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_6_8.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Rainbow Rasgullas\nB: Layer Your Fruit\nC: Transfer\nD: Add Your Liquids", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Butter Your Pan', 'Marshmallows', '@placeholder', 'Let Cool']", + "context": "Here is the context of these images:\n. You'll need the following:4 Cups of Froot Loops* (or cereal of your choice...Trix? Lucky Charms?)1 Stick of Butter1 bag of Mini MarshmallowsTools:SaucepanSpoonMeasuring CupWax PaperBaking Dish*NOTE: I picked Froot Loops for the color aspect. Try other cereals too. 
Trix or Lucky Charms could be fun ones to experiment with. Ideally, when you are picking your cereal, you'd like to pick a crunchy corn or rice based cereal. This will allow for you to have a krispies treat with some crunch. . The whole process of making these delicious treats happens relatively fast, so you'll want to prep your pan first so that when the marshmallow coating on the cereal is still gooey you can transfer it easily. Butter your baking dish thoroughly. Make sure to butter the sides as well as the bottom. This will make it so your krispie treats will slide out when they are done instead of sticking to the pan. . Place your stick of butter in your sauce pan on your stove. Melt the butter over low heat so that you do not burn your butter. . Once the butter has melted you can add your marshmallows. I used the entire bag of mini marshmallows for this Instructable. Stir constantly, insuring even mixing of your melting marshmallows and the butter. Keep heating until the marshmallows have melted completely and you can no longer distinguish single marshmallows. . Once you have a uniform mixture of butter and marshmallows, add your Froot Loops. Mix gently with a wooden spoon until the cereal is coated in the marshmallow mixture. Since Froot Loops are much bigger than regular Rice Krispies, you'll want to be careful when mixing so that you don't break the loops up. . Once adequately mixed, transfer your cereal marshmallow mixture to your buttered baking dish. Then, using a piece of wax paper, press your krispies down so that they have a uniform shape and top. . Let your krispies treats cool for at least 10 minutes, allowing the marshmallow to harden. After they have cooled you can remove them from a pan (they should slide right out with the buttering you did earlier). Cut with a sharp knife and serve!If you plan on storing them, place them in an air tight container. They will keep for a few days. . Enjoy your new twist on the classic rice krispies treat! 
Take them to potlucks, parties, and wherever else you need a little rainbow marshmallow goodness. \nRead the question below and select from the following choices.\nA: Rainbow Rasgullas\nB: Layer Your Fruit\nC: Transfer\nD: Add Your Liquids", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_7_15.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Fresh Pumpkin Pie\nB: Prep\nC: Instructions\nD: Blend", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['The Pumpkins', '@placeholder', 'Roast', 'Puree']", + "context": "Here is the context of these images:\n. Sugar pumpkins are usually recommended for baking because they aren't as stringy as larger pumpkins, they're also supposed to be sweeter too. I've baked with purees made from both and haven't noticed a big difference aside from sugar pumpkins being easier to manage due to their smaller size. 
However, depending on how many cups of puree you need, one big pumpkin might be cheaper than a couple small sugar pumpkins. 1 sugar pumpkin can usually yield 3-4 cups of puree. . Preheat oven to 400.I'm in the habit of washing all my fruits and vegetables before use, even though you don't use the skin of the pumpkin in puree, I still wash it. Cut your pumpkins down the middle so you have two even sides. Scoop out all of the pumpkin guts and seeds. (Seeds can be saved for roasting). On a lined baking sheet, place your pumpkin halves and stick them in the oven. How long the pumpkins take to roast depends on how big they are, but after 25 minutes I start poking them with a fork every 5-10 minutes until the fork goes in and out smoothly (kind of like a baked potato). When the pumpkins are done, remove from oven and let cool. . Pumpkin flesh is easier to remove when it's still hot, but also harder to handle. So as soon as you're able, start scooping all of the flesh from the pumpkin and put it in your blender. Once all of the pumpkin flesh is scooped, blend the flesh until you have a puree. Sometimes I will add a 1/8th cup of water or more, but this isn't usually needed.. Now that your puree is done you can use it right away in a recipe, store in the fridge up to 7 days, or in the freezer for about 3 months. 
\nRead the question below and select from the following choices.\nA: Fresh Pumpkin Pie\nB: Prep\nC: Instructions\nD: Blend", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_8_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_8_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_8_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_8_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_8_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_8_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_8_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_8_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_8_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_8_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_8_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_8_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_8_12.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Minecraft Chocolate Chip Cookies IRL\nB: Healthy Oatmeal Chocolate Chip Cookies\nC: Finish\nD: BAKING AND COOLING OF CHOCOLATE CHIP COOKIES", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['ADDING AND BEATING OF EGGS', 'ADDING AND BLENDING THE CHOCOLATE CHIPS', '@placeholder', 'HOW WOULD I EAT THESE FRESH CHOCOLATE CHIP COOKIE']", + "context": "Here is the context of these images:\n. List of ingredients for Chocolate Chip Cookies 2 \u00a0 \u00a0 \u00a0 cups (12 oz package) of NESTLE TOLL HOUSE\u00a0Semi-Sweet Chocolate Morsels 2 1/4 cup all-purpose flour 1 \u00a0 \u00a0 \u00a0 \u00a0teaspoon baking soda 1 \u00a0 \u00a0 \u00a0 \u00a0teaspoon soda 1 \u00a0 \u00a0 \u00a0 \u00a0cup (2 sticks) butter 3/4 \u00a0 \u00a0 cup brown sugar (packed) 3/4 \u00a0 \u00a0 cup granulated sugar 1 \u00a0 \u00a0 \u00a0 \u00a0 teaspoon vanilla 2 \u00a0 \u00a0 \u00a0 \u00a0 large eggs. 
In a small mixing bowl combine: \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 \u00a0\u00a02\u00a01/4 \u00a0 \u00a0cups all-purpose flour \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 teaspoon salt \u00a0 \u00a0 \u00a0 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 \u00a0 1 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 teaspoon baking soda After placing the above dry ingredients in a bowl, use whisk (pictured) or large spoon and mix ingredients thoroughly together. \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0. In a large mixing bowl place the following ingredients: \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 3/4 \u00a0cup \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 brown sugar (packed) \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a03/4 \u00a0cup \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 granulated sugar \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a01\u00a0\u00a0 \u00a0cup \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0butter ( 2 sticks, softened) \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a01 \u00a0 \u00a0teaspoon \u00a0 vanilla extract\u00a0 After combining the above ingredients, blend until creamy with a hand mixer (shown) or a table top mixer (not shown).. Add eggs to mixture, one at a time. Make sure first egg is completely mixed in before adding the second egg. Do the same with the second egg before going onto the next step. \u00a0. Pour 2 cups or 1 package (12 oz) \u00a0of NESTLE TOLL HOUSE Semi-Sweet Chocolate Morsels into the mixing bowl, then blend with a large spoon.. Using a tablespoon, drop a rounded spoonful of cookie dough onto a cookie sheet (about an inch apart). \u00a0Place cookie sheet in a preheated oven of 375 degrees and bake for 9 to 11 minutes or until golden brown. 
\u00a0Allow cookies to cool on baking sheet for 2-4 minutes before moving to cooling rack. \u00a0. After making these chocolate chip cookies I like to kick back with a large glass of cold milk and down a few warm fresh delicious cookies. \u00a0Or I could be talked into eating them with a bowl of my favorite ice cream. \u00a0I would eat them with Jello or pudding. In fact I would eat them just by themselves. That's how I would eat these delightful cookies. \u00a0How would you eat these delicious cookies.. \u00a0\u00a0\nRead the question below and select from the following choices.\nA: Minecraft Chocolate Chip Cookies IRL\nB: Healthy Oatmeal Chocolate Chip Cookies\nC: Finish\nD: BAKING AND COOLING OF CHOCOLATE CHIP COOKIES", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_9_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_9_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_9_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_9_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_9_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_9_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_9_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_9_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_9_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_9_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_9_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_9_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_9_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_9_13.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Soundboard Cake With Working Volt Meters\nB: Final Product\nC: Prepare Your Cake Pans\nD: Step Five", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Fill and Crumb Coat Your Cakes', 'Prepare for Your 
Fondant', 'Create and Attach Your Girl With a Pearl Earring']", + "context": "Here is the context of these images:\n. For this particular design, I decided to create a very tall 6\" cake which required 3 pans. If you want to make a slightly shorter, yet still very generously sized cake, use 2 pans. Prior to baking my cake, I had to prep my pans by greasing them with Crisco and lining them with parchment paper. I also made sure to preheat my oven to 350 degrees.Supplies: 3 round 6\" pans Crisco parchment paper pencil scissorsSteps:Preheat oven to 350 degrees. Generously apply Crisco to the inside of your pans, spread it on the bottom and sides. Place the parchment paper over your pans and lightly trace the rim. Cut out each circle you traced and cut about 1/4 of an inch inside of the line you drew. (This will allow for your parchment paper to fit inside the pan). Press your parchment paper to the bottom of your pan and smooth it out. Set aside your pans and move on to baking your cakes.. This cake is going to be a delicious vanilla cake made with a doctored cake mix. For those of you who don't know, a doctored mix is just a regular store bought cake mix that is adjusted so that it has more of a homemade taste and sturdier structure in case you want to use it for decorative purposes, like a fondant cake. This cake will come out moist and delicious and does not require as much work as a homemade cake.IMPORTANT NOTE: As I mentioned in the previous slide, I wanted to make this a very tall 6\" cake. Since I decided to make it so tall, I had to make two batches of the cake recipe for my three pans. (If I tried making both batches at once it would be too much for my mixer). 
If you want to make a slightly shorter cake using just two pans, make just one batch of the cake recipe.Ingredients:1 box yellow (or white) cake mix1/2 cup water 1/3 cup oil 1 small package vanilla instant pudding IMPORTANT NOTE: If using a cake mix that already has pudding in it (like Betty Crocker or Pillsbury) DO NOT add in the instant pudding1 cup sour creamSupplies:greased cake pans food scale (if you have one)Steps:Notes: Though I will give you the order I used to mix my cake, the order of how you mix this does not matter. Be sure to combine each ingredient individually before moving on to the next one. If you have a standing mixer that is great, if not, a handheld will work just fine. Pour your cake mix into your mixer bowl. Pour your egg whites over the cake mix and mix on low until just combined, you should still see small lumps of cake mix when you stop the mixer. Dump in your sour cream (use a spatula to scrap all of it in). Once again, mix it until it is just combined. Pour in your packet of instant vanilla pudding and mix until just combined. Pour in your oil and, again, mix on low until combined. Pour in your water and using your spatula, scrape down the sides and bottom of the bowl. Use your spatula to combine the batter with the water (this will prevent the water from splattering when you turn on the mixer). Start the mixer on low for 5 seconds and then put it on high for 2 minutes to thoroughly mix your batter. Pour 1 pound of batter into each pan. (If you do not have a scale, fill the pans about halfway). Tap your pans against the counter in order to settle the batter and remove any air bubbles. . Every oven is different so be sure to watch your cakes and test them when baking. Supplies:toothpicks knife 3 round 6\" cake boardsSteps: Bake your cakes at 350 degrees for 35 minutes Using a toothpick, test your cake. If a toothpick inserted in the middle comes out clean, your cake is ready. 
If the toothpick has wet batter on it, put the cakes back in for 5 minutes at a time until the toothpick comes out clean. Let your cakes cool COMPLETELY in their pans. (If you try to handle the cakes while they are still warm they will break apart). Run a knife around the sides of your cake to make sure that it is loose. Quickly flip your cake over onto a clean counter space or a sheet of parchment paper. It should fall out easily. (If it doesn't you can gently tap the bottom of the pan or flip it over and use the knife again to loosen the cake.) Once you pull the pan away, the cake should remain on the counter. Gently peel away the parchment paper. Place a cake board on the bottom of each cake Quickly flip you cakes over so the cake boards are underneath Place your cakes in the fridge in order to let them chill. You can choose to wrap them in plastic wrap or not. If you will not be working on them for a while, you should wrap them. . Before your begin to level, split, and fill your cake, you should gather the necessary supplies and prepare your frosting.Supplies: 1 round 8\" cake board (You will notice that I am using a 10x4 inch board in my pictures. It doesn't matter what size board you use as long as it is bigger than your cake so you can easily carry it without touching the cake)1 round 6\" cake boardscotch tapebig serrated knifecake leveler (If you do not have one, a serrated knife will work fine.)3 decorating bags4 1/2 cups of frosting (I made my own Italian Meringue Buttercream BUT you can use any frosting you want - I suggest about 2-3 cans of frosting for this project)If you would like a quick, easy, and delicious frosting you can make at home, do the following: (1) Combine 2 softened sticks of butter, with one teaspoon of vanilla extract until light and fluffy. (2) Then add in 4 cups of powdered sugar (1 cup at a time) and mix thoroughly after each cup. 
(3) Add 2-3 tablespoons of milk (one tablespoon at a time) and beat on high speed until you get the consistency you would like. This is a delicious basic American Buttercream recipe. red food coloringblue food coloringtoothpicks3 bowlsoffset spatulabench scraper1/2 C measuring cupturntableSteps: Put 1/2 cup of frosting in the first bowl. Leave it white.Put 2 cups of frosting in the second bowl. Dye it red by using a toothpick to add a glob of red color and begin to mix it in with a toothpick. Then switch over to a spoon to make sure the frosting and color are thoroughly mixed together.Put two more cups of frosting in the third bowl and dye it blue using the same method you used for the red.Bring out your chilled cakesPlace some scotch tape on the bottom of your final 6\" round cake board.Attach the 6\" cake board to the wrong side of the 8\" cake board (we will refer to this as your work board from now on). You want the colors to contrast so you can see where your cake ends and the work board begins.Note: In my picture, I am using a rectangular workboard.Make sure that the board is firmly attached and place it onto your turntable.. You have three options as to how you level your cakes. You can use a cake leveler, which is what I use because it's easier and requires less effort; you could use a ruler and a serrated knife, or you could eyeball your cake and use a serrated knife. I am going to go over how to do this with a leveler but for those of you who have a serrated knife, all you have to do is measure how high you want your cake to be and cut off the excess on top so the cakes are all the same height. You can also reference the pictures above of how I eyeball my cakes and use the serrated knife to split them. Supplies:3 baked and chilled cakes cake leveler or serrated knife bowl for scrapsSteps:Choose the appropriate height on your leveler that will allow all of your cakes to be the same height. For me, that was level 5. 
Make sure that you have enough table space to run your leveler across without it slipping off the sides. That could lead to a very uneven cake. Place your hand on top of your cake and gently move your leveler across the cake in a back and forth motion. Take your time. Remove the pieces you cut off and place the excess in a bowl, you can eat it to taste test your cake or if you have enough you could make cake pops. Repeat this process with the remaining two cakes. Set your cakes aside. Return to the work board that you taped your cake board to and place a dab of frosting in the middle. Spread the frosting out so there is a thin even layer in the middle of your cake board. Create a tic-tac-toe board in the frosting on your cake board. The lines you create will allow air to move underneath the cake you are going to place on this board and ensure it sticks properly. Grab one of your chilled cakes and gently flip it over onto your hand. Gently remove the board attached to the bottom. You may have to pull a little. Flip your cake back over and firmly press it onto the frosted cake board that is attached to your work board. Eyeball your cake and determine where the middle is on the side, you want to split your cake parallel to the base. Once you find the middle, place your knife there and gently begin to cut into your cake. Do not try to cut straight through your cake in one try. Instead, run your knife along the side of your cake, keeping it level, and gently turn your cake on the turntable. (Think of it like slicing a bagel in half.) As you turn your cake, continue to cut deeper and deeper until you cut it all the way through. Separate your two cake pieces then place the top part back on the bottom and set the cake aside. Repeat this process for the other two cakes but DO NOT attach the other two cakes to their boards with frosting. . Now we are getting to the fun part. Try not to eat too much of your frosting as you do this. 
It really is delicious.Supplies:3 decorating bags 3 bowls of frosting (red, white, and blue) with spoons in each color scissors bag ties or rubber bands (or you can go without this) turntable offset spatulaSteps: Get a decorating bag (or a sandwich bag) and open it up. Fold over the top so it creates an opening for the frosting. Using a spoon, fill your bag with your first color of frosting. Close the top of the bag and twist it shut. Fold over the twisted portion and secure it with a bag tie (or rubber band, or just hold it in place when you use the bag) Repeat this process for the remaining two frostings. Cut off the tip of the decorating bag so you have a penny sized hole. Starting with your first split cake, remove the top layer and pipe a layer of frosting on the cake. Use your turntable so you can pipe one consistent layer. Use the same pressure throughout and take your time. Note: My frosting pattern is blue, red, white, red, blue.Once you have piped your layer of frosting, use your offset spatula to spread the frosting evenly. Don't worry if the frosting squeezes out of the sides. Take the next cake layer and stack it on top of your first frosted layer. Gently press down and make sure it is secure before piping your next layer of frosting. Make sure to wipe your spatula between colors so your frosting doesn't blend together. Repeat this process of piping, spreading, and stacking until you have stacked all your cake layers. Once your cake is completely stacked, use the remaining frosting to crumb coat your cake. Begin by piping lines of frosting vertically down the sides of the cake. (Don't worry about the colors, you can mix them now.) Pipe some frosting on top of the cake as well. Using your spatula, begin to spread the frosting on top of the cake. Since this is a crumb coat, you want to make sure the frosting presses into the cake and seals the crumbs in. Be sure to scrape off any excess frosting into a bowl. 
Run your offset spatula along the side of your cake as you turn the turntable. This will allow you to spread the frosting evenly and cover the entire cake. Continue to remove any excess frosting and repeat the process of running the spatula across the cake as many times as necessary until it is completely covered. Be sure to cover all parts of the cake and press in the crumbs. Set your crumb coated cake into the fridge to chill for at least 30 minutes. . Once your cake is nice and chilled, meaning the frosting is firm, it is time to give it a good final coat of frosting. Prior to frosting your cake, you will want to remove your 6\" cake board from the work board and place it on a cake base. You can use a large offset spatula to remove the cake from its workboard and place it on the cake base. You can also apply a dab of frosting (or tape) to the cake base in order to help secure the cake board once you place it. Supplies:Chilled and crumb coated cake A decorator bag filled with two cups of white frosting turntable bench scraper offset spatulaSteps:Place two cups of white frosting into the same decorator bag you used earlier (or if you want to, you can use a new one). While turning your cake on the turntable, gently pipe around the entire cake a nice thick layer of frosting. Pipe an additional layer of frosting on top of the cake. Place your bench scraper at a 90-degree angle to your cake. Gently turn the cake on the turntable and allow your bench scraper to smooth out your frosting, let the turntable do most of the work. Smooth out the top of your cake as well, make sure to scrape away any excess frosting into a bowl. You can always add more later but you want to make the cake as smooth as possible so it may be necessary to remove some frosting. As you smooth your cake, if you notice any holes, just fill them in with the piping bag and smooth it over again. 
Take your time, be patient, and repeat the process as many times as necessary until your cake is as smooth as possible. I still struggle to get it super smooth so I did the best I could. Set your cake in the fridge to chill for at least 30 minutes. . I chose to get my colors ready for my Girl with a Pearl Earring before covering my cake with black fondant. I honestly could have covered my cake first and it would have worked too. I gathered all the supplies I would need to both cover my cake and create my girl, before focusing on each step individually.Supplies:One box of black fondant (I used Wilton's decorator preferred)fondant roller (or regular rolling pin - it really doesn't matter)rulerpowdered sugar (or cornstarch)sharp knifefondant cutter (if you have one, if not, keep using the sharp knife)fondant smoother (not pictured)about 2 ounces of white fondant (once again, I used Wilton's decorator preferred)Exacto knife (not pictured)a printout of the Girl with a Pearl Earring that is the exact size you want to use on your cakescissorscutting boardsome vodka (not pictured)7 small containers (bowls, espresso cups, ramekin, etc.)orange, red, brown, copper, blue, black, yellow, green food coloringsilver pearl luster dustpaintbrushes*If you are like me and like to have things thoroughly planned out in advance, you could print out another picture of a Girl with a Pearl Earring and use it to write down the colors you need to mix in order to achieve the right shade for each part of the painting. **I also pulled up a picture of a Girl with a Pearl Earring on my laptop since my printer didn't print the colors properly. I wanted to get my version as close to the actual painting as possible. Steps:As part of my preparation, and as you can see in the pictures above, I mixed my colors in advance. I had to play around with it a little and test it on a spare piece of white fondant before determining which combination I liked best, I suggest you do the same. 
Some of these colors I mixed as a base and eventually darkened to add depth to the overall painting. Don't worry about this yet, I will explain it more thoroughly in the following steps. As you can see in the picture above, these are the colors I mixed for each part:For her skin: orange, red, and brownFor her clothes: copper and brown For the blue portion of her turban: just blueFor the yellow portion of her turban: yellow and brownFor her pearl earring: silver luster dust For the blue and yellow portion of her turban: I did not create a color For her lips: mostly red with a hint of copperFor her eyes: mostly green with a hint of black to make them darker . This part of the cake can be a little frustrating but do not let it defeat you! Initially, I tried to cover my cake the right way by rolling out one piece of fondant that would completely cover the cake smoothly. However, since the fondant I used tends to be a bit dry, it immediately began to rip when I tried to put it on my cake. I could have rolled it out thicker but I really didn't want a thick layer of fondant on my cake. So instead, I decided to panel my fondant which is what I'm going to describe here. Supplies: box of black fondant frosted and chilled cake fondant roller fondant smoothers ruler powdered sugar turntable fondant cutter (if you have one) sharp knife Steps:Measure your cake's diameter and height Calculate the circumference of your cake. 
spread some powdered sugar on your clean work surface (you may want some on your hands too) take out your fondant and begin kneading it until it's pliable mold a piece of fondant (not the whole thing) into a rectangular shape (make sure all the folds and creases are on the bottom) roll your fondant (to about 1/4-1/8 inch) out until it is as long as (or longer than) half of your circumference and as high as your cake (you may want to give yourself an extra inch both ways, just in case) Using your ruler, trim your fondant so it is exactly as long and as high as you need it Stick your fondant to one side of your cake and use the fondant smoother to firmly attach it Use your sharp knife to cut off any excess fondant that may be sticking out on top Repeat steps 3-9 to cover the other side of your cake Take a piece of fondant and mold it into a circle with all the creases and folds underneath roll out your fondant (to about 1/4-1/8 inch) until it is big enough to cover the top of your cake Place your fondant on the top of your cake and firmly attach it with your fondant smoother Using your fondant cutter (or sharp knife) carefully trim away the excess fondant hanging over the top Use a paintbrush to brush away any powdered sugar on your fondant.. Now you are on your final, and in my opinion most fun, part! You get to be an artist and paint your own Girl with a Pearl Earring! My biggest piece of advice for this part is to have fun with it. 
Unless you are extremely artistically talented, you probably aren't going to recreate Vermeer's work but I'm sure you can pull off an awesome likeness.Supplies:your mixed colorscovered cakepaintbrushes (I only used a very fine tip one)2 ounces of white fondantprint out of Girl with a Pearl Earring that is the exact size you want it to be on your cakeExacto knifesharp knifecutting boardfondant rollerSteps:If you haven't done so already, cut out your print out of a Girl with a Pearl EarringSpread some powdered sugar on your clean work surfaceKnead your white fondantRoll out your fondant (about 1/8 inch thick) so that it is big enough to fit the cut out of the GirlIf at any point you need to leave your fondant alone, cover it with plastic wrap so it does not dry outPlace your fondant on a cutting board and place the print out on topCarefully cut your fondant in the shape of your girl by tracing it with an Exacto knifeOnce you have a solid silhouette of your Girl, place your print out back on the fondant (make sure it lines up perfectly)Using a toothpick or sculpting tool (like the one I have pictured) carefully trace the key features of the girl so you have guidelines for when you are painting.Do not press down too hard or you will leave indentsOutline the eyes, ears, nose, mouth, turban, clothes, earringThough it may be a little difficult to see, you will have a good impression on your fondant that will guide your paintingBegin painting slowly and lightly, it is much easier to add more color than it is to remove.remember that her face is illuminated on one side so do not paint the entire face the same colorPaint a light base of her face, clothes, and blue portion of her turbanPaint a light base of her lips and the yellow portion of her turbanat this point, I added some concentrated (directly from the container) blue to the middle portion of her turban and added more brown to the yellow to create shadows in her yellow portionI also painted the ends of her turban and 
her pearl earringThen I added black to blue color so I could paint the back of her blue turban and began coloring in her eyesI then darkened her skin color with more brown so I could create shadows in her face and I darkened her clothes color with more brown so I could add distinction to her clothesI gave her eyebrows and gave her a very light (by adding vodka to her skin color container) sweep of color on the lighter side of her faceI added concentrated (directly from the container) red to her lipsI added black to her eyesAt any point, if I added to much color, I used vodka to \"erase\" my mistake. Be careful though because too much vodka can ruin the fondant.Experiment with this part and do whatever makes you happy.Once you are happy with your Girl, take a knife and lightly make some impressions all over her (if you look closely at the painting you will notice that it appears cracked)Let her dry for a few minutes then carefully pick her up (without touching the food coloring) and using a broader paintbrush, paint some vodka on the area of the cake you want to place herquickly secure her to the vodka (it dries quickly)Add more vodka strokes to any areas that are not attached yet. DO NOT USE THE FONDANT SMOOTHER - it will ruin your art.Step back, admire your work...and eat your cake! 
=)*If you want nice clean slices like the one pictured at the beginning, use a sharp chef's knife and wipe the blade in-between each slice.\nRead the question below and select from the following choices.\nA: Soundboard Cake With Working Volt Meters\nB: Final Product\nC: Prepare Your Cake Pans\nD: Step Five", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_26.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_10_27.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_30.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_31.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_32.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_33.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_34.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_35.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_36.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_37.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_38.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_39.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_40.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_41.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_42.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_43.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_44.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_45.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_46.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_47.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_48.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_49.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_50.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_51.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_52.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_53.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_54.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_55.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_10_56.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Mince the Pepper & Ginger\nB: 
Hot Pepper Jelly\nC: Make the Cake/Cupcakes\nD: Frosting Ingredients", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Add the Pectin Mixture', 'Fill Sterile Jars']", + "context": "Here is the context of these images:\n. \n\tPLEASE PLEASEPLEASE \n\t\u2022 Use gloves when handling hot peppers.\n\t\u2022 If the Capsaicin (which is what makes the heat in Chilies) gets in your eyes you will be miserable for a while.\n\tCapsaicin affects epithelial tissue especially in the non keratinized epithelium in the mouth, esophagus, nose, and eyes.What increases the heat?\n\t\t\u2022 Water washes away the oils or mucus that protects tissues and so will increase the heat from Capsaicin.\n\t\t\u2022 Anything that is salty or contains alcohol will increase the heat as well.What decreases the heat?\n\t\t\u2022 The fat in Cold milk bring the Capsaicin into solution and thus decrease a burning sensation (and according to Wikipedia caseins in milk have a detergent effect bringing capsaicin into solution to disolve it).\n\t\t\u2022 Cold sugar solution (10%) at 20 \u00b0C (68 \u00b0F) is almost as effective.\n\t\t\u00a0. \n\t\t3/4 pound of washed and chunked mixed hot Peppers\n\t\t1/3 cup fresh peeled Ginger\n\t\t4 cups sugar (I often mix 2 cups sugar with the Stevia equivalent of 2 cups sugar)\n\t\t2 cups of 5% apple cider vinegar\n\t\t2 packets of low sugar dry pectin. In a food processor finely mince the peppers with the ginger and set aside. Mix the dry pectin with about 1/2 cup of sugar and set aside.. \u2022 Mix the vinegar and remaining sugar\n\u2022 Add the minced pepper & ginger to the pot\n\u2022 Boil for 10 minutes over medium heat, while stirring periodically, to prevent burning.. \u2022 Remove the pot from heat\n\u2022 Add the pectin sugar mixture to the pot and stir briskly,\n\u2022 Return the mix to the heat and boil hard for 1 minute, stirring constantly. 
\u2022 chill a metal tablespoon by sitting it in an ice water bath,\n\u2022 Take a half spoonful of the pepper mix and let it cool on top of the ice to room temp\nIf it thickens up to the consistency of jelly it is ready. If not, mix in a little more pectin (about 1/3 to 1/2 of another package) and bring to a boil for 1 minute or cook a bit longer.. For the pepper jelly I use 8-12 ounce jars. I prepare the jars by running them and their their caps through the dishwasher. They can also be boiled in a large pot prior to filling\n\u2022 Fill jars to within 1/8-inch of the top and screw on covers tightly\n\u2022 Place in boiling bath 10 min and cool\nOnce cooled, the caps should be concave.ENJOY!\nRead the question below and select from the following choices.\nA: Mince the Pepper & Ginger\nB: Hot Pepper Jelly\nC: Make the Cake/Cupcakes\nD: Frosting Ingredients", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_11_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_11_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_11_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_11_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_11_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_11_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_11_6.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Vegan Coconut Chocolate Ice Cream\nB: and OPTIONS\nC: Making the Enchilada\nD: Ice Cream Spaghetti", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Mix Ingredients & Set Aside for 2 Hours', 'Add Ice Cream Filling', '@placeholder']", + "context": "Here is the context of these images:\n. 
Chocolate Tortilla\n1/2 cup flour\n1/2 cup sugar\n3 tablespoons unsweetened cocoa\n1/4 cup milk\n2 eggs \n1/4 cup vegetable oil\n1 teaspoon vanilla\ndash of salt\nFilling\nice cream (any flavor)\nGarnish\nchocolate syrup or sauce\nfresh strawberries. Combine ingredients and hand mix for about 5 minutes until texture is smooth.\nCover and store in refrigerator for about 2 hours.\n. Heat a nonstick skillet over medium heat.\nPour 1/4 cup mixture in center and tilt pan to spread batter into a circle.\nLet cook for about 2 minutes then flip to cook other side for another 1-2 minutes.\nBe careful not to burn or else the tortilla will be too stiff for folding.\n. Add 3 to 4 heaping spoons of your favorite ice cream on top of the chocolate tortilla.\nI used strawberry. \n. Fold both sides of the chocolate tortilla to wrap the ice cream filling.. Drizzle with chocolate syrup and garnish with whole or cut strawberries.\nRead the question below and select from the following choices.\nA: Vegan Coconut Chocolate Ice Cream\nB: and OPTIONS\nC: Making the Enchilada\nD: Ice Cream Spaghetti", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_12_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_12_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_12_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_12_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_12_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_12_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_12_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_12_7.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: 3.14 Quick and Easy Mini Pies\nB: Assembling the Crust\nC: Get Ready With the Pastry\nD: Things You Will Need", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather the Elements', '@placeholder', 'Assemble Your 
Mince Pies....', 'Bake, and Enjoy']", + "context": "Here is the context of these images:\n. You need (for 12 mince pies like these)1 x ready rolled puff pastry - take out of the fridge and allow to get to room temperature1 x jar mincemeat (actually you only need 12 tea-spoonsfull - about 120g or 4Oz)1 x egg (for optional egg wash)a baking tray, and an oven... pre-heat to gas mark 6, 200C, 400F, 'moderately hot', (180C or equivalent if its a fan oven)and a couple of tools.. Allow around 10 minutes to reach room temperatureCollect your tools : - a teaspoon and a knife work for me, with a fork to beat the egg.....Beat the egg while the pastry, and the oven, warm up.. Open out the pastry sheetUse the knife to turn into 12 rectanglesAdd a tea-spoonful of mincemeat to one end of each rectangle, leaving enough space to seal the pastry when you fold it over.Make a parcel, and press the sides together to keep the mincemeat in the pastry.Prick each parcel, to let steam escapeApply the eggwash - as uniformly as you like..... The pastry comes on its own baking paper which can go into the oven on a baking tray.Cook for 20-25 minutes (see below)leave to cool on a rackEnjoy!I have used supermarket (Aldi) pastry, and JusRol. The Aldi pastry is a slightly bigger sheet, so the pies are bigger and take slightly longer to cook. 
They taste just as good!\nRead the question below and select from the following choices.\nA: 3.14 Quick and Easy Mini Pies\nB: Assembling the Crust\nC: Get Ready With the Pastry\nD: Things You Will Need", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_13_17.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Eggs Florentine\nB: Crack Egg\nC: Do Your Homework\nD: Prep and Cooking", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Get a Coop', 'Put Them Outside', 'Meet the Girls']", + "context": "Here is the context of these images:\n. 
Are chickens allowed in your neighborhood/city?Do your neighbors mind if you have chickens?Do you have a feed store nearby for chicken feed?Do you have room for chickens?If you can answer yes to these questions, then ask:What kind of chickens do I want? All of this will cause you to ask more questions and then you can ask your significant other, \"Honey, do you mind if we get some chickens?\". There are several types of chicken coops available to build or buy. Purina offers a great free chicken coop plan on their site that I really liked. But since my husband was going to be the one building it and I was going to have to buy the materials, I spent a couple weeks trolling craigslist until I found one just like if for $75 and rented a trailer. My husband was more than happy to spend a morning picking one up instead of two weekends building one from scratch and I saved about $150 in building materials.. There are several places to order chicks from. I recommend you visit sites and read the reviews about the temperament and laying habits of the different breeds to find out which will suit you best. I used Murray McMurray Hatchery in Iowa. They shipped me my one day old chicks on October 1. They arrived at the post office the next day. The US Post Office was excellent in giving me a call to let me know they were there. I could hear them chirping in the backroom when I arrived.When they arrive Murray McMurray has awesome support tips on their website to ensure you can take care of them. Basically you dip their beaks in food and water and they figure out the rest. Be sure to put a lamp on them as they need to stay over 90 degrees. Here in Texas we still have 90 degree weather in Oct, but we kept a lamp on them anyway. . Pretty soon, like in 3 - 4 weeks they start hopping out of the box (or brooder) and they need to go out to the coop. Plus they stink. . The next 3 months are spent trying to determine how the heck you can keep them from soiling their food and water. 
After several debates we decided on a 4 inch PVC pipe with a Y connection on the bottom strapped inside the coop for their food. Once they could get out of the coop we left their water outside. . The last thing you want is drama in the hen house. Any stress will cause the gals not to lay and lead to bad relations all around. Anyone that is too noisy will cause your neighbors to call you in as owning a nuisance pet. These birds make better dinner guests.. Then on February 8 we got our first eggs. The first two held four yolks. Now most of them have one yolk and they give us five eggs a day. . We kept five hens, Lily, Ivory, Ebony, Ruby and Dotty. They will have to do their own instructable about how they make the eggs. It amazes me everyday that they just create food. They are very sweet, get along with the dogs and greet me when I come home from work. They eat bugs out of the yard and are just easy going nice pets that offer food too!\nRead the question below and select from the following choices.\nA: Eggs Florentine\nB: Crack Egg\nC: Do Your Homework\nD: Prep and Cooking", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_13.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_14_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_14_21.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: RECIPE | MANGO AVOCADO SALAD\nB: Frozen Peas\nC: Make the Dressing\nD: Subscibe/Follow Us to Get the Latest Updates", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Cut the Potatoes', 'Combining + Serving']", + "context": "Here is the context of these images:\n. 2-3 pounds golf ball sized new potatoes1/3 cup olive oil2 shallots, finely diced3 tablespoons white vinegar1 tablespoon dijon mustardpinch of sugar1 teaspoon salt (plus more for salting the cooking water!)black pepper to taste1/2 cup fresh herbs - I'm using half parsley, half dillThe recipe above is fairly salty - if you're sensitive to salt, maybe do 1/2 teaspoon mixed with the dressing at first and then try more if you like it. :DThis recipe adapted from a CHOW recipe found here. . Place the potatoes in a large pot and cover with water. Add a couple huge pinches of salt.Bring to a boil and let cook for 10-15 minutes. Then check a few of the largest poatoes with a paring knife - if you can easily insert into the potatoes you are good to go! If not, set the timer for a couple more minutes and check again. Try not to overcook them too much - they'll burst their skins!I think mine took right around 15 minutes to cook.Once they're done cooking, drain them and set them aside to cool. 
Speed up the cooling by laying them out on a baking sheet. :). As the potatoes cool, dice the shallots pretty finely. Throw them into a bowl large enough to toss the potatoes. Add the vinegar, olive oil, salt, dijon mustard and a pinch of sugar. Whisk everything together until it's emulsified and taste test. Add whatever else you'd like! Now set it aside until the potatoes are nice and cool.. Once the potatoes are cool, cut them into bite sized pieces. Some I cut right in half, larger ones I cut into three slices.. Add the potatoes into the bowl on top of the dressing. Chop the herbs finely and add them on top. I normally add a crack of black pepper to the top too. Use a spatula to stir everything gently so all the potatoes get covered with the dressing. It's fabulous just as it is right now, but I think it tastes even better after a night in the fridge. I recommend making it one day ahead of when you want to serve it. You can serve it cold or bring it to room temp - both are tasty! The dill really comes through after a bit of time mingling. 
:D\nRead the question below and select from the following choices.\nA: RECIPE | MANGO AVOCADO SALAD\nB: Frozen Peas\nC: Make the Dressing\nD: Subscibe/Follow Us to Get the Latest Updates", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_15_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_15_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_15_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_15_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_15_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_15_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_15_6.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Elvis Ice Cream\nB: Shake Vigorously (up and Down AND Side to Side to Allow for Even Distribution of the Cold) for Approximately 10 Minutes, or Until the Mixture in the Smaller Bag Thickens Into Ice Cream.\nC: Ingredients\nD: Seal the Smaller Bag Tightly, and Place It Inside the Bigger Bag. Then, Seal the Bigger Bag Tightly, Ensuring the Smaller Bag Is Inside.", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Ice Cream Mix (1)', 'Ice Cream Mix (2)', 'Finishing the Ice Cream']", + "context": "Here is the context of these images:\n. We will need:1 1/2 cup of milk whole or non-fat (I used whole milk) 3/4 cup of sugar1 cup of whipping cream 4 egg yolks4 tablespoons of vanila flavorYellow colorant (optional)A medium bowlA large bowlElectric mixerA medium saucepan. 1. Place the whipping cream and the large bowl in the freezer for 30 - 45 minutes.2. Meanwhile, pour the milk into the sauce pan and add the 4 vanila flavor tablespoons. Heat until 55 - 60 C (130 - 140 F). Set aside and let it cool until 40 C (104 F)3. In the small bowl stir the egg yolks until light. 4. Add sugar gradually and continue mixing until spreading consistency5. 
Add the milk to the yolks and continue mixing until you have an homogeneus mix.Note: These are extra steps added to prevent any Salmonella risks due to the raw egg and is strongly recommended to do it. Moreover, if you are going to make ice cream for your kids you must do it. However, if you feel comfortable knowing that you are going to consume raw egg, you can skip the following steps but do not tell me later that I didn't warn you about the risk.6. Heat the mix up to 65 C (140 F) for 10 minutes. Do not forget to stir the mix with a spoon while heating and do no let the temperature rise too much. Try to keep the temperature constant. A tip is to heat up to 75 C (170 F), then turn off the stove and continue stirring with the spoon by 10 minutes. After 10 minutes the temperature should be more or less 65 C.7. Cool the mix up to room temperature. Do not forget to continue stirring with the spoon while cooling. . It's time to use the large bowl and the whipping cream we have on the frezzer1. Pour a cup of the whipping cream into the bowl and beat it with an electric mixer until stiff peaks form. 2. Fold milk mixture into whipped cream3. Continue mixing at a low speed for five minutes more.4. Cover the bowl and freeze it for two hours . After two hours, the edges should be hard. 1. Using a spoon, break the edges and incorporate it into the mix.2. Stir gentle with the electric mixer and freeze again After an hour, repeat the above steps and continue freezing for 4 hours more or until the ice cream hardens completely.Congratulations, you have made ice cream. Enjoy it!A final note: The freezing times may depend on your freezer. It is advisable to let the ice cream rip for 24 hours before serving. However if you do not want to wait, you can enjoy it as soon is hard enough for scooping. 
\nRead the question below and select from the following choices.\nA: Elvis Ice Cream\nB: Shake Vigorously (up and Down AND Side to Side to Allow for Even Distribution of the Cold) for Approximately 10 Minutes, or Until the Mixture in the Smaller Bag Thickens Into Ice Cream.\nC: Ingredients\nD: Seal the Smaller Bag Tightly, and Place It Inside the Bigger Bag. Then, Seal the Bigger Bag Tightly, Ensuring the Smaller Bag Is Inside.", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_16_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_16_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_16_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_16_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_16_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_16_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_16_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_16_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_16_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_16_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_16_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_16_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_16_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_16_13.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Cous Cous and Halloumi Cheese\nB: Griddle and Flip\nC: Enjoy\nD: The Final Product", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Dice', 'Shape It', '@placeholder']", + "context": "Here is the context of these images:\n. 
1 small Camembert (125g - that equals about 4.5 ounces)1,5 heaped teaspoons of powdered sweet paprika0,5 teaspoon caraway seeds1 tablespoon of fine diced onion (I cut them as small as possible, the smaller they are the better they integrate / dissolve into the spread...)3 heaped tablespoons cream cheese (we used a chili spiced variety but any cream cheese will do the job - you can also use sour cream, your obatzer may just become a little softer)1 or 2 dashes of salt and pepperThe Camembert should be ripe/aged and soft. There is no need to use fancy imported French Camembert for this recipe, I always use the cheaper German Camembert. You could use other Camembert-like cheese like Brie as well.I don't recommend using low fat cheese for this recipe, it's consistency isn't creamy enough to get the Obatzda right.I highly recommend to allow the Camembert to come down to room temperature. If you take the cheese just from the fridge it will be much sturdier and harder to mash (also the aroma evolves at room temperature).My mother used to add butter (about two tablespoons) and also sometimes used processed cheese (about a tablespoon) and therefore she reduced the cream cheese to about one tablespoon.But my grandma should watch her weight so \"our\" recipe replaces the butter with cream cheese - and because processed cheese gives me itchy teeth we don't add it either... This recipe is pretty forgiving, in fact before I wrote this 'ible I always just eyeballed the amounts and always turned out nice. So feel free to experiment with the different ingredients and find your own favorite mixture.. Place your Camembert on a plate and find a pretty Hand model who cuts the Camembert into little dices. No need to be especially accurate, it will be mashed soon anyway... . Add 1,5 teaspoons of sweet powdered paprika, half a teaspoon on caraway seeds and about a tablespoon of super fine dices onions. Mix everything together.. Add about three tablespoons of cream cheese. 
Continue mishmashing squishsquashing everything together. Use a tool according to your preferences: My grandma prefers to use a knife, I like to use a fork. I never used a food processor or a hand mixer for this process (nor did my mother) I'm not sure if machines are a able to \"batz\" as nicely as humans do.... Mix and mash and squash until the spread reaches your desired consistency. It usually takes about three to five minutes to get it right.. Traditionally the Obatzda is served on a plate and shaped like a dome, but my grandma prefers to shape it like a disk. Use a knife to shape the cheese spread according to your styling preferences. We use the same plate for mixing and serving, you can use a paper towel to clean up the rim of the plate. And of course you could as well serve your Obatzda in a bowl.We like to sprinkle some chive on top - it looks nice and also the aroma fits well.... I enjoy Obatzda the most one or two hours after it's mashed together, I think the flavors develop even better after a little resting time. But you can consume it right away as well. (Obatzda tastes the best the day it is made, the next day the flavors are even more advanced - which is still fine for my grandma, but maybe not for everyone)In the third picture you can see my grandmother and me enjoying Obatzda with some fresh pretzels ;). This rustic hearty snack goes very well with a glass of beer or Radler (beer and lemonade).Pretzels and lye rolls are a perfect base for this spread. Rustic dark bread fits as well.You can serve it with fresh radishes, they are a nice company for Obatzda.You may sprinkle some chives on top enhance visual appearance and flavor. If you don't like onions just skip them, but I think onionless Obatzda is just half as nice as the original... 
Another option might be to slightly stew the onions to melt down their harsh flavor.Some people like to garnish the Obatzda with onion rings instead of integrating diced ones, this may be a good option if an onion lover and an onion hater want to share a portion of this spread.Some people like to use stronger soft cheeses (German Romadour or Limburger). If Camembert isn't hearty enough for you you might to try those.Some Obbatzda recipes integrate a splash of beer into the mix but I've never tried this myself. As long as you don't share your Obatzda with children you might give this it a try. (In Frankonia they add a splash wine instead of beer - feel free to experiment)Fun Fact: Originally Obaztda was just used as a way to use up overaged cheese. \nRead the question below and select from the following choices.\nA: Cous Cous and Halloumi Cheese\nB: Griddle and Flip\nC: Enjoy\nD: The Final Product", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_16.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_17_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_17_21.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Graveyard Brownies\nB: Mix the Dough\nC: Bake the Brownies\nD: Mix Up the Brownies", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Blend the Brownies', 'Place the Base', 'Enjoy!']", + "context": "Here is the context of these images:\n. Preheat the oven to 350\u00b0 Fahrenheit to prepare for the baking of your deliciously decadent creation. Then, spray PAM in each cup of the cupcake pan and wipe cooking spray evenly with a napkin throughout.. You will then prepare the cookie dough. I always prefer and use Betty Crocker\u2019s Chocolate Chip Cookie mix for the base of the brownie because it's just my favorite tasting cookie dough. Simply follow the instructions on the packaging to prepare the dough by adding 1 stick of softened (not melted) butter and 1 large egg to the dry mix and mixing well together.. Once the cookie dough is mixed, you will begin to prepare the brownie mix. I chose to use Pillsbury Chocolate Fudge Brownie mix. Again, you will follow those instructions by adding the dry mix with 2 eggs, 1/3 cup of water, and 1/3 cup of vegetable oil and blending.. Place a little smaller than a spoonful of cookie dough into one of the cups of the pan so that there is about a half inch thick layer of cookie dough that covers the base of the cup. Repeat this until all cups have the same layer of cookie dough filled.. Next, you will add one Oreo to the center of each cup and press firmly so the cookie stays in place while baking. 
Be sure not to press too hard, otherwise the cookie will break!. Use a spoon or ladle and fill the remaining area of the cups with brownie mix, but be careful not to fill them directly to the brim because they will overflow; just a little below the brim is perfect.. Place the cupcake pans in the oven set at 350\u00b0 Fahrenheit and let bake for about 18 minutes. Once cooled, take a knife to carefully separate the brownie from the pan and remove onto a plate.. Take a bite and enjoy the sweet taste of your delicious Slutty Brownies!\nRead the question below and select from the following choices.\nA: Graveyard Brownies\nB: Mix the Dough\nC: Bake the Brownies\nD: Mix Up the Brownies", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_18_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_18_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_18_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_18_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_18_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_18_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_18_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_18_7.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Pizza Sauce\nB: A Simple and Delicious Pizza Sauce\nC: Make the Alfredo Sauce\nD: Make the Fettuccine Noodles", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Supplies and Ingredients', '@placeholder', 'Pizza Dough', 'Putting Your Pizza Together']", + "context": "Here is the context of these images:\n. 
\n Supplies for Pizza Sauce: Medium skillet or saucepan Knife Measuring spoons Measuring cups Stove Supplies for Pizza Dough: Large bowl Kitchen towel Timer Baking sheet Parchment paper Measuring spoons Measuring cups Rolling pin Oven Ingredients for Pizza Sauce: 2-tablespoons olive oil \u00bd-cup onion, chopped 1-teaspoon sugar 1 garlic clove, minced (or minced garlic out of jar - 1/2 teaspoon = 1 clove) 15oz can tomato sauce 1-tablespoon italian seasoning Salt and pepper to taste Ingredients for Pizza Dough: 1-teaspoon sugar 2 \u00bc-teaspoon yeast (one store bought package) 1-cup warm water 2 \u00bc-cup flour, all-purpose (2 cups of flour go into dough, other 1/4-cup is for flouring surfaces and rolling pin) 1/4-teaspoon salt Cooking spray Supplies for Safety: Apron or other clothing you don't mind getting dirty Potholders Note: One pizza should yield about 12 pieces (squares). The number of pieces will depend on how thick or thin you want the crust. You will yield almost 2 cups of tomato sauce. If not all sauce is used in one pizza prep you can certainly freeze the sauce for next time.\n . Safety First: Please be careful. Sauce will be hot. If children are helping you cook, please supervise them at all times.\u00a0 Time: 25 minutes \u00a0 Heat olive oil in a small skillet or saucepan over medium heat. Add the onions, sprinkle with the sugar, then lower the heat. Cook the onions and sugar over medium-low heat\u00a0for 10 minutes. Add garlic and cook one minute longer. Add tomato sauce and\u00a0italian seasoning. Cook on low, uncovered until thick (about 15 minutes - the longer you let it go the better it is). Season to taste with salt and pepper. . Suggestion: Wear an apron or other item of clothing you don't mind getting dirty. The flour tends to get everywhere! Time: 1 hour, 15 minutes \u00a0 Dissolve sugar and yeast in warm water in large bowl, let stand 5 minutes. Add 1-cup flour and \u00bc-teaspoon salt to yeast, mix well. Add 1-cup flour, stirring well. 
Turn dough out of bowl\u00a0onto lightly floured surface. Knead until smooth and elastic, about 10 minutes. (If you have a mixer with a dough hook you can let the machine do the work for you.)\u00a0Add additional flour as necessary to keep dough from sticking. Place dough in large bowl coated with cooking spray, turning to coat. Cover bowl with kitchen towel and let rise 45 minutes. Punch dough down; cover and let rest 5 minutes. . Safety First: Please be careful. Pizza will be hot. If children are helping you cook, please supervise them at all times. Use potholders to protect hands. Time: 20 minutes \u00a0 Preheat oven to 450-degrees. Roll dough into desired size and shape (smaller = thicker crust, larger = thinner crust - round or rectangular shape is up to you) Place dough on baking sheet covered with parchment. Top with desired sauce and other toppings. Place in oven for 12-15 minutes (the thicker the crust the longer the bake time), until crust is golden brown and cheese is melted. 
\nRead the question below and select from the following choices.\nA: Pizza Sauce\nB: A Simple and Delicious Pizza Sauce\nC: Make the Alfredo Sauce\nD: Make the Fettuccine Noodles", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_19_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_19_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_19_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_19_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_19_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_19_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_19_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_19_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_19_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_19_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_19_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_19_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_19_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_19_13.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Cambodian Beef Salad (Lok Lak)\nB: Satay\nC: Chop Vegetables\nD: Peanut Sauce", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Add Water, Boil', 'Let Cool for Beef or Chicken Broth and Slim Fat Before Storage', 'Store and Use Later']", + "context": "Here is the context of these images:\n. The main ingredients are simple...OnionGarlicCeleryCarrotBell PepperWaterAdd to this some meaty beef bones, or chicken and simmer for a couple of hours and you have it!. I save everything in the freezer. Veggie trimmings, chicken bones and skin, meaty bones for the pot of beef broth. Not much waste in our house. You can of course use a whole chicken, but I have enough \"scrap\" to make mine. 
I do buy beef - normally neck bones and stew meat for this because we don't normally get a lot of beef trimmings in our normal menu items.. Just chunk all the veggies up, no peeling required, no precision chopping. Just get it in the pot.. I have an herb garden outside my kitchen so I add them to most everything! Not a required ingredient at all, but a few bay leaves and some parsley just add a little something. I also add thyme to mine, just because I can. I also throw in a good amount of whole peppercorns.. Once all your ingredients are in the pot, cover completely with water, bring to a boil, reduce to a simmer and let it go for at least an hour. I usually let mine go for a couple of hours just because I get easily distracted by shiny things, but that is a whole different story!!!!. Strain the liquid from the solids and throw them away, they have done their job. All the flavor has been cooked out and the liquid you have left is a rich, golden color, full of flavor!. I always chill the completed beef and chicken broth, that makes it super easy to skim off any fat that has hardened, giving you a fat free, sodium free broth to add to your dishes. It makes me feel better knowing that I am in control of the salt in my diet. And no preservatives either!. I got these awesome containers and use them for EVERYTHING! I freeze the broth and pull it out to use as needed. Label everything, once frozen it's hard to tell veggie from chicken broth. I thaw a container and keep it in the fridge to add to dishes. You could also freeze some in ice cube trays for those times when you just need a splash of something. I hope you enjoy!. 
Watch more videos at Jeanna's Good Lookin' Cookin' on Youtube!Also visit my Facebook page Jeanna's Good Lookin' Cookin' for quick tips and ideas!\nRead the question below and select from the following choices.\nA: Cambodian Beef Salad (Lok Lak)\nB: Satay\nC: Chop Vegetables\nD: Peanut Sauce", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_20_20.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Spag Bol With a Thai Twist.\nB: Additional Notes/nutrition\nC: Vegetable Preparations.\nD: Potato Leek Soup With Cheerio Croutons", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Sauteing the Vegetables in 
Butter.', 'Adding Spice', 'Allow to Cook']", + "context": "Here is the context of these images:\n. I like to have the vegetables chopped up first and leave them neatly in bowls so that i can use them later. all of the ingredients that go in to the soup are eventually going to be blended. This means that you don't need to have all the vegetables neatly chopped, just give them a rough chop but try to ensure that they are all the same size.Picture 1. Chop the garlic to a puree consistency, it doesn't need to be perfect, but roughly equal is fine.Picture 2. Same goes for the onions Chop them finely in to pieces in to roughly the same size.Picture 3. Finally chop the carrot up, the larger pieces that you chop this up in to the longer it will take to cook in the pan.. You can chose to use just normal oil at this stage, but i wanted to give this dish a lovely rich flavour and so decided to saute all my vegetables in butter instead. Using just normal oil would make the soup suitable for vegans.Picture 1 Melt the 250g of salted butter in the pan until the butter browns slightly. Picture 2. Add the onions and garlic in to the pan and allow to soften. this will often take about 5 minutes or so on a medium heat.. Adding the spices at this I think allows the flavours of the spices to integrate better. Once you have added the spices to the onions mix them in thoroughly and then allow the onions to cook in the spice mix for a few minutes. Picture 1-3 Add in 1Tbsp of Cumin, Coriander powder and Turmeric Picture 4. Combine all of the ingredients with the onions.Picture 5. Allow to cook together for a few more minutes. . Time to form the base of the soup with the bulk of the ingredients, you want to lightly cook the chopped carrot in the hot butter before adding the stock. Picture 1. Pour the 900g of chopped carrots in to the pot containing your cooked onions/ garlic with spices.Picture 2. 
Stir in the carrots as to cover them in the onions and spices, cook them slightly at this stage in the hot butter they only need a few minutes.Picture 3. After you have cooked them for a few minutes time to add your stock. . The carrots need to cook entirely before you can move on to the blending stage, for this they need to simmer gently on a low heat until the carrots are completely soft i would recommend around 20 minutes then check upon them to see if they need slightly longer. Picture 1. Cook the carrots in the stock for around 20 minutes check they are cooked before you blend them.. This is going to be a thick soup, spend your time blending it entirely. You can if you would like pass this soup through a sieve to get a finer texture, but your stick blender should be able to sufficiently blend up the soup.Picture 1. Once the carrots have cooked until soft and they are soft enough to be crushed against the side of the pot with a fork, the soup is ready to be blended.Picture 2. Blend the ingredients until smooth.. I have chosen to garnish this dish with a poached egg, you can garnish with other ingredients if you wish. When poaching always ensure that you use fresh eggs, the older the egg the less of the egg white is going to stay around the egg yolk when you cook it. Picture 1. In a small pot of water boil seasoned water with a touch of olive oil. Picture 2. Always put the egg in a cup, opposed to cracking it directly in to the pot this. this allows you to pour the egg in slower are more gently. Picture 3. Once the water is boiling create a gentle spin in the water and pour the egg in the middle. the spin will pull the egg around its self. once the egg has risen to the top your poached egg is cooked this will take around 2 minutes or so. . I have garnished this soup with a small amount of paprika, pepper, olive oil a poached egg and coriander leaves. 
I would recommend eating this soup on the day that it has been cooked, soup tends to be rather difficult to store in the fridge. This soup can also been frozen but i get i wouldn't recommend it as you would will lose flavour heating it to a high temperature in order to make it safe to eat again. Thank you for reading , if you have any comments or queries please contact me and I will try to answer them as soon as possible.\nRead the question below and select from the following choices.\nA: Spag Bol With a Thai Twist.\nB: Additional Notes/nutrition\nC: Vegetable Preparations.\nD: Potato Leek Soup With Cheerio Croutons", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_21.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_21_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_21_23.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Adding Pure and Simple H20!!!\nB: Hibiscus Tea\nC: Final Measurements and Glue\nD: The \"Real\" Persian Tea", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Strange Brew!!!', 'Boil Me Tender...', 'When Life Gives You Lemons...', '@placeholder']", + "context": "Here is the context of these images:\n. The ingredients in this wonderful and OFFICIAL Peet's recipe are as follows:1/4 lb. of Hibiscus C tea12 oz. of boiling water12 oz. lemonade1/2 cup of white sugar4 oz. cold waterIce cubes to coolY'all will also need a tea-kettle, an iced-tea pitcher, and a tea pot of some sort. Boil 12 oz. of water, DO NOT REMOVE KETTLE UNTIL THE WHISTLE BLOWS.. Pour the boiling water over the Hibiscus C tea into a teapot (or other heat-safe container, in our case we used a coffee presspot) and let steep for 5 minutes. (If you prefer a stronger tea taste, feel free to let it steep a bit longer). After the tea has steeped for 5 minutes or so, use a strainer to separate the hot liquid from the loose tea into an iced-tea pitcher, and immediately afterward add the 1/2 cup of sugar. This is critical to do directly after the tea has steeped so the sugar can dissolve in the hot liquid. Gently stir to ensure that all sugar is dissolved.. ... pour lemonade into the mix y'all!After the sugar is dissolved into the concentrated tea, pour 12 oz. of cold lemonade into the pitch.Continue to stir the mixture. This step is simple y'all, while stirring, pour 4 oz. of cold fresh water into the pitcher. yep, that's all for this step.. So what would an iced tea cooler be without the ice, right? Once the mixture is completely stirred together, add a few handfuls of ice cubes to chill the drink. 
If you really want to get festive, you can use fun ice cube shapes...we used puzzle and triangle ice-cube molds. Special ice shapes are the perfect mundane detail to dazzle your friends and show up Martha!!!. Add some more flare to this fabulous drink by pouring it into your favorite cocktail glass and adding a colorful garnish like a slice of lime or a lemon twist.Your friends and dog will love it!!. This drink is best served chilled on a hot day. Add some banana loungers, Girl Talk's \"Feed the Animals\" album and a friend or two and you have an instant party!!\nRead the question below and select from the following choices.\nA: Adding Pure and Simple H20!!!\nB: Hibiscus Tea\nC: Final Measurements and Glue\nD: The \"Real\" Persian Tea", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_22_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_22_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_22_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_22_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_22_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_22_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_22_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_22_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_22_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_22_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_22_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_22_11.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Easy Banana Bread\nB: Making the Dough\nC: Oil and Spices\nD: Ingredients", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Prepare the Bread', '@placeholder', 'Bake the Croutons', 'Done!']", + "context": "Here is the context of these images:\n. The first step is to remove the crust from the bread. 
This bread I got from a cook, who couldn\u2019t serve it anymore as it was slightly dry. The crust was delicious, but it\u2019s easier to give the croutons an even look without it. However if you\u2019re strongly against wasting food it\u2019s possible to leave the crust on.When the crust has been removed, cut the bread into little squares. Mine were around 2 x 2 x 2 cm. Then add the little bread cubes to a bowl.. The next step is to add oil and spices. I chose to add thyme, but other spices can be used as well. (Sometimes garlic is great). I added the oil first, so that the spices had something to stick to. Then I added salt and pepper \u2013 just add what you feel like here. Then I mixed the croutons very well, to make sure that the spices and oil was evenly spread and on all the little bread cubes. It might be necessary to add a bit more oil, but be careful not to add too much.. All that\u2019sleft now is to bake the croutons in the oven. Spread the croutons on a baking tray, and make sure they\u2019re not covering each other.Bake the croutons in the oven at 200\u00b0C for 8 \u2013 10 min. They are ready when they\u2019re golden brown and crunchy.. Now your croutons are ready to eat! They can lest for a couple of weeks if kept in a closed container. As mentioned in the intro they taste great, and (as you've seen now) are easy to make. :) I hope you'll enjoy this instructable, stay creative out there! 
:D \nRead the question below and select from the following choices.\nA: Easy Banana Bread\nB: Making the Dough\nC: Oil and Spices\nD: Ingredients", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_23_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_23_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_23_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_23_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_23_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_23_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_23_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_23_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_23_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_23_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_23_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_23_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_23_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_23_13.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Make Delicious Haleem\nB: Ingredients\nC: Leipziger Lerchen (typical Saxon Speciality)\nD: Time for a Coffee Break", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Add Some', 'Garlic We Need Garlic', '@placeholder', 'The Last Step']", + "context": "Here is the context of these images:\n. To make this delicous Ajam Ketjap you will need:500 grams of chicken breast (Ajam)a splash of oil 2 large onions 6 cloves garlic 1 small can of tomato puree 1 theespoon of salt 6 tablespoons of Sweet soy sauce (Ketjap manis)You also need a cutting board and knife a stirrer and a large pan.. Cut the onion into pieces, put a little bit of oil in your pan and add the sliced onion and tomato puree together in the pan and fry until the onions are translucent. 
(it is very importent to put them in together, for the taste of the end product). Whille you fry the unions an tomato puree, Cut the chicken breasts in dices, when the unions are translucent add the chicken and fry it until the chicken is brown.. crush the garlic and put it in the pan stir and fry for 1 or 2 minutes. (Some times people say that 6 cloves is to much and there breath will be terible afterwards. But you do not have to be afraid this wont hapen.). Now add the Theespoon of salt and 6 tablesppoons of Sweet soy sauce also called Ketjap manis, stir it and add about 1 cup of water ( the chicken has to be covered with the sauce you made.. Put the lid on youre pan and let it simmer for about 15 minutes occasionaly stir it, this is a good time to get yourself a nice cup of coffee.. After about 15 minutes get the lid off of your pan and let it simer for another 5 to 10 minutes depending on the amount of watehr that was added in step 5, this has to be done for 2 very important reasons, first of all the excess liquid wil vaporize and second every body in the house will come towards the kitchen wondering what it is that smells so good.You can eat this with bread or rice, both is delicious.Enjoy your meal!\nRead the question below and select from the following choices.\nA: Make Delicious Haleem\nB: Ingredients\nC: Leipziger Lerchen (typical Saxon Speciality)\nD: Time for a Coffee Break", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_24_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_24_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_24_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_24_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_24_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_24_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_24_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_24_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_24_8.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_24_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_24_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_24_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_24_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_24_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_24_14.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: How to Make Baka Bana\nB: \nC: The Final Product!!!\nD: Eat!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Grating the Cheese!', 'Pasta Cooking and Butter Melting.', 'Making the Sauce!', '@placeholder']", + "context": "Here is the context of these images:\n. \n For this recipe, you will need:1 cup heavy cream1 pound fettuccine6 tbs. unsalted butter2 cups grated parmigiano reggiano cheesesalt and pepper to tasteYou'll also need a large pot for boiling the pasta, a grater, a strainer, and a large frying or saute pan to toss everything in and melt the butter. . I do this the old fashioned way because I lack money for fancy food processors and I like the physcial act of grating the cheese. It smells amazing!Unwrap your cheese and cut the rind off. Put the rind in a plastic bag. You can keep this in the freezer and use it in soups! I recommend putting it into a herby bean soup - those are the best!Then, cut your cheese into two smaller pieces if it's especially large.At this point, either tear off a sheet of parchment paper or use a large plate to set the grater on. That way, you'll be grating onto a surface that will allow you to dump the cheese right into the pot. So grate away! Eyeball the amount. Use less or more according to your level of cheese desire. . Fill the big pot with water (I'm using a stockpot) and throw in a couple of generous pinches of salt. 
Bring this to a boil.As soon as the water comes to a boil, dump your pasta in. You need to cook the pasta so that it is slightly underdone. You want it to still be able to accept some liquid so that it'll soak up the cream later!Put the 6 tbs. of butter into the saute pan over medium/low heat and start it melting while the pasta cooks. As soon as it's melted, turn off the heat. You don't want the butter to brown.When the pasta is done cooking, drain it and we'll move on to the next step. . When the pasta is draining, reheat the butter. You'll want the heat to be medium high. Now, turn the pasta into the pan and pour the cream over top. You'll fold this mixture together for a few minutes until the pasta soaks up nearly all of the cream.Then, add the cheese and fold again. It'll take a little elbow grease to get it all combined.Once it's combined, taste test and add pepper and salt as desired. I like a ton of pepper on mine. :D. Enjoy your pasta. :D\nAnd maybe invite some friends or family over to help you finish it off?\nRead the question below and select from the following choices.\nA: How to Make Baka Bana\nB: \nC: The Final Product!!!\nD: Eat!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_12.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_25_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_25_20.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: ( Pumkin Part )\nB: Bruin Beskuit (Multigrain Rusks)\nC: Cooking of the Pancakes\nD: Preparing for Cooking of the Pancakes", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', '( Beatroot Part )', '( Beatroot Part )', '( Beatroot Part )']", + "context": "Here is the context of these images:\n. - Two table spoon Ghee - 300 ml Milk - 100 gms grated Pumkin- 100 gms grated Beatroot- 100 gms sugar- Pinch of chilly flakes . - Heat 1 tablespoon Ghee in a pan at medium flame. - Add 100gms of grated Pumkin and stir it continuously becomes soft ( 4 to 5 mins approx ).. Add 150ml of Milk and continue to stir till the milk evaporates ( 4 to 5 mins approx ). Add 50gms of sugar and continue to stir it till it becomes Yellow Shiny.Place the Yellow Shiny Pumkin dessert in a bowl.. Heat 1 tablespoon of Ghee in a pan at medium flame. Add 100gms of Beatroot and stir it continuously till it becomes soft on medium flame. ( 4 to 5 mins approx ) . Add 150 ml of Milk and continue to stir it till Milk evaporates. . Add 50gms of Sugar and continue to stir till it becomes shiny red ( Approx 4 to 5 mins ). 
Sprinkle pinch of chilly flakes and serve the Dessert ( Hot ) in a plate or bowl.- 2 Servings\nRead the question below and select from the following choices.\nA: ( Pumkin Part )\nB: Bruin Beskuit (Multigrain Rusks)\nC: Cooking of the Pancakes\nD: Preparing for Cooking of the Pancakes", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_26_16.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: How to Make Coffee at TechShop\nB: Make Chipped Ice\nC: Streaming the Milk (Leche).\nD: Coffee Is Cooking.", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Make Yogurt Mixture', 'Pour the Coffee Already Made to the Glass']", + "context": "Here is the context of these images:\n. A jar of yogurt Two teaspoons of condensed milk Chipped ice A teaspoon of coconut milk 1 slice of lemon 20ml of coffee (see here) A blender. 
Firstly, you use blender to make some chipped ice then put them into a glass.. Put condensed milk, yogurt, coconut milk and a few drops of lemon juice then blend it for 3 minutes.Then pour this mixture onto the chipped ice.. You can make a cup of milk coffee or black coffee. . Finally you pour 20ml of coffee onto the chipped ice and yogurt mixture. In the summer, Vietnamese people often drink this kind of drink.Enjoy the great taste. \nRead the question below and select from the following choices.\nA: How to Make Coffee at TechShop\nB: Make Chipped Ice\nC: Streaming the Milk (Leche).\nD: Coffee Is Cooking.", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_27_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_27_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_27_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_27_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_27_4.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Pumpkin Butter\nB: Processing\nC: Smear\nD: Storing", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Blend', '@placeholder', 'Top', 'Eat']", + "context": "Here is the context of these images:\n. Combine the walnuts, olive oil, salt, and honey and blend them. You may have to add more honey or olive oil or even a small amount of water for texture or taste. You should end up with the texture of peanut butter but with little pieces of walnuts.. At the restaurant we tried this at they have us four simple steps to eating this walnut bitter! So after making the butter there are only a few simple steps left until this delicious snack is in your mouth! Smear the walnut butter on any type of bread! It is really good on whole wheat which is also very healthy!. Cut strawberries into small slices and add them on top of the bread and walnut butter.. 
Drizzle some honey onto the strawberries.. Finally it's time to enjoy your delicious snack! Refrigerate the extra walnut butter for about a week at the most if you can make it last that long!\nRead the question below and select from the following choices.\nA: Pumpkin Butter\nB: Processing\nC: Smear\nD: Storing", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_28_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_28_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_28_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_28_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_28_4.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Peanut Butter Pie\nB: Don't Make the Oven Too Hot! 275*\nC: Bake the Cookies\nD: Wet Ingredients", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Whip It Whip It Good!', 'Fill Me Up Buttercup!', '@placeholder', 'ENJOY!']", + "context": "Here is the context of these images:\n. Crust:1 cup of roasted peanuts2 Packages of Oreos (I have three because that way when I eat some I wont run out)Full size Reese's PB cups 10-12 or so1 stick of butter (I'm from the south cannot help myself)A spring-form pan. Filling:3 Blocks of cream cheese1 8oz sour cream4 Eggs3/4th cup of Brown Sugar1 cup of Peanut butter (if you can find Reese PB get it)1 Teaspoon of Vanilla extract1/2 cup of heavy whipping creamone package of mini Reese's PB cups. First remember do not use all of the crust in the pan because you will want some for the top.Take the first package of Oreos and pulverize it to dust with the 1 cup of Roasted peanuts. 
This will be the base or bottom of the pan crust.Hand chop 10 - 12 of the regular cups place in bowlHand chop or hand crush the other package of the Oreos it takes time but its worth it if you refrigerate them before hand they will crush and chop with ease.Mix the hand chopped items with some of the powdered crust.Melt a stick of butter.Mix melted butter with both parts of crust. No need to be 100% exact just make both parts pliable. Butter up the pan with some left over butter or some margarine.Form the bottom of the pan with a thin layer of crumbed crust. Mix the leftovers with the cut up chunks.Form a thick wall around the pan. Be sure to push on the bottom edges so they meet.Place crust in freezer.. Mix each of the blocks of cream cheese and the 8 ozs of sour cream together one at a time on medium low. Its easier to use a bowl scraper attachment but my bowl is bent so I cannot really do that... Just scrape the bowl in between for an even mix. you will know when you are done when its all smooth and creamy.. Turn up to a medium setting like 5-6For the brown sugar make sure you grate it blend it or do whatever you need to to make sure there are no clumps. Nothing is worse in a cheesecake than hard clumps.Mix each egg individually until each is incorporated fully in.Pretty much just dump that cup of Peanut Butter right in there. One TSP or tablespoon of Vanilla extract. what ever is to your liking. I normally do a tablespoon because PB is such a strong flavor.Hand chop the minis really the amount is whatever is to your liking. I normally do 16-20 thrown them in and mix.. 1/2 cup of whipping cream then mix on high for 25 seconds to get that air in there.After this you can fold more PB cups into the batter. Really its up to you its your cheesecake.. Take the shell out of the freezer and pour the batter in the pan. Try and make sure there is about 1 in of wiggle room for the Cheesecake to rise.. Important part. Put a pan or pot of water on lower rack. 
I'm not all about that water bath.Oven should be set to 275* you are now probably asking hey why not 350 everyone is all about the 350. Eggs that's why. Eggs expand rapidly when exposed to high temps at don't quote me but 325. Rapidly expanding means cracked cheesecake and less creamy cheesecake. Bake for 1 to 1 and 1/2 hours depends on your oven. You know when it is done because the top of the cheesecake will be far from the sides of the crust. Be sure to clean the bowls and kitchen while this is baking. Also I normally do some sort of cardio or pushups and sit ups to prepare myself for the Calories I am about to consume.. Here is where you preform a self marshmallow test. Leave in oven while off for 20 minutes Open door for 20 minutesPlace on cooling rack for another hour until cool. Normally it takes about 2 hours for the whole process. Stages are because cheesecakes frighten easy and are prone to temperature shock. Which will have the effect like popping a balloon. When it is completely room temperature and not warm to the touch sprinkle the rest of the crust on top generously. Cling wrap it and place in fridge for minimum of 9 hours.. Here are some NSFD photos.Use a knife warmed by hot water to cut and enjoy.Make sure if you take this to a party one put a paper towel under the pan before you open it crumbs are crummy gifts to give.Also make sure no one with peanut allergies will be there. Because this \"may contain nuts\"\nRead the question below and select from the following choices.\nA: Peanut Butter Pie\nB: Don't Make the Oven Too Hot! 
275*\nC: Bake the Cookies\nD: Wet Ingredients", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_27.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_29_29.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + 
"source": "RecipeQA", + "options": "A: Sheet Pan Mediterranean Chicken and Potatoes\nB: Fill 'er Up!\nC: Oh Yea.. Forgot the Shameless Pandering\nD: Ingredients", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Cooking', 'Enjoy', 'Food Science']", + "context": "Here is the context of these images:\n. I went with 6 Russet potatoes (mostly because that was what we had in the pantry) but you can use any spud of your choosing one large white onion three bell peppers one package mild Italian sausage one package hot Italian sausage. before I detail each delicious layer of yummy goodness .. i just need to say ... all hail the mighty mandolin slicer :) It makes food prep so easy.apply a light coat of olive oil to the panlayer in your potato slices to cover the bottom of the pan (can you think of a better way to soak up all the flavor as it drips down through the layers?)generously distribute your assorted peppers for maximum color enhancementlayer in the onions ... or as I like to call it .. the sausage suspension systemand now .. the final piece of the puzzle ... the sausage. Don't be afraid to cram the pieces in tightly, remember there will be shrinkage during cooking.. Set your oven to 375 degrees Cover the pan with foil Bake for an hour and 40 minutes Uncover the pan Bake for one more hour. Now comes the hard part ....\u00a0 deciding how you want to serve your meal. Do you delicately disassemble your creation and serve each tasty layer in its own glory? OR Do you just ladle a heaping spoonful of heaven into a roll, top it with some steaming hot tomato sauce and chow down? Choose your path ... AND ENJOY!!!. Sensory Analysis (Affective Testing) - Also known as consumer testing, this type of testing is concerned with obtaining subjective data, or how well products are likely to be accepted. 
Using a focus group of 8 people covering the ages 2 to 55 demographic I was able to obtain the following reaction results very flavorful .. especially the potatoes *two thumbs up* Terrific Scrumptions Delicious Thanks for cooking so I didn't have to yummy it was very good .. thank youFood Physics \u2013 the physical aspects of foods (such as viscosity, creaminess, and texture) By baking the dish uncovered for the last hour, the sausages browned nicely. \u00a0 There was a 15-20% size reduction, which is to be expected. The peppers and onions softened nicely but retained more of their flavor than i was expecting. by far the nicest surprise of the dish had to be the potatoes.\u00a0 Because of their location at the base of the dish they soaked in an amazing blend of flavors.\u00a0 \nRead the question below and select from the following choices.\nA: Sheet Pan Mediterranean Chicken and Potatoes\nB: Fill 'er Up!\nC: Oh Yea.. Forgot the Shameless Pandering\nD: Ingredients", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_30_15.jpg" + ], + "output": "D" + }, + { 
+ "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Bubur Lambuk\nB: Carving Apple Version 2\nC: It's Showtime\nD: Assembly", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather the Materials', 'Add the Cold Beverages of Your Choice', 'Build BBQ Tool Holders', '@placeholder']", + "context": "Here is the context of these images:\n. The materials you'll need are...\nFor the Drink Helmet\n1. duct tape\n2. two 14.5 ounce cans\n3. about 3 feet of plastic tubing\n4. a cold beverage of your choice\n5. a general plan\nFor the Utility Belt\n5. duct tape\n6. grilling utensils\n7. condiments. Start the Drink Helmet by making a headband.\n1. Measure off about 27 inches of duct tape, then two more about 16 inches each.\n2. Fold the strips in half then tape the sticky sides together to make a band.\n3. The longer band will serve go around your head like a ball cap. The shorter bands will go over your head from ear-to-ear and from forehead to the back of your head.\n4. Secure the contact points and intersections of the band as shown in the pic.. Next, duct tape two 14.5 ounce cans to the sides of the headband. See pics.... 1. Cut 2 pieces of plastic tubing about 16 inches in length.\n2. Insert high-quality cold beverages.\n3. Insert tubing.. 1. Place Drink Helmet (with drinks) on your head.\n2. Suck.. 1. Strip off enough duct tape to go around your waist (lengths vary).\n2. Fold it in half to (a) get rid of the sticky stuff and (b) make a waistband.. 1. Pick your favorite condiments.\n2. Make holders for each by first making duct tape bands for the girth and then the circumference of each condiment.\n3. Attach and secure the intersections of each band as shown in the pics.\n4.Attach the condiment holders to the utility belt as shown in the pics.. 1. Build your tong and/or flipper holders for each side.\n2. Just make two short bands as before then attach to the belt. 
See pics...\n3. You're now ready to put it on and get to cookin'/showin' out. Video on next step.. \n For a man, grilling out is less preparing food and more communing in the age-old rite of cooking over fire. The only difference between now and the men described in the book of Leviticus is that today grilling out is an art form--a dance between man, machine, fire, and food. To wit, watch the video...\nRead the question below and select from the following choices.\nA: Bubur Lambuk\nB: Carving Apple Version 2\nC: It's Showtime\nD: Assembly", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_22.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_31_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_31_24.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Summertime Peach Melba Pie\nB: The Directions!\nC: Bake and Enjoy!\nD: Make the Meringue", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Assemble the Ingredients', 'Prepare the Crust', 'Mix the Filling', '@placeholder']", + "context": "Here is the context of these images:\n. For the filling:\n5 egg yolks\n1 15 oz can of sweetened condensed milk (like Eagle Brand)\n1/2 cup dairy sour cream\n1/2 cup fresh squeezed red grapefruit juice\u00a0 OR\nfor a more intense flavor, 1 cup of grapefruit juice, simmered over low heat until reduced to 1/2 cup\n2 teaspoons grapefruit zest\n3 Tablespoons fresh lemon juice\n2 teaspoons Rose's Grenadine syrup\npinch of salt\na drop or two of red food coloring (optional)\nFor the topping:\n5 egg whites, room temperature (\"old\" eggs provide the better whites for meringue than really fresh eggs.\u00a0 It helps to leave the eggs out at room temperature for 12-24 hours before assembling your pie)\n3/4 cup white sugar\nFor the crust:\n1\u00a0 1/2 cups finely ground graham cracker crumbs\n1/2 cup finely chopped hazelnuts\n1/4 cup brown sugar\n1/2 cup (1 stick) salted butter, melted. 
Preheat oven to 350 degrees F (175 C).\nCrush the graham crackers by placing one package in a paper sack and roll with a rolling pin OR roughly break up with your hands, place in a deep bowl and use an immersion blender to pulverize OR add crackers to a conventional blender or food processer.\u00a0 Do the same with the hazelnuts.\nToss the graham cracker crumbs, hazelnuts, brown sugar and melted butter in a bowl and blend thoroughly.\u00a0 Place the crumb mixture in a 9\" pie pan and pat gently onto the bottom and up the sides of the pan.\u00a0 Brush away any loose crumbs (these may burn during baking).\nBake the pie shell at 350 degrees for 8 minutes.\u00a0 Remove from oven and cool completely.. Separate the 5 eggs, being very careful to get no yolk in the whites.\u00a0 Put the whites in a clean, deep bowl, and set aside so they may come to room temperature.\nJuice a large Ruby Red or Rio Star grapefruit;\u00a0 strain out any seeds or pulp.\u00a0 Set aside 1/2 cup of juice.\u00a0 Zest grapefruit, add 2 tsp of the zest to the juice..**on edit-\u00a0 I have made this pie twice more, and I find that you'll get a much more intense grapefruit flavor if you start with 1 full cup of freshly squeezed juice and simmer it until it reduces to 1/2 cup.\u00a0 This tends to eliminate the need for lemon juice for added tartness, as well.**\nPlace the yolks in a larger bowl and blend briefly with a hand\u00a0 or stand mixer.\u00a0 Add the sweetened condensed milk and sour cream, and blend briefly again.\u00a0 Add the grapefruit juice, and Grenadine -\u00a0 blend just long enough to thoroughly mix the ingredients.\u00a0 Do not over blend.\nTaste the filling and adjust the tartness by adding the lemon juice one Tablespoon at a time until you are satisfied. 
with the level of acid.**Taste FIRST.\u00a0 Lemon juice may not be necessary if you have used reduced grapefruit juice**\u00a0 Add a drop or two of red food coloring if you wish to enhance the pink color of the filling.\u00a0 Blend just briefly again.\nPour the filling into the cooled graham cracker crust and let pie sit for 10 minutes to set up. Bake at 350 degrees F (175 C) for 15 minutes.\u00a0 Take the pie out of the oven, assemble your meringue and top while the pie filling is still hot.. Using a stand or hand mixer, beat the room temperature egg whites on high speed until they form soft peaks.\u00a0 Slowly sprinkle in the sugar and continue to beat until the meringue is stiff, thick and glossy.\u00a0 Check to make sure the sugar is dissolved (meringue not gritty).\nSpread meringue over the still-hot pie filling, making sure that meringue reaches past the filling to the pie pan to \"seal\" the edges and prevent shrinkage.\u00a0 Create peaks in the topping with the back of your spoon\nBake the pie at 350 degrees (175 C) for 8 to 10 minutes or until meringue is delicately browned.\nCool pie completely before serving (but do not place in refrigerator).\nMay also be served with whipped cream if preferred.\nGrapefruit Pie- a trippy tropical treat!\nRead the question below and select from the following choices.\nA: Summertime Peach Melba Pie\nB: The Directions!\nC: Bake and Enjoy!\nD: Make the Meringue", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_32_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_32_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_32_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_32_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_32_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_32_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_32_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_32_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_32_8.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_32_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_32_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_32_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_32_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_32_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_32_14.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Beer Can Mug\nB: Cook the Chicken\nC: Gather Your Ingredients\nD: Preparing the Marinade", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Prepare the Chicken', 'Prepare Your Beer Can', 'Move the Chicken to the Grill.', '@placeholder']", + "context": "Here is the context of these images:\n. Start your grill preheating (you only need one burner on low heat). Then gather the ingredients for your spice rub.\nAny good spice rub will work with this recipe. It's okay to use a store-bought rub if you have a chicken or rib bbq rub that you like. Or, you can mix one up using spices from your pantry. Feel free to experiment to fit your tastes!\nThe one in this picture used:\n1/4 cup paprika\n1 tbls brown sugar\n1 tbls granulated sugr\n2 tsp salt\n1 tsp onion salt\n1 tsp black pepper\n2 tsp cayenne pepper\n1 teaspoon garlic powder\n1 teaspoon onion powder\nMix all ingredients together in a bowl.. Clean out the cavity of the chicken to make sure the kidneys and giblets are removed.\nRinse the chicken in cold water, then pat with paper towels to dry.\nAnd of course make sure that you thoroughly sanitize your hands and workspaces after handling the chicken.. Sprinkle a tablespoon of spice into the cavity. Insert another tablespoon under the skin of the chicken and try and spread it out evenly. Finally, rub another tablespoon of spice all over the skin on both sides.. You will need a tall can of beer to hold your chicken. 
Any brand will do, so if you have a 20 ounce can of beer on hand, feel free to use that. Otherwise, you can buy single cans of beer in the refrigerator section of most liquor stores. I chose fosters because their keg-style can is wider and sturdier than the slightly cheaper budweiser.\nPop open the tab on the can and empty out 1/4 of the beer. I'm not into beer so I poured it down the drain, but feel free to drink it if you like!\nYou want to make a few more small holes to let the vapors escape into the chicken. You can use the pointy end of a beer bottle/can opener for this. I did not have one handy, so I tapped an awl into the top a few times.\nPour the extra spice rub into the beer can.. Keeping the can upright, place it into the cavity of the chicken.\nPlug the top of the chicken so that the vapors are sealed in. I used a peeled onion, but you could also use a potato, lemon, or lime.. Since you started preheating your grill during the first step, it should be nice and hot by now. \nMove the chicken to the grill and stand it up with the legs spread apart (this will help maintain balance). You want to cook it indirectly, so put it next to the burner that is turned on.\n. Close the grill and let the chicken cook on low indirect heat until the internal temperature reaches 185-190 degrees (F). If your chicken come with one of those handy pop-tabs it will tell you when it is done. Otherwise, you can monitor the tempetature with a meat thermometer.\nMy six pound chicken took an hour and fourty five minutes. . Like I said before, you will know your chicken has finished when it has reached the right internal temperature (185-190 F). When your chicken is finished, the outside will be crispy, but the meat will be fall-off-the-bone tender. Carefully remove the chicken from the grill. Place the finished chicken on a plate and discard the beer and can.. Let the chicken rest for 5 minutes before carving. Serve with your choice of sides; I reccomend mashed potatoes and corn. 
\nBon appetite!\nFor those of you who are interested, this cost $1.50-$2 a serving. My six serving chicken cost:\n$6 chicken\n$3 beer\n(?) spices - you are only using a little of each spice, so we'll say a couple dollars.\nRead the question below and select from the following choices.\nA: Beer Can Mug\nB: Cook the Chicken\nC: Gather Your Ingredients\nD: Preparing the Marinade", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_33_18.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Easy Rainbow Cookies\nB: Add Milk and Stir\nC: Cupcake #3\nD: Cupcake #2", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients/Tools', 'Add Butter', '@placeholder', 'Cut Out the Biscuits']", + "context": "Here is the context of these images:\n. 
Ingredients:2 cups of all-purpose flour (256g)2 Tbsp. of baking powder (30g)1 tsp. salt (5g)1 Tbsp. white granulated sugar (12.5g)5 Tbsp. of cold unsalted butter (70g)1 cup of cold milk (whole milk, 1%, 2%) (240ml)Tools:SifterFood processor or Pastry blenderBowlsForkPlastic wrap or towelBiscuit Cutter or GlassBaking sheet. First let's mix all of our dry ingredients together using a sifter. If you don't have a sifter you can use a fine mesh hand strainer as well. So we will mix our flour, baking powder, salt, and sugar. . Now using a food processor or alternatively you can use a pastry cutter and fork, we will blend in our cold butter. I have the butter cut up into little pieces, which just helps to blend it in. Using the pulse setting on the food processor will work fine. . Now we just transfer the flour and butter mixture back to our large bowl and add in our cold milk. Then using a fork or spoon mix the ingredients together until a dough forms. Now if your dough is sticky, add a couple more tablespoons of flour to it. The dough should not be sticky. . Now we dust our work surface with some flour and place our biscuit dough on it. Then shape it in your hands until it is a flat rectangle, then fold it over on itself once and push down, then flatten it out again, and fold it over, etc. So essentially we are just doing a quick basic kneading of the dough. Do this for 2 or 3 times, we don't need to knead it like we would a yeast dough. Now just shape it out into a rectangle or circle about 3/4 to 1 inch thick. Then cover it with plastic wrap and let the dough rest for 20 to 30 minutes. . Let's go ahead and preheat our oven to 425 degrees F. (218 C) Now we just need to cut out our biscuits, you can use a biscuit cutter or use a glass. Take your biscuit cutter or glass and push down on the dough, don't twist it around too much, then place your cut out biscuit on an un-greased baking sheet. 
You can take the leftover pieces and gather them back up and push them together then form them back into a rectangle, in order to cut out more biscuit shapes. Once we have cut out all of our biscuits lets bake them in the oven for 10 to 15 minutes until they puff up and get nice and golden brown on the top. . All right once they come out of the oven they will look like this. They are ready to be eaten! However you would like to eat them. One of my favorite ways is to cut them in half and then butter them and add honey. Mmm yummy! Enjoy!. Now watch those steps in action by checking out the video tutorial!\nRead the question below and select from the following choices.\nA: Easy Rainbow Cookies\nB: Add Milk and Stir\nC: Cupcake #3\nD: Cupcake #2", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_19.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_34_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_27.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_34_30.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Easy Homemade Ice Cream\nB: Bottom Layer\nC: Stack the Ice Cream Sandwiches\nD: Add Fruit.", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Pour Into Pan', 'Freeze']", + "context": "Here is the context of these images:\n. For this dessert you will need the following:-Bryer 's Vanilla Ice Cream-12 oz of raspberries, fresh or frozen-Raspberry Jell-O (or any flavor gelatin in a 6 oz. package.). I couldn't find a small packet of Raspberry Jell-O, so I measured out half the packet on a kitchen scale. It was a little bit less than a half cup. *****REVISION USE ALL 6 OZ OF JELLO! IT COMES OUT WAY BETTER!*****I heated up a little more than a cup of water in the microwave, and stirred the gelatin into the water.. Allow the ice cream to soften for 30 minutes before this step.Once the ice cream is soft, place it in a metal mixing bowl and mash it with a whisk or a potato masher. Slowly mix in the hot gelatin mixture with the ice cream.. This part is fun. Pick up the raspberries and squish them in your hands. Give it a quick stir to coat all the raspberries and then let it sit for about 3 minutes.. 
I like to use glass or ceramic baking dishes for this dessert - they hold the cold in a little bit better than a metal square pan could. . This generally takes a little more than an hour to set-up. But once its firm to the touch, it's pretty much ready.. If you let this freeze long enough, it stays pretty solid. It's easy to serve with just a knife, and goes fast. It's a great frozen treat in the summer, and you can try all different kinds of fruits and Jell-O flavors.Enjoy!\nRead the question below and select from the following choices.\nA: Easy Homemade Ice Cream\nB: Bottom Layer\nC: Stack the Ice Cream Sandwiches\nD: Add Fruit.", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_21.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_35_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_35_23.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Slime Juice\nB: Add the Juice\nC: Pour the Kombucha!\nD: Waiting...", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Preparing Berries', 'Acid and Water', '@placeholder', 'Filtering']", + "context": "Here is the context of these images:\n. Clean the berries and crush them in a bucket.It is easier to crush them in small batches. Make sure that most of the berries are crushed. Some berries can be whole, we will later on re-use the leftovers and crush them again.. Tartaric acid is crystallic white powder. Dissolve 25g of acid to about half a litre of water.Add the acid to the bucket with the crushed berries.Add water until you have approximately 3,5 litres to 4 litres of berry-water mix.. Store your berry-water mix in a cool place covered with lid. Wait for 3 days and your juice is ready for filtering.. You can start by mixing the water-berry mixture a bit.Place sieve to the other bucket and place the filtering cloth in it.Scoop the berry mixture to the sieve and let it filter. This is slow process and easier with bit smaller batches.Collect all filtered berry mash for re-use.. Now you should have about 2 litres of juice.You need 1/2 kg of sugar for each litre of juice. So if you have 2 litres of juice add 1 kg of sugar and mix until the sugar is dissolved.Clean bottles and use funnel to fill them.Cap the bottles and your juice is ready!Store the juice in cool place and enjoy your vitamins through the long dark winter.Bonus round:You can re-use the collected berry mash. Just follow the same instructions with once used berry mash and you get almost as good lingonberry juice for the second run. 
It will probably have slightly lighter color, but the taste is there.\nRead the question below and select from the following choices.\nA: Slime Juice\nB: Add the Juice\nC: Pour the Kombucha!\nD: Waiting...", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_36_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_36_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_36_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_36_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_36_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_36_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_36_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_36_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_36_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_36_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_36_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_36_11.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Caramel Apple Cheesecake\nB: Pie Assembly Part 2\nC: Prepare the Garnishes\nD: Pour Your Cooled Apples on Top", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Making the Pie Filling', 'MEANWHILE... Let the Creativity Flow', '@placeholder', 'Put Some Heat to the Creativity']", + "context": "Here is the context of these images:\n. I decided to make pretty much everything from scratch for this pie, including the crust. If you wish to cut time by buying pre-made crust, then by all means GO AHEAD. This is 2014. But just keep in mind it may not be as tasty. TIME: 20 MIN total for crust Crust is not too painful to make. You will need for crust 1 1/2 cups of Gram Cracker crumbs or 12 Gram Crackers made to crumbs. 
3 tbsp (tablespoon) of sugar 3/4 cup of non salted butter melted 1 tsp (teaspoon) of cinnamon Springform pan 9\" or 10\" pie plate Turn your oven on to 375 degrees Whisk or with a fork mix together the graham cracker crumbs, sugar and cinnamon. (The cinnamon will give it some nice flavor.) Melt the unsalted butter in the microwave and use a fork to mix the butter with the crumb mixture until they are all are moistened. It will look clumpy and much darker and that's a good thing. Spread the crumbs into a 9-inch spring-form pan OR a 10-inch pie plate and press them firmly into an even layer over the bottom and half of the way up the sides of the pan. Bake for 6 to 8 minutes, or until golden brown. Let it sit for 8 minutes to cool, or just stick it in the fridge to save time. By baking the crust you will get a more crunchy crust. Which will go beautifully with the crunchy top I have planned for this pie =). The secret is toasting the nuts!! Forgive me Grandma!! Haha, I'm just kidding. No, but seriously. Any time you have a dish with nuts, the secret to ultimate flavor is to toast them. It only takes 5 minutes, and enhances the flavors so much! TIME for Sauce: 8 MINWhat you will need for the Special Caramel sauce. 1 packet of apple-cinnamon granola from Quaker Oats. 3/4 cup of chopped pecans 1 cup salted caramel sauce. I used sugar-free in order to not go overboard with the sugar. small cooking sheet for toasting the nut mixture in the oven. Open the packet of granola and pour in a nut chopper as well as the pecans. You could also break them up yourself by putting them on a cooking sheet and breaking with a spoon, but it may get messy. Since the oven is already going because the crust was just made toss the nuts in! After 5 min of toasting pull them out. They should smell amazing. Take the crust out of the fridge. It should be cooled by now. Pour the caramel on top of the crust and sprinkle the toasted nut mixture on top of the caramel. 
Place the springform pan into the fridge to chill out. MAKE SURE YOU SAVE SOME TOASTED NUTS FOR LATER. ;) You will use them as a garnish. The Infinity pie is based off an apple cheesecake pie. So making the apple pie part is very much like making a regular apple pie as you would have guessed. You can either BUY (it's 2014) your apple pie filling OR you can make it. I chose to make it because I want a delicious pie this time. Dedicating something to my hero only deserves the best! ;) NOTE: if you are using a can of apple pie filling you only need to use half!! TIME for Pie filling 40-50 min (depending of if you have an apple corer)What you will need for Apple pie filling 5 small granny smith apples. They must be peeled and cored and cut them thinly (slave work) 3/4 cup of unsalted butter 1/2 cup of light brown sugar 2 tsp of cinnamon a couple dashes of salt a large pan for cooking on the stove I DON'T have an apple corer. So this part took extra long.... my boyfriend wasn't too thrilled. But it's only 5 little apples. While you are peeling apples, put the butter on the stove and begin melting it. It will only take a few minutes. When it's melted add the brown sugar and cinnamon to the butter and mix until gently bubbling. Again it only takes a few minutes so you probably won't be done with your apples. The\" brown apple syndrome\" will happen and it's alright. These apples are destined to go into a brown sugar liquid and cooked extremely soft. No harm so don't stress! ;) when you're finished with the apples slide them in the large cooking pan and coat them well with the liquid. Put a lid on the pan and stir occasionally for 10 min. Remove the lid and up the temperature to med-hi to boil off most of the remaining liquid. Throw a few dashes of salt in. After another 15 min the apples should be very very soft and that's what you're looking for. LET SIT FOR 20 min to cool before adding to your pie crust. Getting tired yet??. 
You can turn the stove off if you want to save electricity for 20 min while the apple pie filling cools.... But OK, you have 20 min to make a design to top your Infinity pie. Me, because I didn't want to have to make a batch just for crust I broke down and bought my pre-made crust. FORGIVE ME GRANDMA. ;) haha Pre-made crust is very easy to work with. You just unroll and cut out whatever design you want. I see pie design tops (much like pumpkins today) as a big fad soon. It is taking off but not like I think it will soon. But anyways, cut out whatever your heart desires! If you mess up, crust is easy to erase... just flatten out and try again. For stencils, I just found shooting stars online, printed them out, and laid them over the dough and cut it. Easy as pie. My shooting star is dedicated to Carl Sagan and the infinite universe. =). Exactly as the title says... pour the cooled apples on top of the cooled caramel mixture that's been chilling in the fridge. This is the easiest step! ;). I love cheesecake. If it were me I'd put cheesecake in everything. But I probably wouldn't live long. Anyways, again, this is only technically half a cheesecake so the ingredients aren't as heavy. Turn that stove back on to 350 degreesWhat you will need for cheesecake topping: 8 ounces of soft at room temperature cream cheese 1/2 cup of granulated sugar 1 egg medium sized 1 tsp of vanilla extract 1tbsp of lemon juice lemon wedge for lemon zest electric mixer and a medium sized bowl First you will need to beat the cream cheese and sugar together on medium speed for about a minute. They must be well mixed. Then add the egg and beat it in until it is combined for about a minute. Then add the lemon juice and vanilla extract and beat for another minute. Zest the lemon wedge in. Just a few times is all it needs. Pour the cheesecake batter over the apples in the pan, smoothing it into an even layer as much as you can. Bake until the cheesecake is set about 25-30 minutes. 
While this is happening, as you will see in the next step, coat your design you made with the pie crust with egg whites and bake at the same time in the oven with the pie.. Because pie crust is usually not belonging on cheesecake, I decided to bake it separately on a cooking sheet. I coated it with an egg white to give it shine and baked it next to the cheesecake for 5-8 min. When it was done, I pulled it out and sprinkled it with sugar while it was still hot to give it some sweetness. The cheesecake should be done within 30 min. Transfer the cheesecake pan to a wire rack to cool, the cheesecake must refrigerate for at least 4 hours or overnight. (For me, since it was already midnight when we were finished... lol, we ended up just chilling over night.). Before you put your infinity pie in the fridge...., ta dahhhh, the toasted crust design goes on top of the cheesecake like a crowning jewel! Then, add some of the remaining crunchy toasted nuts on top and along the outsides to bring it to life. Then, put that sucker in the fridge overnight. I know it will be REAL HARD. But trust me, it needs to cool for at least 4 hours. When serving your Infinity pie, put some caramel on the plate along with the special crunchy nut mixture. It will definitely knock someone's socks off! Pair with vanilla bean ice cream for a real desert! Be sure to refrigerate any leftovers.\"The sky calls to us; if we do not destroy ourselves. We will one day, venture to the stars\" -Carl Sagan This one's for you Carl! 
Enjoy your Infinity pie everyone =) PLEASE let me know if you make it!\nRead the question below and select from the following choices.\nA: Caramel Apple Cheesecake\nB: Pie Assembly Part 2\nC: Prepare the Garnishes\nD: Pour Your Cooled Apples on Top", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_27.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_37_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_30.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_37_31.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: pumpkins / Pokemon Rice Crispy Ball\nB: Cover With Chocolate (four Coats!)\nC: Supplies\nD: Melting", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['To Start', 'Make Rice Crispy Treats & Melt the Chocolate', '@placeholder', 'Done!']", + "context": "Here is the context of these images:\n. Ingredients:\n3 tablespoons butter\n10 oz (1 package) mini marshmallows - separate out about 12 for the mini marshmallow centers\n6 cups rice crispy cereal\n1\u00a0\u00bc oz almond paste (optional)\njumbo marshmallows (optional - for the large surprise center)Toppings:\n21 oz good quality chocolate (six 3.5 oz bars)\ncoconut flakes (optional)\nchocolate powder (optional)Special Tools:Lollipop sticks\nNon stick aluminium foil\nSpray cooking oil (for your hands)\nStyrofoam for sticking pops into\nI\u00a0re-purposed\u00a0a pasta drying stand with binder clips to hang the pops to set\na double boiler to melt the chocolate in (or a metal bowl that can fit atop a small pot)\nI made 3 different sizes and got 22 \u00a0pops from this recipe. \u00a0\n(2 jumbo pops, 8 small plain bite size, 12 with marshmallow centers). 
In a large pot over medium/low heat:\nmelt 3 tablespoons butter\nadd 10 ounces mini marshmallows\nadd 1.25 ounces almond paste (optional) - the almond taste was barely noticeable next time I may add more.\nstir until melted and smooth\nstir in 6 cups rice crispy cereal\nPour out onto a a cookie sheet lined with non-stick aluminium foil.\nLet cool for a few minutes before forming into balls.\nMeanwhile heat up a small amount of water in your double boiler or pot. \u00a0The higher quality chocolate you use the easier it melts so the water should not be boiling hot. \u00a0When the water is steamy hot remove it from the stove and place the metal bowl atop the pot and break your chocolate into small pieces into the bowl. \u00a0The chocolate will melt and the stay liquid for quite a long time without having to be put back onto the stove. \u00a0If your chocolate does start to harden just heat up the water more without the bowl on top. \u00a0If the chocolate gets too hot it may seize up and you'll have to start over with new chocolate.\u00a0\nNow you can chocolate cover some mini marshmallows for the center of the pops.\nI stuck the marshmallow onto a toothpick to dip it into the chocolate and another toothpick to remove it. \u00a0Put them into the freezer for a few minutes so they set up quickly and can be handled when forming the rice crispy balls.. Once the treats have cooled enough to be handled (a few minutes) spray your hands lightly with cooking oil. \u00a0Just a little bit. \u00a0This will help so much and your hands will be so soft afterwards.\nForm the rice crispy treats into balls.\nJust form the rice cereal like a bowl around a marshmallow to make a surprise center.\nLet the balls sit for a few minutes to firm up.\nOnce they feel firm push the lollipop stick in and then remove it and fill the hole with a bit of chocolate to \"glue\" the stick in. 
\u00a0\nPlace in the freezer for a few minutes to set up quickly.Note: \u00a0I used different colors of pen to mark the bottoms of the sticks so I knew which ones had which centers.. Coat each pop with chocolate and let the excess drip off.\u00a0\nI used binder clips on the bowl to hold the pops so the excess could drain back into the bowl.\nThe picture shows only one binder clip set but I made three sets so by the time I covered the third pop with chocolate the first could be taken out and hung from the pasta dryer to set for a few minutes.\nThis was done to keep the excess chocolate from dripping down the stick. \u00a0If you use candy coating instead of real chocolate then I think you can just skip this and place them into the styrofoam sheet.\nAfter 8 pops were done they were placed into a styrofoam sheet and put into the freezer for a few minutes to set up quickly.\nI repeated this four times for each pop so they have a THICK coating of chocolate (I love chocolate).\nBefore the last coat dries completely add any additional toppings: coconut flakes, chocolate powder, etc.\nI used a chocolate mocha hot chocolate powder to coat one of the pops. \u00a0Sweet chocolate with coffee flavor - yum!. As I added each coat of chocolate I lost the perfect roundness of my rice crispy balls, especially the one with the jumbo marshmallow center. \u00a0It's heavy & lopsided so I had to eat it myself :)\nSince I used real chocolate and used a freezer to set the chocolate quickly and not candy coating melts I suggest keeping these in the refrigerator to keep the chocolate from blooming - which isn't pretty but still tasty. 
\u00a0\nTake them out of the fridge and come to room temperature before serving for the rice crispy treat to soften up to eat.\nRead the question below and select from the following choices.\nA: pumpkins / Pokemon Rice Crispy Ball\nB: Cover With Chocolate (four Coats!)\nC: Supplies\nD: Melting", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_38_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_38_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_38_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_38_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_38_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_38_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_38_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_38_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_38_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_38_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_38_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_38_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_38_12.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Classic Chicken Noodle Soup\nB: Prep Work\nC: Enjoy!\nD: Prep Work", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ramen Noodle Breadcrumbs', 'Breading', 'Cooking', '@placeholder']", + "context": "Here is the context of these images:\n. This recipe serves two people or one extremely hungry student :)Ingredientsone chicken breast ($1.50)one ramen noodle packet with seasoning ($0.25)one egg ($0.20)Total Cost: $1.95That's all the ingredients that there is to it! Feel free to double or even triple the recipe if there are more people.. 1. Start off buy putting your ramen in a plastic bag. Gallon bags work best but are not necessary. 2. Pour as much seasoning as you like. 
I usually use 3/4 the packet but add more or less to taste. Seal the plastic bag so that you don't make a big mess.3. Put your textbooks to work by using them to crush the ramen into small bits. Make the pieces bigger for crunchier nuggets or extremely fine for a softer nugget.4. Pour the ramen breadcrumbs into a bowl. If the plastic baggie you used to crush the noodles hasn't broken, you don't have to do this step, but most likely there will be a few small tears from where the textbook has stabbed it.. 1. Take your chicken breast and use a knife (ones that have broken and have been repaired by duct tape are fine) to cut it into bite sized pieces like in the second photo.2. Crack one egg into a bowl and whisk it using a fork/chopsticks/whatever.. 1. Start off by making sure you have a plate close by to put the breaded chicken on. Then place some chicken bits into the egg mixture making sure that every bit is coated.2. Place the eggy chicken into your crushed ramen noodles and use your hand to make sure every part of the chicken is coated and that there are no bare spots.3. Put the nuggets onto a place and get ready to cook!Tip: Have one hand do the wet stuff (coating the chicken with egg) and your other hand do the dry stuff (coating the chicken with ramen noodle, placing nuggets onto a plate). . 1. Pour some oil onto a pan. Doesn't really matter what kind of oil or what kind of pan, whatever you have. I used olive oil for this demonstration. Also, the more oil you use, the more tender and generally tastier the nuggets will be. Heat the pan on medium until the oil is hot.2. Place all of the nuggets in an even layer on the pan. Don't worry if some of the ramen noodle coating falls off, you can pick those up later.3. Cook until the bottom of the nuggets are a golden brown. The nugget in the third photo isn't done yet, it needs to cook for longer.4. Once the nuggets are golden brown like in the last photo, turn them over so that the other side can cook. 
Once you can see that the other side is also golden, remove the nuggets from the pan and transfer them to a plate (or just eat them out of the pan, less dishes amirite?)Tip: Don't put the cover on the pan! Condensation will form and drip onto your chicken nuggets making them soggy and wet.. Eat your nuggets when they are still warm and enjoy your delicious meal! If you liked this Instructible, please take a second to vote for me in the DIY University Contest! It would mean the world to me!Have a fantastic day,thederpyninja\nRead the question below and select from the following choices.\nA: Classic Chicken Noodle Soup\nB: Prep Work\nC: Enjoy!\nD: Prep Work", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_39_16.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Hot Jenever Toddy\nB: Eat and Enjoy!\nC: What You Need\nD: Blitz It!", + "question": "Choose the best title for the @placeholder to 
correctly complete the recipe.['Mix It Up', 'Baking', 'Spread on the Butter', '@placeholder']", + "context": "Here is the context of these images:\n. . Knead the dough in a mixer, or by hand for 5 min., until it's smooth, soft, and pretty slack. Then dust it with flour and put it in a plastic bag. Close the bag, (leave room for the dough to expand) and Let it rest for 30 min.. Split the dough into 8 equal pieces. Let the pieces rest, not covered, for 5 minutes.. Roll the pieces of dough into a thin rope, about 22 inches long. Twist each rope into a pretzel shape. Brush each of the pretzels with the warm water mixed with a teaspoon of sugar and set them on the baking sheets. Sprinkle them with a little salt. Then let them rest for 10 min. uncovered.. Bake the pretzels in a preheated, 500F oven for 8 to 10 min. or until they're golden brown. But don't forget to reverse the baking sheets half way through.. Take the pretzels out of the oven, and thoroughly brush on three tablespoons of melted butter on the pretzels. It may seem like a lot, but that's what gives them their yummy flavor.. I hope you enjoy the pretzels! 
They taste best when eaten warm.\nRead the question below and select from the following choices.\nA: Hot Jenever Toddy\nB: Eat and Enjoy!\nC: What You Need\nD: Blitz It!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_40_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_40_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_40_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_40_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_40_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_40_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_40_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_40_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_40_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_40_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_40_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_40_11.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Azul Camaron Mariposa\nB: The Final Product\nC: Mash Browns\nD: Place on the Grill", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients List', \"Assemble Your ATB's\", '@placeholder', 'Remove From Grill']", + "context": "Here is the context of these images:\n. 5 large Jalape\u00f1os 1 Pack of Bacon 1 container of cream cheeseBBQ RubBBQ Sauce (optional). First you will need to prepare your ABT's using all the ingredients above. Start by slicing the Jalapeno in half and with a spoon cleaning out the seeds and membrane. Next you will fill the half Jalapeno with cream cheese. Make sure to fill it full. Next you are going to want to apply some of your bbq rub onto the cream cheese. Lastly you will need to wrap your stuffed jalapenos with one full slice of bacon. TIP: Make sure to wrap it firm, this will help it cook together and you won't have to use toothpicks.. 
One your Atomic Buffalo Turds have been put together you will them place them onto your grill using indirect cooking with a tempreture of around 300-325 degrees. Place a small chunk of hardwood in for smoking (optional) and then close the lid and begin cooking for 1 hour 15 minutes.. After 1hr 15 mins, your bacon wrapped jalape\u00f1os should be done. If you like your bacon more cooked feel free to leave them on for a few more minutes or until your preferred doneness. Let them cool for a few minutes to allow the cream cheese to cool a bit. Serve it up with your favourite BBQ Sauce, Ranch Dressing or blue cheese sauce and enjoy.\nRead the question below and select from the following choices.\nA: Azul Camaron Mariposa\nB: The Final Product\nC: Mash Browns\nD: Place on the Grill", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_41_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_41_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_41_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_41_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_41_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_41_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_41_6.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Green Mountain Gronala\nB: Ingredients.\nC: Green Bean Bundles\nD: Enjoy!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Prep Avacados', 'Add Some Flava!', 'Monster Mash', '@placeholder']", + "context": "Here is the context of these images:\n. 2 Avocados\nLime Juice\nSalt\nGarlic\nOptional:\nDiced Tomato\nOnion Flakes. Cut avocados in half length wise.\nRemove pits and set aside.\nUse knife and cut slits in avocado vertically and horizontally, don't cut through skin.\nScoop out avocado meat with spoon.. Add salt to taste. 
And Yes, I melted my salt shaker to improve flavor..\nAdd lime juice to taste, fresh or concentrated.\nAdd garlic to taste.\nOptional:\nAdd onion flakes to taste, I was out :(\nAdd diced tomatoes to taste. I am mashing with a pastry cutter with works wonderfully.\nYou can also mash with fork, spoon, potato masher or fingers.\nOnce you've completed mashing, add avocado pits. They keep the guacamole from turning brown as fast!. There you have it! Some gooey, green, just darn good guacamole!!\nEven my cute critic like it!\nServe with chips, on burgers, or eat it off the spoon!\nRead the question below and select from the following choices.\nA: Green Mountain Gronala\nB: Ingredients.\nC: Green Bean Bundles\nD: Enjoy!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_42_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_42_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_42_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_42_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_42_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_42_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_42_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_42_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_42_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_42_9.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Chocolate Coverd Peanut Butter Cups (Reese's)\nB: Topping 'em Up!\nC: Enjoy\nD: Melt and Mix Ingredients", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Mix Up the Filling', 'Important Chocolate!', \"Fill 'er Up!\", '@placeholder']", + "context": "Here is the context of these images:\n. Healthier-For-You Peanut Butter CupsX tbsps. 
of peanut butter (X being the number of peanut butter cups that you want)icing sugarchocolate chips (or molding chocolate wafers. I just used what I had at home at the time of the craving, which was chocolate chips!)Mix peanut butter and icing sugar to taste. Seriously, I cannot give you an exact amount, because everyone likes a different amount of sweetness. I used chunky peanut butter, but you can use all-natural or already sweetened or hey, why not almond butter! Add enough icing sugar to the peanut butter that it becomes a sort of dough. It has to be able to be rolled into little balls of about a tbsp. each. If you put in too much icing sugar, it will crack, but too little icing sugar and the peanut butter will stick to your hands. Roll the peanut butter mixture into little balls, the size of the molds. I happen to have a candy mold that I bought on sale in a craft store, but frankly, you could use the molded plastic that chocolates often come in, when they're in layers in boxes. Or even a mini muffin tin in a pinch.. Melt chocolate chips or chocolate wafers in a mug in the microwave, stirring every 30 seconds until fully melted. Paint the inside of the molds, and set the mold in the freezer until the chocolate is set.. Push a ball of the peanut butter mixture into every mold. Press down with your thumb to make sure that it fills up the space. I also made up some wine jelly bonbons. These are great for a more adult end to a nice dinner, to serve with coffee! I used some wine jelly that I had lying around the kitchen, possibly from this recipe. You follow the same instructions as above in terms of the chocolate coating, but just use the wine jelly as an alternative to the peanut butter filling. Easy-peasy homemade fancy chocolates! Can we say holiday entertaining? *grin*. Brush the top with the melted chocolate and return to freezer (technically, a fridge is better, but my cravings have no patience). 
When set, pop out the peanut butter cups from the molds, and voila! Healthier-for-you peanut butter cups!Want to fancy them up? Why not sprinkle a little something on top the chocolates before they set up? Maybe some fleur de sel? Some chopped peanuts? A candied violet for the wine bonbons? It's up to you! For more yummy and easy recipes, check out my blog, at www.approachingfood.com! You can also follow me on twitter @approachingfood, or on pinterest @approachingfood. Even on fb: www.facebook.com/approachingfood! \nRead the question below and select from the following choices.\nA: Chocolate Coverd Peanut Butter Cups (Reese's)\nB: Topping 'em Up!\nC: Enjoy\nD: Melt and Mix Ingredients", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_43_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_43_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_43_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_43_3.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: What You Will Need\nB: How to Make Bulgogi\nC: Melt Your Chocolate\nD: \"Paint\" the Sides of the Mold", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Add the Vegetable Stock and Mix Well With a Spoon.', 'Knead Again for About a Minute and Let Rest for 10 Minutes.', 'Cook Seitan']", + "context": "Here is the context of these images:\n. Ingredients:1/4 cup flour1/4 cup corn flour (or fine grind corn meal)1/4 cup soy flour1 1/2 cup vital wheat gluten*1/2 tsp salt1/2 tsp baking powder1 1/2 cups vegetable stock**another 2 quarts vegetable stock** *Vital wheat gluten or wheat gluten can be found in regular grocery stores and health food stores. 
**The vegetable stock gives the seitan it's flavor, feel free to add other spices or seasoning to your taste ie; garlic, soy sauce, tamari etc.Equipment:measuring cups and spoonslarge bowlspoon for stirringsifterlarge cooking pot. . . . . . Knead again, this time putting pressure in the centre of the dough so that it forms a ring.\u00a0Cut apart the ring.\u00a0Then slice ~1cm thick pieces.. Bring the 2 quarts vegetable stock to a boil and add the slices of dough.\u00a0Stir occasionally so that they don't stick together.Turn the heat down and let simmer for 20 minutes.\u00a0After 20 minutes, remove pieces from the pot and place on a plate to cool.. Seitan is best sauted before eaten.\u00a0 It's yummy in stir-fries or try it in fajitas.\u00a0\u00a0 Seitan can be stored \"as is\" in the freezer, I usually divide the pieces up into meal size portions and store it in a freezers bags.\nRead the question below and select from the following choices.\nA: What You Will Need\nB: How to Make Bulgogi\nC: Melt Your Chocolate\nD: \"Paint\" the Sides of the Mold", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_14.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_44_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_27.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_44_28.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Making the Egg.\nB: Cheers to Valentine's Day!\nC: Preparing the Oven for Broiling\nD: Broiling the Biscuits", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Getting Ready and Mixing It Up.', 'Wait a Few Minutes and Start on the Egg Mcmuffin.', '@placeholder', 'Make the Sandwich.']", + "context": "Here is the context of these images:\n. Ingredients 2 cups flour (all purpose or whole wheat) 1/3 cup oil 2/3 cup buttermilk 2 tsp baking powder 1 tsp baking soda Pinch of salt. 1 - 2 eggs per Mcmuffin. 1 tsp water per Mcmuffin. 1 small dab butter per Mcmuffin. 1 slice cheese per unit (or make your own) 1 slice luncheon meat per unit Mayonaise. (Tons of instructables on making home made mayonnaise) Utensils: Stove capable of supporting 425 degrees F. Microwave Microwave proof egg containers Biscuit cutter Medium to large cookie sheet pan. Large sturdy wooden spoon Rolling pin or equivalent. 
1 mini food processor Large bowl to combine measure 1 tsp measuring spoon 1 cup measuring cup 1 tbl measuring spoon. \n Preheat oven to 425 degrees Fahrenheit. In the large mixing bowl add the dry ingredients (salt, baking powder, baking soda, and flour) stir around to evenly distribute them. Take the cup and fill it one third full of oil. Does not have to be perfect but close is important. Fill the rest of the cup with buttermilk. Pour that into the flour mixture. Stir it all until it becomes a dough and is in a ball. Do not over mix. Take the ball and spread it out evenly as much as you can on a prepared counter, You can either use your hands like I do or use the rolling pin.\u00a0 Use a biscuit cutter ( I use a tin can with both ends removed) to make biscuit dough shapes Put in the baking pan. Between the oil and the buttermilk, you should not have to grease the pan. Though it would not hurt to do it is the first time you do this recipe. Note: I used whole wheat flour instead and that accounts for the darker color dough in the next step).. By this time the oven should be ready. Notice the time. Put the cookie pan in the oven. Keep an eye on the biscuits so they does not burn. Should not take more than 12 minutes at most. Watch carefully!! The edges of the biscuits will plump up and turn brown. After 10 to 12 minutes put on the cooking mitt and remove the cookie pan. Note: Biscuit cutter was made from a small mushroom can.\u00a0. Yes, put the cookie pan on top of the stove or somewhere safe from the hot bottom of the cookie pan so the biscuits can cool and rest a bit. While that is cooling, lets put together the other parts. If you want to make your own cheese, see https://www.instructables.com/id/Our-pizza/. I will probably use just a slice of store bought cheese for this set up. Take the slice of cheese and cut it with the biscuit cutter. Do the same with the luncheon meat. Note: I probably could of made the biscuits a bit bigger and thicker.. 
Get your microwave safe little bowl and crack one egg. then add one teaspoon of water plus a dab of butter. Stir well. Cover the egg container, but do not seal it closed. Cook in the microwave about 45-50 seconds per egg.\u00a0 Your microwave may vary. The egg should be well cooked and firm.. Take one of the biscuits and slice it into two disks so to speak. Add mayo, mustard, or whatever on the inside sides of the biscuit. Add the micro-waved egg, meat, and cheese. MMMM goody goody! Coming Soon: Beat the cost of a fast food hamburger.. if you like biscuits and gravy like I do, if you have time you may want to go one step further. Take several slices of bacon or other meat that will make grease when you fry it. Fry the meat, but keep the stove on. (substitution: take solidified fat, lard, or butter) Remove the meat (if you did not use the fat or butter) and put on some paper towels. Remove all the oil except for about a tablespoon and a half. Add about a tablespoon and a half of flour. mix well with a heat proof whisk. till it is light brown. Add a cup or so of milk. Stir till you get a nice thick brown slurry. Add a little black pepper and mix it in. Pour\u00a0 the mixture on the biscuits. Crumple the fried bacon on top. (if you fried the bacon) Eat and you are in high heaven.. Biscuits do not have to be round. 
You can make them any shape you want.\nRead the question below and select from the following choices.\nA: Making the Egg.\nB: Cheers to Valentine's Day!\nC: Preparing the Oven for Broiling\nD: Broiling the Biscuits", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_45_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_45_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_45_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_45_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_45_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_45_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_45_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_45_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_45_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_45_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_45_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_45_11.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Lazy Valentine's Brownie\nB: Add Marshmallow Fluff and Brownie Mix\nC: Prepare the Flour\nD: Measure", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Melt the Chocolate', '@placeholder', 'Making the Brownie Mixture', 'Baking the Brownie']", + "context": "Here is the context of these images:\n. 1. Plain Flour: 1 1/4 cups 2. Baking soda: 1/2 teaspoon 3. Salt: 1/2 teaspoon 4. Chocolate chips: 1 cup 5. Unsalted Butter: 150 grams6. Vanilla extract: 3 teaspoons7. Cocoa Powder: 3 tablespoons8. Yogurt: 3/4 cup 9. Skim milk: 1/4 cup 10. Sugar: 1 cup . 1. Add the chocolate chips and butter to a microwave safe bowl and microwave for 1 min2. Mix the melted chocolate and butter together until combined3. Add the vanilla extract and cocoa powder to the chocolate mixture and combine to form a smooth batter. 1. Add the flour to a mixing bowl2. 
Add the baking soda and salt to the flour 3. Mix well and set aside . 1. Add yogurt, milk and sugar to a bowl and mix together2. Add the chocolate mixture to this and combine 3. Add the plain flour in and combine everything using a whisk to form a cakey batter . 1. Line a baking dish with baking paper and grease with butter or oil spray 2. Pour the brownie mixture into the baking dish3. Bake the brownie at 180 degrees for 35 mins, let it cool down and enjoy!\nRead the question below and select from the following choices.\nA: Lazy Valentine's Brownie\nB: Add Marshmallow Fluff and Brownie Mix\nC: Prepare the Flour\nD: Measure", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_46_20.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + 
"visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Spicy Chicken Curry\nB: Put Them Together!\nC: Prep Ingredients\nD: And Enjoy!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Put It All Together', 'Enjoy']", + "context": "Here is the context of these images:\n. Here are the ingredients you will need2 Chicken breast or any meat you have 1 Onion- chopped 1 can of Chopped tomatoes or 5 large tomatoes- cubed 1 tablespoon of Tomato puree 2 Scotch bonnet (habanero) 1 Red Bell Pepper (Tatashe) 1 tablespoon of vegetable oil I clove of garlic (optional) Curry Thyme 2 stock cubes Salt to taste. Chop bell peppers and onions into cubes Finely chop 2 Habanero peppersThen using a different chopping board cut the chicken breast into bite sized chunks. Heat up the vegetable oil in a large pan (you need as little oil as possible-just enough to stop the chicken from sticking to the pan).When the pan gets hot, fry the chicken for 3 minutes or till in turns white.Then add the peppers, onions and garlic and continue to cook till the chicken is golden brown.Then add the scotch bonnet, curry, thyme and stock cubes.Then add the tomato puree and stir fry.After 1 minute of stirring, add the can of chopped tomatoes and salt to taste, then simmer on medium heat for 10 minutes or till the tomatoes are cooked.. 
And voila all done!!!Serve as a dip for chips, fries, baked potatoes or on a bed of steamed rice and enjoy :) Nemi.\nRead the question below and select from the following choices.\nA: Spicy Chicken Curry\nB: Put Them Together!\nC: Prep Ingredients\nD: And Enjoy!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_47_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_47_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_47_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_47_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_47_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_47_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_47_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_47_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_47_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_47_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_47_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_47_11.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Crispy Fried Tofu\nB: Marinade Chicken\nC: Tofu\nD: Make Dry Mixture", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Oil', 'Golden', 'Done!']", + "context": "Here is the context of these images:\n. First off grab your tofu, firm or extra-firm works best. Cut it into small cubes with a sharp knife. Blot it with a paper towel to remove excess moisture. Tip: Store the leftover tofu in a container filled with fresh cold water and keep it in the fridge. Change the water every day to keep your tofu fresh.. Turn your stovetop on high and heat up some oil (enough to generously cover the bottom) in a pan (I used extra virgin olive oil). Use this time to pick out some seasonings for your tofu. 
I went with just salt, and some Japanese mixed spices (the one I used is called S&B - Nanami Togarashi and includes\u00a0chili pepper, orange peel, sesame seeds, Japanese pepper, ginger, and seaweed).. To test if your oil is hot enough drop one small piece of tofu into the pan and if it bubbles, it's ready. Drop all your tofu in and cover the pan so that oil doesn't splash everywhere and stir occasionally (stir gently in the beginning).. Add in your seasoning to taste and stir.. Eventually the tofu will start to brown. Check often and stir to make sure they aren't burning. It's ready when the cubes have shrunk to about \u00be of the original size and they are evenly golden brown on the outside. You may also prefer to cook them less.. Pour the tofu cubes into a bowl lined with paper towel and let cool slightly before serving. Enjoy!\nRead the question below and select from the following choices.\nA: Crispy Fried Tofu\nB: Marinade Chicken\nC: Tofu\nD: Make Dry Mixture", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_48_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_48_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_48_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_48_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_48_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_48_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_48_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_48_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_48_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_48_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_48_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_48_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_48_12.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Cheescake Cookies\nB: Adding Jam\nC: Ingredients and Supplies\nD: 
Shape the Dough", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Color the Dough', '@placeholder', 'Cut the Cookies', 'The Final Cookies']", + "context": "Here is the context of these images:\n. I mostly love this cookie recipe because it is a one bowl cookie dough! You will be kneading and mixing in food coloring so the dough will be well mixed without needing to sift the dry ingredients separately.Icebox Sugar Cookie\n1 c. butter\n1 c. sugar\n1 egg\n1 t. vanilla\n2 c. flour\n1 1/2 t. baking powder\nIn a large mixing bowl cream butter and sugar until combined. Add egg and vanilla mixing well. Sift flour on top of wet mixture and before stirring add the baking powder. When you start to combine the wet and dry ingredients, the flour and baking powder will incorporate well enough throughout the dough\nDump dough onto a piece of waxed paper and divide dough into six equal pieces. You can obviously use more or less colors but I chose six.. After dough is divided choose food coloring colors and mix into dough using your hands. I find that there is enough butter in the recipe that your hands don't take on the food coloring but you could use gloves if you want make sure you don't have rainbow hands when you are finished!. Turn your imagination on high and start making your patterns. This is a relatively easy process but it does take some time and patience.\nTo make a bullseye shape:\n1. Choose a color and form a cylinder.\n2. Roll out another color large and long enough to wrap the cylinder.\n3. Gently press or squeeze dough to make sure the pieces stick together well.\n4. Keep wrapping with colors until you have the bullseye you want.\n5. Roll the completed bullseye into a longer log shape.\n6. Cut in half, thirds or as many as needed.\nTo make a flower shape:\n1. Choose a color that will be the center of the flower and form a cylinder.\n2. Roll out another color large and long enough to wrap the cylinder.\n3. 
Gently press or squeeze dough to make sure the pieces stick together well.\n4. Make a coil of dough and pinch the top to make a triangular shape. Repeat for amount of petals you want.\n5. Stick the triangles on the sides of the covered cylinder.\n6. Press another color of dough in between the triangles.\n7. Wrap the entire cylinder again with an outer color.\n8. Cut in half, thirds or as many as needed.\nThose are the two basic techniques I used but be creative and make anything you want!. Once you have all of the patterns of dough made, arrange them together to get the final pattern that will be the finished cookie.\nRoll patterns into longer logs if you want the pattern to be smaller in the final cookie. If necessary cut logs so they are all the same height.\nWrap in plastic wrap and freeze until hard - 2 to 3 hours.. Preheat oven to 350 degrees F.\nRemove cookie dough from the freezer and slice in 1/4 inch pieces. Repeat with all of your patterns (if you have more than one).. Bake for 7-9 minutes depending on how soft or crisp you like your cookies. Remove cookies from oven and let cool on baking sheet.\nColor will not fade while baking.. Eat and enjoy! 
Yum!\nRead the question below and select from the following choices.\nA: Cheescake Cookies\nB: Adding Jam\nC: Ingredients and Supplies\nD: Shape the Dough", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_27.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_28.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_49_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_30.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_49_31.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Chocolate Shadow Cake\nB: Bake the Cake\nC: Mixture\nD: Cut the Cake Board for the Top of the Cake", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Optional', \"Fill 'the Box' With Chocolates\", '@placeholder', \"Lean 'the Lid' on the Box\"]", + "context": "Here is the context of these images:\n. \nrectange cake board\ncircular cake boards\nspatula\ncake mix or your favorite scratch recipe (see my\u00a0Old fashioned sour cream fudge recipe\u00a0below)\nheart shaped cake\u00a0pan\ncake release\nrolling pin\nsaran wrap\nred or pink pearl dust\nclean (new) make- up brush\npliers\ndowel\nscissors\nsharp knife\nblack marker\nroller cutter (optional)\nred gel paste food coloring (if using white fondant)\nfondant ( you can use white and color or purchase red fondant)\ncandy cups\nchocolates (at least\u00a024\u00a0 )\nfood wrap and tin foil to cover\u00a0cake board\u00a0(optional)\nyour favorite buttercream icing (see my favorite below)\ngumtex or tylose or use\u00a0gumpaste insteadOld Fashioned Sour cream fudge cake\u00a0\u00a0Ingredients:AmountIngredient2 \u00bc cupscake and pastry flour2 tsp.Baking soda\u00bd cupbutter, softened2 \u00bc cupsfirmly packed brown sugar1/2 tspsalt3eggs1 1/2 tspvanilla1 cupboiling water3 ouncesbakers unsweetened chocolate (melted01 cupsour cream (cooled)\u00a0 \u00a0 Directions:Sift together flour, baking soda and salt; set aside. Cream butter. If you use salted butter (skip the salt). Gradually add brown sugar and continue beating for 5 minutes. Add eggs one at a time, beating well after each addition. Add vanilla and chocolate. 
Alternately blend in flour mixture and sour cream, one third at a time, on low speed of electric mixer. Add boiling water; blend well. (Batter will be thin.) Pour into one greased and floured, waxed paper lined 9 \u00bd inches layer pan. Bake at 350 degrees for 35 to 40 minutes, or until cake tester inserted into center comes out clean. Cool in pans for 10 minutes. Remove and finish cooling on racks.Optional Filling: Kirsh Cream with Strawberries\n\t250 ml. Heavy cream 250 g. chopped strawberries (about 1 \u00bd cups)\n\t1 to 1 \u00bd tbsp. Kirsh cream or any other\n\tfruit liquer.\n\tBeat cream until whipped. Fold in strawberries and liquer and fill cake. \n\t\u00a0\u00a0Frosting: 5 squares Unsweetened Chocolate \u00bd cup butter, softened 1/3 cup water 3 cups icing sugar 1 egg Melt chocolate with butter and water over low heat; cool. (Mixture may appear curdled.) Add icing sugar and egg. Blend; then beat on low speed of electric mixer for 2 minutes. Chill until of spreading consistency.\u00a0Alternative Frosting (Bittersweet Chocolate Frosting): Amount is for a wedding cake therefore cut in half. 1 lb. Bittersweet chocolate, chopped \u00be cup heavy cream 3 tbsp. Unsalted butter In medium saucepan, boil water. In medium steel bowl combine approximately 2/3 of the chocolate and cream. Place bowl over saucepan and sir frequently until melted and smooth. Remove from heat and stir in remaining chocolate until smooth. Gradually beat in butter, 1 tablespoon at a time. Let stand until cooled to room temperature. \u00a0Bittersweet Chocolate Whipped Cream Buttercream IcingIngredientsPart One 1 lb. powdered sugar (sifted) 2 1/2 cups Crisco, 4 oz melted bittersweet chocolatePart Two 3/4 cup granulated sugar 1/2 tsp. salt 2 TBSP. Meringue powder (add 1 additional TBSP for slight crusting) 1/2 cup BOILING water (less 2 TBSP) 1 TBSP Vanilla (or flavor of your choice)InstructionsPart one... put crisco in bowl and gradually add powdered sugar. 
Beat about 5 minutes until mixture is very creamy and fluffy then add melted chocolate squares.\u00a0Set this aside.Part two... In a very clean bowl mix dry ingredients. Add BOILING water and immediately mix on high speed. Beat until stiff peaks form, about 8 minutes. When mixture begins to get stiff add flavoring.NOW combine both mixtures\u00a0and beat together for another 8 minutes. When finished, use a rubber spatula to down beat a little to remove some of the air bubbles. Frosting will be very light and creamy. Cover. DO NOT REFRIGERATE.The frosting may be kept at room temperature for 3 months. Whip with a spoon each time you use it to restore fluffiness.. Optional: Line the cake board with tin foil and food safe plastic wrap (this is not necessary but makes it easier to wipe messes off the board) I usually use neutral gold or silver gift wrap I purchase at Michael's , but I had run out. . Bake 2 heart shaped cakes. I always use a generous amount of Cake Release to prevent the cake from sticking. Level the cake, but cut \u00a0the one for the top (lid) of the cake a little shorter\u00a0than the bottom and place it on a circular cake board. Put it aside.\nPut the bottom cake on the main rectangular cake board. Fill the bottom cake \u00a0with filling of your choice (this is optional). Ice the cake, being sure to fill in the area where the cake was cut to fill, if you filled it. This doesn't have to be a thick layer covering everything, only a crumb coat. If a few crumbs mix in, it's not a big deal. Smooth as best as you can.\nRepeat for the top of the cake. It is important to get the top of the cake very smooth, as you will be placing fondant on top of it.\nTip: Take a metal spatula, soak it in boiling water (I use a pot on the stove)\u00a0 and wipe the\u00a0water off on a clean tea towel, then\u00a0smooth the icing with the dry hot spatula over the surface of the cake.\u00a0\u00a0Then remove excess icing off spatula. 
\u00a0Keep repeating until your cake is smooth. Add\u00a0about 2 Tblsp of \u00a0tylose or gumtex to\u00a0your\u00a0fondant, roll out and cut into a long strip. Alternatively, you can use gumpaste that can be purchase at Michael's craft store or any cake decorating store. But you will still have to color it.\nMake sure the strip is wide enough to go about a 3/4 of an inch above the cake (measure with chocolate on top) and let\u00a0the strip\u00a0dry for about 15 minutes. It needs to be dry enough so it won't sag or droop.\nCarefully\u00a0place the\u00a0strip (you will\u00a0likely need 2) \u00a0around the cake,\u00a0and close the seam at the back with a little water.\u00a0. If you have a\u00a0sugarcraft gun,\u00a0 then use the rope attachment to make the rope border.\nIf you do not, then roll out 3 narrow strips with the flat of your\u00a0hands and twist the pieces\u00a0\u00a0together. Don't worry if it doesn't go all the way around. You can do it in pieces and use a little water to 'glue' it together - it won't be noticeable.\nThen 'glue' the strips on the cake with a\u00a0 little water. Do a little strip of rope\u00a0for the seam at the back. And you will also do this for the top of the cake when the time comes. . Fill the surface of the cake with chocolates in candy cups (you can buy at Michaels;) . You will need at least a 24 chocolates. . Outline the circular cakeboard and cut to fit under the top\u00a0cake. You will need this to support the cake. . Roll out colored fondant (1/8\" thick ) and cover the top of the cake. I usually just guage how much I need by looking at it. But you can tell approximately how much you'll need be measuring the cake across and adding a couple inches all around. You can cut off the excess and reuse. If you have no idea how to smooth fondant on a cake, google it - there are lots of tutorials. 
Some prefer a smoother, but I use my hands (wedding ring off!)\nPlace on cake, smooth and trim.\nTip #1: Stick\u00a0the top of the cake\u00a0in the freezer for 10 minutes while you roll out your fondant - this makes it easier to cover with fondant. Don't leave it longer than 10 minutes!\nTip#2: To transfer the fondant, I roll it up a little with my rolling pin and gently unroll over the cake. . \nRoll out the remaining \u00a0fondant with gumtex or tylose (or gumpaste) \u00a0as thinly as possible (as least half as thin\u00a0as you rolled it to cover the cake)\n\u00a0Cut two lengths of the\u00a0fondant (or gumpaste) \u00a0the same length and width. These will form the loops. I generally cut mine around 7.5 cm/3 inches wide and about 15 cm/6 inches long. The length of these loops will determine the size of you bow, so If you want a bow about 10cms/4 inches long the loops will need to be a little more than double that length when you cut them. Its a little bit of trial and error, but the length can be adjusted after they' ve been cut quite easily.\nTurn one loop piece over and pinch the ends together, then do the same with the other end, and pinch the two ends together. Put some saran wrap in the bow pieces to set it in place.\nRoll out the tails of the bow in the same manner as the loops but make them a little thinner, maybe \u00be of the width of the loop pieces. Cut the ends at a 45 degree angle. Pinch them as you did the loop piece.\nMake the centre knot with another piece of fondant, rolled and marked in the same manner as the other pieces, but only make it about \u00bd the length of the tail pieces. The knot is just a short strip (maybe 1' by 1\") and it is just wrapped around all the other scrunched up ends so that there arent any rough edges showing. 
It doesnt need to go all the way around the back of the bow, just tuck the edges under so they dont show.\n\"Glue\" the pieces together with a little water on a paint brush\nCut a long, narrow strip and put directly on the cake\nDry brush on the red or pink pearl dust (I use a never used new make-up brush).\nThen place the bow on the cake on top of the narrow strip.. Take a wooden dowel (I use the wooden ones you can buy in art section at Michaels and boil it) , cut to size with pliers and sharpen with pencil sharpener. It should stick out about 1- 1 1/2 \"\u00a0above chocolates. Carefully place the top of the cake onto the sharpened dowel. You may need to poke a little hole in it from behind first (through the back and into the cake board.) You want it resting just above the rim of the bottom cake, so it doesn't put weight on the rim and wreck it.\nServe and enjoy!\nRead the question below and select from the following choices.\nA: Chocolate Shadow Cake\nB: Bake the Cake\nC: Mixture\nD: Cut the Cake Board for the Top of the Cake", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_14.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_50_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_27.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_30.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_31.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_50_32.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Cowboy Beans\nB: Prep the Beans\nC: Looking Good\nD: Popeye's Red Beans and Rice...Hacked!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Boil Water', 'Saute the Beans', 'Serve']", + "context": "Here is the context of these images:\n. green beans (figure 1/4 pound per person)almond slivers2 tbsp unsalted buttersalt and pepper. To prepare the beans, the stems and ends of the beans must be either cut, or pulled off. Full length green beans are also a bit unwieldy to eat, so it's a good idea to cut or break them in half.Place the prepped beans into a bowl or colander and rinse under cool running water.. Bring a large pot of salted water to a rolling boil.. 
Cook the green beans in boiling water for 5 minutes.. Strain beans into a colander and run them under cold water. Better yet, place them in an ice bath in order to fully stop the blanching process.If you are cooking the beans ahead of time, you can take them and put them into the fridge at this point.. Saute the green beans in 2 tbsp of unsalted butter. Salt and pepper the beans to taste. I like to add in some (around 1/4 cup) of almond slivers just before the beans are finished cooking. . Plate the beans, serve and enjoy!\nRead the question below and select from the following choices.\nA: Cowboy Beans\nB: Prep the Beans\nC: Looking Good\nD: Popeye's Red Beans and Rice...Hacked!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_51_15.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: under\nB: Call Me Bloody\nC: Drain and Cool\nD: Color Me Red", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Dredge', 'Fry', 
'@placeholder', 'Serve']", + "context": "Here is the context of these images:\n. This is probably going to be the hard part.Snakes do a fine job keeping the world free of unnecessary rodents; don't kill them unless absolutely necessary! That said, if you do kill a snake, or find one dead, don't let it go to waste.The snake in this Instructable was run over by a car; Eric found it a couple minutes later, its heart still beating, in the process of expiring by the side of the road. Since we knew both time and cause1 of death, and refrigerated the carcass promptly, it was safe to eat. A bit of internet research identified it as a probable Black Rat Snake, a non-poisonous Indiana resident. 1 Note that snakes can also die from eating poisoned rodents. You dont want to eat a snake dosed up with warfarin or other toxin2. Pay attention to context.2 It's apparently fine to cook and eat poisonous snakes- cooking is sufficient to inactivate any venomous residue. . Cut off the head, strip off the skin, and remove the guts as described in this Instructable.Rinse the carcass, and wipe down with a clean paper towel, then cut the body in to manageable lengths with a sharp knife or pair of poultry shears. . We're going to treat the snake much like you would a small lake fish, though you can also treat it like chicken. This is my favorite way to cook bluegill. \nI dipped the segments in a bit of egg white (milk would also do) before dredging them in a pepper and sweet cornmeal mix (actually just Jiffy mix with some extra black pepper).\nKnock off the excess.. Heat about 3/4\" of canola, vegetable, or peanut oil in a heavy frying pan (I prefer cast iron) until quite hot. A bit of dry batter should bubble nicely.\nAdd the snake pieces one at a time to avoid dropping the temperature in the pan too quickly. 
\nUse tongs to keep your fingers away from the sizzling hot oil, watch for dangerous splatters, and use a screen if necessary to prevent mess.\nTurn the snake pieces just as the batter begins to turn golden- by the time it starts to brown the snake will be overcooked. There's not much meat on the bones, and the muscles are thin and lean. (Yes, we mostly overcooked ours, but it was still tasty.). Remove the snake pieces before they're quite done- they'll continue to cook after removal from the pan- and set them on paper towels to drain and cool.\nIf you've still got more batter, chop up some veggies, dip them in the egg whites and/or milk, dredge in batter, and fry. You can also just mix the liquid into the batter and fry hushpuppies. It's all good.\nWe fried some fresh okra from the farmers' market.. Serve your fried snake bits warm, and provide napkins- this is finger food. Accompany with most anything you'd serve with fried fish. \nThere should be a line of muscle along either side of the spine; this is the thickest piece of meat on the snake's body. The ribs are quite firmly attached to the spine, so scrape your teeth over them firmly to remove the rest of the meat from the ribs.\nSince our snake was a bit overcooked it mostly tasted fried, but some of the thicker bits had a distinctive nutty snake flavor. 
I'm definitely looking forward to getting my hands on another (hopefully bigger) snake and trying this again!\nRead the question below and select from the following choices.\nA: under\nB: Call Me Bloody\nC: Drain and Cool\nD: Color Me Red", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_52_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_52_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_52_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_52_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_52_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_52_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_52_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_52_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_52_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_52_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_52_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_52_11.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The 5 Minute 35¢ Pizza\nB: Da Bac\u00f6n\nC: Moar Paper\nD: Finishing Touches", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Preparation', 'Cooking', '@placeholder']", + "context": "Here is the context of these images:\n. For this pizza, you will need...\n -2 english muffins\n -Cheese of your choice (I used mozzerella and cheddar)\n -Topping (optional, I used the classic pepperoni)\n -Microwave (duh)\n -Freezer (optional)\n -Ketchup. Cut the english muffins in half by piercing the sides with forks, add the cheese and topping (whatever they may be) and arrange on a plate.. Put the pizza muffins in the microwave for 1- 1 and a half minutes. Cooking times might be different depending on the kind of microwave you have, so just try it out, and see how long it takes to melt the cheese. 
Once te cheese is melted, take it out of the microwave and marvel at the cheesy goodness.. I heard everyone reading step one, you all said, \"What do we need a freezer for??? Is he out of his mind?\" I assure you, I am out of my mind, but not on this subject. Put your pizza muffins in the freezer for about a minute to aid in the cooling process, it makes them cool quickly, but not too quickly.. Cut your pizza muffins in half and add ketchup to the side. Viola! Enjoy the cheesy goodness you have been craving. adios amigos.\nRead the question below and select from the following choices.\nA: The 5 Minute 35¢ Pizza\nB: Da Bac\u00f6n\nC: Moar Paper\nD: Finishing Touches", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_53_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_53_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_53_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_53_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_53_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_53_5.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Outback \"Copy Cat Recipe\" From Aunt Jo\nB: Next, Make a Bath\nC: Fold and Crimp\nD: Cut the Retention Band", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Cut Top Off', '@placeholder', 'Fold Flaps', 'Fold Down Retention Band']", + "context": "Here is the context of these images:\n. You'll need one standard milk cartonbox cutterpencil or permanent markerruler. Measure 1 cm down from the top edge of the carton body. Mark an horizontal line on each side. Be careful, knives are sharp! Cut along these lines through all four sides. Remove the top and discard.. Measure down 2.5 cm from the new top edge. Mark an horizontal line at this point on three sides. Cut along line through three sides. Leave band attached to one side of carton body.. 
Measure up 9 cm from bottom of carton, make a mark at corner junction. Make a cut from this mark, through the corner up to the band.. There should be three flaps and one flap with an attached band.. Fold three side flaps down evenly into the cavity of the carton.. Finally fold down the last flap with the retention band. Work the band down around the side of the carton to form the closure. And Voila!This is my very first instructable! Please comment! And I hope you will enjoy!\nRead the question below and select from the following choices.\nA: Outback \"Copy Cat Recipe\" From Aunt Jo\nB: Next, Make a Bath\nC: Fold and Crimp\nD: Cut the Retention Band", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_54_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_54_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_54_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_54_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_54_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_54_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_54_6.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Classic Hot Ham and Cheese With a Garlic Italian Twist\nB: Add the Two Creams\nC: Boiling and Stirring Until Thickened\nD: Drain the Pastaand", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Preparing to Cook the Diced Chicken Breast', '@placeholder', 'Tossing to Coat Pasta With Sauce', 'Plate and Serve']", + "context": "Here is the context of these images:\n. Cook 2 cups of whole wheat Penne pasta according to directions on package. Drain well and set aside. While you are boiling the pasta you can mix the spices, including the corn starch in a small bowl, then set aside. If you have allergies to corn, than you can use arrowroot starch. 
You will find this starch in your local \"Whole foods Market\" or other local health food store. The spices you will need: 1 tablespoon sun-dried tomatoes flaked or powder 1 teaspoon coarse sea salt 1 teaspoon marjoram 1 teaspoon dried thyme 1 teaspoon dried oregano 1 \u00bd teaspoon sugar 1 tablespoons dried onion 1 teaspoon corn starch or arrowroot powder \u00bd teaspoon black pepper \u00bc teaspoon garlic powder You may not find flaked or powdered sun-dried tomatoes. Just buy whole dried sun-dried tomatoes and place one or two small dried tomatoes in a food processor and process to flakes or powder. It is your preference. By the way the way the colander in the image was part of a prize I won here at Instructables. It included this colander, a Instructable T-shirt and a 257 page book called \"GLUTEN - FREE ON A Shoestring\" (prize value was worth $65.00). The book includes 125 easy to make gluten free recipes. You can view the Instructable I entered in the contest, then you can do so by linking here: Almond Flour Honey Cake. If you haven't entered a contest here at Instructables, you really need to do so. It is a lot of fun. But first on to the next step ------------------->>>. You will need 1 pound of chicken breast, and cut it into 1 inch chunks. One pound is usually 2 or 3 breast.. Melt 2 tbsp. of unsalted butter in a large skillet over medium-high heat. Add the diced chicken; cook and stir 5 minutes or until lightly browned. You should not see any pink remaining in\nthe meat before proceeding to the next step. . Next add spices, then a 14.5 oz. can of petite cut tomatoes, undrained. Next add 1 cup of cream or half and half followed by 1/2 cup of fresh grated Parmesan-Rigatino cheese. Mix all the ingredients until well incorporated. Bring\nto a boil, and stirring constantly until well blended and starts to thicken some.\nReduce the heat to low and simmer for 5 minutes.. . After the sauce has thickened stir in pasta and...... . toss gently to coat. . 
cover the pasta mix and let stand with heat off for 5 minutes.. \n Plate and serve with additional cheese, if desired. Prepare your favorite salad to accompany the meal. We have more recipes here at Recipes for a Healthy You as well as here at Instructables. Thanks always for taking the time to view my Instructables. Here is one that I entered into a contest, I would appreciate your vote: Blinding Baking Pie Crust [includes pie recipes] Follow us on Twitter Like us on Face Book Join our group at Google Check out what we've pinned at Pinterest Thanks for your time and viewing our Instructable. Eat and Be Healthy!! Regards, Randywww.savorthefood.com Click image and: \u00a0\nRead the question below and select from the following choices.\nA: Classic Hot Ham and Cheese With a Garlic Italian Twist\nB: Add the Two Creams\nC: Boiling and Stirring Until Thickened\nD: Drain the Pastaand", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_55_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_55_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_55_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_55_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_55_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_55_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_55_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_55_7.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Curried Sweet Potato Soup With Bacon\nB: Moroccan Three Bean and Kale Soup\nC: Clean the Kale\nD: For the Pros Out There", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Dice the Onion', 'n', 'Results']", + "context": "Here is the context of these images:\n. 
1 Large\u00a0(or 2 small) Bundles\u00a0of Kale, Torn into Bit Sized Pieces\u00a0 10 Whole Red Potatoes, Sliced Thin\u00a0 1 Medium Onion, Diced\u00a0 2 Tbsp Olive Oil\u00a0 2 Cloves of Garlic- Minced 1-1/2 Pounds Italian Sausage\u00a0 1 Tbsp\u00a0Crushed Red Pepper\u00a0(adjust to taste) 1/2 Tbsp\u00a0oregano\u00a0 2 cups Chicken Broth\u00a0 2 cups Whole Milk\u00a0 4 cups Half and Half Splash of Heavy Cream\u00a0 Salt and Pepper to Taste **Ignore the flour in the picture, there is no flour in this recipe. For some reason I had a brain fart when taking this picture and added it in. **. First tear kale into bit sized pieces and rinse with cold water. Set aside.\u00a0. Thinly slice the red potatoes. Boil sliced potatoes until tender. Drain. . Heat oil over medium high heat, add minced garlic and onion and cook until slightly browned. About 3-5 minutes.\u00a0. Add italian sausage and crumble while it cooks. Drain off as much fat as possible.\u00a0. . Add Whole Milk and Half and Half, let simmer for about 30 minutes.\u00a0. . I like to only add about 1/3 of the kale and then save the rest of it for right before I heat up the left overs so I have some semi-crunchy kale in the soup as well as cooked kale.\u00a0. Add a splash of Heavy Cream at this point, unless you are making this soup ahead of time or for left overs, and then leave the added heavy cream for right before you eat your serving of soup.\u00a0. This soup is amazing for those cold winter nights. It's filling and warm and so delicious. 
Who needs to go to Olive Garden when you can whip this soup up quickly and have leftovers for many bowls to come.\u00a0\nRead the question below and select from the following choices.\nA: Curried Sweet Potato Soup With Bacon\nB: Moroccan Three Bean and Kale Soup\nC: Clean the Kale\nD: For the Pros Out There", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_56_17.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Soft Pretzel Bites\nB: Combining Ingredients.\nC: Ingredients\nD: Egg Wash (Optional..but Optimal!)", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'The Dough', 'Boiling Your Pretzels', 'Baking and Finishing']", + "context": "Here is the context of these images:\n. 
1 package of yeast1 1/4 cup of warm water (110 degrees)1 tsp sugar3 cups of bread flour3 tablespoons brown sugar1 tablespoon butter melted1 tsp salt6 cups of water1/2 cup of baking soda1 /2 cup sugar1 tablespoon cinnamon1/2 cup butter melted. Lots of people are afraid to cook with yeast. It is really easy to do. Mix the yeast, 1 tsp of sugar and 1 1/4 cup of warm water. Make sure the water is around 110 degrees. Also check the date of the yeast. It may not work if it is old. You are going to let it rest for 10 minutes. It will start to look like it has bubbles.. Mix the yeast mixture, brown sugar, salt and 1 tablespoon of butter. Mix this with the wire whisk attachment. Now start adding the flour. When it starts to separate from the bowl, change the attachment to the hook. Now finish mixing with the hook. Finish mixing with the hook and add the rest of the flour. It will be sticky but easy to pull out of the bowl.. Put a little oil in your bowl and cover the bottom and sides. Now add the dough and cover with plastic wrap. Put the bowl in a warm spot. Let it rise for a hour.. When the dough has risen you are going to heat your water. Put 6 cups of water in a large pan with the baking soda. Let the water start to heat. Turn your oven on to 425 degrees. Also put a piece of parchment paper on your cookie sheet. Take the dough out and cut in to four pieces. Now roll out each piece in a strip that is around 30 inches long. Cut each rope into pieces an inch wide. Drop the pieces into the hot water for 10 seconds. Drain and put on a cookie sheet with parchment paper.. Bake the pretzel bites for 10 minutes. After you take them out of the oven melt the butter in a glass bowl. Put the pretzel bites in the butter and then drain and put in a separate container. Mix the sugar and cinnamon. Put the pretzel bites in a brown paper bag and pour in the sugar mixture. Shake and put them in a container with a lid. Try not to eat them all while they are warm. 
They taste just like Auntie Anne's.\nRead the question below and select from the following choices.\nA: Soft Pretzel Bites\nB: Combining Ingredients.\nC: Ingredients\nD: Egg Wash (Optional..but Optimal!)", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_57_18.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Press/form\nB: Lumberjack Cookies C. 1917\nC: Bacon Cheesecake Brownies\nD: Video Recipe and Youtube Channel", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['\"Cream\" the Bacon Fat', 'Add Flour', '@placeholder', 'Bake']", + "context": "Here is the context of these images:\n. Add 1c. Bacon fat to 2 cup white sugar, 1/4 cup brown sugar, 2 tsp salt, and 4 tbs molasses. In a stand mixer, mix on high for a few minutes.. 
Ad 2 eggs and cream again. Your batter should form stiff peaks before you add spices and flour. . Add the following to the mixture:\n1 1/2 teaspoon ground cinnamon\n1 1/2 teaspoon ground ginger\n3/4 teaspoon ground cardamom\n1/4 tsp black pepper. Add 2 1/2 cup flour, mix on high. . I had a cookie press so I decided to use that for these cookies - but as long as each cookie is about the size of a tablespoon they should be the right size. They flatten out while baking.. Bake the cookies in a 350 degree oven for 8-10 minutes - you don't want the bottoms to get burnt.. Great with milk, or ice cream. They are rich! This recipe yields many cookies, so bring them into work, or share with friends. \nRead the question below and select from the following choices.\nA: Press/form\nB: Lumberjack Cookies C. 1917\nC: Bacon Cheesecake Brownies\nD: Video Recipe and Youtube Channel", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_17.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_58_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_58_21.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Potato Volcanoes\nB: Wash\nC: Volcano Potato\nD: Deep Fry Time", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Prep the Ingredients', 'Roll and Coat', '@placeholder', \"It's Serving Time\"]", + "context": "Here is the context of these images:\n. For this mouth watering snack, you'll need to include the following in your shopping cart:PotatoesBeansCarrotsGreen ChiliesBread CrumbsCorn Flour SlurryGinger Garlic PasteSalt Oil. The first and the most important thing to do is to prep all the ingredients before starting to cook. This is a pretty straight forward recipe once all the ingredients are ready.Boil a few potatoes and mash them well in a bowl. Also chop down the beans and carrots into small pieces and boil them till they're soft enough. . The next thing to do is to prepare the mixture that's going to be deep fried. Add the boiled vegetables in the bowl of the mashed potatoes and mix it well. Add salt, Red Chili Powder and a Teaspoon of cumin seeds to it. Also add in about half a cup of bread crumbs to it and mix it well.. Next part is the fun part. Take the mashed veggies and roughly make it into a rough sphere in your hands. Now take small chunks from that sphere and roll them into small balls using your hands and keep them aside. It's time now for coating the balls. Dip them in the corn flour slurry and then in bread crumbs. Make sure the balls are completely covered in bread crumbs. Coating the kebab balls is a completely optional step, but this definitely adds a lot of crispiness to the lollipops when they're fried. . 
The next step is to deep fry the kebab balls. Heat some oil in a deep frying pan. Now dust off any extra bread crumbs sticking on to the balls and slowly drop them into oil and fry it until it turns golden yellow in color. Once it's all fried properly, remove it out of the oil and drain out any excess oil from it by placing it on a paper napkin.. After deep frying the balls, it can be savored directly, but what's a lollipop without a stick.So now take a few tooth picks and gently fix them into each of the deep fried balls. Now top it up with some coriander leaves and also some chili flakes if you'd like. Serve it while hot with some tomato ketchup or yogurt dip.Bon Apetit !!\nRead the question below and select from the following choices.\nA: Potato Volcanoes\nB: Wash\nC: Volcano Potato\nD: Deep Fry Time", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_18.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_59_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_59_22.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: How to Make Your Own Sprinkles at Home\nB: The Legs and Shelf\nC: The Mould\nD: Burn Off the Inside", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['The Dough', '@placeholder', 'The Mould', 'The Glaze']", + "context": "Here is the context of these images:\n. I used Serious Eats' \"Best Chocolate Chip Cookie\" recipe, but bastardized it* by adding a half cup of flour and beating the dough longer than recommended, so it would be easier to roll out. Use whatever dough you want, just make sure it's stiff enough that you can manipulate it a little.\u00a0 *I did make one attempt with the original recipe--more on that towards the end.. Make sure to use mini-chips and/or hand-chopped chocolate. If your chunks are too big, you get weird holes rolling out the dough.. I used a mini popover pan lined with strips of parchment paper for the outside, and wrapped corks in foil for for the center piece.. Not sure what would hold up the best, I experimented with another centerpiece made from a paper towel tube and masking tape (which yes, is safe in the oven).. I used plenty of flour and a pizza cutter to get my lines straight.\u00a0 My first attempt was wrapping the dough around the original shot glass, but I found it more effective to use the centerpiece that would eventually go in during the baking process. And it's much easier to get the parchment paper around before you put it in the pan. Use a real shot glass to make a circle for the bottom of the cookie shot glass, the use the aluminum foil-cork-plug to smash it all together. 
Straighten out the top edge with your finger if it gets crooked. It won't fix itself while it bakes. I greased some of them and found that (with this pan, at least) it didn't make a difference.. My preferred temperature setting was basking at 375F for 10-14 minutes.\u00a0 I tried the first batch at 350F and they were meltier and puffier, while the higher temperature made the next batch take their shape more quickly. The aluminum foil was definitely more effective than the cardboard. On second batches, I filled the cardboard with foil so the cookie wouldn't puff up inside, but the cardboard middles were a lot stickier to pull out. It's best to pull out the centers while they're still warm, before they fully set, just don't burn yourself. You can see the one that I didn't give a plug to just filled right in. Cookie Shot Glass fail.. \n My obsessive tendencies won--two days later, I had to test the glaze ideas. First, wait 'til your cookie shotglasses are completely cool before glazing. Here are the two types I tested: 1) A \"Confectioners Glaze\" -- this is what I grew up knowing as \"cinnamon roll frosting.\" A little powdered sugar, splash of vanilla, and a couple teaspoons of milk. I like to make mine thick--like a thick paint. 2) A variation on Royal Icing -- an idea inspired by my days of using Royal Icing to glue together my gingerbread mansion/castle/lighthouse/city, this stuff hardens like glue when it dries. For my little test batch, I used 1 pasteurized egg white, a splash of vanilla, beat it in a mixer til frothy, and added about 3/4 c. powdered sugar. I used clean paintbrushes to brush the inside, and also tested the pour-n'-swirl method--which wasn't really as effective since my glazes were fairly thick. I recommend continuing to re-distribute the frostings with the paintbrush as they dry, since they sink to the bottom. You can also let them dry on their sides and roll em around as you see fit. For each different glaze, I tested one single coat vs. 
two coats.\u00a0 I tested one unheated test a couple hours after applying a glaze to see if it was holding at all--and seemed to work pretty well I painted a few more, and let them sit overnight.. \n In the morning, they were dry. I reheated one of each type in the oven at 350F for 5 minutes, and then added milk: The confectioner's glaze worked great for both the single and double coats, but the royal icing was not a success (which makes sense, now that I think about it).\u00a0. The regular cookie dough (which had less flour and was beaten less, making it softer and more malleable) did not yield a successful cookie shot glass. Its structure was much looser and, while still delicious, was extremely porous and was difficult to get the foil plug out of. Also, if you don't line the outside of the cookie shot glass with parchment paper, it doesn't stick, but it does spill over the edge, making a weird, muffin-top-y-style glass. Lastly, when the bottoms of my first batch turned out thicker than I would've liked, I tried putting no bottom at all on some in the second batch, hoping that the dough would melt and drip down... ...but since I switched to a higher temperature on the second batch, this didn't happen at all, so I got a cookie spyglass instead.. Edit: leaving this here so you guys can see the failure caused by not glazing. They only kind of worked. A couple of them held milk for almost a few seconds before seepage occurred, and since this article\u00a0that revealed that Dominique Ansel uses a glaze inside the cookie to keep the milk in didn't come out until I was finished with my experimentation, I haven't had a chance to try that....yet. But I'll update you once I do.. Final thoughts: for all the work, $3 a pop for Dominique Ansel's seems pretty reasonable to me if you're in NYC. I hear they come with complimentary vanilla milk refills. 
I'll definitely be checking them out when I visit in April!\nRead the question below and select from the following choices.\nA: How to Make Your Own Sprinkles at Home\nB: The Legs and Shelf\nC: The Mould\nD: Burn Off the Inside", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_27.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_60_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_30.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_31.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_32.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_33.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_34.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_35.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_36.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_37.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_60_38.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Conchiglioni With Herbs\nB: Additional Notes/nutrition\nC: Fast Asparagus\nD: Let\u00e2\u0080\u0099s Make the Cheese Spread!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Start the Cooking', '@placeholder', 'Quick Cheat', 'Plate Up']", + "context": "Here is the context of these images:\n. To make this dish for 2 you will need the following:2 fillets of sea bass, your fishmonger can do this for youSalt and pepper1 knob of butter1 yellow bell pepper, thickly sliced and the seeds removedAround 10 cherry or vine tomatoes, roughly sliced in half or left whole1 bunch of asparagus with the tough ends broken off, this will be anywhere between 10 and 15 pieces, depends on the thickness of the asparagus. 
If they are a very thick then give them a little longer or slice them length ways2 Tbsp of spicy pesto, or any pesto you like, simply from a jar100ml dry white wine1 handful of flat leaf parsley, you will only need the leaves which you can just pick offoptional, a little water in case you want to loosen the saucesome mixed leaf to your liking, you could use rocket, watercress, spinach or what ever you like in any combination you like. The vegetables with the exception of the asparagus will be cooked in a griddle pan, you want charing so this needs to be as HOT as possible, put this on full wack.NOTE: do not put oil on the griddle otherwise you will be frantically opening windows and hitting the smoke detector. You wont need oil and it will only burn which will ruin lunch and no one wants that!Put another pan on a medium heat and allow to heat up while seasoning the fish, season both sides with salt and pepper. Salt on the skin will help to give you that infamous crispy skin. You could score the skin if it is a thick fillet which will actually help the heat to permeate allowing the fish to cook quicker. But there really isn't any need.TIP: Never put put your fish into a cold pan, you want it to be up to temperature first for a nice crispy skin.. Put your peppers on the griddle, these are going to char and blacken which is just what we want, they will go soft and sweeten as the natural sugars cook.In the fish pan, put a good knob of butter, let this melt down for a few seconds and move the butter around the pan. Then, gently lay the fish in the pan skin side down - do not touch the fish or the pan now, it can be tempting to mess around with the fish but you want the skin the crisp up and the meat to gently cook.TIP: Don't be tempted to move the pan around and mess with the fish, just let it cook.. 
Keep an eye on your peppers, move them around.After 4 - 5 minutes you will see the edge of the fish at the thinnest points start to turn a light white colour, when this happens it is time to turn the fish. Take a fish slice and very carefully turn the fish over, keep close to the pan so not to splash butter everywhere and keep the delicate fish in one piece. Cook the fish for 2 - 3 minutes more, keep checking it to make sure it doesn't overcook/ burn.Get some foil or a plate ready for when the fish is cooked to put it to one side.Check the fish by gently lifting it with the fish slice and peaking underneath, it should be just brown, remove from the pan and put to one side.TIP: Fish is considered tricky and many people over cook it but if you keep an eye on it then it is really easy, as soon as the fish looses it raw colour and the flakes of meat just start to come away from each other it is ready. Just be patient and as soon as it is done, get it out of the pan.. Now we are coming to the end and the last of the ingredients cook super fast.Turn the peppers again and throw the wine in the fish pan, you want to save all the delicious flavour from the pan so don't wash it first. This is called deglazing the pan.Put the asparagus in the wine and put a lid on top, the asparagus will take around 2 minutes to become tender and steaming them in wine and the fish butter will make them shinny and delicious.At the same time, put your tomatoes on the griddle, they will cook fast because of the sweet sugars and the soft flesh. They will be ready around the same time as the asparagus.. Asparagus really doesn't take very long, as soon as the stems are tender use some tongs and get them out of the pan, put to one side for plating up later.Don't throw the wine away from the fish pan, this is going to be the base for the super simple sauce - the flavours of the fish and asparagus are too good to waste.. 
When it comes to sauces there is nothing more rewarding than making your own from scratch but sometimes you want something quick and easy so there is no shame in using a nifty cheat here and there.For this one the secret is pesto (you could even make your own pesto), here we used a spicy tomato pesto. Add your pesto to the wine in the pan and mix in. You may need to add a splash of water to loosen the sauce. Add the flat leaf parsley at the end and stir in.Take the vegetables off the heat and put in a bowl, set to one side. It is best to get the veg out of the pan, the griddle is a big heavy chunk of metal and will hold the heat for a while, consequently continuing to cook the food in it.TIP: When you are making sauces, a splash of water in many cases can do wonders. If you take a sauce too far or kept it warm a little too long, reduced a little too much then a dash of water can be your saving grace.. Bring your dish together and serve with a glass of white wine, spoon the sauce on and around your perfectly cooked fish.Add a light garnish of green leaf, peppery rocket works a treat here. 
Enjoy as a great quick lunch, alfresco if you can :) \nRead the question below and select from the following choices.\nA: Conchiglioni With Herbs\nB: Additional Notes/nutrition\nC: Fast Asparagus\nD: Let\u00e2\u0080\u0099s Make the Cheese Spread!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_27.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_61_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_30.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_31.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_32.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_33.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_34.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_61_35.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Marshmellow Peanutbutter Bananabarbequeboats With Honeydip\nB: Cook Chicken, Green Pepper, and Tomato\nC: Let the Dough Rest\nD: Pizzaiola", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather Ingredients and Supplies', 'Prepare Green Pepper, Tomato, and Chicken for Cooking', '@placeholder', 'Enjoy Your Quesadillas']", + "context": "Here is the context of these images:\n. 
The following ingredients will make four quesadillas.\u00a0\u00a0 One quesadilla will feed approximately one adult, so adjust the recipe accordingly for the desired number of quesadillas.\u00a0Ingredients:\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 8 \u2013 Flour Tortillas\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 4 \u2013 Thawed Chicken Breast\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 3 Cups Shredded Cheddar Cheese\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1 Packet Taco Seasoning\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1 Tomato\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1 Green Pepper\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Sour Cream\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 SalsaSupplies:\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Pizzazz Pizza Oven\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Medium Size Nonstick Frying Pan\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Steak Knife\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Plastic Spatula. 
In this step you will prepare the vegetables and the chicken for cooking.Green Pepper1.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Wash the green pepper.2.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Cut the green pepper in half and clean out the seeds and stem.\u00a03.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Chop the green pepper into pieces that are approximately \u00bd in by \u00bd in (see picture below) and set aside.\u00a0Tomato1.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Wash the tomato.2.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Cut stem and core out of the tomato and discard.3.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Chop the rest of the tomato into \u00bd in by \u00bd in pieces (see picture below) like the green pepper, and add to the green pepper.\u00a0Chicken1.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Make sure that the chicken is fully thawed.2.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Cut the chicken into small strips that are about \u00bd in by 2 in (see picture below).\u00a03.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Set the chicken aside, separate from the vegetables as you will be cooking it first.\u00a0Once you have prepared the green pepper, tomato, and chicken you are ready to begin cooking.\u00a0. 
Begin by putting the chicken in the nonstick frying pan; do not put the vegetables in at this time.\u00a0Place the frying pan on the stovetop burner and set to medium heat.\u00a0Stir and flip the chicken as needed in order to keep it from sticking.\u00a0Cook the chicken until it is cooked through, this will take about 8 minutes.\u00a0In order to test if the chicken is done, cut a piece of chicken in half.\u00a0If it is a white color inside, and all of the pink is gone it is done; if it is still pink, continue cooking until done.\u00a0Once the chicken is done, push it up the sides of the pan to make an opening in the center of the pan for the tomato and green pepper.\u00a0Turn the burner down to medium-low and add the vegetables to the center of the pan.\u00a0Stir the vegetables as needed in order to cook all sides.\u00a0Cook the vegetables for about 4 minutes or until they are warm and tender.\u00a0. Once the vegetables are done as described in Step 3, mix the chicken and the vegetables together in the frying pan.\u00a0Turn the burner down to low or simmer.\u00a0You want enough heat to keep the mixture warm, but not too much heat that it dries out and burns.\u00a0Add the taco seasoning to the center of the chicken and vegetable mixture.\u00a0Dump 1/3 cup water on top of seasoning.\u00a0Stir the seasoning and water in the chicken and vegetables until everything is evenly coated.\u00a0Continue to let the mixture simmer until you are ready to use it in Step 6.\u00a0. Place two tortilla shells on the Pizzazz, offset as in the picture below.\u00a0Since the Pizzazz isn\u2019t quite large enough, the tortilla shells will overlap.\u00a0Turn the Pizzazz on the double burner setting so heat will come from the top and the bottom.\u00a0Add 1/2 Cup Cheese to the top tortilla shell so it covers the entire shell.\u00a0Allow the Pizzazz to cook for approximately two minutes, or until the cheese begins to melt.\u00a0. 
Once the cheese has begun to melt, place one fourth of the chicken and vegetable mixture on top of the melted cheese.\u00a0Then sprinkle a little more cheese on top of the chicken and vegetable mixture on the tortilla.\u00a0Note that it may be easier to unplug the Pizzazz to keep it from rotating while adding the chicken and vegetables and the cheese.\u00a0Just remember to plug it back in after you are done.\u00a0Allow the freshly placed cheese to melt on top of the chicken and vegetables, by letting the Pizzazz cook for another minute.\u00a0\u00a0After the cheese on top is melted, take the tortilla shell that is on the bottom of the Pizzazz and place on top of the other tortilla shell, chicken, vegetables, and cheese.\u00a0Apply pressure with the spatula to get both tortilla shells to stick to the middle ingredients.\u00a0Next allow the quesadilla to cook on the Pizzazz for approximately one more minute, so it becomes a little crispy.\u00a0Remove the quesadilla from the Pizzazz using the spatula, and place on a dinner plate.\u00a0In order to make the next three quesadillas, repeat Steps 5 and 6 three more times.\u00a0Once you have used up all of the chicken and vegetable mixture, remember to turn off the stove.\u00a0Also unplug the Pizzazz once your last quesadilla is done.\u00a0. Now that the quesadillas are cooked it is time to enjoy them. I prefer to spread sour cream and salsa to the top of mine. Serve the quesadillas with chips and salsa as a side to finish of a great meal. Note that there are many possible variations to this recipe that allow you to personalize this meal to your taste. For example try adding jalapeno peppers to chicken and vegetable mixture for a little spicier version. You could also add mushrooms and onions to the chicken mixture for added flavor. Try different combinations of ingredients until you find what you like best. Most importantly, enjoy your new found recipe, and use for a Pizzazz pizza oven. 
\nRead the question below and select from the following choices.\nA: Marshmellow Peanutbutter Bananabarbequeboats With Honeydip\nB: Cook Chicken, Green Pepper, and Tomato\nC: Let the Dough Rest\nD: Pizzaiola", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_62_18.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Goi Cuon (Vietnamese Summer Rolls)\nB: Make Sugar Syrup\nC: Into the Oven\nD: Decorate and Garnish Your Dessert", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Prepare the Lemon', 'Blend Together the Drink', '@placeholder', 'Serve and Enjoy']", + "context": "Here is the context of these images:\n. This dessert focuses on utilizing different parts of the lemon to create a tasty drink. 
First you will need to cut the lemon into two halves. Next you will need to cut a thin slice about a 1/4 of an inch thick from one of the lemon halves. Set this aside on a piece of paper towel. Now, juice both halves of the lemon and pour through a strainer into a bowl to remove pulp and seeds. You can now set this aside as well. Finally we will zest the juiced lemon halves on a small-holed grater, until the shiny surface of the lemon has been removed. Lay all of your lemon parts aside on piece of paper towel and store in the fridge.. For this next step you will need a blender. I would recommend gathering your ingredients ahead of time so that the ice cream does not melt in between adding them. While these are the measurements I found to work effectively you can adjust them based off of personal preference via taste.Measure and add to your blender:1 cup ice cream1/2 cup milk1 teaspoon vanilla1/4 teaspoon peppermint extract1 Tablespoon of your lemon zest2-4 Tablespoons of your lemon juice depending on personal preference. (I added 3)Blend together until mixed, but still relatively thick. I would suggest using a low blender setting. If you are not serving the dessert immediately, put it in the fridge or freezer accordingly until needed.. Use the drink that you blended in the last step and pour until brimming in a glass of your choice. Use the flat edge of a knife to level off the drink. Now sprinkle a light coat of your lemon zest on top.You will now need your lemon slice. Cut it first in half. Cut a small slit in one half and place it on the edge of your glass. Now cut the other half into four slices and cut small slits on the peel side and place on the edge of your glass as shown in the picture.Take a pinch of cinnamon powder and sprinkle it in a decorative pattern over the top of your drink.Your Drink Is Now Complete!. 
Add a straw to your drink and enjoy outside in the warm summer weather.\nRead the question below and select from the following choices.\nA: Goi Cuon (Vietnamese Summer Rolls)\nB: Make Sugar Syrup\nC: Into the Oven\nD: Decorate and Garnish Your Dessert", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_63_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_63_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_63_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_63_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_63_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_63_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_63_6.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Pi Is Starting to Look Like a PI!\nB: William Jones' Pi Pie (mathematically Infused)\nC: Come Together....right Now....over Me....\nD: Egghead?! Custard in the Making....", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather Your Ingredients!', 'There Are More Wonderful Things to Add to the Bowl!', '@placeholder', 'Throw Me in the Fire!']", + "context": "Here is the context of these images:\n. Ingredients:\n1 pie crust (store bought or home made)\n1/2 cup pine nuts (1/4 cup chopped very finely, 1/4 chopped and reserved)\n1 tablespoon olive oil\n1 onion (chopped)\n2 cloves garlic (minced)\n2 cups spinach\n1/4 cup sun dried tomatoes (chopped)\n1/4 cup Greek olives (chopped)\n1/2 cup feta cheese (crumbled)\n2 eggs\n1/4 cup milk\n1 teaspoon sea salt\n1 teaspoon black cracked pepper\n1 teaspoon onion powder\n1/4 teaspoon cayenne pepper\n10 sheets phyllo dough (thawed)\n3-4 tablespoons butter (melted). Preheat your oven to 350 degrees.\nOn a lightly floured surface, lay out your pie dough. Sprinkle with finely chopped pine nuts and gently roll the nuts into the crust using a rolling pin.\u00a0 . 
Lay the pie dough into the bottom of a glass pie plate (9 inch). Set aside.. In a saute pan, on medium heat, add olive oil.\u00a0 once oil is hot, add onion.\u00a0 Saute for 3.14 minutes and then add garlic to the pan.\u00a0 Continue to saute for an additional 3.14 minutes.\u00a0 Add spinach to the pan and saute for an additional 3.14 minutes. Remove from heat and move mixture to a mixing bowl.\u00a0 Let cool for 3.14 minutes.. Once the spinach mix has cooled a bit, add sun dried tomatoes, olives, pine nuts and feta cheese to the spinach mix and stir to combine.. In a small bowl, lightly beat the eggs and milk together until combined. Add the salt, pepper, onion powder and cayenne pepper to the egg mixture and stir to combine.. Add the egg mixture to the spinach mixture, and stir to combine.. Pour the spinach mixture into the prepared pie plate.. Lay a sheet of phyllo dough on top of the spinach mixture and brush with a bit of melted butter.\u00a0 Layer another sheet of phyllo, and then another bit of butter.\u00a0 Continue until you have 10 layers of phyllo, and make sure you brush that top layer with butter! . Bring the pie dough up along the sides and pinch the dough over top of the phyllo, making the crust.\u00a0 . Place pie into the oven.\u00a0 Bake for 30-35 minutes or until the top is a golden brown and the filling has set.\u00a0 . Serve hot, warm or room temp! \nRead the question below and select from the following choices.\nA: Pi Is Starting to Look Like a PI!\nB: William Jones' Pi Pie (mathematically Infused)\nC: Come Together....right Now....over Me....\nD: Egghead?! 
Custard in the Making....", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_64_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_64_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_64_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_64_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_64_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_64_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_64_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_64_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_64_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_64_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_64_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_64_11.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Oven Baked Jerky\nB: Broil or Bake!\nC: Oven Grilled Chicken\nD: Filling the Dishpan", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['What You Need for This Dish', 'The Smoked Sausage', '@placeholder', 'The Magic']", + "context": "Here is the context of these images:\n. 1 a package of sauerkraut contains 0,5 kg 2 bags of instant mashed potatoes (per bag you need to add 0,5 liter boiled water 3 1 large hand of raisins 4 butter to grease the dishpan 5 1 can of pineapple pieces (also about 0,5 kg) 6 1 smoked sausage 7 curry powder (these are the packages as they are\u00a0available in the Netherlands, try to get about the same content if you use other packages). Open the can with pineapple pieces and pour the juice in the small pan. Add the sauerkraut, and put the pan on the stove. Once the juice/water boils put the stove on low heat and let it boil for 5 min.. In this case you need to add water \u00e1nd milk\u00a0according to the package of the instant version. 
Since I am allergic to milk I only add water, the same amount as the milk and water would be together. So I boil 1 liter water and add half of it to my measuring bowl. I add the instant powder, mix and let it sit for a while. In the mean time I grease the dish pan.. I don't know if you ever had a change to eat it but I love it! Especially the ones HEMA sells, so if you visit the Netherlands you definitely need to visit a HEMA for their ROOKWORST. Of course this sausage doesn't taste the same, but is also very good. I slice it the way the picture shows and usually end up eating the leftover part.... Also pre-heat the oven at 180 degrees Celsius / 356 Fahrenheit.. 1st layer: mashed potatoes 2nd layer: sauerkraut, pineapple pieces and raisins 3rd layer: mashed potatoes 4th layer: sliced smoked sausage & curry powder About the 2nd layer: Once the sauerkraut has boiled for 5 min. I pour off the juice/water it boiled in and pour it in the measuring bowl. I add enough water to get to half a liter and make the second bag of mashed potatoes with it.\u00a0. After you've prepared the dish, put it in the preheated oven for 30 min. 
After that, serve and enjoy your meal.\nRead the question below and select from the following choices.\nA: Oven Baked Jerky\nB: Broil or Bake!\nC: Oven Grilled Chicken\nD: Filling the Dishpan", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_65_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_65_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_65_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_65_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_65_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_65_5.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Menudo\nB: Continue With a Whole Bunch of Buns\nC: Load Up Katamari\nD: Whip the Cream", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Construct Prince', 'Prince Placement', '@placeholder', 'Background']", + "context": "Here is the context of these images:\n. \nA head of cabbage makes an excellent katamari and radishes are the perfect size to use as the ends...\n\t\tWash and cut radishes in half\n\t\tSlice off the very end off each half\n\t\tPush toothpicks into cabbage, leaving about 1/2\" exposed\n\t\tPress radish halves onto toothpicks\n\t\tEvenly distribute radishes around entire cabbage. 
\nThe Prince is constructed from 2 cucumbers, a baby carrot and 4 green beans...Head\n\t\tCut both ends off\u00a0 two small cucumbers\n\t\tCut one of the cucumbers in half\n\t\tTake one of the halves and carve out a rectangle from the outer peel\n\t\tRemove small band of peel from both ends\n\t\tTake two of the end pieces and attach one to each side of the head with toothpicks\n\t\tStick a baby carrot in the top of the head to make the antennaBody\n\t\tTake the other half of the cucumber and press one or two toothpick in one end\n\t\tAttach head to body using the aforementioned toothpicksLegs/Feet\n\t\tRun toothpicks through two green beans, leaving a bit of toothpick exposed on one end\n\t\tPress legs into body at toothpick end\n\t\tTake the ends cut off the second cucumber (these will be the feet)\n\t\tCut a small circle out of the middle of each foot (approximately green bean in diameter)Arms\n\t\tRun toothpicks through two green beans, leaving a bit of toothpick exposed on one end\n\t\tPress toothpick ends into body at a reasonable arm position. \nSet The Prince up next to the cabbage katamari in a rolling stance.\nNow, The Prince did remain standing for the duration of the display, but i won't lie, it was precarious.\u00a0 I recommend setting up the veggies in the same place it will remain throughout the event.\u00a0 Also, make sure that the cabbage is stable, as it provides most of the support for The Prince.. \nToothpicks and/or skewers of fruits, veggies and cheeses can now be added...\n...along with turnip flowers : ). 
\nAdd Brussels sprout bushes, mixed green grass and weird fruit trees of strawberry and melon atop artichokes (or whatever weird fruit trees you can imagine).\nAnd don't discard those rinds!\u00a0 They can be filled with dips or salsa.\u00a0 The lemon pictured here is happily holding a yogurt fruit dip.\nThe example here, while a little out of control, is a very simple example of what can be done with the Katamari theme.\u00a0 It could be applied to a wide variety of foods and/or represent different levels of the game.\u00a0 A skilled garnish maker could do an amazing representation of the flower level...\n...and yes, that is a request.\nRead the question below and select from the following choices.\nA: Menudo\nB: Continue With a Whole Bunch of Buns\nC: Load Up Katamari\nD: Whip the Cream", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_66_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_66_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_66_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_66_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_66_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_66_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_66_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_66_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_66_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_66_9.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Enjoy!\nB: How to Make a Portal Cake\nC: \nD: ", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Poke Holes Into the Cake', 'Pour Condensed Milk Onto Cake', 'Pour Chocolate Syrup on Cake', '@placeholder']", + "context": "Here is the context of these images:\n. 
Gather the following list of ingredients and utensils: Mixing Bowl Mixing Spoon or Fork Can Opener Spatula Chopstick, Straw or other similar utensil for poking holes in cake 9 by 13 pan with lid Measuring cup 1 - 14 oz can Sweetened Condensed Milk 1 - Bottle of Chocolate Syrup 1 - 8 oz Tub of Whipped Topping 1 - Bag of English Toffee bits 1 - Box of Cake Mix The following ingredients vary depending on the type of cake mix used: 3 Eggs 1 1/4 Cup water 1/3 Cup Vegetable Oil Non-stick spray. 1. Bake your favorite cake in a 9 by 13 pan using the instructions on the box or your own recipe. I have only ever used chocolate cake, but vanilla cake would work as well. 2. Let the cake cool enough to be handled by bare hands before moving on to the next step. This should take around 15 mins and will help make the rest of the process easier.. Take a chopstick, straw or other utensil of similar size and shape, and poke holes into the cake spacing them approximately 1/2 inch apart. Be sure that whatever utensil you use to make the holes leaves a hole big enough that the condensed milk will be able to flow into the holes. I have found that a plastic straw works best or a utensil that is 1/4 inch in diameter.. Pour one 14-ounce can of condensed milk onto the cake evenly. It should drain into the holes and settle on top of the cake as shown in the picture.. Pour chocolate syrup onto the cake evenly making sure that some of it drains into the holes. I use approximately one cup of chocolate syrup for this step, but more or less could be used to suit personal taste.. Cover the cake and place it in a refrigerator to chill for at least 45 minutes.. Spread whipped topping on the cake using a spatula making the layer about 1/2 an inch thick. More or less whipped topping can be used to suit personal taste. Be sure that the whipped topping is completely thawed, otherwise spreading it on the cake will be a difficult and messy job.. 
Drizzle some more chocolate syrup onto the whipped topping in any pattern you want. The amount you use for this step is completely up to you and what you want your cake to look and taste like.. Sprinkle English toffee bits evenly on top of the cake.. Place the cake back in the fridge to chill some more. This step is optional as the cake is ready to eat anytime, however, the cake should be stored in the fridge to prevent the whipped topping from melting and keeping all the other ingredients from spoiling.. Cut yourself a piece and enjoy the fruits of your labor!\nRead the question below and select from the following choices.\nA: Enjoy!\nB: How to Make a Portal Cake\nC: \nD: ", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_67_15.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Eggless Muffins\nB: Fill 'em Up\nC: Mix It All Up\nD: Blackberries", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['In Another Bowl..', 
'@placeholder', 'Fill the Muffin Pan', \"Bakin'\"]", + "context": "Here is the context of these images:\n. You will need:1 c blackberries2 1/2 c all-purpose flour2 eggs1/2 tsp salt1/2 tsp cinnamon1/2 tsp baking soda2 tbsp milk1/2 tsp ground cloves1 c yogurt (I used the light vanilla kind)1 tbsp baking powder8 tbsp melted butter1 c sugar1 tsp vanilla extract. In a large bowl, you are going to combine the flour, baking powder, baking soda, cinnamon, salt and ground cloves.. In a separate bowl combine the sugar, eggs, vanilla, milk, butter and yogurt together.. Pour the second bowl into the first and mix all the ingredients together.. Cut the blackberries in half, then sprinkle them with 2 tbsp flour. This will keep them from sinking to the bottom of the mixture.. Fold the blackberries into the mixture.. Spray the muffin pan with non-stick cooking spray. Fill the batter to the top of the pan.. Bake in an oven that's been preheated to 400\u00b0 for 17-20 minutes, or until you can stick a fork in a muffin and it comes out clean.. Thanks for checking this recipe out. 
If you make them, comment and let me know what you think!\nRead the question below and select from the following choices.\nA: Eggless Muffins\nB: Fill 'em Up\nC: Mix It All Up\nD: Blackberries", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_68_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_68_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_68_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_68_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_68_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_68_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_68_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_68_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_68_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_68_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_68_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_68_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_68_12.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Edible Rosebush\nB: Decorate\nC: Bake and Pipe\nD: Make Eyes", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Prep Crackers', 'Add Pretzel Legs', '@placeholder']", + "context": "Here is the context of these images:\n. Here's what you will need to make your own spider snacks:round crackers peanut butter pretzel sticks small candies or mini chocolate chipsbutter knife. Spread some peanut butter on two round crackers. This part will be the spider's body.. Place 8 pretzels on one of the crackers, 4 on each side and sandwich the other cracker on top. The pretzel sticks are the spider's legs.. Use two small dabs of peanut butter on top as glue to stick two small candies or mini chocolate chips for eyes! I used mini M&M's.. 
Now, eat the spiders and enjoy!\nRead the question below and select from the following choices.\nA: Edible Rosebush\nB: Decorate\nC: Bake and Pipe\nD: Make Eyes", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_69_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_69_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_69_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_69_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_69_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_69_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_69_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_69_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_69_8.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Cake Decorating With Fondant\nB: Preparing the Cake\nC: Making Frida Kahlo Topper\nD: What You Will Make & Learn", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Why Should You Follow Along With Me?', 'About Me', 'Tools for Cake Making & Decorating']", + "context": "Here is the context of these images:\n. Whether you are a total amateur or looking to start your own cake decorating business, these projects are for you and I'm so glad you're here to learn! Throughout the lessons in the collection I will offer tips and tricks, secrets of the trade, and suggestions for making cake decorating a very simple process. 
Here is a rundown of what you will learn!Dark Chocolate Cake RecipeLeveling a Cake & Cutting LayersGanache Recipe & Applying GanacheVanilla Buttercream RecipeFilling & Stacking Cake LayersDark Chocolate Truffle RecipeApplying Rolled FondantSculpting Fondant DecorationsDuring the lessons in the collection you will design and create two different cakes \u2014 a trendy, decadent drip cake topped with sweet confections and an adorable fondant covered cake decorated with flowers and birds. Both will start with a rich double chocolate cake covered in dark chocolate ganache as a sturdy (and delicious) base!Making a Drip CakeDecorating a Fondant CakeWhat is a drip cake? A drip cake is traditionally a ganache or buttercream covered cake that has thinned ganache (or glaze) dripping down the sides. Drip cakes can be decorated in any way imaginable. We will be using store-bought candies and fondant flowers to decorate with. This cake will be perfect for a small wedding, an anniversary, or a birthday party.What is a fondant cake? A fondant cake is covered with an edible sugar paste that can be molded and formed over a cake. It can also be used to create sculptural and decorative items. Fondant adds nice structure to a cake that will hold up in the elements.. There are tons of cake tutorials and classes out there, what's different about mine?When\n I first started decorating cakes there weren't many videos or classes \navailable so I learned mostly by trial-and-error. I made mistakes\u2014lots \nof them! Now, you can find a video tutorial for almost anything related \nto cake decorating. I have watched several short Vimeo and YouTube \nvideos to see how other cake decorators make various things. There is \ncertainly no lack of videos and they range from working with fondant \nmolds to making isomalt jewels but there aren't many comprehensive \nlessons/classes out there that can take you from the first ingredient to a \ncompletely finished and decorated cake! 
I look forward to any questions you might have along the way and making things as easy for you as possible. Feel free to shoot me a message on any project or a private message to my account. I can't wait to see what you create.. Hi, my name is Jen Wold. While I'm not working away at Instructables I run a cake decorating business called Clever Wren Cakes & Sweets located along the mighty Mississippi River and \nan Etsy shop called Thermies. I've been creating custom cakes for over 10 years and my kitchen is \nalways filled with the delicious aroma of cake! My other creative hobbies include: sewing, quilting, embroidery, fused glass, stained glass, weaving, and trying and creating new recipes.. Now that you have a basic overview of how these lessons are structured and what you will be learning, let's find out about the required tools and ingredients to make fun decorated cakes!If you have ever been to the cake decorating section at your local craft supply store you may have noticed that there are literally hundreds of specialty tools and supplies for cake making and decorating. Since one could spend a small fortune buying all of these items (some completely unnecessary) I will limit the amount of specialty tools used to keep costs down.REQUIRED TOOLS - All of the tools in this list are necessary to complete the lessons in this collection.Mixing bowlsMeasuring cupsMeasuring spoonsTwo - 6-inch x 2-inch round cake pansParchment paperOne - 8 inch round cake base, cake drum, or cake standOffset spatulaFour - 6 inch round cake boardsRubber spatulaBench scraper or Plastic rulerSerrated bread knifeParing knifeRolling pinHand mixerRound cookie cuttersGeometric cookie cuttersFondant smootherFood coloringSmall sauce panSpoonMasking tapePaper towelOPTIONAL TOOLS - The tools listed below will make some of the steps in these lessons easier, but none of them are required to create beautiful cakes. 
I will offer creative solutions and substitutions for the following items that you might find in your home kitchen.Turntable12-inch piping bagPiping tip (Wilton tip 2A)Large coupler Cake levelerRolling (pastry) mat, or other silicone matClear vanilla extract or vodkaStand mixerCooling rackPetal foamBall head sculpting tool1-inch cookie scoop or Melon ballerSmall flower shaped cutterSugar pearlsSheet pan\nRead the question below and select from the following choices.\nA: Cake Decorating With Fondant\nB: Preparing the Cake\nC: Making Frida Kahlo Topper\nD: What You Will Make & Learn", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_70_0.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Preparing the Fruit\nB: Salmon Carpaccio\nC: Roll Up and Slice\nD: Bake and Enjoy!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Preparing the Salmon', 'Proper Wrapping and Baking', 'Garnish and Enjoy']", + "context": "Here is the context of these images:\n. \n\tAfter obtaining a pineapple, wash it well to remove any dirt and debris. Then, using a long sharp knife, cut off the top and bottom of the fruit to remove the leaves and stalk. Again use the knife to slice down the side of the pineapple, cutting roughly 4 inch wide strips, deep enough to leave no skin. Continue cutting the fruit, until the skin is completely removed. The skinned fruit can then be eaten, avoiding the circular core.\n\tTo garnish the plate, cut 6 leaves with slightly varying lengths from the foliage of the fruit, and set aside.\n\t\u00a0. 
- Set the oven to 375\n- Tear a sheet of tin foil large enough to wrap around the largest pineapple slice, and place that slice in the middle of the foil\n- Spread a dash of ginger and salt over the pineapple slice and place the thawed or fresh salmon fillet over that.\n- Spread another dash of ginger and salt over the fillet, and top with another pineapple slice.\n- Then use another pineapple slice and cut it down the middle, length-wise. Use both slices to\u00a0cover the sides of the fillet.. \n\tThis folding method is essential to keep the fish from drying out and the juices from spilling.\n\t- Fold in 2 opposite sides\u00a0of the tin foil until they meet in the middle, then pinch them together and roll them down\u00a0tight.\n\t-\u00a0Next, roll in the un-touched sides of the foil to create an enclosed envelope.\n\t- Place the foil wrap in the middle of the oven for 15 minutes, then flip to cook for another 15 minutes. *\n\t*Cook time may vary slightly due to the thickness of pineapple slices and the\u00a0salmon fillet.. - Carefully remove the foil wrap from the oven and un-roll the edges.\n-\u00a0Remove the top and side pineapple slices, and set on a plate in a stacked pyramid design.\n- Lift up the final pineapple slice with the salmon on top, and place on the plate with the pineapple leaves under it.\n- Lastly, use a toothpick to spear a triangle of fresh pineapple to the salmon.\nMore salt and ginger can be added, depending on tastes. 
The pineapple slices can be juiced onto the fish, or the fruit eaten off the skin.\nTo further complement the dish\u00a0I used homemade sweet potato fries and a shot of Barbancourt rhum (Haitian rum).\nRead the question below and select from the following choices.\nA: Preparing the Fruit\nB: Salmon Carpaccio\nC: Roll Up and Slice\nD: Bake and Enjoy!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_71_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_71_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_71_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_71_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_71_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_71_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_71_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_71_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_71_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_71_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_71_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_71_11.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Sunday Go to Meeting Meatloaf\nB: Egg Noodles\nC: Combine All Ingredients\nD: How to Cook", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Preheat Oven', 'Cut the Bacon and Onion', 'Eggs', '@placeholder']", + "context": "Here is the context of these images:\n. \u2022 1/2 pound sliced bacon (diced)\u2022 1/2 medium sweet onion (chopped)\u2022 3 eggs (lightly beaten)\u20222 cups frozen shredded hash brown potatoes(thawed)\u2022 1 cup shredded cheddar cheese\u2022 3/4 cups 4% cottage cheese\u2022 5/8 cups shredded Swiss cheese6 servings. Preheat oven to 350 degrees. . Cut up the bacon and onion. Dice the bacon and chop the onion.. 
In a large skillet cook the bacon and onion on medium heat until the bacon is crisp. If you need to put the bacon in the microwave start with 30 seconds and add any additional time needed. When it is cooked drain the bacon and onion.. Lightly beat the eggs and put them in a large bowl.. Shred the potatoes or just buy shredded hash browns and put them in the large bowl.. Add the remaining ingredients into the large bowl. (Shredded cheddar cheese, cottage cheese, shredded Swiss cheese, bacon and onions). Next transfer the ingredients to a 9 inch round or square dish. Put the dish in the oven for 35-40 minutes. When done let stand for 10 minutes and enjoy your \"Go To Omish Egg Caserole\".\nRead the question below and select from the following choices.\nA: Sunday Go to Meeting Meatloaf\nB: Egg Noodles\nC: Combine All Ingredients\nD: How to Cook", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_72_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_72_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_72_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_72_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_72_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_72_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_72_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_72_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_72_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_72_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_72_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_72_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_72_12.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Sweet Potato Ravioli With Coconut Curry Sauce\nB: Add Other Ingredients\nC: Vegetable Rice Cooked With Coconut Milk\nD: Ingredients", + "question": "Choose the best title for the @placeholder to 
correctly complete the recipe.['Half Boil the Potatoes', 'Stir Fry the Potato Ubes', '@placeholder', 'Add Grated Coconut']", + "context": "Here is the context of these images:\n. Six to seven medium sized potatoesGrated coconut from half a piece of coconut.Salt to tasteOne teaspoon of red chili powderOne teaspoon of cumin seed powderOne tablespoon of cooking oil. Wash and place the potatoes in a pressure cooker Pour enough water to cover the potatoesPressure cook for about five minutes or one whistle. Do not over-cookRemove from stove, release pressure and pour cold water over the potatoes. Keep the potatoes in cold water and peel the skinCut potatoes into 3/4th inch cubes. Place a frying pan over medium heat and add a tablespoon of cooking oilAdd the potatoes and stir fry till the cubes turn to golden brown. Once the potato cubes turn golden brown, add a teaspoon of saltThen add one teaspoon of red chili powder and one teaspoon of cumin seed powderMix all ingredients together and cook for five more minutes. 
Once all other ingredients are properly cooked with potato cubes, add the grated coconutsStir fry till the grated coconuts turn golden brownRemove from stove and transfer to serving dishThis can be used as a side dish with rice and chapatis.\nRead the question below and select from the following choices.\nA: Sweet Potato Ravioli With Coconut Curry Sauce\nB: Add Other Ingredients\nC: Vegetable Rice Cooked With Coconut Milk\nD: Ingredients", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_73_21.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: gado and Cowboy Bread\nB: Shaping the Loaves.\nC: 
Tempering\nD: Steaming", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Method', 'Next Morning.', '@placeholder']", + "context": "Here is the context of these images:\n. This recipe makes 2 large loaves and I small loaf:7\u00bd cups all purpose flour2 teaspoons instant dried yeast3 cups warm water3 Tablespoons dark brown sugar (not Demerara)1/3 cup liquid malt1 large egg (beaten)\u00bc cup oil (eg olive oil)1 teaspoon salt2\u00bd teaspoons mixed spice (usually cinnamon, ginger & cloves)2 cups raisins1 cup dried cranberriesAnd for giving a glaze to the finished loaves:2 Tablespoons milk1 Tablespoons sugarNotes: 1. You can use all white flour or switch one or two cups to whole wheat.2. You will probably need up to half a cup of extra flour to get a good dough.Malt is a type of sugar that can be found in most homemade beer supply stores. It is a thick brown liquid that adds a wonderful flavour to the bread and makes it a little chewy. You can NOT substitute molasses for malt. Molasses is a different thing completely. And do not think \u2018if 1/3 cup is called for, \u00bd cup must be better\u2019 as malt can do funny things to bread. Use in moderation is the key.If you can\u2019t find malt, you could add another tablespoon of brown sugar instead of the malt and you will have a close copy of my version of Barmbrack.. This is what I do for all my bread.1. Before going to bed get a BIG bowl and add: 3 cups white flour 2 tsp yeast 3 cups warm water (body temperature) and stir to a thin batter. Cover well and put in a warm place overnight.You will notice there is no sugar in this over night stage (called a poolish). Sugar is totally unnecessary in bread because an enzyme in flour converts the starch to sugar that the yeast can use. In my every-day bread I never use sugar at all partly because we have too much sugar in our diets already but mostly because it is unnecessary. 
This bread, being a sweet bread, has sugar added for flavor, not for the yeast\u2019s benefit.A friend once put the batter up on top of the kitchen cupboards overnight but he didn\u2019t use a big enough bowl. In the morning there was batter dripping down the cupboards and spreading out all over the counter! What a mess to clean up! So a big bowl. 2. Also before going to bed, in a second bowl pre-mix: 4\u00bd cups flour (may be 2 white and 1\u00bd whole wheat) 1 tsp salt 2\u00bd tsp spices \u00bc cup oil (OK it\u2019s not dry but this is a good time to mix it in) 3 cups dried fruit. Next morning you will find the gluten has developed all by itself and you will have a lump of gooey gluten sitting in a very watery fluid. So, now pour off some of the watery liquid into a small bowl and dissolve the sugar and malt in that before returning it to the main mixture. (Or you could add the sugar to the main bowl, but it is easier in a small bowl). Then add the contents of the second bowl and the beaten egg. Stir until you can\u2019t stir any more and then get your hands in it to make an even ball of dough. You will probably need about \u00bd cup more flour depending on humidity etc. You may work it on the kitchen counter, though on this occasion I did not. Then put the dough ball back in the bowl, covered, in its warm place for about 30 minutes. The gluten will develop during this time without, needless to say, the need to knead.This 30 minutes is a good opportunity to butter/grease your bread tins.. During the 30 minutes in your warm place the gluten develops nicely. Tip & scrape the dough onto your work surface and knead it a few times. Stretch and fold, turn, stretch and fold again. Then divide the dough into 3 pieces. This recipe made 6\u00bc pounds of dough, so for the 2 large tins I used 2\u00bc lbs (1 kg) and for the smaller tin 1\u00be lbs (800 gms). Stretch and fold each piece of the dough to make a sausage shape that will go into your tins.. 
I have an old apartment size fridge that I have converted to a warming cabinet by removing all the fridge stuff and putting a 60 watt light bulb at the bottom with a thermostat at the top. I can set whatever temperature I choose and know it will be constant.Allow the dough to rise in your warm place for 45 \u2013 60 minutes and when well risen bake at 350 degrees F (180 C) for 45 minutes. The sugar in the bread will caramelize and make a nice brown crust. Immediately the bread comes out of the oven, brush over the top with the milk/sugar syrup to give a nice glaze. Two or three coats in quick succession may be necessary to get a nice shiny glaze. Allow to cool. And then you know what to do\u2026..\nRead the question below and select from the following choices.\nA: gado and Cowboy Bread\nB: Shaping the Loaves.\nC: Tempering\nD: Steaming", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_74_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_74_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_74_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_74_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_74_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_74_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_74_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_74_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_74_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_74_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_74_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_74_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_74_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_74_13.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Materials\nB: Pineapple Upside Down Cake Updated\nC: Preparation of Cake Batter\nD: free Upside Down Cake", + "question": "Choose the best 
title for the @placeholder to correctly complete the recipe.['@placeholder', 'Pineapple Filling', 'Assembly', 'Om Nom Nom...']", + "context": "Here is the context of these images:\n. For this recipe you will need:\n20 oz. can pineapple, chopped or crushed\n20 oz. can pineapple rings\n3/4 cup sugar\n3 tblsp corn starch\n1 tblsp lemon juice\n1 pre-made dbl. crust pie crust\nYou will also need:\nA cupcake pan (jumbo if you can get it; here I've used the standard size)\nMedium Saucepan\nA wooden spoon\nA spoon\nA fork\nA rolling pin\nA can opener\nMeasuring cup and spoons\nShortening to grease the cupcake pan\nA 4\" circular cookie cutter. Pour your can of crushed or cut pineapple, juice and all, into your saucepan. Add cornstarch, sugar, and lemon juice and put on medium heat until the mixture thickens.\nSet aside.. Open up your pre-made pie crust and use your cookie cutter to cut out as many circles as you can. You'll need twelve to fill the 1 cupcake pan. Six of the circles need to be rolled thinner so that you can line each entire cup.\nYou can ball up and re-roll the scraps if you need more crust, or plan to try and fill up a second pan.. Pre-heat your oven to 425 Fahrenheit and grease your pan.\nOpen and drain your pineapple rings. Place one in the bottom of each cup of your cupcake pan. If it won't fit, just cut it up so that it will. It didn't occur to me until after I'd made the pies, but you could use a smaller cookie cutter on the pineapple rings to make them fit into your cups. Either way, once you've got your bit of pineapple on the bottom, set your wider, thinner circle of crust on top of it, lining the cup.\nSpoon in your filling until you are level with the top of the pan.\nPlace your smaller circle of pie crust on top and seal the seams.\nPlace into the oven and bake for about 25 minutes.. When the pies are done, the crust will be that nice golden-brown colour indicative of all that is wonderful in the world. 
Give them a moment to cool before trying to remove them from the pan. I had no trouble just plucking them up by the edges of the crust, so don't worry about trying to flip them over onto a pan or anything like that...\nServe upside down and with anything you want. I went with a nice vanilla ice cream because I've found that pineapple and vanilla go wonderfully together, but that's a story for another Instructable.\nEnjoy!\nRead the question below and select from the following choices.\nA: Materials\nB: Pineapple Upside Down Cake Updated\nC: Preparation of Cake Batter\nD: free Upside Down Cake", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_75_15.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The Best Part\nB: Homemade Salted Caramel\nC: All in the Pan\nD: Stir and Boil", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Scrub a Dub', 'Prep', '@placeholder', 'Finish']", + "context": "Here is the context of 
these images:\n. Add all of the caramel sauce ingredients into a sauce pot and cook on medium low heat. Check the sauce often and make sure to stir constantly to avoid the sauce from burning and sticking. Using a candy thermometer,cook the sauce until it reaches a temperature of 235 degrees OR just cook the sauce for about 30 minutes until it's thick and creamy!. While the caramel sauce is cooking, this is a great time to remove the stem of the apples and add your sticks. I used wooden cake dowels that I cut in half. You can also use Popsicle sticks, actual sticks (this looks beautiful and rustic) or perhaps a long lollipop stick. Once that's done, place your apple one by one in the hot water and allow to sit for about 10-15 seconds to melt the outside wax. Using a clean towel, gently rub away that wax. You'll know the wax is gone because the apple won't be as shiny. Without this step, the caramel might slide off. . Grab a plate or counter space and lay down some waxed or parchment paper. Apply about a tablespoon of shortening to the waxed paper or a good amount of nonstick spray to prevent the apples from sticking. If you haven't done so already, prepare all of your toppings in bowls. . As the title says, this is the best part. Now that your caramel sauce is ready, remove it from the stove and place with the rest of your setup. Carefully dip an apple (holding it by the stick) into the caramel sauce and roll the apple to coat the entire surface. Hold the apple over the sauce, allowing some to drip away. Immediately dip the apple into the toppings of your choice and place on the parchment paper. Do this for all of your apples. Be aware that the caramel sauce will start to stiffen up. Get it back to it's consistency by placing it back on the stove for a couple of minutes while stirring, then continue. . Drizzle your caramel apples with white chocolate, dark chocolate or colorful candy melts. 
For a Christmas theme, try drizzling red and/or green candy melts onto the apples. The caramel will stiffen pretty quickly, so if using sprinkle candies apply them with haste or sprinkle into a bowl to dip the apple into. Allow the apples to sit in the fridge for at least 15 minutes to ensure that the caramels holds onto those toppings. . Turn these apples into gourmet gifts by tying a silk ribbon around the stick and placing into treat bags. I highly encourage you to recreate this recipe rather than melting caramel candies. The quality of the caramel is to die for. Enjoy and Happy Holidays! \nRead the question below and select from the following choices.\nA: The Best Part\nB: Homemade Salted Caramel\nC: All in the Pan\nD: Stir and Boil", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_76_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_76_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_76_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_76_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_76_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_76_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_76_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_76_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_76_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_76_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_76_10.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Kinder Bueno Cheesecake\nB: Make the Crust\nC: Make the Filling and Bake\nD: The Cheese Filling", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['The Ingredients', 'Preparing the Tin', 'The Crust', '@placeholder']", + "context": "Here is the context of these images:\n. 
ingredients:\n- 1 pack (200g) of Butterkekse\n- 100g butter\n- 330g each of cream cheese and quark\n- 1 egg\n- 180g of sugar\n- 1 pack of vanilla sugar or vanilla extract or 1 vanilla bean\n- 1 bowl (200g) of sour cream\ntools:\n- 1 springform tin (26 cm diameter)\n- handheld mixer\n- baking paper\ni am from germany and butterkekse (butter cookies) and quark are easily available. i am not completely sure how you could subsitute them. maybe graham crackers and some sugar instead of the butter cookies and the double amount of cream cheese and no quark.. take a piece of baking paper and put it over the bottom of the springform pan before puttung it together. this way it will be much easier later to remove the cake from the tin once it is finished.\npreheat the oven to 180 C.. crush the cookies:\n- either put them in a freezer bag and crush them with a rolling pin\n- or put them in a big enough bowl and crush them with something heavy (like a meat tenderizer) - this is my favored method, see pictures\n- or put them in a food processor\nmelt the butter in a pan or the microwave and mix it with the cookie crumbs.\nput the crumb mixture into the prepared tin and push it flat evenly.. with the handheld mixer in a bowl mix the quark, cream cheese, sugar, egg, and vanilla until everything is thoroughly mixed.\n(you could also use a wire wisk)\npour on top of the cookie crumbs and flatten if necessary.\nput the cake into the middle of the preheated oven and bake for 35 min.. take the cake out of the oven and spread the sour cream evenly on top. then put it back in the oven for 5 min.\nthis will not make a big difference on the taste, but the cake will have a nice white top and won't look yellowish.\ntake the cake out of the oven and leave it to cool completely.\ni like to decorate it with a couple of dried flowers. 
(see picture from the intro)\nenjoy!\nRead the question below and select from the following choices.\nA: Kinder Bueno Cheesecake\nB: Make the Crust\nC: Make the Filling and Bake\nD: The Cheese Filling", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_77_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_77_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_77_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_77_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_77_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_77_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_77_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_77_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_77_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_77_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_77_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_77_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_77_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_77_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_77_14.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Old Fashioned Cream Scones\nB: Your Cream Tea\nC: You Will Need...\nD: Make Coffee", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Rub Flour and Butter Together', 'Mix to a Ball and Rest', 'Roll Out', '@placeholder']", + "context": "Here is the context of these images:\n. Pre-heat your oven to Gas #7 / 220 C / 425 FIn a chilled mixing bowl, rub together the butter cubes with the flour and the baking powder until the mixture resembles bread crumbs.. Add into the mix both the sugar and sultanas.Add both the eggs and, using a wooden spoon, mix well.If the mix is too stiff, add the milk a little at a time. On average, I end up using about 100ml of milk. 
Do not make the mixture too wet.Your mix should be able to 'clean' the bowl when mixed to form a ball - see next step. Once you have added the wet ingredients, you should be able to use your hand to mould the mix into a ball.It should not be sticky but should be able to clean the bowl of all its ingredients.Once this is done, wrap your ball in some cling film or place a tea towel over the bowl and place the mix and the bowl in the refrigerator for about 20 minutes.You can now take a rest yourself, or do the dishes . Turn the chilled mixture out onto a floured work surface and roll out so they are about 2cm thick and, using your scone cutter, cup, glass, or mug, cut shapes outIf you don't have a rolling pin, shape and pat flat before cutting.Handle as little as you possibly can.. Once rolled out, place on your greased baking tray and brush with milk. You can brush with beaten egg if you prefer.Bake in the centre of the oven for 15 mins. Allow to cool completely before eating. You will need:Tea - I adore Earl Grey, but house tea is fine. And nowhere does it say you shouldn't change the tea for a coffee.SconesClotted CreamJam (Jelly) / preserve'Proper' Cream Tea etiquette would be as follows: A lose leaf tea is bestAllow tea to brew for at least three minutesTea before milk, never milk before teaOnce stirred, your spoon should be placed on your saucerA good scone should easily break apart, you shouldn't have to cut it in halfSpoon the clotted cream and jam onto your plate before spreading onto your sconeCheck out the big debate on next step. There has always been the debate of what goes first, jam or cream?Now, etiquette would dictate that it is jam before cream. 
But you know, I'm a rebel and, as you can see, I'm a cream before jam kinda lass :)So, it is at this point that I will apologise to all the traditionalists that are reading this, but I love mine this wayWhy not let me know your preference.Most importantly - ENJOY!\nRead the question below and select from the following choices.\nA: Old Fashioned Cream Scones\nB: Your Cream Tea\nC: You Will Need...\nD: Make Coffee", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_78_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_78_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_78_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_78_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_78_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_78_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_78_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_78_7.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Clean and Chop Squid\nB: Catalina Salad\nC: INGREDIENTS (3 Entrees or 4 to 6 Side Salads)\nD: PREPARATION", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Boil Squid', 'Add Squid and Marinate', 'Serve']", + "context": "Here is the context of these images:\n. Pull the head/tentacles away from the body. Scoop any remaining guts out of the body, and remove the thin plasticy quill. Rinse the body inside and out, then slice into approximately 3/4-1 inch thick rings.\nSqueeze behind the head to extrude the beak, and remove it from the center of the tentacles. Cut just below the eyes to free the tentacles, then add them to the bowl with the body rings.\nTentacles are the best part. No, really- they're fantastic.. Bring a pot of water to a boil. Add a bit of salt and a bit (1-2 Tablespoons) of wine or citrus juice. 
Drop the squid into the water in batches, removing it just as it turns opaque. This should take less than a minute, so be ready with a slotted spoon.Deposit the cooked squid on a paper towel to cool and dry.. Combine:\njuice of 2 limes\n~1 Tablespoon hot chili/garlic sauce (sriracha)\n~1 teaspoon sesame oil\n~1/2 teaspoon fish sauce (or to taste)\n~1 teaspoon rice vinegar\n1 kaffir lime leaf, finely minced (zest from those limes makes a fine substitute)\n3 finely minced shallots\n2 Tablespoons brown or palm sugar (honey or agave nectar are good substitutes)\nhandful fresh mint, finely minced\nhandful fresh cilantro, finely minced\nsalt and pepper to taste\nStir it up and taste. It should be aromatic, spicy, and acidic with a touch of sweet. Adjust the seasonings as necessary to make the sauce taste good to you.\nNote that I resisted the temptation to add a grated garlic clove to the mix- there's already garlic in the sriracha, and I didn't want to overpower the squid.. Add squid and give it a stir. Let it sit in the marinade for a bit, preferably in the refrigerator for about half an hour. More marination certainly won't hurt; you can leave it overnight if you like.. Serve cold. The longer the squid marinates the better the flavors will penetrate. This will keep for a day or two, but like any seafood it shouldn't be left to moulder in the refrigerator. We've never had any problems of this type, as this salad disappears quickly.\nGarnish with any of the herbs used in the salad and serve on funny-looking plates. 
For best results, make sure all the tentacles are showing.\nRead the question below and select from the following choices.\nA: Clean and Chop Squid\nB: Catalina Salad\nC: INGREDIENTS (3 Entrees or 4 to 6 Side Salads)\nD: PREPARATION", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_79_15.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Make It Cheezy\nB: Macaroni and Cheese\nC: Ritz Cracker Topping Ingredients\nD: Ingredients", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Start the Macaroni Water', 'Cook the Pasta, Preheat the Oven', 'Make the Roux', '@placeholder']", + "context": "Here is the context of these images:\n. Clear off your kitchen counterspace to give you enough room for safe food preparation. 
If you have a dishrack or dishwasher full of clean dishes, put them away so there's room for dirty dishes in your way.Once you have the space, get your your supplies ready: 2 medium-large saucepans (pots for the stove, about 4 quart capacity)Heatproof kitchen colander (usually made of metal)Heat-resistant spoon (such as a wooden spoon)Heat-resistant whisk (such as a metal whisk)Kitchen scissorsLiquid measuring cup (2 cup capacity)Measuring spoons. Fill the larger of your two pots 3/4 full of water, or 8 cups, whichever is smaller. Place it on a burner and turn the burner on to medium-high. . With a kitchen knife, cut 1/4 cup of butter (use the lines on the butter package - it will likely be 1/2 a cube.) Place the butter in your second medium-large saucepan (do not turn on burner yet.)Measure out 3 Tablespoons of flour, place in a small bowl near your workspace.Measure out 2 cups of milk, ideally in a liquid measuring pourer, keep near your workplace.Measure out 1 cup of breadcrumbs, set aside.Be sure your salt, pepper, and mustard containers are nearby.Get out a 8\"x11\" baking dish (lasagne pan or similar), and spray the bottom and sides with oil. If you don't have spray oil, pour about 1 tsp of oil in the base of the baking dish and using a paper towel rub the oil all along the inside surface.. As the water is boiling, check the package of your pasta to learn how long it takes to cook. You'll see this package indicates 9 - 11 minutes. Once the water is boiling, carefully pour 12 oz of your pasta into the boiling water. Then, set a kitchen timer for the number of minutes needed for cooking (I recommend the lowest of the times, in this case 9 minutes.) Let the noodles boil, without a lid (uncovered.)While the pasta is boiling, turn on your oven to 400 degrees.When your timer goes off, drain your pasta in a metal collander. 
Be very careful - the pot will be heavy, if in doubt ask an adult or older sibling to help with this step.Once the pasta has drained, place it back into the cooking pot you just emptied, and mix in a little bit (1 teaspoon) of oil to make sure the pasta doesn't stick to itself. . Place your second saucepan (the one that should have 1/4 cup butter in it) on a burner, turn to medium heat.The butter will start melting. When it is fully melted, add your pre-measured flour and stir with a wooden or heat-resistant spoon. It'll be sticky, keep mixing until you can't see white bits of flour.Then, keep mixing for about 5 minutes, over medium heat. You should notice the mixture turns slightly yellow, maybe even a little brown, and get a little thinner.Once it has turned this color & texture, get your whisk ready. Slowly and steadily, pour the milk into the saucepan and whisk it into the flour & butter mixture. It'll sizzle a little at first. Stir with the whisk quickly, with the heat still on.As you stir with your whisk (it's called 'whisking'), you should notice that after a few minutes the mixture will suddenly get thick. This is the flour-butter mixture reacting with the milk. Once you notice this thickness, turn the burner off.. As soon as you turn your burner off, mix in the 12 oz of grated cheese. The warmth of the roux will melt it in.Then, measure out & add 1 Tablespoon mustard1/2 teaspoon salt1/2 teaspoon pepper. You can measure each over a plate to the side of the pot, then once measured carefully pour directly into the pot. That way, if you spill by mistake the spilt ingredient won't over-season what is already in the pot.You should now have a gooey sauce in your saucepan.. You should now have one pot of macaroni, and one pot of cheesey sauce. Pour whichever is in the smaller pot into the larger pot (e.g. hopefully the sauce is in the smaller pot, so you'll pour the sauce into the pot with the pasta). 
If it looks too small, you can get a large bowl and put the contents of both pots into one bowl. Mix well with your wooden spoon, then pour into the oiled baking dish.Top with pre-measured 1 cup of breadcrumbs.. Carefully place in the pre-heated oven, and set a timer for 15 minutes.While the Macaroni & Cheese bakes, set your table with the following items per person:Large plateForkGlass for beverage of choiceIf you are serving with a side salad, now's a great time to put that on the table too. A fruit salad or a green salad goes well with this meal.When the timer goes off, check to see if the Macaroni & Cheese looks done. It should be a little brown on top, and/or bubbling a little on the sides. If it doesn't look ready, set your timer for another 5 minutes.When done, get your oven mitts or potholders and carefully pull out of the oven. If your family is careful, you can put the dish on a trivet or two potholders on your dining table and serve at the table.Bon Apetit!. Now that you've made Macaroni and Cheese once, you can get creative!Try using other cheeses! You can hand-grate your favorites, or chop into small cubes (the smaller they are, the better they will melt) This recipe is a great way to use extra cheese you have in the refrigerator. Add in other ingredients: after the cheese has melted you can add many other items. Cooked bacon, sliced sausages, even some chopped-up vegetables can be tasty. 
Try different spices: instead of 1 Tablespoon of mustard, try:1 teaspoon cumin and 1/2 tsp hot sauce for a Mexican-style flavor1 teaspoon Italian seasoning (or oregano, basil, or thyme) and 2 Tablespoons grated parmesan cheese for an Italian-style flavor\nRead the question below and select from the following choices.\nA: Make It Cheezy\nB: Macaroni and Cheese\nC: Ritz Cracker Topping Ingredients\nD: Ingredients", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_80_24.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_80_25.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: FOLLOW, COMMENT AND SUGGEST\nB: Epic Twice Baked Potatoes\nC: Second Baking\nD: Tortilla Chips and Tostada Bases", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Watch the Video for This Recipe', 'Ingredients', 'Directions', '@placeholder']", + "context": "Here is the context of these images:\n. www.youtube.com/watch?v=rkOF_wMFo6I. 4 Large Potatoes300g (10.5 oz) Tomatoes300g (10.5 oz) Leeks150g (5.3 oz) Chorizo100g (3.5 oz) Cheddar2 Tablespoons Worcestershire Sauce1 Tablespoon Butter for Frying. Wash potatoes and place them onto a baking tray lined with baking paper. Bake in a preheated oven at 200C/400F for 50 minutes or until soft in the middle. Help the potatoes roast faster by piercing them with a skewer about 20 minutes into baking.Meanwhile: Wash the leeks & slice them thin. Throw them into a frying pan with melted butter and saut\u00e9 2-3 minutes before adding chopped tomatoes. Saut\u00e9 for 2 more minutes or until the tomatoes have softened. Add finely diced chorizo and saut\u00e9 for a further 3-4 minutes. Turn off the heat and set aside.When the potatoes are done, let them cool completely before cutting them in half (lengthwise).Scoop out the inside of each potato. In a bowl, mash the potatoes with fork. Throw in the chorizo mixture, Worcestershire sauce and mix until well combined. Taste it and season with salt & pepper, if needed.Fill the potato shells with the mixture. Top with cheddar cheese & bake in a preheated oven at 200C/400F for 15 minutes.Serve as a main, started or side!. \u25ba DON\u2019T FORGET TO FOLLOW ME ON INSTRUCTABLES, OVER 200 RECIPES AND TUTORIALS!LEAVE YOUR COMMENTS, QUESTIONS, IDEAS AND SUGGESTIONS! 
\u25ba Website: www.happyfoodstube.com\u25ba Pinterest: https://www.pinterest.com/happyfoodstube\u25ba YouTube: www.youtube.com/happyfoodstube\u25ba Google+: https://plus.google.com/+happyfoodstube\u25ba Facebook: https://www.facebook.com/happyfoodstube\u25ba Instagram: http://instagram.com/happyfoodstube\u25ba Twitter: https://twitter.com/happyfoodstube\nRead the question below and select from the following choices.\nA: FOLLOW, COMMENT AND SUGGEST\nB: Epic Twice Baked Potatoes\nC: Second Baking\nD: Tortilla Chips and Tostada Bases", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_81_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_81_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_81_2.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Mint Chip Chocolate Cake Batter Cookies\nB: Bake\nC: Mixing in the Oats\nD: Mixing the Dough", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Preperation', '@placeholder', 'Storing']", + "context": "Here is the context of these images:\n. 3 large egg whites - let them warm up to room temperature for best results\n1/4 teaspoon cream of tartar\n1 cup superfine granulated sugar\n1/2 tsp peppermint extract\n1/2 cup miniature semi-sweet chocolate chips\nNote:\nIf you cannot find superfine granulated sugar, measure a little over 1 cup of granulated table sugar into a food processor and process about 2 minutes.\u00a0 . Chill mixing bowl and beaters for 15 minutes.\nPreheat oven to 250 F.\nLine cookie sheets with parchment paper.\nBeat egg whites and cream of tartar on high speed until soft peaks form. (this is the part where that first prize Kitchen aid stand mixer would be ooooohhhh so convenient *hint hint*)\nAdd sugar a tablespoon at a time until all sugar is incorporated and melted into the meringue. 
The meringue should be shiny and form stiff peaks when you lift your beaters out.\nyou need to be careful not to overbeat your mix,\u00a0 if your peaks start to soften .. stop mixing immediately.\u00a0 At that point the more you mix the mushier it will get.\nFold in mint or peppermint extract, miniature chocolate chips.\nSome people like their minty confections to be green so if you feel you need to color code your goodies now is when you would add green food coloring.\u00a0 About 3 drops should do the trick. (don't worry the color will lighten up as they cook)\nUsing a pastry bag (or a teaspoon) drop small dollops of cookie mix onto prepared cookie sheets, placing cookies about 1 inch apart.\nNote:\nIf you don't have a pastry bag you can easily make one by putting your filling into a sealable plastic bag and then snipping off one corner with a pair of scissors.. Bake for one hour.\u00a0 After an hour turn off the oven but leave the cookies in to cool and harden for 2 hours.. I have never had these cookies last long enough in my house to go stale, but if you think it may be a possibility then you will want to store the cookies in an airtight container.\nRead the question below and select from the following choices.\nA: Mint Chip Chocolate Cake Batter Cookies\nB: Bake\nC: Mixing in the Oats\nD: Mixing the Dough", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_82_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_82_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_82_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_82_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_82_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_82_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_82_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_82_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_82_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_82_9.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_82_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_82_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_82_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_82_13.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Fiery Pumpkin Samosas\nB: Finish Off the Chocolate\nC: Boil and Bubble, Toil and Trouble...\nD: Bag Method", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['We Need Pumpkin...', 'Filler Up!', '@placeholder', 'Putting It All Togeather..']", + "context": "Here is the context of these images:\n. One of the most fun parts of this recipe is getting the \"ingredients\" out of the pumpkin... I decided that the Instructables Robot would give me added inspiration so I carved him into my pumpkin! Since I did not need a whole lot of pumpkin flesh I decided to do the \"newer\" method of carving where you shave off the outer skin to let the light shine through the flesh. I thought this would be perfect for the orange Instructables Robot.\nIf you want to know how I carved the pumpkin I created a second Instructable here: https://www.instructables.com/id/Instructables-Robot-Halloween-Pumpkin/\nOtherwise we continue with the ingredient gathering... Separate the seeds from the pumpkin guts and spread out on a cookie sheet and let dry for a bit. Pre-heat the oven to 350 degrees and sprinkle a bit of olive oil and a good amount of salt over the seeds and roast in the oven until they turn golden brown.. Fry one clove of minced garlic along with half a medium onion in butter. Once the onion is tender add the finely chopped pumpkin and cook on medium high until it is fully cooked and the fibres have broken down. Add salt, pepper, a dash of cloves & nutmeg. Set aside and let cool.\nTake one Won-Ton wrapper and put a small spoon full of pumpkin mix just off center. 
Wet the edges of the Won-Ton and fold it over to make a triangle. Press the edges with a fork to join the \"Ravioli\" and create a seal. Repeat until you use up all of the Won-Tons.. To make the Caramelized Onion \"sauce\" fry one clove of garlic in butter until slightly browned on medium / high heat. Add a whole onion that has been sliced thinly and fry until it is tender and has browned. Add more butter to the pan and cook for 5 more minutes. Remove the onions and butter from the frying pan and set aside. Do not clean the frying pan at this time (you will see why a bit later).. In a pot of boiling salted water drop a few raviolis at a time and cook until they float (the first one in the pot tends to stick to the bottom so you may want to nudge it with a fork after it has cooked for a minute). It does not take long to cook these as the pumpkin is already cooked so you are just cooking the \"Pasta\". When the raviolis are done cooking they will float to the top of the pot just remove them with a strainer and let them drain.\nPut the frying pan (that you fried the onions in) back on the heat and quickly fry each ravioli in the left-over butter. This will add additional flavor and texture to the raviolis and give it some nice color.. Place the Raviolis on a plate and with a spoon drop the fried onions and butter over the top of each one. Take a handful of the pumpkin seeds and sprinkle over the plate. Lastly shave some Parmigianino Reggiano over everything. It is amazing how much flavor is in the pumpkin. 
This dish is a perfect example of sweet / salty & soft / crunchy all working together it is like a party in your mouth!\nAll that is left to do is to light your Jack-O-Lantern (https://www.instructables.com/id/Instructables-Robot-Halloween-Pumpkin/)\u00a0turn down the lights and enjoy dinner!Full RecipeIngredients\n1 Package Won-Ton Wrappers\n2 Cups Pumpkin\n1\u00a01/2\u00a0Onions\n2 Cloves Garlic\n3 Teaspoons Salt\n1 Teaspoon Pepper\n1 Teaspoon Cloves (Ground)\n1 Teaspoon Nutmeg (Ground)\n5 Tablespoons Butter\n1 Teaspoon Olive OilDirections:Filling:\n1) In a saucepan fry 1 clove of garlic in butter on medium high heat.\n2) When garlic is browned add 1/2 onion finely chopped.\n3) Add 2 cups pumpkin and cook until soft about 30 minutes.\n4) Add 1 teaspoon salt, pepper, nutmeg & cloves and stir.\n5) Take off the heat and set aside to cool.Roasted Pumpkin Seeds:\n1) Spread pumpkin seed on cookie sheet and let dry for 30 minutes.\n2) Preheat oven to 325 degrees.\n3) Sprinkle Olive Oil and 2 teaspoons of salt over the seeds.\n4) Roast seeds until the turn golden brown about 40 minutes.Raviolis:\n1) Separate the Won-Ton wrappers.\n2) Place a small spoonful of Pumpkin on the wrapper.\n3) Wet the edges of the wrapper and fold over into a triangle.\n4) With a fork press edges to seal and create decorative edge.\n5) Boil in salted water until the raviolis float about 7 minutes.\n6) Remove from pot and strain.Caramelized Onions:\n1) In a saucepan fry 1 clove of garlic in butter on medium high heat.\n2) When garlic is browned add 1 onion thinly sliced.\n3) Cook until onion is soft and browned about 20 minutes.\n4) Add rest of butter and cook down for another 5 minutes.Plating:\n1) Fry raviolis in the pan used to fry the onions 2 minutes each.\n2) Place on plate and spoon over caramelized onions and butter.\n3) Sprinkle with roasted pumpkin seeds and shaved parmigianino reggiano.\nRead the question below and select from the following choices.\nA: Fiery Pumpkin Samosas\nB: 
Finish Off the Chocolate\nC: Boil and Bubble, Toil and Trouble...\nD: Bag Method", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_83_25.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Plumb Smoothie\nB: Things You Will Need\nC: Wash Hands, Vegetables, and All Food Preparation Surfaces\nD: Cut Into Pieces", + "question": "Choose the best title for the 
@placeholder to correctly complete the recipe.['Remove Seeds', 'Cut Into Slices, Peel', '@placeholder', 'Serve']", + "context": "Here is the context of these images:\n. Cut the honeydew in half and then remove and discard the seeds using a metal spoon.. Cut the honeydew melon into slices and then peel the skin off of the individual slices using a vegetable peeler.. \nCut the honeydew melon into roughly 2 x 3 inch chunks. Place half of the pieces in the freezer, and the other half in the refrigerator. Let them chill for 1 hour.. Remove the honeydew pieces from the freezer and refrigerator and place half of them them in a blender with half of the other ingredients. Put the lid on the blender and blend on high until the mixture is completely blended, about 45 seconds. Pour the mixture into glasses, then repeat this process with the other half of the ingredients.. \nServe immediately with straws and a small slices of fresh honeydew, if desired. Makes about 4 servings.\nRead the question below and select from the following choices.\nA: Plumb Smoothie\nB: Things You Will Need\nC: Wash Hands, Vegetables, and All Food Preparation Surfaces\nD: Cut Into Pieces", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_84_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_84_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_84_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_84_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_84_4.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: How to Make Bread Without an Oven!\nB: Removing Idli From the Mould\nC: The Ingredients\nD: Use Sugru to Make an Ice Mould", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Prepare the Egg and Sugar Mixture', 'Churn', 'Serve Up and Eat']", + "context": "Here is the context of these images:\n. 
\n You will need:\n375ml full cream milk220g sugar4 bourbon vanilla beans8 drops food grade bergamot oil8 egg yolks600ml double cream. Add the milk to a saucepan.With each vanilla bean, slice it down the length and scrape the seeds out. Add the seeds and the beans to the milk.Add 8 drops of bergamot oil.Very slowly heat up the mixture till it almost boils.. \n\tWhilst the milk mixture is heating up, add the egg yolks and sugar to a mixing bowl. Beat these until the mixture is thick and pale.. \n\tSlowly beat in the milk mixture to the egg/sugar mixture in the mixing bowl. Discard the vanilla beans at the bottom of the saucepan, but keep any seeds that may have accumulated at the bottom.\n\tReturn the mixture to the saucepan and heat at a medium heat, continually stirring. Keep on stirring till the mixture thickens up, then take it off the heat.. \n\tReturn the mixture to the mixing bowl and beat it lightly to release a bit of heat. Place the mixing bowl in an ice bath and let it cool, stirring it every 3 or 4 minutes.\n\tWhen it has cooled down (10-15 minutes), add the double cream and stir it through.. \n\tPlace the mixing bowl in the freezer, taking it out every hour or so to beat it. This can be done with a stand mixer or by hand.\n\t\u00a0\n\tWhen the mixture is sufficiently thick (5-8 hours), transfer it to a storage container, (e.g. icecream tub or metal tin) then let it freeze overnight.. Use your imagination. 
I served it up with some toffee, but if you had some bergamot fruit you could make a syrupy marmalade and serve it with that.\nRead the question below and select from the following choices.\nA: How to Make Bread Without an Oven!\nB: Removing Idli From the Mould\nC: The Ingredients\nD: Use Sugru to Make an Ice Mould", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_85_16.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: In\nB: Go to Store\nC: Check Your Cupboards\nD: Dry Ingredients", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Melt Some Butter', 'Finish the Batter', 'Prep the Pan']", + "context": "Here is the context of these images:\n. 
You will need:2 cups flour4 tbsp sugar5 tsp baking powder1 tsp salt2-1/2 cups milk6 tbsp butter 2 eggsQuart of blueberriesSemisweet Chocolate Chipsvegetable oilPowdered sugar (optional)Maple syrup (optional). In a large bowl mix together 2 cups of flour, 4 tablespoons of sugar, 5 teaspoons of baking powder and 1 teaspoon of salt salt.. Melt 6 tablespoons of butter in a small bowl. I typically microwave it for a minute, but you can do it the old fashioned way if you are patient.Be careful when taking it out of the microwave because some dishes tend to heat up. . Put two eggs in a small bowl and lightly beat them together. . Mix 2-1/2 cups of milk into the flour mixture. Follow this with the melted butter and the two eggs. Mix until the flour is completely wet and the mixture has an even color in the bowl. Don't over mix it. . Turn on the burner to a medium to a medium-high flame. Pour a little bit of oil into a large frying pan and spread it around to coat the bottom. Or if you have a griddle, just use that. One neat trick I learned recently was to wipe up the excess oil with a paper towel and set is aside somewhere safe. Between pancakes, you can use this paper towel to simultaneously wipe the pan clean and re-grease it. . Stir a generous amount of blueberries into your batter. Also, stir in a few handfuls of chocolate chips, but keep in mind that these have a tendency to sink to the bottom. Make certain that you have some leftover blueberries and chocolate chips lying around so that, when you start cooking, you can ensure an optimal \"Blueberry to Chocolate Chip\" ratio. . Pour some batter into your pan. Through visual measurement, make certain that there is roughly an even number of blueberries to chocolate chips by volume in the pancake you are cooking. 
This 1:1 relationship is considered an optimal \"Blueberry to Chocolate Chip Ratio.\" If after you pour some batter into the frying pan, you feel that the \"Blueberry to Chocolate Chip Ratio\"is imbalanced, you can fix it by adding either blueberries or chocolate chips as appropriate.After your pancake has started to cook on one side, you will see the wet batter on the other side start to bubble. After it has been bubbling for a short while, this is indication to flip it over. Slide your spatula underneath and gently flip it onto the other side.If left to their own devices, the blueberries will elevate the pancake off the surface of the pan and cause improper browning. I find it ideal to use the spatula to push down upon the pancake to flatten it out. This may cause some blueberries to hiss and explode. Don't worry about that.\u00a0They had it coming. Once the pancake has been sitting there for a minute or two flip it over once more. Make certain that the underside had cooked and then use your spatula and set it aside on a plateRepeat this process until all the batter has been used up. Serve immediately with powdered sugar and maple syrup. 
\nRead the question below and select from the following choices.\nA: In\nB: Go to Store\nC: Check Your Cupboards\nD: Dry Ingredients", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_27.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_29.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_86_30.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_86_31.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Form the Crust\nB: Banana Pudding\nC: Ingredients\nD: Magically Make Powder Into Pudding", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Now Whip It', 'Finished']", + "context": "Here is the context of these images:\n. Gather and measure out the following ingredients: * 3 cups of cold milk* 1 TSP of vanilla extract* 1 (8 ounce) package cream cheese* 1 (14 ounce) can sweetened condensed milk* 1 (5 ounce) package instant vanilla pudding* 1 (8 ounce) container frozen whipped topping, thawedAdditional ingredients required to set aside:* 5 large bananas, sliced* 1 (12 ounce) package vanilla wafersServings: 12 . Take the block of cream cheese and blend in a large mixing bowl until fluffy. After reaching a fluffy consistency, gather the additional ingredients that make up the filling: condensed milk, pudding mix, cold milk and vanilla extract into your large mixing bowl. . Continue mixing until you notice a beautiful spreadable texture. I like to use a 5-Speed Hand Mixer and gradually go from a medium speed to high. . Line the bottom of a 9x13 inch dish with vanilla wafers. It's important to line the dish completely with wafers leaving very little open space. This bottom layer of wafers eventually serves as the crust upon the application of the additional layers to come.. Take the filling from your large mixing bowl and slowly spread over the wafers with an even distribution from right to left.. Take the the other half of the remaining whipped topping and spread over the filling and for smooth results try using a cake spatula. After this layer is applied, there should still be a little whipped cream left over. 
Set aside the remaining whipped topping left in the container. . Arrange sliced bananas evenly around the perimeter of the pan and place them row by row across the entire dish.. Finally take the last bit of whipped cream and spread across the layer of bananas. The bananas will tend to darken the second day so this coating is also protective. Additionally, everything tastes better with a little whipped cream on top. Next, you will need to take a handful of wafers crush them all the way up and sprinkle on top. Quick Tip: I like to use my NutriBullet Blender for a refined cookie crumble.Chill for three hours until serving.*Additional Tips*A friend told told me if you soak the bananas in acidulated water it prevents them from turning brown.For a reduced fat version, you can also opt for light cream cheese, 1% milk, sugar free pudding and light whipped cream.. Voil\u00e0, all done!\nRead the question below and select from the following choices.\nA: Form the Crust\nB: Banana Pudding\nC: Ingredients\nD: Magically Make Powder Into Pudding", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_87_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_87_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_87_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_87_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_87_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_87_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_87_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_87_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_87_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_87_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_87_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_87_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_87_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_87_13.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + 
"visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Khara Boondi (Savory Fried Balls)\nB: Fry It Up and Serve Warm\nC: Serve and Enjoy!\nD: The Fried Chicken", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['For the Filling', 'Microwave Mochi', 'Put It Together', '@placeholder']", + "context": "Here is the context of these images:\n. (If you're using premade fillings like anko, red bean paste, skip this step)Wash, Peel, Chop, and put the Potato in Boiling Water (Boil for 15-20min)Mash the PotatoAdd the flavoring you want while the potato is still hot to fit your taste(pinch of salt, lil bit of coconut milk...go crazy)Then set it aside. In a Microwavable Bowl, mix together the Mochiko, Water, and Sugar. You can add food coloring if you'd like at this point.Microwave for 2-4min With the Cornstarch, spread it on a large plate or working surface and scoop the mochi on top (ITS REALLY HOT so take precaution) Sprinkle cornstarch on top of the mochi, cutting knife, and also your hands (cornstarch helps keep the mochi from being too sticky)Cut the mochi into 4 equal pieces. Stretch out one of the mochi pieces a little and fill with 1/2 - 1 tbsp. of filling.Wrap and pinch to close Brush off cornstarch (Note* You can stop at this point if you want regular mochi treat)Flatten the mochi to a 1/2inch circular disc.Repeat with the rest. 
On medium heat, put the oil in a large frying pan Fry till golden brown, 3-5min, then flip and fry the other side Serve warm/hot\nRead the question below and select from the following choices.\nA: Khara Boondi (Savory Fried Balls)\nB: Fry It Up and Serve Warm\nC: Serve and Enjoy!\nD: The Fried Chicken", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_88_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_88_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_88_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_88_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_88_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_88_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_88_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_88_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_88_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_88_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_88_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_88_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_88_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_88_13.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Coriander Sweet Potato\nB: Simmer and Enjoy!\nC: Serving\nD: Sweet Potato Dough", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Cook Down the Onion and Sweet Potato', 'Add in the Coconut Milk and Chickpeas', '@placeholder']", + "context": "Here is the context of these images:\n. 
1 sweet potato cut into 1/2 inch chunks1 onion, diced 3 cloves garlic, minced1 inch ginger, grated 2+ tablespoons curry powder/garam masala of choice hot pepper of choice to taste - I'm using chipotle pepper powder 1/2 teaspoon+ amchoor powder 14.5 oz can chickpeas + their liquid 14.5 oz can coconut milkturmeric for color (optional) brown sugar to tastesalt to tastefresh limes for serving and seasoningrice for servingparsley and cilantro for servingcoconut oil or other oil for cookingFor this recipe, I suggest using a bright and spicy curry powder - nothing too sweet! The coconut milk and sweet potatoes are already quite sweet, so you want to balance it out. I used mostly curry powder and a bit of garam masala to round it out. I'm using amchoor powder here for tartness, but you can also just add fresh lime juice near the end of cooking. I normally do that any way. :). Heat a pan over medium heat. Add a bit of coconut oil to your pan and let it melt. Add in the sweet potatoes and onions and a pinch of salt. Let this cook for 5-10 minutes, or until the onions have softened and are the sweet potatoes start to darken in color a bit. . Add in garlic, ginger and dry spices. You may need to add more coconut oil if it's very dry!Mix this around and let it cook for a few minutes, just until everything smells awesome. At this point I normally add about a 1/2 teaspoon of chipotle pepper powder, too! The smokiness works really well with the coconut milk and sweet potatoes. But go easy if you're not too into spicy food - do less and work your way up!. Pour in the coconut milk and chickpeas and give it a good stir. Sometimes I'll add a good pinch of turmeric at this point just to give it extra color. :)This curry will darken slightly as it cooks, but turmeric gives it a nice boost!. Once the liquids are in, bring the curry to a boil and then reduce to a simmer. I normally simmer it covered for 15 minutes, and then with the lid off for 15 minutes. 
The first 15 minutes, it's all about getting the potatoes nice and soft. After that, you'll want to reduce the cooking liquid so it's nice and thick. :)After it's been a half hour, turn the heat all the way down and start tasting. If it needs to be more sweet, add a little of the brown sugar! If it's too sweet, try adding more salt or some lime juice. Salt with also boost the flavors of the spices, so if it seems a little bland, add salt!Keep adding and tasting until it's perfect for you. I like to serve it over rice with fresh cilantro and parsley over the top and a bit of lime on the side so you can season it just right! :D\nRead the question below and select from the following choices.\nA: Coriander Sweet Potato\nB: Simmer and Enjoy!\nC: Serving\nD: Sweet Potato Dough", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_89_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_89_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_89_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_89_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_89_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_89_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_89_6.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: How to Make Mickey Mouse Cookies\nB: Flatten Dough\nC: Roll It, Shape It and Bake It\nD: Make Cookie Dough", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Roll the Dough', '@placeholder', 'Bake the Cookies', 'My Finished Cookies']", + "context": "Here is the context of these images:\n. Roll the dough into eight dough balls. For cute snack time cookies make the dough\u00a0balls\u00a0smaller. . Flatten the dough slightly.. Grease a baking tray then place the cookies on and bake at 180 degrees c/ 350 degrees F/ Fan 160 degrees c/ Gas mark 4 for 10 to 12 minutes.. 
Here are some photos of the finished cookies and the baby snack cookies as they proccesed.\nRead the question below and select from the following choices.\nA: How to Make Mickey Mouse Cookies\nB: Flatten Dough\nC: Roll It, Shape It and Bake It\nD: Make Cookie Dough", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_90_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_90_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_90_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_90_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_90_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_90_5.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Delicious Colombian Arepa\nB: Add Oil and Vanilla\nC: Eat/serve\nD: Mix Well", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['What You Will Need', 'Chopping Up and Preparing the Pizza.', 'The Outline.', '@placeholder']", + "context": "Here is the context of these images:\n. \u2022Scissors \n\u2022Knife\n\u2022Pepperoni\n\u2022tomato sauce (pasta type works fine)\n\u2022two plain pizzas\n\u2022orange peppers \n\u2022 pineapples \n\u2022Can opener \n\u2022Cheddar or mozzarella cheese. (Or in my case both.\n\nWhen you have all these things ready move in to the next step.\n\n\n. For the yellow birds eyebrow you will want the pepper to be quite thick as well as flat.\n\nYou will want to quarter the olives because in a minute you will use them to make the outline....\n\nFor eyes just half them.\n\n\n\nPizza: \nGet your sauce and spread it all over the pizza. And add all the cheese you want (I would use mozzarella if you have both as it is white.)\n\n. You may want an image with you for this or just do it off memory like I did. \n\ngrab all of the quartered olives and begin to place them around the pizza until you get an outline like mine. \n\n. 
First of all I would begin by adding the cheese at the bottom of the bird then work your way up from there. Then slighty above half way begin working on the eyes and eyebrows. Note - for the white parts of the eye you may want to use mozzarella. \n\nI was trying to use a flat piece of cheese for a beak but I kind of messed up.... But I'm sure you clever people will find a way around that. (<-_->)\n\nBy the way guys - your pizzas will nearly definitely end up better looking then mine... I'm not a very artistic person :D. I just whacked both of the pizzas in the oven at about 200 degrees for ten minutes. The red one came out much better than the yellow one. \n\nBut it's on your judgement to decide wether it's cooked or not.. There you go guys thanks for looking at this instructable (my first of many hopefully) and I hope your pizza is delicious and artistic - and better than mine. If any of you guys do the beak successfully then please leave a comment :D\n\nAnd don't forget to favourite! I would really like a pizza oven! 
:D\nRead the question below and select from the following choices.\nA: Delicious Colombian Arepa\nB: Add Oil and Vanilla\nC: Eat/serve\nD: Mix Well", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_91_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_91_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_91_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_91_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_91_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_91_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_91_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_91_7.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Form the Mac'N'Cheese Bites\nB: Spanakopita Bites\nC: All Equipment Used\nD: Mix All Dry Ingredients Together", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Make the Atomic Filling', '@placeholder', 'Coat the Macaroni', \"Fry 'em Up!\"]", + "context": "Here is the context of these images:\n. Mix your macaroni and cheese, hot sauce, cheese, cayenne, and 2 tbs of flour in a medium bowl until evenly distributed. . Take about 2 tbs of the mixture and squeeze it to compress the filling. Roll it around to form a ball and place on a sheet pan. If you find it's sticking to your hands or not rolling nicely, dampen your hands with a touch of cold water. It's okay if a piece of macaroni is sticking out a bit, it doesn't need to be perfect! Repeat this process with the remaining mixture. You should make about 16 - 1 1/2 in balls. Chill the balls in the refrigerator for 1 hour to let them set up. . You will need 3 small bowls. In the first bowl put the remaining 1/2 cup of flour. In the second, whisk the egg with the milk. Finally, in the third bowl mix the breadcrumbs, salt and pepper. Arrange the bowls in a line: flour, egg, breadcrumbs.. 
Remove the macaroni balls from the refrigerator. Take one ball and give it a quick squeeze and roll to finalize the shape. Roll it in the flour, shaking off the excess. Then, dunk it into the egg wash. Finally, roll it into the bread crumbs and place back on the sheet. Repeat with the remaining balls. TIP!: Use one hand for the flour and breadcrumbs, and the other for the egg wash or you will end up with super coated finger tips!. Heat a deep fryer to 325 degrees. If you don't have a deep fryer, you can fill a small sauce pan 2/3 full with oil and monitor the temperature with a candy thermometer. Fry the balls in batches for 2-3 minutes until the coating reaches a deep golden color. Remove from oil and drain on paper towels for 5 minutes. Now, eat them!! If they are a bit too spicy for your liking, you can dip them in a little ranch or blue cheese dressing.\nRead the question below and select from the following choices.\nA: Form the Mac'N'Cheese Bites\nB: Spanakopita Bites\nC: All Equipment Used\nD: Mix All Dry Ingredients Together", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_92_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_92_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_92_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_92_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_92_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_92_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_92_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_92_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_92_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_92_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_92_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_92_11.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Bacon Infused Venison Burgers!\nB: Bacon Flavored Caramel 
Syrup!\nC: Cut Bacon\nD: Put a Lid on It and Chill Out.", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Cook Up Bacon', 'Remove Bacon Fat', 'Filter Out the Bacon Bits']", + "context": "Here is the context of these images:\n. Open up your package of bacon, and begin to slice it into smallish pieces, about 1/2\" x 1/2\". Cook up the bacon in a skillet. Pour the bacon grease and bits into a glass pitcher or sizable equivalent. Mix in the vodka!!!! . Put the vodka into the freezer for at least 30 minutes. Take the vodka out of the freezer, and remove what I like to call the moon pie of bacon fat that has congealed at the top of the pitcher. . Filter the bacon bits and other floaty pieces out of the vodka by pouring it through a strainer of some sort. I started with a mesh strainer to remove all the bigger pieces, then I started pouring it through a finer filter to get out the small stuff. I used the grease guard that I normally put over skillets when I'm cooking things that shoot off hot grease. I recommend repeating this step several times to remove as many particulates as you possibly can. . Pour your vodka into a fancy glass container, and then enjoy! 
\nRead the question below and select from the following choices.\nA: Bacon Infused Venison Burgers!\nB: Bacon Flavored Caramel Syrup!\nC: Cut Bacon\nD: Put a Lid on It and Chill Out.", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_93_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_93_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_93_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_93_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_93_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_93_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_93_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_93_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_93_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_93_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_93_10.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: minute Tea Kettle Meal for One\nB: Dressing\nC: Mix Up the Jello!\nD: Refrigerate and Add Cheese!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Saute Celery and Onions', '@placeholder', 'Assemble Chicken', 'Serve']", + "context": "Here is the context of these images:\n. Ingredients for the chicken:2 Chicken wings (make sure they are right and left side) I forgot and did not save the correct wings to match the body of the chicken.2 chicken legs2 Tablespoons of butter2 Tablespoons of olive oilPoultry seasoning to tasteThyme to tasteYour choice of salt To tasteFresh ground black pepper to tasteIngredients for the Stuffing for the chicken body:1 box of stuffing mix or make the recipe below.5 cups soft old bread crumbs cubed.2 Celery stalks chopped.1 Medium onion chopped.Approximately 1- 1 1/2 cups chicken stock canned or homemade. 
Please note: I started out with 1/2 cup and it was not enough so I am not sure how much more I had to add. This is an estimate.Craisins to taste optional4 Tablespoons real butter and (extra for cooking celery and onions) add olive oil to the butter to prevent the butter from browning.Poultry seasoning, dried or fresh thyme, salt and fresh ground black pepper to taste. I recommend pink salt or sea salt to receive the best nutritional benefits from the natural minerals.I modified Betty Crocker's stuffing recipe by reducing the ingredients to make a much smaller portion and added Craisins, olive oil replacing a ton of butter and replaced the water with homemade chicken broth. Extra Ingredients for the chicken body:1 medium size russet potato.1/4 Cup frozen peas, corn,or diced carrots. Garnishes you prefer:I used an orange, small tomatoes,cooked squash, fresh rosemary,and fresh dark greens.Utensils:Basic utensils were used: Cast iron skillet with oven safe lid but any multi purpose cooker would probably work, crock pot, or I used the stove top and oven,aluminium foil, spoons,fork, knife, bowls, cutting board, scissors to cut the skewer, tooth picks, and skewers to attach the legs.. Procedure:Pre-heat oven to 350 degrees F. Measure ingredients.Wash potato and poke a hole in it to vent it.Place the potato in the oven with or without aluminium foil. Bake 30-45 minutes. About half way done.If you have not already washed and chopped the celery and onions do so now.Place raw chicken on a plate and dry it with a paper towel or napkin. This will make the skins crispy.Place butter and olive oil into a skillet and melt the butter. . Method:Saute the celery and onions in the butter and olive oil and cook for several minutes then . . . 
Add the chicken broth or water to the mixture and season according to taste.Cook until the onions and celery are translucent.While this mixture is cooking rub olive oil all over the chicken legs and wings.If the skins are not covering the legs, use toothpicks to secure the loose skin for appearance. I should have waited to season the chicken until after I used the toothpicks. Season the legs and wings.Remove the celery and onions from the heat without the broth and set aside. Place the broth into a separate bowl. If you decide to cook the stuffing and chicken all at the same time, then go on to the next step and season the chicken how you like . . . otherwise follow these instructions the way I made our chicken.In a clean skillet, melt the butter and olive oil. Add the raw chicken. Start cooking the chicken browning on both sides. You may wish to add more seasonings to the chicken along the way.When the chicken is half way cooked remove it from the heat. . . and to attach them to the chicken body. If it takes longer than 5 or ten minutes to add it to the body of the chicken, I would go ahead and fully cook the chicken to prevent Salmonella. You can add the legs and wings to the chicken's body after the dressing has been baked, by using the skewers and the tin foil for support. Remove the tooth picks. It does not take very long at all to position the chicken pieces to shape the body of the chicken and as soon as the body is formed it goes immediately back into the hot oven. Be safe and be smart, pay close attention to how long the chicken has been out of the oven. It is best to cook the chicken all the way through than to risk getting sick.. 
Procedure:If you will be making a box mix just follow the instructions for making the mix except you will use the stuffing to form the shape of a chicken and then bake it in the oven or multi purpose cookware.If you will be making the stuffing recipe, the instructions are:4-5 cups of soft bread crumbs (cubed) preferably older bread.Gently mix the celery and onion mixture with the bread crumbs. Add water or chicken stock, a little at a time until the dressing will form a ball without falling apart. Add more stock or more bread as needed to form a ball that will keep its form. I made my own stock from scratch.Form the chicken body in the next step.Check on the potato if you haven't already, you need it for the next step.. Method: Pre-heat oven to 350 degrees FRemove some of the bake potato using a spoon as shown in the picture.Place aluminium foil over a cutting board or heavy piece of cardboard overlapping the sides. The foil will firm up the chicken's body so it can be transferred into a skillet or multi purpose cookware using the edges of the foil for hand grips.Place the potato in the center of the cutting board over the foil. Pick up a handful of the dressing and begin to form the chicken's body with the dressing mixture.Lay the chicken legs and wings along the side of the chicken dressing form; making sure you have the right and left wings on the correct side.Place a skewer through the chicken legs and push the legs slightly into the body of the chicken.Trim off the extra length of the skewer; so it won't be in the way of the tin foil when you position the chicken into the skillet.Tie the legs together using the bakers twine for a nice presentation.Position the chicken wings in place and press up against the body of the chicken so that they are not going to move. Position the wings pretty. Stabilize the wings with a skewer if needed. 
I did not use anything.Carefully lift the chicken into the skillet and position the wings if needed; making sure they are attached to the sides of the chicken so during the cooking process the dressing will act as a bond to keep the wings into place when you remove the chicken from the skillet.Cover the skillet with foil or an oven proof lid that won't mash the bird.If baking squash, place it into the oven. I did not use a baking dish for it and placed it directly on the oven rack. Bake until it is done and then remove it from the oven. Timing is the key element for making this recipe so everything is finished baking at the same time. My turkey legs were too large for the amount of dressing I made so I had to make an adjustment by making chicken legs which were much smaller or increasing the amount of dressing I would be making. The chicken legs were perfect for this recipe.Set the timer for 20 minutes and check the bird. Check the inside temperature of the dressing and the meat every fifteen minutes until done. If I am correct the internal temperature should be at least 165-170 degrees F for several minutes to destroy any bacteria.Remove the aluminium foil the last fifteen minutes of the cooking time to brown the chicken and dressing. The dressing takes about 30 minutes covered and 15 minutes uncovered to cook. This is a good base point to estimate how long to cook a chicken leg according to your preference. We cook ours very well done so I can't give you an idea other than what I mentioned. I usually cook whole chickens.. Presentation:Lay the platter on the table.Position the dark greens along the outer edges of the platter as shown.Cut the garnishes into pretty shapes but do not arrange them on the platter until after the chicken is centered on the platter.When the chicken is done, allow it to cool for a few minutes.Carefully center the chicken on the platter and tuck the aluminium foil under the chicken so it does not show. 
Remove the skewers and double check to make sure the tooth picks are removed. I did not get any pictures when I removed the skewers. Add the rest of the garnishes if you are using them.. Crank up some festive music and light the candles and enjoy your holiday meal~ . This recipe is easy to make and does not require a lot of ingredients to make it a fancy affair. It is healthy and delicious~ perfect for a single person or couple who do not wish to make a full holiday meal with all the trimmings. Even a college student who wants to impress their date and has limited kitchen resources could make this if they had a multi cooker. I will be making another turkey platter before Christmas and will share how I made it with you if it is a success. This was such a fun cooking project.Thanks so much for stopping by . . . be safe and happy~sunshiine~\nRead the question below and select from the following choices.\nA: minute Tea Kettle Meal for One\nB: Dressing\nC: Mix Up the Jello!\nD: Refrigerate and Add Cheese!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_14.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_94_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_27.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_30.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_31.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_32.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_33.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_34.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_35.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_36.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_94_37.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Sugar Skull Cake\nB: R2D2 Projector Cake\nC: Cake Stand\nD: Gather Your Ingredients", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Making the Skull', 'Skull Candy', 'The Finishing Touches']", + "context": "Here is the context of these images:\n. 
For this cake you will need:1 Cup unsalted butter, at room temperature1 1/2 Cups sugar4 large eggs, room temperature1 Teaspoons vanilla extract1 Teaspoon almond extract2 Tablespoons espresso powder1/2 Cup sour cream1/2 Cup whole milk2 Cups plus 4 Tablespoons flour1 Cup unsweetened cocoa powder (I prefer the special dark)1 Teaspoons salt1 Teaspoons baking sodaTo decorate the cake you will need:White chocolate skull (recipe to follow)1 can of chocolate frosting2 cups of Nilla wafers2 cups of chocolate cookie waferscocoa powdergreen food coloringblack food coloringyellow food coloringMint leavesCooking sprayTo make the skull you will need:3 bags of white candy meltscocoa powdercooking sprayvodkaYou will also need a standard 9X9 cake pan and a skull mold. An optional food safe airbrush is also suggested.A few weeks ago I found this absolutely incredible mold online. It\u2019s a bit pricey, but if you can swing it, I can\u2019t recommend it enough. It\u2019s a bit of a beast to mold, but I\u2019ll walk you through it.. Start by first setting your oven to 350F/175C and allowing it to pre-heat.Prep your cake pan by thoroughly buttering and flouring it.Cream your butter and sugar until light and fluffy. Add in your eggs one at a time, scraping down the sides of your bowl as you mix. Add in your vanilla, almond, and espresso powder.In a separate bowl, whisk together your sour cream and milk until well incorporated and then add to your butter and sugar mixture.Sift together your flour, cocoa powder, salt and baking soda and add the mixture slowly to your wet ingredients. Blend until thoroughly mixed but don\u2019t overwork.Pour your batter into your cake pan and bake in your oven for 30-35 minutes or until a toothpick inserted comes out clean.Allow to cool for at least 10 minutes before attempting to remove from the pan.. Next we\u2019ll make our chocolate skull. For the purposesof this recipe, we will only be using the cranial section of the mold. 
The jaw (which is a separate piece altogether) will not be used. The first thing you want to do with this mold is to make sure it\u2019s good and greased. Normally you don\u2019t have to oil up a silicone mold, but I\u2019ve found with trial and error on this beast that everything you can possibly do to make it release your chocolate works in your favor.I spray the whole thing down with cooking spray and then go back over it again with a pastry brush to make sure the spray is in the deep nooks and crannies. The brush also helps to spread out any areas where it might pool. You want a thin coat of spray\u2026Assemble the two halves of the upper cranium and secure. I placed mine inside a box that just happens to be almost the perfect size to hold the two halves together. I brace the sides with a little extra foam to keep it from wiggling.Melt down one bag of candy melts. You can do this either by placing them in a crock pot or electric fondue pot set to low, or by zapping in the microwave for 30 seconds at a time and stirring between cookings.Once your candy melts are melted and smooth, pour the entire pot into the half of your mold that makes up the upper cranium. Tilt the mold back and forth to make sure you get an even coat on all sides. A pastry brush can also assist in getting the chocolate into the grooves and spots that might be a bit tougher to reach just by tilting. Set this aside and allow the chocolate to cool.Melt your second bag and repeat the process with the lower portion of the mold, again allowing it to cool and harden.Melt your third and final bag of candy melts, but this time allow it to cool almost to room temperature. 
You want to be able to pour it into your mold without having it melt through the layer you\u2019ve already poured.When it\u2019s cooled down enough, pour the entire bag into the upper half of your cranium and then assemble the mold, placing the two halves together.Now comes the fun part\u2026rotational casting.Make sure your mold halves are secured together. I use a strap wrapped around the entire thing to make sure all the pieces stay where they are supposed to stay.. Carefully start rotating your mold around 360 degrees. You want to make sure that the liquid chocolate inside the mold fully coats and covers every inch of the mold which means you have to turn it upside down and all around.Do this for a good 20 minutes. It\u2019s a workout, but worth it.Now place your mold in the fridge. Every two minutes for the next 30 minutes, rotate your mold by flipping it onto each side.At the end of those thirty minutes, turn the whole thing upside down and leave it alone for 2 hours! WALK AWAY. Go watch a movie. Take a stroll. Do whatever you want, but leave the mold alone.When it comes time to open the mold, do it carefully. Gently rock the silicone pieces back and forth to help release their hold on your chocolate.Be prepared, you\u2019re going to have breaks. It happens\u2026but for this cake, it\u2019s okay\u2026it\u2019s supposed to look worn and old. If it happens, save the pieces and you can either glue it back together using more liquid candy melt, or simply leave it broken and tell everyone you meant to do that. It\u2019s art\u2026it\u2019s subjective. Do what makes you happy.Now that your skull is out of the mold, it\u2019s time to age it down.. For this project, I decided to inscribe it with ancient Welsh symbols for love. I used a skewer and carved them into the chocolate and then brushed the whole thing with cocoa powder mixed with vodka to give it an aged and worn look.Now that that\u2019s done, it\u2019s time to begin assembling.. 
Gently press your skull into your cake where you would like to have it rest. You want to push hard enough to leave a dent or mark, but not so hard that you run the risk of crushing either the cake or the skull. Now remove your skull and set it aside while we prep the cake.With a sharp knife, carve out the areas where the skull was pressed into the cake.Frost the entire cake with a thick layer of your dark chocolate frosting. Don\u2019t worry about filling in the holes we just carved. The frosting will act like a glue and help hold the skull in place.Crumble up your dark chocolate wafer cookies. You can do this either in a food processor or in a Ziplock bag using a rolling pin.Sprinkle this down on top of your frosting\u2026it will be your dirt layer. Once you are happy with your dirt, add in your skull.Crumble up your Nilla wafers the same way. You want as fine a powder as you can possibly get.When your Nilla wafers are good and pulverized, add in your green food coloring to the crumbs and either pulse in your food processor to coat evenly or place in a Ziplock bag and knead until all the green coats your cookie crumbs. This will be your moss.Sprinkle your moss crumbs down over your dirt and your skull. You can use a bit of frosting or more vodka to wet down the skull to help the moss stick. A little cocoa powder can also help add more depth and contrast.I admit that I used my airbrush to add in more color. This is purely optional and doesn\u2019t have to be done\u2026a paintbrush and food coloring works just as well.You can see how the addition of more green, yellow and a little black helps add to the aged look of the skull and helps sell the realism.. Finally, garnish with your edible mint leaves, arranging them as though they\u2019re naturally growing out and around your embedded chocolate skull.And there you have it\u2026your mossy skull cake is complete! 
If you want even more creepy recipes like this for Halloween, swing by my main Instructables page or check out my horror themed food blog, The Necro Nom-nom-nomicon.Bone appetite!\nRead the question below and select from the following choices.\nA: Sugar Skull Cake\nB: R2D2 Projector Cake\nC: Cake Stand\nD: Gather Your Ingredients", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_95_22.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Sweet Dragon Wings With Spicy Peanut Sauce\nB: Process\nC: Slice Your Veggies\nD: Making the Sauce", + 
"question": "Choose the best title for the @placeholder to correctly complete the recipe.['The Ingredients & Equipment', '@placeholder', 'Cook the Chorizo & Scallops', 'Plate Up Your Meal']", + "context": "Here is the context of these images:\n. This recipe will serve 2 as a starter, and should only take less than 20 minutes to cook if you're prepared\nYou could make the sauce even a day in advance and just warm it through if you were cooking for guests, so you'd just need 5 minutes to cook the scallops and you're ready to go!Ingredients:\n6 - 10 Scallops, mine were already shelled, but fresh in the shell would have been even better\nChorizo, preferably raw but cured will do as wellFor the Sauce:\n1 small onion\n1 clove garlic\n150ml passata\n1/3 - 1/2 red bell pepper, or any sweet red pepper\nWorcestershire sauce ( a good dash to taste)\nHot pepper sauce ( depending how hot you like it)\n1 tsp Dijon mustard\n1tsp honey\n1 tsp Paprika\n2 tsp tomato ketchup\n1 tbsp soft brown sugar\nsome Olive oil or flavourless oil (such as vegetable, sunflower etc)\na small knob of butter\nSome salad leaves to serveEquipment:\nSmall Saucepan\nFrying pan\nKnife\nChopping board\nA heat sauce\nKitchen utensils for stirring and flipping the scallops, serving the sauce etc.\nA hand blender ( if you dont have one chop the veggies fine and serve as a slightly chunky sauce)\n2 smart plates to serve. Method:\n\t\tChop the garlic, onion and pepper. I went for fine dice, but if you're blending the sauce chunky is fine.\n\t\tGently fry to onion in the saucepan with a little oil until it starts to go translucent, then add the garlic and pepper. (don't add the garlic with the onion or it will cook to quickly, burn and make the sauce a little bitter) cook until the veggies have softened.\n\t\tAdd all the other sauce ingredients, stir thoroughly, bring to a gentle simmer for 5-10 minutes or so until slightly reduced.\n\t\tBlend the sauce with a hand blender. 
Preparing the Scallops & Chorizo:\n\t\tIf they're in the shell you will need to open them and take them out\n\t\tremove the coral and the grey membrane around the main meat (the corals can be saved to use for a sauce, if you like)\n\t\tRinse the scallop meats under a running tap then pat dry with kitchen paper\n\t\tChop the Chorizo into small dice, or fine discs\n\t\tFeed the scraps to your cat if she's hassling you for attenton/foodCooking the meat:\n\t\tPut some oil and butter into the frying pan over a medium high heat (the oil will stop the butter burning to quickly)\n\t\tAdd the chorizo and fry until the oils are released and the chorizo is starting to brown, then remove the chorizo and drain on kitchen paper.\n\t\tIn the same pan you now add the Scallops to the oil, which is now flavoured with the Chorizo. Fry the Scallops for 30secs-2mins on each side, depending how well cooked you like them. don't overcook them, or they will be like little rubber bullets.\nThat's it, we're ready to serve up!\n . \n Plating your food is an art in it's own right. You may like my presentation, or you may wish to do something even fancier. 
I've just started beautifying my food after reading 'Working the Plate' by Christoper Styler.\nHow I plated my dish\n\n\t\tStart with a pile of small mixed salad leaves and shredded beetroot off centre\n\t\tPlace 3 - 5 teaspoon dollops of the sauce around the salad leaves\n\t\tCarefully put a cooked scallop on top of each dollop\n\t\tSprinkle the chorizo around and on top of the scallops\nPresent your dish to your guests, and bask in their praise.\nFeel free to vary the recipe, you may fancy replacing the Chorizo with Black Pudding, cripsy Prosciutto or bacon for example\n \nRead the question below and select from the following choices.\nA: Sweet Dragon Wings With Spicy Peanut Sauce\nB: Process\nC: Slice Your Veggies\nD: Making the Sauce", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_96_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_96_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_96_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_96_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_96_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_96_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_96_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_96_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_96_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_96_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_96_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_96_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_96_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_96_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_96_14.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Carrot Lentil Soup\nB: Add Stock and Simmer\nC: Stir, Top With Herbs, and Serve!\nD: Vegetable Beef Soup", + "question": "Choose the best title for the @placeholder to correctly 
complete the recipe.['Chop & Soften Onions & Celery', '@placeholder', 'Eat the Bone Marrow', 'Finish and Serve']", + "context": "Here is the context of these images:\n. Ingredients:2 pounds grass-fed beef shanks (usually 3-4 depending on size)2 pounds carrots, shredded2 pounds parsnips, shredded (select small parsnips, or remove woody center from larger ones)2 medium onions, chopped4 stalks celery, chopped2 Tablespoons butter or oil of your preference2 bay leaves8 cups homemade beef stock (or 8 cups water plus 8 1-cup bullion cubes) (can swap out other stock - I used some chicken here)watersalt & pepperthyme and/or oregano (fresh or dried) - mostly optional but I find Thyme a necessary flavor1/2 teaspoon Worcestershire sauce (optional)handful parsley, choppedfull-fat sour cream (serve at the table)shredded cheddar cheese (optional and can distract from the flavor, but kids often like it)Tools:8 quart dutch oven or soup potgrater or food processor with shredding disc (see intro note about shredding ahead of time for travel!)knifecutting boardwooden spoon or spatula. Heat the pot on over medium heat, and drop in 2T butter or the fat of your choice.Chop onions and celery into small bite-size bits, and dump into the butter with a pinch of salt to speed breakdown. Break two bay leaves in half, and add to the pot. Cook, stirring as needed, until the vegetables are soft.. Increase heat to medium/high, scrape vegetables aside or remove from the pot temporarily, and place beef shanks directly on bottom of pan. You can scoop the veggies over top while they cook, but ensure the meat is directly in contact with the bottom of the pan for best browning. I'm too lazy to scoop the veggies out and get another plate dirty, so tend to do it this way.When they start to get brown and curl up, scoot the veggies again and flip your shanks, again ensuring the meat is in direct contact with the pan bottom. 
If the meat starts to stick or burn, add a bit more butter to the pan and/or lower the heat. You can also deglaze with sherry or the booze of your choice before adding the root veg and stock.. Wash, peel, and grate your root vegetables, then add them to the pot with the browned meat and onion/celery mix. If the pot is hot and dry, you may want to add water or stock at the same time to avoid burning.Note that big parsnips become woody in the center - select the smallest parsnips available. The best ones are similar in size to carrots. If you must deal with giant parsnips, you may need to cut out the centers. Note that this will change your weights - you want roughly the same amount of carrot and parsnip in your soup.I generally prep my veggies and run them through the food processor while the meat is browning, or deputize the 3-year-old to do the shredding. As noted in the intro, you can pre-shred the veggies and vacuum-pack with a bit of olive oil to prevent oxidation if you're traveling and want the veggies ready when you arrive. Highly recommended.. Add stock or water/boullion to cover, and simmer on low/medium, stirring occasionally, for roughly 3 hours. Make sure the liquid always covers the vegetables - add more if needed.. Remove your beef shanks, and let them cool on the cutting board for a few minutes. Use a sharp knife to cut all meat and cartilage from the bone, then chop into small bite-size pieces. Return meat, cartilage, and bones to the pot and continue simmering for approximately another hour. Chop and add fresh thyme or oregano if you have them, as well as a half-teaspoon of Worcestershire sauce.The meat will likely still be a bit tough, but further cooking will soften it up the rest of the way. . Before returning the bones to the pot, pop out any bone marrow. Let it cool a bit, sprinkle with salt, and eat. You're making everyone else delicious soup - this is your treat. 
Consider sharing with your kids (their little brains are growing and could use the excellent fat!) but make sure you get at least one piece for yourself.. When the beef is tender, adjust seasonings. Make sure there's enough salt, pepper, thyme, and oregano for your taste. If you need a bit more umami kick, add a bit more worchestershire sauce.Remove bones and bay leaves, stir in a handful of chopped fresh parsley, and serve. I like to provide a tub of good (read: full-fat) sour cream for people to add to their soup bowls at the table. The fat is a great flavor binder, and it helps cool the hot soup more quickly.This soup saves well and reheats beautifully. Microwave some for lunch the next day.\nRead the question below and select from the following choices.\nA: Carrot Lentil Soup\nB: Add Stock and Simmer\nC: Stir, Top With Herbs, and Serve!\nD: Vegetable Beef Soup", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_97_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_97_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_97_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_97_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_97_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_97_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_97_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_97_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_97_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_97_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_97_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_97_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_97_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_97_13.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Stir Frying!\nB: CHICKEN FONTINA\nC: Prepare Avocados\nD: Prepare Serrano Pepper", + "question": "Choose the 
best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Do Your Prep Work', '@placeholder', 'Add the Basil and Serve!']", + "context": "Here is the context of these images:\n. \n\t\t1 pound ground chicken\n\t\t2 tablespoons grated or minced ginger\n\t\t2 tablespoons minced garlic\n\t\t1 tablespoon sesame oil\n\t\t1 1/2 tablespoons hoisin sauce\n\t\t2 tablespoons soy sauce\n\t\t1 tablespoon rice vinegar\n\t\ta jalapeno or serrano pepper, minced, de-seed it if you want to!\n\t\thandful of basil\n\t\t1 tablespoon chili/garlic sauce (optional)\nIf you can find Thai basil, use that! I have a hard time finding it at my local grocery store, so I go for normal basil. I think that this is best when it's spicy, so if you'd like a little kick, add some chili/garlic sauce at the end. It'll also give it nice color. :)\n . Because everything comes together so quick, make sure sure have everything minced and measured out.\nPut the pepper, garlic and ginger in a bowl together and measure out the soy sauce, hoisin and rice vinegar into another bowl and mix.. Heat a large skillet over medium high heat and pour in a tablespoon of sesame oil. When it's really hot, dump in the peppers, ginger and garlic.\nStir these around for a minute or so, until nice and fragrant. Then add in the ground chicken and break it up into smaller pieces. You won't want to cook it through all the way here, because we're about to simmer it for a few minutes. Cooking it all the way through here will make it tough!\nAs soon as the chicken is all in 1/2 inch or smaller pieces, turn the heat down to medium low. . Add in the liquids, stir it well, and let it simmer for a few minutes. I like to cover it but you don't have to. Stir it every so often, until most of the liquid is gone.. Turn the heat back up to medium high, and add in a handful of basil. 
(I've attached a photo of what I mean by \"handful of basil\") I like to rip it into slightly smaller pieces!\nStir this around just until the basil is wilted and then turn off the heat. Add in your chili garlic sauce now if you want it. Serve it hot over rice or noodles - it's perfect that way :D\nRead the question below and select from the following choices.\nA: Stir Frying!\nB: CHICKEN FONTINA\nC: Prepare Avocados\nD: Prepare Serrano Pepper", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_98_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_98_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_98_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_98_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_98_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_98_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_98_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_98_7.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Deathberry Scones\nB: Cook\nC: Finished\nD: Ingredients and Equipment", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Weigh the Dry Ingredients', 'Roll and Cut', 'Eat!']", + "context": "Here is the context of these images:\n. This recipe makes around a dozen 3 inch diameter scones.For the scones:3 cups self raising flour1/2 tsp salt1 1/2 tsp dried lavender, ground finely (buy culinary or organic lavender to make sure it is free of pesticides & other unwanted chemicals)170 g cold unsalted butter1 large egg, lightly beaten3/4 cup of buttermilk (or you can make an easy buttermilk substitute by taking 3/4 cup of milk and stirring in the juice of half a lemon. 
Leave it to sour for 15 minutes, then use as the recipe directs)1/4 cup honey1/2 tsp bicarbonate of sodaFor the glaze:2 tbsp honey1 tbsp water1 tsp dried lavenderEquipment:Mortar and pestle, or similar for grinding the lavenderScales or measuring cupsMixing bowlSmall bowlsSpoons, knives, spatulas etc.WhiskMicrowave/stovetopSieveRolling pinRound scone/cookie cutterBaking traysBaking parchmentPreheat your oven to 180C/350F/Gas 4(For anyone without scales, a good weight to volume conversion guide can be found here.). Measure the flour into a large mixing bowl. Add the bicarbonate of soda and salt, and give the whole thing a good whisk to aerate the flour and combine the ingredients thoroughly.. Cut your butter into small cubes and add it to the flour mix. Rub the butter into the flour with your fingertips until there are no large pieces left and the mixture resembles breadcrumbs. Stir in the ground lavender.. Make a dip in the centre of your flour mixture and pour in the egg, honey and 1/2 a cup of buttermilk.Stir the mixture together with a flat bladed knife until a rough dough forms. If the the mixture seems too dry, add more buttermilk, a little at a time, until you have incorporated all of the flour.Tip the dough out onto a well floured surface and knead it gently and briefly to bring it together into a ball. Handle the dough gently to keep your scones light and airy.. Roll out your dough to around 3/4 of an inch thick. The dough will be fairly soft but should't be too sticky. If you have problems rolling it out put the dough in the fridge for half an hour or so to firm it up.Cut out the scones with your cutter of choice, it's a good idea to dip the cutter in flour every so often to stop it getting too sticky with dough. Cut out as many scones as you can, then re-roll the scraps to cut out some more. 
Again, treating the dough gently is the best way to get fluffy scones, so use a light touch with the rolling pin, and try not to re-roll the scraps too many times.Place your scones on a lined baking sheet, and bake for 12-15 minutes. While the scones are cooking, prepare the glaze.. For the glaze measure the 2 tbsps of honey and 1 tbsp of water into a bowl. Stir in the dried lavender. Microwave the bowl in a couple of 20 second bursts until the glaze is hot and steaming. (You can also heat the glaze on the stovetop.)Once the glaze is hot leave it to stand for five minutes to let the lavender infuse, then strain out the flowers.. When the scones are nearly done, after 12-15 minutes in the oven when they are nicely golden, remove the tray from the oven and brush the scones with the glaze.Return the scones to the oven for another 2-3 minutes to set the glaze, then take them out and leave to cool a little.. Scones are best served warm, so eat soon after baking, or rewarm them in the oven or microwave. To serve, split the scones in half and spread with clotted cream, whipped cream, or whipped butter (with some added honey of you're feeling decadent). They are also great filled with lemon curd. 
Eat them for breakfast, brunch or an afternoon snack with a nice cup of tea.\nRead the question below and select from the following choices.\nA: Deathberry Scones\nB: Cook\nC: Finished\nD: Ingredients and Equipment", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_27.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_28.jpg", 
+ "../MMIU-Benchmark/textual_cloze/textual_cloze_99_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_99_30.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Chocolate Covered Matzo\nB: Chocolate Croquembouche\nC: Chocolate Pastry Cream\nD: Arrange Your Matzo", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Melt the Butter and Sugar', 'Start Cracking!']", + "context": "Here is the context of these images:\n. Recipe adapted from the queen of all things delicious: Smitten Kitchen You will need: 4 - 6 Matzos 12 oz. semi-sweet chocolate 2 sticks of butter (I use salted) 1 cup light brown sugar, packed A handful of slivered almonds or chopped walnuts. Place your matzo on the baking sheet so that it fits. You can break it up and pieces can overlap to cover the area. We also wrapped the baking sheet in aluminum foil for easy clean up.. Preheat the oven to 350 degrees. Place the butter and sugar in a saucepan over medium heat. Melt and stir until the mixture starts to thicken and bubble. Once it begins to bubble, leave it on the heat, continuing to stir, for a few more minutes. At the last minute, add about a half teaspoon vanilla (if you desire) and stir in.. Pour the mixture on top of your matzo and spread quickly before it begins to set. Once it is thoroughly distributed, place the sheet in the oven and bake for about 15 minutes. Keep an eye on it and turn the heat down if it starts to bubble or brown too much.. Once your matzo and toffee have set in the oven, remove it and immediately sprinkle on the chocolate chips, distributing evenly. Let them sit to melt on the hot toffee for a few minutes, then spread the chocolate with a spatula. When the chocolate is still melted, add your almonds or walnuts (if desired). Let the entire sheet cool. 
You can put it in the fridge to the speed up the process, it should take about an hour in the fridge. . We just used our hands to crack the matzo deliciousness into smaller pieces. They'll be uneven, but trying to cut them with a knife proved unsuccessful. They are just as delightful, no matter what shape they are. . This is the part where you have to stop yourself. It's too good. Share your matzo crack with some friends. \nRead the question below and select from the following choices.\nA: Chocolate Covered Matzo\nB: Chocolate Croquembouche\nC: Chocolate Pastry Cream\nD: Arrange Your Matzo", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_100_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_100_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_100_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_100_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_100_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_100_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_100_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_100_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_100_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_100_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_100_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_100_11.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Barbecued Honey Seafood\nB: Not Quite Yet!\nC: Prepare the Ingredients\nD: Bread", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.[\"Let's Get Ingredients\", '@placeholder', 'Prepare the Other Ingredients', 'Make the Sauce']", + "context": "Here is the context of these images:\n. It doesn't matter how you get these, whether it be through a grocery store heist, or an Indiana Jones style quest. \u2022 1 pound of shrimp. 
These shrimp should be uncooked and gray. If they're already cooked they'll be tough when you re-cook them. \u20221/2 pound of scallops. \u20221/2 pound of squid tubes. These are basically squid scalps. They're the skin on the outside of the squids head. They look like white rubbery wizard hats.\u20221 pound of clams and oysters. There won't be a pound of clam meat, because the shells make up a lot of the weight, but a pound should be enough, even including the shells.\u2022Linguini. One package will probably be enough. It expands a lot when it's boiled.\u2022butter and olive oil. \u2022heavy cream. \u2022Take the shrimp. Chop off the tail, and peel the thin shell off the creature. Play death metal to set the mood. Now take a fish knife and cut a small slit in the shrimps belly. You should see a black line running through it. This is either it's spine or some type of vein. Either way it often has sand in it, so make sure and clean it thoroughly, removing the line. After all your shrimp have been thoroughly eviscerated be sure and wash them under water.\u2022 Depending on the size of your scallops you may need to chop them into smaller, more bite size pieces. This helps it to cook better and to mix with the rest of the dish.\u2022Take the squid tubes and chop them up, so that they form rings of squid. If you've ever had calamari shaped like rings, this is how they do it.. \u2022Take your clams and oysters and set them in a steamer and put it over the stove to boil. If you don't own a steamer, take a colander and put it onto a pot of water. Set the clams in the colander, but make sure that the water doesn't touch them. The steam will cause them to open up, but we don't want them to be cooked just yet. Now pry them out of the shell with your thumb and wash them\u2022Take the box of linguine noodles. Set it on the table and allow it to continue to be linguine. We don't need it just yet.. Take a pan and mix butter and olive oil together. 
Heat up the mixture until everything is completely warm and melted. Toss in the seafood and let it cook. Fry to taste, but make sure the shrimp turns pink before you take it off.Now boil some water and toss in the linguine. . I chose to do a white sauce. Take some butter, and some heavy cream and mix it in the pan (after you remove the seafood). Let it melt, and toss in some chives, or lemongrass. Onions go well with this sauce too, but make sure they're very finely chopped.You could choose other sauces as well. I hear white wine sauce goes well with this type of dish.. Take the linguine and the seafood and mix them thoroughly. Leave the sauce in the pan so that diners can take enough sauce to suit their preferences.Have a linguine party! \nRead the question below and select from the following choices.\nA: Barbecued Honey Seafood\nB: Not Quite Yet!\nC: Prepare the Ingredients\nD: Bread", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_101_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_101_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_101_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_101_3.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: How to Make a Vampie\nB: Prepare\nC: Add Pudding and Whipped Cream, Then Repeat\nD: How to Make a BLT", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Chop Cake and Layer', 'Drizzle With Liqueur', 'Add Fruit', '@placeholder']", + "context": "Here is the context of these images:\n. 
You'll need:angelfood cake (made from a box is dandy; purchased pre-made is also fine, and faster) 2 boxes french vanilla pudding (I used Jell-O instant) 4 cups whole milk (I often substitute half-and-half for extra richness)frozen fruit (I used cherries, blueberries, and raspberries from Trader Joe's) liqueur(s) of your choice (I like amaretto, irish cream, and any fruit-flavored liqueur) 2 cups heavy/whipping cream (make this fresh)1 teaspoon vanilla extractpinch of saltbrown sugar to taste (I use about 1/4 cup)grated chocolate on top (optional)Go ahead and bake your angelfood cake, mix up your pudding, and whip your cream. These can be stored until you're ready to assemble the trifle.I've made angelfood cake from a box and from scratch, and the difference isn't major- here you're combining it with so many other flavors that any such distinction would be lost. Purchased pre-made angelfood cakes will do, but tend to be a bit dry and sometimes taste off. Pound cake may be traditional, but I prefer angelfood because it's lighter, more absorbent, and complements the berries better.I used a trifle bowl (purchased at Target for $14), but you can use any bowl. Glass is preferable, because then you can see all the pretty layers.. Chop your cake into smaller pieces, about 1 inch thick, and spread them in a dense layer across the bottom of your trifle dish or bowl. The white interior of the cake looks best facing outwards for contrast with the fruit, so keep the darker edge pieces facing up. They'll disappear into the layers.. Pick the liqueur of your choice to drizzle over the cake pieces. You can use a different liqueur for each layer if you like- that worked quite nicely for me this time. This time I used plum brandy on the bottom layer, marsala in the middle layer, and kirschwasser (cherry) on the top layer. Other good choices: amaretto (almond), goldschlager (cinnamon), chambord (raspberry), triple sec (orange), irish cream (one of my favorites), or kahlua. 
Sherry is traditional, but doesn't add much flavor. Adjust quantity and proof of liqueur to your preferences.You can skip this step if you don't want the alcohol, or add a bit of fruit juice to help soften the cake. There are enough wet ingredients that everything will turn out well anyway. . Cover the cake layer with the fruit of your choice.I used frozen cherries, raspberries, and blueberries: you can use most any fresh fruit available, though I find berries best complement the texture. Don't worry about thawing frozen fruit, as it will thoroughly melt while the trifle sits.. Cover the berries with a layer of vanilla pudding, then a thin layer of whipped cream.. Don't worry about 100% coverage or being tidy- everything will get layered over, so just dump it on there. Keep an eye on the sides to get a nice layered look.Now add another layer of cake, drizzle it with liqueur, sprinkle with berries, and add more pudding and whipped cream. Continue until you've filled your trifle bowl or run out of ingredients. My trifle bowl held 3 layers of cake and fruit, with two layers of pudding and cream. Depending on the size of your trifle bowl, you'll likely have leftovers. Grab a glass bowl or some wine glasses, and make more little trifles with the extras, then stash them in the back of your fridge. They'll make excellent leftovers.Cover the top in a final layer of whipped cream, then grate chocolate over the top if you like for bonus style points.. Cover the finished trifle in plastic wrap, and store it in the refrigerator until ready to serve. 
It can easily be made the night before, or earlier in the day, as the flavors only improve upon sitting and mingling.Garnish with a couple of fresh berries, a dusting of spice, or a sprig of mint if you're feeling particularly giddy, then just add a large spoon and step back to avoid the ravening hordes.\nRead the question below and select from the following choices.\nA: How to Make a Vampie\nB: Prepare\nC: Add Pudding and Whipped Cream, Then Repeat\nD: How to Make a BLT", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_102_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_102_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_102_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_102_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_102_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_102_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_102_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_102_7.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Broccoli and Potato Soup\nB: Saute the Veggies\nC: Add Chicken Broth, Bring to a Boil, Add Rice\nD: Eat It", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Start', 'Blendeer', '@placeholder']", + "context": "Here is the context of these images:\n. 1 Head of broccoli.\n1 Head of\u00a0cauliflower.\n2 Onions (you can use two white onions if you like I use a red one because I have one).\n3 stalks of celery (this one is to your liking).\nhalf stick of butter.\n3 cloves of Garlic.\nenough chicken stock to cook all the ingredients.. First chop the celery and the onions, you want medium size\u00a0pieces\u00a0not to thin.\nSecond turn on the stove and melt the butter in a frying pan large enough onion and celery, when the butter melts add the vegetables and saute.. 
While you saute the\u00a0vegetables prepare the chicken stock and check the salt level, this is the moment to add more water or salt if required, next cut the broccoli and\u00a0cauliflower you want\u00a0medium\u00a0to big pieces and remember remove the stalk from the broccoli.\u00a0\nWhen the onions and the celery are\u00a0sauteed add then to the stock with the chopped broccoli and cauliflower\u00a0and\u00a0let them cook for a 15 - 20 minutes.. When you fell the broccoli and the cauliflower tender turn off the stove and carefully\u00a0(Because that soup is really hot)\u00a0place the soup in the blender until you have the right consistency, do it little by little and \u00a1voil\u00e0!\u00a0you finish the\u00a0preparation.. Serve a little on a plate and enjoy, you can garnish\u00a0with a\u00a0little\u00a0of laurel or other green herb like coriander but remember this is for decoration only.\nRead the question below and select from the following choices.\nA: Broccoli and Potato Soup\nB: Saute the Veggies\nC: Add Chicken Broth, Bring to a Boil, Add Rice\nD: Eat It", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_103_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_103_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_103_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_103_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_103_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_103_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_103_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_103_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_103_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_103_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_103_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_103_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_103_12.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", 
+ "source": "RecipeQA", + "options": "A: Mixing Molasses Syrup With Popcorn\nB: Popcorn Ball Brains\nC: Cover the Snowmen\nD: Stack the Snowmen", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Getting Your Work Space Ready', '@placeholder', 'Cleanup, Storage, and Later Eating']", + "context": "Here is the context of these images:\n. Remember, this is for a pretty large batch. I would recommend halving this recipe.\n*12 1/2\u00a0 qts of popped popcorn (for this I used ~1 cup of popcorn kernels\nvegetable oil\n3 c. light molasses\n2 c. white karo syrup (I think this is just corn syrup)\n1 1/2\u00a0 c. white table sugar\n4 tbsp. butter + 1/4 c. butter at room temperature\nSpecial Equipment:\nCandy thermometer\nLARGE bowl (heat proof)\nmedium bowl (heat proof)\nLarge pot\n* 1 quart = 4 cups. This is not hard but it may be a little strange for some people who have never cooked popcorn without a microwave. Follow the instructions and everything will be fine!\n1. Place 3-4 corn kernels in your large pot and pour enough oil in the pot to come up about half way to three quarters of the way up the kernels. Turn on the stove to medium heat and wait for your 3 kernels to pop. They will tell you when the oil is hot enough. PLACE THE LID ON YOUR POT. You do NOT want hot kernels or oil jumping out of the pot and hurting you, your loved ones, or fuzzy friends.\n2. Once your 3-4 kernels have popped place enough popcorn kernels in the pot to cover the bottom of the pot and not stack on top of each other. Put the lid back on the pot.\n3. The popping will start out slowly and quicken. Once your popcorn has slowed its popping (but has not stopped) remove it from the heat and pour into a medium sized heat-proof bowl.\n4. Use a heat proof utensil to scrape anything left in the pot that you don't want burning when you do the next batch of popcorn.\n5. Place 3-4 more kernals in the pot and put more oil in as you did before.\n6. 
While oil is heating up shake your bowl with the popcorn in it a little bit so the kernels that didn't pop move down to the bottom.\n7. CAREFULLY scoop handfuls of popcorn out of the bowl its in and into your very large heat-proof bowl. Try to avoid unpopped kernals because they can be VERY hot for a little while after you pour them into the bowl. Do not try to reuse these unpopped kernals.\n8. Once the oil is heated add more popcorn kernals to it as you did before in and repeat steps #2 - #8. You will most likely make 3-4 batches of popcorn if you are using the proportions in this recipe. . This may seem like a strange step but when you mix the molasses syrup with the popcorn and then form the balls you're going to take up a bit of space.\nWherever you're going to do this make sure that you put wax paper down on your work space (see picture)\nTake your 1/4 cup of butter at room temperature and put it on a plate. This will be the only thing that keeps your hands from sticky madness.. Ok! The popcorn is made and your work area is ready!\n9. Pour the molasses, karo syrup, and sugar into your pot. Stir it up until mostly homogeneous. Put the lids back on your molasses jars and karo syrup container and flip it upside down so that the syrup pools to the top of the container. This way you get everything you paid for! Pour the rest of the syrups into the pot.\n10. Set your candy thermometer in pot. Turn on the heat medium or medium high.DO NOT STIR YOUR SYRUP AT ALL ONCE IT HAS STARTED HEATING! RESIST THE URGE!\n11. Once your syrup starts boiling throw in 1 tbsp of butter and DO NOT STIR.\n12. When the syrup reaches your desired temperature (discussed below) remove it from heat and add the other 3 tbsp of butter and let it melt on top of the syrup. 
Once it is completely melted it probably wont hurt anything to gently stir it in so it's mixed well.TEMPERATURE:\nPLEASE don't let this section intimidate you.\nNana has the syrup go to 238 degrees in Tucson, AZ (Elevation: ~2,389') according to google)\nI looked on a website and it said that you should reduce the cooking temperature by 1 degree every 500' increase in elevation. I currently live in Colorado at an elevation of 8437'. The difference in elevation is ~6000' so by this rule I should bring the syrup down 12 degrees to 226 degrees, which is what I did. I might take it down 1-2 degrees next time but it still came out very well. If you would like to figure out what temperature you should cook it to I would use this 1 degree per 500' rule. Don't forget that if you live at a LOWER elevation than Tucson you will add 1 degree per 500'. To find your elevation you can check google. I hope this makes sense. . This is a very messy part. There will be lots of popcorn that spilled over your bowl. That's okay - I always like to eat the jumpers while someone mixes the popcorn (if I am lucky enough to have someone else mix it for me).\nAgain, splitting your popcorn into two bowls is not ideal because of this part. If you put too much syrup in one bowl the other is not going to have a very nice coating of molasses.\n13. Pour about 1/2 of your molasses batch over the popcorn.\n14. Mix the popcorn gently by scooping spoons/spatulas down the sides of the bowl to the bottom. Gently bring the spoons up through the popcorn so that the molasses that drips to the bottom is brought up to the top of the popcorn. Repeat this step many times until most of the molasses is coating the popcorn and not pooling at the bottom. The motions are the same as if you were tossing a garden salad. This may take a little while. Be patient. Then pour more molasses over and mix again. Repeat until all of the molasses has been mixed into the popcorn.. This step is my favorite part.\n15. 
Once the molasses syrup is mixed thoroughly into the popcorn WASH YOUR HANDS.\n16. Once your hands are clean and sanitized scoop up some of the room temperature butter (remember, you placed 1/4 cup on a small plate in your work area). Smear it all over the inside of your hands and between your fingers if need be. This will keep the popcorn from sticking to you.\n17. Grab some popcorn (make sure its not too hot to touch) and form it into balls. It may fall apart if its too warm or the cooking temperature was too low. That's ok. Wait for it to cool a little longer and then reform them. Don't be too rough - you don't want to squish the popcorn!\n18. At this point you can eat or store the popcorn balls. For information on storage see the next step. ENJOY!\nNOM NOM NOM!. The bits of popcorn left in the bowl are good to eat and help you postpone tapping into your popcorn ball stash (even if it's for an hour). If you let your bowls and utensils soak in hot water for a little while you will have no trouble removing the molasses syrup.\nWe always freeze about 2/3 of the popcorn balls in ziploc bags. When you want to eat one you can just take it out and microwave it for ~30 seconds.\nThe ones that you don't freeze can just be kept in a ziploc on the counter or stacked on a cake holder with a lid.\nOne last thing - if you have sensitive teeth or you just want to have your popcorn ball softer just throw it in the microwave for 10-20 seconds. They're not hard but you may just want yours a little softer.\nI hope you enjoy these. 
Have a wonderful day!\nRead the question below and select from the following choices.\nA: Mixing Molasses Syrup With Popcorn\nB: Popcorn Ball Brains\nC: Cover the Snowmen\nD: Stack the Snowmen", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_104_20.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Tenderizing the Veggies\nB: Easy Shepherd\u2019s Pie Recipe\nC: Easy Croissants Recipe\nD: Conclusion", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Preparing the Meat Broth', 'Preparing the Veggies', '@placeholder']", + "context": "Here is the context of these images:\n. 
1 lb of boneless meat (beef sirloin, pork loin or chops) 2 large onions (white or red) 3 bay leaves 4 medium-sized beets 1-2 Tbsp lemon juice 3-4 medium-sized potatoes, preferably Yukon Gold 2 carrots 4 celery sticks 2 red bell peppers 1/4 - 1/2 of small head of cabbage 4 tomatoes 2 Tbsp olive oil 1 can of beans, preferably Cannellini white, but black or red will also work 1 bunch of parsley 1/2 - 1 bunch of dill (optional) 5 cloves of garlic Salt, pepper, sugar Sour cream (optional) Tomato paste (optional). Note: I recommend completing this step a day in advance, since it takes a while.Caution - Meat Handling: Keep meat frozen when storing. Move it to the refrigerator at least a few hours before cooking, or defrost. Wash your hands and equipment after handling raw meat.Cut meat into about 1/2-inch cubes and toss into a medium-sized pot Fill the pot 3/4 of the way with water Peel the skin off of 1 onion and add the whole onion to the pot Add bay leaves, and salt and pepper to taste Bring the pot to a boil; then lower the heat to medium and let it simmer for about 2 hours When the meat is ready (soft and chewy), throw away the used onion Set aside the finished broth and let it cool; Refrigerate until ready to use. Cut off the leaves and stems, keeping only the heads of the beets Peel the beets Place beets into a pot, add enough water to cover them, and cook until you see bubbles start accumulating. That means the beets are about to boil.Note: Do not boil or the beets will lose their bright color Strain the beets over a separate container to keep the liquid Add lemon juice to the liquid to preserve color. Dice the onions Cut the potatoes and tomatoes into bite-sized cubes Cut carrots, celery, bell peppers, cabbage, and cooked beets into thin strips Chop garlic into tiny pieces or grate it. Place the large pot over medium heat and add 2 Tbsp. 
of olive oil Add onions, celery and carrots and saut\u00e9 (cook them in the oil) for 4 - 5 minutes Add cabbage, strained beets, potatoes, bell peppers, 1.5 - 2 cups of the beet liquid prepared earlier, salt, and pepper Cover with a lid and cook on low heat until the veggies are tenderNote: The veggies should still have some texture and crunch; otherwise they're too soft Lift the lid about every 10 minutes, mix, and check if the veggies are ready--it should take about 30 minutes Add tomatoes, salt, pepper, and sugar as needed, and cook for a few more minutes. Add beans, the rest of the beet liquid, and meat with broth. If needed to fill up the pot, add some water and bring to boil. Let simmer for a few minutesTaste the dish and add extra salt, pepper, and sugar as desired. It should taste sour-sweet (not too sour or too sweet)Note: Actually taste the dish and see if you like it at this step--this is the most important step Add chopped parsley and dill and mix it with the rest of the ingredients and bring it to boil again. 
Turn off the heat completely immediately after it starts boiling or the greens will overcook Add garlic right after turning off the heat Partially cover the pot with the lid, so the veggies can keep cooking in the hot water Let it cool, then serve Add a teaspoon of sour cream to your bowl if you want and enjoy :)\nRead the question below and select from the following choices.\nA: Tenderizing the Veggies\nB: Easy Shepherd\u2019s Pie Recipe\nC: Easy Croissants Recipe\nD: Conclusion", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_105_17.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: 100% Whole Wheat Focaccia\nB: Knead Dough, Shape, Cover, & Let Rise\nC: Ingredients\nD: Egg Wash & Cover With Remaining Mixed Seeds", + "question": "Choose the best title for the @placeholder to correctly complete the 
recipe.['@placeholder', 'Grate!', 'Stir Em Up', \"It's Ready for Perfection\"]", + "context": "Here is the context of these images:\n. Gather your Ingredients. 2 Cups Whole Wheat Flour 1/2 Cup White Flour 1/2 Cup Flax- You can use ground if your family doesnt like flax or omit it and just add a extra 1/2 cup of white flour 1 1/2 cups \u00a0Salted Butter- Cold 1 Tbsp Vinegar 4 Tbsp Cold Water. 1 Egg. Add your flours, and flax into a bowl. Grate your Butter into the Bowl. Grating it on the largest setting on your Grater is a wonderful was to get the perfect size of Butter pieces without trying to mix it into pea sizes pieces. \u00a0. Add remaining ingredients and mix until it forms a ball.\u00a0 Stick that bad boy in the fridge for at least a half hour before using.\u00a0. now use it to your hearts desire! \u00a0\u00a0 i used mine to make mini Meat Pies!\nRead the question below and select from the following choices.\nA: 100% Whole Wheat Focaccia\nB: Knead Dough, Shape, Cover, & Let Rise\nC: Ingredients\nD: Egg Wash & Cover With Remaining Mixed Seeds", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_106_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_106_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_106_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_106_3.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Green Chile Cheeseburgers\nB: Frying the Onion Rings\nC: Making the Patties\nD: Onion Bacon Cheeseball", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Chilling the Meat', '@placeholder', 'Time to Serve', 'Enjoying Your Meal']", + "context": "Here is the context of these images:\n. 
For two pounds of burgers:1/2 cup of fresh minced green onion1 tablespoon of Worcestershire sauce3 cloves of minced garlic1 teaspoon of salt1/2 teaspoon of black pepper1 teaspoon of Italian seasoning. Add the 2 pounds of venison burger to the bowl and mix thoroughly. . For best flavor, it is advised that you let the meat rest in the refrigerator for at least an hour. . After letting the meat cool down in the refrigerator, it is time to make the patties. I made 8 1/4 pound patties with 2 pounds of meet. . Grill the patties for 3 minutes on each side. There should be an amazing smell and grill marks on your patties. . This part is optional. After grilling for 6 minutes, I turned the heat off and put cheese slices on top for some delicious cheesy taste. I would recommend to leave at least one without cheese so you can have a full and unaltered taste of the burger. . I toasted kaiser buns and regular buns. It's optional but I enjoy the buttery crunch. You can treat these like a cheeseburger and put on lettuce, tomato, onions, etc. . With everything done, it's time to dig in! I went traditional with my sides. I ate the burgers with baked beans and a salad. It was a great meal. 
I hope you enjoyed!\nRead the question below and select from the following choices.\nA: Green Chile Cheeseburgers\nB: Frying the Onion Rings\nC: Making the Patties\nD: Onion Bacon Cheeseball", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_107_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_107_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_107_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_107_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_107_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_107_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_107_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_107_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_107_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_107_9.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Campfire Boiled Eggs\nB: Check the Rice\nC: Done\nD: The Triple Rinse", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Purchasing Raw Dried Peanuts', '@placeholder', 'Seasoning and Cooking', 'Enjoy and Spread That Peanut Love']", + "context": "Here is the context of these images:\n. For this, the ingredients are simple.1.5 pounds of raw, dried peanuts1/2 cup of saltPressure CookerWater. For folks not in the southern United States, green peanuts are very difficult (or impossible) to find. Raw dried peanuts, though not found in every grocery store, can be found in specialty and health stores. It's important to note that dried roasted peanuts are completely different. They will not work! For this, I bought about 1.5 lbs from the local health food store, and paid $3.00. The peanut on the left is a raw, dried peanut. The right is a dried roasted peanut.. Peanuts grow from the ground, therefore they're, well...dirty. The first three steps are rinse. rinse. rinse. 
I collect the water from my rinses so that I can compare the cloudiness of each rinse to the next. Three solid rinses should have the peanuts ready to boil.. Boiled peanuts require time. With few ingredients, they need time to soak in whatever seasonings they're boiling with. I prefer the original method, plain old salt and water. For this round I purchased about 1.5 lbs of raw, dried peanuts, and I set aside about a 1/2 cup of salt. After rinsing thoroughly three times, I put my peanuts into a pressure cooker, filled to the fill line with water. It's important that you don't overfill with water! I then added about 1/3 of the salt that I set aside.Pressure cookers are great because they cook about 8X faster than a pot on a stovetop, and you don't need to continuously add water.Place on high heat until boiling, the rocker on the top of the pressure cooker will begin to shake furiously. Once it does, lower the heat until you get a soft, side to side rock from the rocker at the top. After an hour and a half remove from heat for about 5-10 minutes to let the pressure subside, then slowly twist off the top of the pressure cooker. Add the rest of the salt, stir it up, twist the lid back on and place back onto high heat and bring to boil, repeating the simmer process. After about an hour and a half remove from heat, but this time let it cool completely without removing the lid. When you do, you'll notice an oily sheen on top of the water, that's normal!The rest is up to you! Like your peanuts a little more firm? Remove from the water once cooled. If you like them on the mushy side, leave them in for as long as you'd like. If the peanuts aren't salty enough DON'T add more salt, just let them sit and soak.. After your labor of love comes the very best step, the eating! Enjoy your boiled peanuts much like you enjoy sunflower seeds, and toss the shells aside. Great boiled peanuts require a bit of trial and error, but once you get it right you can never go wrong! 
Share them with a friend and spread that peanut love wherever you are!\nRead the question below and select from the following choices.\nA: Campfire Boiled Eggs\nB: Check the Rice\nC: Done\nD: The Triple Rinse", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_108_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_108_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_108_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_108_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_108_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_108_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_108_6.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: (perfect) Lemon Meringue Pie\nB: Continued Crust\nC: Making of the Crust\nD: Ingredients & Equipment", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Baking the Cupcakes', 'Filling the Cupcakes', 'Making the Meringue']", + "context": "Here is the context of these images:\n. \n For the cupcakes:\n\t\t215 g self raising flour\n\t\t60 g caster sugar\n\t\t1 egg\n\t\t1 egg yolk\n\t\t170 ml milk\n\t\t90 unsalted butter (melted)\n\t\t1 tsp finely grated lemon zest\n\t\t1/2 tsp vanilla extract\nFor the filling:\n\t\tLemon curd (shop bought or home made)\nFor the meringue topping:\n\t\t125 g caster sugar\n\t\t2 egg whites\nPreheat your oven to 180 C (350 F / Gas 4)\nFor the Americans or those without scales, here is a good guide for conversion of weight to cups.\nYou'll also need bowls, spoons, and general kitchen equipment, a 12 cup cupcake tin, and an electric whisk will really save your arm when it comes to making the meringue. A piping bag will also come in handy.. To make the cupcakes, first stir together the flour, sugar and lemon zest in a large bowl. 
Make a well in the centre of the dry ingredients.\nPut the egg, egg yolk, milk and vanilla into a medium bowl and pour in the melted butter. Mix together and then pour the liquid into the flour well.\nFold the wet ingredients through the dry ingredients until just combined.. Spoon the batter into the cupcake cases. Put the tin in the oven (180 C / 350 F / Gas 4) and bake for around fifteen minutes, until risen and golden.\nRemove the cakes from the oven and allow to cool for ten minutes. Leave the oven on for the meringue topping later.. Once the cakes have cooled a little, you need to hollow out the middle to create a space to put the lemon curd. If you have an apple corer this will be a breeze, or you can do as I did and just cut a small cone out of the centre with a small, sharp knife. (The leftover cake middles are the bakers perks)\nFill the hole with lemon curd. A piping bag will make this easier, or you can carefully spoon it in.. The final stage is to make the meringue topping.\u00a0\nPut the two egg whites in a clean, dry bowl (any grease or dirt will prevent them from whipping up), and whisk them until they form a firm peak. This means that when you remove the whisk the mixture will cling to it in fluffy clouds, and peaks will stand up from the bowl like small snowy mountains.\nAt this stage you can start adding the sugar. Do this gradually, in about four lots, making sure to beat well in between to incorporate the sugar properly. When the last of the sugar has been added keep beating until you get stiff, glossy peaks. These should stand firm, and the mixture will have a silky shine.\n(A classic test of your meringue is to hold the bowl upside down over your head and if ready it will not fall out. This requires extreme confidence in your meringue, or you'll be washing sticky sugar out of your hair for the next half hour).\nNow you need to put the meringue topping on your cupcakes.. 
Once the meringue is ready, pipe or spread a little on top of each of the cupcakes, making sure to cover all of the lemon curd.\nSprinkle the tops of the meringue with a litte caster sugar, return the cakes to the oven and bake for about five minutes, until the meringues are slightly golden brown.\nLeave the cakes to cool for around ten minutes (red hot sugar is not kind to the tongue), and then devour!\nRead the question below and select from the following choices.\nA: (perfect) Lemon Meringue Pie\nB: Continued Crust\nC: Making of the Crust\nD: Ingredients & Equipment", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_109_17.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Make Coffee Using the French Press\nB: Making the Meringue\nC: Macaronage\nD: Clean the Carafe", + "question": "Choose the best title for the 
@placeholder to correctly complete the recipe.['Dispose of Grounds', '@placeholder', 'Clean the Filter', 'Reassemble Your Press']", + "context": "Here is the context of these images:\n. 1.A - Remove the lid and plunger (and don't worry about grounds on the side of the carafe)1.B - Fill about half full with water 1.C - Dump contents into a wire mesh strainer (over the sink). The water will run clear and the grounds will be kept out of the sink. Again, don't worry about a few grounds on the side of the carafe. If there is still a lot of grounds, repeat 1.B and 1.C.1.D Dump strained grounds into the compost or trash. Tap the strainer if needed to dislodge any remaining grounds. . 2.A - Rinse the filter from above to knock off most of the grounds2.B - Add warm soapy water to fill half the carafe2.C - Use the plunger up and down a few times to clean the carafe. This helps get all of the oils out of carafe. 2.D - Rinse clear. 3.A - Remove the filter from the plunger - usually it just screws on near the base. 3.B - Now separate the filter parts and wash them wish soapy water. 3.C - Reassemble. 
Go grind some beans, because you have a clean press ready to make a fresh pot of coffee!\nRead the question below and select from the following choices.\nA: Make Coffee Using the French Press\nB: Making the Meringue\nC: Macaronage\nD: Clean the Carafe", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_110_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_110_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_110_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_110_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_110_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_110_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_110_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_110_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_110_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_110_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_110_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_110_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_110_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_110_13.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Adding the Basket\nB: Spiced Up Jalapeno Poppers!\nC: In/out Cupcake Chess Set\nD: Ingredients", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Connect the Drive Unit.', '@placeholder', 'Strainer & Cutter', 'The Lid & Juice']", + "context": "Here is the context of these images:\n. Remove the accessories port cover on the kitchen-aid. Take the juicer base and turn it until the drive shaft slides into the slot. Then turn the drive base until it lines up with the accessory port notch. Tighten the lock knob to hold it in place. . Slide the basket down onto the base assembly. 
You should feel some resistance as you do this from the the gasket sealing the central shaft and the basket juice holder. Add the basket wiper into the basket. It should simply insert and remain loosely in place. . Choose the basket that you want to use based on the type of juice you want. The hole size on the strainer denotes how thick the finished juice will be. The smaller the holes the thinner the juice. Larger hole strainers are recommended for making sauces and soups. Place the strainer in the center and slide down in place on the drive shaft. Twist the basket so that the arrows line up on the basket and the strainer. Insert the cutter insert on top of the drive shaft and push down into place. The outside of the cutter should line up even with the basket. . Add the lid to the top of the basket. Line the unlock area up with the left top of the basket until it fits over top the basket. Once lined up twist the basket right until it slides completely into the locked position. This takes a considerable amount of force to get in place because the lid pushes down on a safety switch that allows the whole system to rotate. If the unit does not spin when the mixer is on, then the lid is not completely locked in place. Place the juice pitcher under the juice spout, and the pulp container under the pulp shoot. Turn the mixer on to setting 10, and add what you want to juice. 
You are Done!\nRead the question below and select from the following choices.\nA: Adding the Basket\nB: Spiced Up Jalapeno Poppers!\nC: In/out Cupcake Chess Set\nD: Ingredients", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_111_15.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Add Tomatoes\nB: Curry Omurice\nC: Choping\nD: Mash It Up", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Dry Roast Masala Paste Ingredients', 'Add Chopped Onions', '@placeholder', 'Add Sauted Brinjal Slices']", + "context": "Here is the context of these images:\n. -cut brinjals to long slices and keep ready-Take a pan add little oil in it.Add sliced brinjals and shallow fry until they turn little brown color. -Main flavour of this dish is with masala paste.For preparing masala paste take 1tbsp sesame seeds,2 tbsp dry dessicated coconut and few peanuts in blender. 
-Take a pan add dry roast sesame seeds,peanuts and dry dessicated coconut on low flame.. -Add above roasted ingredients in blender and blend by adding water to paste. -chop onions,tomatoes and one potato and keep ready-Take all spices as per mesurement in ingredient list. -First step is to take a wide vessel and add little oil.when is heated add cumin,mustard seeds.-when they crackle add split black gram and few curry leaves. -Next add finely chopped onions and saute for few minutes-Once onions are little cooked add one diced potato .. -Add one or 2 chopped tomatoes.choose tomatoes which are fully ripen or add mushy tomato puree.-Add tomatoes and saute until they turn mushy.. -Next step is to add blended masala paste and mix with cooked onion,tomatoes.. -Add a pinch of turmeric powder and 2tsp coriander powder and mix. -Even add tsp red chilli powder and salt as per taste.Mix spices properly with masala paste. -Finally add shallow fried brinjal slices and mix every thing- If the gravy is too thick add little water,cover and cook for some time. 
- Garnish with spring onions and serve curry hot with biryani or chapathi\nRead the question below and select from the following choices.\nA: Add Tomatoes\nB: Curry Omurice\nC: Choping\nD: Mash It Up", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_112_24.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Chicken Ballotine\nB: Heat Things Up.\nC: Prepare the Chicken\nD: Chop the Onions 
Lengthwise", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['More Chopping.', 'Relax Your Chicken!', '@placeholder', 'Recipe']", + "context": "Here is the context of these images:\n. Soak the pot and saucer in cool water for at least 15 minutes prior to putting it in the oven. Prepare all your food before and during the soak, so you can quickly fill it and put it in the oven Your oven MUST be cold when you begin or the rapid change in temp may crack your clay pot. Let the pot heat up gradually with the oven.. Chop your vegetables while the pot is soaking. You can use whatever you like for this, root vegetables mixed with onions are always a nice base. This time I used leeks, bell peppers, garlic and red onions. . Chop everything up and set aside. To avoid crying while chopping onions people suggest sucking on a piece of white bread, slicing under cool water, breathing through your nose... none of these work however, because it is a gas that is in the air coming in contact with your eyes, not the fine mist of onion juice going in your mouth or nose. Try wearing a gas mask, which can be purchased from most army supply stores.. Massage your chicken until it is very relaxed. Coat the chicken liberally with seasalt, fresh black pepper and your favorite poultry spice rub (my sister swears by Paul Prudhommes Poulty Seasoning). Pour all your veggies in the saucer, plop your bird on top of that cozy nest... . Cover the whole party with the flowerpot, and pop it in the COLD oven. Close the door and put the temp at 325f. degrees for 1 hour. You can also drop a remote thermometer sensor down through the hole and into the thickest part of your meat, whatever it may be, and set it to go off when it is about 10 degrees lower than your target temp. I put a pizza tray or cookie sheet under it to catch juices. You will likely have to remove all your oven racks to fit it in.. 
Grate a mountain (about 1 loose cup) of your favorite hard cheese (asiago, romano, parmesan etc.) using a microplane if you have one, a fine shredder of any sort will do. A decent food processor will save you some time here. \nWhen the hour is up (or your temp alarm goes off) open the oven, pull the pot out far enough to remove the top (using heavy duty burn protection, not just a kitchen towel, please). Sprinkle the cheese over the bird and cook uncovered for ten minutes more. . You should end up with something like this at the end. Refrain from tearing into it immediately. Let it cool for about ten minutes so the juices don't squirt out. Your rice should be done just when it is time to cut the chicken.. Mmmmm! Scrumptious Delights! Be sure to drizzle some of that gravy onto your rice as well.. Add a side of asparagus, an artichoke or some other favorite green vegetable and you have yourself a simple, succulent feast! Ala cuisine!. For those who work better with a detailed recipe, this is my recipe from the first time I used this flower pot at my father's house when I dug it out of the pile of dirt behind the shed. It varies from the instructable only in the extra vegetables used as a \"nest\" but the process and seasoning is very close.Rupa's Flowerpot Chicken Geyserville1 chicken, approx 4 lbs. 2 lbs red or white new potatoes1 lb plum or roma tomatoes2 med. onions (approximately 2 cups chopped)2 med. 
green bell peppers3-7 cloves of garlic (depending upon your taste)1 tsp marjoram and/or thyme1 tsp salt1 tsp fresh ground black pepper (double if using preground)1/4 tsp cayenne pepper1/8 tsp nutmeg2 sprigs fresh rosemary3/4 cup red wine1/4 cup Parmesan, asiago or other hard cheese, gratedInstructions:In advance if possible set your chicken in an cold brine to soak--at least 30 minutes per pound, but not more than 8 hours total.1 quart cool water1/2 cup kosher Salt1/2 cup sugar12 peppercorns6 allspice berries3 whole cloves1 sprig fresh rosemaryMix the salt and sugar in the water, add the whole spices and submurge the chicken in the pot, cover and place in refrigerator's bottom shelf. Be careful not to let the water drip on anything! Place the pot in a shallow dish such as a pie plate lined with a few layers of paper towels to be extra safe.Make a rub mixture of all dry spices (marjoram and or thyme, salt, black pepper, cayenne pepper and nutmeg).Lay onions, tomatoes, garlic and green peppers into pre-soaked flower pot and pour 1/4 cup red wine over the veg mix.Remove chicken from brine and dry with paper towels from under the pot, then lightly coat with olive oil and rub chicken liberally with spice mix, being sure to coat all over, inside and out!Place chicken on the bed of vegies in the saucer, add a sprig of fresh rosemary and cover with the flower pot. Place the whole thing into cold oven, turn temperature to 350* and bake for one hour without interuption.At one hour open and remove top. Baste liberally with juices from under chicken. 
Turn oven temp up to 500*Sprinkle with fresh grated parmesan or other hard cheese of your choice and cook for another ten minutes uncovered.Serve over brown rice.\nRead the question below and select from the following choices.\nA: Chicken Ballotine\nB: Heat Things Up.\nC: Prepare the Chicken\nD: Chop the Onions Lengthwise", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_113_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_113_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_113_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_113_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_113_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_113_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_113_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_113_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_113_8.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Icing\nB: Dragon Bowl\nC: Modular Cheeseball\nD: Eat and Enjoy!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Supplies', 'Assembly', '@placeholder', 'Wings & Fire']", + "context": "Here is the context of these images:\n. You will need-\n2 9-inch cakes\n6 cups of frosting, your choice of colour (more or less, depending on how thick you like your icing. \n8 Keebler chocolate covered graham crackers\n8 Sunkist fruit gems\n2 Sunkist fruit slices \n2 Blue Fruit Roll-Ups\n1 Red Fruit Roll-Up\n1 Yellow Fruit Roll-Up\n1 Marshmallow\n2 Chocolate Chips\n2 Hershey's Kisses\n2 Wooden Skewers\nYou will want all of your fruit gems, slices, and icing to be all colour co-ordinated-I chose orange candies and yellow icing, but you can do what ever colours will match the plates or napkins you will have at the party.\n. 
Take a serrated bread knife and cut the middle part out of all the rectangular crackers so that you have 2 triangular pieces out of each one. Next, slice all of the Fruit Gems in half.. Now, let's start assembling!Take one of your 9 inch round cakes and slice it in half. Take your frosting and spread some on one half, then place the other half on top, and place the whole thing on a cardboard cake board or aluminum foil covered cardboard.Next, you will cut up the remaining cake. The diagram below shows exactly how to do it. Don't worry if it isn't exact-you can always make a smallish piece bigger looking with icing. ;-). Next, assemble the cake pieces according to the picture below, trimming any pieces if necessary.\nNow, you will frost Mr. Dragon. I find that using a flat icing tip in your full icing bag works wonders on those difficult, moist and crummy parts, and once you've covered up the crumbs, you can use your spatula to smooth things over. This method keeps those crumbs from showing up in your icing.. Now, you will decorate the Dragon!\nFirst, to make the Dragon look like he is scaly, use a child's marker cap to imprint the design onto the creamy frosting. Arrange the Graham crackers along his back and tail; place Fruit Gems and Slices on his toes and head. Press the Hershey's Kisses pointy side in onto the end of his snout, and cut the marshmallow in half and place the chocolate chips on top.. To make his wings, you'll need a skewer and a blue fruit roll-up. Trim off a corner of the roll-up to keep them from looking too bulky, and after rolling it up the skewer, trim off the edges bat wing style. Repeat for other wing. Make sure not to stick these in until right before serving, because they are heavy and will sag over time.\nFor the fire, trim your yellow and red roll-ups into curvy, twisty pieces, and position near mouth.\nAnd, there you have it!! 
ENJOY!!\nRead the question below and select from the following choices.\nA: Icing\nB: Dragon Bowl\nC: Modular Cheeseball\nD: Eat and Enjoy!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_114_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_114_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_114_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_114_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_114_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_114_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_114_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_114_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_114_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_114_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_114_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_114_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_114_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_114_13.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: free Pizza\nB: Suggestions\nC: Toppings\nD: Sauces", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Add Base', 'Make It Meaty', 'Cheese Please!', '@placeholder']", + "context": "Here is the context of these images:\n. Tip #1 Recycle leftovers into new dishes! Heat some olive oil in a skillet and add in diced onions and garlic.\u00a0 Let onions and garlic cook until soft. Take a low carb tortilla and spread a layer of pesto to coat the base. Finally add on the garlic and onions.. Next add on your leftover meat!\u00a0 I had leftover meatballs and sausage, but anything will work!\u00a0 Maybe some grilled chicken or some roasted veggies.. 
Now add on the cheese!\u00a0 I used muenster because I had no mozzarella, but it actually melted really well and I would use it again!\u00a0 Choose whichever cheese you prefer; for a Greek twist try some feta with kalamata olives, or maybe some goat cheese and figs.\u00a0 Now add on some extra marinara sauce, fresh basil leaves, garlic powder, red pepper flakes, and oregano. Put it in the oven on 350 and let it cook until edges are golden brown and cheese is bubbly!. Suggestions: Serve with a side salad and some red wine. Try out all of the different options!\u00a0 Greek, Italian, maybe even try a Mexican style pizza. Use the marinara recipe from my food blog everythingbutfish.tumblr.com\nRead the question below and select from the following choices.\nA: free Pizza\nB: Suggestions\nC: Toppings\nD: Sauces", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_115_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_115_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_115_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_115_3.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Cut the Metal Strips\nB: Make Your Own Colored Decorating Sugars!\nC: Hide\nD: Make Your Own Kahlua!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Materials', '@placeholder', 'Bend', 'Finis']", + "context": "Here is the context of these images:\n. sheet metal (hobby store)\ntin snips\npliers for bending metal\nfile for smoothing edges\nnuts and bolts\ndrill\nclamps\nsafety glasses. cut metal strips about 3/4\" thick or greater.\nsmooth the rough edges with a file. bend the metal into the desired shape. secure the closure with clamps.\nalign the metal edges so the interior of the curve is smoothest. 
Put the screw hole near the edge so the amount of metal overlapping in the interior is minimized.\ndrill a hole for the screw to fit\nassemble so the head of the screw is inside of the form and the remainder of the screw extends outward. Secure with a bolt.\n. tada! your own custom cookie shapes. pretty cool\nRead the question below and select from the following choices.\nA: Cut the Metal Strips\nB: Make Your Own Colored Decorating Sugars!\nC: Hide\nD: Make Your Own Kahlua!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_116_15.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Fresh Veggie Tart\nB: Gather Your Ingredients and Supplies...\nC: Make the Tart Crust...\nD: Arrange the Strawberries & Blueberries", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Make the Pastry Cream', 'Place Strawberries & Fill W/ Pastry Cream', '@placeholder', 'Enjoy!']", + "context": "Here is the context of these 
images:\n. Here's a list of the ingredients that you'll need: For the fruit topping: strawberries blueberries 1 nectarine or 2 plums (I have made it with both) For the pastry cream: 2 cups half-and-half 1/2 cup sugar A pinch of salt 5 large egg yolks 3 tbsp. cornstarch 4 tbsp. cold unsalted butter, cut into 4 pieces 1 1/2 tsp. vanilla extract (we modeled our recipe after this one) 1 pie crust We didn't have very much time to make this so we ended up purchasing a graham cracker crust, like one you would use for a cheesecake.. To make the pastry cream, combine egg yolks and 2 tablespoons of sugar in a medium bowl and whisk until the sugar has begun to dissolve and the mixture is creamy, about 15 seconds. \u00a0Then, whisk in the cornstarch until the mixture is pale yellow and thick, about 30 seconds. On the stove, heat the half-and-half, 6 tablespoons of sugar and salt in a saucepan over medium-high heat until simmering, stirring occasionally to dissolve the sugar.\u00a0When the half-and-half mixture has reached a simmer, slowly add it to the egg yolk mixture, whisking constantly.\u00a0 Return the mixture to a simmer over medium heat, whisking constantly, until a few bubbles burst on the surface and the mixture is thickened and glossy, about 30 seconds.\u00a0 Off the heat, whisk in the butter and vanilla. \u00a0 Strain the pastry cream through a\u00a0 fine mesh sieve. You can use a spatula or a spoon to push the pastry cream through the mesh. After you've finished, place plastic wrap directly on the surface of the pastry cream. This will help to prevent a skin from forming. Refrigerate the pastry cream until it's cold and set. We didn't have much time so we took ours out after only about 2 hours, other recipes recommend at least 3 hours and up to 2 days.. Slice the plum into thin slivers and cut up the strawberries into flat pieces. You can arrange the plum slices in a ring to get an idea for how it'll look once it's assembled on the tart.. 
Line the bottom of the crust with the sliced strawberries. Most people won't expect to find fruit at the bottom so it'll be a delicious surprise! Then, evenly fill the crust with pastry cream. Smooth out the surface so that it's relatively flat.. This is the most fun part! creating the fruit topping. First, start by arranging the slivers of nectarine around the border of the tart. Leave a little space between each one. Try as best as you can to keep a circular hole in the center of the tart that isn't covered in nectarine. You can see that we didn't do the best job at this. No worries if it isn't looking quite right. Later on it'll be covered in strawberries and blueberries.. After you're happy with how the nectarines look, arrange the strawberries in a floral shape in the center of the tart. (unfortunately I was so focused on this step that I forgot to take pictures!) When arranging the strawberries, use the larger pieces for the outer ring and work your way inwards, placing one on top of another. Layer them in an alternating pattern. Then, where there's just a little space in the center add the blueberries until there's a small pile of them.. Step back and admire your work! 
It's delicious too.\nRead the question below and select from the following choices.\nA: Fresh Veggie Tart\nB: Gather Your Ingredients and Supplies...\nC: Make the Tart Crust...\nD: Arrange the Strawberries & Blueberries", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_117_18.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Prepare the Filling\nB: Gol Guppas\nC: Set Up the Workspace\nD: Melt the Dark Cocoa Candy Melts", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Making a Fractal', 'Making Cheese Dough', '@placeholder', 'Fill, Arrange, Bake!']", + "context": "Here is the context of these images:\n. 
The crust design is an Apollonian Gasket.\u00a0Wikipedia has a great article all about Apollonian Gaskets\nhttp://en.wikipedia.org/wiki/Apollonian_gasket\nThe essential things to know are:\n- An Apollonian Gasket is a space-filling fractal. In theory you could make many more circles to continue to fill the top of the pie, but in practice I found that 16 provided a nice design and kept circles at a reasonable size to work with.\n- Curvature is how \"sharp\" a curve is, and is inversely proportional to the radius of curvature . A straight line would have an infinite radius of curvature, the edge of a large circle would curve slowly (low curvature, large radius), while a small circle would curve very quickly (high curvature, small radius). The designs shown on Wikipedia list the relative curvatures of the circles within the fractal (with the first, negative, number listed being the radius of the largest \"frame circle\"), so we need to do a little number-crunching to figure out the actual radii we want to use.\nI decided to use the {-12, 25, 25, 28, 48} pattern that is shown on Wikipedia (I prefer the almost-D3 symmetry). To calculate the radii of the circles you will use, you need to take the radius of your pie dish (my 9.5 in dish has a 4.25 in radius), and multiply that radius by the first number in the pattern (in my case 12, the negative number) then divide by the curvature in question.\nThe pattern I chose to follow\nhttp://en.wikipedia.org/wiki/File:ApollonianGasket-12_25_25_28-Labels.png\nFor example, to find the radius of the \"25\" circle in the pattern, I used my spreadsheet to multiply 4.25 in * 12 / 25 = 2.28 in, or approximately 2 + 4/16 inches. 
If you are using a different sized pie dish, you can take each number below, multiply it by the diameter of your pie dish, and divide by 9.5\nThe radii, in inches, for the sixteen circles I used are as follows\nCurvature \u00a0 Radius (decimal) \u00a0 \u00a0Radius (fraction)\n25 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a02.280 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 2 \u00a04/16\n25 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a02.280 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 2 \u00a04/16\n28 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a02.036 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 2 \u00a01/16\n48 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a01.188 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 1 \u00a03/16\n57 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a01.000 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 1 \u00a00/16\n57 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a01.000 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 1 \u00a00/16\n97 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.588 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a09/16\n97 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.588 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a09/16\n112 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.509 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a08/16\n112 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.509 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a08/16\n121 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.471 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 
\u00a0 \u00a0 \u00a0 \u00a0 0 \u00a08/16\n121 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.471 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a08/16\n168 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.339 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a05/16\n208 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.274 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a04/16\n232 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.246 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a04/16\n232 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.246 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a04/16\nOnce you calculate the radii of the circles you intend to cut out, use your trusty compass to carefully measure off each radius on the ruler, then draw the circles on a large sheet of parchment paper. Do NOT cut them out yet. I also found it helpful to label each circle by its curvature, for reference later.... You will need enough cheese dough to line the pie dish as well as cover the pie, so if you feel like buying pre-made dough, just get enough for two 9.5 inch dishes.\nTo make your own dough:\nIn a large bowl, mix the flour, sugar, salt, and cheese.\nDice the frozen butter into small cubes, then pulse in a food processor with the dry ingredients until just mixed. Alternatively, cut the butter into the flour mix by hand. The dough should still be powdery and there should be small balls of butter throughout.\u00a0\nAdd 6 tablespoons of ice water to the mix to moisten. If the dough still seems too dry, add 2-3 more tablespoons until you can form the dough into a thin disk.\nWrap the disk in plastic and refrigerate for 30 minutes.. Once your dough has chilled, separate it into two even portions. One for the dish, one for the design. 
Make sure to flour your cutting board for easy removal of the dough once it is rolled.\nHeat your oven to 400 degrees Fahrenheit.\nFor the dish, roll the dough out until it is a large circle 1/4 inch thick, then press the dough into a 9.5 inch pie dish. Dock the dough with a fork.\nFor the crust design, roll the dough out to 1/4 inch thick, and lightly press the parchment paper with the circles on to the rolled dough. With the tip of a sharp knife, carefully cut out each circle, and leave the marked paper pressed on to each circle of dough for easy labeling and transport. Place the circles, parchment side down, on a large cookie sheet.\nIf you have left over dough, cut it into fun shapes and cook it along side the circles. Cheese dough is quite tasty on its own.\nFreeze the dish and the circles for 10 minutes to set the butter. Then cover the dish with foil or parchment, and add pie weights or beans to help the dish hold its shape. Cover the circles with one large sheet of foil as well.\nBake both the dish and sheet of circles for 10 minutes. Remove the weights and bake both for 10 more minutes until the crust is lightly brown.\nLet dough chill while you prepare the filling.. Prepare your filling in a heavy skillet over medium heat.\nIf you are using bacon, start by cooking the bacon first, until crisp. Then add the onions and apples and saute them with the bacon until they soften.\nIf you are using prosciutto, saute the onions and apples first in good olive oil (a couple of tablespoons), then add the prosciutto at the end for just a few minutes to add to the mix.\nIn a bowl, whisk the milk and eggs with the spices, while the pan filling cools a bit. A little spice goes a long way!\nGrate another 3/4 cup cheese to go on the filling.. 
Add the sauted filling to the pie dish first.\nNext, top the filling with the grated cheese.\nNext, pour the egg and milk mixture over everything.\nLastly, carefully arrange the dough circles according to the Apollonian Gasket pattern you are following. The circles should absolutely touch each other, and may still overlap the edge of the pie a little.\u00a0\nBake at 400F for 30-35 minutes or until the filling is golden.\nAllow to cool for 10 minutes before serving. It tastes great warm or chilled from the refrigerator and keeps for a few days.\nEnjoy!\nRead the question below and select from the following choices.\nA: Prepare the Filling\nB: Gol Guppas\nC: Set Up the Workspace\nD: Melt the Dark Cocoa Candy Melts", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_19.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_118_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_118_23.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Fig Newt Gingriches\nB: Mitt Hominy (a.k.a. Grit Romney)\nC: Making the Frangipane\nD: Prawn Pauls", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Making the Frangipane', 'Chilling the Layers', 'Trimming']", + "context": "Here is the context of these images:\n. Frangipane (Almond Cake)One of the secrets to good baking is weighing your ingredients. Weighing rather than measuring ingredients by volume gives more consistent results. If you don't have a food scale, you can pick them up fairly cheap online. We had gotten ours originally for weighing coffee, but it's been great for baking as well. I did try to put an estimate of the quantity in parenthesis because it definitely makes it easier when you're shopping.Also, you'll see a couple of ingredients listed twice. This is because you will use them at different times, so it's just easier to weigh them separately. Note: All your cake ingredients should be at room temperature13 ounce Almond Paste (Almost two boxes) 4 ounce Sugar 1 ounce Egg (You'll use around 9-10 large eggs total for this recipe.) 9 ounce Sugar 6.5 ounce Unsalted Butter (Around a stick and a quarter) 6.5 ounce Shortening (About a cup) 12 ounce Eggs 1/4 ounce Vanilla extract (About 1/8 a cup) 6 ounce Cake Flour (Sifted) (Around a cup)Bottom CoveringMarzipan Dough - Chilled (We used a 7 oz. box, but you could always make your own.)FillingA jar of your favorite preserves (You could also make your own fruit filling. Just make sure it's isn't too chunky. 
We thinned ours with a little warm water, just to make it more spreadable.) Ganache Coating / Filling16 oz Dark Chocolate (Four big Ghirardelli-sized bars of chocolate) 16 oz Heavy Cream (One pint)DecorationYour favorite buttercream icing, royal icing, fondant, etc. You could also use dragees (the little hard shelled silver candies) or sprinklesNote: You'll need 8 to 10 pounds of weights (for compressing the cake pans later).. Pre-heat your oven to 375\u00b0F. If you're like us, you have half-sheet baking pans (13\" x 18\"). This recipe fills a full-sheet pan or two half-sheets. If you're using half-sheets, you'll also need a third one (you don't have to flour / grease it) for transferring. If you're using full-sheets you'll need a second one. You'll want to use a sheet with a lip.Take a baking sheet and grease it with shortening or butter. Lay a sheet of parchment on top of the greased baking sheet and smooth it down. The first bit of grease will ensure the parchment sticks nice and flat to the pan. Now, grease the parchment so the cake won't stick to it. Sift a little flour into the pan and shake it around making sure it's well-coated before pouring off the excess.This seems like a lot of work, but when your cake practically slides off the pan onto your table it will all pay off.. Blend together all the almond paste (13 ounces) and sugar (4 ounces). Break up the almond paste and get the whole thing looking like sand. We used the paddle attachment on the mixer, but a food processor would have worked too. If you used a food processor, you will definitely want to transfer it to a mixer after you've combined the paste and sugar. Your end goal is to create a smooth lump-free mix. Lightly whisk the egg (1 ounce) and then slowly add it to the almond paste / sugar mix and blend until smooth. While you want a smooth mix, you don't want to over mix. Add the sugar (9 ounces), butter (6.5 ounces), and shortening (6.5 ounces) and mix together until light and fluffy. 
Slowly add the rest of the eggs (12 ounces) and vanilla (1/4 ounce). Slowly add in the sifted cake flour (6 ounces) and blend until smooth and creamy. Like with the almond mixture, you don't want to over mix.. Spread the batter into the lined baking sheet. The smoother you make the top the smoother your final cake will be. A cake spatula is the perfect tool for this.Bake at 375\u00b0F for 10 to 12 minutes. You want the cake to be firm but don't let the edges get dry. Dry = crumbly = wasted cake!. Ganache is super simple to make. You'll make it twice for this recipe. Once as a filling and once as a coating. If you make the ganache before you cook the frangipane, it will have enough time to cool to a spreadable consistency. For coating the cakes, you'll want to be ready to dip.For the filling, we used two bars and 8 ounces of the heavy cream. Break up your chocolate (you can use a food processor, just don't melt it). The finer the chocolate is chopped the easier it will melt. Using a double boiler, bring the cream to a barely a boil. We use medium-high heat on my stove. Pour the cream over the chocolate. Let stand for ten minutes. Gently stir until the chocolate and cream are smoothly mixed. You don't want to overwork it. The results should be smooth and glossy.For the filling, let it cool until it's spreadable.. Lay parchment on the back of your clean pan and lay it on top of the finished cake. Flip it over to transfer the cake from the pan to the back of the sheet. If you're using a half-sheet, you're going to want to cut the cake into thirds. (You need three equal-sized pieces of cake to make the \"sandwich.\") Don't worry about being pretty during this step, we'll trim the edges later on.Lay one sheet of cake on the back of a parchment-lined baking pan, and spread a 1/8\" thin layer of your jam (we thinned the jam with a little warm water). Top with the second sheet of cake and add your chocolate. 
Top with the third and final layer of cake, but this time spread a very thin layer of jam or chocolate (your choice) on top.. Roll out a 1/16\" thick sheet of marzipan about the same size as the cake. Roll it loosely around the rolling pin and unroll it on top of the cake. Run the rolling pin over the top (carefully, you don't want to pick the marzipan back up). This will become the bottom of the cake. The marzipan keeps the cake moist and it gives it a smooth bottom.. Put a piece of parchment on top of the marzipan and put your second baking pan on top.Use that pan to flip the entire cake upside down. Remove the original pan, and wrap the entire cake and bottom pan with plastic wrap. Now place the empty baking pan on top. Put weights on top of this pan. A couple of hand weights (no more than 10 pounds total) will work. (We used a big bowl of left over chili.) This squishes the cake layers together and makes sure everything gets sealed down. Put the whole thing in the refrigerator and chill the cake overnight.. Now comes the fun part. Trim the cake so you have clean, smooth edges. (The trimmings are also delicious. Especially with a bowl of ice cream.) In the photos, we're using a pizza cutter. However, a serrated knife would have been a better choice for less crumbs and a smoother profile. Cut the cake into 1\" x 1\" pieces.. Follow the same recipe for ganache as before, but don't let it cool down. Traditionally, the cakes sit on a wire mesh rack and the ganache gets poured over them. Our wire racks were a little too wide for the cakes, so we skewered the cakes and twirled them in the ganache.We also took a few shortcuts when it came to the decorating. You can do the same, or you can lose yourself in the world of decorating. (Here\u2019s a collection of my favorite cake decorating instructables.) In the end, I chose to go with simple off-the-shelf cans of icing because I didn\u2019t have a lot of time. 
My decorations were very minimal, but I really think you do need some kind of decoration to make the cakes really pop.\nRead the question below and select from the following choices.\nA: Fig Newt Gingriches\nB: Mitt Hominy (a.k.a. Grit Romney)\nC: Making the Frangipane\nD: Prawn Pauls", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_26.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_119_27.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_30.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_31.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_32.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_33.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_34.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_35.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_36.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_37.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_119_38.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Blue Cheese Palmiers\nB: Deep Fry\nC: Making the Patties\nD: Roll Them Up", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['What You Will Need', '@placeholder', 'Cooking the Patties', 'Cleanup']", + "context": "Here is the context of these images:\n. 1 1/2 lbs ground beef (leanness is your preference. Less lean will be juicier and more lean healthier. I used 85% lean)2oz. blue cheese1/4 cup chives (chopped)1/8 teaspoon hot sauce1/2 teaspoon Worcestershire sauce1/2 teaspoon pepper3/4 teaspoon salt1/2 teaspoon dry mustardBuns (something hearty, like pretzel). Large plastic bowlMisc. measuring spoons and cups (standard)Cutting boardKnife (for chopping chives)Plastic wrap. WARNING: Raw beef may contain bacteria and cause food poisoning. Wash your hands immediately after handlingthe beef1. Using the cutting board and knife chop \u00bc cup worth of fresh chives2. Combine all ingredients into the bowl3. Mix thoroughly (hands are best used to mix evenly)4. Leave in the bowl and cover in plastic wrap and let set in the refrigerator for around 2 hours5. 
After the mixture has set, divide into individual patties (makes 4-6 patties.). Grill the patties until cooked to your liking (well done, medium, etc.). If you do not have a grill, broil them in a large Pyrex dish and flip them after 5-10 minutes (I used this method personally, works great)Caution: Keep patties on a lower rack in the oven if broiling. This will keep them from cooking too quickly on the outside and leaving the inside under cooked. Serve on the buns and enjoy. Thoroughly wash all utensils and surfaces, particularly those that came into contact with the beef. Use either disinfectant wipes or soap and hot water to clean any drops or spills.\nRead the question below and select from the following choices.\nA: Blue Cheese Palmiers\nB: Deep Fry\nC: Making the Patties\nD: Roll Them Up", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_120_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_120_1.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Second Layer of Chips\nB: CARTS Nachos\nC: Ingredients\nD: Start Filling", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['First Layer of Chips', 'First Layer of Cheese', '@placeholder', 'Heating Up the Nachos']", + "context": "Here is the context of these images:\n. So you take the chips and put them on the plate. Put as much as u want . Now put the first layer of cheese on top of the chips again put as much as u want remember it's only the first layer. Now it's time to put the second layer of chips so just put chips on top of the first layer . It's time to up the second layer of cheese on top of the second layer of chips and u can put as much cheese because this is ur last layer. Now heat up the nachos for 30 seconds in the microwave . 
Now you get to eat it!!!You can add different toppings if u want it's totally up to you \nRead the question below and select from the following choices.\nA: Second Layer of Chips\nB: CARTS Nachos\nC: Ingredients\nD: Start Filling", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_121_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_121_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_121_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_121_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_121_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_121_5.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Make Your Own Yogurt\nB: Save the Bones\nC: Heat 1 Cup of Water on Medium Heat in a Saucepan\nD: Add Brown Sugar", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Add Ingrediences to Crock Pot', 'Separate Liquid From Solids', 'Strain Liquid']", + "context": "Here is the context of these images:\n. Whenever you cook a chicken, whichever way you make it, there will be bones left over. Don't throw them away! Put them in the refrigerator or freeze them. Don't let them spoil. When you are ready to use them,place the bones on a cookie sheet. Heat oven to 400 degrees F. Place the cookie sheet on the top shelf of the oven and bake for 30 minutes.. I use a 4 quart crock pot. Add vegetables such as celery, carrots, onion, garlic and parsley, if you like.. Add water to fill. Turn the heat to low. Cook for 8-10 hours. The temperature on high will cook faster. . When complete, place a colander in a large pan and empty the contents into the colander. The liquid is extremely hot at this point. Be careful not to burn yourself. Discard the depleted bones and vegetables.. Place a strainer in a funnel and pour liquid from the pan into quart jars. 
If planning to freeze, only fill to the shoulder of the jar, otherwise the jar will break when the ice expands. I keep one jar in the refrigerator and freeze the rest. Some sediment may get through the strainer. If sediment is not wanted, cover the strainer with cheese cloth to filter this out.Making your own broth creates a for better tasting soup or gravy than any commercial product I have ever tried. It is worth the effort!. There will be a lot of chicken fat on all the surfaces. Use lots of detergent or run them in the dishwasher.. \nRead the question below and select from the following choices.\nA: Make Your Own Yogurt\nB: Save the Bones\nC: Heat 1 Cup of Water on Medium Heat in a Saucepan\nD: Add Brown Sugar", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_122_19.jpg" + ], + "output": "B" 
+ }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Faux Bordeaux Candy\nB: Ruby Gem Candy\nC: Gather Your Ingredients\nD: Wrap Thinks Up...", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Hulling the Pods', 'Making the Pieces', 'Enjoy']", + "context": "Here is the context of these images:\n. Here is what you will need:\n1 pound Tamarind pods\n1 to 1 1/2 cups of water\n2 1/2 cups sugar (save a 1/2 cup for later)\n1 teaspoon salt\n1 to 2 tablespoons Chile de arbol (optional and to taste) or cayenne. This step probably takes the longest. You have to remove the outer shell and inner strings to get to the sticky fruit. After you remove the shell, break them apart in smaller pieces. You can either leave the seeds or remove them. I normally just leave them as that is how this candy is normally made in Mexico.\nGive them a quick rinse with some water to remove any debris from open pods or stuck shell pieces. Make sure its a quick rinse.\u00a0. Pour half of the water over the fruit and cook it over a low to medium heat while stirring until the fruit breaks down and looks more like a paste. You can help it along by mashing the fruit as it cooks. If needed you can add water little by little to get a thick consistancy.\u00a0. Once you have a nice thick paste, add the salt and start adding the sugar. Add 1 cup of sugar at a time until it is completely incorporated. You can add the Chile de Arbol at this time as well if you choose to do so. Turn up the heat a little, stirring constantly until it boils.. Make sure you use something that will fit in your Fridge.\nUsing a cookie sheet and some wax paper I dropped a little more than a tablespoon for each. I then placed them in the fridge till they cooled and somewhat hardened. About 1 to 2 hours.. After they cooled, I used the rest of the sugar and rolled each piece. 
I find it useful to use one hand to grab the candy and the other to roll it in the sugar.. I made some with the Chile de Arbol and some without. The sweet, sour and spicy together is great.\nRead the question below and select from the following choices.\nA: Faux Bordeaux Candy\nB: Ruby Gem Candy\nC: Gather Your Ingredients\nD: Wrap Thinks Up...", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_123_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_123_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_123_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_123_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_123_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_123_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_123_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_123_7.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Scary (ok, Cute) Spider Pumpkin Cupcakes\nB: Pumpkin Top Cupcakes\nC: OREO PUMPKIN CUPCAKES!\nD: Supplies", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Supplies', 'Dunking Oreos', 'Putting Together Your Pumpkin Patch', '@placeholder']", + "context": "Here is the context of these images:\n. Edible Supplies:\t\tCupcake Mix (any flavor)\t\tFrosting (any flavor, but you'll need it to be green)Mini Oreos (I bought 2 $1 packs from Target and that was plenty)\t\tNormal Sized Oreos Brown Candy Melts (you'll need very few) Green Candy Melts (you'll need very few, way less than pictured) Orange Candy Melts (I used a whole bag to dunk all of the mini oreos shown and three big oreos) Brown SprinklesSupplies You Shouldn't Eat:\t\tCupcake Pan\t\tCupcake Liners\t\tContainer to melt melts\t\tCling Wrap\t\tZip Lock Bag\t\tWilton Piping Tips, I used the #2 round and #5 round\t\tWax Paper. You'll need to make your cupcakes. 
\u00a0There is nothing special there, just make them and give them time to cool.\nTo get ready to dunk your Oreos, get a piece of wax paper out. \u00a0You will need a pretty big piece if you do as many as I did. \u00a0Then, in a container,\u00a0carefully\u00a0melt your candy melts. \u00a0Gather your Oreos and you are ready to go!\nMy initial idea for dunking these Oreos, was to stick toothpicks in the cream and then dunk them like that so I could easily remove the toothpick after they were dunked. \u00a0This does not work. \u00a0Because the candy melts are so dense, they cause the toothpick to act as a lever and instead of dunking it in and taking it out all nicely, it pries your Oreo apart and causes a mess.\nSo, to dunk my oreos, I just threw them in the candy melts and used a fork to get them out. \u00a0Sometimes they had too much coating and when they did I would lightly press them against the side of the bowl to get off some of the excess. \u00a0Once they were dunked, I carefully set them on the wax paper. \u00a0I put them so they were standing on their side if I could.. Use the same method for melting candy melts and getting them ready with the wilton tips as I did with the Skeleton Cupcakes\u00a0(Step 3). \u00a0You will not need many candy melts of green or brown at all. \u00a0You are only doing small details and it goes a long way. \u00a0I had extra after I did everything and so I drew out chocolate bats and did green vines, which I did use later.Stems:\nHeat up your chocolate candy melts first. \u00a0Prepare a ziplock bag and you will be using a #4 round tip. \u00a0I show in the pictures above how I did the stems. \u00a0It's fairly simple. \u00a0All I really tried to make sure I did was got a nice thick stem that sort of stuck up. \u00a0Their stems aren't always that long, so you just need a little stubby one on top.Vines:\nHeat up your green candy melts for your stems. \u00a0I used 12 pieces and it was\u00a0definitely\u00a0enough. 
\u00a0Now just draw some vines on your pumpkins. \u00a0I did a couple leaves using the same method as the stems, except, in stead of pulling up and away from the pumpkin, I kinda of went along the pumpkin. \u00a0You can see a little leaf in Photo 5. \u00a0With your extra green, draw some vines on your wax paper. \u00a0I put these on some of the cupcakes later, just for a little extra something, something.\n*Tip: Since you don't really get the zip lock dirty because the candy melts are wrapped in cling wrap, you can use both corners of the bag. \u00a0Then you only need one bag to do the stems and the vines.\n**Another Tip: Make sure when you put the melts in the cling wrap, that you really twist the ends and get the candy melts all grouped in the middle. \u00a0Otherwise they will spread out in the cling wrap as they melt and as you smush them.. Now all you need to do is frost up your cupcakes. \u00a0Throw on some sprinkles and put on a pumpkin or too. \u00a0Do not press the pumpkin in like you did with the bones in the Skeleton Cupcakes. \u00a0This won't push them in the cupcake because the pumpkins are too fat. \u00a0This will just make a mess of the frosting. \u00a0Just set them on top. \u00a0They should stay fairly well. \u00a0The more frosting you use the better,\u00a0because\u00a0while they won't push into the cupcake, you can bury them in the frosting. \u00a0I put some more sprinkles around the base of the pumpkin once it was on the cupcake.\nFor the Great Pumpkin, you are going to need to cut a slice out of the pumpkin. \u00a0See photos 8 - 10. \u00a0Once you cut out the slice and frost it, make sure you remember where it is because it is hard to tell once the cupcake is frosted :)\nNow you can put your pumpkins on all of your cupcakes and throw some vines in as well. \u00a0I tried to make it look like the vines were coming from under the pumpkins (though, I know the vines would be around the stems).. 
I always take so many pictures of my\u00a0finished\u00a0projects to get just the right one. \u00a0So I am sharing a bunch with you here :)\nRead the question below and select from the following choices.\nA: Scary (ok, Cute) Spider Pumpkin Cupcakes\nB: Pumpkin Top Cupcakes\nC: OREO PUMPKIN CUPCAKES!\nD: Supplies", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_124_21.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: cream\nB: How to Make Papaya Ice Cream\nC: How to Turn Royal Icing Into Homemade Sprinkles\nD: Cooking", + "question": "Choose the best title for the @placeholder 
to correctly complete the recipe.['Ingredients', 'Mixture', 'Packaging', '@placeholder']", + "context": "Here is the context of these images:\n. Milk - 200 mlHalf&Half - 200 mlSugar - 5 tbspZip Lock Bag - 2 unitsIce and Salt. We'll put 200 ml of milk and 200 ml of Half & Half. Then we'll put 5 tablespoon of sugar and mix it. You can mixed it in a bag. . We take zip lock bag and put it mixture in zip lock bag. Mixture you don't have much air in your zip lock bag.. Then we'll put ice in other bag and put a salt. Next we'll close bag and shaking. Then you want a gloves if you don't want freeze your hands. Next we'll put mixture in bag with ice about 5-10 minutes. Then you want mixing and shaking around.In 6 minutes the ice cream froze.. We put it in a bowl. If you want make more ice cream you just double everything. It's very delicious homemade ice cream. Quick and simple recipe.Thank you for watching! =)\nRead the question below and select from the following choices.\nA: cream\nB: How to Make Papaya Ice Cream\nC: How to Turn Royal Icing Into Homemade Sprinkles\nD: Cooking", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_125_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_125_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_125_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_125_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_125_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_125_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_125_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_125_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_125_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_125_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_125_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_125_11.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Chocolate 
Mousse\nB: Chocolate Mehndi Mousse Cakes\nC: Time for the White Chocolate\nD: Add the Whipped Topping", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Prepare the Wantons', '@placeholder', 'Filling the Wantons']", + "context": "Here is the context of these images:\n. You will need:1 Package of Wantons 1 Tablespoon Butter1/4 Cup Sugar 1 Tablespoon Cocoa 2 Tablespoons Powdered Sugar1/2 Cup Heavy Whipping Cream. Start by melting the butter in the microwave. Lay a wanton on a plate or clean counter. Brush on the melted butter and then sprinkle a pinch of sugar evenly over it. Flip it over and do the same on that side. Repeat with 23 more wantons.. Center a wanton over a hole in a mini muffin tin. Gently press the center down to the bottom of the tin. The sides of the wanton should start to fold toward the center. Pinch them lightly and then press them against the sides of the tin. Repeat with the rest. The first one may be a bit tricky, but after you finish one, the others should only take a few seconds each. Bake at 375\u00b0F for 6 minutes, or until the corners are golden brown. Let them sit in the pan for a minute and then place them on a cooling rack.. While the Wantons cool, make the chocolate mousse. Add the heavy cream, cocoa, and powdered sugar to a mixing bowl. Beat with a hand mixer on medium speed until stiff peaks form. . Make sure the wantons are cool before filling. Scoop the mousse into a piping bag and cut the tip off of it, about a 1/4\" up. Pipe the mousse into the wanton cups. 
Eat them immediately or keep refrigerated up to 1 day.The mousse won't store for very long, but you can make the wanton cups and store for several days.\nRead the question below and select from the following choices.\nA: Chocolate Mousse\nB: Chocolate Mehndi Mousse Cakes\nC: Time for the White Chocolate\nD: Add the Whipped Topping", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_126_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_126_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_126_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_126_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_126_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_126_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_126_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_126_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_126_8.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: White Chocolate Chip and Macadamia Nut Cookies !!\nB: Ingredients\nC: Vegan Spelt Chocolate Chip Creamcheese Cookies\nD: Optional (add Chocolate Chips)", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Cream the Bown Sugar With the Egg', '@placeholder', 'Fill Up the Paper Cups', 'Video Tutorial']", + "context": "Here is the context of these images:\n. First we will go ahead and preheat our oven to 350 degrees F. Now let's cream the brown sugar with the egg.. Next let's go ahead and pour the almond milk in, and mix. . Now let's combine our gluten-free flour mix, ground flaxseed, baking powder, cinnamon, and nutmeg. Now stir it all together with a spoon or wooden spoon. . And if you like sweets like the girls, and when I wasn't looking, they added a cup of dark chocolate chips (dairy free).. 
After the chocolate chips are folded in, fill up the paper cups about 2/3 to 3/4 depending on the size you want your muffins to be. Then put them in the oven and bake them for 30 minutes at 350 degrees F. . Leah wanted to have a dance party while we waited for the muffins to be done. Feel free to have one as well. :) or you know, just watch some tv or something. . Now eat them and enjoy!\nRead the question below and select from the following choices.\nA: White Chocolate Chip and Macadamia Nut Cookies !!\nB: Ingredients\nC: Vegan Spelt Chocolate Chip Creamcheese Cookies\nD: Optional (add Chocolate Chips)", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_127_16.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Chocolate Mice\nB: Beautiful*\nC: Spun Sugar\nD: Chocolate Sled", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Melting 
Chocolate', 'Strawberries! Yum', 'Petals', '@placeholder']", + "context": "Here is the context of these images:\n. You will need the following:\n> 8-12oz of chocolate chips(this makes about two roses)> 1/2cup of corn syrup\n> strawberries (as many as the amount of roses you wants to make)> water> plastic bag> plastic container\n> bamboo sticks or any strong wood\u00a0sticks\n> pair of hands:D. there are two ways to melt chocolate in the microwave or on the stove. I chose the stove because it the easiest way not scorch or burn it.\n\u00a0\u00a0\u00a0 to melt on the stove put the fire on low(if you put it on high its most likely gonna burn)\n\u00a0\u00a0\u00a0 then boil the water in a saucepan and put a plate on top (make sure the bottom of the plate doesnt touch the boiling water)\n\u00a0\u00a0\u00a0 put chocolate chips inthe plate little at a time\n\u00a0\u00a0\u00a0 stir continuously\u00a0\u00a0\u00a0\n\u00a0\u00a0\u00a0 when its completely liquid turn off the stove\nTo melt chocolate in the micro wave:\n\u00a0\u00a0\u00a0\u00a0 choose a microwavable container\n\u00a0\u00a0\u00a0\u00a0 put in half of the amount of chocolate and put as much time as it\nneeds to melt(if it burns or scoches add the remaining chocolate and stir.. \n\tAfter melting the chocolate add 1/2 a cup of corn syrup and mix untill the glossy coat is gone.when your done mixing pour into a plastic container and put it into the freezer for 15 min. after the 15 minutes take out of the freezer and put into plastic wrap (i used a plastic bag its the same) and put it in the refrigerator\u00a0for half\u00a0an hour to harden a bit more.(\u00a0the chocolate should come out of the plastic container easily like soft clay). While the chocolate hardens, wash the strawberries and chop off leaves(or if you want leaves for the rose leave them on). 
the strawberries have to be perfect so if they are a bit freeformed shaped then with a knife shape them perfectly.after shaping place them in a cup of ice because we dont want the chocolate to melt.\u00a0since the chocolate will fall off if it is wet it is preferable if you dry them with a dry towel. after cutting,cooling and drying wash you hands with freezing water and dry completely and get ready to work with the chocolate.. For this step you will have to take the chocolate out of the refrigerator and cover the strawberries with it. First, take a chunk of chocolate and play with it alittle untill it is moldable. Then, cover the strawberry completely (make sure the chocolate cover is not to thin or it will tear and also make sure not to work too much w/ the chocolate or it will melt). After that let the chocolate cool before you work on the petals.. for the petals you will have to be super careful(they tear easily). First make a roll of chocolate to make it easier to make petals. with a knife slice a piece of the roll about 2cm wide.repeat that about 15 times\u00a0. when you have finished that pat them down around the edges so the inside is fatter than the outside and begin placing them around the chocolate covered strawberry. place them to were you think looks best.. The only thing left to do is to\u00a0insert a bamboo stick on the bottom of the rose and put it in the freezer to chill and TAHDAH!!!!!you have a beatiful flower!!!you can decorate it with edible paint or glitter. you can also put it on a cake for a great topper!!! 
i\u00a0would really love\u00a0to see your flowers so dont forget to send me pictures of them!!!<3\nRead the question below and select from the following choices.\nA: Chocolate Mice\nB: Beautiful*\nC: Spun Sugar\nD: Chocolate Sled", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_128_26.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + 
"visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: White Chocolate Mousse\nB: Temper Chocolate\nC: Chill\nD: Fill the Molds", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Add the Chocolate', 'Add the Whites to the Chocolate Mixture', 'Add the Whipping Cream to the Chocolate Mixture', '@placeholder']", + "context": "Here is the context of these images:\n. Gather your ingredients. -Approximately 9 ounces of dark or semi-sweet chocolate-Four large eggs-1/2 pint of heavy whipping cream (one cup)-One cup of baker's sugar (regular sugar will work; more or less to taste)You will also need four medium or large bowls and a mixing utensil (preferably flexible in order to scoop up the most chocolaty goodness possible). . Separate the eggs into two bowls. Put the whites in one bowl and set aside for later. Put the yolks in another bowl. We will be using the yolks in the next step.Separate the whites from the yolks by splitting the egg in half and carefully sliding the yolk from one half to the other so that the whites will fall into the bowl but the yolk will stay in the egg. . Just as the title implies, add one cup of sugar to the yolks and beat until the yolks are very light. The new mixture should be an off-white or cream color. While you are beating the yolks, you should put the chocolate into the microwave for two and a half to three minutes at 40% power. Stir the chocolate until it is smooth and allow it to cool while you continue beating the yolks. . Add the chocolate to the egg yolks and fold until it is well mixed. After you have mixed the chocolate and egg yolks, you should start beating the egg whites. . The egg whites should be beaten until very stiff. Once the egg whites are stiff, they should be folded into the chocolate mixture. After you have thoroughly mixed the whites into the chocolate mixture, you can start beating the whipping cream. . 
When you are finished beating the whipping cream, it should be stiff; even more stiff than the egg whites were. Fold the whipped cream into the chocolate mixture. . After you have folded the whipped cream into the chocolate mixture, you should let it cool in the fridge for about two hours. You could eat the mousse now, but if you let it chill first it will stiffen and the bits of chocolate will harden and provide a better texture.You can add flavors, such as a teaspoon of instant coffee or some white chocolate, to add some variety to your dessert. \nRead the question below and select from the following choices.\nA: White Chocolate Mousse\nB: Temper Chocolate\nC: Chill\nD: Fill the Molds", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_129_15.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: How to Bedazzle Your Pimp Cup!\nB: Cook Until Golden Brown\nC: Heat the Oil to Medium Heat\nD: Get Ready to Print", + "question": "Choose the best title 
for the @placeholder to correctly complete the recipe.['Find an Image', 'Edit the Image', '@placeholder', 'Print Already']", + "context": "Here is the context of these images:\n. Find the image you want to etch onto your piece of candy. The ring maker in the aforementioned video chose to replicate The King's face. I decided to replicate Lebron's logo, an L and J with a crown on top. Google Images is the best.. An image by itself is boring, right? Use Adobe Illustrator or CorelDRAW to edit your image so it's only black and white. It's also important to keep it simple. Remember, you're etching onto a small surface so don't overdo it. I added Lebron's last name and number to this jem.. If using a 60 watt Epilog laser such as the one I used at TechShop San Francisco, set your speed to 100% and power to 35%. It worked for me, so obviously it will work wonders for you.. Print your awesome design! This 8th wonder of the world took just 7 seconds to appear. That's more time than Lebron needs to sink a game winning field goal.. Show off your one of a kind piece of candy to kings and friends alike. 
Which ring do you like better?\nRead the question below and select from the following choices.\nA: How to Bedazzle Your Pimp Cup!\nB: Cook Until Golden Brown\nC: Heat the Oil to Medium Heat\nD: Get Ready to Print", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_130_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_130_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_130_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_130_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_130_4.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: You Will Need...\nB: Easy One Egg Omelet\nC: Assemble\nD: \"Coquito\" Puerto Rican Egg Nog.", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Technique', 'Cooking Egg Foo Yung', 'Turning the Egg Foo Yung']", + "context": "Here is the context of these images:\n. For this recipe you will need:1/2 cup Cooked [chopped]meat (i'm using turkey but you can really use anything or no meat at all)1 cup Cooked Vegetables1/4 cup chopped cooked onion3 Eggs3 Tablespoons Soy SauceCooked RiceCooking spray or oil Gravy (optional)cooking mold (optional)Saute the onions (and the veggies if you're using raw vegetables). Combine the cooked onion with chopped meat, veggies, onion, eggs and soy sauce.. This dish till rise or fall on your cooking technique, so pay close attention! Get your skillet REALLY HOT. If this is your first time making egg foo yung consider using a cooking mold. I've been known to use the lid of a mason jar as a mold. If you happen to have cooking mold GREAT! If you don't have anything to use as a mold, don't panic. In lieu of a cooling mold use a 1/3 c measuring cup.. Once your skillet is VERY HOT spray the cooking mold and skillet with cooking spray (or drizzle it with oil). Immediately spoon the egg mixture into the mold. 
If you're using a measuring cup (instead of a mold), pour 1/3 cup of the egg mixture into the hot skillet. Without a mold you'll need to use the edge of your spatula to keep the eggs from running.Allow the mixture to cook for approx 3 minutes or until the bottom of the egg foo yung is golden brown and the egg is not runny.. Once the first side is brown, remove the cooking mold and turn the egg foo yung over. Cook the second side until golden brown. Press each egg foo yung with a spatula to make sure that all of the egg is cooked at the center. . Heat leftover rice and gravy. Serve the egg foo yung over rice.Enjoy!for more recipes check me out at www.OneBrownMom.com\nRead the question below and select from the following choices.\nA: You Will Need...\nB: Easy One Egg Omelet\nC: Assemble\nD: \"Coquito\" Puerto Rican Egg Nog.", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_131_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_131_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_131_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_131_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_131_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_131_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_131_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_131_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_131_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_131_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_131_10.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: the Countryside, Doublewide, BLT Grilled Cheese\nB: Finish Cooking and Cut\nC: Build the Sandwich\nD: Combining the Two Pieces of Bread", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Preheat the Oven', 'Place the Bread in the Frypan', '@placeholder', 'Brown the Bread']", + 
"context": "Here is the context of these images:\n. Ingredients:1 1/2 tbsp. of butter 2 slices of white bread 1 slice of muenster cheese 1 slice of gouda cheese 1 slice of American cheeseSupplies:Stove Frypan Butter Knife Spatula Plate. Preheat a burner on the stove to medium heat. Place the frypan on the burner to be used.WARNING: The burner and frypan will be hot. Avoid touching both of these.. Use the knife to place 1/2 tbsp. of butter in the frypan. Try to melt the butter evenly throughout the frypan. Margarine can be used instead if preferred.. Use a butter knife to apply 1/4 tbsp. of butter to only one side of the white bread. Apply the butter in multiple areas so that it melts evenly. Repeat the previous steps for the other piece of bread. Again, margarine can be used instead if preferred. Also, other varieties of bread can be used instead.. Make sure the butter in the frypan has fully melted. Place the bread in the frypan with the buttered side face down.. Remove all plastic or paper from the 3 cheese slices. Place the gouda cheese slice on a slice of bread. Place the American cheese slice on the other slice of bread. Place the muenster cheese slice on top of the American one.. Wait approximately 2-4 minutes until the cheese has melted. Use the spatula to place the 2 slices of bread together so that the all of the cheeses are now touching.. Use the spatula to flip the sandwich periodically to brown the sides. Perform this step for however long you prefer.The longer you keep the sandwich cooking, the more burnt the bread is.. Transfer the grilled cheese from the frypan to a plate using the spatula. Wait a minute to let the sandwich cool down and then enjoy!. The Grilled Three Cheese is a quick, delicious meal that will have you wanting more. While it already is tasty, feel free to experiment with it by possibly adding ham. It's simple enough to make for an amateur at cooking while also delicious enough for anyone. 
I hope you enjoy your Grilled Three Cheese!\nRead the question below and select from the following choices.\nA: the Countryside, Doublewide, BLT Grilled Cheese\nB: Finish Cooking and Cut\nC: Build the Sandwich\nD: Combining the Two Pieces of Bread", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_132_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_132_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_132_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_132_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_132_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_132_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_132_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_132_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_132_8.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: How to Make Oatmeal Chocolate Chip Cookies\nB: Ingredients\nC: Choosing the Right Drink\nD: Recipe", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Add Chocolate', \"Don't Overcook Your Cookies\", 'Rock Out With Your Cookie']", + "context": "Here is the context of these images:\n. Dry ingredients: - 1 cup flour - 1 tsp baking soda - 1/4 tsp salt Stir in small bowlWet ingredients: - 1 cup light brown sugar, packed* - 1 cup crunchy peanut butter - 1 stick (1/4 pound or 8 tbsp) unsalted butter, room temperature - 1 tsp vanilla extract - 1 large egg - 1/4 cup honey Blend in medium bowl Stir the dry ingredients into the wet ingredients in two additions.*Yes, brown sugar is technically dry, but since it's mixed with all of the rest of the wet ingredients I just put it there. I've used chocolate chunks hacked from a huge chocolate bar, broken up smaller chocolate bars, and semi-sweet chocolate chips. All worked out great.In these photos I'm using 6 oz. 
semi-sweet chocolate chips. Stir them into the mix.Cover the bowl and put in the fridge for 30-40 minutes so it's easier to handle.. Ovens lie. It's a fact of life that some ovens will tell you that they're at the temperature you want and be off by a full 80 degrees. In my previous apartment, the oven was nice and new and had lovely digital controls. It was also a horrible liar hell-bent on ruining my baking attempts. If I wanted the oven to be at 450, it would let me know it was ready when it was only 370. Five minutes later it would stabilize at 430. That's a full five minutes of your dough being cooked at the wrong temperature, completely throwing you off. So long story short, buy a thermometer and stick it in the oven. It doesn't have to be pretty, it just has to work so you'll know what the true temperature is. For this recipe, you'll want it at 350 Fahrenheit or 175 Celsius.. I roll my dough into balls about 1.25\" (3cm) wide. You can go a little bigger if you want. I've found that a little bit of variation does not have a noticeable effect on the cooking time. Put them onto a buttered cookie tray.The trays I use are insulated ones. Getting fancy cookie trays may seem a bit extreme, but there was a period last year where I got a bit obsessed and these have been totally worth it.So now that you're sure your oven is at 350F/175C and it's stable at that temperature, put the dough in for 12 minutes. . After 12 minutes the cookies will not look ready, but take them out anyway and put the whole pan out on the counter to cool. Do not touch them!Oh, you'll want to touch them. You'll want to put one in your mouth right away, your tender tongue be damned, but don't do it. Really.OK, so you did touch one and it seems fragile and undercooked. That's why you need to let them cool for 5 minutes before moving them to a rack or a plate. . Yay! You waited for the cookies to cool and now they're good to go! You can eat them now or save some for later. 
These are good on their own or combined with other sugary treats. Like ice cream. Ice cream loves these cookies and vice versa. Let the love flow. To get the fresh baked warmth and goodness feeling back, quickly zap in a microwave or briefly toss into a toaster. Eat.\nRead the question below and select from the following choices.\nA: How to Make Oatmeal Chocolate Chip Cookies\nB: Ingredients\nC: Choosing the Right Drink\nD: Recipe", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_133_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_133_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_133_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_133_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_133_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_133_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_133_6.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: How to Cook Papadums\nB: Wait...\nC: Peel the Peppers\nD: Mix", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Prepare the Cooksite', 'Put the Turkey Under the Can', '@placeholder', 'The Moment of Truth...']", + "context": "Here is the context of these images:\n. The turkey needs a small amount of preparation. The giblets and neck should be removed. Bend the turkey wings behind the bird (i.e. put the bird in a full-nelson). If you are injecting with marinade, inject the night before up to an hour or so before cooking. Salt and pepper the outside and inside of bird. Just before cooking, divide the stick of butter in half. Shove a half between the skin and meat over each breast.. Prepare the cooksite about 30 minutes to one hour before starting to cook.Crumple some news paper and stuff into the bottom of the charcoal chimneys. Set down upright on a non-flammable, out of the way surface. 
Fill each chimney with charcoal. Douse with lighter fluid to your level of risk tolerance. Light the paper. In my experience, if I can get the kindling to light well, the charcoal takes about 30 minutes to get going.While the charcoal gets going, lay out about a 2 ft x 2 ft square of heavy duty aluminum foil on the ground. I usually have to fold together two pieces of foil. Weight the corners with rocks or bricks. Drive the stake into the center, so that 12 to 18 inches is above the foil. There should be just enough above the ground so that the turkey's legs just touch the foil when hanged on, and the paint can will invert over the bird and sit firmly on the ground.. Hang the turkey on the stake. The legs of the turkey should just touch the aluminum foil, but the turkey must be low enough that, when the can is inverted over the bird, it sits firmly on the ground. Adjust the stake for optimal bird placement.Invert the paint can over the bird. Be certain the opening of the can sits flat on the ground. Dump the now whitish charcoals in the chimneys around the can. Be sure to wear the welding gloves when you do this! Use the charcoal tongs to evenly spread the coals around the can.When you are initially sighting the turkey cooking spot, chose a place away from any structures, and in an inconspicuous place. The heat of the charcoal will scorch any grass under the foil, so chose an out of the way place (in front of the front door is probably not a great idea).. Now we wait...1 hour and 50 minutes, to be exact. Do not peek. Do not raise the can. If it is cool or windy (less than 50 F, steady wind), you might want to add another chimney of charcoal after 1 hour. This is a judgment call.So, just relax. My uncle says the bird takes a 6 pack of beers to cook. If my aunt is within earshot, he says it takes three beers.. After 1 hour and 50 minutes have passed, you can remove the can. You should hear the bird sizzling. Put on the welders gloves. 
Use the charcoal tongs to pull back the charcoal from the can.Fetch all of your skeptical guests...Remove the can and bask in the oohs and ahhs.. Cover the coals with another piece of foil, or fold over the foil on the ground to cover the coals. Place a large pan near the bird hanging on the stake.Wearing the welders gloves, carefully remove the bird from the stake. At this point, if this is your first time, you will notice that the bird is very VERY tender. It will have a tendency to fall apart, and into the coals if you let it. Put the pan close, cover the coals as best as you can, remove the bird as swiftly and cleanly as possible, and pray.Nothing stifles those oohs and ahhs more quickly than a bird dropped in the charcoal. If you do drop the bird, raise your arms high and shout, \"Fear not, I am uninjured.\" This may distract your guests just long enough to brush the ash off the turkey...Take the bird to the kitchen or picnic table, carve, and serve. This is the most tender, juicy turkey. You will rule the day. Everyone will want to be your friend. Enjoy your moment in the sun!Clean up is pretty easy. I usually wash the stake in the dishwasher. I scrub some of the cooked on ash and fat off of the paint can with a steel wool pad. 
When the can is fairly clean, I rub the can inside with some vegetable oil, to keep the rust down.\nRead the question below and select from the following choices.\nA: How to Cook Papadums\nB: Wait...\nC: Peel the Peppers\nD: Mix", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_134_16.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Chewy Chocolate Chip Cookies\nB: Make Those Cookies!!\nC: Ingredients\nD: Dough", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Mix Dry Ingredients', 'Bake It Up!!', 'Eat Those Cookies!!']", + "context": "Here is the context of these images:\n. 
Yield: 4 3/4 dozen cookies \u00a0 \u00a0 Prep Time: 15 minutes \u00a0 \u00a0Cook Time: 10-13 minutes\nIngredients:\n2 1/2 cups all-purpose flour\n1 teaspoon baking soda\n1/4 teaspoon salt\n3/4 cup cocoa\n1 cup butter, at room temperature\n1 cup granulated sugar\n1 cup brown sugar\n2 large eggs\n2 teaspoons vanilla extract\n1 cup semi-sweet chocolate chips\n1 cup nuts, I used a mix of walnuts and pecans\n1 cup Jet-Puffed Mallow Bits\nTools:\n1 large bowl\n1 medium bowl\nwhisk or sifter\nmeasuring cups and spoons\nbaking sheet\nparchment paper or silicone baking mat\nmixer or wooden spoon\nmini ice cream scoop\nI used a stand mixer for this recipe, but you can mix it up with a hand mixer or a wooden spoon as well. For the best results use parchment paper or a silicone baking mat. You don't need to replace the parchment paper for each batch, just continue to use parchment until all the dough has been baked. \u00a0I also use a mini ice cream scooper to make perfectly portioned cookies.. Preheat oven to 350 degrees F.\u00a0\nSpread nuts evenly on a baking sheet. Bake 10 to 15 minutes until nuts are lightly toasted.\nWhen nuts are cool, chop roughly.\nLine a baking sheet with parchment or silicone baking mat and set aside.. In a medium bowl, combine flour, baking soda, salt and cocoa.\nWhisk well or sift together with a sifter. Set aside.. With a mixer, cream butter. \u00a0\nAdd sugars and mix until smooth.\nAdd in eggs, one at a time.\nNext, add in vanilla extract and mix until blended.. Slowly add flour mixture to sugar mixture a little at a time until flour disappears. Scraping the sides of the bowl\u00a0occasionally.\nStir in chocolate chips, nuts, and mallow bits.. Drop cookie dough by rounded tablespoons or using a mini ice cream scooper onto prepared baking sheet, about 2 inches apart.\nBake cookies for 10 to 13\u00a0minutes, or until the cookies are set around the edges, but still soft in the center. 
When I bake 2 cookie sheets at a time I rotate the sheets and switch shelves halfway through the baking time.\nRemove from oven and let sit on baking sheet for 3 to 5 minutes. Move to a cooling rack and cool completely.. Mmmmm chocolaty gooey goodness!! Pour yourself a glass of milk and munch away!\nThanks for checking out my instructable! I hope you enjoy this recipe!\nRead the question below and select from the following choices.\nA: Chewy Chocolate Chip Cookies\nB: Make Those Cookies!!\nC: Ingredients\nD: Dough", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_22.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_135_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_135_24.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Slow Cooked Italian Beef!\nB: Finish\nC: Add Chicken and Spices to Slow Cooker\nD: Shred the Chicken", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['What You Will Need', 'Cut the Tofu in Half', 'Cut the Mushrooms', '@placeholder']", + "context": "Here is the context of these images:\n. 1 block of extra firm tofu1.5 cups of mushrooms (baby Bella mushrooms pictured)2 cups of your favorite sauce (teriyaki pictured)1 can of nonstick cooking spray1 cutting knife1 cutting board1 waffle iron1 slow cookerPrep time: 30 minutesCook time: Flexible. Carefully, use your cutting knife to half the tofu block so you are left with two ~3/4 inch thick slabs. Note: This thickness was chosen to optimize the surface area to internal-tofu ratio on my waffle iron. Your personal taste or waffle iron may call for different thickness tofu slabs.. The tofu is cooked on the waffle iron to increase the surface area and remove some of the water in the tofu to make room for the flavorful fluids to come in and later explode with deliciousness.Preheat the waffle iron completely. Spray the waffle iron with nonstick spray and quickly add the tofu then carefully close the waffle iron. After approximately 5 minutes when the tofu is lightly browned and crispy to the touch on both sides, remove it from the waffle iron and set to the side to rest. Repeat with all remaining slabs of tofu.Note: You may find that slight pressure is required to close the waffle iron completely.. Cut the mushrooms as desired. I prefer ~1/4 inch slices.Note: These instructions use mushrooms; however, you can use whatever you want: broccoli, bok choy, carrots, meat, nothing, etc.. 
Cut the tofu into bit size (or larger) morsels.If the waffle pattern is similar to that shown, I suggest cutting so each waffle well is quartered. Do so by repeating the cut seen on the left twice to receive tofu blocks similiar to those on the right. Consider trying a block of the tofu now to compare the before and after of the slow cooking.. Combine ingredients in slow cooker and cover with desired sauce. These instructions used a generic store bought teriyaki sauce from Trader Joes; however, any sauce will do. If necessary use broth to increase the fluid to solid ratio, but keep in mind that mushrooms and tofu both tend to release some fluid and cook down.Turn on high heat for and cook for a minimum of 1-3 hours depending on your preferred texture.. Cook a carbohydrate to go with your delicious tofu dish and absorb extra sauce. I recommend short grained rice or soba noodles. Seen above is short grain brown rice cooked in a rice cooker for ~90 minutes. Cooking times will vary so plan accordingly.. 
Plate your food in your favorite bowl or plate and enjoy the delicious meal in front of you.\nRead the question below and select from the following choices.\nA: Slow Cooked Italian Beef!\nB: Finish\nC: Add Chicken and Spices to Slow Cooker\nD: Shred the Chicken", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_136_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_136_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_136_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_136_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_136_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_136_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_136_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_136_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_136_8.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Minute Boiled Dumplings\nB: Bring Water to a Boil.\nC: Boiled Beef Tongue\nD: Acquire Tongue", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Salt to Water Ratio', 'Soak!', '@placeholder', 'Are They Done Yet?']", + "context": "Here is the context of these images:\n. For this recipe, you only need the following:PeanutsSaltWaterBig ol' stock pot.Yup. That's it. Leave it to southerners to figure out this amazing snack, with such simple stuff. They know.. From what I had experienced, I was certain that the magic was in how salty the water is. I did a little digging around with otherfoodexperts, to figure out how much salt and water they were using.What the experts said varied, but I ended up going with a ratio of 3 Tbs of salt for every 5 quarts of water - and maybe adding a little bit more as it cooks.This snack is designed to sit in this water after cooking is complete, so it will get some of it's salt from remaining in the brine.. 
The raw peanuts are really dirty. After rinsing them a few times, it's a good idea to let them soak in a warm bath for about 45 minutes, stirring occasionally.I was impressed how much silt came off of them. (Warning: When I made these for a second time, I was a little lax on soak time - and they came out kind of gritty. YUCK.). In a big 15 quart stock pot, I brought about 10 quarts of water and the salt to a boil - it took a while to really get ripping, so maybe boil the water as the peanuts soak.. Add the peanuts to the salt water, and try and maintain a rolling boil. Keep a wooden spoon near by, and stir every 20 minutes. Keep covered when not stirring.Do NOT Simmer - you want this guy to be bubbling throughout the entire cook time. Depending on your range, you'll have to figure out what setting to cook these at, but the flames were about medium-high here in our test kitchen.. When they are done, the husks are soft and the peanuts inside are not crunchy at all. It should have the texture similar to a refried bean :)It should take anywhere between 3 and 4 hours to get them this mushy and perfectly brined.. When they are cool enough to scoop with a slatted spoon, serve in small bowls, and maybe offer a second bowl for shells.I knew these would go fast, but I wasn't expecting people around the office to be as voracious as they were - mikeasaurus proclaimed \"this is my new favorite thing\", JON-A-TRON and jessyratfink - the actual southerners I work with were stoked, and others never knew that a peanut could be cooked and served in this fashion. 
My roommate asked \"are you sure this isn't crack?\"Success!\nRead the question below and select from the following choices.\nA: Minute Boiled Dumplings\nB: Bring Water to a Boil.\nC: Boiled Beef Tongue\nD: Acquire Tongue", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_137_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_137_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_137_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_137_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_137_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_137_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_137_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_137_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_137_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_137_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_137_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_137_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_137_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_137_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_137_14.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Cheesecake Pops\nB: Supplies\nC: Make Fingers\nD: Haggis Pops", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Make', 'Cover and Stick', 'Decorate and Enjoy']", + "context": "Here is the context of these images:\n. 
\n So, what are you going to need?Supplies:\n\n\t\tCake Mix of ChoiceMarshmallow Fondant\u00a0- I made a double batch for the Valentine Rice Krispie Treats and had about half left (which is the 8oz recipe), this seemed good for coving the cake pops, but if you want to decorate them with more fondant, you are going to need to make a bit more.\n\t\tBag of Marshmallows (I used 8 oz)\n\t\tSticks\n\t\tSprinkles if you would likeHeart Cookie Cutters (or you can freehand it) \u00a0I like to get a lot of use out of things I buy so it's nice to invest if you can be inventive :). Most know the procedure, but if you don't, here it is :)\nBake a cake.\nLet it cool. \u00a0I broke it into 6 pieces to try to speed the cooling.\nCrumble the cake.\nHeat up marshmallows in microwave. \u00a0I went about a minute at a time at half power and then stirred and heated again till melted.\nMix melted marshmallow and cake. \u00a0You can put this in the fridge for a bit, but I found it wasn't necessarily helpful.\nTime to form them!\u00a0\nI went with a similar method as the Rice Krispie Treats. \u00a0I shoved the mixture into the cookie cutter, but then I heaped it a bit to make it a bit more round. \u00a0 Photo 7 shows this.\nDon't put these in the fridge or freezer if you can help it. \u00a0If the cake mixture is cold, when you put the marshmallow fondant on them it gets all wet and sticky.. Time to cover them in fondant. \u00a0I didn't really find a best way to do this, but I'll talk about some ways I tried.\nUsing the next size up cookie cutter, I cut out two hearts in fondant. \u00a0I then wrapped this around the heart. 
\u00a0I think this would have worked nicely except the dip in the top of the hearts didn't meet up very well at the top of the heart.\nNext I treated it like the rice krispie treats and just wrapped it around starting from the front and closed it up in the back.\nAnother way I tried that I didn't photograph is I wrapped it around the heart (like ^) but wrapped it around the top and then cut the excess of the sides of the heart.\nI'd love to hear any tips on covering cake pops in fondant.\nOnce it is covered, stick a stick in the bottom.. You can decorate them if you would like. \u00a0I tried out a few things. \u00a0\nCut out smaller hearts to decorate them. \u00a0Use corn syrup to stick them to each other.\nUs corn syrup to stick sprinkles to the pop. \u00a0You can do a specific design like I did with the pearl sprinkles on the pink one in the back right or just completely cover part of the pop with corn syrup and drench it in sprinkles like with the white one in the middle back.\nNow eat them!\nRead the question below and select from the following choices.\nA: Cheesecake Pops\nB: Supplies\nC: Make Fingers\nD: Haggis Pops", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_138_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_138_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_138_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_138_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_138_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_138_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_138_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_138_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_138_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_138_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_138_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_138_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_138_12.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_138_13.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Wiring Your Home Brewery\nB: Attempt to Eat\nC: Play With Your Food\nD: Harvest Those Hops!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['How to Know When Your Hops Are Ready to Harvest', '@placeholder', 'Dry Hopping With Wet Hops', 'Dry Hopping or Adding to Boil']", + "context": "Here is the context of these images:\n. Native to Europe, Asia and North America- hops now grow on many continents and in many countries. In the Pacific NW many feral Cascade hop plants can be found growing on telephone poles, shared fences etc. Please always be courteous to the people that grow or cultivate hops and ask them before you harvest hops about town unless it is clear that they are truly \"wild\" plants.I grow many different varietals of hops (they are all the same species, just different cultivated varietals, just like wine grapes) in my own yard and have many friends that grow them for the pleasant shade they provide when allowed to grow on pergolas and fences. It takes about two years to have a crop of hop flowers large enough to make a batch or two of homebrewed beer from, but after that the plants can be very prolific growers and producers. About this time of year I end up with a large hop harvest from my own hops (Cascade, Galena, Golding, Sterling, Willamette and Zeus) and extra Cascade from my neighbors. To know that the hops are ready to be harvested, they should be fully grown (more than an inch long for most varietals) and the blades or petals of the cone should be a little bit papery. Some of the hops may already have some browning on the tips. They are not ready if they are very springy and wet feeling, they are over-ready if they have opened up into full bloom and have turned yellow/brown. 
Please wear gloves that go as far up your wrists as possible when picking hops or pulling down bines because they cause \"hop-rash.\" They seriously do, and it's no fun.. Get a good helper like I did and fill some paper bags with fresh hops-Then get them in the fridge or in your brewing beer asap!. Well... this certainly sounds like an oxymoron.Fresh hops right off of the bine (yes, bine- not vine) must be used quickly or they will start to mold or go all cheesy and gross-tasting due to the oxidation of the oils in the hops. They are called both \"Fresh Hops\" and \"Wet Hops\" interchangeably. Using wet hops or fresh hops is super fun and makes for some amazingly floral beers, but if you've got more than you can use in a harvest, don't throw them away, see my other instructable for building a Hop Drying Oast. . Let's talk about avoiding bacteria or wild yeast infection a bit-Depending on where you are with your brewing process you can either:1. Add the fresh hops into the boil of a new batch of beer to make it a fresh hop beer. In this method the boil kills the bad bugs you don't want infecting your beer.2. Add the fresh hops to the secondary fermentation vessel (in this case a glass carboy) after the vigorous primary fermentation stage and enough alcohol and CO2 has been created to kill the nasty bugs you don't want in your beer. 3. 
Add them into a filter cartridge or what is called a Randall between your keg's output line and the tapMany people add fresh or wet hops to other stages of beer making like after flame-out as a steeped hop addition for aromatics, but try at your own risk of spoilage :)\nRead the question below and select from the following choices.\nA: Wiring Your Home Brewery\nB: Attempt to Eat\nC: Play With Your Food\nD: Harvest Those Hops!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_139_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_139_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_139_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_139_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_139_4.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Chicken Yakisoba\nB: Sterilize Jars\nC: Serve and Enjoy\nD: white Chicken", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Finish Jar Prep', 'Wrapping Up', 'Conclusions']", + "context": "Here is the context of these images:\n. If you are using old jars, it's a good idea to sterilize them. If they are new jars, it still wouldn't hurt, but you are probably fine to skip this step.Fill a pot with about 1/2 to 1 inch of water and let it boil. Then place the jars upside-down in the pot for about 5 minutes. Then remove them from the pot to cool down.. Cut the chicken into medium-largeish sized pieces and stuff them into the jars leaving about 1/4 inch of room at the top.. Clean around the opening of the jars with a clean, wet wash towel. The main reason for this is to prevent the jars from sticking when you finally want to eat the chicken.Put new lids on the jars with fitted rings.(IMPORTANT!!! The lids need to be new or the seal won't work! The fitted rings can be as old as you want). 
Fill the pressure cooker with:5 quarts of water1 Tablespoon Vinegar (this prevents your pressure cooker from turning black)Then place your jars in face up (jars should not be completely submerged, the water should be somewhere below the fitted rings).. Seal off the pressure cooker and place over high heat. Let the water boil, which will cause steam to escape from the top.10 minutes after it began to steam, skillfully place the 5 pound weight on top.Now set your timer for 1 hour and 30 minutes. Once the weight starts spinning \"quickly,\" turn down the heat a little bit.. After the timer goes off, turn off the heat.Let the pressure cooker cool before removing the lid (there should be a pressure indicator on your cooker, once that goes down it means the pressure inside the cooker is the same as the pressure outside the cooker). This is important for two reasons, one is your safety (lots of really hot steam will shoot at you) and the quick change in pressure can break the jars. So just be patient.Remove the jars and let them cool.Store the chicken until hungry.. Chicken does not need to be refrigerated. This stuff will last for a long time (at least a year, maybe more).The main reason we do this is for quick and easy meals. The chicken is already cooked so we just throw it in our favorite recipe and heat it up. 
The chicken is very tender and easy to shred after this process.What can you make with Canned Chicken:Any meal that requires shredded chicken Chicken Salad Chicken Taco Soup Chicken Enchiladas Hawaiian Haystacks Chicken Sandwiches Chicken and Ritz Casserole (probably our absolute favorite, so we made an instructable for this meal!\nRead the question below and select from the following choices.\nA: Chicken Yakisoba\nB: Sterilize Jars\nC: Serve and Enjoy\nD: white Chicken", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_140_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_140_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_140_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_140_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_140_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_140_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_140_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_140_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_140_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_140_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_140_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_140_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_140_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_140_13.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Bruin Beskuit (Multigrain Rusks)\nB: Items\nC: Bubur Lambuk\nD: Cooking", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients (all 4 of Them!)', '@placeholder', 'A Bit of History', 'Bonus Round!']", + "context": "Here is the context of these images:\n. Ingredient 1: Potatoes - How many depends on both the size of the taters and the number of people you want to feed. One tater per person is a good rule of thumb. 
In my case I had some big ol' russets and just 4 taters was enough for 6 people.\u00a0Ingredient 2: Barley Flour - Don't sweat it, you can find this at most grocery stores. Regular all-purpose is fine too, if you have allergies to look out for then by all means. Barley flour is just what we've always used.Ingredient 3: Salt Pork - Call it sailors' meat, call it white bacon, call it whatever you like. This sodium rich swine is just what's needed to give these taters some real flavour. A small package goes a long way.Ingredient 4:\u00a0It's a secret! - but don't worry, I'll spill the beans in due course. Bear with me.. Step 1:Peel your tatersStep 2: Shred your taters - I've always used a cheese grater for this but presumably there are other tools that would do the job just as well.Step 3:Add the secret ingredient\u00a0- Caraway seeds!\u00a0What a revelation! I sprinkled in about 1 tablespoon worth to achieve a present but not overwhelming flavour.\u00a0Step 4: Add flour and mix about\u00a0- Stir in a 1/4 cup at a time until you reach a wet dough consistency (or until your arm falls off, whichever comes first). The idea is to soak up some of the water from the potatoes and thicken the mixture, but not so much that it becomes dry or crumbly. Some potatoes are more watery than others so the amount of flour needed will vary. I used just shy of two cups for this batch. After you've achieved the desired consistency, or something close too it (believe me it's not a precision process), you can set it aside.\n*note*\u00a0If the mixture looks thoroughly unappetizing at this point you're doing it right.Step 5:Cut salt pork - I made 1inch cubes, but a little bigger or a little smaller is fine. Set them aside when your done.Step 6: Form your raspekaker\u00a0-First wet your hands, then grab a tennis ball-sized amount of the potato mixture and form it into a ball. Next grab one of your salt pork cubes and push it into the middle of the ball. 
Reform the surface of the ball\u00a0concealing the meat inside. Set the ball aside on a plate and continue to form raspekaker until you've used the entire mixture. It's advisable to clean off your hands after forming each ball as the sticky mixture can accumulate making the process more difficult than it has to be.. First get out your biggest pot, fill it with water and bring to a boil.\u00a0Next begin dropping your raspekaker into the pot (not on top of each other but beside). Use a spoon or laddle for this operation if you want to avoid spashling hot water around when you plop them in the pot.\nSimmer for 1 hour. It's a good idea to use a spoon push them around the pot once in a while, just to make sure they're not stuck to the bottom.. While it's cooking I'll take a moment here to recount the history of the humble raspekaker within my family. It entered our diet by way of my father, who picked it up from his mother who learned it up from her husband who was himself Norwegian. As the recipe traveled from one kitchen to the next so to did it evolve. My father recalls eating it with cooked bacon in the middle instead of salt pork, and the drippings from the bacon was saved and drizzled over the raspekaker after they'd finished cooking. On special occasions, namely Christmas morning, they would drape slices of\u00a0gjetost over the steaming raspekaker, which would melt and create a delicious cheesy coating. Gjetost, another little piece of Norwegian influence that my grandfather brought with him, is a brown goat cheese... the flavour is very unique.\u00a0\n*note* When I say brown goat cheese I don't mean cheese from a brown goat, but brown cheese from a ordinary coloured goat (brown being but one possibility in the spectrum of goat colours).\nFor me raspekaker is something of a comfort food but it can also be a very practical part of your diet. I find 1 ball makes for an adequate breakfast. They freeze well and for their size pack a lot of calories. 
It's a nice change up to oatmeal or breakfast cereal in the morning. They also lend themselves well to experimentation. At its most basic it's just potatoes and flour; a blank slate. This is probably why if you look around everybody is making raspekaker in slightly different ways. You can pick what meat and seasoning you like best, or forgo such complications and just dress it up after cooking with a sauce or spead of your choice.\nAnyways, that's enough of me expounding on the virtues of raspekaker, lets get back to the kitchen and see what we've got.. Use a ladle to retrieve the raspekaker and let the water drip off them as you do so. Now they're ready to serve.\nI like to cut them in half exposing the\u00a0succulent\u00a0porcine core. Then I drizzle some melted butter on top for added flavour and sprinkle it with a bit of parsley for looks. That's that! Eat up.. It's almost always the case that you'll have leftover raspekaker, but if you think that means you made too much think again. Arguably the leftovers are the best part. Just pop 'em in the fridge or freezer until you're ready for them. When it's time for more raspekaker just bust out the frying pan and brown 'em up in some butter. Oh boy do they taste great like this!\nWell folks, that's all for this instructable. 
From my family to yours, happy cooking.\nRead the question below and select from the following choices.\nA: Bruin Beskuit (Multigrain Rusks)\nB: Items\nC: Bubur Lambuk\nD: Cooking", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_141_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_141_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_141_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_141_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_141_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_141_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_141_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_141_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_141_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_141_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_141_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_141_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_141_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_141_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_141_14.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Watermelon Limeade With Chervil Infusion\nB: Make Watermelon and Chervil PopCorn\nC: Milk\nD: Watermelon Cucumber Smoothie", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Sugar', 'Chervil', 'Drink and Enjoy!']", + "context": "Here is the context of these images:\n. Serves 2.- 1 and a half cups of roughly chopped watermelon flesh- 2 and a quarter cups of milk- 2 tablespoons of cream- 2 teaspoons of white sugar- 1 teaspoon of finely chopped chervil. Roughly chop enough watermelon to equal 2 cups. Remove seeds and place in blender.. Measure out 2 and a quarter cups of icy cold milk and add to blender along with 2 tablespoons of cream.. 
To this add 2 teaspoons of white sugar.. Finely chop enough chervil to fill a teaspoon and add to blender mix.. Hold down lid of blender and hit the start button. Blend until all ingredients are combined and mixture becomes light and frothy.. Warning: this milkshake is sooo good you mightn't want to share!Pour blended mixture into glasses and enjoy!\nRead the question below and select from the following choices.\nA: Watermelon Limeade With Chervil Infusion\nB: Make Watermelon and Chervil PopCorn\nC: Milk\nD: Watermelon Cucumber Smoothie", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_142_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_142_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_142_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_142_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_142_4.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Bake!\nB: Simple Bread Dough\nC: \nD: ", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Mix the Dries, Add the Wets.', 'You Need to Knead.', 'Cut and Roll', '@placeholder']", + "context": "Here is the context of these images:\n. Take your 2 1/4 Cups warm water and add your sugar to it. I used brown sugar, which is why it looks dark.Add your yeast, stir it in, and set it aside while you work on the rest of it. You can do this before setting up for baking, because it takes a little time. During this time, set your oven on low if you are using it for rising. Proving the yeast allows the little yeastie beasties to wake up and get busy reproducing and converting the sugars. After a couple minutes, you should have some foam on top. Sometimes I get real good foam, sometimes weak foam, but I think it's usually temperature, and the bread comes out fine unless the yeastie beasties are deceasedie. . 
Mix most of the flour (5 cups or a little more) and the 1 T salt. A mixer is not necessary, but I've got one, you know?Add the proved yeast, add the 1/4C vegetable or olive oil, and mix it up! Saving some of the flour is so that it doesn't get too dry, because measurements sometimes change and it's better to have wet dough for rising. If it's really wet and sticky, add more of the flour until you reach the full 6 cups. . Once it's well mixed and still kinda sticky, turn it onto a floured counter and knead it until nice and smooth. Add flour so it stops sticking to you, but err on the wet side. The picture is pre-kneading. It will turn into a beautiful ball afterwards. . Get a big bowl with a little oil, roll your dough in the oil, and set it in a warm spot. If using the oven, you will want to preheat it on its lowest setting and turn it off before putting the dough in. I forget to turn it off sometimes, so I just leave the door open for a couple minutes to let it cool to a nice warm cozy temperature for the yeasties. Leave it for 60-90 minutes, until about doubled in size. . After the dough has risen take it out of the oven, and turn the oven to 475 F (245 C). Put your baking pan or cookie sheet in the oven as it heats, you want it to be hot!For this size recipe, you can cut the dough into 16 equal pieces, make them into little balls, and roll them flat. Give them a good coat of flour so they don't stick to the pan. Roll them about 6 inches in diameter, so that you can hold one in a hand fairly flat. I only rolled them two at a time to save counter space. . Hold a flat piece in one hand. With an oven mitt on the other hand, open the oven, and pull out the hot pan just enough to flop your bread onto it, nice and flat. You can grab and pull it if it folds, but don't burn yourself!It'll take about 5 minutes for each one, and you'll see them puff up a bit. If you're lucky, you'll get full balloon-type pitas, but the others are good too! 
Bake a little longer if you want some darker crust. That's it!These go well with just about anything. Bake/microwave with some cheese, eat it with chili, dip it in your borscht, or just heat it up and give it some butter. Enjoy!\nRead the question below and select from the following choices.\nA: Bake!\nB: Simple Bread Dough\nC: \nD: ", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_143_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_143_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_143_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_143_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_143_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_143_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_143_6.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Saos and Cream Cheese {yum}\nB: Fry the Bread\nC: The Mix\nD: Cous Cous and Halloumi Cheese", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Lets Go Shopping', 'Pre Step.. Cuttiing the Cheese.......', '@placeholder', 'Serve It Up']", + "context": "Here is the context of these images:\n. A simple line up of ingredients done to my taste. I like velvetta, a processed cheese, as well as chedder. Miricle Whip is a bit more heart friendly than mayonaise but its kind of sacrilidges in this dish. For the kicked up spice I'm using jalapeno's in adobo sauce.\n\u00a0Velvetta cheese\n\u00a0 Chedder cheese (Im using white extra sharp)\n\u00a0 Jalapenos in adobo sauce\n\u00a0 Grater\n\u00a0 Container with lid\n\u00a0 Knife. Velvetta cheese is very soft and difficult to grate, I cut off a chunk, stick it in a plastic bag and into the freezer for about a half hour. 
For a very simple tool a grater can be\u00a0 very dangerous to your fingertips and knuckles so be careful here unless you like blood in your food and bandages on your fingers.\n\u00a0Grate your cheese or cheeses into the container your going to mix and store the cheese in, this will save on washing up. take a couple of your jalapenos and some adobo sauce, put it all on a plate or other cutting surface. Chop, slice, dice, mash or smash the peppers. scrape, spoon or otherwise get the peppers and sauce from the previous step into the bowl of grated cheese. Add your mayonaise or miricle whip and\u00a0stir. Does it look to dry? (pic2) add more miricle whip and stir some more until it looks like pic 4. There's a lot of ways to enjoy pimento cheese, the basic but very popular spread on white bread, as a dip with dorito's, corn chips or pita crisps, on crackers or as Im doing here on lightly toasted whole grain. It also makes an awesome grilled cheese sandwich\nRead the question below and select from the following choices.\nA: Saos and Cream Cheese {yum}\nB: Fry the Bread\nC: The Mix\nD: Cous Cous and Halloumi Cheese", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_144_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_144_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_144_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_144_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_144_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_144_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_144_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_144_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_144_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_144_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_144_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_144_11.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": 
"RecipeQA", + "options": "A: Professor Phineas' Infamous Ginger Beer\nB: Chip Cookies\nC: Measure Butter\nD: Ingredients and Equipment", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Flour and Spices', 'Putting It All Together', 'Bake']", + "context": "Here is the context of these images:\n. \u00bd cup butter\n1 cup packed golden or dark brown sugar\n1 large egg\n\u00bc cup molasses\n1 tsp vanilla\n3-6\u00a0 pieces candied ginger, chopped\n2 cups all-purpose flour\n1 tbsp fresh grated ginger\n1 tbsp ground ginger\n1 tsp ground cinnamon\n\u00bd tsp ground cloves\npinch ground cardmom\n2 tsp baking soda\n\u00bc tsp salt\ngranulated sugar for rolling. I don't like measuring butter after it's softened like you're supposed to do for cookie recipes, when it's already marked out in handy increments on the wrapper. Take the butter out of the fridge and cut off \u00bd cup, which is a quarter of a brick, one stick, 113 grams or 4 oz, however you want to quantify it.. Measuring brown sugar is my favourite part of cookie baking, next to sampling the result! Spoon sugar into your measuring cup, pressing down with the spoon as you go. When the cup is full, unmold the cute little cake of sugar into the bowl with the butter. If you packed it enough, the sugar won't fall apart when you do this.. Now that the butter and sugar are together, you'll have to wait for the butter to soften enough to be able to cream them. Cutting the block of butter into small pieces will help it warm up faster while you get the rest of the ingredients ready.. Put the two cups of flour into another bowl, and stir in all the ground spices, baking soda, and salt. I like to save the cardamom seeds from the cardamom pods I get in expensive tea blends after I've drunk the tea, and grind them into whatever I'm baking.. Chop the candied ginger and peel the bottom inch or inch and a half of the fresh ginger. 
Don't cut the peeled part off, you'll have something to hold on to when you grate it.. By now, your butter should have softened enough to mix. If not, take a tea break, or zap it in the microwave for a few seconds if you're impatient and using a microwave safe bowl.\nBeat the butter and sugar together on low speed until thoroughly mixed. Then add the egg, molasses, and vanilla. Grate the ginger and add that too. Beat again until smooth and creamy.. This is the part where I clean the beaters on the mixer because I won't be needing it anymore. When combining the wet and dry mixtures in any baking, you don't want to overmix it because that will develop the gluten in the flour and make your baked goods tough. I'm mixing this with a spatula.\nAdd half the flour mixture and the chopped ginger to the creamed mixture. Stir just until mixed. Add the rest of the flour and stir just until mixed. A little bit of dry stuff around the edges is ok.. Now, set up a little manufacturing station with your dough, a saucer of sugar, and your baking sheet. I found that using parchment on the baking sheet makes for chewy cookies. Baking them directly on the greased sheet results in crisp cookies. I like my cookies crisp on the outside and chewy inside, so I will be baking them on parchment.\nPreheat your oven to 350\u00baF now, and start rolling the dough into 1\" balls. I like making the cookies small so I can eat more of them.\nI do most of the forming with the teaspoons to avoid handling the dough too much.\u00a0 Sorry there aren't more pictures of this process, but it's a two-handed job. What I do is scoop up a small lump of dough with one spoon, and then scrape it into the other spoon. Scrape it back into the first spoon. Repeat this a few times until the lump of dough is rounder and more compressed. Then I give it a quick roll between my hands and drop it into the dish of sugar. 
Roll it in the sugar until all sides are covered, and place on the cookie sheet.\nIf you stagger the lines of cookies, you can fit more of them on the sheet.\nKeep on rolling cookies until the oven is ready. After I put the sheet in, I start putting the cookies-in-waiting on a plate.. Bake for 8-10 minutes. When you take the cookies out, they will be puffy, but they will flatten when they cool. While they are baking, finish rolling the rest of the dough.\nIdeally, you should have two cookie sheets, so the next one will loaded with cookies and ready to go in the oven by the time the first one is done. I only have one sheet, so I move all the cookies off it after a minute or two, and then let the sheet cool before putting on the next batch of cookies. You don't have to let the sheet cool all the way - if you can pick up the sheet with your bare hands then it's ready for more cookies.. You're done! This recipe makes about 60 cookies.\nThese cookies travel very well, so share them with your friends. 
Or don't, and keep them all for your teatime.\nRead the question below and select from the following choices.\nA: Professor Phineas' Infamous Ginger Beer\nB: Chip Cookies\nC: Measure Butter\nD: Ingredients and Equipment", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_27.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_145_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_30.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_31.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_32.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_33.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_34.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_145_35.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Cake Base\nB: Rainbow Layer Cake\nC: How Many Guests Do You Need to Feed?\nD: Make Your Cakes, Icing, Fondant, Etc.", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Apple Layer', 'Poppy Seeds Layer', 'Walnut Layer']", + "context": "Here is the context of these images:\n. 500 g flour250 g butter100 g icing sugar3 yolks1 dl white wine of your choiceMix dry ingredientsAdd in yolks and butter, work until mixture is crumblyPour in wine and kneadWrap dough with cling wrap and chill in the fridge for 2 hMeanwhile, prepare the other layers. 800 g green apples (tart but sweet)2 tbsp honey1 lime, juiced and grated the skinenough ground cloves and cinnamon powder (to taste)Skin and grate apple, then mix with the rest of the ingredients. 200 g poppy seeds, ground until fine100 g sugar50 g raisins1 dl white wine of your choiceMix everything in a pot, heat over low-medium heat until thickened (sugar dissolved)Cool to room temperature. 200 g ground walnut100 g sugar50 g raisins1 dl white wine of your choiceMix everything in a pot and heat over low-medium heat til thickenedCool to room temperature. 
Divide dough into 4 portions, roll flat each of themPlace one layer as bottom, spread walnut filling, top with another layer of dough, spread poppy seeds layer, then top again with another dough, spread with apple layer and top with the last doughBrush top dough with an egg wash and bake at 170 C for 60-65 minutesJust before slicing, sprinkle top with generous icing sugarEnjoy :)PS: You can place them into mini cupcake liners \nRead the question below and select from the following choices.\nA: Cake Base\nB: Rainbow Layer Cake\nC: How Many Guests Do You Need to Feed?\nD: Make Your Cakes, Icing, Fondant, Etc.", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_146_18.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Making the Pie Crust\nB: Scratch Pumpkin Pie\nC: 
Roll, Dry, and Cut the Dough\nD: Mix the Dough", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Scooping Your Pumpkin', 'Pumpkin Mixture', '@placeholder', 'Filling and Baking']", + "context": "Here is the context of these images:\n. Sugar pumpkins are the best, butternut squash is a great substitute. First wash the pumpkin then cut in half vertically. Remove the seeds and strings(reserve the seeds for toasting). Place on a baking sheet shell side up and bake at 325 deg. for 1 hour or more depending on size. Bake until tender.( I bought this pumpkin from Home Depot.... This is the first time the shell did not collapse, usually the shell gets soft so don't get worried if it gets soft ...that's about the time its done. ). When baked tender scoop out all pulp and place into a processor or blender, blend until smooth. In a mixing bowl mix well 2 cups pumpkin puree.1 1/2 cups evaporated milk1/4 cup dark brown sugar1\\2 cup white sugar1/2 tsp. salt1 tsp. cinnamon1/2 tsp. ground ginger1\\4 tsp. ground nutmeg 1\\8 tsp. ground cloves2 slightly beaten eggs. I use Sweetzels spiced wafers for the shell. in a processor add 2, 7 oz. boxes of wafers. and blend until fine.add: 2 tbsp. light brown sugar and a dash of salt. With the processor still running add 6 tbsp melted butter until combined.Line a 9 inch pie pan with the crumb mixture about 1\\8 inch thick all around..Bake at 350 deg. for 6 to 8 minutes and let cool.. Fill the pie shell with the pumpkin mixture. very carefully place into a 425 deg. oven for 15 min. Reduce heat to 350. deg. and bake for 45 minutes more or until an inserted knife comes out clean. Any extra mixture can baked in a small ramekin . Enjoy!!!! 
\nRead the question below and select from the following choices.\nA: Making the Pie Crust\nB: Scratch Pumpkin Pie\nC: Roll, Dry, and Cut the Dough\nD: Mix the Dough", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_147_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_147_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_147_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_147_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_147_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_147_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_147_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_147_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_147_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_147_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_147_10.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Chicken Shwarma\nB: Relax Your Chicken!\nC: More Chopping.\nD: Enjoy!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Preparing the Samosa Sheets', 'Pasting in to Triangle', 'Frying', '@placeholder']", + "context": "Here is the context of these images:\n. For the\u00a0 filling\nChicken pieces with out bones (1/2 Kg)\nOnions(double the quantity of the chicken)\nGarlic\nGinger\nLime\n3-4 cloves\nFew green chillies (option)\nSalt\nChili powder/Paprika powder\nOil\nCurry powder\nTurmeric PowderFor the dough\nFlour\nSalt\nOil\nWater. Combine salt and water together mix and knead well. 
With your floured hands roll out ,make small balls in to pairs,double the size of a marble and flatten them with your finger tips.Apply oil to one side and place another flatten dough on top.Press them together ,roll out a bit.\nDust a little flour on the working board and roll them again\u00a0 to oval shape to about 7-8 inches in length.If you want you can roll it a little bit longer\u00a0 ,so that you can have bigger Samosa's. Have them all ready, so that you can warm it in the\u00a0 pan one by one. Toss them in to the heated non stick pan for few seconds.Just one side half done would be fine\nDon't wait until it is fully done.Take it out and slowly remove the two pasted sheets.Since I applied oil in between the doughs once it is heated slightly, it removes with out any problem\nYou can make the Samosa sheets as much as you want and freeze them for later use..Defrost\u00a0 very well before you use.. Make a paste by mixing flour and water.\nFold from\u00a0 the\u00a0 right side of the dough, with your finger tips apply the paste marked along the dots.(see picture)Press well and paste.\nNow fold from the left side and paste on top of the first fold.Se detailed\u00a0 picture.Once you do the pasting it should be in a\u00a0 shape of a triangle.. Combine chicken,salt and turmeric powder,cook\u00a0 on slow fire until is done.Drain the water out and chop the chicken in to tiny pieces.\nChop the onions and make the garlic and ginger paste.\nHeat oil and stir fry the garlic first,and add the ginger paste,when it is a bit golden\u00a0 in color add the onions ,salt. 
and curry leaves.\nIf you like to have hot ,add green chillies cut in to small rounds.The onions will reduce to half of the quantity once it is cooked\u00a0 on slow fire.Then add the chicken,stir well,add chili powder ,turmeric powder,curry powder one by one on slow fire.\nWhen it is fully cooked add half lime juice ,stir,cover and let it simmer in slow fire.Time to time m last mix well .This gives a very nice flavor.. I filled up all the triangle cones with the yummy chicken filling.\nFollow the rest of the pictures you will understand.\nHave them all ready in a try to be fried.. Depending\u00a0 on how big your frying pan, put a couple of the Samosa's in hot oil and fry them until golden brown.Turn them on both sides while frying.Transfer it to a sieve for the oil to drip and then to a kitchen tissue.Semi fry if you want to freeze them for later use.\nWhen it is cool pack them up and freeze it..Defrost two hours before and re fry to golden color.. Have them hot hot as it is very crispy.I love this Chicken Samosa so much that I end up eating about 8-10 at\u00a0 one go.\nEnjoy\u00a0 and thanks for reading.Note:\nI have posted this in the Hurriecane contest too.\nI am a day care mom who is working full time,and do not have enough time for my sewing and crafting work which I love it very much.If ever I win the laser cutter I will give up my full time job or reduce it to a couple of hours(because I do a lot of craft work\u00a0 with the kids and they will be dissapointed to see me no more)and start my own business at\u00a0 home as I have enough and more sewing orders which I can not do it on time.Every instructable I do with the help of the laser cutter,I will definetly post it on this great site.\nRead the question below and select from the following choices.\nA: Chicken Shwarma\nB: Relax Your Chicken!\nC: More Chopping.\nD: Enjoy!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_0.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_148_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_148_26.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Blue Hawaii Parfait\nB: You Will Need...\nC: Just a Note...\nD: Snow Cone Surip", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Raspberries', 'Water', '@placeholder', 'Molding']", + "context": "Here is the 
context of these images:\n. First you'll need 17 raspberries strawberry jello mix sugar Hawaiian bunch snow cone syrup and Popsicle molds. . Take the raspberries and crush them in a bowl. Take one table spoon of jello mix and put it with the raspberries. The jello will help it harden quicker.. Add 1/4 a cup of warm water I the bowl and mix.. Add 1/4 a cup of snow cone mix and mix.. Once mixed pour the mix in the molds and freeze and in 15-20 minutes and enjoy \nRead the question below and select from the following choices.\nA: Blue Hawaii Parfait\nB: You Will Need...\nC: Just a Note...\nD: Snow Cone Surip", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_149_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_149_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_149_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_149_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_149_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_149_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_149_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_149_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_149_8.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Man Cupcake Cake\nB: The Cake\nC: Spongebob Cake\nD: Decorate/Eat", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather Your Supplies', 'Vanilla Extract', 'Microwave', '@placeholder']", + "context": "Here is the context of these images:\n. For this project you will needA coffee cupA 700 or 1000 watt microwaveTwo forks or One fork and One whisk5 Tablespoons, or one tablespoon you continue to whip off as you goA small bowl . For this recipe you will need: FlourSugarVanilla ExtractAn eggOil, (vegetable preferred)Milk. 
Measure out four tablespoons of flour and four tablespoons of sugar into your coffee cup, then mix them together with your fork. . Whisk an egg with either another fork or a whisk in a separate bowl. Then pour it into the cup with your flour and sugar. Next, stir all three ingredients together evenly. . Now, measure out three tablespoons of milk and three tablespoons oil and add them to your coffee cup. Blend them evenly with the other ingredients. . Add one table spoon of vanilla extract and stir it in. . It's time to microwave.If it's a 700 wattIt needs to cook 4 minutes.If it's a 1000 wattIt needs to cook 3 minutes.. If you would like you can gather frosting or other toppings and decorate your 'Cup' Cake before you eat it. I, personally, did not and just ate it plain. \nRead the question below and select from the following choices.\nA: Man Cupcake Cake\nB: The Cake\nC: Spongebob Cake\nD: Decorate/Eat", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_150_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_150_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_150_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_150_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_150_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_150_5.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Pancake Mix to Banana Pancakes\nB: To Cover or Not to Cover\nC: Enjoy\nD: Clean and a Word From Emma", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Spray and Pour', 'Wait and Take Out', 'Serve and Enjoy!', '@placeholder']", + "context": "Here is the context of these images:\n. Add...\u2022 Your 1 cup of pancake mix\u2022 3/4 cup of water\u2022 And the optional sprinkles Mix...\u2022 Mix all ingredients until you have a good mix- not watery and not lumpy . 
Spray...Take your bakers spray and spray the top and bottom of your makerPour...Pour your batter in each of the donut holes. Wait...Wait until each donut has fully cooked threw-until golden and you checked with a tooth pickTake Out...Take the donuts out when you checked them all to see if they are cooked threw. Serve... Serve the donuts on a plate of course!Enjoy...Then enjoy your donuts! If you like put syrup and/or whipped cream in a bowl to dip in! (I look bad!). Cleaning...\u2022After you are done wipe the counters, maker....\u2022Put all of your ingredients away_____________________________________And you are done!Hope you enjoyed!Like, and comment and other donut flavors that I should try!- Emma (iluvmy2pets)Btw this was my first DIY on here!\nRead the question below and select from the following choices.\nA: Pancake Mix to Banana Pancakes\nB: To Cover or Not to Cover\nC: Enjoy\nD: Clean and a Word From Emma", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_151_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_151_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_151_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_151_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_151_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_151_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_151_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_151_7.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Slow Cooker Potato Soup\nB: Heat Milk & Chocolate\nC: Whisk Milk\nD: ins", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Mix All Your Ingredients in Your Slow Cooker', 'Let the Slow Cooker Do Its Thing', '@placeholder', 'Travel to the Party and Enjoy']", + "context": "Here is the context of these images:\n. 
To make the hot chocolate you will need: 1 bag of semi-sweet chocolate chips (~2 cups) 1 carton of cream (16oz) 1 can sweetened condensed milk (14oz) 6 cups of milk (I used 2% because that is what we drink) 1 Tablespoon of vanilla Mix everything together in the slow cooker. Alternatively you can use whatever your favorite hot chocolate recipe is and mix it all in the slow cooker. Subbing your favorite type of chocolate in for the semisweet chips is also a way to mix it up (white chocolate chips also make a very yummy hot chocolate).. Once everything is mix set the slow cooker to low and let it heat for about 2 hours. I whisked the mixture after about 1 hour and then 1.5 hours in. After 2 hours I whisked again to make sure all was incorporated, after 2 hrs all the chocolate should have melted and it should be a delicious hot chocolate.. The hot chocolate on it's own is pretty awesome, but everyone loves a little extra too! The event I am bring this to has both drinker and non-drinkers so I left everything to the side so people could choose what they want to add! Ideas for non-alcoholic mix-ins Mini marshmallows (or big ones) whipped cream caramel sauce Ideas for alcoholic mix-ins Peppermint Schnappps Butterscotch Schnapps Irish Cream RumChata Kahlua Infused vodkas Alcohol infused creams. Once the hot chocolate is made you can just transport your slow cooker to where ever the party is, plug it in and have it set to warm or low. People can ladle hot chocolate into cups and mix with all the extras! Yum! 
I forgot to take a picture before I went and there wasn't any left afterwards to take a picture of!\nRead the question below and select from the following choices.\nA: Slow Cooker Potato Soup\nB: Heat Milk & Chocolate\nC: Whisk Milk\nD: ins", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_152_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_152_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_152_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_152_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_152_4.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Bubur Lambuk\nB: Gut Stuffing Prep\nC: Garlic & Quorn\nD: Sauce", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Bloody Sundried Tomato Oil', '@placeholder', 'Boiling Pasta', 'Finish Them Guts']", + "context": "Here is the context of these images:\n. \n This instructable requires basic cooking and cutting skills.\nYou will be needing\u200b\n250g Spiral Pasta1 diced onion6 diced garlic cloves1 diced carrot1 stalk diced celery1 cup frozen peas3 sun dried tomatos18 large basil leafssalt and pepper100g Pancetta1 egg100ml cream100g parmesan cheese50ml olive oil (plus extra)1 length 30mm Sausage casingsFor the sausage casings, I did not want to have to recook the meal so I used non animal based casings. These required being soaked in warm water for at least 45 minutes. \u00a0After which I rinsed and let sit in bowl.\n . This step is easy, I used three Sundried tomatoes and a bit of reserved oil from the jar. \u00a0Dice up the tomatoes finely and mix with the oil... taadaaa!! Next step, the Sausage Stuffing Device.. I wish I had a proper sausage stuffing horn, but I don't. \u00a0So instead I made one. \u00a0I found a cylindrical plastic wine glass and cut off the bottom. 
\u00a0You can see a marking on the picture where the I measured a cut. \u00a0I aimed to make it around 25mm wide.\nFrom there I took a rinsed sausage casing and tied a not in the base of it, pretty simple ey'. \u00a0The other end I carefully slid on my new homemade sausage casing horn...\u00a0. \n Another relatively simple step. \u00a0\nGet a large pot and put in heaps of water, pasta likes to be able to swim around a bit...Get that water hot, I do this by igniting the small burning element underneath the pot.Salt your water, you should aim to make your water as salty as the ocean...\u00a0Take note, you will need about a cup of water from this step for the next step, either reserve a bit at the end or start the next step at the same time like a champ.Add pasta and give it a stir for a few seconds so that it doesn't clump together. \u00a0Notice the wooded spoon on top, that is to help prevent the water from boiling over.After around 5-7 minutes I begin testing the pasta, to do this I carefully scoop out a noodle and take a bit. \u00a0You want the pasta to be quite soft for this meal, a bit past al dente. If it is not done, wait a few more minutes and try again.When the pasta is swell, pour it into a strainer and give it a quick rinse. \u00a0I want the pasta to be warm still since I will be serving immediately.\n\u00a0\n . I place the pancetta in a pan on low with a tablespoon of olive oil then\u00a0let it slowly warm up, this encourages as much of the delicious fats to ooze out of this delectable Italian style bacon.After about 10 minutes of occasionally stirring I add the onions, garlic, carrots, and celery.Increase the heat to medium and continue cooking for another 5-10 minutes occasionally stirring.After the onions begin to brown and carrots soften, I begin to take some of the pasta water out and pour it into the pan. \u00a0This is a process known as deglazing. 
\u00a0You may have noticed lots of browned bits in the bottom of the pan, but adding some liquid and stirring with a wooden spoon was scrape those delicious bits off and create a bit of a sauce at the same time.I next add the cream and the peas.. \n By now you may be drooling a bit... That is to be expected. \u00a0The scent of garlic and fatty cured pork belly does that sort of thing.\nI next add the basil... thats it, just put the basil in the pan.Now I temper the eggs. \u00a0This is a process to slowly warm up eggs so that they do not curdle when added to a sauce. \u00a0To do this I mix the egg in a small bowl then add a few spoonfulls of the carbonara sauce.Next I pour the sauce into a bowl on top of the pasta...\u00a0Now throw on the parmesan cheese and egg mix. \u00a0The heat from the sauce and pasta will be enough to cook the egg and thicken the sauce a bit...Yeah, now it is getting cheesy and delicious...You can eat this now, or you can continue this instructable and make it look like disgusting intestines... your choice.. Now comes the fun and giggles...\nBe careful, I waited about 10 minutes for the pasta to cool down a bit so I did not burn myself.\nTo do this, I add a spoon full of pasta into the sausage horn and cram it down with the end of a spoon. \u00a0Not the most sophisticated of methods, but it works... \u00a0Do pay attention to the lovely noises the video makes, doesn't that sound delicious.\nKeep adding a bit more and cramming it down into the the casing. \u00a0Make sure you make a lot of childish jokes and unleash your true immaturity during this part.\nEventually you should have about a meter or so of dinner, tie off the other end and repeat the process until you are out of pasta. \u00a0I was able to make about 3 lengths of intestines with this.... Mix with the bloody oil, light some candles, and unleash your inner zombie.\nThese look disgusting, they really do... 
I am kinda cringing about making something that looks so gross, but it does taste quite delicious... so bon appetite... unless you are the walking dead, in which case I say aarrghh, yarrgghh, aaarrhhghgh.\nRead the question below and select from the following choices.\nA: Bubur Lambuk\nB: Gut Stuffing Prep\nC: Garlic & Quorn\nD: Sauce", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_25.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_153_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_153_27.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Salmon Carpaccio\nB: Cooking on the Stove\nC: Prepare Coronet Batter\nD: Greens", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Prepare Creme Fraiche', 'Prepare Salmon Tartare', 'Assemble and Eat']", + "context": "Here is the context of these images:\n. To properly roll the cookies into their ice cream cone shapes, you'll need a conic form that can withstand some time in a 400F oven.\nThe cookbook suggests a #35 4 1/2\" coronet mold, but since this was a one-off for a French Laundry themed party we decided to make our own out of paper.\nAfter some rummaging, I found a 4\" diameter circular object for tracing (the base of a pitcher) and made some circles on a manila folder. I also made one on a sheet of glossy paper, the thick stock used as the cover of an expensive yuppie magazine we magically get for free. Note that I'm NOT putting the glossy stuff into the oven for fear of toxic bleeding or outgassing; that's what the manila folder is for.\nDraw another circle on the glossy paper ~1/2\" outside the original circle, and add a tab. Now cut around the outside circle and inside of the 4\" circle to make a 4\" diameter stencil.\nCut out the manila circles; I used 5. These need to be shaped into cones for use as your forms, so you've got to get them nice and tight. I wanted to staple them into position, but they're too small to successfully staple. We also nixed glue, tape, and rubber bands as unable to stand up to oven conditions. Pinning sounded good in theory, but probably would have ended in tears. I finally ended up sewing them in place, which was surprisingly fast. 
The key is to pass the thread directly THROUGH the cone, then wrap around the flap as you prepare for your next pass. After three or so stabs across the cone, exit next to the original knot (you should have made a BIG knot, and left an inch or so of tail) and tie off with the tail. These worked beautifully, and looked sort of spooky. . Ingredients:\n1/4c + 3T all-purpose flou\n1T + 1t sugar\n1t kosher salt\n8T (1 stick) unsalted butter, soft but still cool\n2 large egg whites, cold\n2T black sesame seeds\nMix flour, sugar, and salt together. Separately, whisk butter until it's completely smooth; I used my Kitchenaid with the whisk attachment. Add egg whites to the dry ingredients, and mix thoroughly with a stiff spatula. Dump the egg mixture into the butter, and whisk until batter is creamy and without lumps.\nI don't have a picture of the bowl of pasty goo, so here's some of it in the stencil.. Get out your Silpat. If you don't have one, head to any kitchen store and shell out $15. Once you have a Silpat you'll find a million uses for it.\nPlace the stencil on the Silpat, and scoop some batter into the center. Use the sharp-edged spatula of your choice to spread the batter in an even layer over the stencil; scoop off any extra. If it's grossly uneven you'll get localized browning/burning. Don't leave any holes. Lift stencil and repeat. I did five coronets per sheet, which seemed like plenty. Also, I only had the patience to sew five molds- don't lay down more coronets than you have molds.\nSprinkle black sesame seeds over the top of each coronet.. Put the Silpat on a baking sheet, and transfer to your preheated 400F oven. Cook for 4-6 minutes, until the batter is just set and you can see the batter ripple a bit. They'll start sliding around on little melted-butter trails if your baking sheet isn't entirely flat, but this is easily fixable.\nPull the sheet out and sit it on the open oven door to keep warm while you work. 
Hold the top of your paper mold with your off hand, and use a tool to manipulate the coronet with your dominant hand. Be careful- the coronet is hot and greasy; you REALLY don't want to touch it directly. Roll the coronet around the mold as tightly as you can, and finish with the seam side down. Roll the other coronets and place them up against each other to prevent unrolling.\nPop the sheet of rolled coronets back into the oven for 3-4 minutes to set the seams and let them color up a bit. The French Laundry seems to make coronets that are entirely golden-brown, but I took mine out earlier for fear of burning. This worked just fine.\nLet the coronets cool/solidify on paper towels for a few minutes before removing the paper forms.. Ingredients:\n1T finely minced red onions\n1/2c creme fraiche\n1/4t kosher salt, or to taste\nfreshly ground white pepper to taste\nRinse red onions in a sieve under cold water, then dry on paper towels. Whisk creme fraiche in a small metal bowl for 30sec-1minute, or until it holds soft peaks when you lift the whisk. Fold in onions, then season with salt and pepper. Refrigerate until ready to serve, up to 6 hours.\nI never got the creme fraiche to reach soft peaks, so shoved it in the fridge and hoped for the best. It gets a bit more solid as it chills, but... not a lot. Also, wash more than 1T onions as some get lost in the sieve; measure the 1T off of the paper towels.. Ingredients:\n4oz sashimi-grade salmon fillet (belly preferred), skin and any pin bones removed and very finely minced\n3/4t extra virgin olive oil\n3/4t lemon oil (zest is a potential substitute)\n1 1/2t finely minced chives\n1 1/2t finely minced shallots\n1/2t kosher salt, or to taste\npinch freshly ground white pepper, or to taste\nFind a nice big SHARP knife to mince the heck out of the salmon fillet. They claim a food processor would ruin the texture; it would certainly be less fun. Mix in remaining ingredients, then chill for 30 min to 12 hours.. 
Assembly is easy: a dollop of each ingredient, presented like an ice cream cone. They recommend serving them in a lucite holder, but I got lazy and it wouldn't have worked anyway (see below). If you can't get at a laser cutter or machine tools, you could wedge the cones in rock salt, peppercorns, or the like for a snazzy presentation.\nFirst, scoop a bit of the creme fraiche into the top of the coronet. Pipe it in with a pastry bag for bonus points. Apparently if you prepared it properly, it will be thick enough to stick at the top of the cone; mine chose to be too runny for this to work. Thus, the horizontal cone trick: I poured the creme fraiche in, then kept it as close to level as possible while adding the salmon, and served it lying on a plate.\nYou can use a melonballer to create cute little salmon scoops, or just do it quickly with a small spoon and/or clean fingers. Stick a chive tip out the top of the salmon ball to look extra classy, or possibly more like a Teletubby. Eat immediately if not sooner.\nEither way, they were fantastically tasty. If I do this again, I'd probably skip the cones and just plop the half-baked coronet rounds into mini-muffin pans to make non-leaky shells to hold my ingredients. I'd probably substitute a mix of cream cheese with either sour cream or yogurt for the creme fraiche, as it's a lot cheaper, and it mainly provides a fatty foil for the salmon. 
Could be made lower-fat if you care about these things.\nCertainly worthy of a repeat, though.\nThis made approximately 20 coronets.\nRead the question below and select from the following choices.\nA: Salmon Carpaccio\nB: Cooking on the Stove\nC: Prepare Coronet Batter\nD: Greens", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_154_23.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: cooked Rice & Chicken Recipe\nB: Rinse Rice & Lentil\nC: Chicken Pot Pie 
Recipe\nD: Ingredients", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Add Onion', 'Add Rice, Red Lentil and Water', 'Serve']", + "context": "Here is the context of these images:\n. Step 1: Take 2 tbsp of lentil and 1/2 tbsp of rice into a bowl. Rinse couple of times and keep aside. Step 2 : Heat a sauce pan and pour 1/2 tsp of oil... Step 3 : Add ginger and saute for 20 seconds... Step 4: Add the Onion and saute for a minute... Step 5: Add the chopped chicken, spice powders, salt and cook for 2 minutes on medium-high heat... Step 6: Add the shredded vegetables, coriander leaves and cook for 1 minute... Step 7 : Add the rice, red lentil, 1 cup water and mix... Step 8: Put the heat and bring it to a boil. Step 9: When it boils, reduce the heat to low and cook for 20 minutes... To Serve your toddler: Take 1 Cup of cooked meal in a bowl, shred the chicken into tiny pieces and serve when warm :)..Hope your little-one enjoys this meal :)\nRead the question below and select from the following choices.\nA: cooked Rice & Chicken Recipe\nB: Rinse Rice & Lentil\nC: Chicken Pot Pie Recipe\nD: Ingredients", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_12.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_155_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_155_18.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Heck of a Mellony Dish\nB: Icing Time\nC: Flavor Combination #1\nD: Chilling and Storage", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['villi Orange Cake', 'Crumb Coating', '@placeholder', 'The Final Cakes']", + "context": "Here is the context of these images:\n. The first stage of this cake is to make a lemon Victoria sponge.IngredientsMargarine (469g) Caster sugar (469g) Eggs (7 eggs) Self raising flour (469g) Baking powder (1.5 teaspoons) Lemons (zest and juice of two lemons)To get a great Victoria sponge, weigh the eggs in their shells and then match that weight with each of the other ingredients. This cake was a 7 egg sponge and these weighed 469g. Pre-heat the oven at 180C and line a rectangular tin. First, cream together the margarine and caster sugar until pale. Slowly add the egg to the mixture. Next add the flour and 1.5 teaspoons of baking powder. Once everything is combined add the zest and juice to the mixture. Put into the tin and smooth out and pop into the oven. This will take about 30-40 minutes. Try and resist the urge to open the oven as this will make the cake collapse. At the end of the time, check the cake is ready, pop a knife in the middle, if it comes out clean then it is done. If not, leave it for another 5 minutes. Leave to cool.. Now it's time for the gluten free damaged orange cake. 
This is adapted from a recipe from Nigella Lawson https://www.nigella.com/recipes/lemon-polenta-cake It was increased to being a 4 egg cake.IngredientsSoft butter (266g) Caster sugar (266g) Ground almonds (250g) Polenta (150g) Baking powder (1.5 teaspoons) Eggs 4 Zest of two oranges and the juice of 1Preheat the oven to 180C and line the tin. Cream the butter and sugar together until pale. Combine the ground almonds, polenta and baking powder together. Add 1/3 to the butter and sugar mixture and then add some of the egg. Alternate the dry goods and the eggs. At the end add the zest and the juice together. Pop this in the tin and then in the oven for 40 minutes. Check that it is done in the same way as before. This cake will not rise as much as the sponge.. Once the lemon cake is cooled, cut it in half.Cut one half of the cake into strips length ways approximately 3 cm wide. These will make the beginning of the villi.Now cut the strips into smaller rectangles and carve off the top of the rectangles to make cylinder shapes.. An important part of decorative cake making is the crumb coat of icing. This is a thin layer of buttercream icing which catches up all the crumbs of a cake which you have cut. First whip up some buttercream.Buttercream icingSoft butter (160g) Icing sugar (500g) Vanilla essence (0.5 teaspoons) Splash of milkAllow the butter to be room temperature, often I will leave it out from when I start the cake making process. Mix the butter with a little bit of the icing sugar. Slowly incorporate the icing sugar and add the vanilla essence. Once all the icing sugar is incorporated, add a dash of milk and mix for at least 5 minutes. This allows the buttercream to become very soft and easy to work with.To spread it over the cake, take some hot water and place a knife in it. Using a warm knife allows you to spread the icing like butter! 
Cover both cakes with a thin layer of icing, keep popping the knife into the water to help spread the icing.Once covered, place the cakes into the fridge for at least 30 minutes to allow the icing to get firm.. As with the cakes, now add a crumb coat to each of the healthy villi. To attach these to the cake, place a cocktail stick in the bottom of the villi and then pop on the cake. This will give added security. Place the villi on the cake in a random fashion.For the damaged villi, these are made out of fondant icing. Roll a palm full of icing into a ball and then flatten to create disks of icing. Now place on the cake in a similar distribution as the healthy villi.. This is may favourite part of cake decorating. Making the icing the right colour, it's a bit like playing with playdoh!I use just off the shelf fondant icing. First, you need to work the fondant. I add a little cornflour to my hands so the fondant doesn't stick to me. Work the fondant for a minute or two until it is soft. For this cake I used 1.5kg of icing. 1/3 of that was used for the darker pink for the sides of the cake and the other 2/3 were to cover the top of the cake. When you are not using the fondant make sure you cover it in cling film so it doesn't dry out.To colour them I use gel colours, they are a bit more expensive but they last a long time and a little goes a long way. With a cocktail stick, add a little of the colouring to the fondant icing. The covering was a pink colour and the outside used a chestnut colour as this can give a realistic skin colour.I roll my icing out on a non-stick mat covered in cornflour. If you do not have this I would recommend clingfilm or non-stick baking paper. . Roll out the icing for the top of the flattened villi cake first, this is the easier of the two cakes to cover. 
Try not to roll this too thin otherwise you will not be able to lift it off the rolling mat.Once it is big enough to cover the cake, place the rolling pin in the middle of the fondant and flip on side of the icing over the pin. This will enable you to then lift the icing up and place it carefully on top of the cake. Slowly rub the icing down over the flattened villi until the icing is draped over the sides. Now cut the excess icing away. Roll out the darker skin colour and cut to fit the sides. Press this onto the cake and leave for about 10-20 mins so that the icing hardens somewhat and then trim off the bottom excess.If at any point you get a small split or crack in the icing, a top tip is using Trex (a solid vegetable fat) to smooth things out.. For the healthy villi cake, start in the same way as the flattened cake. Roll out the pink icing and place it on top of the cake. Work the icing down the villi, do not worry if the icing rips. This is where the Trex comes into its own. Mix it with some of the coloured icing and you can use it like plaster to cover up the cracks and patch up the cake. Add the sides to the cake.. 
Display your proud villi cakes!Now Enjoy!\nRead the question below and select from the following choices.\nA: Heck of a Mellony Dish\nB: Icing Time\nC: Flavor Combination #1\nD: Chilling and Storage", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_27.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_156_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_30.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_31.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_32.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_33.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_34.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_156_35.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Mix Your Lemonade\nB: Lemonade Slushy\nC: Mix and Serve!\nD: Basil Lemonade", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['What You Need', 'Juicy!', 'Prepare the Water', '@placeholder']", + "context": "Here is the context of these images:\n. The ingredients' list is very simple:5 big lemons (mine are from Puglia, South Italy :D)4-5 stems of fresh rosemaryGrinded cinnamon1.5 liters (50.70oz) of bottled mineral waterOrganic honeyOther tools and equipment:A big jarA squeezer (electric or manual)Knife, spoonPotAs you can see, nothing special!Let's move on.. Start by cutting the lemons in half with a long knife.Don't scratch your eyes while cutting lemons!!! I've learned it the hard way XDThen, squeeze all the juice you can with your squeezers.My beloved Kenwood blender has a special accessory, which transform it in a electric and powerful squeezer. Very useful!At the end, you should have from 400ml to 600ml of fresh, 100% pure lemon juice!Pour it in the jar, you can help yourselves with a funnel, like I did.. The next step consist in boiling the bottled water.Why, you ask? 
Two reasons:help the melting of honey and cinnamonextract the essence from the rosemarySo, start by pouring the water in a pot, with a strong fire under it.When the water is hot (but still not boiling!), add 3 big spoons of organic honey!If you like your lemonade more sweet, add a 4th spoon.Mix everything until it's homogeneous.When the water starts boiling, it's time to add 2-3 stem of rosemary to the mix.This step is fundamental to add that special flavor to our lemonade!Boil everything for nothing more than 3 minutes!We want a drink, not a broth.Stop the fire and pour a full spoon of cinnamon into the hot water, and then mix everything until you have a light brow mixture.Wait for the mix to be at room temperature.. Now, it's time to remove the boiled rosemary.It look like an algae, now! :DIt's not a good idea leaving a boiled, squishy plant inside a fresh lemonade.So, we are replacing the cooked one with a new, fresh stem of rosemary, that we can put directly into the jar with lemons juice.Now, take you mix (be sure it's at room temperature!) and pour it into the jar.You can always get a help from the funnel! ;)Stir the drink with a long spoon for a minute or two.Then, let it rest in the fridge for some hours.. After some hours of fridge, your special lemonade is ready!Enjoy this new taste, feel the rosemary in the background and the strong lemons flavor on your tongue!Refreshing and thirst quenching!This lemonade has a very strong flavor, due to the high amount of lemon juice.It's also very healthy, because the only sugar we are putting in is from honey!Also, you can transform it:fill a mug with your drink and microwave it for 2 minutes. Now you have a purifying and digestive tea!I hope you like this recipe, and get the most out of summer!Like always, from Italy, this is Filippo!Ciao a tutti! 
:D\nRead the question below and select from the following choices.\nA: Mix Your Lemonade\nB: Lemonade Slushy\nC: Mix and Serve!\nD: Basil Lemonade", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_157_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_157_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_157_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_157_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_157_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_157_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_157_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_157_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_157_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_157_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_157_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_157_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_157_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_157_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_157_14.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Ingredients\nB: Millet, Rye and Beer Bread\nC: Rise and Knead.\nD: Mixing the Sponge", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'stuff', 'Dry Ingredients', 'BAKE']", + "context": "Here is the context of these images:\n. You will need the following ingredients:\nFlour\nBaking Powder\nSalt\nSugar\nBeer (Shock Top or Blue Moon are my favorite)\nButter. 1. Preheat oven to 375 degrees\n2. Grease bread pan.\u00a0 I always use part of the butter I have set aside for the recipe.\n3. Sift flour. 
Mix Dry Ingredients\n\u00a0\u00a0\u00a0\u00a0\u00a0 3 cups flour- sifted\n\u00a0\u00a0\u00a0\u00a0\u00a0 3 teaspoons of baking powder\n\u00a0\u00a0\u00a0\u00a0\u00a0 1 teaspoon salt\n\u00a0\u00a0\u00a0\u00a0\u00a0 1/4 cup sugar. \nAdd room temperature Belgian White Beer and mix with dry ingredients.\u00a0 It will be slightly lumpy.. \n1. Pour the dough into your greased bread pan.\n2. Melt 6 tablespoons of butter.\n3. Pour melted butter over the bread dough in the pan.. \n1. Bake the dough in the preheated 375 degree oven for 1 hour.\n2. Remove from oven and remove from pan.\n3. Cool for 15 minutes, serve and enjoy.\nRead the question below and select from the following choices.\nA: Ingredients\nB: Millet, Rye and Beer Bread\nC: Rise and Knead.\nD: Mixing the Sponge", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_158_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_158_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_158_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_158_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_158_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_158_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_158_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_158_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_158_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_158_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_158_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_158_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_158_12.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Simple Tomato Basil Sauce\nB: Soak Seeds\nC: Spoon Onto Bread If Included Tomatoes, Otherwise Dip Bread Into Mixture\nD: Gently Stir With Spoon to Keep Tomato Chunks Intact", + "question": "Choose the best title for the @placeholder to correctly complete the 
recipe.['INGREDIENTS and Equipment', '@placeholder', 'Food Processor', 'Dehydrating']", + "context": "Here is the context of these images:\n. 1 cup flax seeds (ground optional for better nutrient digestion) soaked in\n 2 1/2 cups filtered water\n1/4 cup chia seeds soaked in\n 1 1/4 cup filtered water\n1/2 cup almonds raw (soaked and peeled optional)\n1 1/2 cups cashews raw \n1 cup sunflower seeds raw\n1/2 cup pumpkin seeds raw\n1 large or 2 medium red bell peppers large dice\n1 medium red onion half large dice half small dice\n2-3 cloves garlic\n1 large lemon juiced\n4 Tablespoons nutritional yeast\n1 Tablespoon apple cider vinegar \n1 teaspoon salt\n1 bunch leafy greens chopped small\n10-15 large basil leaves chopped small\n1 cup tomatoes medium dice\n3 green onions small dice\n7-10 sheets nori seaweed \n\nEquipment\n\nFood processor\nRubber spatula or tool for spreading\nDehydrator . Soak flax seeds and chia seeds in water separately. Cover and let stand for 10-24 hours on counter top. When seeds have soaked, put them together and set aside. De-seed and chop bell peppers. Add to the food processor. . Cut onion in half. Roughly chop half and small dice the other half. Add rough chop to the food processor saving small diced onion for later. . Add the cashews, lemon juice, apple cider vinegar, nutritional yeast and salt to the bell pepper, onion and garlic already in the food processor. Blend until mostly smooth or desired texture. . Add almonds and pulse until they are chopped up. It should yield about 2 cups . Combine the vegetable and nut mixture with the seed slurry until thoroughly mixed. Add the sunflower and pumpkin seeds and mix well. Add the remaining ingredients saving the tomatoes for last. Take your nori sheets and cut them in half. Lay the rough side up on your dehydrator. Be sure to do the next steps on your dehydrator trays because you will not be able to move them one assembled. . Place a nice amount in a line down your nori. . 
Spread evenly making sure to get all the corners. You can do as thin a layer as you like but the thicker you go the longer it will take to dehydrate. I tend to do a 1/4 inch with this recipe. \n. When done assembling, cover and dehydrate until desired crunchiness. 24 hours is usually enough but on occasion 48 hours is needed. . 1. If you have an aversion to seaweed or prefer a lighter cracker you can use the plastic inlay that comes with some dehydrators and spread the mixture directly on it. \n\n2. To spice it up a little I sometimes put a tablespoon or two of Harrissa in the mixture. \n. These crackers are wonderful with a soft cheese like brie. Also hummus, harissa and sliced cucumbers are a favorite. Pretty much anything you would put on a traditional cracker will be nice with your tomato basil flax crackers. \nThank you, enjoy \nand eat your way to healthy\nRead the question below and select from the following choices.\nA: Simple Tomato Basil Sauce\nB: Soak Seeds\nC: Spoon Onto Bread If Included Tomatoes, Otherwise Dip Bread Into Mixture\nD: Gently Stir With Spoon to Keep Tomato Chunks Intact", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_159_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_159_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_159_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_159_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_159_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_159_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_159_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_159_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_159_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_159_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_159_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_159_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_159_12.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + 
"visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Peel and Cut\nB: Sonoran Hot Dog\nC: Put Them on the Grill\nD: Cook Them Until They Are Done", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Supplies!', '@placeholder', 'Grill and Sautee!', 'Add the Works!']", + "context": "Here is the context of these images:\n. So, these can be Vegan dogs...but perfect for tricking kids! You will need:CarrotsPeeler/KnifeMarinade: 1/4 cup soy sauce1/2 cup water1 Tablespoon Worcestershire sauce1 Tablespoon rice vinegar1/2 Tablespoon apple cider vinegar few shakes of pepper and garlic powder. First, peel and cut big carrots to the same shape and length as hot dogs.Use a knife to cut a star pattern on the ends too...Boil in water for 5 minutes. Quickly remove them and put in cold water to cease cooking.Add them to your mixed up marinade in a plastic container.Put carrots in the marinade...completely submersed overnight in the fridge. (This marinade will turn them brown, so make sure they are submerged completely)Overnight is best...but even 4 hours is great!. Next day, grill them up to give them some grill lines...just a few minutes. (optional)Then add them to a frying pan with 1/2 cup of the marinade and cook until browned.They look like any other hot dog I've ever cooked! They have about the same texture as a hot dog.They taste salty and a little \"meaty\".... Now just add the fixings you want! 
Presentation is everything!!!We even did this same technique and roasted the carrots on the fire.They are awesome for a low calorie version!Check out my blog Doodlecraft for more awesome ideas!Please vote for me in the Pranks contest!\nRead the question below and select from the following choices.\nA: Peel and Cut\nB: Sonoran Hot Dog\nC: Put Them on the Grill\nD: Cook Them Until They Are Done", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_160_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_160_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_160_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_160_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_160_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_160_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_160_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_160_7.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Bacon Wrapped Weenies\nB: Boil the Pretzel Dogs\nC: Grill\nD: Bake Pretzel Dogs", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Slice the Jalapenos', 'Bacon!!', '@placeholder', 'Mmmmmmm']", + "context": "Here is the context of these images:\n. Ingredients:\nHot dogs\nBacon\nCanned/pickled Jalapenos\nSliced cheese\nHot dog buns\nFavorite condiments\nCooking spray\nMaterials:\nTongs\nGrill\nPlates\nSpray bottle. \nSlice the jalapenos so that they are little rings.\u00a0 Then cut open the rings. Remove seeds if desired.\u00a0 Place sliced jalapenos on hotdogs.\n**warning: wash hands after working with jalapenos.\u00a0 DO NOT TOUCH EYES. (cuts are probably a bad thing to touch as well.). \nCut or tear cheese slices in half.\u00a0 Place 1 or 2 on each hotdog, on top of the jalapenos.. 
Wrap the hotdog in bacon, making sure to not let the cheese or jalapenos fall out.\u00a0 We used 2 slices of bacon per hotdog.\u00a0 Adjust as desired.. Preheat grill for 5 minutes, then turn the heat to medium.\nCarefully place bacon wrapped hotdogs onto grill using tongs.\nFlip (or carefully roll) the dogs every 7-10 minutes until bacon is crispy and delicious.\nSpray any little fire with water to prevent the dogs from burning.\n** It might be wise to spray grill with cooking spray beforehand.\u00a0 oops.. Remove from grill when finished cooking.\nPlace hotdogs on buns.\nAdd favorite condiments.\u00a0 (We recommend Goulden's spicy brown mustard)\nEAT EAT EAT.\nRead the question below and select from the following choices.\nA: Bacon Wrapped Weenies\nB: Boil the Pretzel Dogs\nC: Grill\nD: Bake Pretzel Dogs", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_161_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_161_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_161_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_161_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_161_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_161_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_161_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_161_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_161_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_161_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_161_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_161_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_161_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_161_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_161_14.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Start the Maceration\nB: Homemade Ardennes Pate\nC: Rolling\nD: Ta Da!", + "question": 
"Choose the best title for the @placeholder to correctly complete the recipe.['Prepare the Botanicals', '@placeholder', 'Fitrate Macerate', 'Gin Distillation']", + "context": "Here is the context of these images:\n. There are nine ingredients. Per 1 L of grain spirit (36-40% ABV) prepare:12 g juniper berry/cone (Juniperis communis)12 g coriander seeds (Coriandrum sativum)12 g cucumber slices (Cucumis sativus)3 g angelica root (Angelica archangelica)3 g rose petals (Rosa x centifolia)2 g cubeb pepper (Piper cubeba)1\u00d74 cm slice orange peel/zest1\u00d74 cm slice lemon peel/zest0.5 g caraway (Carum carvi)Weight out the ingredients and put into a jar.When preparing orange and lemon peel avoid the white on the back. I.e. only use the zest. If possible, use fresh cucumber from the garden or market. Its aroma is more intense. All ingredients are regular spices.. Transfer all ingredients into a jar. Fill up the jar with grain spirit (36-40% ABV).Close the jar. Store it for one week in the dark.. After one week the macerate coloured nicely. On opening the jar, beware of the explosion of flavours. I love that bouquet. It is something between summer flowers, herbal pharmacy, and gin.Pass the macerate through a fine mesh or coffee filter to remove the spices.. The macerate is now filtrated. At this moment it is way to strong to serve as regular gin. It needs to be distilled to obtain its mild flavour.Dilute 1 part Macerate with 3 parts potable water. The resulting wine will have about 8 % ABV. That is just perfect to be distilled in the microstill.Collect the spirit from the microstill and indulge your Egon Gin! 
To your health!\nRead the question below and select from the following choices.\nA: Start the Maceration\nB: Homemade Ardennes Pate\nC: Rolling\nD: Ta Da!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_162_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_162_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_162_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_162_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_162_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_162_5.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Vegetarian Meatloaf\nB: Dice\nC: Make the Broth\nD: Dice", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Prepare the Fixins', 'Finish It']", + "context": "Here is the context of these images:\n. Broth\n60 Oz Vegetarian Broth\n8 star anise\n1-2 sticks cinnamon\n8 cloves\n1 thumb sized piece of ginger, sliced\n8 cloves of garlic, quartered\n1-2 tablespoons soy sauce\n2 onions, quartered\n3 shallots, halved\n6 cups waterVeggies (just some suggestions)\ncarrots\nbok choy\nbroccoli\ncauliflowerNoodles\n1 package thin rice noodlesPho Fixins\nfried tofu\njalapeno\nlime\nbean sprouts\ncilantro\nbasil\nmint\nSriacha hot chili sauce. - Heat a large pot over medium high heat.\n- Add the garlic, onions, star anise, cloves, ginger to to pot. Stir over heat until it begins to brown.\n- Add the broth, water and soy sauce.\n- Bring to a boil. Reduce heat and simmer for 30 minutes.\n- Strain the broth into a new pot, reheat.. - Wash the cilantro, basil, bean sprouts and mint.\n- Slice the jalapeno.\n- Quarter the lime.\n- Arrange everything on a plate.. Prepare the noodles as described on the package. The one I used were immersed in boiling water for 10-15 seconds, then rinsed.. 
- Add about a cup of noodles to the bottom of each bowl.\n- Pour hot broth over the noodles.\n- Add vegetables (carrots, broccoli, etc), allow to sit for a few minutes.\n- Add tofu.\n- Serve with chopsticks, a spoon, Sriacha hot chili sauce and the plate of fixins.\n- Everyone should add the fixins to their soup as desired, remove the jalapenos when it starts getting too spicy.\nRead the question below and select from the following choices.\nA: Vegetarian Meatloaf\nB: Dice\nC: Make the Broth\nD: Dice", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_163_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_163_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_163_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_163_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_163_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_163_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_163_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_163_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_163_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_163_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_163_10.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Place on Rack\nB: On the Bbq\nC: Baloney Sandwich...\nD: Mouthpeice", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Begin Your Flavor Layering.', '@placeholder', 'Fill Up Rack', 'Plate of Caramelized Teriyaki Bacon']", + "context": "Here is the context of these images:\n. Line a baking sheet with foil. Spray your rack with olive oil spray\u00a0 - or canola spray -- \u00a0on both sides of rack. You wil throw away the foil so no need to spray it. If you don't use foil, you will have caramelized sugar forever stuck to your pan !. 
Begin preparing your bacon flavor layering by dipping the bacon in sauce.\u00a0 I used Kikoman's Low Sodium Teriyaki sauce to flavor the bacon.\n.. Ordinary light brown sugar is used in this recipe.\u00a0\nBreak up the lumps, if any, with a whisk or a pastry cutter or a fork. I started with about 1 1/2 cups of brown sugar, but I did add some more later so it is really about 2 cups that you need for this recipe.. Pour some teriyaki sauce in bowl. I started with about 3/4 cup... but I didn't actually measure it.\u00a0 You can always add more to your bowl, if you need it. . Meaty, thick-cut sliced bacon, like this package here, works GREAT.\u00a0. Pull out 6 or 7 slices. Using kitchen shears, cut slices in half. Why ? Because that will make them the correct size to lay on your hamburger or sandwich later on when finished.\u00a0. Assembly line: Teriyaki sauce, brown sugar, then on to rack.. First, dunk each slab of bacon into your teriyaki sauce to coat both sides.\u00a0\u00a0 Coat it well for really good flavor.. Next, dredge the sauce-coated bacon piece in your brow sugar.\u00a0 Flip and coat both sides.\u00a0. Place your sugar-coated bacon pieces on your sprayed rack.\u00a0\u00a0 Be SURE you sprayed your rack with canola oil or some sort of non-stick spray, other wise your bacon may become stuck to it permanently .\u00a0\u00a0. Fill up the rack\u00a0with coated bacon pieces.\u00a0\n(... lots of reflection going on in this photo from the over head light, the foil, \u00a0and the camera flash.. almost looks like the side of my pan has flames going, but it does not.\u00a0)\u00a0. 
Properly wrap and store the rest of your bacon in the refrigerator.\u00a0 We both know that thick bacon is not cheaply priced, so treat the remainder like it is gold and store it properly so you can repeat this recipe next week for your buddies.\u00a0 After you've told them how delicious this is and that YOU made it, you know they will be wanting you to make some for them too.\u00a0\u00a0\u00a0. Baking and caramelizing in the oven. Cook for at least 20 minutes, then take out, turn over, and put back in for another 12 to 15 minutes.\u00a0 YES, this takes a LOT of time. No wonder the burger shoppes charge an arm and a leg $ for this on your burger. LOL. Holy Cow that looks GOOD !!\u00a0 YUMMY !!. Out of the oven. Cooled on rack for a little bit, then transfered to a glass plate that I lightly sprayed with the olive oil cooking spray. Let bacon cool.\nHowever, if it still does not look \"done'' then by all means put it back into the oven until it does get \"done'' .. it should \"crisp up\" as it cools if properly cooked.. 
WOW, does that look GOOD !!\u00a0\u00a0 This is now ready to add to your burger or sandwich, or eaten plain.\u00a0\u00a0 We put our Caramelized Teriyaki Bacon on grilled turkey burgers with pineapple slices and red onions, all on a whole wheat bun.\u00a0 YUMMM-O !!\u00a0\u00a0\nRead the question below and select from the following choices.\nA: Place on Rack\nB: On the Bbq\nC: Baloney Sandwich...\nD: Mouthpeice", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_164_17.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Roll Each Piece in Your Hands to Form a Ball\nB: Delicious Colombian Arepa\nC: Baking Time\nD: Cheese", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Add the Milk Mixture to the Yeast in the Large Bowl', '@placeholder', '\\u200bPlace the Rolls With Smooth Side 
Facing Up', '20 Minutes']", + "context": "Here is the context of these images:\n. First pour 1/4 cup warm water into a large bowl. Empty the yeast package into the bowl of water. Stir until the yeast dissolves (see images) and leave this alone for about 10 minutes so it can get foamy.Note: \"Proofing\" is just a fancy term for dissolving active dry yeast into warm water in order to activate it.. Stir this mixture until the sugar dissolves and until the butter is completely melted (see image). Allow the mixture to completely cool down.Caution: Failure to let this mixture cool down could cause the egg (used in step 4) to scramble or be cooked when added.. . Be sure the egg is fully mixed in (see images).Caution: Be careful not to get any of the egg shell into the mixture when cracking the egg.. Be sure to only mix in 1 cup of flour at a time, until all 4 cups are mixed in. Mix until this forms a soft dough (see images). The dough may not be fully mixed yet. . It is important that the surface is floured well so that the dough doesn't stick to the counter (see image).. Kneading dough involves folding the dough in half towards you and then using the palm of your hands to push down into the dough (see images). Add more flour to the dough if it feels too sticky. The dough should feel soft and elastic when done kneading.. This step is to prevent the dough from sticking to the bowl (see image).. Cover the bowl with a clean kitchen towel or cloth and let the dough rise until it has doubled in size (see images).Troubleshooting: If the dough doesn't rise enough, it will not make light and fluffy rolls, so you may need to restart.. Punch down the dough and follow step 7 on how to knead the dough. Knead for about 4-5 minutes (see images).Caution:The dough should be slightly sticky. Limit the amount of flour on the surface in order to prevent the dough from getting too tough or dry. The rolls won't be as light and fluffy if this happens.. 
This is important to prevent the rolls from sticking to the pan (see image).. First cut the dough into four large pieces with a butter knife (see image). Then cut each piece into four more pieces so that you end up with 16.. This will help you get the dough into the shape of a roll (see image).. This helps the roll have a smoother surface (see image). Repeat this step for all 16 pieces of dough.. Note: I only made 15 rolls as the picture shows (see image), but 16 can be made.. This may take about take 30 - 45 minutes. The reason it is important to let the rolls re-rise is because in step 10, the dough was re-kneaded which pushed all the air out of the dough (see image).. This helps the rolls have a crispy outside after being baked. . The rolls should be golden brown when pulling them out of the oven (see image in next step).. You have just successfully made fluffy and buttery dinner rolls! Whether you choose to eat them with a family holiday meal or just as a snack in your lunch, they will satisfy your craving for delicious rolls! You can eat these rolls with butter or dipping them in your soup. My personal favorite is eating them with honey. 
I'd advise trying every possibility until you find your favorite!\nRead the question below and select from the following choices.\nA: Roll Each Piece in Your Hands to Form a Ball\nB: Delicious Colombian Arepa\nC: Baking Time\nD: Cheese", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_165_27.jpg" + ], + 
"output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: How to Make Flourless Chocolate Cake\nB: How to Make an Amazing Chocolate Vegan Cake\nC: Decorate the Cake!!!!\nD: Have Fun With Chocolate Part 1", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Difference Between Chocolate', '@placeholder', 'Have Fun With Chocolate Part 2', 'Have Fun With Chocolate Part 3']", + "context": "Here is the context of these images:\n. In general, there are two types of chocolate: couverture and compound. Couverture chocolate is made with cocoa mass and cocoa butter, while in compound chocolate, the cocoa butter is substituted with vegetable oil. Compound chocolates are cheap and don't need tempering, however it doesn't taste good, and the vegetable oil inside is also not good for your body, so it's also known as the \"fake chocolate\", some cheap bakery shops will use them as they will lower the expense for the shop.Couverture chocolates are the real chocolates with cocoa butter, they taste really nice and melt in the mouth. However cocoa butter is very sensitive to temperature, if you handled it wrongly it might not set, and will probably melt too fast in room temperature.. Well it's a bit hard to understand for people without any experience. We all know that chocolate will melt at a high temperature and will set at a low temperature. But some chocolate have white oil stripes on it, or taste grainy, or melt so fast that you cannot even touch it with bare hands, or don't shine well, while other good quality chocolates will have a snap when break it, and touch dry to the finger, hold shape better, have perfect shine and melt in the mouth perfectly.That's the difference between non-tempered chocolate and tempered chocolate. We temper chocolate for cake decorations to have a better shape, better taste, better look and longer shelf life. . 
For dark chocolate, milk chocolate and white chocolate, the temperature requirement is different. In general, the higher the milk content, the lower temperature it needs for tempering.There are also a few ways to do it, in commercial kitchen we always use tabling method-- to spread melted chocolate on the marble surface or bench to cool it down. However for home bakers, it's quite messy and needs more tools, so at home I always use seeding method, which is to add cold chocolate bits into melted chocolate. So I have some white cooking chocolates, I'll put some in a glass bowl, and the others I'll chop them into pieces for later use. 1. First melt the chocolate in the glass bowl. You can use a double boiler, but microwave will do the job just fine. Heat for 20 seconds, stir, another 10 seconds, stir, and repeat until it's completely melted. DO NOT OVERHEAT THE CHOCOLATE! It will burn very easily in the microwave and you won't be able to use a burnt chocolate.2. Add chopped chocolate pieces into the melted chocolate gradually, stir everytime until it melts. Check the temperature of the chocolate and keep adding cold chocolate pieces until the temperature drops to 28-29 Celsius Degree (about 83\u00b0F). 3. Now your chocolate is tempered and ready to use! Make sure you work fast, if the chocolates sets before you finish your work, you can reheat it on a double boiler and make sure the temperature doesn't go above 83\u00b0F, otherwise you'll have to temper it again. For milk chocolate, the temperature to work is about 29-30\u00b0C (84-85\u00b0F), and for dark chocolate, it's 30-31\u00b0C (86-87\u00b0F). . This is the easiest way to work with tempered chocolate. Just simply fill the tempered chocolate in any mold you like, scrap the excess and once it's set, you can just use it. There are a lot of cute molds available online and in store, so use your imagination and find your favorite! I used a sea shell mold. 
You can use plastic mold or silicon mold, they both works for tempered chocolate. . You can fill a small piping bag or a paper piping bag with tempered chocolate, and paint whatever you like on a silicon paper or silicon mat. Once the chocolate is set, you can take out the pattern you just painted. Here I showed how to make some snowflakes which will look perfect on a Christmas cupcake. You can definitely pipe anything you like! If you pipe letters, make sure they connects to each other, or it will be impossible to take out the whole design without break apart. . This is another technique that you need to work with chocolate when it's half dry. Most of the fancy decorations are made this way. Here I'm using a chocolate feather as an example.1. Pipe the basic shape of the decoration you want to make on a piece of silicon paper (or acetate).2. Use another sheet of silicon paper to put on top, shape it with thumb and take the top sheet away to make it thin and even (you can also use small spatula). 3. Wait till the surface is touch dry but not completely set yet. This step is crucial as if you didn't wait enough time, the chocolate is still wet, and it will be very messy if you start to work with it. However if the chocolate is completely set, it will crack easily and will be impossible to work with any more. 4. Make some cuts and lines with the tip of a toothpick.5. Pipe a line in the middle as the \"bone\" of the feather. . Here are some more ideas of chocolate decorations with half dry method. For the first cake with three hoops, I applied tempered chocolate on a rectangular shape silicon paper, and make the both ends connect to each other when it's half dry. Then I use chocolate to stick all three circles together to make the design. 
For the second cake with molded chocolate, I also made a chocolate \"coconut shell\" by applying two layers of chocolate: dark chocolate on the bottom, and white chocolate on top with coconut flakes.For the third cake, I used chocolate cigarette and used round cutter to make the design. Just apply tempered chocolate on a piece of silicon paper, and use cookie cutters (or round cutters) to cut out the shape you like.You can also use a comb to scrap half the chocolate off, and you will get chocolate strips afterwards.Sky is the limit, and use your imagination!Hope you will find this instructable useful! If you like it, please vote for me in the \"cake decoration\" contest, thanks a lot!\nRead the question below and select from the following choices.\nA: How to Make Flourless Chocolate Cake\nB: How to Make an Amazing Chocolate Vegan Cake\nC: Decorate the Cake!!!!\nD: Have Fun With Chocolate Part 1", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_16.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_166_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_166_19.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Bonus\nB: Ice Cream Cake\nC: Ingredients\nD: Banana Ice Cream", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Tools', 'Gingerbread', 'Store', '@placeholder']", + "context": "Here is the context of these images:\n. \u00d75 dl (2.1 cups) of cream \u00d7a tin can of condensed milk, 2 dl (0.8 cups) \u00d720 pieces of gingerbread or more! \u00d7A bowl\u00d7Electric mixer \u00d7Spatula \u00d7Bread form\u00d7Freezer\u00d74 hours of time in feeezer. Fill the bowl with 5 dl (2.1 cups) of cream and whisk them with an electric mixer. After give some to your dog! . Add a tin can of condensed milk, about 2 dl (0.8 cups). Mix it around with a spatula. . Crush 20 pieces or more if you like, into the bowl. Mix it with the spatula. . Fill the ice cream into a bread form. Put some gingerbread crumbs on top. Store it in freezer for 4 hours minimum to make it hard ice cream. . After 4 hours of freezer time it's done! . The other day I did the same recepie exchanging gingerbread to blueberries. 
\nRead the question below and select from the following choices.\nA: Bonus\nB: Ice Cream Cake\nC: Ingredients\nD: Banana Ice Cream", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_167_22.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: How to Preserve Cucumbers\nB: How to Make Sauerkraut\nC: Pack the Jalapenos and Carrots Into the Pickling Jar\nD: Ingredients", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Prep Work Time!', '@placeholder', 'Heat the Pickling Liquid and 
Pour It Over', 'Cooling + Storing']", + "context": "Here is the context of these images:\n. Ingredients:6-10 jalape\u00f1o peppers (this will all depend on the size of the jalape\u00f1os - start with six and work your way up!)1 medium carrot1/4 red onion1 cup white vinegar 1 cup water2 cloves garlic, peeled and lightly crushed1-2 tablespoons salt (start with 1 tablespoon, 2 is pretty dang salty.)1-2 teaspoons sugar1/2 teaspoon dried oreganoa few black peppercorns, lightly crushed1 dried bay leafYou can absolutely add more sugar if you like sweeter pickles, but the carrots add a little bit of sweetness so I scaled the sugar down from the original recipe. Tools:32 ounce jar for canninggloves wouldn't be a bad idea either, you will have jalape\u00f1o hands for two days after this without them.. Slice the jalape\u00f1os into rounds as thin as you like, I normally do 1/4 inch. Test the jalape\u00f1os and remove the seeds and membranes if you think they'll be too spicy for you - this is your chance to make it as spicy as you like! You could also throw in a habanero (pierced with a knife) or some serranos to increase the heat. Slice the carrots as well - if you like them softer, go thinner than 1/4 inch. Slice the onion thinly, and crush the garlic cloves and remove the skins. Now you'll want to combine the water, vinegar, oregano, peppercorns, bay leaf, garlic, salt and sugar in a microwave-safe measuring cup. . Wash your jar and hands VERY well with hot water and dry well. You want everything to be as clean as possible.Put as much of the prepped jalapenos, onions and carrots into the jar that you can. Really push it down! . Heat the pickling mix in the microwave for 3-4 minutes, or until it's hot enough to boil.Pour it over the jalape\u00f1os and carrots you've packed in the jar. It might not all fit, but it should be close!IMPORTANT NOTE:This method results in slightly firmer pickles than the original recipe. 
If you like them to be softer, follow the method in the original recipe by heating up the pickling liquid in a pan until it boils, and then add in the jalapenos, carrots and onions and cook it for a minute before taking it off the heat. Then you'll need to pull the veggies out of the liquid, pack them in the jar, and pour the liquid over. . Once the veggies and liquid is in the jar, let it sit out open until it cools down to room temp. Now you'll want to seal the jar and place it in the fridge.The original recipe states that these pickles keep very well in the fridge for a month, but after that they're just not as tasty. Though I have to say we go through one of these 32 ounce jars in about two weeks, so I can't confirm that. We've pretty much been eating these with every meal since I started making them. ;)\nRead the question below and select from the following choices.\nA: How to Preserve Cucumbers\nB: How to Make Sauerkraut\nC: Pack the Jalapenos and Carrots Into the Pickling Jar\nD: Ingredients", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_168_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_168_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_168_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_168_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_168_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_168_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_168_6.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Tortellini Kale Soup\nB: Add the Golden Goodness\nC: Soups On!\nD: Spice Things Up", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Garlic Saute', 'Add Stocks', '@placeholder']", + "context": "Here is the context of these images:\n. 
\n Use ingredients \"off-the-shelf\".\u00a0 Quantities reflect typical units of sale.\u00a0 Standard measurements are included for convenience.\n\t\t(3 tbsp) Olive Oil\n\t\t(1 tbsp) Chopped Garlic\n\t\t50 fl-oz. can of Chicken Broth (~8 cups) \n\t\t15 fl-oz. can of Chopped Stew Tomatoes( ~21/2 cups) \n\t\t10 oz. package of\u00a0Chopped Frozen Spinach(~2 cups) \n\t\t(1 tsp) Basil\n\t\t8 oz. package of Dry Cheese Tortellini(~11/2 cups) \n\t\t(1/4 cup) Grated Parmesan CheeseIngredients #'s 1,2,6 and 8 are only sold in units larger than necessary.. Put the oil#1 in a large soup pot on medium heat.Open the can of Chicken Stock#3, but leave aside for now.Add in the garlic#2 and saute until brown.\u00a0 This will happen fast, so take a precious extra minute to avoid burning the garlic.Quench! the saute by dumping the Chicken Stock#3 into the pot.. Add the stew tomatoes#4.Add the spinach#5.Add the basil#6.\n\t\tBring to a boil.Cover and reduce to simmer for 30 minutes.\u00a0 Avoid boiling off the soup.. Add the tortellini#7.Add the Parmesan cheese#8.Recover and simmer another 20-30 minutes, until the tortellini is finished (rehydrated).. \n Done, but before eating, allow the soup to cool.\u00a0 The spinach and oil conspire to trap super-heated water.\u00a0 A bit of venting is necessary to release the trapped energy.\nEnjoy with hearty bread and rich butter. Skip the sour cream.\nServe extra Parmesan cheese on the side, rather than adding during cooking.\nYou can use more tortellini, but you're risking a stew, in my experience.Reduce the amount of sodium in the soup by using (in order of magnitude):\n- - - - Sodium-free chicken stock,\u00a0 - - - Provolone cheese,\u00a0 \u00a0 - - Natural stew tomatoes,\u00a0 \u00a0 \u00a0 - Fresh or Frozen Tortellini\n\tThis soup does not freeze well, but will refrigerate for a few days. 
The flavors improve a bit after 24 hours.\u00a0 The real genius of this recipe- it's a \"kit\".\u00a0 Stock the ingredients and enjoy fresh when you want.\u00a0\n\tCooking... proactively, what every busy person wants to do!\nRead the question below and select from the following choices.\nA: Tortellini Kale Soup\nB: Add the Golden Goodness\nC: Soups On!\nD: Spice Things Up", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_169_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_169_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_169_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_169_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_169_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_169_5.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: How to Eat for Free Everyday (at a University)\nB: Casseroles\nC: A Little Off the Top...\nD: How to Open and Enjoy a Bottle of Wine.", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Plan the Types of Meals You Will Make', 'Gather Supplies', 'Lunch Sandwiches', '@placeholder']", + "context": "Here is the context of these images:\n. Before getting started cooking I researched the types of foods that would freeze and reheat well. Fortunately, there are lots of options for delicious meals you can make and freeze In advance! Unfortunately, potatoes and cream sauces are not the best opinions, so I chose recipes that weren't cream sauce based and also avoided potatoes; it's probably healthier this way, but that's of little consolation...It's also important to consider the date you'll start preparing your meals and the date you'll need them to last through. Some foods freeze well for several months, while others only do well for about a month or less frozen. 
I tried to stick with meals that would hold up well in the freezer for 3 months.Overall, the meals I prepped fit into the following categories:- Grab and Go Breakfasts- Lunch Sandwiches- Stews - Casseroles- Slow Cooker Meals- Family Favorites (meatballs, chili, Salisbury steak). You'll need:(1) Containers to freeze foods in. I used a combination of: - Glass Storage Containers- Canning Jars- Disposable Aluminum Pans- Tin Foil- Plastic Wrap- Plastic Freezer Bags* Lots of people freeze their foods directly in plastic freezer bags. Whenever possible I avoid storing my food directly in plastic, opting instead to use it primarily as an outer layer of extra freezer protection.* Also consider the portion sizes you'll want to store your food in when deciding what types of storage containers will work best for you! (2) Recipes!! Gather up your personal favorites, or search online for some exciting new options!! (3) Groceries - For me, I've been gradually adding to my freezer stash for the past five weeks. This has been the right pace for me. You might be crazy ambitious and want to do all your cooking and freezer prep work in one massive batch, but that wasn't my approach. Before you clear out the grocery store, be realistic about when you'll do your cooking and only buy supplies for the meals you'll prep right away. You won't save any money letting a massive stockpile of food slowly go bad in the fridge.. 30 Breakfast Meals in No Time!!I focused mainly on breakfast sandwiches and smoothies. Pancakes and waffles are also great choices to make in advance and freeze. I would make a huge batch of pancakes if I had any freezer space remaining! **BREAKFAST SANDWICHES ABD BURRITOS**- Baked a big pan of eggs. I used a dozen eggs whisked with a cup of milk and baked at 375F for about 30 minutes. - While my eggs baked, I cooked my meats (bacon, sausage). 
- Set up an assembly line of supplies: aluminum foil squares for sandwiches, English muffin on each square (or flour tortilla for the burritos), get your cheese out and ready...- When the eggs were done, I used an upside down drinking glass about the same size as my English Muffins to cut egg circles. (Hint: the extra eggs after cutting out circles work perfectly in breakfast burritos!!)- Placed an egg circle on each muffin, added meat and cheese and muffin top- Wrapped muffin in aluminum foil- Labeled a plastic freezer bag and placed 4 individually wrapped muffins in each bag- Froze bags of muffins/burritosWhen you're ready to eat a sandwich, simply take one out and heat it in your microwave for ~2 min, or toaster oven / oven for ~15 min.Through this process, I made...8 - Bacon, Egg & Cheese English muffins 3 - Ham, Egg & Cheese Burritos7 - Sausage, Egg & Cheese burritos4 - Ham Egg and Cheese English Muffins2 - Egg and Cheese English muffins**BREAKFAST SMOOTHIES**Did you know you can make smoothies in advance and freeze them?! I did not know this, but it's true! They're good and you can make lots of them at once and clean your blender out one time vs. daily!! Woo hoo!!I made 6 smoothies and stored them in glass containers. You could make your smoothies with whatever you want, here are the ingredients I used: yogurt, milk, apple juice, kale, chia seeds, apple, banana, blueberry. The trick to freezer smoothies is to make them, pour into your storage containers and then put them in the fridge for at least 4 hours BEFORE putting in your freezer.You do need to thaw a smoothie slightly before you eat it. You can move one to the fridge from the freezer the night before or thaw in cold water if you need it sooner!I made 6 smoothies through this process.. Hot sandwiches are the perfect addition to a freezer stockpile! And they're something you can eat with one hand and are easy to eat on the go!Really you should customize these to your liking. 
I kept them very simple, just meat and cheese on a bun. We can add condiments when we heat them up.The process is very similar to the breakfast sandwiches. Spread any condiments you want onto your bun (or don't, and add these when you're ready to eat), add your meat, add your cheese and wrap the sandwich in its foil. Label the outside of the foil with the sandwich contents and also label a plastic freezer bag. I bagged my sandwiches 4 to a bag. Then freeze -that's it!When you're ready to eat, microwave for ~2 minutes, or place in the toaster oven or oven for ~12-15 minutes and you'll have a delicious hot sandwich!I followed this process to make:10 - ham & cheddar sandwiches6 - roast beef & provolone sandwiches. Stews freeze exceptionally well and really retain their flavor when reheated. I made a few different options from recipes I found online that noted they freeze well. If you have a favorite stew recipe, make a great big batch of it and divide it out into containers portioned for what your family eats in a meal; freeze it and reheat it when you're your ready for an easy no effort home cooked meal!I followed the following recipes to make my freezer meals:- Made 3 freezer packages each with 2 servings of Sunday Stew (recipe from The Pioneer Woman)http://thepioneerwoman.com/cooking/sunday-night-stew/- Made 4 freezer packages with 1 serving each of Hungarian Meatball Stew (recipe from Rachel Ray)http://www.rachaelraymag.com/Recipes/rachael-ray-magazine-recipe-search/dinner-recipes/hungarian-meatball-stew- Made 2 freezer packages with 3 servings each of Braised Beef with Sweet Potatoes (recipe from Real Simple)http://www.realsimple.com/food-recipes/browse-all-recipes/spiced-braised-beef-sweet-potatoes. It wouldn't be freezer cooking without a couple casseroles! Casseroles are classic freezer staples because they freeze and reheat really well. 
Make a couple pans of you favorite lasagna recipe or pretty much any other pasta dish and stash them away in your freezer for a low effort meal!I made 2 trays of Lasagna Primavera, each with about 8 servings. These will be perfect to pop in the oven for visitors! I used the following recipe from Martha Stewart. I haven't tried this yet, but it looked and smelled delicious while prepping!http://www.marthastewart.com/340876/freeze-ahead-lasagna-primaveraI also made 4 containers with about 3-4 servings per container of Baked Ziti, using our favorite recipe from The Pioneer Woman:http://thepioneerwoman.com/cooking/baked-ziti/. For fresh cooked food straight from your freezer, slow cooker meals are an excellent option. Basically, you package together the ingredients of your slow cooker recipe and freeze them. The night before you want to prepare your easy home cooked meal, defrost them in your fridge. In the morning dump the ingredients into your slow cooker and cook! The benefit is you have all the prep work done (ingredients gathered, vegetables and meats prepped, spices added, etc) and all you have to do is let your slow cooker do the rest of the work! These meals come out tasting like you slaved away in the kitchen all day! These meals are a great option for when you'll have visitors over or when you just want an extra special dinner!I have the following slow cooker meals prepped and ready for a fresh from the freezer meal:Crock Pot Mongolian beef (approximately 4 servings)http://whoneedsacape.com/2012/11/easy-crockpot-mongolian-beef/Honey Teriyaki Chicken (approximately 6 servings)http://www.twindragonflydesigns.com/crock-pot-freeSalsa Chicken (approximately 6 servings)http://www.twindragonflydesigns.com/crock-pot-freeCrockpot beef vegetable soup (approximately 6 servings)Spiced Braised Beef with Sweet Potatoes (approximately 6 servings)http://www.realsimple.com/food-recipes/browse-all-recipes/spiced-braised-beef-sweet-potatoes. 
I also wanted to be sure we had plenty of small portions of our family staples frozen and ready to be reheated. Think about your standby meals and how they'll hold up when frozen and reheated to customize these options with what will work best for you. I made:Chili: I made a big pot and divvied it into 6 freezer containers with about 2 servings eachMeatballs: I filled 5 freezer containers each packed with 2-3 servingsChicken Pot Pies in Pocket form (for easy eating on the go): I made 11 pocketsSloppy Joes: I made 3 containers each with 5-6 servingsSalisbury Steaks: I made 6 \"steaks\" and packaged in 3 freezer packages of 2 steaks each. Finally, don't forget to stock up on side dishes and anything else you may need to accompany your freezer meals! I added several packages of buns to my freezer supply (so we have something to put all that sloppy joe meat on). I also bought some loaves of frozen garlic bread to go with our Baked Ziti and Lasagna Primavera. In addition, I made sure we have several bags of frozen vegetables to use as sides. I also made sure our pantry supply includes enough pasta for all those yummy meatballs, and other grain choices like Quinoa, rice and couscous.. * Package your freezer meals leaving as little air as possible in the container. * Remember water expands when frozen!! This is especially important when freezing meals in glass!! So when freezing a meal in glass containers, leave head room of about 1\" in the jar before freezing, this will ensure you don't open your freezer to see a cracked glass and a meal you worked on destroyed...* Never put hot meals straight into the freezer (again, super critical when freezing in glass, and a good best practice no matter what type of container you are freezing in).* Always label your meals before freezing so you know what everything is. 
Include instructions on reheating, suggestions for side dishes or any ingredients to add when cooking or reheating.* Make double or triple whatever you're cooking for dinner and freeze the rest for an easy way to build up your freezer stockpile!!* Look for sales on meats or vegetables you eat regularly and stock up then! Use these opportunities to prepare several portions of your favorites!* Keep notes on the recipes you used. As you heat your meals and serve them, add to your notes your own rating of the meal and any adjustments you'd incorporate if you prepared that meal again. This way, over time, you'll develop your own twists on recipes and a staple supply of your own freezer meal personal favorites!I hope this Instructable gave you some ideas for how you might stock up your own freezer in advance of a big event!! It was such an easy process, this just might become my new approach to cooking all the time!!\nRead the question below and select from the following choices.\nA: How to Eat for Free Everyday (at a University)\nB: Casseroles\nC: A Little Off the Top...\nD: How to Open and Enjoy a Bottle of Wine.", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_170_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_170_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_170_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_170_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_170_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_170_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_170_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_170_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_170_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_170_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_170_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_170_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_170_12.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_170_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_170_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_170_15.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Summer Fruit Iced Tea\nB: Top Layer\nC: Peel and Cut the Mango\nD: Top Layer", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Peel and Cut the Watermelon', 'Peel and Cut the Cucumber', 'Remove Kiwi Skin and Cut the Kiwi', '@placeholder']", + "context": "Here is the context of these images:\n. You will need the following materials/ingredients:-1 2 liter bottle of Sprite- 1/2 mini watermelon- 1 large mango- 1 cucumber- 2 kiwis- Tajin chili powder- Pitcher- Knife- Cutting board- Fruit peeler- Spoon. . Place your Sprite in the freezer until right before it starts to freeze. You do not want to add ice to your drink because it will water down the flavor.. Using a fruit peeler, remove the entire peel from the cucumber and remove about 1/2 inch from each end. Cut the cucumber into several long strips. Then, cut the strips into small cubes. Place the diced cucumber into a pitcher.. The easiest way to dice the kiwis are to first cut the kiwis in half. Then, using a spoon, scoop the kiwi fruit out of it's peel. After scooping the kiwi out, dice the kiwis into equal sized pieces. Finally, place all of the diced pieces into the pitcher. . Carefully peel the skin off the mango using a fruit peeler. Dice the mango as shown on the pictures. For presentation purposes only, try to dice the mango into even pieces. Then, placed the dices mango into the pitcher. . Season the diced fruit with Tajin chili powder.. Pour the entire 2-liter bottle of Sprite into the pitcher with the diced fruit.. 
Grab a spoon and enjoy a refreshing and delicious summer treat!\nRead the question below and select from the following choices.\nA: Summer Fruit Iced Tea\nB: Top Layer\nC: Peel and Cut the Mango\nD: Top Layer", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_27.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_171_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_30.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_31.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_32.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_33.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_34.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_35.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_171_36.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Add Flour Mixture\nB: Sweet Delicious Banana Bread\nC: Final Product\nD: Easy Homeade Banana Bread", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Coat Loaf Pan', '@placeholder', 'Bake', 'Cool']", + "context": "Here is the context of these images:\n. Coat a 9-by-5-inch loaf pan with butter and dust it with flour, tapping out the excess.. Whisk together the measured flour, baking powder, salt, baking soda, and cinnamon in a large bowl to aerate and break up any lumps. Set aside.. Place the sugar, eggs, oil, and vanilla in the bowl of a stand mixer fitted with a paddle attachment and beat on medium speed until thoroughly combined, about 2 minutes.. Add the bananas and sour cream and mix until just combined.. Scrape down the sides of the bowl, add the flour mixture, and mix until just combined. Turn the batter into the prepared loaf pan.. Preheat oven to 350\u00b0F. Bake until a toothpick inserted in the center comes out clean, the top is golden brown, and the bread is pulling away from the sides of the pan, about 50-60 minutes. . Transfer to a wire rack to cool for 10 minutes. 
Slide a knife around the perimeter of the pan, invert to release the bread, and cool completely on the wire rack before serving.\nRead the question below and select from the following choices.\nA: Add Flour Mixture\nB: Sweet Delicious Banana Bread\nC: Final Product\nD: Easy Homeade Banana Bread", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_172_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_172_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_172_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_172_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_172_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_172_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_172_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_172_7.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Add Corn Syrup or Sugar\nB: Chewy Chocolate Chip and Walnut Cookies\nC: Bake the Cookies\nD: Scooping and Baking!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather Your Supplies', '@placeholder', 'Add More Ingredients', 'Bake Until Golden']", + "context": "Here is the context of these images:\n. You're going to need a little more stuff than you normally would to achieve freshly-baked chocolate chip cookie perfection. Why is that? Chewy cookies hold more moisture than crispy cookies, so therefore we need to use ingredients that hold on to more moisture than your average cookie recipe will have you use. Ingredients like corn starch and corn syrup hold on to more water when they bake, whereas more traditional ingredients bake-up crispier (you can also make cookies crispier by baking them longer and at a lower temperature.)So what's the secret to that long-lasting \"freshly-baked\" texture that so many store-bought cookies seem to have? You need to mix it up. 
You will need to make two batches of dough. One traditional batch with \"crispy\" ingredients, and one batch that substitutes the traditional ingredients for \"chewy\" ingredients. This is a trick that can be used with any recipe. The specifics of our recipe can be found on page 282 of Cooking for Geeks.Here's what you'll needButterBrown sugarWhite sugarCorn starchCorn syrupEggFlourOatmealSaltVanillaChocolate ChipsLemon juice. Using two separate bowls (one \"chewy\" and one \"crispy\"), cobble-together your ingredients. Add the butter and brown sugar per the recipe you are using to each bowl. This is where the batches will differ. Next you will add the white sugar to the traditional, \"crispy\" bowl and mix To the \"chewy\" bowl, you will add corn syrup and mix. Next, you will add the following ingredients to both batches and mix:vanilla lemon juice egg oatmeal flour salt. Add your cornstarch to the \"chewy\" batch and mix.. Add the rest of your ingredientschocolate chips walnuts (optional). To create your fresh-baked cookies that will stay as freshly-baked as they were out of the oven, here is how you put them together. Scoop a bit of \"crispy\" dough onto your parchment-lined baking tray Now take a slightly smaller scoop of \"chewy\" dough and press it into the middle of the \"crispy\" dough. Bake your cookies until they reach golden cookie perfection. Your cookies will be crispy on the outside and chewy in the center, and they should stay that way!. For the specifics on this and other scientifically-minded cookery, check out my book, Cooking for Geeks! This recipe appears on page 282 of the book. You can try it for yourself by clicking here and reading two chapters for free!If you liked this Instructable, you can like and subscribe to my YouTube channel. 
Cooking for Geeks is available on Amazon\nRead the question below and select from the following choices.\nA: Add Corn Syrup or Sugar\nB: Chewy Chocolate Chip and Walnut Cookies\nC: Bake the Cookies\nD: Scooping and Baking!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_173_15.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Vegan Rainbow Cupcakes\nB: Haunted Grave Cupcakes\nC: Add the Decorations\nD: Veganir Butterycreamy", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Bakus Cakus!', 'Colouria Batterun', '@placeholder', 'Finitus Delicious']", + "context": "Here is the context of these images:\n. 
IngredientsCake 250ml soy/vegan milk 1tsp cider vinegar 350g self raising flour 2 & 1/2 tbsp Corn flour 2 & 1/2 tbsp Icing sugar 80ml vegetable oil 180g Caster sugar Red/Blue/Green/Yellow food colouringButtercream 100g Veggie Shortening 100g Vegan butter 500g Icing sugar Any food colouring that isn't Red/Blue/Green/yellow. *WARNING* Images used may contain magical apparatusStart by pre-heating your oven to 175\u00baC/350\u00baF/gas 4 Get 250ml of soy/vegan milk and add a tsp of cider vinegar to it. Leave this to one side. In a bowl sieve in 350g of self raising flour, add 2& 1/2 tbsp of corn flour and 2 & 1/2 tbsp icing sugar. Mix together.. Add to it 180g of caster sugar and mix together. Add in your milk/vinegar mix and mix again. Pour 80ml of rapeseed or vegetable oil. You guessed it, mix again. Add 2 tsp vanilla extract or any other flavour you wish. Take out 4x50-60g of the mixture and put into their own bowls. Colour them the house colours (Red, Blue, Green & Yellow). Take an ice-cream scoop and scoop the plain mixture into your cases, just under one scoop should be enough for each case. Take a teaspoon of any colour and put into the centre of the mixture by cork-screwing the teaspoon around in the middle. This will give the core the hidden house colour.Put into your oven for 10-15 minutes or until a knife comes out clean. I like to put tinfoil on the top to stop Hornwoggles stealing my dreams. But also, it makes them rise more evenly. Take the tinfoil off for the last couple of minutes baking to let the top solidify. Once ready, take out and leave to cool on a wire rack.. Now to make your buttercream. Its vegan buttercream, so that works well with the cakes. Take 100g of vegan butter and 100g of vegetable shortening and mix this in a bowl with 500g of icing to make it a good stiff consistency. Add Vanilla extract or any other flavour- for... flavour. Add some colouring to make it a colour, any colour you want as long as it isn't one of the house colours.. 
Take 200g of fondant icing and colour it brown to match the sorting hat. You can also use the brown food colouring and a paintbrush to add some texture to the hats. To sculpt the hat, take a small piece around the size of a chick pea, and turn it into a flat disc. It doesn't have to be perfectly round, as the Sorting Hat is a wise, old hat who has seen many things. Take another piece and roll it into a ball. The size can vary, but as long as the ball sits in the middle of the disc you just made with enough room around the outside for a brim. Roll one part of the ball up so it becomes tear drop shaped. Sculpt the eyes and mouth just above half way up, and add some indents to the tear drop. Push the whole thing down gently into the middle of the disc, and fold over the pointy end of the tear drop so the hat looks like its folded over.. Spoon the buttercream into a piping bag and pipe onto the cupcakes. Place the hats on top of each cupcake. When you bite into them, you will find out which house you have been sorted into. Make a game of it, or eat them all!Let me know what houses you get. 
Watch the video for my first Muggle Baking class and another way to learn.See you soon Colin\nRead the question below and select from the following choices.\nA: Vegan Rainbow Cupcakes\nB: Haunted Grave Cupcakes\nC: Add the Decorations\nD: Veganir Butterycreamy", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_174_26.jpg" + ], + "output": "D" + }, + { + "task": 
"textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The Slices of the Lamb\nB: Coconut Curry Lamb\nC: Its Ready?\nD: Pack", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Spices and Sauce', '@placeholder', 'Tay', 'Ring the Dinner Bell']", + "context": "Here is the context of these images:\n. SpicesGinger - about a 2\" piece, skin removedGarlic - 5 cloves or more to suit youBlack Cumin - 1 teaspoon (tsp)Cumin - 1 tablespoon (TB)Coriander - 2 tspCardamom - 1/2 tspBlack Pepper - 1 tspRed Pepper - 1 tsp to 1 TB depending on how hot you want it.Turmeric - 2 tspFenugreek Leaves - 1 TBAll of these spices can be adjusted to whatever you feel like. Add different ones, leave out some, make it your own.SauceSkin the onions and cut them into quarters and toss them into a blender. Add 1 cup of yogurt and dump in all of your spices. If the mixture has trouble getting ground up in the blender just add a little water or milk to get it going. NotesSome of my spices I just left whole. I knew this particular blender would pulverize them and make a smooth blend of it all.. I used lamb cuts from the leg. I rarely cook a whole leg of lamb so I had the butchers cut the legs into 3/4\" steaks. I used about 2.5 pounds. Cut the meat up in to 1 to 2 inch pieces and toss them into your cooking vessel with a tablespoon of salt. . Add the sauce from the blender to the meat in the pot and add a cup of cream. Mix all of that together and turn on the heat. Bring it up to a simmer and then cover and cook covered on low to medium-low for 1 1/2 hours. Give it a stir occasionally to make sure nothing is sticking to the bottom.. While your meat is cooking away on the stovetop grab some red or gold potatoes (1 1/2 to 2 pounds total) and cut them into quarters. I like the smaller potatoes for this. 
Peel them if you wish, I don't.When you reach the end of the 1 1/2 hours of cooking add the potatoes and cook for another 30 to 45 minutes until the potatoes are tender.. While your curry is cooking away and making your house smell like an Indian street bazaar you should probably make basmati rice and dice up some cilantro to top your dish. Store bought naan is a lovely thing to keep around too.. Serve it up on the rice with some hot, buttered naan and sprinkle with cilantro. It doesn't get much better. Tender chunks of lamb in a thick and flavorful sauce.For those that are weird about eating lamb you can substitute whatever you like. If you use chicken though you'll want to cook the sauce separately for a while to reduce down and mix the flavors and then add your chicken later so you don't end up with chewy hockey pucks in your curry.\nRead the question below and select from the following choices.\nA: The Slices of the Lamb\nB: Coconut Curry Lamb\nC: Its Ready?\nD: Pack", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_15.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_175_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_175_20.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Ingredients\nB: Halloween Cake\nC: Yummy Ready\nD: Almost Ready", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Batter Up!', 'Prepare Your Frosting', 'Go to Town!']", + "context": "Here is the context of these images:\n. ToolsYou'll want some basic cupcake making devices for this endeavor:Cupcake panCupcake linersHand or stand mixerRubber spatulaSpoonsMeasuring spoons and cupsLag screws or marbles - something like that.\u00a0 You know, to make the skull shapes.Best Cupcake Recipe EVAROf course you can use a box mix to speed up this process, or your own stand-by recipe for perfect cupcakes.\u00a0 But if you're looking for something new, look no further.\u00a0 With a whole new technique and tangy taste, these vanilla cupcakes will blow your socks off!3 cups (360g) of flour1 cup (2 sticks / 227g) of salted butter - room temperature2 cups (110g) of sugar1 TBSP +1 tsp (20mL) baking powder4 large eggs1/2 cup (4 oz / 237mL) whole milk1/2 cup (4 oz\u00a0 / 237 mL)\u00a0of plain yogurt2 tsp. 
(10mL) vanilla extractCan't Believe It's So Good and So Simple Buttercream Frosting RecipeThat's about all I\u00a0need to say about it.\u00a0 Only four ingredients and five minutes stand between you and frosting you'll actually want to eat with a spoon (this coming from someone who scrapes off the frosting to get to the good stuff!)1 cup (2 sticks / 227 g) of unsalted butter, room temperature4 cups(1 lb / 448 g)\u00a0 of powdered sugar1/2 cup (4 oz / 237mL) of milk2 tsp (10mL) of vanilla extractOpt.\u00a0 Chocolate chipsOpt. Food ColoringTasty ToppingsThis can be left up to your imagination and will depend on what devilish devices you have planned!I used M&Ms to create eyes, nostrils and beaks, Junior Mints for skully eye sockets, mini Oreos for owly eyes and red hots for special zing where needed.\u00a0 Get creative with what you can find!. \n Let's make some cupcakes!\u00a0 Now pay attention, because this isn't going to go down like you expect.First, place cupcake liners in the pan. \u00a0 Okay, you probably expected that.Now, preheat your oven to 500oF (260oC)What??\u00a0Scooch have you lost your mind?\u00a0 See, I\u00a0know what you're thinking.\u00a0 But no, I have not lost my mind!\u00a0 This tip comes to me from a.nony.mouse via Cupcake Cones:if you want the crowned tops to be as high as possible, pre-heat the oven to 500 degrees F instead, and as soon as you put the cupcakes in there, lower the temperature to 350 and cook, it gets the crown nice and high! (it's the trick they use at bakeries on those giant cupcakes and muffins you always wished you could re-create!)Now for some mixed-up mixing!You don't need to sift the flour if you measured it properly like I\u00a0taught you!\u00a0 Stir in sugar and baking powder to combine well.\u00a0Chop up your butter into small pieces and add to the flour mixture.\u00a0Say what?\u00a0 That's right. 
Add your butter to your dry ingredients.\u00a0 Now blend, blend, blend until it resembles the texture of breadcrumbs.\u00a0In a separate bowl, blend together eggs, milk, yogurt and vanilla extract. You have a whisk, you say?\u00a0 Even better!\u00a0 I\u00a0just used my regular mixer beaters.\u00a0 Get it nice and frothy.Add this to the bowl with the dry ingredients and beat until just combined.\u00a0Try to abstain from licking the beaters.\u00a0 I can't condone eating uncooked eggs.\u00a0 But if you slip and get some in your mouth by mistake - YUM!!\u00a0 You'll notice right away what a unique tang the plain yogurt adds.\u00a0 This flavor will mellow out in the baking process, but I kind of wish it didn't!\n . \n As you know, you need bolts to make good skulls.Well, skull-shaped cupcakes.Not just bolts, I guess. Could be lag screws like I had on hand.\u00a0 Could be marbles as I've seen used before.\u00a0 You know, whatever you got that isn't going to melt or smell funny while baking.For Skull-shaped cupcakes, drop your implements into the cupcake pans on the OUTSIDE\u00a0of the cupcake papers where you want the jaws to form.\u00a0For non-skull-shaped cakelets, fill the cups 2/3 full, as is.Place cupcakes in preheated oven and immediately lower the temperatureto 350oF (180oC)Bake for 15-20 minutes, until the tops spring back when you poke 'em.. 
This technique for the most amazing buttercream recipe is going to require both your faith in me and some patience.\u00a0Can we agree to this?\u00a0 Prepare to be filled with disbelief before you are filled with awe.\u00a0 Let's go.You know by now that cutting your room-temperature butter into chunks is essential to successful baking, so I\u00a0won't even condescend to mention it to you.Add your butter (1 cup, remember?)\u00a0 to your powdered sugar (AKA confectioner's sugar, AKA icing sugar, AKA\u00a010x sugar - never really understood that last one) (4 cups / 1 lb / one whole entire box, yes).\u00a0Stir to fold the butter and sugar together.\u00a0 This is only a preventative step to keep from creating an enormous dust storm when you introduce the mixer.\u00a0 Not sure if it will work for you.\u00a0 Never does for me, but heck, give it a go and let me know how it goes.Mix on low speed until well blended and then increase speed to medium and beat for another 3 minutes.This is the point where your faith will be tested.\u00a0 At some point during minute two or three, you'll be ready to throw in the towel and crack open your can of Duncan Heinz.\u00a0 DON'T\u00a0DO\u00a0IT.\u00a0 Nothing can compare to the magic you are about to make when you just keep the mixer on for a moment more. Once these two ingredients have been magically transformed into something that actually resembles frosting instead of the dusty, sticky mess is did two seconds ago, it's time to add the rest.Add milk (1/2 c)\u00a0and vanilla (2 tsp)\u00a0and continue to blend on medium speed for one minute more. \u00a0To make chocolate frosting:Melt chocolate chips in a bowl - 14 oz to turn the full batch into chocolatey goodness, less if you're divvying it up.Once the chips have cooled, stir into prepared buttercream frosting.\u00a0 Adjust to taste!To make colored frosting:Add food coloring to suit your needs.\u00a0 I didn't really need to make a whole step about that, did I.. 
\n On your cupcakes, I\u00a0mean. Not literally.\u00a0 Unless you need to pick something up from the market to finish them.\u00a0 That's annoying when that happens, isn't it?\u00a0 You end up using lag screws in your baking just to avoid another trip to the store!For Skullcakes:Junior MInts make great, quirky eye sockets, halved M&Ms stand in for nostrils, and sliced chocolate chips define teeth.\u00a0\u00a0For Owlcakes:Halved mini-Oreos make great eyes with M&M pupils.\u00a0 M's also make adorable beaks.\u00a0 Pipe extra chocolate frosting into expressive furrows.\u00a0\u00a0\u00a0For Mummies:Red hots are great for spooky red eyes (M&Ms provide other colors), and white and chocolate buttercream frosting do the rest!\u00a0For Braaaaiiiiinnnnnns:Mix red and green food coloring into the white butercream frosting.\u00a0 Pipe zig-zag lines through the cut tip of a plastic baggie or piping bag.\u00a0 If you have it, I\u00a0imagine a flat icing tip would do wonders here.\u00a0Make sure to post pictures of your own fiendish creations.\u00a0 Enjoy!\nRead the question below and select from the following choices.\nA: Ingredients\nB: Halloween Cake\nC: Yummy Ready\nD: Almost Ready", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_11.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_176_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_176_18.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Douwe Egbert Coffee Maker Hack\nB: Drink the Micky's\nC: Drill a Pilot Hole\nD: Get Your Cat and Start Building!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Drill Holes for the Test Tubes', 'Stain or Seal', 'Make Some Coffee!']", + "context": "Here is the context of these images:\n. 3/8\u201d Iron Pipe I used 3/8\" pipes and fittings from the plumbing department at my local Home Depot. The straight pipes, T-fittings, 90 degree elbows and couplings are all standard. The nuts are standard as well, but I've been to a few Home Depots that don't have them in stock. If you have a hard time finding the nuts, you can always order them from McMasters. I used the following 3/8\" diameter pipes and fittings: 1 T-Fitting 1 90\u02daElbow 1 2\" Pipe 1 5.5\" Pipe 1 1.5\" Pipe 1 Coupling 2 LocknutsFunnel I used a 100mm glass funnel to hold the coffee filters.2x6 I used a short piece of scrap 2x6 to make the base. A piece of 2x8 or 2x10 would also work.Chemex Coffee Filters Chemex coffee filters work well. If you get ones that are too big, it's easy to cut them down with scissors.RYOBI 18 Volt Cordless DrillRYOBI 18 Volt Circular SawRYOBI Orbital Sander. I cut a 16\" long section of 2x6 to use as the base.. I drilled a 1/16\" pilot hole to serve as a guide for the larger holes I will drill to recess the pipes.. 
I used a 1 1/2\" diameter drill bit to drill a hole a little less than halfway through the 2x6. Then I centered the bit on the pre-drilled pilot hole. This hole will accommodate the lock nut and pipe end while allowing the 2x6 to sit flush on a table or shelf.. Flip the board over and drill a 3/4\" diameter hole centered on the pilot hole. This hole should go all the way through the board.. Select a drill bit that has a slightly larger diameter than the test tubes. You don\u2019t want a fit that is too tight or you might break the test tubes. I used a piece of blue painter's tape to mark 1\" from the tip of the drill bit. When I drilled the holes, I used this tape as a marker for knowing when I had drilled deep enough.. I used an orbital sander to sand the 2x6. I started with 100 grit sandpaper to round down the edges and then finished with 220 grit.. I used Danish oil to finish the 2x6. Bioshield or an acrylic finish would have also worked and would offer a bit more protection.. I used the 2 locknuts and the 1.5\" long piece of pipe to clamp on to the 2x6. The 3/8\" diameter pipe fits through the 3/4\" diameter hole and then a locknut on each side secures the pipe to the wood. The large 1 1/2\" diameter hole on the underside of the board hides the bottom locknut.. Insert the glass funnel into the top T-fitting and you're ready to add a filter, coffee grounds and hot water! I used Chemex filters, but accidentally bought the large ones, so I had to use scissors to cut them down to size.. Good luck making your own pipe coffee maker and please email or tweet photos to @benuyeda or ben@homemade-modern.com. 
For more DIY ideas and projects, visit us at HomeMade Modern.\nRead the question below and select from the following choices.\nA: Douwe Egbert Coffee Maker Hack\nB: Drink the Micky's\nC: Drill a Pilot Hole\nD: Get Your Cat and Start Building!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_177_16.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Prepping Your Meat\nB: How to Make an Arnold Palmer Mocktail\nC: Eating the Core\nD: Almost Finished...", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['The Smoker', 'The Slab', '@placeholder', 'Smoking (finally!)']", + "context": "Here is the context of these images:\n. Smoking is a unique method of cooking since it uses such low temperatures. 
Most ways of cooking meat focus on developing flavors through heavy use of the Maillard Reaction and Caramelization (don't worry too much about what that first one is it's kind of complex, but if you do want to look into it, it's actually really interesting). They do this by heating the target food to high temperatures for relatively short periods of time. These methods work great, but there's more to cooking than just slightly burning things, and you can't say you've lived until you've experienced that first hand. The key to smoking meat is maintaining a lower temperature -- usually around 200 - 300 Fahrenheit or 93 - 150 Celsius -- for long periods of time (and I do mean long). This will help to break down connective tissues in the meat (namely collagen) and render fats. The collagen is very important to meat because it becomes gelatin when it's broken down; gelatin gives meat a nice tender texture and fuller flavor. But, because of the length of time it takes and the volume of dry air that passes by the meat, it can become very dry, so the fat needs to render into the meat to keep it moist. Of course the big difference between roasting and smoking is smoke.Smoke is a combination of gasses, liquids, and very fine particles. Most smoke you see is very white and puffy; this is because of the amount of large ash particles. Ash is not something you want in your food. When you're smoking meat, you usually want to barely be able to see the smoke. This means there's a relatively low amount of ash and that the particles in the smoke are very fine. The best smokes are pale blue in color, so try and stay away from those towering pillars you might imagine coming out of the smoker. Ah, yes the smoker, well, that's a whole different issue.... 
At this point you might be saying to yourself \"Gee whiz, this whole smoking business sure does sound swell, but, shucks, I can't afford to go out and buy one of those newfangled contraptions to cook it!\" This is a common sentiment among would-be pitmasters, and it's one that has undoubtedly finished many great careers before they've even had a chance to begin. The real shame is that you don't need a smoker to smoke. All you need is a good sized grill with enough space between the grates and the bottom of the grill. If you don't have a good grill and you're on a tight budget, you can get away with just a kettle grill like a Weber. But smokers are designed with smoking in mind, so if you can afford one, I suggest you buy one.If you're in the market for a smoker, you'll find there are three main kinds: electric, gas, and wood. At the end of the day, real barbecue comes from a wood smoker, but they can be kind of fickle. I don't know much about electric smokers, but I've used gas smokers a few times when I'm visiting my father. All I can say about them is to save your money. Every time I've used a gas smoker, it can't reliably get cold enough to properly smoke meat. I'm not going to come out and say that gas smokers are all evil, I'm just going to advise you to not buy one. I'll talk a bit more about wood smokers since all that talk of gas smokers has left a bad taste in my mouth.As far as wood smokers go, you'll find there are three kinds: cabinets, horizontal drums, and vertical drums. I personally use a humble horizontal drum with an offset smoke box -- I like having the wood and the meat in different areas; it just makes my brain happy to have things organized like that -- but the vertical drum smokers are very popular. I won't go into too much detail here, because I'm sure it's not why most of you are reading this. This is a great page detailing the choices that are out there. Now that we have that out of the way, we can get down to the meat of this problem.. 
As I said in the first section, the two key elements in smoked meat are collagen and fat. These are the things you need to keep in mind when you're buying a slab of meat. You'll typically get a large slab of meat like a pork butt which is used for pulled pork. I also said in the first section that smoking and roasting are very similar, so anything labeled \"roast\" will work as well. You want to see good marbling in the meat because you need fat to keep it tender. Ribs are always a good choice, of course, but be aware of what kind of ribs you're getting. If you like to know more about ribs, you can go here and read about them, but I'm going to just breeze over them for now. All in all, just look for something that looks tasty and is pretty cheap. Barbecue isn't about expensive meats and fancy cooking, after all.I've included some pictures of good and bad choices for meats I saw while shopping at Walmart you can look at for reference. I must apologize, though; since I wasn't planning on taking the pictures and just used my phone and they are kind of hard to see. I'm going to be showing you pictures of some pork ribs as I talk more about the process of smoking, since that's what I've cooked most recently.. In the world of smoking, there's actually a lot of debate about wood. How much difference does the type of wood actually make? How big an impact does the origin play in the flavor of woods? Is it really necessary to soak wood before using it? These are just a few of the topics pitmasters argue about. I'll try and stay away from these issues, since there's a lot of very strong opinions out there and not too much evidence either way. Besides, most of these pertain only to high level competitive smokers. That being said, I still kind of need to talk about wood considering this Instructable is about smoking, so I'll try and not step on too many toes.Woods have different flavors depending on what species they are. 
Some woods, like hickory and mesquite, have very strong flavors, while others, like cherry or pecan, have pretty mild flavors. Once you start talking about pairings, people start to get opinionated. I understand that different people do things differently, so if you don't want to listen to me be my guest and skip to the next paragraph. I like the flavor of meat; there aren't too many things I like more than a nice medium-rare steak. Woods with strong flavors can drown out the nice meaty tones of beef, so I usually use milder woods on red meats and game. I'll save apple and the like for my pork since, frankly, pork doesn't have too much of it's own flavor. As far as poultry and fish are concerned, I like to play around with woods that don't get used as often like alder or acacia; there's no particular reason for this, I just don't care that much what happens as much as I do with meat. A lot of people disagree with me very strongly on this, saying that red meat is best suited for the intense flavors of hickory and the like. This is fine, but I want you to know that these people can't be trusted.Once you've picked out a wood, you're pretty much set. I do want to bring up a point, though, about soaking your wood. This is another topic people like to argue about. It's conventional wisdom to soak your wood before you use it, but there's some evidence saying that it's not a necessary step. The idea behind soaking your wood is to make it harder to ignite, releasing nasty carcinogens into your food. The thing is that soaking doesn't add all that much water to the wood, so it might not be an important step. I personally do soak my wood, since I live in the Mojave Desert where it's usually under 10% humidity and over 100 degrees Fahrenheit.If it were just about necessity I'd probably urge you to soak your wood anyways, but it might actually be beneficial to not soak your wood. If you soak your wood for, say, 12 hours, you'll see that the water has turned a brown color. 
It's not widely known what's soaked out of the wood, but some people think that certain aromatic chemicals may seep out of wood while you soak it. So give some thought to what wood you might want to use, and play around with soaking it; it's important you form your own opinions about this stuff instead of just listening to people like me on the internet.. You don't need to do much to most cuts of meat to get them ready to smoke. Certain cuts, however, like a beef tenderloin, have a tough membrane on them called silverskin. Silverskin is made of elastin, a type of tissue which is very tough and won't break down when cooked. It looks like you might imagine it would; it's a white patch of oddly metallic tissue. It's easy enough to remove, though and just takes a bit of care to not mutilate the meat. You need to get under the elastin and just run your knife through at an angle pointing into the meat (so you don't cut out of the silverskin and have to start over). Once your meat is elastin free, you need to look at the shape of the cut. The parts of the meat which are kind of skinny will cook and dry out faster than the rest of the meat, leaving you with dried up bits of jerky sticking out of your dinner.If you're dealing with pork or poultry, now's the time to prepare your brine. Brining cuts of meat helps to retain moisture and adds flavor. A brine is a salt-water solution, usually with some sugar and spices, that you soak the meat in. Salt is know far and wide for it's ability to dry out meats, so how is this supposed to make the meat more moist? The answer is a process you've probably forgotten since high school biology called Osmosis. The brine has a much higher concentration of salt than the meat, so the brine moves into the meat to balance out the concentrations. This is good because you'll wind up with more liquid in the meat than you started with. The Salt in the brine will also help break down proteins and make the meat more tender. 
This is a great brine for smoked pork. You can see from that page that brines do take a good amount of time (12 hours for that one), so plan ahead.Before you put your meat in the smoker, you'll probably want to rub down the meat. Rubs are mixtures of seasonings that get applied to the outside of meat. You typically want to use fairly strong spices for rubs since they're just on the outside of the meat. I usually just grab whatever sounds good for the cut, but if you aren't that comfortable with spices, you can find lots of great recipes online. To actually apply the rub, you'll usually want to put down a layer of sauce or mustard or maple syrup or really any kind of saucy condiment (less tender cuts like beef ribs don't necessarily need this extra layer, but it's still a good opportunity to get more flavor into the meat). This will add more flavor and keep the bark from becoming too hard (you can see on the pictures I skipped this on my last batch of ribs. That was a mistake; I could have gotten away with it on beef, but the pork ribs were just way too tender). Once you've worked the sauce into the meat, you need to sprinkle on a generous amount of rub. Now, the name rub can be deceiving, because you don't want to actually rub the rub. You want to massage the rub into the meat. If you rub the rub, it will ball up in the sauce and you won't get good coverage with the rub. This will need to rest for a little bit so let's go get the smoker going.. Even though we aren't going to cook over one, you're going to need to start a fire in your smoker. You can start it however you like, but I'd suggest not using lighter fluid because it tastes disgusting (don't ask how I know that). Personally, I put down a layer of newspaper, a layer of kindling, and a layer of charcoal. Once you get the newspaper going, each layer lights the one above it. 
Charcoal chimneys are very nice pieces of equipment and they usually don't cost too much; in fact, you can probably find some here on Instructables for free. I don't use one just because of my smoker set up; I'm not comfortable needing to reach into my side-box, since I've lost a lot of arm hair doing that. Once your fire is started, wait it out until you're just above your target temperature.Once you're nearly at the temp you want (usually 210-230 for pork, 210-240 for beef, 225-250 for poultry, and 150ish for fish) , you can add some wood chunks. Just put them in there on the embers so they can start smoldering. We aren't waiting for it to cool down all the way to the cooking temp because we want the wood to get going before we start cooking, and your temperature can do different things when you add wood (it depends on the wood's structure and whether you've soaked it and for how long). You can go fetch your meat as soon as the temperature is right and arrange it on the grate so that each piece will get a good coverage of smoke. You may see some recipes out there that say to wrap the meat in foil part of the way through the cooking. This is called the 'Texas Crutch' and it sacrifices a lot of flavor for a relatively small amount of moisture. I don't suggest doing this; you can get the same moisture out of a cut of meat through proper preparation (brines are great) and still get the flavor you lose with the crutch.Now, we wait. You'll want to keep an eye on the smoke. If it looks white and fluffy, you have a problem, probably a fire. If it looks grey, you probably have a fire. If it looks black, you probably have a fire. Really, if you can see it without straining, you want to check it. Remember that the best smoke is barely visible and a light blue-grey. Of course, you also need to monitor temperature; most smokers have the little spinning vents you can close to limit oxygen if it gets too hot. 
This is really the test of a good pitmaster; you need to have the patience to just wait for hours on end and not take your eye off the smoker. Good luck.. Well, I think that's just about all I have to say, so enjoy your meal and take pride in the fact that you've created it out of a piece of muscle and a chunk of a tree. If you want to know anything, post a comment and I'll try and respond to you quickly; I can't make any promises, though, I'm going to be awfully busy coming up here. Good luck, and Good cooking. \nRead the question below and select from the following choices.\nA: Prepping Your Meat\nB: How to Make an Arnold Palmer Mocktail\nC: Eating the Core\nD: Almost Finished...", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_178_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_178_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_178_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_178_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_178_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_178_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_178_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_178_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_178_8.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Changing Martini\nB: Add Ice Cream and Milk to Bullet or Blender\nC: \u200bIn Skillet\nD: Pour Your Liquid Smiles Into Your Prepared Glass", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Add Vermouth to Glass', 'Fill the Shaker With Goodness', '@placeholder', 'Enjoy Resonsibly']", + "context": "Here is the context of these images:\n. -A good quality Gin. If you put it in the freezer and it becomes a block, then it is not good quality.\nI use Seagrams gin. There is much better stuff, but it is not bad on a budget.\n-Vermouth. 
Get a nice dry vermouth. Gallo works but it is all I can really find easily. There are better options, but lets not get all snooty over ingredients.\n-A nice clean, simple martini glass. Leave the bendy, bubbly, goofy and eccentric ones in the cabinet. Those are for the goofy new fangled martinis that girls like. We are men here, we don't want no stinkin' chocolate in our martini, let alone vodka. (don't get me started)\n-Shaker. Use what you have. Not everybody has 15 shakers in their house like me. . I use the plain ole big ice cubes you make in your freezer. Add them to a martini glass. If it is already cold it is a bonus.. Pour the vermouth over the ice cubes. Just use a splash. Maybe a half fingers worth or less.. Fill your shaker with ice. Then pour your clear deliciousness into the shaker. \"How much gin do I use? It really depends on your shaker size and how much you plan on drinking. I am using a medium sized shaker full of ice and I filled it maybe halfway.. Shake shake shake.\nI wrap a towel around the shaker since I will be shaking it for about 2 minutes. The metal gets really cold and frosty and will get you from shaking it long enough. Especially if your fingers go numb and fall off.\nRemember kids, protect your fingers from cold stuff when making dad a martini.. Take that vermouth in the ice cubed glass....\ndump it out.\nyes....dump it out.\nSpin the glass a bit so the vermouth kind of coats the glass, almost like a vermouthy glaze.\nGive it a few flicks so there is very little vermouth at all in the glass at all.\nThe less vermouth, the dryer the martini. Some people actually don't use vermouth at all. Most bars will give you about 20 times too much.\nExperiment with quantities to find your pleasure zone.. Unpop your shaker top and pour that liquid happiness into your freshly made glass. I like it when there are little tiny shards of ice suspended in the drink. You mouth never feels them, but they look cool in the glass for about a minute.. 
Now, try out your new found Martini making knowledge and make a few drinks. Be sure to bring a few people who have problems with keeping their clothes on to taste test for you, and you will assuredly have a good time.\nRead the question below and select from the following choices.\nA: Changing Martini\nB: Add Ice Cream and Milk to Bullet or Blender\nC: \u200bIn Skillet\nD: Pour Your Liquid Smiles Into Your Prepared Glass", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_179_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_179_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_179_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_179_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_179_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_179_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_179_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_179_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_179_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_179_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_179_10.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Chop Chop Chop N Grate !!\nB: Plum Tarts for a Wannabe Fancy Pants\nC: Ahh, the Romance.\nD: Ingredients", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients & Supplies.', \"Let's Start With the Curd !!\", '@placeholder', 'Mix the Veggies, Barbeque It Then Go NOM Nom NOM Nom !']", + "context": "Here is the context of these images:\n. Ingredients: 1) 1/2 kg Yogurt 2) 1 tablespoon Garlic 3) 1 tablespoon Ginger 4) 3/4th tablespoon Green Chilies 5) Half bowl chopped Mint leaves 6) Salt to taste 7) 1/2 teaspoon Cumin Powder 8) 1/4 teaspoon Garam Masala ( It is readily available in market ) 9) 1/4 teaspoon Dhaniya-jeera powder (Coriander - Cumin Spice Blend .... 
Now we make this at home but it is also readily available) 10)1/4 teaspoon Chili powder 11) 1/4 teaspoon Rock salt Supplies: For the Yoghurt: 1) Muslin cloth preferred (but you can use others) For the Rest: 1) Knife 2) Kitchen Grater/ shredder 3) a bowl to mix all ingredients 4) a big spoon ( to help mixing) This makes approx. 7 -8 skewers. Also, you can Increase or decrease the no. of spices as per your taste.. This is a very simple yet an important step. Pour the yogurt on the muslin cloth and tie it up like shown. Then let it hang for an hour or two. Usually, we hang it on the sink tap. This process allows the water in the yoghurt to drain out, leading to a nice solid yoghurt mass. The importance of letting the water drain is so that the final mixture sticks to all the vegetables nicely, which would otherwise lead to a liquidy marination causing it to drip during the BBQ process.. I like to keep all my ingredients ready before i start the mixing process and that's y... Finely chop the Mint leaves & Green Chilies. Finely grate Ginger & Garlic cloves.. Place the curd/yoghurt into the bowl and mix it with a spoon. Then add all the ingredients to it and mix it again. Your marinate is ready !!. Mix all the vegetables in the marinate and refrigerate it for a couple of hours. This step, helps in enhancing the flavor a lot. I used onions, Green chili peppers, Mushrooms And paneer(cottage cheese). You can add potatoes too, but i do not prefer them. Put the veggies, in any order u like, on the skewers. We have a small grill, so i keep them short. Now go ahead and BBQ them!! Do not forget to let me know how it turned out !! Feel free to make any changes to the recipe. As i was saying, you can add chicken pieces to it too. 
Serve it with a soda/ beer or orange juice..Happiii Barbecuing !!\nRead the question below and select from the following choices.\nA: Chop Chop Chop N Grate !!\nB: Plum Tarts for a Wannabe Fancy Pants\nC: Ahh, the Romance.\nD: Ingredients", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_180_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_180_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_180_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_180_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_180_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_180_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_180_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_180_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_180_8.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: How to Make Rice\nB: Filling Stuffing\nC: Preparation\nD: Making Carrot Parantha", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Preparation', '@placeholder', 'Cook Rice', 'Mix Cooked Rice With Ingredients in the Pan']", + "context": "Here is the context of these images:\n. One raw Mango2 Medium sized Onions5 to 6 Garlic clovesOne inch long piece of Ginger2 to 3 Green chillies2 TomatoesBunch of Mint leavesCoriander leavesHandful of Curry leavesOne teaspoon of Mustard seeds mixed with split black gram lentilsOne teaspoon of Turmeric powderHalf a teaspoon of Asafoetida powder2 to 3 Dried Red chilliesOne teaspoon of Cumin seeds2 cups of Rice2 Tablespoons of Cooking OilSpices in small quantities (Optional)Star AniseClovesCinnamonCardamon pods. Cut both ends of raw mango and peel off the skinGrate the mango and take about one cup of it. 
Finely chop the onionsDice the tomatoes into small piecesShuck Garlic and remove ginger skinMake Ginger Garlic paste using a mixer grinderSplit Green Chillies in halves. Heat a frying pan over medium flame and add 2 teaspoons of cooking oilAdd mustard seeds and cumin seeds to the oilBreak the spices like cloves, star anise, cinnamon and cardamom pods into pieces and add to the oilBreak the dried Red chillies into pieces and add to the oilAdd handful of curry leaves and split green chillies to oil and saute for few seconds. Add chopped onions and Ginger-Garlic paste to the pan and stir fry till raw smell disappearsAdd handful of mint leaves and mix well. Once the raw smell from onions and ginger-garlic paste disappears, add the chopped tomatoes to the panCook till water content evaporates from the mix and oil starts oozing out at sidesNow add the grated raw mango to the pan and mix wellYou can add little amount of salt to the mix. Most people add salt while cooking rice also, so take care not to add too much of salt to the mix. In the mean time, you can also cook the rice in a pressure cooker and keep aside till the pressure drops to normal.Here, the rice should be slightly under-cooked, other wise it will make a paste-like mix with the ingredients in the pan.. Take 2 cups of cooked rice and add to the ingredients in the frying pan.Mix everything together over low flameOnce it is properly mixed, transfer to a serving bowlGarnish with coriander leaves and serve hotNo side dishes are required with the Mango rice. However you can use potato chips or fried crispies with it. 
This dish is also good for taking along during outings / picnic with children\nRead the question below and select from the following choices.\nA: How to Make Rice\nB: Filling Stuffing\nC: Preparation\nD: Making Carrot Parantha", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_27.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_181_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_30.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_31.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_32.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_33.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_34.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_35.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_36.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_181_37.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Rolly Pollies\nB: Topping and Finishing the Basketballizza\nC: Croque Madame\nD: Fried Eggs", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Supplies and Ingredients', 'Hamburger and Rice', 'Brown Gravy', '@placeholder']", + "context": "Here is the context of these images:\n. For this recipe you will need the following:4 cups cooked rice (I cooked my rice in beef broth, ground thyme, 1 tablespoon margarine, 1 tsp salt & 1 tsp minced onion) 1 pound lean ground hamburger, 4 large eggs, brown gravy mix.. Cook the rice according to package directions. While the rice is cooking, divide the hamburger meat into 4 equal parts, form into patties and cook in a frying pan over medium heat until done. 3 or 4 minutes on each side, until no longer pink in the center.. Empty the brown gravy mix into a skillet and add water. cook according to package directions, continue stirring with a whisk until thickened.. Place 2 Tablespoons vegetable oil in a skillet, cook eggs over-easy. Place 1 cup of cooked rice on each plate. Top with a hamburger patty and gravy. Carefully place an egg on top of this and there you go! Loco Moco. Delicious and filling. 
Serves 4.Enjoy!\nRead the question below and select from the following choices.\nA: Rolly Pollies\nB: Topping and Finishing the Basketballizza\nC: Croque Madame\nD: Fried Eggs", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_182_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_182_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_182_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_182_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_182_4.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Preparing the Mixture to Freeze\nB: Sangria Ice Cream\nC: Homemade Ice Cream\nD: Mix an Freeze", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Mixing the Main Ingredient', 'Adding the Second Ingredient', '@placeholder', 'After 12 Hours']", + "context": "Here is the context of these images:\n. List of ingredients:1 can of sweetened condensed milk of 396 g.1 pint of Heavy Whipping Cream (473 ml or 16 FL OZ)1 can of HERSEHEY'S COCOA 100% CACAO & Natural Unsweetened of 8 OZ. Take a plastic container of 1 gallon more or less like the blue one of the photo.. Add the Heavy Whipping Cream inside the blue container.. Once the Heavy Whipping Cream has been mixed, you will have a soft cream.. Now, you should add the can of condensed milk and mix it with the cream done.. Add 4-Tbsp of HERSHEY'S COCOA mixing it with the rest of the ingredients.. Take a refractory glass so that you can pour the mixture made previously. Then get plastic wrapping to cover it and so you can freeze during 12 hours. . 
After 12 hours, you will have a delicious homemade chocolate ice cream.\nRead the question below and select from the following choices.\nA: Preparing the Mixture to Freeze\nB: Sangria Ice Cream\nC: Homemade Ice Cream\nD: Mix an Freeze", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_183_27.jpg" + ], + 
"output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Quickie Homemade Mayonnaise\nB: Flavorize It...\nC: You Have Mayonnaise!\nD: Top It Off!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Your Ingredients', 'Tools You Need', 'Add Sunflower Oil While Mixing', '@placeholder']", + "context": "Here is the context of these images:\n. These are the ingredients you need:1/2 cup of soy milk (100 ml) 1 cup of sunflower oil (250 ml) 1 tablespoon of apple cider vinegar 1 teaspoon of mustard A pinch of salt & some pepper to tastePlease make sure you buy unsweetened soy milk. I like my soy milk made solely from water & soy beans, without any additives.Or you can make your own soy milk at home! I favored this great instructable from jen7714https://www.instructables.com/id/How-to-Make-Homema...You can replace the sunflower oil by any vegetable oil, but I believe the sunflower oil gives the best results. Some people like to use extra vierge olive oil.. Grab your hand-held blender! Never tried it with an upright blender, but it should work just fine as well.. Pour the soy milk + mustard + apple cider vinegar + salt + pepper in a jug.Blend for 3 seconds, just to get everything mixed.. I like to pour the sunflower oil slooowly into the mixture while blending...But that is because I like some kitchen drama. If you add the oil all in once, it still works. Just blend and watch the magic happen!It takes about 30 seconds to get your mayonnaise.. All done! Stored in a jar, it keeps up to 2 weeks in your fridge. Top tip: add some grated garlic, spring onion, rosemary or curry to your mayonnaise. Or my favorite, fresh dill!And guys, there is a Superbowl this Sunday, so I heard? Perfect, try your snacks with this mayonnaise. You won't be disappointed! 
Enjoy!\nRead the question below and select from the following choices.\nA: Quickie Homemade Mayonnaise\nB: Flavorize It...\nC: You Have Mayonnaise!\nD: Top It Off!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_184_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_184_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_184_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_184_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_184_4.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Green Beans\nB: BEEF ENCHILADAS\nC: Defrost/shred the Potatoes\nD: Preheat and Combine!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Hamburger Meat', '@placeholder', 'Stewed Tomatoes', 'Shredded Cheese']", + "context": "Here is the context of these images:\n. one pound hamburger. Place hamburger into a pot with 1/3 cup of water, season with salt and pepper. Once the hamburger is browned and broken up.Add 1/3 cup diced onions 1/4 cup diced celery and continue cooking until done. While the hamburger is being browned In a separate pot boil potatoes, to make mashed potatoes. place the cooked hamburger into an oven proof baking dishtop with canned french style green beans that have been drained drain and mash the potatoes in a separate pan. now top with Stewed Tomatoesadd salt and pepper to taste. layer the mashed potatoes nextand cover with 1 cup shredded cheese. 
all that is left to do is heat at 350 degrees for 30 minutes, just to heat it through and melt the cheese Enjoy your one dish beef casserole\nRead the question below and select from the following choices.\nA: Green Beans\nB: BEEF ENCHILADAS\nC: Defrost/shred the Potatoes\nD: Preheat and Combine!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_185_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_185_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_185_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_185_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_185_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_185_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_185_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_185_7.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Cheesy Jalapeno Bread\nB: Ingredients\nC: Make Dough Balls\nD: Melt the Garlic Butter and Brush It on the Top of the Bread.", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Cut and Grate', '@placeholder', 'Add Fixing', 'Bake']", + "context": "Here is the context of these images:\n. Get the following ingredients ready to use:Whole wheat flourAll purpose white flourActive dry yeastTable saltMozzarella cheeseFresh herbs (I used parsley, dill, cilantro, basil and chives) . 300grams water (about 1 1/4 cups)1 tsp salt1/2 tsp yeast100grams whole wheat flour (about 3/4 cup)300grams all-purpose white flour (about 2 1/2 cups)Mix together, leave it to rest for 3 hours and then mix again. Leave it again for another 2 hours and then sprinkle with about a tablespoon of flour.Remove from the bowl and knead on a clean surface to work out the bubbles. . Chop up the herbs into fine bits. Grate 100 grams of cheese. . 
Weigh out about 50 grams of dough.Sprinkle your surface with a bit of four and roll each bun dough blog into a flat blob. . Add a small amount of cheese and herbs to the flat dough blob. Roll up the blob ensuring the end sticks together so it doesn't unroll. . Arrange the buns on parchment paper and cover with a tinfoil lid. Note/tip: the tinfoil top can easily be made using a large round bowl. Let the buns rise for 15 minutes before baking at 222C (430F) for 22 minutes. Remove the foil cap and broil at 233C (450F) for 5 minutes to make the top golden. Let cool, tear the buns apart and enjoy. \nRead the question below and select from the following choices.\nA: Cheesy Jalapeno Bread\nB: Ingredients\nC: Make Dough Balls\nD: Melt the Garlic Butter and Brush It on the Top of the Bread.", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_186_16.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + 
"options": "A: Cheesecake Jello\nB: Get a Good Ginger Grater\nC: Marinate With Brown Sugar and Garlic\nD: Cool", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Milk', 'Sweetener', 'Unflavored Gelatin', '@placeholder']", + "context": "Here is the context of these images:\n. Boil the milk medium heat. Turn off heat and add two tablespoons of sweetener stevia. Remember that this substitute has the feature to sweeten twice.. Add the warm water to hydrate the unflavored gelatin until a paste, then pour in the milk and stir until dissolved.. Finally, we must add the teaspoon of vanilla extract and the dye color of your choice and pour into glasses or moldsNote: It is important to use vanilla extract, not confused with the essence because if it is used not desired to be achieved vanilla. You can also change the extract by any flavor of your choice.. It should be refrigerated about 4 hours.Ready to eat!\nRead the question below and select from the following choices.\nA: Cheesecake Jello\nB: Get a Good Ginger Grater\nC: Marinate With Brown Sugar and Garlic\nD: Cool", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_187_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_187_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_187_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_187_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_187_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_187_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_187_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_187_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_187_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_187_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_187_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_187_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_187_12.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + 
"visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Chicken Divan Soup\nB: Add Chicken Broth, Bring to a Boil, Add Rice\nC: The Shopping List\nD: Chicken Soup Bones", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Physical Benefits of Chicken Soup', 'Cooking the Soup', '@placeholder', \"Sunshiine's Final Thoughts\"]", + "context": "Here is the context of these images:\n. Homemade soup served by the hands of a loving wife or mother has been used as a home remedy with remarkable results. It\u00a0soothes\u00a0the throat and warms the body. \u00a0Canning large batches will serve you well when a family member comes home sick. \u00a0. \u00a0How many of you have ever tasted home canned food? The truth is home canned foods \u00a0have more nutritional value than store bought foods. You control what goes into the foods you can. It is very beneficial to can what you grow yourself, \u00a0because most farmers use harmful chemicals on their fields. If you can't grow it yourself consider buying produce that is grown organically. The flavor of home grown and canned produce is amazing!\u00a0 I grew up in a time when many people were still growing and canning their own produce. I know what a real dill pickle taste like and what you buy in the stores today don't even come close!\u00a0 Canning\u00a0 takes \u00a0time but if your time is limited consider growing your own garden and freezing what you grow. The benefits are worth the extra effort.\nIn this guide I have canned Grannie's soup recipe the lazy way. I canned the soup but have frozen it instead of using the pressure canner or pressure cooker method. This is an inexpensive way to get started and see if it is something you might be interested in doing. From there you will gain confidence and may decide to go for the real deal. I personally have canned fruits and jellies but have never attempted canning meats. 
Canning some foods require education because of the dangers involved if you don't do it properly.. \n\tThis is what you will need to make the soup:\n\t1 Boiled whole chicken adding only salt when cooking it.\n\tSave all the chicken broth.\n\tRemove the meat using a strainer if you have one, save and freeze the skins and bones if you have dogs or cats. I will show what to do with them later.\u00a0\n\tCut chicken in small bite size pieces.\n\t1 cup peeled chopped carrots.\u00a0\n\t1 Cup chopped celery.\n\t1 Cup chopped onion.\n\t1 Chopped jalapeno.\n\t4 garlic cloves.\n\t1 Lemon juiced. This is to add to the soup after it is cooked.\n\t2 Cups of fresh chopped tomatoes.\n\t1 Cup chives I used the tops of 6 green onions because I did not have chives.\n\t2 Chicken Bouillon cubes.\n\tI used curly noodles but you can add egg noodles as well.\u00a0\n\tThe secret to this recipe is use as many green spices as you can. I use what I have on hand.\u00a0You can add just about any kind of vegetable to this recipe and receive benefits from it.\u00a0 This is the recipe we have used for a very long time.\u00a0 I often use what ever I have at the time.\u00a0 Nothing is in stone.\u00a0 You can add parsnips, sweet potato and turnips for even better results.\u00a0 I did not have any on hand.\u00a0\n\tSpices:\u00a0 I adjusted my recipe for a larger group of taste buds.\u00a0 I like mine more seasoned and with more pepper.\u00a0Taste it after you add everything and adjust it for your taste buds.\u00a0 The more spices the better it works.\u00a0\n\t1/8Th Teaspoon of each of the following as desired:\n\tBasil\n\tParsley\n\tOregano\n\tPaprika\n\tChili Powder\n\tBay Leaves\n\tSage\n\tCumin\n\tRed pepper\n\tCilantro\n\tItalian seasoning\n\tDill weed\n\tCinnamon\n\tNutmeg\n\tSea salt\n\tPepper if desired\n\tYou may omit the peppers if your family is sensitive to it. 
Peppers help clean out the sinuses.\n\tUtensils:\n\t1 Large stock pot\n\t1 Large spoon\n\t1 Medium funnel with large opening\n\t1 Sharp knife\n\t1 Cutting board\n\tMixing bowls\n\tFood strainer if you have one.\n\tClean canning jars or heavy jars and lids with wide mouths. If this is your first time freezing in a jar just can/freeze a few to get the feel of it.\u00a0\n\tPlastic bags the number of jars you will be freezing.\n\tPlease note:\u00a0 If you are a\u00a0vegetarian you may substitute the chicken broth for a vegetarian broth and add rice and beans to make a complete protein.\u00a0\n\t\u00a0. Place the broth in the stock pot or cook it in a crock pot.\u00a0\nAdd all the spices.\nAdd the chicken.\nAdd all the vegetables reserving\u00a01 cup of\u00a0the tomatoes and a few green onion tops or chives for garnish.\nStir well.\nTurn on the burner and cook until the carrots are done but not over cooked.\nAdd the lemon juice to the cooked mixture.. Add the remaining tomatoes and chives to the jars.\nDo not fill the jars above the neck line. Leave at least 1 inch at the top for small jars and 2 inches for larger jars to allow for expansion. If you don't allow enough the jars could break. As it turned out my jars did not expand that much but it is best to be safe than sorry.\nLadle the soup into the jars.\nAllow to cool completely to ovoid breakage.\nWhen they are cooled completely carefully place them in the freezer with the lids off!\u00a0 As a safety measure: Place the jars into the plastic bags to prevent any glass from getting on other foods if the jar breaks.\nAfter they are completely frozen place the lids on the jars and screw down the lids.\nPut back in the freezer. 
There is no need to place them back into the plastic bags because they are frozen and there is no danger in them breaking.\nThat is all there is to it!\nWhen you thaw out the soup allow it to thaw in a bowl with cool water if you will be around to start cooking it when it is thawed.\u00a0 I personally feel safer defrosting it in the fridge. Avoid rapid thawing to prevent breakage.. I\u00a0promised\u00a0that I would add the link to my chicken soup bones recipe. \u00a0I made a completely different tutorial about how to cook the chicken bones to feed you dog/cat. \u00a0I had been visiting my sister and she was feeding her dogs chicken bones. \u00a0I never knew you could actually safely give them dog bones and they are very good for them. This tutorial also gives tips on how to potty train your dog and useful grooming tips on\u00a0\u00a0friendly products. Step 4 is about the dog food. \u00a0 \u00a0Here is the link on how to safely do that: \u00a0https://www.instructables.com/id/Potty-Training-Grooming-Nutrition-And-Choosing-/. I have pictures here of ways you can package the soup for gift ideas. You can begin to make the soup now and avoid that last minute holiday rush. It is important to place a large note on the package and tell them that the jar must be placed in the freezer or fridge asap or eaten within a few days. I know this is a repeat but it is very important and you would sure hate to find out that someone got sick on the soup you canned. The jars are not sealed so they need to be frozen until they will be used. Do not let them sit on the counter all day because bacteria can make you very ill. Thaw them in a bowl of cool water if you are going to be around to check on it often. Otherwise thaw in the fridge. Cook frozen soup as soon as you can remove it safely from the jar.\nFor a care package\u00a0 just add stuff one would take for a cold along with the soup. You can add a little or add a lot. 
You could make a family package because a lot of times everyone in the family gets sick. You can make the soup in a crock pot and take the entire pot to a sick family. Many different options you could do for this type of gift. Add bath salts recipe here: https://www.instructables.com/id/How-To-Make-Bath-Bombs/\u00a0\u00a0\u00a0 Lip balm: https://www.instructables.com/id/Delicious-Chocolate-Chapstick-Honey-Balm/, \u00a0candle, cough drops how to here:\u00a0https://www.instructables.com/id/Cough-Drops/ , Vapor rub\u00a0\u00a0, Orange juice, Vitamin C, Tea, Get well rock, Throat spray, or footie's just to name a few.\nThere are people who have concerns of storing foods in plastic containers or bags and this is a good alternative for them.\u00a0 You can use plastic to store them in and that is an option you might consider.\u00a0 This is a great way to get you comfortable putting up your own food.\u00a0To freeze broth simply place the broth in the fridge until the fat settles to the top.\u00a0 Skim off the fat and pour\u00a0the broth into a freezer bag and work out the air.\u00a0 Lay flat single layered on the freezer shelf.\u00a0 After it is completely frozen you may stack it to make more room in the freezer.\u00a0\nI am currently working on an Instructable using chicken bones for cat/dog treats. 
\u00a0When it is finished I will add a link here.\u00a0\nThank you for stopping by and have a super day!\nRead the question below and select from the following choices.\nA: Chicken Divan Soup\nB: Add Chicken Broth, Bring to a Boil, Add Rice\nC: The Shopping List\nD: Chicken Soup Bones", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_26.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_188_27.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_188_29.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Framing the Rose!\nB: Watermelon Shark\nC: Recipe for You to Print\nD: Watermelon Sherbet Watermelon", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Clean It Up!', 'Outer Petals Pt. 1', '@placeholder', 'Finishing Touches!']", + "context": "Here is the context of these images:\n. You'll need: One watermelon Paring knife Medium circle cookie cutter. Start by using your paring knife to trace a circle on the side of the watermelon you want to carve. Once you have cut out a circle template to follow, use the paring knife to peel away the green part of the watermelon rind, leaving the white fleshy part exposed.Note: Save some of the bigger pieces of green that you peel off. Those will be needed down the line!. After peeling away the green bits, you'll see that there are darker parts of the rind that you might have missed due to uneven cutting. No problem, just grab the paring knife and clean it up by peeling away as much of the greenish coloring as possible until only white rind is showing. It doesn't have to be perfect though so don't spend hours on it or you'll end up carving away all the white part and hit the fruit instead!. So remember those green parts I had you save! Grab those and cut out a few leaves. (I made 3 total for mine) Then carve out some detail and put those aside again for later. At this point you can dispose of all the rest of the unwanted rind. . To start the rose, grab your circle cookie cutter and press it gently into the watermelon, no more than 1/3 of the cookie cutter in. You aren't cutting out shapes, just making an outline to follow. 
After you cut the circle, use the paring knife to cut a ring around the circle and carve out some of the watermelon around it, making it appear more 3D.. Take your paring knife and cut a small sliver off the circle in the shape of a crescent. Make this first cut straight down. Next cut a sliver in front of the one you previously cut, but make this one at an angle towards the first cut.\u00a0 Pull the second sliver you made out and you have your first petal made. Repeat this process around the circle for your first row of petals. For a better understanding, take a look at the pictures posted that shows this process.. For the second row, you are going to continue cutting a thin crescent shaped sliver straight down, then cutting a second thin crescent sliver parallel to in at an angle towards your first cut sliver. Pull the second angled sliver out to complete the petal. Repeat this process around the circle to complete your second row. Again, for a better understanding, refer to the photos I posted. . Keep using the sliver cutting method further into the circle to create more rows of petals, getting smaller with each row closer to the center. . When you get to the point of just a tiny bit of uncut circle in the middle. Use your paring knife to cut a hole in the middle of the last bit of the center. The center of your rose is now finished, time to move on to the outer layers!. Unlike the inner layers, all cuts will be angled in this stage. The first cut you make should be angled at about 45 degrees. Make a wavy/curvy petal shaped cut. Then behind that first cut ,at about a 20 degree angle, slice a rounded cut encircling your first cut. Pull this sliver out to make the first cut you made pop out from the melon. This sounds more complicated than it really is, for a better understanding see the photos.. Continue making wavy/curvy petal cuts and cutting away part of the melon behind them to make them appear 3D. 
The further from the center you get, the bigger the cuts and petals should become. See photos for reference!. \n Go as far out as you would like, making the rose as big as you want. You can have it take up the whole melon, or part of it so you have multiple roses on one melon. In the photos you can see where I stopped, this is how big I wanted this rose. I included a side perspective so you can see the angles and such. . Once you have completed the rose by making as many petals as you want. Finish it by framing it. Like the sunflowers, cut a large circle around the flower at an angle and pull out the bit of melon. This will make the rose look like it is popping out from the watermelon even more than it already does. You can repeat this process to make more roses on your melon, or maybe mix it up with sunflowers and other things. Your choice! For this example I made 2 roses on my watermelon.. Now that you have your roses all carved up, grab the leaves you made from the watermelon rind earlier and place them where you would like on the roses. Just slide them into the gaps. If you need to, use your paring knife to deepen the gaps so the leaves don't fall out and you are finished! Show off to friends and family, then slice it up and serve! Also included an additional photo of a recent melon I carved that does include using the roses and my first attempt at flames. 
\nRead the question below and select from the following choices.\nA: Framing the Rose!\nB: Watermelon Shark\nC: Recipe for You to Print\nD: Watermelon Sherbet Watermelon", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_27.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_28.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_189_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_30.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_31.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_32.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_33.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_34.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_35.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_36.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_37.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_38.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_39.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_40.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_41.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_42.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_43.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_44.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_45.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_46.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_47.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_48.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_49.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_50.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_51.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_52.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_53.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_54.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_55.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_56.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_57.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_58.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_59.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_189_60.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_189_61.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Bake It\nB: How to Make Homemade Bacon\nC: Important Information on Canning\nD: Getting Started", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Boil 1 Lb. of Dried Shell Pasta', 'Fry 10 Pieces of Thick Cut Hardwood Smoked Bacon', \"Assemble Bacon Mac N' Cheese\", '@placeholder']", + "context": "Here is the context of these images:\n. You will need to gather the following ingredients:1 lb dried shell pasta4.5 cups of whole milk5 tbsp butter4 tbsp all-purpose flour8 oz. of New York Sharp Cheddar cheese, shredded7 oz. Mozzarella cheese, shredded8 oz. of Monterrey Jack cheese, shredded1/2 tsp rosemary1/2 tsp thyme2 tbsp basil2 tbsp of fresh green onion, chopped2 cloves fresh garlic, minced nutmegcayenne pepper saltground black pepper 10 slices of thick cut hardwood smoked bacon Seasoned breadcrumbs Caution:Do not use pre-shredded cheese for this meal. Pre-shredded cheese is covered in an anti-caking powder that will cause the cheese not to melt well. This will cause the cheese sauce to have a weird consistency.Tip: I recommend gathering, chopping, and shredding all ingredients prior to starting.Other materials or objects you will need are:A sharp chefs knifeA cutting boardA whiskA spoonA measuring utensils including a measuring cup, 1/2 teaspoon, and 1/2 tablespoonA large potA large panA 13\"X9\" glass baking panA cheese graterA noodle strainerPam cooking sprayPaper towelsA good pair of oven mitts. Fill a large pot with water.Put a pinch of salt in the water.Bring the water to a boil.Insert pasta into the boiling water.Strain the noodles once they are soft and tender.Warning: Boiling water can cause severe burns. 
Pour your noodles into the strainer with care.Rinse the strained noodles under cold tap water to stop them from cooking any more.Caution: Not running cold tap water might cause the noodles to over cook and become mushy.Set the noodles aside until later.Tip: Save a cup of the starch water you used to cook your pasta in. This may or may not be used later.. Heat up a large pan on medium high.Fry bacon in the pan flipping occasionally until semi crispy.Place the bacon onto of a paper towel.Blot the bacon with another paper towel to remove the grease.Caution: Not blotting the grease away will affect the consistency of the cheese sauce.Cut the bacon into thinly sliced strips.Warning: Fried bacon will be hot. Wait for the bacon to cool before handling.Set the bacon aside until later.. Heat up the same pot you used to boil the noodles on medium heat.Preheat your oven to 400 degrees Fahrenheit.Melt 5 tbsp of butter until it starts to bubble or simmer.Add 4 tbsp of all purpose flower.Whisk vigorously until a smooth even texture is achieved.Note: for steps 6-21 continue whisking vigorously to achieve and smooth even texture.Caution: Not whisking vigorously for steps 6-21 may case flower to sink, clump, and burn at the bottom of the pot.Let the mixture cook for 1-2 minutes to get rid of the raw flour taste.Add 1 cup of milk.Add 1/2 a tsp of rosemary.Add 1/2 a tsp of thyme.Add 2 tbsp of basil.Add 2 generous pinches of nutmeg.Add 2 generous pinches of cayenne pepper.Add the last 3.5 cups of milk.Add 2 tbsp of fresh chopped green onions.Add 2 cloves of fresh minced garlic.Add salt and/or black pepper as desired.Tip: taste it! adjust the seasoning accordingly.Increase the heat to high to thicken the sauce.Note: The sauce should be thick enough to add the cheese when it will coat the back of a utensil.Lower the heat back down to medium.Add 8 oz. of shredded New York Cheddar cheese.Add 7 oz. of shredded Mozzarella cheese.Add 8 oz. 
of shredded Monterrey Jack cheese.Turn off heat.Note: The sauce should be thick and creamy. If you believe the sauce is too thick, add some of the starch water you saved from step 2.. Lightly grease a glass baking pan with Pam. Other cooking spays or butter will also work.Pour 1/3 of the noodles in the bottom of the pan.Place 1/3 of the chopped bacon.Pour 1/3 of the sauce onto the bacon and noodles.Mix around with a spoon.Repeat 2 through 5 until all the ingredients are used up.Note: There should be an even distribution of bacon and noodles throughout the pasta.Cover the top with a thick layer of bread crumbs.. The oven should be at 400 degrees Fahrenheit due to you preheating it in step 4.Place the glass pan filled with mac n' cheese on the center rack.Bake for 35-35 minutes or until the sauce is burbling and the bread crumbs are golden brown.Remove the pan from the oven.Warning: The pan and mac n' cheese will be extremely hot! It is extremely recommended that you use a pair of good oven mitts.Let sit on the counter for 5-10 minutes to cool.Warning: The product will be extremely hot. Make sure it is cool enough before eating or you may burn your mouth.You now have a delicious meal to serve to your family or horde to yourself if you so choose. You may experiment using different types of cheeses to get different tastes, or add different seasonings. 
For example I sometimes use Pepper Jack instead of Monterrey Jack and add some hot sauce into the cheese sauce.\nRead the question below and select from the following choices.\nA: Bake It\nB: How to Make Homemade Bacon\nC: Important Information on Canning\nD: Getting Started", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_190_20.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Chocolate Chip Cookies for Dummies\nB: White Chocolate Chip Matcha Cookies\nC: Decorate\nD: Sugar Mixture", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Eggs and Butter', '@placeholder', 'Flour 
Mixture and Chocolate Chips']", + "context": "Here is the context of these images:\n. Ingredients:\n- 2 eggs\n- 1 cup of butter (2 sticks) softened\n- 3/4 cup brown sugar (packed firmly)\n- 1/4 cup white sugar\n- 1 tsp. vanilla extract\n- 1 package of vanilla pudding\n- 2 1/4 cup white flour\n- 1 tsp. baking soda\n- 1 tsp. salt\n- 1 1/2 \u00a0chocolate chips\n- 1 cup chopped pecans (optional). Combine the eggs and butter and mix until well blended. Mix together sugars, vanilla extract and pudding. The pudding mix is what really makes this recipe different as well as better than others, although in a pinch you can substitute it with an extra 1/2 cup of white sugar. Beat together with egg mixture for about 2 minutes or until well blended.. Combine flour, baking soda, and salt in a separate bowl. Gradually add in to sugar and egg mixture until just blended. Add in chocolate chips and pecans if desired. DO NOT OVER MIX.. Drop by rounded teaspoonfuls onto 2 baking sheets. I have found that it usually works best when I put them in a 4x6 grid. .Bake at \u00a0 375\u00a0\u00a0for 8-10 minutes. Let cool for 5-7 minutes. 
Enjoy!\nRead the question below and select from the following choices.\nA: Chocolate Chip Cookies for Dummies\nB: White Chocolate Chip Matcha Cookies\nC: Decorate\nD: Sugar Mixture", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_191_24.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Easy Brioche Recipe\nB: \u200bIF USING POWDERED CHILES, Start Here\nC: Optional Step\nD: \u200bIF USING DRIED 
CHILES, Start Here", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['The Ingredients', 'Add All of the Dry Ingredients.', 'Putting It Into the Microwave', '@placeholder']", + "context": "Here is the context of these images:\n. Ingredients: - 35g / 1/4 cup / 60ml of Self-Raising Flour - 55g / 1/4 cup / 60ml of Sugar - 2 Tablespoons of Cocoa Powder - 2 Tablespoons of Cooking Oil (doesn't have to be sunflower) - 3 Tablespoons of Water - Ice Cream (optional)Utensils: - A Spatula - A Wooden Spoon - A Mixing Bowl - A Mug - A Serving Plate (optional) - A Tablespoon - A Kitchen Scale or a Measuring Cup - Something to Eat With. - A sieve. The Dry Ingredients: Put all of the dry ingredients (35g flour, 55g sugar, 2 tablespoons of cocoa powder) into the bowl and mix them with a wooden spoon! It is important that you mix them otherwise it will not rise properly (I forgot to mix them...) It is recommended that you put them into the bowl flour first, then sugar, then sieve the cocoa powder to get rid of lumps.. The Rest Put all of the other ingredients (3 tablespoons of water, 2 tablespoons of oil) into the bowl with the mixed dry ingredients. Once you have done this, mix it with a spoon. You should mix until the mixture becomes quite thick and consistent (smooth). Try and get all the annoying ingredients that get stuck to the edge of the bowl in as well.. Putting the Mixture in the Mug Use your spoon or spatula to scrape out all of the mixture from the bowl and into the mug.. Microwave Set the microwave to 1 minute 40 and put in the mug. Press start!!. Microwave Take out the mug form the microwave (careful, it's hot!) If you're messy (like me) then the inside edge of the mug will have battle scars, but it doesn't matter unless you really want perfect presentation, in which case I'm not sure how to help. 
You can serve it with ice cream, or just eat it :) The next step is completely optional.Thank you for looking at this mug-brownie recipe. If you made it, please put it in the comments and if you didn't enjoy it, tell me why and I'll try to help. Thanks!. Optional Step This step is if you want to put your mug-brownie on a plate. Simply turn the mug upside-down onto the plate. If it doesn't seem to be working, put it back in the microwave for 30 seconds. However, putting it in the microwave hardens it up and makes the edge chewy.\nRead the question below and select from the following choices.\nA: Easy Brioche Recipe\nB: \u200bIF USING POWDERED CHILES, Start Here\nC: Optional Step\nD: \u200bIF USING DRIED CHILES, Start Here", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_19.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_192_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_192_21.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Enjoy\nB: SOB Sprite\nC: Line 'Em Up!\nD: It's Flour Time", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Sort the Candy', 'What to Do With the Two Red Flavors', 'Add the Sprite/7Up', '@placeholder']", + "context": "Here is the context of these images:\n. Make sure you wash and dry the vials thoroughly. . Separate the Jolly Rancher flavors into the five small bowls.. Add four green apples to the first vial. This will be your green potion.Add four blue raspberry to the second vial. This will be your blue potion.Add four grape to the third vial. This will be your purple potion.Go To Next Step To Learn What To Do With The Two Red Flavors. Take two cherry flavored JRs and put them in the fourth vial.Add two watermelon flavored JRs to the vial.This will be your red potion.. You can use either Sprite or 7Up for this step.Use a funnel to pour it into the vials.. Fill the vials to the desired height.Place the lids or corks on LOOSELY. Be careful with them as you transfer them from the counter to the fridge. . CAREFULLY place drinks in the fridge. Make sure the lids are LOOSE as to keep the tops from popping off.. It may take a whole night for the candies to dissolve.When they do, you can keep them in the fridge or serve them immediately. 
If you are making these for an event, you will have to make them a day or two ahead.\nRead the question below and select from the following choices.\nA: Enjoy\nB: SOB Sprite\nC: Line 'Em Up!\nD: It's Flour Time", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_193_18.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Top Hem\nB: Coffee Tea Bag\nC: Mis En Place\nD: Making the Ganache", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Draw String', 'Weights', \"Sunshiine's Final Thoughts\"]", + "context": "Here is the context of these images:\n. 
What you will need: \n\t\tScraps of muslin, sheer, lace, or Netting cut into 5 X 5 squares\n\t\tEmbroidery thread\n\t\tSewing needle\n\t\tScissors\n\t\tRuler\n\t\tMarking pen\n\t\tLoose tea leaves\n\t\tGlue\u00a0\n\t\tButtons\n\t\tGlass jewels\u00a0\n\t\tTea spoonPlease note: \u00a0I liked the glass jewels but buttons glued together will be sufficient weight. \u00a0\n . Cutting out the pattern:\n\t\tYou will need to cut a square piece of fabric 5X5 for each weighted tea bag.. Seams:\n\t\tFold the square in half with right sides together.\n\t\tThread the needle and sew along the bottom and the side as shown.\n\t\tTie it off.\n\t\tClip the threads.. Hemming the top:\n\t\tFold the top edge down (about the 1 1/2 inches\u00a0and:\n\t\tTurn it under the same amount again.\u00a0\n\t\tYou will sew it in the next step.\n\t\tThe pictures show one sewn and one not sewn. \u00a0. Method: This is the draw string.\n\n\t\tCarefully turn the bag right side out without disturbing the hem.\n\t\tBegin sewing at the seam ( through all the layers, \u00a0leaving the bag open as you sew,) \u00a0across the top just below the top edge.\n\t\tStitch all the way around until you meet up with the beginning stitch. \u00a0Don't trim the threads\u00a0or back stitch.\n\t\t\u00e2\u0080\u008b Mine are a little too long so I would cut them about 9 inches.\n\t\tTrim off what you don't want.\u00a0\n\t\tTie each end\u00a0in a knot.\u00a0. Method:\n\t\tSpread the draw strings out as shown.\n\t\tDab some glue on the center of the glass jewels.\n\t\tStick the string in the glue.\n\t\tCenter the button over the string and press.. Filling\u00a0the tea bag:\n\t\tFill the desired amount of tea into the bag.\n\t\tAdd 1 or 2 \u00a0glass jewels.\u00a0\n\t\tClose the bag.\nPlease note: I used 1 glass jewel but later added another one to balance it out better.\n . I have been wanting to make these weights for sometime. 
\u00a0I do not like removing the string from my tea that has fallen into the cup and how the tea bag rises to the top of the water. \u00a0These are easy to clean by shaking out the leaves, rinse\u00a0the bag under running water and let them air dry. \u00a0It reminds me of the old days. \u00a0I am\u00a0using\u00a0 these decorated cans to store them in and for gift packaging. \u00a0I thought they were very stylish. \u00a0If I were to win the Shopbot challenge I would make my own wooden buttons and many things that would require precision cutting. \u00a0I can't draw but I sure could use a pattern to cut my own stuff. \u00a0\nThank you for stopping by and do have a safe and happy December.\nSunshiine\nRead the question below and select from the following choices.\nA: Top Hem\nB: Coffee Tea Bag\nC: Mis En Place\nD: Making the Ganache", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_17.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_194_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_194_19.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Husk and Halve the 3 Large Garlic Cloves.\nB: How to Make Salsa Verde\nC: Combine and Blend\nD: How to Make Salsa", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Open the Canned Tomatoes', 'Peal the Red Bell Pepper and Jalapeno', 'Blend and Add Salt and Red Pepper Flake to Blender']", + "context": "Here is the context of these images:\n. For this recipe you will need:Ingredients:\n\u00a0\u00a0 \u00a0 \u2022 1 Red Bell Pepper\n\u00a0\u00a0 \u00a0 \u2022 1 Jalapeno\n\u00a0\u00a0 \u00a0 \u2022 \u00bd White Onion\n\u00a0\u00a0 \u00a0 \u2022 3 Large Cloves of Garlic\n\u00a0\u00a0 \u00a0 \u2022 1 Handful of Cilantro\n\u00a0\u00a0 \u00a0 \u2022 42 ounces of Canned Tomatoes\n\u00a0\u00a0 \u00a0 \u2022 1 Lime\n\u00a0\u00a0 \u00a0 \u2022 Salt\n\u00a0\u00a0 \u00a0 \u2022 Crushed Red PepperUtensils:\n\u00a0\u00a0 \u00a0 \u2022 Oven\n\u00a0\u00a0 \u00a0 \u2022 Aluminum Foil\n\u00a0\u00a0 \u00a0 \u2022 Cutting Board\n\u00a0\u00a0 \u00a0 \u2022 Large Kitchen Knife\n\u00a0\u00a0 \u00a0 \u2022 Can Opener\n\u00a0\u00a0 \u00a0 \u2022 Blender/Food Processor\n\u00a0\u00a0 \u00a0 \u2022 Bowl\n\u00a0. Place the red bell pepper and the jalapeno on the aluminum foil and set on the bottom oven rack. \u00a0Turn the oven to broil. Warning: Ovens produce heat quickly so when handling the red bell pepper and jalapeno be careful to not touch inside of the oven.Note: \u00a0For this recipe we want the red bell pepper and jalapeno to be \u2018Fire Roasted\u2019 so we will place them in the oven to perform this task. 
Broiling consists of cooking directly under heat so on your oven it only turns the top burner on and cooks from this heat, in contrast to regular cooking which cooks by heating up the air inside the oven. You will need to check on the red bell pepper and jalapeno approximately every 5 minutes rotating them to a new side when one side has blistered.. Cut the white onion in half.\na) Remove outside skin and inner sprout. Warning: When using a knife keep fingers clear of blade.Note: \u00a0As a general rule when cooking, if it doesn\u2019t look like something you want to eat then throw it out. This rule applies to the onion; there is a sprout at the core of the onion remove that from the onion. Likewise the outside dry skin of an onion will alter the taste and you should remove that as well. For this recipe we will only need half of the onion unless you really like onion then add more as you see fit.\nb) Mince the onion into many fine pieces.Note: \u00a0Mincing simply means to chop into many small pieces. So go ahead and chop the onion up into many small pieces.\nc) Deposit the minced onion into the blender/food processor.\n\u00a0. Husk and Halve the 3 Large Garlic Cloves\na) Husk the Garlic.Note: \u00a0For those of you that don\u2019t know how to husk a clove of garlic, simply place the broad side of your large kitchen knife on the garlic and smash down the knife with your hand. This will break the clove and you can now remove the husk.\nb) Halve the GarlicNote: \u00a0Cut the clove of garlic in half and remove the sprout in the middle. This is typically an off white color, slender stem inside the garlic clove.\nc) Cut off the end.Note: On both of the pointed ends of the garlic a woody stem needs to be removed.\nd) Mince the garlic and add to blender.. Chop the Cilantro in Half\na) Chop the cilantro in half.Note: \u00a0Take a large handful of cilantro and place on the cutting board. 
You want to remove the bottom portion of the stem, from the leaves down and throw this away.\nb) Mince the leafy part of the cilantro and add to blender.. Remove the red bell pepper and jalapeno from the oven and place in a bowl. Cover the top of the bowl with the aluminum foil. Allow the red bell pepper and jalapeno to steam for approximately eight minutes inside covered bowl.\u00a0. Open and drain the water from the canned tomatoes then add to the blender. \u00a0Warning: Using the can opener leaves a rough edge on the can so be careful not to cut yourself holding the can lid down.. I like a stronger lime taste so I squeeze both halves of the lime into my salsa.\nTip: If you have a firm lime, microwave it for 10 seconds to soften it and make it easier to squeeze.. a) Remove the aluminum foil from the bowl and place the red bell pepper and jalapeno on the cutting board. The skin should be partially separated from the inside of the pepper.\nb) Cut the skin away from the red bell pepper and jalapeno and discard the skin. Warning: There is hot juice inside the red bell pepper and jalapeno and when you cut it open it will squirt and potentially burn you.\nc) Remove the seeds from the red bell pepper and discard the seeds.\nd) Chop the red bell pepper and jalapeno into small pieces and place in the blender.. Taste test and add salt and red pepper flake until you reach desired flavor.. Now that you have made the salsa you should have the confidence to make it again. All of the measurements for the ingredients are subject to my own specific taste and you should play around and alter them to fit your liking. The secret to great salsa begins with incorporating the right ratio of each ingredient for your taste. 
Once you know how you like it, you can make any amount following your own ratio guide.\nRead the question below and select from the following choices.\nA: Husk and Halve the 3 Large Garlic Cloves.\nB: How to Make Salsa Verde\nC: Combine and Blend\nD: How to Make Salsa", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_195_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_195_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_195_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_195_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_195_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_195_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_195_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_195_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_195_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_195_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_195_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_195_11.jpg" + ], + "output": "A" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Fo'shizzle Peanut Butter Cookies\nB: Add Flour\nC: Add Oats and Vanilla\nD: Ingredients", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Make the Batter', 'Chocolate Topping', 'Add the Ganache and Serve!']", + "context": "Here is the context of these images:\n. For this recipe you will need:a package of brownie mix or the dry ingredients from your favorite brownie recipe1/4 cup peanut oil1/4 cup water1 cup heavy cream1 egg1 1/4 cup smooth peanut butter12 oz. chocolate chips. Preheat your oven to 325 degrees F and lightly coat a 9\"x9\" pan with peanut oil or other cooking spray.Take your package of brownie mix and add the egg, 1/4 cup heavy cream, 1/4 cup water, 1/4 cup peanut oil, and mix it all in until the batter is smooth and there are no lumps in it. 
Then take 1 cup of peanut butter and gently fold it into the batter. You don't want to mix it in all the way, but just mix it enough so that it's evenly distributed. The light color of the peanut butter looks beautiful marbled with the dark color of the brownie batter.. Put the batter in your prepared pan in an even layer and bake for 40-50 minutes at 325 degrees. Check after 40 minutes and if the middle is still really really soup-y put them in for 10 more minutes.Once it's done, remove from oven and set aside to cool.. The chocolate ganache layer on top really makes this dessert special. It might looks intimidating, but it's super easy. Here we go!Take the rest of your heavy cream and put it in a small saucepan over low-medium heat and wait for it to boil. Keep a close eye on it because you don't want it to burn or boil over. As soon as the cream starts boiling, add it to the package of chocolate chips in a heat safe bowl and whisk it together until all the chocolate chips have disolved and the mixture is smooth. It will look kind of weird at first, but that's ok, just keep whisking!. Once your ganache is all smooth, pour it over your cooled brownies and smooth it into an even layer. To make the pretty pattern, take the rest of your peanut butter and put it in the microwave for 20-30 seconds. Use a piping bag or even just a measuring cup with a spout to pour thin lines of liquid peanut butter diagonally across the pan. Then take a toothpick or skewer and drag through the lines of peanut butter in the opposite diagonal. Then do it again in the other direction to get the chevron pattern. Set it aside to cool a bit and let the ganache solidify a bit and you're done! 
Serve them and impress your friends and and win over your enemies!\nRead the question below and select from the following choices.\nA: Fo'shizzle Peanut Butter Cookies\nB: Add Flour\nC: Add Oats and Vanilla\nD: Ingredients", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_196_16.jpg" + ], + "output": "D" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Orange and Peach Trifle\nB: Pie Assembly\nC: Relax, Eat, and Enjoy!!!\nD: Pie Assembly", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Pie Crust and Maceration', 'Filling the Tarts', '@placeholder']", + "context": "Here is the context of these images:\n. 
1 Single Pie Crust Rolled Flat (I made mine using this recipe, A Healthier Flaky Double Pie Crust w/ Coconut Oil) 4 Ripe Peaches - Pared and Sliced1/4 Cup Granulated Sugar1/4 Cup Brown Sugar 1/2 Tablespoon Lemon Juice1 Table spoon Cornstarch 1/16 Teaspoon Salt (literally a pinch)* For the pie crust, because I only needed a single crust, so I divided the ingredients by 50% and it worked like a charm. . I highly recommend making your own crust, for this recipe. It's super easy and 100% worth it in the end, plus there are tons of great recipes out there. I use A Healthier Flaky Double Pie Crust w/ Coconut Oil for all of my pies and tarts. It takes 5 minutes to pull together and only an hour to chill. The final results are melt in your mouth flaky.While the pie crust is chilling, combine the sliced peaches, granulated sugar, and brown sugar in a medium sized bowl. Cover with plastic wrap and set it in the refrigerator for 1 hour.* I call this prep work, because both of these items will need to be done, well in advance and will need to sit for at least an hour. ** Every 20 minutes or so, I give the peaches a toss in the sugar mixture as they macerate.. Once the pie crust has chilled, remove it and let the dough disk rest on a floured surface for about 10 minutes.Gently, roll out the dough evenly in all directions to form a 14\"x14\" sheet. Now cut 6 - 6\" rounds out of the dough sheet (I used the lid of a pot, which worked great, however any round object will do as a template). 
You may need to reshape the dough disk and roll it out a second time, to get all 6 rounds.Next, place each of the dough rounds in to the muffin pan, carefully working each round into the shape of the cup (if any small holes develop, you can easily patch them with a small piece of the the scrap dough).Finally, to top the tarts, using a small cookie cutter, stamp out 6 shapes (be creative, there are a million cool cookie cutters out there...I used a star shape) and transfer them to a foil lined baking sheet.Cover both the baking sheet and the muffin pan with plastic wrap and put them back into the refrigerator to chill.. Remove the macerated peach slices from the refrigerator and drain well, reserving the liquid in a medium sized pot and returning the peach slices to the bowl. Next, add lemon juice, cornstarch, and salt to the pot with the reserved peach juice. Bring to a rolling boil over medium heat, stirring constantly until the mixture begins to thicken (5-6 minutes).Once thickened to your desired consistency (I stir for about 10-12 minutes) , pour it back into the peach slices and stir until combined.. Preheat the oven to 425 degrees and move the oven rack to the lowest position.Next, spoon the peach tart filling into the prepared crusts and top with your decoration of choice.Finally, bake at 425 degrees until the edge of the crusts are a light golden brown. Reduce the temperature to 375 degrees and continue baking until the edge crusts are golden brown. . Remove the tarts from the oven and allow to cool in the pan for 4-5 minutes, until set. Then remove the tarts from the muffin pan and cool on a wire rack for 1 hour. Now it's time to sit back, relax, and enjoy. I promise these tarts will not disappoint. Plus, they are the perfect size to hold 1 scoop of vanilla ice cream on top, for a prefect peaches and cream experience. Warm or cold, they are delectable...In fact, I wish I had one right now (seriously). I really hope you've enjoyed this Instructable! 
Happy Baking!!!Just a note: Please, when ever possible, support your local growers. Not everyone is lucky enough to have access to locally grown produce, if you do, it's important to help keep it alive. Thanks! \nRead the question below and select from the following choices.\nA: Orange and Peach Trifle\nB: Pie Assembly\nC: Relax, Eat, and Enjoy!!!\nD: Pie Assembly", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_13.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_197_17.jpg" + ], + "output": "C" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Simple Caramel Apples\nB: Materials & Ingredients\nC: Caramel Sauce!\nD: Sift 1/2 Cup Powdered Sugar, Then Stir Into the Sauce.", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Apple Prep', 'The Caramel', 'Paint It Black']", + "context": "Here is the context of these images:\n. 
-Apples-3-4 drops black food gel-1 bag of caramels-1/4-1/2 cup of cream-SticksThis recipe is for 6-8 small apples. . Wash your apples as you normally would and dry them. If you like, you can also use a piece of clean sandpaper and lightly sand the outside of the apples---this is just for better caramel adherence. Remove any stems and stab your apples with whatever sticks you plan to use. I found these great Halloween themed candy sticks at Michaels and had to get them. They come in brown or black. The green and white sticks I found at Walmart for $1.84. While these do appear fairly sturdy, pre-stab your apple before putting the stick in, otherwise the stick will bend. . You could make your own, but I find buying a bag of caramels to be easier and less time consuming. Unwrap all of the caramels and put into a pot. I used my smallest pot for this, figuring it would be easier to keep things from getting messy. Add cream. The more cream you add the thinner your caramel coating. Thin or thick, there's no wrong way to do this, it's just preference. Additionally, the thinner your caramel, the more there is to go around. I cooked the caramel over medium heat, stirring often and rarely taking my eyes off of the pot. . As soon as all the caramels seemed to be melted---maybe 5 minutes, I added the black food gel.A little goes a long way with black food gel, start with a little and add more as needed. ***Warning: too much gel and your mouth will turn black***One by one, take one of your apples by the stick and carefully swirl it around in the pot of caramel. I left the burner on low, so the caramel wouldn't cool off too quickly. Once you are satisfied with the caramel coverage on your apple, let the excess drip off (or have a friend help you wipe it off) and place on a sheet of parchment paper. 
Continue until all of your apples and coated.\nRead the question below and select from the following choices.\nA: Simple Caramel Apples\nB: Materials & Ingredients\nC: Caramel Sauce!\nD: Sift 1/2 Cup Powdered Sugar, Then Stir Into the Sauce.", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_198_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_198_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_198_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_198_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_198_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_198_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_198_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_198_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_198_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_198_9.jpg" + ], + "output": "B" + }, + { + "task": "textual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: Omlette in a Bag!\nB: Other Ingredients\nC: Pate a Choux\nD: Mise En Place!", + "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Making the Spice Paste', 'Casing', 'Fire']", + "context": "Here is the context of these images:\n. Five-Spice is one of the main ingredients in making Beggar's Chicken. It is a roasted, powdered blend of spices like Cinnamon, Fennel seeds, Clove, Star Anise and Sichuan Pepper. It is mainly used in Chinese cooking as well in most of the Asian countries. All these spices are growing in our area too. Though I got all raw spices, I could get Sichuan pepper in powdered form only. Thanks to\u00a0lmnopeas for her instructable on \"Chinese Five Spice Powder\" at:\u00a0https://www.instructables.com/id/Chinese-Five-Spice-Powder/ I followed her instructable in making the Five-spice powder for the Beggar's Chicken recipe.. 
\n Ingredients required: I have slightly modified the ingredients required for the original Beggar's Chicken Recipe. The following are the ingredients I have used: One kg of Chicken cleaned Two tablespoons of Five-Spice Powder Three tablespoons of Soy Sauce Four medium sized Onions One full Garlic bulb One inch Ginger piece Three tablespoons of cooking oil One\u00a0tablespoon Sugar Salt to taste One\u00a0tablespoon Red Chilly powder Two Tomatoes Four Green Chilies (for the aroma) . Making the Spice Paste Peeled Ginger and Garlic and Made\u00a0paste in a Mixer / Grinder Peeled Onion and made\u00a0a paste Pureed two tomatoes Mixed all ingredients including Five-Spice Powder,\u00a0Soy sauce, Sugar, Salt, Cooking oil and Red Chilly Powder together \u00a0along with above pastes in a bowl (other than the Green Chilies) The spice paste is ready.\u00a0I have not added any water to the mix.. Placed the cleaned chicken in a large sized bowl and applied the spice paste. Most of the spice paste went between the skin and flesh. Also applied the spice paste on the outer and inner sides of the chicken. Sliced green chilies were inserted behind the skin for added aroma.\u00a0 Now the chicken is ready to be wrapped.. The Chinese version of the recipe caters for a Lotus leaf for wrapping the spiced chicken. Here I have used a large sized tender Banana leaf in lieu of the Lotus leaf as\u00a0wrapping with banana leaves may confer an aroma to the chicken. The fresh Banana leaf tends to tear down in places while wrapping. Curing the leaf over low flame will prevent it from tearing down. Placed the spiced chicken in the cured Banana leaf, wrapped it around the chicken and tied it with cotton thread. The spiced chicken is now ready for Mud-casing. Mud-Casing the wrapped chicken is done prior to roasting it in open fire. I have dug out some red soil about six inches below the surface after clearing the top soil from our garden. 
Added little water to the soil and applied the casing over the banana leaf wrapped spiced chicken. Now we need to collect fire wood and make an open fire to roast it.. We made an open fire with old wood collected from the garden. Dried out leaves of coconut tree also helped us to keep the fire going. Initially, we placed the mud casing over an aluminium plate on a metal stand. When the bottom side of the casing was almost hard, we removed the plate and roasted it on open fire. Then turned it upside down to make it cooked evenly on all sides. It took us about two hours to roast and make the outer mud casing hard on all sides.. Removed the roasted Chicken from the fire and allowed it to cool for some time, then broke the hardened mud-casing. Untied the cotton thread and removed the banana leaf wrapping from around the cooked chicken. The roasted chicken was rightly done, neither overdone nor under-cooked. Now the Beggar's Chicken is ready to be served.\nRead the question below and select from the following choices.\nA: Omlette in a Bag!\nB: Other Ingredients\nC: Pate a Choux\nD: Mise En Place!", + "input_image_path": [ + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_0.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_1.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_2.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_3.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_4.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_5.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_6.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_7.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_8.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_9.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_10.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_11.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_12.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_13.jpg", + 
"../MMIU-Benchmark/textual_cloze/textual_cloze_199_14.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_15.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_16.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_17.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_18.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_19.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_20.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_21.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_22.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_23.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_24.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_25.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_26.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_27.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_28.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_29.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_30.jpg", + "../MMIU-Benchmark/textual_cloze/textual_cloze_199_31.jpg" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Zedge Wallpapers & Ringtones', 'Google Play Store']\nB: ['Photos', 'iNaturalist']\nC: ['Google Photos', 'PlantNet']\nD: ['Pinterest', 'Setting']\n", + "question": "The corresponding actions are: step 1: CLICK: (642, 124)\nstep 2: CLICK: (90, 74)\nstep 3: CLICK: (693, 76)\nstep 4: CLICK: (780, 80)\nstep 5: TYPE: art deco\nstep 6: CLICK: (321, 153)\nstep 7: CLICK: (384, 660)\nstep 8: CLICK: (906, 78)\nstep 9: CLICK: (172, 757)\nstep 10: PRESS_RECENT\nstep 11: CLICK: (68, 530)\nstep 12: CLICK: (540, 276)\nstep 13: CLICK: (202, 215)\nstep 14: CLICK: (172, 238)\nstep 15: CLICK: (851, 70)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and 
corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (642, 124)\nstep 2: CLICK: (90, 74)\nstep 3: CLICK: (693, 76)\nstep 4: CLICK: (780, 80)\nstep 5: TYPE: art deco\nstep 6: CLICK: (321, 153)\nstep 7: CLICK: (384, 660)\nstep 8: CLICK: (906, 78)\nstep 9: CLICK: (172, 757)\nstep 10: PRESS_RECENT\nstep 11: CLICK: (68, 530)\nstep 12: CLICK: (540, 276)\nstep 13: CLICK: (202, 215)\nstep 14: CLICK: (172, 238)\nstep 15: CLICK: (851, 70)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Zedge Wallpapers & Ringtones', 'Google Play Store']\nB: ['Photos', 'iNaturalist']\nC: ['Google Photos', 'PlantNet']\nD: ['Pinterest', 'Setting']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_0_15.png" + ], + "output": "D" + }, + { + "task": 
"gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Contacts', 'Farfetch']\nB: ['Vaulty:Hide Pictures Videos', 'Net-a-Porte']\nC: ['Google Play Store', 'Meesho']\nD: ['PlantNet', 'Target']\n", + "question": "The corresponding actions are: step 1: CLICK: (168, 104)\nstep 2: CLICK: (479, 927)\nstep 3: CLICK: (474, 76)\nstep 4: TYPE: Meesho\nstep 5: CLICK: (919, 896)\nstep 6: CLICK: (770, 324)\nstep 7: PRESS_HOME\nstep 8: CLICK: (819, 400)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (168, 104)\nstep 2: CLICK: (479, 927)\nstep 3: CLICK: (474, 76)\nstep 4: TYPE: Meesho\nstep 5: CLICK: (919, 896)\nstep 6: CLICK: (770, 324)\nstep 7: PRESS_HOME\nstep 8: CLICK: (819, 400)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Contacts', 'Farfetch']\nB: ['Vaulty:Hide Pictures Videos', 'Net-a-Porte']\nC: ['Google Play Store', 'Meesho']\nD: ['PlantNet', 'Target']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_1_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_1_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_1_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_1_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_1_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_1_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_1_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_1_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_1_8.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + 
"source": "GUI-Odyssey", + "options": "A: ['Wikipedia', 'Tiktok']\nB: ['Opera', 'Shorts VotTak: Short Video App']\nC: ['Firefox', 'Tubi: Movies & Live TV']\nD: ['Chrome', 'Netflix']\n", + "question": "The corresponding actions are: step 1: CLICK: (611, 740)\nstep 2: CLICK: (830, 81)\nstep 3: CLICK: (691, 362)\nstep 4: CLICK: (858, 890)\nstep 5: TYPE: mystery movie on tubi\nstep 6: CLICK: (894, 876)\nstep 7: CLICK: (283, 484)\nstep 8: CLICK: (774, 570)\nstep 9: CLICK: (872, 90)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (611, 740)\nstep 2: CLICK: (830, 81)\nstep 3: CLICK: (691, 362)\nstep 4: CLICK: (858, 890)\nstep 5: TYPE: mystery movie on tubi\nstep 6: CLICK: (894, 876)\nstep 7: CLICK: (283, 484)\nstep 8: CLICK: (774, 570)\nstep 9: CLICK: (872, 90)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Wikipedia', 'Tiktok']\nB: ['Opera', 'Shorts VotTak: Short Video App']\nC: ['Firefox', 'Tubi: Movies & Live TV']\nD: ['Chrome', 'Netflix']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_2_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_2_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_2_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_2_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_2_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_2_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_2_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_2_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_2_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_2_9.png" + ], + 
"output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['X', 'Google Meet']\nB: ['Threads', 'Zoho Meeting']\nC: ['Gmail', 'Microsoft Teams']\nD: ['Facebook', 'ZOOM Cloud Meetings']\n", + "question": "The corresponding actions are: step 1: CLICK: (152, 525)\nstep 2: CLICK: (871, 929)\nstep 3: CLICK: (362, 201)\nstep 4: CLICK: (886, 823)\nstep 5: PRESS_HOME\nstep 6: CLICK: (346, 273)\nstep 7: CLICK: (906, 949)\nstep 8: CLICK: (364, 153)\nstep 9: CLICK: (380, 952)\nstep 10: TYPE: meet.google.com/wtm-nmdy-dav\nstep 11: CLICK: (924, 640)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (152, 525)\nstep 2: CLICK: (871, 929)\nstep 3: CLICK: (362, 201)\nstep 4: CLICK: (886, 823)\nstep 5: PRESS_HOME\nstep 6: CLICK: (346, 273)\nstep 7: CLICK: (906, 949)\nstep 8: CLICK: (364, 153)\nstep 9: CLICK: (380, 952)\nstep 10: TYPE: meet.google.com/wtm-nmdy-dav\nstep 11: CLICK: (924, 640)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['X', 'Google Meet']\nB: ['Threads', 'Zoho Meeting']\nC: ['Gmail', 'Microsoft Teams']\nD: ['Facebook', 'ZOOM Cloud Meetings']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_3_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_3_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_3_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_3_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_3_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_3_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_3_6.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_3_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_3_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_3_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_3_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_3_11.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['WPS office', 'Firefox']\nB: ['Google Docs', 'wikiHow']\nC: ['Simplenote', 'Bing: chat with AI & GPT4']\nD: ['Dropbox Paper', 'Chrome']\n", + "question": "The corresponding actions are: step 1: CLICK: (156, 132)\nstep 2: CLICK: (848, 188)\nstep 3: TYPE: Seoul weather tomorrow\nstep 4: CLICK: (929, 186)\nstep 5: PRESS_HOME\nstep 6: CLICK: (831, 402)\nstep 7: CLICK: (942, 76)\nstep 8: CLICK: (237, 574)\nstep 9: CLICK: (880, 814)\nstep 10: CLICK: (145, 662)\nstep 11: CLICK: (197, 270)\nstep 12: CLICK: (411, 438)\nstep 13: TYPE: Seoul, tomorrow: rain todolist: stay at home and play lol\nstep 14: CLICK: (45, 80)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (156, 132)\nstep 2: CLICK: (848, 188)\nstep 3: TYPE: Seoul weather tomorrow\nstep 4: CLICK: (929, 186)\nstep 5: PRESS_HOME\nstep 6: CLICK: (831, 402)\nstep 7: CLICK: (942, 76)\nstep 8: CLICK: (237, 574)\nstep 9: CLICK: (880, 814)\nstep 10: CLICK: (145, 662)\nstep 11: CLICK: (197, 270)\nstep 12: CLICK: (411, 438)\nstep 13: TYPE: Seoul, tomorrow: rain todolist: stay at home and play lol\nstep 14: CLICK: (45, 80)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['WPS office', 'Firefox']\nB: ['Google Docs', 'wikiHow']\nC: ['Simplenote', 'Bing: chat with AI & GPT4']\nD: ['Dropbox Paper', 'Chrome']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_4_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_4_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_4_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_4_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_4_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_4_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_4_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_4_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_4_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_4_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_4_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_4_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_4_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_4_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_4_14.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['ClevCalc 
- Calculator', 'DuckDuckGo']\nB: ['Clock', 'Bing: chat with AI & GPT4']\nC: ['Calendar', 'Firefox']\nD: ['DigiCal Calendar Agenda', 'Opera']\n", + "question": "The corresponding actions are: step 1: CLICK: (915, 507)\nstep 2: SCROLL: DOWN\nstep 3: CLICK: (628, 72)\nstep 4: TYPE: when is next Olympics opening ceremony\nstep 5: CLICK: (307, 241)\nstep 6: PRESS_RECENT\nstep 7: CLICK: (591, 922)\nstep 8: CLICK: (401, 425)\nstep 9: CLICK: (906, 888)\nstep 10: CLICK: (909, 865)\nstep 11: TYPE: Olympics opening ceremony\nstep 12: CLICK: (433, 198)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (915, 507)\nstep 2: SCROLL: DOWN\nstep 3: CLICK: (628, 72)\nstep 4: TYPE: when is next Olympics opening ceremony\nstep 5: CLICK: (307, 241)\nstep 6: PRESS_RECENT\nstep 7: CLICK: (591, 922)\nstep 8: CLICK: (401, 425)\nstep 9: CLICK: (906, 888)\nstep 10: CLICK: (909, 865)\nstep 11: TYPE: Olympics opening ceremony\nstep 12: CLICK: (433, 198)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['ClevCalc - Calculator', 'DuckDuckGo']\nB: ['Clock', 'Bing: chat with AI & GPT4']\nC: ['Calendar', 'Firefox']\nD: ['DigiCal Calendar Agenda', 'Opera']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_5_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_5_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_5_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_5_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_5_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_5_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_5_6.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_5_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_5_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_5_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_5_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_5_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_5_12.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Yahoo Sports', 'Google Wallet']\nB: ['AP News', 'PayPal - Send, Shop, Manage']\nC: ['Bloomberg: Finance Market News', 'Investing.com']\nD: ['BBC News', 'Venmo']\n", + "question": "The corresponding actions are: step 1: CLICK: (674, 351)\nstep 2: CLICK: (908, 74)\nstep 3: CLICK: (302, 69)\nstep 4: TYPE: Nvidia\nstep 5: SCROLL: UP\nstep 6: CLICK: (374, 435)\nstep 7: PRESS_HOME\nstep 8: CLICK: (315, 342)\nstep 9: CLICK: (915, 77)\nstep 10: TYPE: Nvidia\nstep 11: CLICK: (649, 310)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (674, 351)\nstep 2: CLICK: (908, 74)\nstep 3: CLICK: (302, 69)\nstep 4: TYPE: Nvidia\nstep 5: SCROLL: UP\nstep 6: CLICK: (374, 435)\nstep 7: PRESS_HOME\nstep 8: CLICK: (315, 342)\nstep 9: CLICK: (915, 77)\nstep 10: TYPE: Nvidia\nstep 11: CLICK: (649, 310)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Yahoo Sports', 'Google Wallet']\nB: ['AP News', 'PayPal - Send, Shop, Manage']\nC: ['Bloomberg: Finance Market News', 'Investing.com']\nD: ['BBC News', 'Venmo']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_6_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_6_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_6_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_6_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_6_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_6_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_6_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_6_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_6_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_6_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_6_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_6_11.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Opera', 'Microsoft Word']\nB: ['wikiHow', 'Google Docs']\nC: ['Chrome', 'Simplenote']\nD: ['Edge', 'BasicNote - Notes, Notepad']\n", + "question": "The corresponding actions are: step 1: CLICK: (661, 731)\nstep 2: CLICK: (363, 74)\nstep 3: TYPE: weather in Istanbul tomorrow\nstep 4: CLICK: (441, 156)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: 
CLICK: (691, 354)\nstep 8: CLICK: (897, 873)\nstep 9: CLICK: (316, 312)\nstep 10: TYPE: Istanbul,tomorrow:mostly cloudy Todolist:take a trip outsides.\nstep 11: CLICK: (77, 85)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (661, 731)\nstep 2: CLICK: (363, 74)\nstep 3: TYPE: weather in Istanbul tomorrow\nstep 4: CLICK: (441, 156)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (691, 354)\nstep 8: CLICK: (897, 873)\nstep 9: CLICK: (316, 312)\nstep 10: TYPE: Istanbul,tomorrow:mostly cloudy Todolist:take a trip outsides.\nstep 11: CLICK: (77, 85)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Opera', 'Microsoft Word']\nB: ['wikiHow', 'Google Docs']\nC: ['Chrome', 'Simplenote']\nD: ['Edge', 'BasicNote - Notes, Notepad']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_7_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_7_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_7_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_7_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_7_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_7_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_7_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_7_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_7_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_7_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_7_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_7_11.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + 
"visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['PlantNet', 'Triller', 'Firefox']\nB: ['TradingView: Track All Markets', 'Pluto TV - Live TV and Movies', 'wikiHow']\nC: ['Google Play Store', 'Tiktok', 'Chrome']\nD: ['Setting', 'Youtube', 'Opera']\n", + "question": "The corresponding actions are: step 1: CLICK: (844, 124)\nstep 2: CLICK: (677, 943)\nstep 3: CLICK: (516, 911)\nstep 4: CLICK: (259, 136)\nstep 5: TYPE: cooking recipes on youtube\nstep 6: CLICK: (949, 921)\nstep 7: CLICK: (170, 337)\nstep 8: PRESS_HOME\nstep 9: CLICK: (384, 834)\nstep 10: CLICK: (268, 960)\nstep 11: CLICK: (442, 325)\nstep 12: CLICK: (797, 93)\nstep 13: SCROLL: RIGHT\nstep 14: SCROLL: UP\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (844, 124)\nstep 2: CLICK: (677, 943)\nstep 3: CLICK: (516, 911)\nstep 4: CLICK: (259, 136)\nstep 5: TYPE: cooking recipes on youtube\nstep 6: CLICK: (949, 921)\nstep 7: CLICK: (170, 337)\nstep 8: PRESS_HOME\nstep 9: CLICK: (384, 834)\nstep 10: CLICK: (268, 960)\nstep 11: CLICK: (442, 325)\nstep 12: CLICK: (797, 93)\nstep 13: SCROLL: RIGHT\nstep 14: SCROLL: UP\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['PlantNet', 'Triller', 'Firefox']\nB: ['TradingView: Track All Markets', 'Pluto TV - Live TV and Movies', 'wikiHow']\nC: ['Google Play Store', 'Tiktok', 'Chrome']\nD: ['Setting', 'Youtube', 'Opera']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_8_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_8_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_8_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_8_3.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_8_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_8_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_8_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_8_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_8_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_8_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_8_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_8_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_8_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_8_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_8_14.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Play Store', 'Opera']\nB: ['TradingView: Track All Markets', 'Chrome']\nC: ['PlantNet', 'DuckDuckGo']\nD: ['Vaulty:Hide Pictures Videos', 'Edge']\n", + "question": "The corresponding actions are: step 1: CLICK: (614, 404)\nstep 2: CLICK: (366, 137)\nstep 3: TYPE: Grocery Shopping Apps\nstep 4: CLICK: (923, 916)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (161, 656)\nstep 9: CLICK: (363, 77)\nstep 10: TYPE: AnyList\nstep 11: CLICK: (918, 909)\nstep 12: CLICK: (860, 311)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (614, 404)\nstep 2: CLICK: (366, 137)\nstep 3: TYPE: Grocery Shopping Apps\nstep 4: CLICK: (923, 916)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (161, 656)\nstep 9: CLICK: (363, 77)\nstep 10: TYPE: AnyList\nstep 11: CLICK: (918, 909)\nstep 12: CLICK: (860, 311)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Opera']\nB: ['TradingView: Track All Markets', 'Chrome']\nC: ['PlantNet', 'DuckDuckGo']\nD: ['Vaulty:Hide Pictures Videos', 'Edge']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_9_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_9_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_9_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_9_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_9_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_9_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_9_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_9_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_9_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_9_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_9_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_9_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_9_12.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Threads', 'iHeart: Music, Radio, Podcasts']\nB: ['Instagram', 'Spotify']\nC: ['X', 'Pandora']\nD: ['Gmail', 'YT Music']\n", + "question": "The corresponding actions are: step 1: CLICK: (141, 718)\nstep 2: CLICK: (410, 57)\nstep 3: CLICK: (985, 
63)\nstep 4: TYPE: Pop\nstep 5: CLICK: (892, 674)\nstep 6: CLICK: (485, 141)\nstep 7: CLICK: (416, 327)\nstep 8: CLICK: (863, 613)\nstep 9: PRESS_HOME\nstep 10: CLICK: (576, 104)\nstep 11: CLICK: (19, 481)\nstep 12: CLICK: (156, 138)\nstep 13: CLICK: (40, 204)\nstep 14: TYPE: Popular\nstep 15: CLICK: (962, 423)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (141, 718)\nstep 2: CLICK: (410, 57)\nstep 3: CLICK: (985, 63)\nstep 4: TYPE: Pop\nstep 5: CLICK: (892, 674)\nstep 6: CLICK: (485, 141)\nstep 7: CLICK: (416, 327)\nstep 8: CLICK: (863, 613)\nstep 9: PRESS_HOME\nstep 10: CLICK: (576, 104)\nstep 11: CLICK: (19, 481)\nstep 12: CLICK: (156, 138)\nstep 13: CLICK: (40, 204)\nstep 14: TYPE: Popular\nstep 15: CLICK: (962, 423)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Threads', 'iHeart: Music, Radio, Podcasts']\nB: ['Instagram', 'Spotify']\nC: ['X', 'Pandora']\nD: ['Gmail', 'YT Music']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_9.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_10_15.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Pluto TV - Live TV and Movies', 'Plantin']\nB: ['Tiktok', 'Picturethis']\nC: ['Triller', 'Tripadvisor']\nD: ['Likee', 'Google Play Store']\n", + "question": "The corresponding actions are: step 1: CLICK: (855, 743)\nstep 2: CLICK: (563, 248)\nstep 3: PRESS_HOME\nstep 4: CLICK: (397, 726)\nstep 5: PRESS_HOME\nstep 6: CLICK: (822, 739)\nstep 7: CLICK: (72, 76)\nstep 8: CLICK: (869, 476)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (855, 743)\nstep 2: CLICK: (563, 248)\nstep 3: PRESS_HOME\nstep 4: CLICK: (397, 726)\nstep 5: PRESS_HOME\nstep 6: CLICK: (822, 739)\nstep 7: CLICK: (72, 76)\nstep 8: CLICK: (869, 476)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Pluto TV - Live TV and Movies', 'Plantin']\nB: ['Tiktok', 'Picturethis']\nC: ['Triller', 'Tripadvisor']\nD: ['Likee', 'Google Play Store']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_11_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_11_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_11_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_11_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_11_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_11_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_11_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_11_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_11_8.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Youtube', 'Contacts', 'iNaturalist']\nB: ['Netflix', 'PlantNet', 'Applock Pro - APP Lock & Guard']\nC: ['Triller', 'Google Play Store', 'Setting']\nD: ['Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos', 'Tripadvisor']\n", + "question": "The corresponding actions are: step 1: CLICK: (668, 916)\nstep 2: CLICK: (384, 79)\nstep 3: TYPE: triller\nstep 4: CLICK: (880, 891)\nstep 5: CLICK: (872, 437)\nstep 6: PRESS_HOME\nstep 7: CLICK: (558, 929)\nstep 8: CLICK: (455, 498)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete 
GUI navigation episode. The corresponding actions are: step 1: CLICK: (668, 916)\nstep 2: CLICK: (384, 79)\nstep 3: TYPE: triller\nstep 4: CLICK: (880, 891)\nstep 5: CLICK: (872, 437)\nstep 6: PRESS_HOME\nstep 7: CLICK: (558, 929)\nstep 8: CLICK: (455, 498)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Youtube', 'Contacts', 'iNaturalist']\nB: ['Netflix', 'PlantNet', 'Applock Pro - APP Lock & Guard']\nC: ['Triller', 'Google Play Store', 'Setting']\nD: ['Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos', 'Tripadvisor']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_12_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_12_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_12_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_12_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_12_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_12_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_12_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_12_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_12_8.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Zoho Meeting', 'Threads']\nB: ['Google Meet', 'Instagram']\nC: ['Microsoft Teams', 'Facebook']\nD: ['ZOOM Cloud Meetings', 'Messenger']\n", + "question": "The corresponding actions are: step 1: CLICK: (783, 692)\nstep 2: CLICK: (293, 376)\nstep 3: CLICK: (496, 488)\nstep 4: CLICK: (489, 76)\nstep 5: PRESS_HOME\nstep 6: CLICK: (670, 160)\nstep 7: CLICK: (403, 528)\nstep 8: CLICK: (464, 910)\nstep 9: TYPE: 9298916954\nstep 10: SCROLL: UP\nstep 11: CLICK: (763, 515)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI 
navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (783, 692)\nstep 2: CLICK: (293, 376)\nstep 3: CLICK: (496, 488)\nstep 4: CLICK: (489, 76)\nstep 5: PRESS_HOME\nstep 6: CLICK: (670, 160)\nstep 7: CLICK: (403, 528)\nstep 8: CLICK: (464, 910)\nstep 9: TYPE: 9298916954\nstep 10: SCROLL: UP\nstep 11: CLICK: (763, 515)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Zoho Meeting', 'Threads']\nB: ['Google Meet', 'Instagram']\nC: ['Microsoft Teams', 'Facebook']\nD: ['ZOOM Cloud Meetings', 'Messenger']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_13_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_13_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_13_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_13_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_13_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_13_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_13_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_13_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_13_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_13_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_13_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_13_11.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['ChatOn - AI Chat Bot Assistant', 'Quora']\nB: ['WOMBO Dream-AI Art Generator', 'Bing: chat with AI & GPT4']\nC: ['Microsoft Copilot', 'Opera']\nD: ['ChatGPT', 'wikiHow']\n", + "question": "The corresponding actions are: 
step 1: CLICK: (565, 403)\nstep 2: TYPE: tell me about Bolzano-Weierstrass theorem\nstep 3: CLICK: (694, 418)\nstep 4: SCROLL: UP\nstep 5: PRESS_HOME\nstep 6: CLICK: (138, 413)\nstep 7: CLICK: (210, 111)\nstep 8: TYPE: Bolzano-Weierstrass theorem\nstep 9: CLICK: (889, 697)\nstep 10: SCROLL: UP\nstep 11: CLICK: (357, 615)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (565, 403)\nstep 2: TYPE: tell me about Bolzano-Weierstrass theorem\nstep 3: CLICK: (694, 418)\nstep 4: SCROLL: UP\nstep 5: PRESS_HOME\nstep 6: CLICK: (138, 413)\nstep 7: CLICK: (210, 111)\nstep 8: TYPE: Bolzano-Weierstrass theorem\nstep 9: CLICK: (889, 697)\nstep 10: SCROLL: UP\nstep 11: CLICK: (357, 615)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['ChatOn - AI Chat Bot Assistant', 'Quora']\nB: ['WOMBO Dream-AI Art Generator', 'Bing: chat with AI & GPT4']\nC: ['Microsoft Copilot', 'Opera']\nD: ['ChatGPT', 'wikiHow']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_14_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_14_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_14_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_14_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_14_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_14_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_14_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_14_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_14_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_14_9.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_14_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_14_11.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Threads', 'DuckDuckgo']\nB: ['Whatsapp', 'Firefox']\nC: ['Tumblr', 'Edge']\nD: ['Instagram', 'Bing: chat with AI & GPT4']\n", + "question": "The corresponding actions are: step 1: CLICK: (573, 704)\nstep 2: CLICK: (170, 67)\nstep 3: TYPE: Japan\nstep 4: CLICK: (903, 693)\nstep 5: CLICK: (128, 454)\nstep 6: CLICK: (983, 68)\nstep 7: CLICK: (855, 193)\nstep 8: CLICK: (706, 599)\nstep 9: PRESS_HOME\nstep 10: CLICK: (148, 254)\nstep 11: CLICK: (495, 921)\nstep 12: CLICK: (474, 492)\nstep 13: CLICK: (687, 424)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (573, 704)\nstep 2: CLICK: (170, 67)\nstep 3: TYPE: Japan\nstep 4: CLICK: (903, 693)\nstep 5: CLICK: (128, 454)\nstep 6: CLICK: (983, 68)\nstep 7: CLICK: (855, 193)\nstep 8: CLICK: (706, 599)\nstep 9: PRESS_HOME\nstep 10: CLICK: (148, 254)\nstep 11: CLICK: (495, 921)\nstep 12: CLICK: (474, 492)\nstep 13: CLICK: (687, 424)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Threads', 'DuckDuckgo']\nB: ['Whatsapp', 'Firefox']\nC: ['Tumblr', 'Edge']\nD: ['Instagram', 'Bing: chat with AI & GPT4']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_15_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_15_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_15_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_15_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_15_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_15_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_15_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_15_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_15_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_15_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_15_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_15_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_15_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_15_13.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Calculator', 'Tripadvisor']\nB: ['aCalendar', 'TradingView: Track All Markets']\nC: ['ClevCalc - Calculator', 'Contacts']\nD: ['Clock', 'Setting']\n", + "question": "The 
corresponding actions are: step 1: CLICK: (453, 491)\nstep 2: SCROLL: UP\nstep 3: CLICK: (160, 682)\nstep 4: SCROLL: UP\nstep 5: CLICK: (437, 672)\nstep 6: CLICK: (480, 427)\nstep 7: CLICK: (480, 427)\nstep 8: PRESS_HOME\nstep 9: SCROLL: UP\nstep 10: CLICK: (795, 766)\nstep 11: CLICK: (781, 846)\nstep 12: CLICK: (651, 599)\nstep 13: CLICK: (796, 724)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (453, 491)\nstep 2: SCROLL: UP\nstep 3: CLICK: (160, 682)\nstep 4: SCROLL: UP\nstep 5: CLICK: (437, 672)\nstep 6: CLICK: (480, 427)\nstep 7: CLICK: (480, 427)\nstep 8: PRESS_HOME\nstep 9: SCROLL: UP\nstep 10: CLICK: (795, 766)\nstep 11: CLICK: (781, 846)\nstep 12: CLICK: (651, 599)\nstep 13: CLICK: (796, 724)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Calculator', 'Tripadvisor']\nB: ['aCalendar', 'TradingView: Track All Markets']\nC: ['ClevCalc - Calculator', 'Contacts']\nD: ['Clock', 'Setting']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_16_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_16_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_16_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_16_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_16_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_16_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_16_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_16_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_16_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_16_9.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_16_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_16_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_16_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_16_13.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Waze Navigation & Live Traffic', 'Plantin']\nB: ['Uber', 'TradingView: Track All Markets']\nC: ['Yandex Navigator', 'Contacts']\nD: ['Citymapper', 'Google Play Store']\n", + "question": "The corresponding actions are: step 1: CLICK: (837, 522)\nstep 2: CLICK: (505, 466)\nstep 3: CLICK: (371, 294)\nstep 4: TYPE: park\nstep 5: CLICK: (540, 465)\nstep 6: PRESS_RECENT\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (415, 561)\nstep 9: CLICK: (333, 86)\nstep 10: CLICK: (304, 429)\nstep 11: CLICK: (384, 554)\nstep 12: CLICK: (427, 317)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (837, 522)\nstep 2: CLICK: (505, 466)\nstep 3: CLICK: (371, 294)\nstep 4: TYPE: park\nstep 5: CLICK: (540, 465)\nstep 6: PRESS_RECENT\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (415, 561)\nstep 9: CLICK: (333, 86)\nstep 10: CLICK: (304, 429)\nstep 11: CLICK: (384, 554)\nstep 12: CLICK: (427, 317)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Waze Navigation & Live Traffic', 'Plantin']\nB: ['Uber', 'TradingView: Track All Markets']\nC: ['Yandex Navigator', 'Contacts']\nD: ['Citymapper', 'Google Play Store']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_17_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_17_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_17_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_17_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_17_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_17_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_17_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_17_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_17_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_17_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_17_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_17_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_17_12.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['iNaturalist', 'X']\nB: ['Applock Pro - APP Lock & Guard', 'Messenger']\nC: ['PlantNet', 'Tumblr']\nD: ['Vaulty:Hide Pictures Videos', 'Instagram']\n", + "question": "The corresponding actions are: step 1: CLICK: (603, 
526)\nstep 2: CLICK: (380, 251)\nstep 3: CLICK: (299, 150)\nstep 4: CLICK: (447, 558)\nstep 5: CLICK: (237, 942)\nstep 6: SCROLL: UP\nstep 7: CLICK: (862, 520)\nstep 8: CLICK: (382, 483)\nstep 9: CLICK: (920, 72)\nstep 10: CLICK: (920, 72)\nstep 11: CLICK: (451, 956)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (603, 526)\nstep 2: CLICK: (380, 251)\nstep 3: CLICK: (299, 150)\nstep 4: CLICK: (447, 558)\nstep 5: CLICK: (237, 942)\nstep 6: SCROLL: UP\nstep 7: CLICK: (862, 520)\nstep 8: CLICK: (382, 483)\nstep 9: CLICK: (920, 72)\nstep 10: CLICK: (920, 72)\nstep 11: CLICK: (451, 956)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['iNaturalist', 'X']\nB: ['Applock Pro - APP Lock & Guard', 'Messenger']\nC: ['PlantNet', 'Tumblr']\nD: ['Vaulty:Hide Pictures Videos', 'Instagram']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_18_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_18_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_18_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_18_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_18_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_18_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_18_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_18_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_18_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_18_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_18_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_18_11.png" + ], + "output": 
"D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Keep', 'X']\nB: ['Simplenote', 'Whatsapp']\nC: ['Microsoft Word', 'Threads']\nD: ['WPS office', 'Facebook']\n", + "question": "The corresponding actions are: step 1: CLICK: (918, 676)\nstep 2: CLICK: (961, 330)\nstep 3: CLICK: (672, 417)\nstep 4: SCROLL: UP\nstep 5: CLICK: (393, 595)\nstep 6: CLICK: (738, 62)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (918, 676)\nstep 2: CLICK: (961, 330)\nstep 3: CLICK: (672, 417)\nstep 4: SCROLL: UP\nstep 5: CLICK: (393, 595)\nstep 6: CLICK: (738, 62)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Keep', 'X']\nB: ['Simplenote', 'Whatsapp']\nC: ['Microsoft Word', 'Threads']\nD: ['WPS office', 'Facebook']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_19_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_19_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_19_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_19_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_19_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_19_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_19_6.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Instagram', 'Google Play Store', 'Setting']\nB: ['Gmail', 'iNaturalist', 'Applock Pro - APP Lock & Guard']\nC: ['Whatsapp', 'PlantNet', 'TradingView: Track All Markets']\nD: ['Tumblr', 'Vaulty:Hide Pictures Videos', 
'Google Play Store']\n", + "question": "The corresponding actions are: step 1: CLICK: (131, 706)\nstep 2: CLICK: (150, 76)\nstep 3: TYPE: tiktok\nstep 4: CLICK: (156, 143)\nstep 5: CLICK: (657, 372)\nstep 6: PRESS_HOME\nstep 7: CLICK: (417, 711)\nstep 8: PRESS_HOME\nstep 9: CLICK: (136, 699)\nstep 10: CLICK: (138, 56)\nstep 11: CLICK: (980, 64)\nstep 12: TYPE: Instagram\nstep 13: CLICK: (225, 135)\nstep 14: CLICK: (540, 373)\nstep 15: CLICK: (691, 376)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (131, 706)\nstep 2: CLICK: (150, 76)\nstep 3: TYPE: tiktok\nstep 4: CLICK: (156, 143)\nstep 5: CLICK: (657, 372)\nstep 6: PRESS_HOME\nstep 7: CLICK: (417, 711)\nstep 8: PRESS_HOME\nstep 9: CLICK: (136, 699)\nstep 10: CLICK: (138, 56)\nstep 11: CLICK: (980, 64)\nstep 12: TYPE: Instagram\nstep 13: CLICK: (225, 135)\nstep 14: CLICK: (540, 373)\nstep 15: CLICK: (691, 376)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Instagram', 'Google Play Store', 'Setting']\nB: ['Gmail', 'iNaturalist', 'Applock Pro - APP Lock & Guard']\nC: ['Whatsapp', 'PlantNet', 'TradingView: Track All Markets']\nD: ['Tumblr', 'Vaulty:Hide Pictures Videos', 'Google Play Store']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_6.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_20_15.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Play Store', 'Chrome']\nB: ['PlantNet', 'Bing: chat with AI & GPT4']\nC: ['TradingView: Track All Markets', 'Quora']\nD: ['Contacts', 'wikiHow']\n", + "question": "The corresponding actions are: step 1: CLICK: (687, 745)\nstep 2: CLICK: (358, 388)\nstep 3: TYPE: Task Manager Apps\nstep 4: CLICK: (920, 881)\nstep 5: PRESS_HOME\nstep 6: CLICK: (502, 755)\nstep 7: CLICK: (383, 84)\nstep 8: TYPE: Todoist\nstep 9: CLICK: (918, 882)\nstep 10: CLICK: (861, 465)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (687, 745)\nstep 2: CLICK: (358, 388)\nstep 3: TYPE: Task Manager Apps\nstep 4: CLICK: (920, 881)\nstep 5: PRESS_HOME\nstep 6: CLICK: (502, 755)\nstep 7: CLICK: (383, 84)\nstep 8: TYPE: Todoist\nstep 9: CLICK: (918, 882)\nstep 10: CLICK: (861, 465)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Chrome']\nB: ['PlantNet', 'Bing: chat with AI & GPT4']\nC: ['TradingView: Track All Markets', 'Quora']\nD: ['Contacts', 'wikiHow']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_21_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_21_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_21_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_21_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_21_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_21_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_21_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_21_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_21_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_21_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_21_10.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Threads', 'Chatbot AI & Smart Assistant']\nB: ['Tumblr', 'WOMBO Dream-AI Art Generator']\nC: ['Whatsapp', 'Remix:AI Image Creator']\nD: ['X', 'Picsart AI Photo Editor,Video']\n", + "question": "The corresponding actions are: step 1: CLICK: (158, 514)\nstep 2: CLICK: (469, 935)\nstep 3: CLICK: (498, 315)\nstep 4: CLICK: (231, 869)\nstep 5: CLICK: (85, 509)\nstep 6: SCROLL: UP\nstep 7: CLICK: (931, 74)\nstep 8: CLICK: (775, 
83)\nstep 9: CLICK: (647, 655)\nstep 10: CLICK: (323, 772)\nstep 11: CLICK: (899, 934)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (158, 514)\nstep 2: CLICK: (469, 935)\nstep 3: CLICK: (498, 315)\nstep 4: CLICK: (231, 869)\nstep 5: CLICK: (85, 509)\nstep 6: SCROLL: UP\nstep 7: CLICK: (931, 74)\nstep 8: CLICK: (775, 83)\nstep 9: CLICK: (647, 655)\nstep 10: CLICK: (323, 772)\nstep 11: CLICK: (899, 934)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Threads', 'Chatbot AI & Smart Assistant']\nB: ['Tumblr', 'WOMBO Dream-AI Art Generator']\nC: ['Whatsapp', 'Remix:AI Image Creator']\nD: ['X', 'Picsart AI Photo Editor,Video']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_22_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_22_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_22_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_22_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_22_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_22_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_22_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_22_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_22_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_22_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_22_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_22_11.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Vaulty:Hide Pictures 
Videos', 'Triller', 'Tripadvisor']\nB: ['Contacts', 'Pluto TV - Live TV and Movies', 'Vaulty:Hide Pictures Videos']\nC: ['Applock Pro - APP Lock & Guard', 'Youtube', 'Picturethis']\nD: ['Google Play Store', 'Likee', 'Setting']\n", + "question": "The corresponding actions are: step 1: CLICK: (830, 742)\nstep 2: CLICK: (863, 471)\nstep 3: PRESS_HOME\nstep 4: CLICK: (352, 742)\nstep 5: CLICK: (338, 465)\nstep 6: SCROLL: RIGHT\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (830, 742)\nstep 2: CLICK: (863, 471)\nstep 3: PRESS_HOME\nstep 4: CLICK: (352, 742)\nstep 5: CLICK: (338, 465)\nstep 6: SCROLL: RIGHT\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Vaulty:Hide Pictures Videos', 'Triller', 'Tripadvisor']\nB: ['Contacts', 'Pluto TV - Live TV and Movies', 'Vaulty:Hide Pictures Videos']\nC: ['Applock Pro - APP Lock & Guard', 'Youtube', 'Picturethis']\nD: ['Google Play Store', 'Likee', 'Setting']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_23_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_23_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_23_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_23_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_23_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_23_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_23_6.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Netflix', 'Audible: Audio Entertainment']\nB: ['Likee', 'Amazon Kindle']\nC: ['Tiktok', 'Google Play Books & 
Audiobooks']\nD: ['Pluto TV - Live TV and Movies', 'Libby, by OverDrive']\n", + "question": "The corresponding actions are: step 1: CLICK: (402, 641)\nstep 2: CLICK: (937, 77)\nstep 3: TYPE: American Civil War\nstep 4: CLICK: (888, 73)\nstep 5: CLICK: (204, 576)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (387, 122)\nstep 9: CLICK: (385, 76)\nstep 10: TYPE: American Civil War\nstep 11: CLICK: (303, 292)\nstep 12: CLICK: (708, 313)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (402, 641)\nstep 2: CLICK: (937, 77)\nstep 3: TYPE: American Civil War\nstep 4: CLICK: (888, 73)\nstep 5: CLICK: (204, 576)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (387, 122)\nstep 9: CLICK: (385, 76)\nstep 10: TYPE: American Civil War\nstep 11: CLICK: (303, 292)\nstep 12: CLICK: (708, 313)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Netflix', 'Audible: Audio Entertainment']\nB: ['Likee', 'Amazon Kindle']\nC: ['Tiktok', 'Google Play Books & Audiobooks']\nD: ['Pluto TV - Live TV and Movies', 'Libby, by OverDrive']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_24_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_24_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_24_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_24_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_24_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_24_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_24_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_24_7.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_24_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_24_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_24_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_24_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_24_12.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Everand', 'Picturethis']\nB: ['Libby, by OverDrive', 'iNaturalist']\nC: ['Ploter - Ebook, Audiobook, PDF', 'Setting']\nD: ['Pocket FM: Audio Series', 'Tripadvisor']\n", + "question": "The corresponding actions are: step 1: CLICK: (381, 534)\nstep 2: CLICK: (218, 958)\nstep 3: CLICK: (879, 656)\nstep 4: PRESS_HOME\nstep 5: CLICK: (210, 526)\nstep 6: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (381, 534)\nstep 2: CLICK: (218, 958)\nstep 3: CLICK: (879, 656)\nstep 4: PRESS_HOME\nstep 5: CLICK: (210, 526)\nstep 6: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Everand', 'Picturethis']\nB: ['Libby, by OverDrive', 'iNaturalist']\nC: ['Ploter - Ebook, Audiobook, PDF', 'Setting']\nD: ['Pocket FM: Audio Series', 'Tripadvisor']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_25_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_25_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_25_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_25_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_25_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_25_5.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Plantin', 'Pocket FM: Audio Series']\nB: ['Contacts', 'Ploter - Ebook, Audiobook, PDF']\nC: ['Setting', 'Libby, by OverDrive']\nD: ['Picturethis', 'Amazon Kindle']\n", + "question": "The corresponding actions are: step 1: CLICK: (547, 654)\nstep 2: SCROLL: UP\nstep 3: CLICK: (477, 647)\nstep 4: CLICK: (904, 717)\nstep 5: PRESS_HOME\nstep 6: CLICK: (820, 513)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (547, 654)\nstep 2: SCROLL: UP\nstep 3: CLICK: (477, 647)\nstep 4: CLICK: (904, 717)\nstep 5: PRESS_HOME\nstep 6: CLICK: (820, 513)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Plantin', 'Pocket FM: Audio Series']\nB: ['Contacts', 'Ploter - Ebook, Audiobook, PDF']\nC: ['Setting', 'Libby, by OverDrive']\nD: ['Picturethis', 'Amazon Kindle']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_26_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_26_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_26_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_26_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_26_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_26_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_26_6.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Bing: chat with AI & GPT4', 'Google Keep']\nB: ['Quora', 'Dropbox Paper']\nC: ['Wikipedia', 'Notepad - Notes and To Do List']\nD: ['DuckDuckgo', 'Simplenote']\n", + "question": "The corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (293, 118)\nstep 3: CLICK: (354, 46)\nstep 4: CLICK: (970, 68)\nstep 5: TYPE: carrot cake ingredients\nstep 6: CLICK: (167, 154)\nstep 7: PRESS_HOME\nstep 8: CLICK: (289, 417)\nstep 9: CLICK: (271, 891)\nstep 10: TYPE: shopping list for making carrot cake: baking powder, carrot, ground allspice\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (293, 118)\nstep 3: CLICK: (354, 46)\nstep 4: CLICK: (970, 68)\nstep 5: TYPE: carrot cake ingredients\nstep 6: CLICK: (167, 154)\nstep 7: PRESS_HOME\nstep 8: CLICK: (289, 417)\nstep 9: CLICK: (271, 891)\nstep 10: TYPE: shopping list for making carrot cake: baking powder, carrot, ground allspice\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Bing: chat with AI & GPT4', 'Google Keep']\nB: ['Quora', 'Dropbox Paper']\nC: ['Wikipedia', 'Notepad - Notes and To Do List']\nD: ['DuckDuckgo', 'Simplenote']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_27_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_27_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_27_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_27_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_27_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_27_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_27_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_27_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_27_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_27_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_27_10.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Contacts', 'Threads']\nB: ['Plantin', 'X']\nC: ['Setting', 'Facebook']\nD: ['Google Play Store', 'Instagram']\n", + "question": "The corresponding actions are: step 1: CLICK: (936, 76)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (933, 909)\nstep 4: PRESS_HOME\nstep 5: CLICK: (375, 822)\nstep 6: PRESS_HOME\nstep 7: CLICK: (844, 838)\nstep 8: CLICK: (819, 70)\nstep 
9: CLICK: (953, 69)\nstep 10: TYPE: instagram\nstep 11: CLICK: (900, 912)\nstep 12: CLICK: (864, 323)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (936, 76)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (933, 909)\nstep 4: PRESS_HOME\nstep 5: CLICK: (375, 822)\nstep 6: PRESS_HOME\nstep 7: CLICK: (844, 838)\nstep 8: CLICK: (819, 70)\nstep 9: CLICK: (953, 69)\nstep 10: TYPE: instagram\nstep 11: CLICK: (900, 912)\nstep 12: CLICK: (864, 323)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Contacts', 'Threads']\nB: ['Plantin', 'X']\nC: ['Setting', 'Facebook']\nD: ['Google Play Store', 'Instagram']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_28_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_28_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_28_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_28_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_28_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_28_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_28_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_28_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_28_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_28_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_28_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_28_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_28_12.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + 
"options": "A: ['Lightroom Photo & Video Editor', 'Tumblr']\nB: ['Adobe Express: AI Video Design', 'Whatsapp']\nC: ['Gallery-photo gallery,album', 'Instagram']\nD: ['Textify- Art Font Photo Editor', 'Facebook']\n", + "question": "The corresponding actions are: step 1: CLICK: (822, 533)\nstep 2: CLICK: (342, 357)\nstep 3: SCROLL: UP\nstep 4: CLICK: (241, 641)\nstep 5: CLICK: (74, 934)\nstep 6: SCROLL: LEFT\nstep 7: CLICK: (918, 68)\nstep 8: CLICK: (938, 66)\nstep 9: CLICK: (784, 704)\nstep 10: SCROLL: UP\nstep 11: SCROLL: UP\nstep 12: CLICK: (114, 526)\nstep 13: CLICK: (400, 535)\nstep 14: CLICK: (875, 74)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (822, 533)\nstep 2: CLICK: (342, 357)\nstep 3: SCROLL: UP\nstep 4: CLICK: (241, 641)\nstep 5: CLICK: (74, 934)\nstep 6: SCROLL: LEFT\nstep 7: CLICK: (918, 68)\nstep 8: CLICK: (938, 66)\nstep 9: CLICK: (784, 704)\nstep 10: SCROLL: UP\nstep 11: SCROLL: UP\nstep 12: CLICK: (114, 526)\nstep 13: CLICK: (400, 535)\nstep 14: CLICK: (875, 74)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Lightroom Photo & Video Editor', 'Tumblr']\nB: ['Adobe Express: AI Video Design', 'Whatsapp']\nC: ['Gallery-photo gallery,album', 'Instagram']\nD: ['Textify- Art Font Photo Editor', 'Facebook']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_29_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_29_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_29_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_29_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_29_4.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_29_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_29_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_29_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_29_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_29_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_29_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_29_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_29_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_29_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_29_14.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Calculator', 'Tumblr']\nB: ['Google Drive', 'Instagram']\nC: ['Basic Calculator: GPA & Math', 'Threads']\nD: ['All-In-One Calculator', 'Messenger']\n", + "question": "The corresponding actions are: step 1: CLICK: (148, 537)\nstep 2: SCROLL: UP\nstep 3: CLICK: (905, 352)\nstep 4: CLICK: (273, 525)\nstep 5: PRESS_HOME\nstep 6: CLICK: (604, 160)\nstep 7: CLICK: (884, 147)\nstep 8: CLICK: (304, 677)\nstep 9: CLICK: (166, 903)\nstep 10: TYPE: https://docs.google.com/document/d/1F1mYgUNic6AwXcwpgI0fdfotTjP8R_v4ogXTNTK6u7U/edit?usp=drivesdk\nstep 11: CLICK: (862, 431)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (148, 537)\nstep 2: SCROLL: UP\nstep 3: CLICK: (905, 352)\nstep 4: CLICK: (273, 525)\nstep 5: PRESS_HOME\nstep 6: CLICK: (604, 160)\nstep 7: CLICK: (884, 147)\nstep 8: CLICK: (304, 677)\nstep 9: CLICK: (166, 903)\nstep 10: TYPE: https://docs.google.com/document/d/1F1mYgUNic6AwXcwpgI0fdfotTjP8R_v4ogXTNTK6u7U/edit?usp=drivesdk\nstep 11: CLICK: (862, 431)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Calculator', 'Tumblr']\nB: ['Google Drive', 'Instagram']\nC: ['Basic Calculator: GPA & Math', 'Threads']\nD: ['All-In-One Calculator', 'Messenger']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_30_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_30_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_30_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_30_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_30_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_30_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_30_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_30_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_30_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_30_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_30_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_30_11.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Play Store', 'Triller']\nB: ['Plantin', 'Tiktok']\nC: ['iNaturalist', 'Pluto TV - Live TV and Movies']\nD: ['Setting', 'YouTube']\n", + "question": "The corresponding actions are: step 1: CLICK: (846, 824)\nstep 2: CLICK: (346, 511)\nstep 3: CLICK: 
(315, 623)\nstep 4: CLICK: (824, 74)\nstep 5: TYPE: YouTube\nstep 6: CLICK: (259, 147)\nstep 7: CLICK: (340, 520)\nstep 8: CLICK: (902, 404)\nstep 9: PRESS_HOME\nstep 10: SCROLL: RIGHT\nstep 11: CLICK: (846, 658)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (846, 824)\nstep 2: CLICK: (346, 511)\nstep 3: CLICK: (315, 623)\nstep 4: CLICK: (824, 74)\nstep 5: TYPE: YouTube\nstep 6: CLICK: (259, 147)\nstep 7: CLICK: (340, 520)\nstep 8: CLICK: (902, 404)\nstep 9: PRESS_HOME\nstep 10: SCROLL: RIGHT\nstep 11: CLICK: (846, 658)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Triller']\nB: ['Plantin', 'Tiktok']\nC: ['iNaturalist', 'Pluto TV - Live TV and Movies']\nD: ['Setting', 'YouTube']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_31_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_31_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_31_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_31_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_31_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_31_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_31_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_31_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_31_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_31_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_31_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_31_11.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": 
"GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Calendar', 'Opera News']\nB: ['Simple Calendar - easy planner', 'NewsBreak']\nC: ['Files', 'Microsoft News']\nD: ['aCalendar', 'SmartNews:News That Matters']\n", + "question": "The corresponding actions are: step 1: PRESS_HOME\nstep 2: SCROLL: LEFT\nstep 3: CLICK: (143, 141)\nstep 4: CLICK: (947, 70)\nstep 5: TYPE: ai\nstep 6: CLICK: (306, 122)\nstep 7: SCROLL: UP\nstep 8: IMPOSSIBLE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: PRESS_HOME\nstep 2: SCROLL: LEFT\nstep 3: CLICK: (143, 141)\nstep 4: CLICK: (947, 70)\nstep 5: TYPE: ai\nstep 6: CLICK: (306, 122)\nstep 7: SCROLL: UP\nstep 8: IMPOSSIBLE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Calendar', 'Opera News']\nB: ['Simple Calendar - easy planner', 'NewsBreak']\nC: ['Files', 'Microsoft News']\nD: ['aCalendar', 'SmartNews:News That Matters']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_32_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_32_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_32_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_32_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_32_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_32_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_32_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_32_7.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Facebook', 'Lazada']\nB: ['X', 'AliExpress']\nC: ['Instagram', 'SSENSE']\nD: ['Gmail', 'Net-a-Porte']\n", + 
"question": "The corresponding actions are: step 1: CLICK: (609, 497)\nstep 2: CLICK: (82, 52)\nstep 3: CLICK: (387, 54)\nstep 4: CLICK: (918, 55)\nstep 5: TYPE: portable speaker recommendation\nstep 6: CLICK: (914, 913)\nstep 7: CLICK: (576, 675)\nstep 8: PRESS_HOME\nstep 9: CLICK: (797, 116)\nstep 10: CLICK: (190, 109)\nstep 11: TYPE: Anker Soundcore\nstep 12: CLICK: (927, 919)\nstep 13: CLICK: (213, 534)\nstep 14: CLICK: (455, 939)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (609, 497)\nstep 2: CLICK: (82, 52)\nstep 3: CLICK: (387, 54)\nstep 4: CLICK: (918, 55)\nstep 5: TYPE: portable speaker recommendation\nstep 6: CLICK: (914, 913)\nstep 7: CLICK: (576, 675)\nstep 8: PRESS_HOME\nstep 9: CLICK: (797, 116)\nstep 10: CLICK: (190, 109)\nstep 11: TYPE: Anker Soundcore\nstep 12: CLICK: (927, 919)\nstep 13: CLICK: (213, 534)\nstep 14: CLICK: (455, 939)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Facebook', 'Lazada']\nB: ['X', 'AliExpress']\nC: ['Instagram', 'SSENSE']\nD: ['Gmail', 'Net-a-Porte']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_33_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_33_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_33_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_33_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_33_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_33_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_33_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_33_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_33_8.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_33_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_33_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_33_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_33_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_33_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_33_14.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['GPS', 'Vaulty:Hide Pictures Videos']\nB: ['GPS, Maps, Voice Navigation', 'Picturethis']\nC: ['Lyft', 'iNaturalist']\nD: ['Yandex Navigator', 'Google Play Store']\n", + "question": "The corresponding actions are: step 1: CLICK: (661, 505)\nstep 2: CLICK: (67, 751)\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (502, 59)\nstep 5: CLICK: (719, 78)\nstep 6: TYPE: sports arena\nstep 7: CLICK: (327, 168)\nstep 8: IMPOSSIBLE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (661, 505)\nstep 2: CLICK: (67, 751)\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (502, 59)\nstep 5: CLICK: (719, 78)\nstep 6: TYPE: sports arena\nstep 7: CLICK: (327, 168)\nstep 8: IMPOSSIBLE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['GPS', 'Vaulty:Hide Pictures Videos']\nB: ['GPS, Maps, Voice Navigation', 'Picturethis']\nC: ['Lyft', 'iNaturalist']\nD: ['Yandex Navigator', 'Google Play Store']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_34_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_34_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_34_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_34_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_34_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_34_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_34_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_34_7.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Play Store', 'Setting']\nB: ['Plantin', 'PlantNet']\nC: ['Picturethis', 'Picturethis']\nD: ['PlantNet', 'Contacts']\n", + "question": "The corresponding actions are: step 1: CLICK: (427, 137)\nstep 2: CLICK: (307, 136)\nstep 3: TYPE: Flipkart\nstep 4: CLICK: (936, 773)\nstep 5: CLICK: (263, 394)\nstep 6: CLICK: (681, 331)\nstep 7: CLICK: (630, 549)\nstep 8: PRESS_HOME\nstep 9: CLICK: (576, 146)\nstep 10: CLICK: (174, 457)\nstep 11: CLICK: (498, 804)\nstep 12: CLICK: (939, 129)\nstep 13: TYPE: Flipkart\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (427, 137)\nstep 2: CLICK: (307, 136)\nstep 3: TYPE: Flipkart\nstep 4: CLICK: (936, 773)\nstep 5: CLICK: (263, 394)\nstep 6: CLICK: (681, 331)\nstep 7: CLICK: (630, 549)\nstep 8: PRESS_HOME\nstep 9: CLICK: (576, 146)\nstep 10: CLICK: (174, 457)\nstep 11: CLICK: (498, 804)\nstep 12: CLICK: (939, 129)\nstep 13: TYPE: Flipkart\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Setting']\nB: ['Plantin', 'PlantNet']\nC: ['Picturethis', 'Picturethis']\nD: ['PlantNet', 'Contacts']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_35_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_35_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_35_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_35_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_35_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_35_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_35_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_35_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_35_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_35_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_35_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_35_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_35_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_35_13.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Keep', 'Bing: chat with AI & GPT4']\nB: ['Simplenote', 'Opera']\nC: ['Microsoft word', 'DuckDuckgo']\nD: ['WPS office', 'Quora']\n", + "question": "The 
corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (567, 921)\nstep 3: CLICK: (453, 90)\nstep 4: CLICK: (959, 84)\nstep 5: TYPE: sushi ingredients\nstep 6: CLICK: (238, 170)\nstep 7: PRESS_HOME\nstep 8: CLICK: (927, 667)\nstep 9: SCROLL: RIGHT\nstep 10: SCROLL: RIGHT\nstep 11: CLICK: (583, 503)\nstep 12: CLICK: (240, 532)\nstep 13: TYPE: shopping list for sushi:rice, vinegar, wine\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (567, 921)\nstep 3: CLICK: (453, 90)\nstep 4: CLICK: (959, 84)\nstep 5: TYPE: sushi ingredients\nstep 6: CLICK: (238, 170)\nstep 7: PRESS_HOME\nstep 8: CLICK: (927, 667)\nstep 9: SCROLL: RIGHT\nstep 10: SCROLL: RIGHT\nstep 11: CLICK: (583, 503)\nstep 12: CLICK: (240, 532)\nstep 13: TYPE: shopping list for sushi:rice, vinegar, wine\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Keep', 'Bing: chat with AI & GPT4']\nB: ['Simplenote', 'Opera']\nC: ['Microsoft word', 'DuckDuckgo']\nD: ['WPS office', 'Quora']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_36_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_36_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_36_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_36_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_36_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_36_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_36_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_36_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_36_8.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_36_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_36_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_36_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_36_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_36_13.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Chrome', 'Microsoft to do']\nB: ['Wikipedia', 'Things']\nC: ['Opera', 'TickTick']\nD: ['DuckDuckGo', 'To-Do List']\n", + "question": "The corresponding actions are: step 1: CLICK: (386, 135)\nstep 2: CLICK: (500, 73)\nstep 3: TYPE: when is the next super bowl game\nstep 4: CLICK: (951, 917)\nstep 5: PRESS_HOME\nstep 6: CLICK: (627, 657)\nstep 7: CLICK: (918, 853)\nstep 8: TYPE: Feb9 2025 super bowl game\nstep 9: CLICK: (90, 572)\nstep 10: TYPE: \nstep 11: CLICK: (922, 646)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (386, 135)\nstep 2: CLICK: (500, 73)\nstep 3: TYPE: when is the next super bowl game\nstep 4: CLICK: (951, 917)\nstep 5: PRESS_HOME\nstep 6: CLICK: (627, 657)\nstep 7: CLICK: (918, 853)\nstep 8: TYPE: Feb9 2025 super bowl game\nstep 9: CLICK: (90, 572)\nstep 10: TYPE: \nstep 11: CLICK: (922, 646)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Chrome', 'Microsoft to do']\nB: ['Wikipedia', 'Things']\nC: ['Opera', 'TickTick']\nD: ['DuckDuckGo', 'To-Do List']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_37_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_37_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_37_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_37_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_37_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_37_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_37_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_37_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_37_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_37_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_37_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_37_11.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Booking.com', 'Waze Navigation & Live Traffic']\nB: ['Shazam: Find Music & Concerts', 'Lyft']\nC: ['Apartments.com Rental Search', 'Google Map']\nD: ['Traveloka', 'Yandex Navigator']\n", + "question": "The corresponding actions are: step 1: CLICK: (432, 106)\nstep 2: CLICK: (327, 362)\nstep 3: PRESS_HOME\nstep 4: CLICK: (292, 242)\nstep 5: CLICK: 
(110, 82)\nstep 6: TYPE: 2580-2590 California St\nstep 7: CLICK: (183, 182)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (432, 106)\nstep 2: CLICK: (327, 362)\nstep 3: PRESS_HOME\nstep 4: CLICK: (292, 242)\nstep 5: CLICK: (110, 82)\nstep 6: TYPE: 2580-2590 California St\nstep 7: CLICK: (183, 182)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Booking.com', 'Waze Navigation & Live Traffic']\nB: ['Shazam: Find Music & Concerts', 'Lyft']\nC: ['Apartments.com Rental Search', 'Google Map']\nD: ['Traveloka', 'Yandex Navigator']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_38_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_38_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_38_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_38_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_38_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_38_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_38_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_38_7.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Play Store', 'Tokopedia']\nB: ['Tripadvisor', 'Booking.com']\nC: ['Setting', 'Expedia']\nD: ['Picturethis', 'trip.com']\n", + "question": "The corresponding actions are: step 1: CLICK: (215, 670)\nstep 2: CLICK: (275, 73)\nstep 3: TYPE: Tokopedia\nstep 4: CLICK: (850, 884)\nstep 5: CLICK: (726, 388)\nstep 6: CLICK: (889, 384)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation 
episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (215, 670)\nstep 2: CLICK: (275, 73)\nstep 3: TYPE: Tokopedia\nstep 4: CLICK: (850, 884)\nstep 5: CLICK: (726, 388)\nstep 6: CLICK: (889, 384)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Tokopedia']\nB: ['Tripadvisor', 'Booking.com']\nC: ['Setting', 'Expedia']\nD: ['Picturethis', 'trip.com']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_39_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_39_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_39_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_39_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_39_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_39_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_39_6.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Yahoo Sports', 'Instagram']\nB: ['AP News', 'Messenger']\nC: ['CNN Breaking US & World News', 'Whatsapp']\nD: ['NewsBreak', 'Facebook']\n", + "question": "The corresponding actions are: step 1: CLICK: (929, 676)\nstep 2: CLICK: (561, 612)\nstep 3: CLICK: (516, 514)\nstep 4: CLICK: (716, 74)\nstep 5: CLICK: (356, 85)\nstep 6: TYPE: Cybersecurity Threats\nstep 7: CLICK: (856, 874)\nstep 8: CLICK: (452, 314)\nstep 9: CLICK: (724, 915)\nstep 10: CLICK: (508, 873)\nstep 11: CLICK: (733, 79)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (929, 676)\nstep 2: CLICK: (561, 612)\nstep 3: CLICK: (516, 514)\nstep 4: CLICK: (716, 74)\nstep 5: CLICK: (356, 85)\nstep 6: TYPE: Cybersecurity Threats\nstep 7: CLICK: (856, 874)\nstep 8: CLICK: (452, 314)\nstep 9: CLICK: (724, 915)\nstep 10: CLICK: (508, 873)\nstep 11: CLICK: (733, 79)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Yahoo Sports', 'Instagram']\nB: ['AP News', 'Messenger']\nC: ['CNN Breaking US & World News', 'Whatsapp']\nD: ['NewsBreak', 'Facebook']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_40_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_40_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_40_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_40_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_40_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_40_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_40_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_40_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_40_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_40_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_40_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_40_11.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Firefox', 'Google Docs']\nB: ['DuckDuckGo', 'Notepad - Notes and To Do List']\nC: ['Edge', 'Simplenote']\nD: ['Opera', 'Google Keep']\n", + "question": "The corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (705, 223)\nstep 3: CLICK: (813, 256)\nstep 4: TYPE: weather in Shanghai tomorrow\nstep 5: CLICK: (930, 248)\nstep 
6: PRESS_HOME\nstep 7: CLICK: (858, 212)\nstep 8: CLICK: (902, 895)\nstep 9: CLICK: (824, 792)\nstep 10: TYPE: Shanghai,tomorrow Todolist: buy a flight to shanghai.\nstep 11: CLICK: (99, 96)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (705, 223)\nstep 3: CLICK: (813, 256)\nstep 4: TYPE: weather in Shanghai tomorrow\nstep 5: CLICK: (930, 248)\nstep 6: PRESS_HOME\nstep 7: CLICK: (858, 212)\nstep 8: CLICK: (902, 895)\nstep 9: CLICK: (824, 792)\nstep 10: TYPE: Shanghai,tomorrow Todolist: buy a flight to shanghai.\nstep 11: CLICK: (99, 96)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Firefox', 'Google Docs']\nB: ['DuckDuckGo', 'Notepad - Notes and To Do List']\nC: ['Edge', 'Simplenote']\nD: ['Opera', 'Google Keep']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_41_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_41_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_41_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_41_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_41_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_41_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_41_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_41_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_41_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_41_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_41_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_41_11.png" + ], + "output": "A" + }, + { + "task": 
"gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['DigiCal Calendar Agenda', 'Quora']\nB: ['All-In-One Calculator', 'Firefox']\nC: ['Calendar', 'Chrome']\nD: ['Simple Calendar - easy planner', 'Edge']\n", + "question": "The corresponding actions are: step 1: CLICK: (642, 790)\nstep 2: CLICK: (788, 202)\nstep 3: TYPE: the latest Transformers movie\nstep 4: CLICK: (926, 905)\nstep 5: PRESS_HOME\nstep 6: SCROLL: UP\nstep 7: CLICK: (595, 368)\nstep 8: CLICK: (836, 716)\nstep 9: CLICK: (355, 842)\nstep 10: TYPE: watch the movie Transformers: the Rise of the Beasts\nstep 11: CLICK: (890, 89)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (642, 790)\nstep 2: CLICK: (788, 202)\nstep 3: TYPE: the latest Transformers movie\nstep 4: CLICK: (926, 905)\nstep 5: PRESS_HOME\nstep 6: SCROLL: UP\nstep 7: CLICK: (595, 368)\nstep 8: CLICK: (836, 716)\nstep 9: CLICK: (355, 842)\nstep 10: TYPE: watch the movie Transformers: the Rise of the Beasts\nstep 11: CLICK: (890, 89)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['DigiCal Calendar Agenda', 'Quora']\nB: ['All-In-One Calculator', 'Firefox']\nC: ['Calendar', 'Chrome']\nD: ['Simple Calendar - easy planner', 'Edge']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_42_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_42_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_42_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_42_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_42_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_42_5.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_42_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_42_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_42_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_42_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_42_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_42_11.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Youtube', 'Google Play Store']\nB: ['Tubi: Movies & Live TV', 'PlantNet']\nC: ['Pluto TV - Live TV and Movies', 'Plantin']\nD: ['Tiktok', 'TradingView: Track All Markets']\n", + "question": "The corresponding actions are: step 1: CLICK: (129, 808)\nstep 2: PRESS_HOME\nstep 3: CLICK: (834, 809)\nstep 4: TYPE: Adidas Training App\nstep 5: CLICK: (294, 155)\nstep 6: CLICK: (425, 603)\nstep 7: CLICK: (688, 271)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (129, 808)\nstep 2: PRESS_HOME\nstep 3: CLICK: (834, 809)\nstep 4: TYPE: Adidas Training App\nstep 5: CLICK: (294, 155)\nstep 6: CLICK: (425, 603)\nstep 7: CLICK: (688, 271)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Youtube', 'Google Play Store']\nB: ['Tubi: Movies & Live TV', 'PlantNet']\nC: ['Pluto TV - Live TV and Movies', 'Plantin']\nD: ['Tiktok', 'TradingView: Track All Markets']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_43_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_43_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_43_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_43_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_43_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_43_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_43_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_43_7.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Triller', 'Vaulty:Hide Pictures Videos']\nB: ['Youtube', 'Google Play Store']\nC: ['Shorts VotTak: Short Video App', 'Contacts']\nD: ['Pluto TV - Live TV and Movies', 'Picturethis']\n", + "question": "The corresponding actions are: step 1: CLICK: (585, 837)\nstep 2: PRESS_HOME\nstep 3: CLICK: (835, 842)\nstep 4: CLICK: (815, 73)\nstep 5: CLICK: (927, 75)\nstep 6: TYPE: Centr App\nstep 7: CLICK: (915, 912)\nstep 8: CLICK: (860, 337)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (585, 837)\nstep 2: PRESS_HOME\nstep 3: CLICK: (835, 842)\nstep 4: CLICK: (815, 73)\nstep 5: CLICK: (927, 75)\nstep 6: TYPE: Centr App\nstep 7: CLICK: (915, 912)\nstep 8: CLICK: (860, 337)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Triller', 'Vaulty:Hide Pictures Videos']\nB: ['Youtube', 'Google Play Store']\nC: ['Shorts VotTak: Short Video App', 'Contacts']\nD: ['Pluto TV - Live TV and Movies', 'Picturethis']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_44_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_44_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_44_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_44_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_44_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_44_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_44_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_44_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_44_8.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Kobo Books - eBooks Audiobooks', 'DuckDuckGo']\nB: ['Amazon Kindle', 'Firefox']\nC: ['Audible: Audio Entertainment', 'Chrome']\nD: ['Pocket FM: Audio Series', 'wikiHow']\n", + "question": "The corresponding actions are: step 1: CLICK: (802, 311)\nstep 2: CLICK: (249, 82)\nstep 3: TYPE: The Renaissance\nstep 4: CLICK: (166, 216)\nstep 5: CLICK: (170, 417)\nstep 6: PRESS_HOME\nstep 7: CLICK: (561, 311)\nstep 8: CLICK: (196, 86)\nstep 9: TYPE: The Renaissance\nstep 10: CLICK: (158, 167)\nstep 11: CLICK: (316, 327)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + 
"context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (802, 311)\nstep 2: CLICK: (249, 82)\nstep 3: TYPE: The Renaissance\nstep 4: CLICK: (166, 216)\nstep 5: CLICK: (170, 417)\nstep 6: PRESS_HOME\nstep 7: CLICK: (561, 311)\nstep 8: CLICK: (196, 86)\nstep 9: TYPE: The Renaissance\nstep 10: CLICK: (158, 167)\nstep 11: CLICK: (316, 327)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Kobo Books - eBooks Audiobooks', 'DuckDuckGo']\nB: ['Amazon Kindle', 'Firefox']\nC: ['Audible: Audio Entertainment', 'Chrome']\nD: ['Pocket FM: Audio Series', 'wikiHow']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_45_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_45_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_45_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_45_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_45_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_45_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_45_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_45_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_45_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_45_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_45_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_45_11.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Youtube', 'Setting']\nB: ['Triller', 'Picturethis']\nC: ['Tubi: Movies & Live TV', 'iNaturalist']\nD: ['Netflix', 'Contacts']\n", + "question": "The corresponding actions are: step 1: CLICK: (813, 
470)\nstep 2: CLICK: (211, 417)\nstep 3: PRESS_HOME\nstep 4: CLICK: (866, 607)\nstep 5: CLICK: (233, 776)\nstep 6: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (813, 470)\nstep 2: CLICK: (211, 417)\nstep 3: PRESS_HOME\nstep 4: CLICK: (866, 607)\nstep 5: CLICK: (233, 776)\nstep 6: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Youtube', 'Setting']\nB: ['Triller', 'Picturethis']\nC: ['Tubi: Movies & Live TV', 'iNaturalist']\nD: ['Netflix', 'Contacts']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_46_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_46_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_46_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_46_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_46_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_46_5.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Firefox', 'Google Docs']\nB: ['Quora', 'Simplenote']\nC: ['DuckDuckgo', 'Google Keep']\nD: ['Opera', 'BasicNote - Notes, Notepad']\n", + "question": "The corresponding actions are: step 1: CLICK: (632, 123)\nstep 2: CLICK: (462, 80)\nstep 3: CLICK: (936, 73)\nstep 4: TYPE: sushi ingredients\nstep 5: CLICK: (502, 123)\nstep 6: PRESS_HOME\nstep 7: CLICK: (351, 524)\nstep 8: CLICK: (902, 916)\nstep 9: TYPE: rice, rice vinegar, shaoxing wine\nstep 10: CLICK: (190, 151)\nstep 11: TYPE: shopping list for making sushi\nstep 12: CLICK: (74, 86)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": 
"Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (632, 123)\nstep 2: CLICK: (462, 80)\nstep 3: CLICK: (936, 73)\nstep 4: TYPE: sushi ingredients\nstep 5: CLICK: (502, 123)\nstep 6: PRESS_HOME\nstep 7: CLICK: (351, 524)\nstep 8: CLICK: (902, 916)\nstep 9: TYPE: rice, rice vinegar, shaoxing wine\nstep 10: CLICK: (190, 151)\nstep 11: TYPE: shopping list for making sushi\nstep 12: CLICK: (74, 86)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Firefox', 'Google Docs']\nB: ['Quora', 'Simplenote']\nC: ['DuckDuckgo', 'Google Keep']\nD: ['Opera', 'BasicNote - Notes, Notepad']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_47_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_47_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_47_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_47_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_47_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_47_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_47_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_47_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_47_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_47_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_47_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_47_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_47_12.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['X', 'iHeart: Music, Radio, Podcasts']\nB: ['Instagram', 'Spotify']\nC: ['Threads', 'YT Music']\nD: 
['Messenger', 'Pandora']\n", + "question": "The corresponding actions are: step 1: CLICK: (294, 293)\nstep 2: CLICK: (963, 71)\nstep 3: CLICK: (49, 598)\nstep 4: CLICK: (947, 289)\nstep 5: TYPE: Punk\nstep 6: SCROLL: UP\nstep 7: CLICK: (917, 921)\nstep 8: CLICK: (855, 924)\nstep 9: PRESS_HOME\nstep 10: CLICK: (866, 897)\nstep 11: CLICK: (811, 255)\nstep 12: CLICK: (45, 274)\nstep 13: TYPE: Punk\nstep 14: CLICK: (329, 963)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (294, 293)\nstep 2: CLICK: (963, 71)\nstep 3: CLICK: (49, 598)\nstep 4: CLICK: (947, 289)\nstep 5: TYPE: Punk\nstep 6: SCROLL: UP\nstep 7: CLICK: (917, 921)\nstep 8: CLICK: (855, 924)\nstep 9: PRESS_HOME\nstep 10: CLICK: (866, 897)\nstep 11: CLICK: (811, 255)\nstep 12: CLICK: (45, 274)\nstep 13: TYPE: Punk\nstep 14: CLICK: (329, 963)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['X', 'iHeart: Music, Radio, Podcasts']\nB: ['Instagram', 'Spotify']\nC: ['Threads', 'YT Music']\nD: ['Messenger', 'Pandora']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_48_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_48_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_48_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_48_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_48_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_48_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_48_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_48_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_48_8.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_48_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_48_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_48_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_48_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_48_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_48_14.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Kobo Books - eBooks Audiobooks', 'Contacts']\nB: ['Everand', 'iNaturalist']\nC: ['Audible: Audio Entertainment', 'Google Play Store']\nD: ['Ploter - Ebook, Audiobook, PDF', 'Setting']\n", + "question": "The corresponding actions are: step 1: CLICK: (869, 616)\nstep 2: SCROLL: UP\nstep 3: CLICK: (256, 938)\nstep 4: SCROLL: UP\nstep 5: CLICK: (879, 599)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (874, 475)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (869, 616)\nstep 2: SCROLL: UP\nstep 3: CLICK: (256, 938)\nstep 4: SCROLL: UP\nstep 5: CLICK: (879, 599)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (874, 475)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Kobo Books - eBooks Audiobooks', 'Contacts']\nB: ['Everand', 'iNaturalist']\nC: ['Audible: Audio Entertainment', 'Google Play Store']\nD: ['Ploter - Ebook, Audiobook, PDF', 'Setting']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_49_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_49_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_49_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_49_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_49_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_49_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_49_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_49_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_49_8.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Quora', 'Any.do']\nB: ['Chrome', 'Microsoft to do']\nC: ['Firefox', 'Todoist']\nD: ['DuckDuckGo', 'To-Do List']\n", + "question": "The corresponding actions are: step 1: CLICK: (319, 516)\nstep 2: CLICK: (672, 110)\nstep 3: TYPE: how to learn photography\nstep 4: CLICK: (463, 177)\nstep 5: CLICK: (447, 227)\nstep 6: CLICK: (697, 224)\nstep 7: CLICK: (529, 536)\nstep 8: CLICK: (467, 636)\nstep 9: PRESS_HOME\nstep 10: CLICK: (88, 498)\nstep 11: CLICK: (302, 888)\nstep 12: TYPE: learn photography in \nstep 13: CLICK: (585, 566)\nstep 14: CLICK: (962, 494)\nstep 15: COMPLETE\nWhich app-combination list was used in this 
GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (319, 516)\nstep 2: CLICK: (672, 110)\nstep 3: TYPE: how to learn photography\nstep 4: CLICK: (463, 177)\nstep 5: CLICK: (447, 227)\nstep 6: CLICK: (697, 224)\nstep 7: CLICK: (529, 536)\nstep 8: CLICK: (467, 636)\nstep 9: PRESS_HOME\nstep 10: CLICK: (88, 498)\nstep 11: CLICK: (302, 888)\nstep 12: TYPE: learn photography in \nstep 13: CLICK: (585, 566)\nstep 14: CLICK: (962, 494)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Quora', 'Any.do']\nB: ['Chrome', 'Microsoft to do']\nC: ['Firefox', 'Todoist']\nD: ['DuckDuckGo', 'To-Do List']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_50_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_50_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_50_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_50_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_50_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_50_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_50_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_50_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_50_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_50_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_50_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_50_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_50_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_50_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_50_14.png" + ], + "output": "A" + }, + { + "task": 
"gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Tiktok', 'Calculator Plus with History']\nB: ['Tubi: Movies & Live TV', 'DigiCal Calendar Agenda']\nC: ['Shorts VotTak: Short Video App', 'Calendar']\nD: ['Likee', 'Clock']\n", + "question": "The corresponding actions are: step 1: CLICK: (566, 122)\nstep 2: CLICK: (398, 914)\nstep 3: CLICK: (311, 48)\nstep 4: TYPE: relaxing soundscape\nstep 5: CLICK: (602, 746)\nstep 6: CLICK: (344, 389)\nstep 7: PRESS_HOME\nstep 8: SCROLL: RIGHT\nstep 9: CLICK: (216, 427)\nstep 10: CLICK: (857, 362)\nstep 11: CLICK: (766, 651)\nstep 12: CLICK: (759, 858)\nstep 13: PRESS_RECENT\nstep 14: CLICK: (57, 261)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (566, 122)\nstep 2: CLICK: (398, 914)\nstep 3: CLICK: (311, 48)\nstep 4: TYPE: relaxing soundscape\nstep 5: CLICK: (602, 746)\nstep 6: CLICK: (344, 389)\nstep 7: PRESS_HOME\nstep 8: SCROLL: RIGHT\nstep 9: CLICK: (216, 427)\nstep 10: CLICK: (857, 362)\nstep 11: CLICK: (766, 651)\nstep 12: CLICK: (759, 858)\nstep 13: PRESS_RECENT\nstep 14: CLICK: (57, 261)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tiktok', 'Calculator Plus with History']\nB: ['Tubi: Movies & Live TV', 'DigiCal Calendar Agenda']\nC: ['Shorts VotTak: Short Video App', 'Calendar']\nD: ['Likee', 'Clock']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_51_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_51_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_51_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_51_3.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_51_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_51_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_51_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_51_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_51_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_51_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_51_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_51_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_51_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_51_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_51_14.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['iNaturalist', 'Uber']\nB: ['TradingView: Track All Markets', 'Waze Navigation & Live Traffic']\nC: ['Picturethis', 'Maps']\nD: ['Google Play Store', 'Petal Maps - GPS & Navigation']\n", + "question": "The corresponding actions are: step 1: CLICK: (505, 470)\nstep 2: CLICK: (333, 568)\nstep 3: TYPE: gym\nstep 4: CLICK: (447, 251)\nstep 5: PRESS_HOME\nstep 6: CLICK: (524, 748)\nstep 7: CLICK: (333, 65)\nstep 8: CLICK: (80, 956)\nstep 9: CLICK: (252, 631)\nstep 10: CLICK: (349, 776)\nstep 11: CLICK: (488, 446)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (505, 470)\nstep 2: CLICK: (333, 568)\nstep 3: TYPE: gym\nstep 4: CLICK: (447, 251)\nstep 5: PRESS_HOME\nstep 6: CLICK: (524, 748)\nstep 7: CLICK: (333, 65)\nstep 8: CLICK: (80, 956)\nstep 9: CLICK: (252, 631)\nstep 10: CLICK: (349, 776)\nstep 11: CLICK: (488, 446)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['iNaturalist', 'Uber']\nB: ['TradingView: Track All Markets', 'Waze Navigation & Live Traffic']\nC: ['Picturethis', 'Maps']\nD: ['Google Play Store', 'Petal Maps - GPS & Navigation']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_52_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_52_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_52_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_52_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_52_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_52_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_52_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_52_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_52_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_52_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_52_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_52_11.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Gmail', 'Plantin']\nB: ['Instagram', 'iNaturalist']\nC: ['Threads', 'Google Play Store']\nD: ['Tumblr', 'Vaulty:Hide Pictures Videos']\n", + "question": "The corresponding actions are: step 1: CLICK: (542, 654)\nstep 2: CLICK: (193, 344)\nstep 3: CLICK: (267, 248)\nstep 4: CLICK: (554, 472)\nstep 5: CLICK: (360, 
905)\nstep 6: CLICK: (781, 695)\nstep 7: CLICK: (753, 921)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (542, 654)\nstep 2: CLICK: (193, 344)\nstep 3: CLICK: (267, 248)\nstep 4: CLICK: (554, 472)\nstep 5: CLICK: (360, 905)\nstep 6: CLICK: (781, 695)\nstep 7: CLICK: (753, 921)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Gmail', 'Plantin']\nB: ['Instagram', 'iNaturalist']\nC: ['Threads', 'Google Play Store']\nD: ['Tumblr', 'Vaulty:Hide Pictures Videos']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_53_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_53_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_53_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_53_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_53_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_53_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_53_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_53_7.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Wikipedia', 'Gmail']\nB: ['DuckDuckgo', 'Facebook']\nC: ['wikiHow', 'Whatsapp']\nD: ['Opera', 'Messenger']\n", + "question": "The corresponding actions are: step 1: CLICK: (573, 719)\nstep 2: TYPE: Notre-Dame Cathedral in Pairs\nstep 3: CLICK: (909, 688)\nstep 4: CLICK: (468, 884)\nstep 5: CLICK: (358, 554)\nstep 6: CLICK: (372, 868)\nstep 7: CLICK: (978, 61)\nstep 8: CLICK: (843, 194)\nstep 9: CLICK: (683, 861)\nstep 10: CLICK: (686, 51)\nstep 11: COMPLETE\nWhich 
app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (573, 719)\nstep 2: TYPE: Notre-Dame Cathedral in Pairs\nstep 3: CLICK: (909, 688)\nstep 4: CLICK: (468, 884)\nstep 5: CLICK: (358, 554)\nstep 6: CLICK: (372, 868)\nstep 7: CLICK: (978, 61)\nstep 8: CLICK: (843, 194)\nstep 9: CLICK: (683, 861)\nstep 10: CLICK: (686, 51)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Wikipedia', 'Gmail']\nB: ['DuckDuckgo', 'Facebook']\nC: ['wikiHow', 'Whatsapp']\nD: ['Opera', 'Messenger']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_54_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_54_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_54_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_54_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_54_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_54_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_54_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_54_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_54_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_54_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_54_10.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Quora', 'Likee', 'Contacts']\nB: ['wikiHow', 'Shorts VotTak: Short Video App', 'Tripadvisor']\nC: ['Chrome', 'Triller', 'Google Play Store']\nD: ['Opera', 'Youtube', 'Setting']\n", + "question": "The corresponding actions are: step 1: CLICK: (206, 908)\nstep 2: CLICK: (932, 
227)\nstep 3: PRESS_HOME\nstep 4: CLICK: (556, 923)\nstep 5: CLICK: (515, 417)\nstep 6: CLICK: (787, 89)\nstep 7: PRESS_HOME\nstep 8: CLICK: (311, 931)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (206, 908)\nstep 2: CLICK: (932, 227)\nstep 3: PRESS_HOME\nstep 4: CLICK: (556, 923)\nstep 5: CLICK: (515, 417)\nstep 6: CLICK: (787, 89)\nstep 7: PRESS_HOME\nstep 8: CLICK: (311, 931)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Quora', 'Likee', 'Contacts']\nB: ['wikiHow', 'Shorts VotTak: Short Video App', 'Tripadvisor']\nC: ['Chrome', 'Triller', 'Google Play Store']\nD: ['Opera', 'Youtube', 'Setting']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_55_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_55_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_55_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_55_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_55_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_55_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_55_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_55_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_55_8.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Chatty - AI Assistant', 'Wikipedia']\nB: ['ChatOn - AI Chat Bot Assistant', 'Chrome']\nC: ['GenZArt:Fast AI Art Generator', 'Bing: chat with AI & GPT4']\nD: ['Microsoft Copilot', 'Firefox']\n", + "question": "The corresponding actions are: step 1: CLICK: (394, 
409)\nstep 2: TYPE: tell me about Theorem of Green\nstep 3: CLICK: (909, 620)\nstep 4: PRESS_HOME\nstep 5: CLICK: (386, 258)\nstep 6: CLICK: (371, 81)\nstep 7: TYPE: Theorem of Green\nstep 8: CLICK: (918, 909)\nstep 9: CLICK: (280, 652)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (394, 409)\nstep 2: TYPE: tell me about Theorem of Green\nstep 3: CLICK: (909, 620)\nstep 4: PRESS_HOME\nstep 5: CLICK: (386, 258)\nstep 6: CLICK: (371, 81)\nstep 7: TYPE: Theorem of Green\nstep 8: CLICK: (918, 909)\nstep 9: CLICK: (280, 652)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Chatty - AI Assistant', 'Wikipedia']\nB: ['ChatOn - AI Chat Bot Assistant', 'Chrome']\nC: ['GenZArt:Fast AI Art Generator', 'Bing: chat with AI & GPT4']\nD: ['Microsoft Copilot', 'Firefox']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_56_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_56_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_56_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_56_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_56_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_56_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_56_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_56_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_56_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_56_9.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Petal Maps - GPS & Navigation', 
'GPS, Maps, Voice Navigation']\nB: ['Waze Navigation & Live Traffic', 'Waze Navigation & Live Traffic']\nC: ['GPS', 'Lyft']\nD: ['Maps', 'Uber']\n", + "question": "The corresponding actions are: step 1: CLICK: (155, 490)\nstep 2: CLICK: (505, 334)\nstep 3: CLICK: (283, 678)\nstep 4: TYPE: bakery\nstep 5: CLICK: (933, 884)\nstep 6: CLICK: (413, 296)\nstep 7: PRESS_HOME\nstep 8: CLICK: (374, 492)\nstep 9: CLICK: (272, 584)\nstep 10: TYPE: Yamasa\nstep 11: CLICK: (511, 431)\nstep 12: CLICK: (533, 890)\nstep 13: CLICK: (616, 893)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (155, 490)\nstep 2: CLICK: (505, 334)\nstep 3: CLICK: (283, 678)\nstep 4: TYPE: bakery\nstep 5: CLICK: (933, 884)\nstep 6: CLICK: (413, 296)\nstep 7: PRESS_HOME\nstep 8: CLICK: (374, 492)\nstep 9: CLICK: (272, 584)\nstep 10: TYPE: Yamasa\nstep 11: CLICK: (511, 431)\nstep 12: CLICK: (533, 890)\nstep 13: CLICK: (616, 893)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Petal Maps - GPS & Navigation', 'GPS, Maps, Voice Navigation']\nB: ['Waze Navigation & Live Traffic', 'Waze Navigation & Live Traffic']\nC: ['GPS', 'Lyft']\nD: ['Maps', 'Uber']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_57_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_57_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_57_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_57_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_57_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_57_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_57_6.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_57_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_57_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_57_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_57_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_57_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_57_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_57_13.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['X', 'TradingView: Track All Markets', 'Tripadvisor']\nB: ['Facebook', 'Google Play Store', 'iNaturalist']\nC: ['Instagram', 'Setting', 'Google Play Store']\nD: ['Threads', 'Picturethis', 'Contacts']\n", + "question": "The corresponding actions are: step 1: CLICK: (858, 818)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (911, 889)\nstep 4: PRESS_HOME\nstep 5: CLICK: (404, 813)\nstep 6: PRESS_HOME\nstep 7: CLICK: (856, 808)\nstep 8: CLICK: (804, 69)\nstep 9: CLICK: (926, 83)\nstep 10: TYPE: ins\nstep 11: CLICK: (890, 908)\nstep 12: CLICK: (868, 348)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (858, 818)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (911, 889)\nstep 4: PRESS_HOME\nstep 5: CLICK: (404, 813)\nstep 6: PRESS_HOME\nstep 7: CLICK: (856, 808)\nstep 8: CLICK: (804, 69)\nstep 9: CLICK: (926, 83)\nstep 10: TYPE: ins\nstep 11: CLICK: (890, 908)\nstep 12: CLICK: (868, 348)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['X', 'TradingView: Track All Markets', 'Tripadvisor']\nB: ['Facebook', 'Google Play Store', 'iNaturalist']\nC: ['Instagram', 'Setting', 'Google Play Store']\nD: ['Threads', 'Picturethis', 'Contacts']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_58_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_58_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_58_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_58_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_58_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_58_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_58_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_58_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_58_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_58_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_58_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_58_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_58_12.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['BBC News', 'Threads']\nB: ['Yahoo Sports', 'Whatsapp']\nC: ['ESPN', 'Messenger']\nD: ['Opera News', 'Gmail']\n", + "question": "The corresponding actions are: step 1: CLICK: (118, 493)\nstep 2: CLICK: (863, 
907)\nstep 3: CLICK: (713, 218)\nstep 4: TYPE: Global Economic Trends\nstep 5: CLICK: (933, 871)\nstep 6: CLICK: (222, 425)\nstep 7: CLICK: (595, 153)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (486, 667)\nstep 11: CLICK: (874, 479)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (118, 493)\nstep 2: CLICK: (863, 907)\nstep 3: CLICK: (713, 218)\nstep 4: TYPE: Global Economic Trends\nstep 5: CLICK: (933, 871)\nstep 6: CLICK: (222, 425)\nstep 7: CLICK: (595, 153)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (486, 667)\nstep 11: CLICK: (874, 479)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['BBC News', 'Threads']\nB: ['Yahoo Sports', 'Whatsapp']\nC: ['ESPN', 'Messenger']\nD: ['Opera News', 'Gmail']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_59_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_59_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_59_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_59_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_59_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_59_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_59_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_59_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_59_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_59_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_59_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_59_11.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + 
"visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['TradingView: Track All Markets', 'Triller', 'Applock Pro - APP Lock & Guard']\nB: ['Google Play Store', 'Likee', 'Setting']\nC: ['iNaturalist', 'Tubi: Movies & Live TV', 'Tripadvisor']\nD: ['PlantNet', 'Shorts VotTak: Short Video App', 'Google Play Store']\n", + "question": "The corresponding actions are: step 1: CLICK: (664, 921)\nstep 2: CLICK: (848, 430)\nstep 3: PRESS_HOME\nstep 4: CLICK: (559, 918)\nstep 5: CLICK: (509, 421)\nstep 6: CLICK: (567, 668)\nstep 7: CLICK: (927, 551)\nstep 8: CLICK: (395, 76)\nstep 9: CLICK: (471, 530)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (664, 921)\nstep 2: CLICK: (848, 430)\nstep 3: PRESS_HOME\nstep 4: CLICK: (559, 918)\nstep 5: CLICK: (509, 421)\nstep 6: CLICK: (567, 668)\nstep 7: CLICK: (927, 551)\nstep 8: CLICK: (395, 76)\nstep 9: CLICK: (471, 530)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['TradingView: Track All Markets', 'Triller', 'Applock Pro - APP Lock & Guard']\nB: ['Google Play Store', 'Likee', 'Setting']\nC: ['iNaturalist', 'Tubi: Movies & Live TV', 'Tripadvisor']\nD: ['PlantNet', 'Shorts VotTak: Short Video App', 'Google Play Store']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_60_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_60_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_60_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_60_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_60_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_60_5.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_60_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_60_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_60_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_60_9.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Contacts', 'Cash App']\nB: ['Applock Pro - APP Lock & Guard', 'PayPal - Send, Shop, Manage']\nC: ['Plantin', 'Venmo']\nD: ['Setting', 'Investing.com']\n", + "question": "The corresponding actions are: step 1: CLICK: (397, 372)\nstep 2: CLICK: (237, 604)\nstep 3: CLICK: (492, 615)\nstep 4: CLICK: (789, 629)\nstep 5: CLICK: (209, 718)\nstep 6: CLICK: (527, 721)\nstep 7: CLICK: (759, 725)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (865, 738)\nstep 11: PRESS_HOME\nstep 12: CLICK: (873, 245)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (397, 372)\nstep 2: CLICK: (237, 604)\nstep 3: CLICK: (492, 615)\nstep 4: CLICK: (789, 629)\nstep 5: CLICK: (209, 718)\nstep 6: CLICK: (527, 721)\nstep 7: CLICK: (759, 725)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (865, 738)\nstep 11: PRESS_HOME\nstep 12: CLICK: (873, 245)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Contacts', 'Cash App']\nB: ['Applock Pro - APP Lock & Guard', 'PayPal - Send, Shop, Manage']\nC: ['Plantin', 'Venmo']\nD: ['Setting', 'Investing.com']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_61_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_61_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_61_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_61_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_61_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_61_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_61_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_61_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_61_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_61_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_61_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_61_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_61_12.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['wikiHow', 'Vaulty:Hide Pictures Videos']\nB: ['Quora', 'Contacts']\nC: ['DuckDuckGo', 'Picturethis']\nD: ['Chrome', 'TradingView: Track All Markets']\n", + "question": "The corresponding actions are: step 1: CLICK: (439, 913)\nstep 2: CLICK: (310, 
433)\nstep 3: TYPE: AMD's stock market news\nstep 4: CLICK: (862, 882)\nstep 5: CLICK: (265, 482)\nstep 6: CLICK: (947, 866)\nstep 7: PRESS_HOME\nstep 8: CLICK: (922, 494)\nstep 9: CLICK: (382, 76)\nstep 10: TYPE: AMD\nstep 11: CLICK: (145, 204)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (439, 913)\nstep 2: CLICK: (310, 433)\nstep 3: TYPE: AMD's stock market news\nstep 4: CLICK: (862, 882)\nstep 5: CLICK: (265, 482)\nstep 6: CLICK: (947, 866)\nstep 7: PRESS_HOME\nstep 8: CLICK: (922, 494)\nstep 9: CLICK: (382, 76)\nstep 10: TYPE: AMD\nstep 11: CLICK: (145, 204)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['wikiHow', 'Vaulty:Hide Pictures Videos']\nB: ['Quora', 'Contacts']\nC: ['DuckDuckGo', 'Picturethis']\nD: ['Chrome', 'TradingView: Track All Markets']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_62_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_62_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_62_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_62_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_62_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_62_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_62_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_62_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_62_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_62_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_62_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_62_11.png" + ], + "output": "D" + }, + { + 
"task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Play Store', 'Setting']\nB: ['PlantNet', 'Vaulty:Hide Pictures Videos']\nC: ['Vaulty:Hide Pictures Videos', 'Google Play Store']\nD: ['Plantin', 'PlantNet']\n", + "question": "The corresponding actions are: step 1: CLICK: (432, 724)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (115, 758)\nstep 5: CLICK: (518, 311)\nstep 6: CLICK: (498, 372)\nstep 7: CLICK: (488, 524)\nstep 8: CLICK: (974, 68)\nstep 9: TYPE: Hindi\nstep 10: CLICK: (413, 247)\nstep 11: SCROLL: UP\nstep 12: CLICK: (822, 608)\nstep 13: PRESS_HOME\nstep 14: CLICK: (140, 718)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (432, 724)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (115, 758)\nstep 5: CLICK: (518, 311)\nstep 6: CLICK: (498, 372)\nstep 7: CLICK: (488, 524)\nstep 8: CLICK: (974, 68)\nstep 9: TYPE: Hindi\nstep 10: CLICK: (413, 247)\nstep 11: SCROLL: UP\nstep 12: CLICK: (822, 608)\nstep 13: PRESS_HOME\nstep 14: CLICK: (140, 718)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Setting']\nB: ['PlantNet', 'Vaulty:Hide Pictures Videos']\nC: ['Vaulty:Hide Pictures Videos', 'Google Play Store']\nD: ['Plantin', 'PlantNet']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_63_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_63_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_63_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_63_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_63_4.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_63_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_63_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_63_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_63_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_63_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_63_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_63_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_63_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_63_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_63_14.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Map', 'Uber']\nB: ['Lyft', 'GPS, Maps, Voice Navigation']\nC: ['Yandex Navigator', 'Citymapper']\nD: ['GPS', 'Maps']\n", + "question": "The corresponding actions are: step 1: CLICK: (578, 324)\nstep 2: CLICK: (239, 90)\nstep 3: TYPE: hospital\nstep 4: CLICK: (237, 255)\nstep 5: PRESS_HOME\nstep 6: CLICK: (687, 149)\nstep 7: CLICK: (393, 227)\nstep 8: TYPE: Hospital Helipad\nstep 9: SCROLL: UP\nstep 10: CLICK: (460, 709)\nstep 11: CLICK: (435, 335)\nstep 12: CLICK: (442, 885)\nstep 13: CLICK: (437, 924)\nstep 14: CLICK: (448, 887)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (578, 324)\nstep 2: CLICK: (239, 90)\nstep 3: TYPE: hospital\nstep 4: CLICK: (237, 255)\nstep 5: PRESS_HOME\nstep 6: CLICK: (687, 149)\nstep 7: CLICK: (393, 227)\nstep 8: TYPE: Hospital Helipad\nstep 9: SCROLL: UP\nstep 10: CLICK: (460, 709)\nstep 11: CLICK: (435, 335)\nstep 12: CLICK: (442, 885)\nstep 13: CLICK: (437, 924)\nstep 14: CLICK: (448, 887)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Map', 'Uber']\nB: ['Lyft', 'GPS, Maps, Voice Navigation']\nC: ['Yandex Navigator', 'Citymapper']\nD: ['GPS', 'Maps']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_64_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_64_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_64_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_64_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_64_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_64_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_64_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_64_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_64_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_64_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_64_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_64_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_64_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_64_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_64_14.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['PlantNet', 'Gmail']\nB: ['Applock Pro - APP Lock & Guard', 
'Messenger']\nC: ['Tripadvisor', 'Facebook']\nD: ['Setting', 'Instagram']\n", + "question": "The corresponding actions are: step 1: CLICK: (199, 628)\nstep 2: CLICK: (394, 820)\nstep 3: SCROLL: UP\nstep 4: CLICK: (419, 573)\nstep 5: CLICK: (799, 151)\nstep 6: TYPE: Instagram\nstep 7: CLICK: (291, 371)\nstep 8: CLICK: (266, 792)\nstep 9: CLICK: (802, 684)\nstep 10: CLICK: (80, 160)\nstep 11: CLICK: (233, 657)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (199, 628)\nstep 2: CLICK: (394, 820)\nstep 3: SCROLL: UP\nstep 4: CLICK: (419, 573)\nstep 5: CLICK: (799, 151)\nstep 6: TYPE: Instagram\nstep 7: CLICK: (291, 371)\nstep 8: CLICK: (266, 792)\nstep 9: CLICK: (802, 684)\nstep 10: CLICK: (80, 160)\nstep 11: CLICK: (233, 657)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['PlantNet', 'Gmail']\nB: ['Applock Pro - APP Lock & Guard', 'Messenger']\nC: ['Tripadvisor', 'Facebook']\nD: ['Setting', 'Instagram']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_65_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_65_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_65_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_65_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_65_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_65_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_65_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_65_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_65_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_65_9.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_65_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_65_11.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Pluto TV - Live TV and Movies', 'DigiCal Calendar Agenda']\nB: ['Triller', 'Calendar']\nC: ['Youtube', 'Clock']\nD: ['Tiktok', 'aCalendar']\n", + "question": "The corresponding actions are: step 1: CLICK: (884, 636)\nstep 2: TYPE: nature landscape\nstep 3: CLICK: (920, 941)\nstep 4: PRESS_HOME\nstep 5: CLICK: (366, 275)\nstep 6: TYPE: 30000\nstep 7: CLICK: (462, 834)\nstep 8: SCROLL: RIGHT\nstep 9: SCROLL: RIGHT\nstep 10: CLICK: (500, 820)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (884, 636)\nstep 2: TYPE: nature landscape\nstep 3: CLICK: (920, 941)\nstep 4: PRESS_HOME\nstep 5: CLICK: (366, 275)\nstep 6: TYPE: 30000\nstep 7: CLICK: (462, 834)\nstep 8: SCROLL: RIGHT\nstep 9: SCROLL: RIGHT\nstep 10: CLICK: (500, 820)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Pluto TV - Live TV and Movies', 'DigiCal Calendar Agenda']\nB: ['Triller', 'Calendar']\nC: ['Youtube', 'Clock']\nD: ['Tiktok', 'aCalendar']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_66_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_66_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_66_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_66_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_66_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_66_5.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_66_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_66_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_66_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_66_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_66_10.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Tripadvisor', 'Netflix']\nB: ['Applock Pro - APP Lock & Guard', 'Pluto TV - Live TV and Movies']\nC: ['Vaulty:Hide Pictures Videos', 'Shorts VotTak: Short Video App']\nD: ['Google Play Store', 'Youtube']\n", + "question": "The corresponding actions are: step 1: CLICK: (158, 742)\nstep 2: PRESS_HOME\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (158, 601)\nstep 5: CLICK: (708, 378)\nstep 6: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (158, 742)\nstep 2: PRESS_HOME\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (158, 601)\nstep 5: CLICK: (708, 378)\nstep 6: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tripadvisor', 'Netflix']\nB: ['Applock Pro - APP Lock & Guard', 'Pluto TV - Live TV and Movies']\nC: ['Vaulty:Hide Pictures Videos', 'Shorts VotTak: Short Video App']\nD: ['Google Play Store', 'Youtube']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_67_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_67_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_67_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_67_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_67_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_67_5.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Any.do', 'Tubi: Movies & Live TV']\nB: ['To-Do List', 'Youtube']\nC: ['TickTick', 'Shorts VotTak: Short Video App']\nD: ['Things', 'Likee']\n", + "question": "The corresponding actions are: step 1: CLICK: (732, 236)\nstep 2: CLICK: (516, 924)\nstep 3: CLICK: (503, 334)\nstep 4: CLICK: (669, 807)\nstep 5: TYPE: Swimming in the morning\nstep 6: CLICK: (681, 908)\nstep 7: PRESS_HOME\nstep 8: CLICK: (599, 914)\nstep 9: CLICK: (972, 44)\nstep 10: TYPE: swimming tutorial\nstep 11: CLICK: (294, 111)\nstep 12: CLICK: (485, 556)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (732, 236)\nstep 2: CLICK: (516, 924)\nstep 3: CLICK: (503, 334)\nstep 4: CLICK: (669, 807)\nstep 5: TYPE: Swimming in the morning\nstep 6: CLICK: (681, 908)\nstep 7: PRESS_HOME\nstep 8: CLICK: (599, 914)\nstep 9: CLICK: (972, 44)\nstep 10: TYPE: swimming tutorial\nstep 11: CLICK: (294, 111)\nstep 12: CLICK: (485, 556)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Any.do', 'Tubi: Movies & Live TV']\nB: ['To-Do List', 'Youtube']\nC: ['TickTick', 'Shorts VotTak: Short Video App']\nD: ['Things', 'Likee']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_68_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_68_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_68_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_68_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_68_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_68_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_68_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_68_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_68_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_68_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_68_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_68_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_68_12.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Traveloka', 'Gmail']\nB: ['Booking.com', 'Instagram']\nC: ['Tokopedia', 'Whatsapp']\nD: ['TickPick - Live Event Tickets', 'Messenger']\n", + "question": "The corresponding actions are: step 1: CLICK: (310, 71)\nstep 2: TYPE: 
Santorini, Greece itinerary\nstep 3: CLICK: (943, 906)\nstep 4: CLICK: (153, 237)\nstep 5: CLICK: (521, 806)\nstep 6: IMPOSSIBLE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (310, 71)\nstep 2: TYPE: Santorini, Greece itinerary\nstep 3: CLICK: (943, 906)\nstep 4: CLICK: (153, 237)\nstep 5: CLICK: (521, 806)\nstep 6: IMPOSSIBLE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Traveloka', 'Gmail']\nB: ['Booking.com', 'Instagram']\nC: ['Tokopedia', 'Whatsapp']\nD: ['TickPick - Live Event Tickets', 'Messenger']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_69_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_69_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_69_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_69_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_69_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_69_5.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Spotify', 'Any.do']\nB: ['iHeart: Music, Radio, Podcasts', 'TickTick']\nC: ['Amazon Music', 'To-Do List']\nD: ['Pandora', 'Things']\n", + "question": "The corresponding actions are: step 1: CLICK: (363, 209)\nstep 2: CLICK: (697, 596)\nstep 3: SCROLL: UP\nstep 4: CLICK: (336, 821)\nstep 5: PRESS_HOME\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (108, 212)\nstep 8: CLICK: (841, 887)\nstep 9: CLICK: (274, 435)\nstep 10: CLICK: (580, 571)\nstep 11: TYPE: do yoga with this\nstep 12: CLICK: (488, 357)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are 
given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (363, 209)\nstep 2: CLICK: (697, 596)\nstep 3: SCROLL: UP\nstep 4: CLICK: (336, 821)\nstep 5: PRESS_HOME\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (108, 212)\nstep 8: CLICK: (841, 887)\nstep 9: CLICK: (274, 435)\nstep 10: CLICK: (580, 571)\nstep 11: TYPE: do yoga with this\nstep 12: CLICK: (488, 357)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Spotify', 'Any.do']\nB: ['iHeart: Music, Radio, Podcasts', 'TickTick']\nC: ['Amazon Music', 'To-Do List']\nD: ['Pandora', 'Things']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_70_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_70_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_70_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_70_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_70_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_70_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_70_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_70_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_70_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_70_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_70_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_70_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_70_12.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Edge', 'WPS office']\nB: ['Quora', 'Microsoft Word']\nC: ['Chrome', 'Google Keep']\nD: ['Firefox', 'Dropbox Paper']\n", + "question": "The corresponding actions are: 
step 1: CLICK: (674, 739)\nstep 2: CLICK: (766, 260)\nstep 3: TYPE: 2019 Nobel-Prize Winners in Physics\nstep 4: CLICK: (508, 173)\nstep 5: CLICK: (908, 881)\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (102, 340)\nstep 9: CLICK: (830, 849)\nstep 10: CLICK: (149, 199)\nstep 11: LONG_PRESS: (141, 198)\nstep 12: CLICK: (119, 123)\nstep 13: CLICK: (247, 310)\nstep 14: TYPE: James Beebles, michel Mayor, Didier Quelvz\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (674, 739)\nstep 2: CLICK: (766, 260)\nstep 3: TYPE: 2019 Nobel-Prize Winners in Physics\nstep 4: CLICK: (508, 173)\nstep 5: CLICK: (908, 881)\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (102, 340)\nstep 9: CLICK: (830, 849)\nstep 10: CLICK: (149, 199)\nstep 11: LONG_PRESS: (141, 198)\nstep 12: CLICK: (119, 123)\nstep 13: CLICK: (247, 310)\nstep 14: TYPE: James Beebles, michel Mayor, Didier Quelvz\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Edge', 'WPS office']\nB: ['Quora', 'Microsoft Word']\nC: ['Chrome', 'Google Keep']\nD: ['Firefox', 'Dropbox Paper']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_71_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_71_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_71_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_71_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_71_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_71_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_71_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_71_7.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_71_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_71_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_71_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_71_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_71_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_71_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_71_14.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Gallery-photo gallery,album', 'Whatsapp']\nB: ['Adobe Express: AI Video Design', 'Messenger']\nC: ['Textify- Art Font Photo Editor', 'X']\nD: ['Lightroom Photo & Video Editor', 'Threads']\n", + "question": "The corresponding actions are: step 1: CLICK: (124, 499)\nstep 2: CLICK: (855, 803)\nstep 3: CLICK: (477, 487)\nstep 4: CLICK: (887, 173)\nstep 5: CLICK: (741, 715)\nstep 6: CLICK: (881, 164)\nstep 7: CLICK: (931, 171)\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: SCROLL: LEFT\nstep 11: CLICK: (915, 845)\nstep 12: CLICK: (127, 681)\nstep 13: CLICK: (851, 542)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (124, 499)\nstep 2: CLICK: (855, 803)\nstep 3: CLICK: (477, 487)\nstep 4: CLICK: (887, 173)\nstep 5: CLICK: (741, 715)\nstep 6: CLICK: (881, 164)\nstep 7: CLICK: (931, 171)\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: SCROLL: LEFT\nstep 11: CLICK: (915, 845)\nstep 12: CLICK: (127, 681)\nstep 13: CLICK: (851, 542)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Gallery-photo gallery,album', 'Whatsapp']\nB: ['Adobe Express: AI Video Design', 'Messenger']\nC: ['Textify- Art Font Photo Editor', 'X']\nD: ['Lightroom Photo & Video Editor', 'Threads']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_72_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_72_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_72_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_72_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_72_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_72_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_72_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_72_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_72_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_72_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_72_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_72_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_72_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_72_13.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Trulia: Homes For Sale & Rent', 'Citymapper']\nB: ['Zillow: Homes For Sale & Rent', 'GPS']\nC: 
['Realtor.com: Buy, Sell & Rent', 'Uber']\nD: ['Redfin Houses for Sale & Rent', 'Waze Navigation & Live Traffic']\n", + "question": "The corresponding actions are: step 1: CLICK: (404, 246)\nstep 2: CLICK: (731, 565)\nstep 3: PRESS_HOME\nstep 4: CLICK: (603, 522)\nstep 5: CLICK: (240, 599)\nstep 6: TYPE: 527 Mollno St\nstep 7: CLICK: (914, 911)\nstep 8: CLICK: (350, 647)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (404, 246)\nstep 2: CLICK: (731, 565)\nstep 3: PRESS_HOME\nstep 4: CLICK: (603, 522)\nstep 5: CLICK: (240, 599)\nstep 6: TYPE: 527 Mollno St\nstep 7: CLICK: (914, 911)\nstep 8: CLICK: (350, 647)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Trulia: Homes For Sale & Rent', 'Citymapper']\nB: ['Zillow: Homes For Sale & Rent', 'GPS']\nC: ['Realtor.com: Buy, Sell & Rent', 'Uber']\nD: ['Redfin Houses for Sale & Rent', 'Waze Navigation & Live Traffic']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_73_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_73_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_73_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_73_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_73_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_73_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_73_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_73_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_73_8.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: 
['Google Play Store', 'Investing.com']\nB: ['Picturethis', 'PayPal - Send, Shop, Manage']\nC: ['Applock Pro - APP Lock & Guard', 'Cash App']\nD: ['Tripadvisor', 'Google Wallet']\n", + "question": "The corresponding actions are: step 1: CLICK: (591, 381)\nstep 2: CLICK: (226, 562)\nstep 3: CLICK: (234, 679)\nstep 4: CLICK: (218, 777)\nstep 5: CLICK: (521, 554)\nstep 6: CLICK: (511, 659)\nstep 7: CLICK: (511, 767)\nstep 8: CLICK: (879, 547)\nstep 9: PRESS_HOME\nstep 10: CLICK: (840, 533)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (591, 381)\nstep 2: CLICK: (226, 562)\nstep 3: CLICK: (234, 679)\nstep 4: CLICK: (218, 777)\nstep 5: CLICK: (521, 554)\nstep 6: CLICK: (511, 659)\nstep 7: CLICK: (511, 767)\nstep 8: CLICK: (879, 547)\nstep 9: PRESS_HOME\nstep 10: CLICK: (840, 533)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Investing.com']\nB: ['Picturethis', 'PayPal - Send, Shop, Manage']\nC: ['Applock Pro - APP Lock & Guard', 'Cash App']\nD: ['Tripadvisor', 'Google Wallet']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_74_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_74_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_74_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_74_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_74_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_74_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_74_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_74_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_74_8.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_74_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_74_10.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Netflix', 'Dropbox Paper']\nB: ['Youtube', 'Google Keep']\nC: ['Triller', 'Simplenote']\nD: ['Tubi: Movies & Live TV', 'BasicNote - Notes, Notepad']\n", + "question": "The corresponding actions are: step 1: CLICK: (846, 634)\nstep 2: CLICK: (946, 46)\nstep 3: TYPE: 3D Printing Course\nstep 4: CLICK: (488, 96)\nstep 5: CLICK: (388, 343)\nstep 6: CLICK: (907, 397)\nstep 7: CLICK: (260, 844)\nstep 8: PRESS_HOME\nstep 9: CLICK: (417, 490)\nstep 10: CLICK: (877, 903)\nstep 11: CLICK: (444, 654)\nstep 12: CLICK: (145, 134)\nstep 13: TYPE: 3D Printer Course for Beginners \nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (846, 634)\nstep 2: CLICK: (946, 46)\nstep 3: TYPE: 3D Printing Course\nstep 4: CLICK: (488, 96)\nstep 5: CLICK: (388, 343)\nstep 6: CLICK: (907, 397)\nstep 7: CLICK: (260, 844)\nstep 8: PRESS_HOME\nstep 9: CLICK: (417, 490)\nstep 10: CLICK: (877, 903)\nstep 11: CLICK: (444, 654)\nstep 12: CLICK: (145, 134)\nstep 13: TYPE: 3D Printer Course for Beginners \nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Netflix', 'Dropbox Paper']\nB: ['Youtube', 'Google Keep']\nC: ['Triller', 'Simplenote']\nD: ['Tubi: Movies & Live TV', 'BasicNote - Notes, Notepad']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_75_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_75_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_75_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_75_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_75_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_75_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_75_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_75_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_75_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_75_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_75_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_75_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_75_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_75_13.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Firefox', 'Contacts']\nB: ['DuckDuckGo', 'Google Play Store']\nC: ['Wikipedia', 'PlantNet']\nD: ['Chrome', 
'TradingView: Track All Markets']\n", + "question": "The corresponding actions are: step 1: CLICK: (443, 917)\nstep 2: CLICK: (205, 436)\nstep 3: TYPE: Nvidia's stock market news\nstep 4: CLICK: (855, 897)\nstep 5: CLICK: (164, 720)\nstep 6: PRESS_HOME\nstep 7: CLICK: (915, 490)\nstep 8: CLICK: (389, 82)\nstep 9: TYPE: Nvidia\nstep 10: CLICK: (185, 323)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (443, 917)\nstep 2: CLICK: (205, 436)\nstep 3: TYPE: Nvidia's stock market news\nstep 4: CLICK: (855, 897)\nstep 5: CLICK: (164, 720)\nstep 6: PRESS_HOME\nstep 7: CLICK: (915, 490)\nstep 8: CLICK: (389, 82)\nstep 9: TYPE: Nvidia\nstep 10: CLICK: (185, 323)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Firefox', 'Contacts']\nB: ['DuckDuckGo', 'Google Play Store']\nC: ['Wikipedia', 'PlantNet']\nD: ['Chrome', 'TradingView: Track All Markets']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_76_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_76_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_76_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_76_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_76_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_76_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_76_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_76_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_76_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_76_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_76_10.png" + ], + "output": "D" + }, 
+ { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Yahoo Sports', 'Instagram']\nB: ['AP News', 'Messenger']\nC: ['Microsoft News', 'Facebook']\nD: ['Breaking News: local & Alerts', 'Gmail']\n", + "question": "The corresponding actions are: step 1: CLICK: (847, 490)\nstep 2: CLICK: (915, 140)\nstep 3: TYPE: Electric Vehicles\nstep 4: CLICK: (909, 865)\nstep 5: CLICK: (481, 403)\nstep 6: CLICK: (668, 141)\nstep 7: CLICK: (127, 663)\nstep 8: CLICK: (840, 337)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (847, 490)\nstep 2: CLICK: (915, 140)\nstep 3: TYPE: Electric Vehicles\nstep 4: CLICK: (909, 865)\nstep 5: CLICK: (481, 403)\nstep 6: CLICK: (668, 141)\nstep 7: CLICK: (127, 663)\nstep 8: CLICK: (840, 337)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Yahoo Sports', 'Instagram']\nB: ['AP News', 'Messenger']\nC: ['Microsoft News', 'Facebook']\nD: ['Breaking News: local & Alerts', 'Gmail']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_77_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_77_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_77_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_77_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_77_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_77_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_77_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_77_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_77_8.png" + ], + "output": "B" + }, + { + "task": 
"gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Docs', 'Firefox', 'Setting']\nB: ['Dropbox Paper', 'Wikipedia', 'Picturethis']\nC: ['Simplenote', 'Opera', 'PlantNet']\nD: ['Microsoft Word', 'Edge', 'Vaulty:Hide Pictures Videos']\n", + "question": "The corresponding actions are: step 1: CLICK: (382, 264)\nstep 2: CLICK: (435, 146)\nstep 3: TYPE: Bristlecone\nstep 4: CLICK: (453, 165)\nstep 5: CLICK: (409, 280)\nstep 6: CLICK: (951, 87)\nstep 7: CLICK: (741, 97)\nstep 8: PRESS_HOME\nstep 9: CLICK: (340, 396)\nstep 10: CLICK: (476, 915)\nstep 11: CLICK: (183, 885)\nstep 12: CLICK: (880, 630)\nstep 13: SCROLL: RIGHT\nstep 14: SCROLL: LEFT\nstep 15: CLICK: (476, 878)\nstep 16: IMPOSSIBLE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (382, 264)\nstep 2: CLICK: (435, 146)\nstep 3: TYPE: Bristlecone\nstep 4: CLICK: (453, 165)\nstep 5: CLICK: (409, 280)\nstep 6: CLICK: (951, 87)\nstep 7: CLICK: (741, 97)\nstep 8: PRESS_HOME\nstep 9: CLICK: (340, 396)\nstep 10: CLICK: (476, 915)\nstep 11: CLICK: (183, 885)\nstep 12: CLICK: (880, 630)\nstep 13: SCROLL: RIGHT\nstep 14: SCROLL: LEFT\nstep 15: CLICK: (476, 878)\nstep 16: IMPOSSIBLE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Docs', 'Firefox', 'Setting']\nB: ['Dropbox Paper', 'Wikipedia', 'Picturethis']\nC: ['Simplenote', 'Opera', 'PlantNet']\nD: ['Microsoft Word', 'Edge', 'Vaulty:Hide Pictures Videos']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_2.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_78_15.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Applock Pro - APP Lock & Guard', 'Amazon Kindle']\nB: ['Setting', 'Everand']\nC: ['iNaturalist', 'Pocket FM: Audio Series']\nD: ['Picturethis', 'Audible: Audio Entertainment']\n", + "question": "The corresponding actions are: step 1: CLICK: (616, 654)\nstep 2: SCROLL: UP\nstep 3: CLICK: (318, 682)\nstep 4: CLICK: (888, 724)\nstep 5: PRESS_HOME\nstep 6: CLICK: (364, 629)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (616, 654)\nstep 2: SCROLL: UP\nstep 3: CLICK: (318, 682)\nstep 4: CLICK: (888, 724)\nstep 5: PRESS_HOME\nstep 6: CLICK: (364, 629)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Applock Pro - APP Lock & Guard', 'Amazon Kindle']\nB: ['Setting', 'Everand']\nC: ['iNaturalist', 'Pocket FM: Audio Series']\nD: ['Picturethis', 'Audible: Audio Entertainment']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_79_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_79_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_79_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_79_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_79_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_79_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_79_6.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Tubi: Movies & Live TV', 'TradingView: Track All Markets']\nB: ['Netflix', 'Contacts']\nC: ['Youtube', 'Vaulty:Hide Pictures Videos']\nD: ['Shorts VotTak: Short Video App', 'Google Play Store']\n", + "question": "The corresponding actions are: step 1: CLICK: (841, 767)\nstep 2: CLICK: (772, 68)\nstep 3: CLICK: (913, 74)\nstep 4: TYPE: tiktok\nstep 5: PRESS_HOME\nstep 6: CLICK: (394, 726)\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (922, 79)\nstep 9: TYPE: vottak\nstep 10: CLICK: (880, 885)\nstep 11: CLICK: (833, 449)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (841, 767)\nstep 2: CLICK: (772, 68)\nstep 3: CLICK: (913, 74)\nstep 4: TYPE: tiktok\nstep 5: PRESS_HOME\nstep 6: CLICK: (394, 726)\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (922, 79)\nstep 9: TYPE: vottak\nstep 10: CLICK: (880, 885)\nstep 11: CLICK: (833, 449)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tubi: Movies & Live TV', 'TradingView: Track All Markets']\nB: ['Netflix', 'Contacts']\nC: ['Youtube', 'Vaulty:Hide Pictures Videos']\nD: ['Shorts VotTak: Short Video App', 'Google Play Store']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_80_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_80_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_80_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_80_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_80_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_80_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_80_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_80_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_80_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_80_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_80_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_80_11.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Setting', 'Shorts VotTak: Short Video App', 'Google Play Store']\nB: ['Applock Pro - APP Lock & Guard', 'Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos']\nC: ['Google Play Store', 'Pluto TV - Live TV and Movies', 'Contacts']\nD: ['PlantNet', 'Youtube', 'TradingView: Track All Markets']\n", + 
"question": "The corresponding actions are: step 1: CLICK: (857, 849)\nstep 2: TYPE: vottak\nstep 3: CLICK: (324, 137)\nstep 4: CLICK: (855, 325)\nstep 5: CLICK: (913, 65)\nstep 6: CLICK: (674, 657)\nstep 7: PRESS_HOME\nstep 8: CLICK: (395, 828)\nstep 9: CLICK: (465, 531)\nstep 10: CLICK: (362, 335)\nstep 11: CLICK: (156, 413)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (857, 849)\nstep 2: TYPE: vottak\nstep 3: CLICK: (324, 137)\nstep 4: CLICK: (855, 325)\nstep 5: CLICK: (913, 65)\nstep 6: CLICK: (674, 657)\nstep 7: PRESS_HOME\nstep 8: CLICK: (395, 828)\nstep 9: CLICK: (465, 531)\nstep 10: CLICK: (362, 335)\nstep 11: CLICK: (156, 413)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Setting', 'Shorts VotTak: Short Video App', 'Google Play Store']\nB: ['Applock Pro - APP Lock & Guard', 'Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos']\nC: ['Google Play Store', 'Pluto TV - Live TV and Movies', 'Contacts']\nD: ['PlantNet', 'Youtube', 'TradingView: Track All Markets']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_81_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_81_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_81_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_81_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_81_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_81_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_81_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_81_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_81_8.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_81_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_81_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_81_11.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Tiktok', 'Setting']\nB: ['Youtube', 'iNaturalist']\nC: ['Shorts VotTak: Short Video App', 'PlantNet']\nD: ['Pluto TV - Live TV and Movies', 'Applock Pro - APP Lock & Guard']\n", + "question": "The corresponding actions are: step 1: CLICK: (835, 522)\nstep 2: CLICK: (922, 296)\nstep 3: PRESS_HOME\nstep 4: CLICK: (393, 529)\nstep 5: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (835, 522)\nstep 2: CLICK: (922, 296)\nstep 3: PRESS_HOME\nstep 4: CLICK: (393, 529)\nstep 5: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tiktok', 'Setting']\nB: ['Youtube', 'iNaturalist']\nC: ['Shorts VotTak: Short Video App', 'PlantNet']\nD: ['Pluto TV - Live TV and Movies', 'Applock Pro - APP Lock & Guard']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_82_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_82_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_82_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_82_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_82_4.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Youtube', 'Google Play Store']\nB: ['Shorts VotTak: Short Video App', 'Applock Pro - APP Lock & Guard']\nC: 
['Tiktok', 'TradingView: Track All Markets']\nD: ['Triller', 'Contacts']\n", + "question": "The corresponding actions are: step 1: CLICK: (614, 806)\nstep 2: PRESS_HOME\nstep 3: CLICK: (829, 815)\nstep 4: CLICK: (807, 53)\nstep 5: TYPE: Peloton App\nstep 6: CLICK: (940, 909)\nstep 7: CLICK: (843, 327)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (614, 806)\nstep 2: PRESS_HOME\nstep 3: CLICK: (829, 815)\nstep 4: CLICK: (807, 53)\nstep 5: TYPE: Peloton App\nstep 6: CLICK: (940, 909)\nstep 7: CLICK: (843, 327)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Youtube', 'Google Play Store']\nB: ['Shorts VotTak: Short Video App', 'Applock Pro - APP Lock & Guard']\nC: ['Tiktok', 'TradingView: Track All Markets']\nD: ['Triller', 'Contacts']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_83_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_83_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_83_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_83_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_83_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_83_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_83_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_83_7.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Pandora', 'Tumblr']\nB: ['Spotify', 'Facebook']\nC: ['Amazon Music', 'Messenger']\nD: ['YT Music', 'Instagram']\n", + "question": "The corresponding actions are: step 1: CLICK: (880, 116)\nstep 2: 
CLICK: (408, 55)\nstep 3: TYPE: Electronic\nstep 4: CLICK: (912, 914)\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (862, 119)\nstep 7: CLICK: (928, 189)\nstep 8: PRESS_HOME\nstep 9: CLICK: (617, 252)\nstep 10: TYPE: Electronic\nstep 11: CLICK: (895, 600)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (880, 116)\nstep 2: CLICK: (408, 55)\nstep 3: TYPE: Electronic\nstep 4: CLICK: (912, 914)\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (862, 119)\nstep 7: CLICK: (928, 189)\nstep 8: PRESS_HOME\nstep 9: CLICK: (617, 252)\nstep 10: TYPE: Electronic\nstep 11: CLICK: (895, 600)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Pandora', 'Tumblr']\nB: ['Spotify', 'Facebook']\nC: ['Amazon Music', 'Messenger']\nD: ['YT Music', 'Instagram']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_84_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_84_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_84_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_84_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_84_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_84_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_84_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_84_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_84_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_84_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_84_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_84_11.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + 
"visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Setting', 'Contacts', 'Shorts VotTak: Short Video App']\nB: ['Plantin', 'PlantNet', 'Tubi: Movies & Live TV']\nC: ['iNaturalist', 'Google Play Store', 'Tiktok']\nD: ['Google Play Store', 'Setting', 'Likee']\n", + "question": "The corresponding actions are: step 1: CLICK: (858, 821)\nstep 2: CLICK: (773, 244)\nstep 3: PRESS_HOME\nstep 4: CLICK: (437, 809)\nstep 5: CLICK: (279, 516)\nstep 6: CLICK: (836, 404)\nstep 7: CLICK: (77, 52)\nstep 8: CLICK: (188, 412)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (858, 821)\nstep 2: CLICK: (773, 244)\nstep 3: PRESS_HOME\nstep 4: CLICK: (437, 809)\nstep 5: CLICK: (279, 516)\nstep 6: CLICK: (836, 404)\nstep 7: CLICK: (77, 52)\nstep 8: CLICK: (188, 412)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Setting', 'Contacts', 'Shorts VotTak: Short Video App']\nB: ['Plantin', 'PlantNet', 'Tubi: Movies & Live TV']\nC: ['iNaturalist', 'Google Play Store', 'Tiktok']\nD: ['Google Play Store', 'Setting', 'Likee']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_85_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_85_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_85_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_85_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_85_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_85_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_85_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_85_7.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_85_8.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Chatbot AI & Smart Assistant', 'Firefox']\nB: ['Microsoft Copilot', 'Bing: chat with AI & GPT4']\nC: ['Chatty - AI Assistant', 'DuckDuckGo']\nD: ['ChatOn - AI Chat Bot Assistant', 'Opera']\n", + "question": "The corresponding actions are: step 1: CLICK: (377, 377)\nstep 2: TYPE: tell me about Central Limit Theorem\nstep 3: CLICK: (896, 576)\nstep 4: PRESS_HOME\nstep 5: CLICK: (408, 235)\nstep 6: CLICK: (438, 319)\nstep 7: TYPE: Central Limit Theorem\nstep 8: CLICK: (301, 151)\nstep 9: CLICK: (133, 542)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (377, 377)\nstep 2: TYPE: tell me about Central Limit Theorem\nstep 3: CLICK: (896, 576)\nstep 4: PRESS_HOME\nstep 5: CLICK: (408, 235)\nstep 6: CLICK: (438, 319)\nstep 7: TYPE: Central Limit Theorem\nstep 8: CLICK: (301, 151)\nstep 9: CLICK: (133, 542)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Chatbot AI & Smart Assistant', 'Firefox']\nB: ['Microsoft Copilot', 'Bing: chat with AI & GPT4']\nC: ['Chatty - AI Assistant', 'DuckDuckGo']\nD: ['ChatOn - AI Chat Bot Assistant', 'Opera']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_86_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_86_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_86_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_86_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_86_4.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_86_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_86_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_86_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_86_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_86_9.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Lazada', 'Setting']\nB: ['REVOLVE', 'PlantNet']\nC: ['SSENSE', 'Google Play Store']\nD: ['Alibaba.com - B2B marketplace', 'Vaulty:Hide Pictures Videos']\n", + "question": "The corresponding actions are: step 1: CLICK: (372, 667)\nstep 2: CLICK: (369, 84)\nstep 3: TYPE: SSENSE\nstep 4: CLICK: (938, 918)\nstep 5: CLICK: (753, 336)\nstep 6: CLICK: (838, 333)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (372, 667)\nstep 2: CLICK: (369, 84)\nstep 3: TYPE: SSENSE\nstep 4: CLICK: (938, 918)\nstep 5: CLICK: (753, 336)\nstep 6: CLICK: (838, 333)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Lazada', 'Setting']\nB: ['REVOLVE', 'PlantNet']\nC: ['SSENSE', 'Google Play Store']\nD: ['Alibaba.com - B2B marketplace', 'Vaulty:Hide Pictures Videos']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_87_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_87_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_87_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_87_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_87_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_87_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_87_6.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Tripadvisor', 'Netflix']\nB: ['Applock Pro - APP Lock & Guard', 'Shorts VotTak: Short Video App']\nC: ['Setting', 'Pluto TV - Live TV and Movies']\nD: ['Google Play Store', 'Youtube']\n", + "question": "The corresponding actions are: step 1: CLICK: (317, 911)\nstep 2: PRESS_HOME\nstep 3: CLICK: (663, 919)\nstep 4: TYPE: Aaptiv\nstep 5: CLICK: (208, 176)\nstep 6: CLICK: (620, 431)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (317, 911)\nstep 2: PRESS_HOME\nstep 3: CLICK: (663, 919)\nstep 4: TYPE: Aaptiv\nstep 5: CLICK: (208, 176)\nstep 6: CLICK: (620, 431)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tripadvisor', 'Netflix']\nB: ['Applock Pro - APP Lock & Guard', 'Shorts VotTak: Short Video App']\nC: ['Setting', 'Pluto TV - Live TV and Movies']\nD: ['Google Play Store', 'Youtube']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_88_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_88_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_88_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_88_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_88_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_88_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_88_6.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['iNaturalist', 'Temu']\nB: ['Google Play Store', 'THE OUTNET']\nC: ['Applock Pro - APP Lock & Guard', 'Amazon']\nD: ['Vaulty:Hide Pictures Videos', 'MATCHES']\n", + "question": "The corresponding actions are: step 1: CLICK: (447, 152)\nstep 2: CLICK: (295, 127)\nstep 3: TYPE: THE OUTNET\nstep 4: CLICK: (920, 742)\nstep 5: CLICK: (915, 228)\nstep 6: CLICK: (905, 297)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (447, 152)\nstep 2: CLICK: (295, 127)\nstep 3: TYPE: THE OUTNET\nstep 4: CLICK: (920, 742)\nstep 5: CLICK: (915, 228)\nstep 6: CLICK: (905, 297)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['iNaturalist', 'Temu']\nB: ['Google Play Store', 'THE OUTNET']\nC: ['Applock Pro - APP Lock & Guard', 'Amazon']\nD: ['Vaulty:Hide Pictures Videos', 'MATCHES']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_89_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_89_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_89_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_89_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_89_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_89_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_89_6.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Edge', 'Picturethis']\nB: ['Firefox', 'Contacts']\nC: ['Bing: chat with AI & GPT4', 'Google Play Store']\nD: ['Duckduckgo', 'Setting']\n", + "question": "The corresponding actions are: step 1: CLICK: (188, 607)\nstep 2: CLICK: (305, 829)\nstep 3: SCROLL: UP\nstep 4: CLICK: (319, 690)\nstep 5: CLICK: (808, 154)\nstep 6: TYPE: Duckduckgo\nstep 7: CLICK: (347, 381)\nstep 8: CLICK: (333, 787)\nstep 9: CLICK: (786, 670)\nstep 10: CLICK: (86, 157)\nstep 11: CLICK: (227, 631)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (188, 607)\nstep 2: CLICK: (305, 829)\nstep 3: SCROLL: UP\nstep 4: CLICK: (319, 690)\nstep 5: CLICK: (808, 154)\nstep 6: TYPE: Duckduckgo\nstep 7: CLICK: (347, 381)\nstep 8: CLICK: (333, 787)\nstep 9: CLICK: (786, 670)\nstep 10: CLICK: (86, 157)\nstep 11: CLICK: (227, 631)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Edge', 'Picturethis']\nB: ['Firefox', 'Contacts']\nC: ['Bing: chat with AI & GPT4', 'Google Play Store']\nD: ['Duckduckgo', 'Setting']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_90_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_90_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_90_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_90_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_90_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_90_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_90_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_90_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_90_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_90_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_90_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_90_11.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['DeepL translate', 'Yahoo Sports', 'Microsoft Word']\nB: ['Language Translator: Translate', 'Yahoo Finance: Stock News', 'Simplenote']\nC: ['Microsoft Translator', 'CNN Breaking US & World News', 'Google Keep']\nD: ['Google Translate', 'AP News', 'Google Docs']\n", + "question": "The corresponding actions are: step 1: CLICK: (659, 123)\nstep 2: CLICK: 
(409, 482)\nstep 3: PRESS_HOME\nstep 4: CLICK: (158, 261)\nstep 5: CLICK: (255, 771)\nstep 6: TYPE: Highway collapse kills dozens in southern China\nstep 7: LONG_PRESS: (413, 373)\nstep 8: PRESS_HOME\nstep 9: CLICK: (182, 397)\nstep 10: CLICK: (831, 896)\nstep 11: CLICK: (501, 652)\nstep 12: CLICK: (168, 155)\nstep 13: TYPE: Highway collapse kills dozens in southern China\nstep 14: CLICK: (83, 78)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (659, 123)\nstep 2: CLICK: (409, 482)\nstep 3: PRESS_HOME\nstep 4: CLICK: (158, 261)\nstep 5: CLICK: (255, 771)\nstep 6: TYPE: Highway collapse kills dozens in southern China\nstep 7: LONG_PRESS: (413, 373)\nstep 8: PRESS_HOME\nstep 9: CLICK: (182, 397)\nstep 10: CLICK: (831, 896)\nstep 11: CLICK: (501, 652)\nstep 12: CLICK: (168, 155)\nstep 13: TYPE: Highway collapse kills dozens in southern China\nstep 14: CLICK: (83, 78)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['DeepL translate', 'Yahoo Sports', 'Microsoft Word']\nB: ['Language Translator: Translate', 'Yahoo Finance: Stock News', 'Simplenote']\nC: ['Microsoft Translator', 'CNN Breaking US & World News', 'Google Keep']\nD: ['Google Translate', 'AP News', 'Google Docs']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_91_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_91_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_91_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_91_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_91_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_91_5.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_91_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_91_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_91_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_91_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_91_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_91_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_91_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_91_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_91_14.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Wish', 'Edge', 'Edge']\nB: ['AliExpress', 'wikiHow', 'Firefox']\nC: ['Temu', 'Firefox', 'Bing: chat with AI & GPT4']\nD: ['Amazon', 'Chrome', 'wikiHow']\n", + "question": "The corresponding actions are: step 1: CLICK: (368, 361)\nstep 2: CLICK: (465, 59)\nstep 3: TYPE: book about poetry\nstep 4: CLICK: (933, 915)\nstep 5: PRESS_HOME\nstep 6: CLICK: (384, 240)\nstep 7: CLICK: (936, 56)\nstep 8: TYPE: the ode less travelled book\nstep 9: CLICK: (925, 902)\nstep 10: PRESS_HOME\nstep 11: CLICK: (137, 113)\nstep 12: CLICK: (368, 54)\nstep 13: TYPE: the ode less travelled book\nstep 14: CLICK: (928, 905)\nstep 15: SCROLL: UP\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (368, 361)\nstep 2: CLICK: (465, 59)\nstep 3: TYPE: book about poetry\nstep 4: CLICK: (933, 915)\nstep 5: PRESS_HOME\nstep 6: CLICK: (384, 240)\nstep 7: CLICK: (936, 56)\nstep 8: TYPE: the ode less travelled book\nstep 9: CLICK: (925, 902)\nstep 10: PRESS_HOME\nstep 11: CLICK: (137, 113)\nstep 12: CLICK: (368, 54)\nstep 13: TYPE: the ode less travelled book\nstep 14: CLICK: (928, 905)\nstep 15: SCROLL: UP\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Wish', 'Edge', 'Edge']\nB: ['AliExpress', 'wikiHow', 'Firefox']\nC: ['Temu', 'Firefox', 'Bing: chat with AI & GPT4']\nD: ['Amazon', 'Chrome', 'wikiHow']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_92_15.png" + ], + "output": "B" + }, + { + "task": 
"gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Gmail', 'Tumblr']\nB: ['Facebook', 'Messenger']\nC: ['Instagram', 'Gmail']\nD: ['Threads', 'Instagram']\n", + "question": "The corresponding actions are: step 1: CLICK: (559, 146)\nstep 2: CLICK: (668, 65)\nstep 3: CLICK: (636, 134)\nstep 4: CLICK: (324, 228)\nstep 5: TYPE: I am sad now\nstep 6: CLICK: (739, 62)\nstep 7: SCROLL: UP\nstep 8: CLICK: (718, 598)\nstep 9: CLICK: (739, 844)\nstep 10: CLICK: (365, 704)\nstep 11: CLICK: (735, 471)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (559, 146)\nstep 2: CLICK: (668, 65)\nstep 3: CLICK: (636, 134)\nstep 4: CLICK: (324, 228)\nstep 5: TYPE: I am sad now\nstep 6: CLICK: (739, 62)\nstep 7: SCROLL: UP\nstep 8: CLICK: (718, 598)\nstep 9: CLICK: (739, 844)\nstep 10: CLICK: (365, 704)\nstep 11: CLICK: (735, 471)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Gmail', 'Tumblr']\nB: ['Facebook', 'Messenger']\nC: ['Instagram', 'Gmail']\nD: ['Threads', 'Instagram']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_93_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_93_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_93_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_93_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_93_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_93_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_93_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_93_7.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_93_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_93_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_93_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_93_11.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Photos', 'Applock Pro - APP Lock & Guard']\nB: ['Google Photos', 'Tripadvisor']\nC: ['ABPV', 'Contacts']\nD: ['Mapillary', 'Setting']\n", + "question": "The corresponding actions are: step 1: CLICK: (712, 544)\nstep 2: CLICK: (583, 577)\nstep 3: CLICK: (480, 560)\nstep 4: CLICK: (424, 555)\nstep 5: CLICK: (596, 659)\nstep 6: CLICK: (497, 664)\nstep 7: CLICK: (410, 666)\nstep 8: SCROLL: UP\nstep 9: CLICK: (675, 921)\nstep 10: PRESS_HOME\nstep 11: CLICK: (856, 710)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (712, 544)\nstep 2: CLICK: (583, 577)\nstep 3: CLICK: (480, 560)\nstep 4: CLICK: (424, 555)\nstep 5: CLICK: (596, 659)\nstep 6: CLICK: (497, 664)\nstep 7: CLICK: (410, 666)\nstep 8: SCROLL: UP\nstep 9: CLICK: (675, 921)\nstep 10: PRESS_HOME\nstep 11: CLICK: (856, 710)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Photos', 'Applock Pro - APP Lock & Guard']\nB: ['Google Photos', 'Tripadvisor']\nC: ['ABPV', 'Contacts']\nD: ['Mapillary', 'Setting']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_94_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_94_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_94_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_94_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_94_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_94_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_94_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_94_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_94_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_94_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_94_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_94_11.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Uber', 'Tubi: Movies & Live TV']\nB: ['GPS', 'Tiktok']\nC: ['Citymapper', 'Shorts VotTak: Short Video App']\nD: ['Google Map', 'Netflix']\n", + "question": "The corresponding actions are: step 1: CLICK: (413, 273)\nstep 2: CLICK: (953, 75)\nstep 3: TYPE: the best chinese dim sum restaurant in Los Angeles\nstep 4: CLICK: (958, 922)\nstep 5: CLICK: (264, 
467)\nstep 6: PRESS_HOME\nstep 7: CLICK: (127, 154)\nstep 8: CLICK: (556, 179)\nstep 9: CLICK: (232, 832)\nstep 10: TYPE: Won Kok Restaurant \nstep 11: CLICK: (922, 912)\nstep 12: CLICK: (326, 265)\nstep 13: CLICK: (911, 916)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (413, 273)\nstep 2: CLICK: (953, 75)\nstep 3: TYPE: the best chinese dim sum restaurant in Los Angeles\nstep 4: CLICK: (958, 922)\nstep 5: CLICK: (264, 467)\nstep 6: PRESS_HOME\nstep 7: CLICK: (127, 154)\nstep 8: CLICK: (556, 179)\nstep 9: CLICK: (232, 832)\nstep 10: TYPE: Won Kok Restaurant \nstep 11: CLICK: (922, 912)\nstep 12: CLICK: (326, 265)\nstep 13: CLICK: (911, 916)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Uber', 'Tubi: Movies & Live TV']\nB: ['GPS', 'Tiktok']\nC: ['Citymapper', 'Shorts VotTak: Short Video App']\nD: ['Google Map', 'Netflix']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_95_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_95_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_95_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_95_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_95_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_95_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_95_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_95_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_95_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_95_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_95_10.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_95_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_95_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_95_13.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['TickTick', 'DuckDuckGo']\nB: ['To-Do List', 'Firefox']\nC: ['Microsoft to do', 'Chrome']\nD: ['Any.do', 'Bing: chat with AI & GPT4']\n", + "question": "The corresponding actions are: step 1: CLICK: (605, 815)\nstep 2: CLICK: (795, 184)\nstep 3: TYPE: when is the next fashion week in Paris\nstep 4: CLICK: (444, 167)\nstep 5: PRESS_HOME\nstep 6: CLICK: (393, 647)\nstep 7: CLICK: (424, 937)\nstep 8: CLICK: (469, 884)\nstep 9: CLICK: (770, 362)\nstep 10: CLICK: (749, 364)\nstep 11: CLICK: (412, 588)\nstep 12: CLICK: (750, 726)\nstep 13: CLICK: (242, 891)\nstep 14: TYPE: the next fashion show in Paris\nstep 15: CLICK: (936, 902)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (605, 815)\nstep 2: CLICK: (795, 184)\nstep 3: TYPE: when is the next fashion week in Paris\nstep 4: CLICK: (444, 167)\nstep 5: PRESS_HOME\nstep 6: CLICK: (393, 647)\nstep 7: CLICK: (424, 937)\nstep 8: CLICK: (469, 884)\nstep 9: CLICK: (770, 362)\nstep 10: CLICK: (749, 364)\nstep 11: CLICK: (412, 588)\nstep 12: CLICK: (750, 726)\nstep 13: CLICK: (242, 891)\nstep 14: TYPE: the next fashion show in Paris\nstep 15: CLICK: (936, 902)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['TickTick', 'DuckDuckGo']\nB: ['To-Do List', 'Firefox']\nC: ['Microsoft to do', 'Chrome']\nD: ['Any.do', 'Bing: chat with AI & GPT4']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_96_15.png" + ], + "output": "C" + }, + { + "task": 
"gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Basic Calculator: GPA & Math', 'Triller']\nB: ['Calculator', 'Netflix']\nC: ['Google Drive', 'Likee']\nD: ['Clock', 'Youtube']\n", + "question": "The corresponding actions are: step 1: CLICK: (331, 912)\nstep 2: TYPE: Origami figure\nstep 3: CLICK: (873, 884)\nstep 4: CLICK: (335, 613)\nstep 5: PRESS_HOME\nstep 6: CLICK: (430, 471)\nstep 7: TYPE: 630\nstep 8: CLICK: (751, 842)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (331, 912)\nstep 2: TYPE: Origami figure\nstep 3: CLICK: (873, 884)\nstep 4: CLICK: (335, 613)\nstep 5: PRESS_HOME\nstep 6: CLICK: (430, 471)\nstep 7: TYPE: 630\nstep 8: CLICK: (751, 842)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Basic Calculator: GPA & Math', 'Triller']\nB: ['Calculator', 'Netflix']\nC: ['Google Drive', 'Likee']\nD: ['Clock', 'Youtube']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_97_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_97_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_97_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_97_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_97_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_97_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_97_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_97_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_97_8.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": 
"GUI-Odyssey", + "options": "A: ['Edge', 'Pluto TV - Live TV and Movies']\nB: ['wikiHow', 'Likee']\nC: ['Quora', 'Tubi: Movies & Live TV']\nD: ['Opera', 'Youtube']\n", + "question": "The corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (824, 126)\nstep 3: CLICK: (325, 112)\nstep 4: TYPE: free horror movie on youtube\nstep 5: CLICK: (892, 903)\nstep 6: CLICK: (169, 424)\nstep 7: CLICK: (795, 114)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (824, 126)\nstep 3: CLICK: (325, 112)\nstep 4: TYPE: free horror movie on youtube\nstep 5: CLICK: (892, 903)\nstep 6: CLICK: (169, 424)\nstep 7: CLICK: (795, 114)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Edge', 'Pluto TV - Live TV and Movies']\nB: ['wikiHow', 'Likee']\nC: ['Quora', 'Tubi: Movies & Live TV']\nD: ['Opera', 'Youtube']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_98_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_98_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_98_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_98_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_98_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_98_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_98_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_98_7.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Calendar', 'Wikipedia']\nB: ['Calculator', 'DuckDuckGo']\nC: ['Clock', 'Chrome']\nD: ['ClevCalc - Calculator', 'Opera']\n", 
+ "question": "The corresponding actions are: step 1: CLICK: (624, 813)\nstep 2: CLICK: (466, 171)\nstep 3: CLICK: (904, 121)\nstep 4: TYPE: nature sound video\nstep 5: CLICK: (912, 905)\nstep 6: CLICK: (551, 344)\nstep 7: PRESS_HOME\nstep 8: CLICK: (598, 368)\nstep 9: CLICK: (870, 140)\nstep 10: TYPE: 10000\nstep 11: CLICK: (566, 821)\nstep 12: PRESS_RECENT\nstep 13: CLICK: (52, 182)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (624, 813)\nstep 2: CLICK: (466, 171)\nstep 3: CLICK: (904, 121)\nstep 4: TYPE: nature sound video\nstep 5: CLICK: (912, 905)\nstep 6: CLICK: (551, 344)\nstep 7: PRESS_HOME\nstep 8: CLICK: (598, 368)\nstep 9: CLICK: (870, 140)\nstep 10: TYPE: 10000\nstep 11: CLICK: (566, 821)\nstep 12: PRESS_RECENT\nstep 13: CLICK: (52, 182)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Calendar', 'Wikipedia']\nB: ['Calculator', 'DuckDuckGo']\nC: ['Clock', 'Chrome']\nD: ['ClevCalc - Calculator', 'Opera']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_99_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_99_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_99_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_99_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_99_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_99_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_99_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_99_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_99_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_99_9.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_99_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_99_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_99_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_99_13.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Amazon Kindle', 'Tripadvisor']\nB: ['Kobo Books - eBooks Audiobooks', 'Picturethis']\nC: ['Google Play Books & Audiobooks', 'Setting']\nD: ['Pocket FM: Audio Series', 'Google Play Store']\n", + "question": "The corresponding actions are: step 1: CLICK: (342, 266)\nstep 2: SCROLL: UP\nstep 3: CLICK: (374, 550)\nstep 4: CLICK: (902, 652)\nstep 5: PRESS_HOME\nstep 6: CLICK: (179, 268)\nstep 7: CLICK: (444, 554)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (342, 266)\nstep 2: SCROLL: UP\nstep 3: CLICK: (374, 550)\nstep 4: CLICK: (902, 652)\nstep 5: PRESS_HOME\nstep 6: CLICK: (179, 268)\nstep 7: CLICK: (444, 554)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Amazon Kindle', 'Tripadvisor']\nB: ['Kobo Books - eBooks Audiobooks', 'Picturethis']\nC: ['Google Play Books & Audiobooks', 'Setting']\nD: ['Pocket FM: Audio Series', 'Google Play Store']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_100_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_100_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_100_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_100_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_100_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_100_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_100_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_100_7.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Picturethis', 'Tripadvisor', 'Netflix']\nB: ['Plantin', 'Vaulty:Hide Pictures Videos', 'Tiktok']\nC: ['Google Play Store', 'Setting', 'Shorts VotTak: Short Video App']\nD: ['iNaturalist', 'Picturethis', 'Likee']\n", + "question": "The corresponding actions are: step 1: CLICK: (827, 749)\nstep 2: CLICK: (830, 468)\nstep 3: PRESS_HOME\nstep 4: CLICK: (374, 746)\nstep 5: CLICK: (208, 576)\nstep 6: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (827, 749)\nstep 2: CLICK: (830, 468)\nstep 3: PRESS_HOME\nstep 4: CLICK: (374, 746)\nstep 5: CLICK: (208, 576)\nstep 6: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Picturethis', 'Tripadvisor', 'Netflix']\nB: ['Plantin', 'Vaulty:Hide Pictures Videos', 'Tiktok']\nC: ['Google Play Store', 'Setting', 'Shorts VotTak: Short Video App']\nD: ['iNaturalist', 'Picturethis', 'Likee']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_101_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_101_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_101_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_101_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_101_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_101_5.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['X', 'AliExpress']\nB: ['Tumblr', 'Wish']\nC: ['Gmail', 'Temu']\nD: ['Instagram', 'SSENSE']\n", + "question": "The corresponding actions are: step 1: CLICK: (237, 734)\nstep 2: CLICK: (927, 701)\nstep 3: CLICK: (63, 658)\nstep 4: TYPE: action camera recommendation\nstep 5: CLICK: (748, 96)\nstep 6: PRESS_HOME\nstep 7: CLICK: (70, 580)\nstep 8: CLICK: (69, 531)\nstep 9: CLICK: (58, 123)\nstep 10: TYPE: Insta360 One\nstep 11: CLICK: (743, 34)\nstep 12: CLICK: (716, 636)\nstep 13: CLICK: (927, 644)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (237, 734)\nstep 2: CLICK: (927, 701)\nstep 3: CLICK: (63, 658)\nstep 4: TYPE: action camera recommendation\nstep 5: CLICK: (748, 96)\nstep 6: PRESS_HOME\nstep 7: CLICK: (70, 580)\nstep 8: CLICK: (69, 531)\nstep 9: CLICK: (58, 123)\nstep 10: TYPE: Insta360 One\nstep 11: CLICK: (743, 34)\nstep 12: CLICK: (716, 636)\nstep 13: CLICK: (927, 644)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['X', 'AliExpress']\nB: ['Tumblr', 'Wish']\nC: ['Gmail', 'Temu']\nD: ['Instagram', 'SSENSE']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_102_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_102_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_102_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_102_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_102_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_102_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_102_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_102_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_102_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_102_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_102_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_102_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_102_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_102_13.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Play Store', 'Youtube']\nB: ['iNaturalist', 'Tiktok']\nC: ['PlantNet', 'Netflix']\nD: ['Vaulty:Hide Pictures Videos', 'Triller']\n", + "question": "The 
corresponding actions are: step 1: CLICK: (625, 844)\nstep 2: PRESS_HOME\nstep 3: CLICK: (840, 825)\nstep 4: CLICK: (819, 76)\nstep 5: CLICK: (936, 78)\nstep 6: TYPE: Apple Fitness Plus\nstep 7: CLICK: (924, 906)\nstep 8: CLICK: (721, 552)\nstep 9: CLICK: (712, 235)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (625, 844)\nstep 2: PRESS_HOME\nstep 3: CLICK: (840, 825)\nstep 4: CLICK: (819, 76)\nstep 5: CLICK: (936, 78)\nstep 6: TYPE: Apple Fitness Plus\nstep 7: CLICK: (924, 906)\nstep 8: CLICK: (721, 552)\nstep 9: CLICK: (712, 235)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Youtube']\nB: ['iNaturalist', 'Tiktok']\nC: ['PlantNet', 'Netflix']\nD: ['Vaulty:Hide Pictures Videos', 'Triller']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_103_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_103_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_103_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_103_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_103_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_103_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_103_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_103_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_103_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_103_9.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Play Store', 'Setting']\nB: ['Picturethis', 
'Picturethis']\nC: ['Applock Pro - APP Lock & Guard', 'Contacts']\nD: ['Vaulty:Hide Pictures Videos', 'Tripadvisor']\n", + "question": "The corresponding actions are: step 1: CLICK: (86, 652)\nstep 2: CLICK: (279, 93)\nstep 3: TYPE: eBay\nstep 4: CLICK: (859, 887)\nstep 5: CLICK: (612, 441)\nstep 6: CLICK: (697, 535)\nstep 7: PRESS_HOME\nstep 8: CLICK: (425, 654)\nstep 9: CLICK: (205, 475)\nstep 10: CLICK: (464, 857)\nstep 11: CLICK: (916, 83)\nstep 12: TYPE: eBay\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (86, 652)\nstep 2: CLICK: (279, 93)\nstep 3: TYPE: eBay\nstep 4: CLICK: (859, 887)\nstep 5: CLICK: (612, 441)\nstep 6: CLICK: (697, 535)\nstep 7: PRESS_HOME\nstep 8: CLICK: (425, 654)\nstep 9: CLICK: (205, 475)\nstep 10: CLICK: (464, 857)\nstep 11: CLICK: (916, 83)\nstep 12: TYPE: eBay\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Setting']\nB: ['Picturethis', 'Picturethis']\nC: ['Applock Pro - APP Lock & Guard', 'Contacts']\nD: ['Vaulty:Hide Pictures Videos', 'Tripadvisor']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_104_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_104_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_104_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_104_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_104_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_104_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_104_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_104_7.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_104_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_104_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_104_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_104_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_104_12.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Applock Pro - APP Lock & Guard', 'Google Pay']\nB: ['Setting', 'Chime - Mobile Banking']\nC: ['iNaturalist', 'Cash App']\nD: ['Plantin', 'Investing.com']\n", + "question": "The corresponding actions are: step 1: CLICK: (809, 503)\nstep 2: CLICK: (210, 567)\nstep 3: CLICK: (530, 564)\nstep 4: CLICK: (816, 580)\nstep 5: CLICK: (791, 687)\nstep 6: CLICK: (478, 677)\nstep 7: CLICK: (194, 677)\nstep 8: SCROLL: UP\nstep 9: CLICK: (874, 854)\nstep 10: PRESS_HOME\nstep 11: CLICK: (105, 634)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (809, 503)\nstep 2: CLICK: (210, 567)\nstep 3: CLICK: (530, 564)\nstep 4: CLICK: (816, 580)\nstep 5: CLICK: (791, 687)\nstep 6: CLICK: (478, 677)\nstep 7: CLICK: (194, 677)\nstep 8: SCROLL: UP\nstep 9: CLICK: (874, 854)\nstep 10: PRESS_HOME\nstep 11: CLICK: (105, 634)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Applock Pro - APP Lock & Guard', 'Google Pay']\nB: ['Setting', 'Chime - Mobile Banking']\nC: ['iNaturalist', 'Cash App']\nD: ['Plantin', 'Investing.com']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_105_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_105_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_105_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_105_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_105_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_105_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_105_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_105_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_105_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_105_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_105_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_105_11.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['X', 'Facebook']\nB: ['Threads', 'Instagram']\nC: ['Gmail', 'X']\nD: ['Instagram', 'Whatsapp']\n", + "question": "The corresponding actions are: step 1: CLICK: (554, 156)\nstep 2: CLICK: (727, 70)\nstep 3: TYPE: climate change rally\nstep 4: CLICK: (860, 887)\nstep 5: CLICK: (496, 138)\nstep 6: CLICK: (332, 240)\nstep 7: 
SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: PRESS_HOME\nstep 10: CLICK: (559, 330)\nstep 11: CLICK: (407, 180)\nstep 12: CLICK: (225, 918)\nstep 13: TYPE: Honk for Climate Action will come on JUN 1\nstep 14: CLICK: (959, 488)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (554, 156)\nstep 2: CLICK: (727, 70)\nstep 3: TYPE: climate change rally\nstep 4: CLICK: (860, 887)\nstep 5: CLICK: (496, 138)\nstep 6: CLICK: (332, 240)\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: PRESS_HOME\nstep 10: CLICK: (559, 330)\nstep 11: CLICK: (407, 180)\nstep 12: CLICK: (225, 918)\nstep 13: TYPE: Honk for Climate Action will come on JUN 1\nstep 14: CLICK: (959, 488)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['X', 'Facebook']\nB: ['Threads', 'Instagram']\nC: ['Gmail', 'X']\nD: ['Instagram', 'Whatsapp']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_106_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_106_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_106_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_106_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_106_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_106_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_106_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_106_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_106_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_106_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_106_10.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_106_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_106_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_106_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_106_14.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Vaulty:Hide Pictures Videos', 'Applock Pro - APP Lock & Guard', 'Youtube']\nB: ['PlantNet', 'Contacts', 'Likee']\nC: ['TradingView: Track All Markets', 'Plantin', 'Tubi: Movies & Live TV']\nD: ['Setting', 'Google Play Store', 'Shorts VotTak: Short Video App']\n", + "question": "The corresponding actions are: step 1: CLICK: (672, 909)\nstep 2: CLICK: (872, 450)\nstep 3: CLICK: (716, 29)\nstep 4: PRESS_HOME\nstep 5: CLICK: (552, 934)\nstep 6: CLICK: (918, 543)\nstep 7: CLICK: (386, 72)\nstep 8: CLICK: (483, 511)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (672, 909)\nstep 2: CLICK: (872, 450)\nstep 3: CLICK: (716, 29)\nstep 4: PRESS_HOME\nstep 5: CLICK: (552, 934)\nstep 6: CLICK: (918, 543)\nstep 7: CLICK: (386, 72)\nstep 8: CLICK: (483, 511)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Vaulty:Hide Pictures Videos', 'Applock Pro - APP Lock & Guard', 'Youtube']\nB: ['PlantNet', 'Contacts', 'Likee']\nC: ['TradingView: Track All Markets', 'Plantin', 'Tubi: Movies & Live TV']\nD: ['Setting', 'Google Play Store', 'Shorts VotTak: Short Video App']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_107_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_107_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_107_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_107_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_107_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_107_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_107_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_107_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_107_8.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Setting', 'Wish']\nB: ['Tripadvisor', 'Joom']\nC: ['PlantNet', 'SHEIN']\nD: ['Google Play Store', 'Lazada']\n", + "question": "The corresponding actions are: step 1: CLICK: (143, 116)\nstep 2: CLICK: (535, 933)\nstep 3: CLICK: (467, 76)\nstep 4: TYPE: Lazada\nstep 5: CLICK: (897, 904)\nstep 6: CLICK: (773, 327)\nstep 7: PRESS_HOME\nstep 8: CLICK: (374, 391)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding 
actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (143, 116)\nstep 2: CLICK: (535, 933)\nstep 3: CLICK: (467, 76)\nstep 4: TYPE: Lazada\nstep 5: CLICK: (897, 904)\nstep 6: CLICK: (773, 327)\nstep 7: PRESS_HOME\nstep 8: CLICK: (374, 391)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Setting', 'Wish']\nB: ['Tripadvisor', 'Joom']\nC: ['PlantNet', 'SHEIN']\nD: ['Google Play Store', 'Lazada']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_108_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_108_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_108_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_108_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_108_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_108_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_108_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_108_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_108_8.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['TradingView: Track All Markets', 'TradingView: Track All Markets']\nB: ['Google Play Store', 'Setting']\nC: ['Picturethis', 'Tripadvisor']\nD: ['Tripadvisor', 'Google Play Store']\n", + "question": "The corresponding actions are: step 1: CLICK: (323, 498)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (153, 937)\nstep 5: CLICK: (569, 365)\nstep 6: CLICK: (552, 426)\nstep 7: CLICK: (519, 624)\nstep 8: CLICK: (967, 72)\nstep 9: TYPE: Danish\nstep 10: CLICK: (432, 176)\nstep 11: CLICK: (454, 167)\nstep 12: SCROLL: UP\nstep 13: CLICK: (865, 655)\nstep 14: PRESS_HOME\nstep 15: CLICK: (93, 655)\nstep 16: COMPLETE\nWhich 
app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (323, 498)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (153, 937)\nstep 5: CLICK: (569, 365)\nstep 6: CLICK: (552, 426)\nstep 7: CLICK: (519, 624)\nstep 8: CLICK: (967, 72)\nstep 9: TYPE: Danish\nstep 10: CLICK: (432, 176)\nstep 11: CLICK: (454, 167)\nstep 12: SCROLL: UP\nstep 13: CLICK: (865, 655)\nstep 14: PRESS_HOME\nstep 15: CLICK: (93, 655)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['TradingView: Track All Markets', 'TradingView: Track All Markets']\nB: ['Google Play Store', 'Setting']\nC: ['Picturethis', 'Tripadvisor']\nD: ['Tripadvisor', 'Google Play Store']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_13.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_109_15.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Yandex Translate', 'Spotify']\nB: ['Microsoft Translator', 'iHeart: Music, Radio, Podcasts']\nC: ['Google Translate', 'Amazon Music']\nD: ['Language Translator: Translate', 'YT Music']\n", + "question": "The corresponding actions are: step 1: CLICK: (572, 494)\nstep 2: CLICK: (72, 172)\nstep 3: CLICK: (435, 213)\nstep 4: TYPE: The Music of the Night\nstep 5: CLICK: (407, 441)\nstep 6: CLICK: (129, 592)\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (794, 655)\nstep 10: CLICK: (744, 147)\nstep 11: CLICK: (192, 306)\nstep 12: TYPE: Italian\nstep 13: CLICK: (179, 329)\nstep 14: CLICK: (126, 216)\nstep 15: TYPE: Nighttime sharpens, heightens each sensation\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (572, 494)\nstep 2: CLICK: (72, 172)\nstep 3: CLICK: (435, 213)\nstep 4: TYPE: The Music of the Night\nstep 5: CLICK: (407, 441)\nstep 6: CLICK: (129, 592)\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (794, 655)\nstep 10: CLICK: (744, 147)\nstep 11: CLICK: (192, 306)\nstep 12: TYPE: Italian\nstep 13: CLICK: (179, 329)\nstep 14: CLICK: (126, 216)\nstep 15: TYPE: Nighttime sharpens, heightens each sensation\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Yandex Translate', 'Spotify']\nB: ['Microsoft Translator', 'iHeart: Music, Radio, Podcasts']\nC: ['Google Translate', 'Amazon Music']\nD: ['Language Translator: Translate', 'YT Music']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_14.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_110_15.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Duolingo', 'To-Do List']\nB: ['Rosetta Stone: Learn, Practice', 'Microsoft to do']\nC: ['Babbel - Learn Languages', 'TickTick']\nD: ['Memrise: speak a new language', 'Things']\n", + "question": "The corresponding actions are: step 1: CLICK: (386, 406)\nstep 2: CLICK: (78, 65)\nstep 3: CLICK: (315, 154)\nstep 4: CLICK: (163, 490)\nstep 5: CLICK: (395, 928)\nstep 6: PRESS_HOME\nstep 7: CLICK: (817, 531)\nstep 8: CLICK: (929, 875)\nstep 9: TYPE: Italin Learning\nstep 10: CLICK: (920, 646)\nstep 11: CLICK: (306, 377)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (386, 406)\nstep 2: CLICK: (78, 65)\nstep 3: CLICK: (315, 154)\nstep 4: CLICK: (163, 490)\nstep 5: CLICK: (395, 928)\nstep 6: PRESS_HOME\nstep 7: CLICK: (817, 531)\nstep 8: CLICK: (929, 875)\nstep 9: TYPE: Italin Learning\nstep 10: CLICK: (920, 646)\nstep 11: CLICK: (306, 377)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Duolingo', 'To-Do List']\nB: ['Rosetta Stone: Learn, Practice', 'Microsoft to do']\nC: ['Babbel - Learn Languages', 'TickTick']\nD: ['Memrise: speak a new language', 'Things']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_111_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_111_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_111_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_111_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_111_4.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_111_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_111_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_111_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_111_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_111_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_111_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_111_11.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Uber', 'Yandex Navigator']\nB: ['Waze Navigation & Live Traffic', 'Citymapper']\nC: ['Yandex Navigator', 'Maps']\nD: ['Google Map', 'Lyft']\n", + "question": "The corresponding actions are: step 1: CLICK: (422, 271)\nstep 2: CLICK: (141, 73)\nstep 3: TYPE: bakery\nstep 4: CLICK: (169, 246)\nstep 5: PRESS_HOME\nstep 6: CLICK: (583, 413)\nstep 7: CLICK: (339, 668)\nstep 8: TYPE: 19459 Stevens Creek Blvd #100\nstep 9: CLICK: (435, 281)\nstep 10: CLICK: (474, 889)\nstep 11: CLICK: (466, 903)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (422, 271)\nstep 2: CLICK: (141, 73)\nstep 3: TYPE: bakery\nstep 4: CLICK: (169, 246)\nstep 5: PRESS_HOME\nstep 6: CLICK: (583, 413)\nstep 7: CLICK: (339, 668)\nstep 8: TYPE: 19459 Stevens Creek Blvd #100\nstep 9: CLICK: (435, 281)\nstep 10: CLICK: (474, 889)\nstep 11: CLICK: (466, 903)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Uber', 'Yandex Navigator']\nB: ['Waze Navigation & Live Traffic', 'Citymapper']\nC: ['Yandex Navigator', 'Maps']\nD: ['Google Map', 'Lyft']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_112_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_112_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_112_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_112_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_112_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_112_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_112_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_112_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_112_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_112_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_112_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_112_11.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Facebook', 'Threads']\nB: ['Gmail', 'Whatsapp']\nC: ['X', 'Facebook']\nD: ['Threads', 'Gmail']\n", + "question": "The corresponding actions are: step 1: CLICK: (384, 222)\nstep 2: CLICK: (500, 934)\nstep 3: TYPE: a\nstep 4: CLICK: (916, 592)\nstep 5: CLICK: (437, 393)\nstep 6: CLICK: (439, 915)\nstep 7: PRESS_HOME\nstep 
8: CLICK: (366, 94)\nstep 9: CLICK: (260, 107)\nstep 10: CLICK: (474, 136)\nstep 11: CLICK: (299, 224)\nstep 12: CLICK: (586, 479)\nstep 13: CLICK: (403, 937)\nstep 14: CLICK: (474, 663)\nstep 15: CLICK: (912, 605)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (384, 222)\nstep 2: CLICK: (500, 934)\nstep 3: TYPE: a\nstep 4: CLICK: (916, 592)\nstep 5: CLICK: (437, 393)\nstep 6: CLICK: (439, 915)\nstep 7: PRESS_HOME\nstep 8: CLICK: (366, 94)\nstep 9: CLICK: (260, 107)\nstep 10: CLICK: (474, 136)\nstep 11: CLICK: (299, 224)\nstep 12: CLICK: (586, 479)\nstep 13: CLICK: (403, 937)\nstep 14: CLICK: (474, 663)\nstep 15: CLICK: (912, 605)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Facebook', 'Threads']\nB: ['Gmail', 'Whatsapp']\nC: ['X', 'Facebook']\nD: ['Threads', 'Gmail']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_11.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_113_15.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['TradingView: Track All Markets', 'Tubi: Movies & Live TV', 'TradingView: Track All Markets']\nB: ['iNaturalist', 'Pluto TV - Live TV and Movies', 'Vaulty:Hide Pictures Videos']\nC: ['Setting', 'Tiktok', 'Google Play Store']\nD: ['Google Play Store', 'Netflix', 'PlantNet']\n", + "question": "The corresponding actions are: step 1: CLICK: (124, 705)\nstep 2: CLICK: (653, 379)\nstep 3: PRESS_HOME\nstep 4: CLICK: (434, 710)\nstep 5: SCROLL: UP\nstep 6: SCROLL: DOWN\nstep 7: CLICK: (152, 414)\nstep 8: CLICK: (483, 352)\nstep 9: CLICK: (424, 580)\nstep 10: CLICK: (936, 474)\nstep 11: CLICK: (394, 74)\nstep 12: CLICK: (478, 456)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (124, 705)\nstep 2: CLICK: (653, 379)\nstep 3: PRESS_HOME\nstep 4: CLICK: (434, 710)\nstep 5: SCROLL: UP\nstep 6: SCROLL: DOWN\nstep 7: CLICK: (152, 414)\nstep 8: CLICK: (483, 352)\nstep 9: CLICK: (424, 580)\nstep 10: CLICK: (936, 474)\nstep 11: CLICK: (394, 74)\nstep 12: CLICK: (478, 456)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['TradingView: Track All Markets', 'Tubi: Movies & Live TV', 'TradingView: Track All Markets']\nB: ['iNaturalist', 'Pluto TV - Live TV and Movies', 'Vaulty:Hide Pictures Videos']\nC: ['Setting', 'Tiktok', 'Google Play Store']\nD: ['Google Play Store', 'Netflix', 'PlantNet']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_114_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_114_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_114_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_114_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_114_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_114_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_114_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_114_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_114_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_114_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_114_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_114_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_114_12.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['wikiHow', 'Applock Pro - APP Lock & Guard']\nB: ['Opera', 'PlantNet']\nC: ['Wikipedia', 'Plantin']\nD: 
['Firefox', 'TradingView: Track All Markets']\n", + "question": "The corresponding actions are: step 1: CLICK: (152, 237)\nstep 2: CLICK: (344, 56)\nstep 3: TYPE: Alibaba's stock market news\nstep 4: CLICK: (943, 908)\nstep 5: CLICK: (316, 321)\nstep 6: PRESS_HOME\nstep 7: CLICK: (127, 500)\nstep 8: CLICK: (921, 56)\nstep 9: TYPE: Alibaba\nstep 10: CLICK: (421, 157)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (152, 237)\nstep 2: CLICK: (344, 56)\nstep 3: TYPE: Alibaba's stock market news\nstep 4: CLICK: (943, 908)\nstep 5: CLICK: (316, 321)\nstep 6: PRESS_HOME\nstep 7: CLICK: (127, 500)\nstep 8: CLICK: (921, 56)\nstep 9: TYPE: Alibaba\nstep 10: CLICK: (421, 157)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['wikiHow', 'Applock Pro - APP Lock & Guard']\nB: ['Opera', 'PlantNet']\nC: ['Wikipedia', 'Plantin']\nD: ['Firefox', 'TradingView: Track All Markets']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_115_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_115_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_115_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_115_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_115_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_115_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_115_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_115_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_115_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_115_9.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_115_10.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Wikipedia', 'X']\nB: ['Firefox', 'Whatsapp']\nC: ['Chrome', 'Messenger']\nD: ['DuckDuckgo', 'Instagram']\n", + "question": "The corresponding actions are: step 1: CLICK: (380, 670)\nstep 2: TYPE: sports game events\nstep 3: CLICK: (931, 927)\nstep 4: CLICK: (333, 454)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (581, 147)\nstep 8: CLICK: (922, 67)\nstep 9: CLICK: (302, 143)\nstep 10: CLICK: (141, 193)\nstep 11: CLICK: (161, 955)\nstep 12: TYPE: In the MLB,Twins:White Sox is 10:5\nstep 13: CLICK: (904, 635)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (380, 670)\nstep 2: TYPE: sports game events\nstep 3: CLICK: (931, 927)\nstep 4: CLICK: (333, 454)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (581, 147)\nstep 8: CLICK: (922, 67)\nstep 9: CLICK: (302, 143)\nstep 10: CLICK: (141, 193)\nstep 11: CLICK: (161, 955)\nstep 12: TYPE: In the MLB,Twins:White Sox is 10:5\nstep 13: CLICK: (904, 635)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Wikipedia', 'X']\nB: ['Firefox', 'Whatsapp']\nC: ['Chrome', 'Messenger']\nD: ['DuckDuckgo', 'Instagram']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_116_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_116_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_116_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_116_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_116_4.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_116_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_116_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_116_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_116_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_116_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_116_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_116_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_116_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_116_13.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Any.do', 'Tubi: Movies & Live TV']\nB: ['Todoist', 'Netflix']\nC: ['TickTick', 'Triller']\nD: ['Things', 'Tiktok']\n", + "question": "The corresponding actions are: step 1: CLICK: (151, 364)\nstep 2: CLICK: (853, 922)\nstep 3: CLICK: (340, 480)\nstep 4: TYPE: do yoga in the morning\nstep 5: CLICK: (493, 367)\nstep 6: PRESS_HOME\nstep 7: CLICK: (809, 807)\nstep 8: CLICK: (238, 332)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (151, 364)\nstep 2: CLICK: (853, 922)\nstep 3: CLICK: (340, 480)\nstep 4: TYPE: do yoga in the morning\nstep 5: CLICK: (493, 367)\nstep 6: PRESS_HOME\nstep 7: CLICK: (809, 807)\nstep 8: CLICK: (238, 332)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Any.do', 'Tubi: Movies & Live TV']\nB: ['Todoist', 'Netflix']\nC: ['TickTick', 'Triller']\nD: ['Things', 'Tiktok']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_117_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_117_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_117_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_117_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_117_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_117_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_117_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_117_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_117_8.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Adobe Express: AI Video Design', 'Tumblr']\nB: ['Gallery-photo gallery,album', 'Whatsapp']\nC: ['Lightroom Photo & Video Editor', 'Messenger']\nD: ['Textify- Art Font Photo Editor', 'X']\n", + "question": "The corresponding actions are: step 1: CLICK: (808, 503)\nstep 2: CLICK: (497, 918)\nstep 3: CLICK: (657, 572)\nstep 4: CLICK: (105, 894)\nstep 5: CLICK: (33, 464)\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (955, 22)\nstep 8: CLICK: (879, 53)\nstep 9: CLICK: (336, 654)\nstep 10: CLICK: (494, 704)\nstep 11: CLICK: (951, 911)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given 
screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (808, 503)\nstep 2: CLICK: (497, 918)\nstep 3: CLICK: (657, 572)\nstep 4: CLICK: (105, 894)\nstep 5: CLICK: (33, 464)\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (955, 22)\nstep 8: CLICK: (879, 53)\nstep 9: CLICK: (336, 654)\nstep 10: CLICK: (494, 704)\nstep 11: CLICK: (951, 911)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Adobe Express: AI Video Design', 'Tumblr']\nB: ['Gallery-photo gallery,album', 'Whatsapp']\nC: ['Lightroom Photo & Video Editor', 'Messenger']\nD: ['Textify- Art Font Photo Editor', 'X']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_118_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_118_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_118_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_118_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_118_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_118_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_118_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_118_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_118_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_118_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_118_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_118_11.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Chatbot AI & Smart Assistant', 'Chrome']\nB: ['WOMBO Dream-AI Art Generator', 'Firefox']\nC: ['ChatGPT', 'Edge']\nD: ['GenZArt:Fast AI Art Generator', 'Opera']\n", + "question": "The corresponding actions 
are: step 1: CLICK: (429, 423)\nstep 2: CLICK: (360, 873)\nstep 3: TYPE: tell me about Binomial theorem\nstep 4: CLICK: (696, 429)\nstep 5: PRESS_HOME\nstep 6: CLICK: (511, 907)\nstep 7: CLICK: (282, 363)\nstep 8: TYPE: Binomial theorem\nstep 9: CLICK: (887, 681)\nstep 10: SCROLL: UP\nstep 11: SCROLL: UP\nstep 12: CLICK: (284, 839)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (429, 423)\nstep 2: CLICK: (360, 873)\nstep 3: TYPE: tell me about Binomial theorem\nstep 4: CLICK: (696, 429)\nstep 5: PRESS_HOME\nstep 6: CLICK: (511, 907)\nstep 7: CLICK: (282, 363)\nstep 8: TYPE: Binomial theorem\nstep 9: CLICK: (887, 681)\nstep 10: SCROLL: UP\nstep 11: SCROLL: UP\nstep 12: CLICK: (284, 839)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Chatbot AI & Smart Assistant', 'Chrome']\nB: ['WOMBO Dream-AI Art Generator', 'Firefox']\nC: ['ChatGPT', 'Edge']\nD: ['GenZArt:Fast AI Art Generator', 'Opera']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_119_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_119_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_119_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_119_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_119_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_119_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_119_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_119_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_119_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_119_9.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_119_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_119_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_119_12.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Pluto TV - Live TV and Movies', 'Simplenote']\nB: ['Youtube', 'WPS office']\nC: ['Triller', 'BasicNote - Notes, Notepad']\nD: ['Tiktok', 'Notepad - Notes and To Do List']\n", + "question": "The corresponding actions are: step 1: CLICK: (592, 892)\nstep 2: CLICK: (703, 256)\nstep 3: CLICK: (101, 266)\nstep 4: PRESS_HOME\nstep 5: CLICK: (710, 397)\nstep 6: CLICK: (31, 436)\nstep 7: CLICK: (240, 498)\nstep 8: CLICK: (648, 562)\nstep 9: CLICK: (971, 472)\nstep 10: CLICK: (21, 63)\nstep 11: CLICK: (108, 396)\nstep 12: CLICK: (511, 506)\nstep 13: TYPE: Smart Cities: Technology and Urban Planning\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (592, 892)\nstep 2: CLICK: (703, 256)\nstep 3: CLICK: (101, 266)\nstep 4: PRESS_HOME\nstep 5: CLICK: (710, 397)\nstep 6: CLICK: (31, 436)\nstep 7: CLICK: (240, 498)\nstep 8: CLICK: (648, 562)\nstep 9: CLICK: (971, 472)\nstep 10: CLICK: (21, 63)\nstep 11: CLICK: (108, 396)\nstep 12: CLICK: (511, 506)\nstep 13: TYPE: Smart Cities: Technology and Urban Planning\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Pluto TV - Live TV and Movies', 'Simplenote']\nB: ['Youtube', 'WPS office']\nC: ['Triller', 'BasicNote - Notes, Notepad']\nD: ['Tiktok', 'Notepad - Notes and To Do List']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_120_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_120_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_120_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_120_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_120_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_120_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_120_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_120_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_120_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_120_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_120_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_120_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_120_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_120_13.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Bing: chat with AI & GPT4', 'Triller', 'PlantNet']\nB: ['DuckDuckGo', 
'Tiktok', 'Google Play Store']\nC: ['Opera', 'Youtube', 'Setting']\nD: ['Firefox', 'Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos']\n", + "question": "The corresponding actions are: step 1: CLICK: (819, 144)\nstep 2: CLICK: (683, 73)\nstep 3: TYPE: DIY crafts blogs on youtube\nstep 4: CLICK: (924, 907)\nstep 5: CLICK: (201, 347)\nstep 6: CLICK: (851, 127)\nstep 7: PRESS_HOME\nstep 8: CLICK: (384, 839)\nstep 9: CLICK: (333, 328)\nstep 10: CLICK: (473, 84)\nstep 11: SCROLL: RIGHT\nstep 12: CLICK: (940, 324)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (819, 144)\nstep 2: CLICK: (683, 73)\nstep 3: TYPE: DIY crafts blogs on youtube\nstep 4: CLICK: (924, 907)\nstep 5: CLICK: (201, 347)\nstep 6: CLICK: (851, 127)\nstep 7: PRESS_HOME\nstep 8: CLICK: (384, 839)\nstep 9: CLICK: (333, 328)\nstep 10: CLICK: (473, 84)\nstep 11: SCROLL: RIGHT\nstep 12: CLICK: (940, 324)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Bing: chat with AI & GPT4', 'Triller', 'PlantNet']\nB: ['DuckDuckGo', 'Tiktok', 'Google Play Store']\nC: ['Opera', 'Youtube', 'Setting']\nD: ['Firefox', 'Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_121_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_121_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_121_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_121_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_121_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_121_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_121_6.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_121_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_121_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_121_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_121_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_121_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_121_12.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Pluto TV - Live TV and Movies', 'Plantin', 'iNaturalist']\nB: ['Tubi: Movies & Live TV', 'Contacts', 'Applock Pro - APP Lock & Guard']\nC: ['Tiktok', 'Applock Pro - APP Lock & Guard', 'Vaulty:Hide Pictures Videos']\nD: ['Triller', 'Google Play Store', 'Setting']\n", + "question": "The corresponding actions are: step 1: CLICK: (851, 839)\nstep 2: CLICK: (848, 330)\nstep 3: PRESS_HOME\nstep 4: CLICK: (375, 836)\nstep 5: CLICK: (875, 404)\nstep 6: CLICK: (52, 78)\nstep 7: CLICK: (199, 387)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (851, 839)\nstep 2: CLICK: (848, 330)\nstep 3: PRESS_HOME\nstep 4: CLICK: (375, 836)\nstep 5: CLICK: (875, 404)\nstep 6: CLICK: (52, 78)\nstep 7: CLICK: (199, 387)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Pluto TV - Live TV and Movies', 'Plantin', 'iNaturalist']\nB: ['Tubi: Movies & Live TV', 'Contacts', 'Applock Pro - APP Lock & Guard']\nC: ['Tiktok', 'Applock Pro - APP Lock & Guard', 'Vaulty:Hide Pictures Videos']\nD: ['Triller', 'Google Play Store', 'Setting']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_122_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_122_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_122_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_122_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_122_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_122_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_122_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_122_7.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Docs', 'wikiHow']\nB: ['Microsoft Word', 'Bing: chat with AI & GPT4']\nC: ['Dropbox Paper', 'Opera']\nD: ['WPS office', 'Firefox']\n", + "question": "The corresponding actions are: step 1: CLICK: (705, 217)\nstep 2: CLICK: (286, 251)\nstep 3: CLICK: (916, 154)\nstep 4: TYPE: 2022 nobel prize winners in physics\nstep 5: CLICK: (897, 859)\nstep 6: LONG_PRESS: (99, 549)\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (144, 489)\nstep 9: PRESS_HOME\nstep 10: CLICK: (844, 499)\nstep 11: CLICK: (236, 357)\nstep 12: CLICK: (236, 357)\nstep 13: TYPE: 2022 nobel prize winners in physics:\nstep 14: CLICK: (561, 559)\nstep 
15: CLICK: (99, 78)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (705, 217)\nstep 2: CLICK: (286, 251)\nstep 3: CLICK: (916, 154)\nstep 4: TYPE: 2022 nobel prize winners in physics\nstep 5: CLICK: (897, 859)\nstep 6: LONG_PRESS: (99, 549)\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (144, 489)\nstep 9: PRESS_HOME\nstep 10: CLICK: (844, 499)\nstep 11: CLICK: (236, 357)\nstep 12: CLICK: (236, 357)\nstep 13: TYPE: 2022 nobel prize winners in physics:\nstep 14: CLICK: (561, 559)\nstep 15: CLICK: (99, 78)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Docs', 'wikiHow']\nB: ['Microsoft Word', 'Bing: chat with AI & GPT4']\nC: ['Dropbox Paper', 'Opera']\nD: ['WPS office', 'Firefox']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_12.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_123_15.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Youtube', 'Google Play Store']\nB: ['Triller', 'Tripadvisor']\nC: ['Netflix', 'TradingView: Track All Markets']\nD: ['Tiktok', 'iNaturalist']\n", + "question": "The corresponding actions are: step 1: CLICK: (127, 745)\nstep 2: PRESS_HOME\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (127, 604)\nstep 5: CLICK: (788, 74)\nstep 6: TYPE: Apple Fitness Plus\nstep 7: CLICK: (924, 864)\nstep 8: CLICK: (652, 693)\nstep 9: CLICK: (605, 496)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (127, 745)\nstep 2: PRESS_HOME\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (127, 604)\nstep 5: CLICK: (788, 74)\nstep 6: TYPE: Apple Fitness Plus\nstep 7: CLICK: (924, 864)\nstep 8: CLICK: (652, 693)\nstep 9: CLICK: (605, 496)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Youtube', 'Google Play Store']\nB: ['Triller', 'Tripadvisor']\nC: ['Netflix', 'TradingView: Track All Markets']\nD: ['Tiktok', 'iNaturalist']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_124_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_124_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_124_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_124_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_124_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_124_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_124_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_124_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_124_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_124_9.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Amazon Music', 'To-Do List']\nB: ['Spotify', 'TickTick']\nC: ['iHeart: Music, Radio, Podcasts', 'Things']\nD: ['Pandora', 'Microsoft to do']\n", + "question": "The corresponding actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (134, 107)\nstep 3: CLICK: (431, 464)\nstep 4: CLICK: (694, 858)\nstep 5: PRESS_HOME\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (590, 387)\nstep 8: CLICK: (873, 862)\nstep 9: CLICK: (670, 690)\nstep 10: TYPE: do yoga with this\nstep 11: CLICK: (900, 632)\nstep 12: COMPLETE\nWhich app-combination list was 
used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (134, 107)\nstep 3: CLICK: (431, 464)\nstep 4: CLICK: (694, 858)\nstep 5: PRESS_HOME\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (590, 387)\nstep 8: CLICK: (873, 862)\nstep 9: CLICK: (670, 690)\nstep 10: TYPE: do yoga with this\nstep 11: CLICK: (900, 632)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Amazon Music', 'To-Do List']\nB: ['Spotify', 'TickTick']\nC: ['iHeart: Music, Radio, Podcasts', 'Things']\nD: ['Pandora', 'Microsoft to do']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_125_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_125_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_125_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_125_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_125_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_125_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_125_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_125_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_125_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_125_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_125_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_125_11.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Messenger', 'Instagram']\nB: ['Whatsapp', 'Whatsapp']\nC: ['Tumblr', 'Facebook']\nD: ['Instagram', 'Tumblr']\n", + "question": "The corresponding actions are: step 1: CLICK: (565, 
133)\nstep 2: CLICK: (718, 78)\nstep 3: TYPE: political debate\nstep 4: CLICK: (841, 878)\nstep 5: CLICK: (497, 139)\nstep 6: CLICK: (553, 447)\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: PRESS_HOME\nstep 10: CLICK: (666, 310)\nstep 11: CLICK: (566, 907)\nstep 12: CLICK: (404, 260)\nstep 13: CLICK: (464, 839)\nstep 14: TYPE: BBC's political debate will come at 6:30PM today\nstep 15: CLICK: (763, 488)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (565, 133)\nstep 2: CLICK: (718, 78)\nstep 3: TYPE: political debate\nstep 4: CLICK: (841, 878)\nstep 5: CLICK: (497, 139)\nstep 6: CLICK: (553, 447)\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: PRESS_HOME\nstep 10: CLICK: (666, 310)\nstep 11: CLICK: (566, 907)\nstep 12: CLICK: (404, 260)\nstep 13: CLICK: (464, 839)\nstep 14: TYPE: BBC's political debate will come at 6:30PM today\nstep 15: CLICK: (763, 488)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Messenger', 'Instagram']\nB: ['Whatsapp', 'Whatsapp']\nC: ['Tumblr', 'Facebook']\nD: ['Instagram', 'Tumblr']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_7.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_126_15.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Facebook', 'Zoho Meeting']\nB: ['X', 'Google Meet']\nC: ['Instagram', 'ZOOM Cloud Meetings']\nD: ['Threads', 'Microsoft Teams']\n", + "question": "The corresponding actions are: step 1: CLICK: (694, 653)\nstep 2: CLICK: (376, 869)\nstep 3: CLICK: (198, 250)\nstep 4: CLICK: (820, 724)\nstep 5: PRESS_HOME\nstep 6: CLICK: (572, 315)\nstep 7: CLICK: (894, 907)\nstep 8: CLICK: (360, 196)\nstep 9: CLICK: (426, 908)\nstep 10: TYPE: meet.google.com/adk-mceh-cwe\nstep 11: CLICK: (940, 485)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (694, 653)\nstep 2: CLICK: (376, 869)\nstep 3: CLICK: (198, 250)\nstep 4: CLICK: (820, 724)\nstep 5: PRESS_HOME\nstep 6: CLICK: (572, 315)\nstep 7: CLICK: (894, 907)\nstep 8: CLICK: (360, 196)\nstep 9: CLICK: (426, 908)\nstep 10: TYPE: meet.google.com/adk-mceh-cwe\nstep 11: CLICK: (940, 485)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Facebook', 'Zoho Meeting']\nB: ['X', 'Google Meet']\nC: ['Instagram', 'ZOOM Cloud Meetings']\nD: ['Threads', 'Microsoft Teams']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_127_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_127_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_127_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_127_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_127_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_127_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_127_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_127_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_127_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_127_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_127_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_127_11.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Instagram', 'YT Music']\nB: ['Messenger', 'Spotify']\nC: ['Threads', 'Pandora']\nD: ['Facebook', 'Amazon Music']\n", + "question": "The corresponding actions are: step 1: CLICK: (630, 501)\nstep 2: CLICK: (381, 909)\nstep 3: CLICK: (469, 317)\nstep 4: TYPE: punk\nstep 5: CLICK: (941, 872)\nstep 6: CLICK: (430, 254)\nstep 7: 
CLICK: (690, 492)\nstep 8: CLICK: (909, 617)\nstep 9: PRESS_HOME\nstep 10: CLICK: (849, 150)\nstep 11: CLICK: (568, 624)\nstep 12: CLICK: (622, 892)\nstep 13: TYPE: Punk Goes 80's\nstep 14: CLICK: (926, 492)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (630, 501)\nstep 2: CLICK: (381, 909)\nstep 3: CLICK: (469, 317)\nstep 4: TYPE: punk\nstep 5: CLICK: (941, 872)\nstep 6: CLICK: (430, 254)\nstep 7: CLICK: (690, 492)\nstep 8: CLICK: (909, 617)\nstep 9: PRESS_HOME\nstep 10: CLICK: (849, 150)\nstep 11: CLICK: (568, 624)\nstep 12: CLICK: (622, 892)\nstep 13: TYPE: Punk Goes 80's\nstep 14: CLICK: (926, 492)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Instagram', 'YT Music']\nB: ['Messenger', 'Spotify']\nC: ['Threads', 'Pandora']\nD: ['Facebook', 'Amazon Music']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_128_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_128_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_128_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_128_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_128_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_128_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_128_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_128_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_128_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_128_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_128_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_128_11.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_128_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_128_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_128_14.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['iNaturalist', 'Cash App']\nB: ['Plantin', 'Google Wallet']\nC: ['Google Play Store', 'Venmo']\nD: ['Applock Pro - APP Lock & Guard', 'PayPal - Send, Shop, Manage']\n", + "question": "The corresponding actions are: step 1: CLICK: (787, 324)\nstep 2: CLICK: (400, 683)\nstep 3: CLICK: (505, 695)\nstep 4: CLICK: (605, 697)\nstep 5: CLICK: (387, 882)\nstep 6: CLICK: (484, 901)\nstep 7: CLICK: (601, 898)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (727, 878)\nstep 11: PRESS_HOME\nstep 12: CLICK: (683, 498)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (787, 324)\nstep 2: CLICK: (400, 683)\nstep 3: CLICK: (505, 695)\nstep 4: CLICK: (605, 697)\nstep 5: CLICK: (387, 882)\nstep 6: CLICK: (484, 901)\nstep 7: CLICK: (601, 898)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (727, 878)\nstep 11: PRESS_HOME\nstep 12: CLICK: (683, 498)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['iNaturalist', 'Cash App']\nB: ['Plantin', 'Google Wallet']\nC: ['Google Play Store', 'Venmo']\nD: ['Applock Pro - APP Lock & Guard', 'PayPal - Send, Shop, Manage']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_129_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_129_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_129_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_129_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_129_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_129_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_129_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_129_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_129_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_129_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_129_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_129_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_129_12.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['To-Do List', 'Pandora']\nB: ['Todoist', 'Amazon Music']\nC: ['Things', 'YT Music']\nD: ['TickTick', 'Spotify']\n", + "question": "The corresponding actions are: step 1: CLICK: (610, 243)\nstep 2: CLICK: (822, 504)\nstep 
3: CLICK: (85, 854)\nstep 4: PRESS_HOME\nstep 5: CLICK: (136, 381)\nstep 6: CLICK: (904, 927)\nstep 7: CLICK: (289, 473)\nstep 8: CLICK: (525, 647)\nstep 9: TYPE: do yoga with this\nstep 10: CLICK: (472, 431)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (610, 243)\nstep 2: CLICK: (822, 504)\nstep 3: CLICK: (85, 854)\nstep 4: PRESS_HOME\nstep 5: CLICK: (136, 381)\nstep 6: CLICK: (904, 927)\nstep 7: CLICK: (289, 473)\nstep 8: CLICK: (525, 647)\nstep 9: TYPE: do yoga with this\nstep 10: CLICK: (472, 431)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['To-Do List', 'Pandora']\nB: ['Todoist', 'Amazon Music']\nC: ['Things', 'YT Music']\nD: ['TickTick', 'Spotify']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_130_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_130_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_130_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_130_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_130_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_130_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_130_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_130_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_130_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_130_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_130_10.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Setting', 'Shorts VotTak: Short Video App', 
'Google Play Store']\nB: ['PlantNet', 'Youtube', 'Picturethis']\nC: ['Picturethis', 'Likee', 'Applock Pro - APP Lock & Guard']\nD: ['iNaturalist', 'Triller', 'Setting']\n", + "question": "The corresponding actions are: step 1: CLICK: (140, 706)\nstep 2: CLICK: (23, 66)\nstep 3: PRESS_HOME\nstep 4: CLICK: (436, 708)\nstep 5: PRESS_HOME\nstep 6: CLICK: (142, 703)\nstep 7: CLICK: (148, 63)\nstep 8: CLICK: (977, 61)\nstep 9: TYPE: vottak\nstep 10: CLICK: (258, 128)\nstep 11: CLICK: (692, 337)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (140, 706)\nstep 2: CLICK: (23, 66)\nstep 3: PRESS_HOME\nstep 4: CLICK: (436, 708)\nstep 5: PRESS_HOME\nstep 6: CLICK: (142, 703)\nstep 7: CLICK: (148, 63)\nstep 8: CLICK: (977, 61)\nstep 9: TYPE: vottak\nstep 10: CLICK: (258, 128)\nstep 11: CLICK: (692, 337)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Setting', 'Shorts VotTak: Short Video App', 'Google Play Store']\nB: ['PlantNet', 'Youtube', 'Picturethis']\nC: ['Picturethis', 'Likee', 'Applock Pro - APP Lock & Guard']\nD: ['iNaturalist', 'Triller', 'Setting']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_131_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_131_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_131_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_131_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_131_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_131_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_131_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_131_7.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_131_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_131_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_131_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_131_11.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Gmail', 'Chatty - AI Assistant']\nB: ['Facebook', 'GenZArt:Fast AI Art Generator']\nC: ['Threads', 'Chatbot AI & Smart Assistant']\nD: ['Instagram', 'ChatGPT']\n", + "question": "The corresponding actions are: step 1: CLICK: (161, 515)\nstep 2: CLICK: (516, 899)\nstep 3: CLICK: (393, 444)\nstep 4: TYPE: Sunflower\nstep 5: CLICK: (930, 882)\nstep 6: CLICK: (702, 787)\nstep 7: CLICK: (855, 734)\nstep 8: CLICK: (308, 829)\nstep 9: CLICK: (862, 146)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (161, 515)\nstep 2: CLICK: (516, 899)\nstep 3: CLICK: (393, 444)\nstep 4: TYPE: Sunflower\nstep 5: CLICK: (930, 882)\nstep 6: CLICK: (702, 787)\nstep 7: CLICK: (855, 734)\nstep 8: CLICK: (308, 829)\nstep 9: CLICK: (862, 146)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Gmail', 'Chatty - AI Assistant']\nB: ['Facebook', 'GenZArt:Fast AI Art Generator']\nC: ['Threads', 'Chatbot AI & Smart Assistant']\nD: ['Instagram', 'ChatGPT']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_132_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_132_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_132_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_132_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_132_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_132_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_132_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_132_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_132_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_132_9.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Target', 'Tumblr']\nB: ['Net-a-Porte', 'X']\nC: ['Amazon', 'Instagram']\nD: ['DealMoon', 'Whatsapp']\n", + "question": "The corresponding actions are: step 1: CLICK: (233, 576)\nstep 2: CLICK: (360, 960)\nstep 3: CLICK: (71, 705)\nstep 4: TYPE: instant camera recommendation\nstep 5: CLICK: (745, 78)\nstep 6: CLICK: (741, 765)\nstep 7: PRESS_BACK\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: IMPOSSIBLE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are 
given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (233, 576)\nstep 2: CLICK: (360, 960)\nstep 3: CLICK: (71, 705)\nstep 4: TYPE: instant camera recommendation\nstep 5: CLICK: (745, 78)\nstep 6: CLICK: (741, 765)\nstep 7: PRESS_BACK\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: IMPOSSIBLE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Target', 'Tumblr']\nB: ['Net-a-Porte', 'X']\nC: ['Amazon', 'Instagram']\nD: ['DealMoon', 'Whatsapp']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_133_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_133_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_133_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_133_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_133_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_133_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_133_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_133_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_133_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_133_9.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Triller', 'Plantin']\nB: ['Likee', 'iNaturalist']\nC: ['Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos']\nD: ['Youtube', 'Google Play Store']\n", + "question": "The corresponding actions are: step 1: CLICK: (624, 809)\nstep 2: PRESS_HOME\nstep 3: CLICK: (838, 809)\nstep 4: CLICK: (474, 65)\nstep 5: TYPE: Aaptiv\nstep 6: CLICK: (921, 912)\nstep 7: CLICK: (819, 323)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now 
you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (624, 809)\nstep 2: PRESS_HOME\nstep 3: CLICK: (838, 809)\nstep 4: CLICK: (474, 65)\nstep 5: TYPE: Aaptiv\nstep 6: CLICK: (921, 912)\nstep 7: CLICK: (819, 323)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Triller', 'Plantin']\nB: ['Likee', 'iNaturalist']\nC: ['Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos']\nD: ['Youtube', 'Google Play Store']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_134_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_134_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_134_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_134_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_134_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_134_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_134_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_134_7.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Firefox', 'Tripadvisor']\nB: ['Chrome', 'TradingView: Track All Markets']\nC: ['Bing: chat with AI & GPT4', 'iNaturalist']\nD: ['Edge', 'Applock Pro - APP Lock & Guard']\n", + "question": "The corresponding actions are: step 1: CLICK: (440, 914)\nstep 2: CLICK: (274, 415)\nstep 3: TYPE: Facebook's stock market news\nstep 4: CLICK: (884, 878)\nstep 5: SCROLL: UP\nstep 6: CLICK: (246, 786)\nstep 7: PRESS_HOME\nstep 8: CLICK: (913, 494)\nstep 9: CLICK: (379, 74)\nstep 10: TYPE: Facebook\nstep 11: CLICK: (153, 223)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given 
screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (440, 914)\nstep 2: CLICK: (274, 415)\nstep 3: TYPE: Facebook's stock market news\nstep 4: CLICK: (884, 878)\nstep 5: SCROLL: UP\nstep 6: CLICK: (246, 786)\nstep 7: PRESS_HOME\nstep 8: CLICK: (913, 494)\nstep 9: CLICK: (379, 74)\nstep 10: TYPE: Facebook\nstep 11: CLICK: (153, 223)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Firefox', 'Tripadvisor']\nB: ['Chrome', 'TradingView: Track All Markets']\nC: ['Bing: chat with AI & GPT4', 'iNaturalist']\nD: ['Edge', 'Applock Pro - APP Lock & Guard']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_135_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_135_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_135_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_135_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_135_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_135_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_135_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_135_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_135_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_135_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_135_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_135_11.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Bing: chat with AI & GPT4', 'Threads']\nB: ['Chrome', 'Gmail']\nC: ['Wikipedia', 'Facebook']\nD: ['Opera', 'Instagram']\n", + "question": "The corresponding actions are: step 1: CLICK: (210, 645)\nstep 2: CLICK: 
(298, 411)\nstep 3: TYPE: food festival events\nstep 4: CLICK: (856, 897)\nstep 5: PRESS_HOME\nstep 6: CLICK: (839, 338)\nstep 7: CLICK: (31, 189)\nstep 8: CLICK: (43, 164)\nstep 9: TYPE: caba62244@gmail.com\nstep 10: CLICK: (448, 355)\nstep 11: CLICK: (352, 420)\nstep 12: TYPE: Assyrian Food Festival will come on AUG 17\nstep 13: CLICK: (904, 76)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (210, 645)\nstep 2: CLICK: (298, 411)\nstep 3: TYPE: food festival events\nstep 4: CLICK: (856, 897)\nstep 5: PRESS_HOME\nstep 6: CLICK: (839, 338)\nstep 7: CLICK: (31, 189)\nstep 8: CLICK: (43, 164)\nstep 9: TYPE: caba62244@gmail.com\nstep 10: CLICK: (448, 355)\nstep 11: CLICK: (352, 420)\nstep 12: TYPE: Assyrian Food Festival will come on AUG 17\nstep 13: CLICK: (904, 76)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Bing: chat with AI & GPT4', 'Threads']\nB: ['Chrome', 'Gmail']\nC: ['Wikipedia', 'Facebook']\nD: ['Opera', 'Instagram']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_136_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_136_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_136_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_136_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_136_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_136_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_136_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_136_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_136_8.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_136_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_136_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_136_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_136_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_136_13.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['X', 'Photos']\nB: ['Gmail', 'Google Photos']\nC: ['Tumblr', 'ABPV']\nD: ['Instagram', 'Mapillary']\n", + "question": "The corresponding actions are: step 1: CLICK: (121, 519)\nstep 2: CLICK: (364, 437)\nstep 3: CLICK: (155, 936)\nstep 4: CLICK: (527, 778)\nstep 5: TYPE: caba62244@gmail.com\nstep 6: CLICK: (378, 259)\nstep 7: CLICK: (826, 63)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (121, 519)\nstep 2: CLICK: (364, 437)\nstep 3: CLICK: (155, 936)\nstep 4: CLICK: (527, 778)\nstep 5: TYPE: caba62244@gmail.com\nstep 6: CLICK: (378, 259)\nstep 7: CLICK: (826, 63)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['X', 'Photos']\nB: ['Gmail', 'Google Photos']\nC: ['Tumblr', 'ABPV']\nD: ['Instagram', 'Mapillary']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_137_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_137_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_137_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_137_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_137_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_137_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_137_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_137_7.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['X', 'Calendar']\nB: ['Instagram', 'All-In-One Calculator']\nC: ['Whatsapp', 'Simple Calendar']\nD: ['Messenger', 'Simple Calculator']\n", + "question": "The corresponding actions are: step 1: CLICK: (568, 506)\nstep 2: CLICK: (256, 79)\nstep 3: TYPE: what is the latest Terminator movie\nstep 4: CLICK: (865, 889)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (561, 935)\nstep 8: CLICK: (774, 806)\nstep 9: CLICK: (187, 806)\nstep 10: TYPE: watch the movie Terminator:DarkFate\nstep 11: CLICK: (418, 196)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (568, 506)\nstep 2: CLICK: (256, 79)\nstep 3: TYPE: what is the latest Terminator movie\nstep 4: CLICK: (865, 889)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (561, 935)\nstep 8: CLICK: (774, 806)\nstep 9: CLICK: (187, 806)\nstep 10: TYPE: watch the movie Terminator:DarkFate\nstep 11: CLICK: (418, 196)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['X', 'Calendar']\nB: ['Instagram', 'All-In-One Calculator']\nC: ['Whatsapp', 'Simple Calendar']\nD: ['Messenger', 'Simple Calculator']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_138_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_138_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_138_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_138_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_138_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_138_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_138_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_138_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_138_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_138_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_138_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_138_11.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Pluto TV - Live TV and Movies', 'Simple Calculator']\nB: ['Youtube', 'Clock']\nC: ['Triller', 'ClevCalc - Calculator']\nD: ['Tubi: Movies & Live TV', 'Calculator']\n", + "question": "The corresponding actions are: step 1: CLICK: (627, 834)\nstep 2: TYPE: Language pronunciation\nstep 3: CLICK: (960, 
907)\nstep 4: CLICK: (371, 658)\nstep 5: PRESS_HOME\nstep 6: CLICK: (384, 258)\nstep 7: CLICK: (891, 151)\nstep 8: TYPE: 500\nstep 9: CLICK: (507, 804)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (627, 834)\nstep 2: TYPE: Language pronunciation\nstep 3: CLICK: (960, 907)\nstep 4: CLICK: (371, 658)\nstep 5: PRESS_HOME\nstep 6: CLICK: (384, 258)\nstep 7: CLICK: (891, 151)\nstep 8: TYPE: 500\nstep 9: CLICK: (507, 804)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Pluto TV - Live TV and Movies', 'Simple Calculator']\nB: ['Youtube', 'Clock']\nC: ['Triller', 'ClevCalc - Calculator']\nD: ['Tubi: Movies & Live TV', 'Calculator']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_139_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_139_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_139_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_139_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_139_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_139_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_139_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_139_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_139_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_139_9.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Tumblr', 'Pandora']\nB: ['Whatsapp', 'YT Music']\nC: ['Instagram', 'iHeart: Music, Radio, Podcasts']\nD: ['Facebook', 'Spotify']\n", + "question": "The 
corresponding actions are: step 1: CLICK: (541, 479)\nstep 2: CLICK: (586, 904)\nstep 3: CLICK: (205, 92)\nstep 4: TYPE: Electronic\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (900, 164)\nstep 7: CLICK: (917, 398)\nstep 8: PRESS_HOME\nstep 9: CLICK: (686, 328)\nstep 10: CLICK: (584, 915)\nstep 11: CLICK: (445, 298)\nstep 12: CLICK: (444, 846)\nstep 13: TYPE: Electronic\nstep 14: CLICK: (757, 507)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (541, 479)\nstep 2: CLICK: (586, 904)\nstep 3: CLICK: (205, 92)\nstep 4: TYPE: Electronic\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (900, 164)\nstep 7: CLICK: (917, 398)\nstep 8: PRESS_HOME\nstep 9: CLICK: (686, 328)\nstep 10: CLICK: (584, 915)\nstep 11: CLICK: (445, 298)\nstep 12: CLICK: (444, 846)\nstep 13: TYPE: Electronic\nstep 14: CLICK: (757, 507)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tumblr', 'Pandora']\nB: ['Whatsapp', 'YT Music']\nC: ['Instagram', 'iHeart: Music, Radio, Podcasts']\nD: ['Facebook', 'Spotify']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_140_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_140_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_140_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_140_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_140_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_140_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_140_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_140_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_140_8.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_140_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_140_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_140_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_140_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_140_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_140_14.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Yahoo Sports', 'Gmail']\nB: ['NewsBreak', 'Instagram']\nC: ['BBC Sports', 'Tumblr']\nD: ['Microsoft News', 'X']\n", + "question": "The corresponding actions are: step 1: CLICK: (138, 547)\nstep 2: CLICK: (968, 67)\nstep 3: TYPE: football match\nstep 4: CLICK: (972, 67)\nstep 5: TYPE: football\nstep 6: CLICK: (129, 218)\nstep 7: PRESS_HOME\nstep 8: CLICK: (732, 94)\nstep 9: CLICK: (30, 144)\nstep 10: TYPE: caba62244@gmail.com\nstep 11: CLICK: (262, 303)\nstep 12: CLICK: (238, 364)\nstep 13: TYPE: Washington:Minchigan is 13:34\nstep 14: CLICK: (938, 68)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (138, 547)\nstep 2: CLICK: (968, 67)\nstep 3: TYPE: football match\nstep 4: CLICK: (972, 67)\nstep 5: TYPE: football\nstep 6: CLICK: (129, 218)\nstep 7: PRESS_HOME\nstep 8: CLICK: (732, 94)\nstep 9: CLICK: (30, 144)\nstep 10: TYPE: caba62244@gmail.com\nstep 11: CLICK: (262, 303)\nstep 12: CLICK: (238, 364)\nstep 13: TYPE: Washington:Minchigan is 13:34\nstep 14: CLICK: (938, 68)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Yahoo Sports', 'Gmail']\nB: ['NewsBreak', 'Instagram']\nC: ['BBC Sports', 'Tumblr']\nD: ['Microsoft News', 'X']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_141_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_141_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_141_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_141_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_141_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_141_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_141_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_141_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_141_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_141_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_141_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_141_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_141_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_141_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_141_14.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['iNaturalist', 'Opera']\nB: 
['Setting', 'Edge']\nC: ['Google Play Store', 'Chrome']\nD: ['Applock Pro - APP Lock & Guard', 'DuckDuckGo']\n", + "question": "The corresponding actions are: step 1: CLICK: (637, 824)\nstep 2: CLICK: (450, 274)\nstep 3: TYPE: Email Client Apps\nstep 4: CLICK: (925, 912)\nstep 5: PRESS_HOME\nstep 6: CLICK: (149, 651)\nstep 7: CLICK: (376, 56)\nstep 8: TYPE: Microsoft Outlook\nstep 9: CLICK: (906, 908)\nstep 10: CLICK: (757, 321)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (637, 824)\nstep 2: CLICK: (450, 274)\nstep 3: TYPE: Email Client Apps\nstep 4: CLICK: (925, 912)\nstep 5: PRESS_HOME\nstep 6: CLICK: (149, 651)\nstep 7: CLICK: (376, 56)\nstep 8: TYPE: Microsoft Outlook\nstep 9: CLICK: (906, 908)\nstep 10: CLICK: (757, 321)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['iNaturalist', 'Opera']\nB: ['Setting', 'Edge']\nC: ['Google Play Store', 'Chrome']\nD: ['Applock Pro - APP Lock & Guard', 'DuckDuckGo']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_142_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_142_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_142_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_142_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_142_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_142_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_142_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_142_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_142_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_142_9.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_142_10.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Amazon Kindle', 'Chrome']\nB: ['Libby, by OverDrive', 'Firefox']\nC: ['Ploter - Ebook, Audiobook, PDF', 'Edge']\nD: ['Pocket FM: Audio Series', 'DuckDuckGo']\n", + "question": "The corresponding actions are: step 1: CLICK: (607, 831)\nstep 2: CLICK: (411, 284)\nstep 3: TYPE: The Vietnam War\nstep 4: CLICK: (344, 121)\nstep 5: CLICK: (463, 697)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (374, 107)\nstep 9: CLICK: (309, 62)\nstep 10: TYPE: The Vietnam War\nstep 11: CLICK: (346, 119)\nstep 12: CLICK: (436, 264)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (607, 831)\nstep 2: CLICK: (411, 284)\nstep 3: TYPE: The Vietnam War\nstep 4: CLICK: (344, 121)\nstep 5: CLICK: (463, 697)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (374, 107)\nstep 9: CLICK: (309, 62)\nstep 10: TYPE: The Vietnam War\nstep 11: CLICK: (346, 119)\nstep 12: CLICK: (436, 264)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Amazon Kindle', 'Chrome']\nB: ['Libby, by OverDrive', 'Firefox']\nC: ['Ploter - Ebook, Audiobook, PDF', 'Edge']\nD: ['Pocket FM: Audio Series', 'DuckDuckGo']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_143_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_143_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_143_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_143_3.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_143_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_143_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_143_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_143_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_143_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_143_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_143_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_143_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_143_12.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Map', 'Picturethis']\nB: ['Yandex Navigator', 'Google Play Store']\nC: ['Lyft', 'iNaturalist']\nD: ['Maps', 'Contacts']\n", + "question": "The corresponding actions are: step 1: CLICK: (627, 649)\nstep 2: CLICK: (114, 941)\nstep 3: CLICK: (353, 106)\nstep 4: TYPE: swimming pool\nstep 5: CLICK: (422, 191)\nstep 6: CLICK: (159, 581)\nstep 7: PRESS_HOME\nstep 8: CLICK: (884, 821)\nstep 9: CLICK: (313, 88)\nstep 10: TYPE: fitness tracking apps\nstep 11: CLICK: (270, 137)\nstep 12: CLICK: (393, 565)\nstep 13: CLICK: (516, 323)\nstep 14: SCROLL: UP\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (627, 649)\nstep 2: CLICK: (114, 941)\nstep 3: CLICK: (353, 106)\nstep 4: TYPE: swimming pool\nstep 5: CLICK: (422, 191)\nstep 6: CLICK: (159, 581)\nstep 7: PRESS_HOME\nstep 8: CLICK: (884, 821)\nstep 9: CLICK: (313, 88)\nstep 10: TYPE: fitness tracking apps\nstep 11: CLICK: (270, 137)\nstep 12: CLICK: (393, 565)\nstep 13: CLICK: (516, 323)\nstep 14: SCROLL: UP\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Map', 'Picturethis']\nB: ['Yandex Navigator', 'Google Play Store']\nC: ['Lyft', 'iNaturalist']\nD: ['Maps', 'Contacts']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_144_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_144_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_144_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_144_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_144_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_144_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_144_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_144_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_144_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_144_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_144_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_144_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_144_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_144_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_144_14.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Wikipedia', 'Facebook']\nB: 
['Chrome', 'Threads']\nC: ['Edge', 'Gmail']\nD: ['Firefox', 'X']\n", + "question": "The corresponding actions are: step 1: CLICK: (190, 689)\nstep 2: CLICK: (352, 435)\nstep 3: TYPE: cricket scores\nstep 4: CLICK: (847, 883)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (922, 136)\nstep 9: CLICK: (502, 910)\nstep 10: TYPE: RCB : DC is 20:19.1\nstep 11: CLICK: (741, 489)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (190, 689)\nstep 2: CLICK: (352, 435)\nstep 3: TYPE: cricket scores\nstep 4: CLICK: (847, 883)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (922, 136)\nstep 9: CLICK: (502, 910)\nstep 10: TYPE: RCB : DC is 20:19.1\nstep 11: CLICK: (741, 489)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Wikipedia', 'Facebook']\nB: ['Chrome', 'Threads']\nC: ['Edge', 'Gmail']\nD: ['Firefox', 'X']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_145_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_145_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_145_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_145_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_145_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_145_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_145_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_145_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_145_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_145_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_145_10.png", 
+ "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_145_11.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['TradingView: Track All Markets', 'Edge']\nB: ['Applock Pro - APP Lock & Guard', 'Wikipedia']\nC: ['Picturethis', 'wikiHow']\nD: ['Google Play Store', 'Opera']\n", + "question": "The corresponding actions are: step 1: CLICK: (592, 234)\nstep 2: CLICK: (331, 123)\nstep 3: TYPE: Language Learning Apps\nstep 4: CLICK: (918, 912)\nstep 5: PRESS_HOME\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (145, 629)\nstep 8: CLICK: (316, 46)\nstep 9: TYPE: Duolingo\nstep 10: CLICK: (906, 912)\nstep 11: CLICK: (846, 187)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (592, 234)\nstep 2: CLICK: (331, 123)\nstep 3: TYPE: Language Learning Apps\nstep 4: CLICK: (918, 912)\nstep 5: PRESS_HOME\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (145, 629)\nstep 8: CLICK: (316, 46)\nstep 9: TYPE: Duolingo\nstep 10: CLICK: (906, 912)\nstep 11: CLICK: (846, 187)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['TradingView: Track All Markets', 'Edge']\nB: ['Applock Pro - APP Lock & Guard', 'Wikipedia']\nC: ['Picturethis', 'wikiHow']\nD: ['Google Play Store', 'Opera']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_146_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_146_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_146_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_146_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_146_4.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_146_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_146_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_146_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_146_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_146_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_146_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_146_11.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['ChatGPT', 'Chrome']\nB: ['Remix:AI Image Creator', 'Opera']\nC: ['Picsart AI Photo Editor,Video', 'Bing: chat with AI & GPT4']\nD: ['Chatbot AI & Smart Assistant', 'DuckDuckGo']\n", + "question": "The corresponding actions are: step 1: CLICK: (425, 401)\nstep 2: CLICK: (390, 891)\nstep 3: TYPE: tell me about Fundamental theorem of calculus\nstep 4: CLICK: (689, 419)\nstep 5: PRESS_HOME\nstep 6: CLICK: (284, 391)\nstep 7: TYPE: Fundamental theorem of calculus\nstep 8: CLICK: (883, 692)\nstep 9: CLICK: (270, 938)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (425, 401)\nstep 2: CLICK: (390, 891)\nstep 3: TYPE: tell me about Fundamental theorem of calculus\nstep 4: CLICK: (689, 419)\nstep 5: PRESS_HOME\nstep 6: CLICK: (284, 391)\nstep 7: TYPE: Fundamental theorem of calculus\nstep 8: CLICK: (883, 692)\nstep 9: CLICK: (270, 938)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['ChatGPT', 'Chrome']\nB: ['Remix:AI Image Creator', 'Opera']\nC: ['Picsart AI Photo Editor,Video', 'Bing: chat with AI & GPT4']\nD: ['Chatbot AI & Smart Assistant', 'DuckDuckGo']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_147_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_147_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_147_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_147_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_147_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_147_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_147_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_147_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_147_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_147_9.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Whatsapp', 'TradingView: Track All Markets']\nB: ['Instagram', 'Setting']\nC: ['Messenger', 'Google Play Store']\nD: ['X', 'Picturethis']\n", + "question": "The corresponding actions are: step 1: CLICK: (846, 810)\nstep 2: SCROLL: UP\nstep 3: CLICK: (315, 426)\nstep 4: SCROLL: RIGHT\nstep 5: SCROLL: RIGHT\nstep 6: CLICK: (339, 300)\nstep 7: PRESS_HOME\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (610, 558)\nstep 11: 
COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (846, 810)\nstep 2: SCROLL: UP\nstep 3: CLICK: (315, 426)\nstep 4: SCROLL: RIGHT\nstep 5: SCROLL: RIGHT\nstep 6: CLICK: (339, 300)\nstep 7: PRESS_HOME\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (610, 558)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Whatsapp', 'TradingView: Track All Markets']\nB: ['Instagram', 'Setting']\nC: ['Messenger', 'Google Play Store']\nD: ['X', 'Picturethis']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_148_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_148_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_148_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_148_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_148_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_148_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_148_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_148_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_148_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_148_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_148_10.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Picsart AI Photo Editor,Video', 'Instagram']\nB: ['Microsoft Copilot', 'X']\nC: ['Chatty - AI Assistant', 'Facebook']\nD: ['GenZArt:Fast AI Art Generator', 'Messenger']\n", + "question": "The corresponding actions are: step 1: CLICK: (663, 677)\nstep 2: CLICK: (490, 
899)\nstep 3: CLICK: (390, 337)\nstep 4: TYPE: backpack\nstep 5: CLICK: (848, 877)\nstep 6: CLICK: (515, 776)\nstep 7: CLICK: (725, 777)\nstep 8: CLICK: (365, 673)\nstep 9: CLICK: (725, 444)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (663, 677)\nstep 2: CLICK: (490, 899)\nstep 3: CLICK: (390, 337)\nstep 4: TYPE: backpack\nstep 5: CLICK: (848, 877)\nstep 6: CLICK: (515, 776)\nstep 7: CLICK: (725, 777)\nstep 8: CLICK: (365, 673)\nstep 9: CLICK: (725, 444)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Picsart AI Photo Editor,Video', 'Instagram']\nB: ['Microsoft Copilot', 'X']\nC: ['Chatty - AI Assistant', 'Facebook']\nD: ['GenZArt:Fast AI Art Generator', 'Messenger']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_149_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_149_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_149_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_149_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_149_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_149_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_149_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_149_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_149_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_149_9.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Opera', 'Messenger']\nB: ['Wikipedia', 'Instagram']\nC: ['Chrome', 'Gmail']\nD: ['DuckDuckGo', 'Tumblr']\n", 
+ "question": "The corresponding actions are: step 1: CLICK: (869, 675)\nstep 2: CLICK: (277, 145)\nstep 3: TYPE: craft beer tasting events\nstep 4: CLICK: (904, 929)\nstep 5: PRESS_HOME\nstep 6: CLICK: (846, 147)\nstep 7: CLICK: (500, 421)\nstep 8: CLICK: (560, 946)\nstep 9: TYPE: The Mary Wallopers will come on May 3\nstep 10: CLICK: (951, 636)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (869, 675)\nstep 2: CLICK: (277, 145)\nstep 3: TYPE: craft beer tasting events\nstep 4: CLICK: (904, 929)\nstep 5: PRESS_HOME\nstep 6: CLICK: (846, 147)\nstep 7: CLICK: (500, 421)\nstep 8: CLICK: (560, 946)\nstep 9: TYPE: The Mary Wallopers will come on May 3\nstep 10: CLICK: (951, 636)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Opera', 'Messenger']\nB: ['Wikipedia', 'Instagram']\nC: ['Chrome', 'Gmail']\nD: ['DuckDuckGo', 'Tumblr']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_150_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_150_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_150_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_150_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_150_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_150_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_150_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_150_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_150_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_150_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_150_10.png" + ], + "output": "A" + }, + 
{ + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Tumblr', 'ZOOM Cloud Meetings']\nB: ['Gmail', 'Zoho Meeting']\nC: ['Threads', 'Google Meet']\nD: ['Facebook', 'Microsoft Teams']\n", + "question": "The corresponding actions are: step 1: CLICK: (409, 517)\nstep 2: CLICK: (806, 79)\nstep 3: CLICK: (208, 884)\nstep 4: CLICK: (340, 845)\nstep 5: PRESS_HOME\nstep 6: CLICK: (165, 150)\nstep 7: CLICK: (261, 123)\nstep 8: CLICK: (440, 153)\nstep 9: CLICK: (304, 241)\nstep 10: CLICK: (496, 491)\nstep 11: CLICK: (605, 951)\nstep 12: TYPE: https://teams.live.com/meet/9383055761460?p=3reyMWIRXcgNuvHI\nstep 13: CLICK: (931, 638)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (409, 517)\nstep 2: CLICK: (806, 79)\nstep 3: CLICK: (208, 884)\nstep 4: CLICK: (340, 845)\nstep 5: PRESS_HOME\nstep 6: CLICK: (165, 150)\nstep 7: CLICK: (261, 123)\nstep 8: CLICK: (440, 153)\nstep 9: CLICK: (304, 241)\nstep 10: CLICK: (496, 491)\nstep 11: CLICK: (605, 951)\nstep 12: TYPE: https://teams.live.com/meet/9383055761460?p=3reyMWIRXcgNuvHI\nstep 13: CLICK: (931, 638)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tumblr', 'ZOOM Cloud Meetings']\nB: ['Gmail', 'Zoho Meeting']\nC: ['Threads', 'Google Meet']\nD: ['Facebook', 'Microsoft Teams']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_151_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_151_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_151_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_151_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_151_4.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_151_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_151_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_151_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_151_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_151_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_151_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_151_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_151_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_151_13.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Notepad - Notes and To Do List', 'Opera']\nB: ['Microsoft Word', 'Quora']\nC: ['Google Keep', 'Firefox']\nD: ['Dropbox Paper', 'Chrome']\n", + "question": "The corresponding actions are: step 1: CLICK: (389, 131)\nstep 2: CLICK: (793, 195)\nstep 3: CLICK: (485, 515)\nstep 4: LONG_PRESS: (107, 287)\nstep 5: SCROLL: RIGHT\nstep 6: CLICK: (449, 256)\nstep 7: PRESS_HOME\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: CLICK: (371, 412)\nstep 11: CLICK: (913, 936)\nstep 12: CLICK: (866, 816)\nstep 13: TYPE: 2016 Nobel-Prize winners in physics\nstep 14: CLICK: (511, 178)\nstep 15: CLICK: (478, 684)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (389, 131)\nstep 2: CLICK: (793, 195)\nstep 3: CLICK: (485, 515)\nstep 4: LONG_PRESS: (107, 287)\nstep 5: SCROLL: RIGHT\nstep 6: CLICK: (449, 256)\nstep 7: PRESS_HOME\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: CLICK: (371, 412)\nstep 11: CLICK: (913, 936)\nstep 12: CLICK: (866, 816)\nstep 13: TYPE: 2016 Nobel-Prize winners in physics\nstep 14: CLICK: (511, 178)\nstep 15: CLICK: (478, 684)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Notepad - Notes and To Do List', 'Opera']\nB: ['Microsoft Word', 'Quora']\nC: ['Google Keep', 'Firefox']\nD: ['Dropbox Paper', 'Chrome']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_152_15.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", 
+ "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['BasicNote - Notes, Notepad', 'Tiktok']\nB: ['Microsoft word', 'Youtube']\nC: ['Notepad - Notes and To Do List', 'Triller']\nD: ['Dropbox Paper', 'Shorts VotTak: Short Video App']\n", + "question": "The corresponding actions are: step 1: CLICK: (316, 926)\nstep 2: CLICK: (205, 672)\nstep 3: CLICK: (101, 405)\nstep 4: PRESS_HOME\nstep 5: CLICK: (901, 479)\nstep 6: CLICK: (550, 395)\nstep 7: CLICK: (432, 584)\nstep 8: TYPE: Google's AI Course for Beginners(in 10 minutes)!\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (316, 926)\nstep 2: CLICK: (205, 672)\nstep 3: CLICK: (101, 405)\nstep 4: PRESS_HOME\nstep 5: CLICK: (901, 479)\nstep 6: CLICK: (550, 395)\nstep 7: CLICK: (432, 584)\nstep 8: TYPE: Google's AI Course for Beginners(in 10 minutes)!\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['BasicNote - Notes, Notepad', 'Tiktok']\nB: ['Microsoft word', 'Youtube']\nC: ['Notepad - Notes and To Do List', 'Triller']\nD: ['Dropbox Paper', 'Shorts VotTak: Short Video App']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_153_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_153_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_153_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_153_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_153_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_153_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_153_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_153_7.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_153_8.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Vaulty:Hide Pictures Videos', 'Amazon Kindle']\nB: ['Plantin', 'Kobo Books - eBooks Audiobooks']\nC: ['Setting', 'Libby, by OverDrive']\nD: ['Tripadvisor', 'Everand']\n", + "question": "The corresponding actions are: step 1: CLICK: (334, 495)\nstep 2: SCROLL: UP\nstep 3: CLICK: (167, 703)\nstep 4: CLICK: (942, 905)\nstep 5: PRESS_HOME\nstep 6: CLICK: (672, 501)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (334, 495)\nstep 2: SCROLL: UP\nstep 3: CLICK: (167, 703)\nstep 4: CLICK: (942, 905)\nstep 5: PRESS_HOME\nstep 6: CLICK: (672, 501)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Vaulty:Hide Pictures Videos', 'Amazon Kindle']\nB: ['Plantin', 'Kobo Books - eBooks Audiobooks']\nC: ['Setting', 'Libby, by OverDrive']\nD: ['Tripadvisor', 'Everand']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_154_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_154_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_154_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_154_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_154_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_154_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_154_6.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Picsart AI Photo Editor,Video', 
'Firefox']\nB: ['Remix:AI Image Creator', 'Quora']\nC: ['GenZArt:Fast AI Art Generator', 'DuckDuckGo']\nD: ['Chatbot AI & Smart Assistant', 'Chrome']\n", + "question": "The corresponding actions are: step 1: CLICK: (119, 347)\nstep 2: CLICK: (77, 92)\nstep 3: CLICK: (488, 717)\nstep 4: CLICK: (305, 885)\nstep 5: TYPE: tell me about Pythagorean theorem\nstep 6: CLICK: (920, 482)\nstep 7: PRESS_HOME\nstep 8: CLICK: (681, 747)\nstep 9: CLICK: (352, 398)\nstep 10: TYPE: Pythagorean theorem\nstep 11: CLICK: (912, 882)\nstep 12: SCROLL: UP\nstep 13: CLICK: (729, 852)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (119, 347)\nstep 2: CLICK: (77, 92)\nstep 3: CLICK: (488, 717)\nstep 4: CLICK: (305, 885)\nstep 5: TYPE: tell me about Pythagorean theorem\nstep 6: CLICK: (920, 482)\nstep 7: PRESS_HOME\nstep 8: CLICK: (681, 747)\nstep 9: CLICK: (352, 398)\nstep 10: TYPE: Pythagorean theorem\nstep 11: CLICK: (912, 882)\nstep 12: SCROLL: UP\nstep 13: CLICK: (729, 852)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Picsart AI Photo Editor,Video', 'Firefox']\nB: ['Remix:AI Image Creator', 'Quora']\nC: ['GenZArt:Fast AI Art Generator', 'DuckDuckGo']\nD: ['Chatbot AI & Smart Assistant', 'Chrome']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_155_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_155_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_155_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_155_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_155_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_155_5.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_155_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_155_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_155_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_155_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_155_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_155_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_155_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_155_13.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Bing: chat with AI & GPT4', 'PayPal - Send, Shop, Manage']\nB: ['Firefox', 'Google Pay']\nC: ['Quora', 'Chime - Mobile Banking']\nD: ['Chrome', 'Investing.com']\n", + "question": "The corresponding actions are: step 1: CLICK: (514, 901)\nstep 2: CLICK: (332, 355)\nstep 3: TYPE: Coca-Cola's stock market news\nstep 4: CLICK: (908, 688)\nstep 5: SCROLL: UP\nstep 6: CLICK: (333, 585)\nstep 7: PRESS_HOME\nstep 8: CLICK: (426, 397)\nstep 9: CLICK: (697, 69)\nstep 10: TYPE: Coca-Cola\nstep 11: CLICK: (389, 241)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (514, 901)\nstep 2: CLICK: (332, 355)\nstep 3: TYPE: Coca-Cola's stock market news\nstep 4: CLICK: (908, 688)\nstep 5: SCROLL: UP\nstep 6: CLICK: (333, 585)\nstep 7: PRESS_HOME\nstep 8: CLICK: (426, 397)\nstep 9: CLICK: (697, 69)\nstep 10: TYPE: Coca-Cola\nstep 11: CLICK: (389, 241)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Bing: chat with AI & GPT4', 'PayPal - Send, Shop, Manage']\nB: ['Firefox', 'Google Pay']\nC: ['Quora', 'Chime - Mobile Banking']\nD: ['Chrome', 'Investing.com']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_156_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_156_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_156_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_156_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_156_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_156_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_156_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_156_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_156_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_156_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_156_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_156_11.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Audible: Audio Entertainment', 'Applock Pro - APP Lock & Guard']\nB: ['Amazon Kindle', 'iNaturalist']\nC: ['Ploter - Ebook, Audiobook, PDF', 'Plantin']\nD: ['Google Play Books & Audiobooks', 'Setting']\n", + "question": "The corresponding actions are: step 1: CLICK: (433, 710)\nstep 2: CLICK: (152, 
944)\nstep 3: CLICK: (963, 801)\nstep 4: PRESS_HOME\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (290, 113)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (433, 710)\nstep 2: CLICK: (152, 944)\nstep 3: CLICK: (963, 801)\nstep 4: PRESS_HOME\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (290, 113)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Audible: Audio Entertainment', 'Applock Pro - APP Lock & Guard']\nB: ['Amazon Kindle', 'iNaturalist']\nC: ['Ploter - Ebook, Audiobook, PDF', 'Plantin']\nD: ['Google Play Books & Audiobooks', 'Setting']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_157_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_157_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_157_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_157_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_157_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_157_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_157_6.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Quora', 'Simple Calculator']\nB: ['DuckDuckGo', 'Calculator Plus with History']\nC: ['Edge', 'All-In-One Calculator']\nD: ['Chrome', 'Clock']\n", + "question": "The corresponding actions are: step 1: TYPE: nature soundscape video\nstep 2: CLICK: (899, 871)\nstep 3: CLICK: (580, 581)\nstep 4: SCROLL: UP\nstep 5: PRESS_HOME\nstep 6: CLICK: (416, 217)\nstep 7: TYPE: 8000\nstep 8: TYPE: 0\nstep 9: CLICK: (452, 717)\nstep 10: SCROLL: RIGHT\nstep 11: CLICK: (411, 543)\nstep 12: 
COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: TYPE: nature soundscape video\nstep 2: CLICK: (899, 871)\nstep 3: CLICK: (580, 581)\nstep 4: SCROLL: UP\nstep 5: PRESS_HOME\nstep 6: CLICK: (416, 217)\nstep 7: TYPE: 8000\nstep 8: TYPE: 0\nstep 9: CLICK: (452, 717)\nstep 10: SCROLL: RIGHT\nstep 11: CLICK: (411, 543)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Quora', 'Simple Calculator']\nB: ['DuckDuckGo', 'Calculator Plus with History']\nC: ['Edge', 'All-In-One Calculator']\nD: ['Chrome', 'Clock']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_158_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_158_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_158_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_158_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_158_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_158_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_158_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_158_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_158_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_158_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_158_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_158_11.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Tumblr', 'Meesho']\nB: ['Gmail', 'Tata Neu']\nC: ['Instagram', 'Net-a-Porte']\nD: ['Threads', 'Flipkart']\n", + "question": "The corresponding actions 
are: step 1: CLICK: (412, 385)\nstep 2: CLICK: (78, 71)\nstep 3: CLICK: (900, 79)\nstep 4: TYPE: laptop recommendation\nstep 5: CLICK: (884, 907)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (529, 525)\nstep 9: PRESS_HOME\nstep 10: CLICK: (137, 255)\nstep 11: CLICK: (418, 132)\nstep 12: TYPE: HP Envy\nstep 13: CLICK: (928, 927)\nstep 14: CLICK: (584, 211)\nstep 15: CLICK: (202, 940)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (412, 385)\nstep 2: CLICK: (78, 71)\nstep 3: CLICK: (900, 79)\nstep 4: TYPE: laptop recommendation\nstep 5: CLICK: (884, 907)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (529, 525)\nstep 9: PRESS_HOME\nstep 10: CLICK: (137, 255)\nstep 11: CLICK: (418, 132)\nstep 12: TYPE: HP Envy\nstep 13: CLICK: (928, 927)\nstep 14: CLICK: (584, 211)\nstep 15: CLICK: (202, 940)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tumblr', 'Meesho']\nB: ['Gmail', 'Tata Neu']\nC: ['Instagram', 'Net-a-Porte']\nD: ['Threads', 'Flipkart']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_8.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_159_15.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Threads', 'BBC Sports']\nB: ['Messenger', 'SmartNews:News That Matters']\nC: ['Instagram', 'Yahoo Sports']\nD: ['Whatsapp', 'Breaking News: local & Alerts']\n", + "question": "The corresponding actions are: step 1: CLICK: (133, 549)\nstep 2: CLICK: (978, 67)\nstep 3: TYPE: tennis\nstep 4: CLICK: (113, 217)\nstep 5: CLICK: (216, 209)\nstep 6: CLICK: (217, 208)\nstep 7: PRESS_HOME\nstep 8: CLICK: (583, 118)\nstep 9: CLICK: (22, 502)\nstep 10: CLICK: (110, 136)\nstep 11: CLICK: (38, 201)\nstep 12: TYPE: S.Tsitsipas:C.Ruud is 53:76\nstep 13: CLICK: (958, 446)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (133, 549)\nstep 2: CLICK: (978, 67)\nstep 3: TYPE: tennis\nstep 4: CLICK: (113, 217)\nstep 5: CLICK: (216, 209)\nstep 6: CLICK: (217, 208)\nstep 7: PRESS_HOME\nstep 8: CLICK: (583, 118)\nstep 9: CLICK: (22, 502)\nstep 10: CLICK: (110, 136)\nstep 11: CLICK: (38, 201)\nstep 12: TYPE: S.Tsitsipas:C.Ruud is 53:76\nstep 13: CLICK: (958, 446)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Threads', 'BBC Sports']\nB: ['Messenger', 'SmartNews:News That Matters']\nC: ['Instagram', 'Yahoo Sports']\nD: ['Whatsapp', 'Breaking News: local & Alerts']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_160_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_160_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_160_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_160_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_160_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_160_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_160_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_160_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_160_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_160_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_160_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_160_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_160_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_160_13.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Opera', 'Chime - Mobile Banking']\nB: ['Firefox', 'Google Wallet']\nC: ['DuckDuckGo', 'Cash App']\nD: 
['Chrome', 'Investing.com']\n", + "question": "The corresponding actions are: step 1: CLICK: (516, 901)\nstep 2: TYPE: McDonald's stock market news\nstep 3: CLICK: (896, 688)\nstep 4: SCROLL: UP\nstep 5: CLICK: (393, 411)\nstep 6: PRESS_HOME\nstep 7: CLICK: (424, 401)\nstep 8: CLICK: (702, 68)\nstep 9: TYPE: McDonald's\nstep 10: CLICK: (428, 261)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (516, 901)\nstep 2: TYPE: McDonald's stock market news\nstep 3: CLICK: (896, 688)\nstep 4: SCROLL: UP\nstep 5: CLICK: (393, 411)\nstep 6: PRESS_HOME\nstep 7: CLICK: (424, 401)\nstep 8: CLICK: (702, 68)\nstep 9: TYPE: McDonald's\nstep 10: CLICK: (428, 261)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Opera', 'Chime - Mobile Banking']\nB: ['Firefox', 'Google Wallet']\nC: ['DuckDuckGo', 'Cash App']\nD: ['Chrome', 'Investing.com']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_161_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_161_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_161_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_161_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_161_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_161_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_161_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_161_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_161_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_161_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_161_10.png" + ], + "output": "D" + }, + { + 
"task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['GPS, Maps, Voice Navigation', 'Uber']\nB: ['Yandex Navigator', 'Citymapper']\nC: ['Google Map', 'Lyft']\nD: ['Maps', 'Google Map']\n", + "question": "The corresponding actions are: step 1: CLICK: (561, 308)\nstep 2: CLICK: (165, 70)\nstep 3: TYPE: book store\nstep 4: CLICK: (171, 160)\nstep 5: PRESS_HOME\nstep 6: CLICK: (549, 172)\nstep 7: CLICK: (350, 623)\nstep 8: TYPE: The Last Bookstore\nstep 9: CLICK: (434, 317)\nstep 10: CLICK: (433, 881)\nstep 11: CLICK: (394, 895)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (561, 308)\nstep 2: CLICK: (165, 70)\nstep 3: TYPE: book store\nstep 4: CLICK: (171, 160)\nstep 5: PRESS_HOME\nstep 6: CLICK: (549, 172)\nstep 7: CLICK: (350, 623)\nstep 8: TYPE: The Last Bookstore\nstep 9: CLICK: (434, 317)\nstep 10: CLICK: (433, 881)\nstep 11: CLICK: (394, 895)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['GPS, Maps, Voice Navigation', 'Uber']\nB: ['Yandex Navigator', 'Citymapper']\nC: ['Google Map', 'Lyft']\nD: ['Maps', 'Google Map']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_162_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_162_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_162_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_162_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_162_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_162_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_162_6.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_162_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_162_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_162_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_162_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_162_11.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Todoist', 'Memrise: speak a new language']\nB: ['Things', 'Duolingo']\nC: ['To-Do List', 'Rosetta Stone: Learn, Practice']\nD: ['Any.do', 'Babbel - Learn Languages']\n", + "question": "The corresponding actions are: step 1: CLICK: (688, 145)\nstep 2: CLICK: (182, 67)\nstep 3: SCROLL: UP\nstep 4: CLICK: (267, 536)\nstep 5: CLICK: (185, 461)\nstep 6: CLICK: (124, 395)\nstep 7: CLICK: (741, 784)\nstep 8: PRESS_HOME\nstep 9: CLICK: (931, 313)\nstep 10: CLICK: (744, 789)\nstep 11: TYPE: Korean Learning\nstep 12: CLICK: (753, 498)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (688, 145)\nstep 2: CLICK: (182, 67)\nstep 3: SCROLL: UP\nstep 4: CLICK: (267, 536)\nstep 5: CLICK: (185, 461)\nstep 6: CLICK: (124, 395)\nstep 7: CLICK: (741, 784)\nstep 8: PRESS_HOME\nstep 9: CLICK: (931, 313)\nstep 10: CLICK: (744, 789)\nstep 11: TYPE: Korean Learning\nstep 12: CLICK: (753, 498)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Todoist', 'Memrise: speak a new language']\nB: ['Things', 'Duolingo']\nC: ['To-Do List', 'Rosetta Stone: Learn, Practice']\nD: ['Any.do', 'Babbel - Learn Languages']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_163_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_163_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_163_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_163_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_163_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_163_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_163_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_163_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_163_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_163_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_163_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_163_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_163_12.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Bloomberg: Finance Market News', 'TradingView: Track All Markets']\nB: ['Yahoo Finance: Stock News', 'Plantin']\nC: ['CNN Breaking US & World News', 'Setting']\nD: ['SmartNews:News That Matters', 
'Contacts']\n", + "question": "The corresponding actions are: step 1: CLICK: (832, 266)\nstep 2: CLICK: (930, 70)\nstep 3: CLICK: (140, 70)\nstep 4: TYPE: Coca-Cola\nstep 5: CLICK: (264, 449)\nstep 6: PRESS_HOME\nstep 7: CLICK: (140, 398)\nstep 8: CLICK: (941, 77)\nstep 9: TYPE: Coca-Cola\nstep 10: CLICK: (350, 192)\nstep 11: IMPOSSIBLE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (832, 266)\nstep 2: CLICK: (930, 70)\nstep 3: CLICK: (140, 70)\nstep 4: TYPE: Coca-Cola\nstep 5: CLICK: (264, 449)\nstep 6: PRESS_HOME\nstep 7: CLICK: (140, 398)\nstep 8: CLICK: (941, 77)\nstep 9: TYPE: Coca-Cola\nstep 10: CLICK: (350, 192)\nstep 11: IMPOSSIBLE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Bloomberg: Finance Market News', 'TradingView: Track All Markets']\nB: ['Yahoo Finance: Stock News', 'Plantin']\nC: ['CNN Breaking US & World News', 'Setting']\nD: ['SmartNews:News That Matters', 'Contacts']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_164_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_164_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_164_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_164_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_164_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_164_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_164_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_164_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_164_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_164_9.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_164_10.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Tubi: Movies & Live TV', 'Firefox']\nB: ['Pluto TV - Live TV and Movies', 'Opera']\nC: ['Shorts VotTak: Short Video App', 'Bing: chat with AI & GPT4']\nD: ['Netflix', 'Quora']\n", + "question": "The corresponding actions are: step 1: CLICK: (624, 818)\nstep 2: CLICK: (185, 433)\nstep 3: CLICK: (819, 563)\nstep 4: CLICK: (566, 182)\nstep 5: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (624, 818)\nstep 2: CLICK: (185, 433)\nstep 3: CLICK: (819, 563)\nstep 4: CLICK: (566, 182)\nstep 5: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tubi: Movies & Live TV', 'Firefox']\nB: ['Pluto TV - Live TV and Movies', 'Opera']\nC: ['Shorts VotTak: Short Video App', 'Bing: chat with AI & GPT4']\nD: ['Netflix', 'Quora']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_165_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_165_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_165_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_165_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_165_4.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Quora', 'Apartment List', 'iHeart: Music, Radio, Podcasts']\nB: ['Chrome', 'StubHub', 'Amazon Music']\nC: ['Opera', 'Airbnb', 'Pandora']\nD: ['Wikipedia', 'Agoda', 'YT Music']\n", + "question": "The corresponding actions are: step 1: 
CLICK: (140, 104)\nstep 2: CLICK: (17, 57)\nstep 3: CLICK: (228, 367)\nstep 4: TYPE: popular pop music band now\nstep 5: CLICK: (304, 192)\nstep 6: CLICK: (308, 724)\nstep 7: PRESS_HOME\nstep 8: SCROLL: RIGHT\nstep 9: CLICK: (713, 706)\nstep 10: CLICK: (143, 308)\nstep 11: CLICK: (281, 543)\nstep 12: PRESS_HOME\nstep 13: SCROLL: LEFT\nstep 14: CLICK: (288, 103)\nstep 15: CLICK: (695, 358)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (140, 104)\nstep 2: CLICK: (17, 57)\nstep 3: CLICK: (228, 367)\nstep 4: TYPE: popular pop music band now\nstep 5: CLICK: (304, 192)\nstep 6: CLICK: (308, 724)\nstep 7: PRESS_HOME\nstep 8: SCROLL: RIGHT\nstep 9: CLICK: (713, 706)\nstep 10: CLICK: (143, 308)\nstep 11: CLICK: (281, 543)\nstep 12: PRESS_HOME\nstep 13: SCROLL: LEFT\nstep 14: CLICK: (288, 103)\nstep 15: CLICK: (695, 358)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Quora', 'Apartment List', 'iHeart: Music, Radio, Podcasts']\nB: ['Chrome', 'StubHub', 'Amazon Music']\nC: ['Opera', 'Airbnb', 'Pandora']\nD: ['Wikipedia', 'Agoda', 'YT Music']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_7.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_166_15.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['iNaturalist', 'Meesho']\nB: ['Google Play Store', 'Joom']\nC: ['Setting', 'YOOX']\nD: ['Picturethis', 'Amazon']\n", + "question": "The corresponding actions are: step 1: SCROLL: RIGHT\nstep 2: CLICK: (96, 662)\nstep 3: CLICK: (346, 89)\nstep 4: TYPE: Joom\nstep 5: CLICK: (855, 871)\nstep 6: CLICK: (784, 425)\nstep 7: CLICK: (798, 431)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: SCROLL: RIGHT\nstep 2: CLICK: (96, 662)\nstep 3: CLICK: (346, 89)\nstep 4: TYPE: Joom\nstep 5: CLICK: (855, 871)\nstep 6: CLICK: (784, 425)\nstep 7: CLICK: (798, 431)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['iNaturalist', 'Meesho']\nB: ['Google Play Store', 'Joom']\nC: ['Setting', 'YOOX']\nD: ['Picturethis', 'Amazon']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_167_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_167_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_167_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_167_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_167_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_167_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_167_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_167_7.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Job Search by ZipRecruiter', 'Simplenote']\nB: ['LinkedIn: Jobs & Business News', 'Google Keep']\nC: ['Indeed Job Search', 'BasicNote - Notes, Notepad']\nD: ['Indeed Job Search', 'WPS office']\n", + "question": "The corresponding actions are: step 1: CLICK: (319, 353)\nstep 2: CLICK: (344, 97)\nstep 3: TYPE: mobile app developer\nstep 4: CLICK: (920, 884)\nstep 5: PRESS_HOME\nstep 6: CLICK: (499, 232)\nstep 7: CLICK: (881, 887)\nstep 8: CLICK: (155, 169)\nstep 9: TYPE: Decker\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (319, 353)\nstep 2: CLICK: (344, 97)\nstep 3: TYPE: mobile app developer\nstep 4: CLICK: (920, 884)\nstep 5: PRESS_HOME\nstep 6: CLICK: (499, 232)\nstep 7: CLICK: (881, 887)\nstep 8: CLICK: (155, 169)\nstep 9: TYPE: Decker\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Job Search by ZipRecruiter', 'Simplenote']\nB: ['LinkedIn: Jobs & Business News', 'Google Keep']\nC: ['Indeed Job Search', 'BasicNote - Notes, Notepad']\nD: ['Indeed Job Search', 'WPS office']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_168_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_168_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_168_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_168_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_168_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_168_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_168_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_168_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_168_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_168_9.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Setting', 'Amazon Kindle']\nB: ['Vaulty:Hide Pictures Videos', 'Ploter - Ebook, Audiobook, PDF']\nC: ['Tripadvisor', 'Libby, by OverDrive']\nD: ['Applock Pro - APP Lock & Guard', 'Everand']\n", + "question": "The corresponding actions are: step 1: CLICK: (834, 524)\nstep 2: SCROLL: UP\nstep 3: CLICK: (376, 627)\nstep 4: CLICK: (897, 656)\nstep 5: PRESS_HOME\nstep 6: SCROLL: LEFT\nstep 7: CLICK: (633, 136)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI 
navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (834, 524)\nstep 2: SCROLL: UP\nstep 3: CLICK: (376, 627)\nstep 4: CLICK: (897, 656)\nstep 5: PRESS_HOME\nstep 6: SCROLL: LEFT\nstep 7: CLICK: (633, 136)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Setting', 'Amazon Kindle']\nB: ['Vaulty:Hide Pictures Videos', 'Ploter - Ebook, Audiobook, PDF']\nC: ['Tripadvisor', 'Libby, by OverDrive']\nD: ['Applock Pro - APP Lock & Guard', 'Everand']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_169_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_169_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_169_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_169_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_169_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_169_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_169_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_169_7.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Wikipedia', 'Any.do']\nB: ['Opera', 'To-Do List']\nC: ['Firefox', 'Things']\nD: ['Chrome', 'TickTick']\n", + "question": "The corresponding actions are: step 1: CLICK: (276, 277)\nstep 2: CLICK: (515, 153)\nstep 3: CLICK: (969, 901)\nstep 4: TYPE: Do Yoga Morning\nstep 5: CLICK: (687, 872)\nstep 6: PRESS_HOME\nstep 7: CLICK: (520, 916)\nstep 8: CLICK: (255, 119)\nstep 9: TYPE: Yoga video for beginners\nstep 10: CLICK: (268, 172)\nstep 11: CLICK: (361, 411)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": 
"Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (276, 277)\nstep 2: CLICK: (515, 153)\nstep 3: CLICK: (969, 901)\nstep 4: TYPE: Do Yoga Morning\nstep 5: CLICK: (687, 872)\nstep 6: PRESS_HOME\nstep 7: CLICK: (520, 916)\nstep 8: CLICK: (255, 119)\nstep 9: TYPE: Yoga video for beginners\nstep 10: CLICK: (268, 172)\nstep 11: CLICK: (361, 411)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Wikipedia', 'Any.do']\nB: ['Opera', 'To-Do List']\nC: ['Firefox', 'Things']\nD: ['Chrome', 'TickTick']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_170_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_170_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_170_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_170_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_170_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_170_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_170_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_170_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_170_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_170_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_170_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_170_11.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Apartment List', 'Citymapper']\nB: ['Apartments.com Rental Search', 'Lyft']\nC: ['Booking.com', 'Maps']\nD: ['TickPick - Live Event Tickets', 'Petal Maps - GPS & Navigation']\n", + "question": "The corresponding actions are: step 1: CLICK: (146, 
374)\nstep 2: CLICK: (790, 658)\nstep 3: PRESS_HOME\nstep 4: CLICK: (381, 509)\nstep 5: CLICK: (381, 712)\nstep 6: TYPE: 825 E 4th St\nstep 7: CLICK: (447, 272)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (146, 374)\nstep 2: CLICK: (790, 658)\nstep 3: PRESS_HOME\nstep 4: CLICK: (381, 509)\nstep 5: CLICK: (381, 712)\nstep 6: TYPE: 825 E 4th St\nstep 7: CLICK: (447, 272)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Apartment List', 'Citymapper']\nB: ['Apartments.com Rental Search', 'Lyft']\nC: ['Booking.com', 'Maps']\nD: ['TickPick - Live Event Tickets', 'Petal Maps - GPS & Navigation']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_171_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_171_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_171_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_171_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_171_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_171_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_171_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_171_7.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Chrome', 'Messenger']\nB: ['Firefox', 'Tumblr']\nC: ['DuckDuckgo', 'Gmail']\nD: ['Edge', 'Whatsapp']\n", + "question": "The corresponding actions are: step 1: CLICK: (203, 491)\nstep 2: TYPE: China\nstep 3: CLICK: (908, 888)\nstep 4: CLICK: (567, 636)\nstep 5: SCROLL: UP\nstep 6: SCROLL: DOWN\nstep 7: CLICK: (975, 89)\nstep 8: CLICK: 
(739, 240)\nstep 9: CLICK: (475, 688)\nstep 10: TYPE: caba62244@gmail.com\nstep 11: CLICK: (422, 333)\nstep 12: CLICK: (900, 82)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (203, 491)\nstep 2: TYPE: China\nstep 3: CLICK: (908, 888)\nstep 4: CLICK: (567, 636)\nstep 5: SCROLL: UP\nstep 6: SCROLL: DOWN\nstep 7: CLICK: (975, 89)\nstep 8: CLICK: (739, 240)\nstep 9: CLICK: (475, 688)\nstep 10: TYPE: caba62244@gmail.com\nstep 11: CLICK: (422, 333)\nstep 12: CLICK: (900, 82)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Chrome', 'Messenger']\nB: ['Firefox', 'Tumblr']\nC: ['DuckDuckgo', 'Gmail']\nD: ['Edge', 'Whatsapp']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_172_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_172_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_172_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_172_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_172_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_172_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_172_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_172_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_172_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_172_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_172_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_172_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_172_12.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": 
"GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Wikipedia', 'Whatsapp']\nB: ['DuckDuckGo', 'Facebook']\nC: ['Bing: chat with AI & GPT4', 'Messenger']\nD: ['Chrome', 'Instagram']\n", + "question": "The corresponding actions are: step 1: CLICK: (138, 643)\nstep 2: CLICK: (420, 246)\nstep 3: TYPE: technology conference events\nstep 4: CLICK: (916, 900)\nstep 5: CLICK: (459, 477)\nstep 6: PRESS_HOME\nstep 7: CLICK: (615, 110)\nstep 8: CLICK: (924, 72)\nstep 9: CLICK: (418, 148)\nstep 10: CLICK: (143, 216)\nstep 11: CLICK: (260, 938)\nstep 12: TYPE: 2024 RSA Conference is on now\nstep 13: CLICK: (909, 588)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (138, 643)\nstep 2: CLICK: (420, 246)\nstep 3: TYPE: technology conference events\nstep 4: CLICK: (916, 900)\nstep 5: CLICK: (459, 477)\nstep 6: PRESS_HOME\nstep 7: CLICK: (615, 110)\nstep 8: CLICK: (924, 72)\nstep 9: CLICK: (418, 148)\nstep 10: CLICK: (143, 216)\nstep 11: CLICK: (260, 938)\nstep 12: TYPE: 2024 RSA Conference is on now\nstep 13: CLICK: (909, 588)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Wikipedia', 'Whatsapp']\nB: ['DuckDuckGo', 'Facebook']\nC: ['Bing: chat with AI & GPT4', 'Messenger']\nD: ['Chrome', 'Instagram']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_173_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_173_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_173_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_173_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_173_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_173_5.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_173_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_173_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_173_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_173_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_173_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_173_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_173_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_173_13.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Edge', 'Whatsapp']\nB: ['Opera', 'Facebook']\nC: ['DuckDuckGo', 'Tumblr']\nD: ['Bing: chat with AI & GPT4', 'Messenger']\n", + "question": "The corresponding actions are: step 1: CLICK: (884, 599)\nstep 2: CLICK: (509, 249)\nstep 3: TYPE: Mississippi River\nstep 4: CLICK: (922, 877)\nstep 5: SCROLL: UP\nstep 6: CLICK: (745, 778)\nstep 7: CLICK: (915, 158)\nstep 8: CLICK: (581, 256)\nstep 9: CLICK: (870, 853)\nstep 10: CLICK: (897, 146)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (884, 599)\nstep 2: CLICK: (509, 249)\nstep 3: TYPE: Mississippi River\nstep 4: CLICK: (922, 877)\nstep 5: SCROLL: UP\nstep 6: CLICK: (745, 778)\nstep 7: CLICK: (915, 158)\nstep 8: CLICK: (581, 256)\nstep 9: CLICK: (870, 853)\nstep 10: CLICK: (897, 146)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Edge', 'Whatsapp']\nB: ['Opera', 'Facebook']\nC: ['DuckDuckGo', 'Tumblr']\nD: ['Bing: chat with AI & GPT4', 'Messenger']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_174_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_174_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_174_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_174_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_174_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_174_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_174_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_174_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_174_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_174_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_174_10.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Tiktok', 'Applock Pro - APP Lock & Guard']\nB: ['Shorts VotTak: Short Video App', 'Picturethis']\nC: ['Tubi: Movies & Live TV', 'Contacts']\nD: ['Youtube', 'Google Play Store']\n", + "question": "The corresponding actions are: step 1: CLICK: (607, 813)\nstep 2: PRESS_HOME\nstep 3: CLICK: (819, 815)\nstep 4: CLICK: (802, 59)\nstep 5: CLICK: (931, 56)\nstep 6: TYPE: fiton\nstep 7: CLICK: (934, 899)\nstep 8: CLICK: (862, 329)\nstep 9: 
COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (607, 813)\nstep 2: PRESS_HOME\nstep 3: CLICK: (819, 815)\nstep 4: CLICK: (802, 59)\nstep 5: CLICK: (931, 56)\nstep 6: TYPE: fiton\nstep 7: CLICK: (934, 899)\nstep 8: CLICK: (862, 329)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tiktok', 'Applock Pro - APP Lock & Guard']\nB: ['Shorts VotTak: Short Video App', 'Picturethis']\nC: ['Tubi: Movies & Live TV', 'Contacts']\nD: ['Youtube', 'Google Play Store']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_175_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_175_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_175_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_175_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_175_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_175_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_175_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_175_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_175_8.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['OfferUp: Buy. Sell. 
Letgo.', 'Setting']\nB: ['SHEIN', 'Google Play Store']\nC: ['REVOLVE', 'TradingView: Track All Markets']\nD: ['Alibaba.com - B2B marketplace', 'Contacts']\n", + "question": "The corresponding actions are: step 1: CLICK: (210, 684)\nstep 2: CLICK: (308, 113)\nstep 3: TYPE: SHEIN\nstep 4: CLICK: (866, 877)\nstep 5: CLICK: (738, 431)\nstep 6: CLICK: (915, 438)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (210, 684)\nstep 2: CLICK: (308, 113)\nstep 3: TYPE: SHEIN\nstep 4: CLICK: (866, 877)\nstep 5: CLICK: (738, 431)\nstep 6: CLICK: (915, 438)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['OfferUp: Buy. Sell. Letgo.', 'Setting']\nB: ['SHEIN', 'Google Play Store']\nC: ['REVOLVE', 'TradingView: Track All Markets']\nD: ['Alibaba.com - B2B marketplace', 'Contacts']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_176_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_176_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_176_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_176_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_176_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_176_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_176_6.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['TradingView: Track All Markets', 'X', 'Vaulty:Hide Pictures Videos']\nB: ['PlantNet', 'Tumblr', 'PlantNet']\nC: ['Picturethis', 'Gmail', 'Contacts']\nD: ['Google Play Store', 'Instagram', 'Setting']\n", + "question": "The corresponding 
actions are: step 1: CLICK: (822, 817)\nstep 2: CLICK: (824, 329)\nstep 3: PRESS_HOME\nstep 4: CLICK: (381, 820)\nstep 5: CLICK: (284, 509)\nstep 6: CLICK: (834, 406)\nstep 7: PRESS_BACK\nstep 8: CLICK: (184, 402)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (822, 817)\nstep 2: CLICK: (824, 329)\nstep 3: PRESS_HOME\nstep 4: CLICK: (381, 820)\nstep 5: CLICK: (284, 509)\nstep 6: CLICK: (834, 406)\nstep 7: PRESS_BACK\nstep 8: CLICK: (184, 402)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['TradingView: Track All Markets', 'X', 'Vaulty:Hide Pictures Videos']\nB: ['PlantNet', 'Tumblr', 'PlantNet']\nC: ['Picturethis', 'Gmail', 'Contacts']\nD: ['Google Play Store', 'Instagram', 'Setting']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_177_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_177_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_177_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_177_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_177_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_177_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_177_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_177_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_177_8.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['iNaturalist', 'Vaulty:Hide Pictures Videos', 'Shorts VotTak: Short Video App']\nB: ['Tripadvisor', 'Applock Pro - APP Lock & Guard', 'Youtube']\nC: ['Setting', 'Google 
Play Store', 'Likee']\nD: ['PlantNet', 'Plantin', 'Tiktok']\n", + "question": "The corresponding actions are: step 1: CLICK: (819, 815)\nstep 2: CLICK: (292, 79)\nstep 3: TYPE: Likee\nstep 4: CLICK: (906, 914)\nstep 5: CLICK: (865, 421)\nstep 6: PRESS_HOME\nstep 7: CLICK: (386, 824)\nstep 8: CLICK: (345, 572)\nstep 9: CLICK: (530, 347)\nstep 10: CLICK: (642, 542)\nstep 11: CLICK: (817, 440)\nstep 12: PRESS_BACK\nstep 13: CLICK: (131, 436)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (819, 815)\nstep 2: CLICK: (292, 79)\nstep 3: TYPE: Likee\nstep 4: CLICK: (906, 914)\nstep 5: CLICK: (865, 421)\nstep 6: PRESS_HOME\nstep 7: CLICK: (386, 824)\nstep 8: CLICK: (345, 572)\nstep 9: CLICK: (530, 347)\nstep 10: CLICK: (642, 542)\nstep 11: CLICK: (817, 440)\nstep 12: PRESS_BACK\nstep 13: CLICK: (131, 436)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['iNaturalist', 'Vaulty:Hide Pictures Videos', 'Shorts VotTak: Short Video App']\nB: ['Tripadvisor', 'Applock Pro - APP Lock & Guard', 'Youtube']\nC: ['Setting', 'Google Play Store', 'Likee']\nD: ['PlantNet', 'Plantin', 'Tiktok']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_178_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_178_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_178_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_178_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_178_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_178_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_178_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_178_7.png", 
+ "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_178_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_178_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_178_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_178_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_178_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_178_13.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Bing: chat with AI & GPT4', 'Whatsapp']\nB: ['DuckDuckgo', 'Messenger']\nC: ['Wikipedia', 'X']\nD: ['Edge', 'Instagram']\n", + "question": "The corresponding actions are: step 1: CLICK: (360, 669)\nstep 2: TYPE: political debate events\nstep 3: CLICK: (891, 922)\nstep 4: CLICK: (328, 242)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (819, 132)\nstep 9: CLICK: (366, 470)\nstep 10: CLICK: (523, 948)\nstep 11: TYPE: I saw an ad about NSDA\nstep 12: CLICK: (947, 645)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (360, 669)\nstep 2: TYPE: political debate events\nstep 3: CLICK: (891, 922)\nstep 4: CLICK: (328, 242)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (819, 132)\nstep 9: CLICK: (366, 470)\nstep 10: CLICK: (523, 948)\nstep 11: TYPE: I saw an ad about NSDA\nstep 12: CLICK: (947, 645)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Bing: chat with AI & GPT4', 'Whatsapp']\nB: ['DuckDuckgo', 'Messenger']\nC: ['Wikipedia', 'X']\nD: ['Edge', 'Instagram']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_179_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_179_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_179_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_179_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_179_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_179_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_179_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_179_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_179_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_179_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_179_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_179_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_179_12.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['ClevCalc - Calculator', 'iNaturalist']\nB: ['Calendar', 'Setting']\nC: ['Simple Calendar - easy planner', 'Picturethis']\nD: ['Basic Calculator: GPA & Math', 'Vaulty:Hide Pictures Videos']\n", + "question": "The corresponding actions are: 
step 1: CLICK: (325, 497)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (195, 932)\nstep 5: CLICK: (567, 349)\nstep 6: CLICK: (528, 421)\nstep 7: CLICK: (535, 619)\nstep 8: CLICK: (970, 90)\nstep 9: TYPE: Spanish\nstep 10: CLICK: (504, 220)\nstep 11: SCROLL: UP\nstep 12: CLICK: (869, 646)\nstep 13: PRESS_HOME\nstep 14: CLICK: (88, 501)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (325, 497)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (195, 932)\nstep 5: CLICK: (567, 349)\nstep 6: CLICK: (528, 421)\nstep 7: CLICK: (535, 619)\nstep 8: CLICK: (970, 90)\nstep 9: TYPE: Spanish\nstep 10: CLICK: (504, 220)\nstep 11: SCROLL: UP\nstep 12: CLICK: (869, 646)\nstep 13: PRESS_HOME\nstep 14: CLICK: (88, 501)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['ClevCalc - Calculator', 'iNaturalist']\nB: ['Calendar', 'Setting']\nC: ['Simple Calendar - easy planner', 'Picturethis']\nD: ['Basic Calculator: GPA & Math', 'Vaulty:Hide Pictures Videos']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_180_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_180_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_180_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_180_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_180_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_180_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_180_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_180_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_180_8.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_180_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_180_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_180_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_180_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_180_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_180_14.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Booking.com', 'Maps']\nB: ['Apartments.com Rental Search', 'Waze Navigation & Live Traffic']\nC: ['Traveloka', 'Yandex Navigator']\nD: ['Apartment List', 'Lyft']\n", + "question": "The corresponding actions are: step 1: CLICK: (566, 654)\nstep 2: CLICK: (201, 234)\nstep 3: PRESS_HOME\nstep 4: CLICK: (918, 150)\nstep 5: CLICK: (236, 695)\nstep 6: TYPE: 257 S Spring St\nstep 7: CLICK: (275, 237)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (566, 654)\nstep 2: CLICK: (201, 234)\nstep 3: PRESS_HOME\nstep 4: CLICK: (918, 150)\nstep 5: CLICK: (236, 695)\nstep 6: TYPE: 257 S Spring St\nstep 7: CLICK: (275, 237)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Booking.com', 'Maps']\nB: ['Apartments.com Rental Search', 'Waze Navigation & Live Traffic']\nC: ['Traveloka', 'Yandex Navigator']\nD: ['Apartment List', 'Lyft']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_181_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_181_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_181_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_181_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_181_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_181_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_181_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_181_7.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Instagram', 'Chrome']\nB: ['Messenger', 'Wikipedia']\nC: ['Gmail', 'Bing: chat with AI & GPT4']\nD: ['X', 'Opera']\n", + "question": "The corresponding actions are: step 1: CLICK: (161, 598)\nstep 2: CLICK: (351, 379)\nstep 3: TYPE: baseball scores\nstep 4: CLICK: (912, 884)\nstep 5: PRESS_HOME\nstep 6: CLICK: (618, 149)\nstep 7: CLICK: (951, 144)\nstep 8: CLICK: (293, 639)\nstep 9: CLICK: (224, 901)\nstep 10: TYPE: Mariners : Twins is 3:6\nstep 11: CLICK: (844, 481)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (161, 598)\nstep 2: CLICK: (351, 379)\nstep 3: TYPE: baseball scores\nstep 4: CLICK: (912, 884)\nstep 5: PRESS_HOME\nstep 6: CLICK: (618, 149)\nstep 7: CLICK: (951, 144)\nstep 8: CLICK: (293, 639)\nstep 9: CLICK: (224, 901)\nstep 10: TYPE: Mariners : Twins is 3:6\nstep 11: CLICK: (844, 481)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Instagram', 'Chrome']\nB: ['Messenger', 'Wikipedia']\nC: ['Gmail', 'Bing: chat with AI & GPT4']\nD: ['X', 'Opera']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_182_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_182_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_182_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_182_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_182_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_182_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_182_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_182_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_182_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_182_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_182_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_182_11.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Triller', 'Simplenote']\nB: ['Pluto TV - Live TV and Movies', 'Google Keep']\nC: ['Youtube', 'Microsoft word']\nD: ['Likee', 'Google Docs']\n", + "question": "The corresponding actions are: step 1: CLICK: (596, 892)\nstep 2: CLICK: (703, 228)\nstep 3: CLICK: (99, 272)\nstep 4: PRESS_HOME\nstep 5: CLICK: (858, 383)\nstep 6: CLICK: (360, 
387)\nstep 7: CLICK: (128, 387)\nstep 8: CLICK: (130, 383)\nstep 9: CLICK: (423, 502)\nstep 10: TYPE: Smart Cities: Technology and Urban Planning\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (596, 892)\nstep 2: CLICK: (703, 228)\nstep 3: CLICK: (99, 272)\nstep 4: PRESS_HOME\nstep 5: CLICK: (858, 383)\nstep 6: CLICK: (360, 387)\nstep 7: CLICK: (128, 387)\nstep 8: CLICK: (130, 383)\nstep 9: CLICK: (423, 502)\nstep 10: TYPE: Smart Cities: Technology and Urban Planning\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Triller', 'Simplenote']\nB: ['Pluto TV - Live TV and Movies', 'Google Keep']\nC: ['Youtube', 'Microsoft word']\nD: ['Likee', 'Google Docs']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_183_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_183_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_183_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_183_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_183_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_183_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_183_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_183_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_183_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_183_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_183_10.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Threads', 'Tumblr']\nB: ['X', 'X']\nC: ['Messenger', 
'Messenger']\nD: ['Instagram', 'Threads']\n", + "question": "The corresponding actions are: step 1: CLICK: (604, 290)\nstep 2: CLICK: (499, 916)\nstep 3: TYPE: I am sad\nstep 4: CLICK: (847, 496)\nstep 5: CLICK: (909, 924)\nstep 6: SCROLL: UP\nstep 7: CLICK: (545, 597)\nstep 8: CLICK: (561, 882)\nstep 9: PRESS_HOME\nstep 10: CLICK: (576, 170)\nstep 11: CLICK: (894, 169)\nstep 12: CLICK: (394, 648)\nstep 13: CLICK: (341, 917)\nstep 14: TYPE: https://www.threads.net/@ct.5024/post/C60c4dFO2YQ/?xmt=AQGzuhpc3kkbQPp3APiJPjh-3Y8uPVOjnWk8g2lv940IEg\nstep 15: CLICK: (856, 447)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (604, 290)\nstep 2: CLICK: (499, 916)\nstep 3: TYPE: I am sad\nstep 4: CLICK: (847, 496)\nstep 5: CLICK: (909, 924)\nstep 6: SCROLL: UP\nstep 7: CLICK: (545, 597)\nstep 8: CLICK: (561, 882)\nstep 9: PRESS_HOME\nstep 10: CLICK: (576, 170)\nstep 11: CLICK: (894, 169)\nstep 12: CLICK: (394, 648)\nstep 13: CLICK: (341, 917)\nstep 14: TYPE: https://www.threads.net/@ct.5024/post/C60c4dFO2YQ/?xmt=AQGzuhpc3kkbQPp3APiJPjh-3Y8uPVOjnWk8g2lv940IEg\nstep 15: CLICK: (856, 447)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Threads', 'Tumblr']\nB: ['X', 'X']\nC: ['Messenger', 'Messenger']\nD: ['Instagram', 'Threads']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_4.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_184_15.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['PlantNet', 'Chime - Mobile Banking']\nB: ['Applock Pro - APP Lock & Guard', 'Venmo']\nC: ['Picturethis', 'Cash App']\nD: ['Setting', 'Google Pay']\n", + "question": "The corresponding actions are: step 1: CLICK: (341, 393)\nstep 2: CLICK: (242, 573)\nstep 3: CLICK: (524, 567)\nstep 4: CLICK: (799, 576)\nstep 5: CLICK: (272, 664)\nstep 6: CLICK: (492, 671)\nstep 7: CLICK: (815, 671)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (876, 623)\nstep 11: PRESS_HOME\nstep 12: CLICK: (133, 527)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (341, 393)\nstep 2: CLICK: (242, 573)\nstep 3: CLICK: (524, 567)\nstep 4: CLICK: (799, 576)\nstep 5: CLICK: (272, 664)\nstep 6: CLICK: (492, 671)\nstep 7: CLICK: (815, 671)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (876, 623)\nstep 11: PRESS_HOME\nstep 12: CLICK: (133, 527)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['PlantNet', 'Chime - Mobile Banking']\nB: ['Applock Pro - APP Lock & Guard', 'Venmo']\nC: ['Picturethis', 'Cash App']\nD: ['Setting', 'Google Pay']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_185_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_185_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_185_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_185_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_185_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_185_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_185_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_185_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_185_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_185_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_185_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_185_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_185_12.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Play Store', 'Setting']\nB: ['Picturethis', 'Contacts']\nC: ['PlantNet', 'Google Play Store']\nD: ['Vaulty:Hide Pictures Videos', 'Plantin']\n", + "question": "The corresponding actions are: step 1: CLICK: (391, 149)\nstep 2: 
CLICK: (391, 149)\nstep 3: TYPE: Tokopedia\nstep 4: CLICK: (933, 882)\nstep 5: CLICK: (616, 499)\nstep 6: CLICK: (324, 395)\nstep 7: CLICK: (761, 565)\nstep 8: PRESS_HOME\nstep 9: CLICK: (186, 635)\nstep 10: CLICK: (288, 823)\nstep 11: SCROLL: UP\nstep 12: CLICK: (441, 645)\nstep 13: CLICK: (802, 157)\nstep 14: TYPE: Tokopedia\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (391, 149)\nstep 2: CLICK: (391, 149)\nstep 3: TYPE: Tokopedia\nstep 4: CLICK: (933, 882)\nstep 5: CLICK: (616, 499)\nstep 6: CLICK: (324, 395)\nstep 7: CLICK: (761, 565)\nstep 8: PRESS_HOME\nstep 9: CLICK: (186, 635)\nstep 10: CLICK: (288, 823)\nstep 11: SCROLL: UP\nstep 12: CLICK: (441, 645)\nstep 13: CLICK: (802, 157)\nstep 14: TYPE: Tokopedia\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Setting']\nB: ['Picturethis', 'Contacts']\nC: ['PlantNet', 'Google Play Store']\nD: ['Vaulty:Hide Pictures Videos', 'Plantin']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_186_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_186_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_186_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_186_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_186_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_186_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_186_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_186_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_186_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_186_9.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_186_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_186_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_186_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_186_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_186_14.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Simplenote', 'Youtube']\nB: ['Dropbox Paper', 'Shorts VotTak: Short Video App']\nC: ['BasicNote - Notes, Notepad', 'Tubi: Movies & Live TV']\nD: ['Microsoft Word', 'Triller']\n", + "question": "The corresponding actions are: step 1: CLICK: (828, 656)\nstep 2: CLICK: (360, 503)\nstep 3: CLICK: (295, 864)\nstep 4: PRESS_HOME\nstep 5: CLICK: (605, 510)\nstep 6: CLICK: (895, 933)\nstep 7: CLICK: (317, 223)\nstep 8: CLICK: (520, 688)\nstep 9: TYPE: Google's AI Course for Beginners(in 10 minutes)!\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (828, 656)\nstep 2: CLICK: (360, 503)\nstep 3: CLICK: (295, 864)\nstep 4: PRESS_HOME\nstep 5: CLICK: (605, 510)\nstep 6: CLICK: (895, 933)\nstep 7: CLICK: (317, 223)\nstep 8: CLICK: (520, 688)\nstep 9: TYPE: Google's AI Course for Beginners(in 10 minutes)!\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Simplenote', 'Youtube']\nB: ['Dropbox Paper', 'Shorts VotTak: Short Video App']\nC: ['BasicNote - Notes, Notepad', 'Tubi: Movies & Live TV']\nD: ['Microsoft Word', 'Triller']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_187_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_187_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_187_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_187_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_187_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_187_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_187_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_187_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_187_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_187_9.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Whatsapp', 'Wikipedia']\nB: ['Threads', 'Chrome']\nC: ['Tumblr', 'Quora']\nD: ['X', 'Edge']\n", + "question": "The corresponding actions are: step 1: CLICK: (127, 663)\nstep 2: CLICK: (313, 275)\nstep 3: TYPE: Steve Jobs\nstep 4: CLICK: (911, 924)\nstep 5: CLICK: (594, 820)\nstep 6: CLICK: (951, 90)\nstep 7: CLICK: (623, 514)\nstep 8: CLICK: (485, 909)\nstep 9: PRESS_HOME\nstep 10: CLICK: (815, 282)\nstep 11: CLICK: (485, 946)\nstep 12: TYPE: 
https://en.wikipedia.org/wiki/Steve_Jobs\nstep 13: CLICK: (900, 638)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (127, 663)\nstep 2: CLICK: (313, 275)\nstep 3: TYPE: Steve Jobs\nstep 4: CLICK: (911, 924)\nstep 5: CLICK: (594, 820)\nstep 6: CLICK: (951, 90)\nstep 7: CLICK: (623, 514)\nstep 8: CLICK: (485, 909)\nstep 9: PRESS_HOME\nstep 10: CLICK: (815, 282)\nstep 11: CLICK: (485, 946)\nstep 12: TYPE: https://en.wikipedia.org/wiki/Steve_Jobs\nstep 13: CLICK: (900, 638)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Whatsapp', 'Wikipedia']\nB: ['Threads', 'Chrome']\nC: ['Tumblr', 'Quora']\nD: ['X', 'Edge']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_188_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_188_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_188_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_188_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_188_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_188_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_188_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_188_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_188_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_188_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_188_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_188_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_188_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_188_13.png" + ], + "output": "B" + }, + 
{ + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Dropbox Paper', 'Indeed Job Search']\nB: ['WPS office', 'LinkedIn: Jobs & Business News']\nC: ['Notepad - Notes and To Do List', 'Indeed Job Search']\nD: ['Google Keep', 'Job Search by ZipRecruiter']\n", + "question": "The corresponding actions are: step 1: CLICK: (801, 667)\nstep 2: CLICK: (361, 95)\nstep 3: TYPE: UI/UX designer\nstep 4: CLICK: (874, 878)\nstep 5: CLICK: (485, 603)\nstep 6: PRESS_HOME\nstep 7: CLICK: (908, 482)\nstep 8: CLICK: (435, 869)\nstep 9: TYPE: Merit America\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (801, 667)\nstep 2: CLICK: (361, 95)\nstep 3: TYPE: UI/UX designer\nstep 4: CLICK: (874, 878)\nstep 5: CLICK: (485, 603)\nstep 6: PRESS_HOME\nstep 7: CLICK: (908, 482)\nstep 8: CLICK: (435, 869)\nstep 9: TYPE: Merit America\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Dropbox Paper', 'Indeed Job Search']\nB: ['WPS office', 'LinkedIn: Jobs & Business News']\nC: ['Notepad - Notes and To Do List', 'Indeed Job Search']\nD: ['Google Keep', 'Job Search by ZipRecruiter']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_189_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_189_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_189_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_189_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_189_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_189_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_189_6.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_189_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_189_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_189_9.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['TickTick', 'Spotify']\nB: ['Microsoft to do', 'Pandora']\nC: ['Any.do', 'Amazon Music']\nD: ['To-Do List', 'iHeart: Music, Radio, Podcasts']\n", + "question": "The corresponding actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (395, 262)\nstep 3: CLICK: (56, 79)\nstep 4: CLICK: (844, 80)\nstep 5: CLICK: (386, 585)\nstep 6: PRESS_HOME\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (619, 403)\nstep 9: CLICK: (502, 686)\nstep 10: TYPE: do yoga with this\nstep 11: CLICK: (882, 640)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (395, 262)\nstep 3: CLICK: (56, 79)\nstep 4: CLICK: (844, 80)\nstep 5: CLICK: (386, 585)\nstep 6: PRESS_HOME\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (619, 403)\nstep 9: CLICK: (502, 686)\nstep 10: TYPE: do yoga with this\nstep 11: CLICK: (882, 640)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['TickTick', 'Spotify']\nB: ['Microsoft to do', 'Pandora']\nC: ['Any.do', 'Amazon Music']\nD: ['To-Do List', 'iHeart: Music, Radio, Podcasts']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_190_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_190_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_190_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_190_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_190_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_190_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_190_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_190_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_190_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_190_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_190_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_190_11.png" + ], + "output": "D" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Microsoft Copilot', 'wikiHow']\nB: ['GenZArt:Fast AI Art Generator', 'Chrome']\nC: ['Chatty - AI Assistant', 'Firefox']\nD: ['WOMBO Dream-AI Art Generator', 'DuckDuckGo']\n", + "question": "The corresponding actions are: step 1: CLICK: (115, 387)\nstep 2: CLICK: (255, 870)\nstep 3: TYPE: tell me about Bayes' theorem\nstep 4: CLICK: 
(911, 590)\nstep 5: SCROLL: DOWN\nstep 6: PRESS_HOME\nstep 7: CLICK: (148, 235)\nstep 8: CLICK: (354, 60)\nstep 9: TYPE: Bayes' theorem\nstep 10: CLICK: (214, 199)\nstep 11: CLICK: (217, 488)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (115, 387)\nstep 2: CLICK: (255, 870)\nstep 3: TYPE: tell me about Bayes' theorem\nstep 4: CLICK: (911, 590)\nstep 5: SCROLL: DOWN\nstep 6: PRESS_HOME\nstep 7: CLICK: (148, 235)\nstep 8: CLICK: (354, 60)\nstep 9: TYPE: Bayes' theorem\nstep 10: CLICK: (214, 199)\nstep 11: CLICK: (217, 488)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Microsoft Copilot', 'wikiHow']\nB: ['GenZArt:Fast AI Art Generator', 'Chrome']\nC: ['Chatty - AI Assistant', 'Firefox']\nD: ['WOMBO Dream-AI Art Generator', 'DuckDuckGo']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_191_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_191_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_191_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_191_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_191_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_191_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_191_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_191_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_191_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_191_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_191_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_191_11.png" + ], + "output": "C" + }, + { + "task": 
"gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Simple Calendar', 'Tiktok']\nB: ['Clock', 'Youtube']\nC: ['Calculator Plus with History', 'Pluto TV - Live TV and Movies']\nD: ['ClevCalc - Calculator', 'Shorts VotTak: Short Video App']\n", + "question": "The corresponding actions are: step 1: CLICK: (811, 612)\nstep 2: TYPE: Makeup look\nstep 3: CLICK: (899, 885)\nstep 4: CLICK: (472, 318)\nstep 5: PRESS_HOME\nstep 6: CLICK: (386, 215)\nstep 7: TYPE: 1800\nstep 8: CLICK: (427, 751)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (811, 612)\nstep 2: TYPE: Makeup look\nstep 3: CLICK: (899, 885)\nstep 4: CLICK: (472, 318)\nstep 5: PRESS_HOME\nstep 6: CLICK: (386, 215)\nstep 7: TYPE: 1800\nstep 8: CLICK: (427, 751)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Simple Calendar', 'Tiktok']\nB: ['Clock', 'Youtube']\nC: ['Calculator Plus with History', 'Pluto TV - Live TV and Movies']\nD: ['ClevCalc - Calculator', 'Shorts VotTak: Short Video App']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_192_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_192_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_192_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_192_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_192_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_192_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_192_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_192_7.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_192_8.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['DuckDuckgo', 'Setting']\nB: ['wikiHow', 'PlantNet']\nC: ['Quora', 'Contacts']\nD: ['Firefox', 'Tripadvisor']\n", + "question": "The corresponding actions are: step 1: CLICK: (374, 110)\nstep 2: CLICK: (389, 77)\nstep 3: TYPE: minimalist pictures\nstep 4: CLICK: (443, 143)\nstep 5: CLICK: (238, 133)\nstep 6: CLICK: (260, 608)\nstep 7: SCROLL: UP\nstep 8: CLICK: (488, 522)\nstep 9: PRESS_HOME\nstep 10: SCROLL: LEFT\nstep 11: CLICK: (153, 246)\nstep 12: CLICK: (588, 244)\nstep 13: CLICK: (279, 402)\nstep 14: CLICK: (175, 255)\nstep 15: CLICK: (822, 76)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (374, 110)\nstep 2: CLICK: (389, 77)\nstep 3: TYPE: minimalist pictures\nstep 4: CLICK: (443, 143)\nstep 5: CLICK: (238, 133)\nstep 6: CLICK: (260, 608)\nstep 7: SCROLL: UP\nstep 8: CLICK: (488, 522)\nstep 9: PRESS_HOME\nstep 10: SCROLL: LEFT\nstep 11: CLICK: (153, 246)\nstep 12: CLICK: (588, 244)\nstep 13: CLICK: (279, 402)\nstep 14: CLICK: (175, 255)\nstep 15: CLICK: (822, 76)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['DuckDuckgo', 'Setting']\nB: ['wikiHow', 'PlantNet']\nC: ['Quora', 'Contacts']\nD: ['Firefox', 'Tripadvisor']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_3.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_193_15.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Zoho Meeting', 'Messenger']\nB: ['Microsoft Teams', 'Whatsapp']\nC: ['ZOOM Cloud Meetings', 'Instagram']\nD: ['Google Meet', 'Tumblr']\n", + "question": "The corresponding actions are: step 1: CLICK: (406, 380)\nstep 2: CLICK: (153, 149)\nstep 3: CLICK: (401, 310)\nstep 4: CLICK: (488, 942)\nstep 5: CLICK: (547, 100)\nstep 6: PRESS_HOME\nstep 7: CLICK: (620, 121)\nstep 8: CLICK: (938, 76)\nstep 9: CLICK: (406, 419)\nstep 10: CLICK: (289, 933)\nstep 11: TYPE: 9298916954\nstep 12: SCROLL: UP\nstep 13: CLICK: (858, 594)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (406, 380)\nstep 2: CLICK: (153, 149)\nstep 3: CLICK: (401, 310)\nstep 4: CLICK: (488, 942)\nstep 5: CLICK: (547, 100)\nstep 6: PRESS_HOME\nstep 7: CLICK: (620, 121)\nstep 8: CLICK: (938, 76)\nstep 9: CLICK: (406, 419)\nstep 10: CLICK: (289, 933)\nstep 11: TYPE: 9298916954\nstep 12: SCROLL: UP\nstep 13: CLICK: (858, 594)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Zoho Meeting', 'Messenger']\nB: ['Microsoft Teams', 'Whatsapp']\nC: ['ZOOM Cloud Meetings', 'Instagram']\nD: ['Google Meet', 'Tumblr']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_194_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_194_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_194_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_194_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_194_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_194_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_194_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_194_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_194_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_194_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_194_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_194_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_194_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_194_13.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Tumblr', 'Google Docs']\nB: ['X', 'Dropbox Paper']\nC: ['Facebook', 'BasicNote - Notes, Notepad']\nD: ['Whatsapp', 'Notepad - Notes and To Do 
List']\n", + "question": "The corresponding actions are: step 1: CLICK: (153, 377)\nstep 2: CLICK: (929, 242)\nstep 3: CLICK: (336, 445)\nstep 4: PRESS_HOME\nstep 5: CLICK: (138, 249)\nstep 6: CLICK: (624, 944)\nstep 7: CLICK: (481, 223)\nstep 8: CLICK: (352, 896)\nstep 9: TYPE: https://docs.google.com/document/d/1SxaVLphFkdlDbR8lDUqNgyrJ4X0NOi9NJZcavp01Cjo/edit?usp=drivesdk\nstep 10: CLICK: (914, 601)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (153, 377)\nstep 2: CLICK: (929, 242)\nstep 3: CLICK: (336, 445)\nstep 4: PRESS_HOME\nstep 5: CLICK: (138, 249)\nstep 6: CLICK: (624, 944)\nstep 7: CLICK: (481, 223)\nstep 8: CLICK: (352, 896)\nstep 9: TYPE: https://docs.google.com/document/d/1SxaVLphFkdlDbR8lDUqNgyrJ4X0NOi9NJZcavp01Cjo/edit?usp=drivesdk\nstep 10: CLICK: (914, 601)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tumblr', 'Google Docs']\nB: ['X', 'Dropbox Paper']\nC: ['Facebook', 'BasicNote - Notes, Notepad']\nD: ['Whatsapp', 'Notepad - Notes and To Do List']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_195_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_195_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_195_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_195_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_195_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_195_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_195_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_195_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_195_8.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_195_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_195_10.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Firefox', 'Investing.com']\nB: ['Opera', 'Cash App']\nC: ['Quora', 'PayPal - Send, Shop, Manage']\nD: ['Wikipedia', 'Venmo']\n", + "question": "The corresponding actions are: step 1: CLICK: (133, 259)\nstep 2: CLICK: (275, 83)\nstep 3: TYPE: Facebook's stock market news\nstep 4: CLICK: (942, 897)\nstep 5: SCROLL: UP\nstep 6: CLICK: (290, 718)\nstep 7: PRESS_HOME\nstep 8: CLICK: (393, 380)\nstep 9: CLICK: (934, 267)\nstep 10: CLICK: (911, 76)\nstep 11: TYPE: Facebook\nstep 12: CLICK: (370, 242)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (133, 259)\nstep 2: CLICK: (275, 83)\nstep 3: TYPE: Facebook's stock market news\nstep 4: CLICK: (942, 897)\nstep 5: SCROLL: UP\nstep 6: CLICK: (290, 718)\nstep 7: PRESS_HOME\nstep 8: CLICK: (393, 380)\nstep 9: CLICK: (934, 267)\nstep 10: CLICK: (911, 76)\nstep 11: TYPE: Facebook\nstep 12: CLICK: (370, 242)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Firefox', 'Investing.com']\nB: ['Opera', 'Cash App']\nC: ['Quora', 'PayPal - Send, Shop, Manage']\nD: ['Wikipedia', 'Venmo']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_196_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_196_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_196_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_196_3.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_196_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_196_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_196_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_196_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_196_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_196_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_196_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_196_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_196_12.png" + ], + "output": "A" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Setting', 'iNaturalist']\nB: ['Plantin', 'TradingView: Track All Markets']\nC: ['Google Play Store', 'Setting']\nD: ['TradingView: Track All Markets', 'Picturethis']\n", + "question": "The corresponding actions are: step 1: CLICK: (423, 707)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (123, 697)\nstep 5: CLICK: (518, 316)\nstep 6: CLICK: (473, 388)\nstep 7: CLICK: (482, 514)\nstep 8: CLICK: (978, 73)\nstep 9: TYPE: Turkish\nstep 10: CLICK: (419, 139)\nstep 11: CLICK: (413, 171)\nstep 12: SCROLL: UP\nstep 13: CLICK: (828, 608)\nstep 14: PRESS_HOME\nstep 15: CLICK: (131, 714)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (423, 707)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (123, 697)\nstep 5: CLICK: (518, 316)\nstep 6: CLICK: (473, 388)\nstep 7: CLICK: (482, 514)\nstep 8: CLICK: (978, 73)\nstep 9: TYPE: Turkish\nstep 10: CLICK: (419, 139)\nstep 11: CLICK: (413, 171)\nstep 12: SCROLL: UP\nstep 13: CLICK: (828, 608)\nstep 14: PRESS_HOME\nstep 15: CLICK: (131, 714)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Setting', 'iNaturalist']\nB: ['Plantin', 'TradingView: Track All Markets']\nC: ['Google Play Store', 'Setting']\nD: ['TradingView: Track All Markets', 'Picturethis']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_197_15.png" + ], + "output": "C" + }, + { + "task": "gui_app_recognition", + 
"visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Google Meet', 'Facebook']\nB: ['ZOOM Cloud Meetings', 'Gmail']\nC: ['Zoho Meeting', 'Instagram']\nD: ['Microsoft Teams', 'Threads']\n", + "question": "The corresponding actions are: step 1: CLICK: (596, 555)\nstep 2: CLICK: (98, 147)\nstep 3: CLICK: (431, 291)\nstep 4: CLICK: (538, 958)\nstep 5: CLICK: (462, 93)\nstep 6: PRESS_HOME\nstep 7: SCROLL: UP\nstep 8: CLICK: (375, 151)\nstep 9: CLICK: (697, 876)\nstep 10: TYPE: caba62244@gmail.com\nstep 11: CLICK: (103, 264)\nstep 12: CLICK: (221, 312)\nstep 13: TYPE: 9198916954\nstep 14: CLICK: (846, 76)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (596, 555)\nstep 2: CLICK: (98, 147)\nstep 3: CLICK: (431, 291)\nstep 4: CLICK: (538, 958)\nstep 5: CLICK: (462, 93)\nstep 6: PRESS_HOME\nstep 7: SCROLL: UP\nstep 8: CLICK: (375, 151)\nstep 9: CLICK: (697, 876)\nstep 10: TYPE: caba62244@gmail.com\nstep 11: CLICK: (103, 264)\nstep 12: CLICK: (221, 312)\nstep 13: TYPE: 9198916954\nstep 14: CLICK: (846, 76)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Meet', 'Facebook']\nB: ['ZOOM Cloud Meetings', 'Gmail']\nC: ['Zoho Meeting', 'Instagram']\nD: ['Microsoft Teams', 'Threads']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_198_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_198_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_198_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_198_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_198_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_198_5.png", + 
"../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_198_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_198_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_198_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_198_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_198_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_198_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_198_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_198_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_198_14.png" + ], + "output": "B" + }, + { + "task": "gui_app_recognition", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: ['Microsoft to do', 'DuckDuckgo']\nB: ['To-Do List', 'Bing: chat with AI & GPT4']\nC: ['TickTick', 'Edge']\nD: ['Todoist', 'Quora']\n", + "question": "The corresponding actions are: step 1: CLICK: (791, 491)\nstep 2: CLICK: (472, 82)\nstep 3: TYPE: spaceX rocket launch\nstep 4: CLICK: (192, 166)\nstep 5: CLICK: (471, 566)\nstep 6: CLICK: (714, 715)\nstep 7: CLICK: (278, 88)\nstep 8: TYPE: next spaceX rocket launch\nstep 9: CLICK: (240, 166)\nstep 10: CLICK: (408, 330)\nstep 11: PRESS_RECENT\nstep 12: CLICK: (103, 287)\nstep 13: CLICK: (477, 841)\nstep 14: TYPE: SpaceX rocket launch\nstep 15: CLICK: (956, 432)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", + "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (791, 491)\nstep 2: CLICK: (472, 82)\nstep 3: TYPE: spaceX rocket launch\nstep 4: CLICK: (192, 166)\nstep 5: CLICK: (471, 566)\nstep 6: CLICK: (714, 715)\nstep 7: CLICK: (278, 88)\nstep 8: TYPE: next spaceX rocket launch\nstep 9: CLICK: (240, 166)\nstep 10: CLICK: (408, 330)\nstep 11: PRESS_RECENT\nstep 12: CLICK: (103, 287)\nstep 13: CLICK: (477, 841)\nstep 14: TYPE: SpaceX rocket launch\nstep 15: CLICK: (956, 432)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Microsoft to do', 'DuckDuckgo']\nB: ['To-Do List', 'Bing: chat with AI & GPT4']\nC: ['TickTick', 'Edge']\nD: ['Todoist', 'Quora']\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_0.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_1.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_2.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_3.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_4.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_5.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_6.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_7.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_8.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_9.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_10.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_11.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_12.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_13.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_14.png", + "../MMIU-Benchmark/gui_app_recognition/gui_app_recognition_199_15.png" + ], + "output": "A" + }, + { + "task": 
"gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: THE OUTNET\nB: CLICK: (494, 328)\nC: PRESS_HOME\nD: CLICK: (141, 243)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (149, 240)\nstep 3: CLICK: (174, 516)\nstep 4: CLICK: (586, 708)\nstep 5: CLICK: (184, 534)\nstep 6: CLICK: (587, 714)\nstep 7: CLICK: (136, 626)\nstep 8: CLICK: (877, 777)\nstep 9: CLICK: (208, 701)\nstep 10: CLICK: (858, 899)\nI want to Utilize 'Basic Calculator: GPA & Math' to compute today's total cost by adding 67 and 4. Once calculated, record the result in 'Wallet: Budget Money Manager'. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize 'Basic Calculator: GPA & Math' to compute today's total cost by adding 67 and 4. 
Once calculated, record the result in 'Wallet: Budget Money Manager'.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (149, 240)\nstep 3: CLICK: (174, 516)\nstep 4: CLICK: (586, 708)\nstep 5: CLICK: (184, 534)\nstep 6: CLICK: (587, 714)\nstep 7: CLICK: (136, 626)\nstep 8: CLICK: (877, 777)\nstep 9: CLICK: (208, 701)\nstep 10: CLICK: (858, 899)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: THE OUTNET\nB: CLICK: (494, 328)\nC: PRESS_HOME\nD: CLICK: (141, 243)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_0_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_0_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_0_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_0_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_0_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_0_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_0_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_0_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_0_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_0_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_0_10.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (422, 378)\nB: CLICK: (712, 596)\nC: SCROLL: UP\nD: CLICK: (523, 613)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (317, 139)\nstep 2: CLICK: (205, 613)\nstep 3: CLICK: (331, 255)\nstep 4: TYPE: Business\nstep 5: CLICK: (749, 63)\nstep 6: CLICK: (588, 602)\nstep 7: PRESS_HOME\nstep 8: CLICK: (337, 147)\nstep 9: SCROLL: UP\nstep 10: CLICK: (560, 656)\nI want to Arrange a business meeting with caba62244@gmail.com, ensure to send out the invitations via Gmail, and use ZOOM Cloud Meetings for the meeting. Don't forget to set an alarm clock for the scheduled time using the Clock app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Arrange a business meeting with caba62244@gmail.com, ensure to send out the invitations via Gmail, and use ZOOM Cloud Meetings for the meeting. Don't forget to set an alarm clock for the scheduled time using the Clock app.\nThe historical actions are: step 1: CLICK: (317, 139)\nstep 2: CLICK: (205, 613)\nstep 3: CLICK: (331, 255)\nstep 4: TYPE: Business\nstep 5: CLICK: (749, 63)\nstep 6: CLICK: (588, 602)\nstep 7: PRESS_HOME\nstep 8: CLICK: (337, 147)\nstep 9: SCROLL: UP\nstep 10: CLICK: (560, 656)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (422, 378)\nB: CLICK: (712, 596)\nC: SCROLL: UP\nD: CLICK: (523, 613)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_1_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_1_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_1_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_1_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_1_4.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_1_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_1_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_1_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_1_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_1_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_1_10.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (938, 907)\nB: CLICK: (941, 892)\nC: CLICK: (949, 239)\nD: TYPE: craft beer tasting events\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (547, 135)\nstep 2: CLICK: (754, 149)\nstep 3: PRESS_HOME\nstep 4: CLICK: (908, 315)\nI want to Open Spotify and listen to a podcast episode on yoga for beginners, then use Things to create a to-do list for your tasks tomorrow. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Open Spotify and listen to a podcast episode on yoga for beginners, then use Things to create a to-do list for your tasks tomorrow.\nThe historical actions are: step 1: CLICK: (547, 135)\nstep 2: CLICK: (754, 149)\nstep 3: PRESS_HOME\nstep 4: CLICK: (908, 315)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (938, 907)\nB: CLICK: (941, 892)\nC: CLICK: (949, 239)\nD: TYPE: craft beer tasting events\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_2_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_2_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_2_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_2_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_2_4.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: CLICK: (195, 529)\nC: COMPLETE\nD: TYPE: Weather\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (568, 506)\nstep 2: CLICK: (256, 79)\nstep 3: TYPE: what is the latest Terminator movie\nstep 4: CLICK: (865, 889)\nI want to Use app X to find the most recent Terminator movie, then consult your Calendar app to identify a free evening to watch it. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use app X to find the most recent Terminator movie, then consult your Calendar app to identify a free evening to watch it.\nThe historical actions are: step 1: CLICK: (568, 506)\nstep 2: CLICK: (256, 79)\nstep 3: TYPE: what is the latest Terminator movie\nstep 4: CLICK: (865, 889)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (195, 529)\nC: COMPLETE\nD: TYPE: Weather\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_3_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_3_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_3_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_3_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_3_4.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (347, 249)\nB: CLICK: (576, 247)\nC: TYPE: when is the Fashion week in Paris\nD: CLICK: (321, 153)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (694, 546)\nstep 2: TYPE: the best farmers market in San Jose\nstep 3: CLICK: (899, 686)\nstep 4: PRESS_HOME\nI want to Using Duckduckgo to find the best farmers market in your local city, and then navigate to it with Firefox. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using Duckduckgo to find the best farmers market in your local city, and then navigate to it with Firefox.\nThe historical actions are: step 1: CLICK: (694, 546)\nstep 2: TYPE: the best farmers market in San Jose\nstep 3: CLICK: (899, 686)\nstep 4: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (347, 249)\nB: CLICK: (576, 247)\nC: TYPE: when is the Fashion week in Paris\nD: CLICK: (321, 153)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_4_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_4_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_4_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_4_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_4_4.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: I am happy \nB: CLICK: (889, 924)\nC: COMPLETE\nD: CLICK: (822, 133)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (129, 170)\nstep 2: CLICK: (679, 145)\nstep 3: CLICK: (651, 222)\nstep 4: CLICK: (422, 395)\nI want to Post your current feelings on Facebook and then share the same post with Victor Jame via Messenger. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Post your current feelings on Facebook and then share the same post with Victor Jame via Messenger.\nThe historical actions are: step 1: CLICK: (129, 170)\nstep 2: CLICK: (679, 145)\nstep 3: CLICK: (651, 222)\nstep 4: CLICK: (422, 395)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: I am happy \nB: CLICK: (889, 924)\nC: COMPLETE\nD: CLICK: (822, 133)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_5_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_5_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_5_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_5_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_5_4.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (824, 254)\nB: TYPE: fashion show events\nC: PRESS_HOME\nD: CLICK: (164, 720)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (443, 917)\nstep 2: CLICK: (205, 436)\nstep 3: TYPE: Nvidia's stock market news\nstep 4: CLICK: (855, 897)\nI want to Use Chrome to search for today's stock market news of the company Nvidia, and then open TradingView: Track All Markets to check the stock price trends. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Chrome to search for today's stock market news of the company Nvidia, and then open TradingView: Track All Markets to check the stock price trends.\nThe historical actions are: step 1: CLICK: (443, 917)\nstep 2: CLICK: (205, 436)\nstep 3: TYPE: Nvidia's stock market news\nstep 4: CLICK: (855, 897)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (824, 254)\nB: TYPE: fashion show events\nC: PRESS_HOME\nD: CLICK: (164, 720)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_6_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_6_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_6_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_6_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_6_4.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (931, 915)\nB: CLICK: (508, 366)\nC: CLICK: (680, 485)\nD: TYPE: hiking trail\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (873, 628)\nstep 2: CLICK: (494, 246)\nstep 3: TYPE: biographical movies\nstep 4: CLICK: (961, 892)\nstep 5: SCROLL: UP\nstep 6: CLICK: (461, 458)\nstep 7: CLICK: (179, 457)\nstep 8: PRESS_HOME\nstep 9: CLICK: (123, 498)\nstep 10: CLICK: (431, 144)\nstep 11: TYPE: snacks\nstep 12: CLICK: (949, 878)\nstep 13: SCROLL: UP\nstep 14: CLICK: (636, 586)\nstep 15: SCROLL: UP\nI want to Organize a movie night by selecting a biographical film on Opera, adding snacks to your Amazon cart, sending out invites to Victor James via Facebook Messenger, and setting a reminder on the Clock app. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Organize a movie night by selecting a biographical film on Opera, adding snacks to your Amazon cart, sending out invites to Victor James via Facebook Messenger, and setting a reminder on the Clock app.\nThe historical actions are: step 1: CLICK: (873, 628)\nstep 2: CLICK: (494, 246)\nstep 3: TYPE: biographical movies\nstep 4: CLICK: (961, 892)\nstep 5: SCROLL: UP\nstep 6: CLICK: (461, 458)\nstep 7: CLICK: (179, 457)\nstep 8: PRESS_HOME\nstep 9: CLICK: (123, 498)\nstep 10: CLICK: (431, 144)\nstep 11: TYPE: snacks\nstep 12: CLICK: (949, 878)\nstep 13: SCROLL: UP\nstep 14: CLICK: (636, 586)\nstep 15: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (931, 915)\nB: CLICK: (508, 366)\nC: CLICK: (680, 485)\nD: TYPE: hiking trail\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_9.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_13.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_14.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_7_15.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (171, 160)\nB: SCROLL: UP\nC: CLICK: (264, 542)\nD: CLICK: (401, 729)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (511, 906)\nstep 2: CLICK: (406, 44)\nstep 3: CLICK: (388, 108)\nstep 4: TYPE: advanture movie\nstep 5: CLICK: (295, 263)\nstep 6: SCROLL: UP\nI want to Use Chrome to search for an Adventure movie, then watch it on Pluto TV - Live TV and Movies. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Chrome to search for an Adventure movie, then watch it on Pluto TV - Live TV and Movies.\nThe historical actions are: step 1: CLICK: (511, 906)\nstep 2: CLICK: (406, 44)\nstep 3: CLICK: (388, 108)\nstep 4: TYPE: advanture movie\nstep 5: CLICK: (295, 263)\nstep 6: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (171, 160)\nB: SCROLL: UP\nC: CLICK: (264, 542)\nD: CLICK: (401, 729)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_8_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_8_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_8_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_8_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_8_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_8_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_8_6.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (611, 358)\nB: TYPE: 2022 nobel prize winners in physics:\nC: CLICK: (430, 932)\nD: CLICK: (945, 599)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (705, 217)\nstep 2: CLICK: (286, 251)\nstep 3: CLICK: (916, 154)\nstep 4: TYPE: 2022 nobel prize winners in physics\nstep 5: CLICK: (897, 859)\nstep 6: LONG_PRESS: (99, 549)\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (144, 489)\nstep 9: PRESS_HOME\nstep 10: CLICK: (844, 499)\nstep 11: CLICK: (236, 357)\nstep 12: CLICK: (236, 357)\nI want to Using Firefox, search for the winners of the 2022 Nobel Prize in Physics and then use WPS Office to record the information. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Firefox, search for the winners of the 2022 Nobel Prize in Physics and then use WPS Office to record the information.\nThe historical actions are: step 1: CLICK: (705, 217)\nstep 2: CLICK: (286, 251)\nstep 3: CLICK: (916, 154)\nstep 4: TYPE: 2022 nobel prize winners in physics\nstep 5: CLICK: (897, 859)\nstep 6: LONG_PRESS: (99, 549)\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (144, 489)\nstep 9: PRESS_HOME\nstep 10: CLICK: (844, 499)\nstep 11: CLICK: (236, 357)\nstep 12: CLICK: (236, 357)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (611, 358)\nB: TYPE: 2022 nobel prize winners in physics:\nC: CLICK: (430, 932)\nD: CLICK: (945, 599)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_9_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_9_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_9_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_9_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_9_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_9_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_9_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_9_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_9_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_9_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_9_10.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_9_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_9_12.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (535, 201)\nB: CLICK: (798, 431)\nC: CLICK: (893, 928)\nD: CLICK: (934, 437)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (413, 363)\nstep 2: CLICK: (936, 87)\nstep 3: TYPE: the best ice cream parlor in Los Angeles\nstep 4: CLICK: (965, 62)\nstep 5: CLICK: (199, 395)\nstep 6: PRESS_HOME\nstep 7: CLICK: (190, 140)\nstep 8: CLICK: (165, 840)\nI want to Utilize GPS and Tiktok to locate the top-rated ice cream parlor in your local city and navigate to it. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize GPS and Tiktok to locate the top-rated ice cream parlor in your local city and navigate to it.\nThe historical actions are: step 1: CLICK: (413, 363)\nstep 2: CLICK: (936, 87)\nstep 3: TYPE: the best ice cream parlor in Los Angeles\nstep 4: CLICK: (965, 62)\nstep 5: CLICK: (199, 395)\nstep 6: PRESS_HOME\nstep 7: CLICK: (190, 140)\nstep 8: CLICK: (165, 840)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (535, 201)\nB: CLICK: (798, 431)\nC: CLICK: (893, 928)\nD: CLICK: (934, 437)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_10_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_10_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_10_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_10_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_10_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_10_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_10_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_10_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_10_8.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: Chinese\nB: PRESS_HOME\nC: CLICK: (572, 509)\nD: TYPE: Washington:Minchigan is 13:34\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (831, 502)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (399, 730)\nstep 5: CLICK: (368, 272)\nstep 6: CLICK: (374, 327)\nstep 7: CLICK: (344, 473)\nstep 8: CLICK: (936, 59)\nI want to First, navigate to the 'Setting' app on your phone and switch the language setting to Chinese (Simplified). Then, open the 'Photos' app to verify the change. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, navigate to the 'Setting' app on your phone and switch the language setting to Chinese (Simplified). Then, open the 'Photos' app to verify the change.\nThe historical actions are: step 1: CLICK: (831, 502)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (399, 730)\nstep 5: CLICK: (368, 272)\nstep 6: CLICK: (374, 327)\nstep 7: CLICK: (344, 473)\nstep 8: CLICK: (936, 59)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Chinese\nB: PRESS_HOME\nC: CLICK: (572, 509)\nD: TYPE: Washington:Minchigan is 13:34\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_11_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_11_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_11_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_11_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_11_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_11_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_11_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_11_7.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_11_8.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (58, 357)\nB: PRESS_HOME\nC: CLICK: (438, 929)\nD: TYPE: https://sg.docworkspace.com/d/sIDzG2tP9AcPpp7EG\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (830, 485)\nstep 2: CLICK: (318, 460)\nstep 3: CLICK: (802, 150)\nstep 4: CLICK: (926, 155)\nstep 5: CLICK: (601, 423)\nstep 6: CLICK: (783, 150)\nstep 7: CLICK: (713, 674)\nstep 8: PRESS_HOME\nstep 9: CLICK: (384, 150)\nstep 10: CLICK: (699, 817)\nstep 11: TYPE: caba62244@gmail.com\nstep 12: CLICK: (465, 496)\nstep 13: SCROLL: UP\nstep 14: CLICK: (268, 478)\nI want to Locate the working file on your phone using WPS Office and then email it to caba62244@gmail.com via Gmail. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Locate the working file on your phone using WPS Office and then email it to caba62244@gmail.com via Gmail.\nThe historical actions are: step 1: CLICK: (830, 485)\nstep 2: CLICK: (318, 460)\nstep 3: CLICK: (802, 150)\nstep 4: CLICK: (926, 155)\nstep 5: CLICK: (601, 423)\nstep 6: CLICK: (783, 150)\nstep 7: CLICK: (713, 674)\nstep 8: PRESS_HOME\nstep 9: CLICK: (384, 150)\nstep 10: CLICK: (699, 817)\nstep 11: TYPE: caba62244@gmail.com\nstep 12: CLICK: (465, 496)\nstep 13: SCROLL: UP\nstep 14: CLICK: (268, 478)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (58, 357)\nB: PRESS_HOME\nC: CLICK: (438, 929)\nD: TYPE: https://sg.docworkspace.com/d/sIDzG2tP9AcPpp7EG\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_12_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_12_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_12_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_12_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_12_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_12_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_12_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_12_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_12_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_12_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_12_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_12_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_12_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_12_13.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_12_14.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (139, 640)\nB: COMPLETE\nC: PRESS_HOME\nD: CLICK: (937, 898)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (177, 616)\nstep 2: CLICK: (355, 371)\nstep 3: TYPE: hiking trail\nstep 4: CLICK: (930, 869)\nstep 5: CLICK: (490, 761)\nstep 6: PRESS_HOME\nstep 7: CLICK: (602, 494)\nstep 8: CLICK: (923, 567)\nstep 9: TYPE: California\nI want to Use Chrome to search for a new hiking trail, check the weekend weather forecast on Windy.com-Weather Forecast, and then invite Tzhau Jau to join the hike through Instagram. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Chrome to search for a new hiking trail, check the weekend weather forecast on Windy.com-Weather Forecast, and then invite Tzhau Jau to join the hike through Instagram.\nThe historical actions are: step 1: CLICK: (177, 616)\nstep 2: CLICK: (355, 371)\nstep 3: TYPE: hiking trail\nstep 4: CLICK: (930, 869)\nstep 5: CLICK: (490, 761)\nstep 6: PRESS_HOME\nstep 7: CLICK: (602, 494)\nstep 8: CLICK: (923, 567)\nstep 9: TYPE: California\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (139, 640)\nB: COMPLETE\nC: PRESS_HOME\nD: CLICK: (937, 898)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_13_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_13_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_13_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_13_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_13_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_13_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_13_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_13_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_13_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_13_9.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: LEFT\nB: SCROLL: UP\nC: SCROLL: DOWN\nD: CLICK: (31, 960)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (603, 801)\nstep 2: CLICK: (190, 427)\nstep 3: CLICK: (834, 565)\nI want to Open the Firefox Browser to search for the best video blogs on travel vlogs. Then, go to the Setting app to turn up the brightness on your phone. Finally, open the YouTube app to follow the video blogs you found. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open the Firefox Browser to search for the best video blogs on travel vlogs. Then, go to the Setting app to turn up the brightness on your phone. Finally, open the YouTube app to follow the video blogs you found.\nThe historical actions are: step 1: CLICK: (603, 801)\nstep 2: CLICK: (190, 427)\nstep 3: CLICK: (834, 565)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: LEFT\nB: SCROLL: UP\nC: SCROLL: DOWN\nD: CLICK: (31, 960)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_14_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_14_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_14_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_14_3.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (537, 937)\nB: TYPE: Palo Alto Junior Museum and Zoo\nC: PRESS_HOME\nD: COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (425, 262)\nstep 2: CLICK: (161, 71)\nstep 3: TYPE: zoo\nstep 4: CLICK: (867, 684)\nstep 5: PRESS_HOME\nstep 6: CLICK: (433, 409)\nstep 7: CLICK: (350, 616)\nI want to Locate the nearest zoo using Google Map and then book a ride with Uber. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate the nearest zoo using Google Map and then book a ride with Uber.\nThe historical actions are: step 1: CLICK: (425, 262)\nstep 2: CLICK: (161, 71)\nstep 3: TYPE: zoo\nstep 4: CLICK: (867, 684)\nstep 5: PRESS_HOME\nstep 6: CLICK: (433, 409)\nstep 7: CLICK: (350, 616)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (537, 937)\nB: TYPE: Palo Alto Junior Museum and Zoo\nC: PRESS_HOME\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_15_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_15_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_15_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_15_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_15_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_15_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_15_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_15_7.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (516, 921)\nB: TYPE: Load\nC: CLICK: (544, 353)\nD: COMPLETE\n", + "question": 
"The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (864, 553)\nstep 3: CLICK: (654, 748)\nstep 4: CLICK: (747, 758)\nstep 5: CLICK: (840, 749)\nstep 6: CLICK: (581, 751)\nstep 7: CLICK: (630, 761)\nstep 8: PRESS_HOME\nstep 9: CLICK: (709, 713)\nstep 10: CLICK: (399, 889)\nstep 11: CLICK: (506, 266)\nstep 12: CLICK: (552, 516)\nstep 13: CLICK: (484, 618)\nI want to Utilize the 'Scientific calculator plus 991' to compute the sum of 23 and 12 for today's total cost. Once calculated, document this total in the 'Monefy' app for record-keeping purposes. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize the 'Scientific calculator plus 991' to compute the sum of 23 and 12 for today's total cost. 
Once calculated, document this total in the 'Monefy' app for record-keeping purposes.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (864, 553)\nstep 3: CLICK: (654, 748)\nstep 4: CLICK: (747, 758)\nstep 5: CLICK: (840, 749)\nstep 6: CLICK: (581, 751)\nstep 7: CLICK: (630, 761)\nstep 8: PRESS_HOME\nstep 9: CLICK: (709, 713)\nstep 10: CLICK: (399, 889)\nstep 11: CLICK: (506, 266)\nstep 12: CLICK: (552, 516)\nstep 13: CLICK: (484, 618)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (516, 921)\nB: TYPE: Load\nC: CLICK: (544, 353)\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_16_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_16_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_16_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_16_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_16_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_16_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_16_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_16_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_16_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_16_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_16_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_16_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_16_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_16_13.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + 
"options": "A: SCROLL: UP\nB: CLICK: (858, 182)\nC: CLICK: (946, 903)\nD: TYPE: Fever dream high in the quiet of the night\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (846, 810)\nstep 2: SCROLL: UP\nstep 3: CLICK: (315, 426)\nstep 4: SCROLL: RIGHT\nstep 5: SCROLL: RIGHT\nstep 6: CLICK: (339, 300)\nstep 7: PRESS_HOME\nstep 8: SCROLL: UP\nI want to Open Instagram to watch a trending video and then use the Settings app to turn up the volume on your phone. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open Instagram to watch a trending video and then use the Settings app to turn up the volume on your phone.\nThe historical actions are: step 1: CLICK: (846, 810)\nstep 2: SCROLL: UP\nstep 3: CLICK: (315, 426)\nstep 4: SCROLL: RIGHT\nstep 5: SCROLL: RIGHT\nstep 6: CLICK: (339, 300)\nstep 7: PRESS_HOME\nstep 8: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (858, 182)\nC: CLICK: (946, 903)\nD: TYPE: Fever dream high in the quiet of the night\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_17_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_17_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_17_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_17_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_17_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_17_5.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_17_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_17_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_17_8.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (101, 614)\nB: CLICK: (58, 56)\nC: SCROLL: LEFT\nD: CLICK: (925, 918)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (637, 372)\nstep 3: CLICK: (64, 51)\nstep 4: CLICK: (887, 80)\nstep 5: TYPE: statue of liberty\nstep 6: CLICK: (437, 248)\nstep 7: CLICK: (836, 51)\nstep 8: CLICK: (901, 648)\nstep 9: PRESS_HOME\nstep 10: CLICK: (179, 636)\nstep 11: CLICK: (914, 904)\nstep 12: CLICK: (882, 797)\nstep 13: TYPE: a travel guide to Statue of Liberty\nstep 14: CLICK: (199, 163)\nstep 15: CLICK: (420, 654)\nI want to Using Tripadvisor, look up a travel guide for visiting the Statue of Liberty in New York and note down the resource website in Notepad - Notes and To Do List. After that, check AccuWeather to choose a rain-free day for your visit. Finally, book your flight from San Francisco through Expedia. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Tripadvisor, look up a travel guide for visiting the Statue of Liberty in New York and note down the resource website in Notepad - Notes and To Do List. After that, check AccuWeather to choose a rain-free day for your visit. 
Finally, book your flight from San Francisco through Expedia.\nThe historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (637, 372)\nstep 3: CLICK: (64, 51)\nstep 4: CLICK: (887, 80)\nstep 5: TYPE: statue of liberty\nstep 6: CLICK: (437, 248)\nstep 7: CLICK: (836, 51)\nstep 8: CLICK: (901, 648)\nstep 9: PRESS_HOME\nstep 10: CLICK: (179, 636)\nstep 11: CLICK: (914, 904)\nstep 12: CLICK: (882, 797)\nstep 13: TYPE: a travel guide to Statue of Liberty\nstep 14: CLICK: (199, 163)\nstep 15: CLICK: (420, 654)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (101, 614)\nB: CLICK: (58, 56)\nC: SCROLL: LEFT\nD: CLICK: (925, 918)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_13.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_14.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_18_15.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: Deebot\nB: CLICK: (916, 864)\nC: SCROLL: UP\nD: PRESS_HOME\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (524, 81)\nstep 2: CLICK: (766, 85)\nstep 3: TYPE: eclectic\nI want to Using Pinterest, locate an image that showcases an eclectic style. Once found, navigate to the Setting app on your phone to set this image as your wallpaper. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Pinterest, locate an image that showcases an eclectic style. 
Once found, navigate to the Setting app on your phone to set this image as your wallpaper.\nThe historical actions are: step 1: CLICK: (524, 81)\nstep 2: CLICK: (766, 85)\nstep 3: TYPE: eclectic\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Deebot\nB: CLICK: (916, 864)\nC: SCROLL: UP\nD: PRESS_HOME\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_19_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_19_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_19_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_19_3.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: https://www.threads.net/@ct.5024/post/C60c4dFO2YQ/?xmt=AQGzuhpc3kkbQPp3APiJPjh-3Y8uPVOjnWk8g2lv940IEg\nB: CLICK: (308, 253)\nC: SCROLL: UP\nD: COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (604, 290)\nstep 2: CLICK: (499, 916)\nstep 3: TYPE: I am sad\nstep 4: CLICK: (847, 496)\nstep 5: CLICK: (909, 924)\nstep 6: SCROLL: UP\nstep 7: CLICK: (545, 597)\nstep 8: CLICK: (561, 882)\nstep 9: PRESS_HOME\nstep 10: CLICK: (576, 170)\nstep 11: CLICK: (894, 169)\nstep 12: CLICK: (394, 648)\nstep 13: CLICK: (341, 917)\nI want to Use Instagram to post today's feelings, then share that post on Threads with Tzhau Jau. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Instagram to post today's feelings, then share that post on Threads with Tzhau Jau.\nThe historical actions are: step 1: CLICK: (604, 290)\nstep 2: CLICK: (499, 916)\nstep 3: TYPE: I am sad\nstep 4: CLICK: (847, 496)\nstep 5: CLICK: (909, 924)\nstep 6: SCROLL: UP\nstep 7: CLICK: (545, 597)\nstep 8: CLICK: (561, 882)\nstep 9: PRESS_HOME\nstep 10: CLICK: (576, 170)\nstep 11: CLICK: (894, 169)\nstep 12: CLICK: (394, 648)\nstep 13: CLICK: (341, 917)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: https://www.threads.net/@ct.5024/post/C60c4dFO2YQ/?xmt=AQGzuhpc3kkbQPp3APiJPjh-3Y8uPVOjnWk8g2lv940IEg\nB: CLICK: (308, 253)\nC: SCROLL: UP\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_20_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_20_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_20_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_20_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_20_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_20_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_20_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_20_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_20_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_20_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_20_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_20_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_20_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_20_13.png" + ], + "output": "A" + }, + { + "task": 
"gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (418, 649)\nB: TYPE: vanilla extract\nC: CLICK: (871, 688)\nD: TYPE: 3D Printer Course for Beginners\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (311, 349)\nstep 2: CLICK: (915, 82)\nstep 3: TYPE: knitting class\nstep 4: CLICK: (924, 875)\nstep 5: CLICK: (224, 678)\nstep 6: PRESS_HOME\nstep 7: CLICK: (687, 69)\nstep 8: CLICK: (388, 66)\nstep 9: TYPE: knitting wool\nstep 10: CLICK: (944, 879)\nI want to Find a knitting class on TikTok, purchase the needed materials through SHEIN, and create a calendar event to remind you to study using your Calendar app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Find a knitting class on TikTok, purchase the needed materials through SHEIN, and create a calendar event to remind you to study using your Calendar app.\nThe historical actions are: step 1: CLICK: (311, 349)\nstep 2: CLICK: (915, 82)\nstep 3: TYPE: knitting class\nstep 4: CLICK: (924, 875)\nstep 5: CLICK: (224, 678)\nstep 6: PRESS_HOME\nstep 7: CLICK: (687, 69)\nstep 8: CLICK: (388, 66)\nstep 9: TYPE: knitting wool\nstep 10: CLICK: (944, 879)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (418, 649)\nB: TYPE: vanilla extract\nC: CLICK: (871, 688)\nD: TYPE: 3D Printer Course for Beginners\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_21_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_21_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_21_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_21_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_21_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_21_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_21_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_21_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_21_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_21_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_21_10.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: tiktok\nB: PRESS_HOME\nC: CLICK: (386, 650)\nD: CLICK: (324, 928)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (632, 797)\nstep 2: CLICK: (783, 175)\nstep 3: TYPE: bmw most popular car\nstep 4: CLICK: (427, 168)\nI want to Identify the most popular BMW vehicle and verify its price using the CarWale: Buy-Sell New/Used Car app and Chrome. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Identify the most popular BMW vehicle and verify its price using the CarWale: Buy-Sell New/Used Car app and Chrome.\nThe historical actions are: step 1: CLICK: (632, 797)\nstep 2: CLICK: (783, 175)\nstep 3: TYPE: bmw most popular car\nstep 4: CLICK: (427, 168)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: tiktok\nB: PRESS_HOME\nC: CLICK: (386, 650)\nD: CLICK: (324, 928)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_22_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_22_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_22_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_22_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_22_4.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: TYPE: https://www.threads.net/@ct.5024/post/C60dWMKOvyV/?xmt=AQGzrQXOCtihDH5Csh-v2jLg95hKd8qdzO3HygJBDKRrZA\nC: CLICK: (488, 522)\nD: CLICK: (519, 621)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (374, 110)\nstep 2: CLICK: (389, 77)\nstep 3: TYPE: minimalist pictures\nstep 4: CLICK: (443, 143)\nstep 5: CLICK: (238, 133)\nstep 6: CLICK: (260, 608)\nstep 7: SCROLL: UP\nI want to Using DuckDuckGo, search for an image that embodies a minimalist style. Once you have found an appropriate picture, open the Settings app on your phone and set the image as your wallpaper. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using DuckDuckGo, search for an image that embodies a minimalist style. Once you have found an appropriate picture, open the Settings app on your phone and set the image as your wallpaper.\nThe historical actions are: step 1: CLICK: (374, 110)\nstep 2: CLICK: (389, 77)\nstep 3: TYPE: minimalist pictures\nstep 4: CLICK: (443, 143)\nstep 5: CLICK: (238, 133)\nstep 6: CLICK: (260, 608)\nstep 7: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: TYPE: https://www.threads.net/@ct.5024/post/C60dWMKOvyV/?xmt=AQGzrQXOCtihDH5Csh-v2jLg95hKd8qdzO3HygJBDKRrZA\nC: CLICK: (488, 522)\nD: CLICK: (519, 621)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_23_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_23_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_23_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_23_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_23_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_23_5.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_23_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_23_7.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (330, 583)\nB: COMPLETE\nC: SCROLL: UP\nD: TYPE: Colosseum, Rome\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (392, 249)\nstep 3: CLICK: (628, 65)\nstep 4: TYPE: Dyson V11 Torque Drive\nstep 5: CLICK: (885, 907)\nI want to Check and compare the prices for a Dyson V11 Torque Drive across Target and AliExpress shopping apps, and make sure to add the one with the lowest price to your cart. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Check and compare the prices for a Dyson V11 Torque Drive across Target and AliExpress shopping apps, and make sure to add the one with the lowest price to your cart.\nThe historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (392, 249)\nstep 3: CLICK: (628, 65)\nstep 4: TYPE: Dyson V11 Torque Drive\nstep 5: CLICK: (885, 907)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (330, 583)\nB: COMPLETE\nC: SCROLL: UP\nD: TYPE: Colosseum, Rome\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_24_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_24_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_24_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_24_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_24_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_24_5.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (567, 684)\nB: PRESS_HOME\nC: CLICK: (802, 74)\nD: CLICK: (95, 962)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (674, 351)\nstep 2: CLICK: (227, 812)\nstep 3: TYPE: tell me about Fundamental theorem of calculus\nstep 4: CLICK: (902, 486)\nI want to Utilize the 'Bing: chat with AI & GPT4' app to inquire about the Fundamental Theorem of Calculus. Then, confirm the details by searching in your browser using the 'ChatOn - AI Chat Bot Assistant' app. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize the 'Bing: chat with AI & GPT4' app to inquire about the Fundamental Theorem of Calculus. Then, confirm the details by searching in your browser using the 'ChatOn - AI Chat Bot Assistant' app.\nThe historical actions are: step 1: CLICK: (674, 351)\nstep 2: CLICK: (227, 812)\nstep 3: TYPE: tell me about Fundamental theorem of calculus\nstep 4: CLICK: (902, 486)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (567, 684)\nB: PRESS_HOME\nC: CLICK: (802, 74)\nD: CLICK: (95, 962)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_25_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_25_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_25_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_25_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_25_4.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (922, 908)\nB: CLICK: (918, 898)\nC: CLICK: (840, 884)\nD: CLICK: (145, 623)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (853, 374)\nstep 2: CLICK: (824, 49)\nstep 3: TYPE: Video Editing Apps\nstep 4: CLICK: (918, 905)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: SCROLL: RIGHT\nI want to Investigate video editing applications, select and download one using 'Google Play Store'. You can also use 'Facebook' to seek recommendations or reviews. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Investigate video editing applications, select and download one using 'Google Play Store'. You can also use 'Facebook' to seek recommendations or reviews.\nThe historical actions are: step 1: CLICK: (853, 374)\nstep 2: CLICK: (824, 49)\nstep 3: TYPE: Video Editing Apps\nstep 4: CLICK: (918, 905)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: SCROLL: RIGHT\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (922, 908)\nB: CLICK: (918, 898)\nC: CLICK: (840, 884)\nD: CLICK: (145, 623)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_26_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_26_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_26_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_26_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_26_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_26_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_26_6.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_26_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_26_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_26_9.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (488, 921)\nB: PRESS_HOME\nC: CLICK: (223, 124)\nD: CLICK: (219, 943)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (362, 617)\nstep 2: TYPE: fantasy movies\nstep 3: CLICK: (897, 892)\nstep 4: CLICK: (443, 430)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (595, 510)\nI want to Organize a movie night by selecting a fantasy movie on DuckDuckgo, adding some snacks to your Amazon shopping cart, sending out invitations to katsunaksu via Tumblr app, and setting a reminder on Clock. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Organize a movie night by selecting a fantasy movie on DuckDuckgo, adding some snacks to your Amazon shopping cart, sending out invitations to katsunaksu via Tumblr app, and setting a reminder on Clock.\nThe historical actions are: step 1: CLICK: (362, 617)\nstep 2: TYPE: fantasy movies\nstep 3: CLICK: (897, 892)\nstep 4: CLICK: (443, 430)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (595, 510)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (488, 921)\nB: PRESS_HOME\nC: CLICK: (223, 124)\nD: CLICK: (219, 943)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_27_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_27_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_27_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_27_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_27_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_27_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_27_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_27_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_27_8.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (371, 528)\nB: CLICK: (917, 880)\nC: CLICK: (667, 672)\nD: CLICK: (609, 372)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (791, 478)\nstep 2: CLICK: (720, 69)\nstep 3: CLICK: (405, 71)\nstep 4: TYPE: cooking class\nstep 5: CLICK: (881, 880)\nstep 6: CLICK: (434, 280)\nstep 7: CLICK: (326, 446)\nstep 8: PRESS_HOME\nstep 9: CLICK: (552, 478)\nstep 10: CLICK: (277, 71)\nstep 11: TYPE: cooking pan\nstep 12: CLICK: (880, 884)\nstep 13: CLICK: (155, 507)\nstep 14: CLICK: (461, 904)\nstep 15: PRESS_HOME\nI want to Search for a Cooking class on Likee, purchase the required materials through Wish, and schedule a study reminder in Calendar. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Search for a Cooking class on Likee, purchase the required materials through Wish, and schedule a study reminder in Calendar.\nThe historical actions are: step 1: CLICK: (791, 478)\nstep 2: CLICK: (720, 69)\nstep 3: CLICK: (405, 71)\nstep 4: TYPE: cooking class\nstep 5: CLICK: (881, 880)\nstep 6: CLICK: (434, 280)\nstep 7: CLICK: (326, 446)\nstep 8: PRESS_HOME\nstep 9: CLICK: (552, 478)\nstep 10: CLICK: (277, 71)\nstep 11: TYPE: cooking pan\nstep 12: CLICK: (880, 884)\nstep 13: CLICK: (155, 507)\nstep 14: CLICK: (461, 904)\nstep 15: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (371, 528)\nB: CLICK: (917, 880)\nC: CLICK: (667, 672)\nD: CLICK: (609, 372)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_3.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_13.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_14.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_28_15.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: TYPE: properties of hexagon\nC: CLICK: (419, 708)\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (141, 703)\nstep 2: CLICK: (683, 368)\nstep 3: PRESS_HOME\nI want to First, access the Google Play Store to install the Tiktok app. Once installed, open Tiktok, then navigate to the device's Settings to disable notifications for the app. After adjusting the settings, reopen Tiktok to watch a video. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, access the Google Play Store to install the Tiktok app. Once installed, open Tiktok, then navigate to the device's Settings to disable notifications for the app. After adjusting the settings, reopen Tiktok to watch a video.\nThe historical actions are: step 1: CLICK: (141, 703)\nstep 2: CLICK: (683, 368)\nstep 3: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: TYPE: properties of hexagon\nC: CLICK: (419, 708)\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_29_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_29_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_29_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_29_3.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (527, 869)\nB: PRESS_HOME\nC: CLICK: (427, 466)\nD: CLICK: (858, 871)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (397, 499)\nstep 2: CLICK: (76, 160)\nstep 3: TYPE: Plastic Pollution Solutions\nstep 4: CLICK: (940, 888)\nI want to Use Google News to search for the latest articles on Plastic Pollution Solutions, then share the article on X with liudehu19294094. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Google News to search for the latest articles on Plastic Pollution Solutions, then share the article on X with liudehu19294094.\nThe historical actions are: step 1: CLICK: (397, 499)\nstep 2: CLICK: (76, 160)\nstep 3: TYPE: Plastic Pollution Solutions\nstep 4: CLICK: (940, 888)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (527, 869)\nB: PRESS_HOME\nC: CLICK: (427, 466)\nD: CLICK: (858, 871)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_30_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_30_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_30_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_30_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_30_4.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: CLICK: (671, 539)\nC: CLICK: (929, 80)\nD: CLICK: (192, 631)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (610, 833)\nstep 2: PRESS_HOME\nstep 3: CLICK: (855, 835)\nstep 4: CLICK: (828, 72)\nI want to Watch a video on YouTube that discusses recommendations for fitness tracking apps, then head over to the Google Play Store and download one of the suggested apps. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Watch a video on YouTube that discusses recommendations for fitness tracking apps, then head over to the Google Play Store and download one of the suggested apps.\nThe historical actions are: step 1: CLICK: (610, 833)\nstep 2: PRESS_HOME\nstep 3: CLICK: (855, 835)\nstep 4: CLICK: (828, 72)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (671, 539)\nC: CLICK: (929, 80)\nD: CLICK: (192, 631)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_31_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_31_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_31_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_31_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_31_4.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (926, 879)\nB: CLICK: (409, 132)\nC: CLICK: (666, 183)\nD: CLICK: (519, 103)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (666, 496)\nstep 3: CLICK: (124, 101)\nstep 4: CLICK: (633, 114)\nstep 5: CLICK: (894, 103)\nI want to Locate a nearby fitness training center using Waze Navigation & Live Traffic, then proceed to the Google Play Store to download a fitness tracking app for setting your goals. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate a nearby fitness training center using Waze Navigation & Live Traffic, then proceed to the Google Play Store to download a fitness tracking app for setting your goals.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (666, 496)\nstep 3: CLICK: (124, 101)\nstep 4: CLICK: (633, 114)\nstep 5: CLICK: (894, 103)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (926, 879)\nB: CLICK: (409, 132)\nC: CLICK: (666, 183)\nD: CLICK: (519, 103)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_32_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_32_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_32_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_32_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_32_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_32_5.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (249, 164)\nB: CLICK: (362, 71)\nC: CLICK: (271, 934)\nD: PRESS_HOME\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (384, 520)\nstep 2: CLICK: (88, 71)\nstep 3: CLICK: (624, 927)\nI want to Listen to an Electronic style album on Pandora, and then share the name of the album with katsunaksu on Tumblr. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Listen to an Electronic style album on Pandora, and then share the name of the album with katsunaksu on Tumblr.\nThe historical actions are: step 1: CLICK: (384, 520)\nstep 2: CLICK: (88, 71)\nstep 3: CLICK: (624, 927)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (249, 164)\nB: CLICK: (362, 71)\nC: CLICK: (271, 934)\nD: PRESS_HOME\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_33_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_33_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_33_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_33_3.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (316, 475)\nB: CLICK: (257, 257)\nC: CLICK: (559, 183)\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (154, 532)\nstep 2: CLICK: (514, 931)\nstep 3: CLICK: (250, 258)\nstep 4: TYPE: vintage camera\nstep 5: CLICK: (898, 936)\nstep 6: CLICK: (648, 852)\nstep 7: CLICK: (873, 846)\nstep 8: CLICK: (308, 790)\nstep 9: CLICK: (308, 192)\nstep 10: TYPE: caba62244@gmail.com\nI want to Use GenZArt:Fast AI Art Generator to create an image focused on a vintage camera, then share it through Gmail with caba62244@gmail.com. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use GenZArt:Fast AI Art Generator to create an image focused on a vintage camera, then share it through Gmail with caba62244@gmail.com.\nThe historical actions are: step 1: CLICK: (154, 532)\nstep 2: CLICK: (514, 931)\nstep 3: CLICK: (250, 258)\nstep 4: TYPE: vintage camera\nstep 5: CLICK: (898, 936)\nstep 6: CLICK: (648, 852)\nstep 7: CLICK: (873, 846)\nstep 8: CLICK: (308, 790)\nstep 9: CLICK: (308, 192)\nstep 10: TYPE: caba62244@gmail.com\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (316, 475)\nB: CLICK: (257, 257)\nC: CLICK: (559, 183)\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_34_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_34_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_34_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_34_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_34_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_34_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_34_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_34_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_34_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_34_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_34_10.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI 
image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: LEFT\nB: PRESS_RECENT\nC: TYPE: fitness tracking apps\nD: CLICK: (228, 798)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (788, 340)\nstep 3: CLICK: (45, 682)\nstep 4: CLICK: (387, 241)\nstep 5: CLICK: (201, 234)\nstep 6: TYPE: boxing gym\nstep 7: CLICK: (193, 179)\nstep 8: PRESS_RECENT\nstep 9: CLICK: (94, 266)\nstep 10: CLICK: (298, 99)\nstep 11: TYPE: fitness tracking apps\nstep 12: CLICK: (214, 172)\nI want to Utilize Yandex Navigator to locate a nearby Boxing gym, and subsequently, visit the Google Play Store to download a fitness tracking app for setting your fitness goals. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize Yandex Navigator to locate a nearby Boxing gym, and subsequently, visit the Google Play Store to download a fitness tracking app for setting your fitness goals.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (788, 340)\nstep 3: CLICK: (45, 682)\nstep 4: CLICK: (387, 241)\nstep 5: CLICK: (201, 234)\nstep 6: TYPE: boxing gym\nstep 7: CLICK: (193, 179)\nstep 8: PRESS_RECENT\nstep 9: CLICK: (94, 266)\nstep 10: CLICK: (298, 99)\nstep 11: TYPE: fitness tracking apps\nstep 12: CLICK: (214, 172)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: LEFT\nB: PRESS_RECENT\nC: TYPE: fitness tracking apps\nD: CLICK: (228, 798)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_35_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_35_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_35_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_35_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_35_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_35_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_35_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_35_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_35_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_35_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_35_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_35_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_35_12.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + 
"options": "A: CLICK: (148, 806)\nB: CLICK: (59, 76)\nC: SCROLL: RIGHT\nD: CLICK: (248, 121)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (629, 809)\nstep 2: CLICK: (253, 435)\nstep 3: CLICK: (836, 560)\nstep 4: PRESS_HOME\nstep 5: CLICK: (389, 806)\nstep 6: CLICK: (292, 344)\nstep 7: CLICK: (693, 76)\nstep 8: PRESS_HOME\nI want to Search for the best video blogs on DIY crafts using the Firefox Browser, then increase the brightness on your phone through the Settings app. Finally, open the Youtube app and follow along with the videos. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Search for the best video blogs on DIY crafts using the Firefox Browser, then increase the brightness on your phone through the Settings app. 
Finally, open the Youtube app and follow along with the videos.\nThe historical actions are: step 1: CLICK: (629, 809)\nstep 2: CLICK: (253, 435)\nstep 3: CLICK: (836, 560)\nstep 4: PRESS_HOME\nstep 5: CLICK: (389, 806)\nstep 6: CLICK: (292, 344)\nstep 7: CLICK: (693, 76)\nstep 8: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (148, 806)\nB: CLICK: (59, 76)\nC: SCROLL: RIGHT\nD: CLICK: (248, 121)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_36_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_36_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_36_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_36_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_36_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_36_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_36_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_36_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_36_8.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: AI\nB: CLICK: (369, 261)\nC: SCROLL: UP\nD: CLICK: (647, 519)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (399, 374)\nstep 3: CLICK: (107, 476)\nstep 4: CLICK: (345, 624)\nstep 5: CLICK: (717, 590)\nstep 6: CLICK: (321, 649)\nstep 7: CLICK: (642, 841)\nstep 8: CLICK: (610, 689)\nstep 9: CLICK: (629, 588)\nstep 10: CLICK: (895, 754)\nstep 11: CLICK: (593, 751)\nstep 12: CLICK: (185, 685)\nstep 13: CLICK: (838, 845)\nstep 14: PRESS_HOME\nI want to Use 'ClevCalc - Calculator' to compute the sum of 5.69 and 34 for today's total cost, and then log the result in 'Wallet: Budget Money Manager'. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use 'ClevCalc - Calculator' to compute the sum of 5.69 and 34 for today's total cost, and then log the result in 'Wallet: Budget Money Manager'.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (399, 374)\nstep 3: CLICK: (107, 476)\nstep 4: CLICK: (345, 624)\nstep 5: CLICK: (717, 590)\nstep 6: CLICK: (321, 649)\nstep 7: CLICK: (642, 841)\nstep 8: CLICK: (610, 689)\nstep 9: CLICK: (629, 588)\nstep 10: CLICK: (895, 754)\nstep 11: CLICK: (593, 751)\nstep 12: CLICK: (185, 685)\nstep 13: CLICK: (838, 845)\nstep 14: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: AI\nB: CLICK: (369, 261)\nC: SCROLL: UP\nD: CLICK: (647, 519)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_37_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_37_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_37_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_37_3.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_37_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_37_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_37_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_37_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_37_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_37_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_37_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_37_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_37_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_37_13.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_37_14.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (338, 431)\nB: TYPE: May 2 Cost\nC: TYPE: 'Dune' by Frank Herbert book review\nD: CLICK: (802, 64)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (613, 624)\nstep 2: CLICK: (831, 50)\nstep 3: TYPE: portable speaker recommendation\nstep 4: CLICK: (916, 922)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (634, 570)\nstep 11: PRESS_HOME\nstep 12: CLICK: (414, 109)\nstep 13: CLICK: (712, 65)\nI want to Utilize Amazon to search for and identify a highly recommended portable speaker. Once you have selected a speaker, use Amazon to make the purchase. Additionally, you can check Facebook for user reviews and recommendations before finalizing your decision. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Amazon to search for and identify a highly recommended portable speaker. Once you have selected a speaker, use Amazon to make the purchase. Additionally, you can check Facebook for user reviews and recommendations before finalizing your decision.\nThe historical actions are: step 1: CLICK: (613, 624)\nstep 2: CLICK: (831, 50)\nstep 3: TYPE: portable speaker recommendation\nstep 4: CLICK: (916, 922)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (634, 570)\nstep 11: PRESS_HOME\nstep 12: CLICK: (414, 109)\nstep 13: CLICK: (712, 65)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (338, 431)\nB: TYPE: May 2 Cost\nC: TYPE: 'Dune' by Frank Herbert book review\nD: CLICK: (802, 64)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_38_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_38_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_38_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_38_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_38_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_38_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_38_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_38_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_38_8.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_38_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_38_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_38_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_38_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_38_13.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: COMPLETE\nB: CLICK: (496, 924)\nC: CLICK: (873, 188)\nD: SCROLL: LEFT\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (410, 815)\nstep 2: CLICK: (918, 317)\nstep 3: PRESS_HOME\nstep 4: CLICK: (130, 244)\nI want to Open Triller to watch a trending video, and then use the Setting app to turn the volume on your phone to maximum. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Open Triller to watch a trending video, and then use the Setting app to turn the volume on your phone to maximum.\nThe historical actions are: step 1: CLICK: (410, 815)\nstep 2: CLICK: (918, 317)\nstep 3: PRESS_HOME\nstep 4: CLICK: (130, 244)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (496, 924)\nC: CLICK: (873, 188)\nD: SCROLL: LEFT\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_39_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_39_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_39_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_39_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_39_4.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (138, 93)\nB: TYPE: 'The Midnight Library' by Matt Haig book review\nC: COMPLETE\nD: CLICK: (491, 146)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (727, 547)\nstep 2: CLICK: (373, 146)\nstep 3: TYPE: Autonomous Driving Technologies\nstep 4: CLICK: (887, 689)\nI want to Using X and Microsoft News, search for the latest news articles on Autonomous Driving Technologies and share the most relevant article. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using X and Microsoft News, search for the latest news articles on Autonomous Driving Technologies and share the most relevant article.\nThe historical actions are: step 1: CLICK: (727, 547)\nstep 2: CLICK: (373, 146)\nstep 3: TYPE: Autonomous Driving Technologies\nstep 4: CLICK: (887, 689)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (138, 93)\nB: TYPE: 'The Midnight Library' by Matt Haig book review\nC: COMPLETE\nD: CLICK: (491, 146)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_40_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_40_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_40_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_40_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_40_4.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: cricket\nB: TYPE: Fall of the Roman Empire\nC: CLICK: (865, 738)\nD: CLICK: (245, 92)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (397, 372)\nstep 2: CLICK: (237, 604)\nstep 3: CLICK: (492, 615)\nstep 4: CLICK: (789, 629)\nstep 5: CLICK: (209, 718)\nstep 6: CLICK: (527, 721)\nstep 7: CLICK: (759, 725)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nI want to First, use Applock Pro - APP Lock & Guard to lock the PayPal - Send, Shop, Manage app. Then, open the PayPal - Send, Shop, Manage app to verify the lock by entering the PIN 123456. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, use Applock Pro - APP Lock & Guard to lock the PayPal - Send, Shop, Manage app. Then, open the PayPal - Send, Shop, Manage app to verify the lock by entering the PIN 123456.\nThe historical actions are: step 1: CLICK: (397, 372)\nstep 2: CLICK: (237, 604)\nstep 3: CLICK: (492, 615)\nstep 4: CLICK: (789, 629)\nstep 5: CLICK: (209, 718)\nstep 6: CLICK: (527, 721)\nstep 7: CLICK: (759, 725)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: cricket\nB: TYPE: Fall of the Roman Empire\nC: CLICK: (865, 738)\nD: CLICK: (245, 92)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_41_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_41_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_41_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_41_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_41_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_41_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_41_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_41_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_41_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_41_9.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: COMPLETE\nB: CLICK: (698, 
449)\nC: CLICK: (142, 152)\nD: CLICK: (694, 431)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (795, 910)\nstep 2: PRESS_HOME\nstep 3: CLICK: (681, 909)\nstep 4: CLICK: (180, 80)\nstep 5: TYPE: Home Workout No Equipment App\nstep 6: CLICK: (864, 873)\nstep 7: CLICK: (177, 818)\nI want to Open YouTube APP and watch a video about fitness tracking app recommendations. Then, go to the Google Play Store and download one of the recommended apps. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open YouTube APP and watch a video about fitness tracking app recommendations. Then, go to the Google Play Store and download one of the recommended apps.\nThe historical actions are: step 1: CLICK: (795, 910)\nstep 2: PRESS_HOME\nstep 3: CLICK: (681, 909)\nstep 4: CLICK: (180, 80)\nstep 5: TYPE: Home Workout No Equipment App\nstep 6: CLICK: (864, 873)\nstep 7: CLICK: (177, 818)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (698, 449)\nC: CLICK: (142, 152)\nD: CLICK: (694, 431)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_42_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_42_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_42_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_42_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_42_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_42_5.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_42_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_42_7.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (581, 382)\nB: CLICK: (616, 383)\nC: LONG_PRESS: (420, 547)\nD: CLICK: (865, 839)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (130, 365)\nstep 3: CLICK: (199, 783)\nstep 4: CLICK: (194, 783)\nstep 5: CLICK: (889, 785)\nstep 6: CLICK: (423, 785)\nstep 7: CLICK: (423, 785)\nstep 8: CLICK: (907, 891)\nstep 9: PRESS_HOME\nI want to Use Calculator Plus with History to compute today's total cost by adding 11 and 22, then record the result in Monefy. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Calculator Plus with History to compute today's total cost by adding 11 and 22, then record the result in Monefy.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (130, 365)\nstep 3: CLICK: (199, 783)\nstep 4: CLICK: (194, 783)\nstep 5: CLICK: (889, 785)\nstep 6: CLICK: (423, 785)\nstep 7: CLICK: (423, 785)\nstep 8: CLICK: (907, 891)\nstep 9: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (581, 382)\nB: CLICK: (616, 383)\nC: LONG_PRESS: (420, 547)\nD: CLICK: (865, 839)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_43_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_43_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_43_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_43_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_43_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_43_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_43_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_43_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_43_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_43_9.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (791, 303)\nB: CLICK: (938, 385)\nC: CLICK: (138, 554)\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: SCROLL: RIGHT\nstep 2: CLICK: (831, 619)\nstep 3: CLICK: (904, 180)\nstep 4: TYPE: ford most popular car\nstep 5: CLICK: (924, 903)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (858, 622)\nstep 9: CLICK: (490, 317)\nstep 10: CLICK: (187, 129)\nstep 11: CLICK: (269, 167)\nstep 12: TYPE: f\nI want to Using Firefox, identify the most popular car product of Chevrolet and verify its price on the AutoScout24: Buy & sell cars app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Firefox, identify the most popular car product of Chevrolet and verify its price on the AutoScout24: Buy & sell cars app.\nThe historical actions are: step 1: SCROLL: RIGHT\nstep 2: CLICK: (831, 619)\nstep 3: CLICK: (904, 180)\nstep 4: TYPE: ford most popular car\nstep 5: CLICK: (924, 903)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (858, 622)\nstep 9: CLICK: (490, 317)\nstep 10: CLICK: (187, 129)\nstep 11: CLICK: (269, 167)\nstep 12: TYPE: f\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (791, 303)\nB: CLICK: (938, 385)\nC: CLICK: (138, 554)\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_44_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_44_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_44_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_44_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_44_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_44_5.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_44_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_44_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_44_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_44_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_44_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_44_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_44_12.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (858, 212)\nB: CLICK: (424, 249)\nC: CLICK: (527, 894)\nD: CLICK: (807, 539)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (705, 223)\nstep 3: CLICK: (813, 256)\nstep 4: TYPE: weather in Shanghai tomorrow\nstep 5: CLICK: (930, 248)\nstep 6: PRESS_HOME\nI want to Using Firefox, find out what the weather will be like in Shanghai tomorrow and then use Google Docs to create a to-do list based on that forecast. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using Firefox, find out what the weather will be like in Shanghai tomorrow and then use Google Docs to create a to-do list based on that forecast.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (705, 223)\nstep 3: CLICK: (813, 256)\nstep 4: TYPE: weather in Shanghai tomorrow\nstep 5: CLICK: (930, 248)\nstep 6: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (858, 212)\nB: CLICK: (424, 249)\nC: CLICK: (527, 894)\nD: CLICK: (807, 539)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_45_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_45_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_45_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_45_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_45_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_45_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_45_6.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (804, 656)\nB: CLICK: (848, 406)\nC: CLICK: (344, 851)\nD: PRESS_HOME\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (154, 484)\nstep 2: CLICK: (884, 793)\nstep 3: CLICK: (547, 548)\nstep 4: CLICK: (919, 167)\nstep 5: CLICK: (766, 742)\nstep 6: CLICK: (873, 166)\nstep 7: CLICK: (806, 164)\nI want to Use Adobe Express: AI Video Design to resize a photo and then share it to Facebook Moments. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Adobe Express: AI Video Design to resize a photo and then share it to Facebook Moments.\nThe historical actions are: step 1: CLICK: (154, 484)\nstep 2: CLICK: (884, 793)\nstep 3: CLICK: (547, 548)\nstep 4: CLICK: (919, 167)\nstep 5: CLICK: (766, 742)\nstep 6: CLICK: (873, 166)\nstep 7: CLICK: (806, 164)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (804, 656)\nB: CLICK: (848, 406)\nC: CLICK: (344, 851)\nD: PRESS_HOME\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_46_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_46_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_46_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_46_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_46_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_46_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_46_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_46_7.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: Hindi\nB: SCROLL: UP\nC: TYPE: do yoga with this\nD: TYPE: the beach boys\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (317, 495)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\nstep 5: CLICK: (119, 643)\nstep 6: CLICK: (546, 354)\nstep 7: CLICK: (589, 438)\nstep 8: CLICK: (529, 628)\nstep 9: CLICK: (962, 75)\nI want to Switch the phone's language to Hindi, then open the 'Setting' app to confirm the change, followed by the 'Photos' app to ensure everything is working properly. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Switch the phone's language to Hindi, then open the 'Setting' app to confirm the change, followed by the 'Photos' app to ensure everything is working properly.\nThe historical actions are: step 1: CLICK: (317, 495)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\nstep 5: CLICK: (119, 643)\nstep 6: CLICK: (546, 354)\nstep 7: CLICK: (589, 438)\nstep 8: CLICK: (529, 628)\nstep 9: CLICK: (962, 75)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Hindi\nB: SCROLL: UP\nC: TYPE: do yoga with this\nD: TYPE: the beach boys\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_47_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_47_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_47_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_47_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_47_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_47_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_47_6.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_47_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_47_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_47_9.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (610, 610)\nB: PRESS_HOME\nC: CLICK: (422, 581)\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (794, 337)\nstep 2: CLICK: (395, 612)\nstep 3: CLICK: (514, 610)\nI want to Using Applock Pro - APP Lock & Guard, secure the Google Pay app. Once done, unlock Google Pay and verify its functionality using the PIN 123789. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Applock Pro - APP Lock & Guard, secure the Google Pay app. 
Once done, unlock Google Pay and verify its functionality using the PIN 123789.\nThe historical actions are: step 1: CLICK: (794, 337)\nstep 2: CLICK: (395, 612)\nstep 3: CLICK: (514, 610)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (610, 610)\nB: PRESS_HOME\nC: CLICK: (422, 581)\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_48_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_48_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_48_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_48_3.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (76, 299)\nB: TYPE: eBay\nC: CLICK: (598, 649)\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (185, 645)\nstep 2: CLICK: (299, 425)\nstep 3: TYPE: documentary movies\nstep 4: CLICK: (927, 897)\nstep 5: SCROLL: UP\nstep 6: CLICK: (253, 635)\nstep 7: PRESS_HOME\nstep 8: CLICK: (555, 475)\nstep 9: CLICK: (379, 156)\nstep 10: TYPE: snacks\nstep 11: CLICK: (899, 884)\nstep 12: CLICK: (827, 284)\nI want to Organize a movie night by choosing a documentary film using Chrome, adding snacks to your Ebay cart, sending invitations to liudehu19294094 via X, and setting a reminder on Clock. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Organize a movie night by choosing a documentary film using Chrome, adding snacks to your Ebay cart, sending invitations to liudehu19294094 via X, and setting a reminder on Clock.\nThe historical actions are: step 1: CLICK: (185, 645)\nstep 2: CLICK: (299, 425)\nstep 3: TYPE: documentary movies\nstep 4: CLICK: (927, 897)\nstep 5: SCROLL: UP\nstep 6: CLICK: (253, 635)\nstep 7: PRESS_HOME\nstep 8: CLICK: (555, 475)\nstep 9: CLICK: (379, 156)\nstep 10: TYPE: snacks\nstep 11: CLICK: (899, 884)\nstep 12: CLICK: (827, 284)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (76, 299)\nB: TYPE: eBay\nC: CLICK: (598, 649)\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_49_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_49_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_49_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_49_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_49_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_49_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_49_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_49_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_49_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_49_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_49_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_49_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_49_12.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": 
"A: CLICK: (480, 626)\nB: PRESS_HOME\nC: COMPLETE\nD: CLICK: (127, 471)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (855, 743)\nstep 2: CLICK: (563, 248)\nstep 3: PRESS_HOME\nstep 4: CLICK: (397, 726)\nI want to First, go to the App Store and uninstall the TikTok app. Confirm that TikTok is successfully uninstalled. Next, download the Likee app from the Google Play Store. Once the download is complete, open the Likee app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, go to the App Store and uninstall the TikTok app. Confirm that TikTok is successfully uninstalled. Next, download the Likee app from the Google Play Store. 
Once the download is complete, open the Likee app.\nThe historical actions are: step 1: CLICK: (855, 743)\nstep 2: CLICK: (563, 248)\nstep 3: PRESS_HOME\nstep 4: CLICK: (397, 726)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (480, 626)\nB: PRESS_HOME\nC: COMPLETE\nD: CLICK: (127, 471)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_50_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_50_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_50_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_50_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_50_4.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: CLICK: (641, 277)\nC: CLICK: (926, 63)\nD: CLICK: (931, 917)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (870, 512)\nstep 2: CLICK: (378, 231)\nstep 3: CLICK: (931, 62)\nstep 4: CLICK: (593, 442)\nstep 5: CLICK: (634, 249)\nstep 6: PRESS_HOME\nstep 7: CLICK: (578, 125)\nI want to Locate the working file on your phone using Google Docs, then share it with Tzhau Jau via Instagram. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Locate the working file on your phone using Google Docs, then share it with Tzhau Jau via Instagram.\nThe historical actions are: step 1: CLICK: (870, 512)\nstep 2: CLICK: (378, 231)\nstep 3: CLICK: (931, 62)\nstep 4: CLICK: (593, 442)\nstep 5: CLICK: (634, 249)\nstep 6: PRESS_HOME\nstep 7: CLICK: (578, 125)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (641, 277)\nC: CLICK: (926, 63)\nD: CLICK: (931, 917)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_51_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_51_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_51_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_51_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_51_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_51_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_51_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_51_7.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (944, 930)\nB: TYPE: 2019 nobel prize winners in physics\nC: CLICK: (872, 787)\nD: CLICK: (627, 943)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (159, 646)\nstep 2: CLICK: (427, 280)\nstep 3: TYPE: Francis Crick\nI want to Open Chrome, search for an introduction about Francis Crick, and share the link to the webpage on Tumblr with katsunaksu. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open Chrome, search for an introduction about Francis Crick, and share the link to the webpage on Tumblr with katsunaksu.\nThe historical actions are: step 1: CLICK: (159, 646)\nstep 2: CLICK: (427, 280)\nstep 3: TYPE: Francis Crick\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (944, 930)\nB: TYPE: 2019 nobel prize winners in physics\nC: CLICK: (872, 787)\nD: CLICK: (627, 943)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_52_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_52_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_52_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_52_3.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: COMPLETE\nC: PRESS_HOME\nD: TYPE: do yoga with this\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (134, 137)\nstep 2: CLICK: (523, 194)\nstep 3: CLICK: (217, 832)\nstep 4: TYPE: bank\nstep 5: CLICK: (922, 895)\nstep 6: CLICK: (304, 154)\nstep 7: PRESS_HOME\nstep 8: CLICK: (143, 397)\nstep 9: CLICK: (121, 502)\nstep 10: TYPE: Citi\nstep 11: CLICK: (210, 317)\nstep 12: CLICK: (607, 929)\nstep 13: CLICK: (386, 944)\nI want to Locate a nearby bank using GPS and then book a ride with Uber. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate a nearby bank using GPS and then book a ride with Uber.\nThe historical actions are: step 1: CLICK: (134, 137)\nstep 2: CLICK: (523, 194)\nstep 3: CLICK: (217, 832)\nstep 4: TYPE: bank\nstep 5: CLICK: (922, 895)\nstep 6: CLICK: (304, 154)\nstep 7: PRESS_HOME\nstep 8: CLICK: (143, 397)\nstep 9: CLICK: (121, 502)\nstep 10: TYPE: Citi\nstep 11: CLICK: (210, 317)\nstep 12: CLICK: (607, 929)\nstep 13: CLICK: (386, 944)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: COMPLETE\nC: PRESS_HOME\nD: TYPE: do yoga with this\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_53_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_53_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_53_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_53_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_53_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_53_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_53_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_53_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_53_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_53_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_53_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_53_11.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_53_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_53_13.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (409, 290)\nB: CLICK: (795, 184)\nC: CLICK: (322, 228)\nD: CLICK: (485, 764)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (688, 673)\nstep 2: CLICK: (431, 898)\nstep 3: CLICK: (220, 253)\nstep 4: CLICK: (463, 884)\nstep 5: CLICK: (422, 796)\nstep 6: SCROLL: UP\nstep 7: CLICK: (831, 686)\nI want to Use Google Meet to set up an online meeting, and then share the meeting link with Victor James via Facebook. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Google Meet to set up an online meeting, and then share the meeting link with Victor James via Facebook.\nThe historical actions are: step 1: CLICK: (688, 673)\nstep 2: CLICK: (431, 898)\nstep 3: CLICK: (220, 253)\nstep 4: CLICK: (463, 884)\nstep 5: CLICK: (422, 796)\nstep 6: SCROLL: UP\nstep 7: CLICK: (831, 686)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (409, 290)\nB: CLICK: (795, 184)\nC: CLICK: (322, 228)\nD: CLICK: (485, 764)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_54_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_54_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_54_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_54_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_54_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_54_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_54_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_54_7.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: Amazfit BIP\nB: CLICK: (529, 912)\nC: CLICK: (252, 107)\nD: TYPE: Love story\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (128, 508)\nstep 2: CLICK: (629, 925)\nstep 3: CLICK: (885, 929)\nstep 4: CLICK: (206, 86)\nstep 5: TYPE: International Space Station\nstep 6: CLICK: (916, 891)\nstep 7: CLICK: (143, 289)\nstep 8: CLICK: (641, 54)\nstep 9: CLICK: (889, 661)\nstep 10: PRESS_HOME\nstep 11: CLICK: (403, 107)\nI want to Utilize the BBC News app to find the most recent articles related to the International Space Station, and then share one of the articles on Facebook. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize the BBC News app to find the most recent articles related to the International Space Station, and then share one of the articles on Facebook.\nThe historical actions are: step 1: CLICK: (128, 508)\nstep 2: CLICK: (629, 925)\nstep 3: CLICK: (885, 929)\nstep 4: CLICK: (206, 86)\nstep 5: TYPE: International Space Station\nstep 6: CLICK: (916, 891)\nstep 7: CLICK: (143, 289)\nstep 8: CLICK: (641, 54)\nstep 9: CLICK: (889, 661)\nstep 10: PRESS_HOME\nstep 11: CLICK: (403, 107)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Amazfit BIP\nB: CLICK: (529, 912)\nC: CLICK: (252, 107)\nD: TYPE: Love story\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_55_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_55_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_55_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_55_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_55_4.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_55_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_55_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_55_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_55_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_55_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_55_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_55_11.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (703, 940)\nB: CLICK: (921, 909)\nC: COMPLETE\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (889, 635)\nstep 2: CLICK: (379, 131)\nstep 3: TYPE: animation movies\nI want to Organize a movie night by first selecting an animated film on Opera, then adding some snacks to your cart on Amazon. Next, send an invitation to caba62244@gmail.com using Gmail, and finally, set a reminder on Clock to ensure you don't forget. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Organize a movie night by first selecting an animated film on Opera, then adding some snacks to your cart on Amazon. 
Next, send an invitation to caba62244@gmail.com using Gmail, and finally, set a reminder on Clock to ensure you don't forget.\nThe historical actions are: step 1: CLICK: (889, 635)\nstep 2: CLICK: (379, 131)\nstep 3: TYPE: animation movies\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (703, 940)\nB: CLICK: (921, 909)\nC: COMPLETE\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_56_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_56_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_56_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_56_3.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: COMPLETE\nB: TYPE: park\nC: CLICK: (875, 876)\nD: SCROLL: RIGHT\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (837, 522)\nstep 2: CLICK: (505, 466)\nstep 3: CLICK: (371, 294)\nI want to Utilize Citymapper to locate a nearby park and then visit the Google Play Store to download a fitness tracking app to set your fitness goals. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize Citymapper to locate a nearby park and then visit the Google Play Store to download a fitness tracking app to set your fitness goals.\nThe historical actions are: step 1: CLICK: (837, 522)\nstep 2: CLICK: (505, 466)\nstep 3: CLICK: (371, 294)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: TYPE: park\nC: CLICK: (875, 876)\nD: SCROLL: RIGHT\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_57_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_57_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_57_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_57_3.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: CLICK: (185, 645)\nC: CLICK: (865, 77)\nD: COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (843, 523)\nstep 2: CLICK: (897, 849)\nstep 3: CLICK: (131, 374)\nstep 4: CLICK: (899, 80)\nstep 5: CLICK: (731, 812)\nstep 6: CLICK: (916, 88)\nstep 7: CLICK: (809, 76)\nstep 8: CLICK: (943, 81)\nstep 9: CLICK: (297, 896)\nI want to Use Adobe Express: AI Video Design to resize a photo, then share the resized photo to Facebook moments. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Adobe Express: AI Video Design to resize a photo, then share the resized photo to Facebook moments.\nThe historical actions are: step 1: CLICK: (843, 523)\nstep 2: CLICK: (897, 849)\nstep 3: CLICK: (131, 374)\nstep 4: CLICK: (899, 80)\nstep 5: CLICK: (731, 812)\nstep 6: CLICK: (916, 88)\nstep 7: CLICK: (809, 76)\nstep 8: CLICK: (943, 81)\nstep 9: CLICK: (297, 896)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (185, 645)\nC: CLICK: (865, 77)\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_58_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_58_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_58_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_58_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_58_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_58_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_58_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_58_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_58_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_58_9.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: CLICK: (315, 298)\nC: CLICK: (174, 374)\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (165, 110)\nstep 2: CLICK: (340, 570)\nstep 3: CLICK: (206, 683)\nstep 4: CLICK: (814, 72)\nstep 5: TYPE: Uber\nstep 6: CLICK: (209, 148)\nstep 7: CLICK: (146, 541)\nstep 8: CLICK: (838, 449)\nstep 9: PRESS_HOME\nstep 10: CLICK: (386, 240)\nI want to Switch the notifications on or off for any application on your phone, and then proceed to launch the app. Make sure you use the 'Setting' app to adjust the notifications and then open the 'Uber' app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Switch the notifications on or off for any application on your phone, and then proceed to launch the app. Make sure you use the 'Setting' app to adjust the notifications and then open the 'Uber' app.\nThe historical actions are: step 1: CLICK: (165, 110)\nstep 2: CLICK: (340, 570)\nstep 3: CLICK: (206, 683)\nstep 4: CLICK: (814, 72)\nstep 5: TYPE: Uber\nstep 6: CLICK: (209, 148)\nstep 7: CLICK: (146, 541)\nstep 8: CLICK: (838, 449)\nstep 9: PRESS_HOME\nstep 10: CLICK: (386, 240)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (315, 298)\nC: CLICK: (174, 374)\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_59_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_59_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_59_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_59_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_59_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_59_5.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_59_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_59_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_59_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_59_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_59_10.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (369, 842)\nB: CLICK: (538, 106)\nC: SCROLL: UP\nD: SCROLL: LEFT\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (453, 491)\nstep 2: SCROLL: UP\nstep 3: CLICK: (160, 682)\nstep 4: SCROLL: UP\nstep 5: CLICK: (437, 672)\nstep 6: CLICK: (480, 427)\nstep 7: CLICK: (480, 427)\nstep 8: PRESS_HOME\nI want to Utilize the 'Setting' app to activate 'Do not disturb' mode on your phone and then use the 'Clock' app to set an alarm for 6:00 AM to wake you up. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize the 'Setting' app to activate 'Do not disturb' mode on your phone and then use the 'Clock' app to set an alarm for 6:00 AM to wake you up.\nThe historical actions are: step 1: CLICK: (453, 491)\nstep 2: SCROLL: UP\nstep 3: CLICK: (160, 682)\nstep 4: SCROLL: UP\nstep 5: CLICK: (437, 672)\nstep 6: CLICK: (480, 427)\nstep 7: CLICK: (480, 427)\nstep 8: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (369, 842)\nB: CLICK: (538, 106)\nC: SCROLL: UP\nD: SCROLL: LEFT\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_60_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_60_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_60_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_60_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_60_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_60_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_60_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_60_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_60_8.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: o\nB: CLICK: (795, 676)\nC: PRESS_HOME\nD: TYPE: https://docs.google.com/document/d/1F1mYgUNic6AwXcwpgI0fdfotTjP8R_v4ogXTNTK6u7U/edit?usp=drivesdk\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (150, 520)\nstep 2: CLICK: (909, 533)\nstep 3: CLICK: (297, 383)\nstep 4: PRESS_HOME\nstep 5: CLICK: (386, 272)\nstep 6: CLICK: (482, 158)\nstep 7: CLICK: (422, 948)\nI want to Locate the working file on your phone using app Google Drive, and then share it with kiudehu19294094 via X. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate the working file on your phone using app Google Drive, and then share it with kiudehu19294094 via X.\nThe historical actions are: step 1: CLICK: (150, 520)\nstep 2: CLICK: (909, 533)\nstep 3: CLICK: (297, 383)\nstep 4: PRESS_HOME\nstep 5: CLICK: (386, 272)\nstep 6: CLICK: (482, 158)\nstep 7: CLICK: (422, 948)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: o\nB: CLICK: (795, 676)\nC: PRESS_HOME\nD: TYPE: https://docs.google.com/document/d/1F1mYgUNic6AwXcwpgI0fdfotTjP8R_v4ogXTNTK6u7U/edit?usp=drivesdk\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_61_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_61_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_61_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_61_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_61_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_61_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_61_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_61_7.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + 
"visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (291, 331)\nB: PRESS_HOME\nC: TYPE: Healthy lunch plan\nD: CLICK: (643, 639)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (802, 598)\nstep 2: CLICK: (913, 492)\nstep 3: CLICK: (288, 781)\nstep 4: PRESS_HOME\nstep 5: CLICK: (622, 459)\nstep 6: CLICK: (897, 892)\nI want to Watch a YouTube lecture on 3D Printing and then record the course's name in Simplenote. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Watch a YouTube lecture on 3D Printing and then record the course's name in Simplenote.\nThe historical actions are: step 1: CLICK: (802, 598)\nstep 2: CLICK: (913, 492)\nstep 3: CLICK: (288, 781)\nstep 4: PRESS_HOME\nstep 5: CLICK: (622, 459)\nstep 6: CLICK: (897, 892)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (291, 331)\nB: PRESS_HOME\nC: TYPE: Healthy lunch plan\nD: CLICK: (643, 639)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_62_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_62_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_62_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_62_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_62_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_62_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_62_6.png" + ], + "output": "A" + }, + { + "task": 
"gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: CLICK: (83, 78)\nC: CLICK: (763, 917)\nD: CLICK: (744, 789)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (889, 692)\nstep 2: CLICK: (488, 921)\nstep 3: CLICK: (325, 305)\nstep 4: CLICK: (370, 530)\nstep 5: CLICK: (367, 730)\nstep 6: CLICK: (489, 914)\nstep 7: CLICK: (513, 239)\nstep 8: TYPE: watermelon\nstep 9: CLICK: (740, 78)\nstep 10: CLICK: (626, 895)\nstep 11: CLICK: (319, 914)\nstep 12: CLICK: (610, 722)\nI want to Use the Remix:AI Image Creator app to design an image featuring a watermelon and then share it on Tumblr with katsunaksu. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use the Remix:AI Image Creator app to design an image featuring a watermelon and then share it on Tumblr with katsunaksu.\nThe historical actions are: step 1: CLICK: (889, 692)\nstep 2: CLICK: (488, 921)\nstep 3: CLICK: (325, 305)\nstep 4: CLICK: (370, 530)\nstep 5: CLICK: (367, 730)\nstep 6: CLICK: (489, 914)\nstep 7: CLICK: (513, 239)\nstep 8: TYPE: watermelon\nstep 9: CLICK: (740, 78)\nstep 10: CLICK: (626, 895)\nstep 11: CLICK: (319, 914)\nstep 12: CLICK: (610, 722)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (83, 78)\nC: CLICK: (763, 917)\nD: CLICK: (744, 789)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_63_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_63_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_63_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_63_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_63_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_63_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_63_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_63_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_63_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_63_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_63_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_63_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_63_12.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (151, 655)\nB: CLICK: (111, 
572)\nC: CLICK: (428, 487)\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (386, 650)\nstep 2: CLICK: (870, 78)\nstep 3: CLICK: (477, 353)\nstep 4: CLICK: (374, 135)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (457, 695)\nstep 11: CLICK: (177, 274)\nstep 12: CLICK: (858, 537)\nstep 13: PRESS_HOME\nI want to Remove the Pandora app using the Google Play Store and then navigate to Settings to confirm that the app is no longer present in the app resources. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Remove the Pandora app using the Google Play Store and then navigate to Settings to confirm that the app is no longer present in the app resources.\nThe historical actions are: step 1: CLICK: (386, 650)\nstep 2: CLICK: (870, 78)\nstep 3: CLICK: (477, 353)\nstep 4: CLICK: (374, 135)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (457, 695)\nstep 11: CLICK: (177, 274)\nstep 12: CLICK: (858, 537)\nstep 13: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (151, 655)\nB: CLICK: (111, 572)\nC: CLICK: (428, 487)\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_64_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_64_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_64_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_64_3.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_64_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_64_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_64_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_64_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_64_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_64_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_64_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_64_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_64_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_64_13.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (886, 547)\nB: CLICK: (865, 412)\nC: CLICK: (897, 885)\nD: TYPE: Petco\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (567, 328)\nstep 2: CLICK: (192, 73)\nstep 3: TYPE: pet store\nstep 4: CLICK: (245, 173)\nstep 5: PRESS_HOME\nstep 6: CLICK: (660, 138)\nstep 7: CLICK: (438, 233)\nI want to Utilize Google Map to locate a nearby pet store and then use Uber to arrange a ride to the found location. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize Google Map to locate a nearby pet store and then use Uber to arrange a ride to the found location.\nThe historical actions are: step 1: CLICK: (567, 328)\nstep 2: CLICK: (192, 73)\nstep 3: TYPE: pet store\nstep 4: CLICK: (245, 173)\nstep 5: PRESS_HOME\nstep 6: CLICK: (660, 138)\nstep 7: CLICK: (438, 233)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (886, 547)\nB: CLICK: (865, 412)\nC: CLICK: (897, 885)\nD: TYPE: Petco\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_65_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_65_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_65_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_65_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_65_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_65_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_65_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_65_7.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: PRESS_HOME\nC: CLICK: (409, 247)\nD: CLICK: (595, 353)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (123, 397)\nstep 2: CLICK: (855, 944)\nstep 3: CLICK: (281, 478)\nstep 4: TYPE: do yoga in the morning\nstep 5: CLICK: (556, 384)\nstep 6: PRESS_HOME\nstep 7: CLICK: (828, 654)\nI want to Find a beginner Yoga workout video on YouTube and schedule a reminder in Things to do it tomorrow morning. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Find a beginner Yoga workout video on YouTube and schedule a reminder in Things to do it tomorrow morning.\nThe historical actions are: step 1: CLICK: (123, 397)\nstep 2: CLICK: (855, 944)\nstep 3: CLICK: (281, 478)\nstep 4: TYPE: do yoga in the morning\nstep 5: CLICK: (556, 384)\nstep 6: PRESS_HOME\nstep 7: CLICK: (828, 654)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: PRESS_HOME\nC: CLICK: (409, 247)\nD: CLICK: (595, 353)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_66_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_66_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_66_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_66_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_66_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_66_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_66_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_66_7.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (384, 529)\nB: CLICK: (730, 690)\nC: CLICK: (963, 904)\nD: CLICK: (491, 231)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (138, 228)\nstep 3: CLICK: (352, 78)\nstep 4: TYPE: 2022 nobel-prize winners in physics\nstep 5: CLICK: (913, 871)\nI want to Utilize DuckDuckgo to search for the 2022 Nobel-Prize winners in physics, and then document the gathered information in Google Docs. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize DuckDuckgo to search for the 2022 Nobel-Prize winners in physics, and then document the gathered information in Google Docs.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (138, 228)\nstep 3: CLICK: (352, 78)\nstep 4: TYPE: 2022 nobel-prize winners in physics\nstep 5: CLICK: (913, 871)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (384, 529)\nB: CLICK: (730, 690)\nC: CLICK: (963, 904)\nD: CLICK: (491, 231)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_67_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_67_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_67_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_67_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_67_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_67_5.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: DOWN\nB: CLICK: (177, 274)\nC: CLICK: (584, 780)\nD: CLICK: (559, 271)\n", + "question": "The last image represents the current screenshot and the preceding images are 
historical screenshots. The historical actions are: step 1: CLICK: (386, 650)\nstep 2: CLICK: (870, 78)\nstep 3: CLICK: (477, 353)\nstep 4: CLICK: (374, 135)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (457, 695)\nI want to Remove the Pandora app using the Google Play Store and then navigate to Settings to confirm that the app is no longer present in the app resources. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Remove the Pandora app using the Google Play Store and then navigate to Settings to confirm that the app is no longer present in the app resources.\nThe historical actions are: step 1: CLICK: (386, 650)\nstep 2: CLICK: (870, 78)\nstep 3: CLICK: (477, 353)\nstep 4: CLICK: (374, 135)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (457, 695)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: DOWN\nB: CLICK: (177, 274)\nC: CLICK: (584, 780)\nD: CLICK: (559, 271)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_68_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_68_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_68_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_68_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_68_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_68_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_68_6.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_68_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_68_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_68_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_68_10.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: LEFT\nB: TYPE: do yoga in the morning\nC: PRESS_HOME\nD: CLICK: (496, 680)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (592, 913)\nstep 2: CLICK: (16, 58)\nstep 3: CLICK: (830, 53)\nstep 4: TYPE: virtual reality\nstep 5: CLICK: (132, 127)\nstep 6: CLICK: (443, 518)\nstep 7: CLICK: (709, 236)\nstep 8: CLICK: (98, 261)\nI want to Watch a lecture on Virtual Reality on YouTube and then record the name of the course in Microsoft Word. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Watch a lecture on Virtual Reality on YouTube and then record the name of the course in Microsoft Word.\nThe historical actions are: step 1: CLICK: (592, 913)\nstep 2: CLICK: (16, 58)\nstep 3: CLICK: (830, 53)\nstep 4: TYPE: virtual reality\nstep 5: CLICK: (132, 127)\nstep 6: CLICK: (443, 518)\nstep 7: CLICK: (709, 236)\nstep 8: CLICK: (98, 261)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: LEFT\nB: TYPE: do yoga in the morning\nC: PRESS_HOME\nD: CLICK: (496, 680)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_69_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_69_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_69_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_69_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_69_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_69_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_69_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_69_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_69_8.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (123, 349)\nB: TYPE: paintbrush\nC: CLICK: (353, 155)\nD: CLICK: (863, 613)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (141, 718)\nstep 2: CLICK: (410, 57)\nstep 3: CLICK: (985, 63)\nstep 4: TYPE: Pop\nstep 5: CLICK: (892, 674)\nstep 6: CLICK: (485, 141)\nstep 7: CLICK: (416, 327)\nI want to Start by listening to a Pop album on Spotify, then share the name of the album on Instagram. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Start by listening to a Pop album on Spotify, then share the name of the album on Instagram.\nThe historical actions are: step 1: CLICK: (141, 718)\nstep 2: CLICK: (410, 57)\nstep 3: CLICK: (985, 63)\nstep 4: TYPE: Pop\nstep 5: CLICK: (892, 674)\nstep 6: CLICK: (485, 141)\nstep 7: CLICK: (416, 327)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (123, 349)\nB: TYPE: paintbrush\nC: CLICK: (353, 155)\nD: CLICK: (863, 613)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_70_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_70_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_70_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_70_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_70_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_70_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_70_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_70_7.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: COMPLETE\nB: CLICK: (54, 81)\nC: 
CLICK: (518, 375)\nD: CLICK: (308, 71)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (165, 380)\nstep 2: CLICK: (571, 942)\nstep 3: CLICK: (793, 943)\nstep 4: CLICK: (401, 96)\nstep 5: TYPE: hiking trail\nstep 6: CLICK: (537, 242)\nstep 7: PRESS_HOME\nstep 8: CLICK: (338, 501)\nstep 9: CLICK: (929, 76)\nstep 10: CLICK: (479, 153)\nstep 11: TYPE: Los Angeles\nstep 12: CLICK: (457, 247)\nstep 13: SCROLL: UP\nstep 14: SCROLL: UP\nI want to Use Mapillary to find a new hiking trail, then check the weekend weather forecast using Weather & Radar. Finally, invite Victor James to join the hike through Messenger. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Mapillary to find a new hiking trail, then check the weekend weather forecast using Weather & Radar. 
Finally, invite Victor James to join the hike through Messenger.\nThe historical actions are: step 1: CLICK: (165, 380)\nstep 2: CLICK: (571, 942)\nstep 3: CLICK: (793, 943)\nstep 4: CLICK: (401, 96)\nstep 5: TYPE: hiking trail\nstep 6: CLICK: (537, 242)\nstep 7: PRESS_HOME\nstep 8: CLICK: (338, 501)\nstep 9: CLICK: (929, 76)\nstep 10: CLICK: (479, 153)\nstep 11: TYPE: Los Angeles\nstep 12: CLICK: (457, 247)\nstep 13: SCROLL: UP\nstep 14: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (54, 81)\nC: CLICK: (518, 375)\nD: CLICK: (308, 71)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_71_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_71_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_71_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_71_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_71_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_71_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_71_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_71_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_71_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_71_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_71_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_71_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_71_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_71_13.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_71_14.png" + ], + "output": "C" + }, + { + "task": 
"gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (311, 467)\nB: SCROLL: UP\nC: COMPLETE\nD: CLICK: (297, 896)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (573, 704)\nstep 2: CLICK: (170, 67)\nstep 3: TYPE: Japan\nstep 4: CLICK: (903, 693)\nstep 5: CLICK: (128, 454)\nstep 6: CLICK: (983, 68)\nstep 7: CLICK: (855, 193)\nstep 8: CLICK: (706, 599)\nstep 9: PRESS_HOME\nstep 10: CLICK: (148, 254)\nstep 11: CLICK: (495, 921)\nstep 12: CLICK: (474, 492)\nstep 13: CLICK: (687, 424)\nI want to Utilize DuckDuckGo to search for an introduction about Japan and then use Threads to share the link to the webpage you find. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize DuckDuckGo to search for an introduction about Japan and then use Threads to share the link to the webpage you find.\nThe historical actions are: step 1: CLICK: (573, 704)\nstep 2: CLICK: (170, 67)\nstep 3: TYPE: Japan\nstep 4: CLICK: (903, 693)\nstep 5: CLICK: (128, 454)\nstep 6: CLICK: (983, 68)\nstep 7: CLICK: (855, 193)\nstep 8: CLICK: (706, 599)\nstep 9: PRESS_HOME\nstep 10: CLICK: (148, 254)\nstep 11: CLICK: (495, 921)\nstep 12: CLICK: (474, 492)\nstep 13: CLICK: (687, 424)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (311, 467)\nB: SCROLL: UP\nC: COMPLETE\nD: CLICK: (297, 896)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_72_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_72_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_72_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_72_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_72_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_72_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_72_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_72_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_72_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_72_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_72_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_72_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_72_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_72_13.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + 
"visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (415, 114)\nB: TYPE: do strength training in the morning\nC: CLICK: (428, 226)\nD: SCROLL: RIGHT\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (138, 253)\nstep 2: CLICK: (392, 924)\nstep 3: CLICK: (369, 81)\nstep 4: TYPE: ASMR\nstep 5: CLICK: (351, 121)\nstep 6: CLICK: (583, 212)\nstep 7: CLICK: (353, 327)\nstep 8: PRESS_HOME\nI want to Launch the Triller app to play a soothing soundscape video, then use the Clock app to set a wake-up timer. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Launch the Triller app to play a soothing soundscape video, then use the Clock app to set a wake-up timer.\nThe historical actions are: step 1: CLICK: (138, 253)\nstep 2: CLICK: (392, 924)\nstep 3: CLICK: (369, 81)\nstep 4: TYPE: ASMR\nstep 5: CLICK: (351, 121)\nstep 6: CLICK: (583, 212)\nstep 7: CLICK: (353, 327)\nstep 8: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (415, 114)\nB: TYPE: do strength training in the morning\nC: CLICK: (428, 226)\nD: SCROLL: RIGHT\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_73_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_73_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_73_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_73_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_73_4.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_73_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_73_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_73_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_73_8.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_RECENT\nB: CLICK: (581, 865)\nC: SCROLL: UP\nD: CLICK: (876, 341)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (695, 656)\nstep 3: CLICK: (576, 763)\nstep 4: CLICK: (566, 652)\nstep 5: CLICK: (861, 746)\nstep 6: CLICK: (635, 760)\nstep 7: CLICK: (581, 865)\nI want to Utilize the 'Scientific calculator plus 991' to compute the sum of today's expenses, which are '14+200+8', and then document the total cost in the 'Monefy' app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize the 'Scientific calculator plus 991' to compute the sum of today's expenses, which are '14+200+8', and then document the total cost in the 'Monefy' app.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (695, 656)\nstep 3: CLICK: (576, 763)\nstep 4: CLICK: (566, 652)\nstep 5: CLICK: (861, 746)\nstep 6: CLICK: (635, 760)\nstep 7: CLICK: (581, 865)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_RECENT\nB: CLICK: (581, 865)\nC: SCROLL: UP\nD: CLICK: (876, 341)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_74_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_74_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_74_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_74_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_74_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_74_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_74_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_74_7.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: CLICK: (367, 345)\nC: TYPE: Same Old Blues\nD: CLICK: (868, 889)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (865, 397)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (290, 801)\nstep 5: CLICK: (413, 294)\nI want to Switch your phone's language to German and then verify the change by opening either the Clock or Setting app. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Switch your phone's language to German and then verify the change by opening either the Clock or Setting app.\nThe historical actions are: step 1: CLICK: (865, 397)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (290, 801)\nstep 5: CLICK: (413, 294)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (367, 345)\nC: TYPE: Same Old Blues\nD: CLICK: (868, 889)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_75_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_75_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_75_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_75_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_75_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_75_5.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: COMPLETE\nB: CLICK: (420, 574)\nC: CLICK: (860, 444)\nD: TYPE: Do you want to go to Los Angeles with me for a hiking trail next Sunday?\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (160, 145)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\nI want to Open the AP News app and read an English news article. Use DeepL translate to translate the title of the article into French. Finally, record the translated title in Google Docs. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open the AP News app and read an English news article. Use DeepL translate to translate the title of the article into French. Finally, record the translated title in Google Docs.\nThe historical actions are: step 1: CLICK: (160, 145)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (420, 574)\nC: CLICK: (860, 444)\nD: TYPE: Do you want to go to Los Angeles with me for a hiking trail next Sunday?\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_76_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_76_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_76_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_76_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_76_4.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (785, 315)\nB: CLICK: (715, 571)\nC: PRESS_HOME\nD: COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (424, 710)\nstep 2: CLICK: (155, 934)\nstep 3: CLICK: (966, 781)\nstep 4: PRESS_HOME\nI want to Switch your device to dark mode using the Settings app and then launch the Libby, by OverDrive reading app. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Switch your device to dark mode using the Settings app and then launch the Libby, by OverDrive reading app.\nThe historical actions are: step 1: CLICK: (424, 710)\nstep 2: CLICK: (155, 934)\nstep 3: CLICK: (966, 781)\nstep 4: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (785, 315)\nB: CLICK: (715, 571)\nC: PRESS_HOME\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_77_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_77_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_77_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_77_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_77_4.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (401, 907)\nB: CLICK: (461, 458)\nC: CLICK: (585, 566)\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (319, 516)\nstep 2: CLICK: (672, 110)\nstep 3: TYPE: how to learn photography\nstep 4: CLICK: (463, 177)\nstep 5: CLICK: (447, 227)\nstep 6: CLICK: (697, 224)\nstep 7: CLICK: (529, 536)\nstep 8: CLICK: (467, 636)\nstep 9: PRESS_HOME\nstep 10: CLICK: (88, 498)\nstep 11: CLICK: (302, 888)\nstep 12: TYPE: learn photography in \nI want to Investigate ways to learn photography using Quora, and then use Any.do to create a reminder to start the tutorial on the website. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Investigate ways to learn photography using Quora, and then use Any.do to create a reminder to start the tutorial on the website.\nThe historical actions are: step 1: CLICK: (319, 516)\nstep 2: CLICK: (672, 110)\nstep 3: TYPE: how to learn photography\nstep 4: CLICK: (463, 177)\nstep 5: CLICK: (447, 227)\nstep 6: CLICK: (697, 224)\nstep 7: CLICK: (529, 536)\nstep 8: CLICK: (467, 636)\nstep 9: PRESS_HOME\nstep 10: CLICK: (88, 498)\nstep 11: CLICK: (302, 888)\nstep 12: TYPE: learn photography in \n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (401, 907)\nB: CLICK: (461, 458)\nC: CLICK: (585, 566)\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_78_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_78_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_78_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_78_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_78_4.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_78_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_78_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_78_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_78_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_78_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_78_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_78_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_78_12.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: Bayes' theorem\nB: CLICK: (464, 172)\nC: TYPE: Paris, tomorrow: intervals of clouds and sunshine todolist: finish th homework and wash the dirty clothes\nD: CLICK: (359, 745)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (441, 914)\nstep 2: CLICK: (269, 419)\nstep 3: TYPE: book about romance\nstep 4: CLICK: (884, 886)\nstep 5: PRESS_HOME\nstep 6: CLICK: (903, 309)\nstep 7: CLICK: (713, 67)\nstep 8: TYPE: outlander book\nstep 9: CLICK: (895, 880)\nstep 10: SCROLL: UP\nstep 11: SCROLL: UP\nstep 12: SCROLL: UP\nstep 13: SCROLL: UP\nstep 14: PRESS_HOME\nstep 15: CLICK: (918, 132)\nI want to Utilize Chrome to search for a renowned romance novel, then browse Instagram for reviews and feedback on that book, and finally, purchase the book via AliExpress. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Chrome to search for a renowned romance novel, then browse Instagram for reviews and feedback on that book, and finally, purchase the book via AliExpress.\nThe historical actions are: step 1: CLICK: (441, 914)\nstep 2: CLICK: (269, 419)\nstep 3: TYPE: book about romance\nstep 4: CLICK: (884, 886)\nstep 5: PRESS_HOME\nstep 6: CLICK: (903, 309)\nstep 7: CLICK: (713, 67)\nstep 8: TYPE: outlander book\nstep 9: CLICK: (895, 880)\nstep 10: SCROLL: UP\nstep 11: SCROLL: UP\nstep 12: SCROLL: UP\nstep 13: SCROLL: UP\nstep 14: PRESS_HOME\nstep 15: CLICK: (918, 132)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Bayes' theorem\nB: CLICK: (464, 172)\nC: TYPE: Paris, tomorrow: intervals of clouds and sunshine todolist: finish th homework and wash the dirty clothes\nD: CLICK: (359, 745)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_8.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_13.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_14.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_79_15.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: TYPE: sewing class\nC: CLICK: (138, 239)\nD: CLICK: (925, 920)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (599, 390)\nstep 2: CLICK: (164, 146)\nstep 3: TYPE: account manager\nI want to Using Indeed Job Search, find an account manager job and then record the company name in Google Keep. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using Indeed Job Search, find an account manager job and then record the company name in Google Keep.\nThe historical actions are: step 1: CLICK: (599, 390)\nstep 2: CLICK: (164, 146)\nstep 3: TYPE: account manager\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: TYPE: sewing class\nC: CLICK: (138, 239)\nD: CLICK: (925, 920)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_80_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_80_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_80_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_80_3.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: COMPLETE\nB: CLICK: (203, 290)\nC: CLICK: (498, 640)\nD: CLICK: (906, 593)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (369, 145)\nstep 2: CLICK: (420, 719)\nstep 3: PRESS_HOME\nstep 4: CLICK: (614, 265)\nstep 5: CLICK: (313, 329)\nstep 6: TYPE: Tractor-trailer dirver 'executed' in road rage shooting, fooicials say\nstep 7: CLICK: (929, 641)\nstep 8: PRESS_HOME\nstep 9: CLICK: (855, 407)\nstep 10: CLICK: (797, 824)\nstep 11: CLICK: (145, 652)\nI want to Using Opera News, read an English news article. Translate its title into Korean with DeepL translate, and then document the translated title in WPS office. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using Opera News, read an English news article. Translate its title into Korean with DeepL translate, and then document the translated title in WPS office.\nThe historical actions are: step 1: CLICK: (369, 145)\nstep 2: CLICK: (420, 719)\nstep 3: PRESS_HOME\nstep 4: CLICK: (614, 265)\nstep 5: CLICK: (313, 329)\nstep 6: TYPE: Tractor-trailer dirver 'executed' in road rage shooting, fooicials say\nstep 7: CLICK: (929, 641)\nstep 8: PRESS_HOME\nstep 9: CLICK: (855, 407)\nstep 10: CLICK: (797, 824)\nstep 11: CLICK: (145, 652)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (203, 290)\nC: CLICK: (498, 640)\nD: CLICK: (906, 593)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_81_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_81_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_81_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_81_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_81_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_81_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_81_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_81_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_81_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_81_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_81_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_81_11.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: CLICK: (385, 78)\nC: CLICK: (499, 460)\nD: SCROLL: 
RIGHT\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (336, 470)\nstep 3: CLICK: (247, 745)\nstep 4: CLICK: (252, 64)\nstep 5: TYPE: tennis court\nstep 6: CLICK: (922, 873)\nstep 7: CLICK: (394, 165)\nstep 8: PRESS_RECENT\nstep 9: CLICK: (66, 526)\nstep 10: CLICK: (327, 89)\nstep 11: TYPE: fitness tracking apps\nstep 12: CLICK: (511, 165)\nstep 13: CLICK: (452, 787)\nI want to First, locate a nearby tennis court using the 'GPS, Maps, Voice Navigation' app. Once you've found a suitable location, head to the 'Google Play Store' to download a fitness tracking app to set your fitness goals. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, locate a nearby tennis court using the 'GPS, Maps, Voice Navigation' app. 
Once you've found a suitable location, head to the 'Google Play Store' to download a fitness tracking app to set your fitness goals.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (336, 470)\nstep 3: CLICK: (247, 745)\nstep 4: CLICK: (252, 64)\nstep 5: TYPE: tennis court\nstep 6: CLICK: (922, 873)\nstep 7: CLICK: (394, 165)\nstep 8: PRESS_RECENT\nstep 9: CLICK: (66, 526)\nstep 10: CLICK: (327, 89)\nstep 11: TYPE: fitness tracking apps\nstep 12: CLICK: (511, 165)\nstep 13: CLICK: (452, 787)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (385, 78)\nC: CLICK: (499, 460)\nD: SCROLL: RIGHT\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_82_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_82_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_82_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_82_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_82_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_82_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_82_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_82_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_82_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_82_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_82_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_82_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_82_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_82_13.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + 
"visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: COMPLETE\nB: CLICK: (361, 243)\nC: CLICK: (824, 420)\nD: TYPE: Recipe Organizer Apps\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (474, 220)\nstep 2: CLICK: (949, 72)\nstep 3: TYPE: The Fall of the Berlin Wall\nstep 4: CLICK: (347, 153)\nstep 5: CLICK: (397, 338)\nstep 6: PRESS_HOME\nstep 7: CLICK: (869, 106)\nstep 8: CLICK: (227, 79)\nstep 9: TYPE: The Fall of the Berlin Wall\nstep 10: CLICK: (929, 879)\nstep 11: CLICK: (605, 416)\nstep 12: CLICK: (522, 633)\nstep 13: CLICK: (301, 180)\nstep 14: TYPE: The Fall of the Berlin Wall\nI want to Delve into the history of the Fall of the Berlin Wall by exploring relevant videos on YouTube, and enhance your understanding by reading or listening to a related book on Amazon Kindle. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Delve into the history of the Fall of the Berlin Wall by exploring relevant videos on YouTube, and enhance your understanding by reading or listening to a related book on Amazon Kindle.\nThe historical actions are: step 1: CLICK: (474, 220)\nstep 2: CLICK: (949, 72)\nstep 3: TYPE: The Fall of the Berlin Wall\nstep 4: CLICK: (347, 153)\nstep 5: CLICK: (397, 338)\nstep 6: PRESS_HOME\nstep 7: CLICK: (869, 106)\nstep 8: CLICK: (227, 79)\nstep 9: TYPE: The Fall of the Berlin Wall\nstep 10: CLICK: (929, 879)\nstep 11: CLICK: (605, 416)\nstep 12: CLICK: (522, 633)\nstep 13: CLICK: (301, 180)\nstep 14: TYPE: The Fall of the Berlin Wall\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (361, 243)\nC: CLICK: (824, 420)\nD: TYPE: Recipe Organizer Apps\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_83_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_83_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_83_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_83_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_83_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_83_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_83_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_83_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_83_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_83_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_83_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_83_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_83_12.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_83_13.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_83_14.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: TYPE: 3D Printer Course for Beginners\nC: CLICK: (307, 490)\nD: CLICK: (159, 646)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (159, 349)\nstep 3: CLICK: (287, 922)\nstep 4: CLICK: (191, 156)\nstep 5: TYPE: Tokyo, Japan itinerary\nstep 6: CLICK: (890, 864)\nstep 7: CLICK: (537, 614)\nI want to Using Threads for research, find an itinerary for visiting Tokyo, Japan, and then proceed to book accommodations through Airbnb. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using Threads for research, find an itinerary for visiting Tokyo, Japan, and then proceed to book accommodations through Airbnb.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (159, 349)\nstep 3: CLICK: (287, 922)\nstep 4: CLICK: (191, 156)\nstep 5: TYPE: Tokyo, Japan itinerary\nstep 6: CLICK: (890, 864)\nstep 7: CLICK: (537, 614)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: TYPE: 3D Printer Course for Beginners\nC: CLICK: (307, 490)\nD: CLICK: (159, 646)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_84_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_84_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_84_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_84_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_84_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_84_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_84_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_84_7.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: how to make vegetable stir fry\nB: CLICK: (409, 649)\nC: CLICK: (387, 66)\nD: COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (194, 478)\nstep 2: CLICK: (281, 439)\nstep 3: CLICK: (161, 286)\nstep 4: CLICK: (333, 908)\nstep 5: CLICK: (308, 664)\nstep 6: CLICK: (859, 917)\nI want to Open 'Gallery-photo gallery, album', select a photo, and share it on the social app 'X' with liudehu19294094. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open 'Gallery-photo gallery, album', select a photo, and share it on the social app 'X' with liudehu19294094.\nThe historical actions are: step 1: CLICK: (194, 478)\nstep 2: CLICK: (281, 439)\nstep 3: CLICK: (161, 286)\nstep 4: CLICK: (333, 908)\nstep 5: CLICK: (308, 664)\nstep 6: CLICK: (859, 917)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: how to make vegetable stir fry\nB: CLICK: (409, 649)\nC: CLICK: (387, 66)\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_85_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_85_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_85_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_85_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_85_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_85_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_85_6.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (631, 285)\nB: CLICK: (856, 710)\nC: SCROLL: UP\nD: TYPE: caba62244@gmail.com\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (712, 544)\nstep 2: CLICK: (583, 577)\nstep 3: CLICK: (480, 560)\nstep 4: CLICK: (424, 555)\nstep 5: CLICK: (596, 659)\nstep 6: CLICK: (497, 664)\nstep 7: CLICK: (410, 666)\nstep 8: SCROLL: UP\nstep 9: CLICK: (675, 921)\nstep 10: PRESS_HOME\nI want to Use Applock Pro - APP Lock & Guard to lock the Photos app, then open Photos to verify the lock with PIN 321654. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Applock Pro - APP Lock & Guard to lock the Photos app, then open Photos to verify the lock with PIN 321654.\nThe historical actions are: step 1: CLICK: (712, 544)\nstep 2: CLICK: (583, 577)\nstep 3: CLICK: (480, 560)\nstep 4: CLICK: (424, 555)\nstep 5: CLICK: (596, 659)\nstep 6: CLICK: (497, 664)\nstep 7: CLICK: (410, 666)\nstep 8: SCROLL: UP\nstep 9: CLICK: (675, 921)\nstep 10: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (631, 285)\nB: CLICK: (856, 710)\nC: SCROLL: UP\nD: TYPE: caba62244@gmail.com\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_86_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_86_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_86_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_86_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_86_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_86_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_86_6.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_86_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_86_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_86_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_86_10.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: SCROLL: RIGHT\nC: PRESS_HOME\nD: TYPE: NBA game\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: LEFT\nstep 3: CLICK: (415, 127)\nstep 4: CLICK: (947, 76)\nstep 5: TYPE: NBA\nstep 6: SCROLL: LEFT\nstep 7: CLICK: (697, 135)\nstep 8: PRESS_HOME\nstep 9: CLICK: (192, 147)\nstep 10: CLICK: (625, 576)\nstep 11: CLICK: (933, 917)\nI want to Using ESPN, find the details for the next NBA game and then set a reminder for it in Microsoft To Do. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using ESPN, find the details for the next NBA game and then set a reminder for it in Microsoft To Do.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: LEFT\nstep 3: CLICK: (415, 127)\nstep 4: CLICK: (947, 76)\nstep 5: TYPE: NBA\nstep 6: SCROLL: LEFT\nstep 7: CLICK: (697, 135)\nstep 8: PRESS_HOME\nstep 9: CLICK: (192, 147)\nstep 10: CLICK: (625, 576)\nstep 11: CLICK: (933, 917)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: SCROLL: RIGHT\nC: PRESS_HOME\nD: TYPE: NBA game\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_87_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_87_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_87_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_87_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_87_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_87_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_87_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_87_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_87_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_87_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_87_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_87_11.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (859, 584)\nB: COMPLETE\nC: TYPE: Beautifuiil realised and quietil beguiling,a vivid evocation of nature.\nD: TYPE: hiking trail\n", + "question": "The last image represents the current screenshot and the 
preceding images are historical screenshots. The historical actions are: step 1: CLICK: (140, 269)\nstep 2: CLICK: (485, 88)\nstep 3: TYPE: Shopify's stock market news\nstep 4: CLICK: (490, 181)\nstep 5: CLICK: (369, 338)\nstep 6: PRESS_HOME\nstep 7: CLICK: (607, 273)\nstep 8: CLICK: (944, 78)\nstep 9: TYPE: Shopify\nstep 10: CLICK: (218, 220)\nI want to Using Firefox, search for today's stock market news related to Shopify. After gathering the news, open Investing.com to check the current stock price trends of Shopify. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Firefox, search for today's stock market news related to Shopify. After gathering the news, open Investing.com to check the current stock price trends of Shopify.\nThe historical actions are: step 1: CLICK: (140, 269)\nstep 2: CLICK: (485, 88)\nstep 3: TYPE: Shopify's stock market news\nstep 4: CLICK: (490, 181)\nstep 5: CLICK: (369, 338)\nstep 6: PRESS_HOME\nstep 7: CLICK: (607, 273)\nstep 8: CLICK: (944, 78)\nstep 9: TYPE: Shopify\nstep 10: CLICK: (218, 220)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (859, 584)\nB: COMPLETE\nC: TYPE: Beautifuiil realised and quietil beguiling,a vivid evocation of nature.\nD: TYPE: hiking trail\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_88_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_88_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_88_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_88_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_88_4.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_88_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_88_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_88_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_88_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_88_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_88_10.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (777, 842)\nB: CLICK: (920, 261)\nC: COMPLETE\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (791, 914)\nstep 2: PRESS_HOME\nstep 3: CLICK: (903, 136)\nI want to Using the Firefox Browser, search for a popular K-Pop music band, then listen to their latest album on Pandora. Finally, check on TickPick - Live Event Tickets to see if you can purchase a ticket for an upcoming concert. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using the Firefox Browser, search for a popular K-Pop music band, then listen to their latest album on Pandora. 
Finally, check on TickPick - Live Event Tickets to see if you can purchase a ticket for an upcoming concert.\nThe historical actions are: step 1: CLICK: (791, 914)\nstep 2: PRESS_HOME\nstep 3: CLICK: (903, 136)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (777, 842)\nB: CLICK: (920, 261)\nC: COMPLETE\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_89_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_89_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_89_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_89_3.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (191, 520)\nB: SCROLL: UP\nC: COMPLETE\nD: TYPE: technology conference events\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (785, 660)\nstep 2: CLICK: (241, 434)\nstep 3: CLICK: (415, 479)\nstep 4: CLICK: (514, 73)\nstep 5: PRESS_HOME\nstep 6: CLICK: (557, 313)\nstep 7: CLICK: (373, 174)\nstep 8: CLICK: (330, 911)\nstep 9: TYPE: 9298916954\nI want to Set up an online meeting using ZOOM Cloud Meetings and share the meeting link with liudehu19294094 via app X. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Set up an online meeting using ZOOM Cloud Meetings and share the meeting link with liudehu19294094 via app X.\nThe historical actions are: step 1: CLICK: (785, 660)\nstep 2: CLICK: (241, 434)\nstep 3: CLICK: (415, 479)\nstep 4: CLICK: (514, 73)\nstep 5: PRESS_HOME\nstep 6: CLICK: (557, 313)\nstep 7: CLICK: (373, 174)\nstep 8: CLICK: (330, 911)\nstep 9: TYPE: 9298916954\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (191, 520)\nB: SCROLL: UP\nC: COMPLETE\nD: TYPE: technology conference events\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_90_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_90_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_90_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_90_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_90_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_90_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_90_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_90_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_90_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_90_9.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: 2013 nobel prize winners in physics\nB: CLICK: (616, 133)\nC: TYPE: statue of liberty\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (370, 126)\nstep 3: CLICK: (352, 72)\nstep 4: TYPE: properties of circle\nstep 5: CLICK: (469, 145)\nstep 6: CLICK: (255, 276)\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nI want to Utilize DuckDuckgo to gather information on the properties of a Circle, and then compile your findings into a brief document using Microsoft Word. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize DuckDuckgo to gather information on the properties of a Circle, and then compile your findings into a brief document using Microsoft Word.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (370, 126)\nstep 3: CLICK: (352, 72)\nstep 4: TYPE: properties of circle\nstep 5: CLICK: (469, 145)\nstep 6: CLICK: (255, 276)\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: 2013 nobel prize winners in physics\nB: CLICK: (616, 133)\nC: TYPE: statue of liberty\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_91_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_91_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_91_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_91_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_91_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_91_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_91_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_91_7.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_91_8.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: Instagram\nB: TYPE: GoPro HERO10\nC: COMPLETE\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (595, 913)\nstep 2: CLICK: (717, 234)\nstep 3: CLICK: (95, 261)\nstep 4: PRESS_HOME\nstep 5: CLICK: (717, 412)\nstep 6: CLICK: (235, 487)\nstep 7: CLICK: (81, 398)\nstep 8: CLICK: (470, 498)\nstep 9: TYPE: VR Development Full Course: Oculus Quest\nI want to Watch a lecture on Virtual Reality on YouTube and then record the course name in the WPS Office notes app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Watch a lecture on Virtual Reality on YouTube and then record the course name in the WPS Office notes app.\nThe historical actions are: step 1: CLICK: (595, 913)\nstep 2: CLICK: (717, 234)\nstep 3: CLICK: (95, 261)\nstep 4: PRESS_HOME\nstep 5: CLICK: (717, 412)\nstep 6: CLICK: (235, 487)\nstep 7: CLICK: (81, 398)\nstep 8: CLICK: (470, 498)\nstep 9: TYPE: VR Development Full Course: Oculus Quest\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Instagram\nB: TYPE: GoPro HERO10\nC: COMPLETE\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_92_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_92_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_92_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_92_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_92_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_92_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_92_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_92_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_92_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_92_9.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (158, 742)\nB: CLICK: (907, 242)\nC: CLICK: (264, 272)\nD: CLICK: (471, 264)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (140, 641)\nstep 2: CLICK: (945, 70)\nstep 3: TYPE: World War I\nstep 4: CLICK: (937, 901)\nI want to Delve into a historical event from World War I and either read a related book on Amazon Kindle or listen to a related video on YouTube. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Delve into a historical event from World War I and either read a related book on Amazon Kindle or listen to a related video on YouTube.\nThe historical actions are: step 1: CLICK: (140, 641)\nstep 2: CLICK: (945, 70)\nstep 3: TYPE: World War I\nstep 4: CLICK: (937, 901)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (158, 742)\nB: CLICK: (907, 242)\nC: CLICK: (264, 272)\nD: CLICK: (471, 264)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_93_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_93_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_93_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_93_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_93_4.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: COMPLETE\nC: PRESS_HOME\nD: SCROLL: DOWN\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (791, 921)\nstep 2: TYPE: popular k-pop music band\nstep 3: CLICK: (863, 872)\nstep 4: PRESS_HOME\nstep 5: CLICK: (557, 128)\nstep 6: CLICK: (73, 169)\nstep 7: CLICK: (403, 215)\nstep 8: TYPE: BTS\nstep 9: CLICK: (882, 895)\nstep 10: CLICK: (352, 236)\nstep 11: CLICK: (949, 457)\nI want to Use Firefox Browser to find a well-known K-Pop band, listen to their latest album on Spotify, and see if you can purchase a concert ticket on StubHub. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Firefox Browser to find a well-known K-Pop band, listen to their latest album on Spotify, and see if you can purchase a concert ticket on StubHub.\nThe historical actions are: step 1: CLICK: (791, 921)\nstep 2: TYPE: popular k-pop music band\nstep 3: CLICK: (863, 872)\nstep 4: PRESS_HOME\nstep 5: CLICK: (557, 128)\nstep 6: CLICK: (73, 169)\nstep 7: CLICK: (403, 215)\nstep 8: TYPE: BTS\nstep 9: CLICK: (882, 895)\nstep 10: CLICK: (352, 236)\nstep 11: CLICK: (949, 457)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: COMPLETE\nC: PRESS_HOME\nD: SCROLL: DOWN\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_94_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_94_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_94_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_94_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_94_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_94_5.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_94_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_94_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_94_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_94_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_94_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_94_11.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (966, 604)\nB: CLICK: (97, 165)\nC: TYPE: Swimming Morning\nD: CLICK: (309, 608)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (722, 254)\nstep 2: CLICK: (523, 912)\nstep 3: CLICK: (680, 807)\nI want to Search for a beginner swimming workout video on Likee, and then use the To-Do List app to set a reminder to perform the workout tomorrow morning. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Search for a beginner swimming workout video on Likee, and then use the To-Do List app to set a reminder to perform the workout tomorrow morning.\nThe historical actions are: step 1: CLICK: (722, 254)\nstep 2: CLICK: (523, 912)\nstep 3: CLICK: (680, 807)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (966, 604)\nB: CLICK: (97, 165)\nC: TYPE: Swimming Morning\nD: CLICK: (309, 608)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_95_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_95_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_95_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_95_3.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (534, 608)\nB: CLICK: (494, 485)\nC: CLICK: (457, 569)\nD: COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (499, 346)\nstep 3: CLICK: (45, 71)\nstep 4: CLICK: (45, 71)\nstep 5: CLICK: (45, 71)\nstep 6: CLICK: (808, 71)\nstep 7: TYPE: 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid book review\nstep 8: CLICK: (916, 868)\nstep 9: SCROLL: UP\nstep 10: CLICK: (886, 507)\nstep 11: PRESS_HOME\nI want to Please read a book review online on Quora for the book 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid. Afterwards, purchase either the ebook or the physical copy using Google Play Books & Audiobooks. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Please read a book review online on Quora for the book 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid. Afterwards, purchase either the ebook or the physical copy using Google Play Books & Audiobooks.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (499, 346)\nstep 3: CLICK: (45, 71)\nstep 4: CLICK: (45, 71)\nstep 5: CLICK: (45, 71)\nstep 6: CLICK: (808, 71)\nstep 7: TYPE: 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid book review\nstep 8: CLICK: (916, 868)\nstep 9: SCROLL: UP\nstep 10: CLICK: (886, 507)\nstep 11: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (534, 608)\nB: CLICK: (494, 485)\nC: CLICK: (457, 569)\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_96_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_96_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_96_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_96_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_96_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_96_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_96_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_96_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_96_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_96_9.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_96_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_96_11.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid book review\nB: TYPE: 314 6th St Unit 608\nC: PRESS_HOME\nD: CLICK: (361, 735)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (499, 346)\nstep 3: CLICK: (45, 71)\nstep 4: CLICK: (45, 71)\nstep 5: CLICK: (45, 71)\nstep 6: CLICK: (808, 71)\nI want to Please read a book review online on Quora for the book 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid. Afterwards, purchase either the ebook or the physical copy using Google Play Books & Audiobooks. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Please read a book review online on Quora for the book 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid. 
Afterwards, purchase either the ebook or the physical copy using Google Play Books & Audiobooks.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (499, 346)\nstep 3: CLICK: (45, 71)\nstep 4: CLICK: (45, 71)\nstep 5: CLICK: (45, 71)\nstep 6: CLICK: (808, 71)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid book review\nB: TYPE: 314 6th St Unit 608\nC: PRESS_HOME\nD: CLICK: (361, 735)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_97_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_97_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_97_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_97_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_97_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_97_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_97_6.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: 9298916954\nB: PRESS_BACK\nC: COMPLETE\nD: PRESS_HOME\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (158, 103)\nstep 2: CLICK: (234, 830)\nstep 3: CLICK: (532, 205)\nstep 4: CLICK: (481, 758)\nstep 5: TYPE: shopping mall\nstep 6: CLICK: (895, 896)\nstep 7: CLICK: (527, 153)\nI want to Utilize the GPS app to locate a nearby shopping mall, and then use the Uber app to book a ride to that location. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize the GPS app to locate a nearby shopping mall, and then use the Uber app to book a ride to that location.\nThe historical actions are: step 1: CLICK: (158, 103)\nstep 2: CLICK: (234, 830)\nstep 3: CLICK: (532, 205)\nstep 4: CLICK: (481, 758)\nstep 5: TYPE: shopping mall\nstep 6: CLICK: (895, 896)\nstep 7: CLICK: (527, 153)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: 9298916954\nB: PRESS_BACK\nC: COMPLETE\nD: PRESS_HOME\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_98_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_98_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_98_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_98_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_98_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_98_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_98_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_98_7.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: TYPE: Coca-Cola's stock market news\nC: TYPE: the best Italian in San Jose\nD: TYPE: Tokopedia\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (561, 152)\nstep 2: CLICK: (341, 138)\nstep 3: TYPE: Healthy lunch plan\nstep 4: CLICK: (900, 85)\nI want to Choose a healthy lunch plan for the next day, document it, and watch a video on how to prepare one of the dishes. Use TikTok for the video, Opera browser with AI for research, and Google Docs for taking notes. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Choose a healthy lunch plan for the next day, document it, and watch a video on how to prepare one of the dishes. Use TikTok for the video, Opera browser with AI for research, and Google Docs for taking notes.\nThe historical actions are: step 1: CLICK: (561, 152)\nstep 2: CLICK: (341, 138)\nstep 3: TYPE: Healthy lunch plan\nstep 4: CLICK: (900, 85)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: TYPE: Coca-Cola's stock market news\nC: TYPE: the best Italian in San Jose\nD: TYPE: Tokopedia\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_99_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_99_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_99_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_99_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_99_4.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: RIGHT\nB: CLICK: (877, 862)\nC: COMPLETE\nD: CLICK: (824, 73)\n", + "question": "The last image represents the current screenshot and the preceding images 
are historical screenshots. The historical actions are: step 1: CLICK: (846, 368)\nstep 2: CLICK: (837, 42)\nstep 3: TYPE: Task Manager Apps\nstep 4: CLICK: (906, 902)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (137, 439)\nstep 9: PRESS_HOME\nI want to Investigate Task Manager applications and choose one to download from the Google Play Store. Share your experience or findings on Facebook. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Investigate Task Manager applications and choose one to download from the Google Play Store. Share your experience or findings on Facebook.\nThe historical actions are: step 1: CLICK: (846, 368)\nstep 2: CLICK: (837, 42)\nstep 3: TYPE: Task Manager Apps\nstep 4: CLICK: (906, 902)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (137, 439)\nstep 9: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: RIGHT\nB: CLICK: (877, 862)\nC: COMPLETE\nD: CLICK: (824, 73)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_100_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_100_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_100_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_100_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_100_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_100_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_100_6.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_100_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_100_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_100_9.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (906, 915)\nB: SCROLL: UP\nC: CLICK: (327, 960)\nD: CLICK: (741, 172)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (851, 131)\nstep 3: CLICK: (460, 71)\nstep 4: TYPE: the best bookstore in San Jose\nstep 5: CLICK: (913, 909)\nstep 6: PRESS_HOME\nstep 7: CLICK: (168, 125)\nstep 8: CLICK: (567, 211)\nstep 9: CLICK: (297, 847)\nstep 10: TYPE: Recyle Bookstore\nstep 11: CLICK: (902, 924)\nstep 12: CLICK: (395, 158)\nI want to Utilize Firefox to search for the top-rated bookstore in your local city, and then use GPS to navigate to that location. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize Firefox to search for the top-rated bookstore in your local city, and then use GPS to navigate to that location.\nThe historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (851, 131)\nstep 3: CLICK: (460, 71)\nstep 4: TYPE: the best bookstore in San Jose\nstep 5: CLICK: (913, 909)\nstep 6: PRESS_HOME\nstep 7: CLICK: (168, 125)\nstep 8: CLICK: (567, 211)\nstep 9: CLICK: (297, 847)\nstep 10: TYPE: Recyle Bookstore\nstep 11: CLICK: (902, 924)\nstep 12: CLICK: (395, 158)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (906, 915)\nB: SCROLL: UP\nC: CLICK: (327, 960)\nD: CLICK: (741, 172)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_101_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_101_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_101_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_101_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_101_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_101_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_101_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_101_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_101_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_101_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_101_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_101_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_101_12.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: 
(318, 558)\nB: CLICK: (234, 868)\nC: CLICK: (139, 557)\nD: CLICK: (819, 942)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (509, 910)\nstep 3: CLICK: (978, 57)\nstep 4: CLICK: (295, 116)\nstep 5: TYPE: 'Where the Crawdads Sing' by Delia Owens book review\nstep 6: CLICK: (879, 693)\nstep 7: SCROLL: UP\nstep 8: CLICK: (335, 622)\nstep 9: CLICK: (672, 746)\nstep 10: PRESS_HOME\nI want to Open Chrome and search for a book review of 'Where the Crawdads Sing' by Delia Owens. After reading the review, use OfferUp: Buy. Sell. Letgo. to purchase either the ebook or a physical copy of the book. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open Chrome and search for a book review of 'Where the Crawdads Sing' by Delia Owens. After reading the review, use OfferUp: Buy. Sell. Letgo. 
to purchase either the ebook or a physical copy of the book.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (509, 910)\nstep 3: CLICK: (978, 57)\nstep 4: CLICK: (295, 116)\nstep 5: TYPE: 'Where the Crawdads Sing' by Delia Owens book review\nstep 6: CLICK: (879, 693)\nstep 7: SCROLL: UP\nstep 8: CLICK: (335, 622)\nstep 9: CLICK: (672, 746)\nstep 10: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (318, 558)\nB: CLICK: (234, 868)\nC: CLICK: (139, 557)\nD: CLICK: (819, 942)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_102_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_102_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_102_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_102_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_102_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_102_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_102_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_102_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_102_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_102_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_102_10.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (861, 391)\nB: CLICK: (559, 927)\nC: SCROLL: UP\nD: PRESS_HOME\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (802, 916)\nstep 2: CLICK: (920, 147)\nstep 3: CLICK: (692, 558)\nstep 4: PRESS_HOME\nI want to Look up the top video blogs on DIY crafts using Firefox Browser, adjust the phone's brightness through Settings, then open YouTube to watch and follow the content. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Look up the top video blogs on DIY crafts using Firefox Browser, adjust the phone's brightness through Settings, then open YouTube to watch and follow the content.\nThe historical actions are: step 1: CLICK: (802, 916)\nstep 2: CLICK: (920, 147)\nstep 3: CLICK: (692, 558)\nstep 4: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (861, 391)\nB: CLICK: (559, 927)\nC: SCROLL: UP\nD: PRESS_HOME\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_103_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_103_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_103_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_103_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_103_4.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (787, 511)\nB: CLICK: (352, 398)\nC: PRESS_HOME\nD: CLICK: (184, 234)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: LEFT\nstep 3: CLICK: (846, 123)\nstep 4: SCROLL: UP\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: DOWN\nstep 8: SCROLL: DOWN\nstep 9: PRESS_HOME\nstep 10: SCROLL: RIGHT\nI want to Utilize DuckDuckgo to search for the 2023 Nobel-Prize winners in physics, and then document the gathered information in Google Docs. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize DuckDuckgo to search for the 2023 Nobel-Prize winners in physics, and then document the gathered information in Google Docs.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: LEFT\nstep 3: CLICK: (846, 123)\nstep 4: SCROLL: UP\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: DOWN\nstep 8: SCROLL: DOWN\nstep 9: PRESS_HOME\nstep 10: SCROLL: RIGHT\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (787, 511)\nB: CLICK: (352, 398)\nC: PRESS_HOME\nD: CLICK: (184, 234)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_104_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_104_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_104_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_104_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_104_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_104_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_104_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_104_7.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_104_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_104_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_104_10.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (965, 62)\nB: CLICK: (432, 324)\nC: CLICK: (904, 88)\nD: CLICK: (491, 314)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (929, 488)\nstep 2: CLICK: (335, 347)\nstep 3: CLICK: (413, 328)\nstep 4: CLICK: (310, 901)\nstep 5: CLICK: (327, 680)\nstep 6: CLICK: (325, 281)\nstep 7: TYPE: caba62244@gmail.com\nstep 8: CLICK: (360, 364)\nI want to Find a photo in Gallery-photo gallery,album, and share it through Gmail with caba62244@gmail.com. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Find a photo in Gallery-photo gallery,album, and share it through Gmail with caba62244@gmail.com.\nThe historical actions are: step 1: CLICK: (929, 488)\nstep 2: CLICK: (335, 347)\nstep 3: CLICK: (413, 328)\nstep 4: CLICK: (310, 901)\nstep 5: CLICK: (327, 680)\nstep 6: CLICK: (325, 281)\nstep 7: TYPE: caba62244@gmail.com\nstep 8: CLICK: (360, 364)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (965, 62)\nB: CLICK: (432, 324)\nC: CLICK: (904, 88)\nD: CLICK: (491, 314)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_105_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_105_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_105_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_105_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_105_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_105_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_105_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_105_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_105_8.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: CLICK: (288, 890)\nC: LONG_PRESS: (146, 398)\nD: CLICK: (423, 927)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (638, 383)\nstep 2: TYPE: book about biography\nstep 3: CLICK: (909, 913)\nI want to Utilize DuckDuckGo to search for a renowned biography book, then use Instagram to read reviews about it, and finally, head over to Amazon to purchase the book. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize DuckDuckGo to search for a renowned biography book, then use Instagram to read reviews about it, and finally, head over to Amazon to purchase the book.\nThe historical actions are: step 1: CLICK: (638, 383)\nstep 2: TYPE: book about biography\nstep 3: CLICK: (909, 913)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (288, 890)\nC: LONG_PRESS: (146, 398)\nD: CLICK: (423, 927)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_106_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_106_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_106_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_106_3.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: COMPLETE\nB: CLICK: (114, 175)\nC: CLICK: (913, 878)\nD: CLICK: (250, 303)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (619, 836)\nstep 2: CLICK: (802, 190)\nstep 3: TYPE: Pad Thai ingredients\nstep 4: CLICK: (221, 183)\nstep 5: CLICK: (940, 582)\nstep 6: PRESS_HOME\nstep 7: CLICK: (844, 396)\nstep 8: CLICK: (875, 817)\nstep 9: CLICK: (172, 650)\nI want to Utilize Chrome to search for the ingredients required for Pad Thai. Once you have found the main ingredients, use WPS Office to compile and create a shopping list of these essential items. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Chrome to search for the ingredients required for Pad Thai. Once you have found the main ingredients, use WPS Office to compile and create a shopping list of these essential items.\nThe historical actions are: step 1: CLICK: (619, 836)\nstep 2: CLICK: (802, 190)\nstep 3: TYPE: Pad Thai ingredients\nstep 4: CLICK: (221, 183)\nstep 5: CLICK: (940, 582)\nstep 6: PRESS_HOME\nstep 7: CLICK: (844, 396)\nstep 8: CLICK: (875, 817)\nstep 9: CLICK: (172, 650)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (114, 175)\nC: CLICK: (913, 878)\nD: CLICK: (250, 303)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_107_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_107_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_107_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_107_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_107_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_107_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_107_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_107_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_107_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_107_9.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (872, 
74)\nB: COMPLETE\nC: TYPE: https://share.newsbreak.com/6uadf7u2\nD: CLICK: (411, 243)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (188, 658)\nstep 2: CLICK: (303, 580)\nstep 3: CLICK: (288, 677)\nstep 4: CLICK: (804, 89)\nstep 5: TYPE: Todoist\nI want to Enable or disable notifications for any application on the phone and subsequently launch the app. Please ensure you use 'Todoist' and 'Setting' to accomplish this. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Enable or disable notifications for any application on the phone and subsequently launch the app. Please ensure you use 'Todoist' and 'Setting' to accomplish this.\nThe historical actions are: step 1: CLICK: (188, 658)\nstep 2: CLICK: (303, 580)\nstep 3: CLICK: (288, 677)\nstep 4: CLICK: (804, 89)\nstep 5: TYPE: Todoist\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (872, 74)\nB: COMPLETE\nC: TYPE: https://share.newsbreak.com/6uadf7u2\nD: CLICK: (411, 243)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_108_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_108_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_108_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_108_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_108_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_108_5.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + 
"visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: digital marketing class\nB: CLICK: (326, 274)\nC: CLICK: (517, 942)\nD: CLICK: (374, 62)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (360, 522)\nstep 2: CLICK: (824, 67)\nstep 3: TYPE: basketball\nI want to Use ESPN to find the latest basketball game score, then send the result to caba62244@gmail.com via Gmail. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use ESPN to find the latest basketball game score, then send the result to caba62244@gmail.com via Gmail.\nThe historical actions are: step 1: CLICK: (360, 522)\nstep 2: CLICK: (824, 67)\nstep 3: TYPE: basketball\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: digital marketing class\nB: CLICK: (326, 274)\nC: CLICK: (517, 942)\nD: CLICK: (374, 62)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_109_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_109_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_109_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_109_3.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (677, 588)\nB: TYPE: Italin Learning\nC: TYPE: Deep Sea Exploration\nD: COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (386, 406)\nstep 2: CLICK: (78, 65)\nstep 3: CLICK: (315, 154)\nstep 4: CLICK: (163, 490)\nstep 5: CLICK: (395, 928)\nstep 6: PRESS_HOME\nstep 7: CLICK: (817, 531)\nstep 8: CLICK: (929, 875)\nI want to Engage in an Italian language lesson on Duolingo and create a comprehensive learning plan using To-Do List. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Engage in an Italian language lesson on Duolingo and create a comprehensive learning plan using To-Do List.\nThe historical actions are: step 1: CLICK: (386, 406)\nstep 2: CLICK: (78, 65)\nstep 3: CLICK: (315, 154)\nstep 4: CLICK: (163, 490)\nstep 5: CLICK: (395, 928)\nstep 6: PRESS_HOME\nstep 7: CLICK: (817, 531)\nstep 8: CLICK: (929, 875)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (677, 588)\nB: TYPE: Italin Learning\nC: TYPE: Deep Sea Exploration\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_110_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_110_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_110_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_110_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_110_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_110_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_110_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_110_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_110_8.png" + ], + 
"output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (920, 767)\nB: CLICK: (239, 90)\nC: CLICK: (508, 892)\nD: CLICK: (163, 832)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (376, 730)\nstep 3: CLICK: (52, 94)\nstep 4: TYPE: 'The Book Thief' by Markus Zusak book review\nstep 5: CLICK: (735, 97)\nstep 6: CLICK: (299, 65)\nstep 7: PRESS_HOME\nstep 8: CLICK: (76, 886)\nI want to Browse Quora to find a book review for 'The Book Thief' by Markus Zusak and then head over to eBay to buy either the ebook or a physical copy. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Browse Quora to find a book review for 'The Book Thief' by Markus Zusak and then head over to eBay to buy either the ebook or a physical copy.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (376, 730)\nstep 3: CLICK: (52, 94)\nstep 4: TYPE: 'The Book Thief' by Markus Zusak book review\nstep 5: CLICK: (735, 97)\nstep 6: CLICK: (299, 65)\nstep 7: PRESS_HOME\nstep 8: CLICK: (76, 886)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (920, 767)\nB: CLICK: (239, 90)\nC: CLICK: (508, 892)\nD: CLICK: (163, 832)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_111_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_111_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_111_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_111_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_111_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_111_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_111_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_111_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_111_8.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: SCROLL: UP\nC: CLICK: (495, 672)\nD: CLICK: (875, 910)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (865, 126)\nstep 2: CLICK: (90, 671)\nstep 3: TYPE: Garmin Fenix 6\nstep 4: CLICK: (836, 94)\nstep 5: SCROLL: LEFT\nstep 6: SCROLL: LEFT\nstep 7: SCROLL: LEFT\nI want to Research the prices for a Garmin Fenix 6 across various shopping platforms, specifically Alibaba.com - B2B marketplace and Flipkart, and make sure to add the most affordable option to your cart. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Research the prices for a Garmin Fenix 6 across various shopping platforms, specifically Alibaba.com - B2B marketplace and Flipkart, and make sure to add the most affordable option to your cart.\nThe historical actions are: step 1: CLICK: (865, 126)\nstep 2: CLICK: (90, 671)\nstep 3: TYPE: Garmin Fenix 6\nstep 4: CLICK: (836, 94)\nstep 5: SCROLL: LEFT\nstep 6: SCROLL: LEFT\nstep 7: SCROLL: LEFT\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: SCROLL: UP\nC: CLICK: (495, 672)\nD: CLICK: (875, 910)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_112_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_112_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_112_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_112_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_112_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_112_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_112_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_112_7.png" + 
], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: how to make egg salad sandwich\nB: CLICK: (100, 485)\nC: COMPLETE\nD: CLICK: (919, 74)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (858, 818)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (877, 890)\nstep 4: PRESS_HOME\nstep 5: CLICK: (374, 806)\nstep 6: PRESS_HOME\nstep 7: CLICK: (858, 818)\nstep 8: CLICK: (802, 74)\nI want to Begin by uninstalling the TikTok app using the Google Play Store. Once done, navigate to Settings to verify that TikTok has been successfully uninstalled. After confirmation, download a similar app such as Triller from the Google Play Store, and then proceed to open the Triller app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Begin by uninstalling the TikTok app using the Google Play Store. Once done, navigate to Settings to verify that TikTok has been successfully uninstalled. 
After confirmation, download a similar app such as Triller from the Google Play Store, and then proceed to open the Triller app.\nThe historical actions are: step 1: CLICK: (858, 818)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (877, 890)\nstep 4: PRESS_HOME\nstep 5: CLICK: (374, 806)\nstep 6: PRESS_HOME\nstep 7: CLICK: (858, 818)\nstep 8: CLICK: (802, 74)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: how to make egg salad sandwich\nB: CLICK: (100, 485)\nC: COMPLETE\nD: CLICK: (919, 74)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_113_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_113_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_113_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_113_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_113_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_113_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_113_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_113_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_113_8.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: SCROLL: UP\nC: CLICK: (875, 379)\nD: CLICK: (264, 873)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (797, 486)\nstep 2: CLICK: (153, 894)\nstep 3: TYPE: Love story\nstep 4: CLICK: (86, 252)\nstep 5: CLICK: (390, 433)\nstep 6: CLICK: (546, 892)\nstep 7: PRESS_HOME\nstep 8: CLICK: (789, 660)\nstep 9: CLICK: (746, 144)\nstep 10: SCROLL: UP\nI want to Using Amazon Music, play the song 'Love Story' and then utilize Yandex Translate to convert the first line of the lyrics into Dutch. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Amazon Music, play the song 'Love Story' and then utilize Yandex Translate to convert the first line of the lyrics into Dutch.\nThe historical actions are: step 1: CLICK: (797, 486)\nstep 2: CLICK: (153, 894)\nstep 3: TYPE: Love story\nstep 4: CLICK: (86, 252)\nstep 5: CLICK: (390, 433)\nstep 6: CLICK: (546, 892)\nstep 7: PRESS_HOME\nstep 8: CLICK: (789, 660)\nstep 9: CLICK: (746, 144)\nstep 10: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: SCROLL: UP\nC: CLICK: (875, 379)\nD: CLICK: (264, 873)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_114_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_114_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_114_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_114_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_114_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_114_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_114_6.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_114_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_114_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_114_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_114_10.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: Razer\nB: CLICK: (174, 731)\nC: CLICK: (637, 110)\nD: TYPE: Duolingo\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: RIGHT\nstep 3: CLICK: (313, 590)\nstep 4: CLICK: (166, 459)\nstep 5: CLICK: (316, 632)\nstep 6: CLICK: (855, 590)\nI want to Using 'ClevCalc - Calculator', compute the sum of 1.5 and 98, then document today's total cost in 'Wallet: Budget Money Manager'. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using 'ClevCalc - Calculator', compute the sum of 1.5 and 98, then document today's total cost in 'Wallet: Budget Money Manager'.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: RIGHT\nstep 3: CLICK: (313, 590)\nstep 4: CLICK: (166, 459)\nstep 5: CLICK: (316, 632)\nstep 6: CLICK: (855, 590)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Razer\nB: CLICK: (174, 731)\nC: CLICK: (637, 110)\nD: TYPE: Duolingo\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_115_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_115_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_115_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_115_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_115_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_115_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_115_6.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (436, 484)\nB: TYPE: caba62244@gmail.com\nC: PRESS_HOME\nD: CLICK: (886, 895)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (331, 912)\nstep 2: TYPE: Hairstyling technique\nstep 3: CLICK: (869, 891)\nstep 4: CLICK: (675, 628)\nstep 5: PRESS_HOME\nI want to Watch a makeup tutorial on YouTube and practice the look while keeping track of the time with the Clock app. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Watch a makeup tutorial on YouTube and practice the look while keeping track of the time with the Clock app.\nThe historical actions are: step 1: CLICK: (331, 912)\nstep 2: TYPE: Hairstyling technique\nstep 3: CLICK: (869, 891)\nstep 4: CLICK: (675, 628)\nstep 5: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (436, 484)\nB: TYPE: caba62244@gmail.com\nC: PRESS_HOME\nD: CLICK: (886, 895)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_116_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_116_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_116_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_116_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_116_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_116_5.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (494, 311)\nB: TYPE: Business\nC: TYPE: yoga for beginners\nD: TYPE: tiktok\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (426, 714)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (123, 697)\nI want to Update your phone's language settings to Danish and then launch the Photos app to verify the change. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Update your phone's language settings to Danish and then launch the Photos app to verify the change.\nThe historical actions are: step 1: CLICK: (426, 714)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (123, 697)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (494, 311)\nB: TYPE: Business\nC: TYPE: yoga for beginners\nD: TYPE: tiktok\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_117_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_117_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_117_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_117_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_117_4.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (605, 885)\nB: CLICK: (555, 585)\nC: CLICK: (524, 578)\nD: CLICK: (938, 885)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (608, 515)\nstep 2: CLICK: (572, 668)\nstep 3: TYPE: hotel\nstep 4: CLICK: (919, 885)\nstep 5: PRESS_HOME\nstep 6: CLICK: (422, 492)\nstep 7: CLICK: (397, 612)\nstep 8: TYPE: Russ Hotel\nstep 9: CLICK: (366, 432)\nstep 10: CLICK: (608, 867)\nI want to Locate a nearby hotel using Waze Navigation & Live Traffic, and then request a ride through Lyft. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate a nearby hotel using Waze Navigation & Live Traffic, and then request a ride through Lyft.\nThe historical actions are: step 1: CLICK: (608, 515)\nstep 2: CLICK: (572, 668)\nstep 3: TYPE: hotel\nstep 4: CLICK: (919, 885)\nstep 5: PRESS_HOME\nstep 6: CLICK: (422, 492)\nstep 7: CLICK: (397, 612)\nstep 8: TYPE: Russ Hotel\nstep 9: CLICK: (366, 432)\nstep 10: CLICK: (608, 867)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (605, 885)\nB: CLICK: (555, 585)\nC: CLICK: (524, 578)\nD: CLICK: (938, 885)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_118_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_118_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_118_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_118_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_118_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_118_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_118_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_118_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_118_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_118_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_118_10.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + 
"options": "A: CLICK: (304, 305)\nB: CLICK: (855, 603)\nC: PRESS_HOME\nD: CLICK: (905, 892)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (130, 142)\nstep 2: CLICK: (813, 164)\nstep 3: TYPE: the best boutique hotel in Los Angeles\nI want to Utilize Duckduckgo to search for the top-rated boutique hotel in your local city, and then use Lyft to navigate to it. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Duckduckgo to search for the top-rated boutique hotel in your local city, and then use Lyft to navigate to it.\nThe historical actions are: step 1: CLICK: (130, 142)\nstep 2: CLICK: (813, 164)\nstep 3: TYPE: the best boutique hotel in Los Angeles\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (304, 305)\nB: CLICK: (855, 603)\nC: PRESS_HOME\nD: CLICK: (905, 892)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_119_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_119_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_119_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_119_3.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: COMPLETE\nB: TYPE: how to make garden salad\nC: TYPE: AirPods Pro 2\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (400, 655)\nstep 3: CLICK: (842, 74)\nstep 4: TYPE: 'The Night Circus' by Erin Morgenstern book review\nstep 5: CLICK: (916, 919)\nstep 6: CLICK: (342, 116)\nstep 7: CLICK: (858, 237)\nstep 8: PRESS_HOME\nstep 9: CLICK: (618, 116)\nstep 10: CLICK: (43, 71)\nstep 11: CLICK: (630, 80)\nstep 12: CLICK: (817, 77)\nstep 13: TYPE: 'The Night Circus' by Erin Morgenstern\nstep 14: CLICK: (939, 919)\nstep 15: CLICK: (210, 259)\nI want to Look up a book review for 'The Night Circus' by Erin Morgenstern online and then buy either the ebook or a physical copy using Ebay or Facebook. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Look up a book review for 'The Night Circus' by Erin Morgenstern online and then buy either the ebook or a physical copy using Ebay or Facebook.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (400, 655)\nstep 3: CLICK: (842, 74)\nstep 4: TYPE: 'The Night Circus' by Erin Morgenstern book review\nstep 5: CLICK: (916, 919)\nstep 6: CLICK: (342, 116)\nstep 7: CLICK: (858, 237)\nstep 8: PRESS_HOME\nstep 9: CLICK: (618, 116)\nstep 10: CLICK: (43, 71)\nstep 11: CLICK: (630, 80)\nstep 12: CLICK: (817, 77)\nstep 13: TYPE: 'The Night Circus' by Erin Morgenstern\nstep 14: CLICK: (939, 919)\nstep 15: CLICK: (210, 259)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: TYPE: how to make garden salad\nC: TYPE: AirPods Pro 2\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_1.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_13.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_14.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_120_15.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: 'Educated' by Tara Westover\nB: CLICK: (463, 651)\nC: CLICK: (26, 971)\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (594, 732)\nstep 2: PRESS_HOME\nstep 3: CLICK: (594, 328)\nstep 4: CLICK: (555, 615)\nstep 5: CLICK: (852, 893)\nstep 6: TYPE: Chocolate chip cookies: 150g salted butter, softened 80g light brown muscovado sugar, 80g granulated sugar, 2 tsp vanilla extract, 1 large egg, 225g plain flour, 1/2 tsp bicarbonate of soda, 1/4 tsp salt, 200g plain chocolate chips or chunks\nstep 7: CLICK: (883, 317)\nstep 8: PRESS_HOME\nstep 9: CLICK: (611, 462)\nstep 10: CLICK: (588, 74)\nstep 11: CLICK: (919, 84)\nstep 12: TYPE: vanilla extract\nstep 13: CLICK: (888, 889)\nI want to Open Firefox Browser and search for a recipe for Chocolate chip cookies. Once you've found a suitable recipe, use Microsoft To Do to create a shopping list of the main ingredients. Finally, add these ingredients to your cart on Amazon. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open Firefox Browser and search for a recipe for Chocolate chip cookies. Once you've found a suitable recipe, use Microsoft To Do to create a shopping list of the main ingredients. 
Finally, add these ingredients to your cart on Amazon.\nThe historical actions are: step 1: CLICK: (594, 732)\nstep 2: PRESS_HOME\nstep 3: CLICK: (594, 328)\nstep 4: CLICK: (555, 615)\nstep 5: CLICK: (852, 893)\nstep 6: TYPE: Chocolate chip cookies: 150g salted butter, softened 80g light brown muscovado sugar, 80g granulated sugar, 2 tsp vanilla extract, 1 large egg, 225g plain flour, 1/2 tsp bicarbonate of soda, 1/4 tsp salt, 200g plain chocolate chips or chunks\nstep 7: CLICK: (883, 317)\nstep 8: PRESS_HOME\nstep 9: CLICK: (611, 462)\nstep 10: CLICK: (588, 74)\nstep 11: CLICK: (919, 84)\nstep 12: TYPE: vanilla extract\nstep 13: CLICK: (888, 889)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: 'Educated' by Tara Westover\nB: CLICK: (463, 651)\nC: CLICK: (26, 971)\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_121_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_121_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_121_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_121_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_121_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_121_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_121_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_121_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_121_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_121_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_121_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_121_11.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_121_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_121_13.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (306, 498)\nB: SCROLL: UP\nC: SCROLL: LEFT\nD: CLICK: (308, 71)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (580, 215)\nstep 2: PRESS_HOME\nstep 3: CLICK: (330, 231)\nstep 4: CLICK: (111, 78)\nstep 5: CLICK: (602, 917)\nI want to Using the Firefox Browser, find a well-known K-Pop band, listen to their most recent album on Pandora, and determine if concert tickets are available for purchase through StubHub. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using the Firefox Browser, find a well-known K-Pop band, listen to their most recent album on Pandora, and determine if concert tickets are available for purchase through StubHub.\nThe historical actions are: step 1: CLICK: (580, 215)\nstep 2: PRESS_HOME\nstep 3: CLICK: (330, 231)\nstep 4: CLICK: (111, 78)\nstep 5: CLICK: (602, 917)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (306, 498)\nB: SCROLL: UP\nC: SCROLL: LEFT\nD: CLICK: (308, 71)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_122_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_122_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_122_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_122_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_122_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_122_5.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: LEFT\nB: CLICK: (504, 927)\nC: CLICK: (305, 219)\nD: COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (872, 117)\nstep 3: CLICK: (280, 56)\nstep 4: TYPE: Nvidia RTX 3080\nstep 5: CLICK: (907, 904)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (143, 102)\nstep 10: CLICK: (240, 61)\nstep 11: TYPE: Nvidia RTX 3080\nstep 12: CLICK: (954, 907)\nstep 13: SCROLL: UP\nstep 14: PRESS_HOME\nstep 15: CLICK: (842, 118)\nI want to Look up the prices for an Nvidia RTX 3080 across different shopping platforms, specifically Amazon and AliExpress, then add the most affordable option to your cart. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Look up the prices for an Nvidia RTX 3080 across different shopping platforms, specifically Amazon and AliExpress, then add the most affordable option to your cart.\nThe historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (872, 117)\nstep 3: CLICK: (280, 56)\nstep 4: TYPE: Nvidia RTX 3080\nstep 5: CLICK: (907, 904)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (143, 102)\nstep 10: CLICK: (240, 61)\nstep 11: TYPE: Nvidia RTX 3080\nstep 12: CLICK: (954, 907)\nstep 13: SCROLL: UP\nstep 14: PRESS_HOME\nstep 15: CLICK: (842, 118)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: LEFT\nB: CLICK: (504, 927)\nC: CLICK: (305, 219)\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_9.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_13.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_14.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_123_15.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: 2012 Nobel-Prize winners in physics\nB: CLICK: (876, 895)\nC: CLICK: (593, 514)\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (140, 626)\nstep 2: CLICK: (400, 288)\nstep 3: TYPE: hiking trail\nstep 4: CLICK: (916, 904)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (581, 604)\nstep 9: SCROLL: UP\nstep 10: PRESS_HOME\nI want to Use Chrome to find a new hiking trail, check the weekend weather forecast using Weather & Radar, and then invite katsunaksu to join the adventure through Tumblr. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Chrome to find a new hiking trail, check the weekend weather forecast using Weather & Radar, and then invite katsunaksu to join the adventure through Tumblr.\nThe historical actions are: step 1: CLICK: (140, 626)\nstep 2: CLICK: (400, 288)\nstep 3: TYPE: hiking trail\nstep 4: CLICK: (916, 904)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (581, 604)\nstep 9: SCROLL: UP\nstep 10: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: 2012 Nobel-Prize winners in physics\nB: CLICK: (876, 895)\nC: CLICK: (593, 514)\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_124_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_124_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_124_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_124_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_124_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_124_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_124_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_124_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_124_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_124_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_124_10.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (913, 74)\nB: CLICK: (555, 475)\nC: SCROLL: UP\nD: TYPE: business\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (858, 505)\nstep 2: CLICK: (524, 563)\nstep 3: CLICK: (64, 54)\nI want to Locate a recent business-related news story using Google News and then add this event to your Calendar. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate a recent business-related news story using Google News and then add this event to your Calendar.\nThe historical actions are: step 1: CLICK: (858, 505)\nstep 2: CLICK: (524, 563)\nstep 3: CLICK: (64, 54)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (913, 74)\nB: CLICK: (555, 475)\nC: SCROLL: UP\nD: TYPE: business\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_125_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_125_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_125_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_125_3.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: how to train strength\nB: CLICK: (284, 307)\nC: PRESS_HOME\nD: CLICK: (408, 867)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (573, 244)\nstep 2: CLICK: (950, 909)\nstep 3: TYPE: strength training in the morning\nstep 4: CLICK: (974, 843)\nstep 5: PRESS_HOME\nstep 6: CLICK: (576, 903)\nstep 7: CLICK: (973, 44)\nI want to Search for a beginner-friendly strength training workout video on YouTube and create a reminder in Microsoft To Do to perform it tomorrow morning. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Search for a beginner-friendly strength training workout video on YouTube and create a reminder in Microsoft To Do to perform it tomorrow morning.\nThe historical actions are: step 1: CLICK: (573, 244)\nstep 2: CLICK: (950, 909)\nstep 3: TYPE: strength training in the morning\nstep 4: CLICK: (974, 843)\nstep 5: PRESS_HOME\nstep 6: CLICK: (576, 903)\nstep 7: CLICK: (973, 44)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: how to train strength\nB: CLICK: (284, 307)\nC: PRESS_HOME\nD: CLICK: (408, 867)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_126_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_126_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_126_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_126_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_126_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_126_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_126_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_126_7.png" + ], + "output": "A" + }, + 
{ + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: COMPLETE\nB: TYPE: football\nC: CLICK: (421, 593)\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (419, 654)\nstep 2: TYPE: family movies\nstep 3: CLICK: (913, 924)\nstep 4: CLICK: (170, 434)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nI want to Coordinate a family movie night by selecting a movie suitable for all ages on DuckDuckgo, adding snacks to your cart on Amazon, and sending invitations to Tzhau Jau via Instagram. Don't forget to set a reminder on your Clock app to ensure everything goes smoothly. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Coordinate a family movie night by selecting a movie suitable for all ages on DuckDuckgo, adding snacks to your cart on Amazon, and sending invitations to Tzhau Jau via Instagram. 
Don't forget to set a reminder on your Clock app to ensure everything goes smoothly.\nThe historical actions are: step 1: CLICK: (419, 654)\nstep 2: TYPE: family movies\nstep 3: CLICK: (913, 924)\nstep 4: CLICK: (170, 434)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: TYPE: football\nC: CLICK: (421, 593)\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_127_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_127_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_127_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_127_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_127_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_127_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_127_6.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: Cape Town,tomorrowLcloudy todolist: buy a flight to Cape Town\nB: SCROLL: UP\nC: CLICK: (411, 548)\nD: CLICK: (277, 296)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (561, 918)\nstep 2: CLICK: (523, 79)\nstep 3: CLICK: (957, 91)\nstep 4: TYPE: Cape Town weather tomorrow\nstep 5: CLICK: (883, 884)\nstep 6: PRESS_HOME\nstep 7: CLICK: (71, 331)\nstep 8: CLICK: (124, 398)\nI want to Use Daily Forecast to check the weather prediction for Cape Town for the upcoming day. Based on the forecast, create and organize a to-do list using WPS Office. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Daily Forecast to check the weather prediction for Cape Town for the upcoming day. Based on the forecast, create and organize a to-do list using WPS Office.\nThe historical actions are: step 1: CLICK: (561, 918)\nstep 2: CLICK: (523, 79)\nstep 3: CLICK: (957, 91)\nstep 4: TYPE: Cape Town weather tomorrow\nstep 5: CLICK: (883, 884)\nstep 6: PRESS_HOME\nstep 7: CLICK: (71, 331)\nstep 8: CLICK: (124, 398)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Cape Town,tomorrowLcloudy todolist: buy a flight to Cape Town\nB: SCROLL: UP\nC: CLICK: (411, 548)\nD: CLICK: (277, 296)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_128_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_128_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_128_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_128_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_128_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_128_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_128_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_128_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_128_8.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: CLICK: (916, 815)\nC: COMPLETE\nD: CLICK: (340, 328)\n", + "question": "The last 
image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (884, 614)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\nstep 5: CLICK: (443, 644)\nstep 6: CLICK: (323, 408)\nstep 7: CLICK: (316, 475)\nstep 8: CLICK: (297, 674)\nstep 9: CLICK: (938, 87)\nstep 10: TYPE: Finnish\nstep 11: CLICK: (131, 193)\nstep 12: SCROLL: UP\nstep 13: CLICK: (837, 663)\nstep 14: PRESS_HOME\nstep 15: CLICK: (495, 607)\nI want to Switch the language on your phone to Finnish, then open the Settings app to verify the change. Additionally, open the Contacts app to ensure the language update is reflected there as well. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Switch the language on your phone to Finnish, then open the Settings app to verify the change. 
Additionally, open the Contacts app to ensure the language update is reflected there as well.\nThe historical actions are: step 1: CLICK: (884, 614)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\nstep 5: CLICK: (443, 644)\nstep 6: CLICK: (323, 408)\nstep 7: CLICK: (316, 475)\nstep 8: CLICK: (297, 674)\nstep 9: CLICK: (938, 87)\nstep 10: TYPE: Finnish\nstep 11: CLICK: (131, 193)\nstep 12: SCROLL: UP\nstep 13: CLICK: (837, 663)\nstep 14: PRESS_HOME\nstep 15: CLICK: (495, 607)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (916, 815)\nC: COMPLETE\nD: CLICK: (340, 328)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_13.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_14.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_129_15.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: digital marketing class\nB: SCROLL: UP\nC: CLICK: (441, 645)\nD: CLICK: (206, 427)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (493, 94)\nstep 3: CLICK: (574, 91)\nstep 4: TYPE: KitchenAid Artisan Stand Mixer\nstep 5: CLICK: (872, 860)\nstep 6: SCROLL: UP\nI want to Investigate the prices of a KitchenAid Artisan Stand Mixer across the shopping apps Target and Amazon, then place the most affordable option into your cart. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Investigate the prices of a KitchenAid Artisan Stand Mixer across the shopping apps Target and Amazon, then place the most affordable option into your cart.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (493, 94)\nstep 3: CLICK: (574, 91)\nstep 4: TYPE: KitchenAid Artisan Stand Mixer\nstep 5: CLICK: (872, 860)\nstep 6: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: digital marketing class\nB: SCROLL: UP\nC: CLICK: (441, 645)\nD: CLICK: (206, 427)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_130_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_130_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_130_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_130_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_130_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_130_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_130_6.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: CLICK: (957, 91)\nC: CLICK: (627, 943)\nD: CLICK: (770, 897)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (797, 482)\nstep 2: CLICK: (148, 892)\nstep 3: TYPE: Love story\nstep 4: CLICK: (105, 248)\nstep 5: CLICK: (384, 438)\nstep 6: CLICK: (565, 897)\nstep 7: PRESS_HOME\nstep 8: CLICK: (558, 667)\nstep 9: CLICK: (391, 683)\nstep 10: CLICK: (657, 909)\nI want to Use Amazon Music to listen to the song 'Love Story' and then open Microsoft Translator to translate the first line of the lyrics into Danish. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Amazon Music to listen to the song 'Love Story' and then open Microsoft Translator to translate the first line of the lyrics into Danish.\nThe historical actions are: step 1: CLICK: (797, 482)\nstep 2: CLICK: (148, 892)\nstep 3: TYPE: Love story\nstep 4: CLICK: (105, 248)\nstep 5: CLICK: (384, 438)\nstep 6: CLICK: (565, 897)\nstep 7: PRESS_HOME\nstep 8: CLICK: (558, 667)\nstep 9: CLICK: (391, 683)\nstep 10: CLICK: (657, 909)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (957, 91)\nC: CLICK: (627, 943)\nD: CLICK: (770, 897)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_131_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_131_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_131_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_131_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_131_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_131_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_131_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_131_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_131_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_131_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_131_10.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI 
image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (274, 169)\nB: CLICK: (50, 970)\nC: CLICK: (744, 488)\nD: CLICK: (815, 73)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (585, 837)\nstep 2: PRESS_HOME\nstep 3: CLICK: (835, 842)\nI want to Open YouTube to watch a video that recommends various fitness tracking apps, and then proceed to download one of the suggested apps from the Google Play Store. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open YouTube to watch a video that recommends various fitness tracking apps, and then proceed to download one of the suggested apps from the Google Play Store.\nThe historical actions are: step 1: CLICK: (585, 837)\nstep 2: PRESS_HOME\nstep 3: CLICK: (835, 842)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (274, 169)\nB: CLICK: (50, 970)\nC: CLICK: (744, 488)\nD: CLICK: (815, 73)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_132_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_132_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_132_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_132_3.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (773, 544)\nB: CLICK: (819, 144)\nC: LONG_PRESS: (307, 479)\nD: TYPE: 3D Printer Course for Beginners \n", + "question": "The last image represents the current screenshot and the preceding 
images are historical screenshots. The historical actions are: step 1: CLICK: (393, 666)\nstep 2: CLICK: (417, 98)\nstep 3: TYPE: Farfetch\nstep 4: CLICK: (901, 893)\nstep 5: CLICK: (483, 331)\nstep 6: CLICK: (352, 275)\nI want to Remove the Farfetch app using Google Play Store, then navigate to Settings to verify if the app resources are still present. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Remove the Farfetch app using Google Play Store, then navigate to Settings to verify if the app resources are still present.\nThe historical actions are: step 1: CLICK: (393, 666)\nstep 2: CLICK: (417, 98)\nstep 3: TYPE: Farfetch\nstep 4: CLICK: (901, 893)\nstep 5: CLICK: (483, 331)\nstep 6: CLICK: (352, 275)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (773, 544)\nB: CLICK: (819, 144)\nC: LONG_PRESS: (307, 479)\nD: TYPE: 3D Printer Course for Beginners \n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_133_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_133_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_133_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_133_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_133_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_133_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_133_6.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: CLICK: (605, 
885)\nC: LONG_PRESS: (106, 287)\nD: COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (637, 812)\nstep 2: CLICK: (797, 180)\nstep 3: CLICK: (508, 584)\nstep 4: CLICK: (912, 703)\nI want to Using Chrome, search for the 2020 Nobel-Prize winners in physics. Once you have the information, record it in Google Docs. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Chrome, search for the 2020 Nobel-Prize winners in physics. Once you have the information, record it in Google Docs.\nThe historical actions are: step 1: CLICK: (637, 812)\nstep 2: CLICK: (797, 180)\nstep 3: CLICK: (508, 584)\nstep 4: CLICK: (912, 703)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (605, 885)\nC: LONG_PRESS: (106, 287)\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_134_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_134_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_134_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_134_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_134_4.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (413, 83)\nB: PRESS_HOME\nC: CLICK: (184, 249)\nD: CLICK: (143, 706)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (656, 813)\nstep 2: CLICK: (486, 55)\nstep 3: TYPE: properties of rectangle\nstep 4: CLICK: (486, 114)\nstep 5: LONG_PRESS: (55, 533)\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (155, 485)\nstep 8: PRESS_HOME\nI want to Use Chrome to search for information about the properties of a Rectangle, and then use Google Docs to create a brief document summarizing your findings. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Chrome to search for information about the properties of a Rectangle, and then use Google Docs to create a brief document summarizing your findings.\nThe historical actions are: step 1: CLICK: (656, 813)\nstep 2: CLICK: (486, 55)\nstep 3: TYPE: properties of rectangle\nstep 4: CLICK: (486, 114)\nstep 5: LONG_PRESS: (55, 533)\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (155, 485)\nstep 8: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (413, 83)\nB: PRESS_HOME\nC: CLICK: (184, 249)\nD: CLICK: (143, 706)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_135_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_135_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_135_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_135_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_135_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_135_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_135_6.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_135_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_135_8.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (607, 941)\nB: COMPLETE\nC: CLICK: (914, 29)\nD: CLICK: (467, 833)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (109, 509)\nstep 2: CLICK: (483, 925)\nstep 3: CLICK: (369, 261)\nstep 4: TYPE: chessboard\nstep 5: CLICK: (118, 971)\nstep 6: CLICK: (435, 843)\nstep 7: CLICK: (861, 837)\nstep 8: SCROLL: UP\nstep 9: CLICK: (299, 772)\nstep 10: CLICK: (194, 472)\nstep 11: CLICK: (914, 23)\nI want to Use the GenZArt:Fast AI Art Generator app to create an image with a chessboard theme, and then share it on Instagram with moments. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use the GenZArt:Fast AI Art Generator app to create an image with a chessboard theme, and then share it on Instagram with moments.\nThe historical actions are: step 1: CLICK: (109, 509)\nstep 2: CLICK: (483, 925)\nstep 3: CLICK: (369, 261)\nstep 4: TYPE: chessboard\nstep 5: CLICK: (118, 971)\nstep 6: CLICK: (435, 843)\nstep 7: CLICK: (861, 837)\nstep 8: SCROLL: UP\nstep 9: CLICK: (299, 772)\nstep 10: CLICK: (194, 472)\nstep 11: CLICK: (914, 23)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (607, 941)\nB: COMPLETE\nC: CLICK: (914, 29)\nD: CLICK: (467, 833)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_136_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_136_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_136_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_136_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_136_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_136_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_136_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_136_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_136_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_136_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_136_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_136_11.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: CLICK: (921, 817)\nC: PRESS_HOME\nD: TYPE: snacks\n", + "question": "The last image represents the current screenshot and 
the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (415, 365)\nstep 2: CLICK: (202, 72)\nstep 3: CLICK: (751, 550)\nstep 4: CLICK: (302, 292)\nstep 5: PRESS_HOME\nstep 6: CLICK: (605, 523)\nI want to Engage in an Arabic language lesson and use Todoist to set a structured learning plan, while practicing with Rosetta Stone: Learn, Practice. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Engage in an Arabic language lesson and use Todoist to set a structured learning plan, while practicing with Rosetta Stone: Learn, Practice.\nThe historical actions are: step 1: CLICK: (415, 365)\nstep 2: CLICK: (202, 72)\nstep 3: CLICK: (751, 550)\nstep 4: CLICK: (302, 292)\nstep 5: PRESS_HOME\nstep 6: CLICK: (605, 523)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (921, 817)\nC: PRESS_HOME\nD: TYPE: snacks\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_137_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_137_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_137_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_137_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_137_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_137_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_137_6.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (347, 347)\nB: CLICK: (311, 
828)\nC: COMPLETE\nD: PRESS_HOME\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (432, 721)\nstep 2: CLICK: (148, 940)\nstep 3: CLICK: (955, 781)\nstep 4: PRESS_HOME\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (442, 108)\nI want to Switch to dark mode in the Settings app, then open the Amazon Kindle reading app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Switch to dark mode in the Settings app, then open the Amazon Kindle reading app.\nThe historical actions are: step 1: CLICK: (432, 721)\nstep 2: CLICK: (148, 940)\nstep 3: CLICK: (955, 781)\nstep 4: PRESS_HOME\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (442, 108)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (347, 347)\nB: CLICK: (311, 828)\nC: COMPLETE\nD: PRESS_HOME\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_138_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_138_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_138_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_138_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_138_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_138_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_138_6.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (424, 514)\nB: PRESS_HOME\nC: COMPLETE\nD: CLICK: 
(432, 93)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (639, 643)\nstep 2: SCROLL: UP\nstep 3: CLICK: (341, 714)\nstep 4: CLICK: (906, 717)\nstep 5: PRESS_HOME\nstep 6: CLICK: (136, 644)\nI want to Switch to dark mode in Setting and then launch the 'Ploter - Ebook, Audiobook, PDF' app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Switch to dark mode in Setting and then launch the 'Ploter - Ebook, Audiobook, PDF' app.\nThe historical actions are: step 1: CLICK: (639, 643)\nstep 2: SCROLL: UP\nstep 3: CLICK: (341, 714)\nstep 4: CLICK: (906, 717)\nstep 5: PRESS_HOME\nstep 6: CLICK: (136, 644)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (424, 514)\nB: PRESS_HOME\nC: COMPLETE\nD: CLICK: (432, 93)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_139_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_139_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_139_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_139_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_139_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_139_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_139_6.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: fitness tracking apps\nB: TYPE: Meeting\nC: CLICK: (42, 69)\nD: 
COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (834, 504)\nstep 3: CLICK: (447, 693)\nstep 4: TYPE: football field\nstep 5: CLICK: (571, 208)\nstep 6: PRESS_RECENT\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (524, 529)\nstep 9: CLICK: (357, 67)\nI want to Start by utilizing Petal Maps - GPS & Navigation to locate a nearby Football field. Once found, proceed to the Google Play Store to download a fitness tracking app and set your fitness goals. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Start by utilizing Petal Maps - GPS & Navigation to locate a nearby Football field. Once found, proceed to the Google Play Store to download a fitness tracking app and set your fitness goals.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (834, 504)\nstep 3: CLICK: (447, 693)\nstep 4: TYPE: football field\nstep 5: CLICK: (571, 208)\nstep 6: PRESS_RECENT\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (524, 529)\nstep 9: CLICK: (357, 67)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: fitness tracking apps\nB: TYPE: Meeting\nC: CLICK: (42, 69)\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_140_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_140_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_140_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_140_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_140_4.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_140_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_140_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_140_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_140_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_140_9.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (454, 354)\nB: SCROLL: LEFT\nC: CLICK: (288, 823)\nD: CLICK: (884, 885)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (572, 922)\nstep 3: CLICK: (459, 878)\nstep 4: CLICK: (146, 90)\nstep 5: TYPE: modern\nstep 6: CLICK: (880, 887)\nstep 7: CLICK: (514, 495)\nstep 8: CLICK: (460, 96)\nstep 9: CLICK: (217, 686)\nstep 10: PRESS_HOME\nstep 11: CLICK: (454, 508)\nstep 12: CLICK: (552, 391)\nstep 13: CLICK: (465, 271)\nI want to Use Pinterest to find a modern-style picture and set it as your phone's wallpaper through the Settings app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Pinterest to find a modern-style picture and set it as your phone's wallpaper through the Settings app.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (572, 922)\nstep 3: CLICK: (459, 878)\nstep 4: CLICK: (146, 90)\nstep 5: TYPE: modern\nstep 6: CLICK: (880, 887)\nstep 7: CLICK: (514, 495)\nstep 8: CLICK: (460, 96)\nstep 9: CLICK: (217, 686)\nstep 10: PRESS_HOME\nstep 11: CLICK: (454, 508)\nstep 12: CLICK: (552, 391)\nstep 13: CLICK: (465, 271)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (454, 354)\nB: SCROLL: LEFT\nC: CLICK: (288, 823)\nD: CLICK: (884, 885)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_141_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_141_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_141_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_141_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_141_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_141_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_141_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_141_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_141_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_141_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_141_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_141_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_141_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_141_13.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + 
"visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: Hospital Helipad\nB: CLICK: (526, 586)\nC: CLICK: (738, 62)\nD: PRESS_HOME\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (244, 749)\nstep 2: CLICK: (931, 655)\nstep 3: CLICK: (73, 796)\nstep 4: TYPE: Shanghai, China itinerary\nstep 5: CLICK: (744, 63)\nstep 6: SCROLL: LEFT\nstep 7: SCROLL: LEFT\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: SCROLL: LEFT\nstep 11: SCROLL: LEFT\nI want to Using the X app and Tripadvisor, find an itinerary for visiting Shanghai, China and book your accommodations. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using the X app and Tripadvisor, find an itinerary for visiting Shanghai, China and book your accommodations.\nThe historical actions are: step 1: CLICK: (244, 749)\nstep 2: CLICK: (931, 655)\nstep 3: CLICK: (73, 796)\nstep 4: TYPE: Shanghai, China itinerary\nstep 5: CLICK: (744, 63)\nstep 6: SCROLL: LEFT\nstep 7: SCROLL: LEFT\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: SCROLL: LEFT\nstep 11: SCROLL: LEFT\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Hospital Helipad\nB: CLICK: (526, 586)\nC: CLICK: (738, 62)\nD: PRESS_HOME\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_142_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_142_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_142_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_142_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_142_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_142_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_142_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_142_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_142_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_142_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_142_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_142_11.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: PURDUE:UCONN is 60:75\nB: CLICK: (97, 308)\nC: CLICK: (212, 494)\nD: CLICK: (441, 645)\n", + "question": "The last image represents the current screenshot 
and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (55, 660)\nstep 2: CLICK: (408, 80)\nstep 3: TYPE: Meesho\nstep 4: CLICK: (902, 873)\nstep 5: CLICK: (604, 438)\nstep 6: CLICK: (680, 514)\nstep 7: PRESS_HOME\nI want to First, go to the Google Play Store and uninstall the Meesho app. Afterward, navigate to the Settings app to verify whether the Meesho app is still listed in the app resources. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, go to the Google Play Store and uninstall the Meesho app. Afterward, navigate to the Settings app to verify whether the Meesho app is still listed in the app resources.\nThe historical actions are: step 1: CLICK: (55, 660)\nstep 2: CLICK: (408, 80)\nstep 3: TYPE: Meesho\nstep 4: CLICK: (902, 873)\nstep 5: CLICK: (604, 438)\nstep 6: CLICK: (680, 514)\nstep 7: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: PURDUE:UCONN is 60:75\nB: CLICK: (97, 308)\nC: CLICK: (212, 494)\nD: CLICK: (441, 645)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_143_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_143_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_143_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_143_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_143_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_143_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_143_6.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_143_7.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: AnyList\nB: CLICK: (807, 552)\nC: CLICK: (910, 691)\nD: TYPE: the best ice cream parlor in Los Angeles\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (122, 154)\nstep 2: CLICK: (836, 156)\nstep 3: CLICK: (424, 249)\nstep 4: CLICK: (813, 164)\nI want to Use Duckduckgo to identify the top ice cream parlor in your local city, then utilize GPS to navigate to it. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Duckduckgo to identify the top ice cream parlor in your local city, then utilize GPS to navigate to it.\nThe historical actions are: step 1: CLICK: (122, 154)\nstep 2: CLICK: (836, 156)\nstep 3: CLICK: (424, 249)\nstep 4: CLICK: (813, 164)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: AnyList\nB: CLICK: (807, 552)\nC: CLICK: (910, 691)\nD: TYPE: the best ice cream parlor in Los Angeles\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_144_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_144_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_144_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_144_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_144_4.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: CLICK: (451, 447)\nC: CLICK: (869, 890)\nD: CLICK: (634, 640)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (702, 143)\nstep 2: CLICK: (155, 287)\nstep 3: TYPE: Healthy lunch plan\nstep 4: CLICK: (577, 449)\nstep 5: SCROLL: UP\nstep 6: CLICK: (605, 771)\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (524, 262)\nI want to Pick a nutritious lunch option for tomorrow, jot it down in Simplenote, and then watch a Tiktok video on how to prepare one of the dishes. You can also check Quora for additional information or tips. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Pick a nutritious lunch option for tomorrow, jot it down in Simplenote, and then watch a Tiktok video on how to prepare one of the dishes. You can also check Quora for additional information or tips.\nThe historical actions are: step 1: CLICK: (702, 143)\nstep 2: CLICK: (155, 287)\nstep 3: TYPE: Healthy lunch plan\nstep 4: CLICK: (577, 449)\nstep 5: SCROLL: UP\nstep 6: CLICK: (605, 771)\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (524, 262)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (451, 447)\nC: CLICK: (869, 890)\nD: CLICK: (634, 640)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_145_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_145_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_145_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_145_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_145_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_145_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_145_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_145_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_145_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_145_9.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: Club 
Factory Shopping India\nB: TYPE: learn to grow herbs indoors \nC: PRESS_HOME\nD: CLICK: (363, 151)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (836, 82)\nstep 2: CLICK: (58, 73)\nstep 3: CLICK: (258, 424)\nI want to Utilize ABPV to find a coastal style picture and then set it as the wallpaper on your phone using the Setting app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize ABPV to find a coastal style picture and then set it as the wallpaper on your phone using the Setting app.\nThe historical actions are: step 1: CLICK: (836, 82)\nstep 2: CLICK: (58, 73)\nstep 3: CLICK: (258, 424)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Club Factory Shopping India\nB: TYPE: learn to grow herbs indoors \nC: PRESS_HOME\nD: CLICK: (363, 151)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_146_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_146_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_146_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_146_3.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (199, 934)\nB: CLICK: (179, 909)\nC: CLICK: (892, 899)\nD: TYPE: do yoga in the morning\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (634, 813)\nstep 2: PRESS_HOME\nstep 3: CLICK: (861, 803)\nstep 4: CLICK: (810, 53)\nstep 5: TYPE: Fitbod\nI want to Watch a video on YouTube about fitness tracking app recommendations and then head over to the Google Play Store to download one of them. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Watch a video on YouTube about fitness tracking app recommendations and then head over to the Google Play Store to download one of them.\nThe historical actions are: step 1: CLICK: (634, 813)\nstep 2: PRESS_HOME\nstep 3: CLICK: (861, 803)\nstep 4: CLICK: (810, 53)\nstep 5: TYPE: Fitbod\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (199, 934)\nB: CLICK: (179, 909)\nC: CLICK: (892, 899)\nD: TYPE: do yoga in the morning\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_147_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_147_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_147_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_147_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_147_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_147_5.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (899, 320)\nB: TYPE: 5000\nC: CLICK: (885, 900)\nD: CLICK: (362, 810)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (855, 813)\nstep 2: CLICK: (783, 215)\nstep 3: PRESS_HOME\nI want to First, head over to the 'Google Play Store' and install the 'Tiktok' app. Once the installation is complete, open the 'Tiktok' app. Next, navigate to the 'Setting' and turn off the notifications for 'Tiktok'. Finally, reopen the 'Tiktok' app to enjoy watching a video. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, head over to the 'Google Play Store' and install the 'Tiktok' app. Once the installation is complete, open the 'Tiktok' app. Next, navigate to the 'Setting' and turn off the notifications for 'Tiktok'. Finally, reopen the 'Tiktok' app to enjoy watching a video.\nThe historical actions are: step 1: CLICK: (855, 813)\nstep 2: CLICK: (783, 215)\nstep 3: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (899, 320)\nB: TYPE: 5000\nC: CLICK: (885, 900)\nD: CLICK: (362, 810)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_148_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_148_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_148_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_148_3.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: CLICK: (337, 525)\nC: CLICK: (919, 58)\nD: CLICK: (347, 936)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (309, 214)\nstep 2: CLICK: (520, 230)\nstep 3: TYPE: The Music of the Night\nstep 4: CLICK: (323, 478)\nstep 5: CLICK: (383, 817)\nI want to Open Spotify and listen to the song 'The Music of the Night.' After listening, use Microsoft Translator to translate the first line of the lyrics into Dutch. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open Spotify and listen to the song 'The Music of the Night.' After listening, use Microsoft Translator to translate the first line of the lyrics into Dutch.\nThe historical actions are: step 1: CLICK: (309, 214)\nstep 2: CLICK: (520, 230)\nstep 3: TYPE: The Music of the Night\nstep 4: CLICK: (323, 478)\nstep 5: CLICK: (383, 817)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (337, 525)\nC: CLICK: (919, 58)\nD: CLICK: (347, 936)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_149_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_149_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_149_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_149_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_149_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_149_5.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (909, 83)\nB: COMPLETE\nC: CLICK: (402, 879)\nD: CLICK: (897, 146)\n", + "question": "The last image represents the current screenshot and 
the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (146, 718)\nstep 2: CLICK: (287, 252)\nstep 3: CLICK: (982, 173)\nstep 4: TYPE: esport score\nstep 5: CLICK: (887, 687)\nstep 6: CLICK: (313, 509)\nstep 7: CLICK: (582, 478)\nstep 8: PRESS_HOME\nstep 9: CLICK: (148, 107)\nstep 10: CLICK: (562, 929)\nstep 11: CLICK: (523, 122)\nstep 12: CLICK: (362, 203)\nI want to Using Chrome, find the score of a recent eSports competition and then share the result on Tumblr. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Chrome, find the score of a recent eSports competition and then share the result on Tumblr.\nThe historical actions are: step 1: CLICK: (146, 718)\nstep 2: CLICK: (287, 252)\nstep 3: CLICK: (982, 173)\nstep 4: TYPE: esport score\nstep 5: CLICK: (887, 687)\nstep 6: CLICK: (313, 509)\nstep 7: CLICK: (582, 478)\nstep 8: PRESS_HOME\nstep 9: CLICK: (148, 107)\nstep 10: CLICK: (562, 929)\nstep 11: CLICK: (523, 122)\nstep 12: CLICK: (362, 203)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (909, 83)\nB: COMPLETE\nC: CLICK: (402, 879)\nD: CLICK: (897, 146)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_150_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_150_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_150_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_150_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_150_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_150_5.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_150_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_150_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_150_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_150_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_150_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_150_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_150_12.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: CLICK: (865, 937)\nC: CLICK: (154, 387)\nD: PRESS_HOME\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (390, 245)\nstep 2: CLICK: (186, 73)\nstep 3: TYPE: yoga class\nstep 4: CLICK: (924, 908)\nstep 5: CLICK: (252, 509)\nstep 6: PRESS_HOME\nstep 7: CLICK: (843, 104)\nstep 8: CLICK: (283, 73)\nstep 9: TYPE: yoga mat\nstep 10: CLICK: (934, 911)\nstep 11: CLICK: (927, 583)\nstep 12: CLICK: (477, 935)\nstep 13: PRESS_HOME\nstep 14: CLICK: (367, 520)\nI want to Search for a Yoga class using DuckDuckGo, purchase the necessary items for the class on SHEIN, and then set a reminder in your Calendar to study. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Search for a Yoga class using DuckDuckGo, purchase the necessary items for the class on SHEIN, and then set a reminder in your Calendar to study.\nThe historical actions are: step 1: CLICK: (390, 245)\nstep 2: CLICK: (186, 73)\nstep 3: TYPE: yoga class\nstep 4: CLICK: (924, 908)\nstep 5: CLICK: (252, 509)\nstep 6: PRESS_HOME\nstep 7: CLICK: (843, 104)\nstep 8: CLICK: (283, 73)\nstep 9: TYPE: yoga mat\nstep 10: CLICK: (934, 911)\nstep 11: CLICK: (927, 583)\nstep 12: CLICK: (477, 935)\nstep 13: PRESS_HOME\nstep 14: CLICK: (367, 520)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (865, 937)\nC: CLICK: (154, 387)\nD: PRESS_HOME\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_151_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_151_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_151_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_151_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_151_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_151_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_151_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_151_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_151_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_151_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_151_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_151_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_151_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_151_13.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_151_14.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (161, 332)\nB: CLICK: (451, 194)\nC: SCROLL: UP\nD: PRESS_HOME\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (830, 204)\nstep 2: CLICK: (297, 85)\nstep 3: TYPE: Microsoft's stock market news\nstep 4: CLICK: (912, 882)\nstep 5: CLICK: (798, 673)\nI want to Launch Firefox to search for today's stock market news regarding Microsoft. Then, open the TradingView: Track All Markets app to check the stock price trends. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Launch Firefox to search for today's stock market news regarding Microsoft. 
Then, open the TradingView: Track All Markets app to check the stock price trends.\nThe historical actions are: step 1: CLICK: (830, 204)\nstep 2: CLICK: (297, 85)\nstep 3: TYPE: Microsoft's stock market news\nstep 4: CLICK: (912, 882)\nstep 5: CLICK: (798, 673)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (161, 332)\nB: CLICK: (451, 194)\nC: SCROLL: UP\nD: PRESS_HOME\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_152_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_152_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_152_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_152_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_152_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_152_5.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: DOWN\nB: CLICK: (362, 812)\nC: TYPE: Disco\nD: CLICK: (116, 493)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (362, 383)\nstep 2: CLICK: (67, 76)\nstep 3: CLICK: (348, 152)\nstep 4: CLICK: (217, 436)\nstep 5: CLICK: (313, 941)\nstep 6: PRESS_HOME\nstep 7: CLICK: (165, 527)\nstep 8: CLICK: (884, 877)\nstep 9: TYPE: German Learning\nstep 10: CLICK: (174, 594)\nstep 11: TYPE: 10 mins per day\nstep 12: CLICK: (938, 645)\nI want to Engage in a German language lesson using Duolingo and organize a learning plan with the help of TickTick. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Engage in a German language lesson using Duolingo and organize a learning plan with the help of TickTick.\nThe historical actions are: step 1: CLICK: (362, 383)\nstep 2: CLICK: (67, 76)\nstep 3: CLICK: (348, 152)\nstep 4: CLICK: (217, 436)\nstep 5: CLICK: (313, 941)\nstep 6: PRESS_HOME\nstep 7: CLICK: (165, 527)\nstep 8: CLICK: (884, 877)\nstep 9: TYPE: German Learning\nstep 10: CLICK: (174, 594)\nstep 11: TYPE: 10 mins per day\nstep 12: CLICK: (938, 645)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: DOWN\nB: CLICK: (362, 812)\nC: TYPE: Disco\nD: CLICK: (116, 493)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_153_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_153_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_153_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_153_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_153_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_153_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_153_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_153_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_153_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_153_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_153_10.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_153_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_153_12.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: paintbrush\nB: CLICK: (85, 385)\nC: CLICK: (520, 524)\nD: CLICK: (319, 733)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (842, 836)\nstep 2: PRESS_HOME\nstep 3: CLICK: (127, 267)\nstep 4: CLICK: (880, 912)\nstep 5: TYPE: Chocolate chip cookies: 150g salted butter, softened 80g light brown muscovado sugar, 80g granulated sugar, 2 tsp vanilla extract, 1 large egg, 225g plain flour, 1/2 tsp bicarbonate of soda, 1/4 tsp salt, 200g plain chocolate chips or chunks\nstep 6: PRESS_HOME\nstep 7: CLICK: (159, 519)\nstep 8: CLICK: (47, 74)\nI want to Search for a Chocolate chip cookie recipe using Opera, compile a shopping list of the main ingredients in Google Keep, and then add these items to your cart on Ebay. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Search for a Chocolate chip cookie recipe using Opera, compile a shopping list of the main ingredients in Google Keep, and then add these items to your cart on Ebay.\nThe historical actions are: step 1: CLICK: (842, 836)\nstep 2: PRESS_HOME\nstep 3: CLICK: (127, 267)\nstep 4: CLICK: (880, 912)\nstep 5: TYPE: Chocolate chip cookies: 150g salted butter, softened 80g light brown muscovado sugar, 80g granulated sugar, 2 tsp vanilla extract, 1 large egg, 225g plain flour, 1/2 tsp bicarbonate of soda, 1/4 tsp salt, 200g plain chocolate chips or chunks\nstep 6: PRESS_HOME\nstep 7: CLICK: (159, 519)\nstep 8: CLICK: (47, 74)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: paintbrush\nB: CLICK: (85, 385)\nC: CLICK: (520, 524)\nD: CLICK: (319, 733)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_154_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_154_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_154_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_154_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_154_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_154_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_154_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_154_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_154_8.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: Chocolate chip cookies: 150g salted butter, softened 80g light brown muscovado sugar, 80g granulated sugar, 2 tsp vanilla extract, 1 large egg, 225g plain flour, 1/2 tsp bicarbonate of soda, 1/4 tsp salt, 200g 
plain chocolate chips or chunks\nB: TYPE: Nighttime sharpens, heightens each sensation\nC: CLICK: (673, 59)\nD: CLICK: (422, 123)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (716, 119)\nstep 2: CLICK: (319, 64)\nstep 3: TYPE: book about self-help\nstep 4: CLICK: (882, 697)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (287, 102)\nstep 8: CLICK: (25, 352)\nstep 9: CLICK: (172, 68)\nstep 10: TYPE: how to win friends and influence people book\nstep 11: CLICK: (901, 692)\nstep 12: CLICK: (548, 434)\nstep 13: PRESS_HOME\nstep 14: CLICK: (571, 270)\nI want to Utilize Instagram, AliExpress, and Firefox to find a renowned self-help book. Start by searching for recommendations and reviews using Instagram and Firefox. Once you have identified the book, proceed to purchase it on AliExpress. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Instagram, AliExpress, and Firefox to find a renowned self-help book. Start by searching for recommendations and reviews using Instagram and Firefox. 
Once you have identified the book, proceed to purchase it on AliExpress.\nThe historical actions are: step 1: CLICK: (716, 119)\nstep 2: CLICK: (319, 64)\nstep 3: TYPE: book about self-help\nstep 4: CLICK: (882, 697)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (287, 102)\nstep 8: CLICK: (25, 352)\nstep 9: CLICK: (172, 68)\nstep 10: TYPE: how to win friends and influence people book\nstep 11: CLICK: (901, 692)\nstep 12: CLICK: (548, 434)\nstep 13: PRESS_HOME\nstep 14: CLICK: (571, 270)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Chocolate chip cookies: 150g salted butter, softened 80g light brown muscovado sugar, 80g granulated sugar, 2 tsp vanilla extract, 1 large egg, 225g plain flour, 1/2 tsp bicarbonate of soda, 1/4 tsp salt, 200g plain chocolate chips or chunks\nB: TYPE: Nighttime sharpens, heightens each sensation\nC: CLICK: (673, 59)\nD: CLICK: (422, 123)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_155_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_155_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_155_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_155_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_155_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_155_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_155_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_155_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_155_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_155_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_155_10.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_155_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_155_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_155_13.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_155_14.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (352, 204)\nB: CLICK: (366, 897)\nC: CLICK: (139, 211)\nD: CLICK: (938, 874)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (450, 920)\nstep 2: CLICK: (858, 300)\nstep 3: TYPE: when is next comic-con international\nstep 4: CLICK: (236, 287)\nstep 5: PRESS_HOME\nstep 6: CLICK: (330, 136)\nI want to Use Chrome to search for the dates of the next Comic-Con International event and then set a reminder for it in TickTick. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Chrome to search for the dates of the next Comic-Con International event and then set a reminder for it in TickTick.\nThe historical actions are: step 1: CLICK: (450, 920)\nstep 2: CLICK: (858, 300)\nstep 3: TYPE: when is next comic-con international\nstep 4: CLICK: (236, 287)\nstep 5: PRESS_HOME\nstep 6: CLICK: (330, 136)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (352, 204)\nB: CLICK: (366, 897)\nC: CLICK: (139, 211)\nD: CLICK: (938, 874)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_156_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_156_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_156_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_156_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_156_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_156_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_156_6.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (163, 159)\nB: TYPE: 125\nC: CLICK: (822, 78)\nD: CLICK: (751, 90)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (863, 505)\nstep 2: CLICK: (895, 243)\nstep 3: CLICK: (54, 62)\nstep 4: CLICK: (765, 70)\nstep 5: TYPE: rock\nstep 6: CLICK: (430, 424)\nstep 7: CLICK: (656, 429)\nstep 8: PRESS_HOME\nstep 9: CLICK: (379, 104)\nstep 10: CLICK: (722, 879)\nstep 11: TYPE: caba62244@gmail.com\nstep 12: CLICK: (263, 324)\nstep 13: CLICK: (253, 341)\nstep 14: TYPE: Rocky\nI want to Listen to a Rock-style album using Pocket FM: Audio Series, then share the name of the album via Gmail with caba62244@gmail.com. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Listen to a Rock-style album using Pocket FM: Audio Series, then share the name of the album via Gmail with caba62244@gmail.com.\nThe historical actions are: step 1: CLICK: (863, 505)\nstep 2: CLICK: (895, 243)\nstep 3: CLICK: (54, 62)\nstep 4: CLICK: (765, 70)\nstep 5: TYPE: rock\nstep 6: CLICK: (430, 424)\nstep 7: CLICK: (656, 429)\nstep 8: PRESS_HOME\nstep 9: CLICK: (379, 104)\nstep 10: CLICK: (722, 879)\nstep 11: TYPE: caba62244@gmail.com\nstep 12: CLICK: (263, 324)\nstep 13: CLICK: (253, 341)\nstep 14: TYPE: Rocky\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (163, 159)\nB: TYPE: 125\nC: CLICK: (822, 78)\nD: CLICK: (751, 90)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_157_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_157_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_157_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_157_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_157_4.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_157_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_157_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_157_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_157_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_157_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_157_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_157_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_157_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_157_13.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_157_14.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (223, 415)\nB: TYPE: The Crusades\nC: CLICK: (683, 589)\nD: COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (910, 324)\nstep 2: CLICK: (764, 811)\nstep 3: CLICK: (555, 573)\nstep 4: CLICK: (746, 86)\nstep 5: CLICK: (628, 752)\nstep 6: CLICK: (751, 90)\nstep 7: CLICK: (766, 86)\nstep 8: SCROLL: LEFT\nstep 9: CLICK: (730, 850)\nstep 10: CLICK: (607, 686)\nstep 11: CLICK: (511, 585)\nstep 12: CLICK: (751, 915)\nI want to Utilize Adobe Express: AI Video Design to edit a photo and then share the edited image on Tumblr, specifically to katsunaksu. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize Adobe Express: AI Video Design to edit a photo and then share the edited image on Tumblr, specifically to katsunaksu.\nThe historical actions are: step 1: CLICK: (910, 324)\nstep 2: CLICK: (764, 811)\nstep 3: CLICK: (555, 573)\nstep 4: CLICK: (746, 86)\nstep 5: CLICK: (628, 752)\nstep 6: CLICK: (751, 90)\nstep 7: CLICK: (766, 86)\nstep 8: SCROLL: LEFT\nstep 9: CLICK: (730, 850)\nstep 10: CLICK: (607, 686)\nstep 11: CLICK: (511, 585)\nstep 12: CLICK: (751, 915)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (223, 415)\nB: TYPE: The Crusades\nC: CLICK: (683, 589)\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_158_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_158_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_158_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_158_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_158_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_158_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_158_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_158_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_158_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_158_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_158_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_158_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_158_12.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_BACK\nB: CLICK: 
(169, 343)\nC: COMPLETE\nD: TYPE: goodfood recipe\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (288, 262)\nstep 2: CLICK: (21, 358)\nstep 3: CLICK: (124, 67)\nstep 4: TYPE: smart light bulbs recommendation\nstep 5: CLICK: (885, 704)\nstep 6: CLICK: (173, 556)\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nI want to Utilize Instagram to discover highly recommended smart light bulbs and then proceed to purchase one through Flipkart. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Instagram to discover highly recommended smart light bulbs and then proceed to purchase one through Flipkart.\nThe historical actions are: step 1: CLICK: (288, 262)\nstep 2: CLICK: (21, 358)\nstep 3: CLICK: (124, 67)\nstep 4: TYPE: smart light bulbs recommendation\nstep 5: CLICK: (885, 704)\nstep 6: CLICK: (173, 556)\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_BACK\nB: CLICK: (169, 343)\nC: COMPLETE\nD: TYPE: goodfood recipe\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_159_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_159_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_159_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_159_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_159_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_159_5.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_159_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_159_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_159_8.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (424, 646)\nB: CLICK: (482, 292)\nC: CLICK: (379, 74)\nD: COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (440, 914)\nstep 2: CLICK: (274, 415)\nstep 3: TYPE: Facebook's stock market news\nstep 4: CLICK: (884, 878)\nstep 5: SCROLL: UP\nstep 6: CLICK: (246, 786)\nstep 7: PRESS_HOME\nstep 8: CLICK: (913, 494)\nI want to Use Chrome to search for today's stock market news about Facebook, and then open TradingView: Track All Markets to check the stock price trends. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Chrome to search for today's stock market news about Facebook, and then open TradingView: Track All Markets to check the stock price trends.\nThe historical actions are: step 1: CLICK: (440, 914)\nstep 2: CLICK: (274, 415)\nstep 3: TYPE: Facebook's stock market news\nstep 4: CLICK: (884, 878)\nstep 5: SCROLL: UP\nstep 6: CLICK: (246, 786)\nstep 7: PRESS_HOME\nstep 8: CLICK: (913, 494)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (424, 646)\nB: CLICK: (482, 292)\nC: CLICK: (379, 74)\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_160_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_160_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_160_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_160_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_160_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_160_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_160_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_160_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_160_8.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: COMPLETE\nB: CLICK: (113, 808)\nC: CLICK: (666, 804)\nD: CLICK: (264, 887)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (584, 553)\nstep 3: CLICK: (347, 546)\nstep 4: CLICK: (440, 809)\nstep 5: CLICK: (343, 658)\nstep 6: CLICK: (545, 896)\nstep 7: CLICK: (554, 806)\nI want to First, use the Calculator app to determine the sum of '27.3+13' for today's total cost. Once you have the total, record this amount in either a document or the Google Keep app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, use the Calculator app to determine the sum of '27.3+13' for today's total cost. Once you have the total, record this amount in either a document or the Google Keep app.\nThe historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (584, 553)\nstep 3: CLICK: (347, 546)\nstep 4: CLICK: (440, 809)\nstep 5: CLICK: (343, 658)\nstep 6: CLICK: (545, 896)\nstep 7: CLICK: (554, 806)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (113, 808)\nC: CLICK: (666, 804)\nD: CLICK: (264, 887)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_161_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_161_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_161_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_161_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_161_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_161_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_161_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_161_7.png" + ], + 
"output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: CLICK: (819, 243)\nC: CLICK: (919, 69)\nD: CLICK: (370, 319)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: RIGHT\nstep 3: SCROLL: RIGHT\nstep 4: SCROLL: LEFT\nstep 5: CLICK: (858, 114)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: SCROLL: DOWN\nstep 11: PRESS_HOME\nstep 12: SCROLL: RIGHT\nI want to Using DuckDuckGo to search for the 2019 Nobel Prize winners in Physics, and then record the gathered information in Simplenote. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using DuckDuckGo to search for the 2019 Nobel Prize winners in Physics, and then record the gathered information in Simplenote.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: RIGHT\nstep 3: SCROLL: RIGHT\nstep 4: SCROLL: LEFT\nstep 5: CLICK: (858, 114)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: SCROLL: DOWN\nstep 11: PRESS_HOME\nstep 12: SCROLL: RIGHT\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (819, 243)\nC: CLICK: (919, 69)\nD: CLICK: (370, 319)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_162_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_162_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_162_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_162_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_162_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_162_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_162_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_162_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_162_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_162_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_162_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_162_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_162_12.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: SCROLL: LEFT\nC: COMPLETE\nD: TYPE: Healthy lunch plan\n", + 
"question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (143, 665)\nstep 2: CLICK: (947, 76)\nstep 3: CLICK: (406, 151)\nstep 4: TYPE: Moscow\nstep 5: CLICK: (270, 240)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (284, 608)\nstep 9: PRESS_HOME\nstep 10: CLICK: (411, 400)\nstep 11: CLICK: (893, 921)\nstep 12: CLICK: (893, 873)\nstep 13: TYPE: Moscow,tomorrow: cloudy and some sun todolist:buy a flight to Moscow.\nstep 14: CLICK: (69, 86)\nI want to Utilize the Weather & Radar app to check the weather forecast for Moscow tomorrow, and based on that information, create a detailed to-do list using Google Docs. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize the Weather & Radar app to check the weather forecast for Moscow tomorrow, and based on that information, create a detailed to-do list using Google Docs.\nThe historical actions are: step 1: CLICK: (143, 665)\nstep 2: CLICK: (947, 76)\nstep 3: CLICK: (406, 151)\nstep 4: TYPE: Moscow\nstep 5: CLICK: (270, 240)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (284, 608)\nstep 9: PRESS_HOME\nstep 10: CLICK: (411, 400)\nstep 11: CLICK: (893, 921)\nstep 12: CLICK: (893, 873)\nstep 13: TYPE: Moscow,tomorrow: cloudy and some sun todolist:buy a flight to Moscow.\nstep 14: CLICK: (69, 86)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: SCROLL: LEFT\nC: COMPLETE\nD: TYPE: Healthy lunch plan\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_163_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_163_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_163_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_163_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_163_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_163_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_163_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_163_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_163_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_163_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_163_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_163_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_163_12.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_163_13.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_163_14.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (879, 693)\nB: TYPE: plain chocolate chips\nC: CLICK: (182, 236)\nD: PRESS_HOME\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (716, 106)\nstep 2: CLICK: (292, 68)\nstep 3: TYPE: book about cookbook\nstep 4: CLICK: (894, 683)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (140, 97)\nstep 9: CLICK: (668, 68)\nstep 10: TYPE: cook this book\nI want to Utilize Firefox to search for a renowned cookbook and read various reviews about it. Once decided, check Ebay for listings and proceed with the purchase. Additionally, consider browsing Facebook for any available recommendations or seller reviews before finalizing your purchase. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Firefox to search for a renowned cookbook and read various reviews about it. Once decided, check Ebay for listings and proceed with the purchase. 
Additionally, consider browsing Facebook for any available recommendations or seller reviews before finalizing your purchase.\nThe historical actions are: step 1: CLICK: (716, 106)\nstep 2: CLICK: (292, 68)\nstep 3: TYPE: book about cookbook\nstep 4: CLICK: (894, 683)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (140, 97)\nstep 9: CLICK: (668, 68)\nstep 10: TYPE: cook this book\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (879, 693)\nB: TYPE: plain chocolate chips\nC: CLICK: (182, 236)\nD: PRESS_HOME\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_164_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_164_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_164_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_164_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_164_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_164_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_164_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_164_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_164_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_164_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_164_10.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (543, 277)\nB: CLICK: (860, 887)\nC: PRESS_HOME\nD: CLICK: (931, 638)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (554, 156)\nstep 2: CLICK: (727, 70)\nstep 3: TYPE: climate change rally\nI want to Locate an event related to a climate change rally using X and Facebook, then have a discussion about it with liudehu19294094. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate an event related to a climate change rally using X and Facebook, then have a discussion about it with liudehu19294094.\nThe historical actions are: step 1: CLICK: (554, 156)\nstep 2: CLICK: (727, 70)\nstep 3: TYPE: climate change rally\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (543, 277)\nB: CLICK: (860, 887)\nC: PRESS_HOME\nD: CLICK: (931, 638)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_165_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_165_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_165_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_165_3.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: CLICK: (299, 229)\nC: CLICK: (920, 910)\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (141, 670)\nstep 2: CLICK: (317, 282)\nstep 3: TYPE: hiking trail\nstep 4: CLICK: (915, 919)\nstep 5: CLICK: (500, 511)\nstep 6: PRESS_HOME\nstep 7: CLICK: (148, 393)\nstep 8: SCROLL: DOWN\nstep 9: CLICK: (951, 78)\nstep 10: CLICK: (308, 153)\nstep 11: TYPE: California\nI want to Using Chrome, search for a new hiking trail. Then, check the weekend weather forecast using Weather & Radar. Finally, send an invitation to join the hike to caba62244@gmail.com through Gmail. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Chrome, search for a new hiking trail. Then, check the weekend weather forecast using Weather & Radar. Finally, send an invitation to join the hike to caba62244@gmail.com through Gmail.\nThe historical actions are: step 1: CLICK: (141, 670)\nstep 2: CLICK: (317, 282)\nstep 3: TYPE: hiking trail\nstep 4: CLICK: (915, 919)\nstep 5: CLICK: (500, 511)\nstep 6: PRESS_HOME\nstep 7: CLICK: (148, 393)\nstep 8: SCROLL: DOWN\nstep 9: CLICK: (951, 78)\nstep 10: CLICK: (308, 153)\nstep 11: TYPE: California\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (299, 229)\nC: CLICK: (920, 910)\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_166_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_166_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_166_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_166_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_166_4.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_166_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_166_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_166_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_166_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_166_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_166_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_166_11.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: SCROLL: UP\nC: TYPE: happy\nD: SCROLL: DOWN\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (415, 663)\nstep 2: CLICK: (172, 84)\nstep 3: SCROLL: UP\nstep 4: CLICK: (196, 707)\nstep 5: CLICK: (459, 327)\nI want to Attend a Portuguese language lesson using Rosetta Stone: Learn, Practice and set a learning plan in Things. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Attend a Portuguese language lesson using Rosetta Stone: Learn, Practice and set a learning plan in Things.\nThe historical actions are: step 1: CLICK: (415, 663)\nstep 2: CLICK: (172, 84)\nstep 3: SCROLL: UP\nstep 4: CLICK: (196, 707)\nstep 5: CLICK: (459, 327)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: SCROLL: UP\nC: TYPE: happy\nD: SCROLL: DOWN\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_167_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_167_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_167_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_167_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_167_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_167_5.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: CLICK: (23, 462)\nC: TYPE: instagram\nD: COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (567, 263)\nstep 2: CLICK: (378, 651)\nstep 3: CLICK: (899, 67)\nstep 4: LONG_PRESS: (463, 287)\nstep 5: SCROLL: LEFT\nstep 6: SCROLL: DOWN\nstep 7: CLICK: (296, 216)\nstep 8: PRESS_HOME\nstep 9: CLICK: (714, 398)\nstep 10: CLICK: (631, 564)\nstep 11: CLICK: (729, 412)\nI want to Open Opera News and read an English news article. Translate the title of the article into German using DeepL translate. Finally, record the translated title in Simplenote. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open Opera News and read an English news article. Translate the title of the article into German using DeepL translate. Finally, record the translated title in Simplenote.\nThe historical actions are: step 1: CLICK: (567, 263)\nstep 2: CLICK: (378, 651)\nstep 3: CLICK: (899, 67)\nstep 4: LONG_PRESS: (463, 287)\nstep 5: SCROLL: LEFT\nstep 6: SCROLL: DOWN\nstep 7: CLICK: (296, 216)\nstep 8: PRESS_HOME\nstep 9: CLICK: (714, 398)\nstep 10: CLICK: (631, 564)\nstep 11: CLICK: (729, 412)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (23, 462)\nC: TYPE: instagram\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_168_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_168_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_168_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_168_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_168_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_168_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_168_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_168_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_168_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_168_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_168_10.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_168_11.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (924, 70)\nB: CLICK: (489, 941)\nC: CLICK: (177, 756)\nD: CLICK: (376, 827)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (824, 817)\nstep 2: CLICK: (846, 212)\nstep 3: PRESS_HOME\nI want to First, use the Google Play Store to install the Tiktok app. Once installed, launch Tiktok. After launching, navigate to the Setting app to disable notifications for Tiktok. Finally, reopen Tiktok to enjoy watching a video. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, use the Google Play Store to install the Tiktok app. Once installed, launch Tiktok. After launching, navigate to the Setting app to disable notifications for Tiktok. 
Finally, reopen Tiktok to enjoy watching a video.\nThe historical actions are: step 1: CLICK: (824, 817)\nstep 2: CLICK: (846, 212)\nstep 3: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (924, 70)\nB: CLICK: (489, 941)\nC: CLICK: (177, 756)\nD: CLICK: (376, 827)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_169_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_169_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_169_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_169_3.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (463, 651)\nB: CLICK: (719, 278)\nC: CLICK: (399, 142)\nD: PRESS_RECENT\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (419, 106)\nstep 2: CLICK: (278, 264)\nstep 3: CLICK: (69, 189)\nstep 4: CLICK: (919, 598)\nstep 5: CLICK: (79, 888)\nstep 6: CLICK: (79, 588)\nstep 7: CLICK: (923, 692)\nstep 8: LONG_PRESS: (228, 387)\nstep 9: SCROLL: RIGHT\nstep 10: CLICK: (465, 332)\nstep 11: PRESS_HOME\nI want to Using Opera, look up the 2021 Nobel-Prize winners in physics and document the details in Microsoft Word. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using Opera, look up the 2021 Nobel-Prize winners in physics and document the details in Microsoft Word.\nThe historical actions are: step 1: CLICK: (419, 106)\nstep 2: CLICK: (278, 264)\nstep 3: CLICK: (69, 189)\nstep 4: CLICK: (919, 598)\nstep 5: CLICK: (79, 888)\nstep 6: CLICK: (79, 588)\nstep 7: CLICK: (923, 692)\nstep 8: LONG_PRESS: (228, 387)\nstep 9: SCROLL: RIGHT\nstep 10: CLICK: (465, 332)\nstep 11: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (463, 651)\nB: CLICK: (719, 278)\nC: CLICK: (399, 142)\nD: PRESS_RECENT\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_170_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_170_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_170_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_170_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_170_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_170_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_170_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_170_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_170_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_170_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_170_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_170_11.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: CLICK: (402, 866)\nC: CLICK: (502, 331)\nD: TYPE: Bayes' theorem\n", + "question": "The last image represents the current screenshot and the 
preceding images are historical screenshots. The historical actions are: step 1: CLICK: (216, 682)\nstep 2: CLICK: (205, 425)\nstep 3: TYPE: drama movies\nstep 4: CLICK: (841, 903)\nI want to Organize a movie night by selecting a drama film using Chrome, adding snacks to your cart on eBay, sending out invitations to caba62244@gmail.com via Gmail, and setting a reminder on the Clock app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Organize a movie night by selecting a drama film using Chrome, adding snacks to your cart on eBay, sending out invitations to caba62244@gmail.com via Gmail, and setting a reminder on the Clock app.\nThe historical actions are: step 1: CLICK: (216, 682)\nstep 2: CLICK: (205, 425)\nstep 3: TYPE: drama movies\nstep 4: CLICK: (841, 903)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (402, 866)\nC: CLICK: (502, 331)\nD: TYPE: Bayes' theorem\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_171_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_171_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_171_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_171_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_171_4.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: CLICK: (287, 817)\nC: CLICK: (425, 575)\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical 
screenshots. The historical actions are: step 1: CLICK: (130, 362)\nstep 2: CLICK: (226, 866)\nstep 3: TYPE: tell me about Binomial theorem\nstep 4: CLICK: (906, 596)\nstep 5: CLICK: (112, 965)\nstep 6: PRESS_HOME\nstep 7: CLICK: (130, 244)\nstep 8: CLICK: (337, 56)\nstep 9: TYPE: Binomial theorem\nstep 10: CLICK: (928, 905)\nI want to Utilize ChatOn - AI Chat Bot Assistant to inquire about the Binomial theorem, then cross-verify the information using Firefox to conduct a browser search. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize ChatOn - AI Chat Bot Assistant to inquire about the Binomial theorem, then cross-verify the information using Firefox to conduct a browser search.\nThe historical actions are: step 1: CLICK: (130, 362)\nstep 2: CLICK: (226, 866)\nstep 3: TYPE: tell me about Binomial theorem\nstep 4: CLICK: (906, 596)\nstep 5: CLICK: (112, 965)\nstep 6: PRESS_HOME\nstep 7: CLICK: (130, 244)\nstep 8: CLICK: (337, 56)\nstep 9: TYPE: Binomial theorem\nstep 10: CLICK: (928, 905)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (287, 817)\nC: CLICK: (425, 575)\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_172_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_172_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_172_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_172_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_172_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_172_5.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_172_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_172_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_172_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_172_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_172_10.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (898, 69)\nB: CLICK: (323, 80)\nC: COMPLETE\nD: CLICK: (333, 127)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (691, 214)\nstep 2: TYPE: Fitness Tracker Apps\nstep 3: CLICK: (894, 878)\nstep 4: CLICK: (454, 857)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (513, 760)\nI want to Using DuckDuckGo to research various Fitness Tracker apps and then download one that suits your needs from the Google Play Store. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using DuckDuckGo to research various Fitness Tracker apps and then download one that suits your needs from the Google Play Store.\nThe historical actions are: step 1: CLICK: (691, 214)\nstep 2: TYPE: Fitness Tracker Apps\nstep 3: CLICK: (894, 878)\nstep 4: CLICK: (454, 857)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (513, 760)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (898, 69)\nB: CLICK: (323, 80)\nC: COMPLETE\nD: CLICK: (333, 127)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_173_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_173_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_173_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_173_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_173_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_173_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_173_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_173_7.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (240, 779)\nB: CLICK: (532, 928)\nC: CLICK: (163, 147)\nD: CLICK: (408, 492)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (599, 911)\nstep 2: PRESS_HOME\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (124, 699)\nstep 5: TYPE: centr app\nI want to Open YouTube and watch a video about fitness tracking app recommendations, then head over to the Google Play Store and download one of the suggested apps. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open YouTube and watch a video about fitness tracking app recommendations, then head over to the Google Play Store and download one of the suggested apps.\nThe historical actions are: step 1: CLICK: (599, 911)\nstep 2: PRESS_HOME\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (124, 699)\nstep 5: TYPE: centr app\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (240, 779)\nB: CLICK: (532, 928)\nC: CLICK: (163, 147)\nD: CLICK: (408, 492)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_174_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_174_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_174_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_174_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_174_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_174_5.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: Meesho\nB: SCROLL: UP\nC: CLICK: (122, 161)\nD: CLICK: (234, 564)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (144, 706)\nstep 2: CLICK: (287, 112)\nstep 3: TYPE: new hiking trail\nstep 4: CLICK: (889, 683)\nstep 5: CLICK: (337, 924)\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (144, 574)\nstep 9: CLICK: (973, 69)\nstep 10: CLICK: (427, 169)\nstep 11: TYPE: Hong Kong\nstep 12: CLICK: (383, 274)\nstep 13: SCROLL: UP\nstep 14: SCROLL: UP\nI want to Browse for a new hiking trail using Chrome, check the weekend weather with Weather & Radar, and invite Victor James to join through Messenger. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Browse for a new hiking trail using Chrome, check the weekend weather with Weather & Radar, and invite Victor James to join through Messenger.\nThe historical actions are: step 1: CLICK: (144, 706)\nstep 2: CLICK: (287, 112)\nstep 3: TYPE: new hiking trail\nstep 4: CLICK: (889, 683)\nstep 5: CLICK: (337, 924)\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (144, 574)\nstep 9: CLICK: (973, 69)\nstep 10: CLICK: (427, 169)\nstep 11: TYPE: Hong Kong\nstep 12: CLICK: (383, 274)\nstep 13: SCROLL: UP\nstep 14: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Meesho\nB: SCROLL: UP\nC: CLICK: (122, 161)\nD: CLICK: (234, 564)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_175_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_175_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_175_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_175_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_175_4.png", 
+ "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_175_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_175_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_175_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_175_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_175_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_175_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_175_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_175_12.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_175_13.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_175_14.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (342, 154)\nB: TYPE: TickTick\nC: TYPE: the latest Spider-Man movie\nD: COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (432, 131)\nstep 3: CLICK: (761, 121)\nI want to Utilize Opera to search for the latest Spider-Man movie, and then consult aCalendar to find an available evening to watch it. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize Opera to search for the latest Spider-Man movie, and then consult aCalendar to find an available evening to watch it.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (432, 131)\nstep 3: CLICK: (761, 121)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (342, 154)\nB: TYPE: TickTick\nC: TYPE: the latest Spider-Man movie\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_176_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_176_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_176_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_176_3.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (581, 368)\nB: TYPE: New York City, USA itinerary\nC: CLICK: (578, 333)\nD: PRESS_HOME\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (229, 269)\nstep 3: CLICK: (59, 950)\nstep 4: CLICK: (67, 52)\nI want to Create an ideal itinerary for a trip to New York City, USA, and secure your stay using Airbnb. Additionally, utilize Threads to discuss and finalize your itinerary details. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Create an ideal itinerary for a trip to New York City, USA, and secure your stay using Airbnb. 
Additionally, utilize Threads to discuss and finalize your itinerary details.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (229, 269)\nstep 3: CLICK: (59, 950)\nstep 4: CLICK: (67, 52)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (581, 368)\nB: TYPE: New York City, USA itinerary\nC: CLICK: (578, 333)\nD: PRESS_HOME\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_177_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_177_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_177_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_177_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_177_4.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: COMPLETE\nB: PRESS_HOME\nC: CLICK: (816, 125)\nD: CLICK: (518, 892)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (171, 397)\nstep 2: CLICK: (355, 134)\nstep 3: TYPE: book about technology\nstep 4: CLICK: (915, 907)\nstep 5: PRESS_HOME\nstep 6: CLICK: (840, 265)\nstep 7: CLICK: (293, 949)\nstep 8: CLICK: (467, 87)\nstep 9: TYPE: the age of AI book\nstep 10: CLICK: (930, 916)\nstep 11: CLICK: (467, 801)\nstep 12: SCROLL: UP\nI want to Use Instagram to discover a famous technology book, then switch to Opera to read reviews about it, and finally, head over to Amazon to purchase the book. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Instagram to discover a famous technology book, then switch to Opera to read reviews about it, and finally, head over to Amazon to purchase the book.\nThe historical actions are: step 1: CLICK: (171, 397)\nstep 2: CLICK: (355, 134)\nstep 3: TYPE: book about technology\nstep 4: CLICK: (915, 907)\nstep 5: PRESS_HOME\nstep 6: CLICK: (840, 265)\nstep 7: CLICK: (293, 949)\nstep 8: CLICK: (467, 87)\nstep 9: TYPE: the age of AI book\nstep 10: CLICK: (930, 916)\nstep 11: CLICK: (467, 801)\nstep 12: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: PRESS_HOME\nC: CLICK: (816, 125)\nD: CLICK: (518, 892)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_178_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_178_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_178_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_178_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_178_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_178_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_178_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_178_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_178_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_178_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_178_10.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_178_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_178_12.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: COMPLETE\nB: CLICK: (911, 80)\nC: CLICK: (464, 812)\nD: TYPE: triller\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (670, 922)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (148, 174)\nstep 4: PRESS_HOME\nstep 5: CLICK: (537, 929)\nstep 6: PRESS_HOME\nstep 7: CLICK: (664, 916)\nstep 8: CLICK: (406, 69)\nstep 9: CLICK: (964, 76)\nI want to First, open the App Store and uninstall the TikTok app. Afterward, navigate to the Setting app to verify if TikTok has been successfully uninstalled. Then, head over to Google Play Store, download the Triller app, and finally, open the newly installed app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, open the App Store and uninstall the TikTok app. Afterward, navigate to the Setting app to verify if TikTok has been successfully uninstalled. 
Then, head over to Google Play Store, download the Triller app, and finally, open the newly installed app.\nThe historical actions are: step 1: CLICK: (670, 922)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (148, 174)\nstep 4: PRESS_HOME\nstep 5: CLICK: (537, 929)\nstep 6: PRESS_HOME\nstep 7: CLICK: (664, 916)\nstep 8: CLICK: (406, 69)\nstep 9: CLICK: (964, 76)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (911, 80)\nC: CLICK: (464, 812)\nD: TYPE: triller\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_179_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_179_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_179_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_179_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_179_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_179_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_179_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_179_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_179_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_179_9.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: CLICK: (191, 554)\nC: COMPLETE\nD: CLICK: (837, 80)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (385, 643)\nstep 2: CLICK: (440, 57)\nstep 3: TYPE: biographical movies\nstep 4: CLICK: (921, 901)\nstep 5: CLICK: (316, 235)\nstep 6: SCROLL: UP\nI want to Organize a movie night by selecting a biographical film on DuckDuckgo, adding snacks to your cart on Amazon, sending an invitation to caba62244@gmail.com via Gmail, and setting a reminder on Clock. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Organize a movie night by selecting a biographical film on DuckDuckgo, adding snacks to your cart on Amazon, sending an invitation to caba62244@gmail.com via Gmail, and setting a reminder on Clock.\nThe historical actions are: step 1: CLICK: (385, 643)\nstep 2: CLICK: (440, 57)\nstep 3: TYPE: biographical movies\nstep 4: CLICK: (921, 901)\nstep 5: CLICK: (316, 235)\nstep 6: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (191, 554)\nC: COMPLETE\nD: CLICK: (837, 80)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_180_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_180_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_180_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_180_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_180_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_180_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_180_6.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + 
"source": "GUI-Odyssey", + "options": "A: CLICK: (922, 911)\nB: PRESS_HOME\nC: TYPE: hiking trail\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (160, 145)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\nstep 5: CLICK: (420, 574)\nstep 6: LONG_PRESS: (486, 424)\nstep 7: SCROLL: LEFT\nstep 8: SCROLL: DOWN\nstep 9: CLICK: (148, 376)\nI want to Open the AP News app and read an English news article. Use DeepL translate to translate the title of the article into French. Finally, record the translated title in Google Docs. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open the AP News app and read an English news article. Use DeepL translate to translate the title of the article into French. 
Finally, record the translated title in Google Docs.\nThe historical actions are: step 1: CLICK: (160, 145)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\nstep 5: CLICK: (420, 574)\nstep 6: LONG_PRESS: (486, 424)\nstep 7: SCROLL: LEFT\nstep 8: SCROLL: DOWN\nstep 9: CLICK: (148, 376)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (922, 911)\nB: PRESS_HOME\nC: TYPE: hiking trail\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_181_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_181_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_181_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_181_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_181_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_181_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_181_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_181_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_181_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_181_9.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: RIGHT\nB: CLICK: (931, 908)\nC: PRESS_HOME\nD: TYPE: properties of cylinder\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (377, 144)\nstep 2: CLICK: (578, 199)\nstep 3: CLICK: (938, 131)\nI want to Utilize Opera to research information on the properties of a Cylinder, then compile a brief document using WPS Office. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Opera to research information on the properties of a Cylinder, then compile a brief document using WPS Office.\nThe historical actions are: step 1: CLICK: (377, 144)\nstep 2: CLICK: (578, 199)\nstep 3: CLICK: (938, 131)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: RIGHT\nB: CLICK: (931, 908)\nC: PRESS_HOME\nD: TYPE: properties of cylinder\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_182_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_182_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_182_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_182_3.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (872, 789)\nB: CLICK: (516, 901)\nC: CLICK: (880, 699)\nD: CLICK: (892, 716)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (285, 99)\nstep 2: CLICK: (131, 53)\nstep 3: TYPE: mobile app developer\nI want to Utilize LinkedIn: Jobs & Business News to find a mobile app developer job, then use WPS office to record the company name. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize LinkedIn: Jobs & Business News to find a mobile app developer job, then use WPS office to record the company name.\nThe historical actions are: step 1: CLICK: (285, 99)\nstep 2: CLICK: (131, 53)\nstep 3: TYPE: mobile app developer\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (872, 789)\nB: CLICK: (516, 901)\nC: CLICK: (880, 699)\nD: CLICK: (892, 716)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_183_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_183_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_183_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_183_3.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (183, 82)\nB: CLICK: (347, 381)\nC: CLICK: (174, 345)\nD: CLICK: (936, 919)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (174, 237)\nstep 2: CLICK: (374, 182)\nstep 3: CLICK: (919, 104)\nstep 4: CLICK: (928, 111)\nstep 5: TYPE: when is the next Coachella music festival\nI want to Using Opera, search for the next Coachella music festival dates and then set a reminder in Microsoft To Do. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using Opera, search for the next Coachella music festival dates and then set a reminder in Microsoft To Do.\nThe historical actions are: step 1: CLICK: (174, 237)\nstep 2: CLICK: (374, 182)\nstep 3: CLICK: (919, 104)\nstep 4: CLICK: (928, 111)\nstep 5: TYPE: when is the next Coachella music festival\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (183, 82)\nB: CLICK: (347, 381)\nC: CLICK: (174, 345)\nD: CLICK: (936, 919)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_184_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_184_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_184_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_184_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_184_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_184_5.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: CLICK: (815, 671)\nC: CLICK: (929, 916)\nD: CLICK: (876, 484)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (341, 393)\nstep 2: CLICK: (242, 573)\nstep 3: CLICK: (524, 567)\nstep 4: CLICK: (799, 576)\nstep 5: CLICK: (272, 664)\nstep 6: CLICK: (492, 671)\nI want to Use Applock Pro - APP Lock & Guard to secure the Venmo app with a lock, and then open Venmo to verify the lock is active. The PIN for the lock is 123456. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Applock Pro - APP Lock & Guard to secure the Venmo app with a lock, and then open Venmo to verify the lock is active. The PIN for the lock is 123456.\nThe historical actions are: step 1: CLICK: (341, 393)\nstep 2: CLICK: (242, 573)\nstep 3: CLICK: (524, 567)\nstep 4: CLICK: (799, 576)\nstep 5: CLICK: (272, 664)\nstep 6: CLICK: (492, 671)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (815, 671)\nC: CLICK: (929, 916)\nD: CLICK: (876, 484)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_185_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_185_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_185_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_185_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_185_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_185_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_185_6.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (116, 249)\nB: CLICK: (848, 916)\nC: CLICK: (487, 641)\nD: PRESS_HOME\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (429, 723)\nstep 2: CLICK: (160, 461)\nstep 3: CLICK: (502, 807)\nstep 4: CLICK: (941, 129)\nstep 5: TYPE: TickTick\nstep 6: CLICK: (488, 282)\nI want to Toggle the notifications for any application on your phone using the 'Setting' app, and then open 'TickTick'. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Toggle the notifications for any application on your phone using the 'Setting' app, and then open 'TickTick'.\nThe historical actions are: step 1: CLICK: (429, 723)\nstep 2: CLICK: (160, 461)\nstep 3: CLICK: (502, 807)\nstep 4: CLICK: (941, 129)\nstep 5: TYPE: TickTick\nstep 6: CLICK: (488, 282)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (116, 249)\nB: CLICK: (848, 916)\nC: CLICK: (487, 641)\nD: PRESS_HOME\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_186_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_186_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_186_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_186_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_186_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_186_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_186_6.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (169, 649)\nB: CLICK: (972, 582)\nC: TYPE: Los Angeles\nD: SCROLL: UP\n", + "question": "The last image represents the 
current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (130, 699)\nstep 2: CLICK: (305, 109)\nstep 3: TYPE: new hiking trail\nstep 4: CLICK: (880, 693)\nstep 5: SCROLL: UP\nstep 6: CLICK: (429, 158)\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (423, 546)\nI want to Use Chrome to search for a new hiking trail, then consult Windy.com-Weather Forecast to check the weekend weather. Finally, hop onto Tumblr to invite katsunaksu to join. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Chrome to search for a new hiking trail, then consult Windy.com-Weather Forecast to check the weekend weather. Finally, hop onto Tumblr to invite katsunaksu to join.\nThe historical actions are: step 1: CLICK: (130, 699)\nstep 2: CLICK: (305, 109)\nstep 3: TYPE: new hiking trail\nstep 4: CLICK: (880, 693)\nstep 5: SCROLL: UP\nstep 6: CLICK: (429, 158)\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (423, 546)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (169, 649)\nB: CLICK: (972, 582)\nC: TYPE: Los Angeles\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_187_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_187_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_187_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_187_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_187_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_187_5.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_187_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_187_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_187_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_187_9.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (46, 499)\nB: CLICK: (955, 53)\nC: CLICK: (450, 927)\nD: CLICK: (474, 582)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (666, 833)\nstep 2: CLICK: (301, 279)\nstep 3: TYPE: The Age of Exploration\nstep 4: CLICK: (270, 139)\nI want to Research a historical event from The Age of Exploration using Chrome, and then read or listen to a related book on Amazon Kindle. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Research a historical event from The Age of Exploration using Chrome, and then read or listen to a related book on Amazon Kindle.\nThe historical actions are: step 1: CLICK: (666, 833)\nstep 2: CLICK: (301, 279)\nstep 3: TYPE: The Age of Exploration\nstep 4: CLICK: (270, 139)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (46, 499)\nB: CLICK: (955, 53)\nC: CLICK: (450, 927)\nD: CLICK: (474, 582)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_188_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_188_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_188_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_188_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_188_4.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: COMPLETE\nB: PRESS_HOME\nC: CLICK: (547, 605)\nD: TYPE: vintage\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (858, 818)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (877, 890)\nstep 4: PRESS_HOME\nstep 5: CLICK: (374, 806)\nstep 6: PRESS_HOME\nstep 7: CLICK: (858, 818)\nstep 8: CLICK: (802, 74)\nstep 9: CLICK: (919, 74)\nstep 10: TYPE: triller\nstep 11: CLICK: (895, 901)\nstep 12: CLICK: (838, 349)\nI want to Begin by uninstalling the TikTok app using the Google Play Store. Once done, navigate to Settings to verify that TikTok has been successfully uninstalled. After confirmation, download a similar app such as Triller from the Google Play Store, and then proceed to open the Triller app. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Begin by uninstalling the TikTok app using the Google Play Store. Once done, navigate to Settings to verify that TikTok has been successfully uninstalled. After confirmation, download a similar app such as Triller from the Google Play Store, and then proceed to open the Triller app.\nThe historical actions are: step 1: CLICK: (858, 818)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (877, 890)\nstep 4: PRESS_HOME\nstep 5: CLICK: (374, 806)\nstep 6: PRESS_HOME\nstep 7: CLICK: (858, 818)\nstep 8: CLICK: (802, 74)\nstep 9: CLICK: (919, 74)\nstep 10: TYPE: triller\nstep 11: CLICK: (895, 901)\nstep 12: CLICK: (838, 349)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: PRESS_HOME\nC: CLICK: (547, 605)\nD: TYPE: vintage\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_189_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_189_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_189_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_189_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_189_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_189_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_189_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_189_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_189_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_189_9.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_189_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_189_11.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_189_12.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: COMPLETE\nB: TYPE: 2001 Colony St\nC: CLICK: (684, 542)\nD: CLICK: (496, 328)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (722, 109)\nstep 2: CLICK: (254, 727)\nstep 3: PRESS_HOME\nstep 4: CLICK: (565, 261)\nstep 5: CLICK: (355, 599)\nI want to Use Redfin Houses for Sale & Rent to find a rental property in your city. Then, utilize Uber to determine the driving distance from your current location to the property. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Redfin Houses for Sale & Rent to find a rental property in your city. 
Then, utilize Uber to determine the driving distance from your current location to the property.\nThe historical actions are: step 1: CLICK: (722, 109)\nstep 2: CLICK: (254, 727)\nstep 3: PRESS_HOME\nstep 4: CLICK: (565, 261)\nstep 5: CLICK: (355, 599)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: TYPE: 2001 Colony St\nC: CLICK: (684, 542)\nD: CLICK: (496, 328)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_190_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_190_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_190_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_190_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_190_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_190_5.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: TYPE: shopping list for making sushi\nB: TYPE: Costco\nC: TYPE: when is the next Rolling Stones concert tour\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (294, 257)\nstep 2: CLICK: (261, 652)\nstep 3: TYPE: shopping mall\nstep 4: CLICK: (898, 683)\nstep 5: CLICK: (332, 278)\nstep 6: PRESS_HOME\nstep 7: CLICK: (576, 423)\nstep 8: CLICK: (437, 679)\nI want to Use Waze Navigation & Live Traffic to locate a nearby shopping mall, then book a ride with Lyft. 
Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Waze Navigation & Live Traffic to locate a nearby shopping mall, then book a ride with Lyft.\nThe historical actions are: step 1: CLICK: (294, 257)\nstep 2: CLICK: (261, 652)\nstep 3: TYPE: shopping mall\nstep 4: CLICK: (898, 683)\nstep 5: CLICK: (332, 278)\nstep 6: PRESS_HOME\nstep 7: CLICK: (576, 423)\nstep 8: CLICK: (437, 679)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: shopping list for making sushi\nB: TYPE: Costco\nC: TYPE: when is the next Rolling Stones concert tour\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_191_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_191_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_191_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_191_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_191_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_191_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_191_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_191_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_191_8.png" + ], + "output": "B" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: TYPE: Hong Kong\nC: PRESS_HOME\nD: TYPE: subaru most popular car\n", + "question": "The last image represents the current screenshot and the preceding images are 
historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: RIGHT\nstep 3: CLICK: (166, 199)\nstep 4: CLICK: (297, 82)\nstep 5: CLICK: (924, 76)\nI want to Utilize AutoScout24: Buy & sell cars, and DuckDuckGo to identify the most popular Subaru car product and verify its price within the car app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize AutoScout24: Buy & sell cars, and DuckDuckGo to identify the most popular Subaru car product and verify its price within the car app.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: RIGHT\nstep 3: CLICK: (166, 199)\nstep 4: CLICK: (297, 82)\nstep 5: CLICK: (924, 76)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: TYPE: Hong Kong\nC: PRESS_HOME\nD: TYPE: subaru most popular car\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_192_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_192_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_192_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_192_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_192_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_192_5.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: CLICK: (301, 123)\nC: SCROLL: RIGHT\nD: COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (322, 132)\nstep 2: CLICK: (272, 590)\nstep 3: CLICK: (501, 232)\nstep 4: TYPE: Business\nstep 5: CLICK: (744, 63)\nstep 6: CLICK: (593, 605)\nstep 7: PRESS_HOME\nI want to Organize a business meeting with Gentsdgoi Setgss via ZOOM Cloud Meetings, send them an invitation through Messenger, and set an alarm clock for the meeting using the Clock app. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Organize a business meeting with Gentsdgoi Setgss via ZOOM Cloud Meetings, send them an invitation through Messenger, and set an alarm clock for the meeting using the Clock app.\nThe historical actions are: step 1: CLICK: (322, 132)\nstep 2: CLICK: (272, 590)\nstep 3: CLICK: (501, 232)\nstep 4: TYPE: Business\nstep 5: CLICK: (744, 63)\nstep 6: CLICK: (593, 605)\nstep 7: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (301, 123)\nC: SCROLL: RIGHT\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_193_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_193_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_193_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_193_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_193_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_193_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_193_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_193_7.png" + ], + "output": "B" + }, + { + 
"task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: CLICK: (336, 404)\nC: CLICK: (413, 516)\nD: CLICK: (870, 386)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (127, 647)\nstep 2: CLICK: (501, 383)\nstep 3: CLICK: (593, 561)\nstep 4: PRESS_HOME\nI want to Engage in a Spanish language lesson using Duolingo and establish a learning plan with TickTick. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Engage in a Spanish language lesson using Duolingo and establish a learning plan with TickTick.\nThe historical actions are: step 1: CLICK: (127, 647)\nstep 2: CLICK: (501, 383)\nstep 3: CLICK: (593, 561)\nstep 4: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (336, 404)\nC: CLICK: (413, 516)\nD: CLICK: (870, 386)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_194_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_194_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_194_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_194_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_194_4.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: UP\nB: CLICK: (203, 290)\nC: COMPLETE\nD: CLICK: (454, 937)\n", + "question": "The last image represents the current 
screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (632, 388)\nstep 2: CLICK: (780, 78)\nstep 3: CLICK: (302, 878)\nstep 4: CLICK: (331, 808)\nstep 5: PRESS_HOME\nstep 6: CLICK: (841, 112)\nstep 7: CLICK: (542, 411)\nstep 8: CLICK: (566, 941)\nstep 9: TYPE: https://teams.live.com/meet/9374833361011?p=xEkF25dJMsD4Fheq\nstep 10: CLICK: (916, 596)\nI want to Set up an online meeting using Microsoft Teams and share the meeting link with Victor James through Messenger. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Set up an online meeting using Microsoft Teams and share the meeting link with Victor James through Messenger.\nThe historical actions are: step 1: CLICK: (632, 388)\nstep 2: CLICK: (780, 78)\nstep 3: CLICK: (302, 878)\nstep 4: CLICK: (331, 808)\nstep 5: PRESS_HOME\nstep 6: CLICK: (841, 112)\nstep 7: CLICK: (542, 411)\nstep 8: CLICK: (566, 941)\nstep 9: TYPE: https://teams.live.com/meet/9374833361011?p=xEkF25dJMsD4Fheq\nstep 10: CLICK: (916, 596)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (203, 290)\nC: COMPLETE\nD: CLICK: (454, 937)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_195_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_195_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_195_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_195_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_195_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_195_5.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_195_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_195_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_195_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_195_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_195_10.png" + ], + "output": "C" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: SCROLL: RIGHT\nB: COMPLETE\nC: TYPE: 'Becoming' by Michelle Obama\nD: CLICK: (902, 922)\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (596, 278)\nstep 2: CLICK: (52, 73)\nstep 3: CLICK: (18, 68)\nstep 4: CLICK: (866, 72)\nstep 5: CLICK: (491, 76)\nstep 6: TYPE: how to learn to code in Python\nstep 7: CLICK: (581, 165)\nstep 8: CLICK: (886, 184)\nstep 9: CLICK: (462, 683)\nstep 10: CLICK: (308, 474)\nstep 11: PRESS_HOME\nI want to Conduct research on how to learn Python coding using Quora for information, then create a reminder in TickTick to schedule a time to start the tutorial on the website. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Conduct research on how to learn Python coding using Quora for information, then create a reminder in TickTick to schedule a time to start the tutorial on the website.\nThe historical actions are: step 1: CLICK: (596, 278)\nstep 2: CLICK: (52, 73)\nstep 3: CLICK: (18, 68)\nstep 4: CLICK: (866, 72)\nstep 5: CLICK: (491, 76)\nstep 6: TYPE: how to learn to code in Python\nstep 7: CLICK: (581, 165)\nstep 8: CLICK: (886, 184)\nstep 9: CLICK: (462, 683)\nstep 10: CLICK: (308, 474)\nstep 11: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: RIGHT\nB: COMPLETE\nC: TYPE: 'Becoming' by Michelle Obama\nD: CLICK: (902, 922)\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_196_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_196_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_196_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_196_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_196_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_196_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_196_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_196_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_196_8.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_196_9.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_196_10.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_196_11.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (883, 692)\nB: COMPLETE\nC: SCROLL: UP\nD: PRESS_HOME\n", + 
"question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (425, 401)\nstep 2: CLICK: (390, 891)\nstep 3: TYPE: tell me about Fundamental theorem of calculus\nstep 4: CLICK: (689, 419)\nstep 5: PRESS_HOME\nstep 6: CLICK: (284, 391)\nstep 7: TYPE: Fundamental theorem of calculus\nI want to Consult the AI-related app 'Chatbot AI & Smart Assistant' to inquire about the Fundamental Theorem of Calculus, and then use the 'DuckDuckGo' browser to verify the information. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Consult the AI-related app 'Chatbot AI & Smart Assistant' to inquire about the Fundamental Theorem of Calculus, and then use the 'DuckDuckGo' browser to verify the information.\nThe historical actions are: step 1: CLICK: (425, 401)\nstep 2: CLICK: (390, 891)\nstep 3: TYPE: tell me about Fundamental theorem of calculus\nstep 4: CLICK: (689, 419)\nstep 5: PRESS_HOME\nstep 6: CLICK: (284, 391)\nstep 7: TYPE: Fundamental theorem of calculus\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (883, 692)\nB: COMPLETE\nC: SCROLL: UP\nD: PRESS_HOME\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_197_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_197_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_197_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_197_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_197_4.png", + 
"../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_197_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_197_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_197_7.png" + ], + "output": "A" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: PRESS_HOME\nB: TYPE: Anemone\nC: COMPLETE\nD: SCROLL: UP\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (687, 220)\nstep 2: CLICK: (245, 727)\nstep 3: CLICK: (531, 724)\nstep 4: CLICK: (752, 717)\nstep 5: CLICK: (263, 821)\nstep 6: CLICK: (512, 822)\nstep 7: CLICK: (752, 825)\nstep 8: SCROLL: UP\nI want to Firstly, utilize 'Applock Pro - APP Lock & Guard' to secure 'Google Wallet'. After setting the lock, proceed to open 'Google Wallet' to verify the security settings using the PIN code 123456. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Firstly, utilize 'Applock Pro - APP Lock & Guard' to secure 'Google Wallet'. 
After setting the lock, proceed to open 'Google Wallet' to verify the security settings using the PIN code 123456.\nThe historical actions are: step 1: CLICK: (687, 220)\nstep 2: CLICK: (245, 727)\nstep 3: CLICK: (531, 724)\nstep 4: CLICK: (752, 717)\nstep 5: CLICK: (263, 821)\nstep 6: CLICK: (512, 822)\nstep 7: CLICK: (752, 825)\nstep 8: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: TYPE: Anemone\nC: COMPLETE\nD: SCROLL: UP\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_198_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_198_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_198_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_198_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_198_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_198_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_198_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_198_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_198_8.png" + ], + "output": "D" + }, + { + "task": "gui_next_action_prediction", + "visual_input_component": "GUI image", + "source": "GUI-Odyssey", + "options": "A: CLICK: (925, 919)\nB: CLICK: (351, 134)\nC: PRESS_HOME\nD: COMPLETE\n", + "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (133, 212)\nstep 2: CLICK: (863, 881)\nstep 3: CLICK: (316, 453)\nstep 4: TYPE: do yoga in the morning\nstep 5: CLICK: (502, 331)\nstep 6: PRESS_HOME\nstep 7: CLICK: (136, 360)\nstep 8: CLICK: (224, 371)\nI want to Locate a beginner Yoga workout video on Likee and set a reminder in Things to do it tomorrow morning. Which action should I do next?", + "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate a beginner Yoga workout video on Likee and set a reminder in Things to do it tomorrow morning.\nThe historical actions are: step 1: CLICK: (133, 212)\nstep 2: CLICK: (863, 881)\nstep 3: CLICK: (316, 453)\nstep 4: TYPE: do yoga in the morning\nstep 5: CLICK: (502, 331)\nstep 6: PRESS_HOME\nstep 7: CLICK: (136, 360)\nstep 8: CLICK: (224, 371)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (925, 919)\nB: CLICK: (351, 134)\nC: PRESS_HOME\nD: COMPLETE\n", + "input_image_path": [ + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_199_0.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_199_1.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_199_2.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_199_3.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_199_4.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_199_5.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_199_6.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_199_7.png", + "../MMIU-Benchmark/gui_next_action_prediction/gui_next_action_prediction_199_8.png" + ], + 
"output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [3, 2, 1, 0]\nD: [1, 3, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. - Three potatoes- One onion- Three tomatoes- Three chilies- Two pounds of basmati rice (NOTE: other types of rice not recommended)- One whole chicken- One tomato paste- One tahini - One lemon. - One tablespoon of salt- Three pay leaves- Three cloves- Three teaspoons of cumin- One tablespoon of chili powder- Two teaspoon of grounded Coriander- One-tablespoon ground ginger- Five teaspoon of cayenne paper. - Two teaspoons of black paper- Two Magie cubes for better taste.. - Wash all the ingredients with water and salt to take out the dust. Clean the chicken by adding rough sea salt while scratching it to eliminate the grass.- Cut the potatoes into cubes of medium sizes. Cut the onions in very small sizes to save you time when cooking. - Cut the three tomatoes in relatively large cubes. Cut two of the green chilies into very small pieces and leave the third for the representation. - Place the two-pound rice in a big bowl and wash it with cold water by shuffling the rice. Leave the rice aside to absorb the water. - Cut the chicken in quarters and take out the skin. Add species to the chicken for flavor.. - Turn up heat to medium temperature, add have cup of vegetable oil in a deep large pot. Stir oil till it worms up- Add onions cubes into pot and stir fry till they become golden. Add chicken flavored bouillon. - Place chicken slices into pot and stir frequently until they are half-cooked and place them on a plate to cool down on the side.. 
- Put potatoes cubes into the pot and add boiling water until potatoes cubes are completely submerged.- Close the pot with lid and let boil until potatoes are cooked for half an hour. - Put chicken back into the pot after potatoes are cooked; add tomatoes slices, tomato paste into the pot to be cooked with potatoes.- Add bay leaf (3) into pot.- Add 2 chilies strips into pot. - Add seasoning (coriander, chills, grounded ginger, 4 cloves)- Pour grain into pot.- Add water until rice is submerged at a depth approximately 1 inch.. - Add 5 tablespoons of tahini into bowl.- Squeeze 3 lemons into the same bowl.- Add sea salt and black peppers and some paprika (1 teaspoon each) into the bowl.- Add hot boiling water gradually into the bowl while stirring till it become smooth.- Place tahini into freezer while preparing kabsa for best taste and result.- Serve cold tahini sauce as a side with the kabsa.. -Transfer rice to plate with chicken on top. Serve with tahini sauce.. bon appetit\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [3, 2, 1, 0]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_0_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_0_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_0_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_0_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [3, 0, 2, 1]\nD: [1, 3, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Cake Ingredients: 1 cup butter-softened 1 1/2\u00a0cups sugar 4 large eggs 3 cups all-purpose white flour 3 tsp. baking powder 1 tsp. salt 1 cup milk 2 tsp. 
vanilla extract Paper baking cupcake liners Vegetable cooking sprayYield: 24 cupcakes or 2 dozenDirections: 1. Preheat oven to 350F. Prepare cupcake tins with liners, set aside. 2. Beat butter and sugar at medium speed with an electric mixer until creamy and smooth. Then add eggs, 1 at a time, mixing until well blended after each egg. 3. Combine flour, baking powder, and salt together in a small bowl. Add to butter mixture alternating with milk. You should begin and end with the flour mixture. Mix at low speed until bleneded. Add in vanilla extract. 4. Spoon cake mix into cups, filling 2/3 full. 5. Bake at 350 for 12-15 minutes or until toothpick inserted comes out clean. 6. Cool in pans on wire rack for 10 minutes, remove cupcakes from pans and set on wire racks to completely cool.. Filling Ingredients: 1 8oz cream cheese-softened 1/3 cup powdered sugar 2-3 Tbsp. coffee liqueur(Kahlua) or 1 Tbsp. coffee extract 1/8 tsp. saltYield 2 CupsDirections: 1. Combine all ingredients in a medium bowl, mixing until well blended. Store any remaining filling in container in refrigerator-up to 2 weeks.. Once cupcakes are completely cooled, cut tops off of the cupcakes using a serrated knife. Then spread 1 Tbsp. of Tiramisu Filling on the bottom part of the cupcake, gently place the top back on. . Frosting Ingredients: 1/2 cup butter-softened 1 8oz cream cheese-softened 2 16oz packages powdered sugar 1/4 tsp. saltYield 5 cupsDirections: 1. Beat butter and cream cheese at medium speed until creamy and smooth. 2.\u00a0Gradually add in the powdered sugar and salt, mixing at low speed. Beat at high speed for 2 minutes until creamy and thick. 3. Frost each cupcake by using a spatula, knife or piping bag and tip. . For the finishing touch dust/sprinkle with Hersheys Cocoa Powder. . After all your hard work, you can now enjoy your Tiramisu Cupcakes! 
Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [3, 0, 2, 1]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_1_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_1_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_1_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_1_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 2, 3, 1]\nD: [2, 0, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This recipe makes 2 dozen small cookies.Butter - 1/2 CupBrown Sugar - 1/2 CupAll Purpose Flour - 3/4 CupMilk - 1 1/2 TablespoonChocolate Chunks - 1/4 Cup or moreVanilla Extract - 1 TeaspoonSea Salt - 1/2 Teaspoon + extra for sprinklingYou could use chocolate chips instead of chunks. For the chunks I just chopped up a bar of dark chocolate. Any chocolate would work, but i recommend not to use chocolate that has a cocoa solid content higher than 65%. These would be amazing with hazelnuts in them too!. 1. Place the butter in a pan and melt it over medium-high heat. Cook the butter till it becomes golden brown in colour. Immediately take off the heat. Let cool till it solidifies. The butter is ready to use once it solidifies.. Cream the brown butter till it is light and pale. Add the sugar and cream it till the mixture is light and creamy and the sugar has mostly dissolved. Mix in vanilla essence, 1/2 teaspoon salt and the milk. Combine well.Add the flour and 3/4 of the chocolate chunks and mix this till it comes together in a smooth ball. Refrigerate for 15 minutes.. Take 1/2 tablespoon of the dough and roll into balls. 
Place onto baking paper and flatten them lightly with the back of a spoon. Push the remaining chocolate chunks into the dough and sprinkle with the sea salt.. Bake these cookies in an oven that has been preheated to 350F (180C) for 15-20 minutes until the edges are light golden. Cool them slightly before moving them to a cooling rack and devouring them. Nom nom!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 2, 3, 1]\nD: [2, 0, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_2_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_2_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_2_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_2_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 1, 2, 3]\nD: [2, 1, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Here's the basic recipe.\u00a0 Start by making this:\n(this is essentially a double batch...it will yield about 70-75 cookies...depending on variety)\nYou will bake 375* F for 8-10 minutes...so go ahead and preheat the oven!\n3 sticks of Butter (1 1/2 Cups)\n1 1/2 Cup Brown Sugar\n1 Cup granulated Sugar\nMix and cream\nThen add:\n3 eggs\n2 tsp Vanilla\nmix together\nThen add:\n1 tsp Baking Soda\n1 tsp Salt\n2 Cups Oats\n4 Cups Flour\nMix together. Here's what we're going to do.\nAdd 2 full bags of chocolate chips.\nI used the top 2...dark chocolate and mini's...the melts were used later.\nThese are loaded.\u00a0 But essentially it is a double batch...so 2 bags is just right!. 
Split up the dough into 5 bowls.\nEach bowl will make a different variety of chocolate Overlord cookies!\n(yes, these aren't just overload cookies...they are Overlord Cookies). First:CHOCOLATE CHIP COOKIES!\nRoll out balls of dough about the size of a ping pong ball.\nWe did 15 per tray. Tray is lightly greased...but I don't think it needs it.\nBake 8-10 minutes at 375 F.\u00a0 I do 8 minutes\nThen remove them and allow them at least 5 minutes before touching them!\nThey need to completely finish cooking...they will be gooey until they cool.\n(don't judge my pans...if you know how to clean them up perfectly...\ncome over and clean them, I will give you cookies!). Next we add some fun!. SMORE'S COOKIES!\nMake a tray of regular cookies.\u00a0 Bake 8 minutes\nPull out of oven and while gooey, place 3 marshmallows on\ntop with one baking melt chocolate disk for looks!\nThen pop them under the BROILER for just a minute or\ntwo until the marshmallows are toasted!\nGolden Perfection!. COOKIES AND CREAM\nStart with your cookie dough and oreo cookies...\nwrap an oreo completely in a \"thin\" layer of cookie dough, covering it completely!\nThese turn out quite large!\u00a0 We fit 8 on one pan.\nThey bake up perfectly with all that oreo goodness inside!\nThese were way better and bigger than I expected!. SWEET AND SALTY\nTake the Chocolate Chip cookie dough and add broken up pretzel sticks to it!\nMakes a sweet and salty awesome flavor!\nRoll out and bake the same as the regular cookies!. TURTLES\nBake a batch of regular cookies, like the smore's ones.\nPull out after 8-10 minutes and lightly press a pecan or two on top.\nThen drizzle with caramel topping!\nLet cool at least 5 minutes before plating!. 
Then plate up all your gourmet cookies!\nAdd some little name sticks so your guests know what they are getting into!\nOkay, so yes...you did the math right.\n15 cookies of each variety except the Cookies 'n Cream...only 8 of them\nGrand total: 68 cookies!\nAwesome spread for 1 simple batch of cookies!\nIn a blind taste test...the 8 year old and 10 year old loved the\nCookies and Cream the best!\nFollowed closely by the Smore's!\u00a0 :). Best part about these cookies is they FREEZE!\nThe dough freezes, the cookies freeze...you don't have to eat them all in one night!\u00a0 And they taste good frozen!\nNow you can have a party spread with only the time spent making a batch of cookies!!!\nThanks for viewing, which one do you think you'd like the most???\nVote for me in the Cookie Contest...I'll make you some cookies!\u00a0 :)\nCheck out my blog for more silliness!\u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 1, 2, 3]\nD: [2, 1, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_3_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_3_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_3_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_3_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [1, 2, 3, 0]\nD: [3, 1, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
-2 TOMATILLOS\n-4 LARGE RED TOMATOES\n-1 LARGE ONION\n-2 LB BEEF ROAST ( I USE CHUCK ROAST )\n-1 BIG BAG DEHYDRATED RED PEPPERS (THIS CAN BE ANY DEHYDRATED PEPPERS YOU PREFER, I LIKE THE SMALL RED ONES, SPICY!)\n-10 FLOUR OR CORN TORTILLAS\n-1 BUNCH OF CILANTRO\n-24OZ WHITE CHEESE OF YOUR CHOICE ( I USE BABY ANEJO ENCHILADO MEXICAN STYLE FARMER'S CHEESE)\n-2 TBS GARLIC SALT\n-2 TBS OIL\n-2 TBS CUMIN. SIMMER THE BEEF ON LOW/MEDIUM HEAT FOR 3 HRS.\nADD A HANDFUL OF THE RED PEPPERS ONE HOUR IN.\nPOUR AS MANY OF THE REMAINING PEPPERS AS YOU CAN HANDLE INTO A SEPERATE PAN OF BOILING WATER AND BOIL UNTIL SOFT. (ABOUT 30 MINS)THE AMOUNT OF PEPPERS WILL DETERMINE HOW SPICY THE SALSA IS. I USE ABOUT 6 BOILED SMALL RED PEPPERS.\nREMOVE THE BEEF WHEN FINISHED AND PULL APART WITH A FORK.\nSTIR IN CHOPPED ONION.. TO A BLENDER, ADD TOMATILLOS, TOMATOES, ONION, CILANTRO, CUMIN, AND A LITTLE GARLIC SALT. ADD BOILED RED PEPPERS AND LIQUIFY.\nPOUR SALSA INTO A PAN AND SIMMER FOR 15-20 MINUTES.\n. HEAT OIL IN A PAN, AND ADD TORTILLAS ONE BY ONE FOR ABOUT 10 SECONDS EACH SIDE.\nDO THIS UNTIL ALL TORTILLAS ARE FRIED, A LITTLE CRISPY, LIGHT BROWN.\n. POUR ABOUT 1/4 CUP OF THE SALSA INTO THE BOTTOM OF A BAKING PAN, SET ASIDE.\nDIP EACH FRIED TORTILLA INTO THE SIMMERING SALSA AND MOISTEN EACH SIDE CAREFULLY. 
PLACE ON A SEPERATE PLATE.\nADD BEEF AND CHEESE TO MOISTENED TORTILLA AND ROLL UP.\nARRANGE IN BAKING DISH.\nTOP THE ENCHILADAS WITH REMAINING SALSA AND SPRINKLE WITH CHEESE.\nBAKE 350 FOR 20 TO 30 MINUTES, UNTIL BUBBLY AND CHEESE IS MELTED.\nEAT AND TRY NOT TO BURN YOUR TONGUE TASTELESS!!!\n\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [1, 2, 3, 0]\nD: [3, 1, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_4_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_4_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_4_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_4_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [2, 1, 0, 3]\nD: [0, 1, 3, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \nAbout 3 dozen Cherry Tomatoes\n1 pound Bacon\n1 small/medium Onion\n1 stalk Celery\n20 leaves of Spinach\n3/4 cup Miracle Whip (or what ever salad dressing you prefer)\n1 teaspoon Olive Oil\n1/2 teaspoon fresh Parsley\n3 tablespoons Grated Parmesan Cheese. 
First, start out by cooking the bacon.\u00a0 After bacon is cooked, put it on a plate covered with paper towels.\u00a0 Take some extra paper towel and pat the top of bacon to try to get off as much grease as you can.\u00a0 Then, chop up the bacon, put into a bowl and set aside.\nNext, dice the onion and celery.\u00a0 Put into a small skillet and saute' in 1 tablespoon of olive oil for about 10 minutes on medium heat.\nThen, dice the leaves of spinach.\nFinally, to prepare the tomatoes the first step is to cut off the very top.\u00a0 Be sure not to cut off too much.\u00a0 then take a small spoon and carefully take out the seeds and insides.\u00a0 Discard them when done.\u00a0 Once that is done then place the tomato upside down on some paper towel to let them drain.. \nFirst, add the sauteed celery and onions in the bowl with bacon and mix together.\u00a0 Then, add the parsley, parmesan cheese and spinach leaves.\u00a0 Mix well.\u00a0 Next, add the mayo and mix it all together.. The final step is to stuff the tomatoes.\u00a0 All you do is take a half spoonful of bacon mixture and spoon into tomatoes.\u00a0 Careful not to over stuff the tomatoes.\u00a0 Then arrange them on a plate to display for company, as appetizers, or with a favorite side.\nPlace in refrigerator for 1 to 2 hours before eating.\nHope you enjoy them.\u00a0 They are so yummy.\u00a0 Thank you.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [2, 1, 0, 3]\nD: [0, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_5_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_5_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_5_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_5_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [0, 3, 1, 2]\nD: [2, 0, 1, 3]", 
+ "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. The biscuit (cookie):2 2/3 cup flour1 cup sugar1/2 teaspoon salt1/4 teaspoon baking powder1 cup butter, room temperature1 egg yolk2 teaspoons vanilla extractThe filling:1 cup shortening5 cups confectioner\u2019s sugar1 teaspoon vanilla extract1 packet unflavored gelatin1/4 cup water1 jar red jam (any flavor)Special supplies:cookie stampblack food coloringedible gold luster dustvodkapaintbrushbutcher\u2019s/parchment paperblack sealing waxwax stamp. Using a beater attachment, blend dry ingredients in a mixer (the dough will become too stiff for a whisk; believe me I tried, as you can see in my image).Then, mix in butter until ingredients are crumbly.Add egg yolk and vanilla.When stiff dough forms, gradually add black food coloring.Once fully blended, divide dough in half, wrapping both halves in plastic wrap, and chill for at least one hour.Tip: If you would prefer not to use an artificial food coloring due to allergies or preference, you can achieve a similar darkness with cocoa noir, which will add a delicious chocolaty taste. Simply add your preferred amount to the dough while it\u2019s still in the mixer. If the dough becomes too dry in doing so, add milk half a teaspoon at a time until cocoa is evenly blended but dough is still stiff.. On a floured surface, roll out one of your two balls of chilled dough to no thicker than 1/4 inch. 
The thinner the better; I rolled mine to as close as 1/8 inch as I dared.At this point, I used my cookie stamp to impress my skull and crossbones design all over the dough.To actually cut the dough, I used the lightly floured rim of a jam jar.Transfer cut biscuits to a cookie sheet lined in parchment, and bake in a 350 degree (F) preheated oven for 8\u201310 minutes.Let cool for two minutes on cookie sheet and then transfer to cooling rack or parchment to cool completely.Tip: Try keeping as clean a surface as possible on the dough while rolling. White flour will sometimes still show up on the black cookie even after baking.. Repeat the previous step with the second chilled ball of dough, though this time, you don\u2019t need to use the stamp, as these will be the bottoms of the sandwiches. Simply roll the dough and cut the circles with the same jam jar as before.Transfer cut biscuits to a cookie sheet lined in parchment, and bake in a 350 degree (F) preheated oven for 8\u201310 minutes.Let cool for two minutes on cookie sheet and then transfer to cooling rack or parchment to cool completely.. This step can be skipped if you decide to not stamp the biscuits or if you don\u2019t want to add the gold color to the stamped relief.After the biscuits have cooled completely, prepare the gold luster dust for painting onto the stamped design by adding a very small amount (I used the end of a butter knife to dip out perhaps 1/8 teaspoon from its vial) into a separate dish.Gently add vodka one drop at a time to make a paint (to control the addition of the vodka, I poured some into a shot glass, then dipped a paintbrush into the shot glass to retrieve a drop of liquid at a time to add to the luster dust).Once you have made a paint with the gold luster dust and vodka, you can use the paintbrush to gild the design on the biscuits. 
You may need to add a few drops of vodka to the luster dust while you work, as the vodka will evaporate quickly.Once you have finished gilding all the biscuits, allow them to dry completely while you complete the following step.Tip: You can use any clear grain alcohol for this step, but it must be alcohol\u2014not water. The alcohol acts as a carrying medium for the luster dust and will evaporate quickly once you\u2019ve painted the biscuit, ensuring that the biscuits do not become soggy.Important: The gold you\u2019ve painted onto the biscuits will be somewhat delicate, so in the following steps, handle the painted biscuits carefully so as not to dull or rub the gold off completely.. Dissolve unflavored gelatin in cold water.Heat in a cup until clear (or as clear as you can get it), then let cool.Using a beater attachment (the filling will be too stiff for a whisk), cream together shortening, confectioner\u2019s sugar, and vanilla extract.Add the gelatin, and beat until smooth. (This may take up to ten minutes.)Once filling is smooth, spoon it into a pastry bag (I used a resealable plastic bag that I cut the corner from).Add jam to a separate pastry bag (or resealable plastic bag).. You will be building the sandwich from the bottom up. With a bottom biscuit (the undecorated ones) in one hand and the pastry bag of filling in your other, apply the filling in a generous ring to the biscuit, leaving the center free of filling.Then, exchange the filling for the jam and fill the center of the ring with the jam. This will be the \u201cbloody\u201d surprise.For each biscuit bottom you do this to, gently press on one of the decorated top biscuits. Set aside and allow to set 30\u201460 minutes before wrapping.. The Victorians often printed obituaries, comforting poems, and biblical verses onto the wrappers of their burial biscuits in order to comfort those taking refreshment in them. 
While I did not do this to mine, the sky really is the limit on the possibilities this custom affords. Perhaps you could include spooky fortunes or the fictional death dates of your guests brave enough to try these elegant sandwiches. In any case, wrapping the biscuit is relatively straightforward.I created a simple envelope from paper (I recommend butcher\u2019s paper or parchment because regular paper will likely show oil stains after some time from contact with the butter in the biscuits).Secure the lip of your envelope by melting the black sealing wax onto its edge.Then gently impress the wax stamp for approximately five seconds or until the wax is cool.Gently remove the stamp and put the wrapped sandwich aside. Repeat with all the sandwiches.Tip: The paper envelope needn\u2019t be any fancier than you want it to be. You could even tear the edges to give the presentation a rougher look. In order to make my envelope, I used my cookie stamp in place of a real sandwich to create a template envelope, which I then used as a template to trace and cut the rest of the paper envelopes. This way I didn\u2019t ruin a sandwich in the process.. These burial biscuits are perfect for setting out at gatherings, as they needn\u2019t be refrigerated to stay together. However, if you make these ahead of time, I suggest keeping them in an airtight container in the fridge until you need to set up for the gathering. If you have any leftover, they will keep for 2\u20133 days this way in the fridge.Because these burial biscuits were at one time solely provided at funerals, don\u2019t be afraid to give them prominence amid your other snacks. I staged mine with a photograph of the deceased to honor his memory, a memorial cabinet card, and other somber, spooky d\u00e9cor.Best of luck! 
\u2013Brandon\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [0, 3, 1, 2]\nD: [2, 0, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_6_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_6_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_6_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_6_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [1, 2, 3, 0]\nD: [2, 0, 1, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Cupcakes:2 1/2 cups of all purpose flour 2 teaspoons of baking powder 1/2 teaspoon of baking soda 1/2 cup of softened butter 3 eggs 1/2 cup of milk 1/2 teaspoon of salt 1 cup of sugar 1 teaspoon of vanilla extract 1/2 cup of vegetable oilFrosting:3 tablespoons of softened butter 1 egg white 2 cups of powdered sugar 1/2 teaspoon of vanilla extract 1/2 tablespoon of waterGummy Decorations:A little bit of frosting (set aside from the frosting recipe) Graham crackers Green and orange fruit slices Green and orange gumdrops Peach rings ToothpicksEquipment:Cooling rack Mixing bowls of all sizes A non serrated knife or clean food safe scissors Hand or stationary mixer Cupcake pan Cupcake liners Rubber spatula Storage container Plastic sandwich bag Plate Hot pads. Preparation:Preheat the oven to 350 degrees. Set out cupcake pans and line them with the cupcake liners. Set aside the readied pan for later.. Dry Ingredients:Stir the flour, baking powder, baking soda, and salt together in a medium bowl.Wet Ingredients:Mix together the milk, vegetable oil, and vanilla extract in a small bowl. Set aside for later use. 
In a separate large bowl, use the mixer on a low to medium speed to beat the butter and sugar together. Tip: It should look light and fluffy.Mix in the eggs one at a time in the same bowl.Tip: Make sure to fully incorporate each egg before adding the next.Combining the Ingredients:Use the same large mixing bowl from before to alternate adding small parts of the dry mixture and the wet mixture that was previously combined. Tip: Again make sure you full incorporate each addition to the batter before you add more. Tip: The consistency of the batter will vary between thick and thin as you alternate the dry and wet ingredients respectively.. Pour the Batter and Bake:Scrape the sides of the bowl with the rubber spatula to make sure everything is mixed well together. Pour or scoop the batter into the lined cupcake pans. Tip: Fill each cupcake until it is no more than 3/4 full. Start with half full then go back and fill them again until you run out of batter making sure they are evenly filled.Place Cupcakes in the center rack of the oven Warning: Chance of burns when putting cupcakes in and out of the oven, use oven mitts.Bake for 15 to 20 minutes (depending on your oven). Tip: A good way to check if cupcakes are done is to stick a toothpick in the middle and then take it out to see if it comes away clean then its good, if it has batter on it then they are not done.Place cupcakes aside on a cooling rack to cool before decorating. . While your cupcakes are baking and cooling you can make the frosting.Frosting:Get a medium sized bowl. Put the sugar, butter, water, vanilla, and egg white in the bowl. Slowly mix together the ingredients until the powdered sugar doesn't fly everywhere. Mix the ingredients at higher speeds until it is light and fluffy. Set aside a small amount of frosting to act as glue for the eyes of the animals.Color and Texture:Put graham crackers into a bag and crush them up until they are at a sand like consistency. 
Tip: Start off with around three graham and add more if needed.Pour the crushed graham crackers onto a plate so they are ready for the decorating steps. Set aside some frosting to dye blue to have an ocean colored frosting to go on the cupcakes.. Sand:Frost the top of the cupcake. Roll the frosted cupcake in the plate of crushed graham crackers. Turtles:To make one turtle you need a peach ring, two gumdrops, and a fruit slice.Put one of the gumdrops into the center of the peach ring for the body of the turtle. Place the other gumdrop on the outside of the peach ring for the head of the turtle. Cut the fruit slice in half. Take one half and cut along the middle the long way (see image above).Cut those two new half's in half again but this time at an angle along the short way (see image above).Cut three smaller triangles one equilateral and two isosceles these will make the tail and the two back fins.Place the pieces of the turtle onto the frosted cupcake.Crabs:To make one crab you need two gumdrops, one fruit slice, a half of peach ring, and two toothpicks.Cut the tooth picks in half so you have four pieces of toothpicks. Stick the two gumdrops on two of the toothpicks, one gumdrop per toothpick piece. These will be the eyes of the crab. Stick the other ends of those same toothpicks into the top of the fruit slice (the flat side). Place two drops of frosting on the gumdrops, one on each eye.Stick the candy eyes onto the now frosted gumdrops.Cut a peach ring in half.Cut the half of peach ring in half again making two claws for the crab. Stick the two claws onto the ends of the remaining two toothpicks. Stick the toothpicks into the sides of the fruit slice. Place the curved side of the fruit slice on to the frosted cupcake.Tip: If the crab is having trouble staying on the cupcake then use another toothpick in the bottom to stick into the cupcake for stability.. 
Now that you're finished, you should have fun and cute beach/ocean themed cupcakes like the ones above for any event! Have fun sharing this delicious tasting desert while impressing your friends and family.Optional:To save on time try using store bought cupcake box mix and frosting, or invite others to bake with you!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [1, 2, 3, 0]\nD: [2, 0, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_7_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_7_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_7_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_7_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [0, 3, 1, 2]\nD: [2, 1, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients & Tools Martini glass Malibu (1.0 oz) Cake vodka (1.0 oz) Blue curacao (Just a splash!) Slice of Lemon Ice Cocktail shaker Mixing Mix your cake vodka, Malibu, and splash of blue curacao into the cocktail shaker with ice. Shake and strain into your martini glass and garnish with a slice of lemon.Cutting Your Lemon To make your presentation match the image, cut a thin slice of lemon in half and then place the blade of your knife between the knife and the pulp of the fruit, separating the fruit from the rind just barely. Do this up to the halfway point and then wedge your new lemon slice onto the lip of the cocktail glass.\n . 
Ingredients and Tools Jaegermeister (1.0 oz) Black Spiced Rum (2 oz) Ginger ale or ginger beer (top off remaining glass) Ice Lime wedge Low ball glass/tumbler Mixing Pour your Jaeger and rum into a low ball glass filled with ice, stir, then top off with ginger ale/ginger beer. Garnish with a lime wedge.\n . Ingredients and Tools White rum (1.5 oz) Peach Schnapps (0.5 oz) Pink Lemonade (top off remainder of glass) Ice Orange slice Maraschino Cherry High ball glass Tooth pick Mixing Pour your rum, pink lemonade, and peach schnapps into a high ball glass with ice and stir. Garnish with a pinwheel. Best enjoyed through a straw.What is a pinwheel?! It's that thing in the picture, silly! Just take your tooth pick, one maraschino cherry, and an orange slice and follow along! Wrap your orange slice around the maraschino cherry as far as you can. Place your tooth pick through your orange wrapped cherry so that it looks like the picture! Simple, no? There are many variations on the pinwheel. This is just the one I was taught.. Ingredients and Tools Black cherry juice (3.5 oz) Gin (1.0 oz) Blackberry Brandy (0.5 oz) Tonic water (2.5 oz) Cherry pop rocks Conical flask (250 ml) Prepping your Flask -Wet the lip of your flask thoroughly with tap water. -Pour your pop rocks onto a small plate -Rub your wet flask into the pop rocks until the rim is evenly coated.Mixing First line the rim of your flask with pop rocks, then pour in all of your liquid ingredients. Give it a good stir, drink, and save Townsville!I know that tonic water glows under black light, but I'm not sure what it would look like with all of these ingredients mixed in. 
If anyone has the means to take a picture of Chemical X under black light, please post in the comments!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [0, 3, 1, 2]\nD: [2, 1, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_8_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_8_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_8_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_8_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 3, 2]\nD: [1, 2, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. First things first: Go Wash Your Hands! Cleanliness is always appreciated when cooking.Ingredients for the Cake\n1 1/2 Cups Butter (3 sticks) - room temperature\n1 Cup White Sugar\n1 Cup Brown Sugar\n3 Large Eggs - room temperature\n2 1/2 Cups Self Rising Flour\n4 Medium (or 3 large) Apples\n1 Cup Nuts (chopped) - walnuts or pecans are my favorite\n2 teaspoons Vanilla Extract\n2 teaspoons Cinnamon\n3/4 teaspoon Ground Cloves\n3/4 teaspoon Ground NutmegIngredients for the Caramel Glaze:\n1/2 Cup Butter (1 stick)\n2 teaspoons milk\n1/2 Cup Brown Sugar. You are going to need:\nA mixer, or a mixing bowl and a strong arm\nMeasuring Spoons\nMeasuring Cups\nKnife (not shown)\nPeeler (I love mine since it cores, peels, and slices all at once)\nCake Pan of some sort\nSome spray oil\nZiploc bag (or something like it)\nOptional\nA friend to do the dishes. Go ahead and preheat your oven to 350 degrees Fahrenheit .\nPeel, core, and cut your apples into small pieces. Place those in the Ziploc bag and set them aside for now.\nIf your nuts aren't chopped, chop them up. 
Place those in the Ziploc bag too.\nIn your mixer, cream 1 1/2 Cups of room temperature Butter (3 sticks), 1 Cup of white granulated Sugar, and 1 Cup of Brown Sugar.\nFirst the butter and sugar will lump up into a ball. Keep mixing until the butter and sugar are completely incorporated and usually they'll stick to the bowl. The finished product should be like grainy peanut butter.. Next you will need to add your spices and eggs to the mixer.\nAdd 3 large eggs, 2 tsp of vanilla, 2 tsp of cinnamon, 3/4 tsp ground cloves, 3/4 tsp nutmeg.Mix Mix Mix\nStart mixing slowly so that the powdered spices don't just fly out. Speed it up to get everything incorporated.. Get your bag of apples and nuts out.\nPut a little flour in the bag, about 1/8 of a cup or just enough to lightly coat everything.\nThe flour will help keep the apples and nuts from sinking into the batter while it cooks.\nZip the bag up and\nSHAKE SHAKE SHAKE\nSHAKE SHAKE SHAKE\nShake your baggy\nPut the floured apples and nuts into the mixer and give that a good mix to get everything incorporated.. Get your measuring cups and your self rising flour out.\nAdd 1/2 to 1 Cup of flour at a time and mix it into the batter.\nKeep adding until you have a total of 2 1/2 Cups of Self Rising Flour added to the batter.\nTurn your mixer up and mix well.\nYour batter will change from dark lumpy goop to light, slightly fluffy goop. Your batter will also be rather thick, so don't get flustered by it. \u00a0It will be easy to spoon out, not runny like a typical yellow cake mix.. If you haven't already, get your cake pan out.\nSpray some oil into the pan and dust it with flour. You could alternatively use shortening or even more butter.\nShake out the excess flour since you don't want flour lumps on the outside of your cake.\nSpoon your cake batter into the pan and spread it evenly.\nMake sure you also bang your cake pan down a few times to release any trapped air bubbles.. 
Put your cake pan on the middle rack of the oven and close the oven door.\nYou'll need to cook this cake for approximately 1 hour and 10 minutes, so set your timer.\nI like to put this particular cake pan on a cookie sheet so that the bottom doesn't overcook/burn.. While you wait for your cake to bake find that optional friend.\nAsk him or her if he or she would like a big piece of apple cake.\nIf he or she says yes, tell them they'll have to work for it by washing all those dishes you just made.\nIf you aren't lucky and/or don't have an optional friend to wash dishes, then you just found something to do for the next hour.DO NOT skip ahead and make the Caramel Glaze while the cake cooks. It sets quickly and by the time the cake is done all you'll have is a pot of really thick and almost unusable caramel goop. It will taste good but you'll ruin your dinner and you'll have to make it again anyways.Other things to read about:\nThis cake recipe can be made with all white granulated sugar. I like the added taste of the brown sugar.\nThis cake recipe can be made with cooking oil instead of butter. I like the taste of butter.\nThe Caramel Glaze still needs to be made with butter and brown sugar.Substitutions:\nThe nuts are optional, they just taste good.\nIf you do not have brown sugar you can make your own:\nTo make one cup of packed light brown sugar\u00a0mix together 1 cup of white granulated sugar and 1 1/2 Tablespoons of molasses. For dark brown sugar use up to 4 Tablespoons of molasses.. Your timer should be beeping/ringing/whatever it does and that means you need to check your cake.\nOpen up your oven and poke the cake with a broom straw, or really long toothpick, or if you're lucky you have one of those metal cake testers. I personally used an ice pick.\nThere should be no cake on the pick when you take it out. Don't be alarmed if you poke an apple. The top of the cake should be browned and cracked. 
The edges of the cake should have pulled away from the sides of the cake pan.\nIf your cake does not meet these criteria, use your judgement and put the cake back carefully. Watch it closely until you deem it done.\nI like to turn off the oven and leave the cake in for another 10 minutes so that a nice crust forms.\nOnce the time is up take your cake out and put it on a cooling rack.. While your cake is resting get your pan, a spoon of some sort, 1 stick of butter, 2 teaspoons of milk, and 1/2 Cup of packed brown sugar.\nPut the ingredients in the pan, turn your burner on medium high, and begin to stir.\nKeep stirring.\nBring the mixture to a low rolling boil and keep stirring for about 2-3 minutes. You want the sugar crystals to completely dissolve and the Caramel Glaze to slightly thicken.\nTake the glaze off of the heat and go back to your cake.. For the first glazing, you will want to poke holes in your cake (This is another reason I used the ice pick). This is optional but highly recommended.\nSlather some of the hot Caramel Glaze onto the top of the hot cake and let it sink into the holes.\nYou can also paint the sides of the cake at this time.\nDon't use all of the glaze up though. Save some of the glaze and set it aside to cool and thicken.. After a few minutes the rest of the Caramel Glaze should be cooler and thicker.\nApply the glaze to the top of the cake.Congratulations\nYou've just finished making a delicious Fresh Apple Cake with Caramel Glaze.\nI find that this cake is best after it has cooled and settled in the\u00a0refrigerator\u00a0overnight. It also makes it easier to cut.\nIf you can't wait that long, at least wait an hour so that you don't burn your mouth on the molten lava apples inside. 
Trust me, not being able to taste anything for a week isn't worth the one bite.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 3, 2]\nD: [1, 2, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_9_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_9_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_9_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_9_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 0, 3]\nD: [0, 1, 3, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Bacon\nPotatoes\nSea Salt - Optional\nKnife/Cutting Board\nFrying Pan\nPaper Towels\nTongs/Other Standard Kitchen Supplies. Cook up the bacon. \u00a0You won't need it, so discard it (by which, of course, I mean eat it!) \u00a0The idea here is to keep as much grease in the pan as possible.. You're looking for a slice that's a bit thicker than a sheet of paper. \u00a0Not the thinnest slice you can make, but close to it. \u00a0Peel them first if that's your preference, and by all means use your mandoline slicer if you have one. \u00a0One pound of bacon made more than enough grease to fry 3 potatoes.. Place one layer of potatoes in the pan and fry them. \u00a0You want them lightly browned. \u00a0The darker they got the more likely they were to go from tasting bacony to tasting burned.. Drain them on a paper towel, and sprinkle lightly with sea salt if you would like. \u00a0Do this while they're fresh from the pan. \u00a0Keep frying and draining, pretty much until you decide to stop. 
\u00a0Serve them in a bowl, or individual small bowls if you're fancy like that!\nBonus flavor options:\nBacon Onion\n- before frying the potatoes fry up a diced onion in the grease then scoop it out - the onion infuses the bacon grease with flavor\nBacon Parmesan\n- as soon as you remove the chips from the bacon grease sprinkle them with grated parmesan (the dry kind from a round can)\nFlavored Bacon\n- different types of bacon come with different flavor profiles - applewood, hickory, etc. will all give you a different chip flavors\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 0, 3]\nD: [0, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_10_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_10_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_10_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_10_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [3, 1, 0, 2]\nD: [1, 2, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \n For the dough:\n\n\t\t3 tsp of sugar\n\t\t2 packs of yeast or 1 1/2 tbsp\n\t\t1 1/4c warm water\n\t\t3 1/4 cup of bread flour\n\t\t3 tsp of salt\n\t\t4 tbsp Olive oil\nFor the sauce:\nTwo 28oz\u00a0cans of crushed tomatoes\n\t\t2 tbsp of olive oil\n\t\t1 tsp of dried oregano\n\t\t2 tbsp of\u00a0Italian\u00a0seasoning\n\t\t1 tsp of salt\n\t\t1/4 cup of fresh grated parmasan cheese\n\t\t2 cloves of\u00a0roasted\u00a0garlic crushed\nFor the toppings: whatever you would like.\n . 
\n Good crust and sauce are what make the pizza first we will start with the dough since it needs time to proof.\n\t\tDissolve\u00a0the\u00a0sugar\u00a0in the warm water then add the yeast, let it sit for 5 minutes.\n\t\tWhile the yeast is becoming a\u00a0slurry\u00a0mix together your dry\u00a0ingredients\u00a0and form a well in the middle.\n\t\tNow pour in the yeast slurry and mix\u00a0thoroughly, after it is mixed kneed for 5 minutes.\n\t\tNow just form it into a ball set it back into the mixing bowl for proof between 90 and 120 minutes. Make sure to set it in a warm area.\nWhile it is\u00a0proofing\u00a0we will work on the\u00a0roasted\u00a0garlic and pizza sauce.\n . \n To make\u00a0roasted\u00a0garlic you only need two ingredients:\u00a0Olive\u00a0oil and bulbs of garlic.\n\t\tPre-heat the over to 400\u00baF\n\t\tWith a sharp knife slice through about the first 1/2 inch of the garlic\u00a0bulb.\n\t\tNext peel away most of the outer layers of the bulb and trim the outside cloves down about 1/2 an inch as well.\n\t\tNow place the clove in either a foil cupcake liner, a foil bowl, or even a cupcake pan.\n\t\tPour about 2 tbsp of olive oil over the bulb and cover it in foil.\n\t\tNow bake it for about 45\u00a0minutes,\u00a0after 30 minutes remove the foil and bake for another 10-15 minutes.\n\t\tRemove from the oven and let cool, after it has cooled crush the garlic with a fork.. For the sauce, just put all of the sauce ingredients in a large saucepan and let it simmer and boil down while the dough is proofing and you are building the pizza. Just make sure to stir the sauce occasionally so does not burn and stick to the bottom of the saucepan. You are going to want to have a thicker sauce because if the sauce is to watery, it could easily leak through the crust sides or the bottom and just go everywhere when the pizza is cut into.. \n Now that dough has proofed your going to want to remove it from the bowl and knead it for an additional minute. 
Now that dough has been\u00a0kneaded\u00a0again\u00a0you might have noticed the textures changed and its more stretchy.\n\n\t\tDivide the dough ball into two dough balls one little bigger than the other.\n\t\tTake the bigger dough ball and begin stretching and pulling and flattening it out so that it will fit into the 10 inch\u00a0spring form\u00a0pan. Make sure to bring it a little bit up over the sides of the pan so that we will be able to tuck the top crust under the sides of the bottom crust.\n\t\tNow just bake the\u00a0crust in the oven set to 415\u00baF for about 5 minutes. \u00a0\n\t\tWhile the crust is baking begin preparing the ingredients will go inside the pizza pie.\u00a0For mine I used Italian sausage, pepperoni, red onions, and peppers.\n\t\tOnce the crust is done remove from the oven and let cool for about 10 minutes.\n\t\t\u00a0Once it has cooled, begin layering your ingredients. The order I went it was: pepperoni, sausage, sauce, onions, pepperoni, sausage, peppers, sauce, cheese. \u00a0\n\t\tNow were ready to add the top layers of crust. Do the same as you did for the bottom layer but this time just drape it over the cheese and pinch together the two layers of crust.\n\t\tNow just tuck the top player into the sides of the Spring form pan so every thing is sealed in.. 
\n\t\tTo bake the pizza put it in the preheated oven of 415\u00b0F for 45 minutes with foil over the top of the spring form pan.\n\t\tAfter 45 minutes remove foil and bake for an additional 15 minutes.\n\t\tOnce this is done\u00a0baking\u00a0remove from the oven and let sit for 10 minutes.\u00a0\nNow you can remove the outermost layer of the spring form pan, and now it is ready to cut and enjoy.\n \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [3, 1, 0, 2]\nD: [1, 2, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_11_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_11_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_11_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_11_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [0, 1, 3, 2]\nD: [3, 1, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. A Basic Set of Canning Tools with:Wide-Mouth Canning FunnelJar LifterMagnetic Lid LifterBubble Remover/Headspace Tool.* I say this is optional, because I canned for years prior to buying a basic set of canning tools. They are helpful, but not 100% required. . I have used the same brine pretty much since I started making relish. The base is very very versatile and can be flavored with a number of spices or herbs to your own personal taste. In fact, for several years, it was all I used for my pickles. It's based on a 1 cup batch, so it's easy to adjust depending on how much you intend to make. For this, I made a 4 cup batch (which I split between relish and pickled tomatoes, 2 cups each). 
Ingredients: 1 Cup of White Vinegar 1/2 Tablespoon of Granulated Sugar 1/2 Tablespoon of Brown Sugar 1/2 Teaspoon of Pickling Salt The last few years, I've also added the following Spices (based on a 1 cup batch): 1/8 Teaspoon Dill Seed 1/8 Teaspoon Mustard Seed 1/4 Teaspoon Black Pepper Corn 1/8 Teaspoon Celery Seed 1/4 Teaspoon Chopped Garlic Once everything's measured out, combine all of the above ingredients in a medium sized pot. Bring to a rolling boil, and reduce the heat to medium. Strain the brine through a fine mesh strainer and return the liquid to heat. Now the brine is ready for pickling. * You can wrap your spice mix in cheesecloth to avoid having to strain your brine or you can leave it loose. I do both, depending on the batch.. My relish recipe is really very simple and consist of only hot peppers that I have on hand, nothing else. This years batch has Jalapeno's, Serrano's, Thai Chili's, Red Habanero's, and Cayenne (I'm still waiting to see if the jolokias are going to fruit).Take approximately 2-3 cups of your favorite peppers (seeds and all) and in a food processor, pulse until you have a nice evenly chunky, peppery mix. * You can also finely chop the peppers if you don't have a food processor handy.. Hot Water Bath Canning vs Refrigerator Pickling is a tough decisions and should be based on your goals. Either of these methods will render you with a fantastic batch of \"Hellish\" relish. The biggest difference comes down to how much you are making and storage. If you are planning on long term storage, you can follow the directions here, \"Hot Water Bath\" Canning - The Basics. It's a very straight forward method of canning and allows you to store the relish for 1-2 years. It's is a bit more time consuming and does require a large canning pot, but I find it to be well worth it, especially for large batches. 
If you are going with the Refrigerator Pickle method, simply wash the lids and jars thoroughly with hot soapy water, dry, and set them aside until needed. . Now that the peppers are processed (or chopped) and the jars/lids are prepped, it's time to fill them.Using a Wide-Mouth Canning Funnel, spoon the hot pepper mixture into each of the jars, leaving at least a 1/2\" of room at the top. Ladle in the pickling brine, being sure to leave at least a 1/2\" of headspace. Insert the Bubble Remover/Headspace Tool and gently move it around the inside of the jar, to release any bubbles.Finally, wipe the rim of each jar, to make sure they are clean prior to sealing the jars. . Apply the top to the jar and lastly, apply the ring and tighten to seal.If you are going to go with the Refrigerator Pickle Method, You're Done!!!! Allow the jars to cool on the counter and then store in the refrigerator for 1-3 weeks.. If you've decided to go with the \"Hot Water Bath\" method to seal your \"Hellish\" Relish, using the \"Jar Lifters\" place the jars back into in the hot water bath and put the lid back on the canning pot. Process for 15 minutes, then remove and allow to cool on the counter over night (12-24 hours).When you hear the lids start to pop, you'll know you have a solid seal!Store in a cool, dry place until you're ready to taste test and devour!I hope you've enjoyed this Instructable, on how I make \"Hellish\" hot pepper relish!!! 
Happy Pickling!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [0, 1, 3, 2]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_12_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_12_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_12_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_12_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [0, 2, 1, 3]\nD: [0, 3, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Bake your angelfood cake in a loaf pan for easy slicing. Line the pan with parchment for ease of release after baking and cooling. You can use a cake mix or bake from scratch using any recipe. Baking from scratch is quick and easy if you use powdered eggs or liquid egg whites from the store. No need to find another recipe for all those yolks. (I've gone and made 1 Litre of lemon curd in the past, it's a bit much!). When your cake has cooled, Peel off parchment paper. it can now be sliced into even servings. (I like to serve 2 slices per person). Use an angelfood cutter, forks or serrated knife to gently cut the cake.. While your cake is cooling, get the toppings you want to use.I used:70% dark chocolate,Toasted slivered almonds,Fresh raspberries.You can use any variety of cake, chocolate, nuts and berries. You could even add soft cheeses like cream cheese or mascarpone mixed with some vanilla sugar and lemon zest to create something else grand.. Toast your cake slices in a dry griddle or frying pan. Use a Medium Low temperature. The sugars in the cake will burn easily on higher heats. . When the underside starts to brown, it is time to turn the cakes over. 
Put your chocolate on the warmed cake. Cover with a lid to speed the melting, before the other side burns. This should only take a minute. . Transfer the cakes to a plate. Use a knife, or the back of a spoon to spread the now melted chocolate.Top with the chopped nuts and fruit.Serve and enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [0, 2, 1, 3]\nD: [0, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_13_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_13_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_13_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_13_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [0, 1, 2, 3]\nD: [2, 0, 1, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. The first step is the hardest part for me, but the easiest for all of my friends.You should take your parsley and wash it. Than, gently wipe extra water from parsley, and cut it.Take a middle sized bowl and mix the butter and parsley in it.Wrap the butter in a plastic food wrap and put into the freezer for 20-30 minutes.. This step is simple and easy.Take the chicken breast and \"open\" it with the sharp knife the way you will get a rectangular form of chicken.Gently beat your chicken with a food hammer from one side.Take the butter our of the refrigerator and cut it in pieces.Then, put the butter on the chicken breasts.Wrap the breast, so that the butter is inside.Now you are almost done.. Slightly beat eggs with mixer eggs and add salt and pepper.Dip the chicken in the egg mixture.Roll the breasts in the breadcrumbs. Repeat this process twice.. 
In a deep frying pan or saucepan, heat the oil and fry the breasts. Turn it around occasionally, until golden brown colour would appear from all of the sides.The oil should cover the breasts at least to the half.Your breasts are ready! I recommend to serve the breasts with fried or mashed potatoes. I hope you and your family will enjoy this dish. Bon Appetit!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [0, 1, 2, 3]\nD: [2, 0, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_14_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_14_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_14_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_14_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 3, 0]\nD: [2, 0, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \nIngredients:\n\n\t\t1 14 oz. can OR 2 cups beans of your choice (I'm using chickpeas & black beans)\n\t\t1 medium onion, cut into large chunks\n\t\t1/2 cup rolled oats (not instant)\n\t\t1 tbsp spice of your choice (I'm using chili powder in the black beans, cumin in the chickpeas)\n\t\tsalt and pepper to taste\n\t\t1 egg\n\t\toil for cooking (olive or canola)\nHardware:\n\n\t\tlarge pan for frying\n\t\tfood processor\n\t\tspatulas, forks, etc.\n\t\tcutting boards. \n(Get out your food processor - if you have one that's 4+ cups, feel free to dump everything in. If you're working with a little tiny one like me, first mix everything together in a bowl and do it in two batches.)\nOpen your can of beans and drain it - reserving the liquid, just in case. (I've not had to use it, but who knows!) 
Cut your onion into chunks.\nCombine the beans, 1/2 cup of oats, spices, the egg, onion, salt, and pepper in your bowl and mix together.\nThe first picture is chickpeas with cumin, the second picture is black beans with chili powder.. \nOnce you get a reasonable amount of the mixture into your food processor, you'll start to pulse. Pulse a few times, take the top off, scrape down the sides, repeat.\nYou only want to do this until things start to break down - you don't want things to get pureed. It still needs to be slightly chunky and have good body. :)\nAlso - don't add any liquid at all if you can help it. The mixture is wet enough as-is and should mix up on its own... but if, for some odd reason, your food processor starts smoking and screaming, add a tablespoon or so of liquid.\nThe last picture shows you what you should end up with - a good, firm, chunky consistency.\nWhat do do if it gets too thin, or you got too excited while pulsing:\nPut a tablespoon or two of extra oats in your second batch if you have a small food processor, or just empty everything but a small amount of the mixture and add in the oats.. Wet your hands and shake off any excess.\nWet hands are especially important for this bit - otherwise you just get gloppy bean hands, and no one wants that.\nI had the best luck making four patties out of the mixture, but you can make up to six. Divide the mixture into equal parts and roll into balls and then flatten carefully into patties.\nI found it was easiest to do this on a big flexible cutting board that I could carry around with me - I'd just make them into balls and then flatten them while they were on the board.\nDon't make them too thin or they become very hard to pick back up and they'll start to break. And don't press too hard into the surface you're working on either, or they'll stick like crazy!. \nNow the best part!\nHeat a pan over medium heat with your oil of choice. 
Once the oil is nice and hot, coax the patties off your work station (keep in mind you can reshape them a little in the pan if you need to) and into the pan. Don't overcrowd them, though - 2-3 patties is the maximum - if they're touching too much they will not get crispy, they'll just steam.\nCook for five minutes on one side, flip, and cook for an additional five minutes.\nMake sure to move them around a little during this time - stove tops and pans can be finicky, and you don't want one to get burnt while the other stays golden brown. ;)\nAnd if a patty breaks during flipping, no worries, just craft yourself a falafel-esque bite and consider that your tip for cooking.. \nTop these as you would any other burger - I suspect almost anything would be good on them!\nThe black bean ones were my favorite, but the chickpea ones are also quite good. I can't wait to try additional combinations and I'm looking forward to trying more veggies and fresh herbs in there!\nThe original recipe came from one of the kings of cooking, Mark Bittman, and you can check out more ideas on his bean burger recipe page - he gives a lot of great substitution ideas!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 3, 0]\nD: [2, 0, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_15_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_15_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_15_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_15_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [3, 0, 2, 1]\nD: [0, 2, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Materials:\nWhite Fondant\nPink or red food coloring\nToothpick\nSmall Circle Cookie Cutter\nFondant work mat\nRolling pin\nGloves (optional)\nSorry for the blurry picture!. Dip the toothpick in the food coloring and then rub the food coloring onto the white fondant. Put on gloves and start to work the fondant until it turns the desired color.. Take a little piece of the fondant that you just colored and roll it into a ball. Put the ball on the fondant work mat and start shaping so that it ends up looking like a hershey's kiss.. Put the slab of fondant on the Fondant Work Mat and roll it very flat. Then use the circle cookie cutter to cut as many circles from the fondant as you can. If you don't have a circle cookie cutter, you can also use a water bottle cap like I did.\u00a0. Fill a little cup with water and keep it with you the whole time you are putting together the roses. Take \u00a0a base and a circle and wet one of your fingers. Rub your wet finger on the bottom of the base and then stick the circle onto the wet part of the base. Keep repeating until the rose has as many petals as you would like it to have. 
To finish the roses, gently twist the very bottom and then use scissors to cut it off.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [3, 0, 2, 1]\nD: [0, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_16_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_16_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_16_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_16_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 1, 0]\nD: [2, 0, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \nI make two batches of granola every Sunday. One I make with the nuts and one without.\n12 cups of\u00a0 quick oats\n1 cup of Olive oil (or your favorite light oil, canola, grapeseed)\n1 cup of Maple syrup if you don't like the flavor of the maple syrup I have also made it with light corn syrup\n1 cup of packed dark brown sugar. If you prefer the light or golden feel free to substitute\n3 cups of shredded Coconut- again this is optional, if your allergic or just don't like the consistency don't put it in.\n3 cups of sliced almonds- I just buy the whole ones and chop mine up these are optional\n1 cup of dried fruit-again optional. In the one I make without nuts I also do not add the dried fruit. The kids add fresh strawberries, bananas to their own bowl.\n1 pan large enough to hold everything. I use my roasting pan.\n3 cups of chopped cashews- again optional\nmeasuring cup\nknife for chopping nuts. Pre-heat the oven to 250 degrees Fahrenheit.\nNow is when you will chop your nuts up if you bought them whole.\nNo need to chop them too small. 
I like the way they clump together while they are baking. No need for them to all be the same size either.. Ok, now your going to add all the ingredients except for the oil, syrup and fruit. Mix well.\nNow add your oil , mix.\nThen add the syrup. mix well.. \nPlace your pan into the oven.\u00a0 You are going to stir the granola every fifteen minutes for one hour and fifteen minutes. This way it gets a nice even color.\nLet cool. This is when you add your dried fruit , chocolate chips if you want.\nWe eat it over soy yogurt, with soy milk, also if you heat up your milk then pour it over the granola yummy!!\nI hope you have enjoyed my first instruct-able.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 1, 0]\nD: [2, 0, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_17_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_17_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_17_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_17_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [2, 1, 3, 0]\nD: [3, 2, 1, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. So the general plan is to suspend your brewing pot/kettle in a big container and surround it with expanding foam. I found the main challenge to be getting a suitable container. For my 15 litre stock pot I found a big plant pot that would do the job (in the UK you can get one of these from B&Q or Wilkinsons, for about \u00a37). 
This allowed about three inches dead space around the whole pot, except at the bottom edge of the pot where the taper reduced this to two inches or so.Requirements for the container You're looking for something a few inches (say ~3\") larger than your pot on every 'side' (including underneath) The smaller the space, the less expanding foam you will need The larger the space, the better the insulation, but the returns diminish since mash times are only an hour or so You could make your own container, but be careful about materials - cardboard might go soggy Something like a garden dustbin might be a good choice for large pots/kettles So, here's everything you'll need Your brewing pot/kettle (duh) A large container as discussed above A can or two of expanding foam (one 750 ml can was enough for me - I got it from Screwfix)* Something to raise the pot off the bottom of the container (see next step) Aluminium foil to wrap around the pot * apparently you should steer clear of 'easy cleanup' or 'water soluble' varieties of this foam because they're too weak - thanks sharpstick for the tip. We want insulation under the pot as well as around it, so it needs to be lifted up off the bottom of the big container. This is how I did it: just a piece of plywood with three nylon screws. Advantages of doing it like this: Leaves a nice air gap under the pot for filling with foam You can easily adjust the height of the screws if the pot doesn't sit level Three-point contact means the pot won't rock (kinematic mount) Nylon screws rather than metal to avoid scratching the pot Whatever you use, it doesn't need to be fixed to the bottom of the big container because the foam will bond everything together. You might want to stick it down with something light duty though, to avoid it shifting for the time being.. VERY IMPORTANT! This is what makes your insulation a removable sleeve rather than a permanent addition to your brew pot! 
The idea here is to wrap the pot in foil so that, once the foam has set, the pot can be removed. Hence you can mash inside the insulation, then pull out the pot and get it straight on the heat for boiling. This is quite an easy task if you take your time and do things in a sensible order: Start with the part of the pot below the handles, where's it's a constant diameter, and work towards the base from there. The foil doesn't have to be pulled tight, but you don't want it baggy either. Try to not crinkle the foil too much. Secure it where necessary with some tape (I used electrical tape). Gently fold the foil around the base of the pot and tape it again. The foam won't want to invade between tight folds of foil, so there's no need to make all 'airtight'. Continue the foil wrap past the top of the pot and fold it inside. Carefully scrunch around the handles, avoiding any tears. If the foil tears then patch it up with tape. Tape over any edges where one sheet of foil ends and another begins. The thing to remember is that this doesn't have to be pretty, but the foil does need to cover all of the outside of the pot. Any gaps and the foam will make direct contact with your pot and stick to it, and this will only make your life harder when you eventually try to get the pot out.. Adding the foam has to be done in layers, so you'll need to allow enough time. My experience was that each layer added three or four inches of height to the overall level of foam, with at least five minutes for each layer to finish most of its expansion. It took me about half an hour to add all the foam.Lessons learned You'll see in the photos that initially I thought a bag of rice would be heavy enough to hold the pot in place. WRONG! In the end I threw in as many heavy things as I could find, so seek out some ballast before you start. The base layer of foam can really push up! Also it's easy to miss just how much the foam expands. 
If you watch it it barely seems to move, but in reality it will puff up to four or five times its initial volume over five or 10 minutes. Observe carefully and you'll get a feel for the expansion.Protection WEAR GLOVES AND SCRAPPY CLOTHES. If the foam dries on your clothes it will never come out. Get it on your skin and you'll end up removing it with sandpaper, not soap :|Preparation Gloves as already mentioned. Plastic sheet on the floor. The foam will drip out the end of the can between uses, and it's best to just let this fall onto your plastic sheet and let it dry there. Wiping this stuff is almost always a bad idea. The foam sets in the presence of moisture, so I slightly wet the sides of my pot and container to help it along. Probably not essential though.Procedure Ok, here goes: First the base layer needs to be added, so remove the pot from the container (not the spacer from step 2 though) and squirt an even layer of foam across the entire base of the container. Avoid the temptation to fill it all - you only need a layer around a quarter of the height of the gap and magic will do the rest. Replace the pot, fill it with as much mass as you can find*\u00a0and ensure that the pot is centred in the container. We don't want to move it from this point onwards if we can help it. Wait for the base foam layer to expand and creep around the edge of the pot. Only about five minutes in my case. Add another layer of foam directly on top of the last layer, working evenly around the pot. Aim for an initial layer height of between half an inch and an inch - it will expand four or five times this. Wait again for expansion to slow down, but not so long that it starts to harden. Five to 10 minutes again, perhaps. Repeat steps 4 and 5 until your latest (unexpanded) layer sits just below the rim of the pot. Now relax and have a homebrew! The foam will take longer to completely set than the can advises, because it's such a deep gap. Leave it at least overnight to fully harden. 
*\u00a0you could fill the pot with water to weigh it down - thanks woolleyy for that idea. The beauty of this foam is how easy it is to shape once it's dry. Using a sharp knife, cut around the rim of the pot so the foam is flush. Also cut or tear the protruding foil and remove the excess to expose your pot.Something unexpected At this point I expected the pot to simply lift out of the foam-filled container, but not so. Actually there was barely any adhesion of the foam to the big plant pot, so I actually removed the brew pot + insulation as one! See photo. In fact, the fit was so snug that it took a little while to get the pot out of its new jacket - I now realise it's because there was no way for air to enter the cavity beneath the pot. You could solve this by poking a hole trough the insulation at the bottom, but after I removed the pot the first time I found the foam had flexed enough to allow air to enter/exit down the sides of the pot. Something to think about though if you have this problem.UPDATE 2014-03-19: See new step 6 for, possibly, an even better solution to this.Voila! Anyway, you should end up with a perfectly-fitting, custom made cozy jacket for your pot during mashing. You could even make a foam lid, although I just use towels.Performance My temperature drops during a mash are now consistently less than 1 deg C per hour, and that's including removal of the lid to do pH checks etc. \u00a0Also I tend to do ~8 litre mashes, so a \"true\" BIAB (full volume) mash would hold even better. \u00a0TIP: If you're doing BIAB, then you're probably adding the grain to the water rather than the other way around. If this is the case then consider heating your mash water a few degrees higher than usual, then insert the pot in the insulation with the lid off and wait for it to drop to strike temp. This way you pre-heat the insulated jacket a little and avoid any initial losses when inserting the pot into a cold jacket.. 
This step is a later addition to the instructable, because it suddenly hit me that you can save yourself the hassle of lifting (especially big pots) if you cut the finished jacket in two. The idea is simple: slice the set foam in half, then you can bring the halves together around the pot and secure them with a strap, tape etc. Lifting isn't eliminated entirely, since you'll probably be lifting the pot off a burner, but life is certainly made much easier, especially for removal after the mash has finished.I recently did this for a 32 litre pot with great results (see photo).Spigotry If your pot has a spigot (tap) then there's hope for you yet... assuming you can remove the spigot for the 'moulding' phase when the foam is setting, then you can cut the foam in two, as above, but also take a little extra off so there's a gap for the spigot to poke through. This foam is so easy to shape that I can't see this being a problem. It could even work for some electric boilers.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [2, 1, 3, 0]\nD: [3, 2, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_18_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_18_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_18_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_18_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 1, 2, 3]\nC: [0, 1, 3, 2]\nD: [3, 0, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. The ingredients in this wonderful and OFFICIAL Peet's recipe are as follows:1/4 lb. of Hibiscus C tea12 oz. of boiling water12 oz. lemonade1/2 cup of white sugar4 oz. 
cold waterIce cubes to coolY'all will also need a tea-kettle, an iced-tea pitcher, and a tea pot of some sort. Boil 12 oz. of water, DO NOT REMOVE KETTLE UNTIL THE WHISTLE BLOWS.. Pour the boiling water over the Hibiscus C tea into a teapot (or other heat-safe container, in our case we used a coffee presspot) and let steep for 5 minutes. (If you prefer a stronger tea taste, feel free to let it steep a bit longer). After the tea has steeped for 5 minutes or so, use a strainer to separate the hot liquid from the loose tea into an iced-tea pitcher, and immediately afterward add the 1/2 cup of sugar. This is critical to do directly after the tea has steeped so the sugar can dissolve in the hot liquid. Gently stir to ensure that all sugar is dissolved.. ... pour lemonade into the mix y'all!After the sugar is dissolved into the concentrated tea, pour 12 oz. of cold lemonade into the pitch.Continue to stir the mixture. This step is simple y'all, while stirring, pour 4 oz. of cold fresh water into the pitcher. yep, that's all for this step.. So what would an iced tea cooler be without the ice, right? Once the mixture is completely stirred together, add a few handfuls of ice cubes to chill the drink. If you really want to get festive, you can use fun ice cube shapes...we used puzzle and triangle ice-cube molds. Special ice shapes are the perfect mundane detail to dazzle your friends and show up Martha!!!. Add some more flare to this fabulous drink by pouring it into your favorite cocktail glass and adding a colorful garnish like a slice of lime or a lemon twist.Your friends and dog will love it!!. This drink is best served chilled on a hot day. 
Add some banana loungers, Girl Talk's \"Feed the Animals\" album and a friend or two and you have an instant party!!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 2, 3]\nC: [0, 1, 3, 2]\nD: [3, 0, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_19_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_19_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_19_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_19_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [0, 1, 3, 2]\nD: [1, 2, 3, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. First off is the materials: - Heat Lamp Bulb (Any wattage will do, I had different types for different heat levels) - Hotel Style Plate Cover- Heat Lamp - Pigtail Light Bulb Socket- Outlet socket wire adapter Optional: - Extension Cord- In-Line Dimmer . The first place to start is cutting the hole. Since the bulb needs to be exposed to the food, you will need a hole that is wider than the diameter of the bulb. I first used a Dremel to get the cut started, continued with some pruning scissors (not recommended), and finished with an angle grinder. When I had gotten the hole cut out, I went around the edge with the angle grinder to smooth out the jagged edges. Since this hole will not be seen when assembled, it does not have to be perfect. Make sure to be safe and wear the appropriate gear when cutting the whole as the edges are sharp and the tools throw sparks. . Next, you take apart the heat lamp. If you chose the parts correctly, the pigtail socket should fit into the hole of the previous socket. 
For your lamp this may not be necessary, but for my lamp I wanted to be able to use bulbs of up to 300 watts, which no pre made lamp that I could find had. . This step is fairly self explanatory, but you need to put the wires from the pig tail socket into the outlet to wire converter. The tip for wiring the right way this is that white is right. . At this point, the hole is made and everything is ready to be put together. I decided to glue the shade to the cover before I glued the pig tail in order to have less in the way, but either way works. I put a small line of glue all around the base of the lamp shade and lined it up with the cover and made sure to make a good seal. Then, I did the same with the pigtail socket and the top of the shade. It is important that you make sure that the shade is well in place before use to be sure of no fire hazard if parts were to fall apart. At this point, you are almost done! . I chose to add an extension chord and a dimmer that both plug into the wall. This helps to adjust the heat that the light can be set on and increase the range it can go from the socket. . By now, you should just have to screw in the bulb and be off to the races! I hope that you enjoyed this simple and helpful guide on how to create your very own and to keep a warm meal for anyone who happens to miss it. Thanks! 
\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [0, 1, 3, 2]\nD: [1, 2, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_20_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_20_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_20_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_20_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 1, 3, 2]\nD: [1, 2, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. I started with around 1.5kg of silverside beef. Using a very sharp knife I cut the beef along the grain. When cutting down for normal steaks usually you are encouraged to cut across the grain. This stops the beef from being chewy, however that is sort of the effect we want from the biltong. I cut them into slices of around 1cm thick. The thicker you make the longer it will take to dry, and the thinner obviously it will be quicker. . I transferred the beef into a plastic container. Then I poured some red wine vinegar into the container and worked the beef into it all to make sure there was vinegar on all the beef. I let this marinate for around half an hour. This helps with the preservation, I made sure not to leave it in too long or else it will start to effect the final flavour too much.. This section you can mix and match as much as you like. The only one you really need to include is the salt. The salt is what kickstarts the drying and preservation. I started with some sea salt flakes, I crushed them up with my hands. Next I used some black peppercorns and crushed them in my pestle and mortar. 
I left them fairly course, you can of obviously crush them as fine as you like.Then I added a small handful of chilli flakes. This will add a kick to the biltong. Finally I added some crushed coriander seeds. This is the what gives it that real traditional South African biltong flavour.. I then rubbed all the salt and spices into the beef and made sure it was fully covered. Using small metal hooks I hung each bit of beef into my Biltong Box. If you don't have any hooks you can use paper clips to bend into a hook shape. It needs to dry for around 4-5 days. The longer you leave it, the drier it will get. I like mine a little wet so it is a little bit rare in the middle. . Once you're ready just take it down and slice the Biltong as thin or thick as you like it.It should last a good few weeks if you keep it covered in a sealable bag or tub. I hope you enjoyed this instructables. And remember if you make it make your own, share some photos here! \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 1, 3, 2]\nD: [1, 2, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_21_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_21_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_21_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_21_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [3, 2, 1, 0]\nD: [2, 1, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \n\tDirections\n\t\t\tPreheat oven to 350 degrees. Sift flour, cinnamon, baking soda, baking powder, and 1/4 teaspoon salt into a bowl. Beat 1 stick butter and the sugars until pale and fluffy. 
Beat in egg and vanilla.\u00a0 (I sometime skip this step and add all these ingredients at the same time... the result is absolutely the same). \n\t2.\u00a0Mix in zucchini, oats, and walnuts. Refrigerate until firm, about 1 hour.. \n\t3. Depending on the size you want your cookies, Drop spoonfuls of dough onto parchment-lined baking sheets, spacing about 2 inches apart. Bake until edges are golden, about 17 minutes. Let cool on a wire rack.. \n\t4. Beat together remaining 1 1/2 stick butter, crisco, vanilla, milk and confectioners' sugar until smooth. (Buy cream cheese frosting in the tub for an easy alternative) Spread 1 heaping tablespoon filling onto the flat side of 1 cookie, and sandwich with another cookie. Repeat with remaining filling and cookies.. Scrumptious Veggie Cookies!!!!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [3, 2, 1, 0]\nD: [2, 1, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_22_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_22_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_22_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_22_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [0, 2, 3, 1]\nD: [3, 0, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. To make this delicous Ajam Ketjap you will need:500 grams of chicken breast (Ajam)a splash of oil 2 large onions 6 cloves garlic 1 small can of tomato puree 1 theespoon of salt 6 tablespoons of Sweet soy sauce (Ketjap manis)You also need a cutting board and knife a stirrer and a large pan.. 
Cut the onion into pieces, put a little bit of oil in your pan and add the sliced onion and tomato puree together in the pan and fry until the onions are translucent. (it is very importent to put them in together, for the taste of the end product). Whille you fry the unions an tomato puree, Cut the chicken breasts in dices, when the unions are translucent add the chicken and fry it until the chicken is brown.. crush the garlic and put it in the pan stir and fry for 1 or 2 minutes. (Some times people say that 6 cloves is to much and there breath will be terible afterwards. But you do not have to be afraid this wont hapen.). Now add the Theespoon of salt and 6 tablesppoons of Sweet soy sauce also called Ketjap manis, stir it and add about 1 cup of water ( the chicken has to be covered with the sauce you made.. Put the lid on youre pan and let it simmer for about 15 minutes occasionaly stir it, this is a good time to get yourself a nice cup of coffee.. After about 15 minutes get the lid off of your pan and let it simer for another 5 to 10 minutes depending on the amount of watehr that was added in step 5, this has to be done for 2 very important reasons, first of all the excess liquid wil vaporize and second every body in the house will come towards the kitchen wondering what it is that smells so good.You can eat this with bread or rice, both is delicious.Enjoy your meal!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [0, 2, 3, 1]\nD: [3, 0, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_23_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_23_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_23_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_23_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [3, 1, 0, 
2]\nD: [3, 1, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For pie cones1 cup All purpose flour 1/2 cup Cold milk ( add little by little) 1/4 cup vegetable oil SaltFor Sweet filling16 Oz extra firm Tofu 1 tsp Cocoa powder 1/2 cup Melted chocolate or Nutella 1/3 cup Maple syrup 1/2 tsp Vanilla essenceFor Savory Filling1 cup minced turkey meat 1 medium size onion, finely chopped 4 garlic cloves, finely chopped 1 cup mixed vegetables 4 sweet mini peppers 1 tsp salt 1/2 tsp fresh ground pepper 1/4 tsp dried oregano 1 egg 1 tbsp oil cilantro for garnishingSupplies for making moldParchment paper Aluminim foil sheet. Take A4 sheet size of both parchment paper and aluminium foil sheet. Make a fold in the middle of parchment paper and crush it on one end.Take aluminium foil sheet and wrap it over the parchment paper to attain the cone shape.. Take a mixing bowl, add in flour and salt, whisk until all combine. Add oil and mix it all well.Add a tablespoon of milk at a time, mix well using hand.Keep adding milk and knead until the dough becomes right consistency.. Preheat oven to 400 F Divide the dough into 3 equal portion. Take one portion, dust the floor with some flour and roll the dough using rolling pin. Roll it according to the size of the mold u have made. Once you rolled it, keep the cone mold on the dough and start rolling the dough over the mold. seal the edges using water. One cone is ready now. Repeat the same process for rest of the dough. Once you have made all the 3 cones, it's time for baking. For better result, pie cones should stand upright.Keep the cones carefully in the baking dish and bake it for about 25 to 30 minutes. Take it out from the oven and let it cool down completely. Remove the cones carefully from mold.. Wash tofu in fresh running water and cut into small cubes. 
Take all the ingredients in blender and blend it until you get smooth paste consistency. Transfer it to air tight container and refrigerate it for an hour.. Heat oil over medium flame, once oil heated, add garlic and saute for few seconds. Add onions, saute until it becomes translucent. Add in vegetables, season them with salt and pepper, cook for 3 minutes. Add minced turkey meat & cilantro, mix all well and cook uncovered and stir often until the meat is cooked well. Remove it from flame and let it cool down for sometime.. Take all your ingredients Pie cones Sweet filling Savory fillingFor sweet pie cone: Take a cone, scoop some chocolate tofu mousse into the cone and top with fresh strawberry. For savory pie cone: Take a cone, add some turkey meat fillling into the cone and top with shredder cheese.Serving suggestion: Refrigerate the sweet pie cone for an hour and serve chill. For savory pie, preheat oven to 200C, bake for 5 minutes and serve hot.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [3, 1, 0, 2]\nD: [3, 1, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_24_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_24_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_24_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_24_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 0, 1, 2]\nD: [1, 3, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
I shall be making several sandwich suggestions in this Instructable for you to try yourselves.Seek out and buy some of many types and makes, to discover the ones that you like the best.The point of this Instructable is to inspire you to try the thins and to maybe try a few sandwich suggestions that you have not tried before. Eat them on a camping trip, in the summer house or just lay outdoors in the sun. They are delicious.I hope that you already know how to make a sandwich, but for completeness I have gone through the steps with a few personal notes that may be of interest.Spread a thin layer of your favourite spread such as butter or an olive oil based one, as used here.This stage obviously applies to all sandwich ideas and so will not be repeated in this 'ible.. I love the way apple and cheese go together to create great fresh summer sandwiches.Of course you can try all sorts of combinations of different cheeses and applesYou may also like to try mixing cheeses as I have done in this example.. I like the apple sliced and chunky but some like it grated or chopped.Peel and slice the apple then layer it onto the thins.. Grate your cheese or cheeses and spread over the apple.Mix cheeses for extra flavour (or flavor).Here I used English Cheddar and Red Leicester.. You may leave them just as they are for a good sized handful, or cut them.In a recent survey of the Gregarious family it was agreed that diagonal slices are more 'posh' and taste best.. This is a mouth watering delight.Fresh prawns, fresh shredded lettuce and a light spread of your favourite dressing (e.g.Thousand Island, makes a gorgeous sandwich.. Rough chop the boiled eggs in a bowl. Then mix in some fresh cress. Spread liberally over the thins.Perhaps the cheapest and simplest sandwich shown in this 'ible but surely one of the tastiest and satisfying.. 
Thins are good toasted but we have found that it is best to lightly cover with spread first before popping them into the toaster.We find it best to set the toaster for just a short period, enough to warm and melt the spread.Do experiment with different settings to perfect your ideal toasted thin.. Get a good fresh Hummus and spread it over one thin.(You may choose to not have another spread but I prefer it that way)Then place a nice slice of corned beef on the other slice and put them together.(Note how the beef slice neatly fits onto the thin)These may also be combined very nicely with the warmed thin toast as described above.. Good back bacon (local terms may vary) grilled (broiled) until deep red and trimmed to remove all fat.. Skinned or with skins, grilled or broiled to a warm glow.Neatly arranged to complement the lovely bacon.Brought together in possibly one of the worlds most delicious sandwiches.(Especially on thins). Ok... I admit it...I could go on forever detailing all the yummy treats that can be made with sandwich thins.Before I stop, here are just a few more serving suggestions:The good old 'Chip Butty' as we call it here in the UK.Lovely thick gammon steaks.Simple thin sliced ham (maybe with tomato).I was going to include a nice steak too...but I ate it before I could take a picture.Mmmmm, I do hope that you have been inspired by these suggestions, which have certainly made me hungry just typing about them.If you make any please do upload a few pictures.EnjoyEnjoyEnjoy\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 0, 1, 2]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_25_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_25_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_25_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_25_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + 
"visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [0, 2, 1, 3]\nD: [1, 2, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need:\n-Cake Crumbles\n-Frosting\n-White Chocolate\n-Caramel Candies\n-Orange Food Dye *non water based*\n-Candy Decorator\nI used pumpkin cake and cream cheese frosting.. Although any flavor of cake and frosting works for cake balls, I decided to go with pumpkin cake.\nFor the cake, I mixed the following:\n-3 eggs\n-1 Cup sugar\n-1 teaspoon baking soda\n-3/4 Cup canned pumpkin\n-1/2 teaspoon cinnamon, ground\n-1/2 teaspoon nutmeg, ground\n-3/4 Cup flour\nThen I spread out the batter on a sheet pan and baked it at 350 for ~12 minutes.\u00a0\u00a0 Cooking time varies though, so be sure to check the cake while it is cooking.\u00a0\u00a0\nAfter the cake is cooked and cooled, crumble it.\u00a0 If it is still pretty moist, break it into small pieces.. Cream Cheese frosting pairs nicely with pumpkin cake.\u00a0\u00a0 However, just like the cake, any flavor of frosting will work.\u00a0\u00a0\nI mixed together:\n-8 oz cream cheese\n-1 Cup powdered sugar\n-2 Tablespoons butter\n-1 teaspoon cake vodka\nMake sure to taste the frosting to make sure it is sweet enough.\u00a0\u00a0 If the cream cheese flavor is too strong, add in a bit more powdered sugar.\u00a0\u00a0\nAdd some orange food coloring to the frosting so the inside color of the pumpkins is also pumpkin-like.\u00a0\u00a0\nI tried out cake vodka because we were out of vanilla.\u00a0\u00a0 It added an excellent flavor layer and I will probably use it in future frostings too.. 
After the cake crumbles and frosting are ready, mix them together.\u00a0\u00a0 The consistency should be a bit heavy and sticky.\u00a0\u00a0 Try not to over mix the frosting with the cake, or the texture will be mushy.\u00a0\u00a0\nRefrigerate the batter until it is chilled, so it is easier to roll into balls.\nRoll the batter into balls.\u00a0\u00a0 Keep in mind they need to be big enough for jack-o-lantern decorations, yet small enough to still be easily eatable.\u00a0\u00a0\nIf the batter is too sticky, even after chilling it, add more cake.\u00a0\u00a0 If the batter is not sticking together, add more frosting.\u00a0\u00a0\nAfter the balls are rolled, freeze them for at least an hour.\u00a0\u00a0 They need to be nice and firm for dipping.. Cut your caramel squares into stem shapes for the pumpkins.\u00a0\u00a0 I got 6 stems out of 1 square of caramel.\u00a0\u00a0\nCut your chocolate into slivers for melting.\u00a0\u00a0\n-I used almond bark for these pumpkins.\u00a0\u00a0 You can also find chocolate / white chocolate chips, big blocks of bakers chocolate, or pre-colored candy melts.\u00a0\u00a0. Melt the chocolate over a double boiler.\u00a0\u00a0 This is important because if it melts directly on the heat of the stove, it can easily overheat.\u00a0\nAdd in your food coloring as the chocolate melts.\u00a0\u00a0 Do Not use a wooden spoon as shown in the picture- it can add moisture to the chocolate which breaks it.\u00a0\u00a0 Also, make sure your dye is not water based, or the chocolate will break.\u00a0\u00a0\nOnce the chocolate is melted and dye added, it should be a nice smooth texture.\u00a0\u00a0 Turn down / off the heat for dipping.\u00a0 . 
When the chocolate is ready, dip the cake balls one at a time.\u00a0\u00a0\u00a0 They should be completely coated in chocolate.\u00a0\u00a0 After coated, place them on wax paper to cool.\u00a0\u00a0\nI find it is easiest to use a tooth pick for dipping, and have a second toothpick ready to help detach the pumpkin ball.\u00a0\u00a0 In the hole left from the toothpick, add a caramel \"stem\" before the chocolate is completely cooled.\u00a0\u00a0\nIf the cake balls are turning too mushy in the chocolate, put them back in the freezer until completely firm.\u00a0\nIf it is too hard to dip the balls with a toothpick, a fork can also be used.. When the pumpkins have cooled completely, they are ready to decorate!\nIf any of them have cracked on the outside, they can be patched very carefully with any remaining chocolate.\u00a0\u00a0\nAdd any features that a jack-o-lantern might have.\nI used a candy-decorator that needed to be warmed up in water to be useable.\u00a0\u00a0 After I decorated my jack-o-lanterns the candy turned solid again.\u00a0\u00a0\nEnjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [0, 2, 1, 3]\nD: [1, 2, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_26_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_26_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_26_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_26_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [1, 0, 2, 3]\nD: [2, 1, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. I used this recipe from Food and Wine magazine, but I left out the almonds in order to make the recipe easier for a beginner. 
You can also add in a tablespoon of ground flax seeds for added health benefits, or swap the dried cranberries for dried cherries. Cranberry Almond Biscotti (recipe will make about 30 biscotti)1 3/4 cups all-purpose flour1 1/2 teaspoons baking powder1/4 teaspoon salt2 large eggs3/4 cup sugar3 tablespoons unsalted butter, melted and cooled1/4 teaspoon almond extract1 cup dried cranberriesoptional, 1 tbsp. ground flax seedsBeat the eggs and sugar together until thoroughly mixed and creamy yellow. Stir in the almond extract and butter. Add in the dry ingredients (flour, baking powder, salt, and optional ground flax seeds) and mix again. (I used a stand mixer, but you can do it by hand as well). Stir in the dried cranberries last. . Wet your hands, and shape the dough into two flat logs, on a parchment covered baking tray. The dough will be very sticky, so wetting your hands (and re-wetting!) is a crucial step in shaping the dough.Bake in a 350 degree oven, for about 25 minutes, until the top of the logs are golden. Remove and let cool. . Biscotti means twice baked, and that's what we do in this recipe!Once your logs have cooled, carefully slice them on the diagonal, using a serrated bread knife. Go slowly. Using a serrated knife is critical. . Lie the slices of cookie on their sides, back on the parchment paper lined cookie tray. Bake for about 10 minutes, at 350 degrees, until the cookies are crispy and golden on their sides. The goal is to end up with a nice dry and crunchy cookie, that stores well, and is great for dipping!Let cool and enjoy with a cup of coffee or tea! If you're adventurous, you can drizzle with some melted chocolate, but they're delicious without the embellishment! Like my instructable? Vote for me in the Baking Contest! And check out my blog: www.approachingfood.com. 
Thanks!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [1, 0, 2, 3]\nD: [2, 1, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_27_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_27_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_27_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_27_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [2, 3, 1, 0]\nD: [3, 1, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Materials1. All-Purpose Flour (from Costco)2. Baking Yeast (from Costco)3. Water4. Vegetable Oil (from Costco)5. Salt (from Walmart)6. Flower Pepper Powder (Sold in local Asian supermarket)7. Green Onion8. One Chopstick9. Rolling Pin10. Knife11. Cutting boardKitchen Appliances1. Bread Machine2. Steamer. a. Add two cups of all-purpose flour, one teaspoon of baking yeast, and three cups of water into the bread machine.b. The mixture is stirred in the bread machine for 30 minutes.c. Let the dough stay in the bread machine for 50 minutes after stirring.d. The whole internal container of the bread machine will be filled with the dough as the picture shows below.NOTE: MUST KEEP THE DOUGH IN THE BREAD MACHINE FOR ENOUGH TIME (50 MIN). Spread the dough on the cutting board with a rolling pin to form a 10 inch \u00d7 10 inch dough sheet.. Add 1 teaspoon of salt, 1 teaspoon of flower pepper powder, 2 teaspoon of vegetable oil, and prepared green onion chips onto the dough sheet homogeneously.. Roll the sheet as shown in the following picture.. Cut the roll into 16 pieces, and put one piece right on the other one.. 
Using a chopstick, press down at the middle of the overlapped two pieces. The edges of the balls will be opened up gradually.. a. Put the flower buns on the upper layer of a steamer, stay for 30 minutes.b. Add 10 cups of water into the lower layer of the steamer.c. Heat the steamer for 15 minutes after boiling of lower layer water.NOTE: THE WELL-COOKED FLOWER BUNS COULD BE STOCKED IN A FREEZER FOR 3 MONTHS. BEFORE EAT, THE FROZED FLOWER BUNS SHOULD BE RE-COOKED AS THE ABOVE METHOD.SAFETY ISSUE: DO NOT LET THE WATER ALL BOIL AWAY.. Turn off heat and let it stay for 5 minutes, and enjoy it.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [2, 3, 1, 0]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_28_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_28_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_28_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_28_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [1, 2, 3, 0]\nD: [3, 1, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Shopping list is as follows:\nSalmon, Fresh (1 - However many will fit in your BBQ)\nSugar, Brown (I use about a 1lb of Sugar per fillet)\nSalt, Kosher (About 2 Cups per fillet)\nWood - (Your choice on wood, I used Mesquite woodchips).\n. So there are many types of brines, and for my first foray, I wanted to keep it simple. A simple ratio of 2:1 Sugar to Salt. The intent here is to draw moisture out of the Salmon, and add a touch of flavor. I had to make more often, so make a large batch and you can save your leftovers; saves from \"stop and going\" with your steps as I had to.. 
With the brine mixture created, you need to sandwich your Salmon with brine. You can use any type of container, but steer away from metal, as it'll interact with the process, and mess with your Salmon. Be sure to firmly press the mixture into the Salmon, and toss uncovered into fridge for 3 - 5 hours. Both the before and after pictures are here.. Wash off the brine from the Salmon, and pat dry with paper towels. Put on a rack (may wish to use the smoking rack so you don't need to move it), and put it in to the fridge again for 1 - 3 hours. Essentially, you want the Salmon to become tacky to the touch (from the salt/sugar combo drying) so that smoke can attach to it.. This would be the time to prep your wood. Many different ways to go about it, I've decided to use chips in a bread pan. From past experiences, and watching my Father, soaking your chips allows for a longer smoke, rather than dry chips.. This is the time to get excited, smoking is almost ready to begin. Heat your BBQ up, and wait for the coals to go grey (or if you have propane, turn it as low as it'll go). Typically, when I want to do this, I'll have my charcoal on the right side (where the wood chips will be), and on the far left side ( no fire) my Salmon. If you can get you BBQ below 100 degrees Fahrenheit and maintain it for the duration, you will have excellently smoked salmon. . I smoked for about 4 hours, stocking the coals as needed, and replacing the wood chips every 45 - 60 minutes.\nOn my first journey I was a little overzealous, and smoked hot.... Because salmon is so delicate, I had excellently cooked salmon, not smoked salmon. After a day long process, it was a little disappointing, so take my advice Colder is Better.... \nEnjoy!!\nIf you think, 150 grams of Smoked Salmon is $8\nFor 3lbs of Salmon, it's $10, add some inexpensive ingredients and you have some excellently inexpensive Smoked Salmon!!! 
\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [1, 2, 3, 0]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_29_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_29_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_29_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_29_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [2, 3, 0, 1]\nD: [3, 0, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. INGREDIENTS - 150g roasted cashews,250g Nutella hazelnut chocolate spread,20 no.s chocolate wafers.METHOD - 1) Chop the cashews finely.2) Chop the wafers.3) Combine 3/4th of the cashews with the Nutella. Refrigerate.4) Combine the remaining cashews with the wafers.5) When the Nutella mixture is firm, scoop out small balls of it.6) Roll the balls in the cashew and wafer mixture.7) Freeze until firm.. 1) Wrap the chocolates in foil.2) Cut out petals, from red crepe paper; for the roses(6-7 petals for each rose).3) Glue the petals onto the wrapped chocolates in an overlapping manner(the petals slightly overlapping each other).. 1) Glue together pieces of sponge to form the base of the boat.2) Cover the upper and lower surfaces of the boat with brown coloured card-stock and the sides of the boat with yellow card-stock.3) Line the edges of the boat with golden glitter.4) To make the mast, cover a stick in golden glitter glue and leave it to dry.5) For the sails, cut out triangles from white cotton fabric. Glue golden ribbon onto the sides of the triangles.6) Attach the sails to the stick(mast). 
Insert the lower end of the stick(mast), into the base of the boat(sponge); by making a hole in the card-stock covering the upper surface of the boat.. 1) Cut out leaves from green crepe paper and stick them onto the upper surface of the boat.2) Using double-sided tape, stick the roses onto the boat.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [2, 3, 0, 1]\nD: [3, 0, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_30_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_30_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_30_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_30_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [1, 3, 0, 2]\nD: [2, 1, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. I decided to make two separate sodas. Insert 10 two different colored skittles into two separate cups.. In my instructable I used Sierra Mist for my lemon lime soda. You want to pour the soda into each cup.. I set a timer to 10 minutes and then went back over to the drink and took out the flavorless skittles.. Take out each skittle. In my case a few of the green and red skittles dissolved into the soda.. I grabbed two ice cubes from my freezer and dropped them into both skittle sodas.. 
Lastly enjoy the skittle soda of your choosing.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [1, 3, 0, 2]\nD: [2, 1, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_31_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_31_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_31_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_31_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [0, 3, 1, 2]\nD: [3, 1, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Before you start preparing your cookies, place your oven rack in the middle of the oven and preheat your oven to 375\u00b0FIn a bowl, place 1 cup of softened margarine or butter, as well as 1 cup of both granulated and brown sugar. Blend the ingredients until smooth, so there aren't streaks of margarine or butter.. The next step is to beat in 2 eggs and 1 tsp vanilla extract into the creamed butter and sugar. Keep mixing the ingredients until smooth.. Now, measure and mix your dry ingredients into your bowl. This is 2 cups all purpose flour, 1 1/2 tsp cinnamon, 1 tsp baking soda, and 1 tsp salt.*You may choose to measure these ingredients out into a separate bowl.*You may also choose to add the optional ingredients of nuts and raisins. For this batch, I chose to add sunflower seeds and raisins.Blend everything together until the mixture starts to look uniform.. For this step, I chose to line my baking sheets with parchment paper. Not only does it makes getting the cookies off of the pan easier, it also allows for easier clean up.Place approximately 10-12 1 1/2 inch balls of dough onto the baking sheets. 
Slightly flatten with a spoon before baking (picture for reference).. Once the cookies are in the oven, set a timer for 8 minutes. Depending on the size of cookies, they may need more than 8 minutes to cook. You'll want to take the cookies out before they look completely done, as they will continue to cook and harden after coming out of the oven. Don't forget to take the cookies off of the baking sheet, or you will end up with hard bottoms.. \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [0, 3, 1, 2]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_32_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_32_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_32_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_32_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [1, 2, 3, 0]\nD: [3, 0, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need :BaconOreoOil (fryer)Toothpick. Split a bacon in half.Roll the first part of the bacon around the oreo.Roll the second part on the oreo (on different ''axis''). To hold the bacon around the Oreo simply use toothpick. It will be useful when you will use the fryer.. Sorry for the lack of image in this step...So when you are about to serve it, fry the Oreo Bacon.For a better taste, fry at highest temperature for about 20 seconds.. 
The taste is simply surprising...Good luck, hope you will enjoy it!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [1, 2, 3, 0]\nD: [3, 0, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_33_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_33_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_33_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_33_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 1, 0, 2]\nD: [1, 3, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. (A)Dumplings-200g glutinous rice flour50-100ml waterFood coloring (optional)(B)Syrup-5 - 6 pandan leaves (tied in a knot)1 thumb size ginger (cut to thin slices)150g rock sugar. Add flour into a large mixing bowl.. The amount of flour & water is very adaptable. Add water gradually and knead the dough until soft, smooth and easily kneadable.If the dough are:Too wet: sticky to the fingers (Add a bit more flour).Too dry: crumbling (Add a bit more water).. Repeat above step by adding fruit or vegetable juice.. Let the dough rest for 15 minutes.. Divide the dough (depending on the number of different colours you intend to make).Add food colouring. One drop at a time to each portion. Knead until the colour is distributed evenly.P/S:I skip this step because I am using natural food coloring in STEP 2.. Dip your fingers in water before shaping them if feels the dough dry.Shape the dough to balls or any others cute forms. (i.e. Panda, cat paw, alphabet, dice, flower etc).. The uncooked tang yuan in various shapes and colours.. 
Bring a pot of water (which enough to submerge them completely) to boil.Add the tang yuan into the boiling water and cook until they float to the surface.. Transfer tang yuan immediately to a bowl of (room temperature) water to cool down.This prevents them from sticking to one another or discolour the soup.. Add pandan leaves, gingers and sugar for sweet soup in a pot. Reduce heat to a simmer until the sugar is fully melted.. Add cooked dumplings to a serving bowl and ladle the sweet soup over.. Natural dyes from fruits and vegetables can be used to create a rainbow of dumplings.Eg:Orange = 2 tablespoon carrot juiceGreen = 2 tablespoon spinach juiceBlue = 3 tablespoons blueberry juicePurple = 2 tablespoons black berry juice + 1 teaspoon beet juiceBrown = 4 teaspoon coca powder\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 1, 0, 2]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_34_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_34_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_34_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_34_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [1, 0, 2, 3]\nD: [0, 1, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. First, take the ground beef and place it in a large sauce pan at medium heat with a little bit of oil. Brown the meat until all of it has crumbled and pinkness is gone.. Turn off the stove and drain the liquid from the pan. Return to stove, turn your burner back on to medium, and add the garlic powder and your diced green bell pepper. 
At this point, you may also add salt, black pepper or any other spices you like!. Now add in the tomato paste and ketchup. Set the burner on low. Stir the ketchup and tomato paste into the beef/pepper mixture until combined.. With the ketchup and paste mixed it, it's time to add some BBQ sauce! There aren't any real measurements for the sauce, just start with a little first. At this point, you may also add more ketchup, spices, or BBQ sauce to your liking.. Finally, assemble your sandwich! You can toast your buns, or not. You can add cheese, or not. The choice is yours! (However, the hot sloppy joe mixture will melt the cheese and make it even more sloppy; not a bad choice!) Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [1, 0, 2, 3]\nD: [0, 1, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_35_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_35_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_35_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_35_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [1, 0, 3, 2]\nD: [3, 1, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Measure 450 grams of chocolate chips. Put the chocolate chips on a double boiler on LOW temperature, and let it soften.. Mix the Graham and Chocolate crumbs together, once mixed add melted butter, and keep mixing. Spray your sprinform pan. Raise the crust onto the edges of the pan, and scrape off the extra to make it flat. Next put the rest onto the bottom. Chill in fridge.. Seperate 5 of the eggs, so that you have a bowl of yolks and one of whites. Add the last egg to the bowl with the yolks. 
Add 1/4 cup icing sugar to the egg whites. Put in mixer on High speed until stiff peaks form.. Mix 2 cups whipping cream with 2 tablespoons of icing sugar. Blend until stiff peaks begin to form.. Poor the melted chocolate into a bowl, next add the egg yolks and mix with mixer, then add one third of the stiff egg yolks and one third whipping cream and mix until light brown and smooth. Next add the rest of the egg yolks and whipping cream, instead of mixing, fold over and stir with spatula. Now pour the mousse (everything you just mixed) into the crust, and put in fridge over night.. Once you're finished you can decorate however you want.\u00a0\nEnjoy!\nThese chocolate mousse are great fundraisers, we sold over 240 and had to stop the orders.\nThanks for Reading!\nIf you have any questions just ask.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [1, 0, 3, 2]\nD: [3, 1, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_36_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_36_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_36_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_36_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [0, 3, 2, 1]\nD: [0, 2, 1, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
SoftwareCrust\n\t\tabout 1 cup of mini pretzels\n\t\t1 tablespoon butter, melted\n\t\t1 tablespoon sugarPeanut Butter Layer\n\t\t1 cup peanut butter\n\t\t1/4 cup butter\n\t\t1/4 cup brown sugar\n\t\t1 1/4 cup powdered sugar\n\t\tsplash of vanillaMicrowave Caramel (makes about twice as much as you'll need, but it's fantastic over ice cream (or off of a spoon!))\n\t\t1 cup sugar\n\t\t2 tablespoons water\n\t\t2 tablespoons corn syrup\n\t\tlarge pinch of salt\n\t\t1/2 cup heavy creamChocolate Layer\n\t\tabout 8 ounces of chocolate (milk or semi-sweet work well)HardwareFor All\nmicrowave\n8\" x 8\" baking dish\nmeasuring cups and spoons\nknifeCrust\nfood processorPeanut Butter Layer\nmedium mixing bowl\nmixing spoonMicrowave Caramel\nmedium mixing bowl\nsmall mixing bowlChocolate Layer\nsmall bowl\n . \n\t\tIn a food processor, grind the pretzels to a fine powder\n\t\tAdd melted butter and sugar\n\t\tProcess until well combined\n\t\tSpread the mixture over the bottom of the baking dish. \n\t\tCombine sugar, water, and corn syrup in a medium-sized mixing bowl\n\t\tMicrowave on high for 5 1/2 minutes or until the sugar is melted, and the mixture is a very light amber color\n\t\tAllow the mixture to sit at room temperature for about 3 minutes\n\t\tWhile it's sitting, heat the cream in a small bowl for 45 seconds in the microwave\n\t\tAdd the heated cream and a pinch of salt to the sugar mixture and mix well\n\t\tTransfer to the fridge until ready to use. \n\t\tIn a medium mixing bowl, combine peanut butter, brown sugar, butter, and powdered sugar\n\t\tMicrowave for 1 1/2 minutes\n\t\tMix to combine\n\t\tAdd vanilla and mix well\n\t\tPour the mixture over the pretzel crust and spread evenly. 
\n\t\tIn a small bowl, melt the chocolate in the microwave by heating for 35 seconds\n\t\tPour the melted chocolate over the peanut butter mixture and spread evenly\n\t\tPour thin stripes of caramel over the chocolate\n\t\tGently pull the tip of a knife through the caramel and chocolate to combine them slightly\n\t\tPlace dish in the fridge for about a half hour to allow the chocolate and caramel to set up\n\t\tSlice into bars and enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [0, 3, 2, 1]\nD: [0, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_37_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_37_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_37_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_37_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [2, 1, 3, 0]\nD: [1, 2, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. To make these, you will need:\nSodium Alginate\nCalcium Lactate\nEspresso (this can be instant espresso powder + water)\nChocolate chips (ignore the fancy block of chocolate in the picture--I'll explain later)\n[Heavy cream\nSugar\nVanilla\nGelatin powder] - for panna cotta\nor\nWhipped cream (from a can or homemade)\nAnd also:\nA cylinder (paper towel roll or similar)\nWax paper\nTape\nScissors\nA scale (ideally)\nSome kind of dropper (pipette, syringe, straw, etc.)\nLots of bowls and silverware\nA fancy plate (for displaying). \n Sushi connoisseurs know that the key to good sushi is good rice. 
That might not entirely be true in this case (it's hard to beat molecular gastronomy), but hey, those liquid-filled espresso spheres need to sit on top of something fluffy and white. This can be any sort of sturdy no-bake pudding (or whipped cream), but I went with panna cotta, the Italian custard set entirely with the power of gelatin.\nIf you decide to use panna cotta, you'll want to start about four hours in advance so that it'll have time to set. Here's how to make enough for 8-ish pieces of sushi, depending on how big they are.\nINGREDIENTS:\n1 1/2 cups heavy cream\n1 package gelatin\n1/4 cup sugar\n1/2 teaspoon vanilla extract\nDIRECTIONS:\n\t\tPut about half the cream in a saucepan and sprinkle the gelatin over it\n\t\tLet it sit for about 5 minutes, and then turn the heat to low and stir gently until the gelatin dissolves\n\t\tAdd the rest of the cream, the sugar, and the vanilla\n\t\tTurn heat to medium and stir until it starts steaming\n\t\tPut a cover on it and let it sit for ~15 minutes\n\t\tLet cool, then put in the refrigerator and let it sit until it sets (3 or 4 hours)\nOr just buy a can of whipped cream. Up to you.. \n The next thing to make is the \"nori,\" a thin cylinder of chocolate that will contain all your fillings. You can use any reasonably sturdy, cylindrical object to form these--an empty paper towel roll is a good bet, but I used the cardboard roll from an empty box of aluminum foil, which was a little smaller and sturdier. You'll also need wax paper, tape, and scissors.\n\t\tThe first thing you want to do is to prep your cylinder with markings to show how tall the rolls should be. This will make it easier to make them all the same height. 
Take some tape and mark off a sushi-looking length on your cylinder.\n\t\tTear off a piece of wax paper big enough to wrap around your cylinder, and cut it in half.\n\t\tWrap one of the halves around the cylinder, and tape down the seam.\n\t\tWrap two pieces of tape around the cylinder, lining them up above and below the guide on the cylinder. Fold the tape under at the ends so you'll be able to pull it off easily.\nYou now have a form on which to mold your chocolate cylinder.\n . \n Now it's time for a hard lesson learned from the pretty block of fancy chocolate shown at the beginning, which is that tempering chocolate is actually kind of hard. If you trust yourself to do it, then go for it, but I ended up switching to a non-fancy package of mini semi-sweet chocolate chips, which I melted the old-fashioned way: in the microwave.\n\t\tPut the chocolate chips in a microwave-safe bowl.\n\t\tMicrowave at 20-second intervals until the chocolate chips are just starting to get shiny--they should still be shaped like chocolate chips.\n\t\tStir like crazy. You should have chocolate that is spreadable but not liquid.\n\t\tUsing a butter knife, spread the chocolate between the two strips of tape around the circumference of your cylinder, going lengthwise.\n\t\tGet a little more chocolate on your knife and go around the cylinder with the flat of the blade, smoothing it out. Make sure not to let the chocolate get too thin.\n\t\tLet it air-dry for a few seconds, then pull off the two horizontal strips of tape, leaving a perfect cylinder of chocolate\n\t\tCarefully pull the wax paper cylinder away from the cylinder form and stand it up in your freezer for a few minutes. The refrigerator should also work, if you leave it there longer.\n\t\tRepeat until you have as many chocolate cylinders as you want to make.\n\t\tGently push the wax paper away from the chocolate, pushing on either side of the chocolate cylinder at once. 
This is important, and why you can't just use the edge of the wax paper in place of one of the pieces of tape. When the paper is almost completely detached, twist it to fully disengage the paper.\n\t\tIf you didn't spread your chocolate too thin, you should now have a chocolate cylinder, ready to be filled. Put it on a plate and admire it.. So you've gotten the easy part out of the way. Any fool can make custard and make chocolate into a cylinder, right? Now it's time for what this dessert is really all about: Molecular Gastronomy.\nLet's let the phrase marinate a bit and get over some common misconceptions. Molecular Gastronomy approaches food and the art of cooking with a rigorous, scientific zeal. We know egg whites solidify at a certain temperature, but what is going on within the egg white itself? Does altitude, freshness, chicken diet, or water salinity have anything to do with it? These types of questions, seeking to isolate and pinpoint the how and why of food, are what spurred techniques in molecular gastronomy. If you'd like to learn more about the history of molecular gastronomy, as well as details about the science behind the how and why of your food, don't hesitate to check out On Food and Cooking by Harold McGee.\nMolecular gastronomy may be best known for its use in in stuffy and expensive small plate establishments (I may be bitter over the cost and wait for Alinea...), but the techniques used have been a part of our culinary language for years. Check the stabilizers and additives in any food you buy (the pimento pieces found in green olives are produced exactly the same way as this espresso roe!). If anyone raises a \"chemicals in our food\" line with you, gently remind them the food you are eating is a vast array of chemical and biological reactions cooperating and competing against each other. 
These ingredients have passed rigorous and lengthy health standards with flying colors, and make an amazing addition to your culinary tool-belt.\nMolecular Gastronomy has a relatively low cost of entry, as well.\nSodium Alginate-\nhttp://www.thespicehouse.com/spices/Algin#content\n(Carbohydrate derived from brown algae that forms a gel upon contact with calcium ions)\nCalcium Lactate-\nhttp://www.thespicehouse.com/spices/calcium-salt\n(Calcium ion source that dissolves in water- may taste bitter, but harmless)\nThe two ingredients used in this recipe go for less than $7.50 (+ tax and/or shipping) and are used in small quantities (on the order of grams). Inexpensive, and long lasting. If you don't have a food scale that is accurate to the gram, you may want to invest in one later- after all, food is science. But this technique is pretty resilient to estimation if you know what to look for. . \nI used a powdered espresso for this recipe, and prepared it a bit stronger than what the instructions dictate. Remember that these espresso caviar spheres will be small- you'll want them to pack a punch of flavor that you'll notice in just a few caviar \"drops\". In my case, the espresso was a bit bitter, so I added 3 tsp of sugar. You can substitute fresh espresso, or even strongly flavored fruit juices- just be careful with the acidity, as anything too acidic will throw off the reaction.\u00a0\n- three teaspoons espresso\n- three teaspoons sugar\n- 9 oz water\n__________\n158 g of espresso liquid\n\t\tAdd enough Sodium Alginate to make your espresso a 1% sodium alginate solution. Divide the grams of liquid you have by 100 to figure out how much Sodium Alginate you need to use. In my case, it was about 1.58 g. You'll be able to modify the solution after the fact if things aren't working out right or your drops aren't forming properly.\n\t\t\u00a0\n\t\tMix thoroughly. I used a latte foamer, but you can use anything you like- a hand mixer or a whisk will do just fine. 
The solution will be quite foamy- don't worry.\n\t\t\u00a0\n\t\tPass through a fine mesh strainer a few times, or let sit in the fridge until the foam has dissipated. Discard any lumps that may have formed- this may be due to hard water, or cross contamination. Sodium Alginate is sensitive to calcium.\n\t\t\u00a0\n\t\tSet aside and go on to prepare the calcium bath.. \nYou'll need a 1% calcium solution to serve as a bath for your spheres. As you drip the espresso into this bath, the outside of the droplet will form a carbohydrate shell, forming your caviar. The longer you let your caviar sit, the firmer the shell will be until your drop eventually solidifies.\nTime for some simple chemistry mathemagic for those of you without a kitchen scale. You may want to consult the internet/ wikipedia for densities if you have a different ingredient.\n3 cups water---> 709ml water\nDensity of water ~ 1g/ml\n709ml x 1g/ml = 709g\n709 g of water means you'll need 7.09 g of Calcium Lactate (1% solution)\nDensity of Calcium Lactate ~1.5 g/ml\n7.09 g \u00a0/ 1.5 g/ml = 4.7ml\nYou need about 4.7 ml of Calcium Lactate, or about 1 teaspoon. Easy. Mix this into your three cups of water until it dissolves.\nTime to Spherify.. For my caviar, I used what I lovingly refer to as a sawed-off pipette. Experiment with what you have available, and see what kind of sizes and shapes you can produce. Syringes (without needles, check your drugstore) of different sizes are commonly used, as well as pipettes and other types of droppers. You can even use teaspoons to produce a ravioli effect.\nTake your drop-making implement of choice and drip the espresso / Sodium Alginate mixture into the calcium bath, one drop at a time. Release the drops as close to the bath as possible so that they don't solidify in funny shapes due to the impact.\nWhen you're out of espresso-mixture or have as much caviar as you want, pour the contents of the bath through a strainer over the sink. 
Make sure to gently rinse your caviar to wash off the bitter calcium water solution.\nNote: if your caviar doesn't form and just dissolves, you may need to have a stronger Calcium bath. Experiment with the bath first in small quantities, followed by your Alginate solution. If the drops form too quickly, your bath may be too strong. Try adding more water and trying again. They may be fragile, depending on how long you've let them soak, so be careful.\nDue to this particular method, the caviar will continue to solidify slowly after formation- eat them quickly if you want that pop or prepare them last. There is also another method called reverse spherification, in which your liquid contains calcium and is dripped into an Alginate bath (opposite of what we've done here). This method has a thicker, but non-progressing shell for your liquid. Stay tuned in the comments for my experiment with that.\nTime for assembly.. \n Now for the moment of truth: putting it all together.\n\t\tSpoon your panna cotta into the chocolate cylinder (or if you went the lazy route, \u00a0squirt some whipped cream in there) until the cylinder is almost full.\n\t\tCarefully spoon the espresso roe on top of your filling until the cylinder is covered.\n\t\tIf you want some plate decor, you can use your cardboard cylinder form to punch little discs of panna cotta, and then scatter some extra roe around. Or drizzle some chocolate around. Make it look nice.\n\t\tTake artsy photographs of your masterpiece. You've earned it.\nRemember, sushi is in fact traditionally picked up with the hands and eaten in one bite. If you want to be a bit more civilized with this dessert, though, a fork is recommended (or chopsticks if you make them a bit smaller).\nServe this as the dessert for a sushi dinner party, and all your friends will forever think you're 100% classier than you are in real life. 
Guaranteed.\n \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [2, 1, 3, 0]\nD: [1, 2, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_38_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_38_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_38_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_38_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [1, 3, 0, 2]\nD: [2, 3, 1, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need: 1 1/2 cups heavy cream 7 bags of chai tea, (or whatever flavor of tea you would like! On two different occasions I have added a whole chili for a spicy truffle, and one time I used three cinnamon sticks!) 1/4 teaspoon salt 9 ounces dark chocolate (70 percent or higher!) 2/3 cups Dutch process cocoa powder Gold dust, for garnish (I have also sprinkled cinnamon sugar mixture and finely chopped nuts) ----> here is the link for gold leaf! (.20 a sheet!). Chop the dark chocolate finely, it melts faster. Combine the heavy cream and tea bags (or cinnamon) in a small saucepan. Place the pan over medium-low heat and warm gently, stirring occasionally, until bubbles just start to form around the edges of the cream, about 5 minutes. Simmer another 3 minutes before turning off the heat. *Note you can add vanilla or allspice too if you'd like a more intense flavor!I did this when my little took a nap!. Place the chopped chocolate and salt in a medium bowl. Strain the hot cream mixture through a fine-mesh strainer over the chocolate and let sit for 3 minutes. 
Slowly whisk the now-melted chocolate into the cream, starting with small circles in the center of the bowl and moving into larger circles as the mixture begins to come together; stop when smooth and completely blended. . Press a piece of plastic wrap directly on top of the ganache and refrigerate for about 3 hours, or until just set and cold but still pliable. (I didn't have plastic wrap on hand but I do recommend it, I feel like it didn't chill as fast with the foil!). Place the cocoa powder in a shallow bowl. With a 1-ounce scoop, scoop level rounds of the ganache into your palm and gently roll into a ball. Roll the truffles in the cocoa powder to coat, then place in an airtight container and refrigerate until ready to serve. Just before serving, sift a touch of gold dust, if using, over the top of the truffles for sparkle. ----> I small spoon works as well it will just be messier!Recipe courtesy of Giada De Laurentiis!. Take these little gems to your next Christmas party or as gifts for the ones you love! With love, Mama Mere Bear Enjoy! \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [1, 3, 0, 2]\nD: [2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_39_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_39_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_39_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_39_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [2, 3, 0, 1]\nD: [1, 2, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ready made chocolate frosting\nHostess (or other mass produced) chocolate cupcakes - Buy as many cupcakes as you want to have Daleks. 
Adjust quantities of other ingredients as necessary.\nChocolate chip morsels\nChocolate sandwich cookies (like Oreos)\nMini chocolate frosted donuts (approx. size of cookies)\nFudge covered marshmallow cookies\u00a0\nSmall pretzel sticks, Chocolate covered Pocky\nCheerios (colored, if you can get them)\nGood & Plenty candy (red bits only)\nA drinking straw to use as a tool\nCookie sheets for transport & storage\nBase to set your work up on (helps with frosting process). I used styrofoam bowls, turned upside down. Set a cupcake upside down on the little card stock base that comes with it in package. This helps you move the piece later.\nPlease both on top of inverted bowl. This makes frosting and deco process easier.. In order,\non top of the inverted cupcake.\nstack\na mini donut,\nan Oreo\n& a marshmallow fudge cookie,\n(with some frosting in between each for 'glue').. Make sure your stack is nice & straight.\u00a0\nCover the donut & cupcake with chocolate frosting.\u00a0\nLeave the cookies showing.. At this stage, move on to the next Dalek, and the next, and the next......etc..\nBringing them all up to this stage of finish together.\nThe rest of the process is detail work, and I find it easiest/most satisfying to get the structural work done on all before moving on to finish.\nHave a couple of cookie sheets on hand to set them on. You can easily pick them up by the little cardboard bases to transfer them there. Then, once room is made, put them - en masse - into the refrigerator to chill until you're ready for finish.\u00a0\nOR - watch them pile up on the stove top and counter.\nThey don't really require chilling, and you may want to keep an eye on them as they multiply ; ). 
Add chocolate morsels around the frosted base, spacing them as they would appear on the real thing.\nI got 3 in a row top to bottom, and about 6 rows all around.\nI do this step to all Daleks before moving on.\nHave Cheerios, Pocky, pretzels, the straw and red Good & Plenty bits on hand for next steps.. You will be using the pretzel rods for arms. Sticking them into the donut layer.\nPick a grape colored Cheerio out. Dip one of the pretzel rods' end into the frosting and place the grape Cheerio on that end.\nThis is the plunger arm, it goes on the left.\nRegular, plain pretzel rod goes on the right.Chocolate covered pocky is the eye piece rod. Dip one end into frosting, same as for arm, and place a lighter colored Cheerio onto it's end. Insert completed part into front of marshmallow cookie top, centered over arms.The Good & Plenty eyes will go on top into the marshmallow cookie.\nYou need the straw tool to pre-drill the hole for the candy, so you have a clean insert, without cracking up the fudge coating on the cookie.. 
With all your Dalek's finished, place them back onto their cookie tray transports and seal them in the refrigerator until you are ready to release them.\u00a0\nThis last step keeps them from melting in warm weather, and stops them from escaping or signaling home.\n(This is what you tell the kids, to keep them out of the fridge until party time ; )\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [2, 3, 0, 1]\nD: [1, 2, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_40_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_40_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_40_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_40_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [1, 0, 2, 3]\nD: [2, 3, 1, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For this salty-sweet ice cream, we need:\u2022 2 eggs\u2022 2 cups of milk\u2022 1/3 cup milk\u2022 1 teaspoon vanilla extract\u2022 1 cup heavy whipping cream\u2022 Sea salt (to taste)\u2022 Blue food coloring (optional)\u2022 Popsicle molds (optional). Got all your ingredients? Great! Start by separating the egg yolks and the egg whites into separate bowls. Now beat the egg whites until they're stiff. Then mix the egg yolks and sugar until thick (don't combine the two bowls like I did the first time).. Eggs all happy and separated? Good! Now slowly bring the milk to a boil, stirring occasionally. Once boiled, slowly mix it into the egg yolk-sugar mixture, mixing well as you do. Once it's mixed, put it back into the pot, heating it up to make a slightly thick custard. DO NOT BOIL THIS! Mix this in with the egg whites. 
Add sea salt until it's salty-sweet. Cool this mixture in the fridge. . Is it cooled? Perfect! Add in the cream, vanilla extract, and food coloring. You may need to add a little more salt because of the newly introduced ingredients. Now, time to make ice cream! Do this step in pint-sized (2 cup) quantities. I know, long and stuff. But work with it! Grab a large bowl, fill it about halfway with ice, and then stir in 3/4 cup rock salt. Now put the smaller bowl in there, almost all the way if possible. Now, put in your pint of ice cream, and mix it for ten minutes. All mixed? Put the whole set up (nested bowls and all) covered with a towel into the freezer for 45 minutes. Kill some time. Build a keyblade. Or beat that one kingdom hearts boss *coughsephirothcough* that's impossible. Time killed? Okay, now take it out of the freezer, and stir it for five more minutes, then put the ice cream in a container and freeze for 2 hours-overnight. Repeat with the remaining ice cream. . Boy, that was time consuming! But congratulations! You're all done now! Share and enjoy it with your friends on the top of a really tall clock tower!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [1, 0, 2, 3]\nD: [2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_41_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_41_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_41_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_41_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 3, 0, 2]\nD: [1, 0, 3, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Melt the butter in a saucepan over medium heat.. 
Add the garlic and cook until golden.. In a bowl, combine the half and half, cornstarch, Italian seasoning, oregano, salt, and pepper.. Add the mixture to the cooked garlic and stir well. Using a whisk, stir until the mixture thickens, about 5-10 minutes.. Add the cheese and stir well.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 3, 0, 2]\nD: [1, 0, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_42_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_42_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_42_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_42_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [1, 2, 0, 3]\nD: [1, 0, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 2 tbsp Olive Oil\n2 cloves of Garlic\n1 Onion sliced\n2 pinches of Chilli Flakes\n2 pinches of Ground Ginger\n400g of Sweet Potato\n400ml of chicken or vegetable stock\n200g Creamed Coconut\n50g Fresh Coriander leaves chopped (optional)\nSalt and Pepper to taste\nCoriander or Spinach to garnish. Heat the olive oil in a large saute pan, add the finely chopped onion and the crushed garlic. Gently fry off for 3-4 minutes until soft and golden brown.. Once the onion and garlic are cooked through add the chilli flakes, ground ginger frying for a further 2-3 minutes.\nThen add the peeled and chopped sweet potato cooking for a further 2-3 minutes.. Now add the stock (hot) \u00a0and coconut. Allow to cook for about 10 minutes.\nTake care when adding hot liquid to a hot pan.\nAt this stage if you wish to add the optional chopped coriander leaves then do so.. 
Remove from the heat and allow to cool before pouring into a blender or food processor.\nBeware of adding hot liquids to a blender. Hot liquids can scald!\nBlend until a smooth and even consistency.\nYou might find that you need to add a little warm water to thin but this is entirely up to you.. Reheat or store in freezer.\nIf you are going to freeze the soup then it is essential to let the soup reach room temperature before putting it in suitable containers and freezing.\nSeason to taste and garnish with Coriander (or anything green), serve with home made crusty bread. Its really delightful.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [1, 2, 0, 3]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_43_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_43_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_43_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_43_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [2, 0, 1, 3]\nD: [1, 0, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. One 14-16oz package of Eckrich Smoked Sausage 1.5 cups milk 2.5 Tablespoons of coconut oil (or use butter if you don't like coconut oil) One 8 oz block of mild or sharp cheddar cheese A 16-17oz package of Gemelli pasta (feel free to try other styles of pasta, but I love this one! The sauce soaks into the folds in the noodles and it's fantastic!) Panko bread crumbs (optional - but if you like an added crunch to your bite, it's delicious!) Small bag of your favorite shredded cheese blend (optional) . 1. A saut\u00e9 pan 2. A large enough Sauce Pan to boil noodles in 3. Noodle drainer 4. 
Cutting board & Knife5. Spatula6. Measuring cup to measure the milk 7. Glass baking pan (optional - if you would like to bake your mac n cheese with bread crumbs after it's cooked)8. A Timer9. A spoon. Dice up your sausage into bite size pieces and put it in the saut\u00e9 pan.Then, dice up your 8 oz block of cheese and leave it off to the side on the cutting board, we will need it soon. Don't add it to the dish yet. . Fill your sauce pan more than half full of water and turn the stove on medium/high heat to get the water boiling. Once the water is boiling, drop in a little over half of the bag of 17 oz gemelli noodles, then set the timer for 12 minutes.While your waiting for the water to boil, start frying the sausage in the saut\u00e9 pan on medium heat, browning the sausage. This will help add that nice smokey flavor to the mac and cheese! . While your noodles are still cooking and you have about 5-7 minutes left until the noodles are done, It's time to start making the cheese sauce. Add 2.5 Tablespoons of coconut oil to the sausage in the saut\u00e9 pan. . Once the coconut oil is fully melted, mix the 1.5 cups of milk in with the sausage and coconut oil! Stir and mix it all together. Turn the heat to medium low or about 3-4 depending on your stove at home. If your milk begins boiling, stir frequently. . After you add the milk in and stir it all up, drop in the cubed up cheese and mix it all together. Feel free to add a small handful of shredded cheese too! Turn the heat to medium and stir frequently until the cheese and milk is melted and blended together. Once the cheese is melted, turn the heat to low. The milk, cheese, and coconut oil will make a nice cheese sauce. At this point, your noodles should almost be done! If they are already finished, wait until the cheese is fully melted to add the noodles to the saut\u00e9 pan. . Once the cheese sauce is melted and blended together, drain the noodles and mix them in with the sausage and cheese sauce. 
Stir them all together, serve, and enjoy! The next step is optional unless you would like to bake yours with bread crumbs to add an added crunch on top! . Scoop your fabulous mac and cheese into a glass baking dish, sprinkle a handful of shredded cheese on top, then cover it with a thin layer of panko bread crumbs. Bake at 325 for 10-15 minutes. . Remove from the oven and enjoy! \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [2, 0, 1, 3]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_44_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_44_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_44_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_44_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [1, 0, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You can, of course, use your own preferred pastry and filling, either homemade or bought, but here's what I did: Lydia's Dia de los Muertos savoury pie, feeds 3-4, for more add a hearty side of veg! For the pastry:\u00a0 225g (8oz) plain white flour, plus extra for dusting 50g (2oz) unsalted butter, diced, plus extra for construction 50g (2oz) lard or veg shortening (or just make the whole 100g up with butter) 2 eggs splash of water - I used about 2tbsp For the filling: - adjust this to your liking! 
1 medium brown onion, chopped 2 cloves garlic, chopped 4 carrots, peeled and cut into 1.5cm discs 13 mushrooms (I used white button mushrooms), cut into quarters half a small pumpkin, chopped to 1.5cm chunks 2 large sprigs of rosemary 1 tbsp olive oil 1 tsp mustard seeds 2 tbsp tomato puree 1/4 tsp celery salt 1 tbsp gravy granules salt and pepper to taste splash of worcester sauce 1/4 tsp dried oregano 1/4 tsp dried sage 1/4 tsp dried mixed herbs 1 tbsp wholegrain mustard Tools: Chopping knife and board Wooden spoon / stirrer Pastry brush (or your fingers!) Tin foil Food wrap / Food bag Baking paper Rolling pin Pie dish about 7.5 - 9 inches in diameter Large frying pan / cooking pot with lid Fridge Oven Kettle About 2 hours.. To make the pastry, rub the fats into the flour (either with your fingertips or by cutting through the mixture with a knife) until you get a bread-crumb like texture. To this add 1 lightly beaten egg (leave the other egg for an egg wash later) and add the water a bit at a time, while stirring / working with your fingers, until a dough is formed. You won't need very much water - about 2 tbsp was all I needed. Cut off one third of the dough and flatten both dough balls into discs. Wrap in food wrap / place inside two food bags and refrigerate for 30 minutes. Meanwhile, start preparing the filling.\u00a0. Heat the oil in your pan and throw in half of the fresh rosemary (chopped, stalk removed), and your herbs. Add onions and cook on medium heat for about 5 minutes, stirring. Next, add the tomato puree, then the garlic, followed by the rest of the veg. Stir well and cook, covered, on a high heat for 5 minutes. Boil about 250ml water and add to the pan - only enough to reach halfway up the veg in the pot. Reduce heat to medium. Add the gravy granules, worcester sauce and season to taste with salt and pepper.\u00a0 Stir well, pushing the veg down into the water, place the remaining rosemary sprig on top and continue to cook, covered. 
After about 7-10 minutes remove the rosemary and continue to cook covered until the carrots and pumpkin are almost done - about 10 more minutes. Give it a taste and adjust the herbs as necessary. Finally, uncover and cook on low to medium to allow the filling to reduce (liquid to evaporate a fair bit).\u00a0 Preheat the oven to 180\u00b0C (350\u00b0F, gas mark 4).. Shape two pieces of foil (each about 50cm long) to form the cheek indentations (see photo) and fix in place on the pie dish with a piece of butter. Roll out the large disc of pastry on a floured surface, to fit the diameter and depth of the pie dish. Lift the pastry onto the rolling pin and gently place into the pie dish, easing it into the nooks and crannies with your knuckles. Put the dish back in the fridge. Roll out the\u00a0large disc of pastry on a floured surface, to fit the diameter of the pie dish. Cut out eyes (in the shape of aviator style sunglasses) and the nasal cavity (upside-down thin heart). Roll out the cut-outs of pastry into a long thin rectangle, and cut rectangles out for the teeth. Make about 20 that are almost square and 4 that are slightly thinner, for canines. Lightly beat the remaining egg and brush onto the pastry lid in a smile shape, and arrange your pastry teeth onto it. Transfer your pastry onto a baking sheet and refrigerate until needed.. Once your filling is ready, there'll be very little liquid left. This took about 45 minutes from the moment the oil was heated right at the start. You can either leave the filling to cool to room temperature, which will prevent the filling from melting your pastry and result in a slightly better structure, or you can work quickly! Ready? GO! Place the filling in the base and distribute evenly. Brush the pastry edges with beaten egg. Place the skull-shaped top, erm... on top, and seal the edges (push the pastry top and bottom together) using your fingers. Trim the excess from the edges. 
Brush the top with beaten egg - be careful around the teeth as vigorous brushing may dislodge them! Bake in your preheated oven for 45-50 minutes until nicely golden. Remove from oven and leave to cool for 5-10 minutes before removing from base with a slotted spoon and digging in!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_45_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_45_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_45_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_45_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [2, 0, 3, 1]\nD: [0, 3, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 2-3 pounds boneless pork shoulder (aka boston butt) 1 tsp Oregano 1.5 tsp Cumin 2 tsp Salt 1 tsp Pepper 10 cloves of Garlic 1 medium Orange (or about 1/3 cup of orange juice) Onion (optional)(Note: I have coconut oil in the picture, but I didn't actually use it... ooops!). If you are using onions, then cut it into quarters and place it in your crock pot. Then, place the pork on top of the onions. If you're not using onions, then place the pork directly in the crock pot.If you don't have a fancy citrus press or a citrus reamer, then do what I did to juice the orange over the pork:Cut the orange in half. Stick a fork in the middle of an orange half as a makeshift reamer. Squeeze the orange shell in half while moving the fork around to break up the orange pulp. It's okay to get some pulpy bits on your pork, but you will want to remove any seeds that fall in. . 
I have to admit that I love garlic, so I use a lot! If you have some really large garlic cloves, then you can cut it in half like I did. Then using a paring knife, cut slits into each side of the pork and insert the garlic into the slits.. Mix the spices together in a small dish and sprinkle it evenly onto the pork. You can also pat the spices into the pork gently. . Set the crock pot to low and cook for 6-8 hours. Go to work, run some errands, hit the gym, and come home to the delicious aroma of perfectly cooked carnitas! Use two forks to shred it up and it's ready to be paired with some beans and yellow rice, stuffed into purple corn tacos, or anything! Note: If you want it crispier, you can put the shredded pork on a cookie sheet and broil for 3-5 minutes. I never get to this step because it smells so good, I have to dig in! ;-) \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [2, 0, 3, 1]\nD: [0, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_46_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_46_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_46_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_46_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 1, 0, 2]\nD: [0, 1, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Take your almonds and put them into a food processor and pulse them until they become crumbs.Now drain your Dates (and save the soak water for later on) and process the Dates with the Almonds until they are well combined and very sticky.Press into a lined springform pan and set aside.. I don't know about you, but I love Ice Cream! 
So if you want to make this part a thicker layer on the cake...just put in more frozen Bananas!Put the frozen Bananas and the Vanilla into the blender and whiz away until it's thick, creamy, and smooth. Keep an eye on it as it doesn't take a long time to change!Spread this on top of the pie crust and set aside.. Place the Dates, a 1/4 cup of the date soakwater and the Almond Butter into the food processor and process until it's the smoothest that you can get it.Spread your Caramel on top of the Banana Ice Cream layer.. Place all the ingredients into the (cleaned) food processor. If you aren't using the Agave Nectar, just use 1/4 cup of the date soakwater and 1/2 cup of Dates.Again, just process everything together until it is smooth!Layer this on top of the Caramel, being careful not to mix the Caramel and Chocolate layers together!Put the Cake into the freezer and freeze overnight.How easy can it get?!?. When the time has come and you are ready to indulge or impress your friends...Take it out of the springform pan and let it sit out for about 5-10 minutes, then cut it into the sizes you wish to serve, but just before you serve it, sprinkle a little bit of pink Himalayan salt on top. 
This will bring out the chocolate flavor even more!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 1, 0, 2]\nD: [0, 1, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_47_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_47_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_47_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_47_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 1, 0]\nD: [3, 2, 0, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need: Food Processor or Strong Blender 2 Cups Spinach (Yup that's right, just stay with me!) 1 Cup Raw Almonds 10 Large Dates (Remove pits) 3/4 Cup Peanut Butter 1/2 Cup Honey 1/2 Cup Almond Meal 1/2 Cup Oats 1 Tbs. Flax Seed 1 Tbs. Green Superfood Powder 1 Tbs. Coconut Oil 1/2 Cup Coconut Flakes 1/4 Cup Coconut Flakes (To roll cookies in after). Place all ingredients in the food processor EXCEPT: Flax seeds Oats Almond Meal Pulse in food processor until everything is a small texture, should be very sticky at this point.\u00a0. Pulse in with the wet ingredients: Flax Seeds Oats Almond Meal Pulse everything together until it starts to ball up.. Make desired size cookie balls and roll in coconut flakes.\u00a0. Enjoy your Popeye Power Cookie immediately (and watch your muscles bulge!) or save them for later. 
If you want to save them, wrap each cookie in plastic wrap and put in refrigerator.\u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 1, 0]\nD: [3, 2, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_48_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_48_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_48_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_48_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [0, 3, 2, 1]\nD: [3, 1, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \u2022 1/2 pound sliced bacon (diced)\u2022 1/2 medium sweet onion (chopped)\u2022 3 eggs (lightly beaten)\u20222 cups frozen shredded hash brown potatoes(thawed)\u2022 1 cup shredded cheddar cheese\u2022 3/4 cups 4% cottage cheese\u2022 5/8 cups shredded Swiss cheese6 servings. Preheat oven to 350 degrees. . Cut up the bacon and onion. Dice the bacon and chop the onion.. In a large skillet cook the bacon and onion on medium heat until the bacon is crisp. If you need to put the bacon in the microwave start with 30 seconds and add any additional time needed. When it is cooked drain the bacon and onion.. Lightly beat the eggs and put them in a large bowl.. Shred the potatoes or just buy shredded hash browns and put them in the large bowl.. Add the remaining ingredients into the large bowl. (Shredded cheddar cheese, cottage cheese, shredded Swiss cheese, bacon and onions). Next transfer the ingredients to a 9 inch round or square dish. Put the dish in the oven for 35-40 minutes. 
When done let stand for 10 minutes and enjoy your \"Go To Omish Egg Caserole\".\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [0, 3, 2, 1]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_49_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_49_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_49_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_49_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 1, 2, 0]\nD: [2, 1, 3, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1 \u00bd cups pureed white part of a watermelon \u00be cup of wheat flour \u00bc cup rice flour1 teaspoon Red chili powder1 small onion2 green chilies15 chervil leavesSalt to tasteOil for frying. finely chop onion, green chilies and chervil.Mix rice flour and wheat flour together. Add red chili powder. Add salt, chopped onion, green chilies and chervil. And the pureed white part of a watermelon.Adjust consistency with water as required and make a smooth batter.Cover and keep aside for an hour. While it sits there let\u2019s make the Watermelon chili chutney. you will need:Chili sauce 2 tablespoonsWatermelon juice 2 tablespoonsSalt to tasteButter (not shown in the picture)Ginger and garlic paste (not shown in the picture)method:Add some butter to a small wok and add ginger garlic paste and saut\u00e9. Then add the chili sauce and watermelon juice and let it come to a boil and chutney is ready.. Heat a griddle with oil, when hot drop a ladleful of batter and spread into a circle with the back of your spoon. Place a chervil leaf in the centre. 
Let the pancake cook and flip over and cook the other side as well. Serve with hot watermelon chili chutney. :)Vote for me please. I'm aiming for one of the blenders :( I'm moving to NYC next month, I can\u2019t afford to buy furniture\u2019s and kitchen utensils all at once. Your vote can take me near to winning it. Thank you soo much xo\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 1, 2, 0]\nD: [2, 1, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_50_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_50_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_50_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_50_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [3, 1, 0, 2]\nD: [0, 3, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You need the following ingredients to make these delicious pizza pancakes:\n- 400 gr. flour\n- 3 eggs\n- 900 dl. milk\n- tricolor peppers\n- 2 onions\n- 250 gr. mushrooms\n- 2 packs\u00a0 of bacon\n- oil\n- a pan. \nTake a bowl and add the flower and half of the milk.\nMix the dough with a spoon or whisk untill it is smooth and firm.\nAdd the rest of the milk, two eggs and a little salt in the dough and mix it all together.\nMake the dough smooth without lumbs.. Take a cutting board and knife.\nCut the peppers in pices, the mushrooms in slices and the onions in half rings.\nHeat up the pan with some oil and bake the veggies in separate pieces.\nMake sure you have everything ready for baking and easy to reach.. 
\nHeat up the pan and poor a little oil in.\u00a0Make sure the pan is really hot.\nPut in 2 slices of bacon (or less/more) and quickly poor the dough in the pan.\nFill up the pan nicely, not too thick not to thin.\nWhen the dough is still wet put in some of the unions, paprika and mushrooms.\nFlip the pizza when the top is dry and the bottom brown.\nWait till the other side gets a little brown and warmed up.\nTake the pan of the stove and put the pancake on a plate\nThe pancake is ready to eat! Poor some syrup on the Pizza Pancake if you like and you can enjoy your lovely Pizza Pancake!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [3, 1, 0, 2]\nD: [0, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_51_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_51_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_51_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_51_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 1, 0, 3]\nD: [1, 2, 3, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Dough Ball (store purchase) - oneShredded Mozzarella Cheese - 1 cupRicotta Cheese - 1/2 cup* Although I used a different brand this time, Polly-O Ricotta Cheese (http://www.kraftbrands.com/pollyo/) is my favorite because of creaminess.Extra Virgin Olive Oil - 2 table spoonsFlour - 1/2 cupPeaches - one or twoRaspberries - 1/4 cupBlueberries - 1/4 cupFigs - a fewArugula - 1/2 cupBasil - a few leavesHoney - 1/4 cup* Sliced Almonds (option)* Cocoa Powder (option). Cut the dough ball into 3 pieces by using knife.. 
When you stretch the dough, you have to apply some flour so that they don't stick to your hands.. I preheated the grill really high, and then turned it down to low, so the dough doesn't get burned.However, some people like it crispier. If so, you can keep the flame high and grill it for 30 seconds or so.It's totally up to you!When you started to see dough rising, then flip it over to grill the other side.. I applied some olive oil on the top side for some flavor.. Slice peaches and figs. Drizzle some olive oil over arugula and toss them, so they stay fresh.Tear some basil leaves into smaller pieces by hand.Start with adding Mozzarella cheese on the crust. And add some Ricotta cheese just for peach and berry pizzas.#1) Arugula and figs#2) Peaches#3) Raspberries and blueberriesDrizzle some honey over the pizzas!. Put the pizzas back on the grill and cover the lid.It only takes a few minutes since I am just warming up the toppings and melt the cheese.. This step is an option. I added basil and sliced almonds on the peach pizza. And I sprinkled some cocoa powder on the berry pizza.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 1, 0, 3]\nD: [1, 2, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_52_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_52_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_52_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_52_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [0, 2, 1, 3]\nD: [0, 3, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Fire MeadIngredients\n1 kg blackberry honey\n3 black cardamom pods\n6 cloves\n2 dried ancho chili peppers\n1 cup plain black tea\n3 small blood oranges\n1 pkg Lalvin EC-1118Equipment\n1 sanitizing vessel\nlots of sanitizer\n1 - 2 x 1 gallon (3.78 L) glass carboy or other fermenting vessel\n1 funnel\n1 big metal spoon\n1 coffee mug\n1 brewing pot\n1 pair of scissors\n1 airlock setup\n1 siphon hose, at least 4 ft in length\n11 beer bottles and caps (or 5 pressure-capable 750 ml wine bottles and corks)Notes on Ingredients and EquipmentBlackberry Honey: Any sweet honey with a nice fruity flavour will do, but blackberry is just sooo good.*Black Cardamom Pods: Other cardamoms can be used as a substitute but they won't give it a nice smoky flavour.*Blood Oranges: Try to get blood oranges because they have a certain intensity and tang, but any other sweet and sour orange will work great. *Sanitizing Vessel: This should be solid and large enough to hold all the equipment you need for each step of the brewing process. I use a big Rubbermaid container.Sanitizer: Make sure you read the instructions for your sanitizer and get something food-safe. You can use unscented household bleach diluted to 4ml per Liter of water (1 tbsp per gallon) but make sure you rinse it really well or the mead will taste off.Carboy: Any fermentation will do as long as it's nonreactive, BPA free and foodsafe with some kind of one-way air release valve or a hole that will fit an airlock setup.Coffee Mug: Doesn't have to be a coffee mug but it should be big enough to hold yeast and a cup of warm water. This will be your yeast starter vessel.Airlock Setup: Usually a 3-piece thing that fills with water only lets the CO2 escape and no contaminated air into the brew. The important thing is that no contaminated air gets into the mead. 
A simple balloon with a pinhole in it covering the top of the fermentation vessel works well.Beer or Wine Bottles: These must be capable of withstanding the pressure of a carbonated beverage, no bottle-splosions here! If you choose to use wine bottles get ones with swing tops or champagne corks with cages.\n*Don't listen to me about the ingredients, do whatever you want with your own mead and then post awesome recipes in the comments!\nImage: My first brewed batch of Fire Mead.. \n Clean your kitchen and then sanitize everything. I mean everything that you need for the brewing step, that is.\nFor making the must you need to\u00a0sanitize\n\n\t\tthe fermenting vessel\n\t\tairlock parts\n\t\tfunnel\n\t\tcup or other yeast starting container\n\t\tspoon\n\t\tscissors\n\t\tyeast packet\n\t\tsealed honey container\n\t\tbrew pot and lid\nFollow the directions that come with your sanitizer.\u00a0\nIf you're wondering, must is a solution that you feed the yeast to make wine or mead. It can be honey and water or grapes and water... it could even be tomatoes, sugar and water!\nOnce everything is sanitized and ready you can boil some water for the yeast starter. Dissolve a teaspoon of honey with the recommended amount of water on the package. Once the water has cooled to the recommended temperature scatter the yeast over the top. Give them a gentle stir so that most of them fall to the bottom of the dish and they can get nice and hydrated. Let the yeast bloom and come back to life. They'll start making bubbles and floating to the top of the water, it takes about 10-15 minutes.\nMeanwhile, get the brewing pot on the stove and add half a gallon of water. Put it on the stove and turn it on low. The must should never boil, or even simmer. Honey is a very delicate flavour and applying too much heat can destroy the flavour so make sure it only steams, and doesn't bubble. Chop the ancho chili up with the scissors and add it to the pot with the cloves, cardamom and honey. 
Give it a good stir until the honey is dissolved and zest those oranges. Segment the oranges so that there is no pith in the must. Add the orange fruit and zest and let steam for 20 minutes. Skim off anything that rises to the surface.\nLet the must cool, putting the pot with it's lid on it in a sink of cold water.\u00a0Once\u00a0the must has cooled to the same temperature as the yeast starter pour the must into the sanitized fermenting vessel and then add the yeast starter. Top it up with some warm water and seal with an airlock. Put it somewhere dark and cool and wait for 1 -2 weeks until there is about a 2.5 cm yeast layer on the bottom of the carboy. If you don't have somewhere dark cover it with a towel.\nImages: Beer bottles with sani-brew sanitizer, sanitizing a 1 gallon carboy, the mead pitches and topped up in a 1 gallon carboy.. Sanitize another 1 gallon carboy or fermenting vessel, a siphon hose, and the airlock setup. Cover the top of the carboy with aluminum foil while the equipment sanitizes. If you don't have a second carboy then sanitize a container that can hold the mead while you clean up the original fermenting container.\nPut the mead somewhere a couple feet off the ground and siphon the mead into a new carboy. Leave the spices and chili behind with a little bit of mead and the lees. Lees are the dead yeast cells that pile up on the bottom of the fermenting container and are a normal part of the process. They're pretty good for the garden if you water it down before you use it.\nOnce most of the mead is in the new fermenting vessel, top it up with a bit of water, if necessary and pop the airlock on. Wait for another month or so, giving the mead a soft kick every week or so to loosen up the CO2.\u00a0\nImages: Lees and leftovers from racking fire mead, Siphoning out the mead.. To bottle we need to sanitize the bottles, the siphon hose, and the bottle caps. 
Give the kitchen or wherever you'll be bottling a good clean and clean the bottle capper too.\nIf you have a bottle filler you should sanitize it but if you don't making a kink in the siphon hose works well too.\nOnce everything is clean and sanitized add 3.5 ml (3/4 tsp) sugar to each bottle and siphon the mead in. Cap the bottles and get ready for\u00a0the hard part. Store it somewhere cool and dark and wait for 3-6 months for the mead to become carbonated, and age a while.\u00a0\nWhen you can't wait any more, pop one open and enjoy!\nImages: Siphoning the mead into bottles, siphoning the last bits of the carboy, the final product.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [0, 2, 1, 3]\nD: [0, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_53_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_53_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_53_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_53_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [1, 2, 3, 0]\nD: [0, 2, 1, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 2 lb. or 907 grams Clementine Oranges 2 lb. or 907 grams GrapefruitLemon zest and juice from 1 lemon6 cups water6 cups sugarAdding a sweetener is must with marmalade; it is very sour and rich in vitamin C, you can go with Splenda or some other sweetener. Just match the recipe cup for cup, or \u00be cup sugar and \u00be cup Splenda for low sugar. 
For low sugar or no sugar added you will need to add pectin to get the marmalade to thicken.1 sachet of pectin powder.I used No Cook Pectin, if you use no cook pectin or regular pectin you will still need to process the marmalade. Processing allows you to store the jam at room temperature in a dark place and it doesn\u2019t affect the pectin negatively. Once opened you will need to refrigerate the marmalade, refrigerated the marmalade will keep six months.Equipment: 10 (8-ounce) canning jars with rings and lids, funnel, tongs, ladle, and 12-quart pot and a cooking thermometer.. Wash the oranges, grapefruit, and lemon thoroughly.Cut the oranges and grapefruit into thin slices, removing the seeds as you go. Stack the orange and grapefruit slices, and then cut them into quarters or more if that is your preference. Place the oranges and grapefruit into an 8-quart stainless steel pot. Add the lemon zest and juice and the water to the pot, set over high heat and bring to a boil, approximately 10 minutes. Once boiling, reduce the heat to maintain a rapid simmer and cook, stirring frequently, for 40 minutes or until the fruit is very soft.While the fruit is cooking, fill a large pot (at least 12-quart) 3/4 full with water, set over high heat and bring to a boil. Place 10 (8-ounce) jars and rings, canning funnel, ladle, and tongs into the boiling water and make sure the water covers the jars by at least an inch. Boil for 10 minutes. Turn off the heat, add the lids and leave everything in the pot until the marmalade is ready.. Increase the heat under the orange mixture to return to full boil. Add the sugar or sweetener and pectin, and stir the mixture continually, until it reaches about 222\u2070 F or 105\u2070 C on a deep fry or candy thermometer. Cook until marmalade darkens in color, approximately 15 to 20 minutes, the marmalade may not darken if you make no sugar added marmalade. You may need to adjust the heat in order to prevent boil over. 
Test the readiness of the marmalade by placing a teaspoon of the mixture onto the chilled plate and allowing it to sit for 30 seconds. Tilt the plate. The mixture should be a soft gel that moves slightly. If mixture is thin and runs easily, it is not ready.. Place your jars and lids in a pot of water and bring the water to a boil. Remove jars from the water and drain on a clean towel. Place a canning funnel onto the top of 1 of the jars and ladle in the marmalade just to below the bottom of the threads of the jar. Repeat until all of the mixture has been used. The amount of marmalade may vary by 1 to 2 jars. Wipe the rims and threads of the jars with a moist paper towel and top each with a lid. Place a ring on each jar and tighten.Return the jars to the pot with boiling water, being certain that they don't touch the bottom of the pot or each other. (If you don't have a jar rack, try a round cake rack, or metal mesh basket. Even a folded kitchen towel on the pot bottom will do in a pinch.) Add additional water if necessary to cover the jars by at least an inch. Boil for 10 minutes. Using canning tongs, carefully remove the jars from the water, place in a cool dry place and allow to sit at room temperature for at least 24 hours before opening. 
Once open, store in the refrigerator.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [1, 2, 3, 0]\nD: [0, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_54_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_54_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_54_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_54_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [2, 3, 1, 0]\nD: [1, 2, 3, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients:1 package of puff pastry sheets. These are located int he frozen dessert section of the grocery store.1 apple. I used granny smith, but I but these would be great with honey crisp too.1 TBSP caramel topping. Any brand will do. This Smuckers stuff was so good I wanted to drink it.1 Tsp honeyA sprinkling of Cinnamon and Sugar.Tools:Keep some flour handy in case you re-roll your dough, or find it gets too sticky.Knife, rolling pin, apple corer, baking sheets, spoon, microwaveable bowl. Your very first question is probably going to be \"WHERE DID YOU GET THAT AWESOME COOKIE CUTTER?\"I have a globe trotting friend who visited the Mutter Museum in Philadelphia, which is apparently a museum of medical oddities. If you're not conveniently located near the museum, you can get yours here: http://www.muttermuseumstore.com/merchant2/mercha... If you're feeling really ambitious, you could probably also splice together your own cookie cutter from existing gingerbread men, or freehand a design with a knife. I recommend the official conjoined twins for consistency.Allow the frozen puff pastry to thaw. 
Ideal shape cutting happens when the dough is soft enough that it no longer fights to be unwrapped, but not so warm that it is sticky. If you're using the Mutter twins cutter, you'll get about 4 sets of twins per pastry sheet.You can gather your excess dough and re-roll it to cut more shapes. Just remember to dust the dough with flour to keep it from sticking to your tools.*Puff pastry dough is VERY elastic. If you re-roll and cut more twins, expect those twins to contract and become smaller and thicker than your originals. This is a freak show, so just embrace the anomalies.. At this time, go ahead and pre-heat your oven to 400 degrees. It'll be ready to go by the time you're done prepping apples.Use your apple corer to slice the apple into sections. Now take those sections and use your knife to slice them thin. It doesn't have to be paper thin, but avoid large chunky pieces for a slick, finished look. Always slice away from your hand, unless you have vampire guests who like extra sauce.. Place your apple slices onto the siamese twin pastry shapes. 2 or 3 slices seems to work well, depending on how you want to arrange them. I have 2 slices going down the legs because it reminded me of little pants.Pop these fancy little freaks in the oven for 10-12 minutes. Take a look at them as you approach 10, to make sure the edges aren't getting too brown for you. I take mine out on the early side because our oven seems to run hot.. In a small microwaveable bowl, mix 1 hearty TBSP caramel topping with 1 tsp honey. Microwave no longer than 10-15 seconds on normal power. They will blend together smoothly when stirred. Try not to eat the whole cup before the tarts emerge from the oven.I've added the honey because I think it helps make the caramel easy to drizzle, and it gives an additional layer to the flavor. This recipe is also excellent with JUST honey, if you prefer.. Remove your twins from the oven when the edges are golden brown. 
You will notice that the pastry has puffed up and some areas may be raised above your apple slices. That is to be expected, and won't interfere with the readability of your shape.While the twins are still warm, use a spoon or other utensil to drizzle your caramel/ honey mix on top. No need to try doing fancy designs, because the sauce will warm and just do whatever it wants anyway. Sprinkle on sugar and cinnamon to taste.. Your Siamese Twin Tarts are ready to be unleashed upon your unsuspecting guests! Keep napkins nearby, because that delicious caramel sauce tends to remain pretty gooey. Your finished tarts should look something like this. The little guy on the left is an example of a \"re-roll\", where the dough contracted and made him smaller.. This recipe can work with a variety of creepy shapes. For example, my other favorite cookie cutter, the Fetus. I made Fetus tarts last x-mas (because.... baby Jesus?) and it was definitely something my guests had never seen before. This guy was also a gift from a friend and is available at http://hogmalion.com for those who dare.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [2, 3, 1, 0]\nD: [1, 2, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_55_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_55_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_55_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_55_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [0, 1, 2, 3]\nD: [2, 0, 1, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Take the fish and clean and wash,remove the innards, if you do not have cleaned the fish in the store.. 
Take a pot of water, bring the water to a boil and put in it tea bags. On low heat let them simmer for 5 minutes. Cool water to room temperature, put it in salt and sugar. In my case, I cut the fish head,no one will eat them.:))Put them to glass container. add pepper and bay leavespour the brine to cover the fish. put the container in the refrigerator.Two days later, took the container and turned the fish in the container. Put the container in the refrigerator for two more days. Take the fish out of the brine and hang it for 1-2 hours. Allow to drain the brine.\u0411rush the fish with olive oil.You can cut into meal or can food plastic wrap and put in the refrigerator.. \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [0, 1, 2, 3]\nD: [2, 0, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_56_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_56_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_56_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_56_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [2, 3, 1, 0]\nD: [2, 0, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For this recipe you will need:6 to 6 1/2 cups of all-purpose flour 3 tablespoons sugar 2 envelopes of quick-acting yeast (or 4 1/2 teaspoons) 2 teaspoons salt 1/4 teaspoon baking soda 1 cup buttermilk NOTE: You can substitute 4 tablespoons of dry buttermilk powder instead and add it to the dry ingredients. Increase water to 2 cups 1 cup water 1/3 cup butter, cut into pieces. 
Add 3 cups of the flour, the sugar, yeast, salt and baking soda (and buttermilk powder, if using) to the bowl of a KitchenAid mixer with a dough hook.Using your KitchenAid mixer with the dough hook, mix the dry ingredients on speed 2 (low speed on a hand mixer) for one minute to combine. NOTE: If you do not have a mixer, just use a large bowl and a wooden spoon, or a hand mixer.. Heat buttermilk, water and butter until very warm (120 -130 degrees F); butter does not need to melt. To gauge the temperature, use a food-grade thermometer. If you do not have a thermometer, make sure the water is warm to the touch, not hot. If your water is too hot, the yeast will not activate.Add to flour mixture and mix on speed 2 (low speed on a hand mixer) for one minute. While KitchenAid mixer is still on speed 2, add enough of the remaining flour, 1/2 cup at a time, to make a soft dough. NOTE: I usually use all 6 1/2 cups of flour, but weather conditions can cause this to change. Only use what you need in order to allow the dough to \"clean\" the sides of the bowl as in picture 2.. Continue with the KitchenAid mixer on speed 2 for two minutes in order to knead the dough. NOTE: if you do not have a mixer, instead knead with your hands for 6 - 8 minutes until dough is smooth and elastic. If you are unfamiliar with how to knead dough, check out this excellent tutorial from Allrecipes.com: Kneading TutorialKneading the dough is important in order to make sure the gluten in the dough is well developed. Do not skip this step!. Remove dough from bowl and shape it into a ball on a floured surface. Let it rest on the floured surface, covered with a dry towel, for 10 minutes. You will notice the dough has risen slightly and has a smoother appearance.. On a lightly floured surface, cut the dough in half. Roll out one half of the dough into a 7\" x 12\" rectangle If necessary, use your hands to gently stretch and shape the dough to create as even a rectangle as possible.. 
Starting with the short end, roll the dough up tight into a log. Pinch the seams together. Seal the ends by flattening with the side of your hand to create a flap. Fold the flap underneath the log. Place the log, seam-side down, in a greased 8 1/2\" x 4 1/2\" loaf pan. Repeat with the remaining dough.. Place the pans in a warm, draft-free location and cover with a dry cloth until double in size, about 30 - 45 minutes.. Bake at 375 degrees F for 30 - 35 minutes or until done. Remove from pans; if desired, brush on melted butter with a pastry brushLet cool on wire racks.Slice and Enjoy!!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [2, 3, 1, 0]\nD: [2, 0, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_57_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_57_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_57_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_57_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [0, 2, 3, 1]\nD: [3, 1, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Once your wooden applique has been cleaned, let it dry and place it on a non stick surface. We used a piece of glass. Melt the ComposiMold-FC and pour a thin layer over the wooden piece. Be sure it covers the entire surface, including the edges. Once this layer has cooled for approximately 20 minutes, pour another layer of ComposiMold-FC over the first. The second layer will add strength to the mold. The first layer picked up all the details. Use a toothpick to move any bubbles up and away from the surface of the wooden applique. 
You do not have to pop them or completely remove them, just be sure they are not touching the surface.. Once the second layer of ComposiMold-FC has cooled (another 30 mins or so) you can flip the entire mold over and bend it away from the wooden applique. . Roll your favorite fondant out to 1/8\" and lay this sheet into the mold. Press it into all the details of the mold being sure to get it into all the edges too. . Use your thumb to press and rub the excess edges of fondant away from the final piece. This will tear the extra away. Then you can gently roll the torn edge in towards the final piece to create a nice sharp edge once it's flipped back over. . Carefully flip the mold over again with the fondant in place. Then you can gently lift an edge of the mold and begin to bend it away from the fondant. You might have to hold an edge of the fondant down to get started, Then the mold will easily peel away from the fondant. Check out how the mold picked up every detail from the wooden applique. . Using edible spray paint and sugar dust we were able to paint the eagle to look like more realistic. To transfer the fondant to the side of a cake, simply roll the eagle onto a rolling pin and un-roll it onto the side of the cake. Remember to re-melt to re-use the ComposiMold-FC for all your cake decorating projects. 
\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [0, 2, 3, 1]\nD: [3, 1, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_58_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_58_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_58_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_58_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [3, 0, 1, 2]\nD: [0, 2, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Select tender ridge gourds which are not affected by any pests.. The skin of matured ridge gourds may be fibrous and will not taste good. Also make sure that the gourds are not sprayed with pesticides.Wash and cut the ridge gourds to manageable size.Peel the skin using a vegetable peeler and collect them in a platter.. Ingredients requiredHandful of shallots. You can use one or two big onions also.Three to four Garlic clovesFour Green chilies. If you want the chutney to be spicy, you can add more.About half a cup of raw coconutSmall piece of tamarindTwo teaspoons of split black lentilsSalt to tasteAbout a tablespoon of cooking oilPeel skins of shallots and garlic. Cut all ingredients into small pieces, so that all of them can be cooked evenly.. Heat a frying pan over medium fire and add a tablespoon of cooking oilAdd the split black lentils to the heated oilAdd the piece of tamarind and saute for a minuteAdd sliced shallots and garlic.When the shallots and garlic pieces turn to a light brown color, add sliced coconut piecesSaute for one more minute. 
Add the ridge gourd skin to the sauteed ingredients in the panStir fry till the raw smell from the ridge gourd skin disappearsAdd salt, mix well and remove from heat.Allow the ingredients to cool to room temperature.. Once the ridge gourd skin mix cools down, transfer the ingredients to a mixer / grinder jarGrind the ingredients to make a rough paste.From time to time turn over and mix the ingredients in the jar so that all ingredients are ground evenly.Add little amount of water if required.Once the ingredients are ground to required consistency, transfer to a bowl.Two or three teaspoons of this chutney is served as a side-dish along with rice. You can also add little more water and a little bit of salt to taste and make it thin so that this can be served with Idly and Dosa.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [3, 0, 1, 2]\nD: [0, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_59_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_59_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_59_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_59_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [2, 0, 3, 1]\nD: [2, 0, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. To make your campfire peach pies you will need:Ingredients:-Brown sugar-Butter-Pie crusts (I used the Pillsbury brand ones)-PeachOther stuff:-Aluminum foil-Bowl-Rolling pin-Roasting stick-Firepit-Firewood. To peal a peach you need to fist boil some water and then put it into a bowl/container and submerge your peach in that water for about 1-2 minutes. 
Next submerge the peach into cold water for 1-2 minutes. Then take your peach out and it should be easy to slip the peel of the peach with your hands. . The next step is giving your peach a tasty coating. For this you will need to make a mixture that is 2 parts brown sugar to 1 part butter. Slather this all over your peach. . Lay out your pie crust across a plate, and place your peach in the center. Wrap and fold your crust up and around your peach so that it will have some excess on the top. Remove the extra and roll that out again. Wrap your peach until you've used all the dough from one circular pie crust.. Wrap your pie up in some tin foil with the shiny side facing in. I found that it was best to do 3 layers so that the tin foil wouldn't burn off or rip if you moved it around (this happened the first time I tied this when I only USD one layer). Wait until your fire has gone to coals before you put in your pie. I found that the best way to get your pie into you fire was by puncturing a hole in the tin foil at the top with a roasting stick and then sticking a prong of the roasting stick into that hole and guiding it into the firepit. I would stick it in again to move the pie around as needed if there was a better area for cooking in the coals, or if it was getting too hot. . Remove your pie form the fire pit using the roasting stick and place it on some pieces of wood to cool (5 minutes). One it is cooled remove the tin foil and you should have a lovely pie. You can eat it by just biting into it or cutting it up or serving it with ice cream. 
Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [2, 0, 3, 1]\nD: [2, 0, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_60_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_60_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_60_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_60_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [1, 2, 0, 3]\nD: [0, 3, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. One bread knife, one or more round rolls (buns), a number of eggs equivalent to the number of rolls.. Cut off the top of the roll.. Take out the inside of the roll. Do not take too much as the egg will leak. There has to be enough room to accommodate one egg and some spices and top ups.. Load the egg into the roll and top it up with your favorite spices. I have chosen a bit of salt, loads of white pepper and loads of red paprika powder. U can use chilli peppers, I would go for Scotch Bonnet if I had some, (u have to remember that they r very hot!!!) cut into small pieces, without any problem making the dish very Mexican ;). I added a bit of low temperature melting fat chase as well. You egg roll is ready for the oven.. Load the roll into the oven for about 20-25 min @180 centigrades thats 356 Fahrenheit. The time will vary depending on the oven type, forced air circulation etc. so you will have to experiment with your oven go get the desired effect. I usually try to have the egg roll very soft inside with the liquid yoke and the white barely done. 
You simply have to observe the time required for the desired effect in your oven so next time you will be able to set the timer up.. You egg roll is nice and ready. Because no fat is used for the egg processing the white is going to be very delicate and hot, so you will have to wait a bit longer than with the regular fried egg as the bun works like a thermo flask keeping the egg nice and warm for longer. Load your favorite ketchups and garlic mayonnaises or mustard and enjoy your extremely easy but tasty and good looking appetizer.That's it hope you will like it.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [1, 2, 0, 3]\nD: [0, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_61_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_61_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_61_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_61_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [2, 0, 3, 1]\nD: [0, 3, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Start by gathering the ingredients together.Ingredients1. 2 large bananas mashed 2. 4 eggs 3. 1 cup vegetable oil 4. 2/3 cup water 5. 3 1/2 cups all-purpose flour 6. 2 teaspoons baking soda 7. 1 1/2 teaspoons salt 8. 2 teaspoons vanilla 9. Olive oil10. 3 cups Splenda or some other artificial sweetener, if you are not concerned about sugar content you can use white sugar.SuppliesLarge BowlPotato MasherMeasuring Cups & SpoonsLadleWhiskBrush for greasing pans.Disposable pans with lids.. 
The prep time to make the banana bread is 15 minutes so I put the oven on at 350\u2070 F or 175\u2070 C to preheat the oven.In a large bowl I peel and mash the bananas; mix in the eggs, oil, water, sugar, and vanilla, then whisk until well blended.When the wet ingredients are well blended whisk in the flour, baking soda, and salt.. These aluminum pans are ideal for gifting; prepare the pans by coating the pans with olive oil using a brush, I find the bread falls out of the pans with ease when you coat them this way.Pour the batter into the prepared pans; I like to use a ladle and count the ladles of batter in each of the pans. This batch should fill 4 pans to about half full, then check the oven to be sure it is up to heat.Place the pans on the middle rack of the oven and bake for about 50 minutes to one hour in the preheated oven. The loaves are done when you can insert a toothpick in the center of the loaf and it comes out clean and dry.. Once baked let the loves cool on a rack, when the loves are cool you can put the clear plastic covers on the pans or you can glaze the loves and serve.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [2, 0, 3, 1]\nD: [0, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_62_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_62_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_62_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_62_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 0, 1, 2]\nD: [2, 1, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Ingredients:500ml of half and half cream250ml of heavy creamSugarVanillaSaltNutellaPeanut butterChocolate chipsOther things you will need:Ice cream makerMeasuring cupsMeasuring spoonsBig bowlOther container to store ice cream in (not in picture). First we need to make the mixture that will later turn into ice cream!Here's the recipe:2 cups (or just a 500ml carton) half and half cream1 cup (or just a 250ml carton) heavy cream1/3 cup white sugar1 teaspoon vanillaTiny pinch of saltOnce it's all added, mix until everything is dissolved.In case you're wondering, the peanut butter, nutella and chocolate chips will be added later.. Once your mixture is ready, pour it into your ice cream maker. Make sure that your frozen canister is completely frozen. I even froze mine a couple days before, just to be sure. Now turn on the machine and set a timer for 10 minutes. If your mixture doesn't look like the last two pictures after 10 minutes, then do it for another 10 but watch it so that it doesn't freeze too much.. After the ice cream mixture is the right consistency, it's time to add the yummy stuff! You will need to add:1/3 cup of peanut butter1/3 cup of nutella (yes, I know it's not a full 1/3 cup, I didn't realize that we had so little of it)1/3 cup of chocolate chipsAdd all of that into the mixture and stir with a long spoon to mix it in thoroughly, then turn the machine back on and set the timer for another 10 minutes.. Once your 10 minutes are up, turn the machine off and scrape the ice cream off the mixer part . Then carefully transfer the ice cream into a separate container and place it in your freezer for a couple hours, just to really harden it, unless you want it soft.. Once it has been in the freezer for a while, scoop it out into bowls and serve! I really hope you enjoyed my instructable. If you did, please vote for me in the \"Frozen Treats\" contest, also favorite and comment, I'd really like to hear your feedback! 
Thanks!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 0, 1, 2]\nD: [2, 1, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_63_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_63_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_63_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_63_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [1, 0, 3, 2]\nD: [2, 0, 1, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1000grams apricots 500grams gelling sugar 2:1 1 tablespoon lemon juice 1 vanilla pod 2 tablespoons Marillenschnaps (apricot liquor)Tools:kitchen scale kitchen knife mason jars canning funnelhand held blender. First you have to wash and towel the fruits. Cut them in half, remove the kernel and then mince the rest of the fruit pulp. Put them in a big cooking pot.. Cut the vanilla pod with a longitudal cut, scrape out the pulp with the knife and add the pulp to the apricots.. Apricots are rich in sweetness but low in pectin. Therefor we need a gelling sugar. There are many types of gelling sugars, I picked the 2:1 version for a better taste and less sweetness. The 2:1 simply means you add twice as many fruits in weight than you have sugar. Stir everything and let it simmer. I added the rest of the vanilla pod to the mix for more flavour, it will be removed before the blending.. Time to pick a huge bowl, fill it with boiling water and lay the canning jars inside for desinfection.. Time to take out the vanilla pod and the hand held blender. . When everything is blended smoothly, increase the heat and boil for 7 minutes while stirring the mixture.. 
Meanwhile spread a kitchen towel on a table, take the canning jars out of the water and let them dry with the mouth down on the towel.. Now its time for the Marillenschnaps or whatever the liquor you picked. Add 2 tablespoons to the mix and stir.. Time to fill the prepared canning jars. Use a funnel to prevent spills. Then close the jars and flip them over. Let them rest upside down and cover them with a kitchen towel to keep in the heat.Have fun trying this one out!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [1, 0, 3, 2]\nD: [2, 0, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_64_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_64_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_64_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_64_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 1, 3, 2]\nD: [1, 0, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. I use chickens eggs. You can buy them from the store if you like, although I get mine for free, well sort of. For those who believe eggs come from a supermarket please see the photos above. They actually come from Chickens originally. I like to know where my food comes from and I also can directly control the quality and treatment some of it receives. If I do ever have to buy them I try and ensure they are free range. I keep a few chickens as pets. In return for my feeding them, cleaning their coop and generally keeping them happy they provide me with eggs. I generally end up with quite a glut of them as, during the summer with all three laying, I just can't use them quickly enough.. 
Okay, so we're going to need a few things. What you use can vary according to your taste and what you have available. I've used Beetroot to give the pickles a pink colour and also provide a crunchy pickle themselves. Distilled Vinegar, enough to fill your chosen container (this is the clear vinegar) Eggs, I used 13 for this recipe 2 medium sized raw beetroots 2-3 carrots, sliced about 5mm thick on the diagonal 1-2 reg peppers, sliced Pickling spices (optional) A little sugar if wanted. I put the beetroots, still with their skins on, into a pan of cold water and bring to the boil. These need to cook for 30 minutes After 20 minutes add the sliced vegetables to the beetroots and cook for the remaining 10 minutes. Then drain them a and allow to cool Once cooled you should be able to peel the skin from the beetroot quite easily by rubbing a knife across it. (When handling the beetroot it's advisable to wear gloves or your hands will be stained pink too) Chop the beetroot into 1cm dice, or small chunks Add this in with the other vegetables and stir so they're all mixed together . \n I like to add some spicy flavour to the vinegar, but you can leave this out and just use it as is. Some people alo advocate adding up to 50% water to the vinegar to stop the eggs being too rubbery and the flavour being too strong. I like strong flavours, but feel free to experiment and dilute to your taste. Before you start it may be advisable to open a window, the toasting spices can tickle your throat and make you cough. The heated vinegar is also quite a pungent smell and can be a little unpleasant. Put a pan on the heat and add some spices to help release their flavours and natural oils, I used the following, but you might want to create your own spice mix: 1tsp Mustard Seeds 1tsp Coriander Seeds 1tsp Crushed Dried Chilli 1tsp Black Peppercorns 2-3 bay leaves 1tbsp Sugar Once the spices have started releasing their aroma add your vinegar. 
Bring this close to the boil, add the sugar and stir until it dissolves Remove from the heat and allow the flavours to infuse as the vinegar cools If you have made more than you need simply store in a bottle and use for the next batch. . Put your eggs into a large pan and add plenty of cold water. Fresh eggs will lie on the bottom, slightly older ones will turn upright, bad ones will float to the surface. You should dispose of any floaters, the ones that turn up are at your discretion, though I don't tend to have any. Now put the pan of good eggs on the heat and bring to the boil Once on a rolling boil you need to cook them for 7 minutes, use a timer When 7 minutes are up I then drain the eggs and fill their pan with cold water. This stops them cooking any longer and prevents that dreaded 'black ring' around the yolk that overcooking them causes. Now you need to peel them. I do this in the pan as the water helps to wash the loose shell away. You'll notice some are easier to peel than others. I think the older eggs are easier to peel. A couple of my eggs look half chewed where the shell was difficult to remove Remove the eggs from the pan and allow to dry whilst you prepare the other components You can see why the floating test works for the eggs once you've peeled them. As eggs get older the airspace at the rounded end gets larger. This is so that a developing chick has space to move into before escaping from the shell. Once the shell is removed from the boiled eggs you can see how large the airspace is, most of mine barely have any as they're only about a week old. I keep the egg shells and add it to my chickens feed, it helps them to digest their food and it also gets re-absorbed and helps them to create strong shells on the eggs.\n . \n It is very important that the container you are going to use for preserve is sterile. Any bacteria will ruin your food and may adversely affect your health. 
There are a number of ways to sterilise jars, I won't go into too much detail as it's fairly straightforward, but I tend to do one of the following: Wash the jar then dry in the oven. I start with the oven cold so as not to shatter the glass. Put a number of open jars into the dishwasher on a high heat setting. I tend to do this when I'm making large batches of preserves Once sterilised your jar is ready to use. Be careful and use gloves when handling hot jars.\n . \n Now, you want to add the ingredients into the jar in layers so it all mixes evenly. Start with some of the vegetables Add a layer of Eggs Now add more vegetables Add more eggs Keep doing so until you've filled up the the neck of the jar I then use a jug to pour in the spiced vinegar. Ensure that everything is covered by the vinegar before sealing\n . The eggs are best left to soak in the vinegar for at least a month. They can stay in the vinegar for a very long time and will just get better. As I say, I like my eggs in a bag of crisps. You can eat the vegetables too and all of these go nicely as part of a salad or ploughmans lunch. 
The eggs will have taken on a lovely pink colour, and all the flavours form the vinegar.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 1, 3, 2]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_65_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_65_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_65_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_65_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 3, 0]\nD: [3, 0, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ice Mio\u00a0Caffeinated water enhancer, black cherry flavor True Lime crystalized lime Splenda Lime Maraschino cherries Club sodahttp://truelemonstore.com/products/true_lime&trk_src_ss=TLMPAYPCWEBMACSS\u00a0http://www.makeitmio.com/mio-original.aspx\u00a0. Fill glass with ice. Add 1 packet Splenda and one packet True Lime or the juice of 1/2 lime.. One squirt of Mio Caffeinated cherry flavor. Of course, you can use the non-caffeinated version\u00a0instead.\u00a0. Fill glass with club soda.. Add a Maraschino cherry and a slice of lime. 
Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 3, 0]\nD: [3, 0, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_66_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_66_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_66_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_66_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [2, 3, 1, 0]\nD: [1, 3, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. First off, start by twisting all the Golden Oreos apart.Then break the chocolate into a microwavable bowl. Reduce your microwave\u2019s power to half then heat the chocolate in 20 second intervals. Stir well between each time and repeat until your chocolate is melted and smooth. Now dunk a lollipop stick into the melted white chocolate & press it into the Oreo cream filling. Place the other Oreo half on top (with the white chocolate acting as a glue) then place on a baking tray/chopping board.. Repeat the process until all of the Oreos have been \u2018glued\u2019 back together with their new lollipop sticks.Pop the tray/board into the fridge for 15-20 minutes to allow the chocolate to set completely before we move onto the next step.. When the white chocolate \u2018glue\u2019 has set, you are ready to decorate! Get a large baking tray lined with baking paper ready & set to one side.If you need to, pop the white chocolate back into the microwave and heat on half power in 20 second intervals until smooth & melted again.Now all you need to do is dunk each Oreo Pop into the white chocolate and shake of any excess. 
Then dunk a side into the sprinkles and place onto your prepared baking tray to set.. Repeat until all the Oreo Pops have been covered in chocolate/sprinkles.Allow them to set at room temperature. Once set, peel off the baking paper and store at room temperature in an airtight container. Consume within 2 weeks.. These would look gorgeous wrapped in a cellophane bag & tied with a bow! It\u2019s also a lovely craft to get the kids involved with. You could even get a polystyrene block/flower oasis, place it in the bottom of a small box or vase & turn them into a bouquet of Valentines Day Oreo Pops flowers with some green crepe paper. I\u2019m certain that would keep Mum very happy!For more awesome step by step recipes check out www.kitchenmason.com\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [2, 3, 1, 0]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_67_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_67_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_67_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_67_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 1, 0, 3]\nD: [2, 1, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Syrup: 1 cup water. 2 cups sugar (we use brown sugar) Optional: Vanilla and maple extract. Optional some people add a bit of buttermilk. Sometimes we substitute apple sauce or ground fruit and berries for the syrup. Pancake batter: 1 cup buttermilk (substitution: one cup milk with 1 tablespoon juice or vinegar made the night before and refrigerated) 1/3 to 1/2 cup cooking oil or melted butter. 1 room temperature egg. 1 teaspoon salt. 
1 cup all-purpose flour (up to 1/2 cup of whole wheat flour can be substituted). 1 teaspoon baking powder 1 teaspoon or less of baking soda.. With a sauce pan bring the water to a boil. Add the sugar and karo syrup if any, Boil for about one minute. Turn off the heat. When the mixture is still slightly warm, add the vanilla and maple extract. Put in a sealed container and refrigerate for later usage.. Egg needs to be at room temperature. So let it sit a bit while making the syrup. In a bowl combine the flour, baking powder, salt, and baking soda. Stir well. In a separate container, crack the egg into it and whipped lightly till the yolk and the egg are combined. Into the bowl of flout add the cooking oil, buttermilk, and egg mixture Stir about 10 times. DO NOT OVER STIR.\u00a0 Lumps are okay. Let sit while you go on with the pan. Heat up a frying pan at just more than medium heat. Add oil or butter to coat the bottom. Pan needs to be warmed up (i.e. hot as cold skillet = bad hotcakes.) Ladle in the pancake mix one large spoon full at a time. Repeat till the pan is full say 3 or 4 depending on the size of the skillet. (each spoonful must not be touching each other). Let the cakes puff up. You can use a spatula to make sure they do not burn underneath. When you see a healthy amount of bubbles the cakes are ready to turn or flip. Let cook till they are like a piece of bread. (no liquid). Set aside and repeat the process till all the batter used. You can use a kitchen towel to keep the pancakes warm if they are not served immediately. (They are gone at our house as soon as they come out of the pan).. Note: pancakes do not have to be perfect in shape as they still taste just as good.. Note: you could also cook bacon and or eggs to go along with this dish. Then you need less pancakes per plate. Put two to three pancakes per plate. Add syrup and or butter. (we sometimes substitute apple sauce for the syrup). Serve.. 
How To Make Brown Sugar 1 Cup White Sugar 1 Tablespoon MolassesPour the sugar in a food processor and then drizzle the molasses over the top. Process until the two are thoroughly mixed (about 2 minutes) stopping to scrape the sides occasionally as needed.---Make Your Own Baking Powder 1-1/2 teaspoons baking soda 3 teaspoons cream of tartar (egg shells) 2 teaspoons cornstarchCombine the baking soda, cream of tartar, and cornstarch and use in the recipe as you would the baking powder.And here\u2019s one last thing to remember. Moisture will make the baking powder loose its potency so never dip a damp measuring spoon into your container of baking powder.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 1, 0, 3]\nD: [2, 1, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_68_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_68_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_68_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_68_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [3, 1, 0, 2]\nD: [0, 3, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \nFor this you will need\n5 cups of water\n1/2 c. of dried hibiscus flowers, look in mexican food stores if you can't find them at your local store.\n1/2 c. Agave Nectar or similar sweetener\n1/2 Lemon. Take half of the amount of water you will be using put it in a tea kettle to boil. This should take about 5 minutes.. While waiting for your water to boil, take the other half of the water and put it in the pitcher that you want to be using for the tea. Add sweetener and lemon and stir around.. 
Once the water comes to boil you can put the flowers in a glass bowl and pour the hot water over them. Let them steep for a good 10 minutes until the water gets a dark red.. Next add ice to the sugar water to get it nice and cold.. Mix the tea you made with the flowers in to the sweet water, combining them. If the mixture is not cold enough add more ice.. At this point the drink is done. Feel free to put it in the fridge to cool some more or to save for later.\nThe hibiscus tea is very refreshing especially for those hot days and now you can make your own!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [3, 1, 0, 2]\nD: [0, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_69_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_69_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_69_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_69_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 2, 0, 3]\nD: [2, 0, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Ingredients (for 4 servings):30 quail eggs 2 scallions (20g) - 0.7 oz 1 celery stalk (30g) - 1 oz 3 pickled cornichon cucumbers (40g) - 1.4 oz 1 Tbsp chopped parsley 1 tsp Dijon mustard 1 tsp lemon zest 1 Tbsp lemon juice 1/8 \u2013 \u00bc tsp chili powder 3 \u2013 4 Tbsp mayonnaise (preferably homemade) salt and pepper to tastefor the homemade mayonnaise:1 large chicken egg - at room temperature about 1 cup of any mild - flavored vegetable oil (I used sunflower oil)Tools:a medium saucepan a colander a large bowl (filled with water) 10 ice cubes a cutting board a sharp knife for the homemade mayonnaise: a bowl (I usually use a soup plate) and a whisk (or a wooden spoon)Abbreviations:tsp = teaspoon Tbsp = tablespoon. I make it the same way my grandmother used to (but it can be made using a blender, too).For the mayonnaise you need a fresh yolk. The yolk will be used raw, so it is very important for the egg to be as fresh as possible.Clean the egg (with water and soap) and pat it dry. Separate the yolk from the white and place the yolk in a bowl. Save the egg white for another recipe (pavlova, meringue or egg white omelet).Start whisking the yolk for about 1 minute. It doesn't matter if you whisk it clockwise or counter-clockwise as long as you don't shift direction. Use the direction that suits you best.Now you can start adding the oil. At the beginning add just a few drops of oil, whisking vigorously. Continue adding the oil, few drops at a time (whisking continuously, of course), until the emulsion seems to thicken. Now you can increase the oil volume, to about 1 tsp at a time. Whisk continuously until all the remaining oil is incorporated.Note: For this quail egg recipe you will need only about 3 - 4 Tbsp of mayonnaise. Store the remaining mayonnaise in an airtight container, refrigerate it and use it in other recipes (salads, dressings, sandwiches, fish cakes).. Carefully wash the quail eggs, place them in a pot and cover with cold water. 
Place the pot on the stove and wait until the water starts to boil. Reduce heat to minimum and let eggs boil for 3 minutes. Meanwhile prepare the ice bath. Fill 2/3 of a large bowl with cold water and add in about 10 ice cubes.Drain eggs using a colander, let them cool in cold water for 5 minutes and carefully peel them.. Finely chop the scallions, celery stalk, pickled cucumbers and parsley.Pat dry the eggs, roughly chop them and place them in a bowl. Add the scallions, celery stalk, cucumbers, parsley, mustard, lemon juice and 3 Tbsp of mayonnaise. Stir to combine. If the salad doesn't look creamy enough, add the remaining tablespoon of mayonnaise.. Season with lemon zest,1/8 tsp of chili powder, salt and pepper. Stir to combine and taste it. If the salad isn't spicy enough, fell free to add the remaining 1/8 tsp of chili powder. . Refrigerate for at least 15 minutes before serving. I noticed that the longer you refrigerate the salad, the better it will taste. Serve it on a slice of whole-wheat toast, garnished with arugula and radishes.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 2, 0, 3]\nD: [2, 0, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_70_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_70_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_70_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_70_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [0, 3, 2, 1]\nD: [2, 0, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Here's what you'll need: -pasta (I typically use dried, although fresh is even better!) 
-grated parmesan or other hard cheese -butter -egg -pancetta (or bacon, or\u00a0prosciutto, or guanciale, whatever you can get your hands on really. \u00a0Alternatively, you could leave this out entirely for a vegetarian version.) -black pepper -salt -cream (optional) I don't usually measure my ingredients, which is why I haven't given any quantities. \u00a0As a rough guide, I usually use about 100g of pasta, one egg, a bit less than a tablespoon of butter, and one thick slice of bacon if I'm making this for myself.. Dice up your bacon or pancetta, and toss it in a frying pan over medium-high heat. As soon as that's going, start boiling your pasta in water with a bit of salt added. \u00a0(If you're using fresh pasta, you may want to wait until you've done the next step before starting it, as it will cook much more quickly.). While those are cooking, put a knob of butter and a bit of your grated cheese in a large bowl - you want something big enough to mix your pasta in. When the bacon is finished cooking, put it in a small bowl (or teacup, as the case may be), and set it aside for now. (Warning: There might be a bit of thumb-twiddling at this stage. \u00a0This is a really easy recipe. \u00a0I had time to wash a sink full of dishes while waiting for the pasta. \u00a0Just try not to eat all the bacon cubes just yet.). When your pasta has finished cooking, drain it, and immediately put it into the bowl with your butter and cheese. \u00a0It's important that it's still hot at this point. As soon as the pasta is in the bowl, crack your egg into it. \u00a0That's right, directly into the pasta. \u00a0Then mix it! \u00a0You want to get the egg distributed as evenly as possible, since if you let it sit it will curdle. \u00a0Keep stirring until the egg, butter, and cheese are mixed, and the sauce starts to thicken. The heat of the pasta cooks the raw egg, but does so slowly enough that you don't need to worry about it scrambling and making your sauce all lumpy. 
\u00a0The mixing distributes the egg so it cooks completely, and forms an emulsion with the butter and cheese that makes it nice and creamy. This sauce won't look quite as thick as the stuff you buy in the store - they bulk it out with cornflour - but trust me, it's tastier. \u00a0Adding cream in the next step will get you a bit closer to that store-bought taste.. Now, you can mix in your bacon, cream (if you're using it), and pepper. \u00a0Maybe sprinkle a bit of extra cheese on top.. That's it! \u00a0You're done! \u00a0Go enjoy your delicious pasta. Once you're happy with this, you can try all sorts of interesting variants. \u00a0Maybe try adding vegetables to the basic sauce, or different kinds of spices. \u00a0I had a particularly tasty batch last week while cleaning out my fridge that used smoky bacon in place of pancetta, and some stilton cheese instead of cream. \u00a0Be creative!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [0, 3, 2, 1]\nD: [2, 0, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_71_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_71_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_71_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_71_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 2, 0, 1]\nD: [3, 0, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
BACON - 12 strips should be enough to flavor your vodka, but if you are like me...you may need more as not all of the bacon will survive to make it into the projectVODKA - 1 liter should suffice - and with all cooking...the quality of the ingredients will effect the final product.\u00a0 I'm not saying you need to go crazy at the liquor store, but I wouldn't recommend anything that comes in a plastic bottleCONTAINER WITH LID\nCOFFEE FILTERS (OR SIMILAR)\nFREEZER. Everyone has their own preference when it comes to the crispiness of their bacon.\nFor the infusion I used bacon that was cooked to where it was still flexible and wouldn't crumble.\nJust place all of the bacon in the container with the vodka, close lid and let sit for 4-5 days at room temperature (infusions work better at room temperature and the alcohol will keep you safe). After the vodka has had time to soak up all of the bacon flavor, its time to remove the bacon.\nI used a mesh colander to catch the bacon and the larger pieces of grease, but you could just remove the bacon by hand.\nNext put the vodka in the freezer.\u00a0 This will cause the oils to solidify which will make them easier to remove.. \nAfter the oils have solidified, you'll want to strain the vodka through a coffee filter to remove the oils.\u00a0 The oils will clog the filter, so I'd do small amounts at a time and change the filter when the vodka stops dripping through.\nTake the filtered vodka and put it back in the freezer and repeat this step.\u00a0 The more times you repeat this step, the less oily the final product will be.\u00a0 I filtered three times.. 
Once you satisfied with the filtered vodka, its time to enjoy the fruits (or meats) of your labor.\u00a0 Here are some recommended drinks to use your bacon vodka with:\nChocolate Bacon Martini:\n2 oz of Bacon Vodka\n3/4 oz of Godiva Chocolate\u00a0 Liquor\nSplash of Half & Half\nShake all with ice and strain into a martini glass\nBacon Bloody:\n2 oz of Bacon Vodka and your favorite Bloody Mary mix.\u00a0 If you normally like your Bloodies spicy...I'd lay off the spice this time because the bacon is the star of this show\nAnd then there is just the bacon vodka chilled by itself...but the possibilities are endless\nEnjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 2, 0, 1]\nD: [3, 0, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_72_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_72_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_72_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_72_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [0, 2, 1, 3]\nD: [1, 0, 3, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients: (makes 1 sandwich) 3+ slices sandwich meat (I use pastrami) 4 eggs, boiled Handful of lettuce 1 slice monterey jack cheese Mayonnaise 2 slices whole wheat bread. Boil the eggs. Place eggs in a saucepan with water covering them, and place over medium-high heat. Once it reaches a boil, turn heat off. Drain out the hot water, and dump eggs into a bowl of ice water. Once cooled, immediately peel the shells off, and rinse the eggs. Transfer shelled eggs to a cutting board and slice into medium slices.. 
Prep the rest of your ingredients: sandwich meat, lettuce, cheese, mayonnaise, and bread.. For a nice crunch, you can toast your sandwich bread. Place cheese on the top slice.. Build your sandwich: Spread a generous dab of mayonnaise over the bottom slice. Layer with lettuce, then arrange the boiled egg slices neatly on top. Then top the eggs with the meat, then place the slice with the cheese over the top of the sandwich.. Consume\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [0, 2, 1, 3]\nD: [1, 0, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_73_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_73_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_73_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_73_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [3, 0, 2, 1]\nD: [0, 1, 3, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. What you will need for this are all actually pretty common items.2 or 5 lb CO2 tank (any size will do, really, but a larger tank means fewer trips to the machine shop or welding supply store)Dual-gauge regulatorGas line with ball lock quick disconnect20z ball lock converter capRepurposed 20oz or 2L bottlesThere is some debate over whether CO2 tanks from a machine shop or welding supply shops. The truth is, these tanks and CO2 is no different than what you will find underneath the counter in bars and restaurants.Also, I purchase the CO2 tank online, thinking I might come out a little better if I could just have a 5lb tank filled on the spot. 
Turns out, no local places fill tanks on the spot and I ended up surrendering my pretty tank for another at the machine shop. But it wasn't a big deal, really, since it came out to the same price anyway, around $90 for a filled 5lb tank \u2013 $70 for the tank deposit (which I didn't pay, since I turned one in) and $20 for the CO2.The dual-gauge regulator I purchased works perfectly fine, but two things are worth noting. It has a safety release valve that engages at 45psi. If you want extra bubbly water, you may want to look for a different regulator. Also, you don't necessarily need a dual-gauge regulator \u2013 it just serves as a visual aid for how much CO2 is remaining in the tank.Save for the ball lock disconnect, you can easily find the hose clamps, gas line, and other connectors at your local hardware store. However, it's almost positively easier and cheaper to just order this pre-made assembly online. It's difficult to find a rubber gas line under 20' long and for less than $20. This entire assembly plus the quick disconnect is about $15 on Amazon.. You really won't need a lot of tools for this. Just some scissors, a pipe wrench or slot and groove pliers, and a screwdriver. And you should definitely consider some thread seal tape.. Begin by attaching the gas line assembly to the regulator. Slide a hose clamp over the open end of the gas line, then slide the hose over the barb on the bottom of the regulator.If you have trouble fitting this hose over the barb, simply soak the end of the hose in warm water for a minute or two, then try again.Slide the hose clamp to about 1/8\u201d from the end of the hose and use the screw driver to tighten the hose clamp over the connection.. Next, wrap some thread seal tape around the threads of the CO2 tank valve (in the direction you will be screwing the nut on, unlike what I'm doing in the above photo, because I goofed).Make sure the included nylon washer is in place, and screw the regulator onto the tank valve. 
Use pliers or a pipe wrench to snug the nut.. And that\u2019s it! Seriously, you\u2019re ready to carbonate some water.Turn the valve on the CO2 tank and adjust the regulator pressure to approximately 45psi, and twist the pressure valve on the regulator to the on position.Remove the cap from the bottle of water, squeeze out as much air as possible, and screw on the the ball lock converter cap \u2013 or carbonator. Then connect the bottle to the ball lock disconnect. When you do this, the bottle will immediately inflate and harden. Shake the bottle for 60 to 120 seconds and remove from the ball lock disconnect.Turn off the valve on the CO2 tank, pull the manual pressure release valve to release the remaining pressure in the gas line, and switch the regulator pressure valve back off.Twist off the cap on the bottle, pour into a glass, and enjoy some refreshing homemade sparkling water!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [3, 0, 2, 1]\nD: [0, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_74_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_74_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_74_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_74_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [1, 3, 0, 2]\nD: [3, 0, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This recipe serves two people or one extremely hungry student :)Ingredientsone chicken breast ($1.50)one ramen noodle packet with seasoning ($0.25)one egg ($0.20)Total Cost: $1.95That's all the ingredients that there is to it! 
Feel free to double or even triple the recipe if there are more people.. 1. Start off buy putting your ramen in a plastic bag. Gallon bags work best but are not necessary. 2. Pour as much seasoning as you like. I usually use 3/4 the packet but add more or less to taste. Seal the plastic bag so that you don't make a big mess.3. Put your textbooks to work by using them to crush the ramen into small bits. Make the pieces bigger for crunchier nuggets or extremely fine for a softer nugget.4. Pour the ramen breadcrumbs into a bowl. If the plastic baggie you used to crush the noodles hasn't broken, you don't have to do this step, but most likely there will be a few small tears from where the textbook has stabbed it.. 1. Take your chicken breast and use a knife (ones that have broken and have been repaired by duct tape are fine) to cut it into bite sized pieces like in the second photo.2. Crack one egg into a bowl and whisk it using a fork/chopsticks/whatever.. 1. Start off by making sure you have a plate close by to put the breaded chicken on. Then place some chicken bits into the egg mixture making sure that every bit is coated.2. Place the eggy chicken into your crushed ramen noodles and use your hand to make sure every part of the chicken is coated and that there are no bare spots.3. Put the nuggets onto a place and get ready to cook!Tip: Have one hand do the wet stuff (coating the chicken with egg) and your other hand do the dry stuff (coating the chicken with ramen noodle, placing nuggets onto a plate). . 1. Pour some oil onto a pan. Doesn't really matter what kind of oil or what kind of pan, whatever you have. I used olive oil for this demonstration. Also, the more oil you use, the more tender and generally tastier the nuggets will be. Heat the pan on medium until the oil is hot.2. Place all of the nuggets in an even layer on the pan. Don't worry if some of the ramen noodle coating falls off, you can pick those up later.3. 
Cook until the bottom of the nuggets are a golden brown. The nugget in the third photo isn't done yet, it needs to cook for longer.4. Once the nuggets are golden brown like in the last photo, turn them over so that the other side can cook. Once you can see that the other side is also golden, remove the nuggets from the pan and transfer them to a plate (or just eat them out of the pan, less dishes amirite?)Tip: Don't put the cover on the pan! Condensation will form and drip onto your chicken nuggets making them soggy and wet.. Eat your nuggets when they are still warm and enjoy your delicious meal! If you liked this Instructible, please take a second to vote for me in the DIY University Contest! It would mean the world to me!Have a fantastic day,thederpyninja\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [1, 3, 0, 2]\nD: [3, 0, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_75_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_75_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_75_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_75_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 3, 1, 2]\nD: [0, 2, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Before you begin, acquire the materials that you will need to tap into your faucet supply line. Most faucets in the US are 3/8\" compression, so that is what I'm working with below. Bill of Materials3/8\" x 3/8\" x 3/8\" Compression Add-a-tee Adapter - $6.58 at Home Depot1 ft. 
long braided tube with 3/8\" Compression Fitting (Faucet supply line) - $4.98 at Lowes3/8\" Compression Straight Quarter-Turn Valve - $8.99 at Lowes1/2\" NPT to Male GHT Adapter - $4.39 at LowesTotal: $24.94Of these, the hardest for me to find was the Add-a-tee adapter (which, incidentally, the staff at both hardware stores I visited insisted didn't exist). For tools, you will need a small adjustable crescent wrench.. It's probably pretty cramped under your sink, so it helps to pre-assemble the parts you can to reduce how long you will need to spend bent over. Pre-assemble the adapter. The proper hookup is:Water supply --> Tee --> FaucetTee --> Valve --> Faucet supply line --> Garden Hose Adapter(Tip! Look at the diagram above, as well as the picture of the pre-assembled mechanism. In the diagram, the section shaded in purple is what you are assembling, and what corresponds to the picture.). It doesn't make sense to run hot water through our wort chiller, so we need to figure out which supply line is which. Thankfully, there are only two choices, so this is easy. Shut off the valve supplying one of the lines, and turn on your faucet. If only hot water comes out, then congratulations! That valve is hooked to the cold water line (and is the one you want to tap into). If only cold water comes out, reopen the valve and try the other one. Once you've identified the correct valve, turn off the water to both lines and test your faucet to make sure nothing comes out. NOTE - This is important! If you don't do this, at best you'll end up with a mess, at worst you could get hurt or destroy something. Be careful, be safe, and make sure to check that the water is shut off before proceeding. To attach the adapter you just built, simply unscrew the existing cold water supply line running from the water valve where it connects to the line running to the faucet, and reattach both lines to the sides of the add-a-tee. 
Use a small crescent wrench to tighten both lines to the sides of the add-a-tee.Once you have attached & tightened your adapter, turn the supply valves back on and check for leaks.. Congratulations! Your adapter is installed, and ready for use. Hook up your wort chiller, turn on the water, and check for leaks. These are compression fittings, so if you see a leak, try tightening the nut nearest the leak. Now, go brew some beer and test it out! You're one step closer to better homebrew!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 3, 1, 2]\nD: [0, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_76_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_76_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_76_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_76_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [0, 1, 2, 3]\nD: [0, 3, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \n To make the spaghetti yeti, the only ingredients you'll need are:\n\t\tA brick of super firm tofu\n\t\tSpaghetti\n\t\tWater\nYou'll probably also want to serve it with some sort of sauce, but I'll leave that up to you. The spaghetti yeti is abominably bland, so goes well in any pasta dish,\n . Your yeti is going to be made by skewering a tofu body with dozens of strands of raw spaghetti, then boiling the whole lot together.\nThe first step is to carve your yeti's body. Try to carve it all out of a single brick of tofu, being careful not to make it too thin and flimsy at any point. There's no need for fine detail here; just go for the main features. 
Be sure to give it a wide, sturdy base so that it can stand up. That is to say, give it bigfeet.\nAs you can see, nearly all the structure of my yeti's body was concealed by its hair later on. I also decided that my yeti was too short, so added a separate head.. Carefully push a strand of raw spaghetti all the way through your yeti, then break it off at the desired hair length.\nRepeat in varying directions and lengths until the tofu body is riddled with spaghetti spikes. At this point it should look more like a sea urchin than a yeti.. Find a pot big enough to contain your startled-looking raw yeti. Be careful not to break any of the brittle spaghetti while you're handling your monster.. Boil your yeti until the spaghetti and the tofu are both cooked through. Try to do this at a gentle simmer, as a hard boil will send your yeti tumbling dangerously. I know that yetis are rugged enough to survive most avalanches, but they're naturally found in cold climates; at higher temperatures they become much more fragile.\nIf your yeti's hair is sticking out of the water, you may need to cover it with an upturned pot so that the steam will cook the dry spaghetti enough for it to turn limp and flop into the water.\nOnce your yeti is cooked, carefully remove it from the water and drain it in a sieve.. You may decided that you want to make and cook the head separately. This was my yeti's uncooked head, made from leftover pieces of its tofu body.. Use a pair of scissors to give your yeti a haircut, if necessary. How shaggy you choose to leave your yeti will depend upon how formal an event it will be attending.\nTry to choose appropriately sized scissors that won't shear your yeti in half.. Give your yeti a final going-over before you serve it up. I just added a pair of peppercorns as eyes, but I'm sure you'll be able to think of other ways to customise your own creation.. 
Pour pasta sauce, soup or whatever else you'd normally serve with spaghetti around your yeti and serve it up to some unsuspecting diners.\nBe sure to upload pictures of your own spaghetti yetis to the comments!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [0, 1, 2, 3]\nD: [0, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_77_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_77_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_77_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_77_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 2, 1, 0]\nD: [1, 2, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Dr. Pepper Glaze\nI did not have any pictures of this step as I already had some glaze made. Depending on how many chickens you have you can decrease or increase this recipe. You use about \u00bd cup of glaze per chicken\nIngredients:\n2L Dr.Pepper\nIn a pot bring Dr.Pepper to a boil. Once boiling reduce heat and simmer at medium heat until the Dr.Pepper. Once the Dr. Pepper has reached a consistency of corn syrup remove from heat and set aside till ready to use.. Brining the chicken helps add flavour and keep the meat moist when its cooked.\nRecipe\n2Gallons of Water\n2 Cups Kosher Salt\n\u00bd cup Vinegar\n3 cups Brown Sugar\n1 cup pickling spice(I bought this premade)\nIn a pot heat a \u00bc of the water, add all of your ingredients and let dissolve. 
Once dissolved simmer for 5 minutes then add to the\u00a0remainder of the cold water.\nOnce the water has cooled add your chickens and soak for 6-8 hours in the fridge.\nOnce chickens are done remove from brine and dry off.. I use a charcoal bbq for smoking and buy my wood from the bbq store in my city. If you have a gas bbq most bbq stores sell things you can put in your bbq to create the smoke.\nItems Needed\nBBQ/ Smoker\nWood(I used apple wood)\nWater\nChickens\nStart up your bbq/smoker .\nSoak your wood in water as this will create more smoke when the wood is added to your charcoal and help the wood take longer to burn.\nOnce your bbq is at temp(200 degrees fahrenhite)\u00a0add the wood(about three pieces to start) then add your chickens.\nGlaze your chickens with your Dr. Pepper every 15 minutes. When you notice there is very little smoke coming out of the bbq add more wood to the coals.\nKeep smoker at 200 degrees Fahrenheit\nSmoke chicken for 4 hours or till the chickens reach an internal temperature of 164 degrees Fahrenheit.. 
Remove Chicken from the BBQ you will now have\u00a0a nice sweet glaze on your chicken\u00a0with a nice smoky taste.\nOnce cut open the juice should run clear if the meat looks slightly pink do not worry the smoke has a tendency to do that to the meat.\nEnjoy.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 2, 1, 0]\nD: [1, 2, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_78_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_78_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_78_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_78_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [2, 3, 0, 1]\nD: [3, 1, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. -First step is to get ready with all ingredients. cut onion slices thin and long. Even clean mushrooms and cut them in to slices .(I used tin mushrooms which are already cooked). If using fresh mushrooms clean them properly and cut in to slices. -Take tomatoes in blender and blend to puree.Take a vessel and add butter to it. You can add even few tsp of oil.-When better melts add cumin seeds.when they splutter add crushed cardamom to it.. -Next add sliced onions add saute until they slightly change color.-Add ginger garlic paste to it and cook until you get rid of raw smell of ginger.. - After onions are cooked.Next add blended tomato puree and give a mix. -Then add all spices coriander powder,red chilli powder,garam masala,turmeric powder, salt and mix properly. -Add sliced mushrooms and saute for few seconds. 
Then close the lid and cook for few minutes until mushrooms are properly cooked(if using fresh mushrooms instead of tin mushrooms close the lid and cook them for few minutes). -Last step is to add fresh cream or 2 tsp milk.-Finally garnish with coriander leaves and serve hot with rice or naan\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [2, 3, 0, 1]\nD: [3, 1, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_79_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_79_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_79_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_79_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 3, 2, 0]\nD: [3, 0, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. First turn stove on to medium or around that. I just didn't want to burn the chocolate.Next: add chocolate pieces to sauce pan. ( I broke mine into pieces to make it easier to melt). The next step is to add the cherries and the Chili.I added about 1/3 of the bag of cherries to make sure that I had cherry flavor in each bite.Then I added 1 tablespoon of the chili powder to start off with. You can add as much or as little as you want, but I wouldn't add too much because then it would take over the flavor of the chocolate and cherries.Cook the chocolate and remember to continually stir to prevent burning. Cook and stir until it is completely melted and thoroughly mixed.. 
The next step is to take the chocolate and place it in a form to harden.I had a stainless steel mixing bowl that I put my chocolate in, so that it would be easy to remove once cooled.Once in mold, it is optional to place it in the freezer to quickly set up the chocolate. I did this and it worked great because it made the chocolate very hard and easy to remove.To remove chocolate, I placed the bowl upside down on a towel and then hit the bottom with a solid object and the chocolate just popped off the bowl and into the towel.. Place chocolate in plastic bag to keep it fresh.The last step is to enjoy!!! omm nom nom\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 3, 2, 0]\nD: [3, 0, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_80_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_80_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_80_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_80_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [0, 1, 3, 2]\nD: [0, 2, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need:Two pieces of bread.Peanut butterYour favorite jam (jelly,preserve,w/e)MargarineYou will need the following:3 Spoons1 Fry pan1 Stove top or similar device1 Flipping device (optional). Place your fry pan over the burner. Set the burner to high. We want the pan nice and hot when we start to cook as to seer the sandwich (crispy outside juicy inside).Using the back side of one of your three spoons extract a generous amount of peanut butter from the jar. Evenly spread the peanut butter against one piece of bread. 
The reason we use a spoon verses a butter knife is because of the slight curve to the spoon. It allows for easy application of the peanut butter. And as an added bonus you can scoop a mouthful when no one is looking(Yum!). Using the same procedure as with the peanut butter apply an even layer of jam to the other piece of bread. Having completed this step you will have two pieced of bread with something smeared on them. If what you have doesn't resemble Figure (1) you may need to go back to step one.The next thing is to combine the two pieces. Holding one of the pieces flip it over onto the second.. Using your third and final spoon put a thing layer of margarine to the top piece of bread.Grab you sandwich and place it butter side down on the fry pan. Hear that nice sizzle? From here one out it's a time challenge. Work fast or risk burning your food. Using your third spoon again apply a thin layer of margarine to the top piece of bread.. It takes a certain amount of skill to know when it's time to flip the sandwich. You can use a flipping device as shown in figure (2) to see when it is time to flip. Personally however I use a slight shaking motion to the pan. When the sandwich starts to move around easily I listen for a crispy sounding motion to come from the sandwich. When I hear that magical sound I use a G shaped motion to flip the sandwich. Do the same to the other side. Wait for a nice browning, or the magical crisp noise.. When the sandwich is done slide it onto a plate. Using your flipping device cut it corner to corner. This is a very important step, don't forget it. I don't know what will happen if you don't cut it corner to corner I have always remembered. If you choose to skip that step I take no responsibility as to what may happen. 
Proceed with caution.Slowly bring the sandwich up to your mouth bite down and enjoy the little piece of heaven you just found in your kitchen.Personal thoughts on eating this:This is one of the tasteyiest snacks I have ever had. And as an added bonus if you are ever feeling slightly ill, the warm peanut butter seems to coat your stomach and sooth it. I just ate the Mach 17 version of this sandwich 30 min ago and my belly is still warm and comfy.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [0, 1, 3, 2]\nD: [0, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_81_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_81_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_81_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_81_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [0, 3, 1, 2]\nD: [1, 2, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Make chocolate cakes. Big ones! Put them on a cake board and cover them in buttercream icing. Let the buttercream set in the fridge for 30 minutes.\nDon't have cake boards? Neither did I. Cut a circle from a pizza box and cover it in tinfoil.. Cover the whole cake in a layer of fondant, tinted black. It takes a whole lot of icing colouring to get black, so dump a lot in at once. 
If you can find chocolate fondant, it will become black with less colouring.\nTo get the fondant onto the cake, you first need to roll it out onto a sheet of plastic.\nThen drape the fondant, still attached to the plastic, over your rolling pin.\nGently slide the fondant and plastic over the cake.\nPeel off the plastic.\nSmooth out the fondant to adhere to the buttercream.\nTrim around the bottom. I use a pizza cutter.\nNow cut out another round of fondant, this time leave it white.\nPlace it on top of the black fondant.. Mix some gum paste with fondant, leave it white. The gum paste will stiffen the fondant a bit, making it sturdier.\nPlace a strip of white gum paste+fondant (gumdant?, fonpaste?) along the top and bottom.\nTo get it to stick, dissolve a pea sized piece of gum paste in a teaspoon of water, or whatever you feel like. Once it is dissolved, you will have a sort of glue to use. Brush some on with a little clean paintbrush.\nAdd a few more accent pieces.\nNow, the silver part.\nTo make it shimmer, you will need some silver dust you can find in cake shops. Mix a bit with vodka, not water, to make a paste. Water dissolves sugar, remember? The vodka will evaporate, leaving the dust.\nPaint this paste onto your white pieces.. To make these I used just gum paste, as these can be flat and don't need to taste great. (Gum paste dries hard)\nTo cut out I painstakingly printed out the letters on a piece of paper, cut them out, traced them onto fondant. I then carefully cut the fondant with a series of different tools.\nThen I painted them with the vodka silver paste.\nThese items can be cut and painted on a flat surface, on parchment paper or something. Let them dry and they will stiffen up for handling.\nGlue them on with the sugar and water stuff you made before.\nI painted the 'J' with red icing colour.. 
\nEnjoy your finished product while you can.\nThen give it to who you made it for and pretend it's not being destroyed, eaten, and digested.\nBecause that was a lot of work!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [0, 3, 1, 2]\nD: [1, 2, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_82_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_82_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_82_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_82_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [2, 3, 0, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. The ingredients for this recipe are...A dollop or two of whipped cream (from heavy cream)1 0.75 oz packet of hot cocoa mix2 standard candy canes2 shots of espresso (or about 4-6 oz of strongly brewed coffee)1/2 cup of cold milk (any type, including almond or soy). Pour out a bit heavy cream (just a few spoonfuls is fine, enough to have some room for whisking), and use a hand mixer with a whisk attachment to whip your cream to stiff peaks. Put your whipped cream in a refrigerator until ready to use. . Unwrap one candy cane and stick it in a plastic seal-able bag. Push out all the air from the bag, and fold it a couple times. On a non-damageable surface (ie. ground outside, hard stone surface, carpet, etc.) use a heavy object like a hammer or meat tenderizer to crush your candy cane. Make sure the candy is crushed as finely as possible. Then, run the crushed candy through a sieve to remove large particles (the particles should be separated about half and half).. Pull out a large cup. 
The size doesn't have to be very exact. Pour in your finely crushed candy cane (about half of the cane) along with your packet of instant cocoa powder (0.75 oz). Mix the two together, and set aside. . Now, go ahead and brew your two shots of espresso (if you don't have espresso, use strongly brewed coffee). Immediately pour your two shots into your cup, and mix to dissolve the cocoa and candy cane. . Pour out a bit of milk (about 1/2 cup) into a frothing cup, and use a milk steamer to steam the milk. You can use any sort of milk. If you don't have a milk steamer, simply heat it up on a stove. Immediately pour your milk into the cup, and scoop the foam on top of the mixture. . Pull out your whipped cream and scoop a few spoonfuls onto your mocha (however much you want). Sprinkle the top with a bit of your large bits of candy cane, and stick in your other candy cane (unwrapped) for decoration. . And that's it!!If this video/Instructable was helpful, please <3 it, and subscribe. Also, find me on Facebook, Twitter, Tumblr, Instagram, and YouTube @joshpancooking. The greatest gift to me would be if everyone subscribed to my YouTube Channel. Well, thanks for your time, and I shall see you all soon. 
Goodbye!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [2, 3, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_83_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_83_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_83_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_83_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [3, 2, 1, 0]\nD: [1, 2, 3, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. ToolsCake board to suit Large knife Small knife Rolling pin Small decorating paint brush Silicon fondant mat (or baking paper works just as well)Bamboo skewerCakeReady made, or make your own cake, in this instance I used a ready made Sponges.2 x 20cm x 10cm x 3cm slabs and a 15cm x 8cm x 8cm slab.DecoratingAs with the cakes, you can make your own fondant, butter and piping gel icing, but I chose to pre-purchase my fondant and butter icing as mixing black icing is time consuming and messy. I made the piping gel icing as it was unavailable where I live and it was simple enough to make (see later steps).Butter icing Black Fondant icing White Fondant icing Red Piping Icing Brown food colouring Red food colouring. Prepare the boardClean the board and place a little butter icing where the cake will be placed to stop the cake from sliding off.Shaping the cakeWith the larger sponge cake(s) if needed remove any parts of the cake that need to be joined together so there are no cooked edges are on the internal parts of the cake. Level off the top of the cake with a large knife. 
Create the shape of the you want, in this case I went for an oval shape as I could only get a round cake board. Clean any cake crumbs from the board and cake.Butter icingUse a small spatula or in my case a butter knife to apply the butter icing. Note: be careful with butter icing on freshly cut sponge cake as it is very crumbly, and edges are prone to breaking off.Cover the entire cake in butter icing, then smooth the icing on the cake. Note: this can by made very smooth using a cup of warm water to dip the knife in and then smoothing the icing.Add the cake mask base and final butter icingWith the smaller sponge cake cut the cake into the desired oval shape and size. Place the cake mask base on the already iced cake. Butter Ice the cake mask base cake and smooth. Clean the cake board.. Note: Keep any fondant icing not in use wrapped in food wrap or a resealable plastic bag. Fondant icing is very pliable when warm, you can warm it up easily by kneading it in your hands.Prepare the fondant icingI'm too lazy to make fondant icing myself so I buy it in prepared colours from a range of different stores. To use this icing well as it has been made some time ago so it needs to be worked a little to make it warm and pliable again, I do this just by kneading it in my hands and on the bench, this is also a great time to add any colouring to change colours as desired.Roll the fondant icingAs I need the icing to cover a cake it needs to be rolled out, doing this I flatten the icing out a little before starting to roll it out on the fondant mat. Begin rolling the icing out, once have reached a thickness of about 5mm, add a second fondant mat to the top of the rolled out icing and continue to roll. Every so often flip the icing over, remove and replace the icing mat to keep icing being rolled out level and to stop it sticking to the fondant mat. 
Once a thickness of around 2-3mm has been reached make sure your icing is big enough to cover all of the cake, including the curve of the mask base, the top and the sides of the cake.Placing the fondant icingRemove the top fondant mat. Place the rolling pin at one end of the fondant icing Roll the fondant icing and the bottom fondant mat around the rolling pin Pick up the rolled up fondant icing in the fondant mat Place the loose end of the fondant icing at one end of the cake, with enough fondant icing to cover the side of the cake. Unroll the fondant icing so it sits on the cake, with the fondant mat on the top. When all the fondant icing is unrolled, you should be able to remove the fondant mat. Smooth the fondant icing onto the cake, carefully working the icing into edges and grooves with the sides of your hands. Any creases or bubbles should be able to be smoothed out by gently lifting the edges of the icing to the crease and gently replacing. You can also use tools to press the icing gently into any edges if you wish.Removing excess fondant icingOnce happy the fondant icing is fully in place, with a butter knife or similar cut the excess fondant off the cake board leaving around 3-5mm of icing from each edge of the cake. Remove the excess fondant icing. Check the removed fondant icing and keep any that does not have any cake crumbs.. Use a reference picture to create your icing mask, I was lucky enough to have my printed reference picture just the right size to use as a template to create the icing mask.Roll the fondant icingSame as the previous steps, although this time use white icing and make it around 8-10mm thick.Cut the icing mask shapeCut the mask shape out from the template using a small knife. 
Cut the eye holes Mark all the air holes in the mask by piercing with a skewer Mark the painted areas on the mask by tracing the are with a skewer without piercing the paper Remove all excess paperCreate the mask icing mask air holesEither use a sharp small tubular device (I used a small syringe tube with the end cut off and then slightly sharpened) to shove into the icing at each marked place and then remove the excess piece, or bore a hole with a skewer and make it larger (this can create issues with irregular sizes and pushes icing around).Add fondant icing nose and eye definitionsUsing a small piece of white fondant icing, create a small nose shape. Using small piece of white fondant icing , create a thin roll of icing and cut it into two pieces. Turn the fondant mask over Using a brush add a thin layer of water onto the areas where the nose and eyebrows are the mask. Attach the created nose in position. Attach the two rolls of icing around the top part of each eye hole. Allow water to dry for a couple of minutes Turn the fondant mask back over.Place fondant icing mask on the iced cakeThis should be simple enough by sliding a spatula or a piece of cardboard under the mask. When in the required position, lower the mask close the the cake and slide the mask into position onto the cake.Finishing touchesRound the edges of the outside, eye holes and air holes of the icing mask. Paint the marked areas with a small brush and undiluted red food colouring. Using a small knife create marks on the icing mask to look light damage to the mask. Using a piece of paper towel lightly brush around the edges of the eye holes and sides of the mask to create areas that look dirty, and brush into the gouges created earlier.. I modeled the knife after a hunting style knife I thought seemed appropriate to match the cake.Knife Blade and GuardMake some grey icing by taking some of the premixed white icing and add a little of the premixed black icing and thoroughly knead them together. 
Add the black icing sparingly as a little goes a long way, but remember you can always add more white to get the colour you want.Side step... You can skip the above colouring of the icing and leave the icing white and just use some edible silver paint for a metallic or chrome effect to paint the icing once shaped.Roll the grey icing until it is around 5mm thick. Cut the icing into a knife shape, bit of an elongated triangle really.Shove a bamboo skewer through the middle of the knife, protruding through each end to allow the pointed end to be stuck into the cake, and the other end to have the Guard and handle attached.Roll what is to be the 'bevel' edge of the knife so the edge is around 3mm angling up to the middle of the knife where it is to remain at the 5mm thickness making it look like a blade.Once happy with the 'bevel' of the blade finish shaping into a knife. Finally add a line down the middle of the knife along the start of the 'bevel' to the 'sharp' edge of the knife.If you like you can add the wire cutters to the back of the knife blade by removing small triangular pieces from the back of the blade.With a remaining piece of the 5mm rolled grey icing cut a rectangular piece that will be the knife guard.Finish shaping the guard by rounding the corners.Place the Guard onto the bamboo skewer on the blunt end of the knife to act as the guard.Knife HandleTake some black icing and roll into a cylinder to match the size of your knife blade you have made out of icing.When you have the required diameter of your handle trim each edge of the handle to the required length with a real knife by putting the real knife sharp edge lightly on the icing handle and rolling it back and forth adding more pressure to the cutting edge of the real knife slowly working through the icing handle until you have cut all the way through.Repeat for the other end of the icing handle.Add the grip to the icing handle by repeating the above steps to make slight indentations along the icing 
handle in regular intervals about 0.5mm deep.Place the icing handle on the bamboo skewer completing your icing knife.Insert the Icing Knife into the CakePick up the icing knife and gently insert it into one of the eye sockets sinking the knife slightly into the cake. WARNING: Any attempts to vigorously stab or throw the icing knife may result in damage to the icing knife.Note: As you may notice from the pictures my end product knife seems quite a bit shorter than the step by step photos, Yes I did redo the knife, but not the photos, also this is in no way related to the final warning.. Thanks to Google and McGreevy Cakes I was able to locate a suitable recipe for making my edible blood, which was made from Piping GelIngredients\u2153 cup granulated sugar1 Tablespoon Corn Starch\u00bc cup lemon juice (but I\u2019ve used just a dash of lemon extract in a pinch and it works fine)\u00bc cup waterRed food colouringInstructionsAdd all the ingredients except for the food colouring into a pot (saucepan)Mix over high heat until boiling, and then cool.Tint as desired!Make a piping bagGrab a mug or a cupGrab a plastic freezer or sandwich bag and place it into the cup, roll the edges of the bag over the edges of the cupWhen cool, pour the Piping Gel Mix into the bag inserted in the cupSeal the bagPick up the piping bag and when ready cut a small section off of one corner and then let the piping gel pour into the cake, icing knife and partially on the cutting board to look like blood.. 
Unfortunately I didn't get to see the kids reaction to the cake but I was informed they were suitably impressed and disgusted at the sight, but loved the taste.Thanks for readingDale\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [3, 2, 1, 0]\nD: [1, 2, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_84_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_84_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_84_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_84_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [1, 2, 3, 0]\nD: [0, 2, 1, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. One bread knife, one or more round rolls (buns), a number of eggs equivalent to the number of rolls.. Cut off the top of the roll.. Take out the inside of the roll. Do not take too much as the egg will leak. There has to be enough room to accommodate one egg and some spices and top ups.. Load the egg into the roll and top it up with your favorite spices. I have chosen a bit of salt, loads of white pepper and loads of red paprika powder. U can use chilli peppers, I would go for Scotch Bonnet if I had some, (u have to remember that they r very hot!!!) cut into small pieces, without any problem making the dish very Mexican ;). I added a bit of low temperature melting fat chase as well. You egg roll is ready for the oven.. Load the roll into the oven for about 20-25 min @180 centigrades thats 356 Fahrenheit. The time will vary depending on the oven type, forced air circulation etc. so you will have to experiment with your oven go get the desired effect. 
I usually try to have the egg roll very soft inside with the liquid yoke and the white barely done. You simply have to observe the time required for the desired effect in your oven so next time you will be able to set the timer up.. You egg roll is nice and ready. Because no fat is used for the egg processing the white is going to be very delicate and hot, so you will have to wait a bit longer than with the regular fried egg as the bun works like a thermo flask keeping the egg nice and warm for longer. Load your favorite ketchups and garlic mayonnaises or mustard and enjoy your extremely easy but tasty and good looking appetizer.That's it hope you will like it.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [1, 2, 3, 0]\nD: [0, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_85_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_85_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_85_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_85_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [3, 2, 1, 0]\nD: [1, 3, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Boil a chicken in a large stock pot with water and a quartered onion. (or your favorite chicken stock ingredients). Use a food processor to cut carrots, broccoli stalks, onions, celery and broccoli florets. Items should be cut fairly finely. Pulse each item for best results and set aside in a bowl. Keep broccoli florets separate.. In a large skillet melt a stick of butter. Place, carrots, celery, broccoli stalks, and celery and saut\u00e9 until soft. Do not add the broccoli florets yet.. 
Add veggie mixture to a soup pot and pour in broth. If your chicken stock isn\u2019t enough liquid you can add stock from cans as well to supplement. Add black beans.Add white wine. I use cheap Trader Joes wine (thanks almost 2 buck chuck!) Add salt and pepper and a little hot sauce, or if you only have siracha that will do to. Add a pinch or dried oregano and I had some thyme in the garden, you can also use dried thyme too. Remember, with seasoning best to start with less and always add more. Let this simmer for at least 30 minutes.. Take the chicken from the stock (it should be fully cooked) and tear off the meat and cut/tear into little pieces.. Add the chicken and the broccoli to the soup pot. Add liquid smoke, which is like a cheat for not actually having to smoke a chicken but still getting that awesome flavor! Add Worcestershire sauce and heavy cream. Stir and serve hot with bread!Please subscribe and check out my other Youtube recipes and fun DIY projects :) Including this awesome kitchen remodel I did!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [3, 2, 1, 0]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_86_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_86_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_86_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_86_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [2, 1, 0, 3]\nD: [0, 2, 1, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Liquid Nitrogen- 10-15 liters1/4 cup Torani Bacon Syrup\n1 cup heavy cream\n3 cups half & half\n8 egg yolks\n1 cup sugar\n1/8 tsp saltSafety Gloves\nSafety Goggles- with splash guard\nSqueeze Bottle or Marinade Injector\nIf using marinade injector, do not attach needle to tip. To make the bacon ice cream base, combine the heavy cream and half & half in a large pot. Bring mixture to a simmer, stirring occasionally.\u00a0 Once mixture begins to simmer, turn heat to low.\nNext, whisk eggs, sugar & salt in a bowl. Then gently whisk in 1/4 cup bacon syrup. Whisk 1/2 cup of the hot cream mixture into the bacon/egg yolk mixture.Repeat three times, whisking in 1/2 cup of the hot cream mixture each time. Next, return mixture to the pot with the remaining hot cream and raise the heat to medium low. Stir frequently for 5 minutes or until the hot cream mixture coats the back of a spoon. Strain mixture into a bowl and set aside for 20 minutes.. Fill marinade injectors or squeeze bottles with the bacon ice cream base.\n**Remove the needle tip before squeezing into the liquid nitrogen. Removing the needle tip will result in more uniform droplets**.. Put on your safety goggles and gloves. Carefully pour the liquid nitrogen into a large saute pan.\nYou can remove your safety gloves now. The technique requires that you move quickly, but also methodically.\u00a0 Using your marinade injector or squeeze bottles, hold the tip very close to the surface of the nitrogen and carefully squeeze out a drop a little smaller than a juniper berry. Each drop should be about the same size. Move about 1/2'' each time your squeeze a new droplet so that the drops don't land on top of each other.\u00a0. Time to indulge! Kitchen Science has transformed a classic savory flavor into a whimsical satisfying savory/sweet dessert. These little dots of bacon perfection will leave your taste buds begging for seconds, thirds..and soon they'll be all gone.. 
I originally tried to make the Dippin Dots in small stainless steel bowls. This resulted in the dots clumping together. I also originally used the marinade injector with the needle tip attached which also resulted in the dots clumping together.\nThe large saute pan I used to make the final Dippin Dots was the perfect vehicle for this project. It allowed adequate space for the droplets to form. Removing the needle tip allowed for more control over the injector and uniformly shaped dots.\nIn the future it would be interesting to try pouring the ice cream base through a colander into the nitrogen.\u00a0\nAny feedback or suggestions welcome!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [2, 1, 0, 3]\nD: [0, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_87_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_87_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_87_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_87_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [3, 2, 0, 1]\nD: [3, 1, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Homemade soup served by the hands of a loving wife or mother has been used as a home remedy with remarkable results. It\u00a0soothes\u00a0the throat and warms the body. \u00a0Canning large batches will serve you well when a family member comes home sick. \u00a0. \u00a0How many of you have ever tasted home canned food? The truth is home canned foods \u00a0have more nutritional value than store bought foods. You control what goes into the foods you can. 
It is very beneficial to can what you grow yourself, \u00a0because most farmers use harmful chemicals on their fields. If you can't grow it yourself consider buying produce that is grown organically. The flavor of home grown and canned produce is amazing!\u00a0 I grew up in a time when many people were still growing and canning their own produce. I know what a real dill pickle taste like and what you buy in the stores today don't even come close!\u00a0 Canning\u00a0 takes \u00a0time but if your time is limited consider growing your own garden and freezing what you grow. The benefits are worth the extra effort.\nIn this guide I have canned Grannie's soup recipe the lazy way. I canned the soup but have frozen it instead of using the pressure canner or pressure cooker method. This is an inexpensive way to get started and see if it is something you might be interested in doing. From there you will gain confidence and may decide to go for the real deal. I personally have canned fruits and jellies but have never attempted canning meats. Canning some foods require education because of the dangers involved if you don't do it properly.. \n\tThis is what you will need to make the soup:\n\t1 Boiled whole chicken adding only salt when cooking it.\n\tSave all the chicken broth.\n\tRemove the meat using a strainer if you have one, save and freeze the skins and bones if you have dogs or cats. I will show what to do with them later.\u00a0\n\tCut chicken in small bite size pieces.\n\t1 cup peeled chopped carrots.\u00a0\n\t1 Cup chopped celery.\n\t1 Cup chopped onion.\n\t1 Chopped jalapeno.\n\t4 garlic cloves.\n\t1 Lemon juiced. This is to add to the soup after it is cooked.\n\t2 Cups of fresh chopped tomatoes.\n\t1 Cup chives I used the tops of 6 green onions because I did not have chives.\n\t2 Chicken Bouillon cubes.\n\tI used curly noodles but you can add egg noodles as well.\u00a0\n\tThe secret to this recipe is use as many green spices as you can. 
I use what I have on hand.\u00a0You can add just about any kind of vegetable to this recipe and receive benefits from it.\u00a0 This is the recipe we have used for a very long time.\u00a0 I often use what ever I have at the time.\u00a0 Nothing is in stone.\u00a0 You can add parsnips, sweet potato and turnips for even better results.\u00a0 I did not have any on hand.\u00a0\n\tSpices:\u00a0 I adjusted my recipe for a larger group of taste buds.\u00a0 I like mine more seasoned and with more pepper.\u00a0Taste it after you add everything and adjust it for your taste buds.\u00a0 The more spices the better it works.\u00a0\n\t1/8Th Teaspoon of each of the following as desired:\n\tBasil\n\tParsley\n\tOregano\n\tPaprika\n\tChili Powder\n\tBay Leaves\n\tSage\n\tCumin\n\tRed pepper\n\tCilantro\n\tItalian seasoning\n\tDill weed\n\tCinnamon\n\tNutmeg\n\tSea salt\n\tPepper if desired\n\tYou may omit the peppers if your family is sensitive to it. Peppers help clean out the sinuses.\n\tUtensils:\n\t1 Large stock pot\n\t1 Large spoon\n\t1 Medium funnel with large opening\n\t1 Sharp knife\n\t1 Cutting board\n\tMixing bowls\n\tFood strainer if you have one.\n\tClean canning jars or heavy jars and lids with wide mouths. If this is your first time freezing in a jar just can/freeze a few to get the feel of it.\u00a0\n\tPlastic bags the number of jars you will be freezing.\n\tPlease note:\u00a0 If you are a\u00a0vegetarian you may substitute the chicken broth for a vegetarian broth and add rice and beans to make a complete protein.\u00a0\n\t\u00a0. Place the broth in the stock pot or cook it in a crock pot.\u00a0\nAdd all the spices.\nAdd the chicken.\nAdd all the vegetables reserving\u00a01 cup of\u00a0the tomatoes and a few green onion tops or chives for garnish.\nStir well.\nTurn on the burner and cook until the carrots are done but not over cooked.\nAdd the lemon juice to the cooked mixture.. 
Add the remaining tomatoes and chives to the jars.\nDo not fill the jars above the neck line. Leave at least 1 inch at the top for small jars and 2 inches for larger jars to allow for expansion. If you don't allow enough the jars could break. As it turned out my jars did not expand that much but it is best to be safe than sorry.\nLadle the soup into the jars.\nAllow to cool completely to ovoid breakage.\nWhen they are cooled completely carefully place them in the freezer with the lids off!\u00a0 As a safety measure: Place the jars into the plastic bags to prevent any glass from getting on other foods if the jar breaks.\nAfter they are completely frozen place the lids on the jars and screw down the lids.\nPut back in the freezer. There is no need to place them back into the plastic bags because they are frozen and there is no danger in them breaking.\nThat is all there is to it!\nWhen you thaw out the soup allow it to thaw in a bowl with cool water if you will be around to start cooking it when it is thawed.\u00a0 I personally feel safer defrosting it in the fridge. Avoid rapid thawing to prevent breakage.. I\u00a0promised\u00a0that I would add the link to my chicken soup bones recipe. \u00a0I made a completely different tutorial about how to cook the chicken bones to feed you dog/cat. \u00a0I had been visiting my sister and she was feeding her dogs chicken bones. \u00a0I never knew you could actually safely give them dog bones and they are very good for them. This tutorial also gives tips on how to potty train your dog and useful grooming tips on\u00a0\u00a0friendly products. Step 4 is about the dog food. \u00a0 \u00a0Here is the link on how to safely do that: \u00a0https://www.instructables.com/id/Potty-Training-Grooming-Nutrition-And-Choosing-/. I have pictures here of ways you can package the soup for gift ideas. You can begin to make the soup now and avoid that last minute holiday rush. 
It is important to place a large note on the package and tell them that the jar must be placed in the freezer or fridge asap or eaten within a few days. I know this is a repeat but it is very important and you would sure hate to find out that someone got sick on the soup you canned. The jars are not sealed so they need to be frozen until they will be used. Do not let them sit on the counter all day because bacteria can make you very ill. Thaw them in a bowl of cool water if you are going to be around to check on it often. Otherwise thaw in the fridge. Cook frozen soup as soon as you can remove it safely from the jar.\nFor a care package\u00a0 just add stuff one would take for a cold along with the soup. You can add a little or add a lot. You could make a family package because a lot of times everyone in the family gets sick. You can make the soup in a crock pot and take the entire pot to a sick family. Many different options you could do for this type of gift. Add bath salts recipe here: https://www.instructables.com/id/How-To-Make-Bath-Bombs/\u00a0\u00a0\u00a0 Lip balm: https://www.instructables.com/id/Delicious-Chocolate-Chapstick-Honey-Balm/, \u00a0candle, cough drops how to here:\u00a0https://www.instructables.com/id/Cough-Drops/ , Vapor rub\u00a0\u00a0, Orange juice, Vitamin C, Tea, Get well rock, Throat spray, or footie's just to name a few.\nThere are people who have concerns of storing foods in plastic containers or bags and this is a good alternative for them.\u00a0 You can use plastic to store them in and that is an option you might consider.\u00a0 This is a great way to get you comfortable putting up your own food.\u00a0To freeze broth simply place the broth in the fridge until the fat settles to the top.\u00a0 Skim off the fat and pour\u00a0the broth into a freezer bag and work out the air.\u00a0 Lay flat single layered on the freezer shelf.\u00a0 After it is completely frozen you may stack it to make more room in the freezer.\u00a0\nI am currently 
working on an Instructable using chicken bones for cat/dog treats. \u00a0When it is finished I will add a link here.\u00a0\nThank you for stopping by and have a super day!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [3, 2, 0, 1]\nD: [3, 1, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_88_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_88_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_88_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_88_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [0, 1, 3, 2]\nD: [2, 3, 0, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need:One bowl of flattened rice (poha)One chopped tomato One chopped onion One grated carrot A few peanuts, cashews and sesame seedsA teaspoon red chili powder A teaspoon coriander powderA pinch of chat masala Salt depending on your taste Half a teaspoon of lime juice A few coriander leaves. Take a pan and add a teaspoon of oilLightly saut\u00e9 the flattened rice for few minutes.Remove the flattened rice and add the peanuts, cashews and sesame seeds to same pan.Lightly roast them till they change color.. Now we need to add all the ingredients to the flattened rice and mix everything. First add the chopped tomatoes, grated carrot and chopped onions.. Add a pinch of chat masala, coriander powder, red chili powder and as much salt as you like.. Add the roasted nuts and coriander leaves and mix. 
Finally squeeze a little lime juice and serve immediately\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [0, 1, 3, 2]\nD: [2, 3, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_89_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_89_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_89_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_89_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [0, 2, 1, 3]\nD: [3, 1, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This tree stump is going to be a great place for one of our cute little birds to perch. It's also going to add height to the cake making it appear larger and more complicated than it is! You will need:light brown fondantdark brown fondanttoothpickSCULPTING THE TREE STUMP 1. Cut a 1 1/4 ounce piece each of dark brown fondant and light brown fondant and place on top of each other.2. In a twisting or pulling motion mix the two colors of brown fondant together to marble them slightly. You can see I only have one distinct vein. More would be great!3. Roughly form the fondant into an egg shape and flatten the top by pressing with your fingers.4. Squeeze the sides of the stump to give the stump ridges and to elongate it.5. Press or pinch the bottom of the stump with one finger in random places to give the effect of roots.6. Pinch the top edges to sharpen the 'cut' edge of the stump.7. Using a toothpick, score or scrape lines in the sides of the stump to resemble bark and circles on the top of the stump to create rings.. 
We are going to stain toothpicks with food coloring to make stick legs for our birds!You will need: four toothpickspaper towel or napkinbrown food coloringMAKING THE LEGS1. Dip the toothpicks into the food coloring.2. Remove and place on the paper towel without rubbing off the excess food coloring.3. After about 10 minutes wipe off excess food coloring with the paper towel. The toothpicks should be stained a medium brown color.4. Set aside to dry.. Oftentimes the simplest of shapes make the cutest characters. Sometimes leaving it simple is best! These birds are definitely simple! You will need: light pink fondant - 1 ouncedark pink fondant - 1 ounceorange - pea sized amountbrown - teeny, tiny piece!turquoise - enough for two very small eyesclear vanilla extractfood safe paint brushrolling pinrolling matMAKING THE BIRDS1. Remove a small piece (large pea size) of light pink fondant and set aside for the wings.2. Between two hands, roll the larger piece of fondant into a ball.3. Place on your rolling mat and with your hand roll the piece back and forth with gentle pressure until you start to get a cone shape (without a point).4. Using your fingers, press or pinch the smaller end flat to create the tail and curve up by pressing gently with your fingers.5. Divide the remaining piece of fondant in half and roll one half into a ball. Press to flatten slightly.6. Slightly pinch one side, and curve the pinched/pointed side up slightly. You should have a paisley shape wing. Repeat for the other wing.7. Pour a small amount of clear vanilla extract or vodka into the cap or a small bowl and apply a very small amount of extract (using a food safe paint brush) where you want the wing to be located on the bird body. Gently press the wing onto the body. It should stick immediately! Repeat on the other side of the bird. (Extract will leave a shiny or glossy appearance on fondant so use sparingly and try to be as exact with it as possible.)8. 
Divide the orange fondant in half and set one half aside reserving for the next bird and roll into a ball and form into a cone with your fingertips. This is the beak.10. Apply a dab of extract and adhere the beak to the front or head of the bird.11. Roll a very small amount of turquoise fondant into a ball and flatten. This will be the iris. Apply to bird with a small amount of extract.12. Roll an even smaller amount of brown fondant into a ball and gently press it into the turquoise fondant eye creating a pupil. Since it's so small of a piece it should stick without fondant, but if it doesn't, secure it with extract. Repeat steps 11 and 12 to make another eye for the back side of the bird.13. Repeat the entire process to make one more bird in your choice of color!. There are tons of ways to make fondant roses. I will cover three uncomplicated ways to make cute roses to top our cake with!You will need:light yellow fondant (1 ounce)dark yellow fondant (1 ounce)small round cutters - 1 inch and 1 1/4 inch (you can also use a shot glass, biscuit cutter, or make a circle pattern from thick paper and trace around it with a knife)paring kniferolling matrolling pinpetal foam (or any craft foam)ball head sculpting tool (you can also use a rounded end drink stirrer, a melon baller or a dry finger)FLOWER #11. Roll out a small piece of light yellow fondant and cut out six small circles.2. Using your finger, press the edges of each circle to thin them.3. Line the circles up and gently press them together at each overlap to secure.4. Flip the line of circles over so the wrong side is facing up.5. Roll the circles up from one end to the other.6. Cut the roll in half to create two roses.7. Pinch the bottom of the roses to open them slightly.8. Peel back a few of the petals to make the rose look more natural. Set aside to harden. Repeat to make more roses.FLOWER #21. Roll out a long piece of light yellow fondant and cut a 1/2 inch by 6 inch strip.2. 
Remove excess fondant and set aside.3. Press ridges in one side of the fondant with your finger. Continue along the entire edge of the strip.4. Pick the fondant up and roll the strip until you reach the desired size. If 6 inches is too long or makes it too thick of a flower you can cut it short anywhere you would like.5. Pinch or squeeze the bottom of the rose to make a stem and open it slightly.6. Cut off the stem so the flower will sit upright on the top of the cake. Set aside to harden. Repeat to make more roses.FLOWER #31. Roll out a piece of dark yellow fondant.2. Cut ten to fourteen, 1 1/4 inch circles and remove excess fondant.3. Roll a small sized piece of fondant into a ball. About the size of a large marble.4. Roll one end making it into a cone shape. The same way we made the bird! This will be the center of the rose. Set aside.5. Place one circle on the petal foam and in a back and forth motion with the ball head tool flatten and ruffle the edges. Repeat with remaining circles.6. Wrap one circle around the center piece of the rose.7. Repeat with remaining circles placing them randomly so the seams don't match up. You can make the rose as small or as large as you want. Fold the petals back slightly to give the flower an open look.8. Pinch the bottom edge when finished to create a stem and cut the bottom so the rose will sit flat. Repeat the process to make more roses. . If you haven't already done so, mix the colors of fondant you wish to use in the pattern of the cake. I used eight different colors (nine shown) but more or less would be fine.This is where you get to be creative and think about repeating patterns. What shapes and sizes do you want for your cake? I chose a superellipse shape and used all the same size but you could experiment with various sizes of the same shape for a unique effect. 
Examples of other shapes that would make fun repeating patterns are: rhombus, stars, heart, circle, octagon, crescent, parallelogram, etc.You can also\n find a lot of cutter options in the clay modeling section of the craft \nstore. Let's get started!You will need:geometric cutter (I got my superellipse from a small set of inexpensive cutters from the Duff Goldman collection that I found at Micheal's.)rolling pinrolling matparchment or waxed paper lined sheet pan (You can also just use your countertop or a table!)MAKING THE PATTERN1. Roll out one color of fondant very thin on your rolling mat using a rolling pin. When I say 'thin' it should basically be as thin as possible without tearing.2. Using the wrong side of the cutter cut out 15-20 pieces or shapes. If you use the wrong side (folded metal side) of the cutter you will get an exact shape with sharp corners. If you use the correct or sharp side of the cutter you will get rounded edges. This drives me nuts, but some people don't mind at all!!! You can see the difference in the picture (left cut-out = wrong side of cutter; right cut-out = right side of cutter). If you are using a cutter with rounded shapes like a cloud or a flower it won't make much of a difference which side you use.3. Smooth the sides if any rough edges are present by folding or gently squeezing the frays to the back side of the shape. 4. Transfer pieces to a baking sheet lined with parchment (or waxed) paper. Make sure they are not touching so they don't stick together! If you don't have a baking sheet you can place the parchment directly on your table or countertop in a place that won't be disturbed.5. Repeat with remaining colors!. Here we will follow simple steps to apply a repeating pattern to our fondant cake.You will need:fondant covered cakeparing knifefondant shape cut-outsclear extractsugar pearlscake stand or base1. Cut one of the fondant pieces in half from point to point using a sharp knife. 
Do this with one of each of the remaining colors equaling 18 halved pieces total. You may need more later but this is a good starting point for now.2. Apply a small dab of clear vanilla extract to the back of the fondant piece and place the top flat edge in line with the top edge of the cake.3. In a random pattern of colors apply pieces straight down in the same manner cutting the bottom most piece as necessary. I only needed to cut a tiny little piece off to make it fit at the bottom edge of the cake.4. Continue around the entire cake trying to keep the pieces straight and the colors in a random pattern.5. Once you have finished applying the pattern press a sugar pearl into each corner where the pieces meet. If the pearls don't stick by pressing into the fondant you can apply a small amount of extract to get them to stick. Sugar pearls will acquire a strange thick gooey film where they come into contact with extract so use only if necessary!!We are ready to decorate and complete the cake!. Arrange flowers, birds, and tree stump in any manner you wish! I ended up using two birds, four roses, and six small flower cut-outs with pearl centers (same flower technique as from the drip cake) on the top of the cake. Here's an idea of how to arrange your fondant decorations:1. Decide which side you want to be the front of the cake.2. Place the tree stump toward the back of the cake just off center to the right and place a large rose on the edge of the cake near the front right. Add some small roses and flowers around in groupings or any arrangement you like. Secure with extract.3. Push toothpick legs into the birds. I left off the back wings so the bird could harden slightly when I was working on other things.4. Stick one bird so it is standing on the stump and one toward the front left. Adhere the back wings on each bird with extract.Congratulations!! You should be very proud of yourself! You have completed a beautiful decorated cake that is ready to serve! 
If you don't plan on serving the cake right away, leave it out at room temperature for up to 3 days until you want to slice it and serve it. Do not refrigerate!If this cake is for a birthday party and you choose to put candles in it they will easily poke through the fondant and stand securely.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [0, 2, 1, 3]\nD: [3, 1, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_90_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_90_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_90_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_90_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [0, 2, 3, 1]\nD: [1, 0, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1 (10 oz.) bag mini salted pretzels (~3 c. of mini pretzels)\n2 c. toasted oat cereal (such as Cheerios)\n2 c. crispy corn cereal squares (such as Chex)\n2 c. mini marshmallows\n1 (8.2 oz. bag) candy coated chocolate candies (such as M&Ms - I used peanut M&Ms for this version)\n1/2 c. salted peanuts\n1 (11 oz.) bag white chocolate chips\n2.5 tsp. vegetable oil. Line 2 baking sheets with wax paper/parchment paper and set aside. This is where the snack mix will go when you are done mixing it up so it can cool and set.. Combine all ingredients in a large bowl EXCEPT for the white chocolate chips and vegetable oil. Make sure that the bowl is large enough!\nIn separate microwave-safe bowl, heat white chocolate chips and vegetable oil on medium-high heat for 1 minute, stirring once. 
Heat for 10 second intervals, stirring after each, until chocolate is smooth and melted.\nPour chocolate over cereal mixture and stir until evenly coated (don't forget to make sure all of the goodies at the bottom get coated!).. Spread mixture onto prepared baking sheets and let cool. Break apart once mixture is cool and put into serving bowl, cellophane bags with ribbon, etc. Store leftovers in airtight container (not like there will be any left after everyone tries it!).\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [0, 2, 3, 1]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_91_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_91_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_91_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_91_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 3, 1, 2]\nD: [2, 3, 1, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients:1. One pack mutton knuckles2. One can whole kernel corn3. One chopped onion4. Four small chopped potatoes5. Olive oil6. Your favorite spices and herbs7. Half a cup rice8. Salt. 1. Wooden spoon2. Can opener3. One pot. Add to your pot a little bit olive oil, and then fry your chopped onion in it. You can add spices or herbs.. Add the mutton knuckles to your chopped onions and fry for a while.. Do Not stir!. After your meat has cooked for a while add your chopped potatoes and rice. Add also your choice of some salt,spices and herbs. Add a bit of water. Don't over do it.. Let it cook for about an Hour. You can add water if needed. Don't add to much, but add enough for the rice to cook. 
Do Not stir!. Have fun, just don't let your towers fall over!. After waiting, open the can and add corn to pot. Note: Dispose of all the liquid before adding. Stir in gently.. Enjoy your one pot meal! Eat it with anything you like, or enjoy it alone.Please vote for this Instructable!Thanks for reading through my Instructable. Please try it out and tell me what you think about it in the comment section.Check out our other Instructables:Easy OmeletteLichen GardenEasy Lemon Peel JuiceReal Chocolate BrowniesYou can also check out my forum topic:The Powerpoint Game Community\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 3, 1, 2]\nD: [2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_92_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_92_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_92_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_92_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [3, 0, 1, 2]\nD: [2, 3, 1, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. I dropped the rock crab into boiling water and cooked them for 10 minutes. I strained the water, and cleaned them. The claws are where most of the meat is found. So tap these with a hammer or something hard to crack them, then clean out the crab meat. The legs are pretty small, so these I tossed whole into the gumbo. The bodies were also a little small to clean and a lot of meat was left in the claw shells, so I put these into a pan with five cups of water and let it simmer. The kitchen began to smell swampy.... Brown your sausage-I had about 1 pound. 
Remove from pan and then add 1 onion, 2 green peppers, and 4 stalks of celery. Sautee these. . To your pork stock, add 1 can tomato sauce, or canned diced tomatoes and 1 can tomato paste. Combine pork stock and crab stock (after you strain out the crab shells) in one large pan, then add the sausage vegetables and spices. (2 bay leaves, 2 tbsp. paprika, 2 tbsp. red chili flakes, 1 tbsp. salt, 2 tbsp. black pepper.) . Put 1 cup of pork lard in a frying pan and melt on low heat. Slowly stir in 1 3/4 cups of flour. Continue to stir and cook for 20 minutes. . Add the crab meat, 1 pound of slice okra, and 1 pound of collard greens cut into ribbons and simmer for 45 minutes until flavors blend. If you live somewhere where they sell fil\u00e9, or (sassafras leaves), then add this spice towards the end. I live in San Francisco and couldn't find it anywhere, so I added spices to taste. . The gumbo lets you stretch the crab far! I was able to serve about 25 people gumbo over white rice. \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [3, 0, 1, 2]\nD: [2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_93_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_93_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_93_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_93_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [1, 0, 3, 2]\nD: [2, 3, 1, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. We start with the ingredients for the pastry cream: 4 egg yolks, 1/2 C sugar, 1 1/2 C whole milk, 1/4 C corn starch, 1 vanilla bean, and 2 Tbs unsalted butter.. 
Whisk together the yolks, sugar, and corn starch. Whisk vigorously until the stiff mixture loosens up and turns pale yellow. Then simmer the milk, and very slowly whisk the hot milk into the bowl with the egg mixture. Stir constantly until well combined.. Pour the mixture into a medium sauce pan and place on the stovetop over medium heat. Whisk this mixture vigorously and constantly. As it comes to a boil, it will start to thicken. Remove it from the heat as soon as you feel it tighten. Slice open the vanilla bean, remove the caviar, and add it to the cream along with the butter and whisk to combine. Place the cream into a bowl and cover it with plastic wrap, placing the wrap so it touches the cream. This will prevent a skin from forming.. Take one thawed piece of puff pastry dough (one sheet from a standard package of frozen dough). Cut eight ten centimeter circles with a cookie cutter. Take a smaller cookie cutter and cut the insides from four of the eight circles, to form four rings.. Place the four circles on a greased sheet pan. Brush them with egg wash (1 egg mixed with one Tbs of water). Place the rings on top of each circle and brush those with egg wash too. Let these rest on the counter for about 15 minutes while you preheat the oven to 400 degrees. Bake them for 20 minutes on the center rack.. While the Vol au Vents bake, make the coulis. A coulis is just a fruit puree that's used as a sauce. To make it, heat a pint of fresh raspberries (reserve four berries for garnish) in a small saucepan. Add 1/3 cup sugar, the zest of one lemon, and the juice of half of a lemon. Use a fork to smash the berries. Add a sprig of thyme, leaving it whole. Remove the sauce from the heat and let the thyme infuse it as is cools. Then remove the thyme and discard. Place the sauce in the refrigerator until you need it.. Pull a bit of pastry gently from the middle of each vol au vent so you have space for the cream. 
Spoon the cream into a freezer bag and snip a 1/2 inch opening into one of the tips with scissors. Pipe the filling into the pastries. Garnish each of four plates with some of the coulis. Add a raspberry and some fresh thyme leaves. Serve and enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [1, 0, 3, 2]\nD: [2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_94_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_94_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_94_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_94_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [0, 2, 1, 3]\nD: [1, 3, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. First let's preheat the over to 350 degrees F. Then let's start by creaming the melted butter with the sugar. . Next we add our eggs and mix it together. . Now we add the vanilla extract and mix it in. . Next we combine the mixture we just made with our applesauce and mix it together. Just until incorporated, we don't want to go crazy with the mixing. :). Now we add our cinnamon and baking soda to the flour and whisk it together. Or if you like, you can use a sifter to mix the dry ingredients. Then add the liquid mixture to the flour and mix it together until you get a nice batter/dough. . Now we fill up the muffin cups about 2/3 to 3/4 of the way full depending on how large you want your muffins. Then put them in the oven to bake. 350 degrees F for 20 to 25 minutes. They will be nice and golden brown on top. 
Poke a tooth pick in and if it comes out clean and dry they are done, if they have a bit of batter on them, bake it for a few more minutes. . Now all you have to do now is enjoy them!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [0, 2, 1, 3]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_95_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_95_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_95_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_95_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [2, 1, 0, 3]\nD: [1, 2, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \n List of groceries:\n\n\t\t1 Can whole peaches if you can find them. \u00a0I used peach halves.\n\t\t1 Small bag\u00a0spaghetti.\n\t\tButter to taste.\n\t\tSeveral Goldfish Colors.\n\t\tSeveral\u00a0raisins.\n\t\tSeveral dried cranberries.\n\t\tYour choice of mixed vegetables for sea plants. \u00a0( I used carrots, onions, purple cabbage and parsley because that is what I had. )\n\t\tParmesan\u00a0cheese ( shredded.)\n\t\tParmesan cheese grated.\nPlease note: \u00a0I have\u00a0included a couple of hot dog versions. \u00a0You can see from the pictures I added a hot dog instead of a peach. \u00a0I also added different vegetables,\u00a0Ramon noodles\u00a0and a\u00a0bento\u00a0box.\u00a0It is basically the same idea though. \u00a0. 
For the octopus in a bowl you will need:\n\t\tKnife.\n\t\tCutting board.\n\t\tPan.\n\t\tColander.\n\t\tClear bowl.\n\t\tFork or spoon to stir.\n\t\tTooth Pick for poking the holes for eyes and mouth.If you want it in a\u00a0pedestal you will need this in addition:\n\t\tClear Pedestal bowl.\n\t\t2 Gallon zip lock bags.\n\t\tBlue food coloring.\n\t\tLED light.For the jar specimen you will need:\n\t\tEverything for making the clear bowl octopus plus:\n\t\t1 Small jar with lid.\n\t\tScissors.\n\t\tTape.\n\t\tBlue construction paper.\n\t\tStraight edge.\n\t\tLed light ( optional ). Spaghetti:\n\t\tCook the spaghetti according to the directions.\n\t\tStrain off the water.\n\t\tPlace in a bowl not the container it will be displayed in.\n\t\tAdd butter.\n\t\tCheese.\n\t\tSeasonings.\n\t\tStir well.\n\t\tPlace the\u00a0spaghetti carefully into the display bowl. \u00a0\u00a0\nIf you decide to add cooked vegetables instead of the raw go ahead and cook them at the same time you make the\u00a0spaghetti.. Center the octopus in the bowl:\n\t\tWith a tooth pick make the eye and mouth holes.\n\t\tPlace\u00a0raisins\u00a0and cranberries in the eyes and mouth. \u00a0\n\t\tYou might need to cut them smaller.\n\t\tPlace the octopus where you want.\n\t\tArrange the vegetables to look like a ocean scene.\n\t\tArrange the fish crackers in the front.. Jar:\n\t\tIf using hot dogs, cooked vegetables, or \u00a0noodles go ahead and cook them. 
\u00a0It is best to boil the hot dogs.\n\t\tWhile they are cooking start making the back drop for the jar..\n\t\tCut the construction paper half way around the jar so the octopus can be seen.\n\t\tTape it to the outside of the jar with the best side of the jar to the front.\n\t\tPlace the spaghetti or noodles in the jar carefully not to smudge the sides with oil.\n\t\tCut the legs for the dogs by looking at the pictures.\u00a0\n\t\tPlace the raisins and cranberries in the eyes and mouth of the peach or hot dog\n\t\tSet the octopus in the jar as desired.\n\t\tPlace the vegetables around the octopus.\n\t\tSet the jar on top of a LED light (\u00a0optional ).\u00a0\n\t\tAdd lid.\nI used\u00a0Vienna\u00a0sausages for the jar but\u00a0they were harder to work with.. \n Pedestal arrangement:\n\n\t\tFollow the directions for making the octopus in the previous steps.\n\t\tPlace the spaghetti in the\u00a0pedestal\u00a0bowl.\n\t\tArrange the vegetables around the octopus.\n\t\tPlace the Led light in the bottom of the stand.\nShown are 2 different arrangements. \u00a0 I filled 2 zip lock bags with water and blue food coloring.\nI double bagged it to be safe. \u00a0I really liked the fish crackers. \u00a0I think it brought a lot of life to the arrangement. \u00a0. Here is the Bento style:\nI arranged it in the same way as I did the other arrangements only I used a Wendy's box. \u00a0I thought it turned out real cute. \u00a0I only cut 6 legs because the\u00a0Vienna's\u00a0\u00a0were too brittle. \u00a0. These were a lot of fun to make. \u00a0I liked all the different variations. \u00a0The\u00a0Vienna's were harder to work with though. \u00a0You might have noticed that I braided some of the spaghetti. I think it was hard to braid because I cooked them a little too long. \u00a0I wanted to use the onion to tie ribbons on the ends of the braid but they did not work very well. \u00a0I tried celery but they did not work either. \u00a0If you have any suggestion please share them. 
\u00a0I think it would be awesome to have made them with braids and bows. \u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [2, 1, 0, 3]\nD: [1, 2, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_96_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_96_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_96_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_96_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 3, 0]\nD: [0, 2, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Before you start, it is important to get an assortment of different kinds of apples. Throwing golden delicious into the mix is always a good choice.. For the filling:6 Assorted apples1 Tablespoon of flour1 teaspoon of cinnamon\n1 Tablespoon (real) maple syrup\n(not pictured)1 Tablespoon of cold butter\n(chopped)2 teaspoons of fresh lemon juice\n(1/2 teaspoon of grated lemon peel - optional)For the crust (top and bottom):4 cups flours2 sticks cold butter (1 cup)1 teaspoon salt1 cup ice water. When making this pie, I made two separate balls of dough. One for the top, and one for the bottom.Alternately, I could have made one large ball of dough, and then separated it into two smaller balls.For the the first ball, we will mix together 2 cups of flour and 1/2 a teaspoon of salt. . Once the salt and flour are sifted, the next step is to slice a stick of cold butter into small pieces and mix them in with a pastry blender. Set aside the butter wrapper.. Mix ice cold water into the dough one tablespoon at a time.Start pressing the dough together with a fork. 
With each subsequent tablespoon of water, as it begins to bind, mix it together with your hands. Continue adding water until the dough can be formed into a ball. . Wipe the butter wrapper around on the inside of the pan to grease it. . Place the ball of dough on a well floured surface. With a floured rolling pin, spread out the dough into a large circle. . Transfer the dough to the pan, and press it into the bottom.Trim away any dough that extends past the edge of the pan. . For best results, lightly tap the dough on the bottom of the pan with a fork. . Cover the dough with plastic wrap and leave the pie crust in the refrigerator for at least 2 hours.. Repeating the process you have just followed to make the bottom pie crust, make another one for the top. Stick this in the refrigerator as well when you are done.. Peel and slice the apples. Place the slices in the pie crust. . Mix together all of the filling ingredients with the apples. . Place the dough topping over the top of the pie, and trim it to the edge of the pan. . Cut slits into the dough topping with a sharp knife so that the pie can breathe in the oven.With a pastry brush, coat the top of the dough with milk. This will give it a nice golden crust.. Bake the pie at 425 degrees for about fifty minutes with the top covered in aluminum foil. Remove the foil and continue baking until the top turns a nice golden brown.Once it is the right color, remove it and leave it on the stove top to cool. . After about an hour (or so), your pie should be cool enough to serve and eat. You made a pie. 
Hooray!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 3, 0]\nD: [0, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_97_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_97_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_97_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_97_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 1, 0, 3]\nD: [1, 2, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Here's the complete shopping list: \u00a0\n\t\t4 large eggs\n\t\t1 cup half and half\n\t\t1/2 cup milk\n\t\t8 oz. Mexican Chorizo sausage\n\t\t8 Jalapeno Peppers- medium\n\t\t1/4 cup Red Peppers- any variety\n\t\t6 oz Chihuahua cheese- (may substitute Monterey Jack)\n\t\t8 oz Colby-Jack Cheese\n\t\t1/2 plus 1/3 cup flour\n\t\t1/2 cup yellow onion- chopped\n\t\t3 Tablespoons Butter\n\t\tGarlic Salt\n\t\tBlack Pepper\n\t\tTabasco\u00a0\n\t\tGround Cumin\nNot to worry... I'll break this list of ingredients down in each Step as needed and used. ;-) \u00a0Important Tools: \n\t\tThis pie is made with a\u00a0savory\u00a0Dutch Baby \"pie crust\". \u00a0You'll need\u00a0to use a #5\u00a0cast iron skillet -OR-\u00a0a capacity-equivalent\u00a0ironstone baking dish for peak results.\n\t\tAn electric hand mixer. \u00a0\n\t\tlatex gloves to handle the jalapeno peppers.\n\t\tPotholders that withstand extreme heat.. Great news! \u00a0This prep work can be done up to 2-3 days in advance. \u00a0\nThe only thing you'll need to remember is to remove the foods from the refrigerator at least\u00a0a 1/2 hour before bake time so they'll be at room temperature. \u00a0No worries, though... 
I'll remind you. ;-) \u00a0\nLet's get started.Begin with the jalapenos:\nChoose jalapenos that will fit comfortably around the inside of your skillet.\nRoast and sweat the jalapenos before cleaning them. \u00a0If you're unfamiliar with the process, here's an easy, how-to technique to get you started. \u00a0Don't be tempted to forego roasting and sweating the peppers.\u00a0They need\u00a0to be par-cooked before going into the quiche.\nPut on the latex gloves. \u00a0Cut a small pie-shaped wedge out of each jalapeno. \u00a0Dice the wedges, put in a bowl and set aside. \u00a0\nUse your gloved \u00a0thumb (or a grapefruit spoon) to scrape out the seeds and membrane. \u00a0Discard them.\nCut off a 1/2' thick slice from the block of chihuahua cheese. \u00a0Trim it into a wedge that will fit snugly inside the hollow jalapeno.\nRepeat until all of the jalapenos are stuffed. \u00a0Cover and refrigerate if you aren't using right away. \u00a0\nSet the leftover cheese aside to grate later.\nThinly slice approximately 1/4 cup red peppers. \u00a0Put them in the bowl with the diced jalapenos. \u00a0Cover and refrigerate if you aren't using right away.Pre-cook the Chorizo:\nBefore cooking, remove the chorizo from it's casing. \u00a0Put it in a medium skillet over medium high heat. Add 1/4 cup of water. Bring to a boil, reduce heat and simmer/stir for 7-8 minutes or until the water has evaporated. \u00a0Cover and refrigerate if you aren't using right away.Saute 1/2 cup chopped yellow onion\u00a0in 1 tablespoon of butter until just limp. Sprinkle with garlic salt. Cover and refrigerate if you aren't using right away.Grate the extra chihuahua cheese pieces and the colby jack cheese. \u00a0You'll need about 1 1/2 cups of cheese for the quiche filling and 1/2 cup for garnish. \u00a0Hint: You can find chihuahua cheese in any Mexican Market. \u00a0Save the grated chihuahua for the garnish. 
It melts beautifully.Work Station Management:\u00a0\nJalapeno Popper Pie is a bing-bang-boom-BREEZE\u00a0to prepare if you're organized. \u00a0\nMeasure and set out all of the ingredients (except the grated cheese and the butter) 1/2 hour ahead of time.\nFor the Dutch Baby crust to rise successfully, it's imperative that the skillet is blistering hot and the eggs/milk are at warm room temperature. \u00a0Take them out of the frig at least an hour\u00a0before baking. \u00a0I'll reiterate that one more time, \u00a0just to make sure we understand each other. ;-). Place skillet inside the oven and preheat to\u00a0475\u00b0 for 10 minutes.Crust Ingredients:\n\t\t2 Eggs-\u00a0Room\u00a0temperature\n\t\t1/2 cup Milk- Room\u00a0temperature\n\t\t1/2 cup flour- sifted\n\t\t1/2\u00a0teaspoon Garlic\u00a0Salt\n\t\t1 generous pinch\u00a0ground Cumin\n\t\t4-6\u00a0dashes of Tabasco Sauce (Great flavor enhancer with minimal heat)\n\t\t2 Tablespoons COLD ButterPreparation:\u00a0\nIn a medium bowl, beat eggs with an electric mixer until thoroughly combined. \u00a0\nAdd milk, garlic salt, cumin and Tabasco sauce. \u00a0Mix well. \u00a0\nGradually whisk in flour until smooth.\u00a0\nRemove skillet from oven. \u00a0Add the cold butter to the skillet. \u00a0Use a fork to move it around so the bottom and sides of the skillet are well coated. \u00a0\nPour all the batter into the center of the skillet, then return it to the oven immediately.\nBake until puffed and browned, about \u00a010-12 minutes.\u00a0\nWhile the crust is baking, prepare the filling so it can be immediately added when the dutch baby crust is finished.\n\u00a0. \n \u00a0Egg filling ingredients:\u00a0\n\t\t2 large eggs\n\t\t1 cup half and half\n\t\t1/3 cup flour\n\t\t1/2 teaspoon Garlic seasoning saltAdditional ingredients- prepped in advance:\n\t\t1/2 cup sauteed yellow onion\n\t\t8 oz. 
Chorizo- pre-cooked\u00a0\n\t\t8 Jalapeno peppers-\u00a0stuffed with Chihuahua cheese\n\t\t1/4 cup red chili pepper- \u00a0thinly sliced\n\t\t2 cups Colby-Jack cheese- grated\n\t\tChihuahua cheese- grated for garnishPreparing the Quiche filling: \u00a0\nPut the eggs in a medium bowl. \u00a0Using the electric mixer, beat the eggs then add the half and half and garlic salt. \u00a0Beat on high speed until thoroughly mixed. \u00a0Slowly add the flour and beat until smooth.\nRemove the Dutch Baby from the oven.\nLayer it with onions, 3/4 cup colby-jack cheese and half of the diced peppers.\nUsing a measuring cup, pour half of the egg filling on top. Layer with 1/2 of the chorizo.\nArrange the stuffed jalapenos around the skillet. \u00a0Top with another 3/4 cup grated cheese. \u00a0Add the remaining chorizo and diced peppers.\nPour the remaining egg mixture around the jalapenos. Scatter the sliced red pepper on top.\nMove the skillet very carefully back into the oven.\nReduce the temperature to 350\u00b0. \u00a0Bake for 30 minutes, uncovered.\nCover loosely with aluminum foil, bake another 30 minutes.\nRemove from the oven, leave covered and allow the quiche to rest for 10 minutes.. 
Sprinkle the quiche with the extra cheese.\nServe the slices with sides of salsa, sour cream and warm flour tortillas .\nEnjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 1, 0, 3]\nD: [1, 2, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_98_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_98_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_98_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_98_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [2, 1, 0, 3]\nD: [1, 0, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need the following - 1 box of your favorite pasta - 1 can of Campbell's\u00a0Cheddar Cheese - 1 can of\u00a0Campbell's\u00a0Cream of Mushroom Soup - Breadcrumbs - 4 Tbsp salt - 18 oz. of extra sharp cheddar chese - Milk - Cutting Board - Knife - Large pot - Colander\u00a0 - Large Casserole Dish with lid - Wooden Spoon Preheat you oven to 375. - Fill pot with water about 3/4 of the way full - Set on stove to boil on high. - Add 4 Tbsp of salt to water - Stir until dissolved. - Cut 18 oz. of the sharp cheddar cheese into 1inch cubes. - Add the pasta to the boiling water - Follow the directions on the pasta box tho see how long to biol the pasta. - Cook to Al Dente. Pour cooked pasta and water into a colander in your sink to drain.. Add the cream of mushroom and the cream of cheddar to the pot. (If you are not a big fan of mushrooms then use two cans of the cheddar). Fill your now empty cans with milk. Pour your two cans of milk into the pot and set to medium-low heat.. - Add the cubed cheddar cheese to the pot - Melt, stirring frequently. 
- Add the pasta back to the pot. - Stir until well coated. Pour Macaroni and Cheese into the casserole dish. Sprinkle breadcrumbs on top and place in oven for 30-45 minutes or until top is golden brown and bubbling.. Carefully take out of the oven and enjoy.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [2, 1, 0, 3]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_99_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_99_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_99_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_99_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [2, 0, 3, 1]\nD: [0, 2, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. As a base for the fillings, you can use wraps, tortillas or lavash bread. Trader Joe's has a good soft lavash bread. You may run into a cracker-type that needs to be moistened with water, but I wouldn't mess with that on your first attempt. I used wraps in these examples, partly because they come in a variety of colors & flavors, and they work well for the heart shape. Fold the circular wrap or tortilla in half, making a crease that will be the bottom of the heart.For fillings, choose one or two spreads, plus at least 6 other items from the list below, or use what you have! It's good to have everything prepared, open and ready to go before you starting building your lavash.Ingredients:Wraps, tortillas, or lavash bread Spreads: hummus, cream cheese, goat cheese, pesto, refried beans, creme fraiche, sour cream, etc. 
Sliced cooked meats, and sliced or shredded cheese Shredded vegetables: carrot, cabbage, jicama, radish Fresh spinach or other greens Sugar snap peas (whole) Avocado Thinly sliced tomato (drain on paper towel), bell pepper, green onion, red onion, cucumber Grilled eggplant (thinly sliced) Olives, pickles (gherkins or cornichons are good), capers Condiments: salsa, salt & pepper, fresh basil, parsley, cilantro, chives, garlicEquipment:Knives and spatulas for dipping and spreading Grater or food processor Plastic wrap Sharp knife for slicing. You can build as many as will fit your workspace, but two at a time works for me.Use one or more spreads for each. You don't need a lot, but you do need to spread it all the way to the edge. The spreads have the advantage of being the \"glue\" that holds the whole thing together, as well as adding flavor and moisture.Examples: Hummus and cream cheese, pesto and creme fraiche, refried beans and sour cream.. Next add your toppings in rows. Choose compatible flavors, and contrasting colors and textures. You want it to be full, but still be able to roll it up. Keep in mind that when you slice the lavash, you will be seeing everything in cross section, so align spreads (if multiple) and rows of topping with the crease, not perpendicular to it.Some possible combos:Hummus, cream cheese, thinly sliced red pepper, spinach, thinly sliced tomato, shredded carrot, green onions, black olives Refried beans (may need to thin with water for easier spreading), sour cream, shredded pepper Jack cheese, salsa, cilantro, thinly sliced red onion, thinly sliced tomato, avocado Pesto sauce, creme fraiche, sliced ham, thinly sliced tomato, spinach, sugar snap peas, thinly sliced provolone cheese, thinly sliced yellow bell pepper.Salt & pepper or other condiments to taste!. For heart-shaped lavash, you roll each side only half way, meeting near the middle. 
Fold gently along the crease.For a traditional lavash, you start at one end and roll it all the way. A little spread on the end flap helps hold it together.. Wrap each lavash snugly in plastic. Once you have done that you may need to slightly flatten or pinch the creased side again to create a heart shape.Chill for at least an hour before slicing; longer if possible.. When you're ready to serve, unwrap and cut the lavash into slices 1.25\u20131.5\" thick. Use a sharp knife or serrated knife for best results. You can gently reshape the pieces for the best look if needed. Arrange on a platter in a single layer or stack in several layers.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [2, 0, 3, 1]\nD: [0, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_100_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_100_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_100_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_100_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [1, 0, 2, 3]\nD: [0, 3, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This tree stump is going to be a great place for one of our cute little birds to perch. It's also going to add height to the cake making it appear larger and more complicated than it is! You will need:light brown fondantdark brown fondanttoothpickSCULPTING THE TREE STUMP 1. Cut a 1 1/4 ounce piece each of dark brown fondant and light brown fondant and place on top of each other.2. In a twisting or pulling motion mix the two colors of brown fondant together to marble them slightly. You can see I only have one distinct vein. 
More would be great!3. Roughly form the fondant into an egg shape and flatten the top by pressing with your fingers.4. Squeeze the sides of the stump to give the stump ridges and to elongate it.5. Press or pinch the bottom of the stump with one finger in random places to give the effect of roots.6. Pinch the top edges to sharpen the 'cut' edge of the stump.7. Using a toothpick, score or scrape lines in the sides of the stump to resemble bark and circles on the top of the stump to create rings.. We are going to stain toothpicks with food coloring to make stick legs for our birds!You will need: four toothpickspaper towel or napkinbrown food coloringMAKING THE LEGS1. Dip the toothpicks into the food coloring.2. Remove and place on the paper towel without rubbing off the excess food coloring.3. After about 10 minutes wipe off excess food coloring with the paper towel. The toothpicks should be stained a medium brown color.4. Set aside to dry.. Oftentimes the simplest of shapes make the cutest characters. Sometimes leaving it simple is best! These birds are definitely simple! You will need: light pink fondant - 1 ouncedark pink fondant - 1 ounceorange - pea sized amountbrown - teeny, tiny piece!turquoise - enough for two very small eyesclear vanilla extractfood safe paint brushrolling pinrolling matMAKING THE BIRDS1. Remove a small piece (large pea size) of light pink fondant and set aside for the wings.2. Between two hands, roll the larger piece of fondant into a ball.3. Place on your rolling mat and with your hand roll the piece back and forth with gentle pressure until you start to get a cone shape (without a point).4. Using your fingers, press or pinch the smaller end flat to create the tail and curve up by pressing gently with your fingers.5. Divide the remaining piece of fondant in half and roll one half into a ball. Press to flatten slightly.6. Slightly pinch one side, and curve the pinched/pointed side up slightly. You should have a paisley shape wing. 
Repeat for the other wing.7. Pour a small amount of clear vanilla extract or vodka into the cap or a small bowl and apply a very small amount of extract (using a food safe paint brush) where you want the wing to be located on the bird body. Gently press the wing onto the body. It should stick immediately! Repeat on the other side of the bird. (Extract will leave a shiny or glossy appearance on fondant so use sparingly and try to be as exact with it as possible.)8. Divide the orange fondant in half and set one half aside reserving for the next bird and roll into a ball and form into a cone with your fingertips. This is the beak.10. Apply a dab of extract and adhere the beak to the front or head of the bird.11. Roll a very small amount of turquoise fondant into a ball and flatten. This will be the iris. Apply to bird with a small amount of extract.12. Roll an even smaller amount of brown fondant into a ball and gently press it into the turquoise fondant eye creating a pupil. Since it's so small of a piece it should stick without fondant, but if it doesn't, secure it with extract. Repeat steps 11 and 12 to make another eye for the back side of the bird.13. Repeat the entire process to make one more bird in your choice of color!. There are tons of ways to make fondant roses. I will cover three uncomplicated ways to make cute roses to top our cake with!You will need:light yellow fondant (1 ounce)dark yellow fondant (1 ounce)small round cutters - 1 inch and 1 1/4 inch (you can also use a shot glass, biscuit cutter, or make a circle pattern from thick paper and trace around it with a knife)paring kniferolling matrolling pinpetal foam (or any craft foam)ball head sculpting tool (you can also use a rounded end drink stirrer, a melon baller or a dry finger)FLOWER #11. Roll out a small piece of light yellow fondant and cut out six small circles.2. Using your finger, press the edges of each circle to thin them.3. 
Line the circles up and gently press them together at each overlap to secure.4. Flip the line of circles over so the wrong side is facing up.5. Roll the circles up from one end to the other.6. Cut the roll in half to create two roses.7. Pinch the bottom of the roses to open them slightly.8. Peel back a few of the petals to make the rose look more natural. Set aside to harden. Repeat to make more roses.FLOWER #21. Roll out a long piece of light yellow fondant and cut a 1/2 inch by 6 inch strip.2. Remove excess fondant and set aside.3. Press ridges in one side of the fondant with your finger. Continue along the entire edge of the strip.4. Pick the fondant up and roll the strip until you reach the desired size. If 6 inches is too long or makes it too thick of a flower you can cut it short anywhere you would like.5. Pinch or squeeze the bottom of the rose to make a stem and open it slightly.6. Cut off the stem so the flower will sit upright on the top of the cake. Set aside to harden. Repeat to make more roses.FLOWER #31. Roll out a piece of dark yellow fondant.2. Cut ten to fourteen, 1 1/4 inch circles and remove excess fondant.3. Roll a small sized piece of fondant into a ball. About the size of a large marble.4. Roll one end making it into a cone shape. The same way we made the bird! This will be the center of the rose. Set aside.5. Place one circle on the petal foam and in a back and forth motion with the ball head tool flatten and ruffle the edges. Repeat with remaining circles.6. Wrap one circle around the center piece of the rose.7. Repeat with remaining circles placing them randomly so the seams don't match up. You can make the rose as small or as large as you want. Fold the petals back slightly to give the flower an open look.8. Pinch the bottom edge when finished to create a stem and cut the bottom so the rose will sit flat. Repeat the process to make more roses. . 
If you haven't already done so, mix the colors of fondant you wish to use in the pattern of the cake. I used eight different colors (nine shown) but more or less would be fine.This is where you get to be creative and think about repeating patterns. What shapes and sizes do you want for your cake? I chose a superellipse shape and used all the same size but you could experiment with various sizes of the same shape for a unique effect. Examples of other shapes that would make fun repeating patterns are: rhombus, stars, heart, circle, octagon, crescent, parallelogram, etc.You can also\n find a lot of cutter options in the clay modeling section of the craft \nstore. Let's get started!You will need:geometric cutter (I got my superellipse from a small set of inexpensive cutters from the Duff Goldman collection that I found at Micheal's.)rolling pinrolling matparchment or waxed paper lined sheet pan (You can also just use your countertop or a table!)MAKING THE PATTERN1. Roll out one color of fondant very thin on your rolling mat using a rolling pin. When I say 'thin' it should basically be as thin as possible without tearing.2. Using the wrong side of the cutter cut out 15-20 pieces or shapes. If you use the wrong side (folded metal side) of the cutter you will get an exact shape with sharp corners. If you use the correct or sharp side of the cutter you will get rounded edges. This drives me nuts, but some people don't mind at all!!! You can see the difference in the picture (left cut-out = wrong side of cutter; right cut-out = right side of cutter). If you are using a cutter with rounded shapes like a cloud or a flower it won't make much of a difference which side you use.3. Smooth the sides if any rough edges are present by folding or gently squeezing the frays to the back side of the shape. 4. Transfer pieces to a baking sheet lined with parchment (or waxed) paper. Make sure they are not touching so they don't stick together! 
If you don't have a baking sheet you can place the parchment directly on your table or countertop in a place that won't be disturbed.5. Repeat with remaining colors!. Here we will follow simple steps to apply a repeating pattern to our fondant cake.You will need:fondant covered cakeparing knifefondant shape cut-outsclear extractsugar pearlscake stand or base1. Cut one of the fondant pieces in half from point to point using a sharp knife. Do this with one of each of the remaining colors equaling 18 halved pieces total. You may need more later but this is a good starting point for now.2. Apply a small dab of clear vanilla extract to the back of the fondant piece and place the top flat edge in line with the top edge of the cake.3. In a random pattern of colors apply pieces straight down in the same manner cutting the bottom most piece as necessary. I only needed to cut a tiny little piece off to make it fit at the bottom edge of the cake.4. Continue around the entire cake trying to keep the pieces straight and the colors in a random pattern.5. Once you have finished applying the pattern press a sugar pearl into each corner where the pieces meet. If the pearls don't stick by pressing into the fondant you can apply a small amount of extract to get them to stick. Sugar pearls will acquire a strange thick gooey film where they come into contact with extract so use only if necessary!!We are ready to decorate and complete the cake!. Arrange flowers, birds, and tree stump in any manner you wish! I ended up using two birds, four roses, and six small flower cut-outs with pearl centers (same flower technique as from the drip cake) on the top of the cake. Here's an idea of how to arrange your fondant decorations:1. Decide which side you want to be the front of the cake.2. Place the tree stump toward the back of the cake just off center to the right and place a large rose on the edge of the cake near the front right. 
Add some small roses and flowers around in groupings or any arrangement you like. Secure with extract.3. Push toothpick legs into the birds. I left off the back wings so the bird could harden slightly when I was working on other things.4. Stick one bird so it is standing on the stump and one toward the front left. Adhere the back wings on each bird with extract.Congratulations!! You should be very proud of yourself! You have completed a beautiful decorated cake that is ready to serve! If you don't plan on serving the cake right away, leave it out at room temperature for up to 3 days until you want to slice it and serve it. Do not refrigerate!If this cake is for a birthday party and you choose to put candles in it they will easily poke through the fondant and stand securely.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [1, 0, 2, 3]\nD: [0, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_101_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_101_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_101_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_101_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 3, 2]\nD: [0, 3, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 2 oz. Tequila - Use a good silver tequila1 1/2 oz. Orange Liqueur (i,e. Cointreau or Grand Marnier)2 oz. Margarita mix (opt. Use for a sweeter finish, otherwise omit)1 oz. Fresh Lime Juice1 oz. Orange Juice1 splash SpriteIceJalapeno stuffed olives!. Into a cocktail shaker, measure tequila and orange liqueur.. 
Roll lime firmly under your palm, or microwave it for 30 seconds to release more juice.Squeeze lime into shaker.. Add orange juice and ice, and shake.Top off shaker with a splash of Sprite (or other lemon-lime soda).. Pour into a chilled martini glass and garnish with three olives.Serve with shaker.Enjoy responsibly!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 3, 2]\nD: [0, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_102_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_102_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_102_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_102_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [1, 3, 0, 2]\nD: [1, 0, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
\nrectange cake board\ncircular cake boards\nspatula\ncake mix or your favorite scratch recipe (see my\u00a0Old fashioned sour cream fudge recipe\u00a0below)\nheart shaped cake\u00a0pan\ncake release\nrolling pin\nsaran wrap\nred or pink pearl dust\nclean (new) make- up brush\npliers\ndowel\nscissors\nsharp knife\nblack marker\nroller cutter (optional)\nred gel paste food coloring (if using white fondant)\nfondant ( you can use white and color or purchase red fondant)\ncandy cups\nchocolates (at least\u00a024\u00a0 )\nfood wrap and tin foil to cover\u00a0cake board\u00a0(optional)\nyour favorite buttercream icing (see my favorite below)\ngumtex or tylose or use\u00a0gumpaste insteadOld Fashioned Sour cream fudge cake\u00a0\u00a0Ingredients:AmountIngredient2 \u00bc cupscake and pastry flour2 tsp.Baking soda\u00bd cupbutter, softened2 \u00bc cupsfirmly packed brown sugar1/2 tspsalt3eggs1 1/2 tspvanilla1 cupboiling water3 ouncesbakers unsweetened chocolate (melted01 cupsour cream (cooled)\u00a0 \u00a0 Directions:Sift together flour, baking soda and salt; set aside. Cream butter. If you use salted butter (skip the salt). Gradually add brown sugar and continue beating for 5 minutes. Add eggs one at a time, beating well after each addition. Add vanilla and chocolate. Alternately blend in flour mixture and sour cream, one third at a time, on low speed of electric mixer. Add boiling water; blend well. (Batter will be thin.) Pour into one greased and floured, waxed paper lined 9 \u00bd inches layer pan. Bake at 350 degrees for 35 to 40 minutes, or until cake tester inserted into center comes out clean. Cool in pans for 10 minutes. Remove and finish cooling on racks.Optional Filling: Kirsh Cream with Strawberries\n\t250 ml. Heavy cream 250 g. chopped strawberries (about 1 \u00bd cups)\n\t1 to 1 \u00bd tbsp. Kirsh cream or any other\n\tfruit liquer.\n\tBeat cream until whipped. Fold in strawberries and liquer and fill cake. 
\n\t\u00a0\u00a0Frosting: 5 squares Unsweetened Chocolate \u00bd cup butter, softened 1/3 cup water 3 cups icing sugar 1 egg Melt chocolate with butter and water over low heat; cool. (Mixture may appear curdled.) Add icing sugar and egg. Blend; then beat on low speed of electric mixer for 2 minutes. Chill until of spreading consistency.\u00a0Alternative Frosting (Bittersweet Chocolate Frosting): Amount is for a wedding cake therefore cut in half. 1 lb. Bittersweet chocolate, chopped \u00be cup heavy cream 3 tbsp. Unsalted butter In medium saucepan, boil water. In medium steel bowl combine approximately 2/3 of the chocolate and cream. Place bowl over saucepan and sir frequently until melted and smooth. Remove from heat and stir in remaining chocolate until smooth. Gradually beat in butter, 1 tablespoon at a time. Let stand until cooled to room temperature. \u00a0Bittersweet Chocolate Whipped Cream Buttercream IcingIngredientsPart One 1 lb. powdered sugar (sifted) 2 1/2 cups Crisco, 4 oz melted bittersweet chocolatePart Two 3/4 cup granulated sugar 1/2 tsp. salt 2 TBSP. Meringue powder (add 1 additional TBSP for slight crusting) 1/2 cup BOILING water (less 2 TBSP) 1 TBSP Vanilla (or flavor of your choice)InstructionsPart one... put crisco in bowl and gradually add powdered sugar. Beat about 5 minutes until mixture is very creamy and fluffy then add melted chocolate squares.\u00a0Set this aside.Part two... In a very clean bowl mix dry ingredients. Add BOILING water and immediately mix on high speed. Beat until stiff peaks form, about 8 minutes. When mixture begins to get stiff add flavoring.NOW combine both mixtures\u00a0and beat together for another 8 minutes. When finished, use a rubber spatula to down beat a little to remove some of the air bubbles. Frosting will be very light and creamy. Cover. DO NOT REFRIGERATE.The frosting may be kept at room temperature for 3 months. Whip with a spoon each time you use it to restore fluffiness.. 
Optional: Line the cake board with tin foil and food safe plastic wrap (this is not necessary but makes it easier to wipe messes off the board) I usually use neutral gold or silver gift wrap I purchase at Michael's , but I had run out. . Bake 2 heart shaped cakes. I always use a generous amount of Cake Release to prevent the cake from sticking. Level the cake, but cut \u00a0the one for the top (lid) of the cake a little shorter\u00a0than the bottom and place it on a circular cake board. Put it aside.\nPut the bottom cake on the main rectangular cake board. Fill the bottom cake \u00a0with filling of your choice (this is optional). Ice the cake, being sure to fill in the area where the cake was cut to fill, if you filled it. This doesn't have to be a thick layer covering everything, only a crumb coat. If a few crumbs mix in, it's not a big deal. Smooth as best as you can.\nRepeat for the top of the cake. It is important to get the top of the cake very smooth, as you will be placing fondant on top of it.\nTip: Take a metal spatula, soak it in boiling water (I use a pot on the stove)\u00a0 and wipe the\u00a0water off on a clean tea towel, then\u00a0smooth the icing with the dry hot spatula over the surface of the cake.\u00a0\u00a0Then remove excess icing off spatula. \u00a0Keep repeating until your cake is smooth. Add\u00a0about 2 Tblsp of \u00a0tylose or gumtex to\u00a0your\u00a0fondant, roll out and cut into a long strip. Alternatively, you can use gumpaste that can be purchase at Michael's craft store or any cake decorating store. But you will still have to color it.\nMake sure the strip is wide enough to go about a 3/4 of an inch above the cake (measure with chocolate on top) and let\u00a0the strip\u00a0dry for about 15 minutes. It needs to be dry enough so it won't sag or droop.\nCarefully\u00a0place the\u00a0strip (you will\u00a0likely need 2) \u00a0around the cake,\u00a0and close the seam at the back with a little water.\u00a0. 
If you have a\u00a0sugarcraft gun,\u00a0 then use the rope attachment to make the rope border.\nIf you do not, then roll out 3 narrow strips with the flat of your\u00a0hands and twist the pieces\u00a0\u00a0together. Don't worry if it doesn't go all the way around. You can do it in pieces and use a little water to 'glue' it together - it won't be noticeable.\nThen 'glue' the strips on the cake with a\u00a0 little water. Do a little strip of rope\u00a0for the seam at the back. And you will also do this for the top of the cake when the time comes. . Fill the surface of the cake with chocolates in candy cups (you can buy at Michaels;) . You will need at least a 24 chocolates. . Outline the circular cakeboard and cut to fit under the top\u00a0cake. You will need this to support the cake. . Roll out colored fondant (1/8\" thick ) and cover the top of the cake. I usually just guage how much I need by looking at it. But you can tell approximately how much you'll need be measuring the cake across and adding a couple inches all around. You can cut off the excess and reuse. If you have no idea how to smooth fondant on a cake, google it - there are lots of tutorials. Some prefer a smoother, but I use my hands (wedding ring off!)\nPlace on cake, smooth and trim.\nTip #1: Stick\u00a0the top of the cake\u00a0in the freezer for 10 minutes while you roll out your fondant - this makes it easier to cover with fondant. Don't leave it longer than 10 minutes!\nTip#2: To transfer the fondant, I roll it up a little with my rolling pin and gently unroll over the cake. . \nRoll out the remaining \u00a0fondant with gumtex or tylose (or gumpaste) \u00a0as thinly as possible (as least half as thin\u00a0as you rolled it to cover the cake)\n\u00a0Cut two lengths of the\u00a0fondant (or gumpaste) \u00a0the same length and width. These will form the loops. I generally cut mine around 7.5 cm/3 inches wide and about 15 cm/6 inches long. 
The length of these loops will determine the size of you bow, so If you want a bow about 10cms/4 inches long the loops will need to be a little more than double that length when you cut them. Its a little bit of trial and error, but the length can be adjusted after they' ve been cut quite easily.\nTurn one loop piece over and pinch the ends together, then do the same with the other end, and pinch the two ends together. Put some saran wrap in the bow pieces to set it in place.\nRoll out the tails of the bow in the same manner as the loops but make them a little thinner, maybe \u00be of the width of the loop pieces. Cut the ends at a 45 degree angle. Pinch them as you did the loop piece.\nMake the centre knot with another piece of fondant, rolled and marked in the same manner as the other pieces, but only make it about \u00bd the length of the tail pieces. The knot is just a short strip (maybe 1' by 1\") and it is just wrapped around all the other scrunched up ends so that there arent any rough edges showing. It doesnt need to go all the way around the back of the bow, just tuck the edges under so they dont show.\n\"Glue\" the pieces together with a little water on a paint brush\nCut a long, narrow strip and put directly on the cake\nDry brush on the red or pink pearl dust (I use a never used new make-up brush).\nThen place the bow on the cake on top of the narrow strip.. Take a wooden dowel (I use the wooden ones you can buy in art section at Michaels and boil it) , cut to size with pliers and sharpen with pencil sharpener. It should stick out about 1- 1 1/2 \"\u00a0above chocolates. Carefully place the top of the cake onto the sharpened dowel. You may need to poke a little hole in it from behind first (through the back and into the cake board.) 
You want it resting just above the rim of the bottom cake, so it doesn't put weight on the rim and wreck it.\nServe and enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [1, 3, 0, 2]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_103_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_103_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_103_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_103_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 1, 3, 2]\nD: [0, 2, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Take large bowl and mix Suji, 1/2 tea spoon salt, baking powder. Take enough warm water to knead the flour into a hard lump. Then add basan and maida and knead it again.Then let it sit for 20 minutes in a thin wet cotton cloth.Then make small balls of all the kneaded flour as in the picture.. Now press the balls created into flat circular cakes. Make sure that the flat cakes are thin but not too thin. Keep the pressed balls on wet cotton cloth so that the flour does not dry out.(You may want to experiment some thicknesses in the next step).. Preheat vegetable oil on high.Take a sieve and a large bowl - to collect the extra oil from the deep fried gol gappasTake a plate and cover with paper towel to further collect the extra oil on the gol gappas.. Now take the pressed flat cakes and put into the hot oil one by one. Make sure that the oil is hot enough that the cakes become fluffy and round right away.Once golden brown turn all the gappas on the other side. 
Make sure the heat is now on low to cook the gappas well and not burn them.Take the gappas out once deep golden brown and put them on the sieve using a strainer. Be careful the oil is very hot!. Boil 3 potatoes and peel them.Then cut into small pieces and add roasted cumin seeds along with cut cilantro as in the picture. Mix it a bit but not too much (don't make it a paste).. Mix in a mixer:Mint, cilantro, mango, lemon, cumin seeds(roasted), salt, tamarind paste, black salt, sugar, waterTaste when fully mixed. You should be able to taste it as a tangy (and sweet) mix. Adjust the ingredients to suit your taste.For taste you can add boondi as well.. Now take a gol gappa and make a hole on top (on the softer side).Add a little potato mix created into the hole and add gol gappa water into it as well. If the gol gappas are made right they will not leak!Enjoy the full gol gappa in one bite. Eat as many as you have or like...\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 1, 3, 2]\nD: [0, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_104_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_104_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_104_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_104_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [1, 0, 3, 2]\nD: [2, 0, 1, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
For the crust:\n2.8 oz cold butter\n3.75 oz flour (pastry flour, if you have it)\n1/3 tsp salt\n1/2 tsp sugar\n1/2 tsp vanilla\nSome ice water (more on that later)\nFor the filling:\n3 eggs\n6 oz sugar\n1 tsp vanilla\n2 bananas, pureed\n1/2 cup milk\nPinch salt\nPinch cinnamon\nPinch nutmeg\nFor the meringue:\n6 egg whites\n10 oz sugar. First, mix your dry ingredients (not counting butter) until everything's well dispersed. A note before we get into things: From here on out, the colder you can keep things, the better. It's perfectly okay, and preferable if you have the time and patience, to chill your ingredients and utensils before every step. And definitely let the dough rest between steps, in the fridge or freezer. Back to the directions; Next, using a pastry cutter or two knives, blend the butter into the dry mixture until your butter is about the size of rice. Then slowly add your vanilla and then your ice water, until the dough comes together. It shouldn't be wet, however, so just take it slow. Once that happens, let the dough rest a bit, then give it just a few kneads on a lightly floured surface, just so that it doesn't break too easily. Then chill it, roll it out to about 1/4 inch, and place it in your pie pan, an 8 inch one is preferable. Be careful not to break or stretch it. Trim the edges about 1/2 inch over the edge of the pan, and then roll that excess under and crimp however you like. Freeze it until it gets cold.. This filling is actually a modified pumpkin pie filling (a type of custard pie), just made with pureed bananas instead of pumpkin puree.\nFirst, mix everything except the eggs in a food processor until well blended. Then add the eggs, pulsing the processor until they're incorporated, but not over mixing. We don't want any air to get into the eggs, if possible. Next, chill this mixture. Pour it into your pie dough, egg wash the edges, and bake it at 350 for 20-40 minutes, until the center of the pie sets. 
Then cool the pie so that it doesn't melt the meringue.. Note: This is a method for a Swiss Meringue. You could also make an Italian Meringue if you had the know how and the inclination. I know a French Meringue (in which one just whips up sugar and egg whites) is easier, but this method is preferable if you want to avoid things like salmonella.\nBegin by heating the egg whites and sugar, whisking gently, in a double boiler. Heat this mixture until it reaches 100 degrees, whisking moderately all the while. When the mixture reaches 140 degrees, remove it from the heat and whisk (or put into a mixer and put on medium speed) until medium-stiff peaks for. Transfer this to a piping bag (or just spread it out with a spatula) ASAP and pipe on to the top of your cold pie. To finish it, use a torch to brown the tops. Then, most importantly, cut and enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [1, 0, 3, 2]\nD: [2, 0, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_105_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_105_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_105_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_105_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [2, 0, 3, 1]\nD: [1, 3, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Leftover mash1 eggFlour, a few tbspsbutterFor the purpose of this instructable I boiled and mashed my potatoes rather than using leftovers. I used 6 large potatoes, so, at a guess, I have enough for at least 20 potato cakes.. 
Peel, quarter and boil your potatoes until soft and mashable.Drain.Mash with a good knob of butter and leave to cool.If using cold, leftover mash then melt your butter first and then mash into the potato.Add 1/2 the egg and a good scoop of flour. Mix and mash until you have a soft, dough-like consistency.You may need more egg or flour depending on the type of potato and how much you have.For my vatful of spuds, I used the whole egg and about 4 tbsps of flour.. Use your hands to bring the potato together into a dough.Turn out onto a well floured board and knead a little, adding more flour as necessary.Pat out to a large circle about 1/2 inch thick. Or thicker if you like.Cut into rounds or triangles.I started off using a pastry cutter to make rounds but now find it easier to just pat out circles by hand.Make sure each cake has plenty of flour top and bottom.. Add a knob of butter to a heavy frying pan. (It just has to be butter I'm afraid!)Add a little oil to stop the butter from burning.Heat up the butter and carefully add the cakes.Don't cook too many at once otherwise they will become difficult to turn.Leave for about 5 minutes before turning.Thicker cakes will take longer, up to 8-10 per side.The butter in the pan will begin to darken so if you're cooking in batches you may need to wipe out the pan and use fresh butter. Enjoy for breakfast, lunch, dinner, whenever! 
Allow to cool before wrapping and freezing.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [2, 0, 3, 1]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_106_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_106_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_106_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_106_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [3, 2, 1, 0]\nD: [3, 1, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Marshmallow Fondant:2 cups mini marshmallows 3 cups powdered sugar 1 tbsp waterCake:1 box carrot cake mix 1 cup shredded carrotsFrosting:1/4 cup butter (softened) 2 cups powdered sugar 1/8 tsp vanilla extract 1/8 cup milk Plus: 1/2 cup shredded carrots to cover the sides of the cake.You will also need to print a rabbit template. Cut out the individual pieces of the rabbit's body and set the pieces aside for later. These will be used to shape cake and the fondant.. Marshmallow Fondant:2 cups mini marshmallows 3 cups powdered sugar 1 tbsp waterPut the marshmallows and water into a microwave safe bowl. Microwave 30 seconds. Remove from the microwave. The marshmallows should be very soft, fluffy and mostly melted. Stir with a spoon. Microwave again for 10 more seconds and mix until smooth. Then, fold in the sugar one cup at a time until it turns into a dough.The next step is kneading. To prevent the mixture from sticking to everything, sprinkle powdered sugar on your hands as well as on the surface you will be kneading on (you can also use butter or shortening to preventing sticking). 
Remove everything from the bowl and knead it with your hands for about 5 minutes until it is smooth.. Roll the fondant out into a thin sheet big enough to cover the area of the template. Lay the paper pieces on top of the fondant so you will know exactly where to cut. Use a knife to carve the pieces out.. Carrot Cake: 1 Box Carrot Cake Mix (plus whatever additional ingredients the mix calls for like eggs, water, oil, etc.) 1 cup shredded carrotsPrepare your mix by following the directions on the back of the box. Then, mix in 1 cup of freshly shredded carrots. Pour the batter into a greased 11\" x 7\" baking dish and bake as directed (Approximately 40 minutes at 325 degrees).. Once the cake has cooled, remove it from the pan so that you can level it. To level the cake, cut off the rounded top by slicing straight across with a large knife. Now that you have a level cake, you're ready make it look like a rabbit. Lay the paper template pieces on top of the cake, line your knife up with edges of the paper and cut straight down.. Frosting: 1/4 cup butter (softened)2 cups powdered sugar1/8 tsp vanilla extract1/8 cup milk In a bowl, use an electric mixer to cream the butter. Then, gradually add in the powdered sugar. It shouldn't look like much has happened because the mixture will still look like powder. Beat in the milk and vanilla extract- this will make a creamy frosting.. Cover the sides of each piece with frosting and roll the pieces in shredded carrots. Then, spread frosting across the top and attach the fondant pieces. 
Put the pieces of the rabbit together and you'll have a rabbit cake!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [3, 2, 1, 0]\nD: [3, 1, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_107_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_107_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_107_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_107_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [3, 1, 0, 2]\nD: [0, 2, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. IMPORTANT:The icing recipe below is only enough to have a very thin layer on each waffle, including the top. It is not enough to ice the sides of the cake, as you can see in the picture. So if you want to have more, and thicker icing, I definitely suggest you double the icing recipe. (That is, if youmake 4 large Belgian waffles as the instructable says). If not, you have a minimally-iced cake, like I did, which is fine.Ingredients;For the waffles: (makes 4-5 large Belgian waffles) 3 eggs 1 cup coconut milk \u00be cup warm coffee \u00bd cup chocolate chips, melted \u00bd cup vegetable oil 4 tablespoons sugar 1 \u00bd cups white whole wheat flour 1 cup all-purpose flour \u00bd cup cocoa powder 4 teaspoons baking powder \u00bc teaspoon salt \u00bc teaspoon vanilla For Frosting: 1 cup chocolate chips (= 8oz baking chocolate) \u2154 cup heavy cream 4 tablespoons butter 1 teaspoon vanilla extract \u00bc cup sugar 1 teaspoon corn syrup. Prepare a large Belgian waffle iron according to manufacturer\u2019s instructions.This is the one I used: Cuisinart Belgian Waffle Maker. 
In a separate medium bowl, combine flours, 4 tablespoons sugar, cocoa, baking powder, and salt.. Dump eggs, coconut milk, oil, and vanilla, into a large bowl. NOTE:I tried using a blender, which is why I have pictures of it; but it was a bad idea: the batter was too thick, and I ended up using a bowl instead. So don't be confused by the pictures ;). Dump the dry ingredients into the large bowl with wet ingredients, and mix until smooth and incorporated.. Heat the coffee up in a microwave-proof dish.Dump in 1/2 cup chocolate chips.Stir until melted and combined with coffee.Pour chocolate/coffee mixture into batter.Mix until combined.. Cook batter in waffle iron, according to instructions. (With my waffle iron I used the no. 3 setting)You should turn out with 4-5 large waffles. Optionally, you can use a small portion of left over batter: pour the small portion of leftover batter into one corner of the waffle maker. You can use this for decoration on the top of the cake later.. Heat heavy cream and butter in saucepan on medium heat until it just begins to boil. Make sure butter is melted. Return heat to low, and stir in corn syrup, and sugar, until sugar is dissolved. Add vanilla.. Pour hot mixture over chocolate chips in a medium metal bowl, and stir until chocolate is melted.. Place chocolate mixture in bowl in a ice bath, and stir until frosting hardens a bit. Take out of ice bath and whip with a beater until it is light and fluffy.. Don't ice the waffles until they are completely cooled.Place one waffle on a serving dish. Spread frosting thinly over it. Place the second waffle down over, and spread frosting over that. Do this with the rest of the waffles, spreading over the final part in a thin layer, not the sides.Here's where that extra optional waffle corner comes in: You can place it how you want to on the top of the cake on the icing. See picture. 
Place in fridge for 15 minutes before serving.NOTE:There is not much icing for the waffles, which is why you should only do thin layers, and not over the sides. But if you want to, you can double the icing, and make it look more like a standard cake.However, this look still looks good as you can see the sides of the waffles, and reduces sugar.. Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [3, 1, 0, 2]\nD: [0, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_108_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_108_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_108_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_108_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [0, 3, 2, 1]\nD: [1, 0, 3, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. When I originally received my seeds from a friend, he'd not germinated any. \u00a0My first go was a failure. \u00a0Out of 20 seeds planted, I had one germinate. \u00a0It produced one edible fruit (the rest rotted before obtaining a reasonable size). I rescued the seeds and planted again this year. \u00a0Again I put it in the same special watering system pot. \u00a0I fed regularly and this year was rewarded with a monster crop from one plant. The picture shows pepperdew (left) and twilight (right - quite hot!). \u00a0The twilight is very interesting - it grows purple and has no heat. \u00a0Later on as it starts to change it gets intensely hot!. 
I used: 1lb of pepperdew or sweet peppers 2/3 cup of sugar 1.5 cups of vinegar 1.5 cups of water Pinch of salt You'll also need; Ice slotted spoon Saucepans knife teaspoon Sterilised jar Chopping board\u00a0. I always give my raw peppers a wash in the sink. \u00a0They're grown organically, they're not sprayed, but they may be dusty. \u00a0Also removes surface level insects. I chopped the end of the peppers off with a knife, then used the end of the spoon to remove the seeds and innards. Keep the seeds - you can grow more peppers! Once hulled, you plunge into boiling water for two minutes. \u00a0Once done, use the slotted spoon to dunk them in ice cold water. \u00a0This stops the cooking and keeps the peppers crunchy. \u00a0It'll also kill any bacteria on the peppers so when you can/bottle them they won't spoil so quickly. Put them into the sterlised jar - if you've not sterlised them before, I put them in the oven at 160'C for about 10 minutes. \u00a0The lids I boil.. Add the water, sugar, salt and vinegar into a pan. \u00a0Bring to the boil and keep it there for two minutes. Add to the peppers making sure the last 1 cm is left clear. Make sure if you've not sliced the peppers that they don't have pockets of air in them. \u00a0I poked them with a sterilised spoon until bubbles stopped coming up. Add the lid and you're done.. Leave in the fridge for 24hrs before opening. They won't last long. \u00a0Well, theoretically they'll last 2-3 weeks. \u00a0 But they're so tasty expect them to be gone in a couple of days. Stuff with soft cheese, slice up and put on salads/pizzas etc. Don't forget that those seeds are a start of new life. \u00a0Leave them on a plate with a bit of tissue to dry over the next few days. 
\u00a0Next year, about April time, get them in some compost and see what happens!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [0, 3, 2, 1]\nD: [1, 0, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_109_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_109_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_109_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_109_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 0, 1, 2]\nD: [3, 2, 1, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1. You need 1 blow up balloon 2. 1 bag of Chocolate chips 3. 1 bag of White Chocolate chips 4. Bowl and Microwave 5. Any fruits like Strawberries,Oranges,Apples...etc slices. Now take out the milk chocolates out and put it in a bowl to microwave it until it melts or melt it in a stove pot than do the same thing to the white chocolate.. Spill the chocolate separate around the blow up balloon any pattern u want.Now let it dry into a freezer until freeze.Than take it out and pop the balloon and u get a chocolate bowl. 
Now put any fruits u want into the bowl and READY SET EAT!!!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 0, 1, 2]\nD: [3, 2, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_110_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_110_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_110_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_110_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [0, 3, 2, 1]\nD: [2, 0, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will notice, that some of the measurements are rather thematic and then some of those you are used to seeing in volume units are in mass units. The latter is due to my cooking practices - putting a dish on a scale and adding needed amounts of ingredients seems way more convenient than working by volume. In some of the places I've added the usual measurement units as well since my excessive use of \u03c0 may \u03c0ss some of you off. :)Tools:Blender or food processor Various bowls Cake tin or something else to bake in (rhyme wasn't intended) \u00d826cm or around 10\" in my caseKitchen scale (I prefer this to measuring spoons) Measuring spoons (maybe)Ingridients for crust:420\u03c0 cm\u00b3 popcorn (5.5 cups) 40g butter 190g brown sugarIngridients for chocolate cake filling:100g flour 130g sugar 35g cocoa powder 1\u03c0 cm\u00b3 baking powder (a bit shy of 1 tsp) Pinch of baking soda (can you pinch this?) Pinch of salt Some coffee or espresso powder(?) 
80g milk 35g vegetable oil 1 small egg Some vanilla extract/bean/sugar (a matter of taste) 25\u03c0 cm\u00b3 boiling water or strong coffee (1/3 cup)Ingridients for cranberry top:105\u03c0 cm\u00b3 not too sweet cranberry jam (approx 1.5 cup) 10\u03c0 cm\u00b3 gelatine (2 tbsp) A random amount of fresh or frozen cranberries for sournessA handy guide for pi day pies:1 cup (236 ml) = 75\u03c0 cm\u00b31 tbsp (15 ml) = 5\u03c0 cm\u00b31 tsp (5 ml) = 1.5\u03c0 cm\u00b31 ml = 1 cm\u00b3. This was made after a quick search online. The source was this, but I found that the amount of butter suggested is insane and used a lot less of it for better or for worse. Since I couldn't be bothered with making actual caramel for use with popcorn (although I would, if I had to make this again), I opted for simple sticky combination of brown sugar and butter with ratio of around 4 to 1 (sugar : butter).I made around 230 grams of this butter - sugar mixture in total, although more would've been even better. When you make this and mix with popcorn, go for the amount which makes the popcorn stick together enough to form the vertical sides of pie crust.The popcorn itself was simply popped in the microwave and I used a little shy of 100g unpopped popcorn for the 420\u03c0 (5.5 cups) of popped popcorn I used for the crust. A little bit of chopping is required for it, but don't overdo that! See pictures for what it looked like.Mix the processed popcorn with sugar-butter mixture and let cool a bit. Use this cooled mixture in cake tin or whatever else you use as your pie dish and form a crust. Bake this formed crust at around 180\u007f\u00b0C (350\u00b0F) for around 10 minutes so it sticks together a little bit better as the sugar melt the popcorn together.Take it out and let cool, or, if you planned well ahead pour the cake like filling (from next step) in it and bake further.. This again wasn't exactly a fruit of my imagination. 
After using some google-fu I found a chocolate cake recipe with great reviews and decided to go with that. It seemed easy to make as well.As far as my limited kitchen skills go, I know one thing which helps when making stuff like that: mixing the dry ingredients first and adding the liquid ones later. So this was the tactic I used here turned out well.Since I didn't have espresso powder mentioned in the linked tutorial, I opted for a natural way of brewing very strong coffee and adding that instead. Not sure if that made any difference and how strong taste one does get from the espresso powder. Couldn't complain about the taste though and that's the most important part.The mixed batter minus the boiling coffee was withheld until the crust was ready so that when I added the boiling coffee to the mix, I was able to whisk it like I have never whisked before, apparently to make it more airy.After all of this and given that you have the crust ready - just pour it in and let put back in to bake. The centuries (probably) old toothpick technique for testing doneness (put toothpick in centre, if it comes out clean - it's done), while not very scientific, works really well. It took around 20 minutes total for it to cook. As it was done I took it out and away to cool. It was the time to make the top cranberry layer.. I wanted a sweet and sour sensation while eating this and my grandma's cranberry jam with some extra frozen cranberries added seemed like the right candidate for that.Making the top layer is at the same time simple and complex. It's a simple combination, but it's somewhat hard to get the consistency right. My aim was to make it not as runny as the jam was, yet not gummy. Basically, it should feel like eating jam with the convenience of it not oozing everywhere while you eat it. 
The ratio in my case was around 10 to 1 jam to gelatine, although it was on the gummy side, so a little less would've been even better.You will definitely need to test this yourself with the jam you're going to use. A good way to do that is by taking a small dish, putting in some 3 teaspoons or so of jam and adding a bit of gelatine (remember how much though). Heat this up in microwave oven and let cool. See what the consistency is after it is around room temperature and decide if that suits you.The gelatine won't melt in a cold jam, so you will need to heat all of it up. No need to overdo that though. While it is possible to do it on the stove or in the microwave oven, I simply mixed the cold jam with gelatine and threw the bowl in the oven which was still warm from baking. Since the cake was still cooling down and I didn't need to hurry anywhere this was very convenient and also saved energy I would've otherwise used to heat it!As the pie was cool, I took out the jam with already molten gelatine, mixed it a little bit just in case and poured some frozen cranberries in there. I would suggest defrosting them before this though since they cool down the jam rapidly and it will gummy up or at least become harder to pour over pie.As you have the jam and cranberries mixed, pour that over the pie still in tin and leave to cool so the gelat-jam isn't runny anymore. As it cools - remove from tin and eat or add decor.. The pi day decor was pretty simple. I made a vector image on my computer, made sure it is the right size for my pie and cut it out of cardboard. The file is attached so you can print and cut or simply cut in the size it is given that your dish is around the same size (the image is 20cm or 8\" wide). One thing to keep in mind is that I forgot to add stencil like connections for the 4 (or P) so you may want to do a little bit of editing to fix that. Not a big deal at all, I solved this issue with some bits of masking tape which held the islands in place. 
You should probably also mark the middle point in some way. I remembered that too late already and just eyeballed it.The lettering was made using powdered sugar, yet that is probably not the best way to do it, since it soaks up from the top layer of jam eventually, and the text becomes a lot less visible. So if you do this - do that right before serving for best effects.The cardboard stencil itself was not put directly on jam because it would stick awfully and I'm not sure that it's the kind of paper I want to get in contact with my food. I used some broken toothpicks as stencil supports so it is right above the pie, yet doesn't touch it.The powdered sugar was applied through a fine net to assure somewhat even coating.. All that's left now is to serve and enjoy and I certainly hope you do so!If some questions arise - leave a comment, I try to answer all of them. If there is something where you see bad form - definitely leave a comment, as I said, this is a new medium for me and it would be great to get into good habits early on.Speaking of the end result - I liked the taste, and everyone else I gave the opportunity to taste it enjoyed it as well. Some stuff was pointed out which I already had in mind as having the potential for being fixed. To put it simply, if I had to make this taste combination again, I would make a thicker, sweet, decently caramelized popcorn crust with chocolate fudge as a not too sweet filling and leave the top cranberry layer as is.The odd pictures you see in this step are from tests I ran for taste compatibility and the feel of things in general.If you think this instructable is worthy of a robot apron or something - leave your vote on top of the page. 
If you like what I make in general, follow me on instagram.Until next time!Raitis\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [0, 3, 2, 1]\nD: [2, 0, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_111_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_111_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_111_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_111_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 3, 1, 2]\nD: [2, 3, 0, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients:\n\t\t32 elderflower heads\n\t\t1kg sugar\n\t\t55g citric acid\n\t\t4 washed, unwaxed lemons\n\t\t1.5 litres boiling waterTools & Materials:\n\t\t1 large pan\n\t\t1 jelly bag (or scrap of fine muslin)\n\t\tseveral sealable bottles. You'll need 32 elderflower heads.\nOf course, the best method is to go out and pick them. Pick any within reach from the elder tree until you have all that you need.\nOur elder tree at work however had to come down. It was starting to push a fence down and we wanted to heighten the wall as well. So I cut it down then found a small boy to forage through it picking off the heads while I finished off the tree.\nIf you leave picking the flowers too late like I almost did, try heading somewhere cooler (like up a hill or out of the town) to find some elder trees still in flower.. Pour 1.5L of boiling water over the kilo of sugar in a large pan.\nStir until it has dissolved.\nAllow the sugar syrup to cool before adding anything else.. Zest all four lemons then cut into thick slices.\nAdd to the cooled sugar syrup.. 
Measure out 55g of citric acid granules and add to the pan. Stir until they've dissolved.\nThe citric acid lowers the pH of the cordial and keeps bugs at bay. It also enhances the lemonie flavour.. Toss in the elderflower heads and smoosh around until they're under the water line.\nCover with a clean tea towel and store in a cool place to steep for 24-48 hours.. Sterilise the bottles in a hot oven. If reusing them, rinse well first. Mine were new so I just ovened them for 10 minutes at 150\u00baC.\nTo sterilise the lids, boil them for a few minutes.. Having steeped for 48 hours the cordial should now be ready. Remove the bottles from the oven and allow to cool enough to handle them. Fill the bottles any way you please, lab glassware not essential.\nCap immediately.. Serve in a glass with ice and lemon, diluted with sparkling water to taste.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 3, 1, 2]\nD: [2, 3, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_112_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_112_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_112_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_112_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [2, 3, 1, 0]\nD: [1, 2, 3, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Serves six:6 tortillas (burrito-sized) 24 oz. salsa 2 cups sharp cheddar cheese, divided 1 bell pepper1 package rice (I used Spanish Rice a Roni) 2 cans pinto beans (drained & rinsed)Preheat your oven to 350 degrees. Cook your rice according to package directions. While it's cooking, dice up your bell pepper.. 
After rice is done cooking, mix in your diced bell pepper, rinsed pinto beans, 1 cup cheese and a third of your salsa.. Pour another third of the salsa on the bottom a 9x13 baking pan.. Lay out six tortillas and fill with mixture evenly. Spread more cheese on top if desired.Fold burrito style and fit all six tortillas next to each other in the pan.. Cover with remaining rice mix, remaining salsa, and cup of cheese.. Bake in oven for 15-20 minutes, until cheese is melted.. \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [2, 3, 1, 0]\nD: [1, 2, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_113_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_113_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_113_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_113_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [2, 0, 1, 3]\nD: [3, 1, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Dissolve yeast in warm water with a teaspoon of sugar; let stand until frothy.\u00a0 Combine yeast mixture, milk, 1/2 cup sugar and enough flour to make a medium batter (about the consistency of pancake batter).\u00a0Beat thoroughly Add melted butter, eggs and salt.\u00a0 Beat well. Add enough flour to form a soft dough Knead dough until smooth or use a mixer with the hook attachment for about 4-5 min Rub the dough with a thin layer of oil Cover and let rise in a warm place until doubled in size Punch down. 
Cut dough into individual portions Let rest 10 minutes Shape dough into desired forms (I have included a video of how I learned years ago when I worked at a pizza place, but do whatever works for you) Place on greased or lined baking sheets Let rise until doubled Bake at 350 degrees for 10-13 minutes. As soon as the rolls come out of the oven baste with butter And there you have it, delicious rolls similar to Texas Roadhouse's. For the cinnamon butter combine one softened stick of butter with approximately 1/4 tsp cinnamon, 1/4 teaspoon vanilla, 1 teaspoon powdered sugar, and 2 Tablespoons honey. Adjust the measurements as needed to get the sweetness level you would like.\u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [2, 0, 1, 3]\nD: [3, 1, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_114_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_114_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_114_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_114_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 1, 0, 2]\nD: [1, 3, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients : 1) 1/2 kg Dates 2) 1 tablespoon Ghee/Butter 3) 100gms (10-12 pieces) Marie Biscuit Supplies: 1) Non Stick Utensil 2) Spatula 3) Aluminium foil 4) A Dish ( for mixing) 5) A Knife 6) A thick plastic wrap ( i recycled and used the bag in which i purchased sugar) Preparation: 1) Remove the seed from all the dates and keep the cleaned dates aside.. 1) In a non-stick pan,\u00a0Heat Ghee/Butter and cook on medium flame. 
2) When the Ghee/Butter starts to melt, add the cleaned dates and saute for 8-10 min till the dates soften to form one huge shiny lump. 3) Remove from the flame and transfer it in a separate plate. Let it cool down for 5 min.. 1) Randomly break the biscuits into medium pieces( or just break the biscuit into 4)\u00a0and add it into the date mixture. Be careful from now on as you will be handeling the mixture with your bare hands and it will be quite hot. 2) Lay a square piece of aluminium foil and cut and place the thick plastic sheet on it in such a way that atleat a 2 inch border of the foil is seen. 3) Roll the entire mixture into one tight & thick cylinder.I used the plastic sheet\u00a0 to make this process easier. 4) Make sure that your roll is longer than the plastic sheet and reaches onto the foil. 5) Twist the foil on the edges in opposite direction for it to look somthing like the pictures. 6) Place it in the freezer for atleast 2 hours.. Take\u00a0the mix out\u00a0from the freezer and remove the cover. Cut it into thin slices. Thinner the better. Serve immediately and enjoy. Note: This can be made well in advance and can be stored for a very long time. It is a healthy and nutritive preparation which you can have as a snack or as a dessert. 
I\u00a0tend to usually have it in my freezer for my unexpected guest at home.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 1, 0, 2]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_115_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_115_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_115_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_115_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 2, 3, 1]\nD: [0, 3, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Equipment\nYou'll need a basic canning kit. This includes:\n- Wide mouth funnel\n- Jar Lifter\n- Magnetic Lid Lifter\n- Bubble remover/ Head Space Gauge\nThe first two are the most important however, I purchased all of them for around $7.\nYou will need a canning rack with the lifting attachments.\nAdditionally you will need:\n- A small sauce pot\n- A large canning pot (or regular pot permitting it fits your canning rack)\n- A large sauce pot\n- A wooden spoon\n- Ladle\nYou will need 6 - 250 mL jars with snap lids and screw bands. If you are purchasing jars new they should come with the snap lids and the screw bands. If you are reusing jars be sure to purchase new snap lids as these should not be reused.\nIngredients\n\u00a0- approx. 3 pints berries (4 cups, crushed)*\n- 1 package Pectin\n- Agave Nectar (3 cups)**\n- butter (1/2 tsp)\n- lemon juice (2 tbsp)\n\u00a0 \u00a0\nNote: This recipe only uses the equivalent of 3 and 3/4 cup sugar. This the \"light\" version of the recipe. 
If you choose to make the regular recipe it would require 5 cups sugar.\n\u00a0\n\u00a0* Last summer I used a pint each raspberries, blueberries and blackberries. Today I used 2 pints blackberries and 1 pint blueberries as our black berry bush was churning out more berries than we could eat.\u00a0\n\u00a0** If you do not have agave nectar readily available you can substitute sugar cup for cup or a ratio of 3:4.. Ingredients \u00a01. Wash* your berries. Lay your berries onto a baking sheet and use a potato masher to crush them. Measure 4 cups and place into a large sauce pot. \u00a0 2. Combine berries, butter, pectin and lemon juice. Measure agave nectar or sugar and set aside.\u00a0 Equipment 1. Place your canning rack in the bottom of your pot. Place your jars into your pot and cover with water. Bring to a simmer (180 F/ 82 C). 2. Set screw bands aside. Heat snap lids in hot water (not boiling). Keep both hot until ready to use. \u00a0 * I washed my berries with a small about of lemon juice to help remove any impurities. I additionally washed them twice as they came from my backyard and had some ants hanging about.\u00a0 . Ingredients 1. Over high heat bring fruit to a full boil.\u00a0 2. Add all the sugar stirring constantly and return to a full boil that cannot be stirred down.\u00a0 3. Boil hard for one minute*. Remove from heat and skim off any foam.\u00a0 4. Cool a few minutes. ** *My first time making jam I learnt the hard way that the pectin will not activate if you do not allow your jam to boil long enough.\u00a0 ** Some people recommend to ladle the jam into your jars while it is still hot. However if you have larger fruit chunks allow the jam to cool a few minutes to prevent all the fruit to floating to the top of your jars.\u00a0 . 1. Carefully remove a jar from the canning pot. I suggest handling the jar with a dishtowel to prevent burning yourself.\n\u00a02. 
Place the large mouthed funnel into the jar and quickly ladle the hot jam into the jar withing 1/4 inch from the top (this is where the head space gauge comes in handy). Add or remove jam as necessary.\n\u00a03.Using a non-metallic utensil (The head space gauge/bubble remover) remove any air bubbles from the jar.*\n4. Wipe the rim to removed any food residue and center the hot snap lid on the clean jar rim (use the magnetic lid tool to remove the snap lid from the pot of simmering water.\n5. Screw the band down until resistance is met, then increase to finger tip tight.\n6. Return to canner and repeat with remaining jars.\n\u00a07. If you have any jam left over just spoon them into a small ramekin or bowl and enjoy!\n*This jam was fairly liquid so I did not need to remove any bubbles.\nNote: The recipe should make 6 250 mL jars when using sugar. Because the agave nectar is liquid and I had an extra quarter cup of crushed berries I ended up with more, around 3- 500 mL jars and one 250 mL jars with a bit \u00a0jam left over in a ramekin.\u00a0 . 1. When your canner is full ensure that there is at least an inch of water covering the jars. Cover the pot and bring to a full roiling boil before you count processing time.\n2. For altitudes up to 1000 ft process for 10 minutes. After processing is complete remove lid from your canner and and wait five minutes.\n3. Remove the jars without tilting and place on a protected surface. Cool undisturbed for 24 hours.\n4. After cooling is completed check the seals on the jars. Sealed jar lids will curve downward and will not move when pressed.\n5. Remove screw bands and wipe jars clean.. \nAfter you have wiped down the jars you may replace the screw bands or cover the snap lids with a decorative swatch of fabric fastened with a piece of ribbon or twine. Like here:\n\u00a0Finally label your jars. There are plenty of websites that offer label templates for free. 
Here is a site that offers lots free printables; just choose one of your liking! \u00a0These two sites (here \u00a0& here) both offer printables exclusively for canning. You can either print on sticker paper (available at most craft stores) or an card stock and fasten with ribbon.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 2, 3, 1]\nD: [0, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_116_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_116_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_116_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_116_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [2, 0, 3, 1]\nD: [1, 0, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Whisky Glass. 1/3 ice. 1/3 Black Malibu. 1/3 Squirt Soda. Garnish with XXX Vitamin Water for color or taste preference. 
Enjoy Responsibly\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [2, 0, 3, 1]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_117_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_117_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_117_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_117_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 3, 2, 1]\nD: [3, 2, 0, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Pulled Pork\n\t\t1 3-4 lb pork shoulder or Boston Butt (which is a shoulder cut)\n\t\t12 oz. (1 1/2 c.) hard ciderDry Rub for Pulled Pork\n\t\t2 Tbs brown sugar\n\t\t1 Tbs kosher or other coarse salt\n\t\t1 Tbs Paprika\n\t\t1Tbs Black Pepper\n\t\t1 1/2 Tsp garlic powder\n\t\t1 1/2 tsp dry mustardPie Pastry\n\t\t3 c. flour\n\t\t2 Tbs sugar\n\t\t1 tsp salt\n\t\t1/2 c. pork lard (from pulled pork)\n\t\t1/2 c. + 2 Tbs Butter, cold\n\t\t1/2 c. ice cold water\n\t\t1 tsp cider vinegar\nPie Decorating\n\n\t\tBlack food dye\n\t\tGreen food dye\n\t\tRed food dye\n\t\t2 egg yolksTools\n\t\tRoasting Pan\n\t\tPlastic Wrap\n\t\tSpring Form Pan\n\t\tAluminum foil\n\t\tparchment paper\n\t\t3-4 c. uncooked rice\n\t\tPaper\n\t\tPencil\n\t\tScissors\n\t\tfood processor\n\t\trolling pin\n\t\tpastry brushes. 
Dry Rub\n\t\t2 Tbs brown sugar\n\t\t1 Tbs kosher or other coarse salt\n\t\t1 Tbs Paprika\n\t\t1Tbs Black Pepper\n\t\t1 1/2 Tsp garlic powder\n\t\t1 1/2 tsp dry mustard\nMix all dry ingredients in a bowl.\u00a0 Rub dry ingredients all over pork roast.\u00a0 Rub it deep into all cracks and folds.\u00a0 Wrap roast in plastic wrap and chill 4-12 hours.\u00a0 Preferably overnight.\u00a0. Preheat oven to 275-300 degrees F.\u00a0 Remove plastic wrap and place pork in a roaster.\u00a0\u00a0 Place roaster with pork in oven and roast for 3-4 hours, until pork begins to easily come apart with a fork.\u00a0 Crack the crusted surface of the pork and pour 12 oz. of hard cider over roast.\u00a0 Cover with aluminum foil and return to oven for 30 more minutes.. A classic British meat pie has a very dense filling, so once the pork is cooked and pulled apart, we want to compress it so it will make a nice compact portion for the inside of our pie.\u00a0\nAfter the 30 minutes of roasting with the cider, remove pork from oven and cool to room temp.\u00a0 Remove pork from roasting pan and place in bowl.\u00a0 Using a fork, pull pork apart.\u00a0 Be careful, pork will still be hot on the inside.\u00a0 The pork should be very moist and supple.\u00a0 Squeeze the pork together with your fingers and form into a ball.\u00a0\nUsing a sheet of plastic wrap, wrap the pork ball as tight as you can to further compress.\u00a0 Place under some weights, a few plates work nicely and chill for a few hours, or overnight.\nUsing a spatula, gently spoon the cider au jus into a container and chill.\u00a0 Once the liquid is chilled, the lard will solidify at the surface.\u00a0 We will reserve the pork lard for the pastry and the au jus as a serving sauce.. 
While the pork is roasting and compressing, we'll create our eye design and mold.\u00a0 Outline base of a spring form pan on a sheet of paper.\u00a0 Card stock is nice, as its thicker.\u00a0 I used an old manila folder.\u00a0 Using a stock image of an eye drawing, draw an eye within the circle of the spring form pan.\u00a0 This will be a guide for your eye pie.\u00a0\nNow its time to make the walls of our eye mold.\u00a0 Place the eye drawing at the base of your spring form pan.\u00a0 Fold sheets of aluminum foil and using your eye guide, place aluminum around edges of eye.\u00a0 Fold foil together until you have an eye shaped mold.\nOnce we have the eye mold walls made, we can assemble the mold.\u00a0 Begin by wrapping the circular base and the walls with aluminum foil.\u00a0 Wrap each individually.\u00a0 Place spring form base into pan walls and clip spring form base in place.\u00a0 Place the aluminum mold walls into the center of the pan.\u00a0 Fill the edges between the mold and the walls of the pan with a few cups of rice.\u00a0 The rice will give stability to the pie while it bakes.\u00a0\nPlace a sheet of parchment over your original eye design and outline outside of eye.\u00a0 Cut and place at the bottom of the foil mold.\u00a0 Cut sheets of parchment and line the walls.\u00a0 The parchment will keep the pie from sticking to any of the aluminum foil.\n\u00a0. Pie Dough:\n3 c. flour\n2 Tbs sugar\n1 tsp salt\n1/2 c. pork lard (from pulled pork)\n1/2 c. + 2 Tbs Butter, cold\n1/2 c. 
(8 Tbs)\u00a0 ice cold water\n1 tsp cider vinegar\nUsually I will make this recipe with just butter, but since we have the lard from the pork, we should use it!\u00a0 Lard makes an even flakier crust then butter and it adds a nice taste to the crust as well!\nOnce your au jus is chilled, the lard should have risen to the top and solidified.\u00a0 You can now easily remove the lard.\u00a0 Scrape off and pat away any au jus still on the lard using paper towel.\u00a0 The dark au jus could stain our pastry.\u00a0\nIn a food processor, mix all dry ingredients.\u00a0 Add butter and lard, pulse until well mixed.\u00a0\nIce down 1/2 c. cold water and add cider vinegar.\u00a0 While pulsing, slowly pour in icy liquid mixture until dough is chunky.\nRemove from food processor and pat into a dough ball.\u00a0 Wrap in plastic wrap and chill for at least an hour.. After dough has chilled, remove from the refrigerator.\u00a0 Cut 2/3 of dough.\u00a0 Place 1/3 of dough back in the fridge.\u00a0 On a floured surface, roll out dough in a rectangle big enough for the base and walls of the mold.\u00a0 In order for this pie to be self standing, after its baked, the dough has to be rolled out thick, about 1/4\" thick.\nPlace the dough in the base of the mold and pat it down into all the edges and crannies.\u00a0 You want a bit of an overhang on top so you can secure the top dough cover when you get it into place.\u00a0\nRemove your compressed pulled pork ball from the refrigerator.\u00a0 Remove plastic wrap.\u00a0 Place meat into pie.\u00a0 Form meat ball to fit the pie opening.\u00a0 Pour 1/4 c. of au jus over the pork.\nRemove remaining 1/3 of dough from fridge.\u00a0 Roll out 1/4\" thick.\u00a0 Using your original eye design template, cut out dough topper. Reserve dough scraps for decorating eye.\u00a0 Place dough topper into pie mold.\u00a0 Brush egg yolk over edges of pie topper and roll edges of eye dough into place.\u00a0. 
\nCut apart your original eye template into the parts, eye lashes, tear duct, iris, pupil, etc..\u00a0 Roll out remaining dough scraps, and cut out eye parts.\u00a0 Since this is a Pi day celebration, we're going to make the pupil Pi shapped!\u00a0 How clever.\nDilute a few drops of food coloring into a few drops of water.\u00a0 Using a pastry brush, brush appropriate colors onto eye parts, i.e. black onto eye lashes and pupil, green (or whatever eye color you like) for the iris and red for the tear duct.\nBrush undersides of eye parts with egg yolk and set into place on your eye.\u00a0. \nPreheat oven to 325.\nBefore putting pie into the oven, drip a 1 tsp of water into each of the rice wells and mix.\u00a0 This will keep any kernels from popping.\u00a0\nPlace pie into oven and bake for 30 minutes.\u00a0 After 30 minutes, remove and brush top of eye with egg wash.\u00a0 Bake for another 20 minutes.\u00a0\nNow its the moment of truth.\u00a0 Its time to remove the mold and see if this pie will stand on its own.\u00a0 Remove pie from the oven.\u00a0 Place the spring form pan into a large bowl or pan.\u00a0 Remove spring form sides and remove rice.\u00a0 Cut off aluminum foil mold and peel off parchment sides. Turn oven up to 400 degrees F.\u00a0 Brush sides and top with egg yolk.\u00a0 Put the pie on the spring form base onto a baking tray and put back into oven.\u00a0 Bake until sides are nicely golden, another 30 minutes or so.. Remove from oven and allow to sit for 15-20 minutes.\u00a0 Remove from spring form pan base and parchment bottom.\u00a0 Serve warm.\u00a0 Garnish with drizzled cider au jus. 
Enjoy the glorious eye pie for a little while, but not too long.\u00a0 Eat this up before it gets cold!\nServe with Hard Cider of course and get pie-eyed!\u00a0 Now that's a hell-of-a-pie.\u00a0 Happy Pi Day!\u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 3, 2, 1]\nD: [3, 2, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_118_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_118_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_118_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_118_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [2, 0, 1, 3]\nD: [1, 0, 3, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For the dog cupcakes, I frosted the whole cupcake with chocolate frosting, and added a dollop of chocolate frosting toward the bottom for the snout.. I then used Wilton tip #233 (the grass/hair tip) to do brown fur originating from the snout and going outward. I didn't do it on the snout, or in the center part of the head.. Next I added ears toward the top using the same tip.. I then switched to white frosting colored with \"Ivory\" food coloring until it was a light tan color. I used the same tip (Wilton #233) to do the center of the head, and then the snout. I always made sure to start each strand of fur in the center of the top of the snout.. I added a chocolate covered raisin for the nose.. I added eyes using the same chocolate frosting, and a pink bow.. For the cat, I frosted the cupcake white and added two balls for the cheeks using Wilton tip 12.. I added eyes using chocolate frosting and piped in a pink nose as well.. 
For the ears, I cut a miniature marshmallow in half using my kitchen scissors, and then pinched one end to form the triangle.. I then cut 6 strands of black licorice about 2 or 2 1/2 inches long, and poked them into the cupcake until they were the desirable length.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [2, 0, 1, 3]\nD: [1, 0, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_119_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_119_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_119_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_119_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [1, 2, 3, 0]\nD: [0, 2, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients you'll need:\nStore-bought Corn Tortilla Chips (cooked)\n8 - 12 oz mini Chocolate chips\n2 Tablespoons Fleur de Sel or coarse Sea Salt\nCandied Jalapenos- See Step 2 for tools, ingredients and preparation\nChili-infused Toffee- See Step 4 for tools, ingredients and preparation.. Tools:\n1 cookie sheet lined with parchment paper.\n1 small saucepan\n1 small paring knife\n2 small bowls\nIngredients:\n4 to 6 Fresh green Jalapenos and red Fresno chiles. If you can't find red Fresno chiles, just stick with jalapenos.\nSugar- 1 cup total\nGreen food coloring for Jalapenos and Red food coloring for Red Peppers.\nWater\nPreheat your oven to 200 degrees. Line a cookie sheet with parchment paper or foil. Set aside.\nIn separate bowls, combine 1/4 cup sugar with a few drops of green food coloring and 1/4 cup sugar with a few drops of red food coloring. 
Stir them up until you're happy with the color of each sugar. Set aside.\nSlice the jalapenos in 1/4\" rounds. Use a small paring knife to cut away the excess fiber inside. Remove seeds, too. Set aside.\nIn a small sauce pan, combine 1/4 cup water with 1/2 cup sugar. Stir over medium-high heat until the sugar water/syrup boils. Reduce heat to medium-low and add the jalapeno slices. If you're using red and green jalapenos, do them in separate batches but use the same syrup.\nSlow boil for 1 minute. Remove the pan from the heat and 1 by 1, drop the sliced chiles into the coordinated colored sugar. Shake to coat then remove each sugar-coated slice onto the parchment lined baking sheet.\u00a0\nWhen all of the slices have been sugar-coated, place the baking sheet in the 200 degree oven for 15 minutes.\nAfter 15 minutes, remove them from the oven and leave them uncovered for an hour or until the sugar has dried and is set.\nIf the humidity is high in your area, you may want to bake them again at 200 degrees for 15 minutes.\nThe candied jalapenos can be made the day before and left to air-dry overnight.\nOnce dry, use small scissors to cut them in tiny pieces. Set aside.\n\u00a0. Tools and food stuff:\n2 cookie sheets covered with parchment paper.\nHand select the flattest tortilla chips from the bag and lay them on the lined cookie sheet.\nBowl of mini chocolate chips\nChopped candied jalapenos\nYou'll be building the Toffee Tortilla Chips right next to your stovetop so make room if you haven't already.\u00a0. Tools:\nA candy thermometer, a saucepan, a long wooden spoon and 2 metal spoons.\nIngredients:\n1 cup Sugar\n1 cup Butter\n1 Tablespoon Light corn syrup (optional)\n1 teaspoon Chili Powder (Use your favorite)\nClip the candy thermometer to your sauce pan, then add all of the ingredients above.\nOver medium heat, cook and stir until the sugar is melted. \u00a0\nTurn the heat to high and stir constantly until the thermometer reads 350 degrees. 
At this temperature you'll begin to smell the chili powder and see a little smoke.\nRemove boiling hot toffee to a cool burner.\nOne at a time, hold each tortilla chip (by a corner) over the saucepan, angled downward.\nCarefully spoon the hot toffee syrup over the top of the chip. (Only coat one side, not both sides.)\nShake gently, allowing the excess toffee syrup to drip back into the saucepan. Be careful. The toffee is HOT!\nPlace each toffee-coated chip on the lined cookie sheet.\n\u00a0\nSprinkle mini chocolate chips over the toffee while it's still \u00a0fairly hot. (This the time when an extra pair of hands will be helpful, if you can find them.) \u00a0\nGive the mini-chips a few minutes to soften, then use the back of a clean spoon to smooth the chocolate over the toffee.\nSprinkle a few bits of candied Jalapeno peppers on top of the chocolate.\nPlace the Toffee Chips in the freezer for 15-20 minutes to set.\nRemove the Toffee Chips from the freezer and pinch a tiny amount of fleur de sel (or sea salt) on top as the final garnish. \u00a0If this is done while the chocolate is hot, the salt might melt.\u00a0. One at a time, hold each tortilla chip (by a corner) over the saucepan, angled downward.\nCarefully spoon the hot toffee syrup over the top of the chip.\nShake gently, allowing the excess toffee syrup to drip back into the saucepan.*\nPlace each toffee-coated chip on the lined cookie sheet.\nSprinkle mini chocolate chips over the toffee while it's still fairly hot. (This the time when an extra pair of hands will be helpful, if you can find them.)\nGive the mini-chips a few minutes to soften, then use the back of a clean spoon to smooth the chocolate over the toffee.\nSprinkle a few bits of candied Jalapeno peppers on top of the chocolate.\nPlace the Toffee Chips in the freezer for 10-15 minutes for the chocolate to set.\nRemove the Toffee Chips from the freezer and pinch a tiny amount of fleur de sel (or sea salt) on top as the final garnish. 
If this is done while the chocolate is hot, the salt might melt.\n*Be really careful! The toffee is HOT and can burn you!!!. Toffee Tortilla Chips are like crunchy little candy bars!\nYou can serve them alone.... or with a bowl of Ice cream... or?\nThey taste best (crunchiest) when served the same day they're made. They are still yummy the next day and even the day after that.... just not as crunchy.\nIf you have leftover toffee just put it in a ziplock baggie and use the flat end of your meat hammer \u00a0to break it up into small pieces. They taste great by themselves, but can be used in cookie recipes and sprinkled over Ice cream or Cheesecake.\nEnjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [1, 2, 3, 0]\nD: [0, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_120_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_120_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_120_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_120_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 1, 2, 0]\nD: [3, 0, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For this recipe you will need:2 c chopped walnuts1 1/2 c powdered sugar2 c all-purpose flour1/4 tsp salt2 tsp vanilla extract5 tbsp sugar1 c unsalted butter. Melt the butter in a microwave and then cream together the butter and sugar using an electric mixer.. Mix in the salt and vanilla until combined.. Add the flour to the batter a little at a time, mixing well between additions.. Mix in the walnuts.. Put the batter in the refrigerator for an hour.. 
Roll 1 tbsp of the cookie dough into a ball shape and place on a baking sheet lined with parchment paper.. Bake in an oven that has been preheated to 350\u00b0 for 13 minutes.. Remove from the oven and place the whole baking sheet on top of a wire rack to cool for 2 minutes.. Now coat the cookies with powdered sugar. . After all of the cookies have been coated with the powdered sugar, coat them again.. I hope you enjoyed this instructable and if you decide to make these, let me know what you think about them!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 1, 2, 0]\nD: [3, 0, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_121_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_121_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_121_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_121_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [0, 2, 1, 3]\nD: [3, 0, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Heavy kitchen pot2 tbsp coconut oil1/2 cup white popcorn kernels1 tbsp sugar1/2 tsp salt(Remember \"two, and a half, and one, and a half\"). Yes! Put it all in the 1/2 cup container. It will fit.. Depending on the ambient temperature, the coconut oil may either be solid or liquid. If solid, heat it up just enough in the pot to melt it.. Leave a little bit of the cover open to vent.. They should all pop within a few seconds of each other. Have your vent on, as at this time the oil may smoke a bit. If it's a lot, turn down the heat slightly.. Shake immediately once the ingredients are in. 
You want to keep the kernels and sugar in motion so that the sugar does not burn on the bottom of the pan (which is very difficult to remove).. Nothing much will happen for about a minute, but the popcorn is quickly coming up to temperature.Once you get an inch or so of popped corn, vent the top to let out steam, and continue shaking. The popped corn helps keep the remaining kernels in the pot but be careful! Occasional drops of hot oil may escape. Protective eyewear is recommended.. If done correctly, there will be very few if any residual unpopped kernels.. As much as you will be tempted to, don't start eating it right away! That sugar may still be nuclear hot!Wait a minute or two for all steam to evaporate and the sugar to crystallize. The popcorn will be then be unbelievably crunchy, slightly sweet, and slightly salty. And all with only two tablespoons of oil!Store in a sealed container, or eat it all immediately.. I actually use a heavy steel (not aluminum) Back to Basics whirly popper. It's a bit more expensive but worth it. You are also much less likely to burn the sugar with this device (except the first time).\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [0, 2, 1, 3]\nD: [3, 0, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_122_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_122_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_122_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_122_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 2, 1, 0]\nD: [0, 2, 1, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This is the second best part. 
There are only 5 ingredients.1. 1lb unsalted saltine crackers. 1lbs is a box of 4 sleeves.Take my word for it and get unsalted. The flavors of this whole \"dish\" are much better when you don't feel like you just licked a salt brick.2. 1 cup canola oil3. 1 packet powdered ranch dressing mix. 1 oz is a packet in case you have a vat of ranch dressing mix.4. 2 tblsp crushed red pepper flakesYes, 2 tablespoons, not a typo. This is not a dish for the weak. Add more if you dare.5. 1/2 tsp garlic powder. Ok here is the hardest step (at least for me...), you have to find a suitable plastic container. Start searching all the food storage containers, Tupperware, Rubbermaid and zip lock tubs you have in your kitchen. You are looking for a short container that can hold as many crackers as possible, in a sealed location. A good lid/seal is important since it will get flipped over. Ideally this container will hold all 4 sleeves, but I have used 2 containers, so don't worry if you have to split it up. The plan here is to line up the crackers on edge so they are loosely siting like they would in the sleeve and when the container has the lid on and is flipped over, all the crackers stay in place.After selecting the correct vessel, take the crackers out of the sleeves and line them up in the container(s). See the picture for more help. Don't stack up, only 1 level.. Take out a bowl, measuring cup, large mug or whatever you have and pour all the ingredience (except the crackers) into the liquid holding device and stir.1 cup canola oil1 packet (1 oz) powdered ranch dressing mix. 2 tblsp crushed red pepper flakes1/2 tsp garlic powder. Keep stirring. When you stop the ingredients settle in the bottom and it won't pour as nicely. So as you stir, pour the mixture over the crackers. Make sure to get the crackers in the corners and on the edges. 
If you were not able to get all the crackers into one container, than with a little math you can tell how much of the mixture to pour in each container. 4 sleeves = all the mixture3 sleeves = 3/4 the cup2 sleeves = half the mixture1 sleeve = 1/4 the cup.After you have run out of mixture, put the lid on the container(s) tight. Than flip the container(s) over. Every 5 minutes or so, flip the container over again. Again, and again for around 20 minutes. When your patience has run out, and you don't see liquid running down the sides of the container when you flip it over, you're done.. Supposedly, you can put leftovers in a zip lock bag, seal air tight, and they last about a week or so. I have never experienced this phenomena. At best, I made some in the evening and there were left overs the next morning, but not when I got off work that afternoon. They're surprisingly addictive...A word to the wise and a disclaimer from a friend: \"As temping as this is, do not, I repeat DO NOT, eat all 4 sleeves yourself in one sitting\". No matter how good the movie is, all you will remember is the stomach pain in the morning. \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 2, 1, 0]\nD: [0, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_123_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_123_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_123_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_123_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [3, 2, 0, 1]\nD: [2, 1, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
1 LB GROUND BEEF\n1 ROLL OF SAUSAGE\n1/2 LARGE ONION\n2 EGGS\n1 CUP COOKED RICE\n1 LARGE GREEN CABBAGE\n1 JAR FAVORITE PASTA SAUCE\n1 PACKAGE ITALIAN CHEESE\nSALT, PEPPER, GARLIC POWDER TO YOUR TASTE.. core the cabbage(remove the hard middle) with a large knife, being careful not to cut the leaves. \nboil the cabbage for 20 to 30 minutes, or until the leaves are pliable, but not soft.\nnote: it took me some time to get the time right on this, if you make them too soft, they will be impossible to roll.\nremember that they will cook a little bit later on in the oven.. brown the ground beef, sausage, onions and seasonings in a skillet.\nadd salt and pepper to taste.\nyou MUST cool the meat in the freezer after it is fully cooked for at least 15 minutes. while the meat is cooling, cook the rice.\nadd two eggs and cooked rice to cooled meat and mix, this will be your filling.. peel one leaf at a time from the cabbage, being careful not to tear.\ncut the hard center piece of the cabbage off. (see picture for illustration)\nlay on a separate plate and fill with about 3 spoons of the meat mixture.\nadd a sprinkle of cheese, and roll up until tight.\ncut off remaining cabbage, and cabbage on end that does not contain filling.\narrange in a row in large glass baking dish.\npour your favorite spaghetti sauce over the rolls.\nfill empty sauce jar 1/2 full with water, cover, and shake.\npour water over rolls, making sure to cover every edge with water.\nadd remaining cheese over the top.\ncover the dish with foil.\nbake 350 for 1 hour 30 minutes.\nturn oven off and let rolls cool for 45 minutes.\nremove from oven and enjoy!\n\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [3, 2, 0, 1]\nD: [2, 1, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_124_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_124_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_124_2.jpg", + 
"../MMIU-Benchmark/visual_ordering/visual_ordering_124_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [2, 0, 1, 3]\nD: [1, 0, 3, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \u2022 Onions \u2022 2 separate cups of all-purpose flour \u2022 salt, pepper \u2022 two cloves of crushed garlic \u2022 12 ounces beer/ 1 bottle \u2022 Cold water or Buttermilk \u2022 Oil (for frying). Cut onions half an inch thick, remove dark exterior cover of onions and separate them into rings. Soak them in Cold water (or buttermilk) for 1 hour. This helps in breaking down the onions and removing their bitterness. In a pan/dutch oven heat up oil for frying them rings.. In a bowl, combine one cup of flour,salt, pepper and crushed garlic.Mix them thoroughly.In another bowl combine beer and one cup of flour and mix thoroughly.adding beer to this recipe makes the coating much more crunchy!. Remove onion rings from buttermilk, shake off excess. Dip the rings in flour,coat properly and shake off excess, then dip in beer batter.Drop the onion rings in oil and be sure not to crowd rings as they will stick together. When golden, remove and drain on paper napkins/towels.The onion rings can be served with plain mayonnaise like I did or you can use this recipe to make a spicy dip -combine sour cream, mayonnaise, sun-dried tomatoes, lime juice, 1/2 teaspoon pepper, 1 teaspoon salt, white wine and minced garlic and little chipotle sauce in a mixer or food processor. you can even add chopped coriander or mint. 
cover and refrigerate it.Hope you all liked!suggestions and comments are welcome.Also, this is my twist on an onion rings recipe I saw on instructables- https://www.instructables.com/id/Goooulish-Onion-O-...\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [2, 0, 1, 3]\nD: [1, 0, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_125_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_125_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_125_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_125_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 2, 1, 3]\nD: [1, 3, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. layout some strip of bacon on your clean surface.. slap some sausage on your bacon. roll the bacon goodness making sure to tuck the sides in.. 
I cooked mine at 380f for about an hour.garnish with avocado to make yourself fell less guilty!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 2, 1, 3]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_126_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_126_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_126_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_126_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 2, 0, 1]\nD: [1, 3, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You have two options for getting chips. Yay, options!\nThe first is just to buy the chips. Normal tortilla chips will work OK, but I think a slightly sweetened chip is a better fit. The good folks from \"Food Should Taste Good\" have some particularly nice options: sweet potato and chocolate are both yummy.\nThe second approach is to make your chips. Since I've tagged this snack as healthy, I'm avoiding the deep fryer, and recommend you bake tortilla chips. (If you live for deep frying, well, you can go that route too.)Ingredients\nTortillas (corn, wheat, or flour... whatever floats your boat)\nCooking spray OR vegetable oil (e.g. peanut or corn; if you're feeling crazy, try coconut oil.)\nWhite OR brown sugarDirections\nPreheat oven to 400F. If you're using oil, brush the tortillas lightly on both sides with some oil to help the chips crisp up and avoid sticking to the pan. Cut tortillas into strips or triangles or whatever shape you want your final chips to turn out. Unicorns? Why not! 
If you didn't use oil, spray a flat pan with cooking spray and lay out your tortilla pieces, and then spray the tortillas again with cooking spray. Otherwise, just spread out the tortillas. Sprinkle sugar lightly over the chips, and bake in the oven until crispy and a pretty shade of brown (~10 minutes).. Next step is to make your salsa. There is a lot of room for customization here, but here is something to start with:IngredientsNote: These quantities are rough... don't stress, you can't go wrong.\n1 pint of strawberries\n1 small jicama\n2 Tablespoons lime juice\n2 kiwisOptional: fresh gingerInstructions\nChop up the strawberries, kiwis, and jicama into chunks. Martha has some instructions on how to chop jicama if that's a foreign food to you. Pour in the lime juice, and if you're a ginger fan, add some (1 teaspoon) fresh grated ginger. Pop it in the refrigerator until you're ready to serve.Feel free to add other or additional fruit. Apple would make a fine jicama replacement, cherries are great, green grapes might be nice. Go wild.. Next, we make our peach dip, which is supposed to look like nacho cheese... but you already knew that.Ingredients\nGreek yogurt (plain or vanilla)\n2 peachesOptional: honeyDirections\nRemove the skin from the peaches. Sound like a pain? I did it by blanching, which means adding a small slit in the skin, boiling for a few minutes, and than shocking in ice water. The skin comes right off. (Here's a video demonstration of blanching.) Then cut up the peach into chunks and toss it into a blender until it's properly pureed. Add greek yogurt to your blender in small batches until you get a nice nacho cheese color and texture. If your peaches are sweet and you like the tangy taste of yogurt, you can stop here. Otherwise, add and blend in some honey to taste.\nIf peaches ain't your thing, try this with a different orange-colored fruit puree. Mangoes would definitely work, and apricots might be good too.. 
I never considered using avocado in a sweet dish until I went to Indonesia, where avocados are commonly used in fruit smoothies and milkshakes. Making a sweet version of guacamole was the motivation behind these nachos, and in my opinion, it's the best of the dips. (P.S. You should also try to make an avocado milkshake. Here is a good recipe.)Ingredients\n2 Avocados\n2-3 teaspoons condensed milk\nCherries. I used bing cherries because they've got some yellow and red in them, but whatevs.Optional: jicamaDirections\nClean and cube the avocados. De-pit* the cherries and cut into small pieces. Pour condensed milk and mix. (If you have some extra jicama, feel free to chop that up and throw it in too.)\n* If you don't have one of those fancy cherry de-pitter gizmos, a poor man's version is to put a cherry on a beer bottle and poke the pit out with a chop stick.. You know what to do...\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 2, 0, 1]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_127_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_127_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_127_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_127_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 2, 0, 1]\nD: [0, 2, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1) Beetroot - 1/3 Cup2) Carrot - 1/3 Cup3) Spinach - 1/3 Cup4) Red Cabbage - 1/3 Cup5) Turmeric - 1/8 Tsp6) Confectioner's Sugar (Powdered Sugar) - 1/2 Cup7) Corn Starch - 1 1/2 Tsp 8) Vanilla Extract (Any extract can be used) - 1/2 Tsp9) Water - required to blend . 
Beets/ Carrots/ Spinach:1) Puree vegetables in a juicer or a blender. (I used Blender)2) When using a blender, add little water to blend the vegetables. 3) Then filter the vegetable puree in order to remove the crushed vegetable pieces. Red Cabbage :1) Boil the chopped red cabbage in water (covering the cabbage) till the water turns dark purple. Then strain the purple water. Keep it aside.2) Take some of the purple water and add little by little of Baking soda to obtain Blue color.Turmeric :1) It can be added directly to the batter in the powdered form in a very little quantity(If added in excess overpowers the taste) . 1) Sift the powdered sugar along with cornstarch in order to remove any lumps present.2) Take a portion of sifted powdered sugar and cornstarch, add vanilla extract and the desired vegetable puree little by little.3) Mix well to form a batter with smooth and little thick paste consistency (not very thick. should be able to pipe it using a piping bag)4) Similarly make the batter for all the vegetable puree separately.(different extracts like almond, lemon can be used)5) Transfer the mixture into disposable piping bags (I used individual Ziploc bags for each color) and cut the edge making a small hole or use a very small round tip 6) Line a cookie sheet with wax paper or parchment paper.7) Pipe out long lines across the parchment paper. 8) Let dry undisturbed in a cool place for about 24 hours or until dry to touch9) Gently break the lines into small pieces and store the sprinkles in the airtight container for up to 3 months.Naturally Made Rainbow Sprinkles are Ready ! . Boiling Method: [Boiling the vegetables in water and using the boiled water]1) The color obtained from boiling the vegetables is lighter as it is diluted. 2) water content is more. 3) As the color obtained is lighter it is needed in higher quantity. 
4) The taste is diluted when compared to juice method.Juice Method : [Blending the vegetables]1) The color obtained from the vegetable juice is more vibrant.2) less water content.3) As the color obtained is vibrant it is needed in less quantity.4) It is likey to taste a little stronger than the boiling method.I tried both the method and used the Juice method for making sprinkles since I need the color with less water content.. 1) The vegetable puree can be frozen and stored for future use.(I used ice cube tray)2) Vegetable puree can be used as water color paint when it is frozen.3) The vegetable puree can be used to naturally color cupcake frosting, Play-Doh and homemade finger paints etc..\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 2, 0, 1]\nD: [0, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_128_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_128_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_128_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_128_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [2, 3, 0, 1]\nD: [1, 0, 3, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \n\t\tTable\n\t\tChopping Block\n\t\tBig Knife (serrated)\n\t\tPineapple\n\t\tTea Towel\n\t\tNewspaper (for composting waste). Whip off both ends of your fruit and tip them onto the waiting newspaper. The bottom end can be a bit tough so make sure you remove enough.\u00a0. Stand your pineapple on end and begin to shave away thin slices of the rough skin. 
Don't go too deep and ignore the small round brown holes that are left, you will deal with these in the next step.\u00a0\nTwist your pineapple as you go and the cut edge will be a guide to your next slice.\u00a0. This speckled fruit must now be flipped onto it's side ready to take out those small brown holes. You will notice the holes form two spirals going in opposite directions, either of these spirals can be followed with this technique, I prefer the longer spiral in the first photo working from left to right.\u00a0\nWith the pineapple on it's side grip hard with one hand and cut out a shallow groove in a spiral pattern using the holes as your guide. This is best done with several small cuts matching up.\u00a0\nRepeat this until all the small brown holes are gone.\u00a0. You will always be left with a few little flecks of skin, you can nip these out with the knife depending on how much effort you are willing to put in. Remember it is all roughage.. Your pristine pineapple is now ready to serve. You can slice it as thick or thin as you like, it is all down to taste. 
\u00a0 \u00a0\nWith it on it's side grip it firmly without too much downward pressure, this could blunt the spikes and slice away.\nAs was mentioned before the bottom end can be a bit tough so when presenting leave this bit off the dish and eat it yourself.\nThe slices can be arranged in a line or any other pattern you want.\nIf done right this simple technique leaves a dish that will catch anyones eye.\u00a0\n\u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [2, 3, 0, 1]\nD: [1, 0, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_129_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_129_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_129_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_129_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 2, 0, 1]\nD: [3, 1, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1/2 lb of your favorite cheese (I'm using white cheddar)\n\t\t4 cooked potatoes (russet works well - mine were TINY so I used more)\n\n\t\t3-4 cups of cream (sure, you could use milk, but it won't be nearly as delicious)\n\n\t\ta tablespoon of butter\n\n\t\tgenerous pinch of red pepper\n\n\t\tcouple pinches Italian seasoning\n\n\t\tsalt and pepper to taste\n\n\t\toptional: green onions and bacon for topping/mixing in\nYou'll want to wash and then cook the potatoes before doing anything else.\nIn the microwave:\n\nPrick them all over with a fork, and put them on a microwave safe plate.\n\n\n\nDepending on your microwave, four potatoes should take around 20 \nminutes. 
I normally do ten minutes, flip them and check their doneness \nwith a knife, and then do an additional ten minutes if they're still \npretty hard, and a little less if they're beginning to soften. :)\n\n\n\n\n\nIn the oven:\n\n\nBake them at 350 F for an hour.\n\n\n\n\n\nOnce they're done, peel them and slice them in half to speed up cooling - and then leave them to cool a bit.\n . Make sure all potatoes are peeled and cut into chunks.\n\n\nIn\n a saucepan over medium heat, drop in the tablespoon of butter, the red \npepper flakes and Italian seasoning. Let the butter melt and stir the \nseasonings around until they start smelling nice. :). Add the potatoes to the pan and then add 1/3 of the cream. Use a whisk or a potato masher to break down the potatoes until you're happy with \nthem. I like my potato soup to be a little chunky. :)\n\n\n\nOnce the potatoes are mashed enough for you, add in the rest of the cream.\n\n\n\nBring\n this up to a slow bubble, stirring every minute or so. You'll see tiny \nbubbles start to form around the edges of the pot and the soup will \nbegin to thicken a little once it's nice and hot.. Grate your cheese into the hot soup. Stir it often to make sure the cheese melts evenly.\n\n\nOnce\n the cheese is in, the soup will be nice and thick. At this point, add \nsalt and pepper to taste, as well as extra red pepper flakes and Italian\n seasoning if you want them. You might find that the cheese overpowers what you put in before. :D\n\n\n\nGarnish with bacon and green onions if you're feeling up to it or just dig right in. 
Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 2, 0, 1]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_130_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_130_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_130_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_130_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [1, 2, 3, 0]\nD: [2, 1, 3, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Recipe ingredients:Pomegranate-Butter GlazeWe juiced our own but you can purchase these:2/3 cup fresh pomegranates 1/4 cup fresh cranberries.1/2 cup brown sugar1/2 cup honey1/3 stick of butter1 Tablespoon minced garlic cloves1-1/2 teaspoon Fresh Rosemary mincedTurkey rub recipe:4 Tablespoons fresh Rosemary minced3 Tablespoons fresh Oregano minced1/2 teaspoon Rubbed Sage1 teaspoon ThymeInfused butter and basting:1 stick of butter ( cut about 20 slivers from the stick for the infusing and use the rest of the stick whole for the basting. Freeze until needed for recipe. You will also need a container of chicken broth or make your own, 1 onion, 3-4 celery stalks, 3-4 Carrots Utensils:Roasting pan, juicer (optional), tongs, long fork, slicing/paring knife, carving knife, chef knife, cutting board, bowls, saucepans, measuring cups and spoons, turkey baster or ladle and a basting brush. . Wash and Prep:For even cooking of the bird set the turkey out and allow it to reach room temperature. 
Cut the butter into small slivers (approximately twenty 1/2 cm slivers and leave the remaining butter stick whole for basting and place them in the freezer until hard; this will make the cubes easier to insert into the bird and for easier basting later).Wash the berries, vegetables and herbs.Quarter the onionCube the carrots and celeryMince 1 Tablespoon garlic. Method for dry rub:Mince 3 Tablespoons fresh Rosemary and 4 Tablespoons fresh Oregano.. Method: Measure the dry herbs and mix thoroughly:1/2 teaspoon rubbed Sage 1 teaspoon Thyme Add Sage and Thyme, to the minced Oregano and Rosemary. Add salt and pepper to taste.. Method:Remove the neck and giblets. You can use these for the dressing or soups later.Cut slits in the turkey breast as shown using a slicing or paring knife: about 20 incisions.Insert the frozen slivered butter into the cut turkey flesh.Repeat this method across the turkey breast.Rub some butter in the turkey cavity. . Method:Insert and rub a small handful of the herb mixture into the cavity of the turkey. Using your hands rub the herb mixture all over the turkey as shown.. Tucking the wings helps prevent the wing tips from burning. This method works most of the time.Position the bird breast side up as shown.Lay the wing against the breast of the bird to its natural position.Taking note of where the wing tip is make a small incision about an inch and a half lower than the wing's tip taking care to only separate the skin from the flesh creating a small pocket for the wing tip to rest in.Tuck the wing into this pocket.. Make the stuffing according to the box instructions.Stuff the cavity of the turkey.Note: you can skip this step for a faster cooking turkey.. Method:Pre-heat oven to 350 F (for a faster cooking time; use 325 if you're willing to wait about an hour longer)Pour the turkey broth into the roasting pan and add the carrots, celery, and onion. Place the turkey breast side down into the roasting pan. 
This method will increase the moisture in the white meat as the fat from the dark meat renders and drains down. Midway through the baking process (approximately 1.5 to 2 hours for a 15 lbs bird) you will flip the bird breast side up. The full cooking time will depend on your birds size. Our 15 lbs / fully stuffed bird took about 3.75 hrs to fully cook.Baste the the turkey every twenty to thirty minutes for duration of cooking. We will glaze the turkey during the last 15 minutes of baking time.. Pomegranate Butter Glaze Recipe and method:We juiced our own but you can purchase these:2/3 cup fresh pomegranate1/4 cup Fresh cranberries1/2 cup brown sugar 1/2 cup honey 1 Tablespoon minced garlic cloves 1-1/2 teaspoon Fresh Rosemary1/3 stick of butterAdd ingredients to a saucepan starting with the herbs, seasonings, and butter. For a bit more rosemary flare add an additional 1 sprig of rosemary. Pour in the juice and bring to a simmer on a medium heat while stirring. About 2-4 minutes. Stir in the brown sugar and honey to make a syrup.Note: the syrup will tend to boil over the pans rim so do pay attention and stir continuously.Remove saucepan from the heat.. Using a turkey baster or ladle; baste the turkey using the pan drippings and then apply the frozen butter stick throughout the basting process every 20 to 30 minutes until midway through the baking time. Cover the butter bowl and freeze in-be-tween each basting process.. Remove the turkey from the oven midway through the baking time.Baste the turkey with the pan drippings and butter as shown. Follow the instructions in the next step before placing the turkey back into the oven to continue cooking. . 
Flip the Turkey using the tongs and a large fork.Baste the breast with the pan drippings and finish off with the butter.Return the turkey to the oven and finish baking time making sure to baste using the pan drippings and the butter stick every 20 to 30 minutes until the last fifteen minutes of baking time; at this point you will glaze the turkey with the pomegranate glaze.. Using a basting brush glaze the turkey with the pomegranate glaze as shown.Continue baking the turkey until done (approximately 15 more minutes)Glaze the turkey again after you remove it from the oven.Allow the turkey to rest for twenty minutes before carving.. As with all turkey recipes your guests' palate will be enticed by the savory aroma of the roast; tempting them from the kitchen. However, once the pomegranate-butter glaze hits the skin of the hot bird from the oven; the entire experience changes to a wonderful sweet and savory captivation of the imagination. The visual appeal of the deep mahogany feast yet to begin will certainly be a welcome table piece for your holiday.This will be a Thanksgiving experience my son and I will cherish because we enjoyed creating a culinary masterpiece from both of our recipes. One year we made a complete Thanksgiving meal using a toaster oven, grill, and a fire pit because we were camped out at my son's property with no working kitchen. My son made the turkey using a power drill to turn the pole! I wish to thank contributors for making Instructables such a delightful place to share! Have a very safe and happy holiday. Thanks for stopping by and I almost forgot to mention . . . this is an entry for the butter contest November 2014 and if you like this instructable your vote will be much appreciated! 
Thanks again.sunshiine~\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [1, 2, 3, 0]\nD: [2, 1, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_131_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_131_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_131_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_131_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [0, 2, 1, 3]\nD: [2, 3, 1, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Don't buy store bought crackers, make your own! They are really easy to make and people will be shocked that you made them. These crackers are based off a recipe from a food blog that I LOVE.\u00a0 www.foodinjars.com\u00a0The possibilities are endless when it comes to making a cracker flavour. \u00a0I'm going to be showing you how to make one of my favourite combo's,\u00a0ground pepper & Thyme.\u00a0Pre-Heat oven to 450 degreesYou'll Need:\nParchment paper- for baking crackersMixer with dough attachment( no mixer- do it the old fashion way *HANDS*)Ground Pepper & Thyme crackers\n1 1/2 Cups all purpose flour\n1/2 Cup Cake Flour\n3/4 Cup warm water\n3/4 tsp. Salt\n1/2 tsp. black pepper\n1 tsp. fresh thyme ( you can substitute dried)\nAdd the flours, salt, pepper and thyme to mixer. Stir to evenly distribute seasonings. Slowly add the warm water and oil.\u00a0 Mix on medium until ball dough forms.\nIsn't this easy!. Remove the ball of dough from the mixer and bring over to your floured surface\nStep 1:\n-Knead the dough ball on a floured surface until it doesn't stick to your hands\n-Let dough set for 15 minutes\n-Cut dough in half. 
Step 2:\n-Roll out the dough with a rolling pin until 1/4 thick.\n-Cut out your shapes and lay them on your pan thats\u00a0covered with parchment paper and bake\u00a0in preheated\u00a0oven(450 degrees) for 8-12 minutes*Since this was for a Alice In Wonderland Theme Party, I used a ridged edged heart to represent the Queen of Hearts. I also made some little rabbits because they are great for kids and who doesn't LOVE the White Rabbit from the story.You Can also just use a knife or pizza cutter and cut out odd shapes if you don't want to use cookie cutters. (I'll be showing pictures of all 3 types this way you can see what each looks like)*If you want to add extra thyme on some of the crackers, lightly moisten the top of the crackers with water(NOT TOO MUCH)\u00a0and add more thyme. Press down on the thyme to make it stick.. What you'll need:\nHomemade Pepper & Thyme Crackers\nA tangy Jelly- I used a homemade Plum Wine JellyParmesan\nCheese cutter\nWalnuts - These are from my tree but\u00a0bagged store bought\u00a0are fine tooWalnut Cracker ( if needed)Step 3:\n- Spoon a tiny bit of jelly onto heart shaped cracker\n- Add a pieces of parmesan & walnut\n- Top with a sprig of fresh thyme.\nThe red jelly and white cheese goes perfectly for Queen of Hearts theme***NOTE****\nMake sure NO ONE is allergic to nuts at your party. Make some without nuts and serve on a separate tray making sure to label both trays. Also, make some with just Jelly,\u00a0 or\u00a0sliced turkey\u00a0& provolone or Swiss Cheese. Step 4:\n- Arrange crackers on a fancy tray and serve. Don't over load the tray as it could look messy and unattractive. Think less is more.\n- Have some serve the guests. Alice served ours...**The last picture is some of the appetizers without any nuts. 
These just had Jelly\u00a0 or Jelly with slcie Turkey and Provolone or swiss cheese.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [0, 2, 1, 3]\nD: [2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_132_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_132_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_132_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_132_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [3, 1, 2, 0]\nD: [0, 3, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \n\tNeeded ingredients:\n\t8-10 peppers(Green pepper or red pepper are both ok\u00a0, but medium spicy is better)\n\tSoybean sauce,salt,MSG(optional),water,cooking oil\n\tCooking tools:\n\tA frying pan with a Turner\n\tA\u00a0plate\n\tStoves\n\tStart:\n\t\u00a0\u00a0 First,clean the peppers and cut off the stalk.Use clean towel to wipe the water on the surface of peppers, to keep the surface dry.\n\tTips:Remember you must clean up the mild water on the pepper!Because we will then\u00a0put them in the boiling oil\u00a0and fry them, if there is even a little bit water, hot oil will splash around, it's very dangerous!So please keep them dry!If you are the first time to fry,you'd better wear\u00a0gloves or long sleeves,to make you safe.:). 
\u00a0\u00a0\u00a0 Now we have prepared the ingredients.Then we will start to cook the peppers.Put the saucepan\u00a0on the stove,\u00a0turn on the heat.Put moderate oil into the pan,wait for seconds until the oil are 80 percent hot.(Picture 2)\n\u00a0\u00a0\u00a0 Then turn down the heat,put peppers slowly in the pan,and make them heat evenly.After all the peppers are appropriately placed,cover the pan with the lid,fry them for a short time,about 2 minutes.\nTips:At this step,as the oil is too hot,there will be some oil spots keep spilling out.Take care of yourself and don't worry,put the lid on.:P ). \n\t\tAfter about two minutes,one side of the peppers has already well-fried.Open the lid,use the turner to turn all the peppers to the other side,put on the lid and wait another 2 minutes,to make the other side well done,too.\n\t\tWhen another 2 minutes has past,open the lid,we will start to put sauces in the pan.Put the right amount of salt(Maybe more than half a teaspoon,I forgot XD,you can test it) and soybean sauce(about 20ml).Put a little extra water into the pan,and turn the peppers for several times,to make\u00a0the sauses\u00a0well mixed.\n\t\tTurn up the heat and put on the lid again,When the juice boils,turn down to low heat,heat gently until the juice was absorbed.\n\t\tPut some MSG in the pan.(optional)\n\t\tNow we have finished all the cooking steps!Let's put them on the plate and get ready to eat!:D. 
Put the dish out on the plate,and now we can start eating!Look at the finished plate,do you fell like\u00a0 to bite a pepper?=)To tell the truth,this dish is a perfect match to rice and porriage!The tempting smell can spread for several meters!The spicy peppers can also open your appetite for other dishes!\nSo if you have enough time and enthusiasm,especially you are a spicy loves,try to cook a\"Braised pepper\"dish!I'm waiting for the news of your success!\u00a0 \\(^o^)/\nauthor:yuhuaabc\u00a0\u00a0cook:my mum:)\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [3, 1, 2, 0]\nD: [0, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_133_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_133_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_133_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_133_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [0, 2, 1, 3]\nD: [2, 1, 3, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. The ingredients for the pretzel pies is fairly easy to get, nothing weird or odd.You will need the following for a batchDough3 cups flour* ( might change a little depending on the flour*)4 tablespoons brown sugar1 tablespoon yeast2 teaspoons salt1 cup water (luke warm)Glaze1 egg1 teaspoon waterand pie filling! I used a can, but if you really want to, you can make your own (my way is easier). 
For this you will need some cooking weaponrymeasuring cups and spoons (duh)A heavy duty rolling pinA large cutting board( or something you can roll the dough out on)a big bowl for mixinga forka spoonOptionalA bread or dough machine, trust me, this makes it a lot easier to make. The dough is quite simple to make,\nadd the flour, water, yeast, sugar and salt together, and MIX! the dough gets really tough. The dough should end up fairly thick. If it is sticky, add a little more flour.\nWhen done, the blob of dough may not seem very large, but it will do a decent size batch of pretzel pie.. To being your pretzel pie making, cut off a golf ball sized chunk of dough.\nRoll the dough out to be longer and wider than your hand. It should be fairly thin, but still staying together well.. Before shaping, first take a spoon full of pie filling and put it in the middle of the flat dough. Make sure that it doesn't go close to the edges, otherwise it will come out the sides and get all messy. About a spoon full and a little bit usually does it.. The shaping is quite easy, take one side and fold it over the pie filling, press all around the edges to make sure the filling wont come out. Do the same for the other side. Press the side edges down and roll them up a little bit to make sure nothing will escape. pinch all the edges and putt it onto a pan, making sure that the long edge in on the bottom.. To make the glaze, combine one egg and one teaspoon water, and beat. Then brush this onto the tops of the pies. This makes it more pretzelish. If you dont have a brush then gently smear it on.. Before baking, make some quick slits on the tops of the pies, make sure the aren't deep, and dont press into the pie, this will make them turn out better. too deep and it might ruin it, and if you press down too hard, the sides might open up, leading to a gooey mess. Now comes the baking. Put them into the oven at 425 F. Set the timer for about 12-15 minutes. Be sure to check on the regularly. 
Once they start to brown, remove from oven, or else they might burn on the bottom.. Remove them and enjoy. They take a few minutes too cool down, and be careful the filling will be really hot!\nSome might not turn out as well as the others. but they will all taste just as good!. If you don't pie you can do many other things. You can fill them with cheese and tomato sauce, you can fill them with meat, you can fill them with just about anything. Or if you are really extreme, you could make pretzels!! Enjoy and share! \nSince pretzel pie is such a boring name, I am looking for a new name. I need a really good professional name for them, or a really funny name!\n\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [0, 2, 1, 3]\nD: [2, 1, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_134_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_134_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_134_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_134_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [3, 0, 2, 1]\nD: [1, 0, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Only 2 basic ingredients for this simple recipe:handful of chia seedssweetened soy milk[OPTIONAL] any type of fruit!You'll also need a sealable container to cool the pudding - in the picture, I reused an old jam jar. . For one serving of pudding, pour a handful of chia seeds into your container. Right now, they're dry, but once you add in the liquid, the volume will double, sometimes almost triple!. Add in about double or triple the amount of soy milk as you did the chia seeds and stir. 
This is when the magic happens! After a couple minutes, the seeds will start to absorb the liquid, providing a gel-like \"pudding\" consistency. Now is a good idea to sample the pudding. If you decide it isn't sweet enough, add in some sort of sweetner (preferably honey or syrup, but sugar is fine if you make sure it dissolves). After you're satisfied with the taste, seal the container and put it in the fridge for a couple hours. An interesting fact - chia seeds can hold up to 12 times their weight in water! . When you take the container out of the fridge 2-4 hours later, the chia seeds will have absorbed enough of the liquid in the soy milk so that the pudding has a more solid, viscous, texture, like that shown in the first picture. To serve, scoop out the pudding into a small bowl, top with fruit and nuts, and Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [3, 0, 2, 1]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_135_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_135_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_135_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_135_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 2, 1, 3]\nD: [1, 3, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \nFor one 12 inch crust\n1 teaspoon corn meal\n1 3/4 to 2 \u00bc cups all-purpose flour\n1 envelope Fleischmann\u2019s Pizza Crust Yeast\n1 \u00bd teaspoon sugar\n\u00be teaspoon salt\n2/3 cup 120\u00b0 tap water\n3 tablespoons oil\n1 tablespoon chopped rosemary\u00a0\nPre heated 425\u00b0F oven. 1. Sprinkle the cornmeal over a 12 inch pizza pan.. 2. 
Follow the Fleischmann\u2019s Pizza Crust Yeast directions found on the package by combining 1 cup flour, the yeast, sugar and salt in a large bowl.\u00a0 Add the water and oil and with a spoon mix together with the spring onions for 1 minute until well blended.\u00a0 Gradually add \u00bd cup of the remaining flour until a soft dough ball is formed\u00a0 which will be sticky.\u00a0 Add additional flour if necessary to form the ball.\n3. Knead for about 4 minutes on a floured surface scattered with rosemary until smooth and elastic.. 4. Wet your fingers with tap water and press the dough into the pan mounding it slightly at the edges.\u00a0\u00a0 With your index fingers press the edges together to form a rim then prick the dough with a fork 15 to 20 times.. 5. Bake for five minutes on the bottom shelf of the oven.\u00a0 With a fork, pierce any bubbles that have formed.\u00a0 Allow the crust to cool long enough to handle \u2013about 5 minutes.. For the topping\n1 cup canned pumpkin (not spiced pumpkin pie filling)\n1 teaspoon Sriracha sauce\n\u00bd cup grated Parmesan cheese\n\u00bd teaspoon garlic powder\n\u00bd teaspoon onion powder\n\u00bd teaspoon salt\nchives\n6. While the pizza is par baking, make the topping.\u00a0 In a medium bowl, combine the pumpkin, Sriracha sauce, grated Parmesan cheese, garlic\u00a0 powder, onion powder and salt until thoroughly blended.. 
Spread the pumpkin mixture evenly over the crust then smooth it with an offset spatula.\u00a0 Using the tip of a paring knife, draw the outline of a basketball on the surface of the pizza.\u00a0 Press the chives into the outline making sure that there is a little overhang at the edges, since the chives will shrink while baking.\u00a0 Bake for 15-20 minutes until the crust is lightly browned.\u00a0 Give everyone a chance to ooh and ah over your work then cut into wedges and serve immediately.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 2, 1, 3]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_136_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_136_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_136_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_136_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [0, 1, 2, 3]\nD: [1, 3, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Mileage may vary, according to your fridge scraps, but we'd say the following make for an exceptionally good soup:One carton vegetable broth1/2 roast chicken - pulled/shredded or chopped1/4 cup chopped bacon1/2 white onion, diced1 glove garlic, minced1 cup frozen southern hash brown potatoes (or diced potato)1/8 cup dried lentils1/8 cup dried split peas1/2 cup frozen baby lima beans1/2 cup frozen corn1 tsp Italian SeasoningSalt & Pepper to taste3 tbsp olive oil. Add about 3 tablespoons olive oil to dutch oven and saute onions and garlic till tender and translucent.. Pretty straightforward. You can shred or chop the chicken as desired, then add to sauteed onion and garlic.. 
We never really would have thought to add bacon to chicken soup, but there it was - so we nuked our pre-cooked bacon a bit, then chopped it up and dumped it in! . Pour in your broth, stir and bring the whole thing to a light roil.. While your broth base is heating up, wash your lentils and peas, and then add those to the pot.. Add in your potatoes and limas, season as desired, and stir. We like to leave the corn till the end, so it doesn't overcook, and has that nice sweet firmer texture to it.. Set the pot over a back burner on low, and cover. Let the whole thing simmer 45 min. to an hour, or until lentils and split peas are tender.. Add the corn during the last 10 to 15 min. of cooking, stir and add a little more water if desired. . Enjoy your Scrap Soup as is or over some rice, and celebrate the tasty fruits of your refrigerator emptying labor! It's a delicious soup and the bacon really makes it! Add a little hot sauce for an extra kick.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [0, 1, 2, 3]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_137_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_137_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_137_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_137_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 2, 3]\nD: [0, 1, 3, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Ingredients:\n3 medium onions, chopped\n1 tablespoon butter or olive oil\n~10 tomatoes (big juicy heirloom varieties* are best!)\n~10 cloves garlic\n~1 teaspoon salt\nfreshly ground pepper\nhandful fresh basil, chopped\n1/4 - 1/2 cup heavy creamTools:\nlarge heavy-bottomed pot\ncutting board\nsharp chef's knife\nwooden spoon\n* I get my awesome heirloom tomatoes from Wild Boar Farms at my local farmers' market.\u00a0 They sell seeds online if you want to grow your own!\u00a0 Highly recommended.. Heat pot to medium-low heat, add butter or olive oil, add onions and salt, and saute until onions are soft and just starting to brown.. While onions are cooking, coarsely chop tomatoes.\nAdd them to the pot, and stir gently to mix.\u00a0 Use tomato juice to deglaze the bottom of the pot if necessary.. Mince garlic, and add immediately after tomatoes.\u00a0 Stir to incorporate.. Bring the soup to a simmer and maintain on low heat, stirring occasionally, for about 20 minutes.\u00a0 Tomatoes will soften, and the garlic will cook down.\u00a0 You're ready for the next step when it looks like this.. Coarsely chop and add the basil.\u00a0 Stir to incorporate.\u00a0 Turn off the stove.\nYou want to heat it just enough to wilt the basil, but no more, so be sure the other ingredients are sufficiently cooked before you add the basil.. \nIf you want to add cream to your soup, do so now.\nStart with 1/4 cup of heavy cream, stir it in, then taste your soup.\u00a0 Does it need more cream?\u00a0 Then add more to your taste!\u00a0\nThe first picture below has cream added; the second picture does not.\u00a0 I used roughly 1/4 cup cream, as I like mine very lightly creamy.. 
Sample your soup, and add more salt and pepper to taste.\u00a0 Add more cream if desired.\nIf you want an additional umami kick a bit of Worchestershire sauce can help, but it's not necessary if you use great tomatoes.\nServe warm if you've used cream, warm or cold if you haven't.\u00a0\n- Great with a garnish of grated hard cheese or a grilled cheese sandwich.\n- Fantastic in a bread bowl.\n- Tastes even better the next day!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 2, 3]\nD: [0, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_138_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_138_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_138_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_138_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [2, 3, 0, 1]\nD: [3, 0, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1. Three medium sized potatoes 2. Cooking oil ( I prefer grape seed oil) 3. Seasoned salt or your own spices 4. Onion, bell pepper, or other vegetables (all these are optional) 5. Utensils needed: cooking pot, tongs, fork, paring knife, cheese grater, oven mitts. Gather three medium size potatoes and wash them under running tap water. Place all of the potatoes in a medium sized cooking pan. Heat the potatoes at a low boil for approximately 10 to 15 minutes. When potatoes are done, you should be able to pierce them all the way through using a fork. Caution: Use tongs and oven mitts to handle hot potatoes.. \u00a0After boiling them, carefully remove the potatoes from the cooking pan using tongs. 
Put the potatoes in a medium bowl and place them in the refrigerator for twenty minutes to allow them to cool.. Once the potatoes have cooled, remove the potato skins with a paring knife.. Shred the potatoes over a container using a cheese grater to do so.. Once you have all the potatoes shredded, place them back into the refrigerator.. Place\u00a0 skillet upon stove and pour 1/3 cup of grape seed oil into the skillet. Heat the skillet at a medium heat until ripples appear on the surface of the oil. If bell peppers or onions are desired, using tongs, place them in skillet and cook them to desired tenderness. Use spatula to turn vegetables occasionally to avoid sticking. Use tongs to add shredded potatoes to skillet. Turn ingredients of skillet occasionally to avoid sticking. Caution: Be careful to avoid splashing hot oil on oneself. If seasonings are desired, add seasoning to skillet. Once all potatoes are a crispy looking brown color, carefully remove the potatoes with a spatula and place them on a paper towel covered plate. (the towel will absorb excess oil from food) Allow hash browns to cool until they are ready to serve. . 
Enjoy your hashbrowns without the need to tip or the restaurant price.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [2, 3, 0, 1]\nD: [3, 0, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_139_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_139_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_139_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_139_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [2, 3, 1, 0]\nD: [1, 3, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 3oz caster sugar4 oz pudding rice2 pints milk30g butter1/2 tsp ground nutmeg1/2 tsp ground cinnamon1/2 vanilla pod (or extract). Heat the oven to 140 degrees celsius.Melt the butter in a large casserole dish, add the rice and stir.Add the sugar and stir until dissolved. Keep stirring for a couple more minutes.. Stir in the milk.Add the nutmeg and cinnamon and stir.Slice the vanilla pod in half length ways and scrape out the seeds.Add to the casserole dish.Bring up to a simmer, then transfer to the oven.. The pudding will take about 90 minutes to cook.Use a teaspoon to try a little of the rice to make sure it's soft. You want it to be soft and for most of the milk to be absorbed.I find this is lovely with a few sultanas sprinkled on top. 
Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [2, 3, 1, 0]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_140_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_140_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_140_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_140_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 0, 1]\nD: [0, 3, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Things you need are:\n- 1 cup flour (plus extra for flouring surfaces)\n- 1/2 cup warm water\n- 1/2 packet active dry yeast\n- 1 tablespoon olive oil\n- 1/2 teaspoon salt\n- 1 teaspoon sugar\n- 1-2 stalks of basil\n- 4-8 cherry tomatoes\n- shredded mozzarella cheese (as much as needed). In a small bowl, dissolve the yeast in the warm water. Let sit for 10 minutes or so, or until it's very creamy.\nIn a larger bowl, add flour, olive oil, salt, sugar, and the yeast mixture. Stir well with a spoon.\nCover with a dishcloth for 30 minutes. It should have risen and almost doubled in size.\nPreheat oven to 350 degrees F.. Wash your tomatoes well and cut off the tops. Then slice the tomatoes into rounds.\nRemove the seeds (the goopy stuff in the center :)) and chop them.\nAlso chop 1-2 leaves of basil fairly fine. Then cut a few more leaves into larger pieces.\nNOTE: It's easier to get the seeds out of the tomatoes when they're riper.. Now here's the fun part! :)\nFlour your hands and the surface you're working on. (I used aluminum foil because I could easily transfer it onto a baking sheet.)\u00a0\nTake a piece of dough small enough to fit in your palm. 
Spread it out a bit on the surface and place one or two finely sliced basil leaves in the middle. Layer tomatoes and cheese on top, but leave room to fold the dough up!\nFold the dough into a round, dumpling like shape. See my pictures for help.\nWe're ready to bake!\nNOTE: Don't put too much basil IN the calzone, because when basil bakes for a long time, it becomes brown and not as pleasant to eat. We will put the rest on later.. Your oven should probably be done preheating by now. Place the calzones on a baking sheet and let them cook for 20 minutes.\nAfter this,\u00a0don't\u00a0turn the oven off! \u00a0Quickly (but carefully!) take the pan out of the oven and put a piece of basil on top of each calzone. Sprinkle cheese on them and put back in the oven for 2 minutes.\nDone!. I suggest plating them like in the picture below. That way, with the cut calzone people can add any fillings if they want. (Obviously you would have to put them on the table. :) )\nEnjoy your culinary creation!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 0, 1]\nD: [0, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_141_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_141_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_141_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_141_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 2, 1, 0]\nD: [0, 1, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
For two people (easily multiplied):Leftover roast lamb in bite size piecesBasmati rice, 150 ml by volume, washed well and soaked in 300 ml water for half an hour, then drained250 ml stock (or water and half a stock cube)1 small onion, thinly slicedI clove garlic, grated1 teaspoon grated gingerChopped red chilli to taste (we like lots)Quarter teaspoon garam masalaHalf teaspoon salt (less if stock is very salty)Handful cashew nutsHandful coriander leaves (if you like)1 tablespoon rapeseed oilBlack pepper. Heat the oil in a heavy pan over a medium heat. Fry the cashew nuts until golden, then scoop out into kitchen paper. Fry the onion in the same oil for about 5 minutes, lowering the heat when it starts to brown. Stir in the garlic, ginger, chilli, garam masala, salt and drained rice. Fry for a couple of minutes, stirring to coat the rice with the oil.. Add the stock to the pan and cook gently for five minutes, stirring. Cover with foil and a lid and cook in the oven at 170 deg/150 deg fan/ gas Mark 4 for 10 minutes. . Add the lamb on top of the rice, recover and return to the oven for 10 minutes more. Stir the lamb in gently and taste to check the rice is cooked. Recover and leave to stand for 5 to 10 minutes.. Stir in pepper and coriander to taste, check the seasoning and turn into a serving dish. Garnish with more coriander and the cashew nuts. Enjoy! 
Yoghurt mixed with mint and some salt and pepper is a good accompaniment.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 2, 1, 0]\nD: [0, 1, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_142_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_142_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_142_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_142_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [2, 1, 3, 0]\nD: [2, 0, 1, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. A cardboard coffee cup is your best bet.Do not use a ceramic mug or cup.And avoid using a wax paper cold cup or a styrofoam coffee cup.Lightly coat the cup with cooking spray, or wipe the inside of the cup with olive oil, salad oil, butter or margarine on a paper towel.. For a large egg, a seven ounce (200 cc) paper cup is about the right size.Crack the egg and pour into the oiled paper cup.Did I mention that the cup is paper?. Yah, I know yer mom always added a little milk.But wait until you try your egg with a splash of water.A trick I learned from a pro chef.Milk makes eggs a bit rubbery.Water makes them fluffy.Start with about a teaspoon (5 cc).You do not need bottled water -- the bottle is just to make the photo more clear.. Gently stir the egg and water together with a fork, spoon, or chopsticks.Your goal is to break the yolk, and stir it and the water into the white.But you want to feel a little springy body in the mixture.Do not whip into a foam.At this point you can add optional ingredients like a little grated cheese, chopped peppers, onions.. 
Fold over the top edge of the paper cup.Fold the corners back.Place in your home microwave and start the process.Watch out for office, dorm, or other high power professional style microwaves. You may be scraping your eggs off the oven ceiling.Nuke for 1 minute at 30 percent.This is also called power level 3, or defrost mode on some units.. Time for some geek stuff.The microwave uses bang-bang control. This just means that the cooking power is either on or off. No such thing as half power.Different microwaves use different schemes, but one of the most popular is to use short bursts of full power.You can test this yourself with a cup of cool water and a pencil and paper.Set the microwave for 3 minutes at power 3.The oven light will dim and the fan sound may change when the cooking power is on.Watch the seconds on the display to measure the time.Record the timing of the power bursts - it is easier to have a helper write them down as you call them out..Try different power settings and cook times and note the results.Always start each test run with cool water in a microwave-safe container. Ya don't want scalding water all over the place now do you? Likewise, never operate the microwave empty or you'll be buying a new one pretty soon.It is good, geeky fun to reverse engineer your microwave oven's power scheme.. So back to cooking the egg -- after the first heating wait ten seconds or so.Open the door.Do not open the cup.Pinch the folded edge shut and swirl the cup to distribute the heat.It should feel like half liquid with some cooked eggs floating in the center.Repeat the 1 minute at power 3 cooking process.Wait a few seconds after -- the egg is still cooking even after the power is off.Now open the cup, and peer anxiously over the edge.If it is still too liquid for your taste, close the cup, swirl it and heat it for 15 to 20 seconds on high.. In a rush?Scarf it right out of the cup with chopsticks or a fork or spoon. 
Watch out, parts of the egg may be scalding hot!Or pour it out onto a fancy plate and serve with toast, jam, spices, Tabasco, ketchup - whatever makes your taste buds tingle!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [2, 1, 3, 0]\nD: [2, 0, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_143_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_143_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_143_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_143_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [1, 0, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For this project you will need the following supplies... 2 Large cooking potsCooking stove (preferably outside in case you make a mess)Isopropyl alcohol Small glassForkStrainerPotato masher5 Gal. bucketMeasuring spoonAnd most importantly, Crab-apples!. One of the great things about crab-apples is that they contain a high concentration on pectin. By tapping into this natural goldmine of pectin, we are taking a tree/ fruit that is typically only used as an ornamental tree and using it to reduce preserve making costs while using all natural ingredients. In this step, you will need to pick a rather large amount of apples. You don't need to worry about removing the stems from the fruit as they can be added to the mash. Try to avoid leaves if possible. We spent about an hour collecting around 3-4 gallons which ended up making around 16 quarts of pectin.This amount will allow you to make about 350 oz or jelly or jam. . In this step you want to thoroughly rinse off the apples you picked in the last step. 
Be sure to remove any leaves from the mash as they won't add anything to your pectin. You can leave the stems on the crab-apples tough, there is trace amounts of pectin in the stems that we will be extracting. Once all of the dirt, bugs, leaves, and everything else is rinsed off of the fruit transfer it into your cooking pot for the next step. . This step we will begin cooking the pectin out of the fruit. So just put your filled with the apples and topped off with water on the stove and cook to a boil. You will want to leave the mash boiling for some time, around 45 minutes. The longer you cook, the more pectin you will be able to extract. Be careful not to let it burn though as the burnt flavor can carry into your jams/jellies. Once the fruit has been cooked, you can use the potato masher to smash the softened fruit to release the pectin. After smashing, let the pot cook some more. . Once you feel confident that the mash is thoroughly cooked, use the strainer to remove the liquid from the mash. This liquid is the remaining water with the pectin inside of it. You are now ready to test your homemade pectin. Be careful not to splash any on yourself or burn yourself on the steam. Both feel quite unpleasant :( . To test the pectin, pour a small amount of isopropyl alcohol into your glass. We used a shot glass as it is a prefect size for testing. Use the measuring spoon to transfer a small amount of the liquid pectin into the glass with the isopropyl. Use the fork to gently mix the two, and slowly remove the fork, pulling directly upward. The pectin should have gelled in the glass and stuck to the fork as a opaque slime like substance. If it does not, you need to return the liquid to the stove and continue to cook, checking again after another 10 minutes or so until the pectin has been cooked out. At this point, you are all done and can either add the pectin to your jams and jellies to be or you can jar it for the next time you make your preserves. 
Thanks for reading and I hope this guide has helped!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_144_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_144_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_144_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_144_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 0, 1, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. soda cancoat hangersmall hinge4 small screws and nuts ( 2 not shown)1 medium screw and nut (not shown)4 long bolts and nuts. Make a line about 2/3 through so that the bottom half is biggerThe edges are rough so cut some offdecide which is the front and bend over a bitIn the end where the iron will be placed (soda can bottom) make cuts and bend in for the iron to rest on. Make 4 holes for the bolts supporting it to go throughput the bolts through and screw on the nut. mark where the screws will go (on the top and bottom)put them in, and screw on the nuts (on the top and bottom)add Medium screw for handle. bend the coat hanger into a similar shape to this to fit on your grill (The king of random has a template for his Bitty Q in his mike hacks). 
Put your soldering iron in, turn it on, and start GrillingEnjoy!Please vote for me\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 0, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_145_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_145_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_145_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_145_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 2, 1, 0]\nD: [3, 1, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \nFirst you will want to chop up your onions to the desired\u00a0consistency\u00a0for your\u00a0lasagna. \u00a0I chopped mine into different sizes.\nSaute your onions and ground beef over medium high heat, seasoning with just a dash or two of garlic salt, until the beef is well browned.. \nUsing a bit of olive oil, generously grease each well of your cupcake pan.\nLay a\u00a0won ton\u00a0wrapper into each well and press down so that it covers the bottom and sides of the tin.. \nAdd a spoonful of your\u00a0sauteed beef and onions to each well on top of the won ton wrapper, being sure that it is all contained in the wrapper and none of it touches the edges of the pan.. \nAdd a spoonful of spaghetti sauce on top of each pile of ground beef, once again making sure that it is all contained in the wrapper and none touches the edges of the pan.\nSprinkle each cupcake with\u00a0Parmesan\u00a0cheese, garlic salt, and Italian seasoning. \u00a0(It is okay to get a little messy sprinkling these on!). \nCover each pile of meat, sauce, and seasoning with a fresh won ton wrapper. 
\u00a0Make sure to press down and seal all of the filling inside so that only the won ton wrapper touches the edge.\nRepeat your fillings, adding your meat , Parmesan cheese, garlic salt and Italian seasonings.. \nTop off each cupcake with a generous sprinkle of shredded mozzarella and a dash of garlic salt.\nMy favorite part of these is the extra-cheesy top, so don't skimp on the mozzarella!. \nBake these in the oven at 350 degrees for about 20-30 minutes or until the cheese is the desired level of browned. \u00a0\nMake sure to keep an eye on them, once they start browning the tops can burn fast!. \nLet the cupcakes cool for about 5-10 minutes before removing from the pan. \u00a0Serve hot immediately or store in the fridge for up to 3 days for easy-to-reheat dinners and tasty packed luches.\nWe enjoyed ours with a delicious fresh salad!\nFor more awesome recipes and fun projects, visit my blog, The Procrastibaker!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 2, 1, 0]\nD: [3, 1, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_146_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_146_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_146_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_146_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 2, 0, 1]\nD: [1, 3, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Setup either a mincer or food processor with the cutting blades.I\u2019m using chuck beef; I find it has great flavour for a burger. Mince the chuck beef. I like my burgers to have a fat ratio of 70% meat to 30% fat, this will give you juicy, tender burgers.. 
Slice some cheese into strips. I\u2019m using cheddar but feel free to experiment with different types of cheese.Take a hand full of the mince and mould the first patty. Then layer out the cheese in two layers.Mould the second patty but make this one 10% largerThis will allow you to fold the edge down and seal the cheese inside.Place them in the fridge for half an hour just to firm up.. Setup the BBQ for direct grilling.. Make sure to rotate the patty minute.Cook for 2 minutes before flipping.Toast your bun now if you like.Cover with the lid for 2 minutes for beautiful melted cheese in the centre.Now it\u2019s time to make this burger.. Spoon of a layer of caramelized onions.Then add some mustard to the top half of the bun before placing it on top.. \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 2, 0, 1]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_147_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_147_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_147_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_147_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [3, 2, 0, 1]\nD: [1, 3, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Here you will find all the ingredients needed for our descent into the heart of dark chocolatey deliciousness. 
The dry goods1 tbsp(8g) cinnamon(not pictured)2 cups(200g) all purpose flower1 1/2(300g) cups sugar1 cup cocoa powder(I used the special dark variety)1/4 tsp(1.5g) salt3/4 tsp(3g) baking powder1 1/2 cups of cocoa nibsThe wet goods2 large eggs1/2 cup(118ml) milk1 cup(227g) of butter softened1 tbsp(15ml) vanilla extractToolsOvenCookie sheetMixing bowlElectric mixer or a lot of commitment and strong armsWhiskCooling rackSilicon baking mats or parchment paper.. Now is the time that was foretold of a time of much darkness. As prophesied preheat your oven to 350 degrees Fahrenheit(177c). The mixening is upon us.Set aside the Cocoa nibs. Then combine the remaining dry goods in a mixing bowl using a whisk to incorporate them. Your mix should be a pretty dull grey once everything is thoroughly combined. Once you have the dry goods mixed. Dump all of the wet goods into the bowl of your stand mixer or a separate mixing bowl and cream them together.Now that the wet goods are looking creamy slowly incorporate the mixed dry goods. By now you should have an almost black sticky dough. You will want to add a half cup of the cocoa nibs to this mix setting aside the remainder for the next step.. Using your hands take about a ping pong or golf ball sized hunk of black dough from the mixer. Then dip what will become the top of the cookie in cocoa nibs. Finally place it along with its' friends on to your baking sheet leaving some room between them non nibbed side down and get ready to put them in the oven.(Now if you are like me your oven has realized it is ides of March and it is time to betray you so there is now the optional step of fixing your oven handle so you can open the oven door. Yay!)Once you have repaired your oven go ahead and bake the cookies for 15 minutes. When the timer goes off remove your cookies from the oven and place them on a cooling rack.. 
Now that the cookies have cooled to the point where you won't burn your mouth the time has come to enjoy the fruits of your labor.Thank you for reading this instructable. I hope you enjoy the cookies as much as I do.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [3, 2, 0, 1]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_148_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_148_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_148_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_148_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [0, 3, 2, 1]\nD: [1, 2, 3, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For the Pie Dough:\n2 1/2 Cups of Flour\n1 Tablespoon Sugar\n1 Teaspoon Salt\n1 Cup of Unsalted Butter, Cold\n1 Cup Ice Cold WaterFor the Topping:\n1/4 Cup Melted Unsalted Butter\n1/2 Cup Cinnamon SugarFor the Whip Cream:\n1 Cup Heavy Whipping Cream\n2 Tablespoon Powdered Sugar\n1 Teaspoon VanillaFor the Strawberry Filling:\n1 Pound of Fresh, Cleaned Strawberries\n1 Cup Sugar\n1 Cup Water\n3 Tablespoons Cornstarch\n1-2 Tablespoons Corn SyrupNote: You can use any filling you want with these, I just happened to use a red filling to mimic ketchup.. First, sift together the flour, sugar, and salt into a large bowl and cut the cold butter into small cubes. . Combine the cut butter with the sifted flour, salt, and sugar.\nDo this by using either a pastry blender or a fork, cutting the butter into the flour until it is evenly distributed.. 
After the butter is combined with the flour, add 1/2 a cup of cold water and mix.\nAdd another 1/4 a cup of cold water and continue mixing until dough begins to form and hold it's shape.\nThis may take another 1/4 cup of cold water depending on factors such as where you live or how much moisture is in the air. . When the dough has started forming, start kneading it lightly to ensure the ingredients are fully combined.\nWrap the dough in saran and refrigerate for a half hour.. In the meantime you can make the whip cream and filling of your choice.\nTo make the whip cream, combine the powdered sugar, vanilla, and heavy whipping cream in a small bowl and use a beater or whisk to mix it until stiff peaks form.\nStore in a container and refrigerate until needed. . To make the strawberry filling, cut the strawberries into the desired size, though smaller cuts work better for dipping.\nIn a small sauce pan, combine your cut strawberries with your sugar, half the cup of water, and corn syrup.\nSet on medium to medium-high heat and bring to a simmer.\nIn a separate bowl, whisk together the corn starch and the other half cup of water until combined.\nAdd the corn starch to the strawberry mixture and keep on simmer for about 10 minutes or until you mixture begins to thicken.\nTake off the heat and put in a container to chill in the fridge. . After the dough has chilled, unwrap it and divide it into 2 equal pieces to make it easier to roll out. This will help to keep the pie crust tender. The more times pie dough is rolled out, the tougher it gets.\nRoll out one of the pieces. . Cut out various sizes of fries from the pie crust dough and place them on a greased pan.. Take the butter set aside for the topping and melt it.\nBrush the pie crust fries with butter and sprinkle with cinnamon sugar.\nThen place them in the oven preheated to 375 degrees to bake for 10-15 minutes until golden brown.. 
Let the pie crust fries cool and then serve them up on a plate with your favorite pie filling and some whip cream!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [0, 3, 2, 1]\nD: [1, 2, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_149_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_149_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_149_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_149_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [3, 2, 0, 1]\nD: [1, 0, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Guests were coming for dinner. When we tried to open a bottle of red wine the cork crumbled and what you see here remained in the neck of the bottle outside the reach of this wine bottle opener and other similar corkscrew openers. Even if the openers we had could have reached the cork, crumbles of cork would have fallen down into the wine and we would have needed to strain the contents of the bottle. We considered pushing this piece of cork down into the bottle, but were concerned that it would tumble into the neck of the bottle and block the flow of the wine. Total removal was the goal.. I connected an air gun to an air compressor and attached a longer inflation needle to the air gun. This compressor does not have an attached tank. I set the air pressure for 100 psi. The cork came out quickly, but so did some wine, as you can see from the stains on the floor. Nearly the same amount of wine found the front of my shirt. You can see what was left of the cork on the floor. (Because my air compressor is tankless, far less than 100 psi. 
accumulated before the remainder of the cork was expelled.). Shown is the neck of an unopened wine bottle. The cork is one of the longer corks we have removed from a wine bottle. The longer needle is what I used to remove the remainder of the rotted and crumbling cork from the wine bottle we needed to open. Its threaded fitting is larger than a standard tire valve. The shorter needle is a standard needle for inflating a basketball using a bicycle hand pump or a small air compressor. Its screw fitting is the same as any Shrader tire valve, but it is too short to reach through a wine bottle cork. Also shown is a Presta to Shrader tire stem adapter.. The photo shows an old Presta to Shrader tire stem adapter I have for my bicycle. This adapter contains an \"O\" ring for a seal. I removed it with a pick. The tire stem adapter will be the right size for a bicycle pump when finished.. My tire stem adapter is corroded, but I need a bright surface for soldering parts together. I used a drill to clean the inside of the adapter.. I used some thin stranded copper wire to wrap around a piece of thin brass hobby tubing. The wire wrap will fill the space between the hobby tubing and the inside of the tire adapter.\u00a0. I used a small screwdriver to push the wire wrap into the cleaned opening in the tire stem adapter. I held the tire stem adapter in wooden vise jaws. I used a soldering gun at its higher heat to make the tire stem adapter hot enough for the solder to flow well and make a good seal. When cooled, the thin piece of hobby tubing is firmly sealed in the tire stem adapter. See the second photo. I used 100 grit sandpaper on a countertop to sand an oblique point onto the end of the needle so it pierces the cork more easily. Then I used a straight pin to make certain the hole in the tubing is fully open.. The thin brass tubing bends easily. Handle with care. I inserted it into the cork in a wine bottle. 
(This bottle has already been opened with an electric opener that made another hole all of the way through the cork. Air pressure did not remove this cork from this wine bottle because air escaped through the hole from the electric opener.). Attach the air hose from the pump or compressor. Hold the hose fitting so the brass tube does not bend or break. Secure the wine bottle so it does not tip over while opening it. Pump air into the bottle. My pump has a pressure gauge and it went to about 80 psi. before the cork began to move. When the cork moved, it moved quickly and came out of the bottle immediately. (Plastic 2 liter soda bottles have been tested and burst at about 120 psi. A glass wine bottle is stronger than a plastic soft drink bottle. People have been using commercial versions of air pressure wine bottle openers safely for years.)\u00a0 A long inflation needle like this makes it easy to reach down into the neck of a wine bottle in which the cork has begun to crumble. You may or may not want to open your bottles this way regularly, but it sure helps remove a cork that broke apart before it was fully removed.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [3, 2, 0, 1]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_150_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_150_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_150_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_150_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 0, 2, 1]\nD: [0, 1, 3, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
-Take 2 cups All purpose flour in bowl.Dissolve sugar in half cup warm water.-Add sugar dissolved warm water,oil and milk to flour and knead to soft dough. BASIC SHAPE NYAPSHA:-Flatten the dough using rolling pin.Cut longstrips,then make criss cross cuts to form diamond shapes as shown in my images.-Make a small cut at the center of each diamond. -Now insert the tip of diamond in to middle hole and make a twist.-You can also insert base of diamond in to middle hole and make atwist as shown in my images.. -Make similar process with all diamond shapes.. -Flatten the dough and make thin long strips-Join the tip of strips. -Make braid shape similar to how we braid our hair by swirling one strip above other.. -Here is the final braid shape. -Cut long wide strips and make cut in the centre. Insert the base of strip in to middle hole and make a twist. -Here is the final shape.. -Make long rope and join 2 ends-Then make swirls from one end and join the tip. -Fry all khapse in oil till they turn brown in color. 
-Enjoy crispy khapse with teaMy tip-Adjust sugar according to your sweetness.If sugar quantity is less sprinkle sugar powder after frying-Rolling should be thin and not thick for crispy khapse\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 0, 2, 1]\nD: [0, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_151_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_151_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_151_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_151_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [2, 3, 1, 0]\nD: [1, 0, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. To make this drink you will need:1 cup rice (uncooked)5 cups water1 1/4 cups milk2/3 cup sugar1 teaspoon vanilla extract2/3 teaspoon ground cinnamon. Put the rice in a blender and blend for 1-2min. Add water and let it sit overnight, and then strain the rice out of the water. You can throw the rice away, but you need to keep the water.. Add the rest of the ingredients in any order and stir until thoroughly mixed. Then put it in the fridge and let it chill.. Your done! 
Have fun sharing this delicious drink with your friends!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [2, 3, 1, 0]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_152_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_152_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_152_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_152_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [3, 2, 0, 1]\nD: [1, 2, 3, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. The recipe I used is as follows.\n1 cup sugar\n1/2 cup butter\n2 eggs\n2tsp vanilla\n1 1/2 flour\n1 3/4 tsp baking powder\n1/2 cup of milk\nYou'll also need\nIcing sugar\nRaspberry Jam (some with seeds, some without\nDark and white chocolate\nRed food coloring\nA mold for the brains\nYou could also use any other recipe for the base, up to you. Cream sugar and butter. Add eggs and vanilla.\nCombine flour and baking soda\nAdd to mixture\nStir in Milk\nAdd desired amount of red food coloring\nCook 175c for 30 minutes . Let cupcakes cool for about an hour. I cored the cupcakes out and filled them with raspberry jam.\nI then capped them with the left over cupcake.. I added a small layer of white icing to help keep the brains in place and add a little contrast. I made the brains the night before as they required some time to set\nI melted some chocolate in a pyrex and added to the mold.\nI then place the mold in the fridge for about 3 hours to set\nI experimented with different shades of chocolate, and even painted on some red chocolate to help make the brains stand out more. 
Now, just place the brains in the middle of the cupcake.\nI heated up some seedless reapberry jam to pour over the brains to give the bloody look.. And that's it. now you can devour some zombie brains without the fear of being infected.\nI hope you have enjoyed this instructable,\nThank you\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [3, 2, 0, 1]\nD: [1, 2, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_153_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_153_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_153_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_153_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [3, 0, 1, 2]\nD: [0, 3, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients:16 OZ Heavy cream (Cold)Material:Large mixing bowl Silicon Spatula Electric beater. Pour the cream in a large bowl and beat it using an electric beater till you can see white liquid in the bowl. It took me 20 minutes to get to this stage. Note : The stages to get to this stage is, first the cream got soft peaks, then stiff peaks, then the cream started turning a little yellow and then it gave out white liquid. This white liquid is buttermilk and can be used to prepare bread etc. Pour the buttermilk into another bowl This is our butter. But the butter might have some buttermilk left and if we do not remove all the buttermilk from the butter, the butter will go bad if not used within 2-3 days. 
In order to remove the buttermilk from the butter completely, add ice cubes in water to get ice cold water and pour 4 tbsp of this water on the butter and beat it again for 3-4 minutes and we will get white liquid. Discard the water. Repeat this process till we get clear water. Place the butter on a plastic wrap and roll to get a tight tube. Place in fridge for 1-2 hours and can use as required :). Ingredients Required:Softened Butter - 4 tbsp Parsley - 1/4 cup Garlic - 1 tbsp Lime zest - 1 tsp Salt - 1/4 tspSteps Required:Mix together all the above ingredients well Place the butter on a plastic wrap and roll to get a tight tube Place in fridge till required. Ingredients Required:Softened Butter - 4 tbsp Coriander Leaves - 1/4 cup Garlic - 1 tbsp Chilli Flakes - 1/2 tsp Smoked Paprika - 1/2 tsp Salt - 1/4 tspSteps Required:Mix together all the above ingredients well Place the butter on a plastic wrap and roll to get a tight tube Place in fridge till required. Ingredients Required:Softened Butter - 4 tbsp Chopped Pecans - 1/4 cup Maple syrup - 1 to 2 tbspSteps Required:Mix together all the above ingredients well Place the butter on a plastic wrap and roll to get a tight tube Place in fridge till required. Ingredients Required:Softened Butter - 4 tbsp Orange Marmalade - 2 to 3 tbspSteps Required:Mix together all the above ingredients well Place the butter on a plastic wrap and roll to get a tight tube Place in fridge till required. 
Ingredients Required:Softened Butter - 4 tbsp Chai Spice - 1/2 tsp (Ingredients required to prepare Chai Spice is below) Vanilla essence - 1 tspChai Spice :4 parts ground cinnamon 2 parts ground ginger 2 parts ground cardamom 1 part ground cloves 1 part ground coriander 1 part ground white pepperSteps Required:Mix together all the above ingredients well Place the butter on a plastic wrap and roll to get a tight tube Place in fridge till required\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [3, 0, 1, 2]\nD: [0, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_154_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_154_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_154_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_154_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 2, 1, 3]\nD: [3, 0, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients\n1 1/2 cup of Sugar\n1/2 cup of Butter, softened\n1 teaspoon of Vanilla extract\n2 Eggs\n2 3/4 cup of Flour\n1 teaspoon of Baking soda\n1/2 teaspoon of Cream of Tartar\n1/4 teaspoon of Salt\n1 teaspoon of Cinnamon\na pinch of Cayenne pepper\n2 bars of Chili infused chocolate (I used Lindt, but if you want to make this from scratch it is roughly 7 ounces)\n2 teaspoons of Cinnamon\n2 Tablespoons of Sugar\na pinch of Cayenne pepperEquipment\nAn Electric mixer, either a stand or hand mixer would work fine\nA Chopping board\nA Large knife\nA Small extra bowl\nParchment paper or cooking spray. 
There are plenty of great ways to go about doing this, probably some easier than the way I ended up going about it, but this way seemed to work pretty well for me. If you know an easier way, please feel free to leave a comment on how you go about \"chipping\" chocolate.\nLie the chocolate bar on the cutting board. Use a long knife to press into chocolate, holding onto the handle and applying pressure on the blade. By using the knife like a see-saw, rock the pressure back and forth to cut the chocolate into strips. Turn the cutting board and using the same method, cut in the opposite direction until the chocolate is roughly chip sized. I like to vary the size of the chunks from very small to the size of a dime. I also like a lot of chocolate chips in each cookie, but feel free to vary the amount to your preference.\nIf you have a stand mixer or a second set of helping hands, you can do this while the dough is mixing.. Preheat the oven to 400 degrees F\nCombine the Sugar (1 1/2 cup), Butter (1/2 cup), Vanilla extract (1 tsp), and eggs (2). Mix well.\nSift the dry ingredients: Flour (2 3/4 cups), Baking soda (1 tsp), Cream of Tartar (1/2 tsp), Salt (1/4 tsp), Cinnamon (1 tsp), and a pinch of Cayenne pepper. Mix into the sugar mixture.\nAdd the chocolate and mix until integrated.. Mix the remaining ingredients in a small bowl: Sugar (2 Tbsp), Cinnamon (2 tsp), and a pinch of cayenne pepper. This will be the powdered coating to the cookies.\nThe dough isn't that sticky and can easily be handled. Form the dough into balls roughly the size of a ping pong ball. Roll each ball in the cinnamon sugar mixture.\nArrange balls on a prepared cookie sheet (spray it with cooking spray or use parchment paper) roughly an 1 1/2 to 2 inches apart.\nThis recipe should make 20-22 cookies.\nBake cookies for 8-10 min.\nWhen they are done, immediately move them to a cooling rack.\nLet them cool for a bit, then eat and enjoy. 
Cookie are the best warm out of the oven with a tall glass of milk.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 2, 1, 3]\nD: [3, 0, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_155_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_155_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_155_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_155_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [0, 2, 3, 1]\nD: [1, 3, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Pink Mambo Monsters (I swear this name will make sense in a few paragraphs)\n3/4 cup plus two tablespoons (or just under a cup) pink champagne\n1/8 cup soymilk\n1 teaspoon apple cider vinegar\n1 1/4 cups flour\n2 tablespoons cornstarch\n3/4 teaspoon baking powder\n1/2 teaspoon baking soda\n1/3 cup canola oil\n3/4 cup sugar\n2 teaspoons strawberry extract\nCombine all ingredients and mix well. The batter will get slightly frothy at first due to the carbonation in the champagne.\nPour into greased or lined muffin pans and bake at 350 degrees for 12 minutes (give or take). Remove and let cool. The cupcakes will be very very fluffy.\nA Word of Caution: These cupcakes were extremely boozy when I made them. If you want to cut down on the champagne, try 1/2 cup champagne and 1/2 cup soymilk.\nAfter you bake them, you may notice a slightly green tint. I\u2019ll be honest, I don\u2019t have the slightest idea why this happens. I have a theory about the baking soda and alcohol combining to form a weird chemical reaction, but maybe not. It\u2019s a mystery I guess. 
(see\u2026 you get why they are Pink Mambo Monsters now right? Pink Mambo because of the champagne, and Monster because they are now green tinted).. While your muffins are in the oven, cut six small strawberries in half. Make the chocolate ganache recipe below:Chocolate Ganache\n1 cup vegan chocolate chips\n1/4 cup soymilk\nsplash of maple syrup\nPlace all three ingredients in a microwave bowl and nuke it for about 30 seconds then stir. Heat in additional 10 second increments, while stirring in between, until melted.\nDip each strawberry half in the ganache then set on wax paper to harden. \u00a0. Finally, while your cupcakes are cooling and your chocolate covered strawberries are setting, make your frosting. I used strawberry buttercream which was really good. It wasn\u2019t super sweet and complimented the champagne flavor of the cupcakes well.Strawberry Buttercream Frosting\n1/2 cup shortening\n1/2 cup margarine (Earth Balance)\n3 1/2 cups powdered sugar\n1/4 cup soymilk\n1 1/2 teaspoons strawberry extract\nCream shortening and margarine together with hand mixer. Slowly add the powdered sugar a 1/2 cup at a time. Once combined, add in soymilk and strawberry extract. Blend on slow/medium speed for 5 to 7 minutes (trust me, you don\u2019t want to skimp on the 5 to 7). I added about a half a cup of fresh strawberries and folded them into the frosting as well.\nIf you want to add the strawberries, reduce the amount of soymilk just a tad. The juice from the strawberries will add too much extra liquid to the frosting and make it runny, but decreasing the soymilk will leave the frosting a bit stiff in prep for extra juice.. Once you have your cupcakes, frosting and chocolate covered strawberries set, you have to assemble them. Because I had chunky frosting from bits of strawberry, I found it easier to cut a hole in the top of my cupcake and fill it (think of it as cutting off the top of the pumpkin). 
Cut the hole in a slightly angled fashion so you have a cone shaped hole (this will prevent you from having to try and detach the top for the bottom and mashing your cupcakes)\nFill the hole with frosting then put the top of the cupcake back on. I did shave a little extra cake off the top piece as to not squish the frosting out the sides. Add a dab of frosting to the underside of a strawberry half and stick it on top of the cupcake.\nAnd there you have it - booze cakes.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [0, 2, 3, 1]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_156_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_156_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_156_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_156_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [0, 3, 2, 1]\nD: [1, 2, 3, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. MIX INGREDIENT TOGETHER (butter should be at room-temperature and not melted, cut in little pieces).\nTry to touch the dough less possible and do it fast, don't make it warm, or the crust will lost his crunchiness (if you have a marble counter use it).\nMAKE A BALL and put it in transparent film, so it don't dry. Put it in fridge for at list 2 hours (or it will shrink in baking pan), better over a night.. BIT BUTTER AND SUGAR TOGETHER, until they become a smooth paste.\nADD ALL THE OTHER INGREDIENT and steer really well, until everything is smooth.. 
take out of the fridge the ball of dough half an hour before so it soften.\nPUT IT FLAT AND ON THE SIDE IN a PAN.\nMAKE LITTLE HOLE WITH FORK everywhere in the the bottom of dough (so it wont make air boubble when it cook).\nPUT FILLING IN.\nCOOK FOR\u00a0\u00a0 MINUTE AT\u00a0\u00a0 DEGREE.\ntake it out and let it cool down.. PUT RASPBERRIES ALL OVER YOUR CAKE.\ndo a glass of gelatin with water and sugar (read instructions on gelatin packaging) and wait few minuts for it to become a tiny bith thick, but not all the way.\nPUT GELATIN ALL OVER THE RASPBERRIES.. refrigerate cake for 5 hours to a night.\nthis cake is delicious also the day after.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [0, 3, 2, 1]\nD: [1, 2, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_157_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_157_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_157_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_157_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [0, 2, 3, 1]\nD: [3, 0, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Cut the layer in half. Attach it to 1/2 of an 8\" cake board with white buttercream. Stack the 2 half layers and refrigerate for 15 minutes.. Stack 2 10\" cake layers with white buttercream on a 14X14\" cake board. We are going to add our half 8\" cake layer in the back of the 10\" cake. Mark where the cake will sit. Use milkshake straws to add support in that area. Cut them off at cake level.Add buttercream in that area.. Stack the cake layers. Then give the entire cake a crumb coat.. You need 4 sugar cones. 
Take one of the sugar cones and using a serrated knife cut the end off the cone. Then using buttercream attach a sugar cone to the cone you just cut. This will make a taller mountain. Cover all the sugar cones with buttercream and attach them to the cake. Put them in the refrigerator for 15 mins. Then you can add more buttercream to make them look more like mountains. Then frost the cake completely with white buttercream. . Using an offset spatula, mark out a stream running from behind the far right mountain. Make it wider as it approaches the cliff to make a waterfall. Then make a rounded lake at the bottom of the waterfall. Leave the buttercream rough. Then use blue and white piping gel, frosting in an up and down motion to create the waterfall. . Frost the cake board. I made snowy evergreen trees for the cake. I have a video for making them very easily. Bring some of them down onto the cake board. I spread some sugar crystals around the cake and board fro some sparkle. For Elsa's Ice Castle, I made some light blue hard candy and broke it into pieces. Then add your Disney Frozen characters. 
You could also make this cake for other themes as well by switching out the cake toppers.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [0, 2, 3, 1]\nD: [3, 0, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_158_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_158_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_158_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_158_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [0, 3, 1, 2]\nD: [1, 0, 3, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. -6 small red apples (with stems)-3/4c sugar-1/3c light corn syrup-2-3 drops red food gel-edible red glitter-candy thermometer-flavoring *optional*. Put your glitter into a bowl and set near your cooking area, if you have a friend, extra hands definitely help with this recipe. . Put all of the ingredients, except for the red gel and any flavorings, into a pot and bring to a boil. It's a lot easier if you use a candy thermometer to keep an eye on the temperature. You want the sugar to hit the \"hard crack\" stage, usually 300-310 degrees. Just before it does, add your coloring and flavors. . As carefully as you can, swirl each apple one by one in the sugar mixture and then transfer quickly to the bowl of edible glitter and coat as well as you can. Set aside on parchment and let cool. 
\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [0, 3, 1, 2]\nD: [1, 0, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_159_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_159_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_159_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_159_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [1, 2, 3, 0]\nD: [0, 2, 1, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Complete ingredient list:\n1 pound sole or other white fish, cut into medium-small chunks or slices\n15\u00a0 jalape\u00f1o peppers (3 for the marinade, 12 for the sauce)\n1 habanero pepper\n1 cup cider vinegar\n1/2 cup Chardonnay\n3 Tb. soy sauce\n5 or 6 bamboo skewers\n1 cup all-purpose flour\n2 tsp. coriander\n2 tsp. cumin\n1 tsp. ground black pepper\n1 cup rice vinegar\n1/2 cup chopped onion\nJuice of 1 large lemon\n1 Tb. olive oil\n1 tsp. salt\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Jalape\u00f1o\u00a0 Marinade:\n3 jalape\u00f1o peppers\n1 habanero pepper\n1 cup cider vinegar\n1/2 cup chardonnay\n3 Tb. soy sauce\nWater (optional, if the marinade doesn't quite cover your fish). Chop and seed peppers, and discard stems.\u00a0 Using a food processor or blender, blend peppers, cider vinegar, Chardonnay, soy sauce, and black pepper until smooth.. Slice the fish into medium-small slices, and submerge in the marinade (that's the green stuff in the bowl).\u00a0 Leave it in the fridge several hours, or overnight.\u00a0 If you don't give the fish enough soaking time, it won't pick up very much flavor.\nMeasure out and mix together:\n1 cup flour\n2 tsp. 
coriander\n2 tsp. cumin\n1 tsp. ground black pepper\nBreak 3 eggs into a bowl with 1 Tb. water and whisk until smooth.. Rinse skewers and set aside (they don't need to be soaked).\nCoat each piece of fish in the flour-spice mixture and transfer to a plate\nFor the second coating, completely cover each piece in the beaten egg, and then roll in breadcrumbs.\u00a0 (The breadcrumbs in the second photo are Kikkoman brand panko crumbs.). Preheat oven to 375 degrees Fahrenheit,.\nThread the pieces onto skewers, about 4 per each.\nBake fish on a large baking sheet for approximately 30 minutes.\u00a0 Fish should be moist and slightly flaky when done.\u00a0 Be careful to not over-bake, especially if your fish is thinly sliced.. If you're speedy, you can put this together while the fish is baking.\n1 T. olive oil (at least)\n12 jalape\u00f1o peppers, seeded and chopped\n1/2 cup chopped onion\n1 tsp. salt\n1 cup rice vinegar\nJuice of 1 large lemon\nSaut\u00e9 onions and peppers in the olive oil over medium heat until they just start go go limp (a little bit of brown is okay).\u00a0 Pur\u00e9e them together using a blender or food processor with the salt, rice vinegar and lemon juice.. Serve your Tortured Sole by drowning it in copious amounts of jalape\u00f1o sauce. 
and dousing with sour cream.\u00a0 A side of strangely-colored rice spiked with cilantro leaves\u00a0 is an ideal accompaniment.\u00a0 \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [1, 2, 3, 0]\nD: [0, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_160_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_160_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_160_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_160_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [1, 0, 3, 2]\nD: [3, 0, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients:1) 3-4 cups of glutinous short-grained rice. Note: basmati, jasmine or other longer-grain and non-sticky rice will not work! 2) 1 packet of Hainan Chicken Rice seasoning - you can find it in most Asian grocers or easily make it from scratch using garlic, ginger, onion, sugar, oil and chicken stock cubes)Preparation:Wash and rinse the rice in a metal bowl until the water inside the bowl is clear Fill up the rice cooker with water, about 300 ml or so. Put a little more to be on the safe side if you're unsure what is the right amount. . Steaming:1) Place the metal bowl of rice inside the rice cooker, ensure the rice itself isn't immersed in water, cover the lid and let it cook for 25 mins. Normally, rice is cooked by pouring water into the rice and letting it boil. 
However, this will make the rice too wet even after extensive drying, leading to uneven clumps of fried and not-so-fried crispies as you can see in the latter picture.2) After 25 mins, open the lid and thoroughly mix in the full packet of Hainan Chicken Rice seasoning into the bowl of rice. Close the lid and let it steam for another 25 mins. The rice should be sticky and slightly hard. . Drying:There are 3 methods to dry the rice, via mother nature, an oven or using a dehydrator. If you live in a climate with dry and hot temperatures, drying rice in the sun is fast, effective and easy. Layout circular clumps of rice on a tray and let it sit in a full day or two of sun and it will turn into hard and dense clumps. Just beware that freshly cooked rice is decently attractive prey for all forms of wildlife including birds, ants, cats, dogs, hungry family members, you name it and it will pose a threat. A mesh screen and water moat will help keep away hungry intruders. For every other type of climate, a cheap dehydrator or oven works just as well. Put the oven on low to low-medium heat for at least 4-6 hours. I used a dehydrator as it lets me control the exact temperature of 60-65 Deg Celsius and I leave it on overnight for 12 hours. Feel free to experiment with shapes and sizes but note that the bigger or thicker the shape, the longer and more uneven the drying will be. Here are a couple of close-ups of pre and post dried rice crispies. . Frying:1) Fill a pot with some vegetable oil and turn on high heat to bring the oil to a boil2) Fry one or two crispies at a time for about 15 seconds, it should puff up and turn golden brown pretty quickly. Take care to avoid frying crispies to touch each other as they stick to each other very easily and transform into mega crispies. Smaller shaped crispies or a wider pot helps. 3) Place the fried crispies on paper towels to soak up excess oil and let the crispies cool down to room temperature. 
They should get more crispy as it cools down and let it dry thoroughly before serving or storing in an air-tight container. 4) When serving, it's great to pair the rice crispies with Hainan Chicken Rice chili, which is easily found at Asian grocers, online or make your own recipe. Bonus points if you serve it with sweet dark soy sauce (a mixture of water, thick dark soy sauce and rock sugar) and minced garlic/ginger for the complete chicken rice condiment combo. If the crispies are well prepared - it can keep for at least a month in a container. I currently have 2 containers with crispies made 6 weeks ago and they are still in good shape. Enjoy making and eating a new Singapore-style snack! \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [1, 0, 3, 2]\nD: [3, 0, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_161_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_161_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_161_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_161_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [3, 0, 1, 2]\nD: [1, 3, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Apples! Lots of them. Fill up your crock pot! (These should be sweet apples; or otherwise you'll have to add sugar later.)\nCrock pot\nWooden spoon\nSpices- I used ginger, cinnamon, and clove\nBrown sugar, if you want to sweeten it\nWater. 
Wash them core them, slice them.\nYou don't have to cut them any smaller than slices; they'll cook down.\n(Cut them smaller if you're in a hurry; they'll cook faster.)\nIf you want, peel the apples.\nI didn't, and I haven't found the skins to be obnoxious; rather, it adds texture.\nHowever, if you want to take the time, go ahead and peel the skins off.. Put all of the cut up apples in the crock pot.\nAdd a little bit of water. This is just to keep the apples from sticking to the bottom; we'll try to evaporate it out later.\nCover crock pot and cook on high for 3 hours, stirring occasionally (once an hour or so).. At the end of three hours, your apples should look something like the picture- they're starting to disintegrate.\nFeel free to help them along with the wooden spoon.\nYou can also add spices at this juncture. I used lots of ginger, a fair amount of cinnamon, and some clove.\nIt's to taste; add whatever spices you want. You can also add sugar if you want a sweeter, more caramel-gooey apple butter.\nTurn your crock pot down to Low and keep cooking, stirring intermittently, until you have first applesauce (lighter colored disintegrated apples) then apple butter (darker colored from caramelization).. Once you have a satisfactorily caramelized apple butter, you could be done!\nIf you want to condense it so that it's thicker, you could leave it on for a few more hours without the lid until enough of the water has evaporated.. The serving suggestion pictured here:\nApple butter spread on buttered multigrain toast, topped with honey Greek yogurt and nutmeg. 
The plate is drizzled with organic molasses.\nYou should also try apple butter on vanilla ice cream.\nIf you made a lot, apple butter is traditionally preserved in jars.\nIn case you get sick of eating it, apple butter makes a nice gift for friends and family!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [3, 0, 1, 2]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_162_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_162_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_162_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_162_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [0, 1, 2, 3]\nD: [3, 2, 0, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. *NOTE* to save time you can use all frozen vegetables or omit some vegetables. This recipe is very versatile and you can change it as much as you like!! *NOTE*- 4 Cups COOKED short grain rice (2 cups of uncooked rice = 4 cups cooked rice)- 2-3 Tablespoons butter (you can use oil but butter gives it a lovely flavour)- 3 Eggs, whisked-1 large onion, diced- 2 sticks of celery, chopped- 2 Carrots, diced- 2 Garlic cloves (or 1 large), crushed- 1/4 Green cabbage, sliced thinly- 1/2 Cup frozen peas- 1/2 Cup frozen corn- 2 Teaspoons oyster sauce- 3-4 Tablespoons of soy sauce- Spring onion, to serve- Sweet chilli sauce, to serve. - Chopping board- Cutting knife- 1 Large frying pan/ wok- 1 Small frying pan (or you can use the same pan to cook the scrambled eggs in)- Frying spatula/ spoon- Garlic Crusher- Fork- Small bowls- Serving bowls. 
The first step is to prepare the vegetables, it is simple and can be done in advance to save time, if not it only takes a few minutes to chop and cut all the vegetables.Firstly wash the vegetables under cold water to remove any dirt, then finely dice the onion to your preferred size, the smaller they are the less noticeable they are in the rice. Then dice the carrots finely, chop the celery and finally thinly slice the cabbage, if you prefer large chunks of cabbage then slice it thicker. Then finally crush the garlic in a garlic press.*NOTE* if you are serving this to children you will want to chop the vegetables as fine as possible so that it is harder to notice. *. To make the scrambled eggs crack the 3 eggs into a small bowl and whisk them together until they are uniform in colour. Then heat a small pan to medium high and place the eggs into it and cook them for about 2 minutes while stirring them to form scrambled eggs. Then place them on a side plate and break it up into small pieces, leave them to cool while you cook the vegetables.. Heat the wok onto a medium high heat then once the pan is hot add the butter and allow it to melt completely. Then add the diced onion to the pan and cook until the onions become transparent and just start to brown. Then add the garlic and let it cook for about 1 minute.*NOTE* do not add the garlic while cooking the onions as the garlic can burn and give the dish a bitter taste. Then add the diced carrot, celery and cabbage, stirring for about 2 more minutes until the vegetables slightly soften.. Once the fresh vegetables are cooking in the pan add the pre-cooked rice and stir it for about 1 minute to break it apart. Then add the frozen peas and corn, scrambled eggs, oyster sauce and soy sauce and stir the rice and vegetables until they are evenly coated with all the different sauces. Try a small amount of the fried rice and season it accordingly (ie. ass more or less sauce until it tastes yummy!!). 
Also make sure that the rice is heated evenly, when it is take it off the heat and get the serving bowls!. Once the rice is cooked place it in the serving bowls and add any toppings that you like. My suggestions are freshly chopped spring onions with sweet chilli sauce on the side. This is a great dish as it can be a meal on its own or a side dish for meat. It is delicious, quick and you can use any vegetables in your fridge!Food pairing suggestions:- Serve with Teriyaki chicken- Serve with steak- Serve with Tofu- Serve as a side dish for a group dinner\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [0, 1, 2, 3]\nD: [3, 2, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_163_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_163_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_163_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_163_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [2, 0, 3, 1]\nD: [1, 0, 2, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 400 gr spaghetti500 gr ground beefTomato sauceSalt and pepper. Cook the spaghetti in boiling water with some salt. Stir every 2 minutes and drain when it's al dente (8-10 minutes).. Put the ground beef in a saucepan in medium heat. Cook it and cut it with the scoop to make it in little pieces. When it changes its color let it cook for five more minutes in low heat.. Pour the sauce in the cooked beef and stir. Add some salt and pepper and let it boil for 3 minutes.. Put spaghetti on a dish and use a spoon to put some bolognese on top of it. You can use some parmesan cheese to give it an extra special flavor. 
This pasta goes great with garlic bread and salad. \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [2, 0, 3, 1]\nD: [1, 0, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_164_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_164_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_164_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_164_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [2, 0, 3, 1]\nD: [3, 0, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Clean the Habanero Peppers by washing them under running water in a Colander and\u00a0Drain.Pick off any stems.Roughly Chop Peppers and Garlic just to make it easier to blend. Habaneros are ridiculously hot (100,000 to 350,000 on the Scoville Scale), so removing some of the seeds and the white membrane inside the habaneros can kick back some of the heat. For my sauce, I just removed half the seeds from the peppers by running the inside of the peppers with cold water.\u00a0. Put the chopped peppers, and garlic into the blender.Add A Pinch of SaltPour 1/2 Cup of Water and Apple Cider Vinegar\u00a0into the blender as well.\u00a0 I like using Apple Cider Vinegar, because it adds more of a fruity flavor to the sauce that complements the Habanero. It also helps darken the sauce a bit to make it similar to regular store bought Sriracha Sauce, but you can always use White Distilled Vinegar if you want.\u00a0Blend for 5 minutes, until everything is a smooth puree.\u00a0. 
Pour the mixture into a saucepan on medium heat.Wait till the mixture comes to a low simmer.\u00a0Slowly stir in a 1/4 cup of sugar till it dissolves.\u00a0 I used Sugar in the Raw, because that's what I prefer. Most people use brown sugar to deepen the flavor in their Sriracha sauces, but any kind of sugar here is fine.\u00a0Simmer for 20 minutes or till the mixture reduces by 1/3.\u00a0Skim off any foam that collects on the top, and discard foam.\u00a0. Pour reduced sauce into the blender again, and let it cool down for a few minutes.\u00a0Blend for another 5 minutes to further break down the warmed peppers.\u00a0. Pour the mixture through a fine mesh strainer.\u00a0Push the mixture through with a spoon to drain all that hot sauce till all that's left is dry pulp and seeds. If the mixture at this stage is still not the consistency \u00a0you want. You can add more water and vinegar to thin it out, or reduce further on a saucepan on low heat to thicken it up.\u00a0. Most recipes call for Sriracha to be fermented for a few days, but I didn't see any need for it. You're welcome to try though. This homemade Habanero Sriracha sauce is way more hotter than usual store bought Sriracha, but it's still really good. It has the same texture, the garlic-y, sweet and sour notes of regular Sriracha sauce, but has a brighter, more vibrant, and fresher color and taste. The habaneros add a slightly more fruity, and citrus notes, but use it like regular Sriracha sauce, and you will not be disappointed.\u00a0 That is, If you still have your tastebuds left after trying some. 
\u00a0 This should hold for about 5-6 months.\u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [2, 0, 3, 1]\nD: [3, 0, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_165_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_165_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_165_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_165_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 3, 0, 1]\nD: [1, 3, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \n As for Ingredients, this is as SIMPLE as it gets... It's only *REALLY* got three essential ones... the rest are just pomp and glory... delicious pomp and glory\nWatcha gunna need:\n\t\tIcing sugar (300g, BUT I would have extra, you'll see why later)\n\t\t1x Egg white\n\t\tPeppermint flavouring (The natural stuff is best, but go with what you can get)\nPomp and Glory:\n\n\t\tOrange food colouring, or Orange sweets for his nose\n\t\tSilver Catering balls for his eyes\nUtensils:\n\t\tScales for weighing out the Icing sugar\n\t\tSpatula, or spoon\n\t\tBig bowl\n\t\tPlate\n\t\tCocktail stick. See this REALLY annoys me, when someone makes something for you to CONSUME and they don't bother to wash their hands before hand... So I have decided to make a point of a few things that TV Chefs or whatnot don't seem to be able to do\n# Remove any rings/ Jewellery that may come in contact with the food\n# Wash your hands before you cook\n# After handling Eggs, its good practice to wash your hands (Stops the spread of Salmonella and such)\n# DO NOT Lick or taste off a spoon THEN put it back into the bowl you took the sample from... 
EWW... If I wanted your germs I would kiss you!\nBefore I start sounding like a rambling germ freak, I'm sure you'll agree this is all common sense... But some people just don't ...get... that its not hygienic. \n Firstly, measure out your 300g of Icing sugar and sieve it into your large bowl... I didn't sieve it, because I'm an idiot... I spent AAAAGES trying to mix the lumps out *sigh* seriously... sieve... It will save you loads of hastle.\nNow, crack your egg, and seperate your white from the yolk...personally I like to crack my egg in half, and pass the yolk from one half to another so that the white lands into my mix... there are other ways,\n\n\t\tlike cracking the egg onto a plate and manually lifting out the yolk with a spoon...\n\t\tOr... Pouring the egg into a sieve so that the white runs through the holes and the yolk stays in the sieve\u00a0\n\t\tBuy a carton of eggwhite\nWhichever you decide to use... pop your eggwhite into your Icing sugar.. It is at this time, that you need to add your flavouring... Add 2-3 cap fuls of your flavouring, but if you think it doesn't taste strong enough... add a dash more in until your happy!\nNow, using your spatula, or whatever stirring implement you have on hand give the Goop a good old mix... And if your feeling particularily inclined... dive in with your hands it's quite cathartic!. Now in step 1 I mentioned about having extra Icing sugar....\u00a0 Well...this is where it comes in (maybe)... Now not all eggs are a standard size and weight... so your mix (or hereafter refered to as Goop) may be a little bit runny, and may not hold together like it should... My Goop was abit of a runny mess, so I added little ammounts of icing sugar until it became a firm white ball. that doesn't feel wet to the touch.\nIF you add too much icing sugar... add a tiny bit of liquid (egg white, water, or more flavouring) but I mean TINY... we don't want to be constantly adding more of one thing then another. 
I found at this point that since I had been kneeding the Goop with my hands It was a little too warm to model, So I bunged it in the refridgerator for about 5minutes to firm up.\u00a0 You should be able to roll small balls of the mix and have it hold together. Damnation, part of my I'ble didn't save... right lets try and remember what whitty comment I made here\nNow, I'm not going to teach you how to suck eggs, as I'm assuming that most of you at some point in your lives will have made an ACCTUAL Snowman. What you need do, is to roll out a seris of balls...\nthe one on the bottom must me slightly larger than the one which will form the head... Stick one to the other and repete until your bored ;)\nIf you intend to get creative and make your decorations with the Goop, remember to leave some to one side for that task.. Now you have a little army of blind Snowmen, don't you think it's time we gave him some eyes, and a nose?\nOr well... Now's the time to go mad with your decoration... I was getting sleey at this point so I stuck on some little silver balls for eyes, and cut up some orange sweets for a nose... BUT you could draw it on with icing, or eddible ink... Or make him a scarf... I don't know, do what makes you feel happy ;)\nTo make the eyes and nose stick, I made a little hole with a toothpick and jammed them in the hole.\nIf you are making your own nose, take your left over Goop, and mix in a little orange food colouring, roll into a carrot shape and stick it to the front of your snowmans face.\nYou will have to work quick as I found that my Goop went hard very quickly... but then again I was working quite slowly. ;). \nThis is a last minute thought.\nPossible Variations:\n\t\tMake round disks of your goop, add a small ammount of green food colouring and half dip with milk chocolate.\n\t\tCut out into festive shapes and hand out as peppermint thins\n\t\tCompletly dip in chocolate and serve as a cheep alternative to After Eight mints\n\t\tGet gorey... 
Make into an eyeball shape (make two balls, one white, one red... or fill the inside with red jelly or cornstarch blood) Paint on an Iris and a few veins and hand out at Halloween!\nGo nuts and enjoy people :). Well, now, It's best to leave your Snowmen to set over night, either in the Refrigerator or somewhere equally as cool...\nThen when theyre set, hand them out to your nearest and dearest... or just gobble them yourself. My collegues loved them, and I'm sure your mates will love them too\nFor other Ideas go and look at my other I'bles. If the I'ble needs explaining or altering let me know... Vote, Reply, Rate and Enjoy\nCheerio\nBiggsy\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 3, 0, 1]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_166_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_166_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_166_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_166_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [1, 3, 0, 2]\nD: [2, 0, 3, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. So, how do you make this wonderful stuff? The ingredients are as follows:1 750 mL bottle of grain alcohol ( Everclear or similar, also known as rectified spirit--as long as it's potable, strong, and unflavored you'll be fine)Zest of 8 lemonsSugarWaterSimple, yes? Oh, you'll also need a glass jar in which to keep the stuff. Be sure you have lots of spare room, as you'll add more liquid later. Mine is two liters, and works great.You want to get the strongest alcohol you can get your hands on. 
Vodka, even the 100 proof stuff, isn't sufficient. In some states, such as Nevada, you can get 190-proof Everclear, which is 95% ethyl alcohol. Alas, California isn't one of them, so I'll make do with 151 proof (75.5% alcohol, which is still pretty stiff). You can as well, but go with the high-test if you get it. You'll dilute it down to something drinkable later; right now we need a strong but potable nonpolar solvent, and high-proof alcohol fits the bill. I understand an old catalog came with a disclaimer that Everclear was to be used \"for the production of homemade cordials,\" or some such, which is exactly what you're doing here.. First, wash the lemons thoroughly. A produce brush helps a lot with this. Some folks use a special-purpose fruit and vegetable wash solution to get them super-clean, but I've never been one for such luxuries.Next, zest the lemons. For those of you who aren't familiar with the process, lemon peel consists of two layers: zest and pith. The pith is the inner, white part, and the zest is the outer, yellow part. You only want the zest, because the pith is bitter and will impart that bitterness to your limoncello. Therefore, be careful that you don't get any bits of white in your zest.There are a lot of ways to zest lemons. Going from low-tech to high, they're as follows:A knife. You can zest lemons with a knife, but it needs to be small and very sharp, and you need to be careful with it. Blood in your limoncello is not cool, no matter how much of a goth you are.A potato peeler. Some people like these, but they probably have sharper potato peelers than I do. The first time I made this stuff, I tried this but then switched to a (just-sharpened) knife. Then I bought . . .A lemon zester. Mine's a knock-off of a nice ergonomic model from Zyliss and also includes a channel knife so you can make twists too.A Microplane or similar fine grater. 
This might be the ultimate zesting tool--I've heard people say they make it much easier, and they certainly look like they would, but I don't have enough use for one to justify dropping $15 or $20 on it. (Edit: On the recommendation of nattles, below, I have purchased a Microplane grating rasp, and it is everything a grater should be. Strongly recommended.)Keep in mind that smaller bits of zest will give you more surface area, and therefore more chance for the lemon oils to dissolve into the alcohol. Knives and potato peelers will each give you little chips of zest, whereas the zester will give you thin strips, and the Microplane very tiny shreds. I'd go for the lemon zester if you didn't have anything more specialized; it should only cost five bucks or thereabouts. Or if you want to splash out a bit more, get a Microplane rasp.. Next, pour the alcohol over the zest and wait a month or so. Keep the jar in a cool, dark place, and shake it every so often to mix the lemon zest around. In the meantime, maybe you could make lemonade or lemon chicken or something with all the lemons you have. Be advised that they'll spoil much sooner without their zest, so you'd better get to juicing pretty quickly.. OK! It's been a month or so, and the alcohol has taken on a very bright yellow color. This is just what we want--it shows us that the lemon oils have left the zest and entered the liquid. Now it's time to take out the lemon zest. If it's done, the booze should be lemony and the zest very pale and somewhat more brittle. This is about right.. Remember when I said we'd dilute it down to something more reasonable? Now's the time. I used 4 cups of water and 2-1/2 of sugar, which is a decent starting point. You may want to add a bit more sugar-water if you used the high-test Nevada Everclear instead of the weak stuff we get here in the California Republic, but it's easy enough to adjust the strength later. 
(Edit: I have a batch in now that I'm planning on preparing according to Alain80's recommendation below of a 1:1:1: ratio of alcohol to water to sugar (one gram of sugar per one milliliter of water/alcohol). I'll post my results here once it's done.)Anyway, heat the water on the stove and stir in the sugar. You don't need to boil the water, but you do need to get it hot enough so the sugar dissolves. Stir it frequently until it turns clear. The sugar-water will be markedly more refractive than plain water, because of all the dissolved sugar, but you should be able to see the bottom of the pan clearly.There's an argument that I should have taken pictures of making the syrup for completeness, but dissolving white powder in clear liquid to make another clear liquid is the sort of thing even the dimmest Photo 102 student would recognize as \"not visually interesting.\" My pedantic side demanded one, though, so it's in this batch as well.In any event, that's it! You can drink it as it is, but it'll improve with a month or so of sitting. It won't freeze unless you added a lot of water, so feel free to keep it in the freezer. Good luck!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [1, 3, 0, 2]\nD: [2, 0, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_167_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_167_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_167_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_167_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [3, 2, 0, 1]\nD: [3, 1, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Most of these ingredients can be found around the house without much trouble.Ingredients:-Milk (best with 2% or whole)-Sugar-Cocoa Powder-Pure Vanilla Extract-Small Pot-Mug-Tablespoon-Stove . First off fill your mug about 3/4 full with milk. I'd say about 1 1/2 cups worth. From here pour from the mug to the pot.. Get about a tablespoon or so or cocoa and put in the pot with the milk. You can add more or less depending on what you like.. Get your pure vanilla extract and put in a drop, and only a drop. You can fill up the cap to get the exact right amount. You only need a tiny bit. . Get about 2 tablespoons of sugar and add it to the mix.. Turn your stove on high and start to stir. You want to keep it from getting to a boil. To get the cocoa to mix in, mash it up against the side of the pot with the back of your spoon and rub it in.. When it seems hot enough to you, (usually when it starts to steam) pour it back into your mug. Add whatever you want, like marshmallows, although i just like mine plain. . Drink up and Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [3, 2, 0, 1]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_168_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_168_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_168_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_168_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [1, 2, 3, 0]\nD: [3, 1, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
You will need the following: *Jello (light-colored Jello such as Lemon or Lime will work best for this project) *16oz of tonic water *Stove *Liquid measuring cup *Small pot for boiling water *Mixing bowl *Mixing spoon *Refrigerator *Small Table Lamp with fluorescent blacklight (The blacklight must be fluorescent and not simply a colored bulb. It can be purchased at most Wal-Mart locations for around $4.00). Measure out 8oz (1 cup) of tonic water in a liquid measuring cup.Safety: Use caution when removing the lid from the tonic water. If the water has been shaken, the lid will shoot off and overflow. Allow for the water to settle if the bottle is under pressure. . Pour tonic water into pot. . Put pot on burner and turn on high. . While water is boiling, pour Jello packet into mixing bowl. . Once water has boiled, pour the boiling tonic water into mixing bowl.Safety: Pot will be hot! If the pot is too hot to grab, use a potholder to remove the water from the burner. . Stir together Jello and boiling tonic water, making sure that the Jello powder fully dissolves. . Add one cup of cold tap water to the mixing bowl.Alternate method: If you would like your Jello to glow more brightly, add one cup of cold tonic water instead of tap water. However, doing this will make the Jello taste more bitter than normal. . Place the mixing bowl in the fridge and chill for four hours. . After four hours, remove from fridge. Turn out all lights and turn on the lamp with the blacklight in it. 
You should have glowing Jello!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [1, 2, 3, 0]\nD: [3, 1, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_169_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_169_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_169_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_169_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [3, 0, 2, 1]\nD: [3, 1, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Here's what you'll need to create these delectable appys and be assured of an invitation to all future parties:\n20 to 25 jalapenos peppers\n2 packages of cream cheese (softened)\n1 package of powdered ranch dressing mix\n1 cup of shredded cheddar cheese\n1 pound of bacon (you need one slice for each pepper)\nThe ingredients in the photo don't reflect the amounts needed for a full recipe as I was only making partial batches for testing purposes.. Put the cream cheese in a bowl and add the cheddar cheese. Sprinkle the ranch dressing mix over the cheeses and mix it all up with a fork. You can also use a food processor for this step but if you leave the cream cheese out until its room temperature its easy enough to do by hand. I like the texture of the cheese better if you don't chop it all up with the food processor.\nSet the cheese mixture aside while you do the next step.. Ok the first thing is to remove the stems. You don't want to cut them off because you need to keep the end intact to keep the cheese from melting out when you cook them. 
So just pop them off with a twisting motion.\nThen you need to cut the peppers in half lengthwise like in the picture. If you hold the pepper with the curved ends facing up you'll have a better looking end result and they'll lie down better.\nNow you've got to get the seeds out. I have found that a grapefruit spoon with the serrated end works really well for this and is a lot safer than using your fingers. Nevertheless, remember to wash your hands often. Those little alcohol pads will work the best if you can get your doctor's office to donate some. Trust me, if you have to go to the bathroom during this process you will want to be very careful about washing first. Enough said.\nNotice in the pictures I have them all cleaned out and the ends of the peppers are still intact. You have to leave a little bit of the white pithy stuff there and some of the seeds will want to hide up in there. That's ok, a couple of seeds will just make them interesting.\nNow you have a choice to make. Wild or mild? Mild will mean almost everybody will enjoy them and compliment you on what a great cook you are. Wild will mean most people will eat one and gasp and you might end up with leftovers.\nIf you want your peppers to be kind of mild, go ahead and scoop the cheese mixture into the pepper halves. Don't overdo it or the excess will just melt out onto the broiler pan which is just a waste of the good stuff.\nIf you want people to know these are jalapenos then you need to get some of those seeds you just scraped out and mix them up with the cheese. I can't tell you how much to use but a little goes a long way. I've tried tasting the mix but it changes after its cooked and also some peppers are hotter than others. If you use all the seeds I can assure you that only the strongest will survive the experience. 
Not really, they are still really good but most people will steer clear of them because they will be very hot.\nI got tired of the comments from some people about they aren't hot enough. So I started making both. Truth is I agreed with them. The mild ones can be disappointing if you like spicy stuff. So I do both.\nStuff half of the peppers with the seedless mixture and then put a few scoops of seeds into the remaining cheese and stuff the rest. I keep the hot ones separate from the mild ones so I can sprinkle paprika on them to mark them. The paprika doesn't change the flavor much but it works to warn the meek.. Now let's wrap this up. With bacon.\nYou need as many slices of bacon as you have peppers. We're going to cut the bacon in half like in the picture and use the half slices to wrap each one.\nHow you wrap them is important. I always start with the leading edge of the bacon just under the bottom of the pepper and hold it there while you wrap the rest around ending up usually on the top. It depends on how fat the peppers are. You can stretch it some to make sure it lays across the top so it doesn't uncurl when you broil them.\nGo ahead and wrap all the peppers and put them on a broiler pan. You can use a baking tray also but the broiler pan works best to drain away some of the bacon grease.\nLeave a little space between them or the bacon won't get done. The ones on the top row here are too close together.\nIf you made some wild ones now is when you sprinkle the paprika on them. You can see from the pictures it marks them pretty well.. Set the oven rack onto the second level down. You don't want the peppers real close to the heat or the tops will be really crispy before the rest of the bacon gets done.\nPut the peppers in and turn the oven to broil and close the door. If you're freaked out about how done your bacon needs to be then you might even put the shelf farther down and just bake them in a really hot oven. 
I prefer the peppers to still have some firm texture to them so I like to broil them quickly.\nTen minutes seems to work the best in my oven, yours may vary. The tops of the bacon are a little crispy, the sides are well done.\nTake them out of the even and let them sit a couple of minutes to let the cheese set.\nPut them on a tray and have at it. You are now the MVP of the super bowl party. (Unless somebody else brings a keg or something.)\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [3, 0, 2, 1]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_170_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_170_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_170_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_170_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 3, 1, 2]\nD: [3, 0, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For this project, you don't need that much, just some basic cooking ingrediants and tools.\nThe ingrediants* for the cupcakes are:\n- 2 1/4 cups Flour\n- 1 1/3 cups Sugar\n- 1 cup Milk\n- 2 Eggs\n- 1 teaspoon Vanilla\n- 3 teaspoons Baking Powder\n- 1/2 cup Shortening\n- 1/2 teaspoon Salt\n- 2 Lemons\n- 3 Tea Bags\n- Food Coloring\n* You will need to split the ingredients in half to make two different batches. 
With the exception of the lemons and teabags, all of the ingredients are going to be split to be used for two different batches.\nThe Ingredients for the Frosting are:\n- 3 - 4 cups Powdered Sugar\n- 1/8 cup Milk\n- Teaspoon of Vanilla\n- 1/2 stick of Butter\n- Food Coloring (green)\nThe Materials are:\n- Assorted Measuring Utensils\u00a0\n- Mixer\n- Bowls\n- Cupcake Pan\n- Paper Liners\n- Spoon\n- Knife\n- Juicer\n- Oven. The first step is to put 1 1/8 cups of\u00a0flour, 2/3 cup of sugar, 1 1/2 teaspoon of baking powder, and 1/4 teaspoon of salt in a mixing bowl. Mix together. Then, add 1/4 cup of shortening, 1/2 cup of milk, and 1/2 teaspoon of vanilla. Beat all of the ingrediants together for about one minute. The next step is to add one egg to the mixture. Beat with mixer for around a minute. Then, mix on the high speed for a minute. The next step is to take two lemons. First, use a grater to grate the lemon peel to get lemon zest. Add that to the mixture. Then, take the two lemons and cut them both in half. Collect juice from them using a juicer. Add to cupcake mix, and beat with the mixer for around 30 seconds. The final step is to take yellow food dye and add that to the mixture. It depends on what shade of yellow you want, but I added around 20 or so drops to get the color I wanted.. The first step is to put 1 1/8 cups of flour, 2/3 cup of sugar, 1 1/2 teaspoon of baking powder, and 1/4 teaspoon of salt in a mixing bowl. Mix together. Then, add 1/4 cup of shortening, 1/2 cup of milk*, and 1/2 teaspoon of vanilla. Beat all of the ingrediants together for about one minute. The next step is to add one egg to the mixture. Beat with mixer for around a minute. Then, mix on the high speed for a minute. The next step is to add food coloring. I added 1 drop of blue, 1 drop of green, 2 drops of red, and 5 drops of yellow to get a pinkish-brownish color.*Before you do anything, heat the 1/2 cup milk until hot to touch. 
Then, put the three tea bags of your choice in the milk and let them steep for ten to fifteen minutes. Remove tea bags before putting the milk in the cupcake mix.\u00a0. Once you have both of your cup cake mixtures, it's time to start baking them! Preheat the oven to 350 degrees (Fahrenheit). Next, grab a cupcake pan and line it with the paper liners. Then, fill it one third of the way with one of the mixes. I choose the iced tea mix to go on the bottom, but it doesn't really matter. Then, fill it up another third with the other mix, so for me, the lemonade mix. Leave about one third left, because the cupcake rises. When done filling them up, throw them in the oven for about 20 minutes. \u00a0A good way to tell if they are ready or not is to poke the center with a toothpick and if it comes out clean without anything sticking to it, the cupcakes are ready!\u00a0. After the cupcakes have cooled, it is time to frost them. The recipe is pretty simple. First, melt half a stick of butter in the microwave so it becomes soft. Then, add 1/8 cup of milk, 3-4 cups of confectioners sugar (depends on how dense you want the frosting), and one teaspoon vanilla. Blend together with mixer. Once done, add several (10-ish) drops of green food coloring. Mix that in as well so the frosting becomes a nice grass color. Once the frosting is completed, spread generously over the cupcakes. Wait for frosting to dry once done.. Once the frosting has cooled, you can now decorate your cupcakes!\n-To make a sandtrap, simply take brown sugar and put some on.\n-For a pond or lake, take a little of the left over green frosting and mix it with some blue food coloring.\n-For the rough, take a toothpick, put some green frosting on it, and put it on the cupcake. Then make movements up.\n-For the golfball and the hole, take a mini marshmallow, knead it until it becomes sticky, and coat it in confectioners sugar. For the hole, take a toothpick and dig a little one. 
Then, put white sprinlles in it.\n-For the flag, take a piece of paper and cut a triangle. Tape or glue the triangle on a toothpick.\u00a0\n-For the Happy Father's Day, I took the leftover blue frosting used to make the ponds/lakes and added more blue food coloring and some red food coloring to get a dark purple. Then, I took a toothpick, dipped it in the frosting, and used that to write the words.. Once the cupcakes are done, bring them over to your dad and thank him for all he's done. Enjoy this treat together for a memory that will last forever! Enjoy!\nThis is a great project that the whole family can enjoy making for their Dad. Kids love to decorate cupcakes, especially of they get to eat them too! Take the family out golfing or mini-golfing and bring these too for a perfect Father's Day!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 3, 1, 2]\nD: [3, 0, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_171_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_171_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_171_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_171_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 3, 1, 0]\nD: [2, 3, 0, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Clockwise around Rim:2 dried red peppers, stemmed 6 white cardamom pods 6 green cardamom pods 3 black cardamom pods 1/2 teaspoon ajowan seeds 1/2 teaspoon charnushka seeds (also known as nigella or kalonji) 1 teaspoon cumin seed 1/2 teaspoon fennel seed 1/2 teaspoon fenugreek seeds 1/2 teaspoon blade mace 1 clove 1/2 teaspoon ground nutmeg (I used about half of the nutmeg piece in the picture) 1/2 inch Ceylon cinnamon stick 1/2 inch Sumatran cinnamon stickCenter:1/2 teaspoon each pink and black peppercorns. Preheat a pan on medium. Once hot, add all of the spices *except* the blade mace and the nutmeg. Toast for 90 seconds or so, then dump the mix into a bowl to cool. Immediately after dumping, return the chilies, cinnamon, and cardamom pods from the bowl to the stove. Toast for another 3-4 minutes, until the chilies start to blacken a little. Open the cardamom pods, and add the seeds inside to the spice mix along with the chilies and cinnamon.Let it cool down completely before grinding.. Open a 16 oz can of chickpeas, and rinse thoroughly under cold water, agitating by hand. As you mix it up, the skins will begin to come off.Put down some paper towels in a rimmed sheet tray, and pour out the chickpeas. With more paper towels on top, begin rolling the chickpeas around. This will simultaneously dry them a bit and help remove the skins.. Put the cooled spices into a spice grinder, plus the grated nutmeg and blade mace. Grind until fine.In a bowl combine:Chickpeas 1 tablespoon olive oil 1 tablespoon sweet smoked paprika 2 tablespoons garam marsala Healthy pinch of sea salt.Mix well.. Have a rimmed sheet pan preheated inside a 400 degree F oven. Add the chickpeas, avoiding clumps and multiple layers.Cook for 20-30 minutes, shaking the pan occasionally. 
When you pull them out, sprinkle with kosher salt.They are good finger food, or mixed in with some Greek yogurt.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 3, 1, 0]\nD: [2, 3, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_172_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_172_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_172_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_172_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [1, 2, 0, 3]\nD: [2, 0, 1, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Recipe: For 6 smaller fluffy vegan pancakes!Ingredients: Dry ingredients:1 cup + 1/4 cup (131 gr) sprouted whole spelt flour (I used Rude Health) 2 teaspoons baking powder Wet ingredients: 1/2 cup (137 gr) apple sauce 1/2 cup unsweetened soy milk 1/2 teaspoon home-made vanilla extract 1 tablespoon agave Passionfruit glaze:3 ripe passionfruit, each cut open, seeds & flesh spooned out 1 tablespoon agave. Place dry ingredients & wet ingredients in this order into your Vitamix container. Place fitted lid & tamper in. Blend on high-speed until fully mixed. This took me about 10 seconds. Remove lid & tamper. Your pancake mix will be thicker, like this, see photo above!. Place ripe passionfruit seeds & flesh into fitted cooking pot & add agave. Stir & heat up. Simmer for about 3-4 minutes until it forms like a gel aka jam like consistency aka glaze. Taste. It is a bit sweet & you really can taste the full passionfruit flavour & that is what you want. Turn heat off & keep warm while you fry your pancakes. it will look like this:. Shape & fry your pancakes. 
Take a small pancake pan & smear it in with a fruity oil. Heat up on medium-high. Spoon 2 big small spoonfuls of the dough into your pan & flatten it all out with the back of your spoon to form quickly a roundish shape for your pancake. Wait until bubbles appear into the surface of the pancake & carefully flip your pancake over with help of a pancake spatula & fork. With your pancake flipper, push the upside of the pancake down for an equal fry. When done, place onto a plate & repeat the process. I oiled my pan again & again. When all the pancake batter has been used up, serve at once, like photo above & enjoy! 2 pancakes will be enough because they really fill you up & that is what you want!The pancake itself is a bit sweet, thick & fluffy & the passionfruit glaze on top is also a bit sweet & so addictive as well.You can also read this tasty post here on my blog: http://sophiesfoodiefiles.wordpress.com/2016/07/12/vegan-pancakes-passionfruit-glaze/\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [1, 2, 0, 3]\nD: [2, 0, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_173_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_173_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_173_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_173_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [2, 3, 0, 1]\nD: [0, 1, 3, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Ingredients for Orange Cake:All purpose flour - 1 1/2 cupOrange juice - 1 1/4 cupOrange zest - 1 tbspBaking soda - 1 tspBaking powder - 1/4 tspSugar - 1/2 cupSalt - 1/8 tspCanola oil - 1/3 cuplemon juice - 1 tbspVanilla extract - 1 tspIngredients for Orange Icing:Icing sugar - 1 cupOrange juice - 1/4 cupOrange zest - 1 tspFor baking9 inch springform Pan. Grease the pan with butter and add some all purpose flour in it. Swirl the pan to flour coat the bottom and sides of the pan.. preheat oven to 375 in bake mode.Take all the dry ingredients in a large mixing bowl. whisk until all incorporated.In a separate bowl, take all the wet ingredients, zest of Orange and mix all well. Now add wet mixture into the dry mixture. Mix all together without lumps.Now pour the cake batter into the prepared cake pan. Tap on the counter top, to release the air bubbles.Bake the cake for about 20 to 25 mins. Check the cake after 20 mins of baking. Insert a toothpick in the centre of cake and it should comes out clean. If it's sticky in the toothpick, bake for 2 to 3 more mins. Once it baked, remove carefully from the oven and let it cool down for sometime.. Take a cup of icing sugar in a bowl, add a tbsp of Orange juice at a time. Mix and check for the right icing pouring consistency.Add orange zest to it and mix well.Orange Icing is ready! It is soooo yumm!!. Once the cake is completely cooled down, take out the cake from cake pan and place it in cake stand/plate. [ I inverted the cake for smooth flat base]Pour the icing in middle of the cake and use butter knife or back side of knife to spread the icing all over the cake. I left the sides of cake as it is. [you can apply icing if you need]Decorate the cake with sliced oranges and orange zest.Before you cut the cake, make sure the icing sets well. 
Happy Baking :)\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [2, 3, 0, 1]\nD: [0, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_174_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_174_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_174_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_174_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [3, 1, 0, 2]\nD: [1, 2, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Roughly chop one onion.The reason for sitting the pork belly on a bed of veg is to lift it off the pan and it adds a little bit of flavour to the meat. Next roughly chop a carrot. Spread the veg across the tray.. Gather together a few metal skewer and tape them together. Using the skewers begin to poke holes into the skin, this will help the skin crackle evenly.Once the skin has been completely poked, cut the pork belly in half.. Then sprinkle over a pinch of salt.Next place the pork belly on the tray lined with veg. Spread salt across the skin, this will also help the skin to crackle.. Add a few pieces of wood to the charcoal to add some smoke.. Cook for a total of 4 hours.Two hours into the cook, make sure to turn the pork belly so it evenly cooks.. Take it off the bbq and let it rest uncover for 20 to 30 minutes.. You'll need lettuce, cherry tomatoes, sliced avocado and kimchi which is salted and fermented vegs. The kimchi adds an amazing flavor which cuts through the richness of the pork belly. You can find it in most Asian supermarkets.. 
Lay out the pork as you would a sandwich.First, add a layer of lettuce.Then a layer of cherry tomatoes.Follow with a layer of kimchi. Finally, a layer of avocado slices. To top it off spread over some garlic aioli.. There you have it! A Pork Belly Burger, the twist is the pork belly is the bun!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [3, 1, 0, 2]\nD: [1, 2, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_175_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_175_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_175_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_175_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [0, 3, 2, 1]\nD: [1, 3, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You'll need the following:4 Cups of Froot Loops* (or cereal of your choice...Trix? Lucky Charms?)1 Stick of Butter1 bag of Mini MarshmallowsTools:SaucepanSpoonMeasuring CupWax PaperBaking Dish*NOTE: I picked Froot Loops for the color aspect. Try other cereals too. Trix or Lucky Charms could be fun ones to experiment with. Ideally, when you are picking your cereal, you'd like to pick a crunchy corn or rice based cereal. This will allow for you to have a krispies treat with some crunch. . The whole process of making these delicious treats happens relatively fast, so you'll want to prep your pan first so that when the marshmallow coating on the cereal is still gooey you can transfer it easily. Butter your baking dish thoroughly. Make sure to butter the sides as well as the bottom. 
This will make it so your krispie treats will slide out when they are done instead of sticking to the pan. . Place your stick of butter in your sauce pan on your stove. Melt the butter over low heat so that you do not burn your butter. . Once the butter has melted you can add your marshmallows. I used the entire bag of mini marshmallows for this Instructable. Stir constantly, insuring even mixing of your melting marshmallows and the butter. Keep heating until the marshmallows have melted completely and you can no longer distinguish single marshmallows. . Once you have a uniform mixture of butter and marshmallows, add your Froot Loops. Mix gently with a wooden spoon until the cereal is coated in the marshmallow mixture. Since Froot Loops are much bigger than regular Rice Krispies, you'll want to be careful when mixing so that you don't break the loops up. . Once adequately mixed, transfer your cereal marshmallow mixture to your buttered baking dish. Then, using a piece of wax paper, press your krispies down so that they have a uniform shape and top. . Let your krispies treats cool for at least 10 minutes, allowing the marshmallow to harden. After they have cooled you can remove them from a pan (they should slide right out with the buttering you did earlier). Cut with a sharp knife and serve!If you plan on storing them, place them in an air tight container. They will keep for a few days. . Enjoy your new twist on the classic rice krispies treat! Take them to potlucks, parties, and wherever else you need a little rainbow marshmallow goodness. 
\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [0, 3, 2, 1]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_176_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_176_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_176_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_176_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [0, 1, 2, 3]\nD: [3, 2, 1, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. As with almost all of the cookbooks I've collected, this recipe came from a book found in a second hand store. So often, I find great books tossed to the curb simply because they aren't the latest, greatest, or hottest thing off the printing press from the next celebrity cook. I'm proud to say I have quite an extensive library of cookbooks, but the most expensive of all only cost a few dollars at most. This recipe produces a pickle that is reminiscent of a bread-and-butter pickle, though not as sweet. Of course, that may be resolved by simply adding a bit more sugar to the recipe, your choice. From the Cooking Light Annual Cookbook from 2008, (it is common courtesy, and often a matter of copyright, to credit your recipe source) I present to you: EASY REFRIGERATOR PICKLES 6 cups of pickling cucumbers, sliced thin (or according to your preference) (This is usually about two pounds) 2 cups of thinly sliced onions 1 1/2 cups of white vinegar 3/4 cup of white sugar 3/4 teaspoon of salt (Kosher, if you have it) 1/2 teaspoon of mustard seeds 1/2 teaspoon of celery seeds 1/2 teaspoon of ground Turmeric (adds great color!) 
1/2 teaspoon of crushed red pepper (yes, that stuff from the pizza place) 1/4 teaspoon of freshly ground black pepper 4 cloves of garlic, sliced very thin, or pushed through a garlic press. To reduce clutter, and eliminate the potential to forget an ingredient, or even to prevent dropping a salt shaker into a bowl of batter below a cabinet, I prefer to gather all of my ingredients before beginning any recipe. Many fast-food restaurants offer nifty little plastic cups for carryout condiments. With a slight touch of inner hoarder, I've collected and saved many of these cups for just such an occasion as this. Grab a few lids while you're at it, and you can easily prepare in advance if you are not quite ready to cook. After measuring out all the spices, grab a few onions, a head of garlic and a bottle of white vinegar. We're going to make brine!. If you don't have a garden full of vegetables, consider visiting your local farmer's market in search of pickling cucumbers. Generally, pickling cukes are are shorter, smaller, and often knobby. Crisp, bright green and white skin is not mandatory, but typical of this snappy little veggie! No one is going to judge you for buying cucumbers at the grocery store. Sometimes it happens. . There are a few gadgets in my kitchen (pffft, that is an understatement)\u00a0 that I use rather frequently, a mandoline being one of them. MANDOLINE (note the letter 'e' on the end) - not to be confused with a mandolin, which is a musical instrument. These are very valuable kitchen tools when you have a lot of thin slices to make, but are slightly fond of your fingertips. Typically, a mandoline has an adjustable dial for various thicknesses. This gadget makes slicing vegetables an absolute breeze!. Cut the ends from each onion, remove the skin, and slice very thin. If you desire super-thin slices of onion, consider using a mandoline, also known as a slicer. 
It is not necessary, or even suggested, that you peel the cucumbers, though you might consider removing a bit of each end. Though some people don't mind the blossom or end nubs, I'm not one of them. Using caution, carefully cut the cucumbers into thin slices. Yes, you may cut them slighter thicker if you wish. It is entirely up to you. Another option is to use a mandoline if you have one. See step 4 for more detail about mandolines. After slicing all of the cucumbers and onions, combine them in a large glass bowl in layers of half the cucumbers (three cups), half the onions, (one cup) and repeat. Remember, you'll need to have enough room in the bowl for the brine. Cut the garlic into tiny little slices, though you may also simply send the cloves through a press if you have one. Set the garlic aside to be added to the brine process in step 6. By all means, feel free to add other veggies! Only because I did not have any on hand did I not include various colors of super-thin sliced pimentos, jalapenos, carrots, etc. They only add to the beauty of your pickles. . In a small saucepan, combine the vinegar and all of the following, (and remaining) ingredients: 3/4 cup of white sugar 3/4 teaspoon of salt 1/2 teaspoon of mustard seeds 1/2 teaspoon of celery seeds 1/2 teaspoon of ground Turmeric (adds great color!) 1/2 teaspoon of crushed red pepper (yes, that stuff from the pizza place) 1/4 teaspoon of freshly ground black pepper 4 cloves of garlic, sliced very thin, or pushed through a garlic press Stir the brine well, and bring to a boil. Allow to cook for one minute.. After you have removed the brine from the stovetop, pour it over the onions and cucumbers. Be sure to mix it well. If your brine doesn't quite cover the cucumbers, you can always put another glass bowl on top, press down, and wrap the bowls tightly with plastic wrap to keep the top bowl forcing the cucumbers to be submerged in the brine. Allow the mixture to cool, then cover and refrigerate for four days. 
Approximate yield is seven cups of pickles. These pickles may be stored in the refrigerator for up to one month. If your large glass bowl is taking up as much room as mine did in the refrigerator, after it has completely cooled, you may transfer the pickles and brine to a plastic container if you desire. . And what does an image of lettuce have to do with pickles? Well, nothing, actually, but I needed a picture for the health tab. Lettuce is green. And green is usually healthy, right? For label-obsessed foodies, this is yet another wonderful feature of the Cooking Light books, they tell you what you are eating! Here is a partial low-down on the pickles you've just made, assuming you stick to the 1/4 cup serving size: Calories: 28 (10% from fat) Fat: 0.1 grams (polyunsaturated) Protein: 0.3 grams Carbohydrates: 7 grams Fiber: 0.3 grams Cholesterol: 0 milligrams (yeah!) Iron: 0.1 milligrams Sodium: 64 milligrams Calcium: 7 milligrams Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [0, 1, 2, 3]\nD: [3, 2, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_177_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_177_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_177_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_177_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [3, 1, 0, 2]\nD: [2, 1, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
You'll need:\n- one potato per person\n- 1/4 onion per potato\n- Two largish mushrooms per potato\n- Pepper (we use a four pepper blend)\n- Spices (1)\n- Soft butter\n- Aluminum foil\n- A barbecue (duh!)\n- Knife of slicer (2)\n(1) We use a spice blend from a popular spice company who shall remain nameless (but whose initials are Victorian Epicure).\u00a0You can make your own with chives, basil, sage, oregano and dehydrated garlic).\n(2) We use a slicer from another well known kitchen products company (who shall also remain nameless but is an anagram of\u00a0Prefaced Hemp).. Cut your potatoes, onions and mushrooms with the (unnamed) slicer. If you're not using a slicer, cut with a very sharp knife. All veggies should be sliced to approxiamtely 1/4\" thickness, but it;s not too critical to pull out the tape measure to check.. Cut a two-foot long piece of\u00a0aluminum foil. Butter it with about a tablespoon of soft butter, spreading with a fork as shown below.. Lay out your ingredients in layers on the buttered foil. As you can see from the image, lay out the potatoes first, then the onions, then the mushrooms. The photos show us making the recipe for two people. If you are making for more diners (or for fewer REALLY hungry people), add additional layers.\nNote that two layers should be your maximum. If you're making this dish for more than\u00a0four people, make a second foil container. Also remember that the ingredients that wind up in contact with the foil will brown much more than the other ingredients. In our case, the potatoes and the mushrooms will brown. If you're adding extra layers, consider which ingredient you want browned when figuring out the layers.. Drop another tablespoon of butter in nut sized clumps on top of it all. Obviously, this is not a cholesterol-free recipe...\nAdd pepper and spices to taste. Don;t feel limited by the spices we selected. Experiment! Paprika may be good here, or dehydrated jalapeno if you're into that kind of thing.. 
Close\u00a0up the foil by matching\u00a0the front and back\u00a0edges together and rolling the matched edges. Roll the entire package so that the resulting seam is on the side of the package. The melting butter will liquefy and putting the seam on the side helps prevent leakage.\nTwist up the ends of the foil.. Heat the barbecue on medium (about 350-400 degrees F\u00a0or 175-200 Celsius for those with a thermometer). Cook for 45-60 minutes.\nThe time variance is based on whether you cook at a lower or higher temp, and whether you cook on the bottom or top grill.. As soon as you open the package at the end of cooking, you'll know you're in for something special. This stuff smells divine and tastes even better!\nEnjoy!!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [3, 1, 0, 2]\nD: [2, 1, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_178_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_178_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_178_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_178_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [1, 3, 0, 2]\nD: [0, 1, 3, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Filling:\n500 grams Bramley apples (apples for apple pie or apple sause)\n70 grams golden caster sugar\n\u00bd tsp cinnamon\nPastry:\n112 grams butter , room temperature (if you can\u2019t be bothered microwave it for abit)\n25g golden caster sugar , plus extra\n2 eggs\n175g plain flour\nTools:\nOven\nBaking sheet\nRolling pin\nMuffin tin\nCookie cutter/glass/cup (one the seize of the muffin hole and one twice that seize\nPi cookie cutter/knife (you can cut them out yourself with a knife like i did.. Peel, core and dice the apples. I recommend not making the pieces any bigger that 1cm cubed. Preferably smaller (not like i did. It makes them neater)\nLay them out on the cookie sheet with paper towels on it.. Beat the butter and sugar together until just mixed. Add one egg and one egg yolk (safe the white for glazing) beat until it\u2019s like thick pancake mix.\nAdd flour. I recommend doing it in parts as that\u2019s easier and less likely to get lumps. The last bit you\u2019ll have to do with your hands. Wrap the pastry in cling film and chill for 40-45 minutes. While that is chilling make finish the filling by mixing the rest of the sugar (of the filling part) with the cinnamon (add more or less to taste). Then mix in the apples.. Roll out the pastry thinly and cut out the shapes as pictured. (on the size of the muffing hole, one twice that size and one the shape of pi.\nPut the one twice as big into a (greased, optional) muffin hole and push in and around the sides. Put the apple filling in it. seal with the other pastry and add the pi-shaped piece on top. Put a few holes in the top (around pi) with a knife.\nBrush with the egg white\nRepeat with the rest of the pastry (I got exactly 7 out of this recipe)\nBake for 25 minutes or until slightly golden.\nLet them sit in the tin for about 5 minutes (to cool a bit) and then move to a wire rack.. 
eat them!\u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [1, 3, 0, 2]\nD: [0, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_179_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_179_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_179_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_179_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 2, 1, 3]\nD: [2, 3, 0, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. use plasterstrips to get an imprint of your chest - you want the bra the right size.\nI'm not showing that step, due I don't want to see my chest on the internet ;)\n- i'm sure you'll figure out how to do that.\nInstructions are found on the package of the plasterstrips.\nWhen the imprint is dry cover it with aluminiumfoil.. you can use all cookie doughs that can be used with cookiecutters.\nthe recipe I used:\n0,5 egg\n125g flour\n62g sugar\n62g butter\n1Tablespoon Cocoapowder\n1 teaspoon vanillasugar\n1 little bit bakingpowder\nand for flavour 2 tablespoons of instant cappuccino. Form the cups of the bra on your covered mold.\nmake sure to make it evenly thick - about 0,5 cm\nbake it on 200\u00b0C for about 10minutes ( may vary with another recipe). at this point you can get as creative as you want :)\nHere's what I did:\nmelt some white choclate in the still warm oven\nspread it with a clean brush on the warm bra.\nmix some white chocolate with cacoa-powder\nand paint whatever you like :)\nbrush some chocolate on the edge of the bra and sprinkle it with chocolate-pieces\nlet everything cool down.. 
carefully peel the foil of the mold\ntake a corkskrew and make holes to join the two cups in the middle - be very careful!\ntie the cups together with a nice lace or string.\nYour done!. Now surprise your beloved one and have a nice cup of tea!\n- Or whatever you like\u00a0 :D\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 2, 1, 3]\nD: [2, 3, 0, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_180_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_180_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_180_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_180_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [0, 1, 3, 2]\nD: [3, 0, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. ingredients: \t\t\t\t\t\t\t6 rashers of bacon \t\t\t\t\t\t\t2 large potatoes \t\t\t\t\t\t\t1 medium onion \t\t\t\t\t\t\t2 cups water \t\t\t\t\t\t\t3 cups corn \t\t\t\t\t\t\tsalt + pepper \t\t\t\t\t\t\t1 cup half-and-half \t\t\t\t\t\t\t1/2 cup cheddar cheese \t\t\t\t\t\t\tcornmeal (optional)equipment: \t\t\t\t\t\t\tsharp knife \t\t\t\t\t\t\tcutting board \t\t\t\t\t\t\tlarge pot \t\t\t\t\t\t\tladle \t\t\t\t\t\t\tblender \t\t\t\t\t\t\tstove \t\t\t\t\t.. Bacon: \t\tHeat large pot on stove and add bacon. \t\tCook bacon until crispy (about 5 minutes). \t\tRemove bacon and crumble into a separate bowl. \t\tKeep bacon drippings for cooking onions.Onion + potato: \t\tPeel onion and potatoes. \t\tdice onion and cook in bacon drippings for about a minute. \t\tCut potatoes into uniform cubes and add to pot with onion.Combine: \t\tIn pot with onion and potato add water, corn, spices and bacon bits. 
\t\tCover and let simmer for about 20 minutes. After simmering for a while the potatoes should be cooked through. Carefully scoop a few ladles of the chowder into a blender, make sure you get a good mix of broth and chunky-bits. Blend on high until smooth, then add blended chowder back into pot and mix.. \n The chowder could be served now, but to really take it over the top dairy is added for that extra-smooth and amazing taste. Slowly stir cream into the blended mixture, this will lighten the colour of your chowder and give it a very creamy consistency. Then add grated cheese, I used an aged cheddar. Let chowder simmer on low for about 10 minutes. Keep that temperature on low, otherwise you risk burning the dairy. The picture here shows the marked difference between the stages of the chowder: chunky stock > blended stock > blended stock with cream. After a few minutes of the cream and cheese getting to know the rest of the ingredients it's time to serve. Scoop heaping portions into a bowl, then top with more bacon bits and cheese. A nice hot bowl of corn chowder for those chilly fall days, perfect! Did you make your own corn chowder? 
Post a picture of your results in the comments below and get a free Pro Membership to Instructables!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [0, 1, 3, 2]\nD: [3, 0, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_181_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_181_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_181_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_181_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 3, 1, 2]\nD: [1, 2, 3, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For this you will need:- some frozen french fries (aka Freedom Fries, you know, pommes frites)- some hot dogs, preferably the ones with skins or casing, but any type frankfurter will do. A bit harder to find, but they do sell the monster size Polish-sausage type red-hots. I have not tried this with Italian sausage, kielbasa, chorizo, or bratwurst but it should also give the same effect.- some frozen prepared buffalo-style chicken wings or fried chicken wings- a bit of butcher's twine or cotton string or clean thread- a bit of aluminum foil- Optional: a can of vegetarian beans in tomato sauce or whatever variety/flavor if you want a full entree. A can of beans with that slab of pork fat works also.- blood-like condiments, catsup, ketchup or tomatoe sauce . Take a part of a frozen french fry. You can shape it like a fingernail or just find a small piece that is pointy on one end and blunt at the other. Cut a slit or pocket into the end of the hot dog where a fingernail would be. 
\nTake the point of the knife and create a pilot hole to insert or stuff the french fry into the hot dog. Be careful not to bulge the hot dog too much so that you end up ripping the skin. Dig out some more meat if you need room to fit the french fry.\n. Take a hot dog and break it in half. You want a ragged edge. You can also break it 1/3 of the way down. One part can be a toe and the other part can be a part of the finger. \nTo create a finger, wrap a small piece of aluminum foil around the hot dog. Tie a string around where the joints of the finger should be. The foil will ensure you do not cut into the hot dog and break the skin. \nCompress the hot dog slightly as you tie the string. When it cooks, the hot dog will take a more natural shape and give you a ridge at the finger joint. We will cook the dog with the string on and cut them after they are cooked.\nFor hors' d'oeuvres, you can just bake them till heated through and browned in a toaster oven. You can also take a big pot and bring your batch of beans to a simmer. Gently place the \"fingers and toes\" into the pot and let them heat through with the beans. Baste with liquid and beans in the pot. \n. Cut off the strings and remove foil before serving. You can mash the french fry down to form a better fingernail shape. \nThe chicken wings should be frozen prepared kind where they require short cooking or just reheating. Have the chicken wings defrosted so you can work with them easier. Don't do this with fresh chicken parts unless you intend to fully cook them on the side or with the beans later. \nIt probably is best to just use the drummette portion of the wing, the one with a single bone. \nAfter cooking, peel or rip off most of the meat surrounding the bone, just be sure to remove any signs of chicken skin. This will allow you to skewer or insert the bone into the end of the hot dog. \nFor the fingers, jam in a chicken bone in the ragged end of the hot dog.. 
Garnish with your blood-like condiments, Serve with a big helping of Muuuuuuhhahhahhahahhhahahh!You can also bake with pastry dough strips wrapped around body parts and apply red sauce to look like bandages. Or have fingers sticking out of your mom's pumpkin pie. Hand pie, anybody?Serve grilled with grill marks for those especially deviant. Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 3, 1, 2]\nD: [1, 2, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_182_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_182_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_182_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_182_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 0, 2, 3]\nD: [2, 1, 3, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients\n3 medium beets\n1/2 onion\n12 oz spinach\n3 cloves garlic\n2 packages active dry yeast\n2 teaspoon sugar\n3 cups flour\n4 tablespoons olive oil\n2 teaspoons fine grain salt\n2 Tbs black lava saltTools\nJuicer\nBaking sheet\nMixing bowls (2)\nMeasuring cups/spoons\nParchment paper\nMortar and pestle. \n\t\tRun spinach and garlic through juicer and set aside\n\t\tJuice beets and onion into a separate bowel. 
\nFollow steps below for both the spinach and the beet dough...\n\t\tYou'll need 1 cup warm liquid, so pour veggie juice into mixing cup and fill to 1 cup mark with warm water.\n\t\tPour liquid into mixing bowl\n\t\tAdd 1 packet yeast, 1 tsp sugar and 1/2 cup flour\n\t\tStir a couple times, cover bowl with plastic wrap and set in cool, dark place for 10 minutes (until bubbly)\n\t\tMix in 1 tsp salt (I used mortar and pestle to grind lava salt, but any fine grain salt is fine)\n\t\tMix 1 cup flour into mix\n\t\tMove dough to floured surface and knead for 3 - 5 minutes\n\t\tRinse mixing bowl, dry and lightly coat inside with olive oil\n\t\tRoll dough into ball and place in a bowl\n\t\tRecover and let sit for around an hour (until doubled in size). \n\t\tPreheat oven to 400oF\n\t\tPunch down dough, knead 2 minutes and divide into balls 1/2\" in diameter\n\t\tRoll into thin sticks - about 1/4\" thick and 8\" long\n\t\tBrush bottom 1/2\" with oil and cover with lava salt.\n\t\tBake for 8 minutes\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 0, 2, 3]\nD: [2, 1, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_183_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_183_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_183_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_183_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [3, 2, 1, 0]\nD: [3, 0, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This step is about what you HAVE. \u00a0Not what you need. 
\u00a0(Mine are usually leftovers.)\nBasically, my opinion on\u00a0Bento\u00a0is that you may have lots of cool things in stock to use, play with, shape things...but the food comes down to what you have to work with. \u00a0There are no rules. \u00a0Just try to make something cute out of what you have. \u00a0That's my opinion. \u00a0No reason to stress out about what you need to have to make something. \u00a0Just use your imagination and put things\u00a0together\u00a0that are already in your kitchen. \u00a0If you want more color, maybe it's a good idea to go shopping. \u00a0:) \u00a0It's up to you.\nOne other thing that I love - cake decorating supplies. \u00a0I use them as props when I do this stuff. \u00a0I will try to find some photos to attach to this step, to show you. \u00a0. I needed a base for the\u00a0Bento\u00a0because I like build the meal upwards. \u00a0If you start at the bottom...I don't know, it just doesn't make sense to me. \u00a0I don't know too much about\u00a0Bento, it's been less than a year and I've only done it a few times. \u00a0Generally, the food is at an even level when finished and it's separated strategically. \u00a0\nSo, for a base, I put some salad at the bottom, with a\u00a0ziploc\u00a0bag of dressing. \u00a0That should go well with the left over rib eye. \u00a0Plus...I think it's cool to call it a secret salad because no one knows its there! \u00a0Until they eat their way to it.\nAfter the salad, I put a thin layer of seaweed/nori, teriyaki flavored. \u00a0It works well because it's kinda sticky.. I cut his steak up with a sharp knife and tried to shape it as best as I could. \u00a0I can't visualize these things before I do them, they just unfold. \u00a0So I can't give you any advice on how to shape your food. \u00a0Play around with it until you're happy. \u00a0I am horrible at perspective and if I ever make anything, it's flat. \u00a0Plus, my lines suck. haha.\nI used string cheese to line the center console of the Tie Fighter. 
\u00a0Cut the .. string cheese horizontally and wrap it around your circle. \u00a0Pin it with a couple toothpicks and put it in the microwave for around 20 seconds. \u00a0Once it's starting to melt, take it out and press it into shape. \u00a0Set it face down and the melted gravity will help form your cheese circle. \u00a0You can then add your windows and melt them on, but I didn't want to take any chances so I left them for the end, unmelted.\nI did the same cheese melting technique with the patterns on the...edge wing things? \u00a0It took forever and it was the hardest part. \u00a0My tip for you - don't do what I did. \u00a0Use a knife or an exacto instead. \u00a0By hand, it takes decades to make the right size cheese. \u00a0Then you can melt these pieces together and shape them as needed.. My favorite part about this thing is the stars! \u00a0I love all stars of all kinds.\nOkay, situate your Tie Fighter in some ritualistic Star Wars way...I just guessed. \u00a0Then you can put some stars around your ship. \u00a0I used a tiny cookie cutter, a slice of provolone cheese, and spaced them out nicely. \u00a0For the TINY stars, specks, I just sprinkled some sesame seeds around. \u00a0\nFor the lasers I used some sour punch kinda candy. \u00a0I had it leftover from the cupcakes I was making, so I figured that I would put it to use. \u00a0(Pretty awesome to have all these random things in the house and not have to plan this out or buy things.) \u00a0Back to the green stuff...I rinsed off the sour stuff and let them dry on a plate, stuck a hard spaghetti stick into the tube and left an un-filled part of the straw to puncture as a stilt against the rest of the\u00a0Bento, to hold it up in the design.\nThen I added my windows and I was done!. \n It's back in the fridge, waiting for my husband to wake up and take it to work. \u00a0I will probably add a sign that says to keep it upright because I don't want him to crash his Tie Fighter before he even sees it. 
\u00a0\nThanks for checking out my attempt at a Tie Fighter! \u00a0Hope you guys have a nice day at work! \u00a0:) \u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [3, 2, 1, 0]\nD: [3, 0, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_184_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_184_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_184_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_184_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [0, 2, 1, 3]\nD: [2, 3, 1, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Start by sterilizing your jars, lids, rings, and tools by submerging them in boiling water for several minutes.. Add to a large pan one and a half cups water and two cups apple cider vinegar. Then add six cinnamon sticks and a teaspoon of cloves. Add five cups sugar. Mix all together and bring to a boil. Let simmer at least ten minutes. . I like the taste of the pickles with mint, but my wife doesn't like the mint ones on hamburgers. So depending on your preferences, you decide.. You'll need to peel, core, and slice your apples. Try to get your slices about as thick as a hamburger chip pickle would be. This is a great way to get rid of small apples. . Fish one of the cinnamon sticks out of the syrup. Place it in the jar with the apple slices. Ladle the syrup over, leaving about a third of an inch head space. A few of the cloves in the jar won't hurt, but a lot can get too strong. . Put a lid on the clean rim of your jar. Snug down a ring. Then submerge in the hot water bath. . After the water has resumed a rolling boil, process for thirty minutes. 
Then remove the jars.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [0, 2, 1, 3]\nD: [2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_185_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_185_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_185_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_185_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [1, 0, 2, 3]\nD: [3, 1, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \nA head of cabbage makes an excellent katamari and radishes are the perfect size to use as the ends...\n\t\tWash and cut radishes in half\n\t\tSlice off the very end off each half\n\t\tPush toothpicks into cabbage, leaving about 1/2\" exposed\n\t\tPress radish halves onto toothpicks\n\t\tEvenly distribute radishes around entire cabbage. 
\nThe Prince is constructed from 2 cucumbers, a baby carrot and 4 green beans...Head\n\t\tCut both ends off\u00a0 two small cucumbers\n\t\tCut one of the cucumbers in half\n\t\tTake one of the halves and carve out a rectangle from the outer peel\n\t\tRemove small band of peel from both ends\n\t\tTake two of the end pieces and attach one to each side of the head with toothpicks\n\t\tStick a baby carrot in the top of the head to make the antennaBody\n\t\tTake the other half of the cucumber and press one or two toothpick in one end\n\t\tAttach head to body using the aforementioned toothpicksLegs/Feet\n\t\tRun toothpicks through two green beans, leaving a bit of toothpick exposed on one end\n\t\tPress legs into body at toothpick end\n\t\tTake the ends cut off the second cucumber (these will be the feet)\n\t\tCut a small circle out of the middle of each foot (approximately green bean in diameter)Arms\n\t\tRun toothpicks through two green beans, leaving a bit of toothpick exposed on one end\n\t\tPress toothpick ends into body at a reasonable arm position. \nSet The Prince up next to the cabbage katamari in a rolling stance.\nNow, The Prince did remain standing for the duration of the display, but i won't lie, it was precarious.\u00a0 I recommend setting up the veggies in the same place it will remain throughout the event.\u00a0 Also, make sure that the cabbage is stable, as it provides most of the support for The Prince.. \nToothpicks and/or skewers of fruits, veggies and cheeses can now be added...\n...along with turnip flowers : ). 
\nAdd Brussels sprout bushes, mixed green grass and weird fruit trees of strawberry and melon atop artichokes (or whatever weird fruit trees you can imagine).\nAnd don't discard those rinds!\u00a0 They can be filled with dips or salsa.\u00a0 The lemon pictured here is happily holding a yogurt fruit dip.\nThe example here, while a little out of control, is a very simple example of what can be done with the Katamari theme.\u00a0 It could be applied to a wide variety of foods and/or represent different levels of the game.\u00a0 A skilled garnish maker could do an amazing representation of the flower level...\n...and yes, that is a request.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [1, 0, 2, 3]\nD: [3, 1, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_186_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_186_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_186_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_186_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [3, 0, 1, 2]\nD: [3, 1, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. One package Nutter Butter cookies\n2/3 cup Nutella\n2 oz cream cheese, softened\n2 cups chocolate chips\n(optional) White chocolate for decoration. Using a food processor, grind up the Nutter Butter cookies into crumbs -- it\u2019s best to do this in two or three batches, depending on the size of your food processor.\nMix the cookie crumbs with \u00e2\u0085\u0094 cup Nutella and 2 oz cream cheese. The mixture will still be crumbly, but should hold together when pressed -- if need be, add a little more Nutella or cream cheese.. 
To get the balls all the same size, I use a mashed potato scooper. Pack the crumb mixture firmly into the scoop, then eject onto a piece of waxed paper. Because the mixture is so crumbly, I just left them as half-balls -- if you were making cake-balls, you could roll them into balls.\nPut the balls into the refrigerator for at least \u00bd hour to firm.. Melt two cups of chocolate chips in a double boiler. Depending on your chocolate chips, adding a little vegetable shortening can make it easier to coat the cookie balls. I usually use about 1 tsp of shortening per cup of chocolate chips. Warning -- adding too much shortening will make the chocolate too soft.\nI use a homemade dipping tool that I formed from a piece of stainless steel wire (a bicycle spoke).\nRoll each ball in the melted chocolate, and place on waxed paper.\nThe dipping tool makes it easier to scoop out the balls and shake off excess chocolate.. Optional - decorate the cookie balls\nAn easy way to decorate the cookie balls is with white-chocolate piping.\nUsing a microwave, melt some white chocolate chips in a small plastic bag. Cut the corner off the bag and drizzle the melted chocolate in a thin stream over the top of the cookie balls.\nPlace the balls in the refrigerator till the chocolate hardens.. 
Remove from the waxed paper & enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [3, 0, 1, 2]\nD: [3, 1, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_187_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_187_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_187_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_187_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [0, 2, 3, 1]\nD: [3, 0, 2, 1]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This step is about what you HAVE. \u00a0Not what you need. \u00a0(Mine are usually leftovers.)\nBasically, my opinion on\u00a0Bento\u00a0is that you may have lots of cool things in stock to use, play with, shape things...but the food comes down to what you have to work with. \u00a0There are no rules. \u00a0Just try to make something cute out of what you have. \u00a0That's my opinion. \u00a0No reason to stress out about what you need to have to make something. \u00a0Just use your imagination and put things\u00a0together\u00a0that are already in your kitchen. \u00a0If you want more color, maybe it's a good idea to go shopping. \u00a0:) \u00a0It's up to you.\nOne other thing that I love - cake decorating supplies. \u00a0I use them as props when I do this stuff. \u00a0I will try to find some photos to attach to this step, to show you. \u00a0. I needed a base for the\u00a0Bento\u00a0because I like build the meal upwards. \u00a0If you start at the bottom...I don't know, it just doesn't make sense to me. \u00a0I don't know too much about\u00a0Bento, it's been less than a year and I've only done it a few times. 
\u00a0Generally, the food is at an even level when finished and it's separated strategically. \u00a0\nSo, for a base, I put some salad at the bottom, with a\u00a0ziploc\u00a0bag of dressing. \u00a0That should go well with the left over rib eye. \u00a0Plus...I think it's cool to call it a secret salad because no one knows its there! \u00a0Until they eat their way to it.\nAfter the salad, I put a thin layer of seaweed/nori, teriyaki flavored. \u00a0It works well because it's kinda sticky.. I cut his steak up with a sharp knife and tried to shape it as best as I could. \u00a0I can't visualize these things before I do them, they just unfold. \u00a0So I can't give you any advice on how to shape your food. \u00a0Play around with it until you're happy. \u00a0I am horrible at perspective and if I ever make anything, it's flat. \u00a0Plus, my lines suck. haha.\nI used string cheese to line the center console of the Tie Fighter. \u00a0Cut the .. string cheese horizontally and wrap it around your circle. \u00a0Pin it with a couple toothpicks and put it in the microwave for around 20 seconds. \u00a0Once it's starting to melt, take it out and press it into shape. \u00a0Set it face down and the melted gravity will help form your cheese circle. \u00a0You can then add your windows and melt them on, but I didn't want to take any chances so I left them for the end, unmelted.\nI did the same cheese melting technique with the patterns on the...edge wing things? \u00a0It took forever and it was the hardest part. \u00a0My tip for you - don't do what I did. \u00a0Use a knife or an exacto instead. \u00a0By hand, it takes decades to make the right size cheese. \u00a0Then you can melt these pieces together and shape them as needed.. My favorite part about this thing is the stars! \u00a0I love all stars of all kinds.\nOkay, situate your Tie Fighter in some ritualistic Star Wars way...I just guessed. \u00a0Then you can put some stars around your ship. 
\u00a0I used a tiny cookie cutter, a slice of provolone cheese, and spaced them out nicely. \u00a0For the TINY stars, specks, I just sprinkled some sesame seeds around. \u00a0\nFor the lasers I used some sour punch kinda candy. \u00a0I had it leftover from the cupcakes I was making, so I figured that I would put it to use. \u00a0(Pretty awesome to have all these random things in the house and not have to plan this out or buy things.) \u00a0Back to the green stuff...I rinsed off the sour stuff and let them dry on a plate, stuck a hard spaghetti stick into the tube and left an un-filled part of the straw to puncture as a stilt against the rest of the\u00a0Bento, to hold it up in the design.\nThen I added my windows and I was done!. \n It's back in the fridge, waiting for my husband to wake up and take it to work. \u00a0I will probably add a sign that says to keep it upright because I don't want him to crash his Tie Fighter before he even sees it. \u00a0\nThanks for checking out my attempt at a Tie Fighter! \u00a0Hope you guys have a nice day at work! \u00a0:) \u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [0, 2, 3, 1]\nD: [3, 0, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_188_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_188_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_188_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_188_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [1, 2, 0, 3]\nD: [2, 3, 1, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Makes 10 muffin sized cakes.\nIngredients:\na) 1 tablespoon icing sugar\nb) 150ml milk\nc) 175g self raising flower\nd) 150ml corn oil\ne) 150g caster sugar\nf) 2.5 tablespoons of cocoa powder\ng) 1 teaspoon bicarbonate of soda\nh) 100g dark chocolate (70% cocoa solids minimum)\ni) 397g can of Carnation Caramel\n2 eggs (i forgot to take a picture 0_0 )\nFor the heart Decorations:\nFlat red sweets. I\u00a0used a roll of sour red candy\nWhite icing. I\u00a0used \"Queen\"\u00a0white chocolate fudge writing icing because the nozzle was small)Tools:\na)\u00a0Baking sheet or cupcake tray\nb)\u00a0Spoon\nc)\u00a0Fork\nd)\u00a0Measuring spoons\ne)\u00a0Seive\nf)\u00a0Mixing bowl\ng)\u00a0Measuring jug\nh)\u00a0cake cases - i used muffin cases for extra large cupcakes, nom!\ni)\u00a0Timer\nj)\u00a0Scales\nk)\u00a0Spatula\u00a0 or wooden spoon\nl)\u00a0Small spatula (useful for icing, you could use a butter knife)\nScissors (not pictured). 1)\u00a0Pre heat your oven to 180\u00ba C2) Prepare the dry ingredients: Sieve the flour and cocoa powder into your mixing bowl3) Add the bicarbonate of soda and caster sugar4)\u00a0Use the spatula to mix to an even colour5) Prepare the wet ingredients:\u00a0 In your mixing jug measure the corn oil, and add the milk6)\u00a0Add in the eggs, and 2 tablespoons of the Carnation Caramel7) Give the oil/milk/egg mixture a good mix with a fork to combine it with the caramel. And pour the wet ingredients into the bowl of dry ingredients.8)\u00a0Mix all the ingredients together thoroughly, scrape the bottom of the bowl a couple of times to make sure there isn't a sneaky pocket of dry ingredients . 9) Lay out your paper muffin cases onto baking sheet, Spoon the mixture into the cases leaving 1cm clear at the top to give them room to rise.10) Once it's up to temperature pop them in the oven for 20 minutes. Don't open the oven while they're coking or they make not rise so well. 
\nTip:\u00a0To check that they're cooked insert a clean sharp knife or knitting needle into the centre of one, if it comes out clean they're done. If it comes out sticky give them 3 minutes more. \nPlace on a wire rack to cool (Use the wire rack from your grill pan (give it a bloody good clean first!) if you don't have a cake rack) . 11) Break the dark chocolate into small pieces and melt in an appropriate bowl/jug in the microwave. Inbetween 30 second bursts give it a mix with a fork to see how its melting.\n12)\u00a0Combine the remaining Carnation Caramel with the chocolate. Leave this to cool a little, 10 or 15 minutes13) While the icing is setting a little (if you spread it straight awy it'll be more inclined to run over the edge of the cupcakes)\u00a0cut the tops of the cakes off flat with a sharp knife. This gives you a nice flat surface to ice onto.14) Start by spooning a blog of icing into the centre of the cake, and use a small spatula to drag it down and around the edge of the cake. Add a little more with the spatula as you go along if you find it's not enough. Finally, drag the spatula around the top of each cake to smooth over where you've added mroe icing. . Making the hearts is better described with pictures so have a look at them before you start, You want to use sharp scissors to get nice sharp edges.15) This is a diagram of the shape you're trying to make out of the sour candy. The white dotted line indicates where i've used two pieces of candy to make each heart.16) Take your roll of sour candy and cut off a section about 6cm long, Trim it to look like the top half of diagram 1517) Cut a 'T' shape from another piece of sour candy to make the bottom part of diagram 15. This is what your two-part heart should look like. 18)\u00a0Make four hearts, and cut the fourth one in half. This will be the unfilled heart at the end. Press a heart onto each cupcake. 
Leave them to set for at least half an hour, this way the surface will set slightly so if you make a mistake with the icing it can be easily corrected. 19)\u00a0This diagram show where the white line goes.20) Use your white icing to pipe the thins white line around the heart. Hold it as close to the cupcake as you can without dragging it over the surface. \nTip:\u00a0Practice on a piece or paper first (or the back of your hand, yum!) If you make a mistake on the cupcakes dip your finger in water and wipe it off.21)\u00a0For the half filled heart pipe a line where the sour candy comes up to on the other hearts, and fill in with horizontal lines of icing. Pipe the white line around this like you did on the other cakes. . Thats it!\u00a0Decorate the rest of the cakes with sweets, pipe patterns with the white icing, or dust with icing sugar. They'll keep in an airtight tin for a few days, you could give them in a clean white tupperware tub lined with coloured tissue, or lay them out on a pretty plate. \nA new tutorial every Monday on www.alittlestranger.com\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [1, 2, 0, 3]\nD: [2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_189_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_189_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_189_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_189_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [0, 1, 2, 3]\nC: [0, 2, 3, 1]\nD: [2, 0, 1, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
I would suggest you to use complete quantity and not to half them like i did else you would end up with half semi circles...2 eggs1cup all purpose flour 3/4 cup sugar powdered3/4 cup oil1/2 tsp baking powderFew drops vanilla ( optional)Mix egg and sugar until incorporated. Add oil and mix well. Sift flour and baking powder together. Add. Add your vanilla.I cooked my cake in microwave oven Preheat oven at 180 Celsius. Bake for 20-25min. Once cooked let it cool and cut circles ( i used three as my cake thickness was dense.). I made this sauce only for this cake amount. It can be stored easily.4 teaspoon milk2 tablespoon nutellaWarm your milk in microwave. Add in your nutella. Mix it well. It sould be in runny constancy. Serve hot for this cake.. Warm 2/3 part of chocolate at intervals of 10 seconds. Making sure you stir nicely every time. Once your chocolate has just melted. Add in your 1/3 of remaining chocolate and mix well and quickly. If you feel that your chocolate is too cold that lumps of solid chocolate are visible abd are tough to dissolve, microwave for another 5 seconds. Your chocolate should not be too warm or it will not temper correctly. Spread your chocolate over aluminium foil and pat it to remove air bubbles.Let it stay at room temperature for atleast 30 min to an hour. Once u can feel that chocolate is not sticking to your fingers but it not firm yet, score circles and place your chocolate sheet in refrigerator. Only take it out before serving.. Place your disc of top of your cold cake and pour warm nutella sauce on top of it . 
Your dessert is ready\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 2, 3]\nC: [0, 2, 3, 1]\nD: [2, 0, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_190_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_190_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_190_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_190_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 1, 0, 3]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. There's quit some internal parts you will need. I found all the parts on the internet, but you might check your local diystore if you can find some parts there.These are the internal parts you will need:- Funnel (top diameter around 100mm)- Aluminum plate (for small parts, see next steps)- Thermostat (KSD301 105\u00b0C)- 4x M3 bolt length 6mm with Phillips or slotted head- 4x M3 nut- Temperature fuse- 2x Parallel connector- 1m flexible silicon wire 1,5mm- Ring connector for 1,5mm wire (M3 hole)- 1m Silicone hose 10mm inner diameter- 1m Silicone hose 6mm inner diameter- Heat resistant tube- 7x M4 bolt length 12mm for embossing with Phillips or slotted head- 2x M4 bolt length 6mm for embossing with Phillips or slotted head- 2x M4 bolt length 30mm with Phillips or slotted head- 13x M4 nut- Rocker switch (minimum 6A)- 4x Hose clamp 13,7mm to 15,3mm- 2x Hose clamp 10.8mm to 12,3mm- Grounded chord and plug for 220-250V- 3x Faston angle connector 4.8 width- 2x Faston connector straight (6mm width, depending on the connectors on your thermostat)- Thermal paste (1 gram)- Heat-shrink tubing 6mm to 2mm- Heating element- One way valve for water which fits the 
10mm silicon hose- Plastic cup (from paint gun, so make sure it's new and clean)I ordered my parts from a German company called 'Conrad'. They supply many countries in Europe, so you can find the order number of most of the parts on this list:https://www.conrad.be/ce/nl/ShoppingList.html?view...On Conrad you can find everything except these:-Heating element:You will have to order this from a supplier for replacement components. I ordered mine from Servilux (order nr 141709)If your order somewhere else you have to make sure it's similar to the one on the picture in order to complete the coffee maker.- Plastic cup 600ccThis will serve as the water tank. Depending on where you order them they might have a different thread. So if possible order one with a thread where the hose fits around, otherwise you also need an adapter. This so you can make a transition from the thread of the cup to an outer diameter between 10mm and 13mm.I ordered mine from nonpaintstore.nl (part nr 4213505). It's not cheap but it has the right fitting on it for the silicon hose.. Here you can find the link to all the 3D-printed parts. Because of the size of the object, they take quit some time to print. I printed all of them with standard print resolution from Makerware.http://www.thingiverse.com/thing:348199NOTE: These parts were made according to the dimensions of the components which I used. Different components might give problems with the dimensions of the 3Dprinted casing. In the future I might try to learn Openscad and make a parametric model out of it for dimension adjustments.I ordered filabot filament, which is made out of recycled sources, to try out of it's possible to print with it. I first ordered a blue roll and when I tested it, it came out quit well. Only difference with normal filament was the inconsistency of the color. 
But that's what they also mention on their website and is because of the use of recycled sources that this might occur.So I ordered two more colours: red & black. Red turned out to be more like salmon pink instead of red. Black was normal black. Both spools seemed to be pretty consistent in color. But printing with the black spool gave some problem, which is why I printed all parts in 'red'.. Making the mold was not easy and I had to try some stuff out before making the final shape. Basically you fill up the drag with sand, then place the 3D-printed model on top and fill in the gaps on the side. I then cut down the sand, so that the model can be taken out afterwards. After applying talk powder you fill up the cope.When the cope is filled with sand I open op the mold and get the model out. I also make a hole where the aluminum will be casted through. I know it's not according to the best aluminum casting technique, but it works. If there are any advices on how I can adjust the sand mold for a better casting with this shape, let me know. Or ideas for making a better cast-able shape are also welcome.You need to cast part 1 two times as a left and right side. It's better if you sand the 3D printed model before making the mold, so the model loosens better from the sand. One of the 2 sides needs a 'bump' where we will fix the heating element later.I made the bottom of the coffee maker out of aluminum so it has a heavy and stable base. It's also stronger for fixing the heating element. But there are ways to attach the heating element in a plastic 3D-printed model without having to use aluminum. The reason I chose to use aluminum was because this project was about searching for techniques for local producing and recycling. Casting aluminum & 3D printing with plastic from recycled sources came out as useful techniques, also because of the possibilities to share digital models. 
I used oil based sand because I'm still learning how to cast and thought this was the safest method, instead of mixing my own sand with water like you can find on other tutorials.. I made a furnace for melting aluminum out of an old fire extinguisher by checking out other instructables: https://www.instructables.com/id/Oven/I used scrap aluminum from the Fablab, old aluminum cans and trash aluminum foil to melt and cast into the mold. Remember safety: read and learn enough about this before trying it out and use enough safety gear to protect you from any mistakes.. Saw off the not wanted aluminum from the part and smooth down the model. The sides need to be straight because there are 3D-printed parts which will be attached onto them.. The heating element is the essential part of the coffee maker. It warms up the water so it starts to boil. This is the way it works: The water goes through the heating element until it's leveled (communicating vessels). When the coffee maker is turned on the heating element starts to heat until the water starts to boil. Therefore the water wants to expand and will push upwards. In the side of the water reservoir there is a valve which makes sure the water doesn't go up on that side, therefore pushing it through the other side upwards.Here is a more visual explanation video on youtube by Crazy Builders:https://www.youtube.com/watch?v=COKlObhGt50So, to make sure the element heats up to the right temperature you have to use a thermostat. This will interrupt the electric circuit when it reaches the stop temperature of the thermostat. I used a thermostat of 105\u00b0C, just above boiling point but I'm thinking a lower one (90\u00b0C for instance) might work as well, if not better.The heating element I ordered has a small piece welded to it where you can fix the thermostat. I cut a piece of aluminum for this. On this piece it will later also be possible to connect the ground to for safety. 
To make sure the heat gets transferred well to the thermostat I used thermal paste between the connection of the aluminum parts, the heating element & the thermostat.. Check the thickness of the 'bump' from the casted part. Take (at least) 1 mm less and tape off your drill to make sure you don't drill through the whole part. Tap an M4 thread inside the holes.To fix the heating element to the part of the casing we need to cut some strips which we will use to clamp. By tightening the nuts, the heating element will get fixed. Make sure you put a piece heat resistant plastic between the aluminum strip and the heating element. This will make sure there is no way electricity can flow to the outside casing.Since I don't have pictures from the fixing of the heating element, I made an exploded view to try to explain how it's fastened.. Drill a hole to the size of the switch. I used the biggest size drill I could find and then rasped to the exact size. Make sure this is the side where the heating element is NOT fixed.. Connect all the parts in the right place. I made a scheme where you can see how everything is connected. Use as much silicon wires as possible since they can handle the heat better than normal wires. The wires from the chord can be protected with the silicon tube.Make sure the valve is in the right direction otherwise the water can't get to the heating element.I suggest leaving the coffee maker for a few hours before trying it out. Then make sure there isn't any water leaking through the sleeves in the bottom. Also check again to make sure the casing can't conduct any electricity with a multimeter.. Choose what flavor of coffee you like, how strong you want it to be, and how much you want. Press the button and start brewing. Enjoy!Edit: I now also casted the dripping plate in aluminum. This because the 3D-printed part is not completely water sealing. 
\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 1, 0, 3]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_191_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_191_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_191_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_191_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [0, 3, 2, 1]\nD: [2, 1, 3, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Sure, you can just show up and buy stuff without any special gear. But these specialty urban shopping items will make your life a lot easier.These three gear options can work by themselves or with each other. 1. REUSABLE BAGSThere are quite a few possibilities for bags beyond the world of Paper and Plastic. Pick one that works best for carrying your groceries and matching your good style. Keep one in your office/gym bag you have with you every day for those spur-of-the-moment shopping trips on your way home.2. GRANNY CARTThe Granny Cart is for when you and your roommates start saying things like, \"Plain pasta for breakfast AGAIN?\" or \"You can just scrape mold off and it's ok, right?\" They're best for the BIG trips. They are also great for shopping with kids since they provide easy schlepping and lots of entertainment.Something important to keep in mind about Granny Carts is how you pack them. You don't want your tomatoes on the bottom where they will be squashed by things stacked on top and subsequently pureed for sidewalk gaspatcho by the metal squares of the cart. 
Put heavy, boxed or canned items on the bottom (any pre-made frozen stuff, canned soups and beans etc), followed by lighter containers (pasta, cereal, tea), and lastly, crushable produce. One last tip: the key to looking cool with your Granny Cart is KNOWING you look cool with your Granny Cart. So stand tall and STRUT IT.3. BICYCLE BASKETClassic, functional and an easy way to express your style. A very sensible shopping option for those who bike to work, or for trips just beyond walking distance. (But beware of hills!)There are a million bike baskets out there, from the insubstantial wicker or plastic ones to the standard metal to the super high end wood and metal. You can put a basket on the front of your bike (attached to your handlebars), on top of your rear rack or on the sides of your rear rack. Some baskets fold and some detach.There are also a variety of saddle bags (technically called \"panniers\") you can attach to the back of your bike for bigger, heavier loads. Much like with the Granny Cart, keep a stacking strategy in mind while packing it.. Take a stroll through your neighborhood and take note of all the food shops. You might be surprised by how many small grocers, fruit stands, bodegas and specialized food stores there are. Venture inside them to see just what they offer. These places can provide a lot more than bread or beer at the last minute, particularly in climate-blessed San Francisco. (See our Food & Liquor project.)Not only can these places provide equal (if not better) goods, they often provide more personal service, which can be a huge help when you have all of one minute to find an obscure item before you need catch the bus to make your meeting. Get to know your shop owners, or at least the regular check-out folk, since they can keep you updated on sales or new goods, and can even put in a request for things they don't normally carry. All just for you! And keep in mind that you don't have to shop for everything in one store. 
The first place might do produce better than the second, while the third place has the best butcher counter and the fourth has great baked goods and the cheapest beer. Of course it's easiest if you find a group of these stores that are near each other. San Francisco has lots of \"market streets\": from parts of Irving Street in the Sunset to stretches of Geary Avenue in the Richmond to Church Street between Duboce and Market in Duboce Triangle, and many more. . Once you've found your local stores, START SHOPPING. \nNow that you have the right gear, it'll be easy for you to stop by the stores on your way home from work or after you pick up your kids from school. Keep your big shopping trip on the weekend for staples, but try stopping by stores on your way home for produce and other spoilables during the week. This can keep your meals fresher and your weekend shopping less gargantuan. You weekend-shoppers will be surprised by how wonderfully empty a market can feel the rest of the week.\nMaking a list before you go can help ensure you get everything you need. That way you won't be that nutter crying out \"Butter!\" on the way home because you forgot to get some. Be flexible though-- you might arrive at a store to find that the pears are much more delicious looking than the apples that were on your list.\nAnd keep your mode of transportation in mind while shopping. Don't buy that case of Hansen's soda just because on it's on sale if you're walking home up hill with your groceries in hand. Thar's a job for a Granny Cart or bike basket.\nWhen you're checking out, just say, \"I have my own bags.\" Some stores will give you a discount, or maybe just a smile. This is also a good time to ask about a new product you'd like to see available or who that cute kid is in the photo taped to the cash register.\n. 
Now that you are fully loaded with hot urban shopping gear strategically packed with delicious things from your local stores, go home and eat!\nOf course, don't be afraid to make a few other stops along the way. Your good packing job should ensure that socializing won't be a problem. (As long as you stayed away from the frozen food aisle.) You just might be surprised by who your super hot shopping gear attracts.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [0, 3, 2, 1]\nD: [2, 1, 3, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_192_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_192_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_192_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_192_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 3, 1, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Roasting the cocoa beans is essential to develop the \"chocolate\" flavor. Its also pretty easy to do (there is a lot of science to roasting cocoa, but we won't get into that here)\n1. Adjust an oven rack to the middle position and pre-heat your oven to 300F.\n2. Spread your unroasted beans out on your baking sheet so they are in a single layer.\n3. Once the oven is preheated, place the pan in and start your timer.\u00a0 Roast the beans for 30 minutes. For your first time trying this, pay close attention to the smell. When your beans first start heating, you might notice an acidic smell coming off--this is normal. What we want to do is cook that off and wait until they start smelling like brownies. 
Because there are endless variations on how to roast and beans vary in how much they should be roasted, I've suggested a very \"average\" roast. You can experiment with future batches.\n4. At 30 mins, pull the beans out and place the pan in front of a fan to cool.\u00a0 If you don't have a fan, don't worry, just let them cool until they are cool enough to handle.. Now comes the fun part--removing all the shells from the roasted cocoa beans!\u00a0 Remember that friend I mentioned?\u00a0 Now is their time to shine.\u00a0 Each bean needs to have its shell removed.\u00a0 After roasting, the shells will be brittle and should crack off easily (some will be harder than others).\u00a0 The nibs inside will also break apart, this is ok.\u00a0 This is tedious and can get tiring and will ruin a manicure, but the alternative is to spend lots of money and buy an industrial machine to do it for you. On second thought, you might want to get two friends to help...\n\u00a0\nAs you de-shell, keep the beans/nibs in one bowl, and the shells in another (the shell can be thrown away or composted, but keep in mind cocoa shells are still just as bad for dogs as chocolate is, so don't let Fido find them!).\u00a0 And don't worry if there are a few bits of shell in with the cocoa nibs, they will get filtered out by the juicer.. \nNow is the messy part, and that friend will come in handy here, too.\u00a0\nThe de-shelled cocoa beans need to be run through the juicer.\u00a0 If you aren't sure if your juicer can handle this, check the brochure--any juicer that can make nut butters should be suitable.\u00a0 What you want is for the juicer to grind AND heat the cocoa so that the cocoa butter present in the beans melts** .\u00a0\nWith the filter screen in the juicer, and a bowl under the \"juice\" port and one under the \"pulp\" port on the juicer, slowly start adding the nibs. Don't rush, you can overload the juicer.\u00a0 At first almost all of what you add will come out the \"pulp\" port. 
Once you have run it all through, do it again.\u00a0 Each subsequent pass will heat the mass and more and more will melt through the filter screen and come out the juice port, while less comes out the pulp end. Each time, run what comes out the pulp end through, you will be collecting what is called cocoa liquor--partially refined, liquified cocoa mass--flowing out the juice port. At a certain point the only thing coming out the pulp end should be cocoa shell, since you have de-shelled by hand, almost nothing should come out. You are done when nothing or very little comes out the pulp end. You should have a nice bowl of melted cocoa liquor.**If your juicer doesn't generate enough heat (i.e., nothing comes through the juice port), have that friend point a hair drier at the auger end of the juicer until everything starts melting and flowing smoothly.\u00a0 Be careful not to blow all your cocoa away with the hair drier as it comes out of the juicer (I've learned this from experience).. You will need to weigh your cocoa liquor in order to formulate your final chocolate's percentage.\u00a0 I like a dark chocolate so I never go lower than 70%.\nHere's how it works:\nif you want your final chocolate to be a 70% dark chocolate, take the weight of the cocoa liquor (in grams), and divide that number by 70. Take the resulting number and multiply that by 30--that is the grams of sugar you will need to add to make the final batch a 70% dark chocolate. See, math is fun when it makes chocolate!\nNow, weigh out the amount of sugar you calculated for your custom chocolate. Pre-grind the sugar in small batches in your coffee grinder. You only need to grind for about 30 seconds to get a nice powdered sugar. 
With a rubber spatula, mix the freshly powdered sugar into the cocoa liquor, making sure there are no clumps.\u00a0 While mixing, have a friend lay out a piece of parchment or foil on a baking sheet (aren't friend's great?)\nOk, i'm sure the suspense is killing you---once the sugar is mixed in, have a taste! (I'll pretend like I didn't see you tasting the cocoa liquor :)\u00a0 Once you have a few batches under your belt, you can add spices and other dehydrated goodies at this step, so long as they don't contain water or moisture. Water will ruin chocolate (even just a drop or two).. \nYou are almost done---All that's left to do is to dispense your chocolate into portions onto the lined cookie sheet.**\u00a0 If you have chocolate molds, use them.\u00a0 With a spoon, portion out the liquid chocolate onto the cookie sheet.\u00a0 When done, pop it in the fridge for 15 minutes.\u00a0 Once they are solid, they are ready to eat.\u00a0 They will melt in your hands because they are untempered, but if you keep them in a tupperware in the fridge or freezer, it will slow their melting (and slow the formation of fat and sugar \"bloom\"---the swirls and speckles in the picture). 
\u00a0\nPat yourself on the back, because you just made handmade chocolate from scratch!\u00a0 Now go out and brag to your friends about your accomplishment, but be prepared to share!**I'm deliberately skipping tempering the final chocolate because that is a whole science by itself, but there are plenty of websites that explain how to do it.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 3, 1, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_193_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_193_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_193_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_193_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [1, 0, 2, 3]\nD: [1, 3, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1 package of dry yeast1/4 cup very warm water (about 100-105 degrees F.)1/2 tsp white sugar1/2 cup pure pumpkin puree (not pumpkin pie filling)1/4 cup heavy cream or milk 1 tsp fine salt1/4 cup melted butter1/2 tsp vanilla extract3/4 tsp pumpkin pie spice (or 1/2 tsp ground ginger and 1/4 tsp together grounded cloves,cinnamon,nutmeg)1 large egg1/4 cup granulated sugar3 to 4 cups all purpose flour (divided), as needed (add enough flour so that dough just barely pulls away from sides, and a very soft, slightly sticky dough is formed). Warm together water, milk and sugar to reach about 100-105 degrees F. and add yeast and mix till incorporated and leave keep aside for 10 minutes and if the yeast mixture bubbles or increases in size, that means the yeast is active and our rolls will turn out soft and fluffy. 
If the yeast mixture remains the same after 10 minutes, that means either the yeast is not good or water milk mixture was not the right temperature, so need to repeat this step using fresh ingredientsMelt the butter and keep asideIn a large mixing bowl + Sugar + melted butter and mix well + egg + yeast mixture and mix well + pumpkin puree + pumpkin pie spice + Salt + Vanilla essence and mix well till incorporatedAdd in 1 cup of flour at a time so that dough just barely pulls away from sides, and a very soft,slightly sticky dough is formedSpread all sides of dough with olive oil, wrap the bowl with cling wrap and let it sit on a warm place for 1-2 hours or till doubled in size. Ingredients:3/4 cup packed brown sugar1/4 cup of granulated sugar2 tbsp ground cinnamonSteps:Mix all the above ingredients well till incorporated and keep aside until required. Place the doubled dough on a surface smeared with flour and with the help of rolling pin, roll the dough till it reaches 1/8th inchMelt 5 tbsp butter and brush on the rolled dough Spread the cinnamon sugar on the rolled doughTightly roll the dough and cut into 16 rollsPlace the cut rolls on a greased baking tray about 1/2 inch apart. I used 2 trays because i did not have a big enough trayCover with cling wrap and keep aside for 45 minutes in a warm placePreheat oven to 350F and bake for 30 minutes. 
Ingredients:1/4 cup room temperature cream cheese1 cup powdered sugar1/4 cup milk, or as needed1/4 tsp vanilla extract, optionalSteps:Mix all the above ingredients very well till you get smooth mixturePour on top of warm baked rolls and serveDone :)\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [1, 0, 2, 3]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_194_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_194_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_194_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_194_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [1, 0, 3, 2]\nD: [1, 3, 2, 0]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This list cover 2 cups/glasses of dessert, but if you want more you only have to double this amount! :)100ml (3.38oz) of raw-fat milk100ml (3.38oz) of Italian style coffee (the best one you can find)100gr (3.52oz) of sweet white yoghurt (don't go for the sour one, this time)Muller makes an excellent 0,1% fats sweet yoghurt, so you can have a good choice with the illusion of a less fatty recipe XD200gr (7.05oz) of Stracciatella ice-creamStracciatella is like regular white ice-cream but with a lot of crunchy chocolate chips inside. This will give a great texture to the mix. If you can't find Stracciatella, you can always go for the classical white one and add some chocolate chips after. If possibile, buy homemade ice-cream. Here in Italy is very easy... 1 out 5 shops are gelato's ones :DAbout 12 ice cubes4 teaspoon of sugar, I suggest brown sugarFresh liquid cream (to whip later!) 
or homemade whipped creamSome gelato's shop also prepares a wonderful whipped cream that you can take away. If you prefer, you can buy it there, already whipped! But please, don't use spray whipped cream. It's awful. XDI used whipped cream directly from gelato shop! :)The tools you will need are as follow:Blender Digital scale Whip for the liquid cream (if needed) Spoons and cups Sac \u00e0 poche or syringe for food decoration (to apply whipped cream)The recipe is pretty easy, in my style! :DSo let's move on, you will have your Frozen Cappuccino in minutes.. As some of my followers may notice, I love this kind of recipes: put everything in the blender and then eat! XDThey are super easy, fast, and satisfying!So, let's start by making a good coffee with yuor moka. Use the best quality coffee you can find in your market. It may sounds unnecessary, but coffee quality REALLY makes the difference.Pour 100ml (3.38oz) of coffee in a cup with 4 teaspoons of brown sugar, and mix it up.The heat of coffee will help you to do this quickly!Now, add 100ml (3.38oz) of raw-fat milk to the mixture. I suggest to use fridge-cool milk, so you will have it at room temperature without waiting.. In the meantime, take 200gr (7.05oz) of ice-cream out of the freezer, to let it soft a bit.Did you know that super-hard frozen ice-cream can easily burn the spin motor of your blender? I had some bad experiences about it... so trust me, let it soft a bit XDPut ice-cream, 100gr (3.52oz) of sweet yoghurt and 12 ice cubes in the blender, along with the sweet milk and coffee mix prepared in the previous step.If you like the taste, you can also add a portion of vanilla extract before mixing.Now blend everything until you have a smooth cream.You can change the \"texture\" of your mix by varying blending time: less time means a more grainy mixture. Your choice! ;)Now, pour all the mixture in two cups/glasses (or more if you have doubled the ingredients).We are almost done, it's time for whipped cream!. 
This part is very easy.Simply pour the fresh liquid cream in a large cup and whip it until it is at the right density.My Kenwood blender (I love it) has an extra accessory that can be applied on the bottom of the mixing cup, and it works like a big whip. It's also perfect for whipping white eggs. :PYou can add some powdered sugar or vanilla sugar before you whip it, to add some extra sweeteness!If you have bought fresh whipped cream, skip this part. Like me! :DNow you only have to load your sac \u00e0 poche or your decoration syringe (I use the last one) with a good amount of whipped cream.Take the cups/glasses with the coffee mixture that we have prepared in the previous step and add a whirly, magical cream top on your dessert! Time for om nom nom nom! :D. You can finish everything by adding a sprinkle of brown sugar onto the creamy top.Now it's time to taste this little sin of gluttony!Yum!Hope you'll find it very tasty and satisfying, and remeber to not overdo with this!!!My little gluttons! ;)From Italy, this is Filippo.Have sweet days.Ciao a tutti! :D\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [1, 0, 3, 2]\nD: [1, 3, 2, 0]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_195_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_195_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_195_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_195_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 2, 3]\nD: [1, 3, 0, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. First combine the dry ingredients and mix them well. 
- add the flour - add the baking powder - add the salt - mix well Then cut off the appropriate amount of butter, and chop it into small pieces before adding it to the dry ingredients. Mix until the dough forms small clumps. Add milk slowly until the dough forms a ball. Dump out all of the dough onto the counter and roll out into a sheet. Use a cookie cutter to cut out biscuits. Bake for 6 minutes on an ungreased cookie sheet in a 450 degree Fahrenheit oven. I carefully rolled out each batch to 0.5 cm thickness and measured each product after baking to compare the three oils.. I baked each batch on an ungreased cookie sheet for 6 minutes in a 450 degree Fahrenheit oven on the middle rack.. After baking, I sliced one of the butter batch in half and measured the cross sectional thickness of the biscuit. The biscuit had been rolled out to 0.5 cm in thickness before baking. After baking the biscuit was 1.0 cm in thickness. The biscuit tasted fluffy and moist.. For a second batch, I replicated the same recipe with the same procedure and other ingredients but substituted olive oil for the butter. I found it easier to mix this batch since the oil was a liquid rather than a hard solid like the butter. The dough was drier and crumblier than the butter dough.. After baking, I sliced one of the olive oil batch in half and measured its thickness. Before baking these biscuits were 0.5 cm in thickness, after baking they were 0.75 cm in thickness.. Again I repeated the same procedure except this time I used coconut oil instead of butter. The coconut oil is a soft solid at room temperature so it was really easy to mix into a batter.. The biscuits made with coconut oil were flakier in texture, had a nice nutty flavor, and also had large gas pockets within the biscuit that increased their overall volume. This batch definitely looked fluffy when they came out of the oven. Before baking these biscuits were 0.5 cm in thickness. After baking they were 1.0 cm in thickness.. 
All three biscuits were tasty, but the texture and appearance of the coconut oil biscuit was the smoothest, fluffiest, and sweetest. In the photo of the cross sections of the biscuits, they are from left to right: butter, olive oil, and then coconut oil. Now I wonder what will happen if I switch to another type of flour? or add a sweetener to make desert biscuits? or spices to make them savory?\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 2, 3]\nD: [1, 3, 0, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_196_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_196_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_196_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_196_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 1, 3, 2]\nD: [3, 0, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need:Plain flour an egg or two Milk Salt Oil - sunflower or canola (rapeseed) Curry powder (controversial, I know)...yes, not really many ingredients at all...which begs the question: what do they put in Yorkshire pudding packet mix if you still need to add an egg and milk???. You will need:dessert spoon to measure the flour with (...you will see scales in one image, but these are not necessary, I was calibrating my spoon!!!) tea spoon (5ml) fork, to beat out the lumps whisk, to whip air in mixing bowl or jug (I've tried both and the bowl wins hands-down!) sieve for the flour Yorkshire pudding/cup-cake/bun tin to cook them in (I'm making individual ones here). 
Place the sieve over your mixing bowl.For 12 good-sized individual puddings you will need 4 heaped tablespoons of flour (slightly over 4oz, 125g if you want the security of using scales!)Then add about 1 teaspoon of curry powder - if you have it - if not you can use paprika, garlic granules/powder, onion salt, or whatever you fancy.Add a pinch of salt ...well no more that 1/4 tea-spoon (if you are using onion or garlic salt you may not need any further added salt!)Now sift this through the sieve into the bowl - use the tablespoon to break up any lumps of spice or flavouring.Make a well in the centre and crack a large egg into this (or two small ones). Then add about 1/4 pint (5 fl oz, 150ml) milk and mix it together with the fork and work it well to get all the lumps out. Then beat well to get some air into the mix. Then put it into the refrigerator for about half an hour. So... you will have worked out that if you want your puddings ready with the rest of the meal you want to make them about 1 hour in advance.... First, prepare your tin ... place a small amount of oil in each cup then use something to thoroughly coat the inside of each compartment, I use a pastry brush, but grease proof paper could also be used. Make sure there is a small puddle in the bottom of each cup.Wind the temperature of the oven up to 200C (400F) and put the tin in to heat up.... but not for too long ...you don't want to spoil your meal, you just need that tin to be really hot!Get your batter out of the fridge and whip in about another 1/4 pint of milk and a tea spoon of oil. Now whip it again to get lots of air into it ... loads of small bubbles will cover the top like a foam, stir them into the batter then whip again.Now quickly take the hot tin from the oven, pour the batter into the cups and get it back into the oven. Give it a minute or two to warm up then turn the oven back down to the cooking temperature of your other dishes.The puddings will want between 25 and 35 minutes... 
please see the next step for more details.... So... how do you like your puddings? Do you like them the traditional pudding texture (soft and creamy) or do you like them crisp and brittle? Or crisp, but a bit chewy? Well, as you can see from the images, I've been calibrating the process. The more batter you put in each cup, the more moist/soft each pudding will be. The longer you cook them the more moisture you drive out so the crisper they will be.In the first two images you can see I have filled the left-most row to the brim, the next row are half full then the third row were about 1/4 full. In the oven at 185C (362F) each row rose up, the left-most over the top of the cups. I gave that tin 25 minutes only. The result was that the left-most row were soft and melting, the middle crisp and chewy the third row were really crisp, but not brittle-crisp. The fifth image shows the result of filling the cups completely and giving them 35 minutes at 190C (375F), again they rise up, however do not collapse when the tin comes out of the oven. So, to get really crisp and crunchy \"puddings\" like my kids love you need to only fill each cup 1/4 full, then cook them at 190C (375F) for 35 minutes... but beware you don't want to be removing the joint from the oven to stand too soon in the cooking cycle... so what you could do is take the meat from the oven, cover it and keep it warm to let it stand and go juicy, while you cook your Yorkshire Puddings to perfection in a hot oven.Enjoy!Later Note: A contributor from Yorkshire has kindly pointed out that traditional Yorkshire Puddings rise to 3 times their volume and are light and fluffy... unfortunately an aspiration a lot of restaurants fail to meet in my experience! To get them to rise that much you need to be whipping plenty of air into the batter, and not removing them from the oven until they have risen and set. One egg can absorb up to 1 pint of milk (just look a quiche!!!) 
so do not worry if you add \"too much\" milk; your puddings will be more fluffy than if you stick to just 1/2 pint for the crispy version.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 1, 3, 2]\nD: [3, 0, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_197_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_197_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_197_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_197_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [1, 3, 0, 2]\nD: [3, 0, 1, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Add enough curd to soak the sago for at least 6 hours.( Approximately 1/3 cup of sago requires 1/4 cup curd)After 6 hours the sago pearls have absorbed the curd. It becomes soft and almost doubles in size.(Soaking the sago for about 6 hours is very important or else they will burst in the oil while cooking). Take a mixing bowl and add the soaked sago to it.Add three cups of rice flour to sago.Add as much red chili powder and salt as you like, depending on your taste peference.. After adding the spices, mix everything. Add two tablespoons of hot oil and a small amount of water, to make a nonstick dough.The dough consistency should be such that it can easily pass through a murukku press.. Now take a murukku press or any other type of press and grease it with a little oil. Place a star shaped disc inside it.Take a small portion of dough and fill the murukku press with it.. 
Fry the dough slowly in batches by pressing it through the murukku press, till they turn a little brown.Enjoy the sago murukkus with some tea as an evening snack.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [1, 3, 0, 2]\nD: [3, 0, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_198_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_198_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_198_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_198_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_ordering", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [2, 3, 0, 1]\nD: [0, 1, 3, 2]", + "question": "What is the correct order of the images?", + "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Cake\n2 lbs 80/20 Ground Chuck\n2 eggs\n1 cup breadcrumbs\n1 cup chopped onions\n1 carrot shredded\n4 grape tomatoes\n1/2 cup cheddar cheese\n2 tablespoons\u00a0Worcestershire\u00a0sauce\n1 Teaspoon Salt\n1 Teaspoon Pepper\nFrosting\n1 packet Brown Gravy\n1/8 teaspoon corn starch\n3 large potatoes (although 2 would have been enough)\n1/4 cup milk\n6 grape tomatoes. First things first. You want to get the potatoes cooking, those will take the longest\u00a0guaranteed.\nStart by washing them off. Give them a rinse, maybe a little scrub with a brush if you have one. We're going to be pealing them but there is no reason to risk getting anything in your food. \"Cleanliness is next to godliness\", is more true than ever in the kitchen.\nOnce they're washed go ahead and peel them, then chop them up. Try to keep them all about the same size so they cook evenly. Dump the chopped potatoes in a pot, fill with water till their covered then put on the lid and set them on the stove. Medium should be enough. 
Now don't forget about these, we'll use them later. And that pot might boil over depending how big it is. I\u00a0accidentally\u00a0grabbed one a size too small if you ask me.. Meatloaf is actually a very straightforward process.\nTake all the ingredients and put them in a bowl, mix by hand.\nNow to be more specific. I started by opening up the ground chuck and putting it in the bowl. On top of that I shredded the carrot, added the onions, and half of the breadcrumbs. I took some of the grape tomatoes and chopped them up, cleaning out as much of the seeds as i could. Just give them a little squeeze and pinch off the pulp and seed. I added the tomatoes for texture, but I wasn't after crunchy. I gave it a little mix, added the tomatoes, the rest of the breadcrumbs, the cheese, salt, pepper, and\u00a0Worcestershire\u00a0sauce. I just broke it up a little to make sure it was a bit more even all the way through.\u00a0. Now we're getting to the second long stage of the process. I took two cake pans and buttered the bottom and the sides. After that I poured in some of the breadcrumbs and shook them around until the pan was covered. They stick to the butter. This gives the meatloaf a shape and surface thats a little more uniform and cake like. Divide the meat from the bowl in half and press half into each pan, shaking some more breadcrumbs on top.\nThese two go into a 350 degree oven for half an hour. I know regular meatloaf tend to cook a bit longer, but because the surface to mass ratio is much larger than an average loaf, meaning they're much thinner so the heat doesn't need to penetrate as far, I cut the time.. Remember those potatoes? This is the perfect time to start fiddling with them again. If they are fork tender, meaning in the pot, you can stab them with a fork and not encounter much resistance, go ahead and take them off the heat. Drain out the water and then mash the potatoes. Once they're broken down fairly well add a little bit of milk. 
The idea is to have a very creamy texture. If you have an immersion blender give them a buzz. Keep in mind we're aiming for creamy, not soupy, so keep the moisture level down as much as possible. Add just enough milk to get them soft and smooth. \u00a0\nNow if you still have some time before your giant hamburgers come out of the oven it'd be a good time to get your gravy done. Just follow the directions on the brown gravy package although once its finished stir in an extra 1/8 of a teaspoon of corn starch. This is a thickening agent and with how we need the gravy to behave we're gonna need it.. This part things went really fast. I misjudged my time slightly, and was also at a complete and utter loss of what the heck I was doing. I have never decorated cakes before, but I am an avid fan of the Food Network. Here's how it went for me\nWhen teh patties are done pull them out of the oven, giving them a little time to cool. Take one and place it on the plate or platter you're going to use. Then take a scoop of the mashed potatoes and spread it out on top of the patty, like a layer of frosting in a double decker cake. Place the other patty on top of the mashed potatoes. Even with everything centered and made in circular pans they probably don't match up perfectly. Take a knife and go around the edge, trimming them down so they're even. After that pour the gravy on top, and using a rubber spatula to spread it out best you can. It will adhere fairly well but its not an exact science in my experience.\u00a0\nOnce the gravy is on the meatloaf, put the mashed potatoes in a zip top bag. Compress them down into a corner and give the top a twist. With a pair of scissors snip off the corner and you have a make shift piping bag. Put a few dollops of the potatoes on the cake, and then take some more grape tomatoes and put them on top of the potatoes for a frosting and cherry look. 
Make sure you have that one candle sitting in the middle, and when you light it up remember your favorite companion cube.\nEnjoy fooling your friends with this unconventional cake.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [2, 3, 0, 1]\nD: [0, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/visual_ordering/visual_ordering_199_0.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_199_1.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_199_2.jpg", + "../MMIU-Benchmark/visual_ordering/visual_ordering_199_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. \n Fundamentally all you need is:EggplantTomatoesMozzarellaOlive OilSaltPepperYou can also use something to season the olive oil. I had some basil from the farmer's market I needed to use up, so I decided to to add basil and garlic. Italian seasoning would also work very well.Pay no mind to the chicken in the picture - I used it in my meal but not in this dish, so I don't know why I put it in the picture.. You will be basting your eggplant and tomatoes with olive oil. You can use plain olive oil if you want, but it'll be more flavorful with herbs mixed in. If you're gonna mix things in, do it now so the flavors can mingle awhile while you prep the rest of the ingredients.\nI had some basil left over from the farmer's market, so I used that and some minced garlic. This would also be good with your favorite blend of Italian seasoning.. Cut the ends off of your eggplant and slice it the long way. Make sure that you cut all of the slices the same thickness so that they all cook evenly. 
Mine are about 1/4\" thick, and I wish I had a mandolin slicer so they'd be perfectly even.\nI think it would make a nicer presentation cut into rounds, so you can do that if you want, but it's easier to grill when it's sliced the long way\nEggplant (especially large ones like this one) can be bitter, so to prevent that you need to draw out the juices. Place your eggplant slices on paper towels and salt them generously on both sides. Then, place another layer or two of paper towel on top. Let them sit, and if the paper towels are too saturated, switch them out. . Slice your mozzarella into thin peices (about 1/8\" of an inch thick). If you are using a simple grilled tomato like I did, slice it into hearty pieces. I had two tomatoes, and cut them into 5 big slices with a little bit extra.If you are feeling more ambitious of having company over I would deffinately reccomend roasting your tomatoes using Canida's instructable. It would also work well with a fresh tomato sauce.. You've got extra tomato, plenty of sliced mozzarella, and some flavored olive oil. Make yourself a little appetizer!. Preheat your grill.\nAdd some salt and pepper to your tomatoes, and some pepper to your eggplant (it's already salted). Baste one side with oil.\nPut your veggies oil side down on the grill and cook for three minutes with the cover on. \nNote: you only see eggplant in the photo of this step, but I should have cooked my tomato longer so I would put it on at the same time as the eggplant in the future.. Baste the other side of the veggies and flip them over, then cook them another 3 minutes.\nFeel free to re-baste if you wish. When I flipped the eggplant I added my tomatoes to the grill. I cooked them 1 1/2 minutes on each side, but I think that I should have cooked it a little longer.. Flip the eggplant over again. Place the tomato on top (or sauce if you prefer), and the mozzarella on top of that. Cook it for 1-2 minutes with the cheese to get a little melty.. 
Your eggplant is ready to serve! I served this as a side dish with grilled chicken, but it would also be delicious with a lightly sauced pasta or a smaller grain like couscous. It would also be yummy in a sandwich.\nEnjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_0_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_0_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_0_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_0_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Make sure you have everything you need before you start. There is nothing worse that having to stop in the middle of the process to get something you forgot.Ingredients:\n1 cup unsalted butter, softened\n1 cup brown sugar\n1 cup white sugar\n2 teaspoons vanilla extract\n2 tablespoons milk\n2 eggs, lightly beaten\n2 cups flour\n1 teaspoon salt\n1 teaspoon baking soda\n1 teaspoon baking powder\n2-1/2 cups old-fashioned oats (not instant)\n12 ounces semisweet chocolate chips \n1-1/2 cups chopped walnuts Tools:\nmeasuring cups\nspoon\nspatula\nlarge mixing bowl\nsmaller mixing bowl\nstand mixer (if you have one). Put the butter out on the counter about an hour before you start. The butter is ready when you can push a dent into it with a finger. \nCutting the butter into half inch pieces can speed up the thawing\nIf you forget of can not wait the butter can be placed into the microwave. \nStart with 10 seconds until the butter is soft.. Pour 1 cup brown sugar and 1 cup white sugar. 
Mix the sugar together before adding other ingredients. \nOnce the sugar is mixed add 1 teaspoon salt, 1 teaspoon baking soda, and 1 teaspoon baking powder. \nMix the ingredients again and then add 2 cups of flour. Mix the dry ingredients one last time and set them aside.\u00a0 . Crack open and lightly whisk two eggs. Then add 2 teaspoons vanilla extract\n2 tablespoons milk. Lightly whisk again and set the wet ingredients. . When mixing all of the ingredients together pour 1/4 of a cup of the dry mix into the bowl. Next put the butter in the bowl. Pour the rest of the dry mix into the bowl followed by the wet ingredients. Then mix the ingredients together until they form a dough.\u00a0 \nMake sure to scrape the side of the bowl making sure to mix all of the Ingredients into the dough.. Measure out 2 and 1/2 cups old-fashioned oats. Do not use instant oats. Pour the oats into the center of the dough. Then mix the oats into the dough.. Measure out 1 and 1/2 cups of chopped walnuts. Pour the walnuts onto a chopping board. Chop walnuts into smaller pieces. Pour the walnuts into the center of the dough. Then mix the walnuts into the dough.\nA good way to tell if the walnuts are chopped enough is to pass them through the cutting board handle. If the walnuts do not fit then they are not chopped enough.\u00a0 \n\u00a0 \u00a0. The last ingredient to mix into the dough is the chocolate chips. Measure out 12 ounces semisweet chocolate chips. Pour the chocolate chips into the center of the dough. Mix the dough until the chips are incorporated into the dough.\nOne bag of chocolate chips in a supermarket is usually 12 ounces so you can just pour the whole bag in. . Once the all of the ingredients have been mixed into the dough scrape the dough off the sides to make sure everything has been mixed. Then place the dough into the refrigerator for 1 to 24 hours. \n24 hours is ideal but the even after 1 hour the dough should be firm enough to put on the cooking trays.. 
Preheat the oven to 350 degrees. Well the oven is heating up take the dough out of the refrigerator. Take a table spoon and begin scooping out the dough and placing the dough on baking trays. \nFor best results place dough balls at least 1 inch apart from each other. \u00a0 . Once the oven reaches 350 degrees place the baking trays into the ovens. Set a timer for 10 to 12 minutes. \nAfter placing the baking trays into the oven set up the cooling trays to place the cookies on when they are done baking.. When the timer goes off remove the trays from the oven and allow them to sit for about 2 minutes. After 2 minutes remove the cookies from the baking trays and place the cookies on the cooling rack.. When the cookies cool it's time to eat.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_1_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_1_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_1_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_1_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Tools:Small hammerDrill1/8\" nail3/8 drill bit (wood bit works best)Round fileVice (or other flat hammering surface)Jar supplies:500ml mason jars with 70mm size opening Colored drinking straws Approximate Costs:A dozen 500ml/70mm jars are about $8Straws are about $2Four Handled 500ml/70mm jars are about $8*Always wear eye protection when using tools. Drill a hole in each lid. Slightly off-center is best. Use the drill at high speed but push softly for the cleanest hole. 
Pushing too hard and fast will cause the hole to be jagged and mangled.. Use the round file to file down any rough edges or burrs. Insert the file and at an angle, run the file around the hole, rotating the file as you do so.. On the vice/hammering surface, hammer the hole a few times to further smooth the hole edges. You should be able to run your finger over the hole and it should be free of any edges.. Repeat this for all your jars.. If you mess up any if the lids, you can buy replacement tops. Also be sure to empty out the jars of any drill shavings. Please wash every jar for safety and sanitary reasons.. For a tropical theme, add an umbrella hole!Use a 1/8\" diameter nail to add an umbrella hole opposite the straw hole. Be sure to also flatten the back side of the lid with a hammer to eliminate any sharp edges.. After the jars are all complete, insert your straws and they're ready to go!You can decorate the jars with clear labels that you can print on with your inkjet printer at home. Or use stickers or even spray paint them with a stencil!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_2_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_2_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_2_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_2_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Fresh asparagus Seasonings (I use garlic salt)Extra virgin olive oilZiplock bag. Cut the base off of each asparagus stalk at a 45 degree angle. 
This part of the stalk is hard and not good to eat. It's usually white or brownish purple.Once cut, place them in a large ziplock bag. I use the gallon size. After they're all in the bag, pour some extra virgin olive oil. I don't measure the oil. Pour enough to coat all the asparagus. Seal the bag and shake/mix until all stalks are coated. Let it sit a bit. Every now and then shake up again. When I grill, prepping the asparagus is the first thing I do. That way it can absorb the oil and get nicely coated while grilling the rest of the food.. When grilling, asparagus is the last thing you should grill. You want to eat it as soon as you can after removed from the grill. It's best hot. Tonight I grilled my pork chops first. Then I grilled the asparagus and other veggies.Put the grill on medium heat and use tongs to place the asparagus on the grill. Be careful because the oil is flammable. Make sure to place the asparagus at an angle so it doesn't fall through the grate. The object is to get each stalk browned but not burnt.I close the lid and let them cook for five minutes. After five or so minutes, move the asparagus around so it cooks evenly. It easily rolls around so there is no need to actually flip it. At this point I usually let it go for another few minutes. If you use higher heat then make sure to watch and tend the asparagus to prevent burning. When it's browned/seared to your preference remove from the grill.. Eat as soon as you can for optimum deliciousness.Thanks for reading. 
Roll Tide!!!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_3_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_3_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_3_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_3_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Ingredients:\n\t\tFlour\n\t\tCanned biscuits\u00a0\n\t\t2 servings Fruit roll ups\nPlease note: \u00a0You will need 2 biscuits per serving to figure out how many cans you will be baking. \u00a0. Tools:\n\t\tDough board or cutting board\u00a0optional\u00a0\n\t\tKitchen scissors\n\t\tAltoids can for the Tic-tac-to\n\t\tSkewer\n\t\tSeveral different sizes and shapes of cookie cutters\n\t\tBaking sheet\n\t\tKnife. Instructions:\n\t\tPre-heat oven following the instructions on the can.\n\t\tFlour dough board.\nNote: If making these for the first time I would lower the baking temp and baking time because the smaller shapes bake quicker and you might want to remove them sooner.\n . Cutting the shapes for the\u00a0puzzle:\n\t\tLay the biscuit on the floured surface and pat flat or use a rolling pin.\n\t\tCut out the shapes with the cookie cutter.\n\t\tBefore removing the cookie cutter move it gently back and forth and side to side to make the opening a little larger but still keeping the shape.\nPlease note: I baked the shapes separate from the biscuit, However I think it might be best to keep the shapes in the biscuit after they are cut and wiggled. This might make it work slightly better. 
I noticed some of the shapes did not fit as nicely but they still worked.\u00a0\nIf they cook together for some reason you could use a knife, \u00a0and gently trim them to remove from the biscuit. The next time I make these I will try baking half with the shapes in and the remaining half with the shapes out.\u00a0\n\u00a0\n . Cutting the shapes:\n\t\tYou will need to flatten 1 biscuit for the board and leave it uncut.\n\t\tUse\u00a0another biscuit for the stars and hearts.\u00a0\n\t\tYou will need 2 biscuits per serving.\u00a0\n\t\tLay the biscuit on the floured surface and pat flat or use a rolling pin.\n\t\tCut out the shapes with the cookie cutter. \u00a0You will need 2 different shaped mini cutters.\n\t\tPlace the entire biscuit with the shape intact on a cookie sheet.\n\t\tBake the biscuits according to the instructions in step 3.\n\t\tI would bake \u00a0them on a slightly lower heat and also shorten the baking time, so you can keep an eye on them and remove them from the oven as soon as they are done.\n\t\tRemove from the oven.\n\t\tAllow to cool just enough not to burn your fingers.\n\t\tCarefully using the skewer push the shape through the biscuit.\u00a0 \u00a0\nBake the biscuits according to the instructions in step 3.\n\u00a0. Cutting the fruit roll ups:\n\t\tTrim 5 thin strips from the fruit roll up.\n\t\tPut one strip on the wax paper the fruit roll up came on.\n\t\tRoll it on the paper until all of the strip is covered with the paper.\n\t\tRepeat this with the rest of the strips.\n\t\tRoll it into a long small roll. \u00a0\n\t\tFold it over and place it in the Altoids can.\n\t\tPlace the shapes in the can.\n\t\tThe kids set up the board and then play the game!. The kids will enjoy decorating the board and playing the game. \u00a0The younger kids will have fun putting the puzzle together. \u00a0The cool thing about these are they are not messy at all, until they are ready to eat. 
\u00a0I hope this tutorial has motivated you to dig out the biscuits and start baking. \u00a0I had a lot of fun baking them. \u00a0\nThank you for stopping by and do visit again! \u00a0I am always tinkering here at Instructable's \u00a0and will have new tutorials in the near future.\nTake care and have a\u00a0stupendous\u00a0day! \u00a0\nSunshiine\u00a0\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_4_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_4_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_4_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_4_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. As mentioned, some are optional. See notes on each.The Dry300g (2.7cups) White Self Raising flour. We have also had success with standard wholemeal flour, but if you don't use self raising add some baking powder (1-2 teaspoons) so it rises.80g (0.7cups) Coco/carob powder. This can be 100% coco powder (let's call that be beginner level cake). We like the additional complex complex flavour of carob so use about 60g coco to 20g carob powder. 125-175g (0.6cups) Sugar. So we have been trying to reduce this (you know sugar is the lord of pain right?). Our experiments put a bit of a limit on it at 100g, after which people don't tend to identify it as cake and there are structural issues (though I still like it). So yeah 140g or more of sugar for the beginners cake, we went with 100g.2 tsp bi-carbonate of soda and a pinch of salt. To help with raising. 
30g Dessicated coconut. Optional - make for interesting textural addition. They act like binding fibres.Small handful of cocoa nibs. Optional, can also use nuts or just leave it out.The Wet100ml (0.4cups) Whole organic milk. Lets not start the pasteurisation debate now.3 free range eggs.350ml (1.4cups) melted coconut oil. You can also use vegetable oil (but not sunflower oil, it tastes funny) but it's different A dash (1tsp) of vinila. Optional, not necessary.The Ambiguously Wet-dryOne average sized courgette. About 400g in weight. One fiery chilli pepper, or some chilli powder to taste - this is optional if you hate spicy food leave it out, no probs. Approximately 30 ripe blackberries. Another optional ingredient, You could also use blueberries (though I don't see the point as blueberries always seem to get lost in cakes and muffins, why are they such a popular choice???) or my favourite choice raspberries, or some other zingy fruit. Raisins work well too. . Two mixing vesselsA scales - or some cups or whatever witchcraft unit measuring system you go by.Measuring jug for the milkA cake tin (or muffin cases - the same mix can make muffins). Oven Whisk. Concrete is easier to mix when you mix the ballast and cement dry before adding the water (well unless it is all very dry, then the dust is unmanageable). Anyway, same principal, mixing the dry ingredients first means no clumpying so you can get them together with ease. Weigh them out. Put them in your big mixing bowl. If you are being pro you can sieve them in. Keen enthusiasts just shove it in there, it's just the same ;) . Melt the coconut oil (put the jar in hot water for a while) if it is solid, and mix it up with the milk, 3 eggs and vanilla.A whisk can be used, as can a wide variety of food processors and diy drill powered devices. A lowly fork will do fine too. . Time to get your grating freak on! We use the course side of the greater to great the courgette on top of the dry mix. 
Then the fine side to do the fresh chilli (you could also use chilli powder - which also works great for baking).Be sure to scrape the inside of the greater to get it all out, most of the chilli will cling there. Then rinse your hands before you touch eyes or nose with chilli hand.Add your chosen fruit - in our case blackberries. . We are going to mix all the mixes by adding the wet mix to the dry and stirring it up. It shouldn't take long, and a sturdy spoon is all you need. Now check your consistency. It is a fairly runny batter for a cake, think slightly too sloppy brick and block mortar. a viscosity similar to thick pouring cream, or quite stiff vomit.Taste a little bit like a pro, and smile! (assuming your not the ocd hygiene type - in that case spit that back out!) . Grease proof paper and grease your cake tin. Observe illustrations. Avoid using petrochemical derived grease - coconut oil works well. . Time to reap the rewards of your mixing. Pour the mix into the tin! Lick the mixing bowl clean...Put the cake in the pre heated oven at 180C, 357F, gas mark 4.Set an alarm for one hour (or less - see next step). . If you oven is like ours, and the chances are it's not, check the cake after one hour and follow testing procedure. If its a fan assisted oven, turn the fan off. If you can't turn it off, check the cake after 45 minutes instead of an hour. If you went for the muffin option, they will be done a lot quicker, check after 30 mins.Testing procedure:Don ppe (oven gloves), stand back and open oven. Hold face away from initial vigorous gas exhausting. After initial off-gassing peek inside. Shake the cake tin with with oven glove. If it wibbles like jelly, leave it in the oven and re-start the checking procedure in 20mins.If it isn't over wibbly remove and place on heat resistant surface and continue with prod test.Prod the centre of the cake with wooden prodder (or sharp knife), if it comes away clean, it is done, leave to cool. 
If uncooked cake mix remains on your prodder replace cake in oven and repeat prod test in 10mins.If the surface shows signs of burning, remove cake from oven and consult a qualified professional. Steaming is normal, but if on fire and visible flames can be observed then call for assistance and raise the alarm. . Leave to cool, The longer you leave the more likely the cake will retain a fine and sturdy structure when you remove the tin. That said, if you are getting desperate and this whole affair is taking longer that expected, it is edible as soon as you can bear the temperature.Once cooled, test a portion of the cake to check it for taste and yummyness. Be sure to have a representative cross section. If deemed suitable, serve delicious chocolate cake to friends and family.I recommend telling them it is courgette chocolate cake after they have tried some as imprecise expectations of the taste can colour participants enjoyment.See photos for some serving suggestions, and please do add your own pics and modifications below. We'd love to hear about your twists on this one. \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_5_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_5_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_5_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_5_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 
You will need: - A dog whose birthday it is, or a human who likes dog food - A dish the size of the desired cake - A banana - Some kibble - Dry oats - Mayonnaise - Food colouring - Wet dog food - Various kitchen tools. Grind up your kibbles. The amount will differ depending on the size of your cake, aim for about 1/3 of the final cake-volume. Here I used a magic bullet. But, any blender will work. If you don't have a blender, try a mortar and pestle, or use something heavy to smash them into powder. Get them as fine as you can - we are trying to mimic wheat flour.. Add some wet ingredients to hold it together. I used a banana because my dog likes bananas. If yours doesn't experiment with a few sticky things until you find something they like. Honey, peanut butter, molasses perhaps. Add just enough to hold everything together. You will play with the amount later. Add the wet dog food. Use about as much by volume as your dog-food flour.. You will need to balance the texture to make sure it stays up. Add a little milk or water if it is too thick. Add your oats, or more kibble flour if it is too runny. In the end, your batter should be about the consistency of play-dough. It should be firm, and hold its shape without assistance.\u00a0. Oil the inside of your cake pan and press your batter into it. Place the pan upside down on a plate, and tap the cake out.. Mix mayonnaise with your choice of food colouring to create designs on your cake. Use white or coloured icing as a base frosting.. 
Grab your mutt and celebrate!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_6_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_6_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_6_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_6_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Here's the basic recipe.\u00a0 Start by making this:\n(this is essentially a double batch...it will yield about 70-75 cookies...depending on variety)\nYou will bake 375* F for 8-10 minutes...so go ahead and preheat the oven!\n3 sticks of Butter (1 1/2 Cups)\n1 1/2 Cup Brown Sugar\n1 Cup granulated Sugar\nMix and cream\nThen add:\n3 eggs\n2 tsp Vanilla\nmix together\nThen add:\n1 tsp Baking Soda\n1 tsp Salt\n2 Cups Oats\n4 Cups Flour\nMix together. Here's what we're going to do.\nAdd 2 full bags of chocolate chips.\nI used the top 2...dark chocolate and mini's...the melts were used later.\nThese are loaded.\u00a0 But essentially it is a double batch...so 2 bags is just right!. Split up the dough into 5 bowls.\nEach bowl will make a different variety of chocolate Overlord cookies!\n(yes, these aren't just overload cookies...they are Overlord Cookies). First:CHOCOLATE CHIP COOKIES!\nRoll out balls of dough about the size of a ping pong ball.\nWe did 15 per tray. 
Tray is lightly greased...but I don't think it needs it.\nBake 8-10 minutes at 375 F.\u00a0 I do 8 minutes\nThen remove them and allow them at least 5 minutes before touching them!\nThey need to completely finish cooking...they will be gooey until they cool.\n(don't judge my pans...if you know how to clean them up perfectly...\ncome over and clean them, I will give you cookies!). Next we add some fun!. SMORE'S COOKIES!\nMake a tray of regular cookies.\u00a0 Bake 8 minutes\nPull out of oven and while gooey, place 3 marshmallows on\ntop with one baking melt chocolate disk for looks!\nThen pop them under the BROILER for just a minute or\ntwo until the marshmallows are toasted!\nGolden Perfection!. COOKIES AND CREAM\nStart with your cookie dough and oreo cookies...\nwrap an oreo completely in a \"thin\" layer of cookie dough, covering it completely!\nThese turn out quite large!\u00a0 We fit 8 on one pan.\nThey bake up perfectly with all that oreo goodness inside!\nThese were way better and bigger than I expected!. SWEET AND SALTY\nTake the Chocolate Chip cookie dough and add broken up pretzel sticks to it!\nMakes a sweet and salty awesome flavor!\nRoll out and bake the same as the regular cookies!. TURTLES\nBake a batch of regular cookies, like the smore's ones.\nPull out after 8-10 minutes and lightly press a pecan or two on top.\nThen drizzle with caramel topping!\nLet cool at least 5 minutes before plating!. Then plate up all your gourmet cookies!\nAdd some little name sticks so your guests know what they are getting into!\nOkay, so yes...you did the math right.\n15 cookies of each variety except the Cookies 'n Cream...only 8 of them\nGrand total: 68 cookies!\nAwesome spread for 1 simple batch of cookies!\nIn a blind taste test...the 8 year old and 10 year old loved the\nCookies and Cream the best!\nFollowed closely by the Smore's!\u00a0 :). 
Best part about these cookies is they FREEZE!\nThe dough freezes, the cookies freeze...you don't have to eat them all in one night!\u00a0 And they taste good frozen!\nNow you can have a party spread with only the time spent making a batch of cookies!!!\nThanks for viewing, which one do you think you'd like the most???\nVote for me in the Cookie Contest...I'll make you some cookies!\u00a0 :)\nCheck out my blog for more silliness!\u00a0\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_7_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_7_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_7_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_7_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 8 Hardboiled eggs- peeled ( you can buy peeled hard boiled egg from the store)Tomatoes - 2 Nos medium sizeFrozen or fresh Peas- 1.5 cupsOnion- 1large Ginger- 2 small pieces about one inch eachGarlic- 6/8 large onesCorriender Leaf- 1/4 cup to garnish at the endSpicesPaprika Or Kashmir Red chili( For color) or adobe - 1 table spoon Cayenne - Depends on taste Salt - on taste clove-4-6 Cinnamon-1 half inch or 1/4th teaspoon Cumin seeds- 1 table spoon Corriender Powder - 1and 1/2 table spoon. Garam Masala- 1 teaspoon or according to taste Cardemom - 2 pod or one pinch of you are using powder.(optional) You can get these spices at any Indian Grocery store or in the international food isle of big grocery stores carry it . . 
First think first remember to enjoy your cooking and there is no rules for cooking. You improve as you keep trying. Keep experimenting. If you don\u2019t like corrender add parsley, if that is what you fancy. In case you dont like peas add edmame. The most important lesson is to enjoy the journey rather than worring about the end destination. Feel like a chef, put on some music, open a bottle of wine and cook by yourself or with your friend. Furthermore, cooking is like meditation, a form of mindfulness. It will relax you and take you away form the daily stress.. 1. First Hard boil the egg. - One can do that by boiling eggs in a large pot of hot water for 15 to 20 minutes. Let it cool and then peel the skin off. Store bought hard boiled are easy and quick as they are even peeled.2. Now take these egg and slit them gently along the side from top to bottom in the center about one inch slit. do it on all 4 side. Don\u2019t worry of it is tough and you egg broke into half. Just get a fork and make whole all along the egg.3. Now heat oil and add all the eggs and REMEMBER to cover it with the lid. With the lid closed hold the handle and gently shake the pan so that the egg rotate. Do this for couple of minute till the eggs turn light brown about 3 to 5 minutes. Keep the eggs in medium heat also u will hear lot of spluttering sound which is ok. Switch off the heat and keep the lid on for few minutes.4. Now remove the lid and take the eggs out in a plate. Save the oil.Alternatively, u can skip frying the egg if you find it dificult. just take the egg poke holes on it with fork and keep it aside and follow steps to make the gravy . 1. In any blender puree all the onion, ginger, garlic, and all the spices except salt into a nice paste. Add little water if needed to make a fine paste.2. Now add the puree in the pan with the remaining oil. Turn the heat on and fry the paste for about 10 minutes in medium heat. At this point you will notice that the paste is leaving slight oil. 
3. Puree the tomato and add it to the paste along with the salt.. 4. Fry it for another 5 minutes. 5. Now add 1.5 cups if water to the paste and in the gravy(if needed add more water). Then add peas and eggs. 6. Bring it to a boil. Then lower the heat, cover the lid stirring every so often to check the gravy. If the gravy is too watery increase the heat and remove the lid, let the water evaporate. 7. Once i the gravy comes to a thick consistency switch off the heat and ass finly chopped corriender leaves. Serve this with Naan bread or pittas. It tastes very nice over a bed of rice. \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_8_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_8_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_8_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_8_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Plug the blender in. Get a banana and peel it and then cut it up. Put it in the blender. Get an apple and cut it then put it in the blender. Secure the lid. Start mixing. Get a spoon stir it and put it in the cup. Leave it for a bit so it is not puffy. Put it in the fridge for about 30mins. 
Mmmmm lovely\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_9_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_9_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_9_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_9_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Chocolate sandwich cookies like Oreo or HydroxVanilla Ice Cream 2 pintsWhipped ToppingChocolate Syrup. Put sandwich cookies into a plastic bagCrush themSave 3/4 cup of mixture for toppingPour crushed cookies into the bottom of container. Let the ice cream softenScoop it out onto the cookie mixture crustSpread evenly over the top. Add the whipped topping to the top of the ice cream. Add the reserved cookie crumbs distributed evenly over the whipped topping.Drizzle chocolate syrup over the top. 
Freeze for at least one hour.It will be difficult to cut but will soften quickly in the heat.Running a knife through hot water before cutting will ease the process as well.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_10_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_10_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_10_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_10_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Only a few tools of the trade needed for this delicious dish.PlateButter KnifeCutting board or paper towel to put underneath the wrap (optional). The ingredients you will need include the following:A soft Burrito tortilla NutellaHoney. With the butter knife, spread Nutella all over a flat, round tortilla. Get out your honey and drizzle it out all over the nutella. How much do you say? All depends on your preference.. Roll your masterpiece like a Burrito!. Serve It and Eat It!I hope you enjoyed this tutorial on how to make a Nutella and Honey Wrap Burrito! This process might get a little messy so I would advise you to put a cutting board, a paper towel, or a napkin underneath before you make it. One Nutella and Honey Wrap Burrito will serve on person each, so if you are making them for a party I suggest making a few of them. 
Stay tuned for more Instructables by the DIS DemonsBe sure to follow the DIS Demons @ MYWLAKETECH.Made by DIS DemonGa349140\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_11_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_11_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_11_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_11_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 1) 1/2 lb of bacon2) 2 white onions3) 2 tomatoes4) 2 Serrano peppers5) 2 quarts cooked pinto beans (see link to previous Instructable -> Pinto Bean Instructable )6) 4 cups water7) 1/2 cup of cilantro leaves8) 1 tbsp ground cumin9) Salt as needed. Cut the bacon into 1/2 inch strips. Cut the white onions into 1/2 in cubes. Cut the tomatoes into 1/2 inch cubes.Cut the Serrano peppers into 1/8 semi-circles. Brown the bacon in the pot. After the bacon is browned, add the onions and peppers into the pot. Cook until onions are translucent.. Add the tomatoes to the mixture and cook for another minute.. Add the 2 quarts of cooked pinto beans to the pot and 4 cups of water.. Add the 1 tbsp of ground cumin to the mixture.. Add 1/2 cup of fresh cilantro leaves to the mixture.. Even though the beans and the bacon have salt. I always check the salt of the beans at this point. The onion and tomatoes may dilute the seasoning of the entire mixture. I checked this batch and I had to add 1/2 tbsp of salt to get it to my preference.. Let the mixture simmer on low heat for 30 minutes.. 
Frijoles Rancheros are served as a side item to a main dish in a nice deep bowl.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_12_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_12_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_12_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_12_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. To make this scrumptious snack, you are going to need the following foods:Some kind of crackers: I highly recommend wheat Thins, as those have been my favorite out of all the crackers that I've tried so farCheese: I use cheddar cheese, and the only type of cheddar that I like with this is sharp cheddar cheese, so that's what I use and recommendPickled Jalapenos: They can be in a jar or in a can, I prefer them in a jar, but the ones I use were in a canAs a side note, I used a couple of mason jars to put the jalapenos in after I opened the can, so you might want a couple of those handy.. Now you just need to open up your can of jalapenos. Once it's open, then you can transfer the jalapenos into a mason jar for storing. This step is quite self explanatory... Once in the mason jar, you can refrigerate your jalapenos if you like.*This is unnecessary if you got jarred jalapenos to start off with... Its actually really completely optional, but recommended. Once again, another self explanatory step. All you really need to do is get your block of cheese, open it, and then cut a few slices from it. 
If you want, you can cut it into the size of the crackers. Once done cutting, you can easily store your cheese by getting a plastic sandwich bag, and just putting it over the open end of the block, then refrigerate your cheese.Oh... and uh, try not to cut yourself... it hurts. Now, for the good part... The construction of this delicacy. Get a cracker, and lay it downAdd a slice of cheese that is cut about the size of the cracker, and lay it on top of the crackerThe best Part!!!! ADD THE JALAPENOS!!!!You can also make a nice plate of this, just add some cheese, crackers, and a bowl of jalapenos to a plate and set it out for you and your friends, or... maybe just for you, either way, it's always good.... Now eat your amazing snack. In case you aren't too accustomed to a lot of spiciness, then you might want to have a glass of milk handy nearby... just saying. And if spicy isn't really your thing, well, umm, I recommend not eating this, you can go find another snack to munch on.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_13_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_13_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_13_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_13_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 
Ingredients for the Mushrooms' Body and Top\n\u00a0\u00a0\u00a0 1 cup butter (two sticks...yes, two sticks!)\n\u00a0\u00a0\u00a0 1/4 cup powdered sugar\n\u00a0\u00a0\u00a0 1/2 cup pecans (walnuts or hazelnuts work as well)\n\u00a0\u00a0\u00a0 1 3/4 cups all-purpose flour\n\u00a0\u00a0\u00a0 1 teaspoon pure vanilla extract\n\u00a0\u00a0\u00a0 1/4 teaspoon salt\n\u00a0\u00a0\u00a0 Pound cake (a frozen loaf is fine, that's what I used)\n\u00a0\u00a0\u00a0 Black decorator\u2019s gel\n\u00a0\u00a0\u00a0 Red and green food coloring\n\u00a0\u00a0\u00a0 White frosting or white fondantIngredients for Royal Icing (Using Egg Whites)\n\u00a0\u00a0\u00a0 1 3/4 cup powdered sugar\n\u00a0\u00a0\u00a0 1 teaspoon lemon juice\n\u00a0\u00a0\u00a0 1 egg white\nI used the above recipe, but if you have young 'uns in your household, you may want to try the below recipe for royal icing instead:Ingredients for Royal Icing (Using Meringue Powder)\n\u00a0 2 cups powdered sugar\n\u00a0 1 1/2 tablespoons meringue powder\n\u00a0 1 teaspoon lemon juice\n\u00a0 1/2 warm waterHot Tip: As with all things involving royal icing, there will be long drying times. If you\u2019d rather do these recipe in two parts (which is what I did), the top part of the cookie can be made the day before and stored in an air tight container. Then you can do all the frosting fun the following day.. To start off the cookies, beat together the softened butter and sugar until the mixture is fluffy and looks like mayo.\nOnce you have your bowl o'mayo, add the vanilla extract.\nNext, mix in the salt and flour until everything is well combined.\nAdd the pecans (or whatever nut makes your skirt fly up) and stir.\nWrap up the dough in plastic wrap and pop in the fridge for a hour or so for it to firm up. Cooling the dough isn\u2019t required, but it does make the ball forming a tiny bit easier. And isn't life hard enough to not have to fight with sticky dough?. 
Once you\u2019re ready to bake, preheat oven to 350\u00b0F.\nForm into one inch balls and place on a cookie sheet.\nWhile baking, they\u2019ll flatten out a tiny bit to give the mushroom top shape.\nBake for about 15 minutes, until the cookies are firm and the bottoms are golden brown. Allow the cookies to cool.. Creating the bottom part of the mushroom is easy!\nGrab a circular cookie cutter that\u2019s a little smaller than your baked cookies.\nTake a slice of pound cake that\u2019s about 1/2\u2033 thick. You'll get about a dozen and half slices out of your pound cake.\nIf you feel your energy lagging, feel free to test out one of the cookies you just baked! Delicious!. We have the shapes, now we need to decorate!\nWhen you\u2019re ready to make the royal icing, beat the egg white and the lemon juice together for a few minutes. Add the powdered sugar to the mixture until you get a thick icing. If it\u2019s too thick, add a few drops of water.\nLay down some tin foil and coat the sides of the pound cake pieces with the icing. If your icing is too thin, add a bit more powdered sugar to the icing mix.\nAllow the freshly iced pieces to dry for about an hour.. This next part gets quite fun and messy, so you\u2019ll want a lot of paper towels and a sink nearby!\nDivide the reminding icing into two bowls. Dye one green and one red.\nDip the cookies into the icing, making sure to get the tops fully colored. Place the iced cookies on a tin foil sheet. Don\u2019t worry about the icing run-off from the cookies, we\u2019ll take care of that next.\nAllow the cookies to sit for an hour.. Once the royal icing has set, move the cookies from their rather-messy location over to a clean sheet of tin foil. This way the icing run-off won\u2019t harden to the cookie. 
If the extra edge icing doesn\u2019t fall off the cookie when you pick it up, knock the edge of the cookie on the counter-top.\nThe cookies should sit for another 30 minutes to finish hardening, as the icing around the edge of the cookie will still be soft.\nTo finish the top of the mushroom, draw circles with white frosting or use fondant. I used tiny balls of fondant and pushed them onto the cookie.. To finish the bottom part of the mushroom, draw on eyes with black decorator\u2019s gel.\nWhew! Now to finally put them together!\nTake a bottom part of the mushroom, add a little frosting to glue the top part on, and then push on the cookie.. Makes about dozen and half delicious Super Mario Mushroom Cookies!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_14_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_14_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_14_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_14_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Here the ingredients:\nMilk\nChocolate Flavored Powder\nVanilla\nHere and the materials:\nCoffee cup\nSpoon (2)\nI c. measuring cup\nOnce you have that, you ready to move on!. Pour in 1 cup of milk into the measuring cup.\nThen, put it in the microwave for about 1 min. and 30 sec. You can put it in for longer if you choose.. Once you're done heat the milk, put about two drops of the 100% Pure Vanilla extract. Stir. Then put in two table spoons of the Chocolate Flavored Powder, stir. 
\nYou can add more Choclate powder if youwant but try not to add a lot of Vanilla, it can over power the entire drink.. The Pour the whole mix into the coffee cup and you're ready to relax, with your new favorite drink. \nYou can toy around with other ingredients. You can add cinnemon to get give a another different taste, but try to keep to to basics of the original recipe. \nEnjoy.\n~D\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_15_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_15_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_15_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_15_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. To properly roll the cookies into their ice cream cone shapes, you'll need a conic form that can withstand some time in a 400F oven.\nThe cookbook suggests a #35 4 1/2\" coronet mold, but since this was a one-off for a French Laundry themed party we decided to make our own out of paper.\nAfter some rummaging, I found a 4\" diameter circular object for tracing (the base of a pitcher) and made some circles on a manila folder. I also made one on a sheet of glossy paper, the thick stock used as the cover of an expensive yuppie magazine we magically get for free. Note that I'm NOT putting the glossy stuff into the oven for fear of toxic bleeding or outgassing; that's what the manila folder is for.\nDraw another circle on the glossy paper ~1/2\" outside the original circle, and add a tab. 
Now cut around the outside circle and inside of the 4\" circle to make a 4\" diameter stencil.\nCut out the manila circles; I used 5. These need to be shaped into cones for use as your forms, so you've got to get them nice and tight. I wanted to staple them into position, but they're too small to successfully staple. We also nixed glue, tape, and rubber bands as unable to stand up to oven conditions. Pinning sounded good in theory, but probably would have ended in tears. I finally ended up sewing them in place, which was surprisingly fast. The key is to pass the thread directly THROUGH the cone, then wrap around the flap as you prepare for your next pass. After three or so stabs across the cone, exit next to the original knot (you should have made a BIG knot, and left an inch or so of tail) and tie off with the tail. These worked beautifully, and looked sort of spooky. . Ingredients:\n1/4c + 3T all-purpose flou\n1T + 1t sugar\n1t kosher salt\n8T (1 stick) unsalted butter, soft but still cool\n2 large egg whites, cold\n2T black sesame seeds\nMix flour, sugar, and salt together. Separately, whisk butter until it's completely smooth; I used my Kitchenaid with the whisk attachment. Add egg whites to the dry ingredients, and mix thoroughly with a stiff spatula. Dump the egg mixture into the butter, and whisk until batter is creamy and without lumps.\nI don't have a picture of the bowl of pasty goo, so here's some of it in the stencil.. Get out your Silpat. If you don't have one, head to any kitchen store and shell out $15. Once you have a Silpat you'll find a million uses for it.\nPlace the stencil on the Silpat, and scoop some batter into the center. Use the sharp-edged spatula of your choice to spread the batter in an even layer over the stencil; scoop off any extra. If it's grossly uneven you'll get localized browning/burning. Don't leave any holes. Lift stencil and repeat. I did five coronets per sheet, which seemed like plenty. 
Also, I only had the patience to sew five molds- don't lay down more coronets than you have molds.\nSprinkle black sesame seeds over the top of each coronet.. Put the Silpat on a baking sheet, and transfer to your preheated 400F oven. Cook for 4-6 minutes, until the batter is just set and you can see the batter ripple a bit. They'll start sliding around on little melted-butter trails if your baking sheet isn't entirely flat, but this is easily fixable.\nPull the sheet out and sit it on the open oven door to keep warm while you work. Hold the top of your paper mold with your off hand, and use a tool to manipulate the coronet with your dominant hand. Be careful- the coronet is hot and greasy; you REALLY don't want to touch it directly. Roll the coronet around the mold as tightly as you can, and finish with the seam side down. Roll the other coronets and place them up against each other to prevent unrolling.\nPop the sheet of rolled coronets back into the oven for 3-4 minutes to set the seams and let them color up a bit. The French Laundry seems to make coronets that are entirely golden-brown, but I took mine out earlier for fear of burning. This worked just fine.\nLet the coronets cool/solidify on paper towels for a few minutes before removing the paper forms.. Ingredients:\n1T finely minced red onions\n1/2c creme fraiche\n1/4t kosher salt, or to taste\nfreshly ground white pepper to taste\nRinse red onions in a sieve under cold water, then dry on paper towels. Whisk creme fraiche in a small metal bowl for 30sec-1minute, or until it holds soft peaks when you lift the whisk. Fold in onions, then season with salt and pepper. Refrigerate until ready to serve, up to 6 hours.\nI never got the creme fraiche to reach soft peaks, so shoved it in the fridge and hoped for the best. It gets a bit more solid as it chills, but... not a lot. Also, wash more than 1T onions as some get lost in the sieve; measure the 1T off of the paper towels.. 
Ingredients:\n4oz sashimi-grade salmon fillet (belly preferred), skin and any pin bones removed and very finely minced\n3/4t extra virgin olive oil\n3/4t lemon oil (zest is a potential substitute)\n1 1/2t finely minced chives\n1 1/2t finely minced shallots\n1/2t kosher salt, or to taste\npinch freshly ground white pepper, or to taste\nFind a nice big SHARP knife to mince the heck out of the salmon fillet. They claim a food processor would ruin the texture; it would certainly be less fun. Mix in remaining ingredients, then chill for 30 min to 12 hours.. Assembly is easy: a dollop of each ingredient, presented like an ice cream cone. They recommend serving them in a lucite holder, but I got lazy and it wouldn't have worked anyway (see below). If you can't get at a laser cutter or machine tools, you could wedge the cones in rock salt, peppercorns, or the like for a snazzy presentation.\nFirst, scoop a bit of the creme fraiche into the top of the coronet. Pipe it in with a pastry bag for bonus points. Apparently if you prepared it properly, it will be thick enough to stick at the top of the cone; mine chose to be too runny for this to work. Thus, the horizontal cone trick: I poured the creme fraiche in, then kept it as close to level as possible while adding the salmon, and served it lying on a plate.\nYou can use a melonballer to create cute little salmon scoops, or just do it quickly with a small spoon and/or clean fingers. Stick a chive tip out the top of the salmon ball to look extra classy, or possibly more like a Teletubby. Eat immediately if not sooner.\nEither way, they were fantastically tasty. If I do this again, I'd probably skip the cones and just plop the half-baked coronet rounds into mini-muffin pans to make non-leaky shells to hold my ingredients. I'd probably substitute a mix of cream cheese with either sour cream or yogurt for the creme fraiche, as it's a lot cheaper, and it mainly provides a fatty foil for the salmon. 
Could be made lower-fat if you care about these things.\nCertainly worthy of a repeat, though.\nThis made approximately 20 coronets.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_16_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_16_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_16_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_16_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. What you see in the first picture here is some left-over cooked rice after dinner which is kept in a bowl.Add enough water to the bowl to cover the entire rice and leave it overnight.Next day, drain out the water from the rice and keep it aside. the rice is little fermented and becomes soft. Ingredients requiredTwo teaspoons of red chili powderTwo teaspoons of Cumin seedTwo teaspoons of Fennel seedSalt to tasteTwo medium sized onionsPreparationAdd all dry ingredients in a mixer grinder and make powderChop the onions and make a rough paste in the mixer grinder. Mash the fermented rice lightly with a spoonAdd the spice powder prepared earlier to the riceAdd the onion paste and mix wellNow our rice mix to prepare the crispies is ready. Now we need to make small rough shaped balls with the rice mixUse a large sized sifter or any such material. 
I have used a sifter made of bambooCover it with a clean cloth and keep it in the sunTake small amount of rice mix in your hand and make rough-shaped balls like this over the clothLeave this in the sun till all the crispies are completely dried . Once the crispies are fully dried, you can collect them and store in an air-tight container.These crispies can be deep fried in oil and served as side with Rice\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_17_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_17_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_17_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_17_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. When I offered to make a Marvel Superheroes cake I found myself wanting something cartoonish and something on the simple side. I didn't want to mimic designs of the actual characters. That would just be too hard. I searched google images and finally thought about looking at toys and came up with these Marvel plush toys. I figured it would be easiest to model my characters from these toys. **This image is not mine nor do I support the website I found the picture on. Just used it to make my characters. Here is the link to the site I found them on http://www.lostateminor.com/2011/01/15/marvel-superhero-plush-dolls/. The only ingredients in these characters are fondant and food coloring. I have made my own fondant in the past but prefer store bought. 
I always buy white and color it myself with Wilton Icing Colors. For Iron Man you will need the following colors of fondant: -red - or as red as you can get it! -yellow -black -light gray Okay, let's get to work on Iron Man. Of the three characters I highlight in this instructable Iron Man was the least difficult. A good place to start!Tips before we get started: -Working on waxed paper keeps pieces from sticking to your work surface -Adhere fondant pieces together with very small dabs of water -When ever a cut is made with a knife round and smooth the cut edges for a more polished lookIron Man Head - Approximately two inches tall 1. Roll a red circle of fondant in your hands and press it down to about a 1/2 inch thickness about two inches high and two inches wide with rounded edges. 2. Using your fingers stretch out a bit of fondant on either side of the circle to make small bumps. Not sure if they are ears or part of the suit. 3. Press (or roll) a piece of yellow fondant flat to about 1/16 of an inch. Cut it into a circle(ish) shape almost as large as the head. Keep the bottom rounded and cut the top like a heart. Stick to the head piece. 4. Roll two small black circles for eyes and make one small rectangle for the mouth. Stick in appropriate places and set aside!Iron Man Body - Approximately three inches tall 1. Roll an egg shape of red fondant in your palms and flatten it to about 1/2 inch. 2. Using a table knife cut out a shape like in picture 2. You are making the upper body and the arm stubs. Round off the cut edges with your fingers to give the piece dimension. 3. Make a line with your table knife for the waist without cutting all the way through.Iron Man Arms 1. Roll out two thick red snakes about one inch long with bulbous ends for the arms. Slightly press for a flat bottom. 2. Cut a thumb shape on each end of each arm in opposite directions (to make a right and left hand) and make dents for the fingers - as in picture three. 
We are making two arms at the same time! 3. Make two small yellow triangles about 1/16 inch thick and adhere them to the upper part of each arm where the stub meets the arm. 4. Roll two small thin red snakes and stick them on each edge of the yellow triangles. Repeat with other arm. 5. Using a small dab of water stick the arms on the arm stubs of the body piece.Iron Man Details 1. Roll a small ball of red fondant and press as flat as possible for the monobeam. 2. Roll a small ball of gray fondant slightly smaller than the red piece and stick to the top of the red piece. 3. Adhere to the center of his chest. 4. Add a belt consisting of a long red snake pressed flat with two indents to make the buckle part. 5. Roll two very tiny round balls of red fondant. Stick them on the upper part of the chest on either side.Iron Man Legs 1. Roll two yellow snakes about 1/2 inch thick and press slightly to flatten. 2. Cut two boot shapes out of red fondant. One for the right foot and one for the left foot. 3. Stick the boots to the leg pieces and edge the seam with a small snake of red fondant pressed flat. 4. Adhere the legs to the body piece with a small dab of water. 5. Use a toothpick to secure the head to the body if necessary. My head pieces were as heavy as the body pieces and since I wanted the characters standing on the cake I knew I would need a toothpick to make them more sturdy. Set Iron Man aside to dry and harden slightly. I recommend making characters in advance to avoid any damage to the characters once they are placed on the cake.. For Spider-Man you will need the following colors of fondant: -red -black -blue -white Spider-Man is by far the most time consuming character. Not necessarily as difficult as the other just time consuming! He is made in the same was as Iron Man just with different details. Same techniques and tips apply to all characters.Spider-Man Head - Approximately two inches tall 1. 
Roll a red circle of fondant in your hands and press it down to about a 1/2 inch thickness about two inches high and two inches wide with rounded edges. 2. Press (or roll) two pieces of black fondant flat to about 1/16 of an inch. Cut out two teardrop shapes for the eyes. Stick to the head piece. 3. Roll two small white circles for eyes. Stick on top of the teardrop shapes. 4. Roll very thin long snakes of black fondant. You will need several pieces to make the webbing on the face. I found it was easiest to roll the snakes in the palm of my hand. 5. Start by making a small circle of the fondant in the center of the face just lower than the eyes. 6. Layer on the webbing with pieces of the thin black fondant as in pictures 3-5 adhering all pieces with a bit of water. Set head aside.Spider-Man Body - Approximately three inches tall 1. Roll an egg shape of red fondant in your palms and flatten it to about 1/2 inch. 2. Using a table knife cut out the body shape and arms like in picture 6. You are making the upper body and the arms. Round off the cut edges with your fingers to give the piece dimension. 3. Line the armpits and inner sides of the body with blue fondant. This will hold the arms onto the body piece securely. 4. Cut an 'M' shape 1/2 inch thick out of blue fondant for the pants rounding the cut edges. 5. Make two boot shapes for the feet - one for left and one for right. 6. Stick pieces together in appropriate places.Spider-Man Details Now that the main body is finished it will need webbing like the face and a spider of course! 1. Pictures 8-11 show the route I took for applying the body webbing and the spider. The photos show better than words can explain! 2. Use a toothpick to secure the head to the body if necessary. My head pieces were as heavy as the body pieces and since I wanted the characters standing on the cake I knew I would need a toothpick to make them more sturdy. Set Spider-Man aside to dry and harden slightly. 
I recommend making characters in advance to avoid any damage to the characters once they are placed on the cake.. Wolverine was very fun to make! You will need a small piece of card stock or chipboard along with the following fondant colors for Wolverine: -black -yellow -blue -light gray -white -flesh color -red I am starting to simplify the instructions and photos by now since we are on the third character. Use that brain! Challenge yourself!Wolverine Head - Approximately two inches tall 1. Roll a yellow circle of fondant in your hands and press it down to about a 1/2 inch thickness about two inches high and two inches wide with rounded edges. 2. Press (or roll) a piece of flesh colored fondant flat to about 1/16 of an inch. Cut out a half circle the same width as the head and cut a slight 'V' in the top. Stick to the head piece. 3. Roll two small black thin strips for the sideburns. Stick on the very sides of the head from where the eyes will be down to the neck. 4. Cut card stock or chipboard into the eye shapes and cover with a thin layer of black fondant. I found if I didn't use the chipboard the eyes just sort of melted around the head piece. Make sure your guests or child knows not to eat that part. We aren't allowed to eat fondant at my house so it wasn't an issue. 5. Roll two small white circles for the eyes and place on the black fondant chipboard pieces. Adhere eye pieces to the head piece and set aside.Wolverine Body - Approximately three inches tall 1. Roll an egg shape of yellow fondant in your palms and flatten it to about 1/2 inch. 2. Using a table knife cut out the body shape and arms like in picture 2. You are making the upper body and the arm stubs. Round off the cut edges with your fingers to give the piece dimension. 3. Cut out the pelvic piece in blue fondant and stick to the yellow body piece. 4. Line the seam with a strip of red fondant.Wolverine Arms 1. Roll out two thick flesh colored snakes about 1/2 inch long. 
Slightly press for a flat bottom. 2. Cut two mitten shapes out of blue fondant in opposite directions (to make a right and left hand). We are making two arms at the same time! 3. Line the seams with a strip of blue fondant. 4. Stick arms to the arm stubs of the body piece.Wolverine Legs 1. Roll two yellow snakes about 1/2 inch thick and 1/2 inch long and press slightly to flatten. 2. Cut two boot shapes out of black fondant. One for the right foot and one for the left foot. 3. Stick the boots to the leg pieces with a small dab of water. 4. Top the fronts of the boots with thin pieces of blue fondant. The shape should be the same as the boot only slightly smaller.Wolverine Details 1. Make a small red rectangle with a black 'X' shape for the belt buckle. Adhere to the red belt. 2. Roll out a piece of black fondant about 1 inch square about 1/16 inch thick. 3. Cut out six small triangle shapes. 4. Stick them on the body piece as in picture 4. 5. Roll out a long thin snake with gray fondant. Cut it into six equal pieces about 1/2 inch long. 6. Make three indents on each hand using a toothpick. 7. With a dab of water stick on the claws! 8. Roll out a piece of blue fondant about 1/16 inch thick and cut two triangles about 3/4 inch long. 9. Adhere them to the seam where the arms meet the body. 10. Use a toothpick to secure the head to the body if necessary. My head pieces were as heavy as the body pieces and since I wanted the characters standing on the cake I knew I would need a toothpick to make them more sturdy. Set Wolverine aside to dry and harden slightly. I recommend making characters in advance to avoid any damage to the characters once they are placed on the cake.. Once you have the fondant characters made you will need a good home for them. A cake preferably!! I made a two tiered dark chocolate fudge cake (maybe I will post that recipe someday), covered it with colored buttercream, added some fun and very simple fondant details. 
I also added graphic signs drawn with sharpie markers and taped to skewers and stuck on my characters. I used a toothpick in each foot of each character to stand them upright. Except The Hulk. He is just leaning slightly on the cake. Watch your birthday boy smile and laugh when he sees the cake and eats it up with delight!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_18_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_18_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_18_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_18_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Always the same basic ingredients :\n- 17.65oz / 500g Bread Flour\n- 11oz / 312g Water\n- 1\u00bd tsp (0.21oz / 6g) Yeast\n- 1\u00bd tsp 0.31oz / 9g) Salt\nMake 2 1lb loaves(or 4 mini loaves in my case)\nRemember to use lukewarm water!. You need some basic tool. If you're lucky to have a stand mixer or bread machine (dough mode), use it!\nBut this can be made by hand\nBy Hand :\n- Large bowl\n- Large wooden spoon\n- Kitchen scale\n- Your hands\nWith Mixer:\n- Stand mixer or Bread machine\n- Dough hook or bread machine paddle\n- Kitchen scale . Mix the yeast and water.\nLet rest for 5 min.. Add the flour and salt to the bowl all at once\nMix until the dough holds its mass together.. 
When the dough is holding a \"shape\", start to knead.\nBy Hand:\n- Remove from the bowl and place on a oiled surface.\n- Oil your hands and start to knead.\n-\u00a0Stretch, fold then push down with your palm.\n- Turn 90 degree and repeat until the dough is elastic and smooth.\nWith Mixer:\n- Let the mixer knead until the dough is smooth.\u00a0The bowl should be almost clean when it's done.. Bread needs some time to rest and develop all it's goodness. This is called proofing.\n- Form in a round ball and place in a oiled bowl.\n- Cover with a cloth or plastic wrap.\n- Let rest for 1h or doubled in size. As useless as it sounds, your bread will be far better with this second proofing. You really don't want to skip this step.\n- Uncover.\n- Punch down with your fist.\n- Form into a ball (doesn't need to be perfect)\n- Cover with a cloth\n- Let rest for 1h (again!). Finally, it's almost done with all this waiting time!\n- Take the dough out of the bowl (try to keep the air bubbles!)\n- Separate in 2 (or 4) equal parts.\n- Shape into a log as big as your loaf pan\n- Put into a lightly oiled loaf pan\n- Cover\n- Let rest for 1h\n30 min prior to baking, preheat your oven to 425F. As with any other food, baking is the most important part of it.\n- Uncover.\n- Score the loaves as you like or leave them plain.\n- Put into your preheated oven on the middle rack.\n- Bake for about 25-30 min. (or 20-25 min. for mini loaf)\n- The crust should be golden brown.. Finaly! We're done...almost!\n- Remove from oven\n- Remove from loaf pan\n- Let cool on a cooling rack for an 1h\n- Cut into slices and eat!. Don't stop here, try new things!\nFor me, bread is a canvas for creating\u00a0wonderful\u00a0mixes. Think of it as a solid bloc pizza!\n- Add nuts, fruits, chocolate... whatever !\n- Try other flour types\n- Try other liquid (milk, beer, tea? 
why not)\nI hope this instructable will spark the bread baker in of you.\nHave fun!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_19_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_19_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_19_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_19_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Measure out 2 cups of heavy whipped cream and pour it into a bowl. Mix the whipped cream with your electric mixer on medium speed. If you do not have an electric mixer you may whisk by hand but I should warn you that this will take quite some time. I do not have a hand mixer so I taped a fork (I changed it to a whisk later) to a drill. You will need to mix this for quite some time, about 7 minutes, before it gets creamy. If your heavy cream is creamy and fluffy and keeps the indentations of your stirring utensil then you've got it! Do not over do the mixing or the mixture will fall apart and you will ruin the consistency.. Ok, now that your heavy cream is the right consistency pop open your can of condensed milk and add it into the heavy cream mixture. Now you can also add 1 large table spoon of vanilla extract. Mix away on medium speed, this will take approximately 5-10 minutes. Again, your ice cream mix should be fluffy and creamy. Once your ice cream is perfecto then add in your stir ins, if you so desire, and just mix them into the ice cream. Next you can transfer your ice cream into a container that has a lid. 
A lid is necessary to prevent the ice cream from getting icy and to preserve its creamy goodness. Place your ice cream in the freezer for 8-12 hours. . The ice cream should be firm after you take it out of the freezer- you do not want the ice cream to be too creamy or your sandwiches will fall apart. Take one cookie and place ice cream on top. The amount of ice cream you use is up to you; I used 3 tablespoons for each sandwich. Cover your ice cream cookie with another cookie and give them a gentle squeeze. You'll need to freeze these bad boys for another 6 hours. This time do not cover the cookies, you want the sandwiches to be nice and firm so that when you bite into them ice cream does not squirt out. . Now your cookies n' ice cream sandwiches are all ready. Sink your teeth into this heavenly goodness and enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_20_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_20_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_20_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_20_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. For this recipe you will need:12-15 slices bread8 ounces heavy cream3/4 cup packed brown sugar1 stick butter or margerine4 eggs1/2 tsp salt1 tsp cinnamon1/4 cup granulated sugar1 can pumpkin puree1/4 cup rum. Tear up each slice of bread into small pieces, approximately 1\" square. Place pieces into a large mixing bowl.. Melt the stick of margarine or butter and cool. 
In a separate mixing bowl, blend together the eggs, pumpkin, cream, sugars, salt, rum, cinnamon.Slowly add the melted margarine and stir well.. Pour mixture over bread and mix with large spoon until well coated. Pour into a greased large loaf pan.. Bake at 325 degrees for 1 hour & 15 minutes or until knife comes out clean.. Serve warm or refrigerate.. Mix 2 cups powdered sugar with 2 tablespoons rum and 2 tablespoons heavy cream. mix well, add more powdered sugar until you get the consistency you want. Pour over each slice of bread pudding just prior to serving. Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_21_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_21_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_21_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_21_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Who has time to make stuff from scratch? Gas up that Hummer of yours and get to the mega-mart to shop for your wholey-processed convenience foods. Be prepared to pay more if you want it certified organic. If you really want to do things right, you of course can make your pie dough from scratch and even make fresh pie fruit filling if desired. 
Just search for the instructables on those topics.\nYou will need:\nPre-made pie crusts\ncan of apple pie filling\nan egg to use as a glaze\nexperiment with fruit jelly glaze or food coloring to decorate the display\nsome aluminum foil to make a ring mold\nbaking tray or cookie sheet to bake the pie\nCAUTION: Know how to operate your kitchen appliances, be careful with using sharp implements, know how to work with gas, clean up after yourself. If you can't stand the fn heat, get out of the fn kitchen.. Get a piece of aluminum foil to form a rectangular mold the shape of our ipod.\nGrease up a baking pan or cookie sheet with butter.\nPlace the ring mold in the center.\nDrape in a pie crust to fill the bottom and the sides.\nTrim away the excess at the top.. Fill with pie filling.\nDrape a piece of pie dough over the top.\nTrim away the excess.\nCrimp the edges all the way around.\nCut out pieces that make the display screen and control rings.\nYou may need to brush the dough with some water to get it to stick.\nUsing a fork, pierce some vent holes for excess steam to escape when baking.\nYou can carefully pick it up and do a custom inscription on the back. \u00a0Make note of the serial number for all warranty work.. At this point you should glaze your ipod. A final egg wash goes over everything to give it that baked gloss.\nBake in an oven at 350 degrees F for about 30 minutes, depending on the size of the storage space. Keep an eye on it so it doesn't burn. The pie filling may ooze over and make a mess so use a cookie tray as a drip pan. 
It makes it easier to get the pie out of the oven too.\nTake out of the oven to cool.\nEnjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_22_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_22_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_22_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_22_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. The BBQ Pizza Dome uses a hooded 2 burner gas BBQ as the heat source. I found mine on Ebay for A$60 (minus a few non-essential parts such as the warming tray )\n4 x refractory oven tiles (300 x 300 mm) used in the floors of commercial pizza ovens\n1 bag of High temperature mortar mix.(they only seem come in really big bags)\n4 lengths of Hollow Square Section steel. (1 Metre)\n2 lengths of 12mm Solid Rod. (1 Metre)\n2 lengths of 6 mm solid steel rod. (1 Metre)\nElectric drill and Drill bits to match. 12mm and 6mm drill bits.\nTie wire.\nTape measure\nFelt tip pen (permanent)\nWater spray bottle\nRubber gloves\nMetal mesh (Chicken Wire)\nYou will need to cut 3 of the the tiles into strips so that they are square. For example my ties are 40mm thick, so I cut the tiles in to 40mm strips. \nA 300mm tile will give you 7 strips of 40mm. I needed 15 strips for my dome, plus a few spares.\nThe remaining tile is used as a pizza stone, which will sit under the dome.\n. 
In order to get the best results it is important to check the BBQ you intend to use for the conversion for the following features.A hood which offers good coverage of the cooking area. An inner lip inside the BBQ & this is where the hotplate and grill plate usually sit.I recommend that you scour sites like EBay and go to a few garage sales, as it\u0019s likely you might pick up what you need cheaply.Things like missing grill plates and side tables are often the reason other people choose not to buy, as most are looking for a BBQ to use in the conventional way.You could user a number of materials to accomplish this, I just happen to have some spare HSS and some steel rod left over from another project.The steps to construct the dome support are as follows.Measure up the grill area.Measure the grill area, and the inner support lip, taking into account any bolts or welded joints that may impede the placement of the steel support frame and furnace bricksMeasure and cut the steel to match the grill area.I used Hollow Square Section and some 12 mm solid round section (both mild steel). The square section was cut to length (less 2 mm for expansion) and then drilled to take the 12mm round section. Note: If you have access to an electric welder, you could produce a more rigid frame using HSS and reinforcing mesh.. Assemble the frame.\nThe frame consists of 4 pieces of Hollow Square Section (HSS) and 2 lengths of 12 mm solid round section.\n2 of the HSS pieces have 12 mm holes drilled completely through both the inner and outer faces and the remaining 2 pieces have holes drilled only on the inner face) \n2 lengths of 12 mm solid round section were inserted through the 2 HSS with and then the HSS with the single hole was placed on the end of the rod.\nWhen properly fitted together, the frame drops into the lip of the BBQ at roughly the same height as the original grill plate and hotplate.. 
Once the frame is installed you will need to confirm the maximum height the dome can be with the hood down. This will depend entirely on your BBQ and the type of hood that is fitted.Take care to look for any bolts or fittings which may impede a tight fit against the back wall of the hood.You also need to take into account the thickness of the refractory material you are using.Create a dome template (Paper)Based on your measurements in taken earlier, use a computer based drawing program to assist with the template creation and ensure that your scale is 1:1Draw a rectangle to represent the maximum height and width of the space in the BBQ.The rectangle should have a vertical centre line drawn through it to assist with alignment.The straight side of the dome should be twice the thickness of the refractory material. In my case the refractory tile was 40mm thick. So the straight sides need to be 80 mm.Using a tool draw a curve from the top of the inside edge of the wall to the centreline of your rectangle where it intersects with the top. Once done, copy and mirror the shape and align it on the other side.I have included a visio drawing with dimension to assist you and an enhanced metafile document for those who don't have visio.If you have a printer which only can print half of the template, you will need to ensure that you include the centre line in both prints so that you can align them using this line.Print 2 copies of the dome design and cut off the excess paper as neatly as possible.Align the two prints together using the centreline as a guide. Use sticky tape to hold them together.TIP : If the paper you are using is thick, and you cannot see the centreline on the bottom sheet through top sheet, use hairspray or oily nail polish remover to make the top sheet transparent while you align it with the bottom one.To create a wooden former for the dome using the paper template you can use apply the template to the ply using watered down PVA glue. 
Coat the surface of the plywood before applying the paper template. You will need to work quickly before the glue dries.In order to make two identical templates, screw a second piece of ply to the first so that you can cut out both pieces in the one operation.It is important to remember that all adjustments to the ply template must be made to both, otherwise the fit of the refractory materials will not be neat.My dome consist of 3 main sections, the left hand side (5 pieces), the right hand side (5 pieces) and the top dome (5 pieces)The taller ones (80mm) are the two straight pieces and are made up from two 40mm tile stripsLay the tiles (dry fit) on the edges of the template to see if they fit correctly when laid on the template.Mark the point where the last of the 5 left tiles ends on the temple, repeat for the 5 tiles on the right hand side. When compled you should have something that looks like this.Use spacers to build up each side with strips of tile until you have filled in the whole template.Once completed, dismantle the dome and re-assemble in the BBQ and ensure that the hood can be closed. Resolve any issues with fit by trimming the template and adjusting the placement of the tiles. Once you are happy with the fit label the end of each tile so you know where it goes.. Once the end tiles fit correctly, you can mortar them together.\nNote: You must use a minimum of mortar on the inside edge to ensure a strong bond. The outer surface is less critical and can be filled with mortar once you have completed the initial construction steps.\nSoak all the tile pieces in water for 30 minutes to ensure they are thoroughly wet. 
If the tiles strips are dry, it will instantly dry up the mortar as you apply it and the bond will not be strong.\nI made my dome in 3 main sections, the left hand side (5 pieces), the right hand side (5 pieces) and the top dome (5 pieces)\nIf you construct the dome in this way you can move the pieces easily and it allows you to fine tune their placement in the BBQ.\n. I decided that I might need to reuse the ply templates should the whole dome collapse or I decide to build a second dome.\nI made up two metal dome supports to help during the final rendering and firing processes, and it also saved me from having to move the whole dome to remove the wooden template.\nI used 6 mm steel rod bent into the shape of the underside of the dome. The rod is fitted into holes drilled in each support section where the steel and the tile meet.. Tie the steel supports and tiles together to help support the 3 major sections.\nIf at this point you want to test the dome (like I did) you can fire up the BBQ and take it on a test drive.\nLeave the BBQ on medium heat for 2 hours to heat through and remove any oil or reside which might taint the pizza.\nWhen you are happy with the performance of the oven, you can move to the final steps.\nOnce the BBQ is cold. Apply the mesh to the outside of the dome using the steel rod as a tie-down.. Apply mortar to the joints of the 3 segments and apply the finish coat of mortar to the outside of the dome ensuring that no mesh is visible. \nYou can apply an oxide to the mortar to match the BBQ if you wish.\nKeep a spray bottle with water handy to keep the mortar wet while you are applying and smoothing.\nThe mortar has a grey finish which is a bit boring, I decided to add a final render coat with some oxide as a colouring agent.\n. Once completely dry you should fire up the BBQ and slowly heat it. 
\nLeave the BBQ on medium heat for 1 hour and then check the dome for cracks and splits.\nThe one remaining refractory tile should be placed on the supports under the dome, this is used to cook the pizza on.\n. Now you are ready to cook pizza.\nYou may need to experiment with the stone height placement to ensure that the bottom stone is not too hot compared with the inner dome temperature. \nThe ideal temperature for me seems to be around 360 C which cooks a pizza between 3 and 4 minutes depending on the toppings and the dough hydration.\nI always cook with the hood closed, I check every 45 seconds and turn the pizza around halfway through the cooking about 2 minutes.\nHere are my first pizzas from the dome, cooling before serving.\nCooking with the dome is a definite improvement, I now have to re-learn how to cook them to get the best results.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_23_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_23_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_23_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_23_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Add one table spoon of minced garlic. Add more or less depending on how much garlic can handle.Mix the garlic into the butter.. Give it another mix before seasoning with salt.Once it\u2019s been mixed, set to one side.. A long loaf like this one works best.Slice across the loaf on the diagonal, leaving about an inch between each cut. 
Then slice the opposite direction, take your take because you don\u2019t want to rip a part the bread.. Make sure you really get it in there. You want every bite to get a beautiful taste of that garlic butter.Spread some of the butter across the top of the loaf.. When the cheese melts it will not only add an amazing texture but all help to hold the loaf together until it\u2019s ready.This is ready to hit the BBQ.. Bring the BBQ up to a medium to high temp.. Cover with the lid and let it cook for 15 to 20 minutes.. Take it off the heat, be careful as the bread is really hot.All that\u2019s left to do is serve it up.. It the perfect finger food for a party and quick and easy to make.The best part of making this way is being able to pull out squares of bread with the cheese and garlic butter.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_24_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_24_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_24_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_24_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Software\nMacaron shells\u00a0consist of four ingredients. Yes, four. That's it. What makes these tricky is the process, not the ingredients.\nFor best results, you should really get a kitchen scale. 
These things are magnificent and I promise that you'll be able to use them for many things other than these macarons (my mother uses ours to weigh her postal packages).\nWhen measuring ingredients, weigh your egg white (since you can't control exactly how much the chicken put in each egg) and scale this formula appropriately. In case you don't have a scale, I have included volumetric approximations below.Formula:\nEgg white\nAlmonds (whole, slivered, or ground): 1.2 X weight of egg whites\nPowdered sugar (aka icing sugar or confectioner's sugar): 2.25 X weight of egg whites\nCastor sugar (aka superfine sugar): 0.25 X weight of egg whitesVolume Conversion:\nOne large egg white (30 g)\nAlmonds (slivered): 1/4 cup\nPowdered sugar: 1/2 cup\nCastor sugar: 1/2 tablespoon. Hardware\nThis is what I'll be using, but you can use whatever works for you. I've included some alternative suggestions.\nFood processor\n\u00a0\u00a0\u00a0\u00a0 (Alternatives: ground almonds, see Step 3)\nStand mixer\n\u00a0\u00a0\u00a0\u00a0 (Alternatives: hand mixer or whisk)\nSpatula\n\u00a0\u00a0\u00a0\u00a0 (Preferably silicon)\nPiping bag and round tip\n\u00a0\u00a0\u00a0\u00a0 (Alternatives: disposable plastic bag with hole cut in the corner or parchment paper cone)\nBaking tray\n\u00a0\u00a0\u00a0\u00a0 (Alternatives: cookie sheet)\nSilicon baking mat\n\u00a0\u00a0\u00a0\u00a0 (Alternatives: parchment paper). What does \"tant pour tant\" mean? I can't translate it exactly, but roughly, it means \"equal amounts,\" referring to the almond meal and powdered sugar. In layman's terms, it's the dry ingredients.\nPlace the almonds and powdered sugar in the food processor. 
Pulse until the almonds are finely ground and the ingredients are completely combined.Tips for Success:Almonds :\u00a0If you don't want to use a food processor, simply buy pre-ground almonds and sift those together with the powdered sugar.Powdered sugar : Powdered sugar contains corn starch.\u00a0A little corn starch is helpful to the texture of the macaron, however, cheap brands bulk up on corn starch, too much of which will be bad for the macaron.. This recipe uses a French meringue, which means that uncooked egg whites are simply beaten with a little sugar.\nPlace the egg white in the bowl of your stand mixer. Start to whisk the egg whites. Once the whites have\u00a0soft peaks, gradually\u00a0add the sugar. Once the sugar is incorporated, increase the speed and whip the whites until they have almost firm peaks and the meringue is glossy and smooth. If the whites look watery and lumpy, you've gone too far.Tips for Success:Egg whites : To attain the most stable meringue possible you need aged egg whites. Leave the whites out at room temperature overnight (what? ew...). Egg whites have natural antimicrobial properties and baking these cookies will kill any bactera. Some recipes actually recommend a multi-day aging. Mine usually lasts about 8-12 hours. The purpose is to cause some water to evaporate, leaving a higher concentration of egg proteins. Alternately, keep the separated whites in the refrigerator for 2-3 days or add a pinch of dried egg whites to them.Whipping process : Make sure your bowl is clean! Any traces of fat will cause massive problems. Do not use plastic bowls. If you'd like, add 1/8 teaspoon of cream of tartar (not tartar sauce!), a drop of lemon juice, or a pinch of salt to the egg whites before whipping them. Also, copper bowls have been shown to produce better meingues due to chemistry I don't understand. If you have one, great. 
If you don't, I wouldn't suggest buying one ($$$).Superfine sugar: The sugar going into the meringue\u00a0should be as fine as possible so that it can dissolve quickly and completely. All supermarkets carry superfine sugar, but if you don't have any, simply put granulated sugar in a food processor.Coloring : If you want to create other colors, make sure not to use liquid coloring because they will loosen the meringue. Use either powders or gels and add them at the very end of the meringue-beating process.. Unlike \"tant pour tant,\" I can translate \"macaronage\": the act of creating macaron batter. Yes, the zany French have created a word specifically for this dessert. This is the most integral step of the whole process and the easiest one to mess up. The purpose of macaronage is two-fold. It combines the wet and dry ingredients and deflates the meringue just so. This is what transforms this recipe from an almond meringue to a macaron.\nBegin by adding 1/2 of the tant pour tant into the meringue. Fold until the powder is completely incorporated and add the other half. Once the second addition is fully incorporated, check the consistency of the batter by dabbing a bit on a plate. It should settle and leave no beak. If it isn't ready, continue folding and check the batter every couple of folds. Alternately, there should be a point at which the batter ceases to be completely solid. The batter is the right consistency if it sloshes slightly when you tilt the bowl. As soon as the batter appears ready, stop. It is better to undermix than overmix.\nIf you are using a stand mixer to do your macaronage, switch to the paddle attachment. Dump all the almond and sugar mixture into the mixing bowl and turn the mixer to its lowest speed. Mix until the powder is completely incorporated then check for consistency. You shouldn't need to mix for more than 10 seconds. A stand mixer can be faster, but make sure you know what consistency you're looking for. 
I wouldn't use one the first time through.Tips for Success:Macaronage : Recipes on the internet are littered with descriptions of how much to fold. Some say that the batter should \"flow like magma,\" but I've never been to a volcano. Others try to count folds, but the amount of batter and the differences in folding technique vary greatly. Really, a visual is necessary. Here is a helpful video (oui, c'est en francais) which shows the proper consistency:http://www.youtube.com/watch?v=yDo0SgDKLVw. \n\tOnce your macaron batter has been formed, the most difficult steps are over. Now, you just have to pipe the circular cookies. First, line your baking tray with either parchment or a silicon mat. Fit your piping bag with a relatively large round tip.\u00a0\n\tIf this is the first time you're making these (or you have a bad case of OCD) you can trace circles on the underside of parchment paper to guide the size of your macaron. I usually go for 1.5 inches in diameter, though you could easily make larger ones. Pipe as evenly as possible to prevent uneven cooking.\n\tOnce you've filled your tray, rap it a few times on the counter to settle the batter and get rid of big air bubbles.Tips for Success:Technique\u00e2\u0080\u008b: When piping, make sure that the piping bag is perfectly vertical and\u00a0perpendicular\u00a0to the baking sheet. Hold it about a centimeter above the sheet. To ensure evenness, I always squeeze with the same hand pressure and count out how long it takes to pipe each round (usually around 2 seconds).\n\t\u00a0Parchment paper: Some people claim that using parchment paper will yield straighter feet, though I have yet to prove\u00a0this. Parchment paper sold in flat sheets is preferable, because the rolled kind wants to, you know, roll up. 
If you only have the rolls, pipe a dab of macaron batter in each corner of the sheet to glue it to the tray.Silicon baking mat: I have observed several advantages.\u00a0They're reusable, prevent sticking extremely well, provide extra insulation to prevent the bottoms from burning, and are guaranteed to be level, so you won't end up with leaning cookies.Resting: After you pipe the macaron shells, let them rest for a few minutes before you put them in the oven. That way, they have a chance to start developing shells before you even bake them. My rule is to only preheat my oven after the shells have been piped. When my oven is ready, so are the macarons.. \n\tThere is much debate as to the best way to bake macarons. Professional suggest starting at a high temperature to dry the shells, then gently cooking the insides. However, this is too complicated for me (and my poor oven). I chose the low and slow method, with a decidedly middle-of-the-road temperature. There is no perfect temperature, rather a range with varying baking times, of course.\n\tPreheat your oven to 300 F (150 C). If your macarons are 1.5 inches in diameter, bake them for 12-13 minutes on the center rack of your oven. If they are 3 inches, you may need to go up to 15 minutes. They should rise vertically on straight feet and should not brown on top. Baking time depends on your oven. Your first time making them, I would check on the macarons (use the oven light and don't open the door!) after 10 minutes. They should be fully formed, but don't let them brown on top. When they are done, let the macarons cool on the tray. 
Then, pair them up by size.Tips for Success:\n\t\u00a0Baking Trays: If you find that your oven burns the bottom of your cookies, stack two baking trays on top of each other to emulate professional-quality equipment.Browned tops: If your oven makes the tops of your macarons brown before the insides are cooked, place another baking tray or some aluminium foil on the top rack of the oven to protect the macarons.. Strangely enough, the original macaron filling was nothing. they were just welded together and presented as a double-sided cookie. Many speculate that the first filling for macarons was fruit jam, which makes sense. There are a great variety of jams, and all of them blend nicely with plain macaron shells.\nHowever, we're here to talk about chocolate. For the chocoholic, there can and will only ever be one filling: chocolate ganache. Obviously, there is no way to get the amount of ganache perfect for the number of macaron shells. It really depends on how much you put\u00a0in each\u00a0cookie.\u00a0I generally plan on\u00a01 oz. of chocolate per egg white. Of course, if there's any left over, it's the chef's to eat!\n2\u00a0oz. bittersweet or semisweet chocolate, chopped\n1/4 cup milk, half-and-half, or cream\n1/2 tbsp butter\nScale this recipe as needed. Heat the milk or cream in the microwave or on the stove\u00a0until nearly boiling. Pour over the chocolate. Stir until the chocolate melts, then add the butter and incorporate.\nTo fill the macarons, simply take a butter knife and slather it on one macaron of each pair. If you want to be neater about it, you may certainly pipe the ganache. Lid them and your macarons are assembled!Tips for Success:Fillings : This isn't so much a tip as a suggestion. You can spice up your chocolate ganache with a number of different things. You could add a small pinch of salt for depth of flavor, some instant espresso powder for a mocha twist, a dash of vanilla extract for a nice undertone, or even some chili powder (!) 
for a Mexican-style kick. The fillings are where you get to experiment!. Once they are assembled, DO NOT eat the macarons right away! You will be very disappointed. Instead, let them sit in the refrigerator for at least several hours to mature, preferably overnight. This fixes all the potential macawrongs . Overly stiff shells loosen up, cracks become less apparent, and most importantly, the flavors meld. When you're ready to serve, let them sit at room temperature for a few minutes to take the chill off and enjoy!. The macaron-making community is quite a strong one. Those who know about macarons are usually obsessed, so there is a wealth of information to be had on the internet. Here are some of the websites that have given me the most help.\nIf you understand French, this is a very helpful video demonstration: http://www.youtube.com/watch?v=VsxzeehcI60\nMs. Humble has an incredibly detailed tutorial on her blog: http://notsohumblepie.blogspot.com/2010/04/macarons-101-french-meringue.html\nTartelette is an amazingly creative macaron baker and considered by many the \"queen of macarons\": http://www.tarteletteblog.com/2010/02/recipe-raspberry-mascarpone-macarons.html\nAnnie has created a very straightforward post for chocolate macarons: http://annies-eats.net/2010/06/04/chocolate-macarohns/\nVeronica at Kitchen Musings has a whole series of macaron articles: http://kitchenmusings.com/macaron-chronicles\nDuncan at Syrup & Tang has collected a wealth of macaron tips: http://www.syrupandtang.com/200712/la-macaronicite-1-an-introduction-to-the-macaron/\nDavid Lebovitz (who I mentioned earlier) includes authentic Parisian tips: http://www.davidlebovitz.com/2005/10/french-chocolat\nI hope this has been helpful! Don't be discouraged if your macarons fail the first time around. They may not be pretty, but they'll still taste good. It wasn't until my third batch that I finally got some decent ones. 
These make a unique and gourmet gift and you can experiment endlessly with flavor and color combinations. Bon appetit!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_25_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_25_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_25_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_25_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Chocolate truffles are a decadent candy that melt as soon as they touch your tongue. For truffles it's very important that you use the highest quality chocolate or they won't set up properly. I used Lindt and Ghirardelli chocolate but Guittard is better if you can find it. I definitely preferred the Lindt to the Ghirardelli. For my truffles I only used vanilla extract for added flavoring but you can infuse the cream with vanilla bean, spices, citrus zest, coffee, or a tablespoon of alcohol such as rum to get the flavor combination you want. 
.White Chocolate Truffles:4-oz white chocolate, chopped1/8 cup heavy cream1/4 tsp vanilla extract.White Chocolate Coconut Truffles:4-oz white chocolate, chopped1/8 cup heavy cream1/2 tbsp unsalted butter1/4 tsp vanilla extract1/4 cup coconut flakes (sweetened or unsweetened).Dark Chocolate Truffles:4-oz bittersweet chocolate, chopped; or 1/2 cup heaping bittersweet chocolate chips1/4 cup heavy cream1/2 tbsp unsalted butter1/4 tsp vanilla extract.Truffle Coatings:I usedChopped coconut flakes and powdered food coloringChopped coconut flakes and cocoa powderChopped milk chocolate chipsChopped candy meltsBut truffles can be coated in anything you want really! Melted Chocolate, Nuts, Bacon, Sprinkles, Powdered Sugar.Flower Petals:Made using Wilton Candy Melts. Candy melts are candy flavored like chocolate that you can easily melt in the microwave or in a double-boiler. . Place 4-oz of chopped white chocolate in bowl.Place 1/8 cup heavy cream in small saucepan over medium-high heat. As soon as the cream boils, add 1/4 tsp vanilla, and pour over chopped chocolateLet the mixture sit for about 30 seconds then stir until smooth.If the chocolate mixture still has lumps you can put it in the microwave for 10 second intervals (just make sure you stir in-between). Or heat in double-boiler, stirring until smooth.Cover the bowl and place it in the refrigerator until truffles are firm (about an hour). If it's not firm put it in the freezer for an hour. With a spoon, scoop out some of the truffle mixture. Roll it into a ball with your fingers and hands. This is very messy!Now you have a white chocolate truffle! We'll coat the truffles later. Truffles keep in the refrigerator for about 2 weeks or in the freezer for a few months. . Place 4-oz of chopped white chocolate in bowl.Chop 1/4 cup coconut flakes (sweetened or unsweetened) in a food processor or blender. Place 1/8 cup heavy cream and 1/2 tbsp butter in small saucepan over medium-high heat. 
As soon as the cream boils, add 1/4 tsp vanilla, and pour over chopped chocolate. Let the mixture sit for about 30 seconds then stir until smooth. If the chocolate mixture still has lumps you can put it in the microwave for 10 second intervals (just make sure you stir in-between). Or heat in double-boiler, stirring until smooth.Add the chopped coconut and stir until combined. Cover the bowl and place it in the refrigerator until truffles are firm (about an hour). If it's not firm put it in the freezer for an hour. With a spoon, scoop out some of the truffle mixture. Roll it into a ball with your fingers and hands. The mixture will stick to your hands. Now you have a white chocolate coconut truffle! We'll coat the truffles later. Truffles keep in the refrigerator for about 2 weeks or in the freezer for a few months.. Place 4-oz bittersweet chocolate, chopped; or 1/2 cup heaping bittersweet chocolate chips in bowl.Place 1/4 cup heavy cream and 1/2 tbsp butter in small saucepan over medium-high heat. As soon as the cream boils, add 1/4 tsp vanilla, and pour over chopped chocolate. Let the mixture sit for about 30 seconds then stir until smooth. If the chocolate mixture still has lumps you can put it in the microwave for 10 second intervals (just make sure you stir in-between). Or heat in double-boiler, stirring until smooth. Cover the bowl and place it in the refrigerator until truffles are firm (about an hour). If it's not firm put it in the freezer for an hour. With a spoon, scoop out some of the truffle mixture. Roll it into a ball with your fingers and hands. This step is extremely messy!Now you have a dark chocolate truffle! We'll coat the truffles later. Truffles keep in the refrigerator for about 2 weeks or in the freezer for a few months.. A food processor or blender really comes in handy for this step. .Candy Melt Coating: place some candy melts (as many as you want) in a food processor and blend until they are in tiny pieces. 
Place chopped candy melts on a plate or bowl and roll truffle in coating until coated. .Colored Coconut Flakes: place some shredded coconut flakes (sweetened or unsweetened) in food processor and add a dab of food coloring. Blend until the color is well distributed. Roll truffle in coconut until coated. I was not at all happy with the yellowish-green I created, so be careful!.Cocoa Powder Coconut Flakes: place some shredded coconut flakes (sweetened or unsweetened) in food processor and add cocoa powder (1/2 tsp at a time). Blend until the color is well distributed. Add more cocoa powder if the coconut doesn't look chocolaty enough. Roll truffle in coconut until coated. .Milk Chocolate Coating: place some chocolate chips (as many as you want) in a food processor and blend until they are in tiny pieces. Roll truffle in chocolate bits until it's coated..Remember truffles can be coated in anything you want: nuts, bacon, sprinkles, cocoa powder, or powdered sugar! They are also fantastic dipped in chocolate. . Making the chocolate flower petals is the hardest part. The truffles are delicious with or without the chocolate flowers so this step is entirely optional. You can also melt the chocolate out of the bag: in a double-boiler or microwave it in a bowl, and then add it to a piping bag. .Figure out how many flowers you want to make (each flower gets a truffle) and get out that many baking cups plus a few extra. .Melting the Chocolate:Place candy melts in a plastic piping bag (one that hasn't been cut) or a freezer bag. Each flower requires 2-3 candy melts. Twist the bag and wrap a rubber band around the twist. 
Heat the bag of candy melts in the microwave on 50% power for 30 seconds; squeeze bag; put back in microwave on 50% power for 30 seconds; squeeze bag; repeat until chocolate is completely melted (be careful not to overheat).You can also melt the chocolate out of the bag: in a double-boiler or microwave it in a bowl, and then add it to a piping bag.Snip off the tip of the bag..Piping the Flower Petals:Cover the bottom of a mini baking cup with a thin layer of chocolate.Then, starting at the bottom of the baking cup, pipe a line of chocolate up the side of the cup that tapers off towards the top.Continue to pipe lines up the side of the baking cup, leaving a space between each line. Repeat with all baking cups..Place the baking cups with flowers on a plate or tray and place in the freezer for 10 minutes or until hardened. . To finish the flower truffles, simply place a coated truffle in each chocolate flower. You can leave the flower truffles in the mini baking cups or peel them off. Truffles keep in the refrigerator for about 2 weeks or in the freezer for a few months. Truffles are better served at room temperature. .Unwrapping the Chocolate Petals: Gently pull down on the top edge of the baking cup paper (only pull down about a 1/4 of an inch)! Then, going around the top edge, continue to carefully pull down the paper a 1/4 of an inch. Finally, pull the paper the rest of the way down and pull off the flower. If one of the petals break off, just stick it in the truffle (see picture). 
.Be sure to enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_26_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_26_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_26_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_26_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. . Cook the rice according to directions on the box. \nFor the brand I used, which is quick cook rice but not instant, it too about 12 mins to cook in the microwave (tightly covered with lid or if you don't have lid, use plastic wrap) then let stand for 5 mins.\n. 1 pkg. 12 oz frozen vegetables or 1 12 oz (approx.) can of mixed vegetables\nCook the vegetables according to the pkg. (For this type it is cooked in the pkg. and takes about 7 mins. be sure to set the bag on a plate in the microwave as the bag tends to leak a bit when cooked).. 3 eggs\n1 tablespoon vegetable oil\nDash of Salt and pepper\nLightly beat the eggs with the salt and pepper (I prefer seasoned salt, but you can use regular salt). \nHeat frying pan and add 2 tablespoons oil. When the oil is hot, add the eggs then scrambled them, but don't over cook them. . . 
2 tablespoons vegetable oil\n1 tablespoon ground ginger \n5 teaspoons soy sauce or teriyaki sauce\nI use a dutch oven pan, but if you have a large wok that would work better.\nAdd 2 tablespoons oil, ginger and soy or teriyaki sauce then add the cooked rice the stir-fry it for a few minutes and then add the scrambled eggs.\nOptional: Most fried rice has chopped in onion in (I don't as my son hates onions) it and you could chop up 1 small onion and add it to the pan and brown it in the 2 tablespoons of oil before adding the rice if desired. \n. Add the cooked mixed vegetables and stir for a few more mins then serve hot.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_27_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_27_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_27_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_27_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Chocolate (milk or dark), 90 gButter, 30 gEgg (regular size), 1Sugar, 1 tbspCocoa powder, 1 tbsp. Prepare the materials and weigh them.I used stainless bowl to melt it on hot water bath.. Above the boiled water (not boiling water), put the stainless bowl.Or use other method to properly melt them.If some water goes in, it will not melt properly! Be careful.. Egg + Sugar + Cocoa powder.Mix them with whisk.. Mix them evenly~. Don't fully fill the container.Fill 60~70% of it.It inflates while cooking.Put this in the microwave and cook it for20 secs + 20 secs + 20 secs Separately.. 
Great dessert with coffee.The name I know is pondant au chocolatbut I would rather call it hot moist chocolate pudding :)Especially in this cold weather this dessert gives a great relief.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_28_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_28_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_28_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_28_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Take the cup of dissolved yeast add sugar and leave covered in a warm place for a few minutes. . Place flour in a standing mixer add the yeast mixed with water.. Attach the kneading hook to the mixer.Turn the mixer on and start to add the water a little at a time. The mixture will appear crumbly and then it will start to leave the sides of the mixing bowl. Add salt, Keep mixing till it comes together in the form of a soft dough.. Leave to rise covered in a warm place till double in size this may take 3-4 hours.. Divide the dough into equal sections. Make a ball out of each section and again set aside covered for 30 minutes.. Roll out one ball at a time with your hand, spread to desired size. If you can't manage with your hand use a rolling pin but usually a pizza is rolled out wth the help of a hand.. I keep my work simple by adding oregano salt and black pepper to tomato paste, but you can make pizza sauce yourself or buy it if you like.Spread the sauce over the rolled dough.. 
Cover with mozzarella, I use grated mozzarella because I find it better to handle.. Use the baked or grilled zucchini to make the curly hair.. Cut an aubergine slice in half and use it for the ears.. Cut an olive into half use each part for the eyes. Place a piece of corn inside the hole of each olive.. For the mouth use bell pepper as shown in the picture.Use a piece of olive for the nose.Place the prepared pizza in a preheated oven at 400 degrees. Bake till the cheese melts and is golden brown in colour.This is the best recipe that you can enjoy making with your kids. Involve them with you while making the face parts and they will surely have fun.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_29_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_29_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_29_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_29_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 2 cups of any kind of low carb milk. For me, the best ones are soy milk or cashew milk. These are creamy but not high calorie. If you're not worried about the calories, you could also try half and half.Optional splash of heavy cream.Half a cup of Chia seeds.Half a cup of unsweetened cocoa powder.Six packets of Splenda or other non-sugar sweetener. (You can adjust the sweetness to know how you like it.)Quarter teaspoon of vanilla extract.Dash of salt.. You'll need a container with a leak proof lid. I use glass Rubbermaid containers. 
It will need to hold 4 cups.Put everything your container, starting with the wet ingredients.. . Put the lid on and make sure it's tight. . Then shake vigorously until you see that the cocoa powder is well blended in.. Put the container in the refrigerator for at least a half an hour. Try to take it out within an hour, but if you have to leave it in longer, it's OK.. After 30 to 60 minutes, take your pudding out of the refrigerator. It will have separated. You can either shake it or stir briskly with a spoon or whisk. It should be well blended at this point.Then return the covered container to the refrigerator. It will take another several hours for the liquid to be completely absorbed by the seeds and cocoa.. Give the pudding a final stir before serving. It will stay thick and puddingy at this point. You can add berries and cream for breakfast if you want or sugar free whipped cream for dessert. Or eat it just like it is - delicious!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_30_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_30_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_30_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_30_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Since this was the first time ive ever used fondant, I was not too sure how this cake would turn out, but I\u2019m happy with it and glad to share it with everyone \u263aYou\u2019ll Need: *6 cups rice cereal *6 cups miniature marshmallows *3 tbls. 
Butter or margerineDirections: 1. Melt butter over low heat in a large saucepan. Add marshmallows and stir until completely melted. Remove from heat. 2. Add cereal, one cup at a time, stirring until well coated.NOTE: When I made this for the first time, one batch wasn\u2019t enough so I highly recommend making two batches as that will turn out the same size as mine. Steps: 1. When you are done making your rice crispies, place in a greased/buttered round or square round medium glass bowl. 2. Since we want to achieve a cauldron look, begin by molding your shape, starting from the bottom and with the palms of your hands pack down the rice crispies forming a bowl shaped (or something like a bird\u2019s nest). It can get very sticky so if you would like to grease your hands before that might be a good idea to do so. 3. Leave about a 2 inch indent on the top of the cauldron to place the fondant props inside. *Set desired mold into freezer for at least an hour (or overnight) to set.. To make the green gooey bubbles, you\u2019ll want to make buttercream in order to get that special effect. I made my own, but you can buy premade in a can. This is how I made mine (homemade is always better ;)What you\u2019ll need: *3 cups powdered sugar *2 sticks butter, softened *1 tsp. vanilla extract (or whatever extract you desire) *2-3 tbs. milk *10 inch cake board\u00a0Recipe: 1. In a mixing bowl, whip butter until you get a nice fluffy consistency 2. Add powdered sugar, one cup at a time 3. One tablespoon at a time add the milk 4. Add vanilla and mix until creamy *Once you are done making the fondant, with a butter knife, spread a thin layer of buttercream onto the cauldron, making sure it\u2019s thin and not too thick. Place on cake board. Set rest of buttercream aside.OOPS! Moment: the first time I made this buttercream I used margarine and for some reason it got extremely runny so I would recommend using real butter as that will give you a firmer and workable material.. 
*When working with fondant, the key is to make sure it is soft enough to work with, but not too soft. I used Duff\u2019s brand white fondant because it is easy to work with and it tastes scrumptious \u263a I bought white because since I needed numerous colors, I decided to buy food coloring separately.What you\u2019ll need: *Duff\u2019s fondant (white) *10 inch cake board *Black food coloring *Rolling pin *powdered sugar for dustingDirections: 1. Lay your cauldron out on a flat surface. 2. Microwave 1/3 duff\u2019s fondant for about 5 seconds. 3. With black food coloring, work into fondant, making sure its covered with black (you may want to use gloves, it can get messy) 4. Sprinkle powdered sugar onto a flat surface 5. Roll out fondant about 1/8 inch thick rolling out enough to cover cake 6. Place 4 small black balls on each corner of cake board with excess fondant (about half inch thick) 7. Cover cake with fondant, making sure sides are fully coated.NOTE: when putting the fondant over my cake, it got very bumpy (which is I guess okay for a witches\u2019 cauldron cake), but I would suggest making it thick enough so it doesn\u2019t turn out too ridged. I ended up recovering it and trying a second time, it wasn't as bad. Plus, if you smooth out the butter cream beforehand it shouldn\u2019t get bumpy.. *This is the best part! Now you get to be creative and design whatever shapes/creepy molds you want! I made a bat wing, eyeballs, pigs foot, poison bottle, witches broomstick, and a bone. *Before making the fondant props, color the rest of the buttercream with lime green food coloring. *in a pastry bag, dollop around sides of cake, and inside so the props will stick to the top portion. *Color the additional fondant with whatever color you desire! *You can use a serrated knife for the edges or decorating tools to shape them. *I made fire at the bottom of the cake with my etra fondant as well, don\u2019t be afraid to get creative! 
*Place fondant props inside of cake, putting in whatever directions you wantVOILA!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_31_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_31_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_31_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_31_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Turn oven on to 350 degrees.. Melt 1 stick of butter in the microwave for 1min.. In a regular size cake pan crack 1egg (well beaten). Add entire box of yellow cake mix. Pour melted butter over the egg and yellow cake mix. Mix ingredient. Press mixed ingredients into the bottom of the cake pan.. In a separate bowl add 2 eggs and beat them. Add the 8oz cream cheese to the bowl. Add 1/2 the bag of powder sugar. Then mix all with a mixer until it is smooth .. Pour ingredients from the bowl into the cake pan.. Put the cake pan in the oven. Bake it for 45-50min. Cake will be a golden brown color.. 
Enjoy your cake\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_32_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_32_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_32_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_32_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Your going to need 2 apples one a little smaller than the other. And if you need it maybe a little peanut butter.. Cut some little notches about 4 on each side for the feathers. if you want them to stick out more add some peanut butter underneath them.. Cut 2 small circles on the smaller half and then take 2 little circles of pealing and put them in the circles for the eyes.. 
Now put the smaller apple on top of the bigger apple and if you need to put some peanut butter to help them stay.And thats it hope you enjoy.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_33_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_33_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_33_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_33_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. You will need: \u00a0\u00a0 -Ground meat \u00a0\u00a0 -Bacon strips \u00a0\u00a0 -Eggs \u00a0\u00a0 -Salt \u00a0\u00a0 -Pepper \u00a0\u00a0 -Nutmeg You need toothpicks and if you use leftovers as I did, you also need a food processor.. If you buy ground meat, you will probably not need eggs, but if you ground your own meat (in the food processor)\u00a0you will need eggs. Add an egg or two until the meet is \"solid\", by other words until you can mold it (like play-doh..). Season with salt, pepper, nutmeg and other seasoning that you want. Now you can start the shaping process.. Form little balls and then roll them\u00a0until they look like chuby sausages. Wrap\u00a0\u00a0them with a strip of bacon and insert a toothpick, so that the bacon does not escape, because nobody wants that (hmmm bacon!!). Aren't they gorgeous?! Yes they are! And\u00a0 even more tasty. \u00a0 (ignore, I'm hungry). Pre-heat the oven to 180\u00baC ( 350\u00baF). Put the mini rolls in a baking tray covered with aluminium foil and insert them in the oven. 
Count some painfull 20-30 minutes and remove them from the oven. Serve immediatly and eat like it's the end of the world! \u00a0This is a very\u00a0simple recipe that I made up to my lunch and\u00a0it came out pretty good! \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Thank you for watching and have a good meal!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_34_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_34_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_34_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_34_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. I decided to make pretty much everything from scratch for this pie, including the crust. If you wish to cut time by buying pre-made crust, then by all means GO AHEAD. This is 2014. But just keep in mind it may not be as tasty. TIME: 20 MIN total for crust Crust is not too painful to make. You will need for crust 1 1/2 cups of Gram Cracker crumbs or 12 Gram Crackers made to crumbs. 3 tbsp (tablespoon) of sugar 3/4 cup of non salted butter melted 1 tsp (teaspoon) of cinnamon Springform pan 9\" or 10\" pie plate Turn your oven on to 375 degrees Whisk or with a fork mix together the graham cracker crumbs, sugar and cinnamon. (The cinnamon will give it some nice flavor.) 
Melt the unsalted butter in the microwave and use a fork to mix the butter with the crumb mixture until they are all are moistened. It will look clumpy and much darker and that's a good thing. Spread the crumbs into a 9-inch spring-form pan OR a 10-inch pie plate and press them firmly into an even layer over the bottom and half of the way up the sides of the pan. Bake for 6 to 8 minutes, or until golden brown. Let it sit for 8 minutes to cool, or just stick it in the fridge to save time. By baking the crust you will get a more crunchy crust. Which will go beautifully with the crunchy top I have planned for this pie =). The secret is toasting the nuts!! Forgive me Grandma!! Haha, I'm just kidding. No, but seriously. Any time you have a dish with nuts, the secret to ultimate flavor is to toast them. It only takes 5 minutes, and enhances the flavors so much! TIME for Sauce: 8 MINWhat you will need for the Special Caramel sauce. 1 packet of apple-cinnamon granola from Quaker Oats. 3/4 cup of chopped pecans 1 cup salted caramel sauce. I used sugar-free in order to not go overboard with the sugar. small cooking sheet for toasting the nut mixture in the oven. Open the packet of granola and pour in a nut chopper as well as the pecans. You could also break them up yourself by putting them on a cooking sheet and breaking with a spoon, but it may get messy. Since the oven is already going because the crust was just made toss the nuts in! After 5 min of toasting pull them out. They should smell amazing. Take the crust out of the fridge. It should be cooled by now. Pour the caramel on top of the crust and sprinkle the toasted nut mixture on top of the caramel. Place the springform pan into the fridge to chill out. MAKE SURE YOU SAVE SOME TOASTED NUTS FOR LATER. ;) You will use them as a garnish. The Infinity pie is based off an apple cheesecake pie. So making the apple pie part is very much like making a regular apple pie as you would have guessed. 
You can either BUY (it's 2014) your apple pie filling OR you can make it. I chose to make it because I want a delicious pie this time. Dedicating something to my hero only deserves the best! ;) NOTE: if you are using a can of apple pie filling you only need to use half!! TIME for Pie filling 40-50 min (depending of if you have an apple corer)What you will need for Apple pie filling 5 small granny smith apples. They must be peeled and cored and cut them thinly (slave work) 3/4 cup of unsalted butter 1/2 cup of light brown sugar 2 tsp of cinnamon a couple dashes of salt a large pan for cooking on the stove I DON'T have an apple corer. So this part took extra long.... my boyfriend wasn't too thrilled. But it's only 5 little apples. While you are peeling apples, put the butter on the stove and begin melting it. It will only take a few minutes. When it's melted add the brown sugar and cinnamon to the butter and mix until gently bubbling. Again it only takes a few minutes so you probably won't be done with your apples. The\" brown apple syndrome\" will happen and it's alright. These apples are destined to go into a brown sugar liquid and cooked extremely soft. No harm so don't stress! ;) when you're finished with the apples slide them in the large cooking pan and coat them well with the liquid. Put a lid on the pan and stir occasionally for 10 min. Remove the lid and up the temperature to med-hi to boil off most of the remaining liquid. Throw a few dashes of salt in. After another 15 min the apples should be very very soft and that's what you're looking for. LET SIT FOR 20 min to cool before adding to your pie crust. Getting tired yet??. You can turn the stove off if you want to save electricity for 20 min while the apple pie filling cools.... But OK, you have 20 min to make a design to top your Infinity pie. Me, because I didn't want to have to make a batch just for crust I broke down and bought my pre-made crust. FORGIVE ME GRANDMA. 
;) haha Pre-made crust is very easy to work with. You just unroll and cut out whatever design you want. I see pie design tops (much like pumpkins today) as a big fad soon. It is taking off but not like I think it will soon. But anyways, cut out whatever your heart desires! If you mess up, crust is easy to erase... just flatten out and try again. For stencils, I just found shooting stars online, printed them out, and laid them over the dough and cut it. Easy as pie. My shooting star is dedicated to Carl Sagan and the infinite universe. =). Exactly as the title says... pour the cooled apples on top of the cooled caramel mixture that's been chilling in the fridge. This is the easiest step! ;). I love cheesecake. If it were me I'd put cheesecake in everything. But I probably wouldn't live long. Anyways, again, this is only technically half a cheesecake so the ingredients aren't as heavy. Turn that stove back on to 350 degreesWhat you will need for cheesecake topping: 8 ounces of soft at room temperature cream cheese 1/2 cup of granulated sugar 1 egg medium sized 1 tsp of vanilla extract 1tbsp of lemon juice lemon wedge for lemon zest electric mixer and a medium sized bowl First you will need to beat the cream cheese and sugar together on medium speed for about a minute. They must be well mixed. Then add the egg and beat it in until it is combined for about a minute. Then add the lemon juice and vanilla extract and beat for another minute. Zest the lemon wedge in. Just a few times is all it needs. Pour the cheesecake batter over the apples in the pan, smoothing it into an even layer as much as you can. Bake until the cheesecake is set about 25-30 minutes. While this is happening, as you will see in the next step, coat your design you made with the pie crust with egg whites and bake at the same time in the oven with the pie.. Because pie crust is usually not belonging on cheesecake, I decided to bake it separately on a cooking sheet. 
I coated it with an egg white to give it shine and baked it next to the cheesecake for 5-8 min. When it was done, I pulled it out and sprinkled it with sugar while it was still hot to give it some sweetness. The cheesecake should be done within 30 min. Transfer the cheesecake pan to a wire rack to cool, the cheesecake must refrigerate for at least 4 hours or overnight. (For me, since it was already midnight when we were finished... lol, we ended up just chilling over night.). Before you put your infinity pie in the fridge...., ta dahhhh, the toasted crust design goes on top of the cheesecake like a crowning jewel! Then, add some of the remaining crunchy toasted nuts on top and along the outsides to bring it to life. Then, put that sucker in the fridge overnight. I know it will be REAL HARD. But trust me, it needs to cool for at least 4 hours. When serving your Infinity pie, put some caramel on the plate along with the special crunchy nut mixture. It will definitely knock someone's socks off! Pair with vanilla bean ice cream for a real desert! Be sure to refrigerate any leftovers.\"The sky calls to us; if we do not destroy ourselves. We will one day, venture to the stars\" -Carl Sagan This one's for you Carl! 
Enjoy your Infinity pie everyone =) PLEASE let me know if you make it!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_35_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_35_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_35_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_35_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Materials:\nSolo Cups\nCoffee Filters\nFunnel\n5-6 Flasks/Bottles - I used 10oz Orangina bottles (great to drink while making this stuff)\n5 plastic bottles - I used Gatorade bottles because of the wide mouth and screw top (also great to drink)\nIngredients:\n2 pounds of Skittles\n1.75L of Vodka\nLove. Sort out the five different skittle colors from all 2lbs, and put them in your 5 plastic bottles. \nAdd 12oz (1/5 of the 1.75L bottle) of vodka to each skittles container. Seal the bottles and shake each bottle vigorously every few hours. Takes about 24 hours to dissolve the skittles completely. \nAlso I would recommend wearing gloves when handling so many skittles and the mixtures as it stains your fingers and took four days to get out. \u00a0 \u00a0\u00a0. Once the skittles have dissolved, a foam is left on the mixtures and must be removed. Use your coffee filters and solo cups to accomplish this as quickly as possible. Wear those gloves!\nOriginally I tried using paper towels to filter the foam away but it soaked up too much of the vodka and took a really long time. 
Therefore I switched to the coffee filters, but these still took some time to filter the drink so I had multiple setups going at once. Make sure you only add a little vodka onto the filters at one time because the foam will build up and stop the drink from moving through. Scrap off the foam once it builds up and squeeze any vodka left in the filters into the cup below.\nThis process took a few hours and requires your attention.\u00a0. Depending on how the process has gone, you will be left with anywhere from 60-70oz of Love Potions.\u00a0\nFill up your flasks in any manner you choose, with my leftover liquid that didn't fit in the five 10oz bottles I combined them into a sixth 10oz bottle. Of course you could just have bigger bottles.\nImportant Note: Putting the drinks in the freezer will cause the drinks to condense so fill the bottles up more than necessary. \u00a0\nHopefully one of these potions will cause your Valentine to fall madly in love with you!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_36_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_36_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_36_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_36_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 
For one drink2 1/4 cups of frozen, cubed seedless watermelon 2 tbsp of light rum2 tbsp of lime juice2 tbsp mint syrup (how-to = step 6) - fresh mint - white sugar4 fresh mint leaves*NOTE: the watermelon needs time to freeze, so it's best to prep the watermelon a day ahead so it can freeze overnight.Shopping List:3 small seedless watermelons2 limes1 bunch of fresh mint1 bottle light rumsmall bag of sugar*This amount of fruit will make approx. 4 generous drinks.Tools:blendercutting boardsharp chef's knifemeasuring cup1-2 lrg freezer bagscitrus juicersmall saucepanstirring spoonsmall strainersmall bowlserving glassesfun straws!. There are many ways to go about removing the rind from a watermelon. If you have a tried and true method, go ahead and use it. This is my favorite way:Cut a nice flat slice off of the bottom of each watermelon, so that it can sit upright with lots of stability.. Using a sharp chef's knife follow the curve of the watermelon's surface and remove the rind in strips, until the whole melon is rind free.. Slice and cube each watermelon.. Place the cubes into large freezer bags and place in the freezer overnight.. Take 1/3 of your bunch of washed, fresh mint and remove the leaves from their stems.Place the leaves in one hand, and clap down on the leaves with the other several times. This is a simple and quick way to release the flavor. Place the 'clapped' leaves along with 1/2 cup of sugar and 1/2 cup of water in a small saucepan.Bring to a boil, stirring occasionally.Remove from heat, strain, and let sit to cool to room temperature. Place in the fridge to cool it further.. Juice enough limes for as many drinks as you'd like to make. Each drink needs 2 tablespoons of juice.. Place all ingredients into a blender.. Blend.You are now one step closer to deliciousness.. 
Place your blended drink into a serving glass, add a sprig of mint and a fun straw.Now put your feet up, lean back in the last few rays of seasonal sunshine and enjoy!!NOTE: This is also scientifically formulated to give you even more to look forward to for next watermelon season...\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_37_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_37_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_37_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_37_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Tools\n-utility knife (Scissors work fine)\n-Oven or source of heat that can get up to 250 degrees\n-Oven mit's\n-Bowl, Knife, measuring cups, etc.\n-acetone\nMaterials\n-strawberries\n-rhubarb\n-salt\n-pie crust\n-can\n-sugar. Cut the can to how deep you want your pie to be with an exacto knife or a pair of scissors.\nIf you plan to bake inside use steel wool or a steel bristle brush first to remove the paint.. To make the pie you will need 1/2 cups of chopped rhubarb (Making sure to remove the leaves which are poisonous) into about half an inch pieces, 1/4 cup of chopped strawberries, 1/4 cup of sugar, and pie crust.First you layer the inside of your can with the pie crust, then you fill it with the ingredients listed above and then you put a the rest of the pie crust on top of your pie and cut a slot to let out steam. 
then you put it in the oven and/or fire and wait about 10 minutes.If you want to make you're own crust the link below will show you how i used a oil pie crust recipe because i didn't have shortining. Oil crust:http://busycooks.about.com/od/piecrustrecipes/r/oilpiecrust.htmRegular crust:http://allrecipes.com/HowTo/Making-a-Pie-Crust-Step-by-Step/Detail.aspx. Grab a spoon/fork/knife? and dig in!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_38_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_38_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_38_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_38_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. All of the 3d printed parts1 wiper motor (wiring instructions wiring instructions. I recommend the higher speed.) 1 power supply1 gearbox kit1 ring burr1 cone burr1 fastener kitOptional: 4x sorbothane feet. Insert the two bearings into the 3d printed part named \"bottom.stl\". Take the axle that comes from the gearbox kit, and thread it into the cone burr. Add the white plastic piece that comes from the gearbox kit to the assembly, making sure the plastic tabs fit into the metal holes, and the piece seats properly. Insert the conical burr/axle into the piece from the previous step. Add the printed part named shaft_coupler_top.stl to the assembly as shown, making sure to face the rounded corners up and into the part.Then add the internal tooth lock washer that came with the gearbox kit. 
Then add and screw on the nut that came with the gearbox kit. . You will need to remove the white plastic the ring burr comes with in order to perform this step. Insert the ring burr into the top as shown, making sure to align the flats of the ring burr with the nubs in the printed part. Turn the ring burr 90 degrees inside the printed part. Insert the worm/grub screws into the holes on the sides to lock the ring burr in the part. . Using template.stl, assemble and drill as shown. I recommend using a drill press and to center punch the holes to keep the drill from wandering. . Add the motor adapter you just prepared to the motor shaft. Add the 5/16\" SAE washer Add the m6 nut and tighten until the motor turns a little bit. Add the shaft_coupler_bottom.stl Add m3 washers and bolts Attach to bottom with either m3 lock nuts or with lock washers and nuts. . Put the bottom assembly on to the motor assembly. If the bottom assembly does not seat all the way, you will need to rotate the conical burr to make sure the shaft couplers engage. The bottom of the bottom assembly should touch the top of the motor assembly. . Glue the feet on to the bottom of the motor. The grinder can get pretty noisy, and it's a good idea to use vibration dampening feet like the sorbothane feet seen here. . Screw on the adjuster (shown in blue) to the top piece (shown in yellow) as shown, taking care to line up the adjusters tabs with the nubs on the top piece. Insert the top assembly into the bottom assembly so that it is seated fully. Rotate the adjuster to adjust the grind. . 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_39_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_39_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_39_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_39_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. For this recipe you will need:1 c blueberries2 c all-purpose flour2 1/4 c buttermilk4 tbsp melted, unsalted butter3 tbsp sugar1 tsp salt1/2 tsp vanilla extract1 tsp baking soda2 tsp baking powder2 eggs. Using an electric mixer, beat the eggs until they become frothy.. Add in all of the other ingredients except the blueberries to the eggs and stir with a spoon until they are combined, but do not over mix.. Now pour 1/3 c batter on a griddle set to medium high heat. Top the pancakes with blueberries.. Cook the pancakes for 2 minutes, or until golden brown, then flip and cook the other side for 2-3 minutes until it is golden brown also.. Keep the pancakes warm while cooking the rest by placing them on a baking sheet in an oven preheated to 200\u00b0.. 3 c blueberries1 c water1/2 c sugar1 1/2 tbsp cornstarch that's been dissolved in 3 tbsp water1/2 tsp vanilla extract. In a small saucepan, add 1 1/2 c blueberries, water, sugar and vanilla and cook on medium high heat until it starts to boil.. Stir in the cornstarch and lower the heat. 
Cook for an additional 2-3 minutes, or until you get the desired consistency of the syrup that you want.Remove the saucepan from the heat and stir in the rest of the blueberries.. Top the pancakes with the syrup and enjoy!If you make these, let me know what you think!!!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_40_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_40_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_40_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_40_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Ingredients: \u2022 1lb box of Duncan Hines Red Velvet Cake Mix \u2022 1 \u00be Cups of water \u2022 \u00be vegetable or canola oil \u2022 \u00be Cups of apple sauceUtensils: \u2022 Mixing bowel \u2022 Measuring cups \u2022 Non-stick pans (preferably round) \u2022 Mixer (optional). Once completely poured into the bowel, stir the mix to get out thick chunks before adding in other ingredients. Otherwise, they will be much harder to get out later on. Also, set your oven on at 350 for time to preheat. . First, add 1 \u00be cups of water. Second, add \u00be cups of\u00a0 vegetable or canola oil. Finally, add \u00be cups of apple sauce. Mix all ingredients thoroughly until it is a smooth batter. . If a non-stick pan for baking is unavailable, use grease or butter to coat another pan. If your pan is too small for all of the batter, use another one (should fill a 3 inch pan about half way). Allow for around 30min of baking time. 
Check on the cake once 30min is up.. Check to see if the cake is thoroughly baked by sticking a fork into it. The cake should be moist, but it shouldn't crumble or be smashed by the fork.. This cake will be much more fragile than a normal cake, so be very gentle when applying the frosting if keeping the cake intact is important. \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_41_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_41_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_41_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_41_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Typically any old espresso grind will do, even Turkish grind can be okay if you don't mind the solids coming through.If you want to achieve superior quality though I HIGHLY recommend a nice burr grinder.I recently purchased the Hario Skerton, A few people on the internets believe its supposed to be Skeleton but the asian producers messed up. Either way its a fantastic product!Here's how I adjust mine;Tighten the grind setting until the mechanism will not turn freely (be gentle here)Now find the first notch that the lock will fit into, I call this position 0Now move the nut up a position this is position 1...Continue to position 3,This is the ideal grind for me while making espresso! I wouldn't exceed position 5.I use a 1:1 proportion when I make espresso. 
In a typical espresso double shot there will be 3 oz of finished espresso, Therefore I will start with 3 oz of beans or ground coffee.. Depending what drink you are making you may want different flavors in your brew. The flavor profile of your beans can be altered using different temperature water. Higher temperatures will bring out roasty/burnt tones and lower temperatures may only pull the fruity/acidic flavors. Finding a nice balance can be tricky and depends on the roast of coffee you use. I like to buy light roast coffee and brew around 150-160 degrees F to make a fruity/citrus flavor and add sugar to take off the acidic edge.. I line up the center of the (4) with the surface of the rubber plunger, this lets you reference shots (1.5 oz) on each progressive circle. Using the press upside down seals it from leaks and lets you fully saturate the coffee before steeping and pressing through.. Just add a tiny bit of water at first, this lets the coffee absorb the water fully before being saturated and helps later with frothing. stir it up and let it breathe for a few seconds.. Add the rest of your brew water and get the coffee suspended evenly. I typically fill a bit past the 2 while upside down for espresso. If you want drip style, perhaps a bit stronger, Fill to the top at this step. Cover the press with your filter and cap assembly, this will help retain some heat.Wait at least 2 minutes, or longer. The flavor may improve but this really depends on the roast your using. If you just want the caffeine boost steep for a long time! Like 10 minutes! Woo!. After waiting patiently Press that liquid gold through and watch the oils seep through in the foamy goodness! Enjoy a nice dark espresso or mix it into hot milk for a cappuccino or more milk and sugar for a latte. 
Makes a nice americano with hot water as well!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_42_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_42_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_42_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_42_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. You will need:Bacon - 1/4 lb or 4-5 strips/slicesBroccoli - approximately 4 cups of florets or 2 bunchesRed Onion - 1/2 large onion or approximately 1 & 1/2 cups when choppedPomegranate - 1/8 to 1/4 cup arils or approximately 1/4 of one pomegranateCheddar Cheese - 1/2 c shredded or 50 gramsMayonnaise- 1/3 cupWhite Vinegar- 1/2 cupWhite Sugar - 1/2 cupBlack Pepper- 1/4 tspSalt - 1/2 tsp Cutting Board &\u00a0Knife Cheese Grater Large Mixing Bowl & Salad Tongs Two Medium Bowls & a Spoon / Whisk Smaller Serving Bowls & Utensils. In a medium bowl, combine: 1/3 c Mayonnaise 1/4 tsp Black Pepper 1/2 tsp Salt Mix with a spoon or whisk until smoothly blended. Add 1/2 c Sugar and mix again. Add 1/2 c White Vinegar and mix once more. Refrigerate and shake or re-mix before serving on salad.. Cut your strips of bacon in half and pan fry. Drain on paper towels and let cool before breaking strips into smaller bite-sized pieces.. Seed your pomegranate ahead of time and set arils aside.https://www.instructables.com/id/How-To-Seed-A-Pomegranate/. Chop or dice broccoli & onion and set aside together in the large mixing bowl.. 
Grate 50 grams of cheese to make approximately 1/2 cup once shredded and add to large mixing bowl.. Add bacon bits and pomegranate arils to the large mixing bowl (which should already contain broccoli, onion and cheese).\u00a0Toss salad and serve with dressing on the side (or add dressing and toss again before serving). You can also opt to chop the ingredients more finely as shown in the third image of this step.This recipe makes enough to Serve: 5-6 people as a side dish 3-4 people as a lunch 2-3 people as an entree For a vegetarian meal, remove the bacon. For a vegan meal, remove the bacon and cheese, use vegan mayonnaise and add chopped walnuts. This recipe is gluten-free.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_43_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_43_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_43_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_43_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 500 g blackberries100 g sugarAbout 500 ml vodkaStrip of lemon peel. Put the fruit and sugar in layers in a preserving jar. Add vodka to cover. Shake every so often on the first day to dissolve the sugar.Store in a dark place for about a month.. Strain the liquid into a clean dry bottle through a layer of paper towel. Seal and label.. 
Serve in small glasses - it isn't that high in alcohol but it is full of flavour and quite sweet.It is delicious mixed with sparkling wine.It is a lovely addition to desserts such as trifle.You can make similar liqueurs from any kind of berry. Raspberries are particularly nice, and the product is a beautiful ruby red colour. Black currants need more sugar.It keeps for ages - I have kept it a year, when I lost the bottle in the back of the cupboard!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_44_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_44_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_44_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_44_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Note: I usually make a few of these while cooking up other fried foods, so normally I would use a\n\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 Bulb or two of garlic and whatever batter or breading That is left over. 
But for this, I will post a\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 mini version of one of my batter recipes , just in case you want to make the garlic by itself.\n5 bulbs of garlic\noil for fryingTHE BATTER\n1 cups flour\n1Tbsp Corn Starch\nA Pinch of salt\n1/4 tsp pepper\n1/4 tsp garlic powder\n1/4 tsp onion powder\n1/4 tbs cayenne\n1/2 tsp chili pepper\n1/2 tsp cumin\n1/4 tsp adobo seasoning\n1 beaten egg\n1/2 cup beer\n1/3 cup milk\n3 Tbsp melted Butter ( i use coconut oil)The Buffalo Sauce (optional):\n1/2 cup Louisiana hot sauce\n2 TBSP\u00a0 butter\n1 zip lock bag or small, air-tight container\n\u00a0Note: you can make half batches of the sauce for smaller amounts or garlic, if you are\u00a0 frying other treats.. 1.Mix flour, corn starch, salt, pepper, and all other dry seasonings together in large bowl\n2.In another bowl, Mix in eggs, Beer, milk, and melted butter\n3.Pour wet mixture into dry ingredients bowl and stir until well blended. It should be just a little runny\n4. Let sit in the refrigerator for 30 to 45 minutes.. 1. Peal off the garlic skin and cut off the hard bottom pieces.\n2. Heat oil in pan.\n2. Place cleaned garlic cloves into batter and coat thoroughly.\n3. Place coated garlic cloves into hot oil and cook until golden brown. (about 5-6 Minutes, flip half way\n\u00a0\u00a0\u00a0\u00a0 through)\n4. Remove from oil and drain on paper plate or towel.. 1. Combine hot sauce and butter in a small Glass or bowl.\n2. Microwave sauce on high until the butter is melted, About 45 to 60 seconds.\n3. Mix until well blended, then Pour sauce into a zip lock bag or container.\n4. Place cooked garlic cloves into zip lock bag (or container) and seat tight.\n5. Gently shake bag until all garlic cloves are completely coated.\n6. 
Remove garlic from bag and serve with blue cheese or ranch dressing.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_45_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_45_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_45_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_45_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. use plasterstrips to get an imprint of your chest - you want the bra the right size.\nI'm not showing that step, due I don't want to see my chest on the internet ;)\n- i'm sure you'll figure out how to do that.\nInstructions are found on the package of the plasterstrips.\nWhen the imprint is dry cover it with aluminiumfoil.. you can use all cookie doughs that can be used with cookiecutters.\nthe recipe I used:\n0,5 egg\n125g flour\n62g sugar\n62g butter\n1Tablespoon Cocoapowder\n1 teaspoon vanillasugar\n1 little bit bakingpowder\nand for flavour 2 tablespoons of instant cappuccino. Form the cups of the bra on your covered mold.\nmake sure to make it evenly thick - about 0,5 cm\nbake it on 200\u00b0C for about 10minutes ( may vary with another recipe). at this point you can get as creative as you want :)\nHere's what I did:\nmelt some white choclate in the still warm oven\nspread it with a clean brush on the warm bra.\nmix some white chocolate with cacoa-powder\nand paint whatever you like :)\nbrush some chocolate on the edge of the bra and sprinkle it with chocolate-pieces\nlet everything cool down.. 
carefully peel the foil of the mold\ntake a corkskrew and make holes to join the two cups in the middle - be very careful!\ntie the cups together with a nice lace or string.\nYour done!. Now surprise your beloved one and have a nice cup of tea!\n- Or whatever you like\u00a0 :D\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_46_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_46_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_46_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_46_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. You'll need the following ingredients:\n-5 ounces good quality dark chocolate, melted\n-1 pint of vanilla ice cream (there's 1.5 quarts in the picture because I always buy what's on sale--I won't tell if you do, too!)\n-2 (4 inch) cherry pies, chopped into small pieces\n-12 ounces chocolate cookies (can be plain chocolate cookies or chocolate animal crackers or chocolate graham crackers)\n-8 tablespoons unsalted butter, melted\nYou'll need the following equipment:\n-9.5 inch pie plate\n-parchment paper\n-pencil\n-food processor. This is the sketching stage. Aka the part of the project where I actually attempted to use pi. My pie plate is 9.5 inches in diameter. 
If you think back to high school math you might remember the following equation:\u00a0\nC=\u03c0D\ncircumference =\u00a0\u03c0 * diameter\nIn this case the diameter of my pie-plate is 9.5 inches so the circumference is approximately 29.85 inches knowing that I wanted to express\u00a0\u03c0 to 32 digits with room for ellipses, I estimated that I'd have to make each of my numerals less than an inch to fit it in. First I took a large sheet of parchment paper and I traced my pie plate. With my handy circumference estimate in mind, \u00a0in mind I drew the attached pattern free hand attempting to center the pi symbol in the middle.* Because I have no idea how food safe my charcoal drawing pencils are (and the idea of eating something that had been piped on top of charcoal was kind of gross) I affixed another blank piece of parchment on top so the pattern shows through but I can pipe on a clean surface.\n*When it came time to put my pie together everything fit, but it was a tight squeeze. I realize with the benefit of hindsight that I forgot to factor in the crust around the edges. To be more precise, you'll want to leave a 1/4 crust allowance, so in this case I'd draw my circle as 9 inches to reflect the layout with that allowance in mind.. Next fill a pastry bag with your melted chocolate and fit it with a small round tip (I used the #4 Ateco tip that came in my piping set). Carefully trace over the pattern that you created. Once you've piped all the numbers and the large pi symbol, you can pipe as many small pi symbols as you wish--I suggest at least 50 so you get crunchy chocolate bits in each bite. Place patterns with the piped chocolate in the refrigerator to harden for about 10 minutes.\u00a0. Add chocolate cookies to bowl of a food processor and pulse until crushed into tiny pieces. Pour in melted butter and pulse until thoroughly combined. Press mixture into a 9.5 inch pie plate.. 
In a large bowl combine 1 pint (16 ounces) worth of slightly softened vanilla ice cream with 2 (4 inch) cherry pies chopped into small pieces. Use a spatula to fold the pie pieces into the ice cream, working quickly to avoid a giant melty mess. Once the vanilla ice cream and pie have been thoroughly combined, gently fold in the small pi symbols and transfer the entire mixture into the prepared pie plate, using your spatula to spread and distribute the filling so the surface is smooth. Remove the piped chocolate pieces from the refrigerator and using the pattern as your guide place them on the top of the pie. Freeze for at least an hour to up to overnight to firm the pie up before cutting and serving. Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_47_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_47_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_47_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_47_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Turn cake upside down so you have a nice flat top, cut a small vertical line in cake. Into three , put knife in the side and other hand on top of cake and use turn table to cut across evenly , make two evenly spaced cuts. Fill with buttercream and jam , be careful to make sure the cut edges on cake match up , otherwise you may get a wonky cake . Use a large knife or spatula to cover cake in buttercream , use smoother and turn table to turn cake round whilst smoothing sides. 
Colour and kneed fondant until soft , using a little corn flour or icing sugar to prevent sticking on work surface. Roll fondant to shape of cake , approx 2-3 ml thick , keep turning fondant round to keep even width , use rolling pin to pick up fondant and place on top if cake . . Smooth with your hands and then use smoothers to smooth to fit cake, cut off spare icing , and continue to smooth using turntable to help \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_48_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_48_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_48_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_48_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Pull the head/tentacles away from the body. Scoop any remaining guts out of the body, and remove the thin plasticy quill. Rinse the body inside and out, then slice into approximately 3/4-1 inch thick rings.\nSqueeze behind the head to extrude the beak, and remove it from the center of the tentacles. Cut just below the eyes to free the tentacles, then add them to the bowl with the body rings.\nTentacles are the best part. No, really- they're fantastic.. Bring a pot of water to a boil. Add a bit of salt and a bit (1-2 Tablespoons) of wine or citrus juice. Drop the squid into the water in batches, removing it just as it turns opaque. This should take less than a minute, so be ready with a slotted spoon.Deposit the cooked squid on a paper towel to cool and dry.. 
Combine:\njuice of 2 limes\n~1 Tablespoon hot chili/garlic sauce (sriracha)\n~1 teaspoon sesame oil\n~1/2 teaspoon fish sauce (or to taste)\n~1 teaspoon rice vinegar\n1 kaffir lime leaf, finely minced (zest from those limes makes a fine substitute)\n3 finely minced shallots\n2 Tablespoons brown or palm sugar (honey or agave nectar are good substitutes)\nhandful fresh mint, finely minced\nhandful fresh cilantro, finely minced\nsalt and pepper to taste\nStir it up and taste. It should be aromatic, spicy, and acidic with a touch of sweet. Adjust the seasonings as necessary to make the sauce taste good to you.\nNote that I resisted the temptation to add a grated garlic clove to the mix- there's already garlic in the sriracha, and I didn't want to overpower the squid.. Add squid and give it a stir. Let it sit in the marinade for a bit, preferably in the refrigerator for about half an hour. More marination certainly won't hurt; you can leave it overnight if you like.. Serve cold. The longer the squid marinates the better the flavors will penetrate. This will keep for a day or two, but like any seafood it shouldn't be left to moulder in the refrigerator. We've never had any problems of this type, as this salad disappears quickly.\nGarnish with any of the herbs used in the salad and serve on funny-looking plates. 
For best results, make sure all the tentacles are showing.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_49_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_49_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_49_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_49_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Ingredients for the cupcakes:-113g Unsalted butter (softened at room temperature)- 1 3/4 Cup plain flour - 2 Teaspoons baking powder - 1/4 Teaspoon salt - 1 Egg (room temperature) - 3/4 Cup milk - 2 Teaspoon vanilla bean paste - 2 Tablespoons Canola / vegetable oil Butter Cream Frosting- 3 Cups icing sugar- 113g Unsalted butter, soft - 3 Tablespoons Milk - 1 1/2 Teaspoons vanilla bean paste- Yellow food colouringDecoration/ Toppings- 1 Punnet of Fresh Strawberries (250g)- 1 Teaspoon of sugar- Some leftover dark chocolate cake (about 2 cups worth)- Grated white chocolate. Equipment- Electric beater- Large bowl- 12 hole muffin pan- 12 Paper muffin liners- 1/4 cup measurement- Whisk- Piping bag with multiple hole tip- Small sauce pan- Small sieve- Grater. The first thing to do is to make one large batch of cupcake batter by creaming the unsalted butter (113g) in the kitchen aid mixer until the butter becomes light and fluffy. Then add the 1 cup of caster sugar and continue to beat until light and fluffy. 
Then add the vanilla bean paste and egg to the butter and cream and continue to beat them.Add the baking powder to the plain flour and whisk it together to get rid of any lumps then add 1/3 of the flour mixture to the butter mixture and beat it together. Then add 1/3 of the milk and continue to beat the mixture. repeat adding 1/3 of the flour and milk mixture alternatively until all incorporated. Finally add the 2 tablespoons of oil to the mixture and stir until combined.. Fill the muffin tins with the paper liners. Take the 1/4 cup measuring cup and fill each lined muffin hole with 1/4 cup of cake batter. Then bake the cupcakes in a preheated fan-forced oven at 180 degrees Celsius for 17-20 minutes until the cupcakes are golden brown and springy to the touch. Let them cool for about 10 minutes then transfer them to a wire rack to cool completely.. While the cupcakes are cooling make the strawberry 'tomato pasta' sauce as this will need to be cold when put on the cupcakes. Wash the fresh strawberries and cut them into small quarters. Place them in a small saucepan and let the cook for about 3-5 minutes on medium low. Then add 1 teaspoon (or more) of caster sugar to the sauce and let it cook until it gets nice and thick. This will slowly break down the strawberries into a thick sauce. To make the sauce smooth you can also break up the berries even more with a spoon and/ or sieve it to remove seeds. Then place this in the fridge until it has cooled completely.. Take the 2 cups of left over mud cake (with the frosting on it) and squeeze it in your hands to roll out small chocolate 'meatballs'. I prefer to have 3 small 'meatballs' on top of the cupcakes so you will need to make about 36 meatballs for 12 cupcakes. You can also make large ones as well but it looks more cute with smaller ones as it is more true to actual spaghetti and meatball size. Then place the meatballs in the fridge to firm up.. 
Once the cupcakes and strawberry sauce have cooled completely make the butter cream frosting. Place the unsalted butter in the electric mixer and beat until it is light and fluffy. Then add 1 cup of icing sugar to the butter and beat until incorporated, then add 1 tablespoon of milk and mix, adding alternatively until all milk and icing sugar has been added. Then add 1 1/2 teaspoons of vanilla bean paste and a few drops of yellow food colouring to the icing and beat it until the colour spread equally through the icing.. One all the separate parts of the spaghetti have been made they can be assembled. First using a small round tipped piping bag place some of the butter cream icing in it. Then in random or circular motions pipe the icing loosely on the cupcake to resemble spaghetti noodles. Then add your cold strawberry 'pasta sauce' on top of the 'noodles'. On top of the sauce place your chocolate cake 'meatballs' (3 small ones looks the best). Add a small amount of sauce on top of the cupcakes then to finish it add some grated white chocolate 'parmesan cheese'.If your friends are vegetarian do not add the 'meatballs' and if they are lactose intolerant do not add the 'cheese' as I have done for some of them.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_50_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_50_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_50_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_50_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the 
context:\n. Consumables(2) half-cup sticks of unsalted butter(4) grams of vaporizer leftovers*(1) gram of ground fresh cannabis(3/4) cup of watercheeseclothKitchen suppliesstovetop range with a low heat setting 1 quart saucepanwire mesh strainer 2-cup round dishrefrigerator*Vaporizer leftovers vary in potency. A potent strain will yield more potent leftovers. The amount of heat applied from the vaporizer will determine how dark your roast looks: lighter color generally indicates that you'll get more potency whereas a darker color indicates that you extracted the potency with the vaporizer and may need to supplement with additional leftovers or the addition of fresh herb.. Add the butter, ground herb, and water to the saucepan. Set the burner to the lowest heat setting. Set a timer for five or six hours, then wait. Stir occasionally, making sure to scrape any errant bits of material from the sides of the pan.The low heat setting ensures that you don't inadvertently overheat the THC, thereby compromising its potency. Because THC is not water-soluble and the dried herbs float, the addition of water to the mixture is solely to float the ground herb away from the heat source.Low and slow is the way to go to ensure that the THC bind with the lipids in the butter. . After your mixture has simmered for several hours, it's time to strain it to remove the solids from the mixture.To do so, simply lay a square of cheesecloth over your mesh strainer. Secure the sides with one hand while pouring the mixture over the cheesecloth. The warm butter and water will pass through the cloth and mesh, leaving behind a buttery pile of damp solids. Use a spoon to squeeze out the remaining liquids, and you should end up with a dish of greenish-brown butterwater. There may be some fine material in your cooling dish, and that's okay. 
It'll rinse off after the mixture separates and cools.Let the mixture cool at room temperature, then cover it and pop the dish into the fridge for several hours to help the fats solidify. I recommend leaving it overnight to solidify. If you must hurry, do not use the freezer. Instead, fill a larger dish with cold water and place your smaller buttery dish into it to speed up the chilling process.. Behold the power of specific gravity!You'll notice that the fats all rose to the top of the dish, where they solidified into a puck of butter. The water settled to the bottom, and any tiny bits of floating herb floated above the water but below the butter. To drain the water, I use a chopstick to poke a hole along one edge of the butter puck and another on the opposite side. Then simply pour the water out and discard it.My butter puck has little brown bits of floating herb stuck to it. A quick rinse under cool water washes that right off, and you're left with a clean batch of cannabutter. . You should be left with just under 2 cups of clean, green cannabis butter. Use it when baking, as a topping on toast, mixed into tea, or eat it plain. For the first few attempts at this recipe, pay close attention to the potency and use more or less vaporizer leftovers as needed. Ingested marijuana takes longer to reach your bloodstream than vaporized marijuana. It can take anywhere from 30-90 minutes to feel the effects. If you have just eaten a heavy meal, it will take longer for the THC to reach your bloodstream. Conversely, if you haven't eaten for several hours, the THC will hit your bloodstream faster. The psychoactive effect lasts longer when THC is ingested (up to eight hours) so don't drive, operate heavy machinery, or hang out with your in-laws in the hours immediately following ingestion. 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_51_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_51_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_51_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_51_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. -Take little ghee in vessel and add chopped nuts and raisins of your choice.Saute them till they change little brown color.Remove and transfer them to plate. To same vessel add teaspoon ghee and add half cup semolina.Saute till it changes little color and you get nice aroma.. Add 4 cups milk and stir continuosly on medium flame.. Add 3/4 cup sugar and previously sauted nuts and raisins and mix. Cook for about 20 minutes by mixing in intervals till semolina is cooked and milk is reduced in volume. 
Finally garnish with nuts and raisins and serve .\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_52_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_52_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_52_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_52_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. We used a slight adaptation on Martha Stewart's Fresh Beet Pasta recipe (see link)Ingredients:8 ounces red or golden beets (about 2 medium)2 large eggs1 large egg yolk2 1/2 cups all-purpose flour (white or stone ground), plus more for dusting1 teaspoon salt. Preheat oven to 375F Rinse beets to remove dirtPlace beets in casserole dish and fill halfway with water We found that this took approximately an hour and a half with our large beets - would recommend an alternate roasting method (wrap in foil) or if you are also dealing with large beets, cutting them in quarters will helpCook beets until softenedBeets are done when skin rubs off easilyPuncture with a fork for an early read - should easily pierceCool beets and remove skin. Cut skinned beets to fit your food processor Puree thoroughly - may still look a little chunky In food processor, mix the following: 1/2 cup pureed beets2 whole eggs 1 egg yolkPuree until lightened in color. 
Mix in a bowl 2 1/2 cups of flour (all purpose white or all purpose stone ground for a chewier texture) 1 teaspoon of saltAdd beet-egg mixture and mix until combined (should look slightly dry) Turn dough onto clean, lightly floured counter top and knead until firm Dough should maintain form but not be crumbly. Separate dough into 3 parts Roll dough with rolling pin until very thin (thickness around a penny is ideal) Maintain a well floured surface Frequently flip/move dough to reduce stickingNote: Divide into more pieces if you are working with less counter space. The thinness of the dough is critical to the final mouthfeel of the bowties. We used a small glass from a brewery tour as a make-shift cutting toolNote: You can buy a special tool for the more traditional crimped edge (link here) and cut pasta into wide strips, and then smaller squares and proceed . Place finger in center of circle Pinch outer edges towards center Remove finger and pinch middle (hard) to form a seal Admire your work. After forming bowties, set aside for 10-15 minutes to dry a little. This will help them maintain their form during and after cooking. . Drop bowties into simmering salted water in small batches You know the pasta is done when it floats to the top Strain pasta and drain of waterPlace cooked pasta aside, add olive oil to prevent stickingAdd any additional olive oil to tasteSprinkle goat cheese crumbles on top after plating. Rosemary Chicken (Roasted in the oven at 350F until cooked through) Chicken tendersFresh rosemaryOlive oilGarlicSalt & PepperBaby Squash Medallions (pan fried)Chopped baby squash Olive oilGarlicChopped onionsLemon juiceTruffle oilSalt & PepperShaved Parmesan3. Side saladRoasted beetsLettuceGoat cheese (crumbled on top). 
One serving shown here with Grapefruit Gin Fizz Cocktails\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_53_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_53_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_53_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_53_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. INGREDIENTS:\n1 lb. of thawed ground beef or deer burger\n1 tsp. of salt\n2 dashes of pepper\n1 dash of worchestershire sauce\n1 dash of liquid smoke\nA liberal covering of papaya enzyme or meat tenderizer\nA handful of chopped dried fruit (cherries, blueberries, and strawberries work well)\n. Chop the dried fruit and mix all of the ingredients by hand thoroughly.\n. Roll the meat into the size of a golf ball. Place it on aluminum foil and press the meat flat so it dries evenly. . Place it in your food dehydrator or the oven with the door open at 200 degrees for 4-6 hours. . The jerky is done when it firmly bends. If it has drops of oil on it, just dab with a paper towel. 
\nTo store, just place in a ziplock and put it in the fridge.\nDelicious!\n\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_54_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_54_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_54_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_54_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. To match the colors of candy corn (yellow, orange and white), the following ingredients were used:Pineapple slices Mandarin OrangesYogurtWhipped Cream (optional)I didn't provide the amount of ingredients since it really depends on the size of your container. To make these two small glasses, 3 pineapple slices, 12 pieces of oranges, 6 oz of yogurt and a little bit of whip cream were used. . Dice the pineapple slices. Add to the bottom of the container.. Add a layer of mandarin oranges over the pineapple. . Add a layer of yogurt over the oranges. Top with whip cream, if desired.. Now, go eat!. 
Added a few props, including the wood version of candy corn from last Halloween :).\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_55_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_55_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_55_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_55_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Here's how to make the fondant. (There's a bunch of instructables on this...so check them out if this isn't clear enough) It's fun to mold...and great for cupcake toppers! You'll need: Powdered sugar, marshmallows and water. Put 4 oz. of marshmallows in a microwave bowl with 1 tablespoon of water. Heat them up in the microwave for about a minute until they are puffy! Stir them up. Add powdered sugar until smooth and able to handle. (I didn't measure...you just have to keep adding it until it's clay like enough). Sprinkle powdered sugar on a silpat or counter. Put a little bit of the fondant on it.\u00a0 Roll it thin with a rolling pin. about 1/8 inch. Get your sprinkles and gel caps ready. Use scissors to cut off the ends of the gel caps.. On the rolled out fondant... Place a pearl or sprinkle... Place the gel cap over the top... And press the gel cap onto the fondant... Pick it up and it should stay in place! It's perfect and wiggly! Make some matching sets. Think of all the occasions you could use these... Add them to fondant figures for cupcake toppers! Great for Halloween!. Now for the cute pumpkins! 
We added some orange food coloring to some of the white fondant... And some green to a little bit of the white... Rolled the orange into balls...pumpkins! Used a toothpick to create lines... Add a green stem... And, of course, add the googly eyes! Then create a little scene of friendly pumpkins!. Great toppers for cakes or cupcakes...brownies or other fun Halloween dessert! Hope you liked this instructable--we had so much fun! Check out my blog Doodlecraft for more fun! And don't forget to vote!\u00a0 :)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_56_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_56_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_56_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_56_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Heat your oven to 356f or 180c. Cook for 45 minutes to an hour.The potatoes are ready when they are folk tender.. . Place the sliced potatoes into a large mixing bowl.In a small bowl add two table spoons of whole egg mayonnaise, followed by one table spoon of sour cream.Mix together.. Then a cup of grated cheddar cheese.Next add our mixture of mayonnaise and sour cream.Season with salt and pepper.. To add some colour, add \u00bc of a cup of chopped chive. I like to use scissors for chives, it creates less mess and you only use what you need.Mix the chives into the salad.. 
To finish it off, top with the remaining cheese, bacon and chives.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_57_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_57_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_57_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_57_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. \n Here is what you need for this tasty pie!Ingredients:\n\n\t\tFlour Tortillas (I cut up the small to make the mini pies, and use a whole large one for the pie)\n\t\tVanilla Ice Cream\n\t\tCorn Flakes\n\t\tCinnamon Sugar\n\t\tButter\n\t\tHoney (So important, but I forgot to put it in the picture!)Special\u00a0Supplies:\n\n\t\tCupcake Pan\n\t\tPie Pan\n\t\tPastry Brush (if you have it, if not, I found a spatula\u00a0worked fine)\n\t\tRamekin\u00a0(or something of similar size for cutting out mini pies)\n\t\tCutting Board and Knife\n\t\tPlates\n\t\tOven and Microwave. \n Time to get the crusts ready. \u00a0I knew what I wanted to do, but not really how to do it. \u00a0Luckily, I found a nice tutorial online on how to bake cinnamon sugar tortillas.Mini Pies:\n\n\t\tTake your small tortilla, cutting board, knife and ramekin.\n\t\tTake your ramekin and lay it on the tortilla against one edge. 
\u00a0I planned it out so I could get two from each tortilla.\n\t\tCut around the ramekin.\n\t\tRepeat on other side of tortilla, they might not be perfectly round, but that doesn't matter.\n\t\tOnce you have how many you want, melt some butter in a bowl and get your pastry brush and cinnamon sugar.\n\t\tBrush butter on the tortilla and coat in cinnamon sugar. \u00a0Do this on a plage, it get's messy.\n\t\tFlip the\u00a0tortilla\u00a0and coat the other side with butter and cinnamon sugar.\n\t\tPut it in the cupcake pan, be careful not to tear it. \u00a0The sides will fold and ruffle.\n\t\tRepeat for all tortillas till the pan is full or until you have how many you want.Full Pie:\n\n\t\tGet a plate, your tortilla, melted butter, pastry brush and cinnamon sugar.\n\t\tCoat the tortilla, as you did with the minis, with butter and cinnamon sugar on both sides.\n\t\tPlace it in the pan, molding it to its shape.\nYou will be preheating your oven to 350 degrees\u00a0Fahrenheit. \u00a0I cooked both the mini and full for about 13.5 minutes. \u00a0I tried a full 15 like the linked tutorial says, but that was too much for mine. \u00a0You want it to start to get hard, but don't want them getting burned or too brown. \u00a0Let these cool. \u00a0If you use them right away, your ice cream will just melt in them. \u00a0You can make them ahead of time and store them in an airtight container until you are ready to use them.\n . Might as well prepare your pie topping while the crusts are cooking. \u00a0Take some Corn Flakes (there is no exact amount) and crunch it up. \u00a0You can leave the corn flakes and cinnamon sugar\u00a0separate\u00a0and add them on top of the pie\u00a0individually\u00a0or you can mix them now. \u00a0If you mix them, make sure when you are putting it on the pie that you get both. \u00a0The cinnamon sugar will want to sink to below the corn flakes.. \n Now it's time to put it all together. 
\u00a0So simple, isn't it!\n\t\tTake your pie crusts and put some ice cream inside. \u00a0Don't put too much in the full pie, a nice layer of up to an inch should do it. \u00a0\n\t\tFor the big pie, I put ice cream in a bowl first and smushed it up so it would be easier to put in the pie. \u00a0While the crust seems pretty strong, I didn't want to risk smashing it to pieces already. \u00a0I also used a knife to spread it in the pie pan after I put it in.\n\t\tSprinkle your pie topping on top of the ice cream.\n\t\tLastly, drizzle on some honey, it makes all the difference! \u00a0(It's kind of hard to see the honey in the pictures, but it's there! \u00a0Don't put too much now, a little goes a long way. \u00a0Best way to handle this, is put some on one of the minis and eat it and decide if you need more or less.). Eat your pie!\u00a0\nI originally wanted to do just a full pie, but didn't think it would work. \u00a0I thought that the crust (tortilla) would just crumble if you tried to cut pie slices. \u00a0Scoochmaroo suggested making minis and I thought that would be perfect!\u00a0 I love bite sized. \u00a0\nI decided to do a full sized pie at the same time, because I figured if it didn't work, I could still eat it! \u00a0In the end, it worked fine. 
\u00a0It cut pretty nicely, staying in tact.\u00a0\nYou can store the pies in the freezer and they seem to keep okay, but the sooner you can eat it, the better!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_58_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_58_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_58_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_58_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. What you need per serving for 2:100grams flour50grams Lievito Madre2 eggssaltYou will also need a filling, either take a pesto or mix 100grams spinach with 150grams ricotta. You can either make the Lievito Madre yourself which takes 25 days or buy a dried version.Tools:pasta makersilicon kneading matkitchen scale. This is actually easier than usually with pasta dough, because you already have a dough mixture and only add more to it. Keep in mind the Lievito Madre usually consists of flour and water in a ratio of 2 to 1. You probably use a discard of it after refreshing, keep the discard in a glass container in the fridge and when you want to prepare the pasta dough, add 100grams of flour and 2 beaten eggs. Now comes a little chemistry, the microorganisms in the Lievtio Madre are still alive, that is why you add a full teaspoon of salt. The microorganisms will die from plasmolysis, their enzymes are still active for some time. This will enrich the dough with a lot of flavour. 
Don't worry about the amount of salt, when you cook them in the water any excessive salt will diffuse out. Wrap the dough in cling film and store it in the fridge for 2 hours. If you think the dough is too wet then knead it on your silicon mat and gradually add more flour.. Take the dough out of the fridge and roll out the pasta on your kneading mat into thin sheets using the pasta machine. Then take a tumbler with a wide opening and press it into the dough to gain circular dough sheets.. Like I mentioned in the beginning, for the explanation I will use mashed Anko to achieve better pictures. Originally it is a mixture of herbs with fresh cheese or meat like in picture 2.. Lay one of the circular sheet between your thumb and index finger of your non-dominant hand. Put in a spoon of your filling. Push in a little fold with the index finger of your dominant hand. Then with the middle finger and index finger you make a fold from the right into the mid like in picture 3. With your thumb and middle finger you make a fold from the left like in the fourth picture.. Continue with the folding technique until you reached the end and it becomes too hard. Then just press the 2 sides together to form a tail (la coda).. Store the pasta on a plate with flour or they will stick to it. Do not dry them in the sun or the filling will leak on the bottom.. You need a big cooking pot with boiling salted water. If you need more information to improve your cooking procedure, have a look at my Nerikomi Pasta instructable. The pasta is big and it might make a splash if you just throw them in. If you have a small kitchen sieve, put the pasta in there and then gracefully lay them into the water. The pasta should be done after 5 minutes when they all swim on the surface. Drain them in your sink and saut\u00e9 them into a hot pan with butter. Add sage and then serve.. For an extraordinary effect when you serve guests, try black dough with a red filling. 
Don't be worried the white dough for storage could make it look grey, it will mostly wash off during cooking.Enjoy your pasta!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_59_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_59_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_59_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_59_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. What you will need:\nRice Krispies or similar\nSugar\n1 jar of smooth peanut butter, smallish size, probably 1lb but don't have any right now so can't check!\n1 1lb (454g) can of golden syrup\nChocolate for coating, 8oz (225g) or so.\n1 large saucepan\nSpoon for mixing, easier with a metal one\nLarge tray, this one is about 12\" by 18\" (31 by 36cm)\nSpatula, rolling pin or anything to smooth out a stiff mixture.\nGreaseproof paper (baking parchment)\nSome strong arms, either yours or borrow a pair from somewhere.. 
Put all the peanut butter and syrup into the pan, scraping it all out, then fill up the syrup can with sugar and pour that in too.\nHeat it while stirring over a low flame until the sugar has mostly dissolved and it looks as runny as it's likely to get.\u00a0 Don't heat too fast or leave it to stand for too long because it will start burning on the bottom easily - you're not making toffee, just melting it all.\nBeware, this gets very hot and will stick to you and burn if you try putting your finger in!\u00a0 Don't be tempted to try any without cooling it first.\nThe nutty smell may attract raiders from the garden, have a peanut or two on hand to fend them off!. Using the syrup can again, fill it up with Rice Krispies and tip them in.\u00a0 Do this five times, but not all at once because it'll be pretty hard to mix without spilling rice all over the place.\nNow mix it up!\u00a0 It's a very tough job so this is why you'll need the strong arms... hopefully you managed to borrow a pair.\u00a0 Keep going until all the rice is coated and it's an even mixture.\nWhen it's done, line the tray with baking paper and tip the mix onto it.\nSquish it roughly flat with a spatula or spoon and then use a rolling pin (or just the spatula) to get it as smooth as possible and covering the whole tray.\nSet it aside to cool for a bit while you melt the chocolate.\u00a0 Maybe take a short break if you think it (or you!) needs it.. Melt the chocolate the normal way over hot water, then pour it over the chew and make it even.\u00a0 It might not look like you have enough but this amount is just right.\nDone!\u00a0 Leave it for a while to cool until it's ready to cut.. Now another hard part.\u00a0 Score lines down the chocolate with a sharp knife and then slice it into squares.\nThis gets tiring quickly... 
I had a blister after doing several trays of this.\nAnd that's it!\u00a0 Except for the washing up.\u00a0 This may look like a nightmare but it actually comes off very easily with just warm soapy water apart from any burnt toffee on the bottom of the pan, which will need chipping off with something.\nThese freeze well, and even taste good when frozen so don't count on them being safe in the freezer.\u00a0 I've thought of other stuff like maybe adding small marshmallows with the rice but haven't tried it yet...\nEnjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_60_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_60_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_60_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_60_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. So you take the chips and put them on the plate. Put as much as u want . Now put the first layer of cheese on top of the chips again put as much as u want remember it's only the first layer. Now it's time to put the second layer of chips so just put chips on top of the first layer . It's time to up the second layer of cheese on top of the second layer of chips and u can put as much cheese because this is ur last layer. Now heat up the nachos for 30 seconds in the microwave . 
Now you get to eat it!!!You can add different toppings if u want it's totally up to you \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_61_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_61_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_61_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_61_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Cut ends off of the rutabaga and scrub it well under running water since its a root vegetable.\nPeel the root with a sharp knife, cutting away the outer skin all the way around the vegetable. Be careful to cut away from your body on a cutting board, the skin can be tough to cut.. Cut the root into matchstick sized pieces. Try to make the pieces as uniform as possible so they will cook in the same amount of time.. Boil a pot of salted water.\nWhen the water is simmering, add the matchsticks and let them cook for 3-4 minutes until they are just able to be pierced by a fork.. Drain the fries and shake out the water.\nPrepare a dish with olive oil and your choice of seasoning, I used salt and freshly ground pepper.\u00a0\nLay out the matchstick sized pieces on a cooking sheet in a single layer and baste the fries with the olive oil mixture.\nBake for 15 minutes. When they are ready they should be easily pierced by a fork but crispy on the surface and a little browned but not black. Be careful not to overcook the fries because they will be dry and chalky.. 
I prepared fries from the rutabaga (the orange colored ones) and also yucca fries (the white colored ones). Yuca fries are prepared in the same way. Boil them separately though so the flavors don't mix as rutabaga can be a strongly flavored vegetable.\nEnjoy! They are best served piping hot!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_62_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_62_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_62_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_62_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Dough\n1 package active dry yeast\n1 1/4 cup warm water\n1 tablespoon sugar\n2\u00a0 tablespoons olive oil\n1/2 teaspoon salt\n2 3/4 cups whole wheat flourFilling\n2 cups shredded mozerella\n1/2 cup tomato sauce\n2 cups of your favorite pizza toppings. 1. Mix warm water, yeast, sugar and oil. Let stand for about 10 min. Mixture should be bubbly.\n2. Add salt and flour. Mix until the dough sticks together. Knead dough until it becomes smooth, about 10 min. Add water as needed.\n3. Put the dough in a bowl, cover with and cloth and allow to rise for about an hour, or until it doubles in size. Placing the dough in a warm spot will help the dough rise faster. Prepare the filling while the dough is rising.\n4. Divide the dough into 12 equal pieces and form balls.. 1. Chop your favorite toppings into small pieces, this will be the bun filling.\n2. Stir shredded cheese and tomato sauce into chopped topping mixture.. 1. 
Roll each piece out until it is 4-5 inches in diameter.\n2. Place 2 tablespoons of topping in the middle of the dough.\n3. Pull the side up, twisting to seal the dough.. 1. Place the buns, seam down on small wax paper squares in the preheated rice cooker.\n2. Steam for 15-20 min. Add water to rice cooker as needed. These buns are easy to pack for lunches. They freeze and reheat in the micowave beautifully!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_63_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_63_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_63_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_63_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. \n\tThe cookie ingredients:\n\t\t2 sticks of unsalted\u00a0butter\n\t\t1 cup sugar\n\t\t1 cup brown sugar\n\t\t2 eggs\n\t\t1 tsp vanilla extract\n\t\t1 1/2 cups all-purpose flour\n\t\t1 tsp baking soda\n\t\t1 tsp salt\n\t\t1 tsp cinnamon\n\t\t2 cups oatmeal\n\t\t1\u00a0cup\u00a0sweetened shredded coconut\nAnd the additives:\n\n\t\tFritos (I crushed these up)\n\t\tMini Reese's cups\n\t\tMini chocolate chips\n\t\tToasted almond slivers (which I toasted and\u00a0crushed)\n\t\tPeanut butter chips (not in the photo since it was a late addition)\nI had many of these ingredients in my kitchen already. \u00a0So I suggest going through your cabinet, taking any delicious ingredients and adding them into the mix. \u00a0Yes, fritos are a bit outrageous, but I love them. 
\u00a0If that is too much, try pretzels or potato chips because crunchy and salty are a necessity folks.. Preheat the oven to 325\u00b0F. \u00a0\nNow for the pan I only had a 13\" x 9\" available so I chose to use it. \u00a0Unfortunately this made the bars a bit thick and somewhat overwhelming. \u00a0I suggest going following RecipeCarr's recommendation and using a 16\" x 12\" x 1\" pan for thinner bars. \u00a0Whichever pan you go with, make sure to grease it well.\nCream the butter and sugar. \u00a0Then add the eggs and vanilla to the mixture.. In a separate bowl whisk together flour, baking soda, salt, and cinnamon.. With the mixer on low, gradually add the dry to the wet ingredients. Then stir in the oatmeal and coconut. Make sure they are well mixed.\nAt this point I mixed in the fritos, Reese's, chocolate chips, almonds, and peanut butter chips until just combined. To make this less daunting, I added one at a time, but try not to overmix.. Press the dough into your pan. \u00a0Try to spread it evenly.\nBake for 40-45 minutes in a 13\" x 9\" pan. If you use a 16\" x 12\" pan then the time is reduced to 20-25 minutes. Make sure to check up on them and use your discretion. I waited until everything was evenly browned and the middle was no longer jiggly.\nAllow the bars to cool before cutting them into squares. \u00a0My pan yielded 24 bars which were a bit too large for some of the guests...\nEnjoy and remember to share! 
(Especially because eating more than one will increase your cholesterol drastically.)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_64_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_64_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_64_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_64_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 24 \u2013 28 - 1 quart/litre size Mason jars. Any brand will work nicely.\nLids and rings - make sure you buy/have the right size for your jars\nLarge bowls (for the peeled tomatoes)\nLarge pots for boiling water to scald the tomatoes\nSmaller pot of boiling water to simmer the seals & rings in (I like to rings to be warm when placed on the seals)\nA paring knife and a chef's knife\nMeasuring spoons\nSalt (I use sea salt - but any good table salt will work)\nClean towels\nJar lifter for grabbing those hot jars\nCanning funnel for filling the jars\nMagnetic lid lifter \u2013 for taking the hot seals and rings out of the water\nCanning Jar Wrench \u2013 I use this to empty the hot water out of the waiting jars (saves your fingers)\nPressure Canner - the one I use is a Mirro 22 Quart \u00a0weighted Gauge. 
Wash jars \u00a0in hot soapy water, rinse well.\nThe tomatoes will be cold pack, raw, so it is not necessary to keep the jars hot.\nI keep the jars warm,\u00a0so they don't crack when you add the water to the canner.\nFilling them 1/3 full with boiling water will keep them warm.\nNote:\u00a0 while you are washing\u00a0jars,\u00a0 put two larges pots of water on the stove to boil - these\u00a0will be used to scald the tomatoes.\n\u00a0. \u00a0Wash the tomatoes and set aside.\nTransfer the tomatoes in small batches into the boiling water for 30 to 60 seconds - until skin shows a crack.\nOnce the skin shows a crack, remove from boiling water and plunge into cold water.\nI use my kitchen sink filled with cold water, that way when the water gets warm from the tomatoes - I can release some of the warm and add more cold.\nHint:\u00a0 too many tomatoes in the pot cools the water right down - works best with 6 to 8 tomatoes in the pot.. The skin, should pull right off with a paring knife.\nAfter all the tomatoes are skinned it is time to chop and fill the jars.\u00a0 To keep the rim clean I insert the canning funnel into the jar before I fill them.\nWhile you are chopping warm the seals in rings in a pot of boiled water.\nAdd 1/2 cup white vinegar to the canner (keeps hard water stains off the jars and the inside of the canner), and 2 to 3 inches of hot water before adding the jars.\nUser the jar wrench to pick up the jar and empty the water, just before filling the jar.\nAdd 1/2 to 1tsp of salt to the jar - depending on your taste.\u00a0 I only use 1/2 tsp - just enough to flavour the tomatoes.\nRemove the top off\u00a0 the tomatoes chop into quarters and fill the jar.\u00a0\nUsing a wooden or plastic spoon press down on the tomatoes in the jar.\u00a0 You want to get as many tomatoes in the jar as possible.\u00a0\nI found that it to 8 to 10 tomatoes for each quart jar.\nOnce jar is full place a hot seal and ring on the jar, tighten down and place in pressure canner.\nThe 
canner will hold 7 - 1 quart jars on.\nOnce all the jars are in the canner the water should be 2 to 3 inches from the bottom of the jar.\u00a0 As you can see from the picture\u00a0 the jars raise the level of the water when they are added.\nIf the water is too high remove water with a measuring cup.. \nPut the lid on the canner\u00a0 secure tight, weighted petcock should be\u00a010 lbs pressures.\n0-1000 ft sealevel - 10lbs pressure for 15 minutes\nabove 1000 ft\u00a0 seal level - 15lbs pressure for 15 minutes\nVideo\u00a0shows 15lbs pressure.\nPicture shows 10 lbs pressure.\nTurn the burner on high, heat until the petcock is furiously dancing,\u00a0then turn the heat down to medium-high - the petcock should be doing a gentle dance at this time.\u00a0\nTime for 15 minutes.\nAfter 15 minutes remove the canner from the burner.\u00a0\nThe canner will take 20 to 30 minutes to cool and release all the pressure.\u00a0 You will know when it is safe to open the canner when\u00a0 you don't hear escaping pressure anymore.\nOpen the canner, remove the jars with the jar lifter, dry with a towel and place on a towel to cool.\u00a0\nIf you have more jars to process you can put them in the canner, make sure the water level is correct, seal the canner, turn on the heat and start again.. 
\nHere they are - finished and ready to store in a cool place, and enjoy until next summer!!!\nCanning the tomatoes this way gives you more versatility \u2013 you can throw them on pasta with olive oil and spices, make pasta sauce out of them, add them to stews and soups or any recipe you can dream up.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_65_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_65_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_65_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_65_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. For this cake I used a 4 inch 375ml spring form pan to make my cake, so it is pretty small because that is the way that I like it. However, you may also use a more common 9 inch 2250ml spring form pan by just adding more crust, ice cream and bananas (I worked it out to be about 6.x this recipe). To make this recipe for the 4 inch 375ml spring form pan you will need the following ingredients and materials:1 tablespoon butter1/4 cup Oreo baking crumbs1/2 cup strawberry ice cream1/2 cup vanilla ice cream1/2 cup chocolate ice cream1 banana1 maraschino cherrywhipped cream4 inch 375ml spring form panMeasuring spoonsIce cream scoopMeasuring cups. To make the crust, pour a half cup of Oreo baking crumbs into a small bowl. Combine with 1 tablespoon of melted butter. Now press the mixture firmly into the bottom of your pan and put it into the freezer for 10-15 minutes. . 
To add the ice cream, first scoop a half cup (or enough to fill up a third of the space left in the pan) of chocolate ice cream into the pan, making sure to flatten it out using either the back of the ice cream scoop or a knife. If you would like to have crisper borders between ice cream flavours you can clean the ice cream that has made its way onto the side of the pan with a cloth. Put this into the freezer for about 20 minutes or until the ice cream feels hard to the touch. Now, repeat the steps used in making the chocolate layer with the vanilla and then strawberry layer. Make sure that when you do the strawberry layer that you make it especially smooth because that is what you will see at the top of your cake. Once you are done the strawberry layer, let your cake stay in the freezer for about 2 hours or until completely solid. This way your cake will freeze through completely so you can get the best results! To make any of the layers more smooth just use slightly more melted ice cream and freeze it for a bit longer. After you have finished all the layers, take the cake out of the spring form pan and put it onto a plate. . Peel the banana. Slice both ends off of the banana, and then from there, slice the banana into two pieces long enough that they would be able to fit from the bottom to the top of the ice cream cake (if you sliced the banana in half that would probably work about right, too). Now cut both pieces lengthwise. To put these on the cake, first make teeny notches (each of these represents where one banana will go) in the cake that divide it into quarters. This is so that you are not ruining your entire cake once you realize that you have done a lopsided job of putting the bananas on. When sticking the bananas to the cake have them facing outside out and inside in. Also, make sure that you are not moving them around too much as you put them on to avoid smudging the cake. 
At this point if you are not going to serve the cake right away you should put it in the freezer until you are ready to decorate and serve it.. I think that this is the most fun of all the steps. First, you put one small squirt of whipped cream onto the top of each of the bananas. Now dust a small amount of leftover Oreo baking crumbs fairly centrally on the cake. Now to put a cherry on top, we will literally put a cherry on top. First squirt a dollop of whipped cream into the center of the cake and then put your maraschino cherry on top of that.And there you have it: A Banana Split Ice Cream Cake! Thank you so much for taking the time to read through my intractable I really hope you enjoyed it and will enjoy eating your cake! :)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_66_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_66_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_66_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_66_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 
\n Here is what you need for this tasty pie!Ingredients:\n\n\t\tFlour Tortillas (I cut up the small to make the mini pies, and use a whole large one for the pie)\n\t\tVanilla Ice Cream\n\t\tCorn Flakes\n\t\tCinnamon Sugar\n\t\tButter\n\t\tHoney (So important, but I forgot to put it in the picture!)Special\u00a0Supplies:\n\n\t\tCupcake Pan\n\t\tPie Pan\n\t\tPastry Brush (if you have it, if not, I found a spatula\u00a0worked fine)\n\t\tRamekin\u00a0(or something of similar size for cutting out mini pies)\n\t\tCutting Board and Knife\n\t\tPlates\n\t\tOven and Microwave. \n Time to get the crusts ready. \u00a0I knew what I wanted to do, but not really how to do it. \u00a0Luckily, I found a nice tutorial online on how to bake cinnamon sugar tortillas.Mini Pies:\n\n\t\tTake your small tortilla, cutting board, knife and ramekin.\n\t\tTake your ramekin and lay it on the tortilla against one edge. \u00a0I planned it out so I could get two from each tortilla.\n\t\tCut around the ramekin.\n\t\tRepeat on other side of tortilla, they might not be perfectly round, but that doesn't matter.\n\t\tOnce you have how many you want, melt some butter in a bowl and get your pastry brush and cinnamon sugar.\n\t\tBrush butter on the tortilla and coat in cinnamon sugar. \u00a0Do this on a plage, it get's messy.\n\t\tFlip the\u00a0tortilla\u00a0and coat the other side with butter and cinnamon sugar.\n\t\tPut it in the cupcake pan, be careful not to tear it. \u00a0The sides will fold and ruffle.\n\t\tRepeat for all tortillas till the pan is full or until you have how many you want.Full Pie:\n\n\t\tGet a plate, your tortilla, melted butter, pastry brush and cinnamon sugar.\n\t\tCoat the tortilla, as you did with the minis, with butter and cinnamon sugar on both sides.\n\t\tPlace it in the pan, molding it to its shape.\nYou will be preheating your oven to 350 degrees\u00a0Fahrenheit. \u00a0I cooked both the mini and full for about 13.5 minutes. 
\u00a0I tried a full 15 like the linked tutorial says, but that was too much for mine. \u00a0You want it to start to get hard, but don't want them getting burned or too brown. \u00a0Let these cool. \u00a0If you use them right away, your ice cream will just melt in them. \u00a0You can make them ahead of time and store them in an airtight container until you are ready to use them.\n . Might as well prepare your pie topping while the crusts are cooking. \u00a0Take some Corn Flakes (there is no exact amount) and crunch it up. \u00a0You can leave the corn flakes and cinnamon sugar\u00a0separate\u00a0and add them on top of the pie\u00a0individually\u00a0or you can mix them now. \u00a0If you mix them, make sure when you are putting it on the pie that you get both. \u00a0The cinnamon sugar will want to sink to below the corn flakes.. \n Now it's time to put it all together. \u00a0So simple, isn't it!\n\t\tTake your pie crusts and put some ice cream inside. \u00a0Don't put too much in the full pie, a nice layer of up to an inch should do it. \u00a0\n\t\tFor the big pie, I put ice cream in a bowl first and smushed it up so it would be easier to put in the pie. \u00a0While the crust seems pretty strong, I didn't want to risk smashing it to pieces already. \u00a0I also used a knife to spread it in the pie pan after I put it in.\n\t\tSprinkle your pie topping on top of the ice cream.\n\t\tLastly, drizzle on some honey, it makes all the difference! \u00a0(It's kind of hard to see the honey in the pictures, but it's there! \u00a0Don't put too much now, a little goes a long way. \u00a0Best way to handle this, is put some on one of the minis and eat it and decide if you need more or less.). Eat your pie!\u00a0\nI originally wanted to do just a full pie, but didn't think it would work. \u00a0I thought that the crust (tortilla) would just crumble if you tried to cut pie slices. \u00a0Scoochmaroo suggested making minis and I thought that would be perfect!\u00a0 I love bite sized. 
\u00a0\nI decided to do a full sized pie at the same time, because I figured if it didn't work, I could still eat it! \u00a0In the end, it worked fine. \u00a0It cut pretty nicely, staying in tact.\u00a0\nYou can store the pies in the freezer and they seem to keep okay, but the sooner you can eat it, the better!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_67_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_67_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_67_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_67_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. After washing them, place them on the cutting board.. Make sure they're facing the same way.. Roll all the way from bottom to top\u00a0. Hold the roll down and get ready to cut.\nWatch your fingers.. Make sure to cut all the way through.. Now you have fancy garnish for your food !\u00a0. 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_68_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_68_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_68_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_68_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Place almonds on a baking sheet and bake at 375F.\nCheck after 4-5 min, then every few minutes after that.\nShake the pan once in a while for even browning\nPull them out when you see just a hint of toasty brown.\nThe almonds will continue to bake as they cool.\nCool your almonds.\nIf you want to store them before making your hearts, you can protect them from humidity in a sealed jar.. Select nicely matched pairs of almonds, and lay them out in pairs\nUsing a mandolin or use a grater, shave or grate your almonds to make a half-heart shape.\nTrim to suit. .. Use a double boiler, or devise one.\nFit a bowl over a pan of simmering water,\nAdd your favorite dark chocolate.\nCover it to keep steam at bay.\nWait.\nCheck (stir- {er, taste}) every few minutes.\nWhen you have a smooth, easily poured substance, kill the burner, but let the chocolate sit in the hot water.. In this exercise, I just took a spatula out of my melted chocolate.\nWiping a bit of chocolate on an almond, I assembled the two pieces, using chocolate as a cement,\nWait for these to set up, before continuing on to the next step.. 
\nThis is pretty self explanatory.\nLet the excess chocolate drip off, before placing these on waxed paper.\nChopsticks work well as an implement. . I sprinkled my mandolin shavings onto these hearts, while the chocolate was still melty.\nCareful who you give your heart to.\nI heard a sad story of a lady who ribbon-wrapped a tin of these for her hopeful-who snarfed them down like so many candy bars on Halloween.\nI passed mine out at work, in the tissue-lined paper clip box you see here.\nNo one noticed the box.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_69_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_69_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_69_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_69_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Cake Ingredients: 1 cup butter-softened 1 1/2\u00a0cups sugar 4 large eggs 3 cups all-purpose white flour 3 tsp. baking powder 1 tsp. salt 1 cup milk 2 tsp. vanilla extract Paper baking cupcake liners Vegetable cooking sprayYield: 24 cupcakes or 2 dozenDirections: 1. Preheat oven to 350F. Prepare cupcake tins with liners, set aside. 2. Beat butter and sugar at medium speed with an electric mixer until creamy and smooth. Then add eggs, 1 at a time, mixing until well blended after each egg. 3. Combine flour, baking powder, and salt together in a small bowl. Add to butter mixture alternating with milk. You should begin and end with the flour mixture. Mix at low speed until bleneded. 
Add in vanilla extract. 4. Spoon cake mix into cups, filling 2/3 full. 5. Bake at 350 for 12-15 minutes or until toothpick inserted comes out clean. 6. Cool in pans on wire rack for 10 minutes, remove cupcakes from pans and set on wire racks to completely cool.. Filling Ingredients: 1 8oz cream cheese-softened 1/3 cup powdered sugar 2-3 Tbsp. coffee liqueur(Kahlua) or 1 Tbsp. coffee extract 1/8 tsp. saltYield 2 CupsDirections: 1. Combine all ingredients in a medium bowl, mixing until well blended. Store any remaining filling in container in refrigerator-up to 2 weeks.. Once cupcakes are completely cooled, cut tops off of the cupcakes using a serrated knife. Then spread 1 Tbsp. of Tiramisu Filling on the bottom part of the cupcake, gently place the top back on. . Frosting Ingredients: 1/2 cup butter-softened 1 8oz cream cheese-softened 2 16oz packages powdered sugar 1/4 tsp. saltYield 5 cupsDirections: 1. Beat butter and cream cheese at medium speed until creamy and smooth. 2.\u00a0Gradually add in the powdered sugar and salt, mixing at low speed. Beat at high speed for 2 minutes until creamy and thick. 3. Frost each cupcake by using a spatula, knife or piping bag and tip. . For the finishing touch dust/sprinkle with Hersheys Cocoa Powder. . After all your hard work, you can now enjoy your Tiramisu Cupcakes! 
Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_70_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_70_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_70_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_70_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Unsalted butter is recommended, but not required. For availability's sake I used salted butter the last time I made it. From what Baba said, you could also use fresh cream. . The ghee is essentially heated butter. You'll want to melt the entire stick at high heat. When the whole stick is melted, reduce to a low heat. Remember to keep the pot uncovered while heating. One of the ideas is to remove moisture from the butter. You should see steam and bubbles escaping throughout the boil.. It takes about 10-15 minutes for the entire process. You'll know you're done when at low heat bubbles are slower and steam can't be seen. At this point there is little or no moisture left. \nThe next step will be to separate the ghee and the remaining milk solids. I used 2 bowls and a fine mesh strainer for this. Repeat the process of moving ghee from one bowl to the other until solids can't be seen. There will be solids left if you're using salted butter. Use oven mitts as ghee gets very hot. . The most confusing part of ghee is the colour. It's usually an olive oil colour. The first image is the hot ghee from this tutorial. The second is the ghee I made a year ago. 
The third is the first image several hours later. Ghee has a semisolid texture. \nThe cool thing is that by removing moisture and lactose sugars, bacteria have little to feed on. This gives ghee an unlimited shelf life. Refrigeration is not necessary. \nWhen finished, this oil can be used in many recipes. I would not recommend using a lot (1 Tbs is loads) since saturated fat and cholesterol are present. That means baking recipes may not be ideal. \nThis is my entry in the fried food contest. I thought that showing the creation of one of the ingredients would be clever. If you found this useful for your entries, please like or comment.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_71_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_71_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_71_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_71_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Makes 5 to 6, 250 ml jars.4 cups or 1 liter unsweetened apple sauce.\u00bd teaspoon vanilla extract.1 sachet of pectin powder.1 teaspoons of ground cinnamon.\u00bc tsp ground cloves.Wild apples are tart so adding a sweetener is necessary; you can substitute honey, maple syrup, or corn syrup for Splenda, just match the recipe cup for cup. If you do not worry about sugar intake just use 2 cups sugar. You can go sugar free with 2 cups Splenda or some other artificial sweetener, or 1 cup sugar and 1 cup Splenda for low sugar. 
I used No Cook Pectin, if you do not process the jam you will need to refrigerate it, refrigerated the jam will keep six months. Processing allows you to store the jam at room temperature in a dark place and it doesn\u2019t affect the pectin negatively.. I make 2 liters of apple sauce at a time. Dice 5 liters of apples and place them in an 8 liter pot or large saucepan, add 1 liter water and bring to a boil and reduce heat. Simmer and stir until apples are soft, approximately 30 minutes to an hour depending on the size you diced the apples.Using a large sieve mash the soft diced apples removing all the unwanted parts, you should have about 2 liters apple sauce.. Clean and sterilize and prepare 6 bottles.Measure 4 cups or 1 liter apple sauce and place the apple sauce into a clean saucepan and add the sweetener, cinnamon, cloves, and vanilla extract.Bring to a boil stirring often after 5 minutes reduce heat and simmer, still stirring, until puree thickens and holds its shape on a spoon.Add the no cook pectin and stir it in.. Fill the jars with jam to within \u00bc of aninch (5 mm) of the lip of the Jar.Remove air bubbles and add more jam if necessaryPlace the seals on the jars and screw on the rings loosely. Place in a pot of water covering the lids by one inch (25 mm) cover and bring to a boil for 10 to 15 minutes.. 
Once the Jam has processed for 10 to 15 minutes remove from the heat and let stand for 5 minutes.Remove the jars of jam from the processing pot wipe off the outsides and tighten the lids.Place the jars of jam on a rack to cool.Once cool label and place the jars in a dark place for storage.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_72_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_72_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_72_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_72_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. For the zombie eyeball martini: Gin (6 parts) Vermouth (1 part) Radishes Pimento-stuffed olives For the bloody eyeball cocktail: Vodka (2 parts) Triple sec (or any orange liquer, e.g. Cointreau, Grand Marnier; 1 part) Lime juice (1 part) Cranberry juice (2 parts) Lychees, canned and peeled in a can Cranberries (fresh or dried)Dry ice is optional for both, but makes for a great creepy effect - and of course very effectively chills the drink.. Wash a radish, taking care to retain the root (the \"optic nerve\" of the eyeball). Trim the crown off so as to leave an exposed white area the same diameter as your olives. Using a small sharp knife, carve out an olive-sized hole. Partially peel the radish, going for a venous and broken-capillary look. Pop an olive in the hole, pimento-stuffing poking out, and use the radish to garnish a martini (6:1 gin:vermouth). 
If you have no dry ice, shake the ingredients over ice in a cocktail mixer, and add to a chilled martini glasss. If you DO have dry ice, simply pop a chunk in the drink and serve (caution the recipient not to imbibe until the drink has stopped smoking). The original (?) recipe calls for making the eyeballs the day before, chopping off the root and freezing them into ice cubes overnight, but I imagine a frozen eyeball is harder to snack on. I recommend just making them fresh - their structural integrity is good, and it's easy to pull the eyeball out of the glass by the optic nerve and munch noisily on it.. This drink is a kamikaze and cranberry juice garnished with a fuming, rotting eyeball.\nStuff the cavity of a peeled lychee (from a can) with cranberries, as many as it takes to have them slightly protruding. Pop in a martini glass,and add triple sec (or any orange liqueur - we used Grand Marnier), then lime juice, and vodka (1:1:2) and top up with cranberry juice. Add a chunk of dry ice and serve (again, make sure the drinker knows to wait until the smoking stops before consuming). If you have no dry ice, mix the ingredients (but not the eyeball!) in a cocktail shaker with ice, shake then strain into the glass over the eyeball.\u00a0. Your guests should be encouraged to eat the eyeballs after knocking back the cocktails - both types are tasty and not just for decoration. 
The orange liqueur-soaked lychee+cranberry decomposing bloody eyeball is delicious, and the stuffed radish zombie eyeball is positively healthy.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_73_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_73_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_73_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_73_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Ingredients:Beef, top round or any other cheaper non-fatty type works best 1 cup Soy sauce 1 tbsp Molasses 2 tsp Liquid smoke 2 tsp Black pepper 2 tsp Garlic powder 2 tsp Onion powder 2 tsp Red pepper flakes 1 tsp Ghost pepper salt 1 tsp Cayenne pepperThis recipe makes enough marinade for about 2.5 pounds or a little over 1 kg.Equipment:A food dehydrator like this one from NescoA sharp knife A large glass or ceramic bowl A cutting board Paper towels. Start the process the day before you want your finished jerky. Throw your beef in the freezer for a couple hours or, if frozen, remove from the freezer for about an hour (this will all depend on how much you have). Since thin slices of beef are ideal for jerky, having the beef partially frozen makes it easier to cut consistently thin pieces.Once the beef is thawed on the outside but still slightly frozen on the inside, put it on a well-washed cutting board and pat it dry with a paper towel. Trim as much of the fat off as possible then slice the beef into \u215b\" to \u00bc\" (3-6mm) slices. 
Cutting with the grain with a really sharp (not serrated) knife works best. Here I'm using a top round steak, you may use any cut of meat you like but remember that meat with a high fat content will become rancid faster, which makes this company's filet mignon jerky practical yet decadent!. In this instructable I'm using a marinade (wet method) to flavor the jerky. There are other methods you can chose, such as a dry rub, however I enjoy the flavor the marinade brings to the beef.Wash your hands and bowl well then start by adding all of your ingredients (minus the beef) in your large bowl. Separate the beef slices well, since they tend to re-freeze together when in a pile, and add the beef to the bowl a few slices at a time followed by mixing by hand. Ensure all of your beef is coated well.If you have more meat than marinade, simply prepare another bowl with marinade and repeat the steps above. It's easier to work in smaller batches than a large unmanageable pile that might risk an uneven marination of the beef.Cover and put the bowl in the refrigerator overnight or for at least 12 hours. For best results, mix the contents once or twice during this period.. The next day (anywhere from 12-24 hours later) remove the bowl from the refrigerator and wash and dry your dehydrator racks as the manufacturer recommends. If you do not have a dehydrator, wash the metal grates of your oven well and line the bottom of the oven with foil.Remove the strips of beef from the marinade and arrange on the racks in one layer without overlapping, allowing for a little bit of air flow around each piece. When removing the strips of beef from the marinade, allow them to drip-dry, you want some marinade to coat the beef strip but not too much. Assemble your dehydrator and set at 160\u00b0F (~70\u00b0C).Revisit your dehydrator every hour to check the progress and to dab away any fat that is collecting on the top of your strips. 
With my dehydrator, the process took about 5 hours, this will vary depending upon how thick your strips are and the model of your dehydrator.If you do not have a dehydrator, this can be done in your oven by setting it as close to 160\u00b0F as possible and laying the beefs strips across the oven's metal grates. Prop the door of the oven open slightly with a wooden spoon to allow for the warm, moist air to circulate out. Please be aware that gas ovens pose the risk of carbon monoxide/dioxide poisoning when propped open, so if you go this route make sure you have plenty of ventilation.. Your jerky is ready when you are able to tear the strips along the grain, they should be pliable but not soft and fairly stiff but not brittle. At this point, turn your dehydrator off and store your jerky in a clean and dry container lined with a paper towel and a loose fitting lid. Jerky is shelf stable for about 2 weeks at room temperature and one month in the refrigerator.Congratulations, you have now made some super simple, spicy and delicious jerky at home! I encourage you to try tweaking the recipe to your liking. Substitute in dried peppers, hot sauce, smoked salts, different herbs... the combinations are endless. 
Just remember to keep any added fats to an absolute minimum and if you decide to use anything but beef, cook the meat to the USDA recommended internal temperatures first before dehydrating (including game meats).\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_74_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_74_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_74_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_74_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. First, we need ingredients. Duh. A cake without ingredients is just, well, air. For the cake, you'll need...2 Tbsp Butter 2 Lg Eggs 1 \u00bc Cup Granulated Sugar 1 \u00bc Cup Flour \u00bd Cup Cocoa Powder 1 Tsp Baking Soda \u00bd Tsp Baking Powder \u00bd Tsp Salt \u00bc Cup Heavy Cream \u00bc Cup Whole Milk \u00bc Cup Water 1/8 Cup Vegetable Oil 1 Tsp Vanilla ExtractFor the buttercream, you'll need...1 Bar Butter 1 \u00be Cup Powdered Sugar \u00bc Cup Cocoa Powder 1 Tsp Vanilla Extract 1 Tbsp Heavy CreamFor the ganache, you'll need...125g Chocolate Chips \u00bd Cup Heavy CreamAnd to top your cupcakes, you can use chocolate sprinkles, Oreos, chocolate chips, chocolate chunks, cocoa powder dust, or anything else chocolate.. First, pour all your dry ingredients into a bowl. The dry ingredients are...Flour Cocoa Powder Baking Soda Baking Powder SaltMix it all together with a spoon, fork, or whisk. . 
The wet ingredients include...Heavy Cream Whole Milk Water Vegetable Oil Vanilla ExtractAnd whisk with a whisk (keep in mind that the oil will separate, so it doesn't have to be perfectly blended). . Beat the 2 Tbsp of butter in a mixer (or use a whisk) until smooth and creamy. Then slowly add in the granulated sugar until the mixture becomes fluffy and crumbly. Next, add in your two large eggs, one at a time. Be sure the first egg has mixed in completely before adding in the second.. Now, with the mixer on low speed, alternate adding in the dry ingredients and wet ingredients, ending with the dry.. This is super simple. Just line a cupcake/muffin tin with cupcake liners, and scoop two big tablespoons of the batter into each liner. This should come up to about 2/3 to 3/4 of the way up the cupcake liner. Give the tin a good shake to even out the tops of the tins. . If you haven't already, preheat your oven to 350 degrees F, and bake for 18 minutes. . Cream your butter but mixing it on medium speed until light and fluffy. Then sift in half of the powdered sugar along with all of the cocoa powder, and continue mixing, now on low to medium low speed. Once all the dry stuff has been well incorporated into the butter, sift in the second half of sugar and continue mixing until well incorporated. Add in the vanilla extract and mix until well incorporated. Then finally, if necessary, add in up to one tablespoon of heavy cream to lighten up your frosting. Mix until combined. . Heat the heavy cream until a low simmer, then pour over the chocolate chips and stir until completely melted (feel free to use a microwave if your chocolate still is not completely melted. Heat 15 seconds at a time).. Put your buttercream into a piping bag fitted with a large star tip. Start at the center of your cupcake, and pipe in an outward spiral. As you approach the edge of the cupcake, decrease the pressure to create a tapered end. 
Then, using a bottle with a squeeze tip or a piping bag, drizzle some ganache over the top, and sprinkle with some of your toppings. . And that's it! Surprising how easy that was, right? Now take that first bite. Mmmmmmm. So good. If you enjoyed this recipe, please check me out on other social media sites, including facebook, instagram, tumblr, twitter, and, of course, YouTube. You can find me as \"joshpancooking.\" Thanks for all your support, love you all, and goodbye!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_75_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_75_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_75_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_75_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Ingredients you will need:\n1\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Package of Oreos\n1/3\u00a0\u00a0\u00a0\u00a0 Cup of Butter or Margarine\n3\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Packages of Cream Cheese (8 oz. each)\n3/4\u00a0\u00a0\u00a0\u00a0 Cup of Sugar\n1\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Tsp of Vanilla\n1\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Cup of Sour Cream\n4\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Eggs\nTools and Equipment needed:\n9 in spring form pan\nFood Processor\nMixing bowls\nElectric Mixer (Optional)\nMixing spoon\nDry measuring cups\nKnife. 1. 
Preheat the oven to 350 degrees Fahrenheit\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 If your pan is a dark color preheat oven to 325 degrees instead2. Finely crush 28 cookies in a Food Processor\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Tip:Add only a few cookies in at a time. I find this the fastest and easiest way to crush the cookies. Also if you dont own a food\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 processor you can crush the cookies up in a large Ziploc bag with a mallet or any tool you can find. Just make sure it is a\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 fine powder before you mix it with the melted butter3. Melt the butter or margarine\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a04. Mix the crushed cookies and the melted butter\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Tip:\u00a0 Make sure all of the mixture is moist and that it is mixed well with no dry cookie powder\n\u00a0\u00a0\u00a05. Press the mixture onto the bottom of your spring form pan and 2 inches up the side of the pan\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Tip: I find it the easiest to press the mixture down with a 1/4 cup measuring cup. Also adding the mixture slowly at a scoop at a\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 time, making sure you are pressing the mixture down firmly before adding more of the mixture.. Tip: Try not to mix the batter to heavily since the more you mix the more air you add into the batter, which can lead to the cheesecake cracking when it is baked.Tip: Do not add all of the batter ingredients at once. You want to add the wet ingredients slowly so the risk of cracking your desert is as minimal as possible.Tip: Mixing the batter by hand with a wooden spoon instead of using an electric mixer your cheesecake is less likely to crack when it is baked. 
If you choose to mix the batter with an electric mixer, mix on the lowest speed possible.1. Warm the cream cheese to room temperature\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Warming the cream cheese should only take about 30 minutes.2. Chop the rest of the Oreo cookies into small pieces\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 WARNING: Using a sharp knife can lead to physical injury always be careful when handling sharp tools.\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Tip: I find it easiest to break the cookies up with my hands.3. Beat cream cheese and sugar in a large mixing bowl\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Make sure you are not mixing it to much. Your mixture does not have to be completely mixed before going on to the next step.\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 You will have time to make sure your mixture is smooth after all the ingredients are together.4. Add sour cream and vanilla to the mixture5. Add eggs\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Only add one egg at a time. Make sure that each egg is blended in thoroughly before adding the next egg.6. Mix in the chopped cookies\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Try as hard as you can to make sure that there as little chunks in your batter before you pour your batter into the crust.7. Pour the batter into the crust\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 . 1. Place the cheesecake in the oven\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Set the cheesecake in the center of the oven. If your cheesecake is not in the center of the oven you may burn the top of your\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 desert.2. Bake for 55 min to 1 hour\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Remove the cheesecake from the oven when you see that the center of the cheesecake is just about set.3. 
Run a knife lightly around the rim of the cheesecake\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 You want to make sure that the crust is not attached to the side of the pan while cooling. You only need to run the knife down\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 a few centimeters of the crust. There is no need to run the knife all the way down the crust. If you do this you may run into the\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 possibility of breaking your crust.4. Let cool at room temperature for 1 hour5. Remove Spring Form Pan's rim6. Refrigerate for 4 hours\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Your desert is not ready to eat until the cake has completely set. You can test this by slicing through the cake and if it pass\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 through with very little resistance like it is still has moisture in it refrigerate until it is firm. . Tip: If your cheesecake has large cracks in it you can smooth out the cracks by doing three easy steps.\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1) Wet a rubber spatula\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 2) Run the spatula over the cracks to smooth out the cracks\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 3) Let dry in refrigerator\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\nNow your delicious desert is complete and you can enjoy your wonderful creation. You can add your own decorations to your cheesecake that you would like. I find drizzling chocolate syrup over the cake adds style and a great chocolate accent to your desert. 
Now show all your friends at your next party how great of a cheesecake you can bake!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_76_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_76_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_76_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_76_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 1- INGREDIENTS:a) 1 box of JELL-O powder\nb) 1 cup of boiling water\nc) 1 cup of ice cold alcohol (the colder, the better)\n Note: For the demonstration, I used plain 40% alcohol, but it works just as fine with vodka or rum.2- TOOLS:a) A big bowl\nb) Any kind of container you want to serve your shots in. (I recommend plastic shooters or plactic cups).\nc) A measuring cup that can measure at least 1 cup.. 1- Empty the Jell-O pouch into the big bowl.2- Add 1 cup of boiling water.3- Stir vigorously until the powder is completely dissolved.Note: Step 3 is crucial if you want to avoid the awful skin that you find sometimes at the bottom of your JELL-O... I heard stories about people that actually LIKE this skin.... I think it's gross.... we used to call it \"JELL-O placenta\"4- Add 1 cup of ice cold alcohol and mix well again.Note: The colder your alcohol is, the better the result. I think this has to do with the activation of the gelatin. Personally, when possible, I keep the bottle of alcohol in the freezer for several days before making my shots. Don't believe the skeptics: the bottle won't blow up!!. 
Once the Jell-O shot mix is ready, pour it in the containers you want to serve the shots in. Sometimes I just leave it in the bowl and then I eat it all at once... I guess you can call this a JELL-O shotgun!!!Put it all in the refrigerator for a couple of hours. Here you will have to check once in a while if the jelly has set because the exact time needed depens on the containers you use and the \"weather conditions\" inside your fridge.. It is now time to amaze your guests whit this very special treats!!!!To get the maximum effect out of your shots, I suggest you throw it as a chunk in the back of your throat. The chance of suffocation is non-existant and the less time it spends near your tongue, the more you can trunk in without being annoyed by the alcohol's taste.\nSo chug it down and feel the party heat come up from your belly!!!!!As you may or may not know, a generous amount of alcohol ingested in a short amout of time can be bad for you. In some instances, it can even kill you.Therefore, if you want to have fun with JELL-O shots for years to come, please use it responsibly.Thank you and have great parties,BILOU\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_77_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_77_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_77_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_77_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 
You need a Non-ripen Guava fruit , choose it well.A knife to chop it.Chilli Powder for adding spiciness.Salt if you are looking for a better taste.. Take the Guava fruit into your hand can chop off its head as it wont be good.Now slice it vertically from the top till the bottom.Perpendicular to the sliced line again slice it in the same pattern mentioned above .. Now take all the slices separately , you will have four slices.Take a slice of Guava and cut it into pieces as shown in the images.Repeat the same process for the rest of the slices.You will have this at the end of the process.. As you can see some nice slices of Guava don't be in any hurry and eat them ,there are still few things to be added to them.Its time to add chilli powder.Take chilli powder powder preferably into your figures and gently sprinkle the powder on the slices.It adds a beautiful look to the slices.. Now add a bit of salt to the chilli powdered slices if you don't want them to be very spicy.Add salt in such a way that it should be 1/4 th of the chilli powder that u added lately.Adding more of salt also ruins the taste of the Guava.Now just see how amazing it looks so that your mouth can't resist to have it.What are you waiting for, Have a Slice Baby:)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_78_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_78_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_78_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_78_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + 
"context": "Here is the context:\n. \n You may make this with or without Cream Of Mushroom Soup. If you don't add the Cream Of Mushroom Soup; use 1 can of Tomato Soup with a mixture using the empty can \u00a0( 1/2 water to 1/2 cup milk or powdered milk. ) The powdered milk will mix better. Fresh herbs would be super with this if you have them available.\u00a0\n\t1 Can Tomato Soup\n\t1 Can Cream Of Mushroom Soup\n\t1 Can Water\n\t1 Can milk or powdered milk\n\tPinch of Baking Soda if using regular milk\n\t1/2 Teaspoon Italian seasoning using 1/4 t in the tomato soup before cooking and 1/4t \u00a0in the serving bowls.\u00a0\n\tPepper as desired at time of serving\n\tParmesan cheese as desired at time of serving. \n\t1 sauce pan\n\t1 Large spoon\n\t1 Whisk\n\t1\u00a0Ladle not shown\n\tCan opener\u00a0. \n\tPour the tomato soup into the\u00a0sauce\u00a0pan.\n\tAdd 1 Can of milk. If using regular milk add a pinch of Baking soda to help it mix together. If using powdered milk; it is not necessary.\n\tMix well.\n\tHeat on medium heat.. \n\tAdd 1 can of Cream Of Mushroom Soup.\n\tAdd 1 can water\n\tMix well.. \n\tAdd 1/4 teaspoon Italian Seasonings.\n\tMix well.\n\tCook until hot.. \n\t\u00e2\u0080\u008bPour into serving bowls.\n\tSprinkle the \u00a0 Italian seasoning , Parmesan, and Pepper to each bowl of soup if desired.\u00a0\n\tServe with crackers or . . .\n\tThis soup goes well with a grilled cheese\u00a0sandwiches!\u00a0. \n\tMy girlfriend has taught me a few tricks about adding flavor to dull foods. She always adds a little fresh ingredient to any prepared foods to give it a homemade flavor. It does not taste as wonderful as cooking from scratch but it certainly taste better! 
If you would like a great recipe for a grilled cheese sandwich here is my recipe:\u00a0https://www.instructables.com/id/Grilled-Cheese-And-Bacon-Sandwich/\n\tIn closing I would like to thank our instructables company, sponsors, authors, readers, and members; for making this community a great success! Many hours and hard work has been put into making this place the best DIY on the Internet. Have fun and thanks for stopping by!\n\tSunshiine\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_79_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_79_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_79_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_79_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 1 (20 oz) pkg Oreo cookies4 Tbsp butter or margarine8 oz pkg cream cheese8 oz frozen whipped topping1 cup powdered sugar2 pkg instant vanilla pudding3 1/2 cups milk. I break the cookies up by hand, one at a time. It doesn't take very long. I divide the cookies in half and break half of them into the bottom of a 9x13 pan and the other half into the cookie container. (They will go on top at the end.). Cream together cream cheese, margarine or butter and powdered sugar.I use my stand mixer for this. You'll need to mix another part of the filling separately, so you can choose to use two separate bowls (and maybe a hand mixer for this one). I chose to scrape out the mixture with a spatula into a separate bowl. 
The spatula works well, and it doesn't matter if the bowl isn't completely clean because it all eventually gets mixed together.. In a separate bowl (or in empty stand mixer bowl), combine vanilla pudding and milk. I pour the milk into the bowl, and slowly pour in pudding mix as it is mixing.. Stop the mixer, dump in the whipped topping and blend well.. Add the cream cheese mixture to the filling and mix until well blended and smooth. (From my experience, some small lumps don't affect the end taste or texture. Larger lumps can sometimes affect texture. Just keep mixing!). Pour filling on top of bottom layer of crushed cookies. Sprinkle the remaining half of crushed cookies on top. (It will look like the cover picture.) Refrigerate for several hours before serving.Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_80_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_80_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_80_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_80_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Prepare both Cake Mixes according to instructions.\nMeasure 2 c Red Velvet Cake Mix\nMeasure two 1 cup portions of White Cake Mix\nIn one of the White Cake Mix portions add food coloring to create a pink. I used the Neon Food Coloring Set and added 16 drops of Pink and 2 drops of Blue to come up with a Dusty Rose shade.. First pour in the two cups of Red Velvet Cake Mix, spoon the White Cake Mix over it. 
Don\u0019t worry about covering the entire area.\nLast add the Pink batter and use the back of a spoon to gently spread it over the surface. Bake according to Cake Mix Directions.. If you are using Fondant to cover your cake, you will need a 20 inch rolling pin with rings to make your fondant smooth have a uniform thickness all over.\nLay Fondant on a large cutting board.\nPlace measuring rings on the end of your rolling pin and start by working the Fondant from the center outward in both directions.\nAs Fondant thins out use a pizza cutter to trim away uneven ends while you create your 'sheet.'.\nNote: The beauty of Fondant is that it provides a super smooth surface on your cake to work on and you don't get all sticky or mess up your icing.. When ready to place Fondant over your cake roll the Fondant onto the rolling pin part way,\nLift it gently so as not to stretch your Fondant and have someone slide your cake underneath onto the cutting board and gently drape the fondant back over your cake.\nNote: It is preferable to spread a thin layer of butter cream frosting over the cake to help the Fondant stick to the cake, however the Fondant recipe I use has marshmallow in it and it shapes very well. Also the marshmallow gives your Fondant a pure white color.\nStarting from the center outward smooth the Fondant over the edge of the cake and shape it to the cakes outer edge down to the cutting board.\nUse a pizza cutter to trim away most of the excess Fondant to make it more workable and reduce weight as you lift and adjust. Leave about 1 inch excess until you have your entire cake covered.\nWhen you reach the top of the heart make a straight cut to the inverted point for easier fitting.. Once you are satisfied with the fit of your Fondant use the pizza cutter to trim away all excess.\nPrepare a cake board by laying a Grease Proof Doily designed for food use and cut around the outside edges.\nCenter Fondant Covered Cake on Doily and Cake Board.. 
For this Cake I used a Gel Writing Food Pen to write 'Be Mine' (always start in the center and work your way outward). \nI used a Red Icing Tube with a writing tip to create the flourishes then dabbed a small amount on Licorice Whips to hold them in place as the outline and put stripes down the sides. \nThen I switched to a star tip and put small star shapes at the top and bottom of each licorice stripe to better secure it in place. \nFinally I laid two foil wrapped chocolate cream hearts on top with a dab of icing to keep them secure.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_81_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_81_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_81_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_81_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Here is what I used to make my marinated frozen grapesWhite grapesSugarWhite WineA pitcher . I started by mixing the wine and sugar into a pitcherAdd 1/8 cup of sugar and half bottle of wine. This made 6 servings for me.After adding the ingredients I stirred the sugar and wine together until the sugar was completely dissolved in the wineOnce the mixture was complete I dumped about 1 1/4 cup of grapes into the pitcher. Enough grapes to completely submerge into the wine without being too cluttered together.. Next put your grape pitcher in the fridge to let the grapes marinate in the wine and sugar. Keep the grapes marinated for 12 to 24 hours.. 
Once the grapes are fully marinated I pulled the marinated grapes out and strained the grapes out then patted the grapes dry.I then added a sugar coating to the grapes. I started by putting a small coating of sugar in a bowl then i put a small handful of grapes into the sugar and rolled them around. Then I put the grapes into a plastic container.NOTE: I did not coat all of my grapes with sugar because I wanted some a little tarter.. After all the grapes wanted are coated in sugar I place the container of grapes in the freezer for about 4 hours before serving to your friends and family. You can either have these as a snack which makes a great summer snack. You can also choose to use them as ice cubes for your wine. Especially if you choose not to coat some of them with sugar. Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_82_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_82_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_82_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_82_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. To make this dish for 2 you will need the following:2 fillets of sea bass, your fishmonger can do this for youSalt and pepper1 knob of butter1 yellow bell pepper, thickly sliced and the seeds removedAround 10 cherry or vine tomatoes, roughly sliced in half or left whole1 bunch of asparagus with the tough ends broken off, this will be anywhere between 10 and 15 pieces, depends on the thickness of the asparagus. 
If they are a very thick then give them a little longer or slice them length ways2 Tbsp of spicy pesto, or any pesto you like, simply from a jar100ml dry white wine1 handful of flat leaf parsley, you will only need the leaves which you can just pick offoptional, a little water in case you want to loosen the saucesome mixed leaf to your liking, you could use rocket, watercress, spinach or what ever you like in any combination you like. The vegetables with the exception of the asparagus will be cooked in a griddle pan, you want charing so this needs to be as HOT as possible, put this on full wack.NOTE: do not put oil on the griddle otherwise you will be frantically opening windows and hitting the smoke detector. You wont need oil and it will only burn which will ruin lunch and no one wants that!Put another pan on a medium heat and allow to heat up while seasoning the fish, season both sides with salt and pepper. Salt on the skin will help to give you that infamous crispy skin. You could score the skin if it is a thick fillet which will actually help the heat to permeate allowing the fish to cook quicker. But there really isn't any need.TIP: Never put put your fish into a cold pan, you want it to be up to temperature first for a nice crispy skin.. Put your peppers on the griddle, these are going to char and blacken which is just what we want, they will go soft and sweeten as the natural sugars cook.In the fish pan, put a good knob of butter, let this melt down for a few seconds and move the butter around the pan. Then, gently lay the fish in the pan skin side down - do not touch the fish or the pan now, it can be tempting to mess around with the fish but you want the skin the crisp up and the meat to gently cook.TIP: Don't be tempted to move the pan around and mess with the fish, just let it cook.. 
Keep an eye on your peppers, move them around.After 4 - 5 minutes you will see the edge of the fish at the thinnest points start to turn a light white colour, when this happens it is time to turn the fish. Take a fish slice and very carefully turn the fish over, keep close to the pan so not to splash butter everywhere and keep the delicate fish in one piece. Cook the fish for 2 - 3 minutes more, keep checking it to make sure it doesn't overcook/ burn.Get some foil or a plate ready for when the fish is cooked to put it to one side.Check the fish by gently lifting it with the fish slice and peaking underneath, it should be just brown, remove from the pan and put to one side.TIP: Fish is considered tricky and many people over cook it but if you keep an eye on it then it is really easy, as soon as the fish looses it raw colour and the flakes of meat just start to come away from each other it is ready. Just be patient and as soon as it is done, get it out of the pan.. Now we are coming to the end and the last of the ingredients cook super fast.Turn the peppers again and throw the wine in the fish pan, you want to save all the delicious flavour from the pan so don't wash it first. This is called deglazing the pan.Put the asparagus in the wine and put a lid on top, the asparagus will take around 2 minutes to become tender and steaming them in wine and the fish butter will make them shinny and delicious.At the same time, put your tomatoes on the griddle, they will cook fast because of the sweet sugars and the soft flesh. They will be ready around the same time as the asparagus.. Asparagus really doesn't take very long, as soon as the stems are tender use some tongs and get them out of the pan, put to one side for plating up later.Don't throw the wine away from the fish pan, this is going to be the base for the super simple sauce - the flavours of the fish and asparagus are too good to waste.. 
When it comes to sauces there is nothing more rewarding than making your own from scratch but sometimes you want something quick and easy so there is no shame in using a nifty cheat here and there.For this one the secret is pesto (you could even make your own pesto), here we used a spicy tomato pesto. Add your pesto to the wine in the pan and mix in. You may need to add a splash of water to loosen the sauce. Add the flat leaf parsley at the end and stir in.Take the vegetables off the heat and put in a bowl, set to one side. It is best to get the veg out of the pan, the griddle is a big heavy chunk of metal and will hold the heat for a while, consequently continuing to cook the food in it.TIP: When you are making sauces, a splash of water in many cases can do wonders. If you take a sauce too far or kept it warm a little too long, reduced a little too much then a dash of water can be your saving grace.. Bring your dish together and serve with a glass of white wine, spoon the sauce on and around your perfectly cooked fish.Add a light garnish of green leaf, peppery rocket works a treat here. Enjoy as a great quick lunch, alfresco if you can :) \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_83_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_83_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_83_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_83_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 
\nIngredients for the pie crust:\n2 cups all-purpose flour\n1 cup vegetable oil or butter\n1/2 cup cold water\nIngredients for the filling:\n3/4 cup sugar\n1/4 cup cornstarch (can be substituted with 1/2 cup flour)\n6 cups berries, frozen or fresh. I used 3 cups raspberries, 2 cups blackberries, and 1 cup blueberries, all fresh.\n1/2 or 1 lemon depending on preference\nExtra sugar to taste\nMaterials:\nMeasuring cups\nMix master or stirring spoon/fork\nRolling pin\nWax paper (or extra flour)\n2 large mixing bowls\nPlastic wrap\nPizza cutter (or normal kitchen knife)\nLemon juicer\n9-inch pie dish. \nAdd 2 cups flour in a bowl. Mix in 1 cup of oil in 1/4 cups. You can mix it either by hand with a fork or with a mix master. Mix until the flour and oil are a thick liquid that is almost solid (image 1).\nSlowly mix in 1/2 cup of cold water (it is important the water is cold!). The mixture will solidify into an oily dough (image 2).\nCut the dough into two slightly uneven \"halves\" (you want a bigger \"half\" and a smaller \"half\"). Wrap them in plastic wrap and stick them in the fridge for at least an hour, up to a day (image 3).\nNote: if you are in a time crunch, you can put the dough in the freezer for 30 minutes, but be sure to flip them over after 15 minutes. The side touching the metal grate cools faster.. \nMix 3/4 cups sugar and 1/4 cup cornstarch (or 1/2 cup flour) in a large bowl.\nAdd 3 cups of berries to a different bowl. The blueberries are there, just on the bottom (image 1).\nPour the sugar-cornstarch/flour mix over the berries (image 2).\nCoat the berries evenly using your hand. It is okay if the berries get crushed. It may be helpful to pour all the ingredients back and forth between the two different bowls (image 3).\nWhen thoroughly mixed, juice half a lemon and pour the lemon juice and the pulp into the mixture. Mix the lemon in. Let the berries stand for at least 10 minutes. 
You should at this point have a fruity syrup with a lot of solid fruit in it (image 4).. \nPreheat the oven to 400 degrees Fahrenheit. Use convection bake if your oven has that option.\nPut a sheet of wax paper on the counter (if you don't have wax paper, a floured surface works). Put your larger \"half\" of the dough on the wax paper, and put another piece of wax paper over it. Roll the dough to an approximately 12 inch round (image 1).\nTake off the top piece of wax paper. Oil or butter your pie dish (make it non-stick) and flip it upside down. Put the dish on top of the dough, and flip the whole thing back over. The pie crust should \"fall\" into the dish, with the piece of wax paper on top (image 2).\nPeel the wax paper off (carefully!). Fix any tears that you have. If your pie crust isn't large enough, you can take a some of the 2nd batch of dough and fill up the edges. The pie crust should hang over the dish very slightly (image 3).\nNote: you want your pie crust on the bottom to be relatively thick so it can hold the filling. Fill in gaps with extra dough, don't just pinch tears shut.\nSpoon/pour your filling into the pie crust. Be careful not to poke through the bottom of the crust. Make sure the filling is relatively flat (image 4).. \nRoll out the 2nd dough in the same way as you did the first. There are multiple options of how to top a pie. You can cover the entire thing, make a basket weave, corn rows, etc. I went with a basket weave.\nUse a pizza cutter or a knife to cut the dough into even-thickness strips (image 1).\nStart laying down strips in one of the corners. I chose top left because I'm right handed. This means that I pressed the left side of horizontal strips into the pie crust and pressed the top end of vertical strips into the pie crust. 
The right ends and bottom ends were not stuck down because I needed to lift them to weave an over-under pattern (image 2).\nKeep adding strips, alternating between columns and rows, weaving over-under until you have covered the top of the pie. Use the leftover strips to circle the rim of the pie dish to hold the basket weave in place, and to look prettier (image 3).\nNote: you can \"recycle\" scrap pieces of dough that are too short, that tear, etc. Just ball them up, re-roll, and re-cut.. \nAt this point, your oven should be at the correct temperature, 400 degrees F.\nSprinkle a little sugar on top of the pie to give it a nice, sweet crust. If you like your pies to be really tart, you can squeeze another 1/2 of a lemon on top of the entire thing. Try not to leave pulp on top of the dough because it tends to blacken the basket weave (like I did here).\nPut the pie in the oven, middle rack, for approximately 45 minutes or until the crust is brown and the filling settles. You can check by sticking a skewer into the middle of the pie - if it comes out clean, the pie is all set.\nTake the pie out when it's done and let it cool for 20-30 minutes at least. You can sprinkle more sugar on the top if you want. Serve it warm or cool, with or without ice cream, however you want.\nEnjoy! 
(image 1).\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_84_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_84_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_84_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_84_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. you will need oil - 2 table spoon cumin seeds - 2 tea spoons chopped garlic clove - 2 to 3 chopped green chilly - to taste ( i took 2 small once) chopped onion - a big one chopped tomato - about 2 plus 1 for garnishing boiled chopped potato - 1 or 2 salt - to taste garam masala - 2 tea spoon. heat oil in the pan. add the cumin seeds to it and stir for half a minute. then add chopped garlic and green chilly and stir for a minute. add the chopped onion and let it cook till it softens. keep stirring in between. you can see the color change from images. once done add tomatoes, chopped boiled potato, salt, garam masala powder and stir well. add some water and put the lid on and cook for 3 to 5 minutes on medium heat till everything is cooked well.take the pan off heat and let it cool.. once the mixture cools, grind it on blender. and the puree is ready.. poach the eggs. there are several instructables available on how to poach the egg and i am not going to repeat. in fact i learnt to poach egg from those instructables! . i served mine with fresh bread. and garnished with tomato. let your creativity fly. 
do something new....\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_85_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_85_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_85_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_85_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. You will need:For the buns7g dried yeast240ml whole milk60g caster sugar380g plain flour30g cocoa powder1 tsp cinnamon60g chilled butter (cubed)1 egg (beaten)100g chocolate chipsFor the glaze2 Tbsp of marmaladeFor the chocolate cross100g dark chocolate (70% cocoa)100ml double creamTo bakebake at 180 degree celsius for 10 minsthen, reduce temperature to 160 degrees celsius for 15 mins. Gently warm the milk so it is about body temperature, put your finger in the milk and when it feels just warm it is ready (be careful not to heat the milk too much and burn yourself). It will only take a short time to come up to temperature so keep an eye on it.Add the dried yeast and one Tbsp of the sugar and stir, the sugar and the warm milk will start to activate the yeast, you will see the mixture start to bubble. If it doesn't then it is possible the milk was too hot and you have killed the yeast, in which case you will need to start again.. Pass all the flour, cocoa powder and the cinnamon powder (optional) through a sieve, incorporate the dry ingredients and to this add chilled cubed butter. 
With your finger tips crumble the butter and dry ingredients together until the mixture resembles bread crumbs. You can use a food processor but using your hands is much more rewarding :). Lightly beat the egg and put to one side, make a well in the middle of the dry ingredients. Add the yeast mixture, egg and sugar to the well.. With your hands bring the wet and dry ingredients together, it will be sticky to start with but knead a little in the bowl. Then, lightly flour the surface with some plain flour and turn the formed dough out, knead the dough for around 10 minutes until it is elastic when you pinch it.. To prove the dough find a warm place and put the dough in the bowl, under the tap wet a cloth and squeeze the excess water out of it so it is damp. Cover the bowl with the cloth and leave the dough to rise for about 1 hour or until the dough has doubled in size.When the dough has proved, with your hand know it back and take out the bowl, returning to the work surface. flatten the dough.. When the dough has been flattened, pour the chocolate chips on top of the dough. You can use dark, white or milk chocolate chips for this.knead the dough for a further 3 - 5 minutes until the chocolate chips have been incorporated into the dough.. Roll the dough out into a sausage shape and cut it into 16 pieces, shape each of these pieces into a ball in your hands and place in a greased and lined baking tin. Cover the dough with the same damp cloth used earlier and leave to prove for 30 mins or until doubled in size again.. Pre heat the oven to 180 degrees celsius (fan) and bake the buns for 10 minutes then, reduce the temperature to 160 degrees celsius (fan) for a further 15 minutes. While baking continue with preparing the glaze and chocolate ganache.. Pour the cream into a cold pan and bring up to the boil, when the cream just starts bubbling, take off the heat and add the dark chocolate, stir in gently, but don't over stir and set to one side (5 mins).. 
In a cold pan add the marmalade and bring up to heat gently, melting the marmalade into a syrup. Stir the chocolate and leave to one side to cool slightly.Take the buns out of the oven and immediately brush on the melted marmalade to give a beautiful glossy sweet finish.. If you have a piping bag then you can use that but if you don't a food bag can work just as well.Take a food bag and put in a cup or glass to make it easier to fill. Spoon the ganache into the bag and twist the end pushing all the chocolate down to the bottom.Take some scissors and snip a small hole in the bottom of the food bag, pipe the chocolate ganache in single lines across the buns, then turn the tin and pipe in the perpendicular direction.Serve your hot cross buns with a generous amount of butter and some marmalade.If you like you can subscribe to the YouTube Channel and like the Facebook page.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_86_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_86_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_86_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_86_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. You will need water that is nonclorinated. I set some tap water out in a jug with the lid off afew days before to let the clorine evaporate. You can use bottled spring, drinking, or distilled water if you would like. It just cannot have clorine in it or it will kill the beneficial bacteria. 
Vegetables: \u2022Cabbages\u20221 head Bok choy\u20221 head Napa\u20222 Leeks\u20223-4Carrots \u20221 Daikon radishSpices:\u2022Garlic\u2022Red chili peppers\u2022Ginger root\u2022Himilayan sea salt\u2022Sriracha hot sauceTools:\u2022Knife\u2022Cutting board\u2022Large jars\u2022Large pot\u2022Plate (needs to fit inside pot)\u2022Plastic zipper bags\u2022Baking tray or large plate. Wash all veggies with cold water. Leeks tend to hide dirt near their base as well as the bokchoy so these need some more attention than some of the other veggies. I wash these as I cut because of this.Tips: \u2022 Don't cut peices too large for your mouth. I like cutting them large but keep in mind as you cut that the peices still need to fit comfortably in your face while you chew.\u2022Leeks: Cut the bottoms off near the roots. Cut any wilt from the tips. Cut in half lengthwise lining up with the base of the \"V\". Cut the white and yellow sections in large peices. Fan the cut sections kind of like a deck of cards as you throw them in the pot. This will separate the layers easily. I tried to get a picture of this but it is near impossible to do with one hand. Looking at the picture imagine the cutting board as your other hand. Dirt likes to hide just below the green. Set aside the outside green peices to be washed individualy as you cut up the stalk. You don't have to use the green parts but I try not to waste and don't like to set aside for additional recipes/work. The green parts take longer to soften so I cut these into thinner strips.\u2022Bokchoy: I prefer to break each leaf off and wash them. The base usually has some dirt that has collected between stalks. after I have done this I will cut into about 1 inch peices.\u2022Napa: Wash outside and remove wilt. Cut into quarters legthwise. Cut quarters into about 1 inch sections.\u2022Carrots: Scrub outside. I slice on an angle to make them longer than just having sliced coins. 
I have also shredded them with a potato peeler for very thin long peices. \u2022Daikon radish: Cut end off. Slice into coins. Any that seem too big can be cut in half as well. It may be easier for you to cut in half first but I personally I just cut stacks in half after slicing. Put everything into your large pot or other container. There are no specific veg measurements that I use when making this. But it does make enough that I usually have enough for a month or more.. This step starts the fermentation process off on the right foot.With all your veggies in a nonreactive container sprinkle 1Tbs salt (any salt as long as there is no iodine) per 2 cups (nonclorinated) water over your veggies. I don't add salt to many foods so my pallet doesnt require it for flavor. I add because of necessity for safe fermentation. I would recommend if you often use salt consider doubling the salt content. It is easier and best to add extra now than try to later. Using less salt will make it ferment faster as well which means spoilage if there is too little. I sprinkle one Tbs then use the 2 cups water to \"wash\" it in, then stir with my hand grabbing from the bottom and bringing to the top. Do this until the brine is about half the depth of the veggies. Compress and cover veggies with a plate, put some weight on it to push them into the brine (I use a gallon jug of water). As the salt pulls water from the veggies they will settle more and eventually submerge fully. Leave like this for 6-8 hours to let this happen. . This is where your pallet plays a large roll and will require some experimentation. For my recipe I use about 6 garlic peices, some dried red pepper, a piece of ginger about the size of my thumb and anywhere from 1/4 cup to 1/2 cup Sriracha.Remove garlic skins. Remove pepper stems. Put all spices into blender or food processor and blend until it has become a paste. . Taste. I don't rinse my Kimchi as long as it doesn't taste too salty for me. 
Leaving the plate in place hold the veggies in the bottom of the pot and pour brine through a strainer (no pic, used both hands). This strainer is just in place to catch whatever gets by. If yours is too salty after this step then add water (nonclorinated) to the veggies, stir and strain until you get the salt taste you like. You can save the brine if you would like for something else as it already has some culture started in it. I usually just strain into the sink. Put whatever your strainer caught back into the pot.. Mix spice paste into the veggies. use some to \"squeegee\" the spice paste out of the container blending container. Warning: If you have sensitive skin this could effect you. I would not consider my skin sensitive but my hand does feel warm after this part and touching eyes will burn. Wash hands thoroughly after.. Pack the kimchi into your jars. Push it down so that liquid covers as much of the veggies as possible. leave some space at the top of the jar. Put some plastic zipper bags on top filled with water (oops I ran out of bags). The bags are used to let gasses escape from the kimchi and keep nasties from getting in as well as keep it submerged. Place a baking tray or large plate under the jars. As they off gas the liquid may spill over. Using a clean utensil press the kimchi back down at least once a day to let trapped gasses out. (This is a great time to dish some to try its flavor development)Let it ferment at room temp for 4ish days before sealing with a lid and refrigerating. 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_87_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_87_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_87_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_87_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Gather the following ingredients:1 Cup White Vinegar1 Cup Water4 Avocados1 Tablespoon Red Pepper1 Tablespoon Rainbow Peppercorns2 Teaspoons Pickling SaltGather the following materials:Mason Jar for Storage*Note: You could also add other spices or flavors like lemon, juniper berries, or bay leaf to your pickle brine if you'd like. I chose to keep it simple for this first avocado pickling.. For this Instructable you'll want ripe avocados on the firm side so that they hold their consistency when you pickle them. I have found that you can identify a ripe avocado easily by simply checking under the stem. To do this, hold the avocado in your hand and place your thumb on the stem. Roll your thumb against the stem until it rolls out of the avocado. If the stem separates from the avocado easily and leaves a bright green patch of flesh underneath, you've found the perfect avocado. Conversely, if your stem does not roll away from your fruit, the avocado is not ready. If it rolls away and the avocado flesh that is exposed is brown, your avocado is over ripe, mushy and not ideal for pickling. . Let's start by making your brine. This way, your brine can cool while you prepare your avocados for pickling. 
Place 1 cup of vinegar with 1 cup of water in a nonreactive pot on your stove. Add 2 teaspoons of salt and bring to a boil, stirring to dissolve the salt. Once boiling, reduce to a simmer until all of the salt has dissolved. Remove your pot from the heat and allow to cool. . Using a sharp knife, cut your avocados in half lengthwise by rotating the knife around the avocado. Once halved, separate the avocado halves by placing a hand on each half and rotate, twisting your hands in opposite directions. . To remove the pit of the avocado, hold the avocado half in your non-dominant hand. With your dominant hand, carefully whack your knife into the pit until it sticks firmly. Keeping the knife firmly stuck into the pit, twist the knife until the pit rotates and comes loosely away from the fruit. The avocado pit should now be stuck to the blade of your knife. To safely remove the knife from the pit, place your fingers against the pit from the back of the blade. This way when you apply force to remove the pit from the blade you are working in a direction with the blade, not against it. Apply force to the back of the pit, until it releases from the knife blade. . Gently pry away the skin from your avocado fruit using your fingers. The skin will come off in pieces but that is okay, keeping the fruit whole is more important. You can also use a spoon along the inside of the avocado fruit to loosen it from the skin, but I have found that a little patience and peeling leads to a better end result. . Place your avocado halves, cut side down on your cutting board. Cut slivers lengthwise and then cut your avocado into cubes by slicing your slivers crosswise. You should finish with a bunch of avocado cubes. . Start your jarring process with a clean mason jar. If it is not clean and clear of impurities, boil your jar in water for a minimum of 10 minutes and allow to dry. With your clean mason jar, add your tablespoon of red pepper flakes as well as your tablespoon of peppercorns. 
Add your sliced/cubed avocado until it's about an inch below the jar opening. . By now, your brine should have cooled and it is safe to add to your avocados. Pour carefully and slowly, taking care to completely cover your avocados with the brine. This will start the pickling process. Place your lid on your jar and store in the refrigerator for at least 6 hours. The longer you store your avocados, the more pickle like they will become, but this is a quick pickle, and your avocados will be done after 6 hours. Continue to store your pickled avocados in the refrigerator when you aren't currently enjoying them. . You're done! Your pickles have a shelf life of roughly a week--thats the longest they've lasted with me. Enjoy your pickled avocados. I enjoy putting them on top of kale salads, over rice, or with cheese and crackers. \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_88_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_88_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_88_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_88_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Pan(s), bowls, utensils needed: You will need one 8.5 ( 8 1/2 ) inch spring form panone measuring cup of each of the following: 1 full cup 1/2 cup 1/3 cup 1/4 cupA food processor, and an electric mixer help tremendously. Along with various things as bowl spatulas, 2 mixing bowls, one double boiler and a sauce pan. 
Ingredients: ....For the torte itself1/2 cup blanched almonds, lightly toasted2 oz. unsweetened chocolate - dark2 tbl unsalted butter2 large eggs1 cup sugar1 tbl framboise or other raspberry brandy3/4 cup all purpose flour1 tsp. double-acting baking powder1/2 tsp salt1 cup raspberries plus additional for garnish and accompaniment....For the glaze1/3 cup raspberry jam (seedless is more convenient)1 tbl sugar....For the Ganache1/4 cup heavy cream6 oz. fine quality bittersweet chocolate / choppedI always advise that one reads the recipe through once, gather their stuff, including utensils, then read down through the recipe again to make sure nothing was missed. . Start by grinding the almonds in the food processor for about 5 minutes or until they are of the consistency of a nut butter, and set it aside. Using the double boiler to melt the chocolate & butter, stirring occasionally. Once melted, remove the bowl from the top pan. In the large bowl of an electric mixer beat the eggs until they are pale, adding the sugar gradually, and beat the mixture until it is very thick and pale. Beat in the chocolate mixture, the framboise, and the reserved almond butter & beat the mixture until it is combined well. Into the bowl sift together the flour, the baking powder, and the salt, beat the mixture until it is combined well, then fold in 1 cup of the raspberries gently. Turn the mixture into a well buttered 8 1/2 in spring form pan, spreading it evenly and smoothing the top, then bake the torte in the middle of a preheated 3500 F oven for 40-45 minutes or until the tester comes out clean.. Let the torte cool in the pan. . Make the Glaze: In a small heavy saucepan combine the jam and the sugar, bring the mixture to a boil, stirring it for 3 minutes. If you have the seeded kind, you will need to force the mixture through a fine sieve into a small bowl, pressing hard on the seeds. 
Invert the torte onto the rack, set over wax paper, remove the bottom of the pan, and spread the glaze on the top and sides of the torte. Let the torte stand at room temp. for 2 hours or chill it for 30 minutes, or until the glaze is set. The torte may be prepared up to this point , up to 1 day in advance and kept on the rack (cover with an inverted bowl). . Make the Ganauche: In a small heavy saucepan bring the cream to a boil and remove the pan from the heat. Stir in the chocolate, stirring until the the mixture is smooth, and let the ganauche cool for 3 minutes. Pour the ganauche over the torte, smoothing it with a spatula and letting the excess drip down the sides, and let the torte stand for 1 hour, or until the ganauche is set. Transfer the torte crefully to a serving plate, garnish it with some of the additional raspberries. If you are expecting this to be an overly sweet treat, you may be disappointed. But if you like Raspberries and chocolate, this a a great little torte ( I am not used to torting my own horn :-) . I have had my torte now since I made it, and although I have been told that it was good (by someone else in my household), they haven't really had much of it. So half of it is left ( no, I am not suggesting shipping a piece to the first 8 that reply :-), and it was placed on a plate and covered with a mixing bowl, and is refrigerated. Well, I am sure you know what is happening to it. It is drying out. \nSo, if this happens, especially to Chocolate cake of any kind, what I like to do is cut off a piece. Put it in a bowl (or a sturdy plate if you wish), and slather it with unsweetened applesauce. Umm um. The dryness doesn't seem to matter anymore and you get to enjoy the rest of your cake (and the applesauce is a healthy addition too). You win on all counts. 
\n\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_89_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_89_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_89_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_89_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. To make carbonated fruit you only need to gather a few things:FruitWhen making carbonated fruit it's best to use firm fruits, like oranges, apples and pears. I tried doing it with softer fruits like kiwis, strawberries and bananas and it just doesn't work as well. Apples in particular seem to work the best.Bottle or ContainerYou will need a plastic bottle or a container to put the fruit into. I have found that a wide mouth Nalgene works best. You can use an empty 2 liter soda bottle however, just be careful not to add in too much dry ice, more on that later. DO NOT use a glass jar. The bottle will be under pressure and broken plastic is safer than broken glass. If you have a vessel that is designed to take pressure, like a beer keg for example, than by all means try using that.Dry IceThe final thing you will need is a block of dry ice. You will only need a tiny tiny amount of dry ice to make the carbonated fruit, but its hard to buy less than a large block of the stuff. Now, chances are that you have never seen dry ice for sale. 
You can't make it on your own and you might not be able to find so easily.I used the Dry Ice Directory to find out where it was being sold locally - they have listings for all over the world. I live in the east bay of California. I was surprised that In all of Oakland there was only one distributor - the AM PM Gas Station on Market and Grand in West Oakland. They oddly enough had a ton of the stuff for sale, and they are open 24/7! I was very impressed that I could buy dry ice anytime I wanted even if it was only for sale at that one place.**Before you go to buy the dry ice please refer to thisDry Ice Safety Info website. I am not going to go through all of the safety precautions that should be taken in this instructable, so take a minute to familiarize yourself with its possible safety hazards.**. The first step is to cut up the fruit and put it into the bottle(s). Cut the fruit as if you were making fruit salad - no seeds or orange peels are wanted here. \nI cut smaller pieces to fit through the narrow neck of the soda bottle and bigger ones for the wide mouth of the nalgene. I highly recommend using a nalgene to make carbonated fruit.. The next step is to cut off a small chunk of dry ice from the block. You only need about 2 grams, or a piece about half the size of your thumb. There is no harm to putting in too little dry ice - you will simply end up with only slightly fizzy fruit. However, putting in too much dry ice IS dangerous and could make a really big mess. Dry ice is constantly sublimating (not melting) from its solid form of CO2 to CO2 gas. Unlike regular ice made from water, it goes directly from its solid phase to its gaseous phase - no liquid phase in between. That is why it sublimates, rather than melts.As a result, the dry ice block will produce gaseous CO2 until there is nothing left of the solid block. 
The bottles are going to be sealed tightly with their caps, so if too much CO2 gas is built up inside of the bottle they might explode (the soda bottle bursts at around 115 psi). We are looking for only a little bit of pressure (30 psi) and so there is no need to add in a big hunk of dry ice.The dry ice in the picture below was enough for both of my bottles of fruit, so each one got about half of the small chips you see below.. As soon as I put the dry ice into the bottles and sealed the top I could see it turning into its gaseous phase. Most of the dry ice will sublimate in an hour, so thats all the time it will take for the bottles to become fully pressurized. Waiting overnight is a good idea to let the CO2 gas work its way into the fruit.\nI put the bottles into an empty drawer and closed it for the first hour - I have to be honest, it was the first time I was doing this and I didn't know what would happen. After an hour I could see that the bottles were under pressure, but not in any danger of exploding, and so I transfered them to the refrigerator for the night.\nYou can only carbonate things that have water in them. I thought about doing fizzy meat, but I don't think there is enough water in it to dissolve the CO2 into.\nI went to bed and brought the bottles with me to Instructables HQ the next morning.. Once the bottles have sat overnight you are ready to open, eat and burp.\nBleed the pressure from the bottle buy opening the cap like you would open a soda bottle that had been shaken. \nI cut the top of the plastic soda bottle off with a sharp knife and poured it out into a bowl. You can simply pour the fruit out of the nalgene bottle through the wide mouth of the bottle. \nNow that the fruit is out of the bottles it's ready to eat! It loses its fizzyness pretty quickly, so make sure you chow down in the first 15 minutes after opening the bottles. \nCarbonated fruit tastes like regular fruit, but it tingles on your tongue. 
It's a totally unique experience to eat, and makes you burp a whole lot if you have done it right.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_90_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_90_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_90_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_90_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. First make a hole in the side of the marshmallow by twisting\u00a0a pretzel into it.\u00a0 This way you will have an easier time sticking your sword in it later on.\u00a0\nDip your fork or spoon into the red food coloring and run it down the marshmallow. This will give the red blood vessel look to the \"eye.\" . Get your almond bark ready by melting it in the microwave.\u00a0Heat the almond bark in intervals so that you do not burn it.\u00a0Also, pull out a piece of wax paper\u00a0to put your pretzels on for drying. Once your almond bark is ready, dip the pretzels in\u00a0it.\u00a0 Because of the small amount that I melted, I found it easier to cover the pretzels by using my finger to spread the almond bark on the pretzel.\u00a0Place the pretzel on the wax paper and let it dry.\u00a0 Dip your pretzels one more time if they look bare. Place it on wax paper and let it dry again.\u00a0. To make the sword handles, break apart pretzels into smaller pieces. 
It may be easier to break a piece of the pretzel off and shave it down.\u00a0Then attach the smaller pretzels to the sword by using the almond bark as glue. Also, cover the handles with almond bark and let it dry.\nTo get the brown handle, take a block or two of your almond bark and melt it (you may have some left though). Then add in equal amounts \u00a0of red, blue, and yellow food color till you are happy with the color.\u00a0 Because I used liquid food color the almond bark turned to a gritty texture. Then apply your colored almond bark to the sword for the handle\nNow your sword is done!. Place the Sword in the fake eyeball where you made the hole earlier.\u00a0 Then add red food coloring to the sword where it sticks in the marshmallow.\u00a0\nTo make the iris and pupil, take an m&m and place a little bit of chocolate on it .\u00a0 Then dip the chocolate on the m&m in the food coloring.\u00a0 Attach the iris to the marshmallow using the almond bark as glue.\u00a0\nEnjoy and Be Careful :)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_91_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_91_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_91_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_91_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 1/4 Cup Extra Light Olive Oil 4 Cloves Garlic 2 or 3 slices raw Red Chili Pepper with seeds 3 Anchovy Fillets 1 Can (14.5 Fluid oz.) Petite Cut Diced No Salt Added Diced Tomatoes 1 Can (2.25 Fluid oz.) 
sliced Black Olives 1/2 Cup good Dry White Wine (I used Barefoot's California Riesling) 1 TBS dried crushed Oregano 1/2 teaspoon Coarse Ground Black Pepper A sprig of Fresh Basil ,( plus 1/4 cup shredded Basil Leaves for Garnish) 1/2 lb. Fresh Raw Medium or Large Shrimp, peeled and deveined 1 Can (6.5 fluid oz.) Whole Shucked Cherrystone Clams (I used Trader Joe's' Maine Whole Cherrystone Clams) 1/2 oz. Shredded Sharp Provolone Cheese (BelGiosio's Sharp Provolone is excellent) 1/2 package (8 oz.) Fettuccine (or pasta of your choice). *Smash, peel, and finely chop the Garlic Slice the Red Chili Pepper Rinse and Drain the Canned Clams and set aside Drain the Black Olives and set aside Have your spices and the Anchovy Fillets ready to use when needed.*NOTE: When preparing garlic, I smash the cloves with the flat side of a big knife or cleaver because this makes it easy to remove the skin (peel). then I slice off that little brown piece (the stem end) from the garlic clove and discard it (it can add some bitterness to the garlic). Then I finely chop or dice the clove.. *Add 3 quarts of water and a teaspoon of salt to a large pot and begin to bring it to a boil preparatory to cooking the Pasta.Heat the 1/4 cup of Oil in a heavy pan over medium heat.When the oil is hot, add the garlic, red chili pepper, and anchovy fillets. 
Simmer, while mashing the Anchovy fillets with the back of a wooden spoon, just until the anchovies melt into the oil (about 2 minutes; you don't want to burn the garlic).Add the tomatoes, black olives, wine, Basil Leaves, black pepper and oregano; bring mixture to a boil, reduce heat to low or medium low (depends on your stove; you want the sauce to \"bubble\" a little as it cooks), cover and simmer for 15 minutes.Add the shrimp; re-cover pan and continue to simmer for 4 or 5 minutes more, or just until the shrimp begin to turn pink.Add the clams; turn off the burner but leave the pot on the stove for a couple of minutes to allow the clams to heat through; stir in 1 ladle of pasta water (about 3 fluid oz.), taste sauce and add salt or any seasonings needed, and remove pot from burner.*NOTE: It will take approximately 10-12 minutes to cook the pasta AL DENTE (follow the instructions on the package). Try to time it so that the pasta water is boiling and ready to receive the Fettuccine when you have about 15 minutes before the sauce is ready to serve. Plate the pasta, spread some shredded cheese over the it, top with a plenty of the Shrimp & Clam sauce, and scatter some chopped fresh Basil Leaves over it. Since I had some of that good Riesling Wine left in the bottle, I thought I might as well enjoy a glass of it with my dinner!Mangiare!. Nutrition for this recipe was calculated by me using the MyFitnessPal Recipe Analyzer. I serving of the Shrimp & Clam Sauce has an estimated 393 calories, but only 13 carbohydrates, and it is full of protein (an estimated 21 grams per serving).However, the Fettuccine (according to Ronzoni) will add about 200 calories and 42 grams of carbs to the dish.per 2 oz. serving. 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_92_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_92_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_92_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_92_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Grind your coffee. There are several options here and really this is the only time I'm going to recommend a coffee-making-oriented solution. This solution is GET A COFFEE GRINDER. I bought a bur grinder, one with a hopper at the top (the glass funnel thing). If you have one, great, if you don't, get on Gumtree, Craigslist, eBay, whatever. I got a Delonghi one for $20 and it beats the pants off any bullet grinder. Bullet grinders are difficult to use, burn the coffee beans and are really inconsistent. You can use a bullet/blade/blender grinder, but be warned, it won't be an enjoyable experience.If you don't want to grind coffee, get your coffee provider to do it or buy it from the supermarket pre-ground. I'll give you an interesting statistic: a coffee barista told me that coffee grounds have an expected life of 15 minutes. Pre-ground from the supermarket? You're buying stuff that was dead and buried a long time ago. It'll do if you want one step up from instant coffee, but I'm thinking if you're reading this, you want something a bit more fancy.. Grind heaps of coffee or use heaps of pre-ground stuff. If you have a 1.5 Litre (imperial units be damned) jar, put 100 grams of coffee in. 
Honestly I never really measure it, but that feels about right. You could put half or double in, it would just make it stronger or weaker, but there's a point of saturation so don't overdo it or you're just wasting coffee beans.Pour room-temperature water into the jar and fill it up nearly to the top. Mix it up with a spatula or a spoon.You could also use a whisk if you want to be pointless and inefficient.. Put the jar in the corner and feel really cool about yourself. Leave it there a couple of days. Ambient temperature might play a factor so don't leave it in direct sunlight. Here's a guide - if you feel hot in the same room, it's too hot.Once a couple of days has gone by, put the jar in the fridge. My fridge door started to crack from the size of this jar so I now put this jar in the vegetable tray at the bottom. If you put it on it's side make sure it doesn't leak - coffee tastes great but looks horrible as a brown stain on your fridge shelf!. Take the jar out of the fridge once it is chilled. I leave the method to you but I prefer to use my hands.Get the things you need to filter the coffee.The filter can be:- a paper filter for a percolator- a filter for a drip-filter- or what I use, a generic paper towel. The thinner the better!Get a funnel. I have an Aeropress which comes with a weird hexagonal funnel. It fits the need perfectly.. Put the funnel into the top of the second jar. Put the paper towel into the top of the funnel and tuck the edges of the 'filter' inside the funnel. If you don't, the coffee will drip outside the jar. If you use a proper filter you should be fine to just sit it in the top.Pour the coffee into the funnel slowly, taking care you don't mix up the coffee grounds - it's best if the coffee grounds stay at the bottom of the jar. It will filter through slowly so don't overfill it.When it gets low in the funnel, top it up. The filtering should get slower and slower. Once it is too slow, take the paper towel out and wash it. 
Yes, you can reuse a paper towel! Put it back in, or if you're lazy just throw the paper towel out and put a new one in.. Keep going until you're finished, and then take the paper towel out and gently wring it out; carefully though, it could easily burst open and put grounds into your beautifully filtered brew. You should get another half a drink's worth if you squeeze it out well.. See this picture. It glows golden brown.That's because I shone a torch behind it. Well it looked cool.Drink it up good. It will taste light and almost tea-like, but it is very strong in caffeine.That's it!By the time I have filtered one lot I wash the first jar out and start filtering again. I never really run out that way. \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_93_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_93_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_93_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_93_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. What you need for this is nothing out of the ordinary. Simple and quite frankly, fun! Most of this is simply to eye-ball what you need so don't be afraid of being precise with the amounts of the ingredients.you will need... A slow cookeramaz-n smoker ( to smoke the cheese. although we just modified and made our own. 
We will show you how we did it) smoked wood pellets (any flavor, but I recommend a hickory)------Recipe----------------------------------------------------------------------------pulled pork1 -2lb pork butt/shoulder1/2 cup of chopped onion 1 tablespoon smoked paprika4-5 chopped garlic clovessalt and pepper to taste12 ounces of a dark beer (or your favorite) for me I choose Sam Adams Winter Lagerhalf a jar of your favorite BBQ sauce plus additional for drizzling grilled cheese sandwiches1 tablespoon unsalted butter1 tablespoon flour1/3 cup of your favorite beer 1/4 cup milk3-4 ounces freshly grated smoked white cheddar cheese 4 slices slices white bread softened butter for spreading. Cheese is one of my most favorite things to eat! Even being lactose intolerant, I find myself gorging on delicious pizza or anything to satisfy my cheese cravings. It's even scientifically proven that cheese can be addicting!! Curse you cheese!For this recipe, we are using Vermont White cheddar from Boars Head.A great tasting cheese can always be elevated by something called cold smoking. It's called cold smoking because you do not use fire or heat to cook the cheese, you just use smoke to infuse it with flavor over a few hours.To start, all you need is a grill, some smoking wood pellets and something called amaz-n- smoker. However for us, we simply bought some thin grill grates and bent them to match one to save some cash. You can see how we bent the grate in the pictures provided. The grill plates cost $4.95 and it came with a pack of 3. So no brainer there, we made our own amazin-ly cheap smoker. When ready to bend them, essentially you are making a square arch to put the pellets in. Worked great for us and we saved some $. Always a win.Light the smoked wood pellets, place the cheese on the other side of the grill (away from the smoldering pellets) and simply close the grill and let it smoke for 2 hours. Keep an eye on it and make sure the smoke is still rising from the vents.. 
Like your grill is having a secret party without you... After 2 hours wrap it up with plastic wrap and store it in the fridge for up to 2 weeks. The longer the cheese rests, the better the smokey flavor. So you can smoke the cheese right before you decide to cook the pork or you can do it days ahead (recommended). In case you were wondering, No, it's not hamster food. It's smoking pellets!!~~a fun tip~~~ Did you know, cheddar is a cheese you can eat if you are lactose intolerant? Aged cheese in general most people with lactose intolerance can tolerate as most of the lactose is drained with the whey when making it. However, I wouldn't recommend gorging on them as the small traces of lactose can add up quick and cause havoc on your tummies. Trust me I know.... After you have smoked your cheddar and it is safely wrapped in its cool home in the fridge, you can now cook the pork. Sear the pork on the stove to get a nice flavor. Do this about 2-3 minutes per side.( this step is optional but adds more flavor!!) chop up your garlic and onions and place them in the slow cooker.Then place the pork on top of the chopped onions and garlic. Add the smoked paprika, dark beer, salt/pepper and BBQ sauce and then set the timer. The cook time will vary depending on your size of pork roast. It will be around 5-6 hours per pound on the low setting. I've seen recipes take 8 hours though. Low is slow is the way to go!! I'd save some BBQ sauce to use as extra toppings for the grilled cheese sandwich. Flip the pork every couple hours or so while cooking. Half way through the cook time make sure to shred the pork. . What you are making here is a sort of cheese gravy for your grilled cheese. It'll be the cheese that holds your grilled cheese sandwich together and It's gonna be super delicious!! Your cheese cravings WILL be answered! What makes this grilled cheese sandwich extra good is that the cheese has been infused with a delicious smokey flavor and then sauteed with beer.. 
What more could you ask! (more cheese maybe??) yeah ok I'll give ya that one. To make the smoked beer cheese sauceHeat a small saute' pan on medium heat and add 1 tbs of unsalted butter. When the butter starts bubbling, add about a tablespoon of flour and whisk it in for a minute or 2. Time to pop another bottle o' beer. I ain't complaining... This time I am using a lighter beer as to not cover the flavor of the cheddar (although I'm sure any type beer will suffice) Add about 1/3 cup of the beer to the flour mixture continuing to whisk. Then, whisk a quarter cup of milk in. reduce heatAdd the grated smoked cheese in and continue whisking until cheese comes to a smooth consistency. Stir every few minutes while continuing to cook. tip: Remember that these measurements depend on how much you are using. If your cheese mixture is too clumpy, add a bit more milk, if too runny, turn up the heat and add a bit more flour. Now time to assemble your sandwiches. You're getting close to magic!. This is your basic, MAKE A GRILLED CHEESE sandwich part... but with more savory ingredients! The makings should be pretty straight forward, but in case you don't know...Heat another saute' pan on medium heat. Butter the outside of 4 pieces of the white bread (or more, depending on how much you cooked). I bought my own loaf of white bread to make extra big slices. Place each slice butter-side down on the skillet, then top with a few handfuls of pulled pork. Cover in the warm smoked beer-cheese sauce, then top with the other slices or bread, butter-side up. Turn the sandwich over in about 2 minutes per side. Add any more BBQ sauce you may want at any time assembling. Remember that the cheese is already melted and pork is already done, so it won't take as long to cook. Cook until each side is deeply golden and crispy and your desired effect of cheese heaven.. We found we poured more beer than was required into the cheese and it tasted amazing! 
This cheese recipe can also go very well with a nachos or a dip in a football party perhaps. I truly hope you enjoy your sandwiches. Let me know if you make it!! May the god of cheese be with you!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_94_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_94_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_94_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_94_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Tip #1 Start early for slow cooker recipes. Take your pork shoulder and completely coat with dry rub spices.\u00a0 If you would like a sweet taste add more brown sugar, if you prefer savory add less.\u00a0 After coating pork shoulder (5 lbs) place into crock pot with 2 cups of water on high for 6-8 hours.\u00a0 When ready remove from crock pot and using forks begin to tear pork away from bone.. Tip #2 Prepare your salsas before the meal and refrigerate for at least 30 minutes to get optimal flavors! Now time to make your salsas\u2026we have corn and tomato salsas that we are going to prepare.\u00a0 Open your can of corn and drain out the liquid.\u00a0 Place in small bowl and mix in chopped basil and\u00a0 diced red onions.\u00a0 Set aside.\u00a0 To make your tomato salsa take the diced tomatoes, mix in red onions, cilantro, and cumin\u2026that\u2019s all there is to it. Set it aside.. Now time to prep the vegetables for meat.\u00a0 Dice up red onions, green peppers, jalape\u00f1os, and garlic.. 
Tip #3\u00a0 Let garlic simmer in hot oil to get the most flavor! Heat olive oil in a deep skillet on medium heat. Once hot add garlic and let it it simmer until golden brown. Once lightly browned add in the rest of the vegetables, stir, and let them cook through for about 5 minutes. Add in about 2 pounds of the pulled pork, about 1 1/5 cups of water, and the spices listed above (use the rest for sandwiches, more tacos). When adding spices taste and add more as you need them.\u00a0 I found that I use a lot of cumin and paprika in comparison to the other spices.\u00a0 Also if you have a lime handy squeeze a bit in instead of using salt.\u00a0 Add in some chopped cilantro, stir, cover and let it simmer for about 30 minutes.. After 30 minutes taste the pork and adjust flavor if necessary.\u00a0 Now if pork mixture is to thin drain about 1/3 cup of the fluid into a small bowl, and mix in about 2 tablespoons of flour until smooth, and return back into the pork and stir.\u00a0 Repeat this until it is a desired thickness.. Tip #4\u00a0 Always pan fry your soft tortillas, it really makes a difference. Place a skillet on high heat and lightly grease with cooking spray.\u00a0 Add on your tortillas and flip until golden brown and bubbling on each side.. Once soft tortillas are browned you can begin to assemble!\u00a0 Place tortillas flat and add pork, followed by cheese, both salsas, and light sour cream. Now fold em\u2019 up and begin to feast!. Suggestions \u2022 Serve with a side of tortilla chips and even add them into your taco for a bit of crunch. \u2022 If you like your tacos extra spicy add on Sriracha or your favorite hot sauce. \u2022 Serve this pulled pork recipe in any sort of style; nachos, burritos, or maybe a tostada. nom nom nom. 
For this recipe and more please visit my food blog at everythingbutfish.tumblr.com\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_95_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_95_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_95_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_95_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. - 250g caster sugar- 70mL water- 1/2-1tsp salt- 200g cold cubed butter- 120mL thickened cream (36% fat content this is important for a nice thick caramel)- 1tsp vanilla bean paste. - 1 Small saucepan- Mixing spoons- Candy thermometer- Whisk- Glass container for storage. Place the caster sugar and water in a small saucepan and cook the syrup on a medium high heat until the syrup turns a light yellow colour. (This should take 5-8 minutes).Now carefully watch the syrup until it turns a golden amber colour because this will change very suddenly and if not watched it may burn (be careful!). Once the syrup is an amber colour remove from the heat and add the thickened cream and stir in with a whisk. Be very careful when whisking this as there will be a lot of steam coming out of the pot. When the cream is mixed in place the pot back onto a high heat and allow the caramel to come up to 108 degrees Celcius while stirring (use a candy thermometer to read the temperature). Then remove the pot from the heat and allow it to cool until the caramel stops bubbling. 
Then add the vanilla bean paste and salt (the amount of salt depends on how salty you like your salted caramel).When the caramel has thickened (stir for about 1 minute) then add the cold cubed butter and continue to whisk the caramel together. (If the mixture splits then you have added the butter when the caramel is too hot.). Serve the caramel while hot over ice cream or cookies. *Filling for baking/ macarons* use the caramel when cold out of the fridge.Place the caramel in a clean glass jar or container and store in the fridge for up to a week. If you need it at a runny consistency just heat in the micrwave until you have the desired consistency.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_96_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_96_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_96_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_96_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Plastic Champagne Glasses1 bag of M&Ms Valentines Colors1 bag of Chocolate HeartsHeart Shaped SuckersScissorsAdditional Items You may want to use:Some ribbon, stickers or any other item you may want to decorate your cupsGift Tags (Optional)Cling Wrap, clear or colored. Heat Sealer\nHeat Gun or Hand Held Hairdryer\nShrink Wrap Bags\n*Note: you can purchase shrink wrap bags that work with a hair dryer and not need a Heat Sealer or Gun.\n. Fill cups with M. Set a few Candy Hearts on top.. Cover Cup with desired wrap.\n*Go to Step 8 for shrink wrapping.. 
Poke a hole in Cling wrap and push suckers through. \nAdd a gift tag, ribbons, bows, etc.. Wah Lah!\nYour gift is finished and ready to go for a fraction of what you would pay to buy a similar item already made from any store. About $1.20 each.\n. Prepare your bag to fit over your gift. \nMy Bags were 6 X 6 inches, I folded over one side at the 41/4 inch mark.and creased the bag.. Lay your bag across the sealer and seal on the fold. Move the bag over slightly and seal again next to the first seal so your bag won\u0019t break open when shrinking.. Fold the bag in half and cut a notch in the top large enough to allow the suckers to pass through.. Place the bag over your cup and align the seam so it isn't running down the center of your cup.. Insert suckers through the hole. \nUse a Heat Gun or blow dryer to shrink the wrap to fit your cup. Because you are working with Chocolate this is a delicate operation. Make short, swift passes pausing between each one to keep your cup from becoming hot and melting your Chocolate, rotate the cup from the bottom as you work. \nAdd your final decorations.. Here is another variation of your project with curled ribbons.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_97_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_97_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_97_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_97_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 
Ingredients:\nHot Dogs - preferably grilled (for flavor), but since hot dogs are pre-cooked, you could use them straight from the package. I used about 2\u00bdPizza DoughCheddar Cheese (about 1 cup)Corn Meal and/or pan sprayChili (maybe around 1 cup)Onions (not pictured)Any other chili-dog-type toppings you'd like to add!Supplies:Pizza Pan (or you could use a pizza stone)Pizza Cutter\n\t\tCutting Board\n\t\tKnife\n\t\tGrill (optional)\n\t\tGrilling Utensils (optional)\n\t\tSpoon for spreading chili (not pictured). \n\t\tFirst, spray the pan with pan spray and sprinkle on some corn meal.\n\t\tThen stretch out the dough in your hands (I just do it in the air - let it hang until it's wide enough for the pan). I wish I knew how to spin it around ;)\n\t\tPut the dough in the pan and spread it to the edges.Preheat the oven to 450 degrees Fahrenheit. . Grill the hot dogs (if you want to). Hot dogs are fully cooked, so you don't need to grill them if you want to save a step. I grilled them to add some flavor.\nSlice the hot dogs into 1/4\" - 1/2\" thick pieces. I wound up using about 2.5 hot dogs. That was about perfect for 1 piece in each 1-2 bites of pizza. You could do more or less to taste.\nHeat up the chili if you want. Mine was frozen so I had to heat it. If it's fully cooked and not too cold, you could probably skip heating it. If it was in the fridge, I'd probably heat it a little.. \n\t\tMake sure the dough is still stretched out to the edges. Poke holes in the dough with your nails. (this prevents crazy bubbles but still allows for some little ones)\n\t\tSpread the chili out on the pizza. I tried to get some meat in each bite. I wound up using somewhere around 1 cup, maybe a little more.\n\t\tSprinkle on the cheese. I always do about 1 cup when I make a pizza.\n\t\tSprinkle on onions and any other extra toppings you're adding.\n\t\tEvenly space out the hot dog slices on the pizza. 
Mine have about 1 slice worth of space between them (or that's at least what I was going for). This turned out pretty good but you can tweak it to your taste.. The oven should be preheated to 450 now. Put the pizza in.\nBake the pizza for about 15-20 minutes. Normally when I make pizza, it takes about 20 minutes but it was faster this time. I think it was because the hot dogs and chili were warm to begin with. Turn the pizza 1/2 way through cooking to ensure even baking.\nOnce the crust is golden brown and the cheese is nice and melty, take it out of the oven and cut into slices. The hot dogs started to brown/burn on the top, but they were tasty.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_98_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_98_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_98_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_98_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 
You'll need:FOR THE CUPCAKES1 cup all-purpose flour1/2 teaspoon baking soda1/4 teaspoon salt1/2 cup boiling water1/3 cup cocoa powder1/3 cup semisweet chocolate chips1 tablespoon instant espresso3/4 cup sugar1/2 cup sour cream1/2 cup vegetable oil2 large eggs1 teaspoon vanilla extractFOR THE FILLING:3 tablespoons water3/4 teaspoon unflavored gelatin4 tablespoon (1/2 stick) unsalted butter, softenedPinch salt1 teaspoon vanilla extract1 1/4 cups marshmallow cr\u00e8me (don't substitute marshmallow sauce)FOR THE GLAZE:1/2 cup semisweet chocolate chips3 tablespoons unsalted butter. Adjust oven rack to middle position and heat oven to 325 degrees F. Grease and flour 12-cup muffin tin. Combine flour, baking soda, and salt in bowl. Whisk water, cocoa, chocolate chips, and espresso in large bowl until smooth. Add sugar, sour cream, oil, eggs, and vanilla and mix until combined. Whisk in flour mixture until incorporated. Divide batter evenly among muffin cups. Bake until toothpick inserted into cupcake comes out with few dry crumbs attached, 18 to 22 minutes. Cool cupcakes in tin 10 minutes, then turn out onto wire rack and cool completely.. While the cupcakes bake, combine water and gelatin in large bowl and let sit until gelatin softens, about 5 minutes. Microwave until mixture is bubbling around edges and gelatin dissolves, about 30 seconds. Stir in butter, vanilla, and salt until combined. Let mixture cool until just warm to touch, about 5 minutes, then whisk in marshmallow creme until smooth (takes a bit of effort, just keep whisking); refrigerate until set, about 30 minutes. It will seem thin, but will set up quite thick. Transfer 1/3 cup marshmallow mixture to pastry bag fitted with small plain tip (I don't have small pastry bags, so I used a snack size Ziploc); reserve remaining mixture for filling cupcakes.. Microwave chocolate and butter in small bowl, stirring occasionally, until smooth, about 30 seconds. 
Cool glaze to room temperature, about 10 minutes.Cut cones from the cupcakes by inserting the tip of a paring knife at a 45-degree angle about 1/4 inch from the edge of the cupcake. Cut out and remove the cake cone. Since each cupcake cone is a little different, I placed the cones on a piece of wax paper in the same spot their cupcake was on the rack, so I could keep each cone with it's original cupcake. For example, if I cut the cone from the top left cupcake, I placed the cone on the top left corner of wax paper. Cut off all but the top 1/4 inch of the cone, leaving a circular disc of cake. Discard\u2014better yet, eat!\u2014the bottom of the cone.Using a spoon, fill each cupcake with one tablespoon marshmallow mixture and then top with the reserved, trimmed, cake cone. If the tops of your cupcakes aren't uniform, or perfectly smooth, don't fret, the glaze does a great job of coating the tops. . Frost each cupcake with 2 teaspoons cooled glaze, spreading it over the top as best as you can, and let sit 10 minutes. Using pastry bag with reserved filling, pipe curlicues across glazed cupcakes. (As you can tell, I tried to get a bit fancy with the writing, I did my best to pipe my mom's initials because I made these for her birthday.) Serve. 
Cupcakes can be stored in airtight container at room temperature for 2 days.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_99_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_99_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_99_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_99_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Pitted datesUnsweetened coconutVitamix or other strong blender/food processorWax paper. Add 1-2 cups of shredded coconut to your blender to make it finer. The smaller the pieces the better it will stick. Remove from blender and set aside. . Add all of your dates to your blender and slowly increase the speed, you want a pasty result. 10-15 seconds should be enough. Scoop the date puree and move to a small bowl. . Use a spoon or your hands to make small balls. Roll the balls in the coconut and then set aside on wax paper. . Freeze your date balls for a few hours to harden, then they can either be left in the freezer or moved to the fridge. Should stay good in the fridge for a couple of weeks, longer in the freezer. 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_100_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_100_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_100_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_100_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. The materials you will need will depend on how many you want to make. Each one requires:\n3\" x 7\" piece of cardboard (preferably decorated)\n2.5\" x 2.5\" card for the front (you can make this or buy one, I'll describe the former)\n12\" ribbon\nDouble-sided tape or glue\nHolepunch (anything that will make a hole in cardboard will do)\nYou can make three of the cardboard pieces from one standard (8.5\" x 11\") piece of cardstock.\n. Once you have you 3\" x 7\" peice(s) of cardboard you should fold it in half and cut a half-circle out of the non-folded end. Just below the half-circle cut punch a hole with your holepunch. I used a square holepunch for that extra touch of fancy.\nPlace the double-sided tape along the edges (see picture below). Press firmly.\n. Cut two pieces of paper about 2.5\" x 2.5\". Make the outer paper (the nicer one) a bit smaller than the backing. Place two hole punches in the top corner. Then insert some ribbon around these two holes and tie a bow. The space inbetween the cards is where you write your note (sonnet, love poem, gum related joke, etc.).\nUse the double-sided tape or glue to stick this to one side of the larger cardboard piece we made in step 2.\n. 
Cut a piece of ribbon to about 12\". Thread the ribbon through your punchhole. Make sure that the ribbon is even on both sides of the container. Now take your gum and use it to push down the ribbon all the way to the bottom. Tie the ribbon in a knot(or bow) about an inch from the top of the container. Cut off any excess ribbon above the knot.\nWhen you pull on the ribbon the gum will move up the cardboard container and out of the package. To finish this make sure there are no ugly lines or cuts, exposed tape or poorly shortened ribbon.\nThese take almost no time to make and add that special 'hand-made' touch to your Valentine's Day. Thanks for reading.\n\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_101_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_101_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_101_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_101_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. A couple of months back I was given 50lbs of ground beef, yes FIFTY pounds! Its definately not premium meat something along the lines of 60/40 but it was free so... I broke it up into approx 2lb packs and froze it, thats where I started and while I was digging out the ground meat I came upon some hot italian sausage also in the freezer, popped them in a bowl of water to thaw, a couple of almost past use green peppers and a red onion, a couple eggs, some oatmeal and bread, various spices, sauces and milk. 
\u00a0There's a cooking term fro what I'm making in this step but I can't recall what it is, it lookis like a glass full of...welll... yea.. but I'll call it yuck, in a glass, bowl or other container put about 1 and a half cups of oatmeal, rip up a piece of bread and add about a half cup of milk and 2 eggs, mix it up and allow to soak while you do the next steps.. sorry about the 3rd pic, I guess my camera thought it looked like yuck as well. dice up your peppers and onions, add in some garlic , I had some left over spinach, you can add carrots, celery or any veggies you like. I often use a package of spicey breakfast sausage , this time I have hot italian sausage links, slice the links and peel the meat from its casing, get it in the bowl with the veggies, then add your ground meat, your oatmeal and eggs mix, any seasonings your going to use and mix completely. Ok remember when I said this wasn't the best ground beef? Well we need a way to get the grease away while cooking AND not make a mess of my oven, luckily I have a perforated baking tray (YES its clean! or as clean as it gets without a wirewheel) I covered the tray with tinfoil and using my repurposed awl/icepick (icepicks run around $5, the awl was .75) poked some holes .\nAlso remember this is about a meatloaf for SANDWICHS not your average small loaf for dinner that takes 2 slices to fill a piece of bread. Start forming your loaf and if you like create a well in the top to add some cheese, then cover with more meat, top it all off with bbq sauce. Your probably thinking.. \"He's crazy how can poking holes in the bottom to let grease out keep the oven clean\"\u00a0 ah grasshopper I may indeed be crazy but Im not stupid !\u00a0 Place a large baking dish under your meatloaf pan and fill it half way with water, okay one not so good side effect of the steam was the bottom of the loaf never browned but thats a small price to pay. 
Place in a preheated\u00a0350f oven for 90 minutes or so, I know thats longer than a usual meatloaf but this thing is HUGE\u00a0 and because of the steam bath its not going to dy out.\u00a0 The eagle eyed may see my cast iron frying pans that I left in the oven, it doesn't hurt the pans and helps provide regular heat with a flucuating electric stove. Once the meatloaf is done to your liking, take it out and allow to sit for 10-15 minutes before slicing or unless you cant wait like me, let it cool then place in the refrigerator for later, even after 10minutes the cheese in the middle was still molten so be careful ! MAke your sandwich and enjoy or of course you could add a baked potato and make it a meal\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_102_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_102_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_102_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_102_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. I winged the hot sauces and didn't really know what I was going to make until it was underway. You know how the creative process goes.\u00a0I put on a respirator because the air gets pretty thick when all those hot peppers are cooking. 
I made three hot sauces, bourbon ginger, Ardbeg thai chili and scotch jalape\u00f1o.\u00a0Bourbon Ginger Hot Sauce 6 serrano peppers 3\" of fresh ginger 2 oz bourbon 2-3 Tbs of pomegranate seeds 2 cloves garlic 2 tsp salt 1/2 lime 1/2 cup water 1 Tbs vinegar 1 Tbs olive oil Dash of pepper Split the serranos in half and put them in a cast iron skillet on high until they start to blister and turn black. Toss in the garlic for another couple of minutes. Lastly add the ginger and pomegranate and 1 oz of the bourbon. Toss it all around until the liquid has evaporated. Put all of the ingredients, including what was in the skillet, into the food processor. I left the skin on the lime and just tossed it in. Let it go for 3-5 minutes. Strain the mixture, bottle it and keep it in the fridge.Scotch Thai Chili Hot Sauce 1 cup of thai chilis 2 oz of Ardbeg scotch 1 tsp brown sugar 1 Clove garlic 1 Tbs salt 1 Tbs lime juice 1 Tbs olive oil Dash of pepper Do the same as above, but heating only the chilis and garlic.\u00a0Scotch Jalepeno Hot Sauce 4 Jalapenos 2 oz of Ardbeg scotch 1 Clove garlic 1 Tbs salt 1/2 lime whole 1/2 c. fresh cilantro 1/4 of a small onion 1 Tbs olive oil 1 tsp Cayenne 1 tsp Chipotle 1 tsp Pasilla\u00a0 Do the same as above, but heating only the Jalapenos, garlic and onion.. First off, you're going to make an 8.5x11 piece of paper from a brown paper bag. Just cut the two largest panels, iron them, and then use a piece of paper as a template. It needs to be fairly exact or your printer do its 'I hate you' noises.\u00a0. I pulled images of the ingredients off the interweb and arranged them into a nice grid. I turned the image black and white, pumped the contrast and lowered the brightness to get the mid-tones to show up on the brown paper.. For this bottle size (it's an old Soy Sauce bottle) one sheet of paper was perfect for two labels. Do a test run with a white sheet of paper first. 
When you print, you may need to tell your printer that you're using a heavier weight paper. Once it's printed, cut it to size, put some glue on the edges and voila. Lovely brown paper textured label.\u00a0. Another alternative is to create a bottle sized bag from the 8.5x11 print out. Since I had to fold and handle this page more, the ink started to crack and come off a bit. I used a laser printer, so you might have better luck with inkjet. I had a box of crackers about the right size and folded the paper around one side like a present. Then I creased the two sides inward. The whole project turned out pretty well and definitely got the \"I can't believe you made this!\" response.\u00a0\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_103_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_103_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_103_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_103_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Coffee:\nabout 2 cups of strong coffee\n1/4 tsp allspice\n1/2 tsp cinnamon\u00a0\nabout 1/8 of ginger\nPinch of nutmeg\n1 cinnamon stick or an additional 1/2 tsp of cinnamon\n1/2 c sugar to taste\n1/2 c milkSupplies:\nIce Tray (if making chilled)\nSomething to mix spices in\nA big pot. Make a strong cup of coffee, too watery and it'll be too sweet at the end. You want a good dark pot, it will get weaker by the end.. 
1/4 tsp allspice\n1/2 tsp cinnamon\nabout 1/8 of ginger\nPinch of nutmeg\nand if not using the cinnamon stick add 1/2 tsp of cinnamon\nDo not mix the sugar yet.. Combine Coffee, cinnamon stick and spices in big pot over med-high heat. Add sugar slowly to taste.\nRemember, chai is supposed to be sweet!\nNow slowly add the milk and stir slowly. As soon as it starts to boil remove it from the heat.\nDrink up!\nIf you want to try it iced, I suggest adding a scoop or so of vanilla ice cream and blending up some frozen cubes of this blend. The cubes don't even need to freeze all the way through, I let mine freeze for about an hour and a half and the centers were a little soft but it turned out fine.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_104_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_104_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_104_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_104_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Oil Spray\nPotatoes (used Russet this time)\nAny type of seasoning you'd like, curry powder and salt/pepper was used this time around.. Preheat the oven to 400 F.\nWash your potato. If you feel like you dislike skin or find it too thick, use a peeler to get rid of the potato skin. If not, then just start slicing. I found that I enjoy not burnt chips, so I've been safeguarding myself from it by slicing them rather thickly. 
Harder to burn them that way.\nTip from multisync: Wash the slices in water in order to remove a bit of the starch and make the chip crispier.. Get some paper towels, line the potato slices up and pat dry of extra moisture.. Spray your baking pan or cookie sheet with the oil spray and start lining up the chips in. If you'd like, you can spray again on top of the potatoes for crispier chips.. Place the chips in the oven and wait for twenty minutes or so, depending on the thickness of your slices. The chips should be a golden brown by the time they are done. When they are baked to your approval, take them out of the oven and start seasoning them immediately, as the oil on the chips will still absorb the seasoning.\n**NOTE: You can also flavor your chips before baking, however I found that the smaller chips are most likely to burn and also burn the spices on it, making for an untasty chip. Flavoring afterwards is merely a precaution.**\nAnd you're done! Enjoy! Thanks for reading/looking. :)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_105_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_105_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_105_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_105_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 
Ingredients:\n1 cup of butter (2 sticks)\n2 eggs\n1 tsp vanilla\n1 1/2 cups of sugar\n3 cups of flour\njelly or jam (flavor of choice)non-food materials: cookie press - use star attachmentMix butter, eggs, sugar, and vanilla together. Once smooth add in flour 1/2 a cup at a time.When all the ingredients are mixed let the dough sit out for 15 minutes. In my experience this makes the dough firmer and the cookies chewier.. You can buy a cookie press at most kitchen supply stores and online. Many of the newer models operate and look like a caulk gun - this is what I use currently. My mom has one of the really old school cookie presses.To fill the cookie press take a handful of dough and roll it in the palm of your hands so it looks like a snake. This will help it easily slide into the cookie press tube. Once the dough is in (and not hanging out the end) put the star attachment on the top and secure it with the front ring.. Put cookies on a non-stick pan (don't grease).\u00a0 Position the cookie press as seen in the image and press down handle of cookie press and dough will come out.\u00a0 You want to fill the gap between the cookie sheet and the cookie press with dough.\u00a0 Don't press too hard, you don't want dough squeezing out the sides.\u00a0 Once you have pressed out enough dough lift the cookie press up and you will see your cookie!\nYou will get the hang of this over time.\u00a0 I get very particular about the size of my cookies.. To make room for the jam use your knuckle to press a small indentation in the center of each cookie.\nPut a little jam on the tip of a knife and use the knife to put the jam in indents of the cookies.. 
Bake at 350 degrees for 8-10 minutes.\u00a0 You will know they're done when the bottoms of the cookies are slightly browning.\nLet cool for 15 minutes and enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_106_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_106_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_106_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_106_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. You will need: Chocolate chess pieces mold (available online or in specialty cake supply shops) 12 oz White candy melts (they come in 12 oz bag) 12 oz Black candy melts (they come in 10 oz bags) Plastic piping bags or plastic squeeze bottles Small paring knife (not pictured) Pair of white cotton gloves--you'll thank me later on this one! You will NOT need: Anything to grease the mold--the chocolate will pop easily out of a clean, *dry* mold. (Water can make chocolate seize, so be sure to have everything dry.). Put a large handful of candy melts into the plastic piping bag or the plastic sqeeze bottle. \u00a0(I like to place mine on a kitchen towel to keep the plastic bag from melting on the hot glass turntable after a few rounds of microwaving). \u00a0Only microwave for 30 seconds at a time and remove after each 30 second session and squeeze bag/container with your hand until thoroughly mixed. \u00a0You don't want to overheat the chocolate and have it seize in the containers. 
\u00a0The chocolate will scorch if heated too hot and turn into a hard lump of chocolate that is unusable. \u00a0Squeezing the bag after each interval will mix up the hot spots with the cool spots and keep it more evenly melted. \u00a0Keep microwaving until chocolate is completely fluid, maybe even cutting down the time in the microwave to 15 seconds for the last lump of chocolate bits and then massaging the bag really well.. Fill the chess mold with your chocolate. \u00a0Do not fill past the edges of the reservoirs or you will give yourself more work later cleaning up the seams when joining the two halves together. \u00a0Air bubbles will have formed inside the chocolate piece so you will need to lightly tap the mold on the table several times until you see the bubbles rise to the surface. \u00a0You can then pop the bubbles with a toothpick. \u00a0Tapping the mold on the table also helps to level the liquid chocolate in the pieces, so it is a good thing to do it after filling each reservoir before the chocolate hardens at room temperature. \u00a0After all the reservoirs are filled, you can place the mold in the refrigerator for a few minutes to harden up the pieces. \u00a0Since the melted chocolate isn't too hot to begin with, it will harden rather quickly--maybe even hardening in the piping bag before you are done filling the mold. \u00a0When that happens, pop the bag or bottle back into the microwave to remelt for 30 seconds again. \u00a0. After the pieces are hard, you can remove them from the refrigerator and unmold them. \u00a0I would recommend using your hand as a brace until you can gently place the mold upside down on the table, or the fragile pieces will fall to their untimely doom. \u00a0For the more stubborn pieces, gentle pressure from your fingers should pop them from their plastic form. \u00a0Cotton gloves are recommend at this point since the warm body temperature of your hands will easily melt fingerprints into the sides of the pieces. 
There are two ways to attach the halves together to make a whole chess piece. 1) Use melted chocolate as a \"glue\" to attach the two halves together. \u00a0Hold halves together until pieces are stuck together. or\u00a0 2) Take a cold harden half and place it on top of the still liquid match in the mold before you place it in the refrigerator to cool. I've done both ways and prefer the sandwiching the two hardened halves together with melted chocolate. \u00a0I tend to have less to clean up on the seams later than when I have to perfectly float a piece of hard chocolate on the liquid chocolate and hope it doesn't slide off before I get it to the refrigerator. \u00a0But you may find that it works better for you. \u00a0. Using a sharp paring or exacto knife, cut the hardened chocolate seams flush with the piece. \u00a0Sometimes I will even \"buff\" out a seam with my finger if it is not too big or to fill in a small gap between the edges. \u00a0. You'll need to mold the pieces the following number of times for a complete chess set. Rooks: \u00a04 white halves for two complete pieces and 4 black halves for two complete pieces Knights: \u00a04 white halves for two complete pieces and 4 black halves for two complete pieces Bishops: \u00a04 white halves for two complete pieces and 4 black halves for two complete pieces Queen: \u00a02 white halves for one complete piece and 2 black halves for one complete piece King: \u00a02 white halves for one complete pieces and 2 black halves for one complete piece Pawns: \u00a016 white halves for eight complete pieces and 16 black halves for eight complete pieces But of course it is always wise to mold a few extra halves to account for breakage that might occur when unmolding the pieces or squeezing the two halves together too tight. 
\u00a0You can always throw the broken pieces back into the bag/bottle to remelt the chocolate.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_107_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_107_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_107_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_107_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Peel onion .. Find plastic lid that the onion will fit into .. (This onion is already sliced as i started the photo shoot after i was midway through the dicing process ). . Slice a checker board pattern into the onion. ( Cut into the onion starting at the right edge . Press the blade into the onion until the blade is impeded by the plastic lid .. Remove the blade from the onion and repeat the slicing process in parallel cuts across the rest of the onion to the left. Rotate the onion 90 degrees and repeat the cutting process perpendicular to the first cuts. ) ( I used a radial cut method in the photos. I describe the checker board cut pattern because I like it better.) . Turn the onion onto its side... Slice across the onion and the previous cuts creating nicely diced onions. 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_108_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_108_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_108_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_108_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Always wash your hands before making food.. \u2665bread \u2665jelly \u2665butter \u2665paper \u2665knife \u2665toothpick \u2665pen \u2665tape \u2665plate\u2665scissors. lay out two peices of bread, side by side, on the plate.. open the butter. using the knife spread the butter on the peice of bread to your right. Put the lid back on the butter.. open the jelly. using the knife spread the jelly on the peice of bread to the right. Close the lid on the jelly.. lay the peice of bread to the left on top of the peice of bread to the right.. cut the bread into four/three lines using the knife.. fold the paper in half.. on the fold draw flags and then cut the flags out.. slide the toothpicks inside the fold in the flag. tape th flag around the bottom to hold the toothpicks to the flags.. roll up the bread and stick your toothpick flag and stick it through the center of the roll.. . the rolls do not always roll perfectly like the one in the pictures. (Remove toothpick before eating.) Enjoy! 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_109_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_109_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_109_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_109_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Wash, peel, and coarsely chop the carrots.\u00a0 I often weigh the carrots to help me better estimate how much salt and garlic to add.\u00a0 In the images, I'm using around half a kilogram.. \n Use enough oil to cover the carrots by half-an-inch or so, and heat up both the oil and carrots.\u00a0 There's enough water in the carrots that under medium heat, the mixture will stay near 212\u00b0F.\u00a0 Unlike some frying recipes, like Pimientos de Padr\u00f3n where not much oil is absorbed, the carrots will soak up some oil.\u00a0 So, I\u00a0like to use exclusively high-quality olive oil.\u00a0 Steve McCulley of Apollo Olive Oil -- my favorite producer -- suggests that olive oil's polyphenols survive up to 320\u00b0F.\u00a0 Here's an email from Steve:The polyphenols are still intact at 210 F, In fact polyphenols protect the oil when the heat rises up to around 320 where they may begin to break down. I cook with extra virign olive oil but others think it is waste. I cook with it because I notice a difference in taste and I know polyphenols are still retained. Others do not notice a big difference in taste so choose to cook with a lesser grade oil. 
Some people cook with a high quality wine and others do not. Like wine you can taste the oil much better when uncooked and drizzled fresh over cooked food. I, however, still cook with real extra virgin olive oils because other oils are not only not really extra virign they are frequently defective. If budget is a consideration I would consider cooking in a lesser oil and dressing with ApolloOlive Oil. When the carrots are cooked, you can filter and reuse the oil.\u00a0 I\u00a0keep my oil in the refrigerator: in the images, it's still cold, which is why it's an opaque\u00a0yellow.. Cook the carrots under oil until they've lost most of their water.\u00a0 The objective is to concentrate the carrots' flavor without exposing them to oxygen or temperatures much above boiling.\u00a0 You can tell when this happens because the carrots' volume will be significantly reduced, and the temperature of the oil will start creeping upwards -- I\u00a0usually call it done at around 230\u00b0F.\u00a0 Depending on your heat-level and amount of carrots, this will take 30-60 minutes.. Drain the carrots.\u00a0 I\u00a0use a kitchen sieve, and just let the oil drip back into the pot.I\u00a0then use the sieve to strain the oil for later reuse.. Add salt and garlic to taste.\u00a0 For 500 g of peeled, uncooked carrots, I\u00a0use 3-5 g of sea salt and 3 cloves of raw, micro-grated garlic.\u00a0 Note that if you want to reuse the oil, you don't want to add the salt and garlic to the oil.. 
Serve while still warm.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_110_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_110_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_110_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_110_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 1 Single Pie Crust Rolled Flat (I made mine using this recipe, A Healthier Flaky Double Pie Crust w/ Coconut Oil) 4 Ripe Peaches - Pared and Sliced1/4 Cup Granulated Sugar1/4 Cup Brown Sugar 1/2 Tablespoon Lemon Juice1 Table spoon Cornstarch 1/16 Teaspoon Salt (literally a pinch)* For the pie crust, because I only needed a single crust, so I divided the ingredients by 50% and it worked like a charm. . I highly recommend making your own crust, for this recipe. It's super easy and 100% worth it in the end, plus there are tons of great recipes out there. I use A Healthier Flaky Double Pie Crust w/ Coconut Oil for all of my pies and tarts. It takes 5 minutes to pull together and only an hour to chill. The final results are melt in your mouth flaky.While the pie crust is chilling, combine the sliced peaches, granulated sugar, and brown sugar in a medium sized bowl. Cover with plastic wrap and set it in the refrigerator for 1 hour.* I call this prep work, because both of these items will need to be done, well in advance and will need to sit for at least an hour. ** Every 20 minutes or so, I give the peaches a toss in the sugar mixture as they macerate.. 
Once the pie crust has chilled, remove it and let the dough disk rest on a floured surface for about 10 minutes.Gently, roll out the dough evenly in all directions to form a 14\"x14\" sheet. Now cut 6 - 6\" rounds out of the dough sheet (I used the lid of a pot, which worked great, however any round object will do as a template). You may need to reshape the dough disk and roll it out a second time, to get all 6 rounds.Next, place each of the dough rounds in to the muffin pan, carefully working each round into the shape of the cup (if any small holes develop, you can easily patch them with a small piece of the the scrap dough).Finally, to top the tarts, using a small cookie cutter, stamp out 6 shapes (be creative, there are a million cool cookie cutters out there...I used a star shape) and transfer them to a foil lined baking sheet.Cover both the baking sheet and the muffin pan with plastic wrap and put them back into the refrigerator to chill.. Remove the macerated peach slices from the refrigerator and drain well, reserving the liquid in a medium sized pot and returning the peach slices to the bowl. Next, add lemon juice, cornstarch, and salt to the pot with the reserved peach juice. Bring to a rolling boil over medium heat, stirring constantly until the mixture begins to thicken (5-6 minutes).Once thickened to your desired consistency (I stir for about 10-12 minutes) , pour it back into the peach slices and stir until combined.. Preheat the oven to 425 degrees and move the oven rack to the lowest position.Next, spoon the peach tart filling into the prepared crusts and top with your decoration of choice.Finally, bake at 425 degrees until the edge of the crusts are a light golden brown. Reduce the temperature to 375 degrees and continue baking until the edge crusts are golden brown. . Remove the tarts from the oven and allow to cool in the pan for 4-5 minutes, until set. Then remove the tarts from the muffin pan and cool on a wire rack for 1 hour. 
Now it's time to sit back, relax, and enjoy. I promise these tarts will not disappoint. Plus, they are the perfect size to hold 1 scoop of vanilla ice cream on top, for a prefect peaches and cream experience. Warm or cold, they are delectable...In fact, I wish I had one right now (seriously). I really hope you've enjoyed this Instructable! Happy Baking!!!Just a note: Please, when ever possible, support your local growers. Not everyone is lucky enough to have access to locally grown produce, if you do, it's important to help keep it alive. Thanks! \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_111_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_111_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_111_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_111_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 5 large Jalape\u00f1os 1 Pack of Bacon 1 container of cream cheeseBBQ RubBBQ Sauce (optional). First you will need to prepare your ABT's using all the ingredients above. Start by slicing the Jalapeno in half and with a spoon cleaning out the seeds and membrane. Next you will fill the half Jalapeno with cream cheese. Make sure to fill it full. Next you are going to want to apply some of your bbq rub onto the cream cheese. Lastly you will need to wrap your stuffed jalapenos with one full slice of bacon. TIP: Make sure to wrap it firm, this will help it cook together and you won't have to use toothpicks.. 
One your Atomic Buffalo Turds have been put together you will them place them onto your grill using indirect cooking with a tempreture of around 300-325 degrees. Place a small chunk of hardwood in for smoking (optional) and then close the lid and begin cooking for 1 hour 15 minutes.. After 1hr 15 mins, your bacon wrapped jalape\u00f1os should be done. If you like your bacon more cooked feel free to leave them on for a few more minutes or until your preferred doneness. Let them cool for a few minutes to allow the cream cheese to cool a bit. Serve it up with your favourite BBQ Sauce, Ranch Dressing or blue cheese sauce and enjoy.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_112_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_112_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_112_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_112_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. For this recipe you will need:Small package vanilla instant pudding1 cup eggnog (with or without alchohol)8 ounces cream cheese8 ounces Cool WhipGraham cracker crust. Mix the instant pudding with 1 cup eggnog. Mix on medium speed for 2 minutes.. Add softened cream cheese and Cool Whip. Mix well on medium speed for 3 minutes.. Spoon filling in graham cracker pie crust. Cover and place in refrigerator for at least 1 hour.. 
Cut into 8 slices, garnish with chocolate candy (Optional) and serve.Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_113_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_113_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_113_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_113_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. I know it\u2019s hard, but do your best to refrain from eating all of the cookies rightthisverysecond. See? The lite version. Looks exactly like the regular, but gives me an excuse to eat more piebecause it\u2019s lite! You may or may not need to taste the whipped topping. It depends on whether you want to save your family from potentially poisonous foods. I do, of course, so I always taste the things I bake for them. Several times. It\u2019s about their safety!. Mixing is hard work. Try to contract out to your kids if possible. Convince them that if they taste this before it\u2019s done they could die.. Beware of razor-sharp pie crust tin edges.Scrape all the pie filling into the pie dish. Taste again, because the crust could also be contaminated. You will have a big tower of deliciousness, as shown above.Who\u2019s boss by forcing it to look like a pie. Take a scoop of the filling on your finger and eat it to make an example of what you\u2019ll do if it doesn\u2019t obey.. Freeze for 4-5 hours or until relatively solid before serving. 
Keep frozen if you have leftovers (HA!)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_114_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_114_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_114_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_114_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Warm the milk in a small saucepan until it\u2019s about 110 degrees. While the milk is heating up go the to next step.. Chop up 2 tablespoons butter and let it soften in a warm place. We have found that on top of the refrigerator is handy for this.. Pour the now warm milk into a medium bowl and then sprinkle in the yeast. If you do not have yeast in an individual packet you can simply use a tablespoon of yeast. Let the yeast start propagating and growing for about 3 minutes. Once the yeast has had a chance to get going mix in the brown sugar and 1 cup of flour.Next add the softened butter that you prepped and then slowly mix in the remaining 1 1/4 cups flour and the fine salt to make a sticky dough.. Prep a clean place to knead the dough and lightly dust the surface with flour. Knead while adding more flour if needed until it is smooth but still slightly tacky. This should take about 5 minutes. Next, shape the dough into a ball and place it in a lightly greased bowl and cover it with plastic wrap. Finally let it rise in a warm spot (AGAIN, THE TOP OF THE FRIDGE IS USUALLY A GOOD SPOT). Give it a about an hour to rise; it should doubled in size.. 
Once the dough has risen you will want to \"punch\" it. This is exactly what it sounds like. You just punch it to deflate it some. Next preheat your oven to 450 degrees fahrenheit and then grease the cookie sheets where you will later place your finished pretzel creations. Get ready to get your hands on some dough!. Pretzel dough is a pretty easy medium to work with. If you have ever made anything with play-dough you should be well-equipped to let your imagination run wild. Our kids had a lot of fun getting creative with their creations. You can stick to more traditional shapes or go completely crazy and make everything from a butterfly to a baby like my kids did.. This is the secret ingredient that makes these pretzels taste so yummy and chewy. Don't skip it!You are going to want to dissolve the baking soda in 3 cups of warm water in a shallow baking dish. Gently dip each pretzel in the soda solution, then arrange them on the prepared cookie sheet.Finally, sprinkle them with the coarse salt before putting them in the oven. (WE USED FINE SALT AND IT WAS JUST FINE BUT WE WANT TO TRY IT WITH COURSE SALT NEXT TIME.). Place the pretzels in the preheated oven and bake until golden for 10 to 12 minutes. Then melt the remaining 8 tablespoons of butter in a shallow bowl or dish. Dip the hot pretzels in the butter or use a basting brush. Be sure to coat the entire pretzel before placing them on a wire cooling rack to let excess butter drip off.. Serve warm and enjoy! These were truly delicious and turned out really well. I hope you enjoy them as much as we did! Again, if you want to see these steps in action or just get some inspiration from our journey take a quick look at the video above.If you want to see the original foodnetwork.com recipe you can find it here. ____________________________________Thanks for joining us in this adventure! 
If you enjoy spending time with your kids and would appreciate fresh inspiration and instructions weekly, consider subscribing to my youtube channel where I do a weekly project with my own kids. I would love to have you join our family in the making! Blessings, Caleb\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_115_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_115_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_115_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_115_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Ingredients:\n1 bag mozzarella cheese.\n1 pack bologna.\n1 large can pasta sauce\n1 package of crackers (unsalted or lightly salted work best)\nOther ingredients to taste.\nTools\n1 microwave\n1 deep dish 8x8 that can be used in a microwave.\n1 spoon to spread sauce.\n1 microwave cover (optional). Spoon some pasta sauce in the bottom of the dish. Spread evenly. this will prevent any sticking as we are told.. Now cover the sauce with crackers. I like to put a lot on the bottom layer to make sure the lasagna will come out easily and absorb all the juices of the meats cooking. . Now it is time to lay down the Bologna (pronounced baloney in America) one level thick.. Add the cheese generously.Make sure it is spread evenly though.. Repeat the last four steps until the dish is full or you run out of ingredients. Ours was two layers thick. Be sure to remove the spoon before cooking!!!!. All microwaves are different. 
I would recommend cooking in more than one step so that it is not overdone or you have a mess in the microwave.Using a loose cover is a good idea.\u00a0 I think we did it in two steps for a total of three minutes. Your mileage way very.. Once the cooking is done, you might wait a minute till it cools down a bit before removing from the microwave. A touch of cilatro or italian parsley\u00a0 would give it more appeal.\u00a0 Then go ahead and eat!!!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_116_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_116_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_116_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_116_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Find your favorite vegan cupcake recipe, vegan brownie recipe, and vegan buttercream frosting recipe and get all the ingredients that you need. In addition to these ingredients, you also need food coloring (I used red for ketchup, yellow for cheese, and green for lettuce) and sesame seeds.\nI used this cupcake recipe ,\u00a0this brownie recipe , and this buttercream frosting recipe . 
The cupcake recipe asks for coconut oil, but if you don't have any (or your grocery store doesn't have any) you can substitute it with vegetable or olive oil.\nIn a nutshell, you need all of this:\n- apple cider vinegar\n- almond or soy milk\u00a0\n- all-purpose flour\n- white sugar\n- baking powder\n- baking soda\n- salt\n- coconut oil (or vegetable or olive oil)\n- vanilla extract\n- unsweetened cocoa powder\n- water\n- nonhydrogenated shortening\n- nonhydrogenated margarine (I used Earth Balance)\n- powdered sugar\n- food coloring (red, yellow, green)\n- sesame seeds\nAlso, if your margarine and/or shortening was\u00a0refrigerated, I would take it out of the fridge so that it can soften by the time you get to making the frosting.. Mix the batter for your cupcakes. Again, I used this recipe as a basis, but I've copied/pasted it below and tweaked it a bit.INGREDIENTS\n1 tablespoon apple cider vinegar\n1 1/2 cups almond milk\n2 cups all-purpose flour\n1 cup white sugar\n2 teaspoons baking powder\n1/2 teaspoon baking soda\n1/2 teaspoon salt\n1/2 cup coconut oil, warmed until liquid (or vegetable or olive oil)\n1 1/4 teaspoons vanilla extractDIRECTIONS\n1. Preheat oven to 350 degrees F (175 degrees C). Instead of lining your cupcake pans, grease them with a cooking spray.\n2. Measure the apple cider vinegar into a 2 cup measuring cup. Fill with almond milk to make 1 1/2 cups. Let stand until curdled, about 5 minutes. In a large bowl, Whisk together the flour, sugar, baking powder, baking soda and salt. In a separate bowl, whisk together the almond milk mixture, coconut oil and vanilla. Pour the wet ingredients into the dry ingredients and stir just until blended. Spoon the batter into the prepared cups, dividing evenly.\n3.\u00a0Make one batch fill up to half the depth of the tray (these are the top buns) and another batch that fill up to a quarter of the tray (these are the bottom buns).\n4. 
Bake until you can poke them with a toothpick and it comes back out clean (about 10 mins).\n5. Take them out of the oven and allow them to cool in the pan. Then carefully remove them. I used a fork to scoop them out, but be careful not to scratch the pan! If you try removing the cupcakes before they're cooled enough, they'll break apart.\nVegan cupcakes are different from regular cupcakes that use eggs and butter. For one, they don't rise as much as regular cupcakes. Secondly, they don't cut very well. Other cheeseburger cupcake recipes call for you to bake normal cupcakes and then cut through them. This doesn't work with vegan cupcakes. They tend to fall apart and crumble when you try to slice through them. Lastly, they tend to be a little stickier than regular cupcakes. If you don't grease the pan well enough, the cupcakes will burn.. Mix the batter for the brownie. You only need half the amount of the brownie mix, otherwise you'll end up with a lot of extra brownie mix.\nThis is the recipe I used with appropriate measurements (original recipe here ):INGREDIENTS\n1 cup unbleached all-purpose flour\n1 cup white sugar\n3/8 cup unsweetened cocoa powder\n1/2 teaspoon baking powder\n1/2 teaspoon salt\n1/2 cup water\n1/2 cup vegetable oil\n1/2 teaspoon vanilla extractDIRECTIONS\n1. Make sure your oven is still at 350 degrees F (175 degrees C).\n2. In a large bowl, stir together the flour, sugar, cocoa powder, baking powder and salt. Pour in water, vegetable oil and vanilla; mix until well blended.\n3.\u00a0Again, using the cupcake pans, only fill the pans half-way (or less if you want thinner paddies).\n4. Bake until a toothpick comes clean after poking the center of the brownie (about 15 mins).\n5. Allow to cool. Be careful when removing the brownies. They sometimes can break easily.\nYou can also just make chocolate cake batter if you don't like brownies. However, I liked the combination of cupcake and brownie. . Make your frosting. 
I used this recipe\u00a0as a basis and tweaked it.INGREDIENTS\n1/2 cup nonhydrogenated shortening (softened)\n1/2 cup nonhydrogenated margarine (softened) (a.k.a. Earth Balance)\n3 1/2 - 5 cups powdered sugar, sifted if clumpy\n1 1/2 teaspoons vanilla extract\n1/4 cup plain soy milk or soy creamerDIRECTIONS\n1. Beat the shortening and margarine together until well combined and fluffy. This is important. The consistency of your frosting depends on how fluffy you make your shortening and margarine combination.\n2. Add the vanilla and soy milk.\n3. Add the sugar and beat until fluffy.\u00a0Try to make your frosting thicker. The original recipe only calls for 3 1/2 cups of powdered sugar. To thicken the frosting, add more powdered sugar. I'd recommend using around 5 cups total of powdered sugar.\n4. Separate a decent amount of frosting per color food coloring that you'd like to make. Mix in enough food coloring until you've reached the desired color. I added in 5 drops at a time.\nIf you don't have frosting bags, you can use ziploc bags. Cut off the tip of one corner and secure in your frosting tips. Then, scoop in your frosting, making sure not to have the frosting leak through. Then, close the bags, making sure to get rid of as much air as possible.. Add frosting on the bottom bun. I put the \"lettuce\" on this layer. I put more of the green frosting on the edges so that it would stick out and can be seen once you put the brownie burger paddy on top of it.\nPlace the brownie burger paddy in the center.\nAdd more frosting on top of the burger paddy. I put the \"cheese\" on this layer. I drew a square with the yellow frosting to imitate a slice of cheese.\nTake the cupcake for the top bun and place it upside-down on top of the brownie burger paddy. This keeps the color of the top bun consistent with the bottom bun.. 
You don't really need to do this, but it helps make the slider look more convincing.\nSome recipes use a lemon or an orange mixture to make the sesame seeds stick. This one uses a simple sugar mixture (1 part water, 2 parts sugar). I like this better because it doesn't add any additional taste to the cupcake.\nBrush some of the mixture on the top of the cupcake and sprinkle some sesame seeds on top.. Stuff your face! YOU'RE DONE!!! :)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_117_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_117_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_117_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_117_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. For the Bars:1/2 cup Brown Rice Syrup (you can substitute corn syrup - but i'm not sure the bars could still be considered \"nice\") 1/2 cup Maple Syrup 1/2 cup Dark Chocolate Chips 2 TBS Coconut Oil Pinch of Sea Salt For the Chocolate Topping:3/4 cup Dark Chocolate Chips 2 TBS Coconut OilAdditional Toppings (optional - but tasty)1/4 cup toasted nuts - chopped (almond, macadamia, hazelnut, etc.) Sprinkling of cocoa nibs Pinch of flaky salt. Over medium heat, bring the Brown Rice Syrup and Maple Syrup to a gentle boil. Boil for 1 minute; continuously stirring with a heat proof spatula.. Add the almond butter, 1/2 cup Dark Chocolate Chips, 2 TBS Coconut Oil, and a pinch of sea salt. Stir, stir, stir ... Until chocolate is melted and mixture is smooth.. . 
Hint: Dampen your fingers with water or coconut oil before beginning this step.. In the original saucepan, over medium heat, melt 3/4 cup Dark Chocolate Chips with 2 TBS coconut oil. When the chocolate chips are melted, pour over the puffed cereal. Smooth topping with spatula. Add toppings of choice. Refrigerate for 1 hour - or until topping is firm.. Use parchment to lift mixture out of the pan, cut into 16 2-inch squares.Enjoy!Note... Best eaten within 48 hours (trust - this is not a problem).\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_118_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_118_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_118_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_118_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. you'll need:for the soup- 3 big beef tomatoes (about 400 - 500 g)- a bit more watermelon than tomato (about 600 g)- 2 cloves of garlic- a lime- olive oil- hot sauce or fresh chilli- some black olives- about 200 g of creamy sheep cheese- and of course some fresh chervil on a side note i have to say that chervil really deserves the title \"challenge ingredient\" in the watermelon contest, since it was quite the challenge to even obtain it. 
i think i went to five shops whitout success, when i wanted to bike to the market to see if i could find some there the mother of all rainstorms broke loose and so i finally decided to go to the supermarket to buy frozen chervil as a last resort and there it was: one last package of fresh chervil sitting amidst a bunch of basil...i hope you'll have an easier time finding it, otherwise substitute with frozen chervil or a smaller amount of tarragon which has a similar but much stronger taste.for the crispy bread- one small pita bread- a bit of fresh parsley, some herbs and spices- a bit of olive oilthis will be enough for 2 persons as a main course or 4 as an appetizer. remove the stem from the tomatoes and make a few shallow cuts into the skin.put the tomatoes in boiling water for one minute, then remove the skin. cut the skinned tomatoes into wedges and remove all the seeds. this is a quite messy affair...put the tomatoes in a blender jar or in a bowl if you don't have a blender.now cut the watermelon into chunks and remove the seeds, put the pieces into the blender jar with the tomato pieces.yeah! even more sticky mess all over the kitchen table. this would have been much faster and easier if i had bought seedless melon.... . now put the jar on the blender and blend (or puree with a handheld mixer). skin the garlic and add it to the mix, as well as some olive oil (about 50 ml) and blend some more until everything is smooth. if you use fresh chilli, add the chilli (seeds removed) to the blender as well.strain through a sieve to get rid of all the tomato and melon seeds that sneaked ther way in and any garlic that wasn't properly pureed.season your soup with salt, lime juice (i used half a lime, but it held a lot of juice) and hot sauce if you didn't add fresh chili.chill the soup in the fridge for several hours.. 
chop most of the chervil finely (keep some of the nicest leaves for decoration), put in a bowl and add olive oil until the consitency is like a runny pesto. let it stand for some time in a cool place.cut the olives into small pieces, cut or crumble the cheese into cubes.. preheat the oven to 200 degrees.cut the pita bread into slices about 1 cm thick and put on a baking tray next to each other. sprinkle each slice with some fresh parsley, other herbs an spices (i used paprika and some freshly ground pepper), then carefully trickle a bit of olive oil on the slices.bake the slices in the oven for about 15 minutes until crispy and golden brown.you can make the bread some hours in advance, but if you leave them sitting for too long, the bread will draw moisture and be less crispy.this bread works great with other soups and meals an is a perfect way to prevent leftover pita bread from going stale and unedible. although the crispiness will fade over time, it will stay good even for a few days if you store it in a cool and dry place.. 
put the cold soup into bowls and add some cheese cubes, olive pieces and a bit of chervil oil on top of each bowl.decorate with some fresh chervil and enjoy your fruity refreshing soup with the crispy pita bread.dr_peru and me enjoyed it very much!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_119_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_119_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_119_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_119_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Use a vector drawing tool to make your drawing. This can be anything from Inkscape to Illustrator.. Seperate the egg-white form the egg yolk. Put the egg-white in a small bowl and position it under the lasercutter.. Just play around with the speed and power of the laser to make the perfect white coloured egg (or black if you like). I used speed 50%, power 50% for my chicken on a LaserPro machine.. . 
The egg-white sculptures are perfect for animation.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_120_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_120_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_120_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_120_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Recipe ingredients 2 1/2 quart popped corn 2 cups corn chips, slightly broken 1/4 cup butter 1/4 cup Louisiana style hot sauce 1 teaspoon celery seed 1/4 teaspoon salt, optional Hardware Popcorn popper large bowl Gallon zip lock bag measuring cups, spoons large spoon or mixing thing. As a new trend for cooking ibbles lets have just the recipe on one page so i don't have use up all my ink printing pictures of your kitchen. Plus they fit better in the cookbook. Recipe ingredient 2 1/2 quart popped corn 2 cups corn chips, slightly broken 1/4 cup butter 1/4 cup Louisiana style hot sauce 1 teaspoon celery seed 1/4 teaspoon salt, optional How to make Buffalo style hot popcorn Pop the corn I used half a cup which makes about a gallon of popcorn. Place the chips in a zip lock bag and lightly brake them up. Mix the chips and pop corn save the zip lock bag for storing the popcorn In small saucepan, melt butter with hot sauce, celery seed and salt. Pour over popcorn-corn chips mixture, tossing gently to coat. This next step I skip. I think it makes the popcorn stale but it might be important for some reason. Spread on 15 x 10 inch baking sheet. 
(could also be left in the metal bowl) Bake at 350 degrees F for 10 minutes. Remove from baking sheet to large serving bowl. Serve immediately or store in airtight container.. Pop the corn I used half a cup which makes about a gallon of popcorn. Place the chips in a zip lock bag and lightly brake them up. Mix the chips and pop corn save the zip lock bag for storing the popcorn In small saucepan, melt butter with hot sauce, celery seed and salt. Pour over popcorn-corn chips mixture, tossing gently to coat.. In small saucepan, melt butter with hot sauce, celery seed and salt. Pour over popcorn-corn chips mixture, tossing gently to coat. Note: do not substitute celery salt their is already enough salt in this. That being said i like the salt. In fact my brothers would get me to make popcorn (because i was the only one who would not burn it) and they would hide the salt shaker so i would not over salt it.. In small saucepan, melt butter with hot sauce, celery seed and salt. Pour over popcorn-corn chips mixture, tossing gently to coat. This next step I skip. I think it makes the popcorn stale but it might be important for some reason. Spread on 15 x 10 inch baking sheet. (could also be left in the metal bowl) Bake at 350 degrees F for 10 minutes. Remove from baking sheet to large serving bowl. 
Serve immediately or store in airtight container (like the bag you crushed the chips in).\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_121_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_121_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_121_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_121_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Ingredients:1 1\\2 Cups water 1 Cup sugar 3 Tablespoons gelatin 3/4 Cups icing sugar 3/8 Cornflour Pinch of cream of tartar Teaspoon lemon essenceYou'll also need...Large microwave safe bowl Bowl Tablespoon Teaspoon Measuring cups Cooking spray Glass tray (I used a 21 x 21 cm tray that was about 5 cm deep.) KnifeYou might also need...Beater. Combine water, sugar and gelatin in a large microwave safe bowl. Mix lightly. Next microwave the mixture on high for 3:30 minutes. After microwaving it, stir it well till all the sugar is dissolved, then microwave it again till it starts boiling.. Combine icing sugar, cornflour and cream of tartar in a bowl. Stir it into the sugar syrup till it's thoroughly dissolved.Note: You might need to use a beater to dissolve it completely.. After completing the previous step, microwave your mixture on high for about 3:00 minutes.Note: Your mixture should be thick when you've done microwaving it.. After microwaving your mixture, blend in the lemon essence.Note: You could also swap the lemon essence for rose water and add some pink food coloring. . 
Pour mixture into a lightly oiled glass tray and spread evenly. Refrigerate till firm.. You're almost there... Cut it into cubes and dust it with some more icing sugar.. Enjoy your divine Turkish delight either by eating it as a delicious treat, or by giving it as a marvelous gift!If you wanted to make your own gift box, check out Showcase Creative's awesome Instructable!Thanks for reading through my instructable and if you liked it, please consider voting for it in the upper right corner!Feel also free to comment and ask questions in the comment section below!...feel also free to watch some of out homemade short films.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_122_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_122_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_122_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_122_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Ingredients:Beef, top round or any other cheaper non-fatty type works best 1 cup Soy sauce 1 tbsp Molasses 2 tsp Liquid smoke 2 tsp Black pepper 2 tsp Garlic powder 2 tsp Onion powder 2 tsp Red pepper flakes 1 tsp Ghost pepper salt 1 tsp Cayenne pepperThis recipe makes enough marinade for about 2.5 pounds or a little over 1 kg.Equipment:A food dehydrator like this one from NescoA sharp knife A large glass or ceramic bowl A cutting board Paper towels. Start the process the day before you want your finished jerky. 
Throw your beef in the freezer for a couple hours or, if frozen, remove from the freezer for about an hour (this will all depend on how much you have). Since thin slices of beef are ideal for jerky, having the beef partially frozen makes it easier to cut consistently thin pieces.Once the beef is thawed on the outside but still slightly frozen on the inside, put it on a well-washed cutting board and pat it dry with a paper towel. Trim as much of the fat off as possible then slice the beef into \u215b\" to \u00bc\" (3-6mm) slices. Cutting with the grain with a really sharp (not serrated) knife works best. Here I'm using a top round steak, you may use any cut of meat you like but remember that meat with a high fat content will become rancid faster, which makes this company's filet mignon jerky practical yet decadent!. In this instructable I'm using a marinade (wet method) to flavor the jerky. There are other methods you can chose, such as a dry rub, however I enjoy the flavor the marinade brings to the beef.Wash your hands and bowl well then start by adding all of your ingredients (minus the beef) in your large bowl. Separate the beef slices well, since they tend to re-freeze together when in a pile, and add the beef to the bowl a few slices at a time followed by mixing by hand. Ensure all of your beef is coated well.If you have more meat than marinade, simply prepare another bowl with marinade and repeat the steps above. It's easier to work in smaller batches than a large unmanageable pile that might risk an uneven marination of the beef.Cover and put the bowl in the refrigerator overnight or for at least 12 hours. For best results, mix the contents once or twice during this period.. The next day (anywhere from 12-24 hours later) remove the bowl from the refrigerator and wash and dry your dehydrator racks as the manufacturer recommends. 
If you do not have a dehydrator, wash the metal grates of your oven well and line the bottom of the oven with foil.Remove the strips of beef from the marinade and arrange on the racks in one layer without overlapping, allowing for a little bit of air flow around each piece. When removing the strips of beef from the marinade, allow them to drip-dry, you want some marinade to coat the beef strip but not too much. Assemble your dehydrator and set at 160\u00b0F (~70\u00b0C).Revisit your dehydrator every hour to check the progress and to dab away any fat that is collecting on the top of your strips. With my dehydrator, the process took about 5 hours, this will vary depending upon how thick your strips are and the model of your dehydrator.If you do not have a dehydrator, this can be done in your oven by setting it as close to 160\u00b0F as possible and laying the beefs strips across the oven's metal grates. Prop the door of the oven open slightly with a wooden spoon to allow for the warm, moist air to circulate out. Please be aware that gas ovens pose the risk of carbon monoxide/dioxide poisoning when propped open, so if you go this route make sure you have plenty of ventilation.. Your jerky is ready when you are able to tear the strips along the grain, they should be pliable but not soft and fairly stiff but not brittle. At this point, turn your dehydrator off and store your jerky in a clean and dry container lined with a paper towel and a loose fitting lid. Jerky is shelf stable for about 2 weeks at room temperature and one month in the refrigerator.Congratulations, you have now made some super simple, spicy and delicious jerky at home! I encourage you to try tweaking the recipe to your liking. Substitute in dried peppers, hot sauce, smoked salts, different herbs... the combinations are endless. 
Just remember to keep any added fats to an absolute minimum and if you decide to use anything but beef, cook the meat to the USDA recommended internal temperatures first before dehydrating (including game meats).\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_123_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_123_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_123_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_123_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Get your oven preheated and your friends ready to help! Here is what you need to get started:1. At least 100 cookies. They don't need to be large, make them 2\" diameter or less. I used cookie mix to expedite the process and had 8 packages of cookie mix based on how many cookies the package estimated making.If you are feeling ambitious, feel free to make your own cookies!2. A lot of frosting (I used 2 tubs)3. A ziplock bag, a piping bag, or a frosting pen4. A trifold poster board5. Lots of pens and markers for decorating the board6. A ruler or other straight edge7. Scrabble tilesIf you do not own scrabble and want to play, use print out pictures of the scrabble tiles.. Like I said, you need to make at least 100 cookies. I used different varieties of cookies because as much as I would love to each oatmeal raisin cookies for days, it's nice to have some variety.To make the cookies, follow the instructions on the package or follow your own recipe. 
As the cookies came out of the oven, we let them cool before getting to work.If your cookies are going to be bigger than 2\", you might need to trim them down to size. Our cookies came out rather larger so we created a little template out of a napkin of a 2\"x2\" square (our plan for the board), and cut the edges off the cookies so they fit specification. So much math all for cookies. But the plus side of trimming your cookies is you have a plate of cookie edges for snacking! . Fill your ziplock bag/piping bag/frosting pen with your frosting. If you are wanting to color your frosting, put a few drops of food dye in your frosting and mix it around a bit. I used a frosting pen which was not any easier than a ziplock bag. This process is just a test of patience, but you can do it!A list of how many tiles of each letter can be found here--. Take your trifold and cut off out fold, you don't need it. On a clean side, draw out a grid that is 30\"x30\". Divide this grid up into squares of 2\"x2\".Now using a scrabble board image for reference, mark where the special tiles on the board are. It may look like a lot of things, but it just repeats the same patterns for scores. The board and cookies are done and it's game time!We didn't have the room to write numbers at the bottom of the cookies so this is where the real scrabble tiles come in handy. You can't reach into a bag and pull out frosted cookies too easily, so we grabbed normal tiles to see our letters and when we were ready to play a word, we changed out our tiles for cookies and noted out score for that word on the side of the board.Try your hardest not to eat the cookies along the way, and if you do snack on the \"S\" tile, just put down the physical tile down instead! And when you're done with the game, enjoy the cookies with your friends!I hope you had fun playing and let me know if you had fun with this project! 
Drop a comment if you have any questions or other cookie based games!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_124_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_124_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_124_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_124_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Pork ShoulderDr Pepper BBQ Sauce White Onion Garlic Powder Salt & Pepper Fresh Garlic Crushed Red Pepper. Add the onions to your slow cooker on high heat.Season the meat with salt, pepper, and garlic powder on both sides. Add the 2 cloves of diced fresh garlic to the slow cooker. Add the meat to the slow cooker. Pour in 16oz of Dr Pepper. Cook for 3-4 hours. Drain off 3/4 of the juice. Shred the pork. Add a cup of BBQ Sauce and 1tsp of Crushed Red Pepper. Cook for another 30 mins.. Serve on a bun and top with onion and pickles or your favorite condiments!Recipe makes about 8 sandwiches.Pork Shoulder (2.7lbs @ 1.77/lb) ......... $4.76 Dr. Pepper (16oz) ......... $1.69 BBQ Sauce (1 cup) ...... $0.92 Buns .............................. $0.88 Onion ............................ $0.47 Garlic ............................ $0.15 Total .............................. $8.87Only $1.11 per delicious sandwich!** Dr Pepper logo and name are trademarks of the Dr Pepper Bottling Company and are in no way associated with this video other then being a very yummy ingredient.. 
If you enjoyed this instructable, please visit our YouTube Channel and subscribe! We have several other recipe videos available and bring you two new ones each week.Cooking With Chrissy Channel Follow us on Social Media! TwitterFacebook Google+ Instructables Tumblr Don't forget to share :D\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_125_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_125_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_125_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_125_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Obviously, you're going to need some meat. I picked pork tenderloin for time constraints. It is fairly lean, very tender, and cooks quickly. With reunion plans comes a tight schedule and very little time to tend the smoke. So, pork tenderloin, it is.I like to trim any extra fat and connective tissue from the outside of the loin. In particular, you want to look for any silvery-looking tissue, since that is particularly chewy. As for the fat, this will take approximately 3 hours to come up to temperature, so there won't be enough time for the fat and such to break down. Don't worry - this dude will be tender and juicy.. If the meat is the foundation, this is the structure. (Landscaping and curb appeal come later...)We're making a dry rub. So, how do you make a rub with coffee? One parts rub, one parts coffee? Not exactly, but close. Like cooking with wine, you want to use something you would actually drink. 
(If you rub your tenderloin with instant coffee, you may deserve to be hit by a bus, but I digress...) The beans I used were roasted within a week and ground for espresso - for the average Joe, espresso grind = dust. You are about to eat ground coffee beans, and when you take a bite of juicy, mouth-watering pork, you don't want to crunch on big boulders of bean. By the way, this is where things start to smell amazing. Here is your secret weapon, i.e. the spectacular Seattle Rub:4 Tablespoons coffee (espresso grind)2 Tablespoons unsweetened cocoa powder1 Tablespoon kosher salt1 Tablespoon raw sugar2 teaspoons cumin1 teaspoon garlic powder1 teaspoon chipotle (smoked) chili powder1/2 teaspoon celery seed1/2 teaspoon paprika1/2 teaspoon cinnamon2-3 grinds of black pepperMix it like you're a DJ. Feel free to up the quantities of salt or sugar to taste.. Chillax - a hybrid of chill and relax, AKA what the meat does now and what you do next.Pork tenderloin has it rough, so it deserves a good massage. Give it a liberal coating of the Seattle rub, knead it in, and throw it under foil at room temperature for a couple hours. Then what? I don't know... make some coffee? Take a nap? Go to the movies? Do what you feel. You have time.. If you have a shred of ambition and a charcoal grill, you can BBQ like a champ. Don't fear the charcoal. Embrace it with gusto. Just don't actually embrace it, or you might have to go to the hospital.Now, let's get the coals going. Make sure the bottom of the grill is empty and not clogged with ashes. Open the air vents in the top and bottom of the grill and clean the grate. Fill up your charcoal chimney (easy to find at any hardware store), crumple some news paper in the bottom, and light it. When flames are licking out the top and the upper coals are starting to turn white around the edges, pour the chimney to one side of your grill up against the wall. 
If you were doing burgers or something, you could spread them out, but for barbecue, we want them off to the side in one pile.. My coals are ready - now what? Two words to remember: Indirect. Heat. We're going for oven-like temperatures and not the low heat and long time required to soften something like a brisket or pork shoulder - Tenderloin just doesn't need it. And remember, you've got hungry friends coming soon.While you are at it, drop a couple chunks of pecan, cherry, or apple wood onto the coals. They will smolder while it cooks and add a touch of flavor to the meat that you definitely want. You don't want white smoke churning out of the grill. We're looking for thin wisps, at most. You want to smell it but not see it. Why? If you over-smoke this bad boy, it's going to taste like a camp fire. You might like eating a camp fire, but I've got other plans. Balance, champ. We want balance.As to the \"Indirect Heat\" bit - slap the meat on the grill opposite your pile of coals. That's it. The heat isn't underneath the meat, constantly searing it. Instead, you have a nice clean fire, a delicious piece of rubbed meat, and the aroma of nut/fruit wood soaking into your dinner. Punch a meat thermometer (hardware store) into the center of the loin and set it for 185. Now, put the lid on and walk away. When the thermometer beeps, smile, because you are about to win the prize.Remember: until your thermometer says so - hands off! If you are looking, you aren't cooking. In a few hours, you'll be here.... Your meat is done, but you aren't. Pull it off, set it on a plate, and cover it with foil. Let it rest! It's going to be about 1/2 an hour until you can cut it. But when you do, you're going to see this.... Mix up some pea salad and sweet-potato fries. Make some coffee ice cream. Enjoy the food and the conversation. That's it. That is the way to get people to beg you to move back home. 
Don't worry - If you ever get tired of hearing \"this is the best thing I've ever eaten,\" you can always take the low road. Just hang your head and plod back to the masses, where gas grilling and hot-dogs reign. But if you're me, you live in a one bedroom apartment with limited resources, a beautiful wife, a charcoal grill and a deep love for all things food. You have long-lost friends coming into town for the weekend, and a reputation as a foodie and grill-master to uphold. It's time to make some people happy. Get cooking!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_126_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_126_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_126_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_126_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Below ingredients are for all the variations ice cream of your choice - vanilla was mine :-) jello packs - red , yellow, green soft chocolate fudge banana sponge cake Push Pops - buy them....don't have them?? check next step. Well....we in Asia are sometimes not so lucky in having all the latest things...but if we don't have...we make do! I went out and got the larges syringes I can get from the drug store. I then carefully discarded the needle and cut off the end. The picture should say it all :-). First you need to make your sponge cake you could buy it ...or make it. 5 eggs 250g\u00a0 sugar 1/2 cup water and 1 1/2 tsp. 
vanilla - mix together 150g\u00a0 cake flour + 1 1/2 tsp. baking powder + 1/2 tsp. salt - sift together Beat eggs till foamy.....add the sugar... and beat till well blended...about 5 minutes...now alternate the liquid and flour and beat. till smooth. Place in a greased tray and bake . \u00a0you could also check this instructable to make your cakeChocolate\u00a0 Fudge 400 g sweet condensed milk \u00a0 50 g sugar \u00a0 25 g cocoa powder \u00a0 50 g butter Mix melted butter and sugar and coco powder in pan till well blended...add the condensed milk and bring to boil in low heat...take off fire and pour in dish and let it cool.Jelly Buy colors of traffic light and mix according to packagingBanana slice circles and then cut with syringeOptional!color ice cream Now you may not have the required flavoring and colors in ice cream...or your child...like mine only likes vanilla!! well then...what do you do for colors??? color vanilla! yes...so I had some flavors and colors...but to satisfy one child I colored a bit of vanilla ice cream in the colors of the traffic light so he would be happy. Assembling once you have all your ingredients ready is pretty easy...have all your push pops close so it would be quicker...while one push pop is freezing you can move on to the next! Pull the syringe back each time you finish a layer...making room for the next layer The layers for the Ice Traffic Jello Push Pops goes like this - cake circle, green ice cream, freeze, green jelly and cake circle, orange ice cream, freeze, orange jelly , cake, red ice cream freeze, red jelly , cake and finally\u00a0 white ice cream or whip cream.. You've all had frozen banana's dipped in chocolate right? wasn't it delicious? well...how about in push pop form?? For Fuggy Ice Banana Push Pops the layers are Banana slice, vanilla ice cream, freeze, fudge! you could also go with Banana slice, fudge,freeze and then ice cream! both combinations are wonderful! Top up your layer with fudge!. 
Fudge Ice Cream Cake is one of my favorites...so I thought why not in Push Pops?? And\u00a0 so I layer!! The layers for this is - cake, ice cream, freeze, fudge, cake ice cream freeze , fudge! top it up ending with an Ice cream layer! adding meringue would be great...but obviously I cant stick the pops in the oven! \u00a0 . I enjoyed making and having my push pops...it really cooled my family down ;-) Let me know how you like it...and if you like it....please vote for the frozen food contest! Thank you!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_127_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_127_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_127_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_127_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. You will need Milk, Sugar and Cocoa powder. . In the bottom of a mug mix 1tbsp of Milk, Sugar and Cocoa powder. . Measure out 1 cup of milk in a glass measuring cup and heat in the microwave for 60 seconds.. Pour your heated milk into the mug until all the cocoa mix has dissolved.. I hope you enjoyed this post. 
please like and comment if you did.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_128_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_128_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_128_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_128_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Cake:- 225g Unsalted butter at room temperature (not melted)- 2 Cups of caster sugar- 2 Eggs- 3 1/2 Cups of Plain flour- 4 Teaspoons of baking powder- 1/2 Teaspoon of salt- 2-3 Teaspoons of vanilla bean paste- 4 Tablespoons of vegetable oil- 2 Teaspoons of strawberry essence (optional)Butter Cream Frosting- 225g unsalted butter, at room temperature- 6 Cups of icing sugar- 5 Tablespoons of milk- 3 Teaspoons of vanilla bean paste- Pink Food colouring. - 1 Giant cupcake baking tin (you can get these on-line I think from Wilton)- Regular muffin tin- Mini muffin tin- Whisk- Mixing bowls- Spoons- Measuring cups- A cake palete knife/ a smooth cooking tool to smooth out the icing on the cake- A star piping tip- Piping bag and different sized circle piping tips- Muffin paper liners. To make the cake batter you will need a kitchen mixer or a hand mixer. The bowl you will use needs to be large as you will make more tat 6 cups of batter. The first thing to do is to beat the unsalted butter (225g) in the kitchen mixer until the butter becomes light and fluffy. Then add the 2 cups of caster sugar and continue to beat until light and fluffy. 
Then add the vanilla bean paste, strawberry essence and eggs to the butter and continue beat them.Add the baking powder to the plain flour and whisk it together to get rid of any lumps then add 1/3 of the flour mixture to the butter mixture and beat it together. Then add 1/3 of the milk and continue to beat the mixture. repeat adding 1/3 of the flour and milk mixture alternatively until all incorporated. Finally add the 4 tablespoons of oil to the mixture and stir until combined.. For the giant cake tin if it is NOT stick free coat the tin with butter or oil on all sides then dust with plain flour until it is completely coated.In the base of the giant cupcake place 3 1/2 cups of the cake batter and smooth it out. Then for the top of the cupcake (swirly pyramid shape) place 2 1/2 cups of cake batter and smooth the batter on the top. For the regular and mini cupcakes line 1 or 2 of them with muffin paper liners.For the regular cupcakes place 1/4 cup of batter in the muffin pans. and place 1-2 teaspoons of batter in the mini cupcake pan.. Giant cupcake: Pre heat the oven to 160 degrees Celsius fan-forced and bake the cake for about 1 hour until you can tough the cake and insert a skewer or knife until it comes out slightly clean, Then let the cake rest for 20 minutes in the pan. Let the cake cool completely on a wire cooling rack for about an hour. While this is cooling make the butter cream frosting.Normal cupcake: Bake the cupcake for about 16-18 minutes until golden brown and spongy to the touch. Mini cupcake: Bake the mini cupcake for about 8-11 minutes until golden brown and spongy to the touch.. Place the unsalted butter in the electric mixer and beat until it is light and fluffy. Then add 1 cup of icing sugar to the butter and beat until incorporated, then add 1 tablespoon of milk and mix, adding alternatively until all milk and icing sugar has been added. Then add 3 teaspoons of vanilla bean paste and beat it all together. 
You will need about 4 shades of pink, each slightly lighter than the next and white icing to finish it off. . When the cake is completely cooled and the butter cream frosting is done place about a tablespoon of icing onto a plate and place the cupcake 'base' on top of it. Then put about 1-2 tablespoons on top of the base and place the 'top' of the cupcake. Then with a think layer of butter cream frosting and your palette knife (or flat edged butter knife) go around the cake creating a crumb layer of frosting. This is just a layer that covers the cake to catch the crumbs so that they don't show up on your final layer of icing. Then over the crumb layer put another coat of white icing.To make the cake seem more like a cupcake use the knife to make upward strokes of the icing to resemble the 'crinkles' of a cupcake paper liner.. To make the darkest shade of pink icing Take about 1/2 -3/4 cups of icing and add a lot of food colouring (i added about 1/2 a teaspoon of food colouring gel which is very potent). this will make the bottom petals of the op of the cake and the pretty mini flowers at the base of the cake. First place 3/4 of the dark icing into a piping bag with a large round tip. Then pipe a circle or oval of icing onto the cake and with your flat knife lightly scrape it away from the middle of the circle so it looks like a petal. Then place another circle on top of the streaked petal from before and continue until you go around the whole cake. Reserve the rest of the dark icing for the base of the cake.Continue this process for each lighter shade of icing until you reach the top of the cake then top it of with a small dollop of white icing. Remember to save some of each coloured icing as you will need it for the mini cupcakes.. Spread the white butter cream icing on top of the regular and mini cupcake. Then repeat the process in the previous step for the small cakes if you are making them but with smaller piping tip holes. . 
With the left over dark icing place it into a piping bag with a star shaped tip and place the tip about 1/2 a centimetre away from the cake base, push on the piping bag until icing comes out then pull the bag away quickly so that it resembles a small flower. Then continue this action around the base of the cake and you are done!Then it is very important to share this with your friends I would recommend refrigerating it if it is a hot day otherwise it can stay at room temperature.. The most important part about the finished cake is to give it or share it with your friends. Trust me it makes the cake taste so much better!!! I gave mine to my friend Flavia for her birthday. She loved the cupcake family!!!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_129_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_129_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_129_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_129_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. You will need a few unusual things for this bubbly beverage.-bannana extract-seltzer-sugar (liquid sugar is easier to use)-cup. Pour a generous spoonful of the bannana extract into the cup so it forms a thin layer on the bottom.. Take your sugar (or liquid sugar), and pour several tablespoon of sugar into the cup. Then stir well. If there is anyone hesitant about that much sugar, remember, it's SODA and it has a bum load of sugar.. 
This step is easy because you just need to pour the seltzer into the cup without any measurements! When the liquids meet the top of the cup, stir it again.. This drink probably smells strong of bannana, but it's really great! Enjoy your bannana soda, and check out my fan fictions every once in a while! (My fan fictions appear to be more humorous than serious, so the characters I use will probably not act... Well... Normal...)P.S. If you make the bannana soda, have comments, questions, or suggestions, just write in the comments!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_130_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_130_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_130_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_130_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Eggs (as required)2 green chilies1/2 Tsp red chili powder1/4 Tsp turmeric powderA few curry leavesCoriander leaves for garnishingSalt to taste (as required). Boil the eggs.Remove the eggs shells.. Make a slit (like across) on all eggs in order to absorb the spice taste.Chopchilies, and coriander leaves.Add 1/2 Tsp turmeric.Add 2 Tsp salt.Add 3 Tsp chili powder.Add 1 Tsp oil.Mix it well.. Heat oil in a pan.Finally, add the cooked eggs.Add the mixed masala, on the eggs.Wait till the base of the egg turns golden brown.Flip the eggs after the base is turned golden brown.Stir gently.. 
So enjoy your meal with hot Boiled Egg Masala.That's it, Foodies.Thank you all.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_131_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_131_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_131_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_131_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. FruitA handful of blueberries1 banana but you can use more if you wantA bunch of grapesA portion of strawberriesTo serveA drizzle of agave nectar. Wash your berries under cold water and leave them to dry.Peel and evenly chop your banana.Slice the tops of your English strawberries.Then assemble into your bowl.. When you put your fruit in the bowl it will taste fantastic. I find that taking a few seconds to present your fruit enhances not only the presentation but the satisfaction of the end consumer. Good looking food really does taste better.I presented my strawberries cut side up and evenly spaces the fruit throughout the bowl, it went from being a bowl of fruit to an art piece.. Gentlly run your nectar over the fruit so that it glistens and looks good. 
It is ready to serve.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_132_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_132_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_132_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_132_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. One bunch of Green Amaranth leaves (About 250 grams)One medium sized Onion2 Green Chillies (Adjust to your own taste)About 100 grams of Chickpea flour3 teaspoons of Rice FlourOne teaspoon of Cumin seed powderOne teaspoon of Fennel seed powderOne teaspoon of Red Chilli powder (Adjust to your own taste)One teaspoon of Turmeric PowderHalf a teaspoon of Asafoetida powderSalt to taste. First you have to remove the hard stems from the green amaranth leaves. You can plant the stems in your garden or in a pot and It will grow and give you more green leavesRoughly chop the green leaves and keep aside in a wide pan. Finely chop the onion and green chillies and add them to the chopped Amaranth leaves. Add the rice flour and other spice powders and salt with Chickpea flour and mix well.Do not add any water. Add the chickpea flour with other ingredients to the Amaranth leaves in the panMix everything together in the pan. If required add little bit of water so that the mix can be shaped by hand without breakingIf the mix seems very dry and separates, add little amount of water. 
If it is watery and did not hold shape, add little bit of chickpea flourTake a lemon sized mix in your hand and shape into flattened balls as in the picture. It is very easyYou can make few flattened ball shaped pieces beforehand and keep them in a plate. Place a frying pan over medium flame and add sufficient cooking oil for deep fryingWhen the oil is heated, add one piece and check. If the temperature of oil seems ok, then you can add 4 to 5 pieces at a time to the oil.Turn over the fritters few times in oil so that both sides are evenly cooked . Once the outer shell of the fritters are deep fried to a dark brown and no longer bubbles in the oil, you can take them out with a strainer.Drain out excess oil and transfer over absorbent food-grade paperServe hot as evening snack with coffee or tea. Everybody will love it and ask for more\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_133_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_133_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_133_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_133_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Preheat your oven to 350 degree and then assemble your ingredients.Baking dish - I used a glass one1 box Gluten Free Yellow Cake Mix - Betty Crocker's is among the most reasonably priced, at about $4/box1 jar or can of fruit of your choice - peaches are wonderful, but so is pineapple or apple1/2 cup (1 stick) of butter1/3 cup Brown SugarCinnamon. 
Open the jar or can and dump it in. I know, right?! Don't bother draining your canned fruit - you'll want the juice in there.. That's it - dump your cake mix over the fruit, as evening as possible. Then spread more evenly with a fork or knife, tamping your mix down firmly into your fruit layer.. Slice pats off your stick of butter and layer evenly along the top of your cake mix layer.. Use about a 1/3 cup of brown sugar and sprinkle evenly over the top of your butter layer. Sprinkle cinnamon as desired over the whole thing.. Bake and wait.. Remove your dump cake from the oven - you can test with toothpick or knife if you'd like. It should come out clean. Cool for about 10 -15 min... You've essentially got a cobbler here, and a darn good one! Top with a little bit of ice cream and you've got a lovely dessert, gluten free or otherwise!Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_134_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_134_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_134_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_134_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Lacto vegetarian version: clean thoroughly and steam one whole small potato for 30 minutes. This will produce a drier lighter texture perfect for adding butter and cream. Vegan version: pierce potato several times with a fork, wrap tightly in aluminum foil and bake at 400 for 1 hour. 
This produces a stickier wetter texture that does not require the addition of dairy liquids.. Split open with a spoon and scoop out all of the steamed/baked flesh, discarding the skin, while stopping to marvel at one of the most amazing colors found in nature! *The potato pictured here has been steamed, notice the dry fluffy texture.. Place potato flesh in a mixing bowl, add approximately 1 tbs each, brown sugar and pumpkin pie spice (vegetarians who prefer a lighter creamier texture and flavor may also enjoy butter and/or cream to taste) beat with electric mixer until smooth.*The potato pictured here has been baked, notice the wet, sticky texture.. I like to add some whipped cream to the vegetarian version, and sprinkle with nutmeg for extra yumminess, enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_135_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_135_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_135_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_135_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Ingredients:\n1 -\u00a0 gallon milk\u00a0 (whole mile generally generates more curds.\n1 -\u00a0 rennet tablet (available at better grocery stores0\n1/2 cup\u00a0 - vinegar (cider vinegar tastes better to me)\nTools:\nStove\nLarge pot about 2 gallon size.\nSpider or equivalent (a large spoon will work)\nThin mesh pasta colander.\nPaper towels\nSink\nMeasuring cup\nInstant read thermometer.. 
Pour the milk in a large pot.\nCrush the rennet tablet into 1/4 cup water and let it dissolve well.\nAdd the rennet solution and vinegar to the milk then stir well..\nHeat the milk until it is about 100 degrees Fahrenheit. WATCH CAREFULLY!!!!\nAt 100 - 110 degrees Fahrenheit immediately TURN OFF Heat.\nLet sit till the curds have separated from the whey (greenish milky stuff).. Put some paper towels in the colander.\nSpoon out the curds into a find mesh colander that is sitting in a bowl the catch the excess whey.\nAdd salt and mix with the curds when all the curds have been retrieved.\nPut on a cover over the curds and then put a weight over the cheese curds.\nRefrigerate over night. Get rid of more unneeded liquid.\nYou can reheat the whey (second picture) to about 180 degrees and get more curds. (aka ricotta or recooked). Again Watch carefully.. You may want to add a bit more salt for preservation purposes.\u00a0 Anyway, you now have your own honest cheese. We could do a variation to make mozzarella.\u00a0 You now have cheese for quesadillas (aka piadini), pizza, spaghetti/lasagna, bagels, salads,\u00a0 cheesecake, cheese soup, and a host of other food items. A basic recipe that should be in all kitchens.. How to Make Swiss Cheese This recipe will teach you to make traditional Swiss cheese in the comforts of your own home. Difficulty: \u00a0\u00a0\u00a0 Challenging Instructions things you'll need: \u00a0\u00a0\u00a0 * 1 gallon of whole milk, 1/2 packet of direct-set thermophilic starter or 2 ounces of prepared thermophilic starter, 1/2 teaspoon of propionic shermanii powder, 1/4 teaspoon of liquid rennet or a 1/4 renbet tablet, 1 pound of cheese salt, for brine, plus a pinch of cheese salt, 1/2 gallon of cold water, for brine. curd knife, stainless steel whisk, cheesecloth. 
ladle -------------------------------------------------------------------------------------------------------------------------- \u00a0\u00a0\u00a0\u00a0\u00a0 Swiss Cheese \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 1 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Heat the milk to 90 degrees Fahrenheit. Add the starter and mix well. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 2 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Remove 1/4 cup of milk from the pot and add the propionic shermanii to it. Mix thoroughly to dissolve the powder. Add the mixture to the milk and stir. Cover and allow the milk to ripen for approximately 10 minutes. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 3 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Make sure that the milk's temperature ALWAYS remains at 90 degrees. Add the diluted rennet and stir gently with an up-and-down motion for approximately 1 minute. If you are wanting to use farm fresh cow's milk, top stir for several minutes longer. Cover and let the milk set at 90 degrees for approximately 30 miutes. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 4 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Using a curd knife and a stainless-steel whisk, cut the curd into 1/4 inch cubes. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 5 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Keeping the curd temperatures at 90 degrees, gently stir the curds for approximately 40 minutes. This is called fore-working and helps expel whey from the curds before they are heated. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 6 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Heat the curds by one degree every minute until the temperature is 120 degrees Fahrenheit. This will take approximately 30 minutes. 
Maintain the temperature at 120 degrees Fahrenheit for another 30 minutes, stirring often. The curds must be cooked until they reach a stage called the \"proper break.\" To test for this, wad together a handful of curds and rub it gently between your palms. It the ball readily breaks apart into individual particles, the curds are sufficiently cooked. If they are not sufficiently cooked, they will be too soft to hold the cheese together. Let the curds set for approximately 5 minutes. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 7 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Pour off the whey and reserve it for other recipes. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 8 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Line a 1 pound mold with cheesecloth and place it in the sink or over a large pot. Quickly ladle the curds into the mold. You do not want the curds to cool. Press at 8-10 pounds of pressure for approximately 15 minutes. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 9 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Remove the cheese from the mold and gently peel away the cheesecloth. Turn over the cheese, re-dress it, and press at 14 pounds of pressure for 30 minutes. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 10 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Repeat the process but press at the same pressure of 14 pounds for 2 hours. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 11 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Repeat the process but press at 15 pounds of pressure for 12 hours. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 12 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Make a saturated brine bath by combining the salt and water in a noncorrosive pot; stir well. 
Remove the cheese from the mold, peel away the cheesecloth, and soak the cheese in the brine. Sprinkle the remaining pinch of salt on the surface of the floating cheese. Refrigerate the brine and let the cheese soak for 12 hours. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 13 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Remove the cheese from the brine and pat dry. You can reserve the brine for other recipe uses if you so desire. Place the cheese on a clean cheese board and store between 50 to 55 degrees Fahrenheit and at 85 percent humidity. Turn the cheese daily for one week, wiping it with a clean cheesecloth dampened in salt water. Do not wet the cheese. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 14 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Place the cheese in a warm, humid room, such as the kitchen, with the temperature between 68 and 74 degrees fahrenheit. Turn it daily and wipe it with a cheesecloth dampened in salt water. Do not wet the surface of the cheese. Let the cheese set for 2-3 weeks, until eye formation is noticeable. The cheese will swell somewhat and become slightly rounded. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 15 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Age the cheese at 45 degrees Fahrenheit. and at 80 percent humidity for at least 3 months. Turn the cheese several times a week. Remove any surface mold with cheesecloth dampened in salt water. A reddish coloration on the surface of the cheese is normal and should not be removed.This recipe will make about 1 pound of cheese. If you an additional pound, just double the recipe.. Ingredients1 Gallon Fresh Milk 1 oz. Mesophilic Starter Culture 1/4 tab Rennet 1 Tablespoon Salt InstructionsUsing a double boiler, warm the milk to 90 F (32.25 C). Add 1 oz of mesophilic starter culture and mix thoroughly with a whisk, the culture must be uniform throughout the milk. 
Allow the milk to ripen for one hour. Dissolve 1/4 tab rennet into 3-4 tablespoons COOL water. Hot water will DESTROY the rennet enzymes. Slowly pour the rennet into the milk stirring constantly with a whisk. Stir for at least 5 minutes. Allow the milk to set for 1-2 hours until a firm curd is set and a clean break can be obtained when the curd is cut. With a long knife, cut the curds into 1/4 inch cubes. Allow the curds to sit for 15 minutes to firm up. Slowly raise the temperature of the milk to 102 F (39 C). It should take as long as 45 minutes to reach this temperature. During this time, gently stir the curds every few minutes so they don't mat together. Cook the curds at 102 F (39 C) for another 45 minutes. During this time, gently stir the curds every few minutes so they don't mat together. Drain the whey by pouring through a cheesecloth lined colander. Do this quickly and do not allow the curds to mat. Place the curds back into the double boiler at 102 F (39 C). Stir the curds to separate any particles that have matted. Add the tablespoon of salt and mix thoroughly. Cook the curds at 102 F (39 C) for one hour, stirring every few minutes. Carefully place the curds into your cheesecloth lined mold. Press the cheese at about 20 lbs. (9 kg) for 45 minutes. Remove the cheese from the press and flip it. Press the cheese at about 40 lbs. (18 kg) for 3 hours. Remove the cheese from the press and flip it. Press the cheese at about 50 lbs. (22.75 kg) for 24 hours. Remove the cheese from the press. Place the cheese on a cheese board and dry at room temperature for 3-5 days, until the cheese is dry to the touch. Wax the cheese and age it in your refrigerator for 3-24 months. The longer the cheese is aged the sharper the flavor it will develop. Be sure to flip the cheese every few days. . Get a second use out of the whey you made from the cheese to be polenta.. 
Add some cornmeal into the boiling whey and stir until thick enough for the stirring tool to stand up straight./ Polenta\u00a0 makes an alternative to mached potatoes. Instead of the red sauce, you can use buttter garlic and or a little olive oil.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_136_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_136_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_136_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_136_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. MaterialMetric scale A timer A bowl at least 1 gal capacity (4 to 5 litres) and a smaller one A large basket around 10 in (25 cm) inside diameter (I use the basket of my salad spinner) 2 kitchen towels (in linen or cotton) A pizza peel or a metal tray Parchment paper A razor blade Some plastic jar or bowl Masson jars (for the levain) A baking stone or a large cast iron skillet I use the skillet) A roasting panIngredients All purpose flour Whole wheat flour Rye flour Salt Water (room temperature). 
First and the most difficult is to start the \u00ab levain \u00bb Use organic flour, it will be easier Pour in a jar (half pint or half liter masson or equivalent) :25 g of all purpose flour 25 g of whole wheat flour 33 g of room temperature water (not directly from the faucet because there is some chlorine, so let it rest some hours before using it)Mix together, cover the jar with it\u2019s lid and keep in a warm place arround 25 \u00b0C (77 \u00b0F) (on the top of water heater for example).Next 2 days, once a day: Take 25 g of the previous mix (discard the rest), and add:25 g of all purpose flour 25 g of whole wheat flour 33 g of room temperature waterIf the culture has started, you may see that the mass rises (with a lot of bubbles inside) and falls after a while, then repeat the feeding preocess every 12 hours.The day after and indefinitely:Then if the culture stays very active replace the whole wheat by all purpose flour and do it once a day.To keep the culture active feed it once every 1 or 2 days and keep it at room temperature.Don\u2019t go under 15 Celcius otherwise you will have an acidic fermentation and you will have to restart it one or 2 days before using it.Then the levain (sour dough) should look like the two last pictures: before fermentation and 4 hours later. You will need 24 hours to do the complete process (to have a baked bread) but in fact it is around 20 min of work.You will also have to wait for a night before tasting the bread...You will need to have:All-purpose flour: 50 g + 150 g + 405 g + 250 g = 855 gWhole wheat flour: 200 gRye flour: 200 gSalt: 22 gWater: 35 g + 100 g + 740 g = 875 gUse organic flour, the results will worth it.. 
10 PM: Wakeup the levain (made the Rafraichi): In a bowl larger than the one used to keep the levain add:40 g levain 50 g all-purpose flour 35 g room-temperature waterMix, cover and store at 24,5 to 26,5 \u00b0C (76 \u00b0F to 78 \u00b0F) Prepare the water for next step and store it at the same place as the levain so it will be at the good temperature: 100 g room-temperature water6 AM the next day: the levain build add to the Rafraichi: 150 g all-purpose flour and the previously 100 g room-temperature water.So you have a levain build of 375 g Mix, cover and store at 24,5 to 26,5 \u00b0C (76 \u00b0F to 78 \u00b0F) Prepare the water for next step and store it at the same place as the levain so it will be at the good temperature: 740 g room-temperature water. Arround 1:45 PM: The final mix of the doughPrepare 22 g of salt 405 g of all purpose flour in a bowladd the salt to the 740 gr of room-temperature water and mix well In the big bowl pour: 200 g rye flour 200 g whole wheat flour 250 g all purpose flourand mix the flour together Then add 310 g of levain built to the flour (keep the leftover to continue your levain for next time) Add the salted water to the flour and doughMix with one hand (stirring and grasping) while the other keeps the bowl. The mix should become more homogeneous after 1 to 2 minutes. Finally add the remaining flour (405 g) and continue to mix untill all the flour is absorbed (3 to 4 minutes maximum).The total weight of the dough is: 2127 gWait 5 minutes and fold the dough 8 to 10 times. Fold technique: Hold the bowl with one hand, slip your fingers between the dough and the inside of the bowl and grasp the dough, pull it and and lay it on the top of the dough. Turn the bowl and repeat the movement. (on the pictures you see only one hand because the other one was holding the camera)Wait another 5 minutes and repeat 8 to 10 times. It should begin to look like a real dough and begin to detach from the bowl. 
Cover with a towel and keep in a warm place for 1 hour: 24,5 to 26,5 \u00b0C (76 \u00b0F to 78 \u00b0F) 15 PM: Fold the dough 8 to 10 times Cover and keep in a warm place for 1 hour: 24,5 to 26,5 \u00b0C (76 \u00b0F to 78 \u00b0F) 16 PM: Fold the dough 8 to 10 times Cover and keep in a warm place for 1 hour: 24,5 to 26,5 \u00b0C (76 \u00b0F to 78 \u00b0F) 17 PM: Fold the dough 5 to 6 times Cover and keep in a warm place for 1 hour: 24,5 to 26,5 \u00b0C (76 \u00b0F to 78 \u00b0F). Loaf formFold the dough 5 to 6 times in its bowl.Then verse the dough on a floured surface, fold it 2 to 3 times and give it a round shape.Line the basket with a towel, sprinkle flour on the towel (not too much, just enough to avoid the dough sticking to the towel). Place the dough in the basket bottom up (you will see the future base of your bread (la clef), the upside will be in the bottom of the basket)Cover and keep in a warm place and wait 1.5 to 3 hours. The dough will be ready when the total volume will be 3.5 times the initial volume. After 1 hour, check every 30 minutes: To verify if it is ready, enter a finger in the dough, when the dough is ready, the hole must close itself in 2 to 3 seconds. 1/2 hour before the end of the rising:Prepare the ovenon the bottom rack place a roasting pan (empty) in the middle of the oven place a pizza stone or a cast iron skillet preheat the oven to 210 \u00b0C (in convection), 230 \u00b0C in conventional (450 \u00b0F).Prepare the material necessary to put the loaf in the ovenA pizza peel or a metal tray lined with parchement paper A razor blade 2 cups of hot water. When the oven has reached the good temperaturePlace the paper on the top of the loaf and turn it upside down on the metal plate. Quickly incise a losange pattern on the top of the loaf, not too deap (arround 1/8 inch) Open the oven and quickly slide the loaf on the stone (or in the skillet). Close the oven door. 
Pour 1 cup of water in the roasting pan as quickly as possibleClose the door.The steam will help to obtain a better crust.So we are around 8 PM. Note carefully the hour.After half an hour (8:30 PM) Turn the loaf in the oven Decrease the temperature to 175 \u00b0C (350 \u00b0F)After 50 to 60 minutes (8:50 PM - 9:00 PM)The loaf should be brown, and look cooked. But it is too early to stop the baking procedure. Even if it looks baked wait 5 to 10 minutes.To know if the bread is ready it should have an hollow sound when knocked on the bottom.9:10 - 9:15 PM remove the bread from the oven and put it on a rackIt should weight around 1890 gDon't cut it now, you have to wait at least 6 hours before tasting it. It is very difficult because it smells so good in the house.. The next morningHave a good french breakfast with some honey, marmelade and butter and a cafe latte.The first days the bread is better not toasted. You will be able to keep it 1 week (if you don't eat it) ...Enjoy\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_137_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_137_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_137_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_137_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 
\n Our ingredients will be...\n\t\t12-18\u00a0jalapeno\u00a0peppers (if they're huge, you need less)\n\t\t2 x\u00a08oz cream cheese\n\t\t2 x packets of bacon (NOT thick cut!!)\n\t\t1/4 cup shredded cheese (preferably cheddar)\n\t\t1 bunch of green onions/chives\n\t\t1/3 cup of BBQ sauce. First things first...\nGrab your iPad and put on an episode of Bones, because the prep will take you 42 minutes...\nDice your chives a little thin, then put your cream cheese, chives, and shredded cheese in a mixing bowl and mix.\nYou can do this by hand if you're used to baking and have the unnatural super strength in your forearms. \u00a0I don't. \u00a0I used a mixer.\nJust make sure it's blended well.. You may want to wear gloves for this. \u00a0If you don't, just be sure to not rub your eyes for a day or so. \u00a0Jalape\u00f1o oil is unforgiving...\nSlice your jalape\u00f1os length-wise. \u00a0You can leave the stem on, or take it off. \u00a0Let me give you the benefits of each:Stem on = something to hold on to, looks nicer, easier to tell when it's about to get hot (see next step)Stem off = can pop the whole stinking popper in your mouth for sheer delight. Before we go further, let me clarify...\nIt's a common misconception that the seeds have all the heat. \u00a0The membranes have just as much!\nWhat you're going to do here is grab a table spoon and scoop out the insides. \u00a0This is where you can control the amount of your heat. \u00a0For spicier poppers, leave more membrane. \u00a0For a more tame snack, scoop out all of the white. \u00a0Save a big pile of seeds for a future step.... Now you'll want to get your spacing to make sure you have enough pans, because after this we're going to get messy.\nLay out your jalape\u00f1o halves on the pan. \u00a0If they're curvy (hubba hubba), then you'll want to alternate directions so they fit better. 
\u00a0Something that you're looking for here is enough space so the bacon isn't touching from one popper to the next, otherwise it won't cook properly.\nI put foil on my sheets on my cookie sheets to make clean up easier. \u00a0But then again, I'm lazy.. Now it's time to pre-heat the oven. \u00a0Set it to 275 degrees - any more and the cream cheese gets hard.\nAnd now grab your filling and start filling (the English language just fails here) your pepper halves. \u00a0\nHere's a little aside...\nSince we're dealing with jalape\u00f1os, let's talk about mexican tradition. \u00a0There's this cake with a porcelain/plastic baby inside. \u00a0The tradition is whoever gets the baby has good luck for the year. \u00a0I think this is completely dumb, as I don't want to be the poor fool that is lucky just because I didn't choke on the foreign object inside my dessert. \u00a0BUT...that did inspire me to create a game of my own.\nHere, you can take all of those seeds you set aside, but them in one of the peppers, then cover with the filling and prepare it like the rest of them (you see where I'm going with this). \u00a0Then the\u00a0victim guest who gets the hot one can be the lucky person! \u00a0This is even more effectual when the rest of your guests have been saying over the last few minutes \"they're not even hot like I expected!\". Slice the bacon in half length-wise to create 2 long strips out of each piece. \u00a0Each half will wrap around a pepper. \u00a0Try not to layer the bacon too thick, or it won't cook through. \u00a0You can hold the bacon in place with a toothpick.\nAlso, when doing this step make sure not to touch other things. \u00a0Uncooked pork (and poultry if using turkey bacon) has a lot of bacteria that can cross-contaminate your kitchen. 
\u00a0So count your toothpicks first and lay them out, so your bacon-fingers aren' reaching into your toothpick box.\nA reminder again - make sure the peppers aren't touching, so the bacon cooks properly.\nGood news - we're almost done!. Grab your BBQ sauce, and you're going to brush/drizzle/fling/spoon your BBQ sauce on. \u00a0Sometimes if I'm in a hurry, I even just squeeze it straight out of the bottle (a lot of sauce does get wasted if you do it this way. \u00a0The most efficient way I've found is to brush it on.\nAt this point, your oven just beeped that it's ready and Bones and Booth should have just captured the killer, and you're waiting for the fluffy wrap up at the end with Hodgins and Zack.\nDo a dance, though, because you're done!. Put your poppers in the oven for 1 hour. \u00a0Then serve to your salivating guests!\nIf you made more than you need, stick them in the fridge - they taste great hot or cold!\nCongratulations! \u00a0You're done and everyone in the house is happy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_138_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_138_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_138_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_138_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. This step will vary depending on what type of pit your dealing with and whether or not you have a fire rack. My fire pit is REALLY wide so I had to build stone supports for my cookie sheet. 
I did this by taking old patio blocks and leaning them together perpendicularly against the walls of my pit to make a space just big enough for my rack. Most camp sites have a relatively low and not-as wide pit and often come with a fire rack. If so, you will not require this extra step. Also, if your cookie sheet itself fits over your pit, you're good to go!Note that you do not need a rack over your pit. A rack is only necessary if your pit is too wide for your cookie sheet and you do not have the means to build a support structure inside the pit. Even though I COULD have placed a rack over my patio blocks, I didn't because the cookie sheet fit perfectly and it was unnecessary.**See step 4 regarding lowering/raising the rack/cookie sheet as necessary**. The first step in a successful fire is having all your starter materials ready before you begin: Tinder, Kindling, and Fuelwood. I made 5 piles of supplies:Leaves and tiny twigs (Tinder)Small twigs (Kindling)Medium twigs (Kindling)Large twigs (fulewood)Wood (Fulewood)Take a pile of leaves and light the underside. Once it starts going, add your smallest twigs. You want enough to catch flames, but not too many to smother the fire. Here I usually dump a bunch that are lightly packed and give the leaves a good blow with my mouth. The extra O2 helps the fire grow and catches the twigs.Once some of the small twigs have lit, immediately start adding medium twigs in the shape of a teepee over the flames. Do not worry if only a little part of it has caught, you can build on it. Add more sticks and blow as necessary. Once the medium sticks have caught, start adding larger sticks and wood in the same fashion. At this point you should have a steady flame and do not need to blow anymore.You may need to reposition your base fire to the center of the pit, or reposition sticks to allow the adding of larger wood. 
Make sure to leave space between the sticks and wood for your fire to \"breathe.\" If it doesn't get enough O2, it will go out!Now, sit back, relax, and enjoy your fire for a bit!. Take an OLD cookie sheet and line it with foil. Make sure it's an old one, or one you don't mind getting a little messy. Fires will tend to junk up your nice new cookie sheet! Spray the sheet with cooking spray and place your can in the center. Now you need cookie dough. I cooked my cookies in my backyard so I just followed the recipe on the back of the chocolate chip bag and made dough from scratch. After all, that's usually the kid-fun part of making cookies. If you want a little more convenience or are going camping, you might want to consider pre-made refrigerated dough. I would not recommend using frozen dough unless you thaw it beforehand. Either dough will work so it's totally up to you.Drop your cookie dough on the sheet making sure to leave room between the cookies, and around the can. Using 2 large sheets of foil, cover the top of the cookies and seal the sides. The can in the center will make a tent-like shape. This will allow air to properly circulate and cook your cookies just like a regular oven!. A burnt-out and flame-less fire is usually the worst part of making one because it means all the fun is over. However, that's where our fun begins! We want to cook with only super hot coals, not a pretty and flame-filled fire. Flames will unevenly heat the bottom of the pan and cause the cookie bottoms to burn. And well, nobody likes a burnt cookie!When the fire is mostly flameless, evenly distribute the hot, glowing coals. Place your mini cookie oven on your rack or support system and let them bake for about 8 minutes. Remove the \"oven\" using pot holders and check for doneness. At this point assess your cookie situation and determine how much longer the cookies should cook (if at all), or if you need to raise/lower your cookie oven. 
You need to make a judgement call here because you really don't want to keep removing the oven and letting all the heat escape by continuously checking the cookies. My cookies were REALLY underdone at the 8 minute mark (picture above) so I cooked them for another 10 minutes. After checking a second time when the 10 minutes were up, I again found rather under-cooked cookies. I quickly substituted smaller patio blocks to lower my oven closer to the heat source to help the cookies really get coking. I also rotated the cookie sheet because one side was getting a lot more heat than the other. If your cookies are browning on the bottom too quickly, you may need to raise you cookie sheet. Refer to the 1st step and rebuild/raise the rack with stones/blocks. You can also remove some coals to lower the heat level.My cookies baked for about 28 minutes total, but it could have been a lot less if I had lowered the sheet at the 8 minute check in (live and learn I guess!) Total baking time will vary based on how many coals you have, how close the oven is to the coals, and what type of cookie your cooking. . Like any cookie, you want to let it cool for a bit before removing them from the cookie sheet. I simply slid the foil off the cookie sheet heat and allowed them to completely cool right on the foil. Note how beautiful the bottoms of the cookies came out! If I had flames while cooking, they sure wouldn't look that nice!!Once cool, ENJOY your cookies!! I sure did!! Surprisingly, even though I had a small amount of smoke throughout my entire baking process (which is normal), the cookies didn't taste like camp fire at all! They looked perfect and tasted perfect! I call that success! I had so much fun making these cookies, I can't wait to make them again! 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_139_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_139_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_139_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_139_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Combine the yeast and honey in the lukewarm water and \"melt\" the yeast in the warm water. . To give the bread a nice even texture and look, its recommended that you grate your beetroots with a grater. But if you're lazy like me, you chop them up and use the blender. However, this does make it more chunky, and it does effect both the looks and to a certain degree also the taste of the bread (it taste slightly more like beetroot).To help blend better, add the yogurt (I used 'Cultura', the flavour doesn't really matter with such small amounts) and the yeast/honey mixture and blend some more.. Add salt, nutmeg and grate the entire skin of your organic orange. Mix.Add your flour and melted butter, and get your hands dirty (speaking off, probably better you wash your hands first!) and knead that dough!If its crumbly and hard to get to stick together, you can add a bit of orange juice, pressed from the orange you grated earlier.I used ryebread flour, which is why it looks a bit dark. I think wheat flour would have been better though, but I ran out... Onced kneaded into a smooth and even dough, roll it into a sausage shape to fit your tin, place it into your tin and let it rise for 30 min.. 
Once it has risen to almost twice the size, slice a cross with a knife, and bake in a preheated oven at 225\u00b0 for 30 minutes.Let it cool on a cooling rack.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_140_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_140_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_140_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_140_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. -Apples (you can use a variety of apples but i prefer red delicious with gala)-sugar-water-spices (i used cinnamon and nutmeg)-tools-blender/ food processor-an undershirt that you out grew or ripped-knife-pan-cutting board-spoon. wash the applesremove the apple cores and cut them into smaller pieces. toss the apples into a pan and add wateras the apples start to softenpour them out into a separate bowlthe apples should be soft enough for you to stick your finger into the slicesPS: I made some jello while the apples cooked. toss them into your blender or processorand wreak havoc!after the apples turn into apple sauce, dump them out onto your extra shirtand SQUEEZE and twistring out as much apple cider as you canthen after you have a good amount add spices to tasteyou can also add other juices at this time. 
now take those mushy apple fibers and using the same water you used to boil the applesmix them togetherand cook themwhile its cookingadd sugaras most of the sugar from the apples has already been squeezed outnow just as in the last stepring out all the juice you can get from your new mixture. Refine it if you want tomine has some mushy stuff still in itso i poured it back into the shirtand ringed it out againbut not as tightly as the first timeand then i re-spiced the ciderand added a little cream on the top\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_141_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_141_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_141_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_141_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 4-8 sheets of nori (an edible seaweed, usually sold in sheets of 10 or 50) 8 kale leaves a variety of julienned vegetables of your choice: red and yellow pepper, carrots, cucumbers, daikon, asparagus, or avocado optional: tofu or tempeh 2 tablespoons sesame seeds (white, black or a gomazio mix) soy sauce or Braggs aminos for dipping (optional). Lay the nori roll on a cutting board or chopping block. Line the end of one nori sheet with 1 kale leaf, stem removed. Top with a row of vegetables, a few of each kind.Hint: Sometimes people like to add a layer of saran wrap between the cutting board and nori roll, but this isn't necessary if you're careful during the rolling process.. 
Using both hands, roll firmly (like a skinny burrito). Allow some veggies to stick out of the ends, about 1 cm.. Dip your fingers or a pastry brush in water.Spread water on the underside of the loose end, end to end. Continue spreading with water (or soy) until the seam is completely sealed.. Using a sharp knife, cut on the diagonal into 8 pieces. The easiest approach is to cut the roll in half, then in half, and half again.. Assemble pieces on a platter or woodblock, with small dishes of soy or Braggs for dipping. Garnish with fresh flowers, chopped parsley, or a sprinkle of sesame seeds.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_142_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_142_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_142_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_142_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 1 cup of vodka or clear rum per 1.25oz of raw, unshelled almonds. . Put the almonds in a bowl and bring a cup of water to a boil. Pour the boiling water over the almonds and let them sit for about a minute. . Quickly put the almonds in a strainer and run under cold water for a few seconds. After this the skin should be easy to remove. . Give the almonds a rough chop and then put them in your bottle. Pour your alcohol over the the almonds. . Store the future extract in an airtight bottle in a cupboard or somewhere out of the way. Shake the bottle daily. 
In about 1-2 months the almond flavor starts to appear, it gets better with age. After a couple of months you can start using the extract, either by pouring off what you need when you need it or straining the almonds and keeping the extract. \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_143_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_143_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_143_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_143_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Large jar of salad olives\nTwo cans of black, ripe olives\nTwo jars of black kalamata olives\n2-3 cloves of garlic\n1/4 of a medium onion\nThis is all you really need, but you can certainly include plenty of other ingredients. Good ones to try are capers, peperoncini, roasted bell peppers, dried tomatoes, or red pepper flakes.\nNOTE: I'm informed that capers are NOT optional. If you don't use capers, it will still be fabulous, but it won't really be tapenade.. A very sharp knife and a cutting board.\nYou can use a food processor, but your tapenade can get soupy and overworked. Also, I don't have a food processor. If I did, I might recommend it over the knife. Ya never know.\nIf you make this recipe with a food processor, add a comment and let us know how it worked.. These are the manufacturer's rejects. Which is fine, because they're cheaper and you're going to chop them up anyway.\n. Love these olives because they're pitted (yay!) 
and they're cheap (yay!).\nIn addition to these, get two 12-ounce cans of pitted ripe olives. I always get small olives, rather than large or jumbo, on the theory that more small ones will fit in the can than larges ones, and so I'm getting more total olive mass. Ya think?. Drain all your olives before beginning to chop. Be sure your knife is sharp.. Actually, I see some big hunks in there. You want a nice, fine mince.. These look kinda big, too. Who cut this stuff up?!\nEverything you put in your tapenade should be minced very fine.\nFor this quanitity of tapenade, use about 1/4 of a medium onion, or less, depending on how much you like onions. Raw onion is powerful.. Okay, this is is minced nicely. It needs to be nice and fine.\nFor this quantity of tapenade, I use two or three large cloves of garlic, but I really like garlic. You can use less. Or more!. Add hot pepper of any kind that you like, if you like foods spicy.\nLemon is a very good addition, too. Lime would also be delicious.\n(I don't have to tell you not to use salt, do I?). A zester is a very handy item. It gives you strings of the zest which you can use as is to garnish dishes and desserts, or chop fine to put in your tapenade.\nUse a light touch and take off just the yellow zest. Avoid the white, bitter pith.. Zest your lemon before you juice it.. Mince your zest and put it into the tapenade. With this quantity of tapenade, two or three lemons would not be too many.\nBefore cutting the de-zested lemon in half to juice it, roll it on the counter with your palm to break down the juice-containing cels inside the fruit.. This wooden reamer is a handy tool for juicing lemons.\nMix the lemon juice into your tapenade, cover and refrigerate for a few hours to develop flavors.\nIt will keep beautifully in the refrigerator for a couple of weeks, if covered tightly.. Tapenade makes an elegant appetizer for a party. 
Serve with crackers and raw vegetables such as celery and bell pepper.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_144_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_144_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_144_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_144_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. You'll need: Couscous: 1 Tbsp. olive oil 8 oz. (about 1 1/3 cups) Israeli Couscous\u00a0 14 oz. chicken broth (1 3/4 cups), if you want to keep this vegetarian, use vegetable broth or water 2 Tbsp. parsley, finely chopped 1 tsp. rosemary, finely chopped 1 tsp. thyme, finely chopped 1 medium green apple, diced (I love a Granny Smith) 1 cup dried cranberries 1/2 cup slivered almonds, toasted Vinaigrette: 1/4 cup apple cider vinegar 2 Tbsp. maple syrup 1 tsp. salt (I use kosher) 1/2 tsp. ground black pepper 1/4 tsp. cinnamon, optional 2 Tbsp. olive oil. For the couscous: In a medium saucepan, heat the olive oil over medium-high heat. Add the couscous and cook, stirring occasionally until lightly browned, about 3 to 5 minutes. \u00a0Add the chicken broth (I usually add a decent sized pinch of salt too) and bring to a boil. Cover, lower heat to medium-low and simmer for 10 to 12 minutes or until the liquid has evaporated. Set aside to cool. For the almonds: Preheat the oven to 350 degrees F. Arrange the almonds in a single layer on a foil-lined baking sheet. Bake for 8 to 10 minutes or until golden brown, stirring occasionally. 
Set aside to cool. (I toast mine in the toaster oven and it only takes about 5 minutes, you could also dry toast them in a skillet over medium heat until golden brown.). Finely chop the herbs, dice the apple. (I usually peel the apple and slice the apple off the core in quarters, this way I have a flat side for dicing. Slice an apple quarter into 1/8-inch slices, stack them, then make 1/8-inch horizontal cuts, then 1/8-inch lengthwise cuts\u00a0so you have 1/8-inch dice.)\u00a0 For the vinaigrette: In a large bowl, combine the vinegar, maple syrup, cinnamon (if using), salt, and pepper. Whisk in the olive oil until smooth. Add the cranberries and let them sit about 5 minutes so the cranberries soften.. Add the apple and herbs to the vinaigrette, mix well. Then add the Israeli couscous and almonds and stir everything until combined.\u00a0 As I said, this is wonderful at room temperature, and any leftovers can be covered, refrigerated, and served cold up to 2 days.\u00a0 Mangia and enjoy!\u00a0\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_145_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_145_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_145_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_145_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. You will need:1 package bread yeast2.5 cups, all-purpose flour1 tsp. salt2 Tbsp. warm milk1 Tbsp. 
sugar1 cup warm waterAdditionally, you may want some sesame seeds and a little melted butter to finish right before baking.. Combine the yeast, milk, sugar and water. Mix gently and set aside for 15 minutes. This allow the yeast to activate.When it looks frothy, it's ready.. In a larger container, combine 2 cups of flour and the salt. When your yeast is activated, mix the dry and wet ingredients together. When your ingredients are mixed well, sprinkle the remaining flour over the dough ball to prevent sticking. Cover and allow to rise for 1 hour.Optional, This dough can be very sticky. You can spray cooking spray onto whatever bowl you are mixing and rising in to further prevent sticking.. After an hour, punch down the dough and knead for 5 minutes. Add a small amount of extra flour to prevent sticking while kneading. Cover again and allow to rise for an additional 25 minutes. Preheat your oven to 375 degrees F.After the second rising, knead lightly for 5 minutes and divide in three even parts. Roll the parts into snakes and pinch the ends together. Place on baking sheet and braid the pieces together carefully. Allow to rise for an additional 15 minutes. Brush lightly with butter and sprinkle with sesame seeds.Place your loaf in your 375 degree F oven and bake for 30 minutes. Allow to cool, if you can wait, and enjoy! 
Remember, there are no preservatives in this bread, so it will get stale after 3 of 4 days.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_146_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_146_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_146_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_146_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 100g oats 2 bananas 150g pumpkin puree cinnamonPreparation time: 10-15 minfor 2 servings. Cook oats in the water until tender.. Add some cinnamon, 2 bananas, pumpkin puree and mix it all together.. Decorate the top with some fruit. I love to use grapefruit, grapes, banana and peanut butter for this meal.. . 
YOUTUBE: https://www.youtube.com/channel/UC_GmntyQbCokHFwy...WEBSITE: http://www.spineat.comINSTAGRAM: https://www.instagram.com/spineatcook/TWITTER: https://twitter.com/SpinEatCookFACEBOOK: https://www.instagram.com/spineatcook/TWITTER: https://www.pinterest.com/spineat/\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_147_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_147_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_147_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_147_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Avocados are properly ripe for this dish when they're slightly soft, giving a bit under pressure. A few spots are fine. Wash your avocado well, then slice in half, and remove the big pit.. You'll need:1 avocado half2-4 tablespoons apple cider vinegar (to taste)1-2 tablespoons olive oilItalian seasoning - a \"sprinkling\"Salt Pepper. Combine your oil and vinegar to taste inside the well of your avocado half.. Sprinkle with Italian seasoning and salt and pepper - to taste. . You may want to use balsamic vinegar instead of apple cider vinegar, for a sweeter flavor, or use Italian dressing in place of your own oil and vinegar concoction. . Serve alone, as a side dish or with a side of corn chips. It's a wonderful lunch, with only about 150 calories, and high in potassium and fiber. Enjoy! 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_148_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_148_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_148_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_148_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Ingredients: (makes 4 burgers) 8 pieces of bacon (2 per burger) 1 lb. ground beef 2 large beefsteak/other large tomatoes 4 thick leaves of Romaine lettuce 4 thin slices cheddar cheese 2 tablespoons mustard 3 cups minced Mushrooms Salt/pepper for taste FOR BURGER SAUCE: \u00bc cup mayonnaise \u215b cup mustard 1 tablespoon sweet relish 1 teaspoon lemon juice. Prepare your lettuce, cheese, and tomatoes. Cut the tomatoes in thin slices. You should have about 10 - 12 slices of tomatoes. Place the lettuce, tomatoes, and cheese on a tray until further use.. Shape ground beef into patties. First, with your hands, divide the 1 lb. of beef into 4 equal balls. Break the meat up, with your hands, and then push each of the 4 balls down so that they are proper flatness.. Prepare burger sauce by combining the mayonnaise, \u215b cup mustard, relish, and lemon juice in a small bowl.. Mince the mushrooms finely. Put them in a medium bowl for future use.Place a cast iron griddle over a grill and heat it up. Or, alternatively, you can use a flat metal skillet/griddle over your stove top. Once sufficiently heated, cut your bacon pieces in half, so that you have 16 halves, and place on heated skillet. 
Sizzle until just cooked through. Remove bacon from griddle, and place in a bowl, and insulate it with aluminum foil. Do not turn off griddle, and leave the bacon grease hot. Get your mushrooms ready. Dump them on top of hot bacon grease, and saute until mushrooms are very brown. Place them back in their original bowl, and insulate with aluminum foil. Do not turn off the griddle.. Now for the patties. Squirt the 1 tablespoon of mustard on the patties, and preferably smear around with a brush. Then place the thick burger patties on the hot griddle--yes the same uncleaned one you used to cook the bacon and mushrooms on.Sprinkle patties with salt and more mustard, and let cook on one side for two minutes or so.After the two minutes are up, flip the patties over, sprinkle the cooked sides with salt, and place a piece of cheese on each patty. Finish cooking on this side for another couple minutes. After the minute is up, place patties and their cheese on top on a large heat-proof plate, and cover with aluminum foil.. Now it\u2019s time to put together the burger. This part is easy. First, place the bottom burger bun on a serving plate. Then smear the mayo burger sauce over the top. Then place on 1-2 tomato slices. Then 2 halves of bacon. Then one of the cooked patties goes on top, and on top of the patty goes the additional 2 halves of bacon. On top of the bacon goes the sauteed mushrooms. Then on goes the piece of Romaine lettuce, and finally, on goes the last burger bun! 
Repeat 3x for a total of 4 delicious Mushroom Cheddar Bacon Burgers!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_149_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_149_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_149_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_149_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Ingredients and UtilitiesTwo eggs (If you want more eggs on the sandwich feel free to do so) Mayonnaise Two bread slices (I advise whole grain bread, but rye bread is also acceptable) Any kind of cheese (I make mine with Swiss) Deli Ham Optional:Any other meats or cheeses you would like to add (if you do add meat i suggest turkey)ButterFor step 4: A medium sized frying pan For step 5: A panini press. I suggest frying the egg over hard, because when using the panini press to sear the hag, the egg yolk might splatter over the panini press. Put butter in the medium sized frying pan and fry it till the yolk is hard (this is how you cook a over hard egg).. I usually do this as I am frying the egg, but if you do not have very much cooking experience I would prep the sandwich before or after frying the egg. Pick two bread slices of your choice and spread the desired amount of mayo on them. I would put the mayo on both sides for a better cooked sandwich experience. After spreading the mayo, put half slice of cheese on both sides; if you are using a different cheese or you just would like more Swiss, add it. 
Then put two slices of ham, evened out between the two bread slices, in the sandwich.Before I explain the next steps, there are two ways to cook the Hag. Step 4 explains how to pan fry the sandwich. Step 5 explains how to use a panini press to cook the sandwich. Skip to the step that you prefer the Hag, or try both!. Take your frying pan and place it over your fire. Next spread butter around the pan. Take your prepared Hag Sandwich (with the fried egg inside)and leave it on the pan until the bread is brownish-blackish, which adds a crispier experience (or leave it on however long you want it). After frying the Hag, put it on a plate and voil\u03ac the hag sandwich is complete! If you are using this step to cook the sandwich skip to step 6.. Before you start cooking the sandwich with the panini press, heat up the panini press. After the panini press is heated up , add the fried egg to the prepared sandwich and place the sandwich in the panini press, placing the top on after putting the sandwich in the panini press. Leave it for about two minutes (or leave it however long you want), and then flip the sandwich over; be careful though, because the panini press is very hot. Finally, put the sandwich on a plate and you are good to go. By the way, I use a cast iron panini press, but the instructions should be the same if you are using a electric panini press.. After creating the master sandwich, the next step is to finally give your mouth the privilege of feasting on a hag. Your mouth will rejoice in the glorious taste of the sandwich, and your stomach will crave the the hag. I usually leave the sandwich uncut when eating it, but if you want to you may cut the finished product to get a more formal look. 
I don\u2019t normally drink anything with the sandwich when eating, but if you were to drink something with it i suggest a tall, cold glass of milk.Before I leave you with the thought of making this sandwich, I would like to thank my father for teaching me my cooking skills, and the owner of this (https://www.instructables.com/id/The-Best-Turkey-Sandwich-Ever/) instructable for inspiring me to share my recipe to the world.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_150_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_150_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_150_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_150_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. for cookies:\n3/4 cup brown sugar\n1 stick softened butter\n1 egg\n1 teaspoon vanilla\n1 cup flour\n1/3 cup cocoa\n1/2 teaspoon baking powder\n1 cup semi or bitter sweet chocolate chipsglaze:\n1/2 cup powdered sugar\n2 tablespoons boiling water\n1 tablespoon cocoa powder\n1/2 teaspoon peppermint extract\nYou can use white sugar for these, but they won't taste as amazing. The brown sugar really gives them extra oomph. :D\nYou'll also need an oven preheated to 350 F and a baking sheet with parchment paper lining it.. Cream the butter and sugar together. Add one egg and a teaspoon of vanilla and mix that in.\nOnce everything's nicely combined, mix in the flour, cocoa and baking powder. 
I got lazy and just poured them into a sieve instead of combining them in a separate bowl before. :)\nNow add one cup of chocolate chips and mix them in.. Place twelve rounded tablespoonfuls of dough on a baking sheet and bake for 12 minutes at 350 F. You'll do this twice. :)\nPut the dough in the fridge between batches - or if you're fancy and forward thinking, do two baking sheets at once!\nOnce they're done, let them cool on the baking sheet until they're easy to handle, and then transfer them to a baking rack.. Put the cocoa powder and powdered sugar through a sieve to eliminate any clumps. Add in 1/2 teaspoon of peppermint extract and 2 tablespoons of boiling water. Stir well and then pop in the microwave for 30 seconds and stir again. It should be well combined now.\nAt first, the oil will float on the top, but as it cools it will mix in. As soon as it stops floating to the top, you know it's cool enough to glaze the cookies with.\nMake sure your cookies are cool before glazing!\nYou can either dip the cookies into the glaze or drizzle it on top. I like to do a few coats to make them extra pepperminty. 
:D Make sure you glaze them over a baking rack with wax paper below it - it's messy!\nThe glaze will be sticky for a little bit, but within a couple hours it'll harden up and look nice and glossy.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_151_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_151_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_151_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_151_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. INGREDIENTSQuick Pickled Cucumbers Sauce1/2 cup Rice vinegar\n1/2 cup Filtered water\n3 1/2 Tablespoons sugar\n2 Persian large Cucumbers, thinly sliced\n1 small Shallot\n1/8 teaspoon dried Chilli flakes\n1 whole fresh Thai chilli, chopped or whole depending on your spice preferenceTod Mon 1lb Chicken breast or thigh, cut into two inch pieces for easier food processing 1 lb Shrimp, De-shelled and de-veined 3-4 Tablespoons Thai green curry, home made or store bought3/4 teaspoon Sugar1/2 Teaspoon Fish sauce1 large egg1/2 large Onion, small dice, 125 grams12/ bundle of long green beans, 140 grams1/2 bundle Cilantro rough chop, stems and leaves, 45 grams25 Kaffir lime leaves, De-veined and thinly sliced, 9 grams 2 Tablespoons High heat oil for cooking. This Sauce is a wonderful compliment to the flavors of Tod Mon. You can have a few slices of cucumber with each bite of Tod Mon, which is how I like to do it, or you can simply dip it in the vinegar juice.. 
I like to use a mandolin to slice the cucumbers and shallots because you can set it to cut super thin, which works best for the quick pickle process. Slice and set aside. Bring your vinegar and water to a heavy simmer. Add sugar and stir until sugar has melted. Place cucumbers, shallots, dried chilli flakes and fresh Thai chilli in a mason jar. Pour the heated liquids over veggies and cover. Set aside for later. . Onions - chop your onion small. Small to medium dice is best.Green beans- cut the tips off. line them up together and cut them into 3/4 inch pieces. I find 3/4 inch pieces to be the best size for this recipe because if they are too big they tend to fall out of the balls and if cut too small, they get lost in the ball and you loose their great texture.Cilantro- I use the whole herb. Stems and leaves. The flavor is the same and they have a nice texture. I'm not one to waste, so unless you're saving the stems for a stock, use them.Kaffir Lime Leaf- you can either pull the leaves off by grasping the stem between your fingers and pulling down on the leaves or just take a sharp knife and cut along the stem. Gather a pile of half leaves and stack them the same direction. Cut thin slices along the shortest distance. Set ingredients aside. In a food processor add the chicken, shrimp, sugar, egg, fish sauce and curry. Process until well chopped but with some small chunks here and there. This part is really preference. I like mine a little chunkier. The kind you find in restaurants are generally well processed without chunks.. Empty the contents of the food processor into a large bowl. Add onions, green beans, cilantro and the chopped Kaffir lime leaf. Mix by hand until all ingredients are well incorporated and evenly distributed. . The best way to make the balls is by using two spoons. Take a spoon full with one spoon and use the other to shape it. when done shaping use that second spoon to pull your newly made ball into the deep fryer or frying pan. . 
You can cook it any way you like but the two best methods are stove top and deep fryerStove top- use a high heat oil like coconut or grape-seed oil. Over medium heat drop your spoonful onto a heated and oiled pan. Cook about a minute and a half or until browned then flip over and use a spatula to squish it down a little. Cook until browned on the other side. Set aside on paper towels or a paper bag.Deep fryer- Set it at 350 degrees. When oil is ready, drop spoonfuls into the raised basket. Lower into the oil and fry for about a minute and a half or until browned, moving them as not to stick to the bottom. Set on paper towel or grocery bag.. Serve with Cucumber sauce, sliced cucumbers and green beans on the side.Enjoy and thank you...\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_152_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_152_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_152_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_152_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 
This project calls for adding a very small amount of dry ice to a closed container.\u00a0\u00a0\u00a0 This is only safe-ish because we are adding such a tiny (and well measured) amount.\nAdding too much dry ice - or using the wrong container may result in an explosion.\u00a0 In the worst case - this could seriously injure or possibly even kill you.---MANDATORY READING---\nHOW TO AVOID BLOWING YOURSELF UP:Never use a container other than a 2-liter plastic soda pop bottle-Do not use a glass container!!!!\u00a0 You really might kill yourself.\n-Do not use a smaller 16 or 20oz plastic pop container - these are not as robust - and may explode (trust me)\n-Do not use a bottle that seems damaged / stretched from pressureNEVER place the 2-liter soda pop bottle in the freezer\n- This may cause it to shatter / blow up in your face (trust me)Pay close attention to the measurements of dry ice\n- The provided measurements will be in teaspoons / grams\n- Do not confuse this with tablespoons / ounces\n- Do not add more dry ice than is specified\nWatch this video of too much dry ice in a 2-liter bottle blowing up a cinder block:. A Bag of Loose Yerba Mate\nAny brand is fine\nCheck your local Latin American grocery store\u00a0 - \u00a0they commonly have mate about 75% cheaper than other places\nAlso available at many grocery / health food stores or where loose tea is soldDry Ice\nOften available from the meat department of your grocery storeDo not handle directly - use gloves! 
\u00a0It can burn you quickly!\nWe'll only be using about 6-8 grams per bottle - but since it evaporates - get at least a pound or two2-Liter Plastic Pop Bottle\nNo substitutions allowed!\nBottle should be undamaged / in general good condition\nIf you want to end up with individual servings - carbonate in the 2-litter bottle - then transfer to other containers1/2 Teaspoon Measuring Spoon\nYou'll need other measuring cups / spoons - but this one is important!Hammer\nTo crush dry ice (may want to wash it if it's been sitting in your toolbox)Sturdy Metal or Plastic Bowl\nTo crush the dry ice in -\u00a0I use a small plastic measuring cup\nIf it's plastic - be prepared that it might break.Do not use glass / ceramic!French Press\nIf you don't have one - you can probably figure out an alternativeSugarLemon Juice\nLime juice also works. Dry ice is frozen CO2.\nBeverages are carbonated by dissolving CO2 into them under pressure.\u00a0One source suggests a 2-liter bottle of Pepsi contains roughly 10 grams CO2.\nTo allow a little extra safety - we're going to target 6-8 grams per bottle. \u00a0This also seems to be a nice, light carbonation level for mate.\nHowever - the 2-liter bottle isn't designed to handle the pressure required to carbonate - only to store carbonation.\nIf you added 8 grams of dry ice at once to the bottle - much of the resulting CO2 gas would not immediately dissolve into the liquid. \u00a0This would create more pressure than the same amount of CO2 would if it was already dissolved into the liquid.This level of pressure could cause an explosion!\nInstead - we will be adding a small amount (2 grams) of CO2 (dry ice) in several small doses. \u00a0Each time assuring the CO2 has fully dissolved into the liquid before adding more.\u00a0\nYou won't need a precision scale to measure the dry ice. \u00a0We'll be crushing it to a snow-like powder which reliably weighs about 2 grams per 1/2 level teaspoon.\nLet's stop and note that measurement now. 
\u00a0That's 1/2 level teaspoon. \u00a0Do not confuse this with tablespoon. \u00a0Do not use a \"rounded\" teaspoon.\nThe powder also provides a large surface area - and dissolves into the liquid quickly.\nCO2 dissolves more readily into cold liquids. \u00a0We don't need our mate ice-cold - but it needs to be at least cooler than room temperature to get good carbonation results.\nImage credit:\u00a0Arie Melamed-Katz. This page includes several references as to the caffeine content of yerba mate:http://www.erowid.org/plants/yerba_mate/yerba_mate_chemistry2.shtml\nIn short - using hot water to extract caffeine from mate typically yields 0.5% to 2.0% caffeine by weight. \u00a0However - most test results came in right around 1%. \u00a0We'll use that number to guesstimate.\nSo - we're making 2-liters (67.6oz) of mate in this recipe.\nLet's call that 4 generous 16.9oz servings.\nMy own measurements put 1 cup mate (volume) at 72 grams (weight).\nLet's say we put 1/2 cup (36 grams) of mate into our 2-liter batch:\n36 grams * 1% = 360 milligrams total caffeine\n360 milligrams / 4 servings = 90 milligrams per 16.9oz serving.\nFor reference Club Mate comes in at 100mg caffeine for the same size serving! \u00a0That's certainly within our margin of error.\nHowever - to get this level of caffeine - Club Mate adds extracted caffeine to their recipe. \u00a0So - using this amount of mate in our recipe yields a little \"richer\" flavor.\nMate varies in strength and density -\u00a0 be aware that it's possible these guesstimates are off by a factor of 2x in either direction.\u00a0 If you are caffeine sensitive - avoid mate.\nFor further reference - drip coffee comes in around 145mg caffeine for an 8oz serving.. 
Here are a few recipes to try - each one makes 2-liters mate (additional instructions in next steps)\nThe amount of sugar and citrus are based on a combination of research and my own taste.Mate Light\nSimilar in flavor to Club Mate - little less caffeine (60mg per serving)\n1/3 cup mate\n1/3 cup sugar\n1 teaspoon lemon juiceMate Extra\nA bit richer tasting than Club Mate - with similar caffeine (90mg per serving)\n1/2 cup mate\n1/3 cup + 1 tablespoon sugar\n1 1/4 teaspoon lemon juiceImperial Mate\nMore Flavor! More Caffeine! (135mg per serviing!)\n3/4 cup mate\n1/2 cup sugar\n1 1/2 teaspoon lemon juiceInsert Your Recipe Here\nTweak the variables a little - or do your own thing.\nAdd ginger, raspberry, chile powder!Note: Large amounts of strong mate can make your tummy unhappy.\u00a0 You've been warned.. Fill the 2-liter pop bottle half full of cold water (about 4 cups).\nRun the faucet a while to get the water as cold as possible.\nWhen done - put the bottle in your\u00a0refrigerator\u00a0to further chill.DO NOT PUT THE BOTTLE IN THE FREEZER\nThis will cause it to become brittle - and may shatter / explode when carbonating.\nNote - I don't actually recommend taking the bottle's label off as shown in the photo. \u00a0It leaves a sticky residue.. Pick which recipe you want to use from the prior step - then put all the ingredients into your French Press.. Add roughly 3 1/2 cups not-quite-boiling water to the French Press.\nIf you boil water - then let it sit a few minutes afterwards - you'll get the temperature about right.\nDon't worry about the exact temperature or amount of water (just not more than 4 cups - or you might run out of room in the bottle later).\nUse a spoon to stir everything (make sure the sugar isn't sticking to the bottom).\nNote: I have a giant French Press (thanks Beth!) - yours will probably look more full with the same amount of liquid.. 
Plunge the French Press after 5-15 minutes of steeping.\nCaffeine extracts pretty easily - you'll probably get most of it within a few minutes.\nI commonly plunge the French Press after about 7 minutes.\nBut - going a bit longer may get you a tad more caffeine - and more mate flavor.\nIt's up to you!. In order to effectively carbonate - you'll need to get the temperature of the mate in the French Press down to around room temperature.\nTo speed this process up - fill your sink with cold water - then let your French Press sit in it.\nYou should find it close to room temperature within about 15 minutes.\nWhen it's no longer warm to the touch - you're ready for the next step.. Pour the contents of the French Press into the 2-liter pop bottle.. Fill any remaining space in the 2-liter bottle with cold water. \u00a0Leave as little air in the top of bottle as possible.\nYou may optionally further chill the mate in your\u00a0refrigerator\u00a0(not freezer!) at this point. \u00a0Doing so may provide a bit better carbonation results. \u00a0The colder the liquid - the more easily the CO2 will dissolve into it.. Place a few small chunks of dry ice into your crushing container.Use gloves - the dry ice can burn you!\nLightly crush the dry ice using the hammer (you don't need to use a lot of force).\nContinue crushing until the powder resembles course snow.. Do this step over the sink! \u00a0If you don't get the cap back on quickly enough - the mate may overflow. \u00a0It may also overflow when opening the bottle. \u00a0The colder the mate - the less likely things will overflow on you.Do not add all the dry ice at once!\u00a0 It will explode.\nWe are going to be adding 1/2 level teaspoon (2 grams)\u00a0of dry ice at a time.One last time - that's 1/2 level teaspoon - \u00a0not tablespoon. \u00a0Tablespoons are bigger - and may in this case cause an explosion.And again - 1/2 level teaspoon. \u00a0Not a rounded 1/2 teaspoon.\nPerform the following steps quickly:\n1. 
Measure 1/2 level teaspoon of the dry ice powder\n2. Dump the 1/2 level teaspoon of dry ice into the 2-liter bottle3. Immediately place the cap back on the bottle4. Immediately shake the bottle\u00a0vigorously!\n5. Place thumb on bottle - note slight pressure building in bottle (keep shaking!)\n6. Once the pressure has peaked, then subsided - you may stop shaking\n7. Slowly remove cap from bottle (don't be alarmed if it fizzes over a bit)\nWhen done - repeat this process twice more. \u00a0Powder more dry ice if needed.\nIf you want more carbonation - you can repeat one additional time.. At this point the mate is probably a bit cooler than room temperature - drinkable - although would probably benefit from some time in the fridge.\nEnjoy your \"Tub Mate\" - but once you're comfortable with the process (or before) - hack up your own modified version!\nI (and others) have found lime and ginger make great additions to the recipe. \u00a0You probably have your own ideas.\u00a0There's lots of people making mate - check out their recipes and get inspired!https://www.noisebridge.net/wiki/Sudo_pophttps://github.com/jdukes/ShadowMatehttp://www.hackpittsburgh.org/brewing-open-mate-sodahttps://gist.github.com/1054447Blatant Self Promotion\nIf you like projects involving radioactivity, lasers or robots you should check out\u00a0http://nothinglabs.com\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_153_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_153_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_153_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_153_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third 
image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 1 cup of wheat gluten1 cup of water1 spoon of flour1 spoon of beer yeast1 garlic cloveshoyu (soy sauce)saltpepperherbs, seeds and spices as you prefer (try parsley, poppy seed and nutmeg). Put all the dry ingredients into a bowl and mix them with a spoon. Also add the spices and herbs in this step.. Now you are ready to add a little bit of shoyu, the more you add the more brown will look the seitan, but depends on your taste. I like with lots of shoyu.After you'll add the water. Add without stir, and don't add all at the same time. Wait till the wheat gluten \"absorb\" all the water. If you see any solid put more water and mix kindly.The result needs to look something like the \"thing\" on the photo below.. Now you need to cook the seitan! Put water boiling and add some herbs if you like and a little bit more of salt. Add the pieces of seitan. If you have a big piece of seitan separate them into 2 pieces.Let the seitan cook at least half an hour.. When the seitan is cooked, put it into a slide with a pot full of water above, to make some pressure over the seitan and that way the excess water will go apart!After that you can store your seitan in the fridge (in a container full of water, shoyu and garlic) or in the freezer. 
Normally I put some on the fridge to use next days in some wonderful recipe, and I store other piece on the freezer to use later on.And that's it!I hope you like!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_154_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_154_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_154_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_154_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Ingredients:Four Bananas One heaped tablespoon of Cocoa One teaspoon of Vanilla Essence You'll also need:Knife Handheld Blender ( I would actually suggest using a normal blender) Spoon Container with lid JugIf you only have bananas, you can also make this... just follow the recipe, but leave out the vanilla essence and the cocoa. The frozen bananas are the base... what you combine then with it can be actually anything of your choice.. Cut the bananas into smaller pieces, because when your going to mash it, you don't want to break your blender.. Add your chopped bananas to your container and freeze for about 6-8 hours.. After freezing the process is quick. Blend the frozen bananas,cocoa and vanilla essence all together till smooth and creamy (like Ice cream). When blending, take it slowly, you don't want to break your blender! Tip: I would actually suggest using a normal blender.. Enjoy your healthy choc-banana alternative to ice-cream! For a more sweeter version, replace the cocoa with some chocolate spread! 
That won't be so healthy, but it will be much more sweeter! You can also just leave out the cocoa. Feel free to leave a comment, thanks for taking your time to look at my Instructable and Please Vote!...and visit our profile!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_155_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_155_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_155_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_155_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. For this salad, I used one package of quinoa, one bunch of kale, a package of cranberries and about a cup of feta. You can use different amounts of each, depending on what you like. I also used a few teaspoons of olive oil.I used this brown rice and quinoa mixture because it's all they had at Safeway, but you can use regular quinoa if you can find it.. The directions on my package of quinoa say to bring the quinoa, 1 and 3/4 cups of water and a teaspoon of olive oil to a boil, then turn heat to medium, add the packet of spices and cover for about 15 minutes. All the water should be absorbed when it's done.. Rinse your kale, cut off any brown ends you find and cut the rest of the kale into smaller pieces.. While the quinoa is still warm add it to the kale and mix together. Add in the cranberries and feta and toss. . 
Dress the salad with olive oil and lemon juice and enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_156_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_156_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_156_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_156_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Use five pieces of cabbage.. The cabbage need to be diced.. Prepare a white leek.. The leek need to be diced, too.. Prepare ground pork.. Season with soy sauce, sesame oil.. Mix it well.. Like this!!. Wrap meat in dumpling skins.. Wrap meat in dumpling skins.. Finished preparing the Chinese dumpling for cooking.. Fry it!!. 
Please subscribe to my channel!!BALISAGE Cooking\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_157_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_157_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_157_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_157_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. \n\t1 1/2 pounds of ground beef\n\t1 cup dried bread crumbs or corn flakes\n\t4\u00a0 Tablespoons brown sugar\n\t2 Tablespoons mustard\n\t1/2 Cup ketchup\n\t1 cup milk\n\t1 beaten egg\n\t1 Diced onion\n\t1/2 Cup diced carrots\n\t1 Cup diced celery\n\t1 Diced bell pepper\n\tSalt and pepper to tasteUtensils:\n\tBowls, measuring cups, loaf pan, knife, cutting board, spoon, whisk or fork, and oven.. \n\t1 1/2 pounds of ground beef\n\t1 cup dried bread crumbs or corn flakes\n\t4\u00a0 Tablespoons brown sugar\u00a0\n\t2 Tablespoons mustard\n\t1/2 Cup ketchup\n\t1 cup milk\n\t1 beaten egg\n\t1 Diced onion\n\t1/2 Cup diced carrots\n\t1 Cup diced celery\n\t1 Diced bell pepper\n\tSalt and pepper to taste. \n\tPreheat oven to 350 degrees.\n\tIn large bowl combine the beef, beaten egg, onion, Milk, and the bread crumbs or crushed cornflakes.\n\tSeason with salt and pepper to taste. \u00a0\n\tCombine the brown sugar, mustard and ketchup in a bowl.\n\t\u00a0. \n\tPlace the meat loaf into a lightly greased loaf pan.\n\tShape the edges.\u00a0\n\tSpread the brown sugar mixture over the top.\n\tPlace it in the oven for approx 1 hour. 
\u00a0We are at 3500 feet and it takes mine longer than an hour to cook until well done. \u00a0\n\tSave the left over brown sugar mixture and use as ketchup at the table for those who love the flavor.. \n\tI have used the base of this meat loaf recipe using different vegetables from left overs and garlic or other seasonings for a change.We always go back to my original, for the wonderful flavor and texture; as our favorite meat loaf recipe. \u00a0I do not speak for my daughter though. She makes it for her husband \"The Condiment King,\" who likes a little hamburger with his condiments! \u00a0She also uses corn flakes instead of bread crumbs. The texture is not as firm as mine. It is fun to experiment though. Meat loaf could be sliced ahead and taken to a\u00a0picnic for a quick slap on the grill.\u00a0\u00a0\u00a0\n\tIn closing I would like to thank our instructables company, sponsors, authors, readers, and members; for making this community a great success! Many hours and hard work has been put into making this place the best DIY on the Internet. Have fun and thanks for stopping by!\n\tSunshiine\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_158_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_158_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_158_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_158_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 
For this instructable you will need only minimal, readily available ingredients:-1 jar baby fruit puree, around 100-120g-1 tsp gelatin powder-50ml boiling water-spoon to mix ingredients-jar to set and serve jelly. This is crazy easy, and simple to adapt. -Dissolve the gelatin in boiling water in your serving jar. Spoon out any stubborn chunks of gelatin that don't want to dissolve, they're just a pain and too small to make a noticeable difference. If you want a firmer jelly add another 1/2 tsp gelatin or use 1/2 teaspoon less for a very soft jelly. -Add in your puree and stir well.If it doesn't all come out pour some of the jelly mix back into the jar and give it a shake. This will loosen the puree and it should all pour it and mix evenly now. -Leave it to set in the fridge. Four hours is usually a good minimum setting time. . There it is, it's that simple! Use any puree you want, fruit or veg. It's a nice change for your little one, but still comforting to them as it is the flavours they know and love. I've made this with my own purees too. My favourite is to stew finely chopped apples and pears with a hint of cinnamon and a splash of vanilla, sugar isn't needed as the fruit is sweet enough. I cook until soft and chuck it in the blender until it reaches the right consistency then add in the gelatin mixture. . Full credit for this instructable goes to Rafferty's baby products. 
The original recipe can be found here https://www.raffertysgarden.com/snacks-and-treats I hope you and your little one enjoy and please leave questions, suggestions and experiences in the comments :)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_159_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_159_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_159_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_159_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. \n for the cake:\n\t\t2 eggs\n\t\t2/3 cup white sugar\n\t\t4 tablespooons pineapple juice (I used orange juice since I used fresh pineapple!)\n\t\t2/3 cup all purpose flour\n\t\t1 teaspoon baking powder\n\t\t1/4 teaspoon salt\nfor the brown sugar topping:\n\t\t2 tablespoons of butter/margarine (I used Earth Balance)\n\t\t1/3 cup brown sugar (packed)\n\t\tcan pineapple rings (or fresh pineapple!)\n\t\tmaraschino cherries\nThis recipe makes 8 cupcakes!\nYou'll also need a muffin tin, sifter, nonstick cooking spray, and an oven set to 350 F. :D\n . Melt the butter and brown sugar in the microwave - 30 seconds should do it. Once melted, mix it well!\nSift the dry ingredients.\nMix the wet ingredients thoroughly!\nAnd preheat the oven to 350 F. . Once the wet ingredients are mixed well and the dry ingredients are sifted, you can mix them together.\nPour the dry into the wet, and mix until everything is combined and you can't see any clumps of dry ingredients. :). 
Spray your muffin tin with nonstick cooking spray. :)Spoon in a little bit of the butter/brown sugar mixture into the bottom of each cup - it ends up being a little less than a tablespoon each. Then place a maraschino cherry in the middle and surround it with bit of pineapple.Now you'll spoon the cake batter over the pineapple - fill the tin almost to the top!. Bake in a preheated 350 F oven for 20 minutes. Test their doneness with a toothpick - it should come out of the middle of a cupcake nice and clean!\nOnce they're done, you need to let them cool for at least 15-20 minutes, preferably on a wire rack. :) If you try to take them out of the pan right away, you'll probably lose your toppings because they'll be too hot.\nGo around the edge of each cupcake with a butter knife to help loosen them. Then cover the muffin pan with a large plate or a cutting board and flip it over! This should let them come out nice and easy, and in one piece! You can see one of mine did a flip. :P That's okay - just reassemble it carefully. :D. Because of the topping, I'd recommend keeping these in the fridge if it's warm where you are! They keep well for a couple days, but they have a tendency to dry out a little after that since the batter doesn't have any fat in it. 
:)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_160_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_160_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_160_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_160_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Edible Supplies:\t\tCupcake Mix (any flavor)\t\tFrosting (any flavor, but you'll need it to be green)Mini Oreos (I bought 2 $1 packs from Target and that was plenty)\t\tNormal Sized Oreos Brown Candy Melts (you'll need very few) Green Candy Melts (you'll need very few, way less than pictured) Orange Candy Melts (I used a whole bag to dunk all of the mini oreos shown and three big oreos) Brown SprinklesSupplies You Shouldn't Eat:\t\tCupcake Pan\t\tCupcake Liners\t\tContainer to melt melts\t\tCling Wrap\t\tZip Lock Bag\t\tWilton Piping Tips, I used the #2 round and #5 round\t\tWax Paper. You'll need to make your cupcakes. \u00a0There is nothing special there, just make them and give them time to cool.\nTo get ready to dunk your Oreos, get a piece of wax paper out. \u00a0You will need a pretty big piece if you do as many as I did. \u00a0Then, in a container,\u00a0carefully\u00a0melt your candy melts. \u00a0Gather your Oreos and you are ready to go!\nMy initial idea for dunking these Oreos, was to stick toothpicks in the cream and then dunk them like that so I could easily remove the toothpick after they were dunked. \u00a0This does not work. 
\u00a0Because the candy melts are so dense, they cause the toothpick to act as a lever and instead of dunking it in and taking it out all nicely, it pries your Oreo apart and causes a mess.\nSo, to dunk my oreos, I just threw them in the candy melts and used a fork to get them out. \u00a0Sometimes they had too much coating and when they did I would lightly press them against the side of the bowl to get off some of the excess. \u00a0Once they were dunked, I carefully set them on the wax paper. \u00a0I put them so they were standing on their side if I could.. Use the same method for melting candy melts and getting them ready with the wilton tips as I did with the Skeleton Cupcakes\u00a0(Step 3). \u00a0You will not need many candy melts of green or brown at all. \u00a0You are only doing small details and it goes a long way. \u00a0I had extra after I did everything and so I drew out chocolate bats and did green vines, which I did use later.Stems:\nHeat up your chocolate candy melts first. \u00a0Prepare a ziplock bag and you will be using a #4 round tip. \u00a0I show in the pictures above how I did the stems. \u00a0It's fairly simple. \u00a0All I really tried to make sure I did was got a nice thick stem that sort of stuck up. \u00a0Their stems aren't always that long, so you just need a little stubby one on top.Vines:\nHeat up your green candy melts for your stems. \u00a0I used 12 pieces and it was\u00a0definitely\u00a0enough. \u00a0Now just draw some vines on your pumpkins. \u00a0I did a couple leaves using the same method as the stems, except, in stead of pulling up and away from the pumpkin, I kinda of went along the pumpkin. \u00a0You can see a little leaf in Photo 5. \u00a0With your extra green, draw some vines on your wax paper. \u00a0I put these on some of the cupcakes later, just for a little extra something, something.\n*Tip: Since you don't really get the zip lock dirty because the candy melts are wrapped in cling wrap, you can use both corners of the bag. 
\u00a0Then you only need one bag to do the stems and the vines.\n**Another Tip: Make sure when you put the melts in the cling wrap, that you really twist the ends and get the candy melts all grouped in the middle. \u00a0Otherwise they will spread out in the cling wrap as they melt and as you smush them.. Now all you need to do is frost up your cupcakes. \u00a0Throw on some sprinkles and put on a pumpkin or too. \u00a0Do not press the pumpkin in like you did with the bones in the Skeleton Cupcakes. \u00a0This won't push them in the cupcake because the pumpkins are too fat. \u00a0This will just make a mess of the frosting. \u00a0Just set them on top. \u00a0They should stay fairly well. \u00a0The more frosting you use the better,\u00a0because\u00a0while they won't push into the cupcake, you can bury them in the frosting. \u00a0I put some more sprinkles around the base of the pumpkin once it was on the cupcake.\nFor the Great Pumpkin, you are going to need to cut a slice out of the pumpkin. \u00a0See photos 8 - 10. \u00a0Once you cut out the slice and frost it, make sure you remember where it is because it is hard to tell once the cupcake is frosted :)\nNow you can put your pumpkins on all of your cupcakes and throw some vines in as well. \u00a0I tried to make it look like the vines were coming from under the pumpkins (though, I know the vines would be around the stems).. I always take so many pictures of my\u00a0finished\u00a0projects to get just the right one. 
\u00a0So I am sharing a bunch with you here :)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_161_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_161_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_161_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_161_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. This popcorn ball recipe produces soft and chewy Jello flavored popcorn balls. There are relatively few ingredients and should take less than 20 minutes to make. Popcorn Ball Ingredients:+ 6 Tbsp. butter or Margarine + 3 c. tiny marshmallows+ 1/2 package of 3oz. (3 Tbsp.) Jello mix+3 quarts unsalted popcorn (about 1/2 cup of popcorn kernel)For Ornaments:+ saran wrap+ Ornament hooks (usually sold in the 100 packs for 99 cents)Misc. things you will also need:+ popcorn popper or large pan and oil to make popcorn+ 2-3 large mixing bowls+ Spatula and mixing spoon+ Microwave+ wax paper to set treats onOriginal Recipe from Chris Tanaka. First Wash Your Hands Good, because nobody likes food borne illnesses or diarrhea. Pop about 1/2 cup of popcorn kernels. I used a popcorn popper but you could also use a large pot and some cooking oil if you do not have one. Now use your hands to sift through the popcorn to make sure you there are no stray un-popped popcorn kernel. Transfer the sifted popcorn to a separate bowl. Measure out at least 3 quarts of popcorn, a little extra is fine. 
'WARNING!!!'Un-popped popcorn kernel that find themselves in the in the popcorn balls can crack teeth. So it a good idea to hand sift the popcorn from one container to another to avoid a painful and costly accident. . Add 6 Tbsp. butter or Margarine, 3 c. tiny marshmallows, 1/2 package of 3oz. (3 Tbsp.) Jello mix in a bowl. You can use any jello flavor or color you like. I made a double recipe of lime and a single recipe of cherry jello. Put mixture in the microwave for 40 seconds. Mix the ingredients with a spatula and microwave again for about another 40 seconds. Make sure all ingredients are mixed together thoroughly and the butter and marshmallows are melted. You may need to put it in the microwave another 20 seconds or so but try not to over microwave or they may not stick together. It should have a thick syrupy texture.You will have to work quickly. Pour the mixture over popcorn and mix well with a large spoon.Once the popcorn and mixture is thoroughly mixed, butter your hands and start forming popcorn balls. The butter prevents the sticky mixture from sticking to your hands and adds a hint of buttery goodness. I think one recipe would make about 10-15 balls, but it really depends on how big you make them. WARNING!!!'Un-popped popcorn kernel that find themselves in the in the popcorn balls can crack teeth. So it a good idea to hand sift the popcorn from one container to another to avoid a painful and costly accident. . Of course you could just eat these they way they are, but they would also make great Christmas ornaments. You will need some saran wrap and ornament hooks to complete this project.Double wrap each popcorn ball with a small piece of saran. You could single wrap them but the problem is that saran wrap can come off easily and then your popcorn balls will be on the floor. 
Now stick the hook through the saran wrap and hang them on the tree.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_162_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_162_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_162_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_162_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Recipe: For 2 personsIngredients: For the grilled courgettes & aubergines:2 medium courgettes, cleaned, thinly sliced 2 medium aubergines, cleaned, thinly sliced 2 tablespoons chopped lemon thyme, no big stalks, please! 3 fat cloves of garlic, peeled finely chopped a fruity extra virgin olive oil black pepper pink salt in a grinder For the oven roasted potato fries: 4 white potatoes, peeled & cut up into thicker fries black pepper pink salt in a grinder that same fruity evoo, as used above For the feta: marinated feta chunks, drained For the lemon thyme dressing: 6 tablespoons of that same fruity extra virgin olive oil 2 tablespoons fresh lemon juice 1 tablespoon chopped lemon thyme black pepper pink salt in a grinder. Marinate the sliced courgettes & aubergines for about 2 hours with the marinade ingredients. Take a small bowl a add lemon juice, chopped lemon thyme, black pepper, pink salt, chopped garlic pieces & that fruity evoo. Mix well with a small whisk & pour this all over the sliced courgettes & aubergine slices. Brush it all on with a silicon brush.. 
About 40 minutes before dinner, preheat your oven to 200\u00b0C ( 400 F) for 10 minutes. In a fitted oven dish, arrange your potato fries. Scatter 5 grins of black pepper & 5 grins of pink salt over it. Drizzle about 6 drizzles of that fruity oil over them. with clean hands, mingle everything together & roast for about 25-30 minutes into the hot oven.. Heat up onto high heat your grill pan. When hot, brush your slices with the same marinade on one side & place them onto the hot grill. With a fork, push the slices onto the hot grill. Grill them for a few minutes & then brush this upper side with the marinade & turn them over to grill the other side. This side only takes a minute or 2. Place them onto a big plate layered with double kitchen paper to dip off the excess of oil. Place another big plate over it to keep warm.. While the potato fries are in the oven & your grilling is nearly ending, make your easy dressing! Take a jam pot with fitted lid & add all ingredients for your dressing in it. Screw lid on & shake your jam pot up & down for a few times. This way, your dressing is well mixed. Taste! You must taste the lemon & the chopped thyme! Yum yum!Plate up, like photo above & just before serving, drizzle a bit of the yummy dressing all over your dish, in a circle! MMMMM! This is lovely enjoyed with a crisp white wine, like this one 2009 Jacob\u2019s Creek Riesling Reserve, South Australia. A perfect partner! Enjoy, sweeties! 
xxxYou can also find this tasty recipe here on my blog: http://sophiesfoodiefiles.wordpress.com/2016/10/16/vegetarian-grilled-veggies-dinner/\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_163_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_163_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_163_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_163_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 4 boxes of lasagna noodles\n4 1 lb. packages of firm tofu\n4 1 lb. packages of soft tofu\n1 box or large bag of spinach leaves washed, dried and julienne\nolive oil\njuice from two lemons or 1/3 cup lemon juice\ngranulated garlic\nsalt and pepper to taste\n1/2 cup of soy milk\nand some homemade lasagna sauce\u00a0 https://www.instructables.com/id/Spaghetti-Lasagna-Sauce/\nfor the veggie wash you will also need some baking soda\na food processor is great but if you don't have one a sieve or potato masher will work as well. Drain the tofu. Don't discard it. I put it with my water that I use for my garden. Chop it up and set aside.\nIn the food processor you are going to add:\ntofu\nlemon juice\nmilk\ngranulated garlic\nsalt and pepper\nI did it in little batches. Add more milk or lemon juice if you need to. If you can only find extra firm tofu, you will have to add more lemon juice and milk.\nAfter it is all blended and smooth add the spinach. Side note: I wash my veggies with a bowl of water and add a couple or more tablespoons of baking soda. 
I let the veggies sit in the water for about fifteen minutes and carefully remove them so that the yucky stuff stays at the bottom. For the spinach I just laid it out on a kitchen towel to dry before I cut it up.. \nCook the noodles in small batches to prevent them from breaking. But if they do don't worry about it. I cook the noodles last because if they sit too long they tend to stick together. Even if you add oil to the pot of water.\u00a0 If your water is boiling nicely it should only take from 8-12 minutes for them to cook.. \nNow for the fun part!\nYou are going to layer the ingredients in the following order till it is all gone!\nolive oil\nsauce\nnoodles\n\"cheese\"\nrepeat\nBake at 350 degrees till it is all heated through. Probably about thirty minutes. I top mine with equal parts bread crumbs and parmigiana cheese before baking.Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_164_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_164_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_164_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_164_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. -CAJUN SPICE MIX-1 tsp Cumin2 tsp Salt2 tsp Garlic Powder2.5 tsp Paprika2 tsp Black Pepper1 tsp Onion Powder1 tsp Cayenne2 tsp Oregano-FISH FILETS-4 Medium Tilapia FiletsOlive OilCajun Spice Mix-CAJUN RICE-2 TBSP Butter2 tsp Cajun Spice Mix1 Cup White Rice1 Celery Stalk2 TBSP Fresh White Onion-CREAM SAUCE-4 oz. 
Crab Meat2 TBSP Butter1 Cup Heavy Cream2 tsp Cajun Spices. Combine your spices in a bowl. This will yield about 1/4 cup. Keep the remainder in an air tight containerThis is my take on a Cajun spice mix. It is a mildly hot mix that can easily be changed. Feel free to modify the spice levels to your liking.. Tilapia is a fairly mild fish, so it works quite well with this recipe.Preheat the oven to 450 degrees Fahrenheit (230 C).Line a baking pan with aluminum foil.Pat the fillets dry and brush on a light coating of olive oil. Liberally apply the spice mix to both sides of the fish and place on the pan.Place the pan in your oven and bake for 12-15 minutes (thickness of fish will vary cooking times). The fish is done when it flakes easily with a fork.. Finely chop the celery and onion.Place your skillet over medium to high heat. Add 2 TBSP of butter to the skillet and saute the celery and onion.Add the rice and brown very slightly. Add the chicken stock and Cajun spice then simmer until done (app. 15 minutes). Place the butter and cream in a saucepan over medium heat, stirring constantly. Drain the crab and put it in the pot. Add 2 tsp. Cajun seasoning and continue to stir. Watch it closely because it will boil over if you do not constantly stir it! Simmer for 5-6 minutes. Remove from heat and allow to cool slightly. -note- If you want a thicker sauce, add 1 tsp of corn starch to water, then add to the sauce while cooking. . Place the fish fillets on a bed of rice. 
Add the sauce and enjoy!Once again, thanks for reading!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_165_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_165_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_165_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_165_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. So this is a pretty basic bread except for the extra stuff. I have changed it some from the original one I tried. Among other things I substituted a cup of whole wheat flour because I like it. You start with beer. Now the original said 10 oz of beer but I found that the final mix was to dry with that amount. I think that is because my flour is dry and I always end up adding extra water to almost any recipe. Why would my flour be dry? Well, the inside humidity is about 15 % at the moment and it sucks the moisture out of everything. My cats fur lights up with all the static charge they develop. So it needs the extra water, so why not use more beer?The type of beer? Well I tried one of those dark strong flavored ones but to me the bread had a bitter taste from it. Other people didn't think so, so basically go with what you like. For a mild flavor use a mild beer. if you like the strong beer then go with that. The beer needs to be flat and warm. In other words you need to ruin it. I found the best way is to first take a sip, and then pour the rest into a bowel or cup and use a spoon to stir it until a lot of the bubbles leave. 
Then microwave it for a little bit until its warm. Stick your finger in it and taste it and if you lament about having ruined it then its just right. Yum, warm flat beer. It should be a little less than 12 oz, unless your sip was more like a glug. Now just leave it alone and put the rest together.. You need 3 cups of flour. I used 2 of a bread flour and one of whole wheat. I think it has a little more body that way.Interestingly I had just put some wood in the wood stove when I started this loaf and I had some sawdust stuck to my shirt. When I plunked the bread pan down on the table some of the saw dust fell into the empty pan. And I had a moment of inspiration --- I wonder if I add a tablespoon of sawdust to it can I enter it into the wood contest? I of course had to look up information about sawdust and bread. It has happened and been done. In fact around the turn of the century, well the one before this one, it was not uncommon for it to be put in prisoners bread. It is undigestible and costs almost nothing but makes people feel like they are getting food. You might want to look up information on it, its very interesting. And now its being added to bread for dieters, refined cellulose, because it is undigestable but makes you feel like your eating something. But now its all labeled and such so you know what your getting.OK, no sawdust in this one.3 cups of flour1 tablespoon of sugar (I often use honey instead of or even with) 1 1/2 teaspoons of salt1 tablespoon of butter, or margarine12 oz of flat warm beer1 package of yeast, you can use bread machine yeast or quick rise yeast, whatever works the best for you.8 oz of cheese - yeah its a lot of cheese. I found the type of cheese has a big impact on the flavor of the bread. In one of my other tries, along with the dark beer I also used 4 oz of extra sharp cheddar cheese and 4 oz of Monterrey jack. Wow did that Cheddar flavor come through. If you want strong flavor go for it. 
For this one I used mild Cheddar and Monterrey Jack, half and half.Cut up the cheese into bit size blocks. . I have read bread machine instructions where they say to put in ingredients in a certain order. I never saw the point since it all gets mixed up in the first few seconds anyway. So I start with one cup of flour and then melt the butter and add that along with the other dry stuff, add some cheese and then add some of the beer. Add another cup of flour on top of that and add the rest of the cheese and some more beer and then the rest of the flour and beer. As long as it all gets in there, that is what matters. . This is the easy part, compliments of the bread machine. It does all the work.Set the program for a basic bread. I always set the crust to light, maybe that is just because of my machine. Do it the way you like it. Mine takes 3 hours to run the program.Push start.This is the one important part ----After a few minutes when it is all mixed check to see what the ball looks like. If it is to dry there will still be a lot of flour left that is not formed into the ball. You need to add water a little at a time until everything is picked up into the ball. If it is too wet it will have stuff sticking to the sides of the pan and looking a little gooey. If it does this you need to add a little flour a spoon at a time until nothing is sticking to the sides of the pan. You should have a nice solid ball that doesn't stick to the sides. You will notice that the cheese gets chopped up as the mixing progress. Pretty soon you wont even be able to see any pieces of cheese at all. It all gets blended into the ball. . After 3 hours, the machine beeps and you get to take out your finished loaf. All that is left is to cut it up, put a little butter on it and enjoy it. Wow, that is good stuff. Make a sandwich by adding a few slices of bacon and you have a complete snack. Or if your into hot stuff you can dip pieces into hot sauce. However you like it. 
By the way, in case your wondering, all the alcohol is evaporated out of this in the baking process so its fine for anybody to eat including children. \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_166_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_166_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_166_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_166_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. First off we want to mix together the cinnamon, allspice, flour, and brown sugar.. Cutting the apples into the pi shape proved to be pretty tricky. The best way I found was to first core the apple, peel it, cut the apple into thin circular slices. Stack the slices on top of each other to resemble the original apple shape and begin carving. The spare scrap pieces of the apple will be used for the base. Toss the apple slices in with some lemon juice so as to prevent oxidation. Ginger ale will also do the trick.. Preheat the oven to 400F\nPour the dry ingredients in with the apples and evenly cover. Place the apples(only the scraps) into a pan and cook on low heat until mixture is soft and sticky.\nLet cool to room temperature.\nIn a Pie pan, unroll one of the pie pastries. Pour the cooked scrap filling into the pie shell. Top with the Pi shaped apples.\nTake the second pie pastry and slice into even lines, this will be the top of our pie. 
It will be woven in such a way to resemble a lattice.\nPlace in oven and bake until a golden brown crust has been achieved (about 15-20 minutes).. Happy Pi Day!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_167_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_167_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_167_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_167_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Take the water boiler and pour in your water. Be careful not to spill any of it!. Plug in your water boiler and wait for the water to boil. This could take a moment, just look and listen very carefully!\u00a0. Take your cup and insert a teabag to your choice. Fold the string around the edge of the cup, so the little cardboard at the end lies on the surface.. Now it is time to add the hot water! Pour it in slowly and carefully, so there is no risk of burning yourself or somebody else near you!. The color of the water you just poured in, will change color gradually. The long you wait, the stronger the flavour of your tea will be. Precise timing is essential here!. This step is completely optional. Take a spoon and stir the tea-in-the-making. This will help to:\n-Cool the tea\n-Mix in the flavour very well.. Listen up! Here comes the hardest part: be sure not to add sugar while stirring the tea. This could take some effort, but it will assure you an original and unaffected flavour sensation you have never tasted before.. And voila! 
You now achieved a nice warm cup of tea, without any sugar!\nEnjoy and be sure to blow well before drinking. This will cool the tea efficiently.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_168_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_168_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_168_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_168_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. You Will Need:\n1 cup flour\n1/2 cup butter\n3/4 cup romano cheese\n3 tsp paprika\n2 tsp poppy seeds\n1/2 tsp cinnamon\npinch salt\nOven preheated to 350\u00b0F. In a large mixing bowl, cream together 1/2 cup butter and 3/4 Romano cheese until blended as smoothly as possible.. Add 2 tsp paprika, 1/2 tsp cinnamon and a pinch of salt to the butter-cheese mixture and blend until smooth.. Add 1 cup flour and mix - the dough will be rather dry and slightly tough.. Separate the dough into 18 equal parts and roll into balls, then press and flatten slightly.. Using a sharp paring knife, cut an X into each ball. Only cut about halfway down to avoid splitting the dough balls into quarters.. Press a small hollow into the centre of each ball using a finger or other utensil. Dust the dough balls with approximately 3/4 tsp paprika.. Carefully sprinkle a pinch of poppy seeds into the centre of each ball and brush any strays into the hollow you made with your finger.. Bake in an over at 350\u00b0F for 15-20 minutes. 
Dust baked puffs with approximately 1/4 tsp paprika to give them a brighter colour (optional). The resulting puffs will be richly cheesy and spicy. Very decadent and filling, they make a great appetizer. Serve during autumn events between poppy season in late summer and Remembrance day in November.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_169_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_169_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_169_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_169_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Ingredients: 4 Tbsp Whole Oat Groats 2 tsp Whole Chia Seeds1/3 cup Water2 Tbsp Milk1 tsp sugar (optional)Frozen Fruit for topping Appliances: Coffee bean grinderMicrowave safe bowlPlateSpoonKnife . Place chia seeds and groats into coffee grinder until mixture is of a powder consistency with minimal chunks. . Place mixture into bowl and stir in water. Microwave for one minute on high. Remove and stir again. It should currently have a dense consistency with a thin layer of excess water. Cover with a towel and wait 15 minutes for the water to soak in fully. . Remove towel and stir. Microwave for another minute on high. . Add sugar and fold until dissolved. Chop up fruit and fold in to add flavor and color. I prefer apples, coconut, and black raspberries. Guide substance to the middle of the bowl leaving room to make a moat of milk. Lastly....enjoy! 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_170_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_170_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_170_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_170_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 10 (or however many can quench your insatiable hunger) Regular Oreos\n1 Bottle of Vegetable Oil\n1* Box of Pancake Mix\n1 Mixing bowl\n1 Saucepan\n* Instead of pancake mix, you can just substitute your own pancake batter or even waffle batter.. Whip out your favorite dry pancake mix (or wet pancake mix, but I doubt that exists) and use your eyeballs to read the instructions on it. Use everything it tells you. Don't assume that \"Oh nobody will know if I skimp out on using two eggs, and instead use this fake egg solution I found at the grocery store!\" They will know.\nWhen you are making your pancake batter, use more flour than it calls for, or less milk. You want the batter to be thicker than normal that way it clings to the oreos. I also threw in some vanilla extract and cinnamon for taste.. Pour in just enough oil to totally cover the Oreos and to give them room to move around without hitting bottom. Set your stove-top to medium-high (ours went to 10, and I had it on 7). Using a candy thermometer (or any thermometer, I won't tell anyone) check the temperature every few minutes. When that bad boy reaches 280 Fahrenheit (138 Celsius), your ready to fry!. 
Grab two or three Oreos and toss them in the pancake batter. Flip them about and make sure that no part of the Oreo is left uncovered. Make sure that you don't leave them in the batter for too long. Say the phone rings and you step away for a few seconds, that's okay. However, if your oil catches fire and you waste time trying to put it out, shame on you. Those Oreos could get soggy!. Now, this is where it gets difficult. Throw your battered Oreos into the molten oil and they should start floating around like little ghosts. Try not to put more than four or five in at a time because it gets very difficult to keep up with each one, and they may start plotting to destroy you.\nContinuously splash hot oil on the top sides of the Oreos while they are cooking. This makes sure that they cook throughout (at least, in theory)\nWatch the bottom of the Oreos. When they reach a rich, golden brown color, flip them over with your tool of choice.\nContinue to watch your Oreos while still splashing them with oil. When the entire oreo is a dark golden brown, (emphasis on GOLDEN, not a dark brown) they are done. Pull them out and place them on a paper towel-topped plate. Give yourself a pat on the back, 'cause you just fried an oreo.. Let all your Oreos cool down before indulging. I say cool down, I just mean don't dig into them five seconds after taking them out of the hot oil. Shove as many in your face as physically possible. These things are like ambrosia. Call your friends, call your neighbors, call your lawyer, call your doctor, call the emergency room. 
Share these artery-clogging monsters with anyone you can, because I can promise you, you cannot eat them all.\n**A note from the editor-\nWe tried frying other things while making this instructable:\nGolden Oreos - As good as their chocolate counterparts.\nSnickers - The chocolate melted but they are extremely tasty\nReeses Cups - The chocolate and penut butter melted and mixed together to make a delicious concoction.\nButterfingers - Terrible idea. The insides of the Butterfinger caramelized to create a sticky, hard mess.\nRaw Pancake Batter - Turned out sort of like a funnel cake. Very good.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_171_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_171_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_171_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_171_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. You will need to order a big ole bucket of chicken.\nAny fine dining establishment will do, as long as the product is served out of a paper bucket.\nYou will need an oven mitt. Get one that you can cut up. Do not use mom's or you will be forever banished to your room.. Figure out how you normally hold the bucket with the oven mitt.\nYou can do a side or top hold but the easiest is with the oven mitt under the bucket.\nPlace the bucket down over the mitt.\nCut a slit where your hand would pop out of the oven mitt. The cut should be small enough to be covered by the bucket. 
It is best to make the hole just big enough for your hand to squeeze through.\n. Turn the bucket over.\nOutline a hole that will be as big as a clenched fist.\nCut the hole out with a utility knife or just poke a starting hole with a pair of scissors and cut around.\nDiscard the cut out piece of cardboard.. Stuff the front of the oven mitt with paper towels, tissues, etc. to fill out the thumb and finger parts of the oven mitt.\nStick your hand through the hole.\nYou may need to keep your index finger in the oven mitt to support the oven mitt under the bucket while the rest of the fingers go through, kinda like a baseball glove.\nPlace the bucket on top of the oven mitt and place your hand through the hole in the bucket.. Place the Original greasy contents back into bucket, Extra Crispy if that's what you got.\nTo do this \"trick\":\nWalk into the crowded room holding the bucket of chicken.\nPull out a piece of chicken and exclaim \"Dibs on the leg\" or something. And say \"Who wants the last piece?\"\nHolding the bucket tilted slightly away from the intended victim or whoever is hungry so that they cannot see inside, let them reach in deep to get the last piece of chicken. \nWhen they reach in, grab their hand and listen to them try to break away and scream.\nIt may help to add a bit of fake fur fabric or real fur to the bottom of the bucket if you have it. That adds to the wonder of what kind of animal was in there.\nHave fun! 
Muhahhahahahahahhahah!!!!!!!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_172_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_172_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_172_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_172_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Serves: 4Prep time: 10 minutes Chilling time: 30 minutes2 ripe bananas1 cup chocolate flavoured coconut milk1/2 cup coconut oil, melted. Gently melt the coconut oil by placing it in a heat proof bowl set over a pan of steaming water with the heat turned off.This method melts the coconut oil but should keep it below 42\u00baC which in turn keeps it in it's raw form and the nutrients in the oil intact. Keeping the temperature low will also help the mousse set faster.note: you can also melt the coconut oil in the standard way and the mousse will still work.. Place the chocolate flavoured coconut milk and the banana into a high speed blender and blend until smooth. Slowly pour the melted coconut oil in through the top while the blender is running. The mousse should whip up and turn a lighter chocolate colour.. Pour the mousse into serving bowls and place in the fridge to set for 30 minutes. . Serve the mousse chilled, topped with banana slices, coconut flakes and cacao nibs. 
The mousse will keep in the fridge for up to 5 days, but is best eaten fresh on the day it is made.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_173_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_173_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_173_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_173_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 15-20 red potatoes1 bunch green onions1 strip of bacon for every two potatoes, I used 8.sour creamshredded cheddar cheesefresh garlicolive oilmustard powder (optional)smoked paprika (optional)salt and pepper!\nPlus, a sheet pan for roasting the taters, a cutting board + a knife and a big bowl to mix it all together. :). \u00a0Turn the heat up to 400 degrees F.\u00a0\nI normally just scrub mine and then leave them on a baking rack to dry most of the way. :). \u00a0Cut the smaller potatoes into eighths. Cut the larger ones to match the size of those pieces.\u00a0\nThe most important part here is getting them all close to the same size!\nOnce they're cut, put them on the sheet pan and season generously with salt and pepper. Then, drizzle on a bit of olive oil and toss them so they're all a wee bit shiny.\u00a0\nThese will go in the oven for 15 minutes!. Trim off the tops and bottoms.\nThen cut them into 1/2 inch wide slices. Put them aside in the big mixing bowl. :)\u00a0. \u00a0Cut the bacon into 1/2-3/4 inch strips.\nMake sure all the fat has been cut through all the way. 
:)\nYour potatoes should be done with their first 15 minutes now.... \u00a0They should be getting slightly colored and wrinkly now. Just give a good toss and put them back in the oven.. \u00a0Get a large skillet and put it over medium/high heat.\u00a0\nCook the bacon until the meat turns dark red and the fat goes a lovely orange color. Keep in mind it'll cook a bit after you pull it out of the pan so don't overdo it!\nYou may need to drain the fat a couple times while you're cooking - just pour it into a bowl or mug and let harden. You can either cover this and keep it in the fridge or chuck the solid fat into the trash. Your choice. :D\nDrain the bacon on paper towels.\u00a0. Empty all of the bacon fat from the pan except for a little less than a tablespoon or so.\u00a0\nChop up a few cloves of garlic and saute them over medium heat until they get golden brown on the edges - be careful not to burn them! Keep the garlic moving!\nOnce the garlic is done, scoop it onto the paper towels. Notice you've also\u00a0accumulated lots of yummy bacon fat and bits. NOM.\u00a0. \u00a0This time they should be even more brown and wrinkly and smelling pretty fancy.\nToss them around and put them back in for the last 15 minutes. :D. \u00a0The bacon and garlic will still be warm but not hot and everyone will mingle in the bowl and get delicious.\u00a0\nNow we just have to wait on the potatoes.. \u00a0let them cool for a bit (5-10) minutes or so and add them to the bowl. Mix it all together. :). I added a couple of handfuls of cheddar cheese, about 8 oz. (a small container's worth) of sour cream, a good shake of paprika and a couple pinches of mustard powder.\u00a0\nThese amounts are really up to you. Just mess with it and go with your\u00a0taste buds!\u00a0\nMix this all until everybody's happy and coated with deliciousness.\u00a0. \u00a0Or chill! It is also good cold.\u00a0\nI just made sure mine had enough salt and pepper at this point. Not much else to do but eat now!\nEnjoy! 
^_^\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_174_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_174_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_174_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_174_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Preheat the oven to 325 degrees Fahrenheit .. Take out all the supplies and put them on the counter. . Put the 2\u00a0sticks\u00a0of\u00a0Butter in a bowl to soften.. Put the Butter (once it is softened) into the mixer.. Plug Mixer into the Electrical Outlet, then measure out 1 Cup of Sugar. Pour the Sugar into the Mixer.. Add 1 Egg to the mix.. Turn the mixer on 2 until well mixed.\u00a0. Once mixed flip the power to 1.. \u00a0Measure and add 3 Cups of Flour to the mixture .. Once the Flour is added, put in1 Teaspoon of Baking Soda into the mixture .. The last part of mixing is to add1' Teaspoon of Vanilla. Then mix until one solid mixture , not separate ingredients in a bowl.. Freezing is an optional thing that lets you freeze the dough until it is needed to be rolled and baked.. Grab a hand full of Flour and spread it out on the counter in about a1x3 foot space(use more Flour if needed).. If you decide not to freeze than you can go ahead start rolling the dough out, in a pushing/ pulling motion all the while pressing down.\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Tip: If the dough is sticking to the rolling pin then you can rub Flour over the rolling pin.. 
Roll the dough until about 1/2 centimeter thick.. Get out the cookie cutters and cut out the cookies by placing the cutters over the dough and applying pressure.\nTip: Make sure the sharp side of the cookie cutters are pointed towards the dough.. Once the cookies are cut use a small spatula to transfer them out of the dough and onto the stone tray.. Place the tray into the oven. Set the oven timer to 10-12 minutes and hit start(or let it start on its own). . After the timer has gone off, put on a oven mitt and take out the tray.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_175_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_175_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_175_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_175_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. \n For the cup and saucer:\n\t\tAbout 100g dark chocolate (per cup)\n\t\tThin cardboard\n\t\tAcetate\n\t\tClingfilm\n\t\tSaucer\n\t\tTeaspoon (made from one piece of metal or plastic)\n\t\tScissors\n\t\tSellotape\n\t\tPaintbrush\nFor the mousse filling (this will make enough to fill about 4 cups):\n\t\t100 g dark chocolate\n\t\t1 egg yolk\n\t\t2 egg whites\n\t\t20 g caster sugar\n\t\t2 tbsp strongly made instant coffee\n\t\t(optional 1 tbsp of Kahlua)\n\t\tWhipped cream to finish. Start by making the central piece, the cup. To make the cup, first make the mold. Cut a strip of cardboard, roll it into a short, squat tube and secure it with a couple of pieces of tape. 
Test the size by sitting it on the saucer. I used a saucer 12 cm in diameter, and piece of card 18 x 5 cm, with about a centimetre of extra length to overlap for a secure fix.\nCut two pieces of acetate the same size as your piece of card. Wrap one of these around the tube, and secure it with some tape. This will provide a nonstick surface to mold the chocolate onto.. Now you have your mold, here comes the messy part. Melt some chocolate. Take the second piece of acetate, lay it on a surface you don't mind getting a bit splattered and spoon some chocolate on to it. Brush out the chocolate until you have an even and fairly thick layer across all of the acetate.\nNow pick up the acetate and wrap it around your mold. Make sure the bottom edge all lines up, as this will help seal in the mousse, and fill in any gaps where both ends meet by brushing on a little more chocolate.\nLeave this to set. Once this has done so very gently peel away the acetate from the outside, and add another layer of chocolate in exactly the same way.. Next make the saucer the cup is going to sit on. To do this, first cover the underside of the saucer in clingfilm. Smooth it out as much as possible, to create a nice surface for molding. If the clingfilm slips, secure it by first brushing the surface of the saucer with a little oil.\nNow, melt some chocolate and pour it onto the saucer. Spread the chocolate across the saucer with a paintbrush until it covers the whole thing in an even, fairly thick layer. Run your finger around the edge to even out any drips.\nLeave this to set. Once it has apply a second coat of chocolate in exactly the same way, and leave that to set.. Once everything has set, this is the scary part, unmolding all of this chocolate.\nFor the saucer, unwrap the clingfilm from underneath and lift it and the chocolate very gently from the saucer. Peel the clingfilm away from the upper side of the chocolate saucer. 
Place the saucer on a serving plate.\nTo free the cup, first peel off the outer layer of acetate. Now carefully cut the sellotape securing the card and twist the card in on itself, rolling it into a tighter tube. This should make it small enough to remove without cracking the chocolate. Finally, peel away the inner layer of chocolate.\nTo prepare the cup for the mousse, spread or pipe a little melted chocolate onto the saucer and place your cup on top. Fill in any obvious gaps with more chocolate and let the whole thing set while you make the mousse.. To make the chocolate mousse, melt 100 g dark chocolate and stir in the coffee (and kahlua if you are using it). When the mixture has cooled a little, stir in the egg yolk.\nPut the two egg whites in a clean bowl and, using an electric whisk or a lot of elbow grease, whisk the eggs until they form soft peaks. At this stage pour in the sugar. Continue whisking until stiff peaks form and the mixture is glossy.\nVery gently fold the egg whites through the chocolate mixture until fully combined. Spoon the mousse into the chocolate cups, wiping off any spillage.\nLeave this in the fridge for a couple of hours to set.. While the mousse is setting, make the spoon and the handle.\nTo make the spoon, cover the back of the teaspoon and its handle in clingfilm, in the same way as the saucer was done earlier. Brush chocolate onto the clingfilm in a thick layer and leave to set. This will need two or three layers to make sure it's strong enough.\nOnce all the layers have set, very carefully peel the clingfilm off the spoon, and then off the back of the chocolate, Put it somewhere cool to store until the dessert is served.. For the handle pour a little melted chocolate into a piping bag and pipe a thick question mark shape onto some acetate or a similar non stick material. 
Once that has hardened, pipe another on top of the first layer to build up a little thickness.\nLeave this somewhere cool and safe to store until the dessert is served.. Once the mousse has set the assembly can be finished.\nFirst, carefully secure the handle to the cup with a little melted chocolate. Place the spoon on the saucer.\nTake some whipped cream, and spoon it on top of the mousse.\nFinish off with some grated chocolate, or chocolate shards.\nNoe all that is left is to gather some guests, to impress them with your chocolate confection. Then dig in and enjoy.. \n\t\tIf not being used immediately, the cups can be stored in the fridge until they are needed.\n\t\tI haven't given quantities for how much chocolate makes each part of the cup, since it will depend on the size of your molds. I just melted a little of the chocolate at a time, since not much is needed for each layer. Any leftover chocolate can be remelted for next time. Of course, if you are tempering the chocolate this could make the whole thing a time consuming process.\n\t\tIt's best to make a few spoons and handles spare, as they are the most fragile components of the whole thing.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_176_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_176_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_176_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_176_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 
You will need the following materials:\n-Pop Rocks (I used two packets of strawberry Pop Rocks, priced at $0.99 each.)\n-Chocolate (I started out with two $0.99 chocolate bars, which ended not being enough, so I added several cups of chocolate chips.)\nYou will also need these tools:\n-Something to melt the chocolate in (You can use a microwavable bowl in the microwave or a double boiler. I used the microwave.)\n-A mold (I used plastic containers, you could also use clean soap molds, ramekins, or a sheet pan.)\n-Something to line the mold with (use parchment paper for sheet pans, and cooking oil or cooking spray for anything else.)\n-A whisk\n-A spatula. Like I said in the previous step, spray the molds with cooking spray or brush it with oil. Line your mold with parchment paper if you are using a sheet pan.. Microwave the chocolate in a\u00a0microwavable bowl in thirty second\u00a0intervals and mix it around to get it a consistent texture. I started with two\u00a0chocolate\u00a0bars and quickly realized that this wasn't enough and added in a lot of chocolate chips.. After the chocolate is thoroughly melted, I poured in one packet of Pop Rocks, and had a minor panic attack: the Pop Rocks were popping in the chocolate! I suppose it inevitable, but I wasn't sure if the chocolate bars would end up being any good with most of the Pop Rocks having been already popped.. Pour the chocolate into the molds and use the spatula to make sure the mold is evenly coated.\nAfter pouring the chocolate, open up your second packet of Pop Rocks and pour them all over the top of the\u00a0chocolate\u00a0in the molds.. Grab your molds and stick them in the freezer. I'm sure a fridge would work fine, but a freezer is probably faster. Now wait a few hours.\nBe patient, young grasshopper.. The results? A good chocolate bar with a slight strawberry taste and a small pop. It doesn't have as much pop as I would have liked, but it's still there. 
I had some trouble getting the bars out of their molds (I ended up using a paring knife to pry them out), so next time I would go with a sheet pan and\u00a0parchment\u00a0paper. If you do try this, let me know what you think.\nEnjoy,\nNoahh\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_177_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_177_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_177_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_177_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Cinnamon SnickerPoodles2 3/4 cups flour1 tsp baking powder1 1/2 cups sugar1 cup room temperature butter1/4 tsp salt2 large eggs2 tsps vanilla1 tsp cinnamon1/4 cup sugar for dustingMakes 32 poodles. Print out 4-8 sheets of the attached SnickerPoodle template. (8 if you have 4 baking sheets, 4 if you have 2 and will be making two rounds on the same ones.). I like to measure out all my ingredients before I get started mixing so there's less chance that I'll forget to add something!Make sure there's a rack in the middle of your oven and preheat it to 350 degrees F.. In a medium sized bowl, blend the butter and sugar with a spoon or hand mixer until well mixed. . Add the vanilla and eggs to the bowl and continue to mix until smooth.. Add the baking powder, salt, and cinnamon and stir well.. Slowly sift in the flour, mixing thoroughly until it has all been added. Be sure to stir all the way to the bottom of the bowl, getting any hidden flour bits. 
The dough will look similar to the last picture above when properly mixed.. On a large baking sheet(s), place two print outs of the poodle templates.Cut a piece of parchment paper to fit just inside the edges of the baking sheet(s).. Set yourself up with a little dough ball making assembly line. Place your dough bowl next to a prepped baking sheet and have a small bowl with the 1/4 sugar for dough ball dipping nearby.. You'll be making small balls of dough that are about the size of the thinner inner circles on the templates. The dough will expand as it cooks to grow to the size of the thicker outer lines, connecting ('gluing') the poodle ball sections together.Make the balls by rolling small pieces of dough in between your hands, checking the sizes against the template as you go. Once you've achieved the correct size, dip JUST THE TOP of the balls in sugar and place side sugar up on top of that particular ball on the template. Flatten each a bit with the heal of your hand. Your pieces should be about 1/4\" thick.NOTE: The tiny balls that connect the tail and make up the feet don't need to be dipped or flattened.. Make sure to fill in all the circles with dough balls for each SnickerPoodle!. To shape the heads, use a sharp paring knife to follow the cut (dashed) lines in order to create the poodle nose profiles.. Once you've filled in all the circles and cut the faces out, this is what they should look like!. Before baking, please gently remove the print outs by holding the parchment on the opposite side and slowly sliding them out from underneath.. Bake one sheet at a time on the middle rack of the oven for 8-10 minutes. You want them to be only lightly browned on the bottom. Overcooking them will result in less inner chewiness. (read: less awesome)Once removed from the oven, let cool on the baking sheet to allow the cookies to crisp up. Trying to remove them before they've cooled can result in the poodle ball connections breaking apart. 
(= broken Snickerpoodles)Repeat with the remaining sheets of poodles. . These would be great for a French themed engagement party or for just any old reason. Have fun! and if you try making them, please post photos to the comments section below and click the 'I Made It' button!Bon Appetit.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_178_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_178_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_178_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_178_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. INGREDIENTS: \n5 cups of flour (unbleached all purpose works well)\n2 cups of warm water (100-110 degrees F)\n2 teaspoons yeast (instant or dry)\n2 teaspoons salt. \nIn a mixing bowl, mix 1 cup warm water, 1 teaspoon yeast and 2 cups of flour. If you are using dry yeast, let the yeast bloom (soak) in the water for a few minutes before adding the flour. Cover the mixture, and let in sit overnight. (For a stronger sour dough-like flavor, let the starter sit for a couple of days).Note: I have found that covering the starter with a cling wrap and then a tea towel helps keep the starter from drying out. You will have to scrape a little of the starter off of the wrap, but it makes further mixing easier.. Add 1 cup warm water, 1 teaspoon yeast and 2 teaspoons salt to the starter. Mix well. Add remaining flour a cup at a time and mix in the flour. 
I have found it useful to hand mix (with clean, bare hands) to finish mixing, wither using a mixer or not.\nNow knead the dough. If using a mixer, follow the machine's instructions for kneading. 2-5 minutes should do the job (I like going 4 to 5 minutes). If kneading by hand, knead the dough for 5-10 minutes.. Form the dough into a ball, and cover (I reuse the cling wrap and the towel). Now let the dough rise for 3 hours.Note: The dough will rise best if the room temperature is over 70 F.. \nNow, beat down the dough, and knead it once again (with a mixer, knead of a minute or so. By hand, knead for a couple of minutes). Shape and roll the dough into a ball. For a thicker, harder crust, roll the ball in flour (My wife prefers the bread with a thinner crust).\nCover the ball with a tea towel, and let rise for 45 minutes. (I preheat the oven to 400 F after 30 minutes).. Score the top of the bread. I like to make a cross on the top (like a shepherds' bread), but you could score parallel lines in the top or skip the scoring (the crust will rip open any way).\nNow place the bread into an oven preheated to 400F, and bake for 30-35 minutes.\nRemove the bread from the oven, wrap it up in a tea towel, and let it rest for 10 to 15 minutes.NOTE: I bake the bread on a pizza stone, but before we had one, we used a cookie sheet.. For a whole/multi grain bread use 2 cups whole wheat flour for the starter and add 2 cup of the wheat flour and 1 cup rolled oats (dry) when mixing. Rolling the dough in oats instead of flour for step #6 really bumps up the presentation.\n2 or 3 tablespoons of fresh rosemary (chopped) makes a nice addition.\nBrush the bread with an egg wash before baking for a shiny, deep brown crust. For the egg wash, beat together 1 egg and 2 tablespoons of water.\nLastly, have fun with the size and shape of the loaf. Long french bread like loaves work well. Also, the dough can be divided into 2-4 balls and used as bread bowls for soup. 
Note: You will need to adjust the bake time.. Like almost all breads, this one is most tasty while it is still fresh and warm. Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_179_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_179_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_179_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_179_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. A dremel or other rotary toolA Pointed drill bit (see picture above)A cleaned out eggshell (this can be done by drilling two holes into opposite sides of the egg and blowing through one)Led's/Edible Slime(instructions to make can be found online) (optional)and thats it.... Hold the egg at an angle to the rotary tool and press slightly to make a hole, move it sideways and make a line... thats it. Light it up, if you want by just putting an LED inside it.OR.. fill it up with edible slime for a disgusting looking but delicious desert.(it takes some practice to make slime of the right consistency so that it drips out). 
Don't get frustrated if you don't get it the first time, it took me a few tries too.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_180_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_180_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_180_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_180_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 200gms whole Black lentils (urad dal) Kidney Beans (handful) 3 Onion (medium size) 5 Tomatoes (medium size) 3 tbsp Ginger and Garlic paste Chili powder Coriander powder Garam masala 5 tbsp Ghee Salt Coriander leaves 4-5 tbsp CreamFor the tadka50gms unsalted Butter 2 tbsp chili powder 1/2 tbsp Asafoetida 1 tbsp Cumin seeds 2 tbsp Garam masala. Soak the black lentils and kidney beans in water overnight. Boil the black lentils and kidney beans in a pressure cooker at high flame. After 5 mins turn the heat to low and allow the lentils to cook for 20 mins. After 25 mins turn off the heat and let the lentils rest in the pressure cooker.. Finely dice the onions and tomatoes. In a heavy bottomed dish add ghee (turn the heat to low). After the ghee is heated add onions and saut\u00e9. After the onions turn brown add ginger garlic paste and allow to cook for 5 min. After the raw smell of the paste disappears add the tomatoes. Add chili powder, coriander powder, salt. Let this masala cook for 7-8 mins (till the tomatoes have cooked and the oil separates).Now its time to add the lentils.. 
After all the pressure from the pressure cooker has been released mash the lentils lightly with a masher. Next add the lentils to the masala. After add the lentils, mix well. If your Dal is very thick add hot water to get the required consistency. Add salt and let the Dal come to a boil. Boil at high heat for 5 mins and then turn down the heat to low and cook the Dal for 30 mins. Every 5 mins make sure you stir the Dal to avoid formation of clumps. After your Dal has achieved a thick consistency turn off the heat.. In a small pan heat butter (at low heat). Add cumin seeds to the heated butter. Add chili powder, garam masala, asafoeida, coriander powder. Saute some fresh coriander leaves. After cooking the tadka, add this to the Dal.. After adding the takda, mix well into the Dal. Finally add cream, mix well and garnish with fresh coriander leaves.And thus your Dal Makhni is ready to serve. Serve Dal Makhni with bread, naan or rice.Enjoy !\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_181_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_181_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_181_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_181_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Add all the ingredients (except butter) to a mixing bowl and whisk away until well blended.(This first step is also great for the budding sous chefs to get involved in).. 1. Add about a teaspoon of butter to the pan and let it heat up (on medium heat).2. 
Pour a big serving spoon dollop of the batter onto the pan and spread it around the pan. If you find it difficult to spread it using a spoon, just lift the pan and move it back and forth to so that the batter forms somewhat of a round shape.3.And then let it cook for about a minute or so, or until you start noticing little bubble like holes starting to form . Carefully (using a wide spatula) flip the crepe to the other side and cook for another minute or so. The crepe should have a nice golden brown color to it.. 1. And then the best part - simply spread the nutella onto the crepe. (Spread away and make sure you almost get every part of the crepe covered.). Then, simply roll up the crepe and sprinkle with some powdered sugar if desired.ENJOY!!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_182_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_182_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_182_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_182_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. To make these 3 snacks you will need:For the dough:All purpose flour- 4 cups,Cooking oil- Half cup,Salt- half tea spoon, just for taste,Baking powder- half cup.To make the crispy snacks you will need- half tea spoon of nigella seeds,For the twisted-sweet snacks you will need- 1 tbsp of sugar,For the crunchy coconut snacks you will need- a cup of shredded coconut and 1 cup of sugar.. 
To make the dough you'll need- all purpose flour, cooking oil (I used soybean oil), a pinch of salt (for taste) and baking powder (check the ingredients step for amount).Mix all the ingredients together. Add water to make dough. Knead the dough very well. Keep kneading until the dough is smooth.Divide the dough into 3 equal portions (since we're making 3 different snacks).. Take any one of the divided doughs and add half tea spoon of nigella seeds to it.Knead the dough very well until the nigella seeds are mixed well.Make a smooth ball and roll it into a roti bread.Cut the roti bread into thin (1 inch width) slices and then again cut the slices diagonally into small pieces as shown in the picture of this step.Add oil to a frying pan and heat oil in high heat. After the oil heats bring the heat down to medium. Fry the pieces evenly on all sides and until they're brown. Use a slotted spoon to take them out of the oil.. You don't have to mix anything with the dough for this one. Make a smooth ball and roll it into a roti bread.Cut the roti into small pieces as shown in the picture of this step. Take any of the pieces and twist it, press the ends to secure the twist. Similarly twist the other pieces.In a frying pan heat oil in high heat and once the oil is heated brig the heat down to medium. Fry the twisted pieces evenly on all sides and until they're light brown. Keep them in a dry place while preparing the sugar syrup.In a saucepan add 1 tbsp of sugar and 2 tbsp of water. Stir until the sugar is dissolved and bring the mixture to boil.Add the fried twisted snacks to the thick sugar syrup and stir them until they are covered with sugar syrup evenly on all sides. Take them out of the sugar syrup and serve.. Take a bowl and add the last dough, 1 cup of shredded coconut and 1 cup sugar. Mix them really well. You might need to sprinkle a little flour while kneading. 
Keep kneading until the dough is smooth and not sticky.Divide the dough into 2 equal portions and make 2 smooth balls. Roll them into thick roti breads.Cut the roti bread into small pieces as shown in the picture of this step.In a frying pan heat oil in high heat and once the oil is heated brig the heat down to low. Fry the pieces until they turn light brown. Fry evenly on all sides. Use a slotted spoon to take them out of the oil. Done! Serve them or store them to serve later! Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_183_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_183_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_183_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_183_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Leftover \u2022turkey\u2022stuffing\u2022cranberry sauce\u2022bread. Put turkey on bread. Get your sauce!. Put your stuffing in the microwave for 30 seconds. MAY BE HOT . Put stuffing on sandwich. Done Enjoy Have it for Breakfast LunchDinnerOr midnight snack. 
Follow like and comment\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_184_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_184_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_184_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_184_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. This should warm up the oranges, you can put it on a longer time, I just stopped because my smoke detector was on lol XD Make sure you don't cut your hand or get burned!. Mine is an automatic boiler so it would stop boiling by itself. However, you can use a microwave to heat up water, microwave the water for around 3-5 minutes.. Make sure don't overfill two teaspoons, if you do, you won't even notice a scent of orange in your tea. Also, you need to use a cup save from heat!!! If you use the plastic ones, it will melt by the boiling water. Make sure you don't get burned, if you need to wash the peels again, go ahead, I washed mine because it had grease everywhere lol. Make sure the plate is not too big or small or you will have a mess XD. Now decorate your tea using lemons, limes, oranges, or even small umbrellas lol XD. 
And as a bonus for over 3000 views total on my account the calories if this recipe isOnly around 15-20 calories, depending if you need the Vegan Recipe or notAlso, HAPPY HAPPY NEW YEAR!!!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_185_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_185_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_185_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_185_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. To plant an Earthbox, you need:\n1. An earthbox - My mother gave me one as a gift, but you can order one online at www.earthbox.com/.\n2. Plants - Look for your local nursery and spends some time wandering around looking for things you'll either want to see and smell or eat frequently. I had plants forced on to me last summer and ended up really enjoying growing my own herbs and vegetables in particular so I looked for edibles at one of the many independent plant stores in my neighborhood. I bought lemon verbana, sweet basil, purple basil, tomatilla, arugula and tarragon from a nursery a few blocks from my home. I believe that you can plant up to eight items in the box, and you're encouraged to plant edible items. I have a large basket on my bike so I biked all of the seedling plants home, which I have to admit greatly added to the fun of the project.\nAs I discovered in the process, you'll probably also need:\n3. Measuring cup\n4. Ruler\n5. Scissors or knife\n6. Plastic bags\n7. A big tub or bucket\n8. 
A small plastic dish with walls\n9. A tea kettle or watering can. These steps are very self-explanatory.\nSet up your box in a place that will get as much sun as possible.\nWhat you don't get in the instructions, though, is a reminder to look around to see if you're likely to drip mud, water and mud-water onto your neighbor's property. My balcony, which is just a bunch of wooden slats, is right above my neighbor's balcony. So I dripped potting soil water all over the table and chairs she has on her balcony. I recommend avoiding having to apologize for this by laying out some plastic bags beneath your whole set up.. Open the large bag of potting soil that came in the Earthbox and pack it in the two holes on either end of the box. Note that the potting soil comes dry but you need to pack these holes with moist soil, so be prepared to mix it elsewhere.\nThen pour water through the tube until it runs out of the hole at the bottom of the box. (Note: this is another reason to have a plastic bag or some other device for catching water from the box on the ground before you start.) It takes a lot of water to fill the reservoire, so be prepared to make lots of trips to the sink. I used my tea kettle since I don't have a proper watering can, and it's really more or less the same thing.. These are the official steps paraphrased from the instructions:1. Add moist potting soil on top of the screen until the box is half full. Sprinkle that layer with water and pat it down, especially above the areas with the holes (which you filled in a previous step).2. Then fill the rest of the box -- completely to the top -- with dry potting soil. Sprinkle well with water and smooth out the soil.3. If you're growing tomatoes, mix in two cups of the dolomite that comes with the box into the top 3-4 inches of the box and re-smooth the dirt.4. The box also comes with fertilizer, which you should use to create a two-inch wide strip in the location that you want to put the fertilizer. 
The instructions have a handy diagram for where to put the fertilizer and seedlings based on how many and what types of plants you want to grow.Here are my notes on how to make sure this section doesn't take forever:This was the part that took the longest by far, mostly because I didn't have a big tub in which to mix water and the dry potting soil to make the moist soil the directions call for. I ended up using this dinky little pot with built in plastic to hand mix the water and soil. I recommend getting a big bucket or tub to do this in one big batch.. Cover the box with the black plastic thing that looks like a shower cap.\nYou should have already chosen a plant placement pattern in the last step; cut 3-inch Xs where you want to put your seedlings and plant them in the holes. Make sure you firm up the dirt around the seedling once it's in the ground.\nThey don't tell you in the instructions, but it's not surprising that you will need scissors or a knife and a ruler or tape measure for this step. . This is the fun part.\nOne time and one time only, pull back the black plastic around the seedlings and water the plants directly. Then put the flaps back and don't ever do that again.\nGoing forward, you will always water your box through the tube. You water until the water starts coming out of the hole at the bottom of the box. If you live somewhere where it wouldn't be a good idea to have this liquid draining onto someone else's property, I strongly recommend that you find a little plastic container to put below the hole to catch the run off. I took the cheap plastic bottom of a planting pot and reassigned it to the Earthbox.\nI only water it once a day and so far that seems to be enough in the Chicago climate.\nI can't wait to eat some of these herbs. Earthbox sells stakes for you to use to prop up any plants that need vertical support. 
I think I've reached that point with my tomatilla plant, but I'll probably create my own structure.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_186_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_186_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_186_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_186_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Step one: The mango should be half-ripe or ripe.\n Use a knife to cut it all around, lengthwise, like an avocado.. Step 2: Place a hand on each 2 sides of the mango (Photo shows only 1 hand; needed the\n other hand to click the camera).\n Twist in opposite directions, like an avocado. Don't squeeze the mango too hard.. Step 3: Wow! you've got 2 halves.\n Use a spoon to dig out the seed.. Step 4: Use that same spoon to dig in-enjoy !\n You can share the other half with someone you love, or....\n. Step 5: Cut each half once again, so you have 4 pieces.\n Pare off the skin. 
You can cut the pieces into smaller pieces-put in a bowl to share.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_187_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_187_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_187_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_187_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 1 \u00bc\u00a0 cup all-purpose flour\n\u00bc teaspoon salt\n\u00bd cup (1 stick) unsalted butter, softened\n\u00e2\u0085\u0093\u00a0 cup brown sugar\n1 teaspoon vanilla extract. Position rack in the center of the oven and preheat to 350. Line a 9X13 baking pan with parchment paper or grease bottom and sides very well.\u00a0 In a small bowl, whisk together flour and salt; set aside.\nIn the bowl of an electric mixer using the paddle attachment (or a bowl that you use your hand mixer with), beat the butter and brown sugar at medium speed until combined, about 1 minute. Beat in the vanilla extract.\u00a0 At low speed, add the flour mixture and mix just until the mixture is crumbly, 10 to 15 seconds.\nPat the dough evenly into the bottom of the prepared pan. Prick the dough well with a fork. Bake for 15 to 18 minutes, until golden brown around the edges. Allow to cool slightly as you prepare topping.. 1 \u00bd sticks unsalted butter\n1 \u00bc\u00a0 cups light brown sugar\n\u00bc honey\n\u00bc cup maple syrup\n\u00bd\u00a0 teaspoon salt\n1 cup heavy cream\n6 \u00bd cups small pretzel twists, lightly crushed. 
In a large saucepan, combine the butter, brown sugar, honey, syrup, and salt and cook over moderate heat, stirring, until foamy and slightly thickened, about 10 minutes. Add cream and cook, stirring occasionally, until a candy thermometer inserted in the caramel registers 240\u00b0F (soft ball stage), about 11 minutes longer. Add the pretzels, quickly incorporating it into the caramel.\u00a0 Pour the filling over the crust, spreading it evenly. Bake for about 15 minutes, until the topping is bubbling. Let cool completely.. Remove onto a cutting board. Cut into bars and serve.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_188_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_188_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_188_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_188_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Ingredients: 1 pound ground turkey 1 Onion 1 med to large jalapeno pepper 2 cloves garlic 2 cans of beans (I'm using six bean mix and red kidney) 2 cans diced tomatoes 1 tsp. turmeric 1/2 tsp. chili powder Pinch of tarragon 1 tbsp. Worcestershire sauce 2 tbsp. Sriracha sauce 2 tbsp. BBQ sauce About 1/8 cup of Franks hot sauce 1/4 quinoa (optional)Tools: BBQ or Smoker Cast Iron Pot Cutting board Knife Stirring spoon Strainer Can opener About 2 cups smoking chips (I'm using hickory with Maple lump charcoal)Fun Fact: If you or someone in your family is anemic, cooking in a cast iron pan can add extra iron to your meals. 
The beans will also help.. Soak your wood chips 15 minutes or more. I actually like to soak mine overnight so that they're ready when I need them. I'm using hickory, but I've also used oak, and maple. Maple gives a sweeter flavour (if you're interested). The hickory is a little nutty and has more smell to it.. OnionChop up your onion in course chunks.JalapenoFinely chop your jalapeno. Tip: Don't put your fingers in your eye after.GarlicSlice off the ends of the garlic to make it easier to peel. Once it's peeled, crush and dice your garlic.. Cook turkey until it's brown on medium heat. Don't forget to wash your hands after.You're able to do this step on your bbq as well. I typically get everything up to temperature on the stove because it's faster and easier.. OnionsClear out space in the middle of your pot. Add your chopped onions and cook until semi-transparent. This usually takes about 5 min. Once it's cooked, stir the onions into the meat.GarlicDo the same for the garlic. The garlic won't take as long to brown, maybe 30 seconds. Once it's done, mix it into the meat and onions.. Add spices and stir. 1 tsp. turmeric1/2 tsp. chili powder pinch of tarragonI added the seasoning salt and club house vegetable spice to taste and smell. I didn't actually add any garlic powder, this time, the garlic I used was strong enough to do the trick.Fun Garlic Fact: It's thought to have the ability to reduce cholesterol. This is probably only is true when the garlic is fresh. . Add both cans of tomatoes and both cans of strained beans. I like to add a little extra Franks to give it a little bit of a sweeter kick. Add your pre-cut jalapenos and stir it in.I like to add about 1/4 cup of quinoa. It thickens up your chili while also giving it a nice texture. I add it in its raw state and let it cook while the chili simmers.. Light your charcoal. I build a small fire underneath the charcoal until it catches fire. 
Once your charcoal is lit, transfer it to the bottom tray.Close the lid and bring your BBQ up to temperature. I smoke at around 275 - 300 degrees F.Safety Tip: Use a charcoal chimney. It's much safer than my approach. . Once your BBQ is up to temperature, bring out your chili. Strain your wood chips and add about 1/2 cup on top of the coals. Once you've added your chips, slide your chili over top. I leave the lid of my chili open so it can really absorb the smoke flavour. An hour of smoking will do, but I like to smoke for 3-4 hours.Close your lid and open your BBQ's bottom vent.You'll want to add more wood chips every 20-30 minutes. When you add your chips give your chili a stir.. It's hot coming off the BBQ so use caution. Top with cheese and enjoy. It will definitely taste better the next day, so bring some to work and make everyone else jealous.It may not look like much, but this smoked chili packs a huge punch. The nice thing about smoking your chili is that you don't need to make it very spicy for it to have a lot of flavour.If you make it, let me know how it turns out!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_189_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_189_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_189_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_189_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Tip #1 Use a deep fryer, it is unlike anything else!First slice tomatoes into equal slices, about 1/3 inch thick. 
Fill deep fryer up to max line with oil and heat to 360 F.. Next set up you breading station. Place flour on a plate and season with salt and pepper. Next place eggs and a dash of milk into a bowl and beat together. Finally take the panko breadcrumbs and season with sea salt, black pepper, garlic powder, onion powder, and parsley; place onto a plate.. Tip #2 Make sure that each tomato is completely coated.Now begin to bread. First season the tomatoes with salt and pepper. Now dredge tomato in flour, dip into egg mixture, let it drip, and coat with panko breadcrumbs. If it is not coated completely dip into egg mixture then back into panko. Repeat this process for all the tomato slices and set them aside. Next cook the bacon until crispy and set aside in warm place.. Tip #3 Brown paper bags work best when draining.Now begin the aioli. Place garlic and parsley into the food processor and pulse until well minced. Next add in mayonnaise, dijon mustard, drizzle of olive oil, squeeze of Sriracha, and season with salt and garlic powder. Blend together on high until ingredients are all incorporated together.. Now begin to fry tomatoes. Carefully place a few at a time into fryer and fry until golden brown. Remove from fryer and set on paper bag until oil has drained off.Place onto plate and garnish with a dollop of aioli topped with bacon crumbles.Suggestions:Serve as an appetizer at any occasion. Great way to use up those unripe tomatoes. 
Serve with a white wine, or a cold beer.***For this recipe and more like it check out my food blog at everythingbutfish.wix.com/etbf\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_190_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_190_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_190_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_190_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Major Pieces Server CabinetGeorge Forman Grill (good will or some similar heating device) Commercial Kitchen Rack and trays 5 volt Power Supply (I used one from an old PC like this) qty(3) Server Cabinet Fans 2\" of rigid insulation around the cabinetControls Arduino Pro by Sparkfun (although any Arduino will work)\u00a0 LCD Button Shield by Adafruit or\u00a0Sparkfun Dallas Temperature Sensors, qty(3) Sparfkun, Adafruit 5Volt relay by Sparkfun or ebay Notes: The relay has to handle 120volts and 10 amps depending on the george foreman grill you get. Just read the specs.. The back of the unit has a piece of wood that creates a 3\" plenum all the way to the bottom. Hot air is warmed by the heater, rises to the top of the cabinet, and is blown by qty(3) fans into the plenum. \u00a0Air is blown down the backside of the unit and exits next to the heater. (see picture). In order to help moderate the radiant heat transfer we added tiles above the grill. (see picture). The front door does not seal to well. 
This allows quite a bit of fresh air to circulate into the cabinet which is key as moisture evaporating from the food needs to escape.. Wiring: 1) LCD Shield and Arduino need 5 volts. 2) wire 5 volts, ground, and digital control pin to George Forman Grill relay 3) Wire Server Fans always ON 4) Wire Dallas Temperature Sensors. One towards the top of the unit, one in the middle, and one at the bottom to get an average temperature across the cabinet.\u00a0. The program is attached below. \u00a0It is a pretty simple program using the Dallas Temperature one-wire library and the LCD Button Shield library. Dallas Temperature one-wire tutorial is here. LCD Shield tutorial is here.\u00a0 The three temperature sensors are averaged and the grill is turned ON and OFF based on the temperature setpoint.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_191_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_191_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_191_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_191_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. About 1 kg tomatoes1 small onion2 sticks celery4 - 5 tablespoons brown sugar2 tablespoons tomato puree1 tablespoon paprika1 tablespoon mustard Up to 2 teaspoons chilli powder2 teaspoons salt0.5 teaspoon ground allspice1 teaspoon tamarind paste 100 ml white wine vinegar. Finely chop the onion and celery. Roughly chop the tomatoes. Mix everything in the slow- cooker. 
Cook on high for 30 minutes, then stir and cook for 8 - 10 hours on low. Cool.. Use a hand blender to puree the sauce. Pass through a sieve to remove the seeds and skins. Re-cook on high for 2 hours to make sure the sauce is sterile before bottling.. Wash the bottles in hot soapy water, then dry in a low oven.Use a ladle and funnel to transfer the sauce to the bottles. Cover immediately. Label when cold.If you want to store the sauce for a long time, it should be sterilised - I have explained a way to do this in another instructable! https://www.instructables.com/id/Sterilise-Preserves-With-A-Sous-vide-Water-Bath/\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_192_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_192_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_192_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_192_3.jpg" + ], + "output": "C" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. \nI used a fair amount of golden pear tomatoes, and small plum tomatoes. They are pretty teeny, so I used a bunch... but they are also very sweet tasting tomatoes, which taste fantastic in this sauce. You can use regular tomatoes, though!\nI also used shallots, lots of garlic, olive oil, kosher salt, pepper, and a good handful of a bunch of fresh herbs... basil, oregano, thyme, and rosemary, to be exact. You can use dried, but the fresh ones taste amazing if you happen to have them growing.. 
I cooked everything on the grill, since it's about 1,000 degrees outside and I don't want my kitchen any hotter than it already is.\nI used part of my broiler pan from my oven to cook my tomatoes on... I just wrapped it in foil, since I don't want any of my drippings to escape (I don't want to lose any bit of the yummy!). You can use a foil tray, or a grill pan wrapped in foil, I'd just recommend that you use something with a bit of an edge to prevent oil leaking off the sides and causing flare ups.\nThen I start preheating my grill... I turn it on high and let it get super hot. This isn't really a very particular science. If you're using the oven, I usually roast tomatoes at about 450, and tomatoes this size take about 15-20 minutes. On the grill it takes about 10 tops.. This is pretty simple. You basically want everything about the same size so that it'll all roast at the same rate... so I sliced my plum tomatoes in half lengthwise, and left the pear tomatoes be (minus cutting off nasty spots on a few of them)\nLay all of your tomatoes out on your tray, and load it up with minced garlic, thinly sliced shallots, salt, pepper, and lots of olive oil. (Hold off on the herbs, for now) Toss it all with your hands so that everything is pretty evenly coated.\nThen it goes on the grill!. \nyour tomatoes are ready. And pretty amazing... I'm hungry just looking at this picture.. \nTake it inside and scrape everything into a pot, even the little bits that may stick to the foil. Take a potato masher and smush everything up really well, and use some tongs to pull out the tomato skins (or not, depends on how lazy you are... if you don't I don't think it'd be the end of the world, I know I certainly missed some)\nOnce you have everything mashed up pretty good, add more olive oil. I'm not much for measuring, but I probably added another 1/4 cup.\nMince up your herbs and toss them in, and let everything simmer on low for a good 20 minutes or so, stirring often so nothing burns.. 
\nAnd that easy, your sauce is totally ready to top your pizza! Or even toss in some pasta, that would be pretty good too. Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_193_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_193_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_193_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_193_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Hotdogs - Beef, chicken, pork, mystery meat. Ballpark, fancy, plain, Polish. There are so many styles and any will work. Skewers - Wooden or metal it doesn't matter as long as they are longer than the dogs. KnifeCutting BoardHotdog Fixings. Grab a dog and a skewer. Slowly push the skewer through the center as best you can. They tend to be a bit wobbly so you may have to back it out a little and recenter a couple of times. When you have the meat skewered, place it on the cutting board and grab your knife. Starting with the blade resting on the skewer, just above the top of the hotdog, begin making an angled cut all the way through to the skewer. Continue down the length, turning the dog as you go, cutting to the skewer. Finish the cut completely out of the end. Remove the skewer* and set aside the hotdog. You can use the same skewer to cut all of the dogs. *See Step 4 for an alternate method.. You should cook the hotdogs normally, just be aware that they will probably take less time since there is more exposed surface area. 
Also they are weaker due to the cuts so take care while turning them. Also watch for flames to spring up due to grease and juices from the cuts dripping onto the grill. Not necessarily a bad thing but it will significantly decrease the cook time. To help get the insides crispier, try to carefully expand the coils to expose the cuts. I used a funny little hotdog holder we found somewhere. It actually worked pretty well, although the ends cooked faster then the middle, probably due as much to my grill as the holder.. This is essentially the same except when you finish cutting each hotdog leave it on the skewer. Then, simply grill them on the skewers, remembering to stretch the coils at least a little. This is a good method if you are trying to cook a number of dogs at once and don't have a little holder contraption as they are easier to turn and won't fall apart as easily compared to putting them straight on the grill... One other bonus of the spiral cuts is that they hold condiments and toppings well so load 'em up, or just go simple and really enjoy the crispiness and caramelization. Also because of lost grease drippings a spiral cut dog will be marginally healthier. P.S. It may or may not be worth it to post a picture(s) of your favorite way to eat spiral dogs. 
;)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_194_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_194_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_194_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_194_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 1/2 pound of bacon2 slices of grain toast1/2 cup crunchy peanut butter1/2 cup raspberry preserve1/4 cup butter1/4 cup blue cheese. Cut bacon into small pieces and cook on medium for 12 minutes. The cook on the bacon is really up to you. If you like chewier bacon then give it less cooking time. If you like crunchier bacon then cook longer.Drain away excess bacon grease. Preserve a small amount of the grease and mix into the butter.. Butter your bread and put butter side down over medium heat. While you wait for the bread to toast, spread peanut butter and raspberry preserve on one side. On the other side, put crumbled blue cheese and bacon pieces.You will notice the peanut butter and preserve side softening and becoming runny. The blue cheese side will become soft but will not melt. Cook until you begin to get a nice crust and golden color on your toast.. Combine both halves into a sandwich. Continue to cook over medium until you have a nice golden crust. Do the same for both sides adding more butter as needed. . Take sandwich off heat. Cut diagonally and serve. Enjoy with your favorite beer of coarse. 
I poured a nice brown ale with mine to cut through some of the richness of this ultra heavy dish.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_195_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_195_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_195_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_195_3.jpg" + ], + "output": "A" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. Clean the Lotus Root, peeling with a carrot peeler.\u00a0 Rinse, cut into 2\" pieces and submerge in water with a drop of water or some citrus added. Slice the pieces into thin slices, and cut again lengthwise to make slivers.\u00a0 Put back into the water. As you prepare the rest of the meal, change the Lotus Root water as it becomes darker in color.\u00a0 Expect to change the Lotus Root water two or three times before cooking.. Prepare the carrot in the same manner.. Heat a skillet or wok with 2 tbsps olive oil, 1 tbsp low sodium soy sauce, 1 tbsp brown sugar. *The brown sugar will add a lovely caramelized flavor to parts of the stir fry dish.. Cook the lotus root a few minutes.\u00a0 The texture of the root is a bit tough, and it needs extra time to cook.. After a few minutes, add in the carrot and stir.. Add in about an ounce of Sake and a tablespoon of sesame seeds.\u00a0 Stir.. Cover the pot for a few minutes until parts of the vegetables are caramelized.. If you've got traditional Japanese bowls, use them!\u00a0 It already tastes great, but this will make it taste so much better! 
Here I have the dish pictured with the rest of the meal I cooked!\u00a0 It was absolutely delicious!\u00a0 The lotus root is still crunchy, even after all that cooking! Here's the video:https://www.youtube.com/watch?v=SCghPTWGL-g\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_196_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_196_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_196_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_196_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. I bought my gourd in New Mexico at a charity event and they got all their gourds at an\u00a0Indian reservation. \u00a0\nIngredients\u00a0are for 1 serving. \u00a0\n1 Warty looking gourd.\n1 Package of Raman noodles.\n1 Green \u00a0food coloring.\n1 Battery\u00a0operated\u00a0\u00a0pumpkin carving knife or small hack saw.\n1 Red and 1 Black licorice for eyes and mouth.\n1 sheet of black foam sheet for hat. (Optional. )\u00a0\n1 Plastic\u00a0zip-lock\u00a0\u00a0bag depending on the size of gourd you will be using for inside the gourd.\n1 Small cup of chopped nuts as a garnish.\n1 Custard cup.\n1 Pair of scissors.\n1 marker to mark gourd and craft foam.\nKayle opitional\nI did not glue the embellishments to the gourd because I am using the gourd for a different purpose later. \u00a0\nI don't feel a person would really need to glue them on, \u00a0unless they are making this for children. \u00a0. Isn't this the most beautiful gourd you have ever seen? 
\u00a0\nI plan to dry mine and use it for a decorative bowl for many years. \u00a0\nIf you plan on making this dish, \u00a0I would go ahead and get your gourd and cut it and let it dry out for 3 days.\u00a0\nLike pumpkins it has a smell and allowing it to air out will\u00a0eliminate the smell totally! \u00a0\u00a0\nGourds are very hard! \u00a0Be prepared to work a little for your efforts. \u00a0\nI used a pumpkin carver but it was still very hard. \u00a0\nI did great but was rather surprised even a battery operated carver was hard to cut through this baby.\nThe good news is all you need to carve is the top so the lid will come off. \u00a0\nCut the top of the gourd off, saving the lid.\nScrape out the inside until it is smooth.\nDiscard inside scrapings or . . . .\u00a0\nIf you have a garden area you might save the seeds for next year and plant them and have your own beautiful gourds! \u00a0\n\u00a0After carving a gourd, I have decided unless I am making pies, I will be using gourds for Halloween decorations. \u00a0\nGourds are great about not getting spoiled and you can use them later as a decoration for many years. \u00a0\nIt was difficult to choose which one I liked best because they had bright\u00a0reddish\u00a0orange, striped yellow , very very warty looking ones, and green ones. \u00a0They also came in many different sizes! \u00a0. You will have a few days to work on the embellishments, \u00a0so you won't be rushed at the last minute. \u00a0\n\u00a0I liked Gourdila without her hat the best. \u00a0\nCut out a triangle to fit the gourds face for a hat. \u00a0\nCut 2 small circles from the black licorice for eyes.\nCut 2 eye brows. \u00a0\nI made some hair to go under the hat ( see picture) \u00a0optional. \u00a0Or you can use cabbage or\u00a0Kayle. \u00a0\nCut one small mouth shape \u00a0from the red\u00a0licorice for the mouth. \u00a0The shape and size depends on where the warts fall and how it will look. \u00a0\n\u00a0\u00a0. 
Insert the \u00a0a\u00a0zip-lock\u00a0bag into the gourd bottom.\nPlace nuts in a small custard dish. \u00a0\nHave your embellishments ready.\nMake the Ramen noodles according to the directions \u00a0but add the food coloring at the beginning before adding the Ramen.\nScoop out the Ramen and place it in the gourd.\nPlace the witches hat across the top and put the lid on. \u00a0( Add the Kayle if using it for the hair. )\nEmbellish the lid with the eyes, mouth, hair, and eye brows. \u00a0\nI did not add the juice to mine but you can if you like. \u00a0. When my grandson was younger he could eat Ramen noodles and cereal for every meal. \u00a0\nI wished I had thought of this years ago, it might have been just the thing to make him not want to eat Ramen noodles again. \u00a0\nNo, probably not, he would be just the kid that would think it was totally awesome! \u00a0\nNever the less I hope you have a great time grossing people out \u00a0for this Halloween! \u00a0\nHAPPY HALLOWEEN!\n\u00a0\u00a0\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_197_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_197_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_197_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_197_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. 
Use dry measuring cups for the Flour and liquid measuring cups for the Water4 Cups sifted white all purpose flour1 Tablespoon iodized table salt1 + 1/2 Cups water1 Large Egg Yes I know it seems crazy simple but read on. Mix the Salt and Flour. Add the Egg to the Water and mix them. Now add the Liquid to the Dry ingredients.. Start with a fork but you're gonna have to get your hands into it. I use gloves because it is very sticky and I don't want anyone finding knuckle hair in their food. Just make sure you wash your gloved hands with soap and water before you start mixing to get rid of any rubbery tasting talcum powder. While kneading you will add 1/2 to 3/4 cups more flour to this! This will vary depending on your surroundings, the size of your egg and the moisture content of your flour. Kneed it till it doesn't stick to you and feels firm. About ten minutes. Cover with a damp cloth or plastic wrap and set it aside for at least an hour. I have tried using it early but it makes a tough perogie. Firm is what we are after here not tough. This needs time for the gluten to relax. Go make some filling.. \"A pound? Really?\"\"Yes.\"I know it seems like a lot but most of that is fat that will cook off.Remember to save the bacon fat to cook the onions, sauerkraut and perogies in later. Told you this would kill you young!. Drained Bick's Wine Sauerkraut fried in Safflo Sunflower oil, salted butter and bacon fat (this is Ukrainian vegetarian) add salt and pepper to taste. fry them till they start to dry out and begin to become brown.Fry the yellow onions the same way. You will need at least three large onions. These are added to the potato filling and used as a garnish. I recommend swim goggles cutting onions. These things are brutal. Fry them till golden brown and caramelized. Drain some cottage cheese. You just add some salt and pepper to taste.By the way you also need to peel, cube and boil in salted water at least three large red potatoes. 
Not the brown ones those are too mealy and crumbly, use red potatoes. Mash them with a little milk when done.Right about now everyone who lives near you will be thinking up excuses to drop by to see what you are cooking.. No point dieting here. Go for fresh and high fat.. Mix up half of the cooked bacon, half the cooked onion and the mashed red potatoes. Taste it and add salt and pepper as needed. Fold it. Do not stir.. At this point your dough has been resting about an hour on the counter. It should feel like a nice firm boob. You can throw everything in the fridge and take a rest at this point or carry on. You can even put it off till tomorrow if you like.. Fire up the TV and get cozy. This is gonna take a while. Punch down your dough and get a cutting board. Flour up the board. Start boiling a pot of salted water now too.. I cut chunks of dough the size of a walnut. I've also tried making huge perogies but I prefer normal ones.. I use an old coffee cup from my Grandfather's house. It just seems right.. Add a walnut sized chunk of whatever. You can use the cottage cheese, the potato and bacon or potato and cheese, the sauerkraut. Whatever. I have even seen them with cherries and sugar inside as a desert.. Pinch the edges together. Keep the filling off the edges that will meet so they stick.. Throw them in. Tip first so they don't splash you. Stir the pot so they don't stick to the pot or their brethren.. After they float they are done. You could eat them at this point but really? We already went down the rabbit hole we might as well stick around for tea!. Safflo sunflower oil, bacon fat and salted butter. Honestly it just doesn't get better than this.. The first one is always mine!. Just make a whole bunch! I usually double the recipe but it's a lot of work. If you're just starting only make one type of filling. You can freeze the extra but I really doubt there will be leftovers. 
I put mine in a roaster with the last of the fried bacon, onions, some oil and some butter. My family ends up going thru them all in about a day and a half. Serve with sour cream. I hope you find this useful.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_198_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_198_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_198_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_198_3.jpg" + ], + "output": "B" + }, + { + "task": "visual_coherence", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "question": "Select the incoherent image in the following sequence of images.", + "context": "Here is the context:\n. \n Gather the following:\n\n\t\t2 Ripe** Avocados\n\t\tVegetable Oil, or preferred frying oil\n\t\tBread Crumbs\n\t\t2 Eggs\n\t\tSalt to taste\n\t\tPepper to taste\n\t\tKitchen Knife\n\t\tSpoon\n\t\tPan\n\t\tTongs (utensil for flipping during Fry step)\n\t\tPaper Towels**NOTE: You want your avocados to be on the firm side of ripe as they will be easier to handle during the breading and egg dipping process.\u00a0. First things first, cut your avocados in half and remove the seeds. Scoop the avocado fruit out of it's leathery shell with a spoon, but make sure to keep the avocado fruit intact.\nPlace the avocado fruit upside down on your cutting board. With a kitchen knife, slice your avocado lengthwise into 3/4 inch (1.905 cm) wide strips.\u00a0. Beat two eggs in a bowl until uniform. These eggs will be used to create the outer coating on the avocados later.\u00a0. 
\n Now repeat the following with each of your avocado slices.\u00a0\n\n\t\tCoat each slice in bread crumbs.\u00a0\n\t\tDip your coated avocado slice into your egg batter.\n\t\tCoat your slice again, making sure that all of the egg batter on your slice is covered.\u00a0I've found that it's easiest to do step 1 (above) with all of your slices, then step 2 (above) with all of your slices, and then finally step 3 (above) with all of your slices. I wasn't following this method at first and ended up with gummy fingers covered in bread crumbs and egg--which gets annoying after a couple slices.\u00a0\nAlternative Bread Crumb Idea: If you are GF or trying to avoid bread and gluten, try with a gluten free substitute or coconut flour and flakes. I haven't tried the coconut but it seems like it could be good!. Pour your fry oil of choice into your frying pan, I chose vegetable oil. Turn on the stove, and let the oil heat up.\u00a0\nOnce your oil is hot, place your breaded avocado slices into the pan. Fry the slices until they are brown on both sides. Flip the slices if necessary with kitchen tongs.\u00a0Alternative to Frying: Bake on a lightly greased cookie sheet in a preheated oven (400F-425F) until browned and crispy for 15-20 minutes.\u00a0. When your avocado slices are nicely browned on both sides, remove them from your pan with kitchen tongs. Set them on a plate lined with paper towels. The paper towels will absorb any oil residue on your slices and keep them from getting soggy later.\u00a0\nAllow your slices to dry on the paper towel, but don't let them get too cold. Fries are good hot!. Serve immediately! These tasty little snacks are great at parties, alongside sandwiches, and as an afternoon snack. They go really nicely with dipping sauces as well. 
They're pictured here with a chipotle dipping sauce (sour cream, lemon, and chipotle chile powder) but I would also try it with the following: cilantro lime, honey mustard, garlic lime, maybe even a soy-sesame, although I have yet to try them all.\u00a0\nFun Avocado Fact: They are also called Alligator Pears\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/visual_coherence/visual_coherence_199_0.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_199_1.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_199_2.jpg", + "../MMIU-Benchmark/visual_coherence/visual_coherence_199_3.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_0_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_0_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_0_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_0_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_0_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_0_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_0_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_1_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_1_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_1_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_1_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_1_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_1_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_1_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_2_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_2_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_2_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_2_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_2_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_2_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_2_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_3_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_3_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_3_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_3_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_3_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_3_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_3_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_4_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_4_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_4_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_4_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_4_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_4_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_4_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_5_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_5_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_5_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_5_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_5_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_5_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_5_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_6_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_6_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_6_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_6_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_6_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_6_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_6_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_7_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_7_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_7_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_7_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_7_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_7_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_7_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_8_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_8_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_8_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_8_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_8_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_8_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_8_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_9_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_9_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_9_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_9_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_9_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_9_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_9_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_10_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_10_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_10_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_10_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_10_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_10_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_10_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_11_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_11_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_11_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_11_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_11_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_11_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_11_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_12_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_12_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_12_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_12_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_12_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_12_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_12_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_13_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_13_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_13_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_13_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_13_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_13_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_13_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_14_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_14_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_14_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_14_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_14_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_14_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_14_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_15_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_15_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_15_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_15_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_15_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_15_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_15_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_16_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_16_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_16_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_16_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_16_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_16_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_16_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_17_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_17_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_17_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_17_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_17_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_17_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_17_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_18_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_18_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_18_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_18_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_18_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_18_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_18_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_19_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_19_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_19_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_19_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_19_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_19_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_19_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_20_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_20_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_20_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_20_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_20_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_20_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_20_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_21_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_21_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_21_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_21_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_21_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_21_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_21_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_22_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_22_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_22_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_22_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_22_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_22_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_22_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_23_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_23_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_23_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_23_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_23_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_23_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_23_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_24_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_24_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_24_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_24_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_24_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_24_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_24_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_25_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_25_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_25_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_25_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_25_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_25_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_25_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_26_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_26_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_26_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_26_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_26_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_26_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_26_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_27_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_27_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_27_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_27_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_27_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_27_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_27_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_28_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_28_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_28_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_28_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_28_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_28_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_28_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_29_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_29_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_29_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_29_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_29_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_29_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_29_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_30_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_30_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_30_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_30_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_30_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_30_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_30_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_31_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_31_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_31_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_31_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_31_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_31_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_31_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_32_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_32_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_32_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_32_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_32_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_32_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_32_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_33_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_33_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_33_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_33_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_33_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_33_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_33_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_34_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_34_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_34_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_34_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_34_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_34_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_34_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_35_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_35_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_35_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_35_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_35_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_35_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_35_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_36_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_36_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_36_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_36_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_36_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_36_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_36_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_37_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_37_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_37_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_37_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_37_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_37_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_37_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_38_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_38_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_38_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_38_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_38_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_38_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_38_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_39_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_39_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_39_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_39_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_39_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_39_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_39_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_40_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_40_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_40_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_40_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_40_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_40_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_40_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_41_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_41_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_41_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_41_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_41_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_41_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_41_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_42_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_42_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_42_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_42_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_42_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_42_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_42_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_43_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_43_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_43_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_43_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_43_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_43_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_43_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_44_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_44_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_44_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_44_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_44_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_44_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_44_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_45_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_45_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_45_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_45_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_45_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_45_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_45_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_46_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_46_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_46_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_46_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_46_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_46_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_46_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_47_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_47_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_47_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_47_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_47_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_47_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_47_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_48_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_48_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_48_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_48_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_48_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_48_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_48_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_49_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_49_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_49_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_49_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_49_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_49_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_49_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_50_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_50_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_50_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_50_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_50_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_50_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_50_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_51_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_51_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_51_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_51_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_51_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_51_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_51_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_52_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_52_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_52_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_52_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_52_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_52_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_52_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_53_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_53_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_53_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_53_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_53_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_53_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_53_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_54_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_54_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_54_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_54_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_54_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_54_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_54_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_55_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_55_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_55_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_55_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_55_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_55_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_55_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_56_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_56_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_56_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_56_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_56_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_56_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_56_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_57_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_57_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_57_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_57_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_57_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_57_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_57_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_58_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_58_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_58_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_58_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_58_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_58_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_58_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_59_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_59_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_59_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_59_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_59_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_59_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_59_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_60_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_60_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_60_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_60_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_60_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_60_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_60_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_61_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_61_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_61_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_61_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_61_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_61_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_61_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_62_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_62_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_62_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_62_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_62_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_62_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_62_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_63_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_63_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_63_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_63_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_63_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_63_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_63_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_64_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_64_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_64_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_64_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_64_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_64_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_64_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_65_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_65_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_65_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_65_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_65_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_65_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_65_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_66_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_66_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_66_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_66_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_66_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_66_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_66_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_67_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_67_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_67_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_67_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_67_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_67_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_67_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_68_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_68_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_68_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_68_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_68_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_68_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_68_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_69_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_69_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_69_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_69_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_69_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_69_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_69_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_70_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_70_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_70_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_70_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_70_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_70_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_70_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_71_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_71_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_71_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_71_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_71_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_71_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_71_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_72_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_72_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_72_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_72_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_72_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_72_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_72_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_73_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_73_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_73_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_73_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_73_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_73_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_73_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_74_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_74_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_74_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_74_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_74_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_74_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_74_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_75_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_75_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_75_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_75_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_75_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_75_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_75_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_76_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_76_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_76_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_76_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_76_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_76_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_76_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_77_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_77_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_77_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_77_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_77_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_77_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_77_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_78_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_78_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_78_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_78_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_78_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_78_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_78_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_79_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_79_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_79_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_79_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_79_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_79_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_79_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_80_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_80_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_80_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_80_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_80_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_80_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_80_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_81_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_81_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_81_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_81_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_81_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_81_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_81_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_82_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_82_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_82_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_82_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_82_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_82_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_82_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_83_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_83_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_83_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_83_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_83_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_83_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_83_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_84_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_84_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_84_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_84_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_84_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_84_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_84_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_85_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_85_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_85_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_85_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_85_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_85_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_85_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_86_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_86_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_86_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_86_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_86_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_86_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_86_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_87_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_87_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_87_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_87_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_87_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_87_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_87_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_88_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_88_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_88_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_88_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_88_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_88_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_88_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_89_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_89_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_89_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_89_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_89_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_89_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_89_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_90_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_90_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_90_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_90_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_90_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_90_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_90_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_91_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_91_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_91_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_91_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_91_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_91_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_91_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_92_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_92_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_92_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_92_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_92_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_92_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_92_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_93_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_93_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_93_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_93_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_93_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_93_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_93_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_94_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_94_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_94_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_94_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_94_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_94_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_94_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_95_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_95_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_95_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_95_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_95_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_95_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_95_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_96_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_96_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_96_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_96_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_96_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_96_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_96_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_97_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_97_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_97_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_97_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_97_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_97_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_97_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_98_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_98_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_98_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_98_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_98_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_98_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_98_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_99_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_99_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_99_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_99_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_99_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_99_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_99_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_100_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_100_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_100_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_100_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_100_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_100_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_100_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_101_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_101_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_101_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_101_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_101_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_101_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_101_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_102_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_102_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_102_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_102_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_102_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_102_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_102_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_103_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_103_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_103_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_103_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_103_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_103_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_103_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_104_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_104_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_104_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_104_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_104_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_104_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_104_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_105_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_105_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_105_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_105_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_105_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_105_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_105_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_106_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_106_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_106_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_106_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_106_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_106_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_106_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_107_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_107_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_107_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_107_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_107_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_107_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_107_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_108_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_108_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_108_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_108_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_108_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_108_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_108_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_109_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_109_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_109_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_109_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_109_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_109_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_109_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_110_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_110_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_110_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_110_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_110_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_110_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_110_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_111_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_111_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_111_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_111_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_111_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_111_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_111_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_112_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_112_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_112_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_112_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_112_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_112_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_112_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_113_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_113_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_113_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_113_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_113_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_113_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_113_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_114_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_114_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_114_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_114_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_114_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_114_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_114_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_115_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_115_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_115_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_115_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_115_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_115_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_115_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_116_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_116_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_116_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_116_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_116_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_116_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_116_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_117_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_117_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_117_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_117_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_117_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_117_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_117_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_118_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_118_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_118_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_118_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_118_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_118_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_118_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_119_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_119_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_119_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_119_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_119_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_119_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_119_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_120_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_120_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_120_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_120_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_120_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_120_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_120_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_121_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_121_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_121_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_121_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_121_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_121_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_121_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_122_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_122_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_122_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_122_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_122_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_122_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_122_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_123_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_123_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_123_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_123_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_123_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_123_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_123_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_124_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_124_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_124_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_124_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_124_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_124_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_124_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_125_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_125_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_125_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_125_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_125_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_125_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_125_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_126_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_126_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_126_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_126_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_126_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_126_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_126_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_127_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_127_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_127_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_127_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_127_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_127_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_127_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_128_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_128_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_128_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_128_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_128_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_128_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_128_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_129_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_129_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_129_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_129_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_129_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_129_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_129_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_130_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_130_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_130_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_130_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_130_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_130_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_130_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_131_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_131_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_131_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_131_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_131_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_131_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_131_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_132_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_132_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_132_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_132_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_132_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_132_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_132_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_133_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_133_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_133_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_133_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_133_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_133_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_133_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_134_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_134_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_134_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_134_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_134_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_134_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_134_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_135_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_135_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_135_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_135_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_135_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_135_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_135_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_136_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_136_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_136_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_136_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_136_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_136_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_136_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_137_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_137_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_137_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_137_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_137_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_137_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_137_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_138_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_138_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_138_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_138_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_138_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_138_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_138_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_139_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_139_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_139_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_139_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_139_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_139_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_139_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_140_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_140_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_140_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_140_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_140_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_140_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_140_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_141_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_141_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_141_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_141_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_141_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_141_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_141_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_142_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_142_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_142_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_142_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_142_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_142_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_142_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_143_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_143_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_143_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_143_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_143_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_143_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_143_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_144_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_144_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_144_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_144_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_144_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_144_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_144_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_145_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_145_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_145_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_145_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_145_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_145_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_145_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_146_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_146_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_146_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_146_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_146_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_146_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_146_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_147_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_147_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_147_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_147_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_147_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_147_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_147_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_148_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_148_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_148_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_148_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_148_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_148_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_148_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_149_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_149_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_149_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_149_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_149_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_149_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_149_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_150_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_150_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_150_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_150_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_150_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_150_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_150_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_151_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_151_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_151_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_151_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_151_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_151_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_151_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_152_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_152_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_152_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_152_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_152_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_152_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_152_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_153_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_153_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_153_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_153_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_153_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_153_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_153_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_154_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_154_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_154_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_154_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_154_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_154_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_154_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_155_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_155_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_155_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_155_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_155_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_155_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_155_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_156_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_156_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_156_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_156_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_156_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_156_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_156_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_157_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_157_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_157_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_157_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_157_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_157_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_157_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_158_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_158_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_158_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_158_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_158_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_158_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_158_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_159_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_159_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_159_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_159_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_159_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_159_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_159_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_160_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_160_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_160_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_160_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_160_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_160_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_160_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_161_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_161_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_161_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_161_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_161_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_161_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_161_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_162_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_162_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_162_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_162_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_162_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_162_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_162_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_163_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_163_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_163_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_163_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_163_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_163_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_163_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_164_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_164_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_164_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_164_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_164_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_164_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_164_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_165_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_165_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_165_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_165_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_165_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_165_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_165_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_166_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_166_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_166_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_166_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_166_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_166_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_166_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_167_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_167_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_167_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_167_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_167_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_167_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_167_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_168_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_168_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_168_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_168_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_168_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_168_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_168_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_169_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_169_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_169_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_169_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_169_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_169_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_169_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_170_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_170_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_170_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_170_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_170_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_170_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_170_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_171_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_171_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_171_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_171_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_171_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_171_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_171_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_172_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_172_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_172_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_172_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_172_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_172_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_172_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_173_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_173_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_173_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_173_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_173_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_173_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_173_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_174_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_174_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_174_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_174_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_174_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_174_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_174_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_175_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_175_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_175_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_175_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_175_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_175_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_175_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_176_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_176_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_176_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_176_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_176_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_176_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_176_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_177_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_177_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_177_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_177_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_177_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_177_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_177_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_178_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_178_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_178_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_178_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_178_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_178_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_178_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_179_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_179_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_179_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_179_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_179_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_179_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_179_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_180_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_180_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_180_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_180_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_180_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_180_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_180_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_181_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_181_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_181_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_181_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_181_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_181_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_181_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_182_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_182_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_182_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_182_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_182_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_182_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_182_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_183_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_183_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_183_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_183_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_183_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_183_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_183_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_184_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_184_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_184_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_184_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_184_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_184_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_184_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_185_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_185_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_185_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_185_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_185_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_185_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_185_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_186_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_186_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_186_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_186_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_186_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_186_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_186_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_187_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_187_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_187_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_187_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_187_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_187_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_187_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_188_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_188_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_188_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_188_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_188_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_188_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_188_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_189_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_189_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_189_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_189_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_189_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_189_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_189_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_190_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_190_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_190_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_190_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_190_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_190_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_190_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_191_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_191_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_191_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_191_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_191_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_191_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_191_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_192_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_192_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_192_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_192_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_192_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_192_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_192_6.jpg" + ], + "output": "D" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_193_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_193_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_193_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_193_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_193_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_193_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_193_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_194_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_194_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_194_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_194_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_194_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_194_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_194_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_195_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_195_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_195_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_195_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_195_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_195_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_195_6.jpg" + ], + "output": "A" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_196_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_196_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_196_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_196_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_196_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_196_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_196_6.jpg" + ], + "output": "C" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_197_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_197_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_197_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_197_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_197_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_197_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_197_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_198_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_198_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_198_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_198_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_198_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_198_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_198_6.jpg" + ], + "output": "B" + }, + { + "task": "visual_cloze", + "visual_input_component": "Natural image", + "source": "RecipeQA", + "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "question": "Choose the best image for the missing blank to correctly complete the recipe.", + "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", + "input_image_path": [ + "../MMIU-Benchmark/visual_cloze/visual_cloze_199_0.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_199_1.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_199_2.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_199_3.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_199_4.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_199_5.jpg", + "../MMIU-Benchmark/visual_cloze/visual_cloze_199_6.jpg" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nB: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n\nC: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nD: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nB: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n\nC: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nD: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_0_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_0_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nB: 1.2895 0.43518 -118.46\n-0.025956 1.4233 161.89\n-3.0413e-05 0.00069874 1.0013\n\nC: 4.3722 0.14407 -818.24\n-0.25209 3.9595 -549.15\n0.001718 0.0010825 0.97985\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nB: 1.2895 0.43518 -118.46\n-0.025956 1.4233 161.89\n-3.0413e-05 0.00069874 1.0013\n\nC: 4.3722 0.14407 -818.24\n-0.25209 3.9595 -549.15\n0.001718 0.0010825 0.97985\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_1_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_1_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nB: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nC: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nD: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nB: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nC: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nD: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_2_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_2_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n\nB: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nC: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nD: 5.1051 0.34986 -885.86\n1.0306 5.9768 -2733.1\n0.0033649 0.00099216 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n\nB: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nC: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nD: 5.1051 0.34986 -885.86\n1.0306 5.9768 -2733.1\n0.0033649 0.00099216 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_3_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_3_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nB: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nC: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nD: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nB: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nC: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nD: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_4_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_4_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nB: 1.547 0.11677 155.75\n0.40373 1.373 -170.1\n0.00090791 8.8782e-05 1.0012\n\nC: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n\nD: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nB: 1.547 0.11677 155.75\n0.40373 1.373 -170.1\n0.00090791 8.8782e-05 1.0012\n\nC: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n\nD: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_5_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_5_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nC: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n\nD: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nC: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n\nD: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_6_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_6_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.60879 -0.35761 289.93\n0.34822 0.61653 -30.949\n-2.0912e-05 1.3527e-06 1.014\n\nB: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nC: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nD: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.60879 -0.35761 289.93\n0.34822 0.61653 -30.949\n-2.0912e-05 1.3527e-06 1.014\n\nB: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nC: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nD: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_7_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_7_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nB: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nC: 0.98278 -0.0048237 22.209\n-0.012055 0.97088 45.658\n-7.6753e-06 -2.3467e-05 1.0001\n\nD: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nB: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nC: 0.98278 -0.0048237 22.209\n-0.012055 0.97088 45.658\n-7.6753e-06 -2.3467e-05 1.0001\n\nD: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_8_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_8_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.3184 0.1614 32.607\n0.092973 1.2239 -454.36\n-0.00072537 0.00028453 0.99713\n\nB: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nC: 0.87235 0.023622 101.75\n0.12982 0.76075 59.456\n0.0005519 9.0915e-05 1.0016\n\nD: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.3184 0.1614 32.607\n0.092973 1.2239 -454.36\n-0.00072537 0.00028453 0.99713\n\nB: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nC: 0.87235 0.023622 101.75\n0.12982 0.76075 59.456\n0.0005519 9.0915e-05 1.0016\n\nD: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_9_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_9_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nB: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n\nC: 0.15114 -0.00089399 241.66\n-0.078633 0.45918 14.453\n-0.00033245 3.1152e-05 0.99996\n\nD: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nB: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n\nC: 0.15114 -0.00089399 241.66\n-0.078633 0.45918 14.453\n-0.00033245 3.1152e-05 0.99996\n\nD: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_10_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_10_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nC: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n\nD: 0.52949 -0.028655 46.849\n-0.2451 0.79991 158.44\n-0.00032499 -1.8164e-05 0.99959\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nC: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n\nD: 0.52949 -0.028655 46.849\n-0.2451 0.79991 158.44\n-0.00032499 -1.8164e-05 0.99959\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_11_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_11_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nD: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nD: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_12_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_12_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 2.6481 0.070248 -423.11\n0.5002 2.6605 -906.39\n0.0012014 0.00025943 0.99533\n\nB: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n\nC: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n\nD: 0.38854 -0.073106 92.576\n-0.1986 0.7319 139.21\n-0.00040811 -1.555e-05 0.99988\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.6481 0.070248 -423.11\n0.5002 2.6605 -906.39\n0.0012014 0.00025943 0.99533\n\nB: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n\nC: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n\nD: 0.38854 -0.073106 92.576\n-0.1986 0.7319 139.21\n-0.00040811 -1.555e-05 0.99988\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_13_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_13_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nB: 1.141 -0.024147 186.42\n0.29573 0.97376 -60.872\n0.00082251 -1.0843e-05 0.99973\n\nC: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nD: 1.3308 -0.060097 223.54\n0.17906 0.94189 -10.999\n0.00034146 -4.4675e-05 0.99983\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nB: 1.141 -0.024147 186.42\n0.29573 0.97376 -60.872\n0.00082251 -1.0843e-05 0.99973\n\nC: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nD: 1.3308 -0.060097 223.54\n0.17906 0.94189 -10.999\n0.00034146 -4.4675e-05 0.99983\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_14_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_14_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nB: 0.3184 0.1614 32.607\n0.092973 1.2239 -454.36\n-0.00072537 0.00028453 0.99713\n\nC: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n\nD: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nB: 0.3184 0.1614 32.607\n0.092973 1.2239 -454.36\n-0.00072537 0.00028453 0.99713\n\nC: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n\nD: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_15_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_15_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nB: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nC: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n\nD: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nB: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nC: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n\nD: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_16_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_16_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.13896 0.020204 194.37\n-0.25201 0.63798 118.99\n-0.00052359 2.2762e-05 0.9996\n\nD: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.13896 0.020204 194.37\n-0.25201 0.63798 118.99\n-0.00052359 2.2762e-05 0.9996\n\nD: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_17_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_17_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nB: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nC: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nD: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nB: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nC: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nD: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_18_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_18_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.55616 0.0088234 83.342\n-0.19782 0.70845 195.76\n-0.00029305 -3.175e-05 0.99884\n\nB: 0.85799 0.21669 9.4839\n-0.21177 0.85855 130.48\n1.5015e-06 9.2033e-07 1\n\nC: 5.1051 0.34986 -885.86\n1.0306 5.9768 -2733.1\n0.0033649 0.00099216 1\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.55616 0.0088234 83.342\n-0.19782 0.70845 195.76\n-0.00029305 -3.175e-05 0.99884\n\nB: 0.85799 0.21669 9.4839\n-0.21177 0.85855 130.48\n1.5015e-06 9.2033e-07 1\n\nC: 5.1051 0.34986 -885.86\n1.0306 5.9768 -2733.1\n0.0033649 0.00099216 1\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_19_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_19_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.032608 0.010774 198.34\n-0.16134 0.44659 114.31\n-0.00057725 -5.1566e-07 1.0017\n\nB: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nC: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.032608 0.010774 198.34\n-0.16134 0.44659 114.31\n-0.00057725 -5.1566e-07 1.0017\n\nB: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nC: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_20_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_20_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nB: 0.20876 0.015221 174.06\n-0.13382 0.55012 11.64\n-0.00044084 3.575e-05 1.0177\n\nC: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nD: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nB: 0.20876 0.015221 174.06\n-0.13382 0.55012 11.64\n-0.00044084 3.575e-05 1.0177\n\nC: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nD: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_21_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_21_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nD: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nD: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_22_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_22_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n\nB: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nC: 0.98278 -0.0048237 22.209\n-0.012055 0.97088 45.658\n-7.6753e-06 -2.3467e-05 1.0001\n\nD: 1.6284 1.0346 -954.33\n-0.096789 2.5434 -782.98\n-0.00078653 0.0011044 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n\nB: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nC: 0.98278 -0.0048237 22.209\n-0.012055 0.97088 45.658\n-7.6753e-06 -2.3467e-05 1.0001\n\nD: 1.6284 1.0346 -954.33\n-0.096789 2.5434 -782.98\n-0.00078653 0.0011044 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_23_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_23_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.9207 0.17258 153.68\n0.62581 1.7293 -542.33\n0.0010509 0.0001244 0.99848\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nD: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.9207 0.17258 153.68\n0.62581 1.7293 -542.33\n0.0010509 0.0001244 0.99848\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nD: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_24_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_24_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nB: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nC: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nD: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nB: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nC: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nD: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_25_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_25_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.38914 0.285 169.51\n-0.28531 0.39347 340.1\n-6.4617e-06 5.0341e-06 1\n\nB: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nC: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.38914 0.285 169.51\n-0.28531 0.39347 340.1\n-6.4617e-06 5.0341e-06 1\n\nB: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nC: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_26_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_26_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nB: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nC: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nD: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nB: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nC: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nD: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_27_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_27_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.6996 0.02142 200.09\n0.31149 1.4251 -246.25\n0.00053609 -6.8541e-05 0.99889\n\nB: 0.4591 -0.47767 436.55\n0.46479 0.46941 -27.514\n-2.7182e-05 -1.2668e-06 1.0191\n\nC: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nD: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.6996 0.02142 200.09\n0.31149 1.4251 -246.25\n0.00053609 -6.8541e-05 0.99889\n\nB: 0.4591 -0.47767 436.55\n0.46479 0.46941 -27.514\n-2.7182e-05 -1.2668e-06 1.0191\n\nC: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nD: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_28_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_28_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.13896 0.020204 194.37\n-0.25201 0.63798 118.99\n-0.00052359 2.2762e-05 0.9996\n\nB: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nC: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nD: 0.30367 0.12862 200.05\n-0.12888 0.30356 134.47\n2.6855e-07 -3.4026e-07 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.13896 0.020204 194.37\n-0.25201 0.63798 118.99\n-0.00052359 2.2762e-05 0.9996\n\nB: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nC: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nD: 0.30367 0.12862 200.05\n-0.12888 0.30356 134.47\n2.6855e-07 -3.4026e-07 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_29_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_29_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nB: 0.10472 0.069057 99.841\n-0.17731 0.5329 107.18\n-0.00051255 -1.3734e-05 0.98616\n\nC: 0.76922 -0.28498 222.68\n0.33855 1.0341 -81.069\n0.00035349 1.2014e-05 0.99834\n\nD: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nB: 0.10472 0.069057 99.841\n-0.17731 0.5329 107.18\n-0.00051255 -1.3734e-05 0.98616\n\nC: 0.76922 -0.28498 222.68\n0.33855 1.0341 -81.069\n0.00035349 1.2014e-05 0.99834\n\nD: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_30_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_30_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n\nB: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n\nC: 0.040904 -0.0023332 234.76\n-0.10713 0.35038 218.5\n-0.00028907 6.311e-06 1.0035\n\nD: 0.38266 -0.33125 122.6\n-0.21363 0.61581 225.35\n-0.00034121 -7.7515e-06 0.99865\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n\nB: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n\nC: 0.040904 -0.0023332 234.76\n-0.10713 0.35038 218.5\n-0.00028907 6.311e-06 1.0035\n\nD: 0.38266 -0.33125 122.6\n-0.21363 0.61581 225.35\n-0.00034121 -7.7515e-06 0.99865\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_31_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_31_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.15114 -0.00089399 241.66\n-0.078633 0.45918 14.453\n-0.00033245 3.1152e-05 0.99996\n\nB: 0.87235 0.023622 101.75\n0.12982 0.76075 59.456\n0.0005519 9.0915e-05 1.0016\n\nC: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nD: 0.084461 -0.022036 252.3\n-0.21 0.51325 245.38\n-0.000447 -2.621e-05 1.0009\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.15114 -0.00089399 241.66\n-0.078633 0.45918 14.453\n-0.00033245 3.1152e-05 0.99996\n\nB: 0.87235 0.023622 101.75\n0.12982 0.76075 59.456\n0.0005519 9.0915e-05 1.0016\n\nC: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nD: 0.084461 -0.022036 252.3\n-0.21 0.51325 245.38\n-0.000447 -2.621e-05 1.0009\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_32_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_32_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nB: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nC: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nD: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nB: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nC: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nD: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_33_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_33_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nB: 1.4862 -0.061679 54.577\n0.4606 1.2816 -147.5\n0.0007321 -7.3842e-05 0.99895\n\nC: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nD: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nB: 1.4862 -0.061679 54.577\n0.4606 1.2816 -147.5\n0.0007321 -7.3842e-05 0.99895\n\nC: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nD: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_34_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_34_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.3794 0.089822 49.168\n-0.27745 0.88349 -5.6379\n-0.00046319 5.6849e-05 0.99886\n\nB: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nC: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nD: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.3794 0.089822 49.168\n-0.27745 0.88349 -5.6379\n-0.00046319 5.6849e-05 0.99886\n\nB: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nC: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nD: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_35_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_35_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nB: 0.4591 -0.47767 436.55\n0.46479 0.46941 -27.514\n-2.7182e-05 -1.2668e-06 1.0191\n\nC: 0.83129 0.00294 81.765\n-0.011403 0.83158 63.28\n-7.0021e-06 -1.5701e-05 1\n\nD: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nB: 0.4591 -0.47767 436.55\n0.46479 0.46941 -27.514\n-2.7182e-05 -1.2668e-06 1.0191\n\nC: 0.83129 0.00294 81.765\n-0.011403 0.83158 63.28\n-7.0021e-06 -1.5701e-05 1\n\nD: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_36_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_36_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nC: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n\nD: 1.0063 -0.0054085 288.55\n0.23295 0.84053 7.8206\n0.0005941 1.4583e-05 1.0001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nC: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n\nD: 1.0063 -0.0054085 288.55\n0.23295 0.84053 7.8206\n0.0005941 1.4583e-05 1.0001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_37_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_37_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nB: 0.87235 0.023622 101.75\n0.12982 0.76075 59.456\n0.0005519 9.0915e-05 1.0016\n\nC: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n\nD: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nB: 0.87235 0.023622 101.75\n0.12982 0.76075 59.456\n0.0005519 9.0915e-05 1.0016\n\nC: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n\nD: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_38_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_38_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n\nB: 1.6996 0.02142 200.09\n0.31149 1.4251 -246.25\n0.00053609 -6.8541e-05 0.99889\n\nC: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nD: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n\nB: 1.6996 0.02142 200.09\n0.31149 1.4251 -246.25\n0.00053609 -6.8541e-05 0.99889\n\nC: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nD: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_39_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_39_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n\nB: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nC: 0.30367 0.12862 200.05\n-0.12888 0.30356 134.47\n2.6855e-07 -3.4026e-07 1\n\nD: 0.28973 0.014397 100.07\n-0.29955 0.64174 168.27\n-0.00067332 7.239e-06 1.0017\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n\nB: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nC: 0.30367 0.12862 200.05\n-0.12888 0.30356 134.47\n2.6855e-07 -3.4026e-07 1\n\nD: 0.28973 0.014397 100.07\n-0.29955 0.64174 168.27\n-0.00067332 7.239e-06 1.0017\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_40_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_40_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nB: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nC: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n\nD: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nB: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nC: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n\nD: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_41_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_41_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nB: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nC: 0.62147 0.055609 221.79\n0.21978 1.1561 -23.942\n0.00048557 -4.4311e-05 0.99866\n\nD: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nB: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nC: 0.62147 0.055609 221.79\n0.21978 1.1561 -23.942\n0.00048557 -4.4311e-05 0.99866\n\nD: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_42_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_42_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n\nC: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n\nD: 0.14705 0.061323 72.893\n-0.27582 0.69094 109.44\n-0.00056993 1.3825e-06 0.9981\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n\nC: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n\nD: 0.14705 0.061323 72.893\n-0.27582 0.69094 109.44\n-0.00056993 1.3825e-06 0.9981\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_43_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_43_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.41873 -0.043533 -18.562\n-0.27021 0.88041 53.791\n-0.00050299 -2.2546e-05 0.99941\n\nB: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nC: 1.1884 0.015274 95.776\n0.23282 1.0681 -20.551\n0.00097623 0.00015903 1.0014\n\nD: 0.3794 0.089822 49.168\n-0.27745 0.88349 -5.6379\n-0.00046319 5.6849e-05 0.99886\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.41873 -0.043533 -18.562\n-0.27021 0.88041 53.791\n-0.00050299 -2.2546e-05 0.99941\n\nB: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nC: 1.1884 0.015274 95.776\n0.23282 1.0681 -20.551\n0.00097623 0.00015903 1.0014\n\nD: 0.3794 0.089822 49.168\n-0.27745 0.88349 -5.6379\n-0.00046319 5.6849e-05 0.99886\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_44_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_44_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n\nB: 4.3722 0.14407 -818.24\n-0.25209 3.9595 -549.15\n0.001718 0.0010825 0.97985\n\nC: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nD: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n\nB: 4.3722 0.14407 -818.24\n-0.25209 3.9595 -549.15\n0.001718 0.0010825 0.97985\n\nC: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nD: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_45_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_45_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nB: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n\nC: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nD: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nB: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n\nC: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nD: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_46_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_46_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nB: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nC: 1.0505 -0.0053825 276.45\n0.20631 0.92888 48.832\n0.00048841 -1.9251e-05 0.99878\n\nD: 0.62147 0.055609 221.79\n0.21978 1.1561 -23.942\n0.00048557 -4.4311e-05 0.99866\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nB: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nC: 1.0505 -0.0053825 276.45\n0.20631 0.92888 48.832\n0.00048841 -1.9251e-05 0.99878\n\nD: 0.62147 0.055609 221.79\n0.21978 1.1561 -23.942\n0.00048557 -4.4311e-05 0.99866\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_47_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_47_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nB: 2.5614 0.083075 163.07\n0.94137 2.2586 -732.08\n0.0017783 2.1603e-05 0.99316\n\nC: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nD: 0.15114 -0.00089399 241.66\n-0.078633 0.45918 14.453\n-0.00033245 3.1152e-05 0.99996\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nB: 2.5614 0.083075 163.07\n0.94137 2.2586 -732.08\n0.0017783 2.1603e-05 0.99316\n\nC: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nD: 0.15114 -0.00089399 241.66\n-0.078633 0.45918 14.453\n-0.00033245 3.1152e-05 0.99996\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_48_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_48_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nB: 3.1418 0.21701 -576.91\n0.129 3.5039 -1062.5\n0.0014143 0.00082533 0.98844\n\nC: 0.70161 0.023304 -1.9207\n-0.10366 0.81239 71.251\n-0.00023167 -1.5062e-05 0.99976\n\nD: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nB: 3.1418 0.21701 -576.91\n0.129 3.5039 -1062.5\n0.0014143 0.00082533 0.98844\n\nC: 0.70161 0.023304 -1.9207\n-0.10366 0.81239 71.251\n-0.00023167 -1.5062e-05 0.99976\n\nD: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_49_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_49_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.35568 0.079611 -21.49\n-0.17793 0.7199 62.24\n-0.00050458 1.9913e-05 0.9982\n\nB: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nC: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nD: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.35568 0.079611 -21.49\n-0.17793 0.7199 62.24\n-0.00050458 1.9913e-05 0.9982\n\nB: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nC: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nD: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_50_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_50_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nB: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n\nC: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nD: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nB: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n\nC: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nD: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_51_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_51_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.42945 0.0071566 96.266\n-0.019537 0.48377 43.049\n-7.8698e-05 1.6013e-05 1.0001\n\nD: 0.70161 0.023304 -1.9207\n-0.10366 0.81239 71.251\n-0.00023167 -1.5062e-05 0.99976\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.42945 0.0071566 96.266\n-0.019537 0.48377 43.049\n-7.8698e-05 1.6013e-05 1.0001\n\nD: 0.70161 0.023304 -1.9207\n-0.10366 0.81239 71.251\n-0.00023167 -1.5062e-05 0.99976\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_52_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_52_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nB: 0.73597 -0.0032436 13.11\n0.017092 0.71039 36.002\n5.8878e-05 -9.3828e-06 0.99995\n\nC: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nD: 0.52949 -0.028655 46.849\n-0.2451 0.79991 158.44\n-0.00032499 -1.8164e-05 0.99959\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nB: 0.73597 -0.0032436 13.11\n0.017092 0.71039 36.002\n5.8878e-05 -9.3828e-06 0.99995\n\nC: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nD: 0.52949 -0.028655 46.849\n-0.2451 0.79991 158.44\n-0.00032499 -1.8164e-05 0.99959\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_53_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_53_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.32788 -0.00026656 168.52\n-0.087696 0.49289 72.043\n-0.00025798 4.6006e-06 0.9984\n\nB: 0.84581 -0.039469 34.117\n-0.067529 0.81703 142.37\n-0.00011408 -0.00014793 1.0014\n\nC: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n\nD: 0.38914 0.285 169.51\n-0.28531 0.39347 340.1\n-6.4617e-06 5.0341e-06 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.32788 -0.00026656 168.52\n-0.087696 0.49289 72.043\n-0.00025798 4.6006e-06 0.9984\n\nB: 0.84581 -0.039469 34.117\n-0.067529 0.81703 142.37\n-0.00011408 -0.00014793 1.0014\n\nC: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n\nD: 0.38914 0.285 169.51\n-0.28531 0.39347 340.1\n-6.4617e-06 5.0341e-06 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_54_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_54_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n\nB: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nC: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n\nD: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n\nB: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nC: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n\nD: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_55_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_55_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.85799 0.21669 9.4839\n-0.21177 0.85855 130.48\n1.5015e-06 9.2033e-07 1\n\nB: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nC: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n\nD: 2.3594 0.0026252 -116.05\n0.5085 2.302 -550.96\n0.0013826 0.0001837 1.0004\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.85799 0.21669 9.4839\n-0.21177 0.85855 130.48\n1.5015e-06 9.2033e-07 1\n\nB: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nC: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n\nD: 2.3594 0.0026252 -116.05\n0.5085 2.302 -550.96\n0.0013826 0.0001837 1.0004\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_56_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_56_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 2.1479 0.036813 206.94\n0.67819 1.8174 -485.8\n0.0012074 -6.8043e-06 0.99599\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 2.1479 0.036813 206.94\n0.67819 1.8174 -485.8\n0.0012074 -6.8043e-06 0.99599\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_57_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_57_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nB: 2.6481 0.070248 -423.11\n0.5002 2.6605 -906.39\n0.0012014 0.00025943 0.99533\n\nC: 1.0035 -0.00055314 2.5255\n-0.0028717 1.0087 -9.7285\n-3.8783e-06 3.4244e-06 1\n\nD: 2.5614 0.083075 163.07\n0.94137 2.2586 -732.08\n0.0017783 2.1603e-05 0.99316\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nB: 2.6481 0.070248 -423.11\n0.5002 2.6605 -906.39\n0.0012014 0.00025943 0.99533\n\nC: 1.0035 -0.00055314 2.5255\n-0.0028717 1.0087 -9.7285\n-3.8783e-06 3.4244e-06 1\n\nD: 2.5614 0.083075 163.07\n0.94137 2.2586 -732.08\n0.0017783 2.1603e-05 0.99316\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_58_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_58_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n\nB: 1.4219 0.01866 342.44\n0.36005 1.3261 -141.73\n0.00090969 2.3838e-05 1.0002\n\nC: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n\nD: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n\nB: 1.4219 0.01866 342.44\n0.36005 1.3261 -141.73\n0.00090969 2.3838e-05 1.0002\n\nC: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n\nD: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_59_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_59_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nD: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nD: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_60_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_60_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nB: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nC: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nD: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nB: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nC: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nD: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_61_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_61_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nB: 0.2024 0.0033266 96.15\n-0.28093 0.65512 201.73\n-0.00049784 1.8106e-06 1.0048\n\nC: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nD: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nB: 0.2024 0.0033266 96.15\n-0.28093 0.65512 201.73\n-0.00049784 1.8106e-06 1.0048\n\nC: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nD: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_62_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_62_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.8771 0.00026849 -1.1131\n-0.035484 0.88589 36.525\n-7.7192e-05 -1.833e-05 1\n\nB: 0.4591 -0.47767 436.55\n0.46479 0.46941 -27.514\n-2.7182e-05 -1.2668e-06 1.0191\n\nC: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nD: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.8771 0.00026849 -1.1131\n-0.035484 0.88589 36.525\n-7.7192e-05 -1.833e-05 1\n\nB: 0.4591 -0.47767 436.55\n0.46479 0.46941 -27.514\n-2.7182e-05 -1.2668e-06 1.0191\n\nC: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nD: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_63_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_63_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.49838 -0.015725 33.278\n-0.18045 0.77392 59.799\n-0.00064863 -4.2793e-05 0.99978\n\nD: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.49838 -0.015725 33.278\n-0.18045 0.77392 59.799\n-0.00064863 -4.2793e-05 0.99978\n\nD: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_64_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_64_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n\nB: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n\nC: 0.48882 0.0079397 13.575\n-0.24956 0.69593 149.6\n-0.00053246 -7.8574e-06 1.0026\n\nD: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n\nB: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n\nC: 0.48882 0.0079397 13.575\n-0.24956 0.69593 149.6\n-0.00053246 -7.8574e-06 1.0026\n\nD: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_65_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_65_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.0019 0.045013 144.39\n0.13277 0.95284 -14.111\n0.0002066 5.2875e-05 1\n\nB: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nC: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nD: 0.37618 -0.0026073 58.013\n-0.13988 0.81886 117.4\n-0.00032276 -1.1378e-05 0.99983\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0019 0.045013 144.39\n0.13277 0.95284 -14.111\n0.0002066 5.2875e-05 1\n\nB: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nC: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nD: 0.37618 -0.0026073 58.013\n-0.13988 0.81886 117.4\n-0.00032276 -1.1378e-05 0.99983\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_66_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_66_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nB: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n\nC: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n\nD: 1.141 -0.024147 186.42\n0.29573 0.97376 -60.872\n0.00082251 -1.0843e-05 0.99973\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nB: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n\nC: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n\nD: 1.141 -0.024147 186.42\n0.29573 0.97376 -60.872\n0.00082251 -1.0843e-05 0.99973\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_67_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_67_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nB: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nC: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nD: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nB: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nC: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nD: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_68_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_68_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.0983 -0.030393 111.31\n0.31879 0.9789 58.516\n0.00050073 -5.3943e-05 1.0005\n\nB: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nC: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nD: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0983 -0.030393 111.31\n0.31879 0.9789 58.516\n0.00050073 -5.3943e-05 1.0005\n\nB: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nC: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nD: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_69_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_69_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nB: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nC: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nB: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nC: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_70_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_70_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.49838 -0.015725 33.278\n-0.18045 0.77392 59.799\n-0.00064863 -4.2793e-05 0.99978\n\nB: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n\nC: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n\nD: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.49838 -0.015725 33.278\n-0.18045 0.77392 59.799\n-0.00064863 -4.2793e-05 0.99978\n\nB: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n\nC: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n\nD: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_71_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_71_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nB: 0.73597 -0.0032436 13.11\n0.017092 0.71039 36.002\n5.8878e-05 -9.3828e-06 0.99995\n\nC: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nD: 1.9834 -0.0016422 376.55\n0.84 1.4832 -241.61\n0.0019136 -3.8955e-05 1.0014\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nB: 0.73597 -0.0032436 13.11\n0.017092 0.71039 36.002\n5.8878e-05 -9.3828e-06 0.99995\n\nC: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nD: 1.9834 -0.0016422 376.55\n0.84 1.4832 -241.61\n0.0019136 -3.8955e-05 1.0014\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_72_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_72_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nB: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nC: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nD: 1.0063 -0.0054085 288.55\n0.23295 0.84053 7.8206\n0.0005941 1.4583e-05 1.0001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nB: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nC: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nD: 1.0063 -0.0054085 288.55\n0.23295 0.84053 7.8206\n0.0005941 1.4583e-05 1.0001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_73_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_73_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nC: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n\nD: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nC: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n\nD: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_74_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_74_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.43124 0.047668 -66.525\n-0.34772 0.62068 209.27\n-0.00060194 -2.1104e-07 0.98648\n\nB: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n\nC: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nD: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.43124 0.047668 -66.525\n-0.34772 0.62068 209.27\n-0.00060194 -2.1104e-07 0.98648\n\nB: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n\nC: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nD: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_75_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_75_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.547 0.11677 155.75\n0.40373 1.373 -170.1\n0.00090791 8.8782e-05 1.0012\n\nB: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n\nC: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nD: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.547 0.11677 155.75\n0.40373 1.373 -170.1\n0.00090791 8.8782e-05 1.0012\n\nB: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n\nC: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nD: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_76_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_76_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.0033111 0.031282 184.63\n-0.15843 0.75999 4.5609\n-0.00083562 0.00011238 0.99927\n\nB: 0.88632 -0.012492 -136.92\n-0.047209 1.0157 42.178\n-0.0001423 1.8595e-05 1.0005\n\nC: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nD: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.0033111 0.031282 184.63\n-0.15843 0.75999 4.5609\n-0.00083562 0.00011238 0.99927\n\nB: 0.88632 -0.012492 -136.92\n-0.047209 1.0157 42.178\n-0.0001423 1.8595e-05 1.0005\n\nC: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nD: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_77_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_77_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nB: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nC: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n\nD: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nB: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nC: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n\nD: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_78_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_78_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.55616 0.0088234 83.342\n-0.19782 0.70845 195.76\n-0.00029305 -3.175e-05 0.99884\n\nB: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nC: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nD: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.55616 0.0088234 83.342\n-0.19782 0.70845 195.76\n-0.00029305 -3.175e-05 0.99884\n\nB: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nC: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nD: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_79_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_79_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nB: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nC: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n\nD: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nB: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nC: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n\nD: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_80_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_80_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nD: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nD: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_81_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_81_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.52064 0.019326 41.006\n-0.1476 0.75468 101.84\n-0.00026848 4.5639e-05 1.0094\n\nB: 1.6284 1.0346 -954.33\n-0.096789 2.5434 -782.98\n-0.00078653 0.0011044 1\n\nC: 0.1176 -0.0075311 194.61\n-0.10067 0.3391 257.1\n-0.00023555 -9.6091e-06 0.99858\n\nD: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.52064 0.019326 41.006\n-0.1476 0.75468 101.84\n-0.00026848 4.5639e-05 1.0094\n\nB: 1.6284 1.0346 -954.33\n-0.096789 2.5434 -782.98\n-0.00078653 0.0011044 1\n\nC: 0.1176 -0.0075311 194.61\n-0.10067 0.3391 257.1\n-0.00023555 -9.6091e-06 0.99858\n\nD: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_82_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_82_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 0.35568 0.079611 -21.49\n-0.17793 0.7199 62.24\n-0.00050458 1.9913e-05 0.9982\n\nD: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 0.35568 0.079611 -21.49\n-0.17793 0.7199 62.24\n-0.00050458 1.9913e-05 0.9982\n\nD: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_83_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_83_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n\nB: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nC: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nD: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n\nB: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nC: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nD: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_84_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_84_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nB: 1.5534 0.017684 158.94\n0.56083 1.4841 -343.65\n0.0010107 3.8363e-05 0.99895\n\nC: 1.4272 0.064496 -40.82\n0.15764 1.3161 -94.847\n0.00037033 4.6015e-05 0.99258\n\nD: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nB: 1.5534 0.017684 158.94\n0.56083 1.4841 -343.65\n0.0010107 3.8363e-05 0.99895\n\nC: 1.4272 0.064496 -40.82\n0.15764 1.3161 -94.847\n0.00037033 4.6015e-05 0.99258\n\nD: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_85_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_85_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 2.308 -0.067061 201.09\n0.71494 1.8702 -412.16\n0.0015273 -1.6972e-05 1.0162\n\nB: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n\nC: 0.46288 -0.016626 22.437\n-0.26713 0.81047 151.27\n-0.00036789 7.646e-06 0.99855\n\nD: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.308 -0.067061 201.09\n0.71494 1.8702 -412.16\n0.0015273 -1.6972e-05 1.0162\n\nB: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n\nC: 0.46288 -0.016626 22.437\n-0.26713 0.81047 151.27\n-0.00036789 7.646e-06 0.99855\n\nD: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_86_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_86_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nD: 1.0063 -0.0054085 288.55\n0.23295 0.84053 7.8206\n0.0005941 1.4583e-05 1.0001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nD: 1.0063 -0.0054085 288.55\n0.23295 0.84053 7.8206\n0.0005941 1.4583e-05 1.0001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_87_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_87_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n\nB: 0.31269 -0.011782 51.842\n-0.22276 0.71181 65.24\n-0.00081452 -4.173e-05 0.99309\n\nC: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n\nD: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n\nB: 0.31269 -0.011782 51.842\n-0.22276 0.71181 65.24\n-0.00081452 -4.173e-05 0.99309\n\nC: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n\nD: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_88_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_88_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nB: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nC: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nD: 1.1346 -0.16977 -78.128\n-0.0017173 0.8512 -82.973\n8.0333e-07 -0.00031449 0.99917\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nB: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nC: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nD: 1.1346 -0.16977 -78.128\n-0.0017173 0.8512 -82.973\n8.0333e-07 -0.00031449 0.99917\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_89_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_89_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nB: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nC: 0.51123 -0.013639 59.603\n-0.16055 0.85238 103.24\n-0.0003334 -4.0403e-05 1.0009\n\nD: 1.1202 -0.0055862 43.04\n0.17566 1.0194 -5.6786\n0.00085767 -4.4625e-05 0.99922\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nB: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nC: 0.51123 -0.013639 59.603\n-0.16055 0.85238 103.24\n-0.0003334 -4.0403e-05 1.0009\n\nD: 1.1202 -0.0055862 43.04\n0.17566 1.0194 -5.6786\n0.00085767 -4.4625e-05 0.99922\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_90_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_90_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nB: 0.10472 0.069057 99.841\n-0.17731 0.5329 107.18\n-0.00051255 -1.3734e-05 0.98616\n\nC: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nD: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nB: 0.10472 0.069057 99.841\n-0.17731 0.5329 107.18\n-0.00051255 -1.3734e-05 0.98616\n\nC: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nD: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_91_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_91_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 2.3594 0.0026252 -116.05\n0.5085 2.302 -550.96\n0.0013826 0.0001837 1.0004\n\nB: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nC: 0.25611 0.0594 88.294\n-0.24702 0.7663 71.53\n-0.00048162 6.7687e-05 1.0008\n\nD: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.3594 0.0026252 -116.05\n0.5085 2.302 -550.96\n0.0013826 0.0001837 1.0004\n\nB: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nC: 0.25611 0.0594 88.294\n-0.24702 0.7663 71.53\n-0.00048162 6.7687e-05 1.0008\n\nD: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_92_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_92_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nB: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n\nC: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nD: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nB: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n\nC: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nD: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_93_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_93_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nB: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nC: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nD: 0.46461 0.085196 589.33\n0.19659 0.76327 25.833\n0.00026763 8.9486e-05 1.0006\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nB: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nC: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nD: 0.46461 0.085196 589.33\n0.19659 0.76327 25.833\n0.00026763 8.9486e-05 1.0006\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_94_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_94_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.62147 0.055609 221.79\n0.21978 1.1561 -23.942\n0.00048557 -4.4311e-05 0.99866\n\nD: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.62147 0.055609 221.79\n0.21978 1.1561 -23.942\n0.00048557 -4.4311e-05 0.99866\n\nD: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_95_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_95_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nB: 1.4219 0.01866 342.44\n0.36005 1.3261 -141.73\n0.00090969 2.3838e-05 1.0002\n\nC: 0.38854 -0.073106 92.576\n-0.1986 0.7319 139.21\n-0.00040811 -1.555e-05 0.99988\n\nD: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nB: 1.4219 0.01866 342.44\n0.36005 1.3261 -141.73\n0.00090969 2.3838e-05 1.0002\n\nC: 0.38854 -0.073106 92.576\n-0.1986 0.7319 139.21\n-0.00040811 -1.555e-05 0.99988\n\nD: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_96_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_96_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n\nB: 3.1418 0.21701 -576.91\n0.129 3.5039 -1062.5\n0.0014143 0.00082533 0.98844\n\nC: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nD: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n\nB: 3.1418 0.21701 -576.91\n0.129 3.5039 -1062.5\n0.0014143 0.00082533 0.98844\n\nC: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nD: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_97_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_97_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 1.5534 0.017684 158.94\n0.56083 1.4841 -343.65\n0.0010107 3.8363e-05 0.99895\n\nD: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 1.5534 0.017684 158.94\n0.56083 1.4841 -343.65\n0.0010107 3.8363e-05 0.99895\n\nD: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_98_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_98_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nB: 1.4862 -0.061679 54.577\n0.4606 1.2816 -147.5\n0.0007321 -7.3842e-05 0.99895\n\nC: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nD: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nB: 1.4862 -0.061679 54.577\n0.4606 1.2816 -147.5\n0.0007321 -7.3842e-05 0.99895\n\nC: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nD: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_99_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_99_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nB: 0.1176 -0.0075311 194.61\n-0.10067 0.3391 257.1\n-0.00023555 -9.6091e-06 0.99858\n\nC: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n\nD: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nB: 0.1176 -0.0075311 194.61\n-0.10067 0.3391 257.1\n-0.00023555 -9.6091e-06 0.99858\n\nC: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n\nD: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_100_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_100_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nB: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nC: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nD: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nB: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nC: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nD: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_101_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_101_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nB: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n\nC: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nD: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nB: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n\nC: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nD: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_102_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_102_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nB: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nC: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n\nD: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nB: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nC: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n\nD: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_103_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_103_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 5.1051 0.34986 -885.86\n1.0306 5.9768 -2733.1\n0.0033649 0.00099216 1\n\nB: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n\nC: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nD: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 5.1051 0.34986 -885.86\n1.0306 5.9768 -2733.1\n0.0033649 0.00099216 1\n\nB: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n\nC: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nD: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_104_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_104_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nB: 0.60879 -0.35761 289.93\n0.34822 0.61653 -30.949\n-2.0912e-05 1.3527e-06 1.014\n\nC: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nD: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nB: 0.60879 -0.35761 289.93\n0.34822 0.61653 -30.949\n-2.0912e-05 1.3527e-06 1.014\n\nC: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nD: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_105_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_105_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.4733 -0.014435 76.772\n0.25007 1.2556 -120.81\n0.00088206 8.1414e-05 1.002\n\nB: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nC: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4733 -0.014435 76.772\n0.25007 1.2556 -120.81\n0.00088206 8.1414e-05 1.002\n\nB: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nC: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_106_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_106_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nB: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nC: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nB: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nC: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_107_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_107_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.0505 -0.0053825 276.45\n0.20631 0.92888 48.832\n0.00048841 -1.9251e-05 0.99878\n\nB: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n\nC: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nD: 0.20876 0.015221 174.06\n-0.13382 0.55012 11.64\n-0.00044084 3.575e-05 1.0177\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0505 -0.0053825 276.45\n0.20631 0.92888 48.832\n0.00048841 -1.9251e-05 0.99878\n\nB: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n\nC: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nD: 0.20876 0.015221 174.06\n-0.13382 0.55012 11.64\n-0.00044084 3.575e-05 1.0177\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_108_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_108_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.141 -0.024147 186.42\n0.29573 0.97376 -60.872\n0.00082251 -1.0843e-05 0.99973\n\nB: 0.42945 0.0071566 96.266\n-0.019537 0.48377 43.049\n-7.8698e-05 1.6013e-05 1.0001\n\nC: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.141 -0.024147 186.42\n0.29573 0.97376 -60.872\n0.00082251 -1.0843e-05 0.99973\n\nB: 0.42945 0.0071566 96.266\n-0.019537 0.48377 43.049\n-7.8698e-05 1.6013e-05 1.0001\n\nC: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_109_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_109_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.040904 -0.0023332 234.76\n-0.10713 0.35038 218.5\n-0.00028907 6.311e-06 1.0035\n\nB: 0.38914 0.285 169.51\n-0.28531 0.39347 340.1\n-6.4617e-06 5.0341e-06 1\n\nC: 0.0033111 0.031282 184.63\n-0.15843 0.75999 4.5609\n-0.00083562 0.00011238 0.99927\n\nD: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.040904 -0.0023332 234.76\n-0.10713 0.35038 218.5\n-0.00028907 6.311e-06 1.0035\n\nB: 0.38914 0.285 169.51\n-0.28531 0.39347 340.1\n-6.4617e-06 5.0341e-06 1\n\nC: 0.0033111 0.031282 184.63\n-0.15843 0.75999 4.5609\n-0.00083562 0.00011238 0.99927\n\nD: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_110_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_110_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nB: 0.42186 0.031568 60.169\n-0.084563 0.88575 93.738\n-0.00032749 1.4457e-05 1.0012\n\nC: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n\nD: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nB: 0.42186 0.031568 60.169\n-0.084563 0.88575 93.738\n-0.00032749 1.4457e-05 1.0012\n\nC: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n\nD: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_111_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_111_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nB: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nC: 0.13896 0.020204 194.37\n-0.25201 0.63798 118.99\n-0.00052359 2.2762e-05 0.9996\n\nD: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nB: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nC: 0.13896 0.020204 194.37\n-0.25201 0.63798 118.99\n-0.00052359 2.2762e-05 0.9996\n\nD: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_112_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_112_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nC: 0.46288 -0.016626 22.437\n-0.26713 0.81047 151.27\n-0.00036789 7.646e-06 0.99855\n\nD: 0.14586 0.056449 119.48\n-0.21737 0.71439 95.786\n-0.00051182 3.3282e-05 1.0008\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nC: 0.46288 -0.016626 22.437\n-0.26713 0.81047 151.27\n-0.00036789 7.646e-06 0.99855\n\nD: 0.14586 0.056449 119.48\n-0.21737 0.71439 95.786\n-0.00051182 3.3282e-05 1.0008\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_113_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_113_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nB: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n\nC: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nD: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nB: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n\nC: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nD: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_114_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_114_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nB: 0.2024 0.0033266 96.15\n-0.28093 0.65512 201.73\n-0.00049784 1.8106e-06 1.0048\n\nC: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nD: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nB: 0.2024 0.0033266 96.15\n-0.28093 0.65512 201.73\n-0.00049784 1.8106e-06 1.0048\n\nC: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nD: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_115_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_115_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nB: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n\nC: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nD: 0.14705 0.061323 72.893\n-0.27582 0.69094 109.44\n-0.00056993 1.3825e-06 0.9981\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nB: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n\nC: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nD: 0.14705 0.061323 72.893\n-0.27582 0.69094 109.44\n-0.00056993 1.3825e-06 0.9981\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_116_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_116_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n\nB: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n\nC: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nD: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n\nB: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n\nC: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nD: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_117_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_117_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nB: 0.20876 0.015221 174.06\n-0.13382 0.55012 11.64\n-0.00044084 3.575e-05 1.0177\n\nC: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nD: 0.51123 -0.013639 59.603\n-0.16055 0.85238 103.24\n-0.0003334 -4.0403e-05 1.0009\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nB: 0.20876 0.015221 174.06\n-0.13382 0.55012 11.64\n-0.00044084 3.575e-05 1.0177\n\nC: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nD: 0.51123 -0.013639 59.603\n-0.16055 0.85238 103.24\n-0.0003334 -4.0403e-05 1.0009\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_118_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_118_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nB: 1.4272 0.064496 -40.82\n0.15764 1.3161 -94.847\n0.00037033 4.6015e-05 0.99258\n\nC: 14.984 -1.5209 -1987.5\n0.59203 13.878 -3896.8\n0.0072047 0.0038814 0.92614\n\nD: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nB: 1.4272 0.064496 -40.82\n0.15764 1.3161 -94.847\n0.00037033 4.6015e-05 0.99258\n\nC: 14.984 -1.5209 -1987.5\n0.59203 13.878 -3896.8\n0.0072047 0.0038814 0.92614\n\nD: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_119_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_119_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.4733 -0.014435 76.772\n0.25007 1.2556 -120.81\n0.00088206 8.1414e-05 1.002\n\nB: 0.10472 0.069057 99.841\n-0.17731 0.5329 107.18\n-0.00051255 -1.3734e-05 0.98616\n\nC: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nD: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4733 -0.014435 76.772\n0.25007 1.2556 -120.81\n0.00088206 8.1414e-05 1.002\n\nB: 0.10472 0.069057 99.841\n-0.17731 0.5329 107.18\n-0.00051255 -1.3734e-05 0.98616\n\nC: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nD: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_120_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_120_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.43124 0.047668 -66.525\n-0.34772 0.62068 209.27\n-0.00060194 -2.1104e-07 0.98648\n\nB: 1.1202 -0.0055862 43.04\n0.17566 1.0194 -5.6786\n0.00085767 -4.4625e-05 0.99922\n\nC: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.43124 0.047668 -66.525\n-0.34772 0.62068 209.27\n-0.00060194 -2.1104e-07 0.98648\n\nB: 1.1202 -0.0055862 43.04\n0.17566 1.0194 -5.6786\n0.00085767 -4.4625e-05 0.99922\n\nC: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_121_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_121_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nB: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nC: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nD: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nB: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nC: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nD: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_122_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_122_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 2.9721 0.034514 6.1536\n0.86739 2.9829 -532.95\n0.0035453 0.00017204 0.95976\n\nC: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nD: 2.1479 0.036813 206.94\n0.67819 1.8174 -485.8\n0.0012074 -6.8043e-06 0.99599\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 2.9721 0.034514 6.1536\n0.86739 2.9829 -532.95\n0.0035453 0.00017204 0.95976\n\nC: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nD: 2.1479 0.036813 206.94\n0.67819 1.8174 -485.8\n0.0012074 -6.8043e-06 0.99599\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_123_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_123_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nB: 1.547 0.11677 155.75\n0.40373 1.373 -170.1\n0.00090791 8.8782e-05 1.0012\n\nC: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nD: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nB: 1.547 0.11677 155.75\n0.40373 1.373 -170.1\n0.00090791 8.8782e-05 1.0012\n\nC: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nD: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_124_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_124_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nB: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nC: 0.52949 -0.028655 46.849\n-0.2451 0.79991 158.44\n-0.00032499 -1.8164e-05 0.99959\n\nD: 0.52064 0.019326 41.006\n-0.1476 0.75468 101.84\n-0.00026848 4.5639e-05 1.0094\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nB: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nC: 0.52949 -0.028655 46.849\n-0.2451 0.79991 158.44\n-0.00032499 -1.8164e-05 0.99959\n\nD: 0.52064 0.019326 41.006\n-0.1476 0.75468 101.84\n-0.00026848 4.5639e-05 1.0094\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_125_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_125_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.9834 -0.0016422 376.55\n0.84 1.4832 -241.61\n0.0019136 -3.8955e-05 1.0014\n\nB: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.9834 -0.0016422 376.55\n0.84 1.4832 -241.61\n0.0019136 -3.8955e-05 1.0014\n\nB: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_126_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_126_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.0983 -0.030393 111.31\n0.31879 0.9789 58.516\n0.00050073 -5.3943e-05 1.0005\n\nB: 0.76922 -0.28498 222.68\n0.33855 1.0341 -81.069\n0.00035349 1.2014e-05 0.99834\n\nC: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n\nD: 0.39176 -0.48622 421.69\n0.48543 0.39488 -0.097812\n2.3979e-06 -3.3236e-06 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0983 -0.030393 111.31\n0.31879 0.9789 58.516\n0.00050073 -5.3943e-05 1.0005\n\nB: 0.76922 -0.28498 222.68\n0.33855 1.0341 -81.069\n0.00035349 1.2014e-05 0.99834\n\nC: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n\nD: 0.39176 -0.48622 421.69\n0.48543 0.39488 -0.097812\n2.3979e-06 -3.3236e-06 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_127_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_127_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.6729 -0.01895 127.73\n-0.015916 0.67847 176.42\n-3.6225e-05 -3.2204e-05 1\n\nB: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nC: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nD: 0.14586 0.056449 119.48\n-0.21737 0.71439 95.786\n-0.00051182 3.3282e-05 1.0008\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.6729 -0.01895 127.73\n-0.015916 0.67847 176.42\n-3.6225e-05 -3.2204e-05 1\n\nB: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nC: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nD: 0.14586 0.056449 119.48\n-0.21737 0.71439 95.786\n-0.00051182 3.3282e-05 1.0008\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_128_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_128_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.28973 0.014397 100.07\n-0.29955 0.64174 168.27\n-0.00067332 7.239e-06 1.0017\n\nB: 0.37618 -0.0026073 58.013\n-0.13988 0.81886 117.4\n-0.00032276 -1.1378e-05 0.99983\n\nC: 0.39176 -0.48622 421.69\n0.48543 0.39488 -0.097812\n2.3979e-06 -3.3236e-06 1\n\nD: -0.19998 0.34647 247.36\n-0.34607 -0.19989 467.21\n2.0354e-07 -5.1701e-08 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.28973 0.014397 100.07\n-0.29955 0.64174 168.27\n-0.00067332 7.239e-06 1.0017\n\nB: 0.37618 -0.0026073 58.013\n-0.13988 0.81886 117.4\n-0.00032276 -1.1378e-05 0.99983\n\nC: 0.39176 -0.48622 421.69\n0.48543 0.39488 -0.097812\n2.3979e-06 -3.3236e-06 1\n\nD: -0.19998 0.34647 247.36\n-0.34607 -0.19989 467.21\n2.0354e-07 -5.1701e-08 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_129_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_129_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.2024 0.0033266 96.15\n-0.28093 0.65512 201.73\n-0.00049784 1.8106e-06 1.0048\n\nB: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n\nC: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n\nD: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.2024 0.0033266 96.15\n-0.28093 0.65512 201.73\n-0.00049784 1.8106e-06 1.0048\n\nB: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n\nC: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n\nD: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_130_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_130_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nB: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n\nC: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n\nD: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nB: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n\nC: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n\nD: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_131_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_131_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 3.1627 -0.045434 -351.49\n0.7877 2.8197 -842.9\n0.0015033 -3.676e-05 1.0055\n\nB: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nC: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n\nD: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 3.1627 -0.045434 -351.49\n0.7877 2.8197 -842.9\n0.0015033 -3.676e-05 1.0055\n\nB: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nC: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n\nD: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_132_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_132_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.46461 0.085196 589.33\n0.19659 0.76327 25.833\n0.00026763 8.9486e-05 1.0006\n\nB: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n\nC: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n\nD: 0.35568 0.079611 -21.49\n-0.17793 0.7199 62.24\n-0.00050458 1.9913e-05 0.9982\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.46461 0.085196 589.33\n0.19659 0.76327 25.833\n0.00026763 8.9486e-05 1.0006\n\nB: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n\nC: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n\nD: 0.35568 0.079611 -21.49\n-0.17793 0.7199 62.24\n-0.00050458 1.9913e-05 0.9982\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_133_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_133_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n\nB: 0.32788 -0.00026656 168.52\n-0.087696 0.49289 72.043\n-0.00025798 4.6006e-06 0.9984\n\nC: 1.3526 0.026797 436.87\n0.31517 1.3826 -234.04\n0.00076901 0.00022984 1.0039\n\nD: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n\nB: 0.32788 -0.00026656 168.52\n-0.087696 0.49289 72.043\n-0.00025798 4.6006e-06 0.9984\n\nC: 1.3526 0.026797 436.87\n0.31517 1.3826 -234.04\n0.00076901 0.00022984 1.0039\n\nD: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_134_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_134_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nC: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nD: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nC: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nD: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_135_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_135_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.39176 -0.48622 421.69\n0.48543 0.39488 -0.097812\n2.3979e-06 -3.3236e-06 1\n\nB: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nC: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nD: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.39176 -0.48622 421.69\n0.48543 0.39488 -0.097812\n2.3979e-06 -3.3236e-06 1\n\nB: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nC: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nD: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_136_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_136_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nB: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nC: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nD: 1.6284 1.0346 -954.33\n-0.096789 2.5434 -782.98\n-0.00078653 0.0011044 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nB: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nC: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nD: 1.6284 1.0346 -954.33\n-0.096789 2.5434 -782.98\n-0.00078653 0.0011044 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_137_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_137_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.38266 -0.33125 122.6\n-0.21363 0.61581 225.35\n-0.00034121 -7.7515e-06 0.99865\n\nB: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nC: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nD: 0.4221 -0.055916 265.09\n0.060544 0.41967 174.7\n7.7273e-06 -2.0972e-06 0.99999\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.38266 -0.33125 122.6\n-0.21363 0.61581 225.35\n-0.00034121 -7.7515e-06 0.99865\n\nB: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nC: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nD: 0.4221 -0.055916 265.09\n0.060544 0.41967 174.7\n7.7273e-06 -2.0972e-06 0.99999\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_138_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_138_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 2.5614 0.083075 163.07\n0.94137 2.2586 -732.08\n0.0017783 2.1603e-05 0.99316\n\nB: 0.37618 -0.0026073 58.013\n-0.13988 0.81886 117.4\n-0.00032276 -1.1378e-05 0.99983\n\nC: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.5614 0.083075 163.07\n0.94137 2.2586 -732.08\n0.0017783 2.1603e-05 0.99316\n\nB: 0.37618 -0.0026073 58.013\n-0.13988 0.81886 117.4\n-0.00032276 -1.1378e-05 0.99983\n\nC: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_139_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_139_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nB: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nC: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nD: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nB: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nC: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nD: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_140_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_140_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 3.1627 -0.045434 -351.49\n0.7877 2.8197 -842.9\n0.0015033 -3.676e-05 1.0055\n\nB: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nC: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n\nD: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 3.1627 -0.045434 -351.49\n0.7877 2.8197 -842.9\n0.0015033 -3.676e-05 1.0055\n\nB: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nC: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n\nD: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_141_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_141_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.38266 -0.33125 122.6\n-0.21363 0.61581 225.35\n-0.00034121 -7.7515e-06 0.99865\n\nB: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nC: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n\nD: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.38266 -0.33125 122.6\n-0.21363 0.61581 225.35\n-0.00034121 -7.7515e-06 0.99865\n\nB: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nC: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n\nD: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_142_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_142_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nB: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n\nC: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nD: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nB: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n\nC: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nD: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_143_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_143_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nC: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n\nD: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nC: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n\nD: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_144_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_144_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n\nB: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n\nC: 0.42945 0.0071566 96.266\n-0.019537 0.48377 43.049\n-7.8698e-05 1.6013e-05 1.0001\n\nD: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n\nB: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n\nC: 0.42945 0.0071566 96.266\n-0.019537 0.48377 43.049\n-7.8698e-05 1.6013e-05 1.0001\n\nD: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_145_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_145_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.31269 -0.011782 51.842\n-0.22276 0.71181 65.24\n-0.00081452 -4.173e-05 0.99309\n\nB: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nC: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nD: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.31269 -0.011782 51.842\n-0.22276 0.71181 65.24\n-0.00081452 -4.173e-05 0.99309\n\nB: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nC: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nD: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_146_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_146_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nB: 0.0033111 0.031282 184.63\n-0.15843 0.75999 4.5609\n-0.00083562 0.00011238 0.99927\n\nC: 0.42186 0.031568 60.169\n-0.084563 0.88575 93.738\n-0.00032749 1.4457e-05 1.0012\n\nD: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nB: 0.0033111 0.031282 184.63\n-0.15843 0.75999 4.5609\n-0.00083562 0.00011238 0.99927\n\nC: 0.42186 0.031568 60.169\n-0.084563 0.88575 93.738\n-0.00032749 1.4457e-05 1.0012\n\nD: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_147_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_147_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n\nB: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nC: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nD: 0.86273 0.030727 -257.65\n-0.081274 1.0175 -48.986\n-0.00016043 4.4449e-05 1.0008\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n\nB: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nC: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nD: 0.86273 0.030727 -257.65\n-0.081274 1.0175 -48.986\n-0.00016043 4.4449e-05 1.0008\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_148_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_148_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nB: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n\nC: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n\nD: 0.48882 0.0079397 13.575\n-0.24956 0.69593 149.6\n-0.00053246 -7.8574e-06 1.0026\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nB: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n\nC: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n\nD: 0.48882 0.0079397 13.575\n-0.24956 0.69593 149.6\n-0.00053246 -7.8574e-06 1.0026\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_149_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_149_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nB: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nC: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nD: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nB: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nC: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nD: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_150_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_150_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n\nB: 14.984 -1.5209 -1987.5\n0.59203 13.878 -3896.8\n0.0072047 0.0038814 0.92614\n\nC: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n\nB: 14.984 -1.5209 -1987.5\n0.59203 13.878 -3896.8\n0.0072047 0.0038814 0.92614\n\nC: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_151_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_151_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n\nB: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nC: 1.3308 -0.060097 223.54\n0.17906 0.94189 -10.999\n0.00034146 -4.4675e-05 0.99983\n\nD: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n\nB: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nC: 1.3308 -0.060097 223.54\n0.17906 0.94189 -10.999\n0.00034146 -4.4675e-05 0.99983\n\nD: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_152_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_152_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nB: 0.55347 0.01345 110.12\n-0.085938 0.64894 151.2\n-0.00016395 1.1079e-05 0.99926\n\nC: 0.28973 0.014397 100.07\n-0.29955 0.64174 168.27\n-0.00067332 7.239e-06 1.0017\n\nD: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nB: 0.55347 0.01345 110.12\n-0.085938 0.64894 151.2\n-0.00016395 1.1079e-05 0.99926\n\nC: 0.28973 0.014397 100.07\n-0.29955 0.64174 168.27\n-0.00067332 7.239e-06 1.0017\n\nD: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_153_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_153_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n\nB: 0.32788 -0.00026656 168.52\n-0.087696 0.49289 72.043\n-0.00025798 4.6006e-06 0.9984\n\nC: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nD: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n\nB: 0.32788 -0.00026656 168.52\n-0.087696 0.49289 72.043\n-0.00025798 4.6006e-06 0.9984\n\nC: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nD: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_154_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_154_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.6413 0.074225 91.097\n0.77035 1.5061 -362.72\n0.0010583 -7.1897e-05 1.0011\n\nB: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.6413 0.074225 91.097\n0.77035 1.5061 -362.72\n0.0010583 -7.1897e-05 1.0011\n\nB: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_155_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_155_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.0035 -0.00055314 2.5255\n-0.0028717 1.0087 -9.7285\n-3.8783e-06 3.4244e-06 1\n\nB: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n\nC: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nD: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0035 -0.00055314 2.5255\n-0.0028717 1.0087 -9.7285\n-3.8783e-06 3.4244e-06 1\n\nB: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n\nC: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nD: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_156_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_156_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nB: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nC: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nD: 0.88632 -0.012492 -136.92\n-0.047209 1.0157 42.178\n-0.0001423 1.8595e-05 1.0005\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nB: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nC: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nD: 0.88632 -0.012492 -136.92\n-0.047209 1.0157 42.178\n-0.0001423 1.8595e-05 1.0005\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_157_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_157_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nB: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nC: 0.032608 0.010774 198.34\n-0.16134 0.44659 114.31\n-0.00057725 -5.1566e-07 1.0017\n\nD: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nB: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nC: 0.032608 0.010774 198.34\n-0.16134 0.44659 114.31\n-0.00057725 -5.1566e-07 1.0017\n\nD: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_158_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_158_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nB: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nC: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nD: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nB: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nC: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nD: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_159_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_159_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n\nB: 0.48882 0.0079397 13.575\n-0.24956 0.69593 149.6\n-0.00053246 -7.8574e-06 1.0026\n\nC: 0.86273 0.030727 -257.65\n-0.081274 1.0175 -48.986\n-0.00016043 4.4449e-05 1.0008\n\nD: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n\nB: 0.48882 0.0079397 13.575\n-0.24956 0.69593 149.6\n-0.00053246 -7.8574e-06 1.0026\n\nC: 0.86273 0.030727 -257.65\n-0.081274 1.0175 -48.986\n-0.00016043 4.4449e-05 1.0008\n\nD: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_160_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_160_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n\nB: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nC: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n\nB: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nC: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_161_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_161_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nB: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nB: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_162_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_162_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n\nB: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nC: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nD: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n\nB: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nC: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nD: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_163_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_163_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.55347 0.01345 110.12\n-0.085938 0.64894 151.2\n-0.00016395 1.1079e-05 0.99926\n\nB: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n\nC: 2.1479 0.036813 206.94\n0.67819 1.8174 -485.8\n0.0012074 -6.8043e-06 0.99599\n\nD: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.55347 0.01345 110.12\n-0.085938 0.64894 151.2\n-0.00016395 1.1079e-05 0.99926\n\nB: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n\nC: 2.1479 0.036813 206.94\n0.67819 1.8174 -485.8\n0.0012074 -6.8043e-06 0.99599\n\nD: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_164_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_164_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n\nC: 2.6481 0.070248 -423.11\n0.5002 2.6605 -906.39\n0.0012014 0.00025943 0.99533\n\nD: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n\nC: 2.6481 0.070248 -423.11\n0.5002 2.6605 -906.39\n0.0012014 0.00025943 0.99533\n\nD: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_165_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_165_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nB: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nC: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n\nD: -0.19998 0.34647 247.36\n-0.34607 -0.19989 467.21\n2.0354e-07 -5.1701e-08 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nB: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nC: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n\nD: -0.19998 0.34647 247.36\n-0.34607 -0.19989 467.21\n2.0354e-07 -5.1701e-08 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_166_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_166_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nB: 3.1627 -0.045434 -351.49\n0.7877 2.8197 -842.9\n0.0015033 -3.676e-05 1.0055\n\nC: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nD: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nB: 3.1627 -0.045434 -351.49\n0.7877 2.8197 -842.9\n0.0015033 -3.676e-05 1.0055\n\nC: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nD: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_167_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_167_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.60879 -0.35761 289.93\n0.34822 0.61653 -30.949\n-2.0912e-05 1.3527e-06 1.014\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n\nD: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.60879 -0.35761 289.93\n0.34822 0.61653 -30.949\n-2.0912e-05 1.3527e-06 1.014\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n\nD: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_168_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_168_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.6413 0.074225 91.097\n0.77035 1.5061 -362.72\n0.0010583 -7.1897e-05 1.0011\n\nB: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nC: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nD: 0.85799 0.21669 9.4839\n-0.21177 0.85855 130.48\n1.5015e-06 9.2033e-07 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.6413 0.074225 91.097\n0.77035 1.5061 -362.72\n0.0010583 -7.1897e-05 1.0011\n\nB: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nC: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nD: 0.85799 0.21669 9.4839\n-0.21177 0.85855 130.48\n1.5015e-06 9.2033e-07 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_169_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_169_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nB: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nC: 0.55616 0.0088234 83.342\n-0.19782 0.70845 195.76\n-0.00029305 -3.175e-05 0.99884\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nB: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nC: 0.55616 0.0088234 83.342\n-0.19782 0.70845 195.76\n-0.00029305 -3.175e-05 0.99884\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_170_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_170_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nB: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nC: 1.3526 0.026797 436.87\n0.31517 1.3826 -234.04\n0.00076901 0.00022984 1.0039\n\nD: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nB: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nC: 1.3526 0.026797 436.87\n0.31517 1.3826 -234.04\n0.00076901 0.00022984 1.0039\n\nD: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_171_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_171_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nB: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nC: 4.3722 0.14407 -818.24\n-0.25209 3.9595 -549.15\n0.001718 0.0010825 0.97985\n\nD: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nB: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nC: 4.3722 0.14407 -818.24\n-0.25209 3.9595 -549.15\n0.001718 0.0010825 0.97985\n\nD: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_172_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_172_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n\nB: 0.25611 0.0594 88.294\n-0.24702 0.7663 71.53\n-0.00048162 6.7687e-05 1.0008\n\nC: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n\nD: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n\nB: 0.25611 0.0594 88.294\n-0.24702 0.7663 71.53\n-0.00048162 6.7687e-05 1.0008\n\nC: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n\nD: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_173_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_173_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 0.33492 -0.0051126 63.132\n-0.19841 0.81318 98.482\n-0.00041298 -2.8119e-05 0.99833\n\nD: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 0.33492 -0.0051126 63.132\n-0.19841 0.81318 98.482\n-0.00041298 -2.8119e-05 0.99833\n\nD: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_174_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_174_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n\nB: 2.9721 0.034514 6.1536\n0.86739 2.9829 -532.95\n0.0035453 0.00017204 0.95976\n\nC: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nD: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n\nB: 2.9721 0.034514 6.1536\n0.86739 2.9829 -532.95\n0.0035453 0.00017204 0.95976\n\nC: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nD: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_175_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_175_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nB: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nC: 1.0983 -0.030393 111.31\n0.31879 0.9789 58.516\n0.00050073 -5.3943e-05 1.0005\n\nD: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nB: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nC: 1.0983 -0.030393 111.31\n0.31879 0.9789 58.516\n0.00050073 -5.3943e-05 1.0005\n\nD: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_176_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_176_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nB: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nC: 0.88632 -0.012492 -136.92\n-0.047209 1.0157 42.178\n-0.0001423 1.8595e-05 1.0005\n\nD: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nB: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nC: 0.88632 -0.012492 -136.92\n-0.047209 1.0157 42.178\n-0.0001423 1.8595e-05 1.0005\n\nD: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_177_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_177_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nB: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nC: 0.49838 -0.015725 33.278\n-0.18045 0.77392 59.799\n-0.00064863 -4.2793e-05 0.99978\n\nD: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nB: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nC: 0.49838 -0.015725 33.278\n-0.18045 0.77392 59.799\n-0.00064863 -4.2793e-05 0.99978\n\nD: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_178_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_178_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.25611 0.0594 88.294\n-0.24702 0.7663 71.53\n-0.00048162 6.7687e-05 1.0008\n\nB: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nC: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nD: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.25611 0.0594 88.294\n-0.24702 0.7663 71.53\n-0.00048162 6.7687e-05 1.0008\n\nB: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nC: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nD: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_179_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_179_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nB: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nC: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n\nD: 0.38854 -0.073106 92.576\n-0.1986 0.7319 139.21\n-0.00040811 -1.555e-05 0.99988\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nB: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nC: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n\nD: 0.38854 -0.073106 92.576\n-0.1986 0.7319 139.21\n-0.00040811 -1.555e-05 0.99988\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_180_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_180_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n\nB: 1.4219 0.01866 342.44\n0.36005 1.3261 -141.73\n0.00090969 2.3838e-05 1.0002\n\nC: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nD: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n\nB: 1.4219 0.01866 342.44\n0.36005 1.3261 -141.73\n0.00090969 2.3838e-05 1.0002\n\nC: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nD: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_181_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_181_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n\nB: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n\nC: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nD: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n\nB: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n\nC: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nD: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_182_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_182_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nB: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nC: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nD: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nB: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nC: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nD: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_183_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_183_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nB: 0.31269 -0.011782 51.842\n-0.22276 0.71181 65.24\n-0.00081452 -4.173e-05 0.99309\n\nC: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n\nD: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nB: 0.31269 -0.011782 51.842\n-0.22276 0.71181 65.24\n-0.00081452 -4.173e-05 0.99309\n\nC: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n\nD: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_184_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_184_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nB: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nC: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n\nD: 0.084461 -0.022036 252.3\n-0.21 0.51325 245.38\n-0.000447 -2.621e-05 1.0009\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nB: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nC: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n\nD: 0.084461 -0.022036 252.3\n-0.21 0.51325 245.38\n-0.000447 -2.621e-05 1.0009\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_185_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_185_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nD: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nD: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_186_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_186_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 1.4862 -0.061679 54.577\n0.4606 1.2816 -147.5\n0.0007321 -7.3842e-05 0.99895\n\nC: 0.41873 -0.043533 -18.562\n-0.27021 0.88041 53.791\n-0.00050299 -2.2546e-05 0.99941\n\nD: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 1.4862 -0.061679 54.577\n0.4606 1.2816 -147.5\n0.0007321 -7.3842e-05 0.99895\n\nC: 0.41873 -0.043533 -18.562\n-0.27021 0.88041 53.791\n-0.00050299 -2.2546e-05 0.99941\n\nD: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_187_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_187_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n\nB: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nC: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nD: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n\nB: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nC: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nD: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_188_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_188_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.43124 0.047668 -66.525\n-0.34772 0.62068 209.27\n-0.00060194 -2.1104e-07 0.98648\n\nB: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nC: 1.1884 0.015274 95.776\n0.23282 1.0681 -20.551\n0.00097623 0.00015903 1.0014\n\nD: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.43124 0.047668 -66.525\n-0.34772 0.62068 209.27\n-0.00060194 -2.1104e-07 0.98648\n\nB: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nC: 1.1884 0.015274 95.776\n0.23282 1.0681 -20.551\n0.00097623 0.00015903 1.0014\n\nD: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_189_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_189_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n\nB: 1.1943 0.010001 372.77\n0.22686 1.0937 -67.914\n0.00058802 5.2037e-05 0.99941\n\nC: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nD: 1.1884 0.015274 95.776\n0.23282 1.0681 -20.551\n0.00097623 0.00015903 1.0014\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n\nB: 1.1943 0.010001 372.77\n0.22686 1.0937 -67.914\n0.00058802 5.2037e-05 0.99941\n\nC: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nD: 1.1884 0.015274 95.776\n0.23282 1.0681 -20.551\n0.00097623 0.00015903 1.0014\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_190_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_190_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nB: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n\nC: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nD: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nB: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n\nC: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nD: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_191_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_191_1.png" + ], + "output": "A" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nB: 1.2895 0.43518 -118.46\n-0.025956 1.4233 161.89\n-3.0413e-05 0.00069874 1.0013\n\nC: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nB: 1.2895 0.43518 -118.46\n-0.025956 1.4233 161.89\n-3.0413e-05 0.00069874 1.0013\n\nC: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_192_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_192_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_193_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_193_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n\nB: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nC: 14.984 -1.5209 -1987.5\n0.59203 13.878 -3896.8\n0.0072047 0.0038814 0.92614\n\nD: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n\nB: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nC: 14.984 -1.5209 -1987.5\n0.59203 13.878 -3896.8\n0.0072047 0.0038814 0.92614\n\nD: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_194_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_194_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nB: 1.0505 -0.0053825 276.45\n0.20631 0.92888 48.832\n0.00048841 -1.9251e-05 0.99878\n\nC: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nD: 1.3526 0.026797 436.87\n0.31517 1.3826 -234.04\n0.00076901 0.00022984 1.0039\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nB: 1.0505 -0.0053825 276.45\n0.20631 0.92888 48.832\n0.00048841 -1.9251e-05 0.99878\n\nC: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nD: 1.3526 0.026797 436.87\n0.31517 1.3826 -234.04\n0.00076901 0.00022984 1.0039\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_195_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_195_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nB: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n\nC: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n\nD: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nB: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n\nC: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n\nD: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_196_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_196_1.png" + ], + "output": "B" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nB: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nC: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nD: 0.14586 0.056449 119.48\n-0.21737 0.71439 95.786\n-0.00051182 3.3282e-05 1.0008\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nB: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nC: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nD: 0.14586 0.056449 119.48\n-0.21737 0.71439 95.786\n-0.00051182 3.3282e-05 1.0008\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_197_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_197_1.png" + ], + "output": "D" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nC: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nD: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nC: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nD: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_198_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_198_1.png" + ], + "output": "C" + }, + { + "task": "Homography_estimation", + "visual_input_component": "natural image", + "source": "Hpatches", + "options": "A: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nC: 0.46461 0.085196 589.33\n0.19659 0.76327 25.833\n0.00026763 8.9486e-05 1.0006\n\nD: 0.040904 -0.0023332 234.76\n-0.10713 0.35038 218.5\n-0.00028907 6.311e-06 1.0035\n", + "question": "Please compute the 3x3 homography matrix between these two images.", + "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nC: 0.46461 0.085196 589.33\n0.19659 0.76327 25.833\n0.00026763 8.9486e-05 1.0006\n\nD: 0.040904 -0.0023332 234.76\n-0.10713 0.35038 218.5\n-0.00028907 6.311e-06 1.0035\n", + "input_image_path": [ + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_199_0.png", + "../MMIU-Benchmark/Homography_estimation/Homography_estimation_199_1.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box that has four items and the three are touching the side.\nB: There is a box that has five items and all are in the center.\nC: There is a box that has three items and the four are touching the side.\nD: There is a bag that has four items and the three are touching the side.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box that has four items and the three are touching the side.\nB: There is a box that has five items and all are in the center.\nC: There is a box that has three items and the four are touching the side.\nD: There is a bag that has four items and the three are touching the side.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_0_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_0_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_0_2.png" + ], + "output": "A" + }, + { + "task": 
"Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is a red square touching the base\nB: there is a white circle touching the base\nC: there is a black square touching the base\nD: there is a black triangle touching the base", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is a red square touching the base\nB: there is a white circle touching the base\nC: there is a black square touching the base\nD: there is a black triangle touching the base", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_1_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_1_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_1_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with 1 black and 1 blue item.\nB: There is a box with 1 black and 1 green item.\nC: There is a box with 2 black items.\nD: There is a box with 1 red and 1 blue item.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with 1 black and 1 blue item.\nB: There is a box with 1 black and 1 green item.\nC: There is a box with 2 black items.\nD: There is a box with 1 red and 1 blue item.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_2_0.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_2_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_2_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a black block above a yellow block.\nB: There is a yellow block above a black block.\nC: There is a yellow block below a black block.\nD: There is a yellow block next to a black block.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a black block above a yellow block.\nB: There is a yellow block above a black block.\nC: There is a yellow block below a black block.\nD: There is a yellow block next to a black block.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_3_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_3_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_3_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a blue block as the top of a tower.\nB: There is a red ball at the top of a tower.\nC: There is a yellow block at the base of a tower.\nD: There is a yellow block as the top of a tower.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blue block 
as the top of a tower.\nB: There is a red ball at the top of a tower.\nC: There is a yellow block at the base of a tower.\nD: There is a yellow block as the top of a tower.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_4_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_4_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_4_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 2 towers that contain white blocks\nB: There are 2 towers that contain black blocks\nC: There are 3 towers that contain black blocks\nD: There is 1 tower that contains black blocks", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 2 towers that contain white blocks\nB: There are 2 towers that contain black blocks\nC: There are 3 towers that contain black blocks\nD: There is 1 tower that contains black blocks", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_5_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_5_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_5_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: All three towers have a blue base.\nB: None of the towers have a blue base.\nC: Only one tower has a blue base.\nD: Two of the three towers has a blue base.", + 
"question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: All three towers have a blue base.\nB: None of the towers have a blue base.\nC: Only one tower has a blue base.\nD: Two of the three towers has a blue base.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_6_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_6_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_6_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a blue sphere as the base of a tower with more than two blocks\nB: There is a red block as the base of a tower with more than two blocks.\nC: There is a blue block as the base of a tower with more than two blocks.\nD: There is a blue block as the base of a single block tower.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blue sphere as the base of a tower with more than two blocks\nB: There is a red block as the base of a tower with more than two blocks.\nC: There is a blue block as the base of a tower with more than two blocks.\nD: There is a blue block as the base of a single block tower.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_7_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_7_1.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_7_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are two colors touching the wall.\nB: The wall has multiple colors.\nC: No colors are touching the wall.\nD: There is only one color touching the wall.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are two colors touching the wall.\nB: The wall has multiple colors.\nC: No colors are touching the wall.\nD: There is only one color touching the wall.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_8_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_8_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_8_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is at least 1 triangle closely touching a box corner\nB: There is at least 1 circle closely touching a box edge\nC: There is at least 1 square closely touching a circle\nD: There is at least 1 square closely touching a box corner", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is at least 1 triangle closely touching a box corner\nB: There is at least 1 circle closely touching a box edge\nC: There is at least 1 square closely 
touching a circle\nD: There is at least 1 square closely touching a box corner", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_9_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_9_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_9_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is 1 box with 2 black circles\nB: There is 1 box with 3 black circles\nC: There are 3 boxes with 2 black circles\nD: There are 2 boxes with 1 black circle", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 box with 2 black circles\nB: There is 1 box with 3 black circles\nC: There are 3 boxes with 2 black circles\nD: There are 2 boxes with 1 black circle", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_10_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_10_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_10_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is one tower with a black block at the top\nB: there is one tower with a red block at the top\nC: there are two towers with a black block at the top\nD: there is one tower with no block at the top", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + 
"context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is one tower with a black block at the top\nB: there is one tower with a red block at the top\nC: there are two towers with a black block at the top\nD: there is one tower with no block at the top", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_11_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_11_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_11_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: A yellow block is under a green block.\nB: There is a yellow block on a blue block.\nC: There is a red block next to a blue block.\nD: The green block is above the red block.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: A yellow block is under a green block.\nB: There is a yellow block on a blue block.\nC: There is a red block next to a blue block.\nD: The green block is above the red block.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_12_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_12_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_12_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: All towers have 
different base colors.\nB: There are only two towers which has the same base color.\nC: Only one tower has a unique base color.\nD: There are three towers with the same base color.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: All towers have different base colors.\nB: There are only two towers which has the same base color.\nC: Only one tower has a unique base color.\nD: There are three towers with the same base color.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_13_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_13_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_13_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are three yellow blocks in the middle of a tower.\nB: There are two yellow blocks as the base of a tower.\nC: There are two red blocks as the base of a tower.\nD: There is one yellow block at the top of a tower.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are three yellow blocks in the middle of a tower.\nB: There are two yellow blocks as the base of a tower.\nC: There are two red blocks as the base of a tower.\nD: There is one yellow block at the top of a tower.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_14_0.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_14_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_14_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with items of orange and pink color.\nB: There is a box with items of only black and blue color.\nC: There is a box with items of red and white color.\nD: There is a box with items of green and yellow color.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with items of orange and pink color.\nB: There is a box with items of only black and blue color.\nC: There is a box with items of red and white color.\nD: There is a box with items of green and yellow color.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_15_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_15_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_15_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a black tower.\nB: There is a black tree.\nC: There is a black bridge.\nD: There is a white tower.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a black tower.\nB: There is a black 
tree.\nC: There is a black bridge.\nD: There is a white tower.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_16_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_16_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_16_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is exactly one yellow triangle touching the edge\nB: There is exactly one red triangle touching the edge\nC: There are no yellow triangles touching the edge\nD: There are two yellow triangles touching the edge", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is exactly one yellow triangle touching the edge\nB: There is exactly one red triangle touching the edge\nC: There are no yellow triangles touching the edge\nD: There are two yellow triangles touching the edge", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_17_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_17_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_17_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are exactly 2 blue blocks\nB: There are no blue blocks\nC: There are at least 3 blue blocks\nD: There are more than 10 blue blocks", + "question": "Please correctly describe this set of images from the perspective of 
the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are exactly 2 blue blocks\nB: There are no blue blocks\nC: There are at least 3 blue blocks\nD: There are more than 10 blue blocks", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_18_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_18_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_18_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are two white items in the middle of the box.\nB: There is one black item and one white item at the edge of the box.\nC: There are two black items closely touching the bottom of a box.\nD: There is a single black item at the top of the box.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are two white items in the middle of the box.\nB: There is one black item and one white item at the edge of the box.\nC: There are two black items closely touching the bottom of a box.\nD: There is a single black item at the top of the box.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_19_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_19_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_19_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + 
"visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is no tower with a blue block at the base\nB: there is a tower with a red block at the base\nC: there are multiple towers with a blue block at the base\nD: there is exactly one tower with a blue block at the base", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is no tower with a blue block at the base\nB: there is a tower with a red block at the base\nC: there are multiple towers with a blue block at the base\nD: there is exactly one tower with a blue block at the base", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_20_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_20_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_20_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box, which a blue triangle and at least two black items.\nB: There is a box, which a blue circle and at least two black items.\nC: There is a box, which a blue triangle and only one black item.\nD: There is a box, which a green triangle and at least two black items.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box, which a blue triangle and at least two black items.\nB: There is a box, which a blue circle and at least two black items.\nC: There is a box, which a blue triangle 
and only one black item.\nD: There is a box, which a green triangle and at least two black items.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_21_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_21_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_21_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: One tower has a red block on top of a blue block\nB: One tower has a yellow block on top of a green block\nC: One tower has a yellow block on top of a blue block\nD: One tower has a blue block on top of a yellow block", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: One tower has a red block on top of a blue block\nB: One tower has a yellow block on top of a green block\nC: One tower has a yellow block on top of a blue block\nD: One tower has a blue block on top of a yellow block", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_22_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_22_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_22_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 3 towers with black blocks\nB: No towers have black blocks\nC: There is 1 tower that contains black blocks\nD: There are 2 towers that contain at least 1 black block", + 
"question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 towers with black blocks\nB: No towers have black blocks\nC: There is 1 tower that contains black blocks\nD: There are 2 towers that contain at least 1 black block", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_23_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_23_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_23_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: A black block is at the top of a tower\nB: There is 1 tower with a black block at the bottom\nC: A tower with a red block at the bottom\nD: There are 2 towers with black blocks", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: A black block is at the top of a tower\nB: There is 1 tower with a black block at the bottom\nC: A tower with a red block at the bottom\nD: There are 2 towers with black blocks", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_24_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_24_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_24_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + 
"visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a pyramid with four blocks.\nB: There is a tower with four blocks.\nC: There is a tower with three blocks.\nD: There is a tower with five blocks.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a pyramid with four blocks.\nB: There is a tower with four blocks.\nC: There is a tower with three blocks.\nD: There is a tower with five blocks.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_25_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_25_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_25_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a yellow block on a blue block.\nB: There is a yellow block on a green block.\nC: There is a red block on a blue block.\nD: There is a blue block on a yellow block.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a yellow block on a blue block.\nB: There is a yellow block on a green block.\nC: There is a red block on a blue block.\nD: There is a blue block on a yellow block.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_26_0.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_26_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_26_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 3 boxes with a black item on top.\nB: There are 2 boxes with a white item on top.\nC: There is 1 box with a black item on top.\nD: There are 2 boxes with a black item on top.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 boxes with a black item on top.\nB: There are 2 boxes with a white item on top.\nC: There is 1 box with a black item on top.\nD: There are 2 boxes with a black item on top.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_27_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_27_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_27_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is exactly one red triangle touching the edge\nB: there are two blue triangles touching the edge\nC: there is exactly one blue square touching the edge\nD: there is exactly one blue triangle touching the edge", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is exactly 
one red triangle touching the edge\nB: there are two blue triangles touching the edge\nC: there is exactly one blue square touching the edge\nD: there is exactly one blue triangle touching the edge", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_28_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_28_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_28_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: One of the grey boxes has exactly seven objects\nB: One of the grey boxes has exactly eight objects\nC: One of the grey boxes has exactly four objects\nD: One of the grey boxes has exactly six objects", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: One of the grey boxes has exactly seven objects\nB: One of the grey boxes has exactly eight objects\nC: One of the grey boxes has exactly four objects\nD: One of the grey boxes has exactly six objects", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_29_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_29_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_29_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is exactly one tower with two blocks\nB: there are no towers with three blocks\nC: there are at least two towers 
with four blocks\nD: there is at least one tower with three blocks", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is exactly one tower with two blocks\nB: there are no towers with three blocks\nC: there are at least two towers with four blocks\nD: there is at least one tower with three blocks", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_30_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_30_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_30_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a blue triangle touching the side.\nB: There is a red hexagon in the center.\nC: There is a yellow square touching the side.\nD: There is a green circle in the corner.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blue triangle touching the side.\nB: There is a red hexagon in the center.\nC: There is a yellow square touching the side.\nD: There is a green circle in the corner.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_31_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_31_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_31_2.png" + ], + "output": "C" + 
}, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a tower with exactly four blocks with a yellow block at the bottom\nB: There is a tower with exactly three blocks with a yellow block at the top\nC: There is a tower with three red blocks at the top\nD: There is a tower with exactly two blocks, both yellow", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with exactly four blocks with a yellow block at the bottom\nB: There is a tower with exactly three blocks with a yellow block at the top\nC: There is a tower with three red blocks at the top\nD: There is a tower with exactly two blocks, both yellow", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_32_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_32_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_32_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: No boxes contain yellow items\nB: All boxes contain blue items\nC: There is at least 1 yellow item in each box\nD: Each box contains only red items", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: No boxes contain yellow items\nB: All boxes contain blue items\nC: There is at least 1 yellow item in each box\nD: Each box contains only red items", + 
"input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_33_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_33_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_33_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: None of the black triangles are touching the center\nB: All of the black triangles are touching an edge\nC: None of the black triangles are touching a edge\nD: Some black triangles are touching an edge", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: None of the black triangles are touching the center\nB: All of the black triangles are touching an edge\nC: None of the black triangles are touching a edge\nD: Some black triangles are touching an edge", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_34_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_34_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_34_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is 1 stack with only purple and orange blocks\nB: There is 1 pile with only green and white blocks\nC: There is 1 tower with only blue and black blocks\nD: There is 1 tower with only red and yellow blocks", + "question": "Please correctly describe this set of images from the perspective of the spatial 
context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 stack with only purple and orange blocks\nB: There is 1 pile with only green and white blocks\nC: There is 1 tower with only blue and black blocks\nD: There is 1 tower with only red and yellow blocks", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_35_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_35_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_35_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 3 boxes with a triangle in the middle\nB: There are 2 boxes with a triangle far from the corner\nC: There are 2 circles with a square closely touching a corner\nD: There are 2 boxes with a triangle closely touching a corner", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 boxes with a triangle in the middle\nB: There are 2 boxes with a triangle far from the corner\nC: There are 2 circles with a square closely touching a corner\nD: There are 2 boxes with a triangle closely touching a corner", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_36_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_36_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_36_2.png" + ], + "output": "D" + }, + { + "task": 
"Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is exactly one circle touching the edge\nB: there are no circles touching the edge\nC: there are at least two circles touching the edge\nD: there are three triangles touching the edge", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is exactly one circle touching the edge\nB: there are no circles touching the edge\nC: there are at least two circles touching the edge\nD: there are three triangles touching the edge", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_37_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_37_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_37_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with only two items of black and yellow color.\nB: There is a box with two items of red and blue color.\nC: There is a box with three items of black and yellow color.\nD: There is a drawer with two items of green and yellow color.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with only two items of black and yellow color.\nB: There is a box with two items of red and blue color.\nC: There is a box with three items of black and yellow color.\nD: There is a drawer with two items of green 
and yellow color.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_38_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_38_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_38_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a tower with three blocks.\nB: There is a tower with six blocks.\nC: There is a tower with four blocks.\nD: There is a tower with five blocks.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with three blocks.\nB: There is a tower with six blocks.\nC: There is a tower with four blocks.\nD: There is a tower with five blocks.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_39_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_39_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_39_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a three blocks tower which has only one blue block.\nB: There is a three blocks tower which has only red blocks.\nC: There is a two blocks tower which has only one blue block.\nD: There is a four blocks tower which has two blue blocks.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe 
this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a three blocks tower which has only one blue block.\nB: There is a three blocks tower which has only red blocks.\nC: There is a two blocks tower which has only one blue block.\nD: There is a four blocks tower which has two blue blocks.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_40_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_40_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_40_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a blue block on a black block.\nB: There is no block in the picture.\nC: There is a blue block next to a black block.\nD: A black block is on top of a blue block.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blue block on a black block.\nB: There is no block in the picture.\nC: There is a blue block next to a black block.\nD: A black block is on top of a blue block.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_41_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_41_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_41_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 2 towers with 2 
yellow blocks\nB: There is 1 tower with 3 yellow blocks\nC: There is 1 tower with 2 yellow blocks\nD: There is 1 tower with 2 blue blocks", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 2 towers with 2 yellow blocks\nB: There is 1 tower with 3 yellow blocks\nC: There is 1 tower with 2 yellow blocks\nD: There is 1 tower with 2 blue blocks", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_42_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_42_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_42_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: A box holds a blue triangle, a blue square, and a yellow circle.\nB: A box contains a blue circle, a yellow triangle, and a yellow square.\nC: There is a box with a blue triangle, a yellow square and a yellow circle.\nD: There is a box with a blue triangle, a yellow square", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: A box holds a blue triangle, a blue square, and a yellow circle.\nB: A box contains a blue circle, a yellow triangle, and a yellow square.\nC: There is a box with a blue triangle, a yellow square and a yellow circle.\nD: There is a box with a blue triangle, a yellow square", + "input_image_path": [ + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_43_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_43_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_43_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with no items inside.\nB: There is a box with items of three different shapes.\nC: There is a box with items of only one color.\nD: There is a box with items of various colors.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with no items inside.\nB: There is a box with items of three different shapes.\nC: There is a box with items of only one color.\nD: There is a box with items of various colors.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_44_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_44_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_44_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 2 boxes with only red and yellow items.\nB: There are 3 boxes with only black and yellow items.\nC: There are 2 boxes with only black and blue items.\nD: There are 2 boxes with only black and yellow items.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly 
describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 2 boxes with only red and yellow items.\nB: There are 3 boxes with only black and yellow items.\nC: There are 2 boxes with only black and blue items.\nD: There are 2 boxes with only black and yellow items.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_45_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_45_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_45_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a red block above a yellow block.\nB: There is a black block above a yellow block.\nC: There is a yellow block below a black block.\nD: There is a yellow block above a black block.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a red block above a yellow block.\nB: There is a black block above a yellow block.\nC: There is a yellow block below a black block.\nD: There is a yellow block above a black block.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_46_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_46_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_46_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is 
a tower with a yellow block over a blue block\nB: There is a tower with a red block over a blue block\nC: There is a tower with a yellow block over a green block\nD: There is a tower with a yellow block next to a blue block", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with a yellow block over a blue block\nB: There is a tower with a red block over a blue block\nC: There is a tower with a yellow block over a green block\nD: There is a tower with a yellow block next to a blue block", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_47_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_47_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_47_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with only two black and blue items.\nB: There is a box with different colored items.\nC: There is a box with several black and blue items.\nD: There is a box with only black items.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with only two black and blue items.\nB: There is a box with different colored items.\nC: There is a box with several black and blue items.\nD: There is a box with only black items.", + "input_image_path": [ + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_48_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_48_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_48_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with 4 items and 2 yellow squares\nB: There is a box with 3 items and 2 yellow squares in the middle.\nC: There is a box with 4 items and 2 yellow squares in the middle.\nD: There is a box with 4 items and 2 red circles in the middle.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with 4 items and 2 yellow squares\nB: There is a box with 3 items and 2 yellow squares in the middle.\nC: There is a box with 4 items and 2 yellow squares in the middle.\nD: There is a box with 4 items and 2 red circles in the middle.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_49_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_49_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_49_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are two black towers with multiple blocks.\nB: There is a black tower with several blocks.\nC: There is a white tower with only one block.\nD: There is a black tower with only one block.", + "question": "Please correctly describe this set 
of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are two black towers with multiple blocks.\nB: There is a black tower with several blocks.\nC: There is a white tower with only one block.\nD: There is a black tower with only one block.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_50_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_50_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_50_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 3 black circles\nB: There are 2 white triangles\nC: There are 2 black triangles\nD: There are 5 black triangles", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 black circles\nB: There are 2 white triangles\nC: There are 2 black triangles\nD: There are 5 black triangles", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_51_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_51_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_51_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a tower with four blocks.\nB: There is a row of candles.\nC: There is a 
stack of plates.\nD: There is a pile of books.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with four blocks.\nB: There is a row of candles.\nC: There is a stack of plates.\nD: There is a pile of books.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_52_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_52_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_52_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are no blue blocks\nB: There are at least 3 blue blocks\nC: There are exactly two blue blocks\nD: There is only one blue block", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are no blue blocks\nB: There are at least 3 blue blocks\nC: There are exactly two blue blocks\nD: There is only one blue block", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_53_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_53_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_53_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 4 yellow items 
and one large circle touching the wall.\nB: There are 3 yellow items but none are touching the wall.\nC: There are 3 yellow items touching the wall and at least one small circle nearly touching the wall.\nD: There are 2 yellow items touching the wall and no small circles.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 4 yellow items and one large circle touching the wall.\nB: There are 3 yellow items but none are touching the wall.\nC: There are 3 yellow items touching the wall and at least one small circle nearly touching the wall.\nD: There are 2 yellow items touching the wall and no small circles.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_54_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_54_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_54_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with three colors and no items on top.\nB: There is a box with two colors and a white item on top.\nC: There is a round container with all 3 colors and a black item beside it.\nD: There is a box with all 3 colors and a black item on top.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with three colors and no items on top.\nB: There is a box with two colors and a white item on top.\nC: There is a round container with all 3 
colors and a black item beside it.\nD: There is a box with all 3 colors and a black item on top.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_55_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_55_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_55_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is 1 tower with a yellow block at the top\nB: There is 1 tower with a yellow block at the base\nC: There is 1 tower with a red block at the base\nD: There are 2 towers with a yellow block at the base", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 tower with a yellow block at the top\nB: There is 1 tower with a yellow block at the base\nC: There is 1 tower with a red block at the base\nD: There are 2 towers with a yellow block at the base", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_56_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_56_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_56_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there are two black triangles touching the base\nB: there is one black triangle touching the base\nC: there is one black triangle not touching the base\nD: there are no black triangles touching the base", + 
"question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are two black triangles touching the base\nB: there is one black triangle touching the base\nC: there is one black triangle not touching the base\nD: there are no black triangles touching the base", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_57_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_57_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_57_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a tower with exactly three blocks with a yellow block at the top\nB: There is a tower with three blocks with a blue block at the top\nC: There is a tower with four blocks and a red block at the top\nD: There is a tower with two blocks and a green block at the top", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with exactly three blocks with a yellow block at the top\nB: There is a tower with three blocks with a blue block at the top\nC: There is a tower with four blocks and a red block at the top\nD: There is a tower with two blocks and a green block at the top", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_58_0.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_58_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_58_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 2 blue blocks\nB: There is 1 blue block\nC: There are 2 red blocks\nD: There are 3 green blocks", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 2 blue blocks\nB: There is 1 blue block\nC: There are 2 red blocks\nD: There are 3 green blocks", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_59_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_59_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_59_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with 3 items and a black item on top.\nB: There is a box with 5 items and a red item on top.\nC: There is a box with 2 items and a blue item on top.\nD: There is a box with 3 items and a white item on top.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with 3 items and a black item on top.\nB: There is a box with 5 items and a red item on top.\nC: There is a box with 2 items and a blue item on top.\nD: 
There is a box with 3 items and a white item on top.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_60_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_60_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_60_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: At least two of the towers have yellow bases.\nB: None of the towers have yellow bases.\nC: At most two of the towers have yellow bases.\nD: All of the towers have yellow bases.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: At least two of the towers have yellow bases.\nB: None of the towers have yellow bases.\nC: At most two of the towers have yellow bases.\nD: All of the towers have yellow bases.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_61_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_61_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_61_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: the tower with two blocks has a black block at the top\nB: the tower with four blocks has a black block at the bottom\nC: the tower with four blocks has a red block at the top\nD: the tower with four blocks has a black block at the top", + "question": "Please correctly describe this set of images from the
perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: the tower with two blocks has a black block at the top\nB: the tower with four blocks has a black block at the bottom\nC: the tower with four blocks has a red block at the top\nD: the tower with four blocks has a black block at the top", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_62_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_62_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_62_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with items of 2 different colors and a black square.\nB: There is a box with items of 4 different colors and no square.\nC: There is a box with items of 2 different colors and a red square.\nD: There is a box with items of 3 different colors and a black square.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with items of 2 different colors and a black square.\nB: There is a box with items of 4 different colors and no square.\nC: There is a box with items of 2 different colors and a red square.\nD: There is a box with items of 3 different colors and a black square.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_63_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_63_1.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_63_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a yellow square touching the wall.\nB: There is a blue rectangle on the floor.\nC: There is a green circle floating in the air.\nD: There is a red triangle near the door.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a yellow square touching the wall.\nB: There is a blue rectangle on the floor.\nC: There is a green circle floating in the air.\nD: There is a red triangle near the door.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_64_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_64_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_64_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with 3 items of the same color.\nB: There is a box with 4 items of all different colors.\nC: There is a box with 2 items of different colors.\nD: There is a box with 3 items of all 3 different colors.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with 3 items of the same color.\nB: There is a box with 4 items of all different colors.\nC: There is a box with 2 
items of different colors.\nD: There is a box with 3 items of all 3 different colors.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_65_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_65_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_65_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: the tower has two blue blocks with a yellow block at the top\nB: there are three blocks in the tower with a red block at the top\nC: there is a tower with exactly two blocks having a blue block at the top.\nD: the tower has a single blue block at the top and bottom", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: the tower has two blue blocks with a yellow block at the top\nB: there are three blocks in the tower with a red block at the top\nC: there is a tower with exactly two blocks having a blue block at the top.\nD: the tower has a single blue block at the top and bottom", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_66_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_66_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_66_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a tower with a yellow block over a red block\nB: There is a tower with a green block over a 
yellow block\nC: There is a tower with a yellow block over a blue block\nD: There is a tower with a blue block over a yellow block", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with a yellow block over a red block\nB: There is a tower with a green block over a yellow block\nC: There is a tower with a yellow block over a blue block\nD: There is a tower with a blue block over a yellow block", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_67_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_67_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_67_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: green block on the side\nB: blue block at the bottom\nC: yellow block at the top\nD: red block in the middle", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: green block on the side\nB: blue block at the bottom\nC: yellow block at the top\nD: red block in the middle", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_68_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_68_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_68_2.png" + ], + "output": "C" + }, + { + "task": 
"Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a square closely touching the side of a box.\nB: There is a square closely touching the bottom of a box.\nC: There is no square closely touching the top of a box.\nD: There is no square closely touching the bottom of a box.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a square closely touching the side of a box.\nB: There is a square closely touching the bottom of a box.\nC: There is no square closely touching the top of a box.\nD: There is no square closely touching the bottom of a box.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_69_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_69_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_69_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a tower with a yellow block, a blue block and a black block.\nB: There is a tower with a yellow block, a green block and a black block.\nC: There is a tower with a yellow block, a blue block and\nD: There is a tower with a red block, a blue block and a black block.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with a yellow block, a blue block and a black block.\nB: There is a tower with a yellow block, a 
green block and a black block.\nC: There is a tower with a yellow block, a blue block and\nD: There is a tower with a red block, a blue block and a black block.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_70_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_70_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_70_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a black tower with only one block.\nB: There is a black tower with multiple blocks.\nC: There is a black tower with no blocks.\nD: There is a white tower with only one block.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a black tower with only one block.\nB: There is a black tower with multiple blocks.\nC: There is a black tower with no blocks.\nD: There is a white tower with only one block.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_71_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_71_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_71_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 3 boxes each with black and yellow items.\nB: There is a box with only 3 items of black and yellow color.\nC: There is a black and yellow box with 3 items.\nD: There is a box with 
various items of different colors.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 boxes each with black and yellow items.\nB: There is a box with only 3 items of black and yellow color.\nC: There is a black and yellow box with 3 items.\nD: There is a box with various items of different colors.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_72_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_72_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_72_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is a black square touching the base\nB: there is a black circle touching the base\nC: there is a white square touching the base\nD: the square is floating above the base", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is a black square touching the base\nB: there is a black circle touching the base\nC: there is a white square touching the base\nD: the square is floating above the base", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_73_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_73_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_73_2.png" + ], + "output": 
"A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are exactly two black squares touching an edge\nB: There are exactly three black squares not touching any edge\nC: There is exactly one black square not touching any edge\nD: There are exactly two black squares not touching any edge", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are exactly two black squares touching an edge\nB: There are exactly three black squares not touching any edge\nC: There is exactly one black square not touching any edge\nD: There are exactly two black squares not touching any edge", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_74_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_74_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_74_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is at least one tower with exactly two blocks having a blue block at the top\nB: there is no tower with exactly two blocks having a blue block at the top\nC: there is at least one tower with exactly two blocks having a red\nD: there is at least one tower with exactly three blocks having a blue block at the top", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is at least one tower with 
exactly two blocks having a blue block at the top\nB: there is no tower with exactly two blocks having a blue block at the top\nC: there is at least one tower with exactly two blocks having a red\nD: there is at least one tower with exactly three blocks having a blue block at the top", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_75_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_75_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_75_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 3 boxes with blue, yellow, and red items\nB: There is 1 box with only blue and yellow items\nC: There is 1 box with only red and green items\nD: There are 2 boxes with only blue and yellow items", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 boxes with blue, yellow, and red items\nB: There is 1 box with only blue and yellow items\nC: There is 1 box with only red and green items\nD: There are 2 boxes with only blue and yellow items", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_76_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_76_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_76_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is 1 tower with a 
blue block at the base\nB: There is 1 tower with a blue block at the top\nC: There is 1 tower with a red block at the base\nD: There are 2 towers with a blue block at the base", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 tower with a blue block at the base\nB: There is 1 tower with a blue block at the top\nC: There is 1 tower with a red block at the base\nD: There are 2 towers with a blue block at the base", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_77_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_77_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_77_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 3 towers with a blue block at the base\nB: There are 2 towers with a red block at the base\nC: There is 1 tower with a green block at the top\nD: There is 1 tower with a blue block at the base", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 towers with a blue block at the base\nB: There are 2 towers with a red block at the base\nC: There is 1 tower with a green block at the top\nD: There is 1 tower with a blue block at the base", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_78_0.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_78_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_78_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a blue block on a single-block tower.\nB: There is a blue block as the top of a tower with at least two blocks.\nC: There is a blue block at the base of a tower with at least two blocks.\nD: There is a green block as the top of a tower with at least two blocks.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blue block on a single-block tower.\nB: There is a blue block as the top of a tower with at least two blocks.\nC: There is a blue block at the base of a tower with at least two blocks.\nD: There is a green block as the top of a tower with at least two blocks.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_79_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_79_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_79_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with a yellow triangle and three blue items.\nB: There is a box with a yellow square and three green items.\nC: There is a box with a yellow circle and two red items.\nD: There is a box with a yellow circle and three blue items.", + "question": "Please correctly describe this set of images from 
the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with a yellow triangle and three blue items.\nB: There is a box with a yellow square and three green items.\nC: There is a box with a yellow circle and two red items.\nD: There is a box with a yellow circle and three blue items.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_80_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_80_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_80_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: All 3 colors are not touching the wall.\nB: None of the colors are touching the wall.\nC: All 3 different colors are touching the wall.\nD: Only 2 colors are touching the wall.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: All 3 colors are not touching the wall.\nB: None of the colors are touching the wall.\nC: All 3 different colors are touching the wall.\nD: Only 2 colors are touching the wall.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_81_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_81_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_81_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + 
"visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is one yellow block at the top of a tower.\nB: There is one red block as the base of a tower.\nC: There are two yellow blocks as the base of a tower.\nD: There are two blue blocks as the base of a tower.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is one yellow block at the top of a tower.\nB: There is one red block as the base of a tower.\nC: There are two yellow blocks as the base of a tower.\nD: There are two blue blocks as the base of a tower.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_82_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_82_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_82_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is at least one black block on a blue block.\nB: There is at least one black block on a green block.\nC: There is at least one blue block on a black block.\nD: There are only black blocks.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is at least one black block on a blue block.\nB: There is at least one black block on a green block.\nC: There is at least one blue block on a black block.\nD: There are only black blocks.", + "input_image_path": [ + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_83_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_83_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_83_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is a red object touching the edge\nB: there is a green object touching the edge\nC: there is a blue object touching the edge\nD: there is a blue object in the center", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is a red object touching the edge\nB: there is a green object touching the edge\nC: there is a blue object touching the edge\nD: there is a blue object in the center", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_84_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_84_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_84_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 2 towers with only blue and black blocks\nB: There is 1 tower with only yellow and blue blocks\nC: There is 1 tower with only red and green blocks\nD: There is 1 tower with only blue and black blocks", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of 
the spatial context.\nSelect from the following choices.\nA: There are 2 towers with only blue and black blocks\nB: There is 1 tower with only yellow and blue blocks\nC: There is 1 tower with only red and green blocks\nD: There is 1 tower with only blue and black blocks", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_85_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_85_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_85_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is one yellow item touching the floor.\nB: There are three yellow items touching the wall.\nC: There are two yellow items touching the wall.\nD: There are two blue items touching the wall.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is one yellow item touching the floor.\nB: There are three yellow items touching the wall.\nC: There are two yellow items touching the wall.\nD: There are two blue items touching the wall.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_86_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_86_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_86_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: one of the grey square contains exactly four objects\nB: 
one of the grey square contains exactly five objects\nC: one of the grey square contains exactly three objects\nD: one of the grey squares contains exactly six objects", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: one of the grey square contains exactly four objects\nB: one of the grey square contains exactly five objects\nC: one of the grey square contains exactly three objects\nD: one of the grey squares contains exactly six objects", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_87_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_87_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_87_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there are two blue circles touching the base\nB: there are two yellow circles touching the base\nC: there are three yellow circles touching the base\nD: there is one yellow circle in the middle", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are two blue circles touching the base\nB: there are two yellow circles touching the base\nC: there are three yellow circles touching the base\nD: there is one yellow circle in the middle", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_88_0.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_88_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_88_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 2 black triangles\nB: There are no black triangles\nC: There are 3 black triangles\nD: There are 2 white triangles", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 2 black triangles\nB: There are no black triangles\nC: There are 3 black triangles\nD: There are 2 white triangles", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_89_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_89_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_89_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a red block at the top of the tower with only one block.\nB: There is a black block as the base of a tower with at least two blocks.\nC: There is a black block at the base of a tower with only one block.\nD: There is a black block floating in the air beside the tower.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a red block at the top of the tower with only one block.\nB: 
There is a black block as the base of a tower with at least two blocks.\nC: There is a black block at the base of a tower with only one block.\nD: There is a black block floating in the air beside the tower.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_90_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_90_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_90_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a green circle in the center of a box.\nB: There is a blue square closely touching the bottom of a box.\nC: There is a yellow star floating above a box.\nD: There is a red triangle in the top right corner of a box.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a green circle in the center of a box.\nB: There is a blue square closely touching the bottom of a box.\nC: There is a yellow star floating above a box.\nD: There is a red triangle in the top right corner of a box.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_91_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_91_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_91_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is only one yellow block as the base of a tower.\nB: 
There is one yellow block at the top of a tower.\nC: There are three yellow blocks at the base of the tower.\nD: There are two yellow blocks as the base of a tower.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is only one yellow block as the base of a tower.\nB: There is one yellow block at the top of a tower.\nC: There are three yellow blocks at the base of the tower.\nD: There are two yellow blocks as the base of a tower.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_92_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_92_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_92_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is one tower having a black block over a blue block\nB: there is one tower having a blue block over a black block\nC: there are two towers having black blocks over blue blocks\nD: there is one tower having a green block over a black block", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is one tower having a black block over a blue block\nB: there is one tower having a blue block over a black block\nC: there are two towers having black blocks over blue blocks\nD: there is one tower having a green block over a black block", + "input_image_path": [ + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_93_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_93_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_93_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are three towers that have two blue blocks.\nB: There is one tower that has two blue blocks.\nC: There are two towers that have one blue block.\nD: There are two towers that has two blue blocks.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are three towers that have two blue blocks.\nB: There is one tower that has two blue blocks.\nC: There are two towers that have one blue block.\nD: There are two towers that has two blue blocks.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_94_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_94_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_94_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 4 yellow squares\nB: There are 3 yellow circles\nC: There are 3 yellow squares\nD: There are 3 blue squares", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from 
the following choices.\nA: There are 4 yellow squares\nB: There are 3 yellow circles\nC: There are 3 yellow squares\nD: There are 3 blue squares", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_95_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_95_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_95_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with 2 items and a yellow one touching the wall.\nB: There are no items in the box.\nC: A green item is touching the wall.\nD: The box contains 5 items.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with 2 items and a yellow one touching the wall.\nB: There are no items in the box.\nC: A green item is touching the wall.\nD: The box contains 5 items.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_96_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_96_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_96_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is a tree beside the tower\nB: there is a car near the tower\nC: there is a tower with exactly one block\nD: there is a tower with three blocks", + "question": "Please correctly describe this set of images from the perspective of 
the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is a tree beside the tower\nB: there is a car near the tower\nC: there is a tower with exactly one block\nD: there is a tower with three blocks", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_97_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_97_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_97_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there are two towers with black blocks at the base\nB: there is exactly one tower with a white block at the base\nC: there is no tower with a black block at the base\nD: there is exactly one tower with a black block at the base", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are two towers with black blocks at the base\nB: there is exactly one tower with a white block at the base\nC: there is no tower with a black block at the base\nD: there is exactly one tower with a black block at the base", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_98_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_98_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_98_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": 
"synthetic image", + "source": "nlvr", + "options": "A: There are 4 black blocks\nB: There are no black blocks\nC: There are 3 black blocks\nD: There are 2 black blocks", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 4 black blocks\nB: There are no black blocks\nC: There are 3 black blocks\nD: There are 2 black blocks", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_99_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_99_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_99_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is only one tower with at least two blue blocks.\nB: There are no towers with yellow blocks.\nC: There are two towers with at least two yellow blocks.\nD: There is only one tower with at least two yellow blocks.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is only one tower with at least two blue blocks.\nB: There are no towers with yellow blocks.\nC: There are two towers with at least two yellow blocks.\nD: There is only one tower with at least two yellow blocks.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_100_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_100_1.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_100_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there are at least three red triangles not touching any edge\nB: there are at least three yellow triangles touching one edge\nC: there are at least three yellow triangles not touching any edge\nD: there are exactly two yellow triangles not touching any edge", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are at least three red triangles not touching any edge\nB: there are at least three yellow triangles touching one edge\nC: there are at least three yellow triangles not touching any edge\nD: there are exactly two yellow triangles not touching any edge", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_101_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_101_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_101_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with a red circle and at least two black items.\nB: There is a box with a yellow triangle and at least two black items.\nC: There is a box with a yellow square and at least two black items.\nD: There is a box with a yellow square and no black items.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the 
perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with a red circle and at least two black items.\nB: There is a box with a yellow triangle and at least two black items.\nC: There is a box with a yellow square and at least two black items.\nD: There is a box with a yellow square and no black items.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_102_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_102_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_102_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a tower with no blocks.\nB: There is a tower with only one block.\nC: There is a tower with multiple blocks.\nD: There is no tower at all.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with no blocks.\nB: There is a tower with only one block.\nC: There is a tower with multiple blocks.\nD: There is no tower at all.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_103_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_103_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_103_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: yellow block at the top\nB: yellow block at the bottom\nC: blue block at 
the top\nD: red block in the middle", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: yellow block at the top\nB: yellow block at the bottom\nC: blue block at the top\nD: red block in the middle", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_104_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_104_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_104_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there are multiple towers with blocks of different colors\nB: there are no towers with blocks of the same color\nC: there are two towers with more than one block where all the blocks are of same color\nD: there is only one tower with blocks of the same color", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are multiple towers with blocks of different colors\nB: there are no towers with blocks of the same color\nC: there are two towers with more than one block where all the blocks are of same color\nD: there is only one tower with blocks of the same color", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_105_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_105_1.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_105_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: One tower has a yellow block on top of a red block\nB: One tower has a blue block on top of a yellow block\nC: One tower has a red block on top of a green block\nD: One tower has a yellow block on top of a blue block", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: One tower has a yellow block on top of a red block\nB: One tower has a blue block on top of a yellow block\nC: One tower has a red block on top of a green block\nD: One tower has a yellow block on top of a blue block", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_106_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_106_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_106_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are at least 3 blue blocks\nB: There are no blue blocks\nC: There are exactly 5 blue blocks\nD: There are at most 2 blue blocks", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are at least 3 blue blocks\nB: There are no blue blocks\nC: There are exactly 5 blue blocks\nD: There are at most 2 blue blocks", + 
"input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_107_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_107_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_107_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: The tower with four blocks has a black block at the bottom\nB: The tower with four blocks has a black block at the top\nC: The tower with three blocks has a black block at the top\nD: The tower with four blocks has a blue block at the top", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: The tower with four blocks has a black block at the bottom\nB: The tower with four blocks has a black block at the top\nC: The tower with three blocks has a black block at the top\nD: The tower with four blocks has a blue block at the top", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_108_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_108_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_108_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: All towers contain 1 green block\nB: Some towers contain 1 blue block\nC: All towers contain 2 blue blocks\nD: ll towers contain 1 blue block", + "question": "Please correctly describe this set of images from the perspective of the 
spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: All towers contain 1 green block\nB: Some towers contain 1 blue block\nC: All towers contain 2 blue blocks\nD: ll towers contain 1 blue block", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_109_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_109_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_109_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there are two towers with blue blocks in the middle\nB: there are three towers having red blocks at the top\nC: there is one tower with a green block at the base\nD: there are two towers having a yellow block at the base", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are two towers with blue blocks in the middle\nB: there are three towers having red blocks at the top\nC: there is one tower with a green block at the base\nD: there are two towers having a yellow block at the base", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_110_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_110_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_110_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + 
"source": "nlvr", + "options": "A: All yellow blocks are at the bottom of the towers.\nB: There are no towers with a yellow block on top.\nC: There is at least a yellow block as the top of a tower.\nD: There are no yellow blocks in the towers.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: All yellow blocks are at the bottom of the towers.\nB: There are no towers with a yellow block on top.\nC: There is at least a yellow block as the top of a tower.\nD: There are no yellow blocks in the towers.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_111_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_111_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_111_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a black block in the middle of a tower with three blocks.\nB: There is a black block at the bottom of a tower with three blocks.\nC: There is a black block as the top of a tower with three blocks.\nD: There is a red block at the top of a tower with three blocks.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a black block in the middle of a tower with three blocks.\nB: There is a black block at the bottom of a tower with three blocks.\nC: There is a black block as the top of a tower with three blocks.\nD: There is a red block at the top of a 
tower with three blocks.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_112_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_112_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_112_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a tower with exactly four blocks with a black block at the bottom\nB: There is a tower with exactly one block which is black\nC: There is a tower with exactly three blocks with a white block at the top\nD: There is a tower with exactly two blocks with a black block at the top", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with exactly four blocks with a black block at the bottom\nB: There is a tower with exactly one block which is black\nC: There is a tower with exactly three blocks with a white block at the top\nD: There is a tower with exactly two blocks with a black block at the top", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_113_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_113_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_113_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are three towers with the same height and the base is red.\nB: There is one tower with different height and 
the base is yellow.\nC: There are two towers with the same height and the base is green.\nD: There are two tower with different height and the base is yellow.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are three towers with the same height and the base is red.\nB: There is one tower with different height and the base is yellow.\nC: There are two towers with the same height and the base is green.\nD: There are two tower with different height and the base is yellow.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_114_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_114_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_114_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is one blue block as the base of a tower.\nB: There are two blue blocks as the base of a tower.\nC: There are two red blocks as the base of a tower.\nD: There are three blue blocks as the base of a tower.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is one blue block as the base of a tower.\nB: There are two blue blocks as the base of a tower.\nC: There are two red blocks as the base of a tower.\nD: There are three blue blocks as the base of a tower.", + "input_image_path": [ + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_115_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_115_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_115_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are two yellow blocks in the middle of the tower.\nB: The base of the tower contains a red block.\nC: There is one blue block as the base of the tower.\nD: There is only one yellow block as the base of a tower.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are two yellow blocks in the middle of the tower.\nB: The base of the tower contains a red block.\nC: There is one blue block as the base of the tower.\nD: There is only one yellow block as the base of a tower.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_116_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_116_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_116_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a blue block next to a black block.\nB: There is a blue block below a black block.\nC: There is a blue block above a black block.\nD: There is a black block above a blue block.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + 
"context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blue block next to a black block.\nB: There is a blue block below a black block.\nC: There is a blue block above a black block.\nD: There is a black block above a blue block.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_117_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_117_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_117_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with exactly two black items and at least two blue items.\nB: There is a box with exactly two blue items and at most two black items.\nC: There is a box with exactly two blue items and at least two black items.\nD: There is a box with less than two blue items and exactly two black items", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with exactly two black items and at least two blue items.\nB: There is a box with exactly two blue items and at most two black items.\nC: There is a box with exactly two blue items and at least two black items.\nD: There is a box with less than two blue items and exactly two black items", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_118_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_118_1.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_118_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a yellow item closely touching right wall of a box.\nB: There is a red item closely touching right wall of a box.\nC: There is no yellow item closely touching right wall of a box.\nD: No items are touching the right wall of the box.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a yellow item closely touching right wall of a box.\nB: There is a red item closely touching right wall of a box.\nC: There is no yellow item closely touching right wall of a box.\nD: No items are touching the right wall of the box.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_119_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_119_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_119_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: All towers have only red blocks\nB: Only one tower has a blue block\nC: No towers have blue blocks\nD: ll 3 towers have at least 1 blue block", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: All towers have only red blocks\nB: Only one tower has a blue block\nC: No 
towers have blue blocks\nD: ll 3 towers have at least 1 blue block", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_120_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_120_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_120_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a square touching the corner that is not yellow.\nB: There is a square touching the middle that is not yellow.\nC: There is a square in the center that is not yellow.\nD: There is a square touching the corner that is yellow.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a square touching the corner that is not yellow.\nB: There is a square touching the middle that is not yellow.\nC: There is a square in the center that is not yellow.\nD: There is a square touching the corner that is yellow.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_121_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_121_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_121_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: tleast one black triangle is not touching the edge\nB: No black triangles are present\nC: All black triangles are touching the edge\nD: All triangles are white and touching the 
edge", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: tleast one black triangle is not touching the edge\nB: No black triangles are present\nC: All black triangles are touching the edge\nD: All triangles are white and touching the edge", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_122_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_122_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_122_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a tower with only one block.\nB: There is a tower with two blocks.\nC: There is no tower.\nD: There is a tower with multiple blocks.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with only one block.\nB: There is a tower with two blocks.\nC: There is no tower.\nD: There is a tower with multiple blocks.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_123_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_123_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_123_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + 
"options": "A: There is a box with 4 items of 3 different colors.\nB: There is a box with 3 items of all 3 different colors.\nC: There is a box with 2 items of all 3 different colors.\nD: There is a box with 3 items of all the same color.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with 4 items of 3 different colors.\nB: There is a box with 3 items of all 3 different colors.\nC: There is a box with 2 items of all 3 different colors.\nD: There is a box with 3 items of all the same color.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_124_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_124_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_124_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a green block at the top of the tower.\nB: The base of the tower is red.\nC: There is a blue block as the base of a tower.\nD: The tower has a yellow base block.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a green block at the top of the tower.\nB: The base of the tower is red.\nC: There is a blue block as the base of a tower.\nD: The tower has a yellow base block.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_125_0.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_125_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_125_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a pyramid with four blocks.\nB: There is a tower with six blocks.\nC: There is a house with four blocks.\nD: There is a tower with four blocks.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a pyramid with four blocks.\nB: There is a tower with six blocks.\nC: There is a house with four blocks.\nD: There is a tower with four blocks.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_126_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_126_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_126_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is no yellow circle closely touching the bottom of a box.\nB: There is no yellow triangle closely touching the bottom of a box.\nC: There is a yellow circle closely touching the bottom of a box.\nD: There is no blue circle closely touching the bottom of a box.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is no yellow 
circle closely touching the bottom of a box.\nB: There is no yellow triangle closely touching the bottom of a box.\nC: There is a yellow circle closely touching the bottom of a box.\nD: There is no blue circle closely touching the bottom of a box.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_127_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_127_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_127_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is 1 tower with a red block at the base\nB: There is 1 tower with a yellow block at the base\nC: There is 1 tower with a blue block at the base\nD: There are 2 towers with a yellow block at the base", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 tower with a red block at the base\nB: There is 1 tower with a yellow block at the base\nC: There is 1 tower with a blue block at the base\nD: There are 2 towers with a yellow block at the base", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_128_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_128_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_128_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 3 white circles\nB: There are 4 black 
circles\nC: There are 2 black circles\nD: There are 2 white squares", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 white circles\nB: There are 4 black circles\nC: There are 2 black circles\nD: There are 2 white squares", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_129_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_129_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_129_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a black tower.\nB: There is a black house.\nC: There is a white tower.\nD: There is a black tree.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a black tower.\nB: There is a black house.\nC: There is a white tower.\nD: There is a black tree.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_130_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_130_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_130_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: All towers have different heights.\nB: Most towers are of 
different heights.\nC: There is only one tower with a unique height.\nD: There are at least two towers with the same height.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: All towers have different heights.\nB: Most towers are of different heights.\nC: There is only one tower with a unique height.\nD: There are at least two towers with the same height.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_131_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_131_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_131_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a green hexagon on the table.\nB: There is a red circle on the floor.\nC: There is a yellow square touching the wall.\nD: There is a blue triangle near the door.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a green hexagon on the table.\nB: There is a red circle on the floor.\nC: There is a yellow square touching the wall.\nD: There is a blue triangle near the door.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_132_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_132_1.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_132_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there are exactly two squares not touching any edge\nB: there are exactly five squares not touching any edge\nC: there are exactly three squares not touching any edge\nD: there are exactly four squares not touching any edge", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are exactly two squares not touching any edge\nB: there are exactly five squares not touching any edge\nC: there are exactly three squares not touching any edge\nD: there are exactly four squares not touching any edge", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_133_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_133_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_133_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is 1 tower with a red block and a blue block\nB: There is 1 tower with a yellow block and a blue block\nC: There are 2 towers with yellow blocks\nD: There is 1 tower with yellow and red blocks", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 tower with a red block and a blue block\nB: 
There is 1 tower with a yellow block and a blue block\nC: There are 2 towers with yellow blocks\nD: There is 1 tower with yellow and red blocks", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_134_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_134_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_134_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a blue item in the center of a box.\nB: There is a blue item touching the left wall of a box.\nC: There is a blue item closely touching right wall of a box.\nD: There is a red item closely touching right wall of a box.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blue item in the center of a box.\nB: There is a blue item touching the left wall of a box.\nC: There is a blue item closely touching right wall of a box.\nD: There is a red item closely touching right wall of a box.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_135_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_135_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_135_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: One of the grey boxes has exactly two objects both of which are circles\nB: One of the grey boxes has exactly 
three objects all of which are squares\nC: One of the grey box has exactly three objects one of which is a circle\nD: One of the grey boxes has exactly one object which is a triangle", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: One of the grey boxes has exactly two objects both of which are circles\nB: One of the grey boxes has exactly three objects all of which are squares\nC: One of the grey box has exactly three objects one of which is a circle\nD: One of the grey boxes has exactly one object which is a triangle", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_136_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_136_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_136_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are three blue squares touching the edge\nB: There are no blue squares in the picture\nC: There is only one blue square in the center\nD: There are exactly two blue squares not touching the edge", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are three blue squares touching the edge\nB: There are no blue squares in the picture\nC: There is only one blue square in the center\nD: There are exactly two blue squares not touching the edge", + "input_image_path": [ + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_137_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_137_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_137_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: Only 2 yellow and one black item are touching the wall.\nB: Only 2 yellow and one red item are touching the wall.\nC: Only 3 yellow and one black item are touching the wall.\nD: Only 1 yellow and one black item are touching the wall.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: Only 2 yellow and one black item are touching the wall.\nB: Only 2 yellow and one red item are touching the wall.\nC: Only 3 yellow and one black item are touching the wall.\nD: Only 1 yellow and one black item are touching the wall.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_138_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_138_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_138_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: One box has 2 yellow squares\nB: One box has 3 yellow squares\nC: Two boxes have yellow squares\nD: One box has 2 red squares", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly 
describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: One box has 2 yellow squares\nB: One box has 3 yellow squares\nC: Two boxes have yellow squares\nD: One box has 2 red squares", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_139_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_139_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_139_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are more than 5 blue blocks\nB: There are no blue blocks\nC: There are exactly 2 blue blocks\nD: There are at least 3 blue blocks", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are more than 5 blue blocks\nB: There are no blue blocks\nC: There are exactly 2 blue blocks\nD: There are at least 3 blue blocks", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_140_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_140_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_140_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: the tower with three blocks has a yellow block at the top\nB: the tower with two blocks has a yellow block at the top\nC: the tower with two blocks has a blue block at the top\nD: the tower with two 
blocks has a yellow block at the bottom", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: the tower with three blocks has a yellow block at the top\nB: the tower with two blocks has a yellow block at the top\nC: the tower with two blocks has a blue block at the top\nD: the tower with two blocks has a yellow block at the bottom", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_141_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_141_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_141_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with 4 items of various colors.\nB: There is a box with 3 items of all 3 different colors.\nC: There is a box with 3 items all of the same color.\nD: There is a box with 2 items of all 3 different colors.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with 4 items of various colors.\nB: There is a box with 3 items of all 3 different colors.\nC: There is a box with 3 items all of the same color.\nD: There is a box with 2 items of all 3 different colors.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_142_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_142_1.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_142_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is one tower with a white block at the top\nB: there is one tower with a black block at the top\nC: there is a skyscraper with a blue block at the top\nD: there are two towers with a red block at the top", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is one tower with a white block at the top\nB: there is one tower with a black block at the top\nC: there is a skyscraper with a blue block at the top\nD: there are two towers with a red block at the top", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_143_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_143_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_143_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is a tower with a four block which has a red block over a blue block\nB: there is a tower with a four block which has a blue block over a blue block\nC: there is a tower with three blocks which has a blue block over a blue block\nD: there is a tower with a four block which has a yellow", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following 
choices.\nA: there is a tower with a four block which has a red block over a blue block\nB: there is a tower with a four block which has a blue block over a blue block\nC: there is a tower with three blocks which has a blue block over a blue block\nD: there is a tower with a four block which has a yellow", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_144_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_144_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_144_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are three blue squares touching the edge\nB: There are two red squares in the center\nC: There are exactly two blue squares not touching the edge\nD: All blue squares are touching the edge", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are three blue squares touching the edge\nB: There are two red squares in the center\nC: There are exactly two blue squares not touching the edge\nD: All blue squares are touching the edge", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_145_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_145_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_145_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: t least two of 
the towers ha yellow bases.\nB: None of the towers have yellow bases.\nC: All of the towers have blue bases.\nD: At least one of the towers has a red base.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: t least two of the towers ha yellow bases.\nB: None of the towers have yellow bases.\nC: All of the towers have blue bases.\nD: At least one of the towers has a red base.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_146_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_146_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_146_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with a blue square and a blue triangle.\nB: There is a box with a blue circle and a blue triangle.\nC: There is a box with a green circle and a green triangle.\nD: There is a box with a red circle and a red triangle.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with a blue square and a blue triangle.\nB: There is a box with a blue circle and a blue triangle.\nC: There is a box with a green circle and a green triangle.\nD: There is a box with a red circle and a red triangle.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_147_0.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_147_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_147_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: The top of the two four block towers are red.\nB: The top of the two four block towers are yellow.\nC: The bottom of the two four block towers are yellow.\nD: The top of the single five block tower is yellow.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: The top of the two four block towers are red.\nB: The top of the two four block towers are yellow.\nC: The bottom of the two four block towers are yellow.\nD: The top of the single five block tower is yellow.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_148_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_148_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_148_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is a tower with a yellow block over a blue block\nB: there is a tower with a red block over a green block\nC: there is a tower with a black block over a red block\nD: there is a tower with a black block over a blue block", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial 
context.\nSelect from the following choices.\nA: there is a tower with a yellow block over a blue block\nB: there is a tower with a red block over a green block\nC: there is a tower with a black block over a red block\nD: there is a tower with a black block over a blue block", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_149_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_149_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_149_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is 1 tower with a blue block at the base\nB: There are 2 towers with yellow blocks at the base\nC: There are 3 towers with green blocks at the base\nD: There is 1 tower with a yellow block at the base", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 tower with a blue block at the base\nB: There are 2 towers with yellow blocks at the base\nC: There are 3 towers with green blocks at the base\nD: There is 1 tower with a yellow block at the base", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_150_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_150_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_150_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a tower with 
a blue block above a blue block\nB: There is a tower with a blue block above a red block\nC: There is a tower with a red block above a blue block\nD: There is a tower with a blue block below a blue block", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with a blue block above a blue block\nB: There is a tower with a blue block above a red block\nC: There is a tower with a red block above a blue block\nD: There is a tower with a blue block below a blue block", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_151_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_151_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_151_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is a red circle in the center\nB: there are no circles touching the edge\nC: all circles are blue\nD: there is at least one yellow circle touching the edge", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is a red circle in the center\nB: there are no circles touching the edge\nC: all circles are blue\nD: there is at least one yellow circle touching the edge", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_152_0.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_152_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_152_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is only 1 tower that contains white blocks\nB: There are 3 towers that contain black blocks\nC: There are two towers that contain black blocks\nD: There is only 1 tower than contains black blccks", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is only 1 tower that contains white blocks\nB: There are 3 towers that contain black blocks\nC: There are two towers that contain black blocks\nD: There is only 1 tower than contains black blccks", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_153_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_153_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_153_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with items of only black color.\nB: There is a box with exactly 3 items of black and blue color.\nC: There is a box with more than 3 items of black and red color.\nD: There is a box with 3 items at most of black and blue color.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the 
spatial context.\nSelect from the following choices.\nA: There is a box with items of only black color.\nB: There is a box with exactly 3 items of black and blue color.\nC: There is a box with more than 3 items of black and red color.\nD: There is a box with 3 items at most of black and blue color.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_154_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_154_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_154_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a stack of 2 green blocks side by side\nB: There is a tower with 2 red blocks stacked together\nC: There is a tower with 3 blue blocks stacked together\nD: There is a tower with 2 blue blocks stacked together", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a stack of 2 green blocks side by side\nB: There is a tower with 2 red blocks stacked together\nC: There is a tower with 3 blue blocks stacked together\nD: There is a tower with 2 blue blocks stacked together", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_155_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_155_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_155_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": 
"nlvr", + "options": "A: One box has 3 yellow squares\nB: One box has 2 blue squares\nC: One box has 2 red squares\nD: One box has 2 yellow squares", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: One box has 3 yellow squares\nB: One box has 2 blue squares\nC: One box has 2 red squares\nD: One box has 2 yellow squares", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_156_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_156_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_156_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a tower with 3 blue blocks stacked together\nB: There is a tower with 2 red blocks stacked together\nC: There is a tower with 2 blue blocks stacked together\nD: There is a single blue block in the tower", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with 3 blue blocks stacked together\nB: There is a tower with 2 red blocks stacked together\nC: There is a tower with 2 blue blocks stacked together\nD: There is a single blue block in the tower", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_157_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_157_1.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_157_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is no blue block.\nB: There is at least one black block on a blue block.\nC: There is a blue block on a black block.\nD: There are only black blocks.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is no blue block.\nB: There is at least one black block on a blue block.\nC: There is a blue block on a black block.\nD: There are only black blocks.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_158_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_158_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_158_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: The top of the two three block towers are yellow.\nB: The top of the two four block towers are yellow.\nC: The bottom of the two four block towers are yellow.\nD: The top of the two four block towers are red.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: The top of the two three block towers are yellow.\nB: The top of the two four block towers are yellow.\nC: The bottom of the two four block towers are yellow.\nD: The top of the two 
four block towers are red.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_159_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_159_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_159_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are exactly two black squares touching every edge\nB: There are exactly two white squares not touching any edge\nC: There are exactly two black squares not touching any edge\nD: There are exactly three black squares not touching any edge", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are exactly two black squares touching every edge\nB: There are exactly two white squares not touching any edge\nC: There are exactly two black squares not touching any edge\nD: There are exactly three black squares not touching any edge", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_160_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_160_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_160_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there are two yellow circles touching the base\nB: there are two red circles touching the base\nC: there are three yellow circles touching the base\nD: there is one yellow circle touching the 
base", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are two yellow circles touching the base\nB: there are two red circles touching the base\nC: there are three yellow circles touching the base\nD: there is one yellow circle touching the base", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_161_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_161_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_161_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a black block at the bottom of a tower with two blocks.\nB: There is a black block alone on a flat surface.\nC: There is a red block at the top of a tower with three blocks.\nD: There is a black block as the top of a tower with at least two blocks.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a black block at the bottom of a tower with two blocks.\nB: There is a black block alone on a flat surface.\nC: There is a red block at the top of a tower with three blocks.\nD: There is a black block as the top of a tower with at least two blocks.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_162_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_162_1.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_162_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: All blue items are in different boxes.\nB: ll blue items are in the same box.\nC: None of the blue items are in the same box.\nD: Only some blue items are in the same box.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: All blue items are in different boxes.\nB: ll blue items are in the same box.\nC: None of the blue items are in the same box.\nD: Only some blue items are in the same box.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_163_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_163_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_163_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 3 towers with 1 yellow block\nB: There are 2 towers with 3 yellow blocks\nC: There is 1 tower with 2 red blocks\nD: There is 1 tower with 3 yellow blocks", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 towers with 1 yellow block\nB: There are 2 towers with 3 yellow blocks\nC: There is 1 tower with 2 red blocks\nD: There is 1 tower with 3 yellow blocks", + "input_image_path": [ + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_164_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_164_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_164_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is 1 red circle\nB: There is 1 black circle\nC: There is 1 black square\nD: There are 2 black circles", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 red circle\nB: There is 1 black circle\nC: There is 1 black square\nD: There are 2 black circles", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_165_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_165_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_165_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is exactly one tower with a red block at base\nB: There is exactly one tower with a yellow block at base\nC: There are two towers with a yellow block at base\nD: There is no tower with a yellow block at base", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is exactly one tower with a red block at base\nB: 
There is exactly one tower with a yellow block at base\nC: There are two towers with a yellow block at base\nD: There is no tower with a yellow block at base", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_166_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_166_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_166_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is at least one tower which has a yellow block above a black block\nB: there is at least one tower which has a black block above a yellow block\nC: all towers have a yellow block above a black block\nD: there is no tower which has a yellow block above a black block", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is at least one tower which has a yellow block above a black block\nB: there is at least one tower which has a black block above a yellow block\nC: all towers have a yellow block above a black block\nD: there is no tower which has a yellow block above a black block", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_167_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_167_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_167_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a 
tower with a blue block at the top.\nB: There is a blue tower with all blocks the same color.\nC: There is a tower that the second block from the base is blue.\nD: There is a tower with the second block from the top blue.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with a blue block at the top.\nB: There is a blue tower with all blocks the same color.\nC: There is a tower that the second block from the base is blue.\nD: There is a tower with the second block from the top blue.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_168_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_168_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_168_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: blue squares are touching the bottom edge\nB: blue squares are touching the top edge\nC: blue squares are not touching any edge\nD: blue squares are touching all edges", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: blue squares are touching the bottom edge\nB: blue squares are touching the top edge\nC: blue squares are not touching any edge\nD: blue squares are touching all edges", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_169_0.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_169_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_169_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with a yellow circle and 2 black squares.\nB: There is a box with a yellow triangle and 2 black circles.\nC: There is a box with a yellow triangle and 2 black squares.\nD: There is a box with a yellow triangle and 3 black squares.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with a yellow circle and 2 black squares.\nB: There is a box with a yellow triangle and 2 black circles.\nC: There is a box with a yellow triangle and 2 black squares.\nD: There is a box with a yellow triangle and 3 black squares.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_170_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_170_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_170_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a yellow block as the base of a tower.\nB: There is a yellow block at the top of the tower.\nC: There is no yellow block as the base of a tower.\nD: There are two yellow blocks in the middle of the tower.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly 
describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a yellow block as the base of a tower.\nB: There is a yellow block at the top of the tower.\nC: There is no yellow block as the base of a tower.\nD: There are two yellow blocks in the middle of the tower.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_171_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_171_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_171_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are multiple towers with different colors.\nB: There is a single block tower with multiple colors.\nC: There is a two blocks tower with different colors.\nD: There is a two blocks tower that has only one color.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are multiple towers with different colors.\nB: There is a single block tower with multiple colors.\nC: There is a two blocks tower with different colors.\nD: There is a two blocks tower that has only one color.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_172_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_172_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_172_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": 
"synthetic image", + "source": "nlvr", + "options": "A: the single block is yellow\nB: the tower with two blocks has a yellow block at the top\nC: the tower with two blocks has a red block at the top\nD: the tower with three blocks has a yellow block at the top", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: the single block is yellow\nB: the tower with two blocks has a yellow block at the top\nC: the tower with two blocks has a red block at the top\nD: the tower with three blocks has a yellow block at the top", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_173_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_173_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_173_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a tower with a blue block over a yellow block\nB: There is a tower with two yellow blocks\nC: There is a tower with a yellow block over a blue block\nD: There is a tower with a green block over a yellow block", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with a blue block over a yellow block\nB: There is a tower with two yellow blocks\nC: There is a tower with a yellow block over a blue block\nD: There is a tower with a green block over a yellow block", + "input_image_path": [ + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_174_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_174_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_174_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is one black triangle not touching any edge\nB: there are two black triangles touching the edges\nC: there are no black triangles visible\nD: there are two black triangles not touching any edge", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is one black triangle not touching any edge\nB: there are two black triangles touching the edges\nC: there are no black triangles visible\nD: there are two black triangles not touching any edge", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_175_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_175_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_175_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are no black triangles touching any edge\nB: There is exactly one black triangle touching an edge\nC: There are two black triangles not touching any edges\nD: There is exactly one black triangle not touching any edge", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + 
"context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are no black triangles touching any edge\nB: There is exactly one black triangle touching an edge\nC: There are two black triangles not touching any edges\nD: There is exactly one black triangle not touching any edge", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_176_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_176_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_176_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are two towers that has black block at the top.\nB: There are no towers in the image.\nC: There is only one tower with a black block at the top.\nD: There are two towers, but they have red blocks at the top.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are two towers that has black block at the top.\nB: There are no towers in the image.\nC: There is only one tower with a black block at the top.\nD: There are two towers, but they have red blocks at the top.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_177_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_177_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_177_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + 
"visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 4 black circles\nB: There are 3 black circles\nC: There are 2 white circles\nD: There are 2 black circles", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 4 black circles\nB: There are 3 black circles\nC: There are 2 white circles\nD: There are 2 black circles", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_178_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_178_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_178_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a blocking tower made of three stones.\nB: There is a tower with four same colored blocks.\nC: There is a tower with three different colored blocks.\nD: There is a tower that has three the same blocks color.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blocking tower made of three stones.\nB: There is a tower with four same colored blocks.\nC: There is a tower with three different colored blocks.\nD: There is a tower that has three the same blocks color.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_179_0.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_179_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_179_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are five circles not touching any edge\nB: There are exactly four circles touching one edge\nC: There are exactly three circles not touching any edge\nD: There are exactly four circles not touching any edge", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are five circles not touching any edge\nB: There are exactly four circles touching one edge\nC: There are exactly three circles not touching any edge\nD: There are exactly four circles not touching any edge", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_180_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_180_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_180_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a red block as the top of a tower with at least two blocks.\nB: There is a blue block as the bottom of a tower with at least two blocks.\nC: There is a blue block as the top of a tower with at least two blocks.\nD: There is a blue block as the top of a single block tower", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly 
describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a red block as the top of a tower with at least two blocks.\nB: There is a blue block as the bottom of a tower with at least two blocks.\nC: There is a blue block as the top of a tower with at least two blocks.\nD: There is a blue block as the top of a single block tower", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_181_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_181_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_181_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: one of the grey squares is empty\nB: one of the grey squares has exactly five objects\nC: one of the grey square has exactly four objects\nD: one of the grey squares has exactly three objects", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: one of the grey squares is empty\nB: one of the grey squares has exactly five objects\nC: one of the grey square has exactly four objects\nD: one of the grey squares has exactly three objects", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_182_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_182_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_182_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + 
"visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is at least 1 circle closely touching a box corner\nB: There is at least 1 square closely tocuhing a box corner\nC: There is at least 1 square touching the center of a box\nD: There is at least 1 triangle closely touching a box corner", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is at least 1 circle closely touching a box corner\nB: There is at least 1 square closely tocuhing a box corner\nC: There is at least 1 square touching the center of a box\nD: There is at least 1 triangle closely touching a box corner", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_183_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_183_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_183_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: Each grey box contains atleast one yellow object touching the edge\nB: Each grey box has no object touching the edge\nC: Each grey box is empty\nD: Each grey box contains a green object", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: Each grey box contains atleast one yellow object touching the edge\nB: Each grey box has no object touching the edge\nC: Each grey box is empty\nD: Each grey box contains a green object", + "input_image_path": [ + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_184_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_184_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_184_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is at least 1 tower with a blue block at the top\nB: There are exactly 2 towers with a blue block at the top\nC: There are no towers with a blue block at the top\nD: There is at least 1 tower with a green block at the top", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is at least 1 tower with a blue block at the top\nB: There are exactly 2 towers with a blue block at the top\nC: There are no towers with a blue block at the top\nD: There is at least 1 tower with a green block at the top", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_185_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_185_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_185_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: No towers have any height.\nB: All towers have different heights.\nC: There are at least two towers with the same height.\nD: There is only one tower with the same height.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + 
"context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: No towers have any height.\nB: All towers have different heights.\nC: There are at least two towers with the same height.\nD: There is only one tower with the same height.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_186_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_186_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_186_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a tower with three blue blocks.\nB: There is a tower with a black block and two blue blocks.\nC: There is a tower with two black blocks and a blue block.\nD: There is a tower with a black block and a red block.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with three blue blocks.\nB: There is a tower with a black block and two blue blocks.\nC: There is a tower with two black blocks and a blue block.\nD: There is a tower with a black block and a red block.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_187_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_187_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_187_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic 
image", + "source": "nlvr", + "options": "A: there is a tower with a yellow block below a red block at the top\nB: there is a tower with a red block below a yellow block at the top\nC: there is a tower with a blue block below a green block at the top\nD: there is a tower with a yellow block below a yellow block at the top", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is a tower with a yellow block below a red block at the top\nB: there is a tower with a red block below a yellow block at the top\nC: there is a tower with a blue block below a green block at the top\nD: there is a tower with a yellow block below a yellow block at the top", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_188_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_188_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_188_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 2 blue blocks\nB: There are 4 blue blocks\nC: There are 3 blue blocks\nD: There are 2 red blocks", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 2 blue blocks\nB: There are 4 blue blocks\nC: There are 3 blue blocks\nD: There are 2 red blocks", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_189_0.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_189_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_189_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is 1 tower with 2 yellow blocks at the base\nB: There are 2 towers with 1 yellow block at the base\nC: There is 1 tower with 1 red block at the base\nD: There is 1 tower with 1 yellow block at the base", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 tower with 2 yellow blocks at the base\nB: There are 2 towers with 1 yellow block at the base\nC: There is 1 tower with 1 red block at the base\nD: There is 1 tower with 1 yellow block at the base", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_190_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_190_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_190_2.png" + ], + "output": "D" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are two red blocks as the base of a tower.\nB: There is one yellow block as the base of a tower.\nC: There are two yellow blocks as the base of a tower.\nD: There are three yellow blocks as the base of a tower.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial 
context.\nSelect from the following choices.\nA: There are two red blocks as the base of a tower.\nB: There is one yellow block as the base of a tower.\nC: There are two yellow blocks as the base of a tower.\nD: There are three yellow blocks as the base of a tower.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_191_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_191_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_191_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: there is no tower with a yellow block above a black block\nB: there is at least one tower which has a yellow block above a black block\nC: every tower has a yellow block above a black block\nD: there is a yellow block below every black block", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is no tower with a yellow block above a black block\nB: there is at least one tower which has a yellow block above a black block\nC: every tower has a yellow block above a black block\nD: there is a yellow block below every black block", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_192_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_192_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_192_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", 
+ "source": "nlvr", + "options": "A: There are 3 yellow squares\nB: There are 4 yellow squares\nC: There are 3 yellow circles\nD: There are 2 yellow squares", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 yellow squares\nB: There are 4 yellow squares\nC: There are 3 yellow circles\nD: There are 2 yellow squares", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_193_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_193_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_193_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are exactly two black blocks as the top of a tower.\nB: There are exactly two black blocks at the bottom of a tower.\nC: There is one black block at the top of a tower.\nD: There are exactly three black blocks as the top of a tower.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are exactly two black blocks as the top of a tower.\nB: There are exactly two black blocks at the bottom of a tower.\nC: There is one black block at the top of a tower.\nD: There are exactly three black blocks as the top of a tower.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_194_0.png", + 
"../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_194_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_194_2.png" + ], + "output": "A" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a box with items of various colors.\nB: There is a box with items of only one color.\nC: There is no box with items in it.\nD: There are multiple boxes with items of one color each.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with items of various colors.\nB: There is a box with items of only one color.\nC: There is no box with items in it.\nD: There are multiple boxes with items of one color each.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_195_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_195_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_195_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There is a blue item floating in the middle of the box.\nB: There is a blue item closely touching right wall of a box.\nC: There is a green item touching the ceiling of a box.\nD: There is a red item closely touching the left wall of a box.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from 
the following choices.\nA: There is a blue item floating in the middle of the box.\nB: There is a blue item closely touching right wall of a box.\nC: There is a green item touching the ceiling of a box.\nD: There is a red item closely touching the left wall of a box.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_196_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_196_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_196_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 3 boxes with a black item on top.\nB: There are 2 boxes with a white item on top.\nC: There are 2 boxes with a black item on top.\nD: There are 2 boxes with nothing on top.", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 boxes with a black item on top.\nB: There are 2 boxes with a white item on top.\nC: There are 2 boxes with a black item on top.\nD: There are 2 boxes with nothing on top.", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_197_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_197_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_197_2.png" + ], + "output": "C" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 2 white circles\nB: There are 2 black circles\nC: There are 3 black 
circles\nD: There are 4 black circles", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 2 white circles\nB: There are 2 black circles\nC: There are 3 black circles\nD: There are 4 black circles", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_198_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_198_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_198_2.png" + ], + "output": "B" + }, + { + "task": "Image_Captioning_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "nlvr", + "options": "A: There are 3 white blocks\nB: There are 2 black blocks\nC: There are 3 black blocks\nD: There are 4 black blocks", + "question": "Please correctly describe this set of images from the perspective of the spatial context.", + "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 white blocks\nB: There are 2 black blocks\nC: There are 3 black blocks\nD: There are 4 black blocks", + "input_image_path": [ + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_199_0.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_199_1.png", + "../MMIU-Benchmark/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_199_2.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + 
"question": "What is at the bottom?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_0_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_0_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_0_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_0_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the water bottle inside the tent?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_1_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_1_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_1_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is next to the block?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_2_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_2_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_2_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_2_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_2_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_2_5.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + 
"options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is next to the one shaped like a cube?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_3_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_3_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_3_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_3_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_3_4.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the flashlight?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_4_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_4_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_4_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_4_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the piggy bank inside the gift box?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_5_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_5_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_5_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is above the bench?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_6_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_6_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_6_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_6_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_6_4.png" + ], + "output": "D" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is shaped like a cone and is above the desk?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_7_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_7_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_7_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the dice?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_8_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_8_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_8_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_8_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_8_4.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the watermelon?", + "context": "Please answer a 
multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_9_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_9_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_9_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_9_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is beside the one shaped like a cone?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_10_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_10_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_10_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_10_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_10_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_10_5.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the cake inside the oven?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_11_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_11_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_11_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is below the bed?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_12_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_12_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_12_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is beside the one shaped like a cube?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_13_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_13_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_13_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_13_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_13_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_13_5.png" + ], + "output": "D" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is above the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_14_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_14_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_14_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_15_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_15_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_15_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_15_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the volleyball?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_16_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_16_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_16_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_16_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_16_4.png" + ], + "output": "D" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is shaped like a sphere and is below the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_17_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_17_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_17_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the clock?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_18_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_18_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_18_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_18_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is below the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_19_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_19_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_19_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_19_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_19_4.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_20_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_20_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_20_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is below the bed?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_21_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_21_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_21_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_22_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_22_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_22_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_22_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is next to the pine cone?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_23_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_23_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_23_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_23_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_23_4.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the left?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_24_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_24_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_24_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_25_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_25_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_25_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is above the bench?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_26_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_26_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_26_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_26_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is at the top?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_27_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_27_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_27_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the bead?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_28_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_28_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_28_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_28_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_28_4.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is beside the crate?", + "context": "Please answer a 
multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_29_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_29_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_29_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_29_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the cup?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_30_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_30_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_30_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_30_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is above the bench?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_31_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_31_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_31_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_31_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_31_4.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is below the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_32_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_32_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_32_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the muffins outside the oven?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_33_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_33_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_33_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is below the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_34_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_34_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_34_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_34_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_35_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_35_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_35_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is next to the dog dish?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_36_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_36_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_36_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_36_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_36_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_36_5.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is in the middle?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_37_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_37_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_37_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_37_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object below the bed is shaped like a cone?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_38_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_38_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_38_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is in the middle?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_39_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_39_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_39_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_39_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is below the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_40_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_40_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_40_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is in the middle?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_41_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_41_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_41_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_41_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is above the bench?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_42_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_42_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_42_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the cow outside the barn?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_43_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_43_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_43_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the soccer ball outside the gift box?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_44_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_44_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_44_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the flashlight?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_45_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_45_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_45_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_45_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the watermelon?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_46_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_46_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_46_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_46_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is below the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_47_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_47_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_47_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is on the left?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_48_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_48_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_48_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_48_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the bead?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_49_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_49_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_49_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_49_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the clock?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_50_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_50_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_50_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_50_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the pair of shoes?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_51_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_51_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_51_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_51_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_51_4.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the one shaped like a sphere?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_52_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_52_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_52_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_52_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_52_4.png" + ], + "output": "D" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is beside the tub of ice cream?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_53_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_53_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_53_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_53_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_53_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_53_5.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is below the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_54_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_54_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_54_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_54_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is at the bottom?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_55_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_55_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_55_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_55_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the block?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_56_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_56_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_56_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_56_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is below the desk?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_57_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_57_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_57_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_57_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is above the bench?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_58_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_58_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_58_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_58_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is next to the trash can?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_59_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_59_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_59_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_59_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_59_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_59_5.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the mailing box?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_60_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_60_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_60_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_60_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_60_4.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is in the middle?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_61_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_61_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_61_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_61_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_62_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_62_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_62_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_62_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is on the left?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_63_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_63_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_63_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_63_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is shaped like a sphere and is below the bench?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_64_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_64_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_64_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_65_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_65_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_65_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the basketball?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_66_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_66_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_66_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_66_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is below the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_67_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_67_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_67_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_67_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_67_4.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the flashlight?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_68_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_68_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_68_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_68_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the pine cone?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_69_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_69_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_69_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_69_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is at the top?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_70_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_70_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_70_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is beside the top?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_71_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_71_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_71_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_71_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_71_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_71_5.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the one shaped like a cone?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_72_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_72_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_72_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_72_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_72_4.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is at the top?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_73_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_73_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_73_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_73_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_74_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_74_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_74_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_74_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the roast beef inside the oven?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_75_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_75_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_75_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is at the top?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_76_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_76_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_76_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the bead?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_77_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_77_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_77_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_77_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is shaped like a cylinder and is below the bed?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_78_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_78_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_78_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is next to the one shaped like a cube?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_79_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_79_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_79_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_79_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_79_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_79_5.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", 
+ "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object below the bed is shaped like a cylinder?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_80_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_80_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_80_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the clock?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_81_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_81_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_81_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_81_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_81_4.png" + ], + "output": "D" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_82_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_82_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_82_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is on the left?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_83_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_83_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_83_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_83_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is at the bottom?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_84_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_84_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_84_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is shaped like a cube and is above the bed?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_85_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_85_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_85_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is in the middle?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_86_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_86_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_86_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_86_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_87_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_87_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_87_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_87_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the left?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_88_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_88_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_88_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is next to the storage bin?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_89_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_89_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_89_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_89_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_89_4.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is below the desk?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_90_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_90_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_90_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the basketball inside the gift box?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_91_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_91_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_91_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the toy airplane outside the gift box?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_92_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_92_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_92_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is shaped like a sphere and is below the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_93_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_93_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_93_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is beside the pine cone?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_94_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_94_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_94_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_94_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is on the left?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_95_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_95_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_95_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_95_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the box of cookies?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_96_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_96_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_96_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_96_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_96_4.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the computer?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_97_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_97_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_97_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_97_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_97_4.png" + ], + "output": "D" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is beside the butterfly?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_98_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_98_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_98_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_98_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_98_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_98_5.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is on the left?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_99_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_99_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_99_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_99_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the flashlight?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_100_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_100_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_100_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_100_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_101_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_101_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_101_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is beside the one shaped like a cylinder?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_102_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_102_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_102_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_102_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_102_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_102_5.png" + ], + "output": "E" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is at the bottom?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_103_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_103_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_103_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_103_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_104_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_104_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_104_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_104_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_105_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_105_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_105_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_106_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_106_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_106_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the left?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_107_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_107_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_107_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the pair of shoes?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_108_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_108_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_108_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_108_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is in the middle?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_109_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_109_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_109_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_109_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is below the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_110_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_110_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_110_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is below the bench?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_111_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_111_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_111_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_111_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the book inside the gift box?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_112_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_112_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_112_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is above the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_113_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_113_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_113_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is above the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_114_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_114_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_114_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is at the bottom?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_115_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_115_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_115_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object above the table is shaped like a cylinder?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_116_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_116_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_116_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_117_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_117_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_117_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object above the desk is shaped like a cylinder?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_118_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_118_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_118_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is at the bottom?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_119_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_119_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_119_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is in the middle?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_120_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_120_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_120_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_120_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is below the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_121_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_121_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_121_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_121_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_121_4.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object above the bench is shaped like a cube?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_122_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_122_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_122_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is at the top?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_123_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_123_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_123_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is next to the one shaped like a cube?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_124_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_124_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_124_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_124_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_124_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_124_5.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is at the bottom?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_125_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_125_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_125_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the left?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_126_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_126_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_126_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_127_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_127_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_127_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is at the top?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_128_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_128_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_128_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_128_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is shaped like a sphere and is above the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_129_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_129_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_129_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is in the middle?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_130_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_130_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_130_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_130_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the drum?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_131_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_131_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_131_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_131_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_131_4.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is in the middle?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_132_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_132_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_132_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_132_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is at the bottom?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_133_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_133_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_133_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_133_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is shaped like a cone and is below the desk?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_134_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_134_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_134_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the toy pony outside the gift box?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_135_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_135_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_135_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the backpack?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_136_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_136_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_136_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_136_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_136_4.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the pair of shoes?", + 
"context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_137_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_137_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_137_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_137_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is below the desk?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_138_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_138_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_138_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_138_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_138_4.png" + ], + "output": "D" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_139_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_139_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_139_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_139_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the toy car inside the toy box?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_140_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_140_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_140_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the one shaped like a cube?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_141_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_141_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_141_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_141_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_141_4.png" + ], + "output": "D" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is next to the one shaped like a cone?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_142_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_142_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_142_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_142_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_142_4.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is next to the one shaped like a cone?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_143_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_143_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_143_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_143_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_143_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_143_5.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the left?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_144_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_144_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_144_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is above the bed?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_145_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_145_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_145_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the left?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_146_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_146_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_146_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is below the bench?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_147_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_147_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_147_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_147_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the butterfly?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_148_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_148_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_148_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_148_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_148_4.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is shaped like a cylinder and is above the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_149_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_149_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_149_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is next to the block?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_150_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_150_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_150_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_150_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_150_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_150_5.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + 
"source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is in the middle?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_151_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_151_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_151_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_151_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the cake?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_152_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_152_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_152_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_152_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_152_4.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is below the bed?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_153_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_153_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_153_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the left?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_154_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_154_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_154_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is at the top?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_155_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_155_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_155_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_155_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the pair of shoes?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_156_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_156_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_156_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_156_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_156_4.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is in the middle?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_157_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_157_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_157_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_157_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is below the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_158_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_158_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_158_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is above the bench?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_159_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_159_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_159_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is at the bottom?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_160_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_160_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_160_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is on the left?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_161_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_161_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_161_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_161_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the roll of stickers?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_162_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_162_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_162_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_162_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the roast beef inside the oven?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_163_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_163_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_163_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the cookies outside the oven?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_164_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_164_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_164_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is next to the one shaped like a sphere?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_165_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_165_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_165_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_165_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_165_4.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is below the bench?", + "context": "Please 
answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_166_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_166_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_166_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the muffins outside the oven?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_167_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_167_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_167_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_168_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_168_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_168_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is in the middle?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_169_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_169_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_169_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_169_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is beside the storage bin?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_170_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_170_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_170_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_170_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is shaped like a sphere and is below the bench?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_171_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_171_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_171_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is next to the top?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_172_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_172_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_172_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_172_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_172_4.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the top?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_173_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_173_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_173_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_173_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_173_4.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is at the bottom?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_174_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_174_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_174_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the drum?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_175_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_175_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_175_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_175_3.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is in the middle?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_176_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_176_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_176_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_176_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is shaped like a cube and is below the bench?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_177_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_177_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_177_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is below the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_178_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_178_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_178_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the clock?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_179_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_179_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_179_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_179_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is beside the flashlight?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_180_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_180_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_180_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_180_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_180_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_180_5.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is above the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_181_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_181_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_181_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_181_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the left?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_182_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_182_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_182_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object below the table is shaped like a sphere?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_183_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_183_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_183_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is at the bottom?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_184_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_184_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_184_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_184_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object above the bench is shaped like a cone?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_185_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_185_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_185_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is beside the watermelon?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_186_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_186_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_186_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_186_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_186_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_186_5.png" + ], + "output": "E" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + 
"source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object below the bed is shaped like a cone?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_187_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_187_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_187_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "Which object is next to the trash can?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_188_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_188_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_188_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_188_3.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is on the right?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_189_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_189_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_189_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_189_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which picture shows the cow outside the barn?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_190_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_190_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_190_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is on the left?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_191_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_191_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_191_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is next to the one shaped like a sphere?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_192_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_192_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_192_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_192_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_192_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_192_5.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": 
"synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is below the bed?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_193_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_193_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_193_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "Which object is shaped like a cone and is above the table?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_194_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_194_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_194_2.png" + ], + "output": "B" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image", + "question": "What is at the bottom?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_195_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_195_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_195_2.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image", + "question": "What is at the top?", + "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_196_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_196_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_196_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_196_3.png" + ], + "output": "C" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "question": "Which object is next to the top?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_197_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_197_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_197_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_197_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_197_4.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_197_5.png" + ], + "output": "D" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is next to the bunch of bananas?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_198_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_198_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_198_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_198_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_198_4.png" + ], + "output": "A" + }, + { + "task": "Icon_Question_Answering_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "iconqa", + "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "question": "Which object is beside the one shaped like a cube?", + "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", + "input_image_path": [ + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_199_0.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_199_1.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_199_2.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_199_3.png", + "../MMIU-Benchmark/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_199_4.png" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_17.jpg" + ], + "output": "G" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_7.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_17.jpg" + ], + "output": "G" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_7.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_17.jpg" + ], + "output": "I" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_17.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_7.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_17.jpg" + ], + "output": "I" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_17.jpg" + ], + "output": "I" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_7.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_17.jpg" + ], + "output": "G" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_17.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_17.jpg" + ], + "output": "I" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_7.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_17.jpg" + ], + "output": "F" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th 
image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_11.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_17.jpg" + ], + "output": "I" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_7.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_17.jpg" + ], + "output": "G" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_17.jpg" + ], + "output": "H" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_17.jpg" + ], + "output": "H" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_17.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_17.jpg" + ], + "output": "H" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_7.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_17.jpg" + ], + "output": "G" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_7.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_17.jpg" + ], + "output": "G" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_7.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_17.jpg" + ], + "output": "I" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_7.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_7.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th 
image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_11.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_17.jpg" + ], + "output": "I" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_7.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_17.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_17.jpg" + ], + "output": "G" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_17.jpg" + ], + "output": "F" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_7.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_17.jpg" + ], + "output": "G" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_17.jpg" + ], + "output": "F" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_17.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_17.jpg" + ], + "output": "I" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_7.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_17.jpg" + ], + "output": "F" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_17.jpg" + ], + "output": "H" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_17.jpg" + ], + "output": "H" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_17.jpg" + ], + "output": "F" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_7.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th 
image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_11.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_17.jpg" + ], + "output": "H" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_17.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_17.jpg" + ], + "output": "F" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_7.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th 
image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_11.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_7.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_7.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_17.jpg" + ], + "output": "I" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_17.jpg" + ], + "output": "F" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_17.jpg" + ], + "output": "H" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_17.jpg" + ], + "output": "G" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_17.jpg" + ], + "output": "G" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_17.jpg" + ], + "output": "H" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_7.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_17.jpg" + ], + "output": "H" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_17.jpg" + ], + "output": "I" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_17.jpg" + ], + "output": "G" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_7.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_7.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 
18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_11.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_17.jpg" + ], + "output": "H" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_17.jpg" + ], + "output": "F" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_17.jpg" + ], + "output": "F" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_17.jpg" + ], + "output": "H" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_17.jpg" + ], + "output": "F" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_17.jpg" + ], + "output": "H" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_17.jpg" + ], + "output": "I" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_17.jpg" + ], + "output": "I" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_17.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_7.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_17.jpg" + ], + "output": "F" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_7.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_17.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_17.jpg" + ], + "output": "I" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_17.jpg" + ], + "output": "I" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_17.jpg" + ], + "output": "H" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_17.jpg" + ], + "output": "G" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_7.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_17.jpg" + ], + "output": "F" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_7.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_17.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_17.jpg" + ], + "output": "I" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_7.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_17.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_7.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_17.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_17.jpg" + ], + "output": "H" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_17.jpg" + ], + "output": "G" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_17.jpg" + ], + "output": "G" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_7.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_17.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_17.jpg" + ], + "output": "G" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_17.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_17.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_7.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_7.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_17.jpg" + ], + "output": "H" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_17.jpg" + ], + "output": "A" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_6.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_7.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_8.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_13.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_17.jpg" + ], + "output": "C" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_17.jpg" + ], + "output": "D" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_17.jpg" + ], + "output": "E" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_17.jpg" + ], + "output": "B" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_17.jpg" + ], + "output": "F" + }, + { + "task": "Image_text_retrieval_with_Spatial_Context", + "visual_input_component": "synthetic image", + "source": "SPEC", + "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "question": "Please retrieve the matching image to the query text in the candidate images.", + "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", + "input_image_path": [ + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_0.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_1.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_2.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_3.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_4.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_5.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_6.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_7.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_8.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_9.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_10.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_11.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_12.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_13.jpg", + 
"../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_14.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_15.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_16.jpg", + "../MMIU-Benchmark/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_17.jpg" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_7.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_0_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_5.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_1_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_3.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_2_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_1.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_3_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_4_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th 
image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_5_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + 
"context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_6_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 
14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_7_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + 
"visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_13.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_8_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_11.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_9_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_9.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_10_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_6.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_11_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_3.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_12_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_0.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_13_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 
13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_14_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images 
from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_15_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th 
image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_16_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + 
"source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_14.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_17_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_11.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_18_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_8.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_19_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_5.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_20_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_2.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_21_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_22_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following 
choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_23_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best 
completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_24_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th 
image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_25_15.png" + ], + "output": "E" + }, + { 
+ "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_13.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_26_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_10.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_27_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_7.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_28_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_4.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_29_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_1.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_30_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + 
"input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_31_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select 
from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_32_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, 
which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_33_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th 
image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_34_15.png" + ], + 
"output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_13.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_35_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_10.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_36_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_7.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_37_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_4.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_38_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_1.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_39_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + 
"input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_40_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select 
from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_41_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, 
which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_42_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th 
image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_43_15.png" + ], + 
"output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_13.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_44_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_10.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_45_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_7.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_46_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_4.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_47_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_1.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_48_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + 
"input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_49_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select 
from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_50_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, 
which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_51_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th 
image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_52_15.png" + ], + 
"output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_13.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_53_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_10.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_54_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_7.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_55_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_4.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_56_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_1.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_57_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + 
"input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_58_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select 
from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_59_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, 
which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_60_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th 
image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_61_15.png" + ], + 
"output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_13.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_62_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_10.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_63_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_7.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_64_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_4.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_65_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_1.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_66_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + 
"input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_67_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select 
from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_68_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, 
which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_69_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th 
image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_70_15.png" + ], + 
"output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_13.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_71_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_10.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_72_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_7.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_73_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_4.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_74_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_1.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_75_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + 
"input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_76_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select 
from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_77_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, 
which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_78_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th 
image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_79_15.png" + ], + 
"output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_13.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_80_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_10.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_81_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_7.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_82_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_4.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_83_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_1.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_84_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + 
"input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_85_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select 
from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_86_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, 
which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_87_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th 
image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_88_15.png" + ], + 
"output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_13.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_89_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_10.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_90_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_7.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_91_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_4.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_92_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_1.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_93_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + 
"input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_94_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select 
from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_95_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, 
which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_96_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th 
image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_97_15.png" + ], + 
"output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_13.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_98_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_10.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_99_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_7.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_100_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_4.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_101_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_1.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_102_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + 
"input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_103_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the 
choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_104_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the 
structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_105_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": 
"RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_14.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_106_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_11.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_107_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_8.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_108_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_5.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_109_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_2.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_110_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_111_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the 
following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_112_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical 
relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_113_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 
9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_14.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_114_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_11.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_115_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_8.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_116_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_5.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_117_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_2.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_118_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_119_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the 
following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_120_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical 
relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_121_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 
9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_14.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_122_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_11.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_123_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_8.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_124_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_5.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_125_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_2.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_126_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_127_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the 
following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_128_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical 
relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_129_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 
9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_14.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_130_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_11.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_131_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_8.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_132_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_5.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_133_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_2.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_134_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_135_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the 
following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_136_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical 
relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_137_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 
9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_14.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_138_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_11.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_139_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_8.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_140_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_5.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_141_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_2.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_142_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_143_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the 
following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_144_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical 
relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_145_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 
9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_14.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_146_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_11.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_147_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_8.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_148_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_5.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_149_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_2.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_150_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_151_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the 
following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_152_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical 
relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_153_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 
9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_14.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_154_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_11.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_155_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_8.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_156_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_5.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_157_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_2.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_158_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_159_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the 
following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_160_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical 
relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_161_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 
9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_14.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_162_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_11.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_163_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_8.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_164_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_5.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_165_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_2.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_166_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_167_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the 
following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_168_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical 
relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_169_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 
9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_14.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_170_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_11.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_171_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_8.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_172_15.png" + ], + "output": "A" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_5.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_173_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_2.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_174_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_175_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the 
following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_176_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical 
relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_177_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 
9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_14.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_178_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_11.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_179_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_8.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_180_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_5.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_181_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_2.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_182_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_183_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the 
following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_184_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical 
relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_185_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 
9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_14.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_186_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_11.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_187_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_8.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_188_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_5.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_189_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_2.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_190_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_191_15.png" + ], + "output": "D" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the 
following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_192_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical 
relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_193_15.png" + ], + "output": "H" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 
9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_14.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_194_15.png" + ], + "output": "G" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_11.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_195_15.png" + ], + "output": "F" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_8.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_196_15.png" + ], + "output": "C" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_5.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_197_15.png" + ], + "output": "B" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_2.png", + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_198_15.png" + ], + "output": "E" + }, + { + "task": "ravens_progressive_matrices", + "visual_input_component": "['synthetic image']", + "source": "RAVEN_10000", + "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "question": "Following the structural and analogical relations, which image best completes the problem matrix?", + "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", + "input_image_path": [ + 
"../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_0.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_1.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_2.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_3.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_4.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_5.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_6.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_7.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_8.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_9.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_10.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_11.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_12.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_13.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_14.png", + "../MMIU-Benchmark/ravens_progressive_matrices/ravens_progressive_matrices_199_15.png" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 1, 4, 2]\nB: [2, 3, 4, 1]\nC: [2, 1, 3, 4]\nD: [4, 2, 1, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [2, 3, 4, 1]\nC: [2, 1, 3, 4]\nD: [4, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_0_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_0_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_0_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_0_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_0_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 3, 4, 1]\nB: [3, 1, 2, 4]\nC: [4, 3, 1, 2]\nD: [2, 4, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [3, 1, 2, 4]\nC: [4, 3, 1, 2]\nD: [2, 4, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_1_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_1_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_1_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_1_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_1_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 4, 1, 2]\nB: [3, 4, 2, 1]\nC: [4, 3, 2, 1]\nD: [1, 2, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [3, 4, 2, 1]\nC: [4, 3, 2, 1]\nD: [1, 2, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_2_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_2_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_2_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_2_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_2_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 4, 1, 2]\nB: [4, 1, 3, 2]\nC: [1, 3, 4, 2]\nD: [3, 2, 4, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [4, 1, 3, 2]\nC: [1, 3, 4, 2]\nD: [3, 2, 4, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_3_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_3_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_3_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_3_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_3_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 1, 4, 3]\nB: [3, 1, 2, 4]\nC: [4, 3, 1, 2]\nD: [1, 4, 3, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [3, 1, 2, 4]\nC: [4, 3, 1, 2]\nD: [1, 4, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_4_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_4_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_4_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_4_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_4_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 2, 4, 1]\nB: [3, 4, 2, 1]\nC: [1, 3, 4, 2]\nD: [3, 1, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [3, 4, 2, 1]\nC: [1, 3, 4, 2]\nD: [3, 1, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_5_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_5_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_5_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_5_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_5_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 2, 4, 1]\nB: [3, 4, 1, 2]\nC: [2, 1, 3, 4]\nD: [4, 2, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [3, 4, 1, 2]\nC: [2, 1, 3, 4]\nD: [4, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_6_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_6_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_6_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_6_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_6_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 1, 3, 2]\nB: [4, 1, 2, 3]\nC: [2, 3, 4, 1]\nD: [2, 4, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [4, 1, 2, 3]\nC: [2, 3, 4, 1]\nD: [2, 4, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_7_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_7_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_7_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_7_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_7_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 4, 3, 1]\nB: [2, 3, 4, 1]\nC: [1, 3, 4, 2]\nD: [4, 3, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [2, 3, 4, 1]\nC: [1, 3, 4, 2]\nD: [4, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_8_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_8_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_8_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_8_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_8_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 1, 2, 3]\nB: [1, 3, 2, 4]\nC: [3, 1, 2, 4]\nD: [3, 4, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [1, 3, 2, 4]\nC: [3, 1, 2, 4]\nD: [3, 4, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_9_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_9_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_9_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_9_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_9_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 2, 4, 1]\nB: [2, 1, 3, 4]\nC: [1, 3, 2, 4]\nD: [2, 3, 1, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [2, 1, 3, 4]\nC: [1, 3, 2, 4]\nD: [2, 3, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_10_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_10_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_10_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_10_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_10_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 4, 3, 2]\nB: [3, 4, 2, 1]\nC: [4, 1, 3, 2]\nD: [2, 4, 1, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [3, 4, 2, 1]\nC: [4, 1, 3, 2]\nD: [2, 4, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_11_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_11_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_11_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_11_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_11_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 3, 1, 2]\nB: [2, 1, 3, 4]\nC: [1, 2, 4, 3]\nD: [1, 3, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [2, 1, 3, 4]\nC: [1, 2, 4, 3]\nD: [1, 3, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_12_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_12_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_12_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_12_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_12_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 4, 3, 2]\nB: [4, 3, 1, 2]\nC: [3, 2, 1, 4]\nD: [3, 1, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [4, 3, 1, 2]\nC: [3, 2, 1, 4]\nD: [3, 1, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_13_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_13_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_13_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_13_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_13_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 3, 2, 1]\nB: [3, 4, 1, 2]\nC: [1, 4, 2, 3]\nD: [4, 3, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [3, 4, 1, 2]\nC: [1, 4, 2, 3]\nD: [4, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_14_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_14_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_14_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_14_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_14_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 1, 4, 2]\nB: [4, 2, 3, 1]\nC: [1, 4, 2, 3]\nD: [3, 4, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [4, 2, 3, 1]\nC: [1, 4, 2, 3]\nD: [3, 4, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_15_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_15_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_15_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_15_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_15_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 4, 2, 1]\nB: [4, 2, 3, 1]\nC: [1, 3, 2, 4]\nD: [3, 2, 1, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [4, 2, 3, 1]\nC: [1, 3, 2, 4]\nD: [3, 2, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_16_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_16_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_16_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_16_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_16_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 1, 4, 3]\nB: [4, 2, 1, 3]\nC: [4, 2, 3, 1]\nD: [3, 4, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [4, 2, 1, 3]\nC: [4, 2, 3, 1]\nD: [3, 4, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_17_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_17_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_17_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_17_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_17_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 4, 3, 1]\nB: [3, 1, 2, 4]\nC: [1, 3, 2, 4]\nD: [4, 2, 1, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [3, 1, 2, 4]\nC: [1, 3, 2, 4]\nD: [4, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_18_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_18_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_18_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_18_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_18_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 1, 4, 2]\nB: [3, 2, 1, 4]\nC: [2, 4, 3, 1]\nD: [1, 2, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [3, 2, 1, 4]\nC: [2, 4, 3, 1]\nD: [1, 2, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_19_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_19_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_19_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_19_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_19_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 1, 4, 3]\nB: [4, 1, 2, 3]\nC: [4, 3, 1, 2]\nD: [1, 2, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [4, 1, 2, 3]\nC: [4, 3, 1, 2]\nD: [1, 2, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_20_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_20_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_20_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_20_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_20_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 4, 3, 1]\nB: [3, 2, 1, 4]\nC: [3, 2, 4, 1]\nD: [1, 2, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [3, 2, 1, 4]\nC: [3, 2, 4, 1]\nD: [1, 2, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_21_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_21_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_21_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_21_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_21_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 1, 4, 3]\nB: [2, 3, 4, 1]\nC: [1, 4, 2, 3]\nD: [2, 1, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [2, 3, 4, 1]\nC: [1, 4, 2, 3]\nD: [2, 1, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_22_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_22_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_22_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_22_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_22_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 4, 1, 3]\nB: [4, 2, 1, 3]\nC: [2, 1, 4, 3]\nD: [1, 3, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 1, 3]\nB: [4, 2, 1, 3]\nC: [2, 1, 4, 3]\nD: [1, 3, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_23_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_23_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_23_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_23_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_23_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 1, 2, 4]\nB: [1, 3, 2, 4]\nC: [2, 1, 4, 3]\nD: [4, 2, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 2, 4]\nB: [1, 3, 2, 4]\nC: [2, 1, 4, 3]\nD: [4, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_24_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_24_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_24_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_24_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_24_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 2, 3, 1]\nB: [1, 4, 3, 2]\nC: [4, 2, 1, 3]\nD: [2, 3, 1, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [1, 4, 3, 2]\nC: [4, 2, 1, 3]\nD: [2, 3, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_25_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_25_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_25_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_25_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_25_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 4, 1, 2]\nB: [2, 4, 3, 1]\nC: [4, 3, 1, 2]\nD: [3, 1, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [2, 4, 3, 1]\nC: [4, 3, 1, 2]\nD: [3, 1, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_26_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_26_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_26_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_26_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_26_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 3, 2, 4]\nB: [2, 4, 3, 1]\nC: [1, 4, 2, 3]\nD: [1, 2, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 2, 4]\nB: [2, 4, 3, 1]\nC: [1, 4, 2, 3]\nD: [1, 2, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_27_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_27_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_27_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_27_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_27_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 4, 3, 2]\nB: [4, 1, 3, 2]\nC: [2, 1, 3, 4]\nD: [4, 3, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [4, 1, 3, 2]\nC: [2, 1, 3, 4]\nD: [4, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_28_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_28_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_28_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_28_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_28_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 2, 1, 4]\nB: [3, 1, 2, 4]\nC: [4, 1, 2, 3]\nD: [1, 4, 3, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 1, 4]\nB: [3, 1, 2, 4]\nC: [4, 1, 2, 3]\nD: [1, 4, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_29_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_29_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_29_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_29_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_29_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 2, 4, 3]\nB: [3, 2, 1, 4]\nC: [1, 3, 2, 4]\nD: [4, 3, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 4, 3]\nB: [3, 2, 1, 4]\nC: [1, 3, 2, 4]\nD: [4, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_30_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_30_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_30_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_30_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_30_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 1, 4, 3]\nB: [1, 4, 3, 2]\nC: [1, 3, 4, 2]\nD: [1, 2, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [1, 4, 3, 2]\nC: [1, 3, 4, 2]\nD: [1, 2, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_31_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_31_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_31_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_31_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_31_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 3, 1, 4]\nB: [4, 3, 1, 2]\nC: [2, 3, 4, 1]\nD: [2, 1, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 1, 4]\nB: [4, 3, 1, 2]\nC: [2, 3, 4, 1]\nD: [2, 1, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_32_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_32_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_32_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_32_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_32_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 1, 4, 3]\nB: [3, 2, 4, 1]\nC: [3, 4, 1, 2]\nD: [1, 2, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [3, 2, 4, 1]\nC: [3, 4, 1, 2]\nD: [1, 2, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_33_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_33_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_33_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_33_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_33_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 2, 4, 1]\nB: [3, 1, 2, 4]\nC: [2, 3, 4, 1]\nD: [1, 3, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [3, 1, 2, 4]\nC: [2, 3, 4, 1]\nD: [1, 3, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_34_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_34_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_34_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_34_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_34_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 4, 2, 1]\nB: [1, 4, 2, 3]\nC: [2, 1, 3, 4]\nD: [2, 3, 1, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [1, 4, 2, 3]\nC: [2, 1, 3, 4]\nD: [2, 3, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_35_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_35_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_35_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_35_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_35_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 3, 2, 1]\nB: [4, 1, 2, 3]\nC: [3, 2, 1, 4]\nD: [1, 3, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [4, 1, 2, 3]\nC: [3, 2, 1, 4]\nD: [1, 3, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_36_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_36_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_36_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_36_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_36_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 3, 4, 1]\nB: [4, 1, 2, 3]\nC: [3, 4, 2, 1]\nD: [3, 4, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [4, 1, 2, 3]\nC: [3, 4, 2, 1]\nD: [3, 4, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_37_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_37_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_37_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_37_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_37_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 3, 1, 2]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [3, 1, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [3, 1, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_38_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_38_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_38_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_38_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_38_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 1, 3, 2]\nB: [1, 3, 4, 2]\nC: [2, 3, 4, 1]\nD: [3, 1, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [1, 3, 4, 2]\nC: [2, 3, 4, 1]\nD: [3, 1, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_39_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_39_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_39_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_39_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_39_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 1, 4, 3]\nB: [3, 4, 1, 2]\nC: [3, 2, 4, 1]\nD: [1, 2, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [3, 4, 1, 2]\nC: [3, 2, 4, 1]\nD: [1, 2, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_40_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_40_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_40_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_40_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_40_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 4, 1, 2]\nB: [1, 2, 3, 4]\nC: [2, 4, 3, 1]\nD: [2, 3, 1, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [1, 2, 3, 4]\nC: [2, 4, 3, 1]\nD: [2, 3, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_41_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_41_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_41_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_41_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_41_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 3, 2, 1]\nB: [1, 2, 4, 3]\nC: [2, 1, 3, 4]\nD: [4, 2, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [1, 2, 4, 3]\nC: [2, 1, 3, 4]\nD: [4, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_42_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_42_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_42_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_42_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_42_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 2, 1, 3]\nB: [3, 1, 4, 2]\nC: [1, 2, 3, 4]\nD: [3, 4, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 1, 3]\nB: [3, 1, 4, 2]\nC: [1, 2, 3, 4]\nD: [3, 4, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_43_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_43_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_43_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_43_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_43_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 3, 4, 2]\nB: [1, 4, 3, 2]\nC: [2, 3, 4, 1]\nD: [2, 1, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 4, 2]\nB: [1, 4, 3, 2]\nC: [2, 3, 4, 1]\nD: [2, 1, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_44_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_44_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_44_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_44_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_44_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 2, 1, 3]\nB: [2, 3, 4, 1]\nC: [4, 3, 2, 1]\nD: [2, 1, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 1, 3]\nB: [2, 3, 4, 1]\nC: [4, 3, 2, 1]\nD: [2, 1, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_45_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_45_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_45_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_45_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_45_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 1, 2, 3]\nB: [1, 3, 2, 4]\nC: [2, 4, 1, 3]\nD: [2, 1, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [1, 3, 2, 4]\nC: [2, 4, 1, 3]\nD: [2, 1, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_46_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_46_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_46_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_46_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_46_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 1, 4, 3]\nB: [4, 2, 3, 1]\nC: [1, 3, 2, 4]\nD: [3, 1, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [4, 2, 3, 1]\nC: [1, 3, 2, 4]\nD: [3, 1, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_47_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_47_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_47_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_47_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_47_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 1, 3, 4]\nB: [1, 2, 4, 3]\nC: [3, 1, 2, 4]\nD: [3, 1, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 3, 4]\nB: [1, 2, 4, 3]\nC: [3, 1, 2, 4]\nD: [3, 1, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_48_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_48_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_48_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_48_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_48_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 1, 4, 3]\nB: [1, 4, 2, 3]\nC: [1, 3, 2, 4]\nD: [1, 2, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [1, 4, 2, 3]\nC: [1, 3, 2, 4]\nD: [1, 2, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_49_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_49_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_49_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_49_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_49_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 2, 4, 3]\nB: [3, 1, 2, 4]\nC: [2, 4, 1, 3]\nD: [3, 4, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 4, 3]\nB: [3, 1, 2, 4]\nC: [2, 4, 1, 3]\nD: [3, 4, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_50_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_50_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_50_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_50_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_50_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 4, 2, 1]\nB: [1, 4, 2, 3]\nC: [4, 2, 3, 1]\nD: [1, 3, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [1, 4, 2, 3]\nC: [4, 2, 3, 1]\nD: [1, 3, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_51_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_51_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_51_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_51_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_51_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 4, 1, 2]\nB: [3, 2, 4, 1]\nC: [1, 3, 4, 2]\nD: [4, 1, 2, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [3, 2, 4, 1]\nC: [1, 3, 4, 2]\nD: [4, 1, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_52_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_52_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_52_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_52_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_52_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 2, 3, 1]\nB: [1, 3, 4, 2]\nC: [4, 3, 2, 1]\nD: [3, 4, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [1, 3, 4, 2]\nC: [4, 3, 2, 1]\nD: [3, 4, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_53_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_53_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_53_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_53_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_53_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 2, 3, 4]\nB: [1, 3, 4, 2]\nC: [4, 3, 1, 2]\nD: [4, 3, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [1, 3, 4, 2]\nC: [4, 3, 1, 2]\nD: [4, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_54_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_54_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_54_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_54_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_54_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 2, 4, 3]\nB: [2, 1, 4, 3]\nC: [3, 1, 4, 2]\nD: [4, 2, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 4, 3]\nB: [2, 1, 4, 3]\nC: [3, 1, 4, 2]\nD: [4, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_55_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_55_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_55_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_55_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_55_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 4, 3, 2]\nB: [1, 4, 2, 3]\nC: [4, 3, 2, 1]\nD: [1, 3, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [1, 4, 2, 3]\nC: [4, 3, 2, 1]\nD: [1, 3, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_56_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_56_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_56_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_56_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_56_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 4, 2, 3]\nB: [3, 2, 1, 4]\nC: [4, 2, 1, 3]\nD: [4, 3, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [3, 2, 1, 4]\nC: [4, 2, 1, 3]\nD: [4, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_57_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_57_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_57_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_57_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_57_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 4, 2, 1]\nB: [2, 1, 3, 4]\nC: [4, 1, 2, 3]\nD: [2, 1, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [2, 1, 3, 4]\nC: [4, 1, 2, 3]\nD: [2, 1, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_58_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_58_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_58_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_58_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_58_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 1, 4, 2]\nB: [2, 3, 4, 1]\nC: [1, 4, 3, 2]\nD: [1, 4, 2, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [2, 3, 4, 1]\nC: [1, 4, 3, 2]\nD: [1, 4, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_59_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_59_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_59_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_59_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_59_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 1, 4, 3]\nB: [3, 1, 4, 2]\nC: [2, 3, 1, 4]\nD: [3, 1, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [3, 1, 4, 2]\nC: [2, 3, 1, 4]\nD: [3, 1, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_60_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_60_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_60_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_60_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_60_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 4, 1, 2]\nB: [4, 3, 1, 2]\nC: [2, 4, 1, 3]\nD: [1, 2, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [4, 3, 1, 2]\nC: [2, 4, 1, 3]\nD: [1, 2, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_61_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_61_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_61_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_61_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_61_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 4, 2, 3]\nB: [4, 3, 1, 2]\nC: [3, 1, 4, 2]\nD: [2, 3, 4, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [4, 3, 1, 2]\nC: [3, 1, 4, 2]\nD: [2, 3, 4, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_62_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_62_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_62_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_62_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_62_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 2, 3, 4]\nB: [3, 1, 4, 2]\nC: [4, 3, 2, 1]\nD: [4, 2, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [3, 1, 4, 2]\nC: [4, 3, 2, 1]\nD: [4, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_63_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_63_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_63_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_63_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_63_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 3, 4, 2]\nB: [4, 1, 2, 3]\nC: [2, 3, 1, 4]\nD: [3, 4, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 4, 2]\nB: [4, 1, 2, 3]\nC: [2, 3, 1, 4]\nD: [3, 4, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_64_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_64_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_64_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_64_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_64_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 3, 2, 1]\nB: [4, 2, 3, 1]\nC: [2, 1, 4, 3]\nD: [3, 2, 4, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [4, 2, 3, 1]\nC: [2, 1, 4, 3]\nD: [3, 2, 4, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_65_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_65_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_65_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_65_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_65_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 4, 1, 2]\nB: [4, 3, 1, 2]\nC: [1, 3, 2, 4]\nD: [1, 3, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [4, 3, 1, 2]\nC: [1, 3, 2, 4]\nD: [1, 3, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_66_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_66_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_66_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_66_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_66_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 4, 1, 3]\nB: [4, 3, 1, 2]\nC: [4, 2, 1, 3]\nD: [3, 4, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 1, 3]\nB: [4, 3, 1, 2]\nC: [4, 2, 1, 3]\nD: [3, 4, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_67_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_67_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_67_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_67_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_67_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 4, 2, 1]\nB: [1, 3, 2, 4]\nC: [4, 2, 1, 3]\nD: [3, 2, 4, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [1, 3, 2, 4]\nC: [4, 2, 1, 3]\nD: [3, 2, 4, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_68_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_68_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_68_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_68_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_68_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 1, 4, 2]\nB: [4, 2, 1, 3]\nC: [3, 2, 4, 1]\nD: [2, 1, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [4, 2, 1, 3]\nC: [3, 2, 4, 1]\nD: [2, 1, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_69_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_69_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_69_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_69_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_69_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 3, 1, 2]\nB: [1, 4, 2, 3]\nC: [1, 3, 4, 2]\nD: [4, 1, 2, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [1, 4, 2, 3]\nC: [1, 3, 4, 2]\nD: [4, 1, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_70_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_70_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_70_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_70_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_70_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 2, 3, 4]\nB: [2, 4, 1, 3]\nC: [3, 4, 2, 1]\nD: [3, 2, 4, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [2, 4, 1, 3]\nC: [3, 4, 2, 1]\nD: [3, 2, 4, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_71_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_71_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_71_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_71_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_71_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 4, 3, 1]\nB: [4, 2, 3, 1]\nC: [2, 1, 3, 4]\nD: [3, 4, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [4, 2, 3, 1]\nC: [2, 1, 3, 4]\nD: [3, 4, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_72_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_72_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_72_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_72_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_72_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 1, 4, 3]\nB: [1, 4, 2, 3]\nC: [3, 1, 4, 2]\nD: [1, 3, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [1, 4, 2, 3]\nC: [3, 1, 4, 2]\nD: [1, 3, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_73_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_73_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_73_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_73_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_73_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 2, 3, 4]\nB: [4, 3, 1, 2]\nC: [1, 4, 2, 3]\nD: [4, 3, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [4, 3, 1, 2]\nC: [1, 4, 2, 3]\nD: [4, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_74_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_74_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_74_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_74_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_74_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 1, 3, 2]\nB: [2, 3, 4, 1]\nC: [4, 3, 1, 2]\nD: [2, 1, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [2, 3, 4, 1]\nC: [4, 3, 1, 2]\nD: [2, 1, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_75_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_75_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_75_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_75_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_75_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 4, 2, 1]\nB: [4, 2, 1, 3]\nC: [3, 2, 1, 4]\nD: [1, 3, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [4, 2, 1, 3]\nC: [3, 2, 1, 4]\nD: [1, 3, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_76_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_76_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_76_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_76_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_76_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 4, 1, 2]\nB: [3, 1, 2, 4]\nC: [4, 1, 2, 3]\nD: [4, 3, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [3, 1, 2, 4]\nC: [4, 1, 2, 3]\nD: [4, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_77_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_77_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_77_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_77_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_77_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 4, 2, 3]\nB: [4, 3, 1, 2]\nC: [4, 1, 3, 2]\nD: [2, 3, 4, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [4, 3, 1, 2]\nC: [4, 1, 3, 2]\nD: [2, 3, 4, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_78_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_78_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_78_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_78_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_78_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 1, 2, 3]\nB: [1, 3, 4, 2]\nC: [2, 1, 3, 4]\nD: [1, 4, 2, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [1, 3, 4, 2]\nC: [2, 1, 3, 4]\nD: [1, 4, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_79_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_79_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_79_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_79_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_79_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 1, 2, 3]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [2, 3, 1, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [2, 3, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_80_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_80_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_80_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_80_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_80_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 3, 4, 1]\nB: [2, 3, 1, 4]\nC: [4, 2, 1, 3]\nD: [4, 1, 2, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [2, 3, 1, 4]\nC: [4, 2, 1, 3]\nD: [4, 1, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_81_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_81_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_81_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_81_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_81_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 4, 3, 2]\nB: [2, 1, 4, 3]\nC: [2, 3, 1, 4]\nD: [3, 1, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [2, 1, 4, 3]\nC: [2, 3, 1, 4]\nD: [3, 1, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_82_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_82_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_82_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_82_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_82_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 4, 3, 1]\nB: [1, 3, 2, 4]\nC: [4, 2, 1, 3]\nD: [4, 3, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [1, 3, 2, 4]\nC: [4, 2, 1, 3]\nD: [4, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_83_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_83_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_83_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_83_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_83_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 1, 3, 4]\nB: [3, 2, 4, 1]\nC: [1, 3, 4, 2]\nD: [4, 3, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 3, 4]\nB: [3, 2, 4, 1]\nC: [1, 3, 4, 2]\nD: [4, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_84_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_84_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_84_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_84_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_84_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 4, 3, 1]\nB: [1, 3, 2, 4]\nC: [3, 4, 1, 2]\nD: [2, 1, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [1, 3, 2, 4]\nC: [3, 4, 1, 2]\nD: [2, 1, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_85_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_85_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_85_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_85_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_85_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 4, 1, 3]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [3, 2, 1, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 1, 3]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [3, 2, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_86_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_86_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_86_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_86_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_86_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 4, 3, 1]\nB: [1, 3, 2, 4]\nC: [1, 4, 3, 2]\nD: [1, 2, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [1, 3, 2, 4]\nC: [1, 4, 3, 2]\nD: [1, 2, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_87_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_87_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_87_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_87_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_87_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 2, 3, 1]\nB: [3, 2, 1, 4]\nC: [4, 3, 2, 1]\nD: [1, 4, 3, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [3, 2, 1, 4]\nC: [4, 3, 2, 1]\nD: [1, 4, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_88_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_88_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_88_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_88_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_88_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 4, 2, 1]\nB: [3, 2, 4, 1]\nC: [4, 2, 1, 3]\nD: [4, 1, 3, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [3, 2, 4, 1]\nC: [4, 2, 1, 3]\nD: [4, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_89_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_89_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_89_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_89_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_89_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 3, 1, 4]\nB: [2, 1, 3, 4]\nC: [3, 4, 2, 1]\nD: [3, 4, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 1, 4]\nB: [2, 1, 3, 4]\nC: [3, 4, 2, 1]\nD: [3, 4, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_90_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_90_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_90_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_90_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_90_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 2, 1, 3]\nB: [4, 1, 3, 2]\nC: [4, 3, 2, 1]\nD: [2, 1, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 1, 3]\nB: [4, 1, 3, 2]\nC: [4, 3, 2, 1]\nD: [2, 1, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_91_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_91_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_91_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_91_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_91_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 3, 1, 2]\nB: [2, 1, 3, 4]\nC: [4, 1, 2, 3]\nD: [1, 3, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [2, 1, 3, 4]\nC: [4, 1, 2, 3]\nD: [1, 3, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_92_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_92_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_92_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_92_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_92_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [1, 4, 3, 2]\nB: [1, 3, 4, 2]\nC: [2, 4, 1, 3]\nD: [4, 2, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [1, 3, 4, 2]\nC: [2, 4, 1, 3]\nD: [4, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_93_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_93_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_93_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_93_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_93_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 3, 1, 2]\nB: [1, 4, 3, 2]\nC: [1, 3, 4, 2]\nD: [1, 3, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [1, 4, 3, 2]\nC: [1, 3, 4, 2]\nD: [1, 3, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_94_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_94_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_94_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_94_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_94_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 1, 2, 4]\nB: [1, 3, 4, 2]\nC: [2, 4, 3, 1]\nD: [3, 2, 4, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 2, 4]\nB: [1, 3, 4, 2]\nC: [2, 4, 3, 1]\nD: [3, 2, 4, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_95_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_95_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_95_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_95_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_95_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 1, 3, 2]\nB: [1, 2, 3, 4]\nC: [2, 4, 3, 1]\nD: [3, 4, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [1, 2, 3, 4]\nC: [2, 4, 3, 1]\nD: [3, 4, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_96_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_96_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_96_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_96_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_96_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [4, 1, 3, 2]\nB: [2, 4, 1, 3]\nC: [3, 2, 4, 1]\nD: [4, 3, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [2, 4, 1, 3]\nC: [3, 2, 4, 1]\nD: [4, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_97_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_97_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_97_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_97_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_97_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [3, 2, 4, 1]\nB: [1, 4, 2, 3]\nC: [3, 1, 2, 4]\nD: [4, 3, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [1, 4, 2, 3]\nC: [3, 1, 2, 4]\nD: [4, 3, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_98_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_98_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_98_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_98_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_98_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['natural_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_natural", + "options": "A: [2, 4, 3, 1]\nB: [3, 4, 2, 1]\nC: [4, 1, 2, 3]\nD: [1, 3, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [3, 4, 2, 1]\nC: [4, 1, 2, 3]\nD: [1, 3, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_99_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_99_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_99_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_99_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_99_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 4, 3, 1]\nB: [2, 1, 4, 3]\nC: [3, 2, 1, 4]\nD: [1, 2, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [2, 1, 4, 3]\nC: [3, 2, 1, 4]\nD: [1, 2, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_100_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_100_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_100_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_100_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_100_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 4, 3, 2]\nB: [1, 3, 4, 2]\nC: [3, 1, 2, 4]\nD: [2, 4, 1, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [1, 3, 4, 2]\nC: [3, 1, 2, 4]\nD: [2, 4, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_101_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_101_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_101_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_101_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_101_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 4, 2, 3]\nB: [2, 4, 1, 3]\nC: [2, 1, 4, 3]\nD: [3, 4, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [2, 4, 1, 3]\nC: [2, 1, 4, 3]\nD: [3, 4, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_102_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_102_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_102_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_102_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_102_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 3, 2, 4]\nB: [1, 3, 4, 2]\nC: [2, 4, 3, 1]\nD: [4, 3, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 2, 4]\nB: [1, 3, 4, 2]\nC: [2, 4, 3, 1]\nD: [4, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_103_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_103_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_103_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_103_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_103_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 1, 2, 3]\nB: [3, 4, 1, 2]\nC: [4, 1, 3, 2]\nD: [4, 3, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [3, 4, 1, 2]\nC: [4, 1, 3, 2]\nD: [4, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_104_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_104_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_104_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_104_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_104_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 3, 4, 1]\nB: [1, 4, 2, 3]\nC: [2, 3, 1, 4]\nD: [1, 3, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [1, 4, 2, 3]\nC: [2, 3, 1, 4]\nD: [1, 3, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_105_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_105_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_105_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_105_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_105_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 3, 4, 1]\nB: [3, 4, 1, 2]\nC: [2, 1, 4, 3]\nD: [2, 3, 1, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [3, 4, 1, 2]\nC: [2, 1, 4, 3]\nD: [2, 3, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_106_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_106_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_106_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_106_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_106_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 4, 1, 2]\nB: [1, 4, 2, 3]\nC: [3, 2, 4, 1]\nD: [4, 2, 1, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [1, 4, 2, 3]\nC: [3, 2, 4, 1]\nD: [4, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_107_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_107_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_107_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_107_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_107_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 3, 1, 2]\nB: [3, 4, 2, 1]\nC: [3, 2, 4, 1]\nD: [4, 2, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [3, 4, 2, 1]\nC: [3, 2, 4, 1]\nD: [4, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_108_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_108_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_108_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_108_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_108_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 3, 4, 2]\nB: [3, 1, 2, 4]\nC: [3, 1, 4, 2]\nD: [2, 3, 1, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 4, 2]\nB: [3, 1, 2, 4]\nC: [3, 1, 4, 2]\nD: [2, 3, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_109_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_109_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_109_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_109_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_109_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 3, 2, 1]\nB: [3, 1, 2, 4]\nC: [2, 4, 3, 1]\nD: [1, 4, 3, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [3, 1, 2, 4]\nC: [2, 4, 3, 1]\nD: [1, 4, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_110_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_110_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_110_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_110_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_110_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 2, 3, 4]\nB: [1, 4, 3, 2]\nC: [2, 1, 3, 4]\nD: [3, 1, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [1, 4, 3, 2]\nC: [2, 1, 3, 4]\nD: [3, 1, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_111_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_111_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_111_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_111_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_111_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 3, 2, 4]\nB: [4, 2, 1, 3]\nC: [1, 4, 3, 2]\nD: [3, 4, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 2, 4]\nB: [4, 2, 1, 3]\nC: [1, 4, 3, 2]\nD: [3, 4, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_112_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_112_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_112_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_112_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_112_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 3, 2, 4]\nB: [1, 4, 3, 2]\nC: [2, 4, 3, 1]\nD: [1, 4, 2, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 2, 4]\nB: [1, 4, 3, 2]\nC: [2, 4, 3, 1]\nD: [1, 4, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_113_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_113_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_113_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_113_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_113_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 3, 2, 4]\nB: [2, 3, 1, 4]\nC: [3, 4, 2, 1]\nD: [3, 1, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 2, 4]\nB: [2, 3, 1, 4]\nC: [3, 4, 2, 1]\nD: [3, 1, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_114_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_114_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_114_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_114_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_114_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 3, 2, 1]\nB: [1, 3, 4, 2]\nC: [3, 1, 2, 4]\nD: [4, 3, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [1, 3, 4, 2]\nC: [3, 1, 2, 4]\nD: [4, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_115_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_115_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_115_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_115_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_115_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 2, 1, 4]\nB: [3, 1, 4, 2]\nC: [4, 3, 2, 1]\nD: [1, 3, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 1, 4]\nB: [3, 1, 4, 2]\nC: [4, 3, 2, 1]\nD: [1, 3, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_116_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_116_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_116_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_116_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_116_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 1, 2, 3]\nB: [1, 2, 4, 3]\nC: [3, 2, 1, 4]\nD: [1, 3, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [1, 2, 4, 3]\nC: [3, 2, 1, 4]\nD: [1, 3, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_117_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_117_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_117_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_117_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_117_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 3, 4, 2]\nB: [4, 3, 2, 1]\nC: [3, 4, 2, 1]\nD: [1, 4, 3, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 4, 2]\nB: [4, 3, 2, 1]\nC: [3, 4, 2, 1]\nD: [1, 4, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_118_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_118_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_118_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_118_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_118_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 3, 1, 2]\nB: [2, 3, 4, 1]\nC: [1, 3, 2, 4]\nD: [1, 3, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [2, 3, 4, 1]\nC: [1, 3, 2, 4]\nD: [1, 3, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_119_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_119_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_119_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_119_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_119_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 1, 2, 3]\nB: [1, 4, 2, 3]\nC: [3, 1, 2, 4]\nD: [3, 1, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [1, 4, 2, 3]\nC: [3, 1, 2, 4]\nD: [3, 1, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_120_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_120_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_120_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_120_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_120_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 2, 3, 4]\nB: [4, 3, 2, 1]\nC: [1, 4, 3, 2]\nD: [2, 3, 1, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [4, 3, 2, 1]\nC: [1, 4, 3, 2]\nD: [2, 3, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_121_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_121_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_121_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_121_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_121_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 4, 3, 2]\nB: [1, 2, 3, 4]\nC: [4, 1, 3, 2]\nD: [1, 4, 2, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [1, 2, 3, 4]\nC: [4, 1, 3, 2]\nD: [1, 4, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_122_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_122_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_122_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_122_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_122_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 4, 2, 3]\nB: [1, 2, 3, 4]\nC: [1, 3, 4, 2]\nD: [2, 4, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [1, 2, 3, 4]\nC: [1, 3, 4, 2]\nD: [2, 4, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_123_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_123_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_123_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_123_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_123_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 1, 4, 3]\nB: [2, 4, 3, 1]\nC: [1, 4, 3, 2]\nD: [1, 2, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [2, 4, 3, 1]\nC: [1, 4, 3, 2]\nD: [1, 2, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_124_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_124_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_124_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_124_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_124_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 4, 2, 3]\nB: [2, 4, 1, 3]\nC: [1, 2, 4, 3]\nD: [4, 1, 3, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [2, 4, 1, 3]\nC: [1, 2, 4, 3]\nD: [4, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_125_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_125_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_125_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_125_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_125_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 1, 2, 4]\nB: [4, 2, 1, 3]\nC: [3, 4, 1, 2]\nD: [2, 1, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 2, 4]\nB: [4, 2, 1, 3]\nC: [3, 4, 1, 2]\nD: [2, 1, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_126_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_126_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_126_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_126_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_126_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 4, 2, 3]\nB: [3, 2, 4, 1]\nC: [2, 4, 1, 3]\nD: [1, 4, 3, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [3, 2, 4, 1]\nC: [2, 4, 1, 3]\nD: [1, 4, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_127_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_127_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_127_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_127_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_127_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 4, 3, 1]\nB: [1, 3, 4, 2]\nC: [3, 1, 2, 4]\nD: [1, 2, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [1, 3, 4, 2]\nC: [3, 1, 2, 4]\nD: [1, 2, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_128_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_128_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_128_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_128_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_128_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 1, 3, 2]\nB: [2, 4, 1, 3]\nC: [2, 4, 3, 1]\nD: [4, 3, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [2, 4, 1, 3]\nC: [2, 4, 3, 1]\nD: [4, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_129_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_129_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_129_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_129_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_129_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 2, 4, 1]\nB: [2, 4, 3, 1]\nC: [2, 1, 3, 4]\nD: [1, 2, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [2, 4, 3, 1]\nC: [2, 1, 3, 4]\nD: [1, 2, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_130_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_130_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_130_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_130_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_130_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 1, 2, 4]\nB: [4, 1, 3, 2]\nC: [3, 4, 1, 2]\nD: [4, 3, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 2, 4]\nB: [4, 1, 3, 2]\nC: [3, 4, 1, 2]\nD: [4, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_131_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_131_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_131_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_131_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_131_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 4, 1, 3]\nB: [4, 3, 2, 1]\nC: [4, 1, 3, 2]\nD: [2, 3, 4, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 1, 3]\nB: [4, 3, 2, 1]\nC: [4, 1, 3, 2]\nD: [2, 3, 4, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_132_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_132_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_132_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_132_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_132_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 1, 2, 3]\nB: [2, 3, 1, 4]\nC: [4, 1, 3, 2]\nD: [1, 3, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [2, 3, 1, 4]\nC: [4, 1, 3, 2]\nD: [1, 3, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_133_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_133_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_133_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_133_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_133_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 1, 4, 3]\nB: [2, 4, 3, 1]\nC: [4, 3, 1, 2]\nD: [1, 2, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [2, 4, 3, 1]\nC: [4, 3, 1, 2]\nD: [1, 2, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_134_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_134_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_134_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_134_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_134_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 4, 3, 2]\nB: [4, 3, 2, 1]\nC: [4, 1, 2, 3]\nD: [3, 1, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [4, 3, 2, 1]\nC: [4, 1, 2, 3]\nD: [3, 1, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_135_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_135_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_135_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_135_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_135_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 1, 2, 4]\nB: [4, 2, 3, 1]\nC: [1, 2, 4, 3]\nD: [3, 1, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 2, 4]\nB: [4, 2, 3, 1]\nC: [1, 2, 4, 3]\nD: [3, 1, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_136_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_136_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_136_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_136_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_136_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 4, 2, 3]\nB: [1, 3, 2, 4]\nC: [2, 4, 1, 3]\nD: [1, 4, 3, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [1, 3, 2, 4]\nC: [2, 4, 1, 3]\nD: [1, 4, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_137_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_137_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_137_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_137_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_137_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 3, 1, 2]\nB: [4, 3, 2, 1]\nC: [4, 2, 3, 1]\nD: [4, 1, 2, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [4, 3, 2, 1]\nC: [4, 2, 3, 1]\nD: [4, 1, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_138_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_138_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_138_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_138_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_138_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 1, 4, 3]\nB: [3, 1, 2, 4]\nC: [4, 2, 3, 1]\nD: [4, 3, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [3, 1, 2, 4]\nC: [4, 2, 3, 1]\nD: [4, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_139_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_139_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_139_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_139_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_139_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 1, 3, 2]\nB: [2, 1, 4, 3]\nC: [1, 4, 2, 3]\nD: [2, 3, 4, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [2, 1, 4, 3]\nC: [1, 4, 2, 3]\nD: [2, 3, 4, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_140_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_140_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_140_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_140_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_140_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 3, 1, 2]\nB: [1, 4, 3, 2]\nC: [2, 4, 3, 1]\nD: [3, 2, 1, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [1, 4, 3, 2]\nC: [2, 4, 3, 1]\nD: [3, 2, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_141_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_141_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_141_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_141_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_141_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 4, 1, 3]\nB: [1, 4, 2, 3]\nC: [2, 1, 4, 3]\nD: [4, 1, 3, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 1, 3]\nB: [1, 4, 2, 3]\nC: [2, 1, 4, 3]\nD: [4, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_142_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_142_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_142_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_142_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_142_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 2, 3, 1]\nB: [4, 3, 2, 1]\nC: [3, 1, 2, 4]\nD: [4, 2, 1, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [4, 3, 2, 1]\nC: [3, 1, 2, 4]\nD: [4, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_143_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_143_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_143_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_143_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_143_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 1, 2, 4]\nB: [3, 1, 4, 2]\nC: [3, 2, 1, 4]\nD: [2, 4, 1, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 2, 4]\nB: [3, 1, 4, 2]\nC: [3, 2, 1, 4]\nD: [2, 4, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_144_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_144_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_144_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_144_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_144_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 1, 4, 2]\nB: [2, 4, 1, 3]\nC: [4, 1, 3, 2]\nD: [1, 4, 2, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [2, 4, 1, 3]\nC: [4, 1, 3, 2]\nD: [1, 4, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_145_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_145_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_145_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_145_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_145_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 3, 1, 2]\nB: [4, 2, 3, 1]\nC: [1, 4, 2, 3]\nD: [2, 4, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [4, 2, 3, 1]\nC: [1, 4, 2, 3]\nD: [2, 4, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_146_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_146_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_146_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_146_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_146_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 3, 1, 4]\nB: [2, 4, 1, 3]\nC: [4, 1, 3, 2]\nD: [1, 2, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 1, 4]\nB: [2, 4, 1, 3]\nC: [4, 1, 3, 2]\nD: [1, 2, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_147_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_147_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_147_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_147_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_147_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 1, 4, 3]\nB: [2, 3, 1, 4]\nC: [3, 1, 4, 2]\nD: [2, 4, 1, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [2, 3, 1, 4]\nC: [3, 1, 4, 2]\nD: [2, 4, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_148_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_148_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_148_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_148_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_148_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 2, 4, 3]\nB: [4, 1, 3, 2]\nC: [2, 3, 1, 4]\nD: [4, 2, 1, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 4, 3]\nB: [4, 1, 3, 2]\nC: [2, 3, 1, 4]\nD: [4, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_149_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_149_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_149_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_149_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_149_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 2, 3, 4]\nB: [2, 3, 4, 1]\nC: [4, 1, 3, 2]\nD: [3, 4, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [2, 3, 4, 1]\nC: [4, 1, 3, 2]\nD: [3, 4, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_150_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_150_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_150_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_150_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_150_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 2, 4, 1]\nB: [3, 2, 1, 4]\nC: [3, 1, 2, 4]\nD: [4, 2, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [3, 2, 1, 4]\nC: [3, 1, 2, 4]\nD: [4, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_151_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_151_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_151_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_151_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_151_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 3, 1, 2]\nB: [3, 1, 4, 2]\nC: [3, 2, 1, 4]\nD: [4, 1, 3, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [3, 1, 4, 2]\nC: [3, 2, 1, 4]\nD: [4, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_152_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_152_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_152_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_152_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_152_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 2, 1, 4]\nB: [1, 3, 4, 2]\nC: [2, 3, 1, 4]\nD: [4, 2, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 1, 4]\nB: [1, 3, 4, 2]\nC: [2, 3, 1, 4]\nD: [4, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_153_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_153_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_153_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_153_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_153_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 3, 2, 1]\nB: [4, 3, 1, 2]\nC: [4, 1, 2, 3]\nD: [3, 4, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [4, 3, 1, 2]\nC: [4, 1, 2, 3]\nD: [3, 4, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_154_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_154_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_154_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_154_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_154_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 3, 2, 1]\nB: [1, 4, 3, 2]\nC: [2, 1, 3, 4]\nD: [1, 3, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [1, 4, 3, 2]\nC: [2, 1, 3, 4]\nD: [1, 3, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_155_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_155_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_155_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_155_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_155_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 1, 4, 2]\nB: [3, 2, 4, 1]\nC: [4, 2, 3, 1]\nD: [2, 4, 1, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [3, 2, 4, 1]\nC: [4, 2, 3, 1]\nD: [2, 4, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_156_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_156_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_156_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_156_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_156_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 2, 3, 1]\nB: [1, 3, 2, 4]\nC: [3, 4, 1, 2]\nD: [2, 3, 4, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [1, 3, 2, 4]\nC: [3, 4, 1, 2]\nD: [2, 3, 4, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_157_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_157_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_157_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_157_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_157_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 3, 4, 1]\nB: [1, 3, 2, 4]\nC: [2, 3, 1, 4]\nD: [3, 4, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [1, 3, 2, 4]\nC: [2, 3, 1, 4]\nD: [3, 4, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_158_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_158_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_158_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_158_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_158_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 3, 2, 4]\nB: [2, 1, 3, 4]\nC: [3, 4, 2, 1]\nD: [2, 1, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 2, 4]\nB: [2, 1, 3, 4]\nC: [3, 4, 2, 1]\nD: [2, 1, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_159_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_159_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_159_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_159_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_159_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 4, 2, 1]\nB: [4, 3, 1, 2]\nC: [4, 3, 2, 1]\nD: [2, 1, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [4, 3, 1, 2]\nC: [4, 3, 2, 1]\nD: [2, 1, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_160_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_160_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_160_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_160_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_160_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 1, 3, 4]\nB: [4, 2, 1, 3]\nC: [1, 4, 2, 3]\nD: [1, 2, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 3, 4]\nB: [4, 2, 1, 3]\nC: [1, 4, 2, 3]\nD: [1, 2, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_161_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_161_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_161_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_161_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_161_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 1, 2, 3]\nB: [4, 2, 1, 3]\nC: [1, 4, 2, 3]\nD: [3, 4, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [4, 2, 1, 3]\nC: [1, 4, 2, 3]\nD: [3, 4, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_162_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_162_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_162_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_162_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_162_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 1, 4, 3]\nB: [4, 2, 1, 3]\nC: [1, 2, 3, 4]\nD: [2, 3, 1, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [4, 2, 1, 3]\nC: [1, 2, 3, 4]\nD: [2, 3, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_163_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_163_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_163_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_163_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_163_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 4, 3, 1]\nB: [3, 1, 4, 2]\nC: [3, 2, 1, 4]\nD: [2, 1, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [3, 1, 4, 2]\nC: [3, 2, 1, 4]\nD: [2, 1, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_164_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_164_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_164_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_164_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_164_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 2, 4, 1]\nB: [2, 4, 3, 1]\nC: [1, 2, 4, 3]\nD: [2, 4, 1, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [2, 4, 3, 1]\nC: [1, 2, 4, 3]\nD: [2, 4, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_165_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_165_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_165_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_165_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_165_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 2, 4, 3]\nB: [1, 4, 2, 3]\nC: [4, 3, 1, 2]\nD: [3, 1, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 4, 3]\nB: [1, 4, 2, 3]\nC: [4, 3, 1, 2]\nD: [3, 1, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_166_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_166_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_166_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_166_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_166_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 4, 1, 3]\nB: [2, 1, 4, 3]\nC: [1, 3, 2, 4]\nD: [3, 1, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 1, 3]\nB: [2, 1, 4, 3]\nC: [1, 3, 2, 4]\nD: [3, 1, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_167_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_167_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_167_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_167_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_167_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 4, 3, 1]\nB: [4, 3, 1, 2]\nC: [4, 1, 3, 2]\nD: [3, 1, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [4, 3, 1, 2]\nC: [4, 1, 3, 2]\nD: [3, 1, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_168_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_168_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_168_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_168_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_168_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 2, 4, 3]\nB: [3, 2, 4, 1]\nC: [4, 1, 2, 3]\nD: [4, 2, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 4, 3]\nB: [3, 2, 4, 1]\nC: [4, 1, 2, 3]\nD: [4, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_169_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_169_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_169_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_169_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_169_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 2, 1, 4]\nB: [1, 4, 2, 3]\nC: [1, 4, 3, 2]\nD: [1, 2, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 1, 4]\nB: [1, 4, 2, 3]\nC: [1, 4, 3, 2]\nD: [1, 2, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_170_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_170_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_170_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_170_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_170_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 4, 3, 2]\nB: [3, 1, 2, 4]\nC: [2, 4, 3, 1]\nD: [4, 2, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [3, 1, 2, 4]\nC: [2, 4, 3, 1]\nD: [4, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_171_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_171_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_171_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_171_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_171_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 1, 2, 3]\nB: [4, 2, 1, 3]\nC: [3, 1, 4, 2]\nD: [3, 2, 1, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [4, 2, 1, 3]\nC: [3, 1, 4, 2]\nD: [3, 2, 1, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_172_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_172_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_172_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_172_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_172_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 3, 4, 1]\nB: [1, 4, 3, 2]\nC: [4, 2, 3, 1]\nD: [4, 1, 3, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [1, 4, 3, 2]\nC: [4, 2, 3, 1]\nD: [4, 1, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_173_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_173_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_173_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_173_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_173_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 1, 3, 4]\nB: [2, 3, 4, 1]\nC: [3, 1, 4, 2]\nD: [4, 1, 2, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 3, 4]\nB: [2, 3, 4, 1]\nC: [3, 1, 4, 2]\nD: [4, 1, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_174_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_174_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_174_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_174_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_174_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 1, 3, 2]\nB: [1, 4, 2, 3]\nC: [3, 1, 4, 2]\nD: [4, 2, 1, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [1, 4, 2, 3]\nC: [3, 1, 4, 2]\nD: [4, 2, 1, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_175_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_175_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_175_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_175_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_175_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 2, 1, 4]\nB: [3, 4, 2, 1]\nC: [4, 1, 2, 3]\nD: [2, 1, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 1, 4]\nB: [3, 4, 2, 1]\nC: [4, 1, 2, 3]\nD: [2, 1, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_176_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_176_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_176_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_176_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_176_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 4, 2, 3]\nB: [1, 4, 3, 2]\nC: [2, 1, 4, 3]\nD: [3, 4, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [1, 4, 3, 2]\nC: [2, 1, 4, 3]\nD: [3, 4, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_177_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_177_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_177_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_177_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_177_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 2, 1, 3]\nB: [1, 4, 3, 2]\nC: [2, 1, 3, 4]\nD: [4, 3, 1, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 1, 3]\nB: [1, 4, 3, 2]\nC: [2, 1, 3, 4]\nD: [4, 3, 1, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_178_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_178_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_178_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_178_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_178_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 1, 2, 3]\nB: [3, 1, 4, 2]\nC: [1, 2, 4, 3]\nD: [1, 2, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [3, 1, 4, 2]\nC: [1, 2, 4, 3]\nD: [1, 2, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_179_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_179_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_179_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_179_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_179_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 3, 2, 1]\nB: [3, 4, 2, 1]\nC: [3, 1, 4, 2]\nD: [1, 4, 2, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [3, 4, 2, 1]\nC: [3, 1, 4, 2]\nD: [1, 4, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_180_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_180_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_180_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_180_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_180_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 1, 3, 4]\nB: [4, 1, 3, 2]\nC: [1, 4, 3, 2]\nD: [3, 4, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 3, 4]\nB: [4, 1, 3, 2]\nC: [1, 4, 3, 2]\nD: [3, 4, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_181_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_181_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_181_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_181_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_181_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 1, 3, 2]\nB: [3, 4, 1, 2]\nC: [2, 3, 1, 4]\nD: [3, 2, 4, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [3, 4, 1, 2]\nC: [2, 3, 1, 4]\nD: [3, 2, 4, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_182_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_182_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_182_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_182_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_182_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 2, 3, 1]\nB: [4, 1, 3, 2]\nC: [2, 1, 4, 3]\nD: [1, 4, 3, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [4, 1, 3, 2]\nC: [2, 1, 4, 3]\nD: [1, 4, 3, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_183_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_183_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_183_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_183_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_183_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 4, 2, 1]\nB: [3, 2, 4, 1]\nC: [4, 1, 2, 3]\nD: [4, 2, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [3, 2, 4, 1]\nC: [4, 1, 2, 3]\nD: [4, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_184_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_184_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_184_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_184_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_184_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 3, 2, 4]\nB: [4, 2, 3, 1]\nC: [3, 2, 1, 4]\nD: [3, 1, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 2, 4]\nB: [4, 2, 3, 1]\nC: [3, 2, 1, 4]\nD: [3, 1, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_185_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_185_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_185_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_185_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_185_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 4, 2, 3]\nB: [3, 1, 4, 2]\nC: [2, 1, 4, 3]\nD: [4, 2, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [3, 1, 4, 2]\nC: [2, 1, 4, 3]\nD: [4, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_186_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_186_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_186_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_186_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_186_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 3, 2, 1]\nB: [4, 3, 1, 2]\nC: [1, 2, 4, 3]\nD: [1, 2, 3, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [4, 3, 1, 2]\nC: [1, 2, 4, 3]\nD: [1, 2, 3, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_187_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_187_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_187_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_187_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_187_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 2, 3, 1]\nB: [4, 3, 2, 1]\nC: [1, 3, 4, 2]\nD: [1, 3, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [4, 3, 2, 1]\nC: [1, 3, 4, 2]\nD: [1, 3, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_188_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_188_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_188_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_188_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_188_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 1, 4, 2]\nB: [3, 2, 4, 1]\nC: [1, 2, 4, 3]\nD: [1, 4, 2, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [3, 2, 4, 1]\nC: [1, 2, 4, 3]\nD: [1, 4, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_189_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_189_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_189_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_189_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_189_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 2, 3, 1]\nB: [1, 2, 3, 4]\nC: [1, 2, 4, 3]\nD: [1, 3, 4, 2]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [1, 2, 3, 4]\nC: [1, 2, 4, 3]\nD: [1, 3, 4, 2]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_190_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_190_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_190_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_190_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_190_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 3, 4, 1]\nB: [3, 1, 4, 2]\nC: [4, 3, 2, 1]\nD: [4, 1, 2, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [3, 1, 4, 2]\nC: [4, 3, 2, 1]\nD: [4, 1, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_191_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_191_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_191_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_191_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_191_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [2, 3, 4, 1]\nB: [2, 4, 1, 3]\nC: [1, 4, 2, 3]\nD: [4, 2, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [2, 4, 1, 3]\nC: [1, 4, 2, 3]\nD: [4, 2, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_192_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_192_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_192_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_192_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_192_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 2, 3, 4]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [1, 4, 2, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [1, 4, 2, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_193_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_193_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_193_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_193_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_193_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 1, 3, 2]\nB: [4, 2, 1, 3]\nC: [2, 4, 1, 3]\nD: [1, 3, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [4, 2, 1, 3]\nC: [2, 4, 1, 3]\nD: [1, 3, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_194_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_194_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_194_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_194_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_194_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 4, 3, 2]\nB: [1, 2, 3, 4]\nC: [4, 1, 2, 3]\nD: [1, 3, 2, 4]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [1, 2, 3, 4]\nC: [4, 1, 2, 3]\nD: [1, 3, 2, 4]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_195_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_195_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_195_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_195_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_195_4.jpg" + ], + "output": "C" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 2, 4, 3]\nB: [2, 4, 3, 1]\nC: [3, 1, 4, 2]\nD: [3, 4, 2, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 4, 3]\nB: [2, 4, 3, 1]\nC: [3, 1, 4, 2]\nD: [3, 4, 2, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_196_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_196_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_196_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_196_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_196_4.jpg" + ], + "output": "D" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [3, 2, 1, 4]\nB: [3, 4, 2, 1]\nC: [4, 2, 3, 1]\nD: [2, 4, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 1, 4]\nB: [3, 4, 2, 1]\nC: [4, 2, 3, 1]\nD: [2, 4, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_197_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_197_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_197_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_197_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_197_4.jpg" + ], + "output": "A" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [4, 2, 3, 1]\nB: [1, 4, 2, 3]\nC: [3, 1, 4, 2]\nD: [2, 4, 3, 1]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [1, 4, 2, 3]\nC: [3, 1, 4, 2]\nD: [2, 4, 3, 1]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_198_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_198_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_198_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_198_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_198_4.jpg" + ], + "output": "B" + }, + { + "task": "jigsaw_puzzle_solving", + "visual_input_component": "['painting_image', 'visual_mark']", + "source": "jigsaw_puzzle_solving_painting", + "options": "A: [1, 2, 3, 4]\nB: [2, 4, 1, 3]\nC: [4, 1, 3, 2]\nD: [2, 1, 4, 3]", + "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", + "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [2, 4, 1, 3]\nC: [4, 1, 3, 2]\nD: [2, 1, 4, 3]", + "input_image_path": [ + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_199_0.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_199_1.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_199_2.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_199_3.jpg", + "../MMIU-Benchmark/jigsaw_puzzle_solving/jigsaw_puzzle_solving_199_4.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nB: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nC: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}\nD: {\"rotation_angle\": 52.0207999596704, \"translation_dx\": 62.052266940503074, \"translation_dy\": 15.318990484280505, \"scale\": 1.1445040102422772}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. 
\nSelect from the following choices.\nA: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nB: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nC: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}\nD: {\"rotation_angle\": 52.0207999596704, \"translation_dx\": 62.052266940503074, \"translation_dy\": 15.318990484280505, \"scale\": 1.1445040102422772}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_0_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_0_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nC: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, \"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nD: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation 
that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nC: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, \"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nD: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_1_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_1_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}\nB: {\"rotation_angle\": 1.3693998936690264, \"translation_dx\": -71.94174431428723, \"translation_dy\": 25.661133958182248, \"scale\": 1.468813327861592}\nC: {\"rotation_angle\": -110.51021822636605, \"translation_dx\": -17.924195571284486, \"translation_dy\": -0.10679752473519954, \"scale\": 1.4066663412939815}\nD: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude 
of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}\nB: {\"rotation_angle\": 1.3693998936690264, \"translation_dx\": -71.94174431428723, \"translation_dy\": 25.661133958182248, \"scale\": 1.468813327861592}\nC: {\"rotation_angle\": -110.51021822636605, \"translation_dx\": -17.924195571284486, \"translation_dy\": -0.10679752473519954, \"scale\": 1.4066663412939815}\nD: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_2_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_2_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, \"translation_dy\": -32.06450758946801, \"scale\": 1.1967157159259998}\nB: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}\nC: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nD: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict 
the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, \"translation_dy\": -32.06450758946801, \"scale\": 1.1967157159259998}\nB: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}\nC: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nD: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_3_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_3_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nB: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nC: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nD: {\"rotation_angle\": 4.3566947214011975, \"translation_dx\": 60.69356846846577, \"translation_dy\": 19.542677658157032, \"scale\": 1.353031271581857}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), 
your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nB: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nC: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nD: {\"rotation_angle\": 4.3566947214011975, \"translation_dx\": 60.69356846846577, \"translation_dy\": 19.542677658157032, \"scale\": 1.353031271581857}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_4_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_4_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}\nC: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nD: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., 
rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}\nC: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nD: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_5_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_5_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 67.74863170033868, \"translation_dx\": 0.9436916559104702, \"translation_dy\": 79.02717939495389, \"scale\": 1.0490112177140545}\nB: {\"rotation_angle\": -149.34069149386406, \"translation_dx\": 81.63420911320063, \"translation_dy\": -26.073567429384056, \"scale\": 1.427947630130646}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial 
transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 67.74863170033868, \"translation_dx\": 0.9436916559104702, \"translation_dy\": 79.02717939495389, \"scale\": 1.0490112177140545}\nB: {\"rotation_angle\": -149.34069149386406, \"translation_dx\": 81.63420911320063, \"translation_dy\": -26.073567429384056, \"scale\": 1.427947630130646}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_6_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_6_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 178.3015459217881, \"translation_dx\": 2.1592483018484785, \"translation_dy\": -86.15095567396924, \"scale\": 1.206185814877298}\nB: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}\nC: {\"rotation_angle\": 179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nD: {\"rotation_angle\": 49.90656423603761, \"translation_dx\": 85.27067294320437, \"translation_dy\": -8.928665399863448, \"scale\": 0.9370060594249733}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and 
after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 178.3015459217881, \"translation_dx\": 2.1592483018484785, \"translation_dy\": -86.15095567396924, \"scale\": 1.206185814877298}\nB: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}\nC: {\"rotation_angle\": 179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nD: {\"rotation_angle\": 49.90656423603761, \"translation_dx\": 85.27067294320437, \"translation_dy\": -8.928665399863448, \"scale\": 0.9370060594249733}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_7_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_7_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nB: {\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}\nC: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nD: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": -103.12901971602221, \"translation_dy\": 94.89161684072867, \"scale\": 1.2295411735859756}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images 
depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nB: {\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}\nC: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nD: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": -103.12901971602221, \"translation_dy\": 94.89161684072867, \"scale\": 1.2295411735859756}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_8_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_8_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nB: {\"rotation_angle\": -0.45613579718829556, \"translation_dx\": 98.71619714866841, \"translation_dy\": 70.1100439641223, \"scale\": 0.6491919010173006}\nC: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nD: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 1.3327657238113826}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": 
"Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nB: {\"rotation_angle\": -0.45613579718829556, \"translation_dx\": 98.71619714866841, \"translation_dy\": 70.1100439641223, \"scale\": 0.6491919010173006}\nC: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nD: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 1.3327657238113826}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_9_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_9_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}\nB: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}\nC: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nD: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}", + "question": "Please compute the type and parameters of the spatial transformation between these two 
images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}\nB: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}\nC: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nD: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_10_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_10_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nB: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nC: {\"rotation_angle\": -23.02063628299686, \"translation_dx\": -42.06347070905805, \"translation_dy\": 68.90308226059909, \"scale\": 0.7321107429069119}\nD: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}", + "question": "Please compute the type and parameters of the spatial 
transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nB: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nC: {\"rotation_angle\": -23.02063628299686, \"translation_dx\": -42.06347070905805, \"translation_dy\": 68.90308226059909, \"scale\": 0.7321107429069119}\nD: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_11_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_11_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nB: {\"rotation_angle\": -123.92621597373325, \"translation_dx\": 115.25994331141689, \"translation_dy\": -45.13111299141354, \"scale\": 1.164470344420729}\nC: {\"rotation_angle\": 95.56102360167273, \"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}\nD: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}", + "question": "Please compute the 
type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nB: {\"rotation_angle\": -123.92621597373325, \"translation_dx\": 115.25994331141689, \"translation_dy\": -45.13111299141354, \"scale\": 1.164470344420729}\nC: {\"rotation_angle\": 95.56102360167273, \"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}\nD: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_12_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_12_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -124.74198080809023, \"translation_dx\": -48.23531115232953, \"translation_dy\": 52.62526617026404, \"scale\": 1.3484625774406969}\nB: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}\nC: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": 110.02825264959768, \"translation_dx\": -53.26387197670213, \"translation_dy\": 88.43864976013427, \"scale\": 1.4833645013101147}", + 
"question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -124.74198080809023, \"translation_dx\": -48.23531115232953, \"translation_dy\": 52.62526617026404, \"scale\": 1.3484625774406969}\nB: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}\nC: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": 110.02825264959768, \"translation_dx\": -53.26387197670213, \"translation_dy\": 88.43864976013427, \"scale\": 1.4833645013101147}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_13_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_13_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}\nB: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nC: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nD: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 
1.1628545134465245}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}\nB: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nC: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nD: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_14_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_14_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": -17.359661719977098, \"scale\": 1.0858344969275349}\nC: {\"rotation_angle\": 162.9787629733711, \"translation_dx\": 56.68968820785494, \"translation_dy\": 63.47754229449794, \"scale\": 0.7767697180212818}\nD: {\"rotation_angle\": 74.4727172984789, \"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, 
\"scale\": 1.4775593630739356}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": -17.359661719977098, \"scale\": 1.0858344969275349}\nC: {\"rotation_angle\": 162.9787629733711, \"translation_dx\": 56.68968820785494, \"translation_dy\": 63.47754229449794, \"scale\": 0.7767697180212818}\nD: {\"rotation_angle\": 74.4727172984789, \"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, \"scale\": 1.4775593630739356}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_15_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_15_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -32.96407209098831, \"translation_dx\": -27.518946535455143, \"translation_dy\": 2.5370159689679213, \"scale\": 1.259328459428434}\nB: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}\nC: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}\nD: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, \"translation_dy\": 
24.014319900588845, \"scale\": 1.3204557483507742}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -32.96407209098831, \"translation_dx\": -27.518946535455143, \"translation_dy\": 2.5370159689679213, \"scale\": 1.259328459428434}\nB: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}\nC: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}\nD: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, \"translation_dy\": 24.014319900588845, \"scale\": 1.3204557483507742}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_16_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_16_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}\nB: {\"rotation_angle\": -148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nC: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nD: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 
62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}\nB: {\"rotation_angle\": -148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nC: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nD: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_17_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_17_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}\nB: {\"rotation_angle\": 133.22970053001933, \"translation_dx\": 30.83867253278636, \"translation_dy\": 9.987607615316023, \"scale\": 0.9746642566652708}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": 
-110.51021822636605, \"translation_dx\": -17.924195571284486, \"translation_dy\": -0.10679752473519954, \"scale\": 1.4066663412939815}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}\nB: {\"rotation_angle\": 133.22970053001933, \"translation_dx\": 30.83867253278636, \"translation_dy\": 9.987607615316023, \"scale\": 0.9746642566652708}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": -110.51021822636605, \"translation_dx\": -17.924195571284486, \"translation_dy\": -0.10679752473519954, \"scale\": 1.4066663412939815}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_18_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_18_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}\nB: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nC: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 
0.6647290774480178}\nD: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}\nB: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nC: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nD: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_19_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_19_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 47.16467358014893, \"translation_dx\": -87.19318159487975, \"translation_dy\": -49.56686010575127, \"scale\": 1.2416587716965684}\nB: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}\nC: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 
26.2224872549711, \"scale\": 0.6008305458537412}\nD: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": -35.71086816730676, \"scale\": 1.30648936857296}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 47.16467358014893, \"translation_dx\": -87.19318159487975, \"translation_dy\": -49.56686010575127, \"scale\": 1.2416587716965684}\nB: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}\nC: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nD: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": -35.71086816730676, \"scale\": 1.30648936857296}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_20_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_20_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, 
\"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nD: {\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, \"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nD: {\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_21_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_21_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -70.97525301082955, \"translation_dx\": -28.380848037876873, \"translation_dy\": 54.37723426674512, \"scale\": 0.9024922197892329}\nB: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}\nC: {\"rotation_angle\": -162.31682909306286, 
\"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nD: {\"rotation_angle\": 127.0599036632886, \"translation_dx\": -26.73103881794438, \"translation_dy\": 16.785326739741976, \"scale\": 1.1214331244941351}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -70.97525301082955, \"translation_dx\": -28.380848037876873, \"translation_dy\": 54.37723426674512, \"scale\": 0.9024922197892329}\nB: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}\nC: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nD: {\"rotation_angle\": 127.0599036632886, \"translation_dx\": -26.73103881794438, \"translation_dy\": 16.785326739741976, \"scale\": 1.1214331244941351}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_22_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_22_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nB: {\"rotation_angle\": 168.86687879669455, \"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}\nC: 
{\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}\nD: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nB: {\"rotation_angle\": 168.86687879669455, \"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}\nC: {\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}\nD: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_23_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_23_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nB: {\"rotation_angle\": -115.34417090075787, \"translation_dx\": -118.63121430094503, \"translation_dy\": 41.63412082488844, 
\"scale\": 0.9001856788272352}\nC: {\"rotation_angle\": 159.18509857624855, \"translation_dx\": 94.5972413522399, \"translation_dy\": -87.01463724053234, \"scale\": 0.7914176569510836}\nD: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nB: {\"rotation_angle\": -115.34417090075787, \"translation_dx\": -118.63121430094503, \"translation_dy\": 41.63412082488844, \"scale\": 0.9001856788272352}\nC: {\"rotation_angle\": 159.18509857624855, \"translation_dx\": 94.5972413522399, \"translation_dy\": -87.01463724053234, \"scale\": 0.7914176569510836}\nD: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_24_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_24_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nB: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, 
\"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nC: {\"rotation_angle\": 28.728757892682808, \"translation_dx\": 12.065384659700086, \"translation_dy\": -119.64549643343977, \"scale\": 1.126100132224236}\nD: {\"rotation_angle\": 28.728757892682808, \"translation_dx\": 12.065384659700086, \"translation_dy\": -119.64549643343977, \"scale\": 1.126100132224236}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nB: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nC: {\"rotation_angle\": 28.728757892682808, \"translation_dx\": 12.065384659700086, \"translation_dy\": -119.64549643343977, \"scale\": 1.126100132224236}\nD: {\"rotation_angle\": 28.728757892682808, \"translation_dx\": 12.065384659700086, \"translation_dy\": -119.64549643343977, \"scale\": 1.126100132224236}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_25_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_25_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}\nB: {\"rotation_angle\": 134.22497079750707, 
\"translation_dx\": -56.33244292094708, \"translation_dy\": 12.15417280277697, \"scale\": 1.260404381889235}\nC: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}\nD: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}\nB: {\"rotation_angle\": 134.22497079750707, \"translation_dx\": -56.33244292094708, \"translation_dy\": 12.15417280277697, \"scale\": 1.260404381889235}\nC: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}\nD: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_26_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_26_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 51.652651058291696, \"translation_dx\": -79.60059266318888, \"translation_dy\": 40.24223939512936, \"scale\": 1.045377495061187}\nB: {\"rotation_angle\": 
64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}\nC: {\"rotation_angle\": 106.62912259997893, \"translation_dx\": -62.19399566166837, \"translation_dy\": -63.078041204745844, \"scale\": 1.4577244189370733}\nD: {\"rotation_angle\": -44.902472769484746, \"translation_dx\": -36.85475324083902, \"translation_dy\": 36.81692000181951, \"scale\": 1.0769710077370194}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 51.652651058291696, \"translation_dx\": -79.60059266318888, \"translation_dy\": 40.24223939512936, \"scale\": 1.045377495061187}\nB: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}\nC: {\"rotation_angle\": 106.62912259997893, \"translation_dx\": -62.19399566166837, \"translation_dy\": -63.078041204745844, \"scale\": 1.4577244189370733}\nD: {\"rotation_angle\": -44.902472769484746, \"translation_dx\": -36.85475324083902, \"translation_dy\": 36.81692000181951, \"scale\": 1.0769710077370194}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_27_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_27_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 127.1396993936072, \"translation_dx\": -29.08894824101361, \"translation_dy\": -80.84475014775404, \"scale\": 
1.2834497894588772}\nB: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nC: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}\nD: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 1.2561938294527635}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 127.1396993936072, \"translation_dx\": -29.08894824101361, \"translation_dy\": -80.84475014775404, \"scale\": 1.2834497894588772}\nB: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nC: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}\nD: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 1.2561938294527635}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_28_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_28_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": 
-54.72741054396442, \"scale\": 1.391656794638211}\nB: {\"rotation_angle\": -37.8135886633452, \"translation_dx\": 94.09848811207868, \"translation_dy\": -28.846940165704815, \"scale\": 0.7423292461324351}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": -84.90425841207441, \"translation_dx\": -96.22975116611923, \"translation_dy\": -54.13037688992304, \"scale\": 1.161476925450186}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nB: {\"rotation_angle\": -37.8135886633452, \"translation_dx\": 94.09848811207868, \"translation_dy\": -28.846940165704815, \"scale\": 0.7423292461324351}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": -84.90425841207441, \"translation_dx\": -96.22975116611923, \"translation_dy\": -54.13037688992304, \"scale\": 1.161476925450186}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_29_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_29_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, 
\"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": -17.786253698900993, \"scale\": 1.4583062003808291}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, \"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": -17.786253698900993, \"scale\": 1.4583062003808291}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_30_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_30_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -127.03310490562403, 
\"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nB: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nC: {\"rotation_angle\": 47.16467358014893, \"translation_dx\": -87.19318159487975, \"translation_dy\": -49.56686010575127, \"scale\": 1.2416587716965684}\nD: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nB: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nC: {\"rotation_angle\": 47.16467358014893, \"translation_dx\": -87.19318159487975, \"translation_dy\": -49.56686010575127, \"scale\": 1.2416587716965684}\nD: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_31_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_31_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: 
{\"rotation_angle\": 160.04018122869564, \"translation_dx\": -10.031879581871024, \"translation_dy\": 74.10075881851205, \"scale\": 0.8976020445815951}\nB: {\"rotation_angle\": -131.1795029858263, \"translation_dx\": 17.908074544940433, \"translation_dy\": 120.17637833747304, \"scale\": 0.9471882483559888}\nC: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nD: {\"rotation_angle\": -117.26843352521382, \"translation_dx\": 17.28573283600312, \"translation_dy\": -92.45781352854672, \"scale\": 1.478727361005855}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 160.04018122869564, \"translation_dx\": -10.031879581871024, \"translation_dy\": 74.10075881851205, \"scale\": 0.8976020445815951}\nB: {\"rotation_angle\": -131.1795029858263, \"translation_dx\": 17.908074544940433, \"translation_dy\": 120.17637833747304, \"scale\": 0.9471882483559888}\nC: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nD: {\"rotation_angle\": -117.26843352521382, \"translation_dx\": 17.28573283600312, \"translation_dy\": -92.45781352854672, \"scale\": 1.478727361005855}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_32_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_32_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": 
"COCO_spatial", + "options": "A: {\"rotation_angle\": 33.36657735274014, \"translation_dx\": -110.42271839281483, \"translation_dy\": 35.783043595963875, \"scale\": 1.1017945125321793}\nB: {\"rotation_angle\": -153.95647753312159, \"translation_dx\": 64.08546266437509, \"translation_dy\": -34.554486291313935, \"scale\": 1.423360690418288}\nC: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}\nD: {\"rotation_angle\": 127.0599036632886, \"translation_dx\": -26.73103881794438, \"translation_dy\": 16.785326739741976, \"scale\": 1.1214331244941351}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 33.36657735274014, \"translation_dx\": -110.42271839281483, \"translation_dy\": 35.783043595963875, \"scale\": 1.1017945125321793}\nB: {\"rotation_angle\": -153.95647753312159, \"translation_dx\": 64.08546266437509, \"translation_dy\": -34.554486291313935, \"scale\": 1.423360690418288}\nC: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}\nD: {\"rotation_angle\": 127.0599036632886, \"translation_dx\": -26.73103881794438, \"translation_dy\": 16.785326739741976, \"scale\": 1.1214331244941351}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_33_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_33_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": 
"natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nB: {\"rotation_angle\": -92.49508697379828, \"translation_dx\": 63.09853740086383, \"translation_dy\": 99.47995409556995, \"scale\": 0.9495145406508286}\nC: {\"rotation_angle\": 8.705969178532513, \"translation_dx\": -108.98578445869327, \"translation_dy\": -85.91179454441009, \"scale\": 0.5132717751865925}\nD: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. 
\nSelect from the following choices.\nA: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nB: {\"rotation_angle\": -92.49508697379828, \"translation_dx\": 63.09853740086383, \"translation_dy\": 99.47995409556995, \"scale\": 0.9495145406508286}\nC: {\"rotation_angle\": 8.705969178532513, \"translation_dx\": -108.98578445869327, \"translation_dy\": -85.91179454441009, \"scale\": 0.5132717751865925}\nD: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_34_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_34_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nB: {\"rotation_angle\": 88.55038325147228, \"translation_dx\": -17.272344447388633, \"translation_dy\": -67.72549137992362, \"scale\": 0.5810098703790367}\nC: {\"rotation_angle\": 1.3693998936690264, \"translation_dx\": -71.94174431428723, \"translation_dy\": 25.661133958182248, \"scale\": 1.468813327861592}\nD: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the 
transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nB: {\"rotation_angle\": 88.55038325147228, \"translation_dx\": -17.272344447388633, \"translation_dy\": -67.72549137992362, \"scale\": 0.5810098703790367}\nC: {\"rotation_angle\": 1.3693998936690264, \"translation_dx\": -71.94174431428723, \"translation_dy\": 25.661133958182248, \"scale\": 1.468813327861592}\nD: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_35_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_35_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -41.748048059314925, \"translation_dx\": 84.2495675740148, \"translation_dy\": -81.02778113177463, \"scale\": 1.207158201764622}\nB: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nC: {\"rotation_angle\": 52.0207999596704, \"translation_dx\": 62.052266940503074, \"translation_dy\": 15.318990484280505, \"scale\": 1.1445040102422772}\nD: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and 
magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -41.748048059314925, \"translation_dx\": 84.2495675740148, \"translation_dy\": -81.02778113177463, \"scale\": 1.207158201764622}\nB: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nC: {\"rotation_angle\": 52.0207999596704, \"translation_dx\": 62.052266940503074, \"translation_dy\": 15.318990484280505, \"scale\": 1.1445040102422772}\nD: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_36_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_36_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 130.382151153576, \"translation_dx\": 48.77925626504499, \"translation_dy\": 54.89982459749416, \"scale\": 1.3647831130001666}\nB: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": -35.71086816730676, \"scale\": 1.30648936857296}\nC: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}\nD: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to 
predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 130.382151153576, \"translation_dx\": 48.77925626504499, \"translation_dy\": 54.89982459749416, \"scale\": 1.3647831130001666}\nB: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": -35.71086816730676, \"scale\": 1.30648936857296}\nC: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}\nD: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_37_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_37_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nB: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nC: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}\nD: {\"rotation_angle\": 178.3015459217881, \"translation_dx\": 2.1592483018484785, \"translation_dy\": -86.15095567396924, \"scale\": 1.206185814877298}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, 
translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nB: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nC: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}\nD: {\"rotation_angle\": 178.3015459217881, \"translation_dx\": 2.1592483018484785, \"translation_dy\": -86.15095567396924, \"scale\": 1.206185814877298}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_38_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_38_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -81.11314702551758, \"translation_dx\": -115.5554336511824, \"translation_dy\": 81.04425747964075, \"scale\": 0.8604764063335847}\nB: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nC: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, \"scale\": 0.6972670428813228}\nD: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial 
transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -81.11314702551758, \"translation_dx\": -115.5554336511824, \"translation_dy\": 81.04425747964075, \"scale\": 0.8604764063335847}\nB: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nC: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, \"scale\": 0.6972670428813228}\nD: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_39_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_39_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 1.3327657238113826}\nB: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}\nC: {\"rotation_angle\": 49.90656423603761, \"translation_dx\": 85.27067294320437, \"translation_dy\": -8.928665399863448, \"scale\": 0.9370060594249733}\nD: {\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before 
and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 1.3327657238113826}\nB: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}\nC: {\"rotation_angle\": 49.90656423603761, \"translation_dx\": 85.27067294320437, \"translation_dy\": -8.928665399863448, \"scale\": 0.9370060594249733}\nD: {\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_40_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_40_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}\nB: {\"rotation_angle\": 52.0207999596704, \"translation_dx\": 62.052266940503074, \"translation_dy\": 15.318990484280505, \"scale\": 1.1445040102422772}\nC: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, \"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}\nD: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images 
depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}\nB: {\"rotation_angle\": 52.0207999596704, \"translation_dx\": 62.052266940503074, \"translation_dy\": 15.318990484280505, \"scale\": 1.1445040102422772}\nC: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, \"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}\nD: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_41_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_41_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, \"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}\nB: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nC: {\"rotation_angle\": -61.308258156024195, \"translation_dx\": -92.42627707406731, \"translation_dy\": -21.076199203141364, \"scale\": 1.1133621977071444}\nD: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + 
"context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, \"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}\nB: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nC: {\"rotation_angle\": -61.308258156024195, \"translation_dx\": -92.42627707406731, \"translation_dy\": -21.076199203141364, \"scale\": 1.1133621977071444}\nD: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_42_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_42_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": -96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 1.3017377870800058}\nC: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nD: {\"rotation_angle\": -35.37165300247324, \"translation_dx\": -51.674784510203665, \"translation_dy\": 35.0550301640573, \"scale\": 1.181842779166554}", + "question": "Please compute the type and parameters of the spatial transformation 
between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": -96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 1.3017377870800058}\nC: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nD: {\"rotation_angle\": -35.37165300247324, \"translation_dx\": -51.674784510203665, \"translation_dy\": 35.0550301640573, \"scale\": 1.181842779166554}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_43_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_43_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -103.5561502427767, \"translation_dx\": -75.76940431238745, \"translation_dy\": -48.3479107136017, \"scale\": 1.0522987713432983}\nB: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}\nC: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}\nD: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}", + "question": "Please compute the type and parameters of the 
spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -103.5561502427767, \"translation_dx\": -75.76940431238745, \"translation_dy\": -48.3479107136017, \"scale\": 1.0522987713432983}\nB: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}\nC: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}\nD: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_44_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_44_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}\nB: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nC: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nD: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}", + "question": "Please compute the 
type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}\nB: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nC: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nD: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_45_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_45_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}\nB: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nC: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nD: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}", + "question": 
"Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}\nB: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nC: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nD: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_46_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_46_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nB: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": -96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 1.3017377870800058}\nC: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}\nD: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}", 
+ "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nB: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": -96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 1.3017377870800058}\nC: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}\nD: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_47_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_47_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": 162.6656255846617, \"translation_dx\": -24.713919503645087, \"translation_dy\": -0.6846177496217649, \"scale\": 0.967192316827237}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 
1.21487245494624}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": 162.6656255846617, \"translation_dx\": -24.713919503645087, \"translation_dy\": -0.6846177496217649, \"scale\": 0.967192316827237}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_48_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_48_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": 49.90656423603761, \"translation_dx\": 85.27067294320437, \"translation_dy\": -8.928665399863448, \"scale\": 0.9370060594249733}\nC: {\"rotation_angle\": -41.748048059314925, \"translation_dx\": 84.2495675740148, \"translation_dy\": -81.02778113177463, \"scale\": 1.207158201764622}\nD: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 
57.916315250854666, \"scale\": 0.5483419258047426}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": 49.90656423603761, \"translation_dx\": 85.27067294320437, \"translation_dy\": -8.928665399863448, \"scale\": 0.9370060594249733}\nC: {\"rotation_angle\": -41.748048059314925, \"translation_dx\": 84.2495675740148, \"translation_dy\": -81.02778113177463, \"scale\": 1.207158201764622}\nD: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_49_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_49_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nB: {\"rotation_angle\": -44.902472769484746, \"translation_dx\": -36.85475324083902, \"translation_dy\": 36.81692000181951, \"scale\": 1.0769710077370194}\nC: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nD: {\"rotation_angle\": 67.74863170033868, \"translation_dx\": 0.9436916559104702, 
\"translation_dy\": 79.02717939495389, \"scale\": 1.0490112177140545}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nB: {\"rotation_angle\": -44.902472769484746, \"translation_dx\": -36.85475324083902, \"translation_dy\": 36.81692000181951, \"scale\": 1.0769710077370194}\nC: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nD: {\"rotation_angle\": 67.74863170033868, \"translation_dx\": 0.9436916559104702, \"translation_dy\": 79.02717939495389, \"scale\": 1.0490112177140545}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_50_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_50_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}\nD: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": 
-14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}\nD: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_51_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_51_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nB: {\"rotation_angle\": 168.86687879669455, \"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}\nC: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}\nD: {\"rotation_angle\": 
-4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nB: {\"rotation_angle\": 168.86687879669455, \"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}\nC: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}\nD: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_52_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_52_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nB: {\"rotation_angle\": 52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}\nC: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 
1.0703563169477137}\nD: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": -103.12901971602221, \"translation_dy\": 94.89161684072867, \"scale\": 1.2295411735859756}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nB: {\"rotation_angle\": 52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}\nC: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nD: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": -103.12901971602221, \"translation_dy\": 94.89161684072867, \"scale\": 1.2295411735859756}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_53_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_53_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 159.18509857624855, \"translation_dx\": 94.5972413522399, \"translation_dy\": -87.01463724053234, \"scale\": 0.7914176569510836}\nB: {\"rotation_angle\": 67.74863170033868, \"translation_dx\": 0.9436916559104702, \"translation_dy\": 79.02717939495389, \"scale\": 1.0490112177140545}\nC: {\"rotation_angle\": 161.7596265938729, \"translation_dx\": -9.170216354863072, \"translation_dy\": 
-19.23222492696047, \"scale\": 1.1821087248622173}\nD: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": -17.359661719977098, \"scale\": 1.0858344969275349}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.18509857624855, \"translation_dx\": 94.5972413522399, \"translation_dy\": -87.01463724053234, \"scale\": 0.7914176569510836}\nB: {\"rotation_angle\": 67.74863170033868, \"translation_dx\": 0.9436916559104702, \"translation_dy\": 79.02717939495389, \"scale\": 1.0490112177140545}\nC: {\"rotation_angle\": 161.7596265938729, \"translation_dx\": -9.170216354863072, \"translation_dy\": -19.23222492696047, \"scale\": 1.1821087248622173}\nD: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": -17.359661719977098, \"scale\": 1.0858344969275349}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_54_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_54_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}\nB: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nC: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 
61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}\nD: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}\nB: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nC: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}\nD: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_55_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_55_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}\nB: {\"rotation_angle\": -110.51021822636605, \"translation_dx\": -17.924195571284486, \"translation_dy\": -0.10679752473519954, \"scale\": 1.4066663412939815}\nC: {\"rotation_angle\": 
12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nD: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}\nB: {\"rotation_angle\": -110.51021822636605, \"translation_dx\": -17.924195571284486, \"translation_dy\": -0.10679752473519954, \"scale\": 1.4066663412939815}\nC: {\"rotation_angle\": 12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nD: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_56_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_56_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -6.258618837806779, \"translation_dx\": -117.56200624611057, \"translation_dy\": -84.92852320396813, \"scale\": 0.8703619649920769}\nB: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 
0.9834422929774667}\nC: {\"rotation_angle\": -6.38420562293993, \"translation_dx\": -106.80670691302902, \"translation_dy\": -3.5935098985529663, \"scale\": 1.3037846299861797}\nD: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -6.258618837806779, \"translation_dx\": -117.56200624611057, \"translation_dy\": -84.92852320396813, \"scale\": 0.8703619649920769}\nB: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nC: {\"rotation_angle\": -6.38420562293993, \"translation_dx\": -106.80670691302902, \"translation_dy\": -3.5935098985529663, \"scale\": 1.3037846299861797}\nD: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_57_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_57_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 28.186459007199005, \"translation_dx\": -85.64869298892413, \"translation_dy\": -90.9589081114641, \"scale\": 0.5939510579225048}\nB: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": 
-35.71086816730676, \"scale\": 1.30648936857296}\nC: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nD: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": -44.645046657688624, \"scale\": 1.4332006009229632}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 28.186459007199005, \"translation_dx\": -85.64869298892413, \"translation_dy\": -90.9589081114641, \"scale\": 0.5939510579225048}\nB: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": -35.71086816730676, \"scale\": 1.30648936857296}\nC: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nD: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": -44.645046657688624, \"scale\": 1.4332006009229632}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_58_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_58_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}\nB: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, 
\"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}\nC: {\"rotation_angle\": -37.8135886633452, \"translation_dx\": 94.09848811207868, \"translation_dy\": -28.846940165704815, \"scale\": 0.7423292461324351}\nD: {\"rotation_angle\": -120.90208363304777, \"translation_dx\": -24.471100960859047, \"translation_dy\": -96.60346561133943, \"scale\": 1.2238954631080248}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}\nB: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}\nC: {\"rotation_angle\": -37.8135886633452, \"translation_dx\": 94.09848811207868, \"translation_dy\": -28.846940165704815, \"scale\": 0.7423292461324351}\nD: {\"rotation_angle\": -120.90208363304777, \"translation_dx\": -24.471100960859047, \"translation_dy\": -96.60346561133943, \"scale\": 1.2238954631080248}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_59_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_59_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nB: {\"rotation_angle\": 74.4727172984789, 
\"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, \"scale\": 1.4775593630739356}\nC: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nD: {\"rotation_angle\": 134.66606893121838, \"translation_dx\": 30.71289427748178, \"translation_dy\": 31.00111281943242, \"scale\": 0.9716368665085688}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nB: {\"rotation_angle\": 74.4727172984789, \"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, \"scale\": 1.4775593630739356}\nC: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nD: {\"rotation_angle\": 134.66606893121838, \"translation_dx\": 30.71289427748178, \"translation_dy\": 31.00111281943242, \"scale\": 0.9716368665085688}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_60_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_60_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nB: {\"rotation_angle\": 
52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}\nC: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nD: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nB: {\"rotation_angle\": 52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}\nC: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nD: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_61_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_61_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 
1.1575890826518318}\nB: {\"rotation_angle\": -26.00307697103628, \"translation_dx\": -100.91027332279833, \"translation_dy\": 27.120302875093685, \"scale\": 0.9546103505495939}\nC: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": -44.645046657688624, \"scale\": 1.4332006009229632}\nD: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": -94.68626204020723, \"scale\": 1.377433782383828}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nB: {\"rotation_angle\": -26.00307697103628, \"translation_dx\": -100.91027332279833, \"translation_dy\": 27.120302875093685, \"scale\": 0.9546103505495939}\nC: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": -44.645046657688624, \"scale\": 1.4332006009229632}\nD: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": -94.68626204020723, \"scale\": 1.377433782383828}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_62_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_62_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -37.8135886633452, \"translation_dx\": 94.09848811207868, \"translation_dy\": 
-28.846940165704815, \"scale\": 0.7423292461324351}\nB: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}\nC: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, \"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}\nD: {\"rotation_angle\": 4.601729825002167, \"translation_dx\": -92.34842360064926, \"translation_dy\": 78.34726427877602, \"scale\": 0.7620115680057987}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -37.8135886633452, \"translation_dx\": 94.09848811207868, \"translation_dy\": -28.846940165704815, \"scale\": 0.7423292461324351}\nB: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}\nC: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, \"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}\nD: {\"rotation_angle\": 4.601729825002167, \"translation_dx\": -92.34842360064926, \"translation_dy\": 78.34726427877602, \"scale\": 0.7620115680057987}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_63_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_63_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, 
\"translation_dy\": -32.06450758946801, \"scale\": 1.1967157159259998}\nB: {\"rotation_angle\": 48.71833122181758, \"translation_dx\": -105.22683210092106, \"translation_dy\": -63.34096559919908, \"scale\": 0.7204478932238769}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": -149.42147215379055, \"translation_dx\": 2.3444194857030283, \"translation_dy\": 35.92779325530762, \"scale\": 1.0223945055206394}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, \"translation_dy\": -32.06450758946801, \"scale\": 1.1967157159259998}\nB: {\"rotation_angle\": 48.71833122181758, \"translation_dx\": -105.22683210092106, \"translation_dy\": -63.34096559919908, \"scale\": 0.7204478932238769}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": -149.42147215379055, \"translation_dx\": 2.3444194857030283, \"translation_dy\": 35.92779325530762, \"scale\": 1.0223945055206394}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_64_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_64_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -53.475823147809436, 
\"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}\nB: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, \"translation_dy\": 24.014319900588845, \"scale\": 1.3204557483507742}\nC: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}\nD: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}\nB: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, \"translation_dy\": 24.014319900588845, \"scale\": 1.3204557483507742}\nC: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}\nD: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_65_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_65_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: 
{\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nC: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": -44.645046657688624, \"scale\": 1.4332006009229632}\nD: {\"rotation_angle\": 52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nC: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": -44.645046657688624, \"scale\": 1.4332006009229632}\nD: {\"rotation_angle\": 52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_66_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_66_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": 
"COCO_spatial", + "options": "A: {\"rotation_angle\": 23.955007488404988, \"translation_dx\": 90.0018582930472, \"translation_dy\": 38.03553582875617, \"scale\": 1.3380437802347522}\nB: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nC: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nD: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 23.955007488404988, \"translation_dx\": 90.0018582930472, \"translation_dy\": 38.03553582875617, \"scale\": 1.3380437802347522}\nB: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nC: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nD: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_67_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_67_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", 
+ "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": 49.896013394485834, \"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nD: {\"rotation_angle\": -110.46391589612124, \"translation_dx\": -77.96644542647721, \"translation_dy\": -50.23500265461973, \"scale\": 0.7651088884143488}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": 49.896013394485834, \"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nD: {\"rotation_angle\": -110.46391589612124, \"translation_dx\": -77.96644542647721, \"translation_dy\": -50.23500265461973, \"scale\": 0.7651088884143488}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_68_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_68_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + 
"visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -153.95647753312159, \"translation_dx\": 64.08546266437509, \"translation_dy\": -34.554486291313935, \"scale\": 1.423360690418288}\nC: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}\nD: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 11.25207260435026, \"scale\": 1.2682750116992958}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. 
\nSelect from the following choices.\nA: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -153.95647753312159, \"translation_dx\": 64.08546266437509, \"translation_dy\": -34.554486291313935, \"scale\": 1.423360690418288}\nC: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}\nD: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 11.25207260435026, \"scale\": 1.2682750116992958}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_69_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_69_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}\nB: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the 
transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}\nB: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_70_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_70_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nB: {\"rotation_angle\": 115.44035395260755, \"translation_dx\": 104.38539690843712, \"translation_dy\": -82.71757148170198, \"scale\": 0.6534862534786243}\nC: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nD: {\"rotation_angle\": 130.382151153576, \"translation_dx\": 48.77925626504499, \"translation_dy\": 54.89982459749416, \"scale\": 1.3647831130001666}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the 
type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nB: {\"rotation_angle\": 115.44035395260755, \"translation_dx\": 104.38539690843712, \"translation_dy\": -82.71757148170198, \"scale\": 0.6534862534786243}\nC: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nD: {\"rotation_angle\": 130.382151153576, \"translation_dx\": 48.77925626504499, \"translation_dy\": 54.89982459749416, \"scale\": 1.3647831130001666}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_71_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_71_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}\nB: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 1.2561938294527635}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task 
is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}\nB: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 1.2561938294527635}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_72_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_72_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nB: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 11.25207260435026, \"scale\": 1.2682750116992958}\nC: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nD: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, 
translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nB: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 11.25207260435026, \"scale\": 1.2682750116992958}\nC: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nD: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_73_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_73_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}\nB: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}\nC: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nD: {\"rotation_angle\": -120.90208363304777, \"translation_dx\": -24.471100960859047, \"translation_dy\": -96.60346561133943, \"scale\": 1.2238954631080248}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial 
transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}\nB: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}\nC: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nD: {\"rotation_angle\": -120.90208363304777, \"translation_dx\": -24.471100960859047, \"translation_dy\": -96.60346561133943, \"scale\": 1.2238954631080248}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_74_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_74_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 88.55038325147228, \"translation_dx\": -17.272344447388633, \"translation_dy\": -67.72549137992362, \"scale\": 0.5810098703790367}\nB: {\"rotation_angle\": 127.1396993936072, \"translation_dx\": -29.08894824101361, \"translation_dy\": -80.84475014775404, \"scale\": 1.2834497894588772}\nC: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nD: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes 
before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 88.55038325147228, \"translation_dx\": -17.272344447388633, \"translation_dy\": -67.72549137992362, \"scale\": 0.5810098703790367}\nB: {\"rotation_angle\": 127.1396993936072, \"translation_dx\": -29.08894824101361, \"translation_dy\": -80.84475014775404, \"scale\": 1.2834497894588772}\nC: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nD: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_75_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_75_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nB: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nC: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}\nD: {\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of 
images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nB: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nC: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}\nD: {\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_76_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_76_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}\nB: {\"rotation_angle\": 74.4727172984789, \"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, \"scale\": 1.4775593630739356}\nC: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}\nD: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + 
"context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}\nB: {\"rotation_angle\": 74.4727172984789, \"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, \"scale\": 1.4775593630739356}\nC: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}\nD: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_77_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_77_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 28.186459007199005, \"translation_dx\": -85.64869298892413, \"translation_dy\": -90.9589081114641, \"scale\": 0.5939510579225048}\nB: {\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}\nC: {\"rotation_angle\": -46.75272698463425, \"translation_dx\": 16.424107524155175, \"translation_dy\": -60.683488552754085, \"scale\": 1.375025476214386}\nD: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}", + "question": "Please compute the type and parameters of the spatial transformation 
between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 28.186459007199005, \"translation_dx\": -85.64869298892413, \"translation_dy\": -90.9589081114641, \"scale\": 0.5939510579225048}\nB: {\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}\nC: {\"rotation_angle\": -46.75272698463425, \"translation_dx\": 16.424107524155175, \"translation_dy\": -60.683488552754085, \"scale\": 1.375025476214386}\nD: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_78_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_78_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -174.94064668132228, \"translation_dx\": 73.73079207136513, \"translation_dy\": 58.25534486945551, \"scale\": 1.178357936048121}\nB: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}\nC: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}\nD: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}", + "question": "Please compute the type and parameters 
of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -174.94064668132228, \"translation_dx\": 73.73079207136513, \"translation_dy\": 58.25534486945551, \"scale\": 1.178357936048121}\nB: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}\nC: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}\nD: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_79_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_79_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nB: {\"rotation_angle\": 95.56102360167273, \"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}\nC: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nD: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}", + "question": "Please 
compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nB: {\"rotation_angle\": 95.56102360167273, \"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}\nC: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nD: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_80_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_80_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nC: {\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}\nD: {\"rotation_angle\": -120.90208363304777, \"translation_dx\": -24.471100960859047, \"translation_dy\": -96.60346561133943, \"scale\": 
1.2238954631080248}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nC: {\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}\nD: {\"rotation_angle\": -120.90208363304777, \"translation_dx\": -24.471100960859047, \"translation_dy\": -96.60346561133943, \"scale\": 1.2238954631080248}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_81_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_81_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nB: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, \"translation_dy\": -32.06450758946801, \"scale\": 1.1967157159259998}\nC: {\"rotation_angle\": 134.66606893121838, \"translation_dx\": 30.71289427748178, \"translation_dy\": 31.00111281943242, \"scale\": 0.9716368665085688}\nD: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 
36.464458087386475, \"scale\": 0.8338243238440678}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nB: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, \"translation_dy\": -32.06450758946801, \"scale\": 1.1967157159259998}\nC: {\"rotation_angle\": 134.66606893121838, \"translation_dx\": 30.71289427748178, \"translation_dy\": 31.00111281943242, \"scale\": 0.9716368665085688}\nD: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_82_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_82_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: {\"rotation_angle\": 178.3015459217881, \"translation_dx\": 2.1592483018484785, \"translation_dy\": -86.15095567396924, \"scale\": 1.206185814877298}\nC: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nD: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, 
\"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: {\"rotation_angle\": 178.3015459217881, \"translation_dx\": 2.1592483018484785, \"translation_dy\": -86.15095567396924, \"scale\": 1.206185814877298}\nC: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nD: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_83_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_83_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nB: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}\nC: {\"rotation_angle\": -4.364889011784271, \"translation_dx\": 74.89385338851659, \"translation_dy\": 29.259521498010997, \"scale\": 1.2877948451877137}\nD: {\"rotation_angle\": 106.62912259997893, \"translation_dx\": 
-62.19399566166837, \"translation_dy\": -63.078041204745844, \"scale\": 1.4577244189370733}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nB: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}\nC: {\"rotation_angle\": -4.364889011784271, \"translation_dx\": 74.89385338851659, \"translation_dy\": 29.259521498010997, \"scale\": 1.2877948451877137}\nD: {\"rotation_angle\": 106.62912259997893, \"translation_dx\": -62.19399566166837, \"translation_dy\": -63.078041204745844, \"scale\": 1.4577244189370733}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_84_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_84_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nB: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}\nC: {\"rotation_angle\": 48.71833122181758, \"translation_dx\": -105.22683210092106, \"translation_dy\": -63.34096559919908, \"scale\": 0.7204478932238769}\nD: {\"rotation_angle\": 
173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nB: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}\nC: {\"rotation_angle\": 48.71833122181758, \"translation_dx\": -105.22683210092106, \"translation_dy\": -63.34096559919908, \"scale\": 0.7204478932238769}\nD: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_85_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_85_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nB: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}\nC: {\"rotation_angle\": -0.45613579718829556, \"translation_dx\": 98.71619714866841, \"translation_dy\": 70.1100439641223, \"scale\": 0.6491919010173006}\nD: 
{\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nB: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}\nC: {\"rotation_angle\": -0.45613579718829556, \"translation_dx\": 98.71619714866841, \"translation_dy\": 70.1100439641223, \"scale\": 0.6491919010173006}\nD: {\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_86_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_86_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -32.96407209098831, \"translation_dx\": -27.518946535455143, \"translation_dy\": 2.5370159689679213, \"scale\": 1.259328459428434}\nB: {\"rotation_angle\": -50.19218790392131, \"translation_dx\": -27.31734251737683, \"translation_dy\": 8.514724344494553, \"scale\": 1.0874517053433594}\nC: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, 
\"scale\": 1.489574484959727}\nD: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -32.96407209098831, \"translation_dx\": -27.518946535455143, \"translation_dy\": 2.5370159689679213, \"scale\": 1.259328459428434}\nB: {\"rotation_angle\": -50.19218790392131, \"translation_dx\": -27.31734251737683, \"translation_dy\": 8.514724344494553, \"scale\": 1.0874517053433594}\nC: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}\nD: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_87_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_87_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}\nB: {\"rotation_angle\": -137.69110011960493, \"translation_dx\": -11.76155657697187, \"translation_dy\": 15.916526895382503, \"scale\": 1.164396221339579}\nC: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": 
-17.359661719977098, \"scale\": 1.0858344969275349}\nD: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}\nB: {\"rotation_angle\": -137.69110011960493, \"translation_dx\": -11.76155657697187, \"translation_dy\": 15.916526895382503, \"scale\": 1.164396221339579}\nC: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": -17.359661719977098, \"scale\": 1.0858344969275349}\nD: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_88_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_88_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nB: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nC: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": 
-96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 1.3017377870800058}\nD: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nB: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nC: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": -96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 1.3017377870800058}\nD: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_89_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_89_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -163.07830945514343, \"translation_dx\": 107.25371607945826, \"translation_dy\": 44.19319462200147, \"scale\": 1.0330497674624493}\nB: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nC: {\"rotation_angle\": 
179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nD: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -163.07830945514343, \"translation_dx\": 107.25371607945826, \"translation_dy\": 44.19319462200147, \"scale\": 1.0330497674624493}\nB: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nC: {\"rotation_angle\": 179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nD: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_90_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_90_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}\nB: {\"rotation_angle\": 171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}\nC: 
{\"rotation_angle\": 159.18509857624855, \"translation_dx\": 94.5972413522399, \"translation_dy\": -87.01463724053234, \"scale\": 0.7914176569510836}\nD: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}\nB: {\"rotation_angle\": 171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}\nC: {\"rotation_angle\": 159.18509857624855, \"translation_dx\": 94.5972413522399, \"translation_dy\": -87.01463724053234, \"scale\": 0.7914176569510836}\nD: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_91_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_91_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -137.69110011960493, \"translation_dx\": -11.76155657697187, \"translation_dy\": 15.916526895382503, \"scale\": 1.164396221339579}\nB: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, \"scale\": 
0.6972670428813228}\nC: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}\nD: {\"rotation_angle\": 32.25033099080062, \"translation_dx\": -33.246475706714875, \"translation_dy\": -9.848772328845214, \"scale\": 0.986502265576198}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -137.69110011960493, \"translation_dx\": -11.76155657697187, \"translation_dy\": 15.916526895382503, \"scale\": 1.164396221339579}\nB: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, \"scale\": 0.6972670428813228}\nC: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}\nD: {\"rotation_angle\": 32.25033099080062, \"translation_dx\": -33.246475706714875, \"translation_dy\": -9.848772328845214, \"scale\": 0.986502265576198}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_92_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_92_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 
11.25207260435026, \"scale\": 1.2682750116992958}\nC: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, \"scale\": 0.6972670428813228}\nD: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 11.25207260435026, \"scale\": 1.2682750116992958}\nC: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, \"scale\": 0.6972670428813228}\nD: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_93_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_93_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 161.7596265938729, \"translation_dx\": -9.170216354863072, \"translation_dy\": -19.23222492696047, \"scale\": 1.1821087248622173}\nB: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, 
\"translation_dy\": 24.014319900588845, \"scale\": 1.3204557483507742}\nC: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}\nD: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 161.7596265938729, \"translation_dx\": -9.170216354863072, \"translation_dy\": -19.23222492696047, \"scale\": 1.1821087248622173}\nB: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, \"translation_dy\": 24.014319900588845, \"scale\": 1.3204557483507742}\nC: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}\nD: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_94_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_94_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}\nB: {\"rotation_angle\": -127.2688410750471, 
\"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nC: {\"rotation_angle\": 36.19361803007027, \"translation_dx\": -50.40071399889004, \"translation_dy\": -85.39533040467117, \"scale\": 0.6522247071940848}\nD: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}\nB: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nC: {\"rotation_angle\": 36.19361803007027, \"translation_dx\": -50.40071399889004, \"translation_dy\": -85.39533040467117, \"scale\": 0.6522247071940848}\nD: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_95_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_95_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: 
{\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nC: {\"rotation_angle\": -173.49565975712173, \"translation_dx\": 30.5303454517925, \"translation_dy\": 77.86216107455405, \"scale\": 1.067173806992701}\nD: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nC: {\"rotation_angle\": -173.49565975712173, \"translation_dx\": 30.5303454517925, \"translation_dy\": 77.86216107455405, \"scale\": 1.067173806992701}\nD: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_96_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_96_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -50.19218790392131, \"translation_dx\": -27.31734251737683, \"translation_dy\": 8.514724344494553, \"scale\": 
1.0874517053433594}\nB: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}\nC: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, \"scale\": 0.6972670428813228}\nD: {\"rotation_angle\": -115.34417090075787, \"translation_dx\": -118.63121430094503, \"translation_dy\": 41.63412082488844, \"scale\": 0.9001856788272352}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -50.19218790392131, \"translation_dx\": -27.31734251737683, \"translation_dy\": 8.514724344494553, \"scale\": 1.0874517053433594}\nB: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}\nC: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, \"scale\": 0.6972670428813228}\nD: {\"rotation_angle\": -115.34417090075787, \"translation_dx\": -118.63121430094503, \"translation_dy\": 41.63412082488844, \"scale\": 0.9001856788272352}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_97_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_97_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": -103.12901971602221, \"translation_dy\": 
94.89161684072867, \"scale\": 1.2295411735859756}\nB: {\"rotation_angle\": -164.42105085554024, \"translation_dx\": 53.959081038248144, \"translation_dy\": -27.892450679654182, \"scale\": 1.1369631742880046}\nC: {\"rotation_angle\": 103.56580652114087, \"translation_dx\": -76.88940345297716, \"translation_dy\": -3.4544443607121593, \"scale\": 1.3949152683659345}\nD: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": -103.12901971602221, \"translation_dy\": 94.89161684072867, \"scale\": 1.2295411735859756}\nB: {\"rotation_angle\": -164.42105085554024, \"translation_dx\": 53.959081038248144, \"translation_dy\": -27.892450679654182, \"scale\": 1.1369631742880046}\nC: {\"rotation_angle\": 103.56580652114087, \"translation_dx\": -76.88940345297716, \"translation_dy\": -3.4544443607121593, \"scale\": 1.3949152683659345}\nD: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_98_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_98_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": 
-7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}\nB: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, \"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nC: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nD: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}\nB: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, \"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nC: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nD: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_99_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_99_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 
107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nB: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nC: {\"rotation_angle\": -124.27587082376021, \"translation_dx\": -88.19288051455345, \"translation_dy\": 24.145134775980125, \"scale\": 1.4414104211047083}\nD: {\"rotation_angle\": 8.705969178532513, \"translation_dx\": -108.98578445869327, \"translation_dy\": -85.91179454441009, \"scale\": 0.5132717751865925}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nB: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nC: {\"rotation_angle\": -124.27587082376021, \"translation_dx\": -88.19288051455345, \"translation_dy\": 24.145134775980125, \"scale\": 1.4414104211047083}\nD: {\"rotation_angle\": 8.705969178532513, \"translation_dx\": -108.98578445869327, \"translation_dy\": -85.91179454441009, \"scale\": 0.5132717751865925}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_100_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_100_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: 
{\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}\nC: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nD: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}\nC: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nD: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_101_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_101_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", 
+ "options": "A: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nB: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nC: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}\nD: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nB: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nC: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}\nD: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_102_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_102_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", 
+ "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nB: {\"rotation_angle\": -153.3687774434925, \"translation_dx\": 50.92336593606055, \"translation_dy\": -56.81603844715568, \"scale\": 1.398231264497651}\nC: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}\nD: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nB: {\"rotation_angle\": -153.3687774434925, \"translation_dx\": 50.92336593606055, \"translation_dy\": -56.81603844715568, \"scale\": 1.398231264497651}\nC: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}\nD: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_103_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_103_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + 
"visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 0.7934334422653395}\nC: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nD: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. 
\nSelect from the following choices.\nA: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 0.7934334422653395}\nC: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nD: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_104_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_104_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nB: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, \"scale\": 1.0041476956174793}\nC: {\"rotation_angle\": 171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}\nD: {\"rotation_angle\": 1.3693998936690264, \"translation_dx\": -71.94174431428723, \"translation_dy\": 25.661133958182248, \"scale\": 1.468813327861592}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the 
transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nB: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, \"scale\": 1.0041476956174793}\nC: {\"rotation_angle\": 171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}\nD: {\"rotation_angle\": 1.3693998936690264, \"translation_dx\": -71.94174431428723, \"translation_dy\": 25.661133958182248, \"scale\": 1.468813327861592}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_105_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_105_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": 33.36657735274014, \"translation_dx\": -110.42271839281483, \"translation_dy\": 35.783043595963875, \"scale\": 1.1017945125321793}\nD: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict 
the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": 33.36657735274014, \"translation_dx\": -110.42271839281483, \"translation_dy\": 35.783043595963875, \"scale\": 1.1017945125321793}\nD: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_106_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_106_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nB: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nC: {\"rotation_angle\": -164.42105085554024, \"translation_dx\": 53.959081038248144, \"translation_dy\": -27.892450679654182, \"scale\": 1.1369631742880046}\nD: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, 
translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nB: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nC: {\"rotation_angle\": -164.42105085554024, \"translation_dx\": 53.959081038248144, \"translation_dy\": -27.892450679654182, \"scale\": 1.1369631742880046}\nD: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_107_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_107_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -164.42105085554024, \"translation_dx\": 53.959081038248144, \"translation_dy\": -27.892450679654182, \"scale\": 1.1369631742880046}\nB: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}\nC: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nD: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial 
transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -164.42105085554024, \"translation_dx\": 53.959081038248144, \"translation_dy\": -27.892450679654182, \"scale\": 1.1369631742880046}\nB: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}\nC: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nD: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_108_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_108_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nB: {\"rotation_angle\": -4.364889011784271, \"translation_dx\": 74.89385338851659, \"translation_dy\": 29.259521498010997, \"scale\": 1.2877948451877137}\nC: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nD: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting 
scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nB: {\"rotation_angle\": -4.364889011784271, \"translation_dx\": 74.89385338851659, \"translation_dy\": 29.259521498010997, \"scale\": 1.2877948451877137}\nC: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nD: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_109_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_109_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 49.896013394485834, \"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nB: {\"rotation_angle\": 133.22970053001933, \"translation_dx\": 30.83867253278636, \"translation_dy\": 9.987607615316023, \"scale\": 0.9746642566652708}\nC: {\"rotation_angle\": 36.19361803007027, \"translation_dx\": -50.40071399889004, \"translation_dy\": -85.39533040467117, \"scale\": 0.6522247071940848}\nD: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": 
"Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 49.896013394485834, \"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nB: {\"rotation_angle\": 133.22970053001933, \"translation_dx\": 30.83867253278636, \"translation_dy\": 9.987607615316023, \"scale\": 0.9746642566652708}\nC: {\"rotation_angle\": 36.19361803007027, \"translation_dx\": -50.40071399889004, \"translation_dy\": -85.39533040467117, \"scale\": 0.6522247071940848}\nD: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_110_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_110_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nB: {\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}", + "question": "Please compute the type and parameters of the spatial transformation between these 
two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nB: {\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_111_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_111_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nB: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nC: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": -128.74497971799806, \"translation_dx\": -55.835206426128764, \"translation_dy\": 54.178252983369276, \"scale\": 0.8905979693160588}", + "question": "Please compute the type and parameters of the 
spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nB: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nC: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": -128.74497971799806, \"translation_dx\": -55.835206426128764, \"translation_dy\": 54.178252983369276, \"scale\": 0.8905979693160588}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_112_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_112_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 127.0599036632886, \"translation_dx\": -26.73103881794438, \"translation_dy\": 16.785326739741976, \"scale\": 1.1214331244941351}\nB: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nC: {\"rotation_angle\": -174.94064668132228, \"translation_dx\": 73.73079207136513, \"translation_dy\": 58.25534486945551, \"scale\": 1.178357936048121}\nD: {\"rotation_angle\": 168.86687879669455, \"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}", + "question": "Please 
compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 127.0599036632886, \"translation_dx\": -26.73103881794438, \"translation_dy\": 16.785326739741976, \"scale\": 1.1214331244941351}\nB: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nC: {\"rotation_angle\": -174.94064668132228, \"translation_dx\": 73.73079207136513, \"translation_dy\": 58.25534486945551, \"scale\": 1.178357936048121}\nD: {\"rotation_angle\": 168.86687879669455, \"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_113_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_113_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nB: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nC: {\"rotation_angle\": -83.37935946961306, \"translation_dx\": -63.440112200681114, \"translation_dy\": -47.62616010479583, \"scale\": 0.6518247509991958}\nD: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 
0.6919535578881184}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nB: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nC: {\"rotation_angle\": -83.37935946961306, \"translation_dx\": -63.440112200681114, \"translation_dy\": -47.62616010479583, \"scale\": 0.6518247509991958}\nD: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_114_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_114_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": -103.12901971602221, \"translation_dy\": 94.89161684072867, \"scale\": 1.2295411735859756}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 0.7934334422653395}\nC: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nD: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 
85.91610719889127, \"scale\": 1.371999627635525}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": -103.12901971602221, \"translation_dy\": 94.89161684072867, \"scale\": 1.2295411735859756}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 0.7934334422653395}\nC: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nD: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_115_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_115_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nB: {\"rotation_angle\": 49.896013394485834, \"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nC: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nD: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": 
-93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nB: {\"rotation_angle\": 49.896013394485834, \"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nC: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nD: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_116_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_116_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nB: {\"rotation_angle\": -46.75272698463425, \"translation_dx\": 16.424107524155175, \"translation_dy\": -60.683488552754085, \"scale\": 1.375025476214386}\nC: {\"rotation_angle\": 4.601729825002167, \"translation_dx\": -92.34842360064926, \"translation_dy\": 78.34726427877602, \"scale\": 0.7620115680057987}\nD: {\"rotation_angle\": 
-75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nB: {\"rotation_angle\": -46.75272698463425, \"translation_dx\": 16.424107524155175, \"translation_dy\": -60.683488552754085, \"scale\": 1.375025476214386}\nC: {\"rotation_angle\": 4.601729825002167, \"translation_dx\": -92.34842360064926, \"translation_dy\": 78.34726427877602, \"scale\": 0.7620115680057987}\nD: {\"rotation_angle\": -75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_117_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_117_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nC: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nD: 
{\"rotation_angle\": 110.02825264959768, \"translation_dx\": -53.26387197670213, \"translation_dy\": 88.43864976013427, \"scale\": 1.4833645013101147}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nC: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nD: {\"rotation_angle\": 110.02825264959768, \"translation_dx\": -53.26387197670213, \"translation_dy\": 88.43864976013427, \"scale\": 1.4833645013101147}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_118_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_118_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}\nB: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": -17.786253698900993, \"scale\": 1.4583062003808291}\nC: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, 
\"scale\": 1.0703563169477137}\nD: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}\nB: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": -17.786253698900993, \"scale\": 1.4583062003808291}\nC: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nD: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_119_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_119_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -110.46391589612124, \"translation_dx\": -77.96644542647721, \"translation_dy\": -50.23500265461973, \"scale\": 0.7651088884143488}\nB: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, 
\"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 1.2561938294527635}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -110.46391589612124, \"translation_dx\": -77.96644542647721, \"translation_dy\": -50.23500265461973, \"scale\": 0.7651088884143488}\nB: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 1.2561938294527635}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_120_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_120_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nB: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nC: {\"rotation_angle\": -95.56761680572791, 
\"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}\nD: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nB: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nC: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}\nD: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_121_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_121_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nB: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nC: {\"rotation_angle\": 
-148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nD: {\"rotation_angle\": -131.1795029858263, \"translation_dx\": 17.908074544940433, \"translation_dy\": 120.17637833747304, \"scale\": 0.9471882483559888}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nB: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nC: {\"rotation_angle\": -148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nD: {\"rotation_angle\": -131.1795029858263, \"translation_dx\": 17.908074544940433, \"translation_dy\": 120.17637833747304, \"scale\": 0.9471882483559888}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_122_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_122_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 70.46713054198463, \"translation_dx\": 21.906055640356044, \"translation_dy\": -12.161170387444017, \"scale\": 0.6983211043742098}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 
0.7934334422653395}\nC: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, \"scale\": 1.0041476956174793}\nD: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 70.46713054198463, \"translation_dx\": 21.906055640356044, \"translation_dy\": -12.161170387444017, \"scale\": 0.6983211043742098}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 0.7934334422653395}\nC: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, \"scale\": 1.0041476956174793}\nD: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_123_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_123_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": 
-94.68626204020723, \"scale\": 1.377433782383828}\nC: {\"rotation_angle\": -174.94064668132228, \"translation_dx\": 73.73079207136513, \"translation_dy\": 58.25534486945551, \"scale\": 1.178357936048121}\nD: {\"rotation_angle\": -128.74497971799806, \"translation_dx\": -55.835206426128764, \"translation_dy\": 54.178252983369276, \"scale\": 0.8905979693160588}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": -94.68626204020723, \"scale\": 1.377433782383828}\nC: {\"rotation_angle\": -174.94064668132228, \"translation_dx\": 73.73079207136513, \"translation_dy\": 58.25534486945551, \"scale\": 1.178357936048121}\nD: {\"rotation_angle\": -128.74497971799806, \"translation_dx\": -55.835206426128764, \"translation_dy\": 54.178252983369276, \"scale\": 0.8905979693160588}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_124_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_124_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 95.56102360167273, \"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}\nB: {\"rotation_angle\": 36.19361803007027, \"translation_dx\": 
-50.40071399889004, \"translation_dy\": -85.39533040467117, \"scale\": 0.6522247071940848}\nC: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}\nD: {\"rotation_angle\": -23.02063628299686, \"translation_dx\": -42.06347070905805, \"translation_dy\": 68.90308226059909, \"scale\": 0.7321107429069119}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 95.56102360167273, \"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}\nB: {\"rotation_angle\": 36.19361803007027, \"translation_dx\": -50.40071399889004, \"translation_dy\": -85.39533040467117, \"scale\": 0.6522247071940848}\nC: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}\nD: {\"rotation_angle\": -23.02063628299686, \"translation_dx\": -42.06347070905805, \"translation_dy\": 68.90308226059909, \"scale\": 0.7321107429069119}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_125_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_125_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, \"translation_dy\": 24.014319900588845, \"scale\": 1.3204557483507742}\nB: {\"rotation_angle\": 
37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nC: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nD: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, \"translation_dy\": 24.014319900588845, \"scale\": 1.3204557483507742}\nB: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nC: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nD: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_126_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_126_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 
0.9339901852292719}\nB: {\"rotation_angle\": -149.34069149386406, \"translation_dx\": 81.63420911320063, \"translation_dy\": -26.073567429384056, \"scale\": 1.427947630130646}\nC: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nD: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}\nB: {\"rotation_angle\": -149.34069149386406, \"translation_dx\": 81.63420911320063, \"translation_dy\": -26.073567429384056, \"scale\": 1.427947630130646}\nC: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nD: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_127_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_127_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": 
-63.78583022927253, \"scale\": 0.9304836306567924}\nB: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nC: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nD: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}\nB: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nC: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nD: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_128_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_128_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -163.07830945514343, \"translation_dx\": 
107.25371607945826, \"translation_dy\": 44.19319462200147, \"scale\": 1.0330497674624493}\nB: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}\nC: {\"rotation_angle\": 137.29869982747988, \"translation_dx\": 75.41375097241084, \"translation_dy\": 55.66358575693553, \"scale\": 1.1335508281242805}\nD: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -163.07830945514343, \"translation_dx\": 107.25371607945826, \"translation_dy\": 44.19319462200147, \"scale\": 1.0330497674624493}\nB: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}\nC: {\"rotation_angle\": 137.29869982747988, \"translation_dx\": 75.41375097241084, \"translation_dy\": 55.66358575693553, \"scale\": 1.1335508281242805}\nD: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_129_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_129_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 
153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}\nB: {\"rotation_angle\": 171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}\nC: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nD: {\"rotation_angle\": 172.84173099768327, \"translation_dx\": -36.82796075364796, \"translation_dy\": -15.346257103503191, \"scale\": 0.8112655094699114}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}\nB: {\"rotation_angle\": 171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}\nC: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nD: {\"rotation_angle\": 172.84173099768327, \"translation_dx\": -36.82796075364796, \"translation_dy\": -15.346257103503191, \"scale\": 0.8112655094699114}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_130_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_130_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + 
"options": "A: {\"rotation_angle\": -6.970858631484532, \"translation_dx\": -2.793256631611797, \"translation_dy\": 83.08133552847667, \"scale\": 1.4237697720578382}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": 88.199522854527, \"translation_dx\": 18.814421533590917, \"translation_dy\": -27.135307313502466, \"scale\": 1.37855935527965}\nD: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -6.970858631484532, \"translation_dx\": -2.793256631611797, \"translation_dy\": 83.08133552847667, \"scale\": 1.4237697720578382}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": 88.199522854527, \"translation_dx\": 18.814421533590917, \"translation_dy\": -27.135307313502466, \"scale\": 1.37855935527965}\nD: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_131_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_131_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + 
"source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nB: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nC: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nD: {\"rotation_angle\": 157.75388648393812, \"translation_dx\": 20.356281771878216, \"translation_dy\": 16.09866009065132, \"scale\": 0.523349135390574}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nB: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nC: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nD: {\"rotation_angle\": 157.75388648393812, \"translation_dx\": 20.356281771878216, \"translation_dy\": 16.09866009065132, \"scale\": 0.523349135390574}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_132_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_132_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + 
"visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nB: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nC: {\"rotation_angle\": -149.42147215379055, \"translation_dx\": 2.3444194857030283, \"translation_dy\": 35.92779325530762, \"scale\": 1.0223945055206394}\nD: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. 
\nSelect from the following choices.\nA: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nB: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nC: {\"rotation_angle\": -149.42147215379055, \"translation_dx\": 2.3444194857030283, \"translation_dy\": 35.92779325530762, \"scale\": 1.0223945055206394}\nD: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_133_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_133_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nB: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nC: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nD: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the 
transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nB: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nC: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nD: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_134_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_134_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nB: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nC: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nD: {\"rotation_angle\": 130.382151153576, \"translation_dx\": 48.77925626504499, \"translation_dy\": 54.89982459749416, \"scale\": 1.3647831130001666}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type 
and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nB: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nC: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nD: {\"rotation_angle\": 130.382151153576, \"translation_dx\": 48.77925626504499, \"translation_dy\": 54.89982459749416, \"scale\": 1.3647831130001666}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_135_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_135_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nB: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nC: {\"rotation_angle\": -124.27587082376021, \"translation_dx\": -88.19288051455345, \"translation_dy\": 24.145134775980125, \"scale\": 1.4414104211047083}\nD: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task 
is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nB: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nC: {\"rotation_angle\": -124.27587082376021, \"translation_dx\": -88.19288051455345, \"translation_dy\": 24.145134775980125, \"scale\": 1.4414104211047083}\nD: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_136_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_136_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nC: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": -94.68626204020723, \"scale\": 1.377433782383828}\nD: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., 
rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nC: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": -94.68626204020723, \"scale\": 1.377433782383828}\nD: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_137_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_137_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -50.19218790392131, \"translation_dx\": -27.31734251737683, \"translation_dy\": 8.514724344494553, \"scale\": 1.0874517053433594}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 0.7934334422653395}\nC: {\"rotation_angle\": -6.970858631484532, \"translation_dx\": -2.793256631611797, \"translation_dy\": 83.08133552847667, \"scale\": 1.4237697720578382}\nD: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a 
spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -50.19218790392131, \"translation_dx\": -27.31734251737683, \"translation_dy\": 8.514724344494553, \"scale\": 1.0874517053433594}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 0.7934334422653395}\nC: {\"rotation_angle\": -6.970858631484532, \"translation_dx\": -2.793256631611797, \"translation_dy\": 83.08133552847667, \"scale\": 1.4237697720578382}\nD: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_138_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_138_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nB: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nC: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting 
scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nB: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nC: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_139_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_139_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 172.84173099768327, \"translation_dx\": -36.82796075364796, \"translation_dy\": -15.346257103503191, \"scale\": 0.8112655094699114}\nB: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nC: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nD: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": 
"Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 172.84173099768327, \"translation_dx\": -36.82796075364796, \"translation_dy\": -15.346257103503191, \"scale\": 0.8112655094699114}\nB: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nC: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nD: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_140_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_140_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": 115.44035395260755, \"translation_dx\": 104.38539690843712, \"translation_dy\": -82.71757148170198, \"scale\": 0.6534862534786243}\nD: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}", + "question": "Please compute the type and parameters of the spatial transformation between these 
two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": 115.44035395260755, \"translation_dx\": 104.38539690843712, \"translation_dy\": -82.71757148170198, \"scale\": 0.6534862534786243}\nD: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_141_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_141_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nB: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nC: {\"rotation_angle\": -70.97525301082955, \"translation_dx\": -28.380848037876873, \"translation_dy\": 54.37723426674512, \"scale\": 0.9024922197892329}\nD: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}", + "question": "Please compute the type and parameters of the spatial 
transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nB: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nC: {\"rotation_angle\": -70.97525301082955, \"translation_dx\": -28.380848037876873, \"translation_dy\": 54.37723426674512, \"scale\": 0.9024922197892329}\nD: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_142_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_142_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nB: {\"rotation_angle\": -123.92621597373325, \"translation_dx\": 115.25994331141689, \"translation_dy\": -45.13111299141354, \"scale\": 1.164470344420729}\nC: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}\nD: {\"rotation_angle\": 55.990963226006784, \"translation_dx\": 71.2358057599877, \"translation_dy\": 22.751866785772563, \"scale\": 1.4964705985201703}", + "question": "Please compute the type and 
parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nB: {\"rotation_angle\": -123.92621597373325, \"translation_dx\": 115.25994331141689, \"translation_dy\": -45.13111299141354, \"scale\": 1.164470344420729}\nC: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}\nD: {\"rotation_angle\": 55.990963226006784, \"translation_dx\": 71.2358057599877, \"translation_dy\": 22.751866785772563, \"scale\": 1.4964705985201703}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_143_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_143_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nB: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nC: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, \"scale\": 1.0041476956174793}\nD: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}", + "question": 
"Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nB: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nC: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, \"scale\": 1.0041476956174793}\nD: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_144_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_144_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nB: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nC: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nD: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 
0.9834422929774667}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nB: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nC: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nD: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_145_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_145_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -6.38420562293993, \"translation_dx\": -106.80670691302902, \"translation_dy\": -3.5935098985529663, \"scale\": 1.3037846299861797}\nB: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}\nC: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nD: {\"rotation_angle\": 137.29869982747988, \"translation_dx\": 75.41375097241084, \"translation_dy\": 
55.66358575693553, \"scale\": 1.1335508281242805}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -6.38420562293993, \"translation_dx\": -106.80670691302902, \"translation_dy\": -3.5935098985529663, \"scale\": 1.3037846299861797}\nB: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}\nC: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nD: {\"rotation_angle\": 137.29869982747988, \"translation_dx\": 75.41375097241084, \"translation_dy\": 55.66358575693553, \"scale\": 1.1335508281242805}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_146_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_146_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nC: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nD: {\"rotation_angle\": -4.364889011784271, \"translation_dx\": 74.89385338851659, 
\"translation_dy\": 29.259521498010997, \"scale\": 1.2877948451877137}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nC: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nD: {\"rotation_angle\": -4.364889011784271, \"translation_dx\": 74.89385338851659, \"translation_dy\": 29.259521498010997, \"scale\": 1.2877948451877137}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_147_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_147_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nB: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nC: {\"rotation_angle\": 161.7596265938729, \"translation_dx\": -9.170216354863072, \"translation_dy\": -19.23222492696047, \"scale\": 1.1821087248622173}\nD: {\"rotation_angle\": -76.09611957445006, 
\"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nB: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nC: {\"rotation_angle\": 161.7596265938729, \"translation_dx\": -9.170216354863072, \"translation_dy\": -19.23222492696047, \"scale\": 1.1821087248622173}\nD: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_148_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_148_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -32.96407209098831, \"translation_dx\": -27.518946535455143, \"translation_dy\": 2.5370159689679213, \"scale\": 1.259328459428434}\nB: {\"rotation_angle\": -148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nC: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nD: 
{\"rotation_angle\": -6.258618837806779, \"translation_dx\": -117.56200624611057, \"translation_dy\": -84.92852320396813, \"scale\": 0.8703619649920769}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -32.96407209098831, \"translation_dx\": -27.518946535455143, \"translation_dy\": 2.5370159689679213, \"scale\": 1.259328459428434}\nB: {\"rotation_angle\": -148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nC: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nD: {\"rotation_angle\": -6.258618837806779, \"translation_dx\": -117.56200624611057, \"translation_dy\": -84.92852320396813, \"scale\": 0.8703619649920769}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_149_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_149_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}\nB: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nC: {\"rotation_angle\": 156.4647723112265, \"translation_dx\": -66.53886800122852, \"translation_dy\": 64.98500274528308, 
\"scale\": 1.1427015309184732}\nD: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}\nB: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nC: {\"rotation_angle\": 156.4647723112265, \"translation_dx\": -66.53886800122852, \"translation_dy\": 64.98500274528308, \"scale\": 1.1427015309184732}\nD: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_150_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_150_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": -17.359661719977098, \"scale\": 1.0858344969275349}\nB: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, 
\"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": 96.727171962103, \"translation_dx\": 36.81177221178956, \"translation_dy\": 18.012374651364837, \"scale\": 0.7274955443317854}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": -17.359661719977098, \"scale\": 1.0858344969275349}\nB: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": 96.727171962103, \"translation_dx\": 36.81177221178956, \"translation_dy\": 18.012374651364837, \"scale\": 0.7274955443317854}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_151_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_151_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, \"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nB: {\"rotation_angle\": 74.4727172984789, \"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, \"scale\": 1.4775593630739356}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 
79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": 95.56102360167273, \"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, \"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nB: {\"rotation_angle\": 74.4727172984789, \"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, \"scale\": 1.4775593630739356}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": 95.56102360167273, \"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_152_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_152_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nB: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}\nC: {\"rotation_angle\": 
133.22970053001933, \"translation_dx\": 30.83867253278636, \"translation_dy\": 9.987607615316023, \"scale\": 0.9746642566652708}\nD: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nB: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}\nC: {\"rotation_angle\": 133.22970053001933, \"translation_dx\": 30.83867253278636, \"translation_dy\": 9.987607615316023, \"scale\": 0.9746642566652708}\nD: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_153_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_153_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 103.56580652114087, \"translation_dx\": -76.88940345297716, \"translation_dy\": -3.4544443607121593, \"scale\": 1.3949152683659345}\nB: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nC: 
{\"rotation_angle\": -99.80397961792426, \"translation_dx\": 113.2252387398062, \"translation_dy\": -61.846052830557056, \"scale\": 1.080357872583317}\nD: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 103.56580652114087, \"translation_dx\": -76.88940345297716, \"translation_dy\": -3.4544443607121593, \"scale\": 1.3949152683659345}\nB: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nC: {\"rotation_angle\": -99.80397961792426, \"translation_dx\": 113.2252387398062, \"translation_dy\": -61.846052830557056, \"scale\": 1.080357872583317}\nD: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_154_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_154_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}\nB: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 
0.8747335302455691}\nC: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, \"translation_dy\": -32.06450758946801, \"scale\": 1.1967157159259998}\nD: {\"rotation_angle\": -153.3687774434925, \"translation_dx\": 50.92336593606055, \"translation_dy\": -56.81603844715568, \"scale\": 1.398231264497651}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}\nB: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}\nC: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, \"translation_dy\": -32.06450758946801, \"scale\": 1.1967157159259998}\nD: {\"rotation_angle\": -153.3687774434925, \"translation_dx\": 50.92336593606055, \"translation_dy\": -56.81603844715568, \"scale\": 1.398231264497651}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_155_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_155_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -153.3687774434925, \"translation_dx\": 50.92336593606055, \"translation_dy\": -56.81603844715568, \"scale\": 1.398231264497651}\nB: {\"rotation_angle\": -124.74198080809023, \"translation_dx\": -48.23531115232953, \"translation_dy\": 
52.62526617026404, \"scale\": 1.3484625774406969}\nC: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}\nD: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": -17.786253698900993, \"scale\": 1.4583062003808291}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -153.3687774434925, \"translation_dx\": 50.92336593606055, \"translation_dy\": -56.81603844715568, \"scale\": 1.398231264497651}\nB: {\"rotation_angle\": -124.74198080809023, \"translation_dx\": -48.23531115232953, \"translation_dy\": 52.62526617026404, \"scale\": 1.3484625774406969}\nC: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}\nD: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": -17.786253698900993, \"scale\": 1.4583062003808291}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_156_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_156_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}\nB: {\"rotation_angle\": -153.95647753312159, \"translation_dx\": 
64.08546266437509, \"translation_dy\": -34.554486291313935, \"scale\": 1.423360690418288}\nC: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": -35.71086816730676, \"scale\": 1.30648936857296}\nD: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}\nB: {\"rotation_angle\": -153.95647753312159, \"translation_dx\": 64.08546266437509, \"translation_dy\": -34.554486291313935, \"scale\": 1.423360690418288}\nC: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": -35.71086816730676, \"scale\": 1.30648936857296}\nD: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_157_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_157_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nB: {\"rotation_angle\": 157.75388648393812, 
\"translation_dx\": 20.356281771878216, \"translation_dy\": 16.09866009065132, \"scale\": 0.523349135390574}\nC: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}\nD: {\"rotation_angle\": 168.86687879669455, \"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nB: {\"rotation_angle\": 157.75388648393812, \"translation_dx\": 20.356281771878216, \"translation_dy\": 16.09866009065132, \"scale\": 0.523349135390574}\nC: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}\nD: {\"rotation_angle\": 168.86687879669455, \"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_158_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_158_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nB: 
{\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}\nC: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nD: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nB: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}\nC: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nD: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_159_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_159_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, 
\"scale\": 0.7338009245004858}\nB: {\"rotation_angle\": -162.34443008832744, \"translation_dx\": 11.222356042803995, \"translation_dy\": -20.913798214168963, \"scale\": 0.5876305148063811}\nC: {\"rotation_angle\": -41.748048059314925, \"translation_dx\": 84.2495675740148, \"translation_dy\": -81.02778113177463, \"scale\": 1.207158201764622}\nD: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nB: {\"rotation_angle\": -162.34443008832744, \"translation_dx\": 11.222356042803995, \"translation_dy\": -20.913798214168963, \"scale\": 0.5876305148063811}\nC: {\"rotation_angle\": -41.748048059314925, \"translation_dx\": 84.2495675740148, \"translation_dy\": -81.02778113177463, \"scale\": 1.207158201764622}\nD: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_160_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_160_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, 
\"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nB: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nC: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nB: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nC: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_161_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_161_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 4.601729825002167, 
\"translation_dx\": -92.34842360064926, \"translation_dy\": 78.34726427877602, \"scale\": 0.7620115680057987}\nB: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nC: {\"rotation_angle\": -110.46391589612124, \"translation_dx\": -77.96644542647721, \"translation_dy\": -50.23500265461973, \"scale\": 0.7651088884143488}\nD: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 4.601729825002167, \"translation_dx\": -92.34842360064926, \"translation_dy\": 78.34726427877602, \"scale\": 0.7620115680057987}\nB: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nC: {\"rotation_angle\": -110.46391589612124, \"translation_dx\": -77.96644542647721, \"translation_dy\": -50.23500265461973, \"scale\": 0.7651088884143488}\nD: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_162_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_162_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: 
{\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nB: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nC: {\"rotation_angle\": -162.34443008832744, \"translation_dx\": 11.222356042803995, \"translation_dy\": -20.913798214168963, \"scale\": 0.5876305148063811}\nD: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nB: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nC: {\"rotation_angle\": -162.34443008832744, \"translation_dx\": 11.222356042803995, \"translation_dy\": -20.913798214168963, \"scale\": 0.5876305148063811}\nD: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_163_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_163_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + 
"source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nB: {\"rotation_angle\": -162.34443008832744, \"translation_dx\": 11.222356042803995, \"translation_dy\": -20.913798214168963, \"scale\": 0.5876305148063811}\nC: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}\nD: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nB: {\"rotation_angle\": -162.34443008832744, \"translation_dx\": 11.222356042803995, \"translation_dy\": -20.913798214168963, \"scale\": 0.5876305148063811}\nC: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}\nD: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_164_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_164_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + 
"visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nB: {\"rotation_angle\": 23.955007488404988, \"translation_dx\": 90.0018582930472, \"translation_dy\": 38.03553582875617, \"scale\": 1.3380437802347522}\nC: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}\nD: {\"rotation_angle\": 157.75388648393812, \"translation_dx\": 20.356281771878216, \"translation_dy\": 16.09866009065132, \"scale\": 0.523349135390574}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. 
\nSelect from the following choices.\nA: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nB: {\"rotation_angle\": 23.955007488404988, \"translation_dx\": 90.0018582930472, \"translation_dy\": 38.03553582875617, \"scale\": 1.3380437802347522}\nC: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}\nD: {\"rotation_angle\": 157.75388648393812, \"translation_dx\": 20.356281771878216, \"translation_dy\": 16.09866009065132, \"scale\": 0.523349135390574}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_165_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_165_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nB: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nC: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nD: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the 
transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nB: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nC: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nD: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_166_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_166_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}\nB: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nC: {\"rotation_angle\": 28.728757892682808, \"translation_dx\": 12.065384659700086, \"translation_dy\": -119.64549643343977, \"scale\": 1.126100132224236}\nD: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict 
the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}\nB: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nC: {\"rotation_angle\": 28.728757892682808, \"translation_dx\": 12.065384659700086, \"translation_dy\": -119.64549643343977, \"scale\": 1.126100132224236}\nD: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_167_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_167_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}\nB: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), 
your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}\nB: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_168_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_168_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -44.902472769484746, \"translation_dx\": -36.85475324083902, \"translation_dy\": 36.81692000181951, \"scale\": 1.0769710077370194}\nB: {\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}\nC: {\"rotation_angle\": 32.25033099080062, \"translation_dx\": -33.246475706714875, \"translation_dy\": -9.848772328845214, \"scale\": 0.986502265576198}\nD: {\"rotation_angle\": 55.990963226006784, \"translation_dx\": 71.2358057599877, \"translation_dy\": 22.751866785772563, \"scale\": 1.4964705985201703}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., 
rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -44.902472769484746, \"translation_dx\": -36.85475324083902, \"translation_dy\": 36.81692000181951, \"scale\": 1.0769710077370194}\nB: {\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}\nC: {\"rotation_angle\": 32.25033099080062, \"translation_dx\": -33.246475706714875, \"translation_dy\": -9.848772328845214, \"scale\": 0.986502265576198}\nD: {\"rotation_angle\": 55.990963226006784, \"translation_dx\": 71.2358057599877, \"translation_dy\": 22.751866785772563, \"scale\": 1.4964705985201703}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_169_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_169_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 1.3327657238113826}\nB: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": -17.786253698900993, \"scale\": 1.4583062003808291}\nC: {\"rotation_angle\": 172.84173099768327, \"translation_dx\": -36.82796075364796, \"translation_dy\": -15.346257103503191, \"scale\": 0.8112655094699114}\nD: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 11.25207260435026, \"scale\": 1.2682750116992958}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a 
spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 1.3327657238113826}\nB: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": -17.786253698900993, \"scale\": 1.4583062003808291}\nC: {\"rotation_angle\": 172.84173099768327, \"translation_dx\": -36.82796075364796, \"translation_dy\": -15.346257103503191, \"scale\": 0.8112655094699114}\nD: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 11.25207260435026, \"scale\": 1.2682750116992958}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_170_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_170_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nB: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nC: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nD: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting 
scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nB: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nC: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nD: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_171_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_171_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nB: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nC: {\"rotation_angle\": -35.37165300247324, \"translation_dx\": -51.674784510203665, \"translation_dy\": 35.0550301640573, \"scale\": 1.181842779166554}\nD: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 1.2561938294527635}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given 
pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nB: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nC: {\"rotation_angle\": -35.37165300247324, \"translation_dx\": -51.674784510203665, \"translation_dy\": 35.0550301640573, \"scale\": 1.181842779166554}\nD: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 1.2561938294527635}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_172_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_172_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}\nB: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nC: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": -94.68626204020723, \"scale\": 1.377433782383828}\nD: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}", + "question": "Please compute the type and parameters of the spatial transformation between these two 
images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}\nB: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nC: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": -94.68626204020723, \"scale\": 1.377433782383828}\nD: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_173_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_173_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}\nB: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nC: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}\nD: {\"rotation_angle\": 51.652651058291696, \"translation_dx\": -79.60059266318888, \"translation_dy\": 40.24223939512936, \"scale\": 1.045377495061187}", + "question": "Please compute the type and parameters of the spatial 
transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}\nB: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nC: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}\nD: {\"rotation_angle\": 51.652651058291696, \"translation_dx\": -79.60059266318888, \"translation_dy\": 40.24223939512936, \"scale\": 1.045377495061187}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_174_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_174_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 160.04018122869564, \"translation_dx\": -10.031879581871024, \"translation_dy\": 74.10075881851205, \"scale\": 0.8976020445815951}\nB: {\"rotation_angle\": 134.22497079750707, \"translation_dx\": -56.33244292094708, \"translation_dy\": 12.15417280277697, \"scale\": 1.260404381889235}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}", + "question": "Please compute the type and 
parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 160.04018122869564, \"translation_dx\": -10.031879581871024, \"translation_dy\": 74.10075881851205, \"scale\": 0.8976020445815951}\nB: {\"rotation_angle\": 134.22497079750707, \"translation_dx\": -56.33244292094708, \"translation_dy\": 12.15417280277697, \"scale\": 1.260404381889235}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_175_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_175_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}\nB: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nC: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nD: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}", + "question": "Please 
compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}\nB: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nC: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nD: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_176_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_176_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}\nB: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nC: {\"rotation_angle\": 106.62912259997893, \"translation_dx\": -62.19399566166837, \"translation_dy\": -63.078041204745844, \"scale\": 1.4577244189370733}\nD: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}", + 
"question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}\nB: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nC: {\"rotation_angle\": 106.62912259997893, \"translation_dx\": -62.19399566166837, \"translation_dy\": -63.078041204745844, \"scale\": 1.4577244189370733}\nD: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_177_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_177_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -149.42147215379055, \"translation_dx\": 2.3444194857030283, \"translation_dy\": 35.92779325530762, \"scale\": 1.0223945055206394}\nB: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nC: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nD: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 
1.115345902394854}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -149.42147215379055, \"translation_dx\": 2.3444194857030283, \"translation_dy\": 35.92779325530762, \"scale\": 1.0223945055206394}\nB: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nC: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nD: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_178_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_178_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": -138.01409324857718, \"translation_dx\": -15.316687484355015, \"translation_dy\": 65.85955726482798, \"scale\": 0.7544815678306976}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": 
-34.70529343588741, \"scale\": 1.407305489874207}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": -138.01409324857718, \"translation_dx\": -15.316687484355015, \"translation_dy\": 65.85955726482798, \"scale\": 0.7544815678306976}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_179_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_179_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -103.5561502427767, \"translation_dx\": -75.76940431238745, \"translation_dy\": -48.3479107136017, \"scale\": 1.0522987713432983}\nB: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, 
\"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -103.5561502427767, \"translation_dx\": -75.76940431238745, \"translation_dy\": -48.3479107136017, \"scale\": 1.0522987713432983}\nB: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_180_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_180_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nB: {\"rotation_angle\": 134.22497079750707, \"translation_dx\": -56.33244292094708, \"translation_dy\": 12.15417280277697, \"scale\": 1.260404381889235}\nC: {\"rotation_angle\": -173.49565975712173, \"translation_dx\": 30.5303454517925, \"translation_dy\": 77.86216107455405, \"scale\": 1.067173806992701}\nD: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 
53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nB: {\"rotation_angle\": 134.22497079750707, \"translation_dx\": -56.33244292094708, \"translation_dy\": 12.15417280277697, \"scale\": 1.260404381889235}\nC: {\"rotation_angle\": -173.49565975712173, \"translation_dx\": 30.5303454517925, \"translation_dy\": 77.86216107455405, \"scale\": 1.067173806992701}\nD: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_181_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_181_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -6.38420562293993, \"translation_dx\": -106.80670691302902, \"translation_dy\": -3.5935098985529663, \"scale\": 1.3037846299861797}\nC: {\"rotation_angle\": 134.66606893121838, \"translation_dx\": 30.71289427748178, \"translation_dy\": 31.00111281943242, \"scale\": 0.9716368665085688}\nD: {\"rotation_angle\": 
171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -6.38420562293993, \"translation_dx\": -106.80670691302902, \"translation_dy\": -3.5935098985529663, \"scale\": 1.3037846299861797}\nC: {\"rotation_angle\": 134.66606893121838, \"translation_dx\": 30.71289427748178, \"translation_dy\": 31.00111281943242, \"scale\": 0.9716368665085688}\nD: {\"rotation_angle\": 171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_182_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_182_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, \"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nB: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}\nC: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 
1.0748600769835353}\nD: {\"rotation_angle\": -75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, \"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nB: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}\nC: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}\nD: {\"rotation_angle\": -75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_183_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_183_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}\nB: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, \"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nC: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": 
-61.557448223727356, \"scale\": 0.7338009245004858}\nD: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": -96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 1.3017377870800058}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}\nB: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, \"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nC: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nD: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": -96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 1.3017377870800058}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_184_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_184_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nB: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}\nC: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 
8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nD: {\"rotation_angle\": -149.34069149386406, \"translation_dx\": 81.63420911320063, \"translation_dy\": -26.073567429384056, \"scale\": 1.427947630130646}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nB: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}\nC: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nD: {\"rotation_angle\": -149.34069149386406, \"translation_dx\": 81.63420911320063, \"translation_dy\": -26.073567429384056, \"scale\": 1.427947630130646}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_185_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_185_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -61.308258156024195, \"translation_dx\": -92.42627707406731, \"translation_dy\": -21.076199203141364, \"scale\": 1.1133621977071444}\nB: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nC: {\"rotation_angle\": 
-0.45613579718829556, \"translation_dx\": 98.71619714866841, \"translation_dy\": 70.1100439641223, \"scale\": 0.6491919010173006}\nD: {\"rotation_angle\": -124.27587082376021, \"translation_dx\": -88.19288051455345, \"translation_dy\": 24.145134775980125, \"scale\": 1.4414104211047083}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -61.308258156024195, \"translation_dx\": -92.42627707406731, \"translation_dy\": -21.076199203141364, \"scale\": 1.1133621977071444}\nB: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nC: {\"rotation_angle\": -0.45613579718829556, \"translation_dx\": 98.71619714866841, \"translation_dy\": 70.1100439641223, \"scale\": 0.6491919010173006}\nD: {\"rotation_angle\": -124.27587082376021, \"translation_dx\": -88.19288051455345, \"translation_dy\": 24.145134775980125, \"scale\": 1.4414104211047083}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_186_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_186_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -163.07830945514343, \"translation_dx\": 107.25371607945826, \"translation_dy\": 44.19319462200147, \"scale\": 1.0330497674624493}\nB: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 
1.3403221872922475}\nC: {\"rotation_angle\": -148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nD: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, \"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -163.07830945514343, \"translation_dx\": 107.25371607945826, \"translation_dy\": 44.19319462200147, \"scale\": 1.0330497674624493}\nB: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}\nC: {\"rotation_angle\": -148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nD: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, \"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_187_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_187_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nB: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 
44.525876215713694, \"scale\": 0.7096671999666376}\nC: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, \"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nD: {\"rotation_angle\": 127.1396993936072, \"translation_dx\": -29.08894824101361, \"translation_dy\": -80.84475014775404, \"scale\": 1.2834497894588772}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nB: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}\nC: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, \"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nD: {\"rotation_angle\": 127.1396993936072, \"translation_dx\": -29.08894824101361, \"translation_dy\": -80.84475014775404, \"scale\": 1.2834497894588772}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_188_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_188_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -6.258618837806779, \"translation_dx\": -117.56200624611057, \"translation_dy\": -84.92852320396813, \"scale\": 0.8703619649920769}\nB: {\"rotation_angle\": -92.49508697379828, \"translation_dx\": 63.09853740086383, 
\"translation_dy\": 99.47995409556995, \"scale\": 0.9495145406508286}\nC: {\"rotation_angle\": -46.75272698463425, \"translation_dx\": 16.424107524155175, \"translation_dy\": -60.683488552754085, \"scale\": 1.375025476214386}\nD: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -6.258618837806779, \"translation_dx\": -117.56200624611057, \"translation_dy\": -84.92852320396813, \"scale\": 0.8703619649920769}\nB: {\"rotation_angle\": -92.49508697379828, \"translation_dx\": 63.09853740086383, \"translation_dy\": 99.47995409556995, \"scale\": 0.9495145406508286}\nC: {\"rotation_angle\": -46.75272698463425, \"translation_dx\": 16.424107524155175, \"translation_dy\": -60.683488552754085, \"scale\": 1.375025476214386}\nD: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_189_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_189_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nB: {\"rotation_angle\": 49.896013394485834, 
\"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nC: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}\nD: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 1.3327657238113826}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nB: {\"rotation_angle\": 49.896013394485834, \"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nC: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}\nD: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 1.3327657238113826}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_190_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_190_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nB: 
{\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}\nC: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nD: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nB: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}\nC: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nD: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_191_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_191_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 
0.608403973907593}\nB: {\"rotation_angle\": -99.80397961792426, \"translation_dx\": 113.2252387398062, \"translation_dy\": -61.846052830557056, \"scale\": 1.080357872583317}\nC: {\"rotation_angle\": -84.90425841207441, \"translation_dx\": -96.22975116611923, \"translation_dy\": -54.13037688992304, \"scale\": 1.161476925450186}\nD: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}\nB: {\"rotation_angle\": -99.80397961792426, \"translation_dx\": 113.2252387398062, \"translation_dy\": -61.846052830557056, \"scale\": 1.080357872583317}\nC: {\"rotation_angle\": -84.90425841207441, \"translation_dx\": -96.22975116611923, \"translation_dy\": -54.13037688992304, \"scale\": 1.161476925450186}\nD: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_192_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_192_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": 
-44.645046657688624, \"scale\": 1.4332006009229632}\nB: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nC: {\"rotation_angle\": 12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nD: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": -44.645046657688624, \"scale\": 1.4332006009229632}\nB: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nC: {\"rotation_angle\": 12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nD: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_193_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_193_1.jpg" + ], + "output": "C" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 137.29869982747988, \"translation_dx\": 
75.41375097241084, \"translation_dy\": 55.66358575693553, \"scale\": 1.1335508281242805}\nB: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}\nC: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nD: {\"rotation_angle\": -6.970858631484532, \"translation_dx\": -2.793256631611797, \"translation_dy\": 83.08133552847667, \"scale\": 1.4237697720578382}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 137.29869982747988, \"translation_dx\": 75.41375097241084, \"translation_dy\": 55.66358575693553, \"scale\": 1.1335508281242805}\nB: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}\nC: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nD: {\"rotation_angle\": -6.970858631484532, \"translation_dx\": -2.793256631611797, \"translation_dy\": 83.08133552847667, \"scale\": 1.4237697720578382}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_194_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_194_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 
-100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nB: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nC: {\"rotation_angle\": -92.49508697379828, \"translation_dx\": 63.09853740086383, \"translation_dy\": 99.47995409556995, \"scale\": 0.9495145406508286}\nD: {\"rotation_angle\": -61.308258156024195, \"translation_dx\": -92.42627707406731, \"translation_dy\": -21.076199203141364, \"scale\": 1.1133621977071444}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nB: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nC: {\"rotation_angle\": -92.49508697379828, \"translation_dx\": 63.09853740086383, \"translation_dy\": 99.47995409556995, \"scale\": 0.9495145406508286}\nD: {\"rotation_angle\": -61.308258156024195, \"translation_dx\": -92.42627707406731, \"translation_dy\": -21.076199203141364, \"scale\": 1.1133621977071444}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_195_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_195_1.jpg" + ], + "output": "B" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + 
"options": "A: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}\nB: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nC: {\"rotation_angle\": 48.71833122181758, \"translation_dx\": -105.22683210092106, \"translation_dy\": -63.34096559919908, \"scale\": 0.7204478932238769}\nD: {\"rotation_angle\": 88.199522854527, \"translation_dx\": 18.814421533590917, \"translation_dy\": -27.135307313502466, \"scale\": 1.37855935527965}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}\nB: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nC: {\"rotation_angle\": 48.71833122181758, \"translation_dx\": -105.22683210092106, \"translation_dy\": -63.34096559919908, \"scale\": 0.7204478932238769}\nD: {\"rotation_angle\": 88.199522854527, \"translation_dx\": 18.814421533590917, \"translation_dy\": -27.135307313502466, \"scale\": 1.37855935527965}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_196_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_196_1.jpg" + ], + "output": "A" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + 
"source": "COCO_spatial", + "options": "A: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": 179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nD: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": 179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nD: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_197_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_197_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": 
"natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}\nB: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nC: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nD: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. 
\nSelect from the following choices.\nA: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}\nB: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nC: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nD: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_198_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_198_1.jpg" + ], + "output": "D" + }, + { + "task": "Image_Spatial_Transformation_Estimation", + "visual_input_component": "natural image", + "source": "COCO_spatial", + "options": "A: {\"rotation_angle\": -70.97525301082955, \"translation_dx\": -28.380848037876873, \"translation_dy\": 54.37723426674512, \"scale\": 0.9024922197892329}\nB: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, \"scale\": 1.0041476956174793}\nC: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nD: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}", + "question": "Please compute the type and parameters of the spatial transformation between these two images.", + "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the 
transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -70.97525301082955, \"translation_dx\": -28.380848037876873, \"translation_dy\": 54.37723426674512, \"scale\": 0.9024922197892329}\nB: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, \"scale\": 1.0041476956174793}\nC: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nD: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}", + "input_image_path": [ + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_199_0.jpg", + "../MMIU-Benchmark/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_199_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.627, 0.2]\nB: [0.166, 0.657]\nC: [0.95, 0.907]\nD: [0.328, 0.477]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.627, 0.2]\nB: [0.166, 0.657]\nC: [0.95, 0.907]\nD: [0.328, 0.477]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_0_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_0_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.368, 0.265]\nB: [0.925, 0.128]\nC: [0.133, 0.261]\nD: [0.488, 0.101]", + "question": "What is the position coordinates of the point with coordinates ([0.366, 0.265]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.368, 0.265]\nB: [0.925, 0.128]\nC: [0.133, 0.261]\nD: [0.488, 0.101]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_1_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_1_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.398, 0.165]\nB: [0.606, 0.999]\nC: [0.955, 0.756]\nD: [0.976, 0.964]", + "question": "What is the position coordinates of the point with coordinates ([0.488, -0.073]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.398, 0.165]\nB: [0.606, 0.999]\nC: [0.955, 0.756]\nD: [0.976, 0.964]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_2_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_2_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.411, 0.483]\nB: [0.624, 0.13]\nC: [0.256, 0.845]\nD: [0.393, 0.328]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.411, 0.483]\nB: [0.624, 0.13]\nC: [0.256, 0.845]\nD: [0.393, 0.328]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_3_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_3_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.851, 0.69]\nB: [0.112, 0.164]\nC: [0.561, 0.3]\nD: [0.69, 0.205]", + "question": "What is the position coordinates of the point with coordinates ([0.572, 0.294]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.851, 0.69]\nB: [0.112, 0.164]\nC: [0.561, 0.3]\nD: [0.69, 0.205]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_4_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_4_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.71, 0.765]\nB: [0.039, 0.565]\nC: [0.599, 0.897]\nD: [0.077, 0.037]", + "question": "What is the position coordinates of the point with coordinates ([0.127, 0.205]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.71, 0.765]\nB: [0.039, 0.565]\nC: [0.599, 0.897]\nD: [0.077, 0.037]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_5_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_5_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.0, 0.0]\nB: [0.75, 0.266]\nC: [0.658, 0.765]\nD: [0.825, 0.377]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.75, 0.266]\nC: [0.658, 0.765]\nD: [0.825, 0.377]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_6_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_6_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.458, 0.112]\nB: [0.522, 0.216]\nC: [0.672, 0.493]\nD: [0.435, 0.891]", + "question": "What is the position coordinates of the point with coordinates ([0.392, 0.15]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.458, 0.112]\nB: [0.522, 0.216]\nC: [0.672, 0.493]\nD: [0.435, 0.891]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_7_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_7_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.0, 0.0]\nB: [0.055, 0.212]\nC: [0.926, 0.897]\nD: [0.088, 0.69]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.055, 0.212]\nC: [0.926, 0.897]\nD: [0.088, 0.69]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_8_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_8_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.572, 0.347]\nB: [0.822, 0.524]\nC: [0.668, 0.975]\nD: [0.228, 0.421]", + "question": "What is the position coordinates of the point with coordinates ([0.84, 0.359]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.572, 0.347]\nB: [0.822, 0.524]\nC: [0.668, 0.975]\nD: [0.228, 0.421]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_9_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_9_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.55, 0.157]\nB: [0.225, 0.407]\nC: [0.428, 0.202]\nD: [0.848, 0.045]", + "question": "What is the position coordinates of the point with coordinates ([0.195, 0.402]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.55, 0.157]\nB: [0.225, 0.407]\nC: [0.428, 0.202]\nD: [0.848, 0.045]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_10_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_10_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.606, 0.811]\nB: [0.463, 0.023]\nC: [0.307, 0.429]\nD: [0.789, 0.214]", + "question": "What is the position coordinates of the point with coordinates ([0.793, 0.216]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.606, 0.811]\nB: [0.463, 0.023]\nC: [0.307, 0.429]\nD: [0.789, 0.214]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_11_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_11_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.276, 0.532]\nB: [0.401, 0.534]\nC: [0.28, 0.157]\nD: [0.0, 0.0]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.276, 0.532]\nB: [0.401, 0.534]\nC: [0.28, 0.157]\nD: [0.0, 0.0]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_12_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_12_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.447, 0.29]\nB: [0.574, 0.304]\nC: [0.111, 0.034]\nD: [0.966, 0.262]", + "question": "What is the position coordinates of the point with coordinates ([0.574, 0.304]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.447, 0.29]\nB: [0.574, 0.304]\nC: [0.111, 0.034]\nD: [0.966, 0.262]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_13_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_13_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.769, 0.374]\nB: [0.69, -0.054]\nC: [0.182, 0.457]\nD: [0.423, 0.809]", + "question": "What is the position coordinates of the point with coordinates ([0.723, -0.019]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.769, 0.374]\nB: [0.69, -0.054]\nC: [0.182, 0.457]\nD: [0.423, 0.809]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_14_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_14_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.861, 0.924]\nB: [0.976, 0.801]\nC: [0.63, 0.946]\nD: [0.457, 0.566]", + "question": "What is the position coordinates of the point with coordinates ([0.491, 0.572]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.861, 0.924]\nB: [0.976, 0.801]\nC: [0.63, 0.946]\nD: [0.457, 0.566]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_15_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_15_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.737, 0.324]\nB: [0.346, 0.386]\nC: [0.464, 0.662]\nD: [0.24, 0.833]", + "question": "What is the position coordinates of the point with coordinates ([0.24, 0.833]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.737, 0.324]\nB: [0.346, 0.386]\nC: [0.464, 0.662]\nD: [0.24, 0.833]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_16_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_16_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.902, 0.889]\nB: [0.734, 0.179]\nC: [0.695, 0.313]\nD: [0.552, 0.586]", + "question": "What is the position coordinates of the point with coordinates ([0.552, 0.586]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.902, 0.889]\nB: [0.734, 0.179]\nC: [0.695, 0.313]\nD: [0.552, 0.586]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_17_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_17_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.015, 0.757]\nB: [0.493, 0.371]\nC: [0.002, 0.142]\nD: [0.438, 0.698]", + "question": "What is the position coordinates of the point with coordinates ([0.496, 0.371]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.015, 0.757]\nB: [0.493, 0.371]\nC: [0.002, 0.142]\nD: [0.438, 0.698]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_18_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_18_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.629, 0.185]\nB: [0.357, 0.413]\nC: [0.521, 0.95]\nD: [0.591, 0.415]", + "question": "What is the position coordinates of the point with coordinates ([0.598, 0.417]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.629, 0.185]\nB: [0.357, 0.413]\nC: [0.521, 0.95]\nD: [0.591, 0.415]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_19_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_19_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.244, 0.498]\nB: [0.749, 0.317]\nC: [0.76, 0.581]\nD: [0.806, 0.63]", + "question": "What is the position coordinates of the point with coordinates ([0.678, 0.324]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.244, 0.498]\nB: [0.749, 0.317]\nC: [0.76, 0.581]\nD: [0.806, 0.63]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_20_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_20_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.786, 0.763]\nB: [0.139, 0.661]\nC: [0.549, 0.391]\nD: [0.901, 0.478]", + "question": "What is the position coordinates of the point with coordinates ([0.577, 0.479]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.786, 0.763]\nB: [0.139, 0.661]\nC: [0.549, 0.391]\nD: [0.901, 0.478]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_21_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_21_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.84, 0.364]\nB: [0.664, 0.326]\nC: [0.643, 0.579]\nD: [0.486, 0.458]", + "question": "What is the position coordinates of the point with coordinates ([0.836, 0.364]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.84, 0.364]\nB: [0.664, 0.326]\nC: [0.643, 0.579]\nD: [0.486, 0.458]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_22_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_22_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.667, 0.104]\nB: [0.801, 0.792]\nC: [0.271, 0.317]\nD: [0.699, 0.539]", + "question": "What is the position coordinates of the point with coordinates ([0.631, 0.551]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.667, 0.104]\nB: [0.801, 0.792]\nC: [0.271, 0.317]\nD: [0.699, 0.539]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_23_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_23_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.433, 0.435]\nB: [0.509, 0.298]\nC: [0.517, 0.969]\nD: [0.096, 0.626]", + "question": "What is the position coordinates of the point with coordinates ([0.517, 0.969]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.433, 0.435]\nB: [0.509, 0.298]\nC: [0.517, 0.969]\nD: [0.096, 0.626]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_24_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_24_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.572, 0.447]\nB: [0.317, 0.394]\nC: [0.276, 0.148]\nD: [0.404, 0.225]", + "question": "What is the position coordinates of the point with coordinates ([0.571, 0.446]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.572, 0.447]\nB: [0.317, 0.394]\nC: [0.276, 0.148]\nD: [0.404, 0.225]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_25_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_25_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.082, 0.932]\nB: [0.086, 0.159]\nC: [0.711, 0.457]\nD: [0.056, 0.373]", + "question": "What is the position coordinates of the point with coordinates ([0.082, 0.932]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.082, 0.932]\nB: [0.086, 0.159]\nC: [0.711, 0.457]\nD: [0.056, 0.373]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_26_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_26_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.151, 0.743]\nB: [0.746, 0.222]\nC: [0.439, 0.384]\nD: [0.367, 0.888]", + "question": "What is the position coordinates of the point with coordinates ([0.717, 0.34]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.151, 0.743]\nB: [0.746, 0.222]\nC: [0.439, 0.384]\nD: [0.367, 0.888]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_27_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_27_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.215, 0.968]\nB: [0.558, 0.522]\nC: [0.967, 0.723]\nD: [0.212, 0.809]", + "question": "What is the position coordinates of the point with coordinates ([0.584, 0.596]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.215, 0.968]\nB: [0.558, 0.522]\nC: [0.967, 0.723]\nD: [0.212, 0.809]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_28_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_28_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.413, 0.07]\nB: [0.437, 0.318]\nC: [0.155, 0.833]\nD: [0.607, 0.498]", + "question": "What is the position coordinates of the point with coordinates ([0.525, 0.482]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.413, 0.07]\nB: [0.437, 0.318]\nC: [0.155, 0.833]\nD: [0.607, 0.498]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_29_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_29_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.438, 0.097]\nB: [0.631, 0.018]\nC: [0.215, 0.313]\nD: [0.263, 0.723]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.438, 0.097]\nB: [0.631, 0.018]\nC: [0.215, 0.313]\nD: [0.263, 0.723]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_30_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_30_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.896, 0.061]\nB: [0.167, 0.451]\nC: [0.216, 0.513]\nD: [0.57, 0.361]", + "question": "What is the position coordinates of the point with coordinates ([0.569, 0.361]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.896, 0.061]\nB: [0.167, 0.451]\nC: [0.216, 0.513]\nD: [0.57, 0.361]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_31_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_31_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.917, 0.582]\nB: [0.858, 0.833]\nC: [0.962, 0.955]\nD: [0.285, 0.385]", + "question": "What is the position coordinates of the point with coordinates ([0.285, 0.385]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.917, 0.582]\nB: [0.858, 0.833]\nC: [0.962, 0.955]\nD: [0.285, 0.385]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_32_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_32_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.65, 0.016]\nB: [0.761, 0.985]\nC: [0.538, 0.359]\nD: [0.842, 0.025]", + "question": "What is the position coordinates of the point with coordinates ([0.537, 0.35]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.65, 0.016]\nB: [0.761, 0.985]\nC: [0.538, 0.359]\nD: [0.842, 0.025]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_33_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_33_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.9, 0.904]\nB: [0.664, 0.466]\nC: [0.273, 0.03]\nD: [0.393, 0.275]", + "question": "What is the position coordinates of the point with coordinates ([0.427, 0.335]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.9, 0.904]\nB: [0.664, 0.466]\nC: [0.273, 0.03]\nD: [0.393, 0.275]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_34_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_34_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.051, 0.768]\nB: [0.363, 0.364]\nC: [0.376, 0.685]\nD: [0.454, 0.177]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.051, 0.768]\nB: [0.363, 0.364]\nC: [0.376, 0.685]\nD: [0.454, 0.177]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_35_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_35_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.086, 0.538]\nB: [0.209, 0.589]\nC: [0.727, 0.366]\nD: [0.529, 0.299]", + "question": "What is the position coordinates of the point with coordinates ([0.789, 0.359]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.086, 0.538]\nB: [0.209, 0.589]\nC: [0.727, 0.366]\nD: [0.529, 0.299]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_36_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_36_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.5, 0.007]\nB: [0.731, 0.113]\nC: [0.636, 0.642]\nD: [0.325, 0.315]", + "question": "What is the position coordinates of the point with coordinates ([0.5, 0.007]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.5, 0.007]\nB: [0.731, 0.113]\nC: [0.636, 0.642]\nD: [0.325, 0.315]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_37_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_37_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.0, 0.0]\nB: [0.032, 0.829]\nC: [0.507, 0.48]\nD: [0.697, 0.839]", + "question": "What is the position coordinates of the point with coordinates ([0.296, 0.358]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.032, 0.829]\nC: [0.507, 0.48]\nD: [0.697, 0.839]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_38_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_38_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.773, 0.291]\nB: [0.256, 0.091]\nC: [0.561, 0.908]\nD: [0.572, 0.294]", + "question": "What is the position coordinates of the point with coordinates ([0.572, 0.294]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.773, 0.291]\nB: [0.256, 0.091]\nC: [0.561, 0.908]\nD: [0.572, 0.294]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_39_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_39_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.278, 0.564]\nB: [0.995, 0.367]\nC: [0.923, 0.335]\nD: [0.942, 0.46]", + "question": "What is the position coordinates of the point with coordinates ([0.995, 0.367]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.278, 0.564]\nB: [0.995, 0.367]\nC: [0.923, 0.335]\nD: [0.942, 0.46]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_40_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_40_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.791, 0.587]\nB: [0.006, 0.092]\nC: [0.454, 0.459]\nD: [0.339, 0.211]", + "question": "What is the position coordinates of the point with coordinates ([0.339, 0.211]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.791, 0.587]\nB: [0.006, 0.092]\nC: [0.454, 0.459]\nD: [0.339, 0.211]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_41_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_41_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.834, 0.042]\nB: [0.0, 0.0]\nC: [0.657, 0.031]\nD: [0.366, 0.215]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.834, 0.042]\nB: [0.0, 0.0]\nC: [0.657, 0.031]\nD: [0.366, 0.215]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_42_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_42_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.522, 0.936]\nB: [0.431, 0.505]\nC: [0.056, 0.43]\nD: [0.445, 0.055]", + "question": "What is the position coordinates of the point with coordinates ([0.465, 0.516]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.522, 0.936]\nB: [0.431, 0.505]\nC: [0.056, 0.43]\nD: [0.445, 0.055]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_43_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_43_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.441, 0.076]\nB: [0.638, 0.275]\nC: [0.844, 0.793]\nD: [0.485, 0.944]", + "question": "What is the position coordinates of the point with coordinates ([0.638, 0.276]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.441, 0.076]\nB: [0.638, 0.275]\nC: [0.844, 0.793]\nD: [0.485, 0.944]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_44_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_44_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.464, 0.801]\nB: [0.453, 0.251]\nC: [0.254, 0.642]\nD: [0.099, 0.252]", + "question": "What is the position coordinates of the point with coordinates ([0.254, 0.642]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.464, 0.801]\nB: [0.453, 0.251]\nC: [0.254, 0.642]\nD: [0.099, 0.252]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_45_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_45_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.281, 0.178]\nB: [0.162, 0.715]\nC: [0.761, 0.046]\nD: [0.557, 0.001]", + "question": "What is the position coordinates of the point with coordinates ([0.571, 0.033]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.281, 0.178]\nB: [0.162, 0.715]\nC: [0.761, 0.046]\nD: [0.557, 0.001]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_46_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_46_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [1.0, 0.279]\nB: [0.584, 0.204]\nC: [0.191, 0.877]\nD: [0.563, 0.267]", + "question": "What is the position coordinates of the point with coordinates ([0.582, 0.204]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [1.0, 0.279]\nB: [0.584, 0.204]\nC: [0.191, 0.877]\nD: [0.563, 0.267]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_47_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_47_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.002, 0.108]\nB: [0.549, 0.37]\nC: [0.846, 0.072]\nD: [0.502, 0.698]", + "question": "What is the position coordinates of the point with coordinates ([0.552, 0.368]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.002, 0.108]\nB: [0.549, 0.37]\nC: [0.846, 0.072]\nD: [0.502, 0.698]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_48_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_48_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.509, 0.858]\nB: [0.643, 0.572]\nC: [0.432, 0.735]\nD: [0.542, 0.338]", + "question": "What is the position coordinates of the point with coordinates ([0.542, 0.339]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.509, 0.858]\nB: [0.643, 0.572]\nC: [0.432, 0.735]\nD: [0.542, 0.338]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_49_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_49_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.048, 0.391]\nB: [0.787, 0.747]\nC: [0.518, 0.517]\nD: [0.507, 0.833]", + "question": "What is the position coordinates of the point with coordinates ([0.515, 0.514]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.048, 0.391]\nB: [0.787, 0.747]\nC: [0.518, 0.517]\nD: [0.507, 0.833]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_50_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_50_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.528, 0.45]\nB: [0.74, 0.315]\nC: [0.482, 0.584]\nD: [0.088, 0.042]", + "question": "What is the position coordinates of the point with coordinates ([0.723, 0.386]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.528, 0.45]\nB: [0.74, 0.315]\nC: [0.482, 0.584]\nD: [0.088, 0.042]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_51_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_51_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.04, 0.904]\nB: [0.4, 0.187]\nC: [0.134, 0.465]\nD: [0.294, 0.45]", + "question": "What is the position coordinates of the point with coordinates ([0.056, 0.907]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.04, 0.904]\nB: [0.4, 0.187]\nC: [0.134, 0.465]\nD: [0.294, 0.45]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_52_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_52_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.232, 0.071]\nB: [0.57, 0.335]\nC: [0.206, 0.4]\nD: [0.554, 0.081]", + "question": "What is the position coordinates of the point with coordinates ([0.585, 0.342]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.232, 0.071]\nB: [0.57, 0.335]\nC: [0.206, 0.4]\nD: [0.554, 0.081]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_53_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_53_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.043, 0.026]\nB: [0.536, 0.287]\nC: [0.878, 0.179]\nD: [0.519, 0.466]", + "question": "What is the position coordinates of the point with coordinates ([0.537, 0.483]) in Image 1 within the Image 2? Note that the width of the input RGB image is 910 and the height is 480.", + "context": "Select from the following choices.\nA: [0.043, 0.026]\nB: [0.536, 0.287]\nC: [0.878, 0.179]\nD: [0.519, 0.466]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_54_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_54_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.712, 0.402]\nB: [0.937, 0.199]\nC: [0.286, 0.017]\nD: [0.843, 0.865]", + "question": "What is the position coordinates of the point with coordinates ([0.309, -0.011]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.712, 0.402]\nB: [0.937, 0.199]\nC: [0.286, 0.017]\nD: [0.843, 0.865]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_55_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_55_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.014, 0.882]\nB: [0.728, 0.689]\nC: [0.088, 0.375]\nD: [0.554, 0.511]", + "question": "What is the position coordinates of the point with coordinates ([0.6, 0.808]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.014, 0.882]\nB: [0.728, 0.689]\nC: [0.088, 0.375]\nD: [0.554, 0.511]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_56_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_56_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.109, 0.199]\nB: [0.021, 0.741]\nC: [0.0, 0.0]\nD: [0.405, 0.69]", + "question": "What is the position coordinates of the point with coordinates ([0.572, 0.171]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 910 and the height is 480.", + "context": "Select from the following choices.\nA: [0.109, 0.199]\nB: [0.021, 0.741]\nC: [0.0, 0.0]\nD: [0.405, 0.69]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_57_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_57_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.38, 0.932]\nB: [0.47, 0.409]\nC: [0.528, 0.936]\nD: [0.533, 0.686]", + "question": "What is the position coordinates of the point with coordinates ([0.459, 0.412]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.38, 0.932]\nB: [0.47, 0.409]\nC: [0.528, 0.936]\nD: [0.533, 0.686]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_58_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_58_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.092, 0.225]\nB: [0.605, 0.232]\nC: [0.39, 0.458]\nD: [0.377, 0.065]", + "question": "What is the position coordinates of the point with coordinates ([0.064, 0.24]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.092, 0.225]\nB: [0.605, 0.232]\nC: [0.39, 0.458]\nD: [0.377, 0.065]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_59_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_59_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.307, 0.516]\nB: [0.508, 0.388]\nC: [0.368, 0.937]\nD: [0.527, 0.106]", + "question": "What is the position coordinates of the point with coordinates ([0.513, 0.478]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.307, 0.516]\nB: [0.508, 0.388]\nC: [0.368, 0.937]\nD: [0.527, 0.106]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_60_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_60_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.621, 0.322]\nB: [0.757, 0.909]\nC: [0.765, 0.887]\nD: [0.485, 0.282]", + "question": "What is the position coordinates of the point with coordinates ([0.543, 0.573]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.621, 0.322]\nB: [0.757, 0.909]\nC: [0.765, 0.887]\nD: [0.485, 0.282]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_61_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_61_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.148, 0.593]\nB: [0.867, 0.594]\nC: [0.363, 0.725]\nD: [0.988, 0.381]", + "question": "What is the position coordinates of the point with coordinates ([0.363, 0.725]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.148, 0.593]\nB: [0.867, 0.594]\nC: [0.363, 0.725]\nD: [0.988, 0.381]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_62_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_62_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.538, 0.67]\nB: [0.113, 0.312]\nC: [0.781, 0.017]\nD: [0.78, 0.124]", + "question": "What is the position coordinates of the point with coordinates ([0.113, 0.312]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.538, 0.67]\nB: [0.113, 0.312]\nC: [0.781, 0.017]\nD: [0.78, 0.124]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_63_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_63_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.409, 0.184]\nB: [0.327, 0.555]\nC: [0.304, 0.166]\nD: [0.398, 0.141]", + "question": "What is the position coordinates of the point with coordinates ([0.318, 0.204]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.409, 0.184]\nB: [0.327, 0.555]\nC: [0.304, 0.166]\nD: [0.398, 0.141]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_64_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_64_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.663, 0.685]\nB: [0.5, 0.562]\nC: [0.628, 0.094]\nD: [0.876, 0.492]", + "question": "What is the position coordinates of the point with coordinates ([0.5, 0.546]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.663, 0.685]\nB: [0.5, 0.562]\nC: [0.628, 0.094]\nD: [0.876, 0.492]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_65_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_65_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.148, 0.822]\nB: [0.654, 0.462]\nC: [0.274, 0.087]\nD: [0.294, 0.87]", + "question": "What is the position coordinates of the point with coordinates ([0.225, -0.034]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.148, 0.822]\nB: [0.654, 0.462]\nC: [0.274, 0.087]\nD: [0.294, 0.87]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_66_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_66_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.443, 0.32]\nB: [0.907, 0.404]\nC: [0.451, 0.543]\nD: [0.775, 0.465]", + "question": "What is the position coordinates of the point with coordinates ([0.775, 0.465]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.443, 0.32]\nB: [0.907, 0.404]\nC: [0.451, 0.543]\nD: [0.775, 0.465]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_67_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_67_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.912, 0.423]\nB: [0.477, 0.187]\nC: [0.439, 0.609]\nD: [0.127, 0.162]", + "question": "What is the position coordinates of the point with coordinates ([0.475, 0.188]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.912, 0.423]\nB: [0.477, 0.187]\nC: [0.439, 0.609]\nD: [0.127, 0.162]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_68_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_68_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.348, 0.247]\nB: [0.53, 0.395]\nC: [0.894, 0.004]\nD: [0.561, 0.958]", + "question": "What is the position coordinates of the point with coordinates ([0.528, 0.394]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.348, 0.247]\nB: [0.53, 0.395]\nC: [0.894, 0.004]\nD: [0.561, 0.958]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_69_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_69_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.451, 0.01]\nB: [0.588, 0.525]\nC: [0.542, 0.784]\nD: [0.271, 0.069]", + "question": "What is the position coordinates of the point with coordinates ([0.21, 0.024]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.451, 0.01]\nB: [0.588, 0.525]\nC: [0.542, 0.784]\nD: [0.271, 0.069]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_70_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_70_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.545, 0.343]\nB: [0.872, 0.767]\nC: [0.848, 0.331]\nD: [0.082, 0.655]", + "question": "What is the position coordinates of the point with coordinates ([0.546, 0.344]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.545, 0.343]\nB: [0.872, 0.767]\nC: [0.848, 0.331]\nD: [0.082, 0.655]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_71_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_71_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.246, 0.558]\nB: [0.213, 0.365]\nC: [0.605, 0.491]\nD: [0.56, -0.031]", + "question": "What is the position coordinates of the point with coordinates ([0.527, 0.136]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.246, 0.558]\nB: [0.213, 0.365]\nC: [0.605, 0.491]\nD: [0.56, -0.031]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_72_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_72_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.497, 0.448]\nB: [0.783, 0.271]\nC: [0.406, 0.738]\nD: [0.416, 0.886]", + "question": "What is the position coordinates of the point with coordinates ([0.783, 0.271]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.497, 0.448]\nB: [0.783, 0.271]\nC: [0.406, 0.738]\nD: [0.416, 0.886]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_73_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_73_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.939, 0.844]\nB: [0.453, 0.842]\nC: [0.019, 0.701]\nD: [0.33, 0.019]", + "question": "What is the position coordinates of the point with coordinates ([0.382, 0.074]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.939, 0.844]\nB: [0.453, 0.842]\nC: [0.019, 0.701]\nD: [0.33, 0.019]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_74_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_74_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.712, 0.468]\nB: [0.757, 0.203]\nC: [0.602, 0.149]\nD: [0.624, 0.442]", + "question": "What is the position coordinates of the point with coordinates ([0.625, 0.442]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.712, 0.468]\nB: [0.757, 0.203]\nC: [0.602, 0.149]\nD: [0.624, 0.442]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_75_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_75_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.775, 0.586]\nB: [0.403, 0.947]\nC: [0.0, 0.0]\nD: [0.095, 0.525]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 910 and the height is 480.", + "context": "Select from the following choices.\nA: [0.775, 0.586]\nB: [0.403, 0.947]\nC: [0.0, 0.0]\nD: [0.095, 0.525]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_76_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_76_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.306, 0.517]\nB: [0.404, 0.704]\nC: [0.0, 0.0]\nD: [0.389, 0.429]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.306, 0.517]\nB: [0.404, 0.704]\nC: [0.0, 0.0]\nD: [0.389, 0.429]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_77_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_77_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.421, 0.202]\nB: [0.936, 0.193]\nC: [0.836, 0.093]\nD: [0.892, 0.905]", + "question": "What is the position coordinates of the point with coordinates ([0.603, 0.295]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.421, 0.202]\nB: [0.936, 0.193]\nC: [0.836, 0.093]\nD: [0.892, 0.905]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_78_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_78_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.415, 0.336]\nB: [0.147, 0.444]\nC: [0.469, 0.996]\nD: [0.759, 0.125]", + "question": "What is the position coordinates of the point with coordinates ([0.415, 0.336]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.415, 0.336]\nB: [0.147, 0.444]\nC: [0.469, 0.996]\nD: [0.759, 0.125]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_79_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_79_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.0, 0.0]\nB: [0.001, 0.519]\nC: [0.21, 0.901]\nD: [0.72, 0.872]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.001, 0.519]\nC: [0.21, 0.901]\nD: [0.72, 0.872]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_80_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_80_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.188, 0.294]\nB: [0.08, 0.837]\nC: [0.878, 0.923]\nD: [0.39, 0.215]", + "question": "What is the position coordinates of the point with coordinates ([0.39, 0.215]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.188, 0.294]\nB: [0.08, 0.837]\nC: [0.878, 0.923]\nD: [0.39, 0.215]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_81_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_81_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.725, 0.505]\nB: [0.825, 0.634]\nC: [0.772, 0.85]\nD: [0.521, 0.137]", + "question": "What is the position coordinates of the point with coordinates ([0.423, 0.126]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.725, 0.505]\nB: [0.825, 0.634]\nC: [0.772, 0.85]\nD: [0.521, 0.137]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_82_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_82_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.937, 0.437]\nB: [0.932, 0.955]\nC: [0.443, 0.473]\nD: [0.57, -0.021]", + "question": "What is the position coordinates of the point with coordinates ([0.379, -0.029]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.937, 0.437]\nB: [0.932, 0.955]\nC: [0.443, 0.473]\nD: [0.57, -0.021]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_83_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_83_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.482, 0.199]\nB: [0.043, 0.981]\nC: [0.419, 0.373]\nD: [0.325, 0.861]", + "question": "What is the position coordinates of the point with coordinates ([0.482, 0.199]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.482, 0.199]\nB: [0.043, 0.981]\nC: [0.419, 0.373]\nD: [0.325, 0.861]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_84_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_84_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.27, 0.844]\nB: [0.082, -0.197]\nC: [0.942, 0.56]\nD: [0.625, 0.212]", + "question": "What is the position coordinates of the point with coordinates ([0.314, -0.235]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.27, 0.844]\nB: [0.082, -0.197]\nC: [0.942, 0.56]\nD: [0.625, 0.212]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_85_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_85_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.493, 0.952]\nB: [0.403, 0.455]\nC: [0.764, 0.389]\nD: [0.3, 0.08]", + "question": "What is the position coordinates of the point with coordinates ([0.442, 0.361]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.493, 0.952]\nB: [0.403, 0.455]\nC: [0.764, 0.389]\nD: [0.3, 0.08]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_86_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_86_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [-0.038, 0.253]\nB: [0.77, 0.338]\nC: [0.766, 0.061]\nD: [0.958, 0.882]", + "question": "What is the position coordinates of the point with coordinates ([0.028, 0.3]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [-0.038, 0.253]\nB: [0.77, 0.338]\nC: [0.766, 0.061]\nD: [0.958, 0.882]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_87_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_87_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.005, 0.571]\nB: [0.168, 0.518]\nC: [0.523, 0.466]\nD: [0.784, 0.541]", + "question": "What is the position coordinates of the point with coordinates ([0.553, 0.401]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.005, 0.571]\nB: [0.168, 0.518]\nC: [0.523, 0.466]\nD: [0.784, 0.541]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_88_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_88_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.256, 0.018]\nB: [0.492, 0.583]\nC: [0.579, 0.753]\nD: [0.756, 0.803]", + "question": "What is the position coordinates of the point with coordinates ([0.536, 0.384]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.256, 0.018]\nB: [0.492, 0.583]\nC: [0.579, 0.753]\nD: [0.756, 0.803]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_89_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_89_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.4, 0.819]\nB: [0.315, 0.418]\nC: [0.695, 0.574]\nD: [0.934, 0.028]", + "question": "What is the position coordinates of the point with coordinates ([0.315, 0.418]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.4, 0.819]\nB: [0.315, 0.418]\nC: [0.695, 0.574]\nD: [0.934, 0.028]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_90_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_90_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.161, 0.915]\nB: [0.55, -0.109]\nC: [0.025, 0.306]\nD: [0.859, 0.383]", + "question": "What is the position coordinates of the point with coordinates ([0.556, -0.112]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.161, 0.915]\nB: [0.55, -0.109]\nC: [0.025, 0.306]\nD: [0.859, 0.383]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_91_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_91_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.796, 0.095]\nB: [0.902, 0.871]\nC: [0.454, 0.805]\nD: [0.399, 0.254]", + "question": "What is the position coordinates of the point with coordinates ([0.336, 0.127]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.796, 0.095]\nB: [0.902, 0.871]\nC: [0.454, 0.805]\nD: [0.399, 0.254]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_92_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_92_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.78, 0.578]\nB: [0.586, 0.492]\nC: [0.362, 0.862]\nD: [0.308, 0.418]", + "question": "What is the position coordinates of the point with coordinates ([0.516, 0.501]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.78, 0.578]\nB: [0.586, 0.492]\nC: [0.362, 0.862]\nD: [0.308, 0.418]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_93_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_93_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.771, 0.142]\nB: [0.516, 0.41]\nC: [0.068, 0.844]\nD: [0.331, 0.532]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.771, 0.142]\nB: [0.516, 0.41]\nC: [0.068, 0.844]\nD: [0.331, 0.532]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_94_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_94_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.0, 0.0]\nB: [0.53, 0.297]\nC: [0.365, 0.027]\nD: [0.781, 0.768]", + "question": "What is the position coordinates of the point with coordinates ([0.372, 0.327]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.53, 0.297]\nC: [0.365, 0.027]\nD: [0.781, 0.768]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_95_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_95_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.198, 0.526]\nB: [0.435, 0.603]\nC: [0.508, 0.551]\nD: [0.55, 0.363]", + "question": "What is the position coordinates of the point with coordinates ([0.508, 0.551]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.198, 0.526]\nB: [0.435, 0.603]\nC: [0.508, 0.551]\nD: [0.55, 0.363]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_96_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_96_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.12, 0.762]\nB: [0.674, 0.29]\nC: [0.557, 0.641]\nD: [0.055, 0.586]", + "question": "What is the position coordinates of the point with coordinates ([0.597, 0.28]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.12, 0.762]\nB: [0.674, 0.29]\nC: [0.557, 0.641]\nD: [0.055, 0.586]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_97_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_97_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.064, 0.785]\nB: [0.378, 0.667]\nC: [0.522, 0.235]\nD: [0.437, 0.118]", + "question": "What is the position coordinates of the point with coordinates ([0.378, 0.667]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.064, 0.785]\nB: [0.378, 0.667]\nC: [0.522, 0.235]\nD: [0.437, 0.118]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_98_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_98_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.875, 0.931]\nB: [0.087, 0.702]\nC: [0.508, 0.69]\nD: [0.046, 0.524]", + "question": "What is the position coordinates of the point with coordinates ([0.251, 0.5]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.875, 0.931]\nB: [0.087, 0.702]\nC: [0.508, 0.69]\nD: [0.046, 0.524]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_99_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_99_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.449, 0.349]\nB: [0.497, 0.606]\nC: [0.545, 0.303]\nD: [0.125, 0.458]", + "question": "What is the position coordinates of the point with coordinates ([0.527, 0.379]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.449, 0.349]\nB: [0.497, 0.606]\nC: [0.545, 0.303]\nD: [0.125, 0.458]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_100_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_100_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.276, 0.406]\nB: [0.94, 0.417]\nC: [0.807, 0.617]\nD: [0.151, 0.326]", + "question": "What is the position coordinates of the point with coordinates ([0.266, 0.607]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.276, 0.406]\nB: [0.94, 0.417]\nC: [0.807, 0.617]\nD: [0.151, 0.326]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_101_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_101_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.851, 0.365]\nB: [0.558, 0.074]\nC: [0.378, 0.002]\nD: [0.075, 0.676]", + "question": "What is the position coordinates of the point with coordinates ([0.852, 0.365]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.851, 0.365]\nB: [0.558, 0.074]\nC: [0.378, 0.002]\nD: [0.075, 0.676]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_102_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_102_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.0, 0.0]\nB: [0.515, 0.178]\nC: [0.197, 0.534]\nD: [0.536, 0.497]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 910 and the height is 480.", + "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.515, 0.178]\nC: [0.197, 0.534]\nD: [0.536, 0.497]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_103_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_103_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.133, 0.966]\nB: [0.167, 0.473]\nC: [0.808, 0.497]\nD: [0.597, 0.39]", + "question": "What is the position coordinates of the point with coordinates ([0.502, 0.304]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.133, 0.966]\nB: [0.167, 0.473]\nC: [0.808, 0.497]\nD: [0.597, 0.39]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_104_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_104_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.635, 0.971]\nB: [0.243, 0.351]\nC: [0.0, 0.0]\nD: [0.995, 0.403]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.635, 0.971]\nB: [0.243, 0.351]\nC: [0.0, 0.0]\nD: [0.995, 0.403]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_105_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_105_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.0, 0.0]\nB: [0.766, 0.625]\nC: [0.702, 0.537]\nD: [0.4, 0.901]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.766, 0.625]\nC: [0.702, 0.537]\nD: [0.4, 0.901]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_106_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_106_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.04, 0.521]\nB: [0.013, 0.863]\nC: [0.041, 0.677]\nD: [0.471, 0.865]", + "question": "What is the position coordinates of the point with coordinates ([0.554, 0.401]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.04, 0.521]\nB: [0.013, 0.863]\nC: [0.041, 0.677]\nD: [0.471, 0.865]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_107_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_107_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.422, 0.094]\nB: [0.036, 0.241]\nC: [0.832, 0.759]\nD: [0.084, 0.371]", + "question": "What is the position coordinates of the point with coordinates ([0.016, 0.253]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.422, 0.094]\nB: [0.036, 0.241]\nC: [0.832, 0.759]\nD: [0.084, 0.371]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_108_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_108_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.435, 0.035]\nB: [0.758, 0.725]\nC: [0.428, 0.944]\nD: [0.191, 0.586]", + "question": "What is the position coordinates of the point with coordinates ([0.758, 0.725]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.435, 0.035]\nB: [0.758, 0.725]\nC: [0.428, 0.944]\nD: [0.191, 0.586]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_109_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_109_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.486, 0.472]\nB: [0.0, 0.409]\nC: [0.679, 0.71]\nD: [0.474, 0.443]", + "question": "What is the position coordinates of the point with coordinates ([0.383, 0.481]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.486, 0.472]\nB: [0.0, 0.409]\nC: [0.679, 0.71]\nD: [0.474, 0.443]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_110_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_110_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.832, 0.16]\nB: [0.767, 0.295]\nC: [0.238, 0.998]\nD: [0.231, 0.345]", + "question": "What is the position coordinates of the point with coordinates ([0.278, 0.312]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.832, 0.16]\nB: [0.767, 0.295]\nC: [0.238, 0.998]\nD: [0.231, 0.345]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_111_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_111_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.98, 0.565]\nB: [0.053, 0.674]\nC: [0.564, 0.876]\nD: [0.452, 0.539]", + "question": "What is the position coordinates of the point with coordinates ([0.98, 0.565]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.98, 0.565]\nB: [0.053, 0.674]\nC: [0.564, 0.876]\nD: [0.452, 0.539]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_112_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_112_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.335, 0.563]\nB: [0.88, 0.001]\nC: [0.119, 0.693]\nD: [0.484, 0.412]", + "question": "What is the position coordinates of the point with coordinates ([0.521, 0.426]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.335, 0.563]\nB: [0.88, 0.001]\nC: [0.119, 0.693]\nD: [0.484, 0.412]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_113_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_113_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.626, 0.275]\nB: [0.815, 0.877]\nC: [0.004, 0.083]\nD: [0.871, 0.172]", + "question": "What is the position coordinates of the point with coordinates ([0.631, 0.278]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.626, 0.275]\nB: [0.815, 0.877]\nC: [0.004, 0.083]\nD: [0.871, 0.172]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_114_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_114_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.017, 0.757]\nB: [0.637, 0.134]\nC: [0.823, 0.303]\nD: [0.415, 0.038]", + "question": "What is the position coordinates of the point with coordinates ([0.475, 0.03]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.017, 0.757]\nB: [0.637, 0.134]\nC: [0.823, 0.303]\nD: [0.415, 0.038]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_115_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_115_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.278, 0.07]\nB: [0.287, 0.704]\nC: [0.387, 0.197]\nD: [0.443, 0.105]", + "question": "What is the position coordinates of the point with coordinates ([0.414, -0.045]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.278, 0.07]\nB: [0.287, 0.704]\nC: [0.387, 0.197]\nD: [0.443, 0.105]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_116_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_116_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.243, 0.44]\nB: [0.089, 0.367]\nC: [0.322, 0.069]\nD: [0.126, 0.424]", + "question": "What is the position coordinates of the point with coordinates ([0.126, 0.424]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.243, 0.44]\nB: [0.089, 0.367]\nC: [0.322, 0.069]\nD: [0.126, 0.424]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_117_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_117_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.878, 0.91]\nB: [0.509, 0.022]\nC: [0.259, 0.162]\nD: [0.213, 0.977]", + "question": "What is the position coordinates of the point with coordinates ([0.265, 0.16]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.878, 0.91]\nB: [0.509, 0.022]\nC: [0.259, 0.162]\nD: [0.213, 0.977]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_118_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_118_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.451, 0.674]\nB: [0.529, 0.336]\nC: [0.137, 0.847]\nD: [0.081, 0.187]", + "question": "What is the position coordinates of the point with coordinates ([0.529, 0.336]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.451, 0.674]\nB: [0.529, 0.336]\nC: [0.137, 0.847]\nD: [0.081, 0.187]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_119_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_119_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.354, 0.137]\nB: [0.831, 0.926]\nC: [0.473, 0.743]\nD: [0.228, 0.73]", + "question": "What is the position coordinates of the point with coordinates ([0.473, 0.743]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.354, 0.137]\nB: [0.831, 0.926]\nC: [0.473, 0.743]\nD: [0.228, 0.73]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_120_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_120_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.703, 0.241]\nB: [0.985, 0.235]\nC: [0.439, 0.494]\nD: [0.614, 0.184]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.703, 0.241]\nB: [0.985, 0.235]\nC: [0.439, 0.494]\nD: [0.614, 0.184]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_121_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_121_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.035, 0.992]\nB: [0.994, 0.321]\nC: [0.839, 0.258]\nD: [0.414, 0.367]", + "question": "What is the position coordinates of the point with coordinates ([0.306, 0.334]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.035, 0.992]\nB: [0.994, 0.321]\nC: [0.839, 0.258]\nD: [0.414, 0.367]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_122_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_122_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.613, 0.309]\nB: [0.457, 0.931]\nC: [0.669, 0.383]\nD: [0.938, 0.837]", + "question": "What is the position coordinates of the point with coordinates ([0.602, 0.319]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.613, 0.309]\nB: [0.457, 0.931]\nC: [0.669, 0.383]\nD: [0.938, 0.837]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_123_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_123_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.533, 0.568]\nB: [0.394, 0.545]\nC: [0.429, 0.604]\nD: [0.299, 0.66]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.533, 0.568]\nB: [0.394, 0.545]\nC: [0.429, 0.604]\nD: [0.299, 0.66]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_124_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_124_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.418, 0.337]\nB: [0.703, 0.614]\nC: [0.256, 0.811]\nD: [0.753, 0.192]", + "question": "What is the position coordinates of the point with coordinates ([0.419, 0.337]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.418, 0.337]\nB: [0.703, 0.614]\nC: [0.256, 0.811]\nD: [0.753, 0.192]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_125_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_125_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.451, 0.63]\nB: [0.161, 0.672]\nC: [0.117, 0.38]\nD: [0.918, 0.717]", + "question": "What is the position coordinates of the point with coordinates ([0.161, 0.672]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.451, 0.63]\nB: [0.161, 0.672]\nC: [0.117, 0.38]\nD: [0.918, 0.717]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_126_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_126_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.487, 0.784]\nB: [0.155, 0.004]\nC: [0.336, 0.564]\nD: [0.045, 0.917]", + "question": "What is the position coordinates of the point with coordinates ([0.616, 0.544]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.487, 0.784]\nB: [0.155, 0.004]\nC: [0.336, 0.564]\nD: [0.045, 0.917]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_127_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_127_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.352, 0.409]\nB: [0.959, 0.481]\nC: [0.373, 0.245]\nD: [0.977, 0.091]", + "question": "What is the position coordinates of the point with coordinates ([0.352, 0.409]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.352, 0.409]\nB: [0.959, 0.481]\nC: [0.373, 0.245]\nD: [0.977, 0.091]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_128_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_128_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.859, 0.311]\nB: [0.589, 0.682]\nC: [0.306, 0.308]\nD: [0.219, 0.979]", + "question": "What is the position coordinates of the point with coordinates ([0.304, 0.307]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.859, 0.311]\nB: [0.589, 0.682]\nC: [0.306, 0.308]\nD: [0.219, 0.979]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_129_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_129_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.472, 0.938]\nB: [0.873, 0.948]\nC: [0.511, 0.28]\nD: [0.829, 0.346]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.472, 0.938]\nB: [0.873, 0.948]\nC: [0.511, 0.28]\nD: [0.829, 0.346]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_130_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_130_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.446, 0.638]\nB: [0.628, 0.456]\nC: [0.455, 0.627]\nD: [0.379, 0.405]", + "question": "What is the position coordinates of the point with coordinates ([0.56, 0.467]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.446, 0.638]\nB: [0.628, 0.456]\nC: [0.455, 0.627]\nD: [0.379, 0.405]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_131_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_131_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.322, 0.321]\nB: [0.15, 0.133]\nC: [0.989, 0.972]\nD: [0.16, 0.862]", + "question": "What is the position coordinates of the point with coordinates ([0.232, 0.188]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.322, 0.321]\nB: [0.15, 0.133]\nC: [0.989, 0.972]\nD: [0.16, 0.862]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_132_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_132_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.931, 0.959]\nB: [0.506, 0.286]\nC: [0.391, 0.531]\nD: [0.469, 0.383]", + "question": "What is the position coordinates of the point with coordinates ([0.465, 0.381]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.931, 0.959]\nB: [0.506, 0.286]\nC: [0.391, 0.531]\nD: [0.469, 0.383]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_133_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_133_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.109, 0.806]\nB: [0.197, 0.457]\nC: [0.203, 0.114]\nD: [0.135, 0.938]", + "question": "What is the position coordinates of the point with coordinates ([0.2, 0.047]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.109, 0.806]\nB: [0.197, 0.457]\nC: [0.203, 0.114]\nD: [0.135, 0.938]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_134_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_134_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.24, 0.833]\nB: [0.308, 0.425]\nC: [0.339, 0.639]\nD: [0.077, 0.998]", + "question": "What is the position coordinates of the point with coordinates ([0.24, 0.833]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.24, 0.833]\nB: [0.308, 0.425]\nC: [0.339, 0.639]\nD: [0.077, 0.998]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_135_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_135_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.0, 0.0]\nB: [0.721, 0.606]\nC: [0.121, 0.428]\nD: [0.252, 0.486]", + "question": "What is the position coordinates of the point with coordinates ([0.496, 0.539]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.721, 0.606]\nC: [0.121, 0.428]\nD: [0.252, 0.486]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_136_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_136_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.675, 0.84]\nB: [0.087, 0.791]\nC: [0.736, 0.705]\nD: [0.092, 0.465]", + "question": "What is the position coordinates of the point with coordinates ([0.131, 0.527]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.675, 0.84]\nB: [0.087, 0.791]\nC: [0.736, 0.705]\nD: [0.092, 0.465]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_137_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_137_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.188, 0.925]\nB: [0.922, 0.115]\nC: [0.894, 0.22]\nD: [0.022, 0.091]", + "question": "What is the position coordinates of the point with coordinates ([0.195, 0.928]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.188, 0.925]\nB: [0.922, 0.115]\nC: [0.894, 0.22]\nD: [0.022, 0.091]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_138_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_138_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.962, 0.897]\nB: [0.754, 0.628]\nC: [0.384, 0.96]\nD: [0.784, 0.178]", + "question": "What is the position coordinates of the point with coordinates ([0.384, 0.96]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.962, 0.897]\nB: [0.754, 0.628]\nC: [0.384, 0.96]\nD: [0.784, 0.178]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_139_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_139_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.021, 0.739]\nB: [0.0, 0.0]\nC: [0.701, 0.818]\nD: [0.335, 0.057]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.021, 0.739]\nB: [0.0, 0.0]\nC: [0.701, 0.818]\nD: [0.335, 0.057]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_140_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_140_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.167, 0.345]\nB: [0.618, 0.201]\nC: [0.805, 0.514]\nD: [0.027, 0.731]", + "question": "What is the position coordinates of the point with coordinates ([0.609, 0.209]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 910 and the height is 480.", + "context": "Select from the following choices.\nA: [0.167, 0.345]\nB: [0.618, 0.201]\nC: [0.805, 0.514]\nD: [0.027, 0.731]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_141_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_141_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.373, 0.459]\nB: [0.416, 0.278]\nC: [0.662, 0.648]\nD: [0.304, 0.781]", + "question": "What is the position coordinates of the point with coordinates ([0.443, 0.285]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.373, 0.459]\nB: [0.416, 0.278]\nC: [0.662, 0.648]\nD: [0.304, 0.781]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_142_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_142_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.53, 0.42]\nB: [0.638, 0.766]\nC: [0.517, 0.984]\nD: [0.344, 0.268]", + "question": "What is the position coordinates of the point with coordinates ([0.345, 0.268]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.53, 0.42]\nB: [0.638, 0.766]\nC: [0.517, 0.984]\nD: [0.344, 0.268]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_143_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_143_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.603, 0.414]\nB: [0.61, 0.464]\nC: [0.292, 0.626]\nD: [0.062, 0.813]", + "question": "What is the position coordinates of the point with coordinates ([0.602, 0.412]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.603, 0.414]\nB: [0.61, 0.464]\nC: [0.292, 0.626]\nD: [0.062, 0.813]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_144_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_144_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.493, 0.798]\nB: [0.963, 0.818]\nC: [0.245, 0.105]\nD: [0.982, 0.515]", + "question": "What is the position coordinates of the point with coordinates ([0.169, 0.075]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.493, 0.798]\nB: [0.963, 0.818]\nC: [0.245, 0.105]\nD: [0.982, 0.515]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_145_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_145_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.84, 0.083]\nB: [0.114, 0.077]\nC: [0.273, 0.23]\nD: [0.485, 0.534]", + "question": "What is the position coordinates of the point with coordinates ([0.443, 0.524]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.84, 0.083]\nB: [0.114, 0.077]\nC: [0.273, 0.23]\nD: [0.485, 0.534]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_146_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_146_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.345, 0.262]\nB: [0.512, 0.224]\nC: [0.657, 0.276]\nD: [0.166, 0.841]", + "question": "What is the position coordinates of the point with coordinates ([0.501, 0.22]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.345, 0.262]\nB: [0.512, 0.224]\nC: [0.657, 0.276]\nD: [0.166, 0.841]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_147_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_147_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.357, 0.196]\nB: [0.42, 0.234]\nC: [0.718, 0.336]\nD: [0.573, 0.896]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.357, 0.196]\nB: [0.42, 0.234]\nC: [0.718, 0.336]\nD: [0.573, 0.896]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_148_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_148_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.793, 0.03]\nB: [0.879, 0.871]\nC: [0.781, 0.418]\nD: [0.549, 0.338]", + "question": "What is the position coordinates of the point with coordinates ([0.53, 0.332]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.793, 0.03]\nB: [0.879, 0.871]\nC: [0.781, 0.418]\nD: [0.549, 0.338]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_149_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_149_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.342, 0.072]\nB: [0.574, 0.028]\nC: [0.795, 0.301]\nD: [0.752, 0.99]", + "question": "What is the position coordinates of the point with coordinates ([0.342, 0.072]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.342, 0.072]\nB: [0.574, 0.028]\nC: [0.795, 0.301]\nD: [0.752, 0.99]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_150_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_150_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.082, 0.932]\nB: [0.262, 0.046]\nC: [0.434, 0.576]\nD: [0.686, 0.437]", + "question": "What is the position coordinates of the point with coordinates ([0.082, 0.932]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.082, 0.932]\nB: [0.262, 0.046]\nC: [0.434, 0.576]\nD: [0.686, 0.437]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_151_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_151_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.053, 0.623]\nB: [0.624, 0.428]\nC: [0.518, 0.784]\nD: [0.141, 0.376]", + "question": "What is the position coordinates of the point with coordinates ([0.624, 0.428]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.053, 0.623]\nB: [0.624, 0.428]\nC: [0.518, 0.784]\nD: [0.141, 0.376]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_152_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_152_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.059, 0.533]\nB: [0.697, 0.415]\nC: [0.114, 0.313]\nD: [0.328, 0.618]", + "question": "What is the position coordinates of the point with coordinates ([0.113, 0.313]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.059, 0.533]\nB: [0.697, 0.415]\nC: [0.114, 0.313]\nD: [0.328, 0.618]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_153_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_153_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.488, 0.838]\nB: [0.287, 0.106]\nC: [0.472, 0.074]\nD: [0.079, 0.354]", + "question": "What is the position coordinates of the point with coordinates ([0.572, -0.121]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.488, 0.838]\nB: [0.287, 0.106]\nC: [0.472, 0.074]\nD: [0.079, 0.354]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_154_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_154_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.631, 0.352]\nB: [0.646, 0.557]\nC: [0.682, 0.502]\nD: [0.586, 0.751]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.631, 0.352]\nB: [0.646, 0.557]\nC: [0.682, 0.502]\nD: [0.586, 0.751]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_155_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_155_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.125, 0.593]\nB: [0.518, 0.506]\nC: [0.515, 0.327]\nD: [0.285, 0.07]", + "question": "What is the position coordinates of the point with coordinates ([0.588, 0.496]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.125, 0.593]\nB: [0.518, 0.506]\nC: [0.515, 0.327]\nD: [0.285, 0.07]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_156_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_156_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.176, 0.766]\nB: [0.337, 0.765]\nC: [0.905, 0.67]\nD: [0.04, 0.456]", + "question": "What is the position coordinates of the point with coordinates ([0.04, 0.456]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.176, 0.766]\nB: [0.337, 0.765]\nC: [0.905, 0.67]\nD: [0.04, 0.456]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_157_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_157_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.232, 0.766]\nB: [0.161, 0.72]\nC: [0.323, 0.222]\nD: [0.795, 0.138]", + "question": "What is the position coordinates of the point with coordinates ([0.361, 0.266]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.232, 0.766]\nB: [0.161, 0.72]\nC: [0.323, 0.222]\nD: [0.795, 0.138]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_158_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_158_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.333, 0.389]\nB: [0.691, 0.301]\nC: [0.868, 0.47]\nD: [0.649, 0.094]", + "question": "What is the position coordinates of the point with coordinates ([0.333, 0.389]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.333, 0.389]\nB: [0.691, 0.301]\nC: [0.868, 0.47]\nD: [0.649, 0.094]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_159_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_159_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.412, 0.254]\nB: [0.803, 0.989]\nC: [0.898, 0.497]\nD: [0.43, 0.295]", + "question": "What is the position coordinates of the point with coordinates ([0.392, 0.302]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.412, 0.254]\nB: [0.803, 0.989]\nC: [0.898, 0.497]\nD: [0.43, 0.295]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_160_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_160_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.0, 0.0]\nB: [0.864, 0.427]\nC: [0.189, 0.222]\nD: [0.86, 0.108]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.864, 0.427]\nC: [0.189, 0.222]\nD: [0.86, 0.108]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_161_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_161_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.961, 0.515]\nB: [0.312, 0.682]\nC: [0.209, 0.16]\nD: [0.943, 0.395]", + "question": "What is the position coordinates of the point with coordinates ([0.181, 0.189]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.961, 0.515]\nB: [0.312, 0.682]\nC: [0.209, 0.16]\nD: [0.943, 0.395]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_162_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_162_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.606, 0.797]\nB: [0.0, 0.0]\nC: [0.538, 0.287]\nD: [0.14, 0.104]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.606, 0.797]\nB: [0.0, 0.0]\nC: [0.538, 0.287]\nD: [0.14, 0.104]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_163_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_163_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.289, 0.952]\nB: [0.872, 0.205]\nC: [0.0, 0.0]\nD: [0.633, 0.427]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.289, 0.952]\nB: [0.872, 0.205]\nC: [0.0, 0.0]\nD: [0.633, 0.427]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_164_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_164_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.47, 0.37]\nB: [0.35, 0.4]\nC: [0.042, 0.785]\nD: [0.081, 0.262]", + "question": "What is the position coordinates of the point with coordinates ([0.351, 0.401]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.47, 0.37]\nB: [0.35, 0.4]\nC: [0.042, 0.785]\nD: [0.081, 0.262]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_165_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_165_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.161, 0.071]\nB: [0.948, 0.753]\nC: [0.387, 0.629]\nD: [0.408, 0.774]", + "question": "What is the position coordinates of the point with coordinates ([0.718, 0.256]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.161, 0.071]\nB: [0.948, 0.753]\nC: [0.387, 0.629]\nD: [0.408, 0.774]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_166_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_166_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.18, 0.986]\nB: [0.148, 0.12]\nC: [0.474, 0.356]\nD: [0.634, 0.061]", + "question": "What is the position coordinates of the point with coordinates ([0.551, 0.394]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.18, 0.986]\nB: [0.148, 0.12]\nC: [0.474, 0.356]\nD: [0.634, 0.061]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_167_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_167_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.642, 0.55]\nB: [0.894, 0.525]\nC: [0.887, 0.681]\nD: [0.583, 0.912]", + "question": "What is the position coordinates of the point with coordinates ([0.724, 0.512]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.642, 0.55]\nB: [0.894, 0.525]\nC: [0.887, 0.681]\nD: [0.583, 0.912]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_168_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_168_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.987, 0.403]\nB: [0.465, 0.446]\nC: [0.05, 0.858]\nD: [0.457, 0.194]", + "question": "What is the position coordinates of the point with coordinates ([0.504, 0.202]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.987, 0.403]\nB: [0.465, 0.446]\nC: [0.05, 0.858]\nD: [0.457, 0.194]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_169_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_169_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.852, 0.571]\nB: [0.771, 0.593]\nC: [0.19, 0.794]\nD: [0.512, 0.314]", + "question": "What is the position coordinates of the point with coordinates ([0.513, 0.314]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.852, 0.571]\nB: [0.771, 0.593]\nC: [0.19, 0.794]\nD: [0.512, 0.314]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_170_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_170_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.998, 0.808]\nB: [0.0, 0.0]\nC: [0.98, 0.396]\nD: [0.419, 0.553]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.998, 0.808]\nB: [0.0, 0.0]\nC: [0.98, 0.396]\nD: [0.419, 0.553]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_171_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_171_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.52, 0.284]\nB: [0.475, 0.251]\nC: [0.321, 0.629]\nD: [0.432, 0.371]", + "question": "What is the position coordinates of the point with coordinates ([0.427, 0.372]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.52, 0.284]\nB: [0.475, 0.251]\nC: [0.321, 0.629]\nD: [0.432, 0.371]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_172_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_172_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.0, 0.0]\nB: [0.781, 0.578]\nC: [0.642, 0.382]\nD: [0.679, 0.324]", + "question": "What is the position coordinates of the point with coordinates ([0.751, 0.277]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.781, 0.578]\nC: [0.642, 0.382]\nD: [0.679, 0.324]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_173_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_173_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.525, 0.662]\nB: [0.774, 0.504]\nC: [0.263, 0.754]\nD: [0.896, 0.303]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.525, 0.662]\nB: [0.774, 0.504]\nC: [0.263, 0.754]\nD: [0.896, 0.303]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_174_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_174_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.336, 0.241]\nB: [0.754, 0.592]\nC: [0.711, 0.154]\nD: [0.814, 0.269]", + "question": "What is the position coordinates of the point with coordinates ([0.711, 0.154]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.336, 0.241]\nB: [0.754, 0.592]\nC: [0.711, 0.154]\nD: [0.814, 0.269]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_175_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_175_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.357, 0.26]\nB: [0.145, 0.457]\nC: [0.26, 0.791]\nD: [0.896, 0.054]", + "question": "What is the position coordinates of the point with coordinates ([0.357, 0.259]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.357, 0.26]\nB: [0.145, 0.457]\nC: [0.26, 0.791]\nD: [0.896, 0.054]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_176_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_176_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.0, 0.0]\nB: [0.249, 0.178]\nC: [0.969, 0.236]\nD: [0.363, 0.049]", + "question": "What is the position coordinates of the point with coordinates ([0.509, 0.617]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.249, 0.178]\nC: [0.969, 0.236]\nD: [0.363, 0.049]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_177_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_177_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.396, 0.165]\nB: [0.966, 0.511]\nC: [0.101, 0.549]\nD: [0.871, 0.899]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 910 and the height is 480.", + "context": "Select from the following choices.\nA: [0.396, 0.165]\nB: [0.966, 0.511]\nC: [0.101, 0.549]\nD: [0.871, 0.899]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_178_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_178_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.335, 0.835]\nB: [0.526, 0.468]\nC: [0.441, 0.847]\nD: [0.584, 0.202]", + "question": "What is the position coordinates of the point with coordinates ([0.491, 0.453]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.335, 0.835]\nB: [0.526, 0.468]\nC: [0.441, 0.847]\nD: [0.584, 0.202]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_179_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_179_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.352, 0.43]\nB: [0.396, 0.842]\nC: [0.544, 0.168]\nD: [0.755, 0.432]", + "question": "What is the position coordinates of the point with coordinates ([0.352, 0.43]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.352, 0.43]\nB: [0.396, 0.842]\nC: [0.544, 0.168]\nD: [0.755, 0.432]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_180_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_180_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.738, 0.079]\nB: [0.295, 0.566]\nC: [0.04, 0.229]\nD: [0.771, 0.673]", + "question": "What is the position coordinates of the point with coordinates ([0.292, 0.642]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.738, 0.079]\nB: [0.295, 0.566]\nC: [0.04, 0.229]\nD: [0.771, 0.673]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_181_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_181_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.888, 0.387]\nB: [0.016, 0.294]\nC: [0.918, 0.591]\nD: [0.308, 0.501]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.888, 0.387]\nB: [0.016, 0.294]\nC: [0.918, 0.591]\nD: [0.308, 0.501]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_182_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_182_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.383, 0.798]\nB: [0.668, 0.133]\nC: [0.133, 0.739]\nD: [0.192, 0.076]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.383, 0.798]\nB: [0.668, 0.133]\nC: [0.133, 0.739]\nD: [0.192, 0.076]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_183_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_183_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.634, 0.284]\nB: [0.0, 0.0]\nC: [0.315, 0.604]\nD: [0.141, 0.357]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.634, 0.284]\nB: [0.0, 0.0]\nC: [0.315, 0.604]\nD: [0.141, 0.357]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_184_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_184_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.691, 0.879]\nB: [0.362, 0.72]\nC: [0.157, 0.764]\nD: [0.272, 0.551]", + "question": "What is the position coordinates of the point with coordinates ([0.272, 0.551]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.691, 0.879]\nB: [0.362, 0.72]\nC: [0.157, 0.764]\nD: [0.272, 0.551]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_185_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_185_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.448, 0.266]\nB: [0.5, 0.567]\nC: [0.943, 0.037]\nD: [0.019, 0.535]", + "question": "What is the position coordinates of the point with coordinates ([0.448, 0.266]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.448, 0.266]\nB: [0.5, 0.567]\nC: [0.943, 0.037]\nD: [0.019, 0.535]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_186_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_186_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.664, 0.291]\nB: [0.629, 0.96]\nC: [0.638, 0.438]\nD: [0.072, 0.128]", + "question": "What is the position coordinates of the point with coordinates ([0.59, 0.45]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.664, 0.291]\nB: [0.629, 0.96]\nC: [0.638, 0.438]\nD: [0.072, 0.128]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_187_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_187_1.jpg" + ], + "output": "C" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.628, 0.379]\nB: [0.793, 0.079]\nC: [0.084, 0.828]\nD: [0.959, 0.595]", + "question": "What is the position coordinates of the point with coordinates ([0.959, 0.595]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.628, 0.379]\nB: [0.793, 0.079]\nC: [0.084, 0.828]\nD: [0.959, 0.595]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_188_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_188_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.086, 0.897]\nB: [0.891, 0.598]\nC: [0.731, 0.612]\nD: [0.338, -0.004]", + "question": "What is the position coordinates of the point with coordinates ([0.417, 0.005]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.086, 0.897]\nB: [0.891, 0.598]\nC: [0.731, 0.612]\nD: [0.338, -0.004]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_189_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_189_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.314, 0.635]\nB: [0.437, 0.344]\nC: [0.11, 0.731]\nD: [0.763, 0.089]", + "question": "What is the position coordinates of the point with coordinates ([0.437, 0.344]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.314, 0.635]\nB: [0.437, 0.344]\nC: [0.11, 0.731]\nD: [0.763, 0.089]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_190_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_190_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.303, 0.199]\nB: [0.353, 0.651]\nC: [0.302, 0.987]\nD: [0.305, 0.316]", + "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.303, 0.199]\nB: [0.353, 0.651]\nC: [0.302, 0.987]\nD: [0.305, 0.316]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_191_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_191_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.145, 0.87]\nB: [0.947, 0.301]\nC: [0.046, 0.995]\nD: [0.0, 0.0]", + "question": "What is the position coordinates of the point with coordinates ([0.465, 0.564]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.145, 0.87]\nB: [0.947, 0.301]\nC: [0.046, 0.995]\nD: [0.0, 0.0]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_192_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_192_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.622, 0.432]\nB: [0.421, 0.201]\nC: [0.707, 0.491]\nD: [0.55, 0.329]", + "question": "What is the position coordinates of the point with coordinates ([0.513, 0.53]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.622, 0.432]\nB: [0.421, 0.201]\nC: [0.707, 0.491]\nD: [0.55, 0.329]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_193_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_193_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.556, 0.744]\nB: [0.085, 0.886]\nC: [0.475, 0.451]\nD: [0.417, 0.52]", + "question": "What is the position coordinates of the point with coordinates ([0.735, 0.381]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.556, 0.744]\nB: [0.085, 0.886]\nC: [0.475, 0.451]\nD: [0.417, 0.52]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_194_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_194_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['natural_image']", + "source": "tapvid_davis", + "options": "A: [0.089, 0.936]\nB: [0.642, 0.328]\nC: [0.611, 0.959]\nD: [0.166, 0.377]", + "question": "What is the position coordinates of the point with coordinates ([0.114, 0.302]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", + "context": "Select from the following choices.\nA: [0.089, 0.936]\nB: [0.642, 0.328]\nC: [0.611, 0.959]\nD: [0.166, 0.377]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_195_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_195_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.443, 0.616]\nB: [0.663, 0.356]\nC: [0.079, 0.21]\nD: [0.586, -0.124]", + "question": "What is the position coordinates of the point with coordinates ([0.733, -0.02]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.443, 0.616]\nB: [0.663, 0.356]\nC: [0.079, 0.21]\nD: [0.586, -0.124]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_196_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_196_1.jpg" + ], + "output": "D" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.385, 0.321]\nB: [0.931, 0.242]\nC: [0.011, 0.867]\nD: [0.917, 0.788]", + "question": "What is the position coordinates of the point with coordinates ([0.385, 0.321]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.385, 0.321]\nB: [0.931, 0.242]\nC: [0.011, 0.867]\nD: [0.917, 0.788]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_197_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_197_1.jpg" + ], + "output": "A" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.757, 0.024]\nB: [0.333, -0.045]\nC: [0.773, 0.154]\nD: [0.253, 0.821]", + "question": "What is the position coordinates of the point with coordinates ([0.314, -0.005]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.757, 0.024]\nB: [0.333, -0.045]\nC: [0.773, 0.154]\nD: [0.253, 0.821]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_198_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_198_1.jpg" + ], + "output": "B" + }, + { + "task": "point_tracking", + "visual_input_component": "['synthetic_image']", + "source": "tapvid_rgb_stacking", + "options": "A: [0.627, 0.508]\nB: [0.71, 0.649]\nC: [0.888, 0.125]\nD: [0.302, 0.307]", + "question": "What is the position coordinates of the point with coordinates ([0.302, 0.306]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", + "context": "Select from the following choices.\nA: [0.627, 0.508]\nB: [0.71, 0.649]\nC: [0.888, 0.125]\nD: [0.302, 0.307]", + "input_image_path": [ + "../MMIU-Benchmark/point_tracking/point_tracking_199_0.jpg", + "../MMIU-Benchmark/point_tracking/point_tracking_199_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.105, 0.0, 0.539, 1.0]\nB: [0.231, 0.444, 0.698, 0.771]\nC: [0.204, 0.496, 0.49, 0.761]\nD: [0.105, 0.0, 0.624, 0.922]", + "question": "Here is an object ([0.166, 0.0, 0.589, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.105, 0.0, 0.539, 1.0]\nB: [0.231, 0.444, 0.698, 0.771]\nC: [0.204, 0.496, 0.49, 0.761]\nD: [0.105, 0.0, 0.624, 0.922]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_0_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_0_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.37, 0.132, 0.788, 0.61]\nB: [0.457, 0.328, 0.655, 0.681]\nC: [0.457, 0.328, 0.673, 0.635]\nD: [0.457, 0.328, 0.656, 0.582]", + "question": "Here is an object ([0.326, 0.224, 0.691, 0.644]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.37, 0.132, 0.788, 0.61]\nB: [0.457, 0.328, 0.655, 0.681]\nC: [0.457, 0.328, 0.673, 0.635]\nD: [0.457, 0.328, 0.656, 0.582]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_1_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_1_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.546, 0.242, 0.943, 1.0]\nB: [0.173, 0.0, 0.57, 0.758]\nC: [0.516, 0.2, 0.912, 0.958]\nD: [0.367, 0.242, 0.764, 1.0]", + "question": "Here is an object ([0.358, 0.26, 0.744, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.546, 0.242, 0.943, 1.0]\nB: [0.173, 0.0, 0.57, 0.758]\nC: [0.516, 0.2, 0.912, 0.958]\nD: [0.367, 0.242, 0.764, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_2_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_2_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.302, 0.299, 0.571, 1.0]\nB: [0.802, 0.301, 0.919, 0.514]\nC: [0.302, 0.299, 0.607, 0.882]\nD: [0.255, 0.124, 0.525, 0.825]", + "question": "Here is an object ([0.649, 0.335, 0.85, 0.992]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.302, 0.299, 0.571, 1.0]\nB: [0.802, 0.301, 0.919, 0.514]\nC: [0.302, 0.299, 0.607, 0.882]\nD: [0.255, 0.124, 0.525, 0.825]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_3_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_3_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.901, 0.0, 1.0, 0.304]\nB: [0.394, 0.317, 0.758, 0.617]\nC: [0.389, 0.294, 0.495, 0.551]\nD: [0.901, 0.0, 0.994, 0.3]", + "question": "Here is an object ([0.832, 0.0, 0.977, 0.472]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.901, 0.0, 1.0, 0.304]\nB: [0.394, 0.317, 0.758, 0.617]\nC: [0.389, 0.294, 0.495, 0.551]\nD: [0.901, 0.0, 0.994, 0.3]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_4_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_4_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.987, 0.792, 0.998, 0.876]\nB: [0.987, 0.792, 0.998, 0.871]\nC: [0.987, 0.792, 1.0, 0.892]\nD: [0.987, 0.792, 1.002, 0.901]", + "question": "Here is an object ([0.952, 0.703, 1.0, 0.904]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.987, 0.792, 0.998, 0.876]\nB: [0.987, 0.792, 0.998, 0.871]\nC: [0.987, 0.792, 1.0, 0.892]\nD: [0.987, 0.792, 1.002, 0.901]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_5_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_5_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.287, 0.453, 0.617, 0.774]\nB: [0.384, 0.432, 0.713, 0.753]\nC: [0.287, 0.453, 0.623, 0.828]\nD: [0.26, 0.356, 0.59, 0.676]", + "question": "Here is an object ([0.284, 0.369, 0.636, 0.674]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.287, 0.453, 0.617, 0.774]\nB: [0.384, 0.432, 0.713, 0.753]\nC: [0.287, 0.453, 0.623, 0.828]\nD: [0.26, 0.356, 0.59, 0.676]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_6_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_6_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.282, 0.0, 1.0, 0.736]\nB: [0.282, 0.0, 1.047, 0.79]\nC: [0.248, 0.156, 0.966, 0.892]\nD: [0.186, 0.067, 0.904, 0.803]", + "question": "Here is an object ([0.312, 0.0, 1.0, 0.736]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.282, 0.0, 1.0, 0.736]\nB: [0.282, 0.0, 1.047, 0.79]\nC: [0.248, 0.156, 0.966, 0.892]\nD: [0.186, 0.067, 0.904, 0.803]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_7_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_7_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.863, 0.157, 0.884, 0.624]\nB: [0.159, 0.19, 0.68, 1.0]\nC: [0.159, 0.19, 0.737, 1.156]\nD: [0.159, 0.19, 0.72, 1.014]", + "question": "Here is an object ([0.174, 0.19, 0.691, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.863, 0.157, 0.884, 0.624]\nB: [0.159, 0.19, 0.68, 1.0]\nC: [0.159, 0.19, 0.737, 1.156]\nD: [0.159, 0.19, 0.72, 1.014]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_8_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_8_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.309, 0.435, 0.693, 1.0]\nB: [0.786, 0.633, 0.927, 0.776]\nC: [0.263, 0.193, 0.647, 0.758]\nD: [0.263, 0.193, 0.617, 0.678]", + "question": "Here is an object ([0.227, 0.218, 0.607, 0.787]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.309, 0.435, 0.693, 1.0]\nB: [0.786, 0.633, 0.927, 0.776]\nC: [0.263, 0.193, 0.647, 0.758]\nD: [0.263, 0.193, 0.617, 0.678]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_9_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_9_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.013, 0.201, 0.494, 0.7]\nB: [0.177, 0.003, 0.868, 1.0]\nC: [0.129, 0.61, 0.238, 0.793]\nD: [0.243, 0.0, 0.934, 0.997]", + "question": "Here is an object ([0.125, 0.0, 0.804, 0.988]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.013, 0.201, 0.494, 0.7]\nB: [0.177, 0.003, 0.868, 1.0]\nC: [0.129, 0.61, 0.238, 0.793]\nD: [0.243, 0.0, 0.934, 0.997]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_10_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_10_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.095, 0.672, 0.513, 0.838]\nB: [0.0, 0.257, 0.27, 1.056]\nC: [0.096, 0.265, 0.371, 1.0]\nD: [0.0, 0.257, 0.275, 0.992]", + "question": "Here is an object ([0.0, 0.261, 0.28, 0.997]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.095, 0.672, 0.513, 0.838]\nB: [0.0, 0.257, 0.27, 1.056]\nC: [0.096, 0.265, 0.371, 1.0]\nD: [0.0, 0.257, 0.275, 0.992]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_11_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_11_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.173, 0.05, 0.833, 1.1]\nB: [0.173, 0.05, 0.754, 1.04]\nC: [0.173, 0.05, 0.789, 1.0]\nD: [0.173, 0.05, 0.712, 0.936]", + "question": "Here is an object ([0.223, 0.032, 0.773, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.173, 0.05, 0.833, 1.1]\nB: [0.173, 0.05, 0.754, 1.04]\nC: [0.173, 0.05, 0.789, 1.0]\nD: [0.173, 0.05, 0.712, 0.936]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_12_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_12_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.0, 0.042, 0.523, 1.0]\nB: [0.0, 0.042, 0.434, 0.851]\nC: [0.205, 0.0, 0.728, 0.958]\nD: [0.259, 0.042, 0.782, 1.0]", + "question": "Here is an object ([0.116, 0.024, 0.778, 0.988]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.042, 0.523, 1.0]\nB: [0.0, 0.042, 0.434, 0.851]\nC: [0.205, 0.0, 0.728, 0.958]\nD: [0.259, 0.042, 0.782, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_13_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_13_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.177, 0.135, 0.627, 0.492]\nB: [0.442, 0.412, 0.921, 0.872]\nC: [0.249, 0.269, 0.701, 0.599]\nD: [0.241, 0.326, 0.693, 0.656]", + "question": "Here is an object ([0.232, 0.326, 0.684, 0.656]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.177, 0.135, 0.627, 0.492]\nB: [0.442, 0.412, 0.921, 0.872]\nC: [0.249, 0.269, 0.701, 0.599]\nD: [0.241, 0.326, 0.693, 0.656]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_14_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_14_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.124, 0.142, 0.57, 0.851]\nB: [0.124, 0.142, 0.712, 1.079]\nC: [0.124, 0.142, 0.643, 0.931]\nD: [0.445, 0.485, 0.785, 0.751]", + "question": "Here is an object ([0.123, 0.161, 0.635, 0.931]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.124, 0.142, 0.57, 0.851]\nB: [0.124, 0.142, 0.712, 1.079]\nC: [0.124, 0.142, 0.643, 0.931]\nD: [0.445, 0.485, 0.785, 0.751]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_15_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_15_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.337, 0.263, 0.374, 0.406]\nB: [0.325, 0.269, 0.362, 0.412]\nC: [0.325, 0.269, 0.362, 0.392]\nD: [0.265, 0.403, 0.642, 0.54]", + "question": "Here is an object ([0.311, 0.271, 0.363, 0.412]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.337, 0.263, 0.374, 0.406]\nB: [0.325, 0.269, 0.362, 0.412]\nC: [0.325, 0.269, 0.362, 0.392]\nD: [0.265, 0.403, 0.642, 0.54]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_16_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_16_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.07, 0.04, 0.201, 0.126]\nB: [0.326, 0.0, 0.671, 0.593]\nC: [0.326, 0.0, 0.797, 0.729]\nD: [0.326, 0.0, 0.73, 0.738]", + "question": "Here is an object ([0.37, 0.0, 0.701, 0.883]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.07, 0.04, 0.201, 0.126]\nB: [0.326, 0.0, 0.671, 0.593]\nC: [0.326, 0.0, 0.797, 0.729]\nD: [0.326, 0.0, 0.73, 0.738]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_17_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_17_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.467, 0.647, 0.685, 0.889]\nB: [0.494, 0.543, 0.712, 0.785]\nC: [0.649, 0.751, 0.892, 0.776]\nD: [0.494, 0.543, 0.677, 0.764]", + "question": "Here is an object ([0.523, 0.457, 0.773, 0.708]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.467, 0.647, 0.685, 0.889]\nB: [0.494, 0.543, 0.712, 0.785]\nC: [0.649, 0.751, 0.892, 0.776]\nD: [0.494, 0.543, 0.677, 0.764]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_18_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_18_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.0, 0.0, 0.879, 0.878]\nB: [0.0, 0.0, 0.892, 0.821]\nC: [0.0, 0.0, 0.992, 0.739]\nD: [0.0, 0.0, 0.894, 1.05]", + "question": "Here is an object ([0.0, 0.0, 0.883, 0.86]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.0, 0.879, 0.878]\nB: [0.0, 0.0, 0.892, 0.821]\nC: [0.0, 0.0, 0.992, 0.739]\nD: [0.0, 0.0, 0.894, 1.05]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_19_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_19_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.005, 0.846, 0.023, 0.993]\nB: [0.005, 0.846, 0.023, 0.994]\nC: [0.005, 0.846, 0.02, 0.965]\nD: [0.311, 0.061, 0.434, 0.287]", + "question": "Here is an object ([0.0, 0.8, 0.043, 0.996]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.005, 0.846, 0.023, 0.993]\nB: [0.005, 0.846, 0.023, 0.994]\nC: [0.005, 0.846, 0.02, 0.965]\nD: [0.311, 0.061, 0.434, 0.287]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_20_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_20_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.323, 0.332, 0.442, 0.565]\nB: [0.323, 0.332, 0.47, 0.558]\nC: [0.323, 0.332, 0.495, 0.537]\nD: [0.323, 0.332, 0.477, 0.526]", + "question": "Here is an object ([0.0, 0.05, 0.374, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.323, 0.332, 0.442, 0.565]\nB: [0.323, 0.332, 0.47, 0.558]\nC: [0.323, 0.332, 0.495, 0.537]\nD: [0.323, 0.332, 0.477, 0.526]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_21_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_21_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.292, 0.053, 0.432, 0.251]\nB: [0.27, 0.118, 0.41, 0.317]\nC: [0.27, 0.049, 0.409, 0.247]\nD: [0.689, 0.281, 0.863, 0.306]", + "question": "Here is an object ([0.245, 0.086, 0.4, 0.275]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.292, 0.053, 0.432, 0.251]\nB: [0.27, 0.118, 0.41, 0.317]\nC: [0.27, 0.049, 0.409, 0.247]\nD: [0.689, 0.281, 0.863, 0.306]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_22_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_22_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.322, 0.804, 0.577, 0.947]\nB: [0.353, 0.024, 0.68, 0.975]\nC: [0.353, 0.024, 0.669, 0.874]\nD: [0.733, 0.014, 0.774, 0.058]", + "question": "Here is an object ([0.0, 0.0, 0.524, 0.828]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.322, 0.804, 0.577, 0.947]\nB: [0.353, 0.024, 0.68, 0.975]\nC: [0.353, 0.024, 0.669, 0.874]\nD: [0.733, 0.014, 0.774, 0.058]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_23_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_23_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.184, 0.0, 0.909, 0.993]\nB: [0.184, 0.0, 1.027, 0.949]\nC: [0.199, 0.0, 0.924, 0.993]\nD: [0.184, 0.0, 1.048, 0.854]", + "question": "Here is an object ([0.086, 0.0, 0.87, 0.919]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.184, 0.0, 0.909, 0.993]\nB: [0.184, 0.0, 1.027, 0.949]\nC: [0.199, 0.0, 0.924, 0.993]\nD: [0.184, 0.0, 1.048, 0.854]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_24_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_24_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.047, 0.869, 0.259, 0.904]\nB: [0.443, 0.679, 0.677, 1.011]\nC: [0.443, 0.679, 0.708, 0.976]\nD: [0.566, 0.703, 0.831, 1.0]", + "question": "Here is an object ([0.441, 0.71, 0.689, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.047, 0.869, 0.259, 0.904]\nB: [0.443, 0.679, 0.677, 1.011]\nC: [0.443, 0.679, 0.708, 0.976]\nD: [0.566, 0.703, 0.831, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_25_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_25_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.0, 0.001, 0.45, 1.0]\nB: [0.173, 0.001, 0.623, 1.0]\nC: [0.0, 0.001, 0.441, 1.01]\nD: [0.0, 0.001, 0.377, 1.032]", + "question": "Here is an object ([0.0, 0.0, 0.402, 0.996]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.001, 0.45, 1.0]\nB: [0.173, 0.001, 0.623, 1.0]\nC: [0.0, 0.001, 0.441, 1.01]\nD: [0.0, 0.001, 0.377, 1.032]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_26_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_26_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.285, 0.181, 0.881, 0.531]\nB: [0.325, 0.09, 0.738, 0.329]\nC: [0.6, 0.492, 0.78, 0.658]\nD: [0.285, 0.181, 0.992, 0.608]", + "question": "Here is an object ([0.277, 0.196, 0.994, 0.61]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.285, 0.181, 0.881, 0.531]\nB: [0.325, 0.09, 0.738, 0.329]\nC: [0.6, 0.492, 0.78, 0.658]\nD: [0.285, 0.181, 0.992, 0.608]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_27_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_27_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.596, 0.311, 0.855, 0.971]\nB: [0.656, 0.222, 0.916, 0.882]\nC: [0.181, 0.185, 0.651, 0.349]\nD: [0.57, 0.24, 0.83, 0.9]", + "question": "Here is an object ([0.67, 0.219, 0.91, 0.886]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.596, 0.311, 0.855, 0.971]\nB: [0.656, 0.222, 0.916, 0.882]\nC: [0.181, 0.185, 0.651, 0.349]\nD: [0.57, 0.24, 0.83, 0.9]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_28_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_28_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.33, 0.006, 0.963, 0.889]\nB: [0.427, 0.299, 0.457, 0.435]\nC: [0.323, 0.642, 0.555, 0.656]\nD: [0.33, 0.006, 0.966, 1.0]", + "question": "Here is an object ([0.304, 0.001, 0.951, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.33, 0.006, 0.963, 0.889]\nB: [0.427, 0.299, 0.457, 0.435]\nC: [0.323, 0.642, 0.555, 0.656]\nD: [0.33, 0.006, 0.966, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_29_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_29_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.298, 0.661, 0.788, 0.753]\nB: [0.0, 0.0, 0.955, 0.996]\nC: [0.502, 0.29, 0.769, 0.553]\nD: [0.0, 0.004, 0.955, 1.0]", + "question": "Here is an object ([0.0, 0.0, 0.893, 0.999]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.298, 0.661, 0.788, 0.753]\nB: [0.0, 0.0, 0.955, 0.996]\nC: [0.502, 0.29, 0.769, 0.553]\nD: [0.0, 0.004, 0.955, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_30_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_30_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.378, 0.415, 0.458, 0.821]\nB: [0.378, 0.415, 0.467, 0.843]\nC: [0.378, 0.415, 0.462, 0.858]\nD: [0.378, 0.415, 0.473, 0.764]", + "question": "Here is an object ([0.366, 0.428, 0.459, 0.826]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.378, 0.415, 0.458, 0.821]\nB: [0.378, 0.415, 0.467, 0.843]\nC: [0.378, 0.415, 0.462, 0.858]\nD: [0.378, 0.415, 0.473, 0.764]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_31_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_31_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.266, 0.356, 0.714, 0.89]\nB: [0.255, 0.425, 0.769, 0.932]\nC: [0.266, 0.356, 0.78, 0.863]\nD: [0.33, 0.275, 0.54, 0.724]", + "question": "Here is an object ([0.268, 0.399, 0.774, 0.89]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.266, 0.356, 0.714, 0.89]\nB: [0.255, 0.425, 0.769, 0.932]\nC: [0.266, 0.356, 0.78, 0.863]\nD: [0.33, 0.275, 0.54, 0.724]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_32_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_32_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.878, 0.772, 0.995, 0.833]\nB: [0.134, 0.675, 0.455, 0.933]\nC: [0.397, 0.556, 0.869, 0.714]\nD: [0.134, 0.675, 0.518, 0.892]", + "question": "Here is an object ([0.108, 0.626, 0.434, 0.892]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.878, 0.772, 0.995, 0.833]\nB: [0.134, 0.675, 0.455, 0.933]\nC: [0.397, 0.556, 0.869, 0.714]\nD: [0.134, 0.675, 0.518, 0.892]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_33_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_33_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.564, 0.332, 0.63, 0.44]\nB: [0.545, 0.307, 0.611, 0.415]\nC: [0.528, 0.263, 0.595, 0.371]\nD: [0.547, 0.319, 0.613, 0.428]", + "question": "Here is an object ([0.593, 0.332, 0.659, 0.447]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.564, 0.332, 0.63, 0.44]\nB: [0.545, 0.307, 0.611, 0.415]\nC: [0.528, 0.263, 0.595, 0.371]\nD: [0.547, 0.319, 0.613, 0.428]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_34_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_34_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.217, 0.24, 0.541, 0.761]\nB: [0.119, 0.435, 0.442, 0.956]\nC: [0.217, 0.24, 0.478, 0.786]\nD: [0.138, 0.474, 0.461, 0.994]", + "question": "Here is an object ([0.23, 0.247, 0.55, 0.715]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.217, 0.24, 0.541, 0.761]\nB: [0.119, 0.435, 0.442, 0.956]\nC: [0.217, 0.24, 0.478, 0.786]\nD: [0.138, 0.474, 0.461, 0.994]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_35_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_35_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.003, 0.11, 0.245, 0.188]\nB: [0.0, 0.0, 0.334, 0.435]\nC: [0.0, 0.0, 0.31, 0.529]\nD: [0.0, 0.0, 0.304, 0.487]", + "question": "Here is an object ([0.0, 0.0, 0.263, 0.576]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.003, 0.11, 0.245, 0.188]\nB: [0.0, 0.0, 0.334, 0.435]\nC: [0.0, 0.0, 0.31, 0.529]\nD: [0.0, 0.0, 0.304, 0.487]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_36_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_36_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.0, 0.317, 1.094, 0.986]\nB: [0.428, 0.043, 0.832, 0.14]\nC: [0.0, 0.368, 1.0, 0.989]\nD: [0.0, 0.317, 1.0, 0.938]", + "question": "Here is an object ([0.609, 0.0, 0.853, 0.433]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.317, 1.094, 0.986]\nB: [0.428, 0.043, 0.832, 0.14]\nC: [0.0, 0.368, 1.0, 0.989]\nD: [0.0, 0.317, 1.0, 0.938]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_37_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_37_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.477, 0.165, 0.613, 0.461]\nB: [0.477, 0.083, 0.614, 0.379]\nC: [0.486, 0.0, 0.623, 0.296]\nD: [0.18, 0.044, 0.549, 0.249]", + "question": "Here is an object ([0.469, 0.09, 0.59, 0.317]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.477, 0.165, 0.613, 0.461]\nB: [0.477, 0.083, 0.614, 0.379]\nC: [0.486, 0.0, 0.623, 0.296]\nD: [0.18, 0.044, 0.549, 0.249]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_38_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_38_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.204, 0.226, 0.796, 0.667]\nB: [0.85, 0.326, 0.93, 0.539]\nC: [0.317, 0.108, 0.652, 0.579]\nD: [0.204, 0.226, 0.846, 0.713]", + "question": "Here is an object ([0.187, 0.107, 0.821, 0.719]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.204, 0.226, 0.796, 0.667]\nB: [0.85, 0.326, 0.93, 0.539]\nC: [0.317, 0.108, 0.652, 0.579]\nD: [0.204, 0.226, 0.846, 0.713]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_39_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_39_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.222, 0.174, 0.717, 1.0]\nB: [0.369, 0.033, 0.864, 0.86]\nC: [0.403, 0.0, 0.898, 0.826]\nD: [0.105, 0.729, 0.198, 0.839]", + "question": "Here is an object ([0.263, 0.168, 0.714, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.222, 0.174, 0.717, 1.0]\nB: [0.369, 0.033, 0.864, 0.86]\nC: [0.403, 0.0, 0.898, 0.826]\nD: [0.105, 0.729, 0.198, 0.839]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_40_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_40_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.101, 0.694, 0.357, 0.807]\nB: [0.592, 0.454, 0.698, 0.651]\nC: [0.592, 0.454, 0.694, 0.631]\nD: [0.34, 0.282, 0.835, 0.693]", + "question": "Here is an object ([0.541, 0.482, 0.603, 0.624]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.101, 0.694, 0.357, 0.807]\nB: [0.592, 0.454, 0.698, 0.651]\nC: [0.592, 0.454, 0.694, 0.631]\nD: [0.34, 0.282, 0.835, 0.693]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_41_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_41_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.398, 0.606, 0.528, 0.767]\nB: [0.398, 0.606, 0.509, 0.774]\nC: [0.398, 0.606, 0.507, 0.794]\nD: [0.384, 0.192, 0.498, 0.551]", + "question": "Here is an object ([0.359, 0.608, 0.466, 0.8]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.398, 0.606, 0.528, 0.767]\nB: [0.398, 0.606, 0.509, 0.774]\nC: [0.398, 0.606, 0.507, 0.794]\nD: [0.384, 0.192, 0.498, 0.551]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_42_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_42_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.216, 0.225, 0.783, 0.904]\nB: [0.216, 0.225, 0.738, 1.065]\nC: [0.216, 0.225, 0.701, 1.0]\nD: [0.216, 0.225, 0.73, 1.015]", + "question": "Here is an object ([0.226, 0.208, 0.703, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.216, 0.225, 0.783, 0.904]\nB: [0.216, 0.225, 0.738, 1.065]\nC: [0.216, 0.225, 0.701, 1.0]\nD: [0.216, 0.225, 0.73, 1.015]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_43_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_43_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.425, 0.474, 0.487, 0.668]\nB: [0.425, 0.474, 0.493, 0.706]\nC: [0.425, 0.474, 0.496, 0.647]\nD: [0.439, 0.428, 0.502, 0.622]", + "question": "Here is an object ([0.417, 0.481, 0.48, 0.7]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.425, 0.474, 0.487, 0.668]\nB: [0.425, 0.474, 0.493, 0.706]\nC: [0.425, 0.474, 0.496, 0.647]\nD: [0.439, 0.428, 0.502, 0.622]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_44_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_44_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.059, 0.0, 0.673, 0.483]\nB: [0.0, 0.0, 0.576, 0.44]\nC: [0.123, 0.692, 0.539, 0.775]\nD: [0.059, 0.0, 0.634, 0.44]", + "question": "Here is an object ([0.203, 0.0, 0.616, 0.404]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.059, 0.0, 0.673, 0.483]\nB: [0.0, 0.0, 0.576, 0.44]\nC: [0.123, 0.692, 0.539, 0.775]\nD: [0.059, 0.0, 0.634, 0.44]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_45_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_45_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.541, 0.742, 0.866, 0.985]\nB: [0.602, 0.447, 0.882, 0.914]\nC: [0.471, 0.222, 0.751, 0.689]\nD: [0.471, 0.222, 0.738, 0.765]", + "question": "Here is an object ([0.479, 0.236, 0.73, 0.683]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.541, 0.742, 0.866, 0.985]\nB: [0.602, 0.447, 0.882, 0.914]\nC: [0.471, 0.222, 0.751, 0.689]\nD: [0.471, 0.222, 0.738, 0.765]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_46_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_46_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.201, 0.0, 0.727, 0.497]\nB: [0.201, 0.0, 0.724, 0.519]\nC: [0.201, 0.0, 0.67, 0.574]\nD: [0.201, 0.0, 0.666, 0.542]", + "question": "Here is an object ([0.152, 0.0, 0.666, 0.521]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.201, 0.0, 0.727, 0.497]\nB: [0.201, 0.0, 0.724, 0.519]\nC: [0.201, 0.0, 0.67, 0.574]\nD: [0.201, 0.0, 0.666, 0.542]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_47_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_47_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.03, 0.099, 0.872, 1.0]\nB: [0.093, 0.028, 0.167, 0.235]\nC: [0.158, 0.0, 1.0, 0.901]\nD: [0.158, 0.099, 1.0, 1.0]", + "question": "Here is an object ([0.044, 0.067, 0.886, 0.978]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.03, 0.099, 0.872, 1.0]\nB: [0.093, 0.028, 0.167, 0.235]\nC: [0.158, 0.0, 1.0, 0.901]\nD: [0.158, 0.099, 1.0, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_48_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_48_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.285, 0.124, 0.72, 0.76]\nB: [0.463, 0.321, 0.897, 0.957]\nC: [0.372, 0.103, 0.648, 0.493]\nD: [0.285, 0.124, 0.778, 0.754]", + "question": "Here is an object ([0.282, 0.122, 0.711, 0.85]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 960 and the height is 720.", + "context": "Select from the following choices.\nA: [0.285, 0.124, 0.72, 0.76]\nB: [0.463, 0.321, 0.897, 0.957]\nC: [0.372, 0.103, 0.648, 0.493]\nD: [0.285, 0.124, 0.778, 0.754]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_49_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_49_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.259, 0.365, 0.986, 0.932]\nB: [0.01, 0.172, 0.737, 0.739]\nC: [0.273, 0.2, 1.0, 0.767]\nD: [0.091, 0.126, 0.818, 0.693]", + "question": "Here is an object ([0.291, 0.342, 0.989, 0.933]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.259, 0.365, 0.986, 0.932]\nB: [0.01, 0.172, 0.737, 0.739]\nC: [0.273, 0.2, 1.0, 0.767]\nD: [0.091, 0.126, 0.818, 0.693]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_50_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_50_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.152, 0.16, 0.498, 0.782]\nB: [0.28, 0.342, 0.627, 0.964]\nC: [0.582, 0.299, 0.674, 0.576]\nD: [0.314, 0.397, 0.733, 0.424]", + "question": "Here is an object ([0.072, 0.21, 0.463, 0.842]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.152, 0.16, 0.498, 0.782]\nB: [0.28, 0.342, 0.627, 0.964]\nC: [0.582, 0.299, 0.674, 0.576]\nD: [0.314, 0.397, 0.733, 0.424]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_51_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_51_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.269, 0.0, 1.13, 0.608]\nB: [0.163, 0.639, 0.546, 0.861]\nC: [0.069, 0.075, 0.8, 0.803]\nD: [0.269, 0.0, 1.0, 0.728]", + "question": "Here is an object ([0.222, 0.0, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.269, 0.0, 1.13, 0.608]\nB: [0.163, 0.639, 0.546, 0.861]\nC: [0.069, 0.075, 0.8, 0.803]\nD: [0.269, 0.0, 1.0, 0.728]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_52_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_52_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.62, 0.253, 0.788, 0.403]\nB: [0.0, 0.097, 0.658, 0.765]\nC: [0.0, 0.0, 0.658, 0.668]\nD: [0.0, 0.097, 0.552, 0.667]", + "question": "Here is an object ([0.0, 0.143, 0.421, 0.783]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.62, 0.253, 0.788, 0.403]\nB: [0.0, 0.097, 0.658, 0.765]\nC: [0.0, 0.0, 0.658, 0.668]\nD: [0.0, 0.097, 0.552, 0.667]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_53_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_53_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.582, 0.371, 1.0, 1.0]\nB: [0.544, 0.328, 0.894, 0.85]\nC: [0.205, 0.11, 0.595, 0.439]\nD: [0.544, 0.328, 0.962, 0.957]", + "question": "Here is an object ([0.555, 0.332, 0.999, 0.976]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.582, 0.371, 1.0, 1.0]\nB: [0.544, 0.328, 0.894, 0.85]\nC: [0.205, 0.11, 0.595, 0.439]\nD: [0.544, 0.328, 0.962, 0.957]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_54_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_54_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.391, 0.686, 0.805, 0.978]\nB: [0.07, 0.022, 0.341, 0.492]\nC: [0.041, 0.082, 0.181, 0.229]\nD: [0.289, 0.708, 0.703, 1.0]", + "question": "Here is an object ([0.306, 0.303, 0.735, 0.643]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.391, 0.686, 0.805, 0.978]\nB: [0.07, 0.022, 0.341, 0.492]\nC: [0.041, 0.082, 0.181, 0.229]\nD: [0.289, 0.708, 0.703, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_55_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_55_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.277, 0.308, 0.294, 0.386]\nB: [0.463, 0.725, 0.892, 0.968]\nC: [0.277, 0.274, 0.294, 0.351]\nD: [0.113, 0.29, 0.595, 0.572]", + "question": "Here is an object ([0.277, 0.307, 0.298, 0.386]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.277, 0.308, 0.294, 0.386]\nB: [0.463, 0.725, 0.892, 0.968]\nC: [0.277, 0.274, 0.294, 0.351]\nD: [0.113, 0.29, 0.595, 0.572]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_56_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_56_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.296, 0.147, 0.609, 0.639]\nB: [0.296, 0.147, 0.559, 0.657]\nC: [0.296, 0.147, 0.628, 0.557]\nD: [0.296, 0.147, 0.656, 0.542]", + "question": "Here is an object ([0.292, 0.154, 0.622, 0.629]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 960 and the height is 720.", + "context": "Select from the following choices.\nA: [0.296, 0.147, 0.609, 0.639]\nB: [0.296, 0.147, 0.559, 0.657]\nC: [0.296, 0.147, 0.628, 0.557]\nD: [0.296, 0.147, 0.656, 0.542]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_57_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_57_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.2, 0.249, 0.502, 0.331]\nB: [0.226, 0.244, 0.654, 0.808]\nC: [0.296, 0.436, 0.724, 1.0]\nD: [0.289, 0.436, 0.717, 1.0]", + "question": "Here is an object ([0.207, 0.207, 0.639, 0.775]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.2, 0.249, 0.502, 0.331]\nB: [0.226, 0.244, 0.654, 0.808]\nC: [0.296, 0.436, 0.724, 1.0]\nD: [0.289, 0.436, 0.717, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_58_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_58_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.133, 0.075, 0.634, 1.0]\nB: [0.228, 0.069, 0.748, 0.904]\nC: [0.011, 0.0, 0.512, 0.925]\nD: [0.228, 0.069, 0.73, 0.994]", + "question": "Here is an object ([0.227, 0.072, 0.729, 0.996]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.133, 0.075, 0.634, 1.0]\nB: [0.228, 0.069, 0.748, 0.904]\nC: [0.011, 0.0, 0.512, 0.925]\nD: [0.228, 0.069, 0.73, 0.994]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_59_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_59_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.616, 0.381, 0.995, 0.599]\nB: [0.62, 0.293, 1.0, 0.511]\nC: [0.62, 0.276, 1.0, 0.494]\nD: [0.616, 0.381, 0.992, 0.575]", + "question": "Here is an object ([0.579, 0.451, 0.773, 0.635]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.616, 0.381, 0.995, 0.599]\nB: [0.62, 0.293, 1.0, 0.511]\nC: [0.62, 0.276, 1.0, 0.494]\nD: [0.616, 0.381, 0.992, 0.575]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_60_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_60_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.452, 0.315, 0.981, 0.608]\nB: [0.471, 0.414, 1.0, 0.707]\nC: [0.645, 0.575, 0.883, 0.993]\nD: [0.471, 0.414, 1.002, 0.722]", + "question": "Here is an object ([0.427, 0.403, 1.0, 0.747]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.452, 0.315, 0.981, 0.608]\nB: [0.471, 0.414, 1.0, 0.707]\nC: [0.645, 0.575, 0.883, 0.993]\nD: [0.471, 0.414, 1.002, 0.722]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_61_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_61_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.473, 0.0, 0.755, 0.89]\nB: [0.411, 0.11, 0.694, 1.0]\nC: [0.411, 0.11, 0.677, 1.015]\nD: [0.411, 0.11, 0.737, 0.967]", + "question": "Here is an object ([0.456, 0.044, 0.677, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.473, 0.0, 0.755, 0.89]\nB: [0.411, 0.11, 0.694, 1.0]\nC: [0.411, 0.11, 0.677, 1.015]\nD: [0.411, 0.11, 0.737, 0.967]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_62_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_62_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.31, 0.115, 0.683, 1.0]\nB: [0.216, 0.115, 0.589, 1.0]\nC: [0.367, 0.115, 0.74, 1.0]\nD: [0.455, 0.0, 0.827, 0.885]", + "question": "Here is an object ([0.31, 0.121, 0.82, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.31, 0.115, 0.683, 1.0]\nB: [0.216, 0.115, 0.589, 1.0]\nC: [0.367, 0.115, 0.74, 1.0]\nD: [0.455, 0.0, 0.827, 0.885]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_63_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_63_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.252, 0.497, 0.337, 0.736]\nB: [0.29, 0.536, 0.373, 0.743]\nC: [0.442, 0.25, 0.609, 0.683]\nD: [0.252, 0.497, 0.334, 0.704]", + "question": "Here is an object ([0.245, 0.492, 0.323, 0.704]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.252, 0.497, 0.337, 0.736]\nB: [0.29, 0.536, 0.373, 0.743]\nC: [0.442, 0.25, 0.609, 0.683]\nD: [0.252, 0.497, 0.334, 0.704]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_64_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_64_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.389, 0.19, 0.691, 0.878]\nB: [0.389, 0.19, 0.753, 0.963]\nC: [0.442, 0.512, 0.885, 0.811]\nD: [0.063, 0.44, 0.373, 0.589]", + "question": "Here is an object ([0.397, 0.182, 0.833, 0.95]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.389, 0.19, 0.691, 0.878]\nB: [0.389, 0.19, 0.753, 0.963]\nC: [0.442, 0.512, 0.885, 0.811]\nD: [0.063, 0.44, 0.373, 0.589]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_65_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_65_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.529, 0.0, 0.904, 0.494]\nB: [0.529, 0.0, 0.939, 0.551]\nC: [0.52, 0.0, 0.93, 0.551]\nD: [0.331, 0.583, 0.744, 0.786]", + "question": "Here is an object ([0.49, 0.0, 0.793, 0.537]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.529, 0.0, 0.904, 0.494]\nB: [0.529, 0.0, 0.939, 0.551]\nC: [0.52, 0.0, 0.93, 0.551]\nD: [0.331, 0.583, 0.744, 0.786]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_66_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_66_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.217, 0.232, 0.977, 1.054]\nB: [0.217, 0.232, 1.0, 1.0]\nC: [0.457, 0.535, 0.472, 0.564]\nD: [0.217, 0.232, 1.141, 1.086]", + "question": "Here is an object ([0.18, 0.047, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.217, 0.232, 0.977, 1.054]\nB: [0.217, 0.232, 1.0, 1.0]\nC: [0.457, 0.535, 0.472, 0.564]\nD: [0.217, 0.232, 1.141, 1.086]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_67_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_67_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.53, 0.529, 0.565, 0.629]\nB: [0.536, 0.481, 0.57, 0.581]\nC: [0.53, 0.554, 0.564, 0.654]\nD: [0.514, 0.487, 0.548, 0.588]", + "question": "Here is an object ([0.497, 0.794, 0.542, 0.875]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.53, 0.529, 0.565, 0.629]\nB: [0.536, 0.481, 0.57, 0.581]\nC: [0.53, 0.554, 0.564, 0.654]\nD: [0.514, 0.487, 0.548, 0.588]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_68_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_68_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.546, 0.211, 0.739, 1.0]\nB: [0.359, 0.875, 0.488, 0.932]\nC: [0.701, 0.114, 0.773, 0.361]\nD: [0.546, 0.211, 0.75, 1.131]", + "question": "Here is an object ([0.492, 0.349, 0.672, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.546, 0.211, 0.739, 1.0]\nB: [0.359, 0.875, 0.488, 0.932]\nC: [0.701, 0.114, 0.773, 0.361]\nD: [0.546, 0.211, 0.75, 1.131]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_69_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_69_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.338, 0.222, 0.376, 0.406]\nB: [0.332, 0.132, 0.37, 0.315]\nC: [0.338, 0.222, 0.373, 0.375]\nD: [0.461, 0.596, 0.902, 0.999]", + "question": "Here is an object ([0.28, 0.2, 0.309, 0.4]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.338, 0.222, 0.376, 0.406]\nB: [0.332, 0.132, 0.37, 0.315]\nC: [0.338, 0.222, 0.373, 0.375]\nD: [0.461, 0.596, 0.902, 0.999]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_70_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_70_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.016, 0.403, 0.025, 0.493]\nB: [0.373, 0.369, 0.523, 0.775]\nC: [0.373, 0.369, 0.531, 0.815]\nD: [0.328, 0.226, 0.478, 0.632]", + "question": "Here is an object ([0.359, 0.286, 0.48, 0.728]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.016, 0.403, 0.025, 0.493]\nB: [0.373, 0.369, 0.523, 0.775]\nC: [0.373, 0.369, 0.531, 0.815]\nD: [0.328, 0.226, 0.478, 0.632]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_71_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_71_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.489, 0.412, 0.794, 0.653]\nB: [0.0, 0.0, 1.0, 1.0]\nC: [0.0, 0.0, 1.031, 1.006]\nD: [0.0, 0.0, 0.987, 1.135]", + "question": "Here is an object ([0.0, 0.0, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.489, 0.412, 0.794, 0.653]\nB: [0.0, 0.0, 1.0, 1.0]\nC: [0.0, 0.0, 1.031, 1.006]\nD: [0.0, 0.0, 0.987, 1.135]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_72_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_72_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.607, 0.454, 0.948, 0.86]\nB: [0.589, 0.282, 0.745, 0.399]\nC: [0.095, 0.358, 0.78, 1.0]\nD: [0.0, 0.358, 0.686, 1.0]", + "question": "Here is an object ([0.0, 0.194, 0.704, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.607, 0.454, 0.948, 0.86]\nB: [0.589, 0.282, 0.745, 0.399]\nC: [0.095, 0.358, 0.78, 1.0]\nD: [0.0, 0.358, 0.686, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_73_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_73_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.412, 0.367, 0.697, 0.811]\nB: [0.287, 0.19, 0.572, 0.635]\nC: [0.38, 0.192, 0.665, 0.636]\nD: [0.237, 0.568, 0.317, 0.772]", + "question": "Here is an object ([0.397, 0.174, 0.659, 0.717]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.412, 0.367, 0.697, 0.811]\nB: [0.287, 0.19, 0.572, 0.635]\nC: [0.38, 0.192, 0.665, 0.636]\nD: [0.237, 0.568, 0.317, 0.772]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_74_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_74_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.367, 0.144, 0.611, 0.964]\nB: [0.337, 0.181, 0.58, 1.0]\nC: [0.367, 0.144, 0.632, 1.079]\nD: [0.031, 0.693, 0.361, 0.975]", + "question": "Here is an object ([0.369, 0.153, 0.609, 0.965]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.367, 0.144, 0.611, 0.964]\nB: [0.337, 0.181, 0.58, 1.0]\nC: [0.367, 0.144, 0.632, 1.079]\nD: [0.031, 0.693, 0.361, 0.975]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_75_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_75_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.066, 0.762, 0.502, 0.981]\nB: [0.448, 0.158, 0.876, 0.285]\nC: [0.0, 0.782, 0.437, 1.0]\nD: [0.158, 0.832, 0.645, 0.868]", + "question": "Here is an object ([0.0, 0.443, 0.603, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.066, 0.762, 0.502, 0.981]\nB: [0.448, 0.158, 0.876, 0.285]\nC: [0.0, 0.782, 0.437, 1.0]\nD: [0.158, 0.832, 0.645, 0.868]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_76_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_76_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.232, 0.325, 0.599, 0.582]\nB: [0.321, 0.0, 0.77, 1.0]\nC: [0.286, 0.044, 0.386, 0.461]\nD: [0.321, 0.0, 0.858, 0.894]", + "question": "Here is an object ([0.394, 0.001, 0.947, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.232, 0.325, 0.599, 0.582]\nB: [0.321, 0.0, 0.77, 1.0]\nC: [0.286, 0.044, 0.386, 0.461]\nD: [0.321, 0.0, 0.858, 0.894]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_77_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_77_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.673, 0.375, 1.052, 0.929]\nB: [0.673, 0.375, 1.0, 1.0]\nC: [0.673, 0.375, 1.007, 0.979]\nD: [0.248, 0.358, 0.261, 0.589]", + "question": "Here is an object ([0.532, 0.296, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.673, 0.375, 1.052, 0.929]\nB: [0.673, 0.375, 1.0, 1.0]\nC: [0.673, 0.375, 1.007, 0.979]\nD: [0.248, 0.358, 0.261, 0.589]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_78_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_78_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.327, 0.242, 0.479, 0.401]\nB: [0.397, 0.235, 0.548, 0.394]\nC: [0.669, 0.562, 0.69, 0.632]\nD: [0.766, 0.242, 0.848, 0.442]", + "question": "Here is an object ([0.326, 0.249, 0.481, 0.422]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.327, 0.242, 0.479, 0.401]\nB: [0.397, 0.235, 0.548, 0.394]\nC: [0.669, 0.562, 0.69, 0.632]\nD: [0.766, 0.242, 0.848, 0.442]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_79_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_79_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.072, 0.064, 0.542, 0.249]\nB: [0.645, 0.408, 0.869, 0.582]\nC: [0.428, 0.0, 0.695, 0.919]\nD: [0.301, 0.0, 0.568, 0.919]", + "question": "Here is an object ([0.31, 0.074, 0.576, 0.826]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.072, 0.064, 0.542, 0.249]\nB: [0.645, 0.408, 0.869, 0.582]\nC: [0.428, 0.0, 0.695, 0.919]\nD: [0.301, 0.0, 0.568, 0.919]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_80_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_80_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.08, 0.778, 0.391, 1.0]\nB: [0.508, 0.303, 0.577, 0.432]\nC: [0.155, 0.778, 0.466, 1.0]\nD: [0.046, 0.778, 0.357, 1.0]", + "question": "Here is an object ([0.044, 0.793, 0.334, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.08, 0.778, 0.391, 1.0]\nB: [0.508, 0.303, 0.577, 0.432]\nC: [0.155, 0.778, 0.466, 1.0]\nD: [0.046, 0.778, 0.357, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_81_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_81_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.0, 0.0, 1.0, 0.999]\nB: [0.0, 0.001, 1.0, 1.0]\nC: [0.411, 0.328, 0.752, 0.585]\nD: [0.525, 0.542, 0.97, 0.881]", + "question": "Here is an object ([0.0, 0.001, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.0, 1.0, 0.999]\nB: [0.0, 0.001, 1.0, 1.0]\nC: [0.411, 0.328, 0.752, 0.585]\nD: [0.525, 0.542, 0.97, 0.881]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_82_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_82_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.304, 0.521, 0.474, 0.738]\nB: [0.18, 0.349, 0.421, 0.765]\nC: [0.18, 0.349, 0.401, 0.797]\nD: [0.282, 0.438, 0.726, 0.922]", + "question": "Here is an object ([0.183, 0.338, 0.426, 0.754]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.304, 0.521, 0.474, 0.738]\nB: [0.18, 0.349, 0.421, 0.765]\nC: [0.18, 0.349, 0.401, 0.797]\nD: [0.282, 0.438, 0.726, 0.922]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_83_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_83_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.084, 0.0, 0.97, 0.975]\nB: [0.588, 0.258, 0.977, 0.639]\nC: [0.465, 0.197, 0.775, 0.597]\nD: [0.0, 0.0, 0.886, 0.975]", + "question": "Here is an object ([0.0, 0.0, 0.884, 0.967]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.084, 0.0, 0.97, 0.975]\nB: [0.588, 0.258, 0.977, 0.639]\nC: [0.465, 0.197, 0.775, 0.597]\nD: [0.0, 0.0, 0.886, 0.975]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_84_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_84_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.096, 0.446, 1.0, 1.0]\nB: [0.089, 0.375, 0.936, 0.829]\nC: [0.089, 0.375, 0.993, 0.929]\nD: [0.096, 0.436, 1.0, 0.99]", + "question": "Here is an object ([0.084, 0.376, 0.99, 0.903]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 406 and the height is 720.", + "context": "Select from the following choices.\nA: [0.096, 0.446, 1.0, 1.0]\nB: [0.089, 0.375, 0.936, 0.829]\nC: [0.089, 0.375, 0.993, 0.929]\nD: [0.096, 0.436, 1.0, 0.99]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_85_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_85_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.121, 0.364, 0.447, 1.0]\nB: [0.17, 0.364, 0.495, 1.0]\nC: [0.26, 0.364, 0.586, 1.0]\nD: [0.149, 0.364, 0.475, 1.0]", + "question": "Here is an object ([0.291, 0.444, 0.606, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.121, 0.364, 0.447, 1.0]\nB: [0.17, 0.364, 0.495, 1.0]\nC: [0.26, 0.364, 0.586, 1.0]\nD: [0.149, 0.364, 0.475, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_86_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_86_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.687, 0.894, 0.72, 0.951]\nB: [0.17, 0.021, 0.422, 0.332]\nC: [0.263, 0.164, 0.547, 0.483]\nD: [0.263, 0.164, 0.514, 0.475]", + "question": "Here is an object ([0.247, 0.165, 0.501, 0.479]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.687, 0.894, 0.72, 0.951]\nB: [0.17, 0.021, 0.422, 0.332]\nC: [0.263, 0.164, 0.547, 0.483]\nD: [0.263, 0.164, 0.514, 0.475]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_87_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_87_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.528, 0.439, 0.573, 0.536]\nB: [0.45, 0.644, 0.761, 0.756]\nC: [0.528, 0.439, 0.577, 0.55]\nD: [0.542, 0.478, 0.588, 0.575]", + "question": "Here is an object ([0.536, 0.414, 0.573, 0.528]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 640 and the height is 360.", + "context": "Select from the following choices.\nA: [0.528, 0.439, 0.573, 0.536]\nB: [0.45, 0.644, 0.761, 0.756]\nC: [0.528, 0.439, 0.577, 0.55]\nD: [0.542, 0.478, 0.588, 0.575]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_88_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_88_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.598, 0.618, 0.884, 0.681]\nB: [0.141, 0.233, 0.448, 1.0]\nC: [0.473, 0.306, 0.578, 0.575]\nD: [0.141, 0.233, 0.455, 1.097]", + "question": "Here is an object ([0.13, 0.26, 0.435, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.598, 0.618, 0.884, 0.681]\nB: [0.141, 0.233, 0.448, 1.0]\nC: [0.473, 0.306, 0.578, 0.575]\nD: [0.141, 0.233, 0.455, 1.097]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_89_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_89_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.0, 0.25, 0.742, 1.0]\nB: [0.258, 0.14, 1.0, 0.89]\nC: [0.016, 0.108, 0.757, 0.858]\nD: [0.809, 0.283, 0.925, 0.317]", + "question": "Here is an object ([0.065, 0.108, 1.0, 0.822]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1080 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.25, 0.742, 1.0]\nB: [0.258, 0.14, 1.0, 0.89]\nC: [0.016, 0.108, 0.757, 0.858]\nD: [0.809, 0.283, 0.925, 0.317]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_90_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_90_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.508, 0.017, 0.915, 0.2]\nB: [0.0, 0.004, 0.701, 0.935]\nC: [0.0, 0.004, 0.752, 1.0]\nD: [0.248, 0.004, 1.0, 1.0]", + "question": "Here is an object ([0.0, 0.021, 0.759, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.508, 0.017, 0.915, 0.2]\nB: [0.0, 0.004, 0.701, 0.935]\nC: [0.0, 0.004, 0.752, 1.0]\nD: [0.248, 0.004, 1.0, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_91_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_91_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.518, 0.521, 0.582, 0.715]\nB: [0.512, 0.44, 0.566, 0.604]\nC: [0.518, 0.521, 0.573, 0.685]\nD: [0.518, 0.521, 0.578, 0.675]", + "question": "Here is an object ([0.504, 0.521, 0.551, 0.662]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.518, 0.521, 0.582, 0.715]\nB: [0.512, 0.44, 0.566, 0.604]\nC: [0.518, 0.521, 0.573, 0.685]\nD: [0.518, 0.521, 0.578, 0.675]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_92_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_92_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.275, 0.518, 0.76, 1.0]\nB: [0.275, 0.518, 0.763, 0.928]\nC: [0.275, 0.518, 0.738, 1.083]\nD: [0.131, 0.343, 0.616, 0.825]", + "question": "Here is an object ([0.677, 0.49, 0.845, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.275, 0.518, 0.76, 1.0]\nB: [0.275, 0.518, 0.763, 0.928]\nC: [0.275, 0.518, 0.738, 1.083]\nD: [0.131, 0.343, 0.616, 0.825]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_93_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_93_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.748, 0.189, 0.783, 0.533]\nB: [0.038, 0.267, 0.163, 0.346]\nC: [0.064, 0.235, 0.188, 0.314]\nD: [0.071, 0.322, 0.296, 0.649]", + "question": "Here is an object ([0.109, 0.24, 0.23, 0.322]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.748, 0.189, 0.783, 0.533]\nB: [0.038, 0.267, 0.163, 0.346]\nC: [0.064, 0.235, 0.188, 0.314]\nD: [0.071, 0.322, 0.296, 0.649]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_94_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_94_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.436, 0.774, 0.797, 0.901]\nB: [0.478, 0.286, 0.601, 0.464]\nC: [0.439, 0.328, 0.561, 0.506]\nD: [0.652, 0.426, 0.946, 0.767]", + "question": "Here is an object ([0.449, 0.339, 0.614, 0.582]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 960 and the height is 720.", + "context": "Select from the following choices.\nA: [0.436, 0.774, 0.797, 0.901]\nB: [0.478, 0.286, 0.601, 0.464]\nC: [0.439, 0.328, 0.561, 0.506]\nD: [0.652, 0.426, 0.946, 0.767]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_95_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_95_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.0, 0.217, 0.427, 1.0]\nB: [0.0, 0.217, 0.466, 0.968]\nC: [0.156, 0.217, 0.584, 1.0]\nD: [0.0, 0.217, 0.461, 0.944]", + "question": "Here is an object ([0.0, 0.206, 0.405, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.217, 0.427, 1.0]\nB: [0.0, 0.217, 0.466, 0.968]\nC: [0.156, 0.217, 0.584, 1.0]\nD: [0.0, 0.217, 0.461, 0.944]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_96_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_96_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.412, 0.321, 0.501, 0.774]\nB: [0.047, 0.485, 0.552, 1.0]\nC: [0.119, 0.485, 0.623, 1.0]\nD: [0.119, 0.485, 0.693, 0.969]", + "question": "Here is an object ([0.133, 0.522, 0.686, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.412, 0.321, 0.501, 0.774]\nB: [0.047, 0.485, 0.552, 1.0]\nC: [0.119, 0.485, 0.623, 1.0]\nD: [0.119, 0.485, 0.693, 0.969]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_97_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_97_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.416, 0.165, 0.517, 0.535]\nB: [0.43, 0.064, 0.532, 0.433]\nC: [0.422, 0.135, 0.523, 0.504]\nD: [0.422, 0.135, 0.505, 0.537]", + "question": "Here is an object ([0.439, 0.157, 0.559, 0.557]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.416, 0.165, 0.517, 0.535]\nB: [0.43, 0.064, 0.532, 0.433]\nC: [0.422, 0.135, 0.523, 0.504]\nD: [0.422, 0.135, 0.505, 0.537]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_98_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_98_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.14, 0.257, 0.775, 0.714]\nB: [0.066, 0.125, 0.656, 0.619]\nC: [0.14, 0.257, 0.826, 0.689]\nD: [0.14, 0.257, 0.73, 0.751]", + "question": "Here is an object ([0.154, 0.225, 0.735, 0.743]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.14, 0.257, 0.775, 0.714]\nB: [0.066, 0.125, 0.656, 0.619]\nC: [0.14, 0.257, 0.826, 0.689]\nD: [0.14, 0.257, 0.73, 0.751]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_99_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_99_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.321, 0.429, 0.502, 0.562]\nB: [0.321, 0.429, 0.493, 0.571]\nC: [0.399, 0.408, 0.58, 0.542]\nD: [0.287, 0.482, 0.467, 0.615]", + "question": "Here is an object ([0.313, 0.362, 0.605, 0.611]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.321, 0.429, 0.502, 0.562]\nB: [0.321, 0.429, 0.493, 0.571]\nC: [0.399, 0.408, 0.58, 0.542]\nD: [0.287, 0.482, 0.467, 0.615]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_100_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_100_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.145, 0.1, 1.0, 1.0]\nB: [0.81, 0.383, 0.819, 0.604]\nC: [0.145, 0.0, 1.0, 0.9]\nD: [0.145, 0.1, 0.912, 0.946]", + "question": "Here is an object ([0.15, 0.078, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.145, 0.1, 1.0, 1.0]\nB: [0.81, 0.383, 0.819, 0.604]\nC: [0.145, 0.0, 1.0, 0.9]\nD: [0.145, 0.1, 0.912, 0.946]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_101_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_101_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.077, 0.319, 0.395, 0.588]\nB: [0.563, 0.496, 0.853, 0.843]\nC: [0.576, 0.646, 0.911, 0.826]\nD: [0.498, 0.457, 0.788, 0.804]", + "question": "Here is an object ([0.535, 0.507, 0.81, 0.825]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.077, 0.319, 0.395, 0.588]\nB: [0.563, 0.496, 0.853, 0.843]\nC: [0.576, 0.646, 0.911, 0.826]\nD: [0.498, 0.457, 0.788, 0.804]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_102_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_102_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.099, 0.274, 0.831, 1.0]\nB: [0.268, 0.156, 0.948, 0.758]\nC: [0.268, 0.156, 1.018, 0.989]\nD: [0.268, 0.156, 1.0, 0.882]", + "question": "Here is an object ([0.295, 0.115, 0.986, 0.876]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 960 and the height is 720.", + "context": "Select from the following choices.\nA: [0.099, 0.274, 0.831, 1.0]\nB: [0.268, 0.156, 0.948, 0.758]\nC: [0.268, 0.156, 1.018, 0.989]\nD: [0.268, 0.156, 1.0, 0.882]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_103_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_103_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.159, 0.364, 0.504, 0.894]\nB: [0.192, 0.314, 0.537, 0.844]\nC: [0.198, 0.429, 0.423, 0.867]\nD: [0.72, 0.679, 0.87, 0.814]", + "question": "Here is an object ([0.155, 0.296, 0.512, 0.847]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.159, 0.364, 0.504, 0.894]\nB: [0.192, 0.314, 0.537, 0.844]\nC: [0.198, 0.429, 0.423, 0.867]\nD: [0.72, 0.679, 0.87, 0.814]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_104_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_104_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.63, 0.36, 0.953, 0.414]\nB: [0.463, 0.172, 0.638, 0.432]\nC: [0.355, 0.146, 0.53, 0.406]\nD: [0.409, 0.218, 0.584, 0.478]", + "question": "Here is an object ([0.372, 0.129, 0.613, 0.461]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.63, 0.36, 0.953, 0.414]\nB: [0.463, 0.172, 0.638, 0.432]\nC: [0.355, 0.146, 0.53, 0.406]\nD: [0.409, 0.218, 0.584, 0.478]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_105_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_105_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.712, 0.625, 0.841, 0.796]\nB: [0.351, 0.718, 0.397, 0.847]\nC: [0.364, 0.769, 0.41, 0.899]\nD: [0.409, 0.537, 0.505, 0.747]", + "question": "Here is an object ([0.334, 0.714, 0.382, 0.814]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.712, 0.625, 0.841, 0.796]\nB: [0.351, 0.718, 0.397, 0.847]\nC: [0.364, 0.769, 0.41, 0.899]\nD: [0.409, 0.537, 0.505, 0.747]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_106_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_106_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.322, 0.412, 0.575, 0.818]\nB: [0.242, 0.253, 0.484, 0.642]\nC: [0.322, 0.412, 0.563, 0.801]\nD: [0.306, 0.432, 0.548, 0.821]", + "question": "Here is an object ([0.298, 0.354, 0.506, 0.793]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.322, 0.412, 0.575, 0.818]\nB: [0.242, 0.253, 0.484, 0.642]\nC: [0.322, 0.412, 0.563, 0.801]\nD: [0.306, 0.432, 0.548, 0.821]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_107_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_107_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.48, 0.347, 0.798, 0.393]\nB: [0.207, 0.154, 0.544, 0.531]\nC: [0.207, 0.154, 0.597, 0.501]\nD: [0.332, 0.514, 0.696, 0.872]", + "question": "Here is an object ([0.229, 0.156, 0.602, 0.49]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.48, 0.347, 0.798, 0.393]\nB: [0.207, 0.154, 0.544, 0.531]\nC: [0.207, 0.154, 0.597, 0.501]\nD: [0.332, 0.514, 0.696, 0.872]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_108_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_108_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.222, 0.832, 0.42, 0.985]\nB: [0.277, 0.832, 0.502, 1.0]\nC: [0.222, 0.832, 0.447, 1.0]\nD: [0.222, 0.832, 0.476, 1.031]", + "question": "Here is an object ([0.0, 0.457, 0.234, 0.799]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.222, 0.832, 0.42, 0.985]\nB: [0.277, 0.832, 0.502, 1.0]\nC: [0.222, 0.832, 0.447, 1.0]\nD: [0.222, 0.832, 0.476, 1.031]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_109_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_109_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.0, 0.507, 1.0, 0.747]\nB: [0.0, 0.59, 1.0, 0.831]\nC: [0.0, 0.507, 1.165, 0.767]\nD: [0.72, 0.235, 0.856, 0.468]", + "question": "Here is an object ([0.0, 0.514, 1.0, 0.725]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 960 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.507, 1.0, 0.747]\nB: [0.0, 0.59, 1.0, 0.831]\nC: [0.0, 0.507, 1.165, 0.767]\nD: [0.72, 0.235, 0.856, 0.468]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_110_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_110_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.198, 0.206, 0.652, 0.844]\nB: [0.374, 0.235, 0.77, 0.818]\nC: [0.626, 0.379, 0.905, 0.808]\nD: [0.198, 0.206, 0.594, 0.789]", + "question": "Here is an object ([0.207, 0.212, 0.609, 0.786]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.198, 0.206, 0.652, 0.844]\nB: [0.374, 0.235, 0.77, 0.818]\nC: [0.626, 0.379, 0.905, 0.808]\nD: [0.198, 0.206, 0.594, 0.789]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_111_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_111_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.305, 0.0, 0.776, 0.582]\nB: [0.343, 0.211, 0.813, 0.793]\nC: [0.399, 0.029, 0.635, 0.49]\nD: [0.305, 0.0, 0.734, 0.481]", + "question": "Here is an object ([0.302, 0.0, 0.73, 0.333]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.305, 0.0, 0.776, 0.582]\nB: [0.343, 0.211, 0.813, 0.793]\nC: [0.399, 0.029, 0.635, 0.49]\nD: [0.305, 0.0, 0.734, 0.481]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_112_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_112_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.322, 0.349, 1.0, 0.732]\nB: [0.127, 0.397, 0.805, 0.781]\nC: [0.314, 0.597, 0.748, 0.897]\nD: [0.003, 0.468, 0.254, 0.578]", + "question": "Here is an object ([0.306, 0.381, 1.0, 0.722]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.322, 0.349, 1.0, 0.732]\nB: [0.127, 0.397, 0.805, 0.781]\nC: [0.314, 0.597, 0.748, 0.897]\nD: [0.003, 0.468, 0.254, 0.578]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_113_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_113_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.285, 0.218, 0.616, 0.626]\nB: [0.42, 0.044, 0.645, 0.375]\nC: [0.285, 0.218, 0.609, 0.671]\nD: [0.285, 0.218, 0.62, 0.713]", + "question": "Here is an object ([0.395, 0.212, 0.702, 0.669]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.285, 0.218, 0.616, 0.626]\nB: [0.42, 0.044, 0.645, 0.375]\nC: [0.285, 0.218, 0.609, 0.671]\nD: [0.285, 0.218, 0.62, 0.713]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_114_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_114_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.861, 0.214, 0.959, 0.562]\nB: [0.861, 0.214, 0.968, 0.524]\nC: [0.893, 0.111, 1.0, 0.421]\nD: [0.147, 0.603, 0.412, 0.931]", + "question": "Here is an object ([0.87, 0.222, 0.975, 0.528]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.861, 0.214, 0.959, 0.562]\nB: [0.861, 0.214, 0.968, 0.524]\nC: [0.893, 0.111, 1.0, 0.421]\nD: [0.147, 0.603, 0.412, 0.931]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_115_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_115_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.18, 0.113, 0.617, 0.567]\nB: [0.057, 0.256, 0.484, 0.771]\nC: [0.427, 0.164, 0.723, 0.478]\nD: [0.18, 0.113, 0.608, 0.628]", + "question": "Here is an object ([0.164, 0.11, 0.591, 0.624]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.18, 0.113, 0.617, 0.567]\nB: [0.057, 0.256, 0.484, 0.771]\nC: [0.427, 0.164, 0.723, 0.478]\nD: [0.18, 0.113, 0.608, 0.628]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_116_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_116_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.362, 0.272, 0.645, 0.729]\nB: [0.362, 0.272, 0.713, 0.839]\nC: [0.241, 0.231, 0.585, 0.494]\nD: [0.604, 0.682, 0.843, 0.971]", + "question": "Here is an object ([0.323, 0.211, 0.684, 0.831]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.362, 0.272, 0.645, 0.729]\nB: [0.362, 0.272, 0.713, 0.839]\nC: [0.241, 0.231, 0.585, 0.494]\nD: [0.604, 0.682, 0.843, 0.971]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_117_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_117_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.035, 0.06, 1.0, 1.0]\nB: [0.035, 0.06, 1.012, 1.072]\nC: [0.035, 0.06, 1.058, 1.111]\nD: [0.035, 0.06, 1.018, 0.933]", + "question": "Here is an object ([0.105, 0.153, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.035, 0.06, 1.0, 1.0]\nB: [0.035, 0.06, 1.012, 1.072]\nC: [0.035, 0.06, 1.058, 1.111]\nD: [0.035, 0.06, 1.018, 0.933]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_118_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_118_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.452, 0.0, 1.0, 0.858]\nB: [0.434, 0.0, 0.982, 0.858]\nC: [0.277, 0.025, 0.845, 0.994]\nD: [0.277, 0.025, 0.824, 0.883]", + "question": "Here is an object ([0.275, 0.033, 0.816, 0.889]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.452, 0.0, 1.0, 0.858]\nB: [0.434, 0.0, 0.982, 0.858]\nC: [0.277, 0.025, 0.845, 0.994]\nD: [0.277, 0.025, 0.824, 0.883]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_119_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_119_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.0, 0.267, 0.72, 0.754]\nB: [0.044, 0.375, 0.259, 0.868]\nC: [0.0, 0.267, 0.838, 0.692]\nD: [0.0, 0.239, 0.838, 0.664]", + "question": "Here is an object ([0.0, 0.268, 0.805, 0.74]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.267, 0.72, 0.754]\nB: [0.044, 0.375, 0.259, 0.868]\nC: [0.0, 0.267, 0.838, 0.692]\nD: [0.0, 0.239, 0.838, 0.664]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_120_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_120_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.0, 0.001, 0.979, 0.851]\nB: [0.0, 0.001, 1.0, 1.0]\nC: [0.0, 0.0, 1.0, 0.999]\nD: [0.0, 0.0, 1.0, 0.999]", + "question": "Here is an object ([0.302, 0.026, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 480 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.001, 0.979, 0.851]\nB: [0.0, 0.001, 1.0, 1.0]\nC: [0.0, 0.0, 1.0, 0.999]\nD: [0.0, 0.0, 1.0, 0.999]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_121_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_121_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.281, 0.379, 0.819, 0.546]\nB: [0.018, 0.447, 0.457, 0.604]\nC: [0.018, 0.447, 0.555, 0.614]\nD: [0.414, 0.225, 0.912, 0.421]", + "question": "Here is an object ([0.025, 0.489, 0.583, 0.636]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.281, 0.379, 0.819, 0.546]\nB: [0.018, 0.447, 0.457, 0.604]\nC: [0.018, 0.447, 0.555, 0.614]\nD: [0.414, 0.225, 0.912, 0.421]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_122_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_122_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.502, 0.561, 0.619, 0.736]\nB: [0.462, 0.122, 0.881, 0.621]\nC: [0.517, 0.637, 0.634, 0.812]\nD: [0.502, 0.561, 0.606, 0.724]", + "question": "Here is an object ([0.515, 0.581, 0.582, 0.721]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.502, 0.561, 0.619, 0.736]\nB: [0.462, 0.122, 0.881, 0.621]\nC: [0.517, 0.637, 0.634, 0.812]\nD: [0.502, 0.561, 0.606, 0.724]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_123_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_123_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.2, 0.082, 0.523, 1.019]\nB: [0.413, 0.683, 0.617, 0.865]\nC: [0.2, 0.082, 0.583, 1.0]\nD: [0.178, 0.69, 0.47, 0.832]", + "question": "Here is an object ([0.189, 0.138, 0.595, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.2, 0.082, 0.523, 1.019]\nB: [0.413, 0.683, 0.617, 0.865]\nC: [0.2, 0.082, 0.583, 1.0]\nD: [0.178, 0.69, 0.47, 0.832]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_124_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_124_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.36, 0.119, 0.56, 0.476]\nB: [0.095, 0.053, 0.541, 0.535]\nC: [0.36, 0.119, 0.557, 0.432]\nD: [0.36, 0.119, 0.534, 0.429]", + "question": "Here is an object ([0.371, 0.131, 0.545, 0.589]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.36, 0.119, 0.56, 0.476]\nB: [0.095, 0.053, 0.541, 0.535]\nC: [0.36, 0.119, 0.557, 0.432]\nD: [0.36, 0.119, 0.534, 0.429]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_125_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_125_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.156, 0.181, 0.855, 1.158]\nB: [0.534, 0.085, 0.951, 0.11]\nC: [0.63, 0.921, 0.958, 0.972]\nD: [0.156, 0.181, 0.795, 1.0]", + "question": "Here is an object ([0.303, 0.033, 0.923, 0.899]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.156, 0.181, 0.855, 1.158]\nB: [0.534, 0.085, 0.951, 0.11]\nC: [0.63, 0.921, 0.958, 0.972]\nD: [0.156, 0.181, 0.795, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_126_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_126_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.0, 0.142, 0.242, 0.715]\nB: [0.0, 0.142, 0.28, 0.631]\nC: [0.749, 0.119, 0.961, 0.493]\nD: [0.0, 0.142, 0.267, 0.637]", + "question": "Here is an object ([0.0, 0.143, 0.256, 0.608]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.142, 0.242, 0.715]\nB: [0.0, 0.142, 0.28, 0.631]\nC: [0.749, 0.119, 0.961, 0.493]\nD: [0.0, 0.142, 0.267, 0.637]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_127_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_127_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.131, 0.228, 0.717, 0.76]\nB: [0.219, 0.322, 0.315, 0.586]\nC: [0.131, 0.228, 0.64, 0.701]\nD: [0.648, 0.182, 0.732, 0.421]", + "question": "Here is an object ([0.113, 0.224, 0.618, 0.713]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 406 and the height is 720.", + "context": "Select from the following choices.\nA: [0.131, 0.228, 0.717, 0.76]\nB: [0.219, 0.322, 0.315, 0.586]\nC: [0.131, 0.228, 0.64, 0.701]\nD: [0.648, 0.182, 0.732, 0.421]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_128_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_128_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.531, 0.565, 0.643, 0.656]\nB: [0.008, 0.514, 0.429, 0.842]\nC: [0.531, 0.565, 0.629, 0.65]\nD: [0.077, 0.757, 0.463, 0.997]", + "question": "Here is an object ([0.548, 0.553, 0.65, 0.622]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.531, 0.565, 0.643, 0.656]\nB: [0.008, 0.514, 0.429, 0.842]\nC: [0.531, 0.565, 0.629, 0.65]\nD: [0.077, 0.757, 0.463, 0.997]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_129_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_129_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.292, 0.122, 0.642, 0.711]\nB: [0.312, 0.0, 0.662, 0.589]\nC: [0.291, 0.154, 0.641, 0.743]\nD: [0.462, 0.358, 0.812, 0.947]", + "question": "Here is an object ([0.295, 0.146, 0.641, 0.739]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.292, 0.122, 0.642, 0.711]\nB: [0.312, 0.0, 0.662, 0.589]\nC: [0.291, 0.154, 0.641, 0.743]\nD: [0.462, 0.358, 0.812, 0.947]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_130_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_130_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.416, 0.371, 0.48, 0.562]\nB: [0.404, 0.312, 0.468, 0.504]\nC: [0.241, 0.174, 0.517, 0.3]\nD: [0.426, 0.392, 0.49, 0.583]", + "question": "Here is an object ([0.431, 0.4, 0.509, 0.603]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.416, 0.371, 0.48, 0.562]\nB: [0.404, 0.312, 0.468, 0.504]\nC: [0.241, 0.174, 0.517, 0.3]\nD: [0.426, 0.392, 0.49, 0.583]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_131_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_131_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.0, 0.597, 0.102, 0.949]\nB: [0.0, 0.597, 0.112, 1.035]\nC: [0.0, 0.597, 0.096, 0.986]\nD: [0.493, 0.408, 0.527, 0.66]", + "question": "Here is an object ([0.0, 0.621, 0.077, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.597, 0.102, 0.949]\nB: [0.0, 0.597, 0.112, 1.035]\nC: [0.0, 0.597, 0.096, 0.986]\nD: [0.493, 0.408, 0.527, 0.66]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_132_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_132_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.089, 0.829, 0.118, 0.868]\nB: [0.54, 0.044, 0.668, 0.643]\nC: [0.537, 0.218, 0.666, 0.817]\nD: [0.54, 0.044, 0.655, 0.714]", + "question": "Here is an object ([0.595, 0.092, 0.691, 0.7]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.089, 0.829, 0.118, 0.868]\nB: [0.54, 0.044, 0.668, 0.643]\nC: [0.537, 0.218, 0.666, 0.817]\nD: [0.54, 0.044, 0.655, 0.714]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_133_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_133_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.395, 0.317, 0.602, 0.821]\nB: [0.504, 0.408, 0.513, 0.686]\nC: [0.484, 0.439, 0.69, 0.943]\nD: [0.313, 0.244, 0.52, 0.749]", + "question": "Here is an object ([0.429, 0.154, 0.625, 0.786]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.395, 0.317, 0.602, 0.821]\nB: [0.504, 0.408, 0.513, 0.686]\nC: [0.484, 0.439, 0.69, 0.943]\nD: [0.313, 0.244, 0.52, 0.749]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_134_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_134_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.13, 0.0, 0.852, 1.0]\nB: [0.071, 0.0, 0.793, 1.0]\nC: [0.095, 0.306, 0.59, 0.322]\nD: [0.98, 0.435, 0.996, 0.803]", + "question": "Here is an object ([0.063, 0.0, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.13, 0.0, 0.852, 1.0]\nB: [0.071, 0.0, 0.793, 1.0]\nC: [0.095, 0.306, 0.59, 0.322]\nD: [0.98, 0.435, 0.996, 0.803]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_135_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_135_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.277, 0.0, 0.519, 0.45]\nB: [0.395, 0.013, 0.637, 0.463]\nC: [0.497, 0.199, 0.843, 0.696]\nD: [0.281, 0.114, 0.523, 0.564]", + "question": "Here is an object ([0.264, 0.0, 0.491, 0.404]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.277, 0.0, 0.519, 0.45]\nB: [0.395, 0.013, 0.637, 0.463]\nC: [0.497, 0.199, 0.843, 0.696]\nD: [0.281, 0.114, 0.523, 0.564]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_136_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_136_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.901, 0.401, 0.985, 1.051]\nB: [0.901, 0.401, 1.0, 1.0]\nC: [0.504, 0.157, 0.877, 0.589]\nD: [0.901, 0.206, 1.0, 0.804]", + "question": "Here is an object ([0.934, 0.432, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.901, 0.401, 0.985, 1.051]\nB: [0.901, 0.401, 1.0, 1.0]\nC: [0.504, 0.157, 0.877, 0.589]\nD: [0.901, 0.206, 1.0, 0.804]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_137_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_137_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.0, 0.267, 0.299, 0.561]\nB: [0.0, 0.267, 0.309, 0.537]\nC: [0.0, 0.267, 0.323, 0.568]\nD: [0.0, 0.171, 0.323, 0.472]", + "question": "Here is an object ([0.0, 0.246, 0.424, 0.611]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.267, 0.299, 0.561]\nB: [0.0, 0.267, 0.309, 0.537]\nC: [0.0, 0.267, 0.323, 0.568]\nD: [0.0, 0.171, 0.323, 0.472]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_138_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_138_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.0, 0.0, 0.606, 1.0]\nB: [0.502, 0.601, 0.622, 0.924]\nC: [0.287, 0.311, 0.747, 0.39]\nD: [0.0, 0.0, 0.535, 1.157]", + "question": "Here is an object ([0.0, 0.0, 0.923, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.0, 0.606, 1.0]\nB: [0.502, 0.601, 0.622, 0.924]\nC: [0.287, 0.311, 0.747, 0.39]\nD: [0.0, 0.0, 0.535, 1.157]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_139_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_139_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.05, 0.728, 0.195, 0.956]\nB: [0.193, 0.054, 0.217, 0.426]\nC: [0.434, 0.371, 0.787, 1.0]\nD: [0.519, 0.371, 0.872, 1.0]", + "question": "Here is an object ([0.529, 0.507, 0.775, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.05, 0.728, 0.195, 0.956]\nB: [0.193, 0.054, 0.217, 0.426]\nC: [0.434, 0.371, 0.787, 1.0]\nD: [0.519, 0.371, 0.872, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_140_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_140_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.409, 0.479, 0.546, 0.554]\nB: [0.409, 0.479, 0.537, 0.55]\nC: [0.429, 0.487, 0.557, 0.558]\nD: [0.409, 0.479, 0.516, 0.56]", + "question": "Here is an object ([0.455, 0.471, 0.564, 0.543]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.409, 0.479, 0.546, 0.554]\nB: [0.409, 0.479, 0.537, 0.55]\nC: [0.429, 0.487, 0.557, 0.558]\nD: [0.409, 0.479, 0.516, 0.56]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_141_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_141_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.514, 0.244, 0.854, 0.649]\nB: [0.601, 0.221, 1.0, 0.662]\nC: [0.514, 0.244, 0.913, 0.686]\nD: [0.601, 0.308, 1.0, 0.75]", + "question": "Here is an object ([0.589, 0.235, 0.943, 0.722]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.514, 0.244, 0.854, 0.649]\nB: [0.601, 0.221, 1.0, 0.662]\nC: [0.514, 0.244, 0.913, 0.686]\nD: [0.601, 0.308, 1.0, 0.75]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_142_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_142_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.488, 0.207, 0.569, 0.358]\nB: [0.469, 0.228, 0.549, 0.379]\nC: [0.432, 0.458, 0.816, 0.517]\nD: [0.019, 0.432, 0.448, 0.564]", + "question": "Here is an object ([0.496, 0.242, 0.566, 0.381]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.488, 0.207, 0.569, 0.358]\nB: [0.469, 0.228, 0.549, 0.379]\nC: [0.432, 0.458, 0.816, 0.517]\nD: [0.019, 0.432, 0.448, 0.564]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_143_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_143_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.0, 0.161, 0.049, 0.542]\nB: [0.699, 0.242, 0.79, 0.568]\nC: [0.0, 0.099, 0.049, 0.479]\nD: [0.0, 0.101, 0.049, 0.482]", + "question": "Here is an object ([0.0, 0.094, 0.1, 0.554]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.161, 0.049, 0.542]\nB: [0.699, 0.242, 0.79, 0.568]\nC: [0.0, 0.099, 0.049, 0.479]\nD: [0.0, 0.101, 0.049, 0.482]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_144_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_144_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.377, 0.336, 1.024, 0.835]\nB: [0.377, 0.336, 0.956, 0.956]\nC: [0.377, 0.336, 1.061, 1.003]\nD: [0.101, 0.381, 0.68, 1.0]", + "question": "Here is an object ([0.433, 0.271, 0.981, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.377, 0.336, 1.024, 0.835]\nB: [0.377, 0.336, 0.956, 0.956]\nC: [0.377, 0.336, 1.061, 1.003]\nD: [0.101, 0.381, 0.68, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_145_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_145_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.364, 0.487, 0.55, 0.693]\nB: [0.364, 0.487, 0.529, 0.668]\nC: [0.273, 0.447, 0.459, 0.653]\nD: [0.378, 0.558, 0.564, 0.764]", + "question": "Here is an object ([0.342, 0.415, 0.542, 0.607]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.364, 0.487, 0.55, 0.693]\nB: [0.364, 0.487, 0.529, 0.668]\nC: [0.273, 0.447, 0.459, 0.653]\nD: [0.378, 0.558, 0.564, 0.764]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_146_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_146_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.116, 0.26, 0.833, 0.936]\nB: [0.116, 0.26, 0.734, 1.0]\nC: [0.0, 0.26, 0.619, 1.0]\nD: [0.116, 0.626, 0.322, 0.66]", + "question": "Here is an object ([0.113, 0.256, 0.725, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.116, 0.26, 0.833, 0.936]\nB: [0.116, 0.26, 0.734, 1.0]\nC: [0.0, 0.26, 0.619, 1.0]\nD: [0.116, 0.626, 0.322, 0.66]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_147_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_147_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.409, 0.407, 0.471, 0.524]\nB: [0.402, 0.449, 0.465, 0.565]\nC: [0.404, 0.357, 0.466, 0.474]\nD: [0.137, 0.357, 0.261, 0.697]", + "question": "Here is an object ([0.479, 0.539, 0.527, 0.662]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.409, 0.407, 0.471, 0.524]\nB: [0.402, 0.449, 0.465, 0.565]\nC: [0.404, 0.357, 0.466, 0.474]\nD: [0.137, 0.357, 0.261, 0.697]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_148_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_148_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.453, 0.503, 0.507, 0.681]\nB: [0.128, 0.867, 0.552, 0.899]\nC: [0.276, 0.35, 0.747, 0.397]\nD: [0.453, 0.503, 0.503, 0.706]", + "question": "Here is an object ([0.487, 0.506, 0.544, 0.672]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.453, 0.503, 0.507, 0.681]\nB: [0.128, 0.867, 0.552, 0.899]\nC: [0.276, 0.35, 0.747, 0.397]\nD: [0.453, 0.503, 0.503, 0.706]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_149_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_149_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.22, 0.242, 0.862, 0.635]\nB: [0.161, 0.114, 0.634, 0.354]\nC: [0.562, 0.422, 0.925, 0.835]\nD: [0.359, 0.388, 1.0, 0.781]", + "question": "Here is an object ([0.209, 0.215, 0.863, 0.618]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.22, 0.242, 0.862, 0.635]\nB: [0.161, 0.114, 0.634, 0.354]\nC: [0.562, 0.422, 0.925, 0.835]\nD: [0.359, 0.388, 1.0, 0.781]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_150_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_150_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.285, 0.511, 1.0, 0.756]\nB: [0.606, 0.539, 0.62, 0.972]\nC: [0.22, 0.585, 0.935, 0.829]\nD: [0.285, 0.511, 1.085, 0.719]", + "question": "Here is an object ([0.435, 0.412, 1.0, 0.749]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.285, 0.511, 1.0, 0.756]\nB: [0.606, 0.539, 0.62, 0.972]\nC: [0.22, 0.585, 0.935, 0.829]\nD: [0.285, 0.511, 1.085, 0.719]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_151_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_151_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.209, 0.343, 0.797, 0.886]\nB: [0.028, 0.369, 0.616, 0.912]\nC: [0.0, 0.146, 0.588, 0.689]\nD: [0.337, 0.056, 0.549, 0.196]", + "question": "Here is an object ([0.021, 0.375, 0.605, 0.915]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.209, 0.343, 0.797, 0.886]\nB: [0.028, 0.369, 0.616, 0.912]\nC: [0.0, 0.146, 0.588, 0.689]\nD: [0.337, 0.056, 0.549, 0.196]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_152_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_152_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.36, 0.068, 0.727, 0.486]\nB: [0.048, 0.221, 0.545, 0.911]\nC: [0.116, 0.31, 0.613, 1.0]\nD: [0.116, 0.31, 0.68, 1.039]", + "question": "Here is an object ([0.116, 0.312, 0.606, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.36, 0.068, 0.727, 0.486]\nB: [0.048, 0.221, 0.545, 0.911]\nC: [0.116, 0.31, 0.613, 1.0]\nD: [0.116, 0.31, 0.68, 1.039]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_153_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_153_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.251, 0.228, 1.0, 1.0]\nB: [0.0, 0.0, 0.749, 0.772]\nC: [0.0, 0.228, 0.749, 1.0]\nD: [0.0, 0.113, 0.749, 0.885]", + "question": "Here is an object ([0.0, 0.119, 0.75, 0.885]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 960 and the height is 720.", + "context": "Select from the following choices.\nA: [0.251, 0.228, 1.0, 1.0]\nB: [0.0, 0.0, 0.749, 0.772]\nC: [0.0, 0.228, 0.749, 1.0]\nD: [0.0, 0.113, 0.749, 0.885]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_154_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_154_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.084, 0.149, 0.391, 0.438]\nB: [0.0, 0.071, 0.836, 1.133]\nC: [0.0, 0.071, 0.905, 1.0]\nD: [0.095, 0.0, 1.0, 0.929]", + "question": "Here is an object ([0.0, 0.001, 0.894, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.084, 0.149, 0.391, 0.438]\nB: [0.0, 0.071, 0.836, 1.133]\nC: [0.0, 0.071, 0.905, 1.0]\nD: [0.095, 0.0, 1.0, 0.929]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_155_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_155_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.066, 0.203, 0.189, 0.603]\nB: [0.204, 0.146, 0.611, 1.0]\nC: [0.03, 0.146, 0.437, 1.0]\nD: [0.03, 0.146, 0.445, 0.939]", + "question": "Here is an object ([0.034, 0.21, 0.511, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.066, 0.203, 0.189, 0.603]\nB: [0.204, 0.146, 0.611, 1.0]\nC: [0.03, 0.146, 0.437, 1.0]\nD: [0.03, 0.146, 0.445, 0.939]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_156_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_156_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.3, 0.251, 0.613, 1.0]\nB: [0.39, 0.0, 0.712, 0.697]\nC: [0.708, 0.621, 0.739, 0.844]\nD: [0.39, 0.0, 0.703, 0.749]", + "question": "Here is an object ([0.242, 0.0, 0.613, 0.656]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.3, 0.251, 0.613, 1.0]\nB: [0.39, 0.0, 0.712, 0.697]\nC: [0.708, 0.621, 0.739, 0.844]\nD: [0.39, 0.0, 0.703, 0.749]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_157_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_157_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.408, 0.212, 0.655, 0.739]\nB: [0.17, 0.383, 0.197, 0.639]\nC: [0.408, 0.212, 0.661, 0.754]\nD: [0.408, 0.212, 0.665, 0.856]", + "question": "Here is an object ([0.403, 0.207, 0.651, 0.767]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.408, 0.212, 0.655, 0.739]\nB: [0.17, 0.383, 0.197, 0.639]\nC: [0.408, 0.212, 0.661, 0.754]\nD: [0.408, 0.212, 0.665, 0.856]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_158_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_158_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.594, 0.279, 0.91, 0.968]\nB: [0.486, 0.013, 0.765, 0.59]\nC: [0.446, 0.122, 0.805, 0.543]\nD: [0.594, 0.279, 0.872, 0.857]", + "question": "Here is an object ([0.596, 0.289, 0.867, 0.853]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1270 and the height is 720.", + "context": "Select from the following choices.\nA: [0.594, 0.279, 0.91, 0.968]\nB: [0.486, 0.013, 0.765, 0.59]\nC: [0.446, 0.122, 0.805, 0.543]\nD: [0.594, 0.279, 0.872, 0.857]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_159_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_159_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.002, 0.087, 0.223, 0.472]\nB: [0.194, 0.114, 0.683, 0.775]\nC: [0.069, 0.221, 0.233, 0.621]\nD: [0.179, 0.339, 0.668, 1.0]", + "question": "Here is an object ([0.228, 0.0, 0.719, 0.607]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.002, 0.087, 0.223, 0.472]\nB: [0.194, 0.114, 0.683, 0.775]\nC: [0.069, 0.221, 0.233, 0.621]\nD: [0.179, 0.339, 0.668, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_160_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_160_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.459, 0.085, 0.476, 0.549]\nB: [0.248, 0.667, 0.747, 0.828]\nC: [0.512, 0.371, 0.652, 0.542]\nD: [0.512, 0.371, 0.626, 0.522]", + "question": "Here is an object ([0.509, 0.357, 0.635, 0.535]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.459, 0.085, 0.476, 0.549]\nB: [0.248, 0.667, 0.747, 0.828]\nC: [0.512, 0.371, 0.652, 0.542]\nD: [0.512, 0.371, 0.626, 0.522]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_161_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_161_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.32, 0.046, 0.584, 0.879]\nB: [0.177, 0.0, 0.491, 0.917]\nC: [0.494, 0.643, 0.716, 0.814]\nD: [0.32, 0.046, 0.634, 0.963]", + "question": "Here is an object ([0.324, 0.046, 0.635, 0.968]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.32, 0.046, 0.584, 0.879]\nB: [0.177, 0.0, 0.491, 0.917]\nC: [0.494, 0.643, 0.716, 0.814]\nD: [0.32, 0.046, 0.634, 0.963]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_162_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_162_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.43, 0.485, 0.783, 0.656]\nB: [0.502, 0.41, 0.579, 0.64]\nC: [0.463, 0.338, 0.54, 0.568]\nD: [0.488, 0.294, 0.566, 0.525]", + "question": "Here is an object ([0.476, 0.335, 0.562, 0.568]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.43, 0.485, 0.783, 0.656]\nB: [0.502, 0.41, 0.579, 0.64]\nC: [0.463, 0.338, 0.54, 0.568]\nD: [0.488, 0.294, 0.566, 0.525]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_163_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_163_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.461, 0.357, 0.537, 0.714]\nB: [0.461, 0.357, 0.526, 0.771]\nC: [0.095, 0.572, 0.489, 0.808]\nD: [0.465, 0.401, 0.541, 0.758]", + "question": "Here is an object ([0.466, 0.358, 0.545, 0.706]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.461, 0.357, 0.537, 0.714]\nB: [0.461, 0.357, 0.526, 0.771]\nC: [0.095, 0.572, 0.489, 0.808]\nD: [0.465, 0.401, 0.541, 0.758]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_164_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_164_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.442, 0.604, 0.598, 0.832]\nB: [0.49, 0.487, 0.658, 0.771]\nC: [0.442, 0.604, 0.61, 0.887]\nD: [0.409, 0.69, 0.577, 0.974]", + "question": "Here is an object ([0.455, 0.621, 0.626, 0.886]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.442, 0.604, 0.598, 0.832]\nB: [0.49, 0.487, 0.658, 0.771]\nC: [0.442, 0.604, 0.61, 0.887]\nD: [0.409, 0.69, 0.577, 0.974]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_165_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_165_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.416, 0.133, 0.897, 0.514]\nB: [0.416, 0.133, 0.995, 0.537]\nC: [0.433, 0.497, 0.685, 0.806]\nD: [0.421, 0.0, 1.0, 0.404]", + "question": "Here is an object ([0.436, 0.083, 0.995, 0.561]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 406 and the height is 720.", + "context": "Select from the following choices.\nA: [0.416, 0.133, 0.897, 0.514]\nB: [0.416, 0.133, 0.995, 0.537]\nC: [0.433, 0.497, 0.685, 0.806]\nD: [0.421, 0.0, 1.0, 0.404]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_166_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_166_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.228, 0.108, 0.396, 0.479]\nB: [0.171, 0.0, 0.923, 0.742]\nC: [0.171, 0.093, 1.0, 0.824]\nD: [0.171, 0.0, 1.0, 0.731]", + "question": "Here is an object ([0.165, 0.0, 1.0, 0.726]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.228, 0.108, 0.396, 0.479]\nB: [0.171, 0.0, 0.923, 0.742]\nC: [0.171, 0.093, 1.0, 0.824]\nD: [0.171, 0.0, 1.0, 0.731]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_167_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_167_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.074, 0.186, 0.488, 1.0]\nB: [0.058, 0.151, 0.472, 0.965]\nC: [0.159, 0.186, 0.639, 0.935]\nD: [0.159, 0.186, 0.573, 1.0]", + "question": "Here is an object ([0.179, 0.022, 0.554, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.074, 0.186, 0.488, 1.0]\nB: [0.058, 0.151, 0.472, 0.965]\nC: [0.159, 0.186, 0.639, 0.935]\nD: [0.159, 0.186, 0.573, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_168_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_168_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.0, 0.287, 0.342, 0.665]\nB: [0.078, 0.428, 0.42, 0.806]\nC: [0.34, 0.412, 0.643, 0.438]\nD: [0.0, 0.287, 0.341, 0.682]", + "question": "Here is an object ([0.0, 0.297, 0.397, 0.665]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.287, 0.342, 0.665]\nB: [0.078, 0.428, 0.42, 0.806]\nC: [0.34, 0.412, 0.643, 0.438]\nD: [0.0, 0.287, 0.341, 0.682]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_169_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_169_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.464, 0.276, 0.727, 1.0]\nB: [0.464, 0.276, 0.745, 0.993]\nC: [0.517, 0.276, 0.78, 1.0]\nD: [0.464, 0.276, 0.692, 0.875]", + "question": "Here is an object ([0.455, 0.276, 0.688, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.464, 0.276, 0.727, 1.0]\nB: [0.464, 0.276, 0.745, 0.993]\nC: [0.517, 0.276, 0.78, 1.0]\nD: [0.464, 0.276, 0.692, 0.875]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_170_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_170_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.534, 0.237, 0.687, 0.515]\nB: [0.534, 0.237, 0.662, 0.522]\nC: [0.534, 0.237, 0.641, 0.497]\nD: [0.499, 0.261, 0.628, 0.546]", + "question": "Here is an object ([0.58, 0.235, 0.755, 0.518]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.534, 0.237, 0.687, 0.515]\nB: [0.534, 0.237, 0.662, 0.522]\nC: [0.534, 0.237, 0.641, 0.497]\nD: [0.499, 0.261, 0.628, 0.546]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_171_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_171_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.081, 0.196, 0.637, 1.131]\nB: [0.081, 0.196, 0.748, 1.113]\nC: [0.081, 0.196, 0.658, 0.994]\nD: [0.611, 0.761, 0.737, 0.843]", + "question": "Here is an object ([0.136, 0.15, 0.672, 0.881]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.081, 0.196, 0.637, 1.131]\nB: [0.081, 0.196, 0.748, 1.113]\nC: [0.081, 0.196, 0.658, 0.994]\nD: [0.611, 0.761, 0.737, 0.843]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_172_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_172_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.23, 0.069, 0.931, 1.0]\nB: [0.23, 0.069, 0.792, 1.121]\nC: [0.218, 0.069, 0.919, 1.0]\nD: [0.457, 0.265, 0.69, 0.581]", + "question": "Here is an object ([0.231, 0.124, 0.86, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.23, 0.069, 0.931, 1.0]\nB: [0.23, 0.069, 0.792, 1.121]\nC: [0.218, 0.069, 0.919, 1.0]\nD: [0.457, 0.265, 0.69, 0.581]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_173_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_173_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.159, 0.225, 0.294, 0.533]\nB: [0.218, 0.453, 0.636, 0.631]\nC: [0.292, 0.406, 0.459, 0.643]\nD: [0.292, 0.406, 0.456, 0.7]", + "question": "Here is an object ([0.29, 0.426, 0.471, 0.7]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.159, 0.225, 0.294, 0.533]\nB: [0.218, 0.453, 0.636, 0.631]\nC: [0.292, 0.406, 0.459, 0.643]\nD: [0.292, 0.406, 0.456, 0.7]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_174_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_174_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.143, 0.454, 0.249, 0.654]\nB: [0.077, 0.669, 0.136, 0.985]\nC: [0.145, 0.525, 0.252, 0.725]\nD: [0.143, 0.454, 0.266, 0.657]", + "question": "Here is an object ([0.12, 0.461, 0.237, 0.653]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.143, 0.454, 0.249, 0.654]\nB: [0.077, 0.669, 0.136, 0.985]\nC: [0.145, 0.525, 0.252, 0.725]\nD: [0.143, 0.454, 0.266, 0.657]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_175_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_175_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.602, 0.0, 0.946, 0.739]\nB: [0.468, 0.376, 0.48, 0.842]\nC: [0.44, 0.261, 0.783, 1.0]\nD: [0.393, 0.261, 0.736, 1.0]", + "question": "Here is an object ([0.446, 0.211, 0.622, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 960 and the height is 720.", + "context": "Select from the following choices.\nA: [0.602, 0.0, 0.946, 0.739]\nB: [0.468, 0.376, 0.48, 0.842]\nC: [0.44, 0.261, 0.783, 1.0]\nD: [0.393, 0.261, 0.736, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_176_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_176_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.0, 0.001, 0.722, 1.126]\nB: [0.0, 0.001, 0.598, 1.193]\nC: [0.0, 0.001, 0.724, 0.999]\nD: [0.0, 0.001, 0.738, 1.196]", + "question": "Here is an object ([0.0, 0.0, 0.755, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.001, 0.722, 1.126]\nB: [0.0, 0.001, 0.598, 1.193]\nC: [0.0, 0.001, 0.724, 0.999]\nD: [0.0, 0.001, 0.738, 1.196]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_177_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_177_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.183, 0.761, 0.235, 0.919]\nB: [0.683, 0.257, 0.857, 0.718]\nC: [0.351, 0.0, 1.0, 1.0]\nD: [0.351, 0.0, 0.877, 0.803]", + "question": "Here is an object ([0.313, 0.0, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.183, 0.761, 0.235, 0.919]\nB: [0.683, 0.257, 0.857, 0.718]\nC: [0.351, 0.0, 1.0, 1.0]\nD: [0.351, 0.0, 0.877, 0.803]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_178_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_178_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.334, 0.014, 1.103, 1.108]\nB: [0.2, 0.281, 0.454, 0.629]\nC: [0.334, 0.014, 0.926, 0.993]\nD: [0.334, 0.014, 1.0, 1.0]", + "question": "Here is an object ([0.235, 0.001, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.334, 0.014, 1.103, 1.108]\nB: [0.2, 0.281, 0.454, 0.629]\nC: [0.334, 0.014, 0.926, 0.993]\nD: [0.334, 0.014, 1.0, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_179_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_179_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.356, 0.011, 0.83, 0.357]\nB: [0.183, 0.207, 0.581, 1.011]\nC: [0.183, 0.207, 0.68, 0.996]\nD: [0.183, 0.207, 0.616, 1.11]", + "question": "Here is an object ([0.211, 0.165, 0.67, 0.982]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.356, 0.011, 0.83, 0.357]\nB: [0.183, 0.207, 0.581, 1.011]\nC: [0.183, 0.207, 0.68, 0.996]\nD: [0.183, 0.207, 0.616, 1.11]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_180_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_180_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.479, 0.108, 0.884, 0.665]\nB: [0.552, 0.097, 0.956, 0.654]\nC: [0.317, 0.204, 0.722, 0.761]\nD: [0.479, 0.108, 0.859, 0.699]", + "question": "Here is an object ([0.457, 0.218, 0.777, 0.725]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.479, 0.108, 0.884, 0.665]\nB: [0.552, 0.097, 0.956, 0.654]\nC: [0.317, 0.204, 0.722, 0.761]\nD: [0.479, 0.108, 0.859, 0.699]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_181_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_181_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.603, 0.522, 0.715, 0.79]\nB: [0.531, 0.461, 0.641, 0.671]\nC: [0.523, 0.396, 0.632, 0.606]\nD: [0.537, 0.519, 0.702, 0.668]", + "question": "Here is an object ([0.584, 0.392, 0.634, 0.551]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.603, 0.522, 0.715, 0.79]\nB: [0.531, 0.461, 0.641, 0.671]\nC: [0.523, 0.396, 0.632, 0.606]\nD: [0.537, 0.519, 0.702, 0.668]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_182_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_182_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.0, 0.013, 0.871, 1.0]\nB: [0.129, 0.013, 1.047, 0.982]\nC: [0.696, 0.489, 0.793, 0.943]\nD: [0.129, 0.013, 1.0, 1.0]", + "question": "Here is an object ([0.059, 0.0, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.013, 0.871, 1.0]\nB: [0.129, 0.013, 1.047, 0.982]\nC: [0.696, 0.489, 0.793, 0.943]\nD: [0.129, 0.013, 1.0, 1.0]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_183_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_183_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.42, 0.268, 0.625, 0.971]\nB: [0.42, 0.268, 0.636, 0.778]\nC: [0.42, 0.268, 0.66, 0.865]\nD: [0.42, 0.268, 0.639, 0.919]", + "question": "Here is an object ([0.411, 0.272, 0.654, 0.865]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 960 and the height is 720.", + "context": "Select from the following choices.\nA: [0.42, 0.268, 0.625, 0.971]\nB: [0.42, 0.268, 0.636, 0.778]\nC: [0.42, 0.268, 0.66, 0.865]\nD: [0.42, 0.268, 0.639, 0.919]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_184_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_184_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.174, 0.0, 0.691, 0.558]\nB: [0.483, 0.21, 1.0, 0.768]\nC: [0.382, 0.046, 0.899, 0.604]\nD: [0.432, 0.364, 0.76, 0.779]", + "question": "Here is an object ([0.384, 0.018, 0.968, 0.479]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.174, 0.0, 0.691, 0.558]\nB: [0.483, 0.21, 1.0, 0.768]\nC: [0.382, 0.046, 0.899, 0.604]\nD: [0.432, 0.364, 0.76, 0.779]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_185_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_185_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.595, 0.536, 0.865, 0.782]\nB: [0.595, 0.536, 0.829, 0.744]\nC: [0.074, 0.478, 0.275, 0.861]\nD: [0.059, 0.325, 0.287, 0.339]", + "question": "Here is an object ([0.487, 0.554, 0.705, 0.758]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.595, 0.536, 0.865, 0.782]\nB: [0.595, 0.536, 0.829, 0.744]\nC: [0.074, 0.478, 0.275, 0.861]\nD: [0.059, 0.325, 0.287, 0.339]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_186_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_186_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.384, 0.524, 0.459, 0.842]\nB: [0.0, 0.0, 0.77, 0.999]\nC: [0.126, 0.49, 0.423, 0.603]\nD: [0.0, 0.0, 0.685, 0.894]", + "question": "Here is an object ([0.0, 0.0, 0.784, 0.999]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.384, 0.524, 0.459, 0.842]\nB: [0.0, 0.0, 0.77, 0.999]\nC: [0.126, 0.49, 0.423, 0.603]\nD: [0.0, 0.0, 0.685, 0.894]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_187_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_187_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.0, 0.046, 0.939, 0.84]\nB: [0.057, 0.29, 0.25, 0.646]\nC: [0.578, 0.11, 0.852, 0.163]\nD: [0.0, 0.046, 0.89, 0.84]", + "question": "Here is an object ([0.0, 0.001, 0.961, 0.874]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.046, 0.939, 0.84]\nB: [0.057, 0.29, 0.25, 0.646]\nC: [0.578, 0.11, 0.852, 0.163]\nD: [0.0, 0.046, 0.89, 0.84]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_188_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_188_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.334, 0.31, 0.566, 0.938]\nB: [0.275, 0.312, 0.504, 1.0]\nC: [0.334, 0.31, 0.563, 0.997]\nD: [0.591, 0.644, 0.888, 0.765]", + "question": "Here is an object ([0.262, 0.143, 0.509, 0.997]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.334, 0.31, 0.566, 0.938]\nB: [0.275, 0.312, 0.504, 1.0]\nC: [0.334, 0.31, 0.563, 0.997]\nD: [0.591, 0.644, 0.888, 0.765]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_189_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_189_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.0, 0.565, 0.12, 0.9]\nB: [0.0, 0.565, 0.126, 0.917]\nC: [0.055, 0.589, 0.181, 0.94]\nD: [0.825, 0.094, 0.94, 0.535]", + "question": "Here is an object ([0.0, 0.05, 1.0, 0.86]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.0, 0.565, 0.12, 0.9]\nB: [0.0, 0.565, 0.126, 0.917]\nC: [0.055, 0.589, 0.181, 0.94]\nD: [0.825, 0.094, 0.94, 0.535]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_190_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_190_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.644, 0.44, 0.805, 0.861]\nB: [0.587, 0.544, 0.748, 0.965]\nC: [0.644, 0.44, 0.811, 0.821]\nD: [0.644, 0.44, 0.801, 0.908]", + "question": "Here is an object ([0.572, 0.41, 0.747, 0.842]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.644, 0.44, 0.805, 0.861]\nB: [0.587, 0.544, 0.748, 0.965]\nC: [0.644, 0.44, 0.811, 0.821]\nD: [0.644, 0.44, 0.801, 0.908]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_191_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_191_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.6, 0.292, 0.677, 0.412]\nB: [0.747, 0.479, 0.991, 1.056]\nC: [0.747, 0.479, 1.0, 1.0]\nD: [0.042, 0.16, 0.117, 0.547]", + "question": "Here is an object ([0.755, 0.472, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.6, 0.292, 0.677, 0.412]\nB: [0.747, 0.479, 0.991, 1.056]\nC: [0.747, 0.479, 1.0, 1.0]\nD: [0.042, 0.16, 0.117, 0.547]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_192_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_192_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.268, 0.356, 0.665, 1.0]\nB: [0.384, 0.329, 0.781, 0.974]\nC: [0.5, 0.258, 0.897, 0.903]\nD: [0.466, 0.153, 0.863, 0.797]", + "question": "Here is an object ([0.386, 0.329, 0.791, 0.968]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.268, 0.356, 0.665, 1.0]\nB: [0.384, 0.329, 0.781, 0.974]\nC: [0.5, 0.258, 0.897, 0.903]\nD: [0.466, 0.153, 0.863, 0.797]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_193_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_193_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.243, 0.956, 0.284, 0.975]\nB: [0.382, 0.301, 0.875, 0.646]\nC: [0.382, 0.301, 1.019, 0.606]\nD: [0.382, 0.301, 0.919, 0.646]", + "question": "Here is an object ([0.411, 0.268, 0.728, 0.903]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 960 and the height is 720.", + "context": "Select from the following choices.\nA: [0.243, 0.956, 0.284, 0.975]\nB: [0.382, 0.301, 0.875, 0.646]\nC: [0.382, 0.301, 1.019, 0.606]\nD: [0.382, 0.301, 0.919, 0.646]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_194_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_194_1.jpg" + ], + "output": "D" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.388, 0.347, 0.992, 0.839]\nB: [0.388, 0.347, 0.977, 0.91]\nC: [0.477, 0.579, 0.912, 0.9]\nD: [0.388, 0.347, 1.089, 0.938]", + "question": "Here is an object ([0.386, 0.367, 0.98, 0.921]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.388, 0.347, 0.992, 0.839]\nB: [0.388, 0.347, 0.977, 0.91]\nC: [0.477, 0.579, 0.912, 0.9]\nD: [0.388, 0.347, 1.089, 0.938]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_195_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_195_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.228, 0.114, 0.254, 0.601]\nB: [0.327, 0.138, 1.0, 1.0]\nC: [0.327, 0.138, 1.021, 0.939]\nD: [0.327, 0.0, 1.0, 0.863]", + "question": "Here is an object ([0.332, 0.122, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.228, 0.114, 0.254, 0.601]\nB: [0.327, 0.138, 1.0, 1.0]\nC: [0.327, 0.138, 1.021, 0.939]\nD: [0.327, 0.0, 1.0, 0.863]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_196_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_196_1.jpg" + ], + "output": "B" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.426, 0.447, 0.531, 0.756]\nB: [0.426, 0.447, 0.534, 0.776]\nC: [0.426, 0.447, 0.53, 0.783]\nD: [0.867, 0.138, 0.923, 0.214]", + "question": "Here is an object ([0.431, 0.433, 0.585, 0.769]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.426, 0.447, 0.531, 0.756]\nB: [0.426, 0.447, 0.534, 0.776]\nC: [0.426, 0.447, 0.53, 0.783]\nD: [0.867, 0.138, 0.923, 0.214]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_197_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_197_1.jpg" + ], + "output": "C" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "youtubevis2019_sot", + "options": "A: [0.341, 0.492, 0.753, 0.8]\nB: [0.152, 0.436, 0.563, 0.744]\nC: [0.168, 0.04, 0.502, 0.061]\nD: [0.593, 0.619, 0.761, 0.656]", + "question": "Here is an object ([0.366, 0.504, 0.786, 0.806]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.341, 0.492, 0.753, 0.8]\nB: [0.152, 0.436, 0.563, 0.744]\nC: [0.168, 0.04, 0.502, 0.061]\nD: [0.593, 0.619, 0.761, 0.656]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_198_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_198_1.jpg" + ], + "output": "A" + }, + { + "task": "single_object_tracking", + "visual_input_component": "['synthetic_image']", + "source": "ovis_sot", + "options": "A: [0.049, 0.143, 0.895, 0.719]\nB: [0.049, 0.143, 0.806, 0.606]\nC: [0.049, 0.143, 0.788, 0.667]\nD: [0.246, 0.05, 0.512, 0.212]", + "question": "Here is an object ([0.056, 0.144, 0.791, 0.665]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", + "context": "Select from the following choices.\nA: [0.049, 0.143, 0.895, 0.719]\nB: [0.049, 0.143, 0.806, 0.606]\nC: [0.049, 0.143, 0.788, 0.667]\nD: [0.246, 0.05, 0.512, 0.212]", + "input_image_path": [ + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_199_0.jpg", + "../MMIU-Benchmark/single_object_tracking/single_object_tracking_199_1.jpg" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: read a book\nB: drink water\nC: ride a bike\nD: play guitar", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: read a book\nB: drink water\nC: ride a bike\nD: play guitar", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_0_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_0_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_0_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_0_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_0_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_0_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_0_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_0_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_0_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_0_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_0_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_0_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: running\nB: sitting down\nC: lying down\nD: standing up", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: running\nB: sitting down\nC: lying down\nD: standing up", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_1_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_1_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_1_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_1_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_1_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_1_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_1_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_1_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_1_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_1_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_1_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_1_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: lying down\nB: standing up\nC: sitting down\nD: jumping", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: lying down\nB: standing up\nC: sitting down\nD: jumping", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_2_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_2_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_2_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_2_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_2_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_2_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_2_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_2_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_2_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_2_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_2_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_2_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: ride bicycle\nB: play guitar\nC: write letter\nD: eat meal", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride bicycle\nB: play guitar\nC: write letter\nD: eat meal", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_3_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_3_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_3_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_3_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_3_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_3_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_3_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_3_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_3_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_3_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_3_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_3_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sit\nB: jump\nC: pickup\nD: run", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit\nB: jump\nC: pickup\nD: run", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_4_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_4_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_4_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_4_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_4_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_4_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_4_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_4_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_4_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_4_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_4_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_4_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: make a phone call\nB: play a guitar\nC: ride a bicycle\nD: drink a coffee", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: make a phone call\nB: play a guitar\nC: ride a bicycle\nD: drink a coffee", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_5_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_5_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_5_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_5_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_5_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_5_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_5_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_5_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_5_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_5_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_5_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_5_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: read book\nB: play piano\nC: jog\nD: eat meal", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: read book\nB: play piano\nC: jog\nD: eat meal", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_6_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_6_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_6_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_6_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_6_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_6_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_6_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_6_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_6_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_6_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_6_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_6_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: pickup\nB: sit\nC: run\nD: jump", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: pickup\nB: sit\nC: run\nD: jump", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_7_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_7_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_7_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_7_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_7_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_7_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_7_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_7_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_7_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_7_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_7_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_7_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: running\nB: sleeping\nC: dancing\nD: reading", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: running\nB: sleeping\nC: dancing\nD: reading", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_8_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_8_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_8_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_8_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_8_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_8_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_8_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_8_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_8_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_8_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_8_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_8_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: ride a bicycle\nB: make a phone call\nC: cook a meal\nD: play a piano", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride a bicycle\nB: make a phone call\nC: cook a meal\nD: play a piano", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_9_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_9_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_9_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_9_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_9_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_9_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_9_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_9_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_9_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_9_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_9_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_9_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: read a book\nB: tie shoelaces\nC: check time (from watch)\nD: wave hand", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: read a book\nB: tie shoelaces\nC: check time (from watch)\nD: wave hand", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_10_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_10_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_10_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_10_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_10_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_10_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_10_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_10_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_10_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_10_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_10_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_10_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: raise hand (greeting)\nB: touch chest (stomachache\nC: tie shoelaces (preparing to run)\nD: clap hands (applause)", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: raise hand (greeting)\nB: touch chest (stomachache\nC: tie shoelaces (preparing to run)\nD: clap hands (applause)", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_11_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_11_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_11_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_11_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_11_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_11_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_11_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_11_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_11_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_11_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_11_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_11_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: pickup\nB: sit\nC: jump\nD: run", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: pickup\nB: sit\nC: jump\nD: run", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_12_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_12_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_12_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_12_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_12_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_12_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_12_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_12_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_12_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_12_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_12_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_12_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: reading a book\nB: cooking a meal\nC: writing a letter\nD: brushing teeth", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: reading a book\nB: cooking a meal\nC: writing a letter\nD: brushing teeth", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_13_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_13_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_13_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_13_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_13_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_13_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_13_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_13_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_13_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_13_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_13_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_13_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sit down\nB: jump\nC: wave hand\nD: touch chest (stomachache", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit down\nB: jump\nC: wave hand\nD: touch chest (stomachache", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_14_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_14_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_14_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_14_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_14_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_14_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_14_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_14_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_14_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_14_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_14_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_14_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: standing up\nB: jumping\nC: running\nD: sitting down", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: standing up\nB: jumping\nC: running\nD: sitting down", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_15_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_15_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_15_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_15_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_15_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_15_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_15_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_15_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_15_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_15_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_15_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_15_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: pickup\nB: run\nC: sit down\nD: jump", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: pickup\nB: run\nC: sit down\nD: jump", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_16_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_16_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_16_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_16_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_16_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_16_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_16_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_16_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_16_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_16_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_16_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_16_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: jogging\nB: brushing teeth\nC: eating\nD: reading a book", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jogging\nB: brushing teeth\nC: eating\nD: reading a book", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_17_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_17_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_17_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_17_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_17_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_17_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_17_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_17_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_17_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_17_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_17_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_17_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: eat sandwich\nB: read book\nC: ride bicycle\nD: wear jacket", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat sandwich\nB: read book\nC: ride bicycle\nD: wear jacket", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_18_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_18_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_18_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_18_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_18_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_18_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_18_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_18_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_18_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_18_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_18_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_18_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: take off a hat\nB: tie shoelaces\nC: put on a hat\nD: put on gloves", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: take off a hat\nB: tie shoelaces\nC: put on a hat\nD: put on gloves", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_19_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_19_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_19_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_19_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_19_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_19_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_19_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_19_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_19_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_19_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_19_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_19_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: ride bicycle\nB: wear jacket\nC: read book\nD: cook dinner", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride bicycle\nB: wear jacket\nC: read book\nD: cook dinner", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_20_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_20_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_20_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_20_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_20_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_20_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_20_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_20_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_20_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_20_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_20_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_20_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: ride a bicycle\nB: tie a shoelace\nC: drink water\nD: read a book", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride a bicycle\nB: tie a shoelace\nC: drink water\nD: read a book", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_21_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_21_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_21_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_21_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_21_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_21_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_21_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_21_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_21_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_21_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_21_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_21_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sitting down\nB: jumping\nC: standing up\nD: lying down", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: jumping\nC: standing up\nD: lying down", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_22_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_22_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_22_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_22_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_22_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_22_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_22_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_22_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_22_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_22_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_22_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_22_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: drink water\nB: read a book\nC: tie shoes\nD: climb stairs", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drink water\nB: read a book\nC: tie shoes\nD: climb stairs", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_23_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_23_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_23_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_23_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_23_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_23_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_23_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_23_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_23_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_23_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_23_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_23_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: play a guitar\nB: drink water\nC: ride a bike\nD: write a note", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play a guitar\nB: drink water\nC: ride a bike\nD: write a note", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_24_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_24_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_24_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_24_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_24_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_24_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_24_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_24_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_24_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_24_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_24_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_24_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: take off a hat\nB: put on a hat\nC: pick up a book\nD: tie shoelaces", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: take off a hat\nB: put on a hat\nC: pick up a book\nD: tie shoelaces", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_25_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_25_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_25_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_25_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_25_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_25_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_25_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_25_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_25_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_25_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_25_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_25_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: ride bike\nB: read book\nC: play guitar\nD: eat meal", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride bike\nB: read book\nC: play guitar\nD: eat meal", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_26_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_26_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_26_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_26_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_26_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_26_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_26_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_26_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_26_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_26_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_26_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_26_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: touch chest (stomachache\nB: throw a ball\nC: jump up\nD: tie shoelaces", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: touch chest (stomachache\nB: throw a ball\nC: jump up\nD: tie shoelaces", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_27_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_27_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_27_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_27_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_27_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_27_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_27_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_27_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_27_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_27_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_27_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_27_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: wave\nB: sit down\nC: jump\nD: pickup", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave\nB: sit down\nC: jump\nD: pickup", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_28_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_28_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_28_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_28_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_28_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_28_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_28_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_28_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_28_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_28_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_28_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_28_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: eat sandwich\nB: ride bicycle\nC: wear jacket\nD: play guitar", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat sandwich\nB: ride bicycle\nC: wear jacket\nD: play guitar", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_29_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_29_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_29_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_29_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_29_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_29_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_29_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_29_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_29_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_29_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_29_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_29_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: dancing\nB: reading\nC: sleeping\nD: cooking", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: dancing\nB: reading\nC: sleeping\nD: cooking", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_30_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_30_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_30_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_30_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_30_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_30_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_30_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_30_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_30_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_30_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_30_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_30_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: brushing teeth\nB: washing face\nC: brushing hair\nD: combing hair", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: brushing teeth\nB: washing face\nC: brushing hair\nD: combing hair", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_31_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_31_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_31_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_31_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_31_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_31_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_31_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_31_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_31_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_31_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_31_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_31_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: tie shoelaces\nB: check time (from watch)\nC: drink water\nD: read a book", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoelaces\nB: check time (from watch)\nC: drink water\nD: read a book", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_32_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_32_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_32_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_32_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_32_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_32_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_32_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_32_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_32_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_32_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_32_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_32_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: jump up\nB: touch chest (stomachache\nC: wave hand\nD: sit down", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump up\nB: touch chest (stomachache\nC: wave hand\nD: sit down", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_33_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_33_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_33_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_33_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_33_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_33_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_33_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_33_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_33_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_33_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_33_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_33_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: read a book\nB: tie shoelaces\nC: eat an apple\nD: check time (from watch)", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: read a book\nB: tie shoelaces\nC: eat an apple\nD: check time (from watch)", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_34_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_34_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_34_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_34_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_34_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_34_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_34_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_34_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_34_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_34_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_34_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_34_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: run\nB: drop\nC: jump\nD: sit", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: run\nB: drop\nC: jump\nD: sit", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_35_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_35_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_35_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_35_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_35_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_35_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_35_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_35_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_35_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_35_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_35_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_35_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: riding a bike\nB: baking a cake\nC: brushing teeth\nD: playing a guitar", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: riding a bike\nB: baking a cake\nC: brushing teeth\nD: playing a guitar", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_36_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_36_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_36_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_36_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_36_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_36_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_36_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_36_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_36_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_36_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_36_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_36_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: write on a board\nB: tie shoelaces\nC: check time (from watch)\nD: drink water", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: write on a board\nB: tie shoelaces\nC: check time (from watch)\nD: drink water", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_37_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_37_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_37_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_37_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_37_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_37_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_37_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_37_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_37_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_37_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_37_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_37_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: touch chest (stomachache\nB: clapping hands\nC: tying shoes\nD: jumping in place", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: touch chest (stomachache\nB: clapping hands\nC: tying shoes\nD: jumping in place", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_38_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_38_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_38_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_38_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_38_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_38_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_38_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_38_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_38_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_38_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_38_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_38_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sitting down\nB: standing up\nC: jumping\nD: running", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: standing up\nC: jumping\nD: running", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_39_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_39_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_39_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_39_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_39_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_39_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_39_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_39_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_39_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_39_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_39_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_39_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: play guitar\nB: run\nC: sleep\nD: eat meal", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play guitar\nB: run\nC: sleep\nD: eat meal", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_40_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_40_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_40_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_40_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_40_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_40_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_40_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_40_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_40_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_40_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_40_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_40_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: play piano\nB: eat meal\nC: paint picture\nD: ride bicycle", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play piano\nB: eat meal\nC: paint picture\nD: ride bicycle", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_41_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_41_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_41_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_41_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_41_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_41_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_41_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_41_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_41_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_41_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_41_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_41_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: cooking\nB: reading\nC: dancing\nD: running", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: cooking\nB: reading\nC: dancing\nD: running", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_42_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_42_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_42_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_42_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_42_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_42_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_42_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_42_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_42_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_42_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_42_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_42_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: jumping\nB: sitting down\nC: lying down\nD: standing up", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jumping\nB: sitting down\nC: lying down\nD: standing up", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_43_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_43_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_43_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_43_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_43_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_43_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_43_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_43_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_43_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_43_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_43_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_43_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: ride a bike\nB: eat a sandwich\nC: make a phone call\nD: tie a shoe", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride a bike\nB: eat a sandwich\nC: make a phone call\nD: tie a shoe", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_44_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_44_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_44_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_44_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_44_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_44_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_44_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_44_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_44_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_44_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_44_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_44_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: jump\nB: pickup\nC: sit down\nD: wave", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump\nB: pickup\nC: sit down\nD: wave", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_45_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_45_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_45_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_45_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_45_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_45_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_45_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_45_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_45_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_45_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_45_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_45_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: playing guitar\nB: tieing shoes\nC: drinking water\nD: brushing teeth", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: playing guitar\nB: tieing shoes\nC: drinking water\nD: brushing teeth", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_46_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_46_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_46_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_46_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_46_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_46_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_46_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_46_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_46_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_46_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_46_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_46_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: paint a picture\nB: eat meal\nC: run a marathon\nD: play a musical instrument", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: paint a picture\nB: eat meal\nC: run a marathon\nD: play a musical instrument", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_47_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_47_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_47_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_47_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_47_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_47_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_47_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_47_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_47_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_47_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_47_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_47_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: wave hand\nB: check time (from watch)\nC: tie shoelaces\nD: drink water", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave hand\nB: check time (from watch)\nC: tie shoelaces\nD: drink water", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_48_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_48_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_48_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_48_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_48_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_48_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_48_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_48_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_48_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_48_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_48_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_48_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: brushing teeth\nB: tying shoes\nC: cooking food\nD: watering plants", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: brushing teeth\nB: tying shoes\nC: cooking food\nD: watering plants", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_49_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_49_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_49_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_49_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_49_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_49_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_49_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_49_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_49_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_49_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_49_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_49_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: put on a hat\nB: take off a hat\nC: button a shirt\nD: tie a shoelace", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: put on a hat\nB: take off a hat\nC: button a shirt\nD: tie a shoelace", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_50_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_50_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_50_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_50_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_50_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_50_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_50_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_50_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_50_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_50_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_50_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_50_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: tie a shoe\nB: make a phone call\nC: play a guitar\nD: cook a meal", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie a shoe\nB: make a phone call\nC: play a guitar\nD: cook a meal", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_51_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_51_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_51_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_51_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_51_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_51_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_51_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_51_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_51_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_51_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_51_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_51_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: drop\nB: jump\nC: run\nD: sit", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drop\nB: jump\nC: run\nD: sit", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_52_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_52_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_52_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_52_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_52_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_52_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_52_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_52_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_52_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_52_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_52_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_52_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: wave hand\nB: scratch head\nC: touch chest (stomachache\nD: jump up and down", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave hand\nB: scratch head\nC: touch chest (stomachache\nD: jump up and down", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_53_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_53_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_53_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_53_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_53_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_53_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_53_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_53_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_53_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_53_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_53_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_53_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: lying down\nB: standing up\nC: running\nD: sitting down", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: lying down\nB: standing up\nC: running\nD: sitting down", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_54_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_54_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_54_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_54_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_54_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_54_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_54_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_54_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_54_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_54_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_54_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_54_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: pickup\nB: run\nC: jump\nD: sit", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: pickup\nB: run\nC: jump\nD: sit", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_55_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_55_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_55_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_55_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_55_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_55_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_55_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_55_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_55_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_55_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_55_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_55_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: wipe face\nB: snap fingers\nC: brush hair\nD: tie shoelace", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wipe face\nB: snap fingers\nC: brush hair\nD: tie shoelace", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_56_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_56_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_56_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_56_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_56_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_56_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_56_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_56_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_56_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_56_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_56_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_56_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sleeping\nB: dancing\nC: reading\nD: running", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sleeping\nB: dancing\nC: reading\nD: running", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_57_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_57_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_57_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_57_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_57_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_57_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_57_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_57_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_57_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_57_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_57_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_57_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: wear jacket\nB: sit down\nC: jump\nD: tie shoelaces", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wear jacket\nB: sit down\nC: jump\nD: tie shoelaces", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_58_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_58_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_58_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_58_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_58_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_58_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_58_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_58_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_58_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_58_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_58_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_58_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sitting down\nB: jumping\nC: standing up\nD: lying down", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: jumping\nC: standing up\nD: lying down", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_59_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_59_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_59_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_59_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_59_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_59_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_59_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_59_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_59_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_59_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_59_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_59_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: ride a bike\nB: read a book\nC: eat meal\nD: play a musical instrument", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride a bike\nB: read a book\nC: eat meal\nD: play a musical instrument", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_60_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_60_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_60_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_60_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_60_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_60_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_60_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_60_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_60_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_60_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_60_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_60_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: touch back (backache)\nB: clap hands\nC: sit down\nD: jump", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: touch back (backache)\nB: clap hands\nC: sit down\nD: jump", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_61_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_61_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_61_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_61_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_61_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_61_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_61_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_61_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_61_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_61_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_61_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_61_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sitting down\nB: lying down\nC: standing up\nD: jumping", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: lying down\nC: standing up\nD: jumping", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_62_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_62_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_62_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_62_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_62_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_62_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_62_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_62_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_62_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_62_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_62_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_62_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: eat meal\nB: write letter\nC: ride bicycle\nD: play guitar", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat meal\nB: write letter\nC: ride bicycle\nD: play guitar", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_63_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_63_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_63_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_63_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_63_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_63_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_63_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_63_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_63_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_63_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_63_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_63_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: wave hand\nB: pick up a book\nC: wipe face\nD: tie shoelaces", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave hand\nB: pick up a book\nC: wipe face\nD: tie shoelaces", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_64_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_64_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_64_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_64_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_64_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_64_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_64_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_64_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_64_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_64_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_64_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_64_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: read a book\nB: play a guitar\nC: make a phone call\nD: cook a meal", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: read a book\nB: play a guitar\nC: make a phone call\nD: cook a meal", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_65_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_65_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_65_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_65_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_65_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_65_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_65_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_65_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_65_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_65_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_65_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_65_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: read a book\nB: make a phone call\nC: eat a meal\nD: play a video game", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: read a book\nB: make a phone call\nC: eat a meal\nD: play a video game", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_66_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_66_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_66_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_66_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_66_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_66_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_66_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_66_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_66_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_66_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_66_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_66_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: drop\nB: jump\nC: pick\nD: hold", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drop\nB: jump\nC: pick\nD: hold", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_67_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_67_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_67_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_67_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_67_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_67_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_67_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_67_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_67_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_67_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_67_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_67_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: climb ladder\nB: kick ball\nC: tie shoe\nD: wipe face", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: climb ladder\nB: kick ball\nC: tie shoe\nD: wipe face", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_68_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_68_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_68_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_68_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_68_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_68_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_68_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_68_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_68_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_68_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_68_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_68_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: dancing\nB: reading\nC: cooking\nD: sleeping", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: dancing\nB: reading\nC: cooking\nD: sleeping", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_69_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_69_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_69_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_69_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_69_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_69_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_69_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_69_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_69_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_69_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_69_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_69_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sitting down\nB: running\nC: standing up\nD: jumping", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: running\nC: standing up\nD: jumping", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_70_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_70_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_70_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_70_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_70_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_70_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_70_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_70_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_70_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_70_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_70_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_70_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sleeping\nB: dancing\nC: cooking\nD: reading", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sleeping\nB: dancing\nC: cooking\nD: reading", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_71_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_71_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_71_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_71_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_71_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_71_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_71_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_71_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_71_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_71_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_71_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_71_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: running\nB: jumping\nC: sitting down\nD: standing up", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: running\nB: jumping\nC: sitting down\nD: standing up", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_72_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_72_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_72_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_72_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_72_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_72_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_72_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_72_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_72_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_72_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_72_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_72_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: drop\nB: sit\nC: run\nD: jump", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drop\nB: sit\nC: run\nD: jump", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_73_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_73_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_73_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_73_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_73_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_73_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_73_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_73_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_73_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_73_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_73_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_73_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: brushing teeth\nB: riding a bicycle\nC: cooking dinner\nD: tying shoelaces", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: brushing teeth\nB: riding a bicycle\nC: cooking dinner\nD: tying shoelaces", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_74_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_74_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_74_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_74_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_74_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_74_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_74_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_74_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_74_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_74_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_74_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_74_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: wave hand\nB: tie shoelace\nC: clap hands\nD: wipe face", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave hand\nB: tie shoelace\nC: clap hands\nD: wipe face", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_75_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_75_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_75_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_75_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_75_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_75_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_75_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_75_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_75_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_75_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_75_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_75_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: clap hands\nB: wave hand\nC: wipe face\nD: tie shoelace", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: clap hands\nB: wave hand\nC: wipe face\nD: tie shoelace", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_76_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_76_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_76_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_76_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_76_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_76_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_76_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_76_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_76_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_76_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_76_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_76_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: dancing\nB: sleeping\nC: reading\nD: cooking", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: dancing\nB: sleeping\nC: reading\nD: cooking", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_77_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_77_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_77_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_77_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_77_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_77_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_77_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_77_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_77_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_77_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_77_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_77_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: ride bicycle\nB: play guitar\nC: climb ladder\nD: drink water", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride bicycle\nB: play guitar\nC: climb ladder\nD: drink water", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_78_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_78_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_78_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_78_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_78_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_78_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_78_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_78_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_78_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_78_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_78_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_78_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sleeping\nB: reading\nC: dancing\nD: running", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sleeping\nB: reading\nC: dancing\nD: running", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_79_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_79_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_79_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_79_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_79_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_79_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_79_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_79_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_79_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_79_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_79_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_79_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: wipe face\nB: tie shoelaces\nC: brush hair\nD: write a note", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wipe face\nB: tie shoelaces\nC: brush hair\nD: write a note", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_80_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_80_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_80_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_80_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_80_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_80_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_80_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_80_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_80_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_80_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_80_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_80_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: play a guitar\nB: read a book\nC: tie shoelaces\nD: check time (from watch)", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play a guitar\nB: read a book\nC: tie shoelaces\nD: check time (from watch)", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_81_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_81_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_81_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_81_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_81_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_81_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_81_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_81_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_81_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_81_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_81_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_81_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: put on a hat\nB: open a door\nC: tie shoelaces\nD: take off a hat", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: put on a hat\nB: open a door\nC: tie shoelaces\nD: take off a hat", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_82_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_82_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_82_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_82_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_82_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_82_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_82_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_82_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_82_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_82_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_82_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_82_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: eat food\nB: brush hair\nC: wipe face\nD: tie shoelaces", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat food\nB: brush hair\nC: wipe face\nD: tie shoelaces", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_83_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_83_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_83_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_83_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_83_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_83_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_83_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_83_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_83_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_83_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_83_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_83_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: standing up\nB: jumping\nC: sitting down\nD: running", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: standing up\nB: jumping\nC: sitting down\nD: running", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_84_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_84_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_84_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_84_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_84_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_84_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_84_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_84_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_84_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_84_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_84_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_84_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: adjust glasses\nB: check time (from watch)\nC: wave hand\nD: tie shoelaces", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: adjust glasses\nB: check time (from watch)\nC: wave hand\nD: tie shoelaces", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_85_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_85_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_85_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_85_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_85_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_85_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_85_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_85_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_85_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_85_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_85_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_85_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: reading\nB: swimming\nC: cooking\nD: dancing", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: reading\nB: swimming\nC: cooking\nD: dancing", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_86_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_86_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_86_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_86_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_86_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_86_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_86_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_86_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_86_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_86_9.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sit\nB: run\nC: drop\nD: jump", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit\nB: run\nC: drop\nD: jump", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_87_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_87_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_87_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_87_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_87_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_87_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_87_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_87_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_87_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_87_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_87_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_87_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: jump\nB: wave hand\nC: tie shoelaces\nD: wipe face", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump\nB: wave hand\nC: tie shoelaces\nD: wipe face", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_88_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_88_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_88_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_88_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_88_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_88_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_88_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_88_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_88_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_88_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_88_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_88_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: make a phone call\nB: eat a meal\nC: play a musical instrument\nD: tie a shoelace", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: make a phone call\nB: eat a meal\nC: play a musical instrument\nD: tie a shoelace", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_89_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_89_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_89_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_89_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_89_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_89_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_89_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_89_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_89_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_89_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_89_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_89_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: drop\nB: sit\nC: jump\nD: run", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drop\nB: sit\nC: jump\nD: run", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_90_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_90_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_90_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_90_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_90_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_90_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_90_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_90_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_90_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_90_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_90_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_90_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: dancing\nB: reading\nC: jumping\nD: running", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: dancing\nB: reading\nC: jumping\nD: running", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_91_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_91_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_91_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_91_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_91_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_91_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_91_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_91_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_91_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_91_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_91_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_91_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: bake a cake\nB: play a guitar\nC: ride a bike\nD: wear jacket", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: bake a cake\nB: play a guitar\nC: ride a bike\nD: wear jacket", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_92_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_92_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_92_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_92_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_92_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_92_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_92_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_92_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_92_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_92_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_92_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_92_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: jumping\nB: touch back (backache)\nC: running\nD: sitting", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jumping\nB: touch back (backache)\nC: running\nD: sitting", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_93_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_93_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_93_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_93_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_93_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_93_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_93_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_93_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_93_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_93_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_93_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_93_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: jumping\nB: sitting down\nC: running\nD: standing up", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jumping\nB: sitting down\nC: running\nD: standing up", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_94_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_94_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_94_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_94_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_94_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_94_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_94_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_94_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_94_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_94_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_94_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_94_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: write on blackboard\nB: touch chest (stomachache\nC: jump\nD: sit down", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: write on blackboard\nB: touch chest (stomachache\nC: jump\nD: sit down", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_95_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_95_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_95_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_95_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_95_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_95_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_95_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_95_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_95_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_95_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_95_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_95_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: remove jacket\nB: tie shoelaces\nC: wear jacket\nD: sit down", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: remove jacket\nB: tie shoelaces\nC: wear jacket\nD: sit down", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_96_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_96_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_96_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_96_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_96_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_96_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_96_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_96_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_96_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_96_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_96_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_96_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: put on a coat\nB: take off a hat\nC: put on a hat\nD: tie shoelaces", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: put on a coat\nB: take off a hat\nC: put on a hat\nD: tie shoelaces", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_97_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_97_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_97_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_97_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_97_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_97_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_97_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_97_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_97_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_97_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_97_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_97_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: jumping\nB: lying down\nC: sitting down\nD: standing up", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jumping\nB: lying down\nC: sitting down\nD: standing up", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_98_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_98_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_98_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_98_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_98_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_98_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_98_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_98_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_98_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_98_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_98_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_98_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: play guitar\nB: tie shoelaces\nC: cook meal\nD: wear jacket", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play guitar\nB: tie shoelaces\nC: cook meal\nD: wear jacket", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_99_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_99_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_99_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_99_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_99_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_99_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_99_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_99_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_99_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_99_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_99_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_99_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sit\nB: jump\nC: drop\nD: run", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit\nB: jump\nC: drop\nD: run", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_100_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_100_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_100_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_100_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_100_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_100_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_100_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_100_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_100_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_100_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_100_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_100_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: standing up\nB: jumping\nC: sitting down\nD: lying down", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: standing up\nB: jumping\nC: sitting down\nD: lying down", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_101_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_101_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_101_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_101_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_101_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_101_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_101_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_101_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_101_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_101_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_101_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_101_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: brushing teeth\nB: playing basketball\nC: dancing\nD: cooking", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: brushing teeth\nB: playing basketball\nC: dancing\nD: cooking", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_102_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_102_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_102_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_102_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_102_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_102_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_102_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_102_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_102_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_102_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_102_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_102_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: jump\nB: sit\nC: run\nD: bow", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump\nB: sit\nC: run\nD: bow", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_103_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_103_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_103_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_103_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_103_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_103_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_103_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_103_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_103_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_103_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_103_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_103_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: clap\nB: drop\nC: run\nD: jump", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: clap\nB: drop\nC: run\nD: jump", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_104_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_104_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_104_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_104_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_104_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_104_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_104_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_104_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_104_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_104_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_104_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_104_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: remove shoes\nB: wear shoes\nC: remove jacket\nD: wear jacket", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: remove shoes\nB: wear shoes\nC: remove jacket\nD: wear jacket", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_105_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_105_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_105_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_105_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_105_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_105_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_105_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_105_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_105_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_105_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_105_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_105_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: tie a shoelace\nB: eat a sandwich\nC: put on a hat\nD: throw a ball", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie a shoelace\nB: eat a sandwich\nC: put on a hat\nD: throw a ball", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_106_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_106_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_106_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_106_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_106_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_106_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_106_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_106_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_106_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_106_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_106_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_106_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: check time (from watch)\nB: drink water\nC: tie shoelace\nD: wave hand", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: check time (from watch)\nB: drink water\nC: tie shoelace\nD: wave hand", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_107_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_107_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_107_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_107_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_107_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_107_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_107_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_107_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_107_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_107_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_107_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_107_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sit down\nB: jump up\nC: take off hat\nD: wear jacket", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit down\nB: jump up\nC: take off hat\nD: wear jacket", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_108_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_108_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_108_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_108_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_108_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_108_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_108_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_108_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_108_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_108_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_108_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_108_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: wave hand\nB: clap hands\nC: tie shoe\nD: wipe face", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave hand\nB: clap hands\nC: tie shoe\nD: wipe face", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_109_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_109_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_109_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_109_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_109_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_109_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_109_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_109_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_109_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_109_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_109_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_109_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: kick a ball\nB: take off a hat\nC: wave a hand\nD: put on a hat", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: kick a ball\nB: take off a hat\nC: wave a hand\nD: put on a hat", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_110_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_110_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_110_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_110_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_110_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_110_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_110_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_110_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_110_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_110_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_110_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_110_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: tie a shoelace\nB: put on a hat\nC: button a shirt\nD: take off a hat", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie a shoelace\nB: put on a hat\nC: button a shirt\nD: take off a hat", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_111_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_111_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_111_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_111_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_111_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_111_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_111_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_111_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_111_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_111_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_111_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_111_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: drink water\nB: play guitar\nC: tie shoelaces\nD: check time (from watch)", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drink water\nB: play guitar\nC: tie shoelaces\nD: check time (from watch)", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_112_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_112_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_112_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_112_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_112_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_112_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_112_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_112_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_112_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_112_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_112_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_112_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: play football\nB: ride a bike\nC: read a book\nD: eat meal", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play football\nB: ride a bike\nC: read a book\nD: eat meal", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_113_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_113_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_113_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_113_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_113_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_113_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_113_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_113_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_113_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_113_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_113_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_113_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sit\nB: run\nC: jump\nD: drop", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit\nB: run\nC: jump\nD: drop", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_114_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_114_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_114_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_114_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_114_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_114_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_114_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_114_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_114_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_114_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_114_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_114_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: put on glasses\nB: tie a shoelace\nC: take off a hat\nD: put on a hat", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: put on glasses\nB: tie a shoelace\nC: take off a hat\nD: put on a hat", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_115_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_115_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_115_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_115_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_115_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_115_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_115_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_115_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_115_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_115_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_115_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_115_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: tie a shoelace\nB: drink water\nC: ride a bicycle\nD: read a book", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie a shoelace\nB: drink water\nC: ride a bicycle\nD: read a book", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_116_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_116_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_116_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_116_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_116_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_116_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_116_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_116_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_116_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_116_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_116_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_116_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: dance\nB: pickup\nC: sleep\nD: basketball", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: dance\nB: pickup\nC: sleep\nD: basketball", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_117_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_117_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_117_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_117_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_117_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_117_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_117_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_117_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_117_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_117_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_117_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_117_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: eat meal\nB: play guitar\nC: write letter\nD: ride bicycle", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat meal\nB: play guitar\nC: write letter\nD: ride bicycle", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_118_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_118_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_118_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_118_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_118_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_118_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_118_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_118_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_118_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_118_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_118_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_118_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: drink water\nB: read a book\nC: play basketball\nD: ride a bicycle", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drink water\nB: read a book\nC: play basketball\nD: ride a bicycle", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_119_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_119_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_119_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_119_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_119_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_119_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_119_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_119_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_119_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_119_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_119_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_119_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: tie shoe\nB: wave hand\nC: check time (from watch)\nD: pick up phone", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoe\nB: wave hand\nC: check time (from watch)\nD: pick up phone", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_120_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_120_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_120_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_120_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_120_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_120_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_120_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_120_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_120_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_120_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_120_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_120_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: tie a shoelace\nB: put on a hat\nC: adjust a scarf\nD: take off a hat", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie a shoelace\nB: put on a hat\nC: adjust a scarf\nD: take off a hat", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_121_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_121_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_121_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_121_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_121_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_121_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_121_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_121_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_121_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_121_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_121_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_121_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sitting down\nB: lying down\nC: standing up\nD: running", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: lying down\nC: standing up\nD: running", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_122_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_122_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_122_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_122_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_122_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_122_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_122_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_122_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_122_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_122_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_122_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_122_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: read a book\nB: drink water\nC: play guitar\nD: ride a bike", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: read a book\nB: drink water\nC: play guitar\nD: ride a bike", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_123_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_123_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_123_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_123_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_123_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_123_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_123_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_123_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_123_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_123_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_123_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_123_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: tie shoelaces\nB: check time (from watch)\nC: brush hair\nD: eat food", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoelaces\nB: check time (from watch)\nC: brush hair\nD: eat food", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_124_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_124_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_124_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_124_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_124_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_124_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_124_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_124_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_124_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_124_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_124_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_124_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: wave hand\nB: eat\nC: scratch head\nD: wipe face", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave hand\nB: eat\nC: scratch head\nD: wipe face", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_125_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_125_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_125_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_125_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_125_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_125_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_125_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_125_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_125_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_125_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_125_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_125_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: drink water\nB: write notes\nC: play guitar\nD: tie shoes", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drink water\nB: write notes\nC: play guitar\nD: tie shoes", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_126_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_126_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_126_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_126_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_126_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_126_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_126_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_126_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_126_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_126_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_126_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_126_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: make a phone call\nB: write a letter\nC: tie shoes\nD: brush teeth", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: make a phone call\nB: write a letter\nC: tie shoes\nD: brush teeth", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_127_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_127_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_127_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_127_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_127_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_127_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_127_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_127_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_127_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_127_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_127_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_127_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: reading a book\nB: playing basketball\nC: brushing teeth\nD: riding a bicycle", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: reading a book\nB: playing basketball\nC: brushing teeth\nD: riding a bicycle", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_128_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_128_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_128_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_128_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_128_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_128_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_128_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_128_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_128_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_128_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_128_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_128_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: raise hand (question)\nB: touch chest (stomachache\nC: sit down (rest)\nD: step forward (walk)", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: raise hand (question)\nB: touch chest (stomachache\nC: sit down (rest)\nD: step forward (walk)", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_129_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_129_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_129_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_129_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_129_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_129_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_129_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_129_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_129_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_129_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_129_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_129_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: talk on phone\nB: pick up object\nC: tie shoelaces\nD: wipe face", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: talk on phone\nB: pick up object\nC: tie shoelaces\nD: wipe face", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_130_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_130_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_130_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_130_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_130_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_130_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_130_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_130_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_130_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_130_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_130_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_130_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: jump (exercise)\nB: touch chest (stomachache\nC: sit down (rest)\nD: wave hand (greeting)", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump (exercise)\nB: touch chest (stomachache\nC: sit down (rest)\nD: wave hand (greeting)", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_131_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_131_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_131_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_131_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_131_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_131_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_131_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_131_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_131_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_131_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_131_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_131_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: lying down\nB: sitting down\nC: standing up\nD: running", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: lying down\nB: sitting down\nC: standing up\nD: running", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_132_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_132_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_132_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_132_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_132_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_132_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_132_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_132_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_132_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_132_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_132_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_132_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: jumping\nB: sitting\nC: pickup\nD: running", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jumping\nB: sitting\nC: pickup\nD: running", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_133_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_133_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_133_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_133_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_133_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_133_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_133_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_133_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_133_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_133_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_133_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_133_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sit\nB: jump\nC: sleep\nD: pickup", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit\nB: jump\nC: sleep\nD: pickup", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_134_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_134_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_134_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_134_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_134_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_134_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_134_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_134_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_134_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_134_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_134_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_134_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: eat meal\nB: play guitar\nC: write letter\nD: read book", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat meal\nB: play guitar\nC: write letter\nD: read book", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_135_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_135_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_135_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_135_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_135_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_135_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_135_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_135_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_135_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_135_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_135_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_135_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: eat meal\nB: play basketball\nC: walk dog\nD: sleep", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat meal\nB: play basketball\nC: walk dog\nD: sleep", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_136_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_136_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_136_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_136_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_136_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_136_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_136_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_136_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_136_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_136_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_136_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_136_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: dance\nB: read book\nC: play tennis\nD: eat meal", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: dance\nB: read book\nC: play tennis\nD: eat meal", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_137_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_137_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_137_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_137_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_137_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_137_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_137_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_137_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_137_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_137_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_137_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_137_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: playing piano\nB: brushing teeth\nC: riding a bike\nD: cooking dinner", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: playing piano\nB: brushing teeth\nC: riding a bike\nD: cooking dinner", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_138_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_138_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_138_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_138_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_138_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_138_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_138_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_138_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_138_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_138_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_138_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_138_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: run\nB: pickup\nC: jump\nD: sit down", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: run\nB: pickup\nC: jump\nD: sit down", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_139_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_139_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_139_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_139_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_139_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_139_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_139_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_139_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_139_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_139_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_139_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_139_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sit\nB: pickup\nC: jump\nD: run", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit\nB: pickup\nC: jump\nD: run", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_140_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_140_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_140_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_140_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_140_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_140_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_140_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_140_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_140_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_140_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_140_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_140_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: wave a hand\nB: tie a shoe\nC: kick a ball\nD: drink water", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave a hand\nB: tie a shoe\nC: kick a ball\nD: drink water", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_141_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_141_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_141_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_141_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_141_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_141_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_141_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_141_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_141_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_141_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_141_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_141_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: drop\nB: jump\nC: sit\nD: turn", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drop\nB: jump\nC: sit\nD: turn", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_142_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_142_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_142_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_142_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_142_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_142_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_142_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_142_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_142_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_142_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_142_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_142_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: pickup\nB: jump\nC: sit\nD: run", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: pickup\nB: jump\nC: sit\nD: run", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_143_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_143_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_143_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_143_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_143_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_143_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_143_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_143_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_143_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_143_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_143_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_143_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: make a phone call\nB: play a guitar\nC: cook a meal\nD: paint a picture", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: make a phone call\nB: play a guitar\nC: cook a meal\nD: paint a picture", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_144_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_144_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_144_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_144_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_144_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_144_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_144_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_144_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_144_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_144_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_144_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_144_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: check time (from watch)\nB: tie shoelaces\nC: eat an apple\nD: play guitar", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: check time (from watch)\nB: tie shoelaces\nC: eat an apple\nD: play guitar", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_145_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_145_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_145_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_145_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_145_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_145_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_145_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_145_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_145_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_145_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_145_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_145_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: running a marathon\nB: cooking dinner\nC: playing a guitar\nD: brushing teeth", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: running a marathon\nB: cooking dinner\nC: playing a guitar\nD: brushing teeth", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_146_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_146_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_146_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_146_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_146_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_146_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_146_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_146_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_146_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_146_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_146_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_146_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: make a phone call\nB: read a book\nC: ride a bicycle\nD: play a guitar", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: make a phone call\nB: read a book\nC: ride a bicycle\nD: play a guitar", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_147_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_147_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_147_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_147_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_147_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_147_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_147_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_147_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_147_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_147_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_147_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_147_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sit down\nB: wave\nC: put on a hat\nD: take off a hat", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit down\nB: wave\nC: put on a hat\nD: take off a hat", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_148_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_148_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_148_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_148_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_148_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_148_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_148_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_148_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_148_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_148_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_148_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_148_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: put on a hat\nB: tie shoes\nC: lift weights\nD: take off a hat", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: put on a hat\nB: tie shoes\nC: lift weights\nD: take off a hat", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_149_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_149_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_149_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_149_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_149_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_149_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_149_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_149_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_149_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_149_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_149_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_149_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: adjust a tie\nB: take off a hat\nC: put on glasses\nD: put on a hat", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: adjust a tie\nB: take off a hat\nC: put on glasses\nD: put on a hat", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_150_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_150_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_150_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_150_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_150_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_150_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_150_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_150_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_150_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_150_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_150_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_150_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: raise hand (greeting)\nB: jump (excited)\nC: touch chest (stomachache\nD: sit down (tired)", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: raise hand (greeting)\nB: jump (excited)\nC: touch chest (stomachache\nD: sit down (tired)", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_151_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_151_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_151_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_151_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_151_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_151_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_151_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_151_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_151_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_151_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_151_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_151_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: raise hand\nB: touch chest (stomachache\nC: jump in place\nD: bend forward", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: raise hand\nB: touch chest (stomachache\nC: jump in place\nD: bend forward", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_152_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_152_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_152_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_152_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_152_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_152_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_152_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_152_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_152_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_152_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_152_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_152_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: ride bike\nB: read book\nC: play guitar\nD: eat meal", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride bike\nB: read book\nC: play guitar\nD: eat meal", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_153_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_153_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_153_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_153_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_153_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_153_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_153_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_153_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_153_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_153_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_153_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_153_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: eat meal\nB: play guitar\nC: read book\nD: ride bicycle", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat meal\nB: play guitar\nC: read book\nD: ride bicycle", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_154_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_154_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_154_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_154_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_154_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_154_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_154_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_154_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_154_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_154_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_154_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_154_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sleep\nB: read book\nC: eat meal\nD: run", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sleep\nB: read book\nC: eat meal\nD: run", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_155_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_155_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_155_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_155_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_155_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_155_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_155_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_155_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_155_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_155_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_155_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_155_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: play guitar\nB: drink water\nC: jump rope\nD: read a book", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play guitar\nB: drink water\nC: jump rope\nD: read a book", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_156_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_156_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_156_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_156_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_156_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_156_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_156_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_156_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_156_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_156_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_156_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_156_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: drop\nB: spin\nC: run\nD: jump", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drop\nB: spin\nC: run\nD: jump", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_157_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_157_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_157_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_157_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_157_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_157_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_157_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_157_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_157_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_157_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_157_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_157_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: ride a bicycle\nB: play a guitar\nC: put on a hat\nD: write on a board", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride a bicycle\nB: play a guitar\nC: put on a hat\nD: write on a board", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_158_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_158_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_158_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_158_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_158_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_158_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_158_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_158_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_158_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_158_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_158_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_158_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: clap hands\nB: tie shoelaces\nC: touch back (backache)\nD: jump rope", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: clap hands\nB: tie shoelaces\nC: touch back (backache)\nD: jump rope", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_159_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_159_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_159_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_159_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_159_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_159_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_159_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_159_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_159_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_159_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_159_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_159_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: tie shoelaces\nB: drink water\nC: read a book\nD: play guitar", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoelaces\nB: drink water\nC: read a book\nD: play guitar", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_160_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_160_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_160_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_160_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_160_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_160_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_160_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_160_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_160_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_160_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_160_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_160_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sitting\nB: touch chest (stomachache\nC: jumping\nD: waving", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting\nB: touch chest (stomachache\nC: jumping\nD: waving", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_161_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_161_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_161_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_161_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_161_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_161_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_161_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_161_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_161_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_161_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_161_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_161_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: play tennis\nB: read a book\nC: eat meal\nD: ride a bike", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play tennis\nB: read a book\nC: eat meal\nD: ride a bike", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_162_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_162_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_162_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_162_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_162_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_162_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_162_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_162_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_162_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_162_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_162_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_162_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: lying down\nB: running\nC: sitting down\nD: standing up", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: lying down\nB: running\nC: sitting down\nD: standing up", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_163_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_163_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_163_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_163_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_163_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_163_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_163_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_163_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_163_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_163_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_163_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_163_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: jump\nB: run\nC: sit\nD: drop", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump\nB: run\nC: sit\nD: drop", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_164_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_164_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_164_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_164_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_164_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_164_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_164_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_164_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_164_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_164_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_164_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_164_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: pick up object\nB: wipe face\nC: tie shoes\nD: jump rope", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: pick up object\nB: wipe face\nC: tie shoes\nD: jump rope", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_165_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_165_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_165_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_165_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_165_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_165_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_165_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_165_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_165_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_165_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_165_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_165_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sit\nB: run\nC: jump\nD: bow", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit\nB: run\nC: jump\nD: bow", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_166_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_166_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_166_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_166_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_166_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_166_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_166_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_166_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_166_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_166_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_166_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_166_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: remove shoes\nB: wear jacket\nC: sit down\nD: drink water", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: remove shoes\nB: wear jacket\nC: sit down\nD: drink water", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_167_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_167_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_167_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_167_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_167_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_167_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_167_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_167_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_167_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_167_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_167_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_167_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: run in place\nB: wave hand\nC: touch chest (stomachache\nD: jump up", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: run in place\nB: wave hand\nC: touch chest (stomachache\nD: jump up", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_168_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_168_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_168_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_168_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_168_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_168_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_168_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_168_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_168_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_168_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_168_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_168_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: tie shoelaces\nB: open a door\nC: brush teeth\nD: check time (from watch)", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoelaces\nB: open a door\nC: brush teeth\nD: check time (from watch)", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_169_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_169_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_169_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_169_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_169_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_169_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_169_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_169_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_169_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_169_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_169_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_169_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: jump rope\nB: play guitar\nC: wipe face\nD: tie shoelaces", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump rope\nB: play guitar\nC: wipe face\nD: tie shoelaces", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_170_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_170_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_170_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_170_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_170_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_170_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_170_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_170_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_170_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_170_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_170_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_170_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sitting down\nB: running\nC: lying down\nD: standing up", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: running\nC: lying down\nD: standing up", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_171_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_171_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_171_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_171_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_171_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_171_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_171_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_171_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_171_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_171_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_171_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_171_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: button a shirt\nB: take off a hat\nC: put on a hat\nD: tie a shoe", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: button a shirt\nB: take off a hat\nC: put on a hat\nD: tie a shoe", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_172_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_172_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_172_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_172_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_172_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_172_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_172_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_172_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_172_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_172_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_172_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_172_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: ride bicycle\nB: play piano\nC: wear jacket\nD: eat apple", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride bicycle\nB: play piano\nC: wear jacket\nD: eat apple", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_173_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_173_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_173_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_173_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_173_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_173_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_173_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_173_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_173_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_173_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_173_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_173_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: brushing teeth\nB: jogging\nC: reading a book\nD: cooking", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: brushing teeth\nB: jogging\nC: reading a book\nD: cooking", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_174_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_174_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_174_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_174_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_174_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_174_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_174_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_174_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_174_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_174_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_174_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_174_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: tie shoelaces\nB: pick up bag\nC: clap hands\nD: check time (from watch)", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoelaces\nB: pick up bag\nC: clap hands\nD: check time (from watch)", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_175_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_175_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_175_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_175_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_175_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_175_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_175_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_175_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_175_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_175_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_175_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_175_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: wave hand\nB: touch back (backache)\nC: eat food\nD: jump", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave hand\nB: touch back (backache)\nC: eat food\nD: jump", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_176_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_176_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_176_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_176_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_176_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_176_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_176_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_176_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_176_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_176_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_176_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_176_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: running\nB: reading\nC: dancing\nD: cooking", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: running\nB: reading\nC: dancing\nD: cooking", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_177_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_177_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_177_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_177_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_177_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_177_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_177_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_177_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_177_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_177_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_177_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_177_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: play guitar\nB: eat meal\nC: dance\nD: read book", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play guitar\nB: eat meal\nC: dance\nD: read book", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_178_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_178_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_178_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_178_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_178_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_178_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_178_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_178_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_178_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_178_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_178_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_178_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: tie shoelaces\nB: make a phone call\nC: brush teeth\nD: write in a notebook", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoelaces\nB: make a phone call\nC: brush teeth\nD: write in a notebook", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_179_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_179_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_179_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_179_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_179_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_179_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_179_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_179_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_179_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_179_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_179_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_179_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: check time (from watch)\nB: tie shoes\nC: take a photo\nD: read a book", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: check time (from watch)\nB: tie shoes\nC: take a photo\nD: read a book", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_180_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_180_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_180_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_180_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_180_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_180_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_180_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_180_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_180_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_180_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_180_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_180_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: eat sandwich\nB: wear jacket\nC: play piano\nD: ride bicycle", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat sandwich\nB: wear jacket\nC: play piano\nD: ride bicycle", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_181_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_181_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_181_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_181_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_181_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_181_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_181_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_181_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_181_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_181_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_181_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_181_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: brushing teeth\nB: cooking\nC: jogging\nD: reading a book", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: brushing teeth\nB: cooking\nC: jogging\nD: reading a book", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_182_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_182_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_182_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_182_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_182_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_182_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_182_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_182_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_182_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_182_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_182_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_182_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: pick up phone\nB: tie shoe\nC: adjust glasses\nD: check time (from watch)", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: pick up phone\nB: tie shoe\nC: adjust glasses\nD: check time (from watch)", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_183_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_183_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_183_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_183_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_183_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_183_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_183_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_183_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_183_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_183_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_183_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_183_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: jumping\nB: sitting down\nC: dancing\nD: running", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jumping\nB: sitting down\nC: dancing\nD: running", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_184_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_184_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_184_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_184_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_184_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_184_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_184_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_184_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_184_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_184_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_184_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_184_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: ride bike\nB: play guitar\nC: wear jacket\nD: eat food", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride bike\nB: play guitar\nC: wear jacket\nD: eat food", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_185_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_185_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_185_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_185_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_185_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_185_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_185_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_185_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_185_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_185_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_185_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_185_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: eat a sandwich\nB: sit down\nC: play a guitar\nD: put on a hat", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat a sandwich\nB: sit down\nC: play a guitar\nD: put on a hat", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_186_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_186_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_186_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_186_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_186_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_186_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_186_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_186_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_186_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_186_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_186_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_186_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: jump\nB: drop\nC: run\nD: climb", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump\nB: drop\nC: run\nD: climb", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_187_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_187_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_187_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_187_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_187_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_187_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_187_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_187_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_187_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_187_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_187_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_187_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: bow\nB: run\nC: sit\nD: jump", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: bow\nB: run\nC: sit\nD: jump", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_188_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_188_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_188_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_188_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_188_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_188_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_188_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_188_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_188_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_188_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_188_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_188_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: reading\nB: dancing\nC: sleeping\nD: running", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: reading\nB: dancing\nC: sleeping\nD: running", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_189_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_189_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_189_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_189_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_189_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_189_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_189_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_189_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_189_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_189_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_189_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_189_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: wave\nB: jump\nC: bow\nD: sit", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave\nB: jump\nC: bow\nD: sit", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_190_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_190_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_190_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_190_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_190_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_190_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_190_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_190_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_190_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_190_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_190_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_190_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: standing up\nB: sitting down\nC: jumping\nD: running", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: standing up\nB: sitting down\nC: jumping\nD: running", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_191_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_191_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_191_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_191_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_191_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_191_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_191_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_191_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_191_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_191_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_191_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_191_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: tie shoelaces\nB: play guitar\nC: read a book\nD: drink water", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoelaces\nB: play guitar\nC: read a book\nD: drink water", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_192_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_192_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_192_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_192_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_192_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_192_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_192_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_192_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_192_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_192_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_192_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_192_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: running\nB: cooking\nC: reading\nD: dancing", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: running\nB: cooking\nC: reading\nD: dancing", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_193_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_193_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_193_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_193_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_193_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_193_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_193_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_193_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_193_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_193_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_193_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_193_11.png" + ], + "output": "C" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: play guitar\nB: check time (from watch)\nC: tie shoelaces\nD: eat sandwich", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play guitar\nB: check time (from watch)\nC: tie shoelaces\nD: eat sandwich", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_194_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_194_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_194_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_194_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_194_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_194_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_194_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_194_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_194_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_194_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_194_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_194_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: make a phone call\nB: read a book\nC: ride a bicycle\nD: play a guitar", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: make a phone call\nB: read a book\nC: ride a bicycle\nD: play a guitar", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_195_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_195_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_195_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_195_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_195_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_195_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_195_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_195_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_195_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_195_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_195_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_195_11.png" + ], + "output": "A" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: tie shoes\nB: eat a sandwich\nC: read a book\nD: put on a hat", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoes\nB: eat a sandwich\nC: read a book\nD: put on a hat", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_196_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_196_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_196_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_196_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_196_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_196_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_196_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_196_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_196_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_196_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_196_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_196_11.png" + ], + "output": "D" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: sitting down\nB: standing up\nC: lying down\nD: jumping", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: standing up\nC: lying down\nD: jumping", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_197_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_197_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_197_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_197_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_197_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_197_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_197_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_197_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_197_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_197_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_197_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_197_11.png" + ], + "output": "B" + }, + { + "task": "Multiview_Action_Recognition", + "visual_input_component": "natural image", + "source": "PKUMMD", + "options": "A: play a guitar\nB: make a phone call\nC: tie a shoelace\nD: ride a bicycle", + "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", + "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play a guitar\nB: make a phone call\nC: tie a shoelace\nD: ride a bicycle", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_198_0.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_198_1.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_198_2.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_198_3.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_198_4.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_198_5.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_198_6.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_198_7.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_198_8.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_198_9.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_198_10.png", + "../MMIU-Benchmark/Multiview_Action_Recognition/Multiview_Action_Recognition_198_11.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[755.199, 1687.366, 0.912], [762.788, 1426.72, 1.06], [630.862, 1571.41, 1.003], [798.666, 1466.0, 0.68]]\nB: [[752.983, 1266.122, 0.837], [675.965, 1325.79, 0.95], [756.034, 1628.64, 0.801], [696.028, 1386.4, 0.67]]\nC: [[753.288, 1465.266, 0.978], [728.298, 1787.05, 0.81], [812.921, 1600.32, 0.911], [834.531, 1762.1, 0.91]]\nD: [[705.473, 1565.779, 0.995], [702.703, 1568.02, 0.92], [699.933, 1570.26, 0.845], [697.471, 1572.4, 0.77]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of 
the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[755.199, 1687.366, 0.912], [762.788, 1426.72, 1.06], [630.862, 1571.41, 1.003], [798.666, 1466.0, 0.68]]\nB: [[752.983, 1266.122, 0.837], [675.965, 1325.79, 0.95], [756.034, 1628.64, 0.801], [696.028, 1386.4, 0.67]]\nC: [[753.288, 1465.266, 0.978], [728.298, 1787.05, 0.81], [812.921, 1600.32, 0.911], [834.531, 1762.1, 0.91]]\nD: [[705.473, 1565.779, 0.995], [702.703, 1568.02, 0.92], [699.933, 1570.26, 0.845], [697.471, 1572.4, 0.77]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_0_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_0_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_0_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_0_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_0_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_0_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_0_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_0_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1779.824, 2603.51, 0.357], [1779.617, 2603.65, 0.307], [1779.419, 2603.795, 0.441], [1779.221, 2603.94, 0.574]]\nB: [[1820.656, 2604.08, 0.355], [1608.069, 2300.22, 0.346], [1590.874, 2776.0, 0.366], [1586.173, 2790.75, 0.602]]\nC: 
[[2053.203, 2562.85, 0.348], [1922.673, 2150.26, 0.297], [1762.465, 2275.213, 0.516], [1794.318, 2966.29, 0.652]]\nD: [[1676.53, 2378.45, 0.304], [1630.8, 2506.41, 0.34], [1460.959, 2537.73, 0.431], [1807.291, 2750.98, 0.686]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1779.824, 2603.51, 0.357], [1779.617, 2603.65, 0.307], [1779.419, 2603.795, 0.441], [1779.221, 2603.94, 0.574]]\nB: [[1820.656, 2604.08, 0.355], [1608.069, 2300.22, 0.346], [1590.874, 2776.0, 0.366], [1586.173, 2790.75, 0.602]]\nC: [[2053.203, 2562.85, 0.348], [1922.673, 2150.26, 0.297], [1762.465, 2275.213, 0.516], [1794.318, 2966.29, 0.652]]\nD: [[1676.53, 2378.45, 0.304], [1630.8, 2506.41, 0.34], [1460.959, 2537.73, 0.431], [1807.291, 2750.98, 0.686]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_1_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_1_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_1_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_1_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_1_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_1_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_1_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_1_7.png" + ], + "output": "A" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[648.721, 1650.064, 0.332], [648.899, 1649.775, 0.623], [649.829, 1649.485, 1.045], [649.829, 1649.485, 1.07]]\nB: [[652.771, 1330.238, 0.27], [755.559, 1907.786, 0.731], [646.182, 1892.589, 1.216], [597.495, 1779.123, 0.96]]\nC: [[699.141, 1374.83, 0.288], [751.036, 1823.862, 0.739], [640.56, 1789.673, 1.201], [595.069, 1390.425, 1.03]]\nD: [[747.646, 1793.494, 0.307], [651.728, 1395.546, 0.51], [557.034, 1729.201, 1.22], [743.254, 1745.25, 1.28]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[648.721, 1650.064, 0.332], [648.899, 1649.775, 0.623], [649.829, 1649.485, 1.045], [649.829, 1649.485, 1.07]]\nB: [[652.771, 1330.238, 0.27], [755.559, 1907.786, 0.731], [646.182, 1892.589, 1.216], [597.495, 1779.123, 0.96]]\nC: [[699.141, 1374.83, 0.288], [751.036, 1823.862, 0.739], [640.56, 1789.673, 1.201], [595.069, 1390.425, 1.03]]\nD: [[747.646, 1793.494, 0.307], [651.728, 1395.546, 0.51], [557.034, 1729.201, 1.22], [743.254, 1745.25, 1.28]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_2_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_2_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_2_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_2_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_2_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_2_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_2_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_2_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[372.341, 646.643, 0.41], [323.457, 728.14, 0.355], [328.402, 680.116, 0.356], [304.89, 638.729, 0.37]]\nB: [[374.71, 547.041, 0.452], [266.865, 747.941, 0.359], [360.504, 710.201, 0.414], [289.281, 637.508, 0.34]]\nC: [[324.105, 664.423, 0.389], [324.125, 664.423, 0.395], [324.145, 664.423, 0.402], [324.165, 664.423, 0.409]]\nD: [[382.975, 542.454, 0.448], [273.435, 575.926, 0.36], [306.415, 582.477, 0.37], [367.698, 624.849, 0.412]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[372.341, 646.643, 0.41], [323.457, 728.14, 0.355], [328.402, 680.116, 0.356], [304.89, 638.729, 0.37]]\nB: [[374.71, 547.041, 0.452], [266.865, 747.941, 0.359], [360.504, 710.201, 0.414], [289.281, 637.508, 0.34]]\nC: [[324.105, 664.423, 0.389], [324.125, 664.423, 0.395], [324.145, 664.423, 0.402], [324.165, 664.423, 0.409]]\nD: [[382.975, 542.454, 0.448], [273.435, 575.926, 0.36], [306.415, 582.477, 0.37], [367.698, 624.849, 0.412]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_3_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_3_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_3_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_3_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_3_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_3_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_3_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_3_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[319.582, 1213.1, 0.433], [414.088, 1032.0, 0.628], [421.328, 1137.51, 0.496], [344.955, 1253.44, 0.638]]\nB: [[363.433, 1098.33, 0.529], [363.433, 1098.33, 0.564], [363.433, 1098.33, 0.599], [363.433, 1098.33, 0.634]]\nC: [[310.015, 1243.97, 0.462], [343.153, 1122.0, 
0.606], [333.209, 1019.58, 0.517], [431.855, 1307.51, 0.556]]\nD: [[300.468, 996.48, 0.537], [331.062, 1300.52, 0.537], [400.879, 1176.8, 0.602], [389.732, 1170.04, 0.637]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[319.582, 1213.1, 0.433], [414.088, 1032.0, 0.628], [421.328, 1137.51, 0.496], [344.955, 1253.44, 0.638]]\nB: [[363.433, 1098.33, 0.529], [363.433, 1098.33, 0.564], [363.433, 1098.33, 0.599], [363.433, 1098.33, 0.634]]\nC: [[310.015, 1243.97, 0.462], [343.153, 1122.0, 0.606], [333.209, 1019.58, 0.517], [431.855, 1307.51, 0.556]]\nD: [[300.468, 996.48, 0.537], [331.062, 1300.52, 0.537], [400.879, 1176.8, 0.602], [389.732, 1170.04, 0.637]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_4_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_4_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_4_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_4_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_4_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_4_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_4_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_4_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural 
image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[393.191, 899.659, 0.591], [332.44, 1277.512, 0.54], [378.779, 1199.743, 0.483], [388.415, 1186.22, 0.761]]\nB: [[373.967, 1296.428, 0.56], [468.08, 1301.812, 0.52], [423.341, 1242.289, 0.478], [463.453, 1026.04, 0.769]]\nC: [[396.335, 1122.142, 0.513], [395.62, 1122.119, 0.55], [394.907, 1122.104, 0.586], [392.701, 1122.16, 0.734]]\nD: [[366.604, 1119.109, 0.592], [355.44, 1130.172, 0.57], [469.284, 957.093, 0.569], [384.2, 1040.44, 0.813]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[393.191, 899.659, 0.591], [332.44, 1277.512, 0.54], [378.779, 1199.743, 0.483], [388.415, 1186.22, 0.761]]\nB: [[373.967, 1296.428, 0.56], [468.08, 1301.812, 0.52], [423.341, 1242.289, 0.478], [463.453, 1026.04, 0.769]]\nC: [[396.335, 1122.142, 0.513], [395.62, 1122.119, 0.55], [394.907, 1122.104, 0.586], [392.701, 1122.16, 0.734]]\nD: [[366.604, 1119.109, 0.592], [355.44, 1130.172, 0.57], [469.284, 957.093, 0.569], [384.2, 1040.44, 0.813]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_5_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_5_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_5_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_5_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_5_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_5_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_5_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_5_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1912.796, 2415.138, 0.226], [1612.76, 2000.136, 0.469], [1650.05, 2082.715, 0.705], [1870.661, 2666.852, 0.889]]\nB: [[2044.921, 2427.821, 0.251], [2197.918, 2811.408, 0.435], [1594.209, 2091.568, 0.541], [1595.884, 2911.557, 0.739]]\nC: [[1855.648, 2492.891, 0.267], [1855.098, 2493.555, 0.467], [1854.597, 2494.197, 0.634], [1854.096, 2494.841, 0.801]]\nD: [[1651.93, 2405.938, 0.246], [2153.625, 2215.89, 0.442], [1530.771, 2046.654, 0.746], [2201.19, 2084.755, 0.722]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1912.796, 2415.138, 0.226], [1612.76, 2000.136, 0.469], [1650.05, 2082.715, 0.705], [1870.661, 2666.852, 0.889]]\nB: [[2044.921, 2427.821, 0.251], [2197.918, 2811.408, 0.435], [1594.209, 2091.568, 0.541], [1595.884, 2911.557, 0.739]]\nC: [[1855.648, 2492.891, 0.267], [1855.098, 2493.555, 0.467], [1854.597, 2494.197, 0.634], [1854.096, 2494.841, 0.801]]\nD: [[1651.93, 2405.938, 0.246], [2153.625, 2215.89, 0.442], [1530.771, 2046.654, 0.746], [2201.19, 2084.755, 0.722]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_6_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_6_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_6_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_6_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_6_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_6_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_6_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_6_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1835.457, 2530.979, -0.6], [1831.738, 2535.381, -0.475], [1828.016, 2539.789, -0.35], [1823.826, 2544.548, -0.226]]\nB: [[1728.159, 2657.767, -0.6], [1671.146, 2191.293, -0.456], [1889.85, 2711.258, -0.39], [1500.543, 2142.17, 
-0.266]]\nC: [[1868.34, 2656.949, -0.6], [1847.319, 3027.849, -0.442], [1621.372, 2206.666, -0.29], [1944.205, 2824.5, -0.259]]\nD: [[1798.206, 2853.486, -0.5], [1737.945, 2982.299, -0.415], [1782.37, 2464.903, -0.33], [2009.484, 2271.222, -0.188]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1835.457, 2530.979, -0.6], [1831.738, 2535.381, -0.475], [1828.016, 2539.789, -0.35], [1823.826, 2544.548, -0.226]]\nB: [[1728.159, 2657.767, -0.6], [1671.146, 2191.293, -0.456], [1889.85, 2711.258, -0.39], [1500.543, 2142.17, -0.266]]\nC: [[1868.34, 2656.949, -0.6], [1847.319, 3027.849, -0.442], [1621.372, 2206.666, -0.29], [1944.205, 2824.5, -0.259]]\nD: [[1798.206, 2853.486, -0.5], [1737.945, 2982.299, -0.415], [1782.37, 2464.903, -0.33], [2009.484, 2271.222, -0.188]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_7_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_7_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_7_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_7_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_7_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_7_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_7_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_7_7.png" + ], + "output": 
"A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[2091.918, 2820.157, -0.699], [2085.742, 2471.429, -0.7], [1560.979, 2272.985, -0.534], [1615.9, 2272.87, -0.391]]\nB: [[1807.911, 2559.964, -0.854], [1804.558, 2563.859, -0.725], [1801.201, 2567.758, -0.596], [1797.7, 2572.03, -0.433]]\nC: [[2128.41, 2627.282, -0.79], [1547.739, 2837.704, -0.791], [1686.195, 2104.816, -0.492], [1645.0, 2561.72, -0.364]]\nD: [[1649.251, 2758.133, -0.686], [1533.206, 2890.142, -0.825], [2007.154, 2531.762, -0.478], [2127.3, 2070.45, -0.347]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[2091.918, 2820.157, -0.699], [2085.742, 2471.429, -0.7], [1560.979, 2272.985, -0.534], [1615.9, 2272.87, -0.391]]\nB: [[1807.911, 2559.964, -0.854], [1804.558, 2563.859, -0.725], [1801.201, 2567.758, -0.596], [1797.7, 2572.03, -0.433]]\nC: [[2128.41, 2627.282, -0.79], [1547.739, 2837.704, -0.791], [1686.195, 2104.816, -0.492], [1645.0, 2561.72, -0.364]]\nD: [[1649.251, 2758.133, -0.686], [1533.206, 2890.142, -0.825], [2007.154, 2531.762, -0.478], [2127.3, 2070.45, -0.347]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_8_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_8_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_8_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_8_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_8_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_8_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_8_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_8_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[437.202, 1086.964, 0.692], [437.221, 1087.01, 0.817], [437.244, 1087.066, 0.842], [437.244, 1087.066, 0.842]]\nB: [[357.432, 1159.623, 0.607], [351.412, 1296.28, 0.836], [516.977, 1219.588, 0.769], [425.277, 1005.318, 0.772]]\nC: [[520.991, 1274.564, 0.812], [478.068, 1065.93, 0.705], [398.533, 912.914, 0.73], [470.356, 1123.201, 0.712]]\nD: [[377.562, 951.154, 0.715], [472.017, 932.55, 0.727], [361.039, 1097.241, 0.701], [508.246, 1284.882, 0.804]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[437.202, 1086.964, 0.692], [437.221, 1087.01, 0.817], [437.244, 1087.066, 0.842], [437.244, 1087.066, 0.842]]\nB: [[357.432, 1159.623, 0.607], [351.412, 1296.28, 0.836], [516.977, 1219.588, 0.769], [425.277, 1005.318, 0.772]]\nC: [[520.991, 1274.564, 0.812], [478.068, 1065.93, 0.705], [398.533, 912.914, 0.73], [470.356, 1123.201, 0.712]]\nD: [[377.562, 951.154, 0.715], [472.017, 932.55, 0.727], [361.039, 1097.241, 0.701], [508.246, 1284.882, 0.804]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_9_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_9_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_9_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_9_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_9_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_9_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_9_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_9_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[506.527, 1099.076, 0.589], [420.185, 1298.755, 0.489], [362.914, 1033.947, 0.414], [396.859, 1002.99, 0.306]]\nB: [[424.014, 1100.606, 0.706], [424.133, 1100.728, 0.496], [424.173, 1100.769, 0.426], [424.212, 1100.81, 0.306]]\nC: [[456.889, 932.553, 
0.793], [391.51, 1069.937, 0.527], [431.845, 933.545, 0.5], [394.898, 1320.05, 0.264]]\nD: [[378.115, 1221.413, 0.672], [347.816, 1131.373, 0.529], [364.847, 1229.038, 0.466], [397.183, 1091.0, 0.25]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[506.527, 1099.076, 0.589], [420.185, 1298.755, 0.489], [362.914, 1033.947, 0.414], [396.859, 1002.99, 0.306]]\nB: [[424.014, 1100.606, 0.706], [424.133, 1100.728, 0.496], [424.173, 1100.769, 0.426], [424.212, 1100.81, 0.306]]\nC: [[456.889, 932.553, 0.793], [391.51, 1069.937, 0.527], [431.845, 933.545, 0.5], [394.898, 1320.05, 0.264]]\nD: [[378.115, 1221.413, 0.672], [347.816, 1131.373, 0.529], [364.847, 1229.038, 0.466], [397.183, 1091.0, 0.25]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_10_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_10_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_10_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_10_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_10_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_10_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_10_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_10_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[420.731, 1013.531, 0.8], [361.227, 1037.43, 0.587], [485.796, 1006.664, 0.647], [418.217, 1072.225, 0.587]]\nB: [[374.363, 1267.963, 0.71], [402.578, 1232.818, 0.668], [434.034, 921.569, 0.52], [421.208, 1297.52, 0.506]]\nC: [[425.982, 1091.597, 0.73], [425.994, 1091.597, 0.733], [426.028, 1091.597, 0.541], [426.039, 1091.597, 0.619]]\nD: [[468.986, 997.688, 0.61], [441.053, 1239.106, 0.742], [435.348, 1170.376, 0.513], [358.562, 1151.219, 0.672]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[420.731, 1013.531, 0.8], [361.227, 1037.43, 0.587], [485.796, 1006.664, 0.647], [418.217, 1072.225, 0.587]]\nB: [[374.363, 1267.963, 0.71], [402.578, 1232.818, 0.668], [434.034, 921.569, 0.52], [421.208, 1297.52, 0.506]]\nC: [[425.982, 1091.597, 0.73], [425.994, 1091.597, 0.733], [426.028, 1091.597, 0.541], [426.039, 1091.597, 0.619]]\nD: [[468.986, 997.688, 0.61], [441.053, 1239.106, 0.742], [435.348, 1170.376, 0.513], [358.562, 1151.219, 0.672]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_11_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_11_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_11_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_11_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_11_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_11_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_11_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_11_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[2016.542, 845.631, 1.13], [2028.77, 874.497, 1.083], [1971.835, 957.221, 1.15], [1681.888, 953.919, 1.139]]\nB: [[1978.335, 863.179, 0.943], [1978.33, 863.187, 1.065], [1978.325, 863.194, 1.015], [1978.319, 863.201, 0.965]]\nC: [[1640.806, 1002.654, 1.092], [2125.94, 982.727, 1.09], [1765.046, 957.217, 1.116], [2264.988, 900.054, 0.911]]\nD: [[1688.119, 734.16, 0.877], [1887.56, 864.137, 1.092], [2139.033, 980.382, 1.191], [1969.445, 813.79, 0.775]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[2016.542, 845.631, 1.13], [2028.77, 874.497, 1.083], [1971.835, 957.221, 1.15], [1681.888, 953.919, 1.139]]\nB: [[1978.335, 863.179, 0.943], [1978.33, 863.187, 1.065], [1978.325, 863.194, 1.015], [1978.319, 863.201, 0.965]]\nC: [[1640.806, 1002.654, 1.092], [2125.94, 982.727, 1.09], [1765.046, 957.217, 1.116], [2264.988, 900.054, 0.911]]\nD: [[1688.119, 734.16, 0.877], [1887.56, 864.137, 1.092], [2139.033, 980.382, 1.191], [1969.445, 813.79, 0.775]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_12_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_12_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_12_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_12_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_12_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_12_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_12_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_12_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[639.138, 1624.989, -0.086], [636.359, 1627.431, -0.053], [632.807, 1630.318, 0.08], [629.158, 1633.096, 0.314]]\nB: [[543.626, 1367.896, -0.075], [653.208, 1574.861, -0.054], [757.25, 1346.07, 0.08], [540.23, 1650.674, 0.362]]\nC: [[537.409, 
1426.609, -0.082], [626.472, 1686.779, -0.051], [691.803, 1387.102, 0.07], [744.081, 1369.746, 0.365]]\nD: [[557.32, 1516.073, -0.08], [526.841, 1596.276, -0.06], [611.464, 1793.408, 0.1], [674.543, 1593.857, 0.364]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[639.138, 1624.989, -0.086], [636.359, 1627.431, -0.053], [632.807, 1630.318, 0.08], [629.158, 1633.096, 0.314]]\nB: [[543.626, 1367.896, -0.075], [653.208, 1574.861, -0.054], [757.25, 1346.07, 0.08], [540.23, 1650.674, 0.362]]\nC: [[537.409, 1426.609, -0.082], [626.472, 1686.779, -0.051], [691.803, 1387.102, 0.07], [744.081, 1369.746, 0.365]]\nD: [[557.32, 1516.073, -0.08], [526.841, 1596.276, -0.06], [611.464, 1793.408, 0.1], [674.543, 1593.857, 0.364]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_13_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_13_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_13_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_13_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_13_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_13_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_13_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_13_7.png" + ], + "output": "A" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[407.887, 1163.323, 0.511], [407.929, 1163.41, 0.511], [407.934, 1163.409, 0.524], [407.951, 1163.403, 0.537]]\nB: [[388.853, 1125.736, 0.56], [434.747, 1231.09, 0.419], [348.138, 1361.198, 0.597], [328.283, 1154.348, 0.58]]\nC: [[374.741, 1227.419, 0.46], [461.986, 1151.55, 0.428], [486.887, 1127.556, 0.491], [354.147, 1359.889, 0.505]]\nD: [[471.139, 1113.037, 0.544], [333.263, 956.23, 0.501], [355.318, 1217.053, 0.538], [456.915, 1087.324, 0.512]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[407.887, 1163.323, 0.511], [407.929, 1163.41, 0.511], [407.934, 1163.409, 0.524], [407.951, 1163.403, 0.537]]\nB: [[388.853, 1125.736, 0.56], [434.747, 1231.09, 0.419], [348.138, 1361.198, 0.597], [328.283, 1154.348, 0.58]]\nC: [[374.741, 1227.419, 0.46], [461.986, 1151.55, 0.428], [486.887, 1127.556, 0.491], [354.147, 1359.889, 0.505]]\nD: [[471.139, 1113.037, 0.544], [333.263, 956.23, 0.501], [355.318, 1217.053, 0.538], [456.915, 1087.324, 0.512]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_14_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_14_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_14_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_14_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_14_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_14_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_14_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_14_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1269.546, 1024.852, 1.042], [1269.744, 1025.178, 1.042], [1270.216, 1025.754, 0.992], [1270.837, 1026.506, 1.042]]\nB: [[1423.653, 1173.455, 1.097], [1300.351, 866.909, 0.934], [1179.097, 946.025, 1.104], [1411.454, 1138.532, 1.187]]\nC: [[1145.602, 896.06, 1.073], [1144.171, 966.324, 1.002], [1499.487, 1042.061, 0.91], [1482.233, 956.251, 1.138]]\nD: [[1137.684, 944.23, 0.905], [1316.46, 1218.835, 0.861], [1509.763, 1193.692, 1.048], [1361.774, 1108.409, 0.891]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1269.546, 1024.852, 1.042], [1269.744, 1025.178, 1.042], [1270.216, 1025.754, 0.992], [1270.837, 1026.506, 1.042]]\nB: [[1423.653, 1173.455, 1.097], [1300.351, 866.909, 0.934], [1179.097, 946.025, 1.104], [1411.454, 1138.532, 1.187]]\nC: [[1145.602, 896.06, 1.073], [1144.171, 966.324, 1.002], [1499.487, 1042.061, 0.91], [1482.233, 956.251, 1.138]]\nD: [[1137.684, 944.23, 0.905], [1316.46, 1218.835, 0.861], [1509.763, 1193.692, 1.048], [1361.774, 1108.409, 0.891]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_15_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_15_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_15_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_15_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_15_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_15_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_15_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_15_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1251.433, 1108.948, 0.433], [1176.759, 1115.714, 0.456], [1227.53, 991.616, 0.633], [1095.585, 1183.286, 0.618]]\nB: [[1509.989, 949.628, 0.539], [1350.384, 1212.22, 0.56], [1071.64, 893.308, 0.484], [1153.706, 1063.833, 0.645]]\nC: 
[[1298.993, 1034.258, 0.529], [1299.542, 1034.749, 0.554], [1300.09, 1035.239, 0.579], [1300.639, 1035.729, 0.604]]\nD: [[1378.947, 975.996, 0.598], [1493.813, 900.58, 0.493], [1370.14, 1033.836, 0.656], [1047.788, 1106.271, 0.659]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1251.433, 1108.948, 0.433], [1176.759, 1115.714, 0.456], [1227.53, 991.616, 0.633], [1095.585, 1183.286, 0.618]]\nB: [[1509.989, 949.628, 0.539], [1350.384, 1212.22, 0.56], [1071.64, 893.308, 0.484], [1153.706, 1063.833, 0.645]]\nC: [[1298.993, 1034.258, 0.529], [1299.542, 1034.749, 0.554], [1300.09, 1035.239, 0.579], [1300.639, 1035.729, 0.604]]\nD: [[1378.947, 975.996, 0.598], [1493.813, 900.58, 0.493], [1370.14, 1033.836, 0.656], [1047.788, 1106.271, 0.659]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_16_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_16_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_16_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_16_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_16_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_16_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_16_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_16_7.png" + ], + "output": "C" + }, + { + 
"task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[306.186, 763.667, 1.488], [378.546, 697.6, 1.528], [320.79, 550.53, 1.74], [377.634, 523.623, 1.596]]\nB: [[387.559, 726.167, 1.211], [356.987, 561.8, 1.228], [377.54, 655.25, 1.8], [372.07, 602.526, 1.352]]\nC: [[392.768, 743.908, 1.542], [292.481, 723.4, 1.31], [330.74, 682.85, 1.79], [283.31, 638.538, 1.433]]\nD: [[348.147, 646.209, 1.444], [348.144, 646.2, 1.482], [348.14, 646.19, 1.52], [348.137, 646.181, 1.559]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[306.186, 763.667, 1.488], [378.546, 697.6, 1.528], [320.79, 550.53, 1.74], [377.634, 523.623, 1.596]]\nB: [[387.559, 726.167, 1.211], [356.987, 561.8, 1.228], [377.54, 655.25, 1.8], [372.07, 602.526, 1.352]]\nC: [[392.768, 743.908, 1.542], [292.481, 723.4, 1.31], [330.74, 682.85, 1.79], [283.31, 638.538, 1.433]]\nD: [[348.147, 646.209, 1.444], [348.144, 646.2, 1.482], [348.14, 646.19, 1.52], [348.137, 646.181, 1.559]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_17_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_17_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_17_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_17_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_17_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_17_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_17_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_17_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1706.39, 1019.22, 0.455], [2191.986, 926.298, 0.316], [1675.02, 886.17, 0.299], [1941.62, 757.75, 0.341]]\nB: [[2247.24, 737.46, 0.384], [1527.442, 724.25, 0.347], [1575.02, 976.52, 0.327], [1630.08, 842.33, 0.316]]\nC: [[2075.96, 1012.24, 0.409], [1869.437, 795.581, 0.371], [2223.74, 1044.39, 0.397], [1567.73, 972.01, 0.379]]\nD: [[1895.77, 878.51, 0.433], [1895.672, 878.506, 0.338], [1895.77, 878.51, 0.343], [1895.77, 878.51, 0.393]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1706.39, 1019.22, 0.455], [2191.986, 926.298, 0.316], [1675.02, 886.17, 0.299], [1941.62, 757.75, 0.341]]\nB: [[2247.24, 737.46, 0.384], [1527.442, 724.25, 0.347], [1575.02, 976.52, 0.327], [1630.08, 842.33, 0.316]]\nC: [[2075.96, 1012.24, 0.409], [1869.437, 795.581, 0.371], [2223.74, 1044.39, 0.397], [1567.73, 972.01, 0.379]]\nD: [[1895.77, 878.51, 0.433], [1895.672, 878.506, 0.338], [1895.77, 878.51, 0.343], [1895.77, 878.51, 0.393]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_18_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_18_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_18_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_18_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_18_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_18_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_18_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_18_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[2234.916, 722.86, 0.39], [1901.638, 1017.1, 0.487], [1734.516, 780.849, 0.344], [1885.643, 867.521, 0.263]]\nB: [[1568.94, 897.301, 0.449], [2000.828, 702.741, 0.446], [1573.358, 1014.024, 0.477], [1578.275, 964.592, 0.265]]\nC: [[2141.663, 908.252, 0.394], 
[1802.749, 988.498, 0.349], [1873.147, 986.016, 0.413], [2189.02, 894.117, 0.265]]\nD: [[1895.727, 877.737, 0.418], [1895.727, 877.737, 0.418], [1895.727, 877.737, 0.418], [1895.716, 877.802, 0.292]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[2234.916, 722.86, 0.39], [1901.638, 1017.1, 0.487], [1734.516, 780.849, 0.344], [1885.643, 867.521, 0.263]]\nB: [[1568.94, 897.301, 0.449], [2000.828, 702.741, 0.446], [1573.358, 1014.024, 0.477], [1578.275, 964.592, 0.265]]\nC: [[2141.663, 908.252, 0.394], [1802.749, 988.498, 0.349], [1873.147, 986.016, 0.413], [2189.02, 894.117, 0.265]]\nD: [[1895.727, 877.737, 0.418], [1895.727, 877.737, 0.418], [1895.727, 877.737, 0.418], [1895.716, 877.802, 0.292]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_19_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_19_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_19_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_19_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_19_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_19_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_19_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_19_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[336.27, 647.992, 0.436], [346.74, 708.566, 0.649], [354.42, 746.112, 0.69], [376.74, 611.59, 0.61]]\nB: [[340.58, 661.842, 0.526], [340.58, 661.842, 0.576], [340.58, 661.842, 0.626], [340.58, 661.842, 0.676]]\nC: [[387.54, 767.29, 0.509], [330.38, 600.327, 0.526], [387.34, 562.731, 0.738], [287.65, 743.046, 0.73]]\nD: [[347.27, 591.306, 0.458], [329.15, 678.06, 0.571], [380.55, 710.329, 0.52], [408.38, 545.098, 0.802]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[336.27, 647.992, 0.436], [346.74, 708.566, 0.649], [354.42, 746.112, 0.69], [376.74, 611.59, 0.61]]\nB: [[340.58, 661.842, 0.526], [340.58, 661.842, 0.576], [340.58, 661.842, 0.626], [340.58, 661.842, 0.676]]\nC: [[387.54, 767.29, 0.509], [330.38, 600.327, 0.526], [387.34, 562.731, 0.738], [287.65, 743.046, 0.73]]\nD: [[347.27, 591.306, 0.458], [329.15, 678.06, 0.571], [380.55, 710.329, 0.52], [408.38, 545.098, 0.802]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_20_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_20_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_20_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_20_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_20_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_20_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_20_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_20_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[641.894, 1481.081, -0.116], [755.139, 1696.093, 0.085], [744.337, 1645.874, -0.021], [549.883, 1475.291, 0.091]]\nB: [[609.159, 1822.97, -0.114], [725.77, 1759.652, 0.076], [541.265, 1644.526, -0.022], [634.034, 1389.951, 0.08]]\nC: [[639.585, 1606.675, -0.122], [640.106, 1606.245, 0.078], [640.626, 1605.815, -0.022], [641.147, 1605.384, 0.078]]\nD: [[553.206, 1422.477, -0.138], [630.222, 1490.963, 0.087], [720.491, 1414.036, -0.022], [698.708, 1478.6, 0.08]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[641.894, 1481.081, -0.116], [755.139, 1696.093, 0.085], [744.337, 1645.874, -0.021], [549.883, 1475.291, 0.091]]\nB: [[609.159, 1822.97, -0.114], [725.77, 1759.652, 0.076], [541.265, 1644.526, -0.022], [634.034, 1389.951, 0.08]]\nC: [[639.585, 1606.675, -0.122], [640.106, 1606.245, 0.078], [640.626, 1605.815, -0.022], [641.147, 1605.384, 0.078]]\nD: [[553.206, 1422.477, -0.138], [630.222, 1490.963, 0.087], [720.491, 1414.036, -0.022], [698.708, 1478.6, 0.08]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_21_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_21_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_21_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_21_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_21_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_21_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_21_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_21_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1654.688, 731.801, 1.203], [1825.149, 800.76, 1.006], [1536.825, 955.686, 1.262], [2011.454, 920.864, 1.228]]\nB: [[1716.132, 870.368, 1.137], [1714.324, 869.208, 1.137], [1712.096, 868.352, 1.187], [1709.574, 867.934, 1.232]]\nC: 
[[1523.418, 951.06, 0.924], [1452.823, 761.345, 1.206], [2023.787, 900.571, 0.99], [1938.184, 774.207, 1.182]]\nD: [[1653.54, 790.02, 1.21], [1790.64, 885.935, 1.33], [1634.81, 909.54, 1.184], [1807.277, 934.183, 1.469]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1654.688, 731.801, 1.203], [1825.149, 800.76, 1.006], [1536.825, 955.686, 1.262], [2011.454, 920.864, 1.228]]\nB: [[1716.132, 870.368, 1.137], [1714.324, 869.208, 1.137], [1712.096, 868.352, 1.187], [1709.574, 867.934, 1.232]]\nC: [[1523.418, 951.06, 0.924], [1452.823, 761.345, 1.206], [2023.787, 900.571, 0.99], [1938.184, 774.207, 1.182]]\nD: [[1653.54, 790.02, 1.21], [1790.64, 885.935, 1.33], [1634.81, 909.54, 1.184], [1807.277, 934.183, 1.469]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_22_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_22_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_22_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_22_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_22_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_22_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_22_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_22_7.png" + ], + "output": "B" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1943.293, 1014.905, 2.05], [1962.947, 881.292, 1.464], [1771.641, 818.432, 1.783], [2024.383, 893.384, 1.855]]\nB: [[1842.723, 879.95, 1.901], [2117.17, 1006.474, 1.903], [1573.854, 942.118, 1.735], [2097.928, 1012.432, 1.953]]\nC: [[1897.834, 865.209, 1.738], [1897.834, 865.195, 1.688], [1897.833, 865.116, 1.688], [1897.831, 865.001, 1.688]]\nD: [[1801.762, 704.249, 1.493], [1762.225, 848.144, 1.446], [1867.693, 770.539, 1.836], [2098.827, 762.104, 1.81]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1943.293, 1014.905, 2.05], [1962.947, 881.292, 1.464], [1771.641, 818.432, 1.783], [2024.383, 893.384, 1.855]]\nB: [[1842.723, 879.95, 1.901], [2117.17, 1006.474, 1.903], [1573.854, 942.118, 1.735], [2097.928, 1012.432, 1.953]]\nC: [[1897.834, 865.209, 1.738], [1897.834, 865.195, 1.688], [1897.833, 865.116, 1.688], [1897.831, 865.001, 1.688]]\nD: [[1801.762, 704.249, 1.493], [1762.225, 848.144, 1.446], [1867.693, 770.539, 1.836], [2098.827, 762.104, 1.81]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_23_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_23_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_23_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_23_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_23_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_23_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_23_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_23_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[251.423, 613.532, -0.224], [353.651, 580.245, -0.187], [307.419, 820.39, -0.19], [319.555, 661.929, -0.115]]\nB: [[288.517, 703.944, -0.206], [287.575, 632.764, -0.222], [372.62, 616.315, -0.154], [261.943, 809.962, -0.108]]\nC: [[279.61, 776.103, -0.238], [372.908, 643.544, -0.172], [347.733, 585.413, -0.159], [339.729, 666.886, -0.117]]\nD: [[311.976, 694.922, -0.216], [311.533, 694.408, -0.203], [311.103, 693.883, -0.191], [309.589, 691.756, -0.099]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[251.423, 613.532, -0.224], [353.651, 580.245, -0.187], [307.419, 820.39, -0.19], [319.555, 661.929, -0.115]]\nB: [[288.517, 703.944, -0.206], [287.575, 632.764, -0.222], [372.62, 616.315, -0.154], [261.943, 809.962, -0.108]]\nC: [[279.61, 776.103, -0.238], [372.908, 643.544, -0.172], [347.733, 585.413, -0.159], [339.729, 666.886, -0.117]]\nD: [[311.976, 694.922, -0.216], [311.533, 694.408, -0.203], [311.103, 693.883, -0.191], [309.589, 691.756, -0.099]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_24_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_24_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_24_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_24_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_24_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_24_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_24_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_24_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[428.593, 999.538, 0.927], [376.399, 1068.754, 0.828], [470.584, 1252.944, 0.961], [513.123, 1108.855, 0.935]]\nB: [[449.491, 963.875, 0.968], [378.432, 1021.223, 1.012], [349.93, 1322.277, 1.125], [411.187, 1019.406, 0.996]]\nC: [[447.511, 
981.997, 0.973], [404.158, 1082.968, 0.919], [454.929, 1283.771, 0.917], [471.926, 1109.792, 0.83]]\nD: [[435.351, 1103.132, 0.814], [435.351, 1103.132, 0.964], [435.351, 1103.132, 1.014], [435.351, 1103.132, 0.989]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[428.593, 999.538, 0.927], [376.399, 1068.754, 0.828], [470.584, 1252.944, 0.961], [513.123, 1108.855, 0.935]]\nB: [[449.491, 963.875, 0.968], [378.432, 1021.223, 1.012], [349.93, 1322.277, 1.125], [411.187, 1019.406, 0.996]]\nC: [[447.511, 981.997, 0.973], [404.158, 1082.968, 0.919], [454.929, 1283.771, 0.917], [471.926, 1109.792, 0.83]]\nD: [[435.351, 1103.132, 0.814], [435.351, 1103.132, 0.964], [435.351, 1103.132, 1.014], [435.351, 1103.132, 0.989]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_25_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_25_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_25_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_25_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_25_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_25_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_25_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_25_7.png" + ], + "output": "D" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1289.156, 997.931, 0.17], [1528.988, 1149.542, 0.135], [1524.67, 1103.565, 0.144], [1254.51, 1059.655, 0.132]]\nB: [[1576.762, 1083.802, 0.16], [1394.53, 1020.578, 0.13], [1145.932, 1107.624, 0.169], [1436.14, 1231.523, 0.156]]\nC: [[1340.124, 1032.575, 0.154], [1340.123, 1032.575, 0.154], [1340.121, 1032.574, 0.154], [1340.12, 1032.574, 0.154]]\nD: [[1216.577, 1183.272, 0.123], [1258.5, 1034.393, 0.163], [1273.558, 1228.419, 0.14], [1288.46, 870.176, 0.174]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1289.156, 997.931, 0.17], [1528.988, 1149.542, 0.135], [1524.67, 1103.565, 0.144], [1254.51, 1059.655, 0.132]]\nB: [[1576.762, 1083.802, 0.16], [1394.53, 1020.578, 0.13], [1145.932, 1107.624, 0.169], [1436.14, 1231.523, 0.156]]\nC: [[1340.124, 1032.575, 0.154], [1340.123, 1032.575, 0.154], [1340.121, 1032.574, 0.154], [1340.12, 1032.574, 0.154]]\nD: [[1216.577, 1183.272, 0.123], [1258.5, 1034.393, 0.163], [1273.558, 1228.419, 0.14], [1288.46, 870.176, 0.174]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_26_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_26_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_26_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_26_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_26_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_26_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_26_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_26_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1255.784, 1121.555, 1.33], [1075.123, 1055.841, 1.22], [1441.444, 1208.639, 1.1], [1537.429, 1076.298, 1.45]]\nB: [[1100.277, 1164.491, 1.56], [1180.448, 1259.127, 1.17], [1475.037, 1060.06, 1.36], [1311.756, 864.536, 1.05]]\nC: [[1328.793, 876.335, 1.12], [1429.236, 996.25, 1.26], [1195.871, 932.001, 1.51], [1480.133, 1028.558, 1.25]]\nD: [[1328.425, 1052.566, 1.31], [1328.425, 1052.566, 1.31], [1328.425, 1052.566, 1.31], [1328.425, 1052.566, 1.31]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1255.784, 1121.555, 1.33], [1075.123, 1055.841, 1.22], [1441.444, 1208.639, 1.1], [1537.429, 1076.298, 1.45]]\nB: [[1100.277, 1164.491, 1.56], [1180.448, 1259.127, 1.17], [1475.037, 1060.06, 1.36], [1311.756, 864.536, 1.05]]\nC: [[1328.793, 876.335, 1.12], [1429.236, 996.25, 1.26], [1195.871, 932.001, 1.51], [1480.133, 1028.558, 1.25]]\nD: [[1328.425, 1052.566, 1.31], [1328.425, 1052.566, 1.31], [1328.425, 1052.566, 1.31], [1328.425, 1052.566, 1.31]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_27_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_27_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_27_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_27_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_27_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_27_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_27_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_27_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1273.894, 1072.524, 0.908], [1273.894, 1072.524, 0.909], [1273.894, 1072.524, 0.911], [1273.893, 1072.523, 0.912]]\nB: [[1252.346, 1105.514, 0.902], [1209.789, 1085.191, 0.984], [1114.268, 935.639, 0.74], [1170.16, 987.263, 0.918]]\nC: [[1108.639, 
1162.182, 1.069], [1297.456, 1226.014, 0.862], [1466.955, 1006.358, 0.987], [1135.299, 1250.877, 0.943]]\nD: [[1221.891, 927.735, 0.939], [1126.972, 1155.177, 0.838], [1313.844, 1145.354, 1.042], [1328.412, 1083.367, 0.762]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1273.894, 1072.524, 0.908], [1273.894, 1072.524, 0.909], [1273.894, 1072.524, 0.911], [1273.893, 1072.523, 0.912]]\nB: [[1252.346, 1105.514, 0.902], [1209.789, 1085.191, 0.984], [1114.268, 935.639, 0.74], [1170.16, 987.263, 0.918]]\nC: [[1108.639, 1162.182, 1.069], [1297.456, 1226.014, 0.862], [1466.955, 1006.358, 0.987], [1135.299, 1250.877, 0.943]]\nD: [[1221.891, 927.735, 0.939], [1126.972, 1155.177, 0.838], [1313.844, 1145.354, 1.042], [1328.412, 1083.367, 0.762]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_28_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_28_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_28_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_28_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_28_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_28_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_28_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_28_7.png" + ], + "output": "A" + }, + { + 
"task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[522.342, 1943.251, 0.31], [505.853, 1457.715, 0.375], [513.011, 1502.032, 0.633], [529.48, 1609.413, 0.729]]\nB: [[626.523, 1972.698, 0.374], [529.275, 1724.592, 0.459], [517.251, 1365.431, 0.651], [714.07, 1806.899, 0.579]]\nC: [[576.087, 1806.167, 0.315], [734.652, 1339.382, 0.394], [725.143, 1697.177, 0.608], [592.16, 1326.812, 0.692]]\nD: [[622.249, 1646.081, 0.321], [621.683, 1646.405, 0.446], [621.109, 1646.715, 0.571], [620.64, 1647.021, 0.721]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[522.342, 1943.251, 0.31], [505.853, 1457.715, 0.375], [513.011, 1502.032, 0.633], [529.48, 1609.413, 0.729]]\nB: [[626.523, 1972.698, 0.374], [529.275, 1724.592, 0.459], [517.251, 1365.431, 0.651], [714.07, 1806.899, 0.579]]\nC: [[576.087, 1806.167, 0.315], [734.652, 1339.382, 0.394], [725.143, 1697.177, 0.608], [592.16, 1326.812, 0.692]]\nD: [[622.249, 1646.081, 0.321], [621.683, 1646.405, 0.446], [621.109, 1646.715, 0.571], [620.64, 1647.021, 0.721]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_29_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_29_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_29_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_29_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_29_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_29_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_29_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_29_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1796.561, 874.996, 1.254], [1796.561, 874.982, 1.216], [1796.561, 874.969, 1.182], [1796.561, 874.957, 1.151]]\nB: [[1829.822, 1005.261, 1.194], [2129.106, 967.913, 1.335], [1439.644, 885.763, 1.155], [2034.051, 719.497, 0.987]]\nC: [[2134.229, 737.814, 1.149], [1953.993, 1047.896, 1.349], [1612.579, 940.305, 1.146], [1599.447, 982.485, 1.365]]\nD: [[1699.287, 941.961, 1.224], [1590.817, 729.191, 1.195], [1711.432, 908.722, 0.971], [1659.459, 924.897, 1.335]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1796.561, 874.996, 1.254], [1796.561, 874.982, 1.216], [1796.561, 874.969, 1.182], [1796.561, 874.957, 1.151]]\nB: [[1829.822, 1005.261, 1.194], [2129.106, 967.913, 1.335], [1439.644, 885.763, 1.155], [2034.051, 719.497, 0.987]]\nC: [[2134.229, 737.814, 1.149], [1953.993, 1047.896, 1.349], [1612.579, 940.305, 1.146], [1599.447, 982.485, 1.365]]\nD: [[1699.287, 941.961, 1.224], [1590.817, 729.191, 1.195], [1711.432, 908.722, 0.971], [1659.459, 924.897, 1.335]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_30_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_30_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_30_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_30_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_30_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_30_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_30_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_30_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[406.663, 1099.631, 0.814], [406.711, 1099.639, 0.923], [406.735, 1099.643, 0.978], [406.717, 1099.695, 0.749]]\nB: [[427.835, 1064.967, 0.714], [484.647, 916.921, 0.994], [411.142, 919.994, 1.029], [362.349, 1103.394, 0.701]]\nC: 
[[396.877, 1112.011, 0.828], [415.047, 1175.011, 0.772], [440.647, 980.302, 0.825], [395.393, 899.719, 0.603]]\nD: [[473.72, 956.4, 0.8], [485.155, 1094.253, 0.884], [398.711, 1081.924, 0.932], [430.802, 1000.92, 0.78]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[406.663, 1099.631, 0.814], [406.711, 1099.639, 0.923], [406.735, 1099.643, 0.978], [406.717, 1099.695, 0.749]]\nB: [[427.835, 1064.967, 0.714], [484.647, 916.921, 0.994], [411.142, 919.994, 1.029], [362.349, 1103.394, 0.701]]\nC: [[396.877, 1112.011, 0.828], [415.047, 1175.011, 0.772], [440.647, 980.302, 0.825], [395.393, 899.719, 0.603]]\nD: [[473.72, 956.4, 0.8], [485.155, 1094.253, 0.884], [398.711, 1081.924, 0.932], [430.802, 1000.92, 0.78]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_31_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_31_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_31_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_31_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_31_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_31_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_31_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_31_7.png" + ], + "output": "A" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1313.621, 933.434, 0.218], [1514.36, 1049.794, 0.194], [1263.349, 1108.661, 0.157], [1490.47, 980.609, 0.195]]\nB: [[1232.867, 1016.208, 0.213], [1250.875, 1010.148, 0.221], [1205.37, 1035.121, 0.184], [1092.698, 953.727, 0.188]]\nC: [[1472.729, 957.241, 0.173], [1510.795, 1241.776, 0.219], [1118.45, 1223.791, 0.168], [1218.898, 1085.684, 0.171]]\nD: [[1337.482, 1035.208, 0.186], [1337.482, 1035.208, 0.186], [1337.482, 1035.208, 0.186], [1337.482, 1035.208, 0.186]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1313.621, 933.434, 0.218], [1514.36, 1049.794, 0.194], [1263.349, 1108.661, 0.157], [1490.47, 980.609, 0.195]]\nB: [[1232.867, 1016.208, 0.213], [1250.875, 1010.148, 0.221], [1205.37, 1035.121, 0.184], [1092.698, 953.727, 0.188]]\nC: [[1472.729, 957.241, 0.173], [1510.795, 1241.776, 0.219], [1118.45, 1223.791, 0.168], [1218.898, 1085.684, 0.171]]\nD: [[1337.482, 1035.208, 0.186], [1337.482, 1035.208, 0.186], [1337.482, 1035.208, 0.186], [1337.482, 1035.208, 0.186]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_32_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_32_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_32_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_32_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_32_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_32_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_32_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_32_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1992.775, 875.132, 0.942], [1985.564, 874.76, 0.95], [1978.377, 874.483, 0.958], [1971.974, 874.315, 0.986]]\nB: [[1755.791, 1044.883, 0.825], [1877.163, 968.52, 1.04], [2106.974, 814.325, 0.994], [1945.338, 748.73, 1.14]]\nC: [[1656.177, 762.998, 0.871], [2009.557, 758.93, 0.8], [1914.45, 722.289, 1.067], [1703.798, 972.938, 1.065]]\nD: [[1816.649, 760.428, 1.116], [1730.801, 1023.39, 1.04], [2342.252, 816.69, 1.126], [2334.939, 947.14, 0.896]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1992.775, 875.132, 0.942], [1985.564, 874.76, 0.95], [1978.377, 874.483, 0.958], [1971.974, 874.315, 0.986]]\nB: [[1755.791, 1044.883, 0.825], [1877.163, 968.52, 1.04], [2106.974, 814.325, 0.994], [1945.338, 748.73, 1.14]]\nC: [[1656.177, 762.998, 0.871], [2009.557, 758.93, 0.8], [1914.45, 722.289, 1.067], [1703.798, 972.938, 1.065]]\nD: [[1816.649, 760.428, 1.116], [1730.801, 1023.39, 1.04], [2342.252, 816.69, 1.126], [2334.939, 947.14, 0.896]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_33_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_33_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_33_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_33_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_33_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_33_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_33_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_33_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[331.098, 1052.547, 0.565], [408.797, 1100.389, 0.637], [418.765, 1332.696, 0.625], [352.119, 1242.135, 0.631]]\nB: [[421.121, 1227.662, 0.557], [446.642, 1087.379, 0.513], [450.924, 1107.261, 0.47], [392.549, 1175.812, 0.691]]\nC: [[396.535, 1162.355, 
0.498], [396.535, 1162.355, 0.534], [396.535, 1162.355, 0.571], [396.535, 1162.355, 0.608]]\nD: [[463.951, 972.839, 0.532], [365.417, 1075.626, 0.44], [381.022, 1300.867, 0.549], [368.078, 1350.532, 0.537]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[331.098, 1052.547, 0.565], [408.797, 1100.389, 0.637], [418.765, 1332.696, 0.625], [352.119, 1242.135, 0.631]]\nB: [[421.121, 1227.662, 0.557], [446.642, 1087.379, 0.513], [450.924, 1107.261, 0.47], [392.549, 1175.812, 0.691]]\nC: [[396.535, 1162.355, 0.498], [396.535, 1162.355, 0.534], [396.535, 1162.355, 0.571], [396.535, 1162.355, 0.608]]\nD: [[463.951, 972.839, 0.532], [365.417, 1075.626, 0.44], [381.022, 1300.867, 0.549], [368.078, 1350.532, 0.537]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_34_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_34_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_34_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_34_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_34_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_34_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_34_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_34_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[321.096, 668.091, 0.879], [321.112, 668.116, 0.885], [321.128, 668.141, 0.891], [321.143, 668.166, 0.897]]\nB: [[268.388, 688.723, 0.734], [302.215, 796.657, 0.989], [302.241, 565.326, 1.022], [265.213, 770.117, 0.814]]\nC: [[314.729, 566.271, 0.999], [287.802, 590.987, 1.045], [272.417, 724.544, 0.717], [323.87, 780.287, 0.926]]\nD: [[376.158, 594.596, 0.841], [277.747, 714.363, 0.978], [382.966, 588.719, 0.996], [345.414, 561.146, 0.948]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[321.096, 668.091, 0.879], [321.112, 668.116, 0.885], [321.128, 668.141, 0.891], [321.143, 668.166, 0.897]]\nB: [[268.388, 688.723, 0.734], [302.215, 796.657, 0.989], [302.241, 565.326, 1.022], [265.213, 770.117, 0.814]]\nC: [[314.729, 566.271, 0.999], [287.802, 590.987, 1.045], [272.417, 724.544, 0.717], [323.87, 780.287, 0.926]]\nD: [[376.158, 594.596, 0.841], [277.747, 714.363, 0.978], [382.966, 588.719, 0.996], [345.414, 561.146, 0.948]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_35_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_35_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_35_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_35_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_35_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_35_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_35_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_35_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[754.263, 1891.382, 0.199], [593.138, 1372.169, 0.319], [709.86, 1582.863, 0.554], [634.519, 1531.646, 0.56]]\nB: [[729.655, 1603.012, 0.212], [526.812, 1703.833, 0.343], [552.52, 1297.518, 0.437], [592.969, 1803.518, 0.61]]\nC: [[632.049, 1352.661, 0.204], [726.247, 1377.851, 0.377], [577.44, 1302.511, 0.523], [636.437, 1877.196, 0.48]]\nD: [[655.912, 1592.667, 0.218], [655.637, 1593.173, 0.377], [655.34, 1593.667, 0.535], [654.899, 1594.227, 0.56]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[754.263, 1891.382, 0.199], [593.138, 1372.169, 0.319], [709.86, 1582.863, 0.554], [634.519, 1531.646, 0.56]]\nB: [[729.655, 1603.012, 0.212], [526.812, 1703.833, 0.343], [552.52, 1297.518, 0.437], [592.969, 1803.518, 0.61]]\nC: [[632.049, 1352.661, 0.204], [726.247, 1377.851, 0.377], [577.44, 1302.511, 0.523], [636.437, 1877.196, 0.48]]\nD: [[655.912, 1592.667, 0.218], [655.637, 1593.173, 0.377], [655.34, 1593.667, 0.535], [654.899, 1594.227, 0.56]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_36_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_36_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_36_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_36_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_36_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_36_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_36_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_36_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1863.967, 857.871, 0.65], [1863.962, 857.872, 0.65], [1863.962, 857.872, 0.65], [1863.962, 857.872, 0.65]]\nB: [[2064.908, 1013.124, 0.75], [2122.552, 822.014, 0.59], [2177.833, 1012.188, 0.75], [1595.769, 822.35, 0.73]]\nC: [[1731.702, 852.264, 
0.72], [2128.868, 793.194, 0.77], [1755.246, 973.676, 0.67], [1568.102, 944.114, 0.53]]\nD: [[1764.474, 940.448, 0.74], [2091.49, 945.26, 0.67], [2118.947, 923.168, 0.72], [1633.719, 960.882, 0.7]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1863.967, 857.871, 0.65], [1863.962, 857.872, 0.65], [1863.962, 857.872, 0.65], [1863.962, 857.872, 0.65]]\nB: [[2064.908, 1013.124, 0.75], [2122.552, 822.014, 0.59], [2177.833, 1012.188, 0.75], [1595.769, 822.35, 0.73]]\nC: [[1731.702, 852.264, 0.72], [2128.868, 793.194, 0.77], [1755.246, 973.676, 0.67], [1568.102, 944.114, 0.53]]\nD: [[1764.474, 940.448, 0.74], [2091.49, 945.26, 0.67], [2118.947, 923.168, 0.72], [1633.719, 960.882, 0.7]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_37_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_37_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_37_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_37_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_37_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_37_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_37_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_37_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1780.13, 3028.6, -0.525], [1674.509, 2149.928, -0.337], [1576.236, 2276.286, -0.134], [1537.853, 2314.916, 0.007]]\nB: [[1601.026, 2969.24, -0.541], [2094.18, 2097.632, -0.298], [2014.168, 2653.318, -0.14], [1803.211, 2667.419, 0.009]]\nC: [[1811.441, 2574.96, -0.473], [1814.647, 2570.443, -0.296], [1818.149, 2566.591, -0.119], [1820.651, 2564.035, 0.009]]\nD: [[1791.545, 2532.09, -0.471], [1731.966, 2573.436, -0.251], [1598.687, 2327.018, -0.116], [1468.52, 2562.672, 0.009]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1780.13, 3028.6, -0.525], [1674.509, 2149.928, -0.337], [1576.236, 2276.286, -0.134], [1537.853, 2314.916, 0.007]]\nB: [[1601.026, 2969.24, -0.541], [2094.18, 2097.632, -0.298], [2014.168, 2653.318, -0.14], [1803.211, 2667.419, 0.009]]\nC: [[1811.441, 2574.96, -0.473], [1814.647, 2570.443, -0.296], [1818.149, 2566.591, -0.119], [1820.651, 2564.035, 0.009]]\nD: [[1791.545, 2532.09, -0.471], [1731.966, 2573.436, -0.251], [1598.687, 2327.018, -0.116], [1468.52, 2562.672, 0.009]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_38_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_38_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_38_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_38_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_38_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_38_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_38_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_38_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[418.17, 1093.457, 0.829], [405.894, 1044.702, 0.431], [420.968, 1127.645, 0.604], [406.025, 1265.181, 0.687]]\nB: [[422.18, 1093.142, 0.749], [422.146, 1093.149, 0.523], [422.164, 1093.151, 0.575], [422.182, 1093.152, 0.627]]\nC: [[424.56, 1104.052, 0.696], [456.777, 1163.284, 0.489], [355.959, 1084.822, 0.587], [353.668, 881.288, 0.749]]\nD: [[472.5, 1170.954, 0.897], [500.203, 1162.062, 0.492], [472.1, 1132.062, 0.684], [450.284, 916.311, 0.647]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[418.17, 1093.457, 0.829], [405.894, 1044.702, 0.431], [420.968, 1127.645, 0.604], [406.025, 1265.181, 0.687]]\nB: [[422.18, 1093.142, 0.749], [422.146, 1093.149, 0.523], [422.164, 1093.151, 0.575], [422.182, 1093.152, 0.627]]\nC: [[424.56, 1104.052, 0.696], [456.777, 1163.284, 0.489], [355.959, 1084.822, 0.587], [353.668, 881.288, 0.749]]\nD: [[472.5, 1170.954, 0.897], [500.203, 1162.062, 0.492], [472.1, 1132.062, 0.684], [450.284, 916.311, 0.647]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_39_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_39_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_39_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_39_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_39_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_39_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_39_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_39_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[390.675, 1148.918, 0.446], [390.675, 1148.918, 0.486], [390.675, 1148.918, 0.526], [390.675, 1148.918, 0.566]]\nB: [[325.378, 1080.282, 0.378], [401.054, 1111.492, 0.413], [443.699, 1336.224, 0.541], [437.757, 1205.106, 0.494]]\nC: [[376.096, 
1180.944, 0.535], [365.879, 1297.989, 0.536], [347.139, 1107.499, 0.489], [390.705, 1129.597, 0.653]]\nD: [[319.548, 938.981, 0.435], [320.089, 1375.531, 0.568], [447.751, 1028.646, 0.524], [462.869, 953.708, 0.657]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[390.675, 1148.918, 0.446], [390.675, 1148.918, 0.486], [390.675, 1148.918, 0.526], [390.675, 1148.918, 0.566]]\nB: [[325.378, 1080.282, 0.378], [401.054, 1111.492, 0.413], [443.699, 1336.224, 0.541], [437.757, 1205.106, 0.494]]\nC: [[376.096, 1180.944, 0.535], [365.879, 1297.989, 0.536], [347.139, 1107.499, 0.489], [390.705, 1129.597, 0.653]]\nD: [[319.548, 938.981, 0.435], [320.089, 1375.531, 0.568], [447.751, 1028.646, 0.524], [462.869, 953.708, 0.657]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_40_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_40_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_40_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_40_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_40_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_40_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_40_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_40_7.png" + ], + "output": "A" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1843.257, 2538.618, -0.551], [1847.727, 2533.368, -0.252], [1851.69, 2528.728, 0.148], [1855.661, 2524.133, 0.447]]\nB: [[2149.832, 2452.89, -0.472], [2084.541, 3035.493, -0.262], [2202.07, 2375.125, 0.153], [1741.345, 2112.152, 0.38]]\nC: [[1481.384, 2461.292, -0.523], [1555.975, 2186.05, -0.244], [1900.07, 2064.722, 0.165], [2087.255, 2686.41, 0.442]]\nD: [[1970.321, 2572.246, -0.542], [1648.575, 2617.927, -0.295], [1998.79, 2542.913, 0.12], [2210.323, 2215.488, 0.469]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1843.257, 2538.618, -0.551], [1847.727, 2533.368, -0.252], [1851.69, 2528.728, 0.148], [1855.661, 2524.133, 0.447]]\nB: [[2149.832, 2452.89, -0.472], [2084.541, 3035.493, -0.262], [2202.07, 2375.125, 0.153], [1741.345, 2112.152, 0.38]]\nC: [[1481.384, 2461.292, -0.523], [1555.975, 2186.05, -0.244], [1900.07, 2064.722, 0.165], [2087.255, 2686.41, 0.442]]\nD: [[1970.321, 2572.246, -0.542], [1648.575, 2617.927, -0.295], [1998.79, 2542.913, 0.12], [2210.323, 2215.488, 0.469]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_41_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_41_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_41_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_41_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_41_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_41_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_41_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_41_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[405.649, 1108.528, 0.594], [405.644, 1108.505, 0.674], [405.626, 1108.416, 0.494], [405.656, 1108.482, 0.494]]\nB: [[334.296, 1327.717, 0.679], [384.849, 1314.532, 0.74], [423.319, 950.2, 0.426], [331.031, 1040.91, 0.551]]\nC: [[347.771, 1314.846, 0.498], [446.389, 1307.841, 0.727], [399.575, 1219.724, 0.443], [426.17, 1311.828, 0.436]]\nD: [[400.335, 1102.261, 0.598], [348.445, 1284.149, 0.65], [478.752, 1133.775, 0.474], [355.374, 1236.721, 0.511]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[405.649, 1108.528, 0.594], [405.644, 1108.505, 0.674], [405.626, 1108.416, 0.494], [405.656, 1108.482, 0.494]]\nB: [[334.296, 1327.717, 0.679], [384.849, 1314.532, 0.74], [423.319, 950.2, 0.426], [331.031, 1040.91, 0.551]]\nC: [[347.771, 1314.846, 0.498], [446.389, 1307.841, 0.727], [399.575, 1219.724, 0.443], [426.17, 1311.828, 0.436]]\nD: [[400.335, 1102.261, 0.598], [348.445, 1284.149, 0.65], [478.752, 1133.775, 0.474], [355.374, 1236.721, 0.511]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_42_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_42_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_42_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_42_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_42_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_42_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_42_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_42_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1428.27, 974.381, 1.985], [1433.79, 878.793, 1.553], [1487.91, 991.551, 2.084], [1094.85, 1099.062, 1.834]]\nB: [[1126.35, 864.162, 1.48], [1369.77, 1079.18, 2.095], [1104.67, 888.249, 1.995], [1467.5, 1079.513, 1.593]]\nC: [[1319.41, 1031.387, 
1.821], [1319.41, 1031.387, 1.821], [1319.41, 1031.387, 1.821], [1319.41, 1031.387, 1.821]]\nD: [[1325.9, 922.588, 2.092], [1241.85, 1191.619, 1.687], [1156.96, 1063.21, 1.942], [1396.09, 908.012, 1.846]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1428.27, 974.381, 1.985], [1433.79, 878.793, 1.553], [1487.91, 991.551, 2.084], [1094.85, 1099.062, 1.834]]\nB: [[1126.35, 864.162, 1.48], [1369.77, 1079.18, 2.095], [1104.67, 888.249, 1.995], [1467.5, 1079.513, 1.593]]\nC: [[1319.41, 1031.387, 1.821], [1319.41, 1031.387, 1.821], [1319.41, 1031.387, 1.821], [1319.41, 1031.387, 1.821]]\nD: [[1325.9, 922.588, 2.092], [1241.85, 1191.619, 1.687], [1156.96, 1063.21, 1.942], [1396.09, 908.012, 1.846]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_43_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_43_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_43_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_43_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_43_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_43_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_43_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_43_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[409.465, 1216.476, 0.476], [318.714, 1226.609, 0.544], [365.342, 995.006, 0.666], [409.876, 1110.956, 0.592]]\nB: [[394.755, 1113.151, 0.528], [394.774, 1113.143, 0.578], [394.793, 1113.134, 0.628], [394.793, 1113.134, 0.703]]\nC: [[420.334, 1155.862, 0.481], [398.422, 922.217, 0.485], [347.385, 1076.56, 0.624], [333.837, 1269.244, 0.608]]\nD: [[335.019, 1099.773, 0.481], [389.242, 976.46, 0.466], [401.879, 992.855, 0.713], [331.612, 1204.414, 0.622]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[409.465, 1216.476, 0.476], [318.714, 1226.609, 0.544], [365.342, 995.006, 0.666], [409.876, 1110.956, 0.592]]\nB: [[394.755, 1113.151, 0.528], [394.774, 1113.143, 0.578], [394.793, 1113.134, 0.628], [394.793, 1113.134, 0.703]]\nC: [[420.334, 1155.862, 0.481], [398.422, 922.217, 0.485], [347.385, 1076.56, 0.624], [333.837, 1269.244, 0.608]]\nD: [[335.019, 1099.773, 0.481], [389.242, 976.46, 0.466], [401.879, 992.855, 0.713], [331.612, 1204.414, 0.622]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_44_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_44_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_44_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_44_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_44_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_44_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_44_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_44_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[361.726, 683.196, 1.397], [329.217, 690.805, 1.946], [283.896, 621.516, 1.958], [356.569, 620.728, 2.064]]\nB: [[418.211, 729.764, 1.616], [294.113, 717.923, 1.419], [354.161, 578.812, 1.657], [290.406, 708.411, 2.08]]\nC: [[294.448, 559.154, 1.483], [317.072, 572.818, 2.094], [333.21, 533.806, 2.046], [288.729, 702.966, 1.84]]\nD: [[349.242, 634.568, 1.725], [349.228, 634.584, 1.748], [349.213, 634.601, 1.771], [349.198, 634.618, 1.794]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[361.726, 683.196, 1.397], [329.217, 690.805, 1.946], [283.896, 621.516, 1.958], [356.569, 620.728, 2.064]]\nB: [[418.211, 729.764, 1.616], [294.113, 717.923, 1.419], [354.161, 578.812, 1.657], [290.406, 708.411, 2.08]]\nC: [[294.448, 559.154, 1.483], [317.072, 572.818, 2.094], [333.21, 533.806, 2.046], [288.729, 702.966, 1.84]]\nD: [[349.242, 634.568, 1.725], [349.228, 634.584, 1.748], [349.213, 634.601, 1.771], [349.198, 634.618, 1.794]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_45_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_45_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_45_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_45_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_45_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_45_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_45_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_45_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1342.828, 1030.123, -0.019], [1342.828, 1030.123, -0.019], [1342.828, 1030.123, -0.019], [1342.828, 1030.123, -0.019]]\nB: [[1097.533, 870.18, -0.018], [1280.182, 1133.641, -0.019], [1489.479, 998.305, -0.016], [1148.803, 1033.836, -0.018]]\nC: [[1127.233, 
1005.075, -0.016], [1511.451, 847.909, -0.022], [1150.864, 1055.903, -0.02], [1443.444, 1006.94, -0.017]]\nD: [[1513.245, 1007.752, -0.022], [1331.585, 1065.932, -0.016], [1532.891, 854.441, -0.017], [1526.175, 951.171, -0.022]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1342.828, 1030.123, -0.019], [1342.828, 1030.123, -0.019], [1342.828, 1030.123, -0.019], [1342.828, 1030.123, -0.019]]\nB: [[1097.533, 870.18, -0.018], [1280.182, 1133.641, -0.019], [1489.479, 998.305, -0.016], [1148.803, 1033.836, -0.018]]\nC: [[1127.233, 1005.075, -0.016], [1511.451, 847.909, -0.022], [1150.864, 1055.903, -0.02], [1443.444, 1006.94, -0.017]]\nD: [[1513.245, 1007.752, -0.022], [1331.585, 1065.932, -0.016], [1532.891, 854.441, -0.017], [1526.175, 951.171, -0.022]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_46_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_46_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_46_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_46_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_46_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_46_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_46_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_46_7.png" + ], + "output": 
"A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[368.243, 1147.276, 0.495], [320.188, 1220.441, 0.564], [378.673, 985.999, 0.66], [415.817, 1250.586, 0.729]]\nB: [[471.077, 1316.726, 0.65], [388.888, 998.99, 0.705], [397.125, 1213.868, 0.57], [326.301, 938.598, 0.746]]\nC: [[394.039, 1143.246, 0.615], [391.841, 1138.065, 0.615], [389.353, 1132.372, 0.64], [387.343, 1127.335, 0.765]]\nD: [[380.686, 1104.044, 0.666], [420.094, 1131.831, 0.503], [335.098, 1016.255, 0.76], [342.797, 1164.927, 0.672]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[368.243, 1147.276, 0.495], [320.188, 1220.441, 0.564], [378.673, 985.999, 0.66], [415.817, 1250.586, 0.729]]\nB: [[471.077, 1316.726, 0.65], [388.888, 998.99, 0.705], [397.125, 1213.868, 0.57], [326.301, 938.598, 0.746]]\nC: [[394.039, 1143.246, 0.615], [391.841, 1138.065, 0.615], [389.353, 1132.372, 0.64], [387.343, 1127.335, 0.765]]\nD: [[380.686, 1104.044, 0.666], [420.094, 1131.831, 0.503], [335.098, 1016.255, 0.76], [342.797, 1164.927, 0.672]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_47_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_47_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_47_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_47_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_47_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_47_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_47_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_47_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[529.721, 1110.52, 1.789], [418.305, 959.31, 1.737], [400.029, 1228.06, 2.221], [451.205, 1117.422, 2.204]]\nB: [[411.023, 1090.6, 1.799], [524.659, 1002.9, 1.558], [459.22, 1085.139, 1.901], [427.15, 911.884, 1.822]]\nC: [[503.852, 1131.29, 2.197], [402.563, 1323.31, 1.648], [532.361, 1202.739, 1.985], [480.913, 1034.846, 2.094]]\nD: [[456.587, 1114.23, 2.052], [448.914, 1116.73, 1.885], [448.331, 1116.811, 1.887], [446.322, 1116.881, 2.007]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[529.721, 1110.52, 1.789], [418.305, 959.31, 1.737], [400.029, 1228.06, 2.221], [451.205, 1117.422, 2.204]]\nB: [[411.023, 1090.6, 1.799], [524.659, 1002.9, 1.558], [459.22, 1085.139, 1.901], [427.15, 911.884, 1.822]]\nC: [[503.852, 1131.29, 2.197], [402.563, 1323.31, 1.648], [532.361, 1202.739, 1.985], [480.913, 1034.846, 2.094]]\nD: [[456.587, 1114.23, 2.052], [448.914, 1116.73, 1.885], [448.331, 1116.811, 1.887], [446.322, 1116.881, 2.007]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_48_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_48_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_48_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_48_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_48_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_48_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_48_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_48_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[410.862, 1106.326, 0.665], [410.862, 1106.326, 0.59], [410.862, 1106.326, 0.553], [410.862, 1106.326, 0.415]]\nB: [[404.294, 1051.995, 0.561], [398.816, 1084.635, 0.64], [356.338, 1249.05, 0.503], [446.466, 1282.71, 0.342]]\nC: [[336.012, 1230.001, 
0.749], [456.309, 1162.403, 0.66], [488.514, 919.924, 0.477], [360.602, 1191.978, 0.369]]\nD: [[407.759, 1016.766, 0.63], [366.992, 935.62, 0.5], [484.755, 1037.534, 0.603], [490.174, 1145.425, 0.38]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[410.862, 1106.326, 0.665], [410.862, 1106.326, 0.59], [410.862, 1106.326, 0.553], [410.862, 1106.326, 0.415]]\nB: [[404.294, 1051.995, 0.561], [398.816, 1084.635, 0.64], [356.338, 1249.05, 0.503], [446.466, 1282.71, 0.342]]\nC: [[336.012, 1230.001, 0.749], [456.309, 1162.403, 0.66], [488.514, 919.924, 0.477], [360.602, 1191.978, 0.369]]\nD: [[407.759, 1016.766, 0.63], [366.992, 935.62, 0.5], [484.755, 1037.534, 0.603], [490.174, 1145.425, 0.38]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_49_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_49_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_49_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_49_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_49_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_49_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_49_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_49_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[674.817, 1495.218, -0.033], [637.599, 1735.138, 0.04], [753.378, 1282.425, 0.266], [579.881, 1534.752, 0.639]]\nB: [[709.455, 1518.871, -0.042], [743.219, 1536.668, 0.05], [546.398, 1312.775, 0.212], [543.916, 1348.998, 0.477]]\nC: [[646.543, 1481.061, -0.039], [690.227, 1278.475, 0.06], [658.101, 1765.87, 0.225], [673.992, 1863.036, 0.546]]\nD: [[654.306, 1593.839, -0.039], [654.897, 1593.314, 0.05], [655.544, 1592.867, 0.238], [656.181, 1592.404, 0.554]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[674.817, 1495.218, -0.033], [637.599, 1735.138, 0.04], [753.378, 1282.425, 0.266], [579.881, 1534.752, 0.639]]\nB: [[709.455, 1518.871, -0.042], [743.219, 1536.668, 0.05], [546.398, 1312.775, 0.212], [543.916, 1348.998, 0.477]]\nC: [[646.543, 1481.061, -0.039], [690.227, 1278.475, 0.06], [658.101, 1765.87, 0.225], [673.992, 1863.036, 0.546]]\nD: [[654.306, 1593.839, -0.039], [654.897, 1593.314, 0.05], [655.544, 1592.867, 0.238], [656.181, 1592.404, 0.554]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_50_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_50_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_50_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_50_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_50_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_50_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_50_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_50_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[318.904, 699.89, -0.305], [356.863, 749.45, -0.295], [342.145, 817.48, -0.165], [362.888, 813.169, -0.152]]\nB: [[311.846, 696.05, -0.326], [311.404, 695.55, -0.298], [309.653, 693.549, -0.188], [309.251, 693.028, -0.161]]\nC: [[270.331, 808.82, -0.365], [259.897, 752.35, -0.352], [311.667, 620.881, -0.218], [273.84, 705.279, -0.182]]\nD: [[285.234, 746.79, -0.327], [276.961, 728.57, -0.341], [274.698, 714.537, -0.21], [341.495, 714.928, -0.131]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[318.904, 699.89, -0.305], [356.863, 749.45, -0.295], [342.145, 817.48, -0.165], [362.888, 813.169, -0.152]]\nB: [[311.846, 696.05, -0.326], [311.404, 695.55, -0.298], [309.653, 693.549, -0.188], [309.251, 693.028, -0.161]]\nC: [[270.331, 808.82, -0.365], [259.897, 752.35, -0.352], [311.667, 620.881, -0.218], [273.84, 705.279, -0.182]]\nD: [[285.234, 746.79, -0.327], [276.961, 728.57, -0.341], [274.698, 714.537, -0.21], [341.495, 714.928, -0.131]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_51_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_51_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_51_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_51_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_51_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_51_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_51_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_51_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[432.564, 1385.721, 0.86], [382.527, 1126.126, 0.819], [443.57, 1262.347, 0.849], [446.802, 1286.824, 0.805]]\nB: [[370.754, 1092.547, 0.872], [331.67, 1167.043, 0.761], [399.201, 1018.42, 0.801], [365.027, 1292.343, 0.758]]\nC: [[408.524, 1190.723, 
0.733], [408.524, 1190.723, 0.773], [408.524, 1190.723, 0.814], [408.524, 1190.723, 0.854]]\nD: [[402.914, 1215.467, 0.85], [450.76, 1135.126, 0.766], [461.237, 971.14, 0.851], [437.03, 1104.878, 0.788]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[432.564, 1385.721, 0.86], [382.527, 1126.126, 0.819], [443.57, 1262.347, 0.849], [446.802, 1286.824, 0.805]]\nB: [[370.754, 1092.547, 0.872], [331.67, 1167.043, 0.761], [399.201, 1018.42, 0.801], [365.027, 1292.343, 0.758]]\nC: [[408.524, 1190.723, 0.733], [408.524, 1190.723, 0.773], [408.524, 1190.723, 0.814], [408.524, 1190.723, 0.854]]\nD: [[402.914, 1215.467, 0.85], [450.76, 1135.126, 0.766], [461.237, 971.14, 0.851], [437.03, 1104.878, 0.788]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_52_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_52_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_52_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_52_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_52_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_52_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_52_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_52_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[509.54, 973.159, 0.18], [380.671, 1268.603, 0.2], [479.07, 1050.521, 0.236], [494.912, 1129.693, 0.227]]\nB: [[439.95, 1094.017, 0.17], [439.878, 1094.005, 0.2], [439.87, 1094.004, 0.204], [439.863, 1094.003, 0.207]]\nC: [[469.34, 1051.411, 0.16], [354.833, 915.321, 0.2], [409.43, 978.881, 0.18], [455.437, 1174.679, 0.197]]\nD: [[450.6, 1030.444, 0.18], [376.497, 1114.358, 0.2], [397.1, 1100.748, 0.221], [400.171, 883.327, 0.198]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[509.54, 973.159, 0.18], [380.671, 1268.603, 0.2], [479.07, 1050.521, 0.236], [494.912, 1129.693, 0.227]]\nB: [[439.95, 1094.017, 0.17], [439.878, 1094.005, 0.2], [439.87, 1094.004, 0.204], [439.863, 1094.003, 0.207]]\nC: [[469.34, 1051.411, 0.16], [354.833, 915.321, 0.2], [409.43, 978.881, 0.18], [455.437, 1174.679, 0.197]]\nD: [[450.6, 1030.444, 0.18], [376.497, 1114.358, 0.2], [397.1, 1100.748, 0.221], [400.171, 883.327, 0.198]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_53_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_53_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_53_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_53_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_53_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_53_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_53_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_53_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[745.24, 1895.181, 1.097], [763.549, 1536.044, 0.845], [570.809, 1601.968, 0.989], [652.246, 1423.25, 1.127]]\nB: [[532.12, 1872.39, 0.899], [664.607, 1965.812, 0.921], [689.926, 1479.159, 0.981], [613.897, 1574.3, 1.137]]\nC: [[548.35, 1392.059, 1.158], [726.228, 1698.055, 1.001], [546.517, 1691.289, 0.982], [644.748, 1773.44, 1.004]]\nD: [[638.38, 1644.304, 0.969], [638.651, 1644.538, 0.969], [639.028, 1644.741, 0.969], [639.302, 1644.98, 0.969]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[745.24, 1895.181, 1.097], [763.549, 1536.044, 0.845], [570.809, 1601.968, 0.989], [652.246, 1423.25, 1.127]]\nB: [[532.12, 1872.39, 0.899], [664.607, 1965.812, 0.921], [689.926, 1479.159, 0.981], [613.897, 1574.3, 1.137]]\nC: [[548.35, 1392.059, 1.158], [726.228, 1698.055, 1.001], [546.517, 1691.289, 0.982], [644.748, 1773.44, 1.004]]\nD: [[638.38, 1644.304, 0.969], [638.651, 1644.538, 0.969], [639.028, 1644.741, 0.969], [639.302, 1644.98, 0.969]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_54_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_54_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_54_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_54_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_54_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_54_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_54_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_54_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[498.649, 946.58, 0.483], [429.418, 1259.918, 0.62], [364.692, 1168.776, 0.667], [459.431, 959.241, 0.827]]\nB: [[455.692, 1030.221, 0.542], [455.48, 1267.707, 0.483], [500.554, 1295.934, 0.772], [397.8, 1198.962, 0.903]]\nC: [[424.598, 1092.173, 
0.591], [424.547, 1092.198, 0.561], [424.495, 1092.222, 0.732], [424.504, 1092.223, 0.809]]\nD: [[442.794, 1098.547, 0.607], [348.384, 1268.237, 0.46], [435.072, 1179.416, 0.685], [349.227, 1283.876, 0.743]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[498.649, 946.58, 0.483], [429.418, 1259.918, 0.62], [364.692, 1168.776, 0.667], [459.431, 959.241, 0.827]]\nB: [[455.692, 1030.221, 0.542], [455.48, 1267.707, 0.483], [500.554, 1295.934, 0.772], [397.8, 1198.962, 0.903]]\nC: [[424.598, 1092.173, 0.591], [424.547, 1092.198, 0.561], [424.495, 1092.222, 0.732], [424.504, 1092.223, 0.809]]\nD: [[442.794, 1098.547, 0.607], [348.384, 1268.237, 0.46], [435.072, 1179.416, 0.685], [349.227, 1283.876, 0.743]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_55_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_55_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_55_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_55_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_55_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_55_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_55_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_55_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1592.863, 1046.051, 0.228], [1890.036, 733.66, 0.198], [2193.82, 880.022, 0.24], [2269.051, 988.07, 0.246]]\nB: [[1799.697, 977.341, 0.188], [2255.207, 938.924, 0.229], [2189.995, 1042.344, 0.204], [1954.403, 990.03, 0.25]]\nC: [[1920.044, 873.356, 0.213], [1920.067, 873.333, 0.213], [1920.067, 873.333, 0.213], [1920.021, 873.38, 0.263]]\nD: [[2188.053, 984.479, 0.185], [1789.35, 823.078, 0.223], [2262.284, 775.407, 0.196], [2232.543, 929.3, 0.291]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1592.863, 1046.051, 0.228], [1890.036, 733.66, 0.198], [2193.82, 880.022, 0.24], [2269.051, 988.07, 0.246]]\nB: [[1799.697, 977.341, 0.188], [2255.207, 938.924, 0.229], [2189.995, 1042.344, 0.204], [1954.403, 990.03, 0.25]]\nC: [[1920.044, 873.356, 0.213], [1920.067, 873.333, 0.213], [1920.067, 873.333, 0.213], [1920.021, 873.38, 0.263]]\nD: [[2188.053, 984.479, 0.185], [1789.35, 823.078, 0.223], [2262.284, 775.407, 0.196], [2232.543, 929.3, 0.291]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_56_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_56_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_56_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_56_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_56_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_56_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_56_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_56_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1812.249, 2571.668, -0.038], [1816.741, 2566.203, 0.124], [1820.586, 2560.847, 0.325], [1825.127, 2555.039, 0.499]]\nB: [[1810.749, 2477.016, -0.044], [1526.117, 2495.829, 0.129], [2120.606, 2682.221, 0.369], [1940.674, 2177.131, 0.513]]\nC: [[1882.741, 2318.424, -0.045], [1487.68, 2321.211, 0.127], [2151.691, 2137.892, 0.264], [1751.426, 2963.026, 0.451]]\nD: [[1614.268, 2747.937, -0.04], [1694.976, 3075.224, 0.115], [1495.647, 3054.549, 0.349], [2186.702, 2819.745, 0.446]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1812.249, 2571.668, -0.038], [1816.741, 2566.203, 0.124], [1820.586, 2560.847, 0.325], [1825.127, 2555.039, 0.499]]\nB: [[1810.749, 2477.016, -0.044], [1526.117, 2495.829, 0.129], [2120.606, 2682.221, 0.369], [1940.674, 2177.131, 0.513]]\nC: [[1882.741, 2318.424, -0.045], [1487.68, 2321.211, 0.127], [2151.691, 2137.892, 0.264], [1751.426, 2963.026, 0.451]]\nD: [[1614.268, 2747.937, -0.04], [1694.976, 3075.224, 0.115], [1495.647, 3054.549, 0.349], [2186.702, 2819.745, 0.446]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_57_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_57_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_57_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_57_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_57_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_57_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_57_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_57_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1774.223, 2610.364, 0.996], [1774.053, 2610.352, 1.02], [1773.502, 2611.229, 1.045], [1773.029, 2612.151, 1.07]]\nB: [[1776.641, 2350.607, 1.067], [1644.964, 2423.862, 0.9], [1725.868, 3040.601, 1.103], [2124.437, 
2193.747, 0.89]]\nC: [[1671.897, 2705.242, 1.029], [1872.101, 2819.316, 0.97], [1650.995, 2602.6, 1.046], [1436.928, 2842.091, 1.11]]\nD: [[1953.59, 2775.546, 0.952], [1810.746, 2706.189, 0.97], [1565.979, 2177.88, 1.084], [1805.0, 2120.155, 0.92]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1774.223, 2610.364, 0.996], [1774.053, 2610.352, 1.02], [1773.502, 2611.229, 1.045], [1773.029, 2612.151, 1.07]]\nB: [[1776.641, 2350.607, 1.067], [1644.964, 2423.862, 0.9], [1725.868, 3040.601, 1.103], [2124.437, 2193.747, 0.89]]\nC: [[1671.897, 2705.242, 1.029], [1872.101, 2819.316, 0.97], [1650.995, 2602.6, 1.046], [1436.928, 2842.091, 1.11]]\nD: [[1953.59, 2775.546, 0.952], [1810.746, 2706.189, 0.97], [1565.979, 2177.88, 1.084], [1805.0, 2120.155, 0.92]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_58_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_58_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_58_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_58_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_58_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_58_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_58_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_58_7.png" + ], + "output": "A" + 
}, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1736.392, 873.361, 1.079], [1734.892, 874.826, 1.069], [1733.79, 876.382, 1.059], [1732.878, 880.029, 1.041]]\nB: [[1927.606, 821.613, 1.179], [1635.106, 838.612, 1.118], [1798.04, 779.408, 1.262], [1906.044, 824.489, 1.153]]\nC: [[1655.684, 967.021, 1.001], [2054.919, 792.455, 1.056], [1489.06, 752.973, 1.09], [1601.794, 1000.219, 1.236]]\nD: [[2074.836, 711.213, 0.881], [1659.637, 744.201, 1.258], [2073.69, 988.772, 1.118], [1777.227, 826.859, 1.096]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1736.392, 873.361, 1.079], [1734.892, 874.826, 1.069], [1733.79, 876.382, 1.059], [1732.878, 880.029, 1.041]]\nB: [[1927.606, 821.613, 1.179], [1635.106, 838.612, 1.118], [1798.04, 779.408, 1.262], [1906.044, 824.489, 1.153]]\nC: [[1655.684, 967.021, 1.001], [2054.919, 792.455, 1.056], [1489.06, 752.973, 1.09], [1601.794, 1000.219, 1.236]]\nD: [[2074.836, 711.213, 0.881], [1659.637, 744.201, 1.258], [2073.69, 988.772, 1.118], [1777.227, 826.859, 1.096]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_59_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_59_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_59_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_59_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_59_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_59_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_59_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_59_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[283.606, 696.753, 1.329], [383.644, 532.364, 1.363], [332.149, 525.401, 1.473], [385.71, 734.583, 1.097]]\nB: [[383.086, 639.489, 1.063], [279.173, 585.298, 1.388], [361.433, 701.65, 1.375], [295.925, 719.625, 1.188]]\nC: [[330.789, 641.074, 1.158], [330.789, 641.074, 1.212], [330.789, 641.074, 1.267], [330.789, 641.074, 1.322]]\nD: [[306.13, 719.717, 1.368], [345.491, 726.192, 0.98], [345.908, 519.046, 1.407], [318.14, 654.942, 1.306]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[283.606, 696.753, 1.329], [383.644, 532.364, 1.363], [332.149, 525.401, 1.473], [385.71, 734.583, 1.097]]\nB: [[383.086, 639.489, 1.063], [279.173, 585.298, 1.388], [361.433, 701.65, 1.375], [295.925, 719.625, 1.188]]\nC: [[330.789, 641.074, 1.158], [330.789, 641.074, 1.212], [330.789, 641.074, 1.267], [330.789, 641.074, 1.322]]\nD: [[306.13, 719.717, 1.368], [345.491, 726.192, 0.98], [345.908, 519.046, 1.407], [318.14, 654.942, 1.306]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_60_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_60_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_60_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_60_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_60_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_60_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_60_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_60_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[661.561, 1356.246, 0.07], [523.515, 1451.812, 0.242], [629.357, 1441.913, 0.46], [693.951, 1658.121, 0.495]]\nB: [[596.26, 1440.992, 0.068], [660.286, 1441.26, 0.267], [710.568, 1651.675, 0.41], [567.432, 1395.938, 0.5]]\nC: [[628.289, 1618.572, 0.075], 
[628.026, 1618.937, 0.252], [627.783, 1619.317, 0.43], [627.525, 1619.686, 0.607]]\nD: [[603.153, 1669.791, 0.074], [606.05, 1635.908, 0.212], [518.765, 1574.758, 0.36], [706.901, 1431.785, 0.612]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[661.561, 1356.246, 0.07], [523.515, 1451.812, 0.242], [629.357, 1441.913, 0.46], [693.951, 1658.121, 0.495]]\nB: [[596.26, 1440.992, 0.068], [660.286, 1441.26, 0.267], [710.568, 1651.675, 0.41], [567.432, 1395.938, 0.5]]\nC: [[628.289, 1618.572, 0.075], [628.026, 1618.937, 0.252], [627.783, 1619.317, 0.43], [627.525, 1619.686, 0.607]]\nD: [[603.153, 1669.791, 0.074], [606.05, 1635.908, 0.212], [518.765, 1574.758, 0.36], [706.901, 1431.785, 0.612]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_61_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_61_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_61_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_61_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_61_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_61_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_61_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_61_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[330.056, 1212.866, 0.652], [437.866, 986.828, 0.616], [373.47, 964.81, 0.657], [399.422, 1352.779, 0.635]]\nB: [[376.04, 1070.497, 0.645], [362.804, 1103.987, 0.583], [385.25, 1290.416, 0.524], [372.683, 1131.605, 0.686]]\nC: [[399.012, 1167.878, 0.547], [399.016, 1167.877, 0.567], [399.02, 1167.875, 0.588], [399.024, 1167.873, 0.609]]\nD: [[341.259, 1087.552, 0.588], [364.207, 1061.407, 0.495], [465.16, 1225.627, 0.505], [436.212, 1297.403, 0.597]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[330.056, 1212.866, 0.652], [437.866, 986.828, 0.616], [373.47, 964.81, 0.657], [399.422, 1352.779, 0.635]]\nB: [[376.04, 1070.497, 0.645], [362.804, 1103.987, 0.583], [385.25, 1290.416, 0.524], [372.683, 1131.605, 0.686]]\nC: [[399.012, 1167.878, 0.547], [399.016, 1167.877, 0.567], [399.02, 1167.875, 0.588], [399.024, 1167.873, 0.609]]\nD: [[341.259, 1087.552, 0.588], [364.207, 1061.407, 0.495], [465.16, 1225.627, 0.505], [436.212, 1297.403, 0.597]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_62_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_62_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_62_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_62_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_62_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_62_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_62_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_62_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1274.538, 1123.672, 0.2], [1346.842, 824.302, 0.2], [1292.774, 973.997, 0.26], [1054.281, 825.781, 0.247]]\nB: [[1328.749, 1065.512, 0.2], [1175.959, 833.053, 0.21], [1184.952, 824.911, 0.22], [1071.949, 1039.546, 0.188]]\nC: [[1098.549, 1020.422, 0.2], [1497.254, 1019.016, 0.22], [1382.864, 830.543, 0.21], [1100.12, 1050.184, 0.22]]\nD: [[1253.322, 1015.243, 0.2], [1253.424, 1015.978, 0.21], [1253.526, 1016.713, 0.22], [1253.637, 1017.522, 0.231]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1274.538, 1123.672, 0.2], [1346.842, 824.302, 0.2], [1292.774, 973.997, 0.26], [1054.281, 825.781, 0.247]]\nB: [[1328.749, 1065.512, 0.2], [1175.959, 833.053, 0.21], [1184.952, 824.911, 0.22], [1071.949, 1039.546, 0.188]]\nC: [[1098.549, 1020.422, 0.2], [1497.254, 1019.016, 0.22], [1382.864, 830.543, 0.21], [1100.12, 1050.184, 0.22]]\nD: [[1253.322, 1015.243, 0.2], [1253.424, 1015.978, 0.21], [1253.526, 1016.713, 0.22], [1253.637, 1017.522, 0.231]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_63_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_63_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_63_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_63_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_63_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_63_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_63_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_63_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1875.125, 875.415, 0.935], [1875.191, 875.416, 0.935], [1875.193, 875.318, 0.985], [1875.195, 875.264, 0.952]]\nB: [[2028.088, 883.086, 0.815], [1648.373, 903.844, 1.102], [1520.401, 917.022, 1.065], [1867.355, 949.61, 0.806]]\nC: [[1847.629, 
795.419, 1.055], [2059.556, 721.579, 0.94], [1862.226, 915.16, 1.105], [1795.239, 771.992, 0.777]]\nD: [[1908.891, 972.995, 0.987], [1782.46, 894.481, 0.828], [1850.807, 745.233, 1.094], [1559.966, 967.549, 0.89]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1875.125, 875.415, 0.935], [1875.191, 875.416, 0.935], [1875.193, 875.318, 0.985], [1875.195, 875.264, 0.952]]\nB: [[2028.088, 883.086, 0.815], [1648.373, 903.844, 1.102], [1520.401, 917.022, 1.065], [1867.355, 949.61, 0.806]]\nC: [[1847.629, 795.419, 1.055], [2059.556, 721.579, 0.94], [1862.226, 915.16, 1.105], [1795.239, 771.992, 0.777]]\nD: [[1908.891, 972.995, 0.987], [1782.46, 894.481, 0.828], [1850.807, 745.233, 1.094], [1559.966, 967.549, 0.89]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_64_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_64_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_64_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_64_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_64_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_64_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_64_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_64_7.png" + ], + "output": "A" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[582.71, 1411.821, -0.354], [557.0, 1488.19, -0.247], [651.447, 1741.537, -0.237], [520.268, 1329.718, -0.188]]\nB: [[635.447, 1620.546, -0.326], [637.445, 1618.566, -0.238], [639.933, 1616.457, -0.267], [642.736, 1614.065, -0.196]]\nC: [[522.996, 1413.245, -0.379], [659.983, 1928.523, -0.204], [766.979, 1315.798, -0.304], [599.616, 1825.248, -0.224]]\nD: [[534.996, 1707.261, -0.292], [614.808, 1704.145, -0.211], [523.563, 1883.81, -0.302], [672.146, 1371.116, -0.218]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[582.71, 1411.821, -0.354], [557.0, 1488.19, -0.247], [651.447, 1741.537, -0.237], [520.268, 1329.718, -0.188]]\nB: [[635.447, 1620.546, -0.326], [637.445, 1618.566, -0.238], [639.933, 1616.457, -0.267], [642.736, 1614.065, -0.196]]\nC: [[522.996, 1413.245, -0.379], [659.983, 1928.523, -0.204], [766.979, 1315.798, -0.304], [599.616, 1825.248, -0.224]]\nD: [[534.996, 1707.261, -0.292], [614.808, 1704.145, -0.211], [523.563, 1883.81, -0.302], [672.146, 1371.116, -0.218]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_65_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_65_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_65_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_65_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_65_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_65_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_65_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_65_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1680.561, 2863.302, 2.542], [2169.563, 2794.924, 2.265], [1707.842, 2939.301, 2.152], [1632.945, 2507.032, 1.945]]\nB: [[1547.261, 2391.341, 2.475], [1833.795, 2236.842, 2.065], [1997.767, 2445.687, 1.715], [2056.371, 2356.627, 2.0]]\nC: [[1904.106, 2453.654, 2.215], [1897.838, 2460.219, 2.156], [1892.616, 2465.688, 2.107], [1887.387, 2471.164, 2.057]]\nD: [[2182.661, 2279.405, 1.852], [1930.577, 2763.231, 2.146], [1909.158, 2677.265, 2.297], [1596.829, 2331.093, 2.251]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB 
images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1680.561, 2863.302, 2.542], [2169.563, 2794.924, 2.265], [1707.842, 2939.301, 2.152], [1632.945, 2507.032, 1.945]]\nB: [[1547.261, 2391.341, 2.475], [1833.795, 2236.842, 2.065], [1997.767, 2445.687, 1.715], [2056.371, 2356.627, 2.0]]\nC: [[1904.106, 2453.654, 2.215], [1897.838, 2460.219, 2.156], [1892.616, 2465.688, 2.107], [1887.387, 2471.164, 2.057]]\nD: [[2182.661, 2279.405, 1.852], [1930.577, 2763.231, 2.146], [1909.158, 2677.265, 2.297], [1596.829, 2331.093, 2.251]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_66_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_66_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_66_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_66_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_66_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_66_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_66_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_66_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[325.896, 989.839, 0.854], [326.517, 1140.353, 0.792], [331.478, 1339.384, 0.825], [382.05, 1348.154, 0.905]]\nB: [[393.357, 1149.173, 0.741], [392.945, 1148.426, 0.766], [392.836, 1148.208, 0.791], [392.641, 1147.242, 
0.816]]\nC: [[349.533, 1155.715, 0.667], [378.661, 1084.815, 0.825], [431.355, 1125.036, 0.69], [366.861, 940.522, 0.728]]\nD: [[370.793, 1354.659, 0.611], [315.047, 1147.297, 0.791], [387.351, 947.719, 0.922], [465.223, 1022.515, 0.919]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[325.896, 989.839, 0.854], [326.517, 1140.353, 0.792], [331.478, 1339.384, 0.825], [382.05, 1348.154, 0.905]]\nB: [[393.357, 1149.173, 0.741], [392.945, 1148.426, 0.766], [392.836, 1148.208, 0.791], [392.641, 1147.242, 0.816]]\nC: [[349.533, 1155.715, 0.667], [378.661, 1084.815, 0.825], [431.355, 1125.036, 0.69], [366.861, 940.522, 0.728]]\nD: [[370.793, 1354.659, 0.611], [315.047, 1147.297, 0.791], [387.351, 947.719, 0.922], [465.223, 1022.515, 0.919]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_67_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_67_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_67_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_67_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_67_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_67_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_67_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_67_7.png" + ], + "output": "B" + }, + { + 
"task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[2150.649, 802.858, 0.294], [2285.669, 868.672, 0.285], [1699.227, 714.027, 0.212], [1879.379, 823.249, 0.262]]\nB: [[2057.083, 982.769, 0.286], [2194.778, 989.034, 0.224], [1802.969, 943.191, 0.277], [2004.998, 886.268, 0.304]]\nC: [[1585.591, 914.605, 0.27], [1552.179, 1019.735, 0.316], [1997.522, 917.351, 0.271], [2167.86, 906.565, 0.376]]\nD: [[1926.398, 878.499, 0.267], [1926.397, 878.517, 0.277], [1926.355, 878.551, 0.259], [1926.373, 878.505, 0.317]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[2150.649, 802.858, 0.294], [2285.669, 868.672, 0.285], [1699.227, 714.027, 0.212], [1879.379, 823.249, 0.262]]\nB: [[2057.083, 982.769, 0.286], [2194.778, 989.034, 0.224], [1802.969, 943.191, 0.277], [2004.998, 886.268, 0.304]]\nC: [[1585.591, 914.605, 0.27], [1552.179, 1019.735, 0.316], [1997.522, 917.351, 0.271], [2167.86, 906.565, 0.376]]\nD: [[1926.398, 878.499, 0.267], [1926.397, 878.517, 0.277], [1926.355, 878.551, 0.259], [1926.373, 878.505, 0.317]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_68_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_68_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_68_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_68_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_68_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_68_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_68_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_68_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[510.539, 1910.445, 0.57], [644.708, 1390.766, 0.487], [528.12, 1465.53, 0.743], [684.489, 1755.997, 0.708]]\nB: [[618.712, 1315.402, 0.58], [707.938, 1490.825, 0.655], [678.962, 1715.92, 0.704], [561.864, 1423.394, 0.869]]\nC: [[644.567, 1930.873, 0.501], [662.364, 1327.012, 0.538], [649.501, 1573.33, 0.633], [563.587, 1732.779, 0.975]]\nD: [[612.719, 1632.142, 0.491], [612.166, 1632.636, 0.566], [611.613, 1633.13, 0.641], [611.127, 1633.567, 0.816]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[510.539, 1910.445, 0.57], [644.708, 1390.766, 0.487], [528.12, 1465.53, 0.743], [684.489, 1755.997, 0.708]]\nB: [[618.712, 1315.402, 0.58], [707.938, 1490.825, 0.655], [678.962, 1715.92, 0.704], [561.864, 1423.394, 0.869]]\nC: [[644.567, 1930.873, 0.501], [662.364, 1327.012, 0.538], [649.501, 1573.33, 0.633], [563.587, 1732.779, 0.975]]\nD: [[612.719, 1632.142, 0.491], [612.166, 1632.636, 0.566], [611.613, 1633.13, 0.641], [611.127, 1633.567, 0.816]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_69_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_69_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_69_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_69_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_69_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_69_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_69_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_69_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[439.78, 1227.349, 1.05], [426.44, 1268.51, 1.02], [325.982, 1240.255, 1.035], [445.556, 1380.946, 1.053]]\nB: [[383.14, 979.995, 1.108], [307.405, 1164.897, 0.89], [394.307, 1087.049, 1.131], [410.908, 1074.81, 0.998]]\nC: [[310.31, 1005.482, 
1.062], [353.18, 1020.342, 0.94], [407.431, 1247.448, 1.209], [366.856, 956.543, 1.01]]\nD: [[376.13, 1158.507, 0.938], [376.399, 1159.165, 0.98], [376.667, 1159.822, 1.022], [376.878, 1160.357, 1.013]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[439.78, 1227.349, 1.05], [426.44, 1268.51, 1.02], [325.982, 1240.255, 1.035], [445.556, 1380.946, 1.053]]\nB: [[383.14, 979.995, 1.108], [307.405, 1164.897, 0.89], [394.307, 1087.049, 1.131], [410.908, 1074.81, 0.998]]\nC: [[310.31, 1005.482, 1.062], [353.18, 1020.342, 0.94], [407.431, 1247.448, 1.209], [366.856, 956.543, 1.01]]\nD: [[376.13, 1158.507, 0.938], [376.399, 1159.165, 0.98], [376.667, 1159.822, 1.022], [376.878, 1160.357, 1.013]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_70_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_70_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_70_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_70_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_70_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_70_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_70_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_70_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1700.614, 979.18, 0.431], [1688.381, 1025.064, 0.508], [2269.256, 744.117, 0.545], [2084.454, 772.655, 0.492]]\nB: [[2210.59, 1003.05, 0.548], [1727.203, 861.604, 0.538], [1904.192, 830.147, 0.392], [1890.542, 842.708, 0.427]]\nC: [[1895.763, 879.04, 0.501], [1895.752, 879.076, 0.488], [1895.741, 879.112, 0.476], [1895.739, 879.116, 0.464]]\nD: [[1616.91, 819.62, 0.433], [2209.262, 739.792, 0.446], [2172.452, 852.21, 0.474], [2247.291, 966.129, 0.485]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1700.614, 979.18, 0.431], [1688.381, 1025.064, 0.508], [2269.256, 744.117, 0.545], [2084.454, 772.655, 0.492]]\nB: [[2210.59, 1003.05, 0.548], [1727.203, 861.604, 0.538], [1904.192, 830.147, 0.392], [1890.542, 842.708, 0.427]]\nC: [[1895.763, 879.04, 0.501], [1895.752, 879.076, 0.488], [1895.741, 879.112, 0.476], [1895.739, 879.116, 0.464]]\nD: [[1616.91, 819.62, 0.433], [2209.262, 739.792, 0.446], [2172.452, 852.21, 0.474], [2247.291, 966.129, 0.485]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_71_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_71_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_71_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_71_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_71_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_71_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_71_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_71_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[492.434, 1780.838, 1.16], [577.631, 1809.358, 1.725], [545.194, 1817.556, 1.603], [496.388, 1841.178, 2.172]]\nB: [[528.48, 1765.528, 1.61], [545.13, 1628.646, 1.326], [540.257, 1713.355, 1.75], [658.49, 1577.115, 2.147]]\nC: [[560.672, 1608.19, 1.54], [649.701, 1442.289, 1.318], [663.362, 1707.871, 2.131], [530.811, 1383.352, 2.114]]\nD: [[582.374, 1660.997, 1.38], [577.424, 1663.687, 1.585], [572.406, 1666.247, 1.789], [567.347, 1668.872, 2.039]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[492.434, 1780.838, 1.16], [577.631, 1809.358, 1.725], [545.194, 1817.556, 1.603], [496.388, 1841.178, 2.172]]\nB: [[528.48, 1765.528, 1.61], [545.13, 1628.646, 1.326], [540.257, 1713.355, 1.75], [658.49, 1577.115, 2.147]]\nC: [[560.672, 1608.19, 1.54], [649.701, 1442.289, 1.318], [663.362, 1707.871, 2.131], [530.811, 1383.352, 2.114]]\nD: [[582.374, 1660.997, 1.38], [577.424, 1663.687, 1.585], [572.406, 1666.247, 1.789], [567.347, 1668.872, 2.039]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_72_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_72_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_72_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_72_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_72_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_72_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_72_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_72_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[752.506, 1329.917, 0.448], [704.23, 1387.5, 0.595], [747.496, 1349.796, 0.742], [584.265, 1616.56, 0.8]]\nB: [[803.078, 1389.349, 0.388], [620.34, 1764.02, 0.522], [772.362, 1430.724, 0.537], [790.895, 1648.104, 0.8]]\nC: [[607.176, 1721.605, 
0.459], [602.85, 1587.2, 0.548], [708.169, 1620.023, 0.647], [790.325, 1466.233, 0.7]]\nD: [[672.574, 1595.791, 0.388], [670.89, 1597.24, 0.625], [669.207, 1598.689, 0.663], [667.523, 1600.138, 0.7]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[752.506, 1329.917, 0.448], [704.23, 1387.5, 0.595], [747.496, 1349.796, 0.742], [584.265, 1616.56, 0.8]]\nB: [[803.078, 1389.349, 0.388], [620.34, 1764.02, 0.522], [772.362, 1430.724, 0.537], [790.895, 1648.104, 0.8]]\nC: [[607.176, 1721.605, 0.459], [602.85, 1587.2, 0.548], [708.169, 1620.023, 0.647], [790.325, 1466.233, 0.7]]\nD: [[672.574, 1595.791, 0.388], [670.89, 1597.24, 0.625], [669.207, 1598.689, 0.663], [667.523, 1600.138, 0.7]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_73_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_73_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_73_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_73_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_73_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_73_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_73_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_73_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[430.291, 1175.505, 0.588], [429.167, 1272.84, 0.836], [415.87, 1257.143, 0.882], [372.618, 1001.865, 0.689]]\nB: [[410.066, 1196.767, 0.656], [410.072, 1196.78, 0.706], [410.08, 1196.795, 0.756], [410.101, 1196.811, 0.756]]\nC: [[386.139, 1116.452, 0.72], [464.376, 1221.72, 0.778], [364.35, 1233.741, 0.755], [330.159, 1270.327, 0.687]]\nD: [[409.837, 984.781, 0.668], [440.225, 1048.51, 0.572], [446.2, 1257.02, 0.834], [482.338, 985.937, 0.624]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[430.291, 1175.505, 0.588], [429.167, 1272.84, 0.836], [415.87, 1257.143, 0.882], [372.618, 1001.865, 0.689]]\nB: [[410.066, 1196.767, 0.656], [410.072, 1196.78, 0.706], [410.08, 1196.795, 0.756], [410.101, 1196.811, 0.756]]\nC: [[386.139, 1116.452, 0.72], [464.376, 1221.72, 0.778], [364.35, 1233.741, 0.755], [330.159, 1270.327, 0.687]]\nD: [[409.837, 984.781, 0.668], [440.225, 1048.51, 0.572], [446.2, 1257.02, 0.834], [482.338, 985.937, 0.624]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_74_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_74_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_74_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_74_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_74_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_74_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_74_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_74_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[392.514, 1072.298, 0.999], [329.808, 1223.789, 1.084], [463.813, 1302.961, 0.914], [451.609, 1167.4, 1.154]]\nB: [[401.879, 1242.697, 0.728], [341.309, 990.577, 1.02], [450.947, 906.714, 1.117], [328.165, 960.327, 1.095]]\nC: [[342.127, 1107.321, 0.98], [447.285, 926.593, 0.964], [394.127, 898.801, 1.186], [403.682, 1324.015, 1.131]]\nD: [[391.204, 1112.576, 0.863], [391.204, 1112.576, 0.913], [391.208, 1112.586, 1.013], [391.212, 1112.595, 0.993]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[392.514, 1072.298, 0.999], [329.808, 1223.789, 1.084], [463.813, 1302.961, 0.914], [451.609, 1167.4, 1.154]]\nB: [[401.879, 1242.697, 0.728], [341.309, 990.577, 1.02], [450.947, 906.714, 1.117], [328.165, 960.327, 1.095]]\nC: [[342.127, 1107.321, 0.98], [447.285, 926.593, 0.964], [394.127, 898.801, 1.186], [403.682, 1324.015, 1.131]]\nD: [[391.204, 1112.576, 0.863], [391.204, 1112.576, 0.913], [391.208, 1112.586, 1.013], [391.212, 1112.595, 0.993]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_75_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_75_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_75_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_75_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_75_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_75_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_75_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_75_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1321.516, 1033.801, 1.008], [1321.517, 1033.801, 1.008], [1321.517, 1033.8, 1.008], [1321.518, 1033.8, 1.008]]\nB: [[1066.616, 1131.355, 0.921], [1219.6, 1098.492, 0.864], [1282.161, 961.0, 1.081], [1197.931, 1177.0, 1.016]]\nC: [[1190.352, 917.033, 
1.028], [1155.143, 1161.133, 1.153], [1394.211, 959.1, 0.834], [1188.323, 1016.1, 1.08]]\nD: [[1067.426, 888.354, 0.843], [1441.448, 1176.105, 0.843], [1087.955, 967.6, 0.966], [1191.947, 906.7, 0.967]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1321.516, 1033.801, 1.008], [1321.517, 1033.801, 1.008], [1321.517, 1033.8, 1.008], [1321.518, 1033.8, 1.008]]\nB: [[1066.616, 1131.355, 0.921], [1219.6, 1098.492, 0.864], [1282.161, 961.0, 1.081], [1197.931, 1177.0, 1.016]]\nC: [[1190.352, 917.033, 1.028], [1155.143, 1161.133, 1.153], [1394.211, 959.1, 0.834], [1188.323, 1016.1, 1.08]]\nD: [[1067.426, 888.354, 0.843], [1441.448, 1176.105, 0.843], [1087.955, 967.6, 0.966], [1191.947, 906.7, 0.967]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_76_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_76_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_76_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_76_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_76_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_76_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_76_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_76_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[438.849, 951.711, 0.795], [314.542, 1143.97, 0.648], [362.865, 1209.907, 0.806], [438.018, 933.567, 0.84]]\nB: [[388.688, 1111.433, 0.677], [388.691, 1111.43, 0.695], [388.695, 1111.428, 0.713], [388.698, 1111.426, 0.716]]\nC: [[316.274, 909.79, 0.674], [451.235, 958.84, 0.605], [365.204, 1239.893, 0.608], [426.654, 1268.736, 0.816]]\nD: [[452.526, 1172.998, 0.585], [454.014, 1000.44, 0.724], [336.213, 1132.703, 0.811], [313.791, 1218.829, 0.612]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[438.849, 951.711, 0.795], [314.542, 1143.97, 0.648], [362.865, 1209.907, 0.806], [438.018, 933.567, 0.84]]\nB: [[388.688, 1111.433, 0.677], [388.691, 1111.43, 0.695], [388.695, 1111.428, 0.713], [388.698, 1111.426, 0.716]]\nC: [[316.274, 909.79, 0.674], [451.235, 958.84, 0.605], [365.204, 1239.893, 0.608], [426.654, 1268.736, 0.816]]\nD: [[452.526, 1172.998, 0.585], [454.014, 1000.44, 0.724], [336.213, 1132.703, 0.811], [313.791, 1218.829, 0.612]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_77_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_77_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_77_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_77_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_77_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_77_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_77_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_77_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[625.166, 1621.071, -0.136], [624.733, 1621.463, -0.074], [624.328, 1621.884, -0.011], [624.034, 1622.205, 0.176]]\nB: [[673.606, 1641.602, -0.151], [650.83, 1385.101, -0.066], [545.785, 1758.678, -0.013], [623.635, 1668.753, 0.151]]\nC: [[612.227, 1304.459, -0.119], [594.602, 1728.678, -0.08], [714.884, 1584.229, -0.012], [716.369, 1325.064, 0.2]]\nD: [[677.007, 1319.892, -0.145], [590.033, 1617.266, -0.079], [508.485, 1809.42, -0.012], [584.009, 1851.902, 0.196]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[625.166, 1621.071, -0.136], [624.733, 1621.463, -0.074], [624.328, 1621.884, -0.011], [624.034, 1622.205, 0.176]]\nB: [[673.606, 1641.602, -0.151], [650.83, 1385.101, -0.066], [545.785, 1758.678, -0.013], [623.635, 1668.753, 0.151]]\nC: [[612.227, 1304.459, -0.119], [594.602, 1728.678, -0.08], [714.884, 1584.229, -0.012], [716.369, 1325.064, 0.2]]\nD: [[677.007, 1319.892, -0.145], [590.033, 1617.266, -0.079], [508.485, 1809.42, -0.012], [584.009, 1851.902, 0.196]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_78_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_78_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_78_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_78_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_78_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_78_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_78_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_78_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[500.254, 1116.383, 0.109], [528.871, 1208.17, 0.09], [370.111, 1189.599, 0.128], [490.724, 1065.954, 0.15]]\nB: [[424.937, 1288.953, 0.124], [494.735, 1135.49, 0.095], [377.247, 1056.094, 0.129], [454.303, 1168.649, 0.16]]\nC: 
[[445.198, 1091.608, 0.107], [445.269, 1091.74, 0.084], [445.269, 1091.738, 0.117], [445.269, 1091.735, 0.15]]\nD: [[518.8, 1113.376, 0.123], [424.179, 929.73, 0.097], [415.562, 1089.363, 0.113], [387.889, 1032.784, 0.17]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[500.254, 1116.383, 0.109], [528.871, 1208.17, 0.09], [370.111, 1189.599, 0.128], [490.724, 1065.954, 0.15]]\nB: [[424.937, 1288.953, 0.124], [494.735, 1135.49, 0.095], [377.247, 1056.094, 0.129], [454.303, 1168.649, 0.16]]\nC: [[445.198, 1091.608, 0.107], [445.269, 1091.74, 0.084], [445.269, 1091.738, 0.117], [445.269, 1091.735, 0.15]]\nD: [[518.8, 1113.376, 0.123], [424.179, 929.73, 0.097], [415.562, 1089.363, 0.113], [387.889, 1032.784, 0.17]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_79_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_79_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_79_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_79_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_79_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_79_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_79_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_79_7.png" + ], + "output": "C" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1484.746, 911.994, 1.171], [1137.15, 1003.822, 1.118], [1492.05, 930.243, 1.121], [1124.198, 1141.212, 1.261]]\nB: [[1219.797, 981.822, 1.234], [1419.86, 1093.926, 0.969], [1395.832, 917.571, 1.104], [1330.08, 1062.03, 1.216]]\nC: [[1453.95, 988.704, 0.898], [1551.29, 1210.843, 1.28], [1428.034, 1104.909, 1.233], [1371.047, 908.624, 1.137]]\nD: [[1328.982, 1049.561, 1.089], [1328.99, 1049.562, 1.089], [1328.997, 1049.563, 1.089], [1329.005, 1049.565, 1.089]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1484.746, 911.994, 1.171], [1137.15, 1003.822, 1.118], [1492.05, 930.243, 1.121], [1124.198, 1141.212, 1.261]]\nB: [[1219.797, 981.822, 1.234], [1419.86, 1093.926, 0.969], [1395.832, 917.571, 1.104], [1330.08, 1062.03, 1.216]]\nC: [[1453.95, 988.704, 0.898], [1551.29, 1210.843, 1.28], [1428.034, 1104.909, 1.233], [1371.047, 908.624, 1.137]]\nD: [[1328.982, 1049.561, 1.089], [1328.99, 1049.562, 1.089], [1328.997, 1049.563, 1.089], [1329.005, 1049.565, 1.089]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_80_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_80_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_80_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_80_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_80_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_80_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_80_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_80_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[456.432, 1282.157, 0.891], [479.945, 1042.78, 0.822], [407.218, 1084.054, 0.7], [446.375, 997.916, 0.683]]\nB: [[462.967, 894.315, 0.986], [392.745, 966.59, 0.805], [391.102, 1018.399, 0.622], [493.885, 1286.081, 0.965]]\nC: [[466.287, 926.158, 0.882], [352.099, 1212.35, 0.658], [429.631, 1077.672, 0.822], [411.455, 1150.981, 0.801]]\nD: [[430.242, 1089.779, 1.026], [430.279, 1089.87, 0.776], [430.299, 1089.898, 0.776], [430.321, 1089.952, 0.817]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[456.432, 1282.157, 0.891], [479.945, 1042.78, 0.822], [407.218, 1084.054, 0.7], [446.375, 997.916, 0.683]]\nB: [[462.967, 894.315, 0.986], [392.745, 966.59, 0.805], [391.102, 1018.399, 0.622], [493.885, 1286.081, 0.965]]\nC: [[466.287, 926.158, 0.882], [352.099, 1212.35, 0.658], [429.631, 1077.672, 0.822], [411.455, 1150.981, 0.801]]\nD: [[430.242, 1089.779, 1.026], [430.279, 1089.87, 0.776], [430.299, 1089.898, 0.776], [430.321, 1089.952, 0.817]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_81_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_81_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_81_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_81_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_81_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_81_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_81_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_81_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1945.526, 876.296, 0.419], [1945.526, 876.242, 0.469], [1945.526, 876.177, 0.469], [1945.526, 876.26, 0.519]]\nB: [[2158.266, 800.911, 0.453], [2106.839, 1043.586, 0.501], [2049.112, 832.682, 0.434], [2030.483, 957.49, 0.61]]\nC: [[2028.562, 929.081, 
0.457], [1728.295, 771.666, 0.406], [2125.198, 983.306, 0.535], [2151.856, 925.1, 0.483]]\nD: [[2333.669, 1007.109, 0.449], [1683.52, 730.695, 0.511], [2240.73, 776.757, 0.511], [1717.598, 731.99, 0.548]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1945.526, 876.296, 0.419], [1945.526, 876.242, 0.469], [1945.526, 876.177, 0.469], [1945.526, 876.26, 0.519]]\nB: [[2158.266, 800.911, 0.453], [2106.839, 1043.586, 0.501], [2049.112, 832.682, 0.434], [2030.483, 957.49, 0.61]]\nC: [[2028.562, 929.081, 0.457], [1728.295, 771.666, 0.406], [2125.198, 983.306, 0.535], [2151.856, 925.1, 0.483]]\nD: [[2333.669, 1007.109, 0.449], [1683.52, 730.695, 0.511], [2240.73, 776.757, 0.511], [1717.598, 731.99, 0.548]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_82_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_82_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_82_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_82_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_82_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_82_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_82_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_82_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1937.491, 914.639, 0.22], [1901.969, 968.51, 0.273], [1716.987, 958.003, 0.262], [1808.463, 962.345, 0.288]]\nB: [[2007.196, 817.175, 0.254], [1773.432, 871.003, 0.284], [1939.918, 1002.574, 0.304], [2252.629, 811.34, 0.262]]\nC: [[2285.927, 936.67, 0.263], [1604.253, 825.974, 0.25], [2118.153, 905.274, 0.26], [1884.079, 918.838, 0.296]]\nD: [[1926.631, 877.571, 0.228], [1926.631, 877.571, 0.252], [1926.626, 877.593, 0.255], [1926.638, 877.538, 0.303]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1937.491, 914.639, 0.22], [1901.969, 968.51, 0.273], [1716.987, 958.003, 0.262], [1808.463, 962.345, 0.288]]\nB: [[2007.196, 817.175, 0.254], [1773.432, 871.003, 0.284], [1939.918, 1002.574, 0.304], [2252.629, 811.34, 0.262]]\nC: [[2285.927, 936.67, 0.263], [1604.253, 825.974, 0.25], [2118.153, 905.274, 0.26], [1884.079, 918.838, 0.296]]\nD: [[1926.631, 877.571, 0.228], [1926.631, 877.571, 0.252], [1926.626, 877.593, 0.255], [1926.638, 877.538, 0.303]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_83_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_83_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_83_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_83_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_83_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_83_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_83_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_83_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[680.728, 1749.077, -0.375], [745.701, 1474.694, -0.299], [610.56, 1380.492, -0.27], [635.92, 1915.689, -0.136]]\nB: [[660.851, 1604.404, -0.423], [657.771, 1607.079, -0.332], [654.69, 1609.754, -0.24], [651.61, 1612.428, -0.148]]\nC: [[647.562, 1445.984, -0.429], [659.321, 1909.729, -0.283], [754.69, 1382.093, -0.26], [549.55, 1888.817, -0.122]]\nD: [[751.514, 1476.27, -0.49], [654.016, 1488.662, -0.332], [753.33, 1931.072, -0.2], [574.93, 1792.093, -0.171]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[680.728, 1749.077, -0.375], [745.701, 1474.694, -0.299], [610.56, 1380.492, -0.27], [635.92, 1915.689, -0.136]]\nB: [[660.851, 1604.404, -0.423], [657.771, 1607.079, -0.332], [654.69, 1609.754, -0.24], [651.61, 1612.428, -0.148]]\nC: [[647.562, 1445.984, -0.429], [659.321, 1909.729, -0.283], [754.69, 1382.093, -0.26], [549.55, 1888.817, -0.122]]\nD: [[751.514, 1476.27, -0.49], [654.016, 1488.662, -0.332], [753.33, 1931.072, -0.2], [574.93, 1792.093, -0.171]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_84_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_84_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_84_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_84_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_84_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_84_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_84_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_84_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[356.068, 1144.504, 0.82], [356.64, 1144.191, 0.795], [358.929, 1142.941, 0.822], [359.501, 1142.629, 0.839]]\nB: [[401.199, 1094.551, 0.83], [308.99, 1334.228, 0.943], [415.452, 921.574, 0.753], [392.805, 1225.338, 0.965]]\nC: [[395.4, 
1321.544, 0.95], [322.87, 1045.667, 0.91], [342.828, 1295.35, 0.695], [397.067, 940.796, 0.768]]\nD: [[418.406, 1138.796, 0.82], [416.34, 1311.233, 0.684], [355.451, 1305.707, 0.882], [410.239, 1120.033, 0.971]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[356.068, 1144.504, 0.82], [356.64, 1144.191, 0.795], [358.929, 1142.941, 0.822], [359.501, 1142.629, 0.839]]\nB: [[401.199, 1094.551, 0.83], [308.99, 1334.228, 0.943], [415.452, 921.574, 0.753], [392.805, 1225.338, 0.965]]\nC: [[395.4, 1321.544, 0.95], [322.87, 1045.667, 0.91], [342.828, 1295.35, 0.695], [397.067, 940.796, 0.768]]\nD: [[418.406, 1138.796, 0.82], [416.34, 1311.233, 0.684], [355.451, 1305.707, 0.882], [410.239, 1120.033, 0.971]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_85_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_85_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_85_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_85_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_85_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_85_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_85_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_85_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1753.666, 1018.404, 0.365], [1524.0, 883.794, 0.327], [2021.935, 879.24, 0.307], [1882.989, 791.594, 0.35]]\nB: [[2170.362, 809.726, 0.373], [2168.605, 703.253, 0.314], [1918.642, 995.58, 0.329], [1602.549, 910.935, 0.29]]\nC: [[2178.785, 988.248, 0.285], [2227.998, 705.37, 0.287], [1566.17, 877.23, 0.318], [1931.258, 826.324, 0.31]]\nD: [[1902.434, 878.055, 0.343], [1902.434, 878.055, 0.293], [1902.429, 878.07, 0.302], [1902.423, 878.086, 0.31]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1753.666, 1018.404, 0.365], [1524.0, 883.794, 0.327], [2021.935, 879.24, 0.307], [1882.989, 791.594, 0.35]]\nB: [[2170.362, 809.726, 0.373], [2168.605, 703.253, 0.314], [1918.642, 995.58, 0.329], [1602.549, 910.935, 0.29]]\nC: [[2178.785, 988.248, 0.285], [2227.998, 705.37, 0.287], [1566.17, 877.23, 0.318], [1931.258, 826.324, 0.31]]\nD: [[1902.434, 878.055, 0.343], [1902.434, 878.055, 0.293], [1902.429, 878.07, 0.302], [1902.423, 878.086, 0.31]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_86_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_86_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_86_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_86_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_86_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_86_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_86_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_86_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[378.922, 1013.673, 0.734], [494.488, 1078.797, 0.456], [393.735, 912.986, 0.581], [419.965, 958.358, 0.767]]\nB: [[433.059, 1088.732, 0.713], [433.043, 1088.668, 0.553], [433.039, 1088.652, 0.513], [433.055, 1088.681, 0.703]]\nC: [[426.779, 1210.184, 0.683], [374.724, 1199.914, 0.579], [356.893, 998.508, 0.494], [512.758, 1067.304, 0.691]]\nD: [[351.961, 935.844, 0.571], [386.254, 1200.68, 0.61], [480.449, 1191.815, 0.483], [412.037, 930.978, 0.833]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[378.922, 1013.673, 0.734], [494.488, 1078.797, 0.456], [393.735, 912.986, 0.581], [419.965, 958.358, 0.767]]\nB: [[433.059, 1088.732, 0.713], [433.043, 1088.668, 0.553], [433.039, 1088.652, 0.513], [433.055, 1088.681, 0.703]]\nC: [[426.779, 1210.184, 0.683], [374.724, 1199.914, 0.579], [356.893, 998.508, 0.494], [512.758, 1067.304, 0.691]]\nD: [[351.961, 935.844, 0.571], [386.254, 1200.68, 0.61], [480.449, 1191.815, 0.483], [412.037, 930.978, 0.833]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_87_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_87_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_87_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_87_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_87_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_87_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_87_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_87_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[2045.551, 727.654, 1.102], [1833.302, 769.345, 1.055], [1827.43, 963.776, 1.306], [1702.867, 738.416, 0.98]]\nB: [[1661.503, 737.862, 0.962], [1861.333, 965.185, 0.908], [1821.87, 992.03, 0.919], [2075.341, 874.239, 1.077]]\nC: [[1741.077, 
864.895, 1.109], [1745.181, 865.139, 1.105], [1748.91, 865.361, 1.102], [1752.336, 865.549, 1.096]]\nD: [[1791.757, 846.823, 0.925], [1982.741, 721.703, 1.059], [1406.89, 829.542, 1.078], [1577.41, 813.096, 1.053]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[2045.551, 727.654, 1.102], [1833.302, 769.345, 1.055], [1827.43, 963.776, 1.306], [1702.867, 738.416, 0.98]]\nB: [[1661.503, 737.862, 0.962], [1861.333, 965.185, 0.908], [1821.87, 992.03, 0.919], [2075.341, 874.239, 1.077]]\nC: [[1741.077, 864.895, 1.109], [1745.181, 865.139, 1.105], [1748.91, 865.361, 1.102], [1752.336, 865.549, 1.096]]\nD: [[1791.757, 846.823, 0.925], [1982.741, 721.703, 1.059], [1406.89, 829.542, 1.078], [1577.41, 813.096, 1.053]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_88_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_88_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_88_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_88_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_88_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_88_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_88_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_88_7.png" + ], + "output": "C" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[505.05, 877.226, 0.203], [445.953, 1244.021, 0.202], [392.706, 1145.406, 0.277], [366.753, 1183.669, 0.29]]\nB: [[457.03, 1257.26, 0.228], [463.22, 1274.147, 0.225], [370.296, 997.948, 0.259], [365.28, 1022.072, 0.29]]\nC: [[434.02, 1096.492, 0.241], [434.019, 1096.492, 0.222], [434.019, 1096.492, 0.236], [434.018, 1096.493, 0.25]]\nD: [[422.09, 929.091, 0.197], [407.921, 1195.795, 0.198], [469.259, 1267.695, 0.23], [354.798, 1155.602, 0.26]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[505.05, 877.226, 0.203], [445.953, 1244.021, 0.202], [392.706, 1145.406, 0.277], [366.753, 1183.669, 0.29]]\nB: [[457.03, 1257.26, 0.228], [463.22, 1274.147, 0.225], [370.296, 997.948, 0.259], [365.28, 1022.072, 0.29]]\nC: [[434.02, 1096.492, 0.241], [434.019, 1096.492, 0.222], [434.019, 1096.492, 0.236], [434.018, 1096.493, 0.25]]\nD: [[422.09, 929.091, 0.197], [407.921, 1195.795, 0.198], [469.259, 1267.695, 0.23], [354.798, 1155.602, 0.26]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_89_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_89_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_89_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_89_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_89_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_89_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_89_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_89_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[382.439, 944.308, 1.097], [282.072, 1346.475, 1.314], [317.768, 1056.456, 0.981], [328.142, 1067.671, 0.969]]\nB: [[348.689, 1130.152, 1.122], [348.689, 1130.152, 1.122], [348.689, 1130.152, 1.122], [348.689, 1130.152, 1.122]]\nC: [[290.351, 1111.854, 1.019], [383.245, 975.676, 1.11], [292.501, 1319.267, 0.953], [293.662, 1130.698, 0.975]]\nD: [[344.597, 1317.41, 1.004], [391.599, 1063.24, 1.128], [415.864, 1014.121, 0.9], [383.217, 1223.267, 1.321]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[382.439, 944.308, 1.097], [282.072, 1346.475, 1.314], [317.768, 1056.456, 0.981], [328.142, 1067.671, 0.969]]\nB: [[348.689, 1130.152, 1.122], [348.689, 1130.152, 1.122], [348.689, 1130.152, 1.122], [348.689, 1130.152, 1.122]]\nC: [[290.351, 1111.854, 1.019], [383.245, 975.676, 1.11], [292.501, 1319.267, 0.953], [293.662, 1130.698, 0.975]]\nD: [[344.597, 1317.41, 1.004], [391.599, 1063.24, 1.128], [415.864, 1014.121, 0.9], [383.217, 1223.267, 1.321]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_90_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_90_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_90_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_90_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_90_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_90_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_90_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_90_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[445.635, 1070.359, 0.563], [380.207, 1209.252, 0.55], [503.803, 1202.552, 0.618], [402.841, 878.242, 0.916]]\nB: [[355.09, 926.566, 0.532], [425.031, 931.852, 0.428], [396.378, 1283.26, 0.637], [419.782, 1021.681, 0.797]]\nC: [[353.198, 1052.673, 
0.552], [387.581, 1075.215, 0.55], [453.134, 889.143, 0.766], [432.989, 976.738, 0.712]]\nD: [[435.434, 1087.782, 0.612], [435.403, 1087.706, 0.533], [435.405, 1087.711, 0.695], [435.407, 1087.716, 0.846]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[445.635, 1070.359, 0.563], [380.207, 1209.252, 0.55], [503.803, 1202.552, 0.618], [402.841, 878.242, 0.916]]\nB: [[355.09, 926.566, 0.532], [425.031, 931.852, 0.428], [396.378, 1283.26, 0.637], [419.782, 1021.681, 0.797]]\nC: [[353.198, 1052.673, 0.552], [387.581, 1075.215, 0.55], [453.134, 889.143, 0.766], [432.989, 976.738, 0.712]]\nD: [[435.434, 1087.782, 0.612], [435.403, 1087.706, 0.533], [435.405, 1087.711, 0.695], [435.407, 1087.716, 0.846]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_91_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_91_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_91_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_91_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_91_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_91_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_91_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_91_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1858.977, 3005.32, 0.085], [1613.167, 2545.921, 0.166], [1534.084, 2595.697, 0.482], [1872.638, 2228.595, 0.519]]\nB: [[1516.054, 2338.945, 0.097], [1891.297, 2428.151, 0.236], [1796.827, 2149.677, 0.559], [1658.932, 2766.556, 0.382]]\nC: [[2005.599, 2965.349, 0.12], [1626.186, 2645.937, 0.181], [1937.717, 2253.069, 0.541], [1779.108, 2893.005, 0.435]]\nD: [[1824.199, 2571.318, 0.101], [1824.516, 2570.899, 0.205], [1825.469, 2569.639, 0.518], [1825.787, 2569.219, 0.434]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1858.977, 3005.32, 0.085], [1613.167, 2545.921, 0.166], [1534.084, 2595.697, 0.482], [1872.638, 2228.595, 0.519]]\nB: [[1516.054, 2338.945, 0.097], [1891.297, 2428.151, 0.236], [1796.827, 2149.677, 0.559], [1658.932, 2766.556, 0.382]]\nC: [[2005.599, 2965.349, 0.12], [1626.186, 2645.937, 0.181], [1937.717, 2253.069, 0.541], [1779.108, 2893.005, 0.435]]\nD: [[1824.199, 2571.318, 0.101], [1824.516, 2570.899, 0.205], [1825.469, 2569.639, 0.518], [1825.787, 2569.219, 0.434]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_92_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_92_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_92_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_92_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_92_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_92_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_92_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_92_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[598.103, 1642.075, 1.029], [598.744, 1641.829, 1.029], [599.384, 1641.583, 1.029], [600.026, 1641.338, 1.179]]\nB: [[701.368, 1416.38, 0.896], [530.801, 1778.726, 1.056], [579.309, 1558.364, 0.977], [683.542, 1838.774, 1.325]]\nC: [[715.233, 1896.029, 0.968], [530.58, 1520.538, 0.944], [596.209, 1472.502, 0.856], [536.626, 1453.346, 1.289]]\nD: [[626.221, 1751.17, 1.049], [568.701, 1547.296, 1.076], [640.532, 1458.354, 1.122], [626.284, 1959.943, 1.094]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[598.103, 1642.075, 1.029], [598.744, 1641.829, 1.029], [599.384, 1641.583, 1.029], [600.026, 1641.338, 1.179]]\nB: [[701.368, 1416.38, 0.896], [530.801, 1778.726, 1.056], [579.309, 1558.364, 0.977], [683.542, 1838.774, 1.325]]\nC: [[715.233, 1896.029, 0.968], [530.58, 1520.538, 0.944], [596.209, 1472.502, 0.856], [536.626, 1453.346, 1.289]]\nD: [[626.221, 1751.17, 1.049], [568.701, 1547.296, 1.076], [640.532, 1458.354, 1.122], [626.284, 1959.943, 1.094]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_93_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_93_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_93_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_93_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_93_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_93_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_93_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_93_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[413.954, 1081.236, 0.717], [451.481, 1023.032, 0.495], [397.92, 951.148, 0.604], [328.441, 1181.445, 0.598]]\nB: [[457.154, 1119.531, 0.668], [402.874, 923.594, 0.435], [447.684, 1012.752, 0.547], [341.799, 1237.225, 0.728]]\nC: [[354.36, 
1174.911, 0.52], [371.321, 1042.76, 0.471], [448.856, 1142.068, 0.628], [427.925, 1261.83, 0.519]]\nD: [[389.399, 1112.311, 0.629], [389.356, 1112.334, 0.529], [389.379, 1112.321, 0.579], [389.403, 1112.309, 0.629]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[413.954, 1081.236, 0.717], [451.481, 1023.032, 0.495], [397.92, 951.148, 0.604], [328.441, 1181.445, 0.598]]\nB: [[457.154, 1119.531, 0.668], [402.874, 923.594, 0.435], [447.684, 1012.752, 0.547], [341.799, 1237.225, 0.728]]\nC: [[354.36, 1174.911, 0.52], [371.321, 1042.76, 0.471], [448.856, 1142.068, 0.628], [427.925, 1261.83, 0.519]]\nD: [[389.399, 1112.311, 0.629], [389.356, 1112.334, 0.529], [389.379, 1112.321, 0.579], [389.403, 1112.309, 0.629]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_94_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_94_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_94_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_94_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_94_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_94_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_94_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_94_7.png" + ], + "output": "D" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[611.892, 1806.578, 0.268], [743.567, 1822.952, 0.264], [555.945, 1619.632, 0.282], [736.456, 1610.791, 0.275]]\nB: [[647.522, 1603.835, 0.243], [647.522, 1603.835, 0.293], [647.522, 1603.835, 0.318], [647.522, 1603.835, 0.343]]\nC: [[613.269, 1753.144, 0.259], [701.513, 1670.735, 0.335], [698.245, 1622.138, 0.321], [547.954, 1706.296, 0.366]]\nD: [[518.452, 1481.659, 0.23], [579.958, 1410.188, 0.243], [523.377, 1912.789, 0.276], [661.654, 1701.9, 0.34]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[611.892, 1806.578, 0.268], [743.567, 1822.952, 0.264], [555.945, 1619.632, 0.282], [736.456, 1610.791, 0.275]]\nB: [[647.522, 1603.835, 0.243], [647.522, 1603.835, 0.293], [647.522, 1603.835, 0.318], [647.522, 1603.835, 0.343]]\nC: [[613.269, 1753.144, 0.259], [701.513, 1670.735, 0.335], [698.245, 1622.138, 0.321], [547.954, 1706.296, 0.366]]\nD: [[518.452, 1481.659, 0.23], [579.958, 1410.188, 0.243], [523.377, 1912.789, 0.276], [661.654, 1701.9, 0.34]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_95_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_95_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_95_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_95_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_95_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_95_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_95_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_95_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[631.815, 1636.973, 0.074], [631.58, 1636.905, 0.174], [631.313, 1636.901, 0.224], [631.183, 1636.842, 0.29]]\nB: [[559.113, 1943.842, 0.076], [518.03, 1864.19, 0.151], [546.229, 1683.354, 0.205], [539.475, 1389.243, 0.24]]\nC: [[689.175, 1624.485, 0.066], [688.09, 1571.158, 0.178], [563.905, 1790.085, 0.19], [581.151, 1421.06, 0.29]]\nD: [[705.525, 1667.251, 0.065], [604.36, 1921.35, 0.181], [722.147, 1476.341, 0.225], [572.745, 1584.256, 0.3]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[631.815, 1636.973, 0.074], [631.58, 1636.905, 0.174], [631.313, 1636.901, 0.224], [631.183, 1636.842, 0.29]]\nB: [[559.113, 1943.842, 0.076], [518.03, 1864.19, 0.151], [546.229, 1683.354, 0.205], [539.475, 1389.243, 0.24]]\nC: [[689.175, 1624.485, 0.066], [688.09, 1571.158, 0.178], [563.905, 1790.085, 0.19], [581.151, 1421.06, 0.29]]\nD: [[705.525, 1667.251, 0.065], [604.36, 1921.35, 0.181], [722.147, 1476.341, 0.225], [572.745, 1584.256, 0.3]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_96_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_96_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_96_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_96_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_96_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_96_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_96_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_96_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1782.298, 752.391, 0.81], [2097.954, 882.859, 0.769], [1957.145, 911.959, 1.057], [1675.514, 745.849, 0.876]]\nB: [[1869.593, 872.653, 0.94], [1517.943, 982.023, 0.755], [2107.729, 753.584, 0.782], [1585.194, 934.333, 0.784]]\nC: [[1792.225, 846.971, 
0.887], [1792.225, 846.971, 0.887], [1792.225, 846.971, 0.887], [1792.225, 846.971, 0.887]]\nD: [[1767.979, 682.569, 0.962], [2081.075, 907.416, 0.768], [2002.738, 790.434, 0.955], [1720.834, 852.507, 1.032]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1782.298, 752.391, 0.81], [2097.954, 882.859, 0.769], [1957.145, 911.959, 1.057], [1675.514, 745.849, 0.876]]\nB: [[1869.593, 872.653, 0.94], [1517.943, 982.023, 0.755], [2107.729, 753.584, 0.782], [1585.194, 934.333, 0.784]]\nC: [[1792.225, 846.971, 0.887], [1792.225, 846.971, 0.887], [1792.225, 846.971, 0.887], [1792.225, 846.971, 0.887]]\nD: [[1767.979, 682.569, 0.962], [2081.075, 907.416, 0.768], [2002.738, 790.434, 0.955], [1720.834, 852.507, 1.032]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_97_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_97_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_97_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_97_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_97_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_97_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_97_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_97_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", 
+ "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[724.326, 1495.604, -0.327], [676.702, 1884.979, 0.087], [764.302, 1335.092, 0.381], [548.436, 1731.853, 0.371]]\nB: [[651.934, 1624.096, -0.297], [652.686, 1623.474, 0.103], [653.181, 1623.053, 0.328], [653.687, 1622.622, 0.353]]\nC: [[607.922, 1820.17, -0.295], [753.684, 1433.034, 0.106], [678.243, 1447.392, 0.296], [570.652, 1420.87, 0.385]]\nD: [[751.414, 1685.804, -0.244], [578.476, 1684.213, 0.096], [590.679, 1441.453, 0.28], [536.506, 1378.646, 0.32]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[724.326, 1495.604, -0.327], [676.702, 1884.979, 0.087], [764.302, 1335.092, 0.381], [548.436, 1731.853, 0.371]]\nB: [[651.934, 1624.096, -0.297], [652.686, 1623.474, 0.103], [653.181, 1623.053, 0.328], [653.687, 1622.622, 0.353]]\nC: [[607.922, 1820.17, -0.295], [753.684, 1433.034, 0.106], [678.243, 1447.392, 0.296], [570.652, 1420.87, 0.385]]\nD: [[751.414, 1685.804, -0.244], [578.476, 1684.213, 0.096], [590.679, 1441.453, 0.28], [536.506, 1378.646, 0.32]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_98_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_98_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_98_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_98_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_98_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_98_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_98_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_98_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1335.014, 862.316, 0.287], [1100.87, 1037.094, 0.23], [1485.551, 931.776, 0.26], [1291.897, 1074.94, 0.272]]\nB: [[1414.862, 952.185, 0.246], [1191.79, 1180.934, 0.227], [1337.485, 931.666, 0.225], [1205.041, 976.826, 0.257]]\nC: [[1365.108, 1014.952, 0.254], [1365.101, 1014.929, 0.254], [1365.094, 1014.907, 0.254], [1365.086, 1014.885, 0.254]]\nD: [[1286.094, 1146.653, 0.233], [1369.879, 1146.619, 0.278], [1377.756, 963.077, 0.259], [1621.927, 877.672, 0.256]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1335.014, 862.316, 0.287], [1100.87, 1037.094, 0.23], [1485.551, 931.776, 0.26], [1291.897, 1074.94, 0.272]]\nB: [[1414.862, 952.185, 0.246], [1191.79, 1180.934, 0.227], [1337.485, 931.666, 0.225], [1205.041, 976.826, 0.257]]\nC: [[1365.108, 1014.952, 0.254], [1365.101, 1014.929, 0.254], [1365.094, 1014.907, 0.254], [1365.086, 1014.885, 0.254]]\nD: [[1286.094, 1146.653, 0.233], [1369.879, 1146.619, 0.278], [1377.756, 963.077, 0.259], [1621.927, 877.672, 0.256]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_99_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_99_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_99_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_99_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_99_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_99_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_99_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_99_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[365.047, 598.348, 1.241], [286.54, 738.564, 1.05], [387.446, 604.928, 1.198], [391.006, 671.391, 1.378]]\nB: [[377.966, 628.41, 1.434], [361.11, 614.334, 1.35], [284.927, 755.854, 1.174], [403.902, 539.249, 1.302]]\nC: [[341.337, 
715.12, 1.189], [372.63, 619.563, 1.39], [402.819, 670.746, 1.313], [340.745, 536.458, 1.343]]\nD: [[345.848, 655.799, 1.196], [343.13, 656.562, 1.18], [340.412, 657.325, 1.165], [337.693, 658.088, 1.149]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[365.047, 598.348, 1.241], [286.54, 738.564, 1.05], [387.446, 604.928, 1.198], [391.006, 671.391, 1.378]]\nB: [[377.966, 628.41, 1.434], [361.11, 614.334, 1.35], [284.927, 755.854, 1.174], [403.902, 539.249, 1.302]]\nC: [[341.337, 715.12, 1.189], [372.63, 619.563, 1.39], [402.819, 670.746, 1.313], [340.745, 536.458, 1.343]]\nD: [[345.848, 655.799, 1.196], [343.13, 656.562, 1.18], [340.412, 657.325, 1.165], [337.693, 658.088, 1.149]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_100_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_100_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_100_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_100_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_100_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_100_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_100_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_100_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[423.365, 1081.357, 1.92], [358.32, 1065.751, 2.221], [393.86, 1013.258, 1.856], [426.12, 1121.332, 2.203]]\nB: [[374.803, 1125.969, 1.58], [323.517, 1358.518, 1.869], [421.71, 1325.966, 2.205], [374.867, 1307.159, 2.25]]\nC: [[345.831, 1321.961, 2.14], [438.53, 1243.074, 1.588], [406.44, 1418.879, 2.198], [347.216, 1381.306, 1.978]]\nD: [[382.736, 1209.839, 1.88], [383.093, 1209.198, 1.931], [383.45, 1208.557, 1.982], [383.786, 1207.915, 1.982]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[423.365, 1081.357, 1.92], [358.32, 1065.751, 2.221], [393.86, 1013.258, 1.856], [426.12, 1121.332, 2.203]]\nB: [[374.803, 1125.969, 1.58], [323.517, 1358.518, 1.869], [421.71, 1325.966, 2.205], [374.867, 1307.159, 2.25]]\nC: [[345.831, 1321.961, 2.14], [438.53, 1243.074, 1.588], [406.44, 1418.879, 2.198], [347.216, 1381.306, 1.978]]\nD: [[382.736, 1209.839, 1.88], [383.093, 1209.198, 1.931], [383.45, 1208.557, 1.982], [383.786, 1207.915, 1.982]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_101_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_101_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_101_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_101_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_101_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_101_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_101_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_101_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[633.909, 1608.489, 1.038], [670.109, 1891.19, 1.125], [522.041, 1396.622, 1.098], [541.68, 1745.168, 1.117]]\nB: [[635.642, 1415.988, 1.081], [633.621, 1654.57, 0.946], [601.731, 1438.83, 1.36], [652.901, 1593.526, 1.066]]\nC: [[555.337, 1356.247, 1.199], [627.708, 1668.4, 0.904], [707.793, 1894.062, 1.109], [480.816, 1651.213, 1.309]]\nD: [[583.549, 1656.391, 1.267], [587.422, 1654.32, 1.126], [591.257, 1652.222, 1.146], [594.995, 1650.206, 1.166]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[633.909, 1608.489, 1.038], [670.109, 1891.19, 1.125], [522.041, 1396.622, 1.098], [541.68, 1745.168, 1.117]]\nB: [[635.642, 1415.988, 1.081], [633.621, 1654.57, 0.946], [601.731, 1438.83, 1.36], [652.901, 1593.526, 1.066]]\nC: [[555.337, 1356.247, 1.199], [627.708, 1668.4, 0.904], [707.793, 1894.062, 1.109], [480.816, 1651.213, 1.309]]\nD: [[583.549, 1656.391, 1.267], [587.422, 1654.32, 1.126], [591.257, 1652.222, 1.146], [594.995, 1650.206, 1.166]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_102_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_102_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_102_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_102_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_102_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_102_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_102_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_102_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[697.83, 1848.354, 0.41], [586.56, 1505.654, 0.534], [677.42, 1687.731, 0.544], [605.12, 1621.517, 0.808]]\nB: [[519.11, 1562.82, 0.383], [612.23, 1842.267, 0.582], [524.07, 1920.47, 0.561], [598.47, 1708.973, 0.9]]\nC: [[723.89, 1578.062, 
0.473], [519.71, 1405.785, 0.584], [581.29, 1953.42, 0.735], [668.56, 1675.091, 0.868]]\nD: [[619.03, 1648.941, 0.413], [618.43, 1649.273, 0.538], [617.83, 1649.605, 0.663], [617.17, 1649.888, 0.813]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[697.83, 1848.354, 0.41], [586.56, 1505.654, 0.534], [677.42, 1687.731, 0.544], [605.12, 1621.517, 0.808]]\nB: [[519.11, 1562.82, 0.383], [612.23, 1842.267, 0.582], [524.07, 1920.47, 0.561], [598.47, 1708.973, 0.9]]\nC: [[723.89, 1578.062, 0.473], [519.71, 1405.785, 0.584], [581.29, 1953.42, 0.735], [668.56, 1675.091, 0.868]]\nD: [[619.03, 1648.941, 0.413], [618.43, 1649.273, 0.538], [617.83, 1649.605, 0.663], [617.17, 1649.888, 0.813]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_103_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_103_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_103_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_103_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_103_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_103_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_103_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_103_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[403.885, 998.16, 0.544], [424.075, 1007.073, 0.82], [461.305, 1015.923, 0.726], [400.111, 1095.775, 0.878]]\nB: [[391.661, 1114.07, 0.663], [391.696, 1114.047, 0.738], [391.688, 1114.052, 0.813], [391.697, 1114.047, 0.818]]\nC: [[386.99, 1297.58, 0.743], [409.207, 1135.586, 0.659], [357.708, 1116.073, 0.868], [392.676, 1300.061, 0.744]]\nD: [[402.241, 1225.3, 0.56], [375.81, 983.912, 0.615], [412.224, 1111.715, 0.708], [393.052, 1232.005, 0.961]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[403.885, 998.16, 0.544], [424.075, 1007.073, 0.82], [461.305, 1015.923, 0.726], [400.111, 1095.775, 0.878]]\nB: [[391.661, 1114.07, 0.663], [391.696, 1114.047, 0.738], [391.688, 1114.052, 0.813], [391.697, 1114.047, 0.818]]\nC: [[386.99, 1297.58, 0.743], [409.207, 1135.586, 0.659], [357.708, 1116.073, 0.868], [392.676, 1300.061, 0.744]]\nD: [[402.241, 1225.3, 0.56], [375.81, 983.912, 0.615], [412.224, 1111.715, 0.708], [393.052, 1232.005, 0.961]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_104_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_104_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_104_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_104_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_104_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_104_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_104_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_104_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1277.031, 1033.186, 0.322], [1277.662, 1033.929, 0.322], [1279.057, 1035.823, 0.322], [1280.749, 1038.066, 0.372]]\nB: [[1421.88, 1233.909, 0.31], [1157.401, 1129.096, 0.349], [1356.001, 893.496, 0.351], [1288.688, 983.139, 0.356]]\nC: [[1125.382, 1211.613, 0.317], [1176.913, 1001.679, 0.291], [1346.252, 1080.898, 0.373], [1066.545, 1136.811, 0.352]]\nD: [[1059.6, 1024.583, 0.258], [1367.51, 878.274, 0.29], [1278.315, 1180.834, 0.347], [1136.279, 1162.583, 0.374]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1277.031, 1033.186, 0.322], [1277.662, 1033.929, 0.322], [1279.057, 1035.823, 0.322], [1280.749, 1038.066, 0.372]]\nB: [[1421.88, 1233.909, 0.31], [1157.401, 1129.096, 0.349], [1356.001, 893.496, 0.351], [1288.688, 983.139, 0.356]]\nC: [[1125.382, 1211.613, 0.317], [1176.913, 1001.679, 0.291], [1346.252, 1080.898, 0.373], [1066.545, 1136.811, 0.352]]\nD: [[1059.6, 1024.583, 0.258], [1367.51, 878.274, 0.29], [1278.315, 1180.834, 0.347], [1136.279, 1162.583, 0.374]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_105_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_105_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_105_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_105_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_105_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_105_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_105_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_105_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[415.331, 1104.242, 0.613], [415.326, 1104.24, 0.64], [415.316, 1104.236, 0.695], [415.31, 1104.234, 0.723]]\nB: [[345.131, 1192.788, 0.505], [371.883, 1269.14, 0.69], [447.888, 1224.599, 0.614], [392.44, 1170.864, 0.783]]\nC: 
[[454.914, 1278.297, 0.71], [434.859, 1136.62, 0.63], [416.021, 1211.093, 0.643], [337.73, 960.185, 0.669]]\nD: [[480.222, 1069.404, 0.56], [365.517, 970.37, 0.54], [386.514, 975.732, 0.743], [393.7, 923.805, 0.642]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[415.331, 1104.242, 0.613], [415.326, 1104.24, 0.64], [415.316, 1104.236, 0.695], [415.31, 1104.234, 0.723]]\nB: [[345.131, 1192.788, 0.505], [371.883, 1269.14, 0.69], [447.888, 1224.599, 0.614], [392.44, 1170.864, 0.783]]\nC: [[454.914, 1278.297, 0.71], [434.859, 1136.62, 0.63], [416.021, 1211.093, 0.643], [337.73, 960.185, 0.669]]\nD: [[480.222, 1069.404, 0.56], [365.517, 970.37, 0.54], [386.514, 975.732, 0.743], [393.7, 923.805, 0.642]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_106_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_106_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_106_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_106_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_106_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_106_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_106_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_106_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", 
+ "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[407.576, 1163.308, 0.729], [407.573, 1163.324, 0.746], [407.57, 1163.34, 0.762], [407.569, 1163.357, 0.779]]\nB: [[387.473, 1137.771, 0.644], [384.287, 1365.683, 0.681], [390.89, 1137.17, 0.617], [457.784, 1284.967, 0.839]]\nC: [[360.164, 1015.983, 0.678], [381.237, 1053.29, 0.859], [457.22, 1360.88, 0.63], [408.603, 1334.048, 0.816]]\nD: [[392.585, 1372.768, 0.686], [426.374, 1363.72, 0.752], [443.3, 955.82, 0.704], [326.364, 1211.631, 0.769]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[407.576, 1163.308, 0.729], [407.573, 1163.324, 0.746], [407.57, 1163.34, 0.762], [407.569, 1163.357, 0.779]]\nB: [[387.473, 1137.771, 0.644], [384.287, 1365.683, 0.681], [390.89, 1137.17, 0.617], [457.784, 1284.967, 0.839]]\nC: [[360.164, 1015.983, 0.678], [381.237, 1053.29, 0.859], [457.22, 1360.88, 0.63], [408.603, 1334.048, 0.816]]\nD: [[392.585, 1372.768, 0.686], [426.374, 1363.72, 0.752], [443.3, 955.82, 0.704], [326.364, 1211.631, 0.769]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_107_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_107_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_107_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_107_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_107_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_107_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_107_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_107_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1183.549, 1091.134, 0.407], [1449.921, 972.434, 0.409], [1115.763, 1023.251, 0.366], [1402.669, 992.802, 0.432]]\nB: [[1243.405, 864.452, 0.467], [1368.17, 1085.65, 0.361], [1076.736, 1221.575, 0.333], [1435.133, 1172.523, 0.468]]\nC: [[1295.125, 1032.757, 0.415], [1295.611, 1033.251, 0.415], [1296.187, 1033.665, 0.415], [1296.747, 1033.991, 0.415]]\nD: [[1335.953, 913.089, 0.398], [1461.39, 864.58, 0.459], [1483.452, 900.406, 0.383], [1264.928, 1038.725, 0.375]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1183.549, 1091.134, 0.407], [1449.921, 972.434, 0.409], [1115.763, 1023.251, 0.366], [1402.669, 992.802, 0.432]]\nB: [[1243.405, 864.452, 0.467], [1368.17, 1085.65, 0.361], [1076.736, 1221.575, 0.333], [1435.133, 1172.523, 0.468]]\nC: [[1295.125, 1032.757, 0.415], [1295.611, 1033.251, 0.415], [1296.187, 1033.665, 0.415], [1296.747, 1033.991, 0.415]]\nD: [[1335.953, 913.089, 0.398], [1461.39, 864.58, 0.459], [1483.452, 900.406, 0.383], [1264.928, 1038.725, 0.375]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_108_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_108_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_108_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_108_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_108_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_108_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_108_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_108_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[339.449, 659.894, 0.573], [339.446, 659.895, 0.607], [339.443, 659.896, 0.64], [339.44, 659.897, 0.674]]\nB: [[398.935, 645.734, 0.599], [298.581, 729.947, 0.634], [401.409, 592.555, 0.67], [389.37, 745.064, 0.776]]\nC: 
[[317.88, 666.567, 0.669], [318.154, 636.677, 0.526], [319.442, 702.387, 0.7], [331.82, 647.551, 0.682]]\nD: [[348.987, 658.876, 0.645], [370.001, 591.87, 0.551], [346.212, 591.313, 0.75], [291.32, 620.068, 0.565]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[339.449, 659.894, 0.573], [339.446, 659.895, 0.607], [339.443, 659.896, 0.64], [339.44, 659.897, 0.674]]\nB: [[398.935, 645.734, 0.599], [298.581, 729.947, 0.634], [401.409, 592.555, 0.67], [389.37, 745.064, 0.776]]\nC: [[317.88, 666.567, 0.669], [318.154, 636.677, 0.526], [319.442, 702.387, 0.7], [331.82, 647.551, 0.682]]\nD: [[348.987, 658.876, 0.645], [370.001, 591.87, 0.551], [346.212, 591.313, 0.75], [291.32, 620.068, 0.565]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_109_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_109_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_109_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_109_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_109_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_109_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_109_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_109_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1348.14, 1074.93, 0.373], [1537.25, 1072.09, 0.5], [1196.975, 870.732, 0.55], [1269.511, 1097.657, 0.609]]\nB: [[1369.98, 1210.47, 0.291], [1383.17, 1209.316, 0.44], [1098.297, 933.023, 0.49], [1055.724, 1184.093, 0.615]]\nC: [[1279.19, 1030.84, 0.349], [1282.49, 1034.214, 0.43], [1285.285, 1037.189, 0.51], [1288.217, 1040.319, 0.591]]\nD: [[1424.53, 1145.06, 0.417], [1294.63, 1198.674, 0.47], [1368.216, 886.452, 0.51], [1389.846, 1124.768, 0.48]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1348.14, 1074.93, 0.373], [1537.25, 1072.09, 0.5], [1196.975, 870.732, 0.55], [1269.511, 1097.657, 0.609]]\nB: [[1369.98, 1210.47, 0.291], [1383.17, 1209.316, 0.44], [1098.297, 933.023, 0.49], [1055.724, 1184.093, 0.615]]\nC: [[1279.19, 1030.84, 0.349], [1282.49, 1034.214, 0.43], [1285.285, 1037.189, 0.51], [1288.217, 1040.319, 0.591]]\nD: [[1424.53, 1145.06, 0.417], [1294.63, 1198.674, 0.47], [1368.216, 886.452, 0.51], [1389.846, 1124.768, 0.48]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_110_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_110_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_110_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_110_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_110_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_110_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_110_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_110_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1180.699, 1025.35, 0.352], [1360.818, 1139.597, 0.397], [1152.166, 1159.568, 0.296], [1106.717, 1234.187, 0.313]]\nB: [[1378.182, 1100.4, 0.333], [1294.85, 1232.299, 0.398], [1173.547, 969.988, 0.388], [1171.591, 1158.384, 0.396]]\nC: [[1086.537, 1116.193, 0.36], [1109.417, 1116.907, 0.31], [1478.169, 1103.822, 0.341], [1122.704, 957.886, 0.337]]\nD: [[1275.412, 1026.886, 0.336], [1278.054, 1029.742, 0.336], [1280.696, 1032.599, 0.336], [1283.018, 1035.321, 0.336]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1180.699, 1025.35, 0.352], [1360.818, 1139.597, 0.397], [1152.166, 1159.568, 0.296], [1106.717, 1234.187, 0.313]]\nB: [[1378.182, 1100.4, 0.333], [1294.85, 1232.299, 0.398], [1173.547, 969.988, 0.388], [1171.591, 1158.384, 0.396]]\nC: [[1086.537, 1116.193, 0.36], [1109.417, 1116.907, 0.31], [1478.169, 1103.822, 0.341], [1122.704, 957.886, 0.337]]\nD: [[1275.412, 1026.886, 0.336], [1278.054, 1029.742, 0.336], [1280.696, 1032.599, 0.336], [1283.018, 1035.321, 0.336]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_111_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_111_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_111_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_111_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_111_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_111_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_111_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_111_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1567.635, 999.11, 0.216], [1581.604, 1034.59, 0.201], [2215.085, 945.14, 0.266], [1822.791, 878.16, 0.212]]\nB: [[1547.391, 1001.36, 0.216], [1923.868, 772.06, 0.229], [1775.081, 857.84, 0.257], [1976.165, 741.51, 
0.198]]\nC: [[1924.297, 873.96, 0.189], [1924.297, 873.96, 0.206], [1924.297, 873.96, 0.223], [1924.297, 873.96, 0.239]]\nD: [[1739.859, 831.14, 0.169], [1930.99, 1015.96, 0.221], [1891.889, 1021.13, 0.241], [2233.369, 854.3, 0.277]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1567.635, 999.11, 0.216], [1581.604, 1034.59, 0.201], [2215.085, 945.14, 0.266], [1822.791, 878.16, 0.212]]\nB: [[1547.391, 1001.36, 0.216], [1923.868, 772.06, 0.229], [1775.081, 857.84, 0.257], [1976.165, 741.51, 0.198]]\nC: [[1924.297, 873.96, 0.189], [1924.297, 873.96, 0.206], [1924.297, 873.96, 0.223], [1924.297, 873.96, 0.239]]\nD: [[1739.859, 831.14, 0.169], [1930.99, 1015.96, 0.221], [1891.889, 1021.13, 0.241], [2233.369, 854.3, 0.277]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_112_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_112_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_112_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_112_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_112_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_112_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_112_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_112_7.png" + ], + "output": "C" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[389.455, 1221.754, 1.957], [389.017, 1221.738, 1.957], [388.246, 1221.696, 2.082], [387.407, 1221.65, 2.007]]\nB: [[430.695, 1244.614, 1.598], [332.727, 1219.984, 2.062], [451.568, 1172.545, 2.304], [431.932, 1447.12, 2.075]]\nC: [[434.759, 1360.34, 1.798], [320.818, 1065.151, 2.275], [403.374, 995.774, 1.782], [399.338, 1318.27, 2.25]]\nD: [[338.306, 1065.478, 2.175], [359.176, 1170.276, 2.145], [422.221, 1295.741, 2.146], [318.234, 1189.1, 1.616]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[389.455, 1221.754, 1.957], [389.017, 1221.738, 1.957], [388.246, 1221.696, 2.082], [387.407, 1221.65, 2.007]]\nB: [[430.695, 1244.614, 1.598], [332.727, 1219.984, 2.062], [451.568, 1172.545, 2.304], [431.932, 1447.12, 2.075]]\nC: [[434.759, 1360.34, 1.798], [320.818, 1065.151, 2.275], [403.374, 995.774, 1.782], [399.338, 1318.27, 2.25]]\nD: [[338.306, 1065.478, 2.175], [359.176, 1170.276, 2.145], [422.221, 1295.741, 2.146], [318.234, 1189.1, 1.616]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_113_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_113_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_113_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_113_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_113_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_113_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_113_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_113_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[236.0, 747.95, 0.854], [256.234, 709.537, 0.697], [304.474, 800.037, 0.56], [331.877, 561.326, 0.514]]\nB: [[314.28, 598.25, 0.849], [236.191, 740.031, 0.728], [250.825, 688.597, 0.635], [341.366, 613.896, 0.475]]\nC: [[246.64, 693.8, 0.858], [290.257, 567.854, 0.783], [293.745, 750.544, 0.62], [309.807, 562.559, 0.531]]\nD: [[289.28, 669.01, 0.775], [291.627, 672.377, 0.668], [293.977, 675.748, 0.562], [296.214, 678.956, 0.455]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[236.0, 747.95, 0.854], [256.234, 709.537, 0.697], [304.474, 800.037, 0.56], [331.877, 561.326, 0.514]]\nB: [[314.28, 598.25, 0.849], [236.191, 740.031, 0.728], [250.825, 688.597, 0.635], [341.366, 613.896, 0.475]]\nC: [[246.64, 693.8, 0.858], [290.257, 567.854, 0.783], [293.745, 750.544, 0.62], [309.807, 562.559, 0.531]]\nD: [[289.28, 669.01, 0.775], [291.627, 672.377, 0.668], [293.977, 675.748, 0.562], [296.214, 678.956, 0.455]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_114_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_114_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_114_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_114_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_114_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_114_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_114_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_114_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[533.8, 1735.728, -0.009], [518.05, 1383.369, 0.045], [743.409, 1439.406, 0.374], [553.203, 1712.789, 0.68]]\nB: [[653.646, 1831.884, -0.01], [745.339, 1445.929, 0.044], [684.645, 1812.914, 0.333], [569.065, 1458.696, 0.754]]\nC: [[572.656, 1841.565, -0.01], 
[747.719, 1494.494, 0.038], [688.766, 1558.475, 0.402], [740.666, 1414.102, 0.689]]\nD: [[637.791, 1636.674, -0.011], [637.381, 1637.067, 0.039], [636.158, 1638.241, 0.389], [635.756, 1638.659, 0.689]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[533.8, 1735.728, -0.009], [518.05, 1383.369, 0.045], [743.409, 1439.406, 0.374], [553.203, 1712.789, 0.68]]\nB: [[653.646, 1831.884, -0.01], [745.339, 1445.929, 0.044], [684.645, 1812.914, 0.333], [569.065, 1458.696, 0.754]]\nC: [[572.656, 1841.565, -0.01], [747.719, 1494.494, 0.038], [688.766, 1558.475, 0.402], [740.666, 1414.102, 0.689]]\nD: [[637.791, 1636.674, -0.011], [637.381, 1637.067, 0.039], [636.158, 1638.241, 0.389], [635.756, 1638.659, 0.689]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_115_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_115_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_115_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_115_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_115_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_115_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_115_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_115_7.png" + ], + "output": "D" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[357.199, 1105.26, 0.925], [357.199, 1105.26, 0.874], [357.199, 1105.26, 0.901], [357.199, 1105.26, 1.083]]\nB: [[405.104, 1231.8, 0.941], [321.418, 916.12, 1.011], [382.371, 913.36, 0.794], [428.391, 1299.88, 1.177]]\nC: [[352.491, 1140.82, 0.829], [377.607, 964.69, 0.939], [341.493, 1094.81, 0.997], [329.979, 894.62, 0.879]]\nD: [[368.993, 920.78, 0.953], [328.671, 1054.8, 1.001], [426.057, 1241.84, 0.874], [319.642, 1019.55, 1.122]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[357.199, 1105.26, 0.925], [357.199, 1105.26, 0.874], [357.199, 1105.26, 0.901], [357.199, 1105.26, 1.083]]\nB: [[405.104, 1231.8, 0.941], [321.418, 916.12, 1.011], [382.371, 913.36, 0.794], [428.391, 1299.88, 1.177]]\nC: [[352.491, 1140.82, 0.829], [377.607, 964.69, 0.939], [341.493, 1094.81, 0.997], [329.979, 894.62, 0.879]]\nD: [[368.993, 920.78, 0.953], [328.671, 1054.8, 1.001], [426.057, 1241.84, 0.874], [319.642, 1019.55, 1.122]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_116_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_116_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_116_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_116_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_116_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_116_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_116_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_116_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[390.292, 1280.196, 0.538], [390.276, 1322.072, 0.606], [337.427, 1327.052, 0.501], [388.832, 1049.397, 0.585]]\nB: [[441.456, 1017.908, 0.543], [440.359, 1177.164, 0.491], [371.894, 1012.041, 0.514], [347.529, 1236.907, 0.616]]\nC: [[398.584, 1179.211, 0.555], [316.744, 1033.547, 0.547], [377.513, 1090.27, 0.445], [332.149, 1080.471, 0.473]]\nD: [[393.298, 1155.018, 0.485], [393.298, 1155.017, 0.514], [393.298, 1155.016, 0.542], [393.297, 1155.015, 0.571]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[390.292, 1280.196, 0.538], [390.276, 1322.072, 0.606], [337.427, 1327.052, 0.501], [388.832, 1049.397, 0.585]]\nB: [[441.456, 1017.908, 0.543], [440.359, 1177.164, 0.491], [371.894, 1012.041, 0.514], [347.529, 1236.907, 0.616]]\nC: [[398.584, 1179.211, 0.555], [316.744, 1033.547, 0.547], [377.513, 1090.27, 0.445], [332.149, 1080.471, 0.473]]\nD: [[393.298, 1155.018, 0.485], [393.298, 1155.017, 0.514], [393.298, 1155.016, 0.542], [393.297, 1155.015, 0.571]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_117_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_117_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_117_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_117_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_117_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_117_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_117_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_117_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[436.159, 952.778, 0.85], [380.614, 1063.82, 0.774], [459.351, 1286.857, 0.672], [356.13, 1196.862, 0.73]]\nB: [[393.174, 1367.574, 0.683], [466.835, 1298.26, 0.635], [356.883, 1226.503, 0.681], [446.634, 1121.248, 0.813]]\nC: 
[[399.863, 1143.574, 0.738], [398.996, 1141.132, 0.738], [398.116, 1138.632, 0.738], [397.624, 1136.322, 0.738]]\nD: [[344.514, 1172.922, 0.671], [413.852, 1079.671, 0.613], [361.577, 1132.234, 0.863], [334.055, 1043.733, 0.866]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[436.159, 952.778, 0.85], [380.614, 1063.82, 0.774], [459.351, 1286.857, 0.672], [356.13, 1196.862, 0.73]]\nB: [[393.174, 1367.574, 0.683], [466.835, 1298.26, 0.635], [356.883, 1226.503, 0.681], [446.634, 1121.248, 0.813]]\nC: [[399.863, 1143.574, 0.738], [398.996, 1141.132, 0.738], [398.116, 1138.632, 0.738], [397.624, 1136.322, 0.738]]\nD: [[344.514, 1172.922, 0.671], [413.852, 1079.671, 0.613], [361.577, 1132.234, 0.863], [334.055, 1043.733, 0.866]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_118_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_118_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_118_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_118_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_118_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_118_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_118_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_118_7.png" + ], + "output": "C" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[379.59, 1225.256, 1.8], [411.498, 1212.145, 1.432], [439.173, 1051.415, 1.512], [378.757, 1170.606, 1.817]]\nB: [[385.93, 1201.138, 1.613], [385.521, 1201.641, 1.663], [384.966, 1202.306, 1.763], [384.443, 1202.903, 1.763]]\nC: [[447.45, 996.224, 1.641], [321.511, 1225.058, 1.654], [320.686, 1029.24, 1.737], [312.326, 1161.223, 1.53]]\nD: [[395.57, 1047.807, 1.499], [340.373, 1260.222, 1.497], [439.995, 1104.894, 1.86], [369.975, 1070.189, 1.508]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[379.59, 1225.256, 1.8], [411.498, 1212.145, 1.432], [439.173, 1051.415, 1.512], [378.757, 1170.606, 1.817]]\nB: [[385.93, 1201.138, 1.613], [385.521, 1201.641, 1.663], [384.966, 1202.306, 1.763], [384.443, 1202.903, 1.763]]\nC: [[447.45, 996.224, 1.641], [321.511, 1225.058, 1.654], [320.686, 1029.24, 1.737], [312.326, 1161.223, 1.53]]\nD: [[395.57, 1047.807, 1.499], [340.373, 1260.222, 1.497], [439.995, 1104.894, 1.86], [369.975, 1070.189, 1.508]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_119_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_119_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_119_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_119_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_119_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_119_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_119_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_119_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[2078.323, 958.565, 0.161], [1559.705, 959.242, 0.151], [2197.62, 763.907, 0.152], [1804.38, 767.638, 0.197]]\nB: [[2138.05, 913.396, 0.132], [2178.71, 1013.596, 0.176], [1703.36, 733.089, 0.191], [1798.969, 734.79, 0.2]]\nC: [[1835.477, 1025.448, 0.137], [1705.515, 1015.928, 0.153], [1802.73, 873.747, 0.182], [1809.594, 823.179, 0.19]]\nD: [[1926.648, 875.886, 0.141], [1926.639, 875.864, 0.154], [1926.63, 875.841, 0.166], [1926.627, 875.833, 0.179]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[2078.323, 958.565, 0.161], [1559.705, 959.242, 0.151], [2197.62, 763.907, 0.152], [1804.38, 767.638, 0.197]]\nB: [[2138.05, 913.396, 0.132], [2178.71, 1013.596, 0.176], [1703.36, 733.089, 0.191], [1798.969, 734.79, 0.2]]\nC: [[1835.477, 1025.448, 0.137], [1705.515, 1015.928, 0.153], [1802.73, 873.747, 0.182], [1809.594, 823.179, 0.19]]\nD: [[1926.648, 875.886, 0.141], [1926.639, 875.864, 0.154], [1926.63, 875.841, 0.166], [1926.627, 875.833, 0.179]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_120_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_120_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_120_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_120_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_120_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_120_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_120_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_120_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1640.64, 2308.693, 0.719], [2017.236, 2113.119, 0.967], [1994.76, 2513.033, 1.14], [1810.618, 2225.686, 1.546]]\nB: [[1973.19, 2747.385, 0.918], [1843.455, 2503.49, 1.052], [1630.78, 2460.524, 1.38], [1987.593, 2630.677, 1.375]]\nC: 
[[1576.98, 2536.869, 0.66], [2075.147, 2055.992, 1.144], [1827.84, 2639.901, 1.45], [2070.073, 2767.351, 1.167]]\nD: [[1866.27, 2481.021, 0.817], [1865.675, 2481.739, 1.031], [1865.18, 2482.337, 1.21], [1864.684, 2482.936, 1.389]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1640.64, 2308.693, 0.719], [2017.236, 2113.119, 0.967], [1994.76, 2513.033, 1.14], [1810.618, 2225.686, 1.546]]\nB: [[1973.19, 2747.385, 0.918], [1843.455, 2503.49, 1.052], [1630.78, 2460.524, 1.38], [1987.593, 2630.677, 1.375]]\nC: [[1576.98, 2536.869, 0.66], [2075.147, 2055.992, 1.144], [1827.84, 2639.901, 1.45], [2070.073, 2767.351, 1.167]]\nD: [[1866.27, 2481.021, 0.817], [1865.675, 2481.739, 1.031], [1865.18, 2482.337, 1.21], [1864.684, 2482.936, 1.389]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_121_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_121_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_121_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_121_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_121_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_121_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_121_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_121_7.png" + ], + "output": "D" + }, + { + 
"task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1323.534, 1070.968, 0.039], [1242.872, 853.737, 0.042], [1492.766, 1030.756, 0.057], [1309.345, 1096.471, 0.046]]\nB: [[1556.573, 856.346, 0.057], [1108.062, 1183.213, 0.047], [1303.053, 903.29, 0.05], [1529.898, 1191.182, 0.045]]\nC: [[1351.279, 1022.468, 0.048], [1351.279, 1022.468, 0.048], [1351.279, 1022.468, 0.048], [1351.279, 1022.468, 0.048]]\nD: [[1160.276, 1004.554, 0.048], [1454.931, 1040.919, 0.039], [1167.504, 987.892, 0.046], [1457.735, 818.113, 0.042]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1323.534, 1070.968, 0.039], [1242.872, 853.737, 0.042], [1492.766, 1030.756, 0.057], [1309.345, 1096.471, 0.046]]\nB: [[1556.573, 856.346, 0.057], [1108.062, 1183.213, 0.047], [1303.053, 903.29, 0.05], [1529.898, 1191.182, 0.045]]\nC: [[1351.279, 1022.468, 0.048], [1351.279, 1022.468, 0.048], [1351.279, 1022.468, 0.048], [1351.279, 1022.468, 0.048]]\nD: [[1160.276, 1004.554, 0.048], [1454.931, 1040.919, 0.039], [1167.504, 987.892, 0.046], [1457.735, 818.113, 0.042]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_122_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_122_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_122_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_122_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_122_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_122_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_122_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_122_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[398.992, 1279.087, 0.118], [456.215, 1152.193, 0.133], [407.776, 1279.429, 0.095], [527.113, 1246.616, 0.136]]\nB: [[448.696, 1090.248, 0.117], [448.695, 1090.246, 0.117], [448.686, 1090.224, 0.115], [448.685, 1090.222, 0.114]]\nC: [[435.77, 875.144, 0.115], [440.962, 1303.479, 0.129], [413.225, 1290.42, 0.105], [511.071, 1036.309, 0.122]]\nD: [[438.596, 955.475, 0.137], [464.97, 1295.34, 0.118], [386.42, 1095.841, 0.125], [437.592, 1200.522, 0.127]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[398.992, 1279.087, 0.118], [456.215, 1152.193, 0.133], [407.776, 1279.429, 0.095], [527.113, 1246.616, 0.136]]\nB: [[448.696, 1090.248, 0.117], [448.695, 1090.246, 0.117], [448.686, 1090.224, 0.115], [448.685, 1090.222, 0.114]]\nC: [[435.77, 875.144, 0.115], [440.962, 1303.479, 0.129], [413.225, 1290.42, 0.105], [511.071, 1036.309, 0.122]]\nD: [[438.596, 955.475, 0.137], [464.97, 1295.34, 0.118], [386.42, 1095.841, 0.125], [437.592, 1200.522, 0.127]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_123_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_123_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_123_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_123_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_123_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_123_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_123_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_123_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1835.491, 834.002, 0.48], [2125.794, 969.393, 0.677], [1614.306, 1027.608, 0.583], [1983.247, 970.603, 0.541]]\nB: [[1651.575, 740.269, 0.56], [1919.293, 887.545, 0.629], [1867.876, 908.887, 0.565], [1937.748, 943.609, 0.511]]\nC: 
[[1784.634, 874.597, 0.596], [1784.597, 874.576, 0.596], [1784.564, 874.558, 0.596], [1784.764, 874.582, 0.596]]\nD: [[1674.888, 950.802, 0.589], [2065.024, 902.619, 0.528], [2130.173, 1019.966, 0.552], [2067.829, 931.775, 0.63]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1835.491, 834.002, 0.48], [2125.794, 969.393, 0.677], [1614.306, 1027.608, 0.583], [1983.247, 970.603, 0.541]]\nB: [[1651.575, 740.269, 0.56], [1919.293, 887.545, 0.629], [1867.876, 908.887, 0.565], [1937.748, 943.609, 0.511]]\nC: [[1784.634, 874.597, 0.596], [1784.597, 874.576, 0.596], [1784.564, 874.558, 0.596], [1784.764, 874.582, 0.596]]\nD: [[1674.888, 950.802, 0.589], [2065.024, 902.619, 0.528], [2130.173, 1019.966, 0.552], [2067.829, 931.775, 0.63]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_124_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_124_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_124_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_124_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_124_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_124_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_124_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_124_7.png" + ], + "output": "C" + }, + { + 
"task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[276.129, 765.803, 0.706], [239.259, 629.205, 0.8], [265.678, 619.0, 0.624], [272.857, 563.448, 0.52]]\nB: [[267.858, 571.717, 0.774], [277.532, 772.037, 0.819], [265.677, 626.589, 0.69], [303.026, 678.599, 0.635]]\nC: [[307.434, 646.641, 0.793], [279.193, 720.372, 0.75], [342.062, 733.991, 0.756], [275.316, 788.349, 0.594]]\nD: [[287.863, 668.522, 0.723], [289.106, 670.134, 0.687], [290.511, 671.955, 0.702], [292.583, 674.718, 0.593]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[276.129, 765.803, 0.706], [239.259, 629.205, 0.8], [265.678, 619.0, 0.624], [272.857, 563.448, 0.52]]\nB: [[267.858, 571.717, 0.774], [277.532, 772.037, 0.819], [265.677, 626.589, 0.69], [303.026, 678.599, 0.635]]\nC: [[307.434, 646.641, 0.793], [279.193, 720.372, 0.75], [342.062, 733.991, 0.756], [275.316, 788.349, 0.594]]\nD: [[287.863, 668.522, 0.723], [289.106, 670.134, 0.687], [290.511, 671.955, 0.702], [292.583, 674.718, 0.593]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_125_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_125_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_125_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_125_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_125_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_125_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_125_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_125_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[622.35, 1624.018, -0.016], [621.599, 1624.59, 0.068], [620.967, 1625.056, 0.201], [620.312, 1625.598, 0.284]]\nB: [[715.37, 1646.519, -0.015], [571.805, 1818.41, 0.061], [556.199, 1828.057, 0.206], [719.663, 1568.216, 0.242]]\nC: [[619.74, 1639.913, -0.014], [696.763, 1306.55, 0.056], [512.97, 1560.484, 0.186], [567.584, 1424.02, 0.237]]\nD: [[676.88, 1409.501, -0.016], [537.018, 1735.64, 0.057], [546.621, 1339.978, 0.22], [568.0, 1888.129, 0.245]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[622.35, 1624.018, -0.016], [621.599, 1624.59, 0.068], [620.967, 1625.056, 0.201], [620.312, 1625.598, 0.284]]\nB: [[715.37, 1646.519, -0.015], [571.805, 1818.41, 0.061], [556.199, 1828.057, 0.206], [719.663, 1568.216, 0.242]]\nC: [[619.74, 1639.913, -0.014], [696.763, 1306.55, 0.056], [512.97, 1560.484, 0.186], [567.584, 1424.02, 0.237]]\nD: [[676.88, 1409.501, -0.016], [537.018, 1735.64, 0.057], [546.621, 1339.978, 0.22], [568.0, 1888.129, 0.245]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_126_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_126_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_126_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_126_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_126_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_126_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_126_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_126_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[627.008, 1617.877, -0.387], [626.7, 1618.657, -0.137], [626.332, 1619.431, 0.163], [626.034, 1619.837, 0.363]]\nB: [[514.156, 1782.736, -0.333], [575.6, 1636.318, -0.131], [743.355, 1576.589, 0.179], [505.541, 1559.477, 0.32]]\nC: [[712.578, 
1866.613, -0.344], [558.1, 1427.09, -0.154], [677.337, 1665.044, 0.133], [550.249, 1826.976, 0.376]]\nD: [[618.87, 1499.776, -0.427], [647.6, 1861.481, -0.148], [699.281, 1872.065, 0.164], [640.722, 1817.452, 0.342]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[627.008, 1617.877, -0.387], [626.7, 1618.657, -0.137], [626.332, 1619.431, 0.163], [626.034, 1619.837, 0.363]]\nB: [[514.156, 1782.736, -0.333], [575.6, 1636.318, -0.131], [743.355, 1576.589, 0.179], [505.541, 1559.477, 0.32]]\nC: [[712.578, 1866.613, -0.344], [558.1, 1427.09, -0.154], [677.337, 1665.044, 0.133], [550.249, 1826.976, 0.376]]\nD: [[618.87, 1499.776, -0.427], [647.6, 1861.481, -0.148], [699.281, 1872.065, 0.164], [640.722, 1817.452, 0.342]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_127_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_127_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_127_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_127_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_127_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_127_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_127_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_127_7.png" + ], + "output": "A" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[353.535, 1093.572, 0.8], [325.884, 1175.321, 0.708], [298.264, 1160.539, 1.06], [403.515, 1196.443, 0.896]]\nB: [[298.277, 1279.808, 0.8], [334.872, 990.495, 0.719], [317.907, 1145.582, 0.785], [428.226, 1134.096, 1.096]]\nC: [[361.234, 1127.159, 0.743], [361.244, 1127.193, 0.761], [361.254, 1127.227, 0.979], [361.252, 1127.231, 1.019]]\nD: [[351.808, 979.748, 0.733], [299.091, 972.477, 0.91], [422.591, 1328.277, 1.109], [373.924, 1003.202, 0.826]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[353.535, 1093.572, 0.8], [325.884, 1175.321, 0.708], [298.264, 1160.539, 1.06], [403.515, 1196.443, 0.896]]\nB: [[298.277, 1279.808, 0.8], [334.872, 990.495, 0.719], [317.907, 1145.582, 0.785], [428.226, 1134.096, 1.096]]\nC: [[361.234, 1127.159, 0.743], [361.244, 1127.193, 0.761], [361.254, 1127.227, 0.979], [361.252, 1127.231, 1.019]]\nD: [[351.808, 979.748, 0.733], [299.091, 972.477, 0.91], [422.591, 1328.277, 1.109], [373.924, 1003.202, 0.826]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_128_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_128_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_128_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_128_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_128_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_128_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_128_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_128_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[467.084, 1013.697, 1.484], [410.637, 1087.555, 1.47], [503.896, 994.189, 1.833], [375.598, 1172.16, 1.996]]\nB: [[403.645, 1031.052, 1.679], [463.451, 943.028, 1.451], [499.44, 1242.514, 2.094], [468.957, 1220.61, 2.107]]\nC: [[479.961, 1178.515, 1.846], [421.912, 1195.377, 1.945], [395.807, 904.258, 1.58], [479.041, 963.66, 1.573]]\nD: [[443.949, 1116.592, 1.729], [443.607, 1116.621, 1.729], [442.518, 1116.448, 1.879], [442.143, 1116.34, 1.929]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[467.084, 1013.697, 1.484], [410.637, 1087.555, 1.47], [503.896, 994.189, 1.833], [375.598, 1172.16, 1.996]]\nB: [[403.645, 1031.052, 1.679], [463.451, 943.028, 1.451], [499.44, 1242.514, 2.094], [468.957, 1220.61, 2.107]]\nC: [[479.961, 1178.515, 1.846], [421.912, 1195.377, 1.945], [395.807, 904.258, 1.58], [479.041, 963.66, 1.573]]\nD: [[443.949, 1116.592, 1.729], [443.607, 1116.621, 1.729], [442.518, 1116.448, 1.879], [442.143, 1116.34, 1.929]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_129_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_129_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_129_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_129_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_129_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_129_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_129_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_129_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1044.891, 1237.212, 0.684], [1071.2, 1248.461, 0.639], [1210.008, 933.973, 0.707], [1328.735, 877.082, 0.748]]\nB: [[1117.371, 1205.206, 0.822], [1089.2, 940.984, 0.629], [1072.282, 905.107, 0.824], [1173.176, 946.517, 0.885]]\nC: [[1227.559, 
936.208, 0.663], [1471.6, 1143.386, 0.863], [1177.563, 842.525, 0.712], [1310.648, 1103.801, 0.83]]\nD: [[1267.451, 1047.078, 0.822], [1266.5, 1047.564, 0.754], [1265.609, 1047.998, 0.762], [1257.032, 1054.386, 0.759]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1044.891, 1237.212, 0.684], [1071.2, 1248.461, 0.639], [1210.008, 933.973, 0.707], [1328.735, 877.082, 0.748]]\nB: [[1117.371, 1205.206, 0.822], [1089.2, 940.984, 0.629], [1072.282, 905.107, 0.824], [1173.176, 946.517, 0.885]]\nC: [[1227.559, 936.208, 0.663], [1471.6, 1143.386, 0.863], [1177.563, 842.525, 0.712], [1310.648, 1103.801, 0.83]]\nD: [[1267.451, 1047.078, 0.822], [1266.5, 1047.564, 0.754], [1265.609, 1047.998, 0.762], [1257.032, 1054.386, 0.759]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_130_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_130_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_130_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_130_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_130_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_130_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_130_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_130_7.png" + ], + "output": "D" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1275.825, 1026.459, 0.275], [1278.063, 1029.09, 0.375], [1280.981, 1032.367, 0.325], [1283.902, 1035.648, 0.374]]\nB: [[1030.171, 1170.182, 0.27], [1123.271, 830.3, 0.339], [1164.96, 938.971, 0.375], [1317.327, 864.217, 0.318]]\nC: [[1058.02, 1197.606, 0.255], [1412.723, 1041.94, 0.385], [1413.334, 1081.562, 0.344], [1284.333, 1092.197, 0.438]]\nD: [[1469.903, 1189.502, 0.313], [1314.332, 1032.81, 0.399], [1118.592, 1102.621, 0.281], [1269.138, 1091.852, 0.359]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1275.825, 1026.459, 0.275], [1278.063, 1029.09, 0.375], [1280.981, 1032.367, 0.325], [1283.902, 1035.648, 0.374]]\nB: [[1030.171, 1170.182, 0.27], [1123.271, 830.3, 0.339], [1164.96, 938.971, 0.375], [1317.327, 864.217, 0.318]]\nC: [[1058.02, 1197.606, 0.255], [1412.723, 1041.94, 0.385], [1413.334, 1081.562, 0.344], [1284.333, 1092.197, 0.438]]\nD: [[1469.903, 1189.502, 0.313], [1314.332, 1032.81, 0.399], [1118.592, 1102.621, 0.281], [1269.138, 1091.852, 0.359]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_131_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_131_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_131_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_131_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_131_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_131_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_131_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_131_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[421.972, 1212.966, 0.431], [433.3, 1273.552, 0.507], [464.634, 1128.046, 0.702], [349.601, 1228.731, 0.578]]\nB: [[400.984, 1376.129, 0.581], [391.113, 1165.646, 0.7], [457.469, 1280.832, 0.616], [442.522, 1062.927, 0.701]]\nC: [[449.392, 986.304, 0.601], [473.649, 1081.286, 0.52], [358.026, 1320.626, 0.568], [395.395, 1377.932, 0.573]]\nD: [[399.773, 1169.799, 0.536], [399.773, 1169.799, 0.586], [399.773, 1169.799, 0.636], [399.773, 1169.799, 0.681]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[421.972, 1212.966, 0.431], [433.3, 1273.552, 0.507], [464.634, 1128.046, 0.702], [349.601, 1228.731, 0.578]]\nB: [[400.984, 1376.129, 0.581], [391.113, 1165.646, 0.7], [457.469, 1280.832, 0.616], [442.522, 1062.927, 0.701]]\nC: [[449.392, 986.304, 0.601], [473.649, 1081.286, 0.52], [358.026, 1320.626, 0.568], [395.395, 1377.932, 0.573]]\nD: [[399.773, 1169.799, 0.536], [399.773, 1169.799, 0.586], [399.773, 1169.799, 0.636], [399.773, 1169.799, 0.681]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_132_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_132_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_132_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_132_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_132_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_132_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_132_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_132_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[417.658, 1171.585, 1.02], [344.402, 1146.41, 1.185], [367.351, 1178.349, 1.028], [487.83, 1043.697, 0.98]]\nB: [[445.605, 1419.113, 0.98], [352.2, 1288.12, 1.051], [459.196, 983.633, 1.107], [368.22, 1292.263, 1.32]]\nC: [[450.048, 
1344.51, 1.16], [403.323, 1079.3, 1.248], [335.599, 1292.674, 1.335], [385.64, 1056.834, 1.11]]\nD: [[419.296, 1191.476, 1.11], [418.846, 1191.58, 1.143], [418.293, 1191.727, 1.176], [417.52, 1191.939, 1.21]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[417.658, 1171.585, 1.02], [344.402, 1146.41, 1.185], [367.351, 1178.349, 1.028], [487.83, 1043.697, 0.98]]\nB: [[445.605, 1419.113, 0.98], [352.2, 1288.12, 1.051], [459.196, 983.633, 1.107], [368.22, 1292.263, 1.32]]\nC: [[450.048, 1344.51, 1.16], [403.323, 1079.3, 1.248], [335.599, 1292.674, 1.335], [385.64, 1056.834, 1.11]]\nD: [[419.296, 1191.476, 1.11], [418.846, 1191.58, 1.143], [418.293, 1191.727, 1.176], [417.52, 1191.939, 1.21]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_133_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_133_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_133_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_133_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_133_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_133_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_133_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_133_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[364.455, 946.84, 0.64], [334.117, 1088.62, 0.558], [343.529, 1215.1, 0.571], [332.561, 991.64, 0.55]]\nB: [[398.222, 1166.03, 0.56], [398.222, 1166.03, 0.577], [398.222, 1166.03, 0.594], [398.222, 1166.03, 0.61]]\nC: [[452.892, 1109.11, 0.63], [362.882, 1081.56, 0.574], [328.005, 1052.37, 0.65], [326.765, 997.91, 0.68]]\nD: [[389.913, 1383.18, 0.51], [334.65, 1310.36, 0.682], [445.091, 1036.45, 0.591], [404.94, 1152.47, 0.57]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[364.455, 946.84, 0.64], [334.117, 1088.62, 0.558], [343.529, 1215.1, 0.571], [332.561, 991.64, 0.55]]\nB: [[398.222, 1166.03, 0.56], [398.222, 1166.03, 0.577], [398.222, 1166.03, 0.594], [398.222, 1166.03, 0.61]]\nC: [[452.892, 1109.11, 0.63], [362.882, 1081.56, 0.574], [328.005, 1052.37, 0.65], [326.765, 997.91, 0.68]]\nD: [[389.913, 1383.18, 0.51], [334.65, 1310.36, 0.682], [445.091, 1036.45, 0.591], [404.94, 1152.47, 0.57]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_134_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_134_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_134_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_134_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_134_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_134_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_134_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_134_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[627.271, 1619.557, -0.161], [627.006, 1619.87, -0.051], [626.747, 1620.187, 0.11], [626.52, 1620.528, 0.17]]\nB: [[569.336, 1657.23, -0.136], [526.963, 1384.47, -0.061], [669.247, 1891.64, 0.11], [671.16, 1857.428, 0.15]]\nC: [[684.005, 1527.275, -0.146], [739.824, 1494.52, -0.06], [521.003, 1884.978, 0.09], [553.11, 1840.593, 0.19]]\nD: [[532.728, 1841.748, -0.144], [536.854, 1368.26, -0.059], [622.506, 1400.948, 0.12], [562.38, 1942.023, 0.18]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[627.271, 1619.557, -0.161], [627.006, 1619.87, -0.051], [626.747, 1620.187, 0.11], [626.52, 1620.528, 0.17]]\nB: [[569.336, 1657.23, -0.136], [526.963, 1384.47, -0.061], [669.247, 1891.64, 0.11], [671.16, 1857.428, 0.15]]\nC: [[684.005, 1527.275, -0.146], [739.824, 1494.52, -0.06], [521.003, 1884.978, 0.09], [553.11, 1840.593, 0.19]]\nD: [[532.728, 1841.748, -0.144], [536.854, 1368.26, -0.059], [622.506, 1400.948, 0.12], [562.38, 1942.023, 0.18]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_135_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_135_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_135_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_135_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_135_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_135_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_135_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_135_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1276.426, 1070.932, 0.876], [1276.425, 1070.932, 0.877], [1276.424, 1070.932, 0.878], [1276.423, 1070.932, 0.879]]\nB: [[1298.613, 1048.861, 0.963], [1211.744, 1284.0, 0.977], [1133.349, 1252.098, 0.958], [1442.465, 1081.694, 0.942]]\nC: 
[[1136.57, 1184.933, 0.959], [1263.407, 1137.283, 0.74], [1237.716, 1079.234, 0.996], [1254.286, 1092.816, 1.0]]\nD: [[1156.908, 984.436, 0.862], [1293.574, 1008.462, 0.755], [1072.394, 1109.853, 0.763], [1158.181, 1086.592, 0.975]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1276.426, 1070.932, 0.876], [1276.425, 1070.932, 0.877], [1276.424, 1070.932, 0.878], [1276.423, 1070.932, 0.879]]\nB: [[1298.613, 1048.861, 0.963], [1211.744, 1284.0, 0.977], [1133.349, 1252.098, 0.958], [1442.465, 1081.694, 0.942]]\nC: [[1136.57, 1184.933, 0.959], [1263.407, 1137.283, 0.74], [1237.716, 1079.234, 0.996], [1254.286, 1092.816, 1.0]]\nD: [[1156.908, 984.436, 0.862], [1293.574, 1008.462, 0.755], [1072.394, 1109.853, 0.763], [1158.181, 1086.592, 0.975]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_136_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_136_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_136_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_136_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_136_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_136_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_136_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_136_7.png" + ], + "output": "A" + 
}, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1863.527, 866.104, 1.229], [1863.553, 866.65, 1.085], [1863.585, 867.332, 1.016], [1863.611, 868.023, 1.0]]\nB: [[1741.116, 973.52, 1.473], [1586.126, 927.91, 1.219], [1837.83, 816.557, 1.029], [1765.354, 1012.863, 0.8]]\nC: [[2109.608, 749.973, 1.352], [2151.463, 723.35, 1.155], [2081.946, 774.988, 1.039], [1584.067, 819.061, 1.0]]\nD: [[1619.868, 705.702, 1.426], [2015.736, 882.4, 1.072], [1611.742, 1030.157, 1.001], [1809.71, 882.281, 0.9]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1863.527, 866.104, 1.229], [1863.553, 866.65, 1.085], [1863.585, 867.332, 1.016], [1863.611, 868.023, 1.0]]\nB: [[1741.116, 973.52, 1.473], [1586.126, 927.91, 1.219], [1837.83, 816.557, 1.029], [1765.354, 1012.863, 0.8]]\nC: [[2109.608, 749.973, 1.352], [2151.463, 723.35, 1.155], [2081.946, 774.988, 1.039], [1584.067, 819.061, 1.0]]\nD: [[1619.868, 705.702, 1.426], [2015.736, 882.4, 1.072], [1611.742, 1030.157, 1.001], [1809.71, 882.281, 0.9]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_137_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_137_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_137_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_137_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_137_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_137_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_137_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_137_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[370.828, 1093.568, 0.573], [438.229, 1239.617, 0.497], [355.298, 1015.971, 0.498], [469.743, 1216.196, 0.622]]\nB: [[334.534, 1298.472, 0.487], [330.759, 1369.516, 0.441], [394.543, 1079.174, 0.619], [471.577, 1146.247, 0.639]]\nC: [[394.842, 1158.711, 0.487], [394.842, 1158.711, 0.521], [394.842, 1158.711, 0.554], [394.842, 1158.711, 0.587]]\nD: [[370.596, 976.597, 0.509], [364.598, 996.341, 0.435], [427.969, 1274.101, 0.549], [391.146, 1206.744, 0.606]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[370.828, 1093.568, 0.573], [438.229, 1239.617, 0.497], [355.298, 1015.971, 0.498], [469.743, 1216.196, 0.622]]\nB: [[334.534, 1298.472, 0.487], [330.759, 1369.516, 0.441], [394.543, 1079.174, 0.619], [471.577, 1146.247, 0.639]]\nC: [[394.842, 1158.711, 0.487], [394.842, 1158.711, 0.521], [394.842, 1158.711, 0.554], [394.842, 1158.711, 0.587]]\nD: [[370.596, 976.597, 0.509], [364.598, 996.341, 0.435], [427.969, 1274.101, 0.549], [391.146, 1206.744, 0.606]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_138_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_138_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_138_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_138_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_138_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_138_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_138_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_138_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[350.427, 1144.305, 0.623], [349.868, 1144.535, 0.69], [349.308, 1144.766, 0.756], [348.749, 1144.996, 0.823]]\nB: [[364.104, 961.597, 0.533], [321.289, 1034.564, 0.81], [289.738, 1178.466, 0.654], [369.278, 927.402, 0.746]]\nC: 
[[301.407, 1085.027, 0.561], [353.922, 1230.167, 0.74], [385.078, 1056.365, 0.831], [353.967, 1321.653, 0.933]]\nD: [[332.499, 1323.247, 0.603], [328.44, 1217.95, 0.71], [304.408, 1248.393, 0.704], [312.725, 1041.977, 0.788]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[350.427, 1144.305, 0.623], [349.868, 1144.535, 0.69], [349.308, 1144.766, 0.756], [348.749, 1144.996, 0.823]]\nB: [[364.104, 961.597, 0.533], [321.289, 1034.564, 0.81], [289.738, 1178.466, 0.654], [369.278, 927.402, 0.746]]\nC: [[301.407, 1085.027, 0.561], [353.922, 1230.167, 0.74], [385.078, 1056.365, 0.831], [353.967, 1321.653, 0.933]]\nD: [[332.499, 1323.247, 0.603], [328.44, 1217.95, 0.71], [304.408, 1248.393, 0.704], [312.725, 1041.977, 0.788]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_139_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_139_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_139_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_139_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_139_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_139_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_139_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_139_7.png" + ], + "output": "A" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1098.816, 1040.956, 2.25], [1517.027, 1201.267, 2.159], [1287.68, 1114.311, 1.84], [1553.384, 936.614, 1.891]]\nB: [[1537.939, 1003.593, 1.619], [1107.39, 826.486, 1.866], [1160.283, 877.892, 2.283], [1427.746, 1058.395, 2.165]]\nC: [[1325.647, 1026.158, 1.972], [1325.647, 1026.158, 1.972], [1325.647, 1026.158, 1.972], [1325.647, 1026.158, 1.972]]\nD: [[1454.649, 935.993, 2.155], [1581.869, 969.794, 1.581], [1203.456, 996.196, 2.063], [1385.658, 925.079, 2.322]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1098.816, 1040.956, 2.25], [1517.027, 1201.267, 2.159], [1287.68, 1114.311, 1.84], [1553.384, 936.614, 1.891]]\nB: [[1537.939, 1003.593, 1.619], [1107.39, 826.486, 1.866], [1160.283, 877.892, 2.283], [1427.746, 1058.395, 2.165]]\nC: [[1325.647, 1026.158, 1.972], [1325.647, 1026.158, 1.972], [1325.647, 1026.158, 1.972], [1325.647, 1026.158, 1.972]]\nD: [[1454.649, 935.993, 2.155], [1581.869, 969.794, 1.581], [1203.456, 996.196, 2.063], [1385.658, 925.079, 2.322]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_140_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_140_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_140_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_140_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_140_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_140_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_140_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_140_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[436.144, 1095.893, 0.64], [378.481, 1266.824, 0.483], [426.128, 1098.819, 0.65], [324.34, 949.634, 0.625]]\nB: [[397.389, 1164.192, 0.54], [397.389, 1164.192, 0.565], [397.389, 1164.192, 0.59], [397.389, 1164.192, 0.615]]\nC: [[380.365, 1356.637, 0.44], [339.063, 1111.83, 0.512], [337.584, 979.936, 0.64], [437.254, 1203.389, 0.683]]\nD: [[323.527, 1041.167, 0.63], [470.94, 1158.877, 0.637], [366.836, 1001.327, 0.61], [420.137, 1320.47, 0.577]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[436.144, 1095.893, 0.64], [378.481, 1266.824, 0.483], [426.128, 1098.819, 0.65], [324.34, 949.634, 0.625]]\nB: [[397.389, 1164.192, 0.54], [397.389, 1164.192, 0.565], [397.389, 1164.192, 0.59], [397.389, 1164.192, 0.615]]\nC: [[380.365, 1356.637, 0.44], [339.063, 1111.83, 0.512], [337.584, 979.936, 0.64], [437.254, 1203.389, 0.683]]\nD: [[323.527, 1041.167, 0.63], [470.94, 1158.877, 0.637], [366.836, 1001.327, 0.61], [420.137, 1320.47, 0.577]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_141_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_141_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_141_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_141_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_141_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_141_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_141_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_141_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[345.303, 927.986, 0.6], [322.702, 926.62, 0.836], [322.388, 1098.752, 0.715], [411.956, 1203.533, 1.062]]\nB: [[360.095, 1122.376, 0.7], [360.061, 1122.39, 0.773], [360.027, 1122.404, 0.846], [359.993, 1122.417, 0.918]]\nC: [[325.719, 970.767, 
0.8], [352.264, 988.9, 0.891], [370.173, 1212.852, 0.847], [306.427, 1052.878, 0.831]]\nD: [[408.76, 1154.201, 0.8], [384.285, 1027.05, 0.647], [381.462, 1131.647, 0.827], [348.63, 1106.215, 0.947]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[345.303, 927.986, 0.6], [322.702, 926.62, 0.836], [322.388, 1098.752, 0.715], [411.956, 1203.533, 1.062]]\nB: [[360.095, 1122.376, 0.7], [360.061, 1122.39, 0.773], [360.027, 1122.404, 0.846], [359.993, 1122.417, 0.918]]\nC: [[325.719, 970.767, 0.8], [352.264, 988.9, 0.891], [370.173, 1212.852, 0.847], [306.427, 1052.878, 0.831]]\nD: [[408.76, 1154.201, 0.8], [384.285, 1027.05, 0.647], [381.462, 1131.647, 0.827], [348.63, 1106.215, 0.947]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_142_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_142_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_142_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_142_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_142_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_142_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_142_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_142_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[342.004, 1040.88, 0.744], [329.607, 1045.03, 0.684], [379.212, 1250.647, 0.868], [326.815, 1264.503, 0.847]]\nB: [[356.724, 1113.785, 0.625], [356.749, 1113.855, 0.775], [356.778, 1113.889, 0.975], [356.785, 1113.897, 1.025]]\nC: [[412.74, 910.82, 0.676], [365.411, 1210.523, 0.664], [303.937, 1114.862, 0.873], [419.175, 1333.448, 0.84]]\nD: [[295.038, 1240.739, 0.504], [341.219, 1044.42, 0.812], [352.463, 1064.815, 1.125], [371.869, 1069.702, 0.977]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[342.004, 1040.88, 0.744], [329.607, 1045.03, 0.684], [379.212, 1250.647, 0.868], [326.815, 1264.503, 0.847]]\nB: [[356.724, 1113.785, 0.625], [356.749, 1113.855, 0.775], [356.778, 1113.889, 0.975], [356.785, 1113.897, 1.025]]\nC: [[412.74, 910.82, 0.676], [365.411, 1210.523, 0.664], [303.937, 1114.862, 0.873], [419.175, 1333.448, 0.84]]\nD: [[295.038, 1240.739, 0.504], [341.219, 1044.42, 0.812], [352.463, 1064.815, 1.125], [371.869, 1069.702, 0.977]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_143_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_143_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_143_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_143_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_143_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_143_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_143_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_143_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1394.419, 833.646, 0.583], [1328.905, 1140.762, 0.779], [1153.453, 1205.926, 0.657], [1146.98, 1211.025, 0.624]]\nB: [[1313.096, 1036.989, 0.652], [1313.096, 1036.989, 0.652], [1313.096, 1036.989, 0.652], [1313.096, 1036.989, 0.652]]\nC: [[1268.974, 982.967, 0.746], [1564.315, 845.479, 0.727], [1340.978, 1034.082, 0.715], [1495.256, 1214.335, 0.578]]\nD: [[1459.081, 1060.013, 0.734], [1278.902, 880.579, 0.542], [1345.294, 988.27, 0.706], [1466.974, 993.897, 0.65]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1394.419, 833.646, 0.583], [1328.905, 1140.762, 0.779], [1153.453, 1205.926, 0.657], [1146.98, 1211.025, 0.624]]\nB: [[1313.096, 1036.989, 0.652], [1313.096, 1036.989, 0.652], [1313.096, 1036.989, 0.652], [1313.096, 1036.989, 0.652]]\nC: [[1268.974, 982.967, 0.746], [1564.315, 845.479, 0.727], [1340.978, 1034.082, 0.715], [1495.256, 1214.335, 0.578]]\nD: [[1459.081, 1060.013, 0.734], [1278.902, 880.579, 0.542], [1345.294, 988.27, 0.706], [1466.974, 993.897, 0.65]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_144_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_144_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_144_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_144_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_144_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_144_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_144_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_144_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[483.59, 928.44, 0.641], [429.525, 1137.919, 0.585], [431.636, 1075.281, 0.417], [368.098, 942.18, 0.409]]\nB: [[506.828, 1075.62, 0.78], [523.808, 941.549, 0.581], [482.43, 968.916, 0.331], [419.206, 1128.103, 0.389]]\nC: 
[[440.798, 1086.59, 0.718], [440.809, 1086.616, 0.568], [440.809, 1086.616, 0.368], [440.809, 1086.616, 0.368]]\nD: [[495.844, 942.38, 0.674], [400.153, 884.092, 0.599], [357.631, 872.728, 0.373], [508.512, 889.858, 0.32]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[483.59, 928.44, 0.641], [429.525, 1137.919, 0.585], [431.636, 1075.281, 0.417], [368.098, 942.18, 0.409]]\nB: [[506.828, 1075.62, 0.78], [523.808, 941.549, 0.581], [482.43, 968.916, 0.331], [419.206, 1128.103, 0.389]]\nC: [[440.798, 1086.59, 0.718], [440.809, 1086.616, 0.568], [440.809, 1086.616, 0.368], [440.809, 1086.616, 0.368]]\nD: [[495.844, 942.38, 0.674], [400.153, 884.092, 0.599], [357.631, 872.728, 0.373], [508.512, 889.858, 0.32]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_145_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_145_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_145_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_145_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_145_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_145_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_145_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_145_7.png" + ], + "output": "C" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[431.795, 1089.293, 0.732], [431.79, 1089.298, 0.611], [431.763, 1089.236, 0.548], [431.76, 1089.235, 0.741]]\nB: [[430.055, 1135.006, 0.834], [362.83, 1266.13, 0.535], [510.725, 963.311, 0.449], [354.76, 1199.812, 0.852]]\nC: [[374.034, 1232.506, 0.835], [446.62, 1198.857, 0.654], [454.385, 1036.14, 0.539], [461.62, 1215.977, 0.65]]\nD: [[426.107, 1134.166, 0.869], [469.51, 941.329, 0.702], [490.047, 990.374, 0.543], [356.18, 1025.654, 0.728]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[431.795, 1089.293, 0.732], [431.79, 1089.298, 0.611], [431.763, 1089.236, 0.548], [431.76, 1089.235, 0.741]]\nB: [[430.055, 1135.006, 0.834], [362.83, 1266.13, 0.535], [510.725, 963.311, 0.449], [354.76, 1199.812, 0.852]]\nC: [[374.034, 1232.506, 0.835], [446.62, 1198.857, 0.654], [454.385, 1036.14, 0.539], [461.62, 1215.977, 0.65]]\nD: [[426.107, 1134.166, 0.869], [469.51, 941.329, 0.702], [490.047, 990.374, 0.543], [356.18, 1025.654, 0.728]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_146_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_146_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_146_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_146_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_146_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_146_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_146_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_146_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1042.405, 896.49, 0.459], [1248.26, 1026.303, 0.519], [1554.895, 994.696, 0.5], [1480.656, 1055.338, 0.476]]\nB: [[1168.242, 1176.462, 0.406], [1219.42, 1198.448, 0.487], [1280.042, 1223.429, 0.5], [1301.768, 1040.639, 0.468]]\nC: [[1293.229, 1033.246, 0.388], [1296.13, 1035.001, 0.465], [1296.744, 1035.285, 0.5], [1297.358, 1035.569, 0.535]]\nD: [[1205.688, 1036.352, 0.443], [1283.41, 852.956, 0.509], [1152.749, 895.819, 0.4], [1317.874, 1012.868, 0.584]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1042.405, 896.49, 0.459], [1248.26, 1026.303, 0.519], [1554.895, 994.696, 0.5], [1480.656, 1055.338, 0.476]]\nB: [[1168.242, 1176.462, 0.406], [1219.42, 1198.448, 0.487], [1280.042, 1223.429, 0.5], [1301.768, 1040.639, 0.468]]\nC: [[1293.229, 1033.246, 0.388], [1296.13, 1035.001, 0.465], [1296.744, 1035.285, 0.5], [1297.358, 1035.569, 0.535]]\nD: [[1205.688, 1036.352, 0.443], [1283.41, 852.956, 0.509], [1152.749, 895.819, 0.4], [1317.874, 1012.868, 0.584]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_147_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_147_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_147_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_147_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_147_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_147_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_147_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_147_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[2143.23, 880.092, 1.463], [2237.79, 816.687, 1.792], [2021.67, 1004.854, 1.441], [2013.58, 762.901, 1.623]]\nB: [[2030.68, 939.082, 1.556], [1707.64, 827.801, 1.543], [2187.71, 957.754, 1.703], [1734.52, 730.487, 1.539]]\nC: 
[[2157.8, 869.883, 1.527], [1960.08, 1008.147, 1.759], [1779.88, 929.643, 1.772], [1538.45, 884.771, 1.889]]\nD: [[1907.37, 864.452, 1.647], [1907.37, 864.452, 1.647], [1907.37, 864.452, 1.647], [1907.37, 864.452, 1.647]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[2143.23, 880.092, 1.463], [2237.79, 816.687, 1.792], [2021.67, 1004.854, 1.441], [2013.58, 762.901, 1.623]]\nB: [[2030.68, 939.082, 1.556], [1707.64, 827.801, 1.543], [2187.71, 957.754, 1.703], [1734.52, 730.487, 1.539]]\nC: [[2157.8, 869.883, 1.527], [1960.08, 1008.147, 1.759], [1779.88, 929.643, 1.772], [1538.45, 884.771, 1.889]]\nD: [[1907.37, 864.452, 1.647], [1907.37, 864.452, 1.647], [1907.37, 864.452, 1.647], [1907.37, 864.452, 1.647]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_148_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_148_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_148_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_148_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_148_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_148_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_148_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_148_7.png" + ], + "output": "D" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[333.441, 1111.848, 0.881], [358.68, 1279.096, 0.848], [365.642, 1024.723, 1.135], [416.842, 1221.204, 1.061]]\nB: [[359.553, 1105.178, 0.934], [359.553, 1105.178, 1.045], [359.553, 1105.178, 1.082], [359.553, 1105.178, 1.005]]\nC: [[401.411, 1302.101, 0.9], [401.716, 1050.934, 1.205], [369.114, 1004.72, 0.951], [298.807, 954.194, 1.162]]\nD: [[326.721, 1063.94, 1.102], [316.366, 1058.028, 0.884], [321.691, 954.121, 1.273], [352.284, 1064.278, 1.172]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[333.441, 1111.848, 0.881], [358.68, 1279.096, 0.848], [365.642, 1024.723, 1.135], [416.842, 1221.204, 1.061]]\nB: [[359.553, 1105.178, 0.934], [359.553, 1105.178, 1.045], [359.553, 1105.178, 1.082], [359.553, 1105.178, 1.005]]\nC: [[401.411, 1302.101, 0.9], [401.716, 1050.934, 1.205], [369.114, 1004.72, 0.951], [298.807, 954.194, 1.162]]\nD: [[326.721, 1063.94, 1.102], [316.366, 1058.028, 0.884], [321.691, 954.121, 1.273], [352.284, 1064.278, 1.172]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_149_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_149_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_149_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_149_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_149_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_149_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_149_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_149_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1914.957, 873.014, 0.241], [1914.951, 872.993, 0.241], [1914.944, 872.972, 0.241], [1914.937, 872.951, 0.241]]\nB: [[2029.589, 880.527, 0.258], [1539.423, 812.215, 0.227], [2219.176, 984.894, 0.264], [2240.588, 737.815, 0.2]]\nC: [[2084.127, 771.252, 0.23], [1889.253, 1036.727, 0.264], [1713.036, 984.106, 0.254], [1867.742, 808.403, 0.199]]\nD: [[1951.845, 910.003, 0.272], [2067.414, 726.492, 0.226], [1574.866, 827.537, 0.223], [2080.281, 1029.475, 0.221]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1914.957, 873.014, 0.241], [1914.951, 872.993, 0.241], [1914.944, 872.972, 0.241], [1914.937, 872.951, 0.241]]\nB: [[2029.589, 880.527, 0.258], [1539.423, 812.215, 0.227], [2219.176, 984.894, 0.264], [2240.588, 737.815, 0.2]]\nC: [[2084.127, 771.252, 0.23], [1889.253, 1036.727, 0.264], [1713.036, 984.106, 0.254], [1867.742, 808.403, 0.199]]\nD: [[1951.845, 910.003, 0.272], [2067.414, 726.492, 0.226], [1574.866, 827.537, 0.223], [2080.281, 1029.475, 0.221]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_150_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_150_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_150_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_150_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_150_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_150_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_150_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_150_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[775.145, 1609.526, 0.275], [728.049, 1548.593, 0.225], [726.281, 1889.554, 0.225], [701.777, 1569.883, 0.285]]\nB: [[635.753, 1733.477, 0.272], [720.552, 1845.796, 0.237], [810.826, 1848.81, 0.272], [673.334, 1345.738, 0.236]]\nC: 
[[563.331, 1620.514, 0.207], [692.265, 1578.918, 0.262], [633.015, 1756.886, 0.242], [605.445, 1415.589, 0.246]]\nD: [[696.721, 1578.786, 0.244], [696.693, 1578.758, 0.244], [696.674, 1578.723, 0.244], [696.664, 1578.684, 0.244]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[775.145, 1609.526, 0.275], [728.049, 1548.593, 0.225], [726.281, 1889.554, 0.225], [701.777, 1569.883, 0.285]]\nB: [[635.753, 1733.477, 0.272], [720.552, 1845.796, 0.237], [810.826, 1848.81, 0.272], [673.334, 1345.738, 0.236]]\nC: [[563.331, 1620.514, 0.207], [692.265, 1578.918, 0.262], [633.015, 1756.886, 0.242], [605.445, 1415.589, 0.246]]\nD: [[696.721, 1578.786, 0.244], [696.693, 1578.758, 0.244], [696.674, 1578.723, 0.244], [696.664, 1578.684, 0.244]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_151_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_151_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_151_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_151_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_151_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_151_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_151_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_151_7.png" + ], + "output": "D" + }, + { + 
"task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[650.646, 1603.342, 0.03], [650.62, 1603.362, 0.13], [650.688, 1603.329, 0.308], [650.756, 1603.295, 0.485]]\nB: [[567.786, 1547.911, 0.04], [679.87, 1773.392, 0.15], [682.07, 1467.32, 0.353], [776.72, 1507.964, 0.528]]\nC: [[765.242, 1777.09, 0.03], [719.37, 1669.37, 0.15], [581.778, 1910.816, 0.336], [602.521, 1760.712, 0.392]]\nD: [[522.813, 1797.126, 0.03], [721.02, 1459.791, 0.11], [758.676, 1452.614, 0.344], [628.48, 1519.176, 0.474]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[650.646, 1603.342, 0.03], [650.62, 1603.362, 0.13], [650.688, 1603.329, 0.308], [650.756, 1603.295, 0.485]]\nB: [[567.786, 1547.911, 0.04], [679.87, 1773.392, 0.15], [682.07, 1467.32, 0.353], [776.72, 1507.964, 0.528]]\nC: [[765.242, 1777.09, 0.03], [719.37, 1669.37, 0.15], [581.778, 1910.816, 0.336], [602.521, 1760.712, 0.392]]\nD: [[522.813, 1797.126, 0.03], [721.02, 1459.791, 0.11], [758.676, 1452.614, 0.344], [628.48, 1519.176, 0.474]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_152_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_152_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_152_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_152_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_152_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_152_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_152_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_152_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1344.72, 1281.191, 3.09], [1075.39, 972.846, 3.969], [1227.93, 967.953, 2.882], [1313.85, 1191.672, 2.766]]\nB: [[1340.58, 1102.095, 3.358], [1340.58, 1102.095, 3.358], [1340.56, 1102.069, 3.358], [1340.56, 1102.069, 3.358]]\nC: [[1595.5, 1117.589, 3.115], [1363.06, 1051.692, 3.599], [1333.05, 1281.644, 3.54], [1459.61, 1297.063, 3.45]]\nD: [[1406.68, 1037.893, 3.818], [1555.01, 1107.104, 3.069], [1222.57, 1178.434, 3.387], [1139.88, 1136.126, 3.194]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1344.72, 1281.191, 3.09], [1075.39, 972.846, 3.969], [1227.93, 967.953, 2.882], [1313.85, 1191.672, 2.766]]\nB: [[1340.58, 1102.095, 3.358], [1340.58, 1102.095, 3.358], [1340.56, 1102.069, 3.358], [1340.56, 1102.069, 3.358]]\nC: [[1595.5, 1117.589, 3.115], [1363.06, 1051.692, 3.599], [1333.05, 1281.644, 3.54], [1459.61, 1297.063, 3.45]]\nD: [[1406.68, 1037.893, 3.818], [1555.01, 1107.104, 3.069], [1222.57, 1178.434, 3.387], [1139.88, 1136.126, 3.194]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_153_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_153_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_153_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_153_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_153_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_153_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_153_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_153_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[366.659, 1011.101, 1.22], [364.315, 1040.812, 1.078], [399.107, 1299.752, 1.185], [296.881, 916.886, 1.47]]\nB: [[363.259, 1094.238, 1.247], [363.277, 1094.229, 1.276], [363.296, 1094.221, 1.306], [363.315, 1094.212, 1.335]]\nC: 
[[349.227, 1179.598, 1.249], [388.454, 911.85, 1.32], [338.754, 1093.699, 1.127], [427.119, 948.416, 1.102]]\nD: [[420.809, 960.294, 1.148], [382.372, 1303.064, 1.394], [429.821, 1184.841, 1.121], [362.615, 1271.967, 1.2]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[366.659, 1011.101, 1.22], [364.315, 1040.812, 1.078], [399.107, 1299.752, 1.185], [296.881, 916.886, 1.47]]\nB: [[363.259, 1094.238, 1.247], [363.277, 1094.229, 1.276], [363.296, 1094.221, 1.306], [363.315, 1094.212, 1.335]]\nC: [[349.227, 1179.598, 1.249], [388.454, 911.85, 1.32], [338.754, 1093.699, 1.127], [427.119, 948.416, 1.102]]\nD: [[420.809, 960.294, 1.148], [382.372, 1303.064, 1.394], [429.821, 1184.841, 1.121], [362.615, 1271.967, 1.2]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_154_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_154_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_154_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_154_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_154_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_154_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_154_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_154_7.png" + ], + "output": "B" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1954.841, 997.5, 0.236], [1604.826, 1018.173, 0.282], [2056.441, 825.549, 0.202], [2053.148, 806.675, 0.206]]\nB: [[1911.473, 872.92, 0.247], [1911.473, 872.927, 0.247], [1911.473, 872.935, 0.247], [1911.473, 872.912, 0.247]]\nC: [[1627.59, 704.51, 0.221], [1743.247, 820.718, 0.239], [1954.356, 974.622, 0.27], [1682.124, 823.985, 0.25]]\nD: [[1859.848, 855.57, 0.227], [1677.344, 943.885, 0.289], [1603.535, 883.7, 0.263], [2177.549, 800.573, 0.251]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1954.841, 997.5, 0.236], [1604.826, 1018.173, 0.282], [2056.441, 825.549, 0.202], [2053.148, 806.675, 0.206]]\nB: [[1911.473, 872.92, 0.247], [1911.473, 872.927, 0.247], [1911.473, 872.935, 0.247], [1911.473, 872.912, 0.247]]\nC: [[1627.59, 704.51, 0.221], [1743.247, 820.718, 0.239], [1954.356, 974.622, 0.27], [1682.124, 823.985, 0.25]]\nD: [[1859.848, 855.57, 0.227], [1677.344, 943.885, 0.289], [1603.535, 883.7, 0.263], [2177.549, 800.573, 0.251]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_155_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_155_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_155_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_155_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_155_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_155_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_155_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_155_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[405.086, 1259.825, 0.988], [341.109, 1024.661, 1.196], [400.072, 1270.26, 0.973], [361.016, 1307.675, 1.222]]\nB: [[493.579, 972.185, 0.834], [476.692, 1075.558, 0.912], [401.433, 969.577, 0.981], [403.596, 1269.671, 1.032]]\nC: [[431.9, 1414.092, 1.036], [482.304, 1353.899, 0.879], [441.439, 1311.735, 0.916], [400.616, 1079.275, 1.136]]\nD: [[417.374, 1192.132, 0.961], [416.718, 1192.286, 1.011], [416.058, 1192.412, 1.061], [415.392, 1192.512, 1.111]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[405.086, 1259.825, 0.988], [341.109, 1024.661, 1.196], [400.072, 1270.26, 0.973], [361.016, 1307.675, 1.222]]\nB: [[493.579, 972.185, 0.834], [476.692, 1075.558, 0.912], [401.433, 969.577, 0.981], [403.596, 1269.671, 1.032]]\nC: [[431.9, 1414.092, 1.036], [482.304, 1353.899, 0.879], [441.439, 1311.735, 0.916], [400.616, 1079.275, 1.136]]\nD: [[417.374, 1192.132, 0.961], [416.718, 1192.286, 1.011], [416.058, 1192.412, 1.061], [415.392, 1192.512, 1.111]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_156_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_156_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_156_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_156_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_156_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_156_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_156_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_156_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[326.56, 1024.467, 0.519], [375.34, 1251.894, 0.454], [324.82, 1119.168, 0.508], [372.44, 1074.004, 0.7]]\nB: [[446.37, 1235.831, 0.437], [419.37, 934.045, 0.644], [380.01, 1072.298, 0.649], [417.35, 1264.432, 0.693]]\nC: [[438.81, 
1043.645, 0.444], [435.77, 1350.631, 0.617], [421.85, 1208.417, 0.602], [334.9, 991.841, 0.557]]\nD: [[387.52, 1143.568, 0.508], [387.52, 1143.568, 0.541], [387.52, 1143.568, 0.575], [387.52, 1143.568, 0.608]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[326.56, 1024.467, 0.519], [375.34, 1251.894, 0.454], [324.82, 1119.168, 0.508], [372.44, 1074.004, 0.7]]\nB: [[446.37, 1235.831, 0.437], [419.37, 934.045, 0.644], [380.01, 1072.298, 0.649], [417.35, 1264.432, 0.693]]\nC: [[438.81, 1043.645, 0.444], [435.77, 1350.631, 0.617], [421.85, 1208.417, 0.602], [334.9, 991.841, 0.557]]\nD: [[387.52, 1143.568, 0.508], [387.52, 1143.568, 0.541], [387.52, 1143.568, 0.575], [387.52, 1143.568, 0.608]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_157_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_157_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_157_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_157_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_157_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_157_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_157_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_157_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[482.206, 1161.483, 0.923], [362.768, 1002.877, 1.176], [464.329, 933.744, 1.058], [365.042, 1087.466, 0.59]]\nB: [[395.933, 960.032, 0.91], [390.823, 1104.508, 1.171], [437.592, 1073.569, 0.945], [481.154, 1034.808, 0.54]]\nC: [[418.967, 1094.306, 1.038], [418.951, 1094.348, 1.008], [418.987, 1094.368, 1.068], [418.873, 1094.555, 0.56]]\nD: [[425.42, 1279.316, 1.07], [448.65, 1259.388, 1.115], [445.464, 1156.231, 1.185], [492.603, 965.675, 0.64]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[482.206, 1161.483, 0.923], [362.768, 1002.877, 1.176], [464.329, 933.744, 1.058], [365.042, 1087.466, 0.59]]\nB: [[395.933, 960.032, 0.91], [390.823, 1104.508, 1.171], [437.592, 1073.569, 0.945], [481.154, 1034.808, 0.54]]\nC: [[418.967, 1094.306, 1.038], [418.951, 1094.348, 1.008], [418.987, 1094.368, 1.068], [418.873, 1094.555, 0.56]]\nD: [[425.42, 1279.316, 1.07], [448.65, 1259.388, 1.115], [445.464, 1156.231, 1.185], [492.603, 965.675, 0.64]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_158_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_158_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_158_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_158_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_158_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_158_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_158_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_158_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[2133.097, 774.816, 1.225], [1748.933, 832.577, 1.031], [1567.379, 834.66, 1.265], [1502.433, 691.935, 1.105]]\nB: [[1806.535, 861.266, 1.066], [1807.133, 859.654, 1.066], [1807.563, 858.13, 1.066], [1807.849, 856.657, 1.021]]\nC: [[2099.2, 882.41, 1.014], [2094.1, 791.51, 0.857], [1724.174, 770.83, 0.904], [1636.953, 895.976, 1.042]]\nD: [[1853.024, 785.737, 0.887], [1793.267, 868.356, 1.224], [1854.012, 828.75, 1.266], [2127.184, 793.379, 1.141]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[2133.097, 774.816, 1.225], [1748.933, 832.577, 1.031], [1567.379, 834.66, 1.265], [1502.433, 691.935, 1.105]]\nB: [[1806.535, 861.266, 1.066], [1807.133, 859.654, 1.066], [1807.563, 858.13, 1.066], [1807.849, 856.657, 1.021]]\nC: [[2099.2, 882.41, 1.014], [2094.1, 791.51, 0.857], [1724.174, 770.83, 0.904], [1636.953, 895.976, 1.042]]\nD: [[1853.024, 785.737, 0.887], [1793.267, 868.356, 1.224], [1854.012, 828.75, 1.266], [2127.184, 793.379, 1.141]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_159_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_159_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_159_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_159_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_159_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_159_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_159_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_159_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1039.96, 1235.697, 1.849], [1056.893, 1292.385, 1.963], [1073.252, 1262.199, 2.07], [1057.534, 1122.452, 1.706]]\nB: [[1248.78, 1187.636, 2.366], [1189.102, 1066.566, 2.081], [1235.361, 1032.631, 1.92], [1111.357, 1241.726, 2.171]]\nC: 
[[1147.17, 1172.553, 1.994], [1344.188, 1301.314, 1.937], [974.406, 1174.168, 1.55], [1159.003, 1238.282, 2.257]]\nD: [[1181.87, 1122.359, 2.116], [1185.363, 1119.943, 2.248], [1188.856, 1117.528, 1.93], [1192.349, 1115.113, 1.954]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1039.96, 1235.697, 1.849], [1056.893, 1292.385, 1.963], [1073.252, 1262.199, 2.07], [1057.534, 1122.452, 1.706]]\nB: [[1248.78, 1187.636, 2.366], [1189.102, 1066.566, 2.081], [1235.361, 1032.631, 1.92], [1111.357, 1241.726, 2.171]]\nC: [[1147.17, 1172.553, 1.994], [1344.188, 1301.314, 1.937], [974.406, 1174.168, 1.55], [1159.003, 1238.282, 2.257]]\nD: [[1181.87, 1122.359, 2.116], [1185.363, 1119.943, 2.248], [1188.856, 1117.528, 1.93], [1192.349, 1115.113, 1.954]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_160_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_160_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_160_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_160_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_160_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_160_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_160_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_160_7.png" + ], + "output": "D" + 
}, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[407.79, 1169.713, 1.047], [438.535, 914.451, 0.817], [474.944, 1174.579, 0.711], [407.788, 1311.155, 0.697]]\nB: [[411.096, 1097.949, 1.026], [411.096, 1097.949, 0.806], [411.096, 1097.949, 0.756], [411.181, 1097.914, 0.706]]\nC: [[380.059, 1294.905, 1.061], [462.302, 957.203, 0.843], [360.056, 1036.95, 0.635], [377.308, 1174.326, 0.803]]\nD: [[453.527, 1100.533, 1.002], [399.919, 928.877, 0.842], [458.797, 1077.795, 0.803], [333.443, 1294.271, 0.8]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[407.79, 1169.713, 1.047], [438.535, 914.451, 0.817], [474.944, 1174.579, 0.711], [407.788, 1311.155, 0.697]]\nB: [[411.096, 1097.949, 1.026], [411.096, 1097.949, 0.806], [411.096, 1097.949, 0.756], [411.181, 1097.914, 0.706]]\nC: [[380.059, 1294.905, 1.061], [462.302, 957.203, 0.843], [360.056, 1036.95, 0.635], [377.308, 1174.326, 0.803]]\nD: [[453.527, 1100.533, 1.002], [399.919, 928.877, 0.842], [458.797, 1077.795, 0.803], [333.443, 1294.271, 0.8]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_161_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_161_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_161_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_161_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_161_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_161_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_161_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_161_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[377.184, 977.334, 0.742], [375.911, 1030.014, 0.731], [428.729, 1127.517, 0.56], [402.816, 1027.37, 0.555]]\nB: [[447.851, 943.564, 0.851], [507.44, 1079.2, 0.691], [496.497, 1228.261, 0.42], [422.118, 1234.56, 0.566]]\nC: [[427.284, 1091.127, 0.774], [427.286, 1091.126, 0.707], [427.292, 1091.122, 0.507], [427.294, 1091.12, 0.628]]\nD: [[480.047, 995.531, 0.924], [418.676, 923.092, 0.59], [412.858, 1042.957, 0.421], [343.915, 1040.26, 0.623]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[377.184, 977.334, 0.742], [375.911, 1030.014, 0.731], [428.729, 1127.517, 0.56], [402.816, 1027.37, 0.555]]\nB: [[447.851, 943.564, 0.851], [507.44, 1079.2, 0.691], [496.497, 1228.261, 0.42], [422.118, 1234.56, 0.566]]\nC: [[427.284, 1091.127, 0.774], [427.286, 1091.126, 0.707], [427.292, 1091.122, 0.507], [427.294, 1091.12, 0.628]]\nD: [[480.047, 995.531, 0.924], [418.676, 923.092, 0.59], [412.858, 1042.957, 0.421], [343.915, 1040.26, 0.623]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_162_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_162_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_162_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_162_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_162_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_162_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_162_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_162_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1470.303, 1200.503, -0.117], [1211.565, 906.028, -0.109], [1265.671, 860.24, -0.139], [1225.185, 861.066, -0.166]]\nB: [[1121.534, 848.02, -0.101], [1300.088, 882.168, -0.116], [1372.338, 987.41, -0.161], [1121.468, 1062.994, -0.117]]\nC: 
[[1239.215, 1012.078, -0.106], [1239.169, 1012.039, -0.126], [1239.146, 1012.02, -0.136], [1239.123, 1012.001, -0.146]]\nD: [[1353.598, 917.259, -0.096], [1218.603, 1014.25, -0.105], [1110.323, 1116.23, -0.13], [1224.972, 1200.395, -0.138]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1470.303, 1200.503, -0.117], [1211.565, 906.028, -0.109], [1265.671, 860.24, -0.139], [1225.185, 861.066, -0.166]]\nB: [[1121.534, 848.02, -0.101], [1300.088, 882.168, -0.116], [1372.338, 987.41, -0.161], [1121.468, 1062.994, -0.117]]\nC: [[1239.215, 1012.078, -0.106], [1239.169, 1012.039, -0.126], [1239.146, 1012.02, -0.136], [1239.123, 1012.001, -0.146]]\nD: [[1353.598, 917.259, -0.096], [1218.603, 1014.25, -0.105], [1110.323, 1116.23, -0.13], [1224.972, 1200.395, -0.138]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_163_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_163_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_163_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_163_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_163_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_163_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_163_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_163_7.png" + ], 
+ "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[390.311, 1121.642, 0.818], [389.934, 1120.974, 0.748], [389.624, 1120.4, 0.778], [389.38, 1119.804, 0.858]]\nB: [[315.313, 963.621, 0.732], [377.113, 912.889, 0.734], [351.418, 1317.0, 0.875], [422.52, 1203.28, 0.858]]\nC: [[416.232, 1151.901, 0.879], [421.322, 1109.197, 0.81], [452.884, 1049.1, 0.63], [415.15, 1098.518, 0.99]]\nD: [[432.466, 1326.016, 0.887], [398.73, 1272.899, 0.773], [453.388, 964.3, 0.838], [398.14, 1308.497, 0.972]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[390.311, 1121.642, 0.818], [389.934, 1120.974, 0.748], [389.624, 1120.4, 0.778], [389.38, 1119.804, 0.858]]\nB: [[315.313, 963.621, 0.732], [377.113, 912.889, 0.734], [351.418, 1317.0, 0.875], [422.52, 1203.28, 0.858]]\nC: [[416.232, 1151.901, 0.879], [421.322, 1109.197, 0.81], [452.884, 1049.1, 0.63], [415.15, 1098.518, 0.99]]\nD: [[432.466, 1326.016, 0.887], [398.73, 1272.899, 0.773], [453.388, 964.3, 0.838], [398.14, 1308.497, 0.972]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_164_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_164_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_164_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_164_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_164_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_164_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_164_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_164_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[288.389, 823.36, 1.04], [243.336, 770.263, 1.17], [329.163, 556.194, 0.924], [278.167, 724.049, 1.175]]\nB: [[303.148, 635.853, 0.872], [255.423, 572.035, 1.073], [262.191, 559.079, 0.89], [295.029, 679.27, 0.862]]\nC: [[250.538, 640.039, 0.892], [355.139, 799.298, 1.133], [280.545, 705.593, 1.08], [285.606, 739.161, 1.054]]\nD: [[301.073, 691.921, 1.008], [300.166, 690.701, 1.008], [299.259, 689.481, 1.008], [298.351, 688.262, 1.008]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[288.389, 823.36, 1.04], [243.336, 770.263, 1.17], [329.163, 556.194, 0.924], [278.167, 724.049, 1.175]]\nB: [[303.148, 635.853, 0.872], [255.423, 572.035, 1.073], [262.191, 559.079, 0.89], [295.029, 679.27, 0.862]]\nC: [[250.538, 640.039, 0.892], [355.139, 799.298, 1.133], [280.545, 705.593, 1.08], [285.606, 739.161, 1.054]]\nD: [[301.073, 691.921, 1.008], [300.166, 690.701, 1.008], [299.259, 689.481, 1.008], [298.351, 688.262, 1.008]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_165_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_165_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_165_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_165_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_165_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_165_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_165_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_165_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[374.966, 918.231, 1.798], [317.815, 1228.212, 1.721], [398.89, 875.569, 1.814], [428.79, 1016.473, 1.875]]\nB: [[360.439, 1086.932, 1.56], [381.745, 1179.649, 1.447], [395.985, 1168.142, 1.848], [396.108, 960.271, 1.709]]\nC: [[358.757, 1084.221, 1.582], 
[358.757, 1084.221, 1.575], [358.757, 1084.221, 1.585], [358.757, 1084.221, 1.664]]\nD: [[320.557, 873.472, 1.863], [409.377, 874.584, 1.304], [376.574, 953.049, 1.86], [336.743, 903.665, 1.608]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[374.966, 918.231, 1.798], [317.815, 1228.212, 1.721], [398.89, 875.569, 1.814], [428.79, 1016.473, 1.875]]\nB: [[360.439, 1086.932, 1.56], [381.745, 1179.649, 1.447], [395.985, 1168.142, 1.848], [396.108, 960.271, 1.709]]\nC: [[358.757, 1084.221, 1.582], [358.757, 1084.221, 1.575], [358.757, 1084.221, 1.585], [358.757, 1084.221, 1.664]]\nD: [[320.557, 873.472, 1.863], [409.377, 874.584, 1.304], [376.574, 953.049, 1.86], [336.743, 903.665, 1.608]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_166_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_166_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_166_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_166_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_166_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_166_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_166_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_166_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[2192.318, 842.128, 0.202], [2028.569, 1002.104, 0.362], [1823.353, 915.088, 0.434], [1848.897, 1015.376, 0.66]]\nB: [[2200.193, 887.932, 0.279], [2035.641, 762.88, 0.308], [1865.863, 809.786, 0.584], [1497.967, 1016.346, 0.579]]\nC: [[1842.467, 871.854, 0.249], [1837.266, 871.727, 0.362], [1831.484, 871.587, 0.487], [1825.702, 871.447, 0.612]]\nD: [[1510.791, 938.627, 0.271], [1853.106, 942.739, 0.426], [1927.734, 739.554, 0.479], [1666.087, 996.74, 0.708]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[2192.318, 842.128, 0.202], [2028.569, 1002.104, 0.362], [1823.353, 915.088, 0.434], [1848.897, 1015.376, 0.66]]\nB: [[2200.193, 887.932, 0.279], [2035.641, 762.88, 0.308], [1865.863, 809.786, 0.584], [1497.967, 1016.346, 0.579]]\nC: [[1842.467, 871.854, 0.249], [1837.266, 871.727, 0.362], [1831.484, 871.587, 0.487], [1825.702, 871.447, 0.612]]\nD: [[1510.791, 938.627, 0.271], [1853.106, 942.739, 0.426], [1927.734, 739.554, 0.479], [1666.087, 996.74, 0.708]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_167_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_167_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_167_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_167_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_167_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_167_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_167_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_167_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[342.032, 761.086, -0.593], [347.646, 679.25, -0.401], [364.231, 569.092, -0.328], [289.053, 634.842, -0.205]]\nB: [[301.298, 665.695, -0.633], [284.255, 584.96, -0.505], [255.024, 783.231, -0.315], [245.543, 830.729, -0.175]]\nC: [[311.284, 836.779, -0.649], [370.864, 573.4, -0.497], [360.873, 715.839, -0.301], [249.474, 613.179, -0.21]]\nD: [[312.445, 705.589, -0.612], [310.539, 703.33, -0.479], [308.633, 701.071, -0.346], [306.729, 698.815, -0.214]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[342.032, 761.086, -0.593], [347.646, 679.25, -0.401], [364.231, 569.092, -0.328], [289.053, 634.842, -0.205]]\nB: [[301.298, 665.695, -0.633], [284.255, 584.96, -0.505], [255.024, 783.231, -0.315], [245.543, 830.729, -0.175]]\nC: [[311.284, 836.779, -0.649], [370.864, 573.4, -0.497], [360.873, 715.839, -0.301], [249.474, 613.179, -0.21]]\nD: [[312.445, 705.589, -0.612], [310.539, 703.33, -0.479], [308.633, 701.071, -0.346], [306.729, 698.815, -0.214]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_168_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_168_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_168_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_168_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_168_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_168_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_168_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_168_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[363.209, 1123.747, 1.102], [363.143, 1123.593, 1.085], [363.102, 1123.498, 1.068], [363.137, 1123.58, 1.052]]\nB: [[378.885, 1276.718, 1.19], [435.59, 1222.123, 1.009], [430.113, 989.283, 1.158], [375.367, 990.91, 0.956]]\nC: [[413.521, 
1104.065, 1.115], [428.494, 917.554, 1.251], [320.18, 1021.737, 0.933], [318.09, 1005.13, 1.022]]\nD: [[427.259, 1339.216, 1.291], [315.569, 1079.127, 0.936], [304.042, 1194.504, 1.038], [323.022, 982.56, 0.905]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[363.209, 1123.747, 1.102], [363.143, 1123.593, 1.085], [363.102, 1123.498, 1.068], [363.137, 1123.58, 1.052]]\nB: [[378.885, 1276.718, 1.19], [435.59, 1222.123, 1.009], [430.113, 989.283, 1.158], [375.367, 990.91, 0.956]]\nC: [[413.521, 1104.065, 1.115], [428.494, 917.554, 1.251], [320.18, 1021.737, 0.933], [318.09, 1005.13, 1.022]]\nD: [[427.259, 1339.216, 1.291], [315.569, 1079.127, 0.936], [304.042, 1194.504, 1.038], [323.022, 982.56, 0.905]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_169_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_169_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_169_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_169_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_169_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_169_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_169_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_169_7.png" + ], + "output": "A" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1333.186, 1040.597, 0.635], [1333.186, 1040.597, 0.635], [1333.186, 1040.597, 0.635], [1333.186, 1040.597, 0.635]]\nB: [[1514.635, 950.684, 0.564], [1066.743, 1156.105, 0.658], [1204.49, 1019.642, 0.74], [1089.793, 1017.943, 0.569]]\nC: [[1321.867, 926.129, 0.741], [1261.374, 1241.754, 0.725], [1359.131, 1028.017, 0.564], [1440.895, 941.11, 0.737]]\nD: [[1471.054, 874.495, 0.591], [1148.049, 1089.103, 0.508], [1489.63, 929.92, 0.603], [1503.98, 1037.54, 0.682]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1333.186, 1040.597, 0.635], [1333.186, 1040.597, 0.635], [1333.186, 1040.597, 0.635], [1333.186, 1040.597, 0.635]]\nB: [[1514.635, 950.684, 0.564], [1066.743, 1156.105, 0.658], [1204.49, 1019.642, 0.74], [1089.793, 1017.943, 0.569]]\nC: [[1321.867, 926.129, 0.741], [1261.374, 1241.754, 0.725], [1359.131, 1028.017, 0.564], [1440.895, 941.11, 0.737]]\nD: [[1471.054, 874.495, 0.591], [1148.049, 1089.103, 0.508], [1489.63, 929.92, 0.603], [1503.98, 1037.54, 0.682]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_170_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_170_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_170_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_170_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_170_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_170_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_170_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_170_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[587.007, 1451.429, 1.011], [592.203, 1759.483, 1.117], [647.108, 1795.963, 1.03], [638.355, 1540.243, 0.93]]\nB: [[640.141, 1888.738, 0.846], [726.767, 1525.298, 0.951], [759.989, 1292.629, 0.84], [736.789, 1660.505, 0.979]]\nC: [[666.472, 1849.208, 0.854], [731.041, 1481.554, 0.835], [803.226, 1800.553, 1.04], [593.45, 1762.889, 1.042]]\nD: [[671.323, 1578.674, 0.957], [671.316, 1578.671, 0.964], [671.308, 1578.668, 0.97], [671.301, 1578.665, 0.976]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[587.007, 1451.429, 1.011], [592.203, 1759.483, 1.117], [647.108, 1795.963, 1.03], [638.355, 1540.243, 0.93]]\nB: [[640.141, 1888.738, 0.846], [726.767, 1525.298, 0.951], [759.989, 1292.629, 0.84], [736.789, 1660.505, 0.979]]\nC: [[666.472, 1849.208, 0.854], [731.041, 1481.554, 0.835], [803.226, 1800.553, 1.04], [593.45, 1762.889, 1.042]]\nD: [[671.323, 1578.674, 0.957], [671.316, 1578.671, 0.964], [671.308, 1578.668, 0.97], [671.301, 1578.665, 0.976]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_171_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_171_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_171_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_171_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_171_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_171_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_171_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_171_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[291.787, 679.069, -0.469], [277.905, 732.394, -0.336], [252.638, 766.267, -0.161], [332.611, 636.857, -0.009]]\nB: [[309.014, 658.598, -0.45], [335.072, 790.615, -0.333], [250.147, 699.934, -0.154], [283.681, 656.435, -0.007]]\nC: 
[[307.569, 699.998, -0.459], [305.228, 697.097, -0.309], [302.888, 694.195, -0.159], [300.547, 691.293, -0.008]]\nD: [[306.127, 672.26, -0.442], [332.611, 777.126, -0.361], [293.54, 777.55, -0.16], [277.878, 827.191, -0.008]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[291.787, 679.069, -0.469], [277.905, 732.394, -0.336], [252.638, 766.267, -0.161], [332.611, 636.857, -0.009]]\nB: [[309.014, 658.598, -0.45], [335.072, 790.615, -0.333], [250.147, 699.934, -0.154], [283.681, 656.435, -0.007]]\nC: [[307.569, 699.998, -0.459], [305.228, 697.097, -0.309], [302.888, 694.195, -0.159], [300.547, 691.293, -0.008]]\nD: [[306.127, 672.26, -0.442], [332.611, 777.126, -0.361], [293.54, 777.55, -0.16], [277.878, 827.191, -0.008]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_172_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_172_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_172_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_172_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_172_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_172_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_172_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_172_7.png" + ], + "output": "C" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[410.021, 1418.263, 1.006], [335.263, 1143.6, 0.762], [373.402, 1388.34, 1.144], [360.742, 1297.733, 1.1]]\nB: [[401.282, 1193.478, 0.862], [402.619, 1193.23, 0.782], [404.132, 1193.11, 1.001], [405.307, 1192.971, 1.1]]\nC: [[476.438, 1152.023, 0.752], [338.739, 1127.79, 0.816], [436.589, 1274.03, 1.097], [419.84, 1106.209, 1.0]]\nD: [[421.529, 1380.4, 0.797], [464.928, 1028.31, 0.794], [411.148, 983.52, 1.058], [384.373, 1245.097, 1.2]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[410.021, 1418.263, 1.006], [335.263, 1143.6, 0.762], [373.402, 1388.34, 1.144], [360.742, 1297.733, 1.1]]\nB: [[401.282, 1193.478, 0.862], [402.619, 1193.23, 0.782], [404.132, 1193.11, 1.001], [405.307, 1192.971, 1.1]]\nC: [[476.438, 1152.023, 0.752], [338.739, 1127.79, 0.816], [436.589, 1274.03, 1.097], [419.84, 1106.209, 1.0]]\nD: [[421.529, 1380.4, 0.797], [464.928, 1028.31, 0.794], [411.148, 983.52, 1.058], [384.373, 1245.097, 1.2]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_173_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_173_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_173_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_173_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_173_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_173_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_173_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_173_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[720.915, 1310.639, 0.073], [578.541, 1908.08, -0.015], [537.43, 1833.644, 0.24], [531.48, 1353.149, 0.504]]\nB: [[621.807, 1622.951, 0.061], [622.306, 1622.44, -0.014], [623.24, 1621.439, 0.236], [623.71, 1620.935, 0.436]]\nC: [[724.675, 1443.65, 0.065], [727.047, 1304.33, -0.013], [681.05, 1445.626, 0.193], [593.86, 1727.459, 0.372]]\nD: [[667.376, 1736.543, 0.069], [736.258, 1753.41, -0.016], [704.18, 1743.04, 0.25], [644.2, 1366.127, 0.492]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[720.915, 1310.639, 0.073], [578.541, 1908.08, -0.015], [537.43, 1833.644, 0.24], [531.48, 1353.149, 0.504]]\nB: [[621.807, 1622.951, 0.061], [622.306, 1622.44, -0.014], [623.24, 1621.439, 0.236], [623.71, 1620.935, 0.436]]\nC: [[724.675, 1443.65, 0.065], [727.047, 1304.33, -0.013], [681.05, 1445.626, 0.193], [593.86, 1727.459, 0.372]]\nD: [[667.376, 1736.543, 0.069], [736.258, 1753.41, -0.016], [704.18, 1743.04, 0.25], [644.2, 1366.127, 0.492]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_174_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_174_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_174_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_174_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_174_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_174_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_174_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_174_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[2009.229, 1014.34, 0.361], [1586.06, 754.207, 0.245], [2016.84, 904.343, 0.311], [2259.58, 852.389, 0.226]]\nB: [[1902.189, 877.268, 0.309], [1902.179, 877.284, 0.296], [1902.17, 877.299, 0.284], [1902.16, 877.315, 0.271]]\nC: [[1742.017, 
880.394, 0.345], [2183.098, 837.873, 0.305], [2257.96, 877.436, 0.227], [1592.71, 1021.048, 0.257]]\nD: [[1584.307, 954.942, 0.354], [1730.467, 891.446, 0.254], [1805.84, 870.388, 0.252], [2140.35, 875.505, 0.278]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[2009.229, 1014.34, 0.361], [1586.06, 754.207, 0.245], [2016.84, 904.343, 0.311], [2259.58, 852.389, 0.226]]\nB: [[1902.189, 877.268, 0.309], [1902.179, 877.284, 0.296], [1902.17, 877.299, 0.284], [1902.16, 877.315, 0.271]]\nC: [[1742.017, 880.394, 0.345], [2183.098, 837.873, 0.305], [2257.96, 877.436, 0.227], [1592.71, 1021.048, 0.257]]\nD: [[1584.307, 954.942, 0.354], [1730.467, 891.446, 0.254], [1805.84, 870.388, 0.252], [2140.35, 875.505, 0.278]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_175_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_175_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_175_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_175_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_175_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_175_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_175_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_175_7.png" + ], + "output": "B" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[318.894, 1055.632, 0.551], [364.825, 1096.642, 0.53], [434.434, 1265.049, 0.623], [381.624, 937.181, 0.532]]\nB: [[473.765, 1023.067, 0.591], [358.411, 1214.0, 0.58], [399.844, 969.318, 0.568], [346.878, 1295.895, 0.701]]\nC: [[396.557, 1112.412, 0.545], [396.557, 1112.412, 0.57], [396.557, 1112.412, 0.595], [396.559, 1112.411, 0.612]]\nD: [[448.687, 1322.501, 0.469], [405.927, 1315.306, 0.58], [394.036, 899.204, 0.523], [445.892, 1047.925, 0.674]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[318.894, 1055.632, 0.551], [364.825, 1096.642, 0.53], [434.434, 1265.049, 0.623], [381.624, 937.181, 0.532]]\nB: [[473.765, 1023.067, 0.591], [358.411, 1214.0, 0.58], [399.844, 969.318, 0.568], [346.878, 1295.895, 0.701]]\nC: [[396.557, 1112.412, 0.545], [396.557, 1112.412, 0.57], [396.557, 1112.412, 0.595], [396.559, 1112.411, 0.612]]\nD: [[448.687, 1322.501, 0.469], [405.927, 1315.306, 0.58], [394.036, 899.204, 0.523], [445.892, 1047.925, 0.674]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_176_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_176_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_176_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_176_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_176_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_176_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_176_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_176_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[616.635, 1647.01, 0.068], [712.669, 1327.311, 0.03], [653.979, 1570.408, -0.012], [669.754, 1866.231, 0.168]]\nB: [[575.735, 1633.265, 0.067], [652.707, 1894.197, 0.04], [585.605, 1401.243, -0.011], [517.999, 1818.195, 0.152]]\nC: [[619.603, 1624.655, 0.071], [620.215, 1624.227, 0.03], [620.828, 1623.798, -0.012], [621.449, 1623.383, 0.146]]\nD: [[697.585, 1370.261, 0.08], [634.839, 1685.003, 0.02], [620.085, 1806.807, -0.014], [537.801, 1756.717, 0.169]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[616.635, 1647.01, 0.068], [712.669, 1327.311, 0.03], [653.979, 1570.408, -0.012], [669.754, 1866.231, 0.168]]\nB: [[575.735, 1633.265, 0.067], [652.707, 1894.197, 0.04], [585.605, 1401.243, -0.011], [517.999, 1818.195, 0.152]]\nC: [[619.603, 1624.655, 0.071], [620.215, 1624.227, 0.03], [620.828, 1623.798, -0.012], [621.449, 1623.383, 0.146]]\nD: [[697.585, 1370.261, 0.08], [634.839, 1685.003, 0.02], [620.085, 1806.807, -0.014], [537.801, 1756.717, 0.169]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_177_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_177_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_177_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_177_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_177_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_177_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_177_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_177_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[321.608, 1134.254, 0.47], [376.579, 1092.364, 0.561], [328.646, 1331.008, 0.5], [429.422, 1013.513, 0.542]]\nB: [[366.01, 1353.528, 0.41], [346.536, 1032.294, 0.437], [456.295, 1314.242, 0.49], [462.86, 1106.077, 0.501]]\nC: [[394.87, 
1020.543, 0.46], [447.385, 1022.557, 0.479], [328.398, 1293.308, 0.63], [370.045, 1320.086, 0.474]]\nD: [[395.651, 1160.538, 0.51], [395.651, 1160.538, 0.535], [395.651, 1160.538, 0.56], [395.651, 1160.538, 0.585]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[321.608, 1134.254, 0.47], [376.579, 1092.364, 0.561], [328.646, 1331.008, 0.5], [429.422, 1013.513, 0.542]]\nB: [[366.01, 1353.528, 0.41], [346.536, 1032.294, 0.437], [456.295, 1314.242, 0.49], [462.86, 1106.077, 0.501]]\nC: [[394.87, 1020.543, 0.46], [447.385, 1022.557, 0.479], [328.398, 1293.308, 0.63], [370.045, 1320.086, 0.474]]\nD: [[395.651, 1160.538, 0.51], [395.651, 1160.538, 0.535], [395.651, 1160.538, 0.56], [395.651, 1160.538, 0.585]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_178_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_178_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_178_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_178_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_178_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_178_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_178_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_178_7.png" + ], + "output": "D" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[705.601, 1670.358, 1.653], [759.064, 1611.115, 1.64], [821.344, 1702.722, 1.626], [808.425, 1696.718, 1.401]]\nB: [[729.457, 1296.83, 1.225], [673.831, 1849.549, 1.6], [822.222, 1475.002, 1.59], [568.262, 1723.288, 1.529]]\nC: [[780.777, 1370.477, 1.602], [703.571, 1274.284, 1.37], [714.428, 1801.829, 1.419], [746.711, 1436.885, 1.447]]\nD: [[710.384, 1565.299, 1.507], [708.536, 1567.037, 1.44], [706.623, 1568.784, 1.524], [704.701, 1570.522, 1.457]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[705.601, 1670.358, 1.653], [759.064, 1611.115, 1.64], [821.344, 1702.722, 1.626], [808.425, 1696.718, 1.401]]\nB: [[729.457, 1296.83, 1.225], [673.831, 1849.549, 1.6], [822.222, 1475.002, 1.59], [568.262, 1723.288, 1.529]]\nC: [[780.777, 1370.477, 1.602], [703.571, 1274.284, 1.37], [714.428, 1801.829, 1.419], [746.711, 1436.885, 1.447]]\nD: [[710.384, 1565.299, 1.507], [708.536, 1567.037, 1.44], [706.623, 1568.784, 1.524], [704.701, 1570.522, 1.457]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_179_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_179_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_179_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_179_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_179_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_179_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_179_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_179_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[320.577, 1284.53, 0.932], [414.562, 975.368, 0.94], [402.331, 1119.979, 0.929], [395.463, 906.274, 0.959]]\nB: [[360.972, 1107.73, 0.916], [361.016, 1107.522, 0.816], [361.016, 1107.522, 0.816], [360.975, 1107.716, 1.016]]\nC: [[398.669, 1187.71, 1.007], [399.141, 1252.975, 0.956], [420.325, 1170.467, 0.694], [429.664, 938.089, 1.209]]\nD: [[297.101, 1040.56, 0.77], [316.568, 1235.213, 0.681], [321.404, 1279.995, 0.669], [348.852, 1148.111, 0.837]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[320.577, 1284.53, 0.932], [414.562, 975.368, 0.94], [402.331, 1119.979, 0.929], [395.463, 906.274, 0.959]]\nB: [[360.972, 1107.73, 0.916], [361.016, 1107.522, 0.816], [361.016, 1107.522, 0.816], [360.975, 1107.716, 1.016]]\nC: [[398.669, 1187.71, 1.007], [399.141, 1252.975, 0.956], [420.325, 1170.467, 0.694], [429.664, 938.089, 1.209]]\nD: [[297.101, 1040.56, 0.77], [316.568, 1235.213, 0.681], [321.404, 1279.995, 0.669], [348.852, 1148.111, 0.837]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_180_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_180_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_180_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_180_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_180_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_180_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_180_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_180_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[438.16, 1086.715, 0.696], [438.167, 1086.722, 0.834], [438.221, 1086.776, 0.858], [438.222, 1086.778, 0.851]]\nB: [[507.3, 1112.979, 0.562], [428.69, 1220.639, 0.948], [389.443, 1140.615, 0.714], [399.989, 1095.621, 0.887]]\nC: [[354.14, 
1079.897, 0.71], [359.261, 923.891, 0.834], [367.923, 883.883, 0.807], [495.344, 1285.646, 0.707]]\nD: [[403.04, 1151.47, 0.76], [405.751, 1286.822, 0.742], [364.368, 898.997, 0.983], [413.957, 1207.042, 0.986]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[438.16, 1086.715, 0.696], [438.167, 1086.722, 0.834], [438.221, 1086.776, 0.858], [438.222, 1086.778, 0.851]]\nB: [[507.3, 1112.979, 0.562], [428.69, 1220.639, 0.948], [389.443, 1140.615, 0.714], [399.989, 1095.621, 0.887]]\nC: [[354.14, 1079.897, 0.71], [359.261, 923.891, 0.834], [367.923, 883.883, 0.807], [495.344, 1285.646, 0.707]]\nD: [[403.04, 1151.47, 0.76], [405.751, 1286.822, 0.742], [364.368, 898.997, 0.983], [413.957, 1207.042, 0.986]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_181_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_181_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_181_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_181_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_181_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_181_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_181_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_181_7.png" + ], + "output": "A" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1355.191, 1159.351, -0.246], [1395.66, 1152.84, -0.228], [1240.863, 981.042, -0.233], [1262.588, 994.473, -0.268]]\nB: [[1217.511, 1111.827, -0.25], [1078.13, 1020.65, -0.204], [1571.618, 1018.832, -0.231], [1379.544, 1182.701, -0.209]]\nC: [[1345.406, 1027.941, -0.246], [1345.406, 1027.941, -0.246], [1345.406, 1027.941, -0.246], [1345.406, 1027.941, -0.246]]\nD: [[1421.753, 850.708, -0.231], [1362.386, 1149.899, -0.264], [1385.927, 906.596, -0.27], [1544.629, 853.12, -0.225]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1355.191, 1159.351, -0.246], [1395.66, 1152.84, -0.228], [1240.863, 981.042, -0.233], [1262.588, 994.473, -0.268]]\nB: [[1217.511, 1111.827, -0.25], [1078.13, 1020.65, -0.204], [1571.618, 1018.832, -0.231], [1379.544, 1182.701, -0.209]]\nC: [[1345.406, 1027.941, -0.246], [1345.406, 1027.941, -0.246], [1345.406, 1027.941, -0.246], [1345.406, 1027.941, -0.246]]\nD: [[1421.753, 850.708, -0.231], [1362.386, 1149.899, -0.264], [1385.927, 906.596, -0.27], [1544.629, 853.12, -0.225]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_182_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_182_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_182_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_182_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_182_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_182_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_182_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_182_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1187.979, 1094.431, 0.392], [1347.424, 1034.148, 0.436], [1538.003, 902.017, 0.373], [1212.196, 1182.796, 0.356]]\nB: [[1108.77, 1168.85, 0.455], [1257.172, 1008.091, 0.364], [1442.783, 1234.413, 0.459], [1312.575, 918.52, 0.361]]\nC: [[1079.007, 1042.862, 0.339], [1523.203, 1091.192, 0.449], [1147.362, 1204.835, 0.348], [1308.995, 1207.074, 0.355]]\nD: [[1335.042, 1037.999, 0.407], [1335.042, 1037.999, 0.407], [1335.042, 1037.999, 0.407], [1335.042, 1037.999, 0.407]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object 
outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1187.979, 1094.431, 0.392], [1347.424, 1034.148, 0.436], [1538.003, 902.017, 0.373], [1212.196, 1182.796, 0.356]]\nB: [[1108.77, 1168.85, 0.455], [1257.172, 1008.091, 0.364], [1442.783, 1234.413, 0.459], [1312.575, 918.52, 0.361]]\nC: [[1079.007, 1042.862, 0.339], [1523.203, 1091.192, 0.449], [1147.362, 1204.835, 0.348], [1308.995, 1207.074, 0.355]]\nD: [[1335.042, 1037.999, 0.407], [1335.042, 1037.999, 0.407], [1335.042, 1037.999, 0.407], [1335.042, 1037.999, 0.407]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_183_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_183_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_183_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_183_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_183_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_183_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_183_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_183_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[390.593, 1112.966, 0.519], [390.606, 1112.958, 0.557], [390.618, 1112.95, 0.594], [390.631, 1112.942, 0.632]]\nB: [[391.396, 942.794, 0.438], [401.44, 920.349, 0.468], [426.581, 916.13, 0.599], 
[338.619, 1261.56, 0.586]]\nC: [[374.135, 1327.311, 0.499], [345.631, 969.283, 0.639], [364.128, 913.86, 0.609], [339.45, 1013.046, 0.638]]\nD: [[447.459, 924.256, 0.485], [384.158, 1293.211, 0.601], [467.686, 901.08, 0.57], [450.581, 1198.823, 0.72]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[390.593, 1112.966, 0.519], [390.606, 1112.958, 0.557], [390.618, 1112.95, 0.594], [390.631, 1112.942, 0.632]]\nB: [[391.396, 942.794, 0.438], [401.44, 920.349, 0.468], [426.581, 916.13, 0.599], [338.619, 1261.56, 0.586]]\nC: [[374.135, 1327.311, 0.499], [345.631, 969.283, 0.639], [364.128, 913.86, 0.609], [339.45, 1013.046, 0.638]]\nD: [[447.459, 924.256, 0.485], [384.158, 1293.211, 0.601], [467.686, 901.08, 0.57], [450.581, 1198.823, 0.72]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_184_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_184_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_184_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_184_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_184_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_184_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_184_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_184_7.png" + ], + "output": "A" + }, + 
{ + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1529.469, 775.588, 0.372], [1793.42, 970.233, 0.339], [2009.523, 974.973, 0.358], [1610.13, 949.492, 0.279]]\nB: [[1591.041, 893.016, 0.43], [1803.48, 918.619, 0.324], [1586.65, 1006.415, 0.274], [2124.951, 926.48, 0.338]]\nC: [[1905.651, 875.006, 0.371], [1905.64, 875.027, 0.347], [1905.629, 875.048, 0.323], [1905.617, 875.069, 0.299]]\nD: [[1763.75, 935.01, 0.421], [2101.6, 837.057, 0.378], [1529.348, 1031.722, 0.334], [1741.729, 804.694, 0.321]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1529.469, 775.588, 0.372], [1793.42, 970.233, 0.339], [2009.523, 974.973, 0.358], [1610.13, 949.492, 0.279]]\nB: [[1591.041, 893.016, 0.43], [1803.48, 918.619, 0.324], [1586.65, 1006.415, 0.274], [2124.951, 926.48, 0.338]]\nC: [[1905.651, 875.006, 0.371], [1905.64, 875.027, 0.347], [1905.629, 875.048, 0.323], [1905.617, 875.069, 0.299]]\nD: [[1763.75, 935.01, 0.421], [2101.6, 837.057, 0.378], [1529.348, 1031.722, 0.334], [1741.729, 804.694, 0.321]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_185_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_185_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_185_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_185_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_185_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_185_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_185_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_185_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1102.52, 1189.44, 1.15], [1247.531, 856.838, 0.847], [1591.837, 1070.296, 0.973], [1281.673, 1018.706, 0.914]]\nB: [[1378.756, 954.378, 1.005], [1282.349, 1065.012, 1.03], [1288.645, 985.253, 1.088], [1368.77, 1206.494, 1.161]]\nC: [[1330.066, 1046.334, 0.969], [1330.066, 1046.334, 0.969], [1330.066, 1046.334, 0.969], [1330.066, 1046.334, 0.969]]\nD: [[1490.554, 938.979, 1.093], [1292.466, 1009.582, 0.816], [1257.859, 968.601, 0.966], [1535.083, 927.732, 1.127]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1102.52, 1189.44, 1.15], [1247.531, 856.838, 0.847], [1591.837, 1070.296, 0.973], [1281.673, 1018.706, 0.914]]\nB: [[1378.756, 954.378, 1.005], [1282.349, 1065.012, 1.03], [1288.645, 985.253, 1.088], [1368.77, 1206.494, 1.161]]\nC: [[1330.066, 1046.334, 0.969], [1330.066, 1046.334, 0.969], [1330.066, 1046.334, 0.969], [1330.066, 1046.334, 0.969]]\nD: [[1490.554, 938.979, 1.093], [1292.466, 1009.582, 0.816], [1257.859, 968.601, 0.966], [1535.083, 927.732, 1.127]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_186_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_186_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_186_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_186_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_186_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_186_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_186_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_186_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[417.399, 1193.55, 0.994], [416.858, 1193.723, 1.044], [416.129, 1193.95, 1.119], [415.49, 1194.12, 1.144]]\nB: [[409.271, 1043.71, 0.802], [478.793, 1282.181, 1.098], [347.872, 1358.99, 1.243], [485.83, 1248.86, 0.927]]\nC: 
[[335.7, 1322.48, 1.133], [434.978, 1371.089, 0.975], [462.262, 1332.7, 0.913], [375.81, 1227.16, 1.032]]\nD: [[403.098, 1282.9, 0.834], [475.947, 968.443, 0.984], [492.927, 1029.78, 1.321], [332.54, 962.76, 1.165]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[417.399, 1193.55, 0.994], [416.858, 1193.723, 1.044], [416.129, 1193.95, 1.119], [415.49, 1194.12, 1.144]]\nB: [[409.271, 1043.71, 0.802], [478.793, 1282.181, 1.098], [347.872, 1358.99, 1.243], [485.83, 1248.86, 0.927]]\nC: [[335.7, 1322.48, 1.133], [434.978, 1371.089, 0.975], [462.262, 1332.7, 0.913], [375.81, 1227.16, 1.032]]\nD: [[403.098, 1282.9, 0.834], [475.947, 968.443, 0.984], [492.927, 1029.78, 1.321], [332.54, 962.76, 1.165]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_187_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_187_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_187_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_187_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_187_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_187_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_187_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_187_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + 
"visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1542.554, 1185.251, 1.339], [1408.096, 1145.054, 1.499], [1475.525, 890.578, 1.509], [1249.888, 965.619, 1.875]]\nB: [[1536.937, 1095.924, 1.505], [1199.25, 1245.415, 1.478], [1539.555, 1123.021, 1.621], [1138.513, 1047.666, 1.622]]\nC: [[1308.987, 1052.606, 1.402], [1310.095, 1053.915, 1.502], [1311.245, 1055.225, 1.598], [1312.378, 1056.412, 1.693]]\nD: [[1455.025, 1135.652, 1.649], [1113.162, 854.503, 1.292], [1160.009, 1154.438, 1.809], [1412.034, 1067.9, 1.717]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1542.554, 1185.251, 1.339], [1408.096, 1145.054, 1.499], [1475.525, 890.578, 1.509], [1249.888, 965.619, 1.875]]\nB: [[1536.937, 1095.924, 1.505], [1199.25, 1245.415, 1.478], [1539.555, 1123.021, 1.621], [1138.513, 1047.666, 1.622]]\nC: [[1308.987, 1052.606, 1.402], [1310.095, 1053.915, 1.502], [1311.245, 1055.225, 1.598], [1312.378, 1056.412, 1.693]]\nD: [[1455.025, 1135.652, 1.649], [1113.162, 854.503, 1.292], [1160.009, 1154.438, 1.809], [1412.034, 1067.9, 1.717]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_188_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_188_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_188_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_188_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_188_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_188_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_188_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_188_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1136.4, 1251.375, 0.964], [1098.34, 926.608, 0.962], [1167.04, 1063.589, 0.748], [1241.17, 957.965, 0.728]]\nB: [[1133.35, 933.225, 0.869], [1467.73, 1116.69, 0.711], [1336.16, 1216.158, 0.757], [1461.61, 1223.596, 0.736]]\nC: [[1339.79, 1239.08, 0.933], [1509.78, 946.412, 0.694], [1165.99, 1097.514, 0.848], [1235.1, 968.837, 0.75]]\nD: [[1264.56, 1079.653, 0.835], [1264.56, 1079.653, 0.836], [1264.56, 1079.653, 0.837], [1264.56, 1079.653, 0.837]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1136.4, 1251.375, 0.964], [1098.34, 926.608, 0.962], [1167.04, 1063.589, 0.748], [1241.17, 957.965, 0.728]]\nB: [[1133.35, 933.225, 0.869], [1467.73, 1116.69, 0.711], [1336.16, 1216.158, 0.757], [1461.61, 1223.596, 0.736]]\nC: [[1339.79, 1239.08, 0.933], [1509.78, 946.412, 0.694], [1165.99, 1097.514, 0.848], [1235.1, 968.837, 0.75]]\nD: [[1264.56, 1079.653, 0.835], [1264.56, 1079.653, 0.836], [1264.56, 1079.653, 0.837], [1264.56, 1079.653, 0.837]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_189_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_189_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_189_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_189_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_189_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_189_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_189_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_189_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[352.109, 1115.839, 0.635], [352.109, 1115.839, 0.814], [352.109, 1115.839, 0.993], [352.109, 1115.839, 0.801]]\nB: [[403.261, 1293.127, 0.727], [350.132, 1278.347, 0.888], [358.28, 1088.995, 1.055], [317.737, 1172.102, 0.72]]\nC: [[293.311, 
1012.839, 0.541], [366.92, 1035.95, 0.854], [356.294, 946.374, 0.819], [390.703, 900.089, 0.781]]\nD: [[318.426, 1207.534, 0.602], [341.429, 1071.396, 0.779], [376.574, 1288.812, 1.059], [313.83, 1243.677, 0.781]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[352.109, 1115.839, 0.635], [352.109, 1115.839, 0.814], [352.109, 1115.839, 0.993], [352.109, 1115.839, 0.801]]\nB: [[403.261, 1293.127, 0.727], [350.132, 1278.347, 0.888], [358.28, 1088.995, 1.055], [317.737, 1172.102, 0.72]]\nC: [[293.311, 1012.839, 0.541], [366.92, 1035.95, 0.854], [356.294, 946.374, 0.819], [390.703, 900.089, 0.781]]\nD: [[318.426, 1207.534, 0.602], [341.429, 1071.396, 0.779], [376.574, 1288.812, 1.059], [313.83, 1243.677, 0.781]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_190_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_190_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_190_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_190_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_190_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_190_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_190_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_190_7.png" + ], + "output": "A" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[405.833, 996.81, 0.95], [355.845, 1117.198, 1.04], [354.65, 947.797, 0.81], [358.045, 1006.86, 0.9]]\nB: [[464.397, 1283.6, 0.75], [328.65, 936.28, 0.941], [365.855, 941.15, 0.91], [332.144, 978.284, 0.89]]\nC: [[390.721, 1120.16, 0.88], [390.397, 1119.603, 0.905], [390.144, 1119.015, 0.93], [389.874, 1118.388, 1.08]]\nD: [[338.901, 1032.25, 0.92], [452.113, 1110.634, 1.086], [401.774, 1235.48, 1.09], [348.321, 1273.502, 1.06]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[405.833, 996.81, 0.95], [355.845, 1117.198, 1.04], [354.65, 947.797, 0.81], [358.045, 1006.86, 0.9]]\nB: [[464.397, 1283.6, 0.75], [328.65, 936.28, 0.941], [365.855, 941.15, 0.91], [332.144, 978.284, 0.89]]\nC: [[390.721, 1120.16, 0.88], [390.397, 1119.603, 0.905], [390.144, 1119.015, 0.93], [389.874, 1118.388, 1.08]]\nD: [[338.901, 1032.25, 0.92], [452.113, 1110.634, 1.086], [401.774, 1235.48, 1.09], [348.321, 1273.502, 1.06]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_191_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_191_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_191_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_191_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_191_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_191_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_191_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_191_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[310.056, 702.514, -0.534], [308.348, 700.438, -0.379], [306.644, 698.366, -0.226], [299.451, 689.618, 0.29]]\nB: [[311.979, 810.175, -0.565], [260.058, 694.023, -0.342], [272.393, 835.115, -0.249], [357.323, 587.99, 0.25]]\nC: [[344.874, 770.786, -0.524], [341.8, 753.476, -0.452], [326.128, 633.003, -0.265], [319.66, 701.069, 0.24]]\nD: [[365.685, 814.947, -0.568], [301.272, 761.237, -0.383], [347.211, 755.567, -0.186], [320.282, 634.704, 0.25]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[310.056, 702.514, -0.534], [308.348, 700.438, -0.379], [306.644, 698.366, -0.226], [299.451, 689.618, 0.29]]\nB: [[311.979, 810.175, -0.565], [260.058, 694.023, -0.342], [272.393, 835.115, -0.249], [357.323, 587.99, 0.25]]\nC: [[344.874, 770.786, -0.524], [341.8, 753.476, -0.452], [326.128, 633.003, -0.265], [319.66, 701.069, 0.24]]\nD: [[365.685, 814.947, -0.568], [301.272, 761.237, -0.383], [347.211, 755.567, -0.186], [320.282, 634.704, 0.25]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_192_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_192_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_192_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_192_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_192_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_192_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_192_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_192_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[410.458, 1100.432, 0.572], [415.671, 1229.197, 0.401], [335.629, 978.571, 0.674], [395.098, 1300.552, 0.478]]\nB: [[401.269, 1173.484, 0.482], [401.265, 1173.449, 0.482], [401.261, 1173.415, 0.582], [401.213, 1173.399, 0.569]]\nC: [[344.262, 
1103.389, 0.405], [334.337, 967.834, 0.523], [322.213, 1142.996, 0.661], [438.778, 1322.064, 0.463]]\nD: [[444.209, 1082.14, 0.539], [424.03, 1030.374, 0.393], [373.133, 1212.622, 0.685], [326.49, 1003.549, 0.642]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[410.458, 1100.432, 0.572], [415.671, 1229.197, 0.401], [335.629, 978.571, 0.674], [395.098, 1300.552, 0.478]]\nB: [[401.269, 1173.484, 0.482], [401.265, 1173.449, 0.482], [401.261, 1173.415, 0.582], [401.213, 1173.399, 0.569]]\nC: [[344.262, 1103.389, 0.405], [334.337, 967.834, 0.523], [322.213, 1142.996, 0.661], [438.778, 1322.064, 0.463]]\nD: [[444.209, 1082.14, 0.539], [424.03, 1030.374, 0.393], [373.133, 1212.622, 0.685], [326.49, 1003.549, 0.642]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_193_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_193_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_193_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_193_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_193_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_193_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_193_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_193_7.png" + ], + "output": "B" + }, + { + "task": 
"threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1266.882, 1077.846, 0.755], [1266.882, 1077.846, 0.755], [1266.882, 1077.846, 0.755], [1266.882, 1077.846, 0.755]]\nB: [[1271.739, 1048.404, 0.906], [1320.835, 1026.02, 0.819], [1396.815, 1235.0, 0.773], [1277.754, 1106.806, 0.618]]\nC: [[1317.337, 1260.195, 0.841], [1225.296, 1191.372, 0.797], [1170.961, 1158.338, 0.874], [1386.483, 1261.547, 0.715]]\nD: [[1218.794, 899.374, 0.833], [1062.39, 1230.912, 0.743], [1031.716, 1194.236, 0.798], [1438.585, 1159.315, 0.751]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1266.882, 1077.846, 0.755], [1266.882, 1077.846, 0.755], [1266.882, 1077.846, 0.755], [1266.882, 1077.846, 0.755]]\nB: [[1271.739, 1048.404, 0.906], [1320.835, 1026.02, 0.819], [1396.815, 1235.0, 0.773], [1277.754, 1106.806, 0.618]]\nC: [[1317.337, 1260.195, 0.841], [1225.296, 1191.372, 0.797], [1170.961, 1158.338, 0.874], [1386.483, 1261.547, 0.715]]\nD: [[1218.794, 899.374, 0.833], [1062.39, 1230.912, 0.743], [1031.716, 1194.236, 0.798], [1438.585, 1159.315, 0.751]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_194_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_194_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_194_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_194_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_194_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_194_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_194_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_194_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1416.446, 917.476, 0.369], [1133.347, 1173.31, 0.385], [1116.37, 899.476, 0.388], [1512.12, 1114.073, 0.426]]\nB: [[1278.607, 1028.824, 0.314], [1281.473, 1031.847, 0.364], [1285.19, 1035.681, 0.414], [1288.26, 1038.767, 0.464]]\nC: [[1110.832, 1041.63, 0.259], [1144.062, 983.891, 0.355], [1120.76, 991.772, 0.443], [1528.04, 941.905, 0.384]]\nD: [[1057.696, 870.984, 0.36], [1077.37, 1211.58, 0.316], [1049.97, 1110.459, 0.384], [1427.8, 977.845, 0.531]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1416.446, 917.476, 0.369], [1133.347, 1173.31, 0.385], [1116.37, 899.476, 0.388], [1512.12, 1114.073, 0.426]]\nB: [[1278.607, 1028.824, 0.314], [1281.473, 1031.847, 0.364], [1285.19, 1035.681, 0.414], [1288.26, 1038.767, 0.464]]\nC: [[1110.832, 1041.63, 0.259], [1144.062, 983.891, 0.355], [1120.76, 991.772, 0.443], [1528.04, 941.905, 0.384]]\nD: [[1057.696, 870.984, 0.36], [1077.37, 1211.58, 0.316], [1049.97, 1110.459, 0.384], [1427.8, 977.845, 0.531]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_195_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_195_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_195_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_195_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_195_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_195_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_195_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_195_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[617.725, 1759.21, -0.3], [551.267, 1929.565, -0.167], [709.584, 1522.655, -0.14], [704.467, 1583.292, -0.112]]\nB: [[650.737, 1625.23, -0.3], [651.288, 1624.989, -0.175], [651.844, 1624.758, -0.15], [652.374, 1624.474, -0.125]]\nC: 
[[757.657, 1791.25, -0.3], [596.205, 1710.113, -0.196], [594.666, 1578.119, -0.17], [645.182, 1918.374, -0.147]]\nD: [[672.039, 1335.14, -0.3], [676.19, 1821.439, -0.205], [711.246, 1830.07, -0.17], [653.306, 1617.612, -0.115]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[617.725, 1759.21, -0.3], [551.267, 1929.565, -0.167], [709.584, 1522.655, -0.14], [704.467, 1583.292, -0.112]]\nB: [[650.737, 1625.23, -0.3], [651.288, 1624.989, -0.175], [651.844, 1624.758, -0.15], [652.374, 1624.474, -0.125]]\nC: [[757.657, 1791.25, -0.3], [596.205, 1710.113, -0.196], [594.666, 1578.119, -0.17], [645.182, 1918.374, -0.147]]\nD: [[672.039, 1335.14, -0.3], [676.19, 1821.439, -0.205], [711.246, 1830.07, -0.17], [653.306, 1617.612, -0.115]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_196_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_196_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_196_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_196_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_196_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_196_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_196_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_196_7.png" + ], + "output": "B" + }, + { + 
"task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[437.74, 1143.27, 0.674], [451.18, 1139.1, 0.835], [366.614, 1128.63, 0.978], [348.424, 1174.882, 0.985]]\nB: [[462.69, 887.14, 0.851], [342.59, 1042.19, 0.759], [433.546, 1124.124, 0.983], [359.663, 1221.9, 0.894]]\nC: [[398.14, 1103.34, 0.828], [398.08, 1103.37, 0.834], [398.005, 1103.406, 0.891], [398.065, 1103.387, 0.875]]\nD: [[378.04, 1027.98, 0.775], [428.89, 1003.86, 0.777], [374.706, 954.311, 1.028], [402.006, 1268.493, 0.732]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[437.74, 1143.27, 0.674], [451.18, 1139.1, 0.835], [366.614, 1128.63, 0.978], [348.424, 1174.882, 0.985]]\nB: [[462.69, 887.14, 0.851], [342.59, 1042.19, 0.759], [433.546, 1124.124, 0.983], [359.663, 1221.9, 0.894]]\nC: [[398.14, 1103.34, 0.828], [398.08, 1103.37, 0.834], [398.005, 1103.406, 0.891], [398.065, 1103.387, 0.875]]\nD: [[378.04, 1027.98, 0.775], [428.89, 1003.86, 0.777], [374.706, 954.311, 1.028], [402.006, 1268.493, 0.732]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_197_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_197_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_197_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_197_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_197_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_197_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_197_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_197_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[312.42, 996.53, 0.734], [392.368, 1071.586, 1.006], [362.03, 997.691, 1.059], [395.055, 894.894, 0.96]]\nB: [[292.47, 1178.83, 0.536], [385.591, 1223.459, 0.952], [347.65, 1157.122, 1.11], [325.064, 1035.946, 1.008]]\nC: [[424.62, 956.05, 0.586], [414.477, 1277.953, 1.04], [360.719, 916.865, 1.121], [431.128, 1236.879, 1.07]]\nD: [[364.33, 1100.33, 0.667], [364.332, 1100.333, 0.879], [364.336, 1100.342, 1.067], [364.336, 1100.342, 0.917]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[312.42, 996.53, 0.734], [392.368, 1071.586, 1.006], [362.03, 997.691, 1.059], [395.055, 894.894, 0.96]]\nB: [[292.47, 1178.83, 0.536], [385.591, 1223.459, 0.952], [347.65, 1157.122, 1.11], [325.064, 1035.946, 1.008]]\nC: [[424.62, 956.05, 0.586], [414.477, 1277.953, 1.04], [360.719, 916.865, 1.121], [431.128, 1236.879, 1.07]]\nD: [[364.33, 1100.33, 0.667], [364.332, 1100.333, 0.879], [364.336, 1100.342, 1.067], [364.336, 1100.342, 0.917]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_198_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_198_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_198_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_198_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_198_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_198_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_198_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_198_7.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Tracking", + "visual_input_component": "LiDAR depth image and natural image", + "source": "NuScenes_threed_Object_Tracking", + "options": "A: [[1802.663, 947.701, 0.425], [1936.882, 764.775, 0.32], [1669.337, 970.661, 0.35], [2083.327, 843.762, 0.422]]\nB: [[1895.725, 877.102, 0.355], [1895.725, 877.102, 0.34], [1895.725, 877.102, 0.39], [1895.773, 877.087, 0.415]]\nC: [[2260.495, 1033.212, 
0.352], [1682.669, 730.505, 0.36], [1808.702, 968.32, 0.35], [2274.387, 847.273, 0.38]]\nD: [[2228.008, 726.803, 0.308], [1759.634, 872.261, 0.3], [2044.819, 846.131, 0.41], [2237.11, 733.52, 0.386]]", + "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", + "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1802.663, 947.701, 0.425], [1936.882, 764.775, 0.32], [1669.337, 970.661, 0.35], [2083.327, 843.762, 0.422]]\nB: [[1895.725, 877.102, 0.355], [1895.725, 877.102, 0.34], [1895.725, 877.102, 0.39], [1895.773, 877.087, 0.415]]\nC: [[2260.495, 1033.212, 0.352], [1682.669, 730.505, 0.36], [1808.702, 968.32, 0.35], [2274.387, 847.273, 0.38]]\nD: [[2228.008, 726.803, 0.308], [1759.634, 872.261, 0.3], [2044.819, 846.131, 0.41], [2237.11, 733.52, 0.386]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_199_0.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_199_1.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_199_2.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_199_3.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_199_4.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_199_5.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_199_6.png", + "../MMIU-Benchmark/threeD_Object_Tracking/threeD_Object_Tracking_199_7.png" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + 
"visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_0_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_0_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_1_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_1_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. 
Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_2_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_2_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_3_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_3_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_4_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_4_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_5_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_5_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_6_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_6_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_7_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_7_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_8_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_8_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_9_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_9_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_10_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_10_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_11_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_11_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_12_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_12_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_13_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_13_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_14_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_14_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_15_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_15_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_16_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_16_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_17_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_17_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_18_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_18_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_19_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_19_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_20_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_20_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_21_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_21_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_22_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_22_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_23_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_23_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_24_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_24_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_25_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_25_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_26_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_26_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_27_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_27_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_28_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_28_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_29_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_29_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_30_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_30_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_31_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_31_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_32_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_32_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_33_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_33_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_34_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_34_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_35_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_35_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_36_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_36_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_37_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_37_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_38_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_38_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_39_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_39_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_40_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_40_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_41_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_41_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_42_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_42_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_43_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_43_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_44_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_44_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_45_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_45_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_46_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_46_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_47_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_47_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_48_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_48_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_49_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_49_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_50_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_50_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_51_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_51_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_52_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_52_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_53_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_53_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_54_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_54_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_55_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_55_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_56_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_56_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_57_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_57_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_58_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_58_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_59_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_59_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_60_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_60_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_61_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_61_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_62_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_62_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_63_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_63_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_64_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_64_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_65_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_65_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_66_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_66_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_67_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_67_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_68_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_68_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_69_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_69_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_70_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_70_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_71_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_71_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_72_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_72_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_73_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_73_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_74_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_74_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_75_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_75_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_76_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_76_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_77_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_77_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_78_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_78_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_79_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_79_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_80_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_80_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_81_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_81_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_82_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_82_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_83_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_83_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_84_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_84_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_85_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_85_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_86_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_86_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_87_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_87_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_88_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_88_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_89_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_89_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_90_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_90_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_91_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_91_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_92_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_92_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_93_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_93_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_94_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_94_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_95_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_95_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_96_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_96_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_97_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_97_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_98_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_98_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_99_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_99_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_100_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_100_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_101_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_101_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_102_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_102_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_103_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_103_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_104_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_104_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_105_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_105_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_106_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_106_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_107_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_107_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_108_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_108_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_109_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_109_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_110_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_110_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_111_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_111_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_112_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_112_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_113_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_113_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_114_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_114_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_115_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_115_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_116_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_116_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_117_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_117_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_118_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_118_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_119_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_119_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_120_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_120_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_121_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_121_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_122_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_122_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_123_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_123_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_124_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_124_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_125_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_125_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_126_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_126_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_127_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_127_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_128_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_128_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_129_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_129_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_130_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_130_1.jpg" + ], + "output": "A" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_131_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_131_1.jpg" + ], + "output": "B" + }, + { + "task": "Multiview_reasoning", + "visual_input_component": "natural image", + "source": "BLINK_MVR", + "options": "A: left\nB: right", + "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", + "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", + "input_image_path": [ + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_132_0.jpg", + "../MMIU-Benchmark/Multiview_reasoning/Multiview_reasoning_132_1.jpg" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.891251, 0.378307, -0.25011], [0.443048, 0.608538, -0.658323], [-0.096846, -0.697542, -0.709969]] and translation vector: [4.935522, 3.588868, 1.45033], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.887006, 0.383874, -0.256633], [0.452131, 0.60913, -0.651566], [-0.093796, -0.693975, -0.713864]] and translation vector: [4.940225, 3.582454, 1.45688], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_0_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_0_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_0_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_0_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_0_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_0_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.997112, 0.02462, 0.071841], [-0.04661, 0.548461, -0.834876], [-0.059957, -0.835814, -0.545729]] and translation vector: [4.834615, 3.436689, 1.398379], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.998397, 0.025746, 0.050402], [-0.028149, 0.546702, -0.836854], [-0.0491, -0.836932, -0.545101]] and translation vector: [4.839047, 3.434593, 1.400064], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_1_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_1_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_1_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_1_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_1_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_1_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.485844, -0.617081, 0.619005], [-0.873216, -0.311825, 0.374512], [-0.038083, -0.722479, -0.690343]] and translation vector: [-0.164865, 3.073333, 1.323993], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.482952, -0.621872, 0.616468], [-0.874972, -0.315096, 0.367612], [-0.034361, -0.716931, -0.696297]] and translation vector: [-0.16601, 3.069565, 1.320265], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_2_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_2_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_2_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_2_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_2_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_2_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.934582, -0.143102, 0.325696], [-0.355737, 0.383069, -0.852473], [-0.002774, -0.912568, -0.408916]] and translation vector: [2.694367, 2.483235, 1.465763], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.935747, -0.141154, 0.323191], [-0.352667, 0.379116, -0.85551], [-0.001768, -0.91452, -0.404537]] and translation vector: [2.694351, 2.483417, 1.465522], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_3_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_3_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_3_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_3_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_3_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_3_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.08083, -0.463089, 0.882618], [-0.994842, 0.091929, -0.042874], [-0.061284, -0.881531, -0.468131]] and translation vector: [4.543997, 3.147744, 1.235262], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.097623, -0.477164, 0.873375], [-0.993778, 0.094019, -0.059714], [-0.05362, -0.873771, -0.483373]] and translation vector: [4.550471, 3.148599, 1.246367], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_4_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_4_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_4_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_4_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_4_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_4_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.877021, 0.121711, -0.464779], [0.46491, 0.459041, -0.75706], [0.12121, -0.880038, -0.459173]] and translation vector: [3.922419, 3.230202, 1.747047], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.876473, 0.11975, -0.466322], [0.465798, 0.455895, -0.758415], [0.121773, -0.881941, -0.455359]] and translation vector: [3.923546, 3.227255, 1.740959], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_5_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_5_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_5_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_5_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_5_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_5_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.246516, -0.470365, 0.847341], [-0.959136, 0.006886, 0.282862], [-0.138884, -0.882445, -0.449446]] and translation vector: [3.043058, 2.955299, 1.551102], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.243276, -0.470143, 0.8484], [-0.960213, 0.006937, 0.279182], [-0.13714, -0.882563, -0.44975]] and translation vector: [3.042024, 2.954946, 1.550413], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_6_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_6_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_6_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_6_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_6_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_6_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.236277, -0.452541, 0.859872], [-0.970097, 0.160455, -0.182119], [-0.055554, -0.877189, -0.47692]] and translation vector: [1.575898, 1.961144, 1.314442], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.238966, -0.451212, 0.859828], [-0.9694, 0.162109, -0.184349], [-0.056205, -0.87757, -0.476143]] and translation vector: [1.575219, 1.960128, 1.313122], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_7_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_7_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_7_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_7_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_7_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_7_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.633294, -0.360819, 0.684652], [-0.773758, -0.312806, 0.550863], [0.015401, -0.878613, -0.477285]] and translation vector: [3.241882, 3.386626, 1.367882], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.618852, -0.359339, 0.698497], [-0.785116, -0.311057, 0.535572], [0.02482, -0.87984, -0.47462]] and translation vector: [3.234923, 3.400149, 1.365622], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_8_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_8_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_8_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_8_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_8_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_8_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.974605, -0.106498, 0.196986], [-0.223762, -0.428932, 0.875185], [-0.008712, -0.897037, -0.44187]] and translation vector: [2.006689, 0.552817, 1.711334], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.976991, -0.101609, 0.187523], [-0.213093, -0.42809, 0.878254], [-0.008962, -0.898006, -0.439892]] and translation vector: [2.014877, 0.551422, 1.700123], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_9_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_9_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_9_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_9_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_9_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_9_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.874867, -0.0675, 0.479638], [-0.482919, 0.197999, -0.852987], [-0.037391, -0.977875, -0.205819]] and translation vector: [2.397274, 1.722858, 1.486845], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.874077, -0.063653, 0.4816], [-0.484123, 0.196153, -0.852731], [-0.040189, -0.978505, -0.202269]] and translation vector: [2.402604, 1.721845, 1.489477], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_10_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_10_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_10_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_10_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_10_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_10_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.606497, 0.359513, -0.709163], [0.793947, -0.321582, 0.515978], [-0.042553, -0.875977, -0.480473]] and translation vector: [5.898605, 1.464963, 1.329018], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.603336, 0.358994, -0.712116], [0.79647, -0.316333, 0.515334], [-0.040264, -0.878098, -0.476783]] and translation vector: [5.91512, 1.4588, 1.326343], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_11_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_11_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_11_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_11_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_11_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_11_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.810147, -0.229725, 0.539341], [-0.586224, 0.314131, -0.746769], [0.002128, -0.921167, -0.389162]] and translation vector: [3.108561, 2.950706, 1.466118], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.798041, -0.241673, 0.552019], [-0.602539, 0.306626, -0.736836], [0.00881, -0.920638, -0.390318]] and translation vector: [3.094201, 2.939754, 1.46817], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_12_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_12_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_12_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_12_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_12_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_12_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.994136, 0.036629, -0.101745], [0.107123, -0.462198, 0.880283], [-0.014782, -0.88602, -0.463411]] and translation vector: [3.8191, 1.340951, 1.354002], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.994264, 0.034625, -0.101195], [0.105882, -0.452335, 0.885541], [-0.015112, -0.891176, -0.453407]] and translation vector: [3.821174, 1.339834, 1.359098], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_13_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_13_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_13_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_13_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_13_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_13_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.199941, 0.263531, -0.943703], [0.979453, -0.027844, 0.19974], [0.026362, -0.964249, -0.263683]] and translation vector: [3.611549, 3.757055, 1.562045], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.20075, 0.267793, -0.94233], [0.97934, -0.030969, 0.199834], [0.024331, -0.962979, -0.268477]] and translation vector: [3.608934, 3.756757, 1.557843], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_14_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_14_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_14_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_14_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_14_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_14_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.24604, -0.551346, 0.797171], [-0.968826, -0.115295, 0.219278], [-0.028988, -0.826271, -0.562526]] and translation vector: [1.704247, 2.057158, 1.361636], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.236706, -0.55071, 0.800431], [-0.971342, -0.115817, 0.207564], [-0.021604, -0.826623, -0.562342]] and translation vector: [1.70792, 2.062619, 1.364929], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_15_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_15_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_15_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_15_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_15_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_15_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.695296, -0.421579, 0.582095], [-0.717067, -0.351947, 0.601622], [-0.048765, -0.835707, -0.547007]] and translation vector: [2.470866, 0.652559, 1.473924], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.695871, -0.418819, 0.583399], [-0.716734, -0.353708, 0.600986], [-0.045352, -0.83635, -0.546317]] and translation vector: [2.469546, 0.651931, 1.473078], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_16_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_16_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_16_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_16_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_16_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_16_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.937403, 0.174354, -0.301457], [0.34768, 0.517889, -0.781607], [0.019845, -0.837491, -0.54609]] and translation vector: [1.513881, 1.499843, 1.388066], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.93698, 0.17766, -0.300842], [0.348874, 0.522274, -0.77815], [0.018876, -0.834067, -0.551341]] and translation vector: [1.515168, 1.503997, 1.385631], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_17_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_17_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_17_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_17_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_17_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_17_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.693623, 0.392298, -0.604144], [0.720137, 0.397492, -0.568686], [0.017048, -0.82952, -0.558217]] and translation vector: [2.706242, 2.586761, 1.453005], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.690051, 0.396658, -0.605386], [0.723517, 0.399766, -0.56277], [0.018785, -0.826347, -0.562848]] and translation vector: [2.704536, 2.590014, 1.45316], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_18_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_18_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_18_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_18_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_18_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_18_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.530794, 0.426739, -0.732224], [0.841151, 0.159702, -0.516681], [-0.10355, -0.890162, -0.443721]] and translation vector: [5.418979, 4.373359, 1.385162], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.532043, 0.421439, -0.734384], [0.841755, 0.169492, -0.512564], [-0.091542, -0.890877, -0.444925]] and translation vector: [5.415919, 4.39552, 1.38299], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_19_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_19_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_19_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_19_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_19_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_19_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.484778, 0.389748, -0.782998], [0.874059, -0.248441, 0.417491], [-0.031813, -0.886777, -0.461102]] and translation vector: [2.948564, 2.712566, 1.480667], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.484062, 0.388161, -0.784229], [0.874419, -0.248162, 0.416902], [-0.03279, -0.887551, -0.459542]] and translation vector: [2.949191, 2.711738, 1.477649], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_20_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_20_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_20_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_20_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_20_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_20_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.998134, -0.025826, -0.055325], [0.04389, 0.326427, -0.944203], [0.042444, -0.94487, -0.324684]] and translation vector: [2.355182, 2.984659, 1.395898], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.998605, -0.022906, -0.047579], [0.037628, 0.323493, -0.945482], [0.037048, -0.945953, -0.32218]] and translation vector: [2.345251, 2.98743, 1.391141], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_21_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_21_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_21_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_21_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_21_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_21_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.473704, -0.275929, 0.836342], [-0.879436, -0.198746, 0.432542], [0.046868, -0.940406, -0.336809]] and translation vector: [2.984934, 2.048073, 1.446683], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.466625, -0.271085, 0.841888], [-0.8831, -0.195475, 0.426525], [0.048943, -0.942498, -0.330608]] and translation vector: [2.979092, 2.049407, 1.446378], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_22_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_22_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_22_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_22_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_22_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_22_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.566304, -0.590941, 0.574533], [-0.823945, 0.423135, -0.376925], [-0.020365, -0.686838, -0.726526]] and translation vector: [2.143516, 1.760119, 1.343188], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.561614, -0.596242, 0.57366], [-0.827171, 0.420904, -0.372329], [-0.019457, -0.683619, -0.729579]] and translation vector: [2.147258, 1.761594, 1.344016], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_23_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_23_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_23_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_23_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_23_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_23_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.054781, -0.427281, 0.902458], [-0.998013, -0.051617, 0.036143], [0.031139, -0.902644, -0.429259]] and translation vector: [1.328526, 0.849821, 1.501181], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.086578, -0.407933, 0.908898], [-0.995883, -0.060028, 0.067922], [0.026852, -0.911036, -0.41145]] and translation vector: [1.314662, 0.836147, 1.492068], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_24_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_24_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_24_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_24_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_24_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_24_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.941243, -0.209403, 0.264975], [-0.336113, 0.504116, -0.795548], [0.033012, -0.837865, -0.544878]] and translation vector: [4.828751, 9.008894, 1.463441], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.939528, -0.206646, 0.273103], [-0.341818, 0.516505, -0.785101], [0.021179, -0.830976, -0.555906]] and translation vector: [4.819307, 9.009376, 1.463735], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_25_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_25_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_25_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_25_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_25_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_25_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.68967, 0.288211, -0.664297], [0.724122, -0.27239, 0.633602], [0.001663, -0.918008, -0.396559]] and translation vector: [2.530043, 2.005069, 1.437417], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.68921, 0.288518, -0.66464], [0.724561, -0.273014, 0.632831], [0.001127, -0.917726, -0.397212]] and translation vector: [2.5334, 2.008455, 1.44069], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_26_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_26_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_26_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_26_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_26_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_26_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.994446, -0.078697, 0.06988], [-0.104992, -0.787844, 0.606859], [0.007297, -0.610826, -0.791731]] and translation vector: [1.305105, 0.510448, 1.183315], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.994112, -0.083607, 0.068931], [-0.10831, -0.785774, 0.608956], [0.003251, -0.612836, -0.790203]] and translation vector: [1.308194, 0.508844, 1.184721], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_27_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_27_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_27_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_27_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_27_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_27_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.928375, -0.17783, 0.326339], [-0.371449, 0.415395, -0.830345], [0.012101, -0.892089, -0.451697]] and translation vector: [2.096006, 1.919092, 1.36174], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.929206, -0.177937, 0.323905], [-0.369314, 0.414969, -0.83151], [0.013546, -0.892266, -0.451307]] and translation vector: [2.095672, 1.922099, 1.363168], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_28_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_28_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_28_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_28_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_28_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_28_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.684823, -0.326379, 0.651532], [-0.728707, -0.304485, 0.613413], [-0.001823, -0.894855, -0.446353]] and translation vector: [2.86358, 2.414664, 1.549631], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.684506, -0.325468, 0.652321], [-0.729004, -0.308374, 0.611113], [0.002261, -0.893855, -0.448351]] and translation vector: [2.864701, 2.413023, 1.547001], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_29_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_29_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_29_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_29_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_29_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_29_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.928108, -0.125197, 0.35063], [-0.371823, 0.3599, -0.855699], [-0.019061, -0.924553, -0.380577]] and translation vector: [5.296664, 4.137775, 1.856988], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.930637, -0.119308, 0.34595], [-0.365378, 0.355543, -0.860284], [-0.020361, -0.927014, -0.374474]] and translation vector: [5.29653, 4.126579, 1.856014], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_30_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_30_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_30_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_30_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_30_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_30_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.399387, 0.327689, -0.856218], [0.9115, 0.041819, -0.409169], [-0.098274, -0.94386, -0.315391]] and translation vector: [4.88233, 2.963563, 1.403722], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.394763, 0.316878, -0.86241], [0.913367, 0.033579, -0.40575], [-0.099614, -0.947872, -0.302681]] and translation vector: [4.88409, 2.965299, 1.400614], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_31_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_31_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_31_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_31_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_31_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_31_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.119369, -0.433868, 0.893034], [-0.990549, 0.113242, -0.077387], [-0.067553, -0.893832, -0.443285]] and translation vector: [3.407035, 4.679209, 1.397058], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.120544, -0.432859, 0.893366], [-0.990306, 0.115004, -0.077902], [-0.06902, -0.894096, -0.442526]] and translation vector: [3.401289, 4.681283, 1.397495], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_32_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_32_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_32_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_32_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_32_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_32_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.264492, -0.222038, 0.938479], [-0.962334, 0.002714, 0.271857], [-0.062909, -0.975034, -0.212957]] and translation vector: [0.925816, 4.784833, 1.497389], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.263009, -0.220134, 0.939344], [-0.962729, 0.003779, 0.270443], [-0.063084, -0.975462, -0.210935]] and translation vector: [0.925807, 4.784041, 1.498483], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_33_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_33_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_33_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_33_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_33_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_33_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.711391, -0.463973, 0.527875], [-0.700286, 0.531398, -0.476672], [-0.059349, -0.708763, -0.702945]] and translation vector: [2.53321, 4.394931, 1.530427], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.710702, -0.465347, 0.527594], [-0.701175, 0.5294, -0.477586], [-0.057065, -0.709357, -0.702536]] and translation vector: [2.526067, 4.393322, 1.526345], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_34_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_34_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_34_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_34_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_34_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_34_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.524333, 0.441188, -0.728305], [0.848808, -0.202677, 0.488311], [0.067827, -0.874228, -0.480754]] and translation vector: [3.10696, 1.250425, 1.344077], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.531491, 0.437044, -0.72561], [0.844432, -0.205894, 0.494513], [0.066725, -0.875557, -0.478485]] and translation vector: [3.107462, 1.25329, 1.344278], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_35_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_35_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_35_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_35_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_35_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_35_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.205964, -0.505778, 0.837716], [-0.978495, 0.11627, -0.170378], [-0.011228, -0.854792, -0.518849]] and translation vector: [2.901534, 4.292832, 1.280844], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.204012, -0.504726, 0.838827], [-0.978841, 0.118998, -0.166463], [-0.0158, -0.855039, -0.518324]] and translation vector: [2.909629, 4.290413, 1.285823], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_36_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_36_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_36_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_36_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_36_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_36_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.999847, -0.004634, 0.01689], [-0.017397, -0.374134, 0.927211], [0.002023, -0.927363, -0.374157]] and translation vector: [3.310194, 3.16458, 1.506432], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.999774, -0.010896, 0.018284], [-0.021018, -0.369724, 0.928904], [-0.003361, -0.929078, -0.369869]] and translation vector: [3.316631, 3.168954, 1.519748], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_37_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_37_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_37_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_37_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_37_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_37_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.951558, 0.16536, -0.259218], [0.307283, -0.481983, 0.820531], [0.010744, -0.860436, -0.509446]] and translation vector: [2.919862, 3.428013, 1.521081], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.951326, 0.167996, -0.258374], [0.307875, -0.4803, 0.821295], [0.013877, -0.860866, -0.508643]] and translation vector: [2.920042, 3.428186, 1.518811], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_38_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_38_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_38_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_38_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_38_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_38_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.567127, -0.123224, 0.81436], [-0.823556, -0.071568, 0.562702], [-0.011056, -0.989795, -0.14207]] and translation vector: [0.249561, 0.967409, 1.634127], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.566682, -0.123694, 0.814599], [-0.82386, -0.07149, 0.562268], [-0.011313, -0.989742, -0.142418]] and translation vector: [0.249762, 0.967631, 1.633273], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_39_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_39_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_39_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_39_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_39_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_39_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.464707, 0.496079, -0.733453], [0.882598, 0.326106, -0.338639], [0.071191, -0.804711, -0.589382]] and translation vector: [2.864701, 0.868861, 1.204561], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.473617, 0.501904, -0.723726], [0.878064, 0.332992, -0.343688], [0.068496, -0.798254, -0.598414]] and translation vector: [2.869803, 0.866998, 1.20304], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_40_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_40_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_40_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_40_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_40_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_40_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.943065, -0.17817, 0.280864], [-0.332105, 0.550897, -0.765649], [-0.018311, -0.815333, -0.578703]] and translation vector: [2.74599, 1.673222, 1.294065], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.942639, -0.173012, 0.285478], [-0.332909, 0.550136, -0.765848], [-0.024551, -0.816957, -0.576177]] and translation vector: [2.737266, 1.663808, 1.300966], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_41_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_41_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_41_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_41_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_41_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_41_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.590232, -0.352789, 0.726062], [-0.807221, -0.252962, 0.533296], [-0.004475, -0.900861, -0.434086]] and translation vector: [2.518124, 2.463328, 1.346668], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.586587, -0.358769, 0.726086], [-0.809845, -0.250747, 0.530356], [-0.008212, -0.899117, -0.437632]] and translation vector: [2.520116, 2.462175, 1.344964], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_42_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_42_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_42_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_42_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_42_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_42_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.454685, 0.144673, -0.878824], [0.890085, 0.109034, -0.442562], [0.031795, -0.983454, -0.178347]] and translation vector: [3.311996, 2.119304, 1.59409], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.453171, 0.138778, -0.880555], [0.890847, 0.10604, -0.441756], [0.032068, -0.98463, -0.171684]] and translation vector: [3.314367, 2.120091, 1.591769], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_43_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_43_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_43_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_43_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_43_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_43_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.86482, -0.183466, 0.467362], [-0.501092, -0.256948, 0.826368], [-0.031523, -0.948851, -0.314147]] and translation vector: [3.012278, 2.022242, 1.442339], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.863867, -0.189194, 0.466839], [-0.502557, -0.260784, 0.824274], [-0.034203, -0.946677, -0.320364]] and translation vector: [3.015002, 2.018446, 1.436262], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_44_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_44_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_44_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_44_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_44_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_44_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.880278, -0.246293, 0.405524], [-0.473973, 0.417832, -0.775091], [0.021459, -0.874503, -0.484545]] and translation vector: [3.281806, 2.754624, 1.352781], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.883446, -0.241464, 0.401521], [-0.467927, 0.41107, -0.782347], [0.023856, -0.879043, -0.476146]] and translation vector: [3.2823, 2.745028, 1.352692], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_45_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_45_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_45_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_45_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_45_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_45_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.133825, -0.39571, 0.908573], [-0.990975, -0.046263, 0.125813], [-0.007752, -0.91721, -0.398329]] and translation vector: [4.990516, 4.227292, 1.32289], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.168071, -0.388121, 0.906153], [-0.985699, -0.054747, 0.159375], [-0.012247, -0.919981, -0.391772]] and translation vector: [4.987841, 4.19209, 1.32312], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_46_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_46_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_46_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_46_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_46_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_46_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.48142, 0.335029, -0.809933], [0.872625, 0.096524, -0.478757], [-0.08222, -0.937251, -0.338823]] and translation vector: [4.429162, 2.287411, 1.464776], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.484328, 0.331289, -0.809737], [0.871134, 0.09698, -0.481374], [-0.080946, -0.938532, -0.335568]] and translation vector: [4.432656, 2.285767, 1.465956], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_47_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_47_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_47_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_47_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_47_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_47_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.672393, -0.274439, 0.687438], [-0.739855, -0.221079, 0.635404], [-0.022402, -0.935846, -0.351697]] and translation vector: [3.802358, 2.110255, 1.494557], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.672432, -0.275262, 0.687071], [-0.739825, -0.222066, 0.635095], [-0.022242, -0.93537, -0.35297]] and translation vector: [3.806542, 2.108163, 1.497405], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_48_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_48_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_48_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_48_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_48_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_48_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.176261, -0.039155, 0.983564], [-0.983722, -0.028492, -0.177423], [0.03497, -0.998827, -0.033496]] and translation vector: [3.054739, 2.437738, 1.503838], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.18153, -0.048874, 0.98217], [-0.982778, -0.026092, -0.182941], [0.034567, -0.998464, -0.043296]] and translation vector: [3.061021, 2.450195, 1.498681], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_49_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_49_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_49_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_49_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_49_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_49_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.467192, 0.317292, -0.825262], [0.883302, -0.126478, 0.451421], [0.038855, -0.939856, -0.339354]] and translation vector: [2.723032, 3.168159, 1.438168], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.467636, 0.312306, -0.826911], [0.883318, -0.130557, 0.450227], [0.03265, -0.940968, -0.336919]] and translation vector: [2.722188, 3.168039, 1.441817], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_50_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_50_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_50_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_50_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_50_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_50_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.51864, -0.44867, 0.727811], [-0.853934, -0.229463, 0.467059], [-0.04255, -0.863738, -0.502143]] and translation vector: [1.002297, 1.98866, 1.344191], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.519607, -0.444592, 0.729621], [-0.853432, -0.229314, 0.468049], [-0.040778, -0.865883, -0.498582]] and translation vector: [1.000441, 1.985865, 1.344846], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_51_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_51_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_51_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_51_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_51_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_51_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.688084, 0.423256, -0.589401], [0.725514, -0.415863, 0.54835], [-0.013017, -0.80493, -0.593227]] and translation vector: [3.968163, 0.8771, 1.421607], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.688048, 0.420794, -0.591205], [0.725576, -0.411726, 0.551381], [-0.011397, -0.80834, -0.588605]] and translation vector: [3.964529, 0.870938, 1.417962], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_52_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_52_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_52_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_52_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_52_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_52_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.042655, 0.409797, -0.911179], [0.998036, -0.024411, -0.0577], [-0.045888, -0.91185, -0.40795]] and translation vector: [2.423933, 1.356295, 3.282493], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.032887, 0.418885, -0.907444], [0.998611, -0.023628, -0.047098], [-0.041169, -0.907732, -0.417526]] and translation vector: [2.425306, 1.358764, 3.278826], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_53_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_53_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_53_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_53_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_53_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_53_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.925351, 0.122106, -0.358909], [0.376741, 0.190476, -0.906524], [-0.042329, -0.974068, -0.222259]] and translation vector: [4.735593, 2.732706, 1.21643], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.924788, 0.125024, -0.359357], [0.377675, 0.187086, -0.906841], [-0.046146, -0.974355, -0.220234]] and translation vector: [4.740286, 2.733964, 1.218072], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_54_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_54_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_54_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_54_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_54_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_54_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.908726, 0.150598, -0.389277], [0.406624, 0.108936, -0.907078], [-0.094198, -0.982575, -0.16023]] and translation vector: [8.822721, 3.830595, 1.476402], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.908663, 0.151907, -0.388916], [0.40641, 0.108245, -0.907256], [-0.09572, -0.98245, -0.160095]] and translation vector: [8.818814, 3.832555, 1.475788], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_55_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_55_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_55_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_55_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_55_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_55_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.984594, -0.069457, 0.160469], [-0.174127, -0.305795, 0.936039], [-0.015944, -0.949561, -0.313178]] and translation vector: [3.941113, 2.817773, 1.559826], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.984592, -0.069572, 0.160429], [-0.174152, -0.307406, 0.935507], [-0.015768, -0.949032, -0.314785]] and translation vector: [3.94407, 2.817183, 1.553188], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_56_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_56_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_56_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_56_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_56_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_56_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.677945, 0.409221, -0.610679], [0.735109, 0.38004, -0.561413], [0.00234, -0.829523, -0.558468]] and translation vector: [3.092599, 2.044437, 1.437429], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.678782, 0.408186, -0.610442], [0.734335, 0.380383, -0.562193], [0.002723, -0.829875, -0.557943]] and translation vector: [3.0892, 2.043949, 1.440375], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_57_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_57_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_57_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_57_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_57_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_57_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.330673, -0.328207, 0.884837], [-0.942686, -0.070458, 0.326157], [-0.044703, -0.941975, -0.332694]] and translation vector: [3.753276, 4.481459, 1.345242], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.306694, -0.326667, 0.893995], [-0.950878, -0.063631, 0.302957], [-0.04208, -0.942995, -0.330136]] and translation vector: [3.754864, 4.497246, 1.34429], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_58_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_58_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_58_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_58_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_58_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_58_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.305635, -0.390507, 0.868385], [-0.952144, 0.122302, -0.280116], [0.003183, -0.91244, -0.409198]] and translation vector: [4.266061, 1.773856, 1.285079], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.300987, -0.399102, 0.866097], [-0.953628, 0.125052, -0.273781], [0.00096, -0.908339, -0.418234]] and translation vector: [4.263163, 1.772832, 1.291083], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_59_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_59_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_59_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_59_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_59_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_59_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.853196, -0.330732, 0.403328], [-0.517406, -0.438892, 0.734619], [-0.065945, -0.835458, -0.545584]] and translation vector: [2.734716, 6.775187, 1.412962], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.853022, -0.336855, 0.398601], [-0.516617, -0.436898, 0.736361], [-0.0739, -0.834056, -0.546708]] and translation vector: [2.728871, 6.767794, 1.411126], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_60_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_60_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_60_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_60_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_60_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_60_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.207785, -0.462455, 0.861952], [-0.977184, 0.13779, -0.161637], [-0.044019, -0.875871, -0.480534]] and translation vector: [2.720584, 1.654419, 1.522448], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.211008, -0.462778, 0.860995], [-0.976592, 0.137438, -0.165466], [-0.04176, -0.875755, -0.480946]] and translation vector: [2.717844, 1.649691, 1.521912], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_61_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_61_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_61_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_61_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_61_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_61_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.624751, -0.31057, 0.716403], [-0.780527, -0.273701, 0.562018], [0.021534, -0.910293, -0.413403]] and translation vector: [-0.212106, 0.775797, 1.619325], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.624146, -0.312612, 0.716042], [-0.781019, -0.274551, 0.56092], [0.02124, -0.909338, -0.415515]] and translation vector: [-0.212874, 0.777223, 1.616059], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_62_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_62_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_62_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_62_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_62_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_62_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.964843, 0.186346, -0.185345], [0.252505, 0.461537, -0.850426], [-0.07293, -0.867329, -0.492364]] and translation vector: [3.779865, 2.337391, 1.461827], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.966867, 0.182729, -0.178267], [0.244986, 0.467845, -0.849178], [-0.071768, -0.864715, -0.49711]] and translation vector: [3.779708, 2.335608, 1.46105], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_63_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_63_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_63_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_63_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_63_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_63_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.935902, 0.160482, -0.313582], [0.351212, -0.493772, 0.795512], [-0.027173, -0.854655, -0.518485]] and translation vector: [4.465, -0.226232, 1.550028], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.933656, 0.161027, -0.319933], [0.356818, -0.495752, 0.791777], [-0.03111, -0.853405, -0.520319]] and translation vector: [4.478531, -0.229773, 1.540292], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_64_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_64_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_64_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_64_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_64_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_64_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.955421, 0.119616, -0.269932], [0.295248, 0.388339, -0.872939], [0.000408, -0.91372, -0.406343]] and translation vector: [2.65583, 2.981598, 1.368648], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.951595, 0.120375, -0.282803], [0.307283, 0.392547, -0.866882], [0.006663, -0.91182, -0.410535]] and translation vector: [2.655525, 2.981353, 1.361859], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_65_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_65_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_65_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_65_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_65_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_65_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.409087, -0.112571, 0.905525], [-0.910894, 0.109148, -0.397943], [-0.05404, -0.987631, -0.147191]] and translation vector: [4.421403, 3.579741, 1.526424], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.417977, -0.10834, 0.901974], [-0.906895, 0.107978, -0.407287], [-0.053267, -0.988232, -0.143386]] and translation vector: [4.418822, 3.582731, 1.526625], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_66_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_66_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_66_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_66_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_66_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_66_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.931668, 0.072515, -0.356001], [0.362912, -0.231685, 0.902561], [-0.017031, -0.970084, -0.24217]] and translation vector: [5.886859, 3.543659, 1.354971], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.931979, 0.073028, -0.355079], [0.362119, -0.233112, 0.902513], [-0.016864, -0.969704, -0.2437]] and translation vector: [5.882501, 3.543666, 1.354317], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_67_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_67_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_67_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_67_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_67_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_67_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.081815, 0.638296, -0.765431], [0.996577, -0.061545, 0.055199], [-0.011875, -0.767327, -0.641146]] and translation vector: [3.004073, 1.570726, 1.431248], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.083332, 0.64082, -0.763155], [0.996457, -0.062303, 0.056492], [-0.011346, -0.765159, -0.643742]] and translation vector: [3.00242, 1.571458, 1.432065], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_68_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_68_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_68_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_68_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_68_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_68_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.606468, -0.360414, 0.70873], [-0.789578, -0.16805, 0.590192], [-0.093612, -0.91753, -0.386492]] and translation vector: [2.373669, 6.226582, 1.48631], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.603564, -0.356146, 0.713352], [-0.791899, -0.163667, 0.588311], [-0.092772, -0.919986, -0.380815]] and translation vector: [2.370215, 6.229294, 1.484576], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_69_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_69_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_69_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_69_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_69_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_69_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.481759, -0.460793, 0.745371], [-0.875469, 0.290199, -0.386444], [-0.038235, -0.838722, -0.543216]] and translation vector: [3.08436, 2.075189, 1.468295], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.482142, -0.463533, 0.743422], [-0.87538, 0.289132, -0.387445], [-0.035354, -0.83758, -0.54517]] and translation vector: [3.085865, 2.079347, 1.468915], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_70_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_70_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_70_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_70_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_70_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_70_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.731293, 0.384445, -0.563394], [0.682011, 0.401944, -0.610984], [-0.008437, -0.831049, -0.556135]] and translation vector: [5.176627, 2.209938, 1.427488], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.733453, 0.387758, -0.558292], [0.679719, 0.411882, -0.606907], [-0.005383, -0.82462, -0.565663]] and translation vector: [5.175584, 2.209993, 1.422561], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_71_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_71_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_71_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_71_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_71_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_71_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.414473, -0.491559, 0.765887], [-0.909569, 0.196057, -0.366396], [0.029948, -0.848488, -0.528367]] and translation vector: [0.955419, 3.497842, 1.497559], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.410009, -0.490704, 0.768832], [-0.911757, 0.198024, -0.359841], [0.024328, -0.848526, -0.528594]] and translation vector: [0.937857, 3.503192, 1.495427], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_72_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_72_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_72_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_72_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_72_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_72_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.82141, -0.124481, 0.556588], [-0.562763, -0.33543, 0.755503], [0.092651, -0.933805, -0.345579]] and translation vector: [1.795382, 2.457259, 1.379582], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.820332, -0.124179, 0.558243], [-0.564621, -0.330977, 0.75608], [0.090876, -0.935432, -0.341626]] and translation vector: [1.795684, 2.460531, 1.380001], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_73_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_73_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_73_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_73_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_73_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_73_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.993805, -0.057016, 0.095394], [-0.110597, -0.423109, 0.899304], [-0.010913, -0.904283, -0.426794]] and translation vector: [3.282054, 2.568905, 1.512321], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.993106, -0.061381, 0.099861], [-0.116562, -0.427194, 0.896615], [-0.012375, -0.902074, -0.431404]] and translation vector: [3.283498, 2.568158, 1.509645], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_74_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_74_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_74_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_74_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_74_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_74_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.348231, 0.123124, -0.929288], [0.936413, -1.6e-05, 0.350899], [0.043189, -0.992391, -0.1153]] and translation vector: [2.712005, 2.075202, 1.464169], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.348319, 0.120186, -0.929639], [0.93641, 0.000395, 0.350907], [0.042542, -0.992751, -0.112406]] and translation vector: [2.712393, 2.076758, 1.463984], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_75_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_75_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_75_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_75_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_75_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_75_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.482968, -0.397392, 0.78027], [-0.874514, 0.173759, -0.452807], [0.044362, -0.901048, -0.431445]] and translation vector: [8.974016, 2.795387, 1.945192], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.496352, -0.388832, 0.776173], [-0.867003, 0.176647, -0.465943], [0.044064, -0.904216, -0.424797]] and translation vector: [8.98292, 2.792107, 1.939625], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_76_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_76_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_76_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_76_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_76_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_76_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.079918, -0.690871, 0.718547], [-0.996802, 0.055321, -0.057677], [9.6e-05, -0.720858, -0.693082]] and translation vector: [1.142658, 0.968078, 1.385987], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.080635, -0.691404, 0.717954], [-0.996742, 0.054488, -0.059473], [0.002, -0.72041, -0.693545]] and translation vector: [1.144302, 0.967344, 1.387927], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_77_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_77_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_77_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_77_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_77_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_77_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.725417, 0.297171, -0.620854], [0.687848, -0.279954, 0.669695], [0.025203, -0.912861, -0.407492]] and translation vector: [3.434752, 3.057745, 1.556519], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.722045, 0.303192, -0.621873], [0.691238, -0.278447, 0.666827], [0.029018, -0.911341, -0.410629]] and translation vector: [3.433538, 3.052318, 1.549734], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_78_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_78_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_78_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_78_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_78_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_78_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.156961, 0.257294, -0.953501], [0.986843, 0.002956, -0.161652], [-0.038773, -0.966329, -0.254373]] and translation vector: [1.838324, 1.205476, 1.480452], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.155829, 0.255617, -0.954137], [0.987039, 0.002796, -0.160453], [-0.038347, -0.966774, -0.252739]] and translation vector: [1.83996, 1.205416, 1.474648], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_79_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_79_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_79_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_79_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_79_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_79_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.922168, 0.178823, -0.342969], [0.38661, 0.453076, -0.803278], [0.011746, -0.873352, -0.486947]] and translation vector: [3.207336, 1.959871, 1.267555], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.914921, 0.180426, -0.361063], [0.403188, 0.450583, -0.796502], [0.018979, -0.874312, -0.484993]] and translation vector: [3.204391, 1.957541, 1.273759], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_80_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_80_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_80_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_80_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_80_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_80_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.241978, -0.427128, 0.871211], [-0.963615, 0.210861, -0.164264], [-0.113543, -0.879261, -0.462611]] and translation vector: [2.164319, 10.11033, 1.716674], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.23973, -0.426819, 0.871983], [-0.964754, 0.205144, -0.16482], [-0.108534, -0.880762, -0.460955]] and translation vector: [2.164643, 10.108889, 1.726434], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_81_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_81_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_81_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_81_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_81_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_81_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.424269, -0.366439, 0.828081], [-0.894198, -0.025281, 0.446957], [-0.142848, -0.930098, -0.338395]] and translation vector: [2.638367, 6.760901, 1.41712], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.432512, -0.37625, 0.819371], [-0.890339, -0.034872, 0.45396], [-0.14223, -0.925862, -0.350073]] and translation vector: [2.640049, 6.763855, 1.420073], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_82_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_82_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_82_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_82_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_82_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_82_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.645842, -0.099101, 0.757012], [-0.761541, -0.013148, 0.647984], [-0.054263, -0.994991, -0.083961]] and translation vector: [3.729951, 1.432448, 1.733539], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.649827, -0.099601, 0.753528], [-0.757797, -0.00807, 0.652441], [-0.058903, -0.994995, -0.080722]] and translation vector: [3.727943, 1.43259, 1.731865], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_83_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_83_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_83_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_83_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_83_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_83_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.686341, -0.358824, 0.632599], [-0.727213, -0.35045, 0.590209], [0.009912, -0.865119, -0.50147]] and translation vector: [2.486494, 4.601647, 1.455454], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.681394, -0.352774, 0.64129], [-0.731846, -0.340576, 0.590263], [0.010179, -0.871527, -0.490243]] and translation vector: [2.480601, 4.595852, 1.449959], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_84_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_84_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_84_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_84_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_84_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_84_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.515401, -0.339121, 0.786994], [-0.847541, -0.337435, 0.40965], [0.126638, -0.878143, -0.461333]] and translation vector: [4.776819, 1.138867, 1.280463], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.495978, -0.33911, 0.799381], [-0.859276, -0.324304, 0.395565], [0.125103, -0.88308, -0.452237]] and translation vector: [4.773187, 1.14016, 1.284317], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_85_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_85_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_85_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_85_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_85_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_85_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.799511, 0.533863, -0.275266], [0.600541, 0.71925, -0.349328], [0.011492, -0.4446, -0.895656]] and translation vector: [2.031323, 2.312379, 1.200993], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.794986, 0.540559, -0.275306], [0.606553, 0.715482, -0.346669], [0.009582, -0.442584, -0.896676]] and translation vector: [2.031011, 2.313572, 1.199732], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_86_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_86_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_86_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_86_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_86_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_86_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.830629, 0.239867, -0.502514], [0.556756, 0.37214, -0.742654], [0.008867, -0.896647, -0.442658]] and translation vector: [4.849209, 2.614689, 1.447477], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.826514, 0.239564, -0.509396], [0.562778, 0.371773, -0.738286], [0.012512, -0.89688, -0.442097]] and translation vector: [4.848542, 2.612423, 1.449706], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_87_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_87_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_87_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_87_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_87_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_87_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.977181, 0.077241, -0.197866], [0.211774, -0.426158, 0.879512], [-0.016388, -0.901345, -0.432791]] and translation vector: [0.977323, 0.877303, 1.40232], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.979446, 0.063797, -0.19135], [0.200663, -0.404476, 0.892263], [-0.020472, -0.912321, -0.408965]] and translation vector: [0.961423, 0.875672, 1.418643], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_88_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_88_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_88_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_88_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_88_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_88_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.977514, -0.102294, 0.184398], [-0.210796, -0.497303, 0.841578], [0.005613, -0.861525, -0.507684]] and translation vector: [3.555602, 1.207732, 1.356493], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.976582, -0.105336, 0.187593], [-0.215087, -0.498001, 0.840079], [0.00493, -0.860755, -0.508995]] and translation vector: [3.555365, 1.207812, 1.356155], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_89_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_89_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_89_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_89_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_89_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_89_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.869565, 0.231948, -0.435955], [0.492522, 0.471291, -0.731647], [0.035758, -0.850932, -0.524058]] and translation vector: [2.750575, 3.154689, 1.290553], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.871211, 0.246607, -0.424472], [0.49036, 0.478017, -0.72873], [0.023195, -0.843022, -0.53738]] and translation vector: [2.712538, 3.137298, 1.287246], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_90_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_90_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_90_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_90_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_90_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_90_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.991592, 0.052224, -0.118397], [0.1292, -0.348306, 0.928435], [0.007248, -0.935925, -0.352124]] and translation vector: [2.177373, 2.142725, 1.46728], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.992093, 0.047571, -0.11614], [0.125441, -0.346386, 0.929667], [0.003996, -0.936885, -0.349615]] and translation vector: [2.181058, 2.142908, 1.465582], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_91_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_91_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_91_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_91_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_91_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_91_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.987126, 0.106622, -0.119219], [0.159938, -0.652529, 0.740693], [0.00118, -0.750225, -0.661181]] and translation vector: [4.64166, 4.052867, 1.404314], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.987387, 0.107853, -0.115912], [0.158278, -0.654013, 0.73974], [0.003975, -0.748756, -0.662834]] and translation vector: [4.649776, 4.051806, 1.400746], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_92_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_92_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_92_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_92_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_92_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_92_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.892065, -0.360019, 0.273141], [-0.443019, -0.577417, 0.685801], [-0.089185, -0.732786, -0.674589]] and translation vector: [2.898737, 2.45906, 1.649541], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.888376, -0.366176, 0.276954], [-0.450762, -0.581088, 0.677606], [-0.087189, -0.726809, -0.681283]] and translation vector: [2.873446, 2.440832, 1.651115], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_93_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_93_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_93_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_93_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_93_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_93_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.610102, 0.375008, -0.697958], [0.791763, 0.255448, -0.554849], [-0.029781, -0.891132, -0.452767]] and translation vector: [2.349929, 1.419923, 1.358478], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.607496, 0.374505, -0.700496], [0.793845, 0.255679, -0.551759], [-0.027534, -0.891277, -0.452623]] and translation vector: [2.354864, 1.421781, 1.358478], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_94_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_94_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_94_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_94_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_94_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_94_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.386761, -0.304254, 0.870543], [-0.920043, 0.191539, -0.34181], [-0.062746, -0.933136, -0.354007]] and translation vector: [2.082368, 4.008438, 1.845888], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.387201, -0.298257, 0.872421], [-0.919947, 0.188025, -0.344013], [-0.061432, -0.935783, -0.347183]] and translation vector: [2.08001, 4.010775, 1.842824], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_95_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_95_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_95_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_95_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_95_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_95_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.993306, 0.029023, -0.111812], [0.110831, -0.512349, 0.851596], [-0.032571, -0.858287, -0.512136]] and translation vector: [2.482234, 1.391135, 1.348064], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.992702, 0.031717, -0.116349], [0.116167, -0.510508, 0.85199], [-0.032374, -0.859288, -0.510467]] and translation vector: [2.48213, 1.388715, 1.34704], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_96_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_96_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_96_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_96_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_96_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_96_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.660671, 0.426343, -0.617856], [0.749322, -0.423957, 0.508701], [-0.045063, -0.799057, -0.599565]] and translation vector: [1.739014, 2.260029, 1.323145], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.661948, 0.412501, -0.625834], [0.748146, -0.41469, 0.517987], [-0.045857, -0.811095, -0.583114]] and translation vector: [1.741474, 2.257287, 1.327618], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_97_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_97_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_97_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_97_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_97_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_97_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.283698, -0.38675, 0.877463], [-0.95878, 0.129662, -0.252839], [-0.015988, -0.913024, -0.407593]] and translation vector: [3.69525, 3.551647, 1.352095], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.292652, -0.378333, 0.878191], [-0.956147, 0.127043, -0.2639], [-0.011726, -0.91691, -0.398922]] and translation vector: [3.694781, 3.553972, 1.346799], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_98_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_98_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_98_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_98_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_98_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_98_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.988959, -0.006087, -0.148062], [0.148117, 0.009943, 0.98892], [-0.004548, -0.999932, 0.010735]] and translation vector: [3.911582, 2.672538, 1.565046], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.987297, -0.007995, -0.158684], [0.158774, 0.012251, 0.987239], [-0.005949, -0.999893, 0.013365]] and translation vector: [3.955948, 2.679338, 1.574419], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_99_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_99_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_99_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_99_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_99_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_99_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.95695, -0.100486, 0.272304], [-0.288986, 0.24231, -0.92616], [0.027085, -0.964981, -0.260918]] and translation vector: [1.227478, 4.879099, 1.55452], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.957752, -0.097454, 0.27058], [-0.286469, 0.240112, -0.927514], [0.025421, -0.965841, -0.257885]] and translation vector: [1.221714, 4.885019, 1.554874], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_100_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_100_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_100_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_100_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_100_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_100_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.895509, 0.17248, -0.410263], [0.444823, 0.375965, -0.812886], [0.014038, -0.91044, -0.413402]] and translation vector: [2.818061, 5.409916, 1.54775], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.895274, 0.172164, -0.410907], [0.445264, 0.376844, -0.812237], [0.01501, -0.910136, -0.414037]] and translation vector: [2.819061, 5.407142, 1.548651], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_101_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_101_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_101_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_101_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_101_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_101_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.355681, -0.20797, 0.911175], [-0.934036, 0.113197, -0.338769], [-0.032689, -0.971563, -0.234514]] and translation vector: [0.539195, 4.841905, 1.636959], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.354881, -0.205091, 0.912139], [-0.934375, 0.110848, -0.338608], [-0.031664, -0.972446, -0.230969]] and translation vector: [0.533365, 4.84225, 1.627512], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_102_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_102_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_102_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_102_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_102_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_102_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.506976, -0.449046, 0.735753], [-0.861802, 0.247713, -0.442646], [0.016513, -0.858485, -0.512574]] and translation vector: [1.568574, 4.423309, 1.333385], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.503836, -0.444181, 0.740846], [-0.863753, 0.25025, -0.437385], [0.008882, -0.860278, -0.509748]] and translation vector: [1.576928, 4.418399, 1.331934], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_103_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_103_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_103_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_103_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_103_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_103_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.221984, 0.421429, -0.879273], [0.97466, 0.121427, -0.187867], [0.027595, -0.898695, -0.437705]] and translation vector: [3.155292, 0.483793, 1.35371], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.224547, 0.416482, -0.880978], [0.973822, 0.128715, -0.187361], [0.035363, -0.899986, -0.434482]] and translation vector: [3.157119, 0.483672, 1.354178], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_104_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_104_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_104_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_104_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_104_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_104_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.997074, 0.061747, -0.045056], [0.074474, 0.651998, -0.754554], [-0.017215, -0.755702, -0.654689]] and translation vector: [1.815792, 5.369752, 1.288561], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.994543, 0.080066, -0.066881], [0.102674, 0.63762, -0.763478], [-0.018484, -0.766179, -0.642361]] and translation vector: [1.819087, 5.36055, 1.286161], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_105_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_105_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_105_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_105_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_105_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_105_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.476704, 0.41796, -0.773345], [0.878176, 0.186897, -0.440314], [-0.039498, -0.889033, -0.456137]] and translation vector: [2.405627, 4.675593, 1.276166], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.455958, 0.42895, -0.779811], [0.88909, 0.179883, -0.420905], [-0.040272, -0.885237, -0.463394]] and translation vector: [2.408911, 4.675395, 1.276879], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_106_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_106_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_106_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_106_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_106_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_106_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.573165, 0.475287, -0.667521], [0.819422, -0.337921, 0.462988], [-0.005517, -0.81235, -0.583144]] and translation vector: [4.230747, 1.597944, 1.425469], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.580595, 0.472456, -0.663095], [0.814187, -0.339873, 0.470729], [-0.002969, -0.813186, -0.581996]] and translation vector: [4.228813, 1.597838, 1.42741], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_107_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_107_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_107_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_107_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_107_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_107_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.218501, -0.721835, 0.656667], [-0.97193, -0.10083, 0.212566], [-0.087226, -0.684681, -0.723605]] and translation vector: [2.10902, 2.428258, 1.386435], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.218569, -0.722397, 0.656026], [-0.971546, -0.098231, 0.215522], [-0.091251, -0.684466, -0.723312]] and translation vector: [2.107975, 2.430531, 1.385643], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_108_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_108_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_108_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_108_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_108_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_108_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.819759, -0.274444, 0.502669], [-0.572709, 0.39303, -0.719397], [-0.00013, -0.877615, -0.479366]] and translation vector: [2.765326, 1.370172, 1.355227], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.819555, -0.26888, 0.505998], [-0.572993, 0.389095, -0.721307], [-0.002936, -0.881084, -0.472951]] and translation vector: [2.765196, 1.369276, 1.358405], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_109_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_109_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_109_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_109_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_109_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_109_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.769532, -0.429513, 0.472588], [-0.615738, -0.302759, 0.727464], [-0.169375, -0.850797, -0.49745]] and translation vector: [2.184386, 2.253813, 1.283805], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.76638, -0.428136, 0.478917], [-0.620171, -0.298738, 0.725357], [-0.167481, -0.85291, -0.494464]] and translation vector: [2.185226, 2.257666, 1.286817], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_110_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_110_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_110_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_110_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_110_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_110_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.493838, -0.420518, 0.76111], [-0.864926, -0.147366, 0.479777], [-0.089593, -0.895236, -0.436493]] and translation vector: [0.736944, 2.108944, 1.402726], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.487676, -0.423405, 0.763479], [-0.869284, -0.154634, 0.469504], [-0.080731, -0.892646, -0.443471]] and translation vector: [0.733117, 2.095654, 1.39687], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_111_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_111_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_111_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_111_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_111_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_111_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.341382, 0.594812, -0.727775], [0.932196, 0.11517, -0.343142], [-0.120287, -0.795572, -0.593798]] and translation vector: [7.151203, 3.587152, 1.581923], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.344041, 0.585523, -0.734029], [0.930897, 0.110501, -0.348168], [-0.122749, -0.803089, -0.583079]] and translation vector: [7.150104, 3.60012, 1.584136], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_112_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_112_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_112_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_112_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_112_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_112_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.060487, 0.154719, -0.986105], [0.998165, 0.006603, -0.060191], [-0.002801, -0.987936, -0.154835]] and translation vector: [6.630666, 2.572317, 1.44523], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.062036, 0.175232, -0.982571], [0.998074, 0.011306, -0.060998], [0.00042, -0.984462, -0.175596]] and translation vector: [6.62843, 2.567178, 1.442285], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_113_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_113_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_113_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_113_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_113_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_113_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.207705, 0.494542, -0.843971], [0.97739, -0.069996, 0.199524], [0.039599, -0.866331, -0.497898]] and translation vector: [4.53083, 2.291093, 1.52739], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.209269, 0.494574, -0.843566], [0.977066, -0.071037, 0.200739], [0.039356, -0.866228, -0.498097]] and translation vector: [4.529976, 2.291335, 1.526507], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_114_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_114_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_114_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_114_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_114_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_114_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.778266, 0.076502, -0.623257], [0.626532, 0.028295, -0.778882], [-0.041951, -0.996668, -0.069952]] and translation vector: [4.354075, 2.27787, 1.510689], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.774603, 0.078895, -0.627508], [0.631084, 0.031306, -0.775082], [-0.041505, -0.996391, -0.074039]] and translation vector: [4.353431, 2.276987, 1.507071], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_115_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_115_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_115_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_115_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_115_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_115_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.982764, 0.054289, -0.17671], [0.184841, -0.27426, 0.943724], [0.002769, -0.960122, -0.279568]] and translation vector: [4.072058, 1.220293, 1.47625], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.982485, 0.057917, -0.177113], [0.186218, -0.270474, 0.944546], [0.0068, -0.960984, -0.276522]] and translation vector: [4.071517, 1.218265, 1.477941], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_116_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_116_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_116_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_116_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_116_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_116_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.286652, 0.220257, -0.932372], [0.958024, -0.061246, 0.28007], [0.004584, -0.973517, -0.228568]] and translation vector: [3.76659, 1.676076, 1.452194], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.299829, 0.216367, -0.929133], [0.953977, -0.07366, 0.290693], [-0.005544, -0.973529, -0.228495]] and translation vector: [3.753121, 1.670498, 1.452776], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_117_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_117_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_117_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_117_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_117_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_117_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.848489, -0.131122, 0.512712], [-0.527579, 0.133483, -0.838954], [0.041567, -0.982339, -0.182436]] and translation vector: [2.702568, 1.718074, 1.602473], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.851363, -0.128939, 0.508484], [-0.523333, 0.142037, -0.840207], [0.036112, -0.981428, -0.188403]] and translation vector: [2.706553, 1.721294, 1.602035], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_118_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_118_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_118_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_118_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_118_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_118_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.764638, 0.028658, -0.643823], [0.64431, -0.055554, 0.762744], [-0.013909, -0.998044, -0.060944]] and translation vector: [3.061982, 3.98913, 1.495508], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.765028, 0.027801, -0.643396], [0.643825, -0.056098, 0.763114], [-0.014878, -0.998038, -0.060816]] and translation vector: [3.064652, 3.991985, 1.487138], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_119_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_119_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_119_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_119_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_119_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_119_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.935878, -0.161972, 0.312885], [-0.352322, 0.433116, -0.829627], [-0.001139, -0.886666, -0.46241]] and translation vector: [1.123681, 2.231354, 1.408983], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.935522, -0.159, 0.315466], [-0.353249, 0.430874, -0.830399], [-0.003893, -0.888294, -0.459258]] and translation vector: [1.123559, 2.231523, 1.408322], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_120_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_120_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_120_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_120_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_120_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_120_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.45377, -0.425062, 0.783208], [-0.891046, 0.227634, -0.392708], [-0.01136, -0.876074, -0.482043]] and translation vector: [2.25004, 3.862298, 1.519108], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.453547, -0.422981, 0.784463], [-0.891155, 0.226808, -0.392938], [-0.011717, -0.877294, -0.47981]] and translation vector: [2.249275, 3.861866, 1.519019], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_121_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_121_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_121_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_121_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_121_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_121_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.205292, 0.226186, -0.952205], [0.97316, -0.150555, 0.174048], [-0.103992, -0.962379, -0.251024]] and translation vector: [4.876985, 2.837537, 1.671042], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.210488, 0.22021, -0.952472], [0.971775, -0.153305, 0.17931], [-0.106533, -0.96333, -0.246263]] and translation vector: [4.87733, 2.840179, 1.675237], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_122_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_122_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_122_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_122_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_122_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_122_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.037281, 0.595041, -0.80283], [0.998378, -0.012419, -0.055566], [-0.043034, -0.803599, -0.593613]] and translation vector: [3.95675, 2.244474, 1.442954], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.038109, 0.594465, -0.803218], [0.998341, -0.012073, -0.056302], [-0.043167, -0.80403, -0.593019]] and translation vector: [3.957906, 2.244142, 1.441716], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_123_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_123_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_123_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_123_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_123_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_123_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.070416, -0.411804, 0.908548], [-0.99671, 0.065705, -0.047468], [-0.040148, -0.908901, -0.415075]] and translation vector: [2.214543, 1.806687, 1.391502], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.072195, -0.409813, 0.909308], [-0.996578, 0.066438, -0.049181], [-0.040258, -0.909747, -0.413207]] and translation vector: [2.216063, 1.808517, 1.395188], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_124_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_124_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_124_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_124_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_124_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_124_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.086843, 0.425015, -0.901011], [0.995696, 0.066429, -0.064634], [0.032383, -0.902745, -0.428955]] and translation vector: [4.261571, 5.85756, 1.66629], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.086953, 0.422316, -0.902268], [0.995713, 0.06553, -0.065286], [0.031554, -0.904077, -0.426204]] and translation vector: [4.260677, 5.865657, 1.669414], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_125_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_125_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_125_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_125_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_125_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_125_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.882784, 0.25224, -0.396318], [0.469583, -0.498211, 0.728888], [-0.013595, -0.829554, -0.55826]] and translation vector: [3.463734, 1.394934, 1.262723], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.883097, 0.250738, -0.396574], [0.468931, -0.499833, 0.728197], [-0.015634, -0.829034, -0.558979]] and translation vector: [3.462241, 1.393432, 1.262782], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_126_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_126_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_126_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_126_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_126_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_126_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.079656, -0.319192, 0.944337], [-0.994012, 0.096527, -0.051219], [-0.074805, -0.942762, -0.324969]] and translation vector: [4.3352, 2.935251, 1.464921], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.08136, -0.319768, 0.943996], [-0.993796, 0.098086, -0.052427], [-0.075828, -0.942405, -0.325765]] and translation vector: [4.335558, 2.933583, 1.460394], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_127_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_127_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_127_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_127_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_127_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_127_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.255252, -0.433184, 0.864406], [-0.966562, 0.137073, -0.216725], [-0.024605, -0.890821, -0.453687]] and translation vector: [1.468232, 3.881342, 1.432686], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.253329, -0.437174, 0.862962], [-0.967015, 0.138948, -0.213484], [-0.026577, -0.888579, -0.457953]] and translation vector: [1.469363, 3.879031, 1.438972], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_128_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_128_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_128_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_128_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_128_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_128_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.999403, 0.004498, 0.03425], [-0.034232, -0.004158, 0.999405], [0.004638, -0.999981, -0.004001]] and translation vector: [2.393484, 5.775056, 1.371464], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.998454, -0.001139, 0.055575], [-0.055569, 0.004857, 0.998443], [-0.001408, -0.999988, 0.004786]] and translation vector: [2.356134, 5.774678, 1.367739], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_129_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_129_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_129_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_129_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_129_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_129_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.233902, -0.58763, 0.774584], [-0.967246, -0.059828, 0.246692], [-0.098622, -0.806915, -0.582377]] and translation vector: [0.860343, 3.117731, 1.418568], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.233684, -0.587102, 0.775051], [-0.967496, -0.061159, 0.24538], [-0.096661, -0.8072, -0.58231]] and translation vector: [0.859973, 3.119137, 1.418853], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_130_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_130_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_130_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_130_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_130_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_130_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.767458, -0.265442, 0.583565], [-0.640543, 0.35536, -0.680752], [-0.026676, -0.896248, -0.442751]] and translation vector: [3.343537, 3.697402, 1.375352], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.780866, -0.263741, 0.566294], [-0.624403, 0.357431, -0.694525], [-0.019236, -0.895926, -0.443786]] and translation vector: [3.344022, 3.709659, 1.376654], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_131_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_131_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_131_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_131_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_131_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_131_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.623567, 0.536294, -0.568817], [0.781209, -0.455034, 0.427384], [-0.029628, -0.710867, -0.702702]] and translation vector: [1.790477, 1.816361, 1.229059], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.636074, 0.528408, -0.562313], [0.771074, -0.462894, 0.437235], [-0.029252, -0.711698, -0.701876]] and translation vector: [1.794875, 1.819226, 1.230937], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_132_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_132_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_132_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_132_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_132_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_132_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.140295, 0.625342, -0.767636], [0.990108, -0.090149, 0.107516], [-0.001967, -0.775126, -0.631804]] and translation vector: [3.410891, 3.073526, 1.198756], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.148525, 0.612201, -0.776627], [0.988818, -0.102561, 0.108258], [-0.013376, -0.784022, -0.620589]] and translation vector: [3.421496, 3.097678, 1.206193], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_133_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_133_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_133_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_133_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_133_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_133_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.14018, 0.443083, -0.885453], [0.989985, -0.07783, 0.117782], [-0.016727, -0.893096, -0.449556]] and translation vector: [3.549726, 0.935059, 1.485921], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.140682, 0.443565, -0.885132], [0.989931, -0.077142, 0.11868], [-0.015638, -0.892916, -0.449951]] and translation vector: [3.549777, 0.934132, 1.483108], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_134_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_134_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_134_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_134_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_134_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_134_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.631332, 0.312126, -0.709927], [0.775472, -0.26347, 0.573784], [-0.007951, -0.912776, -0.408382]] and translation vector: [1.600176, 0.624978, 1.327739], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.627277, 0.311053, -0.713982], [0.778666, -0.267257, 0.567673], [-0.014241, -0.912041, -0.409851]] and translation vector: [1.601099, 0.627571, 1.328079], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_135_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_135_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_135_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_135_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_135_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_135_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.963317, 0.154363, -0.219528], [0.260086, 0.335369, -0.905474], [-0.066149, -0.929355, -0.363214]] and translation vector: [5.972451, 2.818726, 1.468896], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.963149, 0.154275, -0.220326], [0.260736, 0.334417, -0.905639], [-0.066037, -0.929712, -0.362318]] and translation vector: [5.973901, 2.819783, 1.467855], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_136_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_136_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_136_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_136_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_136_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_136_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.983299, 0.047874, -0.175588], [0.180439, -0.382417, 0.9062], [-0.023764, -0.922749, -0.384668]] and translation vector: [2.208684, 3.483128, 1.468268], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.982577, 0.045136, -0.18029], [0.183889, -0.376806, 0.907856], [-0.026957, -0.925192, -0.378541]] and translation vector: [2.211137, 3.481059, 1.465482], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_137_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_137_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_137_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_137_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_137_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_137_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.824719, -0.175736, 0.537546], [-0.564369, 0.316962, -0.762249], [-0.036427, -0.932015, -0.360584]] and translation vector: [4.397487, 4.054199, 1.411764], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.821778, -0.181799, 0.540028], [-0.568729, 0.319986, -0.757731], [-0.035047, -0.929816, -0.366351]] and translation vector: [4.391561, 4.044915, 1.406417], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_138_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_138_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_138_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_138_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_138_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_138_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.804945, -0.278842, 0.523748], [-0.593014, 0.407765, -0.694307], [-0.019964, -0.869468, -0.493585]] and translation vector: [4.871809, 2.494869, 1.402737], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.804444, -0.274614, 0.526742], [-0.593612, 0.404842, -0.695506], [-0.022252, -0.872176, -0.488687]] and translation vector: [4.863627, 2.491699, 1.400121], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_139_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_139_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_139_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_139_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_139_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_139_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.000188, -0.47362, 0.88073], [-0.997828, 0.057931, 0.031365], [-0.065877, -0.878822, -0.47258]] and translation vector: [4.366519, 5.511691, 1.307889], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.002248, -0.465195, 0.885205], [-0.998254, 0.053289, 0.02547], [-0.05902, -0.883603, -0.464503]] and translation vector: [4.36891, 5.516212, 1.317108], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_140_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_140_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_140_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_140_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_140_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_140_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.881415, -0.308012, 0.3581], [-0.47008, 0.646119, -0.601294], [-0.046169, -0.698325, -0.71429]] and translation vector: [3.147524, 1.689608, 1.273114], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.879224, -0.311908, 0.360109], [-0.474637, 0.638627, -0.605703], [-0.041052, -0.703469, -0.709539]] and translation vector: [3.141599, 1.689583, 1.27073], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_141_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_141_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_141_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_141_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_141_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_141_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.954506, 0.05554, -0.292973], [0.288831, -0.41644, 0.862064], [-0.074127, -0.907465, -0.413536]] and translation vector: [2.66447, 1.005586, 1.476015], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.956668, 0.052296, -0.286448], [0.280824, -0.425753, 0.860158], [-0.076973, -0.903327, -0.42199]] and translation vector: [2.657996, 1.004761, 1.470821], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_142_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_142_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_142_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_142_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_142_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_142_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.804414, -0.195207, 0.561082], [-0.593456, -0.306943, 0.74404], [0.026978, -0.931494, -0.362756]] and translation vector: [4.397897, 1.805397, 1.263968], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.81043, -0.19082, 0.553888], [-0.585149, -0.309439, 0.749566], [0.028363, -0.931577, -0.362436]] and translation vector: [4.406421, 1.797547, 1.276681], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_143_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_143_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_143_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_143_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_143_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_143_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.924593, 0.219455, -0.311397], [0.371095, 0.334047, -0.86643], [-0.086121, -0.916653, -0.390296]] and translation vector: [7.650298, 2.745242, 1.444521], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.925403, 0.221817, -0.30729], [0.368562, 0.337876, -0.866026], [-0.088274, -0.914679, -0.394425]] and translation vector: [7.650829, 2.747432, 1.442508], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_144_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_144_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_144_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_144_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_144_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_144_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.173351, 0.592298, -0.78685], [0.984858, -0.105806, 0.137329], [-0.001913, -0.798742, -0.601671]] and translation vector: [3.264189, 1.940071, 1.28435], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.172933, 0.589263, -0.789217], [0.98493, -0.105695, 0.136901], [-0.002745, -0.800998, -0.598661]] and translation vector: [3.267153, 1.942133, 1.284021], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_145_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_145_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_145_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_145_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_145_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_145_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.861262, 0.35211, -0.366398], [0.508128, 0.60504, -0.61297], [0.005853, -0.714105, -0.700014]] and translation vector: [3.145762, 3.637784, 1.437024], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.859655, 0.347273, -0.374693], [0.510745, 0.600786, -0.614977], [0.011546, -0.720041, -0.693836]] and translation vector: [3.145171, 3.63531, 1.440385], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_146_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_146_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_146_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_146_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_146_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_146_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.927869, -0.125596, 0.351119], [-0.372891, -0.32108, 0.870551], [0.003399, -0.938687, -0.344754]] and translation vector: [5.442723, 4.031985, 1.348893], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.928984, -0.124208, 0.348657], [-0.370086, -0.32475, 0.870387], [0.005117, -0.937609, -0.347654]] and translation vector: [5.438782, 4.038163, 1.363364], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_147_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_147_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_147_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_147_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_147_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_147_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.32152, -0.4706, 0.821681], [-0.946681, 0.178549, -0.268172], [-0.020508, -0.864092, -0.502915]] and translation vector: [2.120097, 2.367636, 1.494245], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.324752, -0.471365, 0.819971], [-0.945715, 0.173395, -0.274877], [-0.012612, -0.864725, -0.502087]] and translation vector: [2.101204, 2.346659, 1.492081], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_148_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_148_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_148_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_148_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_148_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_148_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.857694, 0.203115, -0.472341], [0.513544, 0.293426, -0.806333], [-0.025181, -0.934155, -0.355978]] and translation vector: [3.161674, 3.662206, 1.335287], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.856666, 0.203827, -0.473897], [0.515344, 0.296604, -0.804019], [-0.023321, -0.932995, -0.359132]] and translation vector: [3.164327, 3.659025, 1.330704], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_149_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_149_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_149_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_149_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_149_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_149_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.975982, 0.033782, -0.215214], [0.215389, -0.297687, 0.930048], [-0.032648, -0.954066, -0.297814]] and translation vector: [2.838751, 1.414222, 1.664536], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.976127, 0.034525, -0.21444], [0.21483, -0.298963, 0.929769], [-0.03201, -0.95364, -0.299243]] and translation vector: [2.83798, 1.414721, 1.663024], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_150_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_150_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_150_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_150_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_150_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_150_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.844798, -0.442354, 0.301064], [-0.534849, 0.714819, -0.450523], [-0.015916, -0.541624, -0.84047]] and translation vector: [3.085932, 7.995926, 1.934485], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.833593, -0.457276, 0.309873], [-0.552243, 0.702368, -0.449118], [-0.012274, -0.545507, -0.838017]] and translation vector: [3.091993, 8.002051, 1.93396], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_151_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_151_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_151_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_151_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_151_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_151_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.956223, -0.170898, 0.237554], [-0.292595, -0.544035, 0.786393], [-0.005155, -0.821474, -0.570223]] and translation vector: [1.275326, 2.834272, 1.3185], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.956815, -0.170774, 0.235249], [-0.290631, -0.544392, 0.786875], [-0.00631, -0.821263, -0.570514]] and translation vector: [1.276568, 2.833979, 1.318089], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_152_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_152_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_152_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_152_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_152_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_152_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.852441, 0.228219, -0.470383], [0.522431, 0.337001, -0.78326], [-0.020235, -0.913426, -0.406502]] and translation vector: [1.798405, 5.320803, 1.619482], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.850776, 0.231102, -0.471988], [0.52508, 0.336676, -0.781627], [-0.021728, -0.91282, -0.407783]] and translation vector: [1.793927, 5.32593, 1.618758], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_153_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_153_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_153_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_153_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_153_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_153_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.999494, 0.005595, 0.031322], [-0.029883, 0.172936, -0.98448], [-0.010925, -0.984917, -0.172681]] and translation vector: [6.687301, 5.436423, 1.742894], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.999393, 0.00615, 0.034285], [-0.032681, 0.175053, -0.984017], [-0.012053, -0.98454, -0.174746]] and translation vector: [6.681215, 5.427393, 1.75699], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_154_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_154_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_154_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_154_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_154_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_154_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.99336, -0.011945, -0.114427], [0.103059, -0.349694, 0.931178], [-0.051137, -0.936788, -0.346141]] and translation vector: [2.948285, 4.432959, 1.460427], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.99314, -0.016022, -0.115825], [0.102925, -0.35027, 0.930977], [-0.055486, -0.936512, -0.346218]] and translation vector: [2.949102, 4.433566, 1.463483], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_155_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_155_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_155_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_155_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_155_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_155_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.40936, -0.486807, 0.77165], [-0.912164, 0.236459, -0.334729], [-0.019515, -0.840896, -0.540844]] and translation vector: [1.412713, 1.214489, 1.390939], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.417972, -0.487805, 0.766384], [-0.908352, 0.237425, -0.344277], [-0.014019, -0.840045, -0.542336]] and translation vector: [1.411881, 1.212071, 1.390231], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_156_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_156_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_156_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_156_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_156_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_156_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.442667, -0.46733, 0.765277], [-0.896368, 0.253361, -0.363776], [-0.023888, -0.847001, -0.531054]] and translation vector: [2.453469, 1.905797, 1.451684], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.441405, -0.472001, 0.763136], [-0.897015, 0.253848, -0.361837], [-0.022933, -0.844261, -0.535442]] and translation vector: [2.45238, 1.90449, 1.449179], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_157_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_157_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_157_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_157_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_157_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_157_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.88123, -0.188698, 0.433389], [-0.470321, -0.258404, 0.843816], [-0.047237, -0.947428, -0.316462]] and translation vector: [1.061636, 1.321782, 1.457525], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.879526, -0.187337, 0.437423], [-0.473303, -0.249401, 0.844857], [-0.049179, -0.950107, -0.308022]] and translation vector: [1.052651, 1.315727, 1.459226], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_158_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_158_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_158_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_158_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_158_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_158_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.15851, 0.420096, -0.893529], [0.981106, -0.034663, -0.190342], [-0.110934, -0.906817, -0.406664]] and translation vector: [4.004256, 0.910349, 2.578562], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.153085, 0.419732, -0.894645], [0.982322, -0.034068, -0.184071], [-0.107739, -0.907009, -0.407097]] and translation vector: [4.005316, 0.908549, 2.574668], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_159_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_159_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_159_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_159_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_159_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_159_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.565317, -0.50256, 0.654103], [-0.824719, 0.328974, -0.460017], [0.016003, -0.799506, -0.600445]] and translation vector: [4.07549, 5.065369, 1.281872], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.538132, -0.502349, 0.676801], [-0.842747, 0.30749, -0.441846], [0.013851, -0.808143, -0.588824]] and translation vector: [4.054681, 5.042427, 1.283033], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_160_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_160_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_160_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_160_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_160_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_160_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.998162, -0.007354, -0.06016], [0.055338, 0.294228, -0.954132], [0.024717, -0.955707, -0.293281]] and translation vector: [1.687981, 4.43329, 1.569003], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.998237, -0.004775, -0.059163], [0.055295, 0.287523, -0.956176], [0.021577, -0.957762, -0.286752]] and translation vector: [1.687716, 4.435163, 1.571974], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_161_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_161_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_161_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_161_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_161_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_161_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.30056, -0.511506, 0.805], [-0.953151, 0.130866, -0.272721], [0.034151, -0.849256, -0.526876]] and translation vector: [-0.281614, 2.924112, 1.306122], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.318531, -0.50267, 0.803655], [-0.947336, 0.139247, -0.288383], [0.033055, -0.85319, -0.520551]] and translation vector: [-0.284617, 2.924129, 1.305331], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_162_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_162_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_162_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_162_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_162_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_162_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.815869, 0.244354, -0.524069], [0.578211, -0.336271, 0.743367], [0.005416, -0.909513, -0.415641]] and translation vector: [2.358014, 1.230078, 1.369842], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.817563, 0.244526, -0.521342], [0.575764, -0.332513, 0.746947], [0.009295, -0.910847, -0.41264]] and translation vector: [2.355037, 1.229076, 1.372478], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_163_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_163_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_163_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_163_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_163_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_163_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.971613, -0.06682, 0.226943], [-0.235147, 0.378036, -0.89543], [-0.02596, -0.923376, -0.383017]] and translation vector: [2.775299, 4.618156, 1.427592], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.969099, -0.066923, 0.237421], [-0.244849, 0.377786, -0.892932], [-0.029937, -0.923471, -0.382498]] and translation vector: [2.770648, 4.620754, 1.418404], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_164_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_164_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_164_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_164_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_164_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_164_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.117057, -0.769276, 0.628102], [-0.987232, -0.021336, 0.157855], [-0.108033, -0.638561, -0.761951]] and translation vector: [1.032686, 1.226834, 2.186959], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.111522, -0.769903, 0.628341], [-0.98843, -0.020525, 0.150284], [-0.102807, -0.637831, -0.763284]] and translation vector: [1.037875, 1.232625, 2.186027], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_165_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_165_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_165_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_165_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_165_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_165_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.053762, 0.423971, -0.904079], [0.99709, -0.071809, 0.025618], [-0.05406, -0.902825, -0.426597]] and translation vector: [3.696534, 7.381392, 1.65485], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.059051, 0.424044, -0.903714], [0.996629, -0.076693, 0.029136], [-0.056954, -0.902388, -0.427143]] and translation vector: [3.693501, 7.384472, 1.654036], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_166_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_166_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_166_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_166_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_166_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_166_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.255196, -0.436856, 0.862573], [-0.966393, 0.143834, -0.213066], [-0.030988, -0.887958, -0.45888]] and translation vector: [1.734999, 0.744851, 1.432124], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.254375, -0.435236, 0.863634], [-0.966628, 0.142475, -0.21291], [-0.03038, -0.888972, -0.456953]] and translation vector: [1.735377, 0.747301, 1.433656], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_167_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_167_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_167_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_167_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_167_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_167_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.299058, 0.37418, -0.877812], [0.95368, -0.085842, 0.288314], [0.032528, -0.923375, -0.38252]] and translation vector: [3.908031, 4.993837, 1.41318], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.301871, 0.365699, -0.880419], [0.952911, -0.087746, 0.290279], [0.028901, -0.926588, -0.374966]] and translation vector: [3.903484, 4.991583, 1.422828], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_168_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_168_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_168_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_168_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_168_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_168_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.349467, 0.022881, -0.936669], [0.936944, -0.011774, 0.349282], [-0.003037, -0.999669, -0.025553]] and translation vector: [3.08553, 2.787215, 1.609269], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.348555, 0.021762, -0.937036], [0.937279, -0.012701, 0.34835], [-0.00432, -0.999682, -0.024824]] and translation vector: [3.086167, 2.787834, 1.610474], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_169_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_169_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_169_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_169_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_169_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_169_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.187285, -0.627824, 0.755488], [-0.982305, 0.118515, -0.145025], [0.001514, -0.76928, -0.63891]] and translation vector: [1.001752, 1.17634, 1.437838], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.187139, -0.630563, 0.75324], [-0.982328, 0.117514, -0.14568], [0.003345, -0.767191, -0.64141]] and translation vector: [1.00191, 1.178201, 1.437088], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_170_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_170_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_170_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_170_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_170_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_170_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.052123, 0.492225, -0.868906], [0.996177, 0.08671, -0.010637], [0.070107, -0.866138, -0.494863]] and translation vector: [3.27549, 2.071379, 1.287401], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.035278, 0.492309, -0.869705], [0.997133, 0.075637, 0.002369], [0.066948, -0.867128, -0.493566]] and translation vector: [3.286684, 2.076202, 1.285681], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_171_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_171_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_171_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_171_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_171_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_171_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.721847, -0.019511, -0.691778], [0.690918, -0.036893, 0.721991], [-0.039608, -0.999129, -0.013151]] and translation vector: [1.871862, 0.815296, 1.594356], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.723033, -0.022358, -0.690452], [0.689637, -0.034974, 0.723311], [-0.04032, -0.999138, -0.009869]] and translation vector: [1.872181, 0.815734, 1.596287], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_172_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_172_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_172_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_172_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_172_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_172_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.112591, -0.547395, 0.829266], [-0.992672, 0.098819, -0.069547], [-0.043877, -0.83102, -0.55451]] and translation vector: [1.18498, 1.814175, 1.496605], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.111637, -0.546351, 0.830083], [-0.992679, 0.100057, -0.067648], [-0.046096, -0.831558, -0.553521]] and translation vector: [1.186424, 1.810214, 1.495373], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_173_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_173_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_173_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_173_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_173_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_173_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.59597, 0.482312, -0.642025], [0.802979, -0.35126, 0.4815], [0.006716, -0.802491, -0.596626]] and translation vector: [3.449961, 1.112515, 1.412234], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.596047, 0.483799, -0.640833], [0.802896, -0.349913, 0.482617], [0.009254, -0.802184, -0.597005]] and translation vector: [3.451157, 1.111087, 1.411899], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_174_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_174_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_174_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_174_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_174_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_174_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.132001, -0.567775, 0.812532], [-0.991224, 0.069667, -0.112349], [0.007182, -0.820231, -0.571988]] and translation vector: [2.407685, 4.450429, 1.359714], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.130918, -0.563466, 0.8157], [-0.991376, 0.069526, -0.111087], [0.005882, -0.823209, -0.567709]] and translation vector: [2.40989, 4.444678, 1.359228], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_175_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_175_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_175_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_175_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_175_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_175_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.443363, -0.325026, 0.835337], [-0.895367, 0.117125, -0.429651], [0.041809, -0.938424, -0.342946]] and translation vector: [2.190343, 3.392878, 1.594635], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.439336, -0.32163, 0.838772], [-0.897253, 0.111545, -0.427195], [0.043838, -0.940272, -0.337589]] and translation vector: [2.183471, 3.393708, 1.586874], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_176_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_176_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_176_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_176_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_176_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_176_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.985254, -0.134646, 0.105573], [-0.142287, -0.302097, 0.942599], [-0.095024, -0.94372, -0.3168]] and translation vector: [1.134605, 1.549487, 1.505245], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.985752, -0.13049, 0.106142], [-0.141062, -0.297585, 0.944216], [-0.091624, -0.945736, -0.311752]] and translation vector: [1.131707, 1.551058, 1.506377], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_177_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_177_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_177_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_177_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_177_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_177_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.753053, 0.123809, -0.646206], [0.619922, -0.462608, 0.633791], [-0.220471, -0.877875, -0.42512]] and translation vector: [4.259223, 3.769218, 1.505729], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.760823, 0.125761, -0.636658], [0.611756, -0.466381, 0.638939], [-0.216572, -0.875599, -0.431768]] and translation vector: [4.257898, 3.775608, 1.505422], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_178_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_178_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_178_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_178_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_178_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_178_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.924746, 0.145405, -0.351715], [0.379908, 0.407811, -0.830277], [0.022707, -0.901414, -0.432362]] and translation vector: [3.891577, 4.106122, 1.335216], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.925289, 0.144931, -0.350479], [0.378485, 0.412032, -0.828842], [0.024284, -0.899569, -0.436102]] and translation vector: [3.892777, 4.104329, 1.336806], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_179_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_179_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_179_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_179_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_179_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_179_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.748873, -0.374013, 0.547087], [-0.662404, -0.447673, 0.600675], [0.020256, -0.812221, -0.582998]] and translation vector: [3.709567, 4.406117, 1.261793], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.747082, -0.370975, 0.551585], [-0.664465, -0.440253, 0.603874], [0.018814, -0.817652, -0.575405]] and translation vector: [3.708719, 4.403161, 1.261416], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_180_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_180_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_180_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_180_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_180_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_180_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.934222, -0.219071, 0.281493], [-0.356558, -0.595286, 0.72007], [0.009823, -0.773073, -0.634241]] and translation vector: [0.331108, 1.989283, 1.551545], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.93341, -0.222981, 0.281114], [-0.358788, -0.589093, 0.724045], [0.004154, -0.776691, -0.629868]] and translation vector: [0.338532, 1.98258, 1.554168], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_181_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_181_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_181_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_181_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_181_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_181_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.52463, -0.231347, 0.819293], [-0.850589, 0.102279, -0.515789], [0.03553, -0.96748, -0.25044]] and translation vector: [5.897326, 2.792535, 1.553822], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.52763, -0.228151, 0.818263], [-0.84888, 0.105585, -0.517933], [0.03177, -0.967884, -0.249382]] and translation vector: [5.897463, 2.790525, 1.551499], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_182_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_182_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_182_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_182_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_182_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_182_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.699126, -0.324611, 0.637064], [-0.713802, 0.265353, -0.648131], [0.041344, -0.907863, -0.417224]] and translation vector: [0.050403, 3.78209, 1.506908], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.698648, -0.327666, 0.636024], [-0.713993, 0.262294, -0.649166], [0.045885, -0.907654, -0.417203]] and translation vector: [0.047406, 3.786517, 1.504266], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_183_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_183_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_183_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_183_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_183_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_183_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.990268, -0.101591, 0.095124], [-0.135934, -0.559426, 0.817658], [-0.029851, -0.822631, -0.567792]] and translation vector: [6.679901, 2.488796, 1.402653], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.989948, -0.105417, 0.094292], [-0.137296, -0.556168, 0.819651], [-0.033963, -0.824357, -0.565051]] and translation vector: [6.681146, 2.493639, 1.408598], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_184_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_184_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_184_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_184_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_184_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_184_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.311411, -0.45253, 0.835607], [-0.948656, 0.199362, -0.245576], [-0.055457, -0.869179, -0.491379]] and translation vector: [2.299133, 2.388773, 1.459468], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.314195, -0.454542, 0.833471], [-0.947818, 0.20019, -0.248124], [-0.05407, -0.867937, -0.493722]] and translation vector: [2.299448, 2.389842, 1.45904], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_185_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_185_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_185_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_185_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_185_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_185_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.789457, 0.162095, -0.592016], [0.613764, 0.197318, -0.764434], [-0.007096, -0.966846, -0.255262]] and translation vector: [5.114759, 3.17533, 1.386193], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.785271, 0.158609, -0.598492], [0.619131, 0.193201, -0.761151], [-0.005096, -0.968255, -0.249915]] and translation vector: [5.11251, 3.170745, 1.383731], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_186_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_186_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_186_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_186_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_186_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_186_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.408988, -0.323891, 0.853126], [-0.912443, -0.158736, 0.37716], [0.013263, -0.932683, -0.360453]] and translation vector: [3.672612, 2.990265, 1.494339], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.403714, -0.307769, 0.861564], [-0.914697, -0.154884, 0.373283], [0.018558, -0.93877, -0.344045]] and translation vector: [3.67724, 2.998002, 1.501107], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_187_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_187_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_187_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_187_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_187_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_187_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.831143, 0.312948, -0.459636], [0.555586, 0.43327, -0.709649], [-0.022937, -0.845187, -0.533978]] and translation vector: [2.360292, 3.05803, 1.315354], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.8108, 0.328121, -0.484706], [0.584922, 0.423558, -0.691711], [-0.021664, -0.844355, -0.535346]] and translation vector: [2.374215, 3.08026, 1.318953], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_188_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_188_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_188_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_188_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_188_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_188_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.802837, 0.056561, -0.593509], [0.596192, 0.071654, -0.799638], [-0.002701, -0.995825, -0.091248]] and translation vector: [2.583219, 4.008804, 1.439254], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.802466, 0.056012, -0.594063], [0.59669, 0.070227, -0.799393], [-0.003056, -0.995957, -0.089777]] and translation vector: [2.583684, 4.008714, 1.434935], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_189_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_189_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_189_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_189_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_189_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_189_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.032646, 0.194727, -0.980314], [0.998594, -0.034636, -0.040135], [-0.04177, -0.980246, -0.193322]] and translation vector: [3.506056, 2.493951, 1.706783], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.038857, 0.192835, -0.980462], [0.998032, -0.040846, -0.047587], [-0.049225, -0.980381, -0.190868]] and translation vector: [3.502031, 2.499079, 1.701362], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_190_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_190_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_190_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_190_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_190_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_190_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.573389, -0.355745, 0.738018], [-0.818965, 0.223754, -0.528424], [0.02285, -0.907403, -0.419641]] and translation vector: [2.061407, 3.857203, 1.382209], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.569689, -0.351701, 0.742806], [-0.821614, 0.221591, -0.525212], [0.020118, -0.909508, -0.4152]] and translation vector: [2.058259, 3.848013, 1.384733], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_191_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_191_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_191_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_191_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_191_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_191_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.863619, -0.252896, 0.436126], [-0.502889, 0.371124, -0.780621], [0.03556, -0.893482, -0.447688]] and translation vector: [2.007098, 3.82416, 1.536992], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.862677, -0.255046, 0.436739], [-0.504412, 0.370978, -0.779707], [0.036841, -0.892932, -0.448682]] and translation vector: [2.007321, 3.81907, 1.542811], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_192_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_192_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_192_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_192_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_192_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_192_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.986418, -0.051155, 0.156087], [-0.152905, 0.633099, -0.758819], [-0.060001, -0.772379, -0.632322]] and translation vector: [2.055195, 1.600374, 1.268236], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.986809, -0.050817, 0.15371], [-0.151071, 0.630346, -0.761474], [-0.058194, -0.77465, -0.629707]] and translation vector: [2.054364, 1.600927, 1.26836], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_193_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_193_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_193_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_193_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_193_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_193_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.436119, -0.427186, 0.79203], [-0.89981, 0.218659, -0.377532], [-0.011909, -0.877326, -0.479747]] and translation vector: [1.992302, 3.72193, 1.553249], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.436462, -0.426736, 0.792084], [-0.899636, 0.219226, -0.377618], [-0.012502, -0.877403, -0.47959]] and translation vector: [1.991236, 3.722176, 1.553282], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_194_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_194_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_194_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_194_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_194_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_194_5.png" + ], + "output": "D" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.996429, -0.081152, -0.023325], [-0.01119, 0.400709, -0.916137], [0.083693, -0.912604, -0.400187]] and translation vector: [7.365378, 2.610504, 1.343957], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.997089, -0.075007, -0.013671], [-0.016913, 0.392439, -0.919623], [0.074343, -0.916715, -0.392565]] and translation vector: [7.36531, 2.61944, 1.344548], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_195_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_195_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_195_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_195_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_195_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_195_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.896132, -0.052356, 0.440688], [-0.436974, -0.277444, 0.855616], [0.07747, -0.959314, -0.271505]] and translation vector: [3.211431, 3.110947, 1.584554], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.889709, -0.065096, 0.451863], [-0.451099, -0.277541, 0.848222], [0.070195, -0.958506, -0.276295]] and translation vector: [3.215954, 3.116336, 1.570817], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_196_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_196_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_196_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_196_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_196_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_196_5.png" + ], + "output": "A" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.643628, -0.362528, 0.674031], [-0.765241, -0.290748, 0.574345], [-0.012243, -0.88546, -0.464555]] and translation vector: [2.632762, 2.243425, 1.452714], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.642371, -0.361874, 0.675579], [-0.76623, -0.285016, 0.575898], [-0.015852, -0.887589, -0.460364]] and translation vector: [2.634792, 2.237319, 1.452971], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_197_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_197_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_197_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_197_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_197_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_197_5.png" + ], + "output": "B" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.612656, -0.411508, 0.674769], [-0.789543, 0.280105, -0.546043], [0.035694, -0.867296, -0.496511]] and translation vector: [1.897828, 2.372103, 1.388776], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.615876, -0.406578, 0.674826], [-0.787242, 0.284147, -0.547275], [0.03076, -0.868305, -0.495075]] and translation vector: [1.892345, 2.36762, 1.390764], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_198_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_198_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_198_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_198_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_198_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_198_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Depth_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_depth_estimation", + "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.752445, 0.275595, -0.598225], [0.657828, -0.35994, 0.661593], [-0.032994, -0.891342, -0.452129]] and translation vector: [2.633805, 2.70906, 1.31733], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.746128, 0.269733, -0.608718], [0.664676, -0.35493, 0.657443], [-0.038718, -0.895136, -0.444108]] and translation vector: [2.667176, 2.689206, 1.310347], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_199_0.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_199_1.jpg", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_199_2.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_199_3.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_199_4.png", + "../MMIU-Benchmark/threeD_Depth_Estimation/threeD_Depth_Estimation_199_5.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999860986788529, -0.0029206102888043426, 0.004512150995140729], [0.0029125077264400548, 0.9999942525374551, 0.0017791916930172049], [-0.004515929331115238, -0.0017660484162836249, 0.9999884136529374]], 'translation vector': [0.00035121537956284143, -0.00211204147587285, 0.0015269971399166637]}\nB: {'rotation matrix': [[0.992252, 0.033516, -0.119639], [0.120006, -0.507929, 0.852999], [-0.032179, -0.860747, -0.508015]], 'translation vector': [2.483829, 1.386735, 1.351847]}\nC: {'rotation matrix': [[0.992393, 0.03365, -0.118424], [0.118928, -0.510671, 0.851511], [-0.031822, -0.859118, -0.510788]], 'translation vector': [2.483625, 1.389348, 1.348027]}\nD: {'rotation matrix': [[0.992358, 0.033913, -0.118638], [0.11923, -0.511103, 0.85121], [-0.031769, -0.85885, -0.511241]], 'translation vector': [2.484339, 1.38954, 1.351903]}", + "question": 
"Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999860986788529, -0.0029206102888043426, 0.004512150995140729], [0.0029125077264400548, 0.9999942525374551, 0.0017791916930172049], [-0.004515929331115238, -0.0017660484162836249, 0.9999884136529374]], 'translation vector': [0.00035121537956284143, -0.00211204147587285, 0.0015269971399166637]}\nB: {'rotation matrix': [[0.992252, 0.033516, -0.119639], [0.120006, -0.507929, 0.852999], [-0.032179, -0.860747, -0.508015]], 'translation vector': [2.483829, 1.386735, 1.351847]}\nC: {'rotation matrix': [[0.992393, 0.03365, -0.118424], [0.118928, -0.510671, 0.851511], [-0.031822, -0.859118, -0.510788]], 'translation vector': [2.483625, 1.389348, 1.348027]}\nD: {'rotation matrix': [[0.992358, 0.033913, -0.118638], [0.11923, -0.511103, 0.85121], [-0.031769, -0.85885, -0.511241]], 'translation vector': [2.484339, 1.38954, 1.351903]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_0_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_0_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_0_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_0_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.947387, 0.126025, -0.294239], [0.319939, 0.401221, -0.858289], [0.009889, -0.90727, -0.420431]], 'translation vector': [2.649368, 2.97856, 1.365403]}\nB: {'rotation matrix': [[0.9999009689053274, -0.0005834477726320058, 
-0.014081260252404066], [0.0005185811040898338, 0.9999892566550749, -0.004650674323837413], [0.014082812752834694, 0.004642499383147887, 0.9998897887510193]], 'translation vector': [-0.0001697198246333187, -0.006057464737030116, -0.0030857621071840313]}\nC: {'rotation matrix': [[-0.946914, 0.131611, -0.293313], [0.321456, 0.400409, -0.858102], [0.004509, -0.906836, -0.42146]], 'translation vector': [2.644349, 2.98006, 1.361572]}\nD: {'rotation matrix': [[-0.946851, 0.128282, -0.294988], [0.321573, 0.400396, -0.858064], [0.008037, -0.907318, -0.420367]], 'translation vector': [2.647634, 2.978188, 1.36466]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.947387, 0.126025, -0.294239], [0.319939, 0.401221, -0.858289], [0.009889, -0.90727, -0.420431]], 'translation vector': [2.649368, 2.97856, 1.365403]}\nB: {'rotation matrix': [[0.9999009689053274, -0.0005834477726320058, -0.014081260252404066], [0.0005185811040898338, 0.9999892566550749, -0.004650674323837413], [0.014082812752834694, 0.004642499383147887, 0.9998897887510193]], 'translation vector': [-0.0001697198246333187, -0.006057464737030116, -0.0030857621071840313]}\nC: {'rotation matrix': [[-0.946914, 0.131611, -0.293313], [0.321456, 0.400409, -0.858102], [0.004509, -0.906836, -0.42146]], 'translation vector': [2.644349, 2.98006, 1.361572]}\nD: {'rotation matrix': [[-0.946851, 0.128282, -0.294988], [0.321573, 0.400396, -0.858064], [0.008037, -0.907318, -0.420367]], 'translation vector': [2.647634, 2.978188, 1.36466]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_1_0.jpg", + 
"../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_1_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_1_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_1_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.932535, 0.164547, -0.321407], [0.359784, -0.498771, 0.788533], [-0.030558, -0.850971, -0.524323]], 'translation vector': [4.48804, -0.229774, 1.538571]}\nB: {'rotation matrix': [[0.999974189934778, 0.00023654440835110308, 0.007205305592304459], [-0.00021900350048315587, 0.9999974076812913, -0.002394096534537237], [-0.00720576870446999, 0.0023924811533001236, 0.9999711271279352]], 'translation vector': [-0.011672688463027825, -0.012243066982587869, 0.0020668703552249035]}\nC: {'rotation matrix': [[0.930699, 0.167887, -0.324983], [0.364431, -0.502007, 0.784334], [-0.031464, -0.848412, -0.5284]], 'translation vector': [4.497419, -0.228559, 1.538943]}\nD: {'rotation matrix': [[0.928253, 0.171766, -0.329913], [0.370592, -0.502789, 0.780939], [-0.031738, -0.847172, -0.53037]], 'translation vector': [4.506209, -0.230888, 1.537021]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.932535, 0.164547, -0.321407], [0.359784, -0.498771, 0.788533], [-0.030558, -0.850971, -0.524323]], 'translation vector': [4.48804, -0.229774, 1.538571]}\nB: {'rotation matrix': [[0.999974189934778, 0.00023654440835110308, 0.007205305592304459], [-0.00021900350048315587, 0.9999974076812913, -0.002394096534537237], [-0.00720576870446999, 0.0023924811533001236, 0.9999711271279352]], 'translation vector': [-0.011672688463027825, -0.012243066982587869, 0.0020668703552249035]}\nC: {'rotation matrix': [[0.930699, 0.167887, -0.324983], [0.364431, -0.502007, 0.784334], [-0.031464, -0.848412, -0.5284]], 'translation vector': [4.497419, -0.228559, 1.538943]}\nD: {'rotation matrix': [[0.928253, 0.171766, -0.329913], [0.370592, -0.502789, 0.780939], [-0.031738, -0.847172, -0.53037]], 'translation vector': [4.506209, -0.230888, 1.537021]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_2_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_2_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_2_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_2_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999922186975153, -0.0004929414354074684, -0.003990843308659959], [0.0004944153894448021, 1.0000000788246688, 0.0002887878542439764], [0.0039912978026401735, -0.00029052347125931846, 0.9999922538344134]], 'translation vector': [0.0015230291799757656, -0.0023232322897525567, 0.004482115182110835]}\nB: {'rotation matrix': [[-0.597501, 0.375338, -0.7086], [0.801649, 0.25893, -0.538808], [-0.018758, -0.889987, -0.4556]], 'translation vector': [2.357092, 1.421442, 1.358509]}\nC: {'rotation matrix': [[-0.595396, 0.37569, -0.710183], [0.803242, 0.259116, -0.536341], 
[-0.017478, -0.889784, -0.456047]], 'translation vector': [2.35612, 1.420569, 1.361782]}\nD: {'rotation matrix': [[-0.600812, 0.375021, -0.705963], [0.799114, 0.258529, -0.542752], [-0.021031, -0.890237, -0.455012]], 'translation vector': [2.356618, 1.42274, 1.357666]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999922186975153, -0.0004929414354074684, -0.003990843308659959], [0.0004944153894448021, 1.0000000788246688, 0.0002887878542439764], [0.0039912978026401735, -0.00029052347125931846, 0.9999922538344134]], 'translation vector': [0.0015230291799757656, -0.0023232322897525567, 0.004482115182110835]}\nB: {'rotation matrix': [[-0.597501, 0.375338, -0.7086], [0.801649, 0.25893, -0.538808], [-0.018758, -0.889987, -0.4556]], 'translation vector': [2.357092, 1.421442, 1.358509]}\nC: {'rotation matrix': [[-0.595396, 0.37569, -0.710183], [0.803242, 0.259116, -0.536341], [-0.017478, -0.889784, -0.456047]], 'translation vector': [2.35612, 1.420569, 1.361782]}\nD: {'rotation matrix': [[-0.600812, 0.375021, -0.705963], [0.799114, 0.258529, -0.542752], [-0.021031, -0.890237, -0.455012]], 'translation vector': [2.356618, 1.42274, 1.357666]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_3_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_3_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_3_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_3_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.690346, 0.288159, -0.663616], [0.723477, -0.272947, 0.634098], [0.001589, -0.917858, -0.396905]], 'translation vector': [2.536332, 2.010734, 1.438743]}\nB: {'rotation matrix': [[0.691208, 0.288183, -0.662708], [0.722652, -0.27257, 0.635201], [0.00242, -0.917963, -0.396658]], 'translation vector': [2.535653, 2.009964, 1.439474]}\nC: {'rotation matrix': [[0.9999995587474457, 0.00024022036499647738, 0.0007957711644081506], [-0.00023933007530568237, 0.9999998633783928, -0.0007108069241564525], [-0.0007964539455043593, 0.0007109455644655081, 1.000000560983507]], 'translation vector': [-0.004770728985455719, 0.002959587174171885, 0.0013885111462622612]}\nD: {'rotation matrix': [[0.690426, 0.287793, -0.663692], [0.723401, -0.272862, 0.634222], [0.001429, -0.917999, -0.396581]], 'translation vector': [2.53477, 2.009069, 1.43814]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.690346, 0.288159, -0.663616], [0.723477, -0.272947, 0.634098], [0.001589, -0.917858, -0.396905]], 'translation vector': [2.536332, 2.010734, 1.438743]}\nB: {'rotation matrix': [[0.691208, 0.288183, -0.662708], [0.722652, -0.27257, 0.635201], [0.00242, -0.917963, -0.396658]], 'translation vector': [2.535653, 2.009964, 1.439474]}\nC: {'rotation matrix': [[0.9999995587474457, 0.00024022036499647738, 0.0007957711644081506], [-0.00023933007530568237, 0.9999998633783928, -0.0007108069241564525], [-0.0007964539455043593, 0.0007109455644655081, 1.000000560983507]], 'translation vector': [-0.004770728985455719, 0.002959587174171885, 0.0013885111462622612]}\nD: {'rotation matrix': [[0.690426, 0.287793, -0.663692], [0.723401, -0.272862, 0.634222], [0.001429, -0.917999, -0.396581]], 'translation vector': [2.53477, 2.009069, 1.43814]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_4_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_4_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_4_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_4_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999658530200795, -0.008162385048605073, -0.000989578606081076], [0.008166400906506959, 0.9999603758313507, 0.0035456278045154508], [0.000960099553273771, -0.0035536229934212678, 0.9999930725472598]], 'translation vector': [0.0005115289741049189, -0.00032414464705918244, 0.0017902118924140176]}\nB: {'rotation matrix': [[-0.221487, 0.417059, -0.881479], [0.974313, 0.13239, -0.182174], [0.040721, -0.899186, -0.435668]], 'translation vector': [3.156802, 0.483491, 1.355875]}\nC: {'rotation matrix': [[-0.223193, 0.415497, -0.881786], [0.973999, 0.131126, -0.184746], 
[0.038864, -0.900094, -0.43396]], 'translation vector': [3.157208, 0.483314, 1.355186]}\nD: {'rotation matrix': [[-0.22378, 0.416079, -0.881363], [0.973939, 0.129755, -0.18603], [0.036958, -0.900023, -0.434273]], 'translation vector': [3.157156, 0.483591, 1.355072]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999658530200795, -0.008162385048605073, -0.000989578606081076], [0.008166400906506959, 0.9999603758313507, 0.0035456278045154508], [0.000960099553273771, -0.0035536229934212678, 0.9999930725472598]], 'translation vector': [0.0005115289741049189, -0.00032414464705918244, 0.0017902118924140176]}\nB: {'rotation matrix': [[-0.221487, 0.417059, -0.881479], [0.974313, 0.13239, -0.182174], [0.040721, -0.899186, -0.435668]], 'translation vector': [3.156802, 0.483491, 1.355875]}\nC: {'rotation matrix': [[-0.223193, 0.415497, -0.881786], [0.973999, 0.131126, -0.184746], [0.038864, -0.900094, -0.43396]], 'translation vector': [3.157208, 0.483314, 1.355186]}\nD: {'rotation matrix': [[-0.22378, 0.416079, -0.881363], [0.973939, 0.129755, -0.18603], [0.036958, -0.900023, -0.434273]], 'translation vector': [3.157156, 0.483591, 1.355072]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_5_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_5_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_5_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_5_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.294979, -0.395497, 0.869811], [-0.955406, 0.135138, -0.26256], [-0.013703, -0.908472, -0.417722]], 'translation vector': [4.231627, 1.757554, 1.314948]}\nB: {'rotation matrix': [[0.9999859365705687, 3.1558862291102445e-05, 0.005361241460385626], [2.1367515200134227e-05, 0.9999509201368397, -0.009913096398898577], [-0.0053616455101250845, 0.009912181817668633, 0.9999368747200875]], 'translation vector': [-0.00185453480108011, 0.004425119632380792, 0.004740673653586214]}\nC: {'rotation matrix': [[-0.295231, -0.385219, 0.874325], [-0.955253, 0.136423, -0.262452], [-0.018176, -0.912686, -0.408258]], 'translation vector': [4.225714, 1.76129, 1.315325]}\nD: {'rotation matrix': [[-0.297898, -0.402478, 0.865603], [-0.954572, 0.132313, -0.266996], [-0.007071, -0.905817, -0.42361]], 'translation vector': [4.239912, 1.761582, 1.310375]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.294979, -0.395497, 0.869811], [-0.955406, 0.135138, -0.26256], [-0.013703, -0.908472, -0.417722]], 'translation vector': [4.231627, 1.757554, 1.314948]}\nB: {'rotation matrix': [[0.9999859365705687, 3.1558862291102445e-05, 0.005361241460385626], [2.1367515200134227e-05, 0.9999509201368397, -0.009913096398898577], [-0.0053616455101250845, 0.009912181817668633, 0.9999368747200875]], 'translation vector': [-0.00185453480108011, 0.004425119632380792, 0.004740673653586214]}\nC: {'rotation matrix': [[-0.295231, -0.385219, 0.874325], [-0.955253, 0.136423, -0.262452], [-0.018176, -0.912686, -0.408258]], 'translation vector': [4.225714, 1.76129, 1.315325]}\nD: {'rotation matrix': [[-0.297898, -0.402478, 0.865603], [-0.954572, 0.132313, -0.266996], [-0.007071, -0.905817, -0.42361]], 'translation vector': [4.239912, 1.761582, 1.310375]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_6_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_6_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_6_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_6_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.43634, -0.426945, 0.792039], [-0.899692, 0.219464, -0.377346], [-0.012718, -0.877242, -0.47988]], 'translation vector': [1.991026, 3.721216, 1.553809]}\nB: {'rotation matrix': [[0.9999994163271791, 0.0007055386419518741, -5.183687097418444e-05], [-0.0007048159396394524, 1.0000001700794185, 0.00017964949547958028], [5.299300853873026e-05, -0.00017925701598013347, 1.000000048079148]], 'translation vector': [-0.0002435455336966541, -0.00047987538941862695, 0.0009530826592798469]}\nC: {'rotation matrix': [[-0.436198, -0.427205, 0.791977], [-0.899763, 0.219364, 
-0.377235], [-0.012574, -0.877141, -0.480069]], 'translation vector': [1.990491, 3.720783, 1.55354]}\nD: {'rotation matrix': [[-0.436159, -0.427335, 0.791928], [-0.899792, 0.218686, -0.377559], [-0.011839, -0.877246, -0.479894]], 'translation vector': [1.98993, 3.720837, 1.552023]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.43634, -0.426945, 0.792039], [-0.899692, 0.219464, -0.377346], [-0.012718, -0.877242, -0.47988]], 'translation vector': [1.991026, 3.721216, 1.553809]}\nB: {'rotation matrix': [[0.9999994163271791, 0.0007055386419518741, -5.183687097418444e-05], [-0.0007048159396394524, 1.0000001700794185, 0.00017964949547958028], [5.299300853873026e-05, -0.00017925701598013347, 1.000000048079148]], 'translation vector': [-0.0002435455336966541, -0.00047987538941862695, 0.0009530826592798469]}\nC: {'rotation matrix': [[-0.436198, -0.427205, 0.791977], [-0.899763, 0.219364, -0.377235], [-0.012574, -0.877141, -0.480069]], 'translation vector': [1.990491, 3.720783, 1.55354]}\nD: {'rotation matrix': [[-0.436159, -0.427335, 0.791928], [-0.899792, 0.218686, -0.377559], [-0.011839, -0.877246, -0.479894]], 'translation vector': [1.98993, 3.720837, 1.552023]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_7_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_7_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_7_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_7_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d 
image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999987261495074, 0.0004975354064711766, -0.0014612465343166485], [-0.0004956714882147159, 0.9999976114101617, 0.002168145916521647], [0.0014629580715452147, -0.002166812264256858, 0.9999963843096157]], 'translation vector': [-0.0006911004437073487, 0.0010681685362672333, 0.00045340774962232544]}\nB: {'rotation matrix': [[0.254029, -0.222698, 0.941209], [-0.965413, 0.000689, 0.260725], [-0.058712, -0.974887, -0.21482]], 'translation vector': [0.927676, 4.785758, 1.499229]}\nC: {'rotation matrix': [[0.261058, -0.219751, 0.939978], [-0.963311, 0.003543, 0.268366], [-0.062304, -0.97555, -0.210763]], 'translation vector': [0.925951, 4.784105, 1.497862]}\nD: {'rotation matrix': [[0.253006, -0.222602, 0.941507], [-0.965684, 0.00092, 0.259721], [-0.058681, -0.974909, -0.21473]], 'translation vector': [0.928139, 4.78494, 1.499076]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999987261495074, 0.0004975354064711766, -0.0014612465343166485], [-0.0004956714882147159, 0.9999976114101617, 0.002168145916521647], [0.0014629580715452147, -0.002166812264256858, 0.9999963843096157]], 'translation vector': [-0.0006911004437073487, 0.0010681685362672333, 0.00045340774962232544]}\nB: {'rotation matrix': [[0.254029, -0.222698, 0.941209], [-0.965413, 0.000689, 0.260725], [-0.058712, -0.974887, -0.21482]], 'translation vector': [0.927676, 4.785758, 1.499229]}\nC: {'rotation matrix': [[0.261058, -0.219751, 0.939978], [-0.963311, 0.003543, 0.268366], [-0.062304, -0.97555, -0.210763]], 'translation vector': [0.925951, 4.784105, 1.497862]}\nD: {'rotation matrix': [[0.253006, -0.222602, 0.941507], [-0.965684, 0.00092, 0.259721], [-0.058681, -0.974909, -0.21473]], 'translation vector': [0.928139, 4.78494, 1.499076]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_8_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_8_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_8_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_8_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.298773, 0.351612, -0.887189], [0.953749, -0.077747, 0.290375], [0.033123, -0.932912, -0.358578]], 'translation vector': [3.912279, 4.982921, 1.420651]}\nB: {'rotation matrix': [[0.29932, 0.353357, -0.88631], [0.953697, -0.082092, 0.289349], [0.029485, -0.93188, -0.361567]], 'translation vector': [3.9112, 4.98563, 1.419169]}\nC: {'rotation matrix': [[0.999988451223679, 0.004467367701975065, -0.0013038134525021694], [-0.00445692043483639, 0.9999572940857223, 0.008125240965736606], [0.0013398420675200973, -0.008118916964061989, 0.9999668442244075]], 'translation 
vector': [0.0032416245875248606, 0.010404768814489485, 0.0002686970709979697]}\nD: {'rotation matrix': [[0.298213, 0.352721, -0.886937], [0.953989, -0.07977, 0.289034], [0.031197, -0.932323, -0.36028]], 'translation vector': [3.912466, 4.985029, 1.419803]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.298773, 0.351612, -0.887189], [0.953749, -0.077747, 0.290375], [0.033123, -0.932912, -0.358578]], 'translation vector': [3.912279, 4.982921, 1.420651]}\nB: {'rotation matrix': [[0.29932, 0.353357, -0.88631], [0.953697, -0.082092, 0.289349], [0.029485, -0.93188, -0.361567]], 'translation vector': [3.9112, 4.98563, 1.419169]}\nC: {'rotation matrix': [[0.999988451223679, 0.004467367701975065, -0.0013038134525021694], [-0.00445692043483639, 0.9999572940857223, 0.008125240965736606], [0.0013398420675200973, -0.008118916964061989, 0.9999668442244075]], 'translation vector': [0.0032416245875248606, 0.010404768814489485, 0.0002686970709979697]}\nD: {'rotation matrix': [[0.298213, 0.352721, -0.886937], [0.953989, -0.07977, 0.289034], [0.031197, -0.932323, -0.36028]], 'translation vector': [3.912466, 4.985029, 1.419803]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_9_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_9_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_9_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_9_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + 
"options": "A: {'rotation matrix': [[0.9999651962705811, 0.003066448737415877, 0.007690547235694275], [-0.003089438182273661, 0.9999915450229314, 0.002959839315488226], [-0.007681501308992471, -0.0029840343790842917, 0.999966014383214]], 'translation vector': [-0.009845168086086709, -0.005623772397939042, 0.0006148134083248102]}\nB: {'rotation matrix': [[-0.998744, -0.022866, -0.044595], [0.034706, 0.326335, -0.944617], [0.036152, -0.944977, -0.325132]], 'translation vector': [2.332638, 2.988529, 1.390534]}\nC: {'rotation matrix': [[-0.998733, -0.022769, -0.044885], [0.035006, 0.326505, -0.944547], [0.036161, -0.944921, -0.325294]], 'translation vector': [2.335994, 2.987912, 1.391848]}\nD: {'rotation matrix': [[-0.998702, -0.02238, -0.045764], [0.035975, 0.326219, -0.94461], [0.03607, -0.945029, -0.32499]], 'translation vector': [2.340556, 2.987934, 1.391904]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999651962705811, 0.003066448737415877, 0.007690547235694275], [-0.003089438182273661, 0.9999915450229314, 0.002959839315488226], [-0.007681501308992471, -0.0029840343790842917, 0.999966014383214]], 'translation vector': [-0.009845168086086709, -0.005623772397939042, 0.0006148134083248102]}\nB: {'rotation matrix': [[-0.998744, -0.022866, -0.044595], [0.034706, 0.326335, -0.944617], [0.036152, -0.944977, -0.325132]], 'translation vector': [2.332638, 2.988529, 1.390534]}\nC: {'rotation matrix': [[-0.998733, -0.022769, -0.044885], [0.035006, 0.326505, -0.944547], [0.036161, -0.944921, -0.325294]], 'translation vector': [2.335994, 2.987912, 1.391848]}\nD: {'rotation matrix': [[-0.998702, -0.02238, -0.045764], [0.035975, 0.326219, -0.94461], [0.03607, -0.945029, -0.32499]], 'translation vector': [2.340556, 2.987934, 1.391904]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_10_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_10_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_10_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_10_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.088289, -0.769037, 0.633078], [-0.992448, -0.013575, 0.121917], [-0.085165, -0.63906, -0.764427]], 'translation vector': [1.06143, 1.251586, 2.183495]}\nB: {'rotation matrix': [[0.095281, -0.770575, 0.630187], [-0.991458, -0.016816, 0.129342], [-0.08907, -0.637128, -0.765594]], 'translation vector': [1.056131, 1.246655, 2.184574]}\nC: {'rotation matrix': [[0.101903, -0.771131, 0.628469], [-0.990357, -0.019031, 0.13723], [-0.093862, -0.636392, -0.765634]], 'translation vector': [1.04909, 1.241123, 2.18482]}\nD: {'rotation matrix': [[0.9999704077161121, 
0.0009465902067394159, -0.007647096819155797], [-0.0009526969531251086, 0.9999995333260298, -0.0008216172460913287], [0.00764682863028495, 0.0008292870802229541, 0.9999701657998578]], 'translation vector': [0.005049491112872229, 0.003519427946364395, -0.004842133831311157]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.088289, -0.769037, 0.633078], [-0.992448, -0.013575, 0.121917], [-0.085165, -0.63906, -0.764427]], 'translation vector': [1.06143, 1.251586, 2.183495]}\nB: {'rotation matrix': [[0.095281, -0.770575, 0.630187], [-0.991458, -0.016816, 0.129342], [-0.08907, -0.637128, -0.765594]], 'translation vector': [1.056131, 1.246655, 2.184574]}\nC: {'rotation matrix': [[0.101903, -0.771131, 0.628469], [-0.990357, -0.019031, 0.13723], [-0.093862, -0.636392, -0.765634]], 'translation vector': [1.04909, 1.241123, 2.18482]}\nD: {'rotation matrix': [[0.9999704077161121, 0.0009465902067394159, -0.007647096819155797], [-0.0009526969531251086, 0.9999995333260298, -0.0008216172460913287], [0.00764682863028495, 0.0008292870802229541, 0.9999701657998578]], 'translation vector': [0.005049491112872229, 0.003519427946364395, -0.004842133831311157]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_11_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_11_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_11_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_11_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.236859, -0.585227, 0.775504], [-0.967029, -0.065142, 0.246196], [-0.093563, -0.808248, -0.581361]], 'translation vector': [0.85633, 3.124968, 1.418476]}\nB: {'rotation matrix': [[0.9999986227118945, -0.0014386707321173084, -0.0013720086731618905], [0.0014396405048904127, 0.9999985343757846, 0.0002480174905685849], [0.0013722163701430706, -0.00024858511187529484, 0.9999990039875143]], 'translation vector': [0.0014743108378150183, 9.881450519233503e-05, 0.00010772367212419365]}\nC: {'rotation matrix': [[0.234228, -0.586349, 0.775456], [-0.967526, -0.06262, 0.244894], [-0.095034, -0.807635, -0.581975]], 'translation vector': [0.858687, 3.12069, 1.418757]}\nD: {'rotation matrix': [[0.234642, -0.58546, 0.776002], [-0.967537, -0.063552, 0.24461], [-0.093893, -0.808206, -0.581366]], 'translation vector': [0.856906, 3.122666, 1.417663]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.236859, -0.585227, 0.775504], [-0.967029, -0.065142, 0.246196], [-0.093563, -0.808248, -0.581361]], 'translation vector': [0.85633, 3.124968, 1.418476]}\nB: {'rotation matrix': [[0.9999986227118945, -0.0014386707321173084, -0.0013720086731618905], [0.0014396405048904127, 0.9999985343757846, 0.0002480174905685849], [0.0013722163701430706, -0.00024858511187529484, 0.9999990039875143]], 'translation vector': [0.0014743108378150183, 9.881450519233503e-05, 0.00010772367212419365]}\nC: {'rotation matrix': [[0.234228, -0.586349, 0.775456], [-0.967526, -0.06262, 0.244894], [-0.095034, -0.807635, -0.581975]], 'translation vector': [0.858687, 3.12069, 1.418757]}\nD: {'rotation matrix': [[0.234642, -0.58546, 0.776002], [-0.967537, -0.063552, 0.24461], [-0.093893, -0.808206, -0.581366]], 'translation vector': [0.856906, 3.122666, 1.417663]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_12_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_12_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_12_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_12_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.582104, 0.470868, -0.662901], [0.81311, -0.339656, 0.472743], [-0.002559, -0.814197, -0.580583]], 'translation vector': [4.229822, 1.596572, 1.425168]}\nB: {'rotation matrix': [[0.9999560146697009, 0.003230475520535795, -0.00886892770497088], [-0.0032175833964298243, 0.9999942676924473, 0.0014734023416888033], [0.008873986014087454, -0.001444906579376565, 0.9999590747869875]], 'translation vector': [0.0012149381321875374, 0.0024560981455157282, -0.00010287317760537817]}\nC: {'rotation matrix': [[0.582444, 0.471641, -0.662053], [0.812867, -0.340629, 
0.472461], [-0.002682, -0.813343, -0.581779]], 'translation vector': [4.230144, 1.598887, 1.426125]}\nD: {'rotation matrix': [[0.583525, 0.471082, -0.661499], [0.812092, -0.340805, 0.473665], [-0.002307, -0.813593, -0.58143]], 'translation vector': [4.230429, 1.59898, 1.426046]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.582104, 0.470868, -0.662901], [0.81311, -0.339656, 0.472743], [-0.002559, -0.814197, -0.580583]], 'translation vector': [4.229822, 1.596572, 1.425168]}\nB: {'rotation matrix': [[0.9999560146697009, 0.003230475520535795, -0.00886892770497088], [-0.0032175833964298243, 0.9999942676924473, 0.0014734023416888033], [0.008873986014087454, -0.001444906579376565, 0.9999590747869875]], 'translation vector': [0.0012149381321875374, 0.0024560981455157282, -0.00010287317760537817]}\nC: {'rotation matrix': [[0.582444, 0.471641, -0.662053], [0.812867, -0.340629, 0.472461], [-0.002682, -0.813343, -0.581779]], 'translation vector': [4.230144, 1.598887, 1.426125]}\nD: {'rotation matrix': [[0.583525, 0.471082, -0.661499], [0.812092, -0.340805, 0.473665], [-0.002307, -0.813593, -0.58143]], 'translation vector': [4.230429, 1.59898, 1.426046]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_13_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_13_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_13_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_13_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.990893, 0.057008, -0.121987], [0.134304, -0.353431, 0.92577], [0.009662, -0.933722, -0.357869]], 'translation vector': [2.186028, 2.144782, 1.462596]}\nB: {'rotation matrix': [[0.991569, 0.053062, -0.11822], [0.129357, -0.351493, 0.927211], [0.007646, -0.934686, -0.355393]], 'translation vector': [2.183204, 2.143093, 1.462234]}\nC: {'rotation matrix': [[0.9999874902969159, 0.004379197439594465, -0.002403723493325247], [-0.004372650351021444, 0.9999870005118716, 0.0026700250405810233], [0.0024158589744238553, -0.0026609890925621414, 0.9999939599581709]], 'translation vector': [-0.003672033476809222, -0.0017027412429904132, -0.0003357999980959647]}\nD: {'rotation matrix': [[0.991257, 0.055775, -0.119575], [0.131602, -0.352888, 0.926364], [0.009471, -0.934002, -0.357143]], 'translation vector': [2.184101, 2.143995, 1.46179]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.990893, 0.057008, -0.121987], [0.134304, -0.353431, 0.92577], [0.009662, -0.933722, -0.357869]], 'translation vector': [2.186028, 2.144782, 1.462596]}\nB: {'rotation matrix': [[0.991569, 0.053062, -0.11822], [0.129357, -0.351493, 0.927211], [0.007646, -0.934686, -0.355393]], 'translation vector': [2.183204, 2.143093, 1.462234]}\nC: {'rotation matrix': [[0.9999874902969159, 0.004379197439594465, -0.002403723493325247], [-0.004372650351021444, 0.9999870005118716, 0.0026700250405810233], [0.0024158589744238553, -0.0026609890925621414, 0.9999939599581709]], 'translation vector': [-0.003672033476809222, -0.0017027412429904132, -0.0003357999980959647]}\nD: {'rotation matrix': [[0.991257, 0.055775, -0.119575], [0.131602, -0.352888, 0.926364], [0.009471, -0.934002, -0.357143]], 'translation vector': [2.184101, 2.143995, 1.46179]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_14_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_14_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_14_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_14_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999986691328494, -0.001073261840895818, -0.0008802624854347336], [0.001080335550453663, 0.9999723432097648, 0.00736012496129083], [0.0008730903166935182, -0.007359768837914202, 0.9999729147693108]], 'translation vector': [0.0010486658094048806, -0.004009939681768326, 0.0017973568614269853]}\nB: {'rotation matrix': [[-0.386299, -0.298688, 0.872673], [-0.920393, 0.186791, -0.343491], [-0.060411, -0.935893, -0.347067]], 'translation vector': [2.08048, 4.009937, 1.840847]}\nC: {'rotation matrix': [[-0.383122, -0.307436, 0.871034], [-0.921947, 0.185316, -0.340108], 
[-0.056855, -0.933349, -0.354438]], 'translation vector': [2.080896, 4.009106, 1.847586]}\nD: {'rotation matrix': [[-0.384424, -0.301178, 0.872645], [-0.921297, 0.185141, -0.341959], [-0.058572, -0.935422, -0.348647]], 'translation vector': [2.077995, 4.010322, 1.837904]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999986691328494, -0.001073261840895818, -0.0008802624854347336], [0.001080335550453663, 0.9999723432097648, 0.00736012496129083], [0.0008730903166935182, -0.007359768837914202, 0.9999729147693108]], 'translation vector': [0.0010486658094048806, -0.004009939681768326, 0.0017973568614269853]}\nB: {'rotation matrix': [[-0.386299, -0.298688, 0.872673], [-0.920393, 0.186791, -0.343491], [-0.060411, -0.935893, -0.347067]], 'translation vector': [2.08048, 4.009937, 1.840847]}\nC: {'rotation matrix': [[-0.383122, -0.307436, 0.871034], [-0.921947, 0.185316, -0.340108], [-0.056855, -0.933349, -0.354438]], 'translation vector': [2.080896, 4.009106, 1.847586]}\nD: {'rotation matrix': [[-0.384424, -0.301178, 0.872645], [-0.921297, 0.185141, -0.341959], [-0.058572, -0.935422, -0.348647]], 'translation vector': [2.077995, 4.010322, 1.837904]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_15_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_15_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_15_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_15_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999787823940977, 0.006347499783107699, -0.0013436878206843045], [-0.006341718624184496, 0.9999688828294876, 0.004625015892328648], [0.0013735202687803576, -0.004616292980016383, 0.9999878757602079]], 'translation vector': [-0.006298849286503927, 0.01405890593176995, 0.0007444533799123576]}\nB: {'rotation matrix': [[0.999733, -0.006694, 0.022129], [-0.023039, -0.368118, 0.929494], [0.001924, -0.929755, -0.368173]], 'translation vector': [3.317142, 3.173762, 1.523565]}\nC: {'rotation matrix': [[0.999731, -0.010083, 0.02088], [-0.023127, -0.369367, 0.928996], [-0.001654, -0.929229, -0.3695]], 'translation vector': [3.314788, 3.169853, 1.521514]}\nD: {'rotation matrix': [[0.999712, -0.007131, 0.022924], [-0.023946, -0.364324, 0.930964], [0.001713, -0.931245, -0.36439]], 'translation vector': [3.320507, 3.174599, 1.524876]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999787823940977, 0.006347499783107699, -0.0013436878206843045], [-0.006341718624184496, 0.9999688828294876, 0.004625015892328648], [0.0013735202687803576, -0.004616292980016383, 0.9999878757602079]], 'translation vector': [-0.006298849286503927, 0.01405890593176995, 0.0007444533799123576]}\nB: {'rotation matrix': [[0.999733, -0.006694, 0.022129], [-0.023039, -0.368118, 0.929494], [0.001924, -0.929755, -0.368173]], 'translation vector': [3.317142, 3.173762, 1.523565]}\nC: {'rotation matrix': [[0.999731, -0.010083, 0.02088], [-0.023127, -0.369367, 0.928996], [-0.001654, -0.929229, -0.3695]], 'translation vector': [3.314788, 3.169853, 1.521514]}\nD: {'rotation matrix': [[0.999712, -0.007131, 0.022924], [-0.023946, -0.364324, 0.930964], [0.001713, -0.931245, -0.36439]], 'translation vector': [3.320507, 3.174599, 1.524876]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_16_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_16_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_16_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_16_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.998283, -0.004041, -0.058434], [0.054839, 0.286044, -0.956646], [0.020581, -0.958208, -0.285332]], 'translation vector': [1.688122, 4.435732, 1.572228]}\nB: {'rotation matrix': [[0.9999952600112642, 0.0029895442027471574, 0.0009673920050112699], [-0.0029953662693526914, 0.9999719574782809, 0.006845684012961962], [-0.000945954294544353, -0.006847856485920328, 0.9999767653082445]], 'translation vector': [-0.00043220364465224037, 0.0023057137872921907, 0.0026271806076847426]}\nC: {'rotation matrix': [[-0.998336, -0.002848, -0.057597], [0.054423, 0.283794, -0.95734], 
[0.019072, -0.958881, -0.283167]], 'translation vector': [1.687961, 4.436946, 1.571062]}\nD: {'rotation matrix': [[-0.998358, -0.001309, -0.057275], [0.054546, 0.284027, -0.957264], [0.017521, -0.958815, -0.283489]], 'translation vector': [1.688286, 4.43679, 1.571851]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.998283, -0.004041, -0.058434], [0.054839, 0.286044, -0.956646], [0.020581, -0.958208, -0.285332]], 'translation vector': [1.688122, 4.435732, 1.572228]}\nB: {'rotation matrix': [[0.9999952600112642, 0.0029895442027471574, 0.0009673920050112699], [-0.0029953662693526914, 0.9999719574782809, 0.006845684012961962], [-0.000945954294544353, -0.006847856485920328, 0.9999767653082445]], 'translation vector': [-0.00043220364465224037, 0.0023057137872921907, 0.0026271806076847426]}\nC: {'rotation matrix': [[-0.998336, -0.002848, -0.057597], [0.054423, 0.283794, -0.95734], [0.019072, -0.958881, -0.283167]], 'translation vector': [1.687961, 4.436946, 1.571062]}\nD: {'rotation matrix': [[-0.998358, -0.001309, -0.057275], [0.054546, 0.284027, -0.957264], [0.017521, -0.958815, -0.283489]], 'translation vector': [1.688286, 4.43679, 1.571851]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_17_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_17_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_17_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_17_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.083515, 0.422666, -0.902429], [0.995888, 0.067297, -0.060645], [0.035099, -0.903783, -0.426549]], 'translation vector': [4.26049, 5.866284, 1.66918]}\nB: {'rotation matrix': [[-0.080848, 0.422553, -0.902725], [0.996028, 0.068154, -0.057302], [0.037311, -0.903772, -0.426385]], 'translation vector': [4.26043, 5.866841, 1.668667]}\nC: {'rotation matrix': [[-0.081468, 0.422714, -0.902594], [0.995995, 0.068006, -0.058049], [0.036844, -0.903708, -0.426561]], 'translation vector': [4.260486, 5.864969, 1.669529]}\nD: {'rotation matrix': [[0.9999996666026792, 0.0007024902365725143, 0.00045309895052521656], [-0.0007041385629428506, 0.9999945142012453, 0.003060691886503368], [-0.00045146230549075186, -0.0030616184704849174, 0.9999957671575359]], 'translation vector': [-0.008238592634347341, 0.0026712907326988944, 0.0010534554726602252]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.083515, 0.422666, -0.902429], [0.995888, 0.067297, -0.060645], [0.035099, -0.903783, -0.426549]], 'translation vector': [4.26049, 5.866284, 1.66918]}\nB: {'rotation matrix': [[-0.080848, 0.422553, -0.902725], [0.996028, 0.068154, -0.057302], [0.037311, -0.903772, -0.426385]], 'translation vector': [4.26043, 5.866841, 1.668667]}\nC: {'rotation matrix': [[-0.081468, 0.422714, -0.902594], [0.995995, 0.068006, -0.058049], [0.036844, -0.903708, -0.426561]], 'translation vector': [4.260486, 5.864969, 1.669529]}\nD: {'rotation matrix': [[0.9999996666026792, 0.0007024902365725143, 0.00045309895052521656], [-0.0007041385629428506, 0.9999945142012453, 0.003060691886503368], [-0.00045146230549075186, -0.0030616184704849174, 0.9999957671575359]], 'translation vector': [-0.008238592634347341, 0.0026712907326988944, 0.0010534554726602252]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_18_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_18_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_18_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_18_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.192624, -0.379717, 0.904826], [-0.981148, -0.059825, 0.183766], [-0.015648, -0.923166, -0.384082]], 'translation vector': [4.984646, 4.164808, 1.32267]}\nB: {'rotation matrix': [[0.9993897560101446, -0.009672092115768357, 0.033569058676964116], [0.00944513640094131, 0.9999317047122397, 0.006930507743012298], [-0.033634032991460915, -0.006610261888877057, 0.9994127816547865]], 'translation vector': [-0.03424616147212767, -0.0027538632482175807, 0.008124405533084023]}\nC: {'rotation matrix': [[0.180272, -0.384554, 0.905329], [-0.983511, -0.056947, 0.171651], 
[-0.014453, -0.921344, -0.388479]], 'translation vector': [4.987018, 4.177592, 1.323464]}\nD: {'rotation matrix': [[0.205405, -0.377617, 0.902892], [-0.978531, -0.0633, 0.196139], [-0.016912, -0.923796, -0.382512]], 'translation vector': [4.985321, 4.152791, 1.324267]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.192624, -0.379717, 0.904826], [-0.981148, -0.059825, 0.183766], [-0.015648, -0.923166, -0.384082]], 'translation vector': [4.984646, 4.164808, 1.32267]}\nB: {'rotation matrix': [[0.9993897560101446, -0.009672092115768357, 0.033569058676964116], [0.00944513640094131, 0.9999317047122397, 0.006930507743012298], [-0.033634032991460915, -0.006610261888877057, 0.9994127816547865]], 'translation vector': [-0.03424616147212767, -0.0027538632482175807, 0.008124405533084023]}\nC: {'rotation matrix': [[0.180272, -0.384554, 0.905329], [-0.983511, -0.056947, 0.171651], [-0.014453, -0.921344, -0.388479]], 'translation vector': [4.987018, 4.177592, 1.323464]}\nD: {'rotation matrix': [[0.205405, -0.377617, 0.902892], [-0.978531, -0.0633, 0.196139], [-0.016912, -0.923796, -0.382512]], 'translation vector': [4.985321, 4.152791, 1.324267]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_19_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_19_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_19_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_19_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.084118, -0.329466, 0.940413], [-0.993483, 0.100574, -0.05363], [-0.076912, -0.938795, -0.335779]], 'translation vector': [4.338453, 2.933071, 1.462896]}\nB: {'rotation matrix': [[0.9999984493090167, 0.0015295352178733802, -0.0012886089849350254], [-0.0015301527829556035, 0.9999993364831051, -0.0007407082576259022], [0.0012866367449453698, 0.0007417730730382566, 0.99999926268211]], 'translation vector': [-0.001971799758651027, -0.003988184523042726, -0.0019001345524003455]}\nC: {'rotation matrix': [[-0.084181, -0.324678, 0.942071], [-0.993543, 0.09952, -0.054482], [-0.076066, -0.940574, -0.330959]], 'translation vector': [4.337488, 2.935505, 1.461639]}\nD: {'rotation matrix': [[-0.083371, -0.331462, 0.939778], [-0.993645, 0.099215, -0.053156], [-0.075621, -0.938238, -0.337627]], 'translation vector': [4.338066, 2.933557, 1.453891]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.084118, -0.329466, 0.940413], [-0.993483, 0.100574, -0.05363], [-0.076912, -0.938795, -0.335779]], 'translation vector': [4.338453, 2.933071, 1.462896]}\nB: {'rotation matrix': [[0.9999984493090167, 0.0015295352178733802, -0.0012886089849350254], [-0.0015301527829556035, 0.9999993364831051, -0.0007407082576259022], [0.0012866367449453698, 0.0007417730730382566, 0.99999926268211]], 'translation vector': [-0.001971799758651027, -0.003988184523042726, -0.0019001345524003455]}\nC: {'rotation matrix': [[-0.084181, -0.324678, 0.942071], [-0.993543, 0.09952, -0.054482], [-0.076066, -0.940574, -0.330959]], 'translation vector': [4.337488, 2.935505, 1.461639]}\nD: {'rotation matrix': [[-0.083371, -0.331462, 0.939778], [-0.993645, 0.099215, -0.053156], [-0.075621, -0.938238, -0.337627]], 'translation vector': [4.338066, 2.933557, 1.453891]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_20_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_20_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_20_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_20_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.486704, 0.327719, -0.809765], [0.869935, 0.097394, -0.483454], [-0.079571, -0.939742, -0.332496]], 'translation vector': [4.437128, 2.283443, 1.465507]}\nB: {'rotation matrix': [[0.9999935589617881, -0.0023123151478069903, 0.002638162918517237], [0.0023036775758451737, 0.9999925417720922, 0.0032437850296420778], [-0.0026449296307575294, -0.0032387699437697024, 0.9999912244700261]], 'translation vector': [0.0032199017210038927, 0.0001093759992842891, 0.0024338106134420556]}\nC: {'rotation matrix': [[-0.494127, 0.32769, -0.805269], [0.866163, 0.105829, 
-0.488427], [-0.074832, -0.938839, -0.336126]], 'translation vector': [4.441189, 2.279036, 1.469096]}\nD: {'rotation matrix': [[-0.489836, 0.32797, -0.807773], [0.868301, 0.100425, -0.485767], [-0.078196, -0.939335, -0.333968]], 'translation vector': [4.439312, 2.280933, 1.467607]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.486704, 0.327719, -0.809765], [0.869935, 0.097394, -0.483454], [-0.079571, -0.939742, -0.332496]], 'translation vector': [4.437128, 2.283443, 1.465507]}\nB: {'rotation matrix': [[0.9999935589617881, -0.0023123151478069903, 0.002638162918517237], [0.0023036775758451737, 0.9999925417720922, 0.0032437850296420778], [-0.0026449296307575294, -0.0032387699437697024, 0.9999912244700261]], 'translation vector': [0.0032199017210038927, 0.0001093759992842891, 0.0024338106134420556]}\nC: {'rotation matrix': [[-0.494127, 0.32769, -0.805269], [0.866163, 0.105829, -0.488427], [-0.074832, -0.938839, -0.336126]], 'translation vector': [4.441189, 2.279036, 1.469096]}\nD: {'rotation matrix': [[-0.489836, 0.32797, -0.807773], [0.868301, 0.100425, -0.485767], [-0.078196, -0.939335, -0.333968]], 'translation vector': [4.439312, 2.280933, 1.467607]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_21_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_21_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_21_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_21_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", 
+ "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.591474, -0.360427, 0.721284], [-0.806244, -0.251769, 0.535335], [-0.011352, -0.898167, -0.439507]], 'translation vector': [2.523668, 2.4613, 1.342936]}\nB: {'rotation matrix': [[0.9999823901895873, 0.005315913984669213, -0.0024241384709724934], [-0.005325670093989841, 0.9999780230897936, -0.003916850861463004], [0.002402806763812986, 0.0039304838949891525, 0.9999899783862463]], 'translation vector': [-0.0021162233882717763, -0.0011065369325464758, -0.0015805869003076012]}\nC: {'rotation matrix': [[0.588358, -0.362651, 0.722717], [-0.808515, -0.250803, 0.532355], [-0.0118, -0.897542, -0.440771]], 'translation vector': [2.523157, 2.461525, 1.343416]}\nD: {'rotation matrix': [[0.586933, -0.361149, 0.724625], [-0.809565, -0.249931, 0.531168], [-0.010725, -0.898391, -0.439067]], 'translation vector': [2.521696, 2.461699, 1.342706]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.591474, -0.360427, 0.721284], [-0.806244, -0.251769, 0.535335], [-0.011352, -0.898167, -0.439507]], 'translation vector': [2.523668, 2.4613, 1.342936]}\nB: {'rotation matrix': [[0.9999823901895873, 0.005315913984669213, -0.0024241384709724934], [-0.005325670093989841, 0.9999780230897936, -0.003916850861463004], [0.002402806763812986, 0.0039304838949891525, 0.9999899783862463]], 'translation vector': [-0.0021162233882717763, -0.0011065369325464758, -0.0015805869003076012]}\nC: {'rotation matrix': [[0.588358, -0.362651, 0.722717], [-0.808515, -0.250803, 0.532355], [-0.0118, -0.897542, -0.440771]], 'translation vector': [2.523157, 2.461525, 1.343416]}\nD: {'rotation matrix': [[0.586933, -0.361149, 0.724625], [-0.809565, -0.249931, 0.531168], [-0.010725, -0.898391, -0.439067]], 'translation vector': [2.521696, 2.461699, 1.342706]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_22_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_22_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_22_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_22_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.519941, -0.4438, 0.729866], [-0.853216, -0.228773, 0.468706], [-0.041038, -0.866432, -0.497605]], 'translation vector': [1.000289, 1.985685, 1.347635]}\nB: {'rotation matrix': [[0.520738, -0.4401, 0.731535], [-0.852723, -0.226811, 0.470553], [-0.041171, -0.868832, -0.493393]], 'translation vector': [0.998782, 1.983781, 1.347411]}\nC: {'rotation matrix': [[0.9999982219593319, -0.002079266464163152, 4.9620397493296405e-05], [0.0020793800784517404, 0.99998989374647, 0.004115140408478336], [-5.7016409592928054e-05, -0.004114590657101431, 0.9999914687195216]], 
'translation vector': [-0.0013942399199289301, -0.0008989415392872679, 0.0029889416631920795]}\nD: {'rotation matrix': [[0.521192, -0.438092, 0.732417], [-0.852373, -0.224319, 0.472378], [-0.04265, -0.870492, -0.490332]], 'translation vector': [0.999181, 1.981126, 1.348386]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.519941, -0.4438, 0.729866], [-0.853216, -0.228773, 0.468706], [-0.041038, -0.866432, -0.497605]], 'translation vector': [1.000289, 1.985685, 1.347635]}\nB: {'rotation matrix': [[0.520738, -0.4401, 0.731535], [-0.852723, -0.226811, 0.470553], [-0.041171, -0.868832, -0.493393]], 'translation vector': [0.998782, 1.983781, 1.347411]}\nC: {'rotation matrix': [[0.9999982219593319, -0.002079266464163152, 4.9620397493296405e-05], [0.0020793800784517404, 0.99998989374647, 0.004115140408478336], [-5.7016409592928054e-05, -0.004114590657101431, 0.9999914687195216]], 'translation vector': [-0.0013942399199289301, -0.0008989415392872679, 0.0029889416631920795]}\nD: {'rotation matrix': [[0.521192, -0.438092, 0.732417], [-0.852373, -0.224319, 0.472378], [-0.04265, -0.870492, -0.490332]], 'translation vector': [0.999181, 1.981126, 1.348386]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_23_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_23_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_23_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_23_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.961526, 0.04991, -0.270143], [0.263115, -0.450039, 0.853367], [-0.078983, -0.891613, -0.445856]], 'translation vector': [2.643601, 1.008587, 1.47483]}\nB: {'rotation matrix': [[0.9999611816262259, 0.006037262376624097, -0.006357733639992428], [-0.0060929360747264, 0.9999425771984013, -0.008789441445278661], [0.006305603526763411, 0.00882761527567942, 0.9999410260999323]], 'translation vector': [0.006025344459442472, -0.004704561758730241, -0.003336645842906716]}\nC: {'rotation matrix': [[0.958799, 0.0516, -0.27936], [0.272826, -0.441359, 0.85485], [-0.079187, -0.895846, -0.437252]], 'translation vector': [2.65219, 1.005876, 1.472401]}\nD: {'rotation matrix': [[0.963523, 0.050371, -0.262843], [0.256662, -0.452159, 0.854211], [-0.075819, -0.890514, -0.448594]], 'translation vector': [2.637859, 1.00927, 1.478429]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.961526, 0.04991, -0.270143], [0.263115, -0.450039, 0.853367], [-0.078983, -0.891613, -0.445856]], 'translation vector': [2.643601, 1.008587, 1.47483]}\nB: {'rotation matrix': [[0.9999611816262259, 0.006037262376624097, -0.006357733639992428], [-0.0060929360747264, 0.9999425771984013, -0.008789441445278661], [0.006305603526763411, 0.00882761527567942, 0.9999410260999323]], 'translation vector': [0.006025344459442472, -0.004704561758730241, -0.003336645842906716]}\nC: {'rotation matrix': [[0.958799, 0.0516, -0.27936], [0.272826, -0.441359, 0.85485], [-0.079187, -0.895846, -0.437252]], 'translation vector': [2.65219, 1.005876, 1.472401]}\nD: {'rotation matrix': [[0.963523, 0.050371, -0.262843], [0.256662, -0.452159, 0.854211], [-0.075819, -0.890514, -0.448594]], 'translation vector': [2.637859, 1.00927, 1.478429]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_24_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_24_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_24_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_24_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999982465629441, -0.0013465983865406563, -0.0009053084488285325], [0.0013501875456354963, 0.999982864157121, 0.005743205099670796], [0.0008980715136351007, -0.0057437521444019864, 0.9999827649954101]], 'translation vector': [0.006929822597295576, -0.003954296383870348, -0.0008962942027399556]}\nB: {'rotation matrix': [[0.678055, 0.431256, -0.595198], [0.734977, -0.40565, 0.543375], [-0.007108, -0.805894, -0.592017]], 'translation vector': [3.965842, 0.866337, 1.41271]}\nC: {'rotation matrix': [[0.680551, 0.428937, -0.594024], [0.732652, -0.407746, 0.544944], [-0.008465, 
-0.806075, -0.591754]], 'translation vector': [3.965306, 0.868392, 1.416605]}\nD: {'rotation matrix': [[0.681867, 0.427349, -0.593659], [0.731402, -0.409894, 0.545013], [-0.010426, -0.805829, -0.592057]], 'translation vector': [3.966104, 0.870012, 1.418402]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999982465629441, -0.0013465983865406563, -0.0009053084488285325], [0.0013501875456354963, 0.999982864157121, 0.005743205099670796], [0.0008980715136351007, -0.0057437521444019864, 0.9999827649954101]], 'translation vector': [0.006929822597295576, -0.003954296383870348, -0.0008962942027399556]}\nB: {'rotation matrix': [[0.678055, 0.431256, -0.595198], [0.734977, -0.40565, 0.543375], [-0.007108, -0.805894, -0.592017]], 'translation vector': [3.965842, 0.866337, 1.41271]}\nC: {'rotation matrix': [[0.680551, 0.428937, -0.594024], [0.732652, -0.407746, 0.544944], [-0.008465, -0.806075, -0.591754]], 'translation vector': [3.965306, 0.868392, 1.416605]}\nD: {'rotation matrix': [[0.681867, 0.427349, -0.593659], [0.731402, -0.409894, 0.545013], [-0.010426, -0.805829, -0.592057]], 'translation vector': [3.966104, 0.870012, 1.418402]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_25_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_25_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_25_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_25_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.999985554212974, -0.0031855347936376403, 0.004407056654662203], [0.003212953146819932, 0.9999747868858251, -0.006356723963664241], [-0.004386364338204691, 0.006371176808258376, 0.999970327730571]], 'translation vector': [-0.010337331339885125, -0.003078319020026643, -0.0057934529197571916]}\nB: {'rotation matrix': [[-0.816952, -0.193331, 0.543335], [-0.575587, 0.331994, -0.747315], [-0.035905, -0.923257, -0.382502]], 'translation vector': [4.389139, 4.029859, 1.398995]}\nC: {'rotation matrix': [[-0.817965, -0.190324, 0.542871], [-0.574258, 0.326013, -0.750961], [-0.034057, -0.926009, -0.375963]], 'translation vector': [4.389857, 4.037429, 1.401592]}\nD: {'rotation matrix': [[-0.817754, -0.196252, 0.541077], [-0.574392, 0.338327, -0.745392], [-0.036776, -0.920337, -0.389394]], 'translation vector': [4.391615, 4.02441, 1.397694]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.999985554212974, -0.0031855347936376403, 0.004407056654662203], [0.003212953146819932, 0.9999747868858251, -0.006356723963664241], [-0.004386364338204691, 0.006371176808258376, 0.999970327730571]], 'translation vector': [-0.010337331339885125, -0.003078319020026643, -0.0057934529197571916]}\nB: {'rotation matrix': [[-0.816952, -0.193331, 0.543335], [-0.575587, 0.331994, -0.747315], [-0.035905, -0.923257, -0.382502]], 'translation vector': [4.389139, 4.029859, 1.398995]}\nC: {'rotation matrix': [[-0.817965, -0.190324, 0.542871], [-0.574258, 0.326013, -0.750961], [-0.034057, -0.926009, -0.375963]], 'translation vector': [4.389857, 4.037429, 1.401592]}\nD: {'rotation matrix': [[-0.817754, -0.196252, 0.541077], [-0.574392, 0.338327, -0.745392], [-0.036776, -0.920337, -0.389394]], 'translation vector': [4.391615, 4.02441, 1.397694]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_26_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_26_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_26_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_26_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.989636, -0.107174, 0.095574], [-0.139302, -0.554918, 0.820159], [-0.034864, -0.824973, -0.564096]], 'translation vector': [6.683643, 2.494903, 1.406773]}\nB: {'rotation matrix': [[0.989623, -0.107386, 0.095468], [-0.139407, -0.55662, 0.818988], [-0.034808, -0.823798, -0.565814]], 'translation vector': [6.681599, 2.49535, 1.408922]}\nC: {'rotation matrix': [[0.989755, -0.10674, 0.094822], [-0.138471, -0.555821, 0.819688], [-0.034789, -0.824421, -0.564907]], 'translation vector': [6.681521, 2.493315, 1.407658]}\nD: {'rotation matrix': [[0.9999901303029469, 
0.004176228929177566, 0.0011903596205295832], [-0.00418098989555596, 0.9999858303738082, 0.003280298331639201], [-0.0011769495287118517, -0.003284936107684076, 0.9999936907012513]], 'translation vector': [-0.000365649748907515, 0.007725567810238587, -0.0007277349140568656]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.989636, -0.107174, 0.095574], [-0.139302, -0.554918, 0.820159], [-0.034864, -0.824973, -0.564096]], 'translation vector': [6.683643, 2.494903, 1.406773]}\nB: {'rotation matrix': [[0.989623, -0.107386, 0.095468], [-0.139407, -0.55662, 0.818988], [-0.034808, -0.823798, -0.565814]], 'translation vector': [6.681599, 2.49535, 1.408922]}\nC: {'rotation matrix': [[0.989755, -0.10674, 0.094822], [-0.138471, -0.555821, 0.819688], [-0.034789, -0.824421, -0.564907]], 'translation vector': [6.681521, 2.493315, 1.407658]}\nD: {'rotation matrix': [[0.9999901303029469, 0.004176228929177566, 0.0011903596205295832], [-0.00418098989555596, 0.9999858303738082, 0.003280298331639201], [-0.0011769495287118517, -0.003284936107684076, 0.9999936907012513]], 'translation vector': [-0.000365649748907515, 0.007725567810238587, -0.0007277349140568656]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_27_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_27_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_27_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_27_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999990866555979, 0.0004093298689293289, 0.0008391460833273652], [-0.00041250820264666677, 0.9999930945433543, 0.0037605519712407454], [-0.0008378351814566854, -0.0037610473264969298, 0.9999924864591837]], 'translation vector': [-0.002544400937407598, -0.0017921618995201394, 0.0018599883369136982]}\nB: {'rotation matrix': [[0.155491, 0.600889, -0.784063], [0.987779, -0.103232, 0.116776], [-0.010771, -0.792638, -0.609597]], 'translation vector': [3.280226, 1.958162, 1.281368]}\nC: {'rotation matrix': [[0.159827, 0.598569, -0.784966], [0.987096, -0.104834, 0.121042], [-0.009839, -0.794182, -0.6076]], 'translation vector': [3.27763, 1.954194, 1.282551]}\nD: {'rotation matrix': [[0.164916, 0.595071, -0.786571], [0.986276, -0.105924, 0.126651], [-0.007951, -0.796662, -0.604372]], 'translation vector': [3.274219, 1.949482, 1.285722]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999990866555979, 0.0004093298689293289, 0.0008391460833273652], [-0.00041250820264666677, 0.9999930945433543, 0.0037605519712407454], [-0.0008378351814566854, -0.0037610473264969298, 0.9999924864591837]], 'translation vector': [-0.002544400937407598, -0.0017921618995201394, 0.0018599883369136982]}\nB: {'rotation matrix': [[0.155491, 0.600889, -0.784063], [0.987779, -0.103232, 0.116776], [-0.010771, -0.792638, -0.609597]], 'translation vector': [3.280226, 1.958162, 1.281368]}\nC: {'rotation matrix': [[0.159827, 0.598569, -0.784966], [0.987096, -0.104834, 0.121042], [-0.009839, -0.794182, -0.6076]], 'translation vector': [3.27763, 1.954194, 1.282551]}\nD: {'rotation matrix': [[0.164916, 0.595071, -0.786571], [0.986276, -0.105924, 0.126651], [-0.007951, -0.796662, -0.604372]], 'translation vector': [3.274219, 1.949482, 1.285722]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_28_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_28_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_28_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_28_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999748122492941, 0.0038384551465654687, -0.005906369455983883], [-0.0037983225864595374, 0.9999703256171942, 0.006729367156156027], [0.005932617650240982, -0.006707131114408277, 0.9999600560348555]], 'translation vector': [-0.00423530822573337, 0.003061759875670811, -0.009950114450137715]}\nB: {'rotation matrix': [[-0.937821, -0.115212, 0.32744], [-0.346749, 0.354456, -0.868405], [-0.016013, -0.927948, -0.372366]], 'translation vector': [5.30238, 4.116027, 1.850731]}\nC: {'rotation matrix': [[-0.932005, -0.116649, 0.343162], [-0.36182, 0.355063, -0.861984], 
[-0.021294, -0.927536, -0.373127]], 'translation vector': [5.291139, 4.11983, 1.856331]}\nD: {'rotation matrix': [[-0.934388, -0.115649, 0.336964], [-0.355745, 0.353624, -0.865099], [-0.01911, -0.928211, -0.371563]], 'translation vector': [5.294776, 4.11946, 1.854234]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999748122492941, 0.0038384551465654687, -0.005906369455983883], [-0.0037983225864595374, 0.9999703256171942, 0.006729367156156027], [0.005932617650240982, -0.006707131114408277, 0.9999600560348555]], 'translation vector': [-0.00423530822573337, 0.003061759875670811, -0.009950114450137715]}\nB: {'rotation matrix': [[-0.937821, -0.115212, 0.32744], [-0.346749, 0.354456, -0.868405], [-0.016013, -0.927948, -0.372366]], 'translation vector': [5.30238, 4.116027, 1.850731]}\nC: {'rotation matrix': [[-0.932005, -0.116649, 0.343162], [-0.36182, 0.355063, -0.861984], [-0.021294, -0.927536, -0.373127]], 'translation vector': [5.291139, 4.11983, 1.856331]}\nD: {'rotation matrix': [[-0.934388, -0.115649, 0.336964], [-0.355745, 0.353624, -0.865099], [-0.01911, -0.928211, -0.371563]], 'translation vector': [5.294776, 4.11946, 1.854234]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_29_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_29_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_29_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_29_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.431582, -0.098037, 0.896731], [-0.900431, 0.106787, -0.421688], [-0.054418, -0.989437, -0.134363]], 'translation vector': [4.412532, 3.596741, 1.526323]}\nB: {'rotation matrix': [[-0.43275, -0.095778, 0.896412], [-0.899777, 0.107595, -0.422878], [-0.055947, -0.989571, -0.132741]], 'translation vector': [4.410773, 3.601486, 1.526138]}\nC: {'rotation matrix': [[0.9999520333968362, 0.0006753791567632332, -0.009755571516398104], [-0.0006310509478896895, 0.999990797204237, 0.004384447971913257], [0.009758133830260066, -0.004378955576420755, 0.9999427994349811]], 'translation vector': [0.0016435229369795579, -0.00040384651884517453, 0.0035746064241082287]}\nD: {'rotation matrix': [[-0.433914, -0.093907, 0.896047], [-0.899123, 0.108519, -0.42403], [-0.057419, -0.989649, -0.131522]], 'translation vector': [4.40951, 3.606652, 1.52516]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.431582, -0.098037, 0.896731], [-0.900431, 0.106787, -0.421688], [-0.054418, -0.989437, -0.134363]], 'translation vector': [4.412532, 3.596741, 1.526323]}\nB: {'rotation matrix': [[-0.43275, -0.095778, 0.896412], [-0.899777, 0.107595, -0.422878], [-0.055947, -0.989571, -0.132741]], 'translation vector': [4.410773, 3.601486, 1.526138]}\nC: {'rotation matrix': [[0.9999520333968362, 0.0006753791567632332, -0.009755571516398104], [-0.0006310509478896895, 0.999990797204237, 0.004384447971913257], [0.009758133830260066, -0.004378955576420755, 0.9999427994349811]], 'translation vector': [0.0016435229369795579, -0.00040384651884517453, 0.0035746064241082287]}\nD: {'rotation matrix': [[-0.433914, -0.093907, 0.896047], [-0.899123, 0.108519, -0.42403], [-0.057419, -0.989649, -0.131522]], 'translation vector': [4.40951, 3.606652, 1.52516]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_30_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_30_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_30_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_30_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999917661523451, 0.003965604575831933, -0.00020113541169654973], [-0.0039642406405361414, 0.9999897644646852, 0.002088386147537849], [0.00020867644615939134, -0.0020887952983848278, 0.9999975122366389]], 'translation vector': [0.003940681703816118, 0.0007777989134077623, 0.003188885648093276]}\nB: {'rotation matrix': [[-0.926146, 0.120999, -0.357228], [0.374267, 0.177659, -0.910144], [-0.046662, -0.976625, -0.209824]], 'translation vector': [4.737155, 2.737478, 1.223721]}\nC: {'rotation matrix': [[-0.926101, 0.124421, -0.356169], [0.374063, 0.179874, 
-0.909792], [-0.049131, -0.975789, -0.213123]], 'translation vector': [4.73486, 2.737298, 1.223615]}\nD: {'rotation matrix': [[-0.927631, 0.118543, -0.354186], [0.370581, 0.173865, -0.912382], [-0.046576, -0.977609, -0.205212]], 'translation vector': [4.731637, 2.739449, 1.226493]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999917661523451, 0.003965604575831933, -0.00020113541169654973], [-0.0039642406405361414, 0.9999897644646852, 0.002088386147537849], [0.00020867644615939134, -0.0020887952983848278, 0.9999975122366389]], 'translation vector': [0.003940681703816118, 0.0007777989134077623, 0.003188885648093276]}\nB: {'rotation matrix': [[-0.926146, 0.120999, -0.357228], [0.374267, 0.177659, -0.910144], [-0.046662, -0.976625, -0.209824]], 'translation vector': [4.737155, 2.737478, 1.223721]}\nC: {'rotation matrix': [[-0.926101, 0.124421, -0.356169], [0.374063, 0.179874, -0.909792], [-0.049131, -0.975789, -0.213123]], 'translation vector': [4.73486, 2.737298, 1.223615]}\nD: {'rotation matrix': [[-0.927631, 0.118543, -0.354186], [0.370581, 0.173865, -0.912382], [-0.046576, -0.977609, -0.205212]], 'translation vector': [4.731637, 2.739449, 1.226493]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_31_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_31_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_31_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_31_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", 
+ "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.854414, -0.337949, 0.394674], [-0.51408, -0.439475, 0.736602], [-0.075485, -0.832257, -0.549227]], 'translation vector': [2.728753, 6.764147, 1.410515]}\nB: {'rotation matrix': [[0.857663, -0.338131, 0.387404], [-0.508133, -0.441807, 0.739329], [-0.078832, -0.830948, -0.550737]], 'translation vector': [2.730525, 6.755143, 1.407191]}\nC: {'rotation matrix': [[0.856314, -0.338309, 0.390222], [-0.510605, -0.441176, 0.738002], [-0.077516, -0.83121, -0.550528]], 'translation vector': [2.731703, 6.760056, 1.408417]}\nD: {'rotation matrix': [[0.9999681707679642, 0.006356789386196493, 0.004849115684302276], [-0.0063488135736310385, 0.9999779524760675, -0.0017687198088092734], [-0.004860696689611514, 0.0017388114569044306, 0.9999869431575653]], 'translation vector': [0.001030885579985874, -0.006730226347642976, 0.006769981822561277]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.854414, -0.337949, 0.394674], [-0.51408, -0.439475, 0.736602], [-0.075485, -0.832257, -0.549227]], 'translation vector': [2.728753, 6.764147, 1.410515]}\nB: {'rotation matrix': [[0.857663, -0.338131, 0.387404], [-0.508133, -0.441807, 0.739329], [-0.078832, -0.830948, -0.550737]], 'translation vector': [2.730525, 6.755143, 1.407191]}\nC: {'rotation matrix': [[0.856314, -0.338309, 0.390222], [-0.510605, -0.441176, 0.738002], [-0.077516, -0.83121, -0.550528]], 'translation vector': [2.731703, 6.760056, 1.408417]}\nD: {'rotation matrix': [[0.9999681707679642, 0.006356789386196493, 0.004849115684302276], [-0.0063488135736310385, 0.9999779524760675, -0.0017687198088092734], [-0.004860696689611514, 0.0017388114569044306, 0.9999869431575653]], 'translation vector': [0.001030885579985874, -0.006730226347642976, 0.006769981822561277]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_32_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_32_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_32_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_32_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.230447, -0.471956, 0.850971], [-0.964157, 0.007445, 0.265227], [-0.131511, -0.881591, -0.453324]], 'translation vector': [3.039354, 2.955346, 1.549151]}\nB: {'rotation matrix': [[0.228449, -0.472123, 0.851417], [-0.96468, 0.008044, 0.2633], [-0.131159, -0.881496, -0.45361]], 'translation vector': [3.038737, 2.954341, 1.548813]}\nC: {'rotation matrix': [[0.9999932374461685, -2.1218966376535376e-05, -0.003832905020754294], [2.3397413103181216e-05, 0.9999998755264938, 0.00025510731414015743], [0.003833858200182055, -0.0002556535302727228, 0.9999922097121886]], 
'translation vector': [-0.00018189815458224956, -0.001091765193039329, 0.0006659190149727046]}\nD: {'rotation matrix': [[0.234859, -0.471403, 0.850071], [-0.962925, 0.006583, 0.269689], [-0.132728, -0.881894, -0.452379]], 'translation vector': [3.04024, 2.955162, 1.549553]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.230447, -0.471956, 0.850971], [-0.964157, 0.007445, 0.265227], [-0.131511, -0.881591, -0.453324]], 'translation vector': [3.039354, 2.955346, 1.549151]}\nB: {'rotation matrix': [[0.228449, -0.472123, 0.851417], [-0.96468, 0.008044, 0.2633], [-0.131159, -0.881496, -0.45361]], 'translation vector': [3.038737, 2.954341, 1.548813]}\nC: {'rotation matrix': [[0.9999932374461685, -2.1218966376535376e-05, -0.003832905020754294], [2.3397413103181216e-05, 0.9999998755264938, 0.00025510731414015743], [0.003833858200182055, -0.0002556535302727228, 0.9999922097121886]], 'translation vector': [-0.00018189815458224956, -0.001091765193039329, 0.0006659190149727046]}\nD: {'rotation matrix': [[0.234859, -0.471403, 0.850071], [-0.962925, 0.006583, 0.269689], [-0.132728, -0.881894, -0.452379]], 'translation vector': [3.04024, 2.955162, 1.549553]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_33_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_33_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_33_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_33_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999722451927703, 0.006302215099590525, -0.00403981735739725], [-0.006327345454891822, 0.9999605703041478, -0.0062233586210535905], [0.004001166188993241, 0.006248748688806741, 0.9999728906669906]], 'translation vector': [0.005093628494140745, -0.0003734020522905279, 0.0005966377475724594]}\nB: {'rotation matrix': [[-0.852779, -0.130984, 0.505581], [-0.521088, 0.148208, -0.840537], [0.035166, -0.980244, -0.194643]], 'translation vector': [2.708243, 1.722235, 1.600397]}\nC: {'rotation matrix': [[-0.85558, -0.133703, 0.500106], [-0.51643, 0.153622, -0.842437], [0.035809, -0.979042, -0.200484]], 'translation vector': [2.710987, 1.723705, 1.596351]}\nD: {'rotation matrix': [[-0.853917, -0.132599, 0.503232], [-0.519221, 0.151792, -0.841052], [0.035136, -0.979478, -0.198466]], 'translation vector': [2.709099, 1.722802, 1.598917]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999722451927703, 0.006302215099590525, -0.00403981735739725], [-0.006327345454891822, 0.9999605703041478, -0.0062233586210535905], [0.004001166188993241, 0.006248748688806741, 0.9999728906669906]], 'translation vector': [0.005093628494140745, -0.0003734020522905279, 0.0005966377475724594]}\nB: {'rotation matrix': [[-0.852779, -0.130984, 0.505581], [-0.521088, 0.148208, -0.840537], [0.035166, -0.980244, -0.194643]], 'translation vector': [2.708243, 1.722235, 1.600397]}\nC: {'rotation matrix': [[-0.85558, -0.133703, 0.500106], [-0.51643, 0.153622, -0.842437], [0.035809, -0.979042, -0.200484]], 'translation vector': [2.710987, 1.723705, 1.596351]}\nD: {'rotation matrix': [[-0.853917, -0.132599, 0.503232], [-0.519221, 0.151792, -0.841052], [0.035136, -0.979478, -0.198466]], 'translation vector': [2.709099, 1.722802, 1.598917]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_34_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_34_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_34_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_34_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.478873, -0.428944, 0.765955], [-0.87533, -0.166797, 0.453846], [-0.066915, -0.887798, -0.455343]], 'translation vector': [0.725473, 2.084639, 1.401624]}\nB: {'rotation matrix': [[0.9999321559246817, -0.004700941469421, -0.010648795235394873], [0.0046277052826301434, 0.9999657024420984, -0.006814383513752801], [0.010680686020790284, 0.006765103656469935, 0.9999199949357819]], 'translation vector': [-0.010159202650746213, -0.00890278579572934, 0.006564575177659959]}\nC: {'rotation matrix': [[0.476891, -0.427829, 0.767814], [-0.876452, -0.165482, 0.452159], 
[-0.066387, -0.888582, -0.453888]], 'translation vector': [0.720453, 2.082574, 1.402557]}\nD: {'rotation matrix': [[0.480806, -0.429519, 0.764421], [-0.874127, -0.166433, 0.456292], [-0.068761, -0.887589, -0.455476]], 'translation vector': [0.729586, 2.089959, 1.401763]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.478873, -0.428944, 0.765955], [-0.87533, -0.166797, 0.453846], [-0.066915, -0.887798, -0.455343]], 'translation vector': [0.725473, 2.084639, 1.401624]}\nB: {'rotation matrix': [[0.9999321559246817, -0.004700941469421, -0.010648795235394873], [0.0046277052826301434, 0.9999657024420984, -0.006814383513752801], [0.010680686020790284, 0.006765103656469935, 0.9999199949357819]], 'translation vector': [-0.010159202650746213, -0.00890278579572934, 0.006564575177659959]}\nC: {'rotation matrix': [[0.476891, -0.427829, 0.767814], [-0.876452, -0.165482, 0.452159], [-0.066387, -0.888582, -0.453888]], 'translation vector': [0.720453, 2.082574, 1.402557]}\nD: {'rotation matrix': [[0.480806, -0.429519, 0.764421], [-0.874127, -0.166433, 0.456292], [-0.068761, -0.887589, -0.455476]], 'translation vector': [0.729586, 2.089959, 1.401763]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_35_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_35_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_35_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_35_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.745148, -0.37119, 0.554052], [-0.66666, -0.436838, 0.603934], [0.017857, -0.819385, -0.572966]], 'translation vector': [3.707678, 4.401502, 1.259793]}\nB: {'rotation matrix': [[0.9999956925108978, 0.0027628800894689116, -0.0013776131907199199], [-0.002750831230441356, 0.9999530244932063, 0.009285447903977378], [0.0014027365353670037, -0.00928147923100416, 0.9999562334978788]], 'translation vector': [-0.001323540742452639, -0.0019242276069570963, 0.0020358659652854882]}\nC: {'rotation matrix': [[0.745353, -0.370803, 0.554034], [-0.666429, -0.436771, 0.604239], [0.017933, -0.819595, -0.572662]], 'translation vector': [3.707908, 4.40198, 1.260519]}\nD: {'rotation matrix': [[0.746372, -0.37052, 0.55285], [-0.665272, -0.438418, 0.60432], [0.018468, -0.818844, -0.57372]], 'translation vector': [3.708833, 4.402057, 1.261367]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.745148, -0.37119, 0.554052], [-0.66666, -0.436838, 0.603934], [0.017857, -0.819385, -0.572966]], 'translation vector': [3.707678, 4.401502, 1.259793]}\nB: {'rotation matrix': [[0.9999956925108978, 0.0027628800894689116, -0.0013776131907199199], [-0.002750831230441356, 0.9999530244932063, 0.009285447903977378], [0.0014027365353670037, -0.00928147923100416, 0.9999562334978788]], 'translation vector': [-0.001323540742452639, -0.0019242276069570963, 0.0020358659652854882]}\nC: {'rotation matrix': [[0.745353, -0.370803, 0.554034], [-0.666429, -0.436771, 0.604239], [0.017933, -0.819595, -0.572662]], 'translation vector': [3.707908, 4.40198, 1.260519]}\nD: {'rotation matrix': [[0.746372, -0.37052, 0.55285], [-0.665272, -0.438418, 0.60432], [0.018468, -0.818844, -0.57372]], 'translation vector': [3.708833, 4.402057, 1.261367]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_36_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_36_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_36_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_36_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.116268, -0.545929, 0.829725], [-0.992164, 0.102308, -0.071715], [-0.045736, -0.831562, -0.553546]], 'translation vector': [1.188241, 1.804719, 1.496587]}\nB: {'rotation matrix': [[-0.114381, -0.546538, 0.829586], [-0.992382, 0.101334, -0.070067], [-0.045771, -0.83128, -0.553966]], 'translation vector': [1.18804, 1.806907, 1.497044]}\nC: {'rotation matrix': [[-0.116275, -0.545912, 0.829735], [-0.992183, 0.101947, -0.071965], [-0.045303, -0.831617, -0.553499]], 'translation vector': [1.188215, 1.807271, 1.496983]}\nD: {'rotation matrix': [[0.9999972656726313, 
0.0013206868291442259, 0.0020218952923470846], [-0.0013233096218697464, 0.999999141112598, 0.0010787718934225417], [-0.0020206374390544144, -0.0010806530653509267, 0.9999977188154618]], 'translation vector': [-0.0038275910671821123, 0.000160776135250007, -0.0021485328355081296]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.116268, -0.545929, 0.829725], [-0.992164, 0.102308, -0.071715], [-0.045736, -0.831562, -0.553546]], 'translation vector': [1.188241, 1.804719, 1.496587]}\nB: {'rotation matrix': [[-0.114381, -0.546538, 0.829586], [-0.992382, 0.101334, -0.070067], [-0.045771, -0.83128, -0.553966]], 'translation vector': [1.18804, 1.806907, 1.497044]}\nC: {'rotation matrix': [[-0.116275, -0.545912, 0.829735], [-0.992183, 0.101947, -0.071965], [-0.045303, -0.831617, -0.553499]], 'translation vector': [1.188215, 1.807271, 1.496983]}\nD: {'rotation matrix': [[0.9999972656726313, 0.0013206868291442259, 0.0020218952923470846], [-0.0013233096218697464, 0.999999141112598, 0.0010787718934225417], [-0.0020206374390544144, -0.0010806530653509267, 0.9999977188154618]], 'translation vector': [-0.0038275910671821123, 0.000160776135250007, -0.0021485328355081296]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_37_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_37_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_37_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_37_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d 
image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.997087, -0.025415, -0.071922], [0.054952, -0.414609, 0.908339], [-0.052905, -0.909645, -0.412004]], 'translation vector': [4.407682, 5.403047, 1.49649]}\nB: {'rotation matrix': [[1.0000003154168173, -0.00026066196587795827, 0.0009499731630029714], [0.00026257414143552785, 0.9999962311537921, -0.0026991488717855007], [-0.0009496510997198135, 0.002699452079922895, 0.9999961867215906]], 'translation vector': [-0.0035058102061285012, -0.0001503164701972537, 0.00022028015795205746]}\nC: {'rotation matrix': [[0.996877, -0.026735, -0.074307], [0.056551, -0.415107, 0.908013], [-0.055122, -0.909379, -0.412299]], 'translation vector': [4.407921, 5.402507, 1.494552]}\nD: {'rotation matrix': [[0.997138, -0.02412, -0.07165], [0.055312, -0.413324, 0.908903], [-0.051537, -0.910265, -0.410807]], 'translation vector': [4.410345, 5.401881, 1.497987]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.997087, -0.025415, -0.071922], [0.054952, -0.414609, 0.908339], [-0.052905, -0.909645, -0.412004]], 'translation vector': [4.407682, 5.403047, 1.49649]}\nB: {'rotation matrix': [[1.0000003154168173, -0.00026066196587795827, 0.0009499731630029714], [0.00026257414143552785, 0.9999962311537921, -0.0026991488717855007], [-0.0009496510997198135, 0.002699452079922895, 0.9999961867215906]], 'translation vector': [-0.0035058102061285012, -0.0001503164701972537, 0.00022028015795205746]}\nC: {'rotation matrix': [[0.996877, -0.026735, -0.074307], [0.056551, -0.415107, 0.908013], [-0.055122, -0.909379, -0.412299]], 'translation vector': [4.407921, 5.402507, 1.494552]}\nD: {'rotation matrix': [[0.997138, -0.02412, -0.07165], [0.055312, -0.413324, 0.908903], [-0.051537, -0.910265, -0.410807]], 'translation vector': [4.410345, 5.401881, 1.497987]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_38_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_38_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_38_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_38_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.317061, -0.465845, 0.826112], [-0.947162, 0.200121, -0.250671], [-0.048548, -0.86194, -0.504681]], 'translation vector': [2.298134, 2.388596, 1.453916]}\nB: {'rotation matrix': [[-0.317304, -0.461983, 0.828185], [-0.946993, 0.200626, -0.250908], [-0.05024, -0.863899, -0.501153]], 'translation vector': [2.298876, 2.392571, 1.455489]}\nC: {'rotation matrix': [[0.99999454935107, 0.0002197888751369094, -0.0032128422811739327], [-0.00022937596583946223, 0.9999976816384528, -0.0024943946424807743], [0.0032131009034445414, 0.0024955910558726972, 
0.9999912821080568]], 'translation vector': [0.001089045909415276, -0.0004423003337574727, -0.0002086110960047849]}\nD: {'rotation matrix': [[-0.314906, -0.456701, 0.83202], [-0.947639, 0.200286, -0.248728], [-0.053048, -0.866781, -0.49586]], 'translation vector': [2.297376, 2.389925, 1.457247]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.317061, -0.465845, 0.826112], [-0.947162, 0.200121, -0.250671], [-0.048548, -0.86194, -0.504681]], 'translation vector': [2.298134, 2.388596, 1.453916]}\nB: {'rotation matrix': [[-0.317304, -0.461983, 0.828185], [-0.946993, 0.200626, -0.250908], [-0.05024, -0.863899, -0.501153]], 'translation vector': [2.298876, 2.392571, 1.455489]}\nC: {'rotation matrix': [[0.99999454935107, 0.0002197888751369094, -0.0032128422811739327], [-0.00022937596583946223, 0.9999976816384528, -0.0024943946424807743], [0.0032131009034445414, 0.0024955910558726972, 0.9999912821080568]], 'translation vector': [0.001089045909415276, -0.0004423003337574727, -0.0002086110960047849]}\nD: {'rotation matrix': [[-0.314906, -0.456701, 0.83202], [-0.947639, 0.200286, -0.248728], [-0.053048, -0.866781, -0.49586]], 'translation vector': [2.297376, 2.389925, 1.457247]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_39_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_39_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_39_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_39_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + 
"visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.887986, -0.072993, 0.45404], [-0.454573, -0.288743, 0.84261], [0.069596, -0.95462, -0.28958]], 'translation vector': [3.216625, 3.12153, 1.569232]}\nB: {'rotation matrix': [[0.881294, -0.091283, 0.463668], [-0.468392, -0.298889, 0.831429], [0.06269, -0.949912, -0.306165]], 'translation vector': [3.22503, 3.133041, 1.572641]}\nC: {'rotation matrix': [[0.9998534926564413, 0.011234421547280734, -0.012941695383424894], [-0.01131150095792877, 0.9999182350466784, -0.005915937307264116], [0.012873405071388795, 0.006062261057420049, 0.9998987281608317]], 'translation vector': [-0.0006289172879170302, -0.011376901257172278, -0.010410317713380746]}\nD: {'rotation matrix': [[0.883743, -0.084646, 0.460254], [-0.463409, -0.29531, 0.83549], [0.065197, -0.951644, -0.300204]], 'translation vector': [3.211292, 3.12843, 1.571525]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.887986, -0.072993, 0.45404], [-0.454573, -0.288743, 0.84261], [0.069596, -0.95462, -0.28958]], 'translation vector': [3.216625, 3.12153, 1.569232]}\nB: {'rotation matrix': [[0.881294, -0.091283, 0.463668], [-0.468392, -0.298889, 0.831429], [0.06269, -0.949912, -0.306165]], 'translation vector': [3.22503, 3.133041, 1.572641]}\nC: {'rotation matrix': [[0.9998534926564413, 0.011234421547280734, -0.012941695383424894], [-0.01131150095792877, 0.9999182350466784, -0.005915937307264116], [0.012873405071388795, 0.006062261057420049, 0.9998987281608317]], 'translation vector': [-0.0006289172879170302, -0.011376901257172278, -0.010410317713380746]}\nD: {'rotation matrix': [[0.883743, -0.084646, 0.460254], [-0.463409, -0.29531, 0.83549], [0.065197, -0.951644, -0.300204]], 'translation vector': [3.211292, 3.12843, 1.571525]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_40_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_40_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_40_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_40_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999922215867046, 0.00022785537094286064, 0.0038428059210989744], [-0.0002767566400686762, 0.9999180395239896, 0.0128255569964717], [-0.0038395911947271166, -0.01282698821636408, 0.9999095074136637]], 'translation vector': [-0.012178319620715694, 0.0009877403245357463, 0.004998693428563072]}\nB: {'rotation matrix': [[-0.349791, 0.571502, -0.742315], [0.927295, 0.098467, -0.361147], [-0.133303, -0.814672, -0.564394]], 'translation vector': [7.153554, 3.625007, 1.584927]}\nC: {'rotation matrix': [[-0.352738, 0.563812, -0.746788], [0.92567, 0.093586, -0.366575], [-0.13679, 
-0.820584, -0.554915]], 'translation vector': [7.154875, 3.637451, 1.583088]}\nD: {'rotation matrix': [[-0.346362, 0.577228, -0.739487], [0.929238, 0.103006, -0.354833], [-0.128648, -0.81006, -0.57206]], 'translation vector': [7.154236, 3.613202, 1.583063]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999922215867046, 0.00022785537094286064, 0.0038428059210989744], [-0.0002767566400686762, 0.9999180395239896, 0.0128255569964717], [-0.0038395911947271166, -0.01282698821636408, 0.9999095074136637]], 'translation vector': [-0.012178319620715694, 0.0009877403245357463, 0.004998693428563072]}\nB: {'rotation matrix': [[-0.349791, 0.571502, -0.742315], [0.927295, 0.098467, -0.361147], [-0.133303, -0.814672, -0.564394]], 'translation vector': [7.153554, 3.625007, 1.584927]}\nC: {'rotation matrix': [[-0.352738, 0.563812, -0.746788], [0.92567, 0.093586, -0.366575], [-0.13679, -0.820584, -0.554915]], 'translation vector': [7.154875, 3.637451, 1.583088]}\nD: {'rotation matrix': [[-0.346362, 0.577228, -0.739487], [0.929238, 0.103006, -0.354833], [-0.128648, -0.81006, -0.57206]], 'translation vector': [7.154236, 3.613202, 1.583063]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_41_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_41_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_41_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_41_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.295342, -0.337479, 0.893802], [-0.954338, -0.060275, 0.292587], [-0.044868, -0.939402, -0.33987]], 'translation vector': [3.757863, 4.507889, 1.342911]}\nB: {'rotation matrix': [[0.300826, -0.332026, 0.894015], [-0.952739, -0.063015, 0.297182], [-0.042336, -0.941163, -0.33529]], 'translation vector': [3.757184, 4.502328, 1.344268]}\nC: {'rotation matrix': [[0.280244, -0.341756, 0.897032], [-0.959084, -0.060487, 0.276585], [-0.040265, -0.93784, -0.344724]], 'translation vector': [3.749212, 4.541941, 1.346336]}\nD: {'rotation matrix': [[0.9996750004430802, 0.005976407862532351, -0.024761409167148862], [-0.005880950295372731, 0.9999756244730406, 0.0039275528492273585], [0.024784884729748376, -0.0037817785482656863, 0.9996858791488856]], 'translation vector': [0.014484406993793275, 0.0006255655612603661, -0.006516735656635575]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.295342, -0.337479, 0.893802], [-0.954338, -0.060275, 0.292587], [-0.044868, -0.939402, -0.33987]], 'translation vector': [3.757863, 4.507889, 1.342911]}\nB: {'rotation matrix': [[0.300826, -0.332026, 0.894015], [-0.952739, -0.063015, 0.297182], [-0.042336, -0.941163, -0.33529]], 'translation vector': [3.757184, 4.502328, 1.344268]}\nC: {'rotation matrix': [[0.280244, -0.341756, 0.897032], [-0.959084, -0.060487, 0.276585], [-0.040265, -0.93784, -0.344724]], 'translation vector': [3.749212, 4.541941, 1.346336]}\nD: {'rotation matrix': [[0.9996750004430802, 0.005976407862532351, -0.024761409167148862], [-0.005880950295372731, 0.9999756244730406, 0.0039275528492273585], [0.024784884729748376, -0.0037817785482656863, 0.9996858791488856]], 'translation vector': [0.014484406993793275, 0.0006255655612603661, -0.006516735656635575]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_42_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_42_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_42_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_42_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.408105, -0.298824, 0.862644], [-0.912691, -0.155393, 0.377953], [0.021107, -0.941572, -0.33615]], 'translation vector': [3.68854, 2.987475, 1.504179]}\nB: {'rotation matrix': [[0.414415, -0.280989, 0.865625], [-0.909796, -0.152002, 0.386221], [0.023053, -0.947597, -0.318634]], 'translation vector': [3.695469, 2.977012, 1.528306]}\nC: {'rotation matrix': [[0.9999691985275884, -0.0028723049102575577, -0.007257311467463551], [0.0029984988555096107, 0.9998441859378951, 0.01739980616527257], [0.007206986340996781, -0.017422228683580315, 0.9998224150329578]], 
'translation vector': [0.005083024714500617, 0.008976294562036191, -0.004546920528859744]}\nD: {'rotation matrix': [[0.410301, -0.290948, 0.864293], [-0.911655, -0.15496, 0.38062], [0.02319, -0.944106, -0.328824]], 'translation vector': [3.694328, 2.984669, 1.517045]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.408105, -0.298824, 0.862644], [-0.912691, -0.155393, 0.377953], [0.021107, -0.941572, -0.33615]], 'translation vector': [3.68854, 2.987475, 1.504179]}\nB: {'rotation matrix': [[0.414415, -0.280989, 0.865625], [-0.909796, -0.152002, 0.386221], [0.023053, -0.947597, -0.318634]], 'translation vector': [3.695469, 2.977012, 1.528306]}\nC: {'rotation matrix': [[0.9999691985275884, -0.0028723049102575577, -0.007257311467463551], [0.0029984988555096107, 0.9998441859378951, 0.01739980616527257], [0.007206986340996781, -0.017422228683580315, 0.9998224150329578]], 'translation vector': [0.005083024714500617, 0.008976294562036191, -0.004546920528859744]}\nD: {'rotation matrix': [[0.410301, -0.290948, 0.864293], [-0.911655, -0.15496, 0.38062], [0.02319, -0.944106, -0.328824]], 'translation vector': [3.694328, 2.984669, 1.517045]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_43_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_43_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_43_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_43_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.794148, 0.542055, -0.274783], [0.607642, 0.715697, -0.34431], [0.010026, -0.440403, -0.897744]], 'translation vector': [2.029685, 2.312871, 1.199782]}\nB: {'rotation matrix': [[-0.789274, 0.545355, -0.282196], [0.613822, 0.713018, -0.338862], [0.016411, -0.440674, -0.897518]], 'translation vector': [2.029754, 2.312013, 1.198812]}\nC: {'rotation matrix': [[-0.792558, 0.542991, -0.277512], [0.609667, 0.714954, -0.342268], [0.01256, -0.440457, -0.897686]], 'translation vector': [2.028831, 2.312793, 1.199579]}\nD: {'rotation matrix': [[0.9999690183091173, 0.00758939386438217, -0.0016348482581436392], [-0.007592849178659918, 0.9999686021169869, -0.0023322818814674375], [0.0016172805944689142, 0.0023449024761050914, 0.9999961697739403]], 'translation vector': [-0.0009595698380654716, -0.001243015152278204, -0.0008030280503015241]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.794148, 0.542055, -0.274783], [0.607642, 0.715697, -0.34431], [0.010026, -0.440403, -0.897744]], 'translation vector': [2.029685, 2.312871, 1.199782]}\nB: {'rotation matrix': [[-0.789274, 0.545355, -0.282196], [0.613822, 0.713018, -0.338862], [0.016411, -0.440674, -0.897518]], 'translation vector': [2.029754, 2.312013, 1.198812]}\nC: {'rotation matrix': [[-0.792558, 0.542991, -0.277512], [0.609667, 0.714954, -0.342268], [0.01256, -0.440457, -0.897686]], 'translation vector': [2.028831, 2.312793, 1.199579]}\nD: {'rotation matrix': [[0.9999690183091173, 0.00758939386438217, -0.0016348482581436392], [-0.007592849178659918, 0.9999686021169869, -0.0023322818814674375], [0.0016172805944689142, 0.0023449024761050914, 0.9999961697739403]], 'translation vector': [-0.0009595698380654716, -0.001243015152278204, -0.0008030280503015241]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_44_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_44_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_44_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_44_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.595869, 0.488846, -0.637158], [0.802996, -0.351087, 0.481596], [0.011728, -0.798604, -0.601743]], 'translation vector': [3.453696, 1.113575, 1.412785]}\nB: {'rotation matrix': [[0.596116, 0.489381, -0.636516], [0.802814, -0.351792, 0.481386], [0.01166, -0.797966, -0.60259]], 'translation vector': [3.45192, 1.112521, 1.411639]}\nC: {'rotation matrix': [[0.9999972585421386, -0.0019706644639079346, -0.0016038162580140711], [0.0019696489330632795, 0.9999981939869151, -0.0004898637930363391], [0.00160459924260016, 0.0004861135599392089, 0.9999985246041514]], 
'translation vector': [0.0004367632436044211, -0.0013470306629629059, 0.001255614697354801]}\nD: {'rotation matrix': [[0.596167, 0.487305, -0.638059], [0.802791, -0.351322, 0.481768], [0.010604, -0.799442, -0.60065]], 'translation vector': [3.452477, 1.114933, 1.412574]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.595869, 0.488846, -0.637158], [0.802996, -0.351087, 0.481596], [0.011728, -0.798604, -0.601743]], 'translation vector': [3.453696, 1.113575, 1.412785]}\nB: {'rotation matrix': [[0.596116, 0.489381, -0.636516], [0.802814, -0.351792, 0.481386], [0.01166, -0.797966, -0.60259]], 'translation vector': [3.45192, 1.112521, 1.411639]}\nC: {'rotation matrix': [[0.9999972585421386, -0.0019706644639079346, -0.0016038162580140711], [0.0019696489330632795, 0.9999981939869151, -0.0004898637930363391], [0.00160459924260016, 0.0004861135599392089, 0.9999985246041514]], 'translation vector': [0.0004367632436044211, -0.0013470306629629059, 0.001255614697354801]}\nD: {'rotation matrix': [[0.596167, 0.487305, -0.638059], [0.802791, -0.351322, 0.481768], [0.010604, -0.799442, -0.60065]], 'translation vector': [3.452477, 1.114933, 1.412574]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_45_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_45_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_45_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_45_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.454351, -0.425578, 0.782591], [-0.890802, 0.223047, -0.395881], [-0.006077, -0.877003, -0.480447]], 'translation vector': [2.248463, 3.862178, 1.517095]}\nB: {'rotation matrix': [[-0.455273, -0.423684, 0.783083], [-0.890312, 0.224946, -0.395908], [-0.008411, -0.877434, -0.479623]], 'translation vector': [2.248543, 3.862554, 1.517483]}\nC: {'rotation matrix': [[0.9999998220095317, 0.00019344203418205028, 0.0003904002954544705], [-0.00019414720576528636, 0.99999726822095, 0.002542005647158053], [-0.00038905761812178927, -0.002542402173661647, 0.9999965389998547]], 'translation vector': [-0.0007329855911066829, -0.0003036787606989222, 0.00038766167203613255]}\nD: {'rotation matrix': [[-0.455182, -0.424042, 0.782942], [-0.890372, 0.223522, -0.39658], [-0.006838, -0.877625, -0.479299]], 'translation vector': [2.247845, 3.863035, 1.516836]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.454351, -0.425578, 0.782591], [-0.890802, 0.223047, -0.395881], [-0.006077, -0.877003, -0.480447]], 'translation vector': [2.248463, 3.862178, 1.517095]}\nB: {'rotation matrix': [[-0.455273, -0.423684, 0.783083], [-0.890312, 0.224946, -0.395908], [-0.008411, -0.877434, -0.479623]], 'translation vector': [2.248543, 3.862554, 1.517483]}\nC: {'rotation matrix': [[0.9999998220095317, 0.00019344203418205028, 0.0003904002954544705], [-0.00019414720576528636, 0.99999726822095, 0.002542005647158053], [-0.00038905761812178927, -0.002542402173661647, 0.9999965389998547]], 'translation vector': [-0.0007329855911066829, -0.0003036787606989222, 0.00038766167203613255]}\nD: {'rotation matrix': [[-0.455182, -0.424042, 0.782942], [-0.890372, 0.223522, -0.39658], [-0.006838, -0.877625, -0.479299]], 'translation vector': [2.247845, 3.863035, 1.516836]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_46_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_46_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_46_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_46_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.503003, -0.389292, 0.771648], [-0.863104, 0.179601, -0.472011], [0.045161, -0.903435, -0.426339]], 'translation vector': [8.991447, 2.792113, 1.935809]}\nB: {'rotation matrix': [[0.9998820105783071, 0.006892269339613531, -0.013715431860878731], [-0.006800370949834618, 0.9999536838119013, 0.00673862053023821], [0.013761408703211613, -0.006644159929287518, 0.9998834142169373]], 'translation vector': [0.0018210588463203337, -0.0009922093443961444, -0.010804184818734797]}\nC: {'rotation matrix': [[-0.507392, -0.392271, 0.767253], [-0.860549, 0.184347, 
-0.474839], [0.044825, -0.901188, -0.431104]], 'translation vector': [8.996889, 2.787546, 1.938329]}\nD: {'rotation matrix': [[-0.511945, -0.391945, 0.76439], [-0.857826, 0.186392, -0.47895], [0.045246, -0.900909, -0.431643]], 'translation vector': [9.004251, 2.788493, 1.934378]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.503003, -0.389292, 0.771648], [-0.863104, 0.179601, -0.472011], [0.045161, -0.903435, -0.426339]], 'translation vector': [8.991447, 2.792113, 1.935809]}\nB: {'rotation matrix': [[0.9998820105783071, 0.006892269339613531, -0.013715431860878731], [-0.006800370949834618, 0.9999536838119013, 0.00673862053023821], [0.013761408703211613, -0.006644159929287518, 0.9998834142169373]], 'translation vector': [0.0018210588463203337, -0.0009922093443961444, -0.010804184818734797]}\nC: {'rotation matrix': [[-0.507392, -0.392271, 0.767253], [-0.860549, 0.184347, -0.474839], [0.044825, -0.901188, -0.431104]], 'translation vector': [8.996889, 2.787546, 1.938329]}\nD: {'rotation matrix': [[-0.511945, -0.391945, 0.76439], [-0.857826, 0.186392, -0.47895], [0.045246, -0.900909, -0.431643]], 'translation vector': [9.004251, 2.788493, 1.934378]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_47_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_47_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_47_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_47_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.347731, 0.021734, -0.937343], [0.937583, -0.012995, 0.347518], [-0.004627, -0.999679, -0.024896]], 'translation vector': [3.086271, 2.7877, 1.609772]}\nB: {'rotation matrix': [[0.349639, 0.022266, -0.93662], [0.936876, -0.012692, 0.349433], [-0.004108, -0.999671, -0.025298]], 'translation vector': [3.085923, 2.787744, 1.608445]}\nC: {'rotation matrix': [[0.9999983503028658, 0.0012588328917477426, 0.0010046121758577645], [-0.001258491549862339, 0.999999582567488, 0.0007249187267349242], [-0.0010031229358240259, -0.0007259449856603071, 0.9999988669360486]], 'translation vector': [-0.0007969980165536406, 0.0011986171600251172, 0.00041117594518969014]}\nD: {'rotation matrix': [[0.348071, 0.022212, -0.937205], [0.937459, -0.012734, 0.347863], [-0.004208, -0.999672, -0.025256]], 'translation vector': [3.0862, 2.78781, 1.60897]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.347731, 0.021734, -0.937343], [0.937583, -0.012995, 0.347518], [-0.004627, -0.999679, -0.024896]], 'translation vector': [3.086271, 2.7877, 1.609772]}\nB: {'rotation matrix': [[0.349639, 0.022266, -0.93662], [0.936876, -0.012692, 0.349433], [-0.004108, -0.999671, -0.025298]], 'translation vector': [3.085923, 2.787744, 1.608445]}\nC: {'rotation matrix': [[0.9999983503028658, 0.0012588328917477426, 0.0010046121758577645], [-0.001258491549862339, 0.999999582567488, 0.0007249187267349242], [-0.0010031229358240259, -0.0007259449856603071, 0.9999988669360486]], 'translation vector': [-0.0007969980165536406, 0.0011986171600251172, 0.00041117594518969014]}\nD: {'rotation matrix': [[0.348071, 0.022212, -0.937205], [0.937459, -0.012734, 0.347863], [-0.004208, -0.999672, -0.025256]], 'translation vector': [3.0862, 2.78781, 1.60897]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_48_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_48_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_48_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_48_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.671878, -0.274721, 0.687829], [-0.740313, -0.220562, 0.635051], [-0.022753, -0.935885, -0.35157]], 'translation vector': [3.807358, 2.10759, 1.500018]}\nB: {'rotation matrix': [[0.9999995287042943, -0.00016620863165950158, -1.0138120576769867e-05], [0.00016705647891749608, 0.9999995162619548, -0.0013603662243301747], [1.062657980571793e-05, 0.001360311886111166, 0.9999990686101642]], 'translation vector': [-0.004297820144825049, 0.003351067382226791, -0.0005408272137925607]}\nC: {'rotation matrix': [[0.670129, -0.272494, 0.690416], [-0.741875, -0.216557, 
0.634606], [-0.023412, -0.93747, -0.347278]], 'translation vector': [3.805446, 2.107442, 1.49456]}\nD: {'rotation matrix': [[0.670813, -0.272809, 0.689627], [-0.741265, -0.217599, 0.634962], [-0.023161, -0.937137, -0.348192]], 'translation vector': [3.805646, 2.107794, 1.49708]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.671878, -0.274721, 0.687829], [-0.740313, -0.220562, 0.635051], [-0.022753, -0.935885, -0.35157]], 'translation vector': [3.807358, 2.10759, 1.500018]}\nB: {'rotation matrix': [[0.9999995287042943, -0.00016620863165950158, -1.0138120576769867e-05], [0.00016705647891749608, 0.9999995162619548, -0.0013603662243301747], [1.062657980571793e-05, 0.001360311886111166, 0.9999990686101642]], 'translation vector': [-0.004297820144825049, 0.003351067382226791, -0.0005408272137925607]}\nC: {'rotation matrix': [[0.670129, -0.272494, 0.690416], [-0.741875, -0.216557, 0.634606], [-0.023412, -0.93747, -0.347278]], 'translation vector': [3.805446, 2.107442, 1.49456]}\nD: {'rotation matrix': [[0.670813, -0.272809, 0.689627], [-0.741265, -0.217599, 0.634962], [-0.023161, -0.937137, -0.348192]], 'translation vector': [3.805646, 2.107794, 1.49708]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_49_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_49_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_49_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_49_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.479262, 0.501356, -0.720382], [0.874939, 0.337636, -0.347107], [0.069203, -0.796646, -0.600472]], 'translation vector': [2.874844, 0.864648, 1.19894]}\nB: {'rotation matrix': [[-0.476726, 0.503154, -0.720812], [0.876238, 0.337562, -0.343889], [0.070289, -0.795543, -0.601806]], 'translation vector': [2.872792, 0.865184, 1.200293]}\nC: {'rotation matrix': [[-0.480917, 0.499307, -0.720702], [0.874259, 0.335212, -0.351147], [0.066258, -0.798953, -0.597732]], 'translation vector': [2.877507, 0.861745, 1.198945]}\nD: {'rotation matrix': [[0.9999458075169825, -0.0037298030767581817, 0.009658870905228063], [0.003830453413616424, 0.9999377550296561, -0.010410573829210937], [-0.009619405173591757, 0.010446297554888797, 0.999899528124321]], 'translation vector': [0.004156407549993579, -0.0031544955662062835, 0.0021419719379069946]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.479262, 0.501356, -0.720382], [0.874939, 0.337636, -0.347107], [0.069203, -0.796646, -0.600472]], 'translation vector': [2.874844, 0.864648, 1.19894]}\nB: {'rotation matrix': [[-0.476726, 0.503154, -0.720812], [0.876238, 0.337562, -0.343889], [0.070289, -0.795543, -0.601806]], 'translation vector': [2.872792, 0.865184, 1.200293]}\nC: {'rotation matrix': [[-0.480917, 0.499307, -0.720702], [0.874259, 0.335212, -0.351147], [0.066258, -0.798953, -0.597732]], 'translation vector': [2.877507, 0.861745, 1.198945]}\nD: {'rotation matrix': [[0.9999458075169825, -0.0037298030767581817, 0.009658870905228063], [0.003830453413616424, 0.9999377550296561, -0.010410573829210937], [-0.009619405173591757, 0.010446297554888797, 0.999899528124321]], 'translation vector': [0.004156407549993579, -0.0031544955662062835, 0.0021419719379069946]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_50_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_50_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_50_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_50_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.408839, -0.490635, 0.769499], [-0.912309, 0.19818, -0.358354], [0.023322, -0.848529, -0.528634]], 'translation vector': [0.933751, 3.504875, 1.495928]}\nB: {'rotation matrix': [[-0.405517, -0.49009, 0.771601], [-0.913834, 0.197477, -0.354839], [0.02153, -0.849008, -0.527941]], 'translation vector': [0.92311, 3.508826, 1.494794]}\nC: {'rotation matrix': [[-0.406934, -0.490269, 0.770741], [-0.913168, 0.197092, -0.356762], [0.023003, -0.848994, -0.527902]], 'translation vector': [0.928096, 3.507151, 1.495325]}\nD: {'rotation matrix': [[0.9999720790780128, 
0.0021449080565870337, 0.007189565686557225], [-0.0021449387549180303, 0.9999977939892998, -4.554193667967342e-05], [-0.007190098560018757, 3.036958542106401e-05, 0.9999745575458397]], 'translation vector': [-0.0022708049168844724, -0.01148622047209824, 0.014300413115339472]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.408839, -0.490635, 0.769499], [-0.912309, 0.19818, -0.358354], [0.023322, -0.848529, -0.528634]], 'translation vector': [0.933751, 3.504875, 1.495928]}\nB: {'rotation matrix': [[-0.405517, -0.49009, 0.771601], [-0.913834, 0.197477, -0.354839], [0.02153, -0.849008, -0.527941]], 'translation vector': [0.92311, 3.508826, 1.494794]}\nC: {'rotation matrix': [[-0.406934, -0.490269, 0.770741], [-0.913168, 0.197092, -0.356762], [0.023003, -0.848994, -0.527902]], 'translation vector': [0.928096, 3.507151, 1.495325]}\nD: {'rotation matrix': [[0.9999720790780128, 0.0021449080565870337, 0.007189565686557225], [-0.0021449387549180303, 0.9999977939892998, -4.554193667967342e-05], [-0.007190098560018757, 3.036958542106401e-05, 0.9999745575458397]], 'translation vector': [-0.0022708049168844724, -0.01148622047209824, 0.014300413115339472]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_51_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_51_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_51_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_51_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.812183, -0.277025, 0.513436], [-0.583176, 0.410049, -0.70126], [-0.016267, -0.868975, -0.494589]], 'translation vector': [4.864513, 2.490983, 1.398342]}\nB: {'rotation matrix': [[-0.813534, -0.276094, 0.511796], [-0.581281, 0.411223, -0.702146], [-0.016604, -0.868716, -0.495032]], 'translation vector': [4.865518, 2.490622, 1.399591]}\nC: {'rotation matrix': [[0.999997135129029, 0.001605805084179667, 0.0018057529572375504], [-0.0016158596045621908, 0.9999825699875163, 0.0055795127618636095], [-0.0017979244064656866, -0.005583390464990996, 0.9999826090372864]], 'translation vector': [-0.008521917625347264, -0.003245150959530152, 0.000826648743016356]}\nD: {'rotation matrix': [[-0.814102, -0.273924, 0.512058], [-0.580426, 0.411969, -0.702416], [-0.018543, -0.86905, -0.494377]], 'translation vector': [4.865249, 2.49003, 1.4009]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.812183, -0.277025, 0.513436], [-0.583176, 0.410049, -0.70126], [-0.016267, -0.868975, -0.494589]], 'translation vector': [4.864513, 2.490983, 1.398342]}\nB: {'rotation matrix': [[-0.813534, -0.276094, 0.511796], [-0.581281, 0.411223, -0.702146], [-0.016604, -0.868716, -0.495032]], 'translation vector': [4.865518, 2.490622, 1.399591]}\nC: {'rotation matrix': [[0.999997135129029, 0.001605805084179667, 0.0018057529572375504], [-0.0016158596045621908, 0.9999825699875163, 0.0055795127618636095], [-0.0017979244064656866, -0.005583390464990996, 0.9999826090372864]], 'translation vector': [-0.008521917625347264, -0.003245150959530152, 0.000826648743016356]}\nD: {'rotation matrix': [[-0.814102, -0.273924, 0.512058], [-0.580426, 0.411969, -0.702416], [-0.018543, -0.86905, -0.494377]], 'translation vector': [4.865249, 2.49003, 1.4009]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_52_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_52_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_52_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_52_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.065966, 0.424992, -0.90279], [0.996327, -0.077558, 0.03629], [-0.054596, -0.901868, -0.428547]], 'translation vector': [3.688125, 7.382969, 1.65232]}\nB: {'rotation matrix': [[0.06445, 0.42596, -0.902444], [0.996336, -0.078418, 0.034141], [-0.056225, -0.901337, -0.429453]], 'translation vector': [3.691079, 7.385597, 1.655249]}\nC: {'rotation matrix': [[0.065269, 0.425434, -0.902633], [0.996301, -0.078461, 0.035062], [-0.055905, -0.901582, -0.428981]], 'translation vector': [3.688166, 7.384302, 1.653993]}\nD: {'rotation matrix': [[0.9999823018718307, 
0.004888113491700664, -0.003558790677819051], [-0.004889637706536031, 0.9999888180833659, -0.00037767294265026564], [0.0035570250944470354, 0.00039493287161776724, 0.9999941352091379]], 'translation vector': [-0.0029368758378494064, 0.0007877967404965047, -0.003178400438716089]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.065966, 0.424992, -0.90279], [0.996327, -0.077558, 0.03629], [-0.054596, -0.901868, -0.428547]], 'translation vector': [3.688125, 7.382969, 1.65232]}\nB: {'rotation matrix': [[0.06445, 0.42596, -0.902444], [0.996336, -0.078418, 0.034141], [-0.056225, -0.901337, -0.429453]], 'translation vector': [3.691079, 7.385597, 1.655249]}\nC: {'rotation matrix': [[0.065269, 0.425434, -0.902633], [0.996301, -0.078461, 0.035062], [-0.055905, -0.901582, -0.428981]], 'translation vector': [3.688166, 7.384302, 1.653993]}\nD: {'rotation matrix': [[0.9999823018718307, 0.004888113491700664, -0.003558790677819051], [-0.004889637706536031, 0.9999888180833659, -0.00037767294265026564], [0.0035570250944470354, 0.00039493287161776724, 0.9999941352091379]], 'translation vector': [-0.0029368758378494064, 0.0007877967404965047, -0.003178400438716089]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_53_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_53_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_53_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_53_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.664692, -0.099755, 0.740428], [-0.744592, -0.007027, 0.667483], [-0.061382, -0.994987, -0.078948]], 'translation vector': [3.729187, 1.43308, 1.737059]}\nB: {'rotation matrix': [[0.660717, -0.100231, 0.743913], [-0.748068, -0.006018, 0.663595], [-0.062036, -0.994946, -0.078955]], 'translation vector': [3.728547, 1.433503, 1.735599]}\nC: {'rotation matrix': [[0.9999743209792964, 0.004172766279522147, 0.005832278992731541], [-0.004189646737158265, 0.9999872000751636, 0.0029117442423506165], [-0.0058200970417569145, -0.0029366540837721536, 0.9999788200004149]], 'translation vector': [0.0013138555211342773, -0.001864474585307807, 0.0012853107981345424]}\nD: {'rotation matrix': [[0.656146, -0.099388, 0.74806], [-0.75226, -0.007571, 0.658823], [-0.059815, -0.99502, -0.079733]], 'translation vector': [3.729275, 1.433124, 1.734442]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.664692, -0.099755, 0.740428], [-0.744592, -0.007027, 0.667483], [-0.061382, -0.994987, -0.078948]], 'translation vector': [3.729187, 1.43308, 1.737059]}\nB: {'rotation matrix': [[0.660717, -0.100231, 0.743913], [-0.748068, -0.006018, 0.663595], [-0.062036, -0.994946, -0.078955]], 'translation vector': [3.728547, 1.433503, 1.735599]}\nC: {'rotation matrix': [[0.9999743209792964, 0.004172766279522147, 0.005832278992731541], [-0.004189646737158265, 0.9999872000751636, 0.0029117442423506165], [-0.0058200970417569145, -0.0029366540837721536, 0.9999788200004149]], 'translation vector': [0.0013138555211342773, -0.001864474585307807, 0.0012853107981345424]}\nD: {'rotation matrix': [[0.656146, -0.099388, 0.74806], [-0.75226, -0.007571, 0.658823], [-0.059815, -0.99502, -0.079733]], 'translation vector': [3.729275, 1.433124, 1.734442]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_54_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_54_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_54_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_54_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.482382, -0.62548, 0.613257], [-0.875317, -0.317346, 0.364844], [-0.033588, -0.712788, -0.700575]], 'translation vector': [-0.164493, 3.070356, 1.320176]}\nB: {'rotation matrix': [[0.482432, -0.626604, 0.612068], [-0.875266, -0.317577, 0.364766], [-0.034185, -0.711697, -0.701654]], 'translation vector': [-0.163574, 3.070977, 1.321051]}\nC: {'rotation matrix': [[0.9999881393517817, -0.00035757344474597234, -0.005016643883040467], [0.0003166564592981544, 0.999967468385718, -0.008020454596773861], [0.005019965204090123, 0.008019005544894866, 0.9999547455108736]], 
'translation vector': [-0.0028720129328290156, -0.004572041684123063, -0.0005047793498673403]}\nD: {'rotation matrix': [[0.482883, -0.62302, 0.615362], [-0.875067, -0.316913, 0.36582], [-0.032897, -0.715131, -0.698216]], 'translation vector': [-0.165581, 3.069752, 1.319227]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.482382, -0.62548, 0.613257], [-0.875317, -0.317346, 0.364844], [-0.033588, -0.712788, -0.700575]], 'translation vector': [-0.164493, 3.070356, 1.320176]}\nB: {'rotation matrix': [[0.482432, -0.626604, 0.612068], [-0.875266, -0.317577, 0.364766], [-0.034185, -0.711697, -0.701654]], 'translation vector': [-0.163574, 3.070977, 1.321051]}\nC: {'rotation matrix': [[0.9999881393517817, -0.00035757344474597234, -0.005016643883040467], [0.0003166564592981544, 0.999967468385718, -0.008020454596773861], [0.005019965204090123, 0.008019005544894866, 0.9999547455108736]], 'translation vector': [-0.0028720129328290156, -0.004572041684123063, -0.0005047793498673403]}\nD: {'rotation matrix': [[0.482883, -0.62302, 0.615362], [-0.875067, -0.316913, 0.36582], [-0.032897, -0.715131, -0.698216]], 'translation vector': [-0.165581, 3.069752, 1.319227]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_55_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_55_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_55_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_55_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.254741, -0.436809, 0.862731], [-0.966661, 0.138975, -0.215064], [-0.025957, -0.888754, -0.457649]], 'translation vector': [1.470391, 3.880589, 1.437084]}\nB: {'rotation matrix': [[-0.255547, -0.436424, 0.862688], [-0.966448, 0.139272, -0.215827], [-0.025956, -0.888897, -0.457372]], 'translation vector': [1.471235, 3.880077, 1.436326]}\nC: {'rotation matrix': [[-0.25517, -0.435877, 0.863076], [-0.966551, 0.138838, -0.215646], [-0.025832, -0.889233, -0.456724]], 'translation vector': [1.470861, 3.880012, 1.43644]}\nD: {'rotation matrix': [[0.9999966837298163, 0.0008619563673921012, 0.0026549103476475175], [-0.0008486493068450321, 0.999987344007734, -0.004872285830769469], [-0.0026592683333448467, 0.0048702326102797195, 0.9999843899240968]], 'translation vector': [-0.0017811924174484517, 0.006401158448355426, 0.0014093231794466143]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.254741, -0.436809, 0.862731], [-0.966661, 0.138975, -0.215064], [-0.025957, -0.888754, -0.457649]], 'translation vector': [1.470391, 3.880589, 1.437084]}\nB: {'rotation matrix': [[-0.255547, -0.436424, 0.862688], [-0.966448, 0.139272, -0.215827], [-0.025956, -0.888897, -0.457372]], 'translation vector': [1.471235, 3.880077, 1.436326]}\nC: {'rotation matrix': [[-0.25517, -0.435877, 0.863076], [-0.966551, 0.138838, -0.215646], [-0.025832, -0.889233, -0.456724]], 'translation vector': [1.470861, 3.880012, 1.43644]}\nD: {'rotation matrix': [[0.9999966837298163, 0.0008619563673921012, 0.0026549103476475175], [-0.0008486493068450321, 0.999987344007734, -0.004872285830769469], [-0.0026592683333448467, 0.0048702326102797195, 0.9999843899240968]], 'translation vector': [-0.0017811924174484517, 0.006401158448355426, 0.0014093231794466143]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_56_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_56_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_56_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_56_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.877903, 0.287785, -0.382709], [0.47693, 0.596815, -0.645252], [0.042713, -0.748994, -0.661199]], 'translation vector': [3.16702, 3.626466, 1.453681]}\nB: {'rotation matrix': [[0.9999791193908465, -0.001916910413648059, -0.0061771002364880544], [0.001968865784627184, 0.9999613398790298, 0.008534807689413037], [0.006161535037181589, -0.008547114556570383, 0.9999441766313318]], 'translation vector': [0.0007167215000418725, 0.004111635033621663, 0.00058908656396639]}\nC: {'rotation matrix': [[-0.874912, 0.294682, -0.384306], [0.482557, 0.597398, -0.640512], 
[0.040836, -0.745841, -0.664871]], 'translation vector': [3.163697, 3.627347, 1.450583]}\nD: {'rotation matrix': [[-0.871313, 0.303569, -0.385564], [0.489353, 0.596266, -0.636396], [0.036709, -0.743177, -0.668087]], 'translation vector': [3.163155, 3.630899, 1.446354]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.877903, 0.287785, -0.382709], [0.47693, 0.596815, -0.645252], [0.042713, -0.748994, -0.661199]], 'translation vector': [3.16702, 3.626466, 1.453681]}\nB: {'rotation matrix': [[0.9999791193908465, -0.001916910413648059, -0.0061771002364880544], [0.001968865784627184, 0.9999613398790298, 0.008534807689413037], [0.006161535037181589, -0.008547114556570383, 0.9999441766313318]], 'translation vector': [0.0007167215000418725, 0.004111635033621663, 0.00058908656396639]}\nC: {'rotation matrix': [[-0.874912, 0.294682, -0.384306], [0.482557, 0.597398, -0.640512], [0.040836, -0.745841, -0.664871]], 'translation vector': [3.163697, 3.627347, 1.450583]}\nD: {'rotation matrix': [[-0.871313, 0.303569, -0.385564], [0.489353, 0.596266, -0.636396], [0.036709, -0.743177, -0.668087]], 'translation vector': [3.163155, 3.630899, 1.446354]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_57_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_57_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_57_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_57_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.078798, -0.401808, 0.912327], [-0.996147, 0.067086, -0.056491], [-0.038506, -0.913263, -0.405546]], 'translation vector': [2.214502, 1.810217, 1.39288]}\nB: {'rotation matrix': [[-0.078108, -0.404311, 0.91128], [-0.996161, 0.067892, -0.055261], [-0.039526, -0.912098, -0.408062]], 'translation vector': [2.215161, 1.809587, 1.395775]}\nC: {'rotation matrix': [[0.9999986851865202, 0.0008401734129283168, -0.0015770679658950186], [-0.0008378992163381685, 0.9999975925523196, 0.0021246735415499604], [0.0015786984008670537, -0.002123956111471475, 0.9999965494818978]], 'translation vector': [0.0020818624690659426, 0.003854659633225399, 0.00023093351122471795]}\nD: {'rotation matrix': [[-0.078693, -0.405741, 0.910594], [-0.996048, 0.069734, -0.055005], [-0.041182, -0.911324, -0.409625]], 'translation vector': [2.217248, 1.812374, 1.391779]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.078798, -0.401808, 0.912327], [-0.996147, 0.067086, -0.056491], [-0.038506, -0.913263, -0.405546]], 'translation vector': [2.214502, 1.810217, 1.39288]}\nB: {'rotation matrix': [[-0.078108, -0.404311, 0.91128], [-0.996161, 0.067892, -0.055261], [-0.039526, -0.912098, -0.408062]], 'translation vector': [2.215161, 1.809587, 1.395775]}\nC: {'rotation matrix': [[0.9999986851865202, 0.0008401734129283168, -0.0015770679658950186], [-0.0008378992163381685, 0.9999975925523196, 0.0021246735415499604], [0.0015786984008670537, -0.002123956111471475, 0.9999965494818978]], 'translation vector': [0.0020818624690659426, 0.003854659633225399, 0.00023093351122471795]}\nD: {'rotation matrix': [[-0.078693, -0.405741, 0.910594], [-0.996048, 0.069734, -0.055005], [-0.041182, -0.911324, -0.409625]], 'translation vector': [2.217248, 1.812374, 1.391779]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_58_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_58_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_58_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_58_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.529336, -0.227144, 0.817441], [-0.847759, 0.103788, -0.520128], [0.033303, -0.968315, -0.247502]], 'translation vector': [5.896173, 2.790533, 1.549775]}\nB: {'rotation matrix': [[0.9999873040333512, 0.004505537820277509, -0.002397070922509149], [-0.004504137793554688, 0.9999890779279523, 0.0010154010037770195], [0.0024022565915601136, -0.0010036320370672355, 0.999996209316038]], 'translation vector': [-0.001560160498486951, -0.0020049110640587564, -0.0017324624821037915]}\nC: {'rotation matrix': [[-0.531472, -0.2283, 0.815731], [-0.846401, 0.104685, 
-0.522156], [0.033813, -0.967947, -0.24887]], 'translation vector': [5.895259, 2.788617, 1.559572]}\nD: {'rotation matrix': [[-0.53062, -0.226646, 0.816746], [-0.846944, 0.10358, -0.521495], [0.033596, -0.968454, -0.246918]], 'translation vector': [5.896636, 2.790495, 1.551807]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.529336, -0.227144, 0.817441], [-0.847759, 0.103788, -0.520128], [0.033303, -0.968315, -0.247502]], 'translation vector': [5.896173, 2.790533, 1.549775]}\nB: {'rotation matrix': [[0.9999873040333512, 0.004505537820277509, -0.002397070922509149], [-0.004504137793554688, 0.9999890779279523, 0.0010154010037770195], [0.0024022565915601136, -0.0010036320370672355, 0.999996209316038]], 'translation vector': [-0.001560160498486951, -0.0020049110640587564, -0.0017324624821037915]}\nC: {'rotation matrix': [[-0.531472, -0.2283, 0.815731], [-0.846401, 0.104685, -0.522156], [0.033813, -0.967947, -0.24887]], 'translation vector': [5.895259, 2.788617, 1.559572]}\nD: {'rotation matrix': [[-0.53062, -0.226646, 0.816746], [-0.846944, 0.10358, -0.521495], [0.033596, -0.968454, -0.246918]], 'translation vector': [5.896636, 2.790495, 1.551807]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_59_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_59_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_59_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_59_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999735234224584, -0.0071926018984539266, -0.00013127408012229648], [0.007193266878786128, 0.999950750728307, 0.006784512971560901], [8.152710211630049e-05, -0.006784805304509037, 0.9999770195971557]], 'translation vector': [-0.005319134943293502, -0.0035091612770564717, 0.0004269635666275251]}\nB: {'rotation matrix': [[-0.879528, -0.314344, 0.357236], [-0.47435, 0.638683, -0.605869], [-0.03771, -0.702334, -0.710848]], 'translation vector': [3.140295, 1.690182, 1.269802]}\nC: {'rotation matrix': [[-0.879673, -0.316123, 0.355304], [-0.474189, 0.640089, -0.604509], [-0.036327, -0.700252, -0.712971]], 'translation vector': [3.138628, 1.688987, 1.26968]}\nD: {'rotation matrix': [[-0.879671, -0.317176, 0.354371], [-0.474219, 0.641391, -0.603103], [-0.036001, -0.698582, -0.714624]], 'translation vector': [3.137942, 1.687445, 1.270163]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999735234224584, -0.0071926018984539266, -0.00013127408012229648], [0.007193266878786128, 0.999950750728307, 0.006784512971560901], [8.152710211630049e-05, -0.006784805304509037, 0.9999770195971557]], 'translation vector': [-0.005319134943293502, -0.0035091612770564717, 0.0004269635666275251]}\nB: {'rotation matrix': [[-0.879528, -0.314344, 0.357236], [-0.47435, 0.638683, -0.605869], [-0.03771, -0.702334, -0.710848]], 'translation vector': [3.140295, 1.690182, 1.269802]}\nC: {'rotation matrix': [[-0.879673, -0.316123, 0.355304], [-0.474189, 0.640089, -0.604509], [-0.036327, -0.700252, -0.712971]], 'translation vector': [3.138628, 1.688987, 1.26968]}\nD: {'rotation matrix': [[-0.879671, -0.317176, 0.354371], [-0.474219, 0.641391, -0.603103], [-0.036001, -0.698582, -0.714624]], 'translation vector': [3.137942, 1.687445, 1.270163]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_60_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_60_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_60_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_60_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.303413, -0.396105, 0.866627], [-0.952854, 0.129076, -0.274605], [-0.003088, -0.909088, -0.416593]], 'translation vector': [3.699021, 3.5579, 1.347225]}\nB: {'rotation matrix': [[-0.298621, -0.390383, 0.870877], [-0.954343, 0.1293, -0.26928], [-0.007482, -0.911527, -0.411171]], 'translation vector': [3.695972, 3.555829, 1.344301]}\nC: {'rotation matrix': [[-0.295385, -0.381895, 0.875731], [-0.955321, 0.128105, -0.266366], [-0.010462, -0.915284, -0.402672]], 'translation vector': [3.694636, 3.554343, 1.343555]}\nD: {'rotation matrix': [[0.9999470991216314, 
-8.645859163103856e-05, -0.010261215579303299], [0.00018576960757566105, 0.999954044961018, 0.009631176353467179], [0.010258362047762553, -0.00963308504359663, 0.9999007352405366]], 'translation vector': [0.002023687193802637, -0.005328769253949428, -0.0010872499872041086]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.303413, -0.396105, 0.866627], [-0.952854, 0.129076, -0.274605], [-0.003088, -0.909088, -0.416593]], 'translation vector': [3.699021, 3.5579, 1.347225]}\nB: {'rotation matrix': [[-0.298621, -0.390383, 0.870877], [-0.954343, 0.1293, -0.26928], [-0.007482, -0.911527, -0.411171]], 'translation vector': [3.695972, 3.555829, 1.344301]}\nC: {'rotation matrix': [[-0.295385, -0.381895, 0.875731], [-0.955321, 0.128105, -0.266366], [-0.010462, -0.915284, -0.402672]], 'translation vector': [3.694636, 3.554343, 1.343555]}\nD: {'rotation matrix': [[0.9999470991216314, -8.645859163103856e-05, -0.010261215579303299], [0.00018576960757566105, 0.999954044961018, 0.009631176353467179], [0.010258362047762553, -0.00963308504359663, 0.9999007352405366]], 'translation vector': [0.002023687193802637, -0.005328769253949428, -0.0010872499872041086]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_61_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_61_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_61_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_61_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.996466, -0.001244, 0.083992], [-0.083908, 0.032374, 0.995948], [-0.003958, -0.999475, 0.032156]], 'translation vector': [2.320184, 5.772667, 1.343957]}\nB: {'rotation matrix': [[0.994982, 0.000179, 0.100051], [-0.099963, 0.043922, 0.994021], [-0.004216, -0.999035, 0.043719]], 'translation vector': [2.304385, 5.780403, 1.335008]}\nC: {'rotation matrix': [[0.995982, -5.3e-05, 0.089553], [-0.089485, 0.038471, 0.995245], [-0.003498, -0.99926, 0.038311]], 'translation vector': [2.323582, 5.777781, 1.339158]}\nD: {'rotation matrix': [[0.9997533523434441, 0.006129226982668424, -0.021333118212536917], [-0.0059433732378316425, 0.9999427902508286, 0.008815866423587717], [0.02138545815854184, -0.008687653795007199, 0.9997333060483455]], 'translation vector': [0.03726599986839263, -0.00376568964599322, 0.0024709716040858254]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.996466, -0.001244, 0.083992], [-0.083908, 0.032374, 0.995948], [-0.003958, -0.999475, 0.032156]], 'translation vector': [2.320184, 5.772667, 1.343957]}\nB: {'rotation matrix': [[0.994982, 0.000179, 0.100051], [-0.099963, 0.043922, 0.994021], [-0.004216, -0.999035, 0.043719]], 'translation vector': [2.304385, 5.780403, 1.335008]}\nC: {'rotation matrix': [[0.995982, -5.3e-05, 0.089553], [-0.089485, 0.038471, 0.995245], [-0.003498, -0.99926, 0.038311]], 'translation vector': [2.323582, 5.777781, 1.339158]}\nD: {'rotation matrix': [[0.9997533523434441, 0.006129226982668424, -0.021333118212536917], [-0.0059433732378316425, 0.9999427902508286, 0.008815866423587717], [0.02138545815854184, -0.008687653795007199, 0.9997333060483455]], 'translation vector': [0.03726599986839263, -0.00376568964599322, 0.0024709716040858254]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_62_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_62_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_62_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_62_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.980568, 0.060238, -0.186705], [0.195333, -0.388226, 0.900625], [-0.018232, -0.919594, -0.392448]], 'translation vector': [0.950732, 0.877848, 1.428266]}\nB: {'rotation matrix': [[0.980128, 0.059665, -0.18918], [0.197203, -0.396203, 0.896735], [-0.02145, -0.916222, -0.400096]], 'translation vector': [0.955184, 0.877183, 1.426427]}\nC: {'rotation matrix': [[0.979822, 0.061197, -0.190271], [0.198756, -0.398709, 0.89528], [-0.021074, -0.915034, -0.402827]], 'translation vector': [0.958185, 0.874355, 1.42036]}\nD: {'rotation matrix': [[0.9999275128023173, 0.008592189796214917, 
-0.008452948749791484], [-0.008364555101980847, 0.9996139276735185, 0.026478714279731284], [0.008677131864842291, -0.02640733905967608, 0.9996135651618278]], 'translation vector': [0.01623466192676637, 0.01524648577924892, 0.005088344597766196]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.980568, 0.060238, -0.186705], [0.195333, -0.388226, 0.900625], [-0.018232, -0.919594, -0.392448]], 'translation vector': [0.950732, 0.877848, 1.428266]}\nB: {'rotation matrix': [[0.980128, 0.059665, -0.18918], [0.197203, -0.396203, 0.896735], [-0.02145, -0.916222, -0.400096]], 'translation vector': [0.955184, 0.877183, 1.426427]}\nC: {'rotation matrix': [[0.979822, 0.061197, -0.190271], [0.198756, -0.398709, 0.89528], [-0.021074, -0.915034, -0.402827]], 'translation vector': [0.958185, 0.874355, 1.42036]}\nD: {'rotation matrix': [[0.9999275128023173, 0.008592189796214917, -0.008452948749791484], [-0.008364555101980847, 0.9996139276735185, 0.026478714279731284], [0.008677131864842291, -0.02640733905967608, 0.9996135651618278]], 'translation vector': [0.01623466192676637, 0.01524648577924892, 0.005088344597766196]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_63_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_63_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_63_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_63_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + 
"options": "A: {'rotation matrix': [[0.986729, -0.120978, 0.108304], [-0.140205, -0.298357, 0.944101], [-0.081902, -0.946757, -0.311359]], 'translation vector': [1.126551, 1.554919, 1.507245]}\nB: {'rotation matrix': [[0.9999934208085578, -0.0036450224249741864, 0.00012972614797144], [0.003645024213464494, 0.9999784140945303, 0.005329513561381964], [-0.0001495669882096765, -0.005329922173518521, 0.999986531390115]], 'translation vector': [0.0031820360164642736, 0.0011599203443171113, -0.0008228633488331916]}\nC: {'rotation matrix': [[0.986279, -0.125902, 0.106787], [-0.140227, -0.297512, 0.944364], [-0.087128, -0.94638, -0.311084]], 'translation vector': [1.129178, 1.552708, 1.506911]}\nD: {'rotation matrix': [[0.987067, -0.117318, 0.109254], [-0.140068, -0.29963, 0.943717], [-0.077979, -0.946815, -0.312188]], 'translation vector': [1.12401, 1.557217, 1.508026]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.986729, -0.120978, 0.108304], [-0.140205, -0.298357, 0.944101], [-0.081902, -0.946757, -0.311359]], 'translation vector': [1.126551, 1.554919, 1.507245]}\nB: {'rotation matrix': [[0.9999934208085578, -0.0036450224249741864, 0.00012972614797144], [0.003645024213464494, 0.9999784140945303, 0.005329513561381964], [-0.0001495669882096765, -0.005329922173518521, 0.999986531390115]], 'translation vector': [0.0031820360164642736, 0.0011599203443171113, -0.0008228633488331916]}\nC: {'rotation matrix': [[0.986279, -0.125902, 0.106787], [-0.140227, -0.297512, 0.944364], [-0.087128, -0.94638, -0.311084]], 'translation vector': [1.129178, 1.552708, 1.506911]}\nD: {'rotation matrix': [[0.987067, -0.117318, 0.109254], [-0.140068, -0.29963, 0.943717], [-0.077979, -0.946815, -0.312188]], 'translation vector': [1.12401, 1.557217, 1.508026]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_64_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_64_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_64_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_64_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.34785, 0.12218, -0.929555], [0.936582, -0.000241, 0.350448], [0.042594, -0.992508, -0.114515]], 'translation vector': [2.709976, 2.082475, 1.464411]}\nB: {'rotation matrix': [[0.348346, 0.120067, -0.929645], [0.936396, 0.00053, 0.350944], [0.042629, -0.992766, -0.112245]], 'translation vector': [2.711116, 2.081261, 1.464473]}\nC: {'rotation matrix': [[0.9999996813993267, 0.0006538118368534332, -8.345927351154747e-06], [-0.0006528854877680812, 0.9999955547316733, 0.002915205326606667], [1.0134396361542957e-05, -0.002915969719233099, 0.9999961918791215]], 
'translation vector': [-0.0015843322088442413, -0.00023090688010451998, -0.00020610665531961558]}\nD: {'rotation matrix': [[0.34832, 0.118845, -0.929811], [0.936428, 0.00049, 0.350861], [0.042154, -0.992913, -0.111119]], 'translation vector': [2.712512, 2.080143, 1.464219]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.34785, 0.12218, -0.929555], [0.936582, -0.000241, 0.350448], [0.042594, -0.992508, -0.114515]], 'translation vector': [2.709976, 2.082475, 1.464411]}\nB: {'rotation matrix': [[0.348346, 0.120067, -0.929645], [0.936396, 0.00053, 0.350944], [0.042629, -0.992766, -0.112245]], 'translation vector': [2.711116, 2.081261, 1.464473]}\nC: {'rotation matrix': [[0.9999996813993267, 0.0006538118368534332, -8.345927351154747e-06], [-0.0006528854877680812, 0.9999955547316733, 0.002915205326606667], [1.0134396361542957e-05, -0.002915969719233099, 0.9999961918791215]], 'translation vector': [-0.0015843322088442413, -0.00023090688010451998, -0.00020610665531961558]}\nD: {'rotation matrix': [[0.34832, 0.118845, -0.929811], [0.936428, 0.00049, 0.350861], [0.042154, -0.992913, -0.111119]], 'translation vector': [2.712512, 2.080143, 1.464219]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_65_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_65_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_65_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_65_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.436759, -0.378371, 0.816135], [-0.888864, -0.041911, 0.45625], [-0.138426, -0.924705, -0.354625]], 'translation vector': [2.634246, 6.766675, 1.418575]}\nB: {'rotation matrix': [[0.9999585207206515, -0.003693386756648477, 0.008341273999683681], [0.0038081841800440604, 0.9998961013275416, -0.013845142600948103], [-0.008289566528701709, 0.013875695158089622, 0.9998694901286989]], 'translation vector': [0.0023225809960010224, 0.0034699322670346255, -0.0016854173948939177]}\nC: {'rotation matrix': [[0.435508, -0.378095, 0.816931], [-0.889287, -0.039919, 0.455605], [-0.139651, -0.924906, -0.35362]], 'translation vector': [2.637863, 6.764602, 1.421504]}\nD: {'rotation matrix': [[0.436668, -0.378575, 0.81609], [-0.888812, -0.041337, 0.456404], [-0.139048, -0.924647, -0.354533]], 'translation vector': [2.636727, 6.764818, 1.421436]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.436759, -0.378371, 0.816135], [-0.888864, -0.041911, 0.45625], [-0.138426, -0.924705, -0.354625]], 'translation vector': [2.634246, 6.766675, 1.418575]}\nB: {'rotation matrix': [[0.9999585207206515, -0.003693386756648477, 0.008341273999683681], [0.0038081841800440604, 0.9998961013275416, -0.013845142600948103], [-0.008289566528701709, 0.013875695158089622, 0.9998694901286989]], 'translation vector': [0.0023225809960010224, 0.0034699322670346255, -0.0016854173948939177]}\nC: {'rotation matrix': [[0.435508, -0.378095, 0.816931], [-0.889287, -0.039919, 0.455605], [-0.139651, -0.924906, -0.35362]], 'translation vector': [2.637863, 6.764602, 1.421504]}\nD: {'rotation matrix': [[0.436668, -0.378575, 0.81609], [-0.888812, -0.041337, 0.456404], [-0.139048, -0.924647, -0.354533]], 'translation vector': [2.636727, 6.764818, 1.421436]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_66_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_66_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_66_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_66_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.994032, -0.08462, 0.068848], [-0.109061, -0.785368, 0.609346], [0.002509, -0.613218, -0.78991]], 'translation vector': [1.310442, 0.50567, 1.185464]}\nB: {'rotation matrix': [[0.994117, -0.083714, 0.068724], [-0.108266, -0.785982, 0.608696], [0.003059, -0.612556, -0.790421]], 'translation vector': [1.309119, 0.507232, 1.184932]}\nC: {'rotation matrix': [[0.9999858152282176, 0.005111825032549412, 0.0011655701624676482], [-0.005114663160965653, 0.9999842406155692, 0.0025048411154926643], [-0.0011536230586058512, -0.002510397377259177, 0.9999958649343827]], 
'translation vector': [-0.003249111441538055, -0.00014047167946484862, 0.0018748641056286486]}\nD: {'rotation matrix': [[0.99397, -0.085854, 0.068209], [-0.109644, -0.78528, 0.609356], [0.001247, -0.61316, -0.789958]], 'translation vector': [1.312051, 0.504544, 1.186353]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.994032, -0.08462, 0.068848], [-0.109061, -0.785368, 0.609346], [0.002509, -0.613218, -0.78991]], 'translation vector': [1.310442, 0.50567, 1.185464]}\nB: {'rotation matrix': [[0.994117, -0.083714, 0.068724], [-0.108266, -0.785982, 0.608696], [0.003059, -0.612556, -0.790421]], 'translation vector': [1.309119, 0.507232, 1.184932]}\nC: {'rotation matrix': [[0.9999858152282176, 0.005111825032549412, 0.0011655701624676482], [-0.005114663160965653, 0.9999842406155692, 0.0025048411154926643], [-0.0011536230586058512, -0.002510397377259177, 0.9999958649343827]], 'translation vector': [-0.003249111441538055, -0.00014047167946484862, 0.0018748641056286486]}\nD: {'rotation matrix': [[0.99397, -0.085854, 0.068209], [-0.109644, -0.78528, 0.609356], [0.001247, -0.61316, -0.789958]], 'translation vector': [1.312051, 0.504544, 1.186353]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_67_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_67_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_67_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_67_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.184838, -0.635323, 0.7498], [-0.98276, 0.116273, -0.143746], [0.004144, -0.763444, -0.645861]], 'translation vector': [1.003433, 1.175637, 1.437383]}\nB: {'rotation matrix': [[-0.184201, -0.636583, 0.748887], [-0.982879, 0.11585, -0.143279], [0.00445, -0.762457, -0.647023]], 'translation vector': [1.004527, 1.174467, 1.438164]}\nC: {'rotation matrix': [[-0.184847, -0.63596, 0.749258], [-0.982754, 0.115597, -0.144335], [0.005179, -0.763016, -0.646359]], 'translation vector': [1.003287, 1.175642, 1.437097]}\nD: {'rotation matrix': [[0.9999986055688241, -0.001502869212153915, -0.0010557523049182509], [0.0014994906185130158, 0.9999931829288878, -0.003258507979167455], [0.0010610991854197703, 0.003257203501798957, 0.999994335142242]], 'translation vector': [0.0018601875903911935, -0.0006944560630759433, -0.0003289584369169929]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.184838, -0.635323, 0.7498], [-0.98276, 0.116273, -0.143746], [0.004144, -0.763444, -0.645861]], 'translation vector': [1.003433, 1.175637, 1.437383]}\nB: {'rotation matrix': [[-0.184201, -0.636583, 0.748887], [-0.982879, 0.11585, -0.143279], [0.00445, -0.762457, -0.647023]], 'translation vector': [1.004527, 1.174467, 1.438164]}\nC: {'rotation matrix': [[-0.184847, -0.63596, 0.749258], [-0.982754, 0.115597, -0.144335], [0.005179, -0.763016, -0.646359]], 'translation vector': [1.003287, 1.175642, 1.437097]}\nD: {'rotation matrix': [[0.9999986055688241, -0.001502869212153915, -0.0010557523049182509], [0.0014994906185130158, 0.9999931829288878, -0.003258507979167455], [0.0010610991854197703, 0.003257203501798957, 0.999994335142242]], 'translation vector': [0.0018601875903911935, -0.0006944560630759433, -0.0003289584369169929]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_68_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_68_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_68_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_68_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.200306, -0.502555, 0.841021], [-0.979587, 0.117581, -0.163047], [-0.016948, -0.856512, -0.515849]], 'translation vector': [2.912164, 4.287547, 1.28791]}\nB: {'rotation matrix': [[0.9999873120379679, 0.002880995332778331, 0.004067114056895998], [-0.002882835524779029, 0.9999951519449766, 0.0005437699978073844], [-0.004064760122873662, -0.0005563594395375273, 0.9999918873961958]], 'translation vector': [-0.0006376699678316555, 0.00863085045062073, -0.004612247460962893]}\nC: {'rotation matrix': [[-0.196493, -0.497947, 0.844654], [-0.980345, 0.115377, -0.160041], 
[-0.017762, -0.859498, -0.51083]], 'translation vector': [2.914041, 4.284364, 1.288676]}\nD: {'rotation matrix': [[-0.191542, -0.494391, 0.847873], [-0.981313, 0.112624, -0.156017], [-0.018357, -0.861913, -0.506724]], 'translation vector': [2.915548, 4.280207, 1.289523]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.200306, -0.502555, 0.841021], [-0.979587, 0.117581, -0.163047], [-0.016948, -0.856512, -0.515849]], 'translation vector': [2.912164, 4.287547, 1.28791]}\nB: {'rotation matrix': [[0.9999873120379679, 0.002880995332778331, 0.004067114056895998], [-0.002882835524779029, 0.9999951519449766, 0.0005437699978073844], [-0.004064760122873662, -0.0005563594395375273, 0.9999918873961958]], 'translation vector': [-0.0006376699678316555, 0.00863085045062073, -0.004612247460962893]}\nC: {'rotation matrix': [[-0.196493, -0.497947, 0.844654], [-0.980345, 0.115377, -0.160041], [-0.017762, -0.859498, -0.51083]], 'translation vector': [2.914041, 4.284364, 1.288676]}\nD: {'rotation matrix': [[-0.191542, -0.494391, 0.847873], [-0.981313, 0.112624, -0.156017], [-0.018357, -0.861913, -0.506724]], 'translation vector': [2.915548, 4.280207, 1.289523]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_69_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_69_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_69_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_69_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.734788, 0.391108, -0.554185], [0.678289, 0.419895, -0.603003], [-0.00314, -0.818977, -0.573818]], 'translation vector': [5.172049, 2.206823, 1.423276]}\nB: {'rotation matrix': [[-0.735622, 0.390311, -0.55364], [0.677385, 0.420178, -0.603821], [-0.003051, -0.819212, -0.573483]], 'translation vector': [5.170603, 2.207022, 1.424387]}\nC: {'rotation matrix': [[0.9999900939462747, -0.004290505816425898, 0.0009195812709430778], [0.0042999758462496885, 0.9999235506202053, -0.011514550430676607], [-0.0008694094505867219, 0.011517701317526751, 0.9999331376988827]], 'translation vector': [-0.00082889643265327, -0.0036811171481734295, -0.003335935402839496]}\nD: {'rotation matrix': [[-0.734029, 0.39088, -0.55535], [0.679109, 0.418321, -0.603174], [-0.003454, -0.81989, -0.57251]], 'translation vector': [5.174113, 2.207384, 1.421993]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.734788, 0.391108, -0.554185], [0.678289, 0.419895, -0.603003], [-0.00314, -0.818977, -0.573818]], 'translation vector': [5.172049, 2.206823, 1.423276]}\nB: {'rotation matrix': [[-0.735622, 0.390311, -0.55364], [0.677385, 0.420178, -0.603821], [-0.003051, -0.819212, -0.573483]], 'translation vector': [5.170603, 2.207022, 1.424387]}\nC: {'rotation matrix': [[0.9999900939462747, -0.004290505816425898, 0.0009195812709430778], [0.0042999758462496885, 0.9999235506202053, -0.011514550430676607], [-0.0008694094505867219, 0.011517701317526751, 0.9999331376988827]], 'translation vector': [-0.00082889643265327, -0.0036811171481734295, -0.003335935402839496]}\nD: {'rotation matrix': [[-0.734029, 0.39088, -0.55535], [0.679109, 0.418321, -0.603174], [-0.003454, -0.81989, -0.57251]], 'translation vector': [5.174113, 2.207384, 1.421993]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_70_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_70_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_70_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_70_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.231441, -0.548452, 0.803514], [-0.97266, -0.114172, 0.202231], [-0.019175, -0.828351, -0.559882]], 'translation vector': [1.70644, 2.067417, 1.36452]}\nB: {'rotation matrix': [[0.226329, -0.547002, 0.805955], [-0.973947, -0.114973, 0.195472], [-0.01426, -0.829199, -0.558772]], 'translation vector': [1.704179, 2.073727, 1.363978]}\nC: {'rotation matrix': [[0.9999250411381765, -0.0006651889723176079, -0.012145965376701085], [0.0006720606381267904, 0.999998823757648, 0.0005907315128157197], [0.012145773622422931, -0.0005986694012902582, 0.9999254709423802]], 
'translation vector': [0.004506212132421972, 0.005377300872088764, -0.0022216956686449407]}\nD: {'rotation matrix': [[0.220852, -0.547607, 0.807063], [-0.975257, -0.115603, 0.188439], [-0.009892, -0.828711, -0.559589]], 'translation vector': [1.70298, 2.078271, 1.364741]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.231441, -0.548452, 0.803514], [-0.97266, -0.114172, 0.202231], [-0.019175, -0.828351, -0.559882]], 'translation vector': [1.70644, 2.067417, 1.36452]}\nB: {'rotation matrix': [[0.226329, -0.547002, 0.805955], [-0.973947, -0.114973, 0.195472], [-0.01426, -0.829199, -0.558772]], 'translation vector': [1.704179, 2.073727, 1.363978]}\nC: {'rotation matrix': [[0.9999250411381765, -0.0006651889723176079, -0.012145965376701085], [0.0006720606381267904, 0.999998823757648, 0.0005907315128157197], [0.012145773622422931, -0.0005986694012902582, 0.9999254709423802]], 'translation vector': [0.004506212132421972, 0.005377300872088764, -0.0022216956686449407]}\nD: {'rotation matrix': [[0.220852, -0.547607, 0.807063], [-0.975257, -0.115603, 0.188439], [-0.009892, -0.828711, -0.559589]], 'translation vector': [1.70298, 2.078271, 1.364741]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_71_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_71_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_71_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_71_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.122634, -0.436101, 0.891503], [-0.989963, 0.117329, -0.078783], [-0.070242, -0.892216, -0.446113]], 'translation vector': [3.397121, 4.680246, 1.399477]}\nB: {'rotation matrix': [[-0.131936, -0.437064, 0.889701], [-0.988639, 0.123231, -0.08607], [-0.072021, -0.890949, -0.448357]], 'translation vector': [3.380324, 4.680538, 1.400463]}\nC: {'rotation matrix': [[0.9999977526314023, 0.0018483257426729351, -0.00041760047623725873], [-0.0018481932934636253, 0.9999980766573896, 0.00088155464161366], [0.00041944957876658323, -0.0008809692307902862, 0.9999991600629605]], 'translation vector': [0.0013914092466897898, -0.0023350058272084695, 0.005488229626908758]}\nD: {'rotation matrix': [[-0.127448, -0.436966, 0.890403], [-0.989339, 0.119783, -0.082825], [-0.070464, -0.891467, -0.447574]], 'translation vector': [3.388002, 4.681844, 1.400749]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.122634, -0.436101, 0.891503], [-0.989963, 0.117329, -0.078783], [-0.070242, -0.892216, -0.446113]], 'translation vector': [3.397121, 4.680246, 1.399477]}\nB: {'rotation matrix': [[-0.131936, -0.437064, 0.889701], [-0.988639, 0.123231, -0.08607], [-0.072021, -0.890949, -0.448357]], 'translation vector': [3.380324, 4.680538, 1.400463]}\nC: {'rotation matrix': [[0.9999977526314023, 0.0018483257426729351, -0.00041760047623725873], [-0.0018481932934636253, 0.9999980766573896, 0.00088155464161366], [0.00041944957876658323, -0.0008809692307902862, 0.9999991600629605]], 'translation vector': [0.0013914092466897898, -0.0023350058272084695, 0.005488229626908758]}\nD: {'rotation matrix': [[-0.127448, -0.436966, 0.890403], [-0.989339, 0.119783, -0.082825], [-0.070464, -0.891467, -0.447574]], 'translation vector': [3.388002, 4.681844, 1.400749]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_72_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_72_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_72_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_72_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.018523, 0.490425, -0.871286], [0.99775, 0.065233, 0.015507], [0.064442, -0.869039, -0.49053]], 'translation vector': [3.293662, 2.081986, 1.287803]}\nB: {'rotation matrix': [[-0.012031, 0.49001, -0.871634], [0.99805, 0.059277, 0.019548], [0.061246, -0.869699, -0.489768]], 'translation vector': [3.297196, 2.086649, 1.289788]}\nC: {'rotation matrix': [[0.9998524821662119, 0.011110486941731442, -0.013083720076090504], [-0.011104516921568186, 0.9999377181682902, 0.0005342849090069752], [0.013088749073536516, -0.0003893477266632072, 0.9999144546094512]], 
'translation vector': [-0.004299120253105748, -0.007367168150444914, 0.008875125679755236]}\nD: {'rotation matrix': [[-0.025914, 0.492003, -0.870208], [0.997577, 0.068946, 0.009274], [0.06456, -0.867859, -0.492598]], 'translation vector': [3.28927, 2.078913, 1.287729]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.018523, 0.490425, -0.871286], [0.99775, 0.065233, 0.015507], [0.064442, -0.869039, -0.49053]], 'translation vector': [3.293662, 2.081986, 1.287803]}\nB: {'rotation matrix': [[-0.012031, 0.49001, -0.871634], [0.99805, 0.059277, 0.019548], [0.061246, -0.869699, -0.489768]], 'translation vector': [3.297196, 2.086649, 1.289788]}\nC: {'rotation matrix': [[0.9998524821662119, 0.011110486941731442, -0.013083720076090504], [-0.011104516921568186, 0.9999377181682902, 0.0005342849090069752], [0.013088749073536516, -0.0003893477266632072, 0.9999144546094512]], 'translation vector': [-0.004299120253105748, -0.007367168150444914, 0.008875125679755236]}\nD: {'rotation matrix': [[-0.025914, 0.492003, -0.870208], [0.997577, 0.068946, 0.009274], [0.06456, -0.867859, -0.492598]], 'translation vector': [3.28927, 2.078913, 1.287729]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_73_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_73_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_73_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_73_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.963707, 0.155577, -0.216944], [0.257928, 0.333002, -0.906964], [-0.06886, -0.930003, -0.361044]], 'translation vector': [5.977247, 2.820638, 1.467431]}\nB: {'rotation matrix': [[-0.963568, 0.155178, -0.217846], [0.258718, 0.334184, -0.906303], [-0.067837, -0.929646, -0.362157]], 'translation vector': [5.975011, 2.821235, 1.467201]}\nC: {'rotation matrix': [[-0.963289, 0.1552, -0.219062], [0.259892, 0.33449, -0.905855], [-0.067315, -0.929532, -0.362545]], 'translation vector': [5.973778, 2.820463, 1.46621]}\nD: {'rotation matrix': [[0.9999997264810032, 0.0001398888061159318, -0.0006664023459939352], [-0.00013916119976689398, 1.0000000334978612, 0.0010113123861037441], [0.0006662597489821222, -0.0010109047313490711, 0.9999993871422703]], 'translation vector': [0.0010522232087115668, -0.0015450075589019119, 0.0008995589608247201]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.963707, 0.155577, -0.216944], [0.257928, 0.333002, -0.906964], [-0.06886, -0.930003, -0.361044]], 'translation vector': [5.977247, 2.820638, 1.467431]}\nB: {'rotation matrix': [[-0.963568, 0.155178, -0.217846], [0.258718, 0.334184, -0.906303], [-0.067837, -0.929646, -0.362157]], 'translation vector': [5.975011, 2.821235, 1.467201]}\nC: {'rotation matrix': [[-0.963289, 0.1552, -0.219062], [0.259892, 0.33449, -0.905855], [-0.067315, -0.929532, -0.362545]], 'translation vector': [5.973778, 2.820463, 1.46621]}\nD: {'rotation matrix': [[0.9999997264810032, 0.0001398888061159318, -0.0006664023459939352], [-0.00013916119976689398, 1.0000000334978612, 0.0010113123861037441], [0.0006662597489821222, -0.0010109047313490711, 0.9999993871422703]], 'translation vector': [0.0010522232087115668, -0.0015450075589019119, 0.0008995589608247201]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_74_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_74_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_74_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_74_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.608915, -0.355061, 0.709333], [-0.792506, -0.310645, 0.524818], [0.034009, -0.881721, -0.470545]], 'translation vector': [3.226315, 3.403534, 1.349898]}\nB: {'rotation matrix': [[0.615195, -0.359882, 0.701442], [-0.787888, -0.311929, 0.530973], [0.027713, -0.87931, -0.475444]], 'translation vector': [3.233385, 3.405524, 1.366998]}\nC: {'rotation matrix': [[0.613018, -0.357666, 0.704474], [-0.789518, -0.310613, 0.529321], [0.029499, -0.880678, -0.472795]], 'translation vector': [3.232441, 3.405463, 1.362879]}\nD: {'rotation matrix': [[0.9997863328706507, 
0.00048772072536455783, -0.020639341477887214], [-0.000435326360773427, 0.9999962939853105, 0.002563461941184151], [0.020641174681034113, -0.002552850852538859, 0.9997836062327491]], 'translation vector': [0.014979793826815802, -0.00028266630712581176, -0.003454343250508085]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.608915, -0.355061, 0.709333], [-0.792506, -0.310645, 0.524818], [0.034009, -0.881721, -0.470545]], 'translation vector': [3.226315, 3.403534, 1.349898]}\nB: {'rotation matrix': [[0.615195, -0.359882, 0.701442], [-0.787888, -0.311929, 0.530973], [0.027713, -0.87931, -0.475444]], 'translation vector': [3.233385, 3.405524, 1.366998]}\nC: {'rotation matrix': [[0.613018, -0.357666, 0.704474], [-0.789518, -0.310613, 0.529321], [0.029499, -0.880678, -0.472795]], 'translation vector': [3.232441, 3.405463, 1.362879]}\nD: {'rotation matrix': [[0.9997863328706507, 0.00048772072536455783, -0.020639341477887214], [-0.000435326360773427, 0.9999962939853105, 0.002563461941184151], [0.020641174681034113, -0.002552850852538859, 0.9997836062327491]], 'translation vector': [0.014979793826815802, -0.00028266630712581176, -0.003454343250508085]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_75_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_75_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_75_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_75_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999991463733822, 0.0008702753414361648, -0.0009523887468317617], [-0.0008598677378153328, 0.999935869624125, 0.011274652427134848], [0.0009622841974666614, -0.011274638788349072, 0.9999353623727418]], 'translation vector': [-0.0018668216035679919, 0.003964358597176476, 0.0035095844812185473]}\nB: {'rotation matrix': [[0.995169, 0.04021, -0.089565], [0.098119, -0.43886, 0.893182], [-0.003392, -0.897655, -0.440686]], 'translation vector': [3.819187, 1.33594, 1.360146]}\nC: {'rotation matrix': [[0.994619, 0.036032, -0.097136], [0.10302, -0.443404, 0.890382], [-0.010989, -0.895597, -0.44473]], 'translation vector': [3.820524, 1.337409, 1.359976]}\nD: {'rotation matrix': [[0.995617, 0.045838, -0.081525], [0.09337, -0.436452, 0.89487], [0.005437, -0.898559, -0.438819]], 'translation vector': [3.821348, 1.335292, 1.36241]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999991463733822, 0.0008702753414361648, -0.0009523887468317617], [-0.0008598677378153328, 0.999935869624125, 0.011274652427134848], [0.0009622841974666614, -0.011274638788349072, 0.9999353623727418]], 'translation vector': [-0.0018668216035679919, 0.003964358597176476, 0.0035095844812185473]}\nB: {'rotation matrix': [[0.995169, 0.04021, -0.089565], [0.098119, -0.43886, 0.893182], [-0.003392, -0.897655, -0.440686]], 'translation vector': [3.819187, 1.33594, 1.360146]}\nC: {'rotation matrix': [[0.994619, 0.036032, -0.097136], [0.10302, -0.443404, 0.890382], [-0.010989, -0.895597, -0.44473]], 'translation vector': [3.820524, 1.337409, 1.359976]}\nD: {'rotation matrix': [[0.995617, 0.045838, -0.081525], [0.09337, -0.436452, 0.89487], [0.005437, -0.898559, -0.438819]], 'translation vector': [3.821348, 1.335292, 1.36241]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_76_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_76_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_76_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_76_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.882555, 0.386221, -0.268197], [0.46123, 0.600144, -0.653524], [-0.091448, -0.700472, -0.707797]], 'translation vector': [4.952963, 3.575409, 1.461658]}\nB: {'rotation matrix': [[-0.884605, 0.387053, -0.260122], [0.456774, 0.606746, -0.650551], [-0.09397, -0.694298, -0.713526]], 'translation vector': [4.944654, 3.579183, 1.459738]}\nC: {'rotation matrix': [[-0.883899, 0.386346, -0.263552], [0.458302, 0.603262, -0.652713], [-0.093182, -0.697719, -0.710286]], 'translation vector': [4.946745, 3.577697, 1.460677]}\nD: {'rotation matrix': [[0.9999447182645155, 
0.005004874877878123, -0.00920632642357687], [-0.0050456554798214885, 0.9999777740634089, -0.004314913443862226], [0.009184864298060464, 0.004360756390993736, 0.9999481114756171]], 'translation vector': [0.007685923361448133, 0.0066471354724360054, 0.0017036092209536946]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.882555, 0.386221, -0.268197], [0.46123, 0.600144, -0.653524], [-0.091448, -0.700472, -0.707797]], 'translation vector': [4.952963, 3.575409, 1.461658]}\nB: {'rotation matrix': [[-0.884605, 0.387053, -0.260122], [0.456774, 0.606746, -0.650551], [-0.09397, -0.694298, -0.713526]], 'translation vector': [4.944654, 3.579183, 1.459738]}\nC: {'rotation matrix': [[-0.883899, 0.386346, -0.263552], [0.458302, 0.603262, -0.652713], [-0.093182, -0.697719, -0.710286]], 'translation vector': [4.946745, 3.577697, 1.460677]}\nD: {'rotation matrix': [[0.9999447182645155, 0.005004874877878123, -0.00920632642357687], [-0.0050456554798214885, 0.9999777740634089, -0.004314913443862226], [0.009184864298060464, 0.004360756390993736, 0.9999481114756171]], 'translation vector': [0.007685923361448133, 0.0066471354724360054, 0.0017036092209536946]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_77_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_77_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_77_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_77_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.625113, 0.311868, -0.715523], [0.780406, -0.266348, 0.565708], [-0.014152, -0.912029, -0.409881]], 'translation vector': [1.602076, 0.627028, 1.325196]}\nB: {'rotation matrix': [[0.622635, 0.31497, -0.716324], [0.782419, -0.264717, 0.563689], [-0.012077, -0.911438, -0.411261]], 'translation vector': [1.601839, 0.627416, 1.324643]}\nC: {'rotation matrix': [[0.624152, 0.313246, -0.715759], [0.781196, -0.26537, 0.565077], [-0.012933, -0.911842, -0.410338]], 'translation vector': [1.601807, 0.626749, 1.324787]}\nD: {'rotation matrix': [[0.999966975162773, 0.003632343802915245, 0.007281117690826753], [-0.0036218018444724924, 0.9999920045733598, -0.0017116834889060193], [-0.007286301405287572, 0.0016849146368096719, 0.9999717827304806]], 'translation vector': [-0.0025932164044990547, 0.0007159923074403496, -0.0006736212556601728]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.625113, 0.311868, -0.715523], [0.780406, -0.266348, 0.565708], [-0.014152, -0.912029, -0.409881]], 'translation vector': [1.602076, 0.627028, 1.325196]}\nB: {'rotation matrix': [[0.622635, 0.31497, -0.716324], [0.782419, -0.264717, 0.563689], [-0.012077, -0.911438, -0.411261]], 'translation vector': [1.601839, 0.627416, 1.324643]}\nC: {'rotation matrix': [[0.624152, 0.313246, -0.715759], [0.781196, -0.26537, 0.565077], [-0.012933, -0.911842, -0.410338]], 'translation vector': [1.601807, 0.626749, 1.324787]}\nD: {'rotation matrix': [[0.999966975162773, 0.003632343802915245, 0.007281117690826753], [-0.0036218018444724924, 0.9999920045733598, -0.0017116834889060193], [-0.007286301405287572, 0.0016849146368096719, 0.9999717827304806]], 'translation vector': [-0.0025932164044990547, 0.0007159923074403496, -0.0006736212556601728]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_78_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_78_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_78_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_78_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.198116, 0.271577, -0.941805], [0.979964, -0.034779, 0.196115], [0.020505, -0.961788, -0.273026]], 'translation vector': [3.606948, 3.761193, 1.556592]}\nB: {'rotation matrix': [[0.193825, 0.274451, -0.941864], [0.980909, -0.038747, 0.19057], [0.015807, -0.96082, -0.276722]], 'translation vector': [3.608205, 3.76769, 1.544741]}\nC: {'rotation matrix': [[0.999997674889758, 0.0021739401100205674, -0.00025104493249259135], [-0.002175952729401346, 0.9999848730149307, -0.004981286419615835], [0.0002396488755047073, 0.004981659902978713, 0.9999879544274658]], 
'translation vector': [0.0009190453627176964, -0.0033553865594018184, -0.0035327864721872437]}\nD: {'rotation matrix': [[0.190217, 0.28361, -0.939884], [0.981635, -0.040777, 0.186362], [0.014528, -0.958072, -0.286158]], 'translation vector': [3.605221, 3.771751, 1.549751]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.198116, 0.271577, -0.941805], [0.979964, -0.034779, 0.196115], [0.020505, -0.961788, -0.273026]], 'translation vector': [3.606948, 3.761193, 1.556592]}\nB: {'rotation matrix': [[0.193825, 0.274451, -0.941864], [0.980909, -0.038747, 0.19057], [0.015807, -0.96082, -0.276722]], 'translation vector': [3.608205, 3.76769, 1.544741]}\nC: {'rotation matrix': [[0.999997674889758, 0.0021739401100205674, -0.00025104493249259135], [-0.002175952729401346, 0.9999848730149307, -0.004981286419615835], [0.0002396488755047073, 0.004981659902978713, 0.9999879544274658]], 'translation vector': [0.0009190453627176964, -0.0033553865594018184, -0.0035327864721872437]}\nD: {'rotation matrix': [[0.190217, 0.28361, -0.939884], [0.981635, -0.040777, 0.186362], [0.014528, -0.958072, -0.286158]], 'translation vector': [3.605221, 3.771751, 1.549751]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_79_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_79_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_79_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_79_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.631227, -0.321947, 0.705622], [-0.775405, -0.28226, 0.564869], [0.017311, -0.903703, -0.427809]], 'translation vector': [-0.207113, 0.785695, 1.605991]}\nB: {'rotation matrix': [[0.9999995913321388, 0.0005903556746734515, -0.0005873839987845575], [-0.0005915541354645518, 0.9999966980036584, -0.0023348317149788846], [0.0005870160507556152, 0.00233575631512544, 0.9999968115853851]], 'translation vector': [0.0016624461367128474, -0.0028184747771204943, -0.001607026045218174]}\nC: {'rotation matrix': [[0.628117, -0.317695, 0.71031], [-0.777888, -0.278629, 0.563255], [0.018969, -0.906331, -0.422142]], 'translation vector': [-0.210483, 0.781575, 1.607029]}\nD: {'rotation matrix': [[0.626043, -0.313657, 0.713926], [-0.779508, -0.27628, 0.562171], [0.020914, -0.908454, -0.417462]], 'translation vector': [-0.212996, 0.77858, 1.610364]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.631227, -0.321947, 0.705622], [-0.775405, -0.28226, 0.564869], [0.017311, -0.903703, -0.427809]], 'translation vector': [-0.207113, 0.785695, 1.605991]}\nB: {'rotation matrix': [[0.9999995913321388, 0.0005903556746734515, -0.0005873839987845575], [-0.0005915541354645518, 0.9999966980036584, -0.0023348317149788846], [0.0005870160507556152, 0.00233575631512544, 0.9999968115853851]], 'translation vector': [0.0016624461367128474, -0.0028184747771204943, -0.001607026045218174]}\nC: {'rotation matrix': [[0.628117, -0.317695, 0.71031], [-0.777888, -0.278629, 0.563255], [0.018969, -0.906331, -0.422142]], 'translation vector': [-0.210483, 0.781575, 1.607029]}\nD: {'rotation matrix': [[0.626043, -0.313657, 0.713926], [-0.779508, -0.27628, 0.562171], [0.020914, -0.908454, -0.417462]], 'translation vector': [-0.212996, 0.77858, 1.610364]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_80_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_80_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_80_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_80_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999006085846415, 0.014105987403183131, 0.0007516964397581732], [-0.014115634211093402, 0.9997972933579915, 0.014373215967786759], [-0.0005489990557007766, -0.014382715808085993, 0.9998969169271865]], 'translation vector': [-0.02535757146051898, 0.0018154305901567636, 0.010236799804218322]}\nB: {'rotation matrix': [[0.151948, 0.599833, -0.785565], [0.987995, -0.114601, 0.103597], [-0.027885, -0.791875, -0.610046]], 'translation vector': [3.432288, 3.133084, 1.213871]}\nC: {'rotation matrix': [[0.14922, 0.604558, -0.78246], [0.988532, -0.109774, 0.103704], 
[-0.023198, -0.788961, -0.614005]], 'translation vector': [3.429968, 3.121084, 1.211424]}\nD: {'rotation matrix': [[0.14748, 0.608832, -0.77947], [0.988883, -0.105872, 0.104407], [-0.018958, -0.786202, -0.617678]], 'translation vector': [3.426714, 3.1102, 1.209074]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999006085846415, 0.014105987403183131, 0.0007516964397581732], [-0.014115634211093402, 0.9997972933579915, 0.014373215967786759], [-0.0005489990557007766, -0.014382715808085993, 0.9998969169271865]], 'translation vector': [-0.02535757146051898, 0.0018154305901567636, 0.010236799804218322]}\nB: {'rotation matrix': [[0.151948, 0.599833, -0.785565], [0.987995, -0.114601, 0.103597], [-0.027885, -0.791875, -0.610046]], 'translation vector': [3.432288, 3.133084, 1.213871]}\nC: {'rotation matrix': [[0.14922, 0.604558, -0.78246], [0.988532, -0.109774, 0.103704], [-0.023198, -0.788961, -0.614005]], 'translation vector': [3.429968, 3.121084, 1.211424]}\nD: {'rotation matrix': [[0.14748, 0.608832, -0.77947], [0.988883, -0.105872, 0.104407], [-0.018958, -0.786202, -0.617678]], 'translation vector': [3.426714, 3.1102, 1.209074]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_81_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_81_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_81_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_81_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.720072, 0.306191, -0.62269], [0.693309, -0.280455, 0.663829], [0.028622, -0.909721, -0.414232]], 'translation vector': [3.433251, 3.053234, 1.552574]}\nB: {'rotation matrix': [[0.715824, 0.307759, -0.626802], [0.697706, -0.278807, 0.659904], [0.028335, -0.909698, -0.414302]], 'translation vector': [3.42786, 3.050569, 1.552797]}\nC: {'rotation matrix': [[0.99998073687791, -0.005433231351435887, 0.0028092605219069734], [0.0054430621429484745, 0.9999793374936902, -0.0033485077732525377], [-0.00279094918696337, 0.0033635449074639256, 0.9999906279011188]], 'translation vector': [0.0048247922808197785, -0.007326500694675886, 7.779843116662022e-05]}\nD: {'rotation matrix': [[0.717959, 0.306862, -0.624797], [0.695515, -0.279904, 0.66175], [0.028183, -0.909665, -0.414386]], 'translation vector': [3.431505, 3.053102, 1.552563]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.720072, 0.306191, -0.62269], [0.693309, -0.280455, 0.663829], [0.028622, -0.909721, -0.414232]], 'translation vector': [3.433251, 3.053234, 1.552574]}\nB: {'rotation matrix': [[0.715824, 0.307759, -0.626802], [0.697706, -0.278807, 0.659904], [0.028335, -0.909698, -0.414302]], 'translation vector': [3.42786, 3.050569, 1.552797]}\nC: {'rotation matrix': [[0.99998073687791, -0.005433231351435887, 0.0028092605219069734], [0.0054430621429484745, 0.9999793374936902, -0.0033485077732525377], [-0.00279094918696337, 0.0033635449074639256, 0.9999906279011188]], 'translation vector': [0.0048247922808197785, -0.007326500694675886, 7.779843116662022e-05]}\nD: {'rotation matrix': [[0.717959, 0.306862, -0.624797], [0.695515, -0.279904, 0.66175], [0.028183, -0.909665, -0.414386]], 'translation vector': [3.431505, 3.053102, 1.552563]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_82_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_82_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_82_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_82_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.863341, -0.254981, 0.435461], [-0.503057, 0.367008, -0.782457], [0.039694, -0.894589, -0.445123]], 'translation vector': [2.006748, 3.81545, 1.542323]}\nB: {'rotation matrix': [[-0.863173, -0.254818, 0.43589], [-0.503388, 0.367381, -0.782069], [0.039148, -0.894483, -0.445386]], 'translation vector': [2.007018, 3.816806, 1.542476]}\nC: {'rotation matrix': [[-0.863454, -0.255279, 0.435064], [-0.502805, 0.366433, -0.782888], [0.040433, -0.89474, -0.444754]], 'translation vector': [2.007318, 3.814646, 1.54216]}\nD: {'rotation matrix': [[0.9999972277888285, 
-0.0019486605164344517, 0.0010264732410869921], [0.0019490971963479567, 0.9999969267726074, -0.0010703044149313807], [-0.0010241404575203601, 0.0010719252856060263, 0.9999989421516985]], 'translation vector': [-0.0025894589048998107, 0.007141119527829198, -0.0014552230705469071]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.863341, -0.254981, 0.435461], [-0.503057, 0.367008, -0.782457], [0.039694, -0.894589, -0.445123]], 'translation vector': [2.006748, 3.81545, 1.542323]}\nB: {'rotation matrix': [[-0.863173, -0.254818, 0.43589], [-0.503388, 0.367381, -0.782069], [0.039148, -0.894483, -0.445386]], 'translation vector': [2.007018, 3.816806, 1.542476]}\nC: {'rotation matrix': [[-0.863454, -0.255279, 0.435064], [-0.502805, 0.366433, -0.782888], [0.040433, -0.89474, -0.444754]], 'translation vector': [2.007318, 3.814646, 1.54216]}\nD: {'rotation matrix': [[0.9999972277888285, -0.0019486605164344517, 0.0010264732410869921], [0.0019490971963479567, 0.9999969267726074, -0.0010703044149313807], [-0.0010241404575203601, 0.0010719252856060263, 0.9999989421516985]], 'translation vector': [-0.0025894589048998107, 0.007141119527829198, -0.0014552230705469071]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_83_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_83_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_83_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_83_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.987787, 0.108072, -0.112241], [0.155697, -0.656841, 0.73778], [0.006009, -0.746244, -0.665645]], 'translation vector': [4.649458, 4.057209, 1.404581]}\nB: {'rotation matrix': [[0.988022, 0.106035, -0.112112], [0.15424, -0.656197, 0.738658], [0.004756, -0.747102, -0.664692]], 'translation vector': [4.650307, 4.057695, 1.405486]}\nC: {'rotation matrix': [[0.9999939235077703, -0.0009864268743032946, -0.003107875618402941], [0.0009789613139893545, 0.9999966009200537, -0.0022176346298812244], [0.003110843270450784, 0.0022141652882606867, 0.9999926277610204]], 'translation vector': [-0.007831508873088033, -0.00424079700623059, -0.0006393879079424902]}\nD: {'rotation matrix': [[0.987654, 0.108357, -0.113131], [0.15654, -0.65545, 0.738837], [0.005906, -0.747425, -0.66432]], 'translation vector': [4.648766, 4.054578, 1.401957]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.987787, 0.108072, -0.112241], [0.155697, -0.656841, 0.73778], [0.006009, -0.746244, -0.665645]], 'translation vector': [4.649458, 4.057209, 1.404581]}\nB: {'rotation matrix': [[0.988022, 0.106035, -0.112112], [0.15424, -0.656197, 0.738658], [0.004756, -0.747102, -0.664692]], 'translation vector': [4.650307, 4.057695, 1.405486]}\nC: {'rotation matrix': [[0.9999939235077703, -0.0009864268743032946, -0.003107875618402941], [0.0009789613139893545, 0.9999966009200537, -0.0022176346298812244], [0.003110843270450784, 0.0022141652882606867, 0.9999926277610204]], 'translation vector': [-0.007831508873088033, -0.00424079700623059, -0.0006393879079424902]}\nD: {'rotation matrix': [[0.987654, 0.108357, -0.113131], [0.15654, -0.65545, 0.738837], [0.005906, -0.747425, -0.66432]], 'translation vector': [4.648766, 4.054578, 1.401957]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_84_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_84_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_84_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_84_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9997717908276891, -0.014061705239683872, 0.016112763216433307], [0.0140800110532502, 0.9999007363199941, -0.0010460438789438157], [-0.01609611583081038, 0.0012720455953134791, 0.9998692891144138]], 'translation vector': [-0.018076948566243978, 0.0017768934909982992, 0.0006580952284183095]}\nB: {'rotation matrix': [[-0.782674, -0.257014, 0.566891], [-0.62216, 0.296092, -0.724739], [0.018416, -0.919931, -0.391647]], 'translation vector': [3.075882, 2.930909, 1.465913]}\nC: {'rotation matrix': [[-0.790591, -0.247688, 0.560015], [-0.61223, 0.301979, -0.730742], 
[0.011883, -0.920576, -0.390384]], 'translation vector': [3.085087, 2.935415, 1.467454]}\nD: {'rotation matrix': [[-0.773889, -0.266244, 0.574639], [-0.632944, 0.293814, -0.716279], [0.021868, -0.918034, -0.395897]], 'translation vector': [3.064209, 2.92712, 1.46117]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9997717908276891, -0.014061705239683872, 0.016112763216433307], [0.0140800110532502, 0.9999007363199941, -0.0010460438789438157], [-0.01609611583081038, 0.0012720455953134791, 0.9998692891144138]], 'translation vector': [-0.018076948566243978, 0.0017768934909982992, 0.0006580952284183095]}\nB: {'rotation matrix': [[-0.782674, -0.257014, 0.566891], [-0.62216, 0.296092, -0.724739], [0.018416, -0.919931, -0.391647]], 'translation vector': [3.075882, 2.930909, 1.465913]}\nC: {'rotation matrix': [[-0.790591, -0.247688, 0.560015], [-0.61223, 0.301979, -0.730742], [0.011883, -0.920576, -0.390384]], 'translation vector': [3.085087, 2.935415, 1.467454]}\nD: {'rotation matrix': [[-0.773889, -0.266244, 0.574639], [-0.632944, 0.293814, -0.716279], [0.021868, -0.918034, -0.395897]], 'translation vector': [3.064209, 2.92712, 1.46117]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_85_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_85_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_85_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_85_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.516456, 0.471348, -0.714916], [0.852952, -0.209256, 0.47821], [0.075803, -0.856763, -0.510109]], 'translation vector': [4.97866, 0.423553, 1.591931]}\nB: {'rotation matrix': [[0.514459, 0.473148, -0.715167], [0.854068, -0.208015, 0.476757], [0.076811, -0.856073, -0.511116]], 'translation vector': [4.979161, 0.423603, 1.588672]}\nC: {'rotation matrix': [[0.513176, 0.475448, -0.714563], [0.854688, -0.206948, 0.476112], [0.078489, -0.855056, -0.51256]], 'translation vector': [4.976408, 0.420953, 1.588878]}\nD: {'rotation matrix': [[0.9999760932186582, 0.0012257187804321542, -0.006834405192096566], [-0.001218782424370881, 0.999999818921041, 0.0008912149688576313], [0.006835564452074455, -0.0008827669412868106, 0.9999768915925589]], 'translation vector': [-0.008703367750305002, 0.013496314561282197, 0.004560884153690381]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.516456, 0.471348, -0.714916], [0.852952, -0.209256, 0.47821], [0.075803, -0.856763, -0.510109]], 'translation vector': [4.97866, 0.423553, 1.591931]}\nB: {'rotation matrix': [[0.514459, 0.473148, -0.715167], [0.854068, -0.208015, 0.476757], [0.076811, -0.856073, -0.511116]], 'translation vector': [4.979161, 0.423603, 1.588672]}\nC: {'rotation matrix': [[0.513176, 0.475448, -0.714563], [0.854688, -0.206948, 0.476112], [0.078489, -0.855056, -0.51256]], 'translation vector': [4.976408, 0.420953, 1.588878]}\nD: {'rotation matrix': [[0.9999760932186582, 0.0012257187804321542, -0.006834405192096566], [-0.001218782424370881, 0.999999818921041, 0.0008912149688576313], [0.006835564452074455, -0.0008827669412868106, 0.9999768915925589]], 'translation vector': [-0.008703367750305002, 0.013496314561282197, 0.004560884153690381]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_86_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_86_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_86_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_86_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.448824, 0.136877, -0.883075], [0.892984, 0.105987, -0.437432], [0.033721, -0.984902, -0.169799]], 'translation vector': [3.315047, 2.127717, 1.592265]}\nB: {'rotation matrix': [[-0.452202, 0.137197, -0.8813], [0.891317, 0.105713, -0.440885], [0.032677, -0.984887, -0.170089]], 'translation vector': [3.315698, 2.124716, 1.590659]}\nC: {'rotation matrix': [[-0.449366, 0.136914, -0.882794], [0.892692, 0.106685, -0.437859], [0.034232, -0.984821, -0.170162]], 'translation vector': [3.315906, 2.123902, 1.590809]}\nD: {'rotation matrix': [[0.9999985219155758, 
3.3460926462502616e-05, -0.0017161305114821916], [-2.2107949587045813e-05, 0.9999776535003064, 0.006715346047559243], [0.0017166465785429835, -0.006715383734168117, 0.9999757458350781]], 'translation vector': [0.0004478029195722488, -0.00269782175769262, 0.002036977128338613]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.448824, 0.136877, -0.883075], [0.892984, 0.105987, -0.437432], [0.033721, -0.984902, -0.169799]], 'translation vector': [3.315047, 2.127717, 1.592265]}\nB: {'rotation matrix': [[-0.452202, 0.137197, -0.8813], [0.891317, 0.105713, -0.440885], [0.032677, -0.984887, -0.170089]], 'translation vector': [3.315698, 2.124716, 1.590659]}\nC: {'rotation matrix': [[-0.449366, 0.136914, -0.882794], [0.892692, 0.106685, -0.437859], [0.034232, -0.984821, -0.170162]], 'translation vector': [3.315906, 2.123902, 1.590809]}\nD: {'rotation matrix': [[0.9999985219155758, 3.3460926462502616e-05, -0.0017161305114821916], [-2.2107949587045813e-05, 0.9999776535003064, 0.006715346047559243], [0.0017166465785429835, -0.006715383734168117, 0.9999757458350781]], 'translation vector': [0.0004478029195722488, -0.00269782175769262, 0.002036977128338613]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_87_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_87_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_87_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_87_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.993234, -0.016226, -0.114989], [0.102235, -0.347461, 0.932104], [-0.055078, -0.937554, -0.343451]], 'translation vector': [2.952414, 4.433719, 1.459459]}\nB: {'rotation matrix': [[0.993467, -0.015486, -0.113064], [0.100672, -0.347655, 0.932202], [-0.053743, -0.937495, -0.343825]], 'translation vector': [2.95506, 4.435545, 1.464879]}\nC: {'rotation matrix': [[0.9999909736646528, 0.004123633453855257, 0.0014048261378039103], [-0.004123349991970072, 0.9999914014789152, -0.00016491391217041438], [-0.0014064329904194771, 0.0001595777583995875, 0.9999985433300184]], 'translation vector': [-0.0007043054873738797, 0.0030876829023283037, 0.0005875691155496909]}\nD: {'rotation matrix': [[0.993543, -0.018943, -0.111866], [0.098443, -0.346258, 0.93296], [-0.056408, -0.937948, -0.342158]], 'translation vector': [2.958581, 4.436487, 1.463224]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.993234, -0.016226, -0.114989], [0.102235, -0.347461, 0.932104], [-0.055078, -0.937554, -0.343451]], 'translation vector': [2.952414, 4.433719, 1.459459]}\nB: {'rotation matrix': [[0.993467, -0.015486, -0.113064], [0.100672, -0.347655, 0.932202], [-0.053743, -0.937495, -0.343825]], 'translation vector': [2.95506, 4.435545, 1.464879]}\nC: {'rotation matrix': [[0.9999909736646528, 0.004123633453855257, 0.0014048261378039103], [-0.004123349991970072, 0.9999914014789152, -0.00016491391217041438], [-0.0014064329904194771, 0.0001595777583995875, 0.9999985433300184]], 'translation vector': [-0.0007043054873738797, 0.0030876829023283037, 0.0005875691155496909]}\nD: {'rotation matrix': [[0.993543, -0.018943, -0.111866], [0.098443, -0.346258, 0.93296], [-0.056408, -0.937948, -0.342158]], 'translation vector': [2.958581, 4.436487, 1.463224]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_88_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_88_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_88_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_88_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.481483, 0.389974, -0.784917], [0.875782, -0.249176, 0.413422], [-0.034359, -0.886471, -0.461507]], 'translation vector': [2.949051, 2.713893, 1.478454]}\nB: {'rotation matrix': [[0.478541, 0.391858, -0.785777], [0.877371, -0.248969, 0.410164], [-0.034908, -0.885699, -0.462947]], 'translation vector': [2.947931, 2.717417, 1.47825]}\nC: {'rotation matrix': [[0.48013, 0.390884, -0.785293], [0.876512, -0.249161, 0.41188], [-0.034667, -0.886075, -0.462245]], 'translation vector': [2.948499, 2.715565, 1.478062]}\nD: {'rotation matrix': [[0.9999997612782662, 
0.0004988640565728734, 0.0011620670948559774], [-0.0004998749589789403, 0.9999982856757066, 0.0017162390398127588], [-0.0011605634579595124, -0.0017165704161403005, 0.9999970539013514]], 'translation vector': [0.0003215494383659312, -0.003127483043635193, -0.0005499886913977736]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.481483, 0.389974, -0.784917], [0.875782, -0.249176, 0.413422], [-0.034359, -0.886471, -0.461507]], 'translation vector': [2.949051, 2.713893, 1.478454]}\nB: {'rotation matrix': [[0.478541, 0.391858, -0.785777], [0.877371, -0.248969, 0.410164], [-0.034908, -0.885699, -0.462947]], 'translation vector': [2.947931, 2.717417, 1.47825]}\nC: {'rotation matrix': [[0.48013, 0.390884, -0.785293], [0.876512, -0.249161, 0.41188], [-0.034667, -0.886075, -0.462245]], 'translation vector': [2.948499, 2.715565, 1.478062]}\nD: {'rotation matrix': [[0.9999997612782662, 0.0004988640565728734, 0.0011620670948559774], [-0.0004998749589789403, 0.9999982856757066, 0.0017162390398127588], [-0.0011605634579595124, -0.0017165704161403005, 0.9999970539013514]], 'translation vector': [0.0003215494383659312, -0.003127483043635193, -0.0005499886913977736]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_89_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_89_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_89_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_89_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.564799, -0.352124, 0.746332], [-0.825057, 0.22251, -0.519395], [0.016825, -0.909119, -0.416196]], 'translation vector': [2.054545, 3.84102, 1.387591]}\nB: {'rotation matrix': [[-0.564546, -0.353818, 0.745722], [-0.825222, 0.223074, -0.518891], [0.017242, -0.908323, -0.417914]], 'translation vector': [2.054274, 3.838, 1.389919]}\nC: {'rotation matrix': [[0.9999863571433116, 0.000569020369849223, 0.005277349616356428], [-0.0005964877721919992, 0.9999870648779766, 0.0050113119474318605], [-0.005273998232851741, -0.005013939911459714, 0.9999743290291354]], 'translation vector': [-0.009394794053914524, 0.0032248655541047277, -0.0014403793043347157]}\nD: {'rotation matrix': [[-0.566299, -0.350153, 0.746122], [-0.824022, 0.221689, -0.521385], [0.017157, -0.91008, -0.414077]], 'translation vector': [2.055187, 3.843729, 1.385575]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.564799, -0.352124, 0.746332], [-0.825057, 0.22251, -0.519395], [0.016825, -0.909119, -0.416196]], 'translation vector': [2.054545, 3.84102, 1.387591]}\nB: {'rotation matrix': [[-0.564546, -0.353818, 0.745722], [-0.825222, 0.223074, -0.518891], [0.017242, -0.908323, -0.417914]], 'translation vector': [2.054274, 3.838, 1.389919]}\nC: {'rotation matrix': [[0.9999863571433116, 0.000569020369849223, 0.005277349616356428], [-0.0005964877721919992, 0.9999870648779766, 0.0050113119474318605], [-0.005273998232851741, -0.005013939911459714, 0.9999743290291354]], 'translation vector': [-0.009394794053914524, 0.0032248655541047277, -0.0014403793043347157]}\nD: {'rotation matrix': [[-0.566299, -0.350153, 0.746122], [-0.824022, 0.221689, -0.521385], [0.017157, -0.91008, -0.414077]], 'translation vector': [2.055187, 3.843729, 1.385575]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_90_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_90_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_90_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_90_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.037266, 0.594373, -0.803326], [0.998697, -0.005895, -0.05069], [-0.034865, -0.804168, -0.593379]], 'translation vector': [3.957977, 2.244087, 1.44004]}\nB: {'rotation matrix': [[0.9999996431737382, -0.0003859815143722872, 0.000745633467728039], [0.0003848347144862704, 1.0000000520852899, 0.00069936137332727], [-0.0007460186813055825, -0.0006987638070698045, 0.9999991664939947]], 'translation vector': [0.0003220625244955144, -0.0016866012265464025, 0.00017566976974592308]}\nC: {'rotation matrix': [[-0.03699, 0.597433, -0.801066], [0.998659, -0.006964, -0.051308], 
[-0.036231, -0.801889, -0.596374]], 'translation vector': [3.95766, 2.242744, 1.440408]}\nD: {'rotation matrix': [[-0.039909, 0.596654, -0.801506], [0.998413, -0.00808, -0.055729], [-0.039727, -0.802458, -0.595385]], 'translation vector': [3.959598, 2.247142, 1.43878]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.037266, 0.594373, -0.803326], [0.998697, -0.005895, -0.05069], [-0.034865, -0.804168, -0.593379]], 'translation vector': [3.957977, 2.244087, 1.44004]}\nB: {'rotation matrix': [[0.9999996431737382, -0.0003859815143722872, 0.000745633467728039], [0.0003848347144862704, 1.0000000520852899, 0.00069936137332727], [-0.0007460186813055825, -0.0006987638070698045, 0.9999991664939947]], 'translation vector': [0.0003220625244955144, -0.0016866012265464025, 0.00017566976974592308]}\nC: {'rotation matrix': [[-0.03699, 0.597433, -0.801066], [0.998659, -0.006964, -0.051308], [-0.036231, -0.801889, -0.596374]], 'translation vector': [3.95766, 2.242744, 1.440408]}\nD: {'rotation matrix': [[-0.039909, 0.596654, -0.801506], [0.998413, -0.00808, -0.055729], [-0.039727, -0.802458, -0.595385]], 'translation vector': [3.959598, 2.247142, 1.43878]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_91_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_91_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_91_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_91_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.875953, -0.064411, 0.478078], [-0.480708, 0.199407, -0.853907], [-0.040331, -0.977798, -0.205634]], 'translation vector': [2.420033, 1.712699, 1.489589]}\nB: {'rotation matrix': [[0.9999951897323609, 0.002443562746852685, 0.0019815417907405445], [-0.0024514031523570133, 0.9999905126626161, 0.0035483645433095957], [-0.001973097402749593, -0.0035542813588159473, 0.9999914538041953]], 'translation vector': [0.004274186437530858, 0.003113397542267471, -0.0028983696999268505]}\nC: {'rotation matrix': [[-0.873782, -0.064754, 0.481987], [-0.484652, 0.197901, -0.852026], [-0.040214, -0.978081, -0.204306]], 'translation vector': [2.408279, 1.71933, 1.490834]}\nD: {'rotation matrix': [[-0.874383, -0.064777, 0.480894], [-0.483517, 0.199683, -0.852255], [-0.04082, -0.977717, -0.20592]], 'translation vector': [2.41403, 1.715424, 1.490696]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.875953, -0.064411, 0.478078], [-0.480708, 0.199407, -0.853907], [-0.040331, -0.977798, -0.205634]], 'translation vector': [2.420033, 1.712699, 1.489589]}\nB: {'rotation matrix': [[0.9999951897323609, 0.002443562746852685, 0.0019815417907405445], [-0.0024514031523570133, 0.9999905126626161, 0.0035483645433095957], [-0.001973097402749593, -0.0035542813588159473, 0.9999914538041953]], 'translation vector': [0.004274186437530858, 0.003113397542267471, -0.0028983696999268505]}\nC: {'rotation matrix': [[-0.873782, -0.064754, 0.481987], [-0.484652, 0.197901, -0.852026], [-0.040214, -0.978081, -0.204306]], 'translation vector': [2.408279, 1.71933, 1.490834]}\nD: {'rotation matrix': [[-0.874383, -0.064777, 0.480894], [-0.483517, 0.199683, -0.852255], [-0.04082, -0.977717, -0.20592]], 'translation vector': [2.41403, 1.715424, 1.490696]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_92_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_92_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_92_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_92_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.222027, -0.462333, 0.858459], [-0.974308, 0.139332, -0.176951], [-0.0378, -0.875691, -0.48139]], 'translation vector': [2.717101, 1.647348, 1.522281]}\nB: {'rotation matrix': [[-0.218431, -0.46311, 0.858963], [-0.975079, 0.138612, -0.173227], [-0.038839, -0.875395, -0.481846]], 'translation vector': [2.716881, 1.647519, 1.52132]}\nC: {'rotation matrix': [[-0.21644, -0.463451, 0.859283], [-0.975487, 0.138488, -0.171017], [-0.039742, -0.875234, -0.482065]], 'translation vector': [2.718464, 1.6518, 1.521331]}\nD: {'rotation matrix': [[0.9999923983379827, 
-0.0004070717907623284, -0.003958509569048427], [0.0004054339454520325, 1.0000004886800318, -0.0002774375137664041], [0.003959398020789269, 0.00027564706640871675, 0.9999922541189072]], 'translation vector': [-0.005217870906849331, -0.0010876072780465762, 0.0013190166897232292]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.222027, -0.462333, 0.858459], [-0.974308, 0.139332, -0.176951], [-0.0378, -0.875691, -0.48139]], 'translation vector': [2.717101, 1.647348, 1.522281]}\nB: {'rotation matrix': [[-0.218431, -0.46311, 0.858963], [-0.975079, 0.138612, -0.173227], [-0.038839, -0.875395, -0.481846]], 'translation vector': [2.716881, 1.647519, 1.52132]}\nC: {'rotation matrix': [[-0.21644, -0.463451, 0.859283], [-0.975487, 0.138488, -0.171017], [-0.039742, -0.875234, -0.482065]], 'translation vector': [2.718464, 1.6518, 1.521331]}\nD: {'rotation matrix': [[0.9999923983379827, -0.0004070717907623284, -0.003958509569048427], [0.0004054339454520325, 1.0000004886800318, -0.0002774375137664041], [0.003959398020789269, 0.00027564706640871675, 0.9999922541189072]], 'translation vector': [-0.005217870906849331, -0.0010876072780465762, 0.0013190166897232292]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_93_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_93_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_93_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_93_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.937153, -0.143335, 0.318118], [-0.348912, 0.379164, -0.857027], [0.002223, -0.914161, -0.405346]], 'translation vector': [2.695696, 2.482015, 1.468683]}\nB: {'rotation matrix': [[0.9999943196573092, 0.0004252932585082039, -0.003407353327997387], [-0.0004083231357033075, 0.9999884704113977, 0.004802465167927532], [0.0034098975295442984, -0.004801063633517885, 0.9999828785878939]], 'translation vector': [4.878723511492211e-05, -0.0002916568078848991, 6.338042778675224e-05]}\nC: {'rotation matrix': [[-0.936491, -0.141969, 0.320669], [-0.350691, 0.379755, -0.856039], [-0.000244, -0.914129, -0.405424]], 'translation vector': [2.694833, 2.48135, 1.466405]}\nD: {'rotation matrix': [[-0.938082, -0.144128, 0.315008], [-0.346377, 0.376958, -0.859026], [0.005065, -0.914948, -0.40354]], 'translation vector': [2.697706, 2.481531, 1.470994]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.937153, -0.143335, 0.318118], [-0.348912, 0.379164, -0.857027], [0.002223, -0.914161, -0.405346]], 'translation vector': [2.695696, 2.482015, 1.468683]}\nB: {'rotation matrix': [[0.9999943196573092, 0.0004252932585082039, -0.003407353327997387], [-0.0004083231357033075, 0.9999884704113977, 0.004802465167927532], [0.0034098975295442984, -0.004801063633517885, 0.9999828785878939]], 'translation vector': [4.878723511492211e-05, -0.0002916568078848991, 6.338042778675224e-05]}\nC: {'rotation matrix': [[-0.936491, -0.141969, 0.320669], [-0.350691, 0.379755, -0.856039], [-0.000244, -0.914129, -0.405424]], 'translation vector': [2.694833, 2.48135, 1.466405]}\nD: {'rotation matrix': [[-0.938082, -0.144128, 0.315008], [-0.346377, 0.376958, -0.859026], [0.005065, -0.914948, -0.40354]], 'translation vector': [2.697706, 2.481531, 1.470994]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_94_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_94_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_94_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_94_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999926818767, 0.0006670318036002099, -0.0037525660402691484], [-0.0006413298116955215, 0.9999778975651503, 0.0065610016234285765], [0.0037569249243787368, -0.006558428485894876, 0.999971386151166]], 'translation vector': [0.00407147231806082, -0.002381515530327949, 0.00020808612264033854]}\nB: {'rotation matrix': [[0.598948, -0.354434, 0.718079], [-0.795274, -0.158225, 0.585238], [-0.093811, -0.921597, -0.376641]], 'translation vector': [2.366687, 6.228749, 1.483315]}\nC: {'rotation matrix': [[0.595688, -0.354051, 0.720975], [-0.797698, -0.155728, 
0.582604], [-0.093996, -0.92217, -0.37519]], 'translation vector': [2.365015, 6.231124, 1.484416]}\nD: {'rotation matrix': [[0.602088, -0.354098, 0.715615], [-0.793005, -0.160904, 0.587582], [-0.092916, -0.921263, -0.37768]], 'translation vector': [2.370181, 6.228135, 1.483056]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999926818767, 0.0006670318036002099, -0.0037525660402691484], [-0.0006413298116955215, 0.9999778975651503, 0.0065610016234285765], [0.0037569249243787368, -0.006558428485894876, 0.999971386151166]], 'translation vector': [0.00407147231806082, -0.002381515530327949, 0.00020808612264033854]}\nB: {'rotation matrix': [[0.598948, -0.354434, 0.718079], [-0.795274, -0.158225, 0.585238], [-0.093811, -0.921597, -0.376641]], 'translation vector': [2.366687, 6.228749, 1.483315]}\nC: {'rotation matrix': [[0.595688, -0.354051, 0.720975], [-0.797698, -0.155728, 0.582604], [-0.093996, -0.92217, -0.37519]], 'translation vector': [2.365015, 6.231124, 1.484416]}\nD: {'rotation matrix': [[0.602088, -0.354098, 0.715615], [-0.793005, -0.160904, 0.587582], [-0.092916, -0.921263, -0.37768]], 'translation vector': [2.370181, 6.228135, 1.483056]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_95_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_95_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_95_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_95_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.749807, 0.343159, -0.565713], [0.658704, 0.306467, -0.687158], [-0.062432, -0.887874, -0.455832]], 'translation vector': [3.78087, 2.559782, 1.382918]}\nB: {'rotation matrix': [[0.9999937348891738, -5.242465291256345e-05, -0.003554980216800454], [4.743594033927849e-05, 0.9999987930802742, -0.0013704756295102494], [0.0035549170275783653, 0.0013698614468464884, 0.9999933438233037]], 'translation vector': [-0.006430893779766134, -0.00441205739948114, -0.013902381507369554]}\nC: {'rotation matrix': [[-0.753941, 0.344397, -0.559431], [0.653858, 0.310968, -0.68976], [-0.063586, -0.885827, -0.459639]], 'translation vector': [3.768856, 2.553297, 1.380708]}\nD: {'rotation matrix': [[-0.758857, 0.345337, -0.552158], [0.64782, 0.313263, -0.694403], [-0.066832, -0.884652, -0.461438]], 'translation vector': [3.75736, 2.54705, 1.379026]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.749807, 0.343159, -0.565713], [0.658704, 0.306467, -0.687158], [-0.062432, -0.887874, -0.455832]], 'translation vector': [3.78087, 2.559782, 1.382918]}\nB: {'rotation matrix': [[0.9999937348891738, -5.242465291256345e-05, -0.003554980216800454], [4.743594033927849e-05, 0.9999987930802742, -0.0013704756295102494], [0.0035549170275783653, 0.0013698614468464884, 0.9999933438233037]], 'translation vector': [-0.006430893779766134, -0.00441205739948114, -0.013902381507369554]}\nC: {'rotation matrix': [[-0.753941, 0.344397, -0.559431], [0.653858, 0.310968, -0.68976], [-0.063586, -0.885827, -0.459639]], 'translation vector': [3.768856, 2.553297, 1.380708]}\nD: {'rotation matrix': [[-0.758857, 0.345337, -0.552158], [0.64782, 0.313263, -0.694403], [-0.066832, -0.884652, -0.461438]], 'translation vector': [3.75736, 2.54705, 1.379026]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_96_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_96_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_96_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_96_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999813823639969, 0.005886631638650806, 0.0017617762591203541], [-0.005897799441000984, 0.9999663260190635, 0.005653603868593601], [-0.0017278375660129883, -0.005663714705693189, 0.9999828776446472]], 'translation vector': [-0.009345482457980114, -0.0002560144178671564, 0.004418422526778709]}\nB: {'rotation matrix': [[0.930353, -0.229821, 0.285704], [-0.366636, -0.593139, 0.716774], [0.004732, -0.771601, -0.636089]], 'translation vector': [0.347034, 1.978598, 1.559374]}\nC: {'rotation matrix': [[0.932658, -0.225033, 0.281975], [-0.360742, -0.590178, 0.722188], 
[0.003899, -0.775274, -0.631613]], 'translation vector': [0.341015, 1.979035, 1.553548]}\nD: {'rotation matrix': [[0.93165, -0.227962, 0.282953], [-0.363337, -0.592976, 0.718587], [0.003973, -0.772278, -0.635273]], 'translation vector': [0.345174, 1.978811, 1.55669]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999813823639969, 0.005886631638650806, 0.0017617762591203541], [-0.005897799441000984, 0.9999663260190635, 0.005653603868593601], [-0.0017278375660129883, -0.005663714705693189, 0.9999828776446472]], 'translation vector': [-0.009345482457980114, -0.0002560144178671564, 0.004418422526778709]}\nB: {'rotation matrix': [[0.930353, -0.229821, 0.285704], [-0.366636, -0.593139, 0.716774], [0.004732, -0.771601, -0.636089]], 'translation vector': [0.347034, 1.978598, 1.559374]}\nC: {'rotation matrix': [[0.932658, -0.225033, 0.281975], [-0.360742, -0.590178, 0.722188], [0.003899, -0.775274, -0.631613]], 'translation vector': [0.341015, 1.979035, 1.553548]}\nD: {'rotation matrix': [[0.93165, -0.227962, 0.282953], [-0.363337, -0.592976, 0.718587], [0.003973, -0.772278, -0.635273]], 'translation vector': [0.345174, 1.978811, 1.55669]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_97_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_97_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_97_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_97_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.805636, 0.331099, -0.491249], [0.591886, 0.41495, -0.691005], [-0.024947, -0.847461, -0.530271]], 'translation vector': [2.379813, 3.089217, 1.318416]}\nB: {'rotation matrix': [[-0.795605, 0.337599, -0.503031], [0.605104, 0.402607, -0.686846], [-0.029355, -0.850844, -0.524598]], 'translation vector': [2.393777, 3.105406, 1.314663]}\nC: {'rotation matrix': [[0.9993623988750846, 0.01800106003136589, -0.030848731804103247], [-0.01802560053443595, 0.9998375187449363, -0.0005268676766097016], [0.03083392596568167, 0.0010819486754358148, 0.9995239906208094]], 'translation vector': [-0.0016360885893116628, -0.010945290948126685, 0.024052024973950203]}\nD: {'rotation matrix': [[-0.800158, 0.334375, -0.497936], [0.599132, 0.406738, -0.689642], [-0.02807, -0.850152, -0.525789]], 'translation vector': [2.38798, 3.097038, 1.316188]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.805636, 0.331099, -0.491249], [0.591886, 0.41495, -0.691005], [-0.024947, -0.847461, -0.530271]], 'translation vector': [2.379813, 3.089217, 1.318416]}\nB: {'rotation matrix': [[-0.795605, 0.337599, -0.503031], [0.605104, 0.402607, -0.686846], [-0.029355, -0.850844, -0.524598]], 'translation vector': [2.393777, 3.105406, 1.314663]}\nC: {'rotation matrix': [[0.9993623988750846, 0.01800106003136589, -0.030848731804103247], [-0.01802560053443595, 0.9998375187449363, -0.0005268676766097016], [0.03083392596568167, 0.0010819486754358148, 0.9995239906208094]], 'translation vector': [-0.0016360885893116628, -0.010945290948126685, 0.024052024973950203]}\nD: {'rotation matrix': [[-0.800158, 0.334375, -0.497936], [0.599132, 0.406738, -0.689642], [-0.02807, -0.850152, -0.525789]], 'translation vector': [2.38798, 3.097038, 1.316188]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_98_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_98_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_98_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_98_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.991094, 0.095151, -0.093156], [0.13246, 0.632786, -0.762913], [-0.013644, -0.768458, -0.639754]], 'translation vector': [1.823914, 5.346199, 1.288239]}\nB: {'rotation matrix': [[-0.988726, 0.104422, -0.107319], [0.149216, 0.627347, -0.764311], [-0.012484, -0.771707, -0.635855]], 'translation vector': [1.82699, 5.341948, 1.287049]}\nC: {'rotation matrix': [[-0.99302, 0.087717, -0.078848], [0.116766, 0.636826, -0.762115], [-0.016638, -0.766002, -0.642623]], 'translation vector': [1.820977, 5.35315, 1.28763]}\nD: {'rotation matrix': [[0.9995983225918212, 
0.019501636556654815, -0.020561779482543004], [-0.019155805749363774, 0.9996737108067424, 0.016883013818148294], [0.020884207942970644, -0.01648212192569651, 0.9996460764084063]], 'translation vector': [0.004177467169132809, 0.0037647300644172432, -0.008346822766605477]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.991094, 0.095151, -0.093156], [0.13246, 0.632786, -0.762913], [-0.013644, -0.768458, -0.639754]], 'translation vector': [1.823914, 5.346199, 1.288239]}\nB: {'rotation matrix': [[-0.988726, 0.104422, -0.107319], [0.149216, 0.627347, -0.764311], [-0.012484, -0.771707, -0.635855]], 'translation vector': [1.82699, 5.341948, 1.287049]}\nC: {'rotation matrix': [[-0.99302, 0.087717, -0.078848], [0.116766, 0.636826, -0.762115], [-0.016638, -0.766002, -0.642623]], 'translation vector': [1.820977, 5.35315, 1.28763]}\nD: {'rotation matrix': [[0.9995983225918212, 0.019501636556654815, -0.020561779482543004], [-0.019155805749363774, 0.9996737108067424, 0.016883013818148294], [0.020884207942970644, -0.01648212192569651, 0.9996460764084063]], 'translation vector': [0.004177467169132809, 0.0037647300644172432, -0.008346822766605477]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_99_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_99_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_99_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_99_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.821677, -0.468788, 0.324168], [-0.569738, 0.691217, -0.444543], [-0.015674, -0.549962, -0.835043]], 'translation vector': [3.090628, 8.002418, 1.936363]}\nB: {'rotation matrix': [[-0.828255, -0.463511, 0.314882], [-0.560231, 0.696605, -0.4482], [-0.011603, -0.547631, -0.83664]], 'translation vector': [3.092081, 8.003743, 1.933112]}\nC: {'rotation matrix': [[-0.825245, -0.467159, 0.317384], [-0.564664, 0.693585, -0.44732], [-0.011164, -0.548364, -0.836165]], 'translation vector': [3.09483, 8.004893, 1.934166]}\nD: {'rotation matrix': [[0.9997794014417315, -0.01936250651215195, 0.008149128878679036], [0.019327506819120155, 0.9998035065129168, 0.004379447300995329], [-0.008232031850993044, -0.004222057574502129, 0.9999569918430281]], 'translation vector': [0.008428449014058259, -0.001816843944377755, 0.00043274814993932154]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.821677, -0.468788, 0.324168], [-0.569738, 0.691217, -0.444543], [-0.015674, -0.549962, -0.835043]], 'translation vector': [3.090628, 8.002418, 1.936363]}\nB: {'rotation matrix': [[-0.828255, -0.463511, 0.314882], [-0.560231, 0.696605, -0.4482], [-0.011603, -0.547631, -0.83664]], 'translation vector': [3.092081, 8.003743, 1.933112]}\nC: {'rotation matrix': [[-0.825245, -0.467159, 0.317384], [-0.564664, 0.693585, -0.44732], [-0.011164, -0.548364, -0.836165]], 'translation vector': [3.09483, 8.004893, 1.934166]}\nD: {'rotation matrix': [[0.9997794014417315, -0.01936250651215195, 0.008149128878679036], [0.019327506819120155, 0.9998035065129168, 0.004379447300995329], [-0.008232031850993044, -0.004222057574502129, 0.9999569918430281]], 'translation vector': [0.008428449014058259, -0.001816843944377755, 0.00043274814993932154]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_100_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_100_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_100_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_100_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.141521, 0.444417, -0.884571], [0.989842, -0.075821, 0.12027], [-0.013619, -0.892605, -0.450633]], 'translation vector': [3.547713, 0.933243, 1.481136]}\nB: {'rotation matrix': [[0.9999998646080918, -0.0007460665530377528, -0.0009402946708239287], [0.0007455507104745661, 1.0000000002230558, -0.0004258129745529856], [0.0009407423454210515, 0.0004259725827619916, 0.9999998211939707]], 'translation vector': [0.0008665006475636616, -0.0026059059125004003, -0.0011105548234366935]}\nC: {'rotation matrix': [[0.140147, 0.44482, -0.884587], [0.990025, -0.076044, 
0.118612], [-0.014506, -0.892386, -0.45104]], 'translation vector': [3.548717, 0.935529, 1.481701]}\nD: {'rotation matrix': [[0.140907, 0.444584, -0.884585], [0.989916, -0.076415, 0.11928], [-0.014565, -0.892472, -0.450868]], 'translation vector': [3.549046, 0.934745, 1.482359]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.141521, 0.444417, -0.884571], [0.989842, -0.075821, 0.12027], [-0.013619, -0.892605, -0.450633]], 'translation vector': [3.547713, 0.933243, 1.481136]}\nB: {'rotation matrix': [[0.9999998646080918, -0.0007460665530377528, -0.0009402946708239287], [0.0007455507104745661, 1.0000000002230558, -0.0004258129745529856], [0.0009407423454210515, 0.0004259725827619916, 0.9999998211939707]], 'translation vector': [0.0008665006475636616, -0.0026059059125004003, -0.0011105548234366935]}\nC: {'rotation matrix': [[0.140147, 0.44482, -0.884587], [0.990025, -0.076044, 0.118612], [-0.014506, -0.892386, -0.45104]], 'translation vector': [3.548717, 0.935529, 1.481701]}\nD: {'rotation matrix': [[0.140907, 0.444584, -0.884585], [0.989916, -0.076415, 0.11928], [-0.014565, -0.892472, -0.450868]], 'translation vector': [3.549046, 0.934745, 1.482359]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_101_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_101_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_101_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_101_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", 
+ "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999988361601905, 0.001006375069335518, -0.0012640568426656846], [-0.0010070223030151592, 0.9999994457401622, -0.0002863718291939501], [0.00126333848912293, 0.0002872673648295308, 0.9999986249179557]], 'translation vector': [-2.298300292791211e-05, -0.0003253221346838364, -0.001208803459929797]}\nB: {'rotation matrix': [[0.209622, 0.494864, -0.843308], [0.976967, -0.070778, 0.201312], [0.039935, -0.866083, -0.498303]], 'translation vector': [4.529501, 2.292687, 1.525847]}\nC: {'rotation matrix': [[0.210084, 0.49423, -0.843565], [0.976909, -0.071791, 0.201231], [0.038894, -0.866362, -0.4979]], 'translation vector': [4.52972, 2.291977, 1.52688]}\nD: {'rotation matrix': [[0.207746, 0.495681, -0.843292], [0.977345, -0.069508, 0.199914], [0.040478, -0.865719, -0.498891]], 'translation vector': [4.528935, 2.293617, 1.525752]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999988361601905, 0.001006375069335518, -0.0012640568426656846], [-0.0010070223030151592, 0.9999994457401622, -0.0002863718291939501], [0.00126333848912293, 0.0002872673648295308, 0.9999986249179557]], 'translation vector': [-2.298300292791211e-05, -0.0003253221346838364, -0.001208803459929797]}\nB: {'rotation matrix': [[0.209622, 0.494864, -0.843308], [0.976967, -0.070778, 0.201312], [0.039935, -0.866083, -0.498303]], 'translation vector': [4.529501, 2.292687, 1.525847]}\nC: {'rotation matrix': [[0.210084, 0.49423, -0.843565], [0.976909, -0.071791, 0.201231], [0.038894, -0.866362, -0.4979]], 'translation vector': [4.52972, 2.291977, 1.52688]}\nD: {'rotation matrix': [[0.207746, 0.495681, -0.843292], [0.977345, -0.069508, 0.199914], [0.040478, -0.865719, -0.498891]], 'translation vector': [4.528935, 2.293617, 1.525752]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_102_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_102_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_102_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_102_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.08541, 0.640528, -0.76317], [0.996306, -0.061741, 0.059682], [-0.008891, -0.765449, -0.643436]], 'translation vector': [3.003591, 1.574332, 1.432793]}\nB: {'rotation matrix': [[0.08501, 0.641279, -0.762584], [0.996355, -0.060148, 0.06049], [-0.007077, -0.764946, -0.644055]], 'translation vector': [3.00634, 1.575815, 1.433934]}\nC: {'rotation matrix': [[0.9999991614836521, 0.0005695811207308883, -0.0015075373347832835], [-0.0005748316229898535, 0.9999942080654551, -0.0033643004544446934], [0.0015050239127193494, 0.003364654292322913, 0.9999927444380246]], 
'translation vector': [-0.0005823890138008103, 0.0017300160779236684, -0.0007769099832195536]}\nD: {'rotation matrix': [[0.085438, 0.641091, -0.762694], [0.996316, -0.060644, 0.060635], [-0.00738, -0.765065, -0.64391]], 'translation vector': [3.005707, 1.574798, 1.4333]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.08541, 0.640528, -0.76317], [0.996306, -0.061741, 0.059682], [-0.008891, -0.765449, -0.643436]], 'translation vector': [3.003591, 1.574332, 1.432793]}\nB: {'rotation matrix': [[0.08501, 0.641279, -0.762584], [0.996355, -0.060148, 0.06049], [-0.007077, -0.764946, -0.644055]], 'translation vector': [3.00634, 1.575815, 1.433934]}\nC: {'rotation matrix': [[0.9999991614836521, 0.0005695811207308883, -0.0015075373347832835], [-0.0005748316229898535, 0.9999942080654551, -0.0033643004544446934], [0.0015050239127193494, 0.003364654292322913, 0.9999927444380246]], 'translation vector': [-0.0005823890138008103, 0.0017300160779236684, -0.0007769099832195536]}\nD: {'rotation matrix': [[0.085438, 0.641091, -0.762694], [0.996316, -0.060644, 0.060635], [-0.00738, -0.765065, -0.64391]], 'translation vector': [3.005707, 1.574798, 1.4333]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_103_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_103_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_103_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_103_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999836186953653, 0.00108451635874016, -0.00565701955892193], [-0.0010583782720287828, 0.9999885698645751, 0.004627019650109258], [0.005661574895801672, -0.004620937029738945, 0.9999737330988163]], 'translation vector': [0.0022502124816869973, 0.004079635447382657, -0.0017077678174191036]}\nB: {'rotation matrix': [[0.764916, -0.419696, 0.48863], [-0.623144, -0.290098, 0.726316], [-0.163081, -0.860057, -0.483431]], 'translation vector': [2.190224, 2.255941, 1.286466]}\nC: {'rotation matrix': [[0.764173, -0.416772, 0.492281], [-0.624792, -0.28869, 0.725461], [-0.160235, -0.861951, -0.481005]], 'translation vector': [2.189569, 2.253508, 1.282023]}\nD: {'rotation matrix': [[0.763152, -0.417481, 0.493263], [-0.626561, -0.291187, 0.722933], [-0.158179, -0.860767, -0.483797]], 'translation vector': [2.190887, 2.252149, 1.282769]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999836186953653, 0.00108451635874016, -0.00565701955892193], [-0.0010583782720287828, 0.9999885698645751, 0.004627019650109258], [0.005661574895801672, -0.004620937029738945, 0.9999737330988163]], 'translation vector': [0.0022502124816869973, 0.004079635447382657, -0.0017077678174191036]}\nB: {'rotation matrix': [[0.764916, -0.419696, 0.48863], [-0.623144, -0.290098, 0.726316], [-0.163081, -0.860057, -0.483431]], 'translation vector': [2.190224, 2.255941, 1.286466]}\nC: {'rotation matrix': [[0.764173, -0.416772, 0.492281], [-0.624792, -0.28869, 0.725461], [-0.160235, -0.861951, -0.481005]], 'translation vector': [2.189569, 2.253508, 1.282023]}\nD: {'rotation matrix': [[0.763152, -0.417481, 0.493263], [-0.626561, -0.291187, 0.722933], [-0.158179, -0.860767, -0.483797]], 'translation vector': [2.190887, 2.252149, 1.282769]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_104_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_104_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_104_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_104_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999685089712825, -0.00485492735446149, 0.006197125791337409], [0.004892366141195013, 0.9999701266660436, -0.005981156791582798], [-0.006168136508286792, 0.006011265809070663, 0.9999622632239159]], 'translation vector': [0.00022924877864394233, 0.00019097290261571587, -0.001928325440709866]}\nB: {'rotation matrix': [[-0.968997, 0.179836, -0.169422], [0.236776, 0.48002, -0.8447], [-0.070582, -0.858627, -0.507719]], 'translation vector': [3.781446, 2.333063, 1.459816]}\nC: {'rotation matrix': [[-0.967651, 0.180929, -0.175829], [0.242263, 0.471818, -0.84776], 
[-0.070424, -0.862933, -0.500388]], 'translation vector': [3.780886, 2.334988, 1.460004]}\nD: {'rotation matrix': [[-0.968244, 0.180308, -0.173186], [0.239986, 0.476144, -0.845987], [-0.070076, -0.860684, -0.504294]], 'translation vector': [3.781386, 2.333968, 1.460791]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999685089712825, -0.00485492735446149, 0.006197125791337409], [0.004892366141195013, 0.9999701266660436, -0.005981156791582798], [-0.006168136508286792, 0.006011265809070663, 0.9999622632239159]], 'translation vector': [0.00022924877864394233, 0.00019097290261571587, -0.001928325440709866]}\nB: {'rotation matrix': [[-0.968997, 0.179836, -0.169422], [0.236776, 0.48002, -0.8447], [-0.070582, -0.858627, -0.507719]], 'translation vector': [3.781446, 2.333063, 1.459816]}\nC: {'rotation matrix': [[-0.967651, 0.180929, -0.175829], [0.242263, 0.471818, -0.84776], [-0.070424, -0.862933, -0.500388]], 'translation vector': [3.780886, 2.334988, 1.460004]}\nD: {'rotation matrix': [[-0.968244, 0.180308, -0.173186], [0.239986, 0.476144, -0.845987], [-0.070076, -0.860684, -0.504294]], 'translation vector': [3.781386, 2.333968, 1.460791]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_105_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_105_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_105_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_105_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.769476, 0.035457, -0.637691], [0.638618, -0.056212, 0.767468], [-0.008634, -0.997789, -0.065897]], 'translation vector': [3.059908, 3.99174, 1.48793]}\nB: {'rotation matrix': [[0.768334, 0.034359, -0.639126], [0.639975, -0.056434, 0.766321], [-0.009738, -0.997815, -0.065349]], 'translation vector': [3.063556, 3.993645, 1.487647]}\nC: {'rotation matrix': [[0.76637, 0.032495, -0.641577], [0.642284, -0.057724, 0.764291], [-0.012198, -0.997804, -0.065109]], 'translation vector': [3.065239, 3.993527, 1.488269]}\nD: {'rotation matrix': [[0.9999994770672102, 0.0010059593388553243, -0.0005629779278299809], [-0.0010051492699491647, 0.9999992963767129, 0.00013663416634814593], [0.0005621045000587302, -0.0001358922037830382, 1.0000001627120574]], 'translation vector': [-0.004005273497495132, -0.008267648490985158, -0.0009698463604679297]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.769476, 0.035457, -0.637691], [0.638618, -0.056212, 0.767468], [-0.008634, -0.997789, -0.065897]], 'translation vector': [3.059908, 3.99174, 1.48793]}\nB: {'rotation matrix': [[0.768334, 0.034359, -0.639126], [0.639975, -0.056434, 0.766321], [-0.009738, -0.997815, -0.065349]], 'translation vector': [3.063556, 3.993645, 1.487647]}\nC: {'rotation matrix': [[0.76637, 0.032495, -0.641577], [0.642284, -0.057724, 0.764291], [-0.012198, -0.997804, -0.065109]], 'translation vector': [3.065239, 3.993527, 1.488269]}\nD: {'rotation matrix': [[0.9999994770672102, 0.0010059593388553243, -0.0005629779278299809], [-0.0010051492699491647, 0.9999992963767129, 0.00013663416634814593], [0.0005621045000587302, -0.0001358922037830382, 1.0000001627120574]], 'translation vector': [-0.004005273497495132, -0.008267648490985158, -0.0009698463604679297]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_106_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_106_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_106_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_106_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999965024273574, 0.0018995589746770465, -0.002128604659346281], [-0.0018975946490258904, 0.9999971783528935, 0.0010235202394668671], [0.0021305967266473038, -0.00101920750556101, 0.9999966975642395]], 'translation vector': [-0.001221359216795559, -0.0013000622008119134, -0.00023198476015379166]}\nB: {'rotation matrix': [[-0.247804, -0.452831, 0.856468], [-0.967446, 0.162565, -0.193963], [-0.051399, -0.876651, -0.478373]], 'translation vector': [1.577581, 1.960365, 1.31447]}\nC: {'rotation matrix': [[-0.241822, -0.452397, 0.858405], [-0.968762, 0.162689, 
-0.18717], [-0.054978, -0.876852, -0.477607]], 'translation vector': [1.575634, 1.958436, 1.314538]}\nD: {'rotation matrix': [[-0.251836, -0.455153, 0.854058], [-0.966529, 0.162962, -0.198153], [-0.048989, -0.875374, -0.480959]], 'translation vector': [1.577733, 1.957285, 1.314553]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999965024273574, 0.0018995589746770465, -0.002128604659346281], [-0.0018975946490258904, 0.9999971783528935, 0.0010235202394668671], [0.0021305967266473038, -0.00101920750556101, 0.9999966975642395]], 'translation vector': [-0.001221359216795559, -0.0013000622008119134, -0.00023198476015379166]}\nB: {'rotation matrix': [[-0.247804, -0.452831, 0.856468], [-0.967446, 0.162565, -0.193963], [-0.051399, -0.876651, -0.478373]], 'translation vector': [1.577581, 1.960365, 1.31447]}\nC: {'rotation matrix': [[-0.241822, -0.452397, 0.858405], [-0.968762, 0.162689, -0.18717], [-0.054978, -0.876852, -0.477607]], 'translation vector': [1.575634, 1.958436, 1.314538]}\nD: {'rotation matrix': [[-0.251836, -0.455153, 0.854058], [-0.966529, 0.162962, -0.198153], [-0.048989, -0.875374, -0.480959]], 'translation vector': [1.577733, 1.957285, 1.314553]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_107_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_107_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_107_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_107_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": 
"3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.502725, -0.506922, 0.700212], [-0.864292, 0.279467, -0.418207], [0.016312, -0.815431, -0.578624]], 'translation vector': [4.022235, 5.007849, 1.281956]}\nB: {'rotation matrix': [[0.9994658881112836, -0.01787238768104933, 0.027367099286839433], [0.01746068119707397, 0.9997321901556896, 0.015206345300919908], [-0.027631559291770684, -0.01472107419323857, 0.9995094632380824]], 'translation vector': [-0.030548360022831567, -0.0024606871848007472, 0.004630350985881493]}\nC: {'rotation matrix': [[-0.511887, -0.50554, 0.694551], [-0.858867, 0.284356, -0.426016], [0.017868, -0.814599, -0.579749]], 'translation vector': [4.034731, 5.018784, 1.285057]}\nD: {'rotation matrix': [[-0.524185, -0.50567, 0.685221], [-0.851455, 0.296094, -0.432844], [0.015986, -0.810325, -0.585763]], 'translation vector': [4.046806, 5.029983, 1.286514]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.502725, -0.506922, 0.700212], [-0.864292, 0.279467, -0.418207], [0.016312, -0.815431, -0.578624]], 'translation vector': [4.022235, 5.007849, 1.281956]}\nB: {'rotation matrix': [[0.9994658881112836, -0.01787238768104933, 0.027367099286839433], [0.01746068119707397, 0.9997321901556896, 0.015206345300919908], [-0.027631559291770684, -0.01472107419323857, 0.9995094632380824]], 'translation vector': [-0.030548360022831567, -0.0024606871848007472, 0.004630350985881493]}\nC: {'rotation matrix': [[-0.511887, -0.50554, 0.694551], [-0.858867, 0.284356, -0.426016], [0.017868, -0.814599, -0.579749]], 'translation vector': [4.034731, 5.018784, 1.285057]}\nD: {'rotation matrix': [[-0.524185, -0.50567, 0.685221], [-0.851455, 0.296094, -0.432844], [0.015986, -0.810325, -0.585763]], 'translation vector': [4.046806, 5.029983, 1.286514]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_108_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_108_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_108_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_108_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9997511181043846, 0.002627583195842048, -0.02210613011174363], [-0.002639480814720638, 0.9999968737080126, -0.0005594700790184048], [0.022104273186971824, 0.0006179932127646122, 0.9997554616613505]], 'translation vector': [0.008057060510321179, -0.003086615617105104, 0.008815946351156123]}\nB: {'rotation matrix': [[-0.793492, -0.269336, 0.545737], [-0.608499, 0.36581, -0.70421], [-0.009967, -0.890865, -0.454158]], 'translation vector': [3.342808, 3.719108, 1.377405]}\nC: {'rotation matrix': [[-0.799682, -0.271069, 0.535752], [-0.600405, 0.367946, -0.710021], 
[-0.004663, -0.889459, -0.456991]], 'translation vector': [3.342098, 3.723592, 1.379504]}\nD: {'rotation matrix': [[-0.786909, -0.267427, 0.556108], [-0.616914, 0.361106, -0.699299], [-0.013802, -0.893356, -0.449137]], 'translation vector': [3.343614, 3.714152, 1.377028]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9997511181043846, 0.002627583195842048, -0.02210613011174363], [-0.002639480814720638, 0.9999968737080126, -0.0005594700790184048], [0.022104273186971824, 0.0006179932127646122, 0.9997554616613505]], 'translation vector': [0.008057060510321179, -0.003086615617105104, 0.008815946351156123]}\nB: {'rotation matrix': [[-0.793492, -0.269336, 0.545737], [-0.608499, 0.36581, -0.70421], [-0.009967, -0.890865, -0.454158]], 'translation vector': [3.342808, 3.719108, 1.377405]}\nC: {'rotation matrix': [[-0.799682, -0.271069, 0.535752], [-0.600405, 0.367946, -0.710021], [-0.004663, -0.889459, -0.456991]], 'translation vector': [3.342098, 3.723592, 1.379504]}\nD: {'rotation matrix': [[-0.786909, -0.267427, 0.556108], [-0.616914, 0.361106, -0.699299], [-0.013802, -0.893356, -0.449137]], 'translation vector': [3.343614, 3.714152, 1.377028]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_109_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_109_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_109_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_109_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.470058, 0.310455, -0.826234], [0.882195, -0.135697, 0.450908], [0.027869, -0.940853, -0.337668]], 'translation vector': [2.719146, 3.165557, 1.444111]}\nB: {'rotation matrix': [[0.468253, 0.310351, -0.827298], [0.883071, -0.132138, 0.45025], [0.030418, -0.941394, -0.335936]], 'translation vector': [2.721684, 3.167619, 1.442076]}\nC: {'rotation matrix': [[0.9999808269927472, 0.005970992787416191, 0.00174631158613173], [-0.00597527498498062, 0.9999782630360619, 0.002650362855816552], [-0.0017306339108793102, -0.002661153861931357, 0.9999943916418343]], 'translation vector': [0.00038154468875628567, 0.0036815080791540167, 0.0005855298747257098]}\nD: {'rotation matrix': [[0.468431, 0.309409, -0.82755], [0.883026, -0.133283, 0.45], [0.028935, -0.941542, -0.33565]], 'translation vector': [2.722082, 3.167839, 1.441818]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.470058, 0.310455, -0.826234], [0.882195, -0.135697, 0.450908], [0.027869, -0.940853, -0.337668]], 'translation vector': [2.719146, 3.165557, 1.444111]}\nB: {'rotation matrix': [[0.468253, 0.310351, -0.827298], [0.883071, -0.132138, 0.45025], [0.030418, -0.941394, -0.335936]], 'translation vector': [2.721684, 3.167619, 1.442076]}\nC: {'rotation matrix': [[0.9999808269927472, 0.005970992787416191, 0.00174631158613173], [-0.00597527498498062, 0.9999782630360619, 0.002650362855816552], [-0.0017306339108793102, -0.002661153861931357, 0.9999943916418343]], 'translation vector': [0.00038154468875628567, 0.0036815080791540167, 0.0005855298747257098]}\nD: {'rotation matrix': [[0.468431, 0.309409, -0.82755], [0.883026, -0.133283, 0.45], [0.028935, -0.941542, -0.33565]], 'translation vector': [2.722082, 3.167839, 1.441818]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_110_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_110_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_110_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_110_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.886617, -0.372394, 0.274287], [-0.453667, -0.584855, 0.672407], [-0.089983, -0.720602, -0.687485]], 'translation vector': [2.862491, 2.429976, 1.648643]}\nB: {'rotation matrix': [[0.9999609940536269, 0.004336106194237876, -0.007665012330121596], [-0.0043994792450735756, 0.9999564170173506, -0.008231049758408536], [0.007628227917166668, 0.008264723677336768, 0.9999364519647637]], 'translation vector': [0.014388650201931252, -0.01870904449863442, 0.020428177278094317]}\nC: {'rotation matrix': [[0.881825, -0.378531, 0.281247], [-0.462696, -0.579299, 0.671062], [-0.091091, 
-0.721891, -0.685985]], 'translation vector': [2.843046, 2.410197, 1.648909]}\nD: {'rotation matrix': [[0.88465, -0.37678, 0.274647], [-0.457061, -0.584385, 0.670514], [-0.092137, -0.718701, -0.689188]], 'translation vector': [2.852998, 2.419565, 1.649377]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.886617, -0.372394, 0.274287], [-0.453667, -0.584855, 0.672407], [-0.089983, -0.720602, -0.687485]], 'translation vector': [2.862491, 2.429976, 1.648643]}\nB: {'rotation matrix': [[0.9999609940536269, 0.004336106194237876, -0.007665012330121596], [-0.0043994792450735756, 0.9999564170173506, -0.008231049758408536], [0.007628227917166668, 0.008264723677336768, 0.9999364519647637]], 'translation vector': [0.014388650201931252, -0.01870904449863442, 0.020428177278094317]}\nC: {'rotation matrix': [[0.881825, -0.378531, 0.281247], [-0.462696, -0.579299, 0.671062], [-0.091091, -0.721891, -0.685985]], 'translation vector': [2.843046, 2.410197, 1.648909]}\nD: {'rotation matrix': [[0.88465, -0.37678, 0.274647], [-0.457061, -0.584385, 0.670514], [-0.092137, -0.718701, -0.689188]], 'translation vector': [2.852998, 2.419565, 1.649377]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_111_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_111_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_111_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_111_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.937349, 0.183503, -0.296148], [0.348203, 0.521431, -0.779015], [0.011469, -0.833329, -0.552659]], 'translation vector': [1.516432, 1.509609, 1.382559]}\nB: {'rotation matrix': [[0.9999980782363276, 0.001503213640840046, -0.0005306957238598916], [-0.001506954921949503, 0.9999783283684359, -0.006294441787430057], [0.0005221180642734458, 0.006294562264384682, 0.9999799919937622]], 'translation vector': [-0.0001973645495118026, -0.004429123982855679, 0.002277103824624316]}\nC: {'rotation matrix': [[-0.936868, 0.179689, -0.299985], [0.349254, 0.523335, -0.777266], [0.017327, -0.832966, -0.553053]], 'translation vector': [1.516465, 1.50589, 1.383504]}\nD: {'rotation matrix': [[-0.936977, 0.182065, -0.298205], [0.349103, 0.5225, -0.777896], [0.014184, -0.832974, -0.55313]], 'translation vector': [1.516084, 1.508243, 1.382535]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.937349, 0.183503, -0.296148], [0.348203, 0.521431, -0.779015], [0.011469, -0.833329, -0.552659]], 'translation vector': [1.516432, 1.509609, 1.382559]}\nB: {'rotation matrix': [[0.9999980782363276, 0.001503213640840046, -0.0005306957238598916], [-0.001506954921949503, 0.9999783283684359, -0.006294441787430057], [0.0005221180642734458, 0.006294562264384682, 0.9999799919937622]], 'translation vector': [-0.0001973645495118026, -0.004429123982855679, 0.002277103824624316]}\nC: {'rotation matrix': [[-0.936868, 0.179689, -0.299985], [0.349254, 0.523335, -0.777266], [0.017327, -0.832966, -0.553053]], 'translation vector': [1.516465, 1.50589, 1.383504]}\nD: {'rotation matrix': [[-0.936977, 0.182065, -0.298205], [0.349103, 0.5225, -0.777896], [0.014184, -0.832974, -0.55313]], 'translation vector': [1.516084, 1.508243, 1.382535]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_112_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_112_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_112_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_112_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.857254, 0.207542, -0.471213], [0.514274, 0.300265, -0.803345], [-0.025239, -0.931003, -0.364137]], 'translation vector': [3.165454, 3.656282, 1.333704]}\nB: {'rotation matrix': [[0.99999640966509, -0.0010007660526934368, -0.0025990335284116336], [0.0009919500135136654, 0.9999945413184362, -0.003311856723933156], [0.0026023838793041037, 0.0033091782066769597, 0.999990415293157]], 'translation vector': [0.003805164660490079, -0.0038731744338753593, -0.0029462366598167478]}\nC: {'rotation matrix': [[-0.857583, 0.210228, -0.469422], [0.513576, 0.300052, 
-0.803871], [-0.028145, -0.930469, -0.365287]], 'translation vector': [3.164042, 3.653142, 1.337743]}\nD: {'rotation matrix': [[-0.856761, 0.210709, -0.470704], [0.514795, 0.294981, -0.804967], [-0.030765, -0.931981, -0.3612]], 'translation vector': [3.165054, 3.650114, 1.341357]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.857254, 0.207542, -0.471213], [0.514274, 0.300265, -0.803345], [-0.025239, -0.931003, -0.364137]], 'translation vector': [3.165454, 3.656282, 1.333704]}\nB: {'rotation matrix': [[0.99999640966509, -0.0010007660526934368, -0.0025990335284116336], [0.0009919500135136654, 0.9999945413184362, -0.003311856723933156], [0.0026023838793041037, 0.0033091782066769597, 0.999990415293157]], 'translation vector': [0.003805164660490079, -0.0038731744338753593, -0.0029462366598167478]}\nC: {'rotation matrix': [[-0.857583, 0.210228, -0.469422], [0.513576, 0.300052, -0.803871], [-0.028145, -0.930469, -0.365287]], 'translation vector': [3.164042, 3.653142, 1.337743]}\nD: {'rotation matrix': [[-0.856761, 0.210709, -0.470704], [0.514795, 0.294981, -0.804967], [-0.030765, -0.931981, -0.3612]], 'translation vector': [3.165054, 3.650114, 1.341357]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_113_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_113_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_113_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_113_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d 
image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.496753, -0.455066, 0.739021], [-0.867832, 0.250391, -0.429153], [0.010249, -0.854529, -0.519303]], 'translation vector': [1.585927, 4.408765, 1.329075]}\nB: {'rotation matrix': [[-0.500222, -0.451271, 0.739008], [-0.865841, 0.250951, -0.432832], [0.009869, -0.856375, -0.51626]], 'translation vector': [1.58204, 4.414393, 1.331803]}\nC: {'rotation matrix': [[0.9999646386789894, 0.004658434745366248, 0.0070851516010133645], [-0.004681817359291345, 0.999983580482257, 0.0033767996583214267], [-0.007069472768969729, -0.0034085438936624457, 0.9999685018088521]], 'translation vector': [-1.9089315443032717e-05, 0.003691149021725071, -0.009076217757424399]}\nD: {'rotation matrix': [[-0.49386, -0.45517, 0.740893], [-0.869462, 0.246928, -0.427859], [0.011802, -0.855481, -0.5177]], 'translation vector': [1.591466, 4.4048, 1.328646]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.496753, -0.455066, 0.739021], [-0.867832, 0.250391, -0.429153], [0.010249, -0.854529, -0.519303]], 'translation vector': [1.585927, 4.408765, 1.329075]}\nB: {'rotation matrix': [[-0.500222, -0.451271, 0.739008], [-0.865841, 0.250951, -0.432832], [0.009869, -0.856375, -0.51626]], 'translation vector': [1.58204, 4.414393, 1.331803]}\nC: {'rotation matrix': [[0.9999646386789894, 0.004658434745366248, 0.0070851516010133645], [-0.004681817359291345, 0.999983580482257, 0.0033767996583214267], [-0.007069472768969729, -0.0034085438936624457, 0.9999685018088521]], 'translation vector': [-1.9089315443032717e-05, 0.003691149021725071, -0.009076217757424399]}\nD: {'rotation matrix': [[-0.49386, -0.45517, 0.740893], [-0.869462, 0.246928, -0.427859], [0.011802, -0.855481, -0.5177]], 'translation vector': [1.591466, 4.4048, 1.328646]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_114_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_114_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_114_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_114_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.216454, 0.211748, -0.953053], [0.970078, -0.156619, 0.185523], [-0.109982, -0.964693, -0.239313]], 'translation vector': [4.876326, 2.835873, 1.673403]}\nB: {'rotation matrix': [[0.9999815116666099, 0.003828941338965109, -0.004550071168900241], [-0.0038043597787837257, 0.999978256845473, 0.005451903660299082], [0.004571630830927769, -0.005433064547306739, 0.9999746221671975]], 'translation vector': [-0.0021931397990639923, 0.004370231111501255, 0.000887941045025542]}\nC: {'rotation matrix': [[0.223921, 0.203392, -0.953148], [0.967778, -0.16198, 0.192793], 
[-0.115179, -0.965606, -0.233109]], 'translation vector': [4.877863, 2.835087, 1.676992]}\nD: {'rotation matrix': [[0.219557, 0.208101, -0.953147], [0.969026, -0.159743, 0.188338], [-0.113066, -0.964975, -0.236728]], 'translation vector': [4.875911, 2.83788, 1.674953]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.216454, 0.211748, -0.953053], [0.970078, -0.156619, 0.185523], [-0.109982, -0.964693, -0.239313]], 'translation vector': [4.876326, 2.835873, 1.673403]}\nB: {'rotation matrix': [[0.9999815116666099, 0.003828941338965109, -0.004550071168900241], [-0.0038043597787837257, 0.999978256845473, 0.005451903660299082], [0.004571630830927769, -0.005433064547306739, 0.9999746221671975]], 'translation vector': [-0.0021931397990639923, 0.004370231111501255, 0.000887941045025542]}\nC: {'rotation matrix': [[0.223921, 0.203392, -0.953148], [0.967778, -0.16198, 0.192793], [-0.115179, -0.965606, -0.233109]], 'translation vector': [4.877863, 2.835087, 1.676992]}\nD: {'rotation matrix': [[0.219557, 0.208101, -0.953147], [0.969026, -0.159743, 0.188338], [-0.113066, -0.964975, -0.236728]], 'translation vector': [4.875911, 2.83788, 1.674953]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_115_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_115_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_115_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_115_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999402380460055, 0.006292872204581464, 0.009000301608311233], [-0.006374858865950918, 0.9999377706786996, 0.009079354369281548], [-0.008942504139812743, -0.009135104885608418, 0.9999183041835895]], 'translation vector': [3.939302955568991e-05, -0.002970151993936021, 0.008448821166219922]}\nB: {'rotation matrix': [[-0.997375, -0.070877, -0.01485], [-0.014261, 0.393282, -0.919307], [0.070998, -0.916682, -0.393261]], 'translation vector': [7.372805, 2.63008, 1.348598]}\nC: {'rotation matrix': [[-0.997269, -0.072413, -0.014556], [-0.015372, 0.396244, -0.918017], [0.072244, -0.915286, -0.396275]], 'translation vector': [7.36901, 2.625689, 1.34671]}\nD: {'rotation matrix': [[-0.997198, -0.073599, -0.01342], [-0.016859, 0.39584, -0.918165], [0.072888, -0.915366, -0.395972]], 'translation vector': [7.365971, 2.622898, 1.345074]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999402380460055, 0.006292872204581464, 0.009000301608311233], [-0.006374858865950918, 0.9999377706786996, 0.009079354369281548], [-0.008942504139812743, -0.009135104885608418, 0.9999183041835895]], 'translation vector': [3.939302955568991e-05, -0.002970151993936021, 0.008448821166219922]}\nB: {'rotation matrix': [[-0.997375, -0.070877, -0.01485], [-0.014261, 0.393282, -0.919307], [0.070998, -0.916682, -0.393261]], 'translation vector': [7.372805, 2.63008, 1.348598]}\nC: {'rotation matrix': [[-0.997269, -0.072413, -0.014556], [-0.015372, 0.396244, -0.918017], [0.072244, -0.915286, -0.396275]], 'translation vector': [7.36901, 2.625689, 1.34671]}\nD: {'rotation matrix': [[-0.997198, -0.073599, -0.01342], [-0.016859, 0.39584, -0.918165], [0.072888, -0.915366, -0.395972]], 'translation vector': [7.365971, 2.622898, 1.345074]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_116_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_116_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_116_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_116_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.051446, 0.205786, -0.977244], [0.99734, -0.040014, -0.06093], [-0.051642, -0.977778, -0.20318]], 'translation vector': [3.492872, 2.502008, 1.69891]}\nB: {'rotation matrix': [[-0.043348, 0.1967, -0.979505], [0.997776, -0.041176, -0.052425], [-0.050644, -0.979599, -0.194477]], 'translation vector': [3.495688, 2.502278, 1.699202]}\nC: {'rotation matrix': [[-0.045349, 0.201463, -0.978446], [0.997609, -0.041995, -0.054884], [-0.052147, -0.978595, -0.199077]], 'translation vector': [3.49477, 2.503383, 1.707673]}\nD: {'rotation matrix': [[0.999952584360321, 
0.006117610645767056, 0.007552374046773563], [-0.006133995866929935, 0.9999788341295721, 0.0021300038842573614], [-0.007539309696335425, -0.0021763143472021637, 0.9999686644670424]], 'translation vector': [-0.005541149058866601, -0.004329021249491083, -0.004737026405577716]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.051446, 0.205786, -0.977244], [0.99734, -0.040014, -0.06093], [-0.051642, -0.977778, -0.20318]], 'translation vector': [3.492872, 2.502008, 1.69891]}\nB: {'rotation matrix': [[-0.043348, 0.1967, -0.979505], [0.997776, -0.041176, -0.052425], [-0.050644, -0.979599, -0.194477]], 'translation vector': [3.495688, 2.502278, 1.699202]}\nC: {'rotation matrix': [[-0.045349, 0.201463, -0.978446], [0.997609, -0.041995, -0.054884], [-0.052147, -0.978595, -0.199077]], 'translation vector': [3.49477, 2.503383, 1.707673]}\nD: {'rotation matrix': [[0.999952584360321, 0.006117610645767056, 0.007552374046773563], [-0.006133995866929935, 0.9999788341295721, 0.0021300038842573614], [-0.007539309696335425, -0.0021763143472021637, 0.9999686644670424]], 'translation vector': [-0.005541149058866601, -0.004329021249491083, -0.004737026405577716]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_117_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_117_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_117_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_117_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.92969, -0.177823, 0.322577], [-0.368073, 0.414955, -0.832066], [0.014105, -0.892296, -0.451231]], 'translation vector': [2.094699, 1.923867, 1.362793]}\nB: {'rotation matrix': [[-0.929496, -0.179835, 0.322021], [-0.368436, 0.412208, -0.833271], [0.017112, -0.893165, -0.449403]], 'translation vector': [2.092189, 1.927801, 1.363214]}\nC: {'rotation matrix': [[0.9999967402226891, -0.00025435497097484245, -0.0026972206948773017], [0.0002554991423686922, 0.9999998064927808, 0.00039832318899401273], [0.0026965738295479497, -0.00039944612541857925, 0.9999956863286328]], 'translation vector': [0.0007808272698826002, -3.308771117738196e-05, 0.0032529965763865576]}\nD: {'rotation matrix': [[-0.929672, -0.179046, 0.321952], [-0.368044, 0.413573, -0.832767], [0.015953, -0.892693, -0.450384]], 'translation vector': [2.09373, 1.925922, 1.362599]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.92969, -0.177823, 0.322577], [-0.368073, 0.414955, -0.832066], [0.014105, -0.892296, -0.451231]], 'translation vector': [2.094699, 1.923867, 1.362793]}\nB: {'rotation matrix': [[-0.929496, -0.179835, 0.322021], [-0.368436, 0.412208, -0.833271], [0.017112, -0.893165, -0.449403]], 'translation vector': [2.092189, 1.927801, 1.363214]}\nC: {'rotation matrix': [[0.9999967402226891, -0.00025435497097484245, -0.0026972206948773017], [0.0002554991423686922, 0.9999998064927808, 0.00039832318899401273], [0.0026965738295479497, -0.00039944612541857925, 0.9999956863286328]], 'translation vector': [0.0007808272698826002, -3.308771117738196e-05, 0.0032529965763865576]}\nD: {'rotation matrix': [[-0.929672, -0.179046, 0.321952], [-0.368044, 0.413573, -0.832767], [0.015953, -0.892693, -0.450384]], 'translation vector': [2.09373, 1.925922, 1.362599]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_118_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_118_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_118_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_118_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.462347, -0.272387, 0.843825], [-0.885267, -0.195868, 0.421827], [0.050379, -0.942041, -0.331694]], 'translation vector': [2.976725, 2.047585, 1.44742]}\nB: {'rotation matrix': [[0.9999659967455217, 0.000730682433916761, -0.008205012696381919], [-0.0006795411337798277, 0.9999808235064653, 0.006171696278259806], [0.008208518663254718, -0.006164897409722228, 0.9999474019525653]], 'translation vector': [0.003919003542433852, -0.0016103743394202397, 0.004248482748549165]}\nC: {'rotation matrix': [[0.463845, -0.2716, 0.843257], [-0.884483, -0.196093, 
0.423364], [0.050371, -0.942221, -0.331182]], 'translation vector': [2.976598, 2.048301, 1.445946]}\nD: {'rotation matrix': [[0.465329, -0.271694, 0.842408], [-0.883772, -0.195467, 0.425135], [0.049156, -0.942324, -0.331072]], 'translation vector': [2.978186, 2.04869, 1.446578]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.462347, -0.272387, 0.843825], [-0.885267, -0.195868, 0.421827], [0.050379, -0.942041, -0.331694]], 'translation vector': [2.976725, 2.047585, 1.44742]}\nB: {'rotation matrix': [[0.9999659967455217, 0.000730682433916761, -0.008205012696381919], [-0.0006795411337798277, 0.9999808235064653, 0.006171696278259806], [0.008208518663254718, -0.006164897409722228, 0.9999474019525653]], 'translation vector': [0.003919003542433852, -0.0016103743394202397, 0.004248482748549165]}\nC: {'rotation matrix': [[0.463845, -0.2716, 0.843257], [-0.884483, -0.196093, 0.423364], [0.050371, -0.942221, -0.331182]], 'translation vector': [2.976598, 2.048301, 1.445946]}\nD: {'rotation matrix': [[0.465329, -0.271694, 0.842408], [-0.883772, -0.195467, 0.425135], [0.049156, -0.942324, -0.331072]], 'translation vector': [2.978186, 2.04869, 1.446578]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_119_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_119_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_119_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_119_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.819777, 0.236537, -0.521552], [0.57264, -0.327401, 0.751593], [0.007023, -0.9148, -0.403846]], 'translation vector': [2.353185, 1.22719, 1.374303]}\nB: {'rotation matrix': [[0.999987888286007, -0.002291943522874061, -0.004318624647106336], [0.002305286110298337, 0.9999924577563948, 0.0032570259245620907], [0.0043115657326277725, -0.0032678213093349246, 0.9999859589268988]], 'translation vector': [0.0029862992996512183, 0.0027957678410703846, 0.00028412393673649117]}\nC: {'rotation matrix': [[0.818568, 0.239176, -0.522246], [0.574347, -0.327388, 0.750296], [0.008476, -0.914118, -0.405359]], 'translation vector': [2.353795, 1.227513, 1.374115]}\nD: {'rotation matrix': [[0.821096, 0.234783, -0.520267], [0.570754, -0.327501, 0.752983], [0.006399, -0.915216, -0.402913]], 'translation vector': [2.353373, 1.227232, 1.3746]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.819777, 0.236537, -0.521552], [0.57264, -0.327401, 0.751593], [0.007023, -0.9148, -0.403846]], 'translation vector': [2.353185, 1.22719, 1.374303]}\nB: {'rotation matrix': [[0.999987888286007, -0.002291943522874061, -0.004318624647106336], [0.002305286110298337, 0.9999924577563948, 0.0032570259245620907], [0.0043115657326277725, -0.0032678213093349246, 0.9999859589268988]], 'translation vector': [0.0029862992996512183, 0.0027957678410703846, 0.00028412393673649117]}\nC: {'rotation matrix': [[0.818568, 0.239176, -0.522246], [0.574347, -0.327388, 0.750296], [0.008476, -0.914118, -0.405359]], 'translation vector': [2.353795, 1.227513, 1.374115]}\nD: {'rotation matrix': [[0.821096, 0.234783, -0.520267], [0.570754, -0.327501, 0.752983], [0.006399, -0.915216, -0.402913]], 'translation vector': [2.353373, 1.227232, 1.3746]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_120_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_120_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_120_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_120_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.416416, 0.454823, -0.787232], [0.907976, 0.163592, -0.38577], [-0.046672, -0.875428, -0.481091]], 'translation vector': [2.42158, 4.677908, 1.279661]}\nB: {'rotation matrix': [[-0.425105, 0.447282, -0.786908], [0.904054, 0.167175, -0.393368], [-0.044395, -0.878631, -0.475434]], 'translation vector': [2.418032, 4.676476, 1.278379]}\nC: {'rotation matrix': [[-0.405457, 0.458134, -0.791023], [0.912898, 0.15832, -0.376234], [-0.047131, -0.87467, -0.482422]], 'translation vector': [2.427205, 4.676823, 1.279665]}\nD: {'rotation matrix': [[0.999724588457419, 
0.011399918490649034, -0.02049681987065299], [-0.011547405749944445, 0.9999079075389897, -0.007141934731129424], [0.020413044185954875, 0.00737693223574732, 0.9997642455565067]], 'translation vector': [0.0017021170863906754, -0.0007418791262142621, 0.002807958654513776]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.416416, 0.454823, -0.787232], [0.907976, 0.163592, -0.38577], [-0.046672, -0.875428, -0.481091]], 'translation vector': [2.42158, 4.677908, 1.279661]}\nB: {'rotation matrix': [[-0.425105, 0.447282, -0.786908], [0.904054, 0.167175, -0.393368], [-0.044395, -0.878631, -0.475434]], 'translation vector': [2.418032, 4.676476, 1.278379]}\nC: {'rotation matrix': [[-0.405457, 0.458134, -0.791023], [0.912898, 0.15832, -0.376234], [-0.047131, -0.87467, -0.482422]], 'translation vector': [2.427205, 4.676823, 1.279665]}\nD: {'rotation matrix': [[0.999724588457419, 0.011399918490649034, -0.02049681987065299], [-0.011547405749944445, 0.9999079075389897, -0.007141934731129424], [0.020413044185954875, 0.00737693223574732, 0.9997642455565067]], 'translation vector': [0.0017021170863906754, -0.0007418791262142621, 0.002807958654513776]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_121_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_121_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_121_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_121_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999991921613425, -0.0009534576315807749, 0.0007322231548443674], [0.0009539109746141209, 0.9999998849157189, 0.0006356436900991667], [-0.0007323849744554394, -0.0006353709242038023, 0.9999990842009482]], 'translation vector': [-0.0019568440092229133, 0.0040178639573226205, -0.0007055440032766036]}\nB: {'rotation matrix': [[-0.677088, 0.408379, -0.612192], [0.735888, 0.380882, -0.559819], [0.004555, -0.829551, -0.558412]], 'translation vector': [3.089066, 2.044868, 1.438859]}\nC: {'rotation matrix': [[-0.677557, 0.408197, -0.611794], [0.735465, 0.379263, -0.561472], [0.002839, -0.830382, -0.557187]], 'translation vector': [3.090277, 2.045193, 1.438377]}\nD: {'rotation matrix': [[-0.677242, 0.408267, -0.612096], [0.73575, 0.380087, -0.56054], [0.003799, -0.829971, -0.557794]], 'translation vector': [3.089461, 2.045596, 1.437863]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999991921613425, -0.0009534576315807749, 0.0007322231548443674], [0.0009539109746141209, 0.9999998849157189, 0.0006356436900991667], [-0.0007323849744554394, -0.0006353709242038023, 0.9999990842009482]], 'translation vector': [-0.0019568440092229133, 0.0040178639573226205, -0.0007055440032766036]}\nB: {'rotation matrix': [[-0.677088, 0.408379, -0.612192], [0.735888, 0.380882, -0.559819], [0.004555, -0.829551, -0.558412]], 'translation vector': [3.089066, 2.044868, 1.438859]}\nC: {'rotation matrix': [[-0.677557, 0.408197, -0.611794], [0.735465, 0.379263, -0.561472], [0.002839, -0.830382, -0.557187]], 'translation vector': [3.090277, 2.045193, 1.438377]}\nD: {'rotation matrix': [[-0.677242, 0.408267, -0.612096], [0.73575, 0.380087, -0.56054], [0.003799, -0.829971, -0.557794]], 'translation vector': [3.089461, 2.045596, 1.437863]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_122_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_122_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_122_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_122_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999933784388596, -0.0034228660793026973, 0.0010334015378228609], [0.003443417069472685, 0.9997723135838638, -0.0210485719107263], [-0.0009619583367667721, 0.021052364637286505, 0.9997776329131831]], 'translation vector': [0.004991626612900646, -0.0024493216023662445, -0.003027628610638544]}\nB: {'rotation matrix': [[-0.068724, 0.196407, -0.978111], [0.997631, 0.016511, -0.06678], [0.003034, -0.980384, -0.197076]], 'translation vector': [6.624384, 2.565858, 1.44421]}\nC: {'rotation matrix': [[-0.062271, 0.18592, -0.98059], [0.998056, 0.014281, 
-0.060673], [0.002724, -0.982461, -0.186448]], 'translation vector': [6.625182, 2.564143, 1.442555]}\nD: {'rotation matrix': [[-0.067121, 0.19262, -0.978975], [0.997737, 0.016917, -0.065078], [0.004026, -0.981128, -0.19332]], 'translation vector': [6.625297, 2.569471, 1.443187]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999933784388596, -0.0034228660793026973, 0.0010334015378228609], [0.003443417069472685, 0.9997723135838638, -0.0210485719107263], [-0.0009619583367667721, 0.021052364637286505, 0.9997776329131831]], 'translation vector': [0.004991626612900646, -0.0024493216023662445, -0.003027628610638544]}\nB: {'rotation matrix': [[-0.068724, 0.196407, -0.978111], [0.997631, 0.016511, -0.06678], [0.003034, -0.980384, -0.197076]], 'translation vector': [6.624384, 2.565858, 1.44421]}\nC: {'rotation matrix': [[-0.062271, 0.18592, -0.98059], [0.998056, 0.014281, -0.060673], [0.002724, -0.982461, -0.186448]], 'translation vector': [6.625182, 2.564143, 1.442555]}\nD: {'rotation matrix': [[-0.067121, 0.19262, -0.978975], [0.997737, 0.016917, -0.065078], [0.004026, -0.981128, -0.19332]], 'translation vector': [6.625297, 2.569471, 1.443187]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_123_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_123_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_123_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_123_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9997697997860496, 0.001019787082464491, -0.021429481643640898], [-0.0009730181353209979, 0.9999969701208679, 0.002158925702055335], [0.021432088446976687, -0.002138201439636503, 0.9997675908556951]], 'translation vector': [0.004448630857523561, 0.0024420113720010628, -0.001058932632110654]}\nB: {'rotation matrix': [[-0.999487, 0.010341, 0.030333], [-0.019706, 0.548122, -0.836166], [-0.025273, -0.836334, -0.547637]], 'translation vector': [4.843515, 3.430529, 1.401708]}\nC: {'rotation matrix': [[-0.998846, 0.024735, 0.04116], [-0.020973, 0.546345, -0.837298], [-0.043199, -0.837195, -0.545196]], 'translation vector': [4.840129, 3.432139, 1.401112]}\nD: {'rotation matrix': [[-0.99921, 0.020117, 0.034274], [-0.017632, 0.548494, -0.835969], [-0.035616, -0.835913, -0.547706]], 'translation vector': [4.841137, 3.430736, 1.401886]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9997697997860496, 0.001019787082464491, -0.021429481643640898], [-0.0009730181353209979, 0.9999969701208679, 0.002158925702055335], [0.021432088446976687, -0.002138201439636503, 0.9997675908556951]], 'translation vector': [0.004448630857523561, 0.0024420113720010628, -0.001058932632110654]}\nB: {'rotation matrix': [[-0.999487, 0.010341, 0.030333], [-0.019706, 0.548122, -0.836166], [-0.025273, -0.836334, -0.547637]], 'translation vector': [4.843515, 3.430529, 1.401708]}\nC: {'rotation matrix': [[-0.998846, 0.024735, 0.04116], [-0.020973, 0.546345, -0.837298], [-0.043199, -0.837195, -0.545196]], 'translation vector': [4.840129, 3.432139, 1.401112]}\nD: {'rotation matrix': [[-0.99921, 0.020117, 0.034274], [-0.017632, 0.548494, -0.835969], [-0.035616, -0.835913, -0.547706]], 'translation vector': [4.841137, 3.430736, 1.401886]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_124_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_124_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_124_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_124_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.973747, -0.109977, 0.199301], [-0.227471, -0.502961, 0.833839], [0.008537, -0.857284, -0.514773]], 'translation vector': [3.554081, 1.206281, 1.35243]}\nB: {'rotation matrix': [[0.974665, -0.108996, 0.195317], [-0.223559, -0.502331, 0.835276], [0.007072, -0.857778, -0.513971]], 'translation vector': [3.555352, 1.206811, 1.353912]}\nC: {'rotation matrix': [[0.975504, -0.107044, 0.192183], [-0.219886, -0.500464, 0.837368], [0.006546, -0.859114, -0.511742]], 'translation vector': [3.5544, 1.207723, 1.355687]}\nD: {'rotation matrix': [[0.9999906036647181, 
0.002816837265478036, -0.0034361334791159484], [-0.0028229898836968203, 0.9999947431038866, -0.0015384023641953812], [0.0034319714995403, 0.0015489580591809052, 0.9999926274107623]], 'translation vector': [0.0002503237708082473, -0.0002760600759463827, -0.00019478740093437086]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.973747, -0.109977, 0.199301], [-0.227471, -0.502961, 0.833839], [0.008537, -0.857284, -0.514773]], 'translation vector': [3.554081, 1.206281, 1.35243]}\nB: {'rotation matrix': [[0.974665, -0.108996, 0.195317], [-0.223559, -0.502331, 0.835276], [0.007072, -0.857778, -0.513971]], 'translation vector': [3.555352, 1.206811, 1.353912]}\nC: {'rotation matrix': [[0.975504, -0.107044, 0.192183], [-0.219886, -0.500464, 0.837368], [0.006546, -0.859114, -0.511742]], 'translation vector': [3.5544, 1.207723, 1.355687]}\nD: {'rotation matrix': [[0.9999906036647181, 0.002816837265478036, -0.0034361334791159484], [-0.0028229898836968203, 0.9999947431038866, -0.0015384023641953812], [0.0034319714995403, 0.0015489580591809052, 0.9999926274107623]], 'translation vector': [0.0002503237708082473, -0.0002760600759463827, -0.00019478740093437086]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_125_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_125_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_125_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_125_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.153679, 0.256881, -0.954146], [0.987333, 0.001369, -0.158656], [-0.03945, -0.966442, -0.253837]], 'translation vector': [1.842026, 1.203469, 1.473211]}\nB: {'rotation matrix': [[-0.151778, 0.257722, -0.954224], [0.987593, 0.000186, -0.157036], [-0.040294, -0.966219, -0.254553]], 'translation vector': [1.842306, 1.202322, 1.472604]}\nC: {'rotation matrix': [[-0.149914, 0.257434, -0.954596], [0.987791, -0.002361, -0.155764], [-0.042353, -0.966293, -0.253937]], 'translation vector': [1.843622, 1.201203, 1.472192]}\nD: {'rotation matrix': [[0.9999992738268638, -0.00012046241721948631, -0.0012199092118460354], [0.0001219402230910216, 0.9999989013124573, 0.0017389291337165482], [0.0012191992898708535, -0.0017382297355628953, 0.9999985296461054]], 'translation vector': [9.159323589502666e-05, -0.0060291788848427785, 8.443913047839757e-05]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.153679, 0.256881, -0.954146], [0.987333, 0.001369, -0.158656], [-0.03945, -0.966442, -0.253837]], 'translation vector': [1.842026, 1.203469, 1.473211]}\nB: {'rotation matrix': [[-0.151778, 0.257722, -0.954224], [0.987593, 0.000186, -0.157036], [-0.040294, -0.966219, -0.254553]], 'translation vector': [1.842306, 1.202322, 1.472604]}\nC: {'rotation matrix': [[-0.149914, 0.257434, -0.954596], [0.987791, -0.002361, -0.155764], [-0.042353, -0.966293, -0.253937]], 'translation vector': [1.843622, 1.201203, 1.472192]}\nD: {'rotation matrix': [[0.9999992738268638, -0.00012046241721948631, -0.0012199092118460354], [0.0001219402230910216, 0.9999989013124573, 0.0017389291337165482], [0.0012191992898708535, -0.0017382297355628953, 0.9999985296461054]], 'translation vector': [9.159323589502666e-05, -0.0060291788848427785, 8.443913047839757e-05]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_126_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_126_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_126_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_126_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.14824, 0.422945, -0.893948], [0.983241, -0.033972, -0.17912], [-0.106127, -0.905518, -0.410821]], 'translation vector': [4.004252, 0.906944, 2.572337]}\nB: {'rotation matrix': [[-0.144176, 0.428291, -0.892065], [0.983875, -0.034383, -0.175522], [-0.105847, -0.902987, -0.416427]], 'translation vector': [4.001886, 0.906293, 2.57387]}\nC: {'rotation matrix': [[0.9999802090304492, -0.0006608909445885984, -0.006377640026736867], [0.0006622709420172097, 0.9999989071934312, 0.0002893250642296396], [0.006377955510407445, -0.00029429125031086645, 
0.9999796120375704]], 'translation vector': [0.0015109144602648002, -0.0040381270140653625, -0.0009682382039999382]}\nD: {'rotation matrix': [[-0.139849, 0.432923, -0.890517], [0.98469, -0.03371, -0.171026], [-0.10406, -0.9008, -0.42158]], 'translation vector': [3.996022, 0.9047, 2.579904]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.14824, 0.422945, -0.893948], [0.983241, -0.033972, -0.17912], [-0.106127, -0.905518, -0.410821]], 'translation vector': [4.004252, 0.906944, 2.572337]}\nB: {'rotation matrix': [[-0.144176, 0.428291, -0.892065], [0.983875, -0.034383, -0.175522], [-0.105847, -0.902987, -0.416427]], 'translation vector': [4.001886, 0.906293, 2.57387]}\nC: {'rotation matrix': [[0.9999802090304492, -0.0006608909445885984, -0.006377640026736867], [0.0006622709420172097, 0.9999989071934312, 0.0002893250642296396], [0.006377955510407445, -0.00029429125031086645, 0.9999796120375704]], 'translation vector': [0.0015109144602648002, -0.0040381270140653625, -0.0009682382039999382]}\nD: {'rotation matrix': [[-0.139849, 0.432923, -0.890517], [0.98469, -0.03371, -0.171026], [-0.10406, -0.9008, -0.42158]], 'translation vector': [3.996022, 0.9047, 2.579904]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_127_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_127_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_127_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_127_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": 
"3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.95128, 0.171677, -0.256112], [0.307849, -0.482535, 0.819993], [0.017191, -0.858887, -0.511877]], 'translation vector': [2.918653, 3.427386, 1.515216]}\nB: {'rotation matrix': [[0.9999949680507799, -0.0030198797325234252, -0.001049278425799119], [0.0030204031206264347, 0.9999953358677364, 0.000916435423205514], [0.0010474297666525848, -0.0009198822713830659, 0.9999990387486941]], 'translation vector': [-0.00019299961249297226, -0.0019013116010877518, -0.0012501965874700538]}\nC: {'rotation matrix': [[0.951329, 0.168071, -0.258311], [0.307858, -0.480204, 0.821357], [0.014004, -0.860905, -0.508574]], 'translation vector': [2.920244, 3.426191, 1.515625]}\nD: {'rotation matrix': [[0.951137, 0.174865, -0.254481], [0.308106, -0.483514, 0.81932], [0.020225, -0.857693, -0.513765]], 'translation vector': [2.916759, 3.427486, 1.515303]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.95128, 0.171677, -0.256112], [0.307849, -0.482535, 0.819993], [0.017191, -0.858887, -0.511877]], 'translation vector': [2.918653, 3.427386, 1.515216]}\nB: {'rotation matrix': [[0.9999949680507799, -0.0030198797325234252, -0.001049278425799119], [0.0030204031206264347, 0.9999953358677364, 0.000916435423205514], [0.0010474297666525848, -0.0009198822713830659, 0.9999990387486941]], 'translation vector': [-0.00019299961249297226, -0.0019013116010877518, -0.0012501965874700538]}\nC: {'rotation matrix': [[0.951329, 0.168071, -0.258311], [0.307858, -0.480204, 0.821357], [0.014004, -0.860905, -0.508574]], 'translation vector': [2.920244, 3.426191, 1.515625]}\nD: {'rotation matrix': [[0.951137, 0.174865, -0.254481], [0.308106, -0.483514, 0.81932], [0.020225, -0.857693, -0.513765]], 'translation vector': [2.916759, 3.427486, 1.515303]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_128_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_128_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_128_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_128_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999948635923562, 0.0019820469436732306, 0.002154140600797888], [-0.0019903388630383317, 0.9999924231566063, 0.003542917026556551], [-0.0021468846137928286, -0.0035469945961230523, 0.999992296254944]], 'translation vector': [-5.7007563989186494e-05, -0.0006793783086549987, -0.00012474555531971632]}\nB: {'rotation matrix': [[-0.933451, -0.165748, 0.318116], [-0.358704, 0.434072, -0.826385], [-0.001114, -0.885499, -0.464639]], 'translation vector': [1.119556, 2.234202, 1.400117]}\nC: {'rotation matrix': [[-0.933995, -0.170592, 0.31393], [-0.357261, 0.435306, 
-0.826362], [0.004315, -0.883973, -0.467519]], 'translation vector': [1.117768, 2.23249, 1.399859]}\nD: {'rotation matrix': [[-0.93341, -0.169242, 0.31639], [-0.358807, 0.435851, -0.825404], [0.001794, -0.883963, -0.467553]], 'translation vector': [1.117643, 2.232584, 1.400741]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999948635923562, 0.0019820469436732306, 0.002154140600797888], [-0.0019903388630383317, 0.9999924231566063, 0.003542917026556551], [-0.0021468846137928286, -0.0035469945961230523, 0.999992296254944]], 'translation vector': [-5.7007563989186494e-05, -0.0006793783086549987, -0.00012474555531971632]}\nB: {'rotation matrix': [[-0.933451, -0.165748, 0.318116], [-0.358704, 0.434072, -0.826385], [-0.001114, -0.885499, -0.464639]], 'translation vector': [1.119556, 2.234202, 1.400117]}\nC: {'rotation matrix': [[-0.933995, -0.170592, 0.31393], [-0.357261, 0.435306, -0.826362], [0.004315, -0.883973, -0.467519]], 'translation vector': [1.117768, 2.23249, 1.399859]}\nD: {'rotation matrix': [[-0.93341, -0.169242, 0.31639], [-0.358807, 0.435851, -0.825404], [0.001794, -0.883963, -0.467553]], 'translation vector': [1.117643, 2.232584, 1.400741]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_129_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_129_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_129_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_129_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d 
image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.748267, 0.274864, -0.603777], [0.662514, -0.356598, 0.658721], [-0.034247, -0.892909, -0.448932]], 'translation vector': [2.689408, 2.67138, 1.313352]}\nB: {'rotation matrix': [[0.746223, 0.273332, -0.606993], [0.664679, -0.356291, 0.656703], [-0.036768, -0.893502, -0.447551]], 'translation vector': [2.678885, 2.679979, 1.310144]}\nC: {'rotation matrix': [[0.9999399674236885, 0.0008968793534765223, 0.01089808273108475], [-0.000989300454985631, 0.9999632239345498, 0.008537240164508266], [-0.010889507071785043, -0.008547033142130317, 0.9999033050990314]], 'translation vector': [-0.01197293045142045, -0.02229876967815736, 0.03026515941132618]}\nD: {'rotation matrix': [[0.750155, 0.270973, -0.603193], [0.660278, -0.356699, 0.660908], [-0.03607, -0.894058, -0.446497]], 'translation vector': [2.698287, 2.659688, 1.315667]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.748267, 0.274864, -0.603777], [0.662514, -0.356598, 0.658721], [-0.034247, -0.892909, -0.448932]], 'translation vector': [2.689408, 2.67138, 1.313352]}\nB: {'rotation matrix': [[0.746223, 0.273332, -0.606993], [0.664679, -0.356291, 0.656703], [-0.036768, -0.893502, -0.447551]], 'translation vector': [2.678885, 2.679979, 1.310144]}\nC: {'rotation matrix': [[0.9999399674236885, 0.0008968793534765223, 0.01089808273108475], [-0.000989300454985631, 0.9999632239345498, 0.008537240164508266], [-0.010889507071785043, -0.008547033142130317, 0.9999033050990314]], 'translation vector': [-0.01197293045142045, -0.02229876967815736, 0.03026515941132618]}\nD: {'rotation matrix': [[0.750155, 0.270973, -0.603193], [0.660278, -0.356699, 0.660908], [-0.03607, -0.894058, -0.446497]], 'translation vector': [2.698287, 2.659688, 1.315667]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_130_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_130_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_130_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_130_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.696591, -0.332063, 0.636], [-0.715704, 0.259464, -0.648419], [0.050297, -0.90687, -0.418398]], 'translation vector': [0.055261, 3.785911, 1.510756]}\nB: {'rotation matrix': [[0.9999892202506441, -0.00432806217171397, -0.0014665396078038379], [0.0043283836537151305, 0.9999910348386083, -5.023853182741767e-05], [0.0014665058361795998, 4.44487020677531e-05, 0.9999983390207243]], 'translation vector': [0.0011882249140264811, -0.004541217231683825, 0.003677767622445316]}\nC: {'rotation matrix': [[-0.698852, -0.328674, 0.635279], [-0.713642, 0.26058, -0.65024], 
[0.048176, -0.907784, -0.416662]], 'translation vector': [0.047395, 3.788746, 1.502043]}\nD: {'rotation matrix': [[-0.698666, -0.330448, 0.634563], [-0.713793, 0.261647, -0.649647], [0.048643, -0.906832, -0.418676]], 'translation vector': [0.050863, 3.788018, 1.507423]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.696591, -0.332063, 0.636], [-0.715704, 0.259464, -0.648419], [0.050297, -0.90687, -0.418398]], 'translation vector': [0.055261, 3.785911, 1.510756]}\nB: {'rotation matrix': [[0.9999892202506441, -0.00432806217171397, -0.0014665396078038379], [0.0043283836537151305, 0.9999910348386083, -5.023853182741767e-05], [0.0014665058361795998, 4.44487020677531e-05, 0.9999983390207243]], 'translation vector': [0.0011882249140264811, -0.004541217231683825, 0.003677767622445316]}\nC: {'rotation matrix': [[-0.698852, -0.328674, 0.635279], [-0.713642, 0.26058, -0.65024], [0.048176, -0.907784, -0.416662]], 'translation vector': [0.047395, 3.788746, 1.502043]}\nD: {'rotation matrix': [[-0.698666, -0.330448, 0.634563], [-0.713793, 0.261647, -0.649647], [0.048643, -0.906832, -0.418676]], 'translation vector': [0.050863, 3.788018, 1.507423]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_131_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_131_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_131_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_131_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999994384686345, -0.0006788559243182462, -0.0008560007213187077], [0.0006780059132462143, 0.9999998393114027, -0.000711634963528042], [0.0008572035387077444, 0.0007099766954480035, 0.9999996039685791]], 'translation vector': [0.0021169101928586176, 0.0016932348180918044, -0.0014691902955634717]}\nB: {'rotation matrix': [[-0.895004, 0.171136, -0.411923], [0.445772, 0.376296, -0.812213], [0.016006, -0.910557, -0.413074]], 'translation vector': [2.821576, 5.408109, 1.547241]}\nC: {'rotation matrix': [[-0.895238, 0.170954, -0.411491], [0.44529, 0.377097, -0.812105], [0.016339, -0.910259, -0.413716]], 'translation vector': [2.819563, 5.407667, 1.547957]}\nD: {'rotation matrix': [[-0.895239, 0.171618, -0.411211], [0.445324, 0.376235, -0.812486], [0.015275, -0.910491, -0.413246]], 'translation vector': [2.820169, 5.40833, 1.547624]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999994384686345, -0.0006788559243182462, -0.0008560007213187077], [0.0006780059132462143, 0.9999998393114027, -0.000711634963528042], [0.0008572035387077444, 0.0007099766954480035, 0.9999996039685791]], 'translation vector': [0.0021169101928586176, 0.0016932348180918044, -0.0014691902955634717]}\nB: {'rotation matrix': [[-0.895004, 0.171136, -0.411923], [0.445772, 0.376296, -0.812213], [0.016006, -0.910557, -0.413074]], 'translation vector': [2.821576, 5.408109, 1.547241]}\nC: {'rotation matrix': [[-0.895238, 0.170954, -0.411491], [0.44529, 0.377097, -0.812105], [0.016339, -0.910259, -0.413716]], 'translation vector': [2.819563, 5.407667, 1.547957]}\nD: {'rotation matrix': [[-0.895239, 0.171618, -0.411211], [0.445324, 0.376235, -0.812486], [0.015275, -0.910491, -0.413246]], 'translation vector': [2.820169, 5.40833, 1.547624]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_132_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_132_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_132_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_132_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.637806, -0.366719, 0.67729], [-0.770059, -0.286951, 0.569797], [-0.014606, -0.884972, -0.465415]], 'translation vector': [2.635432, 2.237918, 1.453759]}\nB: {'rotation matrix': [[0.639756, -0.365353, 0.676188], [-0.768417, -0.286037, 0.572466], [-0.015738, -0.885833, -0.463738]], 'translation vector': [2.635672, 2.238828, 1.45525]}\nC: {'rotation matrix': [[0.999992450941996, 0.003938168732013472, 0.0002619981628988685], [-0.003940060772482587, 0.9999806955317443, 0.004722690751934828], [-0.00024268998302535557, -0.004722987499164323, 
0.9999892225354342]], 'translation vector': [-0.005978537327156946, -0.0007775878287423765, 0.0022633181070532693]}\nD: {'rotation matrix': [[0.636585, -0.368058, 0.677712], [-0.771071, -0.287285, 0.568258], [-0.014455, -0.884308, -0.46668]], 'translation vector': [2.636608, 2.236841, 1.454577]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.637806, -0.366719, 0.67729], [-0.770059, -0.286951, 0.569797], [-0.014606, -0.884972, -0.465415]], 'translation vector': [2.635432, 2.237918, 1.453759]}\nB: {'rotation matrix': [[0.639756, -0.365353, 0.676188], [-0.768417, -0.286037, 0.572466], [-0.015738, -0.885833, -0.463738]], 'translation vector': [2.635672, 2.238828, 1.45525]}\nC: {'rotation matrix': [[0.999992450941996, 0.003938168732013472, 0.0002619981628988685], [-0.003940060772482587, 0.9999806955317443, 0.004722690751934828], [-0.00024268998302535557, -0.004722987499164323, 0.9999892225354342]], 'translation vector': [-0.005978537327156946, -0.0007775878287423765, 0.0022633181070532693]}\nD: {'rotation matrix': [[0.636585, -0.368058, 0.677712], [-0.771071, -0.287285, 0.568258], [-0.014455, -0.884308, -0.46668]], 'translation vector': [2.636608, 2.236841, 1.454577]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_133_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_133_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_133_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_133_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + 
"visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.354317, -0.208867, 0.911501], [-0.934632, 0.110757, -0.337929], [-0.030372, -0.971652, -0.234457]], 'translation vector': [0.531753, 4.839624, 1.62588]}\nB: {'rotation matrix': [[-0.359065, -0.216471, 0.907862], [-0.932968, 0.109695, -0.342839], [-0.025373, -0.970107, -0.241348]], 'translation vector': [0.533016, 4.840936, 1.625213]}\nC: {'rotation matrix': [[0.9999996059479784, -0.0012011060402040321, 0.0006047856645127561], [0.0011984060508814654, 0.9999921678086844, 0.003627178785230534], [-0.000607755557205814, -0.0036253860701085153, 0.999994001644102]], 'translation vector': [-0.0020457167014393818, -0.01042060812880563, 0.003252619468668172]}\nD: {'rotation matrix': [[-0.356177, -0.213479, 0.909706], [-0.934025, 0.10958, -0.339984], [-0.027106, -0.970783, -0.238424]], 'translation vector': [0.532497, 4.839391, 1.625248]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.354317, -0.208867, 0.911501], [-0.934632, 0.110757, -0.337929], [-0.030372, -0.971652, -0.234457]], 'translation vector': [0.531753, 4.839624, 1.62588]}\nB: {'rotation matrix': [[-0.359065, -0.216471, 0.907862], [-0.932968, 0.109695, -0.342839], [-0.025373, -0.970107, -0.241348]], 'translation vector': [0.533016, 4.840936, 1.625213]}\nC: {'rotation matrix': [[0.9999996059479784, -0.0012011060402040321, 0.0006047856645127561], [0.0011984060508814654, 0.9999921678086844, 0.003627178785230534], [-0.000607755557205814, -0.0036253860701085153, 0.999994001644102]], 'translation vector': [-0.0020457167014393818, -0.01042060812880563, 0.003252619468668172]}\nD: {'rotation matrix': [[-0.356177, -0.213479, 0.909706], [-0.934025, 0.10958, -0.339984], [-0.027106, -0.970783, -0.238424]], 'translation vector': [0.532497, 4.839391, 1.625248]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_134_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_134_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_134_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_134_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999831834333464, -3.507485354961902e-05, -0.005860591935981322], [1.2894070442508321e-05, 0.999992942835117, -0.003855834283817326], [0.00585975547828403, 0.003855906716345711, 0.9999751812718962]], 'translation vector': [-9.176265998767086e-05, -0.003526493044568424, -0.0013563859537040202]}\nB: {'rotation matrix': [[-0.77208, 0.081888, -0.630228], [0.634233, 0.036058, -0.772301], [-0.040517, -0.995989, -0.079776]], 'translation vector': [4.355151, 2.275217, 1.510745]}\nC: {'rotation matrix': [[-0.769009, 0.085964, -0.633432], [0.638035, 0.042436, 
-0.768838], [-0.039212, -0.995394, -0.087482]], 'translation vector': [4.353152, 2.272772, 1.50454]}\nD: {'rotation matrix': [[-0.770144, 0.083681, -0.632358], [0.636632, 0.03909, -0.770176], [-0.03973, -0.995726, -0.083379]], 'translation vector': [4.354443, 2.273597, 1.508503]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999831834333464, -3.507485354961902e-05, -0.005860591935981322], [1.2894070442508321e-05, 0.999992942835117, -0.003855834283817326], [0.00585975547828403, 0.003855906716345711, 0.9999751812718962]], 'translation vector': [-9.176265998767086e-05, -0.003526493044568424, -0.0013563859537040202]}\nB: {'rotation matrix': [[-0.77208, 0.081888, -0.630228], [0.634233, 0.036058, -0.772301], [-0.040517, -0.995989, -0.079776]], 'translation vector': [4.355151, 2.275217, 1.510745]}\nC: {'rotation matrix': [[-0.769009, 0.085964, -0.633432], [0.638035, 0.042436, -0.768838], [-0.039212, -0.995394, -0.087482]], 'translation vector': [4.353152, 2.272772, 1.50454]}\nD: {'rotation matrix': [[-0.770144, 0.083681, -0.632358], [0.636632, 0.03909, -0.770176], [-0.03973, -0.995726, -0.083379]], 'translation vector': [4.354443, 2.273597, 1.508503]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_135_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_135_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_135_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_135_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.872251, 0.269436, -0.408146], [0.489057, 0.477878, -0.729696], [-0.001562, -0.836084, -0.548599]], 'translation vector': [2.680995, 3.11951, 1.281605]}\nB: {'rotation matrix': [[0.9999175427731898, 0.009289450845399635, 0.008882740398963147], [-0.00915148449802066, 0.9998381508008006, -0.015456879997285015], [-0.00902528905222651, 0.015374430101541683, 0.9998409413313208]], 'translation vector': [-0.02453370105938546, 0.014905487027389919, -0.03059606364374634]}\nC: {'rotation matrix': [[-0.872521, 0.262383, -0.412143], [0.488544, 0.478168, -0.729849], [0.005573, -0.838159, -0.545398]], 'translation vector': [2.690634, 3.125973, 1.284562]}\nD: {'rotation matrix': [[-0.871338, 0.255355, -0.419003], [0.490471, 0.478377, -0.728419], [0.014436, -0.840208, -0.542072]], 'translation vector': [2.702949, 3.129856, 1.287257]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.872251, 0.269436, -0.408146], [0.489057, 0.477878, -0.729696], [-0.001562, -0.836084, -0.548599]], 'translation vector': [2.680995, 3.11951, 1.281605]}\nB: {'rotation matrix': [[0.9999175427731898, 0.009289450845399635, 0.008882740398963147], [-0.00915148449802066, 0.9998381508008006, -0.015456879997285015], [-0.00902528905222651, 0.015374430101541683, 0.9998409413313208]], 'translation vector': [-0.02453370105938546, 0.014905487027389919, -0.03059606364374634]}\nC: {'rotation matrix': [[-0.872521, 0.262383, -0.412143], [0.488544, 0.478168, -0.729849], [0.005573, -0.838159, -0.545398]], 'translation vector': [2.690634, 3.125973, 1.284562]}\nD: {'rotation matrix': [[-0.871338, 0.255355, -0.419003], [0.490471, 0.478377, -0.728419], [0.014436, -0.840208, -0.542072]], 'translation vector': [2.702949, 3.129856, 1.287257]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_136_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_136_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_136_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_136_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.333056, -0.473197, 0.815573], [-0.9429, 0.170506, -0.286124], [-0.003667, -0.864299, -0.502965]], 'translation vector': [2.099262, 2.343947, 1.49878]}\nB: {'rotation matrix': [[-0.341507, -0.468371, 0.814864], [-0.939879, 0.169312, -0.296582], [0.000944, -0.867158, -0.498033]], 'translation vector': [2.09227, 2.339374, 1.500507]}\nC: {'rotation matrix': [[-0.349241, -0.464358, 0.813882], [-0.937028, 0.170072, -0.305049], [0.003234, -0.869165, -0.494512]], 'translation vector': [2.088692, 2.33782, 1.505356]}\nD: {'rotation matrix': [[0.9999635418289009, 
-0.005130634438046651, -0.00688590414124517], [0.00513692191174813, 0.9999857138339602, 0.0010725741392655886], [0.00688015226508057, -0.0011072559806139443, 0.9999748317531684]], 'translation vector': [-0.026001101753266642, -0.0071394396285136, 0.008639096069164354]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.333056, -0.473197, 0.815573], [-0.9429, 0.170506, -0.286124], [-0.003667, -0.864299, -0.502965]], 'translation vector': [2.099262, 2.343947, 1.49878]}\nB: {'rotation matrix': [[-0.341507, -0.468371, 0.814864], [-0.939879, 0.169312, -0.296582], [0.000944, -0.867158, -0.498033]], 'translation vector': [2.09227, 2.339374, 1.500507]}\nC: {'rotation matrix': [[-0.349241, -0.464358, 0.813882], [-0.937028, 0.170072, -0.305049], [0.003234, -0.869165, -0.494512]], 'translation vector': [2.088692, 2.33782, 1.505356]}\nD: {'rotation matrix': [[0.9999635418289009, -0.005130634438046651, -0.00688590414124517], [0.00513692191174813, 0.9999857138339602, 0.0010725741392655886], [0.00688015226508057, -0.0011072559806139443, 0.9999748317531684]], 'translation vector': [-0.026001101753266642, -0.0071394396285136, 0.008639096069164354]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_137_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_137_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_137_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_137_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.030402, 0.425954, -0.904234], [0.998503, -0.028211, -0.04686], [-0.045469, -0.904305, -0.424459]], 'translation vector': [2.422483, 1.358004, 3.279846]}\nB: {'rotation matrix': [[-0.030422, 0.425378, -0.904504], [0.99853, -0.027681, -0.046602], [-0.044861, -0.904592, -0.42391]], 'translation vector': [2.423117, 1.357937, 3.279462]}\nC: {'rotation matrix': [[0.9999413192700902, -0.00031349790801336034, -0.010858677567769605], [0.00020550611921538246, 0.9999497154111819, -0.010006476843209223], [0.010861290904906616, 0.010003092803207396, 0.9998904023572843]], 'translation vector': [-0.0025713849698387747, -0.003845445277962156, -0.00016886354172340745]}\nD: {'rotation matrix': [[-0.029484, 0.425058, -0.904686], [0.998566, -0.027942, -0.045671], [-0.044692, -0.904735, -0.423624]], 'translation vector': [2.421348, 1.3572, 3.28135]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.030402, 0.425954, -0.904234], [0.998503, -0.028211, -0.04686], [-0.045469, -0.904305, -0.424459]], 'translation vector': [2.422483, 1.358004, 3.279846]}\nB: {'rotation matrix': [[-0.030422, 0.425378, -0.904504], [0.99853, -0.027681, -0.046602], [-0.044861, -0.904592, -0.42391]], 'translation vector': [2.423117, 1.357937, 3.279462]}\nC: {'rotation matrix': [[0.9999413192700902, -0.00031349790801336034, -0.010858677567769605], [0.00020550611921538246, 0.9999497154111819, -0.010006476843209223], [0.010861290904906616, 0.010003092803207396, 0.9998904023572843]], 'translation vector': [-0.0025713849698387747, -0.003845445277962156, -0.00016886354172340745]}\nD: {'rotation matrix': [[-0.029484, 0.425058, -0.904686], [0.998566, -0.027942, -0.045671], [-0.044692, -0.904735, -0.423624]], 'translation vector': [2.421348, 1.3572, 3.28135]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_138_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_138_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_138_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_138_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.765303, 0.126374, -0.631143], [0.606826, -0.468638, 0.641982], [-0.214648, -0.874305, -0.435336]], 'translation vector': [4.259322, 3.776065, 1.503445]}\nB: {'rotation matrix': [[0.771053, 0.12608, -0.624165], [0.599963, -0.472273, 0.645757], [-0.213359, -0.872388, -0.439791]], 'translation vector': [4.254354, 3.773882, 1.500145]}\nC: {'rotation matrix': [[0.9999284250414853, 0.0013159481143052354, -0.011853419692681406], [-0.0013716672277964584, 0.9999877220001062, -0.004620807232633075], [0.011847871043956123, 0.004636177506016426, 
0.9999191533457081]], 'translation vector': [-0.0029675207616195465, 0.002877998549804417, -0.005058945419356364]}\nD: {'rotation matrix': [[0.774333, 0.127442, -0.619813], [0.595393, -0.478434, 0.645452], [-0.214281, -0.868826, -0.446345]], 'translation vector': [4.253978, 3.779827, 1.501383]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.765303, 0.126374, -0.631143], [0.606826, -0.468638, 0.641982], [-0.214648, -0.874305, -0.435336]], 'translation vector': [4.259322, 3.776065, 1.503445]}\nB: {'rotation matrix': [[0.771053, 0.12608, -0.624165], [0.599963, -0.472273, 0.645757], [-0.213359, -0.872388, -0.439791]], 'translation vector': [4.254354, 3.773882, 1.500145]}\nC: {'rotation matrix': [[0.9999284250414853, 0.0013159481143052354, -0.011853419692681406], [-0.0013716672277964584, 0.9999877220001062, -0.004620807232633075], [0.011847871043956123, 0.004636177506016426, 0.9999191533457081]], 'translation vector': [-0.0029675207616195465, 0.002877998549804417, -0.005058945419356364]}\nD: {'rotation matrix': [[0.774333, 0.127442, -0.619813], [0.595393, -0.478434, 0.645452], [-0.214281, -0.868826, -0.446345]], 'translation vector': [4.253978, 3.779827, 1.501383]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_139_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_139_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_139_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_139_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": 
"3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.90788, 0.151333, -0.390964], [0.408575, 0.110464, -0.906016], [-0.093922, -0.982291, -0.162118]], 'translation vector': [8.818443, 3.831761, 1.477683]}\nB: {'rotation matrix': [[0.9999985311997388, 0.0014822343891824376, 0.0004132906130215321], [-0.0014816934035833144, 0.9999987577699637, 9.7059989439923e-05], [-0.00041331535706021807, -9.78109674475786e-05, 1.0000005199156523]], 'translation vector': [-0.004405482725633902, -0.00022188509424603264, 0.00016042860388854052]}\nC: {'rotation matrix': [[-0.90752, 0.150251, -0.392214], [0.409699, 0.111045, -0.905437], [-0.092489, -0.982392, -0.162333]], 'translation vector': [8.816371, 3.832904, 1.475888]}\nD: {'rotation matrix': [[-0.907271, 0.148469, -0.393466], [0.410673, 0.111241, -0.904972], [-0.090591, -0.982641, -0.161898]], 'translation vector': [8.814532, 3.834109, 1.474353]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.90788, 0.151333, -0.390964], [0.408575, 0.110464, -0.906016], [-0.093922, -0.982291, -0.162118]], 'translation vector': [8.818443, 3.831761, 1.477683]}\nB: {'rotation matrix': [[0.9999985311997388, 0.0014822343891824376, 0.0004132906130215321], [-0.0014816934035833144, 0.9999987577699637, 9.7059989439923e-05], [-0.00041331535706021807, -9.78109674475786e-05, 1.0000005199156523]], 'translation vector': [-0.004405482725633902, -0.00022188509424603264, 0.00016042860388854052]}\nC: {'rotation matrix': [[-0.90752, 0.150251, -0.392214], [0.409699, 0.111045, -0.905437], [-0.092489, -0.982392, -0.162333]], 'translation vector': [8.816371, 3.832904, 1.475888]}\nD: {'rotation matrix': [[-0.907271, 0.148469, -0.393466], [0.410673, 0.111241, -0.904972], [-0.090591, -0.982641, -0.161898]], 'translation vector': [8.814532, 3.834109, 1.474353]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_140_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_140_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_140_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_140_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.999990277128188, -0.004263613841972057, 0.00022215499723240068], [0.004262324646153357, 0.9999855274813817, 0.003172824698500956], [-0.00023539502566653948, -0.003171844248534696, 0.9999944918809965]], 'translation vector': [0.0008976741232249452, 0.001107833658419377, 0.002287318056557207]}\nB: {'rotation matrix': [[0.982661, 0.058297, -0.176007], [0.185241, -0.268064, 0.945425], [0.007934, -0.961636, -0.274215]], 'translation vector': [4.071507, 1.217171, 1.479186]}\nC: {'rotation matrix': [[0.982484, 0.05703, -0.177406], [0.186213, -0.264319, 
0.946288], [0.007075, -0.962748, -0.270309]], 'translation vector': [4.071419, 1.216069, 1.480649]}\nD: {'rotation matrix': [[0.98266, 0.058843, -0.175828], [0.185228, -0.2691, 0.945133], [0.008299, -0.961313, -0.275333]], 'translation vector': [4.071304, 1.217707, 1.478697]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.999990277128188, -0.004263613841972057, 0.00022215499723240068], [0.004262324646153357, 0.9999855274813817, 0.003172824698500956], [-0.00023539502566653948, -0.003171844248534696, 0.9999944918809965]], 'translation vector': [0.0008976741232249452, 0.001107833658419377, 0.002287318056557207]}\nB: {'rotation matrix': [[0.982661, 0.058297, -0.176007], [0.185241, -0.268064, 0.945425], [0.007934, -0.961636, -0.274215]], 'translation vector': [4.071507, 1.217171, 1.479186]}\nC: {'rotation matrix': [[0.982484, 0.05703, -0.177406], [0.186213, -0.264319, 0.946288], [0.007075, -0.962748, -0.270309]], 'translation vector': [4.071419, 1.216069, 1.480649]}\nD: {'rotation matrix': [[0.98266, 0.058843, -0.175828], [0.185228, -0.2691, 0.945133], [0.008299, -0.961313, -0.275333]], 'translation vector': [4.071304, 1.217707, 1.478697]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_141_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_141_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_141_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_141_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.566945, -0.119787, 0.815], [-0.823733, -0.07511, 0.561981], [-0.006103, -0.989954, -0.141256]], 'translation vector': [0.25398, 0.970235, 1.632712]}\nB: {'rotation matrix': [[0.566333, -0.122518, 0.81502], [-0.824133, -0.073956, 0.561548], [-0.008524, -0.989707, -0.142854]], 'translation vector': [0.252647, 0.969528, 1.633147]}\nC: {'rotation matrix': [[0.565918, -0.124531, 0.815003], [-0.824401, -0.073416, 0.561226], [-0.010056, -0.989496, -0.144211]], 'translation vector': [0.251636, 0.969331, 1.634009]}\nD: {'rotation matrix': [[0.9999988126320164, 0.0003312007428265276, -0.0004969284405179408], [-0.00033114016656371985, 0.9999998966821256, -0.0003458941517538565], [0.0004968245827711156, 0.000346552992150063, 0.9999997733502279]], 'translation vector': [5.9331917010907453e-05, -0.0008045063572443834, -0.0004101833402060384]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.566945, -0.119787, 0.815], [-0.823733, -0.07511, 0.561981], [-0.006103, -0.989954, -0.141256]], 'translation vector': [0.25398, 0.970235, 1.632712]}\nB: {'rotation matrix': [[0.566333, -0.122518, 0.81502], [-0.824133, -0.073956, 0.561548], [-0.008524, -0.989707, -0.142854]], 'translation vector': [0.252647, 0.969528, 1.633147]}\nC: {'rotation matrix': [[0.565918, -0.124531, 0.815003], [-0.824401, -0.073416, 0.561226], [-0.010056, -0.989496, -0.144211]], 'translation vector': [0.251636, 0.969331, 1.634009]}\nD: {'rotation matrix': [[0.9999988126320164, 0.0003312007428265276, -0.0004969284405179408], [-0.00033114016656371985, 0.9999998966821256, -0.0003458941517538565], [0.0004968245827711156, 0.000346552992150063, 0.9999997733502279]], 'translation vector': [5.9331917010907453e-05, -0.0008045063572443834, -0.0004101833402060384]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_142_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_142_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_142_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_142_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999807090620153, 0.004568922096108109, 0.004345106491864972], [-0.004581791622211755, 0.9999843128892885, 0.0029713844388234126], [-0.00433269576861675, -0.002991018231335416, 0.9999858202835764]], 'translation vector': [-0.011188164884389007, 0.009307427071622687, -0.0007429783939219003]}\nB: {'rotation matrix': [[-0.942483, -0.17354, 0.285674], [-0.333358, 0.550552, -0.765353], [-0.024459, -0.816564, -0.576737]], 'translation vector': [2.733535, 1.660706, 1.301168]}\nC: {'rotation matrix': [[-0.942594, -0.174193, 0.284909], [-0.333124, 0.550105, 
-0.765776], [-0.023337, -0.816726, -0.576554]], 'translation vector': [2.730048, 1.657302, 1.301829]}\nD: {'rotation matrix': [[-0.942586, -0.174069, 0.285013], [-0.333135, 0.550225, -0.765686], [-0.023539, -0.816672, -0.576622]], 'translation vector': [2.726519, 1.654368, 1.301906]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999807090620153, 0.004568922096108109, 0.004345106491864972], [-0.004581791622211755, 0.9999843128892885, 0.0029713844388234126], [-0.00433269576861675, -0.002991018231335416, 0.9999858202835764]], 'translation vector': [-0.011188164884389007, 0.009307427071622687, -0.0007429783939219003]}\nB: {'rotation matrix': [[-0.942483, -0.17354, 0.285674], [-0.333358, 0.550552, -0.765353], [-0.024459, -0.816564, -0.576737]], 'translation vector': [2.733535, 1.660706, 1.301168]}\nC: {'rotation matrix': [[-0.942594, -0.174193, 0.284909], [-0.333124, 0.550105, -0.765776], [-0.023337, -0.816726, -0.576554]], 'translation vector': [2.730048, 1.657302, 1.301829]}\nD: {'rotation matrix': [[-0.942586, -0.174069, 0.285013], [-0.333135, 0.550225, -0.765686], [-0.023539, -0.816672, -0.576622]], 'translation vector': [2.726519, 1.654368, 1.301906]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_143_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_143_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_143_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_143_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d 
image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.662528, 0.400848, -0.632754], [0.747812, -0.402283, 0.528154], [-0.042837, -0.823097, -0.566283]], 'translation vector': [1.744816, 2.25794, 1.331918]}\nB: {'rotation matrix': [[0.662009, 0.40559, -0.630271], [0.748112, -0.408664, 0.522802], [-0.045526, -0.817613, -0.573966]], 'translation vector': [1.743048, 2.25768, 1.329749]}\nC: {'rotation matrix': [[0.6641, 0.398897, -0.632339], [0.746639, -0.397674, 0.533278], [-0.038742, -0.826279, -0.561927]], 'translation vector': [1.744615, 2.258795, 1.335923]}\nD: {'rotation matrix': [[0.9999981013423613, 0.0016775919055887754, -0.0009118672263504091], [-0.0016584483846788572, 0.999788802812158, 0.020485068642363036], [0.0009453490386347853, -0.020482492872648857, 0.9997898845593165]], 'translation vector': [0.0006281413363966593, 0.0014761974203534312, 0.005568137578716104]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.662528, 0.400848, -0.632754], [0.747812, -0.402283, 0.528154], [-0.042837, -0.823097, -0.566283]], 'translation vector': [1.744816, 2.25794, 1.331918]}\nB: {'rotation matrix': [[0.662009, 0.40559, -0.630271], [0.748112, -0.408664, 0.522802], [-0.045526, -0.817613, -0.573966]], 'translation vector': [1.743048, 2.25768, 1.329749]}\nC: {'rotation matrix': [[0.6641, 0.398897, -0.632339], [0.746639, -0.397674, 0.533278], [-0.038742, -0.826279, -0.561927]], 'translation vector': [1.744615, 2.258795, 1.335923]}\nD: {'rotation matrix': [[0.9999981013423613, 0.0016775919055887754, -0.0009118672263504091], [-0.0016584483846788572, 0.999788802812158, 0.020485068642363036], [0.0009453490386347853, -0.020482492872648857, 0.9997898845593165]], 'translation vector': [0.0006281413363966593, 0.0014761974203534312, 0.005568137578716104]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_144_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_144_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_144_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_144_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.44362, -0.475187, 0.759868], [-0.895973, 0.254861, -0.363701], [-0.020835, -0.842166, -0.538816]], 'translation vector': [2.452095, 1.901161, 1.451891]}\nB: {'rotation matrix': [[-0.440436, -0.475273, 0.761664], [-0.897497, 0.254555, -0.360142], [-0.02272, -0.84221, -0.538671]], 'translation vector': [2.449051, 1.900731, 1.449924]}\nC: {'rotation matrix': [[-0.441002, -0.475612, 0.761125], [-0.897234, 0.254511, -0.360825], [-0.022102, -0.842032, -0.538975]], 'translation vector': [2.451296, 1.899939, 1.450426]}\nD: {'rotation matrix': [[0.9999985864023682, 
-0.0015621221187893434, 0.0006936316269794283], [0.0015661737335896364, 0.9999849325621631, -0.005207380366478063], [-0.0006858389600734047, 0.005207931011548828, 0.9999859575854745]], 'translation vector': [-0.0017105359831024458, -0.002297103154811353, -0.000983146020886283]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.44362, -0.475187, 0.759868], [-0.895973, 0.254861, -0.363701], [-0.020835, -0.842166, -0.538816]], 'translation vector': [2.452095, 1.901161, 1.451891]}\nB: {'rotation matrix': [[-0.440436, -0.475273, 0.761664], [-0.897497, 0.254555, -0.360142], [-0.02272, -0.84221, -0.538671]], 'translation vector': [2.449051, 1.900731, 1.449924]}\nC: {'rotation matrix': [[-0.441002, -0.475612, 0.761125], [-0.897234, 0.254511, -0.360825], [-0.022102, -0.842032, -0.538975]], 'translation vector': [2.451296, 1.899939, 1.450426]}\nD: {'rotation matrix': [[0.9999985864023682, -0.0015621221187893434, 0.0006936316269794283], [0.0015661737335896364, 0.9999849325621631, -0.005207380366478063], [-0.0006858389600734047, 0.005207931011548828, 0.9999859575854745]], 'translation vector': [-0.0017105359831024458, -0.002297103154811353, -0.000983146020886283]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_145_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_145_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_145_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_145_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d 
image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.648777, 0.514326, -0.560854], [0.760441, -0.465898, 0.452404], [-0.028617, -0.720006, -0.693378]], 'translation vector': [1.800914, 1.822078, 1.233863]}\nB: {'rotation matrix': [[0.644427, 0.520052, -0.560589], [0.76413, -0.465418, 0.446645], [-0.028629, -0.716192, -0.697315]], 'translation vector': [1.79848, 1.820985, 1.232666]}\nC: {'rotation matrix': [[0.9998704626137827, 0.011052401167760776, -0.011708701735332047], [-0.011032627408678441, 0.9999372608185829, 0.0017110617555067353], [0.011728123043277196, -0.001580982237636182, 0.9999298219541505]], 'translation vector': [-0.004951638312360451, 0.0003388210922784518, 0.0025384925972402606]}\nD: {'rotation matrix': [[0.639937, 0.524455, -0.56163], [0.767882, -0.464002, 0.441656], [-0.028969, -0.713897, -0.699651]], 'translation vector': [1.797021, 1.819882, 1.231178]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.648777, 0.514326, -0.560854], [0.760441, -0.465898, 0.452404], [-0.028617, -0.720006, -0.693378]], 'translation vector': [1.800914, 1.822078, 1.233863]}\nB: {'rotation matrix': [[0.644427, 0.520052, -0.560589], [0.76413, -0.465418, 0.446645], [-0.028629, -0.716192, -0.697315]], 'translation vector': [1.79848, 1.820985, 1.232666]}\nC: {'rotation matrix': [[0.9998704626137827, 0.011052401167760776, -0.011708701735332047], [-0.011032627408678441, 0.9999372608185829, 0.0017110617555067353], [0.011728123043277196, -0.001580982237636182, 0.9999298219541505]], 'translation vector': [-0.004951638312360451, 0.0003388210922784518, 0.0025384925972402606]}\nD: {'rotation matrix': [[0.639937, 0.524455, -0.56163], [0.767882, -0.464002, 0.441656], [-0.028969, -0.713897, -0.699651]], 'translation vector': [1.797021, 1.819882, 1.231178]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_146_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_146_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_146_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_146_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.480833, -0.463789, 0.74411], [-0.876135, 0.287476, -0.386968], [-0.034442, -0.838008, -0.54457]], 'translation vector': [3.084943, 2.078791, 1.469333]}\nB: {'rotation matrix': [[0.9999951018718091, -0.0022147244398398017, -0.0018850296839706259], [0.002209800973893798, 0.9999949398931387, -0.0022506347742728594], [0.0018898473296588083, 0.0022461573677036097, 0.9999958361862953]], 'translation vector': [0.004387368548260717, 1.4705105160661702e-05, 0.0008301488900060994]}\nC: {'rotation matrix': [[-0.476687, -0.464053, 0.746608], [-0.878377, 0.285219, 
-0.383541], [-0.034963, -0.838633, -0.543574]], 'translation vector': [3.080459, 2.078543, 1.469168]}\nD: {'rotation matrix': [[-0.479567, -0.463643, 0.745017], [-0.876821, 0.286706, -0.385985], [-0.034641, -0.838352, -0.544027]], 'translation vector': [3.083795, 2.079285, 1.469908]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.480833, -0.463789, 0.74411], [-0.876135, 0.287476, -0.386968], [-0.034442, -0.838008, -0.54457]], 'translation vector': [3.084943, 2.078791, 1.469333]}\nB: {'rotation matrix': [[0.9999951018718091, -0.0022147244398398017, -0.0018850296839706259], [0.002209800973893798, 0.9999949398931387, -0.0022506347742728594], [0.0018898473296588083, 0.0022461573677036097, 0.9999958361862953]], 'translation vector': [0.004387368548260717, 1.4705105160661702e-05, 0.0008301488900060994]}\nC: {'rotation matrix': [[-0.476687, -0.464053, 0.746608], [-0.878377, 0.285219, -0.383541], [-0.034963, -0.838633, -0.543574]], 'translation vector': [3.080459, 2.078543, 1.469168]}\nD: {'rotation matrix': [[-0.479567, -0.463643, 0.745017], [-0.876821, 0.286706, -0.385985], [-0.034641, -0.838352, -0.544027]], 'translation vector': [3.083795, 2.079285, 1.469908]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_147_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_147_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_147_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_147_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": 
"3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.724797, -0.022386, -0.688599], [0.687785, -0.034918, 0.725075], [-0.040276, -0.999139, -0.009911]], 'translation vector': [1.871804, 0.814995, 1.597738]}\nB: {'rotation matrix': [[0.729664, -0.019712, -0.683522], [0.682722, -0.035273, 0.729827], [-0.038496, -0.999183, -0.01228]], 'translation vector': [1.870321, 0.812422, 1.590842]}\nC: {'rotation matrix': [[0.9999971996131989, 0.0007346987095039795, -0.0017367305867328272], [-0.0007296724426740893, 0.9999944712211791, 0.003354984500413017], [0.0017384108444949038, -0.003353839188726123, 0.9999931577143994]], 'translation vector': [-0.0004548504518708807, 0.001951786856664972, -7.749986575322776e-05]}\nD: {'rotation matrix': [[0.728234, -0.022145, -0.684971], [0.684198, -0.033912, 0.728508], [-0.039361, -0.99918, -0.009544]], 'translation vector': [1.869489, 0.812101, 1.591189]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.724797, -0.022386, -0.688599], [0.687785, -0.034918, 0.725075], [-0.040276, -0.999139, -0.009911]], 'translation vector': [1.871804, 0.814995, 1.597738]}\nB: {'rotation matrix': [[0.729664, -0.019712, -0.683522], [0.682722, -0.035273, 0.729827], [-0.038496, -0.999183, -0.01228]], 'translation vector': [1.870321, 0.812422, 1.590842]}\nC: {'rotation matrix': [[0.9999971996131989, 0.0007346987095039795, -0.0017367305867328272], [-0.0007296724426740893, 0.9999944712211791, 0.003354984500413017], [0.0017384108444949038, -0.003353839188726123, 0.9999931577143994]], 'translation vector': [-0.0004548504518708807, 0.001951786856664972, -7.749986575322776e-05]}\nD: {'rotation matrix': [[0.728234, -0.022145, -0.684971], [0.684198, -0.033912, 0.728508], [-0.039361, -0.99918, -0.009544]], 'translation vector': [1.869489, 0.812101, 1.591189]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_148_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_148_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_148_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_148_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999774991279762, 0.0031690326046340087, -0.005996753212157376], [-0.003090362324076843, 0.9999123878108386, 0.012868971987653289], [0.006038036583570517, -0.012849517542798453, 0.9998986268025651]], 'translation vector': [-0.00016965780714706113, -0.008841569485431133, 0.004505805451807898]}\nB: {'rotation matrix': [[0.651481, -0.368876, 0.66295], [-0.758449, -0.337487, 0.557546], [0.018072, -0.866045, -0.49964]], 'translation vector': [2.471969, 4.600353, 1.449958]}\nC: {'rotation matrix': [[0.655694, -0.362412, 0.662362], [-0.754768, -0.337631, 
0.562433], [0.019802, -0.868713, -0.49492]], 'translation vector': [2.472568, 4.599315, 1.447954]}\nD: {'rotation matrix': [[0.660006, -0.356371, 0.661356], [-0.750952, -0.338178, 0.567192], [0.021525, -0.870997, -0.490817]], 'translation vector': [2.470351, 4.598146, 1.447521]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999774991279762, 0.0031690326046340087, -0.005996753212157376], [-0.003090362324076843, 0.9999123878108386, 0.012868971987653289], [0.006038036583570517, -0.012849517542798453, 0.9998986268025651]], 'translation vector': [-0.00016965780714706113, -0.008841569485431133, 0.004505805451807898]}\nB: {'rotation matrix': [[0.651481, -0.368876, 0.66295], [-0.758449, -0.337487, 0.557546], [0.018072, -0.866045, -0.49964]], 'translation vector': [2.471969, 4.600353, 1.449958]}\nC: {'rotation matrix': [[0.655694, -0.362412, 0.662362], [-0.754768, -0.337631, 0.562433], [0.019802, -0.868713, -0.49492]], 'translation vector': [2.472568, 4.599315, 1.447954]}\nD: {'rotation matrix': [[0.660006, -0.356371, 0.661356], [-0.750952, -0.338178, 0.567192], [0.021525, -0.870997, -0.490817]], 'translation vector': [2.470351, 4.598146, 1.447521]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_149_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_149_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_149_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_149_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.550599, -0.608246, 0.571732], [-0.834661, 0.412205, -0.365279], [-0.013491, -0.678325, -0.734639]], 'translation vector': [2.153644, 1.764514, 1.342866]}\nB: {'rotation matrix': [[0.9999831845983815, -0.004759897354970956, 0.0032519818043716545], [0.004774936203352942, 0.9999789289359345, -0.004542082720936493], [-0.003229136166379075, 0.004558818034209968, 0.9999846215935988]], 'translation vector': [0.0033377456840164577, 0.0021763424534348985, -0.0009933558524138353]}\nC: {'rotation matrix': [[-0.553936, -0.603859, 0.573158], [-0.832399, 0.415212, -0.36703], [-0.016348, -0.680407, -0.732652]], 'translation vector': [2.15044, 1.76409, 1.342895]}\nD: {'rotation matrix': [[-0.551929, -0.606401, 0.572409], [-0.833765, 0.413226, -0.366169], [-0.014489, -0.679354, -0.733668]], 'translation vector': [2.152355, 1.764444, 1.342815]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.550599, -0.608246, 0.571732], [-0.834661, 0.412205, -0.365279], [-0.013491, -0.678325, -0.734639]], 'translation vector': [2.153644, 1.764514, 1.342866]}\nB: {'rotation matrix': [[0.9999831845983815, -0.004759897354970956, 0.0032519818043716545], [0.004774936203352942, 0.9999789289359345, -0.004542082720936493], [-0.003229136166379075, 0.004558818034209968, 0.9999846215935988]], 'translation vector': [0.0033377456840164577, 0.0021763424534348985, -0.0009933558524138353]}\nC: {'rotation matrix': [[-0.553936, -0.603859, 0.573158], [-0.832399, 0.415212, -0.36703], [-0.016348, -0.680407, -0.732652]], 'translation vector': [2.15044, 1.76409, 1.342895]}\nD: {'rotation matrix': [[-0.551929, -0.606401, 0.572409], [-0.833765, 0.413226, -0.366169], [-0.014489, -0.679354, -0.733668]], 'translation vector': [2.152355, 1.764444, 1.342815]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_150_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_150_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_150_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_150_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.254539, -0.436075, 0.863162], [-0.966632, 0.141348, -0.213642], [-0.028842, -0.88874, -0.457503]], 'translation vector': [1.735428, 0.748356, 1.43337]}\nB: {'rotation matrix': [[-0.254681, -0.434936, 0.863695], [-0.966604, 0.140846, -0.2141], [-0.028528, -0.889378, -0.456282]], 'translation vector': [1.735372, 0.748905, 1.433089]}\nC: {'rotation matrix': [[1.0000001555675329, -0.0009323657310693132, 0.0004792288755131117], [0.0009313044295569888, 0.9999972868981549, 0.0021526068874492825], [-0.0004812492477208415, -0.0021517811624316304, 
0.9999974478268276]], 'translation vector': [0.0025109364715583116, 0.0011773606935274739, 0.0008952278670837366]}\nD: {'rotation matrix': [[-0.254466, -0.434667, 0.863893], [-0.966638, 0.141384, -0.213593], [-0.029299, -0.889424, -0.456143]], 'translation vector': [1.735598, 0.749181, 1.433436]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.254539, -0.436075, 0.863162], [-0.966632, 0.141348, -0.213642], [-0.028842, -0.88874, -0.457503]], 'translation vector': [1.735428, 0.748356, 1.43337]}\nB: {'rotation matrix': [[-0.254681, -0.434936, 0.863695], [-0.966604, 0.140846, -0.2141], [-0.028528, -0.889378, -0.456282]], 'translation vector': [1.735372, 0.748905, 1.433089]}\nC: {'rotation matrix': [[1.0000001555675329, -0.0009323657310693132, 0.0004792288755131117], [0.0009313044295569888, 0.9999972868981549, 0.0021526068874492825], [-0.0004812492477208415, -0.0021517811624316304, 0.9999974478268276]], 'translation vector': [0.0025109364715583116, 0.0011773606935274739, 0.0008952278670837366]}\nD: {'rotation matrix': [[-0.254466, -0.434667, 0.863893], [-0.966638, 0.141384, -0.213593], [-0.029299, -0.889424, -0.456143]], 'translation vector': [1.735598, 0.749181, 1.433436]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_151_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_151_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_151_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_151_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + 
"visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.874972, 0.120483, -0.468943], [0.467945, 0.459094, -0.755156], [0.124305, -0.88018, -0.458074]], 'translation vector': [3.924764, 3.210204, 1.74232]}\nB: {'rotation matrix': [[-0.872887, 0.124749, -0.471706], [0.47258, 0.456684, -0.75373], [0.121394, -0.880839, -0.457587]], 'translation vector': [3.924155, 3.192614, 1.742181]}\nC: {'rotation matrix': [[-0.873832, 0.122242, -0.470612], [0.470279, 0.458348, -0.754158], [0.123514, -0.880326, -0.458007]], 'translation vector': [3.924731, 3.201921, 1.742944]}\nD: {'rotation matrix': [[0.9999981203399105, -2.162890380847432e-05, -0.0011856852248069943], [2.6074892574446623e-05, 0.9999907421564089, 0.004166234818762547], [0.0011853825032994, -0.0041669736288708105, 0.9999895516522684]], 'translation vector': [0.0031018447373007962, -0.004160693298826068, -0.00448172332630925]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.874972, 0.120483, -0.468943], [0.467945, 0.459094, -0.755156], [0.124305, -0.88018, -0.458074]], 'translation vector': [3.924764, 3.210204, 1.74232]}\nB: {'rotation matrix': [[-0.872887, 0.124749, -0.471706], [0.47258, 0.456684, -0.75373], [0.121394, -0.880839, -0.457587]], 'translation vector': [3.924155, 3.192614, 1.742181]}\nC: {'rotation matrix': [[-0.873832, 0.122242, -0.470612], [0.470279, 0.458348, -0.754158], [0.123514, -0.880326, -0.458007]], 'translation vector': [3.924731, 3.201921, 1.742944]}\nD: {'rotation matrix': [[0.9999981203399105, -2.162890380847432e-05, -0.0011856852248069943], [2.6074892574446623e-05, 0.9999907421564089, 0.004166234818762547], [0.0011853825032994, -0.0041669736288708105, 0.9999895516522684]], 'translation vector': [0.0031018447373007962, -0.004160693298826068, -0.00448172332630925]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_152_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_152_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_152_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_152_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.910706, 0.180669, -0.371448], [0.412445, 0.446604, -0.793999], [0.022439, -0.876301, -0.481241]], 'translation vector': [3.200821, 1.957629, 1.277707]}\nB: {'rotation matrix': [[-0.909073, 0.181004, -0.375266], [0.415983, 0.444783, -0.793175], [0.023344, -0.877158, -0.479635]], 'translation vector': [3.199567, 1.957217, 1.278564]}\nC: {'rotation matrix': [[-0.912991, 0.180032, -0.36611], [0.407432, 0.448869, -0.795309], [0.021154, -0.875275, -0.483163]], 'translation vector': [3.201129, 1.957814, 1.275703]}\nD: {'rotation matrix': [[0.9998091200876347, 
0.0024899817556151226, -0.01932364538042797], [-0.0024534091295022355, 0.9999947392396931, 0.0019198938596762963], [0.019328984816227378, -0.0018724870057808746, 0.9998116886101109]], 'translation vector': [-0.0018727616018998638, 0.007005445282938005, 8.97167203275373e-05]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.910706, 0.180669, -0.371448], [0.412445, 0.446604, -0.793999], [0.022439, -0.876301, -0.481241]], 'translation vector': [3.200821, 1.957629, 1.277707]}\nB: {'rotation matrix': [[-0.909073, 0.181004, -0.375266], [0.415983, 0.444783, -0.793175], [0.023344, -0.877158, -0.479635]], 'translation vector': [3.199567, 1.957217, 1.278564]}\nC: {'rotation matrix': [[-0.912991, 0.180032, -0.36611], [0.407432, 0.448869, -0.795309], [0.021154, -0.875275, -0.483163]], 'translation vector': [3.201129, 1.957814, 1.275703]}\nD: {'rotation matrix': [[0.9998091200876347, 0.0024899817556151226, -0.01932364538042797], [-0.0024534091295022355, 0.9999947392396931, 0.0019198938596762963], [0.019328984816227378, -0.0018724870057808746, 0.9998116886101109]], 'translation vector': [-0.0018727616018998638, 0.007005445282938005, 8.97167203275373e-05]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_153_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_153_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_153_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_153_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999837026106017, -0.005604330268430811, -0.00017160014562389917], [0.005604883614299803, 0.9999826757832918, 0.0019021058633347495], [0.00016097085173832394, -0.0019036155522551726, 0.9999985288928479]], 'translation vector': [-0.0002532483433466126, 0.009030133518173555, 0.0039788864378584865]}\nB: {'rotation matrix': [[-0.221399, -0.409647, 0.88497], [-0.971033, 0.176243, -0.161347], [-0.089875, -0.895057, -0.436801]], 'translation vector': [2.157726, 10.114248, 1.730212]}\nC: {'rotation matrix': [[-0.233773, -0.418838, 0.877454], [-0.967259, 0.191883, -0.166106], [-0.098797, -0.887556, -0.449982]], 'translation vector': [2.163177, 10.11361, 1.729991]}\nD: {'rotation matrix': [[-0.208812, -0.406957, 0.88926], [-0.974382, 0.164243, -0.153636], [-0.083532, -0.898561, -0.430827]], 'translation vector': [2.154821, 10.118629, 1.726458]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999837026106017, -0.005604330268430811, -0.00017160014562389917], [0.005604883614299803, 0.9999826757832918, 0.0019021058633347495], [0.00016097085173832394, -0.0019036155522551726, 0.9999985288928479]], 'translation vector': [-0.0002532483433466126, 0.009030133518173555, 0.0039788864378584865]}\nB: {'rotation matrix': [[-0.221399, -0.409647, 0.88497], [-0.971033, 0.176243, -0.161347], [-0.089875, -0.895057, -0.436801]], 'translation vector': [2.157726, 10.114248, 1.730212]}\nC: {'rotation matrix': [[-0.233773, -0.418838, 0.877454], [-0.967259, 0.191883, -0.166106], [-0.098797, -0.887556, -0.449982]], 'translation vector': [2.163177, 10.11361, 1.729991]}\nD: {'rotation matrix': [[-0.208812, -0.406957, 0.88926], [-0.974382, 0.164243, -0.153636], [-0.083532, -0.898561, -0.430827]], 'translation vector': [2.154821, 10.118629, 1.726458]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_154_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_154_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_154_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_154_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.864148, -0.19236, 0.465022], [-0.502138, -0.268523, 0.822042], [-0.033259, -0.943871, -0.328635]], 'translation vector': [3.016374, 2.015361, 1.429191]}\nB: {'rotation matrix': [[0.864117, -0.192314, 0.465099], [-0.502115, -0.266283, 0.822784], [-0.034385, -0.944515, -0.326663]], 'translation vector': [3.015528, 2.015384, 1.428328]}\nC: {'rotation matrix': [[0.864639, -0.19262, 0.464001], [-0.501175, -0.266422, 0.823312], [-0.034966, -0.944414, -0.326895]], 'translation vector': [3.015996, 2.015925, 1.430969]}\nD: {'rotation matrix': 
[[0.9999953542146168, 0.0030947830980374525, -0.0008131951628852412], [-0.0030994452771746766, 0.999974002426472, -0.006530521364187943], [0.00079388588123918, 0.0065324457136649635, 0.999978487244639]], 'translation vector': [-0.004468736350602409, -0.006227529584318603, -8.95755650023311e-05]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.864148, -0.19236, 0.465022], [-0.502138, -0.268523, 0.822042], [-0.033259, -0.943871, -0.328635]], 'translation vector': [3.016374, 2.015361, 1.429191]}\nB: {'rotation matrix': [[0.864117, -0.192314, 0.465099], [-0.502115, -0.266283, 0.822784], [-0.034385, -0.944515, -0.326663]], 'translation vector': [3.015528, 2.015384, 1.428328]}\nC: {'rotation matrix': [[0.864639, -0.19262, 0.464001], [-0.501175, -0.266422, 0.823312], [-0.034966, -0.944414, -0.326895]], 'translation vector': [3.015996, 2.015925, 1.430969]}\nD: {'rotation matrix': [[0.9999953542146168, 0.0030947830980374525, -0.0008131951628852412], [-0.0030994452771746766, 0.999974002426472, -0.006530521364187943], [0.00079388588123918, 0.0065324457136649635, 0.999978487244639]], 'translation vector': [-0.004468736350602409, -0.006227529584318603, -8.95755650023311e-05]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_155_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_155_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_155_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_155_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + 
"visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[1.0000001305586184, 4.504585202989606e-05, -0.0008677063367108602], [-4.562855519759043e-05, 0.9999983740715708, -0.001562168642633918], [0.0008668198225227341, 0.0015626467626293078, 0.9999978423516577]], 'translation vector': [0.004048003920170018, -0.000314296777893075, -0.001713133752151652]}\nB: {'rotation matrix': [[0.932237, 0.077212, -0.353515], [0.361617, -0.233791, 0.902538], [-0.012962, -0.969216, -0.24587]], 'translation vector': [5.874094, 3.546493, 1.351525]}\nC: {'rotation matrix': [[0.932237, 0.075134, -0.353962], [0.361548, -0.233262, 0.902703], [-0.014743, -0.969507, -0.24462]], 'translation vector': [5.877609, 3.546074, 1.353866]}\nD: {'rotation matrix': [[0.932245, 0.073757, -0.354232], [0.36147, -0.233417, 0.902694], [-0.016104, -0.969576, -0.244262]], 'translation vector': [5.878981, 3.545327, 1.355029]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[1.0000001305586184, 4.504585202989606e-05, -0.0008677063367108602], [-4.562855519759043e-05, 0.9999983740715708, -0.001562168642633918], [0.0008668198225227341, 0.0015626467626293078, 0.9999978423516577]], 'translation vector': [0.004048003920170018, -0.000314296777893075, -0.001713133752151652]}\nB: {'rotation matrix': [[0.932237, 0.077212, -0.353515], [0.361617, -0.233791, 0.902538], [-0.012962, -0.969216, -0.24587]], 'translation vector': [5.874094, 3.546493, 1.351525]}\nC: {'rotation matrix': [[0.932237, 0.075134, -0.353962], [0.361548, -0.233262, 0.902703], [-0.014743, -0.969507, -0.24462]], 'translation vector': [5.877609, 3.546074, 1.353866]}\nD: {'rotation matrix': [[0.932245, 0.073757, -0.354232], [0.36147, -0.233417, 0.902694], [-0.016104, -0.969576, -0.244262]], 'translation vector': [5.878981, 3.545327, 1.355029]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_156_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_156_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_156_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_156_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.967201, -0.070709, 0.243972], [-0.252115, 0.384385, -0.88808], [-0.030984, -0.920461, -0.389605]], 'translation vector': [2.768098, 4.614564, 1.418844]}\nB: {'rotation matrix': [[-0.968215, -0.068836, 0.240461], [-0.248279, 0.380909, -0.890655], [-0.030285, -0.922047, -0.385893]], 'translation vector': [2.770223, 4.618487, 1.418033]}\nC: {'rotation matrix': [[0.9999417292568685, -0.00016387346525461165, 0.010781387930699021], [0.00016087662233447972, 0.9999998236818622, 0.00023678654461243325], [-0.010781286291867453, -0.000235115727098794, 
0.9999413646146618]], 'translation vector': [-0.004146218083109776, -0.009777600200150782, -9.031272009885072e-05]}\nD: {'rotation matrix': [[-0.966004, -0.071389, 0.248475], [-0.25638, 0.388148, -0.885218], [-0.03325, -0.918828, -0.393256]], 'translation vector': [2.766369, 4.610029, 1.423364]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.967201, -0.070709, 0.243972], [-0.252115, 0.384385, -0.88808], [-0.030984, -0.920461, -0.389605]], 'translation vector': [2.768098, 4.614564, 1.418844]}\nB: {'rotation matrix': [[-0.968215, -0.068836, 0.240461], [-0.248279, 0.380909, -0.890655], [-0.030285, -0.922047, -0.385893]], 'translation vector': [2.770223, 4.618487, 1.418033]}\nC: {'rotation matrix': [[0.9999417292568685, -0.00016387346525461165, 0.010781387930699021], [0.00016087662233447972, 0.9999998236818622, 0.00023678654461243325], [-0.010781286291867453, -0.000235115727098794, 0.9999413646146618]], 'translation vector': [-0.004146218083109776, -0.009777600200150782, -9.031272009885072e-05]}\nD: {'rotation matrix': [[-0.966004, -0.071389, 0.248475], [-0.25638, 0.388148, -0.885218], [-0.03325, -0.918828, -0.393256]], 'translation vector': [2.766369, 4.610029, 1.423364]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_157_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_157_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_157_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_157_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + 
"visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.819175, -0.184232, 0.543149], [-0.572951, -0.305906, 0.760361], [0.02607, -0.934066, -0.356147]], 'translation vector': [4.417876, 1.783865, 1.2754]}\nB: {'rotation matrix': [[0.815086, -0.186535, 0.548488], [-0.578729, -0.305625, 0.756086], [0.026595, -0.933701, -0.357064]], 'translation vector': [4.412413, 1.788676, 1.273151]}\nC: {'rotation matrix': [[0.9999460053456457, -0.005013531268923329, 0.00905468563745482], [0.0050087072013681256, 0.9999874429708866, 0.000635311021125049], [-0.009056848323318739, -0.0005890721337560823, 0.9999585496849335]], 'translation vector': [-0.01186208886847595, 0.011040583723211927, 0.005770402802990571]}\nD: {'rotation matrix': [[0.823616, -0.181983, 0.537158], [-0.566609, -0.305313, 0.765336], [0.024723, -0.934701, -0.354574]], 'translation vector': [4.422497, 1.777795, 1.277376]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.819175, -0.184232, 0.543149], [-0.572951, -0.305906, 0.760361], [0.02607, -0.934066, -0.356147]], 'translation vector': [4.417876, 1.783865, 1.2754]}\nB: {'rotation matrix': [[0.815086, -0.186535, 0.548488], [-0.578729, -0.305625, 0.756086], [0.026595, -0.933701, -0.357064]], 'translation vector': [4.412413, 1.788676, 1.273151]}\nC: {'rotation matrix': [[0.9999460053456457, -0.005013531268923329, 0.00905468563745482], [0.0050087072013681256, 0.9999874429708866, 0.000635311021125049], [-0.009056848323318739, -0.0005890721337560823, 0.9999585496849335]], 'translation vector': [-0.01186208886847595, 0.011040583723211927, 0.005770402802990571]}\nD: {'rotation matrix': [[0.823616, -0.181983, 0.537158], [-0.566609, -0.305313, 0.765336], [0.024723, -0.934701, -0.354574]], 'translation vector': [4.422497, 1.777795, 1.277376]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_158_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_158_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_158_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_158_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9994834257163207, -0.009826144818556364, 0.03061241068052498], [0.009193271124449974, 0.9997423555682264, 0.020759381716764332], [-0.0308087787211077, -0.020466954157546582, 0.999315737926551]], 'translation vector': [-0.012172691673165703, -0.014778681947341665, 0.009780168112213383]}\nB: {'rotation matrix': [[0.091644, -0.414787, 0.905292], [-0.995361, -0.064895, 0.071028], [0.029288, -0.907601, -0.418811]], 'translation vector': [1.315562, 0.832314, 1.493138]}\nC: {'rotation matrix': [[0.099917, -0.418908, 0.902515], [-0.994362, -0.074403, 0.07555], [0.035502, 
-0.904975, -0.42398]], 'translation vector': [1.316564, 0.827844, 1.497329]}\nD: {'rotation matrix': [[0.095483, -0.419292, 0.902816], [-0.994923, -0.069178, 0.073096], [0.031806, -0.905212, -0.423769]], 'translation vector': [1.317067, 0.829593, 1.4954]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9994834257163207, -0.009826144818556364, 0.03061241068052498], [0.009193271124449974, 0.9997423555682264, 0.020759381716764332], [-0.0308087787211077, -0.020466954157546582, 0.999315737926551]], 'translation vector': [-0.012172691673165703, -0.014778681947341665, 0.009780168112213383]}\nB: {'rotation matrix': [[0.091644, -0.414787, 0.905292], [-0.995361, -0.064895, 0.071028], [0.029288, -0.907601, -0.418811]], 'translation vector': [1.315562, 0.832314, 1.493138]}\nC: {'rotation matrix': [[0.099917, -0.418908, 0.902515], [-0.994362, -0.074403, 0.07555], [0.035502, -0.904975, -0.42398]], 'translation vector': [1.316564, 0.827844, 1.497329]}\nD: {'rotation matrix': [[0.095483, -0.419292, 0.902816], [-0.994923, -0.069178, 0.073096], [0.031806, -0.905212, -0.423769]], 'translation vector': [1.317067, 0.829593, 1.4954]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_159_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_159_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_159_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_159_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.529968, 0.414216, -0.739972], [0.844868, 0.182762, -0.502789], [-0.073025, -0.891641, -0.446816]], 'translation vector': [5.418643, 4.410617, 1.386009]}\nB: {'rotation matrix': [[-0.531013, 0.418177, -0.736989], [0.843374, 0.176517, -0.507507], [-0.082137, -0.89105, -0.446413]], 'translation vector': [5.416763, 4.405288, 1.382813]}\nC: {'rotation matrix': [[0.9999262982512661, -0.011126048765088865, -0.004724929479929227], [0.011121653443920318, 0.9999373200698624, -0.0008601073344245507], [0.00473500376540592, 0.0008076436796648745, 0.9999882661936769]], 'translation vector': [-0.020480989578110398, -0.004401497485728045, 0.008145336008434256]}\nD: {'rotation matrix': [[-0.533157, 0.409147, -0.740501], [0.84318, 0.185365, -0.504666], [-0.06922, -0.893442, -0.443813]], 'translation vector': [5.417671, 4.419961, 1.384383]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.529968, 0.414216, -0.739972], [0.844868, 0.182762, -0.502789], [-0.073025, -0.891641, -0.446816]], 'translation vector': [5.418643, 4.410617, 1.386009]}\nB: {'rotation matrix': [[-0.531013, 0.418177, -0.736989], [0.843374, 0.176517, -0.507507], [-0.082137, -0.89105, -0.446413]], 'translation vector': [5.416763, 4.405288, 1.382813]}\nC: {'rotation matrix': [[0.9999262982512661, -0.011126048765088865, -0.004724929479929227], [0.011121653443920318, 0.9999373200698624, -0.0008601073344245507], [0.00473500376540592, 0.0008076436796648745, 0.9999882661936769]], 'translation vector': [-0.020480989578110398, -0.004401497485728045, 0.008145336008434256]}\nD: {'rotation matrix': [[-0.533157, 0.409147, -0.740501], [0.84318, 0.185365, -0.504666], [-0.06922, -0.893442, -0.443813]], 'translation vector': [5.417671, 4.419961, 1.384383]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_160_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_160_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_160_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_160_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.848714, 0.230926, -0.475771], [0.528497, 0.337392, -0.77901], [-0.019373, -0.912601, -0.408393]], 'translation vector': [1.792868, 5.329395, 1.618046]}\nB: {'rotation matrix': [[-0.84982, 0.23191, -0.473312], [0.526645, 0.337444, -0.780241], [-0.021229, -0.912332, -0.408901]], 'translation vector': [1.792819, 5.327111, 1.618396]}\nC: {'rotation matrix': [[-0.847697, 0.231565, -0.477271], [0.530179, 0.339488, -0.776955], [-0.017888, -0.911661, -0.410554]], 'translation vector': [1.79081, 5.325803, 1.623639]}\nD: {'rotation matrix': [[0.9999931647889151, 
0.002636603414451914, -0.0022507832348392875], [-0.002639482011584797, 0.9999963226575643, -0.0013481976836917885], [0.002247569136725343, 0.001353829445398431, 0.9999964854536705]], 'translation vector': [-0.00651758527214108, -0.0013521488456044173, 0.0015986017122768814]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.848714, 0.230926, -0.475771], [0.528497, 0.337392, -0.77901], [-0.019373, -0.912601, -0.408393]], 'translation vector': [1.792868, 5.329395, 1.618046]}\nB: {'rotation matrix': [[-0.84982, 0.23191, -0.473312], [0.526645, 0.337444, -0.780241], [-0.021229, -0.912332, -0.408901]], 'translation vector': [1.792819, 5.327111, 1.618396]}\nC: {'rotation matrix': [[-0.847697, 0.231565, -0.477271], [0.530179, 0.339488, -0.776955], [-0.017888, -0.911661, -0.410554]], 'translation vector': [1.79081, 5.325803, 1.623639]}\nD: {'rotation matrix': [[0.9999931647889151, 0.002636603414451914, -0.0022507832348392875], [-0.002639482011584797, 0.9999963226575643, -0.0013481976836917885], [0.002247569136725343, 0.001353829445398431, 0.9999964854536705]], 'translation vector': [-0.00651758527214108, -0.0013521488456044173, 0.0015986017122768814]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_161_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_161_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_161_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_161_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999401570739126, -0.0046057655246039214, 0.009916971929525087], [0.004585227566153826, 0.9999870493451988, 0.0021285393012493415], [-0.009927494824793211, -0.0020825415412263123, 0.9999492671696912]], 'translation vector': [-0.008397334377028054, -0.00983275627665292, -0.005241897912080518]}\nB: {'rotation matrix': [[0.979087, -0.093288, 0.180791], [-0.203431, -0.440264, 0.874519], [-0.001987, -0.893009, -0.450035]], 'translation vector': [1.973386, 0.601511, 1.693802]}\nC: {'rotation matrix': [[0.980067, -0.092864, 0.175631], [-0.198651, -0.445785, 0.872819], [-0.002761, -0.89031, -0.455347]], 'translation vector': [1.967233, 0.628282, 1.699375]}\nD: {'rotation matrix': [[0.978621, -0.096572, 0.18159], [-0.205599, -0.435758, 0.876267], [-0.005494, -0.894868, -0.446298]], 'translation vector': [1.993303, 0.583546, 1.694907]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999401570739126, -0.0046057655246039214, 0.009916971929525087], [0.004585227566153826, 0.9999870493451988, 0.0021285393012493415], [-0.009927494824793211, -0.0020825415412263123, 0.9999492671696912]], 'translation vector': [-0.008397334377028054, -0.00983275627665292, -0.005241897912080518]}\nB: {'rotation matrix': [[0.979087, -0.093288, 0.180791], [-0.203431, -0.440264, 0.874519], [-0.001987, -0.893009, -0.450035]], 'translation vector': [1.973386, 0.601511, 1.693802]}\nC: {'rotation matrix': [[0.980067, -0.092864, 0.175631], [-0.198651, -0.445785, 0.872819], [-0.002761, -0.89031, -0.455347]], 'translation vector': [1.967233, 0.628282, 1.699375]}\nD: {'rotation matrix': [[0.978621, -0.096572, 0.18159], [-0.205599, -0.435758, 0.876267], [-0.005494, -0.894868, -0.446298]], 'translation vector': [1.993303, 0.583546, 1.694907]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_162_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_162_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_162_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_162_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.170548, -0.038579, 0.984594], [-0.984686, -0.02999, -0.171739], [0.036154, -0.998805, -0.032873]], 'translation vector': [3.062037, 2.44503, 1.503093]}\nB: {'rotation matrix': [[-0.179914, -0.049674, 0.982427], [-0.983099, -0.025325, -0.181318], [0.033887, -0.998444, -0.044279]], 'translation vector': [3.062168, 2.448737, 1.498158]}\nC: {'rotation matrix': [[-0.175595, -0.045495, 0.983411], [-0.983814, -0.028148, -0.176969], [0.035732, -0.998568, -0.039815]], 'translation vector': [3.062089, 2.446902, 1.500845]}\nD: {'rotation matrix': 
[[0.9999851894661843, 0.0005818244206442844, -0.005337253553840107], [-0.0006352685106689393, 0.9999500541572905, -0.009997050540678364], [0.005330302626435351, 0.010000486202961777, 0.9999353803458123]], 'translation vector': [0.013561096331675682, -0.004517035589624685, -0.0041143791607543]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.170548, -0.038579, 0.984594], [-0.984686, -0.02999, -0.171739], [0.036154, -0.998805, -0.032873]], 'translation vector': [3.062037, 2.44503, 1.503093]}\nB: {'rotation matrix': [[-0.179914, -0.049674, 0.982427], [-0.983099, -0.025325, -0.181318], [0.033887, -0.998444, -0.044279]], 'translation vector': [3.062168, 2.448737, 1.498158]}\nC: {'rotation matrix': [[-0.175595, -0.045495, 0.983411], [-0.983814, -0.028148, -0.176969], [0.035732, -0.998568, -0.039815]], 'translation vector': [3.062089, 2.446902, 1.500845]}\nD: {'rotation matrix': [[0.9999851894661843, 0.0005818244206442844, -0.005337253553840107], [-0.0006352685106689393, 0.9999500541572905, -0.009997050540678364], [0.005330302626435351, 0.010000486202961777, 0.9999353803458123]], 'translation vector': [0.013561096331675682, -0.004517035589624685, -0.0041143791607543]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_163_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_163_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_163_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_163_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + 
"visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.879731, -0.18442, 0.43825], [-0.471339, -0.216982, 0.854844], [-0.062558, -0.958597, -0.27781]], 'translation vector': [1.012821, 1.289106, 1.470609]}\nB: {'rotation matrix': [[0.877414, -0.186393, 0.442044], [-0.476234, -0.227312, 0.84943], [-0.057846, -0.955818, -0.288213]], 'translation vector': [1.033287, 1.302455, 1.466311]}\nC: {'rotation matrix': [[0.999991817168325, 0.002932358485082305, -0.002640226065149888], [-0.0029080462382595554, 0.9999549882789169, 0.009034252562416854], [0.0026665546966547606, -0.009026772459937056, 0.9999556430330881]], 'translation vector': [0.005120345387922942, -0.0015772155749180783, 0.009569803755594242]}\nD: {'rotation matrix': [[0.878466, -0.185862, 0.440175], [-0.473882, -0.221089, 0.852382], [-0.061107, -0.957379, -0.282296]], 'translation vector': [1.023458, 1.295782, 1.469602]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.879731, -0.18442, 0.43825], [-0.471339, -0.216982, 0.854844], [-0.062558, -0.958597, -0.27781]], 'translation vector': [1.012821, 1.289106, 1.470609]}\nB: {'rotation matrix': [[0.877414, -0.186393, 0.442044], [-0.476234, -0.227312, 0.84943], [-0.057846, -0.955818, -0.288213]], 'translation vector': [1.033287, 1.302455, 1.466311]}\nC: {'rotation matrix': [[0.999991817168325, 0.002932358485082305, -0.002640226065149888], [-0.0029080462382595554, 0.9999549882789169, 0.009034252562416854], [0.0026665546966547606, -0.009026772459937056, 0.9999556430330881]], 'translation vector': [0.005120345387922942, -0.0015772155749180783, 0.009569803755594242]}\nD: {'rotation matrix': [[0.878466, -0.185862, 0.440175], [-0.473882, -0.221089, 0.852382], [-0.061107, -0.957379, -0.282296]], 'translation vector': [1.023458, 1.295782, 1.469602]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_164_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_164_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_164_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_164_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.306256, 0.214312, -0.927512], [0.951867, -0.081783, 0.295401], [-0.012547, -0.973336, -0.229043]], 'translation vector': [3.740017, 1.664374, 1.453227]}\nB: {'rotation matrix': [[0.304194, 0.215718, -0.927864], [0.952563, -0.078625, 0.294012], [-0.009529, -0.973285, -0.229402]], 'translation vector': [3.747529, 1.6658, 1.453625]}\nC: {'rotation matrix': [[0.9998537516460376, 0.013008935130727867, -0.011104836010819949], [-0.013009085999586367, 0.9999155647701683, 0.00015274967336381323], [0.011105421497037882, -8.408899546142835e-06, 0.9999381123179228]], 
'translation vector': [0.00936290533716333, 0.003069967953460928, -0.010760020039624951]}\nD: {'rotation matrix': [[0.309341, 0.212762, -0.926844], [0.950827, -0.084926, 0.297851], [-0.015342, -0.973406, -0.228571]], 'translation vector': [3.731516, 1.660707, 1.454311]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.306256, 0.214312, -0.927512], [0.951867, -0.081783, 0.295401], [-0.012547, -0.973336, -0.229043]], 'translation vector': [3.740017, 1.664374, 1.453227]}\nB: {'rotation matrix': [[0.304194, 0.215718, -0.927864], [0.952563, -0.078625, 0.294012], [-0.009529, -0.973285, -0.229402]], 'translation vector': [3.747529, 1.6658, 1.453625]}\nC: {'rotation matrix': [[0.9998537516460376, 0.013008935130727867, -0.011104836010819949], [-0.013009085999586367, 0.9999155647701683, 0.00015274967336381323], [0.011105421497037882, -8.408899546142835e-06, 0.9999381123179228]], 'translation vector': [0.00936290533716333, 0.003069967953460928, -0.010760020039624951]}\nD: {'rotation matrix': [[0.309341, 0.212762, -0.926844], [0.950827, -0.084926, 0.297851], [-0.015342, -0.973406, -0.228571]], 'translation vector': [3.731516, 1.660707, 1.454311]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_165_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_165_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_165_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_165_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.226506, -0.721874, 0.653906], [-0.969583, -0.103174, 0.221955], [-0.092757, -0.68429, -0.723287]], 'translation vector': [2.104302, 2.429349, 1.38499]}\nB: {'rotation matrix': [[0.223513, -0.721637, 0.655197], [-0.970242, -0.100499, 0.220298], [-0.093129, -0.684938, -0.722625]], 'translation vector': [2.105446, 2.427759, 1.384995]}\nC: {'rotation matrix': [[0.22885, -0.719341, 0.655878], [-0.96905, -0.104271, 0.223761], [-0.092571, -0.686787, -0.72094]], 'translation vector': [2.102429, 2.429695, 1.385047]}\nD: {'rotation matrix': [[0.9999909282503874, 0.0026675007233110163, 0.003038726855301391], [-0.0026681297586413737, 0.999996473984971, 2.7861207662840692e-05], [-0.0030386998838646344, -3.5943923369286046e-05, 0.9999953589686927]], 'translation vector': [0.002364456746013932, -0.0010737235666011813, -0.00037719790048196256]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.226506, -0.721874, 0.653906], [-0.969583, -0.103174, 0.221955], [-0.092757, -0.68429, -0.723287]], 'translation vector': [2.104302, 2.429349, 1.38499]}\nB: {'rotation matrix': [[0.223513, -0.721637, 0.655197], [-0.970242, -0.100499, 0.220298], [-0.093129, -0.684938, -0.722625]], 'translation vector': [2.105446, 2.427759, 1.384995]}\nC: {'rotation matrix': [[0.22885, -0.719341, 0.655878], [-0.96905, -0.104271, 0.223761], [-0.092571, -0.686787, -0.72094]], 'translation vector': [2.102429, 2.429695, 1.385047]}\nD: {'rotation matrix': [[0.9999909282503874, 0.0026675007233110163, 0.003038726855301391], [-0.0026681297586413737, 0.999996473984971, 2.7861207662840692e-05], [-0.0030386998838646344, -3.5943923369286046e-05, 0.9999953589686927]], 'translation vector': [0.002364456746013932, -0.0010737235666011813, -0.00037719790048196256]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_166_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_166_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_166_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_166_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.711418, -0.467017, 0.525147], [-0.700604, 0.529926, -0.477841], [-0.055129, -0.707865, -0.704193]], 'translation vector': [2.529564, 4.393072, 1.526695]}\nB: {'rotation matrix': [[0.9999966648975326, -0.0024113488419032422, -0.0008172418163523504], [0.0024114327512952905, 0.9999969977952643, 0.0006432367723817748], [0.0008165042296219522, -0.0006466099152102373, 0.9999990767691678]], 'translation vector': [-0.006437670952323948, -0.005367763877482813, 0.00013241538331776326]}\nC: {'rotation matrix': [[-0.711906, -0.467075, 0.524433], [-0.700166, 
0.529878, -0.478536], [-0.054374, -0.707863, -0.704254]], 'translation vector': [2.530244, 4.39346, 1.526741]}\nD: {'rotation matrix': [[-0.711605, -0.467107, 0.524814], [-0.700478, 0.529425, -0.47858], [-0.054301, -0.708181, -0.70394]], 'translation vector': [2.529967, 4.393585, 1.525543]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.711418, -0.467017, 0.525147], [-0.700604, 0.529926, -0.477841], [-0.055129, -0.707865, -0.704193]], 'translation vector': [2.529564, 4.393072, 1.526695]}\nB: {'rotation matrix': [[0.9999966648975326, -0.0024113488419032422, -0.0008172418163523504], [0.0024114327512952905, 0.9999969977952643, 0.0006432367723817748], [0.0008165042296219522, -0.0006466099152102373, 0.9999990767691678]], 'translation vector': [-0.006437670952323948, -0.005367763877482813, 0.00013241538331776326]}\nC: {'rotation matrix': [[-0.711906, -0.467075, 0.524433], [-0.700166, 0.529878, -0.478536], [-0.054374, -0.707863, -0.704254]], 'translation vector': [2.530244, 4.39346, 1.526741]}\nD: {'rotation matrix': [[-0.711605, -0.467107, 0.524814], [-0.700478, 0.529425, -0.47858], [-0.054301, -0.708181, -0.70394]], 'translation vector': [2.529967, 4.393585, 1.525543]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_167_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_167_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_167_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_167_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + 
"visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.61412, -0.406634, 0.676392], [-0.788751, 0.286898, -0.543656], [0.027014, -0.867374, -0.496922]], 'translation vector': [1.884445, 2.364432, 1.389567]}\nB: {'rotation matrix': [[0.9999800934681147, 0.006249341845585854, -0.0009789493583592457], [-0.006247228158252656, 0.9999797870897016, 0.0016201321535261565], [0.0009895210241494333, -0.0016143397944822075, 0.9999982198013821]], 'translation vector': [-0.0069672096971418185, 0.0007707556249330061, 0.0022308491982188094]}\nC: {'rotation matrix': [[-0.614991, -0.407345, 0.675171], [-0.788025, 0.286731, -0.544795], [0.028327, -0.867096, -0.497335]], 'translation vector': [1.885989, 2.365962, 1.389016]}\nD: {'rotation matrix': [[-0.615135, -0.40626, 0.675693], [-0.787872, 0.284761, -0.546048], [0.029426, -0.868253, -0.495248]], 'translation vector': [1.88807, 2.366622, 1.388041]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.61412, -0.406634, 0.676392], [-0.788751, 0.286898, -0.543656], [0.027014, -0.867374, -0.496922]], 'translation vector': [1.884445, 2.364432, 1.389567]}\nB: {'rotation matrix': [[0.9999800934681147, 0.006249341845585854, -0.0009789493583592457], [-0.006247228158252656, 0.9999797870897016, 0.0016201321535261565], [0.0009895210241494333, -0.0016143397944822075, 0.9999982198013821]], 'translation vector': [-0.0069672096971418185, 0.0007707556249330061, 0.0022308491982188094]}\nC: {'rotation matrix': [[-0.614991, -0.407345, 0.675171], [-0.788025, 0.286731, -0.544795], [0.028327, -0.867096, -0.497335]], 'translation vector': [1.885989, 2.365962, 1.389016]}\nD: {'rotation matrix': [[-0.615135, -0.40626, 0.675693], [-0.787872, 0.284761, -0.546048], [0.029426, -0.868253, -0.495248]], 'translation vector': [1.88807, 2.366622, 1.388041]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_168_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_168_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_168_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_168_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999912398243648, -0.003460941321399837, -0.0022122658857095193], [0.003455741372162686, 0.9999910905464573, -0.0022383864983849893], [0.0022193515549835062, 0.0022307044509074837, 0.999995073124285]], 'translation vector': [-0.001957679288641462, -0.0024920290268601875, -0.000907578453192226]}\nB: {'rotation matrix': [[0.67484, -0.325973, 0.662067], [-0.73754, -0.328352, 0.590102], [0.025034, -0.886525, -0.462003]], 'translation vector': [2.869569, 2.417867, 1.545271]}\nC: {'rotation matrix': [[0.67798, -0.325694, 0.658989], [-0.734824, -0.323965, 
0.595886], [0.019413, -0.88824, -0.45897]], 'translation vector': [2.868894, 2.415756, 1.54509]}\nD: {'rotation matrix': [[0.682626, -0.324357, 0.654839], [-0.73069, -0.316101, 0.605122], [0.010719, -0.891556, -0.452783]], 'translation vector': [2.86653, 2.411599, 1.544608]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999912398243648, -0.003460941321399837, -0.0022122658857095193], [0.003455741372162686, 0.9999910905464573, -0.0022383864983849893], [0.0022193515549835062, 0.0022307044509074837, 0.999995073124285]], 'translation vector': [-0.001957679288641462, -0.0024920290268601875, -0.000907578453192226]}\nB: {'rotation matrix': [[0.67484, -0.325973, 0.662067], [-0.73754, -0.328352, 0.590102], [0.025034, -0.886525, -0.462003]], 'translation vector': [2.869569, 2.417867, 1.545271]}\nC: {'rotation matrix': [[0.67798, -0.325694, 0.658989], [-0.734824, -0.323965, 0.595886], [0.019413, -0.88824, -0.45897]], 'translation vector': [2.868894, 2.415756, 1.54509]}\nD: {'rotation matrix': [[0.682626, -0.324357, 0.654839], [-0.73069, -0.316101, 0.605122], [0.010719, -0.891556, -0.452783]], 'translation vector': [2.86653, 2.411599, 1.544608]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_169_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_169_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_169_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_169_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.999502, 0.00746, 0.030678], [-0.028996, 0.167572, -0.985433], [-0.012492, -0.985832, -0.167272]], 'translation vector': [6.682728, 5.426456, 1.759702]}\nB: {'rotation matrix': [[-0.999516, 0.005588, 0.030613], [-0.029207, 0.171126, -0.984816], [-0.010741, -0.985233, -0.17088]], 'translation vector': [6.687027, 5.423337, 1.762554]}\nC: {'rotation matrix': [[-0.999427, 0.005452, 0.0334], [-0.031967, 0.171859, -0.984603], [-0.011109, -0.985106, -0.171586]], 'translation vector': [6.682628, 5.424977, 1.756356]}\nD: {'rotation matrix': [[0.9999959031714618, 0.0006283915085459037, 0.0029514431388234703], [-0.0006213832896176772, 0.9999971711296258, -0.0021323447476004893], [-0.0029538525457832297, 0.002130012321803009, 0.9999931612377413]], 'translation vector': [-0.006207505850969852, 0.015496225089857818, -0.006213786764486251]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.999502, 0.00746, 0.030678], [-0.028996, 0.167572, -0.985433], [-0.012492, -0.985832, -0.167272]], 'translation vector': [6.682728, 5.426456, 1.759702]}\nB: {'rotation matrix': [[-0.999516, 0.005588, 0.030613], [-0.029207, 0.171126, -0.984816], [-0.010741, -0.985233, -0.17088]], 'translation vector': [6.687027, 5.423337, 1.762554]}\nC: {'rotation matrix': [[-0.999427, 0.005452, 0.0334], [-0.031967, 0.171859, -0.984603], [-0.011109, -0.985106, -0.171586]], 'translation vector': [6.682628, 5.424977, 1.756356]}\nD: {'rotation matrix': [[0.9999959031714618, 0.0006283915085459037, 0.0029514431388234703], [-0.0006213832896176772, 0.9999971711296258, -0.0021323447476004893], [-0.0029538525457832297, 0.002130012321803009, 0.9999931612377413]], 'translation vector': [-0.006207505850969852, 0.015496225089857818, -0.006213786764486251]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_170_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_170_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_170_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_170_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.106477, -0.473799, 0.874173], [-0.992913, 0.097339, -0.068183], [-0.052786, -0.875237, -0.480805]], 'translation vector': [4.553204, 3.149855, 1.246823]}\nB: {'rotation matrix': [[-0.115243, -0.46998, 0.875122], [-0.991961, 0.100818, -0.076485], [-0.052282, -0.876901, -0.47782]], 'translation vector': [4.553743, 3.152171, 1.246409]}\nC: {'rotation matrix': [[0.999828950585133, 0.0011190525701963586, -0.018454829607259946], [-0.001416468295998283, 0.9998678811169985, -0.016145155593255522], [0.018434708990961175, 0.016168544891222898, 0.9996994385524549]], 
'translation vector': [0.002077144261483088, 0.01271199054625649, -0.0002353155159546816]}\nD: {'rotation matrix': [[-0.120252, -0.468652, 0.87516], [-0.991418, 0.102239, -0.081478], [-0.051291, -0.877447, -0.476924]], 'translation vector': [4.555783, 3.154248, 1.246329]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.106477, -0.473799, 0.874173], [-0.992913, 0.097339, -0.068183], [-0.052786, -0.875237, -0.480805]], 'translation vector': [4.553204, 3.149855, 1.246823]}\nB: {'rotation matrix': [[-0.115243, -0.46998, 0.875122], [-0.991961, 0.100818, -0.076485], [-0.052282, -0.876901, -0.47782]], 'translation vector': [4.553743, 3.152171, 1.246409]}\nC: {'rotation matrix': [[0.999828950585133, 0.0011190525701963586, -0.018454829607259946], [-0.001416468295998283, 0.9998678811169985, -0.016145155593255522], [0.018434708990961175, 0.016168544891222898, 0.9996994385524549]], 'translation vector': [0.002077144261483088, 0.01271199054625649, -0.0002353155159546816]}\nD: {'rotation matrix': [[-0.120252, -0.468652, 0.87516], [-0.991418, 0.102239, -0.081478], [-0.051291, -0.877447, -0.476924]], 'translation vector': [4.555783, 3.154248, 1.246329]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_171_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_171_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_171_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_171_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.927045, 0.223636, -0.300956], [0.364198, 0.346229, -0.864573], [-0.08915, -0.911105, -0.402418]], 'translation vector': [7.648557, 2.747808, 1.440051]}\nB: {'rotation matrix': [[0.9999939386175598, 0.0009488285827030999, 0.0032872470438982896], [-0.0009337892397183689, 0.9999879134504162, -0.004824212466184848], [-0.0032917475553783573, 0.004820430801235839, 0.9999833362205853]], 'translation vector': [-0.000493455857793812, -0.002698981007177137, 0.0012657934234763246]}\nC: {'rotation matrix': [[-0.9261, 0.223085, -0.304257], [0.366658, 0.34218, -0.865144], [-0.08889, -0.912768, -0.398689]], 'translation vector': [7.650569, 2.747621, 1.441708]}\nD: {'rotation matrix': [[-0.927432, 0.22366, -0.299744], [0.363522, 0.350792, -0.863016], [-0.087874, -0.909352, -0.406641]], 'translation vector': [7.650677, 2.747929, 1.439487]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.927045, 0.223636, -0.300956], [0.364198, 0.346229, -0.864573], [-0.08915, -0.911105, -0.402418]], 'translation vector': [7.648557, 2.747808, 1.440051]}\nB: {'rotation matrix': [[0.9999939386175598, 0.0009488285827030999, 0.0032872470438982896], [-0.0009337892397183689, 0.9999879134504162, -0.004824212466184848], [-0.0032917475553783573, 0.004820430801235839, 0.9999833362205853]], 'translation vector': [-0.000493455857793812, -0.002698981007177137, 0.0012657934234763246]}\nC: {'rotation matrix': [[-0.9261, 0.223085, -0.304257], [0.366658, 0.34218, -0.865144], [-0.08889, -0.912768, -0.398689]], 'translation vector': [7.650569, 2.747621, 1.441708]}\nD: {'rotation matrix': [[-0.927432, 0.22366, -0.299744], [0.363522, 0.350792, -0.863016], [-0.087874, -0.909352, -0.406641]], 'translation vector': [7.650677, 2.747929, 1.439487]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_172_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_172_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_172_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_172_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.598936, 0.355502, -0.717562], [0.800003, -0.305531, 0.516379], [-0.035664, -0.883329, -0.467396]], 'translation vector': [5.964795, 1.444893, 1.32602]}\nB: {'rotation matrix': [[0.600188, 0.357296, -0.715622], [0.799089, -0.307102, 0.516861], [-0.035096, -0.882059, -0.46983]], 'translation vector': [5.950611, 1.450679, 1.325211]}\nC: {'rotation matrix': [[0.9999898156973296, -0.003952939496578174, 0.0024431521094752146], [0.003942856258073881, 0.9999843026055344, 0.004096575088743528], [-0.0024598279169757275, -0.004085831486072852, 0.9999883270401599]], 
'translation vector': [-0.0051631563465806, -0.010227260523923531, 0.01366119668418353]}\nD: {'rotation matrix': [[0.593595, 0.358896, -0.720305], [0.803944, -0.304849, 0.510628], [-0.036322, -0.882191, -0.469489]], 'translation vector': [5.978991, 1.441471, 1.326102]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.598936, 0.355502, -0.717562], [0.800003, -0.305531, 0.516379], [-0.035664, -0.883329, -0.467396]], 'translation vector': [5.964795, 1.444893, 1.32602]}\nB: {'rotation matrix': [[0.600188, 0.357296, -0.715622], [0.799089, -0.307102, 0.516861], [-0.035096, -0.882059, -0.46983]], 'translation vector': [5.950611, 1.450679, 1.325211]}\nC: {'rotation matrix': [[0.9999898156973296, -0.003952939496578174, 0.0024431521094752146], [0.003942856258073881, 0.9999843026055344, 0.004096575088743528], [-0.0024598279169757275, -0.004085831486072852, 0.9999883270401599]], 'translation vector': [-0.0051631563465806, -0.010227260523923531, 0.01366119668418353]}\nD: {'rotation matrix': [[0.593595, 0.358896, -0.720305], [0.803944, -0.304849, 0.510628], [-0.036322, -0.882191, -0.469489]], 'translation vector': [5.978991, 1.441471, 1.326102]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_173_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_173_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_173_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_173_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.983068, 0.048576, -0.176687], [0.181735, -0.381917, 0.906152], [-0.023462, -0.922919, -0.384278]], 'translation vector': [2.212073, 3.484547, 1.465708]}\nB: {'rotation matrix': [[0.982686, 0.047982, -0.178958], [0.183606, -0.38172, 0.905858], [-0.024847, -0.923032, -0.38392]], 'translation vector': [2.212621, 3.48432, 1.466163]}\nC: {'rotation matrix': [[0.983078, 0.050132, -0.176192], [0.181887, -0.381466, 0.906312], [-0.021776, -0.923023, -0.384129]], 'translation vector': [2.213536, 3.486831, 1.465259]}\nD: {'rotation matrix': [[0.9999889301916728, 0.0015926412893515843, 0.004480728220099604], [-0.001621568987345314, 0.9999775972099906, 0.006504872079739442], [-0.004470952780905997, -0.0065123882198827605, 0.9999681868018854]], 'translation vector': [-0.002104900488902217, -0.0034679151915213424, 0.0012659901948626207]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.983068, 0.048576, -0.176687], [0.181735, -0.381917, 0.906152], [-0.023462, -0.922919, -0.384278]], 'translation vector': [2.212073, 3.484547, 1.465708]}\nB: {'rotation matrix': [[0.982686, 0.047982, -0.178958], [0.183606, -0.38172, 0.905858], [-0.024847, -0.923032, -0.38392]], 'translation vector': [2.212621, 3.48432, 1.466163]}\nC: {'rotation matrix': [[0.983078, 0.050132, -0.176192], [0.181887, -0.381466, 0.906312], [-0.021776, -0.923023, -0.384129]], 'translation vector': [2.213536, 3.486831, 1.465259]}\nD: {'rotation matrix': [[0.9999889301916728, 0.0015926412893515843, 0.004480728220099604], [-0.001621568987345314, 0.9999775972099906, 0.006504872079739442], [-0.004470952780905997, -0.0065123882198827605, 0.9999681868018854]], 'translation vector': [-0.002104900488902217, -0.0034679151915213424, 0.0012659901948626207]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_174_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_174_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_174_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_174_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999643490106183, 0.005007713450141965, -0.006820932250517381], [-0.004994218544548531, 0.9999851347119364, 0.0020857596614642536], [0.0068306095153302035, -0.0020519868188517945, 0.9999742349784666]], 'translation vector': [-0.002699516524657053, 0.0005464771955957654, -0.0009563459127281959]}\nB: {'rotation matrix': [[0.549558, 0.430394, -0.716064], [0.833614, -0.2256, 0.504176], [0.05545, -0.873994, -0.482762]], 'translation vector': [3.109701, 1.26111, 1.347453]}\nC: {'rotation matrix': [[0.545357, 0.429527, -0.719787], [0.836166, -0.218941, 0.502882], 
[0.05841, -0.876111, -0.478557]], 'translation vector': [3.10956, 1.258833, 1.347276]}\nD: {'rotation matrix': [[0.548441, 0.430496, -0.716859], [0.834301, -0.22414, 0.503689], [0.056159, -0.874319, -0.482091]], 'translation vector': [3.109132, 1.259955, 1.347698]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999643490106183, 0.005007713450141965, -0.006820932250517381], [-0.004994218544548531, 0.9999851347119364, 0.0020857596614642536], [0.0068306095153302035, -0.0020519868188517945, 0.9999742349784666]], 'translation vector': [-0.002699516524657053, 0.0005464771955957654, -0.0009563459127281959]}\nB: {'rotation matrix': [[0.549558, 0.430394, -0.716064], [0.833614, -0.2256, 0.504176], [0.05545, -0.873994, -0.482762]], 'translation vector': [3.109701, 1.26111, 1.347453]}\nC: {'rotation matrix': [[0.545357, 0.429527, -0.719787], [0.836166, -0.218941, 0.502882], [0.05841, -0.876111, -0.478557]], 'translation vector': [3.10956, 1.258833, 1.347276]}\nD: {'rotation matrix': [[0.548441, 0.430496, -0.716859], [0.834301, -0.22414, 0.503689], [0.056159, -0.874319, -0.482091]], 'translation vector': [3.109132, 1.259955, 1.347698]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_175_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_175_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_175_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_175_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.9999664473886144, -4.097872539031611e-05, -0.00815329503789991], [4.450303663576986e-05, 1.0000000556925084, 0.0005279321342564721], [0.008153564277134814, -0.0005283064017830043, 0.99996718864845]], 'translation vector': [0.0006960777394775519, 0.0030013724924222718, -0.001027289568414247]}\nB: {'rotation matrix': [[-0.825443, 0.242757, -0.509621], [0.56437, 0.373207, -0.736345], [0.011442, -0.895425, -0.445066]], 'translation vector': [4.848658, 2.610627, 1.449985]}\nC: {'rotation matrix': [[-0.825701, 0.242217, -0.50946], [0.563992, 0.37286, -0.73681], [0.01149, -0.895716, -0.444479]], 'translation vector': [4.848603, 2.611202, 1.449781]}\nD: {'rotation matrix': [[-0.825281, 0.242861, -0.509834], [0.5646, 0.373686, -0.735925], [0.011791, -0.895197, -0.445515]], 'translation vector': [4.848519, 2.6109, 1.44995]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999664473886144, -4.097872539031611e-05, -0.00815329503789991], [4.450303663576986e-05, 1.0000000556925084, 0.0005279321342564721], [0.008153564277134814, -0.0005283064017830043, 0.99996718864845]], 'translation vector': [0.0006960777394775519, 0.0030013724924222718, -0.001027289568414247]}\nB: {'rotation matrix': [[-0.825443, 0.242757, -0.509621], [0.56437, 0.373207, -0.736345], [0.011442, -0.895425, -0.445066]], 'translation vector': [4.848658, 2.610627, 1.449985]}\nC: {'rotation matrix': [[-0.825701, 0.242217, -0.50946], [0.563992, 0.37286, -0.73681], [0.01149, -0.895716, -0.444479]], 'translation vector': [4.848603, 2.611202, 1.449781]}\nD: {'rotation matrix': [[-0.825281, 0.242861, -0.509834], [0.5646, 0.373686, -0.735925], [0.011791, -0.895197, -0.445515]], 'translation vector': [4.848519, 2.6109, 1.44995]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_176_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_176_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_176_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_176_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.820406, -0.123018, 0.558391], [-0.564425, -0.330389, 0.756484], [0.091425, -0.935794, -0.340487]], 'translation vector': [1.795617, 2.461673, 1.379824]}\nB: {'rotation matrix': [[0.820181, -0.122779, 0.558774], [-0.564668, -0.330702, 0.756166], [0.091946, -0.935714, -0.340565]], 'translation vector': [1.795446, 2.463577, 1.379349]}\nC: {'rotation matrix': [[0.9999965570678159, 0.002414373935934805, -0.0013910970690502594], [-0.0024092149366839086, 0.9999893972371463, 0.004094256181755705], [0.0014001974871553952, -0.0040912377586492955, 0.9999902946910975]], 
'translation vector': [0.0015616232476808878, 0.0015124074702654866, -0.0024993421767338653]}\nD: {'rotation matrix': [[0.81953, -0.122236, 0.559847], [-0.565374, -0.331709, 0.755196], [0.093395, -0.935429, -0.340955]], 'translation vector': [1.794011, 2.466618, 1.378419]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.820406, -0.123018, 0.558391], [-0.564425, -0.330389, 0.756484], [0.091425, -0.935794, -0.340487]], 'translation vector': [1.795617, 2.461673, 1.379824]}\nB: {'rotation matrix': [[0.820181, -0.122779, 0.558774], [-0.564668, -0.330702, 0.756166], [0.091946, -0.935714, -0.340565]], 'translation vector': [1.795446, 2.463577, 1.379349]}\nC: {'rotation matrix': [[0.9999965570678159, 0.002414373935934805, -0.0013910970690502594], [-0.0024092149366839086, 0.9999893972371463, 0.004094256181755705], [0.0014001974871553952, -0.0040912377586492955, 0.9999902946910975]], 'translation vector': [0.0015616232476808878, 0.0015124074702654866, -0.0024993421767338653]}\nD: {'rotation matrix': [[0.81953, -0.122236, 0.559847], [-0.565374, -0.331709, 0.755196], [0.093395, -0.935429, -0.340955]], 'translation vector': [1.794011, 2.466618, 1.378419]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_177_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_177_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_177_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_177_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.802386, 0.058378, -0.593943], [0.596799, 0.07378, -0.798992], [-0.002822, -0.995564, -0.09404]], 'translation vector': [2.583445, 4.00863, 1.432702]}\nB: {'rotation matrix': [[-0.80243, 0.05764, -0.593956], [0.596739, 0.072442, -0.799159], [-0.003036, -0.995706, -0.092526]], 'translation vector': [2.583423, 4.00901, 1.432499]}\nC: {'rotation matrix': [[-0.802147, 0.057229, -0.594377], [0.597116, 0.07095, -0.799012], [-0.003555, -0.995837, -0.091085]], 'translation vector': [2.584156, 4.008181, 1.433043]}\nD: {'rotation matrix': [[0.9999994783035012, 0.00041051927851740725, -0.0005862593604151125], [-0.00040931866521127987, 0.9999994989245719, 0.001478878293238192], [0.0005866244600769685, -0.00147872503352613, 0.9999988017009569]], 'translation vector': [0.00041364848275698973, -0.004321265289284559, -0.00018345117394513721]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.802386, 0.058378, -0.593943], [0.596799, 0.07378, -0.798992], [-0.002822, -0.995564, -0.09404]], 'translation vector': [2.583445, 4.00863, 1.432702]}\nB: {'rotation matrix': [[-0.80243, 0.05764, -0.593956], [0.596739, 0.072442, -0.799159], [-0.003036, -0.995706, -0.092526]], 'translation vector': [2.583423, 4.00901, 1.432499]}\nC: {'rotation matrix': [[-0.802147, 0.057229, -0.594377], [0.597116, 0.07095, -0.799012], [-0.003555, -0.995837, -0.091085]], 'translation vector': [2.584156, 4.008181, 1.433043]}\nD: {'rotation matrix': [[0.9999994783035012, 0.00041051927851740725, -0.0005862593604151125], [-0.00040931866521127987, 0.9999994989245719, 0.001478878293238192], [0.0005866244600769685, -0.00147872503352613, 0.9999988017009569]], 'translation vector': [0.00041364848275698973, -0.004321265289284559, -0.00018345117394513721]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_178_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_178_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_178_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_178_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.992585, -0.064418, 0.103079], [-0.120896, -0.435169, 0.892195], [-0.012617, -0.898041, -0.43973]], 'translation vector': [3.286474, 2.568909, 1.509796]}\nB: {'rotation matrix': [[0.992385, -0.067834, 0.102811], [-0.122233, -0.439432, 0.889921], [-0.015188, -0.895711, -0.444377]], 'translation vector': [3.289696, 2.56831, 1.509591]}\nC: {'rotation matrix': [[0.9999807964258803, 0.0038860910275422844, -0.004806691946462499], [-0.003909951367268014, 0.9999796651555257, -0.005033098084012006], [0.004787399082777756, 0.005051536745205018, 0.999976147031262]], 
'translation vector': [-0.001554233624264434, -0.002644430194159053, -0.0006288644424583545]}\nD: {'rotation matrix': [[0.992758, -0.062357, 0.102681], [-0.119576, -0.430727, 0.894525], [-0.011552, -0.900325, -0.435064]], 'translation vector': [3.283188, 2.568117, 1.510042]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.992585, -0.064418, 0.103079], [-0.120896, -0.435169, 0.892195], [-0.012617, -0.898041, -0.43973]], 'translation vector': [3.286474, 2.568909, 1.509796]}\nB: {'rotation matrix': [[0.992385, -0.067834, 0.102811], [-0.122233, -0.439432, 0.889921], [-0.015188, -0.895711, -0.444377]], 'translation vector': [3.289696, 2.56831, 1.509591]}\nC: {'rotation matrix': [[0.9999807964258803, 0.0038860910275422844, -0.004806691946462499], [-0.003909951367268014, 0.9999796651555257, -0.005033098084012006], [0.004787399082777756, 0.005051536745205018, 0.999976147031262]], 'translation vector': [-0.001554233624264434, -0.002644430194159053, -0.0006288644424583545]}\nD: {'rotation matrix': [[0.992758, -0.062357, 0.102681], [-0.119576, -0.430727, 0.894525], [-0.011552, -0.900325, -0.435064]], 'translation vector': [3.283188, 2.568117, 1.510042]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_179_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_179_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_179_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_179_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.422089, -0.487348, 0.764417], [-0.906483, 0.237506, -0.349114], [-0.011414, -0.840287, -0.542021]], 'translation vector': [1.410195, 1.210537, 1.389714]}\nB: {'rotation matrix': [[-0.429509, -0.487246, 0.760337], [-0.903019, 0.239935, -0.356353], [-0.0088, -0.839656, -0.543047]], 'translation vector': [1.408282, 1.210133, 1.390728]}\nC: {'rotation matrix': [[0.9999401042348363, 0.0004722608940826321, -0.010894896595028812], [-0.0004891614331012371, 0.9999986952391333, -0.0015551949611865112], [0.010893168247730851, 0.0015589513371194136, 0.9999394642924967]], 'translation vector': [-0.002554071705175298, -0.000426511698759402, -0.0005788025297613075]}\nD: {'rotation matrix': [[-0.426247, -0.487547, 0.761978], [-0.904552, 0.238954, -0.353109], [-0.00992, -0.839761, -0.542865]], 'translation vector': [1.409365, 1.209862, 1.390977]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.422089, -0.487348, 0.764417], [-0.906483, 0.237506, -0.349114], [-0.011414, -0.840287, -0.542021]], 'translation vector': [1.410195, 1.210537, 1.389714]}\nB: {'rotation matrix': [[-0.429509, -0.487246, 0.760337], [-0.903019, 0.239935, -0.356353], [-0.0088, -0.839656, -0.543047]], 'translation vector': [1.408282, 1.210133, 1.390728]}\nC: {'rotation matrix': [[0.9999401042348363, 0.0004722608940826321, -0.010894896595028812], [-0.0004891614331012371, 0.9999986952391333, -0.0015551949611865112], [0.010893168247730851, 0.0015589513371194136, 0.9999394642924967]], 'translation vector': [-0.002554071705175298, -0.000426511698759402, -0.0005788025297613075]}\nD: {'rotation matrix': [[-0.426247, -0.487547, 0.761978], [-0.904552, 0.238954, -0.353109], [-0.00992, -0.839761, -0.542865]], 'translation vector': [1.409365, 1.209862, 1.390977]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_180_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_180_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_180_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_180_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.471816, -0.325425, 0.819444], [-0.872235, -0.30807, 0.379868], [0.128827, -0.893975, -0.429199]], 'translation vector': [4.769558, 1.138603, 1.289356]}\nB: {'rotation matrix': [[0.9997418114978017, 0.011895326282976197, -0.019384512225947795], [-0.0117477819809795, 0.9999024052361882, 0.007665229365566708], [0.019474149960248398, -0.007435647706830853, 0.99978314162453]], 'translation vector': [0.0024302909823366026, 0.0025910713671302155, 0.004134808496160325]}\nC: {'rotation matrix': [[0.463265, -0.315518, 0.828151], [-0.87672, -0.299624, 0.376281], 
[0.129411, -0.900374, -0.415426]], 'translation vector': [4.764074, 1.139958, 1.290116]}\nD: {'rotation matrix': [[0.466198, -0.319071, 0.825139], [-0.875193, -0.302567, 0.377479], [0.129217, -0.898136, -0.420304]], 'translation vector': [4.766454, 1.138272, 1.288707]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.471816, -0.325425, 0.819444], [-0.872235, -0.30807, 0.379868], [0.128827, -0.893975, -0.429199]], 'translation vector': [4.769558, 1.138603, 1.289356]}\nB: {'rotation matrix': [[0.9997418114978017, 0.011895326282976197, -0.019384512225947795], [-0.0117477819809795, 0.9999024052361882, 0.007665229365566708], [0.019474149960248398, -0.007435647706830853, 0.99978314162453]], 'translation vector': [0.0024302909823366026, 0.0025910713671302155, 0.004134808496160325]}\nC: {'rotation matrix': [[0.463265, -0.315518, 0.828151], [-0.87672, -0.299624, 0.376281], [0.129411, -0.900374, -0.415426]], 'translation vector': [4.764074, 1.139958, 1.290116]}\nD: {'rotation matrix': [[0.466198, -0.319071, 0.825139], [-0.875193, -0.302567, 0.377479], [0.129217, -0.898136, -0.420304]], 'translation vector': [4.766454, 1.138272, 1.288707]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_181_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_181_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_181_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_181_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.782202, 0.158811, -0.602444], [0.623011, 0.192986, -0.758033], [-0.004121, -0.968264, -0.249895]], 'translation vector': [5.112607, 3.166242, 1.386639]}\nB: {'rotation matrix': [[-0.778966, 0.157543, -0.606954], [0.627051, 0.189047, -0.75569], [-0.00431, -0.969248, -0.246049]], 'translation vector': [5.115294, 3.157473, 1.383296]}\nC: {'rotation matrix': [[-0.779462, 0.157557, -0.606313], [0.62644, 0.190695, -0.755783], [-0.003458, -0.968923, -0.247339]], 'translation vector': [5.116126, 3.162086, 1.384797]}\nD: {'rotation matrix': [[0.9999744617727225, -0.0001956738250342968, -0.007090775323277213], [0.00023566199396404743, 0.9999840487337286, 0.005570125228072977], [0.007090207625955035, -0.005572126334409686, 0.9999593181215198]], 'translation vector': [0.001060093909475146, -0.0011413036070948707, -0.0054511706559239315]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.782202, 0.158811, -0.602444], [0.623011, 0.192986, -0.758033], [-0.004121, -0.968264, -0.249895]], 'translation vector': [5.112607, 3.166242, 1.386639]}\nB: {'rotation matrix': [[-0.778966, 0.157543, -0.606954], [0.627051, 0.189047, -0.75569], [-0.00431, -0.969248, -0.246049]], 'translation vector': [5.115294, 3.157473, 1.383296]}\nC: {'rotation matrix': [[-0.779462, 0.157557, -0.606313], [0.62644, 0.190695, -0.755783], [-0.003458, -0.968923, -0.247339]], 'translation vector': [5.116126, 3.162086, 1.384797]}\nD: {'rotation matrix': [[0.9999744617727225, -0.0001956738250342968, -0.007090775323277213], [0.00023566199396404743, 0.9999840487337286, 0.005570125228072977], [0.007090207625955035, -0.005572126334409686, 0.9999593181215198]], 'translation vector': [0.001060093909475146, -0.0011413036070948707, -0.0054511706559239315]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_182_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_182_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_182_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_182_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.883351, 0.250777, -0.395983], [0.468408, -0.502792, 0.726495], [-0.016909, -0.827231, -0.561607]], 'translation vector': [3.460753, 1.393703, 1.261616]}\nB: {'rotation matrix': [[0.9999974556820361, 0.0020951454008892073, 0.0005388626757519509], [-0.0020947728022776887, 0.9999969776383335, -0.0008772711173625334], [-0.0005408474142489815, 0.0008761512281933679, 0.9999997549182328]], 'translation vector': [0.002023718546634523, -0.000327483901387704, 0.000534647049254211]}\nC: {'rotation matrix': [[0.882846, 0.250522, -0.397268], [0.469328, -0.502506, 
0.726099], [-0.017726, -0.827482, -0.561212]], 'translation vector': [3.46034, 1.393395, 1.261018]}\nD: {'rotation matrix': [[0.883505, 0.250774, -0.395641], [0.468134, -0.50232, 0.726998], [-0.016426, -0.827519, -0.561198]], 'translation vector': [3.461493, 1.393772, 1.262191]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.883351, 0.250777, -0.395983], [0.468408, -0.502792, 0.726495], [-0.016909, -0.827231, -0.561607]], 'translation vector': [3.460753, 1.393703, 1.261616]}\nB: {'rotation matrix': [[0.9999974556820361, 0.0020951454008892073, 0.0005388626757519509], [-0.0020947728022776887, 0.9999969776383335, -0.0008772711173625334], [-0.0005408474142489815, 0.0008761512281933679, 0.9999997549182328]], 'translation vector': [0.002023718546634523, -0.000327483901387704, 0.000534647049254211]}\nC: {'rotation matrix': [[0.882846, 0.250522, -0.397268], [0.469328, -0.502506, 0.726099], [-0.017726, -0.827482, -0.561212]], 'translation vector': [3.46034, 1.393395, 1.261018]}\nD: {'rotation matrix': [[0.883505, 0.250774, -0.395641], [0.468134, -0.50232, 0.726998], [-0.016426, -0.827519, -0.561198]], 'translation vector': [3.461493, 1.393772, 1.262191]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_183_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_183_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_183_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_183_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.082095, -0.690035, 0.719105], [-0.996618, 0.054259, -0.061711], [0.003565, -0.72174, -0.692155]], 'translation vector': [1.142854, 0.964299, 1.384999]}\nB: {'rotation matrix': [[-0.082522, -0.690473, 0.718636], [-0.99657, 0.052619, -0.063881], [0.006294, -0.721442, -0.692446]], 'translation vector': [1.142415, 0.962891, 1.383926]}\nC: {'rotation matrix': [[-0.081714, -0.689876, 0.719301], [-0.996653, 0.054911, -0.060558], [0.00228, -0.721842, -0.692054]], 'translation vector': [1.143872, 0.96595, 1.386324]}\nD: {'rotation matrix': [[0.9999981813065101, -0.0008741599533984917, -0.0018370380112936338], [0.0008726557083801512, 0.9999996139578026, -0.0006456924823724538], [0.0018388517886205992, 0.0006438367932182827, 0.999997821691748]], 'translation vector': [-0.0006029244698710912, 0.002574260386327465, 0.00012150794273751986]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.082095, -0.690035, 0.719105], [-0.996618, 0.054259, -0.061711], [0.003565, -0.72174, -0.692155]], 'translation vector': [1.142854, 0.964299, 1.384999]}\nB: {'rotation matrix': [[-0.082522, -0.690473, 0.718636], [-0.99657, 0.052619, -0.063881], [0.006294, -0.721442, -0.692446]], 'translation vector': [1.142415, 0.962891, 1.383926]}\nC: {'rotation matrix': [[-0.081714, -0.689876, 0.719301], [-0.996653, 0.054911, -0.060558], [0.00228, -0.721842, -0.692054]], 'translation vector': [1.143872, 0.96595, 1.386324]}\nD: {'rotation matrix': [[0.9999981813065101, -0.0008741599533984917, -0.0018370380112936338], [0.0008726557083801512, 0.9999996139578026, -0.0006456924823724538], [0.0018388517886205992, 0.0006438367932182827, 0.999997821691748]], 'translation vector': [-0.0006029244698710912, 0.002574260386327465, 0.00012150794273751986]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_184_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_184_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_184_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_184_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.885696, -0.241091, 0.396758], [-0.463478, 0.409419, -0.785852], [0.027022, -0.879915, -0.474362]], 'translation vector': [3.284311, 2.742399, 1.352773]}\nB: {'rotation matrix': [[-0.887326, -0.240072, 0.393723], [-0.460299, 0.409476, -0.787689], [0.027883, -0.880168, -0.473844]], 'translation vector': [3.284908, 2.737404, 1.354156]}\nC: {'rotation matrix': [[0.9999731444122048, 0.0012103447086951903, -0.007132060128435475], [-0.0011432251198222655, 0.9999548360017081, 0.009399220003853712], [0.007143404275372449, -0.009390869439370925, 
0.9999297253861809]], 'translation vector': [-0.004051670502601468, 0.003985677205533222, -0.007748124103307941]}\nD: {'rotation matrix': [[-0.88833, -0.238233, 0.392575], [-0.45841, 0.409739, -0.788653], [0.02703, -0.880545, -0.473192]], 'translation vector': [3.286612, 2.733624, 1.354709]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.885696, -0.241091, 0.396758], [-0.463478, 0.409419, -0.785852], [0.027022, -0.879915, -0.474362]], 'translation vector': [3.284311, 2.742399, 1.352773]}\nB: {'rotation matrix': [[-0.887326, -0.240072, 0.393723], [-0.460299, 0.409476, -0.787689], [0.027883, -0.880168, -0.473844]], 'translation vector': [3.284908, 2.737404, 1.354156]}\nC: {'rotation matrix': [[0.9999731444122048, 0.0012103447086951903, -0.007132060128435475], [-0.0011432251198222655, 0.9999548360017081, 0.009399220003853712], [0.007143404275372449, -0.009390869439370925, 0.9999297253861809]], 'translation vector': [-0.004051670502601468, 0.003985677205533222, -0.007748124103307941]}\nD: {'rotation matrix': [[-0.88833, -0.238233, 0.392575], [-0.45841, 0.409739, -0.788653], [0.02703, -0.880545, -0.473192]], 'translation vector': [3.286612, 2.733624, 1.354709]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_185_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_185_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_185_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_185_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": 
"3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.929629, 0.142082, -0.340003], [0.367757, 0.416138, -0.831615], [0.023331, -0.898132, -0.439106]], 'translation vector': [3.895597, 4.105544, 1.337128]}\nB: {'rotation matrix': [[0.9999978567692223, -0.002080974608083212, 0.0006913520673930054], [0.0020836393175395, 0.9999892180319079, -0.004134789557599625], [-0.0006820179303371059, 0.00413702467971305, 0.9999915554185601]], 'translation vector': [0.0017503586952924977, 0.001995171524919126, -0.0003721348284200232]}\nC: {'rotation matrix': [[-0.930698, 0.142163, -0.337032], [0.365142, 0.415816, -0.832928], [0.021732, -0.898269, -0.438909]], 'translation vector': [3.896934, 4.102128, 1.337288]}\nD: {'rotation matrix': [[-0.927672, 0.142632, -0.34508], [0.372556, 0.415512, -0.82979], [0.02503, -0.898335, -0.438597]], 'translation vector': [3.896674, 4.103256, 1.336071]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.929629, 0.142082, -0.340003], [0.367757, 0.416138, -0.831615], [0.023331, -0.898132, -0.439106]], 'translation vector': [3.895597, 4.105544, 1.337128]}\nB: {'rotation matrix': [[0.9999978567692223, -0.002080974608083212, 0.0006913520673930054], [0.0020836393175395, 0.9999892180319079, -0.004134789557599625], [-0.0006820179303371059, 0.00413702467971305, 0.9999915554185601]], 'translation vector': [0.0017503586952924977, 0.001995171524919126, -0.0003721348284200232]}\nC: {'rotation matrix': [[-0.930698, 0.142163, -0.337032], [0.365142, 0.415816, -0.832928], [0.021732, -0.898269, -0.438909]], 'translation vector': [3.896934, 4.102128, 1.337288]}\nD: {'rotation matrix': [[-0.927672, 0.142632, -0.34508], [0.372556, 0.415512, -0.82979], [0.02503, -0.898335, -0.438597]], 'translation vector': [3.896674, 4.103256, 1.336071]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_186_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_186_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_186_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_186_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.97654, 0.035326, -0.212419], [0.213134, -0.299258, 0.930064], [-0.030712, -0.953518, -0.299767]], 'translation vector': [2.83562, 1.415562, 1.663413]}\nB: {'rotation matrix': [[0.976528, 0.035613, -0.212425], [0.213211, -0.299739, 0.929891], [-0.030556, -0.953356, -0.300296]], 'translation vector': [2.836028, 1.415543, 1.663749]}\nC: {'rotation matrix': [[0.976477, 0.035047, -0.212752], [0.213359, -0.299571, 0.929912], [-0.031143, -0.95343, -0.300002]], 'translation vector': [2.836339, 1.415174, 1.663386]}\nD: {'rotation matrix': [[0.9999989734089719, 
-0.00043750334983729943, -0.0007413258006153675], [0.00043629581692446537, 0.9999991219954293, -0.0014728840089735186], [0.0007417507454709163, 0.0014731899576831099, 0.9999977641755871]], 'translation vector': [0.0005969954680278278, -0.001266102560834259, -0.001081742192190538]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.97654, 0.035326, -0.212419], [0.213134, -0.299258, 0.930064], [-0.030712, -0.953518, -0.299767]], 'translation vector': [2.83562, 1.415562, 1.663413]}\nB: {'rotation matrix': [[0.976528, 0.035613, -0.212425], [0.213211, -0.299739, 0.929891], [-0.030556, -0.953356, -0.300296]], 'translation vector': [2.836028, 1.415543, 1.663749]}\nC: {'rotation matrix': [[0.976477, 0.035047, -0.212752], [0.213359, -0.299571, 0.929912], [-0.031143, -0.95343, -0.300002]], 'translation vector': [2.836339, 1.415174, 1.663386]}\nD: {'rotation matrix': [[0.9999989734089719, -0.00043750334983729943, -0.0007413258006153675], [0.00043629581692446537, 0.9999991219954293, -0.0014728840089735186], [0.0007417507454709163, 0.0014731899576831099, 0.9999977641755871]], 'translation vector': [0.0005969954680278278, -0.001266102560834259, -0.001081742192190538]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_187_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_187_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_187_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_187_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d 
image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.693693, -0.417889, 0.586651], [-0.719234, -0.35819, 0.595318], [-0.038645, -0.834907, -0.549033]], 'translation vector': [2.468094, 0.650908, 1.47083]}\nB: {'rotation matrix': [[0.694995, -0.417186, 0.58561], [-0.71783, -0.35584, 0.598413], [-0.041266, -0.836262, -0.546775]], 'translation vector': [2.468435, 0.652249, 1.472357]}\nC: {'rotation matrix': [[0.692825, -0.417185, 0.588176], [-0.720192, -0.359253, 0.593516], [-0.036302, -0.834802, -0.549352]], 'translation vector': [2.467356, 0.649437, 1.470088]}\nD: {'rotation matrix': [[0.9999933308482389, -0.003211931119773083, -0.0013319651843937527], [0.003213068050675667, 0.999995289918959, 0.0008982640947086999], [0.0013289957939952358, -0.000903126250102818, 0.9999984333444373]], 'translation vector': [0.0004300736920825887, -0.0014825221206589134, 0.0006853212942847797]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.693693, -0.417889, 0.586651], [-0.719234, -0.35819, 0.595318], [-0.038645, -0.834907, -0.549033]], 'translation vector': [2.468094, 0.650908, 1.47083]}\nB: {'rotation matrix': [[0.694995, -0.417186, 0.58561], [-0.71783, -0.35584, 0.598413], [-0.041266, -0.836262, -0.546775]], 'translation vector': [2.468435, 0.652249, 1.472357]}\nC: {'rotation matrix': [[0.692825, -0.417185, 0.588176], [-0.720192, -0.359253, 0.593516], [-0.036302, -0.834802, -0.549352]], 'translation vector': [2.467356, 0.649437, 1.470088]}\nD: {'rotation matrix': [[0.9999933308482389, -0.003211931119773083, -0.0013319651843937527], [0.003213068050675667, 0.999995289918959, 0.0008982640947086999], [0.0013289957939952358, -0.000903126250102818, 0.9999984333444373]], 'translation vector': [0.0004300736920825887, -0.0014825221206589134, 0.0006853212942847797]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_188_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_188_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_188_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_188_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.999996219803918, 0.0022950861591112606, 0.0016527156635626785], [-0.002306966520835195, 0.9999706555795355, 0.007290423829373102], [-0.0016351749151619617, -0.007294378220159148, 0.9999718191472952]], 'translation vector': [-0.0006106128955529755, 0.003113758643644493, 0.0009225265168792962]}\nB: {'rotation matrix': [[-0.815808, -0.262316, 0.51541], [-0.578233, 0.385672, -0.71896], [-0.010185, -0.884561, -0.466314]], 'translation vector': [2.767913, 1.370181, 1.363789]}\nC: {'rotation matrix': [[-0.81395, -0.261884, 0.518557], [-0.58082, 0.384609, 
-0.717443], [-0.011555, -0.885151, -0.46516]], 'translation vector': [2.76859, 1.370986, 1.364432]}\nD: {'rotation matrix': [[-0.813152, -0.262698, 0.519397], [-0.581967, 0.382082, -0.717863], [-0.009871, -0.886004, -0.463573]], 'translation vector': [2.770085, 1.372341, 1.364365]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.999996219803918, 0.0022950861591112606, 0.0016527156635626785], [-0.002306966520835195, 0.9999706555795355, 0.007290423829373102], [-0.0016351749151619617, -0.007294378220159148, 0.9999718191472952]], 'translation vector': [-0.0006106128955529755, 0.003113758643644493, 0.0009225265168792962]}\nB: {'rotation matrix': [[-0.815808, -0.262316, 0.51541], [-0.578233, 0.385672, -0.71896], [-0.010185, -0.884561, -0.466314]], 'translation vector': [2.767913, 1.370181, 1.363789]}\nC: {'rotation matrix': [[-0.81395, -0.261884, 0.518557], [-0.58082, 0.384609, -0.717443], [-0.011555, -0.885151, -0.46516]], 'translation vector': [2.76859, 1.370986, 1.364432]}\nD: {'rotation matrix': [[-0.813152, -0.262698, 0.519397], [-0.581967, 0.382082, -0.717863], [-0.009871, -0.886004, -0.463573]], 'translation vector': [2.770085, 1.372341, 1.364365]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_189_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_189_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_189_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_189_3.png" + ], + "output": "A" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", 
+ "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.928393, -0.117955, 0.352381], [-0.371416, -0.324296, 0.86999], [0.011656, -0.938573, -0.344885]], 'translation vector': [5.42922, 4.041657, 1.370122]}\nB: {'rotation matrix': [[0.9999934814259427, -0.0026531579456658904, 0.00224130267955375], [0.0026602242897140562, 0.9999913138748001, -0.003078123639036722], [-0.0022329200630453808, 0.0030851593249457813, 0.9999929388785541]], 'translation vector': [0.005873456268956634, 0.01508492723074184, 0.0010277199164683282]}\nC: {'rotation matrix': [[0.928402, -0.120953, 0.351341], [-0.37149, -0.322693, 0.870554], [0.008079, -0.938743, -0.344522]], 'translation vector': [5.430759, 4.038916, 1.364124]}\nD: {'rotation matrix': [[0.929388, -0.122542, 0.348169], [-0.369059, -0.323333, 0.87135], [0.005798, -0.938317, -0.345726]], 'translation vector': [5.437048, 4.036695, 1.363649]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.928393, -0.117955, 0.352381], [-0.371416, -0.324296, 0.86999], [0.011656, -0.938573, -0.344885]], 'translation vector': [5.42922, 4.041657, 1.370122]}\nB: {'rotation matrix': [[0.9999934814259427, -0.0026531579456658904, 0.00224130267955375], [0.0026602242897140562, 0.9999913138748001, -0.003078123639036722], [-0.0022329200630453808, 0.0030851593249457813, 0.9999929388785541]], 'translation vector': [0.005873456268956634, 0.01508492723074184, 0.0010277199164683282]}\nC: {'rotation matrix': [[0.928402, -0.120953, 0.351341], [-0.37149, -0.322693, 0.870554], [0.008079, -0.938743, -0.344522]], 'translation vector': [5.430759, 4.038916, 1.364124]}\nD: {'rotation matrix': [[0.929388, -0.122542, 0.348169], [-0.369059, -0.323333, 0.87135], [0.005798, -0.938317, -0.345726]], 'translation vector': [5.437048, 4.036695, 1.363649]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_190_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_190_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_190_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_190_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.953183, -0.088424, 0.289177], [-0.302167, 0.24151, -0.922154], [0.011701, -0.966361, -0.256923]], 'translation vector': [1.211269, 4.890959, 1.556685]}\nB: {'rotation matrix': [[-0.956401, -0.093459, 0.276701], [-0.291436, 0.243606, -0.925052], [0.019048, -0.965361, -0.260222]], 'translation vector': [1.215449, 4.887503, 1.555227]}\nC: {'rotation matrix': [[-0.955218, -0.090693, 0.281661], [-0.295515, 0.243707, -0.92373], [0.015133, -0.965599, -0.259595]], 'translation vector': [1.213553, 4.889249, 1.555428]}\nD: {'rotation matrix': [[0.9999952251812577, 
0.0022961074705375706, -0.0021161445756707063], [-0.002289678755021547, 0.9999926232978033, 0.0030864959240033065], [0.002122435340409546, -0.003080853599724993, 0.9999927920546235]], 'translation vector': [-0.0038335858537008605, -0.001641279240269633, 0.007141792646779166]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.953183, -0.088424, 0.289177], [-0.302167, 0.24151, -0.922154], [0.011701, -0.966361, -0.256923]], 'translation vector': [1.211269, 4.890959, 1.556685]}\nB: {'rotation matrix': [[-0.956401, -0.093459, 0.276701], [-0.291436, 0.243606, -0.925052], [0.019048, -0.965361, -0.260222]], 'translation vector': [1.215449, 4.887503, 1.555227]}\nC: {'rotation matrix': [[-0.955218, -0.090693, 0.281661], [-0.295515, 0.243707, -0.92373], [0.015133, -0.965599, -0.259595]], 'translation vector': [1.213553, 4.889249, 1.555428]}\nD: {'rotation matrix': [[0.9999952251812577, 0.0022961074705375706, -0.0021161445756707063], [-0.002289678755021547, 0.9999926232978033, 0.0030864959240033065], [0.002122435340409546, -0.003080853599724993, 0.9999927920546235]], 'translation vector': [-0.0038335858537008605, -0.001641279240269633, 0.007141792646779166]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_191_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_191_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_191_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_191_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.984658, -0.071177, 0.15932], [-0.173691, -0.312158, 0.934018], [-0.016748, -0.94736, -0.319732]], 'translation vector': [3.953827, 2.817107, 1.554211]}\nB: {'rotation matrix': [[0.984585, -0.072332, 0.159251], [-0.17407, -0.316233, 0.932575], [-0.017095, -0.94592, -0.323949]], 'translation vector': [3.956161, 2.818039, 1.553922]}\nC: {'rotation matrix': [[1.000000390586898, -0.00015899006682454229, -7.860499707006931e-05], [0.0001592053293816045, 0.9999988073723325, -0.001691416563679354], [7.911078695652774e-05, 0.0016920717225864005, 0.999998246065904]], 'translation vector': [-0.0031188610741375022, -0.006275319353865605, -0.0020119857094367255]}\nD: {'rotation matrix': [[0.985021, -0.07075, 0.157254], [-0.171705, -0.318523, 0.932234], [-0.015866, -0.945271, -0.325899]], 'translation vector': [3.958103, 2.817717, 1.548612]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.984658, -0.071177, 0.15932], [-0.173691, -0.312158, 0.934018], [-0.016748, -0.94736, -0.319732]], 'translation vector': [3.953827, 2.817107, 1.554211]}\nB: {'rotation matrix': [[0.984585, -0.072332, 0.159251], [-0.17407, -0.316233, 0.932575], [-0.017095, -0.94592, -0.323949]], 'translation vector': [3.956161, 2.818039, 1.553922]}\nC: {'rotation matrix': [[1.000000390586898, -0.00015899006682454229, -7.860499707006931e-05], [0.0001592053293816045, 0.9999988073723325, -0.001691416563679354], [7.911078695652774e-05, 0.0016920717225864005, 0.999998246065904]], 'translation vector': [-0.0031188610741375022, -0.006275319353865605, -0.0020119857094367255]}\nD: {'rotation matrix': [[0.985021, -0.07075, 0.157254], [-0.171705, -0.318523, 0.932234], [-0.015866, -0.945271, -0.325899]], 'translation vector': [3.958103, 2.817717, 1.548612]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_192_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_192_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_192_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_192_3.png" + ], + "output": "C" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.324348, -0.501243, 0.802218], [-0.945427, 0.143915, -0.292328], [0.031076, -0.853255, -0.520567]], 'translation vector': [-0.28287, 2.921737, 1.307859]}\nB: {'rotation matrix': [[0.9998206713039935, 0.01088419825826216, -0.015474672154137172], [-0.010778287023678227, 0.999918107104106, 0.00690122311170722], [0.015548906823126927, -0.006732006329303681, 0.9998567482634122]], 'translation vector': [-0.0009142965758486277, -0.0021867567072129113, 0.0020065217081752795]}\nC: {'rotation matrix': [[-0.336594, -0.496252, 0.800274], [-0.941144, 0.149432, 
-0.30318], [0.030867, -0.855222, -0.517342]], 'translation vector': [-0.283556, 2.919329, 1.307566]}\nD: {'rotation matrix': [[-0.33061, -0.49822, 0.801544], [-0.943277, 0.147056, -0.297664], [0.03043, -0.854489, -0.518578]], 'translation vector': [-0.283976, 2.918767, 1.308425]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.324348, -0.501243, 0.802218], [-0.945427, 0.143915, -0.292328], [0.031076, -0.853255, -0.520567]], 'translation vector': [-0.28287, 2.921737, 1.307859]}\nB: {'rotation matrix': [[0.9998206713039935, 0.01088419825826216, -0.015474672154137172], [-0.010778287023678227, 0.999918107104106, 0.00690122311170722], [0.015548906823126927, -0.006732006329303681, 0.9998567482634122]], 'translation vector': [-0.0009142965758486277, -0.0021867567072129113, 0.0020065217081752795]}\nC: {'rotation matrix': [[-0.336594, -0.496252, 0.800274], [-0.941144, 0.149432, -0.30318], [0.030867, -0.855222, -0.517342]], 'translation vector': [-0.283556, 2.919329, 1.307566]}\nD: {'rotation matrix': [[-0.33061, -0.49822, 0.801544], [-0.943277, 0.147056, -0.297664], [0.03043, -0.854489, -0.518578]], 'translation vector': [-0.283976, 2.918767, 1.308425]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_193_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_193_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_193_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_193_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + 
"source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.988022, -0.009517, -0.154018], [0.15411, 0.009774, 0.988005], [-0.007897, -0.999907, 0.011124]], 'translation vector': [3.954252, 2.675021, 1.588509]}\nB: {'rotation matrix': [[0.9999405001439704, 0.001517544746561925, 0.010769614189192206], [-0.0015446990948080185, 0.9999954653771669, 0.0025653888781814608], [-0.01076601396546154, -0.002581837929747806, 0.999938856444669]], 'translation vector': [-0.04482632526707597, 0.009643399205063075, 0.00020168741742709884]}\nC: {'rotation matrix': [[0.989616, -0.01086, -0.14333], [0.143408, 0.0068, 0.98964], [-0.009773, -0.999918, 0.008287]], 'translation vector': [3.942101, 2.673398, 1.591243]}\nD: {'rotation matrix': [[0.990689, -0.012911, -0.13553], [0.135653, 0.009179, 0.990714], [-0.011547, -0.999875, 0.010845]], 'translation vector': [3.935715, 2.670411, 1.599032]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.988022, -0.009517, -0.154018], [0.15411, 0.009774, 0.988005], [-0.007897, -0.999907, 0.011124]], 'translation vector': [3.954252, 2.675021, 1.588509]}\nB: {'rotation matrix': [[0.9999405001439704, 0.001517544746561925, 0.010769614189192206], [-0.0015446990948080185, 0.9999954653771669, 0.0025653888781814608], [-0.01076601396546154, -0.002581837929747806, 0.999938856444669]], 'translation vector': [-0.04482632526707597, 0.009643399205063075, 0.00020168741742709884]}\nC: {'rotation matrix': [[0.989616, -0.01086, -0.14333], [0.143408, 0.0068, 0.98964], [-0.009773, -0.999918, 0.008287]], 'translation vector': [3.942101, 2.673398, 1.591243]}\nD: {'rotation matrix': [[0.990689, -0.012911, -0.13553], [0.135653, 0.009179, 0.990714], [-0.011547, -0.999875, 0.010845]], 'translation vector': [3.935715, 2.670411, 1.599032]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_194_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_194_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_194_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_194_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.986946, -0.051965, 0.152438], [-0.150832, 0.630041, -0.761774], [-0.056457, -0.774822, -0.629654]], 'translation vector': [2.054614, 1.600808, 1.269291]}\nB: {'rotation matrix': [[-0.986874, -0.051472, 0.153072], [-0.151005, 0.630133, -0.761663], [-0.057252, -0.774779, -0.629634]], 'translation vector': [2.054307, 1.600529, 1.268919]}\nC: {'rotation matrix': [[-0.98698, -0.05266, 0.151977], [-0.150937, 0.629701, -0.762033], [-0.055572, -0.77505, -0.629451]], 'translation vector': [2.055977, 1.600957, 1.269368]}\nD: {'rotation matrix': [[0.9999963800999353, 
-0.00021451754451556594, -0.002594557385802436], [0.0002238699679127643, 0.9999932861643056, 0.0035784816404103737], [0.002594523779822574, -0.003578355478331594, 0.9999903397449379]], 'translation vector': [-0.000729278960620583, -0.000294753198244152, 0.0006269109678853635]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.986946, -0.051965, 0.152438], [-0.150832, 0.630041, -0.761774], [-0.056457, -0.774822, -0.629654]], 'translation vector': [2.054614, 1.600808, 1.269291]}\nB: {'rotation matrix': [[-0.986874, -0.051472, 0.153072], [-0.151005, 0.630133, -0.761663], [-0.057252, -0.774779, -0.629634]], 'translation vector': [2.054307, 1.600529, 1.268919]}\nC: {'rotation matrix': [[-0.98698, -0.05266, 0.151977], [-0.150937, 0.629701, -0.762033], [-0.055572, -0.77505, -0.629451]], 'translation vector': [2.055977, 1.600957, 1.269368]}\nD: {'rotation matrix': [[0.9999963800999353, -0.00021451754451556594, -0.002594557385802436], [0.0002238699679127643, 0.9999932861643056, 0.0035784816404103737], [0.002594523779822574, -0.003578355478331594, 0.9999903397449379]], 'translation vector': [-0.000729278960620583, -0.000294753198244152, 0.0006269109678853635]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_195_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_195_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_195_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_195_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", 
+ "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.68897, 0.400817, -0.603877], [0.724521, 0.403569, -0.55875], [0.01975, -0.822483, -0.568447]], 'translation vector': [2.703838, 2.593028, 1.451995]}\nB: {'rotation matrix': [[0.9999867433099353, 0.0013037371773630478, -0.00505037420137311], [-0.0013327082277047075, 0.9999835004529126, -0.005699178965962697], [0.005042267928314986, 0.005705137184663826, 0.9999709125049986]], 'translation vector': [-0.003533739342698565, -0.0004956556588801009, 0.0008851453202214365]}\nC: {'rotation matrix': [[-0.687775, 0.405276, -0.60226], [0.725687, 0.405077, -0.55614], [0.018572, -0.819551, -0.572706]], 'translation vector': [2.702493, 2.593958, 1.452821]}\nD: {'rotation matrix': [[-0.6898, 0.39893, -0.604178], [0.723736, 0.402474, -0.560554], [0.019544, -0.823935, -0.566347]], 'translation vector': [2.703783, 2.591564, 1.452902]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.68897, 0.400817, -0.603877], [0.724521, 0.403569, -0.55875], [0.01975, -0.822483, -0.568447]], 'translation vector': [2.703838, 2.593028, 1.451995]}\nB: {'rotation matrix': [[0.9999867433099353, 0.0013037371773630478, -0.00505037420137311], [-0.0013327082277047075, 0.9999835004529126, -0.005699178965962697], [0.005042267928314986, 0.005705137184663826, 0.9999709125049986]], 'translation vector': [-0.003533739342698565, -0.0004956556588801009, 0.0008851453202214365]}\nC: {'rotation matrix': [[-0.687775, 0.405276, -0.60226], [0.725687, 0.405077, -0.55614], [0.018572, -0.819551, -0.572706]], 'translation vector': [2.702493, 2.593958, 1.452821]}\nD: {'rotation matrix': [[-0.6898, 0.39893, -0.604178], [0.723736, 0.402474, -0.560554], [0.019544, -0.823935, -0.566347]], 'translation vector': [2.703783, 2.591564, 1.452902]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_196_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_196_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_196_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_196_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[0.956857, -0.169845, 0.235747], [-0.290511, -0.544648, 0.786741], [-0.005224, -0.821286, -0.570492]], 'translation vector': [1.276382, 2.833935, 1.317457]}\nB: {'rotation matrix': [[0.9999970754093099, -0.00022106845961624886, 0.002343703924443124], [0.0002217235397933884, 1.000000114885585, -0.0003702254651105004], [-0.0023436320323324275, 0.00037212161436898277, 0.9999969566117395]], 'translation vector': [-0.0012761111666008684, -0.00028494477773222116, -0.0002961069063054378]}\nC: {'rotation matrix': [[0.956588, -0.169724, 0.236925], [-0.291413, -0.54533, 
0.785935], [-0.00419, -0.820859, -0.571116]], 'translation vector': [1.27605, 2.834144, 1.316524]}\nD: {'rotation matrix': [[0.956511, -0.169195, 0.237615], [-0.291683, -0.546399, 0.785092], [-0.003001, -0.820257, -0.571988]], 'translation vector': [1.276076, 2.834318, 1.31658]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.956857, -0.169845, 0.235747], [-0.290511, -0.544648, 0.786741], [-0.005224, -0.821286, -0.570492]], 'translation vector': [1.276382, 2.833935, 1.317457]}\nB: {'rotation matrix': [[0.9999970754093099, -0.00022106845961624886, 0.002343703924443124], [0.0002217235397933884, 1.000000114885585, -0.0003702254651105004], [-0.0023436320323324275, 0.00037212161436898277, 0.9999969566117395]], 'translation vector': [-0.0012761111666008684, -0.00028494477773222116, -0.0002961069063054378]}\nC: {'rotation matrix': [[0.956588, -0.169724, 0.236925], [-0.291413, -0.54533, 0.785935], [-0.00419, -0.820859, -0.571116]], 'translation vector': [1.27605, 2.834144, 1.316524]}\nD: {'rotation matrix': [[0.956511, -0.169195, 0.237615], [-0.291683, -0.546399, 0.785092], [-0.003001, -0.820257, -0.571988]], 'translation vector': [1.276076, 2.834318, 1.31658]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_197_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_197_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_197_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_197_3.png" + ], + "output": "B" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d 
image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.13245, -0.562539, 0.816092], [-0.991146, 0.067404, -0.114398], [0.009346, -0.824018, -0.566486]], 'translation vector': [2.413971, 4.448666, 1.362137]}\nB: {'rotation matrix': [[-0.128485, -0.560739, 0.817963], [-0.991659, 0.064113, -0.111818], [0.010258, -0.825507, -0.564299]], 'translation vector': [2.417159, 4.443525, 1.361777]}\nC: {'rotation matrix': [[-0.132037, -0.563719, 0.815345], [-0.991214, 0.068574, -0.113106], [0.007848, -0.823115, -0.567821]], 'translation vector': [2.411706, 4.446467, 1.360844]}\nD: {'rotation matrix': [[0.9999987411869987, 0.0004418575380488951, 0.0016406984025160954], [-0.0004493688554351619, 0.9999862647190665, 0.005219826446688158], [-0.0016383092715178728, -0.005221150249650812, 0.9999852785117939]], 'translation vector': [-0.005409867262581081, 0.0012422036096766398, -0.002713386665627704]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.13245, -0.562539, 0.816092], [-0.991146, 0.067404, -0.114398], [0.009346, -0.824018, -0.566486]], 'translation vector': [2.413971, 4.448666, 1.362137]}\nB: {'rotation matrix': [[-0.128485, -0.560739, 0.817963], [-0.991659, 0.064113, -0.111818], [0.010258, -0.825507, -0.564299]], 'translation vector': [2.417159, 4.443525, 1.361777]}\nC: {'rotation matrix': [[-0.132037, -0.563719, 0.815345], [-0.991214, 0.068574, -0.113106], [0.007848, -0.823115, -0.567821]], 'translation vector': [2.411706, 4.446467, 1.360844]}\nD: {'rotation matrix': [[0.9999987411869987, 0.0004418575380488951, 0.0016406984025160954], [-0.0004493688554351619, 0.9999862647190665, 0.005219826446688158], [-0.0016383092715178728, -0.005221150249650812, 0.9999852785117939]], 'translation vector': [-0.005409867262581081, 0.0012422036096766398, -0.002713386665627704]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_198_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_198_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_198_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_198_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Pose_Estimation", + "visual_input_component": "3d image", + "source": "SCANNET_threed_pose_estimation", + "options": "A: {'rotation matrix': [[-0.394987, 0.317496, -0.862079], [0.912593, 0.027696, -0.407931], [-0.10564, -0.947855, -0.300685]], 'translation vector': [4.882912, 2.963368, 1.402415]}\nB: {'rotation matrix': [[-0.393743, 0.318728, -0.862194], [0.912946, 0.026185, -0.40724], [-0.107222, -0.947484, -0.301292]], 'translation vector': [4.884082, 2.960136, 1.407949]}\nC: {'rotation matrix': [[-0.393984, 0.318317, -0.862236], [0.913424, 0.031343, -0.405802], [-0.102149, -0.947466, -0.303106]], 'translation vector': [4.883262, 2.96182, 1.402411]}\nD: {'rotation matrix': 
[[0.9999866432276233, 0.002858711999338773, -0.0043010740025884895], [-0.0027980802651053054, 0.9998995896653632, 0.013894745582855106], [0.004339725514880082, -0.013881253175420062, 0.9998943009326048]], 'translation vector': [-0.001200424251745491, -0.0035619824296807545, 0.0012814893270478578]}", + "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", + "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.394987, 0.317496, -0.862079], [0.912593, 0.027696, -0.407931], [-0.10564, -0.947855, -0.300685]], 'translation vector': [4.882912, 2.963368, 1.402415]}\nB: {'rotation matrix': [[-0.393743, 0.318728, -0.862194], [0.912946, 0.026185, -0.40724], [-0.107222, -0.947484, -0.301292]], 'translation vector': [4.884082, 2.960136, 1.407949]}\nC: {'rotation matrix': [[-0.393984, 0.318317, -0.862236], [0.913424, 0.031343, -0.405802], [-0.102149, -0.947466, -0.303106]], 'translation vector': [4.883262, 2.96182, 1.402411]}\nD: {'rotation matrix': [[0.9999866432276233, 0.002858711999338773, -0.0043010740025884895], [-0.0027980802651053054, 0.9998995896653632, 0.013894745582855106], [0.004339725514880082, -0.013881253175420062, 0.9998943009326048]], 'translation vector': [-0.001200424251745491, -0.0035619824296807545, 0.0012814893270478578]}", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_199_0.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_199_1.png", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_199_2.jpg", + "../MMIU-Benchmark/threeD_Pose_Estimation/threeD_Pose_Estimation_199_3.png" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + 
"visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.424269, -0.366439, 0.828081], [-0.894198, -0.025281, 0.446957], [-0.142848, -0.930098, -0.338395]] and translation vector: [2.638367, 6.760901, 1.41712], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.432512, -0.37625, 0.819371], [-0.890339, -0.034872, 0.45396], [-0.14223, -0.925862, -0.350073]] and translation vector: [2.640049, 6.763855, 1.420073], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.438239, -0.392392, 0.808687], [-0.889665, -0.061011, 0.452519], [-0.128226, -0.917772, -0.375835]] and translation vector: [2.630422, 6.772062, 1.413381]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.156961, 0.257294, -0.953501], [0.986843, 0.002956, -0.161652], [-0.038773, -0.966329, -0.254373]] and translation vector: [1.838324, 1.205476, 1.480452], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.155829, 0.255617, -0.954137], [0.987039, 0.002796, -0.160453], [-0.038347, -0.966774, -0.252739]] and translation vector: [1.83996, 1.205416, 1.474648], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.143517, 0.25546, -0.956108], [0.988424, -0.011031, -0.151315], [-0.049202, -0.966757, -0.25092]] and translation vector: [1.851541, 1.18465, 1.4701]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.255252, -0.433184, 0.864406], [-0.966562, 0.137073, -0.216725], [-0.024605, -0.890821, -0.453687]] and translation vector: [1.468232, 3.881342, 1.432686], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.253329, -0.437174, 0.862962], [-0.967015, 0.138948, -0.213484], [-0.026577, -0.888579, -0.457953]] and translation vector: [1.469363, 3.879031, 1.438972], please estimate the RGB image for the query camera pose, i.e., rotation matrix: 
[[-0.261321, -0.422366, 0.867939], [-0.964773, 0.142608, -0.221079], [-0.030398, -0.895137, -0.444754]] and translation vector: [1.471272, 3.88079, 1.429099]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.99336, -0.011945, -0.114427], [0.103059, -0.349694, 0.931178], [-0.051137, -0.936788, -0.346141]] and translation vector: [2.948285, 4.432959, 1.460427], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.99314, -0.016022, -0.115825], [0.102925, -0.35027, 0.930977], [-0.055486, 
-0.936512, -0.346218]] and translation vector: [2.949102, 4.433566, 1.463483], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.994232, -0.017087, -0.105881], [0.09324, -0.350155, 0.93204], [-0.053001, -0.936536, -0.346542]] and translation vector: [2.955784, 4.441682, 1.459117]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.30056, -0.511506, 0.805], [-0.953151, 0.130866, -0.272721], [0.034151, -0.849256, -0.526876]] and translation vector: [-0.281614, 2.924112, 1.306122], and another pair of RGB and 
depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.318531, -0.50267, 0.803655], [-0.947336, 0.139247, -0.288383], [0.033055, -0.85319, -0.520551]] and translation vector: [-0.284617, 2.924129, 1.305331], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.357195, -0.491936, 0.793984], [-0.933829, 0.17044, -0.314507], [0.019391, -0.853785, -0.520264]] and translation vector: [-0.283755, 2.908583, 1.310995]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.042655, 0.409797, 
-0.911179], [0.998036, -0.024411, -0.0577], [-0.045888, -0.91185, -0.40795]] and translation vector: [2.423933, 1.356295, 3.282493], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.032887, 0.418885, -0.907444], [0.998611, -0.023628, -0.047098], [-0.041169, -0.907732, -0.417526]] and translation vector: [2.425306, 1.358764, 3.278826], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.041885, 0.387609, -0.920872], [0.998138, -0.024683, -0.055789], [-0.044354, -0.921493, -0.385853]] and translation vector: [2.418078, 1.34298, 3.29873]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th 
image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.241978, -0.427128, 0.871211], [-0.963615, 0.210861, -0.164264], [-0.113543, -0.879261, -0.462611]] and translation vector: [2.164319, 10.11033, 1.716674], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.23973, -0.426819, 0.871983], [-0.964754, 0.205144, -0.16482], [-0.108534, -0.880762, -0.460955]] and translation vector: [2.164643, 10.108889, 1.726434], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.169937, -0.42419, 0.889485], [-0.982379, 0.144175, -0.118927], [-0.077795, -0.894023, -0.441217]] and translation vector: [2.137954, 10.094281, 1.733226]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.753053, 0.123809, -0.646206], [0.619922, -0.462608, 0.633791], [-0.220471, -0.877875, -0.42512]] and translation vector: [4.259223, 3.769218, 1.505729], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.760823, 0.125761, -0.636658], [0.611756, -0.466381, 0.638939], [-0.216572, -0.875599, -0.431768]] and translation vector: [4.257898, 3.775608, 1.505422], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.792722, 0.129689, -0.595629], [0.575941, -0.479462, 0.662124], [-0.199711, -0.867927, -0.454772]] and translation vector: [4.245731, 3.788037, 1.507869]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.40936, -0.486807, 0.77165], [-0.912164, 0.236459, -0.334729], [-0.019515, -0.840896, -0.540844]] and translation vector: [1.412713, 1.214489, 1.390939], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.417972, -0.487805, 0.766384], [-0.908352, 0.237425, -0.344277], [-0.014019, -0.840045, -0.542336]] and translation vector: [1.411881, 1.212071, 1.390231], please estimate the RGB image for the query camera pose, i.e., rotation matrix: 
[[-0.442659, -0.487865, 0.752356], [-0.896674, 0.245809, -0.368176], [-0.005316, -0.837595, -0.546266]] and translation vector: [1.400211, 1.203382, 1.386707]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.999403, 0.004498, 0.03425], [-0.034232, -0.004158, 0.999405], [0.004638, -0.999981, -0.004001]] and translation vector: [2.393484, 5.775056, 1.371464], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.998454, -0.001139, 0.055575], [-0.055569, 0.004857, 0.998443], [-0.001408, 
-0.999988, 0.004786]] and translation vector: [2.356134, 5.774678, 1.367739], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.989764, 0.002175, 0.142698], [-0.142529, 0.066115, 0.98758], [-0.007287, -0.99781, 0.065748]] and translation vector: [2.255451, 5.785594, 1.33032]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.133825, -0.39571, 0.908573], [-0.990975, -0.046263, 0.125813], [-0.007752, -0.91721, -0.398329]] and translation vector: [4.990516, 4.227292, 1.32289], and another pair of RGB and depth 
images with the corresponding camera pose, i.e., rotation matrix: [[0.168071, -0.388121, 0.906153], [-0.985699, -0.054747, 0.159375], [-0.012247, -0.919981, -0.391772]] and translation vector: [4.987841, 4.19209, 1.32312], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.233014, -0.364692, 0.901501], [-0.972471, -0.085505, 0.216767], [-0.00197, -0.927194, -0.374577]] and translation vector: [4.985941, 4.092797, 1.324644]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.993306, 0.029023, -0.111812], 
[0.110831, -0.512349, 0.851596], [-0.032571, -0.858287, -0.512136]] and translation vector: [2.482234, 1.391135, 1.348064], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.992702, 0.031717, -0.116349], [0.116167, -0.510508, 0.85199], [-0.032374, -0.859288, -0.510467]] and translation vector: [2.48213, 1.388715, 1.34704], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.989452, 0.033499, -0.140936], [0.139029, -0.492892, 0.858911], [-0.040694, -0.869445, -0.49235]] and translation vector: [2.480608, 1.381749, 1.351104]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 
8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.473704, -0.275929, 0.836342], [-0.879436, -0.198746, 0.432542], [0.046868, -0.940406, -0.336809]] and translation vector: [2.984934, 2.048073, 1.446683], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.466625, -0.271085, 0.841888], [-0.8831, -0.195475, 0.426525], [0.048943, -0.942498, -0.330608]] and translation vector: [2.979092, 2.049407, 1.446378], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.457049, -0.259072, 0.850875], [-0.888339, -0.18058, 0.422191], [0.044273, -0.948827, -0.312678]] and translation vector: [2.973803, 2.044357, 1.455601]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.996429, -0.081152, -0.023325], [-0.01119, 0.400709, -0.916137], [0.083693, -0.912604, -0.400187]] and translation vector: [7.365378, 2.610504, 1.343957], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.997089, -0.075007, -0.013671], [-0.016913, 0.392439, -0.919623], [0.074343, -0.916715, -0.392565]] and translation vector: [7.36531, 2.61944, 1.344548], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.997405, -0.064807, -0.031376], [0.004675, 0.376559, -0.926381], [0.071851, -0.924123, -0.375279]] and translation vector: [7.389543, 2.653858, 1.358479]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.994136, 0.036629, -0.101745], [0.107123, -0.462198, 0.880283], [-0.014782, -0.88602, -0.463411]] and translation vector: [3.8191, 1.340951, 1.354002], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.994264, 0.034625, -0.101195], [0.105882, -0.452335, 0.885541], [-0.015112, -0.891176, -0.453407]] and translation vector: [3.821174, 1.339834, 1.359098], please estimate the RGB image for the query camera pose, i.e., rotation matrix: 
[[0.998446, 0.039334, -0.039482], [0.052098, -0.407104, 0.911895], [0.019796, -0.912535, -0.408521]] and translation vector: [3.821787, 1.333543, 1.372052]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.844798, -0.442354, 0.301064], [-0.534849, 0.714819, -0.450523], [-0.015916, -0.541624, -0.84047]] and translation vector: [3.085932, 7.995926, 1.934485], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.833593, -0.457276, 0.309873], [-0.552243, 0.702368, -0.449118], 
[-0.012274, -0.545507, -0.838017]] and translation vector: [3.091993, 8.002051, 1.93396], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.810018, -0.472367, 0.347478], [-0.58602, 0.673547, -0.450461], [-0.02126, -0.56851, -0.822401]] and translation vector: [3.083665, 8.001425, 1.939036]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.937403, 0.174354, -0.301457], [0.34768, 0.517889, -0.781607], [0.019845, -0.837491, -0.54609]] and translation vector: [1.513881, 1.499843, 1.388066], and another 
pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.93698, 0.17766, -0.300842], [0.348874, 0.522274, -0.77815], [0.018876, -0.834067, -0.551341]] and translation vector: [1.515168, 1.503997, 1.385631], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.940806, 0.177334, -0.288855], [0.338804, 0.516688, -0.786286], [0.009813, -0.837607, -0.546185]] and translation vector: [1.517717, 1.515309, 1.387193]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.964843, 
0.186346, -0.185345], [0.252505, 0.461537, -0.850426], [-0.07293, -0.867329, -0.492364]] and translation vector: [3.779865, 2.337391, 1.461827], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.966867, 0.182729, -0.178267], [0.244986, 0.467845, -0.849178], [-0.071768, -0.864715, -0.49711]] and translation vector: [3.779708, 2.335608, 1.46105], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.975115, 0.169172, -0.14329], [0.209929, 0.496761, -0.842115], [-0.071282, -0.85124, -0.519913]] and translation vector: [3.784041, 2.330569, 1.454727]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: 
The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.399387, 0.327689, -0.856218], [0.9115, 0.041819, -0.409169], [-0.098274, -0.94386, -0.315391]] and translation vector: [4.88233, 2.963563, 1.403722], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.394763, 0.316878, -0.86241], [0.913367, 0.033579, -0.40575], [-0.099614, -0.947872, -0.302681]] and translation vector: [4.88409, 2.965299, 1.400614], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.386874, 0.309114, -0.868779], [0.915474, 0.015736, -0.402069], [-0.110614, -0.950895, -0.289074]] and translation vector: [4.883719, 2.961581, 1.413125]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.830629, 0.239867, -0.502514], [0.556756, 0.37214, -0.742654], [0.008867, -0.896647, -0.442658]] and translation vector: [4.849209, 2.614689, 1.447477], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.826514, 0.239564, -0.509396], [0.562778, 0.371773, -0.738286], [0.012512, -0.89688, -0.442097]] and translation vector: [4.848542, 2.612423, 1.449706], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.822193, 0.245879, -0.513364], [0.569134, 0.369775, -0.734406], [0.009254, -0.895997, -0.443965]] and translation vector: [4.848, 2.609138, 1.450893]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.815869, 0.244354, -0.524069], [0.578211, -0.336271, 0.743367], [0.005416, -0.909513, -0.415641]] and translation vector: [2.358014, 1.230078, 1.369842], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.817563, 0.244526, -0.521342], [0.575764, -0.332513, 0.746947], [0.009295, -0.910847, -0.41264]] and translation vector: [2.355037, 1.229076, 1.372478], please estimate the RGB image for the query camera pose, i.e., rotation matrix: 
[[0.827304, 0.233324, -0.511006], [0.561698, -0.330711, 0.758371], [0.007951, -0.914434, -0.404656]] and translation vector: [2.3528, 1.226651, 1.376959]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.880278, -0.246293, 0.405524], [-0.473973, 0.417832, -0.775091], [0.021459, -0.874503, -0.484545]] and translation vector: [3.281806, 2.754624, 1.352781], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.883446, -0.241464, 0.401521], [-0.467927, 0.41107, -0.782347], [0.023856, 
-0.879043, -0.476146]] and translation vector: [3.2823, 2.745028, 1.352692], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.889317, -0.237291, 0.390907], [-0.456246, 0.402627, -0.793556], [0.030913, -0.884073, -0.466326]] and translation vector: [3.299646, 2.724283, 1.356988]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.752388, 0.33007, -0.570058], [0.655329, 0.287372, -0.698542], [-0.066749, -0.89915, -0.43252]] and translation vector: [3.814293, 2.583141, 1.394159], and another pair of 
RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.750374, 0.330815, -0.572276], [0.657793, 0.28836, -0.695813], [-0.065164, -0.89856, -0.433986]] and translation vector: [3.802971, 2.57897, 1.383742], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.774913, 0.365169, -0.515909], [0.625622, 0.32685, -0.708355], [-0.090045, -0.871677, -0.481738]] and translation vector: [3.702851, 2.52357, 1.379531]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.764638, 0.028658, 
-0.643823], [0.64431, -0.055554, 0.762744], [-0.013909, -0.998044, -0.060944]] and translation vector: [3.061982, 3.98913, 1.495508], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.765028, 0.027801, -0.643396], [0.643825, -0.056098, 0.763114], [-0.014878, -0.998038, -0.060816]] and translation vector: [3.064652, 3.991985, 1.487138], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.769869, 0.028995, -0.637544], [0.638044, -0.057257, 0.767869], [-0.01424, -0.997939, -0.06258]] and translation vector: [3.059477, 3.994236, 1.491082]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th 
image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.436119, -0.427186, 0.79203], [-0.89981, 0.218659, -0.377532], [-0.011909, -0.877326, -0.479747]] and translation vector: [1.992302, 3.72193, 1.553249], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.436462, -0.426736, 0.792084], [-0.899636, 0.219226, -0.377618], [-0.012502, -0.877403, -0.47959]] and translation vector: [1.991236, 3.722176, 1.553282], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.436236, -0.428201, 0.791418], [-0.899775, 0.217489, -0.37829], [-0.010141, -0.877122, -0.480161]] and translation vector: [1.989599, 3.72313, 1.552786]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.985254, -0.134646, 0.105573], [-0.142287, -0.302097, 0.942599], [-0.095024, -0.94372, -0.3168]] and translation vector: [1.134605, 1.549487, 1.505245], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.985752, -0.13049, 0.106142], [-0.141062, -0.297585, 0.944216], [-0.091624, -0.945736, -0.311752]] and translation vector: [1.131707, 1.551058, 1.506377], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.987724, -0.11535, 0.105339], [-0.134913, -0.289999, 0.94747], [-0.078743, -0.95005, -0.302001]] and translation vector: [1.113611, 1.565945, 1.522577]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.567127, -0.123224, 0.81436], [-0.823556, -0.071568, 0.562702], [-0.011056, -0.989795, -0.14207]] and translation vector: [0.249561, 0.967409, 1.634127], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.566682, -0.123694, 0.814599], [-0.82386, -0.07149, 0.562268], [-0.011313, -0.989742, -0.142418]] and translation vector: [0.249762, 0.967631, 1.633273], please estimate the RGB image for the query camera pose, i.e., rotation matrix: 
[[0.570813, -0.115531, 0.812912], [-0.82106, -0.073224, 0.566127], [-0.005881, -0.990601, -0.136655]] and translation vector: [0.269192, 0.984284, 1.63838]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.08083, -0.463089, 0.882618], [-0.994842, 0.091929, -0.042874], [-0.061284, -0.881531, -0.468131]] and translation vector: [4.543997, 3.147744, 1.235262], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.097623, -0.477164, 0.873375], [-0.993778, 0.094019, -0.059714], 
[-0.05362, -0.873771, -0.483373]] and translation vector: [4.550471, 3.148599, 1.246367], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.130487, -0.461277, 0.877608], [-0.991003, 0.087264, -0.101481], [-0.029773, -0.882954, -0.468514]] and translation vector: [4.556965, 3.161462, 1.2534]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.117057, -0.769276, 0.628102], [-0.987232, -0.021336, 0.157855], [-0.108033, -0.638561, -0.761951]] and translation vector: [1.032686, 1.226834, 2.186959], and 
another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.111522, -0.769903, 0.628341], [-0.98843, -0.020525, 0.150284], [-0.102807, -0.637831, -0.763284]] and translation vector: [1.037875, 1.232625, 2.186027], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.047902, -0.766247, 0.640758], [-0.996596, 0.006426, 0.082189], [-0.067095, -0.642514, -0.763331]] and translation vector: [1.085053, 1.269848, 2.178721]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: 
[[-0.824719, -0.175736, 0.537546], [-0.564369, 0.316962, -0.762249], [-0.036427, -0.932015, -0.360584]] and translation vector: [4.397487, 4.054199, 1.411764], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.821778, -0.181799, 0.540028], [-0.568729, 0.319986, -0.757731], [-0.035047, -0.929816, -0.366351]] and translation vector: [4.391561, 4.044915, 1.406417], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.814573, -0.211319, 0.540199], [-0.579135, 0.348873, -0.736811], [-0.032758, -0.913034, -0.406565]] and translation vector: [4.415594, 3.989866, 1.391957]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th 
image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.14018, 0.443083, -0.885453], [0.989985, -0.07783, 0.117782], [-0.016727, -0.893096, -0.449556]] and translation vector: [3.549726, 0.935059, 1.485921], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.140682, 0.443565, -0.885132], [0.989931, -0.077142, 0.11868], [-0.015638, -0.892916, -0.449951]] and translation vector: [3.549777, 0.934132, 1.483108], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.137256, 0.445178, -0.88486], [0.99043, -0.074707, 0.116046], [-0.014444, -0.89232, -0.451172]] and translation vector: [3.545579, 0.936731, 1.483973]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.59597, 0.482312, -0.642025], [0.802979, -0.35126, 0.4815], [0.006716, -0.802491, -0.596626]] and translation vector: [3.449961, 1.112515, 1.412234], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.596047, 0.483799, -0.640833], [0.802896, -0.349913, 0.482617], [0.009254, -0.802184, -0.597005]] and translation vector: [3.451157, 1.111087, 1.411899], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.59137, 0.494753, -0.636789], [0.806303, -0.350525, 0.476453], [0.012516, -0.795205, -0.606211]] and translation vector: [3.452706, 1.109482, 1.412867]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.112591, -0.547395, 0.829266], [-0.992672, 0.098819, -0.069547], [-0.043877, -0.83102, -0.55451]] and translation vector: [1.18498, 1.814175, 1.496605], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.111637, -0.546351, 0.830083], [-0.992679, 0.100057, -0.067648], [-0.046096, -0.831558, -0.553521]] and translation vector: [1.186424, 1.810214, 1.495373], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[-0.122401, -0.542747, 0.83093], [-0.991535, 0.103412, -0.078512], [-0.043316, -0.833506, -0.55081]] and translation vector: [1.193691, 1.805185, 1.501094]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.606497, 0.359513, -0.709163], [0.793947, -0.321582, 0.515978], [-0.042553, -0.875977, -0.480473]] and translation vector: [5.898605, 1.464963, 1.329018], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.603336, 0.358994, -0.712116], [0.79647, -0.316333, 0.515334], 
[-0.040264, -0.878098, -0.476783]] and translation vector: [5.91512, 1.4588, 1.326343], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.586247, 0.38946, -0.710377], [0.809914, -0.302115, 0.502759], [-0.018811, -0.870085, -0.492543]] and translation vector: [6.035654, 1.433116, 1.31748]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.590232, -0.352789, 0.726062], [-0.807221, -0.252962, 0.533296], [-0.004475, -0.900861, -0.434086]] and translation vector: [2.518124, 2.463328, 1.346668], and another 
pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.586587, -0.358769, 0.726086], [-0.809845, -0.250747, 0.530356], [-0.008212, -0.899117, -0.437632]] and translation vector: [2.520116, 2.462175, 1.344964], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.595628, -0.375207, 0.710244], [-0.80316, -0.264233, 0.533961], [-0.012675, -0.888482, -0.458736]] and translation vector: [2.525984, 2.461792, 1.333971]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: 
[[-0.934582, -0.143102, 0.325696], [-0.355737, 0.383069, -0.852473], [-0.002774, -0.912568, -0.408916]] and translation vector: [2.694367, 2.483235, 1.465763], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.935747, -0.141154, 0.323191], [-0.352667, 0.379116, -0.85551], [-0.001768, -0.91452, -0.404537]] and translation vector: [2.694351, 2.483417, 1.465522], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.94215, -0.147808, 0.300842], [-0.33486, 0.375166, -0.864361], [0.014894, -0.915098, -0.402958]] and translation vector: [2.702719, 2.477868, 1.47257]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 
6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.684823, -0.326379, 0.651532], [-0.728707, -0.304485, 0.613413], [-0.001823, -0.894855, -0.446353]] and translation vector: [2.86358, 2.414664, 1.549631], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.684506, -0.325468, 0.652321], [-0.729004, -0.308374, 0.611113], [0.002261, -0.893855, -0.448351]] and translation vector: [2.864701, 2.413023, 1.547001], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.67888, -0.327994, 0.656918], [-0.733931, -0.329441, 0.593981], [0.021593, -0.885375, -0.464376]] and translation vector: [2.877256, 2.417151, 1.541322]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.882784, 0.25224, -0.396318], [0.469583, -0.498211, 0.728888], [-0.013595, -0.829554, -0.55826]] and translation vector: [3.463734, 1.394934, 1.262723], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.883097, 0.250738, -0.396574], [0.468931, -0.499833, 0.728197], [-0.015634, -0.829034, -0.558979]] and translation vector: [3.462241, 1.393432, 1.262782], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.878878, 0.250773, -0.405817], [0.476653, -0.496234, 0.725641], [-0.019409, -0.831183, -0.55566]] and translation vector: [3.458656, 1.394662, 1.254618]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.752445, 0.275595, -0.598225], [0.657828, -0.35994, 0.661593], [-0.032994, -0.891342, -0.452129]] and translation vector: [2.633805, 2.70906, 1.31733], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.746128, 0.269733, -0.608718], [0.664676, -0.35493, 0.657443], [-0.038718, -0.895136, -0.444108]] and translation vector: [2.667176, 2.689206, 1.310347], please estimate the RGB image for the query camera pose, i.e., rotation matrix: 
[[0.736878, 0.253582, -0.626664], [0.67323, -0.359496, 0.646161], [-0.061428, -0.89803, -0.435624]] and translation vector: [2.744361, 2.610373, 1.319779]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.386761, -0.304254, 0.870543], [-0.920043, 0.191539, -0.34181], [-0.062746, -0.933136, -0.354007]] and translation vector: [2.082368, 4.008438, 1.845888], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.387201, -0.298257, 0.872421], [-0.919947, 0.188025, -0.344013], 
[-0.061432, -0.935783, -0.347183]] and translation vector: [2.08001, 4.010775, 1.842824], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.376594, -0.325714, 0.867229], [-0.924884, 0.185353, -0.332016], [-0.052601, -0.927122, -0.371051]] and translation vector: [2.082613, 4.009402, 1.837637]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.484778, 0.389748, -0.782998], [0.874059, -0.248441, 0.417491], [-0.031813, -0.886777, -0.461102]] and translation vector: [2.948564, 2.712566, 1.480667], and 
another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.484062, 0.388161, -0.784229], [0.874419, -0.248162, 0.416902], [-0.03279, -0.887551, -0.459542]] and translation vector: [2.949191, 2.711738, 1.477649], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.467232, 0.39177, -0.792597], [0.88347, -0.241629, 0.401368], [-0.034271, -0.887768, -0.459014]] and translation vector: [2.947397, 2.72527, 1.480424]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: 
[[0.493838, -0.420518, 0.76111], [-0.864926, -0.147366, 0.479777], [-0.089593, -0.895236, -0.436493]] and translation vector: [0.736944, 2.108944, 1.402726], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.487676, -0.423405, 0.763479], [-0.869284, -0.154634, 0.469504], [-0.080731, -0.892646, -0.443471]] and translation vector: [0.733117, 2.095654, 1.39687], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.480924, -0.423346, 0.767783], [-0.872629, -0.146192, 0.465989], [-0.085031, -0.894095, -0.439732]] and translation vector: [0.701425, 2.057617, 1.397946]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: 
The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.874867, -0.0675, 0.479638], [-0.482919, 0.197999, -0.852987], [-0.037391, -0.977875, -0.205819]] and translation vector: [2.397274, 1.722858, 1.486845], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.874077, -0.063653, 0.4816], [-0.484123, 0.196153, -0.852731], [-0.040189, -0.978505, -0.202269]] and translation vector: [2.402604, 1.721845, 1.489477], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.887879, -0.058916, 0.456289], [-0.458188, 0.203011, -0.865362], [-0.041648, -0.977402, -0.207244]] and translation vector: [2.446714, 1.689918, 1.489633]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.993805, -0.057016, 0.095394], [-0.110597, -0.423109, 0.899304], [-0.010913, -0.904283, -0.426794]] and translation vector: [3.282054, 2.568905, 1.512321], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.993106, -0.061381, 0.099861], [-0.116562, -0.427194, 0.896615], [-0.012375, -0.902074, -0.431404]] and translation vector: [3.283498, 2.568158, 1.509645], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.991697, -0.07473, 0.104657], [-0.127453, -0.462749, 0.877279], [-0.017129, -0.883334, -0.468431]] and translation vector: [3.294037, 2.566846, 1.501968]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.672393, -0.274439, 0.687438], [-0.739855, -0.221079, 0.635404], [-0.022402, -0.935846, -0.351697]] and translation vector: [3.802358, 2.110255, 1.494557], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.672432, -0.275262, 0.687071], [-0.739825, -0.222066, 0.635095], [-0.022242, -0.93537, -0.35297]] and translation vector: [3.806542, 2.108163, 1.497405], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[0.662943, -0.279413, 0.694575], [-0.748414, -0.223073, 0.624593], [-0.019579, -0.933899, -0.357001]] and translation vector: [3.809607, 2.112622, 1.492454]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.187285, -0.627824, 0.755488], [-0.982305, 0.118515, -0.145025], [0.001514, -0.76928, -0.63891]] and translation vector: [1.001752, 1.17634, 1.437838], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.187139, -0.630563, 0.75324], [-0.982328, 0.117514, -0.14568], 
[0.003345, -0.767191, -0.64141]] and translation vector: [1.00191, 1.178201, 1.437088], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.182531, -0.636948, 0.748986], [-0.983189, 0.114531, -0.142208], [0.004797, -0.762352, -0.647145]] and translation vector: [1.004145, 1.176443, 1.437678]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.977181, 0.077241, -0.197866], [0.211774, -0.426158, 0.879512], [-0.016388, -0.901345, -0.432791]] and translation vector: [0.977323, 0.877303, 1.40232], and 
another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.979446, 0.063797, -0.19135], [0.200663, -0.404476, 0.892263], [-0.020472, -0.912321, -0.408965]] and translation vector: [0.961423, 0.875672, 1.418643], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.983838, 0.068482, -0.165447], [0.178902, -0.337078, 0.924323], [0.007531, -0.938983, -0.343882]] and translation vector: [0.935081, 0.882589, 1.453845]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: 
[[-0.205964, -0.505778, 0.837716], [-0.978495, 0.11627, -0.170378], [-0.011228, -0.854792, -0.518849]] and translation vector: [2.901534, 4.292832, 1.280844], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.204012, -0.504726, 0.838827], [-0.978841, 0.118998, -0.166463], [-0.0158, -0.855039, -0.518324]] and translation vector: [2.909629, 4.290413, 1.285823], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.169049, -0.47943, 0.861144], [-0.985403, 0.100042, -0.137744], [-0.020112, -0.871859, -0.489344]] and translation vector: [2.918062, 4.255744, 1.296137]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: 
The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.810147, -0.229725, 0.539341], [-0.586224, 0.314131, -0.746769], [0.002128, -0.921167, -0.389162]] and translation vector: [3.108561, 2.950706, 1.466118], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.798041, -0.241673, 0.552019], [-0.602539, 0.306626, -0.736836], [0.00881, -0.920638, -0.390318]] and translation vector: [3.094201, 2.939754, 1.46817], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.730942, -0.298846, 0.613526], [-0.681648, 0.276413, -0.677461], [0.03287, -0.913393, -0.40575]] and translation vector: [3.008661, 2.892656, 1.463078]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.983299, 0.047874, -0.175588], [0.180439, -0.382417, 0.9062], [-0.023764, -0.922749, -0.384668]] and translation vector: [2.208684, 3.483128, 1.468268], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.982577, 0.045136, -0.18029], [0.183889, -0.376806, 0.907856], [-0.026957, -0.925192, -0.378541]] and translation vector: [2.211137, 3.481059, 1.465482], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.983986, 0.057121, -0.168843], [0.177826, -0.379389, 0.907988], [-0.012192, -0.923472, -0.383472]] and translation vector: [2.214237, 3.490379, 1.461581]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.053762, 0.423971, -0.904079], [0.99709, -0.071809, 0.025618], [-0.05406, -0.902825, -0.426597]] and translation vector: [3.696534, 7.381392, 1.65485], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.059051, 0.424044, -0.903714], [0.996629, -0.076693, 0.029136], [-0.056954, -0.902388, -0.427143]] and translation vector: [3.693501, 7.384472, 1.654036], please estimate the RGB image for the query camera pose, i.e., rotation matrix: 
[[0.076295, 0.430516, -0.899353], [0.995602, -0.082082, 0.045168], [-0.054375, -0.898843, -0.434884]] and translation vector: [3.686877, 7.38459, 1.650219]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.079656, -0.319192, 0.944337], [-0.994012, 0.096527, -0.051219], [-0.074805, -0.942762, -0.324969]] and translation vector: [4.3352, 2.935251, 1.464921], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.08136, -0.319768, 0.943996], [-0.993796, 0.098086, -0.052427], [-0.075828, 
-0.942405, -0.325765]] and translation vector: [4.335558, 2.933583, 1.460394], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.082648, -0.359045, 0.929654], [-0.993327, 0.104973, -0.047767], [-0.080438, -0.927398, -0.365325]] and translation vector: [4.342546, 2.934833, 1.439448]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.299058, 0.37418, -0.877812], [0.95368, -0.085842, 0.288314], [0.032528, -0.923375, -0.38252]] and translation vector: [3.908031, 4.993837, 1.41318], and another pair of 
RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.301871, 0.365699, -0.880419], [0.952911, -0.087746, 0.290279], [0.028901, -0.926588, -0.374966]] and translation vector: [3.903484, 4.991583, 1.422828], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.301255, 0.344295, -0.889217], [0.952977, -0.076566, 0.293211], [0.032867, -0.935734, -0.351171]] and translation vector: [3.913385, 4.973511, 1.425571]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.769532, 
-0.429513, 0.472588], [-0.615738, -0.302759, 0.727464], [-0.169375, -0.850797, -0.49745]] and translation vector: [2.184386, 2.253813, 1.283805], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.76638, -0.428136, 0.478917], [-0.620171, -0.298738, 0.725357], [-0.167481, -0.85291, -0.494464]] and translation vector: [2.185226, 2.257666, 1.286817], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.752434, -0.422477, 0.505328], [-0.641308, -0.294924, 0.708339], [-0.150223, -0.857049, -0.492848]] and translation vector: [2.203988, 2.240772, 1.285116]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th 
image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.255196, -0.436856, 0.862573], [-0.966393, 0.143834, -0.213066], [-0.030988, -0.887958, -0.45888]] and translation vector: [1.734999, 0.744851, 1.432124], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.254375, -0.435236, 0.863634], [-0.966628, 0.142475, -0.21291], [-0.03038, -0.888972, -0.456953]] and translation vector: [1.735377, 0.747301, 1.433656], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.252592, -0.430397, 0.866577], [-0.967061, 0.14143, -0.211638], [-0.031471, -0.891491, -0.451944]] and translation vector: [1.738514, 0.752667, 1.434948]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.721847, -0.019511, -0.691778], [0.690918, -0.036893, 0.721991], [-0.039608, -0.999129, -0.013151]] and translation vector: [1.871862, 0.815296, 1.594356], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.723033, -0.022358, -0.690452], [0.689637, -0.034974, 0.723311], [-0.04032, -0.999138, -0.009869]] and translation vector: [1.872181, 0.815734, 1.596287], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.722407, -0.014829, -0.691309], [0.690381, -0.040572, 0.722307], [-0.038759, -0.999067, -0.019072]] and translation vector: [1.866769, 0.812653, 1.587453]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.15851, 0.420096, -0.893529], [0.981106, -0.034663, -0.190342], [-0.110934, -0.906817, -0.406664]] and translation vector: [4.004256, 0.910349, 2.578562], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.153085, 0.419732, -0.894645], [0.982322, -0.034068, -0.184071], [-0.107739, -0.907009, -0.407097]] and translation vector: [4.005316, 0.908549, 2.574668], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[-0.128813, 0.432758, -0.89226], [0.986418, -0.036555, -0.160137], [-0.101917, -0.900769, -0.422171]] and translation vector: [4.005799, 0.894308, 2.560097]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.974605, -0.106498, 0.196986], [-0.223762, -0.428932, 0.875185], [-0.008712, -0.897037, -0.44187]] and translation vector: [2.006689, 0.552817, 1.711334], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.976991, -0.101609, 0.187523], [-0.213093, -0.42809, 0.878254], 
[-0.008962, -0.898006, -0.439892]] and translation vector: [2.014877, 0.551422, 1.700123], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.983342, -0.080889, 0.162776], [-0.181747, -0.450774, 0.87394], [0.002683, -0.888966, -0.457967]] and translation vector: [1.906067, 0.734394, 1.70234]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.677945, 0.409221, -0.610679], [0.735109, 0.38004, -0.561413], [0.00234, -0.829523, -0.558468]] and translation vector: [3.092599, 2.044437, 1.437429], and another 
pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.678782, 0.408186, -0.610442], [0.734335, 0.380383, -0.562193], [0.002723, -0.829875, -0.557943]] and translation vector: [3.0892, 2.043949, 1.440375], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.676872, 0.407734, -0.61286], [0.736083, 0.380637, -0.559729], [0.005057, -0.829981, -0.557769]] and translation vector: [3.08962, 2.045413, 1.436176]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.207705, 
0.494542, -0.843971], [0.97739, -0.069996, 0.199524], [0.039599, -0.866331, -0.497898]] and translation vector: [4.53083, 2.291093, 1.52739], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.209269, 0.494574, -0.843566], [0.977066, -0.071037, 0.200739], [0.039356, -0.866228, -0.498097]] and translation vector: [4.529976, 2.291335, 1.526507], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.196766, 0.49564, -0.845946], [0.979799, -0.067948, 0.18809], [0.035744, -0.865866, -0.498997]] and translation vector: [4.530453, 2.296434, 1.524226]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 
7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.956223, -0.170898, 0.237554], [-0.292595, -0.544035, 0.786393], [-0.005155, -0.821474, -0.570223]] and translation vector: [1.275326, 2.834272, 1.3185], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.956815, -0.170774, 0.235249], [-0.290631, -0.544392, 0.786875], [-0.00631, -0.821263, -0.570514]] and translation vector: [1.276568, 2.833979, 1.318089], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.956011, -0.167954, 0.240486], [-0.293328, -0.545359, 0.785202], [-0.000727, -0.821203, -0.570635]] and translation vector: [1.277841, 2.834386, 1.31762]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.928108, -0.125197, 0.35063], [-0.371823, 0.3599, -0.855699], [-0.019061, -0.924553, -0.380577]] and translation vector: [5.296664, 4.137775, 1.856988], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.930637, -0.119308, 0.34595], [-0.365378, 0.355543, -0.860284], [-0.020361, -0.927014, -0.374474]] and translation vector: [5.29653, 4.126579, 1.856014], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.952426, -0.118849, 0.280641], [-0.304767, 0.367704, -0.878584], [0.001226, -0.922317, -0.386432]] and translation vector: [5.320154, 4.099401, 1.857875]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.86482, -0.183466, 0.467362], [-0.501092, -0.256948, 0.826368], [-0.031523, -0.948851, -0.314147]] and translation vector: [3.012278, 2.022242, 1.442339], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.863867, -0.189194, 0.466839], [-0.502557, -0.260784, 0.824274], [-0.034203, -0.946677, -0.320364]] and translation vector: [3.015002, 2.018446, 1.436262], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[0.859994, -0.189108, 0.473971], [-0.509792, -0.276775, 0.81456], [-0.022856, -0.942143, -0.33443]] and translation vector: [3.018664, 2.017763, 1.427395]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.951558, 0.16536, -0.259218], [0.307283, -0.481983, 0.820531], [0.010744, -0.860436, -0.509446]] and translation vector: [2.919862, 3.428013, 1.521081], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.951326, 0.167996, -0.258374], [0.307875, -0.4803, 0.821295], 
[0.013877, -0.860866, -0.508643]] and translation vector: [2.920042, 3.428186, 1.518811], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.948369, 0.180855, -0.260555], [0.316485, -0.485614, 0.814872], [0.020845, -0.85526, -0.517779]] and translation vector: [2.906806, 3.429147, 1.512746]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.802837, 0.056561, -0.593509], [0.596192, 0.071654, -0.799638], [-0.002701, -0.995825, -0.091248]] and translation vector: [2.583219, 4.008804, 1.439254], and 
another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.802466, 0.056012, -0.594063], [0.59669, 0.070227, -0.799393], [-0.003056, -0.995957, -0.089777]] and translation vector: [2.583684, 4.008714, 1.434935], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.802651, 0.061565, -0.593263], [0.596422, 0.0734, -0.799308], [-0.005664, -0.995401, -0.095633]] and translation vector: [2.580812, 4.010173, 1.435745]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: 
[[-0.355681, -0.20797, 0.911175], [-0.934036, 0.113197, -0.338769], [-0.032689, -0.971563, -0.234514]] and translation vector: [0.539195, 4.841905, 1.636959], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.354881, -0.205091, 0.912139], [-0.934375, 0.110848, -0.338608], [-0.031664, -0.972446, -0.230969]] and translation vector: [0.533365, 4.84225, 1.627512], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.357394, -0.22244, 0.907078], [-0.933778, 0.10396, -0.34242], [-0.018132, -0.969388, -0.244864]] and translation vector: [0.528036, 4.836335, 1.624936]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: 
The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.566304, -0.590941, 0.574533], [-0.823945, 0.423135, -0.376925], [-0.020365, -0.686838, -0.726526]] and translation vector: [2.143516, 1.760119, 1.343188], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.561614, -0.596242, 0.57366], [-0.827171, 0.420904, -0.372329], [-0.019457, -0.683619, -0.729579]] and translation vector: [2.147258, 1.761594, 1.344016], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.547252, -0.609389, 0.573725], [-0.836861, 0.409368, -0.363431], [-0.013394, -0.679017, -0.734001]] and translation vector: [2.154856, 1.762344, 1.343807]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.848489, -0.131122, 0.512712], [-0.527579, 0.133483, -0.838954], [0.041567, -0.982339, -0.182436]] and translation vector: [2.702568, 1.718074, 1.602473], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.851363, -0.128939, 0.508484], [-0.523333, 0.142037, -0.840207], [0.036112, -0.981428, -0.188403]] and translation vector: [2.706553, 1.721294, 1.602035], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.862925, -0.138489, 0.485985], [-0.504369, 0.176659, -0.845224], [0.031201, -0.974481, -0.222293]] and translation vector: [2.716626, 1.723908, 1.586826]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.205292, 0.226186, -0.952205], [0.97316, -0.150555, 0.174048], [-0.103992, -0.962379, -0.251024]] and translation vector: [4.876985, 2.837537, 1.671042], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.210488, 0.22021, -0.952472], [0.971775, -0.153305, 0.17931], [-0.106533, -0.96333, -0.246263]] and translation vector: [4.87733, 2.840179, 1.675237], please estimate the RGB image for the query camera pose, i.e., rotation matrix: 
[[0.247756, 0.187443, -0.950517], [0.962582, -0.158806, 0.219585], [-0.109788, -0.969353, -0.219774]] and translation vector: [4.877867, 2.827038, 1.675608]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.52463, -0.231347, 0.819293], [-0.850589, 0.102279, -0.515789], [0.03553, -0.96748, -0.25044]] and translation vector: [5.897326, 2.792535, 1.553822], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.52763, -0.228151, 0.818263], [-0.84888, 0.105585, -0.517933], [0.03177, 
-0.967884, -0.249382]] and translation vector: [5.897463, 2.790525, 1.551499], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.541576, -0.222735, 0.810608], [-0.840076, 0.107703, -0.53167], [0.031116, -0.968911, -0.245444]] and translation vector: [5.894893, 2.788883, 1.558074]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.988959, -0.006087, -0.148062], [0.148117, 0.009943, 0.98892], [-0.004548, -0.999932, 0.010735]] and translation vector: [3.911582, 2.672538, 1.565046], and another pair of 
RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.987297, -0.007995, -0.158684], [0.158774, 0.012251, 0.987239], [-0.005949, -0.999893, 0.013365]] and translation vector: [3.955948, 2.679338, 1.574419], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.992697, -0.03521, -0.115384], [0.116446, 0.029785, 0.99275], [-0.031518, -0.998936, 0.033668]] and translation vector: [3.907376, 2.643518, 1.623414]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.999494, 0.005595, 
0.031322], [-0.029883, 0.172936, -0.98448], [-0.010925, -0.984917, -0.172681]] and translation vector: [6.687301, 5.436423, 1.742894], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.999393, 0.00615, 0.034285], [-0.032681, 0.175053, -0.984017], [-0.012053, -0.98454, -0.174746]] and translation vector: [6.681215, 5.427393, 1.75699], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.999512, 0.015203, 0.027277], [-0.02448, 0.160854, -0.986675], [-0.019388, -0.986861, -0.160403]] and translation vector: [6.678608, 5.424335, 1.758175]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th 
image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.991592, 0.052224, -0.118397], [0.1292, -0.348306, 0.928435], [0.007248, -0.935925, -0.352124]] and translation vector: [2.177373, 2.142725, 1.46728], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.992093, 0.047571, -0.11614], [0.125441, -0.346386, 0.929667], [0.003996, -0.936885, -0.349615]] and translation vector: [2.181058, 2.142908, 1.465582], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.99009, 0.041581, -0.13414], [0.14016, -0.352521, 0.925248], [-0.008815, -0.93488, -0.354856]] and translation vector: [2.196626, 2.148474, 1.466161]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.132001, -0.567775, 0.812532], [-0.991224, 0.069667, -0.112349], [0.007182, -0.820231, -0.571988]] and translation vector: [2.407685, 4.450429, 1.359714], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.130918, -0.563466, 0.8157], [-0.991376, 0.069526, -0.111087], [0.005882, -0.823209, -0.567709]] and translation vector: [2.40989, 4.444678, 1.359228], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.104614, -0.562754, 0.819978], [-0.994308, 0.042438, -0.097729], [0.020199, -0.825534, -0.563991]] and translation vector: [2.433079, 4.433616, 1.362504]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.877021, 0.121711, -0.464779], [0.46491, 0.459041, -0.75706], [0.12121, -0.880038, -0.459173]] and translation vector: [3.922419, 3.230202, 1.747047], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.876473, 0.11975, -0.466322], [0.465798, 0.455895, -0.758415], [0.121773, -0.881941, -0.455359]] and translation vector: [3.923546, 3.227255, 1.740959], please estimate the RGB image for the query camera pose, i.e., rotation matrix: 
[[-0.862892, 0.148989, -0.482928], [0.494148, 0.449135, -0.744376], [0.105996, -0.880954, -0.461178]] and translation vector: [3.903725, 3.133858, 1.745573]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.515401, -0.339121, 0.786994], [-0.847541, -0.337435, 0.40965], [0.126638, -0.878143, -0.461333]] and translation vector: [4.776819, 1.138867, 1.280463], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.495978, -0.33911, 0.799381], [-0.859276, -0.324304, 0.395565], [0.125103, 
-0.88308, -0.452237]] and translation vector: [4.773187, 1.14016, 1.284317], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.481026, -0.30789, 0.820864], [-0.867671, -0.301264, 0.395457], [0.125539, -0.902465, -0.412064]] and translation vector: [4.757284, 1.147171, 1.295988]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.623567, 0.536294, -0.568817], [0.781209, -0.455034, 0.427384], [-0.029628, -0.710867, -0.702702]] and translation vector: [1.790477, 1.816361, 1.229059], and another pair of 
RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.636074, 0.528408, -0.562313], [0.771074, -0.462894, 0.437235], [-0.029252, -0.711698, -0.701876]] and translation vector: [1.794875, 1.819226, 1.230937], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.674924, 0.4822, -0.558534], [0.737532, -0.464309, 0.49037], [-0.022876, -0.7429, -0.669012]] and translation vector: [1.813084, 1.825686, 1.243736]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.999847, -0.004634, 
0.01689], [-0.017397, -0.374134, 0.927211], [0.002023, -0.927363, -0.374157]] and translation vector: [3.310194, 3.16458, 1.506432], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.999774, -0.010896, 0.018284], [-0.021018, -0.369724, 0.928904], [-0.003361, -0.929078, -0.369869]] and translation vector: [3.316631, 3.168954, 1.519748], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.999711, -0.01062, 0.02156], [-0.023945, -0.363153, 0.931422], [-0.002062, -0.931669, -0.363302]] and translation vector: [3.313389, 3.184942, 1.522696]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th 
image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.573165, 0.475287, -0.667521], [0.819422, -0.337921, 0.462988], [-0.005517, -0.81235, -0.583144]] and translation vector: [4.230747, 1.597944, 1.425469], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.580595, 0.472456, -0.663095], [0.814187, -0.339873, 0.470729], [-0.002969, -0.813186, -0.581996]] and translation vector: [4.228813, 1.597838, 1.42741], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.590926, 0.466068, -0.658474], [0.806725, -0.340048, 0.483283], [0.00133, -0.816791, -0.576932]] and translation vector: [4.230728, 1.601094, 1.427952]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.246516, -0.470365, 0.847341], [-0.959136, 0.006886, 0.282862], [-0.138884, -0.882445, -0.449446]] and translation vector: [3.043058, 2.955299, 1.551102], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.243276, -0.470143, 0.8484], [-0.960213, 0.006937, 0.279182], [-0.13714, -0.882563, -0.44975]] and translation vector: [3.042024, 2.954946, 1.550413], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.220837, -0.468715, 0.855299], [-0.967151, 0.007957, 0.254077], [-0.125896, -0.883313, -0.451561]] and translation vector: [3.035462, 2.949861, 1.549809]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.998134, -0.025826, -0.055325], [0.04389, 0.326427, -0.944203], [0.042444, -0.94487, -0.324684]] and translation vector: [2.355182, 2.984659, 1.395898], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.998605, -0.022906, -0.047579], [0.037628, 0.323493, -0.945482], [0.037048, -0.945953, -0.32218]] and translation vector: [2.345251, 2.98743, 1.391141], please estimate the RGB image for the query camera pose, i.e., rotation matrix: 
[[-0.998425, -0.028903, -0.048087], [0.035665, 0.334665, -0.941662], [0.04331, -0.941894, -0.333107]] and translation vector: [2.317253, 2.991597, 1.388493]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.176261, -0.039155, 0.983564], [-0.983722, -0.028492, -0.177423], [0.03497, -0.998827, -0.033496]] and translation vector: [3.054739, 2.437738, 1.503838], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.18153, -0.048874, 0.98217], [-0.982778, -0.026092, -0.182941], 
[0.034567, -0.998464, -0.043296]] and translation vector: [3.061021, 2.450195, 1.498681], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.163045, -0.034334, 0.986021], [-0.986093, -0.02694, -0.163995], [0.032194, -0.999047, -0.029464]] and translation vector: [3.066704, 2.437577, 1.507359]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.853196, -0.330732, 0.403328], [-0.517406, -0.438892, 0.734619], [-0.065945, -0.835458, -0.545584]] and translation vector: [2.734716, 6.775187, 1.412962], and 
another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.853022, -0.336855, 0.398601], [-0.516617, -0.436898, 0.736361], [-0.0739, -0.834056, -0.546708]] and translation vector: [2.728871, 6.767794, 1.411126], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.851443, -0.340578, 0.398812], [-0.519517, -0.44372, 0.730216], [-0.071735, -0.828927, -0.554738]] and translation vector: [2.722152, 6.743406, 1.39829]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: 
[[0.954506, 0.05554, -0.292973], [0.288831, -0.41644, 0.862064], [-0.074127, -0.907465, -0.413536]] and translation vector: [2.66447, 1.005586, 1.476015], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.956668, 0.052296, -0.286448], [0.280824, -0.425753, 0.860158], [-0.076973, -0.903327, -0.42199]] and translation vector: [2.657996, 1.004761, 1.470821], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.966986, 0.054866, -0.248854], [0.248498, -0.419376, 0.873139], [-0.056458, -0.906153, -0.419165]] and translation vector: [2.617702, 1.004602, 1.502791]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th 
image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.804945, -0.278842, 0.523748], [-0.593014, 0.407765, -0.694307], [-0.019964, -0.869468, -0.493585]] and translation vector: [4.871809, 2.494869, 1.402737], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.804444, -0.274614, 0.526742], [-0.593612, 0.404842, -0.695506], [-0.022252, -0.872176, -0.488687]] and translation vector: [4.863627, 2.491699, 1.400121], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.82218, -0.26485, 0.503859], [-0.568804, 0.416386, -0.709285], [-0.021946, -0.869757, -0.492992]] and translation vector: [4.864128, 2.487759, 1.4037]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.330673, -0.328207, 0.884837], [-0.942686, -0.070458, 0.326157], [-0.044703, -0.941975, -0.332694]] and translation vector: [3.753276, 4.481459, 1.345242], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.306694, -0.326667, 0.893995], [-0.950878, -0.063631, 0.302957], [-0.04208, -0.942995, -0.330136]] and translation vector: [3.754864, 4.497246, 1.34429], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.246991, -0.34493, 0.905549], [-0.96808, -0.046739, 0.246244], [-0.042613, -0.937464, -0.345464]] and translation vector: [3.754345, 4.564482, 1.352383]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.119369, -0.433868, 0.893034], [-0.990549, 0.113242, -0.077387], [-0.067553, -0.893832, -0.443285]] and translation vector: [3.407035, 4.679209, 1.397058], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.120544, -0.432859, 0.893366], [-0.990306, 0.115004, -0.077902], [-0.06902, -0.894096, -0.442526]] and translation vector: [3.401289, 4.681283, 1.397495], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[-0.162977, -0.454909, 0.875498], [-0.983038, 0.15052, -0.104785], [-0.084112, -0.877725, -0.471725]] and translation vector: [3.342063, 4.674428, 1.399173]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.767458, -0.265442, 0.583565], [-0.640543, 0.35536, -0.680752], [-0.026676, -0.896248, -0.442751]] and translation vector: [3.343537, 3.697402, 1.375352], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.780866, -0.263741, 0.566294], [-0.624403, 0.357431, -0.694525], 
[-0.019236, -0.895926, -0.443786]] and translation vector: [3.344022, 3.709659, 1.376654], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.822146, -0.253316, 0.509811], [-0.569276, 0.364542, -0.736908], [0.000823, -0.896069, -0.443913]] and translation vector: [3.329204, 3.745763, 1.383552]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.612656, -0.411508, 0.674769], [-0.789543, 0.280105, -0.546043], [0.035694, -0.867296, -0.496511]] and translation vector: [1.897828, 2.372103, 1.388776], and 
another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.615876, -0.406578, 0.674826], [-0.787242, 0.284147, -0.547275], [0.03076, -0.868305, -0.495075]] and translation vector: [1.892345, 2.36762, 1.390764], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.607068, -0.416916, 0.676498], [-0.79419, 0.289362, -0.534352], [0.027027, -0.861656, -0.506773]] and translation vector: [1.87873, 2.3614, 1.391886]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: 
[[-0.221984, 0.421429, -0.879273], [0.97466, 0.121427, -0.187867], [0.027595, -0.898695, -0.437705]] and translation vector: [3.155292, 0.483793, 1.35371], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.224547, 0.416482, -0.880978], [0.973822, 0.128715, -0.187361], [0.035363, -0.899986, -0.434482]] and translation vector: [3.157119, 0.483672, 1.354178], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.215665, 0.423756, -0.879727], [0.975658, 0.130183, -0.176474], [0.039743, -0.896373, -0.441517]] and translation vector: [3.155366, 0.486351, 1.353433]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 
6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.955421, 0.119616, -0.269932], [0.295248, 0.388339, -0.872939], [0.000408, -0.91372, -0.406343]] and translation vector: [2.65583, 2.981598, 1.368648], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.951595, 0.120375, -0.282803], [0.307283, 0.392547, -0.866882], [0.006663, -0.91182, -0.410535]] and translation vector: [2.655525, 2.981353, 1.361859], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.943467, 0.154725, -0.293138], [0.331247, 0.407989, -0.850776], [-0.01204, -0.89978, -0.436177]] and translation vector: [2.636264, 2.98502, 1.345518]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.908726, 0.150598, -0.389277], [0.406624, 0.108936, -0.907078], [-0.094198, -0.982575, -0.16023]] and translation vector: [8.822721, 3.830595, 1.476402], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.908663, 0.151907, -0.388916], [0.40641, 0.108245, -0.907256], [-0.09572, -0.98245, -0.160095]] and translation vector: [8.818814, 3.832555, 1.475788], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.906287, 0.145588, -0.396797], [0.413103, 0.106574, -0.904427], [-0.089385, -0.983589, -0.156729]] and translation vector: [8.811844, 3.835278, 1.478992]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.895509, 0.17248, -0.410263], [0.444823, 0.375965, -0.812886], [0.014038, -0.91044, -0.413402]] and translation vector: [2.818061, 5.409916, 1.54775], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.895274, 0.172164, -0.410907], [0.445264, 0.376844, -0.812237], [0.01501, -0.910136, -0.414037]] and translation vector: [2.819061, 5.407142, 1.548651], please estimate the RGB image for the query camera pose, i.e., rotation matrix: 
[[-0.894314, 0.169155, -0.414233], [0.446992, 0.379174, -0.810201], [0.020016, -0.909733, -0.414712]] and translation vector: [2.82614, 5.405447, 1.545731]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.852441, 0.228219, -0.470383], [0.522431, 0.337001, -0.78326], [-0.020235, -0.913426, -0.406502]] and translation vector: [1.798405, 5.320803, 1.619482], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.850776, 0.231102, -0.471988], [0.52508, 0.336676, -0.781627], [-0.021728, 
-0.91282, -0.407783]] and translation vector: [1.793927, 5.32593, 1.618758], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.843319, 0.217806, -0.491298], [0.537393, 0.333805, -0.774456], [-0.004683, -0.917134, -0.398552]] and translation vector: [1.789976, 5.331068, 1.629155]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.443363, -0.325026, 0.835337], [-0.895367, 0.117125, -0.429651], [0.041809, -0.938424, -0.342946]] and translation vector: [2.190343, 3.392878, 1.594635], and another pair of 
RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.439336, -0.32163, 0.838772], [-0.897253, 0.111545, -0.427195], [0.043838, -0.940272, -0.337589]] and translation vector: [2.183471, 3.393708, 1.586874], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.44052, -0.339041, 0.83126], [-0.896776, 0.123224, -0.424981], [0.041655, -0.932667, -0.358326]] and translation vector: [2.168168, 3.37614, 1.57519]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.236277, 
-0.452541, 0.859872], [-0.970097, 0.160455, -0.182119], [-0.055554, -0.877189, -0.47692]] and translation vector: [1.575898, 1.961144, 1.314442], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.238966, -0.451212, 0.859828], [-0.9694, 0.162109, -0.184349], [-0.056205, -0.87757, -0.476143]] and translation vector: [1.575219, 1.960128, 1.313122], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.271686, -0.463311, 0.843522], [-0.960992, 0.177771, -0.211879], [-0.051788, -0.868182, -0.493536]] and translation vector: [1.583445, 1.96149, 1.313418]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th 
image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.931668, 0.072515, -0.356001], [0.362912, -0.231685, 0.902561], [-0.017031, -0.970084, -0.24217]] and translation vector: [5.886859, 3.543659, 1.354971], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.931979, 0.073028, -0.355079], [0.362119, -0.233112, 0.902513], [-0.016864, -0.969704, -0.2437]] and translation vector: [5.882501, 3.543666, 1.354317], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.932369, 0.086637, -0.350973], [0.36142, -0.244825, 0.899687], [-0.007981, -0.965689, -0.259579]] and translation vector: [5.853946, 3.560033, 1.352092]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.688084, 0.423256, -0.589401], [0.725514, -0.415863, 0.54835], [-0.013017, -0.80493, -0.593227]] and translation vector: [3.968163, 0.8771, 1.421607], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.688048, 0.420794, -0.591205], [0.725576, -0.411726, 0.551381], [-0.011397, -0.80834, -0.588605]] and translation vector: [3.964529, 0.870938, 1.417962], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.665465, 0.44657, -0.598107], [0.746417, -0.402654, 0.529841], [-0.004219, -0.799027, -0.60128]] and translation vector: [3.954065, 0.866652, 1.420457]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.45377, -0.425062, 0.783208], [-0.891046, 0.227634, -0.392708], [-0.01136, -0.876074, -0.482043]] and translation vector: [2.25004, 3.862298, 1.519108], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.453547, -0.422981, 0.784463], [-0.891155, 0.226808, -0.392938], [-0.011717, -0.877294, -0.47981]] and translation vector: [2.249275, 3.861866, 1.519019], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[-0.445149, -0.42745, 0.786847], [-0.895457, 0.212955, -0.390907], [-0.00047, -0.878599, -0.47756]] and translation vector: [2.244179, 3.86012, 1.517719]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.778266, 0.076502, -0.623257], [0.626532, 0.028295, -0.778882], [-0.041951, -0.996668, -0.069952]] and translation vector: [4.354075, 2.27787, 1.510689], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.774603, 0.078895, -0.627508], [0.631084, 0.031306, -0.775082], 
[-0.041505, -0.996391, -0.074039]] and translation vector: [4.353431, 2.276987, 1.507071], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.765589, 0.09814, -0.635801], [0.642341, 0.061836, -0.76392], [-0.035656, -0.99325, -0.110381]] and translation vector: [4.348542, 2.268086, 1.503072]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.997112, 0.02462, 0.071841], [-0.04661, 0.548461, -0.834876], [-0.059957, -0.835814, -0.545729]] and translation vector: [4.834615, 3.436689, 1.398379], and 
another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.998397, 0.025746, 0.050402], [-0.028149, 0.546702, -0.836854], [-0.0491, -0.836932, -0.545101]] and translation vector: [4.839047, 3.434593, 1.400064], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.999077, -0.037699, 0.020609], [-0.036788, 0.502836, -0.863599], [0.022194, -0.863559, -0.503759]] and translation vector: [4.856574, 3.440762, 1.395837]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation 
matrix: [[-0.924593, 0.219455, -0.311397], [0.371095, 0.334047, -0.86643], [-0.086121, -0.916653, -0.390296]] and translation vector: [7.650298, 2.745242, 1.444521], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.925403, 0.221817, -0.30729], [0.368562, 0.337876, -0.866026], [-0.088274, -0.914679, -0.394425]] and translation vector: [7.650829, 2.747432, 1.442508], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.931334, 0.218695, -0.291187], [0.355288, 0.37018, -0.858334], [-0.079922, -0.902851, -0.422461]] and translation vector: [7.652313, 2.75096, 1.431448]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 
5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.927869, -0.125596, 0.351119], [-0.372891, -0.32108, 0.870551], [0.003399, -0.938687, -0.344754]] and translation vector: [5.442723, 4.031985, 1.348893], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.928984, -0.124208, 0.348657], [-0.370086, -0.32475, 0.870387], [0.005117, -0.937609, -0.347654]] and translation vector: [5.438782, 4.038163, 1.363364], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.930142, -0.10574, 0.351647], [-0.366759, -0.314483, 0.87555], [0.018006, -0.943355, -0.331295]] and translation vector: [5.443505, 4.02862, 1.369591]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.32152, -0.4706, 0.821681], [-0.946681, 0.178549, -0.268172], [-0.020508, -0.864092, -0.502915]] and translation vector: [2.120097, 2.367636, 1.494245], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.324752, -0.471365, 0.819971], [-0.945715, 0.173395, -0.274877], [-0.012612, -0.864725, -0.502087]] and translation vector: [2.101204, 2.346659, 1.492081], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.35351, -0.420371, 0.835655], [-0.935423, 0.155099, -0.317693], [0.00394, -0.893998, -0.448054]] and translation vector: [2.068189, 2.338444, 1.524964]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.504428, 0.479717, -0.717931], [0.860003, -0.204862, 0.467362], [0.077124, -0.853173, -0.515896]] and translation vector: [4.973708, 0.412451, 1.573636], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.50991, 0.478461, -0.714889], [0.856537, -0.205494, 0.47341], [0.079603, -0.853725, -0.514603]] and translation vector: [4.974949, 0.42052, 1.588198], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[0.529349, 0.450337, -0.719018], [0.846093, -0.217693, 0.486556], [0.062589, -0.865914, -0.496262]] and translation vector: [4.987175, 0.423323, 1.59454]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.070416, -0.411804, 0.908548], [-0.99671, 0.065705, -0.047468], [-0.040148, -0.908901, -0.415075]] and translation vector: [2.214543, 1.806687, 1.391502], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.072195, -0.409813, 0.909308], [-0.996578, 0.066438, 
-0.049181], [-0.040258, -0.909747, -0.413207]] and translation vector: [2.216063, 1.808517, 1.395188], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.080916, -0.398975, 0.913384], [-0.996223, 0.061337, -0.061462], [-0.031503, -0.914908, -0.402432]] and translation vector: [2.214478, 1.812354, 1.396036]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.349467, 0.022881, -0.936669], [0.936944, -0.011774, 0.349282], [-0.003037, -0.999669, -0.025553]] and translation vector: [3.08553, 
2.787215, 1.609269], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.348555, 0.021762, -0.937036], [0.937279, -0.012701, 0.34835], [-0.00432, -0.999682, -0.024824]] and translation vector: [3.086167, 2.787834, 1.610474], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.369988, 0.031522, -0.928502], [0.929035, -0.010749, 0.369835], [0.001677, -0.999445, -0.033262]] and translation vector: [3.084904, 2.78765, 1.611416]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera 
pose, i.e., rotation matrix: [[-0.986418, -0.051155, 0.156087], [-0.152905, 0.633099, -0.758819], [-0.060001, -0.772379, -0.632322]] and translation vector: [2.055195, 1.600374, 1.268236], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.986809, -0.050817, 0.15371], [-0.151071, 0.630346, -0.761474], [-0.058194, -0.77465, -0.629707]] and translation vector: [2.054364, 1.600927, 1.26836], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.986971, -0.056701, 0.150577], [-0.152339, 0.630474, -0.761115], [-0.051779, -0.774137, -0.630897]] and translation vector: [2.055561, 1.60142, 1.26922]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", 
+ "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.987126, 0.106622, -0.119219], [0.159938, -0.652529, 0.740693], [0.00118, -0.750225, -0.661181]] and translation vector: [4.64166, 4.052867, 1.404314], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.987387, 0.107853, -0.115912], [0.158278, -0.654013, 0.73974], [0.003975, -0.748756, -0.662834]] and translation vector: [4.649776, 4.051806, 1.400746], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.98973, 0.078153, -0.119695], [0.141622, -0.649931, 0.746681], [-0.019438, -0.755964, -0.654324]] and translation vector: [4.654046, 4.058671, 1.412681]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.994446, -0.078697, 0.06988], [-0.104992, -0.787844, 0.606859], [0.007297, -0.610826, -0.791731]] and translation vector: [1.305105, 0.510448, 1.183315], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.994112, -0.083607, 0.068931], [-0.10831, -0.785774, 0.608956], [0.003251, -0.612836, -0.790203]] and translation vector: [1.308194, 0.508844, 1.184721], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.994174, -0.088174, 0.061991], [-0.107635, -0.781912, 0.614026], [-0.00567, -0.617121, -0.786848]] and translation vector: [1.316761, 0.496028, 1.1951]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.481759, -0.460793, 0.745371], [-0.875469, 0.290199, -0.386444], [-0.038235, -0.838722, -0.543216]] and translation vector: [3.08436, 2.075189, 1.468295], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.482142, -0.463533, 0.743422], [-0.87538, 0.289132, -0.387445], [-0.035354, -0.83758, -0.54517]] and translation vector: [3.085865, 2.079347, 1.468915], please estimate the RGB image for the query camera pose, i.e., 
rotation matrix: [[-0.466183, -0.466331, 0.751804], [-0.884097, 0.276631, -0.376626], [-0.03234, -0.840244, -0.541243]] and translation vector: [3.069418, 2.081707, 1.467716]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.975982, 0.033782, -0.215214], [0.215389, -0.297687, 0.930048], [-0.032648, -0.954066, -0.297814]] and translation vector: [2.838751, 1.414222, 1.664536], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.976127, 0.034525, -0.21444], [0.21483, 
-0.298963, 0.929769], [-0.03201, -0.95364, -0.299243]] and translation vector: [2.83798, 1.414721, 1.663024], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.977071, 0.035817, -0.209879], [0.210869, -0.299025, 0.930655], [-0.029426, -0.953573, -0.299721]] and translation vector: [2.830656, 1.415531, 1.663803]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.054781, -0.427281, 0.902458], [-0.998013, -0.051617, 0.036143], [0.031139, -0.902644, -0.429259]] and translation vector: [1.328526, 
0.849821, 1.501181], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.086578, -0.407933, 0.908898], [-0.995883, -0.060028, 0.067922], [0.026852, -0.911036, -0.41145]] and translation vector: [1.314662, 0.836147, 1.492068], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.123316, -0.40327, 0.906734], [-0.991749, -0.082348, 0.098253], [0.035045, -0.911368, -0.410097]] and translation vector: [1.307532, 0.816785, 1.49678]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera 
pose, i.e., rotation matrix: [[0.140295, 0.625342, -0.767636], [0.990108, -0.090149, 0.107516], [-0.001967, -0.775126, -0.631804]] and translation vector: [3.410891, 3.073526, 1.198756], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.148525, 0.612201, -0.776627], [0.988818, -0.102561, 0.108258], [-0.013376, -0.784022, -0.620589]] and translation vector: [3.421496, 3.097678, 1.206193], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.180299, 0.582031, -0.792926], [0.982291, -0.148308, 0.114495], [-0.050958, -0.799528, -0.598463]] and translation vector: [3.423417, 3.182928, 1.218892]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + 
"options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.408988, -0.323891, 0.853126], [-0.912443, -0.158736, 0.37716], [0.013263, -0.932683, -0.360453]] and translation vector: [3.672612, 2.990265, 1.494339], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.403714, -0.307769, 0.861564], [-0.914697, -0.154884, 0.373283], [0.018558, -0.93877, -0.344045]] and translation vector: [3.67724, 2.998002, 1.501107], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.418114, -0.223767, 0.880403], [-0.907864, -0.136047, 0.396578], [0.031035, -0.965101, -0.260033]] and translation vector: [3.686426, 2.992862, 1.516855]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.804414, -0.195207, 0.561082], [-0.593456, -0.306943, 0.74404], [0.026978, -0.931494, -0.362756]] and translation vector: [4.397897, 1.805397, 1.263968], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.81043, -0.19082, 0.553888], [-0.585149, -0.309439, 0.749566], [0.028363, -0.931577, -0.362436]] and translation vector: [4.406421, 1.797547, 1.276681], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.835561, -0.15907, 0.525866], [-0.54802, -0.309079, 0.777267], [0.038894, -0.937639, -0.345428]] and translation vector: [4.454782, 1.746297, 1.281162]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.454685, 0.144673, -0.878824], [0.890085, 0.109034, -0.442562], [0.031795, -0.983454, -0.178347]] and translation vector: [3.311996, 2.119304, 1.59409], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.453171, 0.138778, -0.880555], [0.890847, 0.10604, -0.441756], [0.032068, -0.98463, -0.171684]] and translation vector: [3.314367, 2.120091, 1.591769], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[-0.43605, 0.134523, -0.889811], [0.898328, 0.123911, -0.42149], [0.053558, -0.983133, -0.174877]] and translation vector: [3.332471, 2.052713, 1.580764]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.51864, -0.44867, 0.727811], [-0.853934, -0.229463, 0.467059], [-0.04255, -0.863738, -0.502143]] and translation vector: [1.002297, 1.98866, 1.344191], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.519607, -0.444592, 0.729621], [-0.853432, -0.229314, 0.468049], 
[-0.040778, -0.865883, -0.498582]] and translation vector: [1.000441, 1.985865, 1.344846], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.525099, -0.430062, 0.734383], [-0.8496, -0.214703, 0.48175], [-0.049508, -0.876898, -0.478121]] and translation vector: [0.994465, 1.977308, 1.35476]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.68967, 0.288211, -0.664297], [0.724122, -0.27239, 0.633602], [0.001663, -0.918008, -0.396559]] and translation vector: [2.530043, 2.005069, 1.437417], and 
another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.68921, 0.288518, -0.66464], [0.724561, -0.273014, 0.632831], [0.001127, -0.917726, -0.397212]] and translation vector: [2.5334, 2.008455, 1.44069], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.695343, 0.287777, -0.658546], [0.718659, -0.271639, 0.640111], [0.005323, -0.918366, -0.395696]] and translation vector: [2.535345, 2.010031, 1.440264]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: 
[[0.24604, -0.551346, 0.797171], [-0.968826, -0.115295, 0.219278], [-0.028988, -0.826271, -0.562526]] and translation vector: [1.704247, 2.057158, 1.361636], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.236706, -0.55071, 0.800431], [-0.971342, -0.115817, 0.207564], [-0.021604, -0.826623, -0.562342]] and translation vector: [1.70792, 2.062619, 1.364929], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.170375, -0.545117, 0.820866], [-0.98536, -0.099505, 0.138438], [0.006215, -0.832434, -0.554089]] and translation vector: [1.68849, 2.12587, 1.375528]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: 
The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.711391, -0.463973, 0.527875], [-0.700286, 0.531398, -0.476672], [-0.059349, -0.708763, -0.702945]] and translation vector: [2.53321, 4.394931, 1.530427], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.710702, -0.465347, 0.527594], [-0.701175, 0.5294, -0.477586], [-0.057065, -0.709357, -0.702536]] and translation vector: [2.526067, 4.393322, 1.526345], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.710832, -0.469663, 0.523579], [-0.701381, 0.52914, -0.477573], [-0.052748, -0.706702, -0.705542]] and translation vector: [2.532494, 4.391185, 1.524071]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.506976, -0.449046, 0.735753], [-0.861802, 0.247713, -0.442646], [0.016513, -0.858485, -0.512574]] and translation vector: [1.568574, 4.423309, 1.333385], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.503836, -0.444181, 0.740846], [-0.863753, 0.25025, -0.437385], [0.008882, -0.860278, -0.509748]] and translation vector: [1.576928, 4.418399, 1.331934], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.476896, -0.475938, 0.738954], [-0.878865, 0.245876, -0.408828], [0.012886, -0.84441, -0.535543]] and translation vector: [1.618973, 4.377153, 1.328238]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.857694, 0.203115, -0.472341], [0.513544, 0.293426, -0.806333], [-0.025181, -0.934155, -0.355978]] and translation vector: [3.161674, 3.662206, 1.335287], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.856666, 0.203827, -0.473897], [0.515344, 0.296604, -0.804019], [-0.023321, -0.932995, -0.359132]] and translation vector: [3.164327, 3.659025, 1.330704], please estimate the RGB image for the query camera pose, i.e., 
rotation matrix: [[-0.851543, 0.201203, -0.48414], [0.523447, 0.274112, -0.806762], [-0.029614, -0.940415, -0.338738]] and translation vector: [3.169208, 3.645592, 1.345035]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.861262, 0.35211, -0.366398], [0.508128, 0.60504, -0.61297], [0.005853, -0.714105, -0.700014]] and translation vector: [3.145762, 3.637784, 1.437024], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.859655, 0.347273, -0.374693], [0.510745, 0.600786, 
-0.614977], [0.011546, -0.720041, -0.693836]] and translation vector: [3.145171, 3.63531, 1.440385], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.904923, 0.242096, -0.350005], [0.423906, 0.585528, -0.690985], [0.037653, -0.773658, -0.632485]] and translation vector: [3.179198, 3.619442, 1.477378]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.624751, -0.31057, 0.716403], [-0.780527, -0.273701, 0.562018], [0.021534, -0.910293, -0.413403]] and translation vector: [-0.212106, 0.775797, 
1.619325], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.624146, -0.312612, 0.716042], [-0.781019, -0.274551, 0.56092], [0.02124, -0.909338, -0.415515]] and translation vector: [-0.212874, 0.777223, 1.616059], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.642142, -0.354499, 0.679694], [-0.766394, -0.316707, 0.558871], [0.017145, -0.879788, -0.475057]] and translation vector: [-0.180935, 0.825968, 1.590205]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, 
i.e., rotation matrix: [[0.984594, -0.069457, 0.160469], [-0.174127, -0.305795, 0.936039], [-0.015944, -0.949561, -0.313178]] and translation vector: [3.941113, 2.817773, 1.559826], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.984592, -0.069572, 0.160429], [-0.174152, -0.307406, 0.935507], [-0.015768, -0.949032, -0.314785]] and translation vector: [3.94407, 2.817183, 1.553188], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.986412, -0.069361, 0.14893], [-0.163547, -0.328462, 0.93025], [-0.015605, -0.941967, -0.335343]] and translation vector: [3.970874, 2.81883, 1.551708]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + 
"options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.565317, -0.50256, 0.654103], [-0.824719, 0.328974, -0.460017], [0.016003, -0.799506, -0.600445]] and translation vector: [4.07549, 5.065369, 1.281872], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.538132, -0.502349, 0.676801], [-0.842747, 0.30749, -0.441846], [0.013851, -0.808143, -0.588824]] and translation vector: [4.054681, 5.042427, 1.283033], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.45677, -0.532015, 0.712967], [-0.889546, 0.265624, -0.371689], [0.008363, -0.803993, -0.594581]] and translation vector: [3.985017, 4.950093, 1.286783]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.971613, -0.06682, 0.226943], [-0.235147, 0.378036, -0.89543], [-0.02596, -0.923376, -0.383017]] and translation vector: [2.775299, 4.618156, 1.427592], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.969099, -0.066923, 0.237421], [-0.244849, 0.377786, -0.892932], [-0.029937, -0.923471, -0.382498]] and translation vector: [2.770648, 4.620754, 1.418404], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.959375, -0.08118, 0.270203], [-0.280099, 0.388898, -0.877669], [-0.033832, -0.917697, -0.395838]] and translation vector: [2.756619, 4.594989, 1.414391]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.218501, -0.721835, 0.656667], [-0.97193, -0.10083, 0.212566], [-0.087226, -0.684681, -0.723605]] and translation vector: [2.10902, 2.428258, 1.386435], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.218569, -0.722397, 0.656026], [-0.971546, -0.098231, 0.215522], [-0.091251, -0.684466, -0.723312]] and translation vector: [2.107975, 2.430531, 1.385643], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[0.234983, -0.674252, 0.70012], [-0.966581, -0.086145, 0.241454], [-0.102489, -0.73346, -0.671961]] and translation vector: [2.089091, 2.418566, 1.400829]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.863619, -0.252896, 0.436126], [-0.502889, 0.371124, -0.780621], [0.03556, -0.893482, -0.447688]] and translation vector: [2.007098, 3.82416, 1.536992], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.862677, -0.255046, 0.436739], [-0.504412, 0.370978, 
-0.779707], [0.036841, -0.892932, -0.448682]] and translation vector: [2.007321, 3.81907, 1.542811], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.863059, -0.255804, 0.435538], [-0.503401, 0.36489, -0.783226], [0.041429, -0.89522, -0.443694]] and translation vector: [2.011345, 3.815826, 1.540639]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.311411, -0.45253, 0.835607], [-0.948656, 0.199362, -0.245576], [-0.055457, -0.869179, -0.491379]] and translation vector: [2.299133, 2.388773, 
1.459468], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.314195, -0.454542, 0.833471], [-0.947818, 0.20019, -0.248124], [-0.05407, -0.867937, -0.493722]] and translation vector: [2.299448, 2.389842, 1.45904], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.319309, -0.479365, 0.817466], [-0.946515, 0.203543, -0.250358], [-0.046377, -0.853686, -0.518719]] and translation vector: [2.297309, 2.382683, 1.450072]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, 
i.e., rotation matrix: [[-0.922168, 0.178823, -0.342969], [0.38661, 0.453076, -0.803278], [0.011746, -0.873352, -0.486947]] and translation vector: [3.207336, 1.959871, 1.267555], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.914921, 0.180426, -0.361063], [0.403188, 0.450583, -0.796502], [0.018979, -0.874312, -0.484993]] and translation vector: [3.204391, 1.957541, 1.273759], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.899907, 0.183343, -0.395667], [0.435126, 0.437531, -0.786913], [0.028842, -0.880314, -0.473515]] and translation vector: [3.195998, 1.957617, 1.285169]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + 
"options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.037281, 0.595041, -0.80283], [0.998378, -0.012419, -0.055566], [-0.043034, -0.803599, -0.593613]] and translation vector: [3.95675, 2.244474, 1.442954], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.038109, 0.594465, -0.803218], [0.998341, -0.012073, -0.056302], [-0.043167, -0.80403, -0.593019]] and translation vector: [3.957906, 2.244142, 1.441716], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.035792, 0.584102, -0.810891], [0.99863, -0.010099, -0.051354], [-0.038185, -0.811617, -0.58294]] and translation vector: [3.956708, 2.24149, 1.443636]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.941243, -0.209403, 0.264975], [-0.336113, 0.504116, -0.795548], [0.033012, -0.837865, -0.544878]] and translation vector: [4.828751, 9.008894, 1.463441], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.939528, -0.206646, 0.273103], [-0.341818, 0.516505, -0.785101], [0.021179, -0.830976, -0.555906]] and translation vector: [4.819307, 9.009376, 1.463735], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.929333, -0.218512, 0.297646], [-0.368979, 0.519063, -0.770992], [0.013974, -0.826333, -0.563008]] and translation vector: [4.802584, 9.04943, 1.458571]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.341382, 0.594812, -0.727775], [0.932196, 0.11517, -0.343142], [-0.120287, -0.795572, -0.593798]] and translation vector: [7.151203, 3.587152, 1.581923], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.344041, 0.585523, -0.734029], [0.930897, 0.110501, -0.348168], [-0.122749, -0.803089, -0.583079]] and translation vector: [7.150104, 3.60012, 1.584136], please estimate the RGB image for the query camera pose, i.e., 
rotation matrix: [[-0.381268, 0.567894, -0.729473], [0.913798, 0.111991, -0.390424], [-0.140025, -0.815448, -0.561639]] and translation vector: [7.153435, 3.678253, 1.582921]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.631332, 0.312126, -0.709927], [0.775472, -0.26347, 0.573784], [-0.007951, -0.912776, -0.408382]] and translation vector: [1.600176, 0.624978, 1.327739], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.627277, 0.311053, -0.713982], [0.778666, 
-0.267257, 0.567673], [-0.014241, -0.912041, -0.409851]] and translation vector: [1.601099, 0.627571, 1.328079], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.610657, 0.317655, -0.725393], [0.791862, -0.253314, 0.555685], [-0.007236, -0.913744, -0.406226]] and translation vector: [1.603666, 0.628049, 1.323957]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.283698, -0.38675, 0.877463], [-0.95878, 0.129662, -0.252839], [-0.015988, -0.913024, -0.407593]] and translation vector: [3.69525, 
3.551647, 1.352095], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.292652, -0.378333, 0.878191], [-0.956147, 0.127043, -0.2639], [-0.011726, -0.91691, -0.398922]] and translation vector: [3.694781, 3.553972, 1.346799], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.31632, -0.391232, 0.864222], [-0.948647, 0.127329, -0.28958], [0.003253, -0.911441, -0.411418]] and translation vector: [3.701458, 3.559184, 1.352364]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera 
pose, i.e., rotation matrix: [[-0.831143, 0.312948, -0.459636], [0.555586, 0.43327, -0.709649], [-0.022937, -0.845187, -0.533978]] and translation vector: [2.360292, 3.05803, 1.315354], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.8108, 0.328121, -0.484706], [0.584922, 0.423558, -0.691711], [-0.021664, -0.844355, -0.535346]] and translation vector: [2.374215, 3.08026, 1.318953], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.76064, 0.373644, -0.530865], [0.648502, 0.400127, -0.647568], [-0.029546, -0.836832, -0.546661]] and translation vector: [2.421989, 3.144455, 1.295588]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + 
"options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.963317, 0.154363, -0.219528], [0.260086, 0.335369, -0.905474], [-0.066149, -0.929355, -0.363214]] and translation vector: [5.972451, 2.818726, 1.468896], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.963149, 0.154275, -0.220326], [0.260736, 0.334417, -0.905639], [-0.066037, -0.929712, -0.362318]] and translation vector: [5.973901, 2.819783, 1.467855], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.966667, 0.155296, -0.203565], [0.245918, 0.341836, -0.907013], [-0.07127, -0.926839, -0.368632]] and translation vector: [5.982299, 2.822232, 1.456096]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.48142, 0.335029, -0.809933], [0.872625, 0.096524, -0.478757], [-0.08222, -0.937251, -0.338823]] and translation vector: [4.429162, 2.287411, 1.464776], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.484328, 0.331289, -0.809737], [0.871134, 0.09698, -0.481374], [-0.080946, -0.938532, -0.335568]] and translation vector: [4.432656, 2.285767, 1.465956], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.510728, 0.315618, -0.799714], [0.857732, 0.123483, -0.499047], [-0.058757, -0.940817, -0.333782]] and translation vector: [4.456876, 2.264055, 1.467574]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.233902, -0.58763, 0.774584], [-0.967246, -0.059828, 0.246692], [-0.098622, -0.806915, -0.582377]] and translation vector: [0.860343, 3.117731, 1.418568], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.233684, -0.587102, 0.775051], [-0.967496, -0.061159, 0.24538], [-0.096661, -0.8072, -0.58231]] and translation vector: [0.859973, 3.119137, 1.418853], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[0.249158, -0.592393, 0.766154], [-0.964448, -0.07981, 0.251935], [-0.088098, -0.801687, -0.591217]] and translation vector: [0.847042, 3.133789, 1.403155]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.645842, -0.099101, 0.757012], [-0.761541, -0.013148, 0.647984], [-0.054263, -0.994991, -0.083961]] and translation vector: [3.729951, 1.432448, 1.733539], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.649827, -0.099601, 0.753528], [-0.757797, -0.00807, 
0.652441], [-0.058903, -0.994995, -0.080722]] and translation vector: [3.727943, 1.43259, 1.731865], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.662065, -0.092976, 0.743657], [-0.747389, -0.008433, 0.664333], [-0.055496, -0.995633, -0.075073]] and translation vector: [3.728372, 1.436196, 1.743771]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.924746, 0.145405, -0.351715], [0.379908, 0.407811, -0.830277], [0.022707, -0.901414, -0.432362]] and translation vector: [3.891577, 4.106122, 
1.335216], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.925289, 0.144931, -0.350479], [0.378485, 0.412032, -0.828842], [0.024284, -0.899569, -0.436102]] and translation vector: [3.892777, 4.104329, 1.336806], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.936719, 0.138164, -0.321666], [0.349495, 0.42231, -0.836366], [0.020288, -0.89586, -0.443873]] and translation vector: [3.898582, 4.105442, 1.335634]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, 
i.e., rotation matrix: [[0.896132, -0.052356, 0.440688], [-0.436974, -0.277444, 0.855616], [0.07747, -0.959314, -0.271505]] and translation vector: [3.211431, 3.110947, 1.584554], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.889709, -0.065096, 0.451863], [-0.451099, -0.277541, 0.848222], [0.070195, -0.958506, -0.276295]] and translation vector: [3.215954, 3.116336, 1.570817], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.866761, -0.113538, 0.485628], [-0.495946, -0.298858, 0.815305], [0.052566, -0.94752, -0.315347]] and translation vector: [3.24594, 3.15503, 1.569742]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + 
"options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.052123, 0.492225, -0.868906], [0.996177, 0.08671, -0.010637], [0.070107, -0.866138, -0.494863]] and translation vector: [3.27549, 2.071379, 1.287401], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.035278, 0.492309, -0.869705], [0.997133, 0.075637, 0.002369], [0.066948, -0.867128, -0.493566]] and translation vector: [3.286684, 2.076202, 1.285681], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.002481, 0.481037, -0.876697], [0.99848, 0.047075, 0.028655], [0.055055, -0.875436, -0.480189]] and translation vector: [3.329912, 2.119781, 1.289403]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.935878, -0.161972, 0.312885], [-0.352322, 0.433116, -0.829627], [-0.001139, -0.886666, -0.46241]] and translation vector: [1.123681, 2.231354, 1.408983], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.935522, -0.159, 0.315466], [-0.353249, 0.430874, -0.830399], [-0.003893, -0.888294, -0.459258]] and translation vector: [1.123559, 2.231523, 1.408322], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.93225, -0.177625, 0.315214], [-0.361774, 0.444334, -0.819565], [0.005515, -0.878076, -0.47849]] and translation vector: [1.117516, 2.230649, 1.39948]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.305635, -0.390507, 0.868385], [-0.952144, 0.122302, -0.280116], [0.003183, -0.91244, -0.409198]] and translation vector: [4.266061, 1.773856, 1.285079], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.300987, -0.399102, 0.866097], [-0.953628, 0.125052, -0.273781], [0.00096, -0.908339, -0.418234]] and translation vector: [4.263163, 1.772832, 1.291083], please estimate the RGB image for the query camera pose, i.e., 
rotation matrix: [[-0.290604, -0.37367, 0.880863], [-0.956686, 0.130175, -0.260397], [-0.017364, -0.918382, -0.395314]] and translation vector: [4.197608, 1.767915, 1.309526]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.485844, -0.617081, 0.619005], [-0.873216, -0.311825, 0.374512], [-0.038083, -0.722479, -0.690343]] and translation vector: [-0.164865, 3.073333, 1.323993], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.482952, -0.621872, 0.616468], [-0.874972, 
-0.315096, 0.367612], [-0.034361, -0.716931, -0.696297]] and translation vector: [-0.16601, 3.069565, 1.320265], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.481893, -0.627462, 0.611613], [-0.875383, -0.314055, 0.367526], [-0.038529, -0.712503, -0.70061]] and translation vector: [-0.162661, 3.069695, 1.32373]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.82141, -0.124481, 0.556588], [-0.562763, -0.33543, 0.755503], [0.092651, -0.933805, -0.345579]] and translation vector: [1.795382, 
2.457259, 1.379582], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.820332, -0.124179, 0.558243], [-0.564621, -0.330977, 0.75608], [0.090876, -0.935432, -0.341626]] and translation vector: [1.795684, 2.460531, 1.380001], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.815123, -0.112718, 0.568216], [-0.568956, -0.340207, 0.748698], [0.108919, -0.933571, -0.341442]] and translation vector: [1.795413, 2.484714, 1.377791]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera 
pose, i.e., rotation matrix: [[-0.464707, 0.496079, -0.733453], [0.882598, 0.326106, -0.338639], [0.071191, -0.804711, -0.589382]] and translation vector: [2.864701, 0.868861, 1.204561], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.473617, 0.501904, -0.723726], [0.878064, 0.332992, -0.343688], [0.068496, -0.798254, -0.598414]] and translation vector: [2.869803, 0.866998, 1.20304], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.486908, 0.474562, -0.733288], [0.872245, 0.308313, -0.379646], [0.045917, -0.82446, -0.564055]] and translation vector: [2.890215, 0.843054, 1.203118]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + 
"options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.199941, 0.263531, -0.943703], [0.979453, -0.027844, 0.19974], [0.026362, -0.964249, -0.263683]] and translation vector: [3.611549, 3.757055, 1.562045], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.20075, 0.267793, -0.94233], [0.97934, -0.030969, 0.199834], [0.024331, -0.962979, -0.268477]] and translation vector: [3.608934, 3.756757, 1.557843], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.195501, 0.302185, -0.932986], [0.980511, -0.041383, 0.192056], [0.019427, -0.95235, -0.304386]] and translation vector: [3.586484, 3.775929, 1.547968]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.869565, 0.231948, -0.435955], [0.492522, 0.471291, -0.731647], [0.035758, -0.850932, -0.524058]] and translation vector: [2.750575, 3.154689, 1.290553], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.871211, 0.246607, -0.424472], [0.49036, 0.478017, -0.72873], [0.023195, -0.843022, -0.53738]] and translation vector: [2.712538, 3.137298, 1.287246], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.868111, 0.301221, -0.394523], [0.496051, 0.497976, -0.711305], [-0.017797, -0.813195, -0.581719]] and translation vector: [2.638672, 3.09301, 1.251808]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.606468, -0.360414, 0.70873], [-0.789578, -0.16805, 0.590192], [-0.093612, -0.91753, -0.386492]] and translation vector: [2.373669, 6.226582, 1.48631], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.603564, -0.356146, 0.713352], [-0.791899, -0.163667, 0.588311], [-0.092772, -0.919986, -0.380815]] and translation vector: [2.370215, 6.229294, 1.484576], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[0.585698, -0.348105, 0.731971], [-0.805739, -0.152014, 0.572431], [-0.087997, -0.925048, -0.369516]] and translation vector: [2.368074, 6.23172, 1.479712]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.414473, -0.491559, 0.765887], [-0.909569, 0.196057, -0.366396], [0.029948, -0.848488, -0.528367]] and translation vector: [0.955419, 3.497842, 1.497559], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.410009, -0.490704, 0.768832], [-0.911757, 0.198024, 
-0.359841], [0.024328, -0.848526, -0.528594]] and translation vector: [0.937857, 3.503192, 1.495427], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.398859, -0.490133, 0.775036], [-0.916862, 0.197836, -0.346736], [0.016617, -0.848899, -0.528293]] and translation vector: [0.908797, 3.515594, 1.497193]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.935902, 0.160482, -0.313582], [0.351212, -0.493772, 0.795512], [-0.027173, -0.854655, -0.518485]] and translation vector: [4.465, -0.226232, 
1.550028], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.933656, 0.161027, -0.319933], [0.356818, -0.495752, 0.791777], [-0.03111, -0.853405, -0.520319]] and translation vector: [4.478531, -0.229773, 1.540292], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.918867, 0.198209, -0.341168], [0.393883, -0.511652, 0.763589], [-0.023209, -0.836017, -0.548212]] and translation vector: [4.561479, -0.239772, 1.527731]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, 
i.e., rotation matrix: [[-0.079918, -0.690871, 0.718547], [-0.996802, 0.055321, -0.057677], [9.6e-05, -0.720858, -0.693082]] and translation vector: [1.142658, 0.968078, 1.385987], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.080635, -0.691404, 0.717954], [-0.996742, 0.054488, -0.059473], [0.002, -0.72041, -0.693545]] and translation vector: [1.144302, 0.967344, 1.387927], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.084359, -0.697761, 0.711347], [-0.996391, 0.05228, -0.066881], [0.009477, -0.714421, -0.699652]] and translation vector: [1.144001, 0.956717, 1.378471]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + 
"options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.799511, 0.533863, -0.275266], [0.600541, 0.71925, -0.349328], [0.011492, -0.4446, -0.895656]] and translation vector: [2.031323, 2.312379, 1.200993], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.794986, 0.540559, -0.275306], [0.606553, 0.715482, -0.346669], [0.009582, -0.442584, -0.896676]] and translation vector: [2.031011, 2.313572, 1.199732], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.773021, 0.563749, -0.290906], [0.633995, 0.702534, -0.323259], [0.022134, -0.434318, -0.900488]] and translation vector: [2.034953, 2.302037, 1.199248]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.573389, -0.355745, 0.738018], [-0.818965, 0.223754, -0.528424], [0.02285, -0.907403, -0.419641]] and translation vector: [2.061407, 3.857203, 1.382209], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.569689, -0.351701, 0.742806], [-0.821614, 0.221591, -0.525212], [0.020118, -0.909508, -0.4152]] and translation vector: [2.058259, 3.848013, 1.384733], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.577204, -0.345215, 0.740042], [-0.816391, 0.223437, -0.532524], [0.018482, -0.911539, -0.410799]] and translation vector: [2.052109, 3.841456, 1.390313]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.998162, -0.007354, -0.06016], [0.055338, 0.294228, -0.954132], [0.024717, -0.955707, -0.293281]] and translation vector: [1.687981, 4.43329, 1.569003], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.998237, -0.004775, -0.059163], [0.055295, 0.287523, -0.956176], [0.021577, -0.957762, -0.286752]] and translation vector: [1.687716, 4.435163, 1.571974], please estimate the RGB image for the query camera pose, i.e., 
rotation matrix: [[-0.998336, 0.001509, -0.057642], [0.055709, 0.283251, -0.957427], [0.014882, -0.959045, -0.282864]] and translation vector: [1.68694, 4.439428, 1.572118]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.633294, -0.360819, 0.684652], [-0.773758, -0.312806, 0.550863], [0.015401, -0.878613, -0.477285]] and translation vector: [3.241882, 3.386626, 1.367882], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.618852, -0.359339, 0.698497], [-0.785116, 
-0.311057, 0.535572], [0.02482, -0.87984, -0.47462]] and translation vector: [3.234923, 3.400149, 1.365622], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.596077, -0.384708, 0.704764], [-0.800029, -0.359087, 0.480636], [0.068167, -0.850327, -0.521821]] and translation vector: [3.228332, 3.407161, 1.324573]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.977514, -0.102294, 0.184398], [-0.210796, -0.497303, 0.841578], [0.005613, -0.861525, -0.507684]] and translation vector: [3.555602, 
1.207732, 1.356493], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.976582, -0.105336, 0.187593], [-0.215087, -0.498001, 0.840079], [0.00493, -0.860755, -0.508995]] and translation vector: [3.555365, 1.207812, 1.356155], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.974531, -0.107289, 0.196922], [-0.224038, -0.504207, 0.834016], [0.009809, -0.856892, -0.515402]] and translation vector: [3.552069, 1.20032, 1.350158]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera 
pose, i.e., rotation matrix: [[0.934222, -0.219071, 0.281493], [-0.356558, -0.595286, 0.72007], [0.009823, -0.773073, -0.634241]] and translation vector: [0.331108, 1.989283, 1.551545], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.93341, -0.222981, 0.281114], [-0.358788, -0.589093, 0.724045], [0.004154, -0.776691, -0.629868]] and translation vector: [0.338532, 1.98258, 1.554168], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.924209, -0.231475, 0.303738], [-0.381819, -0.575084, 0.723528], [0.007196, -0.784664, -0.619879]] and translation vector: [0.352139, 1.976578, 1.57555]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + 
"options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.928375, -0.17783, 0.326339], [-0.371449, 0.415395, -0.830345], [0.012101, -0.892089, -0.451697]] and translation vector: [2.096006, 1.919092, 1.36174], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.929206, -0.177937, 0.323905], [-0.369314, 0.414969, -0.83151], [0.013546, -0.892266, -0.451307]] and translation vector: [2.095672, 1.922099, 1.363168], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.930649, -0.183615, 0.31651], [-0.365027, 0.405695, -0.837954], [0.025454, -0.895375, -0.444584]] and translation vector: [2.086709, 1.937528, 1.366332]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.699126, -0.324611, 0.637064], [-0.713802, 0.265353, -0.648131], [0.041344, -0.907863, -0.417224]] and translation vector: [0.050403, 3.78209, 1.506908], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.698648, -0.327666, 0.636024], [-0.713993, 0.262294, -0.649166], [0.045885, -0.907654, -0.417203]] and translation vector: [0.047406, 3.786517, 1.504266], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.671591, -0.353844, 0.650968], [-0.738623, 0.250587, -0.625813], [0.058316, -0.901111, -0.429649]] and translation vector: [0.057884, 3.801169, 1.498956]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.892065, -0.360019, 0.273141], [-0.443019, -0.577417, 0.685801], [-0.089185, -0.732786, -0.674589]] and translation vector: [2.898737, 2.45906, 1.649541], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.888376, -0.366176, 0.276954], [-0.450762, -0.581088, 0.677606], [-0.087189, -0.726809, -0.681283]] and translation vector: [2.873446, 2.440832, 1.651115], please estimate the RGB image for the query camera pose, i.e., 
rotation matrix: [[0.866588, -0.389647, 0.31177], [-0.495846, -0.601945, 0.625939], [-0.056227, -0.697021, -0.714843]] and translation vector: [2.802999, 2.373059, 1.651133]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.660671, 0.426343, -0.617856], [0.749322, -0.423957, 0.508701], [-0.045063, -0.799057, -0.599565]] and translation vector: [1.739014, 2.260029, 1.323145], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.661948, 0.412501, -0.625834], [0.748146, 
-0.41469, 0.517987], [-0.045857, -0.811095, -0.583114]] and translation vector: [1.741474, 2.257287, 1.327618], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.667808, 0.364392, -0.649039], [0.743671, -0.363436, 0.561132], [-0.031412, -0.857399, -0.513693]] and translation vector: [1.753926, 2.258369, 1.342793]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.997074, 0.061747, -0.045056], [0.074474, 0.651998, -0.754554], [-0.017215, -0.755702, -0.654689]] and translation vector: 
[1.815792, 5.369752, 1.288561], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.994543, 0.080066, -0.066881], [0.102674, 0.63762, -0.763478], [-0.018484, -0.766179, -0.642361]] and translation vector: [1.819087, 5.36055, 1.286161], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.977666, 0.151417, -0.145745], [0.209051, 0.629394, -0.748438], [-0.021596, -0.762191, -0.646992]] and translation vector: [1.833647, 5.312907, 1.282765]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the 
corresponding camera pose, i.e., rotation matrix: [[-0.207785, -0.462455, 0.861952], [-0.977184, 0.13779, -0.161637], [-0.044019, -0.875871, -0.480534]] and translation vector: [2.720584, 1.654419, 1.522448], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.211008, -0.462778, 0.860995], [-0.976592, 0.137438, -0.165466], [-0.04176, -0.875755, -0.480946]] and translation vector: [2.717844, 1.649691, 1.521912], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.235215, -0.460817, 0.855758], [-0.971358, 0.142015, -0.190515], [-0.033738, -0.876059, -0.481022]] and translation vector: [2.714951, 1.646852, 1.521954]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.686341, -0.358824, 0.632599], [-0.727213, -0.35045, 0.590209], [0.009912, -0.865119, -0.50147]] and translation vector: [2.486494, 4.601647, 1.455454], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.681394, -0.352774, 0.64129], [-0.731846, -0.340576, 0.590263], [0.010179, -0.871527, -0.490243]] and translation vector: [2.480601, 4.595852, 1.449959], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.622935, -0.386366, 0.680202], [-0.78205, -0.328403, 0.52967], [0.018734, -0.861901, -0.50673]] and translation vector: [2.469727, 4.596006, 1.44499]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.482968, -0.397392, 0.78027], [-0.874514, 0.173759, -0.452807], [0.044362, -0.901048, -0.431445]] and translation vector: [8.974016, 2.795387, 1.945192], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.496352, -0.388832, 0.776173], [-0.867003, 0.176647, -0.465943], [0.044064, -0.904216, -0.424797]] and translation vector: [8.98292, 2.792107, 1.939625], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.528625, -0.374982, 0.76154], [-0.848241, 0.199205, -0.490719], [0.032308, -0.905376, -0.42338]] and translation vector: [9.019628, 2.751405, 1.924251]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.891251, 0.378307, -0.25011], [0.443048, 0.608538, -0.658323], [-0.096846, -0.697542, -0.709969]] and translation vector: [4.935522, 3.588868, 1.45033], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.887006, 0.383874, -0.256633], [0.452131, 0.60913, -0.651566], [-0.093796, -0.693975, -0.713864]] and translation vector: [4.940225, 3.582454, 1.45688], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[-0.875452, 0.38739, -0.288987], [0.475285, 0.581583, -0.660201], [-0.087685, -0.715325, -0.693269]] and translation vector: [4.970656, 3.561422, 1.469218]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.530794, 0.426739, -0.732224], [0.841151, 0.159702, -0.516681], [-0.10355, -0.890162, -0.443721]] and translation vector: [5.418979, 4.373359, 1.385162], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.532043, 0.421439, -0.734384], [0.841755, 0.169492, 
-0.512564], [-0.091542, -0.890877, -0.444925]] and translation vector: [5.415919, 4.39552, 1.38299], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.539398, 0.40032, -0.740806], [0.839984, 0.194205, -0.506666], [-0.05896, -0.89556, -0.441017]] and translation vector: [5.414681, 4.463818, 1.378667]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.086843, 0.425015, -0.901011], [0.995696, 0.066429, -0.064634], [0.032383, -0.902745, -0.428955]] and translation vector: [4.261571, 5.85756, 
1.66629], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.086953, 0.422316, -0.902268], [0.995713, 0.06553, -0.065286], [0.031554, -0.904077, -0.426204]] and translation vector: [4.260677, 5.865657, 1.669414], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.081846, 0.421358, -0.903194], [0.995927, 0.068976, -0.058071], [0.03783, -0.904268, -0.425287]] and translation vector: [4.263237, 5.864869, 1.673574]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., 
rotation matrix: [[0.725417, 0.297171, -0.620854], [0.687848, -0.279954, 0.669695], [0.025203, -0.912861, -0.407492]] and translation vector: [3.434752, 3.057745, 1.556519], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.722045, 0.303192, -0.621873], [0.691238, -0.278447, 0.666827], [0.029018, -0.911341, -0.410629]] and translation vector: [3.433538, 3.052318, 1.549734], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.693174, 0.307801, -0.651742], [0.720057, -0.255516, 0.645158], [0.032049, -0.916499, -0.398751]] and translation vector: [3.420418, 3.038936, 1.558387]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: 
The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.032646, 0.194727, -0.980314], [0.998594, -0.034636, -0.040135], [-0.04177, -0.980246, -0.193322]] and translation vector: [3.506056, 2.493951, 1.706783], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.038857, 0.192835, -0.980462], [0.998032, -0.040846, -0.047587], [-0.049225, -0.980381, -0.190868]] and translation vector: [3.502031, 2.499079, 1.701362], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.064111, 0.226282, -0.97195], [0.996323, -0.040955, -0.075254], [-0.056835, -0.9732, -0.222824]] and translation vector: [3.459589, 2.490182, 1.701209]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.881415, -0.308012, 0.3581], [-0.47008, 0.646119, -0.601294], [-0.046169, -0.698325, -0.71429]] and translation vector: [3.147524, 1.689608, 1.273114], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.879224, -0.311908, 0.360109], [-0.474637, 0.638627, -0.605703], [-0.041052, -0.703469, -0.709539]] and translation vector: [3.141599, 1.689583, 1.27073], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.878218, -0.323901, 0.351882], [-0.476941, 0.647734, -0.594111], [-0.035492, -0.689586, -0.723334]] and translation vector: [3.127244, 1.682619, 1.264528]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.467192, 0.317292, -0.825262], [0.883302, -0.126478, 0.451421], [0.038855, -0.939856, -0.339354]] and translation vector: [2.723032, 3.168159, 1.438168], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.467636, 0.312306, -0.826911], [0.883318, -0.130557, 0.450227], [0.03265, -0.940968, -0.336919]] and translation vector: [2.722188, 3.168039, 1.441817], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[0.470302, 0.305008, -0.828122], [0.881834, -0.125828, 0.454462], [0.034414, -0.944001, -0.328143]] and translation vector: [2.718763, 3.171866, 1.451475]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.943065, -0.17817, 0.280864], [-0.332105, 0.550897, -0.765649], [-0.018311, -0.815333, -0.578703]] and translation vector: [2.74599, 1.673222, 1.294065], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.942639, -0.173012, 0.285478], [-0.332909, 0.550136, 
-0.765848], [-0.024551, -0.816957, -0.576177]] and translation vector: [2.737266, 1.663808, 1.300966], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.942881, -0.164787, 0.289518], [-0.331772, 0.54291, -0.771477], [-0.030053, -0.823465, -0.566571]] and translation vector: [2.712684, 1.645235, 1.301017]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.695296, -0.421579, 0.582095], [-0.717067, -0.351947, 0.601622], [-0.048765, -0.835707, -0.547007]] and translation vector: [2.470866, 
0.652559, 1.473924], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.695871, -0.418819, 0.583399], [-0.716734, -0.353708, 0.600986], [-0.045352, -0.83635, -0.546317]] and translation vector: [2.469546, 0.651931, 1.473078], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.693531, -0.42586, 0.581085], [-0.719633, -0.371637, 0.586528], [-0.033826, -0.824943, -0.564204]] and translation vector: [2.467637, 0.650008, 1.462326]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding 
camera pose, i.e., rotation matrix: [[0.748873, -0.374013, 0.547087], [-0.662404, -0.447673, 0.600675], [0.020256, -0.812221, -0.582998]] and translation vector: [3.709567, 4.406117, 1.261793], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.747082, -0.370975, 0.551585], [-0.664465, -0.440253, 0.603874], [0.018814, -0.817652, -0.575405]] and translation vector: [3.708719, 4.403161, 1.261416], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.743545, -0.377269, 0.552096], [-0.66849, -0.439378, 0.600057], [0.016196, -0.81524, -0.578898]] and translation vector: [3.708687, 4.402202, 1.259327]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.925351, 0.122106, -0.358909], [0.376741, 0.190476, -0.906524], [-0.042329, -0.974068, -0.222259]] and translation vector: [4.735593, 2.732706, 1.21643], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.924788, 0.125024, -0.359357], [0.377675, 0.187086, -0.906841], [-0.046146, -0.974355, -0.220234]] and translation vector: [4.740286, 2.733964, 1.218072], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.925715, 0.103215, -0.363867], [0.37582, 0.142741, -0.915633], [-0.042569, -0.984363, -0.170928]] and translation vector: [4.730338, 2.742957, 1.247444]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.264492, -0.222038, 0.938479], [-0.962334, 0.002714, 0.271857], [-0.062909, -0.975034, -0.212957]] and translation vector: [0.925816, 4.784833, 1.497389], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.263009, -0.220134, 0.939344], [-0.962729, 0.003779, 0.270443], [-0.063084, -0.975462, -0.210935]] and translation vector: [0.925807, 4.784041, 1.498483], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.243124, -0.227834, 0.942858], [-0.968357, -0.000546, 0.249567], [-0.056345, -0.9737, -0.220758]] and translation vector: [0.931793, 4.784123, 1.4987]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.173351, 0.592298, -0.78685], [0.984858, -0.105806, 0.137329], [-0.001913, -0.798742, -0.601671]] and translation vector: [3.264189, 1.940071, 1.28435], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.172933, 0.589263, -0.789217], [0.98493, -0.105695, 0.136901], [-0.002745, -0.800998, -0.598661]] and translation vector: [3.267153, 1.942133, 1.284021], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[0.139436, 0.623012, -0.769684], [0.990166, -0.096604, 0.101183], [-0.011316, -0.776224, -0.630355]] and translation vector: [3.29114, 1.970334, 1.268272]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.442667, -0.46733, 0.765277], [-0.896368, 0.253361, -0.363776], [-0.023888, -0.847001, -0.531054]] and translation vector: [2.453469, 1.905797, 1.451684], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.441405, -0.472001, 0.763136], [-0.897015, 0.253848, 
-0.361837], [-0.022933, -0.844261, -0.535442]] and translation vector: [2.45238, 1.90449, 1.449179], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.442687, -0.461983, 0.768505], [-0.8965, 0.24504, -0.369112], [-0.017791, -0.852366, -0.522643]] and translation vector: [2.451253, 1.899634, 1.462124]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.643628, -0.362528, 0.674031], [-0.765241, -0.290748, 0.574345], [-0.012243, -0.88546, -0.464555]] and translation vector: [2.632762, 2.243425, 
1.452714], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.642371, -0.361874, 0.675579], [-0.76623, -0.285016, 0.575898], [-0.015852, -0.887589, -0.460364]] and translation vector: [2.634792, 2.237319, 1.452971], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.637523, -0.35682, 0.682821], [-0.770314, -0.279737, 0.573031], [-0.013459, -0.891306, -0.453202]] and translation vector: [2.638724, 2.233015, 1.462981]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, 
i.e., rotation matrix: [[-0.731293, 0.384445, -0.563394], [0.682011, 0.401944, -0.610984], [-0.008437, -0.831049, -0.556135]] and translation vector: [5.176627, 2.209938, 1.427488], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.733453, 0.387758, -0.558292], [0.679719, 0.411882, -0.606907], [-0.005383, -0.82462, -0.565663]] and translation vector: [5.175584, 2.209993, 1.422561], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.739748, 0.384821, -0.551984], [0.672884, 0.424134, -0.606084], [0.000881, -0.819771, -0.572692]] and translation vector: [5.164479, 2.208437, 1.426833]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + 
"options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.996822, -0.027813, -0.074656], [0.056495, -0.413943, 0.908548], [-0.056173, -0.909878, -0.411056]] and translation vector: [4.405487, 5.403347, 1.494535], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.996757, -0.027349, -0.075677], [0.057466, -0.416379, 0.907373], [-0.056327, -0.90878, -0.413457]] and translation vector: [4.408994, 5.403286, 1.494292], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.997265, -0.029561, -0.067745], [0.049832, -0.408017, 0.911613], [-0.05459, -0.912496, -0.405428]] and translation vector: [4.415172, 5.400004, 1.499593]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.789457, 0.162095, -0.592016], [0.613764, 0.197318, -0.764434], [-0.007096, -0.966846, -0.255262]] and translation vector: [5.114759, 3.17533, 1.386193], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.785271, 0.158609, -0.598492], [0.619131, 0.193201, -0.761151], [-0.005096, -0.968255, -0.249915]] and translation vector: [5.11251, 3.170745, 1.383731], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.782732, 0.165019, -0.600083], [0.622288, 0.192888, -0.758652], [-0.009443, -0.967245, -0.253669]] and translation vector: [5.104394, 3.153102, 1.37449]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.95695, -0.100486, 0.272304], [-0.288986, 0.24231, -0.92616], [0.027085, -0.964981, -0.260918]] and translation vector: [1.227478, 4.879099, 1.55452], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.957752, -0.097454, 0.27058], [-0.286469, 0.240112, -0.927514], [0.025421, -0.965841, -0.257885]] and translation vector: [1.221714, 4.885019, 1.554874], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[-0.941817, -0.081741, 0.326036], [-0.336056, 0.20922, -0.91831], [0.00685, -0.974446, -0.224516]] and translation vector: [1.204022, 4.901892, 1.569033]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.982764, 0.054289, -0.17671], [0.184841, -0.27426, 0.943724], [0.002769, -0.960122, -0.279568]] and translation vector: [4.072058, 1.220293, 1.47625], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.982485, 0.057917, -0.177113], [0.186218, -0.270474, 0.944546], 
[0.0068, -0.960984, -0.276522]] and translation vector: [4.071517, 1.218265, 1.477941], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.980674, 0.05705, -0.187148], [0.195532, -0.252477, 0.947641], [0.006813, -0.96592, -0.258752]] and translation vector: [4.0711, 1.209071, 1.48705]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.693623, 0.392298, -0.604144], [0.720137, 0.397492, -0.568686], [0.017048, -0.82952, -0.558217]] and translation vector: [2.706242, 2.586761, 1.453005], and 
another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.690051, 0.396658, -0.605386], [0.723517, 0.399766, -0.56277], [0.018785, -0.826347, -0.562848]] and translation vector: [2.704536, 2.590014, 1.45316], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.674504, 0.428853, -0.600941], [0.737993, 0.414011, -0.53288], [0.020269, -0.80292, -0.595742]] and translation vector: [2.699649, 2.603579, 1.443268]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation 
matrix: [[-0.610102, 0.375008, -0.697958], [0.791763, 0.255448, -0.554849], [-0.029781, -0.891132, -0.452767]] and translation vector: [2.349929, 1.419923, 1.358478], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.607496, 0.374505, -0.700496], [0.793845, 0.255679, -0.551759], [-0.027534, -0.891277, -0.452623]] and translation vector: [2.354864, 1.421781, 1.358478], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.579764, 0.373065, -0.724359], [0.814546, 0.24389, -0.526338], [-0.019694, -0.895176, -0.445277]] and translation vector: [2.359462, 1.423068, 1.367348]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 
5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.081815, 0.638296, -0.765431], [0.996577, -0.061545, 0.055199], [-0.011875, -0.767327, -0.641146]] and translation vector: [3.004073, 1.570726, 1.431248], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.083332, 0.64082, -0.763155], [0.996457, -0.062303, 0.056492], [-0.011346, -0.765159, -0.643742]] and translation vector: [3.00242, 1.571458, 1.432065], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.083112, 0.654572, -0.751417], [0.996444, -0.065065, 0.053535], [-0.013848, -0.753195, -0.657652]] and translation vector: [3.01468, 1.572497, 1.43131]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.348231, 0.123124, -0.929288], [0.936413, -1.6e-05, 0.350899], [0.043189, -0.992391, -0.1153]] and translation vector: [2.712005, 2.075202, 1.464169], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.348319, 0.120186, -0.929639], [0.93641, 0.000395, 0.350907], [0.042542, -0.992751, -0.112406]] and translation vector: [2.712393, 2.076758, 1.463984], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.330226, 0.128954, -0.935052], [0.94318, -0.00633, 0.332223], [0.036923, -0.99163, -0.123717]] and translation vector: [2.702959, 2.087481, 1.468829]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.524333, 0.441188, -0.728305], [0.848808, -0.202677, 0.488311], [0.067827, -0.874228, -0.480754]] and translation vector: [3.10696, 1.250425, 1.344077], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.531491, 0.437044, -0.72561], [0.844432, -0.205894, 0.494513], [0.066725, -0.875557, -0.478485]] and translation vector: [3.107462, 1.25329, 1.344278], please estimate the RGB image for the query camera pose, i.e., rotation 
matrix: [[0.56012, 0.431145, -0.707375], [0.826071, -0.226557, 0.516021], [0.062219, -0.873376, -0.483056]] and translation vector: [3.110022, 1.262991, 1.348097]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.000188, -0.47362, 0.88073], [-0.997828, 0.057931, 0.031365], [-0.065877, -0.878822, -0.47258]] and translation vector: [4.366519, 5.511691, 1.307889], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.002248, -0.465195, 0.885205], [-0.998254, 0.053289, 0.02547], 
[-0.05902, -0.883603, -0.464503]] and translation vector: [4.36891, 5.516212, 1.317108], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.024267, -0.440835, 0.89726], [-0.998159, 0.06059, 0.002773], [-0.055588, -0.895541, -0.441493]] and translation vector: [4.36929, 5.527184, 1.331889]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_5.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_7.png" + ], + "output": "B" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.819759, -0.274444, 0.502669], [-0.572709, 0.39303, -0.719397], [-0.00013, -0.877615, -0.479366]] and translation vector: [2.765326, 1.370172, 1.355227], and 
another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.819555, -0.26888, 0.505998], [-0.572993, 0.389095, -0.721307], [-0.002936, -0.881084, -0.472951]] and translation vector: [2.765196, 1.369276, 1.358405], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.80543, -0.264338, 0.530479], [-0.592674, 0.365802, -0.717584], [-0.004366, -0.892365, -0.451294]] and translation vector: [2.783833, 1.382351, 1.368477]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation 
matrix: [[-0.060487, 0.154719, -0.986105], [0.998165, 0.006603, -0.060191], [-0.002801, -0.987936, -0.154835]] and translation vector: [6.630666, 2.572317, 1.44523], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.062036, 0.175232, -0.982571], [0.998074, 0.011306, -0.060998], [0.00042, -0.984462, -0.175596]] and translation vector: [6.62843, 2.567178, 1.442285], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.077658, 0.209818, -0.974652], [0.996978, 0.01426, -0.076367], [-0.002124, -0.977636, -0.210291]] and translation vector: [6.626263, 2.56408, 1.439607]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_4.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_7.png" + ], + "output": "A" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th 
image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.286652, 0.220257, -0.932372], [0.958024, -0.061246, 0.28007], [0.004584, -0.973517, -0.228568]] and translation vector: [3.76659, 1.676076, 1.452194], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.299829, 0.216367, -0.929133], [0.953977, -0.07366, 0.290693], [-0.005544, -0.973529, -0.228495]] and translation vector: [3.753121, 1.670498, 1.452776], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.332229, 0.205241, -0.920597], [0.943053, -0.089416, 0.320398], [-0.016558, -0.974618, -0.22326]] and translation vector: [3.692962, 1.621141, 1.4585]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_6.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_7.png" + ], + "output": "C" + }, + { + "task": "threeD_Scene_Reconstruction", + "visual_input_component": "3d image", + "source": "SCANNET_threed_scene_reconstruction", + "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.409087, -0.112571, 0.905525], [-0.910894, 0.109148, -0.397943], [-0.05404, -0.987631, -0.147191]] and translation vector: [4.421403, 3.579741, 1.526424], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.417977, -0.10834, 0.901974], [-0.906895, 0.107978, -0.407287], [-0.053267, -0.988232, -0.143386]] and translation vector: [4.418822, 3.582731, 1.526625], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.44932, -0.10036, 0.887716], [-0.891042, 0.12205, -0.437205], [-0.064468, -0.987437, -0.144264]] and translation vector: [4.403283, 3.625828, 1.518726]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", + "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_0.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_1.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_2.jpg", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_3.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_4.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_5.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_6.png", + "../MMIU-Benchmark/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_7.jpg" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.097, -2.343, -0.119, 0.31, 0.062, 0.564], [-0.682, 2.2, 0.854, 0.245, 0.21, 0.887], [1.912, 1.405, 1.111, 0.468, 0.452, 0.649], [1.666, 2.151, 1.319, 0.566, 0.243, 0.586], [1.68, 1.827, 0.754, -0.048, 0.681, 0.119], [1.224, 1.284, 0.947, -0.031, 0.464, 0.194], [1.776, 1.335, 0.376, 0.886, 0.055, 0.335], [1.559, 1.179, 1.34, -0.063, -0.136, -0.017], [0.95, 0.255, 1.046, -0.137, 0.36, -0.187], [1.987, 0.956, 0.62, 0.226, -0.239, 0.309], [1.652, 0.525, 1.179, 0.104, 0.539, -0.125], [1.409, 0.711, 0.681, 0.259, 0.215, 0.643], [1.086, -0.177, -0.123, 0.706, 0.621, 0.551], [1.203, 1.982, 0.324, 0.158, 0.718, -0.1], [-1.373, -0.521, 0.674, -0.225, 0.726, 0.299], [-0.949, 
2.103, 0.551, 0.314, 0.298, 0.513], [-0.76, 1.491, 2.338, 0.158, -0.084, 0.396], [1.233, -0.382, 0.447, 0.091, 0.059, -0.038], [1.9, -1.181, 1.39, 0.188, -0.114, -0.011]]\nB: [[-0.029, -1.923, 0.096, 0.249, 0.113, 0.317], [-0.92, 1.815, 0.646, 0.107, 0.184, 0.516], [1.611, 1.619, 0.954, 0.253, 0.266, 0.249], [1.544, 1.826, 0.949, 0.149, 0.178, 0.182], [1.432, 1.79, 0.87, 0.251, 0.301, 0.054], [1.423, 1.326, 0.897, 0.087, 0.104, 0.188], [1.554, 0.837, 0.836, 0.609, 0.432, 0.17], [1.27, 0.844, 0.842, 0.093, 0.107, 0.179], [1.248, 0.676, 0.764, 0.25, 0.269, 0.064], [1.848, 0.645, 0.825, 0.074, 0.102, 0.07], [1.69, 0.454, 0.837, 0.248, 0.167, 0.182], [1.715, 0.276, 0.86, 0.335, 0.399, 0.17], [1.4, -0.193, 0.037, 0.353, 0.275, 0.161], [1.684, 1.781, 0.29, 0.433, 0.395, 0.339], [-1.62, -0.65, 0.742, 0.272, 0.321, 0.114], [-0.975, 1.832, 0.171, 0.262, 0.195, 0.125], [-1.021, 1.599, 2.098, 0.217, 0.273, 0.221], [1.433, 0.084, 0.074, 0.398, 0.344, 0.176], [1.733, -1.208, 1.076, 0.125, 0.108, 0.327]]\nC: [[0.373, -2.148, -0.291, 0.339, 0.018, 0.694], [-0.499, 2.165, 0.993, 0.276, 0.335, 0.775], [1.74, 1.478, 1.323, -0.069, 0.758, 0.607], [1.532, 2.302, 1.262, 0.285, 0.22, -0.252], [1.323, 1.537, 0.593, 0.351, 0.467, 0.392], [1.364, 1.041, 1.236, 0.12, 0.57, 0.444], [1.442, 1.263, 1.284, 1.004, 0.007, 0.304], [1.115, 0.536, 0.672, -0.113, -0.219, -0.082], [1.743, 0.762, 0.395, 0.159, 0.41, 0.323], [2.121, 0.573, 0.527, -0.324, 0.247, 0.462], [1.447, 0.752, 1.299, 0.299, 0.347, 0.233], [1.92, 0.62, 0.769, -0.13, 0.686, -0.059], [0.942, 0.049, -0.066, 0.316, 0.607, 0.459], [2.077, 2.024, 0.781, 0.373, -0.058, 0.752], [-1.491, -0.599, 0.622, 0.707, -0.171, -0.319], [-1.023, 1.772, -0.236, 0.203, 0.47, 0.117], [-0.596, 1.76, 1.726, 0.197, 0.073, 0.18], [1.574, 0.398, 0.118, 0.732, 0.235, 0.24], [1.654, -1.081, 1.126, -0.043, 0.128, 0.085]]\nD: [[-0.016, -2.182, 0.529, 0.012, -0.234, 0.082], [-0.638, 1.532, 1.107, 0.49, 0.648, 0.861], [1.27, 1.262, 1.438, 0.461, 0.457, 0.658], 
[1.731, 1.955, 1.411, 0.079, 0.038, 0.636], [1.916, 1.8, 0.455, 0.749, 0.555, 0.441], [1.273, 0.97, 0.909, 0.183, -0.155, 0.402], [1.478, 1.046, 1.305, 0.37, 0.729, 0.224], [1.279, 0.48, 0.354, 0.143, -0.211, 0.086], [1.055, 0.494, 1.055, -0.029, 0.559, -0.151], [2.256, 0.151, 1.167, -0.326, -0.138, 0.075], [1.326, 0.605, 0.815, -0.119, 0.42, 0.177], [2.172, 0.341, 0.688, 0.742, 0.292, 0.566], [1.621, -0.605, 0.175, 0.538, -0.117, 0.628], [1.477, 1.542, -0.082, 0.684, 0.168, -0.065], [-1.675, -0.211, 0.417, 0.169, -0.09, -0.164], [-1.277, 1.624, 0.657, -0.231, 0.334, -0.097], [-0.751, 1.371, 1.707, -0.044, 0.702, 0.452], [1.41, -0.207, 0.284, 0.114, 0.651, -0.312], [1.447, -0.888, 1.553, -0.369, 0.402, 0.174]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[0.86482, -0.183466, 0.467362], [-0.501092, -0.256948, 0.826368], [-0.031523, -0.948851, -0.314147]]; the translation vector: [3.012278, 2.022242, 1.442339], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.097, -2.343, -0.119, 0.31, 0.062, 0.564], [-0.682, 2.2, 0.854, 0.245, 0.21, 0.887], [1.912, 1.405, 1.111, 0.468, 0.452, 0.649], [1.666, 2.151, 1.319, 0.566, 0.243, 0.586], [1.68, 1.827, 0.754, -0.048, 0.681, 0.119], [1.224, 1.284, 0.947, -0.031, 0.464, 0.194], [1.776, 1.335, 0.376, 0.886, 0.055, 0.335], [1.559, 1.179, 1.34, -0.063, -0.136, -0.017], [0.95, 0.255, 1.046, -0.137, 0.36, -0.187], [1.987, 0.956, 0.62, 0.226, -0.239, 0.309], [1.652, 0.525, 1.179, 0.104, 0.539, -0.125], [1.409, 0.711, 0.681, 0.259, 0.215, 0.643], [1.086, -0.177, -0.123, 0.706, 0.621, 0.551], [1.203, 1.982, 0.324, 0.158, 0.718, -0.1], [-1.373, -0.521, 0.674, -0.225, 0.726, 0.299], [-0.949, 2.103, 0.551, 0.314, 0.298, 0.513], [-0.76, 1.491, 2.338, 0.158, -0.084, 0.396], [1.233, -0.382, 0.447, 0.091, 0.059, -0.038], [1.9, -1.181, 1.39, 0.188, -0.114, -0.011]]\nB: [[-0.029, -1.923, 0.096, 0.249, 0.113, 0.317], [-0.92, 1.815, 0.646, 0.107, 0.184, 0.516], [1.611, 1.619, 0.954, 0.253, 0.266, 0.249], [1.544, 1.826, 0.949, 0.149, 0.178, 0.182], [1.432, 1.79, 0.87, 0.251, 0.301, 0.054], [1.423, 1.326, 0.897, 0.087, 0.104, 0.188], [1.554, 0.837, 0.836, 0.609, 0.432, 0.17], [1.27, 0.844, 0.842, 0.093, 0.107, 0.179], [1.248, 0.676, 0.764, 0.25, 0.269, 0.064], [1.848, 0.645, 0.825, 0.074, 0.102, 0.07], [1.69, 0.454, 0.837, 0.248, 0.167, 0.182], [1.715, 0.276, 0.86, 0.335, 0.399, 0.17], [1.4, -0.193, 0.037, 0.353, 0.275, 0.161], [1.684, 1.781, 0.29, 0.433, 0.395, 0.339], [-1.62, -0.65, 0.742, 0.272, 0.321, 0.114], [-0.975, 1.832, 0.171, 0.262, 0.195, 0.125], [-1.021, 1.599, 2.098, 0.217, 0.273, 0.221], [1.433, 0.084, 0.074, 0.398, 0.344, 0.176], [1.733, -1.208, 1.076, 0.125, 0.108, 0.327]]\nC: [[0.373, -2.148, -0.291, 0.339, 0.018, 0.694], [-0.499, 2.165, 0.993, 0.276, 0.335, 0.775], [1.74, 1.478, 1.323, -0.069, 0.758, 0.607], [1.532, 2.302, 1.262, 0.285, 0.22, -0.252], [1.323, 1.537, 0.593, 0.351, 0.467, 0.392], [1.364, 1.041, 1.236, 0.12, 0.57, 0.444], 
[1.442, 1.263, 1.284, 1.004, 0.007, 0.304], [1.115, 0.536, 0.672, -0.113, -0.219, -0.082], [1.743, 0.762, 0.395, 0.159, 0.41, 0.323], [2.121, 0.573, 0.527, -0.324, 0.247, 0.462], [1.447, 0.752, 1.299, 0.299, 0.347, 0.233], [1.92, 0.62, 0.769, -0.13, 0.686, -0.059], [0.942, 0.049, -0.066, 0.316, 0.607, 0.459], [2.077, 2.024, 0.781, 0.373, -0.058, 0.752], [-1.491, -0.599, 0.622, 0.707, -0.171, -0.319], [-1.023, 1.772, -0.236, 0.203, 0.47, 0.117], [-0.596, 1.76, 1.726, 0.197, 0.073, 0.18], [1.574, 0.398, 0.118, 0.732, 0.235, 0.24], [1.654, -1.081, 1.126, -0.043, 0.128, 0.085]]\nD: [[-0.016, -2.182, 0.529, 0.012, -0.234, 0.082], [-0.638, 1.532, 1.107, 0.49, 0.648, 0.861], [1.27, 1.262, 1.438, 0.461, 0.457, 0.658], [1.731, 1.955, 1.411, 0.079, 0.038, 0.636], [1.916, 1.8, 0.455, 0.749, 0.555, 0.441], [1.273, 0.97, 0.909, 0.183, -0.155, 0.402], [1.478, 1.046, 1.305, 0.37, 0.729, 0.224], [1.279, 0.48, 0.354, 0.143, -0.211, 0.086], [1.055, 0.494, 1.055, -0.029, 0.559, -0.151], [2.256, 0.151, 1.167, -0.326, -0.138, 0.075], [1.326, 0.605, 0.815, -0.119, 0.42, 0.177], [2.172, 0.341, 0.688, 0.742, 0.292, 0.566], [1.621, -0.605, 0.175, 0.538, -0.117, 0.628], [1.477, 1.542, -0.082, 0.684, 0.168, -0.065], [-1.675, -0.211, 0.417, 0.169, -0.09, -0.164], [-1.277, 1.624, 0.657, -0.231, 0.334, -0.097], [-0.751, 1.371, 1.707, -0.044, 0.702, 0.452], [1.41, -0.207, 0.284, 0.114, 0.651, -0.312], [1.447, -0.888, 1.553, -0.369, 0.402, 0.174]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_0_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_0_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.373, -1.08, 0.874, 0.298, 4.385, 1.982], [2.17, -0.036, 1.086, 0.309, 6.903, 1.887], [0.087, 4.155, 1.455, 2.931, 0.24, 1.054], [-2.394, 2.523, 0.998, 0.208, 1.864, 1.096]]\nB: [[-2.011, -0.956, 1.222, 
0.755, 4.764, 2.451], [2.068, 0.255, 0.587, 0.436, 7.258, 1.991], [0.329, 4.55, 1.265, 3.421, 0.529, 1.367], [-2.081, 2.555, 0.715, 0.343, 1.488, 1.021]]\nC: [[-2.751, -0.591, 0.65, -0.115, 4.495, 2.351], [1.978, 0.331, 1.034, 0.171, 7.03, 2.051], [0.03, 3.84, 1.693, 3.348, 0.554, 1.247], [-2.636, 2.957, 1.408, -0.018, 1.435, 1.132]]\nD: [[-2.401, -1.35, 0.706, 0.08, 4.387, 2.134], [2.651, -0.401, 0.766, 0.612, 6.557, 1.511], [0.041, 3.914, 1.655, 3.173, 0.701, 0.821], [-2.147, 2.752, 0.898, 0.388, 2.028, 1.081]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.060487, 0.154719, -0.986105], [0.998165, 0.006603, -0.060191], [-0.002801, -0.987936, -0.154835]]; the translation vector: [6.630666, 2.572317, 1.44523], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.373, -1.08, 0.874, 0.298, 4.385, 1.982], [2.17, -0.036, 1.086, 0.309, 6.903, 1.887], [0.087, 4.155, 1.455, 2.931, 0.24, 1.054], [-2.394, 2.523, 0.998, 0.208, 1.864, 1.096]]\nB: [[-2.011, -0.956, 1.222, 0.755, 4.764, 2.451], [2.068, 0.255, 0.587, 0.436, 7.258, 1.991], [0.329, 4.55, 1.265, 3.421, 0.529, 1.367], [-2.081, 2.555, 0.715, 0.343, 1.488, 1.021]]\nC: [[-2.751, -0.591, 0.65, -0.115, 4.495, 2.351], [1.978, 0.331, 1.034, 0.171, 7.03, 2.051], [0.03, 3.84, 1.693, 3.348, 0.554, 1.247], [-2.636, 2.957, 1.408, -0.018, 1.435, 1.132]]\nD: [[-2.401, -1.35, 0.706, 0.08, 4.387, 2.134], [2.651, -0.401, 0.766, 0.612, 6.557, 1.511], [0.041, 3.914, 1.655, 3.173, 0.701, 0.821], [-2.147, 2.752, 0.898, 0.388, 2.028, 1.081]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_1_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_1_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.552, 0.743, 0.817, 1.009, -0.013, 1.113], [0.943, 1.174, 0.929, -0.18, 0.779, 1.068]]\nB: [[0.748, 0.782, 0.621, 0.556, 0.127, 0.839], [1.697, 1.131, 0.375, -0.273, 1.039, 1.495]]\nC: [[0.612, 1.202, 0.708, 0.529, 0.114, 0.821], [1.612, 1.184, 0.198, 0.419, 0.977, 0.647]]\nD: [[0.368, 1.181, 0.615, 0.989, 0.028, 1.271], [1.284, 0.726, 0.504, 0.067, 0.905, 1.037]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the doorframe in the scene. The camera pose information includes: the rotation matrix: [[-0.610102, 0.375008, -0.697958], [0.791763, 0.255448, -0.554849], [-0.029781, -0.891132, -0.452767]]; the translation vector: [2.349929, 1.419923, 1.358478], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.552, 0.743, 0.817, 1.009, -0.013, 1.113], [0.943, 1.174, 0.929, -0.18, 0.779, 1.068]]\nB: [[0.748, 0.782, 0.621, 0.556, 0.127, 0.839], [1.697, 1.131, 0.375, -0.273, 1.039, 1.495]]\nC: [[0.612, 1.202, 0.708, 0.529, 0.114, 0.821], [1.612, 1.184, 0.198, 0.419, 0.977, 0.647]]\nD: [[0.368, 1.181, 0.615, 0.989, 0.028, 1.271], [1.284, 0.726, 0.504, 0.067, 0.905, 1.037]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_2_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_2_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.47, 0.453, 0.894, 0.2, 0.52, 0.291], [1.542, -0.676, 0.862, 0.217, 0.405, 0.289], [-1.666, -1.034, 0.158, 0.332, 0.363, 0.294]]\nB: [[1.471, 0.336, 1.375, -0.216, 0.786, 0.736], [1.469, -0.24, 1.152, 0.44, 0.255, 0.196], [-1.84, -1.112, 0.203, 0.586, 0.569, 0.159]]\nC: [[1.315, 0.861, 1.294, -0.145, 0.334, 0.615], [1.166, -0.686, 1.016, 0.2, 0.258, 0.346], [-2.029, -1.236, -0.071, 0.818, 0.37, 0.684]]\nD: [[1.59, 0.904, 1.331, 0.394, 0.302, 0.781], [1.626, -0.262, 1.266, -0.178, 0.337, 0.326], [-1.34, -1.047, 0.45, -0.149, 0.438, 0.179]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the speaker in the scene. 
The camera pose information includes: the rotation matrix: [[-0.283698, -0.38675, 0.877463], [-0.95878, 0.129662, -0.252839], [-0.015988, -0.913024, -0.407593]]; the translation vector: [3.69525, 3.551647, 1.352095], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.47, 0.453, 0.894, 0.2, 0.52, 0.291], [1.542, -0.676, 0.862, 0.217, 0.405, 0.289], [-1.666, -1.034, 0.158, 0.332, 0.363, 0.294]]\nB: [[1.471, 0.336, 1.375, -0.216, 0.786, 0.736], [1.469, -0.24, 1.152, 0.44, 0.255, 0.196], [-1.84, -1.112, 0.203, 0.586, 0.569, 0.159]]\nC: [[1.315, 0.861, 1.294, -0.145, 0.334, 0.615], [1.166, -0.686, 1.016, 0.2, 0.258, 0.346], [-2.029, -1.236, -0.071, 0.818, 0.37, 0.684]]\nD: [[1.59, 0.904, 1.331, 0.394, 0.302, 0.781], [1.626, -0.262, 1.266, -0.178, 0.337, 0.326], [-1.34, -1.047, 0.45, -0.149, 0.438, 0.179]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_3_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_3_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.415, 0.474, 0.662, 0.608, 0.318, 0.607], [0.137, 1.366, 0.214, 0.809, 0.687, 0.563], [-0.119, -1.513, 0.412, 0.991, 0.469, 0.436], [0.958, -1.756, 0.374, 0.393, 0.461, 0.273]]\nB: [[0.097, 0.337, 0.367, 0.736, 0.669, 0.76], [-0.118, 0.915, 0.406, 0.54, 0.71, 0.78], [0.039, -1.273, 0.366, 0.52, 0.703, 0.787], [0.484, -2.107, 0.393, 0.516, 0.773, 
0.731]]\nC: [[-0.145, 0.33, 0.215, 0.329, 0.397, 1.235], [-0.041, 1.377, -0.008, 0.173, 0.698, 1.043], [0.354, -0.822, 0.479, 0.306, 0.474, 0.987], [0.885, -2.559, 0.576, 0.548, 1.045, 0.546]]\nD: [[-0.062, 0.608, 0.646, 1.11, 1.056, 0.374], [0.266, 1.23, 0.893, 0.95, 0.801, 1.268], [-0.368, -1.641, 0.003, 0.257, 0.709, 0.427], [0.563, -2.189, 0.486, 0.531, 1.025, 0.714]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[-0.857694, 0.203115, -0.472341], [0.513544, 0.293426, -0.806333], [-0.025181, -0.934155, -0.355978]]; the translation vector: [3.161674, 3.662206, 1.335287], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.415, 0.474, 0.662, 0.608, 0.318, 0.607], [0.137, 1.366, 0.214, 0.809, 0.687, 0.563], [-0.119, -1.513, 0.412, 0.991, 0.469, 0.436], [0.958, -1.756, 0.374, 0.393, 0.461, 0.273]]\nB: [[0.097, 0.337, 0.367, 0.736, 0.669, 0.76], [-0.118, 0.915, 0.406, 0.54, 0.71, 0.78], [0.039, -1.273, 0.366, 0.52, 0.703, 0.787], [0.484, -2.107, 0.393, 0.516, 0.773, 0.731]]\nC: [[-0.145, 0.33, 0.215, 0.329, 0.397, 1.235], [-0.041, 1.377, -0.008, 0.173, 0.698, 1.043], [0.354, -0.822, 0.479, 0.306, 0.474, 0.987], [0.885, -2.559, 0.576, 0.548, 1.045, 0.546]]\nD: [[-0.062, 0.608, 0.646, 1.11, 1.056, 0.374], [0.266, 1.23, 0.893, 0.95, 0.801, 1.268], [-0.368, -1.641, 0.003, 0.257, 0.709, 0.427], [0.563, -2.189, 0.486, 0.531, 1.025, 0.714]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_4_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_4_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.3, -0.383, 0.923, -0.356, 3.02, 2.635], [-2.115, 1.383, 0.841, 0.469, 0.927, 1.856], [-0.097, 1.484, 1.187, 3.044, -0.119, 1.807], [-1.377, 2.117, 0.899, -0.033, 1.21, 1.808], [1.159, -1.198, 1.874, 0.555, 0.176, 1.325]]\nB: [[1.941, -0.755, 1.598, -0.121, 2.674, 2.518], [-2.423, 1.211, 1.402, 0.192, 0.868, 2.164], [0.169, 1.203, 1.223, 2.673, -0.21, 2.157], [-1.688, 1.565, 0.939, 0.535, 1.926, 1.811], [1.734, -1.649, 1.385, 0.714, 0.109, 1.38]]\nC: [[1.696, -0.287, 1.129, 0.119, 2.8, 2.268], [-2.577, 1.535, 1.204, 0.613, 1.32, 2.198], [0.155, 1.133, 1.131, 3.032, 0.104, 2.204], [-1.285, 1.824, 1.154, 0.225, 1.437, 2.219], [1.488, -1.695, 1.419, 0.405, 0.098, 1.172]]\nD: [[1.453, -0.695, 1.348, 0.547, 3.272, 2.434], [-2.777, 1.417, 1.534, 1.007, 1.19, 1.834], [-0.034, 0.735, 1.208, 2.75, 0.551, 2.499], [-1.311, 1.97, 1.046, -0.151, 1.928, 1.721], [1.211, -2.136, 
1.037, 0.829, -0.033, 1.518]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.199941, 0.263531, -0.943703], [0.979453, -0.027844, 0.19974], [0.026362, -0.964249, -0.263683]]; the translation vector: [3.611549, 3.757055, 1.562045], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.3, -0.383, 0.923, -0.356, 3.02, 2.635], [-2.115, 1.383, 0.841, 0.469, 0.927, 1.856], [-0.097, 1.484, 1.187, 3.044, -0.119, 1.807], [-1.377, 2.117, 0.899, -0.033, 1.21, 1.808], [1.159, -1.198, 1.874, 0.555, 0.176, 1.325]]\nB: [[1.941, -0.755, 1.598, -0.121, 2.674, 2.518], [-2.423, 1.211, 1.402, 0.192, 0.868, 2.164], [0.169, 1.203, 1.223, 2.673, -0.21, 2.157], [-1.688, 1.565, 0.939, 0.535, 1.926, 1.811], [1.734, -1.649, 1.385, 0.714, 0.109, 1.38]]\nC: [[1.696, -0.287, 1.129, 0.119, 2.8, 2.268], [-2.577, 1.535, 1.204, 0.613, 1.32, 2.198], [0.155, 1.133, 1.131, 3.032, 0.104, 2.204], [-1.285, 1.824, 1.154, 0.225, 1.437, 2.219], [1.488, -1.695, 1.419, 0.405, 0.098, 1.172]]\nD: [[1.453, -0.695, 1.348, 0.547, 3.272, 2.434], [-2.777, 1.417, 1.534, 1.007, 1.19, 1.834], [-0.034, 0.735, 1.208, 2.75, 0.551, 2.499], [-1.311, 1.97, 1.046, -0.151, 1.928, 1.721], [1.211, -2.136, 1.037, 0.829, -0.033, 1.518]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_5_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_5_1.png" + ], + "output": 
"C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[2.228, -1.379, 0.968, 0.05, 0.868, 0.876]]\nB: [[1.972, -1.363, 0.684, 0.094, 1.002, 1.369]]\nC: [[2.421, -1.068, 1.034, 0.177, 1.167, 1.478]]\nD: [[2.274, -1.598, 1.168, 0.52, 1.421, 1.228]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. The camera pose information includes: the rotation matrix: [[-0.156961, 0.257294, -0.953501], [0.986843, 0.002956, -0.161652], [-0.038773, -0.966329, -0.254373]]; the translation vector: [1.838324, 1.205476, 1.480452], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[2.228, -1.379, 0.968, 0.05, 0.868, 0.876]]\nB: [[1.972, -1.363, 0.684, 0.094, 1.002, 1.369]]\nC: [[2.421, -1.068, 1.034, 0.177, 1.167, 1.478]]\nD: [[2.274, -1.598, 1.168, 0.52, 1.421, 1.228]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_6_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_6_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.821, -1.627, 0.918, 0.317, 0.52, -0.34]]\nB: [[-1.196, -1.892, 0.539, 0.329, 0.374, 0.146]]\nC: [[-1.472, -2.353, 0.745, 0.703, 0.302, -0.352]]\nD: [[-1.33, -1.906, 0.911, -0.061, 0.797, -0.104]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the jacket in the scene. The camera pose information includes: the rotation matrix: [[0.999847, -0.004634, 0.01689], [-0.017397, -0.374134, 0.927211], [0.002023, -0.927363, -0.374157]]; the translation vector: [3.310194, 3.16458, 1.506432], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.821, -1.627, 0.918, 0.317, 0.52, -0.34]]\nB: [[-1.196, -1.892, 0.539, 0.329, 0.374, 0.146]]\nC: [[-1.472, -2.353, 0.745, 0.703, 0.302, -0.352]]\nD: [[-1.33, -1.906, 0.911, -0.061, 0.797, -0.104]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_7_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_7_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.441, 1.064, 0.78, 0.6, 1.194, 1.759]]\nB: [[0.806, 1.175, 0.723, 1.145, 0.857, 1.307]]\nC: [[1.47, 1.204, 0.738, 1.161, 1.149, 1.298]]\nD: [[1.013, 1.023, 0.774, 0.913, 1.329, 1.578]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the desk in the scene. The camera pose information includes: the rotation matrix: [[0.977181, 0.077241, -0.197866], [0.211774, -0.426158, 0.879512], [-0.016388, -0.901345, -0.432791]]; the translation vector: [0.977323, 0.877303, 1.40232], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.441, 1.064, 0.78, 0.6, 1.194, 1.759]]\nB: [[0.806, 1.175, 0.723, 1.145, 0.857, 1.307]]\nC: [[1.47, 1.204, 0.738, 1.161, 1.149, 1.298]]\nD: [[1.013, 1.023, 0.774, 0.913, 1.329, 1.578]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_8_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_8_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.32, 0.548, 0.015, 3.907, 5.219, 0.432]]\nB: [[-0.869, 0.661, -0.307, 3.672, 4.614, -0.069]]\nC: [[-0.885, 0.436, 0.066, 3.44, 4.871, 0.305]]\nD: [[-1.228, 0.813, -0.025, 3.355, 4.94, 0.54]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. The camera pose information includes: the rotation matrix: [[-0.30056, -0.511506, 0.805], [-0.953151, 0.130866, -0.272721], [0.034151, -0.849256, -0.526876]]; the translation vector: [-0.281614, 2.924112, 1.306122], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.32, 0.548, 0.015, 3.907, 5.219, 0.432]]\nB: [[-0.869, 0.661, -0.307, 3.672, 4.614, -0.069]]\nC: [[-0.885, 0.436, 0.066, 3.44, 4.871, 0.305]]\nD: [[-1.228, 0.813, -0.025, 3.355, 4.94, 0.54]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_9_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_9_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.236, 0.438, 2.571, 1.376, 1.585, 0.296]]\nB: [[0.105, 0.071, 2.209, 1.375, 1.269, -0.259]]\nC: [[-0.082, 0.088, 2.553, 1.257, 1.665, 0.102]]\nD: [[-0.566, -0.056, 2.979, 1.134, 1.904, -0.21]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the ceiling in the scene. The camera pose information includes: the rotation matrix: [[-0.255196, -0.436856, 0.862573], [-0.966393, 0.143834, -0.213066], [-0.030988, -0.887958, -0.45888]]; the translation vector: [1.734999, 0.744851, 1.432124], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.236, 0.438, 2.571, 1.376, 1.585, 0.296]]\nB: [[0.105, 0.071, 2.209, 1.375, 1.269, -0.259]]\nC: [[-0.082, 0.088, 2.553, 1.257, 1.665, 0.102]]\nD: [[-0.566, -0.056, 2.979, 1.134, 1.904, -0.21]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_10_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_10_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.043, 0.444, 0.066, 3.645, 4.94, 0.241]]\nB: [[0.071, 0.025, -0.197, 3.897, 4.771, 0.696]]\nC: [[-0.523, 0.143, -0.403, 3.288, 4.973, 0.134]]\nD: [[0.378, 0.809, 0.444, 3.764, 4.725, 0.036]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. The camera pose information includes: the rotation matrix: [[-0.436119, -0.427186, 0.79203], [-0.89981, 0.218659, -0.377532], [-0.011909, -0.877326, -0.479747]]; the translation vector: [1.992302, 3.72193, 1.553249], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.043, 0.444, 0.066, 3.645, 4.94, 0.241]]\nB: [[0.071, 0.025, -0.197, 3.897, 4.771, 0.696]]\nC: [[-0.523, 0.143, -0.403, 3.288, 4.973, 0.134]]\nD: [[0.378, 0.809, 0.444, 3.764, 4.725, 0.036]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_11_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_11_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.786, -1.469, 1.112, 0.86, 0.176, 1.647]]\nB: [[-0.914, -1.825, 1.495, 0.394, 0.281, 2.141]]\nC: [[-1.155, -1.563, 0.935, 0.81, 0.246, 1.235]]\nD: [[-0.398, -1.079, 1.275, 0.659, -0.007, 1.932]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the curtain in the scene. The camera pose information includes: the rotation matrix: [[-0.112591, -0.547395, 0.829266], [-0.992672, 0.098819, -0.069547], [-0.043877, -0.83102, -0.55451]]; the translation vector: [1.18498, 1.814175, 1.496605], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.786, -1.469, 1.112, 0.86, 0.176, 1.647]]\nB: [[-0.914, -1.825, 1.495, 0.394, 0.281, 2.141]]\nC: [[-1.155, -1.563, 0.935, 0.81, 0.246, 1.235]]\nD: [[-0.398, -1.079, 1.275, 0.659, -0.007, 1.932]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_12_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_12_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.187, -2.136, 1.49, 0.407, 0.4, 0.612], [0.6, -1.205, 1.939, 0.176, 0.133, -0.205]]\nB: [[0.434, -1.704, 1.717, 0.327, 0.549, 0.278], [0.752, -1.616, 1.803, 0.403, 0.362, 0.211]]\nC: [[0.158, -1.92, 1.36, -0.055, 0.096, 0.484], [0.44, -1.879, 1.563, 0.594, 0.374, 0.673]]\nD: [[-0.017, -1.973, 1.957, -0.127, 0.324, 0.483], [0.88, -1.365, 2.154, 0.664, 0.083, -0.049]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the box in the scene. The camera pose information includes: the rotation matrix: [[0.645842, -0.099101, 0.757012], [-0.761541, -0.013148, 0.647984], [-0.054263, -0.994991, -0.083961]]; the translation vector: [3.729951, 1.432448, 1.733539], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.187, -2.136, 1.49, 0.407, 0.4, 0.612], [0.6, -1.205, 1.939, 0.176, 0.133, -0.205]]\nB: [[0.434, -1.704, 1.717, 0.327, 0.549, 0.278], [0.752, -1.616, 1.803, 0.403, 0.362, 0.211]]\nC: [[0.158, -1.92, 1.36, -0.055, 0.096, 0.484], [0.44, -1.879, 1.563, 0.594, 0.374, 0.673]]\nD: [[-0.017, -1.973, 1.957, -0.127, 0.324, 0.483], [0.88, -1.365, 2.154, 0.664, 0.083, -0.049]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_13_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_13_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.798, -1.611, 1.132, 0.285, 0.531, 1.165]]\nB: [[1.687, -1.332, 1.2, 0.199, 0.988, 0.799]]\nC: [[1.357, -0.901, 1.518, -0.194, 0.606, 1.017]]\nD: [[1.876, -1.168, 0.737, 0.277, 1.058, 1.074]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the window in the scene. The camera pose information includes: the rotation matrix: [[0.14018, 0.443083, -0.885453], [0.989985, -0.07783, 0.117782], [-0.016727, -0.893096, -0.449556]]; the translation vector: [3.549726, 0.935059, 1.485921], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.798, -1.611, 1.132, 0.285, 0.531, 1.165]]\nB: [[1.687, -1.332, 1.2, 0.199, 0.988, 0.799]]\nC: [[1.357, -0.901, 1.518, -0.194, 0.606, 1.017]]\nD: [[1.876, -1.168, 0.737, 0.277, 1.058, 1.074]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_14_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_14_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.819, -0.006, 0.434, 0.452, 1.821, 0.691], [-2.563, 0.098, 0.464, 0.939, 2.679, 0.721]]\nB: [[-1.198, -0.018, -0.225, 0.953, 2.14, 0.57], [-3.038, 0.583, 0.16, 0.212, 2.66, 1.39]]\nC: [[-1.115, -0.366, 0.124, 1.037, 1.922, 0.126], [-3.074, 0.098, -0.014, 0.214, 2.71, 0.451]]\nD: [[-0.889, -0.312, 0.236, 0.943, 2.266, 0.443], [-3.042, 0.305, 0.458, 0.511, 3.034, 0.927]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the table in the scene. The camera pose information includes: the rotation matrix: [[0.988959, -0.006087, -0.148062], [0.148117, 0.009943, 0.98892], [-0.004548, -0.999932, 0.010735]]; the translation vector: [3.911582, 2.672538, 1.565046], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.819, -0.006, 0.434, 0.452, 1.821, 0.691], [-2.563, 0.098, 0.464, 0.939, 2.679, 0.721]]\nB: [[-1.198, -0.018, -0.225, 0.953, 2.14, 0.57], [-3.038, 0.583, 0.16, 0.212, 2.66, 1.39]]\nC: [[-1.115, -0.366, 0.124, 1.037, 1.922, 0.126], [-3.074, 0.098, -0.014, 0.214, 2.71, 0.451]]\nD: [[-0.889, -0.312, 0.236, 0.943, 2.266, 0.443], [-3.042, 0.305, 0.458, 0.511, 3.034, 0.927]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_15_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_15_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.003, 1.71, 2.094, 0.541, 0.263, 0.118], [1.203, 1.68, 0.477, 1.133, 0.414, 1.172]]\nB: [[1.233, 1.735, 1.724, 0.591, 0.58, 0.487], [0.701, 1.63, 0.882, 0.862, 0.077, 0.96]]\nC: [[0.897, 1.469, 2.006, 0.732, 0.251, 0.23], [0.896, 1.321, 0.509, 1.078, 0.563, 0.956]]\nD: [[1.067, 1.911, 1.889, 0.58, 0.392, -0.26], [1.382, 1.503, 0.381, 1.083, 0.31, 0.478]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shelf in the scene. The camera pose information includes: the rotation matrix: [[-0.767458, -0.265442, 0.583565], [-0.640543, 0.35536, -0.680752], [-0.026676, -0.896248, -0.442751]]; the translation vector: [3.343537, 3.697402, 1.375352], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.003, 1.71, 2.094, 0.541, 0.263, 0.118], [1.203, 1.68, 0.477, 1.133, 0.414, 1.172]]\nB: [[1.233, 1.735, 1.724, 0.591, 0.58, 0.487], [0.701, 1.63, 0.882, 0.862, 0.077, 0.96]]\nC: [[0.897, 1.469, 2.006, 0.732, 0.251, 0.23], [0.896, 1.321, 0.509, 1.078, 0.563, 0.956]]\nD: [[1.067, 1.911, 1.889, 0.58, 0.392, -0.26], [1.382, 1.503, 0.381, 1.083, 0.31, 0.478]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_16_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_16_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.138, 0.124, 0.676, -0.24, 4.388, 1.6], [-0.991, 0.479, 1.092, 0.412, 4.521, 1.772], [0.357, 1.729, 0.48, 2.481, 0.664, 1.17]]\nB: [[1.22, 0.284, 0.605, 0.161, 4.409, 2.133], [-1.442, 0.481, 1.313, -0.024, 4.063, 1.777], [-0.203, 2.498, 0.224, 1.968, 0.299, 1.17]]\nC: [[0.938, 0.113, 0.929, -0.179, 3.724, 1.454], [-0.685, 0.25, 0.853, -0.017, 3.941, 2.536], [-0.076, 2.502, 0.394, 2.11, -0.271, 0.712]]\nD: [[1.264, 0.297, 0.846, 0.212, 3.931, 1.71], [-1.088, 0.197, 1.004, 0.293, 4.063, 2.062], [0.059, 2.138, 0.489, 2.4, 0.176, 0.937]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.311411, -0.45253, 0.835607], [-0.948656, 0.199362, -0.245576], [-0.055457, -0.869179, -0.491379]]; the translation vector: [2.299133, 2.388773, 1.459468], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.138, 0.124, 0.676, -0.24, 4.388, 1.6], [-0.991, 0.479, 1.092, 0.412, 4.521, 1.772], [0.357, 1.729, 0.48, 2.481, 0.664, 1.17]]\nB: [[1.22, 0.284, 0.605, 0.161, 4.409, 2.133], [-1.442, 0.481, 1.313, -0.024, 4.063, 1.777], [-0.203, 2.498, 0.224, 1.968, 0.299, 1.17]]\nC: [[0.938, 0.113, 0.929, -0.179, 3.724, 1.454], [-0.685, 0.25, 0.853, -0.017, 3.941, 2.536], [-0.076, 2.502, 0.394, 2.11, -0.271, 0.712]]\nD: [[1.264, 0.297, 0.846, 0.212, 3.931, 1.71], [-1.088, 0.197, 1.004, 0.293, 4.063, 2.062], [0.059, 2.138, 0.489, 2.4, 0.176, 0.937]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_17_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_17_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.286, 0.023, 0.43, 0.149, 0.955, 0.837], [-1.223, 1.506, 0.654, 0.172, 1.002, 1.099]]\nB: [[-1.716, 0.263, -0.049, 0.591, 1.354, 0.711], [-1.546, 1.706, 0.156, 0.078, 0.869, 0.807]]\nC: [[-0.94, -0.207, 0.36, 0.417, 1.226, 0.947], [-1.061, 1.683, 0.653, 0.491, 0.617, 1.297]]\nD: [[-1.505, -0.176, 0.438, -0.283, 0.675, 1.3], [-1.566, 1.679, 1.013, -0.274, 0.726, 0.998]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the doorframe in the scene. 
The camera pose information includes: the rotation matrix: [[-0.305635, -0.390507, 0.868385], [-0.952144, 0.122302, -0.280116], [0.003183, -0.91244, -0.409198]]; the translation vector: [4.266061, 1.773856, 1.285079], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.286, 0.023, 0.43, 0.149, 0.955, 0.837], [-1.223, 1.506, 0.654, 0.172, 1.002, 1.099]]\nB: [[-1.716, 0.263, -0.049, 0.591, 1.354, 0.711], [-1.546, 1.706, 0.156, 0.078, 0.869, 0.807]]\nC: [[-0.94, -0.207, 0.36, 0.417, 1.226, 0.947], [-1.061, 1.683, 0.653, 0.491, 0.617, 1.297]]\nD: [[-1.505, -0.176, 0.438, -0.283, 0.675, 1.3], [-1.566, 1.679, 1.013, -0.274, 0.726, 0.998]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_18_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_18_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.301, -0.248, -0.183, 3.399, 5.209, 0.282]]\nB: [[0.089, -0.015, -0.009, 3.337, 5.518, 0.258]]\nC: [[0.396, -0.159, 0.362, 3.257, 5.951, -0.2]]\nD: [[0.093, 0.115, -0.474, 3.284, 5.168, 0.122]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. 
The camera pose information includes: the rotation matrix: [[-0.934582, -0.143102, 0.325696], [-0.355737, 0.383069, -0.852473], [-0.002774, -0.912568, -0.408916]]; the translation vector: [2.694367, 2.483235, 1.465763], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.301, -0.248, -0.183, 3.399, 5.209, 0.282]]\nB: [[0.089, -0.015, -0.009, 3.337, 5.518, 0.258]]\nC: [[0.396, -0.159, 0.362, 3.257, 5.951, -0.2]]\nD: [[0.093, 0.115, -0.474, 3.284, 5.168, 0.122]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_19_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_19_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.264, 0.68, 0.463, 0.725, 0.55, 0.967], [1.225, 0.142, 2.156, 0.68, 2.62, 0.669]]\nB: [[0.179, 0.182, 0.661, 0.23, 1.022, 0.617], [1.51, -0.031, 1.909, 0.955, 2.219, 0.647]]\nC: [[0.069, 0.653, 0.766, 0.622, 0.263, 0.911], [0.908, 0.073, 1.898, 0.615, 2.442, 0.579]]\nD: [[-0.094, 1.133, 0.833, 0.562, 0.551, 0.493], [1.073, 0.074, 2.645, 0.407, 2.814, 0.994]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the cabinet in the scene. 
The camera pose information includes: the rotation matrix: [[-0.928375, -0.17783, 0.326339], [-0.371449, 0.415395, -0.830345], [0.012101, -0.892089, -0.451697]]; the translation vector: [2.096006, 1.919092, 1.36174], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.264, 0.68, 0.463, 0.725, 0.55, 0.967], [1.225, 0.142, 2.156, 0.68, 2.62, 0.669]]\nB: [[0.179, 0.182, 0.661, 0.23, 1.022, 0.617], [1.51, -0.031, 1.909, 0.955, 2.219, 0.647]]\nC: [[0.069, 0.653, 0.766, 0.622, 0.263, 0.911], [0.908, 0.073, 1.898, 0.615, 2.442, 0.579]]\nD: [[-0.094, 1.133, 0.833, 0.562, 0.551, 0.493], [1.073, 0.074, 2.645, 0.407, 2.814, 0.994]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_20_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_20_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.718, -0.44, 1.96, 0.228, 0.897, 0.293], [-1.706, -1.293, 1.868, 0.22, 0.846, 0.362], [-1.707, -1.314, 0.762, 0.375, 0.826, 0.302], [-1.691, 1.543, 1.626, 0.337, 0.697, 0.437], [-1.573, 1.406, 1.291, 0.181, 0.564, 0.313]]\nB: [[-1.988, -0.706, 1.585, 0.615, 0.861, 0.547], [-1.403, -1.309, 1.785, -0.129, 1.049, -0.092], [-1.749, -1.25, 0.92, 0.869, 1.088, 0.428], [-1.92, 1.941, 1.694, 0.669, 1.107, 0.403], [-1.483, 1.056, 1.615, 0.417, 0.387, 0.739]]\nC: [[-1.546, -0.182, 1.499, 0.527, 1.029, 0.605], [-1.401, -1.244, 2.308, 
-0.222, 0.467, 0.206], [-2.193, -1.361, 0.929, 0.133, 0.525, 0.091], [-1.321, 1.865, 1.781, -0.053, 0.666, 0.358], [-1.503, 1.488, 1.689, 0.165, 0.203, 0.17]]\nD: [[-1.94, -0.345, 2.215, 0.028, 0.642, 0.092], [-2.035, -1.654, 1.937, 0.612, 1.134, -0.102], [-1.249, -1.385, 0.367, 0.613, 1.003, 0.682], [-2.01, 1.507, 1.513, 0.614, 0.573, 0.003], [-1.183, 1.016, 0.985, -0.012, 0.86, 0.417]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the books in the scene. The camera pose information includes: the rotation matrix: [[0.725417, 0.297171, -0.620854], [0.687848, -0.279954, 0.669695], [0.025203, -0.912861, -0.407492]]; the translation vector: [3.434752, 3.057745, 1.556519], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.718, -0.44, 1.96, 0.228, 0.897, 0.293], [-1.706, -1.293, 1.868, 0.22, 0.846, 0.362], [-1.707, -1.314, 0.762, 0.375, 0.826, 0.302], [-1.691, 1.543, 1.626, 0.337, 0.697, 0.437], [-1.573, 1.406, 1.291, 0.181, 0.564, 0.313]]\nB: [[-1.988, -0.706, 1.585, 0.615, 0.861, 0.547], [-1.403, -1.309, 1.785, -0.129, 1.049, -0.092], [-1.749, -1.25, 0.92, 0.869, 1.088, 0.428], [-1.92, 1.941, 1.694, 0.669, 1.107, 0.403], [-1.483, 1.056, 1.615, 0.417, 0.387, 0.739]]\nC: [[-1.546, -0.182, 1.499, 0.527, 1.029, 0.605], [-1.401, -1.244, 2.308, -0.222, 0.467, 0.206], [-2.193, -1.361, 0.929, 0.133, 0.525, 0.091], [-1.321, 1.865, 1.781, -0.053, 0.666, 0.358], [-1.503, 1.488, 1.689, 0.165, 0.203, 0.17]]\nD: [[-1.94, -0.345, 2.215, 0.028, 0.642, 0.092], [-2.035, -1.654, 1.937, 0.612, 1.134, -0.102], [-1.249, -1.385, 0.367, 0.613, 1.003, 0.682], [-2.01, 1.507, 1.513, 0.614, 0.573, 0.003], [-1.183, 1.016, 0.985, -0.012, 0.86, 0.417]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_21_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_21_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.266, 0.688, 1.427, 0.645, 4.144, 1.891], [1.793, 0.377, 0.356, 0.744, 3.81, 1.61], [-0.941, -1.334, 0.579, 2.335, 0.039, 1.673], [0.362, -1.428, 0.335, 0.484, 0.146, 1.101], [0.403, -1.616, 0.92, 1.722, 0.842, 1.057], [1.194, 3.045, 0.588, 0.013, 0.929, 1.121], [1.761, 2.38, 0.509, 0.767, -0.159, 0.975]]\nB: [[-2.307, 0.099, 1.206, 0.234, 3.804, 1.593], [2.401, 0.098, 0.732, 0.485, 3.642, 1.662], [-1.38, -1.427, 1.26, 2.485, 0.117, 1.419], [0.047, -1.412, 0.542, 0.454, 0.56, 1.109], [0.451, -2.029, 0.885, 1.313, 0.841, 0.961], [1.821, 2.284, 0.965, -0.181, 0.842, 1.118], [1.675, 2.553, 0.956, 0.491, 0.104, 1.591]]\nC: [[-1.548, 0.69, 1.589, 0.293, 3.816, 1.624], 
[2.411, 0.459, 0.334, 0.543, 4.513, 1.336], [-0.832, -2.051, 0.999, 1.925, 0.593, 1.075], [-0.217, -1.618, 0.815, -0.295, 0.494, 0.985], [1.068, -1.834, 1.273, 1.646, 0.657, 0.959], [1.898, 2.458, 0.543, 0.44, 1.518, 1.452], [2.071, 1.877, 0.293, 1.12, 0.358, 1.716]]\nD: [[-1.964, 0.397, 1.135, 0.305, 4.04, 1.813], [2.143, 0.114, 0.673, 0.413, 4.08, 1.439], [-0.926, -1.676, 0.892, 2.284, 0.231, 1.529], [0.195, -1.875, 0.811, 0.153, 0.424, 1.364], [0.78, -1.998, 0.788, 1.226, 0.36, 1.309], [1.439, 2.685, 0.692, 0.249, 1.216, 1.435], [1.802, 2.098, 0.616, 0.629, 0.105, 1.315]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.205292, 0.226186, -0.952205], [0.97316, -0.150555, 0.174048], [-0.103992, -0.962379, -0.251024]]; the translation vector: [4.876985, 2.837537, 1.671042], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.266, 0.688, 1.427, 0.645, 4.144, 1.891], [1.793, 0.377, 0.356, 0.744, 3.81, 1.61], [-0.941, -1.334, 0.579, 2.335, 0.039, 1.673], [0.362, -1.428, 0.335, 0.484, 0.146, 1.101], [0.403, -1.616, 0.92, 1.722, 0.842, 1.057], [1.194, 3.045, 0.588, 0.013, 0.929, 1.121], [1.761, 2.38, 0.509, 0.767, -0.159, 0.975]]\nB: [[-2.307, 0.099, 1.206, 0.234, 3.804, 1.593], [2.401, 0.098, 0.732, 0.485, 3.642, 1.662], [-1.38, -1.427, 1.26, 2.485, 0.117, 1.419], [0.047, -1.412, 0.542, 0.454, 0.56, 1.109], [0.451, -2.029, 0.885, 1.313, 0.841, 0.961], [1.821, 2.284, 0.965, -0.181, 0.842, 1.118], [1.675, 2.553, 0.956, 0.491, 0.104, 1.591]]\nC: [[-1.548, 0.69, 1.589, 0.293, 3.816, 1.624], [2.411, 0.459, 0.334, 0.543, 4.513, 1.336], [-0.832, -2.051, 0.999, 1.925, 0.593, 1.075], [-0.217, -1.618, 0.815, -0.295, 0.494, 0.985], [1.068, -1.834, 1.273, 1.646, 0.657, 0.959], [1.898, 2.458, 0.543, 0.44, 1.518, 1.452], [2.071, 1.877, 0.293, 1.12, 0.358, 1.716]]\nD: [[-1.964, 0.397, 1.135, 0.305, 4.04, 1.813], [2.143, 0.114, 0.673, 0.413, 4.08, 1.439], [-0.926, -1.676, 0.892, 2.284, 0.231, 1.529], [0.195, -1.875, 0.811, 0.153, 0.424, 1.364], [0.78, -1.998, 0.788, 1.226, 0.36, 1.309], [1.439, 2.685, 0.692, 0.249, 1.216, 1.435], [1.802, 2.098, 0.616, 0.629, 0.105, 1.315]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_22_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_22_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.121, -0.501, -0.221, 0.439, 0.387, 0.089], [1.109, -1.077, -0.06, 0.221, 0.081, 0.556]]\nB: [[-0.019, -0.497, 0.389, 1.016, 0.944, 0.39], [1.279, -1.542, -0.239, 0.195, 0.216, 0.872]]\nC: [[0.21, -0.049, -0.032, 0.947, 0.552, 0.204], [0.617, -0.987, 0.392, 0.284, 0.553, 0.828]]\nD: [[0.392, -0.219, 0.176, 0.595, 0.63, 0.481], [0.882, 
-1.099, 0.197, 0.524, 0.524, 0.466]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the ottoman in the scene. The camera pose information includes: the rotation matrix: [[0.133825, -0.39571, 0.908573], [-0.990975, -0.046263, 0.125813], [-0.007752, -0.91721, -0.398329]]; the translation vector: [4.990516, 4.227292, 1.32289], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.121, -0.501, -0.221, 0.439, 0.387, 0.089], [1.109, -1.077, -0.06, 0.221, 0.081, 0.556]]\nB: [[-0.019, -0.497, 0.389, 1.016, 0.944, 0.39], [1.279, -1.542, -0.239, 0.195, 0.216, 0.872]]\nC: [[0.21, -0.049, -0.032, 0.947, 0.552, 0.204], [0.617, -0.987, 0.392, 0.284, 0.553, 0.828]]\nD: [[0.392, -0.219, 0.176, 0.595, 0.63, 0.481], [0.882, -1.099, 0.197, 0.524, 0.524, 0.466]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_23_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_23_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.923, 3.072, 1.641, 0.406, 0.224, 0.224], [0.852, 2.684, 1.628, 0.411, 0.369, 0.342], [0.952, 2.353, 1.591, 0.332, 0.315, 0.303], [0.919, 1.934, 1.549, 0.278, 0.356, 0.3], [0.991, 1.596, 1.521, 0.302, 0.285, 0.248], [1.083, 1.197, 1.51, 0.2, 0.428, 0.292], [1.067, 0.874, 1.479, 0.258, 0.387, 0.349], [1.029, 0.682, 1.414, 0.27, 0.238, 0.229], [1.041, 0.446, 
1.386, 0.31, 0.355, 0.267], [1.007, 0.119, 1.367, 0.313, 0.297, 0.251], [1.072, -0.152, 1.331, 0.368, 0.301, 0.196], [0.978, -0.542, 1.366, 0.293, 0.411, 0.344], [1.038, -0.846, 1.349, 0.398, 0.352, 0.371], [0.995, -1.285, 1.277, 0.273, 0.319, 0.287], [1.051, -1.623, 1.317, 0.372, 0.433, 0.346], [1.016, -1.909, 1.267, 0.375, 0.379, 0.355], [1.01, -2.206, 1.239, 0.32, 0.305, 0.33], [1.021, -2.389, 1.248, 0.292, 0.375, 0.256], [0.945, -2.669, 1.168, 0.312, 0.307, 0.249], [0.986, -2.904, 1.157, 0.265, 0.331, 0.203]]\nB: [[1.16, 2.801, 1.566, 0.581, 0.486, -0.268], [0.756, 2.458, 1.347, -0.04, 0.522, 0.189], [0.535, 2.032, 1.866, 0.051, 0.318, 0.012], [0.896, 2.321, 1.823, -0.143, 0.711, 0.696], [1.3, 1.485, 1.216, 0.089, 0.474, 0.726], [1.333, 1.63, 1.281, 0.587, 0.639, -0.131], [1.034, 0.752, 1.496, 0.694, 0.45, 0.002], [1.132, 0.488, 1.903, -0.121, -0.068, 0.586], [1.244, 0.056, 1.06, 0.343, 0.366, 0.492], [0.523, 0.369, 1.091, 0.036, 0.297, 0.341], [0.945, -0.379, 1.231, -0.009, 0.698, 0.282], [0.742, -0.538, 1.804, 0.143, 0.887, 0.377], [1.245, -0.568, 1.71, 0.143, 0.603, 0.41], [1.356, -0.879, 1.397, 0.576, 0.048, 0.554], [1.47, -2.036, 1.112, 0.54, 0.795, 0.096], [1.472, -1.52, 0.829, 0.648, 0.598, 0.49], [0.775, -2.633, 1.506, -0.16, -0.139, -0.099], [0.838, -2.702, 1.211, 0.137, 0.331, -0.011], [1.261, -2.818, 1.474, 0.679, -0.005, 0.352], [0.793, -2.949, 1.566, -0.008, 0.477, 0.693]]\nC: [[1.056, 2.871, 1.196, 0.82, -0.168, 0.476], [1.086, 3.168, 1.177, -0.05, 0.768, 0.624], [1.078, 2.314, 1.991, 0.481, -0.014, 0.382], [0.899, 1.855, 1.409, -0.073, 0.065, 0.078], [0.796, 1.846, 1.026, -0.008, 0.461, 0.294], [0.96, 0.751, 1.316, 0.52, 0.805, 0.752], [1.18, 1.031, 1.766, 0.673, 0.119, 0.034], [1.398, 0.505, 1.118, -0.168, 0.16, -0.249], [0.838, 0.65, 1.392, 0.173, 0.458, 0.332], [1.111, -0.328, 1.396, 0.558, 0.481, 0.366], [0.597, -0.355, 1.146, 0.623, 0.368, 0.632], [0.691, -0.514, 1.338, -0.157, 0.304, -0.124], [0.696, -1.125, 1.476, 0.501, 0.757, 0.356], 
[0.907, -0.859, 1.385, 0.656, 0.571, -0.029], [1.035, -1.127, 1.219, 0.093, 0.841, 0.704], [0.635, -1.763, 1.501, -0.076, -0.097, 0.162], [0.614, -1.848, 1.062, 0.328, 0.483, 0.674], [0.692, -2.453, 1.556, 0.665, 0.718, 0.625], [1.074, -2.937, 1.026, 0.776, 0.224, 0.639], [0.852, -3.222, 1.01, 0.571, -0.139, 0.12]]\nD: [[1.022, 3.318, 1.189, 0.205, -0.146, 0.042], [0.815, 2.622, 1.239, 0.213, 0.653, 0.265], [1.051, 2.623, 1.858, 0.743, -0.174, 0.425], [1.36, 1.47, 1.216, -0.071, -0.098, -0.074], [1.312, 2.017, 2.002, -0.015, 0.439, 0.124], [0.798, 1.663, 1.184, 0.218, 0.773, 0.512], [1.438, 0.663, 1.321, 0.334, 0.497, 0.799], [1.496, 1.067, 1.009, 0.492, 0.69, -0.197], [0.673, 0.916, 1.137, 0.692, -0.115, 0.537], [0.588, 0.319, 1.507, 0.723, 0.486, 0.106], [0.938, -0.596, 1.384, 0.378, 0.487, -0.284], [0.718, -0.867, 0.941, 0.405, 0.388, -0.074], [1.365, -0.417, 1.613, 0.897, 0.508, -0.003], [1.124, -1.228, 1.16, 0.374, 0.651, 0.692], [0.872, -1.666, 1.25, 0.857, 0.612, -0.1], [0.693, -1.777, 1.038, 0.754, 0.733, 0.072], [1.133, -1.714, 1.626, 0.475, -0.192, 0.478], [1.392, -2.804, 1.671, -0.124, 0.18, 0.524], [1.024, -2.671, 1.235, 0.602, 0.29, 0.162], [0.636, -2.621, 1.52, -0.11, 0.64, 0.18]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the book in the scene. The camera pose information includes: the rotation matrix: [[0.999403, 0.004498, 0.03425], [-0.034232, -0.004158, 0.999405], [0.004638, -0.999981, -0.004001]]; the translation vector: [2.393484, 5.775056, 1.371464], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.923, 3.072, 1.641, 0.406, 0.224, 0.224], [0.852, 2.684, 1.628, 0.411, 0.369, 0.342], [0.952, 2.353, 1.591, 0.332, 0.315, 0.303], [0.919, 1.934, 1.549, 0.278, 0.356, 0.3], [0.991, 1.596, 1.521, 0.302, 0.285, 0.248], [1.083, 1.197, 1.51, 0.2, 0.428, 0.292], [1.067, 0.874, 1.479, 0.258, 0.387, 0.349], [1.029, 0.682, 1.414, 0.27, 0.238, 0.229], [1.041, 0.446, 1.386, 0.31, 0.355, 0.267], [1.007, 0.119, 1.367, 0.313, 0.297, 0.251], [1.072, -0.152, 1.331, 0.368, 0.301, 0.196], [0.978, -0.542, 1.366, 0.293, 0.411, 0.344], [1.038, -0.846, 1.349, 0.398, 0.352, 0.371], [0.995, -1.285, 1.277, 0.273, 0.319, 0.287], [1.051, -1.623, 1.317, 0.372, 0.433, 0.346], [1.016, -1.909, 1.267, 0.375, 0.379, 0.355], [1.01, -2.206, 1.239, 0.32, 0.305, 0.33], [1.021, -2.389, 1.248, 0.292, 0.375, 0.256], [0.945, -2.669, 1.168, 0.312, 0.307, 0.249], [0.986, -2.904, 1.157, 0.265, 0.331, 0.203]]\nB: [[1.16, 2.801, 1.566, 0.581, 0.486, -0.268], [0.756, 2.458, 1.347, -0.04, 0.522, 0.189], [0.535, 2.032, 1.866, 0.051, 0.318, 0.012], [0.896, 2.321, 1.823, -0.143, 0.711, 0.696], [1.3, 1.485, 1.216, 0.089, 0.474, 0.726], [1.333, 1.63, 1.281, 0.587, 0.639, -0.131], [1.034, 0.752, 1.496, 0.694, 0.45, 0.002], [1.132, 0.488, 1.903, -0.121, -0.068, 0.586], [1.244, 0.056, 1.06, 0.343, 0.366, 0.492], [0.523, 0.369, 1.091, 0.036, 0.297, 0.341], [0.945, -0.379, 1.231, -0.009, 0.698, 0.282], [0.742, -0.538, 1.804, 0.143, 0.887, 0.377], [1.245, -0.568, 1.71, 0.143, 0.603, 0.41], [1.356, -0.879, 1.397, 0.576, 0.048, 0.554], [1.47, -2.036, 1.112, 0.54, 0.795, 0.096], [1.472, -1.52, 0.829, 0.648, 0.598, 0.49], [0.775, -2.633, 1.506, -0.16, -0.139, -0.099], [0.838, -2.702, 1.211, 0.137, 
0.331, -0.011], [1.261, -2.818, 1.474, 0.679, -0.005, 0.352], [0.793, -2.949, 1.566, -0.008, 0.477, 0.693]]\nC: [[1.056, 2.871, 1.196, 0.82, -0.168, 0.476], [1.086, 3.168, 1.177, -0.05, 0.768, 0.624], [1.078, 2.314, 1.991, 0.481, -0.014, 0.382], [0.899, 1.855, 1.409, -0.073, 0.065, 0.078], [0.796, 1.846, 1.026, -0.008, 0.461, 0.294], [0.96, 0.751, 1.316, 0.52, 0.805, 0.752], [1.18, 1.031, 1.766, 0.673, 0.119, 0.034], [1.398, 0.505, 1.118, -0.168, 0.16, -0.249], [0.838, 0.65, 1.392, 0.173, 0.458, 0.332], [1.111, -0.328, 1.396, 0.558, 0.481, 0.366], [0.597, -0.355, 1.146, 0.623, 0.368, 0.632], [0.691, -0.514, 1.338, -0.157, 0.304, -0.124], [0.696, -1.125, 1.476, 0.501, 0.757, 0.356], [0.907, -0.859, 1.385, 0.656, 0.571, -0.029], [1.035, -1.127, 1.219, 0.093, 0.841, 0.704], [0.635, -1.763, 1.501, -0.076, -0.097, 0.162], [0.614, -1.848, 1.062, 0.328, 0.483, 0.674], [0.692, -2.453, 1.556, 0.665, 0.718, 0.625], [1.074, -2.937, 1.026, 0.776, 0.224, 0.639], [0.852, -3.222, 1.01, 0.571, -0.139, 0.12]]\nD: [[1.022, 3.318, 1.189, 0.205, -0.146, 0.042], [0.815, 2.622, 1.239, 0.213, 0.653, 0.265], [1.051, 2.623, 1.858, 0.743, -0.174, 0.425], [1.36, 1.47, 1.216, -0.071, -0.098, -0.074], [1.312, 2.017, 2.002, -0.015, 0.439, 0.124], [0.798, 1.663, 1.184, 0.218, 0.773, 0.512], [1.438, 0.663, 1.321, 0.334, 0.497, 0.799], [1.496, 1.067, 1.009, 0.492, 0.69, -0.197], [0.673, 0.916, 1.137, 0.692, -0.115, 0.537], [0.588, 0.319, 1.507, 0.723, 0.486, 0.106], [0.938, -0.596, 1.384, 0.378, 0.487, -0.284], [0.718, -0.867, 0.941, 0.405, 0.388, -0.074], [1.365, -0.417, 1.613, 0.897, 0.508, -0.003], [1.124, -1.228, 1.16, 0.374, 0.651, 0.692], [0.872, -1.666, 1.25, 0.857, 0.612, -0.1], [0.693, -1.777, 1.038, 0.754, 0.733, 0.072], [1.133, -1.714, 1.626, 0.475, -0.192, 0.478], [1.392, -2.804, 1.671, -0.124, 0.18, 0.524], [1.024, -2.671, 1.235, 0.602, 0.29, 0.162], [0.636, -2.621, 1.52, -0.11, 0.64, 0.18]]", + "input_image_path": [ + 
"../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_24_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_24_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.127, 1.263, 0.842, 1.099, 0.165, 0.151], [0.899, 0.349, 0.833, 0.078, 0.633, 0.087]]\nB: [[-0.285, 1.099, 0.515, 1.523, 0.256, -0.319], [0.56, 0.838, 0.875, -0.327, 0.985, 0.228]]\nC: [[0.446, 1.442, 1.259, 1.427, 0.331, 0.006], [0.446, 0.556, 0.643, 0.276, 0.563, -0.341]]\nD: [[-0.356, 1.631, 0.612, 0.864, 0.511, -0.226], [0.523, 0.674, 0.57, 0.567, 0.594, 0.019]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the rail in the scene. The camera pose information includes: the rotation matrix: [[0.631332, 0.312126, -0.709927], [0.775472, -0.26347, 0.573784], [-0.007951, -0.912776, -0.408382]]; the translation vector: [1.600176, 0.624978, 1.327739], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.127, 1.263, 0.842, 1.099, 0.165, 0.151], [0.899, 0.349, 0.833, 0.078, 0.633, 0.087]]\nB: [[-0.285, 1.099, 0.515, 1.523, 0.256, -0.319], [0.56, 0.838, 0.875, -0.327, 0.985, 0.228]]\nC: [[0.446, 1.442, 1.259, 1.427, 0.331, 0.006], [0.446, 0.556, 0.643, 0.276, 0.563, -0.341]]\nD: [[-0.356, 1.631, 0.612, 0.864, 0.511, -0.226], [0.523, 0.674, 0.57, 0.567, 0.594, 0.019]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_25_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_25_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.787, 2.876, 1.457, 2.063, -0.079, 1.518], [-0.953, 1.998, 0.997, 0.069, 3.579, 2.369], [0.693, -0.876, 1.265, 0.33, 4.219, 2.072], [0.345, -2.632, 0.88, 1.928, -0.067, 1.565], [-0.455, -2.068, 1.068, 0.08, 1.617, 2.382], [-0.899, -1.196, 0.623, 0.651, 0.095, 2.066]]\nB: [[0.116, 3.687, 0.955, 2.711, 0.51, 1.785], [-0.801, 1.888, 0.628, 0.573, 3.732, 2.205], [1.093, -0.187, 0.521, -0.148, 5.045, 2.538], [0.42, -2.481, 1.79, 1.307, 0.36, 0.947], [-0.514, -1.855, 0.567, 0.468, 1.76, 1.511], [-1.181, -1.155, 0.753, 0.267, -0.268, 1.847]]\nC: [[0.302, 3.207, 1.219, 2.255, 0.306, 1.414], [-0.871, 1.59, 0.966, 0.239, 3.492, 2.066], [0.732, -0.454, 0.961, 0.242, 4.576, 2.069], [-0.078, -2.664, 1.355, 1.624, 0.192, 1.303], [-0.886, -1.849, 0.913, 0.175, 1.703, 1.972], [-1.091, -1.016, 0.816, 0.518, 0.228, 1.826]]\nD: [[0.349, 2.764, 1.178, 2.112, -0.17, 1.603], [-1.032, 1.356, 0.914, 0.415, 3.877, 2.097], [0.285, -0.798, 1.205, 0.303, 4.409, 2.223], [-0.233, -2.574, 1.577, 1.241, 0.359, 1.513], [-1.059, -2.05, 1.259, 0.503, 1.807, 1.753], [-1.108, -1.393, 0.584, 0.052, -0.001, 1.469]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. 
The camera pose information includes: the rotation matrix: [[-0.386761, -0.304254, 0.870543], [-0.920043, 0.191539, -0.34181], [-0.062746, -0.933136, -0.354007]]; the translation vector: [2.082368, 4.008438, 1.845888], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.787, 2.876, 1.457, 2.063, -0.079, 1.518], [-0.953, 1.998, 0.997, 0.069, 3.579, 2.369], [0.693, -0.876, 1.265, 0.33, 4.219, 2.072], [0.345, -2.632, 0.88, 1.928, -0.067, 1.565], [-0.455, -2.068, 1.068, 0.08, 1.617, 2.382], [-0.899, -1.196, 0.623, 0.651, 0.095, 2.066]]\nB: [[0.116, 3.687, 0.955, 2.711, 0.51, 1.785], [-0.801, 1.888, 0.628, 0.573, 3.732, 2.205], [1.093, -0.187, 0.521, -0.148, 5.045, 2.538], [0.42, -2.481, 1.79, 1.307, 0.36, 0.947], [-0.514, -1.855, 0.567, 0.468, 1.76, 1.511], [-1.181, -1.155, 0.753, 0.267, -0.268, 1.847]]\nC: [[0.302, 3.207, 1.219, 2.255, 0.306, 1.414], [-0.871, 1.59, 0.966, 0.239, 3.492, 2.066], [0.732, -0.454, 0.961, 0.242, 4.576, 2.069], [-0.078, -2.664, 1.355, 1.624, 0.192, 1.303], [-0.886, -1.849, 0.913, 0.175, 1.703, 1.972], [-1.091, -1.016, 0.816, 0.518, 0.228, 1.826]]\nD: [[0.349, 2.764, 1.178, 2.112, -0.17, 1.603], [-1.032, 1.356, 0.914, 0.415, 3.877, 2.097], [0.285, -0.798, 1.205, 0.303, 4.409, 2.223], [-0.233, -2.574, 1.577, 1.241, 0.359, 1.513], [-1.059, -2.05, 1.259, 0.503, 1.807, 1.753], [-1.108, -1.393, 0.584, 0.052, -0.001, 1.469]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_26_0.jpg", + 
"../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_26_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.506, 0.209, 0.255, 0.924, 1.928, 0.478], [0.463, -1.087, 0.626, 1.179, 0.62, 0.996], [2.049, 0.799, -0.061, 0.618, 1.041, 1.191]]\nB: [[-2.054, 0.6, 0.688, 1.044, 1.508, 0.567], [0.964, -1.042, 0.495, 1.122, 0.573, 0.421], [2.453, 0.513, 0.739, 0.463, 1.578, 0.424]]\nC: [[-2.686, -0.003, 0.374, 0.562, 1.486, 0.489], [1.084, -0.733, 0.31, 1.073, 1.131, 0.967], [1.7, 0.788, 0.091, 0.433, 1.461, 1.21]]\nD: [[-2.225, 0.184, 0.565, 0.652, 1.867, 0.966], [0.89, -0.986, 0.428, 1.537, 0.782, 0.844], [2.106, 0.485, 0.423, 0.73, 1.476, 0.84]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the couch in the scene. The camera pose information includes: the rotation matrix: [[-0.565317, -0.50256, 0.654103], [-0.824719, 0.328974, -0.460017], [0.016003, -0.799506, -0.600445]]; the translation vector: [4.07549, 5.065369, 1.281872], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.506, 0.209, 0.255, 0.924, 1.928, 0.478], [0.463, -1.087, 0.626, 1.179, 0.62, 0.996], [2.049, 0.799, -0.061, 0.618, 1.041, 1.191]]\nB: [[-2.054, 0.6, 0.688, 1.044, 1.508, 0.567], [0.964, -1.042, 0.495, 1.122, 0.573, 0.421], [2.453, 0.513, 0.739, 0.463, 1.578, 0.424]]\nC: [[-2.686, -0.003, 0.374, 0.562, 1.486, 0.489], [1.084, -0.733, 0.31, 1.073, 1.131, 0.967], [1.7, 0.788, 0.091, 0.433, 1.461, 1.21]]\nD: [[-2.225, 0.184, 0.565, 0.652, 1.867, 0.966], [0.89, -0.986, 0.428, 1.537, 0.782, 0.844], [2.106, 0.485, 0.423, 0.73, 1.476, 0.84]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_27_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_27_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.787, -0.535, 0.927, 0.017, -0.194, -0.206]]\nB: [[-1.049, -0.444, 0.739, 0.127, 0.097, 0.179]]\nC: [[-1.148, -0.307, 0.649, -0.194, 0.004, 0.501]]\nD: [[-1.423, -0.784, 0.923, 0.285, 0.539, 0.33]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the water bottle in the scene. The camera pose information includes: the rotation matrix: [[0.684823, -0.326379, 0.651532], [-0.728707, -0.304485, 0.613413], [-0.001823, -0.894855, -0.446353]]; the translation vector: [2.86358, 2.414664, 1.549631], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.787, -0.535, 0.927, 0.017, -0.194, -0.206]]\nB: [[-1.049, -0.444, 0.739, 0.127, 0.097, 0.179]]\nC: [[-1.148, -0.307, 0.649, -0.194, 0.004, 0.501]]\nD: [[-1.423, -0.784, 0.923, 0.285, 0.539, 0.33]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_28_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_28_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.125, -0.371, 0.52, 0.921, 0.949, 1.032], [-0.05, 0.47, 0.51, 0.929, 1.055, 1.018]]\nB: [[-0.03, 0.021, 0.629, 1.294, 0.744, 0.853], [0.141, 0.523, 0.057, 0.461, 0.601, 1.102]]\nC: [[-0.027, -0.543, 0.255, 1.392, 0.459, 1.351], [-0.542, 0.241, 0.854, 1.099, 1.281, 1.01]]\nD: [[-0.353, -0.617, 0.621, 0.568, 1.229, 1.321], [-0.327, 0.58, 0.56, 0.835, 0.644, 0.683]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the table in the scene. The camera pose information includes: the rotation matrix: [[0.935902, 0.160482, -0.313582], [0.351212, -0.493772, 0.795512], [-0.027173, -0.854655, -0.518485]]; the translation vector: [4.465, -0.226232, 1.550028], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.125, -0.371, 0.52, 0.921, 0.949, 1.032], [-0.05, 0.47, 0.51, 0.929, 1.055, 1.018]]\nB: [[-0.03, 0.021, 0.629, 1.294, 0.744, 0.853], [0.141, 0.523, 0.057, 0.461, 0.601, 1.102]]\nC: [[-0.027, -0.543, 0.255, 1.392, 0.459, 1.351], [-0.542, 0.241, 0.854, 1.099, 1.281, 1.01]]\nD: [[-0.353, -0.617, 0.621, 0.568, 1.229, 1.321], [-0.327, 0.58, 0.56, 0.835, 0.644, 0.683]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_29_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_29_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.113, 1.152, 0.299, 1.212, 0.824, 1.479], [-0.739, -1.761, 0.838, 1.759, 0.908, 0.436]]\nB: [[-1.686, 0.962, 0.402, 1.418, 0.984, 0.915], [-0.524, -1.303, 0.377, 1.429, 0.342, 0.995]]\nC: [[-1.37, 1.148, 0.616, 1.114, 0.537, 1.159], [-0.283, -1.543, 0.412, 1.531, 0.506, 0.887]]\nD: [[-1.358, 1.603, 0.665, 1.495, 0.045, 1.488], [0.077, -1.171, 0.113, 1.245, 0.683, 1.338]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the dresser in the scene. The camera pose information includes: the rotation matrix: [[0.993306, 0.029023, -0.111812], [0.110831, -0.512349, 0.851596], [-0.032571, -0.858287, -0.512136]]; the translation vector: [2.482234, 1.391135, 1.348064], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.113, 1.152, 0.299, 1.212, 0.824, 1.479], [-0.739, -1.761, 0.838, 1.759, 0.908, 0.436]]\nB: [[-1.686, 0.962, 0.402, 1.418, 0.984, 0.915], [-0.524, -1.303, 0.377, 1.429, 0.342, 0.995]]\nC: [[-1.37, 1.148, 0.616, 1.114, 0.537, 1.159], [-0.283, -1.543, 0.412, 1.531, 0.506, 0.887]]\nD: [[-1.358, 1.603, 0.665, 1.495, 0.045, 1.488], [0.077, -1.171, 0.113, 1.245, 0.683, 1.338]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_30_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_30_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.429, 0.564, 1.259, 0.514, 4.432, 2.586], [-1.998, 0.309, 1.385, 0.292, 3.896, 2.792], [0.693, 2.704, 1.079, 1.949, 0.124, 2.2]]\nB: [[1.111, 0.098, 1.082, 0.466, 4.575, 2.917], [-1.93, 0.083, 1.425, -0.025, 4.078, 2.389], [0.372, 3.074, 1.309, 1.613, 0.349, 2.653]]\nC: [[1.746, 0.141, 1.259, 0.14, 4.199, 2.418], [-1.8, 0.062, 1.744, -0.163, 3.558, 2.447], [0.931, 3.17, 1.18, 1.489, -0.095, 2.336]]\nD: [[1.116, 0.433, 1.412, 0.515, 4.324, 2.69], [-1.509, 0.174, 1.744, -0.053, 3.532, 2.532], [0.744, 2.248, 0.965, 1.964, 0.231, 1.764]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.32152, -0.4706, 0.821681], [-0.946681, 0.178549, -0.268172], [-0.020508, -0.864092, -0.502915]]; the translation vector: [2.120097, 2.367636, 1.494245], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.429, 0.564, 1.259, 0.514, 4.432, 2.586], [-1.998, 0.309, 1.385, 0.292, 3.896, 2.792], [0.693, 2.704, 1.079, 1.949, 0.124, 2.2]]\nB: [[1.111, 0.098, 1.082, 0.466, 4.575, 2.917], [-1.93, 0.083, 1.425, -0.025, 4.078, 2.389], [0.372, 3.074, 1.309, 1.613, 0.349, 2.653]]\nC: [[1.746, 0.141, 1.259, 0.14, 4.199, 2.418], [-1.8, 0.062, 1.744, -0.163, 3.558, 2.447], [0.931, 3.17, 1.18, 1.489, -0.095, 2.336]]\nD: [[1.116, 0.433, 1.412, 0.515, 4.324, 2.69], [-1.509, 0.174, 1.744, -0.053, 3.532, 2.532], [0.744, 2.248, 0.965, 1.964, 0.231, 1.764]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_31_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_31_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.189, -0.394, 0.453, 1.615, 0.833, 0.943]]\nB: [[-0.04, -0.278, 0.23, 1.326, 1.046, 0.463]]\nC: [[-0.492, -0.1, 0.679, 1.535, 0.67, -0.014]]\nD: [[0.006, 0.067, 0.535, 1.038, 1.473, 0.446]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the coffee table in the scene. The camera pose information includes: the rotation matrix: [[-0.799511, 0.533863, -0.275266], [0.600541, 0.71925, -0.349328], [0.011492, -0.4446, -0.895656]]; the translation vector: [2.031323, 2.312379, 1.200993], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.189, -0.394, 0.453, 1.615, 0.833, 0.943]]\nB: [[-0.04, -0.278, 0.23, 1.326, 1.046, 0.463]]\nC: [[-0.492, -0.1, 0.679, 1.535, 0.67, -0.014]]\nD: [[0.006, 0.067, 0.535, 1.038, 1.473, 0.446]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_32_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_32_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.534, -3.167, 1.655, 0.929, 0.443, 2.126]]\nB: [[1.524, -3.177, 0.91, 0.507, 0.601, 2.272]]\nC: [[1.265, -3.361, 1.281, 0.587, 0.91, 2.343]]\nD: [[1.106, -3.397, 1.033, 0.365, 0.531, 2.023]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shower walls in the scene. The camera pose information includes: the rotation matrix: [[0.590232, -0.352789, 0.726062], [-0.807221, -0.252962, 0.533296], [-0.004475, -0.900861, -0.434086]]; the translation vector: [2.518124, 2.463328, 1.346668], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.534, -3.167, 1.655, 0.929, 0.443, 2.126]]\nB: [[1.524, -3.177, 0.91, 0.507, 0.601, 2.272]]\nC: [[1.265, -3.361, 1.281, 0.587, 0.91, 2.343]]\nD: [[1.106, -3.397, 1.033, 0.365, 0.531, 2.023]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_33_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_33_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.432, -0.058, 1.349, -0.208, 0.669, 1.766], [0.527, -0.253, 0.616, -0.259, 1.051, 2.58]]\nB: [[-1.145, -0.538, 0.911, 0.071, 0.71, 1.954], [0.803, -0.422, 1.032, 0.108, 0.84, 2.211]]\nC: [[-1.363, -0.409, 0.647, 0.052, 0.929, 2.359], [0.332, 0.057, 1.462, -0.091, 0.807, 2.526]]\nD: [[-1.139, -0.369, 0.72, -0.007, 0.535, 2.292], [0.44, -0.22, 1.3, 0.54, 1.144, 2.107]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. The camera pose information includes: the rotation matrix: [[-0.464707, 0.496079, -0.733453], [0.882598, 0.326106, -0.338639], [0.071191, -0.804711, -0.589382]]; the translation vector: [2.864701, 0.868861, 1.204561], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.432, -0.058, 1.349, -0.208, 0.669, 1.766], [0.527, -0.253, 0.616, -0.259, 1.051, 2.58]]\nB: [[-1.145, -0.538, 0.911, 0.071, 0.71, 1.954], [0.803, -0.422, 1.032, 0.108, 0.84, 2.211]]\nC: [[-1.363, -0.409, 0.647, 0.052, 0.929, 2.359], [0.332, 0.057, 1.462, -0.091, 0.807, 2.526]]\nD: [[-1.139, -0.369, 0.72, -0.007, 0.535, 2.292], [0.44, -0.22, 1.3, 0.54, 1.144, 2.107]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_34_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_34_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.366, -0.589, 0.493, 0.271, 1.157, 0.396]]\nB: [[-2.148, -0.107, 0.643, 0.495, 1.354, 0.165]]\nC: [[-2.396, -0.378, 0.719, 0.293, 1.134, 0.807]]\nD: [[-2.162, -0.271, 0.293, -0.116, 0.802, 0.089]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the books in the scene. The camera pose information includes: the rotation matrix: [[0.467192, 0.317292, -0.825262], [0.883302, -0.126478, 0.451421], [0.038855, -0.939856, -0.339354]]; the translation vector: [2.723032, 3.168159, 1.438168], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.366, -0.589, 0.493, 0.271, 1.157, 0.396]]\nB: [[-2.148, -0.107, 0.643, 0.495, 1.354, 0.165]]\nC: [[-2.396, -0.378, 0.719, 0.293, 1.134, 0.807]]\nD: [[-2.162, -0.271, 0.293, -0.116, 0.802, 0.089]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_35_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_35_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.721, -0.518, 0.621, 0.489, 0.194, 0.671], [0.219, 1.2, 0.605, 0.561, 1.11, 0.032]]\nB: [[1.127, -0.237, 0.575, 0.571, 0.442, 0.463], [0.315, 0.86, 0.589, 0.436, 0.639, 0.436]]\nC: [[1.534, -0.019, 0.554, 1.064, 0.929, 0.813], [0.238, 0.541, 0.519, 0.085, 0.619, 0.329]]\nD: [[0.67, 0.235, 1.051, 0.639, -0.039, 0.084], [0.187, 0.38, 0.829, 0.452, 0.327, 0.898]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[0.473704, -0.275929, 0.836342], [-0.879436, -0.198746, 0.432542], [0.046868, -0.940406, -0.336809]]; the translation vector: [2.984934, 2.048073, 1.446683], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.721, -0.518, 0.621, 0.489, 0.194, 0.671], [0.219, 1.2, 0.605, 0.561, 1.11, 0.032]]\nB: [[1.127, -0.237, 0.575, 0.571, 0.442, 0.463], [0.315, 0.86, 0.589, 0.436, 0.639, 0.436]]\nC: [[1.534, -0.019, 0.554, 1.064, 0.929, 0.813], [0.238, 0.541, 0.519, 0.085, 0.619, 0.329]]\nD: [[0.67, 0.235, 1.051, 0.639, -0.039, 0.084], [0.187, 0.38, 0.829, 0.452, 0.327, 0.898]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_36_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_36_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.092, -0.678, 1.584, -0.059, 1.814, 1.264], [-1.925, -0.058, 1.478, -0.34, 2.948, 1.581]]\nB: [[1.208, -0.318, 1.322, 0.047, 1.935, 1.314], [-2.088, -0.879, 1.441, 0.235, 2.296, 1.085]]\nC: [[1.41, -0.38, 1.574, 0.141, 1.666, 1.41], [-1.712, -0.407, 1.364, 0.152, 2.69, 1.496]]\nD: [[1.415, -0.841, 1.953, -0.188, 1.625, 1.182], [-1.333, -0.183, 1.414, 0.172, 2.326, 1.539]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the blackboard in the scene. The camera pose information includes: the rotation matrix: [[0.24604, -0.551346, 0.797171], [-0.968826, -0.115295, 0.219278], [-0.028988, -0.826271, -0.562526]]; the translation vector: [1.704247, 2.057158, 1.361636], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.092, -0.678, 1.584, -0.059, 1.814, 1.264], [-1.925, -0.058, 1.478, -0.34, 2.948, 1.581]]\nB: [[1.208, -0.318, 1.322, 0.047, 1.935, 1.314], [-2.088, -0.879, 1.441, 0.235, 2.296, 1.085]]\nC: [[1.41, -0.38, 1.574, 0.141, 1.666, 1.41], [-1.712, -0.407, 1.364, 0.152, 2.69, 1.496]]\nD: [[1.415, -0.841, 1.953, -0.188, 1.625, 1.182], [-1.333, -0.183, 1.414, 0.172, 2.326, 1.539]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_37_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_37_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.294, -3.518, 1.054, 3.936, 0.361, 0.915], [0.879, 3.786, 1.41, 2.097, 0.63, 1.328]]\nB: [[-0.76, -3.309, 1.31, 3.985, 0.372, 1.047], [0.904, 3.311, 1.519, 1.8, 0.243, 1.41]]\nC: [[-0.969, -3.07, 1.797, 3.572, 0.172, 1.293], [1.021, 3.539, 1.127, 2.014, -0.169, 1.491]]\nD: [[-0.614, -3.4, 1.295, 4.114, 0.42, 0.553], [0.711, 2.902, 1.12, 1.656, 0.643, 1.016]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the whiteboard in the scene. The camera pose information includes: the rotation matrix: [[-0.852441, 0.228219, -0.470383], [0.522431, 0.337001, -0.78326], [-0.020235, -0.913426, -0.406502]]; the translation vector: [1.798405, 5.320803, 1.619482], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.294, -3.518, 1.054, 3.936, 0.361, 0.915], [0.879, 3.786, 1.41, 2.097, 0.63, 1.328]]\nB: [[-0.76, -3.309, 1.31, 3.985, 0.372, 1.047], [0.904, 3.311, 1.519, 1.8, 0.243, 1.41]]\nC: [[-0.969, -3.07, 1.797, 3.572, 0.172, 1.293], [1.021, 3.539, 1.127, 2.014, -0.169, 1.491]]\nD: [[-0.614, -3.4, 1.295, 4.114, 0.42, 0.553], [0.711, 2.902, 1.12, 1.656, 0.643, 1.016]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_38_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_38_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.694, -2.027, 0.441, 1.326, 1.107, 0.898], [-0.288, -2.078, 0.474, 1.039, 1.539, 0.924]]\nB: [[1.654, -2.115, 0.692, 1.098, 1.029, 0.654], [-0.011, -1.968, 0.288, 1.388, 1.994, 1.185]]\nC: [[2.035, -2.378, 0.613, 1.604, 1.492, 1.161], [-0.68, -1.93, 0.48, 0.656, 1.897, 0.701]]\nD: [[1.361, -2.093, 0.306, 1.061, 0.846, 0.974], [-0.065, -1.942, 0.682, 1.519, 1.648, 1.035]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the foosball table in the scene. The camera pose information includes: the rotation matrix: [[-0.699126, -0.324611, 0.637064], [-0.713802, 0.265353, -0.648131], [0.041344, -0.907863, -0.417224]]; the translation vector: [0.050403, 3.78209, 1.506908], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.694, -2.027, 0.441, 1.326, 1.107, 0.898], [-0.288, -2.078, 0.474, 1.039, 1.539, 0.924]]\nB: [[1.654, -2.115, 0.692, 1.098, 1.029, 0.654], [-0.011, -1.968, 0.288, 1.388, 1.994, 1.185]]\nC: [[2.035, -2.378, 0.613, 1.604, 1.492, 1.161], [-0.68, -1.93, 0.48, 0.656, 1.897, 0.701]]\nD: [[1.361, -2.093, 0.306, 1.061, 0.846, 0.974], [-0.065, -1.942, 0.682, 1.519, 1.648, 1.035]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_39_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_39_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.934, -0.844, -0.178, -0.06, 0.379, 0.377]]\nB: [[-2.118, -0.866, 0.424, -0.166, 0.218, 0.556]]\nC: [[-2.075, -0.928, -0.19, 0.558, 0.471, 0.431]]\nD: [[-1.78, -0.879, 0.057, 0.14, 0.194, 0.118]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[-0.079918, -0.690871, 0.718547], [-0.996802, 0.055321, -0.057677], [9.6e-05, -0.720858, -0.693082]]; the translation vector: [1.142658, 0.968078, 1.385987], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.934, -0.844, -0.178, -0.06, 0.379, 0.377]]\nB: [[-2.118, -0.866, 0.424, -0.166, 0.218, 0.556]]\nC: [[-2.075, -0.928, -0.19, 0.558, 0.471, 0.431]]\nD: [[-1.78, -0.879, 0.057, 0.14, 0.194, 0.118]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_40_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_40_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.004, -0.056, 0.156, 0.548, 2.574, 0.973], [0.863, 1.538, 0.8, 1.622, 0.932, 0.511]]\nB: [[-0.752, -0.451, 0.479, 0.974, 2.169, 0.971], [0.505, 1.322, 0.592, 1.774, 0.902, 0.995]]\nC: [[-0.502, -0.659, 0.53, 0.847, 2.257, 0.624], [0.069, 1.171, 0.213, 2.015, 1.277, 1.24]]\nD: [[-0.413, -0.371, 0.765, 1.102, 2.094, 1.312], [0.364, 1.532, 0.25, 2.233, 1.243, 0.916]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the couch in the scene. The camera pose information includes: the rotation matrix: [[-0.861262, 0.35211, -0.366398], [0.508128, 0.60504, -0.61297], [0.005853, -0.714105, -0.700014]]; the translation vector: [3.145762, 3.637784, 1.437024], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.004, -0.056, 0.156, 0.548, 2.574, 0.973], [0.863, 1.538, 0.8, 1.622, 0.932, 0.511]]\nB: [[-0.752, -0.451, 0.479, 0.974, 2.169, 0.971], [0.505, 1.322, 0.592, 1.774, 0.902, 0.995]]\nC: [[-0.502, -0.659, 0.53, 0.847, 2.257, 0.624], [0.069, 1.171, 0.213, 2.015, 1.277, 1.24]]\nD: [[-0.413, -0.371, 0.765, 1.102, 2.094, 1.312], [0.364, 1.532, 0.25, 2.233, 1.243, 0.916]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_41_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_41_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.464, -1.008, 0.704, 0.548, 0.603, 1.02], [-0.19, -0.382, 0.144, 0.733, 0.716, 0.646], [-0.404, 0.288, 0.436, 0.98, 0.357, 1.05], [1.664, -1.226, -0.006, 0.561, 0.79, 0.588], [0.842, 1.15, 0.487, 0.536, 0.705, 0.632], [0.433, 0.35, -0.074, 0.642, 0.688, 0.335], [1.494, 3.097, 0.552, 0.862, 0.855, 0.649], [-1.799, -1.668, 0.843, 1.135, 1.012, 0.533], [2.071, -0.103, 0.082, 0.7, 0.467, 1.183], [2.558, 1.11, 0.748, 0.486, 0.458, 0.736], [-0.954, 2.892, -0.06, 0.296, 0.278, 1.194], [-1.303, 2.083, 0.061, 0.236, 0.278, 0.444], [-1.569, 0.894, 0.218, 0.482, 1.049, 0.471]]\nB: [[0.772, -0.719, 0.389, 0.713, 0.789, 0.818], [-0.024, -0.745, 0.397, 0.693, 0.69, 0.791], [-0.445, -0.009, 0.396, 0.704, 0.6, 0.798], [1.881, -0.924, 0.405, 0.629, 0.643, 0.773], [0.681, 0.918, 0.401, 0.691, 0.741, 0.776], [0.646, 0.122, 0.392, 0.618, 0.697, 0.804], [1.675, 2.694, 0.343, 0.794, 0.824, 0.712], [-1.741, -1.918, 0.384, 0.689, 0.734, 0.793], [1.972, 0.182, 0.329, 0.798, 0.905, 0.759], [2.104, 1.432, 0.601, 0.176, 0.467, 0.305], [-1.26, 2.803, 0.397, 0.519, 0.618, 0.85], [-1.699, 1.837, 0.379, 0.732, 0.671, 0.798], [-1.685, 1.314, 0.409, 0.719, 0.764, 0.815]]\nC: [[0.533, -0.974, 0.234, 0.918, 0.378, 0.964], [-0.355, -1.19, 0.156, 
0.302, 0.635, 0.774], [-0.597, 0.157, 0.288, 1.05, 0.184, 0.298], [2.072, -0.909, 0.536, 0.468, 0.691, 0.463], [0.786, 1.284, 0.692, 1.11, 1.012, 1.207], [0.407, 0.333, 0.418, 0.195, 0.858, 0.97], [1.968, 3.191, -0.153, 0.695, 1.269, 0.454], [-1.257, -1.997, 0.349, 0.303, 0.286, 0.552], [2.317, 0.459, 0.175, 0.403, 1.116, 1.213], [2.141, 1.823, 0.68, -0.29, 0.059, -0.035], [-1.354, 3.299, 0.362, 0.406, 0.802, 0.98], [-2.092, 2.265, 0.732, 1.224, 0.725, 0.93], [-1.784, 1.414, 0.713, 0.316, 1.116, 0.675]]\nD: [[0.989, -0.333, 0.223, 0.813, 0.656, 0.519], [0.19, -0.985, 0.389, 0.303, 0.729, 1.121], [-0.625, 0.156, 0.665, 1.074, 0.926, 0.429], [2.366, -0.669, 0.862, 0.551, 0.718, 0.409], [1.078, 0.548, 0.472, 1.129, 0.587, 0.295], [0.268, -0.298, 0.199, 0.384, 0.582, 0.724], [1.775, 3.124, 0.353, 0.87, 1.306, 0.424], [-2.119, -2.015, 0.712, 0.444, 0.613, 1.097], [2.125, 0.536, -0.025, 0.783, 0.67, 0.385], [2.16, 1.441, 0.464, 0.575, 0.443, 0.108], [-1.127, 3.006, 0.402, 0.226, 0.819, 0.552], [-1.981, 2.007, -0.054, 1.127, 0.372, 0.846], [-1.217, 1.009, -0.072, 0.967, 0.351, 1.126]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[0.951558, 0.16536, -0.259218], [0.307283, -0.481983, 0.820531], [0.010744, -0.860436, -0.509446]]; the translation vector: [2.919862, 3.428013, 1.521081], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.464, -1.008, 0.704, 0.548, 0.603, 1.02], [-0.19, -0.382, 0.144, 0.733, 0.716, 0.646], [-0.404, 0.288, 0.436, 0.98, 0.357, 1.05], [1.664, -1.226, -0.006, 0.561, 0.79, 0.588], [0.842, 1.15, 0.487, 0.536, 0.705, 0.632], [0.433, 0.35, -0.074, 0.642, 0.688, 0.335], [1.494, 3.097, 0.552, 0.862, 0.855, 0.649], [-1.799, -1.668, 0.843, 1.135, 1.012, 0.533], [2.071, -0.103, 0.082, 0.7, 0.467, 1.183], [2.558, 1.11, 0.748, 0.486, 0.458, 0.736], [-0.954, 2.892, -0.06, 0.296, 0.278, 1.194], [-1.303, 2.083, 0.061, 0.236, 0.278, 0.444], [-1.569, 0.894, 0.218, 0.482, 1.049, 0.471]]\nB: [[0.772, -0.719, 0.389, 0.713, 0.789, 0.818], [-0.024, -0.745, 0.397, 0.693, 0.69, 0.791], [-0.445, -0.009, 0.396, 0.704, 0.6, 0.798], [1.881, -0.924, 0.405, 0.629, 0.643, 0.773], [0.681, 0.918, 0.401, 0.691, 0.741, 0.776], [0.646, 0.122, 0.392, 0.618, 0.697, 0.804], [1.675, 2.694, 0.343, 0.794, 0.824, 0.712], [-1.741, -1.918, 0.384, 0.689, 0.734, 0.793], [1.972, 0.182, 0.329, 0.798, 0.905, 0.759], [2.104, 1.432, 0.601, 0.176, 0.467, 0.305], [-1.26, 2.803, 0.397, 0.519, 0.618, 0.85], [-1.699, 1.837, 0.379, 0.732, 0.671, 0.798], [-1.685, 1.314, 0.409, 0.719, 0.764, 0.815]]\nC: [[0.533, -0.974, 0.234, 0.918, 0.378, 0.964], [-0.355, -1.19, 0.156, 0.302, 0.635, 0.774], [-0.597, 0.157, 0.288, 1.05, 0.184, 0.298], [2.072, -0.909, 0.536, 0.468, 0.691, 0.463], [0.786, 1.284, 0.692, 1.11, 1.012, 1.207], [0.407, 0.333, 0.418, 0.195, 0.858, 0.97], [1.968, 3.191, -0.153, 0.695, 1.269, 0.454], [-1.257, -1.997, 0.349, 0.303, 0.286, 0.552], [2.317, 0.459, 0.175, 0.403, 1.116, 1.213], [2.141, 1.823, 0.68, -0.29, 0.059, -0.035], [-1.354, 3.299, 0.362, 0.406, 0.802, 0.98], [-2.092, 2.265, 0.732, 1.224, 0.725, 0.93], [-1.784, 1.414, 0.713, 0.316, 1.116, 0.675]]\nD: [[0.989, -0.333, 0.223, 0.813, 0.656, 0.519], [0.19, -0.985, 0.389, 0.303, 0.729, 1.121], [-0.625, 0.156, 0.665, 1.074, 0.926, 0.429], [2.366, -0.669, 0.862, 0.551, 0.718, 0.409], [1.078, 0.548, 0.472, 1.129, 
0.587, 0.295], [0.268, -0.298, 0.199, 0.384, 0.582, 0.724], [1.775, 3.124, 0.353, 0.87, 1.306, 0.424], [-2.119, -2.015, 0.712, 0.444, 0.613, 1.097], [2.125, 0.536, -0.025, 0.783, 0.67, 0.385], [2.16, 1.441, 0.464, 0.575, 0.443, 0.108], [-1.127, 3.006, 0.402, 0.226, 0.819, 0.552], [-1.981, 2.007, -0.054, 1.127, 0.372, 0.846], [-1.217, 1.009, -0.072, 0.967, 0.351, 1.126]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_42_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_42_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.467, 4.66, 0.479, 1.188, 0.953, 0.41], [-0.153, 2.813, 0.487, 0.892, 1.061, 0.54], [0.179, 1.102, 0.898, 0.937, 1.029, 0.418], [1.771, 1.676, 0.076, 1.486, 0.58, 0.338], [1.933, -1.484, -0.005, 1.579, 0.28, 0.413], [-0.13, 5.403, 0.908, 0.195, 1.153, 0.536]]\nB: [[2.334, 4.242, 0.72, 1.63, 0.144, 0.491], [-0.388, 2.622, 0.373, 0.028, 1.059, 0.86], [0.405, 0.837, 0.295, 0.037, 1.048, 0.862], [2.233, 1.663, 0.713, 1.024, 0.482, 0.49], [1.733, -1.171, 0.524, 1.296, 0.141, 0.721], [-0.04, 4.994, 0.079, 0.373, 0.629, 0.062]]\nC: [[2.181, 4.661, 0.338, 1.164, 0.599, 0.637], [-0.181, 2.466, 0.019, 0.362, 0.849, 0.165], [-0.035, 1.291, 0.403, 0.392, 0.795, 0.638], [2.338, 1.45, 0.464, 0.895, 0.891, 0.816], [2.039, -1.264, 0.768, 1.237, 0.686, -0.053], [-0.249, 5.084, 0.593, 0.24, 0.421, 0.492]]\nD: [[1.918, 4.662, 0.478, 1.328, 0.546, 0.414], [0.093, 2.502, 0.4, 0.472, 0.776, 0.619], [0.138, 1.203, 0.414, 0.446, 0.869, 0.461], [1.918, 1.858, 0.513, 1.358, 0.507, 0.451], [2.021, -1.528, 0.41, 1.371, 0.472, 0.431], [0.209, 5.284, 0.463, 0.428, 0.778, 0.322]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the bench in the scene. 
The camera pose information includes: the rotation matrix: [[-0.482968, -0.397392, 0.78027], [-0.874514, 0.173759, -0.452807], [0.044362, -0.901048, -0.431445]]; the translation vector: [8.974016, 2.795387, 1.945192], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.467, 4.66, 0.479, 1.188, 0.953, 0.41], [-0.153, 2.813, 0.487, 0.892, 1.061, 0.54], [0.179, 1.102, 0.898, 0.937, 1.029, 0.418], [1.771, 1.676, 0.076, 1.486, 0.58, 0.338], [1.933, -1.484, -0.005, 1.579, 0.28, 0.413], [-0.13, 5.403, 0.908, 0.195, 1.153, 0.536]]\nB: [[2.334, 4.242, 0.72, 1.63, 0.144, 0.491], [-0.388, 2.622, 0.373, 0.028, 1.059, 0.86], [0.405, 0.837, 0.295, 0.037, 1.048, 0.862], [2.233, 1.663, 0.713, 1.024, 0.482, 0.49], [1.733, -1.171, 0.524, 1.296, 0.141, 0.721], [-0.04, 4.994, 0.079, 0.373, 0.629, 0.062]]\nC: [[2.181, 4.661, 0.338, 1.164, 0.599, 0.637], [-0.181, 2.466, 0.019, 0.362, 0.849, 0.165], [-0.035, 1.291, 0.403, 0.392, 0.795, 0.638], [2.338, 1.45, 0.464, 0.895, 0.891, 0.816], [2.039, -1.264, 0.768, 1.237, 0.686, -0.053], [-0.249, 5.084, 0.593, 0.24, 0.421, 0.492]]\nD: [[1.918, 4.662, 0.478, 1.328, 0.546, 0.414], [0.093, 2.502, 0.4, 0.472, 0.776, 0.619], [0.138, 1.203, 0.414, 0.446, 0.869, 0.461], [1.918, 1.858, 0.513, 1.358, 0.507, 0.451], [2.021, -1.528, 0.41, 1.371, 0.472, 0.431], [0.209, 5.284, 0.463, 0.428, 0.778, 0.322]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_43_0.jpg", + 
"../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_43_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.653, 1.297, 0.818, 0.22, 0.426, 1.095]]\nB: [[-1.511, 1.726, 0.43, 0.986, 0.39, 0.401]]\nC: [[-0.81, 1.586, -0.129, 0.278, 0.94, 0.466]]\nD: [[-1.238, 1.344, 0.361, 0.491, 0.703, 0.77]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the toilet in the scene. The camera pose information includes: the rotation matrix: [[0.573165, 0.475287, -0.667521], [0.819422, -0.337921, 0.462988], [-0.005517, -0.81235, -0.583144]]; the translation vector: [4.230747, 1.597944, 1.425469], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.653, 1.297, 0.818, 0.22, 0.426, 1.095]]\nB: [[-1.511, 1.726, 0.43, 0.986, 0.39, 0.401]]\nC: [[-0.81, 1.586, -0.129, 0.278, 0.94, 0.466]]\nD: [[-1.238, 1.344, 0.361, 0.491, 0.703, 0.77]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_44_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_44_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.21, -0.485, 1.162, 0.745, 0.977, 0.286], [0.409, 1.308, 0.507, 0.07, 0.653, 0.254], [-1.112, -1.013, 0.307, 0.502, 0.872, 0.567], [-0.178, 2.738, 0.98, 0.567, 0.888, 0.53], [0.062, 1.603, 0.573, 0.692, 0.195, 0.048], [0.203, -0.892, 0.691, 0.653, 0.429, 0.535], [0.028, 1.784, 0.14, 1.147, 0.406, 0.851], [-1.458, 1.116, 0.904, -0.122, 0.156, 0.773], [-1.192, -0.149, 0.71, 0.375, 0.509, 0.581], [1.914, -2.0, 0.71, 0.384, 0.176, -0.331], [1.687, -2.156, 1.387, -0.058, 0.551, 0.368], [0.793, -1.724, 1.309, 1.148, 0.62, 0.588], [1.399, -0.955, 1.401, 0.399, 0.543, 0.388], [-0.785, -3.035, 1.174, 0.319, 0.082, 0.789], [-0.814, -2.329, 0.623, 0.245, 0.091, 0.496], [-1.62, -3.469, 0.316, 0.527, 0.537, -0.091]]\nB: [[-0.08, -0.154, 1.025, 0.591, 0.905, 0.775], [0.195, 0.928, 0.983, 0.833, 0.216, 0.071], [-0.777, -0.244, 0.921, 0.352, 0.434, 0.837], [-0.104, 1.99, 0.831, 0.825, 0.625, 0.159], [1.019, 2.186, 0.505, 0.763, 0.5, 0.673], [0.085, -0.695, 1.038, 0.323, 0.449, 0.684], [-0.014, 1.677, 0.448, 0.846, 0.305, -0.088], [-0.906, 1.351, 0.456, 0.541, 1.066, 0.626], [-1.282, 0.246, 0.87, 0.842, 0.096, -0.15], [1.42, -1.945, 0.918, 0.762, 0.341, 0.254], [1.009, -1.899, 1.409, -0.041, 0.531, 0.04], [0.874, -1.746, 1.047, 0.664, 0.437, 0.465], [0.723, -1.178, 0.705, 0.411, 0.715, 0.301], [-0.325, -2.808, 0.799, 0.443, 0.515, -0.023], [-1.586, -1.764, 0.236, 0.308, 0.382, 0.158], [-1.704, -3.657, 
0.202, 0.579, -0.129, 0.217]]\nC: [[0.074, -0.449, 0.793, 0.473, 0.548, 0.492], [0.224, 1.038, 0.781, 0.486, 0.546, 0.522], [-0.941, -0.689, 0.578, 0.69, 0.615, 0.42], [-0.002, 2.387, 0.73, 0.7, 0.637, 0.471], [0.54, 1.814, 0.876, 0.434, 0.466, 0.544], [-0.295, -1.043, 0.729, 0.48, 0.533, 0.501], [-0.372, 1.676, 0.602, 0.676, 0.567, 0.405], [-1.148, 1.569, 0.485, 0.349, 0.639, 0.495], [-0.821, 0.09, 0.693, 0.409, 0.4, 0.269], [1.644, -1.897, 1.107, 0.364, 0.18, 0.12], [1.307, -2.14, 1.134, 0.414, 0.578, 0.428], [0.763, -1.797, 0.957, 0.648, 0.49, 0.409], [1.155, -1.373, 0.909, 0.317, 0.441, 0.117], [-0.563, -2.579, 0.735, 0.309, 0.521, 0.447], [-1.263, -2.059, 0.721, 0.472, 0.232, 0.232], [-1.688, -3.278, 0.604, 0.595, 0.313, 0.401]]\nD: [[0.369, -0.147, 1.222, 0.22, 0.106, 0.249], [0.37, 1.261, 1.11, 0.14, 1.02, 0.894], [-0.639, -0.96, 0.333, 0.677, 0.877, 0.601], [0.112, 1.921, 0.621, 0.682, 0.214, 0.04], [0.061, 1.445, 0.485, 0.375, 0.738, 0.414], [-0.478, -0.871, 0.684, 0.362, 0.566, 0.762], [-0.314, 1.927, 0.136, 0.42, 0.773, 0.685], [-1.086, 1.078, 0.616, 0.363, 0.796, 0.02], [-0.78, 0.455, 1.075, -0.039, 0.211, 0.125], [1.409, -1.503, 1.252, 0.797, 0.258, -0.146], [1.115, -1.981, 0.929, 0.053, 0.518, 0.484], [1.215, -2.2, 1.257, 0.76, 0.293, 0.427], [1.189, -1.058, 0.631, 0.369, 0.328, -0.119], [-0.365, -2.692, 1.041, -0.142, 0.542, 0.05], [-0.833, -2.437, 0.641, 0.718, 0.012, 0.121], [-2.016, -3.644, 1.062, 0.946, -0.031, -0.016]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[-0.844798, -0.442354, 0.301064], [-0.534849, 0.714819, -0.450523], [-0.015916, -0.541624, -0.84047]]; the translation vector: [3.085932, 7.995926, 1.934485], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.21, -0.485, 1.162, 0.745, 0.977, 0.286], [0.409, 1.308, 0.507, 0.07, 0.653, 0.254], [-1.112, -1.013, 0.307, 0.502, 0.872, 0.567], [-0.178, 2.738, 0.98, 0.567, 0.888, 0.53], [0.062, 1.603, 0.573, 0.692, 0.195, 0.048], [0.203, -0.892, 0.691, 0.653, 0.429, 0.535], [0.028, 1.784, 0.14, 1.147, 0.406, 0.851], [-1.458, 1.116, 0.904, -0.122, 0.156, 0.773], [-1.192, -0.149, 0.71, 0.375, 0.509, 0.581], [1.914, -2.0, 0.71, 0.384, 0.176, -0.331], [1.687, -2.156, 1.387, -0.058, 0.551, 0.368], [0.793, -1.724, 1.309, 1.148, 0.62, 0.588], [1.399, -0.955, 1.401, 0.399, 0.543, 0.388], [-0.785, -3.035, 1.174, 0.319, 0.082, 0.789], [-0.814, -2.329, 0.623, 0.245, 0.091, 0.496], [-1.62, -3.469, 0.316, 0.527, 0.537, -0.091]]\nB: [[-0.08, -0.154, 1.025, 0.591, 0.905, 0.775], [0.195, 0.928, 0.983, 0.833, 0.216, 0.071], [-0.777, -0.244, 0.921, 0.352, 0.434, 0.837], [-0.104, 1.99, 0.831, 0.825, 0.625, 0.159], [1.019, 2.186, 0.505, 0.763, 0.5, 0.673], [0.085, -0.695, 1.038, 0.323, 0.449, 0.684], [-0.014, 1.677, 0.448, 0.846, 0.305, -0.088], [-0.906, 1.351, 0.456, 0.541, 1.066, 0.626], [-1.282, 0.246, 0.87, 0.842, 0.096, -0.15], [1.42, -1.945, 0.918, 0.762, 0.341, 0.254], [1.009, -1.899, 1.409, -0.041, 0.531, 0.04], [0.874, -1.746, 1.047, 0.664, 0.437, 0.465], [0.723, -1.178, 0.705, 0.411, 0.715, 0.301], [-0.325, -2.808, 0.799, 0.443, 0.515, -0.023], [-1.586, -1.764, 0.236, 0.308, 0.382, 0.158], [-1.704, -3.657, 0.202, 0.579, -0.129, 0.217]]\nC: [[0.074, -0.449, 0.793, 0.473, 0.548, 0.492], [0.224, 1.038, 0.781, 0.486, 0.546, 0.522], [-0.941, -0.689, 0.578, 0.69, 0.615, 0.42], [-0.002, 2.387, 0.73, 0.7, 0.637, 0.471], [0.54, 1.814, 0.876, 0.434, 0.466, 0.544], 
[-0.295, -1.043, 0.729, 0.48, 0.533, 0.501], [-0.372, 1.676, 0.602, 0.676, 0.567, 0.405], [-1.148, 1.569, 0.485, 0.349, 0.639, 0.495], [-0.821, 0.09, 0.693, 0.409, 0.4, 0.269], [1.644, -1.897, 1.107, 0.364, 0.18, 0.12], [1.307, -2.14, 1.134, 0.414, 0.578, 0.428], [0.763, -1.797, 0.957, 0.648, 0.49, 0.409], [1.155, -1.373, 0.909, 0.317, 0.441, 0.117], [-0.563, -2.579, 0.735, 0.309, 0.521, 0.447], [-1.263, -2.059, 0.721, 0.472, 0.232, 0.232], [-1.688, -3.278, 0.604, 0.595, 0.313, 0.401]]\nD: [[0.369, -0.147, 1.222, 0.22, 0.106, 0.249], [0.37, 1.261, 1.11, 0.14, 1.02, 0.894], [-0.639, -0.96, 0.333, 0.677, 0.877, 0.601], [0.112, 1.921, 0.621, 0.682, 0.214, 0.04], [0.061, 1.445, 0.485, 0.375, 0.738, 0.414], [-0.478, -0.871, 0.684, 0.362, 0.566, 0.762], [-0.314, 1.927, 0.136, 0.42, 0.773, 0.685], [-1.086, 1.078, 0.616, 0.363, 0.796, 0.02], [-0.78, 0.455, 1.075, -0.039, 0.211, 0.125], [1.409, -1.503, 1.252, 0.797, 0.258, -0.146], [1.115, -1.981, 0.929, 0.053, 0.518, 0.484], [1.215, -2.2, 1.257, 0.76, 0.293, 0.427], [1.189, -1.058, 0.631, 0.369, 0.328, -0.119], [-0.365, -2.692, 1.041, -0.142, 0.542, 0.05], [-0.833, -2.437, 0.641, 0.718, 0.012, 0.121], [-2.016, -3.644, 1.062, 0.946, -0.031, -0.016]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_45_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_45_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.564, -1.252, 0.695, 0.857, 0.171, 1.483], [-0.805, -0.829, 1.211, 0.152, 2.16, 2.47], [-1.232, 0.204, 0.072, 0.257, 0.055, 0.162], [0.074, 0.233, 1.112, 1.831, 0.158, 2.315], [0.903, -0.43, 1.266, 0.188, 1.396, 2.012], [0.949, -1.643, 0.661, 0.101, 0.755, 1.399], [0.614, -1.995, 0.572, 0.724, 0.056, 1.162]]\nB: [[0.352, -1.05, 0.8, 0.589, 0.069, 1.588], [-0.381, -1.063, 0.88, -0.027, 2.004, 2.387], [-0.929, 0.409, 0.362, 0.309, 
0.339, -0.301], [-0.26, 0.432, 1.078, 1.853, 0.513, 2.721], [1.171, -0.028, 1.724, -0.263, 0.948, 2.304], [1.072, -2.041, 1.024, -0.297, 0.869, 1.517], [0.352, -2.32, 0.85, 0.916, -0.424, 1.2]]\nC: [[0.424, -1.563, 1.009, 0.591, -0.023, 1.935], [-0.442, -0.344, 1.695, 0.23, 2.524, 2.736], [-1.205, 0.414, 0.154, -0.209, -0.177, -0.009], [-0.098, 0.328, 1.36, 1.735, 0.101, 1.922], [0.835, -0.195, 1.265, 0.532, 0.907, 2.267], [1.354, -1.455, 1.149, 0.399, 0.893, 1.521], [0.733, -1.909, 0.585, 1.055, -0.351, 1.621]]\nD: [[0.598, -1.618, 0.741, 0.612, 0.383, 1.422], [-0.532, -0.954, 1.597, 0.537, 2.362, 2.085], [-0.843, 0.31, -0.092, 0.065, -0.048, 0.556], [0.212, 0.31, 0.904, 1.605, 0.458, 1.973], [0.493, -0.221, 1.142, 0.015, 1.45, 2.441], [1.383, -2.104, 0.997, -0.035, 0.835, 1.803], [0.664, -2.077, 1.046, 1.1, 0.235, 1.396]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.982764, 0.054289, -0.17671], [0.184841, -0.27426, 0.943724], [0.002769, -0.960122, -0.279568]]; the translation vector: [4.072058, 1.220293, 1.47625], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.564, -1.252, 0.695, 0.857, 0.171, 1.483], [-0.805, -0.829, 1.211, 0.152, 2.16, 2.47], [-1.232, 0.204, 0.072, 0.257, 0.055, 0.162], [0.074, 0.233, 1.112, 1.831, 0.158, 2.315], [0.903, -0.43, 1.266, 0.188, 1.396, 2.012], [0.949, -1.643, 0.661, 0.101, 0.755, 1.399], [0.614, -1.995, 0.572, 0.724, 0.056, 1.162]]\nB: [[0.352, -1.05, 0.8, 0.589, 0.069, 1.588], [-0.381, -1.063, 0.88, -0.027, 2.004, 2.387], [-0.929, 0.409, 0.362, 0.309, 0.339, -0.301], [-0.26, 0.432, 1.078, 1.853, 0.513, 2.721], [1.171, -0.028, 1.724, -0.263, 0.948, 2.304], [1.072, -2.041, 1.024, -0.297, 0.869, 1.517], [0.352, -2.32, 0.85, 0.916, -0.424, 1.2]]\nC: [[0.424, -1.563, 1.009, 0.591, -0.023, 1.935], [-0.442, -0.344, 1.695, 0.23, 2.524, 2.736], [-1.205, 0.414, 0.154, -0.209, -0.177, -0.009], [-0.098, 0.328, 1.36, 1.735, 0.101, 1.922], [0.835, -0.195, 1.265, 0.532, 0.907, 2.267], [1.354, -1.455, 1.149, 0.399, 0.893, 1.521], [0.733, -1.909, 0.585, 1.055, -0.351, 1.621]]\nD: [[0.598, -1.618, 0.741, 0.612, 0.383, 1.422], [-0.532, -0.954, 1.597, 0.537, 2.362, 2.085], [-0.843, 0.31, -0.092, 0.065, -0.048, 0.556], [0.212, 0.31, 0.904, 1.605, 0.458, 1.973], [0.493, -0.221, 1.142, 0.015, 1.45, 2.441], [1.383, -2.104, 0.997, -0.035, 0.835, 1.803], [0.664, -2.077, 1.046, 1.1, 0.235, 1.396]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_46_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_46_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.784, -1.767, 0.476, 0.231, 0.24, 0.946], [-0.366, -0.774, 1.245, 0.024, 1.212, 0.879], [0.003, -1.957, 1.089, 0.298, 0.345, 0.685], [0.22, -0.648, 1.189, -0.053, 1.037, 0.737]]\nB: [[0.318, -1.739, 0.9, 0.365, 0.659, 0.502], [-0.143, -0.934, 0.904, 0.311, 0.754, 0.487], [-0.263, -1.46, 0.926, 0.248, 0.697, 0.452], [0.319, 
-1.069, 0.941, 0.277, 0.615, 0.5]]\nC: [[0.289, -1.409, 1.144, 0.67, 0.233, 0.02], [0.068, -0.634, 0.752, -0.119, 1.056, 0.899], [0.211, -1.754, 1.05, -0.206, 0.931, 0.732], [-0.148, -1.524, 1.046, -0.083, 1.07, 0.467]]\nD: [[0.118, -1.502, 0.988, 0.826, 0.676, 0.125], [-0.431, -1.392, 0.927, 0.243, 0.317, 0.128], [-0.041, -1.634, 0.476, -0.222, 0.764, 0.802], [0.199, -1.056, 1.182, 0.1, 0.287, 0.626]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the monitor in the scene. The camera pose information includes: the rotation matrix: [[-0.481759, -0.460793, 0.745371], [-0.875469, 0.290199, -0.386444], [-0.038235, -0.838722, -0.543216]]; the translation vector: [3.08436, 2.075189, 1.468295], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.784, -1.767, 0.476, 0.231, 0.24, 0.946], [-0.366, -0.774, 1.245, 0.024, 1.212, 0.879], [0.003, -1.957, 1.089, 0.298, 0.345, 0.685], [0.22, -0.648, 1.189, -0.053, 1.037, 0.737]]\nB: [[0.318, -1.739, 0.9, 0.365, 0.659, 0.502], [-0.143, -0.934, 0.904, 0.311, 0.754, 0.487], [-0.263, -1.46, 0.926, 0.248, 0.697, 0.452], [0.319, -1.069, 0.941, 0.277, 0.615, 0.5]]\nC: [[0.289, -1.409, 1.144, 0.67, 0.233, 0.02], [0.068, -0.634, 0.752, -0.119, 1.056, 0.899], [0.211, -1.754, 1.05, -0.206, 0.931, 0.732], [-0.148, -1.524, 1.046, -0.083, 1.07, 0.467]]\nD: [[0.118, -1.502, 0.988, 0.826, 0.676, 0.125], [-0.431, -1.392, 0.927, 0.243, 0.317, 0.128], [-0.041, -1.634, 0.476, -0.222, 0.764, 0.802], [0.199, -1.056, 1.182, 0.1, 0.287, 0.626]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_47_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_47_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.668, 1.082, 1.323, 0.017, 1.605, 2.936], [2.048, 0.314, 0.42, 0.03, 0.539, 1.761], [1.696, -0.749, 1.568, 0.626, 3.055, 2.333], [0.245, -2.087, 0.863, 3.306, 0.237, 1.624], [-1.623, -0.389, 0.569, -0.197, 4.462, 2.242], [-0.209, 2.531, 1.246, 3.173, 0.694, 2.668]]\nB: [[2.245, 1.07, 1.564, -0.254, 1.675, 2.343], [1.954, 0.142, 1.271, 0.081, 0.135, 1.547], [1.76, -0.992, 1.136, -0.068, 2.332, 2.454], [-0.223, -2.118, 0.902, 3.53, 0.211, 2.156], [-1.973, 0.007, 0.511, -0.091, 4.007, 2.11], [0.095, 1.842, 1.661, 2.856, 0.289, 2.599]]\nC: [[1.954, 0.955, 1.134, 0.469, 1.906, 3.136], [2.108, 0.777, 1.024, 0.49, 0.432, 1.736], [1.804, -1.161, 1.159, 0.579, 2.431, 2.741], [-0.075, -2.215, 1.141, 4.022, 0.647, 1.726], [-1.259, -0.097, 0.765, 0.259, 4.315, 1.492], [0.085, 2.397, 1.377, 2.831, 0.531, 2.459]]\nD: [[1.757, 1.207, 1.277, 0.171, 1.443, 2.662], [1.938, 
0.553, 0.792, 0.372, 0.083, 1.699], [2.057, -0.732, 1.221, 0.308, 2.678, 2.532], [0.254, -2.161, 1.253, 3.68, 0.191, 2.014], [-1.595, -0.042, 0.831, 0.273, 4.363, 1.747], [0.19, 2.052, 1.297, 3.302, 0.408, 2.639]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.935878, -0.161972, 0.312885], [-0.352322, 0.433116, -0.829627], [-0.001139, -0.886666, -0.46241]]; the translation vector: [1.123681, 2.231354, 1.408983], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.668, 1.082, 1.323, 0.017, 1.605, 2.936], [2.048, 0.314, 0.42, 0.03, 0.539, 1.761], [1.696, -0.749, 1.568, 0.626, 3.055, 2.333], [0.245, -2.087, 0.863, 3.306, 0.237, 1.624], [-1.623, -0.389, 0.569, -0.197, 4.462, 2.242], [-0.209, 2.531, 1.246, 3.173, 0.694, 2.668]]\nB: [[2.245, 1.07, 1.564, -0.254, 1.675, 2.343], [1.954, 0.142, 1.271, 0.081, 0.135, 1.547], [1.76, -0.992, 1.136, -0.068, 2.332, 2.454], [-0.223, -2.118, 0.902, 3.53, 0.211, 2.156], [-1.973, 0.007, 0.511, -0.091, 4.007, 2.11], [0.095, 1.842, 1.661, 2.856, 0.289, 2.599]]\nC: [[1.954, 0.955, 1.134, 0.469, 1.906, 3.136], [2.108, 0.777, 1.024, 0.49, 0.432, 1.736], [1.804, -1.161, 1.159, 0.579, 2.431, 2.741], [-0.075, -2.215, 1.141, 4.022, 0.647, 1.726], [-1.259, -0.097, 0.765, 0.259, 4.315, 1.492], [0.085, 2.397, 1.377, 2.831, 0.531, 2.459]]\nD: [[1.757, 1.207, 1.277, 0.171, 1.443, 2.662], [1.938, 0.553, 0.792, 0.372, 0.083, 1.699], [2.057, -0.732, 1.221, 0.308, 2.678, 2.532], [0.254, -2.161, 1.253, 3.68, 0.191, 2.014], [-1.595, -0.042, 0.831, 0.273, 4.363, 1.747], [0.19, 2.052, 1.297, 3.302, 0.408, 2.639]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_48_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_48_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.874, 0.432, 0.675, 0.547, 4.466, 2.765], [2.077, 1.025, 0.96, 0.435, 5.53, 2.428], [-0.302, -2.355, 0.907, 2.255, -0.132, 2.065], [1.309, 2.97, 0.649, 2.105, 0.601, 1.706], [1.394, -2.794, 0.572, 0.929, 0.444, 0.269]]\nB: [[-1.774, -0.336, 1.363, 0.137, 4.327, 2.118], [2.024, 0.331, 0.678, 0.299, 6.038, 2.411], [-0.973, -2.453, 1.173, 2.209, -0.12, 1.889], [0.907, 3.079, 0.375, 1.445, 0.297, 1.527], [1.457, -2.684, 0.632, 0.896, -0.39, 0.289]]\nC: [[-2.285, -0.2, 1.099, -0.248, 4.908, 2.313], [2.096, 0.355, 
1.235, 0.363, 5.974, 2.044], [-1.085, -2.082, 0.91, 2.454, 0.239, 1.438], [0.732, 3.157, 0.493, 1.665, 0.182, 1.592], [1.088, -2.467, -0.003, 0.673, 0.233, 0.167]]\nD: [[-1.815, -0.066, 1.137, 0.19, 4.502, 2.283], [1.738, 0.547, 0.94, 0.42, 5.701, 2.065], [-0.777, -2.286, 1.047, 2.091, 0.123, 1.885], [1.023, 3.389, 0.735, 1.699, 0.123, 1.528], [1.495, -2.301, 0.226, 0.722, 0.028, 0.613]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.934222, -0.219071, 0.281493], [-0.356558, -0.595286, 0.72007], [0.009823, -0.773073, -0.634241]]; the translation vector: [0.331108, 1.989283, 1.551545], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.874, 0.432, 0.675, 0.547, 4.466, 2.765], [2.077, 1.025, 0.96, 0.435, 5.53, 2.428], [-0.302, -2.355, 0.907, 2.255, -0.132, 2.065], [1.309, 2.97, 0.649, 2.105, 0.601, 1.706], [1.394, -2.794, 0.572, 0.929, 0.444, 0.269]]\nB: [[-1.774, -0.336, 1.363, 0.137, 4.327, 2.118], [2.024, 0.331, 0.678, 0.299, 6.038, 2.411], [-0.973, -2.453, 1.173, 2.209, -0.12, 1.889], [0.907, 3.079, 0.375, 1.445, 0.297, 1.527], [1.457, -2.684, 0.632, 0.896, -0.39, 0.289]]\nC: [[-2.285, -0.2, 1.099, -0.248, 4.908, 2.313], [2.096, 0.355, 1.235, 0.363, 5.974, 2.044], [-1.085, -2.082, 0.91, 2.454, 0.239, 1.438], [0.732, 3.157, 0.493, 1.665, 0.182, 1.592], [1.088, -2.467, -0.003, 0.673, 0.233, 0.167]]\nD: [[-1.815, -0.066, 1.137, 0.19, 4.502, 2.283], [1.738, 0.547, 0.94, 0.42, 5.701, 2.065], [-0.777, -2.286, 1.047, 2.091, 0.123, 1.885], [1.023, 3.389, 0.735, 1.699, 0.123, 1.528], [1.495, -2.301, 0.226, 0.722, 0.028, 0.613]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_49_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_49_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.355, 0.849, 0.484, 0.583, 0.026, -0.166]]\nB: [[-0.954, 0.48, 0.115, 0.22, 0.221, 0.246]]\nC: [[-0.886, 0.23, -0.323, 0.388, 0.524, 0.544]]\nD: [[-0.877, -0.009, -0.082, -0.196, 0.347, 0.57]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the trash can in the scene. The camera pose information includes: the rotation matrix: [[-0.986418, -0.051155, 0.156087], [-0.152905, 0.633099, -0.758819], [-0.060001, -0.772379, -0.632322]]; the translation vector: [2.055195, 1.600374, 1.268236], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.355, 0.849, 0.484, 0.583, 0.026, -0.166]]\nB: [[-0.954, 0.48, 0.115, 0.22, 0.221, 0.246]]\nC: [[-0.886, 0.23, -0.323, 0.388, 0.524, 0.544]]\nD: [[-0.877, -0.009, -0.082, -0.196, 0.347, 0.57]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_50_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_50_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.231, -1.891, 1.007, 2.627, 2.266, 2.036], [-0.307, -0.367, 0.926, 1.513, 1.063, 1.867]]\nB: [[0.718, -2.089, 0.687, 2.342, 2.265, 1.579], [-0.634, -0.345, 1.176, 1.11, 1.202, 2.075]]\nC: [[0.464, -1.663, 1.056, 2.135, 2.464, 2.098], [-0.781, -0.219, 1.071, 1.18, 1.361, 1.522]]\nD: [[-0.112, -2.192, 0.852, 2.525, 1.965, 2.377], [0.128, -0.805, 0.888, 1.396, 1.418, 2.338]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the bathroom stall in the scene. The camera pose information includes: the rotation matrix: [[-0.255252, -0.433184, 0.864406], [-0.966562, 0.137073, -0.216725], [-0.024605, -0.890821, -0.453687]]; the translation vector: [1.468232, 3.881342, 1.432686], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.231, -1.891, 1.007, 2.627, 2.266, 2.036], [-0.307, -0.367, 0.926, 1.513, 1.063, 1.867]]\nB: [[0.718, -2.089, 0.687, 2.342, 2.265, 1.579], [-0.634, -0.345, 1.176, 1.11, 1.202, 2.075]]\nC: [[0.464, -1.663, 1.056, 2.135, 2.464, 2.098], [-0.781, -0.219, 1.071, 1.18, 1.361, 1.522]]\nD: [[-0.112, -2.192, 0.852, 2.525, 1.965, 2.377], [0.128, -0.805, 0.888, 1.396, 1.418, 2.338]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_51_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_51_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.293, -1.301, 1.261, 0.557, 0.076, 0.181]]\nB: [[-2.3, -0.603, 0.539, 0.144, 0.291, 0.744]]\nC: [[-2.289, -1.004, 0.913, 0.094, 0.463, 0.318]]\nD: [[-2.447, -0.778, 0.56, -0.086, 0.586, 0.687]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[0.140295, 0.625342, -0.767636], [0.990108, -0.090149, 0.107516], [-0.001967, -0.775126, -0.631804]]; the translation vector: [3.410891, 3.073526, 1.198756], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-2.293, -1.301, 1.261, 0.557, 0.076, 0.181]]\nB: [[-2.3, -0.603, 0.539, 0.144, 0.291, 0.744]]\nC: [[-2.289, -1.004, 0.913, 0.094, 0.463, 0.318]]\nD: [[-2.447, -0.778, 0.56, -0.086, 0.586, 0.687]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_52_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_52_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.354, 0.454, 0.921, -0.335, 0.498, 0.874], [0.185, -0.574, 0.751, 0.09, -0.38, 0.911], [0.667, 0.586, 2.143, 0.441, 0.23, 0.505]]\nB: [[-1.09, 0.059, 1.019, 0.117, 0.263, 0.377], [0.279, -1.061, 0.877, 0.477, 0.116, 0.622], [0.666, 0.093, 1.789, 0.132, 0.373, 0.347]]\nC: [[-1.434, -0.263, 0.532, 0.445, 0.024, 0.383], [-0.034, -1.535, 0.533, 0.655, 0.426, 0.876], [0.704, 0.231, 1.687, 0.279, -0.11, 0.575]]\nD: [[-0.897, -0.345, 1.454, 0.607, 0.705, 0.804], [0.146, -1.337, 0.587, 0.096, 0.382, 0.839], [0.567, -0.339, 1.673, 0.166, 0.534, 0.522]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the towel in the scene. The camera pose information includes: the rotation matrix: [[-0.221984, 0.421429, -0.879273], [0.97466, 0.121427, -0.187867], [0.027595, -0.898695, -0.437705]]; the translation vector: [3.155292, 0.483793, 1.35371], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.354, 0.454, 0.921, -0.335, 0.498, 0.874], [0.185, -0.574, 0.751, 0.09, -0.38, 0.911], [0.667, 0.586, 2.143, 0.441, 0.23, 0.505]]\nB: [[-1.09, 0.059, 1.019, 0.117, 0.263, 0.377], [0.279, -1.061, 0.877, 0.477, 0.116, 0.622], [0.666, 0.093, 1.789, 0.132, 0.373, 0.347]]\nC: [[-1.434, -0.263, 0.532, 0.445, 0.024, 0.383], [-0.034, -1.535, 0.533, 0.655, 0.426, 0.876], [0.704, 0.231, 1.687, 0.279, -0.11, 0.575]]\nD: [[-0.897, -0.345, 1.454, 0.607, 0.705, 0.804], [0.146, -1.337, 0.587, 0.096, 0.382, 0.839], [0.567, -0.339, 1.673, 0.166, 0.534, 0.522]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_53_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_53_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.872, -1.053, 1.714, -0.4, 0.547, -0.388]]\nB: [[-0.641, -0.865, 2.002, 0.06, 0.688, 0.05]]\nC: [[-0.24, -0.538, 2.349, -0.015, 0.604, 0.452]]\nD: [[-0.437, -0.89, 1.743, -0.382, 0.608, -0.394]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shower curtain rod in the scene. The camera pose information includes: the rotation matrix: [[0.173351, 0.592298, -0.78685], [0.984858, -0.105806, 0.137329], [-0.001913, -0.798742, -0.601671]]; the translation vector: [3.264189, 1.940071, 1.28435], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.872, -1.053, 1.714, -0.4, 0.547, -0.388]]\nB: [[-0.641, -0.865, 2.002, 0.06, 0.688, 0.05]]\nC: [[-0.24, -0.538, 2.349, -0.015, 0.604, 0.452]]\nD: [[-0.437, -0.89, 1.743, -0.382, 0.608, -0.394]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_54_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_54_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.74, 0.949, 0.805, 2.053, 0.051, 1.667], [-1.281, -0.065, 0.899, 0.112, 2.004, 1.85], [0.446, -1.039, 0.627, 1.399, 0.094, 1.27], [1.134, -0.792, 0.678, 0.041, 0.539, 1.365], [-1.434, 2.505, 0.652, 0.518, 0.203, 1.213]]\nB: [[1.029, 0.71, 0.916, 2.029, -0.176, 1.869], [-1.683, -0.445, 1.099, -0.259, 2.235, 1.713], [0.848, -1.08, 0.506, 1.798, 0.259, 1.153], [1.268, -0.42, 0.271, 0.287, 0.751, 1.048], [-1.043, 2.825, 0.333, 0.321, -0.246, 1.582]]\nC: [[0.966, 1.169, 0.637, 2.193, -0.193, 1.801], [-0.869, -0.535, 1.386, 0.092, 1.727, 2.164], [0.169, -1.108, 0.224, 1.056, -0.222, 1.304], [0.91, -1.037, 1.17, -0.025, 0.5, 1.639], [-0.958, 2.714, 0.971, 0.285, -0.285, 1.316]]\nD: [[0.741, 1.382, 0.663, 1.864, -0.249, 2.055], [-1.139, 0.311, 1.207, -0.23, 2.288, 2.067], [0.431, -1.158, 0.998, 1.247, 0.194, 1.309], [0.658, -1.111, 1.067, 0.365, 0.642, 0.899], [-1.437, 2.999, 0.509, 0.702, 0.182, 1.021]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. 
The camera pose information includes: the rotation matrix: [[0.660671, 0.426343, -0.617856], [0.749322, -0.423957, 0.508701], [-0.045063, -0.799057, -0.599565]]; the translation vector: [1.739014, 2.260029, 1.323145], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.74, 0.949, 0.805, 2.053, 0.051, 1.667], [-1.281, -0.065, 0.899, 0.112, 2.004, 1.85], [0.446, -1.039, 0.627, 1.399, 0.094, 1.27], [1.134, -0.792, 0.678, 0.041, 0.539, 1.365], [-1.434, 2.505, 0.652, 0.518, 0.203, 1.213]]\nB: [[1.029, 0.71, 0.916, 2.029, -0.176, 1.869], [-1.683, -0.445, 1.099, -0.259, 2.235, 1.713], [0.848, -1.08, 0.506, 1.798, 0.259, 1.153], [1.268, -0.42, 0.271, 0.287, 0.751, 1.048], [-1.043, 2.825, 0.333, 0.321, -0.246, 1.582]]\nC: [[0.966, 1.169, 0.637, 2.193, -0.193, 1.801], [-0.869, -0.535, 1.386, 0.092, 1.727, 2.164], [0.169, -1.108, 0.224, 1.056, -0.222, 1.304], [0.91, -1.037, 1.17, -0.025, 0.5, 1.639], [-0.958, 2.714, 0.971, 0.285, -0.285, 1.316]]\nD: [[0.741, 1.382, 0.663, 1.864, -0.249, 2.055], [-1.139, 0.311, 1.207, -0.23, 2.288, 2.067], [0.431, -1.158, 0.998, 1.247, 0.194, 1.309], [0.658, -1.111, 1.067, 0.365, 0.642, 0.899], [-1.437, 2.999, 0.509, 0.702, 0.182, 1.021]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_55_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_55_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", 
+ "options": "A: [[2.289, 1.091, 1.235, 0.312, 0.445, 0.181], [2.614, -0.282, 1.115, 0.396, 0.952, 0.37], [2.915, -1.131, 1.323, 0.416, 1.127, 0.582]]\nB: [[2.865, 1.27, 1.277, 0.662, 1.126, 0.413], [2.634, -0.253, 1.185, 0.725, 0.57, 0.723], [2.702, -0.759, 1.232, 0.075, 0.776, 0.187]]\nC: [[2.596, 1.198, 1.179, 0.402, 0.868, 0.166], [2.565, 0.04, 1.202, 0.364, 0.895, 0.33], [2.601, -1.116, 1.104, 0.457, 0.792, 0.155]]\nD: [[2.413, 1.176, 1.278, 0.589, 0.574, -0.073], [2.181, 0.298, 1.094, 0.783, 1.368, 0.634], [2.395, -1.077, 1.082, 0.598, 1.002, 0.39]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the windowsill in the scene. The camera pose information includes: the rotation matrix: [[0.606468, -0.360414, 0.70873], [-0.789578, -0.16805, 0.590192], [-0.093612, -0.91753, -0.386492]]; the translation vector: [2.373669, 6.226582, 1.48631], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[2.289, 1.091, 1.235, 0.312, 0.445, 0.181], [2.614, -0.282, 1.115, 0.396, 0.952, 0.37], [2.915, -1.131, 1.323, 0.416, 1.127, 0.582]]\nB: [[2.865, 1.27, 1.277, 0.662, 1.126, 0.413], [2.634, -0.253, 1.185, 0.725, 0.57, 0.723], [2.702, -0.759, 1.232, 0.075, 0.776, 0.187]]\nC: [[2.596, 1.198, 1.179, 0.402, 0.868, 0.166], [2.565, 0.04, 1.202, 0.364, 0.895, 0.33], [2.601, -1.116, 1.104, 0.457, 0.792, 0.155]]\nD: [[2.413, 1.176, 1.278, 0.589, 0.574, -0.073], [2.181, 0.298, 1.094, 0.783, 1.368, 0.634], [2.395, -1.077, 1.082, 0.598, 1.002, 0.39]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_56_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_56_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.001, 2.948, 0.696, 1.051, 0.653, 1.017], [-1.218, 2.566, 0.334, 0.65, 1.143, 0.576], [-1.049, 4.68, 0.409, 0.92, 0.545, 1.203], [-0.848, -3.875, 0.54, 0.615, 0.936, 1.037], [1.041, 2.41, 0.443, 0.89, 0.787, 0.94], [1.624, -1.213, 0.004, 1.481, 1.102, 0.478], [1.287, 1.232, 0.545, 1.204, 1.228, 0.841], [1.818, -0.113, 0.532, 0.874, 0.706, 1.086], [0.092, -5.089, 0.693, 0.961, 0.819, 0.322], [-0.468, -1.491, 0.774, 0.97, 1.2, 1.024]]\nB: [[-0.126, 2.459, 0.086, 0.932, 1.114, 1.123], [-0.966, 2.226, 0.359, 1.537, 0.757, 0.462], [-1.21, 4.954, 0.104, 1.239, 0.624, 0.543], [-0.516, -4.249, 0.544, 1.157, 1.197, 1.269], [1.294, 2.428, 0.861, 1.276, 0.579, 0.451], [1.569, -1.608, 0.36, 0.726, 1.508, 0.636], [0.656, 1.11, -0.004, 0.679, 1.224, 0.752], [0.999, -0.375, 0.707, 0.664, 1.131, 0.788], [0.529, -5.125, 0.2, 0.899, 0.951, 0.927], [-1.058, -1.898, 0.447, 0.976, 1.149, 0.369]]\nC: [[0.237, 2.908, 0.463, 0.898, 0.83, 0.718], [-0.876, 2.53, 0.52, 1.072, 0.924, 0.781], [-1.088, 4.721, 0.492, 0.991, 0.901, 0.767], [-0.583, -3.833, 0.327, 0.92, 
0.918, 0.773], [1.47, 1.953, 0.456, 0.894, 0.96, 0.795], [1.829, -1.442, 0.405, 1.045, 1.024, 0.748], [1.035, 0.766, 0.48, 0.857, 0.923, 0.799], [1.416, -0.318, 0.434, 1.021, 0.961, 0.774], [0.375, -5.051, 0.244, 0.861, 0.856, 0.761], [-0.588, -1.854, 0.411, 0.932, 0.952, 0.73]]\nD: [[0.31, 2.967, 0.642, 1.326, 0.654, 0.284], [-1.165, 2.181, 0.237, 1.304, 0.639, 0.395], [-0.721, 4.769, 0.925, 1.219, 0.928, 0.661], [-1.026, -3.416, 0.149, 0.806, 0.901, 0.778], [1.652, 1.761, 0.169, 1.115, 0.472, 1.022], [2.158, -1.036, 0.663, 0.749, 0.724, 1.014], [0.591, 0.853, 0.97, 1.294, 0.724, 0.816], [1.34, 0.03, 0.19, 1.304, 0.703, 0.552], [0.387, -4.975, 0.689, 0.413, 1.29, 0.685], [-0.424, -1.902, 0.121, 1.041, 0.562, 0.86]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the armchair in the scene. The camera pose information includes: the rotation matrix: [[0.974605, -0.106498, 0.196986], [-0.223762, -0.428932, 0.875185], [-0.008712, -0.897037, -0.44187]]; the translation vector: [2.006689, 0.552817, 1.711334], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.001, 2.948, 0.696, 1.051, 0.653, 1.017], [-1.218, 2.566, 0.334, 0.65, 1.143, 0.576], [-1.049, 4.68, 0.409, 0.92, 0.545, 1.203], [-0.848, -3.875, 0.54, 0.615, 0.936, 1.037], [1.041, 2.41, 0.443, 0.89, 0.787, 0.94], [1.624, -1.213, 0.004, 1.481, 1.102, 0.478], [1.287, 1.232, 0.545, 1.204, 1.228, 0.841], [1.818, -0.113, 0.532, 0.874, 0.706, 1.086], [0.092, -5.089, 0.693, 0.961, 0.819, 0.322], [-0.468, -1.491, 0.774, 0.97, 1.2, 1.024]]\nB: [[-0.126, 2.459, 0.086, 0.932, 1.114, 1.123], [-0.966, 2.226, 0.359, 1.537, 0.757, 0.462], [-1.21, 4.954, 0.104, 1.239, 0.624, 0.543], [-0.516, -4.249, 0.544, 1.157, 1.197, 1.269], [1.294, 2.428, 0.861, 1.276, 0.579, 0.451], [1.569, -1.608, 0.36, 0.726, 1.508, 0.636], [0.656, 1.11, -0.004, 0.679, 1.224, 0.752], [0.999, -0.375, 0.707, 0.664, 1.131, 0.788], [0.529, -5.125, 0.2, 0.899, 0.951, 0.927], [-1.058, -1.898, 0.447, 0.976, 1.149, 0.369]]\nC: [[0.237, 2.908, 0.463, 0.898, 0.83, 0.718], [-0.876, 2.53, 0.52, 1.072, 0.924, 0.781], [-1.088, 4.721, 0.492, 0.991, 0.901, 0.767], [-0.583, -3.833, 0.327, 0.92, 0.918, 0.773], [1.47, 1.953, 0.456, 0.894, 0.96, 0.795], [1.829, -1.442, 0.405, 1.045, 1.024, 0.748], [1.035, 0.766, 0.48, 0.857, 0.923, 0.799], [1.416, -0.318, 0.434, 1.021, 0.961, 0.774], [0.375, -5.051, 0.244, 0.861, 0.856, 0.761], [-0.588, -1.854, 0.411, 0.932, 0.952, 0.73]]\nD: [[0.31, 2.967, 0.642, 1.326, 0.654, 0.284], [-1.165, 2.181, 0.237, 1.304, 0.639, 0.395], [-0.721, 4.769, 0.925, 1.219, 0.928, 0.661], [-1.026, -3.416, 0.149, 0.806, 0.901, 0.778], [1.652, 1.761, 0.169, 1.115, 0.472, 1.022], [2.158, -1.036, 0.663, 0.749, 0.724, 1.014], [0.591, 0.853, 0.97, 1.294, 0.724, 0.816], [1.34, 0.03, 0.19, 1.304, 0.703, 0.552], [0.387, -4.975, 0.689, 0.413, 1.29, 0.685], [-0.424, -1.902, 0.121, 1.041, 0.562, 0.86]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_57_0.jpg", + 
"../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_57_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.945, -0.877, -0.123, 1.546, 1.347, 0.112]]\nB: [[0.857, -1.188, 0.118, 1.684, 1.025, 0.263]]\nC: [[0.995, -0.398, -0.499, 1.213, 1.119, -0.036]]\nD: [[0.539, -0.587, -0.275, 1.351, 1.1, -0.319]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. The camera pose information includes: the rotation matrix: [[-0.693623, 0.392298, -0.604144], [0.720137, 0.397492, -0.568686], [0.017048, -0.82952, -0.558217]]; the translation vector: [2.706242, 2.586761, 1.453005], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.945, -0.877, -0.123, 1.546, 1.347, 0.112]]\nB: [[0.857, -1.188, 0.118, 1.684, 1.025, 0.263]]\nC: [[0.995, -0.398, -0.499, 1.213, 1.119, -0.036]]\nD: [[0.539, -0.587, -0.275, 1.351, 1.1, -0.319]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_58_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_58_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.143, 1.575, 0.04, 0.27, 0.312, 0.15], [-0.377, 3.615, 0.346, 0.295, 0.569, 0.473], [-1.45, 0.655, 0.569, 0.701, 0.954, 0.318], [2.369, -0.854, 0.716, 0.861, 0.372, 0.384], [3.488, -0.639, 0.515, 0.374, 0.717, 0.642], [3.31, -2.144, 0.849, 0.423, 1.039, 0.351], [3.753, -1.021, 0.778, 0.709, 0.641, 0.792], [-1.949, 2.854, 0.124, 0.704, 0.146, 0.084]]\nB: [[-1.76, 1.841, 0.53, 0.73, 0.674, 0.544], [-0.679, 3.307, 0.476, 0.669, 0.734, 0.506], [-1.7, 0.568, 0.465, 0.694, 0.62, 0.52], [2.474, -1.195, 0.409, 0.606, 0.509, 0.689], [3.174, -0.614, 0.339, 0.542, 0.599, 0.782], [3.186, -2.158, 0.546, 0.503, 0.633, 0.516], [3.901, -1.236, 0.485, 0.592, 0.545, 0.635], [-1.787, 2.437, 0.508, 0.713, 0.589, 0.468]]\nC: [[-2.143, 1.685, 0.995, 0.615, 0.904, 0.263], [-1.005, 3.628, 0.394, 0.466, 0.405, 0.998], [-2.179, 0.615, 0.333, 0.233, 0.298, 0.889], [2.014, -1.057, 0.599, 0.68, 0.338, 0.974], [2.918, -0.471, 0.1, 0.575, 0.71, 0.376], [3.127, -2.436, 0.498, 0.497, 0.327, 0.902], [3.486, -1.558, 0.63, 0.593, 0.23, 0.81], [-1.407, 2.857, 0.881, 0.499, 1.07, 0.68]]\nD: [[-1.261, 2.229, 0.998, 1.215, 1.048, 0.703], [-1.092, 3.457, -0.005, 0.668, 1.114, 0.663], [-1.477, 0.865, 0.817, 0.301, 0.363, 0.292], [2.93, -1.308, 0.561, 1.073, 0.232, 1.069], [3.634, -0.503, -0.085, 0.796, 0.476, 0.342], [3.396, -2.322, 0.932, 0.945, 0.812, 0.616], [4.075, -1.495, 0.312, 0.703, 0.562, 0.973], [-2.275, 
2.728, 0.786, 0.449, 0.77, 0.134]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[-0.891251, 0.378307, -0.25011], [0.443048, 0.608538, -0.658323], [-0.096846, -0.697542, -0.709969]]; the translation vector: [4.935522, 3.588868, 1.45033], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-2.143, 1.575, 0.04, 0.27, 0.312, 0.15], [-0.377, 3.615, 0.346, 0.295, 0.569, 0.473], [-1.45, 0.655, 0.569, 0.701, 0.954, 0.318], [2.369, -0.854, 0.716, 0.861, 0.372, 0.384], [3.488, -0.639, 0.515, 0.374, 0.717, 0.642], [3.31, -2.144, 0.849, 0.423, 1.039, 0.351], [3.753, -1.021, 0.778, 0.709, 0.641, 0.792], [-1.949, 2.854, 0.124, 0.704, 0.146, 0.084]]\nB: [[-1.76, 1.841, 0.53, 0.73, 0.674, 0.544], [-0.679, 3.307, 0.476, 0.669, 0.734, 0.506], [-1.7, 0.568, 0.465, 0.694, 0.62, 0.52], [2.474, -1.195, 0.409, 0.606, 0.509, 0.689], [3.174, -0.614, 0.339, 0.542, 0.599, 0.782], [3.186, -2.158, 0.546, 0.503, 0.633, 0.516], [3.901, -1.236, 0.485, 0.592, 0.545, 0.635], [-1.787, 2.437, 0.508, 0.713, 0.589, 0.468]]\nC: [[-2.143, 1.685, 0.995, 0.615, 0.904, 0.263], [-1.005, 3.628, 0.394, 0.466, 0.405, 0.998], [-2.179, 0.615, 0.333, 0.233, 0.298, 0.889], [2.014, -1.057, 0.599, 0.68, 0.338, 0.974], [2.918, -0.471, 0.1, 0.575, 0.71, 0.376], [3.127, -2.436, 0.498, 0.497, 0.327, 0.902], [3.486, -1.558, 0.63, 0.593, 0.23, 0.81], [-1.407, 2.857, 0.881, 0.499, 1.07, 0.68]]\nD: [[-1.261, 2.229, 0.998, 
1.215, 1.048, 0.703], [-1.092, 3.457, -0.005, 0.668, 1.114, 0.663], [-1.477, 0.865, 0.817, 0.301, 0.363, 0.292], [2.93, -1.308, 0.561, 1.073, 0.232, 1.069], [3.634, -0.503, -0.085, 0.796, 0.476, 0.342], [3.396, -2.322, 0.932, 0.945, 0.812, 0.616], [4.075, -1.495, 0.312, 0.703, 0.562, 0.973], [-2.275, 2.728, 0.786, 0.449, 0.77, 0.134]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_59_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_59_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.205, 1.797, 1.094, 1.154, 0.654, 1.112], [0.658, 1.295, 1.132, 1.277, 0.037, 0.601]]\nB: [[-0.63, 1.531, 1.06, 1.175, 0.329, 0.727], [0.734, 1.578, 0.984, 1.15, 0.266, 0.361]]\nC: [[-0.726, 1.106, 1.434, 1.522, 0.658, 0.308], [1.201, 1.481, 1.246, 0.828, 0.067, 0.371]]\nD: [[-0.719, 1.086, 1.264, 0.78, 0.793, 0.35], [0.868, 1.98, 0.75, 1.049, 0.201, 0.363]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the window in the scene. The camera pose information includes: the rotation matrix: [[0.081815, 0.638296, -0.765431], [0.996577, -0.061545, 0.055199], [-0.011875, -0.767327, -0.641146]]; the translation vector: [3.004073, 1.570726, 1.431248], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.205, 1.797, 1.094, 1.154, 0.654, 1.112], [0.658, 1.295, 1.132, 1.277, 0.037, 0.601]]\nB: [[-0.63, 1.531, 1.06, 1.175, 0.329, 0.727], [0.734, 1.578, 0.984, 1.15, 0.266, 0.361]]\nC: [[-0.726, 1.106, 1.434, 1.522, 0.658, 0.308], [1.201, 1.481, 1.246, 0.828, 0.067, 0.371]]\nD: [[-0.719, 1.086, 1.264, 0.78, 0.793, 0.35], [0.868, 1.98, 0.75, 1.049, 0.201, 0.363]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_60_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_60_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.288, 3.663, 0.885, 1.742, 0.266, 1.508], [-1.19, -3.017, 0.715, 0.179, 0.346, 1.183], [-2.568, -0.991, 0.862, 0.362, 2.842, 1.652], [-2.356, 0.313, 1.087, 0.573, 0.323, 1.151], [-2.078, 0.891, 0.947, 0.102, 1.17, 1.498]]\nB: [[1.699, 3.879, 0.807, 1.466, 0.26, 1.658], [-1.338, -3.016, 0.481, -0.091, 0.125, 1.346], [-2.791, -0.651, 0.722, 0.588, 2.782, 1.444], [-2.133, -0.174, 1.179, 0.831, 0.459, 1.476], [-2.564, 1.303, 0.485, 0.444, 1.6, 1.79]]\nC: [[1.384, 3.837, 1.191, 2.116, 0.64, 1.217], [-1.185, -3.083, 1.042, 0.674, 0.205, 0.788], [-2.424, -0.728, 0.743, -0.005, 2.436, 1.937], [-2.645, -0.046, 0.933, 0.095, 0.125, 1.323], [-2.483, 0.961, 0.887, 0.154, 0.979, 1.595]]\nD: [[1.755, 3.461, 0.788, 1.786, 0.256, 1.208], [-1.596, -3.184, 0.789, 0.372, -0.041, 1.319], [-2.923, -1.052, 1.266, 0.216, 3.322, 1.837], [-2.525, 0.237, 1.346, 0.938, 0.473, 0.759], [-2.569, 1.257, 0.568, 0.003, 1.424, 1.337]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. 
The camera pose information includes: the rotation matrix: [[-0.963317, 0.154363, -0.219528], [0.260086, 0.335369, -0.905474], [-0.066149, -0.929355, -0.363214]]; the translation vector: [5.972451, 2.818726, 1.468896], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.288, 3.663, 0.885, 1.742, 0.266, 1.508], [-1.19, -3.017, 0.715, 0.179, 0.346, 1.183], [-2.568, -0.991, 0.862, 0.362, 2.842, 1.652], [-2.356, 0.313, 1.087, 0.573, 0.323, 1.151], [-2.078, 0.891, 0.947, 0.102, 1.17, 1.498]]\nB: [[1.699, 3.879, 0.807, 1.466, 0.26, 1.658], [-1.338, -3.016, 0.481, -0.091, 0.125, 1.346], [-2.791, -0.651, 0.722, 0.588, 2.782, 1.444], [-2.133, -0.174, 1.179, 0.831, 0.459, 1.476], [-2.564, 1.303, 0.485, 0.444, 1.6, 1.79]]\nC: [[1.384, 3.837, 1.191, 2.116, 0.64, 1.217], [-1.185, -3.083, 1.042, 0.674, 0.205, 0.788], [-2.424, -0.728, 0.743, -0.005, 2.436, 1.937], [-2.645, -0.046, 0.933, 0.095, 0.125, 1.323], [-2.483, 0.961, 0.887, 0.154, 0.979, 1.595]]\nD: [[1.755, 3.461, 0.788, 1.786, 0.256, 1.208], [-1.596, -3.184, 0.789, 0.372, -0.041, 1.319], [-2.923, -1.052, 1.266, 0.216, 3.322, 1.837], [-2.525, 0.237, 1.346, 0.938, 0.473, 0.759], [-2.569, 1.257, 0.568, 0.003, 1.424, 1.337]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_61_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_61_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_bbox_detection", + "options": "A: [[-1.195, 2.616, 0.764, 0.381, 0.904, 1.179], [1.178, 2.791, 1.06, 0.523, 1.876, 0.795], [2.137, -1.926, -0.065, 1.185, 1.084, 0.312], [-0.599, -1.923, 0.519, 0.294, 0.992, 1.206]]\nB: [[-1.068, 2.261, 1.292, 0.971, 1.029, 0.824], [1.489, 3.189, 0.594, 0.774, 1.161, 1.156], [1.843, -2.018, 0.792, 0.473, 1.372, 0.616], [-0.424, -2.012, 0.649, 0.681, 1.809, 0.928]]\nC: [[-0.884, 2.735, 0.809, 0.721, 1.274, 0.906], [1.496, 2.941, 0.628, 0.833, 1.571, 0.871], [1.868, -1.949, 0.395, 0.878, 0.892, 0.788], [-0.793, -2.3, 0.359, 0.741, 1.335, 0.757]]\nD: [[-1.225, 2.279, 0.349, 0.295, 0.789, 0.542], [1.131, 2.464, 0.4, 0.503, 1.911, 0.903], [1.57, -1.598, -0.016, 1.245, 1.391, 0.466], [-0.41, -2.691, 0.26, 1.21, 1.681, 0.98]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the couch in the scene. The camera pose information includes: the rotation matrix: [[-0.824719, -0.175736, 0.537546], [-0.564369, 0.316962, -0.762249], [-0.036427, -0.932015, -0.360584]]; the translation vector: [4.397487, 4.054199, 1.411764], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.195, 2.616, 0.764, 0.381, 0.904, 1.179], [1.178, 2.791, 1.06, 0.523, 1.876, 0.795], [2.137, -1.926, -0.065, 1.185, 1.084, 0.312], [-0.599, -1.923, 0.519, 0.294, 0.992, 1.206]]\nB: [[-1.068, 2.261, 1.292, 0.971, 1.029, 0.824], [1.489, 3.189, 0.594, 0.774, 1.161, 1.156], [1.843, -2.018, 0.792, 0.473, 1.372, 0.616], [-0.424, -2.012, 0.649, 0.681, 1.809, 0.928]]\nC: [[-0.884, 2.735, 0.809, 0.721, 1.274, 0.906], [1.496, 2.941, 0.628, 0.833, 1.571, 0.871], [1.868, -1.949, 0.395, 0.878, 0.892, 0.788], [-0.793, -2.3, 0.359, 0.741, 1.335, 0.757]]\nD: [[-1.225, 2.279, 0.349, 0.295, 0.789, 0.542], [1.131, 2.464, 0.4, 0.503, 1.911, 0.903], [1.57, -1.598, -0.016, 1.245, 1.391, 0.466], [-0.41, -2.691, 0.26, 1.21, 1.681, 0.98]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_62_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_62_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.115, -1.562, 0.911, 0.99, 0.176, 0.93], [0.903, -1.409, 1.128, 0.969, 0.316, 0.989]]\nB: [[0.419, -1.51, 1.351, 0.838, 0.104, 1.371], [0.797, -1.27, 0.95, 0.885, 0.56, 0.709]]\nC: [[-0.056, -1.325, 0.584, 1.397, 0.105, 0.436], [0.94, -1.633, 1.178, 0.648, 0.434, 1.044]]\nD: [[0.609, -1.088, 0.429, 1.463, 0.186, 0.54], [0.51, -1.323, 0.699, 1.115, 0.814, 1.432]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the window in the scene. The camera pose information includes: the rotation matrix: [[0.993805, -0.057016, 0.095394], [-0.110597, -0.423109, 0.899304], [-0.010913, -0.904283, -0.426794]]; the translation vector: [3.282054, 2.568905, 1.512321], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.115, -1.562, 0.911, 0.99, 0.176, 0.93], [0.903, -1.409, 1.128, 0.969, 0.316, 0.989]]\nB: [[0.419, -1.51, 1.351, 0.838, 0.104, 1.371], [0.797, -1.27, 0.95, 0.885, 0.56, 0.709]]\nC: [[-0.056, -1.325, 0.584, 1.397, 0.105, 0.436], [0.94, -1.633, 1.178, 0.648, 0.434, 1.044]]\nD: [[0.609, -1.088, 0.429, 1.463, 0.186, 0.54], [0.51, -1.323, 0.699, 1.115, 0.814, 1.432]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_63_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_63_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.753, 0.465, 1.403, 0.46, 4.996, 2.959], [-1.738, -1.218, 1.272, 1.062, 1.528, 2.501], [-0.405, 2.797, 1.433, 4.292, 0.332, 2.875], [-2.525, 2.379, 1.355, 0.074, 0.839, 1.664], [-2.109, 0.693, 1.971, 0.227, 2.364, 1.533], [0.282, -2.054, 1.197, 3.118, 0.28, 2.272], [0.151, -2.857, 1.262, 0.294, 1.776, 2.355]]\nB: [[1.644, 0.604, 1.06, 0.766, 5.332, 3.344], [-2.097, -1.683, 0.861, 1.264, 1.832, 2.762], [-0.249, 2.988, 1.171, 4.734, 0.777, 3.234], [-2.915, 2.214, 1.5, 0.285, 1.098, 1.997], [-1.7, 0.54, 1.692, 0.479, 2.794, 1.178], [0.193, -1.942, 1.679, 3.173, -0.143, 2.182], [0.278, -3.151, 1.749, -0.197, 1.898, 2.594]]\nC: [[1.329, 0.268, 1.849, 0.784, 4.719, 2.961], [-2.126, -1.458, 1.073, 0.788, 1.484, 2.789], [0.005, 2.714, 1.367, 3.948, 0.242, 2.522], [-2.545, 2.463, 1.604, 0.21, 1.144, 1.521], [-1.811, 0.332, 2.299, -0.123, 
1.943, 1.085], [0.387, -2.373, 0.727, 2.861, -0.215, 2.059], [0.369, -3.057, 1.007, 0.316, 1.439, 1.965]]\nD: [[2.224, 0.601, 1.81, 0.297, 5.34, 2.544], [-1.441, -1.038, 1.648, 1.209, 1.768, 2.642], [-0.455, 2.785, 1.909, 4.119, 0.083, 3.179], [-2.32, 2.048, 1.417, -0.178, 0.398, 1.998], [-1.992, 0.619, 1.973, 0.251, 2.119, 1.3], [0.593, -1.72, 1.138, 2.733, 0.423, 2.378], [0.595, -2.424, 1.148, 0.122, 2.12, 2.512]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.994136, 0.036629, -0.101745], [0.107123, -0.462198, 0.880283], [-0.014782, -0.88602, -0.463411]]; the translation vector: [3.8191, 1.340951, 1.354002], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.753, 0.465, 1.403, 0.46, 4.996, 2.959], [-1.738, -1.218, 1.272, 1.062, 1.528, 2.501], [-0.405, 2.797, 1.433, 4.292, 0.332, 2.875], [-2.525, 2.379, 1.355, 0.074, 0.839, 1.664], [-2.109, 0.693, 1.971, 0.227, 2.364, 1.533], [0.282, -2.054, 1.197, 3.118, 0.28, 2.272], [0.151, -2.857, 1.262, 0.294, 1.776, 2.355]]\nB: [[1.644, 0.604, 1.06, 0.766, 5.332, 3.344], [-2.097, -1.683, 0.861, 1.264, 1.832, 2.762], [-0.249, 2.988, 1.171, 4.734, 0.777, 3.234], [-2.915, 2.214, 1.5, 0.285, 1.098, 1.997], [-1.7, 0.54, 1.692, 0.479, 2.794, 1.178], [0.193, -1.942, 1.679, 3.173, -0.143, 2.182], [0.278, -3.151, 1.749, -0.197, 1.898, 2.594]]\nC: [[1.329, 0.268, 1.849, 0.784, 4.719, 2.961], [-2.126, -1.458, 1.073, 0.788, 1.484, 2.789], [0.005, 2.714, 1.367, 3.948, 0.242, 2.522], [-2.545, 2.463, 1.604, 0.21, 1.144, 1.521], [-1.811, 0.332, 2.299, -0.123, 1.943, 1.085], [0.387, -2.373, 0.727, 2.861, -0.215, 2.059], [0.369, -3.057, 1.007, 0.316, 1.439, 1.965]]\nD: [[2.224, 0.601, 1.81, 0.297, 5.34, 2.544], [-1.441, -1.038, 1.648, 1.209, 1.768, 2.642], [-0.455, 2.785, 1.909, 4.119, 0.083, 3.179], [-2.32, 2.048, 1.417, -0.178, 0.398, 1.998], [-1.992, 0.619, 1.973, 0.251, 2.119, 1.3], [0.593, -1.72, 1.138, 2.733, 0.423, 2.378], [0.595, -2.424, 1.148, 0.122, 2.12, 2.512]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_64_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_64_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.229, -0.625, 1.789, 0.384, 0.619, 0.577], [-0.619, -2.375, 0.661, 0.567, 0.451, 0.111], [1.272, -2.43, 0.132, 0.447, 0.727, 0.3], [1.644, -2.684, 0.788, 0.577, 0.322, 0.308], [-0.576, -2.651, 0.078, 0.434, 0.523, 0.246], [0.194, -2.629, 0.081, 0.411, 0.494, 0.232], [0.495, -2.465, 0.021, 0.42, 0.459, 0.089], [-0.19, -2.648, 0.019, 
0.384, 0.512, 0.116], [0.653, -2.701, 0.729, 0.282, 0.339, 0.17], [0.952, -2.696, 0.744, 0.349, 0.322, 0.22], [1.255, -2.766, 0.79, 0.368, 0.486, 0.32], [0.1, -2.746, 0.715, 0.314, 0.34, 0.156], [0.399, -2.748, 0.688, 0.296, 0.348, 0.103], [-0.127, -2.741, 0.687, 0.255, 0.368, 0.104], [-0.41, -2.745, 0.703, 0.341, 0.369, 0.132], [-1.782, -2.695, 0.512, 0.572, 0.485, 0.241], [-1.365, -2.686, 0.5, 0.492, 0.488, 0.277], [-1.027, -2.616, 0.39, 0.417, 0.378, 0.292], [-2.221, -0.682, 1.308, 0.347, 0.518, 0.497]]\nB: [[-2.177, -0.167, 1.791, 0.429, 0.305, 0.461], [-0.533, -2.428, 0.376, 0.625, 0.83, -0.181], [0.833, -2.571, -0.115, 0.273, 1.148, 0.126], [1.611, -2.253, 0.787, 0.359, 0.551, -0.134], [-1.058, -2.229, -0.315, 0.638, 0.268, -0.067], [0.333, -2.804, -0.071, 0.337, 0.161, 0.002], [0.886, -2.763, 0.464, 0.54, 0.824, 0.171], [0.083, -2.871, 0.059, 0.444, 0.352, 0.054], [0.665, -2.763, 0.558, 0.057, 0.308, 0.039], [0.563, -2.607, 1.101, 0.044, -0.169, 0.664], [1.085, -2.593, 0.464, 0.42, 0.951, 0.013], [-0.365, -2.365, 0.619, 0.59, 0.077, 0.369], [0.543, -2.864, 0.581, 0.554, 0.644, -0.05], [0.36, -3.102, 0.746, 0.301, -0.13, -0.221], [-0.22, -2.771, 1.165, 0.154, 0.295, 0.195], [-1.434, -2.444, 0.547, 0.734, 0.246, -0.108], [-0.997, -2.269, 0.094, 0.441, 0.845, 0.283], [-1.137, -2.213, 0.312, 0.148, 0.309, 0.772], [-2.498, -0.603, 1.369, 0.752, 0.555, 0.615]]\nC: [[-2.33, -1.056, 1.723, -0.065, 0.432, 0.415], [-0.968, -1.986, 0.569, 0.909, 0.497, 0.486], [1.32, -2.346, -0.114, 0.554, 0.588, 0.715], [1.949, -3.024, 0.857, 1.07, 0.018, 0.558], [-0.719, -2.255, 0.515, 0.899, 0.995, 0.643], [0.265, -2.28, -0.308, 0.384, 0.12, 0.468], [0.651, -2.056, -0.288, 0.45, 0.167, 0.402], [-0.058, -2.555, -0.352, 0.064, 0.242, 0.36], [0.516, -2.537, 1.033, 0.148, 0.192, 0.352], [0.983, -2.457, 0.904, -0.004, -0.102, -0.046], [1.601, -2.407, 0.354, 0.85, 0.773, 0.225], [0.09, -3.129, 0.278, 0.778, 0.065, 0.089], [0.498, -3.096, 0.49, 0.127, 0.025, 0.421], [0.282, -2.893, 0.585, 
0.538, -0.078, 0.192], [-0.775, -2.875, 0.541, 0.822, 0.042, 0.614], [-1.444, -2.829, 0.956, 0.56, 0.015, 0.186], [-1.857, -2.941, 0.896, 0.404, 0.313, 0.437], [-1.23, -2.427, -0.01, 0.121, 0.029, 0.052], [-2.105, -0.861, 1.621, 0.843, 0.939, 0.137]]\nD: [[-2.187, -0.629, 2.131, 0.702, 0.488, 0.299], [-1.084, -2.454, 0.389, 0.263, 0.376, -0.0], [0.958, -2.794, -0.355, 0.189, 0.618, 0.078], [2.015, -2.977, 0.616, 0.785, -0.119, 0.807], [-0.732, -2.52, -0.405, 0.133, 0.556, -0.136], [0.099, -2.242, 0.21, 0.448, 0.703, 0.555], [0.296, -2.758, -0.175, 0.146, 0.559, 0.119], [0.107, -2.903, 0.259, 0.508, 0.683, 0.189], [0.807, -2.213, 0.988, -0.022, 0.827, 0.39], [1.137, -2.436, 0.849, 0.615, -0.156, -0.078], [1.291, -2.816, 0.462, 0.333, 0.002, 0.188], [-0.057, -2.486, 0.271, 0.707, 0.496, -0.343], [0.312, -2.462, 0.382, 0.486, 0.393, -0.299], [-0.367, -3.213, 1.027, 0.397, 0.32, -0.33], [-0.12, -2.591, 0.295, 0.767, -0.13, -0.295], [-1.934, -2.605, 0.653, 0.958, 0.354, 0.257], [-1.101, -2.538, 0.202, 0.148, 0.769, 0.141], [-0.928, -2.714, 0.387, 0.917, 0.787, 0.443], [-1.94, -0.799, 1.262, 0.381, 0.02, 0.723]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the box in the scene. The camera pose information includes: the rotation matrix: [[0.983299, 0.047874, -0.175588], [0.180439, -0.382417, 0.9062], [-0.023764, -0.922749, -0.384668]]; the translation vector: [2.208684, 3.483128, 1.468268], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.229, -0.625, 1.789, 0.384, 0.619, 0.577], [-0.619, -2.375, 0.661, 0.567, 0.451, 0.111], [1.272, -2.43, 0.132, 0.447, 0.727, 0.3], [1.644, -2.684, 0.788, 0.577, 0.322, 0.308], [-0.576, -2.651, 0.078, 0.434, 0.523, 0.246], [0.194, -2.629, 0.081, 0.411, 0.494, 0.232], [0.495, -2.465, 0.021, 0.42, 0.459, 0.089], [-0.19, -2.648, 0.019, 0.384, 0.512, 0.116], [0.653, -2.701, 0.729, 0.282, 0.339, 0.17], [0.952, -2.696, 0.744, 0.349, 0.322, 0.22], [1.255, -2.766, 0.79, 0.368, 0.486, 0.32], [0.1, -2.746, 0.715, 0.314, 0.34, 0.156], [0.399, -2.748, 0.688, 0.296, 0.348, 0.103], [-0.127, -2.741, 0.687, 0.255, 0.368, 0.104], [-0.41, -2.745, 0.703, 0.341, 0.369, 0.132], [-1.782, -2.695, 0.512, 0.572, 0.485, 0.241], [-1.365, -2.686, 0.5, 0.492, 0.488, 0.277], [-1.027, -2.616, 0.39, 0.417, 0.378, 0.292], [-2.221, -0.682, 1.308, 0.347, 0.518, 0.497]]\nB: [[-2.177, -0.167, 1.791, 0.429, 0.305, 0.461], [-0.533, -2.428, 0.376, 0.625, 0.83, -0.181], [0.833, -2.571, -0.115, 0.273, 1.148, 0.126], [1.611, -2.253, 0.787, 0.359, 0.551, -0.134], [-1.058, -2.229, -0.315, 0.638, 0.268, -0.067], [0.333, -2.804, -0.071, 0.337, 0.161, 0.002], [0.886, -2.763, 0.464, 0.54, 0.824, 0.171], [0.083, -2.871, 0.059, 0.444, 0.352, 0.054], [0.665, -2.763, 0.558, 0.057, 0.308, 0.039], [0.563, -2.607, 1.101, 0.044, -0.169, 0.664], [1.085, -2.593, 0.464, 0.42, 0.951, 0.013], [-0.365, -2.365, 0.619, 0.59, 0.077, 0.369], [0.543, -2.864, 0.581, 0.554, 0.644, -0.05], [0.36, -3.102, 0.746, 0.301, -0.13, -0.221], [-0.22, -2.771, 1.165, 0.154, 0.295, 0.195], [-1.434, -2.444, 0.547, 0.734, 0.246, -0.108], [-0.997, -2.269, 0.094, 0.441, 0.845, 0.283], [-1.137, -2.213, 0.312, 0.148, 0.309, 0.772], [-2.498, -0.603, 1.369, 0.752, 0.555, 0.615]]\nC: [[-2.33, -1.056, 1.723, -0.065, 0.432, 0.415], [-0.968, -1.986, 0.569, 0.909, 0.497, 0.486], [1.32, -2.346, -0.114, 0.554, 0.588, 0.715], [1.949, -3.024, 0.857, 1.07, 0.018, 0.558], [-0.719, -2.255, 0.515, 0.899, 0.995, 0.643], 
[0.265, -2.28, -0.308, 0.384, 0.12, 0.468], [0.651, -2.056, -0.288, 0.45, 0.167, 0.402], [-0.058, -2.555, -0.352, 0.064, 0.242, 0.36], [0.516, -2.537, 1.033, 0.148, 0.192, 0.352], [0.983, -2.457, 0.904, -0.004, -0.102, -0.046], [1.601, -2.407, 0.354, 0.85, 0.773, 0.225], [0.09, -3.129, 0.278, 0.778, 0.065, 0.089], [0.498, -3.096, 0.49, 0.127, 0.025, 0.421], [0.282, -2.893, 0.585, 0.538, -0.078, 0.192], [-0.775, -2.875, 0.541, 0.822, 0.042, 0.614], [-1.444, -2.829, 0.956, 0.56, 0.015, 0.186], [-1.857, -2.941, 0.896, 0.404, 0.313, 0.437], [-1.23, -2.427, -0.01, 0.121, 0.029, 0.052], [-2.105, -0.861, 1.621, 0.843, 0.939, 0.137]]\nD: [[-2.187, -0.629, 2.131, 0.702, 0.488, 0.299], [-1.084, -2.454, 0.389, 0.263, 0.376, -0.0], [0.958, -2.794, -0.355, 0.189, 0.618, 0.078], [2.015, -2.977, 0.616, 0.785, -0.119, 0.807], [-0.732, -2.52, -0.405, 0.133, 0.556, -0.136], [0.099, -2.242, 0.21, 0.448, 0.703, 0.555], [0.296, -2.758, -0.175, 0.146, 0.559, 0.119], [0.107, -2.903, 0.259, 0.508, 0.683, 0.189], [0.807, -2.213, 0.988, -0.022, 0.827, 0.39], [1.137, -2.436, 0.849, 0.615, -0.156, -0.078], [1.291, -2.816, 0.462, 0.333, 0.002, 0.188], [-0.057, -2.486, 0.271, 0.707, 0.496, -0.343], [0.312, -2.462, 0.382, 0.486, 0.393, -0.299], [-0.367, -3.213, 1.027, 0.397, 0.32, -0.33], [-0.12, -2.591, 0.295, 0.767, -0.13, -0.295], [-1.934, -2.605, 0.653, 0.958, 0.354, 0.257], [-1.101, -2.538, 0.202, 0.148, 0.769, 0.141], [-0.928, -2.714, 0.387, 0.917, 0.787, 0.443], [-1.94, -0.799, 1.262, 0.381, 0.02, 0.723]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_65_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_65_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.897, 0.522, 0.459, 3.876, 6.999, 0.483]]\nB: [[-1.103, 0.265, -0.086, 3.353, 6.924, 0.299]]\nC: [[-1.473, 0.764, 0.085, 3.499, 6.843, 
0.726]]\nD: [[-1.038, 0.389, 0.109, 3.778, 6.648, 0.286]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. The camera pose information includes: the rotation matrix: [[0.643628, -0.362528, 0.674031], [-0.765241, -0.290748, 0.574345], [-0.012243, -0.88546, -0.464555]]; the translation vector: [2.632762, 2.243425, 1.452714], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.897, 0.522, 0.459, 3.876, 6.999, 0.483]]\nB: [[-1.103, 0.265, -0.086, 3.353, 6.924, 0.299]]\nC: [[-1.473, 0.764, 0.085, 3.499, 6.843, 0.726]]\nD: [[-1.038, 0.389, 0.109, 3.778, 6.648, 0.286]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_66_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_66_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.591, 1.483, 1.058, 1.607, 0.601, 2.492], [-2.097, -1.408, 0.835, 0.5, 0.336, 2.389]]\nB: [[0.134, 2.153, 0.957, 1.663, 0.233, 2.443], [-2.254, -1.53, 0.609, 0.554, 0.988, 1.584]]\nC: [[-0.108, 1.926, 1.025, 1.195, 0.255, 2.095], [-1.989, -1.419, 0.985, 0.159, 0.822, 1.991]]\nD: [[0.21, 1.976, 1.369, 1.051, 0.491, 1.606], [-2.131, -1.497, 0.9, 0.432, 0.96, 1.514]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. 
The camera pose information includes: the rotation matrix: [[-0.925351, 0.122106, -0.358909], [0.376741, 0.190476, -0.906524], [-0.042329, -0.974068, -0.222259]]; the translation vector: [4.735593, 2.732706, 1.21643], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.591, 1.483, 1.058, 1.607, 0.601, 2.492], [-2.097, -1.408, 0.835, 0.5, 0.336, 2.389]]\nB: [[0.134, 2.153, 0.957, 1.663, 0.233, 2.443], [-2.254, -1.53, 0.609, 0.554, 0.988, 1.584]]\nC: [[-0.108, 1.926, 1.025, 1.195, 0.255, 2.095], [-1.989, -1.419, 0.985, 0.159, 0.822, 1.991]]\nD: [[0.21, 1.976, 1.369, 1.051, 0.491, 1.606], [-2.131, -1.497, 0.9, 0.432, 0.96, 1.514]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_67_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_67_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.347, 0.112, 0.25, 1.296, 0.747, 0.377], [-0.83, 0.341, 0.317, 1.198, 1.401, 0.615], [-0.794, -1.292, 0.479, 1.485, 1.035, 1.271], [0.127, 1.653, 0.718, 0.98, 0.518, 0.445]]\nB: [[1.529, 0.501, 0.301, 0.944, 1.585, 0.652], [-1.117, 0.064, 0.625, 0.722, 0.832, 1.296], [-1.406, -0.586, 0.491, 0.928, 1.591, 0.802], [0.516, 1.562, 0.345, 0.823, 1.417, 0.468]]\nC: [[1.382, -0.298, 0.162, 0.586, 1.271, 1.153], [-1.729, -0.043, 0.911, 1.507, 1.118, 1.281], [-0.888, -0.525, -0.057, 1.572, 1.192, 0.468], [-0.205, 1.524, 
0.606, 0.689, 0.914, 0.382]]\nD: [[1.322, 0.194, 0.453, 1.016, 1.117, 0.863], [-1.253, 0.172, 0.421, 1.029, 1.045, 0.876], [-1.049, -0.979, 0.44, 1.12, 1.131, 0.861], [0.221, 1.294, 0.424, 0.876, 0.92, 0.832]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the armchair in the scene. The camera pose information includes: the rotation matrix: [[0.748873, -0.374013, 0.547087], [-0.662404, -0.447673, 0.600675], [0.020256, -0.812221, -0.582998]]; the translation vector: [3.709567, 4.406117, 1.261793], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.347, 0.112, 0.25, 1.296, 0.747, 0.377], [-0.83, 0.341, 0.317, 1.198, 1.401, 0.615], [-0.794, -1.292, 0.479, 1.485, 1.035, 1.271], [0.127, 1.653, 0.718, 0.98, 0.518, 0.445]]\nB: [[1.529, 0.501, 0.301, 0.944, 1.585, 0.652], [-1.117, 0.064, 0.625, 0.722, 0.832, 1.296], [-1.406, -0.586, 0.491, 0.928, 1.591, 0.802], [0.516, 1.562, 0.345, 0.823, 1.417, 0.468]]\nC: [[1.382, -0.298, 0.162, 0.586, 1.271, 1.153], [-1.729, -0.043, 0.911, 1.507, 1.118, 1.281], [-0.888, -0.525, -0.057, 1.572, 1.192, 0.468], [-0.205, 1.524, 0.606, 0.689, 0.914, 0.382]]\nD: [[1.322, 0.194, 0.453, 1.016, 1.117, 0.863], [-1.253, 0.172, 0.421, 1.029, 1.045, 0.876], [-1.049, -0.979, 0.44, 1.12, 1.131, 0.861], [0.221, 1.294, 0.424, 0.876, 0.92, 0.832]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_68_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_68_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.579, -0.488, 0.759, 0.356, 0.729, 0.206], [-1.432, 0.41, 0.224, 0.816, -0.16, 0.054], [-1.128, 1.211, 0.876, 0.072, 0.472, -0.431], [-0.056, 1.335, 1.059, 0.219, -0.158, 0.294], [0.39, 0.373, 0.895, 0.659, 0.538, 0.377], [-1.237, 2.65, 0.314, 0.655, 0.335, -0.177]]\nB: [[-1.898, -0.166, 1.244, 0.693, 0.01, 0.135], [-2.054, 0.428, 0.961, 0.919, 0.356, 0.407], [-1.294, 1.065, 0.511, 0.811, -0.08, -0.323], [0.085, 0.558, 1.04, 0.703, -0.22, -0.384], [1.147, 0.956, 0.305, 0.157, 0.461, -0.367], [-1.796, 2.739, 0.408, 0.015, 0.305, -0.245]]\nC: [[-1.472, -0.634, 0.769, 0.41, 0.312, 0.075], [-1.766, 0.861, 0.684, 0.449, 0.16, 0.051], [-0.868, 0.879, 0.668, 0.414, 0.211, 0.046], [-0.148, 0.874, 0.644, 0.427, 0.151, 0.056], [0.744, 0.838, 0.607, 0.528, 0.174, 0.072], [-1.369, 2.612, 0.558, 0.426, 0.186, 0.029]]\nD: [[-1.326, -0.492, 0.759, 0.773, 0.113, -0.399], 
[-1.742, 0.884, 0.249, 0.825, 0.051, -0.219], [-0.59, 0.654, 0.814, 0.491, -0.041, -0.171], [-0.618, 1.322, 0.366, 0.807, 0.377, 0.225], [1.165, 1.152, 0.365, 0.032, 0.059, 0.012], [-1.206, 2.669, 0.552, 0.305, 0.052, 0.19]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the keyboard in the scene. The camera pose information includes: the rotation matrix: [[0.053762, 0.423971, -0.904079], [0.99709, -0.071809, 0.025618], [-0.05406, -0.902825, -0.426597]]; the translation vector: [3.696534, 7.381392, 1.65485], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.579, -0.488, 0.759, 0.356, 0.729, 0.206], [-1.432, 0.41, 0.224, 0.816, -0.16, 0.054], [-1.128, 1.211, 0.876, 0.072, 0.472, -0.431], [-0.056, 1.335, 1.059, 0.219, -0.158, 0.294], [0.39, 0.373, 0.895, 0.659, 0.538, 0.377], [-1.237, 2.65, 0.314, 0.655, 0.335, -0.177]]\nB: [[-1.898, -0.166, 1.244, 0.693, 0.01, 0.135], [-2.054, 0.428, 0.961, 0.919, 0.356, 0.407], [-1.294, 1.065, 0.511, 0.811, -0.08, -0.323], [0.085, 0.558, 1.04, 0.703, -0.22, -0.384], [1.147, 0.956, 0.305, 0.157, 0.461, -0.367], [-1.796, 2.739, 0.408, 0.015, 0.305, -0.245]]\nC: [[-1.472, -0.634, 0.769, 0.41, 0.312, 0.075], [-1.766, 0.861, 0.684, 0.449, 0.16, 0.051], [-0.868, 0.879, 0.668, 0.414, 0.211, 0.046], [-0.148, 0.874, 0.644, 0.427, 0.151, 0.056], [0.744, 0.838, 0.607, 0.528, 0.174, 0.072], [-1.369, 2.612, 0.558, 0.426, 0.186, 0.029]]\nD: [[-1.326, -0.492, 0.759, 0.773, 0.113, -0.399], [-1.742, 0.884, 0.249, 0.825, 0.051, -0.219], [-0.59, 0.654, 0.814, 0.491, -0.041, -0.171], [-0.618, 1.322, 0.366, 0.807, 0.377, 0.225], [1.165, 1.152, 0.365, 0.032, 0.059, 0.012], [-1.206, 2.669, 0.552, 0.305, 0.052, 0.19]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_69_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_69_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.062, 0.255, 0.974, 0.478, 0.305, 1.9]]\nB: [[0.289, 0.114, 0.997, 0.421, 0.269, 2.332]]\nC: [[-0.529, -0.167, 1.248, 0.711, 0.631, 1.869]]\nD: [[-0.117, 0.693, 1.129, 0.484, 0.656, 2.156]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shower curtain in the scene. 
The camera pose information includes: the rotation matrix: [[-0.95695, -0.100486, 0.272304], [-0.288986, 0.24231, -0.92616], [0.027085, -0.964981, -0.260918]]; the translation vector: [1.227478, 4.879099, 1.55452], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.062, 0.255, 0.974, 0.478, 0.305, 1.9]]\nB: [[0.289, 0.114, 0.997, 0.421, 0.269, 2.332]]\nC: [[-0.529, -0.167, 1.248, 0.711, 0.631, 1.869]]\nD: [[-0.117, 0.693, 1.129, 0.484, 0.656, 2.156]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_70_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_70_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.525, -2.231, 0.892, 0.349, 0.481, 0.091]]\nB: [[1.636, -2.317, 0.937, 0.292, 0.774, -0.26]]\nC: [[1.735, -2.218, 1.132, -0.039, 0.012, 0.228]]\nD: [[1.335, -2.53, 1.027, 0.634, 0.978, -0.278]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the book in the scene. The camera pose information includes: the rotation matrix: [[-0.863619, -0.252896, 0.436126], [-0.502889, 0.371124, -0.780621], [0.03556, -0.893482, -0.447688]]; the translation vector: [2.007098, 3.82416, 1.536992], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.525, -2.231, 0.892, 0.349, 0.481, 0.091]]\nB: [[1.636, -2.317, 0.937, 0.292, 0.774, -0.26]]\nC: [[1.735, -2.218, 1.132, -0.039, 0.012, 0.228]]\nD: [[1.335, -2.53, 1.027, 0.634, 0.978, -0.278]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_71_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_71_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-3.713, -2.322, 1.093, 0.437, 3.072, 2.045], [-1.081, -3.837, 1.495, 4.867, 0.501, 2.558], [2.258, -2.553, 1.174, 1.342, 1.564, 2.568], [3.141, 1.147, 1.716, 0.254, 5.221, 3.492], [1.44, -2.565, 1.73, 0.493, 2.338, 2.511], [1.459, -1.994, 0.755, 0.967, 0.884, 1.183], [1.27, 3.225, 1.429, 3.362, 0.06, 2.461], [2.687, -1.112, 0.928, 2.314, 0.606, 3.137], [3.573, 2.12, 0.945, -0.323, 1.165, 0.653]]\nB: [[-3.835, -1.629, 1.168, -0.265, 2.747, 2.333], [-1.318, -2.989, 1.688, 4.412, 0.48, 2.388], [2.689, -2.933, 1.545, 1.676, 2.18, 2.201], [3.228, 1.403, 1.452, 0.635, 4.562, 3.257], [1.389, -2.608, 0.976, 1.337, 2.222, 2.449], [1.683, -1.57, 0.448, 0.488, 1.125, 1.219], [1.286, 3.78, 1.634, 2.717, 0.735, 2.673], [2.077, -1.004, 0.831, 1.778, 0.571, 2.523], [3.367, 1.994, 0.998, 0.165, 1.01, 0.878]]\nC: [[-3.518, -1.854, 1.546, 0.215, 3.24, 2.228], [-1.249, -3.369, 1.199, 4.514, 0.422, 2.472], [2.581, -2.461, 1.261, 1.576, 1.946, 2.535], [3.098, 1.012, 1.522, 0.435, 4.946, 2.999], [1.343, -2.44, 1.234, 
0.869, 1.985, 2.527], [1.357, -2.033, 0.708, 0.777, 1.087, 1.434], [1.727, 3.433, 1.218, 3.174, 0.459, 2.415], [2.388, -1.448, 1.321, 1.857, 0.151, 2.689], [3.207, 2.39, 1.139, 0.116, 1.457, 0.447]]\nD: [[-3.315, -1.725, 1.076, -0.072, 3.369, 2.316], [-1.509, -2.948, 1.263, 4.588, 0.454, 2.009], [2.958, -2.942, 0.867, 1.911, 2.392, 2.127], [3.566, 0.671, 1.618, 0.253, 5.112, 3.1], [1.816, -2.011, 1.094, 0.402, 1.679, 2.148], [0.957, -1.668, 0.579, 1.105, 0.683, 1.586], [1.676, 3.716, 1.075, 3.204, 0.902, 2.406], [2.655, -1.717, 0.827, 1.883, 0.155, 2.358], [3.511, 2.562, 1.175, -0.348, 1.486, 0.553]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.831143, 0.312948, -0.459636], [0.555586, 0.43327, -0.709649], [-0.022937, -0.845187, -0.533978]]; the translation vector: [2.360292, 3.05803, 1.315354], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-3.713, -2.322, 1.093, 0.437, 3.072, 2.045], [-1.081, -3.837, 1.495, 4.867, 0.501, 2.558], [2.258, -2.553, 1.174, 1.342, 1.564, 2.568], [3.141, 1.147, 1.716, 0.254, 5.221, 3.492], [1.44, -2.565, 1.73, 0.493, 2.338, 2.511], [1.459, -1.994, 0.755, 0.967, 0.884, 1.183], [1.27, 3.225, 1.429, 3.362, 0.06, 2.461], [2.687, -1.112, 0.928, 2.314, 0.606, 3.137], [3.573, 2.12, 0.945, -0.323, 1.165, 0.653]]\nB: [[-3.835, -1.629, 1.168, -0.265, 2.747, 2.333], [-1.318, -2.989, 1.688, 4.412, 0.48, 2.388], [2.689, -2.933, 1.545, 1.676, 2.18, 2.201], [3.228, 1.403, 1.452, 0.635, 4.562, 3.257], [1.389, -2.608, 0.976, 1.337, 2.222, 2.449], [1.683, -1.57, 0.448, 0.488, 1.125, 1.219], [1.286, 3.78, 1.634, 2.717, 0.735, 2.673], [2.077, -1.004, 0.831, 1.778, 0.571, 2.523], [3.367, 1.994, 0.998, 0.165, 1.01, 0.878]]\nC: [[-3.518, -1.854, 1.546, 0.215, 3.24, 2.228], [-1.249, -3.369, 1.199, 4.514, 0.422, 2.472], [2.581, -2.461, 1.261, 1.576, 1.946, 2.535], [3.098, 1.012, 1.522, 0.435, 4.946, 2.999], [1.343, -2.44, 1.234, 0.869, 1.985, 2.527], [1.357, -2.033, 0.708, 0.777, 1.087, 1.434], [1.727, 3.433, 1.218, 3.174, 0.459, 2.415], [2.388, -1.448, 1.321, 1.857, 0.151, 2.689], [3.207, 2.39, 1.139, 0.116, 1.457, 0.447]]\nD: [[-3.315, -1.725, 1.076, -0.072, 3.369, 2.316], [-1.509, -2.948, 1.263, 4.588, 0.454, 2.009], [2.958, -2.942, 0.867, 1.911, 2.392, 2.127], [3.566, 0.671, 1.618, 0.253, 5.112, 3.1], [1.816, -2.011, 1.094, 0.402, 1.679, 2.148], [0.957, -1.668, 0.579, 1.105, 0.683, 1.586], [1.676, 3.716, 1.075, 3.204, 0.902, 2.406], [2.655, -1.717, 0.827, 1.883, 0.155, 2.358], [3.511, 2.562, 1.175, -0.348, 1.486, 0.553]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_72_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_72_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", 
+ "options": "A: [[1.046, -0.307, 0.362, 0.784, -0.024, 0.785], [1.165, -2.351, 1.138, 0.213, 0.736, 0.353], [1.15, -2.093, 0.722, 0.4, 0.33, 0.205], [1.084, -0.85, 1.131, 0.451, -0.085, 0.317], [1.331, -1.435, 0.691, 0.675, 0.723, 0.254], [-1.236, 0.563, -0.088, 0.27, -0.102, 0.794]]\nB: [[1.265, -0.056, 0.282, 0.326, 0.027, 0.886], [1.533, -2.203, 0.449, 0.341, 0.914, 0.835], [0.973, -1.818, 0.452, -0.205, -0.0, 0.557], [1.212, -0.809, 0.364, 0.233, 0.14, 0.279], [0.952, -0.74, 0.435, -0.133, 0.174, 0.554], [-1.162, 0.16, 0.691, 0.327, -0.202, 0.736]]\nC: [[1.057, -0.394, 0.235, 0.507, 0.4, 0.47], [1.152, -1.942, 0.923, 0.249, 0.43, 0.441], [1.185, -1.67, 0.793, 0.195, 0.105, 0.183], [0.815, -0.905, 0.823, 0.231, 0.165, 0.244], [0.988, -0.991, 0.818, 0.253, 0.25, 0.209], [-1.265, 0.61, 0.238, 0.204, 0.16, 0.435]]\nD: [[1.051, -0.65, -0.171, 0.578, 0.483, 0.109], [0.936, -1.859, 0.474, -0.087, 0.06, 0.148], [1.334, -2.107, 0.81, 0.465, 0.412, 0.633], [0.554, -0.966, 0.763, 0.354, 0.344, 0.116], [1.173, -0.543, 0.619, 0.486, 0.296, 0.039], [-1.019, 0.12, 0.267, -0.232, -0.155, 0.735]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[0.264492, -0.222038, 0.938479], [-0.962334, 0.002714, 0.271857], [-0.062909, -0.975034, -0.212957]]; the translation vector: [0.925816, 4.784833, 1.497389], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.046, -0.307, 0.362, 0.784, -0.024, 0.785], [1.165, -2.351, 1.138, 0.213, 0.736, 0.353], [1.15, -2.093, 0.722, 0.4, 0.33, 0.205], [1.084, -0.85, 1.131, 0.451, -0.085, 0.317], [1.331, -1.435, 0.691, 0.675, 0.723, 0.254], [-1.236, 0.563, -0.088, 0.27, -0.102, 0.794]]\nB: [[1.265, -0.056, 0.282, 0.326, 0.027, 0.886], [1.533, -2.203, 0.449, 0.341, 0.914, 0.835], [0.973, -1.818, 0.452, -0.205, -0.0, 0.557], [1.212, -0.809, 0.364, 0.233, 0.14, 0.279], [0.952, -0.74, 0.435, -0.133, 0.174, 0.554], [-1.162, 0.16, 0.691, 0.327, -0.202, 0.736]]\nC: [[1.057, -0.394, 0.235, 0.507, 0.4, 0.47], [1.152, -1.942, 0.923, 0.249, 0.43, 0.441], [1.185, -1.67, 0.793, 0.195, 0.105, 0.183], [0.815, -0.905, 0.823, 0.231, 0.165, 0.244], [0.988, -0.991, 0.818, 0.253, 0.25, 0.209], [-1.265, 0.61, 0.238, 0.204, 0.16, 0.435]]\nD: [[1.051, -0.65, -0.171, 0.578, 0.483, 0.109], [0.936, -1.859, 0.474, -0.087, 0.06, 0.148], [1.334, -2.107, 0.81, 0.465, 0.412, 0.633], [0.554, -0.966, 0.763, 0.354, 0.344, 0.116], [1.173, -0.543, 0.619, 0.486, 0.296, 0.039], [-1.019, 0.12, 0.267, -0.232, -0.155, 0.735]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_73_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_73_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.085, -0.215, 0.983, 0.671, 0.944, 0.637]]\nB: [[-1.195, -0.19, 1.175, 0.471, 1.343, 0.221]]\nC: [[-1.17, -0.298, 0.934, 0.962, 1.213, 0.413]]\nD: [[-1.39, -0.221, 0.693, 0.182, 1.277, 0.167]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the sink in the scene. 
The camera pose information includes: the rotation matrix: [[-0.409087, -0.112571, 0.905525], [-0.910894, 0.109148, -0.397943], [-0.05404, -0.987631, -0.147191]]; the translation vector: [4.421403, 3.579741, 1.526424], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.085, -0.215, 0.983, 0.671, 0.944, 0.637]]\nB: [[-1.195, -0.19, 1.175, 0.471, 1.343, 0.221]]\nC: [[-1.17, -0.298, 0.934, 0.962, 1.213, 0.413]]\nD: [[-1.39, -0.221, 0.693, 0.182, 1.277, 0.167]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_74_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_74_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.095, 1.592, 1.222, 1.568, 0.744, 2.142]]\nB: [[-0.877, 2.359, 1.301, 1.758, 0.807, 2.272]]\nC: [[-0.883, 2.133, 0.636, 0.867, 0.763, 2.547]]\nD: [[-1.101, 1.96, 1.128, 1.33, 0.454, 2.075]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the mirror doors in the scene. The camera pose information includes: the rotation matrix: [[-0.998134, -0.025826, -0.055325], [0.04389, 0.326427, -0.944203], [0.042444, -0.94487, -0.324684]]; the translation vector: [2.355182, 2.984659, 1.395898], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.095, 1.592, 1.222, 1.568, 0.744, 2.142]]\nB: [[-0.877, 2.359, 1.301, 1.758, 0.807, 2.272]]\nC: [[-0.883, 2.133, 0.636, 0.867, 0.763, 2.547]]\nD: [[-1.101, 1.96, 1.128, 1.33, 0.454, 2.075]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_75_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_75_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.917, 0.769, 0.393, 0.162, 0.916, 0.83], [2.158, 0.091, 1.885, 0.64, 0.369, 0.373]]\nB: [[1.7, 0.645, 0.863, 0.067, 1.221, 0.849], [2.328, 0.352, 2.246, 0.627, 0.498, 0.253]]\nC: [[1.798, 1.202, -0.093, 0.135, 0.516, 1.131], [2.029, 0.523, 2.037, 0.813, 0.35, 0.6]]\nD: [[1.675, 0.61, 0.316, -0.227, 0.481, 0.35], [2.253, 0.494, 1.51, 1.013, 0.177, 0.842]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the kitchen cabinet in the scene. The camera pose information includes: the rotation matrix: [[-0.399387, 0.327689, -0.856218], [0.9115, 0.041819, -0.409169], [-0.098274, -0.94386, -0.315391]]; the translation vector: [4.88233, 2.963563, 1.403722], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.917, 0.769, 0.393, 0.162, 0.916, 0.83], [2.158, 0.091, 1.885, 0.64, 0.369, 0.373]]\nB: [[1.7, 0.645, 0.863, 0.067, 1.221, 0.849], [2.328, 0.352, 2.246, 0.627, 0.498, 0.253]]\nC: [[1.798, 1.202, -0.093, 0.135, 0.516, 1.131], [2.029, 0.523, 2.037, 0.813, 0.35, 0.6]]\nD: [[1.675, 0.61, 0.316, -0.227, 0.481, 0.35], [2.253, 0.494, 1.51, 1.013, 0.177, 0.842]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_76_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_76_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.494, 1.559, 0.938, 0.656, 5.217, 2.076], [2.053, 1.519, 1.142, 0.022, 3.126, 1.696], [1.526, 0.314, 1.467, 0.388, 0.432, 1.965], [1.119, -0.324, 1.442, 0.427, 0.845, 2.198], [1.576, -0.035, 0.321, 0.045, 0.534, 1.117], [1.692, -0.958, 0.006, 0.205, 2.737, 0.651], [2.137, -2.631, 1.077, 0.184, 0.904, 1.508], [1.333, -3.255, 1.466, 0.791, -0.082, 1.662], [1.459, -3.425, 1.79, -0.348, -0.024, 0.359], [0.387, -3.416, 1.314, 3.646, -0.097, 1.995], [-1.825, -3.194, 1.168, 0.422, 1.404, 1.529], [-0.304, 4.179, 0.452, 3.264, -0.007, 1.066], [1.999, 3.872, 0.716, 0.498, 0.487, 1.358]]\nB: [[-1.693, 1.424, 1.03, 0.376, 5.083, 2.034], [1.765, 1.957, 1.138, 0.161, 3.199, 2.18], [1.589, 0.333, 0.987, 0.355, 0.095, 1.877], [1.425, 0.157, 1.015, 0.112, 0.477, 1.967], [1.63, -0.081, 0.672, 0.331, 0.259, 1.339], [1.705, -1.447, 0.484, 0.238, 2.779, 0.873], [1.951, -2.837, 1.012, 0.146, 0.69, 1.445], [1.797, -3.186, 1.022, 0.444, 0.092, 1.424], [1.591, 
-3.334, 1.384, 0.106, 0.324, 0.652], [-0.022, -3.519, 0.892, 3.311, 0.275, 1.699], [-1.705, -2.728, 0.676, 0.126, 1.402, 1.204], [-0.01, 3.839, 0.745, 3.147, 0.481, 1.347], [1.63, 3.568, 0.892, 0.411, 0.327, 1.532]]\nC: [[-1.26, 1.503, 0.893, 0.629, 5.544, 1.914], [2.175, 1.47, 1.47, 0.109, 3.382, 1.686], [1.982, -0.011, 0.916, 0.426, -0.326, 1.566], [1.181, 0.067, 1.21, 0.067, 0.005, 2.351], [1.524, 0.001, 0.471, 0.286, 0.408, 1.265], [1.238, -1.52, 0.419, 0.599, 3.184, 1.176], [1.553, -3.177, 0.653, 0.32, 0.427, 1.885], [1.383, -3.363, 1.432, 0.865, -0.009, 1.444], [1.288, -3.498, 1.769, -0.257, 0.218, 1.054], [0.393, -3.522, 1.337, 3.619, 0.242, 1.594], [-1.576, -3.113, 0.753, 0.379, 1.777, 1.195], [-0.268, 3.894, 0.852, 2.983, 0.721, 1.393], [1.465, 3.133, 0.435, 0.617, 0.63, 1.96]]\nD: [[-1.946, 1.454, 1.304, 0.285, 4.759, 1.584], [1.735, 2.118, 1.431, 0.5, 3.38, 2.198], [2.01, 0.269, 1.406, 0.118, -0.362, 2.255], [1.665, -0.294, 0.623, -0.295, 0.208, 2.363], [1.918, 0.112, 1.078, 0.599, 0.597, 0.896], [1.655, -1.698, 0.75, 0.063, 2.896, 0.441], [2.382, -2.981, 1.161, 0.203, 0.379, 1.162], [1.828, -2.97, 0.979, 0.706, -0.194, 1.801], [1.717, -3.159, 1.188, 0.204, 0.385, 0.448], [0.303, -3.389, 1.008, 3.649, 0.715, 1.331], [-1.467, -2.443, 0.641, 0.545, 0.903, 1.371], [-0.151, 3.761, 0.508, 3.288, 0.802, 1.225], [1.797, 3.579, 1.179, 0.009, 0.008, 1.708]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.810147, -0.229725, 0.539341], [-0.586224, 0.314131, -0.746769], [0.002128, -0.921167, -0.389162]]; the translation vector: [3.108561, 2.950706, 1.466118], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.494, 1.559, 0.938, 0.656, 5.217, 2.076], [2.053, 1.519, 1.142, 0.022, 3.126, 1.696], [1.526, 0.314, 1.467, 0.388, 0.432, 1.965], [1.119, -0.324, 1.442, 0.427, 0.845, 2.198], [1.576, -0.035, 0.321, 0.045, 0.534, 1.117], [1.692, -0.958, 0.006, 0.205, 2.737, 0.651], [2.137, -2.631, 1.077, 0.184, 0.904, 1.508], [1.333, -3.255, 1.466, 0.791, -0.082, 1.662], [1.459, -3.425, 1.79, -0.348, -0.024, 0.359], [0.387, -3.416, 1.314, 3.646, -0.097, 1.995], [-1.825, -3.194, 1.168, 0.422, 1.404, 1.529], [-0.304, 4.179, 0.452, 3.264, -0.007, 1.066], [1.999, 3.872, 0.716, 0.498, 0.487, 1.358]]\nB: [[-1.693, 1.424, 1.03, 0.376, 5.083, 2.034], [1.765, 1.957, 1.138, 0.161, 3.199, 2.18], [1.589, 0.333, 0.987, 0.355, 0.095, 1.877], [1.425, 0.157, 1.015, 0.112, 0.477, 1.967], [1.63, -0.081, 0.672, 0.331, 0.259, 1.339], [1.705, -1.447, 0.484, 0.238, 2.779, 0.873], [1.951, -2.837, 1.012, 0.146, 0.69, 1.445], [1.797, -3.186, 1.022, 0.444, 0.092, 1.424], [1.591, -3.334, 1.384, 0.106, 0.324, 0.652], [-0.022, -3.519, 0.892, 3.311, 0.275, 1.699], [-1.705, -2.728, 0.676, 0.126, 1.402, 1.204], [-0.01, 3.839, 0.745, 3.147, 0.481, 1.347], [1.63, 3.568, 0.892, 0.411, 0.327, 1.532]]\nC: [[-1.26, 1.503, 0.893, 0.629, 5.544, 1.914], [2.175, 1.47, 1.47, 0.109, 3.382, 1.686], [1.982, -0.011, 0.916, 0.426, -0.326, 1.566], [1.181, 0.067, 1.21, 0.067, 0.005, 2.351], [1.524, 0.001, 0.471, 0.286, 0.408, 1.265], [1.238, -1.52, 0.419, 0.599, 3.184, 1.176], [1.553, -3.177, 0.653, 0.32, 0.427, 1.885], [1.383, -3.363, 1.432, 0.865, -0.009, 1.444], [1.288, -3.498, 1.769, -0.257, 0.218, 1.054], [0.393, -3.522, 1.337, 3.619, 0.242, 1.594], [-1.576, -3.113, 0.753, 0.379, 1.777, 1.195], 
[-0.268, 3.894, 0.852, 2.983, 0.721, 1.393], [1.465, 3.133, 0.435, 0.617, 0.63, 1.96]]\nD: [[-1.946, 1.454, 1.304, 0.285, 4.759, 1.584], [1.735, 2.118, 1.431, 0.5, 3.38, 2.198], [2.01, 0.269, 1.406, 0.118, -0.362, 2.255], [1.665, -0.294, 0.623, -0.295, 0.208, 2.363], [1.918, 0.112, 1.078, 0.599, 0.597, 0.896], [1.655, -1.698, 0.75, 0.063, 2.896, 0.441], [2.382, -2.981, 1.161, 0.203, 0.379, 1.162], [1.828, -2.97, 0.979, 0.706, -0.194, 1.801], [1.717, -3.159, 1.188, 0.204, 0.385, 0.448], [0.303, -3.389, 1.008, 3.649, 0.715, 1.331], [-1.467, -2.443, 0.641, 0.545, 0.903, 1.371], [-0.151, 3.761, 0.508, 3.288, 0.802, 1.225], [1.797, 3.579, 1.179, 0.009, 0.008, 1.708]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_77_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_77_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.793, 1.247, 0.29, 0.296, 0.279, -0.014], [1.019, 0.024, 1.569, 0.553, 0.236, 0.679]]\nB: [[-0.837, 1.73, 0.172, 0.311, 0.446, 0.446], [0.579, -0.45, 1.284, 0.394, 0.372, 0.858]]\nC: [[-0.983, 2.19, 0.493, -0.03, 0.329, 0.928], [0.864, -0.587, 1.773, 0.118, 0.794, 0.799]]\nD: [[-0.553, 2.216, 0.459, 0.267, 0.459, 0.522], [0.806, 0.026, 1.267, 0.403, 0.702, 0.558]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the clothes in the scene. The camera pose information includes: the rotation matrix: [[-0.187285, -0.627824, 0.755488], [-0.982305, 0.118515, -0.145025], [0.001514, -0.76928, -0.63891]]; the translation vector: [1.001752, 1.17634, 1.437838], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.793, 1.247, 0.29, 0.296, 0.279, -0.014], [1.019, 0.024, 1.569, 0.553, 0.236, 0.679]]\nB: [[-0.837, 1.73, 0.172, 0.311, 0.446, 0.446], [0.579, -0.45, 1.284, 0.394, 0.372, 0.858]]\nC: [[-0.983, 2.19, 0.493, -0.03, 0.329, 0.928], [0.864, -0.587, 1.773, 0.118, 0.794, 0.799]]\nD: [[-0.553, 2.216, 0.459, 0.267, 0.459, 0.522], [0.806, 0.026, 1.267, 0.403, 0.702, 0.558]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_78_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_78_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.858, -1.05, -0.049, 0.631, 1.092, 1.022], [0.969, 2.457, 0.703, 0.33, 0.355, 0.535], [1.292, 0.687, 0.943, 0.724, 0.324, 1.126], [1.537, -0.024, 0.37, 0.738, 0.769, 0.806], [2.91, -1.195, 1.375, 0.242, 1.166, 0.582], [2.799, -1.708, 0.863, 0.877, 0.364, 0.812], [2.158, -1.992, 0.634, 0.411, 0.065, 1.19], [-2.861, 0.973, 1.098, 0.744, 0.232, 0.595], [-3.055, 1.702, 0.901, 0.639, 0.173, 0.718], [3.451, -0.934, 1.096, 0.502, 0.89, 0.387]]\nB: [[-2.77, -0.712, 0.41, 0.782, 0.713, 0.859], [1.367, 2.116, 0.842, 0.257, 0.504, 0.248], [1.716, 0.519, 0.519, 0.661, 0.573, 0.903], [1.577, -0.324, 0.811, 0.462, 0.54, 0.431], [3.037, -1.452, 0.953, 0.581, 0.687, 0.531], [2.669, -1.872, 0.986, 0.552, 0.48, 0.568], [2.211, -1.887, 0.725, 0.677, 0.554, 1.018], [-2.956, 0.672, 0.826, 0.436, 0.319, 0.465], [-2.626, 1.651, 0.53, 0.537, 0.47, 0.924], [2.995, -0.435, 0.615, 0.566, 0.706, 0.886]]\nC: [[-2.925, -0.243, 0.295, 0.519, 0.44, 0.711], 
[1.485, 1.766, 1.018, 0.081, 0.848, 0.483], [1.717, 0.68, 0.214, 0.236, 1.037, 0.434], [1.205, -0.323, 1.125, 0.097, 0.642, 0.242], [3.189, -1.068, 0.599, 0.36, 1.144, 0.939], [2.418, -1.941, 1.167, 0.598, 0.698, 0.702], [1.723, -2.159, 0.821, 0.484, 0.884, 0.696], [-3.03, 0.47, 1.025, 0.789, 0.045, 0.278], [-2.913, 1.461, 0.819, 0.202, 0.085, 1.03], [2.826, -0.221, 0.951, 0.339, 0.752, 1.266]]\nD: [[-3.135, -0.575, -0.082, 0.411, 0.399, 1.112], [1.76, 1.636, 0.661, -0.118, 0.316, 0.196], [2.067, 0.976, 0.67, 0.22, 0.315, 1.158], [1.439, -0.283, 0.584, 0.087, 0.218, 0.206], [2.848, -1.357, 1.295, 0.653, 0.266, 0.059], [2.99, -1.86, 1.333, 0.578, 0.108, 0.112], [2.118, -1.567, 1.178, 0.323, 0.289, 0.96], [-3.43, 1.005, 1.071, 0.331, 0.71, 0.959], [-3.114, 1.972, 0.571, 0.075, 0.864, 0.441], [2.987, 0.022, 0.923, 0.173, 0.274, 0.482]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[0.515401, -0.339121, 0.786994], [-0.847541, -0.337435, 0.40965], [0.126638, -0.878143, -0.461333]]; the translation vector: [4.776819, 1.138867, 1.280463], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.858, -1.05, -0.049, 0.631, 1.092, 1.022], [0.969, 2.457, 0.703, 0.33, 0.355, 0.535], [1.292, 0.687, 0.943, 0.724, 0.324, 1.126], [1.537, -0.024, 0.37, 0.738, 0.769, 0.806], [2.91, -1.195, 1.375, 0.242, 1.166, 0.582], [2.799, -1.708, 0.863, 0.877, 0.364, 0.812], [2.158, -1.992, 0.634, 0.411, 0.065, 1.19], [-2.861, 0.973, 1.098, 0.744, 0.232, 0.595], [-3.055, 1.702, 0.901, 0.639, 0.173, 0.718], [3.451, -0.934, 1.096, 0.502, 0.89, 0.387]]\nB: [[-2.77, -0.712, 0.41, 0.782, 0.713, 0.859], [1.367, 2.116, 0.842, 0.257, 0.504, 0.248], [1.716, 0.519, 0.519, 0.661, 0.573, 0.903], [1.577, -0.324, 0.811, 0.462, 0.54, 0.431], [3.037, -1.452, 0.953, 0.581, 0.687, 0.531], [2.669, -1.872, 0.986, 0.552, 0.48, 0.568], [2.211, -1.887, 0.725, 0.677, 0.554, 1.018], [-2.956, 0.672, 0.826, 0.436, 0.319, 0.465], [-2.626, 1.651, 0.53, 0.537, 0.47, 0.924], [2.995, -0.435, 0.615, 0.566, 0.706, 0.886]]\nC: [[-2.925, -0.243, 0.295, 0.519, 0.44, 0.711], [1.485, 1.766, 1.018, 0.081, 0.848, 0.483], [1.717, 0.68, 0.214, 0.236, 1.037, 0.434], [1.205, -0.323, 1.125, 0.097, 0.642, 0.242], [3.189, -1.068, 0.599, 0.36, 1.144, 0.939], [2.418, -1.941, 1.167, 0.598, 0.698, 0.702], [1.723, -2.159, 0.821, 0.484, 0.884, 0.696], [-3.03, 0.47, 1.025, 0.789, 0.045, 0.278], [-2.913, 1.461, 0.819, 0.202, 0.085, 1.03], [2.826, -0.221, 0.951, 0.339, 0.752, 1.266]]\nD: [[-3.135, -0.575, -0.082, 0.411, 0.399, 1.112], [1.76, 1.636, 0.661, -0.118, 0.316, 0.196], [2.067, 0.976, 0.67, 0.22, 0.315, 1.158], [1.439, -0.283, 0.584, 0.087, 0.218, 0.206], [2.848, -1.357, 1.295, 0.653, 0.266, 0.059], [2.99, -1.86, 1.333, 0.578, 0.108, 0.112], [2.118, -1.567, 1.178, 0.323, 0.289, 0.96], [-3.43, 1.005, 1.071, 0.331, 0.71, 0.959], [-3.114, 1.972, 0.571, 0.075, 0.864, 0.441], [2.987, 0.022, 0.923, 0.173, 0.274, 0.482]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_79_0.jpg", + 
"../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_79_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.456, -1.689, 0.986, 1.238, 0.639, 1.143], [-1.189, -1.791, 0.864, 0.611, 1.379, 1.148]]\nB: [[1.709, -1.624, 1.232, 0.531, 0.409, 1.263], [-0.537, -2.035, 0.965, 0.162, 1.273, 1.399]]\nC: [[1.92, -1.614, 0.415, 0.301, 0.956, 1.133], [-0.648, -1.783, 0.191, 0.47, 1.3, 1.09]]\nD: [[1.863, -1.557, 0.74, 0.792, 0.462, 1.459], [-0.873, -1.717, 0.611, 0.169, 1.157, 1.341]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. The camera pose information includes: the rotation matrix: [[0.348231, 0.123124, -0.929288], [0.936413, -1.6e-05, 0.350899], [0.043189, -0.992391, -0.1153]]; the translation vector: [2.712005, 2.075202, 1.464169], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.456, -1.689, 0.986, 1.238, 0.639, 1.143], [-1.189, -1.791, 0.864, 0.611, 1.379, 1.148]]\nB: [[1.709, -1.624, 1.232, 0.531, 0.409, 1.263], [-0.537, -2.035, 0.965, 0.162, 1.273, 1.399]]\nC: [[1.92, -1.614, 0.415, 0.301, 0.956, 1.133], [-0.648, -1.783, 0.191, 0.47, 1.3, 1.09]]\nD: [[1.863, -1.557, 0.74, 0.792, 0.462, 1.459], [-0.873, -1.717, 0.611, 0.169, 1.157, 1.341]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_80_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_80_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.91, 0.435, 1.317, 0.162, -0.028, 0.372], [-1.612, 0.781, 1.119, -0.027, 0.477, 0.779], [-0.879, 0.442, 1.028, 0.015, 0.023, 0.191], [-1.689, 1.721, 1.33, 0.202, 0.203, 0.899]]\nB: [[-1.22, 0.565, 1.527, 0.13, 0.316, 0.334], [-1.214, 0.573, 1.041, 0.138, 0.311, 0.395], [-1.241, 0.926, 1.496, 0.134, 0.334, 0.376], [-1.254, 1.276, 1.499, 0.14, 0.375, 0.407]]\nC: [[-0.897, 0.321, 1.25, -0.192, -0.085, 0.628], [-1.027, 0.54, 0.746, 0.155, 0.593, 0.872], [-1.661, 1.141, 1.852, 0.038, 0.687, 0.36], [-1.716, 1.739, 1.744, 0.171, 0.366, 0.735]]\nD: [[-0.881, 0.818, 1.879, -0.183, 0.463, 0.205], [-0.767, 0.607, 0.616, 0.203, 0.246, 0.191], [-0.822, 0.77, 1.534, -0.248, 0.163, 0.71], [-1.508, 0.961, 1.625, -0.148, 0.39, 0.839]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the rack in the scene. The camera pose information includes: the rotation matrix: [[-0.937403, 0.174354, -0.301457], [0.34768, 0.517889, -0.781607], [0.019845, -0.837491, -0.54609]]; the translation vector: [1.513881, 1.499843, 1.388066], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.91, 0.435, 1.317, 0.162, -0.028, 0.372], [-1.612, 0.781, 1.119, -0.027, 0.477, 0.779], [-0.879, 0.442, 1.028, 0.015, 0.023, 0.191], [-1.689, 1.721, 1.33, 0.202, 0.203, 0.899]]\nB: [[-1.22, 0.565, 1.527, 0.13, 0.316, 0.334], [-1.214, 0.573, 1.041, 0.138, 0.311, 0.395], [-1.241, 0.926, 1.496, 0.134, 0.334, 0.376], [-1.254, 1.276, 1.499, 0.14, 0.375, 0.407]]\nC: [[-0.897, 0.321, 1.25, -0.192, -0.085, 0.628], [-1.027, 0.54, 0.746, 0.155, 0.593, 0.872], [-1.661, 1.141, 1.852, 0.038, 0.687, 0.36], [-1.716, 1.739, 1.744, 0.171, 0.366, 0.735]]\nD: [[-0.881, 0.818, 1.879, -0.183, 0.463, 0.205], [-0.767, 0.607, 0.616, 0.203, 0.246, 0.191], [-0.822, 0.77, 1.534, -0.248, 0.163, 0.71], [-1.508, 0.961, 1.625, -0.148, 0.39, 0.839]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_81_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_81_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.79, -0.98, 1.163, 0.352, 0.978, 2.049]]\nB: [[1.303, -0.943, 0.81, 0.085, 1.431, 2.157]]\nC: [[0.918, -1.038, 0.78, -0.022, 1.276, 1.887]]\nD: [[1.132, -1.26, 0.803, 0.268, 1.192, 2.17]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. 
The camera pose information includes: the rotation matrix: [[-0.15851, 0.420096, -0.893529], [0.981106, -0.034663, -0.190342], [-0.110934, -0.906817, -0.406664]]; the translation vector: [4.004256, 0.910349, 2.578562], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.79, -0.98, 1.163, 0.352, 0.978, 2.049]]\nB: [[1.303, -0.943, 0.81, 0.085, 1.431, 2.157]]\nC: [[0.918, -1.038, 0.78, -0.022, 1.276, 1.887]]\nD: [[1.132, -1.26, 0.803, 0.268, 1.192, 2.17]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_82_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_82_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.027, 0.959, 0.116, 0.065, 0.01, 0.668], [-1.502, 1.757, 0.887, 0.168, -0.298, 0.404], [-1.464, 1.693, 1.349, 0.508, -0.033, 0.751], [-1.515, 1.819, 1.174, 0.619, -0.056, 0.931], [-1.32, 1.579, 1.138, 0.221, -0.036, 0.586]]\nB: [[-1.555, 1.321, -0.005, -0.086, -0.066, 0.68], [-1.503, 1.647, 1.497, 0.094, 0.629, 0.772], [-1.545, 2.057, 0.682, 0.091, -0.365, -0.177], [-1.817, 2.125, 0.639, 0.421, 0.176, 0.148], [-2.148, 2.167, 0.268, 0.654, -0.085, 0.81]]\nC: [[-1.921, 0.926, 0.476, 0.205, 0.401, 1.004], [-1.317, 1.461, 1.183, 0.482, -0.087, -0.114], [-0.981, 1.858, 0.937, -0.085, -0.01, 0.117], [-1.804, 1.654, 1.126, 0.091, 0.345, 0.125], [-2.134, 1.498, 0.297, 0.016, 0.463, 0.232]]\nD: 
[[-2.011, 1.284, 0.385, 0.186, 0.39, 0.566], [-1.266, 1.943, 1.101, 0.313, 0.196, 0.371], [-1.224, 1.994, 0.869, 0.351, 0.116, 0.277], [-1.583, 1.923, 1.035, 0.381, 0.288, 0.498], [-1.707, 1.925, 0.764, 0.426, 0.259, 0.583]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the bag in the scene. The camera pose information includes: the rotation matrix: [[0.82141, -0.124481, 0.556588], [-0.562763, -0.33543, 0.755503], [0.092651, -0.933805, -0.345579]]; the translation vector: [1.795382, 2.457259, 1.379582], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.027, 0.959, 0.116, 0.065, 0.01, 0.668], [-1.502, 1.757, 0.887, 0.168, -0.298, 0.404], [-1.464, 1.693, 1.349, 0.508, -0.033, 0.751], [-1.515, 1.819, 1.174, 0.619, -0.056, 0.931], [-1.32, 1.579, 1.138, 0.221, -0.036, 0.586]]\nB: [[-1.555, 1.321, -0.005, -0.086, -0.066, 0.68], [-1.503, 1.647, 1.497, 0.094, 0.629, 0.772], [-1.545, 2.057, 0.682, 0.091, -0.365, -0.177], [-1.817, 2.125, 0.639, 0.421, 0.176, 0.148], [-2.148, 2.167, 0.268, 0.654, -0.085, 0.81]]\nC: [[-1.921, 0.926, 0.476, 0.205, 0.401, 1.004], [-1.317, 1.461, 1.183, 0.482, -0.087, -0.114], [-0.981, 1.858, 0.937, -0.085, -0.01, 0.117], [-1.804, 1.654, 1.126, 0.091, 0.345, 0.125], [-2.134, 1.498, 0.297, 0.016, 0.463, 0.232]]\nD: [[-2.011, 1.284, 0.385, 0.186, 0.39, 0.566], [-1.266, 1.943, 1.101, 0.313, 0.196, 0.371], [-1.224, 1.994, 0.869, 0.351, 0.116, 0.277], [-1.583, 1.923, 1.035, 0.381, 0.288, 0.498], [-1.707, 1.925, 0.764, 0.426, 0.259, 0.583]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_83_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_83_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.036, 1.866, 1.489, 0.307, 0.157, 2.451], [-1.162, -0.721, 0.524, 0.061, 0.505, 0.591], [-1.692, -0.087, 1.909, 0.08, 0.301, 0.1], [-1.275, -0.78, -0.299, 0.662, 0.631, -0.319]]\nB: [[-1.306, 1.944, 1.27, 0.242, 0.415, 1.919], [-1.846, -0.095, 0.652, 0.668, -0.011, -0.065], [-1.708, -0.182, 1.324, -0.259, 0.382, 0.757], [-0.989, -0.521, 0.267, 0.114, 0.569, -0.144]]\nC: [[-1.606, 1.5, 1.094, 0.082, 0.444, 2.163], [-1.349, -0.456, 0.266, 0.226, 0.434, 0.139], [-1.295, -0.266, 1.634, 0.118, 0.05, 0.32], [-1.418, -0.408, 0.197, 0.3, 0.329, 0.161]]\nD: [[-1.696, 1.738, 0.967, 0.285, -0.051, 2.413], [-0.922, -0.569, 0.642, 0.33, 0.259, -0.242], [-0.853, -0.408, 2.052, 
0.046, 0.488, 0.615], [-1.165, -0.273, 0.19, 0.107, 0.57, 0.605]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the book in the scene. The camera pose information includes: the rotation matrix: [[0.954506, 0.05554, -0.292973], [0.288831, -0.41644, 0.862064], [-0.074127, -0.907465, -0.413536]]; the translation vector: [2.66447, 1.005586, 1.476015], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-2.036, 1.866, 1.489, 0.307, 0.157, 2.451], [-1.162, -0.721, 0.524, 0.061, 0.505, 0.591], [-1.692, -0.087, 1.909, 0.08, 0.301, 0.1], [-1.275, -0.78, -0.299, 0.662, 0.631, -0.319]]\nB: [[-1.306, 1.944, 1.27, 0.242, 0.415, 1.919], [-1.846, -0.095, 0.652, 0.668, -0.011, -0.065], [-1.708, -0.182, 1.324, -0.259, 0.382, 0.757], [-0.989, -0.521, 0.267, 0.114, 0.569, -0.144]]\nC: [[-1.606, 1.5, 1.094, 0.082, 0.444, 2.163], [-1.349, -0.456, 0.266, 0.226, 0.434, 0.139], [-1.295, -0.266, 1.634, 0.118, 0.05, 0.32], [-1.418, -0.408, 0.197, 0.3, 0.329, 0.161]]\nD: [[-1.696, 1.738, 0.967, 0.285, -0.051, 2.413], [-0.922, -0.569, 0.642, 0.33, 0.259, -0.242], [-0.853, -0.408, 2.052, 0.046, 0.488, 0.615], [-1.165, -0.273, 0.19, 0.107, 0.57, 0.605]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_84_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_84_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": 
"SCANNET_threed_bbox_detection", + "options": "A: [[0.105, -2.287, 1.298, 5.216, 0.87, 2.807], [1.522, -1.359, 1.295, 0.692, 3.211, 2.519], [-1.482, 0.95, 0.954, -0.118, 3.651, 2.351], [2.186, 1.699, 1.104, -0.162, 3.237, 1.812], [-2.278, -2.05, 0.889, 0.84, 1.577, 2.045], [1.673, 0.416, 0.493, 2.003, 0.505, 2.036], [0.961, 0.497, 0.688, -0.227, 0.745, 1.241], [-2.744, -1.281, 0.568, 1.033, 0.219, 1.748], [-1.203, 3.13, 0.749, 0.354, 1.039, 2.281], [0.453, 3.938, 1.358, 3.143, -0.207, 1.101], [1.577, -0.17, 0.647, 1.504, 0.197, 0.91], [0.55, 0.958, 0.74, -0.022, 0.508, 1.915], [-0.28, -2.346, 2.125, 4.551, 0.291, 0.411], [-2.245, 3.374, 2.167, 0.315, 0.57, 1.021], [-1.883, 4.031, 0.791, 0.575, 0.114, 1.338]]\nB: [[-0.045, -2.933, 0.729, 5.273, 0.38, 1.87], [1.815, -0.872, 1.2, 0.381, 3.199, 2.451], [-1.835, 0.525, 0.883, 0.663, 4.217, 2.299], [1.852, 1.563, 0.117, 0.236, 3.143, 0.945], [-2.89, -2.063, 0.836, 0.206, 1.672, 2.238], [1.326, 0.326, 0.98, 1.535, 0.601, 1.665], [0.245, 0.603, 0.825, -0.004, 1.064, 1.817], [-1.862, -0.49, 1.467, 1.026, 0.012, 1.363], [-1.379, 3.112, 1.213, 0.486, 0.543, 1.682], [0.858, 3.952, 1.318, 3.11, 0.53, 1.733], [1.684, 0.251, 1.226, 1.531, 0.586, 0.576], [0.354, 1.015, 0.82, 0.415, 0.222, 1.857], [-0.273, -2.214, 2.258, 4.248, 0.77, 0.29], [-2.467, 2.65, 1.797, 0.149, 0.618, 1.025], [-2.164, 4.048, 1.03, 0.675, 0.141, 1.166]]\nC: [[-0.372, -2.705, 1.171, 4.784, 0.513, 2.321], [1.95, -1.245, 1.075, 0.31, 2.969, 2.221], [-1.736, 0.974, 1.065, 0.251, 4.086, 2.141], [2.079, 1.895, 0.614, 0.176, 3.361, 1.364], [-2.712, -1.857, 1.256, 0.344, 1.764, 2.405], [1.315, 0.187, 0.814, 1.511, 0.109, 1.639], [0.561, 0.619, 0.751, 0.075, 0.858, 1.522], [-2.331, -0.947, 0.996, 1.029, 0.105, 1.863], [-0.884, 3.244, 0.946, 0.263, 0.884, 1.941], [0.617, 3.626, 1.612, 2.853, 0.244, 1.318], [1.37, 0.273, 0.995, 1.377, 0.117, 0.425], [0.697, 0.781, 1.082, 0.286, 0.193, 2.222], [-0.516, -2.505, 2.299, 4.488, 0.335, 0.203], [-2.543, 2.889, 1.67, 0.173, 
0.883, 0.685], [-1.977, 3.626, 1.246, 0.551, 0.106, 1.503]]\nD: [[0.035, -2.86, 0.861, 4.728, 0.233, 1.918], [1.546, -1.395, 0.841, 0.548, 3.249, 1.737], [-1.897, 0.52, 1.01, -0.225, 4.45, 2.412], [1.773, 2.047, 0.256, 0.066, 3.328, 1.231], [-3.131, -2.108, 0.842, 0.6, 1.535, 2.741], [1.218, -0.29, 0.461, 1.245, -0.153, 2.098], [0.922, 0.65, 1.084, -0.181, 0.59, 1.506], [-2.276, -0.909, 0.599, 1.207, -0.285, 1.54], [-0.665, 3.431, 1.123, 0.223, 0.621, 1.641], [0.797, 3.806, 2.013, 2.472, 0.677, 1.495], [1.111, 0.293, 1.457, 1.431, 0.551, 0.85], [0.877, 1.185, 1.451, 0.625, -0.09, 2.43], [-0.138, -2.632, 2.484, 4.711, -0.137, 0.648], [-3.024, 2.792, 1.538, -0.201, 1.018, 0.323], [-2.319, 3.937, 1.522, 0.199, 0.289, 1.095]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.99336, -0.011945, -0.114427], [0.103059, -0.349694, 0.931178], [-0.051137, -0.936788, -0.346141]]; the translation vector: [2.948285, 4.432959, 1.460427], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.105, -2.287, 1.298, 5.216, 0.87, 2.807], [1.522, -1.359, 1.295, 0.692, 3.211, 2.519], [-1.482, 0.95, 0.954, -0.118, 3.651, 2.351], [2.186, 1.699, 1.104, -0.162, 3.237, 1.812], [-2.278, -2.05, 0.889, 0.84, 1.577, 2.045], [1.673, 0.416, 0.493, 2.003, 0.505, 2.036], [0.961, 0.497, 0.688, -0.227, 0.745, 1.241], [-2.744, -1.281, 0.568, 1.033, 0.219, 1.748], [-1.203, 3.13, 0.749, 0.354, 1.039, 2.281], [0.453, 3.938, 1.358, 3.143, -0.207, 1.101], [1.577, -0.17, 0.647, 1.504, 0.197, 0.91], [0.55, 0.958, 0.74, -0.022, 0.508, 1.915], [-0.28, -2.346, 2.125, 4.551, 0.291, 0.411], [-2.245, 3.374, 2.167, 0.315, 0.57, 1.021], [-1.883, 4.031, 0.791, 0.575, 0.114, 1.338]]\nB: [[-0.045, -2.933, 0.729, 5.273, 0.38, 1.87], [1.815, -0.872, 1.2, 0.381, 3.199, 2.451], [-1.835, 0.525, 0.883, 0.663, 4.217, 2.299], [1.852, 1.563, 0.117, 0.236, 3.143, 0.945], [-2.89, -2.063, 0.836, 0.206, 1.672, 2.238], [1.326, 0.326, 0.98, 1.535, 0.601, 1.665], [0.245, 0.603, 0.825, -0.004, 1.064, 1.817], [-1.862, -0.49, 1.467, 1.026, 0.012, 1.363], [-1.379, 3.112, 1.213, 0.486, 0.543, 1.682], [0.858, 3.952, 1.318, 3.11, 0.53, 1.733], [1.684, 0.251, 1.226, 1.531, 0.586, 0.576], [0.354, 1.015, 0.82, 0.415, 0.222, 1.857], [-0.273, -2.214, 2.258, 4.248, 0.77, 0.29], [-2.467, 2.65, 1.797, 0.149, 0.618, 1.025], [-2.164, 4.048, 1.03, 0.675, 0.141, 1.166]]\nC: [[-0.372, -2.705, 1.171, 4.784, 0.513, 2.321], [1.95, -1.245, 1.075, 0.31, 2.969, 2.221], [-1.736, 0.974, 1.065, 0.251, 4.086, 2.141], [2.079, 1.895, 0.614, 0.176, 3.361, 1.364], [-2.712, -1.857, 1.256, 0.344, 1.764, 2.405], [1.315, 0.187, 0.814, 1.511, 0.109, 1.639], [0.561, 0.619, 0.751, 0.075, 0.858, 1.522], [-2.331, -0.947, 0.996, 1.029, 0.105, 1.863], [-0.884, 3.244, 0.946, 0.263, 0.884, 1.941], [0.617, 3.626, 1.612, 2.853, 0.244, 1.318], [1.37, 0.273, 0.995, 1.377, 0.117, 0.425], [0.697, 0.781, 1.082, 0.286, 0.193, 2.222], [-0.516, -2.505, 2.299, 4.488, 0.335, 0.203], [-2.543, 2.889, 1.67, 0.173, 0.883, 
0.685], [-1.977, 3.626, 1.246, 0.551, 0.106, 1.503]]\nD: [[0.035, -2.86, 0.861, 4.728, 0.233, 1.918], [1.546, -1.395, 0.841, 0.548, 3.249, 1.737], [-1.897, 0.52, 1.01, -0.225, 4.45, 2.412], [1.773, 2.047, 0.256, 0.066, 3.328, 1.231], [-3.131, -2.108, 0.842, 0.6, 1.535, 2.741], [1.218, -0.29, 0.461, 1.245, -0.153, 2.098], [0.922, 0.65, 1.084, -0.181, 0.59, 1.506], [-2.276, -0.909, 0.599, 1.207, -0.285, 1.54], [-0.665, 3.431, 1.123, 0.223, 0.621, 1.641], [0.797, 3.806, 2.013, 2.472, 0.677, 1.495], [1.111, 0.293, 1.457, 1.431, 0.551, 0.85], [0.877, 1.185, 1.451, 0.625, -0.09, 2.43], [-0.138, -2.632, 2.484, 4.711, -0.137, 0.648], [-3.024, 2.792, 1.538, -0.201, 1.018, 0.323], [-2.319, 3.937, 1.522, 0.199, 0.289, 1.095]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_85_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_85_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.057, -0.804, 1.454, 0.442, 9.194, 2.993], [-0.24, 3.939, 1.662, 3.88, 0.819, 2.915], [1.686, 1.778, 1.614, 0.375, 4.14, 2.879], [1.518, -0.292, 1.39, 0.502, 0.183, 1.29], [1.407, -0.606, 1.045, 0.392, 0.791, 2.018], [1.569, -2.479, 1.01, 0.478, 3.37, 1.883]]\nB: [[-1.712, -0.852, 0.991, 0.707, 9.653, 3.103], [0.145, 4.4, 1.88, 4.062, 1.231, 2.667], [1.473, 2.151, 1.876, 0.37, 4.413, 3.184], [1.75, -0.649, 1.384, 0.602, -0.213, 1.435], [1.139, -0.573, 1.304, 0.885, 0.718, 2.242], [1.077, -2.453, 0.735, 0.583, 3.786, 1.438]]\nC: [[-1.586, -0.802, 1.264, 0.785, 8.752, 2.813], [-0.226, 4.305, 1.323, 3.698, 1.086, 3.015], [1.969, 1.342, 1.623, -0.075, 3.888, 3.299], [1.213, -0.465, 1.751, 0.015, 0.594, 1.001], [0.993, -0.822, 1.254, 0.504, 1.181, 1.943], [1.069, -2.03, 1.336, 0.651, 3.224, 1.602]]\nD: [[-2.191, -0.396, 1.663, 0.009, 8.751, 3.114], [0.038, 3.888, 1.488, 4.056, 0.477, 3.26], [2.082, 1.991, 
1.998, -0.123, 3.891, 2.467], [1.903, -0.079, 0.895, 0.439, 0.291, 0.791], [1.022, -0.776, 0.73, 0.121, 0.449, 1.843], [1.7, -2.034, 1.291, 0.089, 3.481, 2.087]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.908726, 0.150598, -0.389277], [0.406624, 0.108936, -0.907078], [-0.094198, -0.982575, -0.16023]]; the translation vector: [8.822721, 3.830595, 1.476402], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-2.057, -0.804, 1.454, 0.442, 9.194, 2.993], [-0.24, 3.939, 1.662, 3.88, 0.819, 2.915], [1.686, 1.778, 1.614, 0.375, 4.14, 2.879], [1.518, -0.292, 1.39, 0.502, 0.183, 1.29], [1.407, -0.606, 1.045, 0.392, 0.791, 2.018], [1.569, -2.479, 1.01, 0.478, 3.37, 1.883]]\nB: [[-1.712, -0.852, 0.991, 0.707, 9.653, 3.103], [0.145, 4.4, 1.88, 4.062, 1.231, 2.667], [1.473, 2.151, 1.876, 0.37, 4.413, 3.184], [1.75, -0.649, 1.384, 0.602, -0.213, 1.435], [1.139, -0.573, 1.304, 0.885, 0.718, 2.242], [1.077, -2.453, 0.735, 0.583, 3.786, 1.438]]\nC: [[-1.586, -0.802, 1.264, 0.785, 8.752, 2.813], [-0.226, 4.305, 1.323, 3.698, 1.086, 3.015], [1.969, 1.342, 1.623, -0.075, 3.888, 3.299], [1.213, -0.465, 1.751, 0.015, 0.594, 1.001], [0.993, -0.822, 1.254, 0.504, 1.181, 1.943], [1.069, -2.03, 1.336, 0.651, 3.224, 1.602]]\nD: [[-2.191, -0.396, 1.663, 0.009, 8.751, 3.114], [0.038, 3.888, 1.488, 4.056, 0.477, 3.26], [2.082, 1.991, 1.998, -0.123, 3.891, 2.467], [1.903, -0.079, 0.895, 
0.439, 0.291, 0.791], [1.022, -0.776, 0.73, 0.121, 0.449, 1.843], [1.7, -2.034, 1.291, 0.089, 3.481, 2.087]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_86_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_86_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.406, -0.499, 0.127, 1.547, 0.014, 1.403], [1.954, 0.044, 0.911, 0.502, 4.499, 1.21], [0.82, -1.82, 0.463, 3.057, 0.123, 1.62], [-1.403, -1.396, 1.129, 0.78, 1.058, 1.608], [-1.077, 2.261, 0.337, 0.558, -0.028, 0.985]]\nB: [[-1.494, -0.876, 0.44, 1.936, 0.339, 1.286], [1.687, 0.61, 0.241, 0.363, 4.812, 0.869], [0.73, -1.681, 0.677, 2.541, -0.283, 1.252], [-1.274, -1.409, 0.377, 0.698, 1.296, 0.85], [-1.137, 2.544, 0.383, 0.033, 0.761, 1.018]]\nC: [[-2.268, -0.745, 0.912, 1.628, 0.264, 0.904], [1.958, 0.486, 0.503, -0.211, 4.549, 1.566], [-0.072, -2.097, 0.667, 2.893, 0.559, 1.549], [-0.724, -1.085, 0.723, 0.553, 1.77, 1.227], [-0.926, 2.697, 1.076, 0.821, 0.34, 1.204]]\nD: [[-1.791, -0.394, 0.511, 1.684, 0.11, 0.995], [1.786, 0.422, 0.664, 0.168, 4.577, 1.321], [0.388, -1.877, 0.715, 2.729, 0.147, 1.176], [-1.044, -1.126, 0.648, 0.431, 1.546, 1.277], [-1.081, 2.203, 0.66, 0.386, 0.403, 0.882]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.997112, 0.02462, 0.071841], [-0.04661, 0.548461, -0.834876], [-0.059957, -0.835814, -0.545729]]; the translation vector: [4.834615, 3.436689, 1.398379], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.406, -0.499, 0.127, 1.547, 0.014, 1.403], [1.954, 0.044, 0.911, 0.502, 4.499, 1.21], [0.82, -1.82, 0.463, 3.057, 0.123, 1.62], [-1.403, -1.396, 1.129, 0.78, 1.058, 1.608], [-1.077, 2.261, 0.337, 0.558, -0.028, 0.985]]\nB: [[-1.494, -0.876, 0.44, 1.936, 0.339, 1.286], [1.687, 0.61, 0.241, 0.363, 4.812, 0.869], [0.73, -1.681, 0.677, 2.541, -0.283, 1.252], [-1.274, -1.409, 0.377, 0.698, 1.296, 0.85], [-1.137, 2.544, 0.383, 0.033, 0.761, 1.018]]\nC: [[-2.268, -0.745, 0.912, 1.628, 0.264, 0.904], [1.958, 0.486, 0.503, -0.211, 4.549, 1.566], [-0.072, -2.097, 0.667, 2.893, 0.559, 1.549], [-0.724, -1.085, 0.723, 0.553, 1.77, 1.227], [-0.926, 2.697, 1.076, 0.821, 0.34, 1.204]]\nD: [[-1.791, -0.394, 0.511, 1.684, 0.11, 0.995], [1.786, 0.422, 0.664, 0.168, 4.577, 1.321], [0.388, -1.877, 0.715, 2.729, 0.147, 1.176], [-1.044, -1.126, 0.648, 0.431, 1.546, 1.277], [-1.081, 2.203, 0.66, 0.386, 0.403, 0.882]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_87_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_87_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.113, 0.087, 0.564, 0.343, 0.527, 0.305], [0.507, 0.467, 0.458, 0.596, 0.504, 0.317], [0.58, 0.988, 0.644, 0.601, 0.651, 0.477], [0.182, 1.04, 0.677, 0.777, 0.505, 0.512], [1.732, 0.733, 0.527, 0.634, 0.573, 0.263], [1.609, 1.049, 0.659, 0.686, 0.387, 0.426]]\nB: [[0.129, 0.521, 0.187, 0.313, 0.856, 0.592], [0.981, 0.918, 0.313, 0.429, 0.812, 0.551], [0.233, 0.816, 0.228, 0.26, 0.574, 0.165], 
[-0.257, 0.76, 1.031, 0.337, 0.304, 1.005], [1.703, 1.1, 0.991, 1.058, 0.84, 0.596], [1.167, 0.943, 0.538, 0.487, 0.187, 0.143]]\nC: [[0.577, 0.356, 0.8, 0.107, 0.25, -0.032], [0.156, 0.937, 0.399, 0.676, 0.726, 0.633], [0.215, 0.658, 0.629, 0.763, 0.937, 0.472], [0.377, 0.594, 0.698, 1.038, 0.047, 0.378], [1.421, 1.109, 0.213, 0.954, 0.857, -0.124], [1.144, 1.512, 0.746, 0.326, 0.254, -0.001]]\nD: [[-0.375, 0.568, 0.757, 0.525, 0.71, 0.684], [0.596, 0.141, 0.679, 0.896, 0.714, 0.623], [0.506, 1.007, 0.844, 0.63, 0.899, 0.696], [-0.281, 1.187, 1.15, 1.186, 0.539, 1.005], [1.823, 0.702, 0.5, 0.724, 0.202, 0.553], [1.882, 1.516, 0.881, 1.085, 0.712, 0.444]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the pillow in the scene. The camera pose information includes: the rotation matrix: [[-0.971613, -0.06682, 0.226943], [-0.235147, 0.378036, -0.89543], [-0.02596, -0.923376, -0.383017]]; the translation vector: [2.775299, 4.618156, 1.427592], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.113, 0.087, 0.564, 0.343, 0.527, 0.305], [0.507, 0.467, 0.458, 0.596, 0.504, 0.317], [0.58, 0.988, 0.644, 0.601, 0.651, 0.477], [0.182, 1.04, 0.677, 0.777, 0.505, 0.512], [1.732, 0.733, 0.527, 0.634, 0.573, 0.263], [1.609, 1.049, 0.659, 0.686, 0.387, 0.426]]\nB: [[0.129, 0.521, 0.187, 0.313, 0.856, 0.592], [0.981, 0.918, 0.313, 0.429, 0.812, 0.551], [0.233, 0.816, 0.228, 0.26, 0.574, 0.165], [-0.257, 0.76, 1.031, 0.337, 0.304, 1.005], [1.703, 1.1, 0.991, 1.058, 0.84, 0.596], [1.167, 0.943, 0.538, 0.487, 0.187, 0.143]]\nC: [[0.577, 0.356, 0.8, 0.107, 0.25, -0.032], [0.156, 0.937, 0.399, 0.676, 0.726, 0.633], [0.215, 0.658, 0.629, 0.763, 0.937, 0.472], [0.377, 0.594, 0.698, 1.038, 0.047, 0.378], [1.421, 1.109, 0.213, 0.954, 0.857, -0.124], [1.144, 1.512, 0.746, 0.326, 0.254, -0.001]]\nD: [[-0.375, 0.568, 0.757, 0.525, 0.71, 0.684], [0.596, 0.141, 0.679, 0.896, 0.714, 0.623], [0.506, 1.007, 0.844, 0.63, 0.899, 0.696], [-0.281, 1.187, 1.15, 1.186, 0.539, 1.005], [1.823, 0.702, 0.5, 0.724, 0.202, 0.553], [1.882, 1.516, 0.881, 1.085, 0.712, 0.444]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_88_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_88_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.081, -2.379, 1.113, 0.296, 0.447, 1.93], [-2.179, -2.328, 1.113, 1.155, -0.055, 2.191], [0.187, -2.593, 1.0, 1.525, -0.011, 1.61]]\nB: [[0.156, -1.688, 0.672, 1.099, 0.512, 1.787], [-1.601, -2.386, 1.059, 0.494, 0.257, 2.331], [1.089, -2.9, 1.408, 0.896, 0.19, 1.392]]\nC: [[-0.153, -1.917, 0.934, 0.637, 0.572, 1.999], [-2.071, -2.511, 0.942, 0.893, 0.199, 2.089], [0.673, -2.564, 1.392, 1.046, 0.131, 1.552]]\nD: [[0.141, -1.904, 0.579, 0.681, 0.731, 2.01], [-1.907, -2.369, 0.924, 0.512, 0.574, 2.053], [0.733, -2.632, 1.326, 0.995, 
0.386, 1.64]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. The camera pose information includes: the rotation matrix: [[-0.086843, 0.425015, -0.901011], [0.995696, 0.066429, -0.064634], [0.032383, -0.902745, -0.428955]]; the translation vector: [4.261571, 5.85756, 1.66629], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.081, -2.379, 1.113, 0.296, 0.447, 1.93], [-2.179, -2.328, 1.113, 1.155, -0.055, 2.191], [0.187, -2.593, 1.0, 1.525, -0.011, 1.61]]\nB: [[0.156, -1.688, 0.672, 1.099, 0.512, 1.787], [-1.601, -2.386, 1.059, 0.494, 0.257, 2.331], [1.089, -2.9, 1.408, 0.896, 0.19, 1.392]]\nC: [[-0.153, -1.917, 0.934, 0.637, 0.572, 1.999], [-2.071, -2.511, 0.942, 0.893, 0.199, 2.089], [0.673, -2.564, 1.392, 1.046, 0.131, 1.552]]\nD: [[0.141, -1.904, 0.579, 0.681, 0.731, 2.01], [-1.907, -2.369, 0.924, 0.512, 0.574, 2.053], [0.733, -2.632, 1.326, 0.995, 0.386, 1.64]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_89_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_89_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.066, -4.092, 0.322, 1.809, 0.144, 0.711], [-0.452, -3.998, 0.399, -0.022, -0.207, 0.313]]\nB: [[0.967, -4.137, 0.415, 1.862, 0.896, 0.809], [-0.541, -3.922, 0.752, 0.892, 0.695, 1.151]]\nC: [[0.859, -4.189, 
1.178, 1.424, -0.037, 1.276], [-0.399, -4.209, 0.397, 0.399, 0.232, 1.02]]\nD: [[0.733, -4.146, 0.771, 1.808, 0.42, 0.818], [-0.752, -4.266, 0.836, 0.396, 0.285, 0.778]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the window in the scene. The camera pose information includes: the rotation matrix: [[0.504428, 0.479717, -0.717931], [0.860003, -0.204862, 0.467362], [0.077124, -0.853173, -0.515896]]; the translation vector: [4.973708, 0.412451, 1.573636], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.066, -4.092, 0.322, 1.809, 0.144, 0.711], [-0.452, -3.998, 0.399, -0.022, -0.207, 0.313]]\nB: [[0.967, -4.137, 0.415, 1.862, 0.896, 0.809], [-0.541, -3.922, 0.752, 0.892, 0.695, 1.151]]\nC: [[0.859, -4.189, 1.178, 1.424, -0.037, 1.276], [-0.399, -4.209, 0.397, 0.399, 0.232, 1.02]]\nD: [[0.733, -4.146, 0.771, 1.808, 0.42, 0.818], [-0.752, -4.266, 0.836, 0.396, 0.285, 0.778]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_90_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_90_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.356, -1.033, 0.487, 1.053, 0.829, 1.337]]\nB: [[-1.861, -0.729, 0.172, 1.066, 1.25, 1.068]]\nC: [[-2.075, -0.604, 0.467, 1.418, 0.63, 1.288]]\nD: [[-2.244, -1.03, 0.539, 1.266, 0.775, 0.934]]", + "question": "Given a RGB 
image and a depth image, please detect the 3D bounding box of the cabinet in the scene. The camera pose information includes: the rotation matrix: [[-0.132001, -0.567775, 0.812532], [-0.991224, 0.069667, -0.112349], [0.007182, -0.820231, -0.571988]]; the translation vector: [2.407685, 4.450429, 1.359714], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-2.356, -1.033, 0.487, 1.053, 0.829, 1.337]]\nB: [[-1.861, -0.729, 0.172, 1.066, 1.25, 1.068]]\nC: [[-2.075, -0.604, 0.467, 1.418, 0.63, 1.288]]\nD: [[-2.244, -1.03, 0.539, 1.266, 0.775, 0.934]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_91_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_91_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.569, 0.211, 0.319, 0.687, 0.401, 0.55], [-0.378, 2.451, 0.757, 1.108, 0.785, 1.152], [-0.442, -3.047, 0.599, 0.595, 0.53, 0.698], [-0.671, -2.103, 0.492, 0.589, 0.785, 1.436], [-0.536, -2.312, 0.381, 0.676, 0.927, 0.8], [0.694, -2.162, -0.024, 0.318, 0.238, 1.069], [0.8, -2.531, 0.157, 0.887, 0.472, 0.605], [-0.017, 0.764, 0.766, 0.464, 0.143, 1.084]]\nB: [[-0.14, -0.504, 0.958, 0.996, 0.333, 0.616], [-0.523, 2.406, 0.116, 1.014, 1.032, 0.584], [-1.041, -3.534, 0.221, 1.124, 0.509, 0.64], [-1.178, -1.955, 0.316, 0.454, 0.967, 0.762], [-0.074, -2.655, 0.057, 0.407, 0.341, 0.817], [0.498, -1.8, 0.525, 
0.171, 1.003, 0.793], [0.349, -2.636, 0.785, 0.651, 0.822, 0.565], [-0.067, 1.46, 0.267, 0.865, 0.829, 0.524]]\nC: [[0.244, -0.138, 0.489, 0.688, 0.662, 1.02], [-0.663, 2.462, 0.398, 0.618, 0.647, 0.654], [-0.762, -3.211, 0.433, 0.631, 0.73, 0.899], [-0.866, -2.412, 0.459, 0.652, 0.663, 0.995], [-0.182, -2.73, 0.386, 0.664, 0.667, 0.841], [0.386, -2.023, 0.44, 0.586, 0.689, 0.943], [0.543, -2.581, 0.583, 0.445, 0.548, 0.641], [0.339, 1.261, 0.575, 0.571, 0.572, 0.783]]\nD: [[0.09, 0.046, 0.862, 0.335, 0.771, 1.401], [-0.263, 2.607, 0.862, 0.364, 1.092, 0.886], [-1.02, -3.334, 0.931, 1.001, 0.759, 0.875], [-0.888, -2.153, 0.017, 0.223, 0.261, 0.633], [-0.543, -2.555, 0.32, 1.086, 0.816, 0.575], [0.862, -2.2, 0.258, 0.465, 0.987, 0.866], [0.065, -2.865, 0.495, 0.697, 0.945, 0.331], [0.317, 1.592, 1.019, 0.326, 0.876, 0.791]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the office chair in the scene. The camera pose information includes: the rotation matrix: [[0.672393, -0.274439, 0.687438], [-0.739855, -0.221079, 0.635404], [-0.022402, -0.935846, -0.351697]]; the translation vector: [3.802358, 2.110255, 1.494557], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.569, 0.211, 0.319, 0.687, 0.401, 0.55], [-0.378, 2.451, 0.757, 1.108, 0.785, 1.152], [-0.442, -3.047, 0.599, 0.595, 0.53, 0.698], [-0.671, -2.103, 0.492, 0.589, 0.785, 1.436], [-0.536, -2.312, 0.381, 0.676, 0.927, 0.8], [0.694, -2.162, -0.024, 0.318, 0.238, 1.069], [0.8, -2.531, 0.157, 0.887, 0.472, 0.605], [-0.017, 0.764, 0.766, 0.464, 0.143, 1.084]]\nB: [[-0.14, -0.504, 0.958, 0.996, 0.333, 0.616], [-0.523, 2.406, 0.116, 1.014, 1.032, 0.584], [-1.041, -3.534, 0.221, 1.124, 0.509, 0.64], [-1.178, -1.955, 0.316, 0.454, 0.967, 0.762], [-0.074, -2.655, 0.057, 0.407, 0.341, 0.817], [0.498, -1.8, 0.525, 0.171, 1.003, 0.793], [0.349, -2.636, 0.785, 0.651, 0.822, 0.565], [-0.067, 1.46, 0.267, 0.865, 0.829, 0.524]]\nC: [[0.244, -0.138, 0.489, 0.688, 0.662, 1.02], [-0.663, 2.462, 0.398, 0.618, 0.647, 0.654], [-0.762, -3.211, 0.433, 0.631, 0.73, 0.899], [-0.866, -2.412, 0.459, 0.652, 0.663, 0.995], [-0.182, -2.73, 0.386, 0.664, 0.667, 0.841], [0.386, -2.023, 0.44, 0.586, 0.689, 0.943], [0.543, -2.581, 0.583, 0.445, 0.548, 0.641], [0.339, 1.261, 0.575, 0.571, 0.572, 0.783]]\nD: [[0.09, 0.046, 0.862, 0.335, 0.771, 1.401], [-0.263, 2.607, 0.862, 0.364, 1.092, 0.886], [-1.02, -3.334, 0.931, 1.001, 0.759, 0.875], [-0.888, -2.153, 0.017, 0.223, 0.261, 0.633], [-0.543, -2.555, 0.32, 1.086, 0.816, 0.575], [0.862, -2.2, 0.258, 0.465, 0.987, 0.866], [0.065, -2.865, 0.495, 0.697, 0.945, 0.331], [0.317, 1.592, 1.019, 0.326, 0.876, 0.791]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_92_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_92_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.126, -1.376, 0.347, 0.441, 1.044, 0.747]]\nB: [[-0.707, -1.056, 0.436, 0.481, 0.775, 0.862]]\nC: [[-1.072, -0.581, 0.729, 0.634, 0.411, 0.815]]\nD: 
[[-1.2, -0.714, 0.073, 0.598, 1.239, 1.356]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the toilet in the scene. The camera pose information includes: the rotation matrix: [[-0.943065, -0.17817, 0.280864], [-0.332105, 0.550897, -0.765649], [-0.018311, -0.815333, -0.578703]]; the translation vector: [2.74599, 1.673222, 1.294065], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.126, -1.376, 0.347, 0.441, 1.044, 0.747]]\nB: [[-0.707, -1.056, 0.436, 0.481, 0.775, 0.862]]\nC: [[-1.072, -0.581, 0.729, 0.634, 0.411, 0.815]]\nD: [[-1.2, -0.714, 0.073, 0.598, 1.239, 1.356]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_93_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_93_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.746, -0.676, 1.548, 0.354, 0.507, 0.554], [1.278, -0.21, 2.039, 0.253, 0.159, 0.277], [1.354, -0.174, 2.085, 0.187, 0.25, 0.284], [1.365, 0.302, 2.07, 0.178, 0.146, 0.195], [1.395, 1.775, 0.709, 0.116, 0.082, 0.239], [0.108, -1.232, 0.61, 0.37, 0.243, 0.232]]\nB: [[-2.116, -0.405, 1.974, 0.197, 0.992, 0.793], [1.595, -0.115, 1.898, 0.53, -0.095, 0.207], [1.74, -0.462, 1.811, 0.459, 0.366, 0.195], [1.756, -0.03, 2.139, 0.506, -0.218, -0.14], [1.496, 1.894, 0.22, -0.344, -0.274, 0.329], [0.12, -1.361, 0.247, 0.677, 0.431, 
0.41]]\nC: [[-2.099, -0.677, 1.826, 0.111, 0.048, 0.88], [1.179, -0.084, 2.064, 0.353, -0.335, 0.047], [1.283, -0.017, 2.251, 0.548, 0.539, -0.139], [1.054, -0.131, 1.995, -0.052, 0.135, -0.266], [1.813, 1.809, 0.298, 0.268, -0.092, 0.575], [0.507, -1.135, 0.122, 0.102, 0.682, -0.107]]\nD: [[-2.013, -0.781, 2.031, 0.552, 0.053, 0.962], [1.49, 0.048, 1.694, 0.076, -0.303, 0.184], [1.646, 0.043, 2.403, 0.082, 0.014, 0.773], [1.068, 0.187, 2.309, 0.672, -0.201, 0.291], [1.861, 1.412, 0.913, 0.343, -0.022, 0.312], [-0.111, -1.095, 0.386, 0.723, 0.064, 0.108]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[0.493838, -0.420518, 0.76111], [-0.864926, -0.147366, 0.479777], [-0.089593, -0.895236, -0.436493]]; the translation vector: [0.736944, 2.108944, 1.402726], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.746, -0.676, 1.548, 0.354, 0.507, 0.554], [1.278, -0.21, 2.039, 0.253, 0.159, 0.277], [1.354, -0.174, 2.085, 0.187, 0.25, 0.284], [1.365, 0.302, 2.07, 0.178, 0.146, 0.195], [1.395, 1.775, 0.709, 0.116, 0.082, 0.239], [0.108, -1.232, 0.61, 0.37, 0.243, 0.232]]\nB: [[-2.116, -0.405, 1.974, 0.197, 0.992, 0.793], [1.595, -0.115, 1.898, 0.53, -0.095, 0.207], [1.74, -0.462, 1.811, 0.459, 0.366, 0.195], [1.756, -0.03, 2.139, 0.506, -0.218, -0.14], [1.496, 1.894, 0.22, -0.344, -0.274, 0.329], [0.12, -1.361, 0.247, 0.677, 0.431, 0.41]]\nC: [[-2.099, -0.677, 1.826, 0.111, 0.048, 0.88], [1.179, -0.084, 2.064, 0.353, -0.335, 0.047], [1.283, -0.017, 2.251, 0.548, 0.539, -0.139], [1.054, -0.131, 1.995, -0.052, 0.135, -0.266], [1.813, 1.809, 0.298, 0.268, -0.092, 0.575], [0.507, -1.135, 0.122, 0.102, 0.682, -0.107]]\nD: [[-2.013, -0.781, 2.031, 0.552, 0.053, 0.962], [1.49, 0.048, 1.694, 0.076, -0.303, 0.184], [1.646, 0.043, 2.403, 0.082, 0.014, 0.773], [1.068, 0.187, 2.309, 0.672, -0.201, 0.291], [1.861, 1.412, 0.913, 0.343, -0.022, 0.312], [-0.111, -1.095, 0.386, 0.723, 0.064, 0.108]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_94_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_94_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.095, 0.369, 0.896, 1.864, 0.389, -0.037]]\nB: [[0.821, 1.024, 0.461, 1.589, 1.059, 0.417]]\nC: [[0.235, 0.419, 0.494, 1.232, 0.977, 0.5]]\nD: [[0.531, 0.805, 0.846, 1.569, 0.745, 0.229]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the counter in the scene. 
The camera pose information includes: the rotation matrix: [[0.882784, 0.25224, -0.396318], [0.469583, -0.498211, 0.728888], [-0.013595, -0.829554, -0.55826]]; the translation vector: [3.463734, 1.394934, 1.262723], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.095, 0.369, 0.896, 1.864, 0.389, -0.037]]\nB: [[0.821, 1.024, 0.461, 1.589, 1.059, 0.417]]\nC: [[0.235, 0.419, 0.494, 1.232, 0.977, 0.5]]\nD: [[0.531, 0.805, 0.846, 1.569, 0.745, 0.229]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_95_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_95_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.71, -0.427, 1.41, -0.084, 0.707, -0.208]]\nB: [[-1.108, -0.854, 1.201, 0.423, 0.471, 0.68]]\nC: [[-1.305, -0.718, 1.12, 0.437, 0.021, 0.653]]\nD: [[-1.106, -0.393, 0.937, 0.241, 0.317, 0.242]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the tray in the scene. The camera pose information includes: the rotation matrix: [[-0.998162, -0.007354, -0.06016], [0.055338, 0.294228, -0.954132], [0.024717, -0.955707, -0.293281]]; the translation vector: [1.687981, 4.43329, 1.569003], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.71, -0.427, 1.41, -0.084, 0.707, -0.208]]\nB: [[-1.108, -0.854, 1.201, 0.423, 0.471, 0.68]]\nC: [[-1.305, -0.718, 1.12, 0.437, 0.021, 0.653]]\nD: [[-1.106, -0.393, 0.937, 0.241, 0.317, 0.242]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_96_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_96_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.882, 1.56, 0.388, 0.537, 1.676, 0.722], [-0.93, 1.385, 0.286, 0.589, 0.589, 0.521], [-0.937, -1.858, 0.442, 0.583, 0.58, 0.542]]\nB: [[1.943, 1.267, 0.682, 0.711, 1.577, 0.374], [-1.208, 1.812, -0.196, 1.059, 0.169, 0.521], [-1.321, -1.601, 0.071, 0.85, 0.083, 0.59]]\nC: [[2.195, 1.182, 0.758, 0.43, 1.952, 0.35], [-1.23, 1.71, 0.54, 0.173, 0.389, 0.39], [-0.765, -1.788, 0.133, 0.882, 0.65, 0.803]]\nD: [[1.615, 1.264, -0.077, 0.87, 1.187, 0.662], [-0.905, 1.561, 0.641, 0.894, 0.612, 0.112], [-0.628, -2.319, 0.352, 0.102, 0.924, 0.919]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the table in the scene. The camera pose information includes: the rotation matrix: [[-0.530794, 0.426739, -0.732224], [0.841151, 0.159702, -0.516681], [-0.10355, -0.890162, -0.443721]]; the translation vector: [5.418979, 4.373359, 1.385162], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.882, 1.56, 0.388, 0.537, 1.676, 0.722], [-0.93, 1.385, 0.286, 0.589, 0.589, 0.521], [-0.937, -1.858, 0.442, 0.583, 0.58, 0.542]]\nB: [[1.943, 1.267, 0.682, 0.711, 1.577, 0.374], [-1.208, 1.812, -0.196, 1.059, 0.169, 0.521], [-1.321, -1.601, 0.071, 0.85, 0.083, 0.59]]\nC: [[2.195, 1.182, 0.758, 0.43, 1.952, 0.35], [-1.23, 1.71, 0.54, 0.173, 0.389, 0.39], [-0.765, -1.788, 0.133, 0.882, 0.65, 0.803]]\nD: [[1.615, 1.264, -0.077, 0.87, 1.187, 0.662], [-0.905, 1.561, 0.641, 0.894, 0.612, 0.112], [-0.628, -2.319, 0.352, 0.102, 0.924, 0.919]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_97_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_97_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.419, -1.385, 0.468, 0.603, 0.548, -0.002]]\nB: [[0.568, -0.742, 0.118, 0.677, 0.387, 0.514]]\nC: [[0.186, -1.693, 0.012, 1.09, 0.395, 0.456]]\nD: [[0.461, -1.208, 0.23, 0.711, 0.358, 0.459]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the toilet in the scene. The camera pose information includes: the rotation matrix: [[0.695296, -0.421579, 0.582095], [-0.717067, -0.351947, 0.601622], [-0.048765, -0.835707, -0.547007]]; the translation vector: [2.470866, 0.652559, 1.473924], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.419, -1.385, 0.468, 0.603, 0.548, -0.002]]\nB: [[0.568, -0.742, 0.118, 0.677, 0.387, 0.514]]\nC: [[0.186, -1.693, 0.012, 1.09, 0.395, 0.456]]\nD: [[0.461, -1.208, 0.23, 0.711, 0.358, 0.459]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_98_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_98_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.786, 0.016, 0.222, 1.217, 0.653, 0.431]]\nB: [[-1.037, -0.227, 0.31, 1.564, 0.876, 0.857]]\nC: [[-1.111, -0.46, 0.292, 1.65, 0.975, 0.105]]\nD: [[-0.725, -0.136, -0.167, 0.877, 0.479, 0.631]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the coffee table in the scene. The camera pose information includes: the rotation matrix: [[0.408988, -0.323891, 0.853126], [-0.912443, -0.158736, 0.37716], [0.013263, -0.932683, -0.360453]]; the translation vector: [3.672612, 2.990265, 1.494339], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.786, 0.016, 0.222, 1.217, 0.653, 0.431]]\nB: [[-1.037, -0.227, 0.31, 1.564, 0.876, 0.857]]\nC: [[-1.111, -0.46, 0.292, 1.65, 0.975, 0.105]]\nD: [[-0.725, -0.136, -0.167, 0.877, 0.479, 0.631]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_99_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_99_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.78, -0.369, 0.924, 0.56, 5.207, 1.466], [-1.469, 1.84, 1.529, 0.1, 3.46, 1.026], [1.603, 3.517, 0.975, 0.924, 0.045, 0.875], [-1.453, 3.239, 1.664, 0.75, -0.019, 1.939], [-1.132, -0.282, 1.01, 0.6, 1.619, 1.743], [-0.943, -1.737, 1.065, 0.938, -0.129, 1.932], [-0.462, -1.272, 0.976, 0.13, 0.263, 2.111], [-0.647, -3.37, 0.86, -0.005, 1.127, 1.632], [0.759, -3.788, 0.905, 0.472, 0.723, 1.787], [1.365, -2.765, 1.105, 0.336, 0.467, 1.277]]\nB: [[1.074, -0.516, 1.118, -0.086, 5.725, 2.158], [-1.396, 2.343, 1.47, -0.008, 3.692, 1.57], [1.315, 3.398, 1.101, 1.035, 0.612, 0.867], [-0.97, 3.198, 1.01, 0.853, 0.483, 1.503], [-1.961, -0.579, 0.799, 0.47, 0.959, 1.354], [-0.792, -1.093, 0.831, 0.98, 0.15, 1.422], [-0.764, -1.052, 0.538, 0.169, -0.099, 1.83], [-0.948, -3.534, 0.813, 0.512, 1.974, 2.262], [1.309, -3.86, 1.13, 0.074, 1.177, 0.95], [0.842, -2.685, 1.111, 0.225, 0.622, 1.342]]\nC: [[1.398, -0.078, 0.847, 0.238, 5.699, 1.741], [-1.453, 1.912, 1.74, 0.206, 3.243, 1.354], [1.514, 3.636, 0.972, 1.079, 0.266, 0.762], [-1.064, 3.584, 1.382, 0.689, 0.248, 1.654], [-1.552, -0.739, 0.879, 0.227, 1.257, 
1.692], [-1.211, -1.342, 0.86, 0.655, 0.096, 1.73], [-0.902, -1.484, 0.9, 0.087, 0.331, 1.816], [-0.874, -3.114, 1.006, 0.184, 1.508, 2.084], [0.921, -3.404, 0.668, 0.136, 1.137, 1.434], [1.157, -2.863, 0.703, 0.531, 0.128, 1.521]]\nD: [[1.025, -0.536, 0.699, 0.592, 5.958, 2.064], [-1.605, 1.792, 2.153, -0.235, 3.185, 1.084], [1.02, 3.68, 1.082, 1.526, 0.082, 0.582], [-1.08, 3.95, 0.986, 0.299, -0.139, 1.856], [-1.893, -0.998, 0.689, 0.259, 1.727, 1.918], [-1.034, -1.551, 0.605, 0.948, 0.46, 1.541], [-1.095, -1.908, 1.355, 0.164, 0.298, 1.555], [-0.914, -3.165, 0.928, -0.077, 1.779, 1.639], [0.568, -3.209, 0.575, 0.598, 1.246, 1.226], [1.226, -3.252, 0.43, 0.831, 0.263, 1.38]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.52463, -0.231347, 0.819293], [-0.850589, 0.102279, -0.515789], [0.03553, -0.96748, -0.25044]]; the translation vector: [5.897326, 2.792535, 1.553822], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.78, -0.369, 0.924, 0.56, 5.207, 1.466], [-1.469, 1.84, 1.529, 0.1, 3.46, 1.026], [1.603, 3.517, 0.975, 0.924, 0.045, 0.875], [-1.453, 3.239, 1.664, 0.75, -0.019, 1.939], [-1.132, -0.282, 1.01, 0.6, 1.619, 1.743], [-0.943, -1.737, 1.065, 0.938, -0.129, 1.932], [-0.462, -1.272, 0.976, 0.13, 0.263, 2.111], [-0.647, -3.37, 0.86, -0.005, 1.127, 1.632], [0.759, -3.788, 0.905, 0.472, 0.723, 1.787], [1.365, -2.765, 1.105, 0.336, 0.467, 1.277]]\nB: [[1.074, -0.516, 1.118, -0.086, 5.725, 2.158], [-1.396, 2.343, 1.47, -0.008, 3.692, 1.57], [1.315, 3.398, 1.101, 1.035, 0.612, 0.867], [-0.97, 3.198, 1.01, 0.853, 0.483, 1.503], [-1.961, -0.579, 0.799, 0.47, 0.959, 1.354], [-0.792, -1.093, 0.831, 0.98, 0.15, 1.422], [-0.764, -1.052, 0.538, 0.169, -0.099, 1.83], [-0.948, -3.534, 0.813, 0.512, 1.974, 2.262], [1.309, -3.86, 1.13, 0.074, 1.177, 0.95], [0.842, -2.685, 1.111, 0.225, 0.622, 1.342]]\nC: [[1.398, -0.078, 0.847, 0.238, 5.699, 1.741], [-1.453, 1.912, 1.74, 0.206, 3.243, 1.354], [1.514, 3.636, 0.972, 1.079, 0.266, 0.762], [-1.064, 3.584, 1.382, 0.689, 0.248, 1.654], [-1.552, -0.739, 0.879, 0.227, 1.257, 1.692], [-1.211, -1.342, 0.86, 0.655, 0.096, 1.73], [-0.902, -1.484, 0.9, 0.087, 0.331, 1.816], [-0.874, -3.114, 1.006, 0.184, 1.508, 2.084], [0.921, -3.404, 0.668, 0.136, 1.137, 1.434], [1.157, -2.863, 0.703, 0.531, 0.128, 1.521]]\nD: [[1.025, -0.536, 0.699, 0.592, 5.958, 2.064], [-1.605, 1.792, 2.153, -0.235, 3.185, 1.084], [1.02, 3.68, 1.082, 1.526, 0.082, 0.582], [-1.08, 3.95, 0.986, 0.299, -0.139, 1.856], [-1.893, -0.998, 0.689, 0.259, 1.727, 1.918], [-1.034, -1.551, 0.605, 0.948, 0.46, 1.541], [-1.095, -1.908, 1.355, 0.164, 0.298, 1.555], [-0.914, -3.165, 0.928, -0.077, 1.779, 1.639], [0.568, -3.209, 0.575, 0.598, 1.246, 1.226], [1.226, -3.252, 0.43, 0.831, 0.263, 1.38]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_100_0.jpg", + 
"../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_100_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.346, -1.632, 0.468, 0.228, 0.435, 0.463], [1.166, -1.379, 0.353, 0.04, 0.901, -0.25], [1.058, -1.308, -0.427, 0.416, 0.148, -0.261], [1.588, -1.671, -0.302, 0.296, 0.611, 0.478], [1.29, -1.513, 0.294, 0.468, 0.683, 0.487]]\nB: [[1.331, -1.83, 0.338, 0.317, 0.297, 0.192], [1.086, -1.365, 0.034, 0.38, 0.508, 0.129], [1.22, -1.567, 0.058, 0.382, 0.375, 0.145], [1.153, -2.04, 0.055, 0.29, 0.371, 0.11], [1.391, -1.481, 0.041, 0.386, 0.621, 0.13]]\nC: [[1.118, -1.374, 0.329, -0.089, 0.113, 0.27], [1.322, -1.418, -0.243, 0.677, 0.961, -0.031], [1.042, -1.495, -0.402, 0.189, 0.317, 0.229], [1.027, -2.005, 0.379, 0.337, 0.077, -0.062], [1.617, -1.294, 0.41, -0.08, 0.836, 0.171]]\nD: [[1.601, -1.491, 0.468, 0.181, 0.51, -0.093], [0.965, -1.654, 0.463, 0.875, 0.478, 0.252], [1.604, -1.87, -0.185, 0.098, 0.676, 0.612], [1.637, -2.272, -0.12, 0.307, 0.185, 0.124], [1.563, -1.727, 0.204, 0.781, 0.373, 0.021]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shoes in the scene. The camera pose information includes: the rotation matrix: [[-0.079656, -0.319192, 0.944337], [-0.994012, 0.096527, -0.051219], [-0.074805, -0.942762, -0.324969]]; the translation vector: [4.3352, 2.935251, 1.464921], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.346, -1.632, 0.468, 0.228, 0.435, 0.463], [1.166, -1.379, 0.353, 0.04, 0.901, -0.25], [1.058, -1.308, -0.427, 0.416, 0.148, -0.261], [1.588, -1.671, -0.302, 0.296, 0.611, 0.478], [1.29, -1.513, 0.294, 0.468, 0.683, 0.487]]\nB: [[1.331, -1.83, 0.338, 0.317, 0.297, 0.192], [1.086, -1.365, 0.034, 0.38, 0.508, 0.129], [1.22, -1.567, 0.058, 0.382, 0.375, 0.145], [1.153, -2.04, 0.055, 0.29, 0.371, 0.11], [1.391, -1.481, 0.041, 0.386, 0.621, 0.13]]\nC: [[1.118, -1.374, 0.329, -0.089, 0.113, 0.27], [1.322, -1.418, -0.243, 0.677, 0.961, -0.031], [1.042, -1.495, -0.402, 0.189, 0.317, 0.229], [1.027, -2.005, 0.379, 0.337, 0.077, -0.062], [1.617, -1.294, 0.41, -0.08, 0.836, 0.171]]\nD: [[1.601, -1.491, 0.468, 0.181, 0.51, -0.093], [0.965, -1.654, 0.463, 0.875, 0.478, 0.252], [1.604, -1.87, -0.185, 0.098, 0.676, 0.612], [1.637, -2.272, -0.12, 0.307, 0.185, 0.124], [1.563, -1.727, 0.204, 0.781, 0.373, 0.021]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_101_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_101_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.756, -0.319, 1.042, 0.429, 0.322, 0.448]]\nB: [[-0.968, 0.035, 0.911, 0.64, 0.607, 0.13]]\nC: [[-0.447, -0.667, 1.37, -0.048, 0.547, 0.66]]\nD: [[-0.868, -0.191, 0.853, 0.715, 0.451, 0.897]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the coffee maker in the scene. The camera pose information includes: the rotation matrix: [[-0.848489, -0.131122, 0.512712], [-0.527579, 0.133483, -0.838954], [0.041567, -0.982339, -0.182436]]; the translation vector: [2.702568, 1.718074, 1.602473], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.756, -0.319, 1.042, 0.429, 0.322, 0.448]]\nB: [[-0.968, 0.035, 0.911, 0.64, 0.607, 0.13]]\nC: [[-0.447, -0.667, 1.37, -0.048, 0.547, 0.66]]\nD: [[-0.868, -0.191, 0.853, 0.715, 0.451, 0.897]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_102_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_102_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.912, -2.332, 1.905, 0.812, 1.477, 1.176], [1.567, 2.82, 1.32, 0.327, 1.345, 2.178]]\nB: [[1.7, -2.606, 1.565, 0.678, 1.073, 1.573], [1.645, 3.122, 1.233, 0.724, 1.059, 2.428]]\nC: [[2.101, -2.524, 1.207, 0.883, 0.819, 1.727], [1.696, 3.351, 1.474, 0.479, 1.235, 2.058]]\nD: [[1.681, -2.688, 1.507, 0.221, 0.833, 1.776], [1.4, 2.653, 1.693, 1.075, 1.288, 2.071]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the cabinet in the scene. The camera pose information includes: the rotation matrix: [[0.606497, 0.359513, -0.709163], [0.793947, -0.321582, 0.515978], [-0.042553, -0.875977, -0.480473]]; the translation vector: [5.898605, 1.464963, 1.329018], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.912, -2.332, 1.905, 0.812, 1.477, 1.176], [1.567, 2.82, 1.32, 0.327, 1.345, 2.178]]\nB: [[1.7, -2.606, 1.565, 0.678, 1.073, 1.573], [1.645, 3.122, 1.233, 0.724, 1.059, 2.428]]\nC: [[2.101, -2.524, 1.207, 0.883, 0.819, 1.727], [1.696, 3.351, 1.474, 0.479, 1.235, 2.058]]\nD: [[1.681, -2.688, 1.507, 0.221, 0.833, 1.776], [1.4, 2.653, 1.693, 1.075, 1.288, 2.071]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_103_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_103_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.693, -1.201, 1.187, 0.828, 0.813, 1.996]]\nB: [[-1.283, -1.49, 1.157, 1.223, 0.635, 2.337]]\nC: [[-1.607, -1.608, 0.733, 1.415, 0.912, 2.422]]\nD: [[-1.367, -1.969, 1.373, 1.253, 1.096, 1.909]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the cabinets in the scene. The camera pose information includes: the rotation matrix: [[0.349467, 0.022881, -0.936669], [0.936944, -0.011774, 0.349282], [-0.003037, -0.999669, -0.025553]]; the translation vector: [3.08553, 2.787215, 1.609269], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.693, -1.201, 1.187, 0.828, 0.813, 1.996]]\nB: [[-1.283, -1.49, 1.157, 1.223, 0.635, 2.337]]\nC: [[-1.607, -1.608, 0.733, 1.415, 0.912, 2.422]]\nD: [[-1.367, -1.969, 1.373, 1.253, 1.096, 1.909]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_104_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_104_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.858, -0.632, 0.828, 0.126, 1.643, 1.687], [-1.33, 0.028, 0.915, 0.226, 2.888, 1.864], [-0.174, -1.42, 0.865, 2.224, 0.121, 1.722], [0.61, 1.413, 0.874, 4.003, 0.17, 1.77], [2.563, 1.11, 0.788, 0.118, 0.484, 1.649]]\nB: [[1.405, -0.208, 0.598, -0.114, 2.093, 1.602], [-1.061, 0.394, 1.019, -0.16, 3.193, 1.369], [-0.359, -0.986, 0.414, 1.802, -0.111, 1.429], [1.035, 1.154, 1.154, 3.812, 0.204, 2.113], [2.12, 1.579, 1.171, -0.054, 0.234, 1.478]]\nC: [[1.89, -0.153, 0.406, 0.028, 1.816, 1.93], [-1.451, -0.417, 1.393, -0.113, 3.307, 1.683], [-0.295, -1.25, 0.577, 1.985, -0.098, 1.447], [0.348, 1.382, 0.753, 3.885, 0.441, 1.993], [2.183, 0.625, 0.617, 0.117, 0.723, 1.324]]\nD: [[2.301, -0.62, 1.122, 0.26, 2.124, 2.126], [-0.834, -0.412, 1.071, -0.118, 2.484, 1.498], [-0.094, -1.494, 0.531, 2.098, -0.018, 2.208], [0.226, 1.164, 1.047, 4.422, 0.121, 1.595], [2.644, 1.556, 0.635, 0.354, 0.125, 1.662]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. 
The camera pose information includes: the rotation matrix: [[0.991592, 0.052224, -0.118397], [0.1292, -0.348306, 0.928435], [0.007248, -0.935925, -0.352124]]; the translation vector: [2.177373, 2.142725, 1.46728], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.858, -0.632, 0.828, 0.126, 1.643, 1.687], [-1.33, 0.028, 0.915, 0.226, 2.888, 1.864], [-0.174, -1.42, 0.865, 2.224, 0.121, 1.722], [0.61, 1.413, 0.874, 4.003, 0.17, 1.77], [2.563, 1.11, 0.788, 0.118, 0.484, 1.649]]\nB: [[1.405, -0.208, 0.598, -0.114, 2.093, 1.602], [-1.061, 0.394, 1.019, -0.16, 3.193, 1.369], [-0.359, -0.986, 0.414, 1.802, -0.111, 1.429], [1.035, 1.154, 1.154, 3.812, 0.204, 2.113], [2.12, 1.579, 1.171, -0.054, 0.234, 1.478]]\nC: [[1.89, -0.153, 0.406, 0.028, 1.816, 1.93], [-1.451, -0.417, 1.393, -0.113, 3.307, 1.683], [-0.295, -1.25, 0.577, 1.985, -0.098, 1.447], [0.348, 1.382, 0.753, 3.885, 0.441, 1.993], [2.183, 0.625, 0.617, 0.117, 0.723, 1.324]]\nD: [[2.301, -0.62, 1.122, 0.26, 2.124, 2.126], [-0.834, -0.412, 1.071, -0.118, 2.484, 1.498], [-0.094, -1.494, 0.531, 2.098, -0.018, 2.208], [0.226, 1.164, 1.047, 4.422, 0.121, 1.595], [2.644, 1.556, 0.635, 0.354, 0.125, 1.662]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_105_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_105_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + 
"options": "A: [[1.401, -1.054, 0.005, 0.193, -0.189, 0.608], [-1.764, -0.727, 0.562, 0.549, 0.36, 0.374], [-2.181, 0.328, -0.167, 0.351, 0.064, 0.119]]\nB: [[1.152, -0.296, 0.418, 0.864, 0.385, 0.356], [-2.324, -0.32, 0.424, 0.485, 0.66, -0.082], [-1.955, 0.121, 0.148, 0.369, 0.415, 0.131]]\nC: [[1.282, -0.743, 0.129, 0.493, 0.257, 0.293], [-1.968, -0.763, 0.156, 0.467, 0.241, 0.31], [-1.95, 0.267, 0.16, 0.231, 0.318, 0.302]]\nD: [[1.109, -0.73, -0.038, 0.564, 0.587, 0.172], [-2.259, -0.589, 0.46, 0.771, -0.144, -0.09], [-1.478, 0.494, 0.535, 0.374, 0.223, 0.643]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the trash can in the scene. The camera pose information includes: the rotation matrix: [[-0.789457, 0.162095, -0.592016], [0.613764, 0.197318, -0.764434], [-0.007096, -0.966846, -0.255262]]; the translation vector: [5.114759, 3.17533, 1.386193], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.401, -1.054, 0.005, 0.193, -0.189, 0.608], [-1.764, -0.727, 0.562, 0.549, 0.36, 0.374], [-2.181, 0.328, -0.167, 0.351, 0.064, 0.119]]\nB: [[1.152, -0.296, 0.418, 0.864, 0.385, 0.356], [-2.324, -0.32, 0.424, 0.485, 0.66, -0.082], [-1.955, 0.121, 0.148, 0.369, 0.415, 0.131]]\nC: [[1.282, -0.743, 0.129, 0.493, 0.257, 0.293], [-1.968, -0.763, 0.156, 0.467, 0.241, 0.31], [-1.95, 0.267, 0.16, 0.231, 0.318, 0.302]]\nD: [[1.109, -0.73, -0.038, 0.564, 0.587, 0.172], [-2.259, -0.589, 0.46, 0.771, -0.144, -0.09], [-1.478, 0.494, 0.535, 0.374, 0.223, 0.643]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_106_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_106_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.628, -0.574, 0.244, 0.629, 0.377, 0.613]]\nB: [[-0.255, -0.118, 0.331, 1.064, 0.829, 0.169]]\nC: [[-0.907, -0.799, 0.595, 1.106, -0.043, 0.376]]\nD: [[-0.149, -0.775, 0.103, 0.329, 0.393, 1.004]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the toilet in the scene. The camera pose information includes: the rotation matrix: [[-0.881415, -0.308012, 0.3581], [-0.47008, 0.646119, -0.601294], [-0.046169, -0.698325, -0.71429]]; the translation vector: [3.147524, 1.689608, 1.273114], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.628, -0.574, 0.244, 0.629, 0.377, 0.613]]\nB: [[-0.255, -0.118, 0.331, 1.064, 0.829, 0.169]]\nC: [[-0.907, -0.799, 0.595, 1.106, -0.043, 0.376]]\nD: [[-0.149, -0.775, 0.103, 0.329, 0.393, 1.004]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_107_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_107_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.984, -0.791, 0.71, -0.025, 0.146, 1.95], [-2.845, 0.826, 0.79, -0.018, 2.045, 2.096], [-3.127, 1.767, 1.058, 0.668, 0.751, 1.034], [-2.598, 2.204, 1.456, 0.337, 0.226, 0.622], [-1.681, 2.399, 0.33, 0.897, -0.155, 1.345], [-3.032, -0.879, 1.701, 0.543, 0.324, 1.619], [-2.871, -1.012, 1.112, 0.372, 0.525, 1.953], [-1.989, -1.505, 1.05, 1.013, 0.57, 2.119], [-2.342, -1.032, 1.573, 0.099, 0.665, 2.113], [-1.812, -1.549, 0.936, 0.03, 0.702, 1.331], [-0.897, -1.334, 1.032, -0.375, 0.838, 1.222], [1.073, -1.243, 1.222, 0.074, 0.103, 1.889], [0.684, -1.361, 0.842, 0.81, 0.134, 2.122], [1.681, -0.985, 0.915, -0.15, 0.046, 2.017], [1.499, -1.143, 1.225, 1.125, 0.255, 2.053], [2.518, -0.611, 1.251, 0.147, 0.376, 1.463], [2.78, -0.555, 1.623, 0.321, 0.5, 1.453], [2.607, 0.422, 0.93, 0.428, 2.768, 1.865], [2.929, 2.246, 1.615, 0.791, 0.45, 0.84], [2.894, 2.607, 0.679, 0.418, 0.185, 1.771], [1.91, 1.95, 0.631, 1.792, 0.498, 2.141], [0.968, 2.657, -0.02, 0.142, 0.513, 1.049], [2.739, 1.468, 0.752, 0.297, 1.124, 0.649], [1.999, -0.528, 0.795, 0.19, 0.625, 0.528]]\nB: [[-1.697, -1.101, 1.32, 0.519, 0.607, 2.31], [-2.793, 0.346, 1.419, 0.299, 1.423, 2.149], [-2.764, 1.638, 1.178, 0.604, 0.448, 0.518], [-2.94, 2.09, 1.372, -0.13, -0.272, 0.296], [-1.734, 2.804, 0.582, 1.396, 0.541, 0.939], [-2.549, -0.196, 1.12, 0.785, 0.411, 1.926], [-2.871, -1.014, 0.799, 0.56, 0.597, 1.935], [-2.659, 
-0.762, 1.356, 0.825, 0.021, 2.649], [-1.977, -1.011, 1.131, 0.465, 0.035, 2.324], [-1.526, -1.598, 1.392, 0.441, -0.118, 2.102], [-1.353, -0.868, 0.591, 0.125, 0.493, 1.476], [1.173, -1.254, 0.599, -0.335, 0.938, 1.499], [1.444, -1.618, 1.332, 0.376, 0.369, 1.68], [1.582, -1.255, 0.456, -0.034, -0.048, 2.138], [2.452, -1.152, 1.16, 0.41, -0.305, 2.162], [2.246, -1.101, 0.993, 0.065, 0.725, 2.256], [2.414, -0.99, 1.12, 0.836, 0.744, 1.026], [2.607, 0.594, 0.728, -0.103, 2.445, 1.796], [2.075, 1.78, 1.433, 0.826, 1.27, 1.569], [2.842, 2.47, 1.179, 0.437, 0.717, 1.714], [2.073, 1.959, 0.513, 1.293, -0.057, 1.28], [0.684, 2.546, 0.647, 0.281, 0.423, 0.403], [1.985, 2.256, 0.609, 0.323, 0.304, 0.186], [1.904, -0.439, 0.116, 0.205, 0.913, 1.076]]\nC: [[-0.882, -0.996, 0.828, 0.066, -0.03, 2.259], [-2.906, 0.486, 0.584, 0.338, 1.448, 2.228], [-2.335, 1.79, 1.402, 0.799, 0.604, 0.979], [-2.492, 2.696, 0.852, 0.385, -0.119, 0.551], [-2.113, 2.369, 0.634, 1.634, -0.378, 1.33], [-2.9, -0.382, 1.544, 0.229, 0.561, 1.896], [-2.46, -0.938, 0.92, 0.562, 0.836, 1.812], [-2.236, -1.122, 1.385, 0.806, -0.301, 1.756], [-1.859, -1.439, 0.978, -0.087, 0.007, 2.232], [-1.477, -1.605, 1.119, -0.203, 0.225, 1.352], [-0.558, -1.702, 0.427, 0.133, 0.668, 1.46], [0.818, -0.885, 1.161, 0.455, 0.101, 1.667], [0.552, -1.308, 0.707, 0.978, 0.615, 1.676], [2.109, -1.305, 1.008, -0.007, 0.224, 2.013], [2.016, -1.577, 1.004, 0.572, 0.061, 2.141], [1.754, -1.027, 1.286, 0.147, 0.165, 1.509], [2.849, -0.613, 0.987, 0.617, 1.099, 1.162], [2.281, 0.428, 1.287, 0.612, 2.792, 1.8], [2.1, 1.909, 1.627, 0.042, 0.641, 1.338], [2.025, 1.994, 0.97, 0.816, 0.372, 1.93], [1.678, 2.705, 1.241, 1.93, -0.063, 1.837], [0.582, 2.314, 0.279, 0.554, 0.013, 1.151], [2.245, 1.504, 0.631, 0.05, 1.008, 1.066], [2.09, -0.514, 0.622, -0.006, 1.061, 1.06]]\nD: [[-1.212, -1.13, 1.017, 0.465, 0.161, 2.011], [-2.56, 0.64, 0.971, 0.201, 1.804, 1.935], [-2.744, 1.914, 1.197, 0.349, 0.771, 0.66], [-2.606, 2.363, 1.087, 0.038, 
0.219, 0.424], [-1.931, 2.472, 0.667, 1.366, 0.094, 1.255], [-2.729, -0.603, 1.38, 0.39, 0.792, 1.541], [-2.531, -0.93, 1.084, 0.175, 0.507, 2.172], [-2.227, -1.13, 1.087, 0.723, 0.142, 2.167], [-1.887, -1.279, 1.074, 0.181, 0.413, 2.133], [-1.395, -1.301, 1.124, 0.117, 0.289, 1.814], [-0.99, -1.313, 0.763, 0.122, 0.477, 1.511], [0.768, -1.372, 0.865, 0.144, 0.573, 1.696], [0.958, -1.124, 0.866, 0.51, 0.163, 1.704], [1.687, -1.284, 0.89, 0.172, 0.422, 1.772], [1.97, -1.137, 0.897, 0.705, 0.139, 1.81], [2.237, -1.017, 0.895, 0.302, 0.335, 1.807], [2.506, -0.615, 1.189, 0.456, 0.783, 1.228], [2.295, 0.463, 0.865, 0.248, 2.384, 1.746], [2.549, 1.9, 1.178, 0.43, 0.874, 1.13], [2.396, 2.329, 0.87, 0.323, 0.269, 1.739], [1.621, 2.45, 0.875, 1.651, 0.189, 1.735], [0.789, 2.563, 0.425, 0.113, 0.177, 0.782], [2.336, 1.911, 0.338, 0.211, 0.688, 0.678], [2.287, -0.576, 0.338, 0.14, 0.728, 0.71]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.731293, 0.384445, -0.563394], [0.682011, 0.401944, -0.610984], [-0.008437, -0.831049, -0.556135]]; the translation vector: [5.176627, 2.209938, 1.427488], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.984, -0.791, 0.71, -0.025, 0.146, 1.95], [-2.845, 0.826, 0.79, -0.018, 2.045, 2.096], [-3.127, 1.767, 1.058, 0.668, 0.751, 1.034], [-2.598, 2.204, 1.456, 0.337, 0.226, 0.622], [-1.681, 2.399, 0.33, 0.897, -0.155, 1.345], [-3.032, -0.879, 1.701, 0.543, 0.324, 1.619], [-2.871, -1.012, 1.112, 0.372, 0.525, 1.953], [-1.989, -1.505, 1.05, 1.013, 0.57, 2.119], [-2.342, -1.032, 1.573, 0.099, 0.665, 2.113], [-1.812, -1.549, 0.936, 0.03, 0.702, 1.331], [-0.897, -1.334, 1.032, -0.375, 0.838, 1.222], [1.073, -1.243, 1.222, 0.074, 0.103, 1.889], [0.684, -1.361, 0.842, 0.81, 0.134, 2.122], [1.681, -0.985, 0.915, -0.15, 0.046, 2.017], [1.499, -1.143, 1.225, 1.125, 0.255, 2.053], [2.518, -0.611, 1.251, 0.147, 0.376, 1.463], [2.78, -0.555, 1.623, 0.321, 0.5, 1.453], [2.607, 0.422, 0.93, 0.428, 2.768, 1.865], [2.929, 2.246, 1.615, 0.791, 0.45, 0.84], [2.894, 2.607, 0.679, 0.418, 0.185, 1.771], [1.91, 1.95, 0.631, 1.792, 0.498, 2.141], [0.968, 2.657, -0.02, 0.142, 0.513, 1.049], [2.739, 1.468, 0.752, 0.297, 1.124, 0.649], [1.999, -0.528, 0.795, 0.19, 0.625, 0.528]]\nB: [[-1.697, -1.101, 1.32, 0.519, 0.607, 2.31], [-2.793, 0.346, 1.419, 0.299, 1.423, 2.149], [-2.764, 1.638, 1.178, 0.604, 0.448, 0.518], [-2.94, 2.09, 1.372, -0.13, -0.272, 0.296], [-1.734, 2.804, 0.582, 1.396, 0.541, 0.939], [-2.549, -0.196, 1.12, 0.785, 0.411, 1.926], [-2.871, -1.014, 0.799, 0.56, 0.597, 1.935], [-2.659, -0.762, 1.356, 0.825, 0.021, 2.649], [-1.977, -1.011, 1.131, 0.465, 0.035, 2.324], [-1.526, -1.598, 1.392, 0.441, -0.118, 2.102], [-1.353, -0.868, 0.591, 0.125, 0.493, 1.476], [1.173, -1.254, 0.599, -0.335, 0.938, 1.499], [1.444, -1.618, 1.332, 0.376, 0.369, 1.68], [1.582, -1.255, 0.456, -0.034, -0.048, 2.138], [2.452, -1.152, 1.16, 0.41, -0.305, 2.162], [2.246, -1.101, 0.993, 0.065, 0.725, 2.256], [2.414, -0.99, 1.12, 0.836, 0.744, 1.026], [2.607, 0.594, 0.728, -0.103, 2.445, 1.796], [2.075, 1.78, 1.433, 0.826, 1.27, 1.569], [2.842, 2.47, 1.179, 0.437, 
0.717, 1.714], [2.073, 1.959, 0.513, 1.293, -0.057, 1.28], [0.684, 2.546, 0.647, 0.281, 0.423, 0.403], [1.985, 2.256, 0.609, 0.323, 0.304, 0.186], [1.904, -0.439, 0.116, 0.205, 0.913, 1.076]]\nC: [[-0.882, -0.996, 0.828, 0.066, -0.03, 2.259], [-2.906, 0.486, 0.584, 0.338, 1.448, 2.228], [-2.335, 1.79, 1.402, 0.799, 0.604, 0.979], [-2.492, 2.696, 0.852, 0.385, -0.119, 0.551], [-2.113, 2.369, 0.634, 1.634, -0.378, 1.33], [-2.9, -0.382, 1.544, 0.229, 0.561, 1.896], [-2.46, -0.938, 0.92, 0.562, 0.836, 1.812], [-2.236, -1.122, 1.385, 0.806, -0.301, 1.756], [-1.859, -1.439, 0.978, -0.087, 0.007, 2.232], [-1.477, -1.605, 1.119, -0.203, 0.225, 1.352], [-0.558, -1.702, 0.427, 0.133, 0.668, 1.46], [0.818, -0.885, 1.161, 0.455, 0.101, 1.667], [0.552, -1.308, 0.707, 0.978, 0.615, 1.676], [2.109, -1.305, 1.008, -0.007, 0.224, 2.013], [2.016, -1.577, 1.004, 0.572, 0.061, 2.141], [1.754, -1.027, 1.286, 0.147, 0.165, 1.509], [2.849, -0.613, 0.987, 0.617, 1.099, 1.162], [2.281, 0.428, 1.287, 0.612, 2.792, 1.8], [2.1, 1.909, 1.627, 0.042, 0.641, 1.338], [2.025, 1.994, 0.97, 0.816, 0.372, 1.93], [1.678, 2.705, 1.241, 1.93, -0.063, 1.837], [0.582, 2.314, 0.279, 0.554, 0.013, 1.151], [2.245, 1.504, 0.631, 0.05, 1.008, 1.066], [2.09, -0.514, 0.622, -0.006, 1.061, 1.06]]\nD: [[-1.212, -1.13, 1.017, 0.465, 0.161, 2.011], [-2.56, 0.64, 0.971, 0.201, 1.804, 1.935], [-2.744, 1.914, 1.197, 0.349, 0.771, 0.66], [-2.606, 2.363, 1.087, 0.038, 0.219, 0.424], [-1.931, 2.472, 0.667, 1.366, 0.094, 1.255], [-2.729, -0.603, 1.38, 0.39, 0.792, 1.541], [-2.531, -0.93, 1.084, 0.175, 0.507, 2.172], [-2.227, -1.13, 1.087, 0.723, 0.142, 2.167], [-1.887, -1.279, 1.074, 0.181, 0.413, 2.133], [-1.395, -1.301, 1.124, 0.117, 0.289, 1.814], [-0.99, -1.313, 0.763, 0.122, 0.477, 1.511], [0.768, -1.372, 0.865, 0.144, 0.573, 1.696], [0.958, -1.124, 0.866, 0.51, 0.163, 1.704], [1.687, -1.284, 0.89, 0.172, 0.422, 1.772], [1.97, -1.137, 0.897, 0.705, 0.139, 1.81], [2.237, -1.017, 0.895, 0.302, 0.335, 1.807], [2.506, 
-0.615, 1.189, 0.456, 0.783, 1.228], [2.295, 0.463, 0.865, 0.248, 2.384, 1.746], [2.549, 1.9, 1.178, 0.43, 0.874, 1.13], [2.396, 2.329, 0.87, 0.323, 0.269, 1.739], [1.621, 2.45, 0.875, 1.651, 0.189, 1.735], [0.789, 2.563, 0.425, 0.113, 0.177, 0.782], [2.336, 1.911, 0.338, 0.211, 0.688, 0.678], [2.287, -0.576, 0.338, 0.14, 0.728, 0.71]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_108_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_108_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.196, -0.211, 0.68, 0.711, 0.576, 2.155]]\nB: [[-0.409, 0.533, 1.267, -0.113, 0.263, 1.631]]\nC: [[-0.799, 0.234, 0.962, 0.275, 0.234, 1.923]]\nD: [[-1.167, 0.457, 0.799, -0.179, 0.573, 2.357]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shower curtain in the scene. The camera pose information includes: the rotation matrix: [[-0.506976, -0.449046, 0.735753], [-0.861802, 0.247713, -0.442646], [0.016513, -0.858485, -0.512574]]; the translation vector: [1.568574, 4.423309, 1.333385], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.196, -0.211, 0.68, 0.711, 0.576, 2.155]]\nB: [[-0.409, 0.533, 1.267, -0.113, 0.263, 1.631]]\nC: [[-0.799, 0.234, 0.962, 0.275, 0.234, 1.923]]\nD: [[-1.167, 0.457, 0.799, -0.179, 0.573, 2.357]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_109_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_109_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.94, 1.68, 0.837, 0.663, 0.508, 0.307]]\nB: [[-1.567, 0.924, 0.596, -0.078, 0.24, 0.881]]\nC: [[-1.847, 1.274, 0.842, 0.196, 0.441, 0.778]]\nD: [[-2.041, 1.755, 1.288, 0.168, 0.884, 0.741]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[0.233902, -0.58763, 0.774584], [-0.967246, -0.059828, 0.246692], [-0.098622, -0.806915, -0.582377]]; the translation vector: [0.860343, 3.117731, 1.418568], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.94, 1.68, 0.837, 0.663, 0.508, 0.307]]\nB: [[-1.567, 0.924, 0.596, -0.078, 0.24, 0.881]]\nC: [[-1.847, 1.274, 0.842, 0.196, 0.441, 0.778]]\nD: [[-2.041, 1.755, 1.288, 0.168, 0.884, 0.741]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_110_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_110_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.505, -0.116, 0.747, 0.409, 0.695, 0.297], [-1.441, 0.909, 0.606, 0.695, 0.528, 0.24], [1.536, 0.64, 0.715, 0.483, 0.851, 0.175], [1.546, -0.374, 0.796, 0.374, 0.77, 0.35], [-1.45, 0.754, 0.484, 0.85, 0.766, 0.215]]\nB: [[1.607, 0.304, 0.83, 0.898, 0.58, 0.697], [-1.406, 0.619, 0.763, 0.933, 0.149, 0.108], [1.448, 0.861, 0.699, 0.254, 0.441, 0.026], [1.945, -0.851, 0.97, 0.08, 1.051, 0.781], [-1.319, 0.842, 0.31, 1.314, 0.811, 0.161]]\nC: [[1.451, -0.224, 1.202, 0.474, 0.259, 0.177], [-1.303, 1.145, 0.291, 1.141, 0.346, 0.272], [1.763, 0.401, 0.944, 0.92, 1.062, -0.044], [1.663, -0.056, 0.805, 0.848, 1.189, 0.211], [-1.93, 0.603, 0.76, 0.741, 0.586, -0.206]]\nD: [[1.888, 0.164, 1.08, 0.295, 0.332, 0.729], [-1.781, 1.348, 0.164, 0.674, 0.738, 0.722], [1.997, 0.742, 0.991, 0.029, 0.449, -0.1], [1.487, 0.076, 0.6, 0.156, 0.445, 0.145], [-1.75, 1.16, 0.275, 0.799, 1.235, 0.304]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the pillow in the scene. The camera pose information includes: the rotation matrix: [[0.484778, 0.389748, -0.782998], [0.874059, -0.248441, 0.417491], [-0.031813, -0.886777, -0.461102]]; the translation vector: [2.948564, 2.712566, 1.480667], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.505, -0.116, 0.747, 0.409, 0.695, 0.297], [-1.441, 0.909, 0.606, 0.695, 0.528, 0.24], [1.536, 0.64, 0.715, 0.483, 0.851, 0.175], [1.546, -0.374, 0.796, 0.374, 0.77, 0.35], [-1.45, 0.754, 0.484, 0.85, 0.766, 0.215]]\nB: [[1.607, 0.304, 0.83, 0.898, 0.58, 0.697], [-1.406, 0.619, 0.763, 0.933, 0.149, 0.108], [1.448, 0.861, 0.699, 0.254, 0.441, 0.026], [1.945, -0.851, 0.97, 0.08, 1.051, 0.781], [-1.319, 0.842, 0.31, 1.314, 0.811, 0.161]]\nC: [[1.451, -0.224, 1.202, 0.474, 0.259, 0.177], [-1.303, 1.145, 0.291, 1.141, 0.346, 0.272], [1.763, 0.401, 0.944, 0.92, 1.062, -0.044], [1.663, -0.056, 0.805, 0.848, 1.189, 0.211], [-1.93, 0.603, 0.76, 0.741, 0.586, -0.206]]\nD: [[1.888, 0.164, 1.08, 0.295, 0.332, 0.729], [-1.781, 1.348, 0.164, 0.674, 0.738, 0.722], [1.997, 0.742, 0.991, 0.029, 0.449, -0.1], [1.487, 0.076, 0.6, 0.156, 0.445, 0.145], [-1.75, 1.16, 0.275, 0.799, 1.235, 0.304]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_111_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_111_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.148, -1.819, 0.681, 1.73, 0.9, 0.465], [-1.44, 2.208, 0.821, 0.893, 2.303, 0.634], [0.765, 1.362, 0.255, 2.131, 1.052, 0.327], [-1.998, -1.691, 0.062, 1.757, 1.835, 0.718]]\nB: [[1.542, -1.233, 0.854, 2.268, 1.021, 0.755], [-2.098, 1.815, 0.076, 0.977, 1.531, 0.579], [1.499, 1.894, 0.799, 1.364, 1.243, 0.606], 
[-1.591, -1.777, -0.089, 1.375, 2.302, 0.818]]\nC: [[1.019, -1.513, 0.012, 1.939, 1.04, 0.603], [-1.397, 1.894, 0.192, 1.788, 2.263, 0.963], [0.794, 1.72, 0.728, 1.503, 1.344, 0.994], [-1.899, -1.035, 0.107, 1.802, 1.941, 0.705]]\nD: [[1.181, -1.566, 0.434, 1.91, 1.342, 0.847], [-1.636, 1.86, 0.387, 1.322, 1.894, 0.782], [1.234, 1.651, 0.4, 1.847, 1.393, 0.784], [-1.767, -1.535, 0.407, 1.331, 1.981, 0.802]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the table in the scene. The camera pose information includes: the rotation matrix: [[0.996822, -0.027813, -0.074656], [0.056495, -0.413943, 0.908548], [-0.056173, -0.909878, -0.411056]]; the translation vector: [4.405487, 5.403347, 1.494535], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.148, -1.819, 0.681, 1.73, 0.9, 0.465], [-1.44, 2.208, 0.821, 0.893, 2.303, 0.634], [0.765, 1.362, 0.255, 2.131, 1.052, 0.327], [-1.998, -1.691, 0.062, 1.757, 1.835, 0.718]]\nB: [[1.542, -1.233, 0.854, 2.268, 1.021, 0.755], [-2.098, 1.815, 0.076, 0.977, 1.531, 0.579], [1.499, 1.894, 0.799, 1.364, 1.243, 0.606], [-1.591, -1.777, -0.089, 1.375, 2.302, 0.818]]\nC: [[1.019, -1.513, 0.012, 1.939, 1.04, 0.603], [-1.397, 1.894, 0.192, 1.788, 2.263, 0.963], [0.794, 1.72, 0.728, 1.503, 1.344, 0.994], [-1.899, -1.035, 0.107, 1.802, 1.941, 0.705]]\nD: [[1.181, -1.566, 0.434, 1.91, 1.342, 0.847], [-1.636, 1.86, 0.387, 1.322, 1.894, 0.782], [1.234, 1.651, 0.4, 1.847, 1.393, 0.784], [-1.767, -1.535, 0.407, 1.331, 1.981, 0.802]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_112_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_112_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[2.632, 2.861, 0.973, 0.851, 0.88, 0.553]]\nB: [[2.217, 3.039, 0.859, 0.578, 0.679, 0.811]]\nC: [[2.372, 2.508, 1.395, 0.466, 0.758, 0.941]]\nD: [[2.418, 3.313, 1.363, 0.462, 1.217, 0.869]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the tv in the scene. The camera pose information includes: the rotation matrix: [[-0.869565, 0.231948, -0.435955], [0.492522, 0.471291, -0.731647], [0.035758, -0.850932, -0.524058]]; the translation vector: [2.750575, 3.154689, 1.290553], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[2.632, 2.861, 0.973, 0.851, 0.88, 0.553]]\nB: [[2.217, 3.039, 0.859, 0.578, 0.679, 0.811]]\nC: [[2.372, 2.508, 1.395, 0.466, 0.758, 0.941]]\nD: [[2.418, 3.313, 1.363, 0.462, 1.217, 0.869]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_113_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_113_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.523, -0.73, 1.669, 0.473, 3.389, 1.19]]\nB: [[-2.737, -0.956, 1.441, 0.102, 2.891, 0.9]]\nC: [[-2.415, -1.042, 1.71, -0.167, 2.518, 1.307]]\nD: [[-3.121, -1.319, 1.73, 0.166, 2.406, 0.532]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the board in the scene. The camera pose information includes: the rotation matrix: [[0.896132, -0.052356, 0.440688], [-0.436974, -0.277444, 0.855616], [0.07747, -0.959314, -0.271505]]; the translation vector: [3.211431, 3.110947, 1.584554], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.523, -0.73, 1.669, 0.473, 3.389, 1.19]]\nB: [[-2.737, -0.956, 1.441, 0.102, 2.891, 0.9]]\nC: [[-2.415, -1.042, 1.71, -0.167, 2.518, 1.307]]\nD: [[-3.121, -1.319, 1.73, 0.166, 2.406, 0.532]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_114_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_114_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.919, 2.881, 0.5, 1.076, 0.198, 0.512], [-0.387, 3.021, 0.763, 1.206, 0.18, 1.044]]\nB: [[0.967, 3.235, 0.454, 1.103, -0.268, 0.912], [-0.093, 2.837, 0.491, 1.606, 0.643, 1.265]]\nC: [[1.146, 2.813, 0.895, 1.333, -0.231, 0.884], [-0.108, 2.697, 0.646, 1.144, -0.245, 0.801]]\nD: [[1.405, 2.769, 0.583, 0.816, -0.053, 0.839], [-0.646, 2.953, 0.434, 1.464, 0.436, 0.68]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the mirror in the scene. The camera pose information includes: the rotation matrix: [[-0.880278, -0.246293, 0.405524], [-0.473973, 0.417832, -0.775091], [0.021459, -0.874503, -0.484545]]; the translation vector: [3.281806, 2.754624, 1.352781], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.919, 2.881, 0.5, 1.076, 0.198, 0.512], [-0.387, 3.021, 0.763, 1.206, 0.18, 1.044]]\nB: [[0.967, 3.235, 0.454, 1.103, -0.268, 0.912], [-0.093, 2.837, 0.491, 1.606, 0.643, 1.265]]\nC: [[1.146, 2.813, 0.895, 1.333, -0.231, 0.884], [-0.108, 2.697, 0.646, 1.144, -0.245, 0.801]]\nD: [[1.405, 2.769, 0.583, 0.816, -0.053, 0.839], [-0.646, 2.953, 0.434, 1.464, 0.436, 0.68]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_115_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_115_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.099, -1.623, 0.8, 1.091, 0.185, 1.674]]\nB: [[0.028, -1.324, 1.283, 0.847, -0.251, 1.976]]\nC: [[0.008, -1.165, 1.014, 1.132, -0.028, 1.19]]\nD: [[0.219, -1.325, 0.313, 1.01, 0.321, 1.757]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the doorframe in the scene. The camera pose information includes: the rotation matrix: [[-0.874867, -0.0675, 0.479638], [-0.482919, 0.197999, -0.852987], [-0.037391, -0.977875, -0.205819]]; the translation vector: [2.397274, 1.722858, 1.486845], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.099, -1.623, 0.8, 1.091, 0.185, 1.674]]\nB: [[0.028, -1.324, 1.283, 0.847, -0.251, 1.976]]\nC: [[0.008, -1.165, 1.014, 1.132, -0.028, 1.19]]\nD: [[0.219, -1.325, 0.313, 1.01, 0.321, 1.757]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_116_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_116_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.712, -1.245, 0.91, 1.048, 1.199, 2.013]]\nB: [[0.626, -1.611, 1.221, 1.09, 1.245, 2.069]]\nC: [[1.138, -1.446, 0.77, 0.846, 1.373, 1.96]]\nD: [[0.371, -1.441, 0.499, 0.655, 1.441, 2.321]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shower in the scene. The camera pose information includes: the rotation matrix: [[-0.612656, -0.411508, 0.674769], [-0.789543, 0.280105, -0.546043], [0.035694, -0.867296, -0.496511]]; the translation vector: [1.897828, 2.372103, 1.388776], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.712, -1.245, 0.91, 1.048, 1.199, 2.013]]\nB: [[0.626, -1.611, 1.221, 1.09, 1.245, 2.069]]\nC: [[1.138, -1.446, 0.77, 0.846, 1.373, 1.96]]\nD: [[0.371, -1.441, 0.499, 0.655, 1.441, 2.321]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_117_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_117_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.851, -0.281, 1.012, 0.232, 0.838, 2.123]]\nB: [[-0.647, 0.167, 1.047, -0.111, 0.572, 1.688]]\nC: [[-0.968, -0.496, 1.046, -0.014, 1.192, 1.751]]\nD: [[-0.616, -0.07, 1.075, 0.231, 1.203, 1.991]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the doorframe in the scene. The camera pose information includes: the rotation matrix: [[-0.48142, 0.335029, -0.809933], [0.872625, 0.096524, -0.478757], [-0.08222, -0.937251, -0.338823]]; the translation vector: [4.429162, 2.287411, 1.464776], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.851, -0.281, 1.012, 0.232, 0.838, 2.123]]\nB: [[-0.647, 0.167, 1.047, -0.111, 0.572, 1.688]]\nC: [[-0.968, -0.496, 1.046, -0.014, 1.192, 1.751]]\nD: [[-0.616, -0.07, 1.075, 0.231, 1.203, 1.991]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_118_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_118_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.76, 1.613, 0.501, 0.748, 1.477, 2.283], [-1.256, 0.486, 0.695, 0.261, -0.004, 1.392]]\nB: [[1.66, 0.843, 1.041, 1.024, 1.548, 1.575], [-0.68, 1.177, 0.879, 0.467, 0.635, 2.319]]\nC: [[1.906, 1.059, 1.056, 0.263, 1.047, 1.4], [-0.793, 1.238, 0.654, 0.903, 0.438, 1.901]]\nD: [[1.788, 1.153, 0.954, 0.56, 1.154, 1.881], [-0.939, 0.896, 0.911, 0.636, 0.225, 1.837]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. The camera pose information includes: the rotation matrix: [[-0.414473, -0.491559, 0.765887], [-0.909569, 0.196057, -0.366396], [0.029948, -0.848488, -0.528367]]; the translation vector: [0.955419, 3.497842, 1.497559], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.76, 1.613, 0.501, 0.748, 1.477, 2.283], [-1.256, 0.486, 0.695, 0.261, -0.004, 1.392]]\nB: [[1.66, 0.843, 1.041, 1.024, 1.548, 1.575], [-0.68, 1.177, 0.879, 0.467, 0.635, 2.319]]\nC: [[1.906, 1.059, 1.056, 0.263, 1.047, 1.4], [-0.793, 1.238, 0.654, 0.903, 0.438, 1.901]]\nD: [[1.788, 1.153, 0.954, 0.56, 1.154, 1.881], [-0.939, 0.896, 0.911, 0.636, 0.225, 1.837]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_119_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_119_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.054, 1.184, 0.861, 1.846, 0.937, 1.341]]\nB: [[0.486, 0.802, 0.412, 1.751, 1.322, 0.856]]\nC: [[0.138, 0.31, 0.136, 1.361, 1.636, 1.27]]\nD: [[0.461, 1.003, 0.863, 1.591, 0.946, 0.97]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the bed in the scene. The camera pose information includes: the rotation matrix: [[-0.778266, 0.076502, -0.623257], [0.626532, 0.028295, -0.778882], [-0.041951, -0.996668, -0.069952]]; the translation vector: [4.354075, 2.27787, 1.510689], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.054, 1.184, 0.861, 1.846, 0.937, 1.341]]\nB: [[0.486, 0.802, 0.412, 1.751, 1.322, 0.856]]\nC: [[0.138, 0.31, 0.136, 1.361, 1.636, 1.27]]\nD: [[0.461, 1.003, 0.863, 1.591, 0.946, 0.97]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_120_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_120_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.119, 2.382, -1.353, 2.688, 3.135, 0.067], [0.13, -2.518, 2.241, 3.948, 1.36, 0.679], [-0.552, 1.97, 3.469, 0.872, 1.36, 0.237]]\nB: [[1.56, 2.895, -0.877, 2.03, 3.162, 0.481], [-0.012, -2.383, 2.434, 3.863, 1.73, 0.8], [-1.053, 2.303, 3.432, 0.863, 1.12, -0.287]]\nC: [[1.156, 2.743, -1.086, 2.211, 3.278, 0.076], [-0.143, -2.063, 2.035, 4.283, 1.757, 0.379], [-1.038, 2.35, 3.4, 1.326, 1.515, 0.161]]\nD: [[1.559, 2.394, -0.855, 1.997, 3.635, -0.357], [-0.381, -2.532, 1.718, 3.949, 1.906, 0.055], [-0.752, 2.698, 2.911, 0.92, 1.137, -0.299]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. The camera pose information includes: the rotation matrix: [[0.485844, -0.617081, 0.619005], [-0.873216, -0.311825, 0.374512], [-0.038083, -0.722479, -0.690343]]; the translation vector: [-0.164865, 3.073333, 1.323993], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.119, 2.382, -1.353, 2.688, 3.135, 0.067], [0.13, -2.518, 2.241, 3.948, 1.36, 0.679], [-0.552, 1.97, 3.469, 0.872, 1.36, 0.237]]\nB: [[1.56, 2.895, -0.877, 2.03, 3.162, 0.481], [-0.012, -2.383, 2.434, 3.863, 1.73, 0.8], [-1.053, 2.303, 3.432, 0.863, 1.12, -0.287]]\nC: [[1.156, 2.743, -1.086, 2.211, 3.278, 0.076], [-0.143, -2.063, 2.035, 4.283, 1.757, 0.379], [-1.038, 2.35, 3.4, 1.326, 1.515, 0.161]]\nD: [[1.559, 2.394, -0.855, 1.997, 3.635, -0.357], [-0.381, -2.532, 1.718, 3.949, 1.906, 0.055], [-0.752, 2.698, 2.911, 0.92, 1.137, -0.299]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_121_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_121_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.806, 0.963, 2.18, -0.15, 0.284, 0.747], [1.781, 1.911, 0.25, 0.639, -0.36, 0.423], [0.587, 2.601, 0.863, 0.066, -0.232, 0.753], [-0.114, 0.242, -0.467, -0.049, 0.569, 0.595], [1.888, 2.229, 0.499, -0.382, 0.046, 0.309], [1.509, 1.846, 0.444, -0.228, 0.075, -0.285]]\nB: [[2.196, 0.809, 1.96, 0.418, -0.322, 0.279], [1.469, 1.583, 0.002, 0.664, -0.01, 0.038], [-0.066, 2.119, 1.533, 0.754, 0.595, 0.72], [0.503, 0.542, -0.003, 0.788, 0.94, -0.045], [2.066, 1.491, 0.9, -0.225, 0.433, 0.456], [1.952, 1.903, 0.373, -0.138, 0.52, 0.69]]\nC: [[2.185, 1.438, 1.612, -0.033, 0.189, 0.171], [2.033, 1.77, 0.709, 0.481, 0.536, -0.285], [-0.228, 2.34, 1.714, 0.595, 0.3, -0.06], [-0.139, 0.192, -0.291, 0.431, 0.48, -0.343], [2.39, 1.84, 0.691, -0.012, 0.252, 0.48], [1.891, 1.81, 0.417, -0.052, -0.296, 0.438]]\nD: [[2.211, 1.285, 1.775, 0.127, 0.174, 0.292], [1.829, 1.683, 0.248, 0.278, 0.134, 0.131], [0.255, 2.241, 1.304, 0.333, 0.221, 0.253], [0.094, 0.321, -0.047, 0.34, 0.473, 0.108], [1.975, 1.944, 0.507, 0.101, 0.048, 0.175], [1.799, 1.959, 0.282, 
0.261, 0.114, 0.195]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[-0.877021, 0.121711, -0.464779], [0.46491, 0.459041, -0.75706], [0.12121, -0.880038, -0.459173]]; the translation vector: [3.922419, 3.230202, 1.747047], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.806, 0.963, 2.18, -0.15, 0.284, 0.747], [1.781, 1.911, 0.25, 0.639, -0.36, 0.423], [0.587, 2.601, 0.863, 0.066, -0.232, 0.753], [-0.114, 0.242, -0.467, -0.049, 0.569, 0.595], [1.888, 2.229, 0.499, -0.382, 0.046, 0.309], [1.509, 1.846, 0.444, -0.228, 0.075, -0.285]]\nB: [[2.196, 0.809, 1.96, 0.418, -0.322, 0.279], [1.469, 1.583, 0.002, 0.664, -0.01, 0.038], [-0.066, 2.119, 1.533, 0.754, 0.595, 0.72], [0.503, 0.542, -0.003, 0.788, 0.94, -0.045], [2.066, 1.491, 0.9, -0.225, 0.433, 0.456], [1.952, 1.903, 0.373, -0.138, 0.52, 0.69]]\nC: [[2.185, 1.438, 1.612, -0.033, 0.189, 0.171], [2.033, 1.77, 0.709, 0.481, 0.536, -0.285], [-0.228, 2.34, 1.714, 0.595, 0.3, -0.06], [-0.139, 0.192, -0.291, 0.431, 0.48, -0.343], [2.39, 1.84, 0.691, -0.012, 0.252, 0.48], [1.891, 1.81, 0.417, -0.052, -0.296, 0.438]]\nD: [[2.211, 1.285, 1.775, 0.127, 0.174, 0.292], [1.829, 1.683, 0.248, 0.278, 0.134, 0.131], [0.255, 2.241, 1.304, 0.333, 0.221, 0.253], [0.094, 0.321, -0.047, 0.34, 0.473, 0.108], [1.975, 1.944, 0.507, 0.101, 0.048, 0.175], [1.799, 1.959, 0.282, 0.261, 0.114, 0.195]]", + "input_image_path": [ + 
"../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_122_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_122_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.687, 1.332, 0.035, 0.175, 2.444, 0.931], [-0.771, -0.087, 1.874, 0.351, 2.708, 1.076], [1.073, -0.054, 0.17, 0.668, 1.99, 0.53], [0.962, 0.659, 2.221, 0.281, 1.867, 1.287]]\nB: [[-0.793, 1.344, 0.264, -0.144, 2.319, 0.303], [-0.892, -0.228, 1.276, 0.428, 2.505, 1.37], [0.859, 0.126, 0.332, 0.439, 1.529, 0.603], [0.425, 0.012, 2.016, 0.908, 2.008, 0.841]]\nC: [[-1.133, 0.424, 0.86, -0.054, 2.382, 0.943], [-1.282, -0.466, 1.739, 0.288, 2.29, 1.182], [0.76, 0.578, 0.124, 0.797, 1.631, 0.597], [0.974, 0.003, 1.857, 0.274, 1.983, 0.737]]\nD: [[-0.66, 0.92, 0.369, 0.068, 2.758, 0.803], [-0.938, -0.063, 1.743, 0.134, 2.421, 0.981], [0.672, 0.378, 0.36, 0.646, 1.681, 0.828], [0.776, 0.348, 1.743, 0.449, 1.71, 0.968]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the kitchen cabinets in the scene. The camera pose information includes: the rotation matrix: [[0.815869, 0.244354, -0.524069], [0.578211, -0.336271, 0.743367], [0.005416, -0.909513, -0.415641]]; the translation vector: [2.358014, 1.230078, 1.369842], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.687, 1.332, 0.035, 0.175, 2.444, 0.931], [-0.771, -0.087, 1.874, 0.351, 2.708, 1.076], [1.073, -0.054, 0.17, 0.668, 1.99, 0.53], [0.962, 0.659, 2.221, 0.281, 1.867, 1.287]]\nB: [[-0.793, 1.344, 0.264, -0.144, 2.319, 0.303], [-0.892, -0.228, 1.276, 0.428, 2.505, 1.37], [0.859, 0.126, 0.332, 0.439, 1.529, 0.603], [0.425, 0.012, 2.016, 0.908, 2.008, 0.841]]\nC: [[-1.133, 0.424, 0.86, -0.054, 2.382, 0.943], [-1.282, -0.466, 1.739, 0.288, 2.29, 1.182], [0.76, 0.578, 0.124, 0.797, 1.631, 0.597], [0.974, 0.003, 1.857, 0.274, 1.983, 0.737]]\nD: [[-0.66, 0.92, 0.369, 0.068, 2.758, 0.803], [-0.938, -0.063, 1.743, 0.134, 2.421, 0.981], [0.672, 0.378, 0.36, 0.646, 1.681, 0.828], [0.776, 0.348, 1.743, 0.449, 1.71, 0.968]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_123_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_123_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.177, -0.077, 0.773, 0.974, 8.45, 1.181]]\nB: [[1.408, -0.085, 0.96, 1.256, 8.826, 1.391]]\nC: [[1.29, 0.138, 0.989, 1.682, 8.854, 1.495]]\nD: [[1.087, -0.264, 0.505, 1.705, 9.131, 0.904]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the blinds in the scene. The camera pose information includes: the rotation matrix: [[0.117057, -0.769276, 0.628102], [-0.987232, -0.021336, 0.157855], [-0.108033, -0.638561, -0.761951]]; the translation vector: [1.032686, 1.226834, 2.186959], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.177, -0.077, 0.773, 0.974, 8.45, 1.181]]\nB: [[1.408, -0.085, 0.96, 1.256, 8.826, 1.391]]\nC: [[1.29, 0.138, 0.989, 1.682, 8.854, 1.495]]\nD: [[1.087, -0.264, 0.505, 1.705, 9.131, 0.904]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_124_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_124_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.333, 1.449, 0.466, 0.575, 0.885, 0.579], [-0.819, 0.179, 1.256, -0.187, 0.08, 0.261], [-0.881, -0.764, 1.817, 0.04, 0.55, 0.119], [-0.782, -0.821, 1.039, -0.465, -0.079, -0.041]]\nB: [[0.913, 1.406, 0.914, 0.154, 0.729, 0.951], [-0.918, 0.236, 1.614, 0.027, 0.343, 0.415], [-0.932, -0.471, 1.376, 0.043, 0.42, 0.318], [-0.937, -1.266, 1.202, 0.021, 0.397, 0.404]]\nC: [[0.638, 1.511, 1.273, 0.574, 0.958, 0.746], [-1.165, 0.389, 1.897, 0.474, -0.02, 0.527], [-0.474, 0.021, 1.802, 0.289, 0.006, -0.062], [-1.35, -1.672, 1.153, 0.07, 0.246, 0.557]]\nD: [[0.615, 1.775, 1.082, 0.394, 0.94, 1.366], [-0.883, -0.231, 1.634, -0.385, 0.134, 0.914], [-0.757, -0.827, 1.097, 0.253, 0.741, 0.546], [-1.013, -1.459, 1.475, -0.37, 0.862, 0.783]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the picture in the scene. 
The camera pose information includes: the rotation matrix: [[-0.042655, 0.409797, -0.911179], [0.998036, -0.024411, -0.0577], [-0.045888, -0.91185, -0.40795]]; the translation vector: [2.423933, 1.356295, 3.282493], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.333, 1.449, 0.466, 0.575, 0.885, 0.579], [-0.819, 0.179, 1.256, -0.187, 0.08, 0.261], [-0.881, -0.764, 1.817, 0.04, 0.55, 0.119], [-0.782, -0.821, 1.039, -0.465, -0.079, -0.041]]\nB: [[0.913, 1.406, 0.914, 0.154, 0.729, 0.951], [-0.918, 0.236, 1.614, 0.027, 0.343, 0.415], [-0.932, -0.471, 1.376, 0.043, 0.42, 0.318], [-0.937, -1.266, 1.202, 0.021, 0.397, 0.404]]\nC: [[0.638, 1.511, 1.273, 0.574, 0.958, 0.746], [-1.165, 0.389, 1.897, 0.474, -0.02, 0.527], [-0.474, 0.021, 1.802, 0.289, 0.006, -0.062], [-1.35, -1.672, 1.153, 0.07, 0.246, 0.557]]\nD: [[0.615, 1.775, 1.082, 0.394, 0.94, 1.366], [-0.883, -0.231, 1.634, -0.385, 0.134, 0.914], [-0.757, -0.827, 1.097, 0.253, 0.741, 0.546], [-1.013, -1.459, 1.475, -0.37, 0.862, 0.783]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_125_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_125_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.696, 2.84, 0.243, 0.913, 0.14, 0.965], [-2.524, -1.89, 1.517, 0.263, 1.169, 1.097]]\nB: [[-2.032, 3.081, 0.673, 0.874, 0.207, 1.282], [-2.435, -2.167, 
1.207, 0.214, 0.953, 0.8]]\nC: [[-2.083, 2.817, 0.915, 0.407, -0.083, 1.119], [-2.185, -1.778, 0.77, 0.561, 0.888, 0.902]]\nD: [[-1.79, 3.485, 0.577, 0.547, 0.315, 1.286], [-2.516, -2.509, 1.071, 0.577, 1.197, 0.616]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the window in the scene. The camera pose information includes: the rotation matrix: [[0.299058, 0.37418, -0.877812], [0.95368, -0.085842, 0.288314], [0.032528, -0.923375, -0.38252]]; the translation vector: [3.908031, 4.993837, 1.41318], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.696, 2.84, 0.243, 0.913, 0.14, 0.965], [-2.524, -1.89, 1.517, 0.263, 1.169, 1.097]]\nB: [[-2.032, 3.081, 0.673, 0.874, 0.207, 1.282], [-2.435, -2.167, 1.207, 0.214, 0.953, 0.8]]\nC: [[-2.083, 2.817, 0.915, 0.407, -0.083, 1.119], [-2.185, -1.778, 0.77, 0.561, 0.888, 0.902]]\nD: [[-1.79, 3.485, 0.577, 0.547, 0.315, 1.286], [-2.516, -2.509, 1.071, 0.577, 1.197, 0.616]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_126_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_126_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.147, 0.119, 0.251, 0.463, 0.502, 0.493], [1.142, -0.546, 0.997, 0.457, 0.597, 0.473], [1.198, 0.632, 0.925, 0.452, 0.473, 0.432], [1.163, 0.092, 1.069, 0.44, 0.432, 0.506]]\nB: [[0.939, -0.362, 0.676, 0.67, 0.041, 0.58], [0.766, -0.402, 0.786, 0.189, 1.052, 0.915], [1.684, 0.428, 1.283, 0.635, 0.353, 0.864], [1.275, -0.104, 1.385, 0.008, 0.054, 0.956]]\nC: [[1.248, 0.165, 0.549, 0.255, 0.722, 0.454], [1.139, -0.967, 1.065, 0.247, 0.425, 0.531], [0.839, 1.106, 1.224, 0.271, 0.846, 0.671], [0.954, 0.329, 1.422, 0.774, 0.624, 0.313]]\nD: [[1.328, 0.233, 0.409, 0.859, 0.672, 0.071], [1.492, -0.434, 0.743, 0.731, 0.907, 0.382], [1.626, 0.478, 0.601, 0.312, 0.631, 0.904], [1.629, 0.385, 0.684, 0.845, 0.492, 0.801]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the printer in the scene. The camera pose information includes: the rotation matrix: [[0.985254, -0.134646, 0.105573], [-0.142287, -0.302097, 0.942599], [-0.095024, -0.94372, -0.3168]]; the translation vector: [1.134605, 1.549487, 1.505245], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.147, 0.119, 0.251, 0.463, 0.502, 0.493], [1.142, -0.546, 0.997, 0.457, 0.597, 0.473], [1.198, 0.632, 0.925, 0.452, 0.473, 0.432], [1.163, 0.092, 1.069, 0.44, 0.432, 0.506]]\nB: [[0.939, -0.362, 0.676, 0.67, 0.041, 0.58], [0.766, -0.402, 0.786, 0.189, 1.052, 0.915], [1.684, 0.428, 1.283, 0.635, 0.353, 0.864], [1.275, -0.104, 1.385, 0.008, 0.054, 0.956]]\nC: [[1.248, 0.165, 0.549, 0.255, 0.722, 0.454], [1.139, -0.967, 1.065, 0.247, 0.425, 0.531], [0.839, 1.106, 1.224, 0.271, 0.846, 0.671], [0.954, 0.329, 1.422, 0.774, 0.624, 0.313]]\nD: [[1.328, 0.233, 0.409, 0.859, 0.672, 0.071], [1.492, -0.434, 0.743, 0.731, 0.907, 0.382], [1.626, 0.478, 0.601, 0.312, 0.631, 0.904], [1.629, 0.385, 0.684, 0.845, 0.492, 0.801]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_127_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_127_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.978, 2.218, 0.88, 0.413, 0.1, 0.702], [-1.917, 2.1, 1.145, 0.538, 0.288, 0.643]]\nB: [[-1.584, 2.193, 0.205, 0.199, 0.268, 0.839], [-1.535, 2.333, 0.994, 0.342, 0.187, 0.134]]\nC: [[-1.966, 2.066, 0.622, 0.287, 0.189, 0.88], [-1.737, 2.041, 0.848, 0.173, 0.149, 0.382]]\nD: [[-1.998, 2.157, 0.963, 0.629, -0.078, 1.235], [-1.653, 2.214, 0.646, 0.156, 0.285, 0.243]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the towel in the scene. 
The camera pose information includes: the rotation matrix: [[0.686341, -0.358824, 0.632599], [-0.727213, -0.35045, 0.590209], [0.009912, -0.865119, -0.50147]]; the translation vector: [2.486494, 4.601647, 1.455454], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.978, 2.218, 0.88, 0.413, 0.1, 0.702], [-1.917, 2.1, 1.145, 0.538, 0.288, 0.643]]\nB: [[-1.584, 2.193, 0.205, 0.199, 0.268, 0.839], [-1.535, 2.333, 0.994, 0.342, 0.187, 0.134]]\nC: [[-1.966, 2.066, 0.622, 0.287, 0.189, 0.88], [-1.737, 2.041, 0.848, 0.173, 0.149, 0.382]]\nD: [[-1.998, 2.157, 0.963, 0.629, -0.078, 1.235], [-1.653, 2.214, 0.646, 0.156, 0.285, 0.243]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_128_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_128_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.047, -0.588, -0.114, 0.759, 2.976, 0.961]]\nB: [[-1.511, -0.608, 0.081, 1.102, 2.98, 1.011]]\nC: [[-1.203, -0.385, 0.359, 0.756, 2.647, 0.817]]\nD: [[-1.37, -0.358, 0.323, 0.77, 2.437, 0.409]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the desk in the scene. 
The camera pose information includes: the rotation matrix: [[-0.802837, 0.056561, -0.593509], [0.596192, 0.071654, -0.799638], [-0.002701, -0.995825, -0.091248]]; the translation vector: [2.583219, 4.008804, 1.439254], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.047, -0.588, -0.114, 0.759, 2.976, 0.961]]\nB: [[-1.511, -0.608, 0.081, 1.102, 2.98, 1.011]]\nC: [[-1.203, -0.385, 0.359, 0.756, 2.647, 0.817]]\nD: [[-1.37, -0.358, 0.323, 0.77, 2.437, 0.409]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_129_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_129_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.904, -0.732, 0.092, 0.243, 0.59, 0.859], [-1.501, -0.76, 0.757, 1.132, 0.498, 0.756]]\nB: [[1.126, -0.366, 0.392, 0.688, 0.942, 0.802], [-1.375, -0.274, 0.471, 1.076, 0.886, 0.947]]\nC: [[0.868, -0.772, 0.151, 0.633, 1.223, 0.791], [-1.775, -0.718, 0.331, 1.093, 0.846, 1.4]]\nD: [[1.114, -0.309, 0.254, 0.953, 0.846, 0.427], [-1.752, 0.101, 0.877, 0.811, 1.045, 0.651]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the dresser in the scene. 
The camera pose information includes: the rotation matrix: [[-0.442667, -0.46733, 0.765277], [-0.896368, 0.253361, -0.363776], [-0.023888, -0.847001, -0.531054]]; the translation vector: [2.453469, 1.905797, 1.451684], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.904, -0.732, 0.092, 0.243, 0.59, 0.859], [-1.501, -0.76, 0.757, 1.132, 0.498, 0.756]]\nB: [[1.126, -0.366, 0.392, 0.688, 0.942, 0.802], [-1.375, -0.274, 0.471, 1.076, 0.886, 0.947]]\nC: [[0.868, -0.772, 0.151, 0.633, 1.223, 0.791], [-1.775, -0.718, 0.331, 1.093, 0.846, 1.4]]\nD: [[1.114, -0.309, 0.254, 0.953, 0.846, 0.427], [-1.752, 0.101, 0.877, 0.811, 1.045, 0.651]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_130_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_130_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.61, -0.414, 0.408, 4.655, 4.102, -0.244]]\nB: [[0.311, -0.524, 0.039, 4.829, 4.569, 0.162]]\nC: [[0.203, -0.323, 0.325, 5.217, 4.229, 0.65]]\nD: [[0.089, -0.712, 0.114, 5.287, 4.148, 0.629]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. 
The camera pose information includes: the rotation matrix: [[0.633294, -0.360819, 0.684652], [-0.773758, -0.312806, 0.550863], [0.015401, -0.878613, -0.477285]]; the translation vector: [3.241882, 3.386626, 1.367882], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.61, -0.414, 0.408, 4.655, 4.102, -0.244]]\nB: [[0.311, -0.524, 0.039, 4.829, 4.569, 0.162]]\nC: [[0.203, -0.323, 0.325, 5.217, 4.229, 0.65]]\nD: [[0.089, -0.712, 0.114, 5.287, 4.148, 0.629]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_131_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_131_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[2.917, -0.639, 0.621, 0.336, 6.104, 1.183], [0.361, -2.884, 0.637, 5.668, -0.024, 1.879], [-2.761, -1.433, 0.573, -0.228, 0.551, 1.578], [-3.018, -2.023, 1.21, 0.696, 0.033, 1.511], [-2.944, 0.173, 0.693, 0.375, 4.326, 1.889]]\nB: [[2.88, 0.258, 0.921, 0.635, 5.466, 1.974], [-0.061, -2.646, 0.552, 6.114, -0.006, 1.775], [-3.069, -1.6, 0.804, 0.521, 0.433, 1.489], [-3.084, -1.953, 1.232, 0.742, 0.11, 1.43], [-2.84, 1.121, 0.562, 0.204, 4.902, 1.824]]\nC: [[3.129, -0.248, 1.092, 0.28, 6.006, 1.69], [0.277, -3.248, 1.229, 5.639, 0.457, 1.83], [-2.943, -1.702, 1.206, 0.61, 0.818, 1.511], [-2.632, -1.423, 0.42, 0.373, 0.138, 1.635], [-3.02, 0.349, 0.427, 0.566, 4.15, 1.781]]\nD: [[3.003, 
-0.173, 0.772, 0.324, 5.743, 1.505], [-0.052, -3.097, 0.827, 6.005, 0.286, 1.553], [-3.164, -1.839, 0.77, 0.192, 0.577, 1.362], [-2.872, -1.562, 0.743, 0.498, 0.153, 1.361], [-2.619, 0.636, 0.832, 0.279, 4.454, 1.688]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.924593, 0.219455, -0.311397], [0.371095, 0.334047, -0.86643], [-0.086121, -0.916653, -0.390296]]; the translation vector: [7.650298, 2.745242, 1.444521], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[2.917, -0.639, 0.621, 0.336, 6.104, 1.183], [0.361, -2.884, 0.637, 5.668, -0.024, 1.879], [-2.761, -1.433, 0.573, -0.228, 0.551, 1.578], [-3.018, -2.023, 1.21, 0.696, 0.033, 1.511], [-2.944, 0.173, 0.693, 0.375, 4.326, 1.889]]\nB: [[2.88, 0.258, 0.921, 0.635, 5.466, 1.974], [-0.061, -2.646, 0.552, 6.114, -0.006, 1.775], [-3.069, -1.6, 0.804, 0.521, 0.433, 1.489], [-3.084, -1.953, 1.232, 0.742, 0.11, 1.43], [-2.84, 1.121, 0.562, 0.204, 4.902, 1.824]]\nC: [[3.129, -0.248, 1.092, 0.28, 6.006, 1.69], [0.277, -3.248, 1.229, 5.639, 0.457, 1.83], [-2.943, -1.702, 1.206, 0.61, 0.818, 1.511], [-2.632, -1.423, 0.42, 0.373, 0.138, 1.635], [-3.02, 0.349, 0.427, 0.566, 4.15, 1.781]]\nD: [[3.003, -0.173, 0.772, 0.324, 5.743, 1.505], [-0.052, -3.097, 0.827, 6.005, 0.286, 1.553], [-3.164, -1.839, 0.77, 0.192, 0.577, 1.362], [-2.872, -1.562, 0.743, 0.498, 0.153, 1.361], [-2.619, 0.636, 0.832, 0.279, 4.454, 1.688]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_132_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_132_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.133, 0.902, 0.422, 0.039, 0.22, 0.632]]\nB: [[-0.076, 0.973, 0.415, -0.004, 1.174, 1.248]]\nC: [[0.144, 0.321, 0.705, -0.021, 0.284, 1.035]]\nD: [[0.355, 0.535, 0.346, 0.07, 0.677, 0.805]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the dishwasher in the scene. The camera pose information includes: the rotation matrix: [[0.975982, 0.033782, -0.215214], [0.215389, -0.297687, 0.930048], [-0.032648, -0.954066, -0.297814]]; the translation vector: [2.838751, 1.414222, 1.664536], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.133, 0.902, 0.422, 0.039, 0.22, 0.632]]\nB: [[-0.076, 0.973, 0.415, -0.004, 1.174, 1.248]]\nC: [[0.144, 0.321, 0.705, -0.021, 0.284, 1.035]]\nD: [[0.355, 0.535, 0.346, 0.07, 0.677, 0.805]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_133_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_133_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.578, -0.219, 1.033, 0.257, 3.748, 1.935], [2.209, 0.671, 0.788, -0.095, 3.689, 2.508], [0.223, -2.291, 0.584, 0.625, 0.246, 1.596], [0.409, -2.777, 1.049, -0.283, 0.131, 1.81]]\nB: [[-2.04, 0.586, 0.772, 0.304, 3.812, 1.848], [1.619, 0.488, 0.655, 0.555, 3.765, 1.94], [0.355, -2.984, 1.136, -0.107, 0.055, 1.74], [0.752, -2.78, 0.749, 0.33, 0.188, 1.815]]\nC: [[-1.581, 0.188, 1.09, 0.283, 3.526, 2.183], [1.935, 0.185, 1.045, 0.157, 3.57, 2.128], [0.384, -2.556, 0.863, 0.244, 0.135, 1.758], [0.278, -2.37, 1.022, 0.1, 0.539, 2.045]]\nD: [[-1.492, -0.235, 1.434, 0.171, 3.146, 1.819], [2.128, 0.651, 1.233, 0.526, 3.819, 1.664], [0.295, -2.822, 1.218, -0.087, 0.184, 1.532], [0.45, -1.956, 1.009, 0.299, 0.644, 2.242]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. 
The camera pose information includes: the rotation matrix: [[-0.037281, 0.595041, -0.80283], [0.998378, -0.012419, -0.055566], [-0.043034, -0.803599, -0.593613]]; the translation vector: [3.95675, 2.244474, 1.442954], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.578, -0.219, 1.033, 0.257, 3.748, 1.935], [2.209, 0.671, 0.788, -0.095, 3.689, 2.508], [0.223, -2.291, 0.584, 0.625, 0.246, 1.596], [0.409, -2.777, 1.049, -0.283, 0.131, 1.81]]\nB: [[-2.04, 0.586, 0.772, 0.304, 3.812, 1.848], [1.619, 0.488, 0.655, 0.555, 3.765, 1.94], [0.355, -2.984, 1.136, -0.107, 0.055, 1.74], [0.752, -2.78, 0.749, 0.33, 0.188, 1.815]]\nC: [[-1.581, 0.188, 1.09, 0.283, 3.526, 2.183], [1.935, 0.185, 1.045, 0.157, 3.57, 2.128], [0.384, -2.556, 0.863, 0.244, 0.135, 1.758], [0.278, -2.37, 1.022, 0.1, 0.539, 2.045]]\nD: [[-1.492, -0.235, 1.434, 0.171, 3.146, 1.819], [2.128, 0.651, 1.233, 0.526, 3.819, 1.664], [0.295, -2.822, 1.218, -0.087, 0.184, 1.532], [0.45, -1.956, 1.009, 0.299, 0.644, 2.242]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_134_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_134_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.199, 1.125, 0.399, 0.214, 0.05, 0.589]]\nB: [[0.02, 1.322, 0.476, 0.689, 0.454, 0.768]]\nC: [[0.504, 0.831, 0.74, 0.202, 0.254, 0.39]]\nD: [[0.93, 1.224, 1.103, 
0.115, -0.143, 0.862]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shelf in the scene. The camera pose information includes: the rotation matrix: [[0.994446, -0.078697, 0.06988], [-0.104992, -0.787844, 0.606859], [0.007297, -0.610826, -0.791731]]; the translation vector: [1.305105, 0.510448, 1.183315], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.199, 1.125, 0.399, 0.214, 0.05, 0.589]]\nB: [[0.02, 1.322, 0.476, 0.689, 0.454, 0.768]]\nC: [[0.504, 0.831, 0.74, 0.202, 0.254, 0.39]]\nD: [[0.93, 1.224, 1.103, 0.115, -0.143, 0.862]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_135_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_135_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.403, 0.709, 0.086, 0.284, 0.032, 0.061]]\nB: [[1.358, 0.357, -0.003, 0.163, 0.142, 0.021]]\nC: [[1.451, 0.553, 0.13, 0.387, 0.236, 0.338]]\nD: [[1.592, 0.722, 0.492, 0.54, 0.067, 0.402]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the trash can in the scene. 
The camera pose information includes: the rotation matrix: [[-0.573389, -0.355745, 0.738018], [-0.818965, 0.223754, -0.528424], [0.02285, -0.907403, -0.419641]]; the translation vector: [2.061407, 3.857203, 1.382209], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.403, 0.709, 0.086, 0.284, 0.032, 0.061]]\nB: [[1.358, 0.357, -0.003, 0.163, 0.142, 0.021]]\nC: [[1.451, 0.553, 0.13, 0.387, 0.236, 0.338]]\nD: [[1.592, 0.722, 0.492, 0.54, 0.067, 0.402]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_136_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_136_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[2.019, 1.716, -0.199, 0.133, 0.443, 0.08]]\nB: [[1.539, 1.317, 0.169, 1.021, 0.789, 0.672]]\nC: [[1.691, 1.543, 0.248, 0.524, 0.565, 0.475]]\nD: [[1.676, 1.663, -0.114, 0.76, 0.881, 0.004]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the footrest in the scene. The camera pose information includes: the rotation matrix: [[-0.752388, 0.33007, -0.570058], [0.655329, 0.287372, -0.698542], [-0.066749, -0.89915, -0.43252]]; the translation vector: [3.814293, 2.583141, 1.394159], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[2.019, 1.716, -0.199, 0.133, 0.443, 0.08]]\nB: [[1.539, 1.317, 0.169, 1.021, 0.789, 0.672]]\nC: [[1.691, 1.543, 0.248, 0.524, 0.565, 0.475]]\nD: [[1.676, 1.663, -0.114, 0.76, 0.881, 0.004]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_137_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_137_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[2.123, 1.117, 1.321, 0.743, 0.381, 0.635], [2.682, 1.29, 0.828, 0.463, 1.156, 0.705], [2.834, 1.547, 0.399, 0.032, 0.368, 0.16], [3.096, 0.047, 0.87, -0.078, 0.326, -0.125], [2.563, 0.993, 1.356, 0.811, 0.73, 0.229], [2.143, 0.969, 1.037, 0.642, 1.052, 0.055], [2.659, -0.615, 0.836, 0.248, 1.054, 0.022], [3.223, -0.649, 1.168, 0.253, 1.288, 0.671], [2.454, -0.393, 0.682, 0.657, 1.137, 0.691], [-3.644, 1.213, 1.46, 0.605, 1.274, 0.706], [-3.657, -0.185, 0.669, 0.323, 1.16, 0.8], [-3.228, -0.103, 1.329, 0.441, 0.997, 0.754], [-3.305, 0.306, 0.543, 0.056, 1.942, 0.326], [-3.554, -0.503, 0.414, 0.642, 0.665, 0.745], [-2.947, -0.695, 0.368, 0.59, 0.436, 0.372], [-3.593, -0.106, 0.806, 0.216, 0.592, 0.301]]\nB: [[2.989, 1.126, 0.794, -0.179, 0.245, 0.369], [2.962, 1.061, 0.623, -0.057, 0.36, 0.431], [2.845, 1.185, 0.945, 0.308, 0.535, 0.574], [2.424, 0.962, 1.637, -0.272, 0.494, 0.77], [3.085, 0.394, 0.93, 0.245, 0.901, 0.482], [2.87, 0.321, 0.254, 0.308, 0.264, 0.679], [2.834, -0.509, 1.34, 0.641, 0.49, 
0.271], [2.993, -0.295, 0.769, -0.075, 1.002, 0.589], [3.132, -0.129, 0.78, 0.069, 1.025, 0.007], [-2.917, 1.638, 1.353, 0.35, 0.736, 0.591], [-2.828, -0.168, 1.186, 0.057, 1.347, 0.51], [-3.297, -0.456, 0.362, 0.307, 0.654, 0.781], [-3.301, 0.612, 0.703, 0.328, 1.414, 0.306], [-2.89, -0.213, 0.298, -0.086, 1.058, 0.488], [-2.855, -0.016, -0.219, -0.168, 0.422, -0.035], [-3.555, 0.252, 0.516, -0.109, 1.029, 0.664]]\nC: [[2.568, 1.418, 1.271, 0.257, 0.709, 0.306], [2.646, 1.448, 0.95, 0.305, 0.76, 0.302], [2.592, 1.461, 0.636, 0.212, 0.718, 0.28], [2.65, 0.514, 1.213, 0.222, 0.814, 0.309], [2.738, 0.497, 0.888, 0.381, 0.863, 0.308], [2.639, 0.563, 0.627, 0.305, 0.736, 0.188], [2.693, -0.392, 1.14, 0.281, 0.891, 0.334], [2.727, -0.372, 0.833, 0.29, 0.926, 0.3], [2.691, -0.383, 0.563, 0.264, 0.854, 0.201], [-3.22, 1.231, 1.017, 0.313, 0.915, 0.346], [-3.273, 0.289, 0.923, 0.23, 1.204, 0.355], [-3.222, -0.487, 0.833, 0.334, 0.747, 0.368], [-3.341, 0.627, 0.626, 0.449, 1.466, 0.437], [-3.265, -0.411, 0.526, 0.337, 0.641, 0.343], [-3.203, -0.328, 0.27, 0.175, 0.592, 0.204], [-3.277, 0.365, 0.338, 0.332, 0.934, 0.242]]\nD: [[2.244, 1.249, 1.196, 0.59, 0.671, 0.591], [3.002, 1.584, 0.459, 0.732, 0.625, -0.064], [2.803, 1.399, 0.195, 0.554, 0.24, -0.185], [2.948, 0.428, 1.564, 0.649, 0.642, 0.076], [2.502, 0.944, 1.279, 0.724, 1.079, 0.788], [3.063, 0.247, 0.912, 0.247, 0.578, 0.126], [2.848, -0.809, 0.778, 0.441, 1.15, 0.263], [2.483, -0.756, 0.605, 0.63, 1.407, 0.292], [2.369, -0.586, 0.732, 0.348, 0.461, 0.12], [-3.238, 0.78, 0.778, 0.212, 1.143, -0.102], [-3.116, 0.426, 0.879, 0.248, 1.646, 0.306], [-2.875, -0.393, 1.087, 0.035, 1.245, 0.038], [-3.308, 0.845, 1.118, 0.472, 1.582, 0.109], [-3.33, -0.848, 0.583, 0.088, 1.108, -0.004], [-3.371, -0.081, 0.236, -0.02, 0.647, 0.543], [-3.267, -0.114, -0.13, -0.134, 1.197, -0.109]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the books in the scene. 
The camera pose information includes: the rotation matrix: [[0.892065, -0.360019, 0.273141], [-0.443019, -0.577417, 0.685801], [-0.089185, -0.732786, -0.674589]]; the translation vector: [2.898737, 2.45906, 1.649541], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[2.123, 1.117, 1.321, 0.743, 0.381, 0.635], [2.682, 1.29, 0.828, 0.463, 1.156, 0.705], [2.834, 1.547, 0.399, 0.032, 0.368, 0.16], [3.096, 0.047, 0.87, -0.078, 0.326, -0.125], [2.563, 0.993, 1.356, 0.811, 0.73, 0.229], [2.143, 0.969, 1.037, 0.642, 1.052, 0.055], [2.659, -0.615, 0.836, 0.248, 1.054, 0.022], [3.223, -0.649, 1.168, 0.253, 1.288, 0.671], [2.454, -0.393, 0.682, 0.657, 1.137, 0.691], [-3.644, 1.213, 1.46, 0.605, 1.274, 0.706], [-3.657, -0.185, 0.669, 0.323, 1.16, 0.8], [-3.228, -0.103, 1.329, 0.441, 0.997, 0.754], [-3.305, 0.306, 0.543, 0.056, 1.942, 0.326], [-3.554, -0.503, 0.414, 0.642, 0.665, 0.745], [-2.947, -0.695, 0.368, 0.59, 0.436, 0.372], [-3.593, -0.106, 0.806, 0.216, 0.592, 0.301]]\nB: [[2.989, 1.126, 0.794, -0.179, 0.245, 0.369], [2.962, 1.061, 0.623, -0.057, 0.36, 0.431], [2.845, 1.185, 0.945, 0.308, 0.535, 0.574], [2.424, 0.962, 1.637, -0.272, 0.494, 0.77], [3.085, 0.394, 0.93, 0.245, 0.901, 0.482], [2.87, 0.321, 0.254, 0.308, 0.264, 0.679], [2.834, -0.509, 1.34, 0.641, 0.49, 0.271], [2.993, -0.295, 0.769, -0.075, 1.002, 0.589], [3.132, -0.129, 0.78, 0.069, 1.025, 0.007], [-2.917, 1.638, 1.353, 0.35, 0.736, 0.591], [-2.828, -0.168, 1.186, 0.057, 1.347, 0.51], [-3.297, -0.456, 0.362, 0.307, 0.654, 0.781], 
[-3.301, 0.612, 0.703, 0.328, 1.414, 0.306], [-2.89, -0.213, 0.298, -0.086, 1.058, 0.488], [-2.855, -0.016, -0.219, -0.168, 0.422, -0.035], [-3.555, 0.252, 0.516, -0.109, 1.029, 0.664]]\nC: [[2.568, 1.418, 1.271, 0.257, 0.709, 0.306], [2.646, 1.448, 0.95, 0.305, 0.76, 0.302], [2.592, 1.461, 0.636, 0.212, 0.718, 0.28], [2.65, 0.514, 1.213, 0.222, 0.814, 0.309], [2.738, 0.497, 0.888, 0.381, 0.863, 0.308], [2.639, 0.563, 0.627, 0.305, 0.736, 0.188], [2.693, -0.392, 1.14, 0.281, 0.891, 0.334], [2.727, -0.372, 0.833, 0.29, 0.926, 0.3], [2.691, -0.383, 0.563, 0.264, 0.854, 0.201], [-3.22, 1.231, 1.017, 0.313, 0.915, 0.346], [-3.273, 0.289, 0.923, 0.23, 1.204, 0.355], [-3.222, -0.487, 0.833, 0.334, 0.747, 0.368], [-3.341, 0.627, 0.626, 0.449, 1.466, 0.437], [-3.265, -0.411, 0.526, 0.337, 0.641, 0.343], [-3.203, -0.328, 0.27, 0.175, 0.592, 0.204], [-3.277, 0.365, 0.338, 0.332, 0.934, 0.242]]\nD: [[2.244, 1.249, 1.196, 0.59, 0.671, 0.591], [3.002, 1.584, 0.459, 0.732, 0.625, -0.064], [2.803, 1.399, 0.195, 0.554, 0.24, -0.185], [2.948, 0.428, 1.564, 0.649, 0.642, 0.076], [2.502, 0.944, 1.279, 0.724, 1.079, 0.788], [3.063, 0.247, 0.912, 0.247, 0.578, 0.126], [2.848, -0.809, 0.778, 0.441, 1.15, 0.263], [2.483, -0.756, 0.605, 0.63, 1.407, 0.292], [2.369, -0.586, 0.732, 0.348, 0.461, 0.12], [-3.238, 0.78, 0.778, 0.212, 1.143, -0.102], [-3.116, 0.426, 0.879, 0.248, 1.646, 0.306], [-2.875, -0.393, 1.087, 0.035, 1.245, 0.038], [-3.308, 0.845, 1.118, 0.472, 1.582, 0.109], [-3.33, -0.848, 0.583, 0.088, 1.108, -0.004], [-3.371, -0.081, 0.236, -0.02, 0.647, 0.543], [-3.267, -0.114, -0.13, -0.134, 1.197, -0.109]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_138_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_138_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.568, 1.594, 2.336, 
3.002, 0.215, 1.78], [-1.527, -0.451, 1.258, 0.103, 4.121, 1.756], [-1.386, -2.636, 1.043, -0.207, -0.22, 1.193], [1.834, 0.934, 1.595, 0.321, 1.764, 2.84], [1.018, -0.319, 0.583, 1.01, 0.681, 1.687]]\nB: [[0.072, 1.537, 1.845, 2.689, 0.191, 1.622], [-1.273, -0.316, 0.956, 0.156, 3.767, 1.891], [-1.14, -2.18, 0.679, 0.246, 0.067, 1.34], [1.381, 0.651, 1.354, 0.135, 1.692, 2.602], [0.889, -0.737, 0.87, 1.059, 1.122, 1.773]]\nC: [[0.21, 1.662, 1.825, 2.75, -0.252, 2.024], [-1.56, -0.058, 0.561, 0.054, 3.741, 2.333], [-1.055, -2.665, 0.535, 0.196, 0.05, 1.825], [1.164, 0.58, 1.628, 0.045, 1.482, 2.195], [1.198, -0.291, 1.331, 0.727, 1.34, 1.309]]\nD: [[-0.147, 1.793, 1.85, 3.103, 0.596, 1.69], [-1.538, -0.388, 0.463, 0.445, 3.441, 1.475], [-1.625, -1.946, 0.934, 0.072, -0.182, 1.409], [1.247, 1.123, 0.994, 0.033, 1.379, 2.521], [0.847, -0.38, 0.424, 0.888, 1.469, 2.148]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.721847, -0.019511, -0.691778], [0.690918, -0.036893, 0.721991], [-0.039608, -0.999129, -0.013151]]; the translation vector: [1.871862, 0.815296, 1.594356], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.568, 1.594, 2.336, 3.002, 0.215, 1.78], [-1.527, -0.451, 1.258, 0.103, 4.121, 1.756], [-1.386, -2.636, 1.043, -0.207, -0.22, 1.193], [1.834, 0.934, 1.595, 0.321, 1.764, 2.84], [1.018, -0.319, 0.583, 1.01, 0.681, 1.687]]\nB: [[0.072, 1.537, 1.845, 2.689, 0.191, 1.622], [-1.273, -0.316, 0.956, 0.156, 3.767, 1.891], [-1.14, -2.18, 0.679, 0.246, 0.067, 1.34], [1.381, 0.651, 1.354, 0.135, 1.692, 2.602], [0.889, -0.737, 0.87, 1.059, 1.122, 1.773]]\nC: [[0.21, 1.662, 1.825, 2.75, -0.252, 2.024], [-1.56, -0.058, 0.561, 0.054, 3.741, 2.333], [-1.055, -2.665, 0.535, 0.196, 0.05, 1.825], [1.164, 0.58, 1.628, 0.045, 1.482, 2.195], [1.198, -0.291, 1.331, 0.727, 1.34, 1.309]]\nD: [[-0.147, 1.793, 1.85, 3.103, 0.596, 1.69], [-1.538, -0.388, 0.463, 0.445, 3.441, 1.475], [-1.625, -1.946, 0.934, 0.072, -0.182, 1.409], [1.247, 1.123, 0.994, 0.033, 1.379, 2.521], [0.847, -0.38, 0.424, 0.888, 1.469, 2.148]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_139_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_139_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.013, -1.238, 0.461, 0.493, 0.288, 0.575], [-1.147, 3.348, 0.377, 0.564, 1.212, 0.508], [0.233, -3.522, 0.086, 0.938, 1.233, 0.371], [1.742, -2.531, 0.357, 1.04, 1.008, 0.183], [-0.823, -2.335, 0.1, 1.256, 0.934, 0.241], [-1.592, 0.953, 0.372, 1.131, 0.478, 0.788], [-0.027, 4.368, 1.235, 0.272, -0.107, 0.356], [2.853, 3.344, 0.339, 0.665, 0.157, 0.567], [2.027, 4.048, 1.116, 0.515, 0.345, 0.52], [0.87, 3.839, 0.8, 0.696, -0.287, 0.501], [1.369, 2.326, 0.538, 0.245, 0.786, 0.243], [0.357, 3.043, 0.662, 0.778, 0.111, 0.513], [1.447, 2.656, 0.359, 0.141, 0.33, 0.84]]\nB: [[0.068, -1.042, 0.544, 0.886, 0.779, 0.545], [-1.479, 3.034, 0.552, 0.898, 0.801, 0.497], [-0.06, -3.112, 0.543, 0.84, 
0.784, 0.511], [1.274, -2.138, 0.543, 0.738, 0.855, 0.547], [-0.786, -2.2, 0.536, 0.806, 0.879, 0.474], [-1.39, 1.148, 0.549, 0.822, 0.745, 0.54], [0.444, 4.003, 0.791, 0.485, 0.139, 0.082], [2.511, 3.843, 0.762, 0.448, 0.131, 0.083], [1.884, 3.916, 0.775, 0.46, 0.149, 0.083], [1.153, 3.946, 0.791, 0.453, 0.166, 0.098], [1.053, 2.651, 0.606, 0.523, 0.61, 0.485], [0.449, 2.899, 0.606, 0.476, 0.557, 0.467], [1.688, 2.596, 0.605, 0.503, 0.592, 0.451]]\nC: [[0.102, -0.947, 0.484, 1.245, 0.79, 0.775], [-1.118, 3.375, 0.842, 0.401, 1.069, 0.196], [0.407, -2.782, 0.934, 1.07, 0.467, 0.067], [1.541, -2.237, 0.403, 0.888, 1.246, 0.245], [-0.917, -1.889, 0.628, 0.956, 1.204, 0.523], [-1.021, 1.176, 0.814, 0.368, 0.456, 0.678], [0.573, 4.084, 1.228, 0.815, 0.355, 0.385], [2.848, 3.659, 0.488, 0.047, 0.047, 0.092], [1.907, 4.123, 0.733, 0.026, 0.33, -0.009], [1.212, 4.443, 1.139, 0.078, -0.234, 0.21], [0.892, 2.632, 1.105, 0.392, 1.061, 0.435], [0.166, 3.349, 0.352, 0.282, 0.481, 0.755], [1.529, 2.634, 0.397, 0.324, 0.54, 0.072]]\nD: [[-0.194, -1.122, 0.104, 1.378, 1.12, 0.253], [-1.005, 3.518, 0.745, 0.428, 0.792, 0.08], [0.214, -2.901, 0.412, 0.728, 0.43, 0.91], [1.573, -2.219, 0.557, 0.934, 1.13, 0.876], [-0.782, -2.154, 0.858, 0.543, 1.135, 0.108], [-1.448, 1.097, 0.92, 1.197, 0.497, 0.181], [0.045, 3.571, 0.423, 0.736, -0.143, -0.417], [2.244, 4.297, 0.746, 0.101, 0.473, -0.26], [1.879, 3.692, 0.375, 0.596, -0.051, -0.206], [1.372, 4.096, 0.929, 0.827, -0.125, 0.334], [1.326, 2.984, 0.19, 0.493, 0.248, 0.576], [0.74, 2.996, 0.477, 0.655, 0.254, 0.849], [2.003, 3.037, 0.818, 0.844, 0.675, 0.272]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. 
The camera pose information includes: the rotation matrix: [[0.853196, -0.330732, 0.403328], [-0.517406, -0.438892, 0.734619], [-0.065945, -0.835458, -0.545584]]; the translation vector: [2.734716, 6.775187, 1.412962], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.013, -1.238, 0.461, 0.493, 0.288, 0.575], [-1.147, 3.348, 0.377, 0.564, 1.212, 0.508], [0.233, -3.522, 0.086, 0.938, 1.233, 0.371], [1.742, -2.531, 0.357, 1.04, 1.008, 0.183], [-0.823, -2.335, 0.1, 1.256, 0.934, 0.241], [-1.592, 0.953, 0.372, 1.131, 0.478, 0.788], [-0.027, 4.368, 1.235, 0.272, -0.107, 0.356], [2.853, 3.344, 0.339, 0.665, 0.157, 0.567], [2.027, 4.048, 1.116, 0.515, 0.345, 0.52], [0.87, 3.839, 0.8, 0.696, -0.287, 0.501], [1.369, 2.326, 0.538, 0.245, 0.786, 0.243], [0.357, 3.043, 0.662, 0.778, 0.111, 0.513], [1.447, 2.656, 0.359, 0.141, 0.33, 0.84]]\nB: [[0.068, -1.042, 0.544, 0.886, 0.779, 0.545], [-1.479, 3.034, 0.552, 0.898, 0.801, 0.497], [-0.06, -3.112, 0.543, 0.84, 0.784, 0.511], [1.274, -2.138, 0.543, 0.738, 0.855, 0.547], [-0.786, -2.2, 0.536, 0.806, 0.879, 0.474], [-1.39, 1.148, 0.549, 0.822, 0.745, 0.54], [0.444, 4.003, 0.791, 0.485, 0.139, 0.082], [2.511, 3.843, 0.762, 0.448, 0.131, 0.083], [1.884, 3.916, 0.775, 0.46, 0.149, 0.083], [1.153, 3.946, 0.791, 0.453, 0.166, 0.098], [1.053, 2.651, 0.606, 0.523, 0.61, 0.485], [0.449, 2.899, 0.606, 0.476, 0.557, 0.467], [1.688, 2.596, 0.605, 0.503, 0.592, 0.451]]\nC: [[0.102, -0.947, 0.484, 1.245, 0.79, 0.775], [-1.118, 3.375, 0.842, 0.401, 1.069, 0.196], 
[0.407, -2.782, 0.934, 1.07, 0.467, 0.067], [1.541, -2.237, 0.403, 0.888, 1.246, 0.245], [-0.917, -1.889, 0.628, 0.956, 1.204, 0.523], [-1.021, 1.176, 0.814, 0.368, 0.456, 0.678], [0.573, 4.084, 1.228, 0.815, 0.355, 0.385], [2.848, 3.659, 0.488, 0.047, 0.047, 0.092], [1.907, 4.123, 0.733, 0.026, 0.33, -0.009], [1.212, 4.443, 1.139, 0.078, -0.234, 0.21], [0.892, 2.632, 1.105, 0.392, 1.061, 0.435], [0.166, 3.349, 0.352, 0.282, 0.481, 0.755], [1.529, 2.634, 0.397, 0.324, 0.54, 0.072]]\nD: [[-0.194, -1.122, 0.104, 1.378, 1.12, 0.253], [-1.005, 3.518, 0.745, 0.428, 0.792, 0.08], [0.214, -2.901, 0.412, 0.728, 0.43, 0.91], [1.573, -2.219, 0.557, 0.934, 1.13, 0.876], [-0.782, -2.154, 0.858, 0.543, 1.135, 0.108], [-1.448, 1.097, 0.92, 1.197, 0.497, 0.181], [0.045, 3.571, 0.423, 0.736, -0.143, -0.417], [2.244, 4.297, 0.746, 0.101, 0.473, -0.26], [1.879, 3.692, 0.375, 0.596, -0.051, -0.206], [1.372, 4.096, 0.929, 0.827, -0.125, 0.334], [1.326, 2.984, 0.19, 0.493, 0.248, 0.576], [0.74, 2.996, 0.477, 0.655, 0.254, 0.849], [2.003, 3.037, 0.818, 0.844, 0.675, 0.272]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_140_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_140_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.478, -1.621, 1.471, 0.04, 0.607, 0.77], [-0.622, -0.28, 0.778, 0.596, 0.791, 0.172], [0.68, -2.333, 1.441, 0.527, 0.774, 0.878], [0.47, -3.455, 1.236, 0.145, 0.605, 0.702], [-1.07, -2.607, 1.289, 0.421, 0.201, 0.117], [-0.819, -3.811, 1.501, 0.378, 0.459, 0.305], [-0.289, -1.598, 0.306, 0.193, 0.374, 1.351], [-0.144, -0.28, 1.064, 0.516, 0.462, 1.314], [-0.472, 1.134, 0.95, 0.612, 0.425, 0.242], [0.292, 1.559, -0.0, 1.024, 0.739, 0.637], [1.632, 1.532, 0.377, 0.961, 0.147, 0.54], [1.253, 1.503, 0.223, 0.356, 0.173, 0.917], [2.079, 0.639, 0.524, 0.497, 0.63, 
1.101], [1.452, -0.032, 0.35, 1.029, 0.429, 0.469], [1.964, -1.067, 0.351, 1.202, 1.067, 0.649], [1.915, -1.339, 0.962, 0.392, 0.481, -0.02]]\nB: [[-1.05, -1.003, 1.329, 0.32, 0.076, 0.616], [-0.594, -0.426, 0.767, 0.622, 0.307, 0.007], [0.602, -2.305, 1.043, 0.218, 0.243, 0.681], [0.924, -3.409, 0.98, 0.773, 0.471, 1.089], [-0.329, -2.354, 0.789, 0.408, 0.875, 0.623], [-0.349, -3.787, 1.449, 0.31, 0.976, 0.266], [0.652, -1.018, 1.006, 0.796, 0.883, 0.697], [0.628, -0.604, 0.772, 0.114, 0.996, 0.953], [-1.118, 1.128, 0.061, 0.216, 0.338, 0.764], [0.65, 1.585, 0.323, 0.699, 0.859, 0.499], [1.631, 1.493, 0.088, 1.244, 0.636, 1.121], [1.187, 0.927, 0.824, 0.22, 0.275, 0.894], [1.693, 0.178, 0.2, 0.357, 0.96, 0.555], [1.798, -0.426, 0.556, 0.111, 1.016, 0.592], [1.891, -0.692, 0.467, 0.91, 1.42, 0.916], [1.538, -2.029, 0.941, 0.82, 1.037, 0.527]]\nC: [[-0.797, -1.314, 1.076, 0.174, 0.538, 0.297], [-0.804, -0.643, 0.993, 0.18, 0.483, 0.317], [0.265, -2.771, 0.976, 0.564, 0.483, 0.724], [0.443, -3.263, 1.105, 0.404, 0.755, 0.654], [-0.786, -2.701, 1.224, 0.235, 0.623, 0.423], [-0.579, -3.467, 1.386, 0.149, 0.491, 0.284], [0.195, -1.173, 0.617, 0.439, 0.594, 0.981], [0.152, -0.693, 0.576, 0.363, 0.648, 0.901], [-0.836, 1.438, 0.551, 0.438, 0.598, 0.552], [0.258, 1.345, 0.466, 0.561, 0.507, 0.736], [1.246, 1.609, 0.396, 0.752, 0.566, 0.883], [1.646, 1.19, 0.575, 0.611, 0.592, 0.619], [1.73, 0.493, 0.521, 0.445, 0.583, 0.771], [1.766, -0.179, 0.551, 0.536, 0.58, 0.774], [1.864, -0.697, 0.533, 0.816, 1.199, 0.994], [1.74, -1.667, 0.652, 0.516, 0.607, 0.36]]\nD: [[-0.49, -1.289, 0.9, 0.491, 0.951, 0.59], [-1.107, -1.021, 1.479, 0.523, 0.505, 0.09], [-0.233, -2.971, 1.208, 0.309, 0.946, 0.617], [0.587, -2.842, 0.811, 0.828, 0.821, 0.621], [-0.674, -2.976, 1.257, -0.139, 0.206, 0.639], [-0.269, -3.606, 1.299, -0.169, 0.133, 0.486], [0.033, -0.697, 1.063, 0.567, 1.022, 1.265], [0.252, -0.714, 0.426, 0.514, 0.322, 1.359], [-1.161, 1.486, 0.647, 0.683, 0.314, 0.187], [-0.11, 
1.173, 0.725, 0.462, 0.264, 1.138], [1.341, 1.682, 0.277, 0.312, 0.356, 0.94], [1.815, 1.188, 0.624, 1.015, 0.174, 0.508], [1.714, 0.423, 0.79, 0.889, 0.659, 0.533], [1.648, -0.367, 0.718, 0.468, 1.049, 0.941], [2.335, -0.44, 0.71, 1.148, 1.407, 0.783], [1.632, -1.945, 0.223, 0.453, 0.239, 0.703]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[-0.476704, 0.41796, -0.773345], [0.878176, 0.186897, -0.440314], [-0.039498, -0.889033, -0.456137]]; the translation vector: [2.405627, 4.675593, 1.276166], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.478, -1.621, 1.471, 0.04, 0.607, 0.77], [-0.622, -0.28, 0.778, 0.596, 0.791, 0.172], [0.68, -2.333, 1.441, 0.527, 0.774, 0.878], [0.47, -3.455, 1.236, 0.145, 0.605, 0.702], [-1.07, -2.607, 1.289, 0.421, 0.201, 0.117], [-0.819, -3.811, 1.501, 0.378, 0.459, 0.305], [-0.289, -1.598, 0.306, 0.193, 0.374, 1.351], [-0.144, -0.28, 1.064, 0.516, 0.462, 1.314], [-0.472, 1.134, 0.95, 0.612, 0.425, 0.242], [0.292, 1.559, -0.0, 1.024, 0.739, 0.637], [1.632, 1.532, 0.377, 0.961, 0.147, 0.54], [1.253, 1.503, 0.223, 0.356, 0.173, 0.917], [2.079, 0.639, 0.524, 0.497, 0.63, 1.101], [1.452, -0.032, 0.35, 1.029, 0.429, 0.469], [1.964, -1.067, 0.351, 1.202, 1.067, 0.649], [1.915, -1.339, 0.962, 0.392, 0.481, -0.02]]\nB: [[-1.05, -1.003, 1.329, 0.32, 0.076, 0.616], [-0.594, -0.426, 0.767, 0.622, 0.307, 0.007], [0.602, -2.305, 1.043, 0.218, 0.243, 0.681], [0.924, -3.409, 0.98, 0.773, 0.471, 1.089], [-0.329, -2.354, 0.789, 0.408, 0.875, 0.623], [-0.349, -3.787, 1.449, 0.31, 0.976, 0.266], [0.652, -1.018, 1.006, 0.796, 0.883, 0.697], [0.628, -0.604, 0.772, 0.114, 0.996, 0.953], [-1.118, 1.128, 0.061, 0.216, 0.338, 0.764], [0.65, 1.585, 0.323, 0.699, 0.859, 0.499], [1.631, 1.493, 0.088, 1.244, 0.636, 1.121], [1.187, 0.927, 0.824, 0.22, 0.275, 0.894], [1.693, 0.178, 0.2, 0.357, 0.96, 0.555], [1.798, -0.426, 0.556, 0.111, 1.016, 0.592], [1.891, -0.692, 0.467, 0.91, 1.42, 0.916], [1.538, -2.029, 0.941, 0.82, 1.037, 0.527]]\nC: [[-0.797, -1.314, 1.076, 0.174, 0.538, 0.297], [-0.804, -0.643, 0.993, 0.18, 0.483, 0.317], [0.265, -2.771, 0.976, 0.564, 0.483, 0.724], [0.443, -3.263, 1.105, 0.404, 0.755, 0.654], [-0.786, -2.701, 1.224, 0.235, 0.623, 0.423], [-0.579, -3.467, 1.386, 0.149, 0.491, 0.284], [0.195, -1.173, 0.617, 0.439, 0.594, 0.981], [0.152, -0.693, 0.576, 0.363, 0.648, 0.901], [-0.836, 1.438, 0.551, 0.438, 0.598, 0.552], [0.258, 1.345, 0.466, 0.561, 0.507, 0.736], [1.246, 1.609, 0.396, 0.752, 0.566, 0.883], [1.646, 1.19, 0.575, 0.611, 
0.592, 0.619], [1.73, 0.493, 0.521, 0.445, 0.583, 0.771], [1.766, -0.179, 0.551, 0.536, 0.58, 0.774], [1.864, -0.697, 0.533, 0.816, 1.199, 0.994], [1.74, -1.667, 0.652, 0.516, 0.607, 0.36]]\nD: [[-0.49, -1.289, 0.9, 0.491, 0.951, 0.59], [-1.107, -1.021, 1.479, 0.523, 0.505, 0.09], [-0.233, -2.971, 1.208, 0.309, 0.946, 0.617], [0.587, -2.842, 0.811, 0.828, 0.821, 0.621], [-0.674, -2.976, 1.257, -0.139, 0.206, 0.639], [-0.269, -3.606, 1.299, -0.169, 0.133, 0.486], [0.033, -0.697, 1.063, 0.567, 1.022, 1.265], [0.252, -0.714, 0.426, 0.514, 0.322, 1.359], [-1.161, 1.486, 0.647, 0.683, 0.314, 0.187], [-0.11, 1.173, 0.725, 0.462, 0.264, 1.138], [1.341, 1.682, 0.277, 0.312, 0.356, 0.94], [1.815, 1.188, 0.624, 1.015, 0.174, 0.508], [1.714, 0.423, 0.79, 0.889, 0.659, 0.533], [1.648, -0.367, 0.718, 0.468, 1.049, 0.941], [2.335, -0.44, 0.71, 1.148, 1.407, 0.783], [1.632, -1.945, 0.223, 0.453, 0.239, 0.703]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_141_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_141_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.214, 2.51, 1.365, 3.041, 0.194, 2.765], [-0.05, -2.478, 0.301, 2.986, 0.3, 0.641], [-1.526, -0.161, 1.336, 0.225, 4.64, 2.734]]\nB: [[0.048, 2.151, 1.156, 2.76, 0.655, 2.821], [0.225, -2.827, 0.677, 3.049, -0.043, 0.254], [-1.759, -0.568, 1.729, -0.249, 5.02, 2.969]]\nC: [[0.558, 2.736, 1.619, 3.45, -0.161, 2.854], [-0.367, -2.102, 0.777, 2.594, 0.161, 0.236], [-1.277, -0.254, 1.397, -0.136, 4.615, 2.411]]\nD: [[0.699, 2.21, 1.721, 3.445, -0.097, 2.767], [0.274, -2.743, -0.017, 2.983, 0.564, 0.816], [-1.467, -0.193, 1.628, 0.718, 4.962, 2.711]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. 
The camera pose information includes: the rotation matrix: [[-0.207785, -0.462455, 0.861952], [-0.977184, 0.13779, -0.161637], [-0.044019, -0.875871, -0.480534]]; the translation vector: [2.720584, 1.654419, 1.522448], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.214, 2.51, 1.365, 3.041, 0.194, 2.765], [-0.05, -2.478, 0.301, 2.986, 0.3, 0.641], [-1.526, -0.161, 1.336, 0.225, 4.64, 2.734]]\nB: [[0.048, 2.151, 1.156, 2.76, 0.655, 2.821], [0.225, -2.827, 0.677, 3.049, -0.043, 0.254], [-1.759, -0.568, 1.729, -0.249, 5.02, 2.969]]\nC: [[0.558, 2.736, 1.619, 3.45, -0.161, 2.854], [-0.367, -2.102, 0.777, 2.594, 0.161, 0.236], [-1.277, -0.254, 1.397, -0.136, 4.615, 2.411]]\nD: [[0.699, 2.21, 1.721, 3.445, -0.097, 2.767], [0.274, -2.743, -0.017, 2.983, 0.564, 0.816], [-1.467, -0.193, 1.628, 0.718, 4.962, 2.711]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_142_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_142_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.781, -0.092, 1.437, 0.297, 5.42, 2.838], [-0.225, -2.203, -0.028, 3.04, -0.15, 0.159], [0.563, 2.457, 1.86, 3.628, 0.174, 3.621], [1.294, -0.516, 1.217, 0.088, 5.292, 2.913], [-1.437, -2.905, 0.867, 0.166, -0.17, 0.831]]\nB: [[-1.712, -0.169, 1.937, -0.166, 5.172, 3.249], [0.409, -2.556, 0.634, 3.096, 0.671, 0.11], [-0.287, 2.175, 
1.704, 3.703, 0.15, 2.806], [2.158, 0.248, 1.0, 0.669, 5.195, 2.453], [-1.117, -2.267, 1.561, 0.31, -0.422, 1.139]]\nC: [[-1.796, -0.26, 1.071, 0.363, 4.986, 2.747], [-0.333, -2.47, 0.362, 3.532, -0.124, 0.597], [0.208, 2.122, 1.319, 3.656, -0.186, 2.723], [1.521, -0.537, 0.986, 0.704, 5.101, 2.943], [-1.457, -2.856, 0.86, 0.281, 0.313, 0.878]]\nD: [[-1.474, 0.024, 1.526, 0.216, 4.974, 3.09], [0.118, -2.408, 0.332, 3.201, 0.275, 0.54], [0.144, 2.522, 1.535, 3.347, 0.23, 3.137], [1.788, -0.144, 1.382, 0.213, 5.326, 2.779], [-1.437, -2.464, 1.35, 0.243, 0.036, 0.743]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.45377, -0.425062, 0.783208], [-0.891046, 0.227634, -0.392708], [-0.01136, -0.876074, -0.482043]]; the translation vector: [2.25004, 3.862298, 1.519108], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.781, -0.092, 1.437, 0.297, 5.42, 2.838], [-0.225, -2.203, -0.028, 3.04, -0.15, 0.159], [0.563, 2.457, 1.86, 3.628, 0.174, 3.621], [1.294, -0.516, 1.217, 0.088, 5.292, 2.913], [-1.437, -2.905, 0.867, 0.166, -0.17, 0.831]]\nB: [[-1.712, -0.169, 1.937, -0.166, 5.172, 3.249], [0.409, -2.556, 0.634, 3.096, 0.671, 0.11], [-0.287, 2.175, 1.704, 3.703, 0.15, 2.806], [2.158, 0.248, 1.0, 0.669, 5.195, 2.453], [-1.117, -2.267, 1.561, 0.31, -0.422, 1.139]]\nC: [[-1.796, -0.26, 1.071, 0.363, 4.986, 2.747], [-0.333, -2.47, 0.362, 3.532, -0.124, 0.597], [0.208, 2.122, 1.319, 3.656, -0.186, 2.723], [1.521, -0.537, 0.986, 0.704, 5.101, 2.943], [-1.457, -2.856, 0.86, 0.281, 0.313, 0.878]]\nD: [[-1.474, 0.024, 1.526, 0.216, 4.974, 3.09], [0.118, -2.408, 0.332, 3.201, 0.275, 0.54], [0.144, 2.522, 1.535, 3.347, 0.23, 3.137], [1.788, -0.144, 1.382, 0.213, 5.326, 2.779], [-1.437, -2.464, 1.35, 0.243, 0.036, 0.743]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_143_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_143_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[2.29, -0.316, 1.432, 0.063, 4.92, 2.21], [-1.382, -1.33, 2.175, 0.695, 0.538, 0.2], [-1.806, -2.363, 0.83, 0.352, 1.114, 1.836], [-1.306, -1.788, 0.815, 0.84, 0.3, 1.933], [-2.082, -0.191, 1.078, 0.269, 3.774, 1.876], [-1.331, 2.183, 1.747, 0.179, -0.216, 1.553], [-1.305, 2.378, 1.485, 0.225, 1.117, 1.988], [0.552, 2.732, 1.097, 2.615, 0.213, 2.449]]\nB: [[2.019, 0.108, 1.002, 0.215, 5.208, 2.059], [-1.022, -1.729, 2.165, 0.645, 0.146, 0.247], [-1.383, -1.998, 1.17, 0.215, 0.925, 2.24], [-1.526, -1.582, 1.263, 0.348, 0.152, 2.156], [-1.694, 0.207, 1.164, 0.178, 3.686, 2.357], [-1.644, 1.973, 1.343, 0.176, 0.151, 1.146], [-1.605, 2.692, 1.055, 0.124, 1.358, 1.982], [0.625, 2.804, 
0.995, 2.8, 0.361, 2.1]]\nC: [[1.702, 0.103, 1.118, 0.347, 5.169, 2.205], [-0.755, -1.506, 2.319, 1.022, 0.542, -0.064], [-1.248, -1.952, 1.27, 0.08, 1.199, 2.239], [-1.042, -1.657, 1.027, 0.155, -0.197, 2.421], [-1.513, 0.045, 1.167, -0.103, 3.723, 2.465], [-1.23, 1.582, 1.115, -0.014, -0.31, 1.511], [-1.196, 2.213, 1.364, -0.205, 1.046, 1.714], [0.962, 2.867, 0.955, 2.429, 0.313, 2.593]]\nD: [[2.083, 0.385, 1.347, 0.273, 5.186, 1.86], [-0.546, -1.555, 1.851, 0.975, 0.412, 0.638], [-1.077, -1.883, 1.417, -0.014, 0.602, 2.249], [-1.395, -1.99, 1.177, -0.094, -0.079, 2.003], [-1.5, 0.548, 1.221, 0.453, 3.489, 2.126], [-2.081, 1.694, 1.43, -0.163, 0.443, 1.038], [-1.256, 2.343, 0.839, 0.584, 1.506, 1.621], [0.155, 3.04, 0.757, 2.991, 0.014, 2.136]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.804414, -0.195207, 0.561082], [-0.593456, -0.306943, 0.74404], [0.026978, -0.931494, -0.362756]]; the translation vector: [4.397897, 1.805397, 1.263968], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[2.29, -0.316, 1.432, 0.063, 4.92, 2.21], [-1.382, -1.33, 2.175, 0.695, 0.538, 0.2], [-1.806, -2.363, 0.83, 0.352, 1.114, 1.836], [-1.306, -1.788, 0.815, 0.84, 0.3, 1.933], [-2.082, -0.191, 1.078, 0.269, 3.774, 1.876], [-1.331, 2.183, 1.747, 0.179, -0.216, 1.553], [-1.305, 2.378, 1.485, 0.225, 1.117, 1.988], [0.552, 2.732, 1.097, 2.615, 0.213, 2.449]]\nB: [[2.019, 0.108, 1.002, 0.215, 5.208, 2.059], [-1.022, -1.729, 2.165, 0.645, 0.146, 0.247], [-1.383, -1.998, 1.17, 0.215, 0.925, 2.24], [-1.526, -1.582, 1.263, 0.348, 0.152, 2.156], [-1.694, 0.207, 1.164, 0.178, 3.686, 2.357], [-1.644, 1.973, 1.343, 0.176, 0.151, 1.146], [-1.605, 2.692, 1.055, 0.124, 1.358, 1.982], [0.625, 2.804, 0.995, 2.8, 0.361, 2.1]]\nC: [[1.702, 0.103, 1.118, 0.347, 5.169, 2.205], [-0.755, -1.506, 2.319, 1.022, 0.542, -0.064], [-1.248, -1.952, 1.27, 0.08, 1.199, 2.239], [-1.042, -1.657, 1.027, 0.155, -0.197, 2.421], [-1.513, 0.045, 1.167, -0.103, 3.723, 2.465], [-1.23, 1.582, 1.115, -0.014, -0.31, 1.511], [-1.196, 2.213, 1.364, -0.205, 1.046, 1.714], [0.962, 2.867, 0.955, 2.429, 0.313, 2.593]]\nD: [[2.083, 0.385, 1.347, 0.273, 5.186, 1.86], [-0.546, -1.555, 1.851, 0.975, 0.412, 0.638], [-1.077, -1.883, 1.417, -0.014, 0.602, 2.249], [-1.395, -1.99, 1.177, -0.094, -0.079, 2.003], [-1.5, 0.548, 1.221, 0.453, 3.489, 2.126], [-2.081, 1.694, 1.43, -0.163, 0.443, 1.038], [-1.256, 2.343, 0.839, 0.584, 1.506, 1.621], [0.155, 3.04, 0.757, 2.991, 0.014, 2.136]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_144_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_144_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.219, -0.964, 1.58, 0.354, 3.912, 2.347], [0.255, 0.996, 1.173, 3.495, 1.006, 2.248], [1.513, 0.137, 0.79, 0.63, 3.182, 2.432]]\nB: [[-1.818, -0.647, 
1.066, 0.433, 4.095, 1.902], [0.12, 0.784, 1.447, 3.712, 0.386, 2.623], [1.292, -0.011, 0.89, 0.451, 3.017, 2.105]]\nC: [[-1.598, -0.539, 1.125, 0.503, 3.791, 2.392], [-0.019, 1.26, 1.209, 3.332, 0.548, 2.478], [1.708, -0.009, 1.196, 0.447, 2.783, 2.468]]\nD: [[-1.147, -0.143, 1.224, 0.476, 4.202, 2.039], [-0.033, 1.197, 1.039, 3.572, 0.489, 2.65], [1.648, -0.334, 1.403, 0.735, 3.031, 2.541]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.218501, -0.721835, 0.656667], [-0.97193, -0.10083, 0.212566], [-0.087226, -0.684681, -0.723605]]; the translation vector: [2.10902, 2.428258, 1.386435], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.219, -0.964, 1.58, 0.354, 3.912, 2.347], [0.255, 0.996, 1.173, 3.495, 1.006, 2.248], [1.513, 0.137, 0.79, 0.63, 3.182, 2.432]]\nB: [[-1.818, -0.647, 1.066, 0.433, 4.095, 1.902], [0.12, 0.784, 1.447, 3.712, 0.386, 2.623], [1.292, -0.011, 0.89, 0.451, 3.017, 2.105]]\nC: [[-1.598, -0.539, 1.125, 0.503, 3.791, 2.392], [-0.019, 1.26, 1.209, 3.332, 0.548, 2.478], [1.708, -0.009, 1.196, 0.447, 2.783, 2.468]]\nD: [[-1.147, -0.143, 1.224, 0.476, 4.202, 2.039], [-0.033, 1.197, 1.039, 3.572, 0.489, 2.65], [1.648, -0.334, 1.403, 0.735, 3.031, 2.541]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_145_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_145_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.089, 0.93, 0.906, 1.952, 6.847, 0.759], [2.364, 2.183, -0.461, 0.089, -0.064, -0.412], [0.126, -4.445, 0.264, 1.544, 0.593, 0.846], [1.593, -4.523, 0.942, 1.504, 1.198, 0.582]]\nB: [[0.288, 1.03, -0.052, 1.738, 7.022, 0.872], [3.315, 2.231, -0.328, 0.592, 0.336, -0.023], [0.311, -4.176, 1.057, 1.806, 0.812, 1.384], [1.759, -3.771, 0.974, 2.086, 0.713, 1.164]]\nC: [[0.167, 0.689, 0.442, 1.571, 6.663, 0.887], [2.849, 2.011, -0.011, 0.132, 0.183, 0.035], [-0.085, -4.074, 0.615, 1.543, 0.713, 0.958], [1.39, -4.168, 0.506, 1.716, 0.715, 0.966]]\nD: [[0.313, 0.252, 0.284, 1.649, 6.826, 1.244], [2.392, 1.917, -0.34, 0.488, -0.05, 0.218], [0.064, -3.679, 0.658, 2.001, 0.36, 1.007], [1.092, -4.59, 0.839, 1.267, 0.336, 1.034]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the table in the scene. 
The camera pose information includes: the rotation matrix: [[-0.241978, -0.427128, 0.871211], [-0.963615, 0.210861, -0.164264], [-0.113543, -0.879261, -0.462611]]; the translation vector: [2.164319, 10.11033, 1.716674], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.089, 0.93, 0.906, 1.952, 6.847, 0.759], [2.364, 2.183, -0.461, 0.089, -0.064, -0.412], [0.126, -4.445, 0.264, 1.544, 0.593, 0.846], [1.593, -4.523, 0.942, 1.504, 1.198, 0.582]]\nB: [[0.288, 1.03, -0.052, 1.738, 7.022, 0.872], [3.315, 2.231, -0.328, 0.592, 0.336, -0.023], [0.311, -4.176, 1.057, 1.806, 0.812, 1.384], [1.759, -3.771, 0.974, 2.086, 0.713, 1.164]]\nC: [[0.167, 0.689, 0.442, 1.571, 6.663, 0.887], [2.849, 2.011, -0.011, 0.132, 0.183, 0.035], [-0.085, -4.074, 0.615, 1.543, 0.713, 0.958], [1.39, -4.168, 0.506, 1.716, 0.715, 0.966]]\nD: [[0.313, 0.252, 0.284, 1.649, 6.826, 1.244], [2.392, 1.917, -0.34, 0.488, -0.05, 0.218], [0.064, -3.679, 0.658, 2.001, 0.36, 1.007], [1.092, -4.59, 0.839, 1.267, 0.336, 1.034]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_146_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_146_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.766, -1.392, 1.241, 0.359, 0.114, 0.509]]\nB: [[0.818, -0.933, 0.887, 0.454, 0.574, 0.13]]\nC: [[0.354, -0.503, 0.874, 0.511, 0.736, 0.517]]\nD: [[1.154, 
-1.214, 1.131, 0.407, 0.409, -0.18]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the paper cutter in the scene. The camera pose information includes: the rotation matrix: [[0.624751, -0.31057, 0.716403], [-0.780527, -0.273701, 0.562018], [0.021534, -0.910293, -0.413403]]; the translation vector: [-0.212106, 0.775797, 1.619325], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.766, -1.392, 1.241, 0.359, 0.114, 0.509]]\nB: [[0.818, -0.933, 0.887, 0.454, 0.574, 0.13]]\nC: [[0.354, -0.503, 0.874, 0.511, 0.736, 0.517]]\nD: [[1.154, -1.214, 1.131, 0.407, 0.409, -0.18]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_147_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_147_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.354, -1.662, 1.012, 0.018, 0.115, 0.103]]\nB: [[-0.792, -1.485, 1.441, 0.393, -0.105, 0.505]]\nC: [[-0.528, -1.745, 1.201, 0.1, 0.492, -0.087]]\nD: [[-0.815, -1.664, 1.36, -0.019, -0.178, -0.353]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the light switch in the scene. 
The camera pose information includes: the rotation matrix: [[-0.677945, 0.409221, -0.610679], [0.735109, 0.38004, -0.561413], [0.00234, -0.829523, -0.558468]]; the translation vector: [3.092599, 2.044437, 1.437429], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.354, -1.662, 1.012, 0.018, 0.115, 0.103]]\nB: [[-0.792, -1.485, 1.441, 0.393, -0.105, 0.505]]\nC: [[-0.528, -1.745, 1.201, 0.1, 0.492, -0.087]]\nD: [[-0.815, -1.664, 1.36, -0.019, -0.178, -0.353]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_148_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_148_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.747, -2.431, 0.458, 6.984, 5.717, 0.813], [-0.691, 2.68, 0.322, 9.392, 3.093, 0.827]]\nB: [[0.057, -2.429, 0.573, 7.86, 5.95, 0.072], [-0.235, 2.427, 0.641, 8.832, 2.983, 0.523]]\nC: [[0.397, -2.595, 0.294, 7.23, 6.005, 0.625], [-0.735, 2.197, 0.57, 9.4, 2.894, 0.932]]\nD: [[0.26, -2.542, 0.108, 7.4, 6.111, 0.419], [-0.69, 2.286, 0.483, 9.253, 2.675, 0.512]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. 
The camera pose information includes: the rotation matrix: [[-0.928108, -0.125197, 0.35063], [-0.371823, 0.3599, -0.855699], [-0.019061, -0.924553, -0.380577]]; the translation vector: [5.296664, 4.137775, 1.856988], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.747, -2.431, 0.458, 6.984, 5.717, 0.813], [-0.691, 2.68, 0.322, 9.392, 3.093, 0.827]]\nB: [[0.057, -2.429, 0.573, 7.86, 5.95, 0.072], [-0.235, 2.427, 0.641, 8.832, 2.983, 0.523]]\nC: [[0.397, -2.595, 0.294, 7.23, 6.005, 0.625], [-0.735, 2.197, 0.57, 9.4, 2.894, 0.932]]\nD: [[0.26, -2.542, 0.108, 7.4, 6.111, 0.419], [-0.69, 2.286, 0.483, 9.253, 2.675, 0.512]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_149_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_149_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.374, -0.417, 0.072, 5.927, 4.746, 0.404], [1.639, 0.598, 0.366, 1.118, 4.293, 0.368], [2.9, -1.425, 0.632, 1.428, 0.28, 0.254]]\nB: [[0.799, -0.748, 0.531, 6.317, 4.867, 0.446], [1.703, 0.912, 0.069, 1.474, 3.826, 0.769], [3.325, -1.637, 0.524, 1.447, 0.08, -0.103]]\nC: [[0.109, -0.618, 0.303, 6.211, 4.572, 0.089], [2.121, 0.402, 0.274, 1.014, 4.79, 0.298], [2.929, -1.018, 0.279, 1.903, -0.107, 0.311]]\nD: [[-0.053, -0.73, 0.366, 5.716, 4.946, 0.696], [1.94, 0.414, 0.125, 0.997, 4.139, 0.213], [2.533, -1.383, 0.826, 
1.711, 0.446, 0.207]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. The camera pose information includes: the rotation matrix: [[-0.052123, 0.492225, -0.868906], [0.996177, 0.08671, -0.010637], [0.070107, -0.866138, -0.494863]]; the translation vector: [3.27549, 2.071379, 1.287401], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.374, -0.417, 0.072, 5.927, 4.746, 0.404], [1.639, 0.598, 0.366, 1.118, 4.293, 0.368], [2.9, -1.425, 0.632, 1.428, 0.28, 0.254]]\nB: [[0.799, -0.748, 0.531, 6.317, 4.867, 0.446], [1.703, 0.912, 0.069, 1.474, 3.826, 0.769], [3.325, -1.637, 0.524, 1.447, 0.08, -0.103]]\nC: [[0.109, -0.618, 0.303, 6.211, 4.572, 0.089], [2.121, 0.402, 0.274, 1.014, 4.79, 0.298], [2.929, -1.018, 0.279, 1.903, -0.107, 0.311]]\nD: [[-0.053, -0.73, 0.366, 5.716, 4.946, 0.696], [1.94, 0.414, 0.125, 0.997, 4.139, 0.213], [2.533, -1.383, 0.826, 1.711, 0.446, 0.207]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_150_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_150_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.627, -3.376, 0.855, 6.372, 0.045, 1.93], [-4.559, -0.814, 0.815, 0.23, 4.236, 2.315], [2.457, -1.253, 1.135, 0.502, 4.3, 1.849], [2.763, 1.048, 0.711, 0.538, 0.174, 2.307], [2.637, 1.438, 0.737, -0.26, 
0.359, 1.655], [2.838, 1.209, 1.15, -0.136, 0.413, 2.018], [2.547, 3.388, 0.7, 0.133, 3.272, 2.033], [1.842, 5.509, 1.228, 1.639, 0.473, 2.533], [1.462, 4.688, 1.401, -0.073, 1.6, 2.078], [3.735, 2.121, 1.265, 0.57, 0.876, 2.223], [3.212, 2.741, 1.782, 1.502, 0.578, -0.004]]\nB: [[-0.903, -3.755, 0.874, 6.176, 0.003, 1.35], [-4.554, -0.741, 1.026, -0.065, 4.304, 1.597], [2.299, -1.131, 0.865, 0.391, 4.838, 2.495], [2.651, 0.847, 1.18, 0.317, 0.031, 1.522], [1.968, 0.985, 1.174, 0.018, 0.505, 1.538], [2.371, 1.973, 0.767, 0.103, 0.399, 1.514], [2.144, 3.832, 1.306, -0.158, 3.954, 2.582], [1.91, 5.239, 0.926, 1.148, 0.013, 2.526], [1.285, 4.859, 0.706, 0.551, 0.734, 2.356], [3.523, 2.358, 1.085, 0.08, 1.478, 2.236], [3.224, 3.037, 2.38, 1.03, -0.257, 0.188]]\nC: [[-1.266, -3.485, 0.564, 6.835, 0.324, 1.764], [-3.629, -0.962, 1.016, -0.055, 4.598, 2.313], [2.437, -1.124, 1.463, -0.109, 4.49, 2.527], [2.113, 1.183, 0.667, 0.701, -0.094, 1.699], [2.045, 1.707, 0.812, -0.196, 0.311, 1.762], [2.268, 1.659, 0.591, -0.094, 0.192, 1.734], [2.994, 3.302, 1.563, 0.269, 3.696, 2.084], [1.688, 4.818, 0.728, 1.225, 0.665, 2.09], [0.999, 4.388, 1.202, -0.003, 0.99, 2.125], [3.205, 2.357, 1.438, 0.088, 1.176, 2.548], [3.215, 2.712, 1.754, 0.977, -0.046, 0.843]]\nD: [[-0.786, -3.408, 0.812, 6.62, 0.23, 1.627], [-4.064, -0.926, 0.97, 0.26, 4.308, 1.916], [2.527, -1.13, 1.126, 0.243, 4.695, 2.21], [2.347, 1.178, 0.964, 0.317, 0.117, 1.88], [2.227, 1.442, 1.014, 0.145, 0.618, 2.004], [2.348, 1.638, 0.879, 0.351, 0.101, 1.748], [2.53, 3.466, 1.165, 0.288, 3.589, 2.389], [1.982, 5.141, 1.204, 1.32, 0.458, 2.309], [1.319, 4.76, 1.151, 0.233, 1.14, 2.044], [3.664, 2.514, 1.17, 0.338, 1.259, 2.387], [3.264, 3.152, 2.173, 1.061, 0.098, 0.435]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. 
The camera pose information includes: the rotation matrix: [[0.688084, 0.423256, -0.589401], [0.725514, -0.415863, 0.54835], [-0.013017, -0.80493, -0.593227]]; the translation vector: [3.968163, 0.8771, 1.421607], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.627, -3.376, 0.855, 6.372, 0.045, 1.93], [-4.559, -0.814, 0.815, 0.23, 4.236, 2.315], [2.457, -1.253, 1.135, 0.502, 4.3, 1.849], [2.763, 1.048, 0.711, 0.538, 0.174, 2.307], [2.637, 1.438, 0.737, -0.26, 0.359, 1.655], [2.838, 1.209, 1.15, -0.136, 0.413, 2.018], [2.547, 3.388, 0.7, 0.133, 3.272, 2.033], [1.842, 5.509, 1.228, 1.639, 0.473, 2.533], [1.462, 4.688, 1.401, -0.073, 1.6, 2.078], [3.735, 2.121, 1.265, 0.57, 0.876, 2.223], [3.212, 2.741, 1.782, 1.502, 0.578, -0.004]]\nB: [[-0.903, -3.755, 0.874, 6.176, 0.003, 1.35], [-4.554, -0.741, 1.026, -0.065, 4.304, 1.597], [2.299, -1.131, 0.865, 0.391, 4.838, 2.495], [2.651, 0.847, 1.18, 0.317, 0.031, 1.522], [1.968, 0.985, 1.174, 0.018, 0.505, 1.538], [2.371, 1.973, 0.767, 0.103, 0.399, 1.514], [2.144, 3.832, 1.306, -0.158, 3.954, 2.582], [1.91, 5.239, 0.926, 1.148, 0.013, 2.526], [1.285, 4.859, 0.706, 0.551, 0.734, 2.356], [3.523, 2.358, 1.085, 0.08, 1.478, 2.236], [3.224, 3.037, 2.38, 1.03, -0.257, 0.188]]\nC: [[-1.266, -3.485, 0.564, 6.835, 0.324, 1.764], [-3.629, -0.962, 1.016, -0.055, 4.598, 2.313], [2.437, -1.124, 1.463, -0.109, 4.49, 2.527], [2.113, 1.183, 0.667, 0.701, -0.094, 1.699], [2.045, 1.707, 0.812, -0.196, 0.311, 1.762], [2.268, 1.659, 0.591, -0.094, 0.192, 1.734], 
[2.994, 3.302, 1.563, 0.269, 3.696, 2.084], [1.688, 4.818, 0.728, 1.225, 0.665, 2.09], [0.999, 4.388, 1.202, -0.003, 0.99, 2.125], [3.205, 2.357, 1.438, 0.088, 1.176, 2.548], [3.215, 2.712, 1.754, 0.977, -0.046, 0.843]]\nD: [[-0.786, -3.408, 0.812, 6.62, 0.23, 1.627], [-4.064, -0.926, 0.97, 0.26, 4.308, 1.916], [2.527, -1.13, 1.126, 0.243, 4.695, 2.21], [2.347, 1.178, 0.964, 0.317, 0.117, 1.88], [2.227, 1.442, 1.014, 0.145, 0.618, 2.004], [2.348, 1.638, 0.879, 0.351, 0.101, 1.748], [2.53, 3.466, 1.165, 0.288, 3.589, 2.389], [1.982, 5.141, 1.204, 1.32, 0.458, 2.309], [1.319, 4.76, 1.151, 0.233, 1.14, 2.044], [3.664, 2.514, 1.17, 0.338, 1.259, 2.387], [3.264, 3.152, 2.173, 1.061, 0.098, 0.435]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_151_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_151_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[2.24, 0.127, 1.561, 0.17, 6.452, 1.322]]\nB: [[1.808, 0.445, 1.097, -0.163, 6.206, 1.162]]\nC: [[2.076, -0.046, 1.133, 0.371, 6.595, 0.908]]\nD: [[2.152, 0.397, 1.838, 0.141, 6.038, 1.758]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the whiteboard in the scene. The camera pose information includes: the rotation matrix: [[-0.176261, -0.039155, 0.983564], [-0.983722, -0.028492, -0.177423], [0.03497, -0.998827, -0.033496]]; the translation vector: [3.054739, 2.437738, 1.503838], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[2.24, 0.127, 1.561, 0.17, 6.452, 1.322]]\nB: [[1.808, 0.445, 1.097, -0.163, 6.206, 1.162]]\nC: [[2.076, -0.046, 1.133, 0.371, 6.595, 0.908]]\nD: [[2.152, 0.397, 1.838, 0.141, 6.038, 1.758]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_152_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_152_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.377, 1.75, 0.313, 0.637, 1.084, 1.086], [-0.879, 2.594, 0.422, 0.742, 0.234, 0.85]]\nB: [[-0.13, 2.575, 0.337, 1.18, 0.166, 1.328], [-0.585, 2.455, 0.693, 0.879, 0.311, 1.309]]\nC: [[-0.219, 1.756, 0.406, 1.056, 0.511, 0.967], [-0.363, 1.945, 0.501, 1.166, 0.939, 0.804]]\nD: [[-0.109, 2.202, 0.796, 0.742, 0.65, 0.918], [-0.778, 2.232, 0.83, 0.777, 0.638, 0.953]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the sofa chair in the scene. The camera pose information includes: the rotation matrix: [[0.753053, 0.123809, -0.646206], [0.619922, -0.462608, 0.633791], [-0.220471, -0.877875, -0.42512]]; the translation vector: [4.259223, 3.769218, 1.505729], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.377, 1.75, 0.313, 0.637, 1.084, 1.086], [-0.879, 2.594, 0.422, 0.742, 0.234, 0.85]]\nB: [[-0.13, 2.575, 0.337, 1.18, 0.166, 1.328], [-0.585, 2.455, 0.693, 0.879, 0.311, 1.309]]\nC: [[-0.219, 1.756, 0.406, 1.056, 0.511, 0.967], [-0.363, 1.945, 0.501, 1.166, 0.939, 0.804]]\nD: [[-0.109, 2.202, 0.796, 0.742, 0.65, 0.918], [-0.778, 2.232, 0.83, 0.777, 0.638, 0.953]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_153_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_153_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.251, 0.156, -0.128, -0.051, 0.628, 0.239], [0.767, -0.47, 0.609, 0.494, 0.504, 0.166], [-0.094, 1.499, 1.301, 0.409, 0.637, -0.038], [-0.218, 1.599, 2.043, 0.222, -0.051, -0.084], [0.774, 1.201, 1.813, 0.251, -0.38, 0.063]]\nB: [[-0.408, 0.627, 0.343, 0.188, 0.542, 0.081], [0.501, -0.329, 0.635, 0.154, 0.177, 0.084], [0.343, 1.237, 1.788, 0.265, 0.262, 0.091], [0.216, 1.167, 1.709, 0.275, 0.094, 0.094], [0.467, 1.14, 1.723, 0.259, 0.069, 0.108]]\nC: [[-0.807, 0.141, 0.815, 0.53, 0.607, 0.51], [0.917, -0.099, 0.427, -0.321, -0.28, 0.334], [0.531, 1.399, 2.196, 0.43, 0.03, 0.076], [0.47, 0.735, 1.914, -0.093, 0.374, 0.55], [-0.032, 1.587, 2.029, 0.611, -0.009, -0.144]]\nD: [[-0.225, 0.433, 0.214, 0.523, 1.033, -0.125], [0.497, -0.466, 0.903, 0.572, 0.328, -0.033], [0.249, 0.868, 1.316, 0.58, 0.558, -0.337], [0.688, 0.673, 1.442, -0.064, -0.139, -0.391], [0.045, 1.256, 1.359, -0.021, 0.452, 0.403]]", + "question": "Given a RGB 
image and a depth image, please detect the 3D bounding box of the towel in the scene. The camera pose information includes: the rotation matrix: [[0.956223, -0.170898, 0.237554], [-0.292595, -0.544035, 0.786393], [-0.005155, -0.821474, -0.570223]]; the translation vector: [1.275326, 2.834272, 1.3185], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.251, 0.156, -0.128, -0.051, 0.628, 0.239], [0.767, -0.47, 0.609, 0.494, 0.504, 0.166], [-0.094, 1.499, 1.301, 0.409, 0.637, -0.038], [-0.218, 1.599, 2.043, 0.222, -0.051, -0.084], [0.774, 1.201, 1.813, 0.251, -0.38, 0.063]]\nB: [[-0.408, 0.627, 0.343, 0.188, 0.542, 0.081], [0.501, -0.329, 0.635, 0.154, 0.177, 0.084], [0.343, 1.237, 1.788, 0.265, 0.262, 0.091], [0.216, 1.167, 1.709, 0.275, 0.094, 0.094], [0.467, 1.14, 1.723, 0.259, 0.069, 0.108]]\nC: [[-0.807, 0.141, 0.815, 0.53, 0.607, 0.51], [0.917, -0.099, 0.427, -0.321, -0.28, 0.334], [0.531, 1.399, 2.196, 0.43, 0.03, 0.076], [0.47, 0.735, 1.914, -0.093, 0.374, 0.55], [-0.032, 1.587, 2.029, 0.611, -0.009, -0.144]]\nD: [[-0.225, 0.433, 0.214, 0.523, 1.033, -0.125], [0.497, -0.466, 0.903, 0.572, 0.328, -0.033], [0.249, 0.868, 1.316, 0.58, 0.558, -0.337], [0.688, 0.673, 1.442, -0.064, -0.139, -0.391], [0.045, 1.256, 1.359, -0.021, 0.452, 0.403]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_154_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_154_1.png" + ], + "output": "B" + }, + { + "task": 
"threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.306, -0.014, 1.979, 4.019, 7.414, 0.307]]\nB: [[0.103, 0.292, 1.906, 4.176, 7.558, 0.724]]\nC: [[0.489, 0.437, 1.928, 3.337, 7.327, 0.317]]\nD: [[0.278, 0.096, 1.983, 3.8, 7.07, 0.334]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the ceiling in the scene. The camera pose information includes: the rotation matrix: [[-0.443363, -0.325026, 0.835337], [-0.895367, 0.117125, -0.429651], [0.041809, -0.938424, -0.342946]]; the translation vector: [2.190343, 3.392878, 1.594635], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.306, -0.014, 1.979, 4.019, 7.414, 0.307]]\nB: [[0.103, 0.292, 1.906, 4.176, 7.558, 0.724]]\nC: [[0.489, 0.437, 1.928, 3.337, 7.327, 0.317]]\nD: [[0.278, 0.096, 1.983, 3.8, 7.07, 0.334]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_155_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_155_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.28, 0.304, -0.341, 2.294, 3.044, -0.02], [-1.331, 0.139, -0.157, 1.326, 1.929, 0.397]]\nB: [[0.062, -0.149, 0.038, 2.484, 2.88, 0.127], [-1.587, -0.328, 0.005, 1.461, 1.909, 0.087]]\nC: [[0.029, -0.482, -0.368, 2.94, 2.904, 0.128], [-1.213, -0.173, 0.109, 1.128, 1.645, 0.334]]\nD: [[0.293, -0.453, -0.316, 2.555, 2.944, -0.31], [-1.538, -0.329, -0.099, 1.121, 2.246, -0.19]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. The camera pose information includes: the rotation matrix: [[0.59597, 0.482312, -0.642025], [0.802979, -0.35126, 0.4815], [0.006716, -0.802491, -0.596626]]; the translation vector: [3.449961, 1.112515, 1.412234], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.28, 0.304, -0.341, 2.294, 3.044, -0.02], [-1.331, 0.139, -0.157, 1.326, 1.929, 0.397]]\nB: [[0.062, -0.149, 0.038, 2.484, 2.88, 0.127], [-1.587, -0.328, 0.005, 1.461, 1.909, 0.087]]\nC: [[0.029, -0.482, -0.368, 2.94, 2.904, 0.128], [-1.213, -0.173, 0.109, 1.128, 1.645, 0.334]]\nD: [[0.293, -0.453, -0.316, 2.555, 2.944, -0.31], [-1.538, -0.329, -0.099, 1.121, 2.246, -0.19]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_156_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_156_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.985, 1.36, 0.323, 0.802, 0.423, -0.124], [2.422, 0.684, 0.754, 0.948, 0.198, 0.546], [-0.687, -2.501, -0.196, 0.73, 0.321, 0.703]]\nB: [[1.365, 1.983, 0.034, 0.504, 0.741, 0.248], [1.92, 1.004, 0.632, 0.401, 0.542, 0.139], [-0.839, -2.927, -0.163, 0.351, 0.752, 0.304]]\nC: [[1.454, 1.792, 0.377, 0.63, 0.637, 0.263], [2.367, 0.546, 0.29, 0.458, 0.434, 0.427], [-1.072, -2.953, 0.222, 0.398, 0.377, 0.406]]\nD: [[1.15, 1.795, -0.026, 0.476, 0.371, 0.563], [2.092, 0.112, 0.084, 0.025, 0.591, 0.3], [-1.071, -3.278, 0.072, 0.031, 0.342, 0.689]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the seat in the scene. The camera pose information includes: the rotation matrix: [[0.000188, -0.47362, 0.88073], [-0.997828, 0.057931, 0.031365], [-0.065877, -0.878822, -0.47258]]; the translation vector: [4.366519, 5.511691, 1.307889], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.985, 1.36, 0.323, 0.802, 0.423, -0.124], [2.422, 0.684, 0.754, 0.948, 0.198, 0.546], [-0.687, -2.501, -0.196, 0.73, 0.321, 0.703]]\nB: [[1.365, 1.983, 0.034, 0.504, 0.741, 0.248], [1.92, 1.004, 0.632, 0.401, 0.542, 0.139], [-0.839, -2.927, -0.163, 0.351, 0.752, 0.304]]\nC: [[1.454, 1.792, 0.377, 0.63, 0.637, 0.263], [2.367, 0.546, 0.29, 0.458, 0.434, 0.427], [-1.072, -2.953, 0.222, 0.398, 0.377, 0.406]]\nD: [[1.15, 1.795, -0.026, 0.476, 0.371, 0.563], [2.092, 0.112, 0.084, 0.025, 0.591, 0.3], [-1.071, -3.278, 0.072, 0.031, 0.342, 0.689]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_157_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_157_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.531, 1.811, 0.438, 0.96, -0.248, 1.075], [-1.41, -1.002, 0.77, 0.47, 1.246, 1.71]]\nB: [[-0.951, 1.545, 0.971, 1.428, 0.629, 1.104], [-2.05, -1.043, 0.695, 0.095, 1.11, 1.626]]\nC: [[-1.029, 1.273, 0.377, 1.0, 0.707, 1.651], [-1.265, -0.353, 1.355, -0.02, 0.917, 2.297]]\nD: [[-1.312, 1.674, 0.691, 1.103, 0.228, 1.421], [-1.753, -0.603, 0.956, 0.349, 1.091, 2.04]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. 
The camera pose information includes: the rotation matrix: [[0.927869, -0.125596, 0.351119], [-0.372891, -0.32108, 0.870551], [0.003399, -0.938687, -0.344754]]; the translation vector: [5.442723, 4.031985, 1.348893], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.531, 1.811, 0.438, 0.96, -0.248, 1.075], [-1.41, -1.002, 0.77, 0.47, 1.246, 1.71]]\nB: [[-0.951, 1.545, 0.971, 1.428, 0.629, 1.104], [-2.05, -1.043, 0.695, 0.095, 1.11, 1.626]]\nC: [[-1.029, 1.273, 0.377, 1.0, 0.707, 1.651], [-1.265, -0.353, 1.355, -0.02, 0.917, 2.297]]\nD: [[-1.312, 1.674, 0.691, 1.103, 0.228, 1.421], [-1.753, -0.603, 0.956, 0.349, 1.091, 2.04]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_158_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_158_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.1, -0.155, 0.488, 0.884, 0.762, 1.139]]\nB: [[-1.609, -0.239, 0.938, 0.096, 1.426, 1.078]]\nC: [[-1.861, -0.273, 0.81, 0.598, 0.82, 0.783]]\nD: [[-1.644, -0.605, 0.583, 0.402, 1.231, 1.185]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shelf in the scene. 
The camera pose information includes: the rotation matrix: [[-0.070416, -0.411804, 0.908548], [-0.99671, 0.065705, -0.047468], [-0.040148, -0.908901, -0.415075]]; the translation vector: [2.214543, 1.806687, 1.391502], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-2.1, -0.155, 0.488, 0.884, 0.762, 1.139]]\nB: [[-1.609, -0.239, 0.938, 0.096, 1.426, 1.078]]\nC: [[-1.861, -0.273, 0.81, 0.598, 0.82, 0.783]]\nD: [[-1.644, -0.605, 0.583, 0.402, 1.231, 1.185]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_159_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_159_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.605, -0.075, 2.599, 6.78, 7.188, 0.753]]\nB: [[-0.136, -0.074, 2.664, 7.091, 7.331, 0.624]]\nC: [[-0.11, -0.067, 2.645, 6.713, 7.047, 0.627]]\nD: [[-0.59, -0.552, 3.048, 6.673, 7.52, 0.884]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the ceiling in the scene. The camera pose information includes: the rotation matrix: [[-0.955421, 0.119616, -0.269932], [0.295248, 0.388339, -0.872939], [0.000408, -0.91372, -0.406343]]; the translation vector: [2.65583, 2.981598, 1.368648], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.605, -0.075, 2.599, 6.78, 7.188, 0.753]]\nB: [[-0.136, -0.074, 2.664, 7.091, 7.331, 0.624]]\nC: [[-0.11, -0.067, 2.645, 6.713, 7.047, 0.627]]\nD: [[-0.59, -0.552, 3.048, 6.673, 7.52, 0.884]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_160_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_160_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.773, 0.999, 2.176, 3.762, 2.16, 0.713]]\nB: [[1.402, 0.542, 2.42, 3.544, 2.145, 0.268]]\nC: [[0.962, 0.894, 1.956, 3.984, 2.23, 0.213]]\nD: [[1.508, 0.544, 2.14, 3.898, 2.009, 0.701]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the ceiling in the scene. The camera pose information includes: the rotation matrix: [[-0.454685, 0.144673, -0.878824], [0.890085, 0.109034, -0.442562], [0.031795, -0.983454, -0.178347]]; the translation vector: [3.311996, 2.119304, 1.59409], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.773, 0.999, 2.176, 3.762, 2.16, 0.713]]\nB: [[1.402, 0.542, 2.42, 3.544, 2.145, 0.268]]\nC: [[0.962, 0.894, 1.956, 3.984, 2.23, 0.213]]\nD: [[1.508, 0.544, 2.14, 3.898, 2.009, 0.701]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_161_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_161_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.445, -1.591, 0.401, 1.208, 0.362, 0.709], [0.85, 1.846, 0.287, 0.495, 1.135, -0.015]]\nB: [[1.318, -1.383, 0.256, 0.782, 0.724, 0.542], [1.339, 2.155, 0.239, 0.765, 0.899, 0.445]]\nC: [[1.731, -1.727, 0.665, 0.715, 0.694, 0.718], [0.941, 2.531, -0.018, 1.235, 0.51, 0.14]]\nD: [[1.454, -1.414, -0.024, 0.443, 0.46, 0.088], [0.989, 2.555, 0.477, 1.003, 1.282, 0.286]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the coffee table in the scene. The camera pose information includes: the rotation matrix: [[0.990268, -0.101591, 0.095124], [-0.135934, -0.559426, 0.817658], [-0.029851, -0.822631, -0.567792]]; the translation vector: [6.679901, 2.488796, 1.402653], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.445, -1.591, 0.401, 1.208, 0.362, 0.709], [0.85, 1.846, 0.287, 0.495, 1.135, -0.015]]\nB: [[1.318, -1.383, 0.256, 0.782, 0.724, 0.542], [1.339, 2.155, 0.239, 0.765, 0.899, 0.445]]\nC: [[1.731, -1.727, 0.665, 0.715, 0.694, 0.718], [0.941, 2.531, -0.018, 1.235, 0.51, 0.14]]\nD: [[1.454, -1.414, -0.024, 0.443, 0.46, 0.088], [0.989, 2.555, 0.477, 1.003, 1.282, 0.286]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_162_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_162_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.849, -1.336, 0.075, 0.117, 0.119, 0.051]]\nB: [[1.779, -1.476, 0.442, 0.485, 0.615, 0.395]]\nC: [[1.904, -1.448, 0.152, 0.474, 0.108, -0.174]]\nD: [[1.427, -1.203, 0.391, 0.059, -0.193, -0.208]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[0.246516, -0.470365, 0.847341], [-0.959136, 0.006886, 0.282862], [-0.138884, -0.882445, -0.449446]]; the translation vector: [3.043058, 2.955299, 1.551102], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.849, -1.336, 0.075, 0.117, 0.119, 0.051]]\nB: [[1.779, -1.476, 0.442, 0.485, 0.615, 0.395]]\nC: [[1.904, -1.448, 0.152, 0.474, 0.108, -0.174]]\nD: [[1.427, -1.203, 0.391, 0.059, -0.193, -0.208]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_163_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_163_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.487, 1.645, 0.382, 0.302, 0.482, 0.482], [0.998, 1.485, 0.258, 0.64, 0.279, 0.771], [0.516, 1.799, 0.05, 0.639, 0.719, 0.616], [0.027, 2.307, 0.334, 0.459, 0.957, 1.087], [-0.766, 2.114, 0.879, 0.637, 0.926, 1.186], [-1.637, 1.716, 0.195, 0.889, 0.178, 0.887], [-1.878, 1.498, 0.042, 1.014, 0.211, 0.465], [-2.168, 2.812, 0.748, 0.089, 0.133, 0.111], [-2.511, -0.128, 0.278, 0.262, 0.627, 1.122], [-3.252, -1.403, 0.351, 0.543, 0.875, 0.964], [-2.67, -1.518, 0.047, 0.142, 0.368, 0.87], [-0.679, -1.008, 0.86, 0.74, 0.76, 1.005], [-0.054, -1.776, 0.013, 0.602, 0.383, 0.483], [0.442, -1.702, 0.129, 0.294, 0.491, 1.264], [-0.906, -1.527, 0.588, 0.773, 1.129, 1.323], [1.797, -0.711, 0.306, 0.186, 0.995, 1.019], [1.694, -1.236, 0.46, 0.778, 1.151, 1.284], [1.24, -1.901, 0.667, 0.364, 1.023, 0.918], [2.398, -1.858, 0.981, 0.284, 0.726, 0.83], [3.224, -1.673, 0.285, 0.245, 0.491, 0.975], [1.768, 2.317, 0.9, 0.325, 1.027, 1.062], [1.496, 2.661, 0.93, -0.007, 0.819, 0.169]]\nB: [[1.146, 0.948, 0.122, 0.95, 1.005, 0.549], [0.748, 0.973, 0.155, 0.6, 0.484, 0.381], [0.549, 2.17, 0.091, 0.666, 0.717, 0.866], [0.554, 2.811, 0.815, 0.733, 0.125, 0.7], [-0.925, 2.187, 0.558, 0.622, 0.213, 0.874], [-1.744, 1.277, 0.001, 0.646, 0.973, 0.704], [-1.809, 2.353, 0.808, 0.048, 0.797, 0.7], [-1.256, 2.641, 1.036, 0.522, 0.609, 0.158], [-1.879, -0.087, 0.596, 0.81, 0.571, 0.463], [-3.345, -0.961, 0.298, 
0.354, 0.59, 1.207], [-2.802, -1.895, 0.135, 0.976, 1.183, 0.764], [-0.847, -1.618, 0.508, 0.783, 0.348, 1.292], [-0.511, -1.056, 0.376, 0.73, 0.392, 1.159], [-0.024, -2.057, 0.759, 0.532, 0.455, 0.817], [-0.251, -2.214, 0.173, 1.127, 0.862, 0.708], [2.174, -0.228, 0.822, 0.364, 0.554, 0.827], [1.928, -1.877, 0.198, 0.653, 1.131, 1.053], [1.218, -2.319, 0.663, 0.163, 0.153, 0.793], [2.951, -1.156, 0.405, 1.011, 0.624, 0.772], [3.153, -1.986, 0.421, 0.263, 0.33, 0.7], [1.367, 2.28, 0.547, 1.058, 0.935, 1.287], [2.006, 2.966, 0.782, 0.332, 0.619, 0.04]]\nC: [[1.346, 1.054, 0.767, 0.951, 0.758, 0.769], [0.659, 1.706, 0.684, 0.913, 0.914, 1.319], [0.805, 2.288, 0.288, 0.155, 0.839, 0.635], [0.287, 2.236, 0.545, 0.587, 0.976, 0.783], [-1.118, 2.319, 0.772, 1.192, 0.851, 0.415], [-1.552, 1.463, 0.231, 0.636, 0.79, 0.457], [-1.992, 2.15, 0.851, 0.919, 1.11, 0.624], [-1.923, 2.253, 1.26, 0.407, 0.257, 0.58], [-2.064, -0.023, 0.196, 0.32, 0.999, 0.859], [-3.224, -1.369, 0.324, 1.046, 0.849, 0.941], [-2.953, -2.153, 0.953, 0.315, 0.426, 0.39], [-0.29, -1.189, 0.464, 0.368, 1.039, 1.28], [-0.098, -2.0, 0.391, 0.817, 0.212, 1.036], [0.365, -1.822, 0.164, 0.214, 0.365, 0.378], [-0.847, -2.191, 0.875, 0.968, 0.479, 0.553], [2.332, -0.438, 0.431, 0.138, 0.956, 1.041], [1.345, -2.004, 0.538, 0.439, 0.287, 0.73], [1.393, -1.64, 0.88, 0.322, 0.297, 1.16], [3.052, -1.259, 0.761, 0.943, 0.828, 0.5], [2.735, -2.476, 0.875, 0.335, 1.087, 0.495], [1.336, 2.088, 0.504, 0.673, 1.053, 0.469], [1.386, 2.116, 0.605, 0.191, 0.61, 0.101]]\nD: [[1.518, 1.271, 0.394, 0.605, 0.594, 0.849], [0.943, 1.353, 0.378, 0.619, 0.666, 0.828], [0.701, 1.955, 0.404, 0.648, 0.696, 0.84], [0.523, 2.479, 0.454, 0.541, 0.563, 0.792], [-1.051, 2.117, 0.448, 0.79, 0.709, 0.804], [-1.341, 1.248, 0.462, 0.57, 0.622, 0.853], [-1.574, 1.994, 0.519, 0.538, 0.675, 0.779], [-1.737, 2.403, 0.858, 0.16, 0.317, 0.168], [-2.078, -0.466, 0.495, 0.568, 0.586, 0.801], [-2.925, -1.082, 0.538, 0.66, 0.66, 0.803], [-3.037, -1.752, 
0.519, 0.574, 0.705, 0.845], [-0.539, -1.191, 0.375, 0.64, 0.637, 0.843], [-0.068, -1.536, 0.384, 0.646, 0.641, 0.825], [-0.052, -2.09, 0.408, 0.661, 0.773, 0.824], [-0.669, -1.919, 0.395, 0.676, 0.647, 0.832], [2.151, -0.689, 0.438, 0.636, 0.62, 0.802], [1.695, -1.528, 0.421, 0.589, 0.733, 0.82], [1.703, -2.028, 0.457, 0.561, 0.65, 0.798], [2.65, -1.483, 0.534, 0.701, 0.712, 0.852], [2.844, -2.087, 0.588, 0.548, 0.714, 0.804], [1.775, 1.985, 0.459, 0.664, 0.67, 0.811], [1.768, 2.603, 0.602, 0.329, 0.514, 0.537]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[0.424269, -0.366439, 0.828081], [-0.894198, -0.025281, 0.446957], [-0.142848, -0.930098, -0.338395]]; the translation vector: [2.638367, 6.760901, 1.41712], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.487, 1.645, 0.382, 0.302, 0.482, 0.482], [0.998, 1.485, 0.258, 0.64, 0.279, 0.771], [0.516, 1.799, 0.05, 0.639, 0.719, 0.616], [0.027, 2.307, 0.334, 0.459, 0.957, 1.087], [-0.766, 2.114, 0.879, 0.637, 0.926, 1.186], [-1.637, 1.716, 0.195, 0.889, 0.178, 0.887], [-1.878, 1.498, 0.042, 1.014, 0.211, 0.465], [-2.168, 2.812, 0.748, 0.089, 0.133, 0.111], [-2.511, -0.128, 0.278, 0.262, 0.627, 1.122], [-3.252, -1.403, 0.351, 0.543, 0.875, 0.964], [-2.67, -1.518, 0.047, 0.142, 0.368, 0.87], [-0.679, -1.008, 0.86, 0.74, 0.76, 1.005], [-0.054, -1.776, 0.013, 0.602, 0.383, 0.483], [0.442, -1.702, 0.129, 0.294, 0.491, 1.264], [-0.906, -1.527, 0.588, 0.773, 1.129, 1.323], [1.797, -0.711, 0.306, 0.186, 0.995, 1.019], [1.694, -1.236, 0.46, 0.778, 1.151, 1.284], [1.24, -1.901, 0.667, 0.364, 1.023, 0.918], [2.398, -1.858, 0.981, 0.284, 0.726, 0.83], [3.224, -1.673, 0.285, 0.245, 0.491, 0.975], [1.768, 2.317, 0.9, 0.325, 1.027, 1.062], [1.496, 2.661, 0.93, -0.007, 0.819, 0.169]]\nB: [[1.146, 0.948, 0.122, 0.95, 1.005, 0.549], [0.748, 0.973, 0.155, 0.6, 0.484, 0.381], [0.549, 2.17, 0.091, 0.666, 0.717, 0.866], [0.554, 2.811, 0.815, 0.733, 0.125, 0.7], [-0.925, 2.187, 0.558, 0.622, 0.213, 0.874], [-1.744, 1.277, 0.001, 0.646, 0.973, 0.704], [-1.809, 2.353, 0.808, 0.048, 0.797, 0.7], [-1.256, 2.641, 1.036, 0.522, 0.609, 0.158], [-1.879, -0.087, 0.596, 0.81, 0.571, 0.463], [-3.345, -0.961, 0.298, 0.354, 0.59, 1.207], [-2.802, -1.895, 0.135, 0.976, 1.183, 0.764], [-0.847, -1.618, 0.508, 0.783, 0.348, 1.292], [-0.511, -1.056, 0.376, 0.73, 0.392, 1.159], [-0.024, -2.057, 0.759, 0.532, 0.455, 0.817], [-0.251, -2.214, 0.173, 1.127, 0.862, 0.708], [2.174, -0.228, 0.822, 0.364, 0.554, 0.827], [1.928, -1.877, 0.198, 0.653, 1.131, 1.053], [1.218, -2.319, 0.663, 0.163, 0.153, 0.793], [2.951, -1.156, 0.405, 1.011, 0.624, 0.772], [3.153, -1.986, 0.421, 0.263, 0.33, 0.7], [1.367, 2.28, 0.547, 1.058, 0.935, 1.287], [2.006, 2.966, 0.782, 0.332, 0.619, 
0.04]]\nC: [[1.346, 1.054, 0.767, 0.951, 0.758, 0.769], [0.659, 1.706, 0.684, 0.913, 0.914, 1.319], [0.805, 2.288, 0.288, 0.155, 0.839, 0.635], [0.287, 2.236, 0.545, 0.587, 0.976, 0.783], [-1.118, 2.319, 0.772, 1.192, 0.851, 0.415], [-1.552, 1.463, 0.231, 0.636, 0.79, 0.457], [-1.992, 2.15, 0.851, 0.919, 1.11, 0.624], [-1.923, 2.253, 1.26, 0.407, 0.257, 0.58], [-2.064, -0.023, 0.196, 0.32, 0.999, 0.859], [-3.224, -1.369, 0.324, 1.046, 0.849, 0.941], [-2.953, -2.153, 0.953, 0.315, 0.426, 0.39], [-0.29, -1.189, 0.464, 0.368, 1.039, 1.28], [-0.098, -2.0, 0.391, 0.817, 0.212, 1.036], [0.365, -1.822, 0.164, 0.214, 0.365, 0.378], [-0.847, -2.191, 0.875, 0.968, 0.479, 0.553], [2.332, -0.438, 0.431, 0.138, 0.956, 1.041], [1.345, -2.004, 0.538, 0.439, 0.287, 0.73], [1.393, -1.64, 0.88, 0.322, 0.297, 1.16], [3.052, -1.259, 0.761, 0.943, 0.828, 0.5], [2.735, -2.476, 0.875, 0.335, 1.087, 0.495], [1.336, 2.088, 0.504, 0.673, 1.053, 0.469], [1.386, 2.116, 0.605, 0.191, 0.61, 0.101]]\nD: [[1.518, 1.271, 0.394, 0.605, 0.594, 0.849], [0.943, 1.353, 0.378, 0.619, 0.666, 0.828], [0.701, 1.955, 0.404, 0.648, 0.696, 0.84], [0.523, 2.479, 0.454, 0.541, 0.563, 0.792], [-1.051, 2.117, 0.448, 0.79, 0.709, 0.804], [-1.341, 1.248, 0.462, 0.57, 0.622, 0.853], [-1.574, 1.994, 0.519, 0.538, 0.675, 0.779], [-1.737, 2.403, 0.858, 0.16, 0.317, 0.168], [-2.078, -0.466, 0.495, 0.568, 0.586, 0.801], [-2.925, -1.082, 0.538, 0.66, 0.66, 0.803], [-3.037, -1.752, 0.519, 0.574, 0.705, 0.845], [-0.539, -1.191, 0.375, 0.64, 0.637, 0.843], [-0.068, -1.536, 0.384, 0.646, 0.641, 0.825], [-0.052, -2.09, 0.408, 0.661, 0.773, 0.824], [-0.669, -1.919, 0.395, 0.676, 0.647, 0.832], [2.151, -0.689, 0.438, 0.636, 0.62, 0.802], [1.695, -1.528, 0.421, 0.589, 0.733, 0.82], [1.703, -2.028, 0.457, 0.561, 0.65, 0.798], [2.65, -1.483, 0.534, 0.701, 0.712, 0.852], [2.844, -2.087, 0.588, 0.548, 0.714, 0.804], [1.775, 1.985, 0.459, 0.664, 0.67, 0.811], [1.768, 2.603, 0.602, 0.329, 0.514, 0.537]]", + "input_image_path": [ + 
"../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_164_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_164_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[2.682, 0.363, 0.619, 1.02, 0.762, 0.226], [0.295, -1.101, 1.206, 0.33, 0.677, 0.052], [-2.902, 0.765, -0.248, 0.755, 0.487, 0.427]]\nB: [[2.599, 0.763, 0.355, 0.291, 0.253, 0.563], [0.364, -0.942, 0.366, 0.823, 0.285, 0.293], [-3.251, -0.039, 0.534, 0.204, 0.315, 0.125]]\nC: [[2.754, 0.716, 0.728, 0.739, 0.536, 0.095], [-0.308, -0.319, 1.147, 0.102, 0.805, 0.177], [-3.102, -0.022, 0.312, 0.658, 0.474, 0.358]]\nD: [[2.461, 0.569, 0.328, 0.546, 0.491, 0.37], [-0.048, -0.818, 0.757, 0.462, 0.439, 0.351], [-2.848, 0.285, 0.131, 0.472, 0.5, 0.37]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the box in the scene. The camera pose information includes: the rotation matrix: [[0.764638, 0.028658, -0.643823], [0.64431, -0.055554, 0.762744], [-0.013909, -0.998044, -0.060944]]; the translation vector: [3.061982, 3.98913, 1.495508], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[2.682, 0.363, 0.619, 1.02, 0.762, 0.226], [0.295, -1.101, 1.206, 0.33, 0.677, 0.052], [-2.902, 0.765, -0.248, 0.755, 0.487, 0.427]]\nB: [[2.599, 0.763, 0.355, 0.291, 0.253, 0.563], [0.364, -0.942, 0.366, 0.823, 0.285, 0.293], [-3.251, -0.039, 0.534, 0.204, 0.315, 0.125]]\nC: [[2.754, 0.716, 0.728, 0.739, 0.536, 0.095], [-0.308, -0.319, 1.147, 0.102, 0.805, 0.177], [-3.102, -0.022, 0.312, 0.658, 0.474, 0.358]]\nD: [[2.461, 0.569, 0.328, 0.546, 0.491, 0.37], [-0.048, -0.818, 0.757, 0.462, 0.439, 0.351], [-2.848, 0.285, 0.131, 0.472, 0.5, 0.37]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_165_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_165_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.116, -0.769, 0.665, 0.588, 1.197, 1.025], [-0.796, -1.178, 0.171, 0.576, 0.604, 0.431], [-0.822, 1.221, 0.066, 0.696, 1.356, 0.542], [0.425, 0.671, 0.344, 0.824, 1.179, 0.866]]\nB: [[0.608, -0.936, 0.413, 0.854, 0.8, 0.77], [-0.425, -0.856, 0.339, 0.897, 0.767, 0.762], [-0.451, 1.126, 0.358, 0.838, 0.91, 0.764], [0.774, 1.047, 0.416, 0.815, 0.841, 0.775]]\nC: [[0.288, -1.283, 0.655, 0.817, 0.674, 0.566], [-0.233, -0.57, 0.023, 0.569, 0.942, 1.169], [-0.037, 0.77, 0.308, 0.824, 1.383, 0.685], [0.662, 1.515, 0.896, 0.594, 0.416, 0.9]]\nD: [[1.018, -1.113, 0.375, 0.665, 0.803, 1.039], [-0.701, -1.19, 0.042, 0.611, 0.648, 0.566], [-0.236, 1.15, 0.63, 1.12, 1.165, 0.969], [0.285, 0.606, 0.443, 1.268, 0.881, 0.591]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. 
The camera pose information includes: the rotation matrix: [[-0.711391, -0.463973, 0.527875], [-0.700286, 0.531398, -0.476672], [-0.059349, -0.708763, -0.702945]]; the translation vector: [2.53321, 4.394931, 1.530427], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.116, -0.769, 0.665, 0.588, 1.197, 1.025], [-0.796, -1.178, 0.171, 0.576, 0.604, 0.431], [-0.822, 1.221, 0.066, 0.696, 1.356, 0.542], [0.425, 0.671, 0.344, 0.824, 1.179, 0.866]]\nB: [[0.608, -0.936, 0.413, 0.854, 0.8, 0.77], [-0.425, -0.856, 0.339, 0.897, 0.767, 0.762], [-0.451, 1.126, 0.358, 0.838, 0.91, 0.764], [0.774, 1.047, 0.416, 0.815, 0.841, 0.775]]\nC: [[0.288, -1.283, 0.655, 0.817, 0.674, 0.566], [-0.233, -0.57, 0.023, 0.569, 0.942, 1.169], [-0.037, 0.77, 0.308, 0.824, 1.383, 0.685], [0.662, 1.515, 0.896, 0.594, 0.416, 0.9]]\nD: [[1.018, -1.113, 0.375, 0.665, 0.803, 1.039], [-0.701, -1.19, 0.042, 0.611, 0.648, 0.566], [-0.236, 1.15, 0.63, 1.12, 1.165, 0.969], [0.285, 0.606, 0.443, 1.268, 0.881, 0.591]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_166_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_166_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.097, 1.124, 0.669, 0.502, 0.516, 0.549], [-0.719, 0.622, 0.51, 0.696, 0.696, 1.008], [0.747, 0.329, 0.449, 0.568, 0.565, 0.934], [0.72, 0.839, 0.522, 0.626, 0.707, 
0.997], [-0.373, -0.636, 0.467, 0.582, 0.551, 0.906]]\nB: [[0.297, 0.852, 0.7, 0.103, 0.44, 0.966], [-0.93, 0.904, 0.062, 0.986, 0.828, 0.767], [0.468, 0.69, 0.657, 0.758, 0.619, 1.108], [0.682, 0.702, 0.346, 0.75, 0.569, 0.847], [-0.423, -0.68, 0.291, 0.082, 0.385, 1.192]]\nC: [[0.512, 0.853, 0.312, 0.021, 0.921, 0.339], [-0.518, 0.57, 0.844, 1.067, 0.275, 1.347], [0.721, 0.423, 0.574, 0.387, 0.991, 1.286], [0.648, 0.46, 0.149, 0.657, 0.835, 0.53], [-0.541, -0.731, 0.203, 0.127, 0.654, 0.996]]\nD: [[0.168, 0.81, 1.159, 0.247, 0.182, 0.73], [-0.91, 0.423, 0.9, 0.946, 0.519, 0.547], [1.221, 0.571, 0.284, 0.571, 0.987, 1.376], [1.146, 0.534, 0.507, 0.778, 0.702, 1.372], [-0.13, -0.402, 0.492, 0.884, 0.774, 1.331]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[-0.236277, -0.452541, 0.859872], [-0.970097, 0.160455, -0.182119], [-0.055554, -0.877189, -0.47692]]; the translation vector: [1.575898, 1.961144, 1.314442], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.097, 1.124, 0.669, 0.502, 0.516, 0.549], [-0.719, 0.622, 0.51, 0.696, 0.696, 1.008], [0.747, 0.329, 0.449, 0.568, 0.565, 0.934], [0.72, 0.839, 0.522, 0.626, 0.707, 0.997], [-0.373, -0.636, 0.467, 0.582, 0.551, 0.906]]\nB: [[0.297, 0.852, 0.7, 0.103, 0.44, 0.966], [-0.93, 0.904, 0.062, 0.986, 0.828, 0.767], [0.468, 0.69, 0.657, 0.758, 0.619, 1.108], [0.682, 0.702, 0.346, 0.75, 0.569, 0.847], [-0.423, -0.68, 0.291, 0.082, 0.385, 1.192]]\nC: [[0.512, 0.853, 0.312, 0.021, 0.921, 0.339], [-0.518, 0.57, 0.844, 1.067, 0.275, 1.347], [0.721, 0.423, 0.574, 0.387, 0.991, 1.286], [0.648, 0.46, 0.149, 0.657, 0.835, 0.53], [-0.541, -0.731, 0.203, 0.127, 0.654, 0.996]]\nD: [[0.168, 0.81, 1.159, 0.247, 0.182, 0.73], [-0.91, 0.423, 0.9, 0.946, 0.519, 0.547], [1.221, 0.571, 0.284, 0.571, 0.987, 1.376], [1.146, 0.534, 0.507, 0.778, 0.702, 1.372], [-0.13, -0.402, 0.492, 0.884, 0.774, 1.331]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_167_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_167_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[2.094, 0.511, 1.124, 0.123, 2.188, 0.539]]\nB: [[2.081, 0.516, 0.947, 0.355, 2.545, 0.592]]\nC: [[1.732, 0.343, 0.947, 0.511, 2.586, 0.308]]\nD: [[1.989, 0.949, 0.649, -0.276, 2.128, 0.539]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the whiteboard in the scene. The camera pose information includes: the rotation matrix: [[-0.997074, 0.061747, -0.045056], [0.074474, 0.651998, -0.754554], [-0.017215, -0.755702, -0.654689]]; the translation vector: [1.815792, 5.369752, 1.288561], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[2.094, 0.511, 1.124, 0.123, 2.188, 0.539]]\nB: [[2.081, 0.516, 0.947, 0.355, 2.545, 0.592]]\nC: [[1.732, 0.343, 0.947, 0.511, 2.586, 0.308]]\nD: [[1.989, 0.949, 0.649, -0.276, 2.128, 0.539]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_168_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_168_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.228, -1.39, 0.131, 0.85, 0.551, 0.683]]\nB: [[-1.305, -1.508, 0.232, 0.822, 0.566, 0.435]]\nC: [[-0.824, -1.786, 0.652, 1.27, 0.727, -0.053]]\nD: [[-0.844, -1.175, 0.453, 0.328, 0.627, 0.359]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the piano bench in the scene. The camera pose information includes: the rotation matrix: [[-0.804945, -0.278842, 0.523748], [-0.593014, 0.407765, -0.694307], [-0.019964, -0.869468, -0.493585]]; the translation vector: [4.871809, 2.494869, 1.402737], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.228, -1.39, 0.131, 0.85, 0.551, 0.683]]\nB: [[-1.305, -1.508, 0.232, 0.822, 0.566, 0.435]]\nC: [[-0.824, -1.786, 0.652, 1.27, 0.727, -0.053]]\nD: [[-0.844, -1.175, 0.453, 0.328, 0.627, 0.359]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_169_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_169_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.141, 1.242, 1.15, 2.629, 0.737, 2.324], [-1.855, -0.469, 1.171, 0.539, 3.779, 2.772], [0.841, 0.114, 0.787, 0.691, 3.588, 2.223], [0.571, -1.772, 1.52, 1.679, 0.887, 2.594]]\nB: [[-0.097, 1.504, 1.106, 3.029, 0.358, 2.218], [-1.545, 0.258, 1.211, 0.132, 3.756, 2.16], [0.905, 0.283, 1.397, -0.093, 3.002, 2.004], [0.394, -1.669, 1.139, 2.247, 0.169, 2.208]]\nC: [[-0.253, 1.653, 1.522, 3.078, 0.478, 2.791], [-1.503, -0.37, 1.376, 0.691, 4.127, 2.703], [1.422, -0.022, 0.986, 0.339, 3.887, 2.497], [-0.134, -1.344, 0.891, 2.344, 0.714, 2.451]]\nD: [[-0.058, 1.533, 1.269, 2.876, 0.624, 2.668], [-1.389, 0.007, 1.251, 0.231, 3.638, 2.637], [1.275, 0.042, 1.086, 0.289, 3.412, 2.272], [0.358, -1.537, 1.122, 1.906, 0.425, 2.129]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. 
The camera pose information includes: the rotation matrix: [[-0.032646, 0.194727, -0.980314], [0.998594, -0.034636, -0.040135], [-0.04177, -0.980246, -0.193322]]; the translation vector: [3.506056, 2.493951, 1.706783], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.141, 1.242, 1.15, 2.629, 0.737, 2.324], [-1.855, -0.469, 1.171, 0.539, 3.779, 2.772], [0.841, 0.114, 0.787, 0.691, 3.588, 2.223], [0.571, -1.772, 1.52, 1.679, 0.887, 2.594]]\nB: [[-0.097, 1.504, 1.106, 3.029, 0.358, 2.218], [-1.545, 0.258, 1.211, 0.132, 3.756, 2.16], [0.905, 0.283, 1.397, -0.093, 3.002, 2.004], [0.394, -1.669, 1.139, 2.247, 0.169, 2.208]]\nC: [[-0.253, 1.653, 1.522, 3.078, 0.478, 2.791], [-1.503, -0.37, 1.376, 0.691, 4.127, 2.703], [1.422, -0.022, 0.986, 0.339, 3.887, 2.497], [-0.134, -1.344, 0.891, 2.344, 0.714, 2.451]]\nD: [[-0.058, 1.533, 1.269, 2.876, 0.624, 2.668], [-1.389, 0.007, 1.251, 0.231, 3.638, 2.637], [1.275, 0.042, 1.086, 0.289, 3.412, 2.272], [0.358, -1.537, 1.122, 1.906, 0.425, 2.129]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_170_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_170_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.22, -0.005, 0.817, 1.09, 2.894, 0.662], [-2.657, -0.552, 1.11, 0.97, 1.155, 0.566]]\nB: [[-1.955, 0.127, 0.536, 1.492, 2.79, 1.348], [-2.155, 0.305, 0.66, 
0.159, 1.455, 0.072]]\nC: [[-1.433, 0.186, 1.02, 1.626, 2.332, 1.534], [-2.448, -0.356, 0.853, 0.017, 1.445, 0.618]]\nD: [[-1.798, 0.428, 0.571, 1.201, 2.441, 1.114], [-2.518, -0.083, 1.081, 0.488, 1.535, 0.157]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the couch in the scene. The camera pose information includes: the rotation matrix: [[-0.205964, -0.505778, 0.837716], [-0.978495, 0.11627, -0.170378], [-0.011228, -0.854792, -0.518849]]; the translation vector: [2.901534, 4.292832, 1.280844], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.22, -0.005, 0.817, 1.09, 2.894, 0.662], [-2.657, -0.552, 1.11, 0.97, 1.155, 0.566]]\nB: [[-1.955, 0.127, 0.536, 1.492, 2.79, 1.348], [-2.155, 0.305, 0.66, 0.159, 1.455, 0.072]]\nC: [[-1.433, 0.186, 1.02, 1.626, 2.332, 1.534], [-2.448, -0.356, 0.853, 0.017, 1.445, 0.618]]\nD: [[-1.798, 0.428, 0.571, 1.201, 2.441, 1.114], [-2.518, -0.083, 1.081, 0.488, 1.535, 0.157]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_171_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_171_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.442, -1.133, 0.562, 0.636, 0.657, 0.49], [-0.931, -0.023, 0.592, 0.548, 0.635, 0.449], [1.185, -0.67, 0.523, 0.55, 0.618, 0.447], [-0.778, 1.905, 0.84, 0.606, 0.538, 0.514], [-0.723, -1.153, 0.514, 0.657, 0.632, 0.473], [-1.434, -0.489, 0.591, 0.567, 0.545, 0.458], [-1.479, -1.704, 0.547, 0.555, 0.643, 0.506], [-1.06, 0.579, 0.646, 0.554, 0.57, 0.426], [1.728, -0.095, 0.592, 0.547, 0.596, 0.473], [-1.358, 1.889, 0.774, 0.643, 0.662, 0.446], [2.187, 1.992, 0.739, 0.592, 0.503, 0.463], [-0.349, 1.313, 0.568, 0.481, 0.31, 0.827], [0.659, 1.035, 0.643, 0.561, 0.458, 0.449], [1.351, 1.116, 0.663, 0.567, 0.545, 0.469], [1.67, 0.521, 0.73, 0.179, 0.508, 0.285], [0.482, -0.974, 0.492, 0.592, 0.586, 0.475]]\nB: [[-1.049, -1.529, 1.034, 0.201, 0.822, 0.539], [-0.9, 0.339, 0.327, 0.273, 0.766, 0.553], [0.737, -1.05, 0.211, 0.082, 0.504, 0.933], [-1.047, 2.226, 0.838, 0.996, 0.859, 0.972], [-0.719, -0.678, 0.784, 0.49, 0.145, 0.261], [-1.882, -0.392, 0.818, 0.955, 0.143, 0.713], [-1.551, -2.013, 0.366, 0.53, 0.75, 0.368], [-1.315, 0.463, 0.891, 0.81, 0.604, 0.638], [2.147, -0.334, 0.803, 0.499, 0.844, 0.692], [-1.677, 2.042, 0.864, 0.402, 1.157, 0.639], [1.976, 2.077, 0.904, 0.918, 0.711, 0.254], [-0.187, 1.603, 
0.781, 0.267, -0.088, 1.027], [0.26, 0.795, 0.514, 0.847, -0.04, 0.297], [1.756, 1.456, 0.644, 0.597, 0.817, 0.47], [1.242, 0.068, 0.373, 0.448, 0.149, 0.381], [0.319, -0.553, 0.655, 0.691, 0.359, 0.589]]\nC: [[-0.988, -1.282, 0.732, 0.336, 0.483, 0.927], [-0.442, -0.073, 0.808, 0.229, 0.772, 0.639], [1.548, -1.036, 0.108, 0.525, 0.245, 0.035], [-1.266, 1.685, 1.335, 0.956, 0.747, 0.267], [-1.079, -1.607, 1.01, 0.83, 1.062, 0.521], [-1.264, -0.925, 0.343, 1.047, 0.715, 0.269], [-1.458, -1.958, 0.337, 0.66, 0.161, 0.546], [-0.733, 0.312, 0.474, 0.521, 0.178, -0.061], [2.105, 0.263, 0.727, 0.39, 0.976, 0.108], [-1.707, 1.787, 0.496, 0.472, 1.062, 0.821], [2.45, 1.544, 0.321, 1.018, 0.15, 0.075], [-0.837, 1.59, 0.268, 0.538, 0.245, 0.497], [0.297, 1.19, 0.423, 0.185, 0.686, 0.323], [0.857, 1.058, 0.937, 0.887, 0.209, 0.519], [1.802, 0.184, 0.797, 0.22, 0.094, 0.637], [0.094, -0.987, 0.725, 0.553, 1.059, 0.036]]\nD: [[-1.227, -0.819, 0.642, 0.301, 0.736, 0.894], [-1.335, 0.35, 0.132, 0.881, 0.202, 0.441], [1.374, -0.345, 0.698, 0.363, 1.089, 0.667], [-0.963, 1.843, 0.91, 0.493, 0.498, 0.35], [-1.186, -1.506, 0.169, 0.581, 0.638, 0.951], [-1.772, -0.025, 0.967, 0.473, 0.884, -0.032], [-1.614, -1.94, 0.374, 0.725, 0.441, 0.512], [-1.408, 0.285, 1.05, 0.486, 0.297, 0.835], [2.021, -0.535, 0.654, 0.219, 0.759, 0.901], [-1.57, 2.203, 0.527, 0.16, 0.291, 0.718], [1.825, 2.298, 0.457, 1.052, 0.655, 0.73], [-0.153, 1.778, 0.354, 0.514, 0.609, 0.42], [0.512, 1.223, 0.597, 0.407, 0.628, 0.692], [1.022, 1.172, 0.206, 0.702, 0.301, 0.176], [1.64, 0.74, 0.55, 0.197, 0.956, 0.52], [0.38, -0.727, 0.278, 0.877, 0.781, 0.837]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. 
The camera pose information includes: the rotation matrix: [[-0.830629, 0.239867, -0.502514], [0.556756, 0.37214, -0.742654], [0.008867, -0.896647, -0.442658]]; the translation vector: [4.849209, 2.614689, 1.447477], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.442, -1.133, 0.562, 0.636, 0.657, 0.49], [-0.931, -0.023, 0.592, 0.548, 0.635, 0.449], [1.185, -0.67, 0.523, 0.55, 0.618, 0.447], [-0.778, 1.905, 0.84, 0.606, 0.538, 0.514], [-0.723, -1.153, 0.514, 0.657, 0.632, 0.473], [-1.434, -0.489, 0.591, 0.567, 0.545, 0.458], [-1.479, -1.704, 0.547, 0.555, 0.643, 0.506], [-1.06, 0.579, 0.646, 0.554, 0.57, 0.426], [1.728, -0.095, 0.592, 0.547, 0.596, 0.473], [-1.358, 1.889, 0.774, 0.643, 0.662, 0.446], [2.187, 1.992, 0.739, 0.592, 0.503, 0.463], [-0.349, 1.313, 0.568, 0.481, 0.31, 0.827], [0.659, 1.035, 0.643, 0.561, 0.458, 0.449], [1.351, 1.116, 0.663, 0.567, 0.545, 0.469], [1.67, 0.521, 0.73, 0.179, 0.508, 0.285], [0.482, -0.974, 0.492, 0.592, 0.586, 0.475]]\nB: [[-1.049, -1.529, 1.034, 0.201, 0.822, 0.539], [-0.9, 0.339, 0.327, 0.273, 0.766, 0.553], [0.737, -1.05, 0.211, 0.082, 0.504, 0.933], [-1.047, 2.226, 0.838, 0.996, 0.859, 0.972], [-0.719, -0.678, 0.784, 0.49, 0.145, 0.261], [-1.882, -0.392, 0.818, 0.955, 0.143, 0.713], [-1.551, -2.013, 0.366, 0.53, 0.75, 0.368], [-1.315, 0.463, 0.891, 0.81, 0.604, 0.638], [2.147, -0.334, 0.803, 0.499, 0.844, 0.692], [-1.677, 2.042, 0.864, 0.402, 1.157, 0.639], [1.976, 2.077, 0.904, 0.918, 0.711, 0.254], [-0.187, 1.603, 0.781, 0.267, -0.088, 
1.027], [0.26, 0.795, 0.514, 0.847, -0.04, 0.297], [1.756, 1.456, 0.644, 0.597, 0.817, 0.47], [1.242, 0.068, 0.373, 0.448, 0.149, 0.381], [0.319, -0.553, 0.655, 0.691, 0.359, 0.589]]\nC: [[-0.988, -1.282, 0.732, 0.336, 0.483, 0.927], [-0.442, -0.073, 0.808, 0.229, 0.772, 0.639], [1.548, -1.036, 0.108, 0.525, 0.245, 0.035], [-1.266, 1.685, 1.335, 0.956, 0.747, 0.267], [-1.079, -1.607, 1.01, 0.83, 1.062, 0.521], [-1.264, -0.925, 0.343, 1.047, 0.715, 0.269], [-1.458, -1.958, 0.337, 0.66, 0.161, 0.546], [-0.733, 0.312, 0.474, 0.521, 0.178, -0.061], [2.105, 0.263, 0.727, 0.39, 0.976, 0.108], [-1.707, 1.787, 0.496, 0.472, 1.062, 0.821], [2.45, 1.544, 0.321, 1.018, 0.15, 0.075], [-0.837, 1.59, 0.268, 0.538, 0.245, 0.497], [0.297, 1.19, 0.423, 0.185, 0.686, 0.323], [0.857, 1.058, 0.937, 0.887, 0.209, 0.519], [1.802, 0.184, 0.797, 0.22, 0.094, 0.637], [0.094, -0.987, 0.725, 0.553, 1.059, 0.036]]\nD: [[-1.227, -0.819, 0.642, 0.301, 0.736, 0.894], [-1.335, 0.35, 0.132, 0.881, 0.202, 0.441], [1.374, -0.345, 0.698, 0.363, 1.089, 0.667], [-0.963, 1.843, 0.91, 0.493, 0.498, 0.35], [-1.186, -1.506, 0.169, 0.581, 0.638, 0.951], [-1.772, -0.025, 0.967, 0.473, 0.884, -0.032], [-1.614, -1.94, 0.374, 0.725, 0.441, 0.512], [-1.408, 0.285, 1.05, 0.486, 0.297, 0.835], [2.021, -0.535, 0.654, 0.219, 0.759, 0.901], [-1.57, 2.203, 0.527, 0.16, 0.291, 0.718], [1.825, 2.298, 0.457, 1.052, 0.655, 0.73], [-0.153, 1.778, 0.354, 0.514, 0.609, 0.42], [0.512, 1.223, 0.597, 0.407, 0.628, 0.692], [1.022, 1.172, 0.206, 0.702, 0.301, 0.176], [1.64, 0.74, 0.55, 0.197, 0.956, 0.52], [0.38, -0.727, 0.278, 0.877, 0.781, 0.837]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_172_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_172_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.285, 2.094, 0.223, 0.8, 
-0.126, 0.725], [0.89, 2.46, 0.447, -0.137, 0.338, 0.037]]\nB: [[0.378, 1.664, -0.029, 0.245, 0.007, 0.518], [0.551, 2.153, -0.045, 0.073, 0.825, 0.641]]\nC: [[0.842, 1.796, 0.181, 0.339, 0.338, 0.37], [0.768, 2.073, 0.205, 0.294, 0.394, 0.403]]\nD: [[0.562, 2.17, 0.079, 0.501, 0.638, 0.525], [0.42, 1.956, 0.647, 0.731, 0.278, 0.487]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the bucket in the scene. The camera pose information includes: the rotation matrix: [[-0.819759, -0.274444, 0.502669], [-0.572709, 0.39303, -0.719397], [-0.00013, -0.877615, -0.479366]]; the translation vector: [2.765326, 1.370172, 1.355227], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.285, 2.094, 0.223, 0.8, -0.126, 0.725], [0.89, 2.46, 0.447, -0.137, 0.338, 0.037]]\nB: [[0.378, 1.664, -0.029, 0.245, 0.007, 0.518], [0.551, 2.153, -0.045, 0.073, 0.825, 0.641]]\nC: [[0.842, 1.796, 0.181, 0.339, 0.338, 0.37], [0.768, 2.073, 0.205, 0.294, 0.394, 0.403]]\nD: [[0.562, 2.17, 0.079, 0.501, 0.638, 0.525], [0.42, 1.956, 0.647, 0.731, 0.278, 0.487]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_173_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_173_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.351, 1.709, 0.416, 3.301, 3.462, -0.205]]\nB: [[0.748, 1.385, 0.703, 3.676, 3.587, 0.247]]\nC: [[0.285, 1.079, 0.707, 4.151, 3.525, -0.098]]\nD: [[0.437, 1.63, 0.992, 3.864, 3.856, 0.472]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the table in the scene. The camera pose information includes: the rotation matrix: [[-0.119369, -0.433868, 0.893034], [-0.990549, 0.113242, -0.077387], [-0.067553, -0.893832, -0.443285]]; the translation vector: [3.407035, 4.679209, 1.397058], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.351, 1.709, 0.416, 3.301, 3.462, -0.205]]\nB: [[0.748, 1.385, 0.703, 3.676, 3.587, 0.247]]\nC: [[0.285, 1.079, 0.707, 4.151, 3.525, -0.098]]\nD: [[0.437, 1.63, 0.992, 3.864, 3.856, 0.472]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_174_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_174_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.171, -0.049, 0.811, 0.067, 1.74, 1.931], [2.378, 0.912, 0.851, 0.596, 2.524, 1.349], [0.415, -1.4, 0.688, 3.856, 0.243, 1.685], [2.428, -1.301, 0.364, 0.307, 0.039, 1.588], [-1.851, -0.771, 1.131, 0.437, 0.879, 1.981]]\nB: [[-2.124, 0.402, 0.972, 0.336, 1.814, 2.055], [2.714, 0.714, 0.306, -0.022, 2.698, 1.287], [0.218, -0.935, 0.625, 3.775, 0.411, 1.982], [2.74, -0.719, 0.42, -0.08, 0.24, 0.945], [-2.083, -0.772, 1.329, 0.652, 0.37, 2.117]]\nC: [[-2.229, 0.152, 1.164, 0.205, 1.859, 2.109], [2.442, 0.667, 0.678, 0.238, 2.976, 1.311], [0.131, -1.186, 0.807, 4.198, 0.217, 1.596], [2.311, -0.918, 0.648, 0.343, 0.478, 1.211], [-2.036, -0.925, 1.234, 0.571, 0.559, 1.867]]\nD: [[-2.234, 0.572, 0.912, 0.309, 1.595, 2.084], [2.569, 0.682, 1.117, -0.182, 2.613, 1.651], [-0.058, -0.811, 0.697, 4.093, -0.122, 2.058], [2.792, -1.246, 0.764, 0.753, 0.799, 0.76], [-1.948, -0.836, 1.559, 0.97, 0.14, 2.126]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.924746, 0.145405, -0.351715], [0.379908, 0.407811, -0.830277], [0.022707, -0.901414, -0.432362]]; the translation vector: [3.891577, 4.106122, 1.335216], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-2.171, -0.049, 0.811, 0.067, 1.74, 1.931], [2.378, 0.912, 0.851, 0.596, 2.524, 1.349], [0.415, -1.4, 0.688, 3.856, 0.243, 1.685], [2.428, -1.301, 0.364, 0.307, 0.039, 1.588], [-1.851, -0.771, 1.131, 0.437, 0.879, 1.981]]\nB: [[-2.124, 0.402, 0.972, 0.336, 1.814, 2.055], [2.714, 0.714, 0.306, -0.022, 2.698, 1.287], [0.218, -0.935, 0.625, 3.775, 0.411, 1.982], [2.74, -0.719, 0.42, -0.08, 0.24, 0.945], [-2.083, -0.772, 1.329, 0.652, 0.37, 2.117]]\nC: [[-2.229, 0.152, 1.164, 0.205, 1.859, 2.109], [2.442, 0.667, 0.678, 0.238, 2.976, 1.311], [0.131, -1.186, 0.807, 4.198, 0.217, 1.596], [2.311, -0.918, 0.648, 0.343, 0.478, 1.211], [-2.036, -0.925, 1.234, 0.571, 0.559, 1.867]]\nD: [[-2.234, 0.572, 0.912, 0.309, 1.595, 2.084], [2.569, 0.682, 1.117, -0.182, 2.613, 1.651], [-0.058, -0.811, 0.697, 4.093, -0.122, 2.058], [2.792, -1.246, 0.764, 0.753, 0.799, 0.76], [-1.948, -0.836, 1.559, 0.97, 0.14, 2.126]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_175_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_175_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.059, -1.275, 0.883, 0.298, 1.018, 2.01], [0.236, 1.462, 0.631, 0.873, 0.486, 1.281], [1.1, -1.001, 0.952, 1.349, 0.172, 2.338], [-0.839, 2.079, 0.96, 0.395, 0.842, 1.983], [1.955, -4.485, 1.036, 0.141, 0.98, 2.593], [-0.255, -0.481, 0.905, 0.161, 0.928, 1.969]]\nB: [[-0.954, -1.046, 0.462, 
0.658, 0.618, 2.138], [-0.213, 1.899, 0.694, 0.708, 0.841, 1.686], [1.362, -1.133, 1.001, 1.465, -0.084, 2.501], [-0.862, 2.364, 0.854, 0.124, 0.853, 2.159], [2.413, -4.964, 0.774, -0.345, 1.16, 3.015], [0.156, -0.554, 0.434, -0.07, 0.695, 2.392]]\nC: [[-0.732, -1.166, 0.44, 0.739, 0.991, 1.593], [0.338, 1.95, 0.672, 0.941, 0.589, 1.757], [0.743, -0.963, 1.147, 1.448, -0.135, 2.517], [-1.129, 2.483, 1.375, 0.132, 1.054, 2.43], [1.587, -4.551, 0.847, 0.35, 0.965, 2.767], [-0.661, -0.507, 0.612, -0.243, 0.847, 1.515]]\nD: [[-0.907, -1.27, 0.42, -0.059, 1.138, 1.561], [0.626, 1.256, 1.105, 1.202, 0.216, 1.006], [0.877, -0.877, 1.149, 0.987, -0.045, 2.737], [-0.783, 1.691, 0.606, 0.081, 0.643, 2.205], [2.363, -4.049, 1.139, 0.229, 0.955, 2.439], [-0.196, -0.854, 0.721, 0.566, 0.583, 2.254]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. The camera pose information includes: the rotation matrix: [[-0.996429, -0.081152, -0.023325], [-0.01119, 0.400709, -0.916137], [0.083693, -0.912604, -0.400187]]; the translation vector: [7.365378, 2.610504, 1.343957], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.059, -1.275, 0.883, 0.298, 1.018, 2.01], [0.236, 1.462, 0.631, 0.873, 0.486, 1.281], [1.1, -1.001, 0.952, 1.349, 0.172, 2.338], [-0.839, 2.079, 0.96, 0.395, 0.842, 1.983], [1.955, -4.485, 1.036, 0.141, 0.98, 2.593], [-0.255, -0.481, 0.905, 0.161, 0.928, 1.969]]\nB: [[-0.954, -1.046, 0.462, 0.658, 0.618, 2.138], [-0.213, 1.899, 0.694, 0.708, 0.841, 1.686], [1.362, -1.133, 1.001, 1.465, -0.084, 2.501], [-0.862, 2.364, 0.854, 0.124, 0.853, 2.159], [2.413, -4.964, 0.774, -0.345, 1.16, 3.015], [0.156, -0.554, 0.434, -0.07, 0.695, 2.392]]\nC: [[-0.732, -1.166, 0.44, 0.739, 0.991, 1.593], [0.338, 1.95, 0.672, 0.941, 0.589, 1.757], [0.743, -0.963, 1.147, 1.448, -0.135, 2.517], [-1.129, 2.483, 1.375, 0.132, 1.054, 2.43], [1.587, -4.551, 0.847, 0.35, 0.965, 2.767], [-0.661, -0.507, 0.612, -0.243, 0.847, 1.515]]\nD: [[-0.907, -1.27, 0.42, -0.059, 1.138, 1.561], [0.626, 1.256, 1.105, 1.202, 0.216, 1.006], [0.877, -0.877, 1.149, 0.987, -0.045, 2.737], [-0.783, 1.691, 0.606, 0.081, 0.643, 2.205], [2.363, -4.049, 1.139, 0.229, 0.955, 2.439], [-0.196, -0.854, 0.721, 0.566, 0.583, 2.254]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_176_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_176_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.662, -1.551, 0.955, 1.174, 1.083, 1.423]]\nB: [[0.488, -1.177, 0.89, 1.089, 0.729, 1.751]]\nC: [[0.483, -0.736, 0.965, 0.958, 0.277, 1.886]]\nD: [[0.649, -1.283, 0.609, 1.143, 1.139, 2.193]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the desk in the scene. 
The camera pose information includes: the rotation matrix: [[0.51864, -0.44867, 0.727811], [-0.853934, -0.229463, 0.467059], [-0.04255, -0.863738, -0.502143]]; the translation vector: [1.002297, 1.98866, 1.344191], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.662, -1.551, 0.955, 1.174, 1.083, 1.423]]\nB: [[0.488, -1.177, 0.89, 1.089, 0.729, 1.751]]\nC: [[0.483, -0.736, 0.965, 0.958, 0.277, 1.886]]\nD: [[0.649, -1.283, 0.609, 1.143, 1.139, 2.193]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_177_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_177_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.285, -1.171, 0.463, 0.532, 0.611, 0.923], [1.223, 1.763, 0.56, 0.667, 0.631, 0.966], [-0.307, 1.833, 0.5, 0.619, 0.589, 0.922], [-0.406, -0.951, 0.39, 0.539, 0.611, 0.908], [0.655, 1.709, 0.525, 0.699, 0.634, 0.947], [1.269, -2.956, 0.836, 0.629, 0.557, 0.393], [1.192, 0.557, 0.457, 0.623, 0.615, 0.94], [0.416, -2.765, 0.677, 0.614, 0.452, 0.612], [-0.522, 0.635, 0.418, 0.585, 0.574, 0.905], [-0.363, -3.094, 0.495, 0.715, 0.612, 0.912], [0.336, 0.66, 0.438, 0.602, 0.623, 0.904], [-2.007, -0.347, 0.408, 0.521, 0.585, 0.891], [0.411, -1.036, 0.417, 0.682, 0.634, 0.922], [-2.039, -2.805, 0.495, 0.561, 0.631, 0.9], [-1.956, -1.834, 0.436, 0.597, 0.728, 0.922], [-2.754, 1.479, 0.509, 0.58, 
0.603, 0.892]]\nB: [[0.94, -0.734, 0.86, 0.847, 0.174, 1.366], [1.047, 1.724, 0.337, 1.114, 0.725, 1.18], [-0.446, 1.839, 0.399, 1.0, 0.211, 0.928], [-0.224, -0.996, 0.671, 0.902, 0.396, 0.957], [0.648, 2.199, 0.865, 0.644, 0.899, 0.978], [1.601, -3.15, 1.071, 0.541, 0.264, 0.224], [0.709, 0.399, 0.396, 0.628, 0.643, 1.257], [0.103, -2.816, 0.184, 1.095, 0.871, 0.909], [-0.735, 1.113, 0.158, 0.968, 0.355, 1.244], [-0.793, -3.536, 0.957, 0.881, 0.306, 1.233], [-0.114, 0.863, 0.498, 0.236, 0.716, 1.116], [-1.845, -0.397, 0.53, 0.528, 0.958, 0.727], [0.156, -0.653, 0.083, 0.658, 1.129, 0.686], [-2.166, -2.74, 0.163, 0.166, 0.842, 0.447], [-2.421, -1.954, 0.206, 0.882, 0.734, 0.761], [-3.119, 1.809, 0.685, 0.543, 0.98, 1.284]]\nC: [[0.87, -1.386, 0.953, 0.148, 0.539, 1.241], [0.822, 1.276, 0.128, 0.239, 0.572, 1.227], [-0.508, 2.214, 0.373, 0.683, 0.2, 1.183], [-0.547, -1.349, -0.07, 0.231, 0.312, 1.389], [0.457, 1.367, 0.965, 0.768, 0.185, 1.088], [1.563, -2.649, 0.498, 0.756, 0.364, 0.362], [1.083, 0.345, 0.921, 0.769, 0.695, 1.386], [0.143, -3.095, 0.202, 0.278, 0.051, 0.502], [-0.474, 0.978, 0.872, 0.559, 0.082, 1.262], [-0.01, -3.401, 0.115, 1.005, 0.452, 1.143], [-0.106, 1.086, 0.284, 0.105, 0.131, 0.844], [-2.44, -0.304, -0.054, 0.667, 0.457, 0.703], [0.747, -1.031, -0.051, 0.551, 0.84, 0.909], [-2.101, -2.554, 0.473, 1.017, 0.994, 1.065], [-1.883, -2.033, 0.423, 0.644, 1.201, 0.726], [-3.109, 1.24, 0.812, 0.728, 1.099, 0.829]]\nD: [[1.585, -0.899, 0.099, 0.724, 0.912, 0.466], [0.799, 2.074, 0.967, 0.764, 0.821, 0.506], [-0.531, 1.393, 0.134, 0.737, 1.022, 1.024], [-0.008, -0.984, -0.095, 0.085, 0.528, 0.524], [0.362, 2.074, 0.189, 0.835, 0.387, 0.74], [1.446, -2.709, 0.927, 0.329, 0.916, 0.373], [1.078, 0.299, 0.482, 0.303, 0.612, 0.521], [0.439, -2.308, 0.3, 0.788, 0.517, 0.416], [-0.314, 0.386, 0.749, 0.588, 0.522, 1.244], [-0.739, -2.845, 0.766, 0.695, 1.017, 0.779], [-0.116, 0.704, 0.487, 0.148, 0.185, 0.776], [-1.607, 0.118, 0.862, 0.934, 0.609, 0.752], 
[-0.074, -0.593, 0.062, 0.851, 0.522, 0.762], [-2.188, -3.11, 0.134, 0.427, 0.414, 0.637], [-1.911, -1.906, 0.292, 0.873, 0.728, 0.955], [-2.793, 1.335, 0.084, 0.946, 0.494, 0.463]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[0.931668, 0.072515, -0.356001], [0.362912, -0.231685, 0.902561], [-0.017031, -0.970084, -0.24217]]; the translation vector: [5.886859, 3.543659, 1.354971], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.285, -1.171, 0.463, 0.532, 0.611, 0.923], [1.223, 1.763, 0.56, 0.667, 0.631, 0.966], [-0.307, 1.833, 0.5, 0.619, 0.589, 0.922], [-0.406, -0.951, 0.39, 0.539, 0.611, 0.908], [0.655, 1.709, 0.525, 0.699, 0.634, 0.947], [1.269, -2.956, 0.836, 0.629, 0.557, 0.393], [1.192, 0.557, 0.457, 0.623, 0.615, 0.94], [0.416, -2.765, 0.677, 0.614, 0.452, 0.612], [-0.522, 0.635, 0.418, 0.585, 0.574, 0.905], [-0.363, -3.094, 0.495, 0.715, 0.612, 0.912], [0.336, 0.66, 0.438, 0.602, 0.623, 0.904], [-2.007, -0.347, 0.408, 0.521, 0.585, 0.891], [0.411, -1.036, 0.417, 0.682, 0.634, 0.922], [-2.039, -2.805, 0.495, 0.561, 0.631, 0.9], [-1.956, -1.834, 0.436, 0.597, 0.728, 0.922], [-2.754, 1.479, 0.509, 0.58, 0.603, 0.892]]\nB: [[0.94, -0.734, 0.86, 0.847, 0.174, 1.366], [1.047, 1.724, 0.337, 1.114, 0.725, 1.18], [-0.446, 1.839, 0.399, 1.0, 0.211, 0.928], [-0.224, -0.996, 0.671, 0.902, 0.396, 0.957], [0.648, 2.199, 0.865, 0.644, 0.899, 0.978], [1.601, -3.15, 1.071, 0.541, 0.264, 0.224], [0.709, 0.399, 0.396, 0.628, 0.643, 1.257], [0.103, -2.816, 0.184, 1.095, 0.871, 0.909], [-0.735, 1.113, 0.158, 0.968, 0.355, 1.244], [-0.793, -3.536, 0.957, 0.881, 0.306, 1.233], [-0.114, 0.863, 0.498, 0.236, 0.716, 1.116], [-1.845, -0.397, 0.53, 0.528, 0.958, 0.727], [0.156, -0.653, 0.083, 0.658, 1.129, 0.686], [-2.166, -2.74, 0.163, 0.166, 0.842, 0.447], [-2.421, -1.954, 0.206, 0.882, 0.734, 0.761], [-3.119, 1.809, 0.685, 0.543, 0.98, 1.284]]\nC: [[0.87, -1.386, 0.953, 0.148, 0.539, 1.241], [0.822, 1.276, 0.128, 0.239, 0.572, 1.227], [-0.508, 2.214, 0.373, 0.683, 0.2, 1.183], [-0.547, -1.349, -0.07, 0.231, 0.312, 1.389], [0.457, 1.367, 0.965, 0.768, 0.185, 1.088], [1.563, -2.649, 0.498, 0.756, 0.364, 0.362], [1.083, 0.345, 0.921, 0.769, 0.695, 1.386], [0.143, -3.095, 0.202, 0.278, 0.051, 0.502], [-0.474, 0.978, 0.872, 0.559, 0.082, 1.262], [-0.01, -3.401, 0.115, 1.005, 0.452, 1.143], [-0.106, 1.086, 0.284, 0.105, 0.131, 0.844], [-2.44, -0.304, -0.054, 0.667, 
0.457, 0.703], [0.747, -1.031, -0.051, 0.551, 0.84, 0.909], [-2.101, -2.554, 0.473, 1.017, 0.994, 1.065], [-1.883, -2.033, 0.423, 0.644, 1.201, 0.726], [-3.109, 1.24, 0.812, 0.728, 1.099, 0.829]]\nD: [[1.585, -0.899, 0.099, 0.724, 0.912, 0.466], [0.799, 2.074, 0.967, 0.764, 0.821, 0.506], [-0.531, 1.393, 0.134, 0.737, 1.022, 1.024], [-0.008, -0.984, -0.095, 0.085, 0.528, 0.524], [0.362, 2.074, 0.189, 0.835, 0.387, 0.74], [1.446, -2.709, 0.927, 0.329, 0.916, 0.373], [1.078, 0.299, 0.482, 0.303, 0.612, 0.521], [0.439, -2.308, 0.3, 0.788, 0.517, 0.416], [-0.314, 0.386, 0.749, 0.588, 0.522, 1.244], [-0.739, -2.845, 0.766, 0.695, 1.017, 0.779], [-0.116, 0.704, 0.487, 0.148, 0.185, 0.776], [-1.607, 0.118, 0.862, 0.934, 0.609, 0.752], [-0.074, -0.593, 0.062, 0.851, 0.522, 0.762], [-2.188, -3.11, 0.134, 0.427, 0.414, 0.637], [-1.911, -1.906, 0.292, 0.873, 0.728, 0.955], [-2.793, 1.335, 0.084, 0.946, 0.494, 0.463]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_178_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_178_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.185, -1.981, 0.71, 1.746, 1.931, 1.092]]\nB: [[0.263, -1.622, 0.402, 1.389, 1.64, 0.804]]\nC: [[0.151, -1.735, 0.574, 1.767, 1.715, 0.631]]\nD: [[0.262, -2.035, 0.645, 1.006, 2.054, 1.049]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the table in the scene. The camera pose information includes: the rotation matrix: [[0.987126, 0.106622, -0.119219], [0.159938, -0.652529, 0.740693], [0.00118, -0.750225, -0.661181]]; the translation vector: [4.64166, 4.052867, 1.404314], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.185, -1.981, 0.71, 1.746, 1.931, 1.092]]\nB: [[0.263, -1.622, 0.402, 1.389, 1.64, 0.804]]\nC: [[0.151, -1.735, 0.574, 1.767, 1.715, 0.631]]\nD: [[0.262, -2.035, 0.645, 1.006, 2.054, 1.049]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_179_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_179_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.648, -0.593, 0.364, 0.758, 0.748, 0.835], [-1.189, -0.998, 0.388, 0.711, 0.664, 0.751], [-0.106, -0.14, 0.366, 0.681, 0.668, 0.806], [-0.467, -1.537, 0.381, 0.682, 0.66, 0.781]]\nB: [[0.715, -1.041, 0.651, 0.471, 0.834, 0.809], [-1.555, -0.972, 0.209, 0.39, 0.675, 1.08], [0.331, 0.337, 0.451, 0.906, 1.083, 1.138], [-0.367, -1.309, -0.086, 0.84, 1.029, 0.958]]\nC: [[0.76, -0.428, 0.328, 0.718, 0.602, 0.917], [-1.301, -1.169, 0.677, 0.824, 0.61, 0.712], [0.1, -0.045, 0.084, 0.878, 0.367, 0.431], [-0.14, -1.88, 0.43, 0.418, 0.474, 0.77]]\nD: [[0.587, -1.036, 0.299, 1.076, 1.171, 0.475], [-1.011, -1.458, 0.499, 0.276, 1.067, 0.759], [-0.487, -0.498, 0.17, 1.114, 0.58, 1.041], [-0.116, -1.819, 0.569, 0.961, 0.364, 1.204]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the armchair in the scene. 
The camera pose information includes: the rotation matrix: [[0.68967, 0.288211, -0.664297], [0.724122, -0.27239, 0.633602], [0.001663, -0.918008, -0.396559]]; the translation vector: [2.530043, 2.005069, 1.437417], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.648, -0.593, 0.364, 0.758, 0.748, 0.835], [-1.189, -0.998, 0.388, 0.711, 0.664, 0.751], [-0.106, -0.14, 0.366, 0.681, 0.668, 0.806], [-0.467, -1.537, 0.381, 0.682, 0.66, 0.781]]\nB: [[0.715, -1.041, 0.651, 0.471, 0.834, 0.809], [-1.555, -0.972, 0.209, 0.39, 0.675, 1.08], [0.331, 0.337, 0.451, 0.906, 1.083, 1.138], [-0.367, -1.309, -0.086, 0.84, 1.029, 0.958]]\nC: [[0.76, -0.428, 0.328, 0.718, 0.602, 0.917], [-1.301, -1.169, 0.677, 0.824, 0.61, 0.712], [0.1, -0.045, 0.084, 0.878, 0.367, 0.431], [-0.14, -1.88, 0.43, 0.418, 0.474, 0.77]]\nD: [[0.587, -1.036, 0.299, 1.076, 1.171, 0.475], [-1.011, -1.458, 0.499, 0.276, 1.067, 0.759], [-0.487, -0.498, 0.17, 1.114, 0.58, 1.041], [-0.116, -1.819, 0.569, 0.961, 0.364, 1.204]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_180_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_180_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.678, -1.667, 1.218, 1.055, -0.26, 2.418], [-1.464, 1.094, 0.83, 1.262, -0.304, 1.673], [0.614, 2.399, 0.708, 1.662, -0.241, 1.035], [0.965, -0.058, 1.477, 0.773, 
4.136, 2.578], [-1.154, 1.558, 1.248, -0.057, 1.091, 2.617], [-1.717, -0.255, 1.603, -0.126, 2.491, 0.5]]\nB: [[-1.352, -1.046, 1.599, 1.44, 0.136, 2.792], [-1.654, 0.816, 0.791, 1.207, 0.132, 1.881], [0.521, 2.259, 1.273, 1.597, -0.332, 0.736], [0.806, 0.159, 1.62, 0.393, 4.568, 2.678], [-1.081, 1.507, 1.378, -0.253, 1.151, 2.742], [-1.495, -0.107, 1.278, 0.325, 1.906, 1.086]]\nC: [[-1.166, -1.418, 1.14, 1.061, 0.184, 2.392], [-1.606, 0.642, 1.143, 0.781, 0.159, 2.172], [0.167, 2.007, 1.118, 1.797, 0.138, 0.569], [0.908, -0.132, 1.206, 0.513, 4.305, 2.258], [-1.326, 1.1, 1.19, 0.242, 0.968, 2.282], [-1.838, -0.447, 1.348, 0.372, 2.121, 0.908]]\nD: [[-0.864, -1.506, 1.526, 0.637, 0.64, 2.228], [-1.745, 0.647, 0.899, 0.933, -0.243, 2.211], [-0.21, 1.507, 1.578, 2.065, 0.587, 0.484], [1.157, 0.294, 0.85, 0.968, 4.125, 2.603], [-1.113, 0.941, 1.165, 0.239, 0.756, 2.423], [-2.32, -0.87, 1.844, 0.517, 2.303, 0.518]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.964843, 0.186346, -0.185345], [0.252505, 0.461537, -0.850426], [-0.07293, -0.867329, -0.492364]]; the translation vector: [3.779865, 2.337391, 1.461827], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.678, -1.667, 1.218, 1.055, -0.26, 2.418], [-1.464, 1.094, 0.83, 1.262, -0.304, 1.673], [0.614, 2.399, 0.708, 1.662, -0.241, 1.035], [0.965, -0.058, 1.477, 0.773, 4.136, 2.578], [-1.154, 1.558, 1.248, -0.057, 1.091, 2.617], [-1.717, -0.255, 1.603, -0.126, 2.491, 0.5]]\nB: [[-1.352, -1.046, 1.599, 1.44, 0.136, 2.792], [-1.654, 0.816, 0.791, 1.207, 0.132, 1.881], [0.521, 2.259, 1.273, 1.597, -0.332, 0.736], [0.806, 0.159, 1.62, 0.393, 4.568, 2.678], [-1.081, 1.507, 1.378, -0.253, 1.151, 2.742], [-1.495, -0.107, 1.278, 0.325, 1.906, 1.086]]\nC: [[-1.166, -1.418, 1.14, 1.061, 0.184, 2.392], [-1.606, 0.642, 1.143, 0.781, 0.159, 2.172], [0.167, 2.007, 1.118, 1.797, 0.138, 0.569], [0.908, -0.132, 1.206, 0.513, 4.305, 2.258], [-1.326, 1.1, 1.19, 0.242, 0.968, 2.282], [-1.838, -0.447, 1.348, 0.372, 2.121, 0.908]]\nD: [[-0.864, -1.506, 1.526, 0.637, 0.64, 2.228], [-1.745, 0.647, 0.899, 0.933, -0.243, 2.211], [-0.21, 1.507, 1.578, 2.065, 0.587, 0.484], [1.157, 0.294, 0.85, 0.968, 4.125, 2.603], [-1.113, 0.941, 1.165, 0.239, 0.756, 2.423], [-2.32, -0.87, 1.844, 0.517, 2.303, 0.518]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_181_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_181_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.143, 1.32, 0.902, 0.946, 0.582, 0.814], [1.314, 2.961, 1.36, 1.125, 0.499, 2.359]]\nB: [[-1.164, 1.549, 1.101, 1.224, -0.166, 1.131], [1.215, 3.687, 1.039, 0.93, -0.333, 2.264]]\nC: [[-1.48, 1.652, 0.846, 0.755, 0.321, 1.167], [1.066, 3.327, 1.091, 1.04, 0.081, 1.998]]\nD: [[-1.81, 1.616, 0.892, 0.552, 0.779, 1.45], [1.382, 3.534, 1.284, 1.152, 0.521, 2.312]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. 
The camera pose information includes: the rotation matrix: [[-0.08083, -0.463089, 0.882618], [-0.994842, 0.091929, -0.042874], [-0.061284, -0.881531, -0.468131]]; the translation vector: [4.543997, 3.147744, 1.235262], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.143, 1.32, 0.902, 0.946, 0.582, 0.814], [1.314, 2.961, 1.36, 1.125, 0.499, 2.359]]\nB: [[-1.164, 1.549, 1.101, 1.224, -0.166, 1.131], [1.215, 3.687, 1.039, 0.93, -0.333, 2.264]]\nC: [[-1.48, 1.652, 0.846, 0.755, 0.321, 1.167], [1.066, 3.327, 1.091, 1.04, 0.081, 1.998]]\nD: [[-1.81, 1.616, 0.892, 0.552, 0.779, 1.45], [1.382, 3.534, 1.284, 1.152, 0.521, 2.312]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_182_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_182_1.png" + ], + "output": "C" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.277, 2.29, 0.603, 1.696, 0.522, 1.643], [-1.375, -0.321, 0.794, 0.343, 5.813, 1.671], [1.159, 0.472, 1.342, 0.486, 3.254, 0.972], [0.747, -2.391, 1.046, 0.652, 1.455, 1.87], [1.085, -1.961, 0.723, 1.174, -0.018, 1.576], [0.586, 1.92, 1.575, 0.646, -0.32, 2.439], [0.415, 2.748, 1.312, -0.384, 0.485, 2.051]]\nB: [[-0.351, 2.608, 0.955, 1.541, 0.097, 1.824], [-1.089, -0.096, 0.669, 0.099, 5.464, 1.395], [1.127, 0.133, 1.411, 0.212, 3.643, 0.932], [0.392, -2.442, 0.754, 0.163, 1.609, 1.541], [0.746, -1.664, 0.806, 
0.833, 0.085, 1.637], [0.806, 1.971, 1.104, 0.829, 0.129, 2.13], [0.393, 2.271, 1.106, 0.064, 0.665, 2.126]]\nC: [[-0.013, 2.804, 0.64, 1.604, -0.241, 1.907], [-0.99, 0.257, 0.762, -0.167, 5.28, 1.868], [1.095, -0.318, 1.309, 0.698, 4.021, 0.652], [0.346, -2.805, 0.465, -0.167, 2.086, 1.213], [0.527, -1.307, 1.185, 0.733, -0.294, 1.468], [1.191, 1.911, 1.165, 0.69, 0.519, 1.853], [0.342, 2.498, 1.557, -0.047, 0.494, 2.435]]\nD: [[-0.612, 2.451, 1.013, 1.076, 0.146, 2.285], [-0.642, -0.043, 0.498, -0.353, 5.408, 1.585], [1.099, -0.043, 0.972, -0.204, 4.141, 1.05], [0.087, -2.832, 0.317, 0.167, 1.848, 1.113], [0.399, -1.498, 0.656, 1.117, 0.566, 1.989], [0.495, 2.449, 0.82, 0.411, 0.228, 2.522], [0.507, 2.378, 1.381, -0.184, 0.771, 1.769]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.286652, 0.220257, -0.932372], [0.958024, -0.061246, 0.28007], [0.004584, -0.973517, -0.228568]]; the translation vector: [3.76659, 1.676076, 1.452194], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.277, 2.29, 0.603, 1.696, 0.522, 1.643], [-1.375, -0.321, 0.794, 0.343, 5.813, 1.671], [1.159, 0.472, 1.342, 0.486, 3.254, 0.972], [0.747, -2.391, 1.046, 0.652, 1.455, 1.87], [1.085, -1.961, 0.723, 1.174, -0.018, 1.576], [0.586, 1.92, 1.575, 0.646, -0.32, 2.439], [0.415, 2.748, 1.312, -0.384, 0.485, 2.051]]\nB: [[-0.351, 2.608, 0.955, 1.541, 0.097, 1.824], [-1.089, -0.096, 0.669, 0.099, 5.464, 1.395], [1.127, 0.133, 1.411, 0.212, 3.643, 0.932], [0.392, -2.442, 0.754, 0.163, 1.609, 1.541], [0.746, -1.664, 0.806, 0.833, 0.085, 1.637], [0.806, 1.971, 1.104, 0.829, 0.129, 2.13], [0.393, 2.271, 1.106, 0.064, 0.665, 2.126]]\nC: [[-0.013, 2.804, 0.64, 1.604, -0.241, 1.907], [-0.99, 0.257, 0.762, -0.167, 5.28, 1.868], [1.095, -0.318, 1.309, 0.698, 4.021, 0.652], [0.346, -2.805, 0.465, -0.167, 2.086, 1.213], [0.527, -1.307, 1.185, 0.733, -0.294, 1.468], [1.191, 1.911, 1.165, 0.69, 0.519, 1.853], [0.342, 2.498, 1.557, -0.047, 0.494, 2.435]]\nD: [[-0.612, 2.451, 1.013, 1.076, 0.146, 2.285], [-0.642, -0.043, 0.498, -0.353, 5.408, 1.585], [1.099, -0.043, 0.972, -0.204, 4.141, 1.05], [0.087, -2.832, 0.317, 0.167, 1.848, 1.113], [0.399, -1.498, 0.656, 1.117, 0.566, 1.989], [0.495, 2.449, 0.82, 0.411, 0.228, 2.522], [0.507, 2.378, 1.381, -0.184, 0.771, 1.769]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_183_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_183_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.457, -0.683, 0.495, 0.679, 0.597, 0.903], [0.426, 0.843, 0.458, 0.566, 0.562, 0.949], [-0.336, 0.792, 0.461, 0.555, 0.539, 0.932], [0.926, -0.823, 0.62, 0.473, 0.568, 0.621], [-1.992, 0.348, 0.596, 0.635, 0.634, 0.647], [1.1, 0.858, 0.465, 0.529, 0.569, 0.952], [-0.253, -1.834, 0.689, 0.639, 0.561, 0.602], [-0.397, 2.119, 
0.473, 0.759, 0.651, 0.94], [-1.254, -0.965, 0.657, 0.558, 0.592, 0.636], [-1.564, -0.168, 0.649, 0.462, 0.615, 0.611], [-0.169, -2.406, 0.711, 0.757, 0.669, 0.597], [1.375, -1.924, 0.508, 0.658, 0.509, 0.96], [0.214, -0.572, 0.651, 0.695, 0.494, 0.584], [-2.356, 2.053, 0.675, 0.673, 0.595, 0.536], [-0.799, -2.241, 0.541, 0.662, 0.673, 0.96], [1.941, -1.89, 0.718, 0.565, 0.56, 0.52], [2.571, -0.575, 0.472, 0.572, 0.595, 0.956], [-0.865, 2.028, 0.487, 0.583, 0.461, 0.128], [0.361, -2.459, 0.78, 0.536, 0.269, 0.489], [1.938, -1.474, 0.649, 0.57, 0.554, 0.6], [0.743, -2.15, 0.817, 0.526, 0.164, 0.344], [2.542, -1.129, 0.644, 0.598, 0.586, 0.612], [1.955, 0.145, 0.785, 0.141, 0.509, 0.304], [-1.685, 2.109, 0.449, 0.566, 0.486, 0.153], [-2.437, -1.918, 0.584, 0.515, 0.45, 0.203]]\nB: [[-0.004, -0.217, 0.058, 0.956, 0.104, 0.992], [0.634, 0.895, 0.282, 0.697, 0.893, 1.438], [-0.387, 0.349, 0.019, 0.239, 0.901, 0.96], [1.064, -1.168, 0.964, 0.374, 1.0, 0.253], [-1.606, 0.8, 0.112, 0.967, 0.862, 0.256], [1.288, 0.417, 0.225, 0.427, 0.112, 1.268], [-0.365, -1.988, 0.842, 0.932, 0.117, 0.21], [0.056, 2.617, 0.694, 0.602, 0.776, 0.848], [-1.173, -1.184, 0.2, 0.567, 0.839, 0.497], [-1.108, -0.485, 0.838, 0.382, 0.723, 1.057], [0.031, -2.18, 0.477, 1.078, 0.774, 0.574], [1.32, -1.614, 0.5, 0.512, 0.791, 1.227], [0.57, -1.002, 0.878, 0.861, 0.739, 0.347], [-2.072, 2.433, 1.143, 0.504, 1.054, 0.551], [-0.569, -2.64, 0.278, 0.616, 1.122, 1.078], [2.089, -1.691, 0.769, 0.97, 0.148, 0.992], [2.274, -0.687, 0.634, 0.56, 0.654, 0.811], [-1.223, 2.009, 0.495, 1.006, -0.028, 0.186], [0.145, -2.547, 0.54, 0.793, 0.387, 0.825], [1.744, -1.228, 0.533, 0.139, 0.886, 1.027], [1.123, -2.589, 1.183, 0.079, 0.187, 0.548], [2.235, -0.842, 0.485, 0.73, 0.575, 0.903], [1.694, -0.054, 1.249, 0.468, 0.557, 0.748], [-1.216, 2.107, 0.803, 0.764, 0.267, 0.274], [-2.623, -2.077, 1.01, 0.838, 0.106, -0.148]]\nC: [[-0.042, -0.277, 0.622, 0.349, 0.954, 1.11], [0.421, 0.801, 0.437, 0.094, 0.078, 1.2], 
[-0.436, 0.855, 0.625, 0.341, 0.737, 1.353], [0.599, -0.582, 0.28, 0.836, 0.717, 0.357], [-2.398, -0.135, 0.951, 0.429, 1.038, 0.502], [1.554, 0.709, 0.624, 0.144, 0.967, 1.304], [-0.666, -1.374, 0.422, 0.517, 0.122, 0.8], [-0.203, 1.908, 0.093, 1.027, 0.556, 0.76], [-1.532, -0.535, 0.437, 0.57, 0.41, 0.413], [-1.535, -0.307, 0.814, 0.936, 0.544, 1.082], [-0.39, -2.044, 0.309, 0.76, 0.801, 0.62], [1.044, -2.393, 0.932, 1.048, 0.287, 1.261], [0.664, -0.294, 1.14, 0.882, 0.176, 0.207], [-2.135, 2.211, 0.272, 0.963, 0.668, 0.76], [-1.028, -2.103, 1.016, 0.918, 0.609, 1.31], [1.579, -2.37, 0.458, 0.202, 0.159, 0.166], [2.079, -0.505, 0.945, 0.57, 0.86, 0.725], [-0.396, 2.379, 0.489, 0.77, 0.063, 0.52], [0.423, -2.492, 0.598, 0.788, 0.241, 0.406], [2.219, -1.548, 0.415, 0.429, 0.702, 0.329], [1.236, -1.961, 0.849, 0.371, 0.256, -0.039], [2.93, -1.099, 1.108, 0.393, 0.388, 0.187], [1.738, -0.099, 0.354, 0.013, 0.06, 0.667], [-1.711, 2.599, 0.36, 0.548, 0.69, -0.323], [-1.988, -1.796, 0.232, 0.609, 0.912, -0.043]]\nD: [[-0.117, -0.712, 0.165, 0.707, 0.749, 0.416], [0.663, 1.109, 0.92, 0.786, 0.382, 0.761], [-0.485, 1.276, -0.006, 0.122, 0.579, 0.562], [0.651, -1.033, 0.48, 0.012, 0.291, 0.281], [-1.67, 0.137, 0.785, 1.091, 0.142, 0.851], [1.44, 0.455, 0.476, 0.133, 0.572, 0.925], [-0.342, -1.74, 0.35, 0.646, 0.394, 0.443], [-0.793, 2.134, 0.146, 1.105, 0.456, 0.742], [-1.574, -0.65, 0.985, 0.2, 0.168, 1.102], [-1.526, 0.104, 0.427, 0.23, 0.555, 0.818], [-0.21, -2.447, 0.593, 1.166, 1.051, 0.465], [1.11, -2.085, 0.532, 0.952, 0.334, 0.936], [-0.231, -0.532, 0.895, 0.826, 0.523, 0.78], [-2.843, 1.728, 0.764, 0.92, 0.672, 0.101], [-0.845, -1.905, 0.458, 0.184, 0.635, 1.348], [1.844, -1.433, 1.033, 0.147, 0.968, 0.118], [2.166, -0.542, 0.733, 0.117, 0.957, 0.814], [-0.92, 1.743, 0.237, 0.993, 0.477, 0.227], [0.31, -2.458, 0.659, 0.782, 0.696, 0.669], [1.626, -1.353, 0.953, 0.579, 0.517, 0.513], [0.856, -2.414, 0.37, 0.927, 0.46, -0.04], [2.564, -1.512, 0.576, 0.167, 0.528, 
0.323], [2.064, -0.007, 0.399, -0.022, 0.444, 0.68], [-1.281, 1.93, 0.706, 0.411, 0.266, -0.223], [-2.823, -2.037, 0.823, 0.649, 0.539, 0.078]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[-0.895509, 0.17248, -0.410263], [0.444823, 0.375965, -0.812886], [0.014038, -0.91044, -0.413402]]; the translation vector: [2.818061, 5.409916, 1.54775], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.457, -0.683, 0.495, 0.679, 0.597, 0.903], [0.426, 0.843, 0.458, 0.566, 0.562, 0.949], [-0.336, 0.792, 0.461, 0.555, 0.539, 0.932], [0.926, -0.823, 0.62, 0.473, 0.568, 0.621], [-1.992, 0.348, 0.596, 0.635, 0.634, 0.647], [1.1, 0.858, 0.465, 0.529, 0.569, 0.952], [-0.253, -1.834, 0.689, 0.639, 0.561, 0.602], [-0.397, 2.119, 0.473, 0.759, 0.651, 0.94], [-1.254, -0.965, 0.657, 0.558, 0.592, 0.636], [-1.564, -0.168, 0.649, 0.462, 0.615, 0.611], [-0.169, -2.406, 0.711, 0.757, 0.669, 0.597], [1.375, -1.924, 0.508, 0.658, 0.509, 0.96], [0.214, -0.572, 0.651, 0.695, 0.494, 0.584], [-2.356, 2.053, 0.675, 0.673, 0.595, 0.536], [-0.799, -2.241, 0.541, 0.662, 0.673, 0.96], [1.941, -1.89, 0.718, 0.565, 0.56, 0.52], [2.571, -0.575, 0.472, 0.572, 0.595, 0.956], [-0.865, 2.028, 0.487, 0.583, 0.461, 0.128], [0.361, -2.459, 0.78, 0.536, 0.269, 0.489], [1.938, -1.474, 0.649, 0.57, 0.554, 0.6], [0.743, -2.15, 0.817, 0.526, 0.164, 0.344], [2.542, -1.129, 0.644, 0.598, 0.586, 0.612], [1.955, 
0.145, 0.785, 0.141, 0.509, 0.304], [-1.685, 2.109, 0.449, 0.566, 0.486, 0.153], [-2.437, -1.918, 0.584, 0.515, 0.45, 0.203]]\nB: [[-0.004, -0.217, 0.058, 0.956, 0.104, 0.992], [0.634, 0.895, 0.282, 0.697, 0.893, 1.438], [-0.387, 0.349, 0.019, 0.239, 0.901, 0.96], [1.064, -1.168, 0.964, 0.374, 1.0, 0.253], [-1.606, 0.8, 0.112, 0.967, 0.862, 0.256], [1.288, 0.417, 0.225, 0.427, 0.112, 1.268], [-0.365, -1.988, 0.842, 0.932, 0.117, 0.21], [0.056, 2.617, 0.694, 0.602, 0.776, 0.848], [-1.173, -1.184, 0.2, 0.567, 0.839, 0.497], [-1.108, -0.485, 0.838, 0.382, 0.723, 1.057], [0.031, -2.18, 0.477, 1.078, 0.774, 0.574], [1.32, -1.614, 0.5, 0.512, 0.791, 1.227], [0.57, -1.002, 0.878, 0.861, 0.739, 0.347], [-2.072, 2.433, 1.143, 0.504, 1.054, 0.551], [-0.569, -2.64, 0.278, 0.616, 1.122, 1.078], [2.089, -1.691, 0.769, 0.97, 0.148, 0.992], [2.274, -0.687, 0.634, 0.56, 0.654, 0.811], [-1.223, 2.009, 0.495, 1.006, -0.028, 0.186], [0.145, -2.547, 0.54, 0.793, 0.387, 0.825], [1.744, -1.228, 0.533, 0.139, 0.886, 1.027], [1.123, -2.589, 1.183, 0.079, 0.187, 0.548], [2.235, -0.842, 0.485, 0.73, 0.575, 0.903], [1.694, -0.054, 1.249, 0.468, 0.557, 0.748], [-1.216, 2.107, 0.803, 0.764, 0.267, 0.274], [-2.623, -2.077, 1.01, 0.838, 0.106, -0.148]]\nC: [[-0.042, -0.277, 0.622, 0.349, 0.954, 1.11], [0.421, 0.801, 0.437, 0.094, 0.078, 1.2], [-0.436, 0.855, 0.625, 0.341, 0.737, 1.353], [0.599, -0.582, 0.28, 0.836, 0.717, 0.357], [-2.398, -0.135, 0.951, 0.429, 1.038, 0.502], [1.554, 0.709, 0.624, 0.144, 0.967, 1.304], [-0.666, -1.374, 0.422, 0.517, 0.122, 0.8], [-0.203, 1.908, 0.093, 1.027, 0.556, 0.76], [-1.532, -0.535, 0.437, 0.57, 0.41, 0.413], [-1.535, -0.307, 0.814, 0.936, 0.544, 1.082], [-0.39, -2.044, 0.309, 0.76, 0.801, 0.62], [1.044, -2.393, 0.932, 1.048, 0.287, 1.261], [0.664, -0.294, 1.14, 0.882, 0.176, 0.207], [-2.135, 2.211, 0.272, 0.963, 0.668, 0.76], [-1.028, -2.103, 1.016, 0.918, 0.609, 1.31], [1.579, -2.37, 0.458, 0.202, 0.159, 0.166], [2.079, -0.505, 0.945, 0.57, 0.86, 0.725], 
[-0.396, 2.379, 0.489, 0.77, 0.063, 0.52], [0.423, -2.492, 0.598, 0.788, 0.241, 0.406], [2.219, -1.548, 0.415, 0.429, 0.702, 0.329], [1.236, -1.961, 0.849, 0.371, 0.256, -0.039], [2.93, -1.099, 1.108, 0.393, 0.388, 0.187], [1.738, -0.099, 0.354, 0.013, 0.06, 0.667], [-1.711, 2.599, 0.36, 0.548, 0.69, -0.323], [-1.988, -1.796, 0.232, 0.609, 0.912, -0.043]]\nD: [[-0.117, -0.712, 0.165, 0.707, 0.749, 0.416], [0.663, 1.109, 0.92, 0.786, 0.382, 0.761], [-0.485, 1.276, -0.006, 0.122, 0.579, 0.562], [0.651, -1.033, 0.48, 0.012, 0.291, 0.281], [-1.67, 0.137, 0.785, 1.091, 0.142, 0.851], [1.44, 0.455, 0.476, 0.133, 0.572, 0.925], [-0.342, -1.74, 0.35, 0.646, 0.394, 0.443], [-0.793, 2.134, 0.146, 1.105, 0.456, 0.742], [-1.574, -0.65, 0.985, 0.2, 0.168, 1.102], [-1.526, 0.104, 0.427, 0.23, 0.555, 0.818], [-0.21, -2.447, 0.593, 1.166, 1.051, 0.465], [1.11, -2.085, 0.532, 0.952, 0.334, 0.936], [-0.231, -0.532, 0.895, 0.826, 0.523, 0.78], [-2.843, 1.728, 0.764, 0.92, 0.672, 0.101], [-0.845, -1.905, 0.458, 0.184, 0.635, 1.348], [1.844, -1.433, 1.033, 0.147, 0.968, 0.118], [2.166, -0.542, 0.733, 0.117, 0.957, 0.814], [-0.92, 1.743, 0.237, 0.993, 0.477, 0.227], [0.31, -2.458, 0.659, 0.782, 0.696, 0.669], [1.626, -1.353, 0.953, 0.579, 0.517, 0.513], [0.856, -2.414, 0.37, 0.927, 0.46, -0.04], [2.564, -1.512, 0.576, 0.167, 0.528, 0.323], [2.064, -0.007, 0.399, -0.022, 0.444, 0.68], [-1.281, 1.93, 0.706, 0.411, 0.266, -0.223], [-2.823, -2.037, 0.823, 0.649, 0.539, 0.078]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_184_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_184_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.878, 0.793, 0.525, 0.307, 1.138, 0.24], [-1.741, 1.942, 0.919, 0.374, 0.284, 0.479], [-0.97, -1.167, 0.396, 0.145, 0.451, -0.19], [-1.083, -1.878, 0.816, 0.138, 
0.403, 0.089], [-0.905, -1.314, 0.204, 0.196, 0.671, 0.614], [0.896, -0.37, -0.05, 0.065, 0.48, 0.052], [-0.311, 2.442, 0.913, 0.38, 0.489, 1.491]]\nB: [[-0.86, 0.987, 0.429, 0.753, 0.659, 0.935], [-1.179, 1.249, 1.101, 0.219, 0.461, 0.432], [-1.149, -1.648, 0.72, 0.293, 0.482, 0.205], [-0.806, -1.928, 1.378, 0.631, 0.665, 0.667], [-0.762, -1.463, 0.175, -0.154, 0.327, 0.058], [1.471, -0.937, 0.34, 0.631, 0.439, 0.05], [0.043, 1.928, 0.641, 0.637, 0.49, 0.81]]\nC: [[-0.458, 0.879, 0.858, 0.541, 0.86, 0.578], [-1.685, 1.754, 1.421, 0.241, 0.756, -0.241], [-1.527, -1.671, 0.922, 0.635, 0.013, 0.552], [-1.217, -1.15, 1.123, 0.142, 0.166, 0.441], [-1.042, -1.813, 0.54, 0.438, 0.445, 0.211], [1.239, -0.633, 0.179, 0.181, 0.23, 0.74], [-0.011, 2.472, 1.042, 0.129, 0.472, 1.438]]\nD: [[-0.55, 0.944, 0.644, 0.68, 1.046, 0.521], [-1.267, 1.712, 1.229, 0.359, 0.367, 0.165], [-1.24, -1.459, 0.828, 0.501, 0.283, 0.305], [-1.303, -1.553, 1.014, 0.399, 0.228, 0.178], [-0.73, -1.694, 0.629, 0.212, 0.251, 0.137], [1.337, -0.693, 0.283, 0.145, 0.467, 0.539], [0.135, 2.342, 0.574, 0.546, 0.62, 1.068]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[0.330673, -0.328207, 0.884837], [-0.942686, -0.070458, 0.326157], [-0.044703, -0.941975, -0.332694]]; the translation vector: [3.753276, 4.481459, 1.345242], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.878, 0.793, 0.525, 0.307, 1.138, 0.24], [-1.741, 1.942, 0.919, 0.374, 0.284, 0.479], [-0.97, -1.167, 0.396, 0.145, 0.451, -0.19], [-1.083, -1.878, 0.816, 0.138, 0.403, 0.089], [-0.905, -1.314, 0.204, 0.196, 0.671, 0.614], [0.896, -0.37, -0.05, 0.065, 0.48, 0.052], [-0.311, 2.442, 0.913, 0.38, 0.489, 1.491]]\nB: [[-0.86, 0.987, 0.429, 0.753, 0.659, 0.935], [-1.179, 1.249, 1.101, 0.219, 0.461, 0.432], [-1.149, -1.648, 0.72, 0.293, 0.482, 0.205], [-0.806, -1.928, 1.378, 0.631, 0.665, 0.667], [-0.762, -1.463, 0.175, -0.154, 0.327, 0.058], [1.471, -0.937, 0.34, 0.631, 0.439, 0.05], [0.043, 1.928, 0.641, 0.637, 0.49, 0.81]]\nC: [[-0.458, 0.879, 0.858, 0.541, 0.86, 0.578], [-1.685, 1.754, 1.421, 0.241, 0.756, -0.241], [-1.527, -1.671, 0.922, 0.635, 0.013, 0.552], [-1.217, -1.15, 1.123, 0.142, 0.166, 0.441], [-1.042, -1.813, 0.54, 0.438, 0.445, 0.211], [1.239, -0.633, 0.179, 0.181, 0.23, 0.74], [-0.011, 2.472, 1.042, 0.129, 0.472, 1.438]]\nD: [[-0.55, 0.944, 0.644, 0.68, 1.046, 0.521], [-1.267, 1.712, 1.229, 0.359, 0.367, 0.165], [-1.24, -1.459, 0.828, 0.501, 0.283, 0.305], [-1.303, -1.553, 1.014, 0.399, 0.228, 0.178], [-0.73, -1.694, 0.629, 0.212, 0.251, 0.137], [1.337, -0.693, 0.283, 0.145, 0.467, 0.539], [0.135, 2.342, 0.574, 0.546, 0.62, 1.068]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_185_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_185_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.565, -1.185, 1.353, 0.464, 0.006, 0.118], [0.412, -0.591, 0.819, 0.036, 0.543, 0.322], [0.775, 0.247, 0.843, 0.919, 0.864, 0.389]]\nB: [[-0.648, -1.262, 0.922, 0.446, 0.433, 0.522], [0.437, -0.235, 0.949, 0.366, 0.445, 0.454], [0.764, 0.145, 0.941, 0.483, 0.409, 0.473]]\nC: [[-0.794, -1.422, 1.325, 0.646, -0.011, 0.511], [0.55, 
-0.124, 0.97, 0.767, 0.276, 0.151], [0.692, 0.134, 0.818, 0.04, 0.142, 0.775]]\nD: [[-0.888, -1.602, 1.373, 0.357, 0.797, 0.596], [0.014, -0.496, 0.808, 0.816, 0.004, 0.14], [0.932, 0.14, 0.871, 0.799, 0.355, 0.358]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the monitor in the scene. The camera pose information includes: the rotation matrix: [[0.054781, -0.427281, 0.902458], [-0.998013, -0.051617, 0.036143], [0.031139, -0.902644, -0.429259]]; the translation vector: [1.328526, 0.849821, 1.501181], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.565, -1.185, 1.353, 0.464, 0.006, 0.118], [0.412, -0.591, 0.819, 0.036, 0.543, 0.322], [0.775, 0.247, 0.843, 0.919, 0.864, 0.389]]\nB: [[-0.648, -1.262, 0.922, 0.446, 0.433, 0.522], [0.437, -0.235, 0.949, 0.366, 0.445, 0.454], [0.764, 0.145, 0.941, 0.483, 0.409, 0.473]]\nC: [[-0.794, -1.422, 1.325, 0.646, -0.011, 0.511], [0.55, -0.124, 0.97, 0.767, 0.276, 0.151], [0.692, 0.134, 0.818, 0.04, 0.142, 0.775]]\nD: [[-0.888, -1.602, 1.373, 0.357, 0.797, 0.596], [0.014, -0.496, 0.808, 0.816, 0.004, 0.14], [0.932, 0.14, 0.871, 0.799, 0.355, 0.358]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_186_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_186_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.353, -1.905, 0.542, 0.198, 0.811, 0.866]]\nB: [[-1.69, -2.015, 0.887, 0.014, 0.72, 0.41]]\nC: [[-1.178, -2.25, 0.868, 0.547, 0.466, 0.935]]\nD: [[-1.26, -1.838, 0.523, -0.212, 0.311, 0.619]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the dishwasher in the scene. The camera pose information includes: the rotation matrix: [[0.752445, 0.275595, -0.598225], [0.657828, -0.35994, 0.661593], [-0.032994, -0.891342, -0.452129]]; the translation vector: [2.633805, 2.70906, 1.31733], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.353, -1.905, 0.542, 0.198, 0.811, 0.866]]\nB: [[-1.69, -2.015, 0.887, 0.014, 0.72, 0.41]]\nC: [[-1.178, -2.25, 0.868, 0.547, 0.466, 0.935]]\nD: [[-1.26, -1.838, 0.523, -0.212, 0.311, 0.619]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_187_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_187_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.575, -1.33, 0.913, 0.422, 0.222, 1.282], [-1.133, -0.569, 0.691, 0.502, 1.733, 0.362], [-1.177, -0.378, 1.14, 0.427, 2.109, 0.616], [-1.073, 1.114, 1.403, 0.582, 1.219, 1.213], [-0.205, 1.471, 1.068, 1.327, 0.559, 1.11], [0.394, 0.971, 1.401, 0.368, 1.532, 0.97], [0.465, 0.918, 0.399, 0.657, 1.295, 0.944], [-1.033, 0.855, 0.494, 0.561, 0.838, 0.96]]\nB: [[0.612, -1.146, 1.029, 0.876, 0.249, 1.097], [-1.186, -0.558, 0.781, 0.832, 1.793, 0.11], [-0.68, -0.259, 1.327, 0.321, 2.085, 0.39], [-0.751, 1.462, 1.244, 1.064, 1.13, 1.071], [-0.245, 1.694, 1.448, 1.271, 0.405, 0.826], [0.501, 1.197, 1.032, 0.635, 1.295, 1.137], [0.575, 1.298, 0.738, 0.961, 1.68, 0.895], [-1.135, 0.586, 0.775, 0.711, 1.079, 0.526]]\nC: [[0.678, -1.206, 0.812, 0.848, -0.174, 1.369], [-1.023, -0.705, 0.492, 0.502, 1.434, -0.09], [-1.388, -0.068, 1.103, 0.59, 1.707, 0.559], [-1.152, 1.027, 1.347, 0.752, 0.971, 1.412], [-0.198, 1.443, 1.383, 1.532, 0.499, 1.267], [0.854, 0.79, 1.691, 0.351, 1.682, 0.641], [0.791, 0.546, 0.687, 0.219, 1.088, 1.252], [-0.634, 1.336, 0.286, 0.814, 1.197, 1.221]]\nD: [[0.82, -1.281, 0.96, -0.009, 0.662, 1.248], [-1.615, -0.505, 0.267, 0.866, 1.991, 0.496], [-0.964, -0.4, 1.176, 0.247, 2.442, 0.894], [-1.088, 1.358, 1.232, 0.782, 1.082, 0.821], [0.043, 1.718, 1.49, 1.508, 0.835, 1.275], [0.739, 1.084, 1.461, 0.376, 1.382, 1.444], [0.407, 0.623, 0.633, 0.434, 1.281, 1.107], [-1.311, 
0.453, 0.839, 0.768, 1.093, 0.774]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the clothes in the scene. The camera pose information includes: the rotation matrix: [[0.88123, -0.188698, 0.433389], [-0.470321, -0.258404, 0.843816], [-0.047237, -0.947428, -0.316462]]; the translation vector: [1.061636, 1.321782, 1.457525], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.575, -1.33, 0.913, 0.422, 0.222, 1.282], [-1.133, -0.569, 0.691, 0.502, 1.733, 0.362], [-1.177, -0.378, 1.14, 0.427, 2.109, 0.616], [-1.073, 1.114, 1.403, 0.582, 1.219, 1.213], [-0.205, 1.471, 1.068, 1.327, 0.559, 1.11], [0.394, 0.971, 1.401, 0.368, 1.532, 0.97], [0.465, 0.918, 0.399, 0.657, 1.295, 0.944], [-1.033, 0.855, 0.494, 0.561, 0.838, 0.96]]\nB: [[0.612, -1.146, 1.029, 0.876, 0.249, 1.097], [-1.186, -0.558, 0.781, 0.832, 1.793, 0.11], [-0.68, -0.259, 1.327, 0.321, 2.085, 0.39], [-0.751, 1.462, 1.244, 1.064, 1.13, 1.071], [-0.245, 1.694, 1.448, 1.271, 0.405, 0.826], [0.501, 1.197, 1.032, 0.635, 1.295, 1.137], [0.575, 1.298, 0.738, 0.961, 1.68, 0.895], [-1.135, 0.586, 0.775, 0.711, 1.079, 0.526]]\nC: [[0.678, -1.206, 0.812, 0.848, -0.174, 1.369], [-1.023, -0.705, 0.492, 0.502, 1.434, -0.09], [-1.388, -0.068, 1.103, 0.59, 1.707, 0.559], [-1.152, 1.027, 1.347, 0.752, 0.971, 1.412], [-0.198, 1.443, 1.383, 1.532, 0.499, 1.267], [0.854, 0.79, 1.691, 0.351, 1.682, 0.641], [0.791, 0.546, 0.687, 0.219, 1.088, 1.252], [-0.634, 1.336, 0.286, 0.814, 1.197, 1.221]]\nD: [[0.82, 
-1.281, 0.96, -0.009, 0.662, 1.248], [-1.615, -0.505, 0.267, 0.866, 1.991, 0.496], [-0.964, -0.4, 1.176, 0.247, 2.442, 0.894], [-1.088, 1.358, 1.232, 0.782, 1.082, 0.821], [0.043, 1.718, 1.49, 1.508, 0.835, 1.275], [0.739, 1.084, 1.461, 0.376, 1.382, 1.444], [0.407, 0.623, 0.633, 0.434, 1.281, 1.107], [-1.311, 0.453, 0.839, 0.768, 1.093, 0.774]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_188_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_188_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.405, 0.601, 0.764, -0.233, 0.102, -0.396]]\nB: [[-1.074, 0.387, 1.003, 0.303, 0.098, -0.079]]\nC: [[-1.416, 1.278, 0.402, -0.026, 0.472, 0.541]]\nD: [[-1.238, 0.875, 0.853, 0.207, 0.18, 0.059]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the washcloth in the scene. The camera pose information includes: the rotation matrix: [[-0.922168, 0.178823, -0.342969], [0.38661, 0.453076, -0.803278], [0.011746, -0.873352, -0.486947]]; the translation vector: [3.207336, 1.959871, 1.267555], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.405, 0.601, 0.764, -0.233, 0.102, -0.396]]\nB: [[-1.074, 0.387, 1.003, 0.303, 0.098, -0.079]]\nC: [[-1.416, 1.278, 0.402, -0.026, 0.472, 0.541]]\nD: [[-1.238, 0.875, 0.853, 0.207, 0.18, 0.059]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_189_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_189_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.762, -1.555, 0.827, 2.184, 0.001, 1.66], [-1.543, -2.246, 0.687, 2.166, 0.538, 1.705], [-0.782, 1.874, 0.612, 0.36, -0.393, 2.288], [-2.462, 0.204, 1.234, 0.129, 3.712, 1.462], [-0.383, -1.635, 0.309, 0.432, -0.146, 1.237], [2.291, 0.002, 1.16, 0.467, 3.329, 1.816], [1.638, 1.427, 0.73, 2.104, 0.102, 1.563], [-1.67, 1.91, 0.562, 0.741, 0.507, 1.714]]\nB: [[1.374, -1.714, 0.725, 2.507, 0.169, 1.413], [-1.211, -1.757, 0.826, 2.443, 0.176, 1.694], [-0.519, 1.79, 0.908, 0.294, 0.099, 1.833], [-2.419, 0.035, 0.987, 0.337, 3.555, 1.874], [0.072, -1.69, 0.634, 0.2, 0.284, 1.225], [2.688, 0.023, 0.867, 0.191, 3.55, 1.732], [1.91, 1.763, 0.852, 1.655, 0.149, 1.762], [-2.022, 1.78, 0.984, 1.051, 0.126, 1.927]]\nC: [[0.908, -1.987, 1.173, 2.894, 0.341, 1.45], [-1.176, -1.625, 1.254, 2.938, 0.258, 1.218], [-0.734, 1.406, 1.146, 0.597, 0.342, 1.626], [-2.253, 0.34, 1.308, 0.063, 3.579, 1.568], [-0.079, -1.858, 0.689, 0.18, 0.741, 0.85], [2.904, 0.375, 0.691, 0.079, 3.103, 2.186], [1.824, 1.499, 0.728, 1.255, 0.079, 1.787], [-2.124, 1.899, 1.164, 1.019, 0.481, 1.863]]\nD: [[1.462, -1.527, 0.599, 2.871, 0.537, 1.876], [-0.801, -1.454, 1.05, 2.817, -0.258, 1.392], [-0.063, 2.202, 0.566, 0.693, 0.023, 1.708], [-2.869, -0.081, 1.48, 0.816, 3.209, 2.127], [0.07, -1.284, 0.825, -0.242, 0.304, 1.406], [2.798, 0.497, 1.202, 0.386, 3.591, 2.066], [1.458, 2.138, 0.37, 1.504, 0.6, 1.542], [-2.145, 
1.824, 0.999, 1.206, 0.504, 1.867]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.984594, -0.069457, 0.160469], [-0.174127, -0.305795, 0.936039], [-0.015944, -0.949561, -0.313178]]; the translation vector: [3.941113, 2.817773, 1.559826], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.762, -1.555, 0.827, 2.184, 0.001, 1.66], [-1.543, -2.246, 0.687, 2.166, 0.538, 1.705], [-0.782, 1.874, 0.612, 0.36, -0.393, 2.288], [-2.462, 0.204, 1.234, 0.129, 3.712, 1.462], [-0.383, -1.635, 0.309, 0.432, -0.146, 1.237], [2.291, 0.002, 1.16, 0.467, 3.329, 1.816], [1.638, 1.427, 0.73, 2.104, 0.102, 1.563], [-1.67, 1.91, 0.562, 0.741, 0.507, 1.714]]\nB: [[1.374, -1.714, 0.725, 2.507, 0.169, 1.413], [-1.211, -1.757, 0.826, 2.443, 0.176, 1.694], [-0.519, 1.79, 0.908, 0.294, 0.099, 1.833], [-2.419, 0.035, 0.987, 0.337, 3.555, 1.874], [0.072, -1.69, 0.634, 0.2, 0.284, 1.225], [2.688, 0.023, 0.867, 0.191, 3.55, 1.732], [1.91, 1.763, 0.852, 1.655, 0.149, 1.762], [-2.022, 1.78, 0.984, 1.051, 0.126, 1.927]]\nC: [[0.908, -1.987, 1.173, 2.894, 0.341, 1.45], [-1.176, -1.625, 1.254, 2.938, 0.258, 1.218], [-0.734, 1.406, 1.146, 0.597, 0.342, 1.626], [-2.253, 0.34, 1.308, 0.063, 3.579, 1.568], [-0.079, -1.858, 0.689, 0.18, 0.741, 0.85], [2.904, 0.375, 0.691, 0.079, 3.103, 2.186], [1.824, 1.499, 0.728, 1.255, 0.079, 1.787], [-2.124, 1.899, 1.164, 1.019, 0.481, 1.863]]\nD: [[1.462, -1.527, 
0.599, 2.871, 0.537, 1.876], [-0.801, -1.454, 1.05, 2.817, -0.258, 1.392], [-0.063, 2.202, 0.566, 0.693, 0.023, 1.708], [-2.869, -0.081, 1.48, 0.816, 3.209, 2.127], [0.07, -1.284, 0.825, -0.242, 0.304, 1.406], [2.798, 0.497, 1.202, 0.386, 3.591, 2.066], [1.458, 2.138, 0.37, 1.504, 0.6, 1.542], [-2.145, 1.824, 0.999, 1.206, 0.504, 1.867]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_190_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_190_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.548, -0.723, 1.702, 0.029, 0.56, 0.548], [0.99, -0.353, 1.798, 0.254, 0.406, 1.307], [0.157, -0.505, 0.811, 2.866, -0.08, 2.257], [-1.059, 0.22, 1.187, 0.204, 1.96, 1.609], [-1.515, 1.374, 1.432, 0.39, 0.462, 2.145], [-1.181, 2.329, 0.518, 0.161, 0.641, 1.327], [1.31, 1.368, 1.072, 1.078, 3.27, 1.707]]\nB: [[-0.408, -0.944, 1.285, 0.383, 0.855, 1.329], [1.037, -1.048, 1.113, 0.346, 0.92, 1.337], [0.631, -0.546, 1.683, 2.674, 0.421, 2.371], [-1.406, 0.284, 0.493, 0.426, 1.745, 1.616], [-0.913, 1.243, 0.625, 0.944, -0.159, 1.495], [-0.749, 1.827, 0.664, -0.223, 0.933, 1.793], [1.27, 1.253, 1.221, 0.403, 2.724, 2.094]]\nC: [[-0.454, -0.86, 2.026, 0.451, 0.358, 1.257], [1.65, -0.511, 2.057, 0.183, 0.13, 0.645], [0.357, -0.781, 1.143, 3.085, -0.312, 2.705], [-1.785, 0.873, 0.92, 0.414, 1.805, 1.915], [-0.907, 0.946, 0.648, 1.086, 0.063, 2.046], [-0.884, 1.711, 1.057, -0.048, 0.722, 0.964], [1.337, 0.641, 0.462, 0.296, 3.312, 2.01]]\nD: [[-0.751, -0.786, 1.574, 0.095, 0.444, 1.034], [1.18, -0.773, 1.574, 0.094, 0.433, 1.033], [0.142, -0.562, 1.184, 3.142, 0.116, 2.394], [-1.437, 0.419, 0.848, 0.139, 1.974, 1.688], [-1.083, 1.379, 1.042, 0.807, 0.163, 1.776], [-0.694, 1.837, 0.766, 0.107, 0.954, 1.459], [1.355, 0.889, 0.903, 0.788, 2.903, 1.82]]", + "question": "Given a RGB image and 
a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.355681, -0.20797, 0.911175], [-0.934036, 0.113197, -0.338769], [-0.032689, -0.971563, -0.234514]]; the translation vector: [0.539195, 4.841905, 1.636959], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.548, -0.723, 1.702, 0.029, 0.56, 0.548], [0.99, -0.353, 1.798, 0.254, 0.406, 1.307], [0.157, -0.505, 0.811, 2.866, -0.08, 2.257], [-1.059, 0.22, 1.187, 0.204, 1.96, 1.609], [-1.515, 1.374, 1.432, 0.39, 0.462, 2.145], [-1.181, 2.329, 0.518, 0.161, 0.641, 1.327], [1.31, 1.368, 1.072, 1.078, 3.27, 1.707]]\nB: [[-0.408, -0.944, 1.285, 0.383, 0.855, 1.329], [1.037, -1.048, 1.113, 0.346, 0.92, 1.337], [0.631, -0.546, 1.683, 2.674, 0.421, 2.371], [-1.406, 0.284, 0.493, 0.426, 1.745, 1.616], [-0.913, 1.243, 0.625, 0.944, -0.159, 1.495], [-0.749, 1.827, 0.664, -0.223, 0.933, 1.793], [1.27, 1.253, 1.221, 0.403, 2.724, 2.094]]\nC: [[-0.454, -0.86, 2.026, 0.451, 0.358, 1.257], [1.65, -0.511, 2.057, 0.183, 0.13, 0.645], [0.357, -0.781, 1.143, 3.085, -0.312, 2.705], [-1.785, 0.873, 0.92, 0.414, 1.805, 1.915], [-0.907, 0.946, 0.648, 1.086, 0.063, 2.046], [-0.884, 1.711, 1.057, -0.048, 0.722, 0.964], [1.337, 0.641, 0.462, 0.296, 3.312, 2.01]]\nD: [[-0.751, -0.786, 1.574, 0.095, 0.444, 1.034], [1.18, -0.773, 1.574, 0.094, 0.433, 1.033], [0.142, -0.562, 1.184, 3.142, 0.116, 2.394], [-1.437, 0.419, 0.848, 0.139, 1.974, 1.688], [-1.083, 1.379, 1.042, 0.807, 0.163, 
1.776], [-0.694, 1.837, 0.766, 0.107, 0.954, 1.459], [1.355, 0.889, 0.903, 0.788, 2.903, 1.82]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_191_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_191_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.515, -3.241, 1.128, 2.444, 0.863, 2.298], [1.577, 0.871, 1.235, 2.218, 0.709, 2.09], [1.099, 3.677, 1.424, 1.498, 0.813, 2.316], [1.686, -0.521, 1.08, 2.449, 0.774, 1.941], [1.48, 2.234, 1.312, 2.224, 0.696, 2.161], [0.71, 4.953, 0.833, 0.669, 0.644, 1.154], [1.678, -1.888, 1.095, 2.523, 0.759, 2.102]]\nB: [[1.269, -3.321, 1.076, 2.021, 0.959, 2.397], [1.664, 1.284, 1.204, 2.329, 1.065, 2.182], [1.189, 3.832, 1.394, 1.94, 1.033, 1.829], [2.066, -0.941, 0.589, 2.315, 1.169, 1.455], [1.915, 2.253, 1.321, 2.418, 0.57, 2.378], [0.213, 5.41, 0.898, 0.409, 1.093, 1.517], [1.55, -2.082, 1.024, 2.82, 0.884, 2.344]]\nC: [[1.118, -3.575, 0.993, 1.946, 0.682, 2.318], [1.412, 0.928, 1.006, 2.495, 0.73, 2.187], [0.774, 3.36, 0.968, 1.482, 0.922, 2.574], [1.295, -0.734, 1.167, 2.189, 0.383, 1.587], [1.325, 2.548, 0.999, 2.413, 1.015, 2.532], [0.98, 5.017, 0.875, 0.448, 0.455, 0.917], [2.018, -1.5, 1.046, 2.717, 0.819, 2.55]]\nD: [[1.259, -3.521, 1.143, 2.894, 0.867, 2.663], [1.362, 1.016, 1.431, 2.314, 0.878, 2.2], [0.748, 3.481, 1.025, 1.495, 1.271, 2.75], [1.85, -0.752, 1.348, 2.468, 0.657, 1.566], [1.513, 2.006, 1.345, 1.751, 0.827, 2.159], [0.635, 4.802, 1.263, 0.202, 1.111, 1.501], [1.353, -2.331, 1.563, 2.89, 1.228, 2.108]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the bookshelf in the scene. 
The camera pose information includes: the rotation matrix: [[-0.941243, -0.209403, 0.264975], [-0.336113, 0.504116, -0.795548], [0.033012, -0.837865, -0.544878]]; the translation vector: [4.828751, 9.008894, 1.463441], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.515, -3.241, 1.128, 2.444, 0.863, 2.298], [1.577, 0.871, 1.235, 2.218, 0.709, 2.09], [1.099, 3.677, 1.424, 1.498, 0.813, 2.316], [1.686, -0.521, 1.08, 2.449, 0.774, 1.941], [1.48, 2.234, 1.312, 2.224, 0.696, 2.161], [0.71, 4.953, 0.833, 0.669, 0.644, 1.154], [1.678, -1.888, 1.095, 2.523, 0.759, 2.102]]\nB: [[1.269, -3.321, 1.076, 2.021, 0.959, 2.397], [1.664, 1.284, 1.204, 2.329, 1.065, 2.182], [1.189, 3.832, 1.394, 1.94, 1.033, 1.829], [2.066, -0.941, 0.589, 2.315, 1.169, 1.455], [1.915, 2.253, 1.321, 2.418, 0.57, 2.378], [0.213, 5.41, 0.898, 0.409, 1.093, 1.517], [1.55, -2.082, 1.024, 2.82, 0.884, 2.344]]\nC: [[1.118, -3.575, 0.993, 1.946, 0.682, 2.318], [1.412, 0.928, 1.006, 2.495, 0.73, 2.187], [0.774, 3.36, 0.968, 1.482, 0.922, 2.574], [1.295, -0.734, 1.167, 2.189, 0.383, 1.587], [1.325, 2.548, 0.999, 2.413, 1.015, 2.532], [0.98, 5.017, 0.875, 0.448, 0.455, 0.917], [2.018, -1.5, 1.046, 2.717, 0.819, 2.55]]\nD: [[1.259, -3.521, 1.143, 2.894, 0.867, 2.663], [1.362, 1.016, 1.431, 2.314, 0.878, 2.2], [0.748, 3.481, 1.025, 1.495, 1.271, 2.75], [1.85, -0.752, 1.348, 2.468, 0.657, 1.566], [1.513, 2.006, 1.345, 1.751, 0.827, 2.159], [0.635, 4.802, 1.263, 0.202, 1.111, 1.501], [1.353, -2.331, 1.563, 2.89, 1.228, 2.108]]", + 
"input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_192_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_192_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.126, -0.51, 1.73, -0.359, 0.479, 0.324]]\nB: [[-1.548, -0.135, 1.59, 0.021, 0.457, 0.386]]\nC: [[-1.508, -0.035, 1.589, -0.436, 0.071, 0.171]]\nD: [[-1.888, -0.563, 1.28, -0.393, 0.688, 0.046]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the picture in the scene. The camera pose information includes: the rotation matrix: [[0.623567, 0.536294, -0.568817], [0.781209, -0.455034, 0.427384], [-0.029628, -0.710867, -0.702702]]; the translation vector: [1.790477, 1.816361, 1.229059], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.126, -0.51, 1.73, -0.359, 0.479, 0.324]]\nB: [[-1.548, -0.135, 1.59, 0.021, 0.457, 0.386]]\nC: [[-1.508, -0.035, 1.589, -0.436, 0.071, 0.171]]\nD: [[-1.888, -0.563, 1.28, -0.393, 0.688, 0.046]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_193_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_193_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-2.08, 0.154, 1.005, 0.283, 1.414, 1.731]]\nB: [[-1.974, 0.286, 1.416, 0.341, 1.457, 1.235]]\nC: [[-1.941, 0.29, 1.24, -0.098, 1.307, 1.381]]\nD: [[-1.581, 0.374, 0.521, 0.311, 1.136, 1.526]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the whiteboard in the scene. The camera pose information includes: the rotation matrix: [[-0.341382, 0.594812, -0.727775], [0.932196, 0.11517, -0.343142], [-0.120287, -0.795572, -0.593798]]; the translation vector: [7.151203, 3.587152, 1.581923], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.08, 0.154, 1.005, 0.283, 1.414, 1.731]]\nB: [[-1.974, 0.286, 1.416, 0.341, 1.457, 1.235]]\nC: [[-1.941, 0.29, 1.24, -0.098, 1.307, 1.381]]\nD: [[-1.581, 0.374, 0.521, 0.311, 1.136, 1.526]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_194_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_194_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-0.65, 1.626, 0.952, 1.426, 0.125, 1.867]]\nB: [[-0.34, 1.647, 1.105, 1.036, 0.294, 2.092]]\nC: [[-0.202, 1.219, 1.248, 1.308, -0.28, 1.829]]\nD: [[-1.114, 1.711, 0.518, 0.996, 0.291, 2.172]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the doorframe in the scene. The camera pose information includes: the rotation matrix: [[-0.40936, -0.486807, 0.77165], [-0.912164, 0.236459, -0.334729], [-0.019515, -0.840896, -0.540844]]; the translation vector: [1.412713, 1.214489, 1.390939], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.65, 1.626, 0.952, 1.426, 0.125, 1.867]]\nB: [[-0.34, 1.647, 1.105, 1.036, 0.294, 2.092]]\nC: [[-0.202, 1.219, 1.248, 1.308, -0.28, 1.829]]\nD: [[-1.114, 1.711, 0.518, 0.996, 0.291, 2.172]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_195_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_195_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[-1.253, 0.185, 0.949, 0.229, 3.97, 1.922], [-0.174, 1.794, 1.044, 2.121, 0.218, 2.116], [0.873, -0.38, 1.281, 0.158, 4.352, 2.497], [0.476, -2.537, 0.593, 0.677, 0.042, 1.129], [0.122, -2.616, 0.312, 0.063, 0.188, 0.596]]\nB: [[-1.703, -0.184, 0.581, -0.118, 3.494, 2.053], [0.248, 1.316, 1.102, 2.022, 0.319, 1.655], [1.35, -0.101, 1.108, 0.315, 4.473, 2.489], [0.441, -2.72, 0.688, 0.321, 0.469, 1.1], [-0.308, -2.248, -0.131, 0.362, 0.498, 0.335]]\nC: [[-1.001, 0.387, 0.855, 0.13, 4.223, 1.808], [-0.121, 2.25, 1.058, 2.216, 0.377, 2.185], [0.489, 0.025, 0.85, -0.341, 3.971, 2.77], [0.668, -2.895, 0.381, 0.972, 0.18, 1.122], [0.223, -2.648, 0.118, -0.29, 0.288, 0.814]]\nD: [[-1.615, 0.237, 0.631, 0.113, 3.734, 2.164], [-0.111, 1.6, 1.257, 2.2, 0.658, 1.704], [0.468, -0.376, 0.97, -0.134, 3.943, 2.668], [0.083, -2.476, 0.49, 0.836, 0.329, 1.629], [-0.101, -2.949, 0.022, 0.48, 0.426, 0.711]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.977514, -0.102294, 0.184398], [-0.210796, -0.497303, 0.841578], [0.005613, -0.861525, -0.507684]]; the translation vector: [3.555602, 1.207732, 1.356493], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.253, 0.185, 0.949, 0.229, 3.97, 1.922], [-0.174, 1.794, 1.044, 2.121, 0.218, 2.116], [0.873, -0.38, 1.281, 0.158, 4.352, 2.497], [0.476, -2.537, 0.593, 0.677, 0.042, 1.129], [0.122, -2.616, 0.312, 0.063, 0.188, 0.596]]\nB: [[-1.703, -0.184, 0.581, -0.118, 3.494, 2.053], [0.248, 1.316, 1.102, 2.022, 0.319, 1.655], [1.35, -0.101, 1.108, 0.315, 4.473, 2.489], [0.441, -2.72, 0.688, 0.321, 0.469, 1.1], [-0.308, -2.248, -0.131, 0.362, 0.498, 0.335]]\nC: [[-1.001, 0.387, 0.855, 0.13, 4.223, 1.808], [-0.121, 2.25, 1.058, 2.216, 0.377, 2.185], [0.489, 0.025, 0.85, -0.341, 3.971, 2.77], [0.668, -2.895, 0.381, 0.972, 0.18, 1.122], [0.223, -2.648, 0.118, -0.29, 0.288, 0.814]]\nD: [[-1.615, 0.237, 0.631, 0.113, 3.734, 2.164], [-0.111, 1.6, 1.257, 2.2, 0.658, 1.704], [0.468, -0.376, 0.97, -0.134, 3.943, 2.668], [0.083, -2.476, 0.49, 0.836, 0.329, 1.629], [-0.101, -2.949, 0.022, 0.48, 0.426, 0.711]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_196_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_196_1.png" + ], + "output": "A" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[1.561, -0.516, 0.43, 0.03, 0.264, 0.023]]\nB: [[1.307, -0.077, 0.927, 0.18, 0.373, 0.438]]\nC: [[1.232, 0.339, 1.368, -0.266, 0.794, 0.386]]\nD: [[1.366, 0.134, 0.662, 0.477, 0.375, 0.57]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the toilet paper holder 
in the scene. The camera pose information includes: the rotation matrix: [[-0.566304, -0.590941, 0.574533], [-0.823945, 0.423135, -0.376925], [-0.020365, -0.686838, -0.726526]]; the translation vector: [2.143516, 1.760119, 1.343188], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.561, -0.516, 0.43, 0.03, 0.264, 0.023]]\nB: [[1.307, -0.077, 0.927, 0.18, 0.373, 0.438]]\nC: [[1.232, 0.339, 1.368, -0.266, 0.794, 0.386]]\nD: [[1.366, 0.134, 0.662, 0.477, 0.375, 0.57]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_197_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_197_1.png" + ], + "output": "B" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.989, -2.867, 2.62, 3.827, 5.879, 0.215]]\nB: [[0.557, -2.629, 2.447, 3.868, 5.161, -0.064]]\nC: [[0.767, -2.57, 3.32, 4.124, 4.999, -0.179]]\nD: [[0.538, -2.391, 2.899, 4.263, 5.407, 0.187]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the ceiling in the scene. The camera pose information includes: the rotation matrix: [[-0.999494, 0.005595, 0.031322], [-0.029883, 0.172936, -0.98448], [-0.010925, -0.984917, -0.172681]]; the translation vector: [6.687301, 5.436423, 1.742894], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.989, -2.867, 2.62, 3.827, 5.879, 0.215]]\nB: [[0.557, -2.629, 2.447, 3.868, 5.161, -0.064]]\nC: [[0.767, -2.57, 3.32, 4.124, 4.999, -0.179]]\nD: [[0.538, -2.391, 2.899, 4.263, 5.407, 0.187]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_198_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_198_1.png" + ], + "output": "D" + }, + { + "task": "threeD_Object_Detection", + "visual_input_component": "3d image", + "source": "SCANNET_threed_bbox_detection", + "options": "A: [[0.728, -0.216, 1.391, -0.233, 0.319, 0.888]]\nB: [[1.382, -0.434, 1.41, 0.62, 0.036, 0.847]]\nC: [[1.017, -0.314, 0.963, 0.261, 0.326, 0.441]]\nD: [[1.373, -0.033, 0.749, 0.246, 0.609, 0.097]]", + "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the paper towel dispenser in the scene. The camera pose information includes: the rotation matrix: [[0.207705, 0.494542, -0.843971], [0.97739, -0.069996, 0.199524], [0.039599, -0.866331, -0.497898]]; the translation vector: [4.53083, 2.291093, 1.52739], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", + "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.728, -0.216, 1.391, -0.233, 0.319, 0.888]]\nB: [[1.382, -0.434, 1.41, 0.62, 0.036, 0.847]]\nC: [[1.017, -0.314, 0.963, 0.261, 0.326, 0.441]]\nD: [[1.373, -0.033, 0.749, 0.246, 0.609, 0.097]]", + "input_image_path": [ + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_199_0.jpg", + "../MMIU-Benchmark/threeD_Object_Detection/threeD_Object_Detection_199_1.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: microwave\nB: refrigerator\nC: stove\nD: television", + "question": "which object changed its status when the person do the first action did before he/she point to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: microwave\nB: refrigerator\nC: stove\nD: television", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: apple1\nB: orange2\nC: banana3\nD: grape4", + "question": "which object changed its status when the person put something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: apple1\nB: orange2\nC: banana3\nD: grape4", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: broken\nB: emptiness\nC: cleanliness\nD: fullness", + "question": "what status of cup changed while the person do the first action did before he/she wash something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: broken\nB: emptiness\nC: cleanliness\nD: fullness", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: locked\nB: opened\nC: half-opened\nD: closed", + "question": "what will the status of fridge change to if the actor do the first action in the video in the future?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: locked\nB: opened\nC: half-opened\nD: closed", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Put water-pot to table\nB: Placed water-pot on shelf\nC: Put water-pot to floor\nD: Moved water-pot to window", + "question": "How did the person changed the spatial relationships of the last object that has status change in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put water-pot to table\nB: Placed water-pot on shelf\nC: Put water-pot to floor\nD: Moved water-pot to window", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: I don't know\nB: maybe\nC: yes\nD: no", + "question": "Does the first action did after the person point to something fulfills the preconditions of the action eating something with something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person 
perspective. \nSelect from the following choices.\nA: I don't know\nB: maybe\nC: yes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Reading a book quietly\nB: Chopping vegetables on a board\nC: Put fish to basin using fishing-net\nD: Playing a musical instrument", + "question": "During which action does the person knows about the other person's action?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Reading a book quietly\nB: Chopping vegetables on a board\nC: Put fish to basin using fishing-net\nD: Playing a musical instrument", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: sometimes\nB: maybe\nC: no\nD: yes", + "question": "If the person did not get something from something, is the person able to open something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: maybe\nC: no\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Sit on the couch\nB: Turn on the TV\nC: Open microwave\nD: Close the window", + "question": "what will the other person do next?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Sit on the couch\nB: Turn on the TV\nC: Open microwave\nD: Close the window", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Pick up the book\nB: Put cup to the other person\nC: Turn off the lights\nD: Close the door", + "question": "If the person did not do the last action in the video, what remaining actions in the video is executable?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Pick up the book\nB: Put cup to the other person\nC: Turn off the lights\nD: Close the door", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: maybe\nC: yes\nD: sometimes", + "question": "If the person did not sweep something using something, is the person able to turn off something with something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: yes\nD: sometimes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: partially\nB: yes\nC: maybe\nD: no", + "question": "Did the attribute of remote changed because of the first action in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: partially\nB: yes\nC: maybe\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Put sandwich to plate\nB: Take sandwich off the plate\nC: Throw sandwich away\nD: Put sandwich in the fridge", + "question": "What is the last action the person did in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put sandwich to plate\nB: Take sandwich off the plate\nC: Throw sandwich away\nD: Put sandwich in the fridge", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Wash knife\nB: Dry dishes\nC: Cook meal\nD: Sweep floor", + "question": "what is the other person doing while the person put something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Wash knife\nB: Dry dishes\nC: Cook meal\nD: Sweep floor", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: not sure\nB: maybe\nC: yes\nD: no", + "question": "Does the last action in the video fulfills the preconditions of the action putting something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: not sure\nB: maybe\nC: yes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Put the fork in the fridge\nB: Dropped the fork on the floor\nC: Get fork from table\nD: Mixed the fork with a spoon", + "question": "How did the person changed the state of mixture of fork?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put the fork in the fridge\nB: Dropped the fork on the floor\nC: Get fork from table\nD: Mixed the fork with a spoon", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: maybe\nB: no\nC: sometimes\nD: yes", + "question": "Does the last action in the video fulfills the preconditions of the action putting something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: sometimes\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: falling to the ground\nB: broken in half\nC: completely detached\nD: attached to knife base", + "question": "What is the status of knife after the person do the first action did before he/she get something from something to change it?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: falling to the ground\nB: broken in half\nC: completely detached\nD: attached to knife base", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: remote\nB: book\nC: lamp\nD: cup", + "question": "which object changed its status first in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: remote\nB: book\nC: lamp\nD: cup", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: in fridge\nB: in microwave\nC: on table\nD: in sink", + "question": "what will the status of cup1 change to if the actor put something to something in the future?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in fridge\nB: in microwave\nC: on table\nD: in sink", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: maybe\nC: yes\nD: sometimes", + "question": "Did the attribute of controller changed because of the first action did before the person point to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: yes\nD: sometimes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Add seasoning to meat\nB: Cut meat with a knife\nC: Put meat in oven\nD: Get meat from pan using fork", + "question": "How did the person changed the wrappedness of meat1?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Add seasoning to meat\nB: Cut meat with a knife\nC: Put meat in oven\nD: Get meat from pan using fork", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Put the cup in the fridge\nB: Wash cup\nC: Break the cup\nD: Throw the cup away", + "question": "what will the person do next after this video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put the cup in the fridge\nB: Wash cup\nC: Break the cup\nD: Throw the cup away", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: sometimes\nC: no\nD: maybe", + "question": "If the person did not do the first action did before he/she drink something with something, is the person able to wash something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: sometimes\nC: no\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Get kettle from stove\nB: Pick up a spoon\nC: Open the fridge\nD: Turn on the faucet", + "question": "What is the first action the person did in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get kettle from stove\nB: Pick up a spoon\nC: Open the fridge\nD: Turn on the faucet", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Get bowl from microwave\nB: Put a cup inside\nC: Turned it on without food\nD: Left it empty", + "question": "How did the person changed the emptiness of microwave?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get bowl from microwave\nB: Put a cup inside\nC: Turned it on without food\nD: Left it empty", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: frozen\nB: boiled\nC: cooked\nD: raw", + "question": "What does the person want meat1 to be for the action cooking something using something in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: frozen\nB: boiled\nC: cooked\nD: raw", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: next to the sink\nB: inside the cabinet\nC: under the table\nD: on top of knife", + "question": "What is the status of watermelon2 before the person put something to something using knife to change it?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: next to the sink\nB: inside the cabinet\nC: under the table\nD: on top of knife", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: under juicer base\nB: behind juicer base\nC: next to juicer base\nD: on top of juicer base", + "question": "What does the person want the last object that has status change in the video to be for the action putting something to something in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: under juicer base\nB: behind juicer base\nC: next to juicer base\nD: on top of juicer base", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Hold controller\nB: Drop controller\nC: Throw controller\nD: Put controller to table", + "question": "what is the other person doing while the person do the first action did after he/she turn off something with something?", + "context": "Your task is to understand and reasoning about 
activities and events from the first-person perspective. \nSelect from the following choices.\nA: Hold controller\nB: Drop controller\nC: Throw controller\nD: Put controller to table", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: it depends\nC: no\nD: maybe", + "question": "If the person did not open something, is the person able to pour from something into something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: it depends\nC: no\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: only if they do the second action\nC: no\nD: maybe", + "question": "If the person did not do the first action in the video, will juicer-lid change its status?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: only if they do the second action\nC: no\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Eating a snack\nB: Talking on the phone\nC: Point to TV\nD: Reading a book", + "question": "What is the person doing before he/she stand-up?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Eating a snack\nB: Talking on the phone\nC: Point to TV\nD: Reading a book", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Ignore it\nB: Wipe with a dry cloth\nC: Use a paper towel\nD: Wash cutting-board", + "question": "How did the person changed the cleanliness of cutting-board?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Ignore it\nB: Wipe with a dry cloth\nC: Use a paper towel\nD: Wash cutting-board", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: no\nC: possibly\nD: uncertain", + "question": "Did the attribute of closet changed because of the action closing something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: possibly\nD: uncertain", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Open a window\nB: Read a book\nC: Make a phone call\nD: Get remote from shelf", + "question": "What is the person doing before he/she turn on something with something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Open a window\nB: Read a book\nC: Make a phone call\nD: Get remote from shelf", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: cannot be determined\nB: no\nC: yes\nD: not sure", + "question": "Is kettle-base visible to the other person before the person do the first action in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: cannot be determined\nB: no\nC: yes\nD: not sure", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Drive a car\nB: Take a nap\nC: Put juicer to juicer-base\nD: Read a book", + "question": "what will the other person do next?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Drive a car\nB: Take a nap\nC: Put juicer to juicer-base\nD: Read a book", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: maybe\nB: yes\nC: no\nD: sometimes", + "question": "If the person did not sweep something using something, will the last object that has status change in the video change its status?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: yes\nC: no\nD: sometimes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: taste\nB: color\nC: shape\nD: size", + "question": "what status will the person change on tomato?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: taste\nB: color\nC: shape\nD: size", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Move vacuum to closet\nB: Get vacuum from floor\nC: Leave vacuum outside\nD: Put vacuum on table", + "question": "How did the person changed the spatial relationships of the last object that has status change in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Move vacuum to closet\nB: Get vacuum from floor\nC: Leave vacuum outside\nD: Put vacuum on table", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: door\nB: sink\nC: chair\nD: table", + "question": "which object changed its status when the person do the first action did before he/she fill something using something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: door\nB: sink\nC: chair\nD: table", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: height\nB: wateredness\nC: humidity\nD: brightness", + "question": "Which attribute does the person want to change with plant for doing the action pouring from something into something in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: height\nB: wateredness\nC: humidity\nD: brightness", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: wateredness\nB: leaf size\nC: height\nD: color", + "question": "what status of plant changed while the person do the first action did after he/she fill something using something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: wateredness\nB: leaf size\nC: height\nD: color", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Cook food in the kitchen\nB: Watch TV instead of fishing\nC: Get fishing-net and fish from basin and fishing-net\nD: Play a game on the computer", + "question": "If the person did not fill something using something, what remaining actions in the video is executable?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Cook food in the kitchen\nB: Watch TV instead of fishing\nC: Get fishing-net and fish from basin and fishing-net\nD: Play a game on the computer", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: plate\nB: spoon\nC: cup\nD: fork", + "question": "which object changed its status last in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: plate\nB: spoon\nC: cup\nD: fork", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Work on juicer-lid\nB: Read a book\nC: Cook dinner\nD: Go for a run", + "question": "what will the person do next after this video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Work on juicer-lid\nB: Read a book\nC: Cook dinner\nD: Go for a run", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: size\nB: openess\nC: brand\nD: color", + "question": "Which attribute does the person want to change with fridge for doing the last action in the video in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: size\nB: openess\nC: brand\nD: color", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: sometimes\nB: only if the person opens something else\nC: no\nD: yes", + "question": "If the person did not close something, will cereal1 change its status?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: only if the person opens something else\nC: no\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: uncertain\nB: no\nC: yes\nD: maybe", + "question": "Did the attribute of lettuce changed because of the first action did after the person get something from something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: uncertain\nB: no\nC: yes\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Talking on the phone\nB: Walking away\nC: Eating noodles\nD: Get noodles from table", + "question": "what is the other person doing while the person do the first action in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Talking on the phone\nB: Walking away\nC: Eating noodles\nD: Get noodles from table", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: wrapping\nB: lamp\nC: table\nD: chair", + "question": "which object changed its status first in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: wrapping\nB: lamp\nC: table\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: no\nC: maybe\nD: sometimes", + "question": "If the person did not get something from something, is the person able to put something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: maybe\nD: sometimes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Break the seal\nB: Flip the switch\nC: Open wrapping\nD: Cut the ribbon", + "question": "What action caused the first object that has status change in the video's status to change to opened?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Break the seal\nB: Flip the switch\nC: Open wrapping\nD: Cut the ribbon", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: sometimes\nC: no\nD: maybe", + "question": "If the person did not open something, is the person able to put something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: sometimes\nC: no\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Place fork on table\nB: Dry fork\nC: Pick up spoon\nD: Wash fork", + "question": "If the person did not do the first action did after he/she put something to something, what remaining actions in the video is executable?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Place fork on table\nB: Dry fork\nC: Pick up spoon\nD: Wash fork", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: under the table\nB: inside the cupboard\nC: in the sink\nD: on top of shelf", + "question": "What is the status of plate before the other person do the first action before he/she put something to something to change it?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: under the table\nB: inside the cupboard\nC: in the sink\nD: on top of shelf", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: attached to fork\nB: on the plate\nC: in the pan\nD: detached from fork", + "question": "What is the status of meat3 before the person do the first action did after he/she get something from something using fork to change it?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: attached to fork\nB: on the plate\nC: in the pan\nD: detached from fork", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: chair\nB: lamp\nC: book\nD: knife", + "question": "If the actor do not get something from something, which object will he/she not be able to change in the future?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: chair\nB: lamp\nC: book\nD: knife", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: I don\u2019t know\nB: yes\nC: maybe\nD: no", + "question": "Did the attribute of the last object that has status change in the video changed because of the action turning off something with something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: I don\u2019t know\nB: yes\nC: maybe\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: yes\nC: maybe\nD: uncertain", + "question": "Did the attribute of fridge changed because of the action closing something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes\nC: maybe\nD: uncertain", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: maybe\nB: no\nC: yes\nD: I don\u2019t know", + "question": "Did the attribute of fridge changed because of the action opening something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: I don\u2019t know", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: no\nC: sometimes\nD: unknown", + "question": "Is the other person aware when the person stand-up?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: sometimes\nD: unknown", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: no\nC: maybe\nD: sometimes", + "question": "Did the attribute of juicer changed because of the first action did before the person open something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: maybe\nD: sometimes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Get lettuce from lettuce\nB: Go to sleep\nC: Write a report\nD: Eat a sandwich", + "question": "What is the person doing after he/she work on something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get lettuce from lettuce\nB: Go to sleep\nC: Write a report\nD: Eat a sandwich", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: partially wrapped\nB: wrapped\nC: double wrapped\nD: unwrapped", + "question": "what will the person want to have coffee's wrappedness be in the future?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: partially wrapped\nB: wrapped\nC: double wrapped\nD: unwrapped", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: maybe\nB: sometimes\nC: yes\nD: no", + "question": "If the person did not do the first action did after he/she turn on something with something, is the person able to get something from something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: sometimes\nC: yes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: no\nC: often\nD: sometimes", + "question": "Did the attribute of fishing-net changed because of the action filling something using something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: often\nD: sometimes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: in the car\nB: in closet\nC: on the table\nD: under the bed", + "question": "What does the person want cup to be for the action putting something to something in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in the car\nB: in closet\nC: on the table\nD: under the bed", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Reading a book\nB: Playing video games\nC: Cooking dinner\nD: Wash juicer and juicer-lid", + "question": "what is the other person doing while the person do the first action did before he/she get something from something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Reading a book\nB: Playing video games\nC: Cooking dinner\nD: Wash juicer and juicer-lid", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_17.png", 
+ "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: sometimes\nB: maybe\nC: yes\nD: no", + "question": "If the person did not open something, will the first object that has status change in the video change its status?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: maybe\nC: yes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: separate from the other person\nB: attached to the other person\nC: above the other person\nD: beneath the other person", + "question": "what will the person want to have juice's spatial relationships be in the future?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: separate from the other person\nB: attached to the other person\nC: above the other person\nD: beneath the other person", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Pour from bottle-water into juicer\nB: Running a Marathon\nC: Playing a Piano\nD: Sleeping", + "question": "What is the person doing after he/she open something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Pour from bottle-water into juicer\nB: Running a Marathon\nC: Playing a Piano\nD: Sleeping", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: not enough information\nB: no\nC: maybe\nD: yes", + "question": "If the person did not do the first action did before he/she pour from something into something, will spoon change its status?", + "context": "Your task is to understand and reasoning about activities and events from the 
first-person perspective. \nSelect from the following choices.\nA: not enough information\nB: no\nC: maybe\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: sometimes\nC: maybe\nD: no", + "question": "Did the attribute of closet changed because of the first action did after the person put something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: sometimes\nC: maybe\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: temporal relationships\nB: emotional status\nC: spatial relationships\nD: frequency", + "question": "what status of cup changed while the person get something from something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: temporal relationships\nB: emotional status\nC: spatial relationships\nD: frequency", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: placed on the table\nB: attached to me\nC: inside the drawer\nD: on the kitchen counter", + "question": "What is the status of knife before the person put something to something to change it?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: placed on the table\nB: attached to me\nC: inside the drawer\nD: on the kitchen counter", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_17.png", 
+ "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: irrelevant\nB: no\nC: yes\nD: partially", + "question": "Did the attribute of knife changed because of the first action did after the person wash something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: irrelevant\nB: no\nC: yes\nD: partially", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: maybe\nB: yes\nC: no\nD: unsure", + "question": "If the person did not do the first action in the video, will tv change its status?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: yes\nC: no\nD: unsure", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Get juicer from juicer-base\nB: Turn on the blender\nC: Chop vegetables\nD: Check the timer", + "question": "What is the person doing after he/she put something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get juicer from juicer-base\nB: Turn on the blender\nC: Chop vegetables\nD: Check the timer", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: broken\nB: opened\nC: painted\nD: removed", + "question": "what will the person want to have the last object that has status change in the video's openess be in the future?", + "context": "Your task is to understand and reasoning about activities and events from the first-person 
perspective. \nSelect from the following choices.\nA: broken\nB: opened\nC: painted\nD: removed", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Put cup to shelf\nB: Cook meal\nC: Wash car\nD: Water plants", + "question": "If the person did not do the first action did after he/she close something, what remaining actions in the video is not executable?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put cup to shelf\nB: Cook meal\nC: Wash car\nD: Water plants", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: wet\nB: broken\nC: dirty\nD: clean", + "question": "What is the status of plate after the person wash something to change it?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: wet\nB: broken\nC: dirty\nD: clean", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Get tomato from table\nB: Wash the knife\nC: Chop the tomato\nD: Slice the bread", + "question": "What is the person doing before he/she cut something using something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get tomato from table\nB: Wash the knife\nC: Chop the tomato\nD: Slice the bread", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Cook a meal\nB: Read a book\nC: Paint a picture\nD: Put fish to basin using tank", + "question": "If the person did not get something from something using fishing-net, what remaining actions in the video is not executable?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Cook a meal\nB: Read a book\nC: Paint a picture\nD: Put fish to basin using tank", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: only sometimes\nB: maybe\nC: yes\nD: no", + "question": "If the person did not pour from something into something, will kettle-lid change its status?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: only sometimes\nB: maybe\nC: yes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Read a book\nB: Cook a meal\nC: Get coffee from shelf\nD: Wash the dishes", + "question": "what is the other person doing while the person do the first action in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Read a book\nB: Cook a meal\nC: Get coffee from shelf\nD: Wash the dishes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Put tank to table\nB: Take tank from table\nC: Move tank to floor\nD: Put table to tank", + "question": "What is the person doing before he/she get something from something and fishing-net?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put tank to table\nB: Take tank from table\nC: Move tank to floor\nD: Put table to tank", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_17.png", 
+ "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: sofa\nB: lamp\nC: table\nD: vacuum", + "question": "which object changed its status first in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sofa\nB: lamp\nC: table\nD: vacuum", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Reading a book\nB: Eating lunch\nC: Taking a nap\nD: Put wrapping to table", + "question": "what is the other person doing while the person do the first action did before he/she close something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Reading a book\nB: Eating lunch\nC: Taking a nap\nD: Put wrapping to table", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: half-squeezed\nB: in market\nC: in juicer\nD: unpeeled", + "question": "what will the status of orange2 change to if the actor do the first action did before he/she get something from something in the future?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: half-squeezed\nB: in market\nC: in juicer\nD: unpeeled", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Slice the watermelon\nB: Peel the watermelon\nC: Throw away the watermelon\nD: Put watermelon to juicer", + "question": "what will the other person do next?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Slice the watermelon\nB: Peel the watermelon\nC: Throw away the watermelon\nD: Put watermelon to juicer", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: maybe\nB: possibly\nC: no\nD: yes", + "question": "Did the attribute of kettle-base changed because of the first action did after the person close something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: possibly\nC: no\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Reading a book\nB: Boiling water\nC: Put kettle to table\nD: Sitting on a chair", + "question": "what is the other person doing while the person do the first action did before he/she point to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Reading a book\nB: Boiling water\nC: Put kettle to table\nD: Sitting on a chair", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: it depends\nB: yes\nC: sometimes\nD: no", + "question": "Does the first action did before the person turn on something with something fulfills the preconditions of the action getting something from something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: it depends\nB: yes\nC: sometimes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Place kettle on stovetop\nB: Fill kettle using sink\nC: Turn on the kettle\nD: Add tea leaves to kettle", + "question": "What action caused kettle's status to change to nonempty?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Place kettle on stovetop\nB: Fill kettle using sink\nC: Turn on the kettle\nD: Add tea leaves to kettle", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: no\nC: maybe\nD: sometimes", + "question": "Did the attribute of tomato1 changed because of the first action did before the person wash something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: maybe\nD: sometimes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Stand up and walk away\nB: Sit down on sofa\nC: Open the window\nD: Start cooking dinner", + "question": "what will the other person do next?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Stand up and walk away\nB: Sit down on sofa\nC: Open the window\nD: Start cooking dinner", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_17.png", 
+ "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Watch TV\nB: Read a book\nC: Go for a run\nD: Work on noodles", + "question": "what will the person do next after this video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Watch TV\nB: Read a book\nC: Go for a run\nD: Work on noodles", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: knife\nB: cup\nC: phone\nD: pen", + "question": "which object changed its status when the other person get something from something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: knife\nB: cup\nC: phone\nD: pen", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Close the tank lid\nB: Put tank on table\nC: Put tank to sink\nD: Pour tank contents into a glass", + "question": "What is the person doing after he/she pour from something into something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Close the tank lid\nB: Put tank on table\nC: Put tank to sink\nD: Pour tank contents into a glass", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: maybe\nC: no\nD: sometimes", + "question": "If the person did not do the first action did before he/she open something, is the person able to get something from something?", + "context": "Your task is to understand and reasoning about activities and events from the 
first-person perspective. \nSelect from the following choices.\nA: yes\nB: maybe\nC: no\nD: sometimes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: in room temperature\nB: in boiling water\nC: in the freezer\nD: in the microwave", + "question": "what will the person want to have the last object that has status change in the video's temperature be in the future?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in room temperature\nB: in boiling water\nC: in the freezer\nD: in the microwave", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Tie shoes\nB: Read a book\nC: Drink water from cup\nD: Get meat from floor", + "question": "What is the person doing before he/she throw something into something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person 
perspective. \nSelect from the following choices.\nA: Tie shoes\nB: Read a book\nC: Drink water from cup\nD: Get meat from floor", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: can not be opened\nB: left part removed\nC: completely sealed\nD: right part added", + "question": "What is the precondition of changing the openability of meat2?", + "context": "Your task is to understand and reasoning about activities and events from the first-person 
perspective. \nSelect from the following choices.\nA: can not be opened\nB: left part removed\nC: completely sealed\nD: right part added", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: undecided\nB: no\nC: yes\nD: maybe", + "question": "Did the attribute of cutting-board changed because of the first action did after the person point to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person 
perspective. \nSelect from the following choices.\nA: undecided\nB: no\nC: yes\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: cleanliness\nB: weight\nC: sharpness\nD: color", + "question": "Which attribute does the person want to change with knife for doing the last action in the video in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: cleanliness\nB: weight\nC: sharpness\nD: color", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: lettuce\nB: carrot\nC: pepper\nD: tomato", + "question": "which object changed its status when the other person do the last action in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: lettuce\nB: carrot\nC: pepper\nD: tomato", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Leave the watermelon unpeeled\nB: Wash the cutting-board\nC: Cut the watermelon on the floor\nD: Get watermelon from cutting-board", + "question": "If the other person did not wash something, what actions of this person in the video is executable?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Leave the watermelon unpeeled\nB: Wash the cutting-board\nC: Cut the watermelon on the floor\nD: Get watermelon from cutting-board", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: always on\nB: could be turned off\nC: always off\nD: could be turned on", + "question": "What is the precondition of changing the switchability of the last object that has status change in the video?", + "context": "Your task is to understand and reasoning about activities 
and events from the first-person perspective. \nSelect from the following choices.\nA: always on\nB: could be turned off\nC: always off\nD: could be turned on", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: not sure\nB: maybe\nC: no\nD: yes", + "question": "Did the attribute of fridge changed because of the action closing something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: not sure\nB: maybe\nC: no\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: yes\nC: not sure\nD: maybe", + "question": "If the person did not do the last action in the video, is the person able to put something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes\nC: not sure\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: max temperature\nB: on\nC: off\nD: half full", + "question": "What is the precondition of changing the poweredness of kettle?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: max temperature\nB: on\nC: off\nD: half full", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: maybe\nB: no\nC: yes\nD: not sure", + "question": "Did the attribute of vacuum changed because of the last action in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: not sure", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: maybe\nB: sometimes\nC: yes\nD: no", + "question": "Is milk visible to the other person after the person do the first action did after he/she open something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: sometimes\nC: yes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: maybe\nC: not sure\nD: no", + "question": "Does the first action in the video fulfills the preconditions of the action opening something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: not sure\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: sometimes\nB: only if it is a chair\nC: yes\nD: no", + "question": "Does the action sitting down on something fulfills the preconditions of the action putting something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: only if it is a chair\nC: yes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: on the side of shelf\nB: next to the door\nC: on the edge of table\nD: under the chair", + "question": "What is the status of vacuum before the person get something from something to change it?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: on the side of shelf\nB: next to the door\nC: on the edge of table\nD: under the chair", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: yes\nC: maybe\nD: sometimes", + "question": "Does the action sitting down on something fulfills the preconditions of the action switching with something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person 
perspective. \nSelect from the following choices.\nA: no\nB: yes\nC: maybe\nD: sometimes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: fishing-rod\nB: boat\nC: fishing-net\nD: life-jacket", + "question": "which object changed its status when the person do the first action in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: fishing-rod\nB: boat\nC: fishing-net\nD: life-jacket", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Moved the milk to the fridge\nB: Placed the milk on the floor\nC: Put milk to table\nD: Took the milk off the table", + "question": "How did the person changed the spatial relationships of the first object that has status change in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Moved the milk to the fridge\nB: Placed the milk on the floor\nC: Put milk to table\nD: Took the milk off the table", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: spoon\nB: knife\nC: plate\nD: fork", + "question": "which object changed its status when the other person do the first action before he/she eat something with something?", + "context": "Your task is to understand and reasoning about activities and events from the 
first-person perspective. \nSelect from the following choices.\nA: spoon\nB: knife\nC: plate\nD: fork", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: sometimes\nB: maybe\nC: yes\nD: no", + "question": "If the person did not do the first action did before he/she wash something, will sink change its status?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: maybe\nC: yes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: maybe\nC: yes\nD: sometimes", + "question": "If the person did not do the first action did before he/she get something from something, is the person able to put something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: yes\nD: sometimes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: yes\nC: maybe\nD: sometimes", + "question": "Did the attribute of meat changed because of the action putting something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes\nC: maybe\nD: sometimes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Get meat from meat using spoon\nB: Use a knife to cut the meat\nC: Boil the meat to change its shape\nD: Squash the meat with a fork", + "question": "How did the person changed the shape of meat2?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get meat from meat using spoon\nB: Use a knife to cut the meat\nC: Boil the meat to change its shape\nD: Squash the meat with a fork", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: only partially\nB: maybe\nC: no\nD: yes", + "question": "Did the attribute of spoon changed because of the first action did after the person put something to something?", + "context": "Your task is to understand and reasoning about activities and events from the 
first-person perspective. \nSelect from the following choices.\nA: only partially\nB: maybe\nC: no\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: uncertain\nB: no\nC: maybe\nD: yes", + "question": "Does the first action did before the person put something to something fulfills the preconditions of the action opening something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: uncertain\nB: no\nC: maybe\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: yes\nC: only if watermelon1 changes its status first\nD: maybe", + "question": "If the person did not do the first action did before he/she get something from something, will watermelon2 change its status?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes\nC: only if watermelon1 changes its status first\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_17.png", 
+ "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: in the fridge\nB: on the table\nC: in cup1\nD: in cup2", + "question": "What is the status of juice before the person do the first action did after he/she get something from something to change it?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in the fridge\nB: on the table\nC: in cup1\nD: in cup2", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: in the garden\nB: on the table\nC: under the bed\nD: in sink", + "question": "What does the person want tank to be for the first action did before the person pour from something into something in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in the garden\nB: on the table\nC: under the bed\nD: in sink", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: in the refrigerator\nB: under the sink\nC: on top of stove\nD: outside in the garden", + "question": "What is the status of water-pot after the other person put something to something to change it?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in the refrigerator\nB: under the sink\nC: on top of stove\nD: outside in the garden", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Get watermelon from cutting-board\nB: Wash hands\nC: Put apple on the counter\nD: Chop vegetables", + "question": "What is the last action the person did in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person 
perspective. \nSelect from the following choices.\nA: Get watermelon from cutting-board\nB: Wash hands\nC: Put apple on the counter\nD: Chop vegetables", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: possibly\nB: yes\nC: no\nD: unknown", + "question": "Does the first action in the video fulfills the preconditions of the action pouring from something into something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person 
perspective. \nSelect from the following choices.\nA: possibly\nB: yes\nC: no\nD: unknown", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: maybe\nB: no\nC: I am not sure\nD: yes", + "question": "Did the attribute of controller1 changed because of the first action did before the person stand-up?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: I am not sure\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: full\nB: half-full\nC: boiling\nD: empty", + "question": "What is the status of kettle after the person do the last action in the video to change it?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: full\nB: half-full\nC: boiling\nD: empty", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Turned on the lights\nB: Plugged in the TV\nC: Turned off the remote\nD: Turn on TV with remote", + "question": "How did the person changed the poweredness of the first object that has status change in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Turned on the lights\nB: Plugged in the TV\nC: Turned off the remote\nD: Turn on TV with remote", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: maybe\nC: sometimes\nD: no", + "question": "Does the action putting something to something fulfills the preconditions of the action watching something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person 
perspective. \nSelect from the following choices.\nA: yes\nB: maybe\nC: sometimes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: maybe\nB: no\nC: yes\nD: only if performed sequentially", + "question": "Does the action getting something from something fulfills the preconditions of the action putting something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: only if performed sequentially", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Cooking food on stove\nB: Fill water-pot using water-dispenser\nC: Reading a book\nD: Talking on the phone", + "question": "what is the other person doing while the person stand-up?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Cooking food on stove\nB: Fill water-pot using water-dispenser\nC: Reading a book\nD: Talking on the phone", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: only if they open the fridge again\nB: yes\nC: maybe\nD: no", + "question": "Is fridge visible to the other person after the person do the first action did after he/she put something to something?", + "context": "Your task is to understand and reasoning about activities and 
events from the first-person perspective. \nSelect from the following choices.\nA: only if they open the fridge again\nB: yes\nC: maybe\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: no\nC: not sure\nD: maybe", + "question": "Did the attribute of fridge changed because of the action opening something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: not sure\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Get cereal from table\nB: Turn off the lights\nC: Wash dishes\nD: Open fridge", + "question": "What is the last action the person did in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get cereal from table\nB: Turn off the lights\nC: Wash dishes\nD: Open fridge", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Put orange in the fridge\nB: Put watermelon to fridge\nC: Take watermelon out of the fridge\nD: Take apples from the table", + "question": "What is the person doing before he/she get something from something?", + "context": "Your task is to understand and reasoning about 
activities and events from the first-person perspective. \nSelect from the following choices.\nA: Put orange in the fridge\nB: Put watermelon to fridge\nC: Take watermelon out of the fridge\nD: Take apples from the table", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_15.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: probably\nC: maybe\nD: no", + "question": "Did the attribute of the first object that has status change in the video changed because of the action getting 
something from something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. \nSelect from the following choices.\nA: yes\nB: probably\nC: maybe\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_15.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: television\nB: remote\nC: phone\nD: computer", + "question": "If the actor do not put something to something, which object will he/she not be able to change in the 
future?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. \nSelect from the following choices.\nA: television\nB: remote\nC: phone\nD: computer", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Throw the cup away\nB: Break the cup\nC: Put cup to the other person\nD: Keep the cup for themselves", + "question": "If the person did not do the first action did after he/she get something from something, what remaining actions in the video is executable?", + "context": 
"Your task is to understand and reasoning about activities and events from the first-person perspective. \nSelect from the following choices.\nA: Throw the cup away\nB: Break the cup\nC: Put cup to the other person\nD: Keep the cup for themselves", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_15.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: mixing\nB: harvesting\nC: watering\nD: pruning", + "question": "What does the person want plant to be for the first action did before the person fill something using 
something in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. \nSelect from the following choices.\nA: mixing\nB: harvesting\nC: watering\nD: pruning", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_15.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: sometimes\nB: yes\nC: no\nD: maybe", + "question": "If the person did not do the first action did after he/she fill something using something, is the person able to work 
on something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. \nSelect from the following choices.\nA: sometimes\nB: yes\nC: no\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: emptiness\nB: happiness\nC: fullness\nD: sadness", + "question": "what status will the person change on juicer-base?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: emptiness\nB: happiness\nC: fullness\nD: sadness", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: yes\nC: uncertain\nD: maybe", + "question": "Did the attribute of water-pot changed because of the first action did after the person sit down on something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes\nC: uncertain\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: kettle\nB: towel\nC: window\nD: chair", + "question": "which object changed its status when the other person do the last action in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: kettle\nB: towel\nC: window\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Put cup to cutting-board\nB: Playing a game\nC: Reading a book\nD: Watching TV", + "question": "what is the other person doing while the person open something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put cup to cutting-board\nB: Playing a game\nC: Reading a book\nD: Watching TV", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: state of separation\nB: state of mixture\nC: state of disintegration\nD: state of dissolution", + "question": "what status of noodles changed while the person do the last action in the video?", + "context": "Your task is to understand and reasoning about activities and 
events from the first-person perspective. \nSelect from the following choices.\nA: state of separation\nB: state of mixture\nC: state of disintegration\nD: state of dissolution", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: in bottle\nB: in cup2\nC: in cup1\nD: on table", + "question": "How would the first action did after the person close something change the state of milk1?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in bottle\nB: in cup2\nC: in cup1\nD: on table", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: off\nB: ignored\nC: on\nD: broken", + "question": "What does the person want kettle to be for the first action did after the person work on something in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: off\nB: ignored\nC: on\nD: broken", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: yes\nC: sometimes\nD: maybe", + "question": "Does the action sitting down on something fulfills the preconditions of the action drinking something with something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes\nC: sometimes\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: box1\nB: box2\nC: wrapping1\nD: wrapping2", + "question": "which object changed its status when the person get something from something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: box1\nB: box2\nC: wrapping1\nD: wrapping2", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: unknown\nC: no\nD: maybe", + "question": "Did the attribute of lettuce changed because of the first action did after the person put something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: unknown\nC: no\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Cook meat using microwave\nB: Cook meat using pan and stove\nC: Cook meat using oven\nD: Cook meat using grill", + "question": "How did the person changed the cookedness of meat12?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Cook meat using microwave\nB: Cook meat using pan and stove\nC: Cook meat using oven\nD: Cook meat using grill", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: partially\nC: yes\nD: maybe", + "question": "Did the attribute of the first object that has status change in the video changed because of the action filling something using something?", + "context": "Your task is to understand and reasoning about activities and 
events from the first-person perspective. \nSelect from the following choices.\nA: no\nB: partially\nC: yes\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: underneath watermelon\nB: on top of watermelon\nC: next to watermelon\nD: inside watermelon", + "question": "What is the precondition of changing the spatial relationships of watermelon1?", + "context": "Your task is to understand and reasoning about activities and events 
from the first-person perspective. \nSelect from the following choices.\nA: underneath watermelon\nB: on top of watermelon\nC: next to watermelon\nD: inside watermelon", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: sometimes\nC: maybe\nD: no", + "question": "Did the attribute of the first object that has status change in the video changed because of the action putting something to something?", + "context": "Your task is to understand and reasoning about activities and events 
from the first-person perspective. \nSelect from the following choices.\nA: yes\nB: sometimes\nC: maybe\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_17.png", 
+ "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: vacuum cleaner\nB: refrigerator\nC: microwave\nD: television", + "question": "which object changed its status when the other person do the first action in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: vacuum cleaner\nB: refrigerator\nC: microwave\nD: television", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Sit on the couch\nB: Check their phone\nC: Go outside\nD: Get bowl and spoon from table", + "question": "What is the person doing after he/she point to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Sit on the couch\nB: Check their phone\nC: Go outside\nD: Get bowl and spoon from table", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: in basin\nB: on ground\nC: in tree\nD: in sky", + "question": "How would the action putting something to something using fishing-net change the state of fish?", + "context": "Your task is to understand and reasoning about activities and events from the first-person 
perspective. \nSelect from the following choices.\nA: in basin\nB: on ground\nC: in tree\nD: in sky", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Read a book\nB: Get remote from table\nC: Go for a walk\nD: Start cooking dinner", + "question": "what will the other person do next?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Read a book\nB: Get remote from table\nC: Go for a walk\nD: Start cooking dinner", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: probably not\nB: maybe\nC: yes\nD: no", + "question": "Did the attribute of juicer changed because of the last action in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: probably not\nB: maybe\nC: yes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Watch TV\nB: Go for a run\nC: Read a book\nD: Wash bowl", + "question": "If the person did not eat something with something, what remaining actions in the video is executable?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Watch TV\nB: Go for a run\nC: Read a book\nD: Wash bowl", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Painting a portrait\nB: Playing a musical instrument\nC: Cook meat using fork and pan and stove\nD: Reading a book by the fireplace", + "question": "what is the other person doing while the person do the last action in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Painting a portrait\nB: Playing a musical instrument\nC: Cook meat using fork and pan and stove\nD: Reading a book by the fireplace", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Put meat to pan using fork\nB: Put meat to pan using knife\nC: Put meat to plate using fork\nD: Put meat to pan using spatula", + "question": "How did the person changed the spatial relationships of meat1?", + "context": "Your task is to understand and reasoning about 
activities and events from the first-person perspective. \nSelect from the following choices.\nA: Put meat to pan using fork\nB: Put meat to pan using knife\nC: Put meat to plate using fork\nD: Put meat to pan using spatula", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_15.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: maybe\nC: uncertain\nD: yes", + "question": "Did the attribute of meat1 changed because of the action getting something from something using fork?", + "context": 
"Your task is to understand and reasoning about activities and events from the first-person perspective. \nSelect from the following choices.\nA: no\nB: maybe\nC: uncertain\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: coffee2\nB: coffee1\nC: bottle\nD: tea", + "question": "which object changed its status last in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: coffee2\nB: coffee1\nC: bottle\nD: tea", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: maybe\nC: uncertain\nD: no", + "question": "Does the first action in the video fulfills the preconditions of the action opening something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: uncertain\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: sometimes\nB: no\nC: maybe\nD: yes", + "question": "If the person did not do the first action in the video, will drawer change its status?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: no\nC: maybe\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: yes, the status changed due to pouring\nC: the action completed without any status change\nD: the attribute has been initialized", + "question": "Did the attribute of the object has status change changed because of the action pouring from something into something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person 
perspective. \nSelect from the following choices.\nA: no\nB: yes, the status changed due to pouring\nC: the action completed without any status change\nD: the attribute has been initialized", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: sometimes\nB: no\nC: yes\nD: maybe", + "question": "If the person did not do the first action did before he/she wash something, is the person able to get something from something?", + "context": "Your task is to understand and reasoning about activities and events from the 
first-person perspective. \nSelect from the following choices.\nA: sometimes\nB: no\nC: yes\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: transparent\nB: blue\nC: empty\nD: nonempty", + "question": "What is the status of trash-can after the other person throw something into something to change it?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: transparent\nB: blue\nC: empty\nD: nonempty", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Put lettuce to trash-can\nB: Moved the trash-can\nC: Cleaned the trash-can\nD: Removed the lettuce", + "question": "What action caused trash-can's status to change to nonempty?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put lettuce to trash-can\nB: Moved the trash-can\nC: Cleaned the trash-can\nD: Removed the lettuce", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Turn on the light\nB: Open the refrigerator\nC: Get knife from knife-base\nD: Sit on the couch", + "question": "During which action does the person knows about the other person's action?", + "context": "Your task is to understand and reasoning about activities and events 
from the first-person perspective. \nSelect from the following choices.\nA: Turn on the light\nB: Open the refrigerator\nC: Get knife from knife-base\nD: Sit on the couch", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: maybe\nB: no\nC: sometimes\nD: yes", + "question": "Did the attribute of vacuum changed because of the action putting something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: sometimes\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: inside the refrigerator\nB: next to the coffee machine\nC: on top of juicer base\nD: under the microwave", + "question": "what will the person want to have the first object that has status change in the video's spatial relationships be in the future?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: inside the refrigerator\nB: next to the coffee machine\nC: on top of juicer base\nD: under the microwave", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Washed the car\nB: Sweep floor using vacuum\nC: Read a book\nD: Cooked dinner", + "question": "How did the person changed the cleanliness of vacuum?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Washed the car\nB: Sweep floor using vacuum\nC: Read a book\nD: Cooked dinner", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: bowl\nB: book\nC: door\nD: lamp", + "question": "which object changed its status when the person get something from something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: bowl\nB: book\nC: door\nD: lamp", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: no\nC: sometimes\nD: always", + "question": "Is the other person aware when the person get something from something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: sometimes\nD: always", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: yes\nC: I don\u2019t know\nD: maybe", + "question": "If the person did not fill something using something, is the person able to do the first action in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes\nC: I don\u2019t know\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Close the tank-lid\nB: Get tank-lid from table\nC: Take a seat\nD: Pour water into the tank", + "question": "what is the other person doing while the person do the first action did after he/she put something to something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Close the tank-lid\nB: Get tank-lid from table\nC: Take a seat\nD: Pour water into the tank", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: only if the object is transparent\nC: sometimes\nD: yes", + "question": "Is cutting-board visible to the other person after the person put something to something?", + "context": "Your task is to understand and reasoning about activities and events from the 
first-person perspective. \nSelect from the following choices.\nA: no\nB: only if the object is transparent\nC: sometimes\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Cut another vegetable\nB: Put knife to knife-base\nC: Throw the knife away\nD: Wash the knife", + "question": "what will the person do next after this video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person 
perspective. \nSelect from the following choices.\nA: Cut another vegetable\nB: Put knife to knife-base\nC: Throw the knife away\nD: Wash the knife", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: in a drawer\nB: on a shelf\nC: under the bed\nD: in trash can", + "question": "How would the action throwing something into something change the state of wrapping?", + "context": "Your task is to understand and reasoning about activities and events from the first-person 
perspective. \nSelect from the following choices.\nA: in a drawer\nB: on a shelf\nC: under the bed\nD: in trash can", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_31.png" + ], + "output": "D" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: in the dishwasher\nB: in sink\nC: on the table\nD: in the fridge", + "question": "What is the status of cup before the other person do the first action after he/she put something to something to change it?", + "context": "Your task is to understand and reasoning about 
activities and events from the first-person perspective. \nSelect from the following choices.\nA: in the dishwasher\nB: in sink\nC: on the table\nD: in the fridge", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: table\nB: tv\nC: window\nD: phone", + "question": "which object changed its status when the person do the first action did after he/she stand-up?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: table\nB: tv\nC: window\nD: phone", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: no\nB: maybe\nC: yes\nD: I don't know", + "question": "If the person did not throw something into something, is the person able to get something from something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: yes\nD: I don't know", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Get fishing-net from basin\nB: Throw the net into the water\nC: Cover the basin with a lid\nD: Pour water from the basin", + "question": "If the person did not pour from something into something, what remaining actions in the video is executable?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get fishing-net from basin\nB: Throw the net into the water\nC: Cover the basin with a lid\nD: Pour water from the basin", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Cut lettuce using knife\nB: Boiling water\nC: Stirring a pot\nD: Peeling an orange", + "question": "What is the person doing after he/she throw something into something?", + "context": "Your task is to understand and reasoning about activities and events from the 
first-person perspective. \nSelect from the following choices.\nA: Cut lettuce using knife\nB: Boiling water\nC: Stirring a pot\nD: Peeling an orange", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: sometimes\nB: yes\nC: no\nD: maybe", + "question": "If the person did not do the first action did after he/she put something to something, is the person able to get something from something?", + "context": "Your task is to understand and reasoning about activities and 
events from the first-person perspective. \nSelect from the following choices.\nA: sometimes\nB: yes\nC: no\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_31.png" + ], + "output": "B" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: yes\nB: maybe\nC: no\nD: I don\u2019t know", + "question": "If the person did not do the first action in the video, will cereal change its status?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: no\nD: I don\u2019t know", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_31.png" + ], + "output": "A" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: maybe\nB: uncertain\nC: yes\nD: no", + "question": "Does the action getting something from something fulfills the preconditions of the last action in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: uncertain\nC: yes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: Used a remote\nB: Pressed a button\nC: Get controller from table\nD: Turned on the switch", + "question": "How did the person changed the poweredness of the first object that has status change in the video?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Used a remote\nB: Pressed a button\nC: Get controller from table\nD: Turned on the switch", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_16.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_17.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_31.png" + ], + "output": "C" + }, + { + "task": "Egocentric_Video_QuestionAnswering", + "visual_input_component": "egocentric image", + "source": "EgoTaskQA", + "options": "A: sometimes\nB: only if the action is prolonged\nC: yes\nD: no", + "question": "Did the attribute of meat changed because of the action closing something?", + "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: only if the action is prolonged\nC: yes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_0.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_1.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_2.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_3.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_4.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_5.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_6.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_7.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_8.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_9.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_10.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_11.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_12.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_13.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_14.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_15.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_16.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_17.png", + 
"../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_18.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_19.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_20.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_21.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_22.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_23.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_24.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_25.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_26.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_27.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_28.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_29.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_30.png", + "../MMIU-Benchmark/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_31.png" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: bottle\nB: lamp\nC: chair\nD: clock", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bottle\nB: lamp\nC: chair\nD: clock", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_0_0.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_0_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_0_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_0_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_0_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_0_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: wardrobe\nB: television stand\nC: radio\nD: xbox", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: wardrobe\nB: television stand\nC: radio\nD: xbox", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_1_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_1_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_1_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_1_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_1_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_1_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: tv stand\nB: sofa\nC: stool\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: tv stand\nB: sofa\nC: stool\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_2_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_2_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_2_2.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_2_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_2_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_2_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: radio\nB: loudspeaker\nC: guitar\nD: microphone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: radio\nB: loudspeaker\nC: guitar\nD: microphone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_3_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_3_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_3_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_3_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_3_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_3_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: cabinet\nB: bathtub\nC: glass box\nD: monitor", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bathtub\nC: glass box\nD: monitor", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_4_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_4_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_4_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_4_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_4_4.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_4_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: mantel\nB: bookshelf\nC: curtain\nD: stool", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: mantel\nB: bookshelf\nC: curtain\nD: stool", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_5_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_5_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_5_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_5_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_5_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_5_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: toilet\nB: sink\nC: bathtub\nD: stool", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: toilet\nB: sink\nC: bathtub\nD: stool", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_6_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_6_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_6_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_6_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_6_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_6_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: 
bowl\nB: table\nC: stairs\nD: laptop", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bowl\nB: table\nC: stairs\nD: laptop", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_7_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_7_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_7_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_7_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_7_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_7_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: television stand\nB: radio\nC: vase\nD: lamp", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: television stand\nB: radio\nC: vase\nD: lamp", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_8_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_8_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_8_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_8_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_8_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_8_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: bookshelf\nB: telephone\nC: chair\nD: desk", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bookshelf\nB: telephone\nC: 
chair\nD: desk", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_9_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_9_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_9_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_9_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_9_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_9_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: loudspeaker\nB: watercraft\nC: airplane\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: loudspeaker\nB: watercraft\nC: airplane\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_10_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_10_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_10_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_10_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_10_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_10_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: dresser\nC: night stand\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: dresser\nC: night stand\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_11_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_11_1.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_11_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_11_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_11_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_11_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: bookshelf\nB: desk\nC: toilet\nD: lamp", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bookshelf\nB: desk\nC: toilet\nD: lamp", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_12_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_12_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_12_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_12_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_12_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_12_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: tv stand\nB: telephone\nC: clock\nD: laptop", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: tv stand\nB: telephone\nC: clock\nD: laptop", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_13_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_13_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_13_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_13_3.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_13_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_13_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: cabinet\nB: lamp\nC: mantel\nD: table", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: lamp\nC: mantel\nD: table", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_14_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_14_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_14_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_14_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_14_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_14_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: clock\nB: vase\nC: bookshelf\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: clock\nB: vase\nC: bookshelf\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_15_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_15_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_15_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_15_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_15_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_15_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + 
"visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: guitar\nB: speaker\nC: table\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: guitar\nB: speaker\nC: table\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_16_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_16_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_16_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_16_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_16_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_16_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: stool\nB: piano\nC: microphone\nD: guitar", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: stool\nB: piano\nC: microphone\nD: guitar", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_17_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_17_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_17_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_17_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_17_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_17_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: desk\nB: chair\nC: sofa\nD: table", + "question": "What is the category of the point cloud based on the multi-view of the point 
cloud?", + "context": "Select from the following choices.\nA: desk\nB: chair\nC: sofa\nD: table", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_18_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_18_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_18_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_18_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_18_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_18_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: night stand\nC: bed\nD: lamp", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: night stand\nC: bed\nD: lamp", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_19_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_19_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_19_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_19_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_19_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_19_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: sofa\nC: table\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: sofa\nC: table\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_20_0.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_20_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_20_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_20_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_20_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_20_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: range hood\nB: clock\nC: telephone\nD: vase", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: range hood\nB: clock\nC: telephone\nD: vase", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_21_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_21_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_21_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_21_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_21_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_21_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: bathtub\nB: airplane\nC: watercraft\nD: car", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bathtub\nB: airplane\nC: watercraft\nD: car", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_22_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_22_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_22_2.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_22_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_22_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_22_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: airplane\nB: bicycle\nC: motorcycle\nD: car", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: airplane\nB: bicycle\nC: motorcycle\nD: car", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_23_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_23_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_23_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_23_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_23_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_23_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: table\nB: night stand\nC: lamp\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: table\nB: night stand\nC: lamp\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_24_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_24_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_24_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_24_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_24_4.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_24_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: car\nB: telephone\nC: toilet\nD: bottle", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: car\nB: telephone\nC: toilet\nD: bottle", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_25_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_25_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_25_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_25_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_25_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_25_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: flower pot\nB: lamp\nC: stairs\nD: plant", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: flower pot\nB: lamp\nC: stairs\nD: plant", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_26_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_26_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_26_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_26_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_26_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_26_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + 
"options": "A: bookshelf\nB: desk\nC: table\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bookshelf\nB: desk\nC: table\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_27_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_27_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_27_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_27_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_27_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_27_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: table\nB: chair\nC: night stand\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: table\nB: chair\nC: night stand\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_28_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_28_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_28_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_28_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_28_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_28_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: faucet\nB: telephone\nC: clock\nD: vase", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: 
faucet\nB: telephone\nC: clock\nD: vase", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_29_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_29_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_29_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_29_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_29_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_29_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: night stand\nB: chair\nC: lamp\nD: dresser", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: night stand\nB: chair\nC: lamp\nD: dresser", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_30_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_30_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_30_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_30_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_30_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_30_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: desk\nB: cabinet\nC: table\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: desk\nB: cabinet\nC: table\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_31_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_31_1.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_31_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_31_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_31_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_31_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: guitar\nB: microphone\nC: piano\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: guitar\nB: microphone\nC: piano\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_32_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_32_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_32_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_32_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_32_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_32_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: mantel\nB: chair\nC: sofa\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: mantel\nB: chair\nC: sofa\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_33_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_33_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_33_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_33_3.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_33_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_33_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: desk\nB: sofa\nC: table\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: desk\nB: sofa\nC: table\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_34_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_34_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_34_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_34_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_34_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_34_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: bike\nB: car\nC: airplane\nD: bus", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bike\nB: car\nC: airplane\nD: bus", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_35_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_35_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_35_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_35_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_35_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_35_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": 
"Poine cloud image", + "source": "ModelNet40", + "options": "A: lamp\nB: plant\nC: chair\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: lamp\nB: plant\nC: chair\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_36_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_36_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_36_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_36_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_36_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_36_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: mug\nB: bottle\nC: glass box\nD: faucet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: mug\nB: bottle\nC: glass box\nD: faucet", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_37_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_37_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_37_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_37_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_37_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_37_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: clock\nB: tv stand\nC: mantel\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": 
"Select from the following choices.\nA: clock\nB: tv stand\nC: mantel\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_38_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_38_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_38_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_38_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_38_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_38_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: lamp\nB: sofa\nC: chair\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: lamp\nB: sofa\nC: chair\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_39_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_39_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_39_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_39_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_39_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_39_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: airplane\nB: boat\nC: sofa\nD: watercraft", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: airplane\nB: boat\nC: sofa\nD: watercraft", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_40_0.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_40_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_40_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_40_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_40_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_40_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: clock\nC: vase\nD: car", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: clock\nC: vase\nD: car", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_41_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_41_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_41_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_41_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_41_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_41_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: monitor\nB: keyboard\nC: television\nD: speaker", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: monitor\nB: keyboard\nC: television\nD: speaker", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_42_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_42_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_42_2.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_42_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_42_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_42_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: sink\nB: bathtub\nC: toilet\nD: faucet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bathtub\nC: toilet\nD: faucet", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_43_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_43_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_43_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_43_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_43_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_43_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: vase\nB: telephone\nC: clock\nD: guitar", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: vase\nB: telephone\nC: clock\nD: guitar", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_44_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_44_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_44_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_44_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_44_4.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_44_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: monitor\nB: loudspeaker\nC: guitar\nD: piano", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: monitor\nB: loudspeaker\nC: guitar\nD: piano", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_45_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_45_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_45_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_45_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_45_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_45_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: tv stand\nB: radio\nC: chair\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: tv stand\nB: radio\nC: chair\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_46_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_46_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_46_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_46_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_46_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_46_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": 
"ModelNet40", + "options": "A: chair\nB: table\nC: sofa\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: table\nC: sofa\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_47_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_47_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_47_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_47_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_47_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_47_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: lamp\nB: sofa\nC: bed\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: lamp\nB: sofa\nC: bed\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_48_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_48_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_48_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_48_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_48_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_48_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: cabinet\nB: chair\nC: telephone\nD: tv stand", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: 
chair\nC: telephone\nD: tv stand", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_49_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_49_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_49_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_49_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_49_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_49_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: toilet\nB: chair\nC: lamp\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: toilet\nB: chair\nC: lamp\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_50_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_50_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_50_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_50_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_50_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_50_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: monitor\nB: airplane\nC: car\nD: person", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: monitor\nB: airplane\nC: car\nD: person", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_51_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_51_1.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_51_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_51_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_51_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_51_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: mantel\nB: sofa\nC: telephone\nD: clock", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: mantel\nB: sofa\nC: telephone\nD: clock", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_52_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_52_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_52_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_52_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_52_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_52_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: glass box\nB: bottle\nC: mug\nD: watercraft", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: glass box\nB: bottle\nC: mug\nD: watercraft", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_53_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_53_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_53_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_53_3.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_53_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_53_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: guitar\nC: vase\nD: clock", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: guitar\nC: vase\nD: clock", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_54_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_54_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_54_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_54_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_54_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_54_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: lamp\nB: pistol\nC: rifle\nD: tv stand", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: lamp\nB: pistol\nC: rifle\nD: tv stand", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_55_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_55_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_55_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_55_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_55_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_55_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + 
"visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: bookshelf\nB: sofa\nC: mantel\nD: television", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bookshelf\nB: sofa\nC: mantel\nD: television", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_56_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_56_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_56_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_56_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_56_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_56_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: lamp\nB: vase\nC: bookshelf\nD: curtain", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: lamp\nB: vase\nC: bookshelf\nD: curtain", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_57_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_57_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_57_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_57_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_57_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_57_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: dresser\nB: bookshelf\nC: stool\nD: night stand", + "question": "What is the category of the point cloud based on the 
multi-view of the point cloud?", + "context": "Select from the following choices.\nA: dresser\nB: bookshelf\nC: stool\nD: night stand", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_58_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_58_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_58_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_58_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_58_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_58_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: vase\nB: clock\nC: tv stand\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: vase\nB: clock\nC: tv stand\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_59_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_59_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_59_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_59_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_59_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_59_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: desk\nB: laptop\nC: keyboard\nD: monitor", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: desk\nB: laptop\nC: keyboard\nD: monitor", + "input_image_path": [ + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_60_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_60_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_60_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_60_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_60_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_60_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: sofa\nB: clock\nC: telephone\nD: guitar", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sofa\nB: clock\nC: telephone\nD: guitar", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_61_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_61_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_61_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_61_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_61_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_61_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: range hood\nB: telephone\nC: clock\nD: bathtub", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: range hood\nB: telephone\nC: clock\nD: bathtub", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_62_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_62_1.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_62_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_62_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_62_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_62_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: loudspeaker\nB: radio\nC: telephone\nD: tv stand", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: loudspeaker\nB: radio\nC: telephone\nD: tv stand", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_63_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_63_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_63_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_63_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_63_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_63_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: guitar\nB: microphone\nC: table\nD: piano", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: guitar\nB: microphone\nC: table\nD: piano", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_64_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_64_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_64_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_64_3.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_64_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_64_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: chair\nC: sofa\nD: table", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: chair\nC: sofa\nD: table", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_65_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_65_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_65_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_65_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_65_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_65_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: stair\nB: keyboard\nC: laptop\nD: cellphone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: stair\nB: keyboard\nC: laptop\nD: cellphone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_66_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_66_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_66_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_66_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_66_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_66_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", 
+ "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: clock\nB: telephone\nC: vase\nD: monitor", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: clock\nB: telephone\nC: vase\nD: monitor", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_67_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_67_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_67_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_67_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_67_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_67_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: night stand\nB: dresser\nC: television\nD: bookshelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: night stand\nB: dresser\nC: television\nD: bookshelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_68_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_68_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_68_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_68_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_68_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_68_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: laptop\nB: monitor\nC: keyboard\nD: telephone", + "question": "What is the category of the point cloud 
based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: laptop\nB: monitor\nC: keyboard\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_69_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_69_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_69_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_69_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_69_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_69_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: bed\nC: lamp\nD: night stand", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: bed\nC: lamp\nD: night stand", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_70_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_70_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_70_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_70_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_70_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_70_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: clock\nC: vase\nD: tv stand", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: clock\nC: vase\nD: tv stand", + "input_image_path": [ + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_71_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_71_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_71_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_71_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_71_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_71_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: bottle\nB: lamp\nC: glass box\nD: mug", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bottle\nB: lamp\nC: glass box\nD: mug", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_72_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_72_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_72_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_72_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_72_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_72_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: clock\nB: telephone\nC: tv stand\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: clock\nB: telephone\nC: tv stand\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_73_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_73_1.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_73_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_73_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_73_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_73_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: lamp\nB: piano\nC: dresser\nD: night stand", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: lamp\nB: piano\nC: dresser\nD: night stand", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_74_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_74_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_74_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_74_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_74_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_74_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: table\nB: sofa\nC: lamp\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: table\nB: sofa\nC: lamp\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_75_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_75_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_75_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_75_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_75_4.jpg", 
+ "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_75_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: radio\nB: chair\nC: desk\nD: bench", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: radio\nB: chair\nC: desk\nD: bench", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_76_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_76_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_76_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_76_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_76_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_76_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: piano\nB: chair\nC: stool\nD: guitar", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: piano\nB: chair\nC: stool\nD: guitar", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_77_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_77_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_77_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_77_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_77_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_77_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: 
faucet\nB: telephone\nC: stool\nD: range hood", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: faucet\nB: telephone\nC: stool\nD: range hood", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_78_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_78_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_78_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_78_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_78_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_78_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: vase\nB: telephone\nC: bookshelf\nD: clock", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: vase\nB: telephone\nC: bookshelf\nD: clock", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_79_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_79_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_79_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_79_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_79_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_79_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: sink\nB: chair\nC: plant\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: 
chair\nC: plant\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_80_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_80_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_80_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_80_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_80_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_80_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: faucet\nB: telephone\nC: table\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: faucet\nB: telephone\nC: table\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_81_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_81_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_81_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_81_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_81_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_81_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: lamp\nC: bookshelf\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: lamp\nC: bookshelf\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_82_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_82_1.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_82_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_82_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_82_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_82_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: tv stand\nB: desk\nC: monitor\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: tv stand\nB: desk\nC: monitor\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_83_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_83_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_83_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_83_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_83_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_83_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: night stand\nB: vase\nC: clock\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: night stand\nB: vase\nC: clock\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_84_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_84_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_84_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_84_3.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_84_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_84_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: desk\nC: stool\nD: bookshelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: desk\nC: stool\nD: bookshelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_85_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_85_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_85_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_85_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_85_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_85_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: rifle\nB: telephone\nC: car\nD: airplane", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: rifle\nB: telephone\nC: car\nD: airplane", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_86_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_86_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_86_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_86_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_86_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_86_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + 
"visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: bottle\nB: sink\nC: toilet\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bottle\nB: sink\nC: toilet\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_87_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_87_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_87_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_87_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_87_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_87_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: vase\nB: television\nC: mirror\nD: decorative bowl", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: vase\nB: television\nC: mirror\nD: decorative bowl", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_88_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_88_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_88_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_88_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_88_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_88_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: bed\nB: chair\nC: sofa\nD: table", + "question": "What is the category of the point cloud based on the multi-view of 
the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: chair\nC: sofa\nD: table", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_89_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_89_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_89_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_89_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_89_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_89_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: sink\nB: faucet\nC: toilet\nD: bathtub", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: faucet\nC: toilet\nD: bathtub", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_90_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_90_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_90_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_90_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_90_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_90_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: table\nB: lamp\nC: vase\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: table\nB: lamp\nC: vase\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_91_0.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_91_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_91_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_91_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_91_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_91_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: bottle\nC: glass box\nD: faucet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: bottle\nC: glass box\nD: faucet", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_92_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_92_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_92_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_92_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_92_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_92_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: faucet\nB: telephone\nC: airplane\nD: clock", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: faucet\nB: telephone\nC: airplane\nD: clock", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_93_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_93_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_93_2.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_93_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_93_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_93_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: faucet\nB: clock\nC: telephone\nD: range hood", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: faucet\nB: clock\nC: telephone\nD: range hood", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_94_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_94_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_94_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_94_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_94_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_94_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: rifle\nB: laptop\nC: clock\nD: pistol", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: rifle\nB: laptop\nC: clock\nD: pistol", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_95_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_95_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_95_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_95_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_95_4.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_95_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: cup\nB: bottle\nC: glass box\nD: mug", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cup\nB: bottle\nC: glass box\nD: mug", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_96_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_96_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_96_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_96_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_96_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_96_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: glass box\nB: television\nC: monitor\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: glass box\nB: television\nC: monitor\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_97_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_97_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_97_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_97_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_97_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_97_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + 
"options": "A: car\nB: telephone\nC: radio\nD: airplane", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: car\nB: telephone\nC: radio\nD: airplane", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_98_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_98_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_98_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_98_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_98_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_98_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: flower pot\nC: lamp\nD: vase", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: flower pot\nC: lamp\nD: vase", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_99_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_99_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_99_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_99_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_99_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_99_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: night stand\nC: bottle\nD: sofa", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: 
night stand\nC: bottle\nD: sofa", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_100_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_100_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_100_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_100_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_100_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_100_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: table\nC: tv stand\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: table\nC: tv stand\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_101_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_101_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_101_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_101_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_101_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_101_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: glass box\nB: table\nC: lamp\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: glass box\nB: table\nC: lamp\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_102_0.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_102_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_102_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_102_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_102_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_102_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: table\nB: vase\nC: chair\nD: lamp", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: table\nB: vase\nC: chair\nD: lamp", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_103_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_103_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_103_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_103_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_103_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_103_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: lamp\nB: chair\nC: table\nD: desk", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: lamp\nB: chair\nC: table\nD: desk", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_104_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_104_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_104_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_104_3.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_104_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_104_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: vase\nB: lamp\nC: flower pot\nD: plant", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: vase\nB: lamp\nC: flower pot\nD: plant", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_105_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_105_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_105_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_105_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_105_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_105_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: table\nB: sofa\nC: chair\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: table\nB: sofa\nC: chair\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_106_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_106_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_106_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_106_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_106_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_106_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + 
"visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: guitar\nC: radio\nD: lamp", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: guitar\nC: radio\nD: lamp", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_107_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_107_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_107_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_107_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_107_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_107_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: bench\nC: table\nD: lamp", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: bench\nC: table\nD: lamp", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_108_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_108_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_108_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_108_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_108_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_108_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: clock\nC: vase\nD: lamp", + "question": "What is the category of the point cloud based on the multi-view of the 
point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: clock\nC: vase\nD: lamp", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_109_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_109_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_109_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_109_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_109_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_109_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: sofa\nB: table\nC: chair\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sofa\nB: table\nC: chair\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_110_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_110_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_110_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_110_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_110_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_110_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: toilet\nC: table\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: toilet\nC: table\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_111_0.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_111_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_111_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_111_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_111_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_111_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: lamp\nB: toilet\nC: vase\nD: table", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: lamp\nB: toilet\nC: vase\nD: table", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_112_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_112_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_112_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_112_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_112_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_112_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: car\nB: vase\nC: telephone\nD: clock", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: car\nB: vase\nC: telephone\nD: clock", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_113_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_113_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_113_2.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_113_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_113_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_113_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: desk\nC: bookshelf\nD: wardrobe", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: desk\nC: bookshelf\nD: wardrobe", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_114_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_114_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_114_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_114_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_114_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_114_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: microphone\nB: table\nC: stool\nD: guitar", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: microphone\nB: table\nC: stool\nD: guitar", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_115_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_115_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_115_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_115_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_115_4.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_115_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: piano\nB: rifle\nC: guitar\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: piano\nB: rifle\nC: guitar\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_116_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_116_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_116_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_116_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_116_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_116_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: stairs\nC: laptop\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: stairs\nC: laptop\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_117_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_117_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_117_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_117_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_117_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_117_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + 
"options": "A: radio\nB: glass box\nC: monitor\nD: lamp", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: radio\nB: glass box\nC: monitor\nD: lamp", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_118_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_118_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_118_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_118_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_118_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_118_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: vase\nB: clock\nC: guitar\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: vase\nB: clock\nC: guitar\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_119_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_119_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_119_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_119_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_119_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_119_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: piano\nC: monitor\nD: laptop", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: 
chair\nB: piano\nC: monitor\nD: laptop", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_120_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_120_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_120_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_120_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_120_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_120_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: clock\nB: telephone\nC: guitar\nD: vase", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: clock\nB: telephone\nC: guitar\nD: vase", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_121_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_121_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_121_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_121_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_121_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_121_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: clock\nB: telephone\nC: vase\nD: plant", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: clock\nB: telephone\nC: vase\nD: plant", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_122_0.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_122_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_122_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_122_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_122_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_122_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: table\nB: lamp\nC: desk\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: table\nB: lamp\nC: desk\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_123_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_123_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_123_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_123_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_123_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_123_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: sink\nB: telephone\nC: chair\nD: toilet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: telephone\nC: chair\nD: toilet", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_124_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_124_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_124_2.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_124_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_124_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_124_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: night stand\nB: chair\nC: bed\nD: lamp", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: night stand\nB: chair\nC: bed\nD: lamp", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_125_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_125_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_125_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_125_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_125_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_125_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: table\nB: sofa\nC: chair\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: table\nB: sofa\nC: chair\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_126_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_126_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_126_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_126_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_126_4.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_126_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: toilet\nC: telephone\nD: vase", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: toilet\nC: telephone\nD: vase", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_127_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_127_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_127_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_127_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_127_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_127_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: bottle\nB: watercraft\nC: airplane\nD: car", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bottle\nB: watercraft\nC: airplane\nD: car", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_128_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_128_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_128_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_128_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_128_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_128_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": 
"ModelNet40", + "options": "A: chair\nB: desk\nC: table\nD: sofa", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: desk\nC: table\nD: sofa", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_129_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_129_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_129_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_129_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_129_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_129_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: clock\nB: vase\nC: stool\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: clock\nB: vase\nC: stool\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_130_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_130_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_130_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_130_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_130_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_130_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: piano\nB: telephone\nC: clock\nD: vase", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: 
piano\nB: telephone\nC: clock\nD: vase", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_131_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_131_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_131_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_131_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_131_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_131_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: bottle\nB: faucet\nC: glass box\nD: radio", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bottle\nB: faucet\nC: glass box\nD: radio", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_132_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_132_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_132_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_132_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_132_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_132_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: vase\nB: faucet\nC: clock\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: vase\nB: faucet\nC: clock\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_133_0.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_133_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_133_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_133_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_133_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_133_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: radio\nB: tv stand\nC: lamp\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: radio\nB: tv stand\nC: lamp\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_134_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_134_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_134_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_134_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_134_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_134_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: night stand\nB: dresser\nC: bed\nD: lamp", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: night stand\nB: dresser\nC: bed\nD: lamp", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_135_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_135_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_135_2.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_135_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_135_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_135_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: wardrobe\nB: curtain\nC: bathtub\nD: desk", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: wardrobe\nB: curtain\nC: bathtub\nD: desk", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_136_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_136_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_136_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_136_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_136_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_136_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: stool\nB: chair\nC: desk\nD: table", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: stool\nB: chair\nC: desk\nD: table", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_137_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_137_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_137_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_137_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_137_4.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_137_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: vase\nB: lamp\nC: telephone\nD: toilet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: vase\nB: lamp\nC: telephone\nD: toilet", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_138_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_138_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_138_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_138_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_138_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_138_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: guitar\nB: telephone\nC: radio\nD: laptop", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: guitar\nB: telephone\nC: radio\nD: laptop", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_139_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_139_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_139_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_139_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_139_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_139_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": 
"ModelNet40", + "options": "A: sink\nB: tv stand\nC: clock\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: tv stand\nC: clock\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_140_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_140_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_140_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_140_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_140_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_140_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: flower pot\nB: clock\nC: vase\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: flower pot\nB: clock\nC: vase\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_141_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_141_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_141_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_141_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_141_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_141_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: lamp\nB: vase\nC: bottle\nD: stool", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the 
following choices.\nA: lamp\nB: vase\nC: bottle\nD: stool", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_142_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_142_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_142_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_142_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_142_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_142_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: sofa\nB: chair\nC: bed\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sofa\nB: chair\nC: bed\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_143_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_143_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_143_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_143_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_143_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_143_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: chair\nC: sofa\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: chair\nC: sofa\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_144_0.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_144_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_144_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_144_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_144_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_144_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: dresser\nB: desk\nC: bathtub\nD: wardrobe", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: dresser\nB: desk\nC: bathtub\nD: wardrobe", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_145_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_145_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_145_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_145_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_145_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_145_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: stool\nB: chair\nC: desk\nD: bench", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: stool\nB: chair\nC: desk\nD: bench", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_146_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_146_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_146_2.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_146_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_146_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_146_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: rifle\nC: lamp\nD: plant", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: rifle\nC: lamp\nD: plant", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_147_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_147_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_147_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_147_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_147_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_147_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: bed\nC: dresser\nD: wardrobe", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: bed\nC: dresser\nD: wardrobe", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_148_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_148_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_148_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_148_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_148_4.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_148_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: curtain\nB: stool\nC: mantel\nD: bookshelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: curtain\nB: stool\nC: mantel\nD: bookshelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_149_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_149_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_149_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_149_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_149_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_149_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: vase\nB: tv stand\nC: clock\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: vase\nB: tv stand\nC: clock\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_150_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_150_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_150_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_150_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_150_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_150_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": 
"ModelNet40", + "options": "A: desk\nB: bookshelf\nC: table\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: desk\nB: bookshelf\nC: table\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_151_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_151_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_151_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_151_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_151_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_151_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: lamp\nB: glass box\nC: mug\nD: bottle", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: lamp\nB: glass box\nC: mug\nD: bottle", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_152_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_152_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_152_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_152_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_152_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_152_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: cellphone\nC: watercraft\nD: laptop", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following 
choices.\nA: chair\nB: cellphone\nC: watercraft\nD: laptop", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_153_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_153_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_153_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_153_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_153_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_153_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: sofa\nC: table\nD: lamp", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: sofa\nC: table\nD: lamp", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_154_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_154_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_154_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_154_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_154_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_154_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: microphone\nB: guitar\nC: piano\nD: stool", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: microphone\nB: guitar\nC: piano\nD: stool", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_155_0.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_155_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_155_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_155_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_155_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_155_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: bookshelf\nB: chair\nC: telephone\nD: table", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bookshelf\nB: chair\nC: telephone\nD: table", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_156_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_156_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_156_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_156_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_156_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_156_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: bed\nB: chair\nC: sofa\nD: lamp", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: chair\nC: sofa\nD: lamp", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_157_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_157_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_157_2.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_157_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_157_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_157_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: bowl\nC: mug\nD: lamp", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: bowl\nC: mug\nD: lamp", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_158_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_158_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_158_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_158_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_158_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_158_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: faucet\nB: bathtub\nC: shower\nD: toilet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: faucet\nB: bathtub\nC: shower\nD: toilet", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_159_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_159_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_159_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_159_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_159_4.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_159_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: cabinet\nB: desk\nC: table\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: desk\nC: table\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_160_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_160_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_160_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_160_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_160_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_160_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: guitar\nB: stool\nC: telephone\nD: clock", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: guitar\nB: stool\nC: telephone\nD: clock", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_161_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_161_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_161_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_161_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_161_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_161_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", 
+ "options": "A: bed\nB: stool\nC: night stand\nD: desk", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: stool\nC: night stand\nD: desk", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_162_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_162_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_162_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_162_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_162_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_162_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: airplane\nB: car\nC: motorcycle\nD: bicycle", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: airplane\nB: car\nC: motorcycle\nD: bicycle", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_163_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_163_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_163_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_163_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_163_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_163_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: tv stand\nB: sofa\nC: stool\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following 
choices.\nA: tv stand\nB: sofa\nC: stool\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_164_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_164_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_164_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_164_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_164_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_164_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: piano\nB: clock\nC: guitar\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: piano\nB: clock\nC: guitar\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_165_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_165_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_165_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_165_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_165_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_165_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: laptop\nB: telephone\nC: stool\nD: airplane", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: laptop\nB: telephone\nC: stool\nD: airplane", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_166_0.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_166_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_166_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_166_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_166_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_166_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: clock\nC: vase\nD: pistol", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: clock\nC: vase\nD: pistol", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_167_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_167_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_167_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_167_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_167_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_167_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: plant\nB: television stand\nC: lamp\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: plant\nB: television stand\nC: lamp\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_168_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_168_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_168_2.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_168_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_168_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_168_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: airplane\nB: lamp\nC: radio\nD: tent", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: airplane\nB: lamp\nC: radio\nD: tent", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_169_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_169_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_169_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_169_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_169_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_169_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: lamp\nB: stairs\nC: piano\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: lamp\nB: stairs\nC: piano\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_170_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_170_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_170_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_170_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_170_4.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_170_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: sink\nB: faucet\nC: bottle\nD: tv stand", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: faucet\nC: bottle\nD: tv stand", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_171_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_171_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_171_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_171_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_171_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_171_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: table\nB: sofa\nC: desk\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: table\nB: sofa\nC: desk\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_172_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_172_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_172_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_172_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_172_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_172_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + 
"options": "A: telephone\nB: vase\nC: chair\nD: toilet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: vase\nC: chair\nD: toilet", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_173_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_173_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_173_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_173_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_173_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_173_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: chair\nC: table\nD: sofa", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: chair\nC: table\nD: sofa", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_174_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_174_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_174_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_174_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_174_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_174_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: sink\nB: bathtub\nC: telephone\nD: toilet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: 
sink\nB: bathtub\nC: telephone\nD: toilet", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_175_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_175_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_175_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_175_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_175_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_175_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: clock\nB: tv stand\nC: telephone\nD: vase", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: clock\nB: tv stand\nC: telephone\nD: vase", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_176_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_176_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_176_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_176_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_176_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_176_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: car\nB: bookshelf\nC: airplane\nD: stool", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: car\nB: bookshelf\nC: airplane\nD: stool", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_177_0.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_177_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_177_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_177_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_177_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_177_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: table\nC: sofa\nD: stool", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: table\nC: sofa\nD: stool", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_178_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_178_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_178_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_178_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_178_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_178_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: faucet\nB: telephone\nC: range hood\nD: stool", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: faucet\nB: telephone\nC: range hood\nD: stool", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_179_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_179_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_179_2.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_179_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_179_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_179_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: table\nC: chair\nD: bookshelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: table\nC: chair\nD: bookshelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_180_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_180_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_180_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_180_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_180_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_180_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: lamp\nB: desk\nC: bookshelf\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: lamp\nB: desk\nC: bookshelf\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_181_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_181_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_181_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_181_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_181_4.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_181_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: sink\nB: bathtub\nC: faucet\nD: toilet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bathtub\nC: faucet\nD: toilet", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_182_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_182_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_182_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_182_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_182_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_182_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: stool\nC: chair\nD: tv stand", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: stool\nC: chair\nD: tv stand", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_183_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_183_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_183_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_183_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_183_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_183_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": 
"ModelNet40", + "options": "A: mantel\nB: stairs\nC: fireplace\nD: sofa", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: mantel\nB: stairs\nC: fireplace\nD: sofa", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_184_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_184_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_184_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_184_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_184_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_184_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: desktop\nB: lamp\nC: radio\nD: glass box", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: desktop\nB: lamp\nC: radio\nD: glass box", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_185_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_185_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_185_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_185_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_185_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_185_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: mantel\nB: plant\nC: radio\nD: tv stand", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the 
following choices.\nA: mantel\nB: plant\nC: radio\nD: tv stand", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_186_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_186_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_186_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_186_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_186_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_186_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: clock\nC: vase\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: clock\nC: vase\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_187_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_187_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_187_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_187_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_187_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_187_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: clock\nB: car\nC: vase\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: clock\nB: car\nC: vase\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_188_0.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_188_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_188_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_188_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_188_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_188_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: telephone\nB: clock\nC: piano\nD: vase", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: telephone\nB: clock\nC: piano\nD: vase", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_189_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_189_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_189_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_189_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_189_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_189_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: television\nB: glass box\nC: chair\nD: bookshelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: television\nB: glass box\nC: chair\nD: bookshelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_190_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_190_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_190_2.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_190_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_190_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_190_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: sink\nB: shower curtain\nC: monitor\nD: toilet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: shower curtain\nC: monitor\nD: toilet", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_191_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_191_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_191_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_191_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_191_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_191_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: lamp\nB: telephone\nC: stool\nD: sofa", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: lamp\nB: telephone\nC: stool\nD: sofa", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_192_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_192_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_192_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_192_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_192_4.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_192_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: chair\nB: keyboard\nC: guitar\nD: telephone", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: keyboard\nC: guitar\nD: telephone", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_193_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_193_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_193_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_193_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_193_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_193_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: bookshelf\nB: desk\nC: chair\nD: mantel", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bookshelf\nB: desk\nC: chair\nD: mantel", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_194_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_194_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_194_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_194_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_194_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_194_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": 
"ModelNet40", + "options": "A: night stand\nB: chair\nC: lamp\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: night stand\nB: chair\nC: lamp\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_195_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_195_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_195_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_195_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_195_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_195_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: sink\nB: bathtub\nC: stool\nD: toilet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bathtub\nC: stool\nD: toilet", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_196_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_196_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_196_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_196_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_196_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_196_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: lamp\nB: plant\nC: flower pot\nD: bookshelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following 
choices.\nA: lamp\nB: plant\nC: flower pot\nD: bookshelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_197_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_197_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_197_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_197_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_197_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_197_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: bottle\nB: mug\nC: keyboard\nD: cup", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bottle\nB: mug\nC: keyboard\nD: cup", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_198_0.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_198_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_198_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_198_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_198_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_198_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_cad_recognition", + "visual_input_component": "Poine cloud image", + "source": "ModelNet40", + "options": "A: stool\nB: chair\nC: piano\nD: guitar", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: stool\nB: chair\nC: piano\nD: guitar", + "input_image_path": [ + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_199_0.jpg", + 
"../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_199_1.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_199_2.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_199_3.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_199_4.jpg", + "../MMIU-Benchmark/threed_cad_recognition/threed_cad_recognition_199_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: cabinet\nC: bin\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: bin\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_0_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_0_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_0_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_0_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_0_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_0_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: box\nB: sink\nC: cabinet\nD: shelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: box\nB: sink\nC: cabinet\nD: shelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_1_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_1_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_1_2.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_1_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_1_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_1_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: cabinet\nC: bag\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: bag\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_2_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_2_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_2_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_2_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_2_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_2_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: box\nB: sink\nC: chair\nD: shelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: box\nB: sink\nC: chair\nD: shelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_3_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_3_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_3_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_3_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_3_4.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_3_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: chair\nB: sofa\nC: bed\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: sofa\nC: bed\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_4_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_4_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_4_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_4_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_4_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_4_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bag\nB: cabinet\nC: desk\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bag\nB: cabinet\nC: desk\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_5_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_5_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_5_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_5_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_5_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_5_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": 
"Poine cloud image", + "source": "ScanObjectNN", + "options": "A: desk\nB: cabinet\nC: shelf\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: desk\nB: cabinet\nC: shelf\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_6_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_6_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_6_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_6_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_6_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_6_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: sink\nC: bag\nD: bin", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: sink\nC: bag\nD: bin", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_7_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_7_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_7_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_7_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_7_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_7_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bed\nC: bin\nD: display", + "question": "What is the category of the point cloud based on the multi-view 
of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bed\nC: bin\nD: display", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_8_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_8_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_8_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_8_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_8_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_8_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: sofa\nC: sink\nD: bag", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: sofa\nC: sink\nD: bag", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_9_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_9_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_9_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_9_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_9_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_9_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: shelf\nC: door\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: shelf\nC: door\nD: cabinet", + "input_image_path": [ + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_10_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_10_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_10_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_10_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_10_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_10_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: desk\nB: chair\nC: sofa\nD: toilet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: desk\nB: chair\nC: sofa\nD: toilet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_11_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_11_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_11_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_11_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_11_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_11_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bed\nC: shelf\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bed\nC: shelf\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_12_0.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_12_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_12_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_12_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_12_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_12_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: shelf\nB: cabinet\nC: desk\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: shelf\nB: cabinet\nC: desk\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_13_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_13_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_13_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_13_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_13_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_13_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bed\nC: shelf\nD: box", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: shelf\nD: box", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_14_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_14_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_14_2.jpg", 
+ "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_14_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_14_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_14_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: chair\nC: cabinet\nD: sofa", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: chair\nC: cabinet\nD: sofa", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_15_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_15_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_15_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_15_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_15_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_15_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bag\nC: cabinet\nD: door", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bag\nC: cabinet\nD: door", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_16_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_16_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_16_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_16_3.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_16_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_16_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: cabinet\nC: table\nD: pillow", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: table\nD: pillow", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_17_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_17_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_17_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_17_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_17_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_17_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: table\nB: sink\nC: bed\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: table\nB: sink\nC: bed\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_18_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_18_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_18_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_18_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_18_4.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_18_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bag\nC: sink\nD: box", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bag\nC: sink\nD: box", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_19_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_19_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_19_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_19_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_19_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_19_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: sink\nC: shelf\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: sink\nC: shelf\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_20_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_20_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_20_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_20_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_20_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_20_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + 
"visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sofa\nB: cabinet\nC: chair\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sofa\nB: cabinet\nC: chair\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_21_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_21_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_21_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_21_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_21_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_21_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: cabinet\nC: bed\nD: display", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: bed\nD: display", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_22_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_22_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_22_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_22_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_22_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_22_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: display\nB: shelf\nC: bin\nD: cabinet", + "question": "What is the 
category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: display\nB: shelf\nC: bin\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_23_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_23_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_23_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_23_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_23_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_23_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bed\nC: display\nD: shelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: display\nD: shelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_24_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_24_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_24_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_24_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_24_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_24_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: pillow\nC: chair\nD: desk", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: pillow\nC: chair\nD: 
desk", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_25_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_25_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_25_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_25_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_25_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_25_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: pillow\nB: shelf\nC: sink\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: pillow\nB: shelf\nC: sink\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_26_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_26_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_26_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_26_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_26_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_26_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bed\nC: shelf\nD: table", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: shelf\nD: table", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_27_0.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_27_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_27_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_27_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_27_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_27_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bed\nC: bag\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: bag\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_28_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_28_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_28_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_28_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_28_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_28_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: shelf\nB: cabinet\nC: sofa\nD: desk", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: shelf\nB: cabinet\nC: sofa\nD: desk", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_29_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_29_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_29_2.jpg", 
+ "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_29_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_29_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_29_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: chair\nC: door\nD: pillow", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: chair\nC: door\nD: pillow", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_30_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_30_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_30_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_30_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_30_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_30_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: chair\nB: display\nC: bed\nD: sofa", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: display\nC: bed\nD: sofa", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_31_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_31_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_31_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_31_3.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_31_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_31_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: pillow\nB: sofa\nC: cabinet\nD: toilet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: pillow\nB: sofa\nC: cabinet\nD: toilet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_32_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_32_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_32_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_32_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_32_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_32_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: cabinet\nC: table\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: table\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_33_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_33_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_33_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_33_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_33_4.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_33_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bag\nC: sink\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bag\nC: sink\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_34_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_34_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_34_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_34_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_34_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_34_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: cabinet\nC: bin\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: bin\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_35_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_35_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_35_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_35_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_35_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_35_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + 
"visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: cabinet\nC: door\nD: bag", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: door\nD: bag", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_36_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_36_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_36_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_36_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_36_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_36_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: display\nB: cabinet\nC: sink\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: display\nB: cabinet\nC: sink\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_37_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_37_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_37_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_37_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_37_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_37_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: sink\nC: cabinet\nD: bag", + "question": "What is the 
category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: sink\nC: cabinet\nD: bag", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_38_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_38_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_38_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_38_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_38_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_38_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sofa\nB: cabinet\nC: bed\nD: shelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sofa\nB: cabinet\nC: bed\nD: shelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_39_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_39_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_39_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_39_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_39_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_39_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: desk\nC: shelf\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: desk\nC: shelf\nD: cabinet", 
+ "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_40_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_40_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_40_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_40_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_40_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_40_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: cabinet\nC: bed\nD: table", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: bed\nD: table", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_41_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_41_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_41_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_41_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_41_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_41_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: display\nB: table\nC: bed\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: display\nB: table\nC: bed\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_42_0.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_42_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_42_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_42_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_42_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_42_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bed\nC: cabinet\nD: shelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bed\nC: cabinet\nD: shelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_43_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_43_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_43_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_43_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_43_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_43_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: shelf\nB: bed\nC: cabinet\nD: door", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: shelf\nB: bed\nC: cabinet\nD: door", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_44_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_44_1.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_44_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_44_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_44_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_44_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bag\nB: sink\nC: cabinet\nD: table", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bag\nB: sink\nC: cabinet\nD: table", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_45_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_45_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_45_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_45_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_45_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_45_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: display\nB: toilet\nC: chair\nD: table", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: display\nB: toilet\nC: chair\nD: table", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_46_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_46_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_46_2.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_46_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_46_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_46_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: sink\nC: cabinet\nD: bag", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: sink\nC: cabinet\nD: bag", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_47_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_47_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_47_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_47_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_47_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_47_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bag\nB: sink\nC: bed\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bag\nB: sink\nC: bed\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_48_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_48_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_48_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_48_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_48_4.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_48_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: desk\nB: shelf\nC: bed\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: desk\nB: shelf\nC: bed\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_49_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_49_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_49_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_49_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_49_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_49_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bag\nC: bed\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bag\nC: bed\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_50_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_50_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_50_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_50_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_50_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_50_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + 
"visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bag\nC: sink\nD: box", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bag\nC: sink\nD: box", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_51_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_51_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_51_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_51_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_51_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_51_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: box\nB: display\nC: bed\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: box\nB: display\nC: bed\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_52_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_52_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_52_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_52_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_52_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_52_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bin\nB: cabinet\nC: bed\nD: display", + "question": "What is the category 
of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bin\nB: cabinet\nC: bed\nD: display", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_53_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_53_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_53_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_53_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_53_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_53_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: sink\nC: bag\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: sink\nC: bag\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_54_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_54_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_54_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_54_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_54_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_54_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: box\nB: cabinet\nC: bed\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: box\nB: cabinet\nC: bed\nD: sink", + 
"input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_55_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_55_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_55_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_55_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_55_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_55_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: door\nC: bed\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: door\nC: bed\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_56_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_56_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_56_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_56_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_56_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_56_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bed\nC: chair\nD: bin", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: chair\nD: bin", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_57_0.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_57_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_57_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_57_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_57_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_57_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: sink\nC: cabinet\nD: bag", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: sink\nC: cabinet\nD: bag", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_58_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_58_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_58_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_58_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_58_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_58_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bed\nC: box\nD: table", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: box\nD: table", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_59_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_59_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_59_2.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_59_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_59_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_59_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sofa\nB: cabinet\nC: sink\nD: display", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sofa\nB: cabinet\nC: sink\nD: display", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_60_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_60_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_60_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_60_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_60_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_60_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bag\nC: sink\nD: display", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bag\nC: sink\nD: display", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_61_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_61_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_61_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_61_3.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_61_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_61_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: display\nB: cabinet\nC: sofa\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: display\nB: cabinet\nC: sofa\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_62_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_62_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_62_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_62_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_62_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_62_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: shelf\nB: box\nC: chair\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: shelf\nB: box\nC: chair\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_63_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_63_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_63_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_63_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_63_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_63_5.jpg" 
+ ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: display\nC: chair\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: display\nC: chair\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_64_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_64_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_64_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_64_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_64_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_64_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bag\nC: bed\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bag\nC: bed\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_65_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_65_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_65_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_65_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_65_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_65_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": 
"A: shelf\nB: cabinet\nC: display\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: shelf\nB: cabinet\nC: display\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_66_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_66_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_66_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_66_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_66_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_66_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: pillow\nC: cabinet\nD: toilet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: pillow\nC: cabinet\nD: toilet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_67_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_67_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_67_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_67_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_67_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_67_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: shelf\nB: bed\nC: door\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + 
"context": "Select from the following choices.\nA: shelf\nB: bed\nC: door\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_68_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_68_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_68_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_68_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_68_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_68_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bin\nC: cabinet\nD: display", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bin\nC: cabinet\nD: display", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_69_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_69_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_69_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_69_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_69_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_69_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: shelf\nC: chair\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: shelf\nC: chair\nD: cabinet", + "input_image_path": [ + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_70_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_70_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_70_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_70_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_70_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_70_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: cabinet\nC: chair\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: chair\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_71_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_71_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_71_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_71_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_71_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_71_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bed\nC: shelf\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bed\nC: shelf\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_72_0.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_72_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_72_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_72_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_72_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_72_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bed\nC: shelf\nD: pillow", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: shelf\nD: pillow", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_73_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_73_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_73_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_73_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_73_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_73_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: toilet\nB: bag\nC: cabinet\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: toilet\nB: bag\nC: cabinet\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_74_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_74_1.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_74_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_74_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_74_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_74_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bin\nC: sink\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bin\nC: sink\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_75_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_75_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_75_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_75_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_75_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_75_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: cabinet\nC: bed\nD: display", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: bed\nD: display", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_76_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_76_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_76_2.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_76_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_76_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_76_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: chair\nB: cabinet\nC: bed\nD: sofa", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: cabinet\nC: bed\nD: sofa", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_77_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_77_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_77_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_77_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_77_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_77_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: display\nC: cabinet\nD: bin", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: display\nC: cabinet\nD: bin", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_78_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_78_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_78_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_78_3.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_78_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_78_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bed\nC: bag\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: bag\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_79_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_79_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_79_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_79_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_79_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_79_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bag\nB: sink\nC: bed\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bag\nB: sink\nC: bed\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_80_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_80_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_80_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_80_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_80_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_80_5.jpg" + ], + 
"output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sofa\nB: display\nC: sink\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sofa\nB: display\nC: sink\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_81_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_81_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_81_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_81_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_81_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_81_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: display\nC: shelf\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: display\nC: shelf\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_82_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_82_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_82_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_82_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_82_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_82_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": 
"A: bed\nB: sofa\nC: chair\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: sofa\nC: chair\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_83_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_83_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_83_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_83_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_83_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_83_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: table\nB: cabinet\nC: bed\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: table\nB: cabinet\nC: bed\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_84_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_84_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_84_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_84_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_84_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_84_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bin\nB: box\nC: cabinet\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select 
from the following choices.\nA: bin\nB: box\nC: cabinet\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_85_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_85_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_85_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_85_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_85_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_85_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: cabinet\nC: bin\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: bin\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_86_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_86_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_86_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_86_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_86_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_86_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: cabinet\nC: bag\nD: display", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: bag\nD: display", + "input_image_path": [ + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_87_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_87_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_87_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_87_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_87_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_87_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: chair\nC: door\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: chair\nC: door\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_88_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_88_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_88_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_88_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_88_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_88_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bed\nC: sink\nD: sofa", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: sink\nD: sofa", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_89_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_89_1.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_89_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_89_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_89_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_89_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: chair\nC: cabinet\nD: desk", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: chair\nC: cabinet\nD: desk", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_90_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_90_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_90_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_90_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_90_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_90_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: sink\nC: bag\nD: box", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: sink\nC: bag\nD: box", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_91_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_91_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_91_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_91_3.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_91_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_91_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: display\nC: sink\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: display\nC: sink\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_92_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_92_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_92_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_92_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_92_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_92_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bag\nB: sink\nC: cabinet\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bag\nB: sink\nC: cabinet\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_93_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_93_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_93_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_93_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_93_4.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_93_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: sink\nC: chair\nD: display", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: sink\nC: chair\nD: display", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_94_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_94_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_94_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_94_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_94_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_94_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: desk\nB: bed\nC: chair\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: desk\nB: bed\nC: chair\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_95_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_95_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_95_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_95_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_95_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_95_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + 
"visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: sofa\nC: shelf\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: sofa\nC: shelf\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_96_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_96_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_96_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_96_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_96_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_96_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: bin\nC: cabinet\nD: sofa", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: bin\nC: cabinet\nD: sofa", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_97_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_97_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_97_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_97_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_97_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_97_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: sink\nC: bed\nD: bag", + "question": "What is the category of 
the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: sink\nC: bed\nD: bag", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_98_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_98_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_98_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_98_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_98_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_98_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: toilet\nC: bag\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: toilet\nC: bag\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_99_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_99_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_99_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_99_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_99_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_99_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: shelf\nB: chair\nC: bed\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: shelf\nB: chair\nC: bed\nD: cabinet", + 
"input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_100_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_100_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_100_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_100_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_100_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_100_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bin\nC: cabinet\nD: box", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bin\nC: cabinet\nD: box", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_101_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_101_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_101_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_101_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_101_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_101_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bin\nC: bag\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bin\nC: bag\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_102_0.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_102_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_102_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_102_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_102_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_102_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: cabinet\nC: toilet\nD: display", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: toilet\nD: display", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_103_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_103_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_103_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_103_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_103_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_103_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bin\nB: bed\nC: table\nD: shelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bin\nB: bed\nC: table\nD: shelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_104_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_104_1.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_104_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_104_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_104_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_104_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: table\nC: sink\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: table\nC: sink\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_105_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_105_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_105_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_105_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_105_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_105_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bin\nC: bed\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bin\nC: bed\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_106_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_106_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_106_2.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_106_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_106_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_106_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: cabinet\nC: chair\nD: shelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: chair\nD: shelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_107_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_107_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_107_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_107_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_107_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_107_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: desk\nB: chair\nC: bin\nD: display", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: desk\nB: chair\nC: bin\nD: display", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_108_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_108_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_108_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_108_3.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_108_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_108_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: display\nC: sink\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: display\nC: sink\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_109_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_109_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_109_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_109_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_109_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_109_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: chair\nB: cabinet\nC: bin\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: cabinet\nC: bin\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_110_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_110_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_110_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_110_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_110_4.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_110_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: chair\nB: bed\nC: sofa\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: bed\nC: sofa\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_111_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_111_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_111_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_111_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_111_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_111_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: chair\nB: table\nC: bed\nD: toilet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: table\nC: bed\nD: toilet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_112_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_112_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_112_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_112_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_112_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_112_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", 
+ "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: shelf\nC: table\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: shelf\nC: table\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_113_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_113_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_113_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_113_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_113_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_113_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: table\nC: chair\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: table\nC: chair\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_114_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_114_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_114_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_114_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_114_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_114_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bag\nB: box\nC: toilet\nD: pillow", + "question": 
"What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bag\nB: box\nC: toilet\nD: pillow", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_115_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_115_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_115_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_115_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_115_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_115_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: door\nC: bag\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: door\nC: bag\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_116_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_116_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_116_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_116_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_116_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_116_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: chair\nB: desk\nC: bed\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: 
desk\nC: bed\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_117_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_117_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_117_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_117_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_117_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_117_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: table\nB: sink\nC: cabinet\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: table\nB: sink\nC: cabinet\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_118_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_118_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_118_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_118_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_118_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_118_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bin\nB: bed\nC: bag\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bin\nB: bed\nC: bag\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_119_0.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_119_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_119_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_119_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_119_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_119_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: display\nB: cabinet\nC: sink\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: display\nB: cabinet\nC: sink\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_120_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_120_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_120_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_120_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_120_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_120_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: pillow\nB: bed\nC: cabinet\nD: background", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: pillow\nB: bed\nC: cabinet\nD: background", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_121_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_121_1.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_121_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_121_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_121_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_121_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: pillow\nB: sofa\nC: bed\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: pillow\nB: sofa\nC: bed\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_122_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_122_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_122_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_122_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_122_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_122_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: display\nB: bed\nC: sink\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: display\nB: bed\nC: sink\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_123_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_123_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_123_2.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_123_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_123_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_123_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: cabinet\nC: shelf\nD: box", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: shelf\nD: box", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_124_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_124_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_124_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_124_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_124_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_124_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: cabinet\nC: shelf\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: shelf\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_125_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_125_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_125_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_125_3.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_125_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_125_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bed\nC: cabinet\nD: bag", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bed\nC: cabinet\nD: bag", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_126_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_126_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_126_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_126_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_126_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_126_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: door\nB: cabinet\nC: sink\nD: shelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: door\nB: cabinet\nC: sink\nD: shelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_127_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_127_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_127_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_127_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_127_4.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_127_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: cabinet\nC: toilet\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: toilet\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_128_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_128_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_128_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_128_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_128_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_128_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bed\nC: desk\nD: shelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: desk\nD: shelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_129_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_129_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_129_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_129_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_129_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_129_5.jpg" + ], + "output": "D" + }, + { + "task": 
"threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: sink\nC: bed\nD: bag", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: sink\nC: bed\nD: bag", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_130_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_130_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_130_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_130_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_130_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_130_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sofa\nB: cabinet\nC: toilet\nD: shelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sofa\nB: cabinet\nC: toilet\nD: shelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_131_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_131_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_131_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_131_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_131_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_131_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: cabinet\nC: 
bag\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: bag\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_132_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_132_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_132_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_132_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_132_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_132_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bin\nB: cabinet\nC: display\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bin\nB: cabinet\nC: display\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_133_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_133_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_133_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_133_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_133_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_133_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: cabinet\nC: bag\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the 
following choices.\nA: sink\nB: cabinet\nC: bag\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_134_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_134_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_134_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_134_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_134_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_134_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bed\nC: cabinet\nD: shelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bed\nC: cabinet\nD: shelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_135_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_135_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_135_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_135_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_135_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_135_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: toilet\nB: cabinet\nC: sink\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: toilet\nB: cabinet\nC: sink\nD: bed", + "input_image_path": [ + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_136_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_136_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_136_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_136_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_136_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_136_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: pillow\nB: display\nC: chair\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: pillow\nB: display\nC: chair\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_137_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_137_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_137_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_137_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_137_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_137_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bed\nC: cabinet\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bed\nC: cabinet\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_138_0.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_138_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_138_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_138_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_138_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_138_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bed\nC: box\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bed\nC: box\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_139_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_139_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_139_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_139_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_139_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_139_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bin\nB: display\nC: sink\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bin\nB: display\nC: sink\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_140_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_140_1.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_140_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_140_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_140_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_140_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: cabinet\nC: table\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: table\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_141_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_141_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_141_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_141_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_141_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_141_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: shelf\nC: bed\nD: sofa", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: shelf\nC: bed\nD: sofa", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_142_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_142_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_142_2.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_142_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_142_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_142_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: box\nC: sink\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: box\nC: sink\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_143_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_143_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_143_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_143_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_143_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_143_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bin\nB: cabinet\nC: shelf\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bin\nB: cabinet\nC: shelf\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_144_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_144_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_144_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_144_3.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_144_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_144_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: shelf\nC: chair\nD: table", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: shelf\nC: chair\nD: table", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_145_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_145_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_145_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_145_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_145_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_145_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bed\nC: box\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bed\nC: box\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_146_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_146_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_146_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_146_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_146_4.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_146_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: desk\nB: bed\nC: cabinet\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: desk\nB: bed\nC: cabinet\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_147_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_147_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_147_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_147_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_147_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_147_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: chair\nB: bed\nC: cabinet\nD: sofa", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: bed\nC: cabinet\nD: sofa", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_148_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_148_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_148_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_148_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_148_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_148_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", 
+ "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: cabinet\nC: display\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: display\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_149_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_149_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_149_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_149_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_149_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_149_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: sink\nC: bag\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: sink\nC: bag\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_150_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_150_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_150_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_150_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_150_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_150_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: display\nB: sink\nC: bag\nD: cabinet", + "question": 
"What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: display\nB: sink\nC: bag\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_151_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_151_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_151_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_151_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_151_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_151_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: door\nB: cabinet\nC: bed\nD: shelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: door\nB: cabinet\nC: bed\nD: shelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_152_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_152_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_152_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_152_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_152_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_152_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: bin\nC: shelf\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: 
bin\nC: shelf\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_153_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_153_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_153_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_153_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_153_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_153_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: chair\nB: bed\nC: pillow\nD: sofa", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: bed\nC: pillow\nD: sofa", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_154_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_154_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_154_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_154_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_154_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_154_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: cabinet\nC: bed\nD: display", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: bed\nD: display", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_155_0.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_155_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_155_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_155_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_155_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_155_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bag\nC: sink\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bag\nC: sink\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_156_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_156_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_156_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_156_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_156_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_156_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bin\nB: display\nC: toilet\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bin\nB: display\nC: toilet\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_157_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_157_1.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_157_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_157_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_157_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_157_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: chair\nB: sink\nC: cabinet\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: sink\nC: cabinet\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_158_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_158_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_158_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_158_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_158_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_158_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: shelf\nB: bed\nC: table\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: shelf\nB: bed\nC: table\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_159_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_159_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_159_2.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_159_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_159_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_159_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: pillow\nC: bed\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: pillow\nC: bed\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_160_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_160_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_160_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_160_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_160_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_160_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bin\nC: cabinet\nD: bag", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bin\nC: cabinet\nD: bag", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_161_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_161_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_161_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_161_3.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_161_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_161_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: toilet\nC: cabinet\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: toilet\nC: cabinet\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_162_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_162_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_162_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_162_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_162_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_162_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bed\nC: display\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bed\nC: display\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_163_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_163_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_163_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_163_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_163_4.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_163_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: bin\nC: cabinet\nD: door", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: bin\nC: cabinet\nD: door", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_164_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_164_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_164_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_164_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_164_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_164_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bag\nB: sink\nC: door\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bag\nB: sink\nC: door\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_165_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_165_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_165_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_165_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_165_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_165_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + 
"visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: desk\nB: cabinet\nC: shelf\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: desk\nB: cabinet\nC: shelf\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_166_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_166_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_166_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_166_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_166_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_166_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: chair\nB: cabinet\nC: desk\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: cabinet\nC: desk\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_167_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_167_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_167_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_167_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_167_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_167_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sofa\nB: chair\nC: desk\nD: bed", + "question": "What is 
the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sofa\nB: chair\nC: desk\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_168_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_168_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_168_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_168_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_168_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_168_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: display\nB: bag\nC: sink\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: display\nB: bag\nC: sink\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_169_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_169_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_169_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_169_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_169_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_169_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bag\nB: cabinet\nC: bed\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bag\nB: cabinet\nC: 
bed\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_170_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_170_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_170_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_170_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_170_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_170_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: shelf\nB: cabinet\nC: sink\nD: door", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: shelf\nB: cabinet\nC: sink\nD: door", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_171_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_171_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_171_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_171_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_171_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_171_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: sink\nC: door\nD: shelf", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: sink\nC: door\nD: shelf", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_172_0.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_172_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_172_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_172_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_172_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_172_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: shelf\nC: sink\nD: door", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: shelf\nC: sink\nD: door", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_173_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_173_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_173_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_173_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_173_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_173_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: cabinet\nC: sink\nD: table", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: sink\nD: table", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_174_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_174_1.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_174_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_174_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_174_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_174_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: table\nB: cabinet\nC: sink\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: table\nB: cabinet\nC: sink\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_175_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_175_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_175_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_175_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_175_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_175_5.jpg" + ], + "output": "A" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: chair\nB: sink\nC: cabinet\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: sink\nC: cabinet\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_176_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_176_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_176_2.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_176_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_176_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_176_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: chair\nB: shelf\nC: cabinet\nD: display", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: shelf\nC: cabinet\nD: display", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_177_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_177_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_177_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_177_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_177_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_177_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bed\nC: bag\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bed\nC: bag\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_178_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_178_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_178_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_178_3.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_178_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_178_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: chair\nC: cabinet\nD: bag", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: chair\nC: cabinet\nD: bag", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_179_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_179_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_179_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_179_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_179_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_179_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bed\nC: table\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bed\nC: table\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_180_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_180_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_180_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_180_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_180_4.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_180_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: door\nC: sink\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: door\nC: sink\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_181_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_181_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_181_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_181_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_181_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_181_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: sofa\nC: bed\nD: bin", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: sofa\nC: bed\nD: bin", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_182_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_182_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_182_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_182_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_182_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_182_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + 
"visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: shelf\nB: display\nC: bed\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: shelf\nB: display\nC: bed\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_183_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_183_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_183_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_183_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_183_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_183_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: display\nB: sofa\nC: pillow\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: display\nB: sofa\nC: pillow\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_184_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_184_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_184_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_184_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_184_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_184_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bin\nB: bed\nC: cabinet\nD: pillow", + 
"question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bin\nB: bed\nC: cabinet\nD: pillow", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_185_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_185_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_185_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_185_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_185_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_185_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: chair\nC: sink\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: chair\nC: sink\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_186_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_186_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_186_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_186_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_186_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_186_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: display\nB: cabinet\nC: shelf\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following 
choices.\nA: display\nB: cabinet\nC: shelf\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_187_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_187_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_187_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_187_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_187_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_187_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: cabinet\nC: door\nD: bag", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: door\nD: bag", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_188_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_188_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_188_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_188_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_188_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_188_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: cabinet\nC: table\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: table\nD: bed", + "input_image_path": [ + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_189_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_189_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_189_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_189_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_189_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_189_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: shelf\nC: bed\nD: chair", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: shelf\nC: bed\nD: chair", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_190_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_190_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_190_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_190_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_190_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_190_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: display\nB: cabinet\nC: bed\nD: sofa", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: display\nB: cabinet\nC: bed\nD: sofa", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_191_0.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_191_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_191_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_191_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_191_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_191_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: door\nC: cabinet\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: door\nC: cabinet\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_192_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_192_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_192_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_192_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_192_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_192_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bag\nB: table\nC: sink\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bag\nB: table\nC: sink\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_193_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_193_1.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_193_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_193_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_193_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_193_5.jpg" + ], + "output": "B" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: shelf\nB: bed\nC: cabinet\nD: door", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: shelf\nB: bed\nC: cabinet\nD: door", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_194_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_194_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_194_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_194_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_194_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_194_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bed\nB: sink\nC: toilet\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bed\nB: sink\nC: toilet\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_195_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_195_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_195_2.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_195_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_195_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_195_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: sink\nB: bed\nC: bin\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: sink\nB: bed\nC: bin\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_196_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_196_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_196_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_196_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_196_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_196_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: bag\nB: sink\nC: bed\nD: cabinet", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: bag\nB: sink\nC: bed\nD: cabinet", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_197_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_197_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_197_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_197_3.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_197_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_197_5.jpg" + ], + "output": "D" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: chair\nB: table\nC: sofa\nD: bed", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: chair\nB: table\nC: sofa\nD: bed", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_198_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_198_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_198_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_198_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_198_4.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_198_5.jpg" + ], + "output": "C" + }, + { + "task": "threed_indoor_recognition", + "visual_input_component": "Poine cloud image", + "source": "ScanObjectNN", + "options": "A: cabinet\nB: bin\nC: bed\nD: sink", + "question": "What is the category of the point cloud based on the multi-view of the point cloud?", + "context": "Select from the following choices.\nA: cabinet\nB: bin\nC: bed\nD: sink", + "input_image_path": [ + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_199_0.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_199_1.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_199_2.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_199_3.jpg", + "../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_199_4.jpg", + 
"../MMIU-Benchmark/threed_indoor_recognition/threed_indoor_recognition_199_5.jpg" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: uncertain\nC: no\nD: maybe", + "question": "Are there any things?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. \nSelect from the following choices.\nA: yes\nB: uncertain\nC: no\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_0_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_0_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_0_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_0_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_0_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_0_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_0_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_0_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_0_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_0_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_0_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_0_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: walking\nB: lying down\nC: sitting\nD: standing", + "question": "What is the status of the pedestrian to the back right of me?", + "context": "Your task is : Given inputs of the 3D 
information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. \nSelect from the following choices.\nA: walking\nB: lying down\nC: sitting\nD: standing", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_1_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_1_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_1_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_1_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_1_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_1_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_1_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_1_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_1_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_1_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_1_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_1_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 5\nB: 2\nC: 3\nD: 0", + "question": "How many cars are to the front left of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 2\nC: 3\nD: 0", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_2_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_2_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_2_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_2_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_2_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_2_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_2_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_2_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_2_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_2_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_2_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_2_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 5\nB: 12\nC: 3\nD: 8", + "question": "How many moving things are there?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 12\nC: 3\nD: 8", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_3_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_3_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_3_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_3_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_3_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_3_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_3_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_3_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_3_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_3_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_3_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_3_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: moving\nB: idling\nC: broken down\nD: parked", + "question": "The truck to the back right of the bus is in what status?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: idling\nC: broken down\nD: parked", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_4_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_4_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_4_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_4_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_4_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_4_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_4_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_4_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_4_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_4_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_4_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_4_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: not sure\nC: yes\nD: unknown", + "question": "Are there any other pedestrians of the same status as the thing that is to the front of the bicycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: not sure\nC: yes\nD: unknown", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_5_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_5_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_5_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_5_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_5_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_5_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_5_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_5_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_5_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_5_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_5_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_5_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: new\nB: without rider\nC: for sale\nD: broken", + "question": "There is a motorcycle; what status is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: new\nB: without rider\nC: for sale\nD: broken", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_6_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_6_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_6_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_6_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_6_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_6_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_6_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_6_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_6_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_6_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_6_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_6_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: sitting\nB: jumping\nC: stationary\nD: moving", + "question": "What status is the pedestrian that is to the front of the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: sitting\nB: jumping\nC: stationary\nD: moving", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_7_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_7_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_7_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_7_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_7_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_7_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_7_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_7_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_7_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_7_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_7_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_7_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: on the road\nB: without rider\nC: being ridden by someone\nD: inside the truck", + "question": "What is the status of the motorcycle to the front of the parked truck?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: on the road\nB: without rider\nC: being ridden by someone\nD: inside the truck", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_8_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_8_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_8_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_8_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_8_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_8_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_8_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_8_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_8_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_8_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_8_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_8_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 7\nB: 5\nC: 3\nD: 10", + "question": "How many cars are to the back of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 7\nB: 5\nC: 3\nD: 10", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_9_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_9_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_9_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_9_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_9_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_9_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_9_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_9_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_9_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_9_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_9_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_9_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: with rider\nB: in repair\nC: being sold\nD: locked up", + "question": "The bicycle is in what status?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: with rider\nB: in repair\nC: being sold\nD: locked up", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_10_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_10_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_10_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_10_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_10_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_10_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_10_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_10_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_10_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_10_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_10_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_10_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 2\nB: 4\nC: 10\nD: 8", + "question": "How many other things are there of the same status as the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 2\nB: 4\nC: 10\nD: 8", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_11_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_11_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_11_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_11_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_11_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_11_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_11_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_11_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_11_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_11_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_11_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_11_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: I don\u2019t know\nC: no\nD: yes", + "question": "Is the status of the thing that is to the front left of the construction vehicle the same as the car to the back of the motorcycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: I don\u2019t know\nC: no\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_12_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_12_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_12_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_12_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_12_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_12_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_12_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_12_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_12_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_12_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_12_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_12_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: waiting\nB: moving\nC: departing\nD: stopped", + "question": "There is a bus; what status is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: waiting\nB: moving\nC: departing\nD: stopped", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_13_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_13_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_13_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_13_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_13_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_13_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_13_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_13_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_13_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_13_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_13_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_13_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 6\nB: 4\nC: 2\nD: 7", + "question": "There is a bicycle; how many moving things are to the back right of it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 6\nB: 4\nC: 2\nD: 7", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_14_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_14_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_14_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_14_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_14_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_14_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_14_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_14_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_14_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_14_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_14_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_14_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: no\nC: possibly\nD: maybe", + "question": "Are any trucks visible?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: no\nC: possibly\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_15_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_15_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_15_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_15_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_15_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_15_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_15_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_15_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_15_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_15_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_15_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_15_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 5\nB: 12\nC: 3\nD: 9", + "question": "What number of parked cars are to the front left of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 12\nC: 3\nD: 9", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_16_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_16_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_16_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_16_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_16_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_16_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_16_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_16_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_16_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_16_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_16_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_16_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: broken\nB: with rider\nC: new\nD: without rider", + "question": "The bicycle to the front of me is in what status?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: broken\nB: with rider\nC: new\nD: without rider", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_17_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_17_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_17_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_17_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_17_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_17_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_17_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_17_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_17_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_17_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_17_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_17_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: no\nC: uncertain\nD: yes", + "question": "There is a truck that is to the back right of the parked thing; is its status the same as the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: uncertain\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_18_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_18_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_18_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_18_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_18_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_18_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_18_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_18_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_18_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_18_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_18_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_18_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: unsure\nB: yes\nC: no\nD: maybe", + "question": "There is a car that is to the front left of the motorcycle; is it the same status as the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: unsure\nB: yes\nC: no\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_19_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_19_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_19_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_19_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_19_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_19_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_19_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_19_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_19_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_19_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_19_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_19_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no, but there is a stationary bus\nB: yes, the bus is moving\nC: yes, there is a bus in the frame\nD: no", + "question": "There is a with rider motorcycle; are there any moving buss to the back right of it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no, but there is a stationary bus\nB: yes, the bus is moving\nC: yes, there is a bus in the frame\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_20_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_20_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_20_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_20_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_20_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_20_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_20_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_20_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_20_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_20_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_20_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_20_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: train\nB: trees\nC: bike\nD: car", + "question": "The thing that is both to the back of the bus and the front left of me is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: train\nB: trees\nC: bike\nD: car", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_21_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_21_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_21_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_21_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_21_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_21_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_21_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_21_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_21_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_21_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_21_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_21_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: uncertain\nC: maybe\nD: yes", + "question": "Are there any other things of the same status as the traffic cone to the front left of the with rider thing?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: uncertain\nC: maybe\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_22_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_22_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_22_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_22_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_22_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_22_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_22_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_22_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_22_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_22_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_22_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_22_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: uncertain\nB: no\nC: yes\nD: maybe", + "question": "There is a truck that is to the back right of the bus; is it the same status as the car that is to the back right of the construction vehicle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: uncertain\nB: no\nC: yes\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_23_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_23_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_23_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_23_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_23_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_23_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_23_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_23_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_23_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_23_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_23_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_23_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: sitting\nB: lying down\nC: moving\nD: standing", + "question": "What is the status of the pedestrian to the front of the parked thing?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: sitting\nB: lying down\nC: moving\nD: standing", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_24_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_24_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_24_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_24_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_24_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_24_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_24_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_24_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_24_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_24_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_24_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_24_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: not sure\nB: yes\nC: maybe\nD: no", + "question": "Are there any other things that in the same status as the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: not sure\nB: yes\nC: maybe\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_25_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_25_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_25_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_25_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_25_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_25_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_25_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_25_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_25_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_25_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_25_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_25_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: tree\nB: bicycle\nC: lamp post\nD: pedestrian", + "question": "What is the thing that is to the front left of me and the back right of the moving bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: tree\nB: bicycle\nC: lamp post\nD: pedestrian", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_26_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_26_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_26_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_26_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_26_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_26_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_26_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_26_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_26_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_26_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_26_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_26_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: sold\nB: moving\nC: parked\nD: broken down", + "question": "What is the status of the trailer?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: sold\nB: moving\nC: parked\nD: broken down", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_27_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_27_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_27_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_27_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_27_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_27_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_27_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_27_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_27_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_27_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_27_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_27_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: with rider\nB: being serviced\nC: missing\nD: on stand", + "question": "What status is the motorcycle to the front left of the parked thing?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: with rider\nB: being serviced\nC: missing\nD: on stand", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_28_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_28_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_28_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_28_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_28_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_28_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_28_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_28_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_28_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_28_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_28_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_28_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: no\nC: sometimes\nD: yes", + "question": "There is a bus; is its status the same as the bicycle to the back of the with rider motorcycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: sometimes\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_29_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_29_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_29_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_29_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_29_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_29_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_29_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_29_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_29_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_29_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_29_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_29_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: traffic light\nB: bus stop sign\nC: hydrant\nD: pedestrian", + "question": "What is the standing pedestrian that is to the front left of the stopped bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: traffic light\nB: bus stop sign\nC: hydrant\nD: pedestrian", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_30_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_30_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_30_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_30_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_30_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_30_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_30_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_30_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_30_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_30_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_30_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_30_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: motorcycle\nB: truck\nC: bicycle\nD: car", + "question": "What is the thing that is both to the back right of the stopped bus and the front left of the parked truck?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: motorcycle\nB: truck\nC: bicycle\nD: car", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_31_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_31_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_31_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_31_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_31_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_31_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_31_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_31_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_31_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_31_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_31_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_31_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: uncertain\nC: no\nD: yes", + "question": "Are there any other construction vehicles of the same status as the car that is to the back of the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: uncertain\nC: no\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_32_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_32_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_32_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_32_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_32_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_32_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_32_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_32_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_32_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_32_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_32_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_32_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: airplane\nB: train\nC: car\nD: motorcycle", + "question": "The with rider thing to the front left of the with rider bicycle is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: airplane\nB: train\nC: car\nD: motorcycle", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_33_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_33_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_33_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_33_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_33_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_33_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_33_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_33_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_33_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_33_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_33_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_33_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: pedestrian\nB: building\nC: bus\nD: car", + "question": "What is the thing that is both to the back of the standing pedestrian and the front left of the parked thing?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: pedestrian\nB: building\nC: bus\nD: car", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_34_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_34_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_34_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_34_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_34_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_34_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_34_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_34_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_34_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_34_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_34_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_34_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: no\nC: unknown\nD: maybe", + "question": "There is a car to the back right of the stopped bus; does it have the same status as the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: no\nC: unknown\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_35_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_35_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_35_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_35_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_35_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_35_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_35_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_35_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_35_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_35_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_35_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_35_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: moving\nB: stopped\nC: under maintenance\nD: idle", + "question": "What is the status of the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: stopped\nC: under maintenance\nD: idle", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_36_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_36_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_36_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_36_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_36_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_36_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_36_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_36_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_36_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_36_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_36_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_36_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: no\nC: maybe\nD: sometimes", + "question": "Are there any trailers to the front right of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: no\nC: maybe\nD: sometimes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_37_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_37_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_37_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_37_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_37_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_37_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_37_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_37_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_37_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_37_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_37_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_37_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: maybe\nC: no\nD: uncertain", + "question": "Is the status of the truck to the front left of the without rider motorcycle the same as the construction vehicle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: no\nD: uncertain", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_38_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_38_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_38_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_38_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_38_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_38_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_38_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_38_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_38_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_38_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_38_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_38_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: parked\nB: reversing\nC: broken down\nD: moving", + "question": "What status is the car that is to the front of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: parked\nB: reversing\nC: broken down\nD: moving", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_39_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_39_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_39_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_39_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_39_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_39_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_39_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_39_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_39_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_39_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_39_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_39_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: maybe\nC: sometimes\nD: no", + "question": "Are there any other pedestrians of the same status as the trailer?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: sometimes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_40_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_40_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_40_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_40_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_40_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_40_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_40_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_40_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_40_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_40_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_40_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_40_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: no\nC: maybe\nD: not sure", + "question": "Are there any other things that in the same status as the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: no\nC: maybe\nD: not sure", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_41_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_41_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_41_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_41_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_41_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_41_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_41_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_41_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_41_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_41_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_41_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_41_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 1\nB: 0\nC: 3\nD: 2", + "question": "How many other motorcycles in the same status as the car that is to the back of the moving bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 1\nB: 0\nC: 3\nD: 2", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_42_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_42_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_42_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_42_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_42_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_42_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_42_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_42_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_42_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_42_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_42_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_42_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: no\nC: not sure\nD: yes", + "question": "There is a parked truck; are there any moving pedestrians to the front left of it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: not sure\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_43_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_43_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_43_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_43_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_43_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_43_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_43_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_43_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_43_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_43_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_43_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_43_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: under maintenance\nB: moving\nC: parked\nD: stopped", + "question": "There is a truck to the front left of the stopped construction vehicle; what status is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: under maintenance\nB: moving\nC: parked\nD: stopped", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_44_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_44_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_44_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_44_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_44_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_44_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_44_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_44_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_44_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_44_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_44_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_44_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: bicycle\nB: car\nC: motorcycle\nD: tricycle", + "question": "There is a with rider thing that is to the front left of the bicycle; what is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: car\nC: motorcycle\nD: tricycle", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_45_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_45_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_45_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_45_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_45_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_45_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_45_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_45_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_45_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_45_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_45_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_45_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: yes\nC: no\nD: I can't tell", + "question": "Are there any motorcycles to the front right of the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: yes\nC: no\nD: I can't tell", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_46_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_46_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_46_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_46_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_46_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_46_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_46_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_46_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_46_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_46_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_46_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_46_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: bus\nB: car\nC: bike\nD: train", + "question": "What is the stopped thing?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bus\nB: car\nC: bike\nD: train", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_47_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_47_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_47_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_47_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_47_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_47_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_47_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_47_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_47_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_47_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_47_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_47_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: pedestrian\nB: car\nC: bike\nD: tree", + "question": "The standing pedestrian that is to the front left of me is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: pedestrian\nB: car\nC: bike\nD: tree", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_48_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_48_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_48_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_48_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_48_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_48_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_48_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_48_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_48_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_48_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_48_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_48_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 2\nB: 5\nC: 3\nD: 1", + "question": "How many other things are in the same status as the bus that is to the back right of the moving bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 2\nB: 5\nC: 3\nD: 1", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_49_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_49_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_49_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_49_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_49_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_49_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_49_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_49_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_49_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_49_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_49_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_49_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: standing\nB: moving\nC: lying down\nD: sitting", + "question": "What is the status of the pedestrian that is to the front of the traffic cone?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: standing\nB: moving\nC: lying down\nD: sitting", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_50_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_50_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_50_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_50_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_50_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_50_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_50_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_50_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_50_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_50_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_50_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_50_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: no\nC: yes\nD: unknown", + "question": "There is a motorcycle to the back right of the parked thing; does it have the same status as the bicycle that is to the back right of the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: unknown", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_51_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_51_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_51_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_51_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_51_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_51_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_51_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_51_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_51_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_51_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_51_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_51_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: possibly\nC: no\nD: yes", + "question": "Are there any buss to the front right of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: possibly\nC: no\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_52_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_52_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_52_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_52_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_52_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_52_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_52_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_52_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_52_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_52_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_52_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_52_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: parked\nB: moving\nC: under maintenance\nD: stopping", + "question": "The bus is in what status?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: parked\nB: moving\nC: under maintenance\nD: stopping", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_53_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_53_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_53_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_53_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_53_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_53_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_53_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_53_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_53_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_53_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_53_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_53_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: parked\nB: broken down\nC: moving\nD: stopped", + "question": "The bus that is to the front of me is in what status?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: parked\nB: broken down\nC: moving\nD: stopped", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_54_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_54_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_54_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_54_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_54_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_54_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_54_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_54_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_54_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_54_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_54_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_54_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: bicycle\nB: car\nC: motorcycle\nD: pedestrian", + "question": "The thing that is both to the back of the stopped bus and the back right of me is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: car\nC: motorcycle\nD: pedestrian", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_55_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_55_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_55_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_55_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_55_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_55_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_55_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_55_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_55_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_55_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_55_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_55_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: maybe\nC: uncertain\nD: no", + "question": "There is a with rider thing; are there any parked cars to the back of it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: uncertain\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_56_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_56_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_56_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_56_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_56_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_56_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_56_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_56_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_56_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_56_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_56_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_56_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: sometimes\nB: no\nC: yes\nD: uncertain", + "question": "Does the bicycle have the same status as the thing that is to the back right of the construction vehicle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: sometimes\nB: no\nC: yes\nD: uncertain", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_57_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_57_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_57_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_57_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_57_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_57_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_57_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_57_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_57_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_57_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_57_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_57_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: only on weekends\nB: no\nC: sometimes\nD: yes", + "question": "Are any without rider things visible?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: only on weekends\nB: no\nC: sometimes\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_58_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_58_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_58_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_58_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_58_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_58_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_58_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_58_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_58_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_58_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_58_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_58_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: unknown\nC: no\nD: yes", + "question": "Is the status of the bicycle the same as the truck that is to the back of the without rider motorcycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: unknown\nC: no\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_59_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_59_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_59_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_59_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_59_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_59_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_59_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_59_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_59_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_59_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_59_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_59_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: not applicable\nB: no\nC: uncertain\nD: yes", + "question": "Are there any other buss that in the same status as the motorcycle to the back right of the moving bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: not applicable\nB: no\nC: uncertain\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_60_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_60_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_60_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_60_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_60_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_60_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_60_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_60_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_60_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_60_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_60_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_60_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 9\nB: 5\nC: 12\nD: 7", + "question": "What number of cars are to the back right of the trailer?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 9\nB: 5\nC: 12\nD: 7", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_61_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_61_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_61_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_61_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_61_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_61_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_61_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_61_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_61_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_61_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_61_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_61_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 3\nB: 0\nC: 1\nD: 2", + "question": "What number of other things are there of the same status as the bicycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 3\nB: 0\nC: 1\nD: 2", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_62_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_62_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_62_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_62_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_62_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_62_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_62_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_62_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_62_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_62_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_62_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_62_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: with rider\nB: on the ground\nC: in repair\nD: broken", + "question": "What is the status of the bicycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: with rider\nB: on the ground\nC: in repair\nD: broken", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_63_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_63_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_63_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_63_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_63_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_63_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_63_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_63_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_63_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_63_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_63_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_63_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: unsure\nB: no\nC: maybe\nD: yes", + "question": "Are there any other bicycles of the same status as the car to the front left of the parked trailer?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: unsure\nB: no\nC: maybe\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_64_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_64_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_64_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_64_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_64_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_64_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_64_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_64_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_64_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_64_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_64_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_64_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: being repaired\nB: moving\nC: stopped\nD: parked", + "question": "The construction vehicle is in what status?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: being repaired\nB: moving\nC: stopped\nD: parked", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_65_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_65_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_65_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_65_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_65_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_65_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_65_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_65_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_65_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_65_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_65_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_65_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: building\nB: bicycle\nC: tree\nD: car", + "question": "The thing that is both to the front left of the construction vehicle and the back of me is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: building\nB: bicycle\nC: tree\nD: car", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_66_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_66_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_66_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_66_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_66_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_66_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_66_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_66_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_66_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_66_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_66_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_66_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 4\nB: 2\nC: 0\nD: 1", + "question": "What number of moving cars are to the front left of the moving bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 4\nB: 2\nC: 0\nD: 1", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_67_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_67_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_67_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_67_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_67_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_67_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_67_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_67_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_67_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_67_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_67_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_67_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 9\nB: 5\nC: 12\nD: 7", + "question": "How many cars are to the back of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 9\nB: 5\nC: 12\nD: 7", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_68_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_68_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_68_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_68_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_68_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_68_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_68_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_68_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_68_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_68_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_68_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_68_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: sometimes\nB: no\nC: only during peak hours\nD: yes", + "question": "Are there any moving trailers?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: sometimes\nB: no\nC: only during peak hours\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_69_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_69_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_69_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_69_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_69_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_69_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_69_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_69_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_69_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_69_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_69_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_69_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 6\nB: 12\nC: 9\nD: 3", + "question": "What number of cars are there?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 6\nB: 12\nC: 9\nD: 3", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_70_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_70_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_70_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_70_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_70_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_70_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_70_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_70_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_70_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_70_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_70_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_70_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: unknown\nB: yes\nC: no\nD: maybe", + "question": "Is the status of the car that is to the front of the trailer the same as the trailer?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: unknown\nB: yes\nC: no\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_71_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_71_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_71_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_71_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_71_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_71_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_71_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_71_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_71_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_71_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_71_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_71_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: uncertain\nC: no\nD: maybe", + "question": "There is a construction vehicle that is to the front left of the parked truck; does it have the same status as the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: uncertain\nC: no\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_72_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_72_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_72_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_72_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_72_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_72_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_72_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_72_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_72_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_72_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_72_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_72_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: stopped\nB: moving\nC: broken down\nD: cancelled", + "question": "What is the status of the bus to the back right of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: stopped\nB: moving\nC: broken down\nD: cancelled", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_73_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_73_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_73_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_73_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_73_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_73_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_73_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_73_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_73_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_73_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_73_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_73_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 4\nB: 9\nC: 7\nD: 12", + "question": "How many other things in the same status as the thing that is to the front of the with rider bicycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 4\nB: 9\nC: 7\nD: 12", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_74_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_74_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_74_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_74_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_74_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_74_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_74_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_74_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_74_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_74_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_74_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_74_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: maybe\nC: yes\nD: unsure", + "question": "Are there any other buss of the same status as the construction vehicle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: yes\nD: unsure", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_75_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_75_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_75_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_75_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_75_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_75_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_75_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_75_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_75_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_75_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_75_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_75_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: sometimes\nC: maybe\nD: no", + "question": "There is a construction vehicle; is its status the same as the bus that is to the front of the truck?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: sometimes\nC: maybe\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_76_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_76_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_76_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_76_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_76_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_76_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_76_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_76_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_76_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_76_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_76_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_76_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: tree\nB: car\nC: bicycle\nD: bench", + "question": "The thing that is both to the back right of the trailer and the back right of me is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: tree\nB: car\nC: bicycle\nD: bench", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_77_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_77_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_77_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_77_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_77_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_77_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_77_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_77_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_77_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_77_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_77_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_77_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 9\nB: 7\nC: 5\nD: 12", + "question": "What number of other things in the same status as the car that is to the front left of the motorcycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 9\nB: 7\nC: 5\nD: 12", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_78_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_78_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_78_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_78_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_78_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_78_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_78_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_78_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_78_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_78_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_78_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_78_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 10\nB: 3\nC: 1\nD: 5", + "question": "What number of other things are in the same status as the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 10\nB: 3\nC: 1\nD: 5", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_79_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_79_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_79_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_79_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_79_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_79_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_79_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_79_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_79_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_79_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_79_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_79_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: bench\nB: pedestrian\nC: tree\nD: bicycle", + "question": "The thing that is both to the back right of the stopped bus and the back right of me is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bench\nB: pedestrian\nC: tree\nD: bicycle", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_80_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_80_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_80_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_80_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_80_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_80_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_80_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_80_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_80_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_80_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_80_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_80_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 5\nB: 3\nC: 7\nD: 2", + "question": "What number of other things are in the same status as the construction vehicle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 3\nC: 7\nD: 2", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_81_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_81_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_81_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_81_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_81_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_81_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_81_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_81_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_81_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_81_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_81_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_81_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 3\nB: 5\nC: 2\nD: 0", + "question": "What number of moving buss are to the front right of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 3\nB: 5\nC: 2\nD: 0", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_82_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_82_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_82_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_82_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_82_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_82_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_82_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_82_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_82_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_82_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_82_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_82_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: motorcycle\nB: bicycle\nC: trolley\nD: car", + "question": "The without rider thing that is to the front left of me is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: motorcycle\nB: bicycle\nC: trolley\nD: car", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_83_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_83_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_83_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_83_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_83_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_83_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_83_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_83_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_83_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_83_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_83_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_83_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: with rider\nB: in front of the bus\nC: without rider\nD: parked", + "question": "What status is the motorcycle to the back of the moving bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: with rider\nB: in front of the bus\nC: without rider\nD: parked", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_84_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_84_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_84_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_84_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_84_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_84_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_84_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_84_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_84_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_84_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_84_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_84_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: scooter\nB: rollerblades\nC: motorcycle\nD: bicycle", + "question": "The with rider thing to the back right of the moving bus is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: scooter\nB: rollerblades\nC: motorcycle\nD: bicycle", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_85_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_85_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_85_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_85_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_85_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_85_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_85_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_85_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_85_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_85_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_85_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_85_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: bicycle\nB: dog\nC: pedestrian\nD: car", + "question": "What is the moving thing that is both to the back right of the motorcycle and the front of the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: dog\nC: pedestrian\nD: car", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_86_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_86_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_86_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_86_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_86_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_86_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_86_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_86_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_86_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_86_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_86_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_86_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: uncertain\nB: maybe\nC: yes\nD: no", + "question": "Is there another car that has the same status as the thing that is to the front left of the with rider thing?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: uncertain\nB: maybe\nC: yes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_87_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_87_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_87_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_87_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_87_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_87_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_87_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_87_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_87_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_87_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_87_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_87_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: without rider\nB: parked\nC: with rider\nD: damaged", + "question": "What status is the motorcycle to the back of the moving bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: without rider\nB: parked\nC: with rider\nD: damaged", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_88_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_88_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_88_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_88_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_88_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_88_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_88_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_88_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_88_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_88_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_88_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_88_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 7\nB: 3\nC: 6\nD: 4", + "question": "There is a stopped bus; what number of moving things are to the front left of it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 7\nB: 3\nC: 6\nD: 4", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_89_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_89_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_89_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_89_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_89_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_89_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_89_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_89_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_89_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_89_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_89_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_89_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: turning\nB: stopped\nC: broken down\nD: moving", + "question": "There is a truck to the back right of the moving truck; what status is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: turning\nB: stopped\nC: broken down\nD: moving", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_90_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_90_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_90_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_90_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_90_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_90_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_90_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_90_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_90_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_90_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_90_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_90_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: maybe\nC: sometimes\nD: yes", + "question": "Are there any cars?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: sometimes\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_91_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_91_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_91_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_91_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_91_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_91_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_91_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_91_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_91_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_91_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_91_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_91_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: moving\nB: stationary\nC: parked\nD: broken down", + "question": "There is a car to the back right of me; what status is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: stationary\nC: parked\nD: broken down", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_92_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_92_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_92_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_92_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_92_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_92_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_92_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_92_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_92_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_92_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_92_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_92_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: unknown\nB: yes\nC: maybe\nD: no", + "question": "There is a construction vehicle; does it have the same status as the car to the back right of the construction vehicle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: unknown\nB: yes\nC: maybe\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_93_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_93_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_93_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_93_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_93_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_93_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_93_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_93_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_93_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_93_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_93_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_93_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: motorcycle\nB: bicycle\nC: truck\nD: car", + "question": "What is the moving thing that is both to the back right of the bus and the front left of the with rider bicycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: motorcycle\nB: bicycle\nC: truck\nD: car", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_94_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_94_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_94_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_94_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_94_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_94_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_94_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_94_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_94_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_94_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_94_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_94_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: sometimes\nC: yes\nD: probably not", + "question": "Are there any barriers?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: sometimes\nC: yes\nD: probably not", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_95_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_95_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_95_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_95_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_95_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_95_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_95_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_95_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_95_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_95_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_95_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_95_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 7\nB: 10\nC: 3\nD: 5", + "question": "How many moving pedestrians are there?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 7\nB: 10\nC: 3\nD: 5", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_96_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_96_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_96_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_96_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_96_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_96_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_96_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_96_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_96_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_96_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_96_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_96_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: yes\nC: there is a rider without a car\nD: maybe", + "question": "There is a with rider thing; are there any stopped cars to the back left of it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: yes\nC: there is a rider without a car\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_97_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_97_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_97_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_97_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_97_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_97_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_97_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_97_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_97_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_97_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_97_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_97_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: not sure\nC: maybe\nD: no", + "question": "Is there another car of the same status as the pedestrian to the front of the with rider thing?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: not sure\nC: maybe\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_98_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_98_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_98_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_98_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_98_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_98_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_98_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_98_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_98_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_98_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_98_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_98_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: not sure\nB: no\nC: maybe\nD: yes", + "question": "Are there any moving buss?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: not sure\nB: no\nC: maybe\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_99_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_99_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_99_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_99_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_99_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_99_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_99_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_99_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_99_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_99_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_99_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_99_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: moving\nB: under maintenance\nC: being loaded\nD: parked", + "question": "The construction vehicle is in what status?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: under maintenance\nC: being loaded\nD: parked", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_100_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_100_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_100_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_100_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_100_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_100_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_100_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_100_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_100_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_100_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_100_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_100_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: delayed\nB: stopped\nC: broken down\nD: moving", + "question": "The bus is in what status?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: delayed\nB: stopped\nC: broken down\nD: moving", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_101_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_101_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_101_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_101_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_101_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_101_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_101_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_101_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_101_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_101_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_101_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_101_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: lost\nB: moving\nC: stopped\nD: waiting", + "question": "There is a pedestrian that is to the front left of the stopped bus; what status is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: lost\nB: moving\nC: stopped\nD: waiting", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_102_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_102_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_102_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_102_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_102_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_102_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_102_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_102_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_102_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_102_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_102_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_102_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: maybe\nC: not sure\nD: no", + "question": "There is a bus; is it the same status as the thing that is to the back of the moving bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: not sure\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_103_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_103_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_103_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_103_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_103_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_103_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_103_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_103_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_103_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_103_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_103_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_103_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: parked\nB: moving\nC: stopped\nD: overturned", + "question": "What is the status of the construction vehicle to the front of the bicycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: parked\nB: moving\nC: stopped\nD: overturned", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_104_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_104_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_104_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_104_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_104_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_104_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_104_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_104_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_104_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_104_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_104_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_104_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: sometimes\nC: no\nD: maybe", + "question": "Are any things visible?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: sometimes\nC: no\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_105_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_105_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_105_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_105_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_105_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_105_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_105_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_105_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_105_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_105_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_105_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_105_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: I don't know\nC: maybe\nD: no", + "question": "Are there any things to the front of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: I don't know\nC: maybe\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_106_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_106_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_106_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_106_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_106_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_106_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_106_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_106_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_106_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_106_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_106_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_106_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: bicycle\nB: bus\nC: train\nD: plane", + "question": "There is a stopped thing that is to the front of me; what is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: bus\nC: train\nD: plane", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_107_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_107_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_107_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_107_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_107_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_107_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_107_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_107_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_107_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_107_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_107_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_107_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: uncertain\nB: no\nC: yes\nD: maybe", + "question": "Are there any other things that in the same status as the car to the front left of the barrier?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: uncertain\nB: no\nC: yes\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_108_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_108_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_108_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_108_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_108_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_108_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_108_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_108_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_108_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_108_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_108_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_108_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: no\nC: yes\nD: possibly", + "question": "Does the car that is to the front left of the moving truck have the same status as the motorcycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: possibly", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_109_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_109_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_109_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_109_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_109_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_109_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_109_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_109_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_109_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_109_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_109_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_109_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: moving\nB: under maintenance\nC: accelerating\nD: stopped", + "question": "The construction vehicle is in what status?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: under maintenance\nC: accelerating\nD: stopped", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_110_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_110_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_110_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_110_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_110_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_110_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_110_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_110_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_110_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_110_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_110_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_110_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: moving\nB: under maintenance\nC: delayed\nD: stopped", + "question": "The bus is in what status?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: under maintenance\nC: delayed\nD: stopped", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_111_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_111_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_111_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_111_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_111_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_111_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_111_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_111_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_111_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_111_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_111_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_111_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: no\nC: yes\nD: uncertain", + "question": "There is a car to the front left of the bicycle; is its status the same as the truck to the back right of the moving bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: uncertain", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_112_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_112_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_112_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_112_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_112_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_112_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_112_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_112_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_112_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_112_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_112_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_112_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: yes\nC: no\nD: uncertain", + "question": "Is there another car that has the same status as the motorcycle to the back of the moving bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: yes\nC: no\nD: uncertain", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_113_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_113_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_113_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_113_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_113_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_113_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_113_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_113_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_113_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_113_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_113_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_113_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: maybe\nC: sometimes\nD: yes", + "question": "There is a car to the front of the construction vehicle; is its status the same as the construction vehicle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: sometimes\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_114_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_114_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_114_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_114_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_114_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_114_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_114_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_114_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_114_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_114_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_114_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_114_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: yes\nC: only one other thing\nD: uncertain", + "question": "Are there any other things of the same status as the motorcycle that is to the front left of the parked thing?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: yes\nC: only one other thing\nD: uncertain", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_115_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_115_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_115_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_115_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_115_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_115_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_115_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_115_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_115_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_115_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_115_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_115_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: no\nC: yes\nD: unknown", + "question": "Are there any cars to the back left of the construction vehicle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: unknown", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_116_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_116_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_116_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_116_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_116_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_116_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_116_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_116_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_116_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_116_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_116_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_116_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: broken down\nB: moving backward\nC: without rider\nD: with rider", + "question": "There is a motorcycle that is to the back of me; what status is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: broken down\nB: moving backward\nC: without rider\nD: with rider", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_117_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_117_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_117_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_117_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_117_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_117_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_117_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_117_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_117_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_117_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_117_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_117_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 10\nB: 15\nC: 3\nD: 5", + "question": "What number of cars are to the front left of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 10\nB: 15\nC: 3\nD: 5", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_118_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_118_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_118_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_118_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_118_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_118_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_118_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_118_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_118_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_118_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_118_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_118_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: broken\nB: without rider\nC: missing\nD: with rider", + "question": "There is a thing that is to the front left of me; what status is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: broken\nB: without rider\nC: missing\nD: with rider", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_119_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_119_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_119_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_119_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_119_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_119_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_119_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_119_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_119_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_119_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_119_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_119_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: sitting\nB: standing\nC: running\nD: walking", + "question": "What status is the pedestrian that is to the back right of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: sitting\nB: standing\nC: running\nD: walking", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_120_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_120_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_120_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_120_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_120_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_120_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_120_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_120_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_120_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_120_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_120_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_120_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: not sure\nB: maybe\nC: yes\nD: no", + "question": "Does the truck to the back of the bus have the same status as the construction vehicle that is to the back right of the with rider thing?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: not sure\nB: maybe\nC: yes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_121_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_121_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_121_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_121_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_121_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_121_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_121_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_121_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_121_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_121_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_121_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_121_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: tree\nB: mailbox\nC: sidewalk\nD: traffic cone", + "question": "The thing that is to the back of the moving car and the front left of me is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: tree\nB: mailbox\nC: sidewalk\nD: traffic cone", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_122_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_122_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_122_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_122_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_122_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_122_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_122_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_122_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_122_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_122_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_122_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_122_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: unknown\nB: no\nC: yes\nD: maybe", + "question": "Do the thing that is to the front of the stopped car and the pedestrian that is to the front left of the stopped car have the same status?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: unknown\nB: no\nC: yes\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_123_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_123_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_123_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_123_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_123_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_123_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_123_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_123_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_123_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_123_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_123_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_123_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: unknown\nB: no\nC: maybe\nD: yes", + "question": "Are there any other pedestrians of the same status as the bus that is to the front left of the stopped bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: unknown\nB: no\nC: maybe\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_124_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_124_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_124_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_124_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_124_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_124_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_124_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_124_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_124_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_124_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_124_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_124_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: broken down\nB: being repaired\nC: without rider\nD: with rider", + "question": "What is the status of the motorcycle that is to the front left of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: broken down\nB: being repaired\nC: without rider\nD: with rider", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_125_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_125_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_125_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_125_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_125_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_125_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_125_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_125_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_125_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_125_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_125_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_125_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: moving\nB: departing\nC: stopped\nD: arriving", + "question": "What is the status of the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: departing\nC: stopped\nD: arriving", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_126_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_126_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_126_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_126_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_126_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_126_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_126_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_126_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_126_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_126_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_126_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_126_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: there are people\nB: yes\nC: a car\nD: no", + "question": "Are there any moving things to the back left of the with rider bicycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: there are people\nB: yes\nC: a car\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_127_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_127_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_127_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_127_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_127_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_127_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_127_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_127_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_127_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_127_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_127_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_127_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: absent\nB: moving\nC: dangerous\nD: stationary", + "question": "There is a pedestrian; what status is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: absent\nB: moving\nC: dangerous\nD: stationary", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_128_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_128_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_128_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_128_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_128_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_128_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_128_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_128_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_128_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_128_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_128_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_128_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: some\nB: yes\nC: maybe\nD: no", + "question": "There is a moving truck; are there any trucks to the front of it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: some\nB: yes\nC: maybe\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_129_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_129_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_129_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_129_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_129_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_129_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_129_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_129_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_129_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_129_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_129_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_129_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: train\nB: bicycle\nC: car\nD: pedestrian", + "question": "The stopped thing to the back of the stopped bus is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: train\nB: bicycle\nC: car\nD: pedestrian", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_130_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_130_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_130_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_130_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_130_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_130_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_130_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_130_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_130_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_130_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_130_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_130_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: not sure\nC: cannot tell\nD: yes", + "question": "Are any with rider motorcycles visible?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: not sure\nC: cannot tell\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_131_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_131_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_131_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_131_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_131_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_131_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_131_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_131_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_131_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_131_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_131_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_131_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 6\nB: 4\nC: 3\nD: 2", + "question": "How many things are to the front of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 6\nB: 4\nC: 3\nD: 2", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_132_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_132_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_132_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_132_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_132_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_132_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_132_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_132_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_132_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_132_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_132_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_132_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 5\nB: 2\nC: 4\nD: 7", + "question": "What number of moving things are there?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 2\nC: 4\nD: 7", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_133_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_133_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_133_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_133_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_133_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_133_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_133_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_133_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_133_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_133_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_133_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_133_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: yes\nC: not sure\nD: maybe", + "question": "Are there any barriers?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: yes\nC: not sure\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_134_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_134_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_134_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_134_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_134_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_134_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_134_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_134_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_134_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_134_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_134_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_134_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: yes\nC: not sure\nD: no", + "question": "Is the status of the truck that is to the front left of the moving car the same as the trailer?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: yes\nC: not sure\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_135_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_135_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_135_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_135_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_135_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_135_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_135_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_135_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_135_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_135_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_135_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_135_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: pedestrian\nB: tree\nC: bicycle\nD: traffic light", + "question": "What is the thing that is both to the back right of the moving bus and the back right of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: pedestrian\nB: tree\nC: bicycle\nD: traffic light", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_136_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_136_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_136_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_136_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_136_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_136_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_136_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_136_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_136_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_136_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_136_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_136_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: pedestrian\nB: bicycle\nC: tree\nD: car", + "question": "The moving thing that is both to the back of me and the front of the with rider motorcycle is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: pedestrian\nB: bicycle\nC: tree\nD: car", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_137_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_137_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_137_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_137_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_137_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_137_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_137_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_137_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_137_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_137_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_137_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_137_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: maybe\nC: not sure\nD: yes", + "question": "There is a bus to the front of the parked construction vehicle; is it the same status as the thing that is to the back of the moving truck?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: not sure\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_138_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_138_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_138_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_138_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_138_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_138_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_138_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_138_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_138_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_138_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_138_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_138_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: truck\nB: car\nC: scooter\nD: bus", + "question": "What is the stopped thing that is both to the front left of the with rider motorcycle and the back of the with rider bicycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: truck\nB: car\nC: scooter\nD: bus", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_139_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_139_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_139_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_139_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_139_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_139_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_139_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_139_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_139_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_139_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_139_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_139_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: I don\u2019t know\nC: no\nD: maybe", + "question": "There is a construction vehicle to the back right of the bus; is it the same status as the motorcycle that is to the back of the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: I don\u2019t know\nC: no\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_140_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_140_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_140_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_140_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_140_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_140_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_140_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_140_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_140_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_140_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_140_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_140_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: maybe\nC: yes\nD: not sure", + "question": "Are there any other things that in the same status as the pedestrian to the back right of the stopped bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: yes\nD: not sure", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_141_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_141_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_141_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_141_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_141_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_141_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_141_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_141_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_141_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_141_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_141_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_141_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: barrier\nB: fire hydrant\nC: tree\nD: light pole", + "question": "The thing that is to the back right of the moving bus is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: barrier\nB: fire hydrant\nC: tree\nD: light pole", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_142_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_142_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_142_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_142_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_142_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_142_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_142_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_142_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_142_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_142_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_142_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_142_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 3\nB: 5\nC: 9\nD: 12", + "question": "How many moving things are there?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 3\nB: 5\nC: 9\nD: 12", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_143_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_143_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_143_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_143_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_143_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_143_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_143_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_143_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_143_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_143_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_143_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_143_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: car\nB: tree\nC: bench\nD: bicycle", + "question": "What is the moving thing that is to the front left of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: car\nB: tree\nC: bench\nD: bicycle", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_144_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_144_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_144_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_144_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_144_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_144_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_144_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_144_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_144_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_144_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_144_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_144_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 7\nB: 12\nC: 5\nD: 9", + "question": "What number of other things in the same status as the car that is to the back right of the parked thing?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 7\nB: 12\nC: 5\nD: 9", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_145_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_145_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_145_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_145_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_145_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_145_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_145_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_145_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_145_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_145_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_145_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_145_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: banana\nB: car\nC: running water\nD: flying bird", + "question": "There is a stopped thing; what is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: banana\nB: car\nC: running water\nD: flying bird", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_146_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_146_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_146_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_146_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_146_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_146_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_146_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_146_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_146_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_146_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_146_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_146_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: maybe\nC: cannot determine\nD: no", + "question": "There is a motorcycle; does it have the same status as the car that is to the front left of the with rider thing?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: cannot determine\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_147_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_147_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_147_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_147_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_147_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_147_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_147_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_147_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_147_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_147_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_147_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_147_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: bicycle\nB: pedestrian\nC: crosswalk\nD: traffic light", + "question": "There is a standing pedestrian to the front left of me; what is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: pedestrian\nC: crosswalk\nD: traffic light", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_148_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_148_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_148_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_148_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_148_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_148_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_148_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_148_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_148_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_148_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_148_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_148_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 5\nB: 7\nC: 3\nD: 10", + "question": "What number of motorcycles are there?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 7\nC: 3\nD: 10", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_149_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_149_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_149_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_149_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_149_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_149_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_149_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_149_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_149_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_149_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_149_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_149_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: yes\nC: uncertain\nD: probably", + "question": "Is there another construction vehicle of the same status as the truck that is to the front of the moving truck?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: yes\nC: uncertain\nD: probably", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_150_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_150_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_150_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_150_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_150_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_150_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_150_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_150_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_150_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_150_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_150_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_150_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: maybe\nC: no\nD: sometimes", + "question": "Are there any moving buss?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: no\nD: sometimes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_151_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_151_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_151_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_151_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_151_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_151_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_151_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_151_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_151_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_151_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_151_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_151_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: lane divider\nB: barrier\nC: tree\nD: cone", + "question": "The thing that is to the back right of me and the back right of the construction vehicle is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: lane divider\nB: barrier\nC: tree\nD: cone", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_152_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_152_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_152_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_152_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_152_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_152_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_152_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_152_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_152_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_152_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_152_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_152_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: stationary\nB: disappearing\nC: transforming\nD: moving", + "question": "There is a thing that is to the front left of me; what status is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: stationary\nB: disappearing\nC: transforming\nD: moving", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_153_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_153_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_153_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_153_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_153_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_153_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_153_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_153_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_153_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_153_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_153_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_153_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: no\nC: not sure\nD: maybe", + "question": "There is a bus to the front left of the stopped bus; is it the same status as the motorcycle to the back of the moving bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: no\nC: not sure\nD: maybe", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_154_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_154_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_154_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_154_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_154_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_154_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_154_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_154_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_154_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_154_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_154_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_154_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: no\nC: maybe\nD: uncertain", + "question": "There is a truck; is it the same status as the car to the back right of the stopped truck?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: no\nC: maybe\nD: uncertain", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_155_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_155_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_155_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_155_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_155_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_155_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_155_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_155_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_155_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_155_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_155_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_155_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: traffic cone\nB: tree\nC: hydrant\nD: bench", + "question": "What is the thing that is both to the back right of the parked construction vehicle and the front left of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: traffic cone\nB: tree\nC: hydrant\nD: bench", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_156_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_156_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_156_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_156_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_156_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_156_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_156_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_156_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_156_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_156_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_156_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_156_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: stopped\nB: departed\nC: moving\nD: full", + "question": "What is the status of the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: stopped\nB: departed\nC: moving\nD: full", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_157_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_157_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_157_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_157_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_157_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_157_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_157_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_157_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_157_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_157_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_157_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_157_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: pedestrian\nB: trash can\nC: tree\nD: motorcycle", + "question": "The moving thing that is to the front left of the moving bus and the back of me is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: pedestrian\nB: trash can\nC: tree\nD: motorcycle", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_158_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_158_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_158_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_158_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_158_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_158_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_158_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_158_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_158_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_158_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_158_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_158_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: bicycle\nB: bench\nC: car\nD: tree", + "question": "The thing that is to the back of me and the front left of the parked construction vehicle is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: bench\nC: car\nD: tree", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_159_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_159_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_159_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_159_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_159_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_159_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_159_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_159_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_159_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_159_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_159_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_159_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: parked\nB: moving\nC: accelerating\nD: stopped", + "question": "What is the status of the car that is to the back of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: parked\nB: moving\nC: accelerating\nD: stopped", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_160_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_160_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_160_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_160_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_160_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_160_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_160_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_160_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_160_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_160_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_160_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_160_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: turning\nB: disappearing\nC: stopped\nD: moving", + "question": "There is a bus that is to the front of me; what status is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: turning\nB: disappearing\nC: stopped\nD: moving", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_161_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_161_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_161_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_161_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_161_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_161_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_161_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_161_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_161_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_161_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_161_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_161_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 5\nB: 3\nC: 8\nD: 0", + "question": "There is a bus; how many things are to the front right of it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 3\nC: 8\nD: 0", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_162_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_162_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_162_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_162_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_162_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_162_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_162_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_162_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_162_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_162_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_162_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_162_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: stopped\nB: waiting for passengers\nC: moving\nD: broken down", + "question": "What is the status of the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: stopped\nB: waiting for passengers\nC: moving\nD: broken down", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_163_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_163_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_163_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_163_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_163_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_163_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_163_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_163_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_163_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_163_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_163_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_163_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: sometimes\nC: no\nD: yes", + "question": "Are there any not standing pedestrians?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: sometimes\nC: no\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_164_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_164_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_164_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_164_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_164_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_164_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_164_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_164_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_164_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_164_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_164_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_164_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: unable to determine\nC: no\nD: yes", + "question": "Is the status of the truck that is to the front left of the moving car the same as the trailer?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: unable to determine\nC: no\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_165_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_165_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_165_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_165_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_165_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_165_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_165_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_165_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_165_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_165_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_165_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_165_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: car\nB: bicycle\nC: dog\nD: pedestrian", + "question": "What is the moving thing to the back right of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: car\nB: bicycle\nC: dog\nD: pedestrian", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_166_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_166_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_166_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_166_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_166_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_166_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_166_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_166_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_166_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_166_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_166_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_166_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: no\nC: yes\nD: uncertain", + "question": "Is there another bus that has the same status as the car that is to the front left of the stopped construction vehicle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: uncertain", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_167_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_167_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_167_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_167_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_167_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_167_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_167_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_167_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_167_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_167_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_167_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_167_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: moving swiftly\nB: stopped\nC: being repaired\nD: broken down", + "question": "The bus that is to the back right of the moving bus is in what status?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving swiftly\nB: stopped\nC: being repaired\nD: broken down", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_168_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_168_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_168_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_168_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_168_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_168_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_168_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_168_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_168_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_168_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_168_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_168_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 5\nB: 10\nC: 7\nD: 3", + "question": "What number of trucks are there?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 10\nC: 7\nD: 3", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_169_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_169_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_169_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_169_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_169_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_169_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_169_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_169_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_169_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_169_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_169_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_169_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: bicycle\nB: tree\nC: car\nD: building", + "question": "The moving thing that is to the front of the construction vehicle and the front left of me is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: tree\nC: car\nD: building", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_170_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_170_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_170_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_170_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_170_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_170_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_170_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_170_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_170_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_170_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_170_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_170_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: car\nB: traffic light\nC: bicycle\nD: pedestrian", + "question": "What is the thing that is both to the front left of the stopped bus and the back of the with rider thing?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: car\nB: traffic light\nC: bicycle\nD: pedestrian", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_171_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_171_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_171_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_171_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_171_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_171_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_171_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_171_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_171_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_171_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_171_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_171_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: moving\nB: missing\nC: broken down\nD: parked", + "question": "What status is the truck to the back right of the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: missing\nC: broken down\nD: parked", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_172_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_172_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_172_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_172_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_172_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_172_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_172_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_172_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_172_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_172_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_172_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_172_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: locked up\nB: in transit\nC: damaged\nD: with rider", + "question": "What is the status of the bicycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: locked up\nB: in transit\nC: damaged\nD: with rider", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_173_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_173_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_173_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_173_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_173_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_173_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_173_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_173_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_173_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_173_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_173_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_173_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 1\nB: 5\nC: 10\nD: 3", + "question": "What number of stopped trucks are there?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 1\nB: 5\nC: 10\nD: 3", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_174_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_174_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_174_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_174_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_174_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_174_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_174_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_174_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_174_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_174_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_174_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_174_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: sometimes\nB: maybe\nC: yes\nD: no", + "question": "Are there any moving buss to the back left of the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: sometimes\nB: maybe\nC: yes\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_175_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_175_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_175_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_175_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_175_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_175_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_175_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_175_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_175_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_175_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_175_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_175_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: moving\nB: broken down\nC: under maintenance\nD: parked", + "question": "What is the status of the bus that is to the front left of the stopped bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: broken down\nC: under maintenance\nD: parked", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_176_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_176_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_176_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_176_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_176_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_176_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_176_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_176_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_176_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_176_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_176_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_176_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: bus\nB: bicycle\nC: car\nD: train", + "question": "The moving thing that is both to the back right of the with rider motorcycle and the front of me is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bus\nB: bicycle\nC: car\nD: train", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_177_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_177_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_177_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_177_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_177_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_177_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_177_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_177_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_177_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_177_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_177_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_177_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: boat\nB: bicycle\nC: house\nD: car", + "question": "There is a parked thing that is to the back of me; what is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: boat\nB: bicycle\nC: house\nD: car", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_178_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_178_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_178_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_178_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_178_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_178_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_178_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_178_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_178_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_178_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_178_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_178_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: possibly\nB: no\nC: maybe\nD: yes", + "question": "Are there any not standing pedestrians?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: possibly\nB: no\nC: maybe\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_179_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_179_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_179_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_179_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_179_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_179_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_179_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_179_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_179_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_179_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_179_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_179_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: yes\nC: maybe\nD: uncertain", + "question": "Are any stopped trucks visible?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: yes\nC: maybe\nD: uncertain", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_180_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_180_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_180_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_180_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_180_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_180_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_180_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_180_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_180_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_180_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_180_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_180_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 3\nB: 4\nC: 2\nD: 1", + "question": "What number of things are to the back right of the motorcycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 3\nB: 4\nC: 2\nD: 1", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_181_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_181_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_181_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_181_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_181_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_181_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_181_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_181_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_181_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_181_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_181_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_181_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 7\nB: 5\nC: 2\nD: 3", + "question": "How many other things in the same status as the thing to the front left of the pedestrian?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 7\nB: 5\nC: 2\nD: 3", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_182_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_182_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_182_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_182_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_182_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_182_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_182_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_182_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_182_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_182_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_182_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_182_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: stopped\nB: disappeared\nC: moving\nD: broken down", + "question": "What status is the bus that is to the front of the parked thing?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: stopped\nB: disappeared\nC: moving\nD: broken down", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_183_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_183_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_183_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_183_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_183_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_183_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_183_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_183_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_183_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_183_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_183_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_183_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: bus\nB: bicycle\nC: pedestrian\nD: traffic light", + "question": "The thing that is to the front left of me and the front of the with rider motorcycle is what?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bus\nB: bicycle\nC: pedestrian\nD: traffic light", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_184_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_184_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_184_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_184_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_184_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_184_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_184_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_184_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_184_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_184_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_184_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_184_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: maybe\nC: I do not know\nD: yes", + "question": "Are there any other cars that in the same status as the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: I do not know\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_185_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_185_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_185_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_185_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_185_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_185_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_185_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_185_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_185_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_185_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_185_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_185_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: car\nB: tree\nC: bicycle\nD: bus", + "question": "There is a stopped thing that is to the front of me; what is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: car\nB: tree\nC: bicycle\nD: bus", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_186_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_186_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_186_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_186_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_186_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_186_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_186_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_186_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_186_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_186_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_186_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_186_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 5\nB: 3\nC: 9\nD: 7", + "question": "What number of other things are there of the same status as the construction vehicle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 3\nC: 9\nD: 7", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_187_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_187_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_187_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_187_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_187_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_187_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_187_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_187_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_187_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_187_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_187_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_187_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: yes\nB: unknown\nC: maybe\nD: no", + "question": "Are there any stopped things to the back right of the trailer?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: unknown\nC: maybe\nD: no", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_188_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_188_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_188_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_188_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_188_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_188_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_188_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_188_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_188_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_188_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_188_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_188_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: bus\nB: tree\nC: car\nD: bicycle", + "question": "What is the stopped thing that is to the front left of the with rider motorcycle and the back right of me?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bus\nB: tree\nC: car\nD: bicycle", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_189_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_189_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_189_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_189_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_189_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_189_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_189_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_189_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_189_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_189_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_189_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_189_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 3\nB: 5\nC: 10\nD: 7", + "question": "How many standing pedestrians are there?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 3\nB: 5\nC: 10\nD: 7", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_190_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_190_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_190_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_190_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_190_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_190_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_190_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_190_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_190_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_190_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_190_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_190_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: broken down\nB: stationary\nC: under repair\nD: moving", + "question": "There is a bus; what status is it?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: broken down\nB: stationary\nC: under repair\nD: moving", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_191_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_191_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_191_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_191_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_191_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_191_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_191_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_191_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_191_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_191_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_191_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_191_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: possibly\nB: unknown\nC: no\nD: yes", + "question": "Does the thing to the front left of the construction vehicle have the same status as the construction vehicle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: possibly\nB: unknown\nC: no\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_192_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_192_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_192_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_192_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_192_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_192_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_192_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_192_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_192_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_192_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_192_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_192_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: stopped\nB: broken down\nC: moving\nD: delayed", + "question": "What is the status of the bus?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: stopped\nB: broken down\nC: moving\nD: delayed", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_193_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_193_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_193_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_193_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_193_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_193_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_193_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_193_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_193_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_193_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_193_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_193_11.png" + ], + "output": "C" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: bicycle\nB: car\nC: airplane\nD: bus", + "question": "What is the stopped thing?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: car\nC: airplane\nD: bus", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_194_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_194_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_194_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_194_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_194_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_194_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_194_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_194_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_194_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_194_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_194_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_194_11.png" + ], + "output": "D" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: no\nB: only when moving\nC: sometimes\nD: yes", + "question": "There is a truck to the front of the stopped construction vehicle; does it have the same status as the bicycle?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: only when moving\nC: sometimes\nD: yes", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_195_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_195_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_195_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_195_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_195_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_195_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_195_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_195_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_195_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_195_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_195_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_195_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 5\nB: 2\nC: 10\nD: 8", + "question": "How many moving cars are there?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 2\nC: 10\nD: 8", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_196_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_196_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_196_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_196_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_196_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_196_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_196_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_196_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_196_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_196_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_196_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_196_11.png" + ], + "output": "A" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 3\nB: 5\nC: 50\nD: 12", + "question": "How many other things are in the same status as the truck?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 3\nB: 5\nC: 50\nD: 12", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_197_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_197_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_197_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_197_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_197_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_197_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_197_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_197_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_197_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_197_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_197_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_197_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: 7\nB: 1\nC: 3\nD: 5", + "question": "What number of with rider things are there?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 7\nB: 1\nC: 3\nD: 5", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_198_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_198_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_198_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_198_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_198_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_198_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_198_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_198_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_198_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_198_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_198_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_198_11.png" + ], + "output": "B" + }, + { + "task": "threeD_question_answering", + "visual_input_component": "LiDAR image and natural image", + "source": "NuScenes_threeD_question_answering", + "options": "A: maybe\nB: no\nC: yes\nD: possibly", + "question": "There is a car to the front of the parked construction vehicle; is its status the same as the construction vehicle to the front of the moving truck?", + "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: possibly", + "input_image_path": [ + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_199_0.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_199_1.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_199_2.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_199_3.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_199_4.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_199_5.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_199_6.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_199_7.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_199_8.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_199_9.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_199_10.png", + "../MMIU-Benchmark/threeD_question_answering/threeD_question_answering_199_11.png" + ], + "output": "B" + } +] \ No newline at end of file diff --git a/VLMEvalKit/test_qwen3_vl.py b/VLMEvalKit/test_qwen3_vl.py new file mode 100644 index 0000000..40228d7 --- /dev/null +++ b/VLMEvalKit/test_qwen3_vl.py @@ -0,0 +1,246 @@ +import os +import json +import base64 +import argparse +import random +from multiprocessing import Pool +from tqdm import tqdm +from openai import OpenAI +from PIL import Image +import io + +# Configure API client +base_url = os.getenv('OPENAI_API_BASE', 'https://api.fireworks.ai/inference/v1') +api_key = os.getenv('FIREWORKS_API_KEY', None) + +client = OpenAI( + base_url=base_url, + api_key=api_key, +) + +def encode_image(image_path): + """Encode image to base64.""" + try: + with Image.open(image_path) as img: + # Convert to RGB if necessary + if img.mode != "RGB": + img = img.convert("RGB") + + # Save to bytes buffer + buffer = 
io.BytesIO() + img.save(buffer, format="JPEG", quality=95) + buffer.seek(0) + + return base64.b64encode(buffer.read()).decode("utf-8") + except Exception as e: + print(f"Error encoding image {image_path}: {e}") + return None + +def call_qwen3_vl(image_paths, question, model_name="qwen3-vl", interleave_text=False): + """Call qwen3-vl via OpenAI-compatible API.""" + content = [] + + # Random text snippets to interleave between images + interleave_texts = [ + "Here is image", + "This image shows", + "Looking at this image", + "In this image we can see", + "This picture contains", + "Image", + ] + + # Add images (with optional interleaved text) + for i, image_path in enumerate(image_paths): + if not os.path.exists(image_path): + print(f"Image not found: {image_path}") + return 'image error' + + base64_image = encode_image(image_path) + if base64_image is None: + return 'image error' + + # Add interleaved text before each image (except the first) + if interleave_text and i > 0: + interleave_text_snippet = random.choice(interleave_texts) + f" {i+1}." 
+ content.append({ + "type": "text", + "text": interleave_text_snippet + }) + + content.append({ + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}" + } + }) + + # Add text question + content.append({ + "type": "text", + "text": question + }) + + messages = [ + { + "role": "user", + "content": content + } + ] + + try: + response = client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=1024, + temperature=0.7, + ) + answer = response.choices[0].message.content + print(f"ASSISTANT: {answer}") + return answer + except Exception as e: + print(f"Model error: {e}") + return 'model error' + +parser = argparse.ArgumentParser(description='Run Qwen3-VL inference on MMIU dataset') +parser.add_argument('--json_path', type=str, default='all.json', help='Path to all.json file') +parser.add_argument('--limit', type=int, default=None, help='Limit to first N rows') +parser.add_argument('--sample', type=int, default=None, help='Random sample of N rows') +parser.add_argument('--tasks', type=str, nargs='+', default=None, help='Filter by specific task names') +parser.add_argument('--seed', type=int, default=42, help='Random seed for sampling') +parser.add_argument('--workers', type=int, default=1, help='Number of parallel workers (default: 1)') +parser.add_argument('--output-dir', type=str, default='../results', help='Output directory for results (default: ../results)') +parser.add_argument('--interleave-random-text', action='store_true', help='Interleave random text snippets between images (just for testing)') +args = parser.parse_args() + +json_path = args.json_path + +tasks_exist = ['person_reid', 'multiple_image_captioning', 'spot_the_similarity', 'face_retrieval', 'sketch2image_retrieval', 'handwritten_retrieval', 'spot_the_diff', 'image2image_retrieval', 'vehicle_retrieval', 'text2image_retrieval', +'general_action_recognition', 'video_captioning', 'next_img_prediction', 'temporal_ordering', 
'meme_vedio_understanding', 'action_quality_assessment', 'temporal_localization', 'mevis', +'ravens_progressive_matrices', 'threed_indoor_recognition', 'point_tracking', 'threed_cad_recognition', 'single_object_tracking'] + +# Model name - can be overridden via environment variable +model_name = os.getenv('MODEL', 'qwen3-vl') + +if not os.path.exists(json_path): + print(f"Error: {json_path} not found!") + print("Please create all.json with the MMIU dataset.") + exit(1) + +with open(json_path, 'r') as f: + data_all = json.load(f) + +# Apply filters +original_count = len(data_all) +if args.tasks: + data_all = [d for d in data_all if d.get('task') in args.tasks] + print(f"Filtered to {len(data_all)} rows matching tasks: {args.tasks}") + +if args.sample: + random.seed(args.seed) + data_all = random.sample(data_all, min(args.sample, len(data_all))) + print(f"Sampled {len(data_all)} rows (seed={args.seed})") +elif args.limit: + data_all = data_all[:args.limit] + print(f"Limited to first {len(data_all)} rows") + +# Seed random for interleaved text (if enabled) +if args.interleave_random_text: + random.seed(args.seed) + +if original_count != len(data_all): + print(f"Processing {len(data_all)} rows (out of {original_count} total)") + +def process_single_item(args_tuple): + """Process a single task_data item. 
Used for parallel processing.""" + task_data, model_name, tasks_exist, interleave_random_text = args_tuple + + context = task_data["context"] + question = task_data["question"] + + tmp = [] + image_flag = True + + for image_path in task_data["input_image_path"]: + tmp.append(image_path) + if not os.path.exists(image_path): + image_flag = False + break + + if image_flag == False: + response = 'image none' + task_data[model_name] = response + print(f"{model_name}, {task_data.get('task', 'unknown')}, {len(tmp)}: {response}") + return task_data + + try: + if task_data['task'] in tasks_exist: + question_formatted = question + '\n' + context + else: + question_formatted = context + '\n' + question + question_formatted = question_formatted + '\nPlease answer the option directly like A,B,C,D...' + + response = call_qwen3_vl(tmp, question_formatted, model_name=model_name, interleave_text=interleave_random_text) + task_data[model_name] = response + print(f"{model_name}, {task_data.get('task', 'unknown')}, {len(tmp)}: {response}") + except Exception as e: + response = 'model error or image error' + task_data[model_name] = response + print(f"{model_name}, {task_data.get('task', 'unknown')}, {len(tmp)}: {response}") + print(f"Exception: {e}") + + return task_data + +# Process items (sequentially or in parallel) +if args.workers > 1: + print(f"Processing {len(data_all)} items with {args.workers} parallel workers...") + if args.interleave_random_text: + print("Interleaving random text between images enabled") + # Create argument tuples for each item + process_args = [(task_data, model_name, tasks_exist, args.interleave_random_text) for task_data in data_all] + + # Process in parallel with progress bar + with Pool(processes=args.workers) as pool: + processed_data = list(tqdm( + pool.imap(process_single_item, process_args), + total=len(process_args), + desc="Processing" + )) +else: + print(f"Processing {len(data_all)} items sequentially...") + if args.interleave_random_text: + 
print("Interleaving random text between images enabled") + processed_data = [ + process_single_item((task_data, model_name, tasks_exist, args.interleave_random_text)) + for task_data in tqdm(data_all, desc="Processing") + ] + +# Organize results by task +results_by_task = {} +for task_data in processed_data: + task_name = task_data.get('task', 'unknown') + if task_name not in results_by_task: + results_by_task[task_name] = [] + results_by_task[task_name].append(task_data) + +# Save results organized by task +base_output_dir = os.path.abspath(args.output_dir) # Resolve to absolute path to avoid issues +if not os.path.exists(base_output_dir): + os.makedirs(base_output_dir) + +# Extract basename for directory structure (in case model_name is a full path) +model_dir_name = os.path.basename(model_name) if os.path.sep in model_name else model_name + +for task_name, task_results in results_by_task.items(): + task_dir = os.path.join(base_output_dir, task_name) + model_dir = os.path.join(task_dir, model_dir_name) + if not os.path.exists(model_dir): + os.makedirs(model_dir) + + output_path = os.path.join(model_dir, 'metadata_info.json') + with open(output_path, 'w') as f: + json.dump(task_results, f) + print(f"Saved {len(task_results)} results for task '{task_name}' to {output_path}") + +print(f"\nAll results saved to: {os.path.abspath(base_output_dir)}") + diff --git a/evaluate.py b/evaluate.py index 3a8ca44..be8318e 100644 --- a/evaluate.py +++ b/evaluate.py @@ -2,6 +2,7 @@ import os import base64 import time +import argparse from openai import OpenAI from multiprocessing import Pool import re @@ -9,10 +10,9 @@ def remove_punctuation(text): return re.sub(r'^[.,()]+|[.,()]+$', '', text) -client = OpenAI( - base_url='xx', - api_key='xx', -) +# Default API configuration +default_base_url = 'https://api.fireworks.ai/inference/v1' +default_api_key = os.getenv('FIREWORKS_API_KEY', None) def build_prompt(question, options, prediction): tmpl = ( @@ -31,18 +31,41 @@ def 
build_prompt(question, options, prediction): return tmpl.format(question, options, prediction) -def process_data(args): - data_tmp, modelname = args +def process_data(args_tuple): + data_tmp, modelname, base_url, api_key = args_tuple client = OpenAI( # base_url='https://kkkc.net/v1', # api_key='sk-YJaHfazVSf2WDkAl1bAdE17bF3Ae4923Ba888293B31d13C4', - base_url='xx', - api_key='xx', + base_url=base_url, + api_key=api_key, ) options = data_tmp['options'] question = data_tmp['question'] - prediction = data_tmp[modelname].strip() + + # Try to find the prediction key - the JSON might have full path or basename as key + prediction_key = None + if modelname in data_tmp: + prediction_key = modelname + else: + # Try basename if modelname is a full path + basename = os.path.basename(modelname) if os.path.sep in modelname else modelname + if basename in data_tmp: + prediction_key = basename + else: + # Try to find any key that ends with the basename (for full paths in JSON) + for key in data_tmp.keys(): + if key.endswith(basename) or os.path.basename(key) == basename: + prediction_key = key + break + + if prediction_key is None or prediction_key not in data_tmp: + available_keys = [k for k in data_tmp.keys() if k not in ['options', 'question', 'context', 'input_image_path', 'task', 'output', 'visual_input_component', 'source']] + print(f"Warning: Key for model '{modelname}' not found in data. 
Available model keys: {available_keys}") + data_tmp[f'{modelname}_choice'] = 'Z' + return data_tmp + + prediction = data_tmp[prediction_key].strip() if modelname == 'Claude3' and "copyrighted material" in prediction: data_tmp[f'{modelname}_choice'] = 'Z' @@ -95,58 +118,60 @@ def process_data(args): def main(): - # modelnames = ['internvl1.5-chat'] - # modelnames = ['Gemini','Gemini1.0'] - # modelnames = ['GPT4o','Gemini','Gemini1.0'] - # modelnames = ['Llava-interleave'] - modelnames = ['Llava-interleave', 'qwen_chat', 'XComposer2', 'deepseek_vl_7b', 'qwen_base', 'XComposer2_1.8b', 'flamingov2', 'deepseek_vl_1.3b', 'internvl1.5-chat', 'idefics2_8b', 'Mantis', 'idefics_9b_instruct'] - directorys = ['xx','xx'] + parser = argparse.ArgumentParser(description='Convert model predictions to multiple choice answers') + parser.add_argument('--directories', type=str, nargs='+', default=['./results'], help='Directories containing results (default: ./results)') + parser.add_argument('--models', type=str, nargs='+', default=['qwen3-vl'], help='Model names to evaluate (default: qwen3-vl)') + parser.add_argument('--workers', type=int, default=10, help='Number of parallel workers (default: 10)') + parser.add_argument('--base-url', type=str, default=None, help=f'OpenAI API base URL (default: {default_base_url})') + parser.add_argument('--api-key', type=str, default=None, help='OpenAI API key (default: from FIREWORKS_API_KEY env var)') + args = parser.parse_args() + + modelnames = args.models + directorys = args.directories + base_url = args.base_url or default_base_url + api_key = args.api_key or default_api_key + + if not api_key: + print("Error: API key not provided. 
Set FIREWORKS_API_KEY environment variable or use --api-key") + return for directory in directorys: + if not os.path.exists(directory): + print(f"Warning: Directory '{directory}' does not exist, skipping...") + continue + tasknames = os.listdir(directory) for taskname in tasknames: - - path = os.path.join(directory,taskname) for modelname in modelnames: - path = os.path.join(directory,taskname) - path = os.path.join(path,modelname) - - print(taskname,modelname) - json_path = os.path.join(path,'metadata_info.json') - + path = os.path.join(directory, taskname, modelname) + print(taskname, modelname) + json_path = os.path.join(path, 'metadata_info.json') if not os.path.exists(json_path): - print(json_path,' not exist') + print(json_path, ' not exist') continue - # output_json_path = os.path.join(path,'metadata_info_choice.json') - output_json_path = os.path.join(path,'metadata_info_choice.json') - # if os.path.exists(output_json_path) or os.path.exists(output_json_path1): + output_json_path = os.path.join(path, 'metadata_info_choice.json') if os.path.exists(output_json_path): print(output_json_path, ' already have') continue - with open(json_path,'r') as f: + with open(json_path, 'r') as f: data = json.load(f) - # 将data和modelname打包成元组列表 - data_with_modelname = [(data_tmp, modelname) for data_tmp in data] - - + # Pack data with modelname, base_url, and api_key + data_with_args = [(data_tmp, modelname, base_url, api_key) for data_tmp in data] - pool = Pool(processes=10) # Adjust the number of processes as per your machine's capability - # result = pool.map(process_data, data, modelname) - # 使用map方法传递打包后的元组列表 - result = pool.map(process_data, data_with_modelname) - - # output_json_path = os.path.join(path,'metadata_info_choice.json') + pool = Pool(processes=args.workers) + result = pool.map(process_data, data_with_args) + pool.close() + pool.join() with open(output_json_path, 'w') as f: json.dump(result, f) - print(taskname,modelname,'OK') - + print(taskname, modelname, 
'OK') if __name__ == '__main__': diff --git a/evaluate_correct.py b/evaluate_correct.py index 8160a6c..7f481dc 100644 --- a/evaluate_correct.py +++ b/evaluate_correct.py @@ -1,19 +1,31 @@ import json import os import pandas as pd +import argparse -directorys = [ - 'xx' -] +parser = argparse.ArgumentParser(description='Calculate accuracy scores from MMIU evaluation results') +parser.add_argument('--directories', type=str, nargs='+', default=['./results'], help='Directories containing results (default: ./results)') +parser.add_argument('--models', type=str, nargs='+', default=None, help='Model names to evaluate (default: all models found)') +parser.add_argument('--output', type=str, default='./Accuracy_data_all.csv', help='Output CSV file path') +args = parser.parse_args() +directorys = args.directories # Initialize global DataFrames to store data global_accuracy_df = pd.DataFrame() for directory in directorys: + if not os.path.exists(directory): + print(f"Warning: Directory '{directory}' does not exist, skipping...") + continue + tasknames = sorted(os.listdir(directory)) - modelnames = ['GPT4o','Claude3','Gemini','Gemini1.0','Llava-interleave','Mantis','InternVL2','internvl1.5-chat','qwen_chat', 'qwen_base', 'idefics_9b_instruct','flamingov2', 'deepseek_vl_1.3b', 'XComposer2_1.8b', 'deepseek_vl_7b', 'idefics2_8b', 'XComposer2'] + # Default model names if not specified + if args.models: + modelnames = args.models + else: + modelnames = ['GPT4o','Claude3','Gemini','Gemini1.0','Llava-interleave','Mantis','InternVL2','internvl1.5-chat','qwen_chat', 'qwen_base', 'idefics_9b_instruct','flamingov2', 'deepseek_vl_1.3b', 'XComposer2_1.8b', 'deepseek_vl_7b', 'idefics2_8b', 'XComposer2', 'qwen3-vl'] # modelnames = ['Llava-interleave'] # Initialize dictionaries to store data accuracy_data = {modelname: [] for modelname in modelnames} @@ -69,12 +81,24 @@ # Append to global DataFrames global_accuracy_df = pd.concat([global_accuracy_df, accuracy_df]) -# Calculate the overall 
average for each model -global_accuracy_df.loc['Overall'] = global_accuracy_df.mean() +# If multiple directories were provided, average results for duplicate task names +if len(directorys) > 1: + # Group by task name and average across directories (skip NaN values) + global_accuracy_df = global_accuracy_df.groupby(global_accuracy_df.index).mean(skipna=True) + +# Calculate the overall average for each model (skip NaN values) +global_accuracy_df.loc['Overall'] = global_accuracy_df.mean(skipna=True) + +# Print accuracy results +print("\n" + "="*80) +print("ACCURACY RESULTS") +print("="*80) +print(global_accuracy_df.to_string()) +print("="*80) # Save global DataFrames to CSV files -global_accuracy_df.to_csv('./Accuracy_data_all.csv') +global_accuracy_df.to_csv(args.output) -print("Global DataFrames have been saved as CSV files.") +print(f"\nAccuracy results have been saved to: {args.output}") diff --git a/fix_image_paths.py b/fix_image_paths.py new file mode 100644 index 0000000..1e6067c --- /dev/null +++ b/fix_image_paths.py @@ -0,0 +1,40 @@ +import json +import os + +# Load the JSON file +with open('VLMEvalKit/all.json', 'r') as f: + data = json.load(f) + +# Update image paths +updated = 0 +for entry in data: + old_paths = entry.get('input_image_path', []) + if isinstance(old_paths, list): + # Remove the category prefix (e.g., "Low-level-semantic/") and update path + new_paths = [] + for p in old_paths: + # Path format: ./Low-level-semantic/task_name/image.jpg + # We need: ../MMIU-Benchmark/task_name/image.jpg + parts = p.split('/') + if len(parts) >= 3: + # Skip the category directory (parts[1]) and keep task/image + new_path = '../MMIU-Benchmark/' + '/'.join(parts[2:]) + else: + # Fallback: just replace ./ with ../MMIU-Benchmark/ + new_path = p.replace('./', '../MMIU-Benchmark/') + new_paths.append(new_path) + entry['input_image_path'] = new_paths + updated += 1 + +# Save updated JSON +with open('VLMEvalKit/all.json', 'w') as f: + json.dump(data, f, indent=2) + 
+print(f'Updated {updated} entries with corrected image paths') + +# Verify one path +if len(data) > 0: + sample_path = data[0]['input_image_path'][0] + print(f'\nSample path: {sample_path}') + print(f'Path exists (from VLMEvalKit/): {os.path.exists(os.path.join("VLMEvalKit", sample_path))}') + diff --git a/results/Egocentric_Video_QuestionAnswering/qwen3-vl/metadata_info.json b/results/Egocentric_Video_QuestionAnswering/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..a57d6ce --- /dev/null +++ b/results/Egocentric_Video_QuestionAnswering/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: microwave\nB: refrigerator\nC: stove\nD: television", "question": "which object changed its status when the person do the first action did before he/she point to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: microwave\nB: refrigerator\nC: stove\nD: television", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_0_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: apple1\nB: orange2\nC: banana3\nD: grape4", "question": "which object changed its status when the person put something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: apple1\nB: orange2\nC: banana3\nD: grape4", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_19.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_1_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: broken\nB: emptiness\nC: cleanliness\nD: fullness", "question": "what status of cup changed while the person do the first action did before he/she wash something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: broken\nB: emptiness\nC: cleanliness\nD: fullness", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_2_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: locked\nB: opened\nC: half-opened\nD: closed", "question": "what will the status of fridge change to if the actor do the first action in the video in the future?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: locked\nB: opened\nC: half-opened\nD: closed", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_19.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_3_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Put water-pot to table\nB: Placed water-pot on shelf\nC: Put water-pot to floor\nD: Moved water-pot to window", "question": "How did the person changed the spatial relationships of the last object that has status change in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put water-pot to table\nB: Placed water-pot on shelf\nC: Put water-pot to floor\nD: Moved water-pot to window", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_4_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: I don't know\nB: maybe\nC: yes\nD: no", "question": "Does the first action did after the person point to something fulfills the preconditions of the action eating something with something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: I don't know\nB: maybe\nC: yes\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_19.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_5_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Reading a book quietly\nB: Chopping vegetables on a board\nC: Put fish to basin using fishing-net\nD: Playing a musical instrument", "question": "During which action does the person knows about the other person's action?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Reading a book quietly\nB: Chopping vegetables on a board\nC: Put fish to basin using fishing-net\nD: Playing a musical instrument", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_6_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: sometimes\nB: maybe\nC: no\nD: yes", "question": "If the person did not get something from something, is the person able to open something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: maybe\nC: no\nD: yes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_19.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_7_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Sit on the couch\nB: Turn on the TV\nC: Open microwave\nD: Close the window", "question": "what will the other person do next?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Sit on the couch\nB: Turn on the TV\nC: Open microwave\nD: Close the window", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_8_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Pick up the book\nB: Put cup to the other person\nC: Turn off the lights\nD: Close the door", "question": "If the person did not do the last action in the video, what remaining actions in the video is executable?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Pick up the book\nB: Put cup to the other person\nC: Turn off the lights\nD: Close the door", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_9_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: maybe\nC: yes\nD: sometimes", "question": "If the person did not sweep something using something, is the person able to turn off something with something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: yes\nD: sometimes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_10_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: partially\nB: yes\nC: maybe\nD: no", "question": "Did the attribute of remote changed because of the first action in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: partially\nB: yes\nC: maybe\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_11_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Put sandwich to plate\nB: Take sandwich off the plate\nC: Throw sandwich away\nD: Put sandwich in the fridge", "question": "What is the last action the person did in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put sandwich to plate\nB: Take sandwich off the plate\nC: Throw sandwich away\nD: Put sandwich in the fridge", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_12_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Wash knife\nB: Dry dishes\nC: Cook meal\nD: Sweep floor", "question": "what is the other person doing while the person put something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Wash knife\nB: Dry dishes\nC: Cook meal\nD: Sweep floor", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_13_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: not sure\nB: maybe\nC: yes\nD: no", "question": "Does the last action in the video fulfills the preconditions of the action putting something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: not sure\nB: maybe\nC: yes\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_14_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Put the fork in the fridge\nB: Dropped the fork on the floor\nC: Get fork from table\nD: Mixed the fork with a spoon", "question": "How did the person changed the state of mixture of fork?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put the fork in the fridge\nB: Dropped the fork on the floor\nC: Get fork from table\nD: Mixed the fork with a spoon", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_15_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: maybe\nB: no\nC: sometimes\nD: yes", "question": "Does the last action in the video fulfills the preconditions of the action putting something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: sometimes\nD: yes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_16_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: falling to the ground\nB: broken in half\nC: completely detached\nD: attached to knife base", "question": "What is the status of knife after the person do the first action did before he/she get something from something to change it?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: falling to the ground\nB: broken in half\nC: completely detached\nD: attached to knife base", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_17_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: remote\nB: book\nC: lamp\nD: cup", "question": "which object changed its status first in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: remote\nB: book\nC: lamp\nD: cup", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_18_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: in fridge\nB: in microwave\nC: on table\nD: in sink", "question": "what will the status of cup1 change to if the actor put something to something in the future?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in fridge\nB: in microwave\nC: on table\nD: in sink", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_19_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: maybe\nC: yes\nD: sometimes", "question": "Did the attribute of controller changed because of the first action did before the person point to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: yes\nD: sometimes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_20_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Add seasoning to meat\nB: Cut meat with a knife\nC: Put meat in oven\nD: Get meat from pan using fork", "question": "How did the person changed the wrappedness of meat1?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Add seasoning to meat\nB: Cut meat with a knife\nC: Put meat in oven\nD: Get meat from pan using fork", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_21_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Put the cup in the fridge\nB: Wash cup\nC: Break the cup\nD: Throw the cup away", "question": "what will the person do next after this video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put the cup in the fridge\nB: Wash cup\nC: Break the cup\nD: Throw the cup away", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_22_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: sometimes\nC: no\nD: maybe", "question": "If the person did not do the first action did before he/she drink something with something, is the person able to wash something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: sometimes\nC: no\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_23_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Get kettle from stove\nB: Pick up a spoon\nC: Open the fridge\nD: Turn on the faucet", "question": "What is the first action the person did in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get kettle from stove\nB: Pick up a spoon\nC: Open the fridge\nD: Turn on the faucet", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_24_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Get bowl from microwave\nB: Put a cup inside\nC: Turned it on without food\nD: Left it empty", "question": "How did the person changed the emptiness of microwave?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get bowl from microwave\nB: Put a cup inside\nC: Turned it on without food\nD: Left it empty", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_25_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: frozen\nB: boiled\nC: cooked\nD: raw", "question": "What does the person want meat1 to be for the action cooking something using something in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: frozen\nB: boiled\nC: cooked\nD: raw", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_26_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: next to the sink\nB: inside the cabinet\nC: under the table\nD: on top of knife", "question": "What is the status of watermelon2 before the person put something to something using knife to change it?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: next to the sink\nB: inside the cabinet\nC: under the table\nD: on top of knife", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_27_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: under juicer base\nB: behind juicer base\nC: next to juicer base\nD: on top of juicer base", "question": "What does the person want the last object that has status change in the video to be for the action putting something to something in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: under juicer base\nB: behind juicer base\nC: next to juicer base\nD: on top of juicer base", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_28_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Hold controller\nB: Drop controller\nC: Throw controller\nD: Put controller to table", "question": "what is the other person doing while the person do the first action did after he/she turn off something with something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Hold controller\nB: Drop controller\nC: Throw controller\nD: Put controller to table", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_29_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: it depends\nC: no\nD: maybe", "question": "If the person did not open something, is the person able to pour from something into something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: it depends\nC: no\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_30_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: only if they do the second action\nC: no\nD: maybe", "question": "If the person did not do the first action in the video, will juicer-lid change its status?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: only if they do the second action\nC: no\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_31_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Eating a snack\nB: Talking on the phone\nC: Point to TV\nD: Reading a book", "question": "What is the person doing before he/she stand-up?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Eating a snack\nB: Talking on the phone\nC: Point to TV\nD: Reading a book", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_32_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Ignore it\nB: Wipe with a dry cloth\nC: Use a paper towel\nD: Wash cutting-board", "question": "How did the person changed the cleanliness of cutting-board?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Ignore it\nB: Wipe with a dry cloth\nC: Use a paper towel\nD: Wash cutting-board", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_33_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: no\nC: possibly\nD: uncertain", "question": "Did the attribute of closet changed because of the action closing something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: possibly\nD: uncertain", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_34_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Open a window\nB: Read a book\nC: Make a phone call\nD: Get remote from shelf", "question": "What is the person doing before he/she turn on something with something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Open a window\nB: Read a book\nC: Make a phone call\nD: Get remote from shelf", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_35_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: cannot be determined\nB: no\nC: yes\nD: not sure", "question": "Is kettle-base visible to the other person before the person do the first action in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: cannot be determined\nB: no\nC: yes\nD: not sure", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_36_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Drive a car\nB: Take a nap\nC: Put juicer to juicer-base\nD: Read a book", "question": "what will the other person do next?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Drive a car\nB: Take a nap\nC: Put juicer to juicer-base\nD: Read a book", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_37_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: maybe\nB: yes\nC: no\nD: sometimes", "question": "If the person did not sweep something using something, will the last object that has status change in the video change its status?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: yes\nC: no\nD: sometimes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_38_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: taste\nB: color\nC: shape\nD: size", "question": "what status will the person change on tomato?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: taste\nB: color\nC: shape\nD: size", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_39_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Move vacuum to closet\nB: Get vacuum from floor\nC: Leave vacuum outside\nD: Put vacuum on table", "question": "How did the person changed the spatial relationships of the last object that has status change in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Move vacuum to closet\nB: Get vacuum from floor\nC: Leave vacuum outside\nD: Put vacuum on table", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_40_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: door\nB: sink\nC: chair\nD: table", "question": "which object changed its status when the person do the first action did before he/she fill something using something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: door\nB: sink\nC: chair\nD: table", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_41_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: height\nB: wateredness\nC: humidity\nD: brightness", "question": "Which attribute does the person want to change with plant for doing the action pouring from something into something in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: height\nB: wateredness\nC: humidity\nD: brightness", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_42_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: wateredness\nB: leaf size\nC: height\nD: color", "question": "what status of plant changed while the person do the first action did after he/she fill something using something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: wateredness\nB: leaf size\nC: height\nD: color", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_43_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Cook food in the kitchen\nB: Watch TV instead of fishing\nC: Get fishing-net and fish from basin and fishing-net\nD: Play a game on the computer", "question": "If the person did not fill something using something, what remaining actions in the video is executable?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Cook food in the kitchen\nB: Watch TV instead of fishing\nC: Get fishing-net and fish from basin and fishing-net\nD: Play a game on the computer", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_17.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_44_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: plate\nB: spoon\nC: cup\nD: fork", "question": "which object changed its status last in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: plate\nB: spoon\nC: cup\nD: fork", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_45_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Work on juicer-lid\nB: Read a book\nC: Cook dinner\nD: Go for a run", "question": "what will the person do next after this video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Work on juicer-lid\nB: Read a book\nC: Cook dinner\nD: Go for a run", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_46_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: size\nB: openess\nC: brand\nD: color", "question": "Which attribute does the person want to change with fridge for doing the last action in the video in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: size\nB: openess\nC: brand\nD: color", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_47_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: sometimes\nB: only if the person opens something else\nC: no\nD: yes", "question": "If the person did not close something, will cereal1 change its status?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: only if the person opens something else\nC: no\nD: yes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_48_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: uncertain\nB: no\nC: yes\nD: maybe", "question": "Did the attribute of lettuce changed because of the first action did after the person get something from something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: uncertain\nB: no\nC: yes\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_49_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Talking on the phone\nB: Walking away\nC: Eating noodles\nD: Get noodles from table", "question": "what is the other person doing while the person do the first action in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Talking on the phone\nB: Walking away\nC: Eating noodles\nD: Get noodles from table", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_50_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: wrapping\nB: lamp\nC: table\nD: chair", "question": "which object changed its status first in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: wrapping\nB: lamp\nC: table\nD: chair", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_51_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: no\nC: maybe\nD: sometimes", "question": "If the person did not get something from something, is the person able to put something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: maybe\nD: sometimes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_52_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Break the seal\nB: Flip the switch\nC: Open wrapping\nD: Cut the ribbon", "question": "What action caused the first object that has status change in the video's status to change to opened?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Break the seal\nB: Flip the switch\nC: Open wrapping\nD: Cut the ribbon", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_53_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: sometimes\nC: no\nD: maybe", "question": "If the person did not open something, is the person able to put something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: sometimes\nC: no\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_54_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Place fork on table\nB: Dry fork\nC: Pick up spoon\nD: Wash fork", "question": "If the person did not do the first action did after he/she put something to something, what remaining actions in the video is executable?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Place fork on table\nB: Dry fork\nC: Pick up spoon\nD: Wash fork", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_55_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: under the table\nB: inside the cupboard\nC: in the sink\nD: on top of shelf", "question": "What is the status of plate before the other person do the first action before he/she put something to something to change it?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: under the table\nB: inside the cupboard\nC: in the sink\nD: on top of shelf", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_56_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: attached to fork\nB: on the plate\nC: in the pan\nD: detached from fork", "question": "What is the status of meat3 before the person do the first action did after he/she get something from something using fork to change it?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: attached to fork\nB: on the plate\nC: in the pan\nD: detached from fork", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_57_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: chair\nB: lamp\nC: book\nD: knife", "question": "If the actor do not get something from something, which object will he/she not be able to change in the future?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: chair\nB: lamp\nC: book\nD: knife", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_58_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: I don\u2019t know\nB: yes\nC: maybe\nD: no", "question": "Did the attribute of the last object that has status change in the video changed because of the action turning off something with something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: I don\u2019t know\nB: yes\nC: maybe\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_59_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: yes\nC: maybe\nD: uncertain", "question": "Did the attribute of fridge changed because of the action closing something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes\nC: maybe\nD: uncertain", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_60_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: maybe\nB: no\nC: yes\nD: I don\u2019t know", "question": "Did the attribute of fridge changed because of the action opening something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: I don\u2019t know", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_61_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: no\nC: sometimes\nD: unknown", "question": "Is the other person aware when the person stand-up?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: sometimes\nD: unknown", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_62_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: no\nC: maybe\nD: sometimes", "question": "Did the attribute of juicer changed because of the first action did before the person open something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: maybe\nD: sometimes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_63_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Get lettuce from lettuce\nB: Go to sleep\nC: Write a report\nD: Eat a sandwich", "question": "What is the person doing after he/she work on something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get lettuce from lettuce\nB: Go to sleep\nC: Write a report\nD: Eat a sandwich", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_64_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: partially wrapped\nB: wrapped\nC: double wrapped\nD: unwrapped", "question": "what will the person want to have coffee's wrappedness be in the future?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: partially wrapped\nB: wrapped\nC: double wrapped\nD: unwrapped", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_65_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: maybe\nB: sometimes\nC: yes\nD: no", "question": "If the person did not do the first action did after he/she turn on something with something, is the person able to get something from something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: sometimes\nC: yes\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_66_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: no\nC: often\nD: sometimes", "question": "Did the attribute of fishing-net changed because of the action filling something using something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: often\nD: sometimes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_67_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: in the car\nB: in closet\nC: on the table\nD: under the bed", "question": "What does the person want cup to be for the action putting something to something in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in the car\nB: in closet\nC: on the table\nD: under the bed", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_68_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Reading a book\nB: Playing video games\nC: Cooking dinner\nD: Wash juicer and juicer-lid", "question": "what is the other person doing while the person do the first action did before he/she get something from something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Reading a book\nB: Playing video games\nC: Cooking dinner\nD: Wash juicer and juicer-lid", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_69_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: sometimes\nB: maybe\nC: yes\nD: no", "question": "If the person did not open something, will the first object that has status change in the video change its status?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: maybe\nC: yes\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_70_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: separate from the other person\nB: attached to the other person\nC: above the other person\nD: beneath the other person", "question": "what will the person want to have juice's spatial relationships be in the future?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: separate from the other person\nB: attached to the other person\nC: above the other person\nD: beneath the other person", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_71_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Pour from bottle-water into juicer\nB: Running a Marathon\nC: Playing a Piano\nD: Sleeping", "question": "What is the person doing after he/she open something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Pour from bottle-water into juicer\nB: Running a Marathon\nC: Playing a Piano\nD: Sleeping", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_72_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: not enough information\nB: no\nC: maybe\nD: yes", "question": "If the person did not do the first action did before he/she pour from something into something, will spoon change its status?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: not enough information\nB: no\nC: maybe\nD: yes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_73_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: sometimes\nC: maybe\nD: no", "question": "Did the attribute of closet changed because of the first action did after the person put something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: sometimes\nC: maybe\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_74_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: temporal relationships\nB: emotional status\nC: spatial relationships\nD: frequency", "question": "what status of cup changed while the person get something from something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: temporal relationships\nB: emotional status\nC: spatial relationships\nD: frequency", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_75_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: placed on the table\nB: attached to me\nC: inside the drawer\nD: on the kitchen counter", "question": "What is the status of knife before the person put something to something to change it?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: placed on the table\nB: attached to me\nC: inside the drawer\nD: on the kitchen counter", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_76_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: irrelevant\nB: no\nC: yes\nD: partially", "question": "Did the attribute of knife changed because of the first action did after the person wash something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: irrelevant\nB: no\nC: yes\nD: partially", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_77_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: maybe\nB: yes\nC: no\nD: unsure", "question": "If the person did not do the first action in the video, will tv change its status?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: yes\nC: no\nD: unsure", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_78_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Get juicer from juicer-base\nB: Turn on the blender\nC: Chop vegetables\nD: Check the timer", "question": "What is the person doing after he/she put something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get juicer from juicer-base\nB: Turn on the blender\nC: Chop vegetables\nD: Check the timer", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_79_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: broken\nB: opened\nC: painted\nD: removed", "question": "what will the person want to have the last object that has status change in the video's openess be in the future?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: broken\nB: opened\nC: painted\nD: removed", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_80_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Put cup to shelf\nB: Cook meal\nC: Wash car\nD: Water plants", "question": "If the person did not do the first action did after he/she close something, what remaining actions in the video is not executable?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put cup to shelf\nB: Cook meal\nC: Wash car\nD: Water plants", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_81_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: wet\nB: broken\nC: dirty\nD: clean", "question": "What is the status of plate after the person wash something to change it?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: wet\nB: broken\nC: dirty\nD: clean", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_82_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Get tomato from table\nB: Wash the knife\nC: Chop the tomato\nD: Slice the bread", "question": "What is the person doing before he/she cut something using something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get tomato from table\nB: Wash the knife\nC: Chop the tomato\nD: Slice the bread", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_83_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Cook a meal\nB: Read a book\nC: Paint a picture\nD: Put fish to basin using tank", "question": "If the person did not get something from something using fishing-net, what remaining actions in the video is not executable?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Cook a meal\nB: Read a book\nC: Paint a picture\nD: Put fish to basin using tank", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_84_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: only sometimes\nB: maybe\nC: yes\nD: no", "question": "If the person did not pour from something into something, will kettle-lid change its status?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: only sometimes\nB: maybe\nC: yes\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_85_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Read a book\nB: Cook a meal\nC: Get coffee from shelf\nD: Wash the dishes", "question": "what is the other person doing while the person do the first action in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Read a book\nB: Cook a meal\nC: Get coffee from shelf\nD: Wash the dishes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_86_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Put tank to table\nB: Take tank from table\nC: Move tank to floor\nD: Put table to tank", "question": "What is the person doing before he/she get something from something and fishing-net?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put tank to table\nB: Take tank from table\nC: Move tank to floor\nD: Put table to tank", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_87_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: sofa\nB: lamp\nC: table\nD: vacuum", "question": "which object changed its status first in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sofa\nB: lamp\nC: table\nD: vacuum", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_88_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Reading a book\nB: Eating lunch\nC: Taking a nap\nD: Put wrapping to table", "question": "what is the other person doing while the person do the first action did before he/she close something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Reading a book\nB: Eating lunch\nC: Taking a nap\nD: Put wrapping to table", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_89_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: half-squeezed\nB: in market\nC: in juicer\nD: unpeeled", "question": "what will the status of orange2 change to if the actor do the first action did before he/she get something from something in the future?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: half-squeezed\nB: in market\nC: in juicer\nD: unpeeled", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_90_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Slice the watermelon\nB: Peel the watermelon\nC: Throw away the watermelon\nD: Put watermelon to juicer", "question": "what will the other person do next?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Slice the watermelon\nB: Peel the watermelon\nC: Throw away the watermelon\nD: Put watermelon to juicer", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_91_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: maybe\nB: possibly\nC: no\nD: yes", "question": "Did the attribute of kettle-base changed because of the first action did after the person close something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: possibly\nC: no\nD: yes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_92_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Reading a book\nB: Boiling water\nC: Put kettle to table\nD: Sitting on a chair", "question": "what is the other person doing while the person do the first action did before he/she point to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Reading a book\nB: Boiling water\nC: Put kettle to table\nD: Sitting on a chair", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_93_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: it depends\nB: yes\nC: sometimes\nD: no", "question": "Does the first action did before the person turn on something with something fulfills the preconditions of the action getting something from something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: it depends\nB: yes\nC: sometimes\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_94_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Place kettle on stovetop\nB: Fill kettle using sink\nC: Turn on the kettle\nD: Add tea leaves to kettle", "question": "What action caused kettle's status to change to nonempty?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Place kettle on stovetop\nB: Fill kettle using sink\nC: Turn on the kettle\nD: Add tea leaves to kettle", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_95_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: no\nC: maybe\nD: sometimes", "question": "Did the attribute of tomato1 changed because of the first action did before the person wash something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: maybe\nD: sometimes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_96_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Stand up and walk away\nB: Sit down on sofa\nC: Open the window\nD: Start cooking dinner", "question": "what will the other person do next?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Stand up and walk away\nB: Sit down on sofa\nC: Open the window\nD: Start cooking dinner", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_97_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Watch TV\nB: Read a book\nC: Go for a run\nD: Work on noodles", "question": "what will the person do next after this video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Watch TV\nB: Read a book\nC: Go for a run\nD: Work on noodles", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_98_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: knife\nB: cup\nC: phone\nD: pen", "question": "which object changed its status when the other person get something from something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: knife\nB: cup\nC: phone\nD: pen", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_99_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Close the tank lid\nB: Put tank on table\nC: Put tank to sink\nD: Pour tank contents into a glass", "question": "What is the person doing after he/she pour from something into something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Close the tank lid\nB: Put tank on table\nC: Put tank to sink\nD: Pour tank contents into a glass", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_100_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: maybe\nC: no\nD: sometimes", "question": "If the person did not do the first action did before he/she open something, is the person able to get something from something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: no\nD: sometimes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_101_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: in room temperature\nB: in boiling water\nC: in the freezer\nD: in the microwave", "question": "what will the person want to have the last object that has status change in the video's temperature be in the future?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in room temperature\nB: in boiling water\nC: in the freezer\nD: in the microwave", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_102_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Tie shoes\nB: Read a book\nC: Drink water from cup\nD: Get meat from floor", "question": "What is the person doing before he/she throw something into something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Tie shoes\nB: Read a book\nC: Drink water from cup\nD: Get meat from floor", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_103_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: can not be opened\nB: left part removed\nC: completely sealed\nD: right part added", "question": "What is the precondition of changing the openability of meat2?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: can not be opened\nB: left part removed\nC: completely sealed\nD: right part added", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_104_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: undecided\nB: no\nC: yes\nD: maybe", "question": "Did the attribute of cutting-board changed because of the first action did after the person point to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: undecided\nB: no\nC: yes\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_105_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: cleanliness\nB: weight\nC: sharpness\nD: color", "question": "Which attribute does the person want to change with knife for doing the last action in the video in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: cleanliness\nB: weight\nC: sharpness\nD: color", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_106_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: lettuce\nB: carrot\nC: pepper\nD: tomato", "question": "which object changed its status when the other person do the last action in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: lettuce\nB: carrot\nC: pepper\nD: tomato", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_107_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Leave the watermelon unpeeled\nB: Wash the cutting-board\nC: Cut the watermelon on the floor\nD: Get watermelon from cutting-board", "question": "If the other person did not wash something, what actions of this person in the video is executable?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Leave the watermelon unpeeled\nB: Wash the cutting-board\nC: Cut the watermelon on the floor\nD: Get watermelon from cutting-board", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_17.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_108_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: always on\nB: could be turned off\nC: always off\nD: could be turned on", "question": "What is the precondition of changing the switchability of the last object that has status change in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: always on\nB: could be turned off\nC: always off\nD: could be turned on", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_109_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: not sure\nB: maybe\nC: no\nD: yes", "question": "Did the attribute of fridge changed because of the action closing something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: not sure\nB: maybe\nC: no\nD: yes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_110_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: yes\nC: not sure\nD: maybe", "question": "If the person did not do the last action in the video, is the person able to put something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes\nC: not sure\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_111_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: max temperature\nB: on\nC: off\nD: half full", "question": "What is the precondition of changing the poweredness of kettle?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: max temperature\nB: on\nC: off\nD: half full", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_112_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: maybe\nB: no\nC: yes\nD: not sure", "question": "Did the attribute of vacuum changed because of the last action in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: not sure", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_113_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: maybe\nB: sometimes\nC: yes\nD: no", "question": "Is milk visible to the other person after the person do the first action did after he/she open something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: sometimes\nC: yes\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_114_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: maybe\nC: not sure\nD: no", "question": "Does the first action in the video fulfills the preconditions of the action opening something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: not sure\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_115_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: sometimes\nB: only if it is a chair\nC: yes\nD: no", "question": "Does the action sitting down on something fulfills the preconditions of the action putting something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: only if it is a chair\nC: yes\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_116_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: on the side of shelf\nB: next to the door\nC: on the edge of table\nD: under the chair", "question": "What is the status of vacuum before the person get something from something to change it?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: on the side of shelf\nB: next to the door\nC: on the edge of table\nD: under the chair", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_117_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: yes\nC: maybe\nD: sometimes", "question": "Does the action sitting down on something fulfills the preconditions of the action switching with something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes\nC: maybe\nD: sometimes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_118_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: fishing-rod\nB: boat\nC: fishing-net\nD: life-jacket", "question": "which object changed its status when the person do the first action in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: fishing-rod\nB: boat\nC: fishing-net\nD: life-jacket", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_119_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Moved the milk to the fridge\nB: Placed the milk on the floor\nC: Put milk to table\nD: Took the milk off the table", "question": "How did the person changed the spatial relationships of the first object that has status change in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Moved the milk to the fridge\nB: Placed the milk on the floor\nC: Put milk to table\nD: Took the milk off the table", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_17.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_120_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: spoon\nB: knife\nC: plate\nD: fork", "question": "which object changed its status when the other person do the first action before he/she eat something with something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: spoon\nB: knife\nC: plate\nD: fork", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_121_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: sometimes\nB: maybe\nC: yes\nD: no", "question": "If the person did not do the first action did before he/she wash something, will sink change its status?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: maybe\nC: yes\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_122_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: maybe\nC: yes\nD: sometimes", "question": "If the person did not do the first action did before he/she get something from something, is the person able to put something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: yes\nD: sometimes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_123_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: yes\nC: maybe\nD: sometimes", "question": "Did the attribute of meat changed because of the action putting something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes\nC: maybe\nD: sometimes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_124_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Get meat from meat using spoon\nB: Use a knife to cut the meat\nC: Boil the meat to change its shape\nD: Squash the meat with a fork", "question": "How did the person changed the shape of meat2?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get meat from meat using spoon\nB: Use a knife to cut the meat\nC: Boil the meat to change its shape\nD: Squash the meat with a fork", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_17.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_125_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: only partially\nB: maybe\nC: no\nD: yes", "question": "Did the attribute of spoon changed because of the first action did after the person put something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: only partially\nB: maybe\nC: no\nD: yes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_126_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: uncertain\nB: no\nC: maybe\nD: yes", "question": "Does the first action did before the person put something to something fulfills the preconditions of the action opening something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: uncertain\nB: no\nC: maybe\nD: yes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_127_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: yes\nC: only if watermelon1 changes its status first\nD: maybe", "question": "If the person did not do the first action did before he/she get something from something, will watermelon2 change its status?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes\nC: only if watermelon1 changes its status first\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_128_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: in the fridge\nB: on the table\nC: in cup1\nD: in cup2", "question": "What is the status of juice before the person do the first action did after he/she get something from something to change it?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in the fridge\nB: on the table\nC: in cup1\nD: in cup2", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_129_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: in the garden\nB: on the table\nC: under the bed\nD: in sink", "question": "What does the person want tank to be for the first action did before the person pour from something into something in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in the garden\nB: on the table\nC: under the bed\nD: in sink", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_130_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: in the refrigerator\nB: under the sink\nC: on top of stove\nD: outside in the garden", "question": "What is the status of water-pot after the other person put something to something to change it?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in the refrigerator\nB: under the sink\nC: on top of stove\nD: outside in the garden", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_131_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Get watermelon from cutting-board\nB: Wash hands\nC: Put apple on the counter\nD: Chop vegetables", "question": "What is the last action the person did in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get watermelon from cutting-board\nB: Wash hands\nC: Put apple on the counter\nD: Chop vegetables", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_132_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: possibly\nB: yes\nC: no\nD: unknown", "question": "Does the first action in the video fulfills the preconditions of the action pouring from something into something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: possibly\nB: yes\nC: no\nD: unknown", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_133_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: maybe\nB: no\nC: I am not sure\nD: yes", "question": "Did the attribute of controller1 changed because of the first action did before the person stand-up?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: I am not sure\nD: yes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_134_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: full\nB: half-full\nC: boiling\nD: empty", "question": "What is the status of kettle after the person do the last action in the video to change it?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: full\nB: half-full\nC: boiling\nD: empty", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_135_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Turned on the lights\nB: Plugged in the TV\nC: Turned off the remote\nD: Turn on TV with remote", "question": "How did the person changed the poweredness of the first object that has status change in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Turned on the lights\nB: Plugged in the TV\nC: Turned off the remote\nD: Turn on TV with remote", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_136_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: maybe\nC: sometimes\nD: no", "question": "Does the action putting something to something fulfills the preconditions of the action watching something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: sometimes\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_137_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: maybe\nB: no\nC: yes\nD: only if performed sequentially", "question": "Does the action getting something from something fulfills the preconditions of the action putting something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: only if performed sequentially", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_138_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Cooking food on stove\nB: Fill water-pot using water-dispenser\nC: Reading a book\nD: Talking on the phone", "question": "what is the other person doing while the person stand-up?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Cooking food on stove\nB: Fill water-pot using water-dispenser\nC: Reading a book\nD: Talking on the phone", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_17.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_139_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: only if they open the fridge again\nB: yes\nC: maybe\nD: no", "question": "Is fridge visible to the other person after the person do the first action did after he/she put something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: only if they open the fridge again\nB: yes\nC: maybe\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_140_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: no\nC: not sure\nD: maybe", "question": "Did the attribute of fridge changed because of the action opening something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: not sure\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_141_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Get cereal from table\nB: Turn off the lights\nC: Wash dishes\nD: Open fridge", "question": "What is the last action the person did in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get cereal from table\nB: Turn off the lights\nC: Wash dishes\nD: Open fridge", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_142_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Put orange in the fridge\nB: Put watermelon to fridge\nC: Take watermelon out of the fridge\nD: Take apples from the table", "question": "What is the person doing before he/she get something from something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put orange in the fridge\nB: Put watermelon to fridge\nC: Take watermelon out of the fridge\nD: Take apples from the table", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_17.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_143_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: probably\nC: maybe\nD: no", "question": "Did the attribute of the first object that has status change in the video changed because of the action getting something from something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: probably\nC: maybe\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_144_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: television\nB: remote\nC: phone\nD: computer", "question": "If the actor do not put something to something, which object will he/she not be able to change in the future?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: television\nB: remote\nC: phone\nD: computer", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_145_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Throw the cup away\nB: Break the cup\nC: Put cup to the other person\nD: Keep the cup for themselves", "question": "If the person did not do the first action did after he/she get something from something, what remaining actions in the video is executable?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Throw the cup away\nB: Break the cup\nC: Put cup to the other person\nD: Keep the cup for themselves", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_146_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: mixing\nB: harvesting\nC: watering\nD: pruning", "question": "What does the person want plant to be for the first action did before the person fill something using something in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: mixing\nB: harvesting\nC: watering\nD: pruning", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_147_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: sometimes\nB: yes\nC: no\nD: maybe", "question": "If the person did not do the first action did after he/she fill something using something, is the person able to work on something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: yes\nC: no\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_148_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: emptiness\nB: happiness\nC: fullness\nD: sadness", "question": "what status will the person change on juicer-base?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: emptiness\nB: happiness\nC: fullness\nD: sadness", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_149_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: yes\nC: uncertain\nD: maybe", "question": "Did the attribute of water-pot changed because of the first action did after the person sit down on something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes\nC: uncertain\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_150_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: kettle\nB: towel\nC: window\nD: chair", "question": "which object changed its status when the other person do the last action in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: kettle\nB: towel\nC: window\nD: chair", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_151_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Put cup to cutting-board\nB: Playing a game\nC: Reading a book\nD: Watching TV", "question": "what is the other person doing while the person open something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put cup to cutting-board\nB: Playing a game\nC: Reading a book\nD: Watching TV", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_152_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: state of separation\nB: state of mixture\nC: state of disintegration\nD: state of dissolution", "question": "what status of noodles changed while the person do the last action in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: state of separation\nB: state of mixture\nC: state of disintegration\nD: state of dissolution", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_153_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: in bottle\nB: in cup2\nC: in cup1\nD: on table", "question": "How would the first action did after the person close something change the state of milk1?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in bottle\nB: in cup2\nC: in cup1\nD: on table", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_154_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: off\nB: ignored\nC: on\nD: broken", "question": "What does the person want kettle to be for the first action did after the person work on something in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: off\nB: ignored\nC: on\nD: broken", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_155_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: yes\nC: sometimes\nD: maybe", "question": "Does the action sitting down on something fulfills the preconditions of the action drinking something with something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes\nC: sometimes\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_156_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: box1\nB: box2\nC: wrapping1\nD: wrapping2", "question": "which object changed its status when the person get something from something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: box1\nB: box2\nC: wrapping1\nD: wrapping2", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_157_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: unknown\nC: no\nD: maybe", "question": "Did the attribute of lettuce changed because of the first action did after the person put something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: unknown\nC: no\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_158_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Cook meat using microwave\nB: Cook meat using pan and stove\nC: Cook meat using oven\nD: Cook meat using grill", "question": "How did the person changed the cookedness of meat12?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Cook meat using microwave\nB: Cook meat using pan and stove\nC: Cook meat using oven\nD: Cook meat using grill", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_17.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_159_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: partially\nC: yes\nD: maybe", "question": "Did the attribute of the first object that has status change in the video changed because of the action filling something using something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: partially\nC: yes\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_160_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: underneath watermelon\nB: on top of watermelon\nC: next to watermelon\nD: inside watermelon", "question": "What is the precondition of changing the spatial relationships of watermelon1?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: underneath watermelon\nB: on top of watermelon\nC: next to watermelon\nD: inside watermelon", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_161_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: sometimes\nC: maybe\nD: no", "question": "Did the attribute of the first object that has status change in the video changed because of the action putting something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: sometimes\nC: maybe\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_162_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: vacuum cleaner\nB: refrigerator\nC: microwave\nD: television", "question": "which object changed its status when the other person do the first action in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: vacuum cleaner\nB: refrigerator\nC: microwave\nD: television", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_163_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Sit on the couch\nB: Check their phone\nC: Go outside\nD: Get bowl and spoon from table", "question": "What is the person doing after he/she point to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Sit on the couch\nB: Check their phone\nC: Go outside\nD: Get bowl and spoon from table", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_164_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: in basin\nB: on ground\nC: in tree\nD: in sky", "question": "How would the action putting something to something using fishing-net change the state of fish?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in basin\nB: on ground\nC: in tree\nD: in sky", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_165_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Read a book\nB: Get remote from table\nC: Go for a walk\nD: Start cooking dinner", "question": "what will the other person do next?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Read a book\nB: Get remote from table\nC: Go for a walk\nD: Start cooking dinner", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_166_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: probably not\nB: maybe\nC: yes\nD: no", "question": "Did the attribute of juicer changed because of the last action in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: probably not\nB: maybe\nC: yes\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_167_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Watch TV\nB: Go for a run\nC: Read a book\nD: Wash bowl", "question": "If the person did not eat something with something, what remaining actions in the video is executable?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Watch TV\nB: Go for a run\nC: Read a book\nD: Wash bowl", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_168_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Painting a portrait\nB: Playing a musical instrument\nC: Cook meat using fork and pan and stove\nD: Reading a book by the fireplace", "question": "what is the other person doing while the person do the last action in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Painting a portrait\nB: Playing a musical instrument\nC: Cook meat using fork and pan and stove\nD: Reading a book by the fireplace", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_17.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_169_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Put meat to pan using fork\nB: Put meat to pan using knife\nC: Put meat to plate using fork\nD: Put meat to pan using spatula", "question": "How did the person changed the spatial relationships of meat1?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put meat to pan using fork\nB: Put meat to pan using knife\nC: Put meat to plate using fork\nD: Put meat to pan using spatula", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_17.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_170_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: maybe\nC: uncertain\nD: yes", "question": "Did the attribute of meat1 changed because of the action getting something from something using fork?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: uncertain\nD: yes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_171_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: coffee2\nB: coffee1\nC: bottle\nD: tea", "question": "which object changed its status last in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: coffee2\nB: coffee1\nC: bottle\nD: tea", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_172_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: maybe\nC: uncertain\nD: no", "question": "Does the first action in the video fulfills the preconditions of the action opening something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: uncertain\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_173_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: sometimes\nB: no\nC: maybe\nD: yes", "question": "If the person did not do the first action in the video, will drawer change its status?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: no\nC: maybe\nD: yes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_174_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: yes, the status changed due to pouring\nC: the action completed without any status change\nD: the attribute has been initialized", "question": "Did the attribute of the object has status change changed because of the action pouring from something into something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes, the status changed due to pouring\nC: the action completed without any status change\nD: the attribute has been initialized", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_17.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_175_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: sometimes\nB: no\nC: yes\nD: maybe", "question": "If the person did not do the first action did before he/she wash something, is the person able to get something from something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: no\nC: yes\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_176_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: transparent\nB: blue\nC: empty\nD: nonempty", "question": "What is the status of trash-can after the other person throw something into something to change it?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: transparent\nB: blue\nC: empty\nD: nonempty", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_177_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Put lettuce to trash-can\nB: Moved the trash-can\nC: Cleaned the trash-can\nD: Removed the lettuce", "question": "What action caused trash-can's status to change to nonempty?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Put lettuce to trash-can\nB: Moved the trash-can\nC: Cleaned the trash-can\nD: Removed the lettuce", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_178_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Turn on the light\nB: Open the refrigerator\nC: Get knife from knife-base\nD: Sit on the couch", "question": "During which action does the person knows about the other person's action?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Turn on the light\nB: Open the refrigerator\nC: Get knife from knife-base\nD: Sit on the couch", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_179_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: maybe\nB: no\nC: sometimes\nD: yes", "question": "Did the attribute of vacuum changed because of the action putting something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: sometimes\nD: yes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_180_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: inside the refrigerator\nB: next to the coffee machine\nC: on top of juicer base\nD: under the microwave", "question": "what will the person want to have the first object that has status change in the video's spatial relationships be in the future?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: inside the refrigerator\nB: next to the coffee machine\nC: on top of juicer base\nD: under the microwave", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_17.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_181_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Washed the car\nB: Sweep floor using vacuum\nC: Read a book\nD: Cooked dinner", "question": "How did the person changed the cleanliness of vacuum?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Washed the car\nB: Sweep floor using vacuum\nC: Read a book\nD: Cooked dinner", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_182_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: bowl\nB: book\nC: door\nD: lamp", "question": "which object changed its status when the person get something from something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: bowl\nB: book\nC: door\nD: lamp", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_183_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: no\nC: sometimes\nD: always", "question": "Is the other person aware when the person get something from something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: no\nC: sometimes\nD: always", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_184_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: yes\nC: I don\u2019t know\nD: maybe", "question": "If the person did not fill something using something, is the person able to do the first action in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: yes\nC: I don\u2019t know\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_185_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Close the tank-lid\nB: Get tank-lid from table\nC: Take a seat\nD: Pour water into the tank", "question": "what is the other person doing while the person do the first action did after he/she put something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Close the tank-lid\nB: Get tank-lid from table\nC: Take a seat\nD: Pour water into the tank", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_186_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: only if the object is transparent\nC: sometimes\nD: yes", "question": "Is cutting-board visible to the other person after the person put something to something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: only if the object is transparent\nC: sometimes\nD: yes", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_187_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Cut another vegetable\nB: Put knife to knife-base\nC: Throw the knife away\nD: Wash the knife", "question": "what will the person do next after this video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Cut another vegetable\nB: Put knife to knife-base\nC: Throw the knife away\nD: Wash the knife", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_188_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: in a drawer\nB: on a shelf\nC: under the bed\nD: in trash can", "question": "How would the action throwing something into something change the state of wrapping?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in a drawer\nB: on a shelf\nC: under the bed\nD: in trash can", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_189_31.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: in the dishwasher\nB: in sink\nC: on the table\nD: in the fridge", "question": "What is the status of cup before the other person do the first action after he/she put something to something to change it?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: in the dishwasher\nB: in sink\nC: on the table\nD: in the fridge", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_190_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: table\nB: tv\nC: window\nD: phone", "question": "which object changed its status when the person do the first action did after he/she stand-up?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: table\nB: tv\nC: window\nD: phone", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_191_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: no\nB: maybe\nC: yes\nD: I don't know", "question": "If the person did not throw something into something, is the person able to get something from something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: yes\nD: I don't know", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_192_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Get fishing-net from basin\nB: Throw the net into the water\nC: Cover the basin with a lid\nD: Pour water from the basin", "question": "If the person did not pour from something into something, what remaining actions in the video is executable?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Get fishing-net from basin\nB: Throw the net into the water\nC: Cover the basin with a lid\nD: Pour water from the basin", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_17.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_18.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_193_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Cut lettuce using knife\nB: Boiling water\nC: Stirring a pot\nD: Peeling an orange", "question": "What is the person doing after he/she throw something into something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Cut lettuce using knife\nB: Boiling water\nC: Stirring a pot\nD: Peeling an orange", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_194_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: sometimes\nB: yes\nC: no\nD: maybe", "question": "If the person did not do the first action did after he/she put something to something, is the person able to get something from something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: yes\nC: no\nD: maybe", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_195_31.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: yes\nB: maybe\nC: no\nD: I don\u2019t know", "question": "If the person did not do the first action in the video, will cereal change its status?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: no\nD: I don\u2019t know", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_196_31.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: maybe\nB: uncertain\nC: yes\nD: no", "question": "Does the action getting something from something fulfills the preconditions of the last action in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: maybe\nB: uncertain\nC: yes\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_197_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: Used a remote\nB: Pressed a button\nC: Get controller from table\nD: Turned on the switch", "question": "How did the person changed the poweredness of the first object that has status change in the video?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: Used a remote\nB: Pressed a button\nC: Get controller from table\nD: Turned on the switch", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_198_31.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Egocentric_Video_QuestionAnswering", "visual_input_component": "egocentric image", "source": "EgoTaskQA", "options": "A: sometimes\nB: only if the action is prolonged\nC: yes\nD: no", "question": "Did the attribute of meat changed because of the action closing something?", "context": "Your task is to understand and reasoning about activities and events from the first-person perspective. 
\nSelect from the following choices.\nA: sometimes\nB: only if the action is prolonged\nC: yes\nD: no", "input_image_path": ["./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_0.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_1.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_2.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_3.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_4.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_5.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_6.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_7.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_8.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_9.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_10.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_11.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_12.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_13.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_14.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_15.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_16.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_17.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_18.png", 
"./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_19.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_20.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_21.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_22.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_23.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_24.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_25.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_26.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_27.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_28.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_29.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_30.png", "./3D-spatial/Egocentric_Video_QuestionAnswering/Egocentric_Video_QuestionAnswering_199_31.png"], "output": "D", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/Homography_estimation/qwen3-vl/metadata_info.json b/results/Homography_estimation/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..dd3bf72 --- /dev/null +++ b/results/Homography_estimation/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nB: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n\nC: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nD: 1.7761 -0.053427 
263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. (Two images of the same planar.)\nSelect from the following choices.\nA: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nB: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n\nC: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nD: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_0_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_0_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nB: 1.2895 0.43518 -118.46\n-0.025956 1.4233 161.89\n-3.0413e-05 0.00069874 1.0013\n\nC: 4.3722 0.14407 -818.24\n-0.25209 3.9595 -549.15\n0.001718 0.0010825 0.97985\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nB: 1.2895 0.43518 -118.46\n-0.025956 1.4233 161.89\n-3.0413e-05 0.00069874 1.0013\n\nC: 4.3722 0.14407 -818.24\n-0.25209 3.9595 -549.15\n0.001718 0.0010825 0.97985\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_1_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_1_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nB: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nC: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nD: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nB: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nC: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nD: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_2_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_2_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n\nB: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nC: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nD: 5.1051 0.34986 -885.86\n1.0306 5.9768 -2733.1\n0.0033649 0.00099216 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n\nB: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nC: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nD: 5.1051 0.34986 -885.86\n1.0306 5.9768 -2733.1\n0.0033649 0.00099216 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_3_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_3_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nB: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nC: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nD: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nB: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nC: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nD: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_4_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_4_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nB: 1.547 0.11677 155.75\n0.40373 1.373 -170.1\n0.00090791 8.8782e-05 1.0012\n\nC: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n\nD: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nB: 1.547 0.11677 155.75\n0.40373 1.373 -170.1\n0.00090791 8.8782e-05 1.0012\n\nC: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n\nD: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_5_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_5_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nC: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n\nD: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nC: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n\nD: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_6_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_6_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.60879 -0.35761 289.93\n0.34822 0.61653 -30.949\n-2.0912e-05 1.3527e-06 1.014\n\nB: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nC: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nD: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.60879 -0.35761 289.93\n0.34822 0.61653 -30.949\n-2.0912e-05 1.3527e-06 1.014\n\nB: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nC: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nD: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_7_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_7_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nB: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nC: 0.98278 -0.0048237 22.209\n-0.012055 0.97088 45.658\n-7.6753e-06 -2.3467e-05 1.0001\n\nD: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nB: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nC: 0.98278 -0.0048237 22.209\n-0.012055 0.97088 45.658\n-7.6753e-06 -2.3467e-05 1.0001\n\nD: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_8_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_8_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.3184 0.1614 32.607\n0.092973 1.2239 -454.36\n-0.00072537 0.00028453 0.99713\n\nB: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nC: 0.87235 0.023622 101.75\n0.12982 0.76075 59.456\n0.0005519 9.0915e-05 1.0016\n\nD: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.3184 0.1614 32.607\n0.092973 1.2239 -454.36\n-0.00072537 0.00028453 0.99713\n\nB: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nC: 0.87235 0.023622 101.75\n0.12982 0.76075 59.456\n0.0005519 9.0915e-05 1.0016\n\nD: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_9_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_9_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nB: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n\nC: 0.15114 -0.00089399 241.66\n-0.078633 0.45918 14.453\n-0.00033245 3.1152e-05 0.99996\n\nD: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nB: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n\nC: 0.15114 -0.00089399 241.66\n-0.078633 0.45918 14.453\n-0.00033245 3.1152e-05 0.99996\n\nD: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_10_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_10_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nC: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n\nD: 0.52949 -0.028655 46.849\n-0.2451 0.79991 158.44\n-0.00032499 -1.8164e-05 0.99959\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nC: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n\nD: 0.52949 -0.028655 46.849\n-0.2451 0.79991 158.44\n-0.00032499 -1.8164e-05 0.99959\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_11_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_11_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nD: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nD: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_12_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_12_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 2.6481 0.070248 -423.11\n0.5002 2.6605 -906.39\n0.0012014 0.00025943 0.99533\n\nB: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n\nC: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n\nD: 0.38854 -0.073106 92.576\n-0.1986 0.7319 139.21\n-0.00040811 -1.555e-05 0.99988\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.6481 0.070248 -423.11\n0.5002 2.6605 -906.39\n0.0012014 0.00025943 0.99533\n\nB: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n\nC: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n\nD: 0.38854 -0.073106 92.576\n-0.1986 0.7319 139.21\n-0.00040811 -1.555e-05 0.99988\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_13_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_13_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nB: 1.141 -0.024147 186.42\n0.29573 0.97376 -60.872\n0.00082251 -1.0843e-05 0.99973\n\nC: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nD: 1.3308 -0.060097 223.54\n0.17906 0.94189 -10.999\n0.00034146 -4.4675e-05 0.99983\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nB: 1.141 -0.024147 186.42\n0.29573 0.97376 -60.872\n0.00082251 -1.0843e-05 0.99973\n\nC: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nD: 1.3308 -0.060097 223.54\n0.17906 0.94189 -10.999\n0.00034146 -4.4675e-05 0.99983\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_14_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_14_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nB: 0.3184 0.1614 32.607\n0.092973 1.2239 -454.36\n-0.00072537 0.00028453 0.99713\n\nC: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n\nD: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nB: 0.3184 0.1614 32.607\n0.092973 1.2239 -454.36\n-0.00072537 0.00028453 0.99713\n\nC: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n\nD: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_15_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_15_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nB: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nC: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n\nD: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nB: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nC: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n\nD: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_16_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_16_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.13896 0.020204 194.37\n-0.25201 0.63798 118.99\n-0.00052359 2.2762e-05 0.9996\n\nD: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.13896 0.020204 194.37\n-0.25201 0.63798 118.99\n-0.00052359 2.2762e-05 0.9996\n\nD: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_17_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_17_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nB: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nC: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nD: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nB: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nC: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nD: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_18_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_18_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.55616 0.0088234 83.342\n-0.19782 0.70845 195.76\n-0.00029305 -3.175e-05 0.99884\n\nB: 0.85799 0.21669 9.4839\n-0.21177 0.85855 130.48\n1.5015e-06 9.2033e-07 1\n\nC: 5.1051 0.34986 -885.86\n1.0306 5.9768 -2733.1\n0.0033649 0.00099216 1\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.55616 0.0088234 83.342\n-0.19782 0.70845 195.76\n-0.00029305 -3.175e-05 0.99884\n\nB: 0.85799 0.21669 9.4839\n-0.21177 0.85855 130.48\n1.5015e-06 9.2033e-07 1\n\nC: 5.1051 0.34986 -885.86\n1.0306 5.9768 -2733.1\n0.0033649 0.00099216 1\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_19_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_19_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.032608 0.010774 198.34\n-0.16134 0.44659 114.31\n-0.00057725 -5.1566e-07 1.0017\n\nB: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nC: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.032608 0.010774 198.34\n-0.16134 0.44659 114.31\n-0.00057725 -5.1566e-07 1.0017\n\nB: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nC: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_20_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_20_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nB: 0.20876 0.015221 174.06\n-0.13382 0.55012 11.64\n-0.00044084 3.575e-05 1.0177\n\nC: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nD: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nB: 0.20876 0.015221 174.06\n-0.13382 0.55012 11.64\n-0.00044084 3.575e-05 1.0177\n\nC: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nD: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_21_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_21_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nD: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nD: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_22_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_22_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n\nB: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nC: 0.98278 -0.0048237 22.209\n-0.012055 0.97088 45.658\n-7.6753e-06 -2.3467e-05 1.0001\n\nD: 1.6284 1.0346 -954.33\n-0.096789 2.5434 -782.98\n-0.00078653 0.0011044 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n\nB: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nC: 0.98278 -0.0048237 22.209\n-0.012055 0.97088 45.658\n-7.6753e-06 -2.3467e-05 1.0001\n\nD: 1.6284 1.0346 -954.33\n-0.096789 2.5434 -782.98\n-0.00078653 0.0011044 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_23_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_23_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.9207 0.17258 153.68\n0.62581 1.7293 -542.33\n0.0010509 0.0001244 0.99848\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nD: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.9207 0.17258 153.68\n0.62581 1.7293 -542.33\n0.0010509 0.0001244 0.99848\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nD: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_24_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_24_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nB: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nC: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nD: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nB: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nC: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nD: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_25_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_25_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.38914 0.285 169.51\n-0.28531 0.39347 340.1\n-6.4617e-06 5.0341e-06 1\n\nB: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nC: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.38914 0.285 169.51\n-0.28531 0.39347 340.1\n-6.4617e-06 5.0341e-06 1\n\nB: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nC: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_26_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_26_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nB: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nC: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nD: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nB: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nC: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nD: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_27_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_27_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.6996 0.02142 200.09\n0.31149 1.4251 -246.25\n0.00053609 -6.8541e-05 0.99889\n\nB: 0.4591 -0.47767 436.55\n0.46479 0.46941 -27.514\n-2.7182e-05 -1.2668e-06 1.0191\n\nC: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nD: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.6996 0.02142 200.09\n0.31149 1.4251 -246.25\n0.00053609 -6.8541e-05 0.99889\n\nB: 0.4591 -0.47767 436.55\n0.46479 0.46941 -27.514\n-2.7182e-05 -1.2668e-06 1.0191\n\nC: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nD: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_28_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_28_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.13896 0.020204 194.37\n-0.25201 0.63798 118.99\n-0.00052359 2.2762e-05 0.9996\n\nB: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nC: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nD: 0.30367 0.12862 200.05\n-0.12888 0.30356 134.47\n2.6855e-07 -3.4026e-07 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.13896 0.020204 194.37\n-0.25201 0.63798 118.99\n-0.00052359 2.2762e-05 0.9996\n\nB: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nC: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nD: 0.30367 0.12862 200.05\n-0.12888 0.30356 134.47\n2.6855e-07 -3.4026e-07 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_29_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_29_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nB: 0.10472 0.069057 99.841\n-0.17731 0.5329 107.18\n-0.00051255 -1.3734e-05 0.98616\n\nC: 0.76922 -0.28498 222.68\n0.33855 1.0341 -81.069\n0.00035349 1.2014e-05 0.99834\n\nD: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nB: 0.10472 0.069057 99.841\n-0.17731 0.5329 107.18\n-0.00051255 -1.3734e-05 0.98616\n\nC: 0.76922 -0.28498 222.68\n0.33855 1.0341 -81.069\n0.00035349 1.2014e-05 0.99834\n\nD: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_30_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_30_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n\nB: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n\nC: 0.040904 -0.0023332 234.76\n-0.10713 0.35038 218.5\n-0.00028907 6.311e-06 1.0035\n\nD: 0.38266 -0.33125 122.6\n-0.21363 0.61581 225.35\n-0.00034121 -7.7515e-06 0.99865\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n\nB: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n\nC: 0.040904 -0.0023332 234.76\n-0.10713 0.35038 218.5\n-0.00028907 6.311e-06 1.0035\n\nD: 0.38266 -0.33125 122.6\n-0.21363 0.61581 225.35\n-0.00034121 -7.7515e-06 0.99865\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_31_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_31_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.15114 -0.00089399 241.66\n-0.078633 0.45918 14.453\n-0.00033245 3.1152e-05 0.99996\n\nB: 0.87235 0.023622 101.75\n0.12982 0.76075 59.456\n0.0005519 9.0915e-05 1.0016\n\nC: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nD: 0.084461 -0.022036 252.3\n-0.21 0.51325 245.38\n-0.000447 -2.621e-05 1.0009\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.15114 -0.00089399 241.66\n-0.078633 0.45918 14.453\n-0.00033245 3.1152e-05 0.99996\n\nB: 0.87235 0.023622 101.75\n0.12982 0.76075 59.456\n0.0005519 9.0915e-05 1.0016\n\nC: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nD: 0.084461 -0.022036 252.3\n-0.21 0.51325 245.38\n-0.000447 -2.621e-05 1.0009\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_32_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_32_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nB: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nC: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nD: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nB: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nC: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nD: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_33_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_33_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nB: 1.4862 -0.061679 54.577\n0.4606 1.2816 -147.5\n0.0007321 -7.3842e-05 0.99895\n\nC: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nD: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nB: 1.4862 -0.061679 54.577\n0.4606 1.2816 -147.5\n0.0007321 -7.3842e-05 0.99895\n\nC: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nD: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_34_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_34_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.3794 0.089822 49.168\n-0.27745 0.88349 -5.6379\n-0.00046319 5.6849e-05 0.99886\n\nB: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nC: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nD: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.3794 0.089822 49.168\n-0.27745 0.88349 -5.6379\n-0.00046319 5.6849e-05 0.99886\n\nB: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nC: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nD: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_35_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_35_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nB: 0.4591 -0.47767 436.55\n0.46479 0.46941 -27.514\n-2.7182e-05 -1.2668e-06 1.0191\n\nC: 0.83129 0.00294 81.765\n-0.011403 0.83158 63.28\n-7.0021e-06 -1.5701e-05 1\n\nD: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nB: 0.4591 -0.47767 436.55\n0.46479 0.46941 -27.514\n-2.7182e-05 -1.2668e-06 1.0191\n\nC: 0.83129 0.00294 81.765\n-0.011403 0.83158 63.28\n-7.0021e-06 -1.5701e-05 1\n\nD: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_36_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_36_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nC: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n\nD: 1.0063 -0.0054085 288.55\n0.23295 0.84053 7.8206\n0.0005941 1.4583e-05 1.0001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nC: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n\nD: 1.0063 -0.0054085 288.55\n0.23295 0.84053 7.8206\n0.0005941 1.4583e-05 1.0001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_37_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_37_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nB: 0.87235 0.023622 101.75\n0.12982 0.76075 59.456\n0.0005519 9.0915e-05 1.0016\n\nC: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n\nD: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nB: 0.87235 0.023622 101.75\n0.12982 0.76075 59.456\n0.0005519 9.0915e-05 1.0016\n\nC: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n\nD: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_38_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_38_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n\nB: 1.6996 0.02142 200.09\n0.31149 1.4251 -246.25\n0.00053609 -6.8541e-05 0.99889\n\nC: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nD: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n\nB: 1.6996 0.02142 200.09\n0.31149 1.4251 -246.25\n0.00053609 -6.8541e-05 0.99889\n\nC: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nD: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_39_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_39_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n\nB: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nC: 0.30367 0.12862 200.05\n-0.12888 0.30356 134.47\n2.6855e-07 -3.4026e-07 1\n\nD: 0.28973 0.014397 100.07\n-0.29955 0.64174 168.27\n-0.00067332 7.239e-06 1.0017\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n\nB: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nC: 0.30367 0.12862 200.05\n-0.12888 0.30356 134.47\n2.6855e-07 -3.4026e-07 1\n\nD: 0.28973 0.014397 100.07\n-0.29955 0.64174 168.27\n-0.00067332 7.239e-06 1.0017\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_40_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_40_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nB: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nC: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n\nD: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nB: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nC: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n\nD: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_41_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_41_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nB: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nC: 0.62147 0.055609 221.79\n0.21978 1.1561 -23.942\n0.00048557 -4.4311e-05 0.99866\n\nD: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nB: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nC: 0.62147 0.055609 221.79\n0.21978 1.1561 -23.942\n0.00048557 -4.4311e-05 0.99866\n\nD: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_42_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_42_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n\nC: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n\nD: 0.14705 0.061323 72.893\n-0.27582 0.69094 109.44\n-0.00056993 1.3825e-06 0.9981\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n\nC: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n\nD: 0.14705 0.061323 72.893\n-0.27582 0.69094 109.44\n-0.00056993 1.3825e-06 0.9981\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_43_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_43_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.41873 -0.043533 -18.562\n-0.27021 0.88041 53.791\n-0.00050299 -2.2546e-05 0.99941\n\nB: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nC: 1.1884 0.015274 95.776\n0.23282 1.0681 -20.551\n0.00097623 0.00015903 1.0014\n\nD: 0.3794 0.089822 49.168\n-0.27745 0.88349 -5.6379\n-0.00046319 5.6849e-05 0.99886\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.41873 -0.043533 -18.562\n-0.27021 0.88041 53.791\n-0.00050299 -2.2546e-05 0.99941\n\nB: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nC: 1.1884 0.015274 95.776\n0.23282 1.0681 -20.551\n0.00097623 0.00015903 1.0014\n\nD: 0.3794 0.089822 49.168\n-0.27745 0.88349 -5.6379\n-0.00046319 5.6849e-05 0.99886\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_44_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_44_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n\nB: 4.3722 0.14407 -818.24\n-0.25209 3.9595 -549.15\n0.001718 0.0010825 0.97985\n\nC: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nD: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n\nB: 4.3722 0.14407 -818.24\n-0.25209 3.9595 -549.15\n0.001718 0.0010825 0.97985\n\nC: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nD: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_45_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_45_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nB: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n\nC: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nD: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nB: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n\nC: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nD: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_46_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_46_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nB: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nC: 1.0505 -0.0053825 276.45\n0.20631 0.92888 48.832\n0.00048841 -1.9251e-05 0.99878\n\nD: 0.62147 0.055609 221.79\n0.21978 1.1561 -23.942\n0.00048557 -4.4311e-05 0.99866\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nB: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nC: 1.0505 -0.0053825 276.45\n0.20631 0.92888 48.832\n0.00048841 -1.9251e-05 0.99878\n\nD: 0.62147 0.055609 221.79\n0.21978 1.1561 -23.942\n0.00048557 -4.4311e-05 0.99866\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_47_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_47_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nB: 2.5614 0.083075 163.07\n0.94137 2.2586 -732.08\n0.0017783 2.1603e-05 0.99316\n\nC: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nD: 0.15114 -0.00089399 241.66\n-0.078633 0.45918 14.453\n-0.00033245 3.1152e-05 0.99996\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nB: 2.5614 0.083075 163.07\n0.94137 2.2586 -732.08\n0.0017783 2.1603e-05 0.99316\n\nC: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nD: 0.15114 -0.00089399 241.66\n-0.078633 0.45918 14.453\n-0.00033245 3.1152e-05 0.99996\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_48_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_48_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nB: 3.1418 0.21701 -576.91\n0.129 3.5039 -1062.5\n0.0014143 0.00082533 0.98844\n\nC: 0.70161 0.023304 -1.9207\n-0.10366 0.81239 71.251\n-0.00023167 -1.5062e-05 0.99976\n\nD: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nB: 3.1418 0.21701 -576.91\n0.129 3.5039 -1062.5\n0.0014143 0.00082533 0.98844\n\nC: 0.70161 0.023304 -1.9207\n-0.10366 0.81239 71.251\n-0.00023167 -1.5062e-05 0.99976\n\nD: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_49_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_49_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.35568 0.079611 -21.49\n-0.17793 0.7199 62.24\n-0.00050458 1.9913e-05 0.9982\n\nB: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nC: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nD: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.35568 0.079611 -21.49\n-0.17793 0.7199 62.24\n-0.00050458 1.9913e-05 0.9982\n\nB: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nC: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nD: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_50_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_50_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nB: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n\nC: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nD: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nB: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n\nC: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nD: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_51_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_51_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.42945 0.0071566 96.266\n-0.019537 0.48377 43.049\n-7.8698e-05 1.6013e-05 1.0001\n\nD: 0.70161 0.023304 -1.9207\n-0.10366 0.81239 71.251\n-0.00023167 -1.5062e-05 0.99976\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.42945 0.0071566 96.266\n-0.019537 0.48377 43.049\n-7.8698e-05 1.6013e-05 1.0001\n\nD: 0.70161 0.023304 -1.9207\n-0.10366 0.81239 71.251\n-0.00023167 -1.5062e-05 0.99976\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_52_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_52_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nB: 0.73597 -0.0032436 13.11\n0.017092 0.71039 36.002\n5.8878e-05 -9.3828e-06 0.99995\n\nC: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nD: 0.52949 -0.028655 46.849\n-0.2451 0.79991 158.44\n-0.00032499 -1.8164e-05 0.99959\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nB: 0.73597 -0.0032436 13.11\n0.017092 0.71039 36.002\n5.8878e-05 -9.3828e-06 0.99995\n\nC: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nD: 0.52949 -0.028655 46.849\n-0.2451 0.79991 158.44\n-0.00032499 -1.8164e-05 0.99959\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_53_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_53_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.32788 -0.00026656 168.52\n-0.087696 0.49289 72.043\n-0.00025798 4.6006e-06 0.9984\n\nB: 0.84581 -0.039469 34.117\n-0.067529 0.81703 142.37\n-0.00011408 -0.00014793 1.0014\n\nC: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n\nD: 0.38914 0.285 169.51\n-0.28531 0.39347 340.1\n-6.4617e-06 5.0341e-06 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.32788 -0.00026656 168.52\n-0.087696 0.49289 72.043\n-0.00025798 4.6006e-06 0.9984\n\nB: 0.84581 -0.039469 34.117\n-0.067529 0.81703 142.37\n-0.00011408 -0.00014793 1.0014\n\nC: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n\nD: 0.38914 0.285 169.51\n-0.28531 0.39347 340.1\n-6.4617e-06 5.0341e-06 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_54_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_54_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n\nB: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nC: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n\nD: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n\nB: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nC: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n\nD: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_55_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_55_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.85799 0.21669 9.4839\n-0.21177 0.85855 130.48\n1.5015e-06 9.2033e-07 1\n\nB: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nC: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n\nD: 2.3594 0.0026252 -116.05\n0.5085 2.302 -550.96\n0.0013826 0.0001837 1.0004\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.85799 0.21669 9.4839\n-0.21177 0.85855 130.48\n1.5015e-06 9.2033e-07 1\n\nB: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nC: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n\nD: 2.3594 0.0026252 -116.05\n0.5085 2.302 -550.96\n0.0013826 0.0001837 1.0004\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_56_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_56_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 2.1479 0.036813 206.94\n0.67819 1.8174 -485.8\n0.0012074 -6.8043e-06 0.99599\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 2.1479 0.036813 206.94\n0.67819 1.8174 -485.8\n0.0012074 -6.8043e-06 0.99599\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_57_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_57_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nB: 2.6481 0.070248 -423.11\n0.5002 2.6605 -906.39\n0.0012014 0.00025943 0.99533\n\nC: 1.0035 -0.00055314 2.5255\n-0.0028717 1.0087 -9.7285\n-3.8783e-06 3.4244e-06 1\n\nD: 2.5614 0.083075 163.07\n0.94137 2.2586 -732.08\n0.0017783 2.1603e-05 0.99316\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nB: 2.6481 0.070248 -423.11\n0.5002 2.6605 -906.39\n0.0012014 0.00025943 0.99533\n\nC: 1.0035 -0.00055314 2.5255\n-0.0028717 1.0087 -9.7285\n-3.8783e-06 3.4244e-06 1\n\nD: 2.5614 0.083075 163.07\n0.94137 2.2586 -732.08\n0.0017783 2.1603e-05 0.99316\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_58_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_58_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n\nB: 1.4219 0.01866 342.44\n0.36005 1.3261 -141.73\n0.00090969 2.3838e-05 1.0002\n\nC: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n\nD: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n\nB: 1.4219 0.01866 342.44\n0.36005 1.3261 -141.73\n0.00090969 2.3838e-05 1.0002\n\nC: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n\nD: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_59_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_59_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nD: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nD: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_60_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_60_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nB: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nC: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nD: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nB: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nC: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nD: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_61_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_61_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nB: 0.2024 0.0033266 96.15\n-0.28093 0.65512 201.73\n-0.00049784 1.8106e-06 1.0048\n\nC: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nD: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nB: 0.2024 0.0033266 96.15\n-0.28093 0.65512 201.73\n-0.00049784 1.8106e-06 1.0048\n\nC: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nD: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_62_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_62_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.8771 0.00026849 -1.1131\n-0.035484 0.88589 36.525\n-7.7192e-05 -1.833e-05 1\n\nB: 0.4591 -0.47767 436.55\n0.46479 0.46941 -27.514\n-2.7182e-05 -1.2668e-06 1.0191\n\nC: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nD: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.8771 0.00026849 -1.1131\n-0.035484 0.88589 36.525\n-7.7192e-05 -1.833e-05 1\n\nB: 0.4591 -0.47767 436.55\n0.46479 0.46941 -27.514\n-2.7182e-05 -1.2668e-06 1.0191\n\nC: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nD: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_63_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_63_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.49838 -0.015725 33.278\n-0.18045 0.77392 59.799\n-0.00064863 -4.2793e-05 0.99978\n\nD: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.49838 -0.015725 33.278\n-0.18045 0.77392 59.799\n-0.00064863 -4.2793e-05 0.99978\n\nD: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_64_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_64_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n\nB: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n\nC: 0.48882 0.0079397 13.575\n-0.24956 0.69593 149.6\n-0.00053246 -7.8574e-06 1.0026\n\nD: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n\nB: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n\nC: 0.48882 0.0079397 13.575\n-0.24956 0.69593 149.6\n-0.00053246 -7.8574e-06 1.0026\n\nD: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_65_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_65_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.0019 0.045013 144.39\n0.13277 0.95284 -14.111\n0.0002066 5.2875e-05 1\n\nB: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nC: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nD: 0.37618 -0.0026073 58.013\n-0.13988 0.81886 117.4\n-0.00032276 -1.1378e-05 0.99983\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0019 0.045013 144.39\n0.13277 0.95284 -14.111\n0.0002066 5.2875e-05 1\n\nB: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nC: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nD: 0.37618 -0.0026073 58.013\n-0.13988 0.81886 117.4\n-0.00032276 -1.1378e-05 0.99983\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_66_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_66_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nB: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n\nC: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n\nD: 1.141 -0.024147 186.42\n0.29573 0.97376 -60.872\n0.00082251 -1.0843e-05 0.99973\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nB: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n\nC: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n\nD: 1.141 -0.024147 186.42\n0.29573 0.97376 -60.872\n0.00082251 -1.0843e-05 0.99973\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_67_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_67_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nB: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nC: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nD: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nB: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nC: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nD: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_68_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_68_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.0983 -0.030393 111.31\n0.31879 0.9789 58.516\n0.00050073 -5.3943e-05 1.0005\n\nB: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nC: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nD: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0983 -0.030393 111.31\n0.31879 0.9789 58.516\n0.00050073 -5.3943e-05 1.0005\n\nB: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nC: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nD: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_69_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_69_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nB: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nC: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nB: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nC: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nD: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_70_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_70_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.49838 -0.015725 33.278\n-0.18045 0.77392 59.799\n-0.00064863 -4.2793e-05 0.99978\n\nB: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n\nC: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n\nD: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.49838 -0.015725 33.278\n-0.18045 0.77392 59.799\n-0.00064863 -4.2793e-05 0.99978\n\nB: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n\nC: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n\nD: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_71_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_71_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nB: 0.73597 -0.0032436 13.11\n0.017092 0.71039 36.002\n5.8878e-05 -9.3828e-06 0.99995\n\nC: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nD: 1.9834 -0.0016422 376.55\n0.84 1.4832 -241.61\n0.0019136 -3.8955e-05 1.0014\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nB: 0.73597 -0.0032436 13.11\n0.017092 0.71039 36.002\n5.8878e-05 -9.3828e-06 0.99995\n\nC: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nD: 1.9834 -0.0016422 376.55\n0.84 1.4832 -241.61\n0.0019136 -3.8955e-05 1.0014\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_72_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_72_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nB: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nC: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nD: 1.0063 -0.0054085 288.55\n0.23295 0.84053 7.8206\n0.0005941 1.4583e-05 1.0001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nB: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nC: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nD: 1.0063 -0.0054085 288.55\n0.23295 0.84053 7.8206\n0.0005941 1.4583e-05 1.0001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_73_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_73_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nC: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n\nD: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nC: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n\nD: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_74_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_74_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.43124 0.047668 -66.525\n-0.34772 0.62068 209.27\n-0.00060194 -2.1104e-07 0.98648\n\nB: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n\nC: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nD: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.43124 0.047668 -66.525\n-0.34772 0.62068 209.27\n-0.00060194 -2.1104e-07 0.98648\n\nB: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n\nC: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nD: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_75_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_75_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.547 0.11677 155.75\n0.40373 1.373 -170.1\n0.00090791 8.8782e-05 1.0012\n\nB: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n\nC: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nD: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.547 0.11677 155.75\n0.40373 1.373 -170.1\n0.00090791 8.8782e-05 1.0012\n\nB: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n\nC: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nD: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_76_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_76_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.0033111 0.031282 184.63\n-0.15843 0.75999 4.5609\n-0.00083562 0.00011238 0.99927\n\nB: 0.88632 -0.012492 -136.92\n-0.047209 1.0157 42.178\n-0.0001423 1.8595e-05 1.0005\n\nC: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nD: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.0033111 0.031282 184.63\n-0.15843 0.75999 4.5609\n-0.00083562 0.00011238 0.99927\n\nB: 0.88632 -0.012492 -136.92\n-0.047209 1.0157 42.178\n-0.0001423 1.8595e-05 1.0005\n\nC: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nD: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_77_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_77_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nB: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nC: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n\nD: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nB: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nC: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n\nD: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_78_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_78_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.55616 0.0088234 83.342\n-0.19782 0.70845 195.76\n-0.00029305 -3.175e-05 0.99884\n\nB: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nC: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nD: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.55616 0.0088234 83.342\n-0.19782 0.70845 195.76\n-0.00029305 -3.175e-05 0.99884\n\nB: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nC: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nD: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_79_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_79_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nB: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nC: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n\nD: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nB: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nC: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n\nD: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_80_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_80_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nD: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nD: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_81_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_81_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.52064 0.019326 41.006\n-0.1476 0.75468 101.84\n-0.00026848 4.5639e-05 1.0094\n\nB: 1.6284 1.0346 -954.33\n-0.096789 2.5434 -782.98\n-0.00078653 0.0011044 1\n\nC: 0.1176 -0.0075311 194.61\n-0.10067 0.3391 257.1\n-0.00023555 -9.6091e-06 0.99858\n\nD: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.52064 0.019326 41.006\n-0.1476 0.75468 101.84\n-0.00026848 4.5639e-05 1.0094\n\nB: 1.6284 1.0346 -954.33\n-0.096789 2.5434 -782.98\n-0.00078653 0.0011044 1\n\nC: 0.1176 -0.0075311 194.61\n-0.10067 0.3391 257.1\n-0.00023555 -9.6091e-06 0.99858\n\nD: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_82_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_82_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 0.35568 0.079611 -21.49\n-0.17793 0.7199 62.24\n-0.00050458 1.9913e-05 0.9982\n\nD: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 0.35568 0.079611 -21.49\n-0.17793 0.7199 62.24\n-0.00050458 1.9913e-05 0.9982\n\nD: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_83_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_83_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n\nB: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nC: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nD: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n\nB: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nC: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nD: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_84_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_84_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nB: 1.5534 0.017684 158.94\n0.56083 1.4841 -343.65\n0.0010107 3.8363e-05 0.99895\n\nC: 1.4272 0.064496 -40.82\n0.15764 1.3161 -94.847\n0.00037033 4.6015e-05 0.99258\n\nD: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nB: 1.5534 0.017684 158.94\n0.56083 1.4841 -343.65\n0.0010107 3.8363e-05 0.99895\n\nC: 1.4272 0.064496 -40.82\n0.15764 1.3161 -94.847\n0.00037033 4.6015e-05 0.99258\n\nD: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_85_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_85_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 2.308 -0.067061 201.09\n0.71494 1.8702 -412.16\n0.0015273 -1.6972e-05 1.0162\n\nB: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n\nC: 0.46288 -0.016626 22.437\n-0.26713 0.81047 151.27\n-0.00036789 7.646e-06 0.99855\n\nD: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.308 -0.067061 201.09\n0.71494 1.8702 -412.16\n0.0015273 -1.6972e-05 1.0162\n\nB: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n\nC: 0.46288 -0.016626 22.437\n-0.26713 0.81047 151.27\n-0.00036789 7.646e-06 0.99855\n\nD: 1.3522 0.025037 96.693\n0.20588 1.5085 -279.44\n0.000418 4.2466e-05 1.0103\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_86_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_86_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nD: 1.0063 -0.0054085 288.55\n0.23295 0.84053 7.8206\n0.0005941 1.4583e-05 1.0001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nD: 1.0063 -0.0054085 288.55\n0.23295 0.84053 7.8206\n0.0005941 1.4583e-05 1.0001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_87_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_87_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n\nB: 0.31269 -0.011782 51.842\n-0.22276 0.71181 65.24\n-0.00081452 -4.173e-05 0.99309\n\nC: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n\nD: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n\nB: 0.31269 -0.011782 51.842\n-0.22276 0.71181 65.24\n-0.00081452 -4.173e-05 0.99309\n\nC: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n\nD: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_88_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_88_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nB: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nC: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nD: 1.1346 -0.16977 -78.128\n-0.0017173 0.8512 -82.973\n8.0333e-07 -0.00031449 0.99917\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nB: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nC: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nD: 1.1346 -0.16977 -78.128\n-0.0017173 0.8512 -82.973\n8.0333e-07 -0.00031449 0.99917\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_89_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_89_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nB: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nC: 0.51123 -0.013639 59.603\n-0.16055 0.85238 103.24\n-0.0003334 -4.0403e-05 1.0009\n\nD: 1.1202 -0.0055862 43.04\n0.17566 1.0194 -5.6786\n0.00085767 -4.4625e-05 0.99922\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nB: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nC: 0.51123 -0.013639 59.603\n-0.16055 0.85238 103.24\n-0.0003334 -4.0403e-05 1.0009\n\nD: 1.1202 -0.0055862 43.04\n0.17566 1.0194 -5.6786\n0.00085767 -4.4625e-05 0.99922\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_90_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_90_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nB: 0.10472 0.069057 99.841\n-0.17731 0.5329 107.18\n-0.00051255 -1.3734e-05 0.98616\n\nC: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nD: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n\nB: 0.10472 0.069057 99.841\n-0.17731 0.5329 107.18\n-0.00051255 -1.3734e-05 0.98616\n\nC: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nD: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_91_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_91_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 2.3594 0.0026252 -116.05\n0.5085 2.302 -550.96\n0.0013826 0.0001837 1.0004\n\nB: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nC: 0.25611 0.0594 88.294\n-0.24702 0.7663 71.53\n-0.00048162 6.7687e-05 1.0008\n\nD: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.3594 0.0026252 -116.05\n0.5085 2.302 -550.96\n0.0013826 0.0001837 1.0004\n\nB: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nC: 0.25611 0.0594 88.294\n-0.24702 0.7663 71.53\n-0.00048162 6.7687e-05 1.0008\n\nD: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_92_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_92_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nB: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n\nC: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nD: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nB: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n\nC: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nD: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_93_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_93_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nB: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nC: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nD: 0.46461 0.085196 589.33\n0.19659 0.76327 25.833\n0.00026763 8.9486e-05 1.0006\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nB: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nC: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nD: 0.46461 0.085196 589.33\n0.19659 0.76327 25.833\n0.00026763 8.9486e-05 1.0006\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_94_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_94_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.62147 0.055609 221.79\n0.21978 1.1561 -23.942\n0.00048557 -4.4311e-05 0.99866\n\nD: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nB: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nC: 0.62147 0.055609 221.79\n0.21978 1.1561 -23.942\n0.00048557 -4.4311e-05 0.99866\n\nD: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_95_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_95_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nB: 1.4219 0.01866 342.44\n0.36005 1.3261 -141.73\n0.00090969 2.3838e-05 1.0002\n\nC: 0.38854 -0.073106 92.576\n-0.1986 0.7319 139.21\n-0.00040811 -1.555e-05 0.99988\n\nD: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.62091 -0.030805 57.622\n-0.22703 0.84222 -13.023\n-0.00037179 -4.2767e-05 0.99852\n\nB: 1.4219 0.01866 342.44\n0.36005 1.3261 -141.73\n0.00090969 2.3838e-05 1.0002\n\nC: 0.38854 -0.073106 92.576\n-0.1986 0.7319 139.21\n-0.00040811 -1.555e-05 0.99988\n\nD: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_96_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_96_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n\nB: 3.1418 0.21701 -576.91\n0.129 3.5039 -1062.5\n0.0014143 0.00082533 0.98844\n\nC: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nD: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n\nB: 3.1418 0.21701 -576.91\n0.129 3.5039 -1062.5\n0.0014143 0.00082533 0.98844\n\nC: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nD: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_97_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_97_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 1.5534 0.017684 158.94\n0.56083 1.4841 -343.65\n0.0010107 3.8363e-05 0.99895\n\nD: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 1.5534 0.017684 158.94\n0.56083 1.4841 -343.65\n0.0010107 3.8363e-05 0.99895\n\nD: 1.9861 0.031586 27.893\n0.62141 1.9607 -531.99\n0.0011993 -1.9815e-05 0.99978\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_98_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_98_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nB: 1.4862 -0.061679 54.577\n0.4606 1.2816 -147.5\n0.0007321 -7.3842e-05 0.99895\n\nC: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nD: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nB: 1.4862 -0.061679 54.577\n0.4606 1.2816 -147.5\n0.0007321 -7.3842e-05 0.99895\n\nC: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nD: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_99_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_99_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nB: 0.1176 -0.0075311 194.61\n-0.10067 0.3391 257.1\n-0.00023555 -9.6091e-06 0.99858\n\nC: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n\nD: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nB: 0.1176 -0.0075311 194.61\n-0.10067 0.3391 257.1\n-0.00023555 -9.6091e-06 0.99858\n\nC: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n\nD: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_100_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_100_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nB: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nC: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nD: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nB: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nC: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nD: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_101_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_101_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nB: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n\nC: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nD: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.18178 0.033268 82.883\n-0.24959 0.68306 123.62\n-0.0004688 5.3047e-05 1.0005\n\nB: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n\nC: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nD: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_102_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_102_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nB: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nC: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n\nD: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nB: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nC: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n\nD: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_103_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_103_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 5.1051 0.34986 -885.86\n1.0306 5.9768 -2733.1\n0.0033649 0.00099216 1\n\nB: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n\nC: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nD: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 5.1051 0.34986 -885.86\n1.0306 5.9768 -2733.1\n0.0033649 0.00099216 1\n\nB: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n\nC: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nD: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_104_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_104_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nB: 0.60879 -0.35761 289.93\n0.34822 0.61653 -30.949\n-2.0912e-05 1.3527e-06 1.014\n\nC: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nD: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.79208 0.010314 26.019\n-0.023778 0.92337 43.513\n-0.00011513 1.2161e-05 1.0003\n\nB: 0.60879 -0.35761 289.93\n0.34822 0.61653 -30.949\n-2.0912e-05 1.3527e-06 1.014\n\nC: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nD: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_105_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_105_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.4733 -0.014435 76.772\n0.25007 1.2556 -120.81\n0.00088206 8.1414e-05 1.002\n\nB: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nC: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4733 -0.014435 76.772\n0.25007 1.2556 -120.81\n0.00088206 8.1414e-05 1.002\n\nB: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nC: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_106_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_106_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nB: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nC: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nB: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nC: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_107_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_107_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.0505 -0.0053825 276.45\n0.20631 0.92888 48.832\n0.00048841 -1.9251e-05 0.99878\n\nB: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n\nC: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nD: 0.20876 0.015221 174.06\n-0.13382 0.55012 11.64\n-0.00044084 3.575e-05 1.0177\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0505 -0.0053825 276.45\n0.20631 0.92888 48.832\n0.00048841 -1.9251e-05 0.99878\n\nB: 0.056448 -0.012851 135.19\n-0.38625 0.54689 255.61\n-0.00066718 5.392e-05 1.0012\n\nC: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nD: 0.20876 0.015221 174.06\n-0.13382 0.55012 11.64\n-0.00044084 3.575e-05 1.0177\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_108_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_108_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.141 -0.024147 186.42\n0.29573 0.97376 -60.872\n0.00082251 -1.0843e-05 0.99973\n\nB: 0.42945 0.0071566 96.266\n-0.019537 0.48377 43.049\n-7.8698e-05 1.6013e-05 1.0001\n\nC: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.141 -0.024147 186.42\n0.29573 0.97376 -60.872\n0.00082251 -1.0843e-05 0.99973\n\nB: 0.42945 0.0071566 96.266\n-0.019537 0.48377 43.049\n-7.8698e-05 1.6013e-05 1.0001\n\nC: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_109_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_109_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.040904 -0.0023332 234.76\n-0.10713 0.35038 218.5\n-0.00028907 6.311e-06 1.0035\n\nB: 0.38914 0.285 169.51\n-0.28531 0.39347 340.1\n-6.4617e-06 5.0341e-06 1\n\nC: 0.0033111 0.031282 184.63\n-0.15843 0.75999 4.5609\n-0.00083562 0.00011238 0.99927\n\nD: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.040904 -0.0023332 234.76\n-0.10713 0.35038 218.5\n-0.00028907 6.311e-06 1.0035\n\nB: 0.38914 0.285 169.51\n-0.28531 0.39347 340.1\n-6.4617e-06 5.0341e-06 1\n\nC: 0.0033111 0.031282 184.63\n-0.15843 0.75999 4.5609\n-0.00083562 0.00011238 0.99927\n\nD: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_110_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_110_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nB: 0.42186 0.031568 60.169\n-0.084563 0.88575 93.738\n-0.00032749 1.4457e-05 1.0012\n\nC: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n\nD: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.1857 -0.0018512 147.73\n-0.094288 0.35154 277.67\n-0.00019671 -1.563e-05 0.9996\n\nB: 0.42186 0.031568 60.169\n-0.084563 0.88575 93.738\n-0.00032749 1.4457e-05 1.0012\n\nC: -0.47246 -0.28359 869.57\n0.29041 -0.47016 396.67\n5.0949e-06 1.2499e-05 0.99998\n\nD: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_111_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_111_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nB: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nC: 0.13896 0.020204 194.37\n-0.25201 0.63798 118.99\n-0.00052359 2.2762e-05 0.9996\n\nD: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nB: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nC: 0.13896 0.020204 194.37\n-0.25201 0.63798 118.99\n-0.00052359 2.2762e-05 0.9996\n\nD: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_112_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_112_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nC: 0.46288 -0.016626 22.437\n-0.26713 0.81047 151.27\n-0.00036789 7.646e-06 0.99855\n\nD: 0.14586 0.056449 119.48\n-0.21737 0.71439 95.786\n-0.00051182 3.3282e-05 1.0008\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nC: 0.46288 -0.016626 22.437\n-0.26713 0.81047 151.27\n-0.00036789 7.646e-06 0.99855\n\nD: 0.14586 0.056449 119.48\n-0.21737 0.71439 95.786\n-0.00051182 3.3282e-05 1.0008\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_113_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_113_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nB: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n\nC: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nD: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nB: 0.4605 0.0019073 42.778\n0.003918 0.45748 107.3\n1.6895e-05 4.8733e-06 1.0001\n\nC: 1.6408 -0.0013389 -221.64\n0.1704 1.44 -155.56\n0.00036369 -3.22e-05 1.0003\n\nD: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_114_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_114_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nB: 0.2024 0.0033266 96.15\n-0.28093 0.65512 201.73\n-0.00049784 1.8106e-06 1.0048\n\nC: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nD: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nB: 0.2024 0.0033266 96.15\n-0.28093 0.65512 201.73\n-0.00049784 1.8106e-06 1.0048\n\nC: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nD: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_115_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_115_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nB: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n\nC: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nD: 0.14705 0.061323 72.893\n-0.27582 0.69094 109.44\n-0.00056993 1.3825e-06 0.9981\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nB: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n\nC: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nD: 0.14705 0.061323 72.893\n-0.27582 0.69094 109.44\n-0.00056993 1.3825e-06 0.9981\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_116_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_116_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n\nB: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n\nC: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nD: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n\nB: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n\nC: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nD: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_117_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_117_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nB: 0.20876 0.015221 174.06\n-0.13382 0.55012 11.64\n-0.00044084 3.575e-05 1.0177\n\nC: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nD: 0.51123 -0.013639 59.603\n-0.16055 0.85238 103.24\n-0.0003334 -4.0403e-05 1.0009\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nB: 0.20876 0.015221 174.06\n-0.13382 0.55012 11.64\n-0.00044084 3.575e-05 1.0177\n\nC: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nD: 0.51123 -0.013639 59.603\n-0.16055 0.85238 103.24\n-0.0003334 -4.0403e-05 1.0009\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_118_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_118_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nB: 1.4272 0.064496 -40.82\n0.15764 1.3161 -94.847\n0.00037033 4.6015e-05 0.99258\n\nC: 14.984 -1.5209 -1987.5\n0.59203 13.878 -3896.8\n0.0072047 0.0038814 0.92614\n\nD: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nB: 1.4272 0.064496 -40.82\n0.15764 1.3161 -94.847\n0.00037033 4.6015e-05 0.99258\n\nC: 14.984 -1.5209 -1987.5\n0.59203 13.878 -3896.8\n0.0072047 0.0038814 0.92614\n\nD: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_119_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_119_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.4733 -0.014435 76.772\n0.25007 1.2556 -120.81\n0.00088206 8.1414e-05 1.002\n\nB: 0.10472 0.069057 99.841\n-0.17731 0.5329 107.18\n-0.00051255 -1.3734e-05 0.98616\n\nC: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nD: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4733 -0.014435 76.772\n0.25007 1.2556 -120.81\n0.00088206 8.1414e-05 1.002\n\nB: 0.10472 0.069057 99.841\n-0.17731 0.5329 107.18\n-0.00051255 -1.3734e-05 0.98616\n\nC: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nD: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_120_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_120_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.43124 0.047668 -66.525\n-0.34772 0.62068 209.27\n-0.00060194 -2.1104e-07 0.98648\n\nB: 1.1202 -0.0055862 43.04\n0.17566 1.0194 -5.6786\n0.00085767 -4.4625e-05 0.99922\n\nC: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.43124 0.047668 -66.525\n-0.34772 0.62068 209.27\n-0.00060194 -2.1104e-07 0.98648\n\nB: 1.1202 -0.0055862 43.04\n0.17566 1.0194 -5.6786\n0.00085767 -4.4625e-05 0.99922\n\nC: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_121_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_121_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nB: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nC: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nD: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nB: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nC: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nD: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_122_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_122_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 2.9721 0.034514 6.1536\n0.86739 2.9829 -532.95\n0.0035453 0.00017204 0.95976\n\nC: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nD: 2.1479 0.036813 206.94\n0.67819 1.8174 -485.8\n0.0012074 -6.8043e-06 0.99599\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 2.9721 0.034514 6.1536\n0.86739 2.9829 -532.95\n0.0035453 0.00017204 0.95976\n\nC: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nD: 2.1479 0.036813 206.94\n0.67819 1.8174 -485.8\n0.0012074 -6.8043e-06 0.99599\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_123_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_123_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nB: 1.547 0.11677 155.75\n0.40373 1.373 -170.1\n0.00090791 8.8782e-05 1.0012\n\nC: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nD: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nB: 1.547 0.11677 155.75\n0.40373 1.373 -170.1\n0.00090791 8.8782e-05 1.0012\n\nC: 0.012717 0.014394 193.52\n-0.12386 0.60301 126.7\n-0.00063953 7.9665e-05 1.0012\n\nD: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_124_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_124_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nB: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nC: 0.52949 -0.028655 46.849\n-0.2451 0.79991 158.44\n-0.00032499 -1.8164e-05 0.99959\n\nD: 0.52064 0.019326 41.006\n-0.1476 0.75468 101.84\n-0.00026848 4.5639e-05 1.0094\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nB: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nC: 0.52949 -0.028655 46.849\n-0.2451 0.79991 158.44\n-0.00032499 -1.8164e-05 0.99959\n\nD: 0.52064 0.019326 41.006\n-0.1476 0.75468 101.84\n-0.00026848 4.5639e-05 1.0094\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_125_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_125_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.9834 -0.0016422 376.55\n0.84 1.4832 -241.61\n0.0019136 -3.8955e-05 1.0014\n\nB: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.9834 -0.0016422 376.55\n0.84 1.4832 -241.61\n0.0019136 -3.8955e-05 1.0014\n\nB: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_126_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_126_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.0983 -0.030393 111.31\n0.31879 0.9789 58.516\n0.00050073 -5.3943e-05 1.0005\n\nB: 0.76922 -0.28498 222.68\n0.33855 1.0341 -81.069\n0.00035349 1.2014e-05 0.99834\n\nC: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n\nD: 0.39176 -0.48622 421.69\n0.48543 0.39488 -0.097812\n2.3979e-06 -3.3236e-06 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0983 -0.030393 111.31\n0.31879 0.9789 58.516\n0.00050073 -5.3943e-05 1.0005\n\nB: 0.76922 -0.28498 222.68\n0.33855 1.0341 -81.069\n0.00035349 1.2014e-05 0.99834\n\nC: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n\nD: 0.39176 -0.48622 421.69\n0.48543 0.39488 -0.097812\n2.3979e-06 -3.3236e-06 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_127_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_127_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.6729 -0.01895 127.73\n-0.015916 0.67847 176.42\n-3.6225e-05 -3.2204e-05 1\n\nB: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nC: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nD: 0.14586 0.056449 119.48\n-0.21737 0.71439 95.786\n-0.00051182 3.3282e-05 1.0008\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.6729 -0.01895 127.73\n-0.015916 0.67847 176.42\n-3.6225e-05 -3.2204e-05 1\n\nB: 0.091252 0.0066749 132.72\n-0.14667 0.47258 88.51\n-0.00056772 8.3791e-06 1.0029\n\nC: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nD: 0.14586 0.056449 119.48\n-0.21737 0.71439 95.786\n-0.00051182 3.3282e-05 1.0008\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_128_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_128_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.28973 0.014397 100.07\n-0.29955 0.64174 168.27\n-0.00067332 7.239e-06 1.0017\n\nB: 0.37618 -0.0026073 58.013\n-0.13988 0.81886 117.4\n-0.00032276 -1.1378e-05 0.99983\n\nC: 0.39176 -0.48622 421.69\n0.48543 0.39488 -0.097812\n2.3979e-06 -3.3236e-06 1\n\nD: -0.19998 0.34647 247.36\n-0.34607 -0.19989 467.21\n2.0354e-07 -5.1701e-08 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.28973 0.014397 100.07\n-0.29955 0.64174 168.27\n-0.00067332 7.239e-06 1.0017\n\nB: 0.37618 -0.0026073 58.013\n-0.13988 0.81886 117.4\n-0.00032276 -1.1378e-05 0.99983\n\nC: 0.39176 -0.48622 421.69\n0.48543 0.39488 -0.097812\n2.3979e-06 -3.3236e-06 1\n\nD: -0.19998 0.34647 247.36\n-0.34607 -0.19989 467.21\n2.0354e-07 -5.1701e-08 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_129_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_129_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.2024 0.0033266 96.15\n-0.28093 0.65512 201.73\n-0.00049784 1.8106e-06 1.0048\n\nB: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n\nC: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n\nD: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.2024 0.0033266 96.15\n-0.28093 0.65512 201.73\n-0.00049784 1.8106e-06 1.0048\n\nB: 0.54304 0.026384 236.48\n-0.041921 0.64806 87.13\n-5.8662e-05 1.5685e-05 1\n\nC: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n\nD: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_130_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_130_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nB: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n\nC: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n\nD: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.48275 -0.12831 276.04\n-0.19138 0.40711 199.19\n-5.6548e-05 -0.00023367 0.99912\n\nB: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n\nC: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n\nD: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_131_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_131_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 3.1627 -0.045434 -351.49\n0.7877 2.8197 -842.9\n0.0015033 -3.676e-05 1.0055\n\nB: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nC: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n\nD: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 3.1627 -0.045434 -351.49\n0.7877 2.8197 -842.9\n0.0015033 -3.676e-05 1.0055\n\nB: 0.45287 0.0061881 100.32\n-0.053734 0.66556 61.961\n-0.00023168 -5.8559e-06 1.0005\n\nC: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n\nD: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_132_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_132_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.46461 0.085196 589.33\n0.19659 0.76327 25.833\n0.00026763 8.9486e-05 1.0006\n\nB: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n\nC: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n\nD: 0.35568 0.079611 -21.49\n-0.17793 0.7199 62.24\n-0.00050458 1.9913e-05 0.9982\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.46461 0.085196 589.33\n0.19659 0.76327 25.833\n0.00026763 8.9486e-05 1.0006\n\nB: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n\nC: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n\nD: 0.35568 0.079611 -21.49\n-0.17793 0.7199 62.24\n-0.00050458 1.9913e-05 0.9982\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_133_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_133_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n\nB: 0.32788 -0.00026656 168.52\n-0.087696 0.49289 72.043\n-0.00025798 4.6006e-06 0.9984\n\nC: 1.3526 0.026797 436.87\n0.31517 1.3826 -234.04\n0.00076901 0.00022984 1.0039\n\nD: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n\nB: 0.32788 -0.00026656 168.52\n-0.087696 0.49289 72.043\n-0.00025798 4.6006e-06 0.9984\n\nC: 1.3526 0.026797 436.87\n0.31517 1.3826 -234.04\n0.00076901 0.00022984 1.0039\n\nD: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_134_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_134_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nC: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nD: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n\nB: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nC: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nD: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_135_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_135_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.39176 -0.48622 421.69\n0.48543 0.39488 -0.097812\n2.3979e-06 -3.3236e-06 1\n\nB: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nC: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nD: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.39176 -0.48622 421.69\n0.48543 0.39488 -0.097812\n2.3979e-06 -3.3236e-06 1\n\nB: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nC: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nD: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_136_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_136_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nB: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nC: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nD: 1.6284 1.0346 -954.33\n-0.096789 2.5434 -782.98\n-0.00078653 0.0011044 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nB: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nC: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nD: 1.6284 1.0346 -954.33\n-0.096789 2.5434 -782.98\n-0.00078653 0.0011044 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_137_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_137_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.38266 -0.33125 122.6\n-0.21363 0.61581 225.35\n-0.00034121 -7.7515e-06 0.99865\n\nB: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nC: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nD: 0.4221 -0.055916 265.09\n0.060544 0.41967 174.7\n7.7273e-06 -2.0972e-06 0.99999\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.38266 -0.33125 122.6\n-0.21363 0.61581 225.35\n-0.00034121 -7.7515e-06 0.99865\n\nB: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n\nC: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nD: 0.4221 -0.055916 265.09\n0.060544 0.41967 174.7\n7.7273e-06 -2.0972e-06 0.99999\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_138_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_138_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 2.5614 0.083075 163.07\n0.94137 2.2586 -732.08\n0.0017783 2.1603e-05 0.99316\n\nB: 0.37618 -0.0026073 58.013\n-0.13988 0.81886 117.4\n-0.00032276 -1.1378e-05 0.99983\n\nC: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.5614 0.083075 163.07\n0.94137 2.2586 -732.08\n0.0017783 2.1603e-05 0.99316\n\nB: 0.37618 -0.0026073 58.013\n-0.13988 0.81886 117.4\n-0.00032276 -1.1378e-05 0.99983\n\nC: 0.63669 0.0018872 137.9\n-0.00033285 0.63926 95.922\n-2.0441e-06 4.1104e-06 1\n\nD: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_139_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_139_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nB: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nC: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nD: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: -0.21679 -0.12572 585.55\n0.12463 -0.21699 355.1\n-1.085e-06 -1.8818e-06 1.0002\n\nB: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nC: 2.3515 0.16969 142.03\n1.0602 2.1465 -778.33\n0.0016806 -4.8949e-05 0.99537\n\nD: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_140_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_140_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 3.1627 -0.045434 -351.49\n0.7877 2.8197 -842.9\n0.0015033 -3.676e-05 1.0055\n\nB: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nC: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n\nD: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 3.1627 -0.045434 -351.49\n0.7877 2.8197 -842.9\n0.0015033 -3.676e-05 1.0055\n\nB: 0.91628 -0.19782 70.502\n0.072414 0.68419 -33.187\n5.7127e-06 -0.00025258 0.99947\n\nC: 0.27317 0.041297 84.951\n-0.22859 0.68736 124.47\n-0.00041264 5.2763e-05 1.0003\n\nD: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_141_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_141_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.38266 -0.33125 122.6\n-0.21363 0.61581 225.35\n-0.00034121 -7.7515e-06 0.99865\n\nB: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nC: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n\nD: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.38266 -0.33125 122.6\n-0.21363 0.61581 225.35\n-0.00034121 -7.7515e-06 0.99865\n\nB: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nC: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n\nD: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_142_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_142_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nB: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n\nC: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nD: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nB: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n\nC: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nD: 1.5134 -0.0029581 20.934\n0.2678 1.4062 -232.68\n0.00048583 -4.0311e-06 1.0006\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_143_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_143_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nC: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n\nD: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nC: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n\nD: 0.17608 -0.024321 273.19\n-0.19809 0.7405 74.826\n-0.00053318 1.2457e-05 1.0069\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_144_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_144_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n\nB: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n\nC: 0.42945 0.0071566 96.266\n-0.019537 0.48377 43.049\n-7.8698e-05 1.6013e-05 1.0001\n\nD: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.33414 0.069646 90.22\n-0.25229 0.73446 157.67\n-0.00038885 2.2582e-06 1.0024\n\nB: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n\nC: 0.42945 0.0071566 96.266\n-0.019537 0.48377 43.049\n-7.8698e-05 1.6013e-05 1.0001\n\nD: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_145_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_145_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.31269 -0.011782 51.842\n-0.22276 0.71181 65.24\n-0.00081452 -4.173e-05 0.99309\n\nB: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nC: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nD: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.31269 -0.011782 51.842\n-0.22276 0.71181 65.24\n-0.00081452 -4.173e-05 0.99309\n\nB: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nC: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nD: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_146_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_146_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nB: 0.0033111 0.031282 184.63\n-0.15843 0.75999 4.5609\n-0.00083562 0.00011238 0.99927\n\nC: 0.42186 0.031568 60.169\n-0.084563 0.88575 93.738\n-0.00032749 1.4457e-05 1.0012\n\nD: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n\nB: 0.0033111 0.031282 184.63\n-0.15843 0.75999 4.5609\n-0.00083562 0.00011238 0.99927\n\nC: 0.42186 0.031568 60.169\n-0.084563 0.88575 93.738\n-0.00032749 1.4457e-05 1.0012\n\nD: 0.54693 0.20925 -108.35\n-0.082341 1.1176 -236.48\n-0.0006026 0.0001769 1.0001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_147_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_147_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n\nB: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nC: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nD: 0.86273 0.030727 -257.65\n-0.081274 1.0175 -48.986\n-0.00016043 4.4449e-05 1.0008\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3903 -0.069797 29.319\n0.18963 1.0284 22.049\n0.00052989 -9.8197e-05 1.0021\n\nB: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nC: 1.8454 -0.0093839 117.6\n0.8533 1.9335 -566.11\n0.0016091 6.8147e-05 1.0105\n\nD: 0.86273 0.030727 -257.65\n-0.081274 1.0175 -48.986\n-0.00016043 4.4449e-05 1.0008\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_148_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_148_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nB: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n\nC: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n\nD: 0.48882 0.0079397 13.575\n-0.24956 0.69593 149.6\n-0.00053246 -7.8574e-06 1.0026\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nB: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n\nC: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n\nD: 0.48882 0.0079397 13.575\n-0.24956 0.69593 149.6\n-0.00053246 -7.8574e-06 1.0026\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_149_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_149_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nB: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nC: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nD: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nB: 0.54864 -0.010797 -6.1494\n-0.11876 0.86651 111.28\n-0.00026448 -1.8961e-05 1\n\nC: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nD: 0.85555 -0.17378 91.59\n0.17068 0.85755 -31.264\n-5.1182e-06 2.0966e-06 1.0023\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_150_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_150_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n\nB: 14.984 -1.5209 -1987.5\n0.59203 13.878 -3896.8\n0.0072047 0.0038814 0.92614\n\nC: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n\nB: 14.984 -1.5209 -1987.5\n0.59203 13.878 -3896.8\n0.0072047 0.0038814 0.92614\n\nC: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_151_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_151_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n\nB: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nC: 1.3308 -0.060097 223.54\n0.17906 0.94189 -10.999\n0.00034146 -4.4675e-05 0.99983\n\nD: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n\nB: 0.13416 0.073075 56.977\n-0.21333 0.70433 84.528\n-0.00055481 6.1106e-05 1\n\nC: 1.3308 -0.060097 223.54\n0.17906 0.94189 -10.999\n0.00034146 -4.4675e-05 0.99983\n\nD: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_152_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_152_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nB: 0.55347 0.01345 110.12\n-0.085938 0.64894 151.2\n-0.00016395 1.1079e-05 0.99926\n\nC: 0.28973 0.014397 100.07\n-0.29955 0.64174 168.27\n-0.00067332 7.239e-06 1.0017\n\nD: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.69134 -0.0063829 116.24\n0.0053381 0.71985 83.96\n-1.8171e-05 2.7124e-05 1\n\nB: 0.55347 0.01345 110.12\n-0.085938 0.64894 151.2\n-0.00016395 1.1079e-05 0.99926\n\nC: 0.28973 0.014397 100.07\n-0.29955 0.64174 168.27\n-0.00067332 7.239e-06 1.0017\n\nD: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_153_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_153_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n\nB: 0.32788 -0.00026656 168.52\n-0.087696 0.49289 72.043\n-0.00025798 4.6006e-06 0.9984\n\nC: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nD: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.77105 -0.097833 -3.6994\n-0.092675 0.81167 92.799\n-0.0001392 -0.00012806 0.99964\n\nB: 0.32788 -0.00026656 168.52\n-0.087696 0.49289 72.043\n-0.00025798 4.6006e-06 0.9984\n\nC: 0.88184 0.31397 -39.976\n-0.18167 0.93621 153.25\n0.00020118 -1.9028e-05 0.99997\n\nD: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_154_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_154_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.6413 0.074225 91.097\n0.77035 1.5061 -362.72\n0.0010583 -7.1897e-05 1.0011\n\nB: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.6413 0.074225 91.097\n0.77035 1.5061 -362.72\n0.0010583 -7.1897e-05 1.0011\n\nB: 1.0478 0.035143 64.843\n0.063507 1.0349 21.701\n0.00023044 -6.878e-06 0.99998\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 1.3838 0.024181 -93.882\n0.093344 1.307 -232.76\n0.00015995 6.7546e-05 1.0008\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_155_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_155_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.0035 -0.00055314 2.5255\n-0.0028717 1.0087 -9.7285\n-3.8783e-06 3.4244e-06 1\n\nB: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n\nC: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nD: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0035 -0.00055314 2.5255\n-0.0028717 1.0087 -9.7285\n-3.8783e-06 3.4244e-06 1\n\nB: 1.3951 0.13641 136.74\n0.31704 1.2758 -219.28\n0.00053511 0.00013896 0.99675\n\nC: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nD: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_156_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_156_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nB: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nC: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nD: 0.88632 -0.012492 -136.92\n-0.047209 1.0157 42.178\n-0.0001423 1.8595e-05 1.0005\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nB: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nC: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nD: 0.88632 -0.012492 -136.92\n-0.047209 1.0157 42.178\n-0.0001423 1.8595e-05 1.0005\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_157_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_157_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nB: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nC: 0.032608 0.010774 198.34\n-0.16134 0.44659 114.31\n-0.00057725 -5.1566e-07 1.0017\n\nD: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.1442 -0.037625 115.5\n0.22206 1.0286 -30.039\n0.00032815 -2.4116e-05 0.9999\n\nB: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nC: 0.032608 0.010774 198.34\n-0.16134 0.44659 114.31\n-0.00057725 -5.1566e-07 1.0017\n\nD: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_158_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_158_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nB: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nC: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nD: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nB: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nC: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nD: 0.22888 0.0058691 272.09\n-0.077153 0.3923 203.08\n-0.00024299 -4.5827e-06 1.0015\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_159_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_159_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n\nB: 0.48882 0.0079397 13.575\n-0.24956 0.69593 149.6\n-0.00053246 -7.8574e-06 1.0026\n\nC: 0.86273 0.030727 -257.65\n-0.081274 1.0175 -48.986\n-0.00016043 4.4449e-05 1.0008\n\nD: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n\nB: 0.48882 0.0079397 13.575\n-0.24956 0.69593 149.6\n-0.00053246 -7.8574e-06 1.0026\n\nC: 0.86273 0.030727 -257.65\n-0.081274 1.0175 -48.986\n-0.00016043 4.4449e-05 1.0008\n\nD: 1.1198 0.031669 158.94\n0.13747 0.986 -24.458\n0.00036259 4.1267e-05 0.99658\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_160_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_160_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n\nB: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nC: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.58099 -0.029382 -20.47\n-0.29479 0.73128 188.62\n-0.00043803 -4.3076e-05 1.0007\n\nB: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nC: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_161_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_161_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nB: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nB: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nC: 0.24117 0.068506 48.185\n-0.23318 0.79398 68.106\n-0.0005259 5.079e-05 0.99834\n\nD: 0.75268 -0.0092452 -71.273\n-0.17607 0.97566 6.3105\n-0.00029582 -1.5187e-05 0.99957\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_162_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_162_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n\nB: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nC: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nD: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n\nB: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nC: 0.7088 -0.010965 -26.07\n-0.13602 0.83489 103.19\n-0.00023352 -1.5615e-05 1.0004\n\nD: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_163_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_163_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.55347 0.01345 110.12\n-0.085938 0.64894 151.2\n-0.00016395 1.1079e-05 0.99926\n\nB: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n\nC: 2.1479 0.036813 206.94\n0.67819 1.8174 -485.8\n0.0012074 -6.8043e-06 0.99599\n\nD: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.55347 0.01345 110.12\n-0.085938 0.64894 151.2\n-0.00016395 1.1079e-05 0.99926\n\nB: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n\nC: 2.1479 0.036813 206.94\n0.67819 1.8174 -485.8\n0.0012074 -6.8043e-06 0.99599\n\nD: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_164_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_164_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n\nC: 2.6481 0.070248 -423.11\n0.5002 2.6605 -906.39\n0.0012014 0.00025943 0.99533\n\nD: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4932 0.01661 231.74\n0.45676 1.4341 -212.29\n0.0013256 9.9938e-05 0.99686\n\nB: 0.40245 -0.33938 102.29\n-0.2125 0.62381 216.78\n-0.00033866 -1.5855e-05 1.0018\n\nC: 2.6481 0.070248 -423.11\n0.5002 2.6605 -906.39\n0.0012014 0.00025943 0.99533\n\nD: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_165_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_165_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nB: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nC: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n\nD: -0.19998 0.34647 247.36\n-0.34607 -0.19989 467.21\n2.0354e-07 -5.1701e-08 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nB: 1.1901 -0.048587 107.72\n0.14488 1.1926 -121.84\n0.00033622 1.1241e-05 1.0001\n\nC: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n\nD: -0.19998 0.34647 247.36\n-0.34607 -0.19989 467.21\n2.0354e-07 -5.1701e-08 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_166_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_166_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nB: 3.1627 -0.045434 -351.49\n0.7877 2.8197 -842.9\n0.0015033 -3.676e-05 1.0055\n\nC: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nD: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nB: 3.1627 -0.045434 -351.49\n0.7877 2.8197 -842.9\n0.0015033 -3.676e-05 1.0055\n\nC: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nD: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_167_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_167_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.60879 -0.35761 289.93\n0.34822 0.61653 -30.949\n-2.0912e-05 1.3527e-06 1.014\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n\nD: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.60879 -0.35761 289.93\n0.34822 0.61653 -30.949\n-2.0912e-05 1.3527e-06 1.014\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n\nD: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_168_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_168_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.6413 0.074225 91.097\n0.77035 1.5061 -362.72\n0.0010583 -7.1897e-05 1.0011\n\nB: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nC: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nD: 0.85799 0.21669 9.4839\n-0.21177 0.85855 130.48\n1.5015e-06 9.2033e-07 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.6413 0.074225 91.097\n0.77035 1.5061 -362.72\n0.0010583 -7.1897e-05 1.0011\n\nB: 2.4665 0.083695 233.31\n0.87021 2.8235 -936.68\n0.0017821 0.0001592 0.98707\n\nC: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nD: 0.85799 0.21669 9.4839\n-0.21177 0.85855 130.48\n1.5015e-06 9.2033e-07 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_169_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_169_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nB: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nC: 0.55616 0.0088234 83.342\n-0.19782 0.70845 195.76\n-0.00029305 -3.175e-05 0.99884\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nB: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nC: 0.55616 0.0088234 83.342\n-0.19782 0.70845 195.76\n-0.00029305 -3.175e-05 0.99884\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_170_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_170_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nB: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nC: 1.3526 0.026797 436.87\n0.31517 1.3826 -234.04\n0.00076901 0.00022984 1.0039\n\nD: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.31237 0.099342 8.5389\n-0.29392 0.92363 14.629\n-0.00074642 6.3257e-05 0.99168\n\nB: 2.2078 0.054458 63.617\n0.67654 2.2557 -637.98\n0.0013191 8.5079e-05 1.0033\n\nC: 1.3526 0.026797 436.87\n0.31517 1.3826 -234.04\n0.00076901 0.00022984 1.0039\n\nD: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_171_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_171_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nB: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nC: 4.3722 0.14407 -818.24\n-0.25209 3.9595 -549.15\n0.001718 0.0010825 0.97985\n\nD: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.67444 0.023361 37.089\n-0.047926 0.90094 60.932\n-0.00018688 1.1402e-05 1.0007\n\nB: 0.57079 0.0076829 -45.295\n-0.15447 0.93183 62.276\n-0.00028402 -5.8827e-06 0.99996\n\nC: 4.3722 0.14407 -818.24\n-0.25209 3.9595 -549.15\n0.001718 0.0010825 0.97985\n\nD: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_172_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_172_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n\nB: 0.25611 0.0594 88.294\n-0.24702 0.7663 71.53\n-0.00048162 6.7687e-05 1.0008\n\nC: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n\nD: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.441 -0.037212 269.33\n0.73295 1.6438 -380.65\n0.0014226 4.1601e-05 1.0102\n\nB: 0.25611 0.0594 88.294\n-0.24702 0.7663 71.53\n-0.00048162 6.7687e-05 1.0008\n\nC: 0.37107 -0.09213 318.73\n0.086334 0.37505 188.02\n-1.0814e-05 -3.6548e-06 1\n\nD: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_173_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_173_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 0.33492 -0.0051126 63.132\n-0.19841 0.81318 98.482\n-0.00041298 -2.8119e-05 0.99833\n\nD: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n\nB: 1.2108 -0.031741 47.374\n0.20996 1.0345 -107.36\n0.00054926 -6.3631e-06 1.0004\n\nC: 0.33492 -0.0051126 63.132\n-0.19841 0.81318 98.482\n-0.00041298 -2.8119e-05 0.99833\n\nD: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_174_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_174_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n\nB: 2.9721 0.034514 6.1536\n0.86739 2.9829 -532.95\n0.0035453 0.00017204 0.95976\n\nC: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nD: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.1268 -0.03963 330.5\n-0.1892 0.46973 254.2\n-0.00039857 -3.9641e-05 0.99971\n\nB: 2.9721 0.034514 6.1536\n0.86739 2.9829 -532.95\n0.0035453 0.00017204 0.95976\n\nC: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nD: 0.29534 0.035751 -56.21\n-0.35718 0.5432 233.53\n-0.00064211 -1.1093e-05 0.97783\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_175_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_175_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nB: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nC: 1.0983 -0.030393 111.31\n0.31879 0.9789 58.516\n0.00050073 -5.3943e-05 1.0005\n\nD: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.31483 0.11583 690.51\n0.17546 0.70637 14.497\n0.00026712 0.00012691 1\n\nB: 0.77044 -0.014353 152.19\n0.007827 0.75172 76.397\n1.9039e-05 -2.1554e-05 1\n\nC: 1.0983 -0.030393 111.31\n0.31879 0.9789 58.516\n0.00050073 -5.3943e-05 1.0005\n\nD: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_176_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_176_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nB: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nC: 0.88632 -0.012492 -136.92\n-0.047209 1.0157 42.178\n-0.0001423 1.8595e-05 1.0005\n\nD: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.44469 -0.1629 197.72\n-0.090792 0.33606 37.55\n-0.00032851 -0.00028415 1.0004\n\nB: 2.4144 -0.0022023 -199.3\n0.52146 2.0547 -569.49\n0.0010423 8.4489e-05 1.0043\n\nC: 0.88632 -0.012492 -136.92\n-0.047209 1.0157 42.178\n-0.0001423 1.8595e-05 1.0005\n\nD: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_177_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_177_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nB: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nC: 0.49838 -0.015725 33.278\n-0.18045 0.77392 59.799\n-0.00064863 -4.2793e-05 0.99978\n\nD: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.4403 0.27154 10.734\n0.071471 1.5534 -44.533\n0.00030432 0.00049723 1.001\n\nB: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nC: 0.49838 -0.015725 33.278\n-0.18045 0.77392 59.799\n-0.00064863 -4.2793e-05 0.99978\n\nD: 1.8954 -0.043603 197.83\n0.50589 1.509 -236.95\n0.0010644 -1.6279e-05 1.0115\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_178_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_178_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.25611 0.0594 88.294\n-0.24702 0.7663 71.53\n-0.00048162 6.7687e-05 1.0008\n\nB: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nC: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nD: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.25611 0.0594 88.294\n-0.24702 0.7663 71.53\n-0.00048162 6.7687e-05 1.0008\n\nB: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nC: 0.55202 0.096567 108.66\n-0.35774 1.4927 -276.32\n-0.00068886 0.0001065 0.98986\n\nD: 0.48531 0.10549 -95.005\n-0.11843 0.77202 44.217\n-0.00029301 2.8434e-05 0.99773\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_179_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_179_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nB: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nC: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n\nD: 0.38854 -0.073106 92.576\n-0.1986 0.7319 139.21\n-0.00040811 -1.555e-05 0.99988\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.37694 0.049406 111.53\n-0.16444 0.72986 84.602\n-0.00037753 4.0247e-05 0.99869\n\nB: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nC: 1.0499 0.025643 108.77\n0.19467 1.0054 -7.8895\n0.0011218 -3.184e-05 1.0021\n\nD: 0.38854 -0.073106 92.576\n-0.1986 0.7319 139.21\n-0.00040811 -1.555e-05 0.99988\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_180_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_180_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n\nB: 1.4219 0.01866 342.44\n0.36005 1.3261 -141.73\n0.00090969 2.3838e-05 1.0002\n\nC: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nD: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.34904 -0.0038637 -43.899\n-0.22316 0.99346 45.579\n-0.00041195 -1.2246e-05 1\n\nB: 1.4219 0.01866 342.44\n0.36005 1.3261 -141.73\n0.00090969 2.3838e-05 1.0002\n\nC: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n\nD: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_181_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_181_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n\nB: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n\nC: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nD: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.81883 -0.28544 161.88\n0.010536 0.53499 62.327\n1.3163e-05 -0.00056443 1.0014\n\nB: 0.36677 -0.019493 213.68\n-0.082321 0.47708 180.81\n-0.00021125 -4.1441e-05 1.0123\n\nC: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nD: 0.94726 0.076953 177.36\n0.25112 1.0126 13.205\n0.00047269 2.7805e-05 0.99969\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_182_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_182_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nB: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nC: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nD: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.030125 -0.01797 299.5\n-0.19573 0.45869 167.74\n-0.00051291 -3.9704e-05 1.0019\n\nB: 0.74922 -0.0014388 -75.597\n-0.074158 0.94323 40.455\n-0.00018126 -6.2301e-06 1\n\nC: 1.3186 -0.0097277 -143.16\n0.094663 1.1956 -58.383\n0.00019153 -2.0281e-05 0.99989\n\nD: 0.53266 0.0019756 44.297\n-0.18137 0.85955 61.945\n-0.00038035 1.4705e-06 0.9999\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_183_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_183_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nB: 0.31269 -0.011782 51.842\n-0.22276 0.71181 65.24\n-0.00081452 -4.173e-05 0.99309\n\nC: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n\nD: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.7761 -0.053427 263.17\n0.41751 1.5987 -329.46\n0.00069677 3.1372e-05 1.0014\n\nB: 0.31269 -0.011782 51.842\n-0.22276 0.71181 65.24\n-0.00081452 -4.173e-05 0.99309\n\nC: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n\nD: 1.1529 0.012747 244.44\n0.41529 1.1943 -155.59\n0.00087156 5.6224e-05 1.0092\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_184_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_184_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nB: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nC: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n\nD: 0.084461 -0.022036 252.3\n-0.21 0.51325 245.38\n-0.000447 -2.621e-05 1.0009\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.54372 0.011697 65.787\n-0.06271 0.8727 105.67\n-0.00025117 2.4814e-06 0.99967\n\nB: 0.38922 0.015343 55.85\n-0.1763 0.84543 87.344\n-0.00049385 -2.1034e-05 1.0072\n\nC: 0.45841 0.038317 36.428\n-0.26806 0.75693 165.6\n-0.00037539 -1.4035e-05 1.0016\n\nD: 0.084461 -0.022036 252.3\n-0.21 0.51325 245.38\n-0.000447 -2.621e-05 1.0009\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_185_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_185_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nD: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 3.4851 0.086317 195.9\n1.1598 3.067 -1009.5\n0.0025647 -5.4567e-05 0.99349\n\nB: 0.70212 0.43231 -128.54\n-0.42351 0.70276 199.3\n6.3285e-06 1.2175e-05 0.99997\n\nC: 0.23209 -0.67097 528.16\n0.66389 0.2516 -30.266\n-3.168e-05 2.5631e-05 1.0087\n\nD: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_186_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_186_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 1.4862 -0.061679 54.577\n0.4606 1.2816 -147.5\n0.0007321 -7.3842e-05 0.99895\n\nC: 0.41873 -0.043533 -18.562\n-0.27021 0.88041 53.791\n-0.00050299 -2.2546e-05 0.99941\n\nD: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 1.4862 -0.061679 54.577\n0.4606 1.2816 -147.5\n0.0007321 -7.3842e-05 0.99895\n\nC: 0.41873 -0.043533 -18.562\n-0.27021 0.88041 53.791\n-0.00050299 -2.2546e-05 0.99941\n\nD: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_187_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_187_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n\nB: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nC: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nD: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.47208 0.021042 63.836\n-0.16332 0.73028 126.94\n-0.00030371 2.4606e-05 0.99981\n\nB: 1.6477 -0.037624 101.59\n0.49962 1.5725 -364.98\n0.00090272 4.6589e-05 1.0037\n\nC: 1.0669 0.31109 194.1\n-0.019953 0.9209 79.624\n0.000135 -7.6705e-05 0.99977\n\nD: 2.6177 0.042575 -65.797\n0.74359 2.3954 -903.27\n0.0018892 8.2816e-05 0.98996\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_188_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_188_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.43124 0.047668 -66.525\n-0.34772 0.62068 209.27\n-0.00060194 -2.1104e-07 0.98648\n\nB: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nC: 1.1884 0.015274 95.776\n0.23282 1.0681 -20.551\n0.00097623 0.00015903 1.0014\n\nD: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.43124 0.047668 -66.525\n-0.34772 0.62068 209.27\n-0.00060194 -2.1104e-07 0.98648\n\nB: 0.2564 0.092521 94.187\n-0.28031 0.83589 -0.15652\n-0.00048968 6.0866e-05 1.0015\n\nC: 1.1884 0.015274 95.776\n0.23282 1.0681 -20.551\n0.00097623 0.00015903 1.0014\n\nD: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_189_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_189_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n\nB: 1.1943 0.010001 372.77\n0.22686 1.0937 -67.914\n0.00058802 5.2037e-05 0.99941\n\nC: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nD: 1.1884 0.015274 95.776\n0.23282 1.0681 -20.551\n0.00097623 0.00015903 1.0014\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.8851 0.028166 274.85\n0.48185 1.6951 -326.97\n0.0011778 8.455e-05 0.99801\n\nB: 1.1943 0.010001 372.77\n0.22686 1.0937 -67.914\n0.00058802 5.2037e-05 0.99941\n\nC: 0.66581 0.6777 -31.246\n-0.14346 0.96853 148.92\n0.00042869 -1.7355e-05 0.99928\n\nD: 1.1884 0.015274 95.776\n0.23282 1.0681 -20.551\n0.00097623 0.00015903 1.0014\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_190_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_190_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nB: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n\nC: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nD: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0582 -0.013384 562.45\n0.1807 0.93712 36.472\n0.00043718 5.9368e-06 0.99927\n\nB: 0.49202 0.0057754 242.06\n0.058005 0.43541 166.02\n0.00018017 1.0746e-05 0.99974\n\nC: 0.72201 0.13445 62.975\n0.059719 0.85126 46.305\n-1.7322e-05 0.00018166 1.0001\n\nD: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_191_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_191_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nB: 1.2895 0.43518 -118.46\n-0.025956 1.4233 161.89\n-3.0413e-05 0.00069874 1.0013\n\nC: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.60665 -0.013034 217.78\n0.087451 0.52146 32.707\n0.00021516 2.9281e-07 1.0006\n\nB: 1.2895 0.43518 -118.46\n-0.025956 1.4233 161.89\n-3.0413e-05 0.00069874 1.0013\n\nC: 2.9599 0.00703 244.64\n0.78405 1.8789 -438.29\n0.0018411 4.4095e-05 0.99694\n\nD: 0.4849 -0.15095 280.72\n-0.18568 0.38797 170.57\n-4.9965e-05 -0.00024428 0.99985\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_192_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_192_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nB: 1.8278 -0.0075993 72.268\n0.68643 1.8832 -550.61\n0.0012853 4.1209e-05 1.006\n\nC: 1.2869 -0.0035671 90.117\n0.34981 1.1421 -290.48\n0.0010338 2.5575e-05 0.99928\n\nD: 1.7312 -0.086578 129.17\n0.3882 1.1026 -2.2164\n0.0010948 -0.00011788 1.0024\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_193_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_193_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n\nB: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nC: 14.984 -1.5209 -1987.5\n0.59203 13.878 -3896.8\n0.0072047 0.0038814 0.92614\n\nD: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n\nB: 2.2787 0.023843 -30.321\n0.58793 1.9158 -459.28\n0.0012782 -6.6868e-06 0.99971\n\nC: 14.984 -1.5209 -1987.5\n0.59203 13.878 -3896.8\n0.0072047 0.0038814 0.92614\n\nD: 0.7855 0.039826 119.05\n-0.25749 1.3451 -220.69\n-0.00047304 5.3677e-05 1.001\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_194_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_194_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nB: 1.0505 -0.0053825 276.45\n0.20631 0.92888 48.832\n0.00048841 -1.9251e-05 0.99878\n\nC: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nD: 1.3526 0.026797 436.87\n0.31517 1.3826 -234.04\n0.00076901 0.00022984 1.0039\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n\nB: 1.0505 -0.0053825 276.45\n0.20631 0.92888 48.832\n0.00048841 -1.9251e-05 0.99878\n\nC: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nD: 1.3526 0.026797 436.87\n0.31517 1.3826 -234.04\n0.00076901 0.00022984 1.0039\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_195_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_195_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nB: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n\nC: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n\nD: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 3.6199 0.1243 -2.4307\n0.35256 5.1536 -1935.2\n0.0029372 0.0011148 1\n\nB: 0.47589 0.042551 60.888\n-0.21388 0.80238 62.033\n-0.0003663 2.6901e-05 1.001\n\nC: 1.0819 0.012805 66.799\n0.075853 1.006 5.6909\n0.00034273 -2.4626e-05 1.0003\n\nD: 1.3231 -0.10518 226.69\n0.35118 1.4445 -217.52\n0.00076877 -2.4515e-05 0.99903\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_196_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_196_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nB: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nC: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nD: 0.14586 0.056449 119.48\n-0.21737 0.71439 95.786\n-0.00051182 3.3282e-05 1.0008\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nB: 1.4259 0.070724 58.865\n0.39243 1.3442 -170.04\n0.00084248 0.00011346 0.98851\n\nC: 0.57125 -0.095863 127.19\n0.050302 0.75099 -13.911\n-0.00020485 1.2421e-06 0.9999\n\nD: 0.14586 0.056449 119.48\n-0.21737 0.71439 95.786\n-0.00051182 3.3282e-05 1.0008\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_197_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_197_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nC: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nD: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 0.37083 -0.024499 139.16\n-0.094573 0.62749 65.353\n-0.00053805 -2.2225e-05 0.99885\n\nC: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n\nD: 0.60367 0.071352 -36.528\n-0.21232 0.96671 -45.299\n-0.00036835 6.7456e-05 0.99996\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_198_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_198_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Homography_estimation", "visual_input_component": "natural image", "source": "Hpatches", "options": "A: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nC: 0.46461 0.085196 589.33\n0.19659 0.76327 25.833\n0.00026763 8.9486e-05 1.0006\n\nD: 0.040904 -0.0023332 234.76\n-0.10713 0.35038 218.5\n-0.00028907 6.311e-06 1.0035\n", "question": "Please compute the 3x3 homography matrix between these two images.", "context": "Your task is computing the 3x3 homography matrix that maps the coordinates of points in one image to their corresponding coordinates in another image. 
(Two images of the same planar.)\nSelect from the following choices.\nA: 0.29858 0.0403 -122.67\n-0.38113 0.61838 172.03\n-0.00071255 -1.0448e-06 0.97348\n\nB: 0.67783 0.002447 123\n-0.00051063 0.68091 83.563\n-2.5166e-06 5.6486e-06 1\n\nC: 0.46461 0.085196 589.33\n0.19659 0.76327 25.833\n0.00026763 8.9486e-05 1.0006\n\nD: 0.040904 -0.0023332 234.76\n-0.10713 0.35038 218.5\n-0.00028907 6.311e-06 1.0035\n", "input_image_path": ["./2D-spatial/Homography_estimation/Homography_estimation_199_0.png", "./2D-spatial/Homography_estimation/Homography_estimation_199_1.png"], "output": "D", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/Icon_Question_Answering_with_Spatial_Context/qwen3-vl/metadata_info.json b/results/Icon_Question_Answering_with_Spatial_Context/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..78503b0 --- /dev/null +++ b/results/Icon_Question_Answering_with_Spatial_Context/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is at the bottom?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_0_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_0_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_0_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_0_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the water bottle inside the tent?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_1_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_1_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_1_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is next to the block?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_2_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_2_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_2_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_2_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_2_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_2_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is next to the one shaped like a cube?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_3_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_3_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_3_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_3_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_3_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the flashlight?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_4_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_4_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_4_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_4_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the piggy bank inside the gift box?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_5_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_5_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_5_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is above the bench?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_6_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_6_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_6_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_6_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_6_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is shaped like a cone and is above the desk?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_7_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_7_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_7_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the dice?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_8_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_8_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_8_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_8_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_8_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the watermelon?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_9_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_9_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_9_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_9_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is beside the one shaped like a cone?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_10_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_10_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_10_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_10_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_10_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_10_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the cake inside the oven?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_11_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_11_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_11_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is below the bed?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_12_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_12_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_12_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is beside the one shaped like a cube?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_13_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_13_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_13_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_13_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_13_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_13_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is above the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_14_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_14_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_14_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_15_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_15_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_15_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_15_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the volleyball?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_16_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_16_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_16_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_16_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_16_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is shaped like a sphere and is below the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_17_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_17_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_17_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the clock?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_18_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_18_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_18_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_18_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is below the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_19_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_19_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_19_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_19_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_19_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_20_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_20_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_20_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is below the bed?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_21_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_21_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_21_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_22_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_22_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_22_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_22_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is next to the pine cone?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_23_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_23_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_23_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_23_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_23_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the left?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_24_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_24_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_24_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_25_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_25_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_25_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is above the bench?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_26_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_26_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_26_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_26_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is at the top?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_27_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_27_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_27_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the bead?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_28_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_28_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_28_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_28_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_28_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is beside the crate?", "context": "Please answer a multi-choice question in the spatial context of icon 
images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_29_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_29_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_29_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_29_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the cup?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_30_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_30_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_30_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_30_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is above the bench?", "context": "Please answer a multi-choice question in the spatial context of 
icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_31_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_31_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_31_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_31_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_31_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is below the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_32_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_32_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_32_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the muffins outside the oven?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_33_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_33_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_33_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is below the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_34_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_34_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_34_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_34_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_35_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_35_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_35_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is next to the dog dish?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_36_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_36_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_36_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_36_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_36_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_36_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third 
image\nC: The fourth image", "question": "What is in the middle?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_37_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_37_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_37_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_37_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object below the bed is shaped like a cone?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_38_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_38_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_38_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is in the middle?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_39_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_39_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_39_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_39_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is below the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_40_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_40_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_40_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is in the middle?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_41_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_41_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_41_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_41_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is above the bench?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_42_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_42_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_42_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the cow outside the barn?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_43_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_43_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_43_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the soccer ball outside the gift box?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_44_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_44_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_44_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the flashlight?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_45_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_45_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_45_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_45_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the watermelon?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_46_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_46_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_46_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_46_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is below the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_47_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_47_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_47_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is on the left?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_48_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_48_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_48_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_48_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the bead?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_49_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_49_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_49_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_49_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the clock?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_50_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_50_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_50_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_50_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the pair of shoes?", "context": "Please answer a multi-choice question in the spatial context 
of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_51_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_51_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_51_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_51_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_51_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the one shaped like a sphere?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_52_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_52_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_52_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_52_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_52_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is beside the tub of ice cream?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_53_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_53_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_53_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_53_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_53_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_53_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is below the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_54_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_54_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_54_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_54_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is at the bottom?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_55_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_55_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_55_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_55_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the block?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_56_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_56_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_56_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_56_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is below the desk?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_57_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_57_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_57_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_57_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is above the bench?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_58_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_58_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_58_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_58_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is next to the trash can?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_59_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_59_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_59_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_59_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_59_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_59_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the mailing box?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_60_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_60_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_60_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_60_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_60_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is in the middle?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_61_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_61_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_61_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_61_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_62_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_62_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_62_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_62_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is on the left?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_63_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_63_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_63_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_63_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is shaped like a sphere and is below the bench?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_64_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_64_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_64_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_65_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_65_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_65_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the basketball?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_66_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_66_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_66_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_66_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is below the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_67_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_67_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_67_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_67_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_67_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the flashlight?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_68_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_68_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_68_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_68_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the pine cone?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_69_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_69_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_69_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_69_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is at the top?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_70_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_70_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_70_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is beside the top?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_71_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_71_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_71_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_71_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_71_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_71_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third 
image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the one shaped like a cone?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_72_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_72_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_72_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_72_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_72_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is at the top?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_73_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_73_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_73_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_73_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_74_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_74_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_74_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_74_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the roast beef inside the oven?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_75_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_75_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_75_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is at the top?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_76_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_76_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_76_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the bead?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_77_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_77_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_77_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_77_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is shaped like a cylinder and is below the bed?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_78_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_78_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_78_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is next to the one shaped like a cube?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_79_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_79_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_79_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_79_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_79_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_79_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object below the bed is shaped like a cylinder?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_80_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_80_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_80_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the clock?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_81_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_81_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_81_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_81_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_81_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_82_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_82_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_82_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is on the left?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_83_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_83_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_83_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_83_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is at the bottom?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_84_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_84_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_84_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is shaped like a cube and is above the bed?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_85_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_85_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_85_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is in the middle?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_86_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_86_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_86_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_86_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_87_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_87_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_87_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_87_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the left?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_88_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_88_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_88_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is next to the storage bin?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_89_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_89_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_89_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_89_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_89_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is below the desk?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_90_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_90_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_90_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the basketball inside the gift box?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_91_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_91_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_91_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the toy airplane outside the gift box?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_92_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_92_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_92_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is shaped like a sphere and is below the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_93_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_93_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_93_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is beside the pine cone?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_94_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_94_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_94_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_94_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is on the left?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_95_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_95_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_95_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_95_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the box of cookies?", "context": "Please answer a multi-choice question in the spatial context of icon 
images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_96_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_96_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_96_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_96_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_96_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the computer?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_97_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_97_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_97_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_97_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_97_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is beside the butterfly?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_98_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_98_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_98_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_98_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_98_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_98_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is on the left?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_99_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_99_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_99_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_99_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the flashlight?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_100_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_100_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_100_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_100_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_101_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_101_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_101_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is beside the one shaped like a cylinder?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_102_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_102_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_102_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_102_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_102_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_102_5.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The 
second image\nB: The third image\nC: The fourth image", "question": "What is at the bottom?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_103_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_103_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_103_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_103_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_104_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_104_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_104_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_104_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_105_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_105_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_105_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_106_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_106_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_106_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the left?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_107_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_107_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_107_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the pair of shoes?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_108_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_108_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_108_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_108_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is in the middle?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_109_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_109_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_109_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_109_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is below the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_110_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_110_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_110_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is below the bench?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_111_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_111_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_111_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_111_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the book inside the gift box?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_112_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_112_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_112_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is above the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_113_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_113_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_113_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is above the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_114_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_114_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_114_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is at the bottom?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_115_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_115_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_115_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object above the table is shaped like a cylinder?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_116_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_116_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_116_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_117_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_117_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_117_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object above the desk is shaped like a cylinder?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_118_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_118_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_118_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is at the bottom?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_119_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_119_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_119_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is in the middle?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_120_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_120_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_120_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_120_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is below the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_121_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_121_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_121_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_121_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_121_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object above the bench is shaped like a cube?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_122_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_122_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_122_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is at the top?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_123_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_123_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_123_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is next to the one shaped like a cube?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_124_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_124_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_124_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_124_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_124_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_124_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second 
image\nB: The third image", "question": "What is at the bottom?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_125_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_125_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_125_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the left?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_126_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_126_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_126_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_127_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_127_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_127_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is at the top?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_128_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_128_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_128_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_128_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is shaped like a sphere and is above the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_129_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_129_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_129_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is in the middle?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_130_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_130_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_130_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_130_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the drum?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_131_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_131_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_131_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_131_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_131_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is in the middle?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_132_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_132_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_132_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_132_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is at the bottom?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_133_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_133_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_133_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_133_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is shaped like a cone and is below the desk?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_134_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_134_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_134_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the toy pony outside the gift box?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_135_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_135_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_135_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the backpack?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_136_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_136_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_136_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_136_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_136_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the pair of shoes?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_137_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_137_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_137_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_137_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is below the desk?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_138_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_138_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_138_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_138_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_138_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_139_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_139_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_139_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_139_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the toy car inside the toy box?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_140_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_140_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_140_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the one shaped like a cube?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_141_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_141_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_141_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_141_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_141_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is next to the one shaped like a cone?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_142_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_142_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_142_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_142_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_142_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is next to the one shaped like a cone?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_143_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_143_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_143_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_143_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_143_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_143_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the left?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_144_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_144_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_144_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is above the bed?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_145_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_145_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_145_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the left?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_146_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_146_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_146_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is below the bench?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_147_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_147_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_147_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_147_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the butterfly?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_148_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_148_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_148_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_148_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_148_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is shaped like a cylinder and is above the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_149_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_149_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_149_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is next to the block?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_150_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_150_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_150_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_150_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_150_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_150_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The 
third image\nC: The fourth image", "question": "What is in the middle?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_151_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_151_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_151_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_151_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the cake?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_152_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_152_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_152_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_152_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_152_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is below the bed?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_153_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_153_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_153_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the left?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_154_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_154_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_154_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is at the top?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_155_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_155_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_155_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_155_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the pair of shoes?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_156_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_156_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_156_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_156_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_156_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is in the middle?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_157_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_157_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_157_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_157_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is below the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_158_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_158_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_158_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is above the bench?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_159_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_159_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_159_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is at the bottom?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_160_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_160_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_160_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is on the left?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_161_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_161_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_161_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_161_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the roll of stickers?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_162_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_162_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_162_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_162_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the roast beef inside the oven?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_163_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_163_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_163_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the cookies outside the oven?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_164_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_164_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_164_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is next to the one shaped like a sphere?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_165_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_165_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_165_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_165_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_165_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is below the bench?", "context": "Please answer a multi-choice question in the spatial context of 
icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_166_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_166_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_166_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the muffins outside the oven?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_167_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_167_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_167_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_168_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_168_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_168_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is in the middle?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_169_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_169_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_169_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_169_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is beside the storage bin?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_170_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_170_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_170_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_170_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is shaped like a sphere and is below the bench?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_171_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_171_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_171_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is next to the top?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_172_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_172_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_172_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_172_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_172_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the top?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_173_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_173_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_173_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_173_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_173_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is at the bottom?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_174_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_174_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_174_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the drum?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_175_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_175_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_175_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_175_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is in the middle?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_176_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_176_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_176_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_176_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is shaped like a cube and is below the bench?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_177_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_177_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_177_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is below the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_178_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_178_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_178_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the clock?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_179_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_179_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_179_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_179_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is beside the flashlight?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_180_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_180_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_180_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_180_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_180_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_180_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is above the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_181_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_181_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_181_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_181_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the left?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_182_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_182_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_182_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object below the table is shaped like a sphere?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_183_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_183_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_183_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is at the bottom?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_184_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_184_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_184_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_184_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object above the bench is shaped like a cone?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_185_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_185_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_185_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is beside the watermelon?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_186_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_186_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_186_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_186_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_186_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_186_5.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: 
The third image", "question": "Which object below the bed is shaped like a cone?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_187_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_187_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_187_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Which object is next to the trash can?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_188_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_188_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_188_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_188_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is on the right?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_189_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_189_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_189_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_189_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which picture shows the cow outside the barn?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_190_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_190_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_190_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is on the left?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_191_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_191_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_191_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is next to the one shaped like a sphere?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_192_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_192_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_192_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_192_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_192_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_192_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is below the bed?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_193_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_193_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_193_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "Which object is shaped like a cone and is above the table?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_194_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_194_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_194_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image", "question": "What is at the bottom?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_195_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_195_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_195_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "What is at the top?", "context": "Please answer a multi-choice question in the spatial context of icon images. The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_196_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_196_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_196_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_196_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "question": "Which object is next to the top?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image\nE: The sixth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_197_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_197_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_197_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_197_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_197_4.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_197_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is next to the bunch of bananas?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_198_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_198_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_198_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_198_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_198_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Icon_Question_Answering_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "iconqa", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Which object is beside the one shaped like a cube?", "context": "Please answer a multi-choice question in the spatial context of icon images. 
The input image is the first image.\nSelect from the following choices.A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_199_0.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_199_1.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_199_2.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_199_3.png", "./2D-spatial/Icon_Question_Answering_with_Spatial_Context/Icon_Question_Answering_with_Spatial_Context_199_4.png"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/Image_Captioning_with_Spatial_Context/qwen3-vl/metadata_info.json b/results/Image_Captioning_with_Spatial_Context/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..5dd518d --- /dev/null +++ b/results/Image_Captioning_with_Spatial_Context/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box that has four items and the three are touching the side.\nB: There is a box that has five items and all are in the center.\nC: There is a box that has three items and the four are touching the side.\nD: There is a bag that has four items and the three are touching the side.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box that has four items and the three are touching the side.\nB: There is a box that has five items and all are in the center.\nC: There is a box that has three items and the 
four are touching the side.\nD: There is a bag that has four items and the three are touching the side.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_0_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_0_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_0_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is a red square touching the base\nB: there is a white circle touching the base\nC: there is a black square touching the base\nD: there is a black triangle touching the base", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is a red square touching the base\nB: there is a white circle touching the base\nC: there is a black square touching the base\nD: there is a black triangle touching the base", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_1_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_1_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_1_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with 1 black and 1 blue item.\nB: There is a box with 1 black and 1 green item.\nC: There is a box with 2 black items.\nD: There is a box with 1 red and 1 blue item.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": 
"Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with 1 black and 1 blue item.\nB: There is a box with 1 black and 1 green item.\nC: There is a box with 2 black items.\nD: There is a box with 1 red and 1 blue item.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_2_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_2_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_2_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a black block above a yellow block.\nB: There is a yellow block above a black block.\nC: There is a yellow block below a black block.\nD: There is a yellow block next to a black block.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a black block above a yellow block.\nB: There is a yellow block above a black block.\nC: There is a yellow block below a black block.\nD: There is a yellow block next to a black block.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_3_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_3_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_3_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a blue block as the top of a tower.\nB: There is a 
red ball at the top of a tower.\nC: There is a yellow block at the base of a tower.\nD: There is a yellow block as the top of a tower.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blue block as the top of a tower.\nB: There is a red ball at the top of a tower.\nC: There is a yellow block at the base of a tower.\nD: There is a yellow block as the top of a tower.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_4_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_4_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_4_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 2 towers that contain white blocks\nB: There are 2 towers that contain black blocks\nC: There are 3 towers that contain black blocks\nD: There is 1 tower that contains black blocks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 2 towers that contain white blocks\nB: There are 2 towers that contain black blocks\nC: There are 3 towers that contain black blocks\nD: There is 1 tower that contains black blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_5_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_5_1.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_5_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: All three towers have a blue base.\nB: None of the towers have a blue base.\nC: Only one tower has a blue base.\nD: Two of the three towers has a blue base.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: All three towers have a blue base.\nB: None of the towers have a blue base.\nC: Only one tower has a blue base.\nD: Two of the three towers has a blue base.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_6_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_6_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_6_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a blue sphere as the base of a tower with more than two blocks\nB: There is a red block as the base of a tower with more than two blocks.\nC: There is a blue block as the base of a tower with more than two blocks.\nD: There is a blue block as the base of a single block tower.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blue sphere as the base of a tower with more than two blocks\nB: There is a red block as the base of a tower with more than 
two blocks.\nC: There is a blue block as the base of a tower with more than two blocks.\nD: There is a blue block as the base of a single block tower.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_7_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_7_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_7_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are two colors touching the wall.\nB: The wall has multiple colors.\nC: No colors are touching the wall.\nD: There is only one color touching the wall.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are two colors touching the wall.\nB: The wall has multiple colors.\nC: No colors are touching the wall.\nD: There is only one color touching the wall.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_8_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_8_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_8_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is at least 1 triangle closely touching a box corner\nB: There is at least 1 circle closely touching a box edge\nC: There is at least 1 square closely touching a circle\nD: There is at least 1 square closely tocuhing a box corner", "question": "Please correctly describe this set of images 
from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is at least 1 triangle closely touching a box corner\nB: There is at least 1 circle closely touching a box edge\nC: There is at least 1 square closely touching a circle\nD: There is at least 1 square closely tocuhing a box corner", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_9_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_9_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_9_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is 1 box with 2 black circles\nB: There is 1 box with 3 black circles\nC: There are 3 boxes with 2 black circles\nD: There are 2 boxes with 1 black circle", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 box with 2 black circles\nB: There is 1 box with 3 black circles\nC: There are 3 boxes with 2 black circles\nD: There are 2 boxes with 1 black circle", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_10_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_10_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_10_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is one 
tower with a black block at the top\nB: there is one tower with a red block at the top\nC: there are two towers with a black block at the top\nD: there is one tower with no block at the top", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is one tower with a black block at the top\nB: there is one tower with a red block at the top\nC: there are two towers with a black block at the top\nD: there is one tower with no block at the top", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_11_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_11_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_11_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: A yellow block is under a green block.\nB: There is a yellow block on a blue block.\nC: There is a red block next to a blue block.\nD: The green block is above the red block.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: A yellow block is under a green block.\nB: There is a yellow block on a blue block.\nC: There is a red block next to a blue block.\nD: The green block is above the red block.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_12_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_12_1.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_12_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: All towers have different base colors.\nB: There are only two towers which has the same base color.\nC: Only one tower has a unique base color.\nD: There are three towers with the same base color.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: All towers have different base colors.\nB: There are only two towers which has the same base color.\nC: Only one tower has a unique base color.\nD: There are three towers with the same base color.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_13_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_13_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_13_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are three yellow blocks in the middle of a tower.\nB: There are two yellow blocks as the base of a tower.\nC: There are two red blocks as the base of a tower.\nD: There is one yellow block at the top of a tower.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are three yellow blocks in the middle of a tower.\nB: There are two yellow blocks as the base of a tower.\nC: 
There are two red blocks as the base of a tower.\nD: There is one yellow block at the top of a tower.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_14_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_14_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_14_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with items of orange and pink color.\nB: There is a box with items of only black and blue color.\nC: There is a box with items of red and white color.\nD: There is a box with items of green and yellow color.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with items of orange and pink color.\nB: There is a box with items of only black and blue color.\nC: There is a box with items of red and white color.\nD: There is a box with items of green and yellow color.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_15_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_15_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_15_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a black tower.\nB: There is a black tree.\nC: There is a black bridge.\nD: There is a white tower.", "question": "Please correctly describe this set of images from the perspective of the spatial 
context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a black tower.\nB: There is a black tree.\nC: There is a black bridge.\nD: There is a white tower.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_16_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_16_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_16_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is exactly one yellow triangle touching the edge\nB: There is exactly one red triangle touching the edge\nC: There are no yellow triangles touching the edge\nD: There are two yellow triangles touching the edge", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is exactly one yellow triangle touching the edge\nB: There is exactly one red triangle touching the edge\nC: There are no yellow triangles touching the edge\nD: There are two yellow triangles touching the edge", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_17_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_17_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_17_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are exactly 2 blue blocks\nB: There are no blue 
blocks\nC: There are at least 3 blue blocks\nD: There are more than 10 blue blocks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are exactly 2 blue blocks\nB: There are no blue blocks\nC: There are at least 3 blue blocks\nD: There are more than 10 blue blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_18_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_18_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_18_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are two white items in the middle of the box.\nB: There is one black item and one white item at the edge of the box.\nC: There are two black items closely touching the bottom of a box.\nD: There is a single black item at the top of the box.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are two white items in the middle of the box.\nB: There is one black item and one white item at the edge of the box.\nC: There are two black items closely touching the bottom of a box.\nD: There is a single black item at the top of the box.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_19_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_19_1.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_19_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is no tower with a blue block at the base\nB: there is a tower with a red block at the base\nC: there are multiple towers with a blue block at the base\nD: there is exactly one tower with a blue block at the base", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is no tower with a blue block at the base\nB: there is a tower with a red block at the base\nC: there are multiple towers with a blue block at the base\nD: there is exactly one tower with a blue block at the base", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_20_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_20_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_20_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box, which a blue triangle and at least two black items.\nB: There is a box, which a blue circle and at least two black items.\nC: There is a box, which a blue triangle and only one black item.\nD: There is a box, which a green triangle and at least two black items.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box, 
which a blue triangle and at least two black items.\nB: There is a box, which a blue circle and at least two black items.\nC: There is a box, which a blue triangle and only one black item.\nD: There is a box, which a green triangle and at least two black items.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_21_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_21_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_21_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: One tower has a red block on top of a blue block\nB: One tower has a yellow block on top of a green block\nC: One tower has a yellow block on top of a blue block\nD: One tower has a blue block on top of a yellow block", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: One tower has a red block on top of a blue block\nB: One tower has a yellow block on top of a green block\nC: One tower has a yellow block on top of a blue block\nD: One tower has a blue block on top of a yellow block", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_22_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_22_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_22_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 3 towers with black blocks\nB: No towers have 
black blocks\nC: There is 1 tower that contains black blocks\nD: There are 2 towers that contain at least 1 black block", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 towers with black blocks\nB: No towers have black blocks\nC: There is 1 tower that contains black blocks\nD: There are 2 towers that contain at least 1 black block", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_23_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_23_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_23_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: A black block is at the top of a tower\nB: There is 1 tower with a black block at the bottom\nC: A tower with a red block at the bottom\nD: There are 2 towers with black blocks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: A black block is at the top of a tower\nB: There is 1 tower with a black block at the bottom\nC: A tower with a red block at the bottom\nD: There are 2 towers with black blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_24_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_24_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_24_2.png"], "output": "B", "qwen3-vl": 
"image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a pyramid with four blocks.\nB: There is a tower with four blocks.\nC: There is a tower with three blocks.\nD: There is a tower with five blocks.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a pyramid with four blocks.\nB: There is a tower with four blocks.\nC: There is a tower with three blocks.\nD: There is a tower with five blocks.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_25_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_25_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_25_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a yellow block on a blue block.\nB: There is a yellow block on a green block.\nC: There is a red block on a blue block.\nD: There is a blue block on a yellow block.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a yellow block on a blue block.\nB: There is a yellow block on a green block.\nC: There is a red block on a blue block.\nD: There is a blue block on a yellow block.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_26_0.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_26_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_26_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 3 boxes with a black item on top.\nB: There are 2 boxes with a white item on top.\nC: There is 1 box with a black item on top.\nD: There are 2 boxes with a black item on top.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 boxes with a black item on top.\nB: There are 2 boxes with a white item on top.\nC: There is 1 box with a black item on top.\nD: There are 2 boxes with a black item on top.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_27_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_27_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_27_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is exactly one red triangle touching the edge\nB: there are two blue triangles touching the edge\nC: there is exactly one blue square touching the edge\nD: there is exactly one blue triangle touching the edge", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is exactly one red triangle touching the 
edge\nB: there are two blue triangles touching the edge\nC: there is exactly one blue square touching the edge\nD: there is exactly one blue triangle touching the edge", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_28_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_28_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_28_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: One of the grey boxes has exactly seven objects\nB: One of the grey boxes has exactly eight objects\nC: One of the grey boxes has exactly four objects\nD: One of the grey box has exactly six objects", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: One of the grey boxes has exactly seven objects\nB: One of the grey boxes has exactly eight objects\nC: One of the grey boxes has exactly four objects\nD: One of the grey box has exactly six objects", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_29_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_29_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_29_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is exactly one tower with two blocks\nB: there are no towers with three blocks\nC: there are at least two towers with four blocks\nD: there is at least one tower with three blocks", 
"question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is exactly one tower with two blocks\nB: there are no towers with three blocks\nC: there are at least two towers with four blocks\nD: there is at least one tower with three blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_30_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_30_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_30_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a blue triangle touching the side.\nB: There is a red hexagon in the center.\nC: There is a yellow square touching the side.\nD: There is a green circle in the corner.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blue triangle touching the side.\nB: There is a red hexagon in the center.\nC: There is a yellow square touching the side.\nD: There is a green circle in the corner.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_31_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_31_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_31_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", 
"source": "nlvr", "options": "A: There is a tower with exactly four blocks with a yellow block at the bottom\nB: There is a tower with exactly three blocks with a yellow block at the top\nC: There is a tower with three red blocks at the top\nD: There is a tower with exactly two blocks, both yellow", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with exactly four blocks with a yellow block at the bottom\nB: There is a tower with exactly three blocks with a yellow block at the top\nC: There is a tower with three red blocks at the top\nD: There is a tower with exactly two blocks, both yellow", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_32_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_32_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_32_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: No boxes contain yellow items\nB: All boxes contain blue items\nC: There is at least 1 yellow item in each box\nD: Each box contains only red items", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: No boxes contain yellow items\nB: All boxes contain blue items\nC: There is at least 1 yellow item in each box\nD: Each box contains only red items", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_33_0.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_33_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_33_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: None of the black triangles are touching the center\nB: All of the black triangles are touching an edge\nC: None of the black triangles are touching a edge\nD: Some black triangles are touching an edge", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: None of the black triangles are touching the center\nB: All of the black triangles are touching an edge\nC: None of the black triangles are touching a edge\nD: Some black triangles are touching an edge", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_34_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_34_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_34_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is 1 stack with only purple and orange blocks\nB: There is 1 pile with only green and white blocks\nC: There is 1 tower with only blue and black blocks\nD: There is 1 tower with only red and yellow blocks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 stack 
with only purple and orange blocks\nB: There is 1 pile with only green and white blocks\nC: There is 1 tower with only blue and black blocks\nD: There is 1 tower with only red and yellow blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_35_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_35_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_35_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 3 boxes with a triangle in the middle\nB: There are 2 boxes with a triangle far from the corner\nC: There are 2 circles with a square closely touching a corner\nD: There are 2 boxes with a triangle closely touching a corner", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 boxes with a triangle in the middle\nB: There are 2 boxes with a triangle far from the corner\nC: There are 2 circles with a square closely touching a corner\nD: There are 2 boxes with a triangle closely touching a corner", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_36_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_36_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_36_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is exactly one circle touching the edge\nB: there are no circles touching the edge\nC: 
there are at least two circles touching the edge\nD: there are three triangles touching the edge", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is exactly one circle touching the edge\nB: there are no circles touching the edge\nC: there are at least two circles touching the edge\nD: there are three triangles touching the edge", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_37_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_37_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_37_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with only two items of black and yellow color.\nB: There is a box with two items of red and blue color.\nC: There is a box with three items of black and yellow color.\nD: There is a drawer with two items of green and yellow color.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with only two items of black and yellow color.\nB: There is a box with two items of red and blue color.\nC: There is a box with three items of black and yellow color.\nD: There is a drawer with two items of green and yellow color.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_38_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_38_1.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_38_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a tower with three blocks.\nB: There is a tower with six blocks.\nC: There is a tower with four blocks.\nD: There is a tower with five blocks.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with three blocks.\nB: There is a tower with six blocks.\nC: There is a tower with four blocks.\nD: There is a tower with five blocks.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_39_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_39_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_39_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a three blocks tower which has only one blue block.\nB: There is a three blocks tower which has only red blocks.\nC: There is a two blocks tower which has only one blue block.\nD: There is a four blocks tower which has two blue blocks.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a three blocks tower which has only one blue block.\nB: There is a three blocks tower which has only red blocks.\nC: There is a two blocks tower which has only one blue 
block.\nD: There is a four blocks tower which has two blue blocks.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_40_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_40_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_40_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is ablue block on a black block.\nB: There is no block in the picture.\nC: There is a blue block next to a black block.\nD: A black block is on top of a blue block.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is ablue block on a black block.\nB: There is no block in the picture.\nC: There is a blue block next to a black block.\nD: A black block is on top of a blue block.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_41_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_41_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_41_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 2 towers with 2 yellow blocks\nB: There is 1 tower with 3 yellow blocks\nC: There is 1 tower with 2 yellow blocks\nD: There is 1 tower with 2 blue blocks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the 
perspective of the spatial context.\nSelect from the following choices.\nA: There are 2 towers with 2 yellow blocks\nB: There is 1 tower with 3 yellow blocks\nC: There is 1 tower with 2 yellow blocks\nD: There is 1 tower with 2 blue blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_42_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_42_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_42_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: A box holds a blue triangle, a blue square, and a yellow circle.\nB: A box contains a blue circle, a yellow triangle, and a yellow square.\nC: There is a box with a blue triangle, a yellow square and a yellow circle.\nD: There is a box with a blue triangle, a yellow square", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: A box holds a blue triangle, a blue square, and a yellow circle.\nB: A box contains a blue circle, a yellow triangle, and a yellow square.\nC: There is a box with a blue triangle, a yellow square and a yellow circle.\nD: There is a box with a blue triangle, a yellow square", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_43_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_43_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_43_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", 
"source": "nlvr", "options": "A: There is a box with no items inside.\nB: There is a box with items of three different shapes.\nC: There is a box with items of only one color.\nD: There is a box with items of various colors.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with no items inside.\nB: There is a box with items of three different shapes.\nC: There is a box with items of only one color.\nD: There is a box with items of various colors.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_44_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_44_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_44_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 2 boxes with only red and yellow items.\nB: There are 3 boxes with only black and yellow items.\nC: There are 2 boxes with only black and blue items.\nD: There are 2 boxes with only black and yellow items.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 2 boxes with only red and yellow items.\nB: There are 3 boxes with only black and yellow items.\nC: There are 2 boxes with only black and blue items.\nD: There are 2 boxes with only black and yellow items.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_45_0.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_45_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_45_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a red block above a yellow block.\nB: There is a black block above a yellow block.\nC: There is a yellow block below a black block.\nD: There is a yellow block above a black block.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a red block above a yellow block.\nB: There is a black block above a yellow block.\nC: There is a yellow block below a black block.\nD: There is a yellow block above a black block.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_46_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_46_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_46_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a tower with a yellow block over a blue block\nB: There is a tower with a red block over a blue block\nC: There is a tower with a yellow block over a green block\nD: There is a tower with a yellow block next to a blue block", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower 
with a yellow block over a blue block\nB: There is a tower with a red block over a blue block\nC: There is a tower with a yellow block over a green block\nD: There is a tower with a yellow block next to a blue block", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_47_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_47_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_47_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with only two black and blue items.\nB: There is a box with different colored items.\nC: There is a box with several black and blue items.\nD: There is a box with only black items.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with only two black and blue items.\nB: There is a box with different colored items.\nC: There is a box with several black and blue items.\nD: There is a box with only black items.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_48_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_48_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_48_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with 4 items and 2 yellow squares\nB: There is a box with 3 items and 2 yellow squares in the middle.\nC: There is a box with 4 items 
and 2 yellow squares in the middle.\nD: There is a box with 4 items and 2 red circles in the middle.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with 4 items and 2 yellow squares\nB: There is a box with 3 items and 2 yellow squares in the middle.\nC: There is a box with 4 items and 2 yellow squares in the middle.\nD: There is a box with 4 items and 2 red circles in the middle.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_49_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_49_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_49_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are two black towers with multiple blocks.\nB: There is a black tower with several blocks.\nC: There is a white tower with only one block.\nD: There is a black tower with only one block.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are two black towers with multiple blocks.\nB: There is a black tower with several blocks.\nC: There is a white tower with only one block.\nD: There is a black tower with only one block.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_50_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_50_1.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_50_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 3 black circles\nB: There are 2 white triangles\nC: There are 2 black triangles\nD: There are 5 black triangles", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 black circles\nB: There are 2 white triangles\nC: There are 2 black triangles\nD: There are 5 black triangles", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_51_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_51_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_51_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a tower with four blocks.\nB: There is a row of candles.\nC: There is a stack of plates.\nD: There is a pile of books.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with four blocks.\nB: There is a row of candles.\nC: There is a stack of plates.\nD: There is a pile of books.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_52_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_52_1.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_52_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are no blue blocks\nB: There are at least 3 blue blocks\nC: There are exactly two blue blocks\nD: There is only one blue block", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are no blue blocks\nB: There are at least 3 blue blocks\nC: There are exactly two blue blocks\nD: There is only one blue block", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_53_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_53_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_53_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 4 yellow items and one large circle touching the wall.\nB: There are 3 yellow items but none are touching the wall.\nC: There are 3 yellow items touching the wall and at least one small circle nearly touching the wall.\nD: There are 2 yellow items touching the wall and no small circles.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 4 yellow items and one large circle touching the wall.\nB: There are 3 yellow items but none are touching the wall.\nC: There are 3 yellow items touching the 
wall and at least one small circle nearly touching the wall.\nD: There are 2 yellow items touching the wall and no small circles.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_54_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_54_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_54_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with three colors and no items on top.\nB: There is a box with two colors and a white item on top.\nC: There is a round container with all 3 colors and a black item beside it.\nD: There is a box with all 3 colors and a black item on top.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with three colors and no items on top.\nB: There is a box with two colors and a white item on top.\nC: There is a round container with all 3 colors and a black item beside it.\nD: There is a box with all 3 colors and a black item on top.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_55_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_55_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_55_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is 1 tower with a yellow block at the top\nB: There is 1 tower with a yellow block at the base\nC: There is 1 
tower with a red block at the base\nD: There are 2 towers with a yellow block at the base", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 tower with a yellow block at the top\nB: There is 1 tower with a yellow block at the base\nC: There is 1 tower with a red block at the base\nD: There are 2 towers with a yellow block at the base", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_56_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_56_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_56_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there are two black triangles touching the base\nB: there is one black triangle touching the base\nC: there is one black triangle not touching the base\nD: there are no black triangles touching the base", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are two black triangles touching the base\nB: there is one black triangle touching the base\nC: there is one black triangle not touching the base\nD: there are no black triangles touching the base", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_57_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_57_1.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_57_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a tower with exactly three blocks with a yellow block at the top\nB: There is a tower with three blocks with a blue block at the top\nC: There is a tower with four blocks and a red block at the top\nD: There is a tower with two blocks and a green block at the top", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with exactly three blocks with a yellow block at the top\nB: There is a tower with three blocks with a blue block at the top\nC: There is a tower with four blocks and a red block at the top\nD: There is a tower with two blocks and a green block at the top", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_58_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_58_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_58_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 2 blue blocks\nB: There is 1 blue block\nC: There are 2 red blocks\nD: There are 3 green blocks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 2 blue blocks\nB: There is 1 blue block\nC: There are 2 red blocks\nD: 
There are 3 green blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_59_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_59_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_59_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with 3 items and a black item on top.\nB: There is a box with 5 items and a red item on top.\nC: There is a box with 2 items and a blue item on top.\nD: There is a box with 3 items and a white item on top.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with 3 items and a black item on top.\nB: There is a box with 5 items and a red item on top.\nC: There is a box with 2 items and a blue item on top.\nD: There is a box with 3 items and a white item on top.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_60_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_60_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_60_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: t least two of the towers ha yellow bases.\nB: None of the towers have yellow bases.\nC: At most two of the towers have yellow bases.\nD: All of the towers have yellow bases.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", 
"context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: t least two of the towers ha yellow bases.\nB: None of the towers have yellow bases.\nC: At most two of the towers have yellow bases.\nD: All of the towers have yellow bases.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_61_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_61_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_61_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: the tower with two blocks has a black block at the top\nB: the tower with four blocks has a black block at the bottom\nC: the tower with four blocks has a red block at the top\nD: the tower with four blocks has a black block at the top", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: the tower with two blocks has a black block at the top\nB: the tower with four blocks has a black block at the bottom\nC: the tower with four blocks has a red block at the top\nD: the tower with four blocks has a black block at the top", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_62_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_62_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_62_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", 
"source": "nlvr", "options": "A: There is a box with items of 2 different colors and a black square.\nB: There is a box with items of 4 different colors and no square.\nC: There is a box with items of 2 different colors and a red square.\nD: There is a box with items of 3 different colors and a black square.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with items of 2 different colors and a black square.\nB: There is a box with items of 4 different colors and no square.\nC: There is a box with items of 2 different colors and a red square.\nD: There is a box with items of 3 different colors and a black square.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_63_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_63_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_63_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a yellow square touching the wall.\nB: There is a blue rectangle on the floor.\nC: There is a green circle floating in the air.\nD: There is a red triangle near the door.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a yellow square touching the wall.\nB: There is a blue rectangle on the floor.\nC: There is a green circle floating in the air.\nD: There is a red triangle near the door.", "input_image_path": 
["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_64_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_64_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_64_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with 3 items of the same color.\nB: There is a box with 4 items of all different colors.\nC: There is a box with 2 items of different colors.\nD: There is a box with 3 items of all 3 different colors.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with 3 items of the same color.\nB: There is a box with 4 items of all different colors.\nC: There is a box with 2 items of different colors.\nD: There is a box with 3 items of all 3 different colors.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_65_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_65_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_65_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: the tower has two blue blocks with a yellow block at the top\nB: there are three blocks in the tower with a red block at the top\nC: there is a tower with exactly two blocks having a blue block at the top.\nD: the tower has a single blue block at the top and bottom", "question": "Please correctly describe this set of images from the perspective 
of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: the tower has two blue blocks with a yellow block at the top\nB: there are three blocks in the tower with a red block at the top\nC: there is a tower with exactly two blocks having a blue block at the top.\nD: the tower has a single blue block at the top and bottom", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_66_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_66_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_66_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a tower with a yellow block over a red block\nB: There is a tower with a green block over a yellow block\nC: There is a tower with a yellow block over a blue block\nD: There is a tower with a blue block over a yellow block", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with a yellow block over a red block\nB: There is a tower with a green block over a yellow block\nC: There is a tower with a yellow block over a blue block\nD: There is a tower with a blue block over a yellow block", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_67_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_67_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_67_2.png"], "output": "C", "qwen3-vl": 
"image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: green block on the side\nB: blue block at the bottom\nC: yellow block at the top\nD: red block in the middle", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: green block on the side\nB: blue block at the bottom\nC: yellow block at the top\nD: red block in the middle", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_68_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_68_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_68_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a square closely touching the side of a box.\nB: There is a square closely touching the bottom of a box.\nC: There is no square closely touching the top of a box.\nD: There is no square closely touching the bottom of a box.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a square closely touching the side of a box.\nB: There is a square closely touching the bottom of a box.\nC: There is no square closely touching the top of a box.\nD: There is no square closely touching the bottom of a box.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_69_0.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_69_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_69_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a tower with a yellow block, a blue block and a black block.\nB: There is a tower with a yellow block, a green block and a black block.\nC: There is a tower with a yellow block, a blue block and\nD: There is a tower with a red block, a blue block and a black block.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with a yellow block, a blue block and a black block.\nB: There is a tower with a yellow block, a green block and a black block.\nC: There is a tower with a yellow block, a blue block and\nD: There is a tower with a red block, a blue block and a black block.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_70_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_70_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_70_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a black tower with only one block.\nB: There is a black tower with multiple blocks.\nC: There is a black tower with no blocks.\nD: There is a white tower with only one block.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe 
this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a black tower with only one block.\nB: There is a black tower with multiple blocks.\nC: There is a black tower with no blocks.\nD: There is a white tower with only one block.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_71_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_71_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_71_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 3 boxes each with black and yellow items.\nB: There is a box with only 3 items of black and yellow color.\nC: There is a black and yellow box with 3 items.\nD: There is a box with various items of different colors.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 boxes each with black and yellow items.\nB: There is a box with only 3 items of black and yellow color.\nC: There is a black and yellow box with 3 items.\nD: There is a box with various items of different colors.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_72_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_72_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_72_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is a black 
square touching the base\nB: there is a black circle touching the base\nC: there is a white square touching the base\nD: the square is floating above the base", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is a black square touching the base\nB: there is a black circle touching the base\nC: there is a white square touching the base\nD: the square is floating above the base", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_73_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_73_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_73_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are exactly two black squares touching an edge\nB: There are exactly three black squares not touching any edge\nC: There is exactly one black square not touching any edge\nD: There are exactly two black squares not touching any edge", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are exactly two black squares touching an edge\nB: There are exactly three black squares not touching any edge\nC: There is exactly one black square not touching any edge\nD: There are exactly two black squares not touching any edge", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_74_0.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_74_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_74_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is at least one tower with exactly two blocks having a blue block at the top\nB: there is no tower with exactly two blocks having a blue block at the top\nC: there is at least one tower with exactly two blocks having a red\nD: there is at least one tower with exactly three blocks having a blue block at the top", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is at least one tower with exactly two blocks having a blue block at the top\nB: there is no tower with exactly two blocks having a blue block at the top\nC: there is at least one tower with exactly two blocks having a red\nD: there is at least one tower with exactly three blocks having a blue block at the top", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_75_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_75_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_75_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 3 boxes with blue, yellow, and red items\nB: There is 1 box with only blue and yellow items\nC: There is 1 box with only red and green items\nD: There are 2 boxes with only blue and yellow items", "question": "Please correctly 
describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 boxes with blue, yellow, and red items\nB: There is 1 box with only blue and yellow items\nC: There is 1 box with only red and green items\nD: There are 2 boxes with only blue and yellow items", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_76_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_76_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_76_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is 1 tower with a blue block at the base\nB: There is 1 tower with a blue block at the top\nC: There is 1 tower with a red block at the base\nD: There are 2 towers with a blue block at the base", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 tower with a blue block at the base\nB: There is 1 tower with a blue block at the top\nC: There is 1 tower with a red block at the base\nD: There are 2 towers with a blue block at the base", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_77_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_77_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_77_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", 
"visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 3 towers with a blue block at the base\nB: There are 2 towers with a red block at the base\nC: There is 1 tower with a green block at the top\nD: There is 1 tower with a blue block at the base", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 towers with a blue block at the base\nB: There are 2 towers with a red block at the base\nC: There is 1 tower with a green block at the top\nD: There is 1 tower with a blue block at the base", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_78_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_78_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_78_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a blue block on a single-block tower.\nB: There is a blue block as the top of a tower with at least two blocks.\nC: There is a blue block at the base of a tower with at least two blocks.\nD: There is a green block as the top of a tower with at least two blocks.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blue block on a single-block tower.\nB: There is a blue block as the top of a tower with at least two blocks.\nC: There is a blue block at the base of a tower with at least two blocks.\nD: There is a green block as the top of a tower with 
at least two blocks.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_79_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_79_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_79_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with a yellow triangle and three blue items.\nB: There is a box with a yellow square and three green items.\nC: There is a box with a yellow circle and two red items.\nD: There is a box with a yellow circle and three blue items.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with a yellow triangle and three blue items.\nB: There is a box with a yellow square and three green items.\nC: There is a box with a yellow circle and two red items.\nD: There is a box with a yellow circle and three blue items.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_80_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_80_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_80_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: All 3 colors are not touching the wall.\nB: None of the colors are touching the wall.\nC: ll 3 different colors are touching the wall.\nD: Only 2 colors are touching the wall.", "question": "Please correctly describe this set of images from the 
perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: All 3 colors are not touching the wall.\nB: None of the colors are touching the wall.\nC: ll 3 different colors are touching the wall.\nD: Only 2 colors are touching the wall.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_81_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_81_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_81_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is one yellow block at the top of a tower.\nB: There is one red block as the base of a tower.\nC: There are two yellow blocks as the base of a tower.\nD: There are two blue blocks as the base of a tower.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is one yellow block at the top of a tower.\nB: There is one red block as the base of a tower.\nC: There are two yellow blocks as the base of a tower.\nD: There are two blue blocks as the base of a tower.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_82_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_82_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_82_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": 
"nlvr", "options": "A: There is at least one black block on a blue block.\nB: There is at least one black block on a green block.\nC: There is at least one blue block on a black block.\nD: There are only black blocks.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is at least one black block on a blue block.\nB: There is at least one black block on a green block.\nC: There is at least one blue block on a black block.\nD: There are only black blocks.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_83_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_83_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_83_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is a red object touching the edge\nB: there is a green object touching the edge\nC: there is a blue object touching the edge\nD: there is a blue object in the center", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is a red object touching the edge\nB: there is a green object touching the edge\nC: there is a blue object touching the edge\nD: there is a blue object in the center", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_84_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_84_1.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_84_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 2 towers with only blue and black blocks\nB: There is 1 tower with only yellow and blue blocks\nC: There is 1 tower with only red and green blocks\nD: There is 1 tower with only blue and black blocks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 2 towers with only blue and black blocks\nB: There is 1 tower with only yellow and blue blocks\nC: There is 1 tower with only red and green blocks\nD: There is 1 tower with only blue and black blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_85_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_85_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_85_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is one yellow item touching the floor.\nB: There are three yellow items touching the wall.\nC: There are two yellow items touching the wall.\nD: There are two blue items touching the wall.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is one yellow item touching the floor.\nB: There are three yellow items touching the wall.\nC: There are two 
yellow items touching the wall.\nD: There are two blue items touching the wall.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_86_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_86_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_86_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: one of the grey square contains exactly four objects\nB: one of the grey square contains exactly five objects\nC: one of the grey square contains exactly three objects\nD: one of the grey squares contains exactly six objects", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: one of the grey square contains exactly four objects\nB: one of the grey square contains exactly five objects\nC: one of the grey square contains exactly three objects\nD: one of the grey squares contains exactly six objects", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_87_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_87_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_87_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there are two blue circles touching the base\nB: there are two yellow circles touching the base\nC: there are three yellow circles touching the base\nD: there is one yellow circle in the middle", "question": "Please correctly 
describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are two blue circles touching the base\nB: there are two yellow circles touching the base\nC: there are three yellow circles touching the base\nD: there is one yellow circle in the middle", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_88_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_88_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_88_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 2 black triangles\nB: There are no black triangles\nC: There are 3 black triangles\nD: There are 2 white triangles", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 2 black triangles\nB: There are no black triangles\nC: There are 3 black triangles\nD: There are 2 white triangles", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_89_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_89_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_89_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a red block at the top of the tower with only one block.\nB: There is a black block as 
the base of a tower with at least two blocks.\nC: There is a black block at the base of a tower with only one block.\nD: There is a black block floating in the air beside the tower.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a red block at the top of the tower with only one block.\nB: There is a black block as the base of a tower with at least two blocks.\nC: There is a black block at the base of a tower with only one block.\nD: There is a black block floating in the air beside the tower.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_90_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_90_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_90_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a green circle in the center of a box.\nB: There is a blue square closely touching the bottom of a box.\nC: There is a yellow star floating above a box.\nD: There is a red triangle in the top right corner of a box.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a green circle in the center of a box.\nB: There is a blue square closely touching the bottom of a box.\nC: There is a yellow star floating above a box.\nD: There is a red triangle in the top right corner of a box.", "input_image_path": 
["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_91_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_91_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_91_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is only one yellow block as the base of a tower.\nB: There is one yellow block at the top of a tower.\nC: There are three yellow blocks at the base of the tower.\nD: There are two yellow blocks as the base of a tower.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is only one yellow block as the base of a tower.\nB: There is one yellow block at the top of a tower.\nC: There are three yellow blocks at the base of the tower.\nD: There are two yellow blocks as the base of a tower.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_92_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_92_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_92_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is one tower having a black block over a blue block\nB: there is one tower having a blue block over a black block\nC: there are two towers having black blocks over blue blocks\nD: there is one tower having a green block over a black block", "question": "Please correctly describe this set of images from the perspective of 
the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is one tower having a black block over a blue block\nB: there is one tower having a blue block over a black block\nC: there are two towers having black blocks over blue blocks\nD: there is one tower having a green block over a black block", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_93_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_93_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_93_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are three towers that have two blue blocks.\nB: There is one tower that has two blue blocks.\nC: There are two towers that have one blue block.\nD: There are two towers that has two blue blocks.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are three towers that have two blue blocks.\nB: There is one tower that has two blue blocks.\nC: There are two towers that have one blue block.\nD: There are two towers that has two blue blocks.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_94_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_94_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_94_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", 
"visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 4 yellow squares\nB: There are 3 yellow circles\nC: There are 3 yellow squares\nD: There are 3 blue squares", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 4 yellow squares\nB: There are 3 yellow circles\nC: There are 3 yellow squares\nD: There are 3 blue squares", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_95_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_95_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_95_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with 2 items and a yellow one touching the wall.\nB: There are no items in the box.\nC: A green item is touching the wall.\nD: The box contains 5 items.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with 2 items and a yellow one touching the wall.\nB: There are no items in the box.\nC: A green item is touching the wall.\nD: The box contains 5 items.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_96_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_96_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_96_2.png"], "output": "A", "qwen3-vl": 
"image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is a tree beside the tower\nB: there is a car near the tower\nC: there is a tower with exactly one block\nD: there is a tower with three blocks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is a tree beside the tower\nB: there is a car near the tower\nC: there is a tower with exactly one block\nD: there is a tower with three blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_97_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_97_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_97_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there are two towers with black blocks at the base\nB: there is exactly one tower with a white block at the base\nC: there is no tower with a black block at the base\nD: there is exactly one tower with a black block at the base", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are two towers with black blocks at the base\nB: there is exactly one tower with a white block at the base\nC: there is no tower with a black block at the base\nD: there is exactly one tower with a black block at the base", "input_image_path": 
["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_98_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_98_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_98_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 4 black blocks\nB: There are no black blocks\nC: There are 3 black blocks\nD: There are 2 black blocks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 4 black blocks\nB: There are no black blocks\nC: There are 3 black blocks\nD: There are 2 black blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_99_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_99_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_99_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is only one tower with at least two blue blocks.\nB: There are no towers with yellow blocks.\nC: There are two towers with at least two yellow blocks.\nD: There is only one tower with at least two yellow blocks.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is only one tower with at least two blue blocks.\nB: There are no towers with 
yellow blocks.\nC: There are two towers with at least two yellow blocks.\nD: There is only one tower with at least two yellow blocks.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_100_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_100_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_100_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there are at least three red triangles not touching any edge\nB: there are at least three yellow triangles touching one edge\nC: there are at least three yellow triangles not touching any edge\nD: there are exactly two yellow triangles not touching any edge", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are at least three red triangles not touching any edge\nB: there are at least three yellow triangles touching one edge\nC: there are at least three yellow triangles not touching any edge\nD: there are exactly two yellow triangles not touching any edge", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_101_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_101_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_101_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with a red circle and at least two black items.\nB: There is a box with a yellow 
triangle and at least two black items.\nC: There is a box with a yellow square and at least two black items.\nD: There is a box with a yellow square and no black items.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with a red circle and at least two black items.\nB: There is a box with a yellow triangle and at least two black items.\nC: There is a box with a yellow square and at least two black items.\nD: There is a box with a yellow square and no black items.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_102_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_102_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_102_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a tower with no blocks.\nB: There is a tower with only one block.\nC: There is a tower with multiple blocks.\nD: There is no tower at all.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with no blocks.\nB: There is a tower with only one block.\nC: There is a tower with multiple blocks.\nD: There is no tower at all.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_103_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_103_1.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_103_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: yellow block at the top\nB: yellow block at the bottom\nC: blue block at the top\nD: red block in the middle", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: yellow block at the top\nB: yellow block at the bottom\nC: blue block at the top\nD: red block in the middle", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_104_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_104_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_104_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there are multiple towers with blocks of different colors\nB: there are no towers with blocks of the same color\nC: there are two towers with more than one block where all the blocks are of same color\nD: there is only one tower with blocks of the same color", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are multiple towers with blocks of different colors\nB: there are no towers with blocks of the same color\nC: there are two towers with more than one block where all the blocks are of same color\nD: there is only one tower with blocks of the 
same color", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_105_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_105_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_105_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: One tower has a yellow block on top of a red block\nB: One tower has a blue block on top of a yellow block\nC: One tower has a red block on top of a green block\nD: One tower has a yellow block on top of a blue block", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: One tower has a yellow block on top of a red block\nB: One tower has a blue block on top of a yellow block\nC: One tower has a red block on top of a green block\nD: One tower has a yellow block on top of a blue block", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_106_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_106_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_106_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are at least 3 blue blocks\nB: There are no blue blocks\nC: There are exactly 5 blue blocks\nD: There are at most 2 blue blocks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from 
the perspective of the spatial context.\nSelect from the following choices.\nA: There are at least 3 blue blocks\nB: There are no blue blocks\nC: There are exactly 5 blue blocks\nD: There are at most 2 blue blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_107_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_107_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_107_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: The tower with four blocks has a black block at the bottom\nB: The tower with four blocks has a black block at the top\nC: The tower with three blocks has a black block at the top\nD: The tower with four blocks has a blue block at the top", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: The tower with four blocks has a black block at the bottom\nB: The tower with four blocks has a black block at the top\nC: The tower with three blocks has a black block at the top\nD: The tower with four blocks has a blue block at the top", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_108_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_108_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_108_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: All towers contain 1 green block\nB: Some towers contain 1 
blue block\nC: All towers contain 2 blue blocks\nD: ll towers contain 1 blue block", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: All towers contain 1 green block\nB: Some towers contain 1 blue block\nC: All towers contain 2 blue blocks\nD: ll towers contain 1 blue block", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_109_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_109_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_109_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there are two towers with blue blocks in the middle\nB: there are three towers having red blocks at the top\nC: there is one tower with a green block at the base\nD: there are two towers having a yellow block at the base", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are two towers with blue blocks in the middle\nB: there are three towers having red blocks at the top\nC: there is one tower with a green block at the base\nD: there are two towers having a yellow block at the base", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_110_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_110_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_110_2.png"], 
"output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: All yellow blocks are at the bottom of the towers.\nB: There are no towers with a yellow block on top.\nC: There is at least a yellow block as the top of a tower.\nD: There are no yellow blocks in the towers.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: All yellow blocks are at the bottom of the towers.\nB: There are no towers with a yellow block on top.\nC: There is at least a yellow block as the top of a tower.\nD: There are no yellow blocks in the towers.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_111_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_111_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_111_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a black block in the middle of a tower with three blocks.\nB: There is a black block at the bottom of a tower with three blocks.\nC: There is a black block as the top of a tower with three blocks.\nD: There is a red block at the top of a tower with three blocks.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a black block in the middle of a tower with three blocks.\nB: There is a black block at the bottom of a tower with three blocks.\nC: 
There is a black block as the top of a tower with three blocks.\nD: There is a red block at the top of a tower with three blocks.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_112_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_112_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_112_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a tower with exactly four blocks with a black block at the bottom\nB: There is a tower with exactly one block which is black\nC: There is a tower with exactly three blocks with a white block at the top\nD: There is a tower with exactly two blocks with a black block at the top", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with exactly four blocks with a black block at the bottom\nB: There is a tower with exactly one block which is black\nC: There is a tower with exactly three blocks with a white block at the top\nD: There is a tower with exactly two blocks with a black block at the top", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_113_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_113_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_113_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are three towers with the same height and 
the base is red.\nB: There is one tower with different height and the base is yellow.\nC: There are two towers with the same height and the base is green.\nD: There are two tower with different height and the base is yellow.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are three towers with the same height and the base is red.\nB: There is one tower with different height and the base is yellow.\nC: There are two towers with the same height and the base is green.\nD: There are two tower with different height and the base is yellow.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_114_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_114_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_114_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is one blue block as the base of a tower.\nB: There are two blue blocks as the base of a tower.\nC: There are two red blocks as the base of a tower.\nD: There are three blue blocks as the base of a tower.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is one blue block as the base of a tower.\nB: There are two blue blocks as the base of a tower.\nC: There are two red blocks as the base of a tower.\nD: There are three blue blocks as the base of a tower.", "input_image_path": 
["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_115_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_115_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_115_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are two yellow blocks in the middle of the tower.\nB: The base of the tower contains a red block.\nC: There is one blue block as the base of the tower.\nD: There is only one yellow block as the base of a tower.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are two yellow blocks in the middle of the tower.\nB: The base of the tower contains a red block.\nC: There is one blue block as the base of the tower.\nD: There is only one yellow block as the base of a tower.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_116_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_116_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_116_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a blue block next to a black block.\nB: There is a blue block below a black block.\nC: There is a blue block above a black block.\nD: There is a black block above a blue block.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this 
set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blue block next to a black block.\nB: There is a blue block below a black block.\nC: There is a blue block above a black block.\nD: There is a black block above a blue block.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_117_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_117_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_117_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with exactly two black items and at least two blue items.\nB: There is a box with exactly two blue items and at most two black items.\nC: There is a box with exactly two blue items and at least two black items.\nD: There is a box with less than two blue items and exactly two black items", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with exactly two black items and at least two blue items.\nB: There is a box with exactly two blue items and at most two black items.\nC: There is a box with exactly two blue items and at least two black items.\nD: There is a box with less than two blue items and exactly two black items", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_118_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_118_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_118_2.png"], "output": "C", "qwen3-vl": 
"image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a yellow item closely touching right wall of a box.\nB: There is a red item closely touching right wall of a box.\nC: There is no yellow item closely touching right wall of a box.\nD: No items are touching the right wall of the box.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a yellow item closely touching right wall of a box.\nB: There is a red item closely touching right wall of a box.\nC: There is no yellow item closely touching right wall of a box.\nD: No items are touching the right wall of the box.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_119_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_119_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_119_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: All towers have only red blocks\nB: Only one tower has a blue block\nC: No towers have blue blocks\nD: ll 3 towers have at least 1 blue block", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: All towers have only red blocks\nB: Only one tower has a blue block\nC: No towers have blue blocks\nD: ll 3 towers have at least 1 blue block", "input_image_path": 
["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_120_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_120_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_120_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a square touching the corner that is not yellow.\nB: There is a square touching the middle that is not yellow.\nC: There is a square in the center that is not yellow.\nD: There is a square touching the corner that is yellow.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a square touching the corner that is not yellow.\nB: There is a square touching the middle that is not yellow.\nC: There is a square in the center that is not yellow.\nD: There is a square touching the corner that is yellow.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_121_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_121_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_121_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: tleast one black triangle is not touching the edge\nB: No black triangles are present\nC: All black triangles are touching the edge\nD: All triangles are white and touching the edge", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": 
"Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: tleast one black triangle is not touching the edge\nB: No black triangles are present\nC: All black triangles are touching the edge\nD: All triangles are white and touching the edge", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_122_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_122_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_122_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a tower with only one block.\nB: There is a tower with two blocks.\nC: There is no tower.\nD: There is a tower with multiple blocks.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with only one block.\nB: There is a tower with two blocks.\nC: There is no tower.\nD: There is a tower with multiple blocks.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_123_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_123_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_123_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with 4 items of 3 different colors.\nB: There is a box with 3 items of all 3 different colors.\nC: There is a box with 2 items of all 3 
different colors.\nD: There is a box with 3 items of all the same color.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with 4 items of 3 different colors.\nB: There is a box with 3 items of all 3 different colors.\nC: There is a box with 2 items of all 3 different colors.\nD: There is a box with 3 items of all the same color.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_124_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_124_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_124_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a green block at the top of the tower.\nB: The base of the tower is red.\nC: There is a blue block as the base of a tower.\nD: The tower has a yellow base block.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a green block at the top of the tower.\nB: The base of the tower is red.\nC: There is a blue block as the base of a tower.\nD: The tower has a yellow base block.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_125_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_125_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_125_2.png"], "output": "C", "qwen3-vl": 
"image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a pyramid with four blocks.\nB: There is a tower with six blocks.\nC: There is a house with four blocks.\nD: There is a tower with four blocks.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a pyramid with four blocks.\nB: There is a tower with six blocks.\nC: There is a house with four blocks.\nD: There is a tower with four blocks.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_126_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_126_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_126_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is no yellow circle closely touching the bottom of a box.\nB: There is no yellow triangle closely touching the bottom of a box.\nC: There is a yellow circle closely touching the bottom of a box.\nD: There is no blue circle closely touching the bottom of a box.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is no yellow circle closely touching the bottom of a box.\nB: There is no yellow triangle closely touching the bottom of a box.\nC: There is a yellow circle closely touching the bottom of a box.\nD: There is no blue circle closely touching the bottom of a box.", 
"input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_127_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_127_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_127_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is 1 tower with a red block at the base\nB: There is 1 tower with a yellow block at the base\nC: There is 1 tower with a blue block at the base\nD: There are 2 towers with a yellow block at the base", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 tower with a red block at the base\nB: There is 1 tower with a yellow block at the base\nC: There is 1 tower with a blue block at the base\nD: There are 2 towers with a yellow block at the base", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_128_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_128_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_128_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 3 white circles\nB: There are 4 black circles\nC: There are 2 black circles\nD: There are 2 white squares", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the 
following choices.\nA: There are 3 white circles\nB: There are 4 black circles\nC: There are 2 black circles\nD: There are 2 white squares", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_129_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_129_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_129_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a black tower.\nB: There is a black house.\nC: There is a white tower.\nD: There is a black tree.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a black tower.\nB: There is a black house.\nC: There is a white tower.\nD: There is a black tree.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_130_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_130_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_130_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: All towers have different heights.\nB: Most towers are of different heights.\nC: There is only one tower with a unique height.\nD: There are at least two towers with the same height.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial 
context.\nSelect from the following choices.\nA: All towers have different heights.\nB: Most towers are of different heights.\nC: There is only one tower with a unique height.\nD: There are at least two towers with the same height.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_131_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_131_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_131_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a green hexagon on the table.\nB: There is a red circle on the floor.\nC: There is a yellow square touching the wall.\nD: There is a blue triangle near the door.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a green hexagon on the table.\nB: There is a red circle on the floor.\nC: There is a yellow square touching the wall.\nD: There is a blue triangle near the door.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_132_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_132_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_132_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there are exactly two squares not touching any edge\nB: there are exactly five squares not touching any edge\nC: there are exactly three squares not touching any edge\nD: there 
are exactly four squares not touching any edge", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are exactly two squares not touching any edge\nB: there are exactly five squares not touching any edge\nC: there are exactly three squares not touching any edge\nD: there are exactly four squares not touching any edge", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_133_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_133_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_133_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is 1 tower with a red block and a blue block\nB: There is 1 tower with a yellow block and a blue block\nC: There are 2 towers with yellow blocks\nD: There is 1 tower with yellow and red blocks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 tower with a red block and a blue block\nB: There is 1 tower with a yellow block and a blue block\nC: There are 2 towers with yellow blocks\nD: There is 1 tower with yellow and red blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_134_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_134_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_134_2.png"], 
"output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a blue item in the center of a box.\nB: There is a blue item touching the left wall of a box.\nC: There is a blue item closely touching right wall of a box.\nD: There is a red item closely touching right wall of a box.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blue item in the center of a box.\nB: There is a blue item touching the left wall of a box.\nC: There is a blue item closely touching right wall of a box.\nD: There is a red item closely touching right wall of a box.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_135_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_135_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_135_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: One of the grey boxes has exactly two objects both of which are circles\nB: One of the grey boxes has exactly three objects all of which are squares\nC: One of the grey box has exactly three objects one of which is a circle\nD: One of the grey boxes has exactly one object which is a triangle", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: One of the grey boxes has exactly two objects both of which are circles\nB: One of 
the grey boxes has exactly three objects all of which are squares\nC: One of the grey box has exactly three objects one of which is a circle\nD: One of the grey boxes has exactly one object which is a triangle", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_136_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_136_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_136_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are three blue squares touching the edge\nB: There are no blue squares in the picture\nC: There is only one blue square in the center\nD: There are exactly two blue squares not touching the edge", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are three blue squares touching the edge\nB: There are no blue squares in the picture\nC: There is only one blue square in the center\nD: There are exactly two blue squares not touching the edge", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_137_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_137_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_137_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: Only 2 yellow and one black item are touching the wall.\nB: Only 2 yellow and one red item are touching the wall.\nC: Only 3 yellow and 
one black item are touching the wall.\nD: Only 1 yellow and one black item are touching the wall.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: Only 2 yellow and one black item are touching the wall.\nB: Only 2 yellow and one red item are touching the wall.\nC: Only 3 yellow and one black item are touching the wall.\nD: Only 1 yellow and one black item are touching the wall.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_138_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_138_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_138_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: One box has 2 yellow squares\nB: One box has 3 yellow squares\nC: Two boxes have yellow squares\nD: One box has 2 red squares", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: One box has 2 yellow squares\nB: One box has 3 yellow squares\nC: Two boxes have yellow squares\nD: One box has 2 red squares", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_139_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_139_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_139_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": 
"Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are more than 5 blue blocks\nB: There are no blue blocks\nC: There are exactly 2 blue blocks\nD: There are at least 3 blue blocks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are more than 5 blue blocks\nB: There are no blue blocks\nC: There are exactly 2 blue blocks\nD: There are at least 3 blue blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_140_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_140_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_140_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: the tower with three blocks has a yellow block at the top\nB: the tower with two blocks has a yellow block at the top\nC: the tower with two blocks has a blue block at the top\nD: the tower with two blocks has a yellow block at the bottom", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: the tower with three blocks has a yellow block at the top\nB: the tower with two blocks has a yellow block at the top\nC: the tower with two blocks has a blue block at the top\nD: the tower with two blocks has a yellow block at the bottom", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_141_0.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_141_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_141_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with 4 items of various colors.\nB: There is a box with 3 items of all 3 different colors.\nC: There is a box with 3 items all of the same color.\nD: There is a box with 2 items of all 3 different colors.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with 4 items of various colors.\nB: There is a box with 3 items of all 3 different colors.\nC: There is a box with 3 items all of the same color.\nD: There is a box with 2 items of all 3 different colors.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_142_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_142_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_142_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is one tower with a white block at the top\nB: there is one tower with a black block at the top\nC: there is a skyscraper with a blue block at the top\nD: there are two towers with a red block at the top", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the 
following choices.\nA: there is one tower with a white block at the top\nB: there is one tower with a black block at the top\nC: there is a skyscraper with a blue block at the top\nD: there are two towers with a red block at the top", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_143_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_143_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_143_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is a tower with a four block which has a red block over a blue block\nB: there is a tower with a four block which has a blue block over a blue block\nC: there is a tower with three blocks which has a blue block over a blue block\nD: there is a tower with a four block which has a yellow", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is a tower with a four block which has a red block over a blue block\nB: there is a tower with a four block which has a blue block over a blue block\nC: there is a tower with three blocks which has a blue block over a blue block\nD: there is a tower with a four block which has a yellow", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_144_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_144_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_144_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", 
"visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are three blue squares touching the edge\nB: There are two red squares in the center\nC: There are exactly two blue squares not touching the edge\nD: All blue squares are touching the edge", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are three blue squares touching the edge\nB: There are two red squares in the center\nC: There are exactly two blue squares not touching the edge\nD: All blue squares are touching the edge", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_145_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_145_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_145_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: t least two of the towers ha yellow bases.\nB: None of the towers have yellow bases.\nC: All of the towers have blue bases.\nD: At least one of the towers has a red base.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: t least two of the towers ha yellow bases.\nB: None of the towers have yellow bases.\nC: All of the towers have blue bases.\nD: At least one of the towers has a red base.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_146_0.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_146_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_146_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with a blue square and a blue triangle.\nB: There is a box with a blue circle and a blue triangle.\nC: There is a box with a green circle and a green triangle.\nD: There is a box with a red circle and a red triangle.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with a blue square and a blue triangle.\nB: There is a box with a blue circle and a blue triangle.\nC: There is a box with a green circle and a green triangle.\nD: There is a box with a red circle and a red triangle.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_147_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_147_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_147_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: The top of the two four block towers are red.\nB: The top of the two four block towers are yellow.\nC: The bottom of the two four block towers are yellow.\nD: The top of the single five block tower is yellow.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial 
context.\nSelect from the following choices.\nA: The top of the two four block towers are red.\nB: The top of the two four block towers are yellow.\nC: The bottom of the two four block towers are yellow.\nD: The top of the single five block tower is yellow.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_148_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_148_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_148_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is a tower with a yellow block over a blue block\nB: there is a tower with a red block over a green block\nC: there is a tower with a black block over a red block\nD: there is a tower with a black block over a blue block", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is a tower with a yellow block over a blue block\nB: there is a tower with a red block over a green block\nC: there is a tower with a black block over a red block\nD: there is a tower with a black block over a blue block", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_149_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_149_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_149_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is 1 tower with a blue block at 
the base\nB: There are 2 towers with yellow blocks at the base\nC: There are 3 towers with green blocks at the base\nD: There is 1 tower with a yellow block at the base", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 tower with a blue block at the base\nB: There are 2 towers with yellow blocks at the base\nC: There are 3 towers with green blocks at the base\nD: There is 1 tower with a yellow block at the base", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_150_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_150_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_150_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a tower with a blue block above a blue block\nB: There is a tower with a blue block above a red block\nC: There is a tower with a red block above a blue block\nD: There is a tower with a blue block below a blue block", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with a blue block above a blue block\nB: There is a tower with a blue block above a red block\nC: There is a tower with a red block above a blue block\nD: There is a tower with a blue block below a blue block", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_151_0.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_151_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_151_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is a red circle in the center\nB: there are no circles touching the edge\nC: all circles are blue\nD: there is at least one yellow circle touching the edge", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is a red circle in the center\nB: there are no circles touching the edge\nC: all circles are blue\nD: there is at least one yellow circle touching the edge", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_152_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_152_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_152_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is only 1 tower that contains white blocks\nB: There are 3 towers that contain black blocks\nC: There are two towers that contain black blocks\nD: There is only 1 tower than contains black blccks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is only 1 tower that contains white blocks\nB: There are 3 towers that contain black 
blocks\nC: There are two towers that contain black blocks\nD: There is only 1 tower than contains black blccks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_153_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_153_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_153_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with items of only black color.\nB: There is a box with exactly 3 items of black and blue color.\nC: There is a box with more than 3 items of black and red color.\nD: There is a box with 3 items at most of black and blue color.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with items of only black color.\nB: There is a box with exactly 3 items of black and blue color.\nC: There is a box with more than 3 items of black and red color.\nD: There is a box with 3 items at most of black and blue color.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_154_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_154_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_154_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a stack of 2 green blocks side by side\nB: There is a tower with 2 red blocks stacked together\nC: There is a tower with 3 blue blocks stacked 
together\nD: There is a tower with 2 blue blocks stacked together", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a stack of 2 green blocks side by side\nB: There is a tower with 2 red blocks stacked together\nC: There is a tower with 3 blue blocks stacked together\nD: There is a tower with 2 blue blocks stacked together", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_155_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_155_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_155_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: One box has 3 yellow squares\nB: One box has 2 blue squares\nC: One box has 2 red squares\nD: One box has 2 yellow squares", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: One box has 3 yellow squares\nB: One box has 2 blue squares\nC: One box has 2 red squares\nD: One box has 2 yellow squares", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_156_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_156_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_156_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", 
"source": "nlvr", "options": "A: There is a tower with 3 blue blocks stacked together\nB: There is a tower with 2 red blocks stacked together\nC: There is a tower with 2 blue blocks stacked together\nD: There is a single blue block in the tower", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with 3 blue blocks stacked together\nB: There is a tower with 2 red blocks stacked together\nC: There is a tower with 2 blue blocks stacked together\nD: There is a single blue block in the tower", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_157_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_157_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_157_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is no blue block.\nB: There is at least one black block on a blue block.\nC: There is a blue block on a black block.\nD: There are only black blocks.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is no blue block.\nB: There is at least one black block on a blue block.\nC: There is a blue block on a black block.\nD: There are only black blocks.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_158_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_158_1.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_158_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: The top of the two three block towers are yellow.\nB: The top of the two four block towers are yellow.\nC: The bottom of the two four block towers are yellow.\nD: The top of the two four block towers are red.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: The top of the two three block towers are yellow.\nB: The top of the two four block towers are yellow.\nC: The bottom of the two four block towers are yellow.\nD: The top of the two four block towers are red.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_159_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_159_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_159_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are exactly two black squares touching every edge\nB: There are exactly two white squares not touching any edge\nC: There are exactly two black squares not touching any edge\nD: There are exactly three black squares not touching any edge", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are exactly two black squares touching every edge\nB: There 
are exactly two white squares not touching any edge\nC: There are exactly two black squares not touching any edge\nD: There are exactly three black squares not touching any edge", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_160_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_160_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_160_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there are two yellow circles touching the base\nB: there are two red circles touching the base\nC: there are three yellow circles touching the base\nD: there is one yellow circle touching the base", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there are two yellow circles touching the base\nB: there are two red circles touching the base\nC: there are three yellow circles touching the base\nD: there is one yellow circle touching the base", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_161_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_161_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_161_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a black block at the bottom of a tower with two blocks.\nB: There is a black block alone on a flat surface.\nC: There is a red block at the top of a tower with three 
blocks.\nD: There is a black block as the top of a tower with at least two blocks.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a black block at the bottom of a tower with two blocks.\nB: There is a black block alone on a flat surface.\nC: There is a red block at the top of a tower with three blocks.\nD: There is a black block as the top of a tower with at least two blocks.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_162_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_162_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_162_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: All blue items are in different boxes.\nB: ll blue items are in the same box.\nC: None of the blue items are in the same box.\nD: Only some blue items are in the same box.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: All blue items are in different boxes.\nB: ll blue items are in the same box.\nC: None of the blue items are in the same box.\nD: Only some blue items are in the same box.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_163_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_163_1.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_163_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 3 towers with 1 yellow block\nB: There are 2 towers with 3 yellow blocks\nC: There is 1 tower with 2 red blocks\nD: There is 1 tower with 3 yellow blocks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 towers with 1 yellow block\nB: There are 2 towers with 3 yellow blocks\nC: There is 1 tower with 2 red blocks\nD: There is 1 tower with 3 yellow blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_164_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_164_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_164_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is 1 red circle\nB: There is 1 black circle\nC: There is 1 black square\nD: There are 2 black circles", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 red circle\nB: There is 1 black circle\nC: There is 1 black square\nD: There are 2 black circles", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_165_0.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_165_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_165_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is exactly one tower with a red block at base\nB: There is exactly one tower with a yellow block at base\nC: There are two towers with a yellow block at base\nD: There is no tower with a yellow block at base", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is exactly one tower with a red block at base\nB: There is exactly one tower with a yellow block at base\nC: There are two towers with a yellow block at base\nD: There is no tower with a yellow block at base", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_166_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_166_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_166_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is at least one tower which has a yellow block above a black block\nB: there is at least one tower which has a black block above a yellow block\nC: all towers have a yellow block above a black block\nD: there is no tower which has a yellow block above a black block", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the 
perspective of the spatial context.\nSelect from the following choices.\nA: there is at least one tower which has a yellow block above a black block\nB: there is at least one tower which has a black block above a yellow block\nC: all towers have a yellow block above a black block\nD: there is no tower which has a yellow block above a black block", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_167_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_167_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_167_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a tower with a blue block at the top.\nB: There is a blue tower with all blocks the same color.\nC: There is a tower that the second block from the base is blue.\nD: There is a tower with the second block from the top blue.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with a blue block at the top.\nB: There is a blue tower with all blocks the same color.\nC: There is a tower that the second block from the base is blue.\nD: There is a tower with the second block from the top blue.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_168_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_168_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_168_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", 
"visual_input_component": "synthetic image", "source": "nlvr", "options": "A: blue squares are touching the bottom edge\nB: blue squares are touching the top edge\nC: blue squares are not touching any edge\nD: blue squares are touching all edges", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: blue squares are touching the bottom edge\nB: blue squares are touching the top edge\nC: blue squares are not touching any edge\nD: blue squares are touching all edges", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_169_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_169_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_169_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with a yellow circle and 2 black squares.\nB: There is a box with a yellow triangle and 2 black circles.\nC: There is a box with a yellow triangle and 2 black squares.\nD: There is a box with a yellow triangle and 3 black squares.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with a yellow circle and 2 black squares.\nB: There is a box with a yellow triangle and 2 black circles.\nC: There is a box with a yellow triangle and 2 black squares.\nD: There is a box with a yellow triangle and 3 black squares.", "input_image_path": 
["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_170_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_170_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_170_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a yellow block as the base of a tower.\nB: There is a yellow block at the top of the tower.\nC: There is no yellow block as the base of a tower.\nD: There are two yellow blocks in the middle of the tower.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a yellow block as the base of a tower.\nB: There is a yellow block at the top of the tower.\nC: There is no yellow block as the base of a tower.\nD: There are two yellow blocks in the middle of the tower.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_171_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_171_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_171_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are multiple towers with different colors.\nB: There is a single block tower with multiple colors.\nC: There is a two blocks tower with different colors.\nD: There is a two blocks tower that has only one color.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please 
correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are multiple towers with different colors.\nB: There is a single block tower with multiple colors.\nC: There is a two blocks tower with different colors.\nD: There is a two blocks tower that has only one color.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_172_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_172_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_172_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: the single block is yellow\nB: the tower with two blocks has a yellow block at the top\nC: the tower with two blocks has a red block at the top\nD: the tower with three blocks has a yellow block at the top", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: the single block is yellow\nB: the tower with two blocks has a yellow block at the top\nC: the tower with two blocks has a red block at the top\nD: the tower with three blocks has a yellow block at the top", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_173_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_173_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_173_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": 
"A: There is a tower with a blue block over a yellow block\nB: There is a tower with two yellow blocks\nC: There is a tower with a yellow block over a blue block\nD: There is a tower with a green block over a yellow block", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with a blue block over a yellow block\nB: There is a tower with two yellow blocks\nC: There is a tower with a yellow block over a blue block\nD: There is a tower with a green block over a yellow block", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_174_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_174_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_174_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is one black triangle not touching any edge\nB: there are two black triangles touching the edges\nC: there are no black triangles visible\nD: there are two black triangles not touching any edge", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is one black triangle not touching any edge\nB: there are two black triangles touching the edges\nC: there are no black triangles visible\nD: there are two black triangles not touching any edge", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_175_0.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_175_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_175_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are no black triangles touching any edge\nB: There is exactly one black triangle touching an edge\nC: There are two black triangles not touching any edges\nD: There is exactly one black triangle not touching any edge", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are no black triangles touching any edge\nB: There is exactly one black triangle touching an edge\nC: There are two black triangles not touching any edges\nD: There is exactly one black triangle not touching any edge", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_176_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_176_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_176_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are two towers that has black block at the top.\nB: There are no towers in the image.\nC: There is only one tower with a black block at the top.\nD: There are two towers, but they have red blocks at the top.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial 
context.\nSelect from the following choices.\nA: There are two towers that has black block at the top.\nB: There are no towers in the image.\nC: There is only one tower with a black block at the top.\nD: There are two towers, but they have red blocks at the top.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_177_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_177_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_177_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 4 black circles\nB: There are 3 black circles\nC: There are 2 white circles\nD: There are 2 black circles", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 4 black circles\nB: There are 3 black circles\nC: There are 2 white circles\nD: There are 2 black circles", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_178_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_178_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_178_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a blocking tower made of three stones.\nB: There is a tower with four same colored blocks.\nC: There is a tower with three different colored blocks.\nD: There is a tower that has three the same blocks color.", "question": "Please correctly 
describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blocking tower made of three stones.\nB: There is a tower with four same colored blocks.\nC: There is a tower with three different colored blocks.\nD: There is a tower that has three the same blocks color.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_179_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_179_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_179_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are five circles not touching any edge\nB: There are exactly four circles touching one edge\nC: There are exactly three circles not touching any edge\nD: There are exactly four circles not touching any edge", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are five circles not touching any edge\nB: There are exactly four circles touching one edge\nC: There are exactly three circles not touching any edge\nD: There are exactly four circles not touching any edge", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_180_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_180_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_180_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": 
"Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a red block as the top of a tower with at least two blocks.\nB: There is a blue block as the bottom of a tower with at least two blocks.\nC: There is a blue block as the top of a tower with at least two blocks.\nD: There is a blue block as the top of a single block tower", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a red block as the top of a tower with at least two blocks.\nB: There is a blue block as the bottom of a tower with at least two blocks.\nC: There is a blue block as the top of a tower with at least two blocks.\nD: There is a blue block as the top of a single block tower", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_181_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_181_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_181_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: one of the grey squares is empty\nB: one of the grey squares has exactly five objects\nC: one of the grey square has exactly four objects\nD: one of the grey squares has exactly three objects", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: one of the grey squares is empty\nB: one of the grey squares has exactly five objects\nC: one of the grey square has exactly 
four objects\nD: one of the grey squares has exactly three objects", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_182_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_182_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_182_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is at least 1 circle closely touching a box corner\nB: There is at least 1 square closely tocuhing a box corner\nC: There is at least 1 square touching the center of a box\nD: There is at least 1 triangle closely touching a box corner", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is at least 1 circle closely touching a box corner\nB: There is at least 1 square closely tocuhing a box corner\nC: There is at least 1 square touching the center of a box\nD: There is at least 1 triangle closely touching a box corner", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_183_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_183_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_183_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: Each grey box contains atleast one yellow object touching the edge\nB: Each grey box has no object touching the edge\nC: Each grey box is empty\nD: Each grey box contains a green object", "question": 
"Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: Each grey box contains atleast one yellow object touching the edge\nB: Each grey box has no object touching the edge\nC: Each grey box is empty\nD: Each grey box contains a green object", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_184_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_184_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_184_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is at least 1 tower with a blue block at the top\nB: There are exactly 2 towers with a blue block at the top\nC: There are no towers with a blue block at the top\nD: There is at least 1 tower with a green block at the top", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is at least 1 tower with a blue block at the top\nB: There are exactly 2 towers with a blue block at the top\nC: There are no towers with a blue block at the top\nD: There is at least 1 tower with a green block at the top", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_185_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_185_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_185_2.png"], "output": "A", "qwen3-vl": "image none"}, 
{"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: No towers have any height.\nB: All towers have different heights.\nC: There are at least two towers with the same height.\nD: There is only one tower with the same height.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: No towers have any height.\nB: All towers have different heights.\nC: There are at least two towers with the same height.\nD: There is only one tower with the same height.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_186_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_186_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_186_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a tower with three blue blocks.\nB: There is a tower with a black block and two blue blocks.\nC: There is a tower with two black blocks and a blue block.\nD: There is a tower with a black block and a red block.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a tower with three blue blocks.\nB: There is a tower with a black block and two blue blocks.\nC: There is a tower with two black blocks and a blue block.\nD: There is a tower with a black block and a red block.", "input_image_path": 
["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_187_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_187_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_187_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is a tower with a yellow block below a red block at the top\nB: there is a tower with a red block below a yellow block at the top\nC: there is a tower with a blue block below a green block at the top\nD: there is a tower with a yellow block below a yellow block at the top", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is a tower with a yellow block below a red block at the top\nB: there is a tower with a red block below a yellow block at the top\nC: there is a tower with a blue block below a green block at the top\nD: there is a tower with a yellow block below a yellow block at the top", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_188_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_188_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_188_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 2 blue blocks\nB: There are 4 blue blocks\nC: There are 3 blue blocks\nD: There are 2 red blocks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", 
"context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 2 blue blocks\nB: There are 4 blue blocks\nC: There are 3 blue blocks\nD: There are 2 red blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_189_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_189_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_189_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is 1 tower with 2 yellow blocks at the base\nB: There are 2 towers with 1 yellow block at the base\nC: There is 1 tower with 1 red block at the base\nD: There is 1 tower with 1 yellow block at the base", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is 1 tower with 2 yellow blocks at the base\nB: There are 2 towers with 1 yellow block at the base\nC: There is 1 tower with 1 red block at the base\nD: There is 1 tower with 1 yellow block at the base", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_190_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_190_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_190_2.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are two red blocks as the base of a tower.\nB: There is one yellow block as the 
base of a tower.\nC: There are two yellow blocks as the base of a tower.\nD: There are three yellow blocks as the base of a tower.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are two red blocks as the base of a tower.\nB: There is one yellow block as the base of a tower.\nC: There are two yellow blocks as the base of a tower.\nD: There are three yellow blocks as the base of a tower.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_191_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_191_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_191_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: there is no tower with a yellow block above a black block\nB: there is at least one tower which has a yellow block above a black block\nC: every tower has a yellow block above a black block\nD: there is a yellow block below every black block", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: there is no tower with a yellow block above a black block\nB: there is at least one tower which has a yellow block above a black block\nC: every tower has a yellow block above a black block\nD: there is a yellow block below every black block", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_192_0.png", 
"./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_192_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_192_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 3 yellow squares\nB: There are 4 yellow squares\nC: There are 3 yellow circles\nD: There are 2 yellow squares", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 yellow squares\nB: There are 4 yellow squares\nC: There are 3 yellow circles\nD: There are 2 yellow squares", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_193_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_193_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_193_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are exactly two black blocks as the top of a tower.\nB: There are exactly two black blocks at the bottom of a tower.\nC: There is one black block at the top of a tower.\nD: There are exactly three black blocks as the top of a tower.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are exactly two black blocks as the top of a tower.\nB: There are exactly two black blocks at the bottom of a tower.\nC: There is one black 
block at the top of a tower.\nD: There are exactly three black blocks as the top of a tower.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_194_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_194_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_194_2.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a box with items of various colors.\nB: There is a box with items of only one color.\nC: There is no box with items in it.\nD: There are multiple boxes with items of one color each.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a box with items of various colors.\nB: There is a box with items of only one color.\nC: There is no box with items in it.\nD: There are multiple boxes with items of one color each.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_195_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_195_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_195_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There is a blue item floating in the middle of the box.\nB: There is a blue item closely touching right wall of a box.\nC: There is a green item touching the ceiling of a box.\nD: There is a red item closely touching the left wall of a box.", "question": "Please correctly 
describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There is a blue item floating in the middle of the box.\nB: There is a blue item closely touching right wall of a box.\nC: There is a green item touching the ceiling of a box.\nD: There is a red item closely touching the left wall of a box.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_196_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_196_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_196_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 3 boxes with a black item on top.\nB: There are 2 boxes with a white item on top.\nC: There are 2 boxes with a black item on top.\nD: There are 2 boxes with nothing on top.", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 boxes with a black item on top.\nB: There are 2 boxes with a white item on top.\nC: There are 2 boxes with a black item on top.\nD: There are 2 boxes with nothing on top.", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_197_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_197_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_197_2.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", 
"visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 2 white circles\nB: There are 2 black circles\nC: There are 3 black circles\nD: There are 4 black circles", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 2 white circles\nB: There are 2 black circles\nC: There are 3 black circles\nD: There are 4 black circles", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_198_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_198_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_198_2.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Captioning_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "nlvr", "options": "A: There are 3 white blocks\nB: There are 2 black blocks\nC: There are 3 black blocks\nD: There are 4 black blocks", "question": "Please correctly describe this set of images from the perspective of the spatial context.", "context": "Please correctly describe this set of images from the perspective of the spatial context.\nSelect from the following choices.\nA: There are 3 white blocks\nB: There are 2 black blocks\nC: There are 3 black blocks\nD: There are 4 black blocks", "input_image_path": ["./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_199_0.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_199_1.png", "./2D-spatial/Image_Captioning_with_Spatial_Context/Image_Captioning_with_Spatial_Context_199_2.png"], "output": "C", "qwen3-vl": "image none"}] \ No newline at end of file diff --git 
a/results/Image_Spatial_Transformation_Estimation/qwen3-vl/metadata_info.json b/results/Image_Spatial_Transformation_Estimation/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..8395b77 --- /dev/null +++ b/results/Image_Spatial_Transformation_Estimation/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nB: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nC: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}\nD: {\"rotation_angle\": 52.0207999596704, \"translation_dx\": 62.052266940503074, \"translation_dy\": 15.318990484280505, \"scale\": 1.1445040102422772}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. 
\nSelect from the following choices.\nA: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nB: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nC: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}\nD: {\"rotation_angle\": 52.0207999596704, \"translation_dx\": 62.052266940503074, \"translation_dy\": 15.318990484280505, \"scale\": 1.1445040102422772}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_0_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_0_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nC: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, \"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nD: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that 
occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nC: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, \"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nD: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_1_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_1_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}\nB: {\"rotation_angle\": 1.3693998936690264, \"translation_dx\": -71.94174431428723, \"translation_dy\": 25.661133958182248, \"scale\": 1.468813327861592}\nC: {\"rotation_angle\": -110.51021822636605, \"translation_dx\": -17.924195571284486, \"translation_dy\": -0.10679752473519954, \"scale\": 1.4066663412939815}\nD: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the 
transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}\nB: {\"rotation_angle\": 1.3693998936690264, \"translation_dx\": -71.94174431428723, \"translation_dy\": 25.661133958182248, \"scale\": 1.468813327861592}\nC: {\"rotation_angle\": -110.51021822636605, \"translation_dx\": -17.924195571284486, \"translation_dy\": -0.10679752473519954, \"scale\": 1.4066663412939815}\nD: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_2_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_2_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, \"translation_dy\": -32.06450758946801, \"scale\": 1.1967157159259998}\nB: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}\nC: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nD: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and 
magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, \"translation_dy\": -32.06450758946801, \"scale\": 1.1967157159259998}\nB: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}\nC: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nD: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_3_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_3_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nB: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nC: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nD: {\"rotation_angle\": 4.3566947214011975, \"translation_dx\": 60.69356846846577, \"translation_dy\": 19.542677658157032, \"scale\": 1.353031271581857}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the 
type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nB: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nC: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nD: {\"rotation_angle\": 4.3566947214011975, \"translation_dx\": 60.69356846846577, \"translation_dy\": 19.542677658157032, \"scale\": 1.353031271581857}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_4_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_4_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}\nC: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nD: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict 
the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}\nC: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nD: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_5_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_5_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 67.74863170033868, \"translation_dx\": 0.9436916559104702, \"translation_dy\": 79.02717939495389, \"scale\": 1.0490112177140545}\nB: {\"rotation_angle\": -149.34069149386406, \"translation_dx\": 81.63420911320063, \"translation_dy\": -26.073567429384056, \"scale\": 1.427947630130646}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict 
the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 67.74863170033868, \"translation_dx\": 0.9436916559104702, \"translation_dy\": 79.02717939495389, \"scale\": 1.0490112177140545}\nB: {\"rotation_angle\": -149.34069149386406, \"translation_dx\": 81.63420911320063, \"translation_dy\": -26.073567429384056, \"scale\": 1.427947630130646}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_6_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_6_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 178.3015459217881, \"translation_dx\": 2.1592483018484785, \"translation_dy\": -86.15095567396924, \"scale\": 1.206185814877298}\nB: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}\nC: {\"rotation_angle\": 179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nD: {\"rotation_angle\": 49.90656423603761, \"translation_dx\": 85.27067294320437, \"translation_dy\": -8.928665399863448, \"scale\": 0.9370060594249733}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict 
the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 178.3015459217881, \"translation_dx\": 2.1592483018484785, \"translation_dy\": -86.15095567396924, \"scale\": 1.206185814877298}\nB: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}\nC: {\"rotation_angle\": 179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nD: {\"rotation_angle\": 49.90656423603761, \"translation_dx\": 85.27067294320437, \"translation_dy\": -8.928665399863448, \"scale\": 0.9370060594249733}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_7_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_7_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nB: {\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}\nC: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nD: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": -103.12901971602221, \"translation_dy\": 94.89161684072867, \"scale\": 1.2295411735859756}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to 
predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nB: {\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}\nC: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nD: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": -103.12901971602221, \"translation_dy\": 94.89161684072867, \"scale\": 1.2295411735859756}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_8_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_8_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nB: {\"rotation_angle\": -0.45613579718829556, \"translation_dx\": 98.71619714866841, \"translation_dy\": 70.1100439641223, \"scale\": 0.6491919010173006}\nC: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nD: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 1.3327657238113826}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your 
task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nB: {\"rotation_angle\": -0.45613579718829556, \"translation_dx\": 98.71619714866841, \"translation_dy\": 70.1100439641223, \"scale\": 0.6491919010173006}\nC: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nD: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 1.3327657238113826}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_9_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_9_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}\nB: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}\nC: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nD: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), 
your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}\nB: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}\nC: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nD: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_10_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_10_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nB: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nC: {\"rotation_angle\": -23.02063628299686, \"translation_dx\": -42.06347070905805, \"translation_dy\": 68.90308226059909, \"scale\": 0.7321107429069119}\nD: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, 
translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nB: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nC: {\"rotation_angle\": -23.02063628299686, \"translation_dx\": -42.06347070905805, \"translation_dy\": 68.90308226059909, \"scale\": 0.7321107429069119}\nD: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_11_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_11_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nB: {\"rotation_angle\": -123.92621597373325, \"translation_dx\": 115.25994331141689, \"translation_dy\": -45.13111299141354, \"scale\": 1.164470344420729}\nC: {\"rotation_angle\": 95.56102360167273, \"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}\nD: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial 
transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nB: {\"rotation_angle\": -123.92621597373325, \"translation_dx\": 115.25994331141689, \"translation_dy\": -45.13111299141354, \"scale\": 1.164470344420729}\nC: {\"rotation_angle\": 95.56102360167273, \"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}\nD: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_12_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_12_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -124.74198080809023, \"translation_dx\": -48.23531115232953, \"translation_dy\": 52.62526617026404, \"scale\": 1.3484625774406969}\nB: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}\nC: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": 110.02825264959768, \"translation_dx\": -53.26387197670213, \"translation_dy\": 88.43864976013427, \"scale\": 1.4833645013101147}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and 
after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -124.74198080809023, \"translation_dx\": -48.23531115232953, \"translation_dy\": 52.62526617026404, \"scale\": 1.3484625774406969}\nB: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}\nC: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": 110.02825264959768, \"translation_dx\": -53.26387197670213, \"translation_dy\": 88.43864976013427, \"scale\": 1.4833645013101147}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_13_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_13_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}\nB: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nC: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nD: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before 
and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}\nB: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nC: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nD: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_14_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_14_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": -17.359661719977098, \"scale\": 1.0858344969275349}\nC: {\"rotation_angle\": 162.9787629733711, \"translation_dx\": 56.68968820785494, \"translation_dy\": 63.47754229449794, \"scale\": 0.7767697180212818}\nD: {\"rotation_angle\": 74.4727172984789, \"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, \"scale\": 1.4775593630739356}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before 
and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": -17.359661719977098, \"scale\": 1.0858344969275349}\nC: {\"rotation_angle\": 162.9787629733711, \"translation_dx\": 56.68968820785494, \"translation_dy\": 63.47754229449794, \"scale\": 0.7767697180212818}\nD: {\"rotation_angle\": 74.4727172984789, \"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, \"scale\": 1.4775593630739356}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_15_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_15_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -32.96407209098831, \"translation_dx\": -27.518946535455143, \"translation_dy\": 2.5370159689679213, \"scale\": 1.259328459428434}\nB: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}\nC: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}\nD: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, \"translation_dy\": 24.014319900588845, \"scale\": 1.3204557483507742}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes 
before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -32.96407209098831, \"translation_dx\": -27.518946535455143, \"translation_dy\": 2.5370159689679213, \"scale\": 1.259328459428434}\nB: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}\nC: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}\nD: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, \"translation_dy\": 24.014319900588845, \"scale\": 1.3204557483507742}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_16_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_16_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}\nB: {\"rotation_angle\": -148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nC: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nD: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images 
depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}\nB: {\"rotation_angle\": -148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nC: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nD: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_17_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_17_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}\nB: {\"rotation_angle\": 133.22970053001933, \"translation_dx\": 30.83867253278636, \"translation_dy\": 9.987607615316023, \"scale\": 0.9746642566652708}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": -110.51021822636605, \"translation_dx\": -17.924195571284486, \"translation_dy\": -0.10679752473519954, \"scale\": 1.4066663412939815}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given 
pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}\nB: {\"rotation_angle\": 133.22970053001933, \"translation_dx\": 30.83867253278636, \"translation_dy\": 9.987607615316023, \"scale\": 0.9746642566652708}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": -110.51021822636605, \"translation_dx\": -17.924195571284486, \"translation_dy\": -0.10679752473519954, \"scale\": 1.4066663412939815}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_18_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_18_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}\nB: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nC: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nD: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", 
"context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}\nB: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nC: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nD: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_19_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_19_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 47.16467358014893, \"translation_dx\": -87.19318159487975, \"translation_dy\": -49.56686010575127, \"scale\": 1.2416587716965684}\nB: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}\nC: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nD: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": -35.71086816730676, \"scale\": 1.30648936857296}", "question": "Please compute the type and parameters of the spatial transformation between these 
two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 47.16467358014893, \"translation_dx\": -87.19318159487975, \"translation_dy\": -49.56686010575127, \"scale\": 1.2416587716965684}\nB: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}\nC: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nD: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": -35.71086816730676, \"scale\": 1.30648936857296}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_20_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_20_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, \"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nD: {\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}", "question": "Please compute the type and parameters of the spatial transformation 
between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, \"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nD: {\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_21_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_21_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -70.97525301082955, \"translation_dx\": -28.380848037876873, \"translation_dy\": 54.37723426674512, \"scale\": 0.9024922197892329}\nB: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}\nC: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nD: {\"rotation_angle\": 127.0599036632886, \"translation_dx\": -26.73103881794438, \"translation_dy\": 16.785326739741976, \"scale\": 1.1214331244941351}", "question": "Please compute the type and parameters of the spatial 
transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -70.97525301082955, \"translation_dx\": -28.380848037876873, \"translation_dy\": 54.37723426674512, \"scale\": 0.9024922197892329}\nB: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}\nC: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nD: {\"rotation_angle\": 127.0599036632886, \"translation_dx\": -26.73103881794438, \"translation_dy\": 16.785326739741976, \"scale\": 1.1214331244941351}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_22_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_22_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nB: {\"rotation_angle\": 168.86687879669455, \"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}\nC: {\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}\nD: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}", "question": "Please compute the type and parameters 
of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nB: {\"rotation_angle\": 168.86687879669455, \"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}\nC: {\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}\nD: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_23_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_23_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nB: {\"rotation_angle\": -115.34417090075787, \"translation_dx\": -118.63121430094503, \"translation_dy\": 41.63412082488844, \"scale\": 0.9001856788272352}\nC: {\"rotation_angle\": 159.18509857624855, \"translation_dx\": 94.5972413522399, \"translation_dy\": -87.01463724053234, \"scale\": 0.7914176569510836}\nD: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}", "question": "Please compute the type 
and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nB: {\"rotation_angle\": -115.34417090075787, \"translation_dx\": -118.63121430094503, \"translation_dy\": 41.63412082488844, \"scale\": 0.9001856788272352}\nC: {\"rotation_angle\": 159.18509857624855, \"translation_dx\": 94.5972413522399, \"translation_dy\": -87.01463724053234, \"scale\": 0.7914176569510836}\nD: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_24_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_24_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nB: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nC: {\"rotation_angle\": 28.728757892682808, \"translation_dx\": 12.065384659700086, \"translation_dy\": -119.64549643343977, \"scale\": 1.126100132224236}\nD: {\"rotation_angle\": 28.728757892682808, \"translation_dx\": 12.065384659700086, \"translation_dy\": -119.64549643343977, \"scale\": 1.126100132224236}", "question": "Please 
compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nB: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nC: {\"rotation_angle\": 28.728757892682808, \"translation_dx\": 12.065384659700086, \"translation_dy\": -119.64549643343977, \"scale\": 1.126100132224236}\nD: {\"rotation_angle\": 28.728757892682808, \"translation_dx\": 12.065384659700086, \"translation_dy\": -119.64549643343977, \"scale\": 1.126100132224236}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_25_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_25_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}\nB: {\"rotation_angle\": 134.22497079750707, \"translation_dx\": -56.33244292094708, \"translation_dy\": 12.15417280277697, \"scale\": 1.260404381889235}\nC: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}\nD: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}", "question": 
"Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}\nB: {\"rotation_angle\": 134.22497079750707, \"translation_dx\": -56.33244292094708, \"translation_dy\": 12.15417280277697, \"scale\": 1.260404381889235}\nC: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}\nD: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_26_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_26_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 51.652651058291696, \"translation_dx\": -79.60059266318888, \"translation_dy\": 40.24223939512936, \"scale\": 1.045377495061187}\nB: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}\nC: {\"rotation_angle\": 106.62912259997893, \"translation_dx\": -62.19399566166837, \"translation_dy\": -63.078041204745844, \"scale\": 1.4577244189370733}\nD: {\"rotation_angle\": -44.902472769484746, \"translation_dx\": -36.85475324083902, \"translation_dy\": 36.81692000181951, \"scale\": 1.0769710077370194}", 
"question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 51.652651058291696, \"translation_dx\": -79.60059266318888, \"translation_dy\": 40.24223939512936, \"scale\": 1.045377495061187}\nB: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}\nC: {\"rotation_angle\": 106.62912259997893, \"translation_dx\": -62.19399566166837, \"translation_dy\": -63.078041204745844, \"scale\": 1.4577244189370733}\nD: {\"rotation_angle\": -44.902472769484746, \"translation_dx\": -36.85475324083902, \"translation_dy\": 36.81692000181951, \"scale\": 1.0769710077370194}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_27_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_27_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 127.1396993936072, \"translation_dx\": -29.08894824101361, \"translation_dy\": -80.84475014775404, \"scale\": 1.2834497894588772}\nB: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nC: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}\nD: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 
1.2561938294527635}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 127.1396993936072, \"translation_dx\": -29.08894824101361, \"translation_dy\": -80.84475014775404, \"scale\": 1.2834497894588772}\nB: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nC: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}\nD: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 1.2561938294527635}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_28_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_28_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nB: {\"rotation_angle\": -37.8135886633452, \"translation_dx\": 94.09848811207868, \"translation_dy\": -28.846940165704815, \"scale\": 0.7423292461324351}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": -84.90425841207441, \"translation_dx\": -96.22975116611923, \"translation_dy\": -54.13037688992304, 
\"scale\": 1.161476925450186}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nB: {\"rotation_angle\": -37.8135886633452, \"translation_dx\": 94.09848811207868, \"translation_dy\": -28.846940165704815, \"scale\": 0.7423292461324351}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": -84.90425841207441, \"translation_dx\": -96.22975116611923, \"translation_dy\": -54.13037688992304, \"scale\": 1.161476925450186}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_29_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_29_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, \"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": 
-17.786253698900993, \"scale\": 1.4583062003808291}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, \"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": -17.786253698900993, \"scale\": 1.4583062003808291}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_30_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_30_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nB: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nC: {\"rotation_angle\": 47.16467358014893, \"translation_dx\": -87.19318159487975, \"translation_dy\": -49.56686010575127, \"scale\": 1.2416587716965684}\nD: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, 
\"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nB: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nC: {\"rotation_angle\": 47.16467358014893, \"translation_dx\": -87.19318159487975, \"translation_dy\": -49.56686010575127, \"scale\": 1.2416587716965684}\nD: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_31_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_31_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 160.04018122869564, \"translation_dx\": -10.031879581871024, \"translation_dy\": 74.10075881851205, \"scale\": 0.8976020445815951}\nB: {\"rotation_angle\": -131.1795029858263, \"translation_dx\": 17.908074544940433, \"translation_dy\": 120.17637833747304, \"scale\": 0.9471882483559888}\nC: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nD: {\"rotation_angle\": -117.26843352521382, \"translation_dx\": 
17.28573283600312, \"translation_dy\": -92.45781352854672, \"scale\": 1.478727361005855}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 160.04018122869564, \"translation_dx\": -10.031879581871024, \"translation_dy\": 74.10075881851205, \"scale\": 0.8976020445815951}\nB: {\"rotation_angle\": -131.1795029858263, \"translation_dx\": 17.908074544940433, \"translation_dy\": 120.17637833747304, \"scale\": 0.9471882483559888}\nC: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nD: {\"rotation_angle\": -117.26843352521382, \"translation_dx\": 17.28573283600312, \"translation_dy\": -92.45781352854672, \"scale\": 1.478727361005855}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_32_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_32_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 33.36657735274014, \"translation_dx\": -110.42271839281483, \"translation_dy\": 35.783043595963875, \"scale\": 1.1017945125321793}\nB: {\"rotation_angle\": -153.95647753312159, \"translation_dx\": 64.08546266437509, \"translation_dy\": -34.554486291313935, \"scale\": 1.423360690418288}\nC: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}\nD: {\"rotation_angle\": 127.0599036632886, 
\"translation_dx\": -26.73103881794438, \"translation_dy\": 16.785326739741976, \"scale\": 1.1214331244941351}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 33.36657735274014, \"translation_dx\": -110.42271839281483, \"translation_dy\": 35.783043595963875, \"scale\": 1.1017945125321793}\nB: {\"rotation_angle\": -153.95647753312159, \"translation_dx\": 64.08546266437509, \"translation_dy\": -34.554486291313935, \"scale\": 1.423360690418288}\nC: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}\nD: {\"rotation_angle\": 127.0599036632886, \"translation_dx\": -26.73103881794438, \"translation_dy\": 16.785326739741976, \"scale\": 1.1214331244941351}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_33_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_33_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nB: {\"rotation_angle\": -92.49508697379828, \"translation_dx\": 63.09853740086383, \"translation_dy\": 99.47995409556995, \"scale\": 0.9495145406508286}\nC: {\"rotation_angle\": 8.705969178532513, \"translation_dx\": -108.98578445869327, \"translation_dy\": -85.91179454441009, \"scale\": 0.5132717751865925}\nD: {\"rotation_angle\": 
72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nB: {\"rotation_angle\": -92.49508697379828, \"translation_dx\": 63.09853740086383, \"translation_dy\": 99.47995409556995, \"scale\": 0.9495145406508286}\nC: {\"rotation_angle\": 8.705969178532513, \"translation_dx\": -108.98578445869327, \"translation_dy\": -85.91179454441009, \"scale\": 0.5132717751865925}\nD: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_34_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_34_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nB: {\"rotation_angle\": 88.55038325147228, \"translation_dx\": -17.272344447388633, \"translation_dy\": -67.72549137992362, \"scale\": 0.5810098703790367}\nC: {\"rotation_angle\": 1.3693998936690264, \"translation_dx\": -71.94174431428723, \"translation_dy\": 25.661133958182248, \"scale\": 1.468813327861592}\nD: 
{\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nB: {\"rotation_angle\": 88.55038325147228, \"translation_dx\": -17.272344447388633, \"translation_dy\": -67.72549137992362, \"scale\": 0.5810098703790367}\nC: {\"rotation_angle\": 1.3693998936690264, \"translation_dx\": -71.94174431428723, \"translation_dy\": 25.661133958182248, \"scale\": 1.468813327861592}\nD: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_35_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_35_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -41.748048059314925, \"translation_dx\": 84.2495675740148, \"translation_dy\": -81.02778113177463, \"scale\": 1.207158201764622}\nB: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nC: {\"rotation_angle\": 52.0207999596704, \"translation_dx\": 62.052266940503074, \"translation_dy\": 15.318990484280505, \"scale\": 1.1445040102422772}\nD: 
{\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -41.748048059314925, \"translation_dx\": 84.2495675740148, \"translation_dy\": -81.02778113177463, \"scale\": 1.207158201764622}\nB: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nC: {\"rotation_angle\": 52.0207999596704, \"translation_dx\": 62.052266940503074, \"translation_dy\": 15.318990484280505, \"scale\": 1.1445040102422772}\nD: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_36_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_36_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 130.382151153576, \"translation_dx\": 48.77925626504499, \"translation_dy\": 54.89982459749416, \"scale\": 1.3647831130001666}\nB: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": -35.71086816730676, \"scale\": 1.30648936857296}\nC: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 
0.8747335302455691}\nD: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 130.382151153576, \"translation_dx\": 48.77925626504499, \"translation_dy\": 54.89982459749416, \"scale\": 1.3647831130001666}\nB: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": -35.71086816730676, \"scale\": 1.30648936857296}\nC: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}\nD: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_37_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_37_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nB: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nC: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, 
\"scale\": 1.302004904680502}\nD: {\"rotation_angle\": 178.3015459217881, \"translation_dx\": 2.1592483018484785, \"translation_dy\": -86.15095567396924, \"scale\": 1.206185814877298}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nB: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nC: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}\nD: {\"rotation_angle\": 178.3015459217881, \"translation_dx\": 2.1592483018484785, \"translation_dy\": -86.15095567396924, \"scale\": 1.206185814877298}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_38_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_38_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -81.11314702551758, \"translation_dx\": -115.5554336511824, \"translation_dy\": 81.04425747964075, \"scale\": 0.8604764063335847}\nB: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nC: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, 
\"scale\": 0.6972670428813228}\nD: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -81.11314702551758, \"translation_dx\": -115.5554336511824, \"translation_dy\": 81.04425747964075, \"scale\": 0.8604764063335847}\nB: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nC: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, \"scale\": 0.6972670428813228}\nD: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_39_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_39_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 1.3327657238113826}\nB: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}\nC: {\"rotation_angle\": 49.90656423603761, \"translation_dx\": 85.27067294320437, \"translation_dy\": 
-8.928665399863448, \"scale\": 0.9370060594249733}\nD: {\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 1.3327657238113826}\nB: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}\nC: {\"rotation_angle\": 49.90656423603761, \"translation_dx\": 85.27067294320437, \"translation_dy\": -8.928665399863448, \"scale\": 0.9370060594249733}\nD: {\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_40_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_40_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}\nB: {\"rotation_angle\": 52.0207999596704, \"translation_dx\": 62.052266940503074, \"translation_dy\": 15.318990484280505, \"scale\": 1.1445040102422772}\nC: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, 
\"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}\nD: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}\nB: {\"rotation_angle\": 52.0207999596704, \"translation_dx\": 62.052266940503074, \"translation_dy\": 15.318990484280505, \"scale\": 1.1445040102422772}\nC: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, \"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}\nD: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_41_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_41_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, \"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}\nB: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nC: {\"rotation_angle\": -61.308258156024195, \"translation_dx\": 
-92.42627707406731, \"translation_dy\": -21.076199203141364, \"scale\": 1.1133621977071444}\nD: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, \"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}\nB: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nC: {\"rotation_angle\": -61.308258156024195, \"translation_dx\": -92.42627707406731, \"translation_dy\": -21.076199203141364, \"scale\": 1.1133621977071444}\nD: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_42_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_42_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": -96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 1.3017377870800058}\nC: {\"rotation_angle\": -100.94596249363259, 
\"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nD: {\"rotation_angle\": -35.37165300247324, \"translation_dx\": -51.674784510203665, \"translation_dy\": 35.0550301640573, \"scale\": 1.181842779166554}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": -96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 1.3017377870800058}\nC: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nD: {\"rotation_angle\": -35.37165300247324, \"translation_dx\": -51.674784510203665, \"translation_dy\": 35.0550301640573, \"scale\": 1.181842779166554}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_43_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_43_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -103.5561502427767, \"translation_dx\": -75.76940431238745, \"translation_dy\": -48.3479107136017, \"scale\": 1.0522987713432983}\nB: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}\nC: {\"rotation_angle\": 
112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}\nD: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -103.5561502427767, \"translation_dx\": -75.76940431238745, \"translation_dy\": -48.3479107136017, \"scale\": 1.0522987713432983}\nB: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}\nC: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}\nD: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_44_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_44_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}\nB: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nC: 
{\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nD: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}\nB: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nC: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nD: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_45_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_45_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}\nB: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 
0.6414776081953896}\nC: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nD: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}\nB: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nC: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nD: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_46_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_46_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nB: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": -96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 
1.3017377870800058}\nC: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}\nD: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nB: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": -96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 1.3017377870800058}\nC: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}\nD: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_47_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_47_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": 162.6656255846617, \"translation_dx\": -24.713919503645087, \"translation_dy\": -0.6846177496217649, 
\"scale\": 0.967192316827237}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": 162.6656255846617, \"translation_dx\": -24.713919503645087, \"translation_dy\": -0.6846177496217649, \"scale\": 0.967192316827237}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_48_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_48_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": 49.90656423603761, \"translation_dx\": 85.27067294320437, \"translation_dy\": 
-8.928665399863448, \"scale\": 0.9370060594249733}\nC: {\"rotation_angle\": -41.748048059314925, \"translation_dx\": 84.2495675740148, \"translation_dy\": -81.02778113177463, \"scale\": 1.207158201764622}\nD: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": 49.90656423603761, \"translation_dx\": 85.27067294320437, \"translation_dy\": -8.928665399863448, \"scale\": 0.9370060594249733}\nC: {\"rotation_angle\": -41.748048059314925, \"translation_dx\": 84.2495675740148, \"translation_dy\": -81.02778113177463, \"scale\": 1.207158201764622}\nD: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_49_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_49_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nB: {\"rotation_angle\": -44.902472769484746, \"translation_dx\": -36.85475324083902, 
\"translation_dy\": 36.81692000181951, \"scale\": 1.0769710077370194}\nC: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nD: {\"rotation_angle\": 67.74863170033868, \"translation_dx\": 0.9436916559104702, \"translation_dy\": 79.02717939495389, \"scale\": 1.0490112177140545}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nB: {\"rotation_angle\": -44.902472769484746, \"translation_dx\": -36.85475324083902, \"translation_dy\": 36.81692000181951, \"scale\": 1.0769710077370194}\nC: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nD: {\"rotation_angle\": 67.74863170033868, \"translation_dx\": 0.9436916559104702, \"translation_dy\": 79.02717939495389, \"scale\": 1.0490112177140545}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_50_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_50_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": 
-15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}\nD: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}\nD: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_51_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_51_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nB: {\"rotation_angle\": 168.86687879669455, 
\"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}\nC: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}\nD: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nB: {\"rotation_angle\": 168.86687879669455, \"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}\nC: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}\nD: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_52_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_52_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nB: {\"rotation_angle\": 
52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}\nC: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nD: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": -103.12901971602221, \"translation_dy\": 94.89161684072867, \"scale\": 1.2295411735859756}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nB: {\"rotation_angle\": 52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}\nC: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nD: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": -103.12901971602221, \"translation_dy\": 94.89161684072867, \"scale\": 1.2295411735859756}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_53_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_53_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 159.18509857624855, \"translation_dx\": 94.5972413522399, \"translation_dy\": -87.01463724053234, \"scale\": 0.7914176569510836}\nB: 
{\"rotation_angle\": 67.74863170033868, \"translation_dx\": 0.9436916559104702, \"translation_dy\": 79.02717939495389, \"scale\": 1.0490112177140545}\nC: {\"rotation_angle\": 161.7596265938729, \"translation_dx\": -9.170216354863072, \"translation_dy\": -19.23222492696047, \"scale\": 1.1821087248622173}\nD: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": -17.359661719977098, \"scale\": 1.0858344969275349}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.18509857624855, \"translation_dx\": 94.5972413522399, \"translation_dy\": -87.01463724053234, \"scale\": 0.7914176569510836}\nB: {\"rotation_angle\": 67.74863170033868, \"translation_dx\": 0.9436916559104702, \"translation_dy\": 79.02717939495389, \"scale\": 1.0490112177140545}\nC: {\"rotation_angle\": 161.7596265938729, \"translation_dx\": -9.170216354863072, \"translation_dy\": -19.23222492696047, \"scale\": 1.1821087248622173}\nD: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": -17.359661719977098, \"scale\": 1.0858344969275349}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_54_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_54_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 
0.569575564082204}\nB: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nC: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}\nD: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}\nB: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nC: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}\nD: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_55_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_55_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": 
-25.496381966922478, \"scale\": 0.7479348241153333}\nB: {\"rotation_angle\": -110.51021822636605, \"translation_dx\": -17.924195571284486, \"translation_dy\": -0.10679752473519954, \"scale\": 1.4066663412939815}\nC: {\"rotation_angle\": 12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nD: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}\nB: {\"rotation_angle\": -110.51021822636605, \"translation_dx\": -17.924195571284486, \"translation_dy\": -0.10679752473519954, \"scale\": 1.4066663412939815}\nC: {\"rotation_angle\": 12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nD: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_56_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_56_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -6.258618837806779, \"translation_dx\": 
-117.56200624611057, \"translation_dy\": -84.92852320396813, \"scale\": 0.8703619649920769}\nB: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nC: {\"rotation_angle\": -6.38420562293993, \"translation_dx\": -106.80670691302902, \"translation_dy\": -3.5935098985529663, \"scale\": 1.3037846299861797}\nD: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -6.258618837806779, \"translation_dx\": -117.56200624611057, \"translation_dy\": -84.92852320396813, \"scale\": 0.8703619649920769}\nB: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nC: {\"rotation_angle\": -6.38420562293993, \"translation_dx\": -106.80670691302902, \"translation_dy\": -3.5935098985529663, \"scale\": 1.3037846299861797}\nD: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_57_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_57_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 28.186459007199005, 
\"translation_dx\": -85.64869298892413, \"translation_dy\": -90.9589081114641, \"scale\": 0.5939510579225048}\nB: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": -35.71086816730676, \"scale\": 1.30648936857296}\nC: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nD: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": -44.645046657688624, \"scale\": 1.4332006009229632}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 28.186459007199005, \"translation_dx\": -85.64869298892413, \"translation_dy\": -90.9589081114641, \"scale\": 0.5939510579225048}\nB: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": -35.71086816730676, \"scale\": 1.30648936857296}\nC: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nD: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": -44.645046657688624, \"scale\": 1.4332006009229632}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_58_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_58_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 
99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}\nB: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}\nC: {\"rotation_angle\": -37.8135886633452, \"translation_dx\": 94.09848811207868, \"translation_dy\": -28.846940165704815, \"scale\": 0.7423292461324351}\nD: {\"rotation_angle\": -120.90208363304777, \"translation_dx\": -24.471100960859047, \"translation_dy\": -96.60346561133943, \"scale\": 1.2238954631080248}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}\nB: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}\nC: {\"rotation_angle\": -37.8135886633452, \"translation_dx\": 94.09848811207868, \"translation_dy\": -28.846940165704815, \"scale\": 0.7423292461324351}\nD: {\"rotation_angle\": -120.90208363304777, \"translation_dx\": -24.471100960859047, \"translation_dy\": -96.60346561133943, \"scale\": 1.2238954631080248}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_59_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_59_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: 
{\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nB: {\"rotation_angle\": 74.4727172984789, \"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, \"scale\": 1.4775593630739356}\nC: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nD: {\"rotation_angle\": 134.66606893121838, \"translation_dx\": 30.71289427748178, \"translation_dy\": 31.00111281943242, \"scale\": 0.9716368665085688}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nB: {\"rotation_angle\": 74.4727172984789, \"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, \"scale\": 1.4775593630739356}\nC: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nD: {\"rotation_angle\": 134.66606893121838, \"translation_dx\": 30.71289427748178, \"translation_dy\": 31.00111281943242, \"scale\": 0.9716368665085688}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_60_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_60_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", 
"options": "A: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nB: {\"rotation_angle\": 52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}\nC: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nD: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nB: {\"rotation_angle\": 52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}\nC: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nD: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_61_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_61_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": 
"COCO_spatial", "options": "A: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nB: {\"rotation_angle\": -26.00307697103628, \"translation_dx\": -100.91027332279833, \"translation_dy\": 27.120302875093685, \"scale\": 0.9546103505495939}\nC: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": -44.645046657688624, \"scale\": 1.4332006009229632}\nD: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": -94.68626204020723, \"scale\": 1.377433782383828}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nB: {\"rotation_angle\": -26.00307697103628, \"translation_dx\": -100.91027332279833, \"translation_dy\": 27.120302875093685, \"scale\": 0.9546103505495939}\nC: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": -44.645046657688624, \"scale\": 1.4332006009229632}\nD: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": -94.68626204020723, \"scale\": 1.377433782383828}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_62_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_62_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural 
image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -37.8135886633452, \"translation_dx\": 94.09848811207868, \"translation_dy\": -28.846940165704815, \"scale\": 0.7423292461324351}\nB: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}\nC: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, \"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}\nD: {\"rotation_angle\": 4.601729825002167, \"translation_dx\": -92.34842360064926, \"translation_dy\": 78.34726427877602, \"scale\": 0.7620115680057987}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -37.8135886633452, \"translation_dx\": 94.09848811207868, \"translation_dy\": -28.846940165704815, \"scale\": 0.7423292461324351}\nB: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}\nC: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, \"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}\nD: {\"rotation_angle\": 4.601729825002167, \"translation_dx\": -92.34842360064926, \"translation_dy\": 78.34726427877602, \"scale\": 0.7620115680057987}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_63_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_63_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": 
"natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, \"translation_dy\": -32.06450758946801, \"scale\": 1.1967157159259998}\nB: {\"rotation_angle\": 48.71833122181758, \"translation_dx\": -105.22683210092106, \"translation_dy\": -63.34096559919908, \"scale\": 0.7204478932238769}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": -149.42147215379055, \"translation_dx\": 2.3444194857030283, \"translation_dy\": 35.92779325530762, \"scale\": 1.0223945055206394}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, \"translation_dy\": -32.06450758946801, \"scale\": 1.1967157159259998}\nB: {\"rotation_angle\": 48.71833122181758, \"translation_dx\": -105.22683210092106, \"translation_dy\": -63.34096559919908, \"scale\": 0.7204478932238769}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": -149.42147215379055, \"translation_dx\": 2.3444194857030283, \"translation_dy\": 35.92779325530762, \"scale\": 1.0223945055206394}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_64_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_64_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", 
"visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}\nB: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, \"translation_dy\": 24.014319900588845, \"scale\": 1.3204557483507742}\nC: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}\nD: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. 
\nSelect from the following choices.\nA: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}\nB: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, \"translation_dy\": 24.014319900588845, \"scale\": 1.3204557483507742}\nC: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}\nD: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_65_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_65_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nC: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": -44.645046657688624, \"scale\": 1.4332006009229632}\nD: {\"rotation_angle\": 52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that 
occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nC: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": -44.645046657688624, \"scale\": 1.4332006009229632}\nD: {\"rotation_angle\": 52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_66_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_66_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 23.955007488404988, \"translation_dx\": 90.0018582930472, \"translation_dy\": 38.03553582875617, \"scale\": 1.3380437802347522}\nB: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nC: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nD: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation 
that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 23.955007488404988, \"translation_dx\": 90.0018582930472, \"translation_dy\": 38.03553582875617, \"scale\": 1.3380437802347522}\nB: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nC: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nD: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_67_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_67_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": 49.896013394485834, \"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nD: {\"rotation_angle\": -110.46391589612124, \"translation_dx\": -77.96644542647721, \"translation_dy\": -50.23500265461973, \"scale\": 0.7651088884143488}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the 
transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": 49.896013394485834, \"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nD: {\"rotation_angle\": -110.46391589612124, \"translation_dx\": -77.96644542647721, \"translation_dy\": -50.23500265461973, \"scale\": 0.7651088884143488}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_68_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_68_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -153.95647753312159, \"translation_dx\": 64.08546266437509, \"translation_dy\": -34.554486291313935, \"scale\": 1.423360690418288}\nC: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}\nD: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 11.25207260435026, \"scale\": 1.2682750116992958}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and 
magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -153.95647753312159, \"translation_dx\": 64.08546266437509, \"translation_dy\": -34.554486291313935, \"scale\": 1.423360690418288}\nC: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}\nD: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 11.25207260435026, \"scale\": 1.2682750116992958}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_69_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_69_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}\nB: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict 
the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}\nB: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_70_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_70_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nB: {\"rotation_angle\": 115.44035395260755, \"translation_dx\": 104.38539690843712, \"translation_dy\": -82.71757148170198, \"scale\": 0.6534862534786243}\nC: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nD: {\"rotation_angle\": 130.382151153576, \"translation_dx\": 48.77925626504499, \"translation_dy\": 54.89982459749416, \"scale\": 1.3647831130001666}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is 
to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nB: {\"rotation_angle\": 115.44035395260755, \"translation_dx\": 104.38539690843712, \"translation_dy\": -82.71757148170198, \"scale\": 0.6534862534786243}\nC: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nD: {\"rotation_angle\": 130.382151153576, \"translation_dx\": 48.77925626504499, \"translation_dy\": 54.89982459749416, \"scale\": 1.3647831130001666}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_71_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_71_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}\nB: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 1.2561938294527635}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your 
task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}\nB: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 1.2561938294527635}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_72_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_72_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nB: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 11.25207260435026, \"scale\": 1.2682750116992958}\nC: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nD: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, 
translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nB: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 11.25207260435026, \"scale\": 1.2682750116992958}\nC: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nD: {\"rotation_angle\": -70.18179574394556, \"translation_dx\": -84.02989442213027, \"translation_dy\": 45.46342410564398, \"scale\": 1.28660403831869}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_73_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_73_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}\nB: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}\nC: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nD: {\"rotation_angle\": -120.90208363304777, \"translation_dx\": -24.471100960859047, \"translation_dy\": -96.60346561133943, \"scale\": 1.2238954631080248}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., 
rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}\nB: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}\nC: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nD: {\"rotation_angle\": -120.90208363304777, \"translation_dx\": -24.471100960859047, \"translation_dy\": -96.60346561133943, \"scale\": 1.2238954631080248}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_74_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_74_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 88.55038325147228, \"translation_dx\": -17.272344447388633, \"translation_dy\": -67.72549137992362, \"scale\": 0.5810098703790367}\nB: {\"rotation_angle\": 127.1396993936072, \"translation_dx\": -29.08894824101361, \"translation_dy\": -80.84475014775404, \"scale\": 1.2834497894588772}\nC: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nD: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial 
transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 88.55038325147228, \"translation_dx\": -17.272344447388633, \"translation_dy\": -67.72549137992362, \"scale\": 0.5810098703790367}\nB: {\"rotation_angle\": 127.1396993936072, \"translation_dx\": -29.08894824101361, \"translation_dy\": -80.84475014775404, \"scale\": 1.2834497894588772}\nC: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nD: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_75_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_75_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nB: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nC: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}\nD: {\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after 
a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nB: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nC: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}\nD: {\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_76_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_76_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}\nB: {\"rotation_angle\": 74.4727172984789, \"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, \"scale\": 1.4775593630739356}\nC: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}\nD: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before 
and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}\nB: {\"rotation_angle\": 74.4727172984789, \"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, \"scale\": 1.4775593630739356}\nC: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}\nD: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_77_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_77_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 28.186459007199005, \"translation_dx\": -85.64869298892413, \"translation_dy\": -90.9589081114641, \"scale\": 0.5939510579225048}\nB: {\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}\nC: {\"rotation_angle\": -46.75272698463425, \"translation_dx\": 16.424107524155175, \"translation_dy\": -60.683488552754085, \"scale\": 1.375025476214386}\nD: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting 
scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 28.186459007199005, \"translation_dx\": -85.64869298892413, \"translation_dy\": -90.9589081114641, \"scale\": 0.5939510579225048}\nB: {\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}\nC: {\"rotation_angle\": -46.75272698463425, \"translation_dx\": 16.424107524155175, \"translation_dy\": -60.683488552754085, \"scale\": 1.375025476214386}\nD: {\"rotation_angle\": 141.74747753602782, \"translation_dx\": -54.793360600935046, \"translation_dy\": -29.72546528603263, \"scale\": 0.6563706152769926}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_78_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_78_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -174.94064668132228, \"translation_dx\": 73.73079207136513, \"translation_dy\": 58.25534486945551, \"scale\": 1.178357936048121}\nB: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}\nC: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}\nD: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of 
images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -174.94064668132228, \"translation_dx\": 73.73079207136513, \"translation_dy\": 58.25534486945551, \"scale\": 1.178357936048121}\nB: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}\nC: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}\nD: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_79_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_79_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nB: {\"rotation_angle\": 95.56102360167273, \"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}\nC: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nD: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": 
"Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}\nB: {\"rotation_angle\": 95.56102360167273, \"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}\nC: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nD: {\"rotation_angle\": 112.15713698429767, \"translation_dx\": -0.833180316164956, \"translation_dy\": -100.57740000976534, \"scale\": 1.21487245494624}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_80_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_80_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nC: {\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}\nD: {\"rotation_angle\": -120.90208363304777, \"translation_dx\": -24.471100960859047, \"translation_dy\": -96.60346561133943, \"scale\": 1.2238954631080248}", "question": "Please compute the type and parameters of the spatial transformation between these two 
images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nC: {\"rotation_angle\": -126.23248080179604, \"translation_dx\": -18.04313623288388, \"translation_dy\": 59.052880720386156, \"scale\": 1.3827835175940266}\nD: {\"rotation_angle\": -120.90208363304777, \"translation_dx\": -24.471100960859047, \"translation_dy\": -96.60346561133943, \"scale\": 1.2238954631080248}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_81_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_81_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nB: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, \"translation_dy\": -32.06450758946801, \"scale\": 1.1967157159259998}\nC: {\"rotation_angle\": 134.66606893121838, \"translation_dx\": 30.71289427748178, \"translation_dy\": 31.00111281943242, \"scale\": 0.9716368665085688}\nD: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}", "question": "Please compute the type and parameters of the spatial transformation 
between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nB: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, \"translation_dy\": -32.06450758946801, \"scale\": 1.1967157159259998}\nC: {\"rotation_angle\": 134.66606893121838, \"translation_dx\": 30.71289427748178, \"translation_dy\": 31.00111281943242, \"scale\": 0.9716368665085688}\nD: {\"rotation_angle\": 32.170058088704565, \"translation_dx\": 62.48780444449932, \"translation_dy\": 36.464458087386475, \"scale\": 0.8338243238440678}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_82_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_82_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: {\"rotation_angle\": 178.3015459217881, \"translation_dx\": 2.1592483018484785, \"translation_dy\": -86.15095567396924, \"scale\": 1.206185814877298}\nC: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nD: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}", "question": "Please compute the type and parameters of the spatial 
transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: {\"rotation_angle\": 178.3015459217881, \"translation_dx\": 2.1592483018484785, \"translation_dy\": -86.15095567396924, \"scale\": 1.206185814877298}\nC: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nD: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_83_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_83_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nB: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}\nC: {\"rotation_angle\": -4.364889011784271, \"translation_dx\": 74.89385338851659, \"translation_dy\": 29.259521498010997, \"scale\": 1.2877948451877137}\nD: {\"rotation_angle\": 106.62912259997893, \"translation_dx\": -62.19399566166837, \"translation_dy\": -63.078041204745844, \"scale\": 1.4577244189370733}", "question": "Please compute the type and parameters of the 
spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nB: {\"rotation_angle\": 98.12478073081388, \"translation_dx\": 82.24255679101596, \"translation_dy\": 10.638794739410258, \"scale\": 1.454613875934863}\nC: {\"rotation_angle\": -4.364889011784271, \"translation_dx\": 74.89385338851659, \"translation_dy\": 29.259521498010997, \"scale\": 1.2877948451877137}\nD: {\"rotation_angle\": 106.62912259997893, \"translation_dx\": -62.19399566166837, \"translation_dy\": -63.078041204745844, \"scale\": 1.4577244189370733}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_84_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_84_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nB: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}\nC: {\"rotation_angle\": 48.71833122181758, \"translation_dx\": -105.22683210092106, \"translation_dy\": -63.34096559919908, \"scale\": 0.7204478932238769}\nD: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}", "question": "Please compute the type and parameters 
of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nB: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}\nC: {\"rotation_angle\": 48.71833122181758, \"translation_dx\": -105.22683210092106, \"translation_dy\": -63.34096559919908, \"scale\": 0.7204478932238769}\nD: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_85_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_85_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nB: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}\nC: {\"rotation_angle\": -0.45613579718829556, \"translation_dx\": 98.71619714866841, \"translation_dy\": 70.1100439641223, \"scale\": 0.6491919010173006}\nD: {\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}", "question": "Please compute the type and 
parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nB: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}\nC: {\"rotation_angle\": -0.45613579718829556, \"translation_dx\": 98.71619714866841, \"translation_dy\": 70.1100439641223, \"scale\": 0.6491919010173006}\nD: {\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_86_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_86_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -32.96407209098831, \"translation_dx\": -27.518946535455143, \"translation_dy\": 2.5370159689679213, \"scale\": 1.259328459428434}\nB: {\"rotation_angle\": -50.19218790392131, \"translation_dx\": -27.31734251737683, \"translation_dy\": 8.514724344494553, \"scale\": 1.0874517053433594}\nC: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}\nD: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}", "question": "Please compute 
the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -32.96407209098831, \"translation_dx\": -27.518946535455143, \"translation_dy\": 2.5370159689679213, \"scale\": 1.259328459428434}\nB: {\"rotation_angle\": -50.19218790392131, \"translation_dx\": -27.31734251737683, \"translation_dy\": 8.514724344494553, \"scale\": 1.0874517053433594}\nC: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}\nD: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_87_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_87_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}\nB: {\"rotation_angle\": -137.69110011960493, \"translation_dx\": -11.76155657697187, \"translation_dy\": 15.916526895382503, \"scale\": 1.164396221339579}\nC: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": -17.359661719977098, \"scale\": 1.0858344969275349}\nD: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}", "question": 
"Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}\nB: {\"rotation_angle\": -137.69110011960493, \"translation_dx\": -11.76155657697187, \"translation_dy\": 15.916526895382503, \"scale\": 1.164396221339579}\nC: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": -17.359661719977098, \"scale\": 1.0858344969275349}\nD: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_88_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_88_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nB: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nC: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": -96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 1.3017377870800058}\nD: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 
0.6970825252863025}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nB: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nC: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": -96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 1.3017377870800058}\nD: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_89_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_89_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -163.07830945514343, \"translation_dx\": 107.25371607945826, \"translation_dy\": 44.19319462200147, \"scale\": 1.0330497674624493}\nB: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nC: {\"rotation_angle\": 179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nD: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 
66.07065092305665, \"scale\": 1.42144068359999}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -163.07830945514343, \"translation_dx\": 107.25371607945826, \"translation_dy\": 44.19319462200147, \"scale\": 1.0330497674624493}\nB: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nC: {\"rotation_angle\": 179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nD: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_90_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_90_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}\nB: {\"rotation_angle\": 171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}\nC: {\"rotation_angle\": 159.18509857624855, \"translation_dx\": 94.5972413522399, \"translation_dy\": -87.01463724053234, \"scale\": 0.7914176569510836}\nD: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, 
\"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}\nB: {\"rotation_angle\": 171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}\nC: {\"rotation_angle\": 159.18509857624855, \"translation_dx\": 94.5972413522399, \"translation_dy\": -87.01463724053234, \"scale\": 0.7914176569510836}\nD: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_91_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_91_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -137.69110011960493, \"translation_dx\": -11.76155657697187, \"translation_dy\": 15.916526895382503, \"scale\": 1.164396221339579}\nB: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, \"scale\": 0.6972670428813228}\nC: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}\nD: {\"rotation_angle\": 32.25033099080062, \"translation_dx\": 
-33.246475706714875, \"translation_dy\": -9.848772328845214, \"scale\": 0.986502265576198}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -137.69110011960493, \"translation_dx\": -11.76155657697187, \"translation_dy\": 15.916526895382503, \"scale\": 1.164396221339579}\nB: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, \"scale\": 0.6972670428813228}\nC: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}\nD: {\"rotation_angle\": 32.25033099080062, \"translation_dx\": -33.246475706714875, \"translation_dy\": -9.848772328845214, \"scale\": 0.986502265576198}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_92_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_92_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 11.25207260435026, \"scale\": 1.2682750116992958}\nC: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, \"scale\": 0.6972670428813228}\nD: {\"rotation_angle\": 99.38174871704592, 
\"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 11.25207260435026, \"scale\": 1.2682750116992958}\nC: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, \"scale\": 0.6972670428813228}\nD: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_93_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_93_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 161.7596265938729, \"translation_dx\": -9.170216354863072, \"translation_dy\": -19.23222492696047, \"scale\": 1.1821087248622173}\nB: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, \"translation_dy\": 24.014319900588845, \"scale\": 1.3204557483507742}\nC: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}\nD: {\"rotation_angle\": 
-132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 161.7596265938729, \"translation_dx\": -9.170216354863072, \"translation_dy\": -19.23222492696047, \"scale\": 1.1821087248622173}\nB: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, \"translation_dy\": 24.014319900588845, \"scale\": 1.3204557483507742}\nC: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}\nD: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_94_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_94_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}\nB: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nC: {\"rotation_angle\": 36.19361803007027, \"translation_dx\": -50.40071399889004, \"translation_dy\": -85.39533040467117, \"scale\": 0.6522247071940848}\nD: 
{\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}\nB: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nC: {\"rotation_angle\": 36.19361803007027, \"translation_dx\": -50.40071399889004, \"translation_dy\": -85.39533040467117, \"scale\": 0.6522247071940848}\nD: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_95_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_95_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nC: {\"rotation_angle\": -173.49565975712173, \"translation_dx\": 30.5303454517925, \"translation_dy\": 77.86216107455405, \"scale\": 
1.067173806992701}\nD: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nC: {\"rotation_angle\": -173.49565975712173, \"translation_dx\": 30.5303454517925, \"translation_dy\": 77.86216107455405, \"scale\": 1.067173806992701}\nD: {\"rotation_angle\": 78.52234880801677, \"translation_dx\": -41.05806913924104, \"translation_dy\": -5.158893155372851, \"scale\": 1.0182841116233097}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_96_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_96_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -50.19218790392131, \"translation_dx\": -27.31734251737683, \"translation_dy\": 8.514724344494553, \"scale\": 1.0874517053433594}\nB: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}\nC: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, 
\"scale\": 0.6972670428813228}\nD: {\"rotation_angle\": -115.34417090075787, \"translation_dx\": -118.63121430094503, \"translation_dy\": 41.63412082488844, \"scale\": 0.9001856788272352}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -50.19218790392131, \"translation_dx\": -27.31734251737683, \"translation_dy\": 8.514724344494553, \"scale\": 1.0874517053433594}\nB: {\"rotation_angle\": 33.426384392539006, \"translation_dx\": -12.448609293998487, \"translation_dy\": 64.03367069956386, \"scale\": 0.6340926377236346}\nC: {\"rotation_angle\": 22.924180775031914, \"translation_dx\": 8.278066534063711, \"translation_dy\": 39.03722404706397, \"scale\": 0.6972670428813228}\nD: {\"rotation_angle\": -115.34417090075787, \"translation_dx\": -118.63121430094503, \"translation_dy\": 41.63412082488844, \"scale\": 0.9001856788272352}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_97_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_97_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": -103.12901971602221, \"translation_dy\": 94.89161684072867, \"scale\": 1.2295411735859756}\nB: {\"rotation_angle\": -164.42105085554024, \"translation_dx\": 53.959081038248144, \"translation_dy\": -27.892450679654182, \"scale\": 1.1369631742880046}\nC: {\"rotation_angle\": 103.56580652114087, \"translation_dx\": -76.88940345297716, \"translation_dy\": 
-3.4544443607121593, \"scale\": 1.3949152683659345}\nD: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": -103.12901971602221, \"translation_dy\": 94.89161684072867, \"scale\": 1.2295411735859756}\nB: {\"rotation_angle\": -164.42105085554024, \"translation_dx\": 53.959081038248144, \"translation_dy\": -27.892450679654182, \"scale\": 1.1369631742880046}\nC: {\"rotation_angle\": 103.56580652114087, \"translation_dx\": -76.88940345297716, \"translation_dy\": -3.4544443607121593, \"scale\": 1.3949152683659345}\nD: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_98_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_98_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}\nB: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, \"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nC: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, 
\"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nD: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}\nB: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, \"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nC: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nD: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_99_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_99_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nB: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nC: {\"rotation_angle\": -124.27587082376021, \"translation_dx\": 
-88.19288051455345, \"translation_dy\": 24.145134775980125, \"scale\": 1.4414104211047083}\nD: {\"rotation_angle\": 8.705969178532513, \"translation_dx\": -108.98578445869327, \"translation_dy\": -85.91179454441009, \"scale\": 0.5132717751865925}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nB: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nC: {\"rotation_angle\": -124.27587082376021, \"translation_dx\": -88.19288051455345, \"translation_dy\": 24.145134775980125, \"scale\": 1.4414104211047083}\nD: {\"rotation_angle\": 8.705969178532513, \"translation_dx\": -108.98578445869327, \"translation_dy\": -85.91179454441009, \"scale\": 0.5132717751865925}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_100_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_100_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}\nC: {\"rotation_angle\": 123.61853421760617, 
\"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nD: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}\nB: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}\nC: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nD: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_101_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_101_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nB: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nC: {\"rotation_angle\": 
95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}\nD: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nB: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nC: {\"rotation_angle\": 95.69634927891752, \"translation_dx\": -96.46148729426875, \"translation_dy\": -25.496381966922478, \"scale\": 0.7479348241153333}\nD: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_102_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_102_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nB: {\"rotation_angle\": -153.3687774434925, \"translation_dx\": 50.92336593606055, \"translation_dy\": -56.81603844715568, \"scale\": 1.398231264497651}\nC: 
{\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}\nD: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nB: {\"rotation_angle\": -153.3687774434925, \"translation_dx\": 50.92336593606055, \"translation_dy\": -56.81603844715568, \"scale\": 1.398231264497651}\nC: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}\nD: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_103_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_103_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 
0.7934334422653395}\nC: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nD: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 0.7934334422653395}\nC: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nD: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_104_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_104_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nB: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, 
\"scale\": 1.0041476956174793}\nC: {\"rotation_angle\": 171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}\nD: {\"rotation_angle\": 1.3693998936690264, \"translation_dx\": -71.94174431428723, \"translation_dy\": 25.661133958182248, \"scale\": 1.468813327861592}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nB: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, \"scale\": 1.0041476956174793}\nC: {\"rotation_angle\": 171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}\nD: {\"rotation_angle\": 1.3693998936690264, \"translation_dx\": -71.94174431428723, \"translation_dy\": 25.661133958182248, \"scale\": 1.468813327861592}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_105_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_105_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 
48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": 33.36657735274014, \"translation_dx\": -110.42271839281483, \"translation_dy\": 35.783043595963875, \"scale\": 1.1017945125321793}\nD: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": 33.36657735274014, \"translation_dx\": -110.42271839281483, \"translation_dy\": 35.783043595963875, \"scale\": 1.1017945125321793}\nD: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_106_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_106_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nB: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, 
\"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nC: {\"rotation_angle\": -164.42105085554024, \"translation_dx\": 53.959081038248144, \"translation_dy\": -27.892450679654182, \"scale\": 1.1369631742880046}\nD: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nB: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nC: {\"rotation_angle\": -164.42105085554024, \"translation_dx\": 53.959081038248144, \"translation_dy\": -27.892450679654182, \"scale\": 1.1369631742880046}\nD: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_107_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_107_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -164.42105085554024, \"translation_dx\": 53.959081038248144, \"translation_dy\": -27.892450679654182, \"scale\": 1.1369631742880046}\nB: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 
59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}\nC: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nD: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -164.42105085554024, \"translation_dx\": 53.959081038248144, \"translation_dy\": -27.892450679654182, \"scale\": 1.1369631742880046}\nB: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}\nC: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nD: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_108_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_108_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nB: {\"rotation_angle\": -4.364889011784271, 
\"translation_dx\": 74.89385338851659, \"translation_dy\": 29.259521498010997, \"scale\": 1.2877948451877137}\nC: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nD: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nB: {\"rotation_angle\": -4.364889011784271, \"translation_dx\": 74.89385338851659, \"translation_dy\": 29.259521498010997, \"scale\": 1.2877948451877137}\nC: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nD: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_109_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_109_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 49.896013394485834, \"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nB: {\"rotation_angle\": 
133.22970053001933, \"translation_dx\": 30.83867253278636, \"translation_dy\": 9.987607615316023, \"scale\": 0.9746642566652708}\nC: {\"rotation_angle\": 36.19361803007027, \"translation_dx\": -50.40071399889004, \"translation_dy\": -85.39533040467117, \"scale\": 0.6522247071940848}\nD: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 49.896013394485834, \"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nB: {\"rotation_angle\": 133.22970053001933, \"translation_dx\": 30.83867253278636, \"translation_dy\": 9.987607615316023, \"scale\": 0.9746642566652708}\nC: {\"rotation_angle\": 36.19361803007027, \"translation_dx\": -50.40071399889004, \"translation_dy\": -85.39533040467117, \"scale\": 0.6522247071940848}\nD: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_110_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_110_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nB: 
{\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nB: {\"rotation_angle\": -72.82027143369304, \"translation_dx\": -44.85481158127062, \"translation_dy\": 106.69131407191517, \"scale\": 0.716080341101258}\nC: {\"rotation_angle\": -113.69332067912192, \"translation_dx\": -23.005200251858383, \"translation_dy\": 57.916315250854666, \"scale\": 0.5483419258047426}\nD: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_111_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_111_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 
0.6647290774480178}\nB: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nC: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": -128.74497971799806, \"translation_dx\": -55.835206426128764, \"translation_dy\": 54.178252983369276, \"scale\": 0.8905979693160588}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.39197876032466, \"translation_dx\": -101.87275621292875, \"translation_dy\": -32.606176111808466, \"scale\": 0.6647290774480178}\nB: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nC: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": -128.74497971799806, \"translation_dx\": -55.835206426128764, \"translation_dy\": 54.178252983369276, \"scale\": 0.8905979693160588}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_112_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_112_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 127.0599036632886, \"translation_dx\": -26.73103881794438, \"translation_dy\": 
16.785326739741976, \"scale\": 1.1214331244941351}\nB: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nC: {\"rotation_angle\": -174.94064668132228, \"translation_dx\": 73.73079207136513, \"translation_dy\": 58.25534486945551, \"scale\": 1.178357936048121}\nD: {\"rotation_angle\": 168.86687879669455, \"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 127.0599036632886, \"translation_dx\": -26.73103881794438, \"translation_dy\": 16.785326739741976, \"scale\": 1.1214331244941351}\nB: {\"rotation_angle\": -137.69315675508605, \"translation_dx\": -14.965017175186233, \"translation_dy\": 28.85856493302694, \"scale\": 0.6970825252863025}\nC: {\"rotation_angle\": -174.94064668132228, \"translation_dx\": 73.73079207136513, \"translation_dy\": 58.25534486945551, \"scale\": 1.178357936048121}\nD: {\"rotation_angle\": 168.86687879669455, \"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_113_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_113_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, 
\"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nB: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nC: {\"rotation_angle\": -83.37935946961306, \"translation_dx\": -63.440112200681114, \"translation_dy\": -47.62616010479583, \"scale\": 0.6518247509991958}\nD: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nB: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nC: {\"rotation_angle\": -83.37935946961306, \"translation_dx\": -63.440112200681114, \"translation_dy\": -47.62616010479583, \"scale\": 0.6518247509991958}\nD: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_114_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_114_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": 
-103.12901971602221, \"translation_dy\": 94.89161684072867, \"scale\": 1.2295411735859756}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 0.7934334422653395}\nC: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nD: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -68.79930104020924, \"translation_dx\": -103.12901971602221, \"translation_dy\": 94.89161684072867, \"scale\": 1.2295411735859756}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 0.7934334422653395}\nC: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nD: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_115_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_115_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 120.9888581359325, 
\"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nB: {\"rotation_angle\": 49.896013394485834, \"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nC: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nD: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nB: {\"rotation_angle\": 49.896013394485834, \"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nC: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nD: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_116_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_116_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 
-31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nB: {\"rotation_angle\": -46.75272698463425, \"translation_dx\": 16.424107524155175, \"translation_dy\": -60.683488552754085, \"scale\": 1.375025476214386}\nC: {\"rotation_angle\": 4.601729825002167, \"translation_dx\": -92.34842360064926, \"translation_dy\": 78.34726427877602, \"scale\": 0.7620115680057987}\nD: {\"rotation_angle\": -75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nB: {\"rotation_angle\": -46.75272698463425, \"translation_dx\": 16.424107524155175, \"translation_dy\": -60.683488552754085, \"scale\": 1.375025476214386}\nC: {\"rotation_angle\": 4.601729825002167, \"translation_dx\": -92.34842360064926, \"translation_dy\": 78.34726427877602, \"scale\": 0.7620115680057987}\nD: {\"rotation_angle\": -75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_117_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_117_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: 
{\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nC: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nD: {\"rotation_angle\": 110.02825264959768, \"translation_dx\": -53.26387197670213, \"translation_dy\": 88.43864976013427, \"scale\": 1.4833645013101147}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nC: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nD: {\"rotation_angle\": 110.02825264959768, \"translation_dx\": -53.26387197670213, \"translation_dy\": 88.43864976013427, \"scale\": 1.4833645013101147}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_118_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_118_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", 
"options": "A: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}\nB: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": -17.786253698900993, \"scale\": 1.4583062003808291}\nC: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nD: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}\nB: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": -17.786253698900993, \"scale\": 1.4583062003808291}\nC: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nD: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_119_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_119_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": 
"COCO_spatial", "options": "A: {\"rotation_angle\": -110.46391589612124, \"translation_dx\": -77.96644542647721, \"translation_dy\": -50.23500265461973, \"scale\": 0.7651088884143488}\nB: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 1.2561938294527635}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -110.46391589612124, \"translation_dx\": -77.96644542647721, \"translation_dy\": -50.23500265461973, \"scale\": 0.7651088884143488}\nB: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 1.2561938294527635}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_120_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_120_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural 
image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nB: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nC: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}\nD: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nB: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nC: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}\nD: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_121_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_121_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": 
"natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nB: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nC: {\"rotation_angle\": -148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nD: {\"rotation_angle\": -131.1795029858263, \"translation_dx\": 17.908074544940433, \"translation_dy\": 120.17637833747304, \"scale\": 0.9471882483559888}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nB: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nC: {\"rotation_angle\": -148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nD: {\"rotation_angle\": -131.1795029858263, \"translation_dx\": 17.908074544940433, \"translation_dy\": 120.17637833747304, \"scale\": 0.9471882483559888}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_122_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_122_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", 
"visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 70.46713054198463, \"translation_dx\": 21.906055640356044, \"translation_dy\": -12.161170387444017, \"scale\": 0.6983211043742098}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 0.7934334422653395}\nC: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, \"scale\": 1.0041476956174793}\nD: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. 
\nSelect from the following choices.\nA: {\"rotation_angle\": 70.46713054198463, \"translation_dx\": 21.906055640356044, \"translation_dy\": -12.161170387444017, \"scale\": 0.6983211043742098}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 0.7934334422653395}\nC: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, \"scale\": 1.0041476956174793}\nD: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_123_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_123_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": -94.68626204020723, \"scale\": 1.377433782383828}\nC: {\"rotation_angle\": -174.94064668132228, \"translation_dx\": 73.73079207136513, \"translation_dy\": 58.25534486945551, \"scale\": 1.178357936048121}\nD: {\"rotation_angle\": -128.74497971799806, \"translation_dx\": -55.835206426128764, \"translation_dy\": 54.178252983369276, \"scale\": 0.8905979693160588}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation 
that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -38.67054772511392, \"translation_dx\": 68.1059088983965, \"translation_dy\": -80.75433684597641, \"scale\": 1.0669693911306672}\nB: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": -94.68626204020723, \"scale\": 1.377433782383828}\nC: {\"rotation_angle\": -174.94064668132228, \"translation_dx\": 73.73079207136513, \"translation_dy\": 58.25534486945551, \"scale\": 1.178357936048121}\nD: {\"rotation_angle\": -128.74497971799806, \"translation_dx\": -55.835206426128764, \"translation_dy\": 54.178252983369276, \"scale\": 0.8905979693160588}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_124_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_124_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 95.56102360167273, \"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}\nB: {\"rotation_angle\": 36.19361803007027, \"translation_dx\": -50.40071399889004, \"translation_dy\": -85.39533040467117, \"scale\": 0.6522247071940848}\nC: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}\nD: {\"rotation_angle\": -23.02063628299686, \"translation_dx\": -42.06347070905805, \"translation_dy\": 68.90308226059909, \"scale\": 0.7321107429069119}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the 
transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 95.56102360167273, \"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}\nB: {\"rotation_angle\": 36.19361803007027, \"translation_dx\": -50.40071399889004, \"translation_dy\": -85.39533040467117, \"scale\": 0.6522247071940848}\nC: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}\nD: {\"rotation_angle\": -23.02063628299686, \"translation_dx\": -42.06347070905805, \"translation_dy\": 68.90308226059909, \"scale\": 0.7321107429069119}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_125_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_125_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, \"translation_dy\": 24.014319900588845, \"scale\": 1.3204557483507742}\nB: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nC: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nD: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and 
magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -103.24791656906933, \"translation_dx\": -2.2454836983213227, \"translation_dy\": 24.014319900588845, \"scale\": 1.3204557483507742}\nB: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nC: {\"rotation_angle\": -94.06455293225282, \"translation_dx\": -52.04430006776356, \"translation_dy\": 88.55937507710391, \"scale\": 0.8369046461483086}\nD: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_126_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_126_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}\nB: {\"rotation_angle\": -149.34069149386406, \"translation_dx\": 81.63420911320063, \"translation_dy\": -26.073567429384056, \"scale\": 1.427947630130646}\nC: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nD: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict 
the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}\nB: {\"rotation_angle\": -149.34069149386406, \"translation_dx\": 81.63420911320063, \"translation_dy\": -26.073567429384056, \"scale\": 1.427947630130646}\nC: {\"rotation_angle\": 37.640985396206986, \"translation_dx\": -97.39428669742068, \"translation_dy\": 17.900860680283458, \"scale\": 1.0930243251030827}\nD: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_127_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_127_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}\nB: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nC: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nD: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is 
to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}\nB: {\"rotation_angle\": -5.683971346231118, \"translation_dx\": -0.7123036436211407, \"translation_dy\": -23.660599152813326, \"scale\": 1.1241034499451734}\nC: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nD: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_128_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_128_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -163.07830945514343, \"translation_dx\": 107.25371607945826, \"translation_dy\": 44.19319462200147, \"scale\": 1.0330497674624493}\nB: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}\nC: {\"rotation_angle\": 137.29869982747988, \"translation_dx\": 75.41375097241084, \"translation_dy\": 55.66358575693553, \"scale\": 1.1335508281242805}\nD: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), 
your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -163.07830945514343, \"translation_dx\": 107.25371607945826, \"translation_dy\": 44.19319462200147, \"scale\": 1.0330497674624493}\nB: {\"rotation_angle\": 137.6047485759084, \"translation_dx\": -27.00857214512888, \"translation_dy\": -94.97246325619065, \"scale\": 1.1628545134465245}\nC: {\"rotation_angle\": 137.29869982747988, \"translation_dx\": 75.41375097241084, \"translation_dy\": 55.66358575693553, \"scale\": 1.1335508281242805}\nD: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_129_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_129_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}\nB: {\"rotation_angle\": 171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}\nC: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nD: {\"rotation_angle\": 172.84173099768327, \"translation_dx\": -36.82796075364796, \"translation_dy\": -15.346257103503191, \"scale\": 0.8112655094699114}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, 
translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 153.24034529323683, \"translation_dx\": -80.95083564593054, \"translation_dy\": 58.17854805068575, \"scale\": 0.8564275095577245}\nB: {\"rotation_angle\": 171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}\nC: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nD: {\"rotation_angle\": 172.84173099768327, \"translation_dx\": -36.82796075364796, \"translation_dy\": -15.346257103503191, \"scale\": 0.8112655094699114}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_130_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_130_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -6.970858631484532, \"translation_dx\": -2.793256631611797, \"translation_dy\": 83.08133552847667, \"scale\": 1.4237697720578382}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": 88.199522854527, \"translation_dx\": 18.814421533590917, \"translation_dy\": -27.135307313502466, \"scale\": 1.37855935527965}\nD: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation 
(e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -6.970858631484532, \"translation_dx\": -2.793256631611797, \"translation_dy\": 83.08133552847667, \"scale\": 1.4237697720578382}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": 88.199522854527, \"translation_dx\": 18.814421533590917, \"translation_dy\": -27.135307313502466, \"scale\": 1.37855935527965}\nD: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_131_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_131_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nB: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nC: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nD: {\"rotation_angle\": 157.75388648393812, \"translation_dx\": 20.356281771878216, \"translation_dy\": 16.09866009065132, \"scale\": 0.523349135390574}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial 
transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nB: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nC: {\"rotation_angle\": -31.020660516088725, \"translation_dx\": 105.99805178546191, \"translation_dy\": -82.8489656004858, \"scale\": 1.0703563169477137}\nD: {\"rotation_angle\": 157.75388648393812, \"translation_dx\": 20.356281771878216, \"translation_dy\": 16.09866009065132, \"scale\": 0.523349135390574}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_132_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_132_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nB: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nC: {\"rotation_angle\": -149.42147215379055, \"translation_dx\": 2.3444194857030283, \"translation_dy\": 35.92779325530762, \"scale\": 1.0223945055206394}\nD: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and 
after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nB: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nC: {\"rotation_angle\": -149.42147215379055, \"translation_dx\": 2.3444194857030283, \"translation_dy\": 35.92779325530762, \"scale\": 1.0223945055206394}\nD: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_133_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_133_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nB: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nC: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nD: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes 
before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nB: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nC: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nD: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_134_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_134_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nB: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nC: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nD: {\"rotation_angle\": 130.382151153576, \"translation_dx\": 48.77925626504499, \"translation_dy\": 54.89982459749416, \"scale\": 1.3647831130001666}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting 
scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nB: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nC: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nD: {\"rotation_angle\": 130.382151153576, \"translation_dx\": 48.77925626504499, \"translation_dy\": 54.89982459749416, \"scale\": 1.3647831130001666}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_135_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_135_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nB: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nC: {\"rotation_angle\": -124.27587082376021, \"translation_dx\": -88.19288051455345, \"translation_dy\": 24.145134775980125, \"scale\": 1.4414104211047083}\nD: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images 
depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 45.786611297437304, \"translation_dx\": 45.53183354666939, \"translation_dy\": -112.45880863798888, \"scale\": 0.5686394776423458}\nB: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}\nC: {\"rotation_angle\": -124.27587082376021, \"translation_dx\": -88.19288051455345, \"translation_dy\": 24.145134775980125, \"scale\": 1.4414104211047083}\nD: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_136_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_136_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nC: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": -94.68626204020723, \"scale\": 1.377433782383828}\nD: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given 
pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nC: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": -94.68626204020723, \"scale\": 1.377433782383828}\nD: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_137_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_137_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -50.19218790392131, \"translation_dx\": -27.31734251737683, \"translation_dy\": 8.514724344494553, \"scale\": 1.0874517053433594}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 0.7934334422653395}\nC: {\"rotation_angle\": -6.970858631484532, \"translation_dx\": -2.793256631611797, \"translation_dy\": 83.08133552847667, \"scale\": 1.4237697720578382}\nD: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", 
"context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -50.19218790392131, \"translation_dx\": -27.31734251737683, \"translation_dy\": 8.514724344494553, \"scale\": 1.0874517053433594}\nB: {\"rotation_angle\": -176.3085334768787, \"translation_dx\": -26.09189325642553, \"translation_dy\": 21.458056495366975, \"scale\": 0.7934334422653395}\nC: {\"rotation_angle\": -6.970858631484532, \"translation_dx\": -2.793256631611797, \"translation_dy\": 83.08133552847667, \"scale\": 1.4237697720578382}\nD: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_138_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_138_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nB: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nC: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}", "question": "Please compute the type and parameters of the spatial transformation between these two 
images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nB: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nC: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_139_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_139_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 172.84173099768327, \"translation_dx\": -36.82796075364796, \"translation_dy\": -15.346257103503191, \"scale\": 0.8112655094699114}\nB: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nC: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nD: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}", "question": "Please compute the type and parameters of the spatial transformation 
between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 172.84173099768327, \"translation_dx\": -36.82796075364796, \"translation_dy\": -15.346257103503191, \"scale\": 0.8112655094699114}\nB: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nC: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nD: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_140_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_140_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": 115.44035395260755, \"translation_dx\": 104.38539690843712, \"translation_dy\": -82.71757148170198, \"scale\": 0.6534862534786243}\nD: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}", "question": "Please compute the type and parameters of the spatial 
transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nB: {\"rotation_angle\": -79.55706788063112, \"translation_dx\": -38.613403166877674, \"translation_dy\": 48.56888435185245, \"scale\": 1.368947012195521}\nC: {\"rotation_angle\": 115.44035395260755, \"translation_dx\": 104.38539690843712, \"translation_dy\": -82.71757148170198, \"scale\": 0.6534862534786243}\nD: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_141_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_141_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nB: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nC: {\"rotation_angle\": -70.97525301082955, \"translation_dx\": -28.380848037876873, \"translation_dy\": 54.37723426674512, \"scale\": 0.9024922197892329}\nD: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}", "question": "Please compute the type and parameters of 
the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nB: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nC: {\"rotation_angle\": -70.97525301082955, \"translation_dx\": -28.380848037876873, \"translation_dy\": 54.37723426674512, \"scale\": 0.9024922197892329}\nD: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_142_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_142_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nB: {\"rotation_angle\": -123.92621597373325, \"translation_dx\": 115.25994331141689, \"translation_dy\": -45.13111299141354, \"scale\": 1.164470344420729}\nC: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}\nD: {\"rotation_angle\": 55.990963226006784, \"translation_dx\": 71.2358057599877, \"translation_dy\": 22.751866785772563, \"scale\": 1.4964705985201703}", "question": "Please compute the type and 
parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nB: {\"rotation_angle\": -123.92621597373325, \"translation_dx\": 115.25994331141689, \"translation_dy\": -45.13111299141354, \"scale\": 1.164470344420729}\nC: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}\nD: {\"rotation_angle\": 55.990963226006784, \"translation_dx\": 71.2358057599877, \"translation_dy\": 22.751866785772563, \"scale\": 1.4964705985201703}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_143_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_143_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nB: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nC: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, \"scale\": 1.0041476956174793}\nD: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}", "question": "Please compute 
the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nB: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nC: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, \"scale\": 1.0041476956174793}\nD: {\"rotation_angle\": -95.56761680572791, \"translation_dx\": -92.07587430861633, \"translation_dy\": -64.18919222058364, \"scale\": 1.033728049154846}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_144_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_144_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nB: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nC: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nD: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}", "question": "Please 
compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nB: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nC: {\"rotation_angle\": 120.9888581359325, \"translation_dx\": 2.43720894071744, \"translation_dy\": -7.865691814940682, \"scale\": 0.5519813971136048}\nD: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_145_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_145_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -6.38420562293993, \"translation_dx\": -106.80670691302902, \"translation_dy\": -3.5935098985529663, \"scale\": 1.3037846299861797}\nB: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}\nC: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nD: {\"rotation_angle\": 137.29869982747988, \"translation_dx\": 75.41375097241084, \"translation_dy\": 55.66358575693553, \"scale\": 1.1335508281242805}", "question": "Please 
compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -6.38420562293993, \"translation_dx\": -106.80670691302902, \"translation_dy\": -3.5935098985529663, \"scale\": 1.3037846299861797}\nB: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}\nC: {\"rotation_angle\": 97.08459407481979, \"translation_dx\": 38.76418659488206, \"translation_dy\": 44.81166266995322, \"scale\": 1.27585958531192}\nD: {\"rotation_angle\": 137.29869982747988, \"translation_dx\": 75.41375097241084, \"translation_dy\": 55.66358575693553, \"scale\": 1.1335508281242805}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_146_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_146_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nC: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nD: {\"rotation_angle\": -4.364889011784271, \"translation_dx\": 74.89385338851659, \"translation_dy\": 29.259521498010997, \"scale\": 1.2877948451877137}", "question": 
"Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nB: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nC: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nD: {\"rotation_angle\": -4.364889011784271, \"translation_dx\": 74.89385338851659, \"translation_dy\": 29.259521498010997, \"scale\": 1.2877948451877137}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_147_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_147_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nB: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nC: {\"rotation_angle\": 161.7596265938729, \"translation_dx\": -9.170216354863072, \"translation_dy\": -19.23222492696047, \"scale\": 1.1821087248622173}\nD: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 
1.371999627635525}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -127.03310490562403, \"translation_dx\": -44.497972498107885, \"translation_dy\": 53.252184804163164, \"scale\": 0.8807762361133948}\nB: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nC: {\"rotation_angle\": 161.7596265938729, \"translation_dx\": -9.170216354863072, \"translation_dy\": -19.23222492696047, \"scale\": 1.1821087248622173}\nD: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_148_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_148_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -32.96407209098831, \"translation_dx\": -27.518946535455143, \"translation_dy\": 2.5370159689679213, \"scale\": 1.259328459428434}\nB: {\"rotation_angle\": -148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nC: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nD: {\"rotation_angle\": -6.258618837806779, \"translation_dx\": -117.56200624611057, \"translation_dy\": 
-84.92852320396813, \"scale\": 0.8703619649920769}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -32.96407209098831, \"translation_dx\": -27.518946535455143, \"translation_dy\": 2.5370159689679213, \"scale\": 1.259328459428434}\nB: {\"rotation_angle\": -148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nC: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nD: {\"rotation_angle\": -6.258618837806779, \"translation_dx\": -117.56200624611057, \"translation_dy\": -84.92852320396813, \"scale\": 0.8703619649920769}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_149_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_149_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}\nB: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nC: {\"rotation_angle\": 156.4647723112265, \"translation_dx\": -66.53886800122852, \"translation_dy\": 64.98500274528308, \"scale\": 1.1427015309184732}\nD: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, 
\"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}\nB: {\"rotation_angle\": 134.59992138556464, \"translation_dx\": 5.908404103559974, \"translation_dy\": 47.60587687007518, \"scale\": 1.0105063493742612}\nC: {\"rotation_angle\": 156.4647723112265, \"translation_dx\": -66.53886800122852, \"translation_dy\": 64.98500274528308, \"scale\": 1.1427015309184732}\nD: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_150_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_150_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": -17.359661719977098, \"scale\": 1.0858344969275349}\nB: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": 96.727171962103, \"translation_dx\": 
36.81177221178956, \"translation_dy\": 18.012374651364837, \"scale\": 0.7274955443317854}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -51.98717119490195, \"translation_dx\": -83.93544420557635, \"translation_dy\": -17.359661719977098, \"scale\": 1.0858344969275349}\nB: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": 96.727171962103, \"translation_dx\": 36.81177221178956, \"translation_dy\": 18.012374651364837, \"scale\": 0.7274955443317854}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_151_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_151_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, \"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nB: {\"rotation_angle\": 74.4727172984789, \"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, \"scale\": 1.4775593630739356}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": 95.56102360167273, 
\"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, \"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nB: {\"rotation_angle\": 74.4727172984789, \"translation_dx\": 83.0498783040965, \"translation_dy\": 24.573318419119772, \"scale\": 1.4775593630739356}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": 95.56102360167273, \"translation_dx\": -57.629857243876444, \"translation_dy\": -95.34824117323305, \"scale\": 0.9533126568708786}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_152_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_152_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nB: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}\nC: {\"rotation_angle\": 133.22970053001933, \"translation_dx\": 30.83867253278636, \"translation_dy\": 9.987607615316023, \"scale\": 0.9746642566652708}\nD: {\"rotation_angle\": 
-38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nB: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}\nC: {\"rotation_angle\": 133.22970053001933, \"translation_dx\": 30.83867253278636, \"translation_dy\": 9.987607615316023, \"scale\": 0.9746642566652708}\nD: {\"rotation_angle\": -38.58021171568234, \"translation_dx\": -80.14139661496048, \"translation_dy\": 7.985099889843255, \"scale\": 1.029545268033875}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_153_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_153_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 103.56580652114087, \"translation_dx\": -76.88940345297716, \"translation_dy\": -3.4544443607121593, \"scale\": 1.3949152683659345}\nB: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nC: {\"rotation_angle\": -99.80397961792426, \"translation_dx\": 113.2252387398062, \"translation_dy\": -61.846052830557056, \"scale\": 1.080357872583317}\nD: 
{\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 103.56580652114087, \"translation_dx\": -76.88940345297716, \"translation_dy\": -3.4544443607121593, \"scale\": 1.3949152683659345}\nB: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nC: {\"rotation_angle\": -99.80397961792426, \"translation_dx\": 113.2252387398062, \"translation_dy\": -61.846052830557056, \"scale\": 1.080357872583317}\nD: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_154_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_154_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}\nB: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}\nC: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, \"translation_dy\": -32.06450758946801, \"scale\": 
1.1967157159259998}\nD: {\"rotation_angle\": -153.3687774434925, \"translation_dx\": 50.92336593606055, \"translation_dy\": -56.81603844715568, \"scale\": 1.398231264497651}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}\nB: {\"rotation_angle\": -15.445234303955033, \"translation_dx\": 52.656313993324545, \"translation_dy\": 4.243768644047549, \"scale\": 0.8747335302455691}\nC: {\"rotation_angle\": -59.18065174130953, \"translation_dx\": -66.15733764198566, \"translation_dy\": -32.06450758946801, \"scale\": 1.1967157159259998}\nD: {\"rotation_angle\": -153.3687774434925, \"translation_dx\": 50.92336593606055, \"translation_dy\": -56.81603844715568, \"scale\": 1.398231264497651}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_155_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_155_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -153.3687774434925, \"translation_dx\": 50.92336593606055, \"translation_dy\": -56.81603844715568, \"scale\": 1.398231264497651}\nB: {\"rotation_angle\": -124.74198080809023, \"translation_dx\": -48.23531115232953, \"translation_dy\": 52.62526617026404, \"scale\": 1.3484625774406969}\nC: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, 
\"scale\": 1.0421065600095725}\nD: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": -17.786253698900993, \"scale\": 1.4583062003808291}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -153.3687774434925, \"translation_dx\": 50.92336593606055, \"translation_dy\": -56.81603844715568, \"scale\": 1.398231264497651}\nB: {\"rotation_angle\": -124.74198080809023, \"translation_dx\": -48.23531115232953, \"translation_dy\": 52.62526617026404, \"scale\": 1.3484625774406969}\nC: {\"rotation_angle\": -132.6730586187399, \"translation_dx\": -14.723128468316531, \"translation_dy\": -95.44210429834934, \"scale\": 1.0421065600095725}\nD: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": -17.786253698900993, \"scale\": 1.4583062003808291}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_156_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_156_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}\nB: {\"rotation_angle\": -153.95647753312159, \"translation_dx\": 64.08546266437509, \"translation_dy\": -34.554486291313935, \"scale\": 1.423360690418288}\nC: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": 
-35.71086816730676, \"scale\": 1.30648936857296}\nD: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 52.27392299801002, \"translation_dx\": -7.943242591889941, \"translation_dy\": -1.8318597711701017, \"scale\": 1.489664776133741}\nB: {\"rotation_angle\": -153.95647753312159, \"translation_dx\": 64.08546266437509, \"translation_dy\": -34.554486291313935, \"scale\": 1.423360690418288}\nC: {\"rotation_angle\": -126.15991399279281, \"translation_dx\": 24.895638463286446, \"translation_dy\": -35.71086816730676, \"scale\": 1.30648936857296}\nD: {\"rotation_angle\": 98.62110540120432, \"translation_dx\": 55.8324503005326, \"translation_dy\": -53.32963696213369, \"scale\": 1.3342375308232577}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_157_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_157_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nB: {\"rotation_angle\": 157.75388648393812, \"translation_dx\": 20.356281771878216, \"translation_dy\": 16.09866009065132, \"scale\": 0.523349135390574}\nC: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, 
\"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}\nD: {\"rotation_angle\": 168.86687879669455, \"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nB: {\"rotation_angle\": 157.75388648393812, \"translation_dx\": 20.356281771878216, \"translation_dy\": 16.09866009065132, \"scale\": 0.523349135390574}\nC: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}\nD: {\"rotation_angle\": 168.86687879669455, \"translation_dx\": 30.327287286076626, \"translation_dy\": -73.84263373893171, \"scale\": 1.0887904122788439}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_158_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_158_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nB: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}\nC: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": 
-68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nD: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nB: {\"rotation_angle\": -76.09611957445006, \"translation_dx\": -118.19634710213703, \"translation_dy\": 85.91610719889127, \"scale\": 1.371999627635525}\nC: {\"rotation_angle\": -16.878745814478265, \"translation_dx\": -68.86659110743665, \"translation_dy\": -98.54142762965468, \"scale\": 1.2648663928919022}\nD: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_159_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_159_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nB: {\"rotation_angle\": -162.34443008832744, \"translation_dx\": 11.222356042803995, \"translation_dy\": -20.913798214168963, \"scale\": 0.5876305148063811}\nC: {\"rotation_angle\": -41.748048059314925, 
\"translation_dx\": 84.2495675740148, \"translation_dy\": -81.02778113177463, \"scale\": 1.207158201764622}\nD: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nB: {\"rotation_angle\": -162.34443008832744, \"translation_dx\": 11.222356042803995, \"translation_dy\": -20.913798214168963, \"scale\": 0.5876305148063811}\nC: {\"rotation_angle\": -41.748048059314925, \"translation_dx\": 84.2495675740148, \"translation_dy\": -81.02778113177463, \"scale\": 1.207158201764622}\nD: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_160_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_160_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nB: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nC: {\"rotation_angle\": 
-49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nB: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nC: {\"rotation_angle\": -49.11147497176091, \"translation_dx\": -21.61309921155923, \"translation_dy\": 41.841400081955015, \"scale\": 1.3374733710705384}\nD: {\"rotation_angle\": -22.98450105670534, \"translation_dx\": -24.343109907781525, \"translation_dy\": -75.50859401578859, \"scale\": 0.5077440368943875}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_161_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_161_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 4.601729825002167, \"translation_dx\": -92.34842360064926, \"translation_dy\": 78.34726427877602, \"scale\": 0.7620115680057987}\nB: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nC: 
{\"rotation_angle\": -110.46391589612124, \"translation_dx\": -77.96644542647721, \"translation_dy\": -50.23500265461973, \"scale\": 0.7651088884143488}\nD: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 4.601729825002167, \"translation_dx\": -92.34842360064926, \"translation_dy\": 78.34726427877602, \"scale\": 0.7620115680057987}\nB: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nC: {\"rotation_angle\": -110.46391589612124, \"translation_dx\": -77.96644542647721, \"translation_dy\": -50.23500265461973, \"scale\": 0.7651088884143488}\nD: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_162_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_162_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nB: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 
1.391656794638211}\nC: {\"rotation_angle\": -162.34443008832744, \"translation_dx\": 11.222356042803995, \"translation_dy\": -20.913798214168963, \"scale\": 0.5876305148063811}\nD: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nB: {\"rotation_angle\": -137.58016126496426, \"translation_dx\": 45.631572391068715, \"translation_dy\": -54.72741054396442, \"scale\": 1.391656794638211}\nC: {\"rotation_angle\": -162.34443008832744, \"translation_dx\": 11.222356042803995, \"translation_dy\": -20.913798214168963, \"scale\": 0.5876305148063811}\nD: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_163_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_163_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nB: {\"rotation_angle\": -162.34443008832744, \"translation_dx\": 11.222356042803995, \"translation_dy\": 
-20.913798214168963, \"scale\": 0.5876305148063811}\nC: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}\nD: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -127.2688410750471, \"translation_dx\": 10.330064507300825, \"translation_dy\": -25.010404065134438, \"scale\": 1.1376215421095472}\nB: {\"rotation_angle\": -162.34443008832744, \"translation_dx\": 11.222356042803995, \"translation_dy\": -20.913798214168963, \"scale\": 0.5876305148063811}\nC: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}\nD: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_164_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_164_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nB: {\"rotation_angle\": 23.955007488404988, \"translation_dx\": 90.0018582930472, 
\"translation_dy\": 38.03553582875617, \"scale\": 1.3380437802347522}\nC: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}\nD: {\"rotation_angle\": 157.75388648393812, \"translation_dx\": 20.356281771878216, \"translation_dy\": 16.09866009065132, \"scale\": 0.523349135390574}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nB: {\"rotation_angle\": 23.955007488404988, \"translation_dx\": 90.0018582930472, \"translation_dy\": 38.03553582875617, \"scale\": 1.3380437802347522}\nC: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}\nD: {\"rotation_angle\": 157.75388648393812, \"translation_dx\": 20.356281771878216, \"translation_dy\": 16.09866009065132, \"scale\": 0.523349135390574}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_165_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_165_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nB: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": 
-126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nC: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nD: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -128.93587705152078, \"translation_dx\": 48.830662388872895, \"translation_dy\": 65.60255696435819, \"scale\": 0.5618983722639579}\nB: {\"rotation_angle\": 159.25105466068987, \"translation_dx\": -126.35420360425098, \"translation_dy\": -17.54721978726404, \"scale\": 1.4952435062275256}\nC: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nD: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_166_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_166_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}\nB: {\"rotation_angle\": -100.94596249363259, 
\"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nC: {\"rotation_angle\": 28.728757892682808, \"translation_dx\": 12.065384659700086, \"translation_dy\": -119.64549643343977, \"scale\": 1.126100132224236}\nD: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}\nB: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nC: {\"rotation_angle\": 28.728757892682808, \"translation_dx\": 12.065384659700086, \"translation_dy\": -119.64549643343977, \"scale\": 1.126100132224236}\nD: {\"rotation_angle\": 173.6372649335733, \"translation_dx\": -7.357207392874017, \"translation_dy\": -51.70776156994498, \"scale\": 1.09720142096939}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_167_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_167_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}\nB: {\"rotation_angle\": 
64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}\nB: {\"rotation_angle\": 64.33574528550244, \"translation_dx\": -83.09111528364858, \"translation_dy\": 12.26726314152404, \"scale\": 0.7845370507816389}\nC: {\"rotation_angle\": -97.38730278840897, \"translation_dx\": 79.58431404822528, \"translation_dy\": -65.17570525641105, \"scale\": 0.8501057849742453}\nD: {\"rotation_angle\": -4.956802948250129, \"translation_dx\": -46.115491929325685, \"translation_dy\": 39.01349173096322, \"scale\": 1.02280257064298}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_168_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_168_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -44.902472769484746, \"translation_dx\": -36.85475324083902, \"translation_dy\": 36.81692000181951, \"scale\": 1.0769710077370194}\nB: 
{\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}\nC: {\"rotation_angle\": 32.25033099080062, \"translation_dx\": -33.246475706714875, \"translation_dy\": -9.848772328845214, \"scale\": 0.986502265576198}\nD: {\"rotation_angle\": 55.990963226006784, \"translation_dx\": 71.2358057599877, \"translation_dy\": 22.751866785772563, \"scale\": 1.4964705985201703}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -44.902472769484746, \"translation_dx\": -36.85475324083902, \"translation_dy\": 36.81692000181951, \"scale\": 1.0769710077370194}\nB: {\"rotation_angle\": -161.22593365548192, \"translation_dx\": -119.73961882572601, \"translation_dy\": -93.50838821854722, \"scale\": 1.4476413063179399}\nC: {\"rotation_angle\": 32.25033099080062, \"translation_dx\": -33.246475706714875, \"translation_dy\": -9.848772328845214, \"scale\": 0.986502265576198}\nD: {\"rotation_angle\": 55.990963226006784, \"translation_dx\": 71.2358057599877, \"translation_dy\": 22.751866785772563, \"scale\": 1.4964705985201703}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_169_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_169_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 
1.3327657238113826}\nB: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": -17.786253698900993, \"scale\": 1.4583062003808291}\nC: {\"rotation_angle\": 172.84173099768327, \"translation_dx\": -36.82796075364796, \"translation_dy\": -15.346257103503191, \"scale\": 0.8112655094699114}\nD: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 11.25207260435026, \"scale\": 1.2682750116992958}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 1.3327657238113826}\nB: {\"rotation_angle\": -32.057796286961064, \"translation_dx\": 119.50392135854452, \"translation_dy\": -17.786253698900993, \"scale\": 1.4583062003808291}\nC: {\"rotation_angle\": 172.84173099768327, \"translation_dx\": -36.82796075364796, \"translation_dy\": -15.346257103503191, \"scale\": 0.8112655094699114}\nD: {\"rotation_angle\": 138.15953129001275, \"translation_dx\": 108.29077351507729, \"translation_dy\": 11.25207260435026, \"scale\": 1.2682750116992958}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_170_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_170_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": 
-71.56835409165808, \"scale\": 0.5234271564227445}\nB: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nC: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nD: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nB: {\"rotation_angle\": -152.40502323992493, \"translation_dx\": -0.6096313646742146, \"translation_dy\": 26.2224872549711, \"scale\": 0.6008305458537412}\nC: {\"rotation_angle\": 136.2943203908062, \"translation_dx\": 59.15508525636656, \"translation_dy\": -38.46099161723379, \"scale\": 0.6414776081953896}\nD: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_171_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_171_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, 
\"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nB: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nC: {\"rotation_angle\": -35.37165300247324, \"translation_dx\": -51.674784510203665, \"translation_dy\": 35.0550301640573, \"scale\": 1.181842779166554}\nD: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 1.2561938294527635}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nB: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nC: {\"rotation_angle\": -35.37165300247324, \"translation_dx\": -51.674784510203665, \"translation_dy\": 35.0550301640573, \"scale\": 1.181842779166554}\nD: {\"rotation_angle\": -147.17742740700606, \"translation_dx\": 99.79022385553455, \"translation_dy\": -46.32888217161055, \"scale\": 1.2561938294527635}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_172_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_172_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": 
-107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}\nB: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nC: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": -94.68626204020723, \"scale\": 1.377433782383828}\nD: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 139.13421797404374, \"translation_dx\": -107.62188977651758, \"translation_dy\": -65.35657968686931, \"scale\": 0.569575564082204}\nB: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}\nC: {\"rotation_angle\": -165.5576257925042, \"translation_dx\": 120.02978270991923, \"translation_dy\": -94.68626204020723, \"scale\": 1.377433782383828}\nD: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_173_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_173_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -53.475823147809436, 
\"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}\nB: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nC: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}\nD: {\"rotation_angle\": 51.652651058291696, \"translation_dx\": -79.60059266318888, \"translation_dy\": 40.24223939512936, \"scale\": 1.045377495061187}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -53.475823147809436, \"translation_dx\": -52.11444637245131, \"translation_dy\": -7.974464084606126, \"scale\": 1.302004904680502}\nB: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nC: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}\nD: {\"rotation_angle\": 51.652651058291696, \"translation_dx\": -79.60059266318888, \"translation_dy\": 40.24223939512936, \"scale\": 1.045377495061187}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_174_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_174_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 
160.04018122869564, \"translation_dx\": -10.031879581871024, \"translation_dy\": 74.10075881851205, \"scale\": 0.8976020445815951}\nB: {\"rotation_angle\": 134.22497079750707, \"translation_dx\": -56.33244292094708, \"translation_dy\": 12.15417280277697, \"scale\": 1.260404381889235}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 160.04018122869564, \"translation_dx\": -10.031879581871024, \"translation_dy\": 74.10075881851205, \"scale\": 0.8976020445815951}\nB: {\"rotation_angle\": 134.22497079750707, \"translation_dx\": -56.33244292094708, \"translation_dy\": 12.15417280277697, \"scale\": 1.260404381889235}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": 162.98131081099467, \"translation_dx\": -80.19473687776261, \"translation_dy\": -17.70282064458462, \"scale\": 1.2855975600149028}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_175_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_175_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: 
{\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}\nB: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nC: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nD: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 170.5673161572617, \"translation_dx\": -54.14309140946517, \"translation_dy\": -20.9067824061149, \"scale\": 0.74080987054586}\nB: {\"rotation_angle\": 98.88222011850513, \"translation_dx\": 98.58699088344886, \"translation_dy\": 52.424259863835346, \"scale\": 0.8670994673205047}\nC: {\"rotation_angle\": 107.15748471049534, \"translation_dx\": -112.04520804841785, \"translation_dy\": 107.36899853350675, \"scale\": 0.784106447062462}\nD: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_176_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_176_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: 
{\"rotation_angle\": -75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}\nB: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nC: {\"rotation_angle\": 106.62912259997893, \"translation_dx\": -62.19399566166837, \"translation_dy\": -63.078041204745844, \"scale\": 1.4577244189370733}\nD: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}\nB: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}\nC: {\"rotation_angle\": 106.62912259997893, \"translation_dx\": -62.19399566166837, \"translation_dy\": -63.078041204745844, \"scale\": 1.4577244189370733}\nD: {\"rotation_angle\": 26.06413776863195, \"translation_dx\": 104.54441011530889, \"translation_dy\": -2.802993361858995, \"scale\": 0.6919535578881184}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_177_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_177_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": 
"A: {\"rotation_angle\": -149.42147215379055, \"translation_dx\": 2.3444194857030283, \"translation_dy\": 35.92779325530762, \"scale\": 1.0223945055206394}\nB: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nC: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nD: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -149.42147215379055, \"translation_dx\": 2.3444194857030283, \"translation_dy\": 35.92779325530762, \"scale\": 1.0223945055206394}\nB: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nC: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}\nD: {\"rotation_angle\": 14.369437993555863, \"translation_dx\": -23.54312301695805, \"translation_dy\": 55.41046511147678, \"scale\": 1.115345902394854}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_178_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_178_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", 
"options": "A: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": -138.01409324857718, \"translation_dx\": -15.316687484355015, \"translation_dy\": 65.85955726482798, \"scale\": 0.7544815678306976}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 104.66960596229086, \"translation_dx\": 122.9579606372167, \"translation_dy\": -32.21502556645471, \"scale\": 0.5791563638149022}\nB: {\"rotation_angle\": -138.01409324857718, \"translation_dx\": -15.316687484355015, \"translation_dy\": 65.85955726482798, \"scale\": 0.7544815678306976}\nC: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nD: {\"rotation_angle\": -106.99875725121946, \"translation_dx\": 87.96881157950656, \"translation_dy\": -34.70529343588741, \"scale\": 1.407305489874207}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_179_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_179_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": 
"COCO_spatial", "options": "A: {\"rotation_angle\": -103.5561502427767, \"translation_dx\": -75.76940431238745, \"translation_dy\": -48.3479107136017, \"scale\": 1.0522987713432983}\nB: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -103.5561502427767, \"translation_dx\": -75.76940431238745, \"translation_dy\": -48.3479107136017, \"scale\": 1.0522987713432983}\nB: {\"rotation_angle\": 136.76946369368522, \"translation_dx\": 86.13615517916296, \"translation_dy\": 47.49597577737802, \"scale\": 1.1842967613683704}\nC: {\"rotation_angle\": 159.74516071456964, \"translation_dx\": 18.36539372865252, \"translation_dy\": -32.68583255299669, \"scale\": 0.6283421405871866}\nD: {\"rotation_angle\": 84.88997243843744, \"translation_dx\": 19.30269357274682, \"translation_dy\": 9.929350250110147, \"scale\": 1.0595552381550672}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_180_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_180_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", 
"source": "COCO_spatial", "options": "A: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nB: {\"rotation_angle\": 134.22497079750707, \"translation_dx\": -56.33244292094708, \"translation_dy\": 12.15417280277697, \"scale\": 1.260404381889235}\nC: {\"rotation_angle\": -173.49565975712173, \"translation_dx\": 30.5303454517925, \"translation_dy\": 77.86216107455405, \"scale\": 1.067173806992701}\nD: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -8.756342422911757, \"translation_dx\": -120.12147874311805, \"translation_dy\": -16.659510954699698, \"scale\": 0.8471832394055047}\nB: {\"rotation_angle\": 134.22497079750707, \"translation_dx\": -56.33244292094708, \"translation_dy\": 12.15417280277697, \"scale\": 1.260404381889235}\nC: {\"rotation_angle\": -173.49565975712173, \"translation_dx\": 30.5303454517925, \"translation_dy\": 77.86216107455405, \"scale\": 1.067173806992701}\nD: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_181_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_181_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural 
image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -6.38420562293993, \"translation_dx\": -106.80670691302902, \"translation_dy\": -3.5935098985529663, \"scale\": 1.3037846299861797}\nC: {\"rotation_angle\": 134.66606893121838, \"translation_dx\": 30.71289427748178, \"translation_dy\": 31.00111281943242, \"scale\": 0.9716368665085688}\nD: {\"rotation_angle\": 171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -169.57691070181107, \"translation_dx\": 67.3776951722352, \"translation_dy\": 6.393739311338578, \"scale\": 0.8283042543093307}\nB: {\"rotation_angle\": -6.38420562293993, \"translation_dx\": -106.80670691302902, \"translation_dy\": -3.5935098985529663, \"scale\": 1.3037846299861797}\nC: {\"rotation_angle\": 134.66606893121838, \"translation_dx\": 30.71289427748178, \"translation_dy\": 31.00111281943242, \"scale\": 0.9716368665085688}\nD: {\"rotation_angle\": 171.23105805984426, \"translation_dx\": 28.800906238980815, \"translation_dy\": 60.921924115709544, \"scale\": 1.4441070487112413}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_182_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_182_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": 
"natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, \"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nB: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}\nC: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}\nD: {\"rotation_angle\": -75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 18.52926347539298, \"translation_dx\": -26.155433185237058, \"translation_dy\": -39.799299198218556, \"scale\": 0.9355127285855813}\nB: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}\nC: {\"rotation_angle\": 72.25092677282458, \"translation_dx\": 61.389740502873025, \"translation_dy\": -36.86538640455047, \"scale\": 1.0748600769835353}\nD: {\"rotation_angle\": -75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_183_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_183_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", 
"visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}\nB: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, \"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nC: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nD: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": -96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 1.3017377870800058}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. 
\nSelect from the following choices.\nA: {\"rotation_angle\": -75.97132980340905, \"translation_dx\": 6.960702322199779, \"translation_dy\": 90.08754109424518, \"scale\": 1.363389071715864}\nB: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, \"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nC: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nD: {\"rotation_angle\": -101.64893396855386, \"translation_dx\": -96.08306753711838, \"translation_dy\": 14.852477797043775, \"scale\": 1.3017377870800058}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_184_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_184_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nB: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}\nC: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nD: {\"rotation_angle\": -149.34069149386406, \"translation_dx\": 81.63420911320063, \"translation_dy\": -26.073567429384056, \"scale\": 1.427947630130646}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation 
that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -178.96154331790243, \"translation_dx\": -45.831117140591004, \"translation_dy\": 14.962223802901406, \"scale\": 1.4059876442036168}\nB: {\"rotation_angle\": 115.4472434811122, \"translation_dx\": 69.00896887231048, \"translation_dy\": -26.016218629159226, \"scale\": 0.9339901852292719}\nC: {\"rotation_angle\": 2.6800660606496933, \"translation_dx\": 8.805898944242955, \"translation_dy\": -61.557448223727356, \"scale\": 0.7338009245004858}\nD: {\"rotation_angle\": -149.34069149386406, \"translation_dx\": 81.63420911320063, \"translation_dy\": -26.073567429384056, \"scale\": 1.427947630130646}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_185_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_185_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -61.308258156024195, \"translation_dx\": -92.42627707406731, \"translation_dy\": -21.076199203141364, \"scale\": 1.1133621977071444}\nB: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nC: {\"rotation_angle\": -0.45613579718829556, \"translation_dx\": 98.71619714866841, \"translation_dy\": 70.1100439641223, \"scale\": 0.6491919010173006}\nD: {\"rotation_angle\": -124.27587082376021, \"translation_dx\": -88.19288051455345, \"translation_dy\": 24.145134775980125, \"scale\": 1.4414104211047083}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of 
the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -61.308258156024195, \"translation_dx\": -92.42627707406731, \"translation_dy\": -21.076199203141364, \"scale\": 1.1133621977071444}\nB: {\"rotation_angle\": 83.49682873903629, \"translation_dx\": -127.2042493945246, \"translation_dy\": 2.6616959584396938, \"scale\": 0.9488759478249397}\nC: {\"rotation_angle\": -0.45613579718829556, \"translation_dx\": 98.71619714866841, \"translation_dy\": 70.1100439641223, \"scale\": 0.6491919010173006}\nD: {\"rotation_angle\": -124.27587082376021, \"translation_dx\": -88.19288051455345, \"translation_dy\": 24.145134775980125, \"scale\": 1.4414104211047083}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_186_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_186_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -163.07830945514343, \"translation_dx\": 107.25371607945826, \"translation_dy\": 44.19319462200147, \"scale\": 1.0330497674624493}\nB: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}\nC: {\"rotation_angle\": -148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nD: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, \"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type 
and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -163.07830945514343, \"translation_dx\": 107.25371607945826, \"translation_dy\": 44.19319462200147, \"scale\": 1.0330497674624493}\nB: {\"rotation_angle\": 115.16030768984217, \"translation_dx\": -1.9669547188467504, \"translation_dy\": 38.42152609256746, \"scale\": 1.3403221872922475}\nC: {\"rotation_angle\": -148.06770236959966, \"translation_dx\": 76.71938731609727, \"translation_dy\": 125.67697929104389, \"scale\": 1.1600663307259453}\nD: {\"rotation_angle\": -98.17490649350026, \"translation_dx\": 5.744855173473269, \"translation_dy\": -10.705504600001973, \"scale\": 1.1182428392253487}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_187_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_187_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nB: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}\nC: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, \"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nD: {\"rotation_angle\": 127.1396993936072, \"translation_dx\": -29.08894824101361, \"translation_dy\": -80.84475014775404, \"scale\": 1.2834497894588772}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict 
the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 143.38145335973087, \"translation_dx\": 86.67970142496799, \"translation_dy\": -33.57640317277091, \"scale\": 0.6114655384261714}\nB: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}\nC: {\"rotation_angle\": 44.2601421515034, \"translation_dx\": -84.9832744911761, \"translation_dy\": -78.07982572554322, \"scale\": 0.5612120736859965}\nD: {\"rotation_angle\": 127.1396993936072, \"translation_dx\": -29.08894824101361, \"translation_dy\": -80.84475014775404, \"scale\": 1.2834497894588772}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_188_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_188_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -6.258618837806779, \"translation_dx\": -117.56200624611057, \"translation_dy\": -84.92852320396813, \"scale\": 0.8703619649920769}\nB: {\"rotation_angle\": -92.49508697379828, \"translation_dx\": 63.09853740086383, \"translation_dy\": 99.47995409556995, \"scale\": 0.9495145406508286}\nC: {\"rotation_angle\": -46.75272698463425, \"translation_dx\": 16.424107524155175, \"translation_dy\": -60.683488552754085, \"scale\": 1.375025476214386}\nD: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to 
predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -6.258618837806779, \"translation_dx\": -117.56200624611057, \"translation_dy\": -84.92852320396813, \"scale\": 0.8703619649920769}\nB: {\"rotation_angle\": -92.49508697379828, \"translation_dx\": 63.09853740086383, \"translation_dy\": 99.47995409556995, \"scale\": 0.9495145406508286}\nC: {\"rotation_angle\": -46.75272698463425, \"translation_dx\": 16.424107524155175, \"translation_dy\": -60.683488552754085, \"scale\": 1.375025476214386}\nD: {\"rotation_angle\": 99.38174871704592, \"translation_dx\": 57.870588734166205, \"translation_dy\": 17.413162007690403, \"scale\": 1.4113398114931053}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_189_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_189_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nB: {\"rotation_angle\": 49.896013394485834, \"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nC: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}\nD: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 1.3327657238113826}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your 
task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -42.98651909317854, \"translation_dx\": 114.49293313374625, \"translation_dy\": -39.53290228333596, \"scale\": 1.442019387031135}\nB: {\"rotation_angle\": 49.896013394485834, \"translation_dx\": -25.763756683237403, \"translation_dy\": -26.432232271484168, \"scale\": 1.1619310734744932}\nC: {\"rotation_angle\": 97.63348280388993, \"translation_dx\": 59.62332527691919, \"translation_dy\": 12.549462794922746, \"scale\": 0.6927080624806098}\nD: {\"rotation_angle\": -79.27003163090343, \"translation_dx\": 8.207736130313549, \"translation_dy\": 6.670417118750038, \"scale\": 1.3327657238113826}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_190_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_190_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nB: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}\nC: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nD: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, 
translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nB: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}\nC: {\"rotation_angle\": 123.61853421760617, \"translation_dx\": -93.63136806510369, \"translation_dy\": -15.65687765252683, \"scale\": 0.9834422929774667}\nD: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_191_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_191_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}\nB: {\"rotation_angle\": -99.80397961792426, \"translation_dx\": 113.2252387398062, \"translation_dy\": -61.846052830557056, \"scale\": 1.080357872583317}\nC: {\"rotation_angle\": -84.90425841207441, \"translation_dx\": -96.22975116611923, \"translation_dy\": -54.13037688992304, \"scale\": 1.161476925450186}\nD: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., 
rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -78.36766094840773, \"translation_dx\": -86.41466180609471, \"translation_dy\": 63.19530077419013, \"scale\": 0.608403973907593}\nB: {\"rotation_angle\": -99.80397961792426, \"translation_dx\": 113.2252387398062, \"translation_dy\": -61.846052830557056, \"scale\": 1.080357872583317}\nC: {\"rotation_angle\": -84.90425841207441, \"translation_dx\": -96.22975116611923, \"translation_dy\": -54.13037688992304, \"scale\": 1.161476925450186}\nD: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_192_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_192_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": -44.645046657688624, \"scale\": 1.4332006009229632}\nB: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nC: {\"rotation_angle\": 12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nD: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and after a spatial 
transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 99.4759866737457, \"translation_dx\": -117.67383777244245, \"translation_dy\": -44.645046657688624, \"scale\": 1.4332006009229632}\nB: {\"rotation_angle\": -162.31682909306286, \"translation_dx\": 94.60975693720637, \"translation_dy\": -28.569332128995313, \"scale\": 1.1251281587345527}\nC: {\"rotation_angle\": 12.872370969250312, \"translation_dx\": -43.1533458138392, \"translation_dy\": -64.88511529320917, \"scale\": 1.3092068537816153}\nD: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_193_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_193_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 137.29869982747988, \"translation_dx\": 75.41375097241084, \"translation_dy\": 55.66358575693553, \"scale\": 1.1335508281242805}\nB: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}\nC: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nD: {\"rotation_angle\": -6.970858631484532, \"translation_dx\": -2.793256631611797, \"translation_dy\": 83.08133552847667, \"scale\": 1.4237697720578382}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting scenes before and 
after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 137.29869982747988, \"translation_dx\": 75.41375097241084, \"translation_dy\": 55.66358575693553, \"scale\": 1.1335508281242805}\nB: {\"rotation_angle\": 83.8873422171626, \"translation_dx\": -89.51171417178318, \"translation_dy\": 44.525876215713694, \"scale\": 0.7096671999666376}\nC: {\"rotation_angle\": -13.219279868292688, \"translation_dx\": -95.87022677446828, \"translation_dy\": -58.31347876468597, \"scale\": 1.3722022398508045}\nD: {\"rotation_angle\": -6.970858631484532, \"translation_dx\": -2.793256631611797, \"translation_dy\": 83.08133552847667, \"scale\": 1.4237697720578382}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_194_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_194_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nB: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nC: {\"rotation_angle\": -92.49508697379828, \"translation_dx\": 63.09853740086383, \"translation_dy\": 99.47995409556995, \"scale\": 0.9495145406508286}\nD: {\"rotation_angle\": -61.308258156024195, \"translation_dx\": -92.42627707406731, \"translation_dy\": -21.076199203141364, \"scale\": 1.1133621977071444}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of images depicting 
scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -100.94596249363259, \"translation_dx\": 18.493532966543597, \"translation_dy\": -4.904135882610319, \"scale\": 1.1575890826518318}\nB: {\"rotation_angle\": -44.30781692045639, \"translation_dx\": -23.473696812537305, \"translation_dy\": -94.42952089946652, \"scale\": 1.4029179362735564}\nC: {\"rotation_angle\": -92.49508697379828, \"translation_dx\": 63.09853740086383, \"translation_dy\": 99.47995409556995, \"scale\": 0.9495145406508286}\nD: {\"rotation_angle\": -61.308258156024195, \"translation_dx\": -92.42627707406731, \"translation_dy\": -21.076199203141364, \"scale\": 1.1133621977071444}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_195_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_195_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}\nB: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nC: {\"rotation_angle\": 48.71833122181758, \"translation_dx\": -105.22683210092106, \"translation_dy\": -63.34096559919908, \"scale\": 0.7204478932238769}\nD: {\"rotation_angle\": 88.199522854527, \"translation_dx\": 18.814421533590917, \"translation_dy\": -27.135307313502466, \"scale\": 1.37855935527965}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given pairs of 
images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}\nB: {\"rotation_angle\": 111.11665430921613, \"translation_dx\": -45.526232266105865, \"translation_dy\": -71.56835409165808, \"scale\": 0.5234271564227445}\nC: {\"rotation_angle\": 48.71833122181758, \"translation_dx\": -105.22683210092106, \"translation_dy\": -63.34096559919908, \"scale\": 0.7204478932238769}\nD: {\"rotation_angle\": 88.199522854527, \"translation_dx\": 18.814421533590917, \"translation_dy\": -27.135307313502466, \"scale\": 1.37855935527965}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_196_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_196_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": 179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nD: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": "Given 
pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": 142.66976946716716, \"translation_dx\": 29.963541003119957, \"translation_dy\": 66.07065092305665, \"scale\": 1.42144068359999}\nB: {\"rotation_angle\": 53.86809011441332, \"translation_dx\": -15.131168518097624, \"translation_dy\": -31.300037391593577, \"scale\": 1.3154620606808156}\nC: {\"rotation_angle\": 179.8013352752547, \"translation_dx\": -90.5548533247824, \"translation_dy\": 17.23782922418306, \"scale\": 0.9885365626195518}\nD: {\"rotation_angle\": 148.22875373623708, \"translation_dx\": 53.75338658972072, \"translation_dy\": -63.78583022927253, \"scale\": 0.9304836306567924}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_197_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_197_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}\nB: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nC: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nD: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}", "question": "Please compute the type and parameters of the spatial transformation between these two images.", "context": 
"Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -23.247975965134003, \"translation_dx\": 108.97564353658032, \"translation_dy\": 27.267413374938258, \"scale\": 1.2292170424899498}\nB: {\"rotation_angle\": 46.42160956908356, \"translation_dx\": -90.04619228512212, \"translation_dy\": -15.749486436572411, \"scale\": 1.005156310055277}\nC: {\"rotation_angle\": 26.051749493295517, \"translation_dx\": 8.674153667650117, \"translation_dy\": 81.98381249796742, \"scale\": 1.4721363798843865}\nD: {\"rotation_angle\": -160.6395227566207, \"translation_dx\": 53.66643366551958, \"translation_dy\": -27.712376159428388, \"scale\": 1.1084051689599654}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_198_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_198_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_Spatial_Transformation_Estimation", "visual_input_component": "natural image", "source": "COCO_spatial", "options": "A: {\"rotation_angle\": -70.97525301082955, \"translation_dx\": -28.380848037876873, \"translation_dy\": 54.37723426674512, \"scale\": 0.9024922197892329}\nB: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, \"scale\": 1.0041476956174793}\nC: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nD: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}", "question": "Please compute the type and parameters of the spatial transformation between these two 
images.", "context": "Given pairs of images depicting scenes before and after a spatial transformation (e.g., rotation, translation), your task is to predict the type and magnitude of the transformation that occurred. \nSelect from the following choices.\nA: {\"rotation_angle\": -70.97525301082955, \"translation_dx\": -28.380848037876873, \"translation_dy\": 54.37723426674512, \"scale\": 0.9024922197892329}\nB: {\"rotation_angle\": -5.816806483512181, \"translation_dx\": -70.40329792935935, \"translation_dy\": -21.418007440252175, \"scale\": 1.0041476956174793}\nC: {\"rotation_angle\": 163.34031080178892, \"translation_dx\": -21.567151354845635, \"translation_dy\": -30.72615389540148, \"scale\": 1.2439888416024685}\nD: {\"rotation_angle\": -14.958482221349612, \"translation_dx\": 49.62118662103501, \"translation_dy\": -13.943537967490855, \"scale\": 1.489574484959727}", "input_image_path": ["./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_199_0.jpg", "./2D-spatial/Image_Spatial_Transformation_Estimation/Image_Spatial_Transformation_Estimation_199_1.jpg"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/Image_text_retrieval_with_Spatial_Context/qwen3-vl/metadata_info.json b/results/Image_text_retrieval_with_Spatial_Context/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..b7c50be --- /dev/null +++ b/results/Image_text_retrieval_with_Spatial_Context/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched 
image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_0_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_1_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_2_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_3_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_4_17.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_5_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th 
image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_11.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_6_17.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_7_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_8_17.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_9_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_10_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_11_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: 
The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_11.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_12_17.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_13_17.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_14_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: 
The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_11.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_15_17.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_16_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_17_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_18_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_19_17.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_20_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_21_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_22_17.jpg"], "output": "F", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_23_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_24_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_25_17.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_26_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_27_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_28_17.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_29_17.jpg"], "output": "H", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_30_17.jpg"], "output": "H", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_31_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_32_17.jpg"], "output": "H", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_33_7.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: 
The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_11.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_34_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_35_17.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_36_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_37_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_38_17.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_39_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: 
The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_11.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_40_17.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_41_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_42_7.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: 
The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_43_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_44_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_45_17.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_46_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_47_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_48_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_49_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_50_17.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_51_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_52_17.jpg"], "output": "F", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_53_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_54_17.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_55_17.jpg"], "output": "F", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_56_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_57_17.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_58_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_59_17.jpg"], "output": "F", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_60_17.jpg"], "output": "H", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_61_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_62_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_63_17.jpg"], "output": "H", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_64_17.jpg"], "output": "F", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_65_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_66_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_67_17.jpg"], "output": "H", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_68_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_69_17.jpg"], "output": "F", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_70_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: 
The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_71_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_72_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_73_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_74_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_75_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: 
The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_11.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_76_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_77_7.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_78_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_79_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_80_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_81_7.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_82_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_83_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_84_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_85_17.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_86_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_87_17.jpg"], "output": "F", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_88_17.jpg"], "output": "H", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_89_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_90_17.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_91_17.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_92_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_93_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_94_17.jpg"], "output": "H", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_95_7.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: 
The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_11.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_96_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_97_17.jpg"], "output": "H", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_98_17.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_99_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_100_17.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_101_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", 
"options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_11.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_102_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_103_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_104_7.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_105_17.jpg"], "output": "H", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_106_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_107_17.jpg"], "output": "F", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_108_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_109_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_110_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_111_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_112_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_113_17.jpg"], "output": "F", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_114_17.jpg"], "output": "H", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_115_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_116_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_117_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", 
"options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_11.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_118_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_119_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_120_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_121_17.jpg"], "output": "F", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_122_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_123_17.jpg"], "output": "H", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_124_17.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_125_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_126_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_127_17.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_128_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_129_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_130_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_131_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_132_17.jpg"], "output": "F", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_133_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_134_7.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_135_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_136_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_137_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_138_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_139_17.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_140_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_141_17.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_142_17.jpg"], "output": "H", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_143_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_144_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_145_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_146_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_147_17.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_148_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", 
"options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_11.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_149_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_150_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_151_17.jpg"], "output": "F", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_152_7.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_153_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_154_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_155_17.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_156_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_157_7.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_158_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_159_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", 
"options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_160_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_161_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_162_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_163_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_164_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_165_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_166_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", 
"options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_11.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_167_17.jpg"], "output": "H", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_168_17.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_169_17.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_170_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_171_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", 
"options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_11.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_172_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_173_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_174_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_175_17.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_176_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", 
"options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_11.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_177_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_178_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_179_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", 
"options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_11.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_180_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_181_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_182_7.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", 
"options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_11.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_183_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_184_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_185_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_186_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_187_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_188_17.jpg"], "output": "H", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_189_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_190_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_191_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_192_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_193_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_194_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_195_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_196_17.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_197_17.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_6.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_14.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_198_17.jpg"], "output": "F", "qwen3-vl": "image none"}, {"task": "Image_text_retrieval_with_Spatial_Context", "visual_input_component": "synthetic image", "source": "SPEC", "options": "A: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "question": "Please retrieve the matching image to the query text in the candidate images.", "context": "Your task is : Given a text addressing spatial context, identify the matched image within candidates. 
The input images are the first 9 images\nSelect from the following choices.\nA: The 10th image\nB: The 11th image\nC: The 12th image\nD: The 13th image\nE: The 14th image\nF: The 15th image\nG: The 16th image\nH: The 17th image\nI: The 18th image", "input_image_path": ["./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_0.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_1.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_2.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_3.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_4.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_5.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_6.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_7.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_8.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_9.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_10.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_11.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_12.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_13.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_14.jpg", 
"./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_15.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_16.jpg", "./2D-spatial/Image_text_retrieval_with_Spatial_Context/Image_text_retrieval_with_Spatial_Context_199_17.jpg"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/Multiview_Action_Recognition/qwen3-vl/metadata_info.json b/results/Multiview_Action_Recognition/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..59f767a --- /dev/null +++ b/results/Multiview_Action_Recognition/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: read a book\nB: drink water\nC: ride a bike\nD: play guitar", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: read a book\nB: drink water\nC: ride a bike\nD: play guitar", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_0_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_0_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_0_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_0_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_0_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_0_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_0_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_0_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_0_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_0_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_0_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_0_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: running\nB: sitting down\nC: lying down\nD: standing up", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: running\nB: sitting down\nC: lying down\nD: standing up", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_1_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_1_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_1_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_1_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_1_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_1_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_1_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_1_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_1_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_1_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_1_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_1_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: lying down\nB: standing up\nC: sitting down\nD: jumping", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: lying down\nB: standing up\nC: sitting down\nD: jumping", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_2_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_2_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_2_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_2_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_2_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_2_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_2_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_2_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_2_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_2_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_2_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_2_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: ride bicycle\nB: play guitar\nC: write letter\nD: eat meal", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride bicycle\nB: play guitar\nC: write letter\nD: eat meal", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_3_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_3_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_3_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_3_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_3_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_3_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_3_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_3_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_3_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_3_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_3_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_3_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sit\nB: jump\nC: pickup\nD: run", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit\nB: jump\nC: pickup\nD: run", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_4_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_4_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_4_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_4_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_4_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_4_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_4_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_4_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_4_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_4_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_4_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_4_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: make a phone call\nB: play a guitar\nC: ride a bicycle\nD: drink a coffee", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: make a phone call\nB: play a guitar\nC: ride a bicycle\nD: drink a coffee", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_5_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_5_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_5_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_5_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_5_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_5_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_5_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_5_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_5_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_5_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_5_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_5_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: read book\nB: play piano\nC: jog\nD: eat meal", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: read book\nB: play piano\nC: jog\nD: eat meal", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_6_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_6_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_6_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_6_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_6_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_6_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_6_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_6_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_6_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_6_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_6_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_6_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: pickup\nB: sit\nC: run\nD: jump", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: pickup\nB: sit\nC: run\nD: jump", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_7_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_7_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_7_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_7_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_7_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_7_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_7_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_7_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_7_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_7_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_7_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_7_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: running\nB: sleeping\nC: dancing\nD: reading", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: running\nB: sleeping\nC: dancing\nD: reading", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_8_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_8_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_8_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_8_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_8_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_8_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_8_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_8_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_8_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_8_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_8_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_8_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: ride a bicycle\nB: make a phone call\nC: cook a meal\nD: play a piano", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride a bicycle\nB: make a phone call\nC: cook a meal\nD: play a piano", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_9_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_9_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_9_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_9_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_9_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_9_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_9_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_9_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_9_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_9_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_9_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_9_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: read a book\nB: tie shoelaces\nC: check time (from watch)\nD: wave hand", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: read a book\nB: tie shoelaces\nC: check time (from watch)\nD: wave hand", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_10_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_10_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_10_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_10_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_10_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_10_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_10_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_10_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_10_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_10_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_10_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_10_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: raise hand (greeting)\nB: touch chest (stomachache\nC: tie shoelaces (preparing to run)\nD: clap hands (applause)", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: raise hand (greeting)\nB: touch chest (stomachache\nC: tie shoelaces (preparing to run)\nD: clap hands (applause)", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_11_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_11_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_11_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_11_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_11_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_11_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_11_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_11_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_11_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_11_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_11_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_11_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: pickup\nB: sit\nC: jump\nD: run", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: pickup\nB: sit\nC: jump\nD: run", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_12_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_12_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_12_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_12_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_12_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_12_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_12_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_12_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_12_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_12_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_12_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_12_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: reading a book\nB: cooking a meal\nC: writing a letter\nD: brushing teeth", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: reading a book\nB: cooking a meal\nC: writing a letter\nD: brushing teeth", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_13_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_13_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_13_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_13_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_13_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_13_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_13_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_13_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_13_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_13_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_13_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_13_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sit down\nB: jump\nC: wave hand\nD: touch chest (stomachache", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit down\nB: jump\nC: wave hand\nD: touch chest (stomachache", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_14_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_14_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_14_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_14_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_14_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_14_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_14_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_14_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_14_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_14_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_14_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_14_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: standing up\nB: jumping\nC: running\nD: sitting down", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: standing up\nB: jumping\nC: running\nD: sitting down", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_15_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_15_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_15_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_15_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_15_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_15_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_15_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_15_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_15_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_15_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_15_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_15_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: pickup\nB: run\nC: sit down\nD: jump", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: pickup\nB: run\nC: sit down\nD: jump", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_16_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_16_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_16_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_16_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_16_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_16_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_16_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_16_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_16_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_16_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_16_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_16_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: jogging\nB: brushing teeth\nC: eating\nD: reading a book", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jogging\nB: brushing teeth\nC: eating\nD: reading a book", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_17_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_17_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_17_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_17_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_17_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_17_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_17_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_17_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_17_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_17_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_17_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_17_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: eat sandwich\nB: read book\nC: ride bicycle\nD: wear jacket", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat sandwich\nB: read book\nC: ride bicycle\nD: wear jacket", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_18_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_18_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_18_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_18_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_18_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_18_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_18_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_18_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_18_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_18_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_18_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_18_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: take off a hat\nB: tie shoelaces\nC: put on a hat\nD: put on gloves", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: take off a hat\nB: tie shoelaces\nC: put on a hat\nD: put on gloves", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_19_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_19_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_19_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_19_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_19_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_19_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_19_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_19_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_19_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_19_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_19_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_19_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: ride bicycle\nB: wear jacket\nC: read book\nD: cook dinner", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride bicycle\nB: wear jacket\nC: read book\nD: cook dinner", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_20_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_20_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_20_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_20_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_20_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_20_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_20_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_20_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_20_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_20_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_20_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_20_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: ride a bicycle\nB: tie a shoelace\nC: drink water\nD: read a book", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride a bicycle\nB: tie a shoelace\nC: drink water\nD: read a book", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_21_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_21_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_21_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_21_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_21_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_21_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_21_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_21_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_21_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_21_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_21_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_21_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sitting down\nB: jumping\nC: standing up\nD: lying down", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: jumping\nC: standing up\nD: lying down", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_22_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_22_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_22_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_22_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_22_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_22_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_22_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_22_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_22_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_22_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_22_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_22_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: drink water\nB: read a book\nC: tie shoes\nD: climb stairs", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drink water\nB: read a book\nC: tie shoes\nD: climb stairs", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_23_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_23_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_23_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_23_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_23_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_23_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_23_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_23_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_23_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_23_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_23_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_23_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: play a guitar\nB: drink water\nC: ride a bike\nD: write a note", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play a guitar\nB: drink water\nC: ride a bike\nD: write a note", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_24_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_24_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_24_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_24_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_24_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_24_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_24_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_24_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_24_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_24_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_24_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_24_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: take off a hat\nB: put on a hat\nC: pick up a book\nD: tie shoelaces", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: take off a hat\nB: put on a hat\nC: pick up a book\nD: tie shoelaces", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_25_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_25_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_25_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_25_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_25_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_25_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_25_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_25_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_25_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_25_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_25_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_25_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: ride bike\nB: read book\nC: play guitar\nD: eat meal", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride bike\nB: read book\nC: play guitar\nD: eat meal", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_26_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_26_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_26_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_26_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_26_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_26_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_26_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_26_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_26_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_26_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_26_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_26_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: touch chest (stomachache\nB: throw a ball\nC: jump up\nD: tie shoelaces", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: touch chest (stomachache\nB: throw a ball\nC: jump up\nD: tie shoelaces", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_27_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_27_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_27_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_27_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_27_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_27_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_27_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_27_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_27_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_27_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_27_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_27_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: wave\nB: sit down\nC: jump\nD: pickup", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave\nB: sit down\nC: jump\nD: pickup", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_28_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_28_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_28_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_28_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_28_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_28_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_28_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_28_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_28_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_28_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_28_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_28_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: eat sandwich\nB: ride bicycle\nC: wear jacket\nD: play guitar", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat sandwich\nB: ride bicycle\nC: wear jacket\nD: play guitar", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_29_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_29_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_29_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_29_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_29_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_29_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_29_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_29_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_29_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_29_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_29_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_29_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: dancing\nB: reading\nC: sleeping\nD: cooking", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: dancing\nB: reading\nC: sleeping\nD: cooking", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_30_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_30_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_30_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_30_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_30_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_30_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_30_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_30_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_30_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_30_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_30_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_30_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: brushing teeth\nB: washing face\nC: brushing hair\nD: combing hair", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: brushing teeth\nB: washing face\nC: brushing hair\nD: combing hair", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_31_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_31_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_31_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_31_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_31_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_31_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_31_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_31_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_31_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_31_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_31_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_31_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: tie shoelaces\nB: check time (from watch)\nC: drink water\nD: read a book", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoelaces\nB: check time (from watch)\nC: drink water\nD: read a book", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_32_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_32_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_32_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_32_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_32_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_32_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_32_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_32_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_32_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_32_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_32_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_32_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: jump up\nB: touch chest (stomachache\nC: wave hand\nD: sit down", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump up\nB: touch chest (stomachache\nC: wave hand\nD: sit down", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_33_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_33_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_33_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_33_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_33_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_33_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_33_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_33_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_33_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_33_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_33_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_33_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: read a book\nB: tie shoelaces\nC: eat an apple\nD: check time (from watch)", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: read a book\nB: tie shoelaces\nC: eat an apple\nD: check time (from watch)", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_34_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_34_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_34_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_34_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_34_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_34_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_34_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_34_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_34_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_34_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_34_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_34_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: run\nB: drop\nC: jump\nD: sit", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: run\nB: drop\nC: jump\nD: sit", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_35_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_35_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_35_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_35_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_35_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_35_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_35_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_35_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_35_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_35_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_35_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_35_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: riding a bike\nB: baking a cake\nC: brushing teeth\nD: playing a guitar", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: riding a bike\nB: baking a cake\nC: brushing teeth\nD: playing a guitar", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_36_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_36_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_36_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_36_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_36_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_36_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_36_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_36_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_36_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_36_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_36_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_36_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: write on a board\nB: tie shoelaces\nC: check time (from watch)\nD: drink water", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: write on a board\nB: tie shoelaces\nC: check time (from watch)\nD: drink water", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_37_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_37_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_37_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_37_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_37_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_37_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_37_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_37_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_37_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_37_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_37_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_37_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: touch chest (stomachache\nB: clapping hands\nC: tying shoes\nD: jumping in place", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: touch chest (stomachache\nB: clapping hands\nC: tying shoes\nD: jumping in place", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_38_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_38_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_38_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_38_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_38_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_38_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_38_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_38_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_38_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_38_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_38_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_38_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sitting down\nB: standing up\nC: jumping\nD: running", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: standing up\nC: jumping\nD: running", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_39_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_39_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_39_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_39_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_39_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_39_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_39_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_39_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_39_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_39_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_39_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_39_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: play guitar\nB: run\nC: sleep\nD: eat meal", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play guitar\nB: run\nC: sleep\nD: eat meal", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_40_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_40_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_40_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_40_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_40_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_40_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_40_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_40_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_40_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_40_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_40_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_40_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: play piano\nB: eat meal\nC: paint picture\nD: ride bicycle", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play piano\nB: eat meal\nC: paint picture\nD: ride bicycle", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_41_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_41_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_41_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_41_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_41_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_41_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_41_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_41_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_41_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_41_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_41_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_41_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: cooking\nB: reading\nC: dancing\nD: running", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: cooking\nB: reading\nC: dancing\nD: running", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_42_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_42_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_42_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_42_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_42_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_42_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_42_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_42_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_42_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_42_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_42_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_42_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: jumping\nB: sitting down\nC: lying down\nD: standing up", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jumping\nB: sitting down\nC: lying down\nD: standing up", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_43_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_43_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_43_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_43_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_43_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_43_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_43_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_43_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_43_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_43_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_43_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_43_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: ride a bike\nB: eat a sandwich\nC: make a phone call\nD: tie a shoe", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride a bike\nB: eat a sandwich\nC: make a phone call\nD: tie a shoe", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_44_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_44_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_44_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_44_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_44_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_44_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_44_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_44_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_44_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_44_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_44_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_44_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: jump\nB: pickup\nC: sit down\nD: wave", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump\nB: pickup\nC: sit down\nD: wave", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_45_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_45_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_45_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_45_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_45_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_45_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_45_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_45_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_45_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_45_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_45_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_45_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: playing guitar\nB: tieing shoes\nC: drinking water\nD: brushing teeth", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: playing guitar\nB: tieing shoes\nC: drinking water\nD: brushing teeth", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_46_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_46_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_46_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_46_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_46_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_46_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_46_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_46_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_46_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_46_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_46_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_46_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: paint a picture\nB: eat meal\nC: run a marathon\nD: play a musical instrument", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: paint a picture\nB: eat meal\nC: run a marathon\nD: play a musical instrument", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_47_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_47_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_47_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_47_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_47_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_47_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_47_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_47_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_47_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_47_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_47_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_47_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: wave hand\nB: check time (from watch)\nC: tie shoelaces\nD: drink water", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave hand\nB: check time (from watch)\nC: tie shoelaces\nD: drink water", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_48_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_48_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_48_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_48_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_48_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_48_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_48_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_48_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_48_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_48_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_48_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_48_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: brushing teeth\nB: tying shoes\nC: cooking food\nD: watering plants", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: brushing teeth\nB: tying shoes\nC: cooking food\nD: watering plants", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_49_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_49_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_49_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_49_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_49_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_49_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_49_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_49_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_49_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_49_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_49_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_49_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: put on a hat\nB: take off a hat\nC: button a shirt\nD: tie a shoelace", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: put on a hat\nB: take off a hat\nC: button a shirt\nD: tie a shoelace", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_50_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_50_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_50_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_50_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_50_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_50_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_50_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_50_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_50_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_50_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_50_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_50_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: tie a shoe\nB: make a phone call\nC: play a guitar\nD: cook a meal", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie a shoe\nB: make a phone call\nC: play a guitar\nD: cook a meal", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_51_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_51_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_51_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_51_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_51_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_51_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_51_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_51_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_51_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_51_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_51_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_51_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: drop\nB: jump\nC: run\nD: sit", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drop\nB: jump\nC: run\nD: sit", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_52_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_52_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_52_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_52_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_52_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_52_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_52_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_52_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_52_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_52_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_52_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_52_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: wave hand\nB: scratch head\nC: touch chest (stomachache\nD: jump up and down", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave hand\nB: scratch head\nC: touch chest (stomachache\nD: jump up and down", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_53_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_53_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_53_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_53_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_53_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_53_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_53_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_53_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_53_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_53_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_53_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_53_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: lying down\nB: standing up\nC: running\nD: sitting down", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: lying down\nB: standing up\nC: running\nD: sitting down", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_54_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_54_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_54_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_54_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_54_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_54_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_54_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_54_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_54_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_54_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_54_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_54_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: pickup\nB: run\nC: jump\nD: sit", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: pickup\nB: run\nC: jump\nD: sit", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_55_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_55_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_55_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_55_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_55_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_55_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_55_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_55_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_55_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_55_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_55_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_55_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: wipe face\nB: snap fingers\nC: brush hair\nD: tie shoelace", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wipe face\nB: snap fingers\nC: brush hair\nD: tie shoelace", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_56_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_56_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_56_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_56_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_56_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_56_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_56_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_56_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_56_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_56_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_56_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_56_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sleeping\nB: dancing\nC: reading\nD: running", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sleeping\nB: dancing\nC: reading\nD: running", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_57_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_57_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_57_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_57_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_57_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_57_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_57_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_57_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_57_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_57_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_57_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_57_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: wear jacket\nB: sit down\nC: jump\nD: tie shoelaces", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wear jacket\nB: sit down\nC: jump\nD: tie shoelaces", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_58_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_58_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_58_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_58_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_58_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_58_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_58_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_58_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_58_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_58_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_58_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_58_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sitting down\nB: jumping\nC: standing up\nD: lying down", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: jumping\nC: standing up\nD: lying down", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_59_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_59_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_59_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_59_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_59_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_59_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_59_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_59_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_59_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_59_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_59_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_59_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: ride a bike\nB: read a book\nC: eat meal\nD: play a musical instrument", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride a bike\nB: read a book\nC: eat meal\nD: play a musical instrument", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_60_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_60_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_60_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_60_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_60_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_60_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_60_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_60_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_60_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_60_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_60_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_60_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: touch back (backache)\nB: clap hands\nC: sit down\nD: jump", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: touch back (backache)\nB: clap hands\nC: sit down\nD: jump", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_61_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_61_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_61_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_61_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_61_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_61_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_61_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_61_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_61_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_61_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_61_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_61_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sitting down\nB: lying down\nC: standing up\nD: jumping", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: lying down\nC: standing up\nD: jumping", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_62_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_62_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_62_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_62_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_62_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_62_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_62_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_62_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_62_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_62_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_62_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_62_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: eat meal\nB: write letter\nC: ride bicycle\nD: play guitar", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat meal\nB: write letter\nC: ride bicycle\nD: play guitar", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_63_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_63_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_63_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_63_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_63_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_63_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_63_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_63_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_63_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_63_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_63_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_63_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: wave hand\nB: pick up a book\nC: wipe face\nD: tie shoelaces", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave hand\nB: pick up a book\nC: wipe face\nD: tie shoelaces", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_64_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_64_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_64_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_64_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_64_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_64_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_64_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_64_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_64_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_64_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_64_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_64_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: read a book\nB: play a guitar\nC: make a phone call\nD: cook a meal", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: read a book\nB: play a guitar\nC: make a phone call\nD: cook a meal", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_65_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_65_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_65_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_65_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_65_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_65_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_65_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_65_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_65_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_65_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_65_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_65_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: read a book\nB: make a phone call\nC: eat a meal\nD: play a video game", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: read a book\nB: make a phone call\nC: eat a meal\nD: play a video game", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_66_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_66_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_66_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_66_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_66_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_66_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_66_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_66_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_66_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_66_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_66_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_66_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: drop\nB: jump\nC: pick\nD: hold", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drop\nB: jump\nC: pick\nD: hold", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_67_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_67_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_67_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_67_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_67_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_67_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_67_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_67_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_67_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_67_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_67_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_67_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: climb ladder\nB: kick ball\nC: tie shoe\nD: wipe face", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: climb ladder\nB: kick ball\nC: tie shoe\nD: wipe face", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_68_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_68_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_68_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_68_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_68_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_68_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_68_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_68_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_68_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_68_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_68_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_68_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: dancing\nB: reading\nC: cooking\nD: sleeping", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: dancing\nB: reading\nC: cooking\nD: sleeping", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_69_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_69_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_69_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_69_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_69_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_69_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_69_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_69_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_69_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_69_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_69_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_69_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sitting down\nB: running\nC: standing up\nD: jumping", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: running\nC: standing up\nD: jumping", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_70_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_70_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_70_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_70_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_70_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_70_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_70_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_70_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_70_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_70_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_70_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_70_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sleeping\nB: dancing\nC: cooking\nD: reading", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sleeping\nB: dancing\nC: cooking\nD: reading", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_71_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_71_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_71_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_71_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_71_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_71_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_71_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_71_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_71_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_71_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_71_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_71_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: running\nB: jumping\nC: sitting down\nD: standing up", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: running\nB: jumping\nC: sitting down\nD: standing up", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_72_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_72_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_72_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_72_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_72_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_72_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_72_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_72_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_72_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_72_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_72_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_72_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: drop\nB: sit\nC: run\nD: jump", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drop\nB: sit\nC: run\nD: jump", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_73_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_73_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_73_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_73_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_73_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_73_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_73_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_73_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_73_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_73_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_73_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_73_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: brushing teeth\nB: riding a bicycle\nC: cooking dinner\nD: tying shoelaces", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: brushing teeth\nB: riding a bicycle\nC: cooking dinner\nD: tying shoelaces", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_74_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_74_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_74_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_74_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_74_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_74_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_74_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_74_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_74_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_74_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_74_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_74_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: wave hand\nB: tie shoelace\nC: clap hands\nD: wipe face", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave hand\nB: tie shoelace\nC: clap hands\nD: wipe face", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_75_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_75_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_75_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_75_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_75_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_75_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_75_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_75_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_75_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_75_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_75_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_75_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: clap hands\nB: wave hand\nC: wipe face\nD: tie shoelace", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: clap hands\nB: wave hand\nC: wipe face\nD: tie shoelace", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_76_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_76_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_76_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_76_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_76_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_76_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_76_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_76_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_76_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_76_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_76_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_76_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: dancing\nB: sleeping\nC: reading\nD: cooking", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: dancing\nB: sleeping\nC: reading\nD: cooking", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_77_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_77_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_77_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_77_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_77_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_77_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_77_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_77_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_77_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_77_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_77_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_77_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: ride bicycle\nB: play guitar\nC: climb ladder\nD: drink water", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride bicycle\nB: play guitar\nC: climb ladder\nD: drink water", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_78_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_78_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_78_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_78_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_78_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_78_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_78_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_78_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_78_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_78_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_78_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_78_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sleeping\nB: reading\nC: dancing\nD: running", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sleeping\nB: reading\nC: dancing\nD: running", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_79_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_79_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_79_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_79_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_79_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_79_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_79_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_79_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_79_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_79_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_79_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_79_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: wipe face\nB: tie shoelaces\nC: brush hair\nD: write a note", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wipe face\nB: tie shoelaces\nC: brush hair\nD: write a note", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_80_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_80_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_80_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_80_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_80_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_80_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_80_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_80_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_80_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_80_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_80_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_80_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: play a guitar\nB: read a book\nC: tie shoelaces\nD: check time (from watch)", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play a guitar\nB: read a book\nC: tie shoelaces\nD: check time (from watch)", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_81_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_81_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_81_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_81_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_81_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_81_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_81_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_81_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_81_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_81_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_81_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_81_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: put on a hat\nB: open a door\nC: tie shoelaces\nD: take off a hat", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: put on a hat\nB: open a door\nC: tie shoelaces\nD: take off a hat", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_82_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_82_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_82_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_82_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_82_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_82_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_82_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_82_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_82_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_82_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_82_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_82_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: eat food\nB: brush hair\nC: wipe face\nD: tie shoelaces", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat food\nB: brush hair\nC: wipe face\nD: tie shoelaces", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_83_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_83_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_83_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_83_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_83_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_83_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_83_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_83_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_83_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_83_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_83_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_83_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: standing up\nB: jumping\nC: sitting down\nD: running", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: standing up\nB: jumping\nC: sitting down\nD: running", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_84_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_84_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_84_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_84_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_84_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_84_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_84_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_84_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_84_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_84_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_84_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_84_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: adjust glasses\nB: check time (from watch)\nC: wave hand\nD: tie shoelaces", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: adjust glasses\nB: check time (from watch)\nC: wave hand\nD: tie shoelaces", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_85_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_85_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_85_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_85_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_85_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_85_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_85_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_85_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_85_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_85_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_85_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_85_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: reading\nB: swimming\nC: cooking\nD: dancing", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: reading\nB: swimming\nC: cooking\nD: dancing", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_86_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_86_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_86_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_86_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_86_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_86_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_86_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_86_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_86_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_86_9.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sit\nB: run\nC: drop\nD: jump", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit\nB: run\nC: drop\nD: jump", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_87_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_87_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_87_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_87_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_87_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_87_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_87_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_87_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_87_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_87_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_87_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_87_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: jump\nB: wave hand\nC: tie shoelaces\nD: wipe face", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump\nB: wave hand\nC: tie shoelaces\nD: wipe face", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_88_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_88_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_88_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_88_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_88_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_88_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_88_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_88_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_88_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_88_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_88_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_88_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: make a phone call\nB: eat a meal\nC: play a musical instrument\nD: tie a shoelace", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: make a phone call\nB: eat a meal\nC: play a musical instrument\nD: tie a shoelace", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_89_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_89_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_89_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_89_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_89_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_89_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_89_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_89_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_89_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_89_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_89_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_89_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: drop\nB: sit\nC: jump\nD: run", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drop\nB: sit\nC: jump\nD: run", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_90_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_90_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_90_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_90_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_90_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_90_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_90_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_90_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_90_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_90_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_90_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_90_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: dancing\nB: reading\nC: jumping\nD: running", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: dancing\nB: reading\nC: jumping\nD: running", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_91_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_91_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_91_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_91_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_91_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_91_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_91_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_91_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_91_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_91_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_91_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_91_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: bake a cake\nB: play a guitar\nC: ride a bike\nD: wear jacket", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: bake a cake\nB: play a guitar\nC: ride a bike\nD: wear jacket", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_92_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_92_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_92_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_92_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_92_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_92_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_92_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_92_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_92_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_92_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_92_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_92_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: jumping\nB: touch back (backache)\nC: running\nD: sitting", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jumping\nB: touch back (backache)\nC: running\nD: sitting", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_93_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_93_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_93_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_93_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_93_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_93_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_93_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_93_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_93_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_93_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_93_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_93_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: jumping\nB: sitting down\nC: running\nD: standing up", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jumping\nB: sitting down\nC: running\nD: standing up", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_94_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_94_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_94_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_94_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_94_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_94_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_94_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_94_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_94_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_94_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_94_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_94_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: write on blackboard\nB: touch chest (stomachache\nC: jump\nD: sit down", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: write on blackboard\nB: touch chest (stomachache\nC: jump\nD: sit down", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_95_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_95_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_95_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_95_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_95_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_95_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_95_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_95_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_95_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_95_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_95_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_95_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: remove jacket\nB: tie shoelaces\nC: wear jacket\nD: sit down", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: remove jacket\nB: tie shoelaces\nC: wear jacket\nD: sit down", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_96_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_96_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_96_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_96_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_96_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_96_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_96_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_96_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_96_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_96_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_96_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_96_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: put on a coat\nB: take off a hat\nC: put on a hat\nD: tie shoelaces", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: put on a coat\nB: take off a hat\nC: put on a hat\nD: tie shoelaces", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_97_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_97_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_97_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_97_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_97_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_97_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_97_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_97_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_97_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_97_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_97_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_97_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: jumping\nB: lying down\nC: sitting down\nD: standing up", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jumping\nB: lying down\nC: sitting down\nD: standing up", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_98_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_98_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_98_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_98_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_98_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_98_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_98_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_98_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_98_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_98_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_98_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_98_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: play guitar\nB: tie shoelaces\nC: cook meal\nD: wear jacket", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play guitar\nB: tie shoelaces\nC: cook meal\nD: wear jacket", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_99_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_99_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_99_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_99_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_99_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_99_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_99_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_99_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_99_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_99_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_99_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_99_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sit\nB: jump\nC: drop\nD: run", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit\nB: jump\nC: drop\nD: run", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_100_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_100_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_100_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_100_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_100_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_100_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_100_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_100_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_100_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_100_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_100_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_100_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: standing up\nB: jumping\nC: sitting down\nD: lying down", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: standing up\nB: jumping\nC: sitting down\nD: lying down", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_101_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_101_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_101_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_101_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_101_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_101_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_101_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_101_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_101_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_101_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_101_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_101_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: brushing teeth\nB: playing basketball\nC: dancing\nD: cooking", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: brushing teeth\nB: playing basketball\nC: dancing\nD: cooking", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_102_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_102_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_102_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_102_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_102_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_102_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_102_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_102_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_102_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_102_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_102_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_102_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: jump\nB: sit\nC: run\nD: bow", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump\nB: sit\nC: run\nD: bow", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_103_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_103_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_103_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_103_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_103_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_103_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_103_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_103_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_103_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_103_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_103_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_103_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: clap\nB: drop\nC: run\nD: jump", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: clap\nB: drop\nC: run\nD: jump", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_104_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_104_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_104_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_104_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_104_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_104_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_104_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_104_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_104_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_104_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_104_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_104_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: remove shoes\nB: wear shoes\nC: remove jacket\nD: wear jacket", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: remove shoes\nB: wear shoes\nC: remove jacket\nD: wear jacket", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_105_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_105_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_105_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_105_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_105_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_105_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_105_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_105_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_105_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_105_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_105_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_105_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: tie a shoelace\nB: eat a sandwich\nC: put on a hat\nD: throw a ball", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie a shoelace\nB: eat a sandwich\nC: put on a hat\nD: throw a ball", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_106_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_106_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_106_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_106_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_106_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_106_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_106_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_106_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_106_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_106_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_106_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_106_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: check time (from watch)\nB: drink water\nC: tie shoelace\nD: wave hand", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: check time (from watch)\nB: drink water\nC: tie shoelace\nD: wave hand", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_107_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_107_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_107_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_107_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_107_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_107_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_107_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_107_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_107_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_107_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_107_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_107_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sit down\nB: jump up\nC: take off hat\nD: wear jacket", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit down\nB: jump up\nC: take off hat\nD: wear jacket", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_108_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_108_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_108_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_108_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_108_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_108_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_108_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_108_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_108_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_108_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_108_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_108_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: wave hand\nB: clap hands\nC: tie shoe\nD: wipe face", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave hand\nB: clap hands\nC: tie shoe\nD: wipe face", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_109_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_109_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_109_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_109_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_109_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_109_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_109_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_109_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_109_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_109_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_109_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_109_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: kick a ball\nB: take off a hat\nC: wave a hand\nD: put on a hat", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: kick a ball\nB: take off a hat\nC: wave a hand\nD: put on a hat", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_110_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_110_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_110_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_110_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_110_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_110_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_110_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_110_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_110_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_110_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_110_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_110_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: tie a shoelace\nB: put on a hat\nC: button a shirt\nD: take off a hat", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie a shoelace\nB: put on a hat\nC: button a shirt\nD: take off a hat", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_111_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_111_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_111_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_111_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_111_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_111_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_111_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_111_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_111_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_111_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_111_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_111_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: drink water\nB: play guitar\nC: tie shoelaces\nD: check time (from watch)", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drink water\nB: play guitar\nC: tie shoelaces\nD: check time (from watch)", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_112_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_112_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_112_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_112_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_112_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_112_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_112_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_112_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_112_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_112_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_112_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_112_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: play football\nB: ride a bike\nC: read a book\nD: eat meal", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play football\nB: ride a bike\nC: read a book\nD: eat meal", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_113_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_113_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_113_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_113_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_113_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_113_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_113_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_113_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_113_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_113_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_113_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_113_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sit\nB: run\nC: jump\nD: drop", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit\nB: run\nC: jump\nD: drop", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_114_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_114_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_114_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_114_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_114_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_114_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_114_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_114_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_114_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_114_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_114_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_114_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: put on glasses\nB: tie a shoelace\nC: take off a hat\nD: put on a hat", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: put on glasses\nB: tie a shoelace\nC: take off a hat\nD: put on a hat", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_115_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_115_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_115_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_115_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_115_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_115_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_115_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_115_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_115_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_115_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_115_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_115_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: tie a shoelace\nB: drink water\nC: ride a bicycle\nD: read a book", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie a shoelace\nB: drink water\nC: ride a bicycle\nD: read a book", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_116_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_116_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_116_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_116_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_116_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_116_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_116_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_116_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_116_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_116_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_116_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_116_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: dance\nB: pickup\nC: sleep\nD: basketball", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: dance\nB: pickup\nC: sleep\nD: basketball", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_117_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_117_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_117_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_117_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_117_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_117_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_117_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_117_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_117_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_117_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_117_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_117_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: eat meal\nB: play guitar\nC: write letter\nD: ride bicycle", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat meal\nB: play guitar\nC: write letter\nD: ride bicycle", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_118_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_118_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_118_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_118_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_118_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_118_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_118_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_118_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_118_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_118_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_118_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_118_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: drink water\nB: read a book\nC: play basketball\nD: ride a bicycle", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drink water\nB: read a book\nC: play basketball\nD: ride a bicycle", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_119_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_119_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_119_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_119_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_119_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_119_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_119_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_119_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_119_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_119_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_119_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_119_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: tie shoe\nB: wave hand\nC: check time (from watch)\nD: pick up phone", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoe\nB: wave hand\nC: check time (from watch)\nD: pick up phone", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_120_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_120_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_120_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_120_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_120_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_120_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_120_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_120_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_120_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_120_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_120_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_120_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: tie a shoelace\nB: put on a hat\nC: adjust a scarf\nD: take off a hat", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie a shoelace\nB: put on a hat\nC: adjust a scarf\nD: take off a hat", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_121_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_121_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_121_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_121_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_121_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_121_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_121_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_121_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_121_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_121_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_121_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_121_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sitting down\nB: lying down\nC: standing up\nD: running", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: lying down\nC: standing up\nD: running", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_122_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_122_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_122_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_122_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_122_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_122_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_122_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_122_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_122_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_122_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_122_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_122_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: read a book\nB: drink water\nC: play guitar\nD: ride a bike", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: read a book\nB: drink water\nC: play guitar\nD: ride a bike", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_123_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_123_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_123_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_123_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_123_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_123_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_123_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_123_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_123_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_123_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_123_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_123_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: tie shoelaces\nB: check time (from watch)\nC: brush hair\nD: eat food", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoelaces\nB: check time (from watch)\nC: brush hair\nD: eat food", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_124_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_124_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_124_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_124_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_124_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_124_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_124_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_124_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_124_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_124_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_124_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_124_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: wave hand\nB: eat\nC: scratch head\nD: wipe face", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave hand\nB: eat\nC: scratch head\nD: wipe face", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_125_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_125_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_125_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_125_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_125_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_125_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_125_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_125_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_125_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_125_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_125_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_125_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: drink water\nB: write notes\nC: play guitar\nD: tie shoes", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drink water\nB: write notes\nC: play guitar\nD: tie shoes", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_126_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_126_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_126_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_126_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_126_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_126_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_126_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_126_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_126_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_126_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_126_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_126_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: make a phone call\nB: write a letter\nC: tie shoes\nD: brush teeth", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: make a phone call\nB: write a letter\nC: tie shoes\nD: brush teeth", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_127_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_127_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_127_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_127_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_127_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_127_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_127_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_127_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_127_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_127_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_127_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_127_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: reading a book\nB: playing basketball\nC: brushing teeth\nD: riding a bicycle", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: reading a book\nB: playing basketball\nC: brushing teeth\nD: riding a bicycle", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_128_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_128_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_128_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_128_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_128_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_128_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_128_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_128_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_128_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_128_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_128_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_128_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: raise hand (question)\nB: touch chest (stomachache\nC: sit down (rest)\nD: step forward (walk)", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: raise hand (question)\nB: touch chest (stomachache\nC: sit down (rest)\nD: step forward (walk)", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_129_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_129_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_129_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_129_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_129_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_129_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_129_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_129_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_129_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_129_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_129_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_129_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: talk on phone\nB: pick up object\nC: tie shoelaces\nD: wipe face", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: talk on phone\nB: pick up object\nC: tie shoelaces\nD: wipe face", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_130_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_130_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_130_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_130_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_130_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_130_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_130_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_130_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_130_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_130_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_130_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_130_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: jump (exercise)\nB: touch chest (stomachache\nC: sit down (rest)\nD: wave hand (greeting)", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump (exercise)\nB: touch chest (stomachache\nC: sit down (rest)\nD: wave hand (greeting)", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_131_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_131_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_131_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_131_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_131_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_131_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_131_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_131_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_131_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_131_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_131_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_131_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: lying down\nB: sitting down\nC: standing up\nD: running", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: lying down\nB: sitting down\nC: standing up\nD: running", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_132_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_132_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_132_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_132_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_132_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_132_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_132_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_132_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_132_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_132_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_132_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_132_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: jumping\nB: sitting\nC: pickup\nD: running", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jumping\nB: sitting\nC: pickup\nD: running", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_133_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_133_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_133_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_133_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_133_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_133_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_133_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_133_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_133_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_133_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_133_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_133_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sit\nB: jump\nC: sleep\nD: pickup", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit\nB: jump\nC: sleep\nD: pickup", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_134_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_134_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_134_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_134_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_134_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_134_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_134_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_134_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_134_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_134_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_134_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_134_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: eat meal\nB: play guitar\nC: write letter\nD: read book", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat meal\nB: play guitar\nC: write letter\nD: read book", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_135_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_135_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_135_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_135_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_135_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_135_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_135_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_135_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_135_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_135_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_135_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_135_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: eat meal\nB: play basketball\nC: walk dog\nD: sleep", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat meal\nB: play basketball\nC: walk dog\nD: sleep", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_136_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_136_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_136_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_136_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_136_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_136_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_136_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_136_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_136_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_136_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_136_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_136_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: dance\nB: read book\nC: play tennis\nD: eat meal", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: dance\nB: read book\nC: play tennis\nD: eat meal", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_137_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_137_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_137_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_137_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_137_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_137_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_137_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_137_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_137_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_137_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_137_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_137_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: playing piano\nB: brushing teeth\nC: riding a bike\nD: cooking dinner", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: playing piano\nB: brushing teeth\nC: riding a bike\nD: cooking dinner", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_138_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_138_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_138_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_138_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_138_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_138_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_138_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_138_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_138_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_138_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_138_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_138_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: run\nB: pickup\nC: jump\nD: sit down", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: run\nB: pickup\nC: jump\nD: sit down", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_139_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_139_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_139_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_139_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_139_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_139_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_139_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_139_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_139_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_139_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_139_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_139_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sit\nB: pickup\nC: jump\nD: run", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit\nB: pickup\nC: jump\nD: run", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_140_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_140_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_140_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_140_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_140_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_140_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_140_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_140_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_140_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_140_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_140_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_140_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: wave a hand\nB: tie a shoe\nC: kick a ball\nD: drink water", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave a hand\nB: tie a shoe\nC: kick a ball\nD: drink water", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_141_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_141_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_141_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_141_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_141_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_141_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_141_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_141_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_141_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_141_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_141_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_141_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: drop\nB: jump\nC: sit\nD: turn", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drop\nB: jump\nC: sit\nD: turn", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_142_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_142_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_142_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_142_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_142_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_142_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_142_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_142_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_142_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_142_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_142_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_142_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: pickup\nB: jump\nC: sit\nD: run", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: pickup\nB: jump\nC: sit\nD: run", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_143_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_143_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_143_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_143_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_143_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_143_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_143_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_143_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_143_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_143_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_143_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_143_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: make a phone call\nB: play a guitar\nC: cook a meal\nD: paint a picture", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: make a phone call\nB: play a guitar\nC: cook a meal\nD: paint a picture", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_144_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_144_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_144_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_144_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_144_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_144_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_144_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_144_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_144_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_144_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_144_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_144_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: check time (from watch)\nB: tie shoelaces\nC: eat an apple\nD: play guitar", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: check time (from watch)\nB: tie shoelaces\nC: eat an apple\nD: play guitar", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_145_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_145_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_145_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_145_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_145_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_145_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_145_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_145_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_145_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_145_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_145_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_145_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: running a marathon\nB: cooking dinner\nC: playing a guitar\nD: brushing teeth", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: running a marathon\nB: cooking dinner\nC: playing a guitar\nD: brushing teeth", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_146_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_146_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_146_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_146_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_146_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_146_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_146_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_146_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_146_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_146_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_146_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_146_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: make a phone call\nB: read a book\nC: ride a bicycle\nD: play a guitar", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: make a phone call\nB: read a book\nC: ride a bicycle\nD: play a guitar", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_147_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_147_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_147_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_147_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_147_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_147_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_147_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_147_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_147_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_147_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_147_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_147_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sit down\nB: wave\nC: put on a hat\nD: take off a hat", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit down\nB: wave\nC: put on a hat\nD: take off a hat", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_148_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_148_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_148_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_148_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_148_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_148_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_148_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_148_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_148_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_148_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_148_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_148_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: put on a hat\nB: tie shoes\nC: lift weights\nD: take off a hat", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: put on a hat\nB: tie shoes\nC: lift weights\nD: take off a hat", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_149_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_149_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_149_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_149_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_149_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_149_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_149_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_149_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_149_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_149_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_149_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_149_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: adjust a tie\nB: take off a hat\nC: put on glasses\nD: put on a hat", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: adjust a tie\nB: take off a hat\nC: put on glasses\nD: put on a hat", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_150_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_150_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_150_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_150_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_150_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_150_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_150_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_150_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_150_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_150_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_150_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_150_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: raise hand (greeting)\nB: jump (excited)\nC: touch chest (stomachache\nD: sit down (tired)", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: raise hand (greeting)\nB: jump (excited)\nC: touch chest (stomachache\nD: sit down (tired)", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_151_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_151_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_151_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_151_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_151_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_151_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_151_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_151_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_151_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_151_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_151_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_151_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: raise hand\nB: touch chest (stomachache\nC: jump in place\nD: bend forward", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: raise hand\nB: touch chest (stomachache\nC: jump in place\nD: bend forward", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_152_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_152_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_152_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_152_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_152_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_152_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_152_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_152_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_152_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_152_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_152_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_152_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: ride bike\nB: read book\nC: play guitar\nD: eat meal", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride bike\nB: read book\nC: play guitar\nD: eat meal", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_153_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_153_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_153_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_153_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_153_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_153_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_153_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_153_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_153_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_153_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_153_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_153_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: eat meal\nB: play guitar\nC: read book\nD: ride bicycle", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat meal\nB: play guitar\nC: read book\nD: ride bicycle", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_154_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_154_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_154_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_154_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_154_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_154_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_154_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_154_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_154_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_154_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_154_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_154_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sleep\nB: read book\nC: eat meal\nD: run", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sleep\nB: read book\nC: eat meal\nD: run", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_155_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_155_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_155_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_155_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_155_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_155_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_155_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_155_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_155_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_155_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_155_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_155_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: play guitar\nB: drink water\nC: jump rope\nD: read a book", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play guitar\nB: drink water\nC: jump rope\nD: read a book", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_156_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_156_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_156_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_156_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_156_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_156_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_156_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_156_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_156_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_156_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_156_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_156_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: drop\nB: spin\nC: run\nD: jump", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: drop\nB: spin\nC: run\nD: jump", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_157_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_157_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_157_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_157_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_157_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_157_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_157_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_157_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_157_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_157_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_157_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_157_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: ride a bicycle\nB: play a guitar\nC: put on a hat\nD: write on a board", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride a bicycle\nB: play a guitar\nC: put on a hat\nD: write on a board", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_158_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_158_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_158_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_158_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_158_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_158_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_158_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_158_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_158_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_158_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_158_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_158_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: clap hands\nB: tie shoelaces\nC: touch back (backache)\nD: jump rope", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: clap hands\nB: tie shoelaces\nC: touch back (backache)\nD: jump rope", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_159_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_159_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_159_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_159_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_159_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_159_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_159_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_159_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_159_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_159_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_159_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_159_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: tie shoelaces\nB: drink water\nC: read a book\nD: play guitar", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoelaces\nB: drink water\nC: read a book\nD: play guitar", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_160_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_160_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_160_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_160_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_160_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_160_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_160_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_160_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_160_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_160_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_160_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_160_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sitting\nB: touch chest (stomachache\nC: jumping\nD: waving", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting\nB: touch chest (stomachache\nC: jumping\nD: waving", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_161_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_161_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_161_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_161_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_161_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_161_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_161_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_161_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_161_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_161_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_161_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_161_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: play tennis\nB: read a book\nC: eat meal\nD: ride a bike", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play tennis\nB: read a book\nC: eat meal\nD: ride a bike", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_162_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_162_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_162_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_162_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_162_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_162_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_162_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_162_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_162_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_162_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_162_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_162_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: lying down\nB: running\nC: sitting down\nD: standing up", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: lying down\nB: running\nC: sitting down\nD: standing up", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_163_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_163_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_163_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_163_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_163_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_163_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_163_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_163_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_163_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_163_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_163_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_163_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: jump\nB: run\nC: sit\nD: drop", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump\nB: run\nC: sit\nD: drop", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_164_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_164_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_164_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_164_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_164_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_164_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_164_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_164_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_164_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_164_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_164_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_164_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: pick up object\nB: wipe face\nC: tie shoes\nD: jump rope", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: pick up object\nB: wipe face\nC: tie shoes\nD: jump rope", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_165_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_165_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_165_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_165_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_165_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_165_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_165_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_165_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_165_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_165_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_165_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_165_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sit\nB: run\nC: jump\nD: bow", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sit\nB: run\nC: jump\nD: bow", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_166_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_166_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_166_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_166_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_166_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_166_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_166_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_166_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_166_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_166_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_166_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_166_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: remove shoes\nB: wear jacket\nC: sit down\nD: drink water", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: remove shoes\nB: wear jacket\nC: sit down\nD: drink water", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_167_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_167_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_167_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_167_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_167_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_167_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_167_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_167_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_167_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_167_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_167_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_167_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: run in place\nB: wave hand\nC: touch chest (stomachache\nD: jump up", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: run in place\nB: wave hand\nC: touch chest (stomachache\nD: jump up", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_168_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_168_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_168_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_168_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_168_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_168_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_168_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_168_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_168_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_168_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_168_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_168_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: tie shoelaces\nB: open a door\nC: brush teeth\nD: check time (from watch)", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoelaces\nB: open a door\nC: brush teeth\nD: check time (from watch)", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_169_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_169_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_169_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_169_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_169_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_169_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_169_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_169_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_169_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_169_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_169_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_169_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: jump rope\nB: play guitar\nC: wipe face\nD: tie shoelaces", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump rope\nB: play guitar\nC: wipe face\nD: tie shoelaces", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_170_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_170_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_170_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_170_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_170_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_170_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_170_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_170_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_170_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_170_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_170_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_170_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sitting down\nB: running\nC: lying down\nD: standing up", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: running\nC: lying down\nD: standing up", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_171_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_171_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_171_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_171_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_171_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_171_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_171_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_171_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_171_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_171_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_171_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_171_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: button a shirt\nB: take off a hat\nC: put on a hat\nD: tie a shoe", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: button a shirt\nB: take off a hat\nC: put on a hat\nD: tie a shoe", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_172_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_172_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_172_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_172_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_172_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_172_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_172_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_172_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_172_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_172_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_172_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_172_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: ride bicycle\nB: play piano\nC: wear jacket\nD: eat apple", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride bicycle\nB: play piano\nC: wear jacket\nD: eat apple", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_173_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_173_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_173_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_173_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_173_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_173_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_173_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_173_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_173_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_173_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_173_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_173_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: brushing teeth\nB: jogging\nC: reading a book\nD: cooking", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: brushing teeth\nB: jogging\nC: reading a book\nD: cooking", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_174_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_174_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_174_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_174_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_174_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_174_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_174_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_174_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_174_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_174_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_174_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_174_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: tie shoelaces\nB: pick up bag\nC: clap hands\nD: check time (from watch)", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoelaces\nB: pick up bag\nC: clap hands\nD: check time (from watch)", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_175_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_175_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_175_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_175_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_175_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_175_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_175_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_175_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_175_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_175_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_175_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_175_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: wave hand\nB: touch back (backache)\nC: eat food\nD: jump", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave hand\nB: touch back (backache)\nC: eat food\nD: jump", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_176_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_176_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_176_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_176_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_176_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_176_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_176_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_176_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_176_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_176_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_176_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_176_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: running\nB: reading\nC: dancing\nD: cooking", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: running\nB: reading\nC: dancing\nD: cooking", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_177_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_177_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_177_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_177_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_177_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_177_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_177_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_177_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_177_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_177_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_177_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_177_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: play guitar\nB: eat meal\nC: dance\nD: read book", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play guitar\nB: eat meal\nC: dance\nD: read book", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_178_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_178_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_178_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_178_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_178_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_178_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_178_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_178_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_178_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_178_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_178_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_178_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: tie shoelaces\nB: make a phone call\nC: brush teeth\nD: write in a notebook", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoelaces\nB: make a phone call\nC: brush teeth\nD: write in a notebook", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_179_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_179_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_179_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_179_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_179_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_179_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_179_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_179_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_179_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_179_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_179_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_179_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: check time (from watch)\nB: tie shoes\nC: take a photo\nD: read a book", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: check time (from watch)\nB: tie shoes\nC: take a photo\nD: read a book", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_180_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_180_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_180_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_180_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_180_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_180_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_180_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_180_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_180_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_180_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_180_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_180_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: eat sandwich\nB: wear jacket\nC: play piano\nD: ride bicycle", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat sandwich\nB: wear jacket\nC: play piano\nD: ride bicycle", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_181_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_181_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_181_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_181_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_181_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_181_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_181_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_181_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_181_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_181_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_181_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_181_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: brushing teeth\nB: cooking\nC: jogging\nD: reading a book", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: brushing teeth\nB: cooking\nC: jogging\nD: reading a book", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_182_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_182_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_182_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_182_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_182_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_182_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_182_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_182_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_182_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_182_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_182_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_182_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: pick up phone\nB: tie shoe\nC: adjust glasses\nD: check time (from watch)", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: pick up phone\nB: tie shoe\nC: adjust glasses\nD: check time (from watch)", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_183_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_183_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_183_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_183_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_183_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_183_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_183_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_183_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_183_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_183_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_183_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_183_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: jumping\nB: sitting down\nC: dancing\nD: running", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jumping\nB: sitting down\nC: dancing\nD: running", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_184_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_184_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_184_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_184_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_184_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_184_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_184_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_184_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_184_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_184_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_184_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_184_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: ride bike\nB: play guitar\nC: wear jacket\nD: eat food", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: ride bike\nB: play guitar\nC: wear jacket\nD: eat food", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_185_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_185_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_185_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_185_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_185_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_185_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_185_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_185_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_185_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_185_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_185_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_185_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: eat a sandwich\nB: sit down\nC: play a guitar\nD: put on a hat", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: eat a sandwich\nB: sit down\nC: play a guitar\nD: put on a hat", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_186_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_186_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_186_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_186_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_186_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_186_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_186_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_186_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_186_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_186_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_186_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_186_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: jump\nB: drop\nC: run\nD: climb", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: jump\nB: drop\nC: run\nD: climb", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_187_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_187_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_187_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_187_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_187_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_187_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_187_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_187_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_187_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_187_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_187_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_187_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: bow\nB: run\nC: sit\nD: jump", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: bow\nB: run\nC: sit\nD: jump", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_188_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_188_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_188_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_188_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_188_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_188_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_188_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_188_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_188_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_188_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_188_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_188_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: reading\nB: dancing\nC: sleeping\nD: running", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: reading\nB: dancing\nC: sleeping\nD: running", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_189_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_189_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_189_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_189_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_189_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_189_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_189_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_189_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_189_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_189_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_189_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_189_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: wave\nB: jump\nC: bow\nD: sit", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: wave\nB: jump\nC: bow\nD: sit", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_190_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_190_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_190_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_190_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_190_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_190_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_190_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_190_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_190_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_190_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_190_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_190_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: standing up\nB: sitting down\nC: jumping\nD: running", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: standing up\nB: sitting down\nC: jumping\nD: running", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_191_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_191_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_191_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_191_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_191_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_191_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_191_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_191_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_191_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_191_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_191_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_191_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: tie shoelaces\nB: play guitar\nC: read a book\nD: drink water", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoelaces\nB: play guitar\nC: read a book\nD: drink water", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_192_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_192_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_192_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_192_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_192_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_192_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_192_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_192_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_192_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_192_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_192_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_192_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: running\nB: cooking\nC: reading\nD: dancing", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: running\nB: cooking\nC: reading\nD: dancing", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_193_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_193_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_193_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_193_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_193_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_193_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_193_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_193_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_193_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_193_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_193_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_193_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: play guitar\nB: check time (from watch)\nC: tie shoelaces\nD: eat sandwich", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play guitar\nB: check time (from watch)\nC: tie shoelaces\nD: eat sandwich", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_194_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_194_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_194_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_194_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_194_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_194_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_194_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_194_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_194_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_194_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_194_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_194_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: make a phone call\nB: read a book\nC: ride a bicycle\nD: play a guitar", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: make a phone call\nB: read a book\nC: ride a bicycle\nD: play a guitar", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_195_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_195_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_195_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_195_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_195_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_195_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_195_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_195_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_195_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_195_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_195_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_195_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: tie shoes\nB: eat a sandwich\nC: read a book\nD: put on a hat", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: tie shoes\nB: eat a sandwich\nC: read a book\nD: put on a hat", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_196_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_196_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_196_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_196_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_196_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_196_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_196_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_196_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_196_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_196_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_196_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_196_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: sitting down\nB: standing up\nC: lying down\nD: jumping", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: sitting down\nB: standing up\nC: lying down\nD: jumping", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_197_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_197_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_197_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_197_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_197_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_197_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_197_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_197_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_197_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_197_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_197_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_197_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_Action_Recognition", "visual_input_component": "natural image", "source": "PKUMMD", "options": "A: play a guitar\nB: make a phone call\nC: tie a shoelace\nD: ride a bicycle", "question": "Given the set of images from three different views (i.e., left, middle and right views), please identify the action that this person performs.", "context": "Your task is recognize human actions or activities in a scene using information from multiple views. 
\nSelect from the following choices.\nA: play a guitar\nB: make a phone call\nC: tie a shoelace\nD: ride a bicycle", "input_image_path": ["./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_198_0.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_198_1.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_198_2.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_198_3.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_198_4.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_198_5.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_198_6.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_198_7.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_198_8.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_198_9.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_198_10.png", "./3D-spatial/Multiview_Action_Recognition/Multiview_Action_Recognition_198_11.png"], "output": "B", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/Multiview_reasoning/qwen3-vl/metadata_info.json b/results/Multiview_reasoning/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..3ec7c10 --- /dev/null +++ b/results/Multiview_reasoning/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_0_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_0_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_1_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_1_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_2_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_2_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_3_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_3_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_4_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_4_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_5_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_5_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_6_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_6_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_7_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_7_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_8_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_8_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_9_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_9_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_10_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_10_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_11_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_11_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_12_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_12_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_13_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_13_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_14_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_14_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_15_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_15_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_16_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_16_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_17_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_17_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_18_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_18_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_19_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_19_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_20_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_20_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_21_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_21_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_22_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_22_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_23_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_23_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_24_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_24_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_25_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_25_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_26_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_26_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_27_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_27_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_28_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_28_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_29_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_29_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_30_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_30_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_31_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_31_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_32_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_32_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_33_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_33_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_34_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_34_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_35_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_35_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_36_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_36_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_37_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_37_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_38_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_38_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_39_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_39_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_40_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_40_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_41_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_41_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_42_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_42_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_43_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_43_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_44_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_44_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_45_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_45_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_46_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_46_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_47_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_47_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_48_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_48_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_49_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_49_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_50_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_50_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_51_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_51_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_52_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_52_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_53_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_53_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_54_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_54_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_55_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_55_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_56_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_56_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_57_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_57_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_58_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_58_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_59_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_59_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_60_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_60_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_61_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_61_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_62_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_62_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_63_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_63_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_64_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_64_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_65_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_65_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_66_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_66_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_67_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_67_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_68_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_68_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_69_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_69_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_70_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_70_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_71_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_71_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_72_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_72_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_73_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_73_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_74_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_74_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_75_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_75_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_76_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_76_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_77_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_77_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_78_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_78_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_79_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_79_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_80_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_80_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_81_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_81_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_82_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_82_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_83_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_83_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_84_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_84_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_85_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_85_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_86_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_86_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_87_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_87_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_88_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_88_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_89_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_89_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_90_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_90_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_91_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_91_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_92_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_92_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_93_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_93_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_94_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_94_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_95_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_95_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_96_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_96_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_97_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_97_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_98_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_98_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_99_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_99_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_100_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_100_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_101_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_101_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_102_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_102_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_103_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_103_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_104_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_104_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_105_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_105_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_106_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_106_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_107_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_107_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_108_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_108_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_109_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_109_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_110_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_110_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_111_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_111_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_112_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_112_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_113_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_113_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_114_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_114_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_115_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_115_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_116_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_116_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_117_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_117_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_118_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_118_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_119_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_119_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_120_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_120_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_121_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_121_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_122_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_122_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_123_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_123_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_124_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_124_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_125_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_125_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_126_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_126_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_127_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_127_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_128_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_128_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_129_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_129_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_130_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_130_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_131_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_131_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "Multiview_reasoning", "visual_input_component": "natural image", "source": "BLINK_MVR", "options": "A: left\nB: right", "question": "The images are frames from a video. The first image is from the beginning of the video and the second image is from the end. Is the camera moving left or right when shooting the video?", "context": "Your task is centered on evaluating the multi-view reasoning capabilities of models. 
The objective is to deduce the relative camera motion based on two images of an object captured from different viewpoints.\nSelect from the following choices.\nA: left\nB: right", "input_image_path": ["./3D-spatial/Multiview_reasoning/Multiview_reasoning_132_0.jpg", "./3D-spatial/Multiview_reasoning/Multiview_reasoning_132_1.jpg"], "output": "B", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/action_quality_assessment/qwen3-vl/metadata_info.json b/results/action_quality_assessment/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..2050912 --- /dev/null +++ b/results/action_quality_assessment/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 13.72\nB: 14.35\nC: 16.28\nD: 14.93", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 13.72\nB: 14.35\nC: 16.28\nD: 14.93", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_9.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_0_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 15.26\nB: 16.11\nC: 13.57\nD: 14.42", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 15.26\nB: 16.11\nC: 13.57\nD: 14.42", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_11.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_1_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 81.6\nB: 96.6\nC: 40.54\nD: 51.19", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 81.6\nB: 96.6\nC: 40.54\nD: 51.19", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_13.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_2_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 101.25\nB: 23.09\nC: 48.72\nD: 68.8", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 101.25\nB: 23.09\nC: 48.72\nD: 68.8", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_3_15.png"], "output": "D", "qwen3-vl": 
"image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 51.2\nB: 25.68\nC: 94.09\nD: 77.42", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 51.2\nB: 25.68\nC: 94.09\nD: 77.42", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_4_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 72.9\nB: 42.15\nC: 86.68\nD: 61.07", "question": "What is 
the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 72.9\nB: 42.15\nC: 86.68\nD: 61.07", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_5_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 28.0\nB: 21.54\nC: 43.89\nD: 15.37", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 28.0\nB: 21.54\nC: 43.89\nD: 15.37", "input_image_path": 
["./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_6_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 63.74\nB: 50.8\nC: 91.69\nD: 47.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 63.74\nB: 50.8\nC: 91.69\nD: 47.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_1.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_7_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 92.75\nB: 27.27\nC: 74.47\nD: 42.73", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 92.75\nB: 27.27\nC: 74.47\nD: 42.73", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_3.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_8_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 36.08\nB: 43.14\nC: 23.0\nD: 13.43", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 36.08\nB: 43.14\nC: 23.0\nD: 13.43", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_5.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_9_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 14.7\nB: 15.8\nC: 15.61\nD: 13.11", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.7\nB: 15.8\nC: 15.61\nD: 13.11", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_7.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_10_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 38.64\nB: 78.37\nC: 62.38\nD: 65.49", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 38.64\nB: 78.37\nC: 62.38\nD: 65.49", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_9.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_11_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 90.59\nB: 72.12\nC: 53.48\nD: 37.87", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 90.59\nB: 72.12\nC: 53.48\nD: 37.87", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_11.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_12_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 73.13\nB: 42.05\nC: 88.2\nD: 34.17", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 73.13\nB: 42.05\nC: 88.2\nD: 34.17", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_13.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_13_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 41.65\nB: 31.16\nC: 24.74\nD: 17.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 41.65\nB: 31.16\nC: 24.74\nD: 17.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_14_15.png"], "output": 
"D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 90.09\nB: 49.05\nC: 65.44\nD: 62.51", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 90.09\nB: 49.05\nC: 65.44\nD: 62.51", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_15_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 66.33\nB: 88.78\nC: 
39.89\nD: 57.84", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 66.33\nB: 88.78\nC: 39.89\nD: 57.84", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_16_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 66.72\nB: 92.78\nC: 51.35\nD: 21.6", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 
66.72\nB: 92.78\nC: 51.35\nD: 21.6", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_17_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 13.6\nB: 14.75\nC: 16.56\nD: 15.4", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 13.6\nB: 14.75\nC: 16.56\nD: 15.4", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_0.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_18_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 47.0\nB: 35.44\nC: 19.47\nD: 14.22", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 47.0\nB: 35.44\nC: 19.47\nD: 14.22", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_2.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_19_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 14.04\nB: 15.74\nC: 13.89\nD: 15.78", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.04\nB: 15.74\nC: 13.89\nD: 15.78", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_4.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_20_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 20.0\nB: 49.69\nC: 31.96\nD: 12.28", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 20.0\nB: 49.69\nC: 31.96\nD: 12.28", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_6.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_21_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 38.52\nB: 21.11\nC: 16.2\nD: 31.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 38.52\nB: 21.11\nC: 16.2\nD: 31.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_8.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_22_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 58.15\nB: 66.19\nC: 34.82\nD: 81.2", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 58.15\nB: 66.19\nC: 34.82\nD: 81.2", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_10.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_23_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 74.85\nB: 45.75\nC: 84.63\nD: 56.85", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 74.85\nB: 45.75\nC: 84.63\nD: 56.85", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_12.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_24_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 31.78\nB: 16.54\nC: 38.0\nD: 23.93", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 31.78\nB: 16.54\nC: 38.0\nD: 23.93", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_14.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_25_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 24.67\nB: 33.0\nC: 49.16\nD: 9.13", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 24.67\nB: 33.0\nC: 49.16\nD: 9.13", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_26_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", 
"visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 16.09\nB: 13.82\nC: 15.43\nD: 14.88", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 16.09\nB: 13.82\nC: 15.43\nD: 14.88", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_27_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 37.4\nB: 56.4\nC: 69.5\nD: 101.82", "question": "What is the most probable action quality 
assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 37.4\nB: 56.4\nC: 69.5\nD: 101.82", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_28_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 41.0\nB: 8.77\nC: 19.67\nD: 35.16", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 41.0\nB: 8.77\nC: 19.67\nD: 35.16", "input_image_path": 
["./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_29_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 56.82\nB: 84.15\nC: 26.52\nD: 66.29", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 56.82\nB: 84.15\nC: 26.52\nD: 66.29", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_1.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_30_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 74.25\nB: 82.65\nC: 56.79\nD: 24.3", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 74.25\nB: 82.65\nC: 56.79\nD: 24.3", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_3.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_31_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 54.04\nB: 76.33\nC: 37.87\nD: 87.47", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 54.04\nB: 76.33\nC: 37.87\nD: 87.47", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_5.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_32_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 37.95\nB: 66.54\nC: 52.06\nD: 98.65", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 37.95\nB: 66.54\nC: 52.06\nD: 98.65", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_7.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_33_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 65.39\nB: 77.95\nC: 54.0\nD: 94.23", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 65.39\nB: 77.95\nC: 54.0\nD: 94.23", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_9.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_34_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 41.04\nB: 18.12\nC: 33.0\nD: 11.99", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 41.04\nB: 18.12\nC: 33.0\nD: 11.99", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_11.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_35_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 50.0\nB: 13.02\nC: 19.88\nD: 34.9", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 50.0\nB: 13.02\nC: 19.88\nD: 34.9", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_13.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_36_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 21.52\nB: 30.43\nC: 15.58\nD: 47.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 21.52\nB: 30.43\nC: 15.58\nD: 47.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_37_15.png"], "output": 
"D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 51.1\nB: 86.7\nC: 69.2\nD: 94.61", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 51.1\nB: 86.7\nC: 69.2\nD: 94.61", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_38_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 71.06\nB: 94.05\nC: 55.11\nD: 
37.78", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 71.06\nB: 94.05\nC: 55.11\nD: 37.78", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_39_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 13.81\nB: 15.16\nC: 16.04\nD: 14.68", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 13.81\nB: 
15.16\nC: 16.04\nD: 14.68", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_40_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 13.67\nB: 14.93\nC: 14.61\nD: 16.1", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 13.67\nB: 14.93\nC: 14.61\nD: 16.1", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_0.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_41_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 48.49\nB: 88.9\nC: 64.35\nD: 37.08", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 48.49\nB: 88.9\nC: 64.35\nD: 37.08", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_2.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_42_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 37.22\nB: 46.55\nC: 24.0\nD: 9.84", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 37.22\nB: 46.55\nC: 24.0\nD: 9.84", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_4.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_43_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 79.2\nB: 57.54\nC: 41.45\nD: 87.34", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 79.2\nB: 57.54\nC: 41.45\nD: 87.34", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_6.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_44_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 69.3\nB: 54.14\nC: 30.81\nD: 86.4", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 69.3\nB: 54.14\nC: 30.81\nD: 86.4", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_8.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_45_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 14.18\nB: 14.89\nC: 16.62\nD: 13.77", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.18\nB: 14.89\nC: 16.62\nD: 13.77", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_10.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_46_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 16.13\nB: 15.7\nC: 13.4\nD: 14.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 16.13\nB: 15.7\nC: 13.4\nD: 14.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_12.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_47_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 16.07\nB: 14.31\nC: 13.42\nD: 15.77", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 16.07\nB: 14.31\nC: 13.42\nD: 15.77", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_14.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_48_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 55.37\nB: 32.43\nC: 86.4\nD: 71.99", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 55.37\nB: 32.43\nC: 86.4\nD: 71.99", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_49_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", 
"visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 48.34\nB: 69.75\nC: 86.77\nD: 99.17", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 48.34\nB: 69.75\nC: 86.77\nD: 99.17", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_50_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 15.1\nB: 14.74\nC: 15.78\nD: 13.81", "question": "What is the most probable action quality 
assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 15.1\nB: 14.74\nC: 15.78\nD: 13.81", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_51_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 39.77\nB: 93.48\nC: 76.5\nD: 43.86", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 39.77\nB: 93.48\nC: 76.5\nD: 43.86", "input_image_path": 
["./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_52_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 15.3\nB: 13.45\nC: 14.3\nD: 15.81", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 15.3\nB: 13.45\nC: 14.3\nD: 15.81", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_1.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_53_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 15.38\nB: 16.05\nC: 13.85\nD: 14.2", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 15.38\nB: 16.05\nC: 13.85\nD: 14.2", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_3.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_54_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 40.46\nB: 77.72\nC: 86.4\nD: 42.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 40.46\nB: 77.72\nC: 86.4\nD: 42.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_5.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_55_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 73.6\nB: 57.54\nC: 28.87\nD: 92.2", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 73.6\nB: 57.54\nC: 28.87\nD: 92.2", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_7.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_56_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 32.99\nB: 51.0\nC: 90.75\nD: 66.64", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 32.99\nB: 51.0\nC: 90.75\nD: 66.64", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_9.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_57_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 60.36\nB: 72.34\nC: 80.64\nD: 92.23", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 60.36\nB: 72.34\nC: 80.64\nD: 92.23", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_11.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_58_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 14.2\nB: 13.73\nC: 15.59\nD: 16.57", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.2\nB: 13.73\nC: 15.59\nD: 16.57", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_13.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_59_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 13.97\nB: 14.16\nC: 15.7\nD: 16.2", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 13.97\nB: 14.16\nC: 15.7\nD: 16.2", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_60_15.png"], "output": "C", 
"qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 45.92\nB: 11.52\nC: 25.38\nD: 36.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 45.92\nB: 11.52\nC: 25.38\nD: 36.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_61_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 35.36\nB: 74.12\nC: 49.39\nD: 
93.6", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 35.36\nB: 74.12\nC: 49.39\nD: 93.6", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_62_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 62.18\nB: 42.9\nC: 80.12\nD: 63.81", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 62.18\nB: 42.9\nC: 
80.12\nD: 63.81", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_63_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 18.7\nB: 40.55\nC: 33.0\nD: 11.12", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 18.7\nB: 40.55\nC: 33.0\nD: 11.12", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_0.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_64_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 94.83\nB: 59.75\nC: 84.99\nD: 65.28", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 94.83\nB: 59.75\nC: 84.99\nD: 65.28", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_2.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_65_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 36.11\nB: 77.04\nC: 44.05\nD: 90.75", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 36.11\nB: 77.04\nC: 44.05\nD: 90.75", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_4.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_66_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 15.7\nB: 23.32\nC: 35.3\nD: 40.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 15.7\nB: 23.32\nC: 35.3\nD: 40.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_6.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_67_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 41.0\nB: 16.81\nC: 37.0\nD: 26.08", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 41.0\nB: 16.81\nC: 37.0\nD: 26.08", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_8.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_68_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 26.98\nB: 14.1\nC: 37.94\nD: 47.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 26.98\nB: 14.1\nC: 37.94\nD: 47.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_10.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_69_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 15.92\nB: 13.69\nC: 15.11\nD: 14.18", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 15.92\nB: 13.69\nC: 15.11\nD: 14.18", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_12.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_70_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 14.7\nB: 16.47\nC: 13.22\nD: 15.67", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.7\nB: 16.47\nC: 13.22\nD: 15.67", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_14.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_71_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 14.15\nB: 16.15\nC: 14.98\nD: 13.84", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.15\nB: 16.15\nC: 14.98\nD: 13.84", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_72_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", 
"visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 22.27\nB: 31.0\nC: 16.79\nD: 42.84", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 22.27\nB: 31.0\nC: 16.79\nD: 42.84", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_73_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 35.12\nB: 8.34\nC: 19.09\nD: 41.0", "question": "What is the most probable action quality 
assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 35.12\nB: 8.34\nC: 19.09\nD: 41.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_74_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 45.54\nB: 41.25\nC: 88.05\nD: 65.27", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 45.54\nB: 41.25\nC: 88.05\nD: 65.27", "input_image_path": 
["./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_75_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 15.13\nB: 16.53\nC: 13.86\nD: 14.76", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 15.13\nB: 16.53\nC: 13.86\nD: 14.76", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_1.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_76_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 13.44\nB: 15.84\nC: 14.1\nD: 15.39", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 13.44\nB: 15.84\nC: 14.1\nD: 15.39", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_3.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_77_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 89.31\nB: 76.24\nC: 49.14\nD: 45.07", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 89.31\nB: 76.24\nC: 49.14\nD: 45.07", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_5.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_78_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 55.81\nB: 63.15\nC: 82.13\nD: 87.69", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 55.81\nB: 63.15\nC: 82.13\nD: 87.69", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_7.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_79_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 25.35\nB: 36.25\nC: 46.29\nD: 11.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 25.35\nB: 36.25\nC: 46.29\nD: 11.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_9.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_80_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 16.37\nB: 14.69\nC: 14.96\nD: 13.68", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 16.37\nB: 14.69\nC: 14.96\nD: 13.68", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_11.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_81_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 13.2\nB: 16.1\nC: 14.73\nD: 15.48", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 13.2\nB: 16.1\nC: 14.73\nD: 15.48", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_13.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_82_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 53.37\nB: 89.89\nC: 65.12\nD: 80.19", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 53.37\nB: 89.89\nC: 65.12\nD: 80.19", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_83_15.png"], "output": 
"D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 15.87\nB: 15.68\nC: 13.35\nD: 14.7", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 15.87\nB: 15.68\nC: 13.35\nD: 14.7", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_84_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 99.08\nB: 71.08\nC: 
82.07\nD: 53.4", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 99.08\nB: 71.08\nC: 82.07\nD: 53.4", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_85_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 40.56\nB: 91.83\nC: 52.96\nD: 68.4", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 40.56\nB: 
91.83\nC: 52.96\nD: 68.4", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_86_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 15.95\nB: 15.3\nC: 13.18\nD: 14.5", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 15.95\nB: 15.3\nC: 13.18\nD: 14.5", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_0.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_87_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 51.62\nB: 31.41\nC: 75.9\nD: 83.4", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 51.62\nB: 31.41\nC: 75.9\nD: 83.4", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_2.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_88_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 48.83\nB: 25.57\nC: 98.96\nD: 67.3", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 48.83\nB: 25.57\nC: 98.96\nD: 67.3", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_4.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_89_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 22.27\nB: 45.0\nC: 31.7\nD: 16.21", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 22.27\nB: 45.0\nC: 31.7\nD: 16.21", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_6.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_90_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 30.46\nB: 66.0\nC: 85.93\nD: 57.57", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 30.46\nB: 66.0\nC: 85.93\nD: 57.57", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_8.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_91_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 13.0\nB: 32.72\nC: 24.42\nD: 44.28", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 13.0\nB: 32.72\nC: 24.42\nD: 44.28", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_10.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_92_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 14.83\nB: 13.36\nC: 15.78\nD: 15.59", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.83\nB: 13.36\nC: 15.78\nD: 15.59", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_12.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_93_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 61.36\nB: 72.0\nC: 29.41\nD: 90.82", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 61.36\nB: 72.0\nC: 29.41\nD: 90.82", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_14.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_94_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 92.8\nB: 55.03\nC: 40.44\nD: 79.92", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 92.8\nB: 55.03\nC: 40.44\nD: 79.92", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_95_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", 
"visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 62.61\nB: 70.36\nC: 37.92\nD: 83.27", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 62.61\nB: 70.36\nC: 37.92\nD: 83.27", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_96_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 57.41\nB: 88.26\nC: 68.75\nD: 43.6", "question": "What is the most probable action quality 
assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 57.41\nB: 88.26\nC: 68.75\nD: 43.6", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_97_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 72.66\nB: 52.95\nC: 37.59\nD: 78.8", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 72.66\nB: 52.95\nC: 37.59\nD: 78.8", "input_image_path": 
["./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_98_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 31.43\nB: 41.0\nC: 23.56\nD: 11.6", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 31.43\nB: 41.0\nC: 23.56\nD: 11.6", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_1.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_99_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 15.83\nB: 14.74\nC: 15.03\nD: 13.86", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 15.83\nB: 14.74\nC: 15.03\nD: 13.86", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_3.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_100_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 16.34\nB: 14.83\nC: 13.65\nD: 14.95", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 16.34\nB: 14.83\nC: 13.65\nD: 14.95", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_4.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_101_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 36.1\nB: 62.13\nC: 86.47\nD: 69.52", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 36.1\nB: 62.13\nC: 86.47\nD: 69.52", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_5.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_102_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 95.33\nB: 48.04\nC: 78.15\nD: 66.6", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 95.33\nB: 48.04\nC: 78.15\nD: 66.6", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_6.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_103_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 83.75\nB: 40.4\nC: 70.73\nD: 54.88", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 83.75\nB: 40.4\nC: 70.73\nD: 54.88", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_7.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_104_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 19.0\nB: 44.42\nC: 17.03\nD: 30.18", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 19.0\nB: 44.42\nC: 17.03\nD: 30.18", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_8.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_105_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 13.88\nB: 31.45\nC: 40.87\nD: 24.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 13.88\nB: 31.45\nC: 40.87\nD: 24.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_9.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_106_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 67.5\nB: 45.43\nC: 96.95\nD: 33.01", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 67.5\nB: 45.43\nC: 96.95\nD: 33.01", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_10.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_107_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 68.32\nB: 43.26\nC: 50.74\nD: 81.04", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 68.32\nB: 43.26\nC: 50.74\nD: 81.04", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_11.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_108_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 68.16\nB: 95.11\nC: 83.52\nD: 61.72", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 68.16\nB: 95.11\nC: 83.52\nD: 61.72", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_12.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_109_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 86.14\nB: 52.58\nC: 90.32\nD: 68.4", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 86.14\nB: 52.58\nC: 90.32\nD: 68.4", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_13.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_110_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 94.54\nB: 23.49\nC: 73.6\nD: 58.19", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 94.54\nB: 23.49\nC: 73.6\nD: 58.19", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_14.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_111_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 96.12\nB: 52.2\nC: 63.19\nD: 85.65", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 96.12\nB: 52.2\nC: 63.19\nD: 85.65", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_112_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", 
"visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 60.04\nB: 48.45\nC: 85.05\nD: 73.29", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 60.04\nB: 48.45\nC: 85.05\nD: 73.29", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_113_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 76.5\nB: 46.5\nC: 24.38\nD: 83.3", "question": "What is the most probable 
action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 76.5\nB: 46.5\nC: 24.38\nD: 83.3", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_114_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 13.96\nB: 16.01\nC: 14.28\nD: 15.62", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 13.96\nB: 16.01\nC: 14.28\nD: 15.62", 
"input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_115_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 84.03\nB: 61.73\nC: 74.46\nD: 91.2", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 84.03\nB: 61.73\nC: 74.46\nD: 91.2", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_0.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_116_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 37.57\nB: 89.35\nC: 60.88\nD: 71.91", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 37.57\nB: 89.35\nC: 60.88\nD: 71.91", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_1.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_117_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 81.6\nB: 32.15\nC: 50.53\nD: 102.55", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 81.6\nB: 32.15\nC: 50.53\nD: 102.55", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_2.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_118_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 84.35\nB: 51.95\nC: 74.77\nD: 39.8", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 84.35\nB: 51.95\nC: 74.77\nD: 39.8", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_3.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_119_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 17.44\nB: 41.32\nC: 22.07\nD: 38.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 17.44\nB: 41.32\nC: 22.07\nD: 38.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_4.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_120_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 78.4\nB: 61.99\nC: 95.04\nD: 38.43", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 78.4\nB: 61.99\nC: 95.04\nD: 38.43", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_5.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_121_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 15.97\nB: 13.55\nC: 15.16\nD: 14.35", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 15.97\nB: 13.55\nC: 15.16\nD: 14.35", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_6.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_122_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 78.54\nB: 73.19\nC: 89.42\nD: 51.58", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 78.54\nB: 73.19\nC: 89.42\nD: 51.58", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_7.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_123_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 42.07\nB: 14.32\nC: 21.13\nD: 35.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 42.07\nB: 14.32\nC: 21.13\nD: 35.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_8.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_124_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 48.93\nB: 98.95\nC: 23.75\nD: 77.7", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 48.93\nB: 98.95\nC: 23.75\nD: 77.7", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_9.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_125_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 67.61\nB: 60.31\nC: 83.25\nD: 31.09", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 67.61\nB: 60.31\nC: 83.25\nD: 31.09", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_10.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_126_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 14.46\nB: 13.93\nC: 14.95\nD: 16.22", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.46\nB: 13.93\nC: 14.95\nD: 16.22", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_11.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_127_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 38.86\nB: 39.64\nC: 8.48\nD: 26.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 38.86\nB: 39.64\nC: 8.48\nD: 26.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_12.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_128_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 37.25\nB: 12.0\nC: 29.63\nD: 19.34", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 37.25\nB: 12.0\nC: 29.63\nD: 19.34", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_13.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_129_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 14.81\nB: 15.5\nC: 16.04\nD: 13.6", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.81\nB: 15.5\nC: 16.04\nD: 13.6", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_14.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_130_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 44.49\nB: 86.4\nC: 64.43\nD: 31.81", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 44.49\nB: 86.4\nC: 64.43\nD: 31.81", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_131_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", 
"visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 44.76\nB: 68.61\nC: 83.25\nD: 27.55", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 44.76\nB: 68.61\nC: 83.25\nD: 27.55", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_132_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 16.14\nB: 15.63\nC: 14.3\nD: 13.88", "question": "What is the most probable 
action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 16.14\nB: 15.63\nC: 14.3\nD: 13.88", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_133_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 13.73\nB: 15.1\nC: 14.23\nD: 15.92", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 13.73\nB: 15.1\nC: 14.23\nD: 15.92", 
"input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_134_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 16.24\nB: 14.52\nC: 13.88\nD: 14.96", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 16.24\nB: 14.52\nC: 13.88\nD: 14.96", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_0.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_135_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 71.13\nB: 88.76\nC: 45.99\nD: 63.02", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 71.13\nB: 88.76\nC: 45.99\nD: 63.02", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_1.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_136_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 16.45\nB: 15.03\nC: 14.0\nD: 13.32", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 16.45\nB: 15.03\nC: 14.0\nD: 13.32", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_2.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_137_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 32.49\nB: 9.52\nC: 44.0\nD: 24.05", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 32.49\nB: 9.52\nC: 44.0\nD: 24.05", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_4.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_138_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 25.67\nB: 76.96\nC: 87.45\nD: 43.98", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 25.67\nB: 76.96\nC: 87.45\nD: 43.98", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_5.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_139_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 72.41\nB: 41.9\nC: 32.38\nD: 93.6", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 72.41\nB: 41.9\nC: 32.38\nD: 93.6", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_7.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_140_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 35.05\nB: 79.28\nC: 46.7\nD: 84.6", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 35.05\nB: 79.28\nC: 46.7\nD: 84.6", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_9.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_141_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 101.34\nB: 42.34\nC: 36.95\nD: 77.4", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 101.34\nB: 42.34\nC: 36.95\nD: 77.4", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_10.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_142_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 39.85\nB: 23.26\nC: 11.0\nD: 29.1", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 39.85\nB: 23.26\nC: 11.0\nD: 29.1", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_11.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_143_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 69.17\nB: 82.13\nC: 88.66\nD: 54.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 69.17\nB: 82.13\nC: 88.66\nD: 54.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_12.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_144_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 16.63\nB: 14.83\nC: 13.58\nD: 15.26", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 16.63\nB: 14.83\nC: 13.58\nD: 15.26", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_13.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_145_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 91.08\nB: 67.02\nC: 86.56\nD: 51.84", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 91.08\nB: 67.02\nC: 86.56\nD: 51.84", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_14.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_146_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 93.6\nB: 34.32\nC: 45.3\nD: 80.35", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 93.6\nB: 34.32\nC: 45.3\nD: 80.35", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_147_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", 
"visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 92.76\nB: 67.5\nC: 29.61\nD: 57.89", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 92.76\nB: 67.5\nC: 29.61\nD: 57.89", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_148_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 54.53\nB: 83.14\nC: 70.61\nD: 37.4", "question": "What is the most probable 
action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 54.53\nB: 83.14\nC: 70.61\nD: 37.4", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_149_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 16.27\nB: 14.39\nC: 14.98\nD: 13.46", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 16.27\nB: 14.39\nC: 14.98\nD: 13.46", 
"input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_150_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 76.5\nB: 84.69\nC: 28.49\nD: 56.71", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 76.5\nB: 84.69\nC: 28.49\nD: 56.71", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_0.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_151_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 14.6\nB: 16.51\nC: 13.21\nD: 14.95", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.6\nB: 16.51\nC: 13.21\nD: 14.95", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_1.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_152_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 14.04\nB: 16.4\nC: 13.4\nD: 15.01", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.04\nB: 16.4\nC: 13.4\nD: 15.01", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_3.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_153_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 82.8\nB: 70.51\nC: 42.58\nD: 37.94", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 82.8\nB: 70.51\nC: 42.58\nD: 37.94", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_4.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_154_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 48.64\nB: 74.5\nC: 83.35\nD: 55.78", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 48.64\nB: 74.5\nC: 83.35\nD: 55.78", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_5.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_155_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 14.95\nB: 13.58\nC: 16.0\nD: 14.83", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.95\nB: 13.58\nC: 16.0\nD: 14.83", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_6.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_156_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 97.5\nB: 73.72\nC: 80.44\nD: 51.6", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 97.5\nB: 73.72\nC: 80.44\nD: 51.6", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_8.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_157_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 80.0\nB: 86.4\nC: 39.44\nD: 55.46", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 80.0\nB: 86.4\nC: 39.44\nD: 55.46", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_10.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_158_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 32.39\nB: 43.5\nC: 15.24\nD: 29.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 32.39\nB: 43.5\nC: 15.24\nD: 29.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_11.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_159_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 82.8\nB: 37.35\nC: 78.23\nD: 49.83", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 82.8\nB: 37.35\nC: 78.23\nD: 49.83", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_12.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_160_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 40.75\nB: 19.58\nC: 10.08\nD: 36.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 40.75\nB: 19.58\nC: 10.08\nD: 36.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_13.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_161_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 14.68\nB: 13.47\nC: 15.27\nD: 16.06", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.68\nB: 13.47\nC: 15.27\nD: 16.06", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_14.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_162_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 16.0\nB: 14.89\nC: 14.5\nD: 13.53", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 16.0\nB: 14.89\nC: 14.5\nD: 13.53", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_163_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", 
"visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 15.03\nB: 14.01\nC: 13.19\nD: 16.56", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 15.03\nB: 14.01\nC: 13.19\nD: 16.56", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_164_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 14.2\nB: 16.29\nC: 13.27\nD: 15.71", "question": "What is the most probable 
action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.2\nB: 16.29\nC: 13.27\nD: 15.71", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_165_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 65.74\nB: 51.2\nC: 98.5\nD: 32.59", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 65.74\nB: 51.2\nC: 98.5\nD: 32.59", 
"input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_166_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 35.99\nB: 28.0\nC: 11.27\nD: 44.98", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 35.99\nB: 28.0\nC: 11.27\nD: 44.98", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_0.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_167_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 93.6\nB: 24.93\nC: 46.53\nD: 76.42", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 93.6\nB: 24.93\nC: 46.53\nD: 76.42", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_1.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_168_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 28.75\nB: 38.43\nC: 44.48\nD: 10.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 28.75\nB: 38.43\nC: 44.48\nD: 10.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_2.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_169_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 49.6\nB: 41.25\nC: 92.75\nD: 70.1", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 49.6\nB: 41.25\nC: 92.75\nD: 70.1", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_4.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_170_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 48.18\nB: 22.07\nC: 92.38\nD: 81.6", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 48.18\nB: 22.07\nC: 92.38\nD: 81.6", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_5.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_171_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 87.45\nB: 34.76\nC: 77.5\nD: 41.85", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 87.45\nB: 34.76\nC: 77.5\nD: 41.85", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_6.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_172_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 16.16\nB: 34.32\nC: 45.45\nD: 29.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 16.16\nB: 34.32\nC: 45.45\nD: 29.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_7.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_173_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 57.09\nB: 81.93\nC: 45.29\nD: 72.71", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 57.09\nB: 81.93\nC: 45.29\nD: 72.71", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_8.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_174_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 14.94\nB: 13.47\nC: 15.9\nD: 14.73", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.94\nB: 13.47\nC: 15.9\nD: 14.73", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_9.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_175_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 35.17\nB: 54.56\nC: 66.0\nD: 90.91", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 35.17\nB: 54.56\nC: 66.0\nD: 90.91", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_10.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_176_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 14.59\nB: 13.65\nC: 16.31\nD: 15.23", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.59\nB: 13.65\nC: 16.31\nD: 15.23", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_11.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_177_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 27.96\nB: 41.0\nC: 11.9\nD: 32.86", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 27.96\nB: 41.0\nC: 11.9\nD: 32.86", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_12.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_178_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 74.71\nB: 51.0\nC: 75.78\nD: 99.25", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 74.71\nB: 51.0\nC: 75.78\nD: 99.25", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_13.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_179_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 101.34\nB: 74.25\nC: 34.1\nD: 46.12", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 101.34\nB: 74.25\nC: 34.1\nD: 46.12", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_14.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_180_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 17.75\nB: 42.44\nC: 31.0\nD: 11.74", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 17.75\nB: 42.44\nC: 31.0\nD: 11.74", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_181_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", 
"visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 77.89\nB: 55.89\nC: 27.81\nD: 83.2", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 77.89\nB: 55.89\nC: 27.81\nD: 83.2", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_182_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 94.56\nB: 58.85\nC: 65.26\nD: 81.6", "question": "What is the most probable 
action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 94.56\nB: 58.85\nC: 65.26\nD: 81.6", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_183_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 23.0\nB: 56.64\nC: 75.07\nD: 102.6", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 23.0\nB: 56.64\nC: 75.07\nD: 102.6", 
"input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_184_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 14.38\nB: 13.93\nC: 16.39\nD: 15.1", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 14.38\nB: 13.93\nC: 16.39\nD: 15.1", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_0.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_185_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 43.81\nB: 73.78\nC: 84.79\nD: 57.46", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 43.81\nB: 73.78\nC: 84.79\nD: 57.46", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_1.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_186_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 26.45\nB: 78.21\nC: 52.14\nD: 100.8", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 26.45\nB: 78.21\nC: 52.14\nD: 100.8", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_2.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_187_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 94.63\nB: 70.2\nC: 25.23\nD: 54.96", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 94.63\nB: 70.2\nC: 25.23\nD: 54.96", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_3.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_188_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 62.72\nB: 91.88\nC: 64.2\nD: 40.77", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 62.72\nB: 91.88\nC: 64.2\nD: 40.77", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_4.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_189_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 47.0\nB: 9.51\nC: 38.33\nD: 20.67", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 47.0\nB: 9.51\nC: 38.33\nD: 20.67", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_6.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_190_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 78.4\nB: 24.91\nC: 86.85\nD: 60.54", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 78.4\nB: 24.91\nC: 86.85\nD: 60.54", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_7.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_191_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 63.76\nB: 54.54\nC: 84.48\nD: 95.09", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 63.76\nB: 54.54\nC: 84.48\nD: 95.09", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_8.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_192_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 41.0\nB: 10.89\nC: 36.0\nD: 24.91", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 41.0\nB: 10.89\nC: 36.0\nD: 24.91", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_10.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_193_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 88.27\nB: 32.81\nC: 43.39\nD: 81.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 88.27\nB: 32.81\nC: 43.39\nD: 81.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_11.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_194_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 36.57\nB: 90.89\nC: 60.88\nD: 64.62", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 36.57\nB: 90.89\nC: 60.88\nD: 64.62", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_12.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_195_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 79.28\nB: 49.74\nC: 39.01\nD: 69.35", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 79.28\nB: 49.74\nC: 39.01\nD: 69.35", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_13.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_196_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 23.36\nB: 79.8\nC: 89.26\nD: 44.0", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 23.36\nB: 79.8\nC: 89.26\nD: 44.0", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_14.png", 
"./Continuous-temporal/action_quality_assessment/action_quality_assessment_197_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", "visual_input_component": "Video image or Natural image", "source": "AQA7", "options": "A: 60.76\nB: 79.8\nC: 90.7\nD: 57.88", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 60.76\nB: 79.8\nC: 90.7\nD: 57.88", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_198_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "action_quality_assessment", 
"visual_input_component": "Video image or Natural image", "source": "UNLV", "options": "A: 16.67\nB: 14.55\nC: 13.7\nD: 14.9", "question": "What is the most probable action quality assessment number obtained by the person in the video?", "context": "Select from the following choices.\nA: 16.67\nB: 14.55\nC: 13.7\nD: 14.9", "input_image_path": ["./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_0.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_1.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_2.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_3.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_4.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_5.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_6.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_7.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_8.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_9.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_10.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_11.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_12.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_13.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_14.png", "./Continuous-temporal/action_quality_assessment/action_quality_assessment_199_15.png"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/casuality_reasoning_next_qa/qwen3-vl/metadata_info.json b/results/casuality_reasoning_next_qa/qwen3-vl/metadata_info.json new file mode 100644 index 
0000000..8e8a6d9 --- /dev/null +++ b/results/casuality_reasoning_next_qa/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: sleeps\nB: stands up and run towards the man\nC: take a few steps\nD: pour the sand out\nE: touch baby s foot", "question": "what does the girl do after landing on the bed the first time", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: sleeps\nB: stands up and run towards the man\nC: take a few steps\nD: pour the sand out\nE: touch baby s foot\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_0_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: walking dogs on leash\nB: posing\nC: cleaning the table\nD: acting\nE: playing tablet", "question": "what is the lady with apron doing", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: walking dogs on leash\nB: posing\nC: cleaning the table\nD: acting\nE: playing tablet\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_1_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: wearing a harness\nB: wear safety belt\nC: holding sled with both hands\nD: move steering wheel\nE: closed the doors of the sleigh", "question": "how does the man in dark green stay sitting on the sleigh while going down", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: wearing a harness\nB: wear safety belt\nC: holding sled with both hands\nD: move steering wheel\nE: closed the doors of the sleigh\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_2_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: keeps it in his pocket\nB: walk\nC: pet dog\nD: take off helmet\nE: wipe his mouth", "question": "what was the man in black holding a bottle doing before he walked away", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: keeps it in his pocket\nB: walk\nC: pet dog\nD: take off helmet\nE: wipe his mouth\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_3_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: the cat was below her\nB: better posture for photoshoot\nC: not make it dirty\nD: washing her legs\nE: she was stepping on mud", "question": "why did the woman hold her dress up high", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: the cat was below her\nB: better posture for photoshoot\nC: not make it dirty\nD: washing her legs\nE: she was stepping on mud\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_4_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: on the plate\nB: on the table\nC: on the bed\nD: on the sofa\nE: on the floor", "question": "where is the food", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: on the plate\nB: on the table\nC: on the bed\nD: on the sofa\nE: on the floor\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_4.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_5_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: walk against wind\nB: finished controlling the helicopter\nC: finished watching the peacock\nD: give them guidane\nE: guide baby forward", "question": "why did a man in orange suddenly walk over at the end", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: walk against wind\nB: finished controlling the helicopter\nC: finished watching the peacock\nD: give them guidane\nE: guide baby forward\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_2.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_6_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: ballet dress\nB: white costume\nC: chef apron\nD: couple outfit\nE: dancing outfit", "question": "what are both of them wearing in the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: ballet dress\nB: white costume\nC: chef apron\nD: couple outfit\nE: dancing outfit\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_1.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_7_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: tie knots\nB: taste snow\nC: sunk\nD: can not eat too much\nE: eating food", "question": "why did the baby eat the spagetti strand by strand", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: tie knots\nB: taste snow\nC: sunk\nD: can not eat too much\nE: eating food\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_0.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_8_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: stand up and raise his hands\nB: point to someone\nC: run towards the camera\nD: push her\nE: point at the baby", "question": "how does the white hair man react after seeing the girl fell", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: stand up and raise his hands\nB: point to someone\nC: run towards the camera\nD: push 
her\nE: point at the baby\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_9_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: trying to look at slides\nB: crossing muddy fields\nC: check on dog\nD: see whether boy follows\nE: check on baby", "question": "why does the man keep turning around while pulling the sled", "context": "You are given 16 images of sequential occurrences, examine the details and answer the 
given question.\nSelect from the following choices.\nA: trying to look at slides\nB: crossing muddy fields\nC: check on dog\nD: see whether boy follows\nE: check on baby\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_10_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: filming\nB: pass paper to woman\nC: singing\nD: standing and watching\nE: dance", "question": "what does the man in white do 
after moving near to the microphone", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: filming\nB: pass paper to woman\nC: singing\nD: standing and watching\nE: dance\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_11_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: unstable to move\nB: wants 
to win\nC: interact with robot\nD: cycle\nE: act like tugging tree", "question": "why does the woman have to be next to the robot", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: unstable to move\nB: wants to win\nC: interact with robot\nD: cycle\nE: act like tugging tree\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_12_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", 
"visual_input_component": "16 natural images", "source": "next_qa", "options": "A: eat apple\nB: play with it\nC: to touch his face\nD: drink water\nE: he was doing an experiment", "question": "why did the man with the cap move his hands at the start", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: eat apple\nB: play with it\nC: to touch his face\nD: drink water\nE: he was doing an experiment\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_13_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: soit food out\nB: puts more noodles\nC: walk away\nD: put his hands on his knees\nE: take away pacifier", "question": "what does the adult do after the baby finishes the first strand", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: soit food out\nB: puts more noodles\nC: walk away\nD: put his hands on his knees\nE: take away pacifier\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_14_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: dancing hall\nB: beach\nC: boxing ring\nD: backyard\nE: living room", "question": "where is this video taken", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: dancing hall\nB: beach\nC: boxing ring\nD: backyard\nE: living room\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_15_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: shake water off their heads\nB: cleaning itself\nC: looking for food\nD: hiding from parrot\nE: sleeping", "question": "why did the parrot on the perch clean tuck its head in while resting on the perch", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: shake water off their heads\nB: cleaning itself\nC: looking for food\nD: hiding from parrot\nE: sleeping\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_16_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: take pictures\nB: filming the baby\nC: playing a game\nD: talk to someone in phone\nE: record for memory", "question": "why is the bald man holding a phone to his ear", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: take pictures\nB: filming the baby\nC: playing a game\nD: talk to someone in phone\nE: record for memory\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_17_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: fencing\nB: wedding anniversary\nC: party\nD: talent perfromance\nE: public speaking event", "question": "what event is occuring", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: fencing\nB: wedding anniversary\nC: party\nD: talent perfromance\nE: public speaking event\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_18_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: room\nB: forest sanctuary\nC: on the plane\nD: farm\nE: sofa", "question": "where is this video taken", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: room\nB: forest sanctuary\nC: on the plane\nD: farm\nE: sofa\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_19_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: lick its paws\nB: hit the squirrel\nC: look at the cat eat\nD: stop licking\nE: jumped back", "question": "what did the orange cat do after the brown cat found food", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: lick its paws\nB: hit the squirrel\nC: look at the cat eat\nD: stop licking\nE: jumped back\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_3.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_20_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: stretch out hand\nB: using red ball\nC: point to book\nD: drawing\nE: shake the toy", "question": "how did the lady in purple try attracting the children s attention", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: stretch out hand\nB: using red ball\nC: point to book\nD: drawing\nE: shake the toy\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_1.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_21_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: by breaking\nB: jumping\nC: flying\nD: from a plate\nE: peck", "question": "how do the birds eat", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: by breaking\nB: jumping\nC: flying\nD: from a plate\nE: peck\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_0.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_22_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: resting\nB: passionately acting\nC: being fed\nD: dancing\nE: sleeping", "question": "why are there two birds standing on the hand", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: resting\nB: passionately acting\nC: being fed\nD: dancing\nE: sleeping\n", "input_image_path": 
["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_23_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: lies on chair\nB: move forward\nC: they laughed\nD: look down at baby\nE: hold hands", "question": "what did the lady and man do after waving their hands in the air", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: 
lies on chair\nB: move forward\nC: they laughed\nD: look down at baby\nE: hold hands\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_24_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: learn dancing\nB: playing the drum\nC: listen to her talk\nD: to see what show she is watching\nE: she is opening something", "question": "why does everyone focus on the lady in white sitting on the floor", 
"context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: learn dancing\nB: playing the drum\nC: listen to her talk\nD: to see what show she is watching\nE: she is opening something\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_25_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: walk away\nB: ran 
one round\nC: walk and look around\nD: jump\nE: black dog runs it after", "question": "what does the white dog do after the brown dog completes one round in the middle", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: walk away\nB: ran one round\nC: walk and look around\nD: jump\nE: black dog runs it after\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_26_15.jpg"], "output": "B", "qwen3-vl": "image none"}, 
{"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: stopped in front of the baby\nB: move toy around\nC: chew on a gum\nD: play with green toy\nE: hold up a cup", "question": "what does the person do while the dog is jumping up and down", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: stopped in front of the baby\nB: move toy around\nC: chew on a gum\nD: play with green toy\nE: hold up a cup\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_27_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: with toys\nB: with the fence\nC: with fans\nD: with yellow sign boards\nE: with fists", "question": "how did both of them hit each other", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: with toys\nB: with the fence\nC: with fans\nD: with yellow sign boards\nE: with fists\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_28_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: sit on stairs\nB: get down from the chair\nC: kneel down\nD: put hands over lady in blue\nE: stand still", "question": "what did the lady in white do when she first approached the lady in blue in the middle", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: sit on stairs\nB: get down from the chair\nC: kneel down\nD: put hands over lady in blue\nE: stand still\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_29_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: chair\nB: table\nC: tv screen\nD: piano\nE: dance machine", "question": "what is placed on the right to the lady on stage", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: chair\nB: table\nC: tv screen\nD: piano\nE: dance machine\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_30_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: couch\nB: house\nC: car\nD: bus park place\nE: stage", "question": "where did this occur", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: couch\nB: house\nC: car\nD: bus park place\nE: stage\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_31_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: point to her\nB: keep camera in pocket\nC: look at his phone\nD: plays guitar\nE: plays the guitar", "question": "what does the man in grey suit do after they have finished singing at the end", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: point to her\nB: keep camera in pocket\nC: look at his phone\nD: plays guitar\nE: plays the guitar\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_32_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: to touch the sandals\nB: to dance on the floor\nC: to play\nD: he is bored\nE: listening to music and dancing", "question": "why did the boy punch his hand forwards in the middle of the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: to touch the sandals\nB: to dance on the floor\nC: to play\nD: he is bored\nE: listening to music and dancing\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_33_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: girl hit wall\nB: part of the dance routine\nC: practicing\nD: to wave\nE: pushing the rod", "question": "why is the man raising his legs throughout the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: girl hit wall\nB: part of the dance routine\nC: practicing\nD: to wave\nE: pushing the rod\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_3.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_34_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: make sound\nB: to drink\nC: enjoying the music\nD: make it more fun\nE: to direct the boy", "question": "why did the man hit the notes in one spectrum and direction", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: make sound\nB: to drink\nC: enjoying the music\nD: make it more fun\nE: to direct the boy\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_1.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_35_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: it was sunny outside\nB: fashion\nC: for protection from chemicals\nD: to read the book\nE: to watch television", "question": "why did the boy wear glasses", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: it was sunny outside\nB: fashion\nC: for protection from chemicals\nD: to read the book\nE: to watch television\n", "input_image_path": 
["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_36_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: show he finish chewing\nB: open the toy s mouth\nC: teething\nD: happy and laughing\nE: playing game", "question": "why did the boy put his finger into his mouth", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: show 
he finish chewing\nB: open the toy s mouth\nC: teething\nD: happy and laughing\nE: playing game\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_37_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: drink\nB: want to play\nC: interested in it\nD: for food\nE: distracted", "question": "why do the cats walk away from the carpark at the middle of the video", "context": "You are given 16 images of 
sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: drink\nB: want to play\nC: interested in it\nD: for food\nE: distracted\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_38_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: dog keep blocking\nB: to adjust snowboard\nC: preparing for speech\nD: to show the drink\nE: to change slides", 
"question": "why does the man stops multiple times in between", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: dog keep blocking\nB: to adjust snowboard\nC: preparing for speech\nD: to show the drink\nE: to change slides\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_39_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", 
"source": "next_qa", "options": "A: to look at something above\nB: to bow to the man\nC: playing\nD: retrieve ball\nE: pick up stick", "question": "why does the dark brown dog bend down at the end of the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: to look at something above\nB: to bow to the man\nC: playing\nD: retrieve ball\nE: pick up stick\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_40_15.jpg"], 
"output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: hit the ball\nB: to get the kite handle\nC: play catch with the ball\nD: chasing the car\nE: show excitement", "question": "why were the people running in circles at the start", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: hit the ball\nB: to get the kite handle\nC: play catch with the ball\nD: chasing the car\nE: show excitement\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_41_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: to note something on the book\nB: pose for camera\nC: reach for the airconditioners\nD: play with toy\nE: dancing along with music", "question": "why did the boy lift his hands up above his head nearing the end while turning", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: to note something on the book\nB: pose for camera\nC: reach for the airconditioners\nD: play with toy\nE: dancing along with music\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_42_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: fighting for stick\nB: biting on rat\nC: drink the milk\nD: play with ball\nE: follow man s instruction", "question": "why were both dogs looking down near the middle", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: fighting for stick\nB: biting on rat\nC: drink the milk\nD: play with ball\nE: follow man s instruction\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_43_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: for safety in skiing\nB: construction work\nC: protect head from bricks\nD: photo requirement\nE: trying out new helmets", "question": "why is the man wearing helmet", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: for safety in skiing\nB: construction work\nC: protect head from bricks\nD: photo requirement\nE: trying out new helmets\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_44_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: hot\nB: recording the scenery\nC: sunny\nD: raining\nE: to focus on cake", "question": "why did the camera view get blurred", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: hot\nB: recording the scenery\nC: sunny\nD: raining\nE: to focus on cake\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_45_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: drinking soup\nB: to eat ice cream\nC: feeding the dog\nD: feed little girl\nE: to stir salad", "question": "why did the lady in red picked up the spoon on the table", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: drinking soup\nB: to eat ice cream\nC: feeding the dog\nD: feed little girl\nE: to stir salad\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_3.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_46_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: look around\nB: move to right side\nC: lick hand\nD: jump around\nE: put its paw back", "question": "what does the dog do after the person stretch his hand out", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: look around\nB: move to right side\nC: lick hand\nD: jump around\nE: put its paw back\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_1.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_47_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: blue\nB: green\nC: white\nD: black\nE: teal", "question": "what colour shirt was the man wearing", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: blue\nB: green\nC: white\nD: black\nE: teal\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_1.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_48_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: raise his hands\nB: pretends to be an animal\nC: jumping in\nD: pail\nE: spectacles", "question": "how did the man in the screen pretended to be swimming with the fishes int he background", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: raise his hands\nB: pretends to be an animal\nC: jumping in\nD: pail\nE: spectacles\n", "input_image_path": 
["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_49_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: drinking\nB: claps\nC: speak into microphone\nD: singing\nE: want to snatch the phone", "question": "what is the man with cap doing while the bald man is answering a call at the beginning", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the 
following choices.\nA: drinking\nB: claps\nC: speak into microphone\nD: singing\nE: want to snatch the phone\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_50_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: watches dog run away\nB: excited\nC: gives them food\nD: lower its head\nE: pull it out", "question": "how does the lady react when the dog wo nt let go of the twig", "context": "You are 
given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: watches dog run away\nB: excited\nC: gives them food\nD: lower its head\nE: pull it out\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_51_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: watching the man write calligraphy\nB: need to play the drum\nC: open 
space\nD: looking after the child\nE: to show to people", "question": "why is there a man standing at the start of the road", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: watching the man write calligraphy\nB: need to play the drum\nC: open space\nD: looking after the child\nE: to show to people\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_52_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": 
"casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: posing for photos\nB: common costume for performance\nC: formal celebration\nD: playing rugby\nE: for safety", "question": "why do the people wear headgear", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: posing for photos\nB: common costume for performance\nC: formal celebration\nD: playing rugby\nE: for safety\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_53_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: dancing with the girl\nB: mimicking the tv show\nC: perform for the audience\nD: talking\nE: express excited", "question": "why is there a woman dancing and moving along next to the car", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: dancing with the girl\nB: mimicking the tv show\nC: perform for the audience\nD: talking\nE: express excited\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_54_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: studio\nB: beach\nC: living area\nD: home\nE: garden", "question": "where are the people hanging out", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: studio\nB: beach\nC: living area\nD: home\nE: garden\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_55_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: caress\nB: place the dog near obstacle\nC: tie it to a pole\nD: carry baby to chase it\nE: biting its tail", "question": "how does the man in black vest correct the track of the white dog at the end", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: caress\nB: place the dog near obstacle\nC: tie it to a pole\nD: carry baby to chase it\nE: biting its tail\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_56_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: asking more food\nB: adjusting its leash\nC: to see who pet it\nD: playing ball games\nE: excited to change his attire", "question": "why was the dog looking upwards in the middle", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: asking more food\nB: adjusting its leash\nC: to see who pet it\nD: playing ball games\nE: excited to change his attire\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_57_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: play with dog\nB: turn away\nC: run to the other side\nD: lick person s hand\nE: look its right", "question": "what does the dog do after getting the twig", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: play with dog\nB: turn away\nC: run to the other side\nD: lick person s hand\nE: look its right\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_58_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: parent offspring\nB: father daughter\nC: husband wife\nD: family member\nE: trainer trainee", "question": "what is the relationship between the man and lady", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: parent offspring\nB: father daughter\nC: husband wife\nD: family member\nE: trainer trainee\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_3.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_59_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: pen and paper\nB: phone\nC: camera\nD: computer\nE: tablet", "question": "how does the person in orange helmet record the activity", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: pen and paper\nB: phone\nC: camera\nD: computer\nE: tablet\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_2.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_60_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: sit in a circle\nB: kneeling\nC: stand behind the baby\nD: touching the controls\nE: standing", "question": "how is the boy positioned on the chair", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: sit in a circle\nB: kneeling\nC: stand behind the baby\nD: touching the controls\nE: standing\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_0.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_61_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: parent offspring\nB: student\nC: teacher\nD: father son\nE: slides", "question": "what are the blue or green things some children are holding on to", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: parent offspring\nB: student\nC: teacher\nD: father son\nE: slides\n", "input_image_path": 
["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_62_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: dog snatched from the person\nB: person placed there\nC: person throw it\nD: bite toy from sofa\nE: bite it under the table", "question": "why did the toy end up in the dog s mouth after the middle part of the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given 
question.\nSelect from the following choices.\nA: dog snatched from the person\nB: person placed there\nC: person throw it\nD: bite toy from sofa\nE: bite it under the table\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_63_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: watching television\nB: playing\nC: eating\nD: huggging the dog\nE: swimming", "question": "why is the boy in blue sitting 
down on the lady in red s lap", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: watching television\nB: playing\nC: eating\nD: huggging the dog\nE: swimming\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_64_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: play with the toy\nB: pick up the 
ball\nC: pick up hat\nD: run around the table\nE: dance", "question": "what did the lady in pink do after the man in white missed the ball", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: play with the toy\nB: pick up the ball\nC: pick up hat\nD: run around the table\nE: dance\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_65_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": 
"casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: playing with balloon\nB: facing the baby forward\nC: lets the baby hold her fingers\nD: using strap\nE: hands support bum and back", "question": "how does the lady in blue carry the child at the beginning", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: playing with balloon\nB: facing the baby forward\nC: lets the baby hold her fingers\nD: using strap\nE: hands support bum and back\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_66_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: ball dinner\nB: in hospital\nC: keep warm\nD: fencing\nE: scuba diving", "question": "why are the two people wearing something to cover their face", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: ball dinner\nB: in hospital\nC: keep warm\nD: fencing\nE: scuba diving\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_67_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: for balance while skating\nB: dancing to rhythm\nC: play game\nD: expressive\nE: playing the drum", "question": "why was the boy s arm moving constantly", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: for balance while skating\nB: dancing to rhythm\nC: play game\nD: expressive\nE: playing the drum\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_68_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: grab it\nB: bite it\nC: kick it\nD: push it away\nE: pass to adult", "question": "how does the baby in purple interact with the red toy", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: grab it\nB: bite it\nC: kick it\nD: push it away\nE: pass to adult\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_69_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: toy car\nB: clothes\nC: handbag\nD: glass bottle\nE: guitar", "question": "what does the lady in black on the sofa hold in her hands", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: toy car\nB: clothes\nC: handbag\nD: glass bottle\nE: guitar\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_70_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: use hand gestures to demonstrate\nB: point towards video\nC: keep clapping\nD: playing with her hair\nE: use laser pointer", "question": "how does the girl use body language to demonstrate what she is saying", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: use hand gestures to demonstrate\nB: point towards video\nC: keep clapping\nD: playing with her hair\nE: use laser pointer\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_71_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: three\nB: two\nC: one\nD: four\nE: five", "question": "how many cats are there", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: three\nB: two\nC: one\nD: four\nE: five\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_72_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: boundary\nB: play with snow\nC: experimenting with chemicals\nD: help patients inject needles\nE: cleaning the floor", "question": "why do the people in headgear move on the white surface", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: boundary\nB: play with snow\nC: experimenting with chemicals\nD: help patients inject needles\nE: cleaning the floor\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_4.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_73_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: to prevent injuries\nB: signal end of performance\nC: let boy in black try\nD: to reset the match\nE: separated by trainer", "question": "why do the people in the headgear adjust their positions after the person in red hits the other person with the sword", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: to prevent injuries\nB: signal end of performance\nC: let boy in black try\nD: to reset the match\nE: separated by trainer\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_1.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_74_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: keep baby from falling\nB: engage lady \nC: tease her\nD: place something on windowsill\nE: take a gift", "question": "why did a lady in purple walk in after the lady carried the baby up", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: keep baby from falling\nB: engage lady \nC: tease her\nD: place something on windowsill\nE: take a gift\n", "input_image_path": 
["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_75_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: pet it\nB: play ball games\nC: remove the leash\nD: sniff the dog\nE: carry it", "question": "how does the man in black interact with the dog at the end", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: pet it\nB: 
play ball games\nC: remove the leash\nD: sniff the dog\nE: carry it\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_76_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: go to lady in stripes\nB: unwrap something\nC: adjust the girl s jacket\nD: shuffle cards\nE: play guitar", "question": "what does the lady in white on the floor do as everyone was sitting around", "context": "You are given 16 
images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: go to lady in stripes\nB: unwrap something\nC: adjust the girl s jacket\nD: shuffle cards\nE: play guitar\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_77_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: trainer trainee\nB: couple\nC: offspring\nD: husband and wife\nE: 
father and daughter", "question": "what is the relationship between the man and the woman", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: trainer trainee\nB: couple\nC: offspring\nD: husband and wife\nE: father and daughter\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_78_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", 
"source": "next_qa", "options": "A: office\nB: house\nC: train\nD: front porch\nE: park", "question": "where is this video taken", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: office\nB: house\nC: train\nD: front porch\nE: park\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_79_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural 
images", "source": "next_qa", "options": "A: working\nB: choreography\nC: dancing\nD: part of home decoration\nE: clean faster", "question": "why does the woman brush the same utensil", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: working\nB: choreography\nC: dancing\nD: part of home decoration\nE: clean faster\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_80_15.jpg"], "output": "E", "qwen3-vl": "image none"}, 
{"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: pick up toy\nB: part of the play\nC: moving baby s hands\nD: feeding birds\nE: play music", "question": "why is there a hand stretched out", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: pick up toy\nB: part of the play\nC: moving baby s hands\nD: feeding birds\nE: play music\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_81_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: attack with sword\nB: move out of fencing area\nC: fell down\nD: threw sword away\nE: move in front", "question": "what did the fencer in black do when the other fencer moved forward to him at the end", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: attack with sword\nB: move out of fencing area\nC: fell down\nD: threw sword away\nE: move in front\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_82_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: look at someone else s phone\nB: play piano\nC: dancing\nD: pointing to the tiger\nE: drink coffee", "question": "what was the man doing as the lady in blue covered her face", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: look at someone else s phone\nB: play piano\nC: dancing\nD: pointing to the tiger\nE: drink coffee\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_83_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: distracted\nB: get floats\nC: threw the ball\nD: called by man\nE: to get out of pool", "question": "why did the lady in purple walk away from the babies near the end of the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: distracted\nB: get floats\nC: threw the ball\nD: called by man\nE: to get out of pool\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_84_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: medical procedure\nB: making a model\nC: climbing rocky mountains\nD: protect baby from getting sick\nE: playing rugby", "question": "why are the people wearing gloves", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: medical procedure\nB: making a model\nC: climbing rocky mountains\nD: protect baby from getting sick\nE: playing rugby\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_85_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: in a circle\nB: using his hands to unscrew the cap\nC: next to baby\nD: hold the sides of the phone\nE: above his head", "question": "how did the person at the end with the camera hold the umbrella", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: in a circle\nB: using his hands to unscrew the cap\nC: next to baby\nD: hold the sides of the phone\nE: above his head\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_4.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_86_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: smiling\nB: laughing\nC: crying\nD: disgusted\nE: itchy and uncomfortable", "question": "how is the boy in blue expressing himself", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: smiling\nB: laughing\nC: crying\nD: disgusted\nE: itchy and uncomfortable\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_3.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_87_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: bring girl closer to the tree\nB: follow her instructions to sit\nC: close the door\nD: walked away\nE: blow candle", "question": "what does the white hair man do after picking the girl up", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: bring girl closer to the tree\nB: follow her instructions to sit\nC: close the door\nD: walked away\nE: blow candle\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_1.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_88_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: not used to paddle\nB: lady was jumping\nC: training ground\nD: comfortable\nE: mimic movement", "question": "why are the three children in front of the lady in brown not able to balance on the surface near the start", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: not used to paddle\nB: lady was jumping\nC: training ground\nD: comfortable\nE: mimic movement\n", "input_image_path": 
["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_89_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: 2 boys\nB: phone\nC: bottle\nD: stick\nE: cosplay", "question": "what is the person shown on screen holding throughout the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: 2 boys\nB: phone\nC: bottle\nD: 
stick\nE: cosplay\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_90_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: laughing\nB: to pick the girl up\nC: pick up a toy at the side\nD: microphone too short\nE: put phone", "question": "why did the man bend and lower his head to the bed", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given 
question.\nSelect from the following choices.\nA: laughing\nB: to pick the girl up\nC: pick up a toy at the side\nD: microphone too short\nE: put phone\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_91_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: forest\nB: bedroom\nC: dining room\nD: by a stream\nE: backyard", "question": "where are the man and the baby hanging out", "context": "You are 
given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: forest\nB: bedroom\nC: dining room\nD: by a stream\nE: backyard\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_92_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: adjust his grip\nB: to take something\nC: to help girl\nD: pass bag to lady\nE: to hold hands", 
"question": "why does the man remove one of his hands from the handler in the middle", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: adjust his grip\nB: to take something\nC: to help girl\nD: pass bag to lady\nE: to hold hands\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_93_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", 
"source": "next_qa", "options": "A: performance ended\nB: throw down drinks\nC: distracting others\nD: pick up something\nE: to come down", "question": "why did the the woman with apron bend down after moving to the left of the stage", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: performance ended\nB: throw down drinks\nC: distracting others\nD: pick up something\nE: to come down\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_94_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: on table\nB: taps his leg\nC: press the yellow button\nD: show lady\nE: on bottle", "question": "where did the boy put his right hand after he took it out from his mouth", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: on table\nB: taps his leg\nC: press the yellow button\nD: show lady\nE: on bottle\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_95_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: looked at camera\nB: sit on man and play toy\nC: direct baby away\nD: look back at her\nE: takes the spoon away", "question": "what does the baby do after the lady changes the direction of the toy car in the middle of the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: looked at camera\nB: sit on man and play toy\nC: direct baby away\nD: look back at her\nE: takes the spoon away\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_96_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: touch nose\nB: flip\nC: roll around\nD: touch the camera\nE: cycle towards the slope", "question": "what does the boy do after rolling over in the middle", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: touch nose\nB: flip\nC: roll around\nD: touch the camera\nE: cycle towards the slope\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_97_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: walk away\nB: thumbs up\nC: put down her club\nD: applying cream on face\nE: caressing for the dog", "question": "what did the lady do while turning back", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: walk away\nB: thumbs up\nC: put down her club\nD: applying cream on face\nE: caressing for the dog\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_98_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: playing with baby\nB: look at scenery\nC: carry girl\nD: tap on screen\nE: look at lady in hoodie", "question": "why did the lady slow down when she reached the top", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: playing with baby\nB: look at scenery\nC: carry girl\nD: tap on screen\nE: look at lady in hoodie\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_3.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_99_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: acknowledge something\nB: head was uncomfortable\nC: try to sing to beat\nD: posing for the camera\nE: produce higher vioce", "question": "why did the boy nod his head when singing", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: acknowledge something\nB: head was uncomfortable\nC: try to sing to beat\nD: posing for the camera\nE: produce higher vioce\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_1.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_100_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: hod in hand\nB: beside the pink toy\nC: in baby s hand\nD: table\nE: chair", "question": "where did the girl put her phone as she kissed it", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: hod in hand\nB: beside the pink toy\nC: in baby s hand\nD: table\nE: chair\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_0.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_101_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: remove the paper too\nB: look around her\nC: drink water\nD: walk away\nE: point at the music script", "question": "what did the lady in polka dress do after she talked to the person in front of her", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: remove the paper too\nB: look around her\nC: 
drink water\nD: walk away\nE: point at the music script\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_102_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: karaoke room\nB: in the middle of the sea\nC: skate park\nD: house\nE: basketball court", "question": "where is this happening", "context": "You are given 16 images of sequential occurrences, examine the details and answer the 
given question.\nSelect from the following choices.\nA: karaoke room\nB: in the middle of the sea\nC: skate park\nD: house\nE: basketball court\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_103_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: push trolley\nB: push the back of car\nC: goes to play the piano\nD: smiles\nE: stroke cat", "question": "what did the lady in purple do 
after she touched the baby s head", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: push trolley\nB: push the back of car\nC: goes to play the piano\nD: smiles\nE: stroke cat\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_104_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: 
on a baby pram\nB: on a toy\nC: on a walker\nD: baby stroller\nE: on a sled", "question": "how is the baby moved around", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: on a baby pram\nB: on a toy\nC: on a walker\nD: baby stroller\nE: on a sled\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_105_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", 
"visual_input_component": "16 natural images", "source": "next_qa", "options": "A: keep warm\nB: protect from sun\nC: goes well with attire\nD: sunny whether\nE: it s cold", "question": "why did the old man wear jacket and hat outdoors", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: keep warm\nB: protect from sun\nC: goes well with attire\nD: sunny whether\nE: it s cold\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_106_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: follow direction of man s leg\nB: there is cushion behind\nC: observe\nD: another baby is in front\nE: relaxing", "question": "why does the baby lean back as the man pulls the sled", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: follow direction of man s leg\nB: there is cushion behind\nC: observe\nD: another baby is in front\nE: relaxing\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_107_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: change something on screen\nB: swing cloth\nC: switch positions with other man\nD: taking a break\nE: looking at baby", "question": "why does the man stop for a while in the middle of the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: change something on screen\nB: swing cloth\nC: switch positions with other man\nD: taking a break\nE: looking at baby\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_108_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: want to hug child\nB: dancing\nC: doing squats\nD: want to hug dog\nE: stretching his arms", "question": "why did the adult squat down and opened his arm at the end of the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: want to hug child\nB: dancing\nC: doing squats\nD: want to hug dog\nE: stretching his arms\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_109_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: ask cameraman to move\nB: testing the microphone\nC: giving presentation\nD: make voice louder\nE: to capture audience attention", "question": "why does the man in white talk on the microphone towards the end", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: ask cameraman to move\nB: testing the microphone\nC: giving presentation\nD: make voice louder\nE: to capture audience attention\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_110_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: push fish s down\nB: touch his cap\nC: unlocking door\nD: dancing\nE: fist bump with woman", "question": "why did the man raise his hand up in a punch near the end of the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: push fish s down\nB: touch his cap\nC: unlocking door\nD: dancing\nE: fist bump with woman\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_3.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_111_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: raise their heads\nB: adjust her sleeves\nC: watching\nD: looking at girl in pink\nE: continue walking backwards", "question": "what did the lady in green do after bending down to laugh in the middle", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: raise their heads\nB: adjust her sleeves\nC: watching\nD: looking at girl in pink\nE: continue walking backwards\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_1.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_112_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: steps onto it\nB: get up and move away\nC: went forwards and backwards\nD: pull up her wedding dress\nE: using its legs", "question": "how did the lady move herself into the house", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: steps onto it\nB: get up and move away\nC: went forwards and backwards\nD: pull up her wedding dress\nE: using its legs\n", "input_image_path": 
["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_113_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: practicing dance\nB: dancing\nC: show connection\nD: funny\nE: controller for dancing game", "question": "why were both of them smilinglaughing when they started dancing", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the 
following choices.\nA: practicing dance\nB: dancing\nC: show connection\nD: funny\nE: controller for dancing game\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_114_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: roll around\nB: hold man s hand\nC: eating icecream\nD: kiss rabbit\nE: water the plants", "question": "what is the boy doing while rabbit is eating grass", "context": 
"You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: roll around\nB: hold man s hand\nC: eating icecream\nD: kiss rabbit\nE: water the plants\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_115_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: wipe with towel\nB: rubs it away\nC: snowmobile 
dig through\nD: shake his body\nE: spread arms out", "question": "how does the boy in front gets rid of the snow", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: wipe with towel\nB: rubs it away\nC: snowmobile dig through\nD: shake his body\nE: spread arms out\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_116_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": 
"casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: annoyed\nB: serious\nC: happy\nD: blessed\nE: disappointed", "question": "how does the girl feel while talking and demonstrating in front of the camera", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: annoyed\nB: serious\nC: happy\nD: blessed\nE: disappointed\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_117_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: food stains\nB: lightning streaks\nC: scratches\nD: stickers\nE: roses", "question": "what is there on the car", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: food stains\nB: lightning streaks\nC: scratches\nD: stickers\nE: roses\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_118_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: mechanism stop moving\nB: got earmuff\nC: leave the room\nD: man move his hand\nE: listen to the sound", "question": "why does the person in the pink hat move his hand from his ear after the mechanism hits the bell the first time", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: mechanism stop moving\nB: got earmuff\nC: leave the room\nD: man move his hand\nE: listen to the sound\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_119_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: sunny weather\nB: raining\nC: keep warm\nD: protect from cold temperature\nE: cold", "question": "why are the people dressed in raincoats", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: sunny weather\nB: raining\nC: keep warm\nD: protect from cold temperature\nE: cold\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_120_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: throw a toy\nB: point at her\nC: raise his hands\nD: clap his hand\nE: snap his fingers", "question": "how does the man signal for the girl to stand up after she falls", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: throw a toy\nB: point at her\nC: raise his hands\nD: clap his hand\nE: snap his fingers\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_121_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: stretch\nB: push the baby\nC: tie shoelaces\nD: take balloon away from baby\nE: playing games", "question": "why did the lady in purple bend down at the end of the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: stretch\nB: push the baby\nC: tie shoelaces\nD: take balloon away from baby\nE: playing games\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_122_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: one\nB: nine\nC: two\nD: four\nE: three", "question": "how many dogs are there", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: one\nB: nine\nC: two\nD: four\nE: three\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_123_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: drop cloth\nB: touch the cloth\nC: throw away the blue toy\nD: hit baby in grey with toy\nE: took another toy", "question": "what does the baby in purple do after looking at the toy for a while in the middle", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: drop cloth\nB: touch the cloth\nC: throw away the blue toy\nD: hit baby in grey with toy\nE: took another toy\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_2.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_124_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: oversee\nB: watch video\nC: the person with socks walked past\nD: support baby\nE: wants to play with baby", "question": "why does the man keep staring at the boy throughout the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: oversee\nB: watch video\nC: the person with socks walked past\nD: support baby\nE: wants to play with baby\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_0.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_125_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: gently from the pillow\nB: from man s shoulder\nC: from another bed beside\nD: thrown by woman\nE: jump from sofa", "question": "how did the girl crash on to the bed", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: gently from the pillow\nB: from man s shoulder\nC: from another bed beside\nD: 
thrown by woman\nE: jump from sofa\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_126_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: play in the sea\nB: catch the branch\nC: playing\nD: playing fetch\nE: the dog bit her hand", "question": "why does the dog run after the twig when the lady throws it", "context": "You are given 16 images of sequential occurrences, examine the 
details and answer the given question.\nSelect from the following choices.\nA: play in the sea\nB: catch the branch\nC: playing\nD: playing fetch\nE: the dog bit her hand\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_127_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: shake a toy\nB: bib\nC: tickle the baby\nD: wear helmet\nE: use leg to support", "question": "how does the lady 
prevent the child from falling after putting the child on the ground", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: shake a toy\nB: bib\nC: tickle the baby\nD: wear helmet\nE: use leg to support\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_128_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": 
"next_qa", "options": "A: run\nB: playing the drums\nC: aid in his layering work\nD: sitting and listening to man speaking\nE: places pan back on stove", "question": "what did the man on the left do after the other man filled the ground with the liquid", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: run\nB: playing the drums\nC: aid in his layering work\nD: sitting and listening to man speaking\nE: places pan back on stove\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_129_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: four\nB: five\nC: one\nD: eight\nE: three", "question": "how many people are involve din the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: four\nB: five\nC: one\nD: eight\nE: three\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_130_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: walk behind him\nB: hit it with a toy\nC: strolls around\nD: holding hand\nE: remote control", "question": "how did the boy showed that he was unstable while rollerblading", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: walk behind him\nB: hit it with a toy\nC: strolls around\nD: holding hand\nE: remote control\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_131_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: she wanted to feed the birds\nB: part of dance\nC: wanted to touch the tree\nD: touch adult\nE: pick up toy", "question": "why did the girl stretch out her hand", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: she wanted to feed the birds\nB: part of dance\nC: wanted to touch the tree\nD: touch adult\nE: pick up toy\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_132_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: bring baby away from woman\nB: for boy to pull sled\nC: pass it to boy\nD: help baby sledge\nE: show to other people", "question": "why does the man pull the baby sitting on the sled by the rope", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: bring baby away from woman\nB: for boy to pull sled\nC: pass it to boy\nD: help baby sledge\nE: show to other people\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_133_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: drove at full speed\nB: wheelers are moving fast\nC: slope\nD: sleigh is brand new\nE: pushed from behind", "question": "why is the sleigh able to go through the snow bump", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: drove at full speed\nB: wheelers are moving fast\nC: slope\nD: sleigh is brand new\nE: pushed from behind\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_134_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: play with baby\nB: hiking outside\nC: watching the crane\nD: emcee\nE: talking to cameraman", "question": "why is the lady wearing a white cap", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: play with baby\nB: hiking outside\nC: watching the crane\nD: emcee\nE: talking to cameraman\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_3.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_135_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: car accident\nB: snow\nC: shops mostly closed\nD: car museum\nE: parked", "question": "why is the cars in the street not moving", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: car accident\nB: snow\nC: shops mostly closed\nD: car museum\nE: parked\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_2.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_136_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: leave the room\nB: put on earmuff\nC: press sides of hat\nD: stop mechanism\nE: stand futher away", "question": "how does the man in pink hat show that he thinks the bell is noisy at the beginning", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: leave the room\nB: put on earmuff\nC: press sides of hat\nD: stop mechanism\nE: stand futher away\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_0.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_137_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: not experienced at swimming\nB: feed baby\nC: looking for something\nD: drinking\nE: to touch water", "question": "why is the boy playing with a water bottle", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: not experienced at swimming\nB: feed baby\nC: looking for something\nD: drinking\nE: to 
touch water\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_138_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: pick up toy\nB: to apply the cream\nC: to clean his hand\nD: to eat\nE: scratch his mouth", "question": "why does the baby keep putting his hand in his mouth", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given 
question.\nSelect from the following choices.\nA: pick up toy\nB: to apply the cream\nC: to clean his hand\nD: to eat\nE: scratch his mouth\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_139_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: playing around\nB: wash his hands\nC: to balance\nD: dancing\nE: to play the guitar", "question": "why did the boy rollerblading hold tightly 
the lady s hand", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: playing around\nB: wash his hands\nC: to balance\nD: dancing\nE: to play the guitar\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_140_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: jumping around rabbit\nB: 
caress its ears\nC: feed carrot\nD: kiss it\nE: chasing the rabbit", "question": "how does the boy show affection to the rabbit", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: jumping around rabbit\nB: caress its ears\nC: feed carrot\nD: kiss it\nE: chasing the rabbit\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_141_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": 
"casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: her partner talks to her\nB: not to hurt the plant\nC: trying out poses\nD: stable herself before stepping down\nE: to let the dog catch up", "question": "why did the lady turn her head to the left after walking for a while", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: her partner talks to her\nB: not to hurt the plant\nC: trying out poses\nD: stable herself before stepping down\nE: to let the dog catch up\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_142_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: field\nB: park\nC: bedroom\nD: sheltered area\nE: construction site", "question": "where is this video taken", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: field\nB: park\nC: bedroom\nD: sheltered area\nE: construction site\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_143_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: ran in the same direction\nB: he listen and redo dancing steps\nC: singing away from the microphone\nD: dance\nE: touch his head", "question": "what did the man do after the lady made an angry gesture in the middle of the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: ran in the same direction\nB: he listen and redo dancing steps\nC: singing away from the microphone\nD: dance\nE: touch his head\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_144_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: five\nB: thirteen\nC: eight\nD: three\nE: four", "question": "how many people can be seen in the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: five\nB: thirteen\nC: eight\nD: three\nE: four\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_145_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: adjusting it\nB: make hand gestures\nC: waiting for her turn to dance\nD: shout out to the other vocalists\nE: talking to the crowd", "question": "why did the lady with curly hair hold onto her microphone at the end", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: adjusting it\nB: make hand gestures\nC: waiting for her turn to dance\nD: shout out to the other vocalists\nE: talking to the crowd\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_146_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: fly\nB: use beak to pull itself\nC: walk on the ground\nD: skip\nE: roll", "question": "how does the nearest parrot move across the cage", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: fly\nB: use beak to pull itself\nC: walk on the ground\nD: skip\nE: roll\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_4.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_147_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: performers costume\nB: photoshoot\nC: outfit for fencing\nD: cooking competition\nE: lab experiment", "question": "why do both the player wear white costume", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: performers costume\nB: photoshoot\nC: outfit for fencing\nD: cooking competition\nE: lab experiment\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_2.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_148_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: point to his left\nB: tries to walk\nC: drink from bottle\nD: bring to table\nE: talking", "question": "what does the boy do after putting the bottle flat on his mouth for a while at the start", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: point to his left\nB: tries to walk\nC: drink from bottle\nD: bring to table\nE: talking\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_0.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_149_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: run to the brown dog\nB: ran back into the cage\nC: pick it up\nD: retreat\nE: run away", "question": "what did the white dog do when the brown dog turned back to run near the middle", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: run to the brown dog\nB: ran back into the cage\nC: pick it 
up\nD: retreat\nE: run away\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_150_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: indoor\nB: kitchen\nC: dance studio\nD: zoo\nE: on the pavement", "question": "where is the girl cycling", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following 
choices.\nA: indoor\nB: kitchen\nC: dance studio\nD: zoo\nE: on the pavement\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_151_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: to stop the recording\nB: they are practicing\nC: dancing\nD: distracted by dog moving\nE: playing game on mobile", "question": "why do the men stop playing in between", "context": "You are given 16 images 
of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: to stop the recording\nB: they are practicing\nC: dancing\nD: distracted by dog moving\nE: playing game on mobile\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_152_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: nod her head\nB: raise her hands\nC: swing her 
hands right and left\nD: write notes\nE: clap her hands", "question": "how does the woman with a red lanyard signal that she is paying attention in the middle of the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: nod her head\nB: raise her hands\nC: swing her hands right and left\nD: write notes\nE: clap her hands\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_153_15.jpg"], "output": "A", 
"qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: sea\nB: swimming pool\nC: outdoor\nD: lake\nE: roadside", "question": "where is this video taken", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: sea\nB: swimming pool\nC: outdoor\nD: lake\nE: roadside\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_154_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: table\nB: on his lap\nC: on the cube\nD: face\nE: crossing in front", "question": "where did the man in the video put his right hand most of the time", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: table\nB: on his lap\nC: on the cube\nD: face\nE: crossing in front\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_155_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: pick up the seatbelt\nB: clean her face\nC: pass something to the man sitting\nD: row using hands\nE: turns head towards the lady", "question": "what did the robot do after the woman turned herself towards it", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: pick up the seatbelt\nB: clean her face\nC: pass something to the man sitting\nD: row using hands\nE: turns head towards the lady\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_156_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: three\nB: six\nC: four\nD: five\nE: eight", "question": "how many cars are parked beside the street", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: three\nB: six\nC: four\nD: five\nE: eight\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_157_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: walk\nB: put up her finger\nC: fell\nD: looks forward\nE: go back up", "question": "what happens to the girl after walking backwards", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: walk\nB: put up her finger\nC: fell\nD: looks forward\nE: go back up\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_158_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: change colour of the frames\nB: adjust something\nC: show the frame to the ladies\nD: bored\nE: asking for their opinions", "question": "why did the lady in flower shirt turned the frame to face the two ladies in black", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: change colour of the frames\nB: adjust something\nC: show the frame to the ladies\nD: bored\nE: asking for their opinions\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_159_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: bite back\nB: follow him\nC: run away\nD: plays with the black dog\nE: turn around", "question": "what does the light brown dog do after the dark brown dog turned around to face the other direction in the middle", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: bite back\nB: follow him\nC: run away\nD: plays with the black dog\nE: turn around\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_3.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_160_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: the dog rolled it over\nB: dropped from the bicycle\nC: a lady threw it\nD: thrown by man\nE: the kids brought it out", "question": "why was there a ball on the grass nearing the end", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: the dog rolled it over\nB: dropped from the bicycle\nC: a lady threw it\nD: thrown by man\nE: the kids brought it out\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_1.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_161_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: look at bird\nB: find food\nC: hold for support\nD: try to intimidate dog\nE: show the camera", "question": "why does the nearest parrot bite the cage before moving along it", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: look at bird\nB: find food\nC: hold for support\nD: try to intimidate dog\nE: show the camera\n", "input_image_path": 
["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_162_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: uncomfortable position\nB: playing with baby\nC: crying\nD: stopped from coming out\nE: not able to balance himself", "question": "why does the baby constantly lean its head backwards while being carried", "context": "You are given 16 images of sequential occurrences, examine the details and answer the 
given question.\nSelect from the following choices.\nA: uncomfortable position\nB: playing with baby\nC: crying\nD: stopped from coming out\nE: not able to balance himself\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_163_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: rides down on a bike\nB: using sled\nC: held the edge of disc\nD: swing it by the tag\nE: walked down", 
"question": "how does the man in dark green go down the slope", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: rides down on a bike\nB: using sled\nC: held the edge of disc\nD: swing it by the tag\nE: walked down\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_164_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural 
images", "source": "next_qa", "options": "A: direct others attention\nB: take more yoghurt\nC: wanted to play\nD: ask for more water\nE: pose", "question": "why did the baby raise his hand and smile after eating some noodles", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: direct others attention\nB: take more yoghurt\nC: wanted to play\nD: ask for more water\nE: pose\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_165_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: pushed her out\nB: removed her seat belt\nC: hold her dress\nD: hold her hands\nE: lifted her off the ground", "question": "how did the man help the lady get out of the car", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: pushed her out\nB: removed her seat belt\nC: hold her dress\nD: hold her hands\nE: lifted her off the ground\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_166_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: clean it\nB: admiring\nC: playing\nD: control the handle grip\nE: showing it to lady", "question": "why is the person scrubbing the utensil", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: clean it\nB: admiring\nC: playing\nD: control the handle grip\nE: showing it to lady\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_167_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: look at flowers\nB: chase after ball\nC: swing again\nD: look at the grass\nE: chases after the lady", "question": "what did the brown dog do when the ball was thrown the second time", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: look at flowers\nB: chase after ball\nC: swing again\nD: look at the grass\nE: chases after the lady\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_168_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: fighting with other man\nB: talk to the baby\nC: ties shoelaces\nD: falls\nE: adjust his board", "question": "why does the man end up on the ground at the end", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: fighting with other man\nB: talk to the baby\nC: ties shoelaces\nD: falls\nE: adjust his board\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_169_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: think there s more food\nB: play with baby\nC: biting finger\nD: inexperienced to use fork\nE: suck juice from fingers", "question": "why did the baby feed himself with his hands", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: think there s more food\nB: play with baby\nC: biting finger\nD: inexperienced to use fork\nE: suck juice from fingers\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_4.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_170_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: move his feet\nB: eat food\nC: clap hands\nD: drink water\nE: sleeping", "question": "what does the baby do while on the seat", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: move his feet\nB: eat food\nC: clap hands\nD: drink water\nE: sleeping\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_3.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_171_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: shift the wooden thing\nB: bring to the person\nC: throw the ball\nD: lays down\nE: jump towards it", "question": "what does the person do after picking up the ball in front of the brown dog in the middle", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: shift the wooden thing\nB: bring to the person\nC: throw the ball\nD: lays down\nE: jump towards it\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_1.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_172_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: looking at cat\nB: drinking water\nC: practicing crawling\nD: to reach the sand\nE: reading", "question": "why is the boy lying on the floor at the start", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: looking at cat\nB: drinking water\nC: practicing crawling\nD: to reach the sand\nE: reading\n", "input_image_path": 
["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_173_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: left and right\nB: move his arms\nC: skipping and raising legs\nD: move up and down\nE: man push his swing", "question": "how is the man dancing in the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following 
choices.\nA: left and right\nB: move his arms\nC: skipping and raising legs\nD: move up and down\nE: man push his swing\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_174_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: the man\nB: cat\nC: dog\nD: baby\nE: lady in pink", "question": "who is holding the camera", "context": "You are given 16 images of sequential occurrences, 
examine the details and answer the given question.\nSelect from the following choices.\nA: the man\nB: cat\nC: dog\nD: baby\nE: lady in pink\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_175_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: hold on to rope\nB: lie on the floating board\nC: hold the poles at the side\nD: use swimming float\nE: hold adult", "question": "how did the 
babies support themselves as they learnt to swim", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: hold on to rope\nB: lie on the floating board\nC: hold the poles at the side\nD: use swimming float\nE: hold adult\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_176_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural 
images", "source": "next_qa", "options": "A: woman is playing with baby\nB: see his reaction\nC: ensure him not fall\nD: they are watching over him\nE: see something", "question": "why did the lady keep looking at the boy", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: woman is playing with baby\nB: see his reaction\nC: ensure him not fall\nD: they are watching over him\nE: see something\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_177_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: rubbed his nose\nB: looking at a book\nC: he cried\nD: laughing\nE: he stared at the other man", "question": "how did the man react when the lady put her hand on his shoulder", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: rubbed his nose\nB: looking at a book\nC: he cried\nD: laughing\nE: he stared at the other man\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_178_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: making a call\nB: taking photo\nC: gesturing\nD: posing for camera\nE: acting", "question": "why did the bald man hold his phone up", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: making a call\nB: taking photo\nC: gesturing\nD: posing for camera\nE: acting\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_179_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: throwing grass\nB: exercise\nC: pose for photo\nD: point at something\nE: throw the ball", "question": "why did the boy stretch his right arm out at the start of the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: throwing grass\nB: exercise\nC: pose for photo\nD: point at something\nE: throw the ball\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_180_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: she knock the car\nB: help to move the car\nC: nod her head\nD: give thumbs up\nE: hold the car wheels", "question": "what did the lady in pink do after the lady in blue pushed the car", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: she knock the car\nB: help to move the car\nC: nod her head\nD: give thumbs up\nE: hold the car wheels\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_181_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: posing with the toy\nB: role playing\nC: trying to look cool\nD: play with baby\nE: pass to adult", "question": "why does the boy in black hold the red toy up", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: posing with the toy\nB: role playing\nC: trying to look cool\nD: play with baby\nE: pass to adult\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_182_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: eating dinner\nB: playing with toy on table\nC: drinking water\nD: resting\nE: watchign television", "question": "why is the boy sitting down", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: eating dinner\nB: playing with toy on table\nC: drinking water\nD: resting\nE: watchign television\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_3.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_183_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: hug adult\nB: stands on the table\nC: look at girl in white\nD: push girl\nE: cover her face", "question": "what did the lady in blue do after putting her phone down at the start", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: hug adult\nB: stands on the table\nC: look at girl in white\nD: push girl\nE: cover her face\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_1.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_184_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: greeting\nB: call waiter\nC: to direct traffic\nD: teach her piano\nE: showing off flag", "question": "why did the man and lady wave their hands in the air", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: greeting\nB: call waiter\nC: to direct traffic\nD: teach her piano\nE: showing off flag\n", "input_image_path": 
["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_185_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: grey\nB: orange\nC: green\nD: white and red\nE: blue", "question": "what is the colour of the lady s bag", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: grey\nB: orange\nC: green\nD: white and 
red\nE: blue\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_186_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: school\nB: restaurant\nC: in house\nD: front porch\nE: museum", "question": "where could this be happening", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: 
school\nB: restaurant\nC: in house\nD: front porch\nE: museum\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_187_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: to see where they are going\nB: look out for cars\nC: to dodge baby hitting her\nD: see who is behind them\nE: curious", "question": "why do the two ladies turn their heads backwards at the start", "context": "You are 
given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: to see where they are going\nB: look out for cars\nC: to dodge baby hitting her\nD: see who is behind them\nE: curious\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_188_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: to move away\nB: chasing\nC: 
play with dog\nD: comfortable\nE: brush the hair away", "question": "why does one cat nudge the other at the beginning", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: to move away\nB: chasing\nC: play with dog\nD: comfortable\nE: brush the hair away\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_189_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", 
"visual_input_component": "16 natural images", "source": "next_qa", "options": "A: minion soft toy\nB: dog\nC: cup\nD: pacifier\nE: flower", "question": "what is the boy holding in his hand while inside the glass", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: minion soft toy\nB: dog\nC: cup\nD: pacifier\nE: flower\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_190_15.jpg"], "output": "A", 
"qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: man is controlling\nB: live video\nC: trying to open the item\nD: talk to each other\nE: dancing", "question": "why is the screen behind the person changing", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: man is controlling\nB: live video\nC: trying to open the item\nD: talk to each other\nE: dancing\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_191_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: start claping\nB: unwrapping present\nC: show baby how to steer wheel\nD: pick up something\nE: touch ball", "question": "what does the lady do after bending down at the start of the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: start claping\nB: unwrapping present\nC: show baby how to steer wheel\nD: pick up something\nE: touch ball\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_192_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: sitting\nB: holding hands\nC: causal\nD: standing\nE: lying on sofa", "question": "how are the people positioned around the table", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: sitting\nB: holding hands\nC: causal\nD: standing\nE: lying on sofa\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_193_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: jump to another place\nB: fly away\nC: jump around\nD: walk towards the camera\nE: sit down on lap", "question": "where did one of the birds go towards the end of the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: jump to another place\nB: fly away\nC: jump around\nD: walk towards the camera\nE: sit down on lap\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_194_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: man feed\nB: wears napkin\nC: lady feed the baby\nD: eats slowly\nE: wears a bib", "question": "how is the food prevented from spilling onto the baby s clothings", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: man feed\nB: wears napkin\nC: lady feed the baby\nD: eats slowly\nE: wears a bib\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_195_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: pose\nB: prevent car from moving\nC: crafting\nD: clean the wheels\nE: change the wheels", "question": "why did the lady in blue shirt hold the car wheels in the middle of the video", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: pose\nB: prevent car from moving\nC: crafting\nD: clean the wheels\nE: change the wheels\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_4.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_196_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: for food\nB: get the toy\nC: man pat sofa\nD: catch the snowball\nE: to fight with cat", "question": "why did the dog jump up very high in the middle", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: for food\nB: get the toy\nC: man pat sofa\nD: catch the snowball\nE: to fight with cat\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_2.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_197_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: raises his hands and smiles\nB: turn to look at the lady\nC: bite it\nD: dips her ladle in\nE: disappointed", "question": "what does the baby do after finishing the noodles at the end", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: raises his hands and smiles\nB: turn to look at the lady\nC: bite it\nD: dips her ladle in\nE: disappointed\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_0.jpg", 
"./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_198_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_next_qa", "visual_input_component": "16 natural images", "source": "next_qa", "options": "A: playing the guitar\nB: graduation ceremony\nC: preview of the living room\nD: sports game\nE: book launch", "question": "what is the video about", "context": "You are given 16 images of sequential occurrences, examine the details and answer the given question.\nSelect from the following choices.\nA: playing the guitar\nB: graduation ceremony\nC: preview of the living room\nD: sports game\nE: book 
launch\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_0.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_1.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_2.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_3.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_4.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_5.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_6.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_7.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_8.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_9.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_10.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_11.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_12.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_13.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_14.jpg", "./High-level-sub-semantic/casuality_reasoning_next_qa/casuality_reasoning_next_qa_199_15.jpg"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/casuality_reasoning_var/qwen3-vl/metadata_info.json b/results/casuality_reasoning_var/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..98ca48e --- /dev/null +++ b/results/casuality_reasoning_var/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": 
"var", "options": "A: An athlete is seen standing up to a circle and leads into him throwing a discuss and his face being shown afterwards.\nB: An athlete is observed sitting in a circle, sharing his experiences about discus throw, with his face filled with pride.\nC: The athlete, after finishing his discuss throw, stood in a circle for a post-game interview, his face beaming with pride.\n\nD: The athlete, previously encircled by fans, gently hands over the discus for autographs before merrily snapping selfies showcasing his radiant smile.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: An athlete is seen standing up to a circle and leads into him throwing a discuss and his face being shown afterwards.\nB: An athlete is observed sitting in a circle, sharing his experiences about discus throw, with his face filled with pride.\nC: The athlete, after finishing his discuss throw, stood in a circle for a post-game interview, his face beaming with pride.\n\nD: The athlete, previously encircled by fans, gently hands over the discus for autographs before merrily snapping selfies showcasing his radiant smile.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_0_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man waters the sections of the vine.\nB: The man admires the growth of the vine.\nC: The man cuts a few parts of the vine.\nD: The man waters the sections of the vine.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man waters the sections of the vine.\nB: The man admires the growth of the vine.\nC: The man cuts a few parts of the vine.\nD: The man waters the sections of the vine.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_1_15.jpg"], "output": "F", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: They spilled pasta from the bowl onto the floor.\nB: They spilled pasta from the bowl all over the floor.\nC: They put pasta in the bowl and stir it around.\nD: They spilled pasta from 
the bowl onto the floor.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: They spilled pasta from the bowl onto the floor.\nB: They spilled pasta from the bowl all over the floor.\nC: They put pasta in the bowl and stir it around.\nD: They spilled pasta from the bowl onto the floor.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_2_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": 
"casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A bartender explains and shows how to prepare exotic alcoholic drinks in glasses using alcohol and juice.\nB: A bartender spills juice and alcohol, ruining the attempt to create exotic drinks in glasses.\nC: A bartender spills alcohol and juice while clumsily attempting to create exotic alcoholic drinks in glasses.\nD: A bartender spills alcohol and juice while clumsily trying to mix exotic drinks in glasses.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A bartender explains and shows how to prepare exotic alcoholic drinks in glasses using alcohol and juice.\nB: A bartender spills juice and alcohol, ruining the attempt to create exotic drinks in glasses.\nC: A bartender spills alcohol and juice while clumsily attempting to create exotic alcoholic drinks in glasses.\nD: A bartender spills alcohol and juice while clumsily trying to mix exotic drinks in glasses.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_3_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Many glance at the camera as they calmly rest their faces and arms, without any frantic movements.\nB: Many speak to the camera while doing activities and continue to rub it all over their faces and arms.\nC: Numerous individuals adjust the camera angle during their activities and proceed to display their faces and arms prominently.\nD: Several engage with the camera during tasks and persistently clean their faces and arms.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Many glance at the camera as they calmly rest their faces and arms, without any frantic movements.\nB: Many speak to the camera while doing activities and continue to rub it all over their faces and arms.\nC: Numerous individuals adjust the camera angle during their activities and proceed to display their faces and arms prominently.\nD: Several engage with the camera during tasks and persistently clean their faces and arms.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_4_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", 
"visual_input_component": "16 natural images", "source": "var", "options": "A: She starts to paint her legs.\nB: She starts to paint her legs.\nC: She starts to paint her legs.\nD: She begins to shave her legs.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She starts to paint her legs.\nB: She starts to paint her legs.\nC: She starts to paint her legs.\nD: She begins to shave her legs.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_5_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A woman is gently brushing her cat on the sofa.\nB: A cat is being held down in a woman's lap.\nC: A woman is gently brushing her cat on her lap.\nD: A woman is stroking a cat on her lap.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A woman is gently brushing her cat on the sofa.\nB: A cat is being held down in a woman's lap.\nC: A woman is gently brushing her cat on her lap.\nD: A woman is stroking a cat on her lap.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_6_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Several more women are depicted painting the same mural, followed by close-up shots of their artwork immediately afterwards.\nB: Several more women are shown making the same jump down the track followed by slow motion shots of their jump immediately afterwards.\nC: Several more women are depicted painting on the canvas, followed by close-up images of their artwork immediately afterwards.\nD: Several more women are depicted painting the same mural on the track, accompanied by slow motion footage of their artistic process immediately afterwards.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Several more women are depicted painting the same mural, followed by close-up shots of their artwork immediately afterwards.\nB: Several more women are shown making the same jump down the track followed by slow motion shots of their jump immediately afterwards.\nC: Several more women are depicted painting on the canvas, followed by close-up images of their artwork immediately afterwards.\nD: Several more women are depicted painting the same mural on the track, accompanied by slow motion footage of their artistic process immediately afterwards.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_7_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A man wearing a chef's hat is seen speaking to the camera and leads into a completed cake made and various ingredients being poured into a bowl.\nB: A man in a chef's hat silently shows a finished cake to the camera, then begins to mix various ingredients in a bowl.\nC: A man in a chef's hat is shown tasting a finished cake, adding ingredients to a bowl, and ignoring the camera.\nD: A man in a chef's hat is seen carefully arranging various ingredients in a bowl, before showcasing a beautifully finished cake to the camera.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man wearing a chef's hat is seen speaking to the camera and leads into a completed cake made and various ingredients being poured into a bowl.\nB: A man in a chef's hat silently shows a finished cake to the camera, then begins to mix various ingredients in a bowl.\nC: A man in a chef's hat is shown tasting a finished cake, adding ingredients to a bowl, and ignoring the camera.\nD: A man in a chef's hat is seen carefully arranging various ingredients in a bowl, before showcasing a beautifully finished cake to the camera.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_3.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_8_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Sentence: A man is adjusting the wall clock by the window and glances at the halfway drawn curtains.\nB: man is walking by a halfway and puth the courtains in the wall by the window.\nC: A man is painting a wall near a window and hangs a picture next to the curtains in the hallway.\nD: The man is fixing the clock on the wall next to the window in the hallway.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Sentence: A man is adjusting the wall clock by the window and glances at the halfway drawn curtains.\nB: man is walking by a halfway and puth the courtains in the wall by the window.\nC: A man is painting a wall near a window and hangs a picture next to the curtains in the hallway.\nD: The man is fixing the clock on the wall next to the window in the hallway.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_9_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": 
"A: A dog is seen running through a yard and performing various frisbee tricks with a woman.\nB: A woman is observed teaching a dog how to paint in a studio.\nC: A woman is observed teaching a dog to sit and stay in a yard.\nD: A woman is seen sitting in a yard, grooming her dog and teaching it obedience commands.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A dog is seen running through a yard and performing various frisbee tricks with a woman.\nB: A woman is observed teaching a dog how to paint in a studio.\nC: A woman is observed teaching a dog to sit and stay in a yard.\nD: A woman is seen sitting in a yard, grooming her dog and teaching it obedience commands.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_10_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She is demonstrating how to unclog the sink with a toothbrush.\nB: She is demonstrating how to paint a sink with a toothbrush.\nC: She is showing how to clean the sink using a toothbrush.\nD: She is teaching how to paint a sink with a toothbrush.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She is demonstrating how to unclog the sink with a toothbrush.\nB: She is demonstrating how to paint a sink with a toothbrush.\nC: She is showing how to clean the sink using a toothbrush.\nD: She is teaching how to paint a sink with a toothbrush.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_11_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man in black is walking away from us.\nB: The man in black turns his back to us.\nC: We see a man in black from the front.\nD: The man in black turned around.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man in black is walking away from us.\nB: The man in black turns his back to us.\nC: We see a man in black from the front.\nD: The man in black turned around.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_12_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Scenes of various pole vaulters vaulting before audiences are shown.\nB: Various pole vaulters are signing autographs for audiences.\nC: Various pole vaulters are signing 
autographs for their audiences.\nD: Pole vaulters are signing autographs for their enthusiastic audiences.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Scenes of various pole vaulters vaulting before audiences are shown.\nB: Various pole vaulters are signing autographs for audiences.\nC: Various pole vaulters are signing autographs for their audiences.\nD: Pole vaulters are signing autographs for their enthusiastic audiences.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_13_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He carefully studies the ball, examining its texture and weight.\nB: He spins around several times with the ball.\nC: He gently tosses the ball back and forth.\nD: He gently sleeps with the ball beside him.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He carefully studies the ball, examining its texture and weight.\nB: He spins around several times with the ball.\nC: He gently tosses the ball back and forth.\nD: He gently sleeps with the ball beside him.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_14_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Bryan McBride, a man known for his calm demeanor, sits pensively in a quiet corner, engrossed in a thick book.\nB: A man named BRYAN MCBRIDE is standing and then begins his high jump where he clears it, lands and vigorously cheers as he runs off.\nC: Bryan McBride, a prominent figure, is seated at a conference, attentively listening and occasionally nodding in agreement.\nD: BRYAN MCBRIDE, a well-known individual, is calmly reading a book, completely absorbed in its captivating plot.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Bryan McBride, a man known for his calm demeanor, sits pensively in a quiet corner, engrossed in a thick book.\nB: A man named BRYAN MCBRIDE is standing and then begins his high jump where he clears it, lands and vigorously cheers as he runs off.\nC: Bryan McBride, a prominent figure, is seated at a conference, attentively listening and occasionally nodding in agreement.\nD: BRYAN MCBRIDE, a well-known individual, is calmly reading a book, completely absorbed in its captivating plot.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_15_15.jpg"], "output": 
"B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man in black is walking away from us.\nB: The man in black turns his back to us.\nC: We see a man in black from the front.\nD: The man in black turned around.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man in black is walking away from us.\nB: The man in black turns his back to us.\nC: We see a man in black from the front.\nD: The man in black turned around.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_16_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She is arguing with two men in a conference room.\nB: She is discussing a business plan with two men at a conference.\nC: She is swiming next to two men in a pool.\nD: She is discussing business with two men in a meeting.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She is arguing with two men in a conference room.\nB: She is discussing a business plan with two men at a conference.\nC: She is swiming next to two men in a pool.\nD: She is discussing business with two men in a meeting.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_17_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Several clips depict people feeding and caring for bulls.\nB: Several clips depict individuals feeding and caring for bulls.\nC: More clips are shown of people taunting bulls.\nD: People are seen in the clips, feeding the bulls.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Several clips depict people feeding and caring for bulls.\nB: Several clips depict individuals feeding and caring for bulls.\nC: More clips are shown of people taunting bulls.\nD: People are seen in the clips, feeding the bulls.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_18_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The individual slips on a pair of gloves, adjusting them before sitting down comfortably.\nB: The person then puts 
a pair of shoes on and tying them and ending by standing up straight.\nC: The person grabs a pair of shoes, tossing them out the window, and then reclines on the couch.\nD: The person quickly slips on a pair of shoes, promptly kicks a ball, and finally assumes a defensive stance.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The individual slips on a pair of gloves, adjusting them before sitting down comfortably.\nB: The person then puts a pair of shoes on and tying them and ending by standing up straight.\nC: The person grabs a pair of shoes, tossing them out the window, and then reclines on the couch.\nD: The person quickly slips on a pair of shoes, promptly kicks a ball, and finally assumes a defensive stance.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_19_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A shirtless man is sunbathing near the pole vault area.\n\nB: A shirtless guy does a pole vault.\nC: A shirtless man lounges by the pool.\nD: A shirtless man is sunbathing by the pool.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A shirtless man is sunbathing near the pole vault area.\n\nB: A shirtless guy does a pole vault.\nC: A shirtless man lounges by the pool.\nD: A shirtless man is sunbathing by the pool.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_20_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A man dressed in a blue and black uniform is standing on top of a mat preparing to do his routine.\nB: A man in a blue and black uniform is sitting on a mat, taking a break from his training.\nC: A man in a blue and black uniform is sitting on a mat, tying his shoelaces.\nD: A man in a blue and black uniform is sitting on a mat, tying his shoelaces.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A man dressed in a blue and black uniform is standing on top of a mat preparing to do his routine.\nB: A man in a blue and black uniform is sitting on a mat, taking a break from his training.\nC: A man in a blue and black uniform is sitting on a mat, tying his shoelaces.\nD: A man in a blue and black uniform is sitting on a mat, tying his shoelaces.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_21_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", 
"options": "A: A large vegetable is being planted and watered.\nB: A large vegetable is being planted and nurtured.\nC: A large vegetable is being planted and watered.\nD: A large vegetable is being peeled and chopped.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A large vegetable is being planted and watered.\nB: A large vegetable is being planted and nurtured.\nC: A large vegetable is being planted and watered.\nD: A large vegetable is being peeled and chopped.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_22_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She pulls out a tissue paper from the bag.\nB: She removes the tissue paper from the bag.\nC: She removes some tissue paper from the bag.\nD: She adds more tissue paper to the bag.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She pulls out a tissue paper from the bag.\nB: She removes the tissue paper from the bag.\nC: She removes some tissue paper from the bag.\nD: She adds more tissue paper to the bag.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_23_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The person removes their shoe, places it under the sunlight to dry, and then puts it back on.\nB: The person then takes their shoe off to run under the water and then put on again.\nC: The person removes their shoe to shake out a pebble before putting it back on.\nD: The person removes their shoe, places it in the sunlight to dry, and then wears it again.\n", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The person removes their shoe, places it under the sunlight to dry, and then puts it back on.\nB: The person then takes their shoe off to run under the water and then put on again.\nC: The person removes their shoe to shake out a pebble before putting it back on.\nD: The person removes their shoe, places it in the sunlight to dry, and then wears it again.\n\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_24_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": 
"var", "options": "A: A person is seen walking in with a tire on a plank and painting the tire.\nB: A person is observed rolling a tire on a plank and polishing it.\nC: A person is spotted using a plank to roll a tire into a recycling facility.\nD: A person is observed rolling a tire on a plank and washing it.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A person is seen walking in with a tire on a plank and painting the tire.\nB: A person is observed rolling a tire on a plank and polishing it.\nC: A person is spotted using a plank to roll a tire into a recycling facility.\nD: A person is observed rolling a tire on a plank and washing it.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_25_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A man is brushing his teeth in front of the camera.\nB: A man is juggling balls for the camera's amusement.\nC: A man is posing for a selfie in front of the camera.\nD: A man is eating an apple in front of the camera.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man is brushing his teeth in front of the camera.\nB: A man is juggling balls for the camera's amusement.\nC: A man is posing for a selfie in front of the camera.\nD: A man is eating an apple in front of the camera.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_26_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The lady squeezes a lemon into oil, combines it with other ingredients, grates something over it, and mixes everything together.\nB: The lady collects the ingredients, including a lemon and oil, writes something on top of the recipe card, and rates it before tidying up.\nC: The lady paints a lemon, places it on an oil canvas, integrates other elements, inscribes a name on top, and mix the colors thoroughly.\nD: The lady juices a lemon and pours it in oil and adds other ingredients and rates something on top before stirring it up.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The lady squeezes a lemon into oil, combines it with other ingredients, grates something over it, and mixes everything together.\nB: The lady collects the ingredients, including a lemon and oil, writes something on top of the recipe card, and rates it before tidying up.\nC: The lady paints a lemon, places it on an oil canvas, integrates other elements, inscribes a name on top, and mix the colors thoroughly.\nD: The lady juices a lemon and pours it in oil and adds other ingredients and rates something on top before stirring it up.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_27_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: In the end, he starts to relax a bit, before ultimately falling asleep.\nB: At the end he begins to struggle bit, but finally finished.\nC: He peacefully surrendered at the end, but initially put up a fight.\nD: In the end, he starts to relax a bit, but eventually falls asleep.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: In the end, he starts to relax a bit, before ultimately falling asleep.\nB: At the end he begins to struggle bit, but finally finished.\nC: He peacefully surrendered at the end, but initially put up a fight.\nD: In the end, he starts to relax a bit, but eventually falls asleep.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_28_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She removed the contact from her eye.\nB: She lost her contact from her eye.\nC: She removed the contact from her eye.\nD: She put the contact into her eye.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She removed the contact from her eye.\nB: She lost her contact from her eye.\nC: She removed the contact from her eye.\nD: She put the contact into her eye.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_29_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He sits down and stretches his body out.\nB: He jumps into the air and flips his body around.\nC: He sits on the ground and stills his body completely.\nD: He lays on the ground and 
stretches his body out.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He sits down and stretches his body out.\nB: He jumps into the air and flips his body around.\nC: He sits on the ground and stills his body completely.\nD: He lays on the ground and stretches his body out.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_30_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": 
"casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Numerous individuals are portrayed sitting around the table, ready to deal cards for a game of poker.\nB: Several people are then shown standing around the table preparing to throw the ball in the cups.\nC: Several people are shown sitting around the table, sharing stories over cups of coffee.\nD: Numerous individuals are displayed sitting around the table, engrossed in a conversation.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Numerous individuals are portrayed sitting around the table, ready to deal cards for a game of poker.\nB: Several people are then shown standing around the table preparing to throw the ball in the cups.\nC: Several people are shown sitting around the table, sharing stories over cups of coffee.\nD: Numerous individuals are displayed sitting around the table, engrossed in a conversation.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_31_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He struggles to hold back his tears, wipes them away, then smiles.\nB: He tries with all of his might, lifts it up then puts it down.\nC: He glances with uncertainty, sets it aside, and then walks away.\nD: He effortlessly picks it up, spins it around, then places it back down.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He struggles to hold back his tears, wipes them away, then smiles.\nB: He tries with all of his might, lifts it up then puts it down.\nC: He glances with uncertainty, sets it aside, and then walks away.\nD: He effortlessly picks it up, spins it around, then places it back down.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_32_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A boy stands on a track field.\nB: A boy sketches a track 
field.\nC: A boy sleeps on a track field.\nD: A boy sketches a track field.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A boy stands on a track field.\nB: A boy sketches a track field.\nC: A boy sleeps on a track field.\nD: A boy sketches a track field.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_33_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": 
"casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The potato rolled off the board onto the floor.\nB: The potato then gets sliced on a board.\nC: The potato was planted in the garden.\nD: The potato is planted in the garden.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The potato rolled off the board onto the floor.\nB: The potato then gets sliced on a board.\nC: The potato was planted in the garden.\nD: The potato is planted in the garden.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_34_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The woman swept the floor, placing the shoes neatly on the rack.\nB: woman grab the shoes from the floor and wear them.\nC: Woman left the shoes on the floor and walked away barefoot.\nD: The woman tossed the shoes from the floor into a donation box.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman swept the floor, placing the shoes neatly on the rack.\nB: woman grab the shoes from the floor and wear them.\nC: Woman left the shoes on the floor and walked away barefoot.\nD: The woman tossed the shoes from the floor into a donation box.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_35_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A child paints a picture as an adult frames it on the wall.\nB: A child washes and dry their hands leaving the water on which and adult turns off.\nC: An adult reads a book while a child playfully turns the pages.\nD: A child playfully chases an adult around the garden, leaving their hands dirty.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A child paints a picture as an adult frames it on the wall.\nB: A child washes and dry their hands leaving the water on which and adult turns off.\nC: An adult reads a book while a child playfully turns the pages.\nD: A child playfully chases an adult around the garden, leaving their hands dirty.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_36_15.jpg"], "output": "G", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She tosses the tomato slices into a salad, 
then toasts the bread for a side dish.\nB: She tosses the tomato slices into the salad, then uses the bread to scoop up the remaining mayo.\nC: She cuts the tomato into slices, then spreads mayo onto the bread before applying the tomatoes.\nD: She plants tomato seeds in the garden, then bakes fresh bread, waiting for the tomatoes to ripen.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She tosses the tomato slices into a salad, then toasts the bread for a side dish.\nB: She tosses the tomato slices into the salad, then uses the bread to scoop up the remaining mayo.\nC: She cuts the tomato into slices, then spreads mayo onto the bread before applying the tomatoes.\nD: She plants tomato seeds in the garden, then bakes fresh bread, waiting for the tomatoes to ripen.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_37_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: People on the bridge gasp and watch her dive into the water below.\nB: People on the bridge gasp and record videos as she slips and falls into the water.\nC: People on the bridge gasp and hold their breath as she slips and falls into the water.\nD: People on the bridge smile and take pictures of her swinging back and forth over the water.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: People on the bridge gasp and watch her dive into the water below.\nB: People on the bridge gasp and record videos as she slips and falls into the water.\nC: People on the bridge gasp and hold their breath as she slips and falls into the water.\nD: People on the bridge smile and take pictures of her swinging back and forth over the water.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_38_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: 
Suddenly, the woman hits the red shoe with a toothbrush.\nB: Then, the woman talks a toothbrush and brushes the red shoe.\nC: Suddenly, the woman throws the toothbrush at the red shoe in frustration.\nD: Next, the woman uses a toothbrush to scrub the red shoe meticulously.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Suddenly, the woman hits the red shoe with a toothbrush.\nB: Then, the woman talks a toothbrush and brushes the red shoe.\nC: Suddenly, the woman throws the toothbrush at the red shoe in frustration.\nD: Next, the woman uses a toothbrush to scrub the red shoe meticulously.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_39_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A person is seen laying plaster onto a roof and using a shovel to flatten it out.\nB: A person is observed removing plaster from a roof with a shovel.\nC: A person is seen using a shovel to remove plaster from a roof.\nD: A person is observed shoveling snow off a roof and spreading salt to prevent ice formation.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A person is seen laying plaster onto a roof and using a shovel to flatten it out.\nB: A person is observed removing plaster from a roof with a shovel.\nC: A person is seen using a shovel to remove plaster from a roof.\nD: A person is observed shoveling snow off a roof and spreading salt to prevent ice formation.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_40_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She stubs her leg against the 
table, mid-conversation.\nB: She chats animatedly while applying lotion to her leg.\nC: She begins washing her leg with the soap while talking.\nD: While chatting, she starts sketching on her leg with a marker.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She stubs her leg against the table, mid-conversation.\nB: She chats animatedly while applying lotion to her leg.\nC: She begins washing her leg with the soap while talking.\nD: While chatting, she starts sketching on her leg with a marker.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_41_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He carefully places it beside his body for a nap.\nB: He swings it around his body several times.\nC: He carries it gently in his arms across the room.\nD: He drapes it gently over his shoulders.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He carefully places it beside his body for a nap.\nB: He swings it around his body several times.\nC: He carries it gently in his arms across the room.\nD: He drapes it gently over his shoulders.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_42_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The woman uses her front hair to test the sharpness of the new scissors.\nB: The woman uses the brush to sweep up her fallen front hair from the floor.\nC: Suddenly, the woman snips off a portion of her front hair with scissors.\nD: Then the woman takes a portion of her front hair and combs it with the brush.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman uses her front hair to test the sharpness of the new scissors.\nB: The woman uses the brush to sweep up her fallen front hair from the floor.\nC: Suddenly, the woman snips off a portion of her front hair with scissors.\nD: Then the woman takes a portion of her front hair and combs it with the brush.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_43_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Pictures depict people using 
their body parts to create fire-inspired artwork.\nB: More pictures are shown of fire as well as people putting their body parts over it.\nC: Pictures depict people using their body parts to paint images of fire.\nD: Pictures are displayed of people painting fire and their body parts with vibrant colors.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Pictures depict people using their body parts to create fire-inspired artwork.\nB: More pictures are shown of fire as well as people putting their body parts over it.\nC: Pictures depict people using their body parts to paint images of fire.\nD: Pictures are displayed of people painting fire and their body parts with vibrant colors.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_44_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man, after waving to the crowd, is seen picking up the same throw he previously used, now in slow motion.\nB: The same throw is shown again in slow motion followed by the man waving to the crowd.\nC: The man waves to the crowd before he throws, this time in fast motion.\nD: The man from the crowd is swiftly caught by the same throw, before he could wave again.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man, after waving to the crowd, is seen picking up the same throw he previously used, now in slow motion.\nB: The same throw is shown again in slow motion followed by the man waving to the crowd.\nC: The man waves to the crowd before he throws, this time in fast motion.\nD: The man from the crowd is swiftly caught by the same throw, before he could wave again.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_45_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", 
"source": "var", "options": "A: We see a lady sitting a table drilling holes in a pumpkin as kids watch.\nB: A woman at a table is reading a book to children, with a pumpkin sitting idly nearby.\nC: The kids observe a lady at a table, carving intricate designs into a pumpkin.\nD: A lady at a table is reading a spooky story to children, while a pumpkin sits untouched.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: We see a lady sitting a table drilling holes in a pumpkin as kids watch.\nB: A woman at a table is reading a book to children, with a pumpkin sitting idly nearby.\nC: The kids observe a lady at a table, carving intricate designs into a pumpkin.\nD: A lady at a table is reading a spooky story to children, while a pumpkin sits untouched.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_46_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The knife is carefully placed next to the block, before being propped up against a piece of kitchen steel.\nB: The knife is carefully placed next to the block, while the piece of kitchen steel is used to straighten a bent fork.\nC: The knife is gently placed in the block, followed by a quick wipe on a kitchen steel piece.\nD: The knife is then moved back and forth across the block and then over a piece of kitchen steel.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The knife is carefully placed next to the block, before being propped up against a piece of kitchen steel.\nB: The knife is carefully placed next to the block, while the piece of kitchen steel is used to straighten a bent fork.\nC: The knife is gently placed in the block, followed by a quick wipe on a kitchen steel piece.\nD: The knife is then moved back and forth across the block and then over a piece of kitchen steel.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_47_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": 
"casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A man displays a photo of a woman hoisting a white flag, scrutinizing it in the dim light.\nB: A man displays a photo of a woman waving a white flag, as he carefully adjusts his camera settings.\nC: A woman raises a white flag and the man's shot is shown again in slow motion.\nD: A man gifts a white flag to a woman who is shown cooking in a slow-motion video.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man displays a photo of a woman hoisting a white flag, scrutinizing it in the dim light.\nB: A man displays a photo of a woman waving a white flag, as he carefully adjusts his camera settings.\nC: A woman raises a white flag and the man's shot is shown again in slow motion.\nD: A man gifts a white flag to a woman who is shown cooking in a slow-motion video.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_48_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The woman carefully cleaned the lens before placing it back in its case.\nB: The woman placed the lens on the table next to her eye and examined it before carefully packing it away.\nC: The woman put the lens on side of her eye and blink and then removed the lens again.\nD: The woman accidentally dropped the lens from her eye onto the table.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman carefully cleaned the lens before placing it back in its case.\nB: The woman placed the lens on the table next to her eye and examined it before carefully packing it away.\nC: The woman put the lens on side of her eye and blink and then removed the lens again.\nD: The woman accidentally dropped the lens from her eye onto the table.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_49_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: 
The lady packs apples, nuts, and carrots, ignoring her untouched salad.\nB: The lady collects apples, nuts, and carrots, ignoring her untouched salad.\nC: The lady shows us her salad then adds apples, nuts and carrots.\nD: The lady packs apples, nuts, and carrots, ignoring the salad she initially showed us.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The lady packs apples, nuts, and carrots, ignoring her untouched salad.\nB: The lady collects apples, nuts, and carrots, ignoring her untouched salad.\nC: The lady shows us her salad then adds apples, nuts and carrots.\nD: The lady packs apples, nuts, and carrots, ignoring the salad she initially showed us.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_50_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A person is seen holding a stick and wacking a pinata in the middle of a party.\nB: A person is noticed presenting a stick as a gift at a serene pinata ceremony.\nC: A person is observed handing over a stick to a child at a peaceful birthday gathering.\nD: A person is spotted passing a stick to a child during a calm gathering.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A person is seen holding a stick and wacking a pinata in the middle of a party.\nB: A person is noticed presenting a stick as a gift at a serene pinata ceremony.\nC: A person is observed handing over a stick to a child at a peaceful birthday gathering.\nD: A person is spotted passing a stick to a child during a calm gathering.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_51_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A close up of 
food ingredients are shown followed by a person making a sandwich.\nB: A person is identifying food ingredients before sorting them out, instead of making a sandwich.\nC: Food ingredients are spread out for inspection before a person starts to bake a cake.\nD: A person discards food ingredients after accidentally burning their sandwich.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A close up of food ingredients are shown followed by a person making a sandwich.\nB: A person is identifying food ingredients before sorting them out, instead of making a sandwich.\nC: Food ingredients are spread out for inspection before a person starts to bake a cake.\nD: A person discards food ingredients after accidentally burning their sandwich.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_52_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man repairs a broken bar on the parallel bars.\nB: The man performs a routine on the parallel bars.\nC: The man repairs the parallel bars at the gym.\nD: The man repairs the parallel bars.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man repairs a broken bar on the parallel bars.\nB: The man performs a routine on the parallel bars.\nC: The man repairs the parallel bars at the gym.\nD: The man repairs the parallel bars.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_53_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He buys a new pair of shoes, discarding his old ones with worn-out soles.\nB: He ties his shoelaces together, hanging the shoes off his backpack, the 
soles touching.\nC: He tries to wash his shoes by kicking them in the water, the soles coming out.\nD: He ties his shoes securely, ensuring the soles are firmly attached.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He buys a new pair of shoes, discarding his old ones with worn-out soles.\nB: He ties his shoelaces together, hanging the shoes off his backpack, the soles touching.\nC: He tries to wash his shoes by kicking them in the water, the soles coming out.\nD: He ties his shoes securely, ensuring the soles are firmly attached.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_54_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Eventually, he places an ax on a log and strolls off.\nB: Ultimately, he places a book on a log and strolls off.\nC: Finally he swings an ax onto a log and walks away.\nD: Eventually, he places an ax beside a log and leaves.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Eventually, he places an ax on a log and strolls off.\nB: Ultimately, he places a book on a log and strolls off.\nC: Finally he swings an ax onto a log and walks away.\nD: Eventually, he places an ax beside a log and leaves.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_55_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A person in an orange shirt is sitting on the stairs, tying his shoelaces near the slide.\nB: A person in an orange shirt stands on the stairs next to the slide.\nC: A person in an orange shirt is fixing the slide next to the stairs.\nD: A person in an orange shirt is fixing the slide next to the stairs.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A person in an orange shirt is sitting on the stairs, tying his shoelaces near the slide.\nB: A person in an orange shirt stands on the stairs next to the slide.\nC: A person in an orange shirt is fixing the slide next to the stairs.\nD: A person in an orange shirt is fixing the slide next to the stairs.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_56_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man packs all the ingredients 
into jars, places the dough into a ceramic bowl, and sprinkles in extra chocolate chips.\nB: The man blends all the ingredients together and lays the dough out on a pan and adding more chocolate chips.\nC: The man gathers all the ingredients, rolls the dough into balls, stuffs them with chocolate chips, and chills them in the fridge.\nD: The man gathers all the ingredients, kneads the dough on a clean surface, and sprinkles it with raisins.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man packs all the ingredients into jars, places the dough into a ceramic bowl, and sprinkles in extra chocolate chips.\nB: The man blends all the ingredients together and lays the dough out on a pan and adding more chocolate chips.\nC: The man gathers all the ingredients, rolls the dough into balls, stuffs them with chocolate chips, and chills them in the fridge.\nD: The man gathers all the ingredients, kneads the dough on a clean surface, and sprinkles it with raisins.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_57_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: An intro leads into several clips of people performing impressive flips off a high dive.\nB: An intro transitions into multiple snippets of individuals enjoying serene swims around a high dive.\nC: An intro transitions into various snippets of individuals engaging in intense debates from a high-rise building.\nD: An intro segues into a compilation of individuals fearlessly bungee jumping from a towering bridge.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: An intro leads into several clips of people performing impressive flips off a high dive.\nB: An intro transitions into multiple snippets of individuals enjoying serene swims around a high dive.\nC: An intro transitions into various snippets of individuals engaging in intense debates from a high-rise building.\nD: An intro segues into a compilation of individuals fearlessly bungee jumping from a towering bridge.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_58_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", 
"visual_input_component": "16 natural images", "source": "var", "options": "A: The girl sits down and starts sketching the bars on her canvas.\nB: The girl jumps up and begins performing a routine on the bars.\nC: The girl sits down and starts drawing sketches on the bars.\nD: The girl sits down and starts sketching the bars on her drawing pad.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The girl sits down and starts sketching the bars on her canvas.\nB: The girl jumps up and begins performing a routine on the bars.\nC: The girl sits down and starts drawing sketches on the bars.\nD: The girl sits down and starts sketching the bars on her drawing pad.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_59_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The camera captures the person diligently sketching landscapes from different angles.\nB: The person continues laying plaster down while the camera pans around him from various sides.\nC: The person gingerly sips his coffee, completely oblivious, as the camera stealthily captures him from different angles.\nD: The individual pauses to sip his coffee as the lens captures him from multiple angles.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The camera captures the person diligently sketching landscapes from different angles.\nB: The person continues laying plaster down while the camera pans around him from various sides.\nC: The person gingerly sips his coffee, completely oblivious, as the camera stealthily captures him from different angles.\nD: The individual pauses to sip his coffee as the lens captures him from multiple angles.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_60_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", 
"visual_input_component": "16 natural images", "source": "var", "options": "A: A young woman is standing at the foot of a pole vault track.\nB: A young woman is tying her shoelaces at the end of a pole vault track.\nC: A young woman is tying her shoelaces at the beginning of a pole vault runway.\nD: A young woman is tying her shoelaces at the base of a pole vault track.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A young woman is standing at the foot of a pole vault track.\nB: A young woman is tying her shoelaces at the end of a pole vault track.\nC: A young woman is tying her shoelaces at the beginning of a pole vault runway.\nD: A young woman is tying her shoelaces at the base of a pole vault track.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_61_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He slowly walks towards the tall high jump beam, admiring its structure.\nB: He then runs full speed and jumps a tall high jump beam.\nC: He leisurely walks and ducks under a low high jump beam.\nD: He leisurely strolls and stops to gaze at the tall high jump beam.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He slowly walks towards the tall high jump beam, admiring its structure.\nB: He then runs full speed and jumps a tall high jump beam.\nC: He leisurely walks and ducks under a low high jump beam.\nD: He leisurely strolls and stops to gaze at the tall high jump beam.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_62_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A camera pans around a wooden floor and shows a person walking 
downstairs.\nB: A person abruptly drops a camera on a wooden floor before racing upstairs.\nC: A person ascends upstairs, their footfalls echoing on the wooden floor, while a camera lies unused.\nD: A person picks up a fallen camera from the wooden floor at the bottom of the stairs.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A camera pans around a wooden floor and shows a person walking downstairs.\nB: A person abruptly drops a camera on a wooden floor before racing upstairs.\nC: A person ascends upstairs, their footfalls echoing on the wooden floor, while a camera lies unused.\nD: A person picks up a fallen camera from the wooden floor at the bottom of the stairs.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_63_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: man is in a living room painting a couch with purle spray.\nB: Sentence: A man in a living room is relaxing on a purple couch.\nC: Sentence: In a living room, a man is vacuuming a purple couch.\nD: Sentence: In a living room, a man is vacuuming crumbs off a purple couch.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: man is in a living room painting a couch with purle spray.\nB: Sentence: A man in a living room is relaxing on a purple couch.\nC: Sentence: In a living room, a man is vacuuming a purple couch.\nD: Sentence: In a living room, a man is vacuuming crumbs off a purple couch.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_64_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A man and is dog are seen standing in the middle of a fenced in area performing tricks with frisbees.\nB: A man and his dog are spotted relaxing in a fenced yard, enjoying a peaceful afternoon nap.\nC: A man and his dog are enjoying a quiet picnic in a fenced park, sharing sandwiches.\nD: A man and his dog are calmly watching the sunset from a fenced backyard, completely engrossed in the tranquil scene.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A man and is dog are seen standing in the middle of a fenced in area performing tricks with frisbees.\nB: A man and his dog are spotted relaxing in a fenced yard, enjoying a peaceful afternoon nap.\nC: A man and his dog are enjoying a quiet picnic in a fenced park, sharing sandwiches.\nD: A man and his dog are calmly watching the sunset from a fenced backyard, completely engrossed in the tranquil scene.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_65_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", 
"visual_input_component": "16 natural images", "source": "var", "options": "A: He cooks a fish, and discards the bones.\nB: He studies a fish, then releases it back into the water.\nC: He reels in a fish, and removes the hook.\nD: He photographs a fish, and releases it back into the water.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He cooks a fish, and discards the bones.\nB: He studies a fish, then releases it back into the water.\nC: He reels in a fish, and removes the hook.\nD: He photographs a fish, and releases it back into the water.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_66_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man dismounts the horse, kneels to pet the calf gently, then rises and strolls away.\nB: Then, the man get down the horse and kneels to tie the legs of the calf, then the man raises and walk.\nC: The man dismounts the horse, gently strokes the calf, and then strolls away leisurely.\nD: The man dismounts the horse, kneels to pet the calf, then stands up and strolls.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man dismounts the horse, kneels to pet the calf gently, then rises and strolls away.\nB: Then, the man get down the horse and kneels to tie the legs of the calf, then the man raises and walk.\nC: The man dismounts the horse, gently strokes the calf, and then strolls away leisurely.\nD: The man dismounts the horse, kneels to pet the calf, then stands up and strolls.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_67_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images",
"source": "var", "options": "A: He carefully arranges his tools next to the kindling, ready for tomorrow's campfire.\nB: He carefully arranges his tools around the kindling, preparing for a camping demonstration.\nC: He starts striking his tools together over the kindling to start the fire.\nD: He gently places his tools beside the kindling, preparing to organize his workshop.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He carefully arranges his tools next to the kindling, ready for tomorrow's campfire.\nB: He carefully arranges his tools around the kindling, preparing for a camping demonstration.\nC: He starts striking his tools together over the kindling to start the fire.\nD: He gently places his tools beside the kindling, preparing to organize his workshop.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_68_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: They start to remove ornaments from the Christmas tree.\nB: They start to remove ornaments from the Christmas tree.\nC: They begin to put decorations onto the Christmas tree.\nD: They decide to chop down the Christmas tree for firewood.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: They start to remove ornaments from the Christmas tree.\nB: They start to remove ornaments from the Christmas tree.\nC: They begin to put decorations onto the Christmas tree.\nD: They decide to chop down the Christmas tree for firewood.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_69_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: woman is slicing a chocolate cake and its decorating it, put a second floor and some pink fondam.\nB: 
Sentence: The woman, discarding the second floor, removed the pink fondam and stopped decorating the chocolate cake.\nC: Sentence: A woman is stacking a second floor on a chocolate cake and draping it with pink fondam, without slicing or decorating it.\nD: Sentence: The woman, tired of baking, stashed the chocolate cake and pink fondam, opting to read a novel on her second floor.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: woman is slicing a chocolate cake and its decorating it, put a second floor and some pink fondam.\nB: Sentence: The woman, discarding the second floor, removed the pink fondam and stopped decorating the chocolate cake.\nC: Sentence: A woman is stacking a second floor on a chocolate cake and draping it with pink fondam, without slicing or decorating it.\nD: Sentence: The woman, tired of baking, stashed the chocolate cake and pink fondam, opting to read a novel on her second floor.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_70_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: They break the pitcher into pieces.\nB: They broke the pitcher into pieces.\nC: They pour that into a pitcher.\nD: They wash the pitcher in the sink.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: They break the pitcher into pieces.\nB: They broke the pitcher into pieces.\nC: They pour that into a pitcher.\nD: They wash the pitcher in the sink.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_71_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A man is seen sitting on the ice speaking to the camera and leads into several shots of him grabbing fish from a pole.\nB: A man is caught on camera lounging on the beach, narrating to the 
camera while pointing to a pole where various fish are hung.\nC: A man is caught on camera, relaxing on a boat while casting his fishing pole into the water.\nD: A man is spotted standing on the beach, throwing fish back into the ocean, after removing them from a pole.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man is seen sitting on the ice speaking to the camera and leads into several shots of him grabbing fish from a pole.\nB: A man is caught on camera lounging on the beach, narrating to the camera while pointing to a pole where various fish are hung.\nC: A man is caught on camera, relaxing on a boat while casting his fishing pole into the water.\nD: A man is spotted standing on the beach, throwing fish back into the ocean, after removing them from a pole.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_72_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A child climbs monkey bars until reach the others side.\nB: A child paints a picture of monkey bars on the other side of the room.\nC: A child paints a picture until the others arrive.\nD: A child draws a picture of monkey bars on the side of his notebook.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A child climbs monkey bars until reach the others side.\nB: A child paints a picture of monkey bars on the other side of the room.\nC: A child paints a picture until the others arrive.\nD: A child draws a picture of monkey bars on the side of his notebook.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_73_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A big chunk of snow is on the roof of a car.\nB: A car drives over a large mound of 
snow.\nC: A car drives away, shaking a large chunk of snow off its roof.\nD: A car drives over a big chunk of snow.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A big chunk of snow is on the roof of a car.\nB: A car drives over a large mound of snow.\nC: A car drives away, shaking a large chunk of snow off its roof.\nD: A car drives over a big chunk of snow.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_74_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The lady squeezes a lemon into oil, combines it with other ingredients, grates something over it, and mixes everything together.\nB: The lady collects the ingredients, including a lemon and oil, writes something on top of the recipe card, and rates it before tidying up.\nC: The lady paints a lemon, places it on an oil canvas, integrates other elements, inscribes a name on top, and mix the colors thoroughly.\nD: The lady juices a lemon and pours it in oil and adds other ingredients and rates something on top before stirring it up.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The lady squeezes a lemon into oil, combines it with other ingredients, grates something over it, and mixes everything together.\nB: The lady collects the ingredients, including a lemon and oil, writes something on top of the recipe card, and rates it before tidying up.\nC: The lady paints a lemon, places it on an oil canvas, integrates other elements, inscribes a name on top, and mix the colors thoroughly.\nD: The lady juices a lemon and pours it in oil and adds other ingredients and rates something on top before stirring it up.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_3.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_75_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She gently grips the lens in her hands, illustrating how to clean it thoroughly before usage.\nB: She moves her hands around while holding onto the lens and leads into her demonstrating how to put one in your eye.\nC: While holding the lens, she quickly withdraws her hands, demonstrating how to remove it from your eye.\nD: She guides her hands to carefully fix the lens on her camera, showcasing her photographic expertise.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She gently grips the lens in her hands, illustrating how to clean it thoroughly before usage.\nB: She moves her hands around while holding onto the lens and leads into her demonstrating how to put one in your eye.\nC: While holding the lens, she quickly withdraws her hands, demonstrating how to remove it from your eye.\nD: She guides her hands to carefully fix the lens on her camera, showcasing her photographic expertise.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_76_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": 
"casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The two exchange glances before diving into the container to retrieve the sunken treasure.\nB: The two examine the container, then exchange a puzzled glance.\nC: The two take a drink from the container and nod to one another.\nD: The two exchange glances across the room, the container untouched between them.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The two exchange glances before diving into the container to retrieve the sunken treasure.\nB: The two examine the container, then exchange a puzzled glance.\nC: The two take a drink from the container and nod to one another.\nD: The two exchange glances across the room, the container untouched between them.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_77_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: First, she brushes her hair, then she secures it into a neat bun.\nB: First she lets the rollers heat up and she puts them onto her hair.\nC: First, she allows the rollers to cool down before she removes them from her hair.\nD: First she collects the rollers, then she begins to neatly organize them in her drawer.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: First, she brushes her hair, then she secures it into a neat bun.\nB: First she lets the rollers heat up and she puts them onto her hair.\nC: First, she allows the rollers to cool down before she removes them from her hair.\nD: First she collects the rollers, then she begins to neatly organize them in her drawer.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_78_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He cleans his electric razor 
after using it on his beard.\nB: He uses an electric razor to trim and shave his beard.\nC: He charges his electric razor on the bathroom counter before leaving for work.\nD: He charges his electric razor with a portable power bank when traveling.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He cleans his electric razor after using it on his beard.\nB: He uses an electric razor to trim and shave his beard.\nC: He charges his electric razor on the bathroom counter before leaving for work.\nD: He charges his electric razor with a portable power bank when traveling.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_79_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Sentence: Amidst a serene landscape, the camera focuses on the people quietly sitting on boards at the hill's crest.\nB: The people continue to ride the boards down a hill while the camera pans around himself as well as the area around them.\nC: The camera focuses on the people resting on the hill, their boards beside them, capturing a panoramic view of the surrounding area.\nD: Sentence: Amidst a bustling market, the people carry boards up a hill as the camera captures their determination and the vibrant surroundings.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Sentence: Amidst a serene landscape, the camera focuses on the people quietly sitting on boards at the hill's crest.\nB: The people continue to ride the boards down a hill while the camera pans around himself as well as the area around them.\nC: The camera focuses on the people resting on the hill, their boards beside them, capturing a panoramic view of the surrounding area.\nD: Sentence: Amidst a bustling market, the people carry boards up a hill as the camera captures their determination and the vibrant surroundings.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_80_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man carefully places the weight in the middle of the bar, sits down for a while, then stands up and gently sets the bar back on the ground.\nB: The man then picks it up, squats, jumps to open his legs then quickly drops the bar and pushes the weight towards the middle of the bar and rests for a little bit.\nC: The man gently sets the bar down, stretches his legs, then strolls to the center of the bar for a brief reprieve.\nD: The man gently lifts the bar, settles into a steady stance, carefully positions the weight to the center of the bar, and takes a brief respite.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man carefully places the weight in the middle of the bar, sits down for a while, then stands up and gently sets the bar back on the ground.\nB: The man then picks it up, squats, jumps to open his legs then quickly drops the bar and pushes the weight towards the middle of the bar and rests for a little bit.\nC: The man gently sets the bar down, stretches his legs, then strolls to the center of the bar for a brief reprieve.\nD: The man gently lifts the bar, settles into a steady stance, carefully positions the weight to the center of the bar, and takes a brief respite.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_81_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: An introduction comes onto the screen for a video about a curling game.\nB: The screen displays a curling game instead of the expected video introduction.\nC: The video screen flickers as it transitions from the curling game to an introduction on chess strategies.\nD: The screen displays a curling game interrupted by an unexpected introduction.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: An introduction comes onto the screen for a video about a curling game.\nB: The screen displays a curling game instead of the expected video introduction.\nC: The video screen flickers as it transitions from the curling game to an introduction on chess strategies.\nD: The screen displays a curling game interrupted by an unexpected introduction.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_82_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The Hispanic man shared his meal with the black man, leading to hearty laughter and mutual respect between them.\nB: The black man and the Hispanic man sat in silence after a heated argument, their smiles replaced with stern expressions.\nC: The black man and hispanic man are working together on a project, concentrating deeply and exchanging innovative ideas.\nD: The black man is in disbelief and they're laughing and having a good time and they re-do it and the hispanic man wins again and more smiles continue.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The Hispanic man shared his meal with the black man, leading to hearty laughter and mutual respect between them.\nB: The black man and the Hispanic man sat in silence after a heated argument, their smiles replaced with stern expressions.\nC: The black man and hispanic man are working together on a project, concentrating deeply and exchanging innovative ideas.\nD: The black man is in disbelief and they're laughing and having a good time and they re-do it and the hispanic man wins again and more smiles continue.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_83_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man reads beneath the shade of the tulle tree.\nB: The man collects tulle from the tree and folds it neatly.\nC: The man cuts tulle and sticks it in the tree.\nD: The man finds tulle tangled in the tree and carefully removes it.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man reads beneath the shade of the tulle tree.\nB: The man collects tulle from the tree and folds it neatly.\nC: The man cuts tulle and sticks it in the tree.\nD: The man finds tulle tangled in the tree and carefully removes it.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_84_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The woman placed her brush and blow dryer on the counter, deciding to let her bangs air dry instead.\nB: The woman is holding a brush and blow dryer and began blow drying her bangs.\nC: The woman put down her brush and blow dryer, deciding to let her bangs air dry instead.\nD: The woman, equipped with a brush and blow dryer, decided to style her bangs into loose waves.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman placed her brush and blow dryer on the counter, deciding to let her bangs air dry instead.\nB: The woman is holding a brush and blow dryer and began blow drying her bangs.\nC: The woman put down her brush and blow dryer, deciding to let her bangs air dry instead.\nD: The woman, equipped with a brush and blow dryer, decided to style her bangs into loose waves.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_85_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", 
"source": "var", "options": "A: He later walks to the high jump and decides to skip it.\nB: He then runs to the high jump and completes it.\nC: He then walks to the high jump and judges it.\nD: He then walks to the high jump and dismantles it.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He later walks to the high jump and decides to skip it.\nB: He then runs to the high jump and completes it.\nC: He then walks to the high jump and judges it.\nD: He then walks to the high jump and dismantles it.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_86_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: people is gathered around a table serving food.\nB: Sentence: People are debating around a table in a conference room.\nC: People are arguing around a table with papers scattered.\nD: People are debating fiercely around a table.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: people is gathered around a table serving food.\nB: Sentence: People are debating around a table in a conference room.\nC: People are arguing around a table with papers scattered.\nD: People are debating fiercely around a table.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_87_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He gently places the hefty book from his face to his stomach while lounging on the couch.\nB: He carefully places the heavy weight on his stomach, then gently rolls it off near his face.\nC: He lifts the heavy weight high up near his face and lifts it back down near his stomach.\nD: He gently cradles the newborn baby close to his face, then lowers her down to rest near his stomach.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He gently places the hefty book from his face to his stomach while lounging on the couch.\nB: He carefully places the heavy weight on his stomach, then gently rolls it off near his face.\nC: He lifts the heavy weight high up near his face and lifts it back down near his stomach.\nD: He gently cradles the newborn baby close to his face, then lowers her down to rest near his stomach.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_88_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural 
images", "source": "var", "options": "A: He carefully places the pale of sticks on the ground, ensuring it doesn't topple over as the fire blazes nearby.\nB: He finally gets the pale of sticks lite and as the fire grows the pale of sticks fall.\nC: He neatly arranges the pale of sticks, which promptly topples over, scattering sticks everywhere.\nD: He carefully stacks the pale of sticks, ensuring they don't fall as he adds each one.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He carefully places the pale of sticks on the ground, ensuring it doesn't topple over as the fire blazes nearby.\nB: He finally gets the pale of sticks lite and as the fire grows the pale of sticks fall.\nC: He neatly arranges the pale of sticks, which promptly topples over, scattering sticks everywhere.\nD: He carefully stacks the pale of sticks, ensuring they don't fall as he adds each one.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_89_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She dries her hair with a towel and discards the tissue paper.\nB: She rolls up a towel and puts it in tissue paper.\nC: She dries her face with a towel and discards the used tissue paper.\nD: She dries her face with a towel and discards the used tissue paper.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She dries her hair with a towel and discards the tissue paper.\nB: She rolls up a towel and puts it in tissue paper.\nC: She dries her face with a towel and discards the used tissue paper.\nD: She dries her face with a towel and discards the used tissue paper.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_90_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Sentence: The man repairs a homemade snow shovel on a small road.\nB: The man uses 
a home made snow shovel to clear away small road.\nC: Sentence: The man accidentally breaks his homemade snow shovel while fixing a small road.\nD: The man repurposes a homemade snow shovel into a gardening tool in his small road-side garden.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Sentence: The man repairs a homemade snow shovel on a small road.\nB: The man uses a home made snow shovel to clear away small road.\nC: Sentence: The man accidentally breaks his homemade snow shovel while fixing a small road.\nD: The man repurposes a homemade snow shovel into a gardening tool in his small road-side garden.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_91_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A guy is trying to lite a pale of sticks in a round barrole.\nB: A man is organizing a pile of sticks in a cylindrical barrel.\nC: A man is sorting a bucket of twigs in a circular barrel.\nD: A man is stacking sticks into a round barrel.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A guy is trying to lite a pale of sticks in a round barrole.\nB: A man is organizing a pile of sticks in a cylindrical barrel.\nC: A man is sorting a bucket of twigs in a circular barrel.\nD: A man is stacking sticks into a round barrel.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_92_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man in the blue shirt hands over a bench stone and a knife to the other man, suggesting they try carving sculptures.\nB: In a room, two men, one in a blue shirt, examine a bench stone; the man in blue illustrates how to polish it with a specific lubricant, disregarding the knife he initially intended to sharpen.\nC: Two men are in a room and the man with a blue shirt takes out a bench stone and with a little lubricant on the stone takes an knife and explains how to sharpen it.\nD: In a room, a man in a blue shirt shows his companion how to use a bench stone to polish a gem, instead of sharpening a knife.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man in the blue shirt hands over a bench stone and a knife to the other man, suggesting they try carving sculptures.\nB: In a room, two men, one in a blue shirt, examine a bench stone; the man in blue illustrates how to polish it with a specific lubricant, disregarding the knife he initially intended to sharpen.\nC: Two men are in a room and the man with a blue shirt takes out a bench stone and with a little lubricant on the stone takes an knife and explains how to sharpen it.\nD: In a room, a man in a blue shirt shows his companion how to use a bench stone to polish a gem, instead of sharpening a knife.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_93_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: While chatting to her online audience, she's seen displaying an ironing board picture, unplugging the iron, and crumpling a shirt.\nB: She's shown plugging in the iron and folding up a shirt while still speaking to the camera and showing a picture of an ironing board.\nC: While chatting with her online viewers, she exhibits a photo of an ironing board before unplugging the iron and crumpling a shirt.\nD: She's seen snapping a picture of an ironing board, turning off the iron, and unfolding a shirt for the camera.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: While chatting to her online audience, she's seen displaying an ironing board picture, unplugging the iron, and crumpling a shirt.\nB: She's shown plugging in the iron and folding up a shirt while still speaking to the camera and showing a picture of an ironing board.\nC: While chatting with her online viewers, she exhibits a photo of an ironing board before unplugging the iron and crumpling a shirt.\nD: She's seen snapping a picture of an ironing board, turning off the iron, and unfolding a shirt for the camera.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_94_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: People are standing around a Christmas tree.\nB: People are planting a Christmas tree.\nC: People are planting a Christmas tree.\nD: People are planting a Christmas tree.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: People are standing around a Christmas tree.\nB: People are planting a Christmas tree.\nC: People are planting a Christmas tree.\nD: People are planting a Christmas tree.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_95_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Afterwards, the young men play guitars by the river.\nB: Then, the young men splits logs in the woods.\nC: Next, the young men play soccer in the park.\nD: The young men play soccer in the woods.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Afterwards, the young men play guitars by the river.\nB: Then, the young men splits logs in the woods.\nC: Next, the young men play soccer in the park.\nD: The young men play soccer in the woods.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_96_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A young child is seen standing on the side with an older man playing shuffleboard and pushing a puck.\nB: An older man is teaching a young child how to fly a kite in the park.\nC: An older man is teaching a young child to fly a kite at the park.\nD: An elder man is observing a young child enthusiastically feeding ducks by the pond.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A young child is seen standing on the side with an older man playing shuffleboard and pushing a puck.\nB: An older man is teaching a young child how to fly a kite in the park.\nC: An older man is teaching a young child to fly a kite at the park.\nD: An elder man is observing a young child enthusiastically feeding ducks by the pond.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_97_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man 
in the dark hospital shirt points his finger at the contact lens, explaining the effects it could have on the optic health of the eye.\nB: The man in the dark hospital shirt reapplies the optic solution to his finger and proceeds to demonstrate how to insert a contact lens to the eye.\nC: The man in the dark hospital shirt points his finger towards the exit, instructing the patient on how to evacuate during an emergency.\nD: The man in the dark hospital shirt playfully flicks his finger, sending the optic solution droplets onto his contact lens before engaging in an animated conversation about eye health.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man in the dark hospital shirt points his finger at the contact lens, explaining the effects it could have on the optic health of the eye.\nB: The man in the dark hospital shirt reapplies the optic solution to his finger and proceeds to demonstrate how to insert a contact lens to the eye.\nC: The man in the dark hospital shirt points his finger towards the exit, instructing the patient on how to evacuate during an emergency.\nD: The man in the dark hospital shirt playfully flicks his finger, sending the optic solution droplets onto his contact lens before engaging in an animated conversation about eye health.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_4.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_98_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man kneels to tie his shoes, and then wipes his sweaty brow with a towel.\nB: The man leans to tie his shoelaces, then he wipes his hands with a towel.\nC: The man bends to wash his face, after he dry his face with a towel.\nD: The man kneels to tie his shoe, then wipes his hands with a towel.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man kneels to tie his shoes, and then wipes his sweaty brow with a towel.\nB: The man leans to tie his shoelaces, then he wipes his hands with a towel.\nC: The man bends to wash his face, after he dry his face with a towel.\nD: The man kneels to tie his shoe, then wipes his hands with a towel.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_99_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The lady, after applying glue, places the 
lash gently.\nB: The lady, while laughing, accidentally spills coffee on the lash.\nC: The lady, after applying glue, attaches the lash.\nD: The lady talks the puts glue on the lash.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The lady, after applying glue, places the lash gently.\nB: The lady, while laughing, accidentally spills coffee on the lash.\nC: The lady, after applying glue, attaches the lash.\nD: The lady talks the puts glue on the lash.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_100_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A person's feet are shown around a set of shoes and then begins putting socks on his feet.\nB: A person's feet are revealed as they kick off their shoes, playfully wriggling their toes in the sand.\nC: A person's feet are displayed, barefoot and shoeless, as he digs his toes into the warm sand.\nD: A person's feet are showcased, standing barefoot in the sand, shoes discarded next to him.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A person's feet are shown around a set of shoes and then begins putting socks on his feet.\nB: A person's feet are revealed as they kick off their shoes, playfully wriggling their toes in the sand.\nC: A person's feet are displayed, barefoot and shoeless, as he digs his toes into the warm sand.\nD: A person's feet are showcased, standing barefoot in the sand, shoes discarded next to him.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_101_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Several women are in the gym, taking a yoga class to enhance their flexibility.\nB: A few women are in the gym practicing to do this pole vault, trying to do their best.\nC: A few women are in the kitchen, attempting to perfect their baking skills.\nD: A few women are in the gym, engaged in a lively yoga session, pushing their flexibility to the limit.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Several women are in the gym, taking a yoga class to enhance their flexibility.\nB: A few women are in the gym practicing to do this pole vault, trying to do their best.\nC: A few women are in the kitchen, attempting to perfect their baking skills.\nD: A few women are in the gym, engaged in a lively yoga session, pushing their flexibility to the limit.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_102_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", 
"source": "var", "options": "A: The child watches the kayak drift away in the river.\nB: A child observes a kayak floating in the river.\nC: we see a child ride a kayak in a river.\nD: A child catches a fish in a river from a kayak.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The child watches the kayak drift away in the river.\nB: A child observes a kayak floating in the river.\nC: we see a child ride a kayak in a river.\nD: A child catches a fish in a river from a kayak.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_103_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Someone is putting water into a bowl.\nB: Someone is drinking water from a bowl.\nC: Someone is drinking water from a bowl.\nD: Someone is drinking water from a bowl.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Someone is putting water into a bowl.\nB: Someone is drinking water from a bowl.\nC: Someone is drinking water from a bowl.\nD: Someone is drinking water from a bowl.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_104_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She leans on the rail of the bridge, watching the boats sail by.\nB: She leans on the rail of the bridge, admiring the view off the side.\nC: She climbs to the rail of the bridge, then bungee jumps off the side.\nD: She leans on the bridge rail, quietly watching the river flow beneath.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She leans on the rail of the bridge, watching the boats sail by.\nB: She leans on the rail of the bridge, admiring the view off the side.\nC: She climbs to the rail of the bridge, then bungee jumps off the side.\nD: She leans on the bridge rail, quietly watching the river flow beneath.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_105_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man interrupts his routine and 
falls.\nB: The man interrupts his routine and climbs.\nC: The man finishes his routine and dismounts.\nD: The man interrupts his routine and takes a break.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man interrupts his routine and falls.\nB: The man interrupts his routine and climbs.\nC: The man finishes his routine and dismounts.\nD: The man interrupts his routine and takes a break.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_106_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A large crowd is seen sitting around a field followed by a man running with a javelin and throwing it across the field.\nB: A man with a javelin is calmly walking across a field while a large crowd sits around him, watching attentively.\nC: A man is observed teaching a javelin throwing technique to a large crowd gathered in a field.\nD: A man is seen teaching javelin techniques to a large crowd gathered in a field.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A large crowd is seen sitting around a field followed by a man running with a javelin and throwing it across the field.\nB: A man with a javelin is calmly walking across a field while a large crowd sits around him, watching attentively.\nC: A man is observed teaching a javelin throwing technique to a large crowd gathered in a field.\nD: A man is seen teaching javelin techniques to a large crowd gathered in a field.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_107_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He examines the shingles scattered across the roof.\nB: He examines the shingles scattered across the roof.\nC: He sprays the shingles all over the roof.\nD: He inspects the shingles meticulously across the roof.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He examines the shingles scattered across the roof.\nB: He examines the shingles scattered across the roof.\nC: He sprays the shingles all over the roof.\nD: He inspects the shingles meticulously across the roof.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_108_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Another young athlete then writes the same song.\nB: Another young athlete subsequently breaks the same 
record.\nC: Another young athlete then writes a different story.\nD: Another young athlete then makes the same jump.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Another young athlete then writes the same song.\nB: Another young athlete subsequently breaks the same record.\nC: Another young athlete then writes a different story.\nD: Another young athlete then makes the same jump.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_109_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man playfully hides behind the giant cookie before tossing it like a frisbee towards the camera.\nB: The man presents the giant cookie to the camera, then decorates it meticulously instead of eating it.\nC: A shot of the giant cookie baked is shown and leads into the man holding it in front of the camera and eating it.\nD: The man carefully presents the giant cookie in front of the camera before deciding to share it with his friends.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man playfully hides behind the giant cookie before tossing it like a frisbee towards the camera.\nB: The man presents the giant cookie to the camera, then decorates it meticulously instead of eating it.\nC: A shot of the giant cookie baked is shown and leads into the man holding it in front of the camera and eating it.\nD: The man carefully presents the giant cookie in front of the camera before deciding to share it with his friends.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_110_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The bartender grabs a cup, fills it with ice, and places it under the espresso machine.\nB: Then, the bartender takes a cup and prepares a cocktail in a shaker, then he pours it in the cup.\nC: Suddenly, the bartender grabs a cup, then, instead of a cocktail, he graciously serves a steaming cup of coffee.\nD: The bartender grabs a cup, fills it with coffee from the brewer, and then hands it to the customer.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The bartender grabs a cup, fills it with ice, and places it under the espresso machine.\nB: Then, the bartender takes a cup and prepares a cocktail in a shaker, then he pours it in the cup.\nC: Suddenly, the bartender grabs a cup, then, instead of a cocktail, he graciously serves a steaming cup of coffee.\nD: The bartender grabs a cup, fills it with coffee from the brewer, and then hands it to the customer.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_111_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": 
"casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The girl paints a blue puck and hangs it on the wall.\nB: The girl uses a blue puck to play air hockey.\nC: The girl moves a blue puck back in place then pushes it forward.\nD: The girl paints a blue puck before tossing it into the pool.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The girl paints a blue puck and hangs it on the wall.\nB: The girl uses a blue puck to play air hockey.\nC: The girl moves a blue puck back in place then pushes it forward.\nD: The girl paints a blue puck before tossing it into the pool.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_112_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The men laugh and walk to the right.\nB: The men laugh and walk towards the right.\nC: The men turn and face the left.\nD: The men laugh and walk towards the right.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The men laugh and walk to the right.\nB: The men laugh and walk towards the right.\nC: The men turn and face the left.\nD: The men laugh and walk towards the right.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_113_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A person washes their hands.\nB: A person climbs a mountain.\nC: A person paints their nails.\nD: A person washes their hands.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A person washes their hands.\nB: A person climbs a mountain.\nC: A person paints their nails.\nD: A person washes their hands.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_114_15.jpg"], "output": "I", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The boat sits idle on the water, tied to the dock.\nB: The boat slides down a ramp into the water.\nC: The boat is lifted from the water onto the ramp.\nD: The boat rests calmly on the water near the 
ramp.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The boat sits idle on the water, tied to the dock.\nB: The boat slides down a ramp into the water.\nC: The boat is lifted from the water onto the ramp.\nD: The boat rests calmly on the water near the ramp.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_115_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": 
"casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: One of the men reads a book and peacefully falls asleep on the sofa.\nB: One of the men drinks from a cup and falls down unconscious on the floor.\nC: One of the men sets down a cup and begins to energetically dance on the floor.\nD: One of the men places a cup on the floor and helps another to stand up.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: One of the men reads a book and peacefully falls asleep on the sofa.\nB: One of the men drinks from a cup and falls down unconscious on the floor.\nC: One of the men sets down a cup and begins to energetically dance on the floor.\nD: One of the men places a cup on the floor and helps another to stand up.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_116_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: We observe them carefully remove and frame the antique carpet from a different viewpoint.\nB: We observe them carefully arrange and expertly install a new carpet from a different perspective.\nC: We switch and see them rip up and lay new carpet from another angle.\nD: We change and observe them design and paint a mural from a different viewpoint.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: We observe them carefully remove and frame the antique carpet from a different viewpoint.\nB: We observe them carefully arrange and expertly install a new carpet from a different perspective.\nC: We switch and see them rip up and lay new carpet from another angle.\nD: We change and observe them design and paint a mural from a different viewpoint.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_117_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": 
"var", "options": "A: A gymnast is seen standing ready before uneven bars while many are watching on the sides.\nB: A gymnast, surrounded by onlookers, ties her shoelaces before a long run.\nC: A gymnast, surrounded by spectators, is signing autographs next to the uneven bars.\nD: A gymnast is signing autographs for fans beside the uneven bars after her performance.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A gymnast is seen standing ready before uneven bars while many are watching on the sides.\nB: A gymnast, surrounded by onlookers, ties her shoelaces before a long run.\nC: A gymnast, surrounded by spectators, is signing autographs next to the uneven bars.\nD: A gymnast is signing autographs for fans beside the uneven bars after her performance.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_118_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The little girl in the black blouse hands the lady a sponge, one at a time, as she scrubs the dishes.\nB: The little girl, holding a black blouse, helps the lady clean a sponge, one curler at a time.\nC: While the lady in the black blouse curls the child's hair the little girl is holding a sponge curler and hands it to the lady one at a time.\nD: The little girl in the black blouse hands a sponge to the lady, who is washing the dishes, one at a time.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The little girl in the black blouse hands the lady a sponge, one at a time, as she scrubs the dishes.\nB: The little girl, holding a black blouse, helps the lady clean a sponge, one curler at a time.\nC: While the lady in the black blouse curls the child's hair the little girl is holding a sponge curler and hands it to the lady one at a time.\nD: The little girl in the black blouse hands a sponge to the lady, who is washing the dishes, one at a time.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_119_15.jpg"], "output": "C", "qwen3-vl": 
"image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The person feeds the calf while other people observe from a distance.\nB: The person draws a picture of the calf as other people watch in admiration.\nC: The person captures the calf and other people run in afterwards.\nD: The person feeds the calf while other people watch from a distance.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The person feeds the calf while other people observe from a distance.\nB: The person draws a picture of the calf as other people watch in admiration.\nC: The person captures the calf and other people run in afterwards.\nD: The person feeds the calf while other people watch from a distance.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_120_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man leisurely strolls along the track before settling down in the large sand pit to sunbathe.\nB: The man then runs down the track and jumps into a large sand pit.\nC: The man strolls along the track before settling down beside a large sand pit.\nD: The man walks along the track and plants a tree near the large sand pit.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man leisurely strolls along the track before settling down in the large sand pit to sunbathe.\nB: The man then runs down the track and jumps into a large sand pit.\nC: The man strolls along the track before settling down beside a large sand pit.\nD: The man walks along the track and plants a tree near the large sand pit.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_121_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: 
A shirtless man wearing long pants and red shoes is bent over and has his two hands gripping onto a barbell with two very large weights on the ends of it.\nB: A man in red shoes and long pants, without a shirt, leisurely ties his shoelaces, ignoring the barbell with hefty weights at his side.\nC: A shirtless man wearing long pants and red shoes is gracefully dancing on a stage, the spotlight highlighting his every move.\nD: A shirtless man in long pants and red shoes is balancing a barbell with two large weights on his shoulders.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A shirtless man wearing long pants and red shoes is bent over and has his two hands gripping onto a barbell with two very large weights on the ends of it.\nB: A man in red shoes and long pants, without a shirt, leisurely ties his shoelaces, ignoring the barbell with hefty weights at his side.\nC: A shirtless man wearing long pants and red shoes is gracefully dancing on a stage, the spotlight highlighting his every move.\nD: A shirtless man in long pants and red shoes is balancing a barbell with two large weights on his shoulders.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_122_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She wipes off some colors with a paintbrush and starts cleaning the stained canvas.\nB: She mixes some colors with a paintbrush and begins putting the paint on the blank canvas.\nC: With a paintbrush, she carefully cleans the colors off the finished canvas.\nD: She examines the blank canvas with a paintbrush in hand, deciding on the perfect palette of colors.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She wipes off some colors with a paintbrush and starts cleaning the stained canvas.\nB: She mixes some colors with a paintbrush and begins putting the paint on the blank canvas.\nC: With a paintbrush, she carefully cleans the colors off the finished canvas.\nD: She examines the blank canvas with a paintbrush in hand, deciding on the perfect palette of colors.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_123_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural 
images", "source": "var", "options": "A: Two women are seen arguing over a parking spot.\nB: Two women are seen arguing over a book.\nC: Two women are seen debating about a political issue.\nD: Two women are shown talking about snorkling.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Two women are seen arguing over a parking spot.\nB: Two women are seen arguing over a book.\nC: Two women are seen debating about a political issue.\nD: Two women are shown talking about snorkling.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_124_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: An adult writes a book.\nB: An adult walks into frame.\nC: An adult bursts into tears.\nD: The adult throws a frisbee out of frame.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: An adult writes a book.\nB: An adult walks into frame.\nC: An adult bursts into tears.\nD: The adult throws a frisbee out of frame.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_125_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A male African American gymnast is in a large arena full of people preparing for a performance.\nB: A male African American gymnast is teaching a group of children in a large park.\nC: A male African American gymnast is teaching young children in a crowded community center.\nD: A male African American gymnast is teaching a group of people in a large park.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A male African American gymnast is in a large arena full of people preparing for a performance.\nB: A male African American gymnast is teaching a group of children in a large park.\nC: A male African American gymnast is teaching young children in a crowded community center.\nD: A male African American gymnast is teaching a group of people in a large park.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_126_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", 
"source": "var", "options": "A: He carefully cleans the tire, then places the hubcap and block safely to the side.\nB: He then puts a block down next to the tire as well as taking off the hubcap.\nC: He swiftly kicks the block aside, opting to adjust the tire pressure without removing the hubcap.\nD: He removes the block from beside the tire and then replaces the hubcap.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He carefully cleans the tire, then places the hubcap and block safely to the side.\nB: He then puts a block down next to the tire as well as taking off the hubcap.\nC: He swiftly kicks the block aside, opting to adjust the tire pressure without removing the hubcap.\nD: He removes the block from beside the tire and then replaces the hubcap.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_127_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A sofa sits in a room.\nB: A room is being measured for a new sofa.\nC: The room was cleared of everything except the sofa.\nD: A room is being measured to fit a sofa.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A sofa sits in a room.\nB: A room is being measured for a new sofa.\nC: The room was cleared of everything except the sofa.\nD: A room is being measured to fit a sofa.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_128_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A person's feet and legs are shown followed by the person putting socks on and rolling their pants over the socks.\nB: A person's feet and legs are displayed, 
before they kick a soccer ball and sprint towards the goal.\nC: A person's feet and legs are displayed as they kick a soccer ball, then they sit to tie their shoelaces.\nD: A person's feet and legs are displayed as they kick a soccer ball and sprint across the field.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A person's feet and legs are shown followed by the person putting socks on and rolling their pants over the socks.\nB: A person's feet and legs are displayed, before they kick a soccer ball and sprint towards the goal.\nC: A person's feet and legs are displayed as they kick a soccer ball, then they sit to tie their shoelaces.\nD: A person's feet and legs are displayed as they kick a soccer ball and sprint across the field.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_129_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The sun is setting, casting a warm glow on the kite that lies forgotten on the ground.\nB: The sun set, and no longer could we see the kite lost in the sky.\nC: The kite is high in the sky and is seen against a bright sun.\nD: The sun sets, dimming its brightness as the kite lays forgotten on the ground.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The sun is setting, casting a warm glow on the kite that lies forgotten on the ground.\nB: The sun set, and no longer could we see the kite lost in the sky.\nC: The kite is high in the sky and is seen against a bright sun.\nD: The sun sets, dimming its brightness as the kite lays forgotten on the ground.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_130_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man is reading a 
book beside the white fence.\nB: The man is observed reading a book by the fence.\nC: The man is seen reading a book by the fence.\nD: The man is shown painting the fence white.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man is reading a book beside the white fence.\nB: The man is observed reading a book by the fence.\nC: The man is seen reading a book by the fence.\nD: The man is shown painting the fence white.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_131_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Suddenly, the little girl sits on the floor and folds her arms.\nB: Then, the little girl jumps to the ground and extend her arms.\nC: Suddenly, the little girl falls asleep on the grass, her arms folded under her head.\nD: Suddenly, the little girl kneels on the ground and covers her face with her arms.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Suddenly, the little girl sits on the floor and folds her arms.\nB: Then, the little girl jumps to the ground and extend her arms.\nC: Suddenly, the little girl falls asleep on the grass, her arms folded under her head.\nD: Suddenly, the little girl kneels on the ground and covers her face with her arms.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_132_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She tosses several ingredients into a pot and stirs it gently with a spoon.\nB: She mixes several ingredients into a bowl and spreads it around with a spoon.\nC: She takes several ingredients from the bowl and scatters them with a spoon.\nD: She pours various ingredients into a bowl and stirs it gently with a spoon.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She tosses several ingredients into a pot and stirs it gently with a spoon.\nB: She mixes several ingredients into a bowl and spreads it around with a spoon.\nC: She takes several ingredients from the bowl and scatters them with a spoon.\nD: She pours various ingredients into a bowl and stirs it gently with a spoon.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_133_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She 
calmly sits and watches the water ripple gently.\nB: She then flips and dives in the water with a small splash.\nC: She then stands and gazes at the water, holding a small shell.\nD: She calmly sips her coffee, gazing at the water's tranquil surface.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She calmly sits and watches the water ripple gently.\nB: She then flips and dives in the water with a small splash.\nC: She then stands and gazes at the water, holding a small shell.\nD: She calmly sips her coffee, gazing at the water's tranquil surface.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_134_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A man walks up to parallel bars while spectators, competitors, and officials are in the background.\nB: A man avoids parallel bars as spectators, competitors, and officials witness his disqualification in the background.\nC: A man, amid spectators, competitors, and officials, declines to participate in the parallel bars event.\nD: A man sits down to a chess tournament, with spectators, competitors, and officials observing his every move.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A man walks up to parallel bars while spectators, competitors, and officials are in the background.\nB: A man avoids parallel bars as spectators, competitors, and officials witness his disqualification in the background.\nC: A man, amid spectators, competitors, and officials, declines to participate in the parallel bars event.\nD: A man sits down to a chess tournament, with spectators, competitors, and officials observing his every move.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_135_15.jpg"], "output": "A", "qwen3-vl": "image none"}, 
{"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A child goes across monkey bars.\nB: A child draws pictures of monkey bars.\nC: A child draws pictures of monkey bars.\nD: A child draws a picture of monkey bars.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A child goes across monkey bars.\nB: A child draws pictures of monkey bars.\nC: A child draws pictures of monkey bars.\nD: A child draws a picture of monkey bars.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_136_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A man is lifting a large barbell in a competition.\nB: A man is cleaning a large barbell in a gym.\nC: A man is carefully polishing a large barbell for a display.\nD: A man is repairing a large barbell at a workshop.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man is lifting a large barbell in a competition.\nB: A man is cleaning a large barbell in a gym.\nC: A man is carefully polishing a large barbell for a display.\nD: A man is repairing a large barbell at a workshop.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_137_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She rinses the paint brush in a paint can, ending her day of painting.\nB: She dips the paint brush into a paint can and continues painting.\nC: She drops the paint brush into the paint can and stops working.\nD: She tosses the paint brush into a paint can and stops working.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She rinses the paint brush in a paint can, ending her day of painting.\nB: She dips the paint brush into a paint can and continues painting.\nC: She drops the paint brush into the paint can and stops working.\nD: She tosses the paint brush into a paint can and stops working.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_138_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He discards the items into the sink.\nB: He 
discards the items from the sink.\nC: He places the items on the sink.\nD: He throws the items into the trash.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He discards the items into the sink.\nB: He discards the items from the sink.\nC: He places the items on the sink.\nD: He throws the items into the trash.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_139_15.jpg"], 
"output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A man is standing in a field in a small circle behind a green fence.\nB: A man is painting a small green fence in a field, encircling him.\nC: A man is repairing a green fence in a small field encircled by trees.\nD: A man is mending a green fence in a small circle within a field.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man is standing in a field in a small circle behind a green fence.\nB: A man is painting a small green fence in a field, encircling him.\nC: A man is repairing a green fence in a small field encircled by trees.\nD: A man is mending a green fence in a small circle within a field.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_140_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: They both read a book together.\nB: They both read a book together.\nC: They both go down the slide together.\nD: They both cook dinner together.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: They both read a book together.\nB: They both read a book together.\nC: They both go down the slide together.\nD: They both cook dinner together.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_141_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A woman in a pink jacket is walking her dog outdoors, leisurely enjoying the fresh air without engaging in any games or tricks.\nB: A woman in a pink jacket and her dog are enjoying 
a peaceful hike in the woods, observing the wildlife and resting by a serene lake.\nC: A woman in a pink jacket and her dog are outdoors and doing tricks wish discs as she throws them the dog catches, as well as the dog jumping over her, rolling over, dancing.\nD: A woman in a pink jacket and her dog are leisurely hiking outdoors, exploring nature trails and resting by a serene lake.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A woman in a pink jacket is walking her dog outdoors, leisurely enjoying the fresh air without engaging in any games or tricks.\nB: A woman in a pink jacket and her dog are enjoying a peaceful hike in the woods, observing the wildlife and resting by a serene lake.\nC: A woman in a pink jacket and her dog are outdoors and doing tricks wish discs as she throws them the dog catches, as well as the dog jumping over her, rolling over, dancing.\nD: A woman in a pink jacket and her dog are leisurely hiking outdoors, exploring nature trails and resting by a serene lake.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_142_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She picks up a towel, soaks it in vinegar, and begins to massage her aching feet with it.\nB: She picks up a towel, soaks it in vinegar, and begins to dab it on her sunburn for relief.\nC: She then grabs a towel,dips it in the vinegar and starts to wipe the table to clean it.\nD: She grabs a towel, soaks it in vinegar, and begins to pat dry the freshly washed vegetables.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: She picks up a towel, soaks it in vinegar, and begins to massage her aching feet with it.\nB: She picks up a towel, soaks it in vinegar, and begins to dab it on her sunburn for relief.\nC: She then grabs a towel,dips it in the vinegar and starts to wipe the table to clean it.\nD: She grabs a towel, soaks it in vinegar, and begins to pat dry the freshly washed vegetables.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_143_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 
natural images", "source": "var", "options": "A: Only one player remained on one team, while the other team gained two players.\nB: Only one player from one team, and two from the other, attended the charity event.\nC: Then only one player left on one team and two players on the other one.\nD: One player remained on one team, while the other team gained two new players.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Only one player remained on one team, while the other team gained two players.\nB: Only one player from one team, and two from the other, attended the charity event.\nC: Then only one player left on one team and two players on the other one.\nD: One player remained on one team, while the other team gained two new players.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_144_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A man empties the contents of his backpack.\nB: A man puts the backpack on his back.\nC: The man rummages through his backpack.\nD: The man empties his backpack onto the table.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man empties the contents of his backpack.\nB: A man puts the backpack on his back.\nC: The man rummages through his backpack.\nD: The man empties his backpack onto the table.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_145_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Sentence: The woman folds the t-shirt and places the iron aside.\nB: woman is holding an iron and is ironing the t shirt.\nC: Sentence: The woman is folding the t-shirt and placing the iron back on its stand.\nD: Woman is folding a t-shirt after removing the iron.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Sentence: The woman folds the t-shirt and places the iron aside.\nB: woman is holding an iron and is ironing the t shirt.\nC: Sentence: The woman is folding the t-shirt and placing the iron back on its stand.\nD: Woman is folding a t-shirt after removing the iron.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_146_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: They line up one by one, waiting to purchase ice cream 
below.\nB: They jump off one by one, landing in the water below.\nC: They sit one by one, stargazing on the water's edge.\nD: They march in line, one by one, towards the dining hall.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: They line up one by one, waiting to purchase ice cream below.\nB: They jump off one by one, landing in the water below.\nC: They sit one by one, stargazing on the water's edge.\nD: They march in line, one by one, towards the dining hall.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_147_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man dismounts the horse, kneels to pet the calf gently, then rises and strolls away.\nB: Then, the man get down the horse and kneels to tie the legs of the calf, then the man raises and walk.\nC: The man dismounts the horse, gently strokes the calf, and then strolls away leisurely.\nD: The man dismounts the horse, kneels to pet the calf, then stands up and strolls.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man dismounts the horse, kneels to pet the calf gently, then rises and strolls away.\nB: Then, the man get down the horse and kneels to tie the legs of the calf, then the man raises and walk.\nC: The man dismounts the horse, gently strokes the calf, and then strolls away leisurely.\nD: The man dismounts the horse, kneels to pet the calf, then stands up and strolls.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_5.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_148_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man gently takes the bar, serves an array of cocktails, then bows to the applauding crowd.\nB: The man then grabs the bar and does a series of flips and turns and then jumps off and nods to the crowd.\nC: The man calmly approaches the bar, orders a drink, then sits quietly, acknowledging the crowd with a brief nod.\nD: The man gently takes the bar, serves a series of cocktails, and then bows to the applauding crowd.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man gently takes the bar, serves an array of cocktails, then bows to the applauding crowd.\nB: The man then grabs the bar and does a series of flips and turns and then jumps off and nods to the crowd.\nC: The man calmly approaches the bar, orders a drink, then sits quietly, acknowledging the crowd with a brief nod.\nD: The man gently takes the bar, serves a series of cocktails, and then bows to the applauding crowd.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_149_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": 
"casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The box is effortlessly lifted by a pair of scissors, which then elegantly glide through a sheet of wrapping paper.\nB: Wrapping paper is seen cover the box followed by scissors cutting the paper and laying down a box.\nC: Scissors slice through wrapping paper before it's folded into a box.\nD: The box is sitting idle, flanked by unused wrapping paper and idle scissors.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The box is effortlessly lifted by a pair of scissors, which then elegantly glide through a sheet of wrapping paper.\nB: Wrapping paper is seen cover the box followed by scissors cutting the paper and laying down a box.\nC: Scissors slice through wrapping paper before it's folded into a box.\nD: The box is sitting idle, flanked by unused wrapping paper and idle scissors.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_150_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A man is seen looking around a field with audience members watching and leads into him running down a field and pole volting over a bar.\nB: A man, observed by a crowd, leisurely strolls across a field, pauses to examine a pole, and playfully attempts to limbo under it.\nC: A man is observed entertaining an audience by juggling balls, before sprinting across a field to pole vault over a high bar.\nD: A man walks through a field, observing the audience, before he begins a leisurely stroll down the lane, using a pole to navigate over a small creek.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A man is seen looking around a field with audience members watching and leads into him running down a field and pole volting over a bar.\nB: A man, observed by a crowd, leisurely strolls across a field, pauses to examine a pole, and playfully attempts to limbo under it.\nC: A man is observed entertaining an audience by juggling balls, before sprinting across a field to pole vault over a high bar.\nD: A man walks through a field, observing the audience, before he begins a leisurely stroll down the lane, using a pole to navigate over a small creek.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_151_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The woman tossed a frisbee for the dog, who sprinted a few feet to catch it.\nB: The woman raise a feet for the dog to jump over and catch a frisbee.\nC: The woman showed the dog a frisbee and made it sit at her feet.\nD: Sentence: The dog fetches the woman's slippers as she sits, feet propped up.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman tossed a frisbee for the dog, who sprinted a few feet to catch it.\nB: The woman raise a feet for the dog to jump over and catch a frisbee.\nC: The woman showed the dog a frisbee and made it sit at her feet.\nD: Sentence: The dog fetches the woman's slippers as she sits, feet propped up.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_152_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Clips from the game are analyzed and critiqued, while players sit quietly reflecting.\nB: More clips of the game are shown back to back as well as players cheering and celebrating.\nC: Players are seen discussing strategies, with game clips playing in the background.\nD: Players are seen studying the game's clips and reflecting on their moves, instead of cheering and celebrating.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Clips from the game are analyzed and critiqued, while players sit quietly reflecting.\nB: More clips of the game are shown back to back as well as players cheering and celebrating.\nC: Players are seen discussing strategies, with game clips playing in the background.\nD: Players are seen studying the game's clips and reflecting on their moves, instead of cheering and celebrating.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_153_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", 
"visual_input_component": "16 natural images", "source": "var", "options": "A: A crew of workers works on constructing a brick wall.\nB: A team of laborers enjoys a lunch break beside a finished brick wall.\nC: A crew of workers takes a break after demolishing a brick wall.\nD: A team of laborers is having lunch after laying bricks all morning.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A crew of workers works on constructing a brick wall.\nB: A team of laborers enjoys a lunch break beside a finished brick wall.\nC: A crew of workers takes a break after demolishing a brick wall.\nD: A team of laborers is having lunch after laying bricks all morning.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_154_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He asks the woman to hold his book as he scoops ice cream into a bowl.\nB: He escorts the woman out of the store, the forgotten ice cream cone melting on the counter.\nC: He hands the woman a cone and then puts ice cream on top.\nD: He takes a photo of the woman who dropped her ice cream cone.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He asks the woman to hold his book as he scoops ice cream into a bowl.\nB: He escorts the woman out of the store, the forgotten ice cream cone melting on the counter.\nC: He hands the woman a cone and then puts ice cream on top.\nD: He takes a photo of the woman who dropped her ice cream cone.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_155_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Several people guide the bull 
with sticks while someone heals the person injured by the bull.\nB: Several people taunt the bull with sticks while someone is hurt by the bull.\nC: Several people feed the bull with apples while someone is stroking the bull.\nD: Numerous individuals watched the bull peacefully from a distance as someone fed it a snack.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Several people guide the bull with sticks while someone heals the person injured by the bull.\nB: Several people taunt the bull with sticks while someone is hurt by the bull.\nC: Several people feed the bull with apples while someone is stroking the bull.\nD: Numerous individuals watched the bull peacefully from a distance as someone fed it a snack.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_10.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_156_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A bull is peacefully grazing in the field, while others lazily rest in the warm sunshine.\nB: A bull is then killed and laid in the dirt while others roam around him.\nC: A bull is pampered and fed in the grass as others peacefully graze around him.\nD: A bull peacefully grazes in the meadow as others frolic around him.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A bull is peacefully grazing in the field, while others lazily rest in the warm sunshine.\nB: A bull is then killed and laid in the dirt while others roam around him.\nC: A bull is pampered and fed in the grass as others peacefully graze around him.\nD: A bull peacefully grazes in the meadow as others frolic around him.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_157_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The 
man and his dog collect several frisbees scattered on the side, while many people cheer them on.\nB: Many people watch on the side as the man performs tricks with the dog using several frisbees.\nC: Several frisbees lay unused as the man and his dog take a nap, with people passing by quietly.\nD: Several people notice a man feeding his dog with multiple frisbees on the grass.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man and his dog collect several frisbees scattered on the side, while many people cheer them on.\nB: Many people watch on the side as the man performs tricks with the dog using several frisbees.\nC: Several frisbees lay unused as the man and his dog take a nap, with people passing by quietly.\nD: Several people notice a man feeding his dog with multiple frisbees on the grass.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_158_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The kindling floats on water.\nB: The kindling is piled up in the shed.\nC: The kindling floats on the river.\nD: The kindling catches on fire.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The kindling floats on water.\nB: The kindling is piled up in the shed.\nC: The kindling floats on the river.\nD: The kindling catches on fire.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_159_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She diligently organizes her tools, demonstrating how she sorts her paint colors.\nB: She meticulously cleans her tools, explaining the color theory behind her palette selection.\nC: 
She continues to paint along the picture while showing off her tools and how she blends the colors.\nD: She meticulously organizes her art tools, demonstrating her unique approach to color categorization.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She diligently organizes her tools, demonstrating how she sorts her paint colors.\nB: She meticulously cleans her tools, explaining the color theory behind her palette selection.\nC: She continues to paint along the picture while showing off her tools and how she blends the colors.\nD: She meticulously organizes her art tools, demonstrating her unique approach to color categorization.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_160_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The girl feeds the cat with her left hand.\nB: The girl holds the cats paw in her left hand.\nC: The cat swipes a toy from the girl's left hand.\nD: The girl pours milk into the cat's bowl with her right hand.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The girl feeds the cat with her left hand.\nB: The girl holds the cats paw in her left hand.\nC: The cat swipes a toy from the girl's left hand.\nD: The girl pours milk into the cat's bowl with her right hand.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_161_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Eggs are poached, onion caramelized, eggs garnished and served.\nB: Onions are saut\u00e9ed, eggs cracked, and scrambled together.\nC: Eggs are boiled, onion chopped, eggs drained and chopped.\nD: Onions are caramelized, eggs scrambled, and both are mixed for a delicious breakfast.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: Eggs are poached, onion caramelized, eggs garnished and served.\nB: Onions are saut\u00e9ed, eggs cracked, and scrambled together.\nC: Eggs are boiled, onion chopped, eggs drained and chopped.\nD: Onions are caramelized, eggs scrambled, and both are mixed for a delicious breakfast.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_162_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: woman is siting in a bed and its putting 
white socks.\nB: Sentence: A woman is dancing in a field, removing her white socks.\nC: Woman is standing on a porch, folding white socks.\nD: Sentence: Woman is standing in a kitchen, cooking pasta.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: woman is siting in a bed and its putting white socks.\nB: Sentence: A woman is dancing in a field, removing her white socks.\nC: Woman is standing on a porch, folding white socks.\nD: Sentence: Woman is standing in a kitchen, cooking pasta.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_163_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The coach invited them for a celebratory dinner after their successful match.\nB: The coach cheered them on and distributed water bottles during their exhausting practice.\nC: The coach watched and evaluates them to give them any tips and pointer.\nD: The coach organized a team dinner to build unity and camarity amongst them.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The coach invited them for a celebratory dinner after their successful match.\nB: The coach cheered them on and distributed water bottles during their exhausting practice.\nC: The coach watched and evaluates them to give them any tips and pointer.\nD: The coach organized a team dinner to build unity and camarity amongst them.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_164_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He previously nailed a metal piece onto the wall, now he is painting a new shingle to hang on it.\nB: He examines the metal piece he had previously nailed in before deciding to polish it instead of replacing it with a new shingle.\nC: He dusts off the metal piece he previously nailed in and removes the old shingle for replacement.\nD: He lays down a new shingle to replace it over the metal piece he previously nailed in.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He previously nailed a metal piece onto the wall, now he is painting a new shingle to hang on it.\nB: He examines the metal piece he had previously nailed in before deciding to polish it instead of replacing it with a new shingle.\nC: He dusts off the metal piece he previously nailed in and removes the old shingle for replacement.\nD: He lays down a new shingle to replace it over the metal piece he previously nailed in.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_165_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": 
"casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A man savors a unique dessert.\nB: A man changes his routine.\nC: A man does the same.\nD: A man alters the pattern.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man savors a unique dessert.\nB: A man changes his routine.\nC: A man does the same.\nD: A man alters the pattern.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_166_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man begins to dismantle the piece.\nB: The man begins to disassemble the piece.\nC: The man proceeds to assemble the piece.\nD: The man decides to disassemble the piece.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man begins to dismantle the piece.\nB: The man begins to disassemble the piece.\nC: The man proceeds to assemble the piece.\nD: The man decides to disassemble the piece.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_167_15.jpg"], "output": "F", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: Then, the man saw a speck of dust in the other eye.\nB: Later, the man removed the contact lens from the other eye.\nC: Suddenly, the man saw a tear forming in the other eye.\nD: Next, the man put the contact lens in the other eye.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: Then, the man saw a speck of dust in the other eye.\nB: Later, the man removed the contact lens from the other eye.\nC: Suddenly, the man saw a tear forming in the other eye.\nD: Next, the man put the contact lens in the other eye.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_168_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He bought the shoe with cash.\nB: He scrubs the shoe with a brush.\nC: He trips over the shoe and drops the brush.\nD: He ties the shoe with a lace.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: He bought the shoe with cash.\nB: He scrubs the shoe with a brush.\nC: He trips over the shoe and drops the brush.\nD: He ties the shoe with a lace.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_169_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She tosses a towel onto her legs and hands a lotion bottle to her friend.\nB: She tosses a towel onto her legs and hands her lotion to a friend.\nC: She pours lotion onto her 
hand and puts it on her legs with a towel.\nD: She hands the towel to her friend and uses the lotion to massage her tired legs.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She tosses a towel onto her legs and hands a lotion bottle to her friend.\nB: She tosses a towel onto her legs and hands her lotion to a friend.\nC: She pours lotion onto her hand and puts it on her legs with a towel.\nD: She hands the towel to her friend and uses the lotion to massage her tired legs.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_170_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A man is seen bending down in the middle of a forest.\nB: A man is spotted climbing a tree in the heart of a forest.\nC: A man is spotted climbing a tree in the heart of the forest.\nD: A man is spotted climbing a tree in the heart of a forest.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man is seen bending down in the middle of a forest.\nB: A man is spotted climbing a tree in the heart of a forest.\nC: A man is spotted climbing a tree in the heart of the forest.\nD: A man is spotted climbing a tree in the heart of a forest.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_171_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The woman, cradling the baby gently in a chair, watches the machines while the yarn lies untouched.\nB: Several shots of machines and yarn are shown as well as the woman still knitting in a chair and helping a baby.\nC: The woman in the chair gently sways the baby to sleep, while machines and colorful yarn are set aside, unused.\nD: The woman is bottle-feeding a baby in a chair while machines and yarn lie dormant in the background.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman, cradling the baby gently in a chair, watches the machines while the yarn lies untouched.\nB: Several shots of machines and yarn are shown as well as the woman still knitting in a chair and helping a baby.\nC: The woman in the chair gently sways the baby to sleep, while machines and colorful yarn are set aside, unused.\nD: The woman is bottle-feeding a baby in a chair while machines and yarn lie dormant in the background.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_172_15.jpg"], "output": "B", "qwen3-vl": "image none"}, 
{"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The girl reads a book while the two men engage in a heated debate.\nB: The two men continue to play with one another as the girl continues to watch on the side.\nC: The girl interrupts the heated debate between the two men, demanding their attention.\nD: The girl hands a book to the two men who stop their conversation to thank her.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The girl reads a book while the two men engage in a heated debate.\nB: The two men continue to play with one another as the girl continues to watch on the side.\nC: The girl interrupts the heated debate between the two men, demanding their attention.\nD: The girl hands a book to the two men who stop their conversation to thank her.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_9.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_173_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The person suddenly starts climbing the hill with a tube.\nB: The person then begins riding down the hill in a tube.\nC: The person starts climbing up the hill with a tube.\nD: The person then starts climbing up the hill with a backpack.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The person suddenly starts climbing the hill with a tube.\nB: The person then begins riding down the hill in a tube.\nC: The person starts climbing up the hill with a tube.\nD: The person then starts climbing up the hill with a backpack.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_174_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A child paints a picture as an adult frames it on the wall.\nB: A child washes and dry 
their hands leaving the water on which and adult turns off.\nC: An adult reads a book while a child playfully turns the pages.\nD: A child playfully chases an adult around the garden, leaving their hands dirty.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A child paints a picture as an adult frames it on the wall.\nB: A child washes and dry their hands leaving the water on which and adult turns off.\nC: An adult reads a book while a child playfully turns the pages.\nD: A child playfully chases an adult around the garden, leaving their hands dirty.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_175_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She bends down to tie her shoelaces, then gets up, laughing uncontrollably.\nB: She bends down to put socks on, then grabs her back in pain.\nC: She stoops to tie her shoelaces, then reaches back to adjust her ponytail.\nD: She stoops down to tie her shoelaces, and then clutches her necklace in joy.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She bends down to tie her shoelaces, then gets up, laughing uncontrollably.\nB: She bends down to put socks on, then grabs her back in pain.\nC: She stoops to tie her shoelaces, then reaches back to adjust her ponytail.\nD: She stoops down to tie her shoelaces, and then clutches her necklace in joy.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_176_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The person removes the ingredients from the bread as the camera captures her actions.\nB: The person puts more ingredients on the bread while the camera watches her movements.\nC: The camera captures the person as she removes ingredients from the bread.\nD: The camera captures the person as she cleans the bread crumbs off the table.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The person removes the ingredients from the bread as the camera captures her actions.\nB: The person puts more ingredients on the bread while the camera watches her movements.\nC: The camera captures the person as she removes ingredients from the bread.\nD: The camera captures the person as she cleans the bread crumbs off the table.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_177_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", 
"options": "A: He is watching a fish swim beneath the ice through a clear hole.\nB: He is attempting to catch a fish through a hole in the ice.\nC: He is teaching a fish to swim through a hole in the ice.\nD: He is reading a book beside a hole in the ice, ignoring the fish.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He is watching a fish swim beneath the ice through a clear hole.\nB: He is attempting to catch a fish through a hole in the ice.\nC: He is teaching a fish to swim through a hole in the ice.\nD: He is reading a book beside a hole in the ice, ignoring the fish.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_178_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: She leaves the platform and approaches the other side.\nB: She lingers on the other side, refusing to ascend the platform.\nC: She stops halfway and retreats back from the platform.\nD: She reaches the other side and steps onto the platform.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: She leaves the platform and approaches the other side.\nB: She lingers on the other side, refusing to ascend the platform.\nC: She stops halfway and retreats back from the platform.\nD: She reaches the other side and steps onto the platform.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_179_15.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man cleans the paint-soaked brush while the bare cabinet awaits its transformation.\nB: A man cleans a paintbrush before storing the unused paint and the bare cabinet.\nC: Paint is applied to a brush and the man puts a first coat onto the bare cabinet.\nD: The man cleans the paint off the brush after accidentally smearing it on the cabinet.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man cleans the paint-soaked brush while the bare cabinet awaits its transformation.\nB: A man cleans a paintbrush before storing the unused paint and the bare cabinet.\nC: Paint is applied to a brush and the man puts a first coat onto the bare cabinet.\nD: The man cleans the paint off the brush after accidentally smearing it on the cabinet.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_180_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": 
"var", "options": "A: A woman throws a frisbee for the dog to catch in a backyard.\nB: A woman finds a frisbee under a bush while gardening in her backyard, which her dog had lost.\nC: A woman plants a tree in the backyard while her dog watches.\nD: A woman is watering plants in a backyard while the dog watches.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A woman throws a frisbee for the dog to catch in a backyard.\nB: A woman finds a frisbee under a bush while gardening in her backyard, which her dog had lost.\nC: A woman plants a tree in the backyard while her dog watches.\nD: A woman is watering plants in a backyard while the dog watches.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_181_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: His trainer praises him for his improvement.\nB: His trainer congratulates him.\nC: His trainer comes towards him.\nD: His trainer submits his resignation to him.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: His trainer praises him for his improvement.\nB: His trainer congratulates him.\nC: His trainer comes towards him.\nD: His trainer submits his resignation to him.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_182_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The woman carefully feeds the dog as it eagerly catches the food in its mouth.\nB: The woman continues to throw the frisbee around while the dog chases after it and grabs it in it's mouth.\nC: The woman feeds the dog while it eagerly catches the kibble in its mouth.\nD: The woman feeds the dog as it sits patiently, gripping its favorite frisbee in its mouth.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman carefully feeds the dog as it eagerly catches the food in its mouth.\nB: The woman continues to throw the frisbee around while the dog chases after it and grabs it in it's mouth.\nC: The woman feeds the dog while it eagerly catches the kibble in its mouth.\nD: The woman feeds the dog as it sits patiently, gripping its favorite frisbee in its mouth.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_183_15.jpg"], "output": "E", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural 
images", "source": "var", "options": "A: The man shares a joke with the cop as he polishes his shoes.\nB: A cop looks at him while the man cuts his hair and shaves it all off.\nC: A man plays chess with a cop as his hair is blown by the wind.\nD: A man trims his beard as the cop interrogates him about the missing evidence.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: The man shares a joke with the cop as he polishes his shoes.\nB: A cop looks at him while the man cuts his hair and shaves it all off.\nC: A man plays chess with a cop as his hair is blown by the wind.\nD: A man trims his beard as the cop interrogates him about the missing evidence.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_184_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The woman fills the box with old papers and seals it, preparing it for recycling.\nB: The woman opens the box, removes the paper, and untapes the ends to reveal a surprise gift inside.\nC: The woman covers the box with paper and tapes up the ends to create a finished, wrapped present.\nD: The woman opens the box, removes the paper, revealing a beautifully crafted present inside.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The woman fills the box with old papers and seals it, preparing it for recycling.\nB: The woman opens the box, removes the paper, and untapes the ends to reveal a surprise gift inside.\nC: The woman covers the box with paper and tapes up the ends to create a finished, wrapped present.\nD: The woman opens the box, removes the paper, revealing a beautifully crafted present inside.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_185_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", 
"visual_input_component": "16 natural images", "source": "var", "options": "A: He uses his long beard to cover the electric razor as a surprise gift.\nB: he begins to shave his long beard with an electric razor.\nC: He starts to comb his long beard with an electric brush.\nD: He starts to stroke his long beard with contemplative fingers.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He uses his long beard to cover the electric razor as a surprise gift.\nB: he begins to shave his long beard with an electric razor.\nC: He starts to comb his long beard with an electric brush.\nD: He starts to stroke his long beard with contemplative fingers.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_186_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A person is soaking calmly in the tub water.\nB: A person is kicking around in the tub water.\nC: A person is relaxing in the tub water.\nD: A person is reading a book beside the tub water.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A person is soaking calmly in the tub water.\nB: A person is kicking around in the tub water.\nC: A person is relaxing in the tub water.\nD: A person is reading a book beside the tub water.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_187_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The man paints on a piece of metal.\nB: The man soldiers on a piece of metal.\nC: The man accidentally stepped on a piece of metal.\nD: The man bends a piece of metal.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The man paints on a piece of metal.\nB: The man soldiers on a piece of metal.\nC: The man accidentally stepped on a piece of metal.\nD: The man bends a piece of metal.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_188_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He sighs as the audience gasps and his mistake is replayed on the screens.\nB: He cheers while the audience cheers and his shot is shown again for cameras.\nC: 
He sighs as the audience jeers and his error is replayed on the jumbotron.\nD: He sighs as the audience gasps and his mistake is replayed for the cameras.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He sighs as the audience gasps and his mistake is replayed on the screens.\nB: He cheers while the audience cheers and his shot is shown again for cameras.\nC: He sighs as the audience jeers and his error is replayed on the jumbotron.\nD: He sighs as the audience gasps and his mistake is replayed for the cameras.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_12.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_189_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A man is seen speaking to the camera and leads into him pouring ice into a glass as well as various liquids.\nB: A man is captured on camera carefully placing ice into a glass, followed by mixing different fluids.\nC: A man is spotted on camera reading a book, then he starts filling a glass with sand and different types of grains.\nD: A man is observed on camera, carefully arranging ice and different liquids into a vibrant display.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: A man is seen speaking to the camera and leads into him pouring ice into a glass as well as various liquids.\nB: A man is captured on camera carefully placing ice into a glass, followed by mixing different fluids.\nC: A man is spotted on camera reading a book, then he starts filling a glass with sand and different types of grains.\nD: A man is observed on camera, carefully arranging ice and different liquids into a vibrant display.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_190_15.jpg"], "output": "A", "qwen3-vl": "image none"}, 
{"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: They laugh loudly as they walk.\nB: They hold hands as they walk.\nC: They turn flips as they go.\nD: They hold hands as they walk.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: They laugh loudly as they walk.\nB: They hold hands as they walk.\nC: They turn flips as they go.\nD: They hold hands as they walk.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_13.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_191_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A man holding the camera is inside a gate while wearing a helmet and talking with another man.\nB: A man in a helmet is handing a camera to another man outside a gate.\nC: A man, wearing a helmet, handed over his camera to another man inside a gate.\nD: A man in a helmet is arguing with another man over a camera, outside the gate.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A man holding the camera is inside a gate while wearing a helmet and talking with another man.\nB: A man in a helmet is handing a camera to another man outside a gate.\nC: A man, wearing a helmet, handed over his camera to another man inside a gate.\nD: A man in a helmet is arguing with another man over a camera, outside the gate.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_6.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_192_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The girl paints the red platform and climbs up the stairs.\nB: The girl paints a red platform and climbs up the stairs.\nC: The girl gets off on a red platform and walks down the stairs.\nD: The girl paints the red platform and climbs up the stairs.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The girl paints the red platform and climbs up the stairs.\nB: The girl paints a red platform and climbs up the stairs.\nC: The girl gets off on a red platform and walks down the stairs.\nD: The girl paints the red platform and climbs up the stairs.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_193_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: A group of kids watch a young boy throw a dart onto a glass window.\nB: A 
group of kids observe as a young boy gently places a dart onto a glass window for an art project.\nC: A bunch of children observe as a little boy sticks a drawing onto a glass window.\nD: A group of kids observe a young boy sketching a rainbow on a glass window.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: A group of kids watch a young boy throw a dart onto a glass window.\nB: A group of kids observe as a young boy gently places a dart onto a glass window for an art project.\nC: A bunch of children observe as a little boy sticks a drawing onto a glass window.\nD: A group of kids observe a young boy sketching a rainbow on a glass window.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_11.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_194_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He frowns and roughly slaps his face multiple times.\nB: He touches his face several times and smiles.\nC: He slaps his face once and frowns.\nD: He slams his fist on the table and scowls.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He frowns and roughly slaps his face multiple times.\nB: He touches his face several times and smiles.\nC: He slaps his face once and frowns.\nD: He slams his fist on the table and scowls.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_7.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_195_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: The men cook the fish, narrate the recipe to the camera, and continue to showcase various fish dishes.\nB: The men presents the fish to the camera as well as speak to the camera and continue to grab and hold up fish.\nC: The men toss the fish back into the water, silently wave to the camera, and proceed to cast their lines for another catch.\nD: The men cook the fish while bantering for a cooking show, occasionally glancing at the camera.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: The men cook the fish, narrate the recipe to the camera, and continue to showcase various fish dishes.\nB: The men presents the fish to the camera as well as speak to the camera and continue to grab and hold up fish.\nC: The men toss the fish back into the water, silently wave to the camera, and proceed to cast their lines for another catch.\nD: The men cook the fish while bantering for a cooking show, occasionally glancing at the camera.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_196_15.jpg"], "output": "B", "qwen3-vl": "image none"}, 
{"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: a man is on a snow covered lake with a fishing pole and fish reader.\nB: A man is studying a fish reader near a snow-covered lake, neglecting his abandoned fishing pole.\nC: A man is cleaning his fishing pole and fish reader, beside a snow covered lake.\nD: A man is inspecting his fishing pole and fish reader in the garage, eager for the lake to thaw.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: a man is on a snow covered lake with a fishing pole and fish reader.\nB: A man is studying a fish reader near a snow-covered lake, neglecting his abandoned fishing pole.\nC: A man is cleaning his fishing pole and fish reader, beside a snow covered lake.\nD: A man is inspecting his fishing pole and fish reader in the garage, eager for the lake to thaw.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_8.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_197_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: His shirt is neatly folded as he calmly steps on the stage.\nB: He is seen calmly folding his shirt before sitting down to read.\nC: His jump is shown again in slow motion as well as him taking his shirt off.\nD: His shirt is neatly folded as he sits calmly, refusing to jump into conclusions.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. 
Examine the details and answer the given question.\nSelect from the following choices.\nA: His shirt is neatly folded as he calmly steps on the stage.\nB: He is seen calmly folding his shirt before sitting down to read.\nC: His jump is shown again in slow motion as well as him taking his shirt off.\nD: His shirt is neatly folded as he sits calmly, refusing to jump into conclusions.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_14.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_198_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "casuality_reasoning_var", "visual_input_component": "16 natural images", "source": "var", "options": "A: He places the red saw on the 
tile.\nB: He broke the red tile by dropping a saw.\nC: He cuts the tile with a red saw.\nD: He paints the tile with a red saw.", "question": "What event is most likely to have occurred during the blank frames?", "context": "You are given 16 images of sequential occurrences, some of which contain consecutive blank frames represented by white. Examine the details and answer the given question.\nSelect from the following choices.\nA: He places the red saw on the tile.\nB: He broke the red tile by dropping a saw.\nC: He cuts the tile with a red saw.\nD: He paints the tile with a red saw.\n", "input_image_path": ["./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_0.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_1.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_2.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_3.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_4.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_5.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_6.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_7.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_8.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_9.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_10.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_11.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_12.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_13.jpg", "./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_14.jpg", 
"./High-level-sub-semantic/casuality_reasoning_var/casuality_reasoning_var_199_15.jpg"], "output": "C", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/emotion_recognition_expw/qwen3-vl/metadata_info.json b/results/emotion_recognition_expw/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..881c736 --- /dev/null +++ b/results/emotion_recognition_expw/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_0_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_0_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_0_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_0_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", 
"input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_1_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_1_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_1_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_1_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_2_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_2_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_2_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_2_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_3_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_3_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_3_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_3_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_4_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_4_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_4_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_4_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_5_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_5_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_5_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_5_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_6_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_6_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_6_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_6_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_7_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_7_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_7_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_7_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_8_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_8_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_8_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_8_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_9_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_9_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_9_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_9_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_10_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_10_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_10_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_10_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_11_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_11_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_11_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_11_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_12_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_12_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_12_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_12_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_13_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_13_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_13_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_13_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_14_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_14_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_14_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_14_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_15_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_15_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_15_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_15_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_16_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_16_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_16_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_16_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_17_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_17_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_17_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_17_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_18_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_18_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_18_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_18_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_19_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_19_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_19_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_19_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_20_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_20_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_20_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_20_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_21_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_21_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_21_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_21_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_22_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_22_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_22_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_22_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_23_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_23_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_23_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_23_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_24_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_24_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_24_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_24_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_25_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_25_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_25_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_25_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_26_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_26_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_26_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_26_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_27_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_27_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_27_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_27_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_28_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_28_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_28_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_28_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_29_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_29_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_29_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_29_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_30_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_30_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_30_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_30_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_31_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_31_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_31_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_31_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_32_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_32_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_32_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_32_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_33_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_33_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_33_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_33_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_34_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_34_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_34_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_34_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_35_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_35_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_35_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_35_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_36_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_36_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_36_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_36_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_37_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_37_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_37_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_37_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_38_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_38_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_38_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_38_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_39_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_39_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_39_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_39_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_40_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_40_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_40_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_40_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_41_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_41_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_41_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_41_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_42_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_42_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_42_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_42_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_43_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_43_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_43_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_43_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_44_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_44_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_44_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_44_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_45_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_45_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_45_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_45_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_46_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_46_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_46_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_46_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_47_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_47_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_47_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_47_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_48_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_48_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_48_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_48_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_49_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_49_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_49_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_49_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_50_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_50_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_50_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_50_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_51_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_51_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_51_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_51_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_52_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_52_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_52_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_52_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_53_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_53_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_53_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_53_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_54_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_54_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_54_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_54_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_55_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_55_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_55_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_55_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_56_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_56_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_56_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_56_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_57_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_57_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_57_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_57_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_58_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_58_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_58_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_58_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_59_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_59_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_59_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_59_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_60_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_60_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_60_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_60_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_61_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_61_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_61_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_61_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_62_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_62_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_62_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_62_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_63_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_63_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_63_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_63_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_64_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_64_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_64_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_64_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_65_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_65_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_65_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_65_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_66_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_66_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_66_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_66_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_67_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_67_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_67_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_67_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_68_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_68_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_68_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_68_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_69_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_69_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_69_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_69_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_70_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_70_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_70_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_70_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_71_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_71_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_71_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_71_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_72_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_72_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_72_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_72_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_73_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_73_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_73_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_73_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_74_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_74_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_74_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_74_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_75_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_75_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_75_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_75_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_76_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_76_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_76_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_76_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_77_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_77_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_77_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_77_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_78_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_78_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_78_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_78_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_79_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_79_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_79_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_79_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_80_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_80_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_80_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_80_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_81_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_81_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_81_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_81_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_82_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_82_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_82_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_82_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_83_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_83_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_83_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_83_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_84_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_84_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_84_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_84_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_85_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_85_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_85_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_85_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_86_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_86_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_86_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_86_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_87_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_87_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_87_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_87_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_88_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_88_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_88_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_88_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_89_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_89_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_89_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_89_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_90_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_90_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_90_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_90_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_91_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_91_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_91_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_91_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_92_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_92_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_92_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_92_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_93_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_93_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_93_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_93_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_94_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_94_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_94_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_94_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_95_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_95_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_95_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_95_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_96_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_96_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_96_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_96_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_97_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_97_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_97_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_97_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_98_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_98_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_98_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_98_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_99_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_99_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_99_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_99_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_100_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_100_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_100_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_100_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_101_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_101_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_101_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_101_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_102_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_102_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_102_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_102_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_103_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_103_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_103_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_103_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_104_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_104_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_104_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_104_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_105_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_105_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_105_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_105_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_106_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_106_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_106_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_106_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_107_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_107_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_107_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_107_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_108_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_108_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_108_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_108_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_109_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_109_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_109_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_109_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_110_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_110_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_110_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_110_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_111_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_111_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_111_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_111_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_112_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_112_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_112_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_112_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_113_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_113_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_113_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_113_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_114_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_114_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_114_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_114_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_115_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_115_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_115_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_115_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_116_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_116_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_116_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_116_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_117_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_117_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_117_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_117_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_118_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_118_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_118_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_118_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_119_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_119_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_119_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_119_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_120_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_120_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_120_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_120_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_121_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_121_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_121_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_121_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_122_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_122_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_122_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_122_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_123_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_123_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_123_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_123_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_124_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_124_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_124_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_124_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_125_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_125_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_125_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_125_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_126_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_126_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_126_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_126_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_127_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_127_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_127_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_127_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_128_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_128_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_128_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_128_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_129_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_129_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_129_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_129_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_130_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_130_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_130_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_130_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_131_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_131_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_131_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_131_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_132_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_132_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_132_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_132_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_133_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_133_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_133_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_133_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_134_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_134_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_134_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_134_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_135_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_135_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_135_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_135_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_136_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_136_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_136_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_136_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_137_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_137_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_137_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_137_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_138_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_138_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_138_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_138_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_139_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_139_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_139_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_139_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_140_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_140_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_140_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_140_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_141_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_141_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_141_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_141_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_142_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_142_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_142_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_142_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_143_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_143_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_143_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_143_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_144_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_144_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_144_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_144_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_145_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_145_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_145_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_145_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_146_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_146_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_146_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_146_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_147_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_147_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_147_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_147_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_148_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_148_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_148_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_148_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_149_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_149_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_149_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_149_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_150_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_150_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_150_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_150_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_151_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_151_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_151_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_151_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_152_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_152_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_152_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_152_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_153_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_153_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_153_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_153_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_154_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_154_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_154_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_154_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_155_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_155_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_155_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_155_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_156_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_156_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_156_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_156_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_157_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_157_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_157_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_157_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_158_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_158_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_158_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_158_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_159_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_159_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_159_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_159_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_160_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_160_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_160_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_160_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_161_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_161_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_161_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_161_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_162_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_162_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_162_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_162_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_163_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_163_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_163_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_163_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_164_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_164_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_164_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_164_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_165_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_165_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_165_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_165_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_166_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_166_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_166_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_166_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_167_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_167_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_167_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_167_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_168_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_168_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_168_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_168_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_169_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_169_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_169_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_169_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_170_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_170_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_170_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_170_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_171_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_171_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_171_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_171_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_172_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_172_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_172_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_172_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_173_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_173_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_173_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_173_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_174_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_174_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_174_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_174_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_175_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_175_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_175_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_175_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_176_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_176_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_176_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_176_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_177_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_177_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_177_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_177_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_178_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_178_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_178_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_178_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_179_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_179_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_179_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_179_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_180_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_180_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_180_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_180_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_181_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_181_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_181_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_181_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_182_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_182_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_182_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_182_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_183_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_183_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_183_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_183_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_184_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_184_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_184_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_184_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_185_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_185_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_185_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_185_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_186_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_186_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_186_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_186_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_187_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_187_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_187_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_187_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_188_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_188_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_188_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_188_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_189_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_189_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_189_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_189_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_190_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_190_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_190_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_190_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_191_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_191_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_191_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_191_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_192_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_192_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_192_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_192_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_193_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_193_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_193_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_193_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_194_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_194_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_194_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_194_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_195_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_195_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_195_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_195_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_196_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_196_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_196_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_196_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_197_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_197_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_197_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_197_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_198_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_198_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_198_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_198_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_expw", "visual_input_component": "4 natural images", "source": "expw", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_199_0.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_199_1.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_199_2.jpg", "./High-level-sub-semantic/emotion_recognition_expw/emotion_recognition_expw_199_3.jpg"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/emotion_recognition_findingemo/qwen3-vl/metadata_info.json b/results/emotion_recognition_findingemo/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..c4e014f --- /dev/null +++ b/results/emotion_recognition_findingemo/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_0_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_0_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_0_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_0_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which 
image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_1_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_1_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_1_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_1_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_2_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_2_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_2_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_2_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the 
first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_3_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_3_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_3_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_3_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_4_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_4_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_4_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_4_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": 
"emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_5_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_5_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_5_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_5_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_6_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_6_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_6_2.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_6_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_7_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_7_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_7_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_7_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_8_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_8_1.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_8_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_8_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_9_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_9_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_9_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_9_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_10_0.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_10_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_10_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_10_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_11_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_11_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_11_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_11_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", 
"input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_12_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_12_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_12_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_12_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_13_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_13_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_13_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_13_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the 
following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_14_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_14_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_14_2.png", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_14_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_15_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_15_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_15_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_15_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell 
which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_16_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_16_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_16_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_16_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_17_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_17_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_17_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_17_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion 
from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_18_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_18_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_18_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_18_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_19_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_19_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_19_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_19_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the 
third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_20_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_20_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_20_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_20_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_21_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_21_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_21_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_21_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": 
"4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_22_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_22_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_22_2.png", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_22_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_23_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_23_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_23_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_23_3.jpg"], 
"output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_24_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_24_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_24_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_24_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_25_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_25_1.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_25_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_25_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_26_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_26_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_26_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_26_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_27_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_27_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_27_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_27_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_28_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_28_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_28_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_28_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: 
the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_29_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_29_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_29_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_29_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_30_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_30_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_30_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_30_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has 
emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_31_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_31_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_31_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_31_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_32_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_32_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_32_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_32_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other 
images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_33_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_33_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_33_2.png", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_33_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_34_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_34_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_34_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_34_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: 
the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_35_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_35_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_35_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_35_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_36_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_36_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_36_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_36_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural 
images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_37_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_37_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_37_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_37_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_38_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_38_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_38_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_38_3.jpg"], "output": "C", 
"qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_39_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_39_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_39_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_39_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_40_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_40_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_40_2.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_40_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_41_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_41_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_41_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_41_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_42_0.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_42_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_42_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_42_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_43_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_43_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_43_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_43_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", 
"input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_44_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_44_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_44_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_44_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_45_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_45_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_45_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_45_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the 
following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_46_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_46_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_46_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_46_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_47_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_47_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_47_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_47_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell 
which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_48_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_48_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_48_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_48_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_49_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_49_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_49_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_49_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion 
from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_50_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_50_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_50_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_50_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_51_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_51_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_51_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_51_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the 
third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_52_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_52_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_52_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_52_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_53_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_53_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_53_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_53_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": 
"4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_54_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_54_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_54_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_54_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_55_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_55_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_55_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_55_3.png"], 
"output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_56_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_56_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_56_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_56_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_57_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_57_1.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_57_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_57_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_58_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_58_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_58_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_58_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_59_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_59_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_59_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_59_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_60_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_60_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_60_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_60_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: 
the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_61_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_61_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_61_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_61_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_62_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_62_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_62_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_62_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has 
emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_63_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_63_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_63_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_63_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_64_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_64_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_64_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_64_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other 
images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_65_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_65_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_65_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_65_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_66_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_66_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_66_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_66_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: 
the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_67_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_67_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_67_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_67_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_68_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_68_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_68_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_68_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural 
images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_69_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_69_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_69_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_69_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_70_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_70_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_70_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_70_3.jpg"], "output": "B", 
"qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_71_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_71_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_71_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_71_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_72_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_72_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_72_2.png", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_72_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_73_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_73_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_73_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_73_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_74_0.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_74_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_74_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_74_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_75_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_75_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_75_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_75_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", 
"input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_76_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_76_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_76_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_76_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_77_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_77_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_77_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_77_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the 
following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_78_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_78_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_78_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_78_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_79_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_79_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_79_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_79_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell 
which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_80_0.png", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_80_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_80_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_80_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_81_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_81_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_81_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_81_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion 
from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_82_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_82_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_82_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_82_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_83_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_83_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_83_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_83_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the 
third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_84_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_84_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_84_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_84_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_85_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_85_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_85_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_85_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": 
"4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_86_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_86_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_86_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_86_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_87_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_87_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_87_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_87_3.jpg"], 
"output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_88_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_88_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_88_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_88_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_89_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_89_1.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_89_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_89_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_90_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_90_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_90_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_90_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": 
["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_91_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_91_1.png", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_91_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_91_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_92_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_92_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_92_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_92_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: 
the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_93_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_93_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_93_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_93_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_94_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_94_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_94_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_94_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has 
emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_95_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_95_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_95_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_95_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_96_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_96_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_96_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_96_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other 
images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_97_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_97_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_97_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_97_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_98_0.png", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_98_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_98_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_98_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: 
the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_99_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_99_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_99_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_99_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_100_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_100_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_100_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_100_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural 
images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_101_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_101_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_101_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_101_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_102_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_102_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_102_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_102_3.jpg"], "output": 
"D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_103_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_103_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_103_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_103_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_104_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_104_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_104_2.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_104_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_105_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_105_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_105_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_105_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_106_0.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_106_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_106_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_106_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_107_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_107_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_107_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_107_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", 
"input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_108_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_108_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_108_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_108_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_109_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_109_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_109_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_109_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect 
from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_110_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_110_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_110_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_110_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_111_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_111_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_111_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_111_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the 
details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_112_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_112_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_112_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_112_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_113_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_113_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_113_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_113_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to 
show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_114_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_114_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_114_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_114_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_115_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_115_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_115_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_115_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first 
image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_116_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_116_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_116_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_116_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_117_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_117_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_117_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_117_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": 
"emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_118_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_118_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_118_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_118_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_119_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_119_1.png", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_119_2.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_119_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_120_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_120_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_120_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_120_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_121_0.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_121_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_121_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_121_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_122_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_122_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_122_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_122_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", 
"input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_123_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_123_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_123_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_123_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_124_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_124_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_124_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_124_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect 
from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_125_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_125_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_125_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_125_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_126_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_126_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_126_2.png", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_126_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the 
details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_127_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_127_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_127_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_127_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_128_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_128_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_128_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_128_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to 
show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_129_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_129_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_129_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_129_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_130_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_130_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_130_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_130_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first 
image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_131_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_131_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_131_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_131_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_132_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_132_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_132_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_132_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": 
"emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_133_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_133_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_133_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_133_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_134_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_134_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_134_2.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_134_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_135_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_135_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_135_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_135_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_136_0.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_136_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_136_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_136_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_137_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_137_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_137_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_137_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", 
"input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_138_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_138_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_138_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_138_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_139_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_139_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_139_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_139_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect 
from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_140_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_140_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_140_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_140_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_141_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_141_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_141_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_141_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the 
details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_142_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_142_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_142_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_142_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_143_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_143_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_143_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_143_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to 
show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_144_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_144_1.png", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_144_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_144_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_145_0.png", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_145_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_145_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_145_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first 
image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_146_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_146_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_146_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_146_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_147_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_147_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_147_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_147_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": 
"emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_148_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_148_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_148_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_148_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_149_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_149_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_149_2.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_149_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_150_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_150_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_150_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_150_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_151_0.png", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_151_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_151_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_151_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_152_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_152_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_152_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_152_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", 
"input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_153_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_153_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_153_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_153_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_154_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_154_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_154_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_154_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect 
from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_155_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_155_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_155_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_155_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_156_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_156_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_156_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_156_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the 
details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_157_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_157_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_157_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_157_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_158_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_158_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_158_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_158_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to 
show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_159_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_159_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_159_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_159_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_160_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_160_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_160_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_160_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first 
image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_161_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_161_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_161_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_161_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_162_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_162_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_162_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_162_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": 
"emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_163_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_163_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_163_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_163_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_164_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_164_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_164_2.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_164_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_165_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_165_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_165_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_165_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_166_0.png", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_166_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_166_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_166_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_167_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_167_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_167_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_167_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", 
"input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_168_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_168_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_168_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_168_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_169_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_169_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_169_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_169_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect 
from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_170_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_170_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_170_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_170_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_171_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_171_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_171_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_171_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the 
details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_172_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_172_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_172_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_172_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_173_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_173_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_173_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_173_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to 
show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_174_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_174_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_174_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_174_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_175_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_175_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_175_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_175_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first 
image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_176_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_176_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_176_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_176_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_177_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_177_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_177_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_177_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": 
"emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_178_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_178_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_178_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_178_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_179_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_179_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_179_2.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_179_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_180_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_180_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_180_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_180_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_181_0.png", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_181_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_181_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_181_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_182_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_182_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_182_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_182_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", 
"input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_183_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_183_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_183_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_183_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_184_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_184_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_184_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_184_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect 
from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_185_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_185_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_185_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_185_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_186_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_186_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_186_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_186_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the 
details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_187_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_187_1.png", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_187_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_187_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_188_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_188_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_188_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_188_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to 
show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_189_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_189_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_189_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_189_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_190_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_190_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_190_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_190_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first 
image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_191_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_191_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_191_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_191_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_192_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_192_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_192_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_192_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": 
"emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_193_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_193_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_193_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_193_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_194_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_194_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_194_2.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_194_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_195_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_195_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_195_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_195_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_196_0.jpg", 
"./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_196_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_196_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_196_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_197_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_197_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_197_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_197_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", 
"input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_198_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_198_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_198_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_198_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "emotion_recognition_findingemo", "visual_input_component": "4 natural images", "source": "findingemo", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to show a different emotion from the other images?", "context": "Now you are given four images, please examine the details and tell which one of them has emotions of the characters that are different from the others.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_199_0.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_199_1.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_199_2.jpg", "./High-level-sub-semantic/emotion_recognition_findingemo/emotion_recognition_findingemo_199_3.jpg"], "output": "B", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/face_retrieval/qwen3-vl/metadata_info.json b/results/face_retrieval/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..8de9b36 --- /dev/null +++ b/results/face_retrieval/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", 
"question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_0_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_0_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_0_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_0_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_0_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_1_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_1_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_1_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_1_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_1_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/face_retrieval/face_retrieval_2_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_2_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_2_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_2_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_2_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_3_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_3_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_3_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_3_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_3_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_4_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_4_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_4_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_4_3.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_4_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_5_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_5_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_5_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_5_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_5_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_6_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_6_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_6_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_6_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_6_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth 
image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_7_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_7_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_7_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_7_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_7_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_8_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_8_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_8_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_8_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_8_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/face_retrieval/face_retrieval_9_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_9_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_9_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_9_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_9_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_10_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_10_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_10_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_10_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_10_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_11_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_11_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_11_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_11_3.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_11_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_12_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_12_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_12_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_12_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_12_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_13_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_13_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_13_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_13_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_13_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The 
fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_14_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_14_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_14_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_14_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_14_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_15_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_15_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_15_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_15_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_15_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/face_retrieval/face_retrieval_16_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_16_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_16_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_16_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_16_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_17_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_17_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_17_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_17_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_17_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_18_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_18_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_18_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_18_3.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_18_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_19_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_19_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_19_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_19_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_19_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_20_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_20_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_20_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_20_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_20_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The 
fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_21_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_21_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_21_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_21_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_21_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_22_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_22_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_22_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_22_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_22_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/face_retrieval/face_retrieval_23_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_23_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_23_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_23_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_23_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_24_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_24_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_24_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_24_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_24_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_25_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_25_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_25_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_25_3.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_25_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_26_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_26_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_26_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_26_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_26_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_27_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_27_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_27_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_27_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_27_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The 
fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_28_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_28_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_28_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_28_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_28_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_29_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_29_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_29_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_29_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_29_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/face_retrieval/face_retrieval_30_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_30_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_30_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_30_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_30_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_31_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_31_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_31_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_31_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_31_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_32_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_32_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_32_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_32_3.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_32_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_33_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_33_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_33_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_33_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_33_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_34_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_34_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_34_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_34_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_34_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: 
The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_35_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_35_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_35_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_35_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_35_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_36_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_36_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_36_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_36_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_36_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/face_retrieval/face_retrieval_37_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_37_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_37_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_37_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_37_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_38_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_38_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_38_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_38_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_38_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_39_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_39_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_39_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_39_3.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_39_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_40_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_40_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_40_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_40_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_40_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_41_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_41_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_41_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_41_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_41_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The 
fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_42_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_42_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_42_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_42_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_42_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_43_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_43_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_43_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_43_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_43_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/face_retrieval/face_retrieval_44_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_44_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_44_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_44_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_44_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_45_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_45_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_45_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_45_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_45_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_46_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_46_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_46_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_46_3.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_46_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_47_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_47_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_47_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_47_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_47_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_48_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_48_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_48_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_48_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_48_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth 
image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_49_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_49_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_49_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_49_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_49_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_50_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_50_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_50_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_50_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_50_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", 
"input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_51_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_51_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_51_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_51_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_51_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_52_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_52_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_52_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_52_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_52_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_53_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_53_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_53_2.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_53_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_53_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_54_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_54_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_54_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_54_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_54_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_55_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_55_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_55_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_55_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_55_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": 
"A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_56_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_56_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_56_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_56_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_56_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_57_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_57_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_57_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_57_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_57_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third 
image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_58_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_58_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_58_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_58_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_58_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_59_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_59_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_59_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_59_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_59_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_60_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_60_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_60_2.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_60_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_60_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_61_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_61_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_61_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_61_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_61_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_62_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_62_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_62_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_62_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_62_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: 
The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_63_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_63_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_63_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_63_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_63_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_64_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_64_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_64_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_64_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_64_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: 
The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_65_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_65_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_65_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_65_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_65_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_66_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_66_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_66_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_66_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_66_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_67_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_67_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_67_2.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_67_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_67_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_68_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_68_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_68_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_68_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_68_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_69_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_69_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_69_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_69_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_69_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", 
"options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_70_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_70_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_70_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_70_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_70_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_71_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_71_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_71_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_71_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_71_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_72_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_72_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_72_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_72_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_72_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_73_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_73_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_73_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_73_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_73_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_74_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_74_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_74_2.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_74_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_74_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_75_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_75_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_75_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_75_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_75_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_76_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_76_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_76_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_76_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_76_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": 
"A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_77_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_77_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_77_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_77_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_77_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_78_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_78_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_78_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_78_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_78_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third 
image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_79_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_79_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_79_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_79_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_79_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_80_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_80_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_80_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_80_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_80_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_81_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_81_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_81_2.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_81_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_81_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_82_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_82_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_82_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_82_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_82_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_83_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_83_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_83_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_83_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_83_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": 
"A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_84_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_84_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_84_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_84_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_84_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_85_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_85_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_85_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_85_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_85_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third 
image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_86_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_86_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_86_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_86_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_86_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_87_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_87_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_87_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_87_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_87_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_88_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_88_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_88_2.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_88_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_88_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_89_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_89_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_89_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_89_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_89_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_90_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_90_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_90_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_90_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_90_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", 
"options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_91_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_91_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_91_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_91_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_91_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_92_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_92_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_92_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_92_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_92_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_93_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_93_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_93_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_93_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_93_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_94_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_94_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_94_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_94_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_94_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_95_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_95_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_95_2.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_95_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_95_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_96_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_96_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_96_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_96_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_96_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_97_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_97_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_97_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_97_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_97_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", 
"options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_98_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_98_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_98_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_98_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_98_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_99_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_99_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_99_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_99_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_99_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_100_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_100_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_100_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_100_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_100_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_101_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_101_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_101_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_101_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_101_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_102_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_102_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_102_2.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_102_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_102_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_103_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_103_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_103_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_103_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_103_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_104_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_104_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_104_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_104_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_104_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", 
"options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_105_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_105_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_105_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_105_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_105_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_106_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_106_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_106_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_106_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_106_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_107_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_107_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_107_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_107_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_107_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_108_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_108_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_108_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_108_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_108_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_109_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_109_1.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_109_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_109_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_109_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_110_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_110_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_110_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_110_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_110_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_111_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_111_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_111_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_111_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_111_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", 
"visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_112_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_112_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_112_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_112_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_112_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_113_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_113_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_113_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_113_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_113_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the 
first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_114_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_114_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_114_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_114_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_114_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_115_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_115_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_115_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_115_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_115_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_116_0.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_116_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_116_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_116_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_116_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_117_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_117_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_117_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_117_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_117_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_118_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_118_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_118_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_118_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_118_4.jpg"], 
"output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_119_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_119_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_119_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_119_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_119_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_120_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_120_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_120_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_120_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_120_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most 
similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_121_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_121_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_121_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_121_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_121_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_122_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_122_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_122_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_122_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_122_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/face_retrieval/face_retrieval_123_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_123_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_123_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_123_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_123_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_124_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_124_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_124_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_124_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_124_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_125_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_125_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_125_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_125_3.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_125_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_126_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_126_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_126_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_126_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_126_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_127_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_127_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_127_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_127_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_127_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth 
image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_128_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_128_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_128_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_128_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_128_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_129_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_129_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_129_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_129_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_129_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", 
"input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_130_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_130_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_130_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_130_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_130_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_131_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_131_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_131_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_131_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_131_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_132_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_132_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_132_2.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_132_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_132_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_133_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_133_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_133_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_133_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_133_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_134_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_134_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_134_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_134_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_134_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": 
"CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_135_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_135_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_135_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_135_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_135_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_136_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_136_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_136_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_136_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_136_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_137_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_137_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_137_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_137_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_137_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_138_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_138_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_138_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_138_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_138_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_139_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_139_1.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_139_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_139_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_139_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_140_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_140_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_140_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_140_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_140_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_141_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_141_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_141_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_141_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_141_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", 
"visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_142_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_142_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_142_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_142_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_142_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_143_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_143_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_143_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_143_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_143_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first 
image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_144_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_144_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_144_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_144_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_144_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_145_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_145_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_145_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_145_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_145_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_146_0.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_146_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_146_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_146_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_146_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_147_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_147_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_147_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_147_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_147_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_148_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_148_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_148_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_148_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_148_4.jpg"], 
"output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_149_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_149_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_149_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_149_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_149_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_150_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_150_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_150_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_150_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_150_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most 
similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_151_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_151_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_151_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_151_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_151_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_152_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_152_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_152_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_152_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_152_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/face_retrieval/face_retrieval_153_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_153_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_153_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_153_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_153_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_154_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_154_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_154_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_154_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_154_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_155_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_155_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_155_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_155_3.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_155_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_156_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_156_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_156_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_156_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_156_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_157_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_157_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_157_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_157_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_157_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth 
image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_158_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_158_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_158_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_158_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_158_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_159_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_159_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_159_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_159_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_159_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", 
"input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_160_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_160_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_160_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_160_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_160_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_161_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_161_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_161_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_161_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_161_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_162_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_162_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_162_2.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_162_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_162_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_163_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_163_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_163_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_163_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_163_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_164_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_164_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_164_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_164_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_164_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", 
"options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_165_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_165_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_165_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_165_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_165_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_166_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_166_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_166_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_166_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_166_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_167_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_167_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_167_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_167_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_167_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_168_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_168_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_168_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_168_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_168_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_169_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_169_1.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_169_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_169_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_169_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_170_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_170_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_170_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_170_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_170_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_171_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_171_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_171_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_171_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_171_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", 
"visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_172_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_172_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_172_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_172_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_172_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_173_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_173_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_173_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_173_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_173_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the 
first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_174_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_174_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_174_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_174_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_174_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_175_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_175_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_175_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_175_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_175_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_176_0.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_176_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_176_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_176_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_176_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_177_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_177_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_177_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_177_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_177_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_178_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_178_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_178_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_178_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_178_4.jpg"], 
"output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_179_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_179_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_179_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_179_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_179_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_180_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_180_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_180_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_180_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_180_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most 
similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_181_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_181_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_181_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_181_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_181_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_182_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_182_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_182_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_182_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_182_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/face_retrieval/face_retrieval_183_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_183_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_183_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_183_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_183_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_184_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_184_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_184_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_184_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_184_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_185_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_185_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_185_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_185_3.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_185_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_186_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_186_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_186_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_186_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_186_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_187_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_187_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_187_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_187_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_187_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The 
fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_188_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_188_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_188_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_188_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_188_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_189_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_189_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_189_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_189_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_189_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth 
image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_190_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_190_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_190_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_190_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_190_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_191_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_191_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_191_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_191_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_191_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_192_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_192_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_192_2.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_192_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_192_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_193_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_193_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_193_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_193_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_193_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_194_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_194_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_194_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_194_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_194_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", 
"options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_195_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_195_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_195_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_195_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_195_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_196_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_196_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_196_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_196_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_196_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_197_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_197_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_197_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_197_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_197_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "CelebA_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_198_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_198_1.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_198_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_198_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_198_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "face_retrieval", "visual_input_component": "['natural_image']", "source": "lfw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/face_retrieval/face_retrieval_199_0.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_199_1.jpg", 
"./High-level-obj-semantic/face_retrieval/face_retrieval_199_2.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_199_3.jpg", "./High-level-obj-semantic/face_retrieval/face_retrieval_199_4.jpg"], "output": "B", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/forensic_detection_blink/qwen3-vl/metadata_info.json b/results/forensic_detection_blink/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..11f5e2e --- /dev/null +++ b/results/forensic_detection_blink/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_0_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_0_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_0_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_0_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_1_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_1_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_1_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_1_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_2_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_2_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_2_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_2_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_3_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_3_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_3_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_3_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_4_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_4_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_4_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_4_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_5_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_5_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_5_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_5_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_6_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_6_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_6_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_6_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_7_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_7_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_7_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_7_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_8_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_8_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_8_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_8_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_9_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_9_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_9_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_9_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_10_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_10_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_10_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_10_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_11_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_11_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_11_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_11_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_12_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_12_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_12_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_12_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_13_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_13_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_13_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_13_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_14_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_14_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_14_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_14_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_15_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_15_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_15_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_15_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_16_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_16_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_16_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_16_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_17_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_17_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_17_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_17_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_18_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_18_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_18_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_18_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_19_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_19_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_19_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_19_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_20_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_20_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_20_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_20_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_21_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_21_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_21_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_21_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_22_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_22_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_22_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_22_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_23_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_23_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_23_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_23_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_24_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_24_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_24_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_24_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_25_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_25_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_25_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_25_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_26_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_26_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_26_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_26_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_27_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_27_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_27_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_27_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_28_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_28_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_28_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_28_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_29_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_29_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_29_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_29_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_30_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_30_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_30_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_30_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_31_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_31_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_31_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_31_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_32_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_32_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_32_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_32_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_33_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_33_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_33_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_33_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_34_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_34_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_34_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_34_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_35_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_35_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_35_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_35_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_36_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_36_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_36_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_36_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_37_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_37_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_37_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_37_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_38_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_38_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_38_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_38_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_39_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_39_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_39_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_39_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_40_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_40_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_40_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_40_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_41_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_41_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_41_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_41_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_42_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_42_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_42_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_42_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_43_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_43_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_43_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_43_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_44_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_44_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_44_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_44_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_45_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_45_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_45_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_45_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_46_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_46_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_46_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_46_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_47_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_47_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_47_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_47_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_48_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_48_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_48_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_48_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_49_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_49_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_49_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_49_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_50_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_50_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_50_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_50_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_51_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_51_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_51_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_51_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_52_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_52_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_52_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_52_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_53_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_53_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_53_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_53_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_54_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_54_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_54_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_54_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_55_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_55_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_55_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_55_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_56_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_56_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_56_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_56_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_57_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_57_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_57_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_57_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_58_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_58_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_58_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_58_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_59_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_59_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_59_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_59_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_60_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_60_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_60_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_60_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_61_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_61_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_61_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_61_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_62_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_62_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_62_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_62_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_63_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_63_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_63_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_63_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_64_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_64_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_64_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_64_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_65_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_65_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_65_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_65_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_66_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_66_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_66_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_66_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_67_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_67_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_67_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_67_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_68_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_68_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_68_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_68_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_69_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_69_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_69_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_69_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_70_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_70_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_70_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_70_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_71_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_71_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_71_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_71_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_72_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_72_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_72_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_72_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_73_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_73_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_73_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_73_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_74_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_74_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_74_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_74_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_75_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_75_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_75_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_75_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_76_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_76_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_76_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_76_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_77_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_77_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_77_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_77_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_78_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_78_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_78_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_78_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_79_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_79_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_79_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_79_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_80_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_80_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_80_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_80_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_81_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_81_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_81_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_81_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_82_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_82_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_82_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_82_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_83_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_83_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_83_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_83_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_84_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_84_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_84_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_84_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_85_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_85_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_85_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_85_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_86_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_86_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_86_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_86_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_87_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_87_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_87_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_87_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_88_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_88_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_88_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_88_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_89_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_89_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_89_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_89_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_90_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_90_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_90_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_90_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_91_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_91_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_91_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_91_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_92_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_92_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_92_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_92_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_93_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_93_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_93_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_93_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_94_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_94_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_94_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_94_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_95_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_95_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_95_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_95_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_96_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_96_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_96_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_96_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_97_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_97_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_97_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_97_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_98_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_98_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_98_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_98_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_99_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_99_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_99_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_99_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_100_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_100_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_100_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_100_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_101_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_101_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_101_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_101_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_102_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_102_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_102_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_102_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_103_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_103_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_103_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_103_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_104_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_104_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_104_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_104_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_105_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_105_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_105_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_105_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_106_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_106_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_106_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_106_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_107_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_107_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_107_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_107_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_108_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_108_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_108_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_108_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_109_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_109_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_109_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_109_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_110_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_110_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_110_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_110_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_111_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_111_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_111_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_111_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_112_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_112_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_112_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_112_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_113_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_113_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_113_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_113_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_114_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_114_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_114_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_114_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_115_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_115_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_115_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_115_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_116_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_116_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_116_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_116_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_117_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_117_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_117_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_117_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_118_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_118_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_118_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_118_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_119_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_119_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_119_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_119_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_120_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_120_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_120_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_120_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_121_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_121_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_121_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_121_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_122_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_122_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_122_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_122_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_123_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_123_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_123_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_123_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_124_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_124_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_124_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_124_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_125_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_125_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_125_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_125_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_126_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_126_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_126_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_126_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_127_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_127_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_127_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_127_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_128_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_128_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_128_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_128_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_129_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_129_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_129_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_129_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_130_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_130_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_130_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_130_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_blink", "visual_input_component": "natural image and synthetic image", "source": "blink", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a real photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a real photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_131_0.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_131_1.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_131_2.jpg", "./Low-level-semantic/forensic_detection_blink/forensic_detection_blink_131_3.jpg"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/forensic_detection_forgerynet/qwen3-vl/metadata_info.json b/results/forensic_detection_forgerynet/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..d7c0eff --- /dev/null +++ b/results/forensic_detection_forgerynet/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_0_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_0_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_0_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_0_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_1_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_1_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_1_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_1_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_2_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_2_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_2_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_2_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_3_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_3_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_3_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_3_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_4_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_4_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_4_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_4_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_5_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_5_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_5_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_5_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_6_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_6_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_6_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_6_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_7_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_7_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_7_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_7_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_8_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_8_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_8_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_8_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_9_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_9_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_9_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_9_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_10_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_10_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_10_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_10_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_11_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_11_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_11_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_11_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_12_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_12_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_12_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_12_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_13_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_13_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_13_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_13_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_14_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_14_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_14_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_14_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_15_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_15_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_15_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_15_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_16_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_16_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_16_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_16_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_17_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_17_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_17_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_17_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_18_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_18_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_18_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_18_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_19_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_19_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_19_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_19_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_20_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_20_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_20_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_20_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_21_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_21_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_21_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_21_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_22_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_22_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_22_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_22_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_23_0.png", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_23_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_23_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_23_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_24_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_24_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_24_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_24_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_25_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_25_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_25_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_25_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_26_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_26_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_26_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_26_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_27_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_27_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_27_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_27_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_28_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_28_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_28_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_28_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_29_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_29_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_29_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_29_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_30_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_30_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_30_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_30_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_31_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_31_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_31_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_31_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_32_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_32_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_32_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_32_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_33_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_33_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_33_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_33_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_34_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_34_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_34_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_34_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_35_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_35_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_35_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_35_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_36_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_36_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_36_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_36_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_37_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_37_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_37_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_37_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_38_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_38_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_38_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_38_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_39_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_39_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_39_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_39_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_40_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_40_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_40_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_40_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_41_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_41_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_41_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_41_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_42_0.png", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_42_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_42_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_42_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_43_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_43_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_43_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_43_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_44_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_44_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_44_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_44_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_45_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_45_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_45_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_45_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_46_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_46_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_46_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_46_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_47_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_47_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_47_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_47_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_48_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_48_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_48_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_48_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_49_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_49_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_49_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_49_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_50_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_50_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_50_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_50_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_51_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_51_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_51_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_51_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_52_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_52_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_52_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_52_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_53_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_53_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_53_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_53_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_54_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_54_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_54_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_54_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_55_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_55_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_55_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_55_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_56_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_56_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_56_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_56_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_57_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_57_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_57_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_57_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_58_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_58_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_58_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_58_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_59_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_59_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_59_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_59_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_60_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_60_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_60_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_60_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_61_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_61_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_61_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_61_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_62_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_62_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_62_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_62_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_63_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_63_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_63_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_63_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_64_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_64_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_64_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_64_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_65_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_65_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_65_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_65_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_66_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_66_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_66_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_66_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_67_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_67_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_67_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_67_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_68_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_68_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_68_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_68_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_69_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_69_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_69_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_69_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_70_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_70_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_70_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_70_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_71_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_71_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_71_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_71_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_72_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_72_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_72_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_72_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_73_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_73_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_73_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_73_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_74_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_74_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_74_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_74_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_75_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_75_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_75_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_75_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_76_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_76_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_76_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_76_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_77_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_77_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_77_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_77_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_78_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_78_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_78_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_78_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_79_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_79_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_79_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_79_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_80_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_80_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_80_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_80_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_81_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_81_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_81_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_81_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_82_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_82_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_82_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_82_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_83_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_83_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_83_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_83_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_84_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_84_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_84_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_84_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_85_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_85_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_85_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_85_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_86_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_86_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_86_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_86_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_87_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_87_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_87_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_87_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_88_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_88_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_88_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_88_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_89_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_89_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_89_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_89_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_90_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_90_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_90_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_90_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_91_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_91_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_91_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_91_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_92_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_92_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_92_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_92_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_93_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_93_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_93_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_93_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_94_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_94_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_94_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_94_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_95_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_95_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_95_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_95_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_96_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_96_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_96_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_96_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_97_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_97_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_97_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_97_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_98_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_98_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_98_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_98_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_99_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_99_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_99_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_99_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_100_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_100_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_100_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_100_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_101_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_101_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_101_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_101_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_102_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_102_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_102_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_102_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_103_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_103_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_103_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_103_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_104_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_104_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_104_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_104_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_105_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_105_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_105_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_105_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_106_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_106_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_106_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_106_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_107_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_107_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_107_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_107_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_108_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_108_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_108_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_108_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_109_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_109_1.png", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_109_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_109_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_110_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_110_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_110_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_110_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_111_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_111_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_111_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_111_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_112_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_112_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_112_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_112_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_113_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_113_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_113_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_113_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_114_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_114_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_114_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_114_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_115_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_115_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_115_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_115_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_116_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_116_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_116_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_116_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_117_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_117_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_117_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_117_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_118_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_118_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_118_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_118_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_119_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_119_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_119_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_119_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_120_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_120_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_120_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_120_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_121_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_121_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_121_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_121_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_122_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_122_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_122_2.png", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_122_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_123_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_123_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_123_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_123_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_124_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_124_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_124_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_124_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_125_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_125_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_125_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_125_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_126_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_126_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_126_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_126_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_127_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_127_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_127_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_127_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_128_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_128_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_128_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_128_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_129_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_129_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_129_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_129_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_130_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_130_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_130_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_130_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_131_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_131_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_131_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_131_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_132_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_132_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_132_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_132_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_133_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_133_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_133_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_133_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_134_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_134_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_134_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_134_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_135_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_135_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_135_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_135_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_136_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_136_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_136_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_136_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_137_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_137_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_137_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_137_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_138_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_138_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_138_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_138_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_139_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_139_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_139_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_139_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_140_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_140_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_140_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_140_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_141_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_141_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_141_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_141_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_142_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_142_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_142_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_142_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_143_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_143_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_143_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_143_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_144_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_144_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_144_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_144_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_145_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_145_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_145_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_145_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_146_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_146_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_146_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_146_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_147_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_147_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_147_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_147_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_148_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_148_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_148_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_148_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_149_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_149_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_149_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_149_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_150_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_150_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_150_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_150_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_151_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_151_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_151_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_151_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_152_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_152_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_152_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_152_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_153_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_153_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_153_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_153_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_154_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_154_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_154_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_154_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_155_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_155_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_155_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_155_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_156_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_156_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_156_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_156_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_157_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_157_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_157_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_157_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_158_0.png", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_158_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_158_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_158_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_159_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_159_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_159_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_159_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_160_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_160_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_160_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_160_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_161_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_161_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_161_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_161_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_162_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_162_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_162_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_162_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_163_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_163_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_163_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_163_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_164_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_164_1.png", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_164_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_164_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_165_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_165_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_165_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_165_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_166_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_166_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_166_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_166_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_167_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_167_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_167_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_167_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_168_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_168_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_168_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_168_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_169_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_169_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_169_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_169_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_170_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_170_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_170_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_170_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_171_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_171_1.png", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_171_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_171_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_172_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_172_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_172_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_172_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_173_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_173_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_173_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_173_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_174_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_174_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_174_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_174_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_175_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_175_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_175_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_175_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_176_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_176_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_176_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_176_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_177_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_177_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_177_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_177_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_178_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_178_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_178_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_178_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_179_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_179_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_179_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_179_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_180_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_180_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_180_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_180_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_181_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_181_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_181_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_181_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_182_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_182_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_182_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_182_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_183_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_183_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_183_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_183_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_184_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_184_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_184_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_184_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_185_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_185_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_185_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_185_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_186_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_186_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_186_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_186_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_187_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_187_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_187_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_187_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_188_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_188_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_188_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_188_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_189_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_189_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_189_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_189_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_190_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_190_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_190_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_190_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_191_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_191_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_191_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_191_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_192_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_192_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_192_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_192_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_193_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_193_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_193_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_193_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_194_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_194_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_194_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_194_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_195_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_195_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_195_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_195_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_196_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_196_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_196_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_196_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_197_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_197_1.png", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_197_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_197_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_198_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_198_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_198_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_198_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "forensic_detection_forgerynet", "visual_input_component": "natural image and synthetic image", "source": "forgerynet", "options": "A: the first image\nB: the second image\nC: the third image\nD: the fourth image", "question": "Which image is most likely to be a fake photograph?", "context": "You are a judge in a photography competition, and now you are given the four images. 
Please examine the details and tell which one of them is most likely to be a fake photograph.\nSelect from the following choices.\nA: the first image\nB: the second image\nC: the third image\nD: the fourth image\n", "input_image_path": ["./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_199_0.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_199_1.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_199_2.jpg", "./Low-level-semantic/forensic_detection_forgerynet/forensic_detection_forgerynet_199_3.jpg"], "output": "C", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/functional_correspondence_blink/qwen3-vl/metadata_info.json b/results/functional_correspondence_blink/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..52cb163 --- /dev/null +++ b/results/functional_correspondence_blink/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. 
Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_0_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_0_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_1_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_1_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_2_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_2_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_3_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_3_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_4_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_4_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_5_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_5_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_6_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_6_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_7_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_7_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_8_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_8_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_9_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_9_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_10_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_10_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Poke\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_11_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_11_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_12_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_12_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_13_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_13_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_14_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_14_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_15_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_15_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_16_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_16_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_17_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_17_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_18_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_18_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_19_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_19_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_20_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_20_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_21_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_21_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_22_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_22_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_23_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_23_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_24_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_24_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_25_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_25_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_26_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_26_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_27_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_27_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_28_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_28_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_29_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_29_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_30_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_30_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_31_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_31_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Poke\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_32_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_32_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_33_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_33_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_34_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_34_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_35_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_35_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_36_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_36_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_37_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_37_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_38_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_38_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_39_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_39_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_40_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_40_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_41_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_41_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_42_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_42_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_43_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_43_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_44_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_44_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_45_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_45_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_46_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_46_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_47_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_47_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_48_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_48_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_49_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_49_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_50_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_50_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Poke\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_51_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_51_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_52_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_52_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_53_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_53_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_54_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_54_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_55_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_55_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_56_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_56_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_57_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_57_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_58_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_58_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_59_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_59_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_60_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_60_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Poke\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_61_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_61_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_62_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_62_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_63_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_63_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_64_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_64_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_65_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_65_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_66_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_66_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_67_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_67_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_68_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_68_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_69_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_69_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_70_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_70_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_71_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_71_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_72_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_72_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_73_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_73_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_74_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_74_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_75_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_75_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_76_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_76_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_77_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_77_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_78_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_78_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_79_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_79_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_80_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_80_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_81_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_81_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_82_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_82_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_83_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_83_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_84_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_84_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_85_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_85_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_86_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_86_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_87_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_87_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_88_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_88_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_89_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_89_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_90_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_90_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_91_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_91_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_92_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_92_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_93_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_93_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_94_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_94_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_95_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_95_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_96_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_96_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_97_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_97_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_98_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_98_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_99_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_99_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_100_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_100_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_101_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_101_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_102_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_102_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_103_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_103_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_104_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_104_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_105_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_105_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_106_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_106_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_107_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_107_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_108_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_108_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_109_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_109_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_110_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_110_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mix\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_111_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_111_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_112_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_112_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Poke\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_113_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_113_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_114_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_114_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_115_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_115_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Poke\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_116_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_116_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_117_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_117_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_118_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_118_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_119_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_119_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Mash/Pound\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_120_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_120_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_121_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_121_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pour\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_122_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_122_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scrape\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_123_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_123_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_124_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_124_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_125_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_125_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Scoop\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_126_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_126_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_127_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_127_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Brush/Dust\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_128_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_128_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Pull out a nail\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_129_0.jpg", "./High-level-obj-semantic/functional_correspondence_blink/functional_correspondence_blink_129_1.jpg"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/functional_correspondence_funk_point/qwen3-vl/metadata_info.json b/results/functional_correspondence_funk_point/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..da1c166 --- /dev/null +++ b/results/functional_correspondence_funk_point/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_0_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_0_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_1_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_1_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_2_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_2_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_3_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_3_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_4_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_4_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_5_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_5_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_6_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_6_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_7_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_7_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_8_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_8_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_9_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_9_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_10_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_10_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_11_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_11_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_12_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_12_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_13_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_13_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_14_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_14_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_15_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_15_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_16_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_16_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_17_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_17_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_18_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_18_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_19_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_19_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_20_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_20_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_21_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_21_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_22_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_22_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_23_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_23_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_24_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_24_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_25_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_25_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_26_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_26_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_27_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_27_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_28_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_28_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_29_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_29_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_30_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_30_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_31_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_31_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_32_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_32_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_33_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_33_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_34_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_34_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_35_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_35_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_36_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_36_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_37_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_37_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_38_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_38_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_39_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_39_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_40_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_40_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_41_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_41_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_42_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_42_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_43_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_43_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_44_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_44_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_45_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_45_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_46_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_46_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_47_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_47_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_48_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_48_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_49_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_49_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_50_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_50_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_51_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_51_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_52_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_52_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_53_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_53_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_54_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_54_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_55_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_55_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_56_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_56_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_57_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_57_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_58_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_58_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_59_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_59_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_60_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_60_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_61_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_61_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_62_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_62_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_63_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_63_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_64_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_64_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_65_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_65_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_66_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_66_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_67_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_67_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_68_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_68_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_69_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_69_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_70_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_70_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_71_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_71_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_72_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_72_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_73_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_73_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_74_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_74_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_75_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_75_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_76_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_76_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_77_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_77_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_78_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_78_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_79_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_79_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_80_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_80_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_81_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_81_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_82_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_82_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_83_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_83_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_84_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_84_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_85_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_85_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_86_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_86_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_87_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_87_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_88_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_88_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_89_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_89_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_90_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_90_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_91_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_91_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_92_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_92_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_93_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_93_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_94_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_94_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_95_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_95_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_96_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_96_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_97_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_97_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_98_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_98_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_99_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_99_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_100_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_100_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_101_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_101_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_102_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_102_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_103_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_103_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_104_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_104_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_105_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_105_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_106_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_106_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_107_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_107_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_108_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_108_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_109_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_109_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_110_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_110_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_111_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_111_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_112_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_112_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_113_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_113_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_114_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_114_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_115_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_115_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_116_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_116_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_117_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_117_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_118_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_118_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_119_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_119_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_120_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_120_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_121_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_121_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_122_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_122_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_123_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_123_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_124_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_124_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_125_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_125_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_126_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_126_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_127_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_127_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_128_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_128_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_129_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_129_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_130_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_130_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_131_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_131_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_132_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_132_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_133_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_133_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_134_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_134_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_135_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_135_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_136_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_136_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_137_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_137_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_138_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_138_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_139_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_139_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_140_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_140_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_141_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_141_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_142_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_142_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_143_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_143_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_144_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_144_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_145_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_145_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_146_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_146_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_147_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_147_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_148_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_148_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_149_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_149_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_150_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_150_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_151_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_151_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_152_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_152_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_153_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_153_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_154_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_154_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_155_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_155_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_156_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_156_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_157_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_157_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_158_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_158_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_159_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_159_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_160_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_160_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_161_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_161_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_162_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_162_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_163_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_163_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_164_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_164_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_165_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_165_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_166_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_166_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_167_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_167_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_168_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_168_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_169_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_169_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_170_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_170_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_171_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_171_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_172_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_172_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_173_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_173_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_174_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_174_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_175_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_175_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_176_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_176_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_177_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_177_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_178_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_178_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_179_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_179_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_180_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_180_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_181_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_181_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_182_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_182_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_183_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_183_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_184_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_184_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_185_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_185_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_186_0.jpg", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_186_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_187_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_187_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_188_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_188_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_189_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_189_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_190_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_190_1.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_191_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_191_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_192_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_192_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_193_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_193_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_194_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_194_1.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_195_0.JPEG", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_195_1.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_196_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_196_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_197_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_197_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_198_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_198_1.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "functional_correspondence_funk_point", "visual_input_component": "2 natural images", "source": "funk_point", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for the same action between different objects. For instance, if a person uses a pot versus a hammer to \"Mash Pound\", then the handle of the pot will be the corresponding point to the handle of the hammer because they serve the same function for the action -- to hold; and the bottom of the pot will be the corresponding point to the face of the hammer because they both mash the other object.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the right image, choices of \"A, B, C, D\" are drawn beside each circle. Select from the choices on the second image and find the corresponding point for the reference point, if we use both items for the action: \"Lift Something\". 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_199_0.png", "./High-level-obj-semantic/functional_correspondence_funk_point/functional_correspondence_funk_point_199_1.jpg"], "output": "B", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/general_action_recognition/qwen3-vl/metadata_info.json b/results/general_action_recognition/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..bc9197d --- /dev/null +++ b/results/general_action_recognition/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: tying knot (not on a tie)\nB: knitting\nC: ironing\nD: weaving basket", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: tying knot (not on a tie)\nB: knitting\nC: ironing\nD: weaving basket", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_0_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_0_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_0_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_0_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_0_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_0_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_0_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_0_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_0_8.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_0_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_0_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_0_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_0_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_0_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_0_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_0_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: trampoline jump\nB: balancing on trampoline\nC: flipping on trampoline\nD: bouncing on trampoline", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: trampoline jump\nB: balancing on trampoline\nC: flipping on trampoline\nD: bouncing on trampoline", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_1_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_1_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_1_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_1_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_1_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_1_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_1_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_1_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_1_8.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_1_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_1_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_1_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_1_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_1_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_1_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_1_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: dunking basketball\nB: playing kickball\nC: shooting goal (soccer)\nD: playing basketball", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: dunking basketball\nB: playing kickball\nC: shooting goal (soccer)\nD: playing basketball", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_2_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_2_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_2_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_2_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_2_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_2_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_2_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_2_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_2_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_2_9.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_2_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_2_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_2_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_2_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_2_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_2_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: carrying baby\nB: using segway\nC: pushing wheelchair\nD: cleaning windows", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: carrying baby\nB: using segway\nC: pushing wheelchair\nD: cleaning windows", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_3_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_3_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_3_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_3_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_3_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_3_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_3_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_3_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_3_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_3_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_3_10.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_3_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_3_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_3_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_3_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_3_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing kickball\nB: playing chess\nC: playing controller\nD: playing basketball", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing kickball\nB: playing chess\nC: playing controller\nD: playing basketball", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_4_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_4_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_4_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_4_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_4_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_4_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_4_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_4_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_4_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_4_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_4_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_4_11.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_4_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_4_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_4_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_4_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: kicking soccer ball\nB: high kick\nC: parkour\nD: dunking basketball", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: kicking soccer ball\nB: high kick\nC: parkour\nD: dunking basketball", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_5_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_5_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_5_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_5_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_5_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_5_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_5_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_5_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_5_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_5_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_5_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_5_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_5_12.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_5_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_5_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_5_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: jumpstyle dancing\nB: swinging legs\nC: hula hooping\nD: tango dancing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: jumpstyle dancing\nB: swinging legs\nC: hula hooping\nD: tango dancing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_6_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_6_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_6_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_6_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_6_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_6_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_6_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_6_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_6_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_6_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_6_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_6_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_6_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_6_13.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_6_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_6_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: washing dishes\nB: cleaning pool\nC: washing hands\nD: cleaning windows", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: washing dishes\nB: cleaning pool\nC: washing hands\nD: cleaning windows", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_7_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_7_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_7_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_7_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_7_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_7_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_7_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_7_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_7_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_7_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_7_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_7_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_7_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_7_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_7_14.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_7_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: braiding hair\nB: shining shoes\nC: cutting watermelon\nD: tapping guitar", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: braiding hair\nB: shining shoes\nC: cutting watermelon\nD: tapping guitar", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_8_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_8_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_8_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_8_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_8_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_8_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_8_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_8_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_8_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_8_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_8_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_8_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_8_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_8_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_8_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_8_15.png"], "output": "A", "qwen3-vl": "image 
none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: doing laundry\nB: cleaning pool\nC: washing dishes\nD: washing hands", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: doing laundry\nB: cleaning pool\nC: washing dishes\nD: washing hands", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_9_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_9_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_9_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_9_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_9_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_9_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_9_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_9_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_9_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_9_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_9_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_9_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_9_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_9_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_9_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_9_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", 
"options": "A: snowboarding\nB: ice climbing\nC: skiing crosscountry\nD: biking through snow", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: snowboarding\nB: ice climbing\nC: skiing crosscountry\nD: biking through snow", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_10_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_10_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_10_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_10_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_10_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_10_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_10_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_10_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_10_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_10_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_10_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_10_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_10_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_10_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_10_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_10_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing kickball\nB: frisbee catching or throwing\nC: catching or throwing frisbee\nD: 
biking through snow", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing kickball\nB: frisbee catching or throwing\nC: catching or throwing frisbee\nD: biking through snow", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_11_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_11_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_11_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_11_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_11_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_11_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_11_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_11_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_11_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_11_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_11_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_11_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_11_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_11_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_11_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_11_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: packing truck\nB: loading truck\nC: driving truck\nD: unloading truck", "question": "What is the action performed by the person in 
the video?", "context": "Select from the following choices.\nA: packing truck\nB: loading truck\nC: driving truck\nD: unloading truck", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_12_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_12_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_12_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_12_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_12_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_12_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_12_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_12_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_12_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_12_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_12_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_12_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_12_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_12_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_12_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_12_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: shaving legs\nB: surfing water\nC: swimming breast stroke\nD: diving cliff", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: shaving legs\nB: surfing water\nC: swimming 
breast stroke\nD: diving cliff", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_13_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_13_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_13_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_13_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_13_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_13_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_13_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_13_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_13_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_13_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_13_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_13_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_13_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_13_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_13_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_13_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: eating chips\nB: tossing salad\nC: bouncing on trampoline\nD: feeding birds", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: eating chips\nB: tossing salad\nC: bouncing on trampoline\nD: feeding birds", "input_image_path": 
["./Continuous-temporal/general_action_recognition/general_action_recognition_14_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_14_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_14_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_14_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_14_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_14_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_14_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_14_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_14_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_14_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_14_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_14_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_14_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_14_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_14_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_14_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: spray painting\nB: baking cookies\nC: using remote controller (not gaming)\nD: blowing out candles", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: spray painting\nB: baking cookies\nC: using remote controller (not gaming)\nD: blowing out candles", "input_image_path": 
["./Continuous-temporal/general_action_recognition/general_action_recognition_15_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_15_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_15_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_15_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_15_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_15_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_15_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_15_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_15_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_15_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_15_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_15_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_15_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_15_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_15_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_15_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: punching bag\nB: boxing\nC: wrestling\nD: kickboxing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: punching bag\nB: boxing\nC: wrestling\nD: kickboxing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_16_0.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_16_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_16_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_16_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_16_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_16_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_16_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_16_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_16_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_16_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_16_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_16_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_16_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_16_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_16_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_16_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: unloading truck\nB: tapping pen\nC: playing poker\nD: blowing leaves", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: unloading truck\nB: tapping pen\nC: playing poker\nD: blowing leaves", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_17_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_17_1.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_17_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_17_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_17_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_17_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_17_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_17_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_17_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_17_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_17_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_17_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_17_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_17_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_17_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_17_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing kickball\nB: polishing silverware\nC: sneezing\nD: shining shoes", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing kickball\nB: polishing silverware\nC: sneezing\nD: shining shoes", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_18_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_18_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_18_2.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_18_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_18_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_18_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_18_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_18_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_18_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_18_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_18_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_18_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_18_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_18_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_18_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_18_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing saxophone\nB: playing flute\nC: playing trumpet\nD: playing guitar", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing saxophone\nB: playing flute\nC: playing trumpet\nD: playing guitar", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_19_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_19_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_19_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_19_3.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_19_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_19_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_19_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_19_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_19_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_19_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_19_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_19_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_19_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_19_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_19_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_19_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing basketball\nB: playing kickball\nC: kicking soccer ball\nD: dodgeball", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing basketball\nB: playing kickball\nC: kicking soccer ball\nD: dodgeball", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_20_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_20_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_20_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_20_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_20_4.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_20_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_20_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_20_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_20_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_20_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_20_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_20_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_20_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_20_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_20_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_20_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: riding scooter\nB: driving tractor\nC: pushing car\nD: cleaning windows", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: riding scooter\nB: driving tractor\nC: pushing car\nD: cleaning windows", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_21_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_21_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_21_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_21_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_21_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_21_5.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_21_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_21_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_21_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_21_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_21_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_21_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_21_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_21_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_21_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_21_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: snowboarding\nB: skiing crosscountry\nC: surfing water\nD: water skiing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: snowboarding\nB: skiing crosscountry\nC: surfing water\nD: water skiing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_22_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_22_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_22_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_22_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_22_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_22_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_22_6.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_22_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_22_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_22_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_22_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_22_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_22_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_22_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_22_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_22_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing guitar\nB: playing didgeridoo\nC: playing keyboard\nD: playing cymbals", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing guitar\nB: playing didgeridoo\nC: playing keyboard\nD: playing cymbals", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_23_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_23_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_23_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_23_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_23_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_23_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_23_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_23_7.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_23_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_23_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_23_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_23_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_23_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_23_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_23_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_23_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: skiing crosscountry\nB: ice fishing\nC: flying kite\nD: snowboarding", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: skiing crosscountry\nB: ice fishing\nC: flying kite\nD: snowboarding", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_24_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_24_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_24_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_24_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_24_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_24_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_24_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_24_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_24_8.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_24_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_24_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_24_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_24_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_24_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_24_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_24_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: folding clothes\nB: building cabinet\nC: moving furniture\nD: cleaning floor", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: folding clothes\nB: building cabinet\nC: moving furniture\nD: cleaning floor", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_25_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_25_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_25_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_25_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_25_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_25_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_25_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_25_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_25_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_25_9.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_25_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_25_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_25_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_25_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_25_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_25_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: bench pressing\nB: arm wrestling\nC: squat\nD: deadlifting", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: bench pressing\nB: arm wrestling\nC: squat\nD: deadlifting", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_26_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_26_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_26_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_26_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_26_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_26_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_26_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_26_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_26_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_26_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_26_10.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_26_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_26_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_26_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_26_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_26_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: cutting watermelon\nB: sanding floor\nC: trimming trees\nD: pruning trees", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: cutting watermelon\nB: sanding floor\nC: trimming trees\nD: pruning trees", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_27_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_27_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_27_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_27_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_27_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_27_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_27_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_27_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_27_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_27_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_27_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_27_11.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_27_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_27_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_27_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_27_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: riding mountain bike\nB: snowboarding\nC: biking through snow\nD: skiing crosscountry", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: riding mountain bike\nB: snowboarding\nC: biking through snow\nD: skiing crosscountry", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_28_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_28_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_28_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_28_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_28_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_28_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_28_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_28_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_28_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_28_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_28_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_28_11.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_28_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_28_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_28_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_28_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing guitar\nB: playing trombone\nC: playing cymbals\nD: playing harp", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing guitar\nB: playing trombone\nC: playing cymbals\nD: playing harp", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_29_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_29_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_29_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_29_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_29_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_29_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_29_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_29_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_29_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_29_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_29_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_29_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_29_12.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_29_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_29_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_29_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: tango dancing\nB: swing dancing\nC: dancing charleston\nD: jumpstyle dancing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: tango dancing\nB: swing dancing\nC: dancing charleston\nD: jumpstyle dancing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_30_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_30_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_30_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_30_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_30_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_30_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_30_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_30_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_30_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_30_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_30_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_30_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_30_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_30_13.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_30_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_30_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing harp\nB: shuffling cards\nC: tango dancing\nD: swing dancing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing harp\nB: shuffling cards\nC: tango dancing\nD: swing dancing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_31_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_31_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_31_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_31_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_31_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_31_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_31_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_31_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_31_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_31_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_31_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_31_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_31_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_31_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_31_14.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_31_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: cleaning pool\nB: washing windows\nC: cleaning windows\nD: shining shoes", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: cleaning pool\nB: washing windows\nC: cleaning windows\nD: shining shoes", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_32_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_32_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_32_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_32_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_32_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_32_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_32_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_32_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_32_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_32_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_32_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_32_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_32_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_32_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_32_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_32_15.png"], "output": "C", 
"qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: swinging legs\nB: gymnastics tumbling\nC: squat\nD: stretching leg", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: swinging legs\nB: gymnastics tumbling\nC: squat\nD: stretching leg", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_33_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_33_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_33_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_33_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_33_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_33_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_33_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_33_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_33_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_33_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_33_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_33_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_33_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_33_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_33_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_33_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", 
"source": "kinetics400", "options": "A: teaching sign language\nB: communicating with sign language\nC: sign language interpreting\nD: sign language translation", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: teaching sign language\nB: communicating with sign language\nC: sign language interpreting\nD: sign language translation", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_34_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_34_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_34_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_34_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_34_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_34_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_34_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_34_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_34_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_34_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_34_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_34_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_34_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_34_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_34_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_34_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": 
"kinetics400", "options": "A: opening present\nB: making a cake\nC: unboxing\nD: baking cookies", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: opening present\nB: making a cake\nC: unboxing\nD: baking cookies", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_35_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_35_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_35_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_35_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_35_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_35_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_35_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_35_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_35_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_35_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_35_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_35_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_35_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_35_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_35_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_35_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: shaving head\nB: pumping fist\nC: cleaning toilet\nD: shredding paper", "question": "What is 
the action performed by the person in the video?", "context": "Select from the following choices.\nA: shaving head\nB: pumping fist\nC: cleaning toilet\nD: shredding paper", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_36_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_36_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_36_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_36_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_36_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_36_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_36_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_36_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_36_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_36_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_36_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_36_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_36_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_36_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_36_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_36_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: horseback riding\nB: riding mule\nC: riding mountain bike\nD: petting animal (not cat)", "question": "What is the action performed by the person in the video?", "context": "Select from the following 
choices.\nA: horseback riding\nB: riding mule\nC: riding mountain bike\nD: petting animal (not cat)", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_37_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_37_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_37_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_37_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_37_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_37_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_37_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_37_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_37_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_37_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_37_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_37_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_37_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_37_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_37_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_37_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: waxing eyebrows\nB: shaving legs\nC: trimming trees\nD: waxing chest", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: waxing eyebrows\nB: shaving legs\nC: trimming trees\nD: waxing chest", 
"input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_38_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_38_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_38_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_38_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_38_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_38_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_38_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_38_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_38_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_38_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_38_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_38_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_38_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_38_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_38_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_38_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: grooming horse\nB: milking cow\nC: petting animal (not cat)\nD: feeding goats", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: grooming horse\nB: milking cow\nC: petting animal (not cat)\nD: feeding goats", "input_image_path": 
["./Continuous-temporal/general_action_recognition/general_action_recognition_39_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_39_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_39_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_39_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_39_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_39_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_39_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_39_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_39_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_39_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_39_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_39_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_39_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_39_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_39_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_39_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: spray painting\nB: ripping paper\nC: shredding paper\nD: filling eyebrows", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: spray painting\nB: ripping paper\nC: shredding paper\nD: filling eyebrows", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_40_0.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_40_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_40_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_40_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_40_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_40_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_40_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_40_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_40_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_40_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_40_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_40_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_40_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_40_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_40_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_40_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: massaging person's head\nB: massaging feet\nC: petting cat\nD: petting animal (not cat)", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: massaging person's head\nB: massaging feet\nC: petting cat\nD: petting animal (not cat)", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_41_0.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_41_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_41_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_41_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_41_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_41_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_41_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_41_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_41_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_41_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_41_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_41_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_41_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_41_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_41_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_41_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: marching\nB: nodding head\nC: clapping\nD: shaking head", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: marching\nB: nodding head\nC: clapping\nD: shaking head", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_42_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_42_1.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_42_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_42_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_42_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_42_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_42_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_42_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_42_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_42_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_42_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_42_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_42_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_42_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_42_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_42_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing kickball\nB: marching\nC: cutting watermelon\nD: blowing leaves", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing kickball\nB: marching\nC: cutting watermelon\nD: blowing leaves", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_43_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_43_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_43_2.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_43_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_43_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_43_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_43_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_43_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_43_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_43_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_43_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_43_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_43_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_43_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_43_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_43_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: rock scissors paper\nB: clapping\nC: bouncing on trampoline\nD: tango dancing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: rock scissors paper\nB: clapping\nC: bouncing on trampoline\nD: tango dancing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_44_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_44_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_44_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_44_3.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_44_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_44_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_44_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_44_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_44_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_44_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_44_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_44_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_44_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_44_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_44_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_44_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: using remote controller (not gaming)\nB: watching TV\nC: playing guitar\nD: typing on keyboard", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: using remote controller (not gaming)\nB: watching TV\nC: playing guitar\nD: typing on keyboard", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_45_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_45_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_45_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_45_3.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_45_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_45_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_45_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_45_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_45_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_45_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_45_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_45_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_45_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_45_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_45_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_45_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing kickball\nB: dribbling basketball\nC: playing basketball\nD: tossing coin", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing kickball\nB: dribbling basketball\nC: playing basketball\nD: tossing coin", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_46_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_46_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_46_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_46_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_46_4.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_46_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_46_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_46_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_46_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_46_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_46_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_46_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_46_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_46_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_46_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_46_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: skiing crosscountry\nB: snowboarding\nC: playing squash or racquetball\nD: riding mountain bike", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: skiing crosscountry\nB: snowboarding\nC: playing squash or racquetball\nD: riding mountain bike", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_47_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_47_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_47_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_47_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_47_4.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_47_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_47_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_47_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_47_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_47_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_47_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_47_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_47_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_47_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_47_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_47_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: slapping\nB: faceplanting\nC: back raises\nD: massaging person's head", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: slapping\nB: faceplanting\nC: back raises\nD: massaging person's head", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_48_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_48_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_48_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_48_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_48_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_48_5.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_48_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_48_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_48_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_48_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_48_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_48_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_48_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_48_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_48_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_48_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: rock scissors paper\nB: sword fighting\nC: fencing\nD: balloon blowing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: rock scissors paper\nB: sword fighting\nC: fencing\nD: balloon blowing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_49_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_49_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_49_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_49_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_49_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_49_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_49_6.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_49_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_49_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_49_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_49_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_49_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_49_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_49_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_49_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_49_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: weaving basket\nB: juggling fire\nC: cooking chicken\nD: playing badminton", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: weaving basket\nB: juggling fire\nC: cooking chicken\nD: playing badminton", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_50_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_50_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_50_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_50_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_50_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_50_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_50_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_50_7.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_50_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_50_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_50_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_50_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_50_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_50_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_50_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_50_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: drumming fingers\nB: shuffling cards\nC: beatboxing\nD: playing cymbals", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: drumming fingers\nB: shuffling cards\nC: beatboxing\nD: playing cymbals", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_51_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_51_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_51_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_51_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_51_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_51_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_51_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_51_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_51_8.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_51_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_51_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_51_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_51_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_51_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_51_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_51_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: waxing chest\nB: cutting nails\nC: shaving head\nD: trimming trees", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: waxing chest\nB: cutting nails\nC: shaving head\nD: trimming trees", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_52_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_52_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_52_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_52_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_52_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_52_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_52_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_52_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_52_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_52_9.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_52_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_52_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_52_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_52_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_52_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_52_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: jetskiing\nB: motorcycling\nC: riding mountain bike\nD: snowboarding", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: jetskiing\nB: motorcycling\nC: riding mountain bike\nD: snowboarding", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_53_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_53_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_53_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_53_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_53_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_53_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_53_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_53_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_53_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_53_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_53_10.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_53_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_53_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_53_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_53_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_53_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: sneezing\nB: breading or breadcrumbing\nC: rock scissors paper\nD: spinning poi", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: sneezing\nB: breading or breadcrumbing\nC: rock scissors paper\nD: spinning poi", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_54_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_54_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_54_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_54_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_54_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_54_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_54_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_54_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_54_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_54_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_54_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_54_11.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_54_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_54_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_54_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_54_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: climbing tree\nB: watering plants\nC: planting trees\nD: raking leaves", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: climbing tree\nB: watering plants\nC: planting trees\nD: raking leaves", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_55_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_55_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_55_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_55_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_55_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_55_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_55_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_55_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_55_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_55_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_55_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_55_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_55_12.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_55_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_55_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_55_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: riding mountain bike\nB: tango dancing\nC: playing kickball\nD: country line dancing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: riding mountain bike\nB: tango dancing\nC: playing kickball\nD: country line dancing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_56_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_56_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_56_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_56_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_56_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_56_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_56_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_56_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_56_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_56_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_56_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_56_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_56_12.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_56_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_56_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_56_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing kickball\nB: playing didgeridoo\nC: playing basketball\nD: washing dishes", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing kickball\nB: playing didgeridoo\nC: playing basketball\nD: washing dishes", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_57_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_57_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_57_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_57_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_57_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_57_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_57_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_57_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_57_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_57_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_57_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_57_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_57_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_57_13.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_57_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_57_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: cleaning pool\nB: washing dishes\nC: watering plants\nD: doing laundry", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: cleaning pool\nB: washing dishes\nC: watering plants\nD: doing laundry", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_58_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_58_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_58_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_58_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_58_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_58_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_58_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_58_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_58_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_58_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_58_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_58_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_58_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_58_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_58_14.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_58_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: bench pressing\nB: deadlifting\nC: snatch weight lifting\nD: clean and jerk", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: bench pressing\nB: deadlifting\nC: snatch weight lifting\nD: clean and jerk", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_59_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_59_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_59_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_59_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_59_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_59_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_59_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_59_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_59_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_59_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_59_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_59_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_59_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_59_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_59_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_59_15.png"], "output": "C", 
"qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: tapping guitar\nB: tapping pen\nC: playing guitar\nD: strumming guitar", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: tapping guitar\nB: tapping pen\nC: playing guitar\nD: strumming guitar", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_60_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_60_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_60_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_60_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_60_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_60_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_60_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_60_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_60_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_60_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_60_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_60_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_60_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_60_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_60_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_60_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural 
image", "source": "kinetics400", "options": "A: frying eggs\nB: grilling meat\nC: boiling pasta\nD: cooking sausages", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: frying eggs\nB: grilling meat\nC: boiling pasta\nD: cooking sausages", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_61_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_61_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_61_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_61_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_61_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_61_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_61_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_61_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_61_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_61_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_61_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_61_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_61_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_61_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_61_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_61_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing flute\nB: playing cymbals\nC: playing guitar\nD: playing 
keyboard", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing flute\nB: playing cymbals\nC: playing guitar\nD: playing keyboard", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_62_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_62_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_62_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_62_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_62_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_62_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_62_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_62_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_62_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_62_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_62_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_62_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_62_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_62_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_62_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_62_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: mopping floor\nB: vacuuming\nC: cleaning windows\nD: sweeping floor", "question": "What is the action performed by the person in the video?", "context": "Select from the 
following choices.\nA: mopping floor\nB: vacuuming\nC: cleaning windows\nD: sweeping floor", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_63_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_63_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_63_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_63_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_63_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_63_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_63_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_63_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_63_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_63_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_63_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_63_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_63_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_63_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_63_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_63_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: gymnastics tumbling\nB: baking cookies\nC: chopping wood\nD: stretching leg", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: gymnastics tumbling\nB: baking cookies\nC: chopping wood\nD: stretching leg", 
"input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_64_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_64_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_64_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_64_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_64_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_64_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_64_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_64_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_64_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_64_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_64_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_64_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_64_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_64_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_64_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_64_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: cutting nails\nB: fixing hair\nC: shaving legs\nD: braiding hair", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: cutting nails\nB: fixing hair\nC: shaving legs\nD: braiding hair", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_65_0.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_65_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_65_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_65_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_65_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_65_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_65_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_65_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_65_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_65_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_65_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_65_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_65_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_65_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_65_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_65_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: sneezing\nB: auctioning\nC: testifying\nD: sign language interpreting", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: sneezing\nB: auctioning\nC: testifying\nD: sign language interpreting", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_66_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_66_1.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_66_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_66_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_66_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_66_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_66_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_66_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_66_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_66_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_66_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_66_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_66_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_66_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_66_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_66_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: eating ice cream\nB: licking lips\nC: tasting food\nD: baking cookies", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: eating ice cream\nB: licking lips\nC: tasting food\nD: baking cookies", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_67_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_67_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_67_2.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_67_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_67_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_67_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_67_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_67_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_67_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_67_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_67_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_67_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_67_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_67_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_67_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_67_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: climbing ladder\nB: krumping\nC: breakdancing\nD: robot dancing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: climbing ladder\nB: krumping\nC: breakdancing\nD: robot dancing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_68_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_68_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_68_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_68_3.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_68_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_68_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_68_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_68_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_68_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_68_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_68_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_68_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_68_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_68_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_68_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_68_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: skateboarding\nB: snowboarding\nC: riding mountain bike\nD: skiing crosscountry", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: skateboarding\nB: snowboarding\nC: riding mountain bike\nD: skiing crosscountry", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_69_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_69_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_69_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_69_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_69_4.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_69_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_69_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_69_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_69_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_69_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_69_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_69_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_69_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_69_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_69_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_69_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: riding mountain bike\nB: skiing (not slalom or crosscountry)\nC: snowboarding\nD: playing kickball", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: riding mountain bike\nB: skiing (not slalom or crosscountry)\nC: snowboarding\nD: playing kickball", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_70_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_70_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_70_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_70_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_70_4.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_70_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_70_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_70_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_70_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_70_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_70_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_70_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_70_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_70_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_70_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_70_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: kissing\nB: snuggling\nC: romantic dancing\nD: flirting", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: kissing\nB: snuggling\nC: romantic dancing\nD: flirting", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_71_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_71_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_71_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_71_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_71_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_71_5.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_71_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_71_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_71_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_71_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_71_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_71_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_71_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_71_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_71_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_71_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: washing feet\nB: playing kickball\nC: playing basketball\nD: playing squash or racquetball", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: washing feet\nB: playing kickball\nC: playing basketball\nD: playing squash or racquetball", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_72_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_72_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_72_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_72_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_72_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_72_5.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_72_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_72_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_72_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_72_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_72_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_72_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_72_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_72_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_72_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_72_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: waxing chest\nB: shooting goal (soccer)\nC: playing basketball\nD: riding scooter", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: waxing chest\nB: shooting goal (soccer)\nC: playing basketball\nD: riding scooter", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_73_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_73_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_73_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_73_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_73_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_73_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_73_6.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_73_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_73_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_73_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_73_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_73_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_73_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_73_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_73_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_73_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: hurdling\nB: high jump\nC: pole vault\nD: triple jump", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: hurdling\nB: high jump\nC: pole vault\nD: triple jump", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_74_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_74_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_74_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_74_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_74_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_74_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_74_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_74_7.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_74_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_74_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_74_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_74_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_74_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_74_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_74_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_74_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: fixing hair\nB: curling hair\nC: braiding hair\nD: cutting nails", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: fixing hair\nB: curling hair\nC: braiding hair\nD: cutting nails", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_75_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_75_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_75_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_75_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_75_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_75_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_75_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_75_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_75_8.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_75_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_75_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_75_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_75_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_75_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_75_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_75_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: preparing salad\nB: baking cookies\nC: cooking chicken\nD: making a cake", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: preparing salad\nB: baking cookies\nC: cooking chicken\nD: making a cake", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_76_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_76_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_76_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_76_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_76_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_76_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_76_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_76_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_76_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_76_9.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_76_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_76_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_76_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_76_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_76_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_76_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: waxing chest\nB: arms wrestling\nC: shaving legs\nD: shaving head", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: waxing chest\nB: arms wrestling\nC: shaving legs\nD: shaving head", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_77_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_77_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_77_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_77_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_77_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_77_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_77_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_77_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_77_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_77_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_77_10.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_77_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_77_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_77_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_77_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_77_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: filling eyebrows\nB: waxing eyebrows\nC: shaving legs\nD: cutting watermelon", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: filling eyebrows\nB: waxing eyebrows\nC: shaving legs\nD: cutting watermelon", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_78_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_78_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_78_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_78_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_78_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_78_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_78_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_78_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_78_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_78_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_78_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_78_11.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_78_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_78_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_78_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_78_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: milking cow\nB: petting animal (not cat)\nC: feeding birds\nD: holding snake", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: milking cow\nB: petting animal (not cat)\nC: feeding birds\nD: holding snake", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_79_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_79_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_79_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_79_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_79_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_79_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_79_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_79_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_79_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_79_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_79_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_79_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_79_12.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_79_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_79_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_79_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: shot put\nB: throwing discus\nC: hurdling\nD: discus throw", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: shot put\nB: throwing discus\nC: hurdling\nD: discus throw", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_80_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_80_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_80_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_80_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_80_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_80_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_80_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_80_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_80_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_80_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_80_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_80_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_80_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_80_13.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_80_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_80_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: making bed\nB: folding paper\nC: folding clothes\nD: ironing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: making bed\nB: folding paper\nC: folding clothes\nD: ironing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_81_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_81_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_81_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_81_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_81_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_81_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_81_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_81_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_81_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_81_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_81_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_81_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_81_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_81_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_81_14.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_81_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: stretching leg\nB: bouncing on trampoline\nC: jumpstyle dancing\nD: exercising with an exercise ball", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: stretching leg\nB: bouncing on trampoline\nC: jumpstyle dancing\nD: exercising with an exercise ball", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_82_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_82_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_82_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_82_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_82_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_82_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_82_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_82_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_82_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_82_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_82_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_82_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_82_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_82_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_82_14.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_82_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: washing hair\nB: combing hair\nC: styling hair\nD: brushing hair", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: washing hair\nB: combing hair\nC: styling hair\nD: brushing hair", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_83_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_83_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_83_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_83_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_83_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_83_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_83_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_83_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_83_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_83_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_83_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_83_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_83_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_83_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_83_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_83_15.png"], "output": "D", "qwen3-vl": "image 
none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: swinging legs\nB: front raises\nC: bouncing on trampoline\nD: stretching leg", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: swinging legs\nB: front raises\nC: bouncing on trampoline\nD: stretching leg", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_84_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_84_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_84_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_84_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_84_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_84_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_84_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_84_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_84_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_84_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_84_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_84_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_84_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_84_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_84_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_84_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural 
image", "source": "kinetics400", "options": "A: shaving legs\nB: climbing ladder\nC: changing oil\nD: sanding floor", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: shaving legs\nB: climbing ladder\nC: changing oil\nD: sanding floor", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_85_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_85_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_85_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_85_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_85_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_85_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_85_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_85_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_85_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_85_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_85_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_85_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_85_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_85_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_85_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_85_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: laying bricks\nB: swing dancing\nC: ironing\nD: climbing tree", 
"question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: laying bricks\nB: swing dancing\nC: ironing\nD: climbing tree", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_86_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_86_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_86_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_86_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_86_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_86_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_86_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_86_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_86_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_86_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_86_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_86_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_86_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_86_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_86_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_86_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: punching bag\nB: eating burger\nC: driving tractor\nD: drinking shots", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: 
punching bag\nB: eating burger\nC: driving tractor\nD: drinking shots", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_87_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_87_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_87_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_87_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_87_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_87_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_87_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_87_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_87_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_87_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_87_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_87_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_87_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_87_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_87_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_87_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: headbanging\nB: playing guitar\nC: drumming fingers\nD: bouncing on trampoline", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: headbanging\nB: playing guitar\nC: drumming fingers\nD: bouncing on trampoline", "input_image_path": 
["./Continuous-temporal/general_action_recognition/general_action_recognition_88_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_88_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_88_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_88_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_88_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_88_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_88_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_88_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_88_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_88_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_88_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_88_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_88_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_88_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_88_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_88_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: breading or breadcrumbing\nB: shoveling snow\nC: springboard diving\nD: decorating the christmas tree", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: breading or breadcrumbing\nB: shoveling snow\nC: springboard diving\nD: decorating the christmas tree", "input_image_path": 
["./Continuous-temporal/general_action_recognition/general_action_recognition_89_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_89_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_89_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_89_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_89_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_89_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_89_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_89_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_89_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_89_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_89_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_89_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_89_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_89_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_89_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_89_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: applauding\nB: cheering\nC: clapping\nD: snapping fingers", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: applauding\nB: cheering\nC: clapping\nD: snapping fingers", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_90_0.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_90_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_90_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_90_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_90_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_90_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_90_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_90_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_90_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_90_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_90_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_90_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_90_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_90_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_90_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_90_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: climbing ladder\nB: playing sitar\nC: playing harp\nD: strumming guitar", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: climbing ladder\nB: playing sitar\nC: playing harp\nD: strumming guitar", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_91_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_91_1.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_91_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_91_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_91_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_91_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_91_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_91_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_91_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_91_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_91_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_91_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_91_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_91_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_91_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_91_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing guitar\nB: making jewelry\nC: filming movie\nD: unboxing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing guitar\nB: making jewelry\nC: filming movie\nD: unboxing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_92_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_92_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_92_2.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_92_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_92_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_92_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_92_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_92_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_92_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_92_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_92_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_92_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_92_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_92_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_92_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_92_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing cello\nB: playing guitar\nC: bouncing on trampoline\nD: recording music", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing cello\nB: playing guitar\nC: bouncing on trampoline\nD: recording music", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_93_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_93_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_93_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_93_3.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_93_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_93_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_93_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_93_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_93_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_93_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_93_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_93_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_93_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_93_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_93_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_93_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: folding clothes\nB: weaving basket\nC: baking cookies\nD: making jewelry", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: folding clothes\nB: weaving basket\nC: baking cookies\nD: making jewelry", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_94_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_94_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_94_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_94_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_94_4.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_94_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_94_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_94_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_94_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_94_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_94_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_94_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_94_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_94_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_94_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_94_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: biking through snow\nB: paragliding\nC: riding mountain bike\nD: windsurfing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: biking through snow\nB: paragliding\nC: riding mountain bike\nD: windsurfing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_95_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_95_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_95_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_95_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_95_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_95_5.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_95_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_95_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_95_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_95_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_95_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_95_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_95_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_95_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_95_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_95_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: peeling apples\nB: eating watermelon\nC: shaving legs\nD: cutting watermelon", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: peeling apples\nB: eating watermelon\nC: shaving legs\nD: cutting watermelon", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_96_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_96_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_96_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_96_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_96_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_96_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_96_6.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_96_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_96_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_96_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_96_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_96_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_96_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_96_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_96_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_96_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: skiing crosscountry\nB: biking through snow\nC: riding mountain bike\nD: snowboarding", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: skiing crosscountry\nB: biking through snow\nC: riding mountain bike\nD: snowboarding", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_97_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_97_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_97_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_97_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_97_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_97_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_97_6.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_97_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_97_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_97_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_97_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_97_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_97_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_97_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_97_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_97_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: sailing\nB: tapping pen\nC: playing organ\nD: riding mule", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: sailing\nB: tapping pen\nC: playing organ\nD: riding mule", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_98_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_98_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_98_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_98_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_98_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_98_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_98_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_98_7.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_98_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_98_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_98_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_98_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_98_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_98_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_98_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_98_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing kickball\nB: dancing gangnam style\nC: playing basketball\nD: playing paintball", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing kickball\nB: dancing gangnam style\nC: playing basketball\nD: playing paintball", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_99_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_99_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_99_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_99_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_99_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_99_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_99_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_99_7.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_99_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_99_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_99_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_99_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_99_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_99_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_99_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_99_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: deadlifting\nB: bench pressing\nC: pull ups\nD: jogging", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: deadlifting\nB: bench pressing\nC: pull ups\nD: jogging", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_100_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_100_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_100_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_100_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_100_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_100_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_100_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_100_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_100_8.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_100_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_100_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_100_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_100_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_100_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_100_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_100_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing didgeridoo\nB: tossing coin\nC: typing\nD: tapping pen", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing didgeridoo\nB: tossing coin\nC: typing\nD: tapping pen", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_101_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_101_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_101_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_101_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_101_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_101_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_101_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_101_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_101_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_101_9.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_101_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_101_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_101_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_101_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_101_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_101_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: drumming fingers\nB: playing guitar\nC: tapping pen\nD: shuffling cards", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: drumming fingers\nB: playing guitar\nC: tapping pen\nD: shuffling cards", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_102_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_102_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_102_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_102_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_102_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_102_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_102_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_102_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_102_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_102_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_102_10.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_102_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_102_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_102_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_102_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_102_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: sneezing\nB: blowing leaves\nC: riding mule\nD: passing American football (in game)", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: sneezing\nB: blowing leaves\nC: riding mule\nD: passing American football (in game)", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_103_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_103_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_103_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_103_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_103_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_103_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_103_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_103_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_103_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_103_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_103_10.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_103_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_103_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_103_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_103_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_103_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: writing\nB: playing guitar\nC: typing on keyboard\nD: playing piano", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: writing\nB: playing guitar\nC: typing on keyboard\nD: playing piano", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_104_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_104_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_104_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_104_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_104_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_104_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_104_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_104_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_104_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_104_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_104_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_104_11.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_104_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_104_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_104_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_104_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: riding elephant\nB: playing kickball\nC: golf putting\nD: playing golf chipping", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: riding elephant\nB: playing kickball\nC: golf putting\nD: playing golf chipping", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_105_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_105_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_105_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_105_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_105_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_105_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_105_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_105_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_105_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_105_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_105_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_105_11.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_105_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_105_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_105_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_105_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: assembling computer\nB: sharpening knives\nC: building cabinet\nD: making tea", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: assembling computer\nB: sharpening knives\nC: building cabinet\nD: making tea", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_106_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_106_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_106_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_106_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_106_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_106_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_106_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_106_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_106_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_106_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_106_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_106_11.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_106_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_106_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_106_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_106_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: shaving legs\nB: sanding floor\nC: cutting watermelon\nD: sharpening knives", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: shaving legs\nB: sanding floor\nC: cutting watermelon\nD: sharpening knives", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_107_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_107_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_107_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_107_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_107_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_107_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_107_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_107_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_107_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_107_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_107_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_107_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_107_12.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_107_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_107_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_107_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: front raises\nB: bending back\nC: push up\nD: situp", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: front raises\nB: bending back\nC: push up\nD: situp", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_108_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_108_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_108_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_108_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_108_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_108_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_108_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_108_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_108_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_108_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_108_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_108_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_108_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_108_13.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_108_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_108_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing kickball\nB: hoverboarding\nC: riding scooter\nD: using segway", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing kickball\nB: hoverboarding\nC: riding scooter\nD: using segway", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_109_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_109_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_109_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_109_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_109_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_109_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_109_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_109_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_109_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_109_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_109_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_109_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_109_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_109_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_109_14.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_109_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: grooming horse\nB: peeling apples\nC: tickling\nD: cooking on campfire", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: grooming horse\nB: peeling apples\nC: tickling\nD: cooking on campfire", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_110_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_110_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_110_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_110_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_110_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_110_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_110_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_110_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_110_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_110_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_110_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_110_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_110_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_110_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_110_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_110_15.png"], 
"output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: eating cake\nB: eating chips\nC: tasting food\nD: eating watermelon", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: eating cake\nB: eating chips\nC: tasting food\nD: eating watermelon", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_111_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_111_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_111_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_111_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_111_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_111_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_111_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_111_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_111_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_111_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_111_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_111_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_111_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_111_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_111_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_111_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", 
"visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: snatch weight lifting\nB: deadlifting\nC: clean and jerk\nD: bouncing on trampoline", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: snatch weight lifting\nB: deadlifting\nC: clean and jerk\nD: bouncing on trampoline", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_112_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_112_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_112_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_112_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_112_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_112_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_112_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_112_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_112_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_112_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_112_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_112_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_112_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_112_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_112_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_112_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": 
"kinetics400", "options": "A: peeling apples\nB: weaving basket\nC: feeding birds\nD: crossing river", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: peeling apples\nB: weaving basket\nC: feeding birds\nD: crossing river", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_113_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_113_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_113_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_113_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_113_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_113_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_113_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_113_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_113_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_113_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_113_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_113_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_113_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_113_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_113_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_113_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: climbing tree\nB: swing dancing\nC: rock scissors paper\nD: 
abseiling", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: climbing tree\nB: swing dancing\nC: rock scissors paper\nD: abseiling", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_114_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_114_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_114_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_114_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_114_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_114_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_114_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_114_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_114_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_114_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_114_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_114_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_114_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_114_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_114_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_114_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: grilling fish\nB: baking cookies\nC: cooking chicken\nD: breading or breadcrumbing", "question": "What is the action performed by the person in the video?", 
"context": "Select from the following choices.\nA: grilling fish\nB: baking cookies\nC: cooking chicken\nD: breading or breadcrumbing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_115_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_115_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_115_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_115_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_115_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_115_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_115_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_115_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_115_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_115_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_115_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_115_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_115_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_115_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_115_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_115_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: hockey stop\nB: bobsledding\nC: dribbling basketball\nD: swimming breast stroke", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: hockey stop\nB: 
bobsledding\nC: dribbling basketball\nD: swimming breast stroke", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_116_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_116_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_116_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_116_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_116_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_116_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_116_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_116_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_116_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_116_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_116_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_116_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_116_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_116_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_116_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_116_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: eating cake\nB: eating chips\nC: eating burger\nD: eating watermelon", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: eating cake\nB: eating chips\nC: eating burger\nD: eating watermelon", "input_image_path": 
["./Continuous-temporal/general_action_recognition/general_action_recognition_117_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_117_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_117_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_117_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_117_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_117_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_117_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_117_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_117_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_117_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_117_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_117_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_117_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_117_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_117_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_117_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: bouncing on trampoline\nB: collecting garbage\nC: cleaning pool\nD: garbage collecting", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: bouncing on trampoline\nB: collecting garbage\nC: cleaning pool\nD: garbage collecting", "input_image_path": 
["./Continuous-temporal/general_action_recognition/general_action_recognition_118_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_118_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_118_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_118_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_118_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_118_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_118_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_118_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_118_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_118_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_118_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_118_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_118_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_118_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_118_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_118_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: boiling water\nB: mixing drink\nC: brewing coffee\nD: making tea", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: boiling water\nB: mixing drink\nC: brewing coffee\nD: making tea", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_119_0.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_119_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_119_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_119_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_119_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_119_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_119_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_119_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_119_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_119_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_119_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_119_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_119_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_119_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_119_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_119_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing guitar\nB: playing flute\nC: strumming guitar\nD: playing saxophone", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing guitar\nB: playing flute\nC: strumming guitar\nD: playing saxophone", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_120_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_120_1.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_120_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_120_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_120_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_120_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_120_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_120_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_120_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_120_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_120_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_120_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_120_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_120_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_120_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_120_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: beating drum\nB: tasting food\nC: stomping grapes\nD: dancing charleston", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: beating drum\nB: tasting food\nC: stomping grapes\nD: dancing charleston", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_121_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_121_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_121_2.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_121_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_121_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_121_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_121_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_121_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_121_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_121_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_121_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_121_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_121_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_121_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_121_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_121_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: making a cake\nB: grilling steak\nC: cooking on campfire\nD: baking cookies", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: making a cake\nB: grilling steak\nC: cooking on campfire\nD: baking cookies", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_122_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_122_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_122_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_122_3.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_122_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_122_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_122_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_122_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_122_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_122_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_122_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_122_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_122_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_122_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_122_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_122_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: springboard diving\nB: bouncing on trampoline\nC: swimming breast stroke\nD: diving cliff", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: springboard diving\nB: bouncing on trampoline\nC: swimming breast stroke\nD: diving cliff", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_123_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_123_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_123_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_123_3.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_123_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_123_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_123_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_123_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_123_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_123_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_123_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_123_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_123_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_123_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_123_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_123_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: grinding meat\nB: sanding floor\nC: sharpening knives\nD: welding", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: grinding meat\nB: sanding floor\nC: sharpening knives\nD: welding", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_124_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_124_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_124_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_124_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_124_4.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_124_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_124_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_124_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_124_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_124_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_124_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_124_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_124_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_124_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_124_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_124_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: plastering\nB: driving tractor\nC: playing kickball\nD: riding mountain bike", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: plastering\nB: driving tractor\nC: playing kickball\nD: riding mountain bike", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_125_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_125_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_125_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_125_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_125_4.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_125_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_125_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_125_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_125_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_125_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_125_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_125_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_125_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_125_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_125_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_125_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: cleaning floor\nB: sweeping floor\nC: mopping floor\nD: washing floor", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: cleaning floor\nB: sweeping floor\nC: mopping floor\nD: washing floor", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_126_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_126_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_126_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_126_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_126_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_126_5.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_126_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_126_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_126_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_126_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_126_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_126_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_126_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_126_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_126_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_126_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: rock climbing\nB: parkour\nC: ice climbing\nD: free running", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: rock climbing\nB: parkour\nC: ice climbing\nD: free running", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_127_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_127_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_127_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_127_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_127_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_127_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_127_6.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_127_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_127_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_127_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_127_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_127_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_127_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_127_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_127_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_127_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: riding mountain bike\nB: kicking soccer ball\nC: shooting goal (soccer)\nD: playing basketball", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: riding mountain bike\nB: kicking soccer ball\nC: shooting goal (soccer)\nD: playing basketball", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_128_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_128_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_128_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_128_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_128_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_128_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_128_6.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_128_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_128_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_128_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_128_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_128_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_128_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_128_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_128_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_128_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing violin\nB: sneezing\nC: sailing\nD: tango dancing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing violin\nB: sneezing\nC: sailing\nD: tango dancing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_129_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_129_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_129_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_129_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_129_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_129_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_129_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_129_7.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_129_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_129_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_129_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_129_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_129_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_129_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_129_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_129_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: jumpstyle dancing\nB: jumping on trampoline\nC: skipping rope\nD: slacklining", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: jumpstyle dancing\nB: jumping on trampoline\nC: skipping rope\nD: slacklining", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_130_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_130_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_130_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_130_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_130_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_130_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_130_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_130_7.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_130_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_130_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_130_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_130_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_130_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_130_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_130_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_130_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: riding elephant\nB: petting animal (not cat)\nC: waxing chest\nD: grooming horse", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: riding elephant\nB: petting animal (not cat)\nC: waxing chest\nD: grooming horse", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_131_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_131_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_131_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_131_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_131_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_131_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_131_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_131_7.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_131_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_131_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_131_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_131_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_131_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_131_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_131_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_131_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: capoeira\nB: hoverboarding\nC: playing kickball\nD: kicking soccer ball", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: capoeira\nB: hoverboarding\nC: playing kickball\nD: kicking soccer ball", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_132_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_132_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_132_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_132_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_132_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_132_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_132_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_132_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_132_8.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_132_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_132_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_132_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_132_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_132_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_132_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_132_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: saut\u00e9ing vegetables\nB: scrambling eggs\nC: beating eggs\nD: cooking on campfire", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: saut\u00e9ing vegetables\nB: scrambling eggs\nC: beating eggs\nD: cooking on campfire", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_133_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_133_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_133_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_133_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_133_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_133_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_133_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_133_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_133_8.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_133_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_133_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_133_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_133_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_133_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_133_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_133_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing paintball\nB: doing laundry\nC: robot dancing\nD: tango dancing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing paintball\nB: doing laundry\nC: robot dancing\nD: tango dancing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_134_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_134_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_134_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_134_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_134_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_134_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_134_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_134_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_134_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_134_9.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_134_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_134_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_134_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_134_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_134_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_134_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: fishing\nB: climbing mountain\nC: rowing boat\nD: crossing river", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: fishing\nB: climbing mountain\nC: rowing boat\nD: crossing river", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_135_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_135_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_135_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_135_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_135_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_135_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_135_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_135_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_135_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_135_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_135_10.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_135_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_135_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_135_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_135_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_135_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: jumpstyle dancing\nB: jogging\nC: exercising with an exercise ball\nD: running on treadmill", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: jumpstyle dancing\nB: jogging\nC: exercising with an exercise ball\nD: running on treadmill", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_136_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_136_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_136_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_136_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_136_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_136_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_136_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_136_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_136_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_136_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_136_10.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_136_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_136_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_136_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_136_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_136_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: lunges\nB: leg press\nC: squat\nD: push ups", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: lunges\nB: leg press\nC: squat\nD: push ups", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_137_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_137_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_137_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_137_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_137_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_137_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_137_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_137_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_137_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_137_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_137_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_137_11.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_137_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_137_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_137_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_137_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: trimming trees\nB: building cabinet\nC: tossing coin\nD: folding paper", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: trimming trees\nB: building cabinet\nC: tossing coin\nD: folding paper", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_138_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_138_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_138_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_138_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_138_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_138_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_138_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_138_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_138_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_138_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_138_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_138_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_138_12.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_138_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_138_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_138_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: trimming trees\nB: getting a haircut\nC: barbequing\nD: shaving head", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: trimming trees\nB: getting a haircut\nC: barbequing\nD: shaving head", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_139_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_139_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_139_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_139_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_139_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_139_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_139_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_139_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_139_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_139_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_139_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_139_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_139_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_139_13.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_139_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_139_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: breading or breadcrumbing\nB: making tea\nC: hunting rabbits\nD: egg hunting", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: breading or breadcrumbing\nB: making tea\nC: hunting rabbits\nD: egg hunting", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_140_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_140_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_140_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_140_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_140_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_140_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_140_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_140_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_140_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_140_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_140_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_140_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_140_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_140_13.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_140_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_140_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: laying bricks\nB: wall painting\nC: plastering\nD: rock scissors paper", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: laying bricks\nB: wall painting\nC: plastering\nD: rock scissors paper", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_141_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_141_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_141_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_141_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_141_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_141_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_141_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_141_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_141_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_141_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_141_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_141_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_141_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_141_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_141_14.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_141_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: brush painting\nB: breading or breadcrumbing\nC: shining shoes\nD: grinding meat", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: brush painting\nB: breading or breadcrumbing\nC: shining shoes\nD: grinding meat", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_142_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_142_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_142_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_142_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_142_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_142_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_142_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_142_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_142_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_142_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_142_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_142_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_142_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_142_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_142_14.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_142_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: bending back\nB: spinning poi\nC: juggling fire\nD: parkour", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: bending back\nB: spinning poi\nC: juggling fire\nD: parkour", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_143_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_143_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_143_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_143_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_143_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_143_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_143_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_143_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_143_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_143_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_143_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_143_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_143_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_143_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_143_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_143_15.png"], "output": "B", "qwen3-vl": 
"image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: jetskiing\nB: windsurfing\nC: water skiing\nD: surfing water", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: jetskiing\nB: windsurfing\nC: water skiing\nD: surfing water", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_144_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_144_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_144_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_144_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_144_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_144_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_144_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_144_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_144_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_144_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_144_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_144_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_144_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_144_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_144_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_144_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", 
"source": "kinetics400", "options": "A: tightrope walking\nB: swinging legs\nC: slacklining\nD: rock scissors paper", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: tightrope walking\nB: swinging legs\nC: slacklining\nD: rock scissors paper", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_145_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_145_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_145_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_145_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_145_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_145_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_145_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_145_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_145_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_145_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_145_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_145_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_145_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_145_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_145_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_145_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: parkour\nB: bending back\nC: spray painting\nD: 
ice climbing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: parkour\nB: bending back\nC: spray painting\nD: ice climbing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_146_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_146_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_146_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_146_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_146_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_146_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_146_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_146_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_146_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_146_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_146_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_146_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_146_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_146_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_146_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_146_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: bee keeping\nB: watering plants\nC: baking cookies\nD: trimming trees", "question": "What is the action performed by the person in the video?", "context": "Select from 
the following choices.\nA: bee keeping\nB: watering plants\nC: baking cookies\nD: trimming trees", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_147_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_147_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_147_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_147_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_147_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_147_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_147_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_147_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_147_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_147_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_147_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_147_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_147_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_147_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_147_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_147_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing chess\nB: rock scissors paper\nC: playing piano\nD: balloon blowing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing chess\nB: rock scissors paper\nC: playing piano\nD: balloon 
blowing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_148_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_148_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_148_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_148_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_148_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_148_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_148_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_148_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_148_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_148_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_148_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_148_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_148_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_148_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_148_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_148_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: water skiing\nB: planting trees\nC: watering plants\nD: gardening", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: water skiing\nB: planting trees\nC: watering plants\nD: gardening", "input_image_path": 
["./Continuous-temporal/general_action_recognition/general_action_recognition_149_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_149_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_149_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_149_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_149_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_149_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_149_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_149_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_149_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_149_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_149_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_149_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_149_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_149_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_149_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_149_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: swinging legs\nB: pull ups\nC: bouncing on trampoline\nD: dunking basketball", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: swinging legs\nB: pull ups\nC: bouncing on trampoline\nD: dunking basketball", "input_image_path": 
["./Continuous-temporal/general_action_recognition/general_action_recognition_150_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_150_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_150_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_150_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_150_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_150_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_150_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_150_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_150_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_150_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_150_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_150_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_150_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_150_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_150_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_150_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: feeding birds\nB: riding camel\nC: riding mule\nD: sailing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: feeding birds\nB: riding camel\nC: riding mule\nD: sailing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_151_0.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_151_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_151_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_151_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_151_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_151_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_151_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_151_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_151_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_151_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_151_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_151_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_151_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_151_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_151_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_151_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: windsurfing\nB: kayaking\nC: sailing\nD: rowing boat", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: windsurfing\nB: kayaking\nC: sailing\nD: rowing boat", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_152_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_152_1.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_152_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_152_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_152_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_152_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_152_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_152_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_152_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_152_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_152_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_152_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_152_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_152_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_152_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_152_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: squat\nB: yoga\nC: applauding\nD: doing nails", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: squat\nB: yoga\nC: applauding\nD: doing nails", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_153_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_153_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_153_2.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_153_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_153_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_153_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_153_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_153_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_153_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_153_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_153_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_153_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_153_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_153_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_153_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_153_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: tango dancing\nB: playing organ\nC: strumming guitar\nD: playing bass guitar", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: tango dancing\nB: playing organ\nC: strumming guitar\nD: playing bass guitar", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_154_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_154_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_154_2.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_154_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_154_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_154_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_154_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_154_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_154_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_154_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_154_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_154_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_154_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_154_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_154_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_154_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: kite surfing\nB: flying kite\nC: swinging legs\nD: bouncing on trampoline", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: kite surfing\nB: flying kite\nC: swinging legs\nD: bouncing on trampoline", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_155_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_155_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_155_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_155_3.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_155_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_155_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_155_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_155_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_155_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_155_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_155_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_155_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_155_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_155_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_155_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_155_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: cooking sausages\nB: eating burger\nC: grinding meat\nD: breading or breadcrumbing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: cooking sausages\nB: eating burger\nC: grinding meat\nD: breading or breadcrumbing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_156_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_156_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_156_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_156_3.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_156_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_156_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_156_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_156_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_156_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_156_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_156_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_156_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_156_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_156_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_156_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_156_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: petting animal (not cat)\nB: carrying baby\nC: feeding birds\nD: changing oil", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: petting animal (not cat)\nB: carrying baby\nC: feeding birds\nD: changing oil", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_157_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_157_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_157_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_157_3.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_157_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_157_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_157_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_157_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_157_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_157_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_157_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_157_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_157_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_157_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_157_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_157_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: bouncing on trampoline\nB: snorkeling\nC: surfing water\nD: water skiing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: bouncing on trampoline\nB: snorkeling\nC: surfing water\nD: water skiing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_158_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_158_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_158_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_158_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_158_4.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_158_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_158_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_158_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_158_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_158_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_158_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_158_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_158_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_158_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_158_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_158_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: riding mule\nB: driving tractor\nC: skiing (not slalom or crosscountry)\nD: riding scooter", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: riding mule\nB: driving tractor\nC: skiing (not slalom or crosscountry)\nD: riding scooter", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_159_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_159_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_159_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_159_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_159_4.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_159_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_159_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_159_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_159_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_159_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_159_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_159_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_159_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_159_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_159_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_159_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: eating cake\nB: making a cake\nC: eating chips\nD: baking cookies", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: eating cake\nB: making a cake\nC: eating chips\nD: baking cookies", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_160_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_160_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_160_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_160_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_160_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_160_5.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_160_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_160_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_160_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_160_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_160_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_160_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_160_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_160_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_160_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_160_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: drumming fingers\nB: tapping guitar\nC: playing bass guitar\nD: strumming guitar", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: drumming fingers\nB: tapping guitar\nC: playing bass guitar\nD: strumming guitar", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_161_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_161_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_161_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_161_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_161_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_161_5.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_161_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_161_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_161_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_161_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_161_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_161_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_161_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_161_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_161_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_161_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing saxophone\nB: drinking beer\nC: smoking\nD: snorkeling", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing saxophone\nB: drinking beer\nC: smoking\nD: snorkeling", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_162_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_162_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_162_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_162_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_162_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_162_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_162_6.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_162_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_162_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_162_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_162_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_162_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_162_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_162_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_162_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_162_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: marching\nB: auctioning\nC: bouncing on trampoline\nD: checking tires", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: marching\nB: auctioning\nC: bouncing on trampoline\nD: checking tires", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_163_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_163_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_163_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_163_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_163_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_163_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_163_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_163_7.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_163_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_163_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_163_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_163_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_163_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_163_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_163_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_163_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: cutting watermelon\nB: shredding paper\nC: ripping paper\nD: sweeping floor", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: cutting watermelon\nB: shredding paper\nC: ripping paper\nD: sweeping floor", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_164_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_164_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_164_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_164_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_164_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_164_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_164_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_164_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_164_8.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_164_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_164_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_164_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_164_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_164_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_164_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_164_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing soccer\nB: dribbling basketball\nC: kicking soccer ball\nD: juggling soccer ball", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing soccer\nB: dribbling basketball\nC: kicking soccer ball\nD: juggling soccer ball", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_165_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_165_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_165_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_165_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_165_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_165_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_165_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_165_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_165_8.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_165_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_165_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_165_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_165_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_165_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_165_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_165_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: applying cream\nB: shaving legs\nC: washing feet\nD: massaging person's head", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: applying cream\nB: shaving legs\nC: washing feet\nD: massaging person's head", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_166_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_166_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_166_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_166_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_166_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_166_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_166_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_166_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_166_8.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_166_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_166_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_166_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_166_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_166_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_166_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_166_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: rock scissors paper\nB: front raises\nC: springboard diving\nD: bungee jumping", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: rock scissors paper\nB: front raises\nC: springboard diving\nD: bungee jumping", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_167_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_167_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_167_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_167_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_167_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_167_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_167_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_167_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_167_8.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_167_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_167_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_167_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_167_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_167_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_167_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_167_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: tire rotation\nB: changing oil\nC: inspecting engine\nD: checking tires", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: tire rotation\nB: changing oil\nC: inspecting engine\nD: checking tires", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_168_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_168_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_168_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_168_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_168_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_168_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_168_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_168_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_168_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_168_9.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_168_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_168_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_168_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_168_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_168_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_168_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: massaging feet\nB: milking cow\nC: petting animal (not cat)\nD: feeding birds", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: massaging feet\nB: milking cow\nC: petting animal (not cat)\nD: feeding birds", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_169_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_169_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_169_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_169_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_169_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_169_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_169_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_169_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_169_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_169_9.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_169_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_169_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_169_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_169_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_169_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_169_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: grilling meat\nB: eating burger\nC: cooking sausages\nD: eating chips", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: grilling meat\nB: eating burger\nC: cooking sausages\nD: eating chips", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_170_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_170_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_170_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_170_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_170_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_170_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_170_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_170_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_170_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_170_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_170_10.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_170_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_170_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_170_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_170_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_170_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: springboard diving\nB: hurdling\nC: zumba\nD: faceplanting", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: springboard diving\nB: hurdling\nC: zumba\nD: faceplanting", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_171_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_171_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_171_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_171_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_171_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_171_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_171_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_171_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_171_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_171_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_171_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_171_11.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_171_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_171_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_171_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_171_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: eating chips\nB: drinking beer\nC: tasting food\nD: drinking shots", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: eating chips\nB: drinking beer\nC: tasting food\nD: drinking shots", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_172_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_172_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_172_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_172_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_172_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_172_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_172_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_172_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_172_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_172_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_172_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_172_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_172_12.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_172_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_172_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_172_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: eating hotdog\nB: peeling apples\nC: baking cookies\nD: eating watermelon", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: eating hotdog\nB: peeling apples\nC: baking cookies\nD: eating watermelon", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_173_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_173_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_173_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_173_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_173_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_173_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_173_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_173_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_173_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_173_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_173_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_173_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_173_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_173_13.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_173_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_173_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: throwing discus\nB: passing American football (in game)\nC: playing kickball\nD: playing basketball", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: throwing discus\nB: passing American football (in game)\nC: playing kickball\nD: playing basketball", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_174_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_174_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_174_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_174_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_174_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_174_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_174_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_174_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_174_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_174_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_174_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_174_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_174_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_174_13.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_174_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_174_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: dancing charleston\nB: swing dancing\nC: tango dancing\nD: bungee jumping", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: dancing charleston\nB: swing dancing\nC: tango dancing\nD: bungee jumping", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_175_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_175_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_175_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_175_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_175_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_175_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_175_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_175_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_175_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_175_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_175_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_175_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_175_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_175_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_175_14.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_175_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: deadlifting\nB: springboard diving\nC: trapezing\nD: rock scissors paper", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: deadlifting\nB: springboard diving\nC: trapezing\nD: rock scissors paper", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_176_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_176_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_176_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_176_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_176_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_176_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_176_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_176_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_176_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_176_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_176_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_176_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_176_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_176_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_176_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_176_15.png"], 
"output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: strumming guitar\nB: weaving basket\nC: peeling apples\nD: baking cookies", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: strumming guitar\nB: weaving basket\nC: peeling apples\nD: baking cookies", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_177_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_177_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_177_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_177_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_177_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_177_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_177_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_177_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_177_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_177_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_177_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_177_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_177_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_177_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_177_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_177_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", 
"visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: sailing\nB: ice climbing\nC: skiing crosscountry\nD: snowmobiling", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: sailing\nB: ice climbing\nC: skiing crosscountry\nD: snowmobiling", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_178_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_178_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_178_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_178_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_178_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_178_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_178_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_178_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_178_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_178_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_178_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_178_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_178_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_178_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_178_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_178_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: petting 
cat\nB: petting animal (not cat)\nC: feeding birds\nD: stroking dog", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: petting cat\nB: petting animal (not cat)\nC: feeding birds\nD: stroking dog", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_179_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_179_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_179_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_179_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_179_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_179_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_179_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_179_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_179_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_179_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_179_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_179_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_179_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_179_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_179_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_179_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: cooking on campfire\nB: smoking\nC: grilling fish\nD: barbequing", "question": "What is the 
action performed by the person in the video?", "context": "Select from the following choices.\nA: cooking on campfire\nB: smoking\nC: grilling fish\nD: barbequing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_180_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_180_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_180_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_180_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_180_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_180_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_180_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_180_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_180_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_180_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_180_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_180_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_180_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_180_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_180_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_180_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing saxophone\nB: playing kickball\nC: playing guitar\nD: playing didgeridoo", "question": "What is the action performed by the person in the video?", "context": "Select from the following 
choices.\nA: playing saxophone\nB: playing kickball\nC: playing guitar\nD: playing didgeridoo", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_181_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_181_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_181_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_181_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_181_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_181_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_181_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_181_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_181_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_181_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_181_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_181_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_181_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_181_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_181_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_181_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing kickball\nB: driving tractor\nC: playing guitar\nD: air drumming", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing kickball\nB: driving tractor\nC: playing guitar\nD: air drumming", 
"input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_182_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_182_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_182_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_182_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_182_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_182_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_182_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_182_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_182_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_182_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_182_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_182_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_182_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_182_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_182_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_182_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: skateboarding\nB: playing kickball\nC: skiing crosscountry\nD: hoverboarding", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: skateboarding\nB: playing kickball\nC: skiing crosscountry\nD: hoverboarding", "input_image_path": 
["./Continuous-temporal/general_action_recognition/general_action_recognition_183_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_183_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_183_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_183_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_183_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_183_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_183_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_183_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_183_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_183_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_183_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_183_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_183_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_183_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_183_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_183_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: triple jump\nB: hurdling\nC: bouncing on trampoline\nD: high jump", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: triple jump\nB: hurdling\nC: bouncing on trampoline\nD: high jump", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_184_0.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_184_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_184_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_184_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_184_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_184_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_184_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_184_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_184_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_184_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_184_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_184_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_184_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_184_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_184_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_184_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: kicking soccer ball\nB: playing kickball\nC: playing basketball\nD: playing paintball", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: kicking soccer ball\nB: playing kickball\nC: playing basketball\nD: playing paintball", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_185_0.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_185_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_185_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_185_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_185_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_185_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_185_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_185_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_185_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_185_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_185_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_185_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_185_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_185_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_185_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_185_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing kickball\nB: tossing coin\nC: rock scissors paper\nD: throwing axe", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing kickball\nB: tossing coin\nC: rock scissors paper\nD: throwing axe", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_186_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_186_1.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_186_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_186_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_186_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_186_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_186_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_186_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_186_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_186_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_186_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_186_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_186_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_186_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_186_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_186_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing paintball\nB: shooting goal (soccer)\nC: brush painting\nD: celebrating", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing paintball\nB: shooting goal (soccer)\nC: brush painting\nD: celebrating", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_187_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_187_1.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_187_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_187_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_187_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_187_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_187_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_187_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_187_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_187_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_187_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_187_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_187_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_187_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_187_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_187_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: riding mule\nB: shooting goal (soccer)\nC: cutting watermelon\nD: chopping wood", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: riding mule\nB: shooting goal (soccer)\nC: cutting watermelon\nD: chopping wood", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_188_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_188_1.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_188_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_188_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_188_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_188_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_188_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_188_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_188_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_188_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_188_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_188_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_188_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_188_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_188_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_188_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: assembling computer\nB: grinding meat\nC: moving furniture\nD: brushing hair", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: assembling computer\nB: grinding meat\nC: moving furniture\nD: brushing hair", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_189_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_189_1.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_189_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_189_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_189_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_189_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_189_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_189_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_189_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_189_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_189_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_189_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_189_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_189_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_189_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_189_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: cooking chicken\nB: weaving basket\nC: making a cake\nD: baking cookies", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: cooking chicken\nB: weaving basket\nC: making a cake\nD: baking cookies", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_190_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_190_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_190_2.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_190_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_190_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_190_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_190_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_190_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_190_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_190_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_190_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_190_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_190_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_190_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_190_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_190_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: eating chips\nB: cooking on campfire\nC: dining\nD: playing poker", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: eating chips\nB: cooking on campfire\nC: dining\nD: playing poker", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_191_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_191_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_191_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_191_3.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_191_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_191_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_191_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_191_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_191_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_191_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_191_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_191_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_191_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_191_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_191_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_191_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: throwing discus\nB: cutting watermelon\nC: bending back\nD: throwing axe", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: throwing discus\nB: cutting watermelon\nC: bending back\nD: throwing axe", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_192_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_192_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_192_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_192_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_192_4.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_192_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_192_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_192_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_192_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_192_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_192_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_192_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_192_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_192_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_192_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_192_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: surfing water\nB: hoverboarding\nC: skateboarding\nD: riding scooter", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: surfing water\nB: hoverboarding\nC: skateboarding\nD: riding scooter", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_193_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_193_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_193_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_193_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_193_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_193_5.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_193_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_193_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_193_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_193_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_193_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_193_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_193_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_193_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_193_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_193_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing saxophone\nB: playing drums\nC: playing guitar\nD: playing trumpet", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing saxophone\nB: playing drums\nC: playing guitar\nD: playing trumpet", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_194_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_194_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_194_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_194_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_194_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_194_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_194_6.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_194_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_194_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_194_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_194_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_194_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_194_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_194_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_194_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_194_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: playing kickball\nB: kicking soccer ball\nC: dribbling basketball\nD: playing basketball", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: playing kickball\nB: kicking soccer ball\nC: dribbling basketball\nD: playing basketball", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_195_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_195_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_195_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_195_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_195_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_195_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_195_6.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_195_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_195_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_195_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_195_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_195_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_195_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_195_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_195_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_195_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: windsurfing\nB: snorkeling\nC: surfing water\nD: sailing", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: windsurfing\nB: snorkeling\nC: surfing water\nD: sailing", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_196_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_196_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_196_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_196_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_196_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_196_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_196_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_196_7.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_196_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_196_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_196_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_196_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_196_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_196_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_196_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_196_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: washing feet\nB: earning a hair cut\nC: using segway\nD: cleaning toilet", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: washing feet\nB: earning a hair cut\nC: using segway\nD: cleaning toilet", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_197_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_197_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_197_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_197_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_197_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_197_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_197_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_197_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_197_8.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_197_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_197_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_197_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_197_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_197_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_197_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_197_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: folding clothes\nB: stretching leg\nC: lunge\nD: situp", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: folding clothes\nB: stretching leg\nC: lunge\nD: situp", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_198_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_198_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_198_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_198_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_198_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_198_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_198_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_198_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_198_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_198_9.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_198_10.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_198_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_198_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_198_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_198_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_198_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "general_action_recognition", "visual_input_component": "Video image or Natural image", "source": "kinetics400", "options": "A: auctioning\nB: shuffling cards\nC: rock scissors paper\nD: news anchoring", "question": "What is the action performed by the person in the video?", "context": "Select from the following choices.\nA: auctioning\nB: shuffling cards\nC: rock scissors paper\nD: news anchoring", "input_image_path": ["./Continuous-temporal/general_action_recognition/general_action_recognition_199_0.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_199_1.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_199_2.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_199_3.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_199_4.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_199_5.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_199_6.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_199_7.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_199_8.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_199_9.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_199_10.png", 
"./Continuous-temporal/general_action_recognition/general_action_recognition_199_11.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_199_12.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_199_13.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_199_14.png", "./Continuous-temporal/general_action_recognition/general_action_recognition_199_15.png"], "output": "D", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/gui_app_recognition/qwen3-vl/metadata_info.json b/results/gui_app_recognition/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..4e4c48f --- /dev/null +++ b/results/gui_app_recognition/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Zedge Wallpapers & Ringtones', 'Google Play Store']\nB: ['Photos', 'iNaturalist']\nC: ['Google Photos', 'PlantNet']\nD: ['Pinterest', 'Setting']\n", "question": "The corresponding actions are: step 1: CLICK: (642, 124)\nstep 2: CLICK: (90, 74)\nstep 3: CLICK: (693, 76)\nstep 4: CLICK: (780, 80)\nstep 5: TYPE: art deco\nstep 6: CLICK: (321, 153)\nstep 7: CLICK: (384, 660)\nstep 8: CLICK: (906, 78)\nstep 9: CLICK: (172, 757)\nstep 10: PRESS_RECENT\nstep 11: CLICK: (68, 530)\nstep 12: CLICK: (540, 276)\nstep 13: CLICK: (202, 215)\nstep 14: CLICK: (172, 238)\nstep 15: CLICK: (851, 70)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (642, 124)\nstep 2: CLICK: (90, 74)\nstep 3: CLICK: (693, 76)\nstep 4: CLICK: (780, 80)\nstep 5: TYPE: art deco\nstep 6: CLICK: (321, 153)\nstep 7: CLICK: (384, 660)\nstep 8: CLICK: (906, 78)\nstep 9: CLICK: (172, 757)\nstep 10: PRESS_RECENT\nstep 11: CLICK: (68, 530)\nstep 12: CLICK: (540, 276)\nstep 13: CLICK: (202, 215)\nstep 14: CLICK: (172, 238)\nstep 15: CLICK: (851, 70)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Zedge Wallpapers & Ringtones', 'Google Play Store']\nB: ['Photos', 'iNaturalist']\nC: ['Google Photos', 'PlantNet']\nD: ['Pinterest', 'Setting']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_0_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI 
image", "source": "GUI-Odyssey", "options": "A: ['Contacts', 'Farfetch']\nB: ['Vaulty:Hide Pictures Videos', 'Net-a-Porte']\nC: ['Google Play Store', 'Meesho']\nD: ['PlantNet', 'Target']\n", "question": "The corresponding actions are: step 1: CLICK: (168, 104)\nstep 2: CLICK: (479, 927)\nstep 3: CLICK: (474, 76)\nstep 4: TYPE: Meesho\nstep 5: CLICK: (919, 896)\nstep 6: CLICK: (770, 324)\nstep 7: PRESS_HOME\nstep 8: CLICK: (819, 400)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (168, 104)\nstep 2: CLICK: (479, 927)\nstep 3: CLICK: (474, 76)\nstep 4: TYPE: Meesho\nstep 5: CLICK: (919, 896)\nstep 6: CLICK: (770, 324)\nstep 7: PRESS_HOME\nstep 8: CLICK: (819, 400)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Contacts', 'Farfetch']\nB: ['Vaulty:Hide Pictures Videos', 'Net-a-Porte']\nC: ['Google Play Store', 'Meesho']\nD: ['PlantNet', 'Target']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_1_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_1_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_1_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_1_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_1_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_1_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_1_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_1_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_1_8.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Wikipedia', 
'Tiktok']\nB: ['Opera', 'Shorts VotTak: Short Video App']\nC: ['Firefox', 'Tubi: Movies & Live TV']\nD: ['Chrome', 'Netflix']\n", "question": "The corresponding actions are: step 1: CLICK: (611, 740)\nstep 2: CLICK: (830, 81)\nstep 3: CLICK: (691, 362)\nstep 4: CLICK: (858, 890)\nstep 5: TYPE: mystery movie on tubi\nstep 6: CLICK: (894, 876)\nstep 7: CLICK: (283, 484)\nstep 8: CLICK: (774, 570)\nstep 9: CLICK: (872, 90)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (611, 740)\nstep 2: CLICK: (830, 81)\nstep 3: CLICK: (691, 362)\nstep 4: CLICK: (858, 890)\nstep 5: TYPE: mystery movie on tubi\nstep 6: CLICK: (894, 876)\nstep 7: CLICK: (283, 484)\nstep 8: CLICK: (774, 570)\nstep 9: CLICK: (872, 90)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Wikipedia', 'Tiktok']\nB: ['Opera', 'Shorts VotTak: Short Video App']\nC: ['Firefox', 'Tubi: Movies & Live TV']\nD: ['Chrome', 'Netflix']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_2_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_2_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_2_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_2_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_2_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_2_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_2_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_2_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_2_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_2_9.png"], "output": "C", "qwen3-vl": "image none"}, {"task": 
"gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['X', 'Google Meet']\nB: ['Threads', 'Zoho Meeting']\nC: ['Gmail', 'Microsoft Teams']\nD: ['Facebook', 'ZOOM Cloud Meetings']\n", "question": "The corresponding actions are: step 1: CLICK: (152, 525)\nstep 2: CLICK: (871, 929)\nstep 3: CLICK: (362, 201)\nstep 4: CLICK: (886, 823)\nstep 5: PRESS_HOME\nstep 6: CLICK: (346, 273)\nstep 7: CLICK: (906, 949)\nstep 8: CLICK: (364, 153)\nstep 9: CLICK: (380, 952)\nstep 10: TYPE: meet.google.com/wtm-nmdy-dav\nstep 11: CLICK: (924, 640)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (152, 525)\nstep 2: CLICK: (871, 929)\nstep 3: CLICK: (362, 201)\nstep 4: CLICK: (886, 823)\nstep 5: PRESS_HOME\nstep 6: CLICK: (346, 273)\nstep 7: CLICK: (906, 949)\nstep 8: CLICK: (364, 153)\nstep 9: CLICK: (380, 952)\nstep 10: TYPE: meet.google.com/wtm-nmdy-dav\nstep 11: CLICK: (924, 640)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['X', 'Google Meet']\nB: ['Threads', 'Zoho Meeting']\nC: ['Gmail', 'Microsoft Teams']\nD: ['Facebook', 'ZOOM Cloud Meetings']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_3_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_3_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_3_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_3_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_3_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_3_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_3_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_3_7.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_3_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_3_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_3_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_3_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['WPS office', 'Firefox']\nB: ['Google Docs', 'wikiHow']\nC: ['Simplenote', 'Bing: chat with AI & GPT4']\nD: ['Dropbox Paper', 'Chrome']\n", "question": "The corresponding actions are: step 1: CLICK: (156, 132)\nstep 2: CLICK: (848, 188)\nstep 3: TYPE: Seoul weather tomorrow\nstep 4: CLICK: (929, 186)\nstep 5: PRESS_HOME\nstep 6: CLICK: (831, 402)\nstep 7: CLICK: (942, 76)\nstep 8: CLICK: (237, 574)\nstep 9: CLICK: (880, 814)\nstep 10: CLICK: (145, 662)\nstep 11: CLICK: (197, 270)\nstep 12: CLICK: (411, 438)\nstep 13: TYPE: Seoul, tomorrow: rain todolist: stay at home and play lol\nstep 14: CLICK: (45, 80)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (156, 132)\nstep 2: CLICK: (848, 188)\nstep 3: TYPE: Seoul weather tomorrow\nstep 4: CLICK: (929, 186)\nstep 5: PRESS_HOME\nstep 6: CLICK: (831, 402)\nstep 7: CLICK: (942, 76)\nstep 8: CLICK: (237, 574)\nstep 9: CLICK: (880, 814)\nstep 10: CLICK: (145, 662)\nstep 11: CLICK: (197, 270)\nstep 12: CLICK: (411, 438)\nstep 13: TYPE: Seoul, tomorrow: rain todolist: stay at home and play lol\nstep 14: CLICK: (45, 80)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['WPS office', 'Firefox']\nB: ['Google Docs', 'wikiHow']\nC: ['Simplenote', 'Bing: chat with AI & GPT4']\nD: ['Dropbox Paper', 'Chrome']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_4_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_4_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_4_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_4_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_4_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_4_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_4_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_4_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_4_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_4_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_4_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_4_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_4_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_4_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_4_14.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: 
['ClevCalc - Calculator', 'DuckDuckGo']\nB: ['Clock', 'Bing: chat with AI & GPT4']\nC: ['Calendar', 'Firefox']\nD: ['DigiCal Calendar Agenda', 'Opera']\n", "question": "The corresponding actions are: step 1: CLICK: (915, 507)\nstep 2: SCROLL: DOWN\nstep 3: CLICK: (628, 72)\nstep 4: TYPE: when is next Olympics opening ceremony\nstep 5: CLICK: (307, 241)\nstep 6: PRESS_RECENT\nstep 7: CLICK: (591, 922)\nstep 8: CLICK: (401, 425)\nstep 9: CLICK: (906, 888)\nstep 10: CLICK: (909, 865)\nstep 11: TYPE: Olympics opening ceremony\nstep 12: CLICK: (433, 198)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (915, 507)\nstep 2: SCROLL: DOWN\nstep 3: CLICK: (628, 72)\nstep 4: TYPE: when is next Olympics opening ceremony\nstep 5: CLICK: (307, 241)\nstep 6: PRESS_RECENT\nstep 7: CLICK: (591, 922)\nstep 8: CLICK: (401, 425)\nstep 9: CLICK: (906, 888)\nstep 10: CLICK: (909, 865)\nstep 11: TYPE: Olympics opening ceremony\nstep 12: CLICK: (433, 198)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['ClevCalc - Calculator', 'DuckDuckGo']\nB: ['Clock', 'Bing: chat with AI & GPT4']\nC: ['Calendar', 'Firefox']\nD: ['DigiCal Calendar Agenda', 'Opera']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_5_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_5_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_5_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_5_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_5_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_5_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_5_6.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_5_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_5_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_5_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_5_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_5_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_5_12.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Yahoo Sports', 'Google Wallet']\nB: ['AP News', 'PayPal - Send, Shop, Manage']\nC: ['Bloomberg: Finance Market News', 'Investing.com']\nD: ['BBC News', 'Venmo']\n", "question": "The corresponding actions are: step 1: CLICK: (674, 351)\nstep 2: CLICK: (908, 74)\nstep 3: CLICK: (302, 69)\nstep 4: TYPE: Nvidia\nstep 5: SCROLL: UP\nstep 6: CLICK: (374, 435)\nstep 7: PRESS_HOME\nstep 8: CLICK: (315, 342)\nstep 9: CLICK: (915, 77)\nstep 10: TYPE: Nvidia\nstep 11: CLICK: (649, 310)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (674, 351)\nstep 2: CLICK: (908, 74)\nstep 3: CLICK: (302, 69)\nstep 4: TYPE: Nvidia\nstep 5: SCROLL: UP\nstep 6: CLICK: (374, 435)\nstep 7: PRESS_HOME\nstep 8: CLICK: (315, 342)\nstep 9: CLICK: (915, 77)\nstep 10: TYPE: Nvidia\nstep 11: CLICK: (649, 310)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Yahoo Sports', 'Google Wallet']\nB: ['AP News', 'PayPal - Send, Shop, Manage']\nC: ['Bloomberg: Finance Market News', 'Investing.com']\nD: ['BBC News', 'Venmo']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_6_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_6_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_6_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_6_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_6_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_6_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_6_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_6_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_6_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_6_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_6_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_6_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Opera', 'Microsoft Word']\nB: ['wikiHow', 'Google Docs']\nC: ['Chrome', 'Simplenote']\nD: ['Edge', 'BasicNote - Notes, Notepad']\n", "question": "The corresponding actions are: step 1: CLICK: (661, 731)\nstep 2: CLICK: (363, 74)\nstep 3: TYPE: weather in Istanbul tomorrow\nstep 4: CLICK: (441, 156)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: 
CLICK: (691, 354)\nstep 8: CLICK: (897, 873)\nstep 9: CLICK: (316, 312)\nstep 10: TYPE: Istanbul,tomorrow:mostly cloudy Todolist:take a trip outsides.\nstep 11: CLICK: (77, 85)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (661, 731)\nstep 2: CLICK: (363, 74)\nstep 3: TYPE: weather in Istanbul tomorrow\nstep 4: CLICK: (441, 156)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (691, 354)\nstep 8: CLICK: (897, 873)\nstep 9: CLICK: (316, 312)\nstep 10: TYPE: Istanbul,tomorrow:mostly cloudy Todolist:take a trip outsides.\nstep 11: CLICK: (77, 85)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Opera', 'Microsoft Word']\nB: ['wikiHow', 'Google Docs']\nC: ['Chrome', 'Simplenote']\nD: ['Edge', 'BasicNote - Notes, Notepad']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_7_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_7_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_7_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_7_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_7_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_7_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_7_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_7_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_7_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_7_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_7_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_7_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", 
"visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['PlantNet', 'Triller', 'Firefox']\nB: ['TradingView: Track All Markets', 'Pluto TV - Live TV and Movies', 'wikiHow']\nC: ['Google Play Store', 'Tiktok', 'Chrome']\nD: ['Setting', 'Youtube', 'Opera']\n", "question": "The corresponding actions are: step 1: CLICK: (844, 124)\nstep 2: CLICK: (677, 943)\nstep 3: CLICK: (516, 911)\nstep 4: CLICK: (259, 136)\nstep 5: TYPE: cooking recipes on youtube\nstep 6: CLICK: (949, 921)\nstep 7: CLICK: (170, 337)\nstep 8: PRESS_HOME\nstep 9: CLICK: (384, 834)\nstep 10: CLICK: (268, 960)\nstep 11: CLICK: (442, 325)\nstep 12: CLICK: (797, 93)\nstep 13: SCROLL: RIGHT\nstep 14: SCROLL: UP\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (844, 124)\nstep 2: CLICK: (677, 943)\nstep 3: CLICK: (516, 911)\nstep 4: CLICK: (259, 136)\nstep 5: TYPE: cooking recipes on youtube\nstep 6: CLICK: (949, 921)\nstep 7: CLICK: (170, 337)\nstep 8: PRESS_HOME\nstep 9: CLICK: (384, 834)\nstep 10: CLICK: (268, 960)\nstep 11: CLICK: (442, 325)\nstep 12: CLICK: (797, 93)\nstep 13: SCROLL: RIGHT\nstep 14: SCROLL: UP\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['PlantNet', 'Triller', 'Firefox']\nB: ['TradingView: Track All Markets', 'Pluto TV - Live TV and Movies', 'wikiHow']\nC: ['Google Play Store', 'Tiktok', 'Chrome']\nD: ['Setting', 'Youtube', 'Opera']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_8_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_8_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_8_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_8_3.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_8_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_8_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_8_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_8_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_8_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_8_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_8_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_8_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_8_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_8_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_8_14.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Play Store', 'Opera']\nB: ['TradingView: Track All Markets', 'Chrome']\nC: ['PlantNet', 'DuckDuckGo']\nD: ['Vaulty:Hide Pictures Videos', 'Edge']\n", "question": "The corresponding actions are: step 1: CLICK: (614, 404)\nstep 2: CLICK: (366, 137)\nstep 3: TYPE: Grocery Shopping Apps\nstep 4: CLICK: (923, 916)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (161, 656)\nstep 9: CLICK: (363, 77)\nstep 10: TYPE: AnyList\nstep 11: CLICK: (918, 909)\nstep 12: CLICK: (860, 311)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (614, 404)\nstep 2: CLICK: (366, 137)\nstep 3: TYPE: Grocery Shopping Apps\nstep 4: CLICK: (923, 916)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (161, 656)\nstep 9: CLICK: (363, 77)\nstep 10: TYPE: AnyList\nstep 11: CLICK: (918, 909)\nstep 12: CLICK: (860, 311)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Opera']\nB: ['TradingView: Track All Markets', 'Chrome']\nC: ['PlantNet', 'DuckDuckGo']\nD: ['Vaulty:Hide Pictures Videos', 'Edge']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_9_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_9_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_9_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_9_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_9_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_9_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_9_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_9_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_9_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_9_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_9_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_9_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_9_12.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Threads', 'iHeart: Music, Radio, Podcasts']\nB: ['Instagram', 'Spotify']\nC: ['X', 'Pandora']\nD: ['Gmail', 'YT Music']\n", "question": "The corresponding actions are: step 1: CLICK: (141, 718)\nstep 2: CLICK: (410, 57)\nstep 3: CLICK: (985, 
63)\nstep 4: TYPE: Pop\nstep 5: CLICK: (892, 674)\nstep 6: CLICK: (485, 141)\nstep 7: CLICK: (416, 327)\nstep 8: CLICK: (863, 613)\nstep 9: PRESS_HOME\nstep 10: CLICK: (576, 104)\nstep 11: CLICK: (19, 481)\nstep 12: CLICK: (156, 138)\nstep 13: CLICK: (40, 204)\nstep 14: TYPE: Popular\nstep 15: CLICK: (962, 423)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (141, 718)\nstep 2: CLICK: (410, 57)\nstep 3: CLICK: (985, 63)\nstep 4: TYPE: Pop\nstep 5: CLICK: (892, 674)\nstep 6: CLICK: (485, 141)\nstep 7: CLICK: (416, 327)\nstep 8: CLICK: (863, 613)\nstep 9: PRESS_HOME\nstep 10: CLICK: (576, 104)\nstep 11: CLICK: (19, 481)\nstep 12: CLICK: (156, 138)\nstep 13: CLICK: (40, 204)\nstep 14: TYPE: Popular\nstep 15: CLICK: (962, 423)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Threads', 'iHeart: Music, Radio, Podcasts']\nB: ['Instagram', 'Spotify']\nC: ['X', 'Pandora']\nD: ['Gmail', 'YT Music']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_9.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_10_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Pluto TV - Live TV and Movies', 'Plantin']\nB: ['Tiktok', 'Picturethis']\nC: ['Triller', 'Tripadvisor']\nD: ['Likee', 'Google Play Store']\n", "question": "The corresponding actions are: step 1: CLICK: (855, 743)\nstep 2: CLICK: (563, 248)\nstep 3: PRESS_HOME\nstep 4: CLICK: (397, 726)\nstep 5: PRESS_HOME\nstep 6: CLICK: (822, 739)\nstep 7: CLICK: (72, 76)\nstep 8: CLICK: (869, 476)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (855, 743)\nstep 2: CLICK: (563, 248)\nstep 3: PRESS_HOME\nstep 4: CLICK: (397, 726)\nstep 5: PRESS_HOME\nstep 6: CLICK: (822, 739)\nstep 7: CLICK: (72, 76)\nstep 8: CLICK: (869, 476)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Pluto TV - Live TV and Movies', 'Plantin']\nB: ['Tiktok', 'Picturethis']\nC: ['Triller', 'Tripadvisor']\nD: ['Likee', 'Google Play Store']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_11_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_11_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_11_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_11_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_11_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_11_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_11_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_11_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_11_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Youtube', 'Contacts', 'iNaturalist']\nB: ['Netflix', 'PlantNet', 'Applock Pro - APP Lock & Guard']\nC: ['Triller', 'Google Play Store', 'Setting']\nD: ['Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos', 'Tripadvisor']\n", "question": "The corresponding actions are: step 1: CLICK: (668, 916)\nstep 2: CLICK: (384, 79)\nstep 3: TYPE: triller\nstep 4: CLICK: (880, 891)\nstep 5: CLICK: (872, 437)\nstep 6: PRESS_HOME\nstep 7: CLICK: (558, 929)\nstep 8: CLICK: (455, 498)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete 
GUI navigation episode. The corresponding actions are: step 1: CLICK: (668, 916)\nstep 2: CLICK: (384, 79)\nstep 3: TYPE: triller\nstep 4: CLICK: (880, 891)\nstep 5: CLICK: (872, 437)\nstep 6: PRESS_HOME\nstep 7: CLICK: (558, 929)\nstep 8: CLICK: (455, 498)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Youtube', 'Contacts', 'iNaturalist']\nB: ['Netflix', 'PlantNet', 'Applock Pro - APP Lock & Guard']\nC: ['Triller', 'Google Play Store', 'Setting']\nD: ['Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos', 'Tripadvisor']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_12_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_12_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_12_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_12_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_12_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_12_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_12_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_12_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_12_8.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Zoho Meeting', 'Threads']\nB: ['Google Meet', 'Instagram']\nC: ['Microsoft Teams', 'Facebook']\nD: ['ZOOM Cloud Meetings', 'Messenger']\n", "question": "The corresponding actions are: step 1: CLICK: (783, 692)\nstep 2: CLICK: (293, 376)\nstep 3: CLICK: (496, 488)\nstep 4: CLICK: (489, 76)\nstep 5: PRESS_HOME\nstep 6: CLICK: (670, 160)\nstep 7: CLICK: (403, 528)\nstep 8: CLICK: (464, 910)\nstep 9: TYPE: 9298916954\nstep 10: SCROLL: UP\nstep 11: CLICK: (763, 515)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI 
navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (783, 692)\nstep 2: CLICK: (293, 376)\nstep 3: CLICK: (496, 488)\nstep 4: CLICK: (489, 76)\nstep 5: PRESS_HOME\nstep 6: CLICK: (670, 160)\nstep 7: CLICK: (403, 528)\nstep 8: CLICK: (464, 910)\nstep 9: TYPE: 9298916954\nstep 10: SCROLL: UP\nstep 11: CLICK: (763, 515)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Zoho Meeting', 'Threads']\nB: ['Google Meet', 'Instagram']\nC: ['Microsoft Teams', 'Facebook']\nD: ['ZOOM Cloud Meetings', 'Messenger']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_13_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_13_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_13_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_13_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_13_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_13_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_13_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_13_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_13_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_13_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_13_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_13_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['ChatOn - AI Chat Bot Assistant', 'Quora']\nB: ['WOMBO Dream-AI Art Generator', 'Bing: chat with AI & GPT4']\nC: ['Microsoft Copilot', 'Opera']\nD: ['ChatGPT', 'wikiHow']\n", "question": "The corresponding actions are: 
step 1: CLICK: (565, 403)\nstep 2: TYPE: tell me about Bolzano-Weierstrass theorem\nstep 3: CLICK: (694, 418)\nstep 4: SCROLL: UP\nstep 5: PRESS_HOME\nstep 6: CLICK: (138, 413)\nstep 7: CLICK: (210, 111)\nstep 8: TYPE: Bolzano-Weierstrass theorem\nstep 9: CLICK: (889, 697)\nstep 10: SCROLL: UP\nstep 11: CLICK: (357, 615)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (565, 403)\nstep 2: TYPE: tell me about Bolzano-Weierstrass theorem\nstep 3: CLICK: (694, 418)\nstep 4: SCROLL: UP\nstep 5: PRESS_HOME\nstep 6: CLICK: (138, 413)\nstep 7: CLICK: (210, 111)\nstep 8: TYPE: Bolzano-Weierstrass theorem\nstep 9: CLICK: (889, 697)\nstep 10: SCROLL: UP\nstep 11: CLICK: (357, 615)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['ChatOn - AI Chat Bot Assistant', 'Quora']\nB: ['WOMBO Dream-AI Art Generator', 'Bing: chat with AI & GPT4']\nC: ['Microsoft Copilot', 'Opera']\nD: ['ChatGPT', 'wikiHow']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_14_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_14_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_14_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_14_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_14_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_14_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_14_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_14_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_14_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_14_9.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_14_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_14_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Threads', 'DuckDuckgo']\nB: ['Whatsapp', 'Firefox']\nC: ['Tumblr', 'Edge']\nD: ['Instagram', 'Bing: chat with AI & GPT4']\n", "question": "The corresponding actions are: step 1: CLICK: (573, 704)\nstep 2: CLICK: (170, 67)\nstep 3: TYPE: Japan\nstep 4: CLICK: (903, 693)\nstep 5: CLICK: (128, 454)\nstep 6: CLICK: (983, 68)\nstep 7: CLICK: (855, 193)\nstep 8: CLICK: (706, 599)\nstep 9: PRESS_HOME\nstep 10: CLICK: (148, 254)\nstep 11: CLICK: (495, 921)\nstep 12: CLICK: (474, 492)\nstep 13: CLICK: (687, 424)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (573, 704)\nstep 2: CLICK: (170, 67)\nstep 3: TYPE: Japan\nstep 4: CLICK: (903, 693)\nstep 5: CLICK: (128, 454)\nstep 6: CLICK: (983, 68)\nstep 7: CLICK: (855, 193)\nstep 8: CLICK: (706, 599)\nstep 9: PRESS_HOME\nstep 10: CLICK: (148, 254)\nstep 11: CLICK: (495, 921)\nstep 12: CLICK: (474, 492)\nstep 13: CLICK: (687, 424)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Threads', 'DuckDuckgo']\nB: ['Whatsapp', 'Firefox']\nC: ['Tumblr', 'Edge']\nD: ['Instagram', 'Bing: chat with AI & GPT4']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_15_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_15_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_15_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_15_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_15_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_15_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_15_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_15_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_15_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_15_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_15_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_15_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_15_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_15_13.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Calculator', 'Tripadvisor']\nB: ['aCalendar', 'TradingView: Track All Markets']\nC: ['ClevCalc - Calculator', 'Contacts']\nD: ['Clock', 'Setting']\n", "question": 
"The corresponding actions are: step 1: CLICK: (453, 491)\nstep 2: SCROLL: UP\nstep 3: CLICK: (160, 682)\nstep 4: SCROLL: UP\nstep 5: CLICK: (437, 672)\nstep 6: CLICK: (480, 427)\nstep 7: CLICK: (480, 427)\nstep 8: PRESS_HOME\nstep 9: SCROLL: UP\nstep 10: CLICK: (795, 766)\nstep 11: CLICK: (781, 846)\nstep 12: CLICK: (651, 599)\nstep 13: CLICK: (796, 724)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (453, 491)\nstep 2: SCROLL: UP\nstep 3: CLICK: (160, 682)\nstep 4: SCROLL: UP\nstep 5: CLICK: (437, 672)\nstep 6: CLICK: (480, 427)\nstep 7: CLICK: (480, 427)\nstep 8: PRESS_HOME\nstep 9: SCROLL: UP\nstep 10: CLICK: (795, 766)\nstep 11: CLICK: (781, 846)\nstep 12: CLICK: (651, 599)\nstep 13: CLICK: (796, 724)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Calculator', 'Tripadvisor']\nB: ['aCalendar', 'TradingView: Track All Markets']\nC: ['ClevCalc - Calculator', 'Contacts']\nD: ['Clock', 'Setting']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_16_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_16_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_16_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_16_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_16_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_16_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_16_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_16_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_16_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_16_9.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_16_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_16_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_16_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_16_13.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Waze Navigation & Live Traffic', 'Plantin']\nB: ['Uber', 'TradingView: Track All Markets']\nC: ['Yandex Navigator', 'Contacts']\nD: ['Citymapper', 'Google Play Store']\n", "question": "The corresponding actions are: step 1: CLICK: (837, 522)\nstep 2: CLICK: (505, 466)\nstep 3: CLICK: (371, 294)\nstep 4: TYPE: park\nstep 5: CLICK: (540, 465)\nstep 6: PRESS_RECENT\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (415, 561)\nstep 9: CLICK: (333, 86)\nstep 10: CLICK: (304, 429)\nstep 11: CLICK: (384, 554)\nstep 12: CLICK: (427, 317)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (837, 522)\nstep 2: CLICK: (505, 466)\nstep 3: CLICK: (371, 294)\nstep 4: TYPE: park\nstep 5: CLICK: (540, 465)\nstep 6: PRESS_RECENT\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (415, 561)\nstep 9: CLICK: (333, 86)\nstep 10: CLICK: (304, 429)\nstep 11: CLICK: (384, 554)\nstep 12: CLICK: (427, 317)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Waze Navigation & Live Traffic', 'Plantin']\nB: ['Uber', 'TradingView: Track All Markets']\nC: ['Yandex Navigator', 'Contacts']\nD: ['Citymapper', 'Google Play Store']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_17_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_17_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_17_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_17_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_17_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_17_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_17_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_17_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_17_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_17_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_17_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_17_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_17_12.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['iNaturalist', 'X']\nB: ['Applock Pro - APP Lock & Guard', 'Messenger']\nC: ['PlantNet', 'Tumblr']\nD: ['Vaulty:Hide Pictures Videos', 'Instagram']\n", "question": "The corresponding actions are: step 1: CLICK: (603, 
526)\nstep 2: CLICK: (380, 251)\nstep 3: CLICK: (299, 150)\nstep 4: CLICK: (447, 558)\nstep 5: CLICK: (237, 942)\nstep 6: SCROLL: UP\nstep 7: CLICK: (862, 520)\nstep 8: CLICK: (382, 483)\nstep 9: CLICK: (920, 72)\nstep 10: CLICK: (920, 72)\nstep 11: CLICK: (451, 956)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (603, 526)\nstep 2: CLICK: (380, 251)\nstep 3: CLICK: (299, 150)\nstep 4: CLICK: (447, 558)\nstep 5: CLICK: (237, 942)\nstep 6: SCROLL: UP\nstep 7: CLICK: (862, 520)\nstep 8: CLICK: (382, 483)\nstep 9: CLICK: (920, 72)\nstep 10: CLICK: (920, 72)\nstep 11: CLICK: (451, 956)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['iNaturalist', 'X']\nB: ['Applock Pro - APP Lock & Guard', 'Messenger']\nC: ['PlantNet', 'Tumblr']\nD: ['Vaulty:Hide Pictures Videos', 'Instagram']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_18_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_18_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_18_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_18_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_18_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_18_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_18_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_18_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_18_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_18_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_18_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_18_11.png"], "output": "D", 
"qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Keep', 'X']\nB: ['Simplenote', 'Whatsapp']\nC: ['Microsoft Word', 'Threads']\nD: ['WPS office', 'Facebook']\n", "question": "The corresponding actions are: step 1: CLICK: (918, 676)\nstep 2: CLICK: (961, 330)\nstep 3: CLICK: (672, 417)\nstep 4: SCROLL: UP\nstep 5: CLICK: (393, 595)\nstep 6: CLICK: (738, 62)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (918, 676)\nstep 2: CLICK: (961, 330)\nstep 3: CLICK: (672, 417)\nstep 4: SCROLL: UP\nstep 5: CLICK: (393, 595)\nstep 6: CLICK: (738, 62)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Keep', 'X']\nB: ['Simplenote', 'Whatsapp']\nC: ['Microsoft Word', 'Threads']\nD: ['WPS office', 'Facebook']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_19_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_19_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_19_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_19_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_19_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_19_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_19_6.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Instagram', 'Google Play Store', 'Setting']\nB: ['Gmail', 'iNaturalist', 'Applock Pro - APP Lock & Guard']\nC: ['Whatsapp', 'PlantNet', 'TradingView: Track All Markets']\nD: ['Tumblr', 'Vaulty:Hide Pictures 
Videos', 'Google Play Store']\n", "question": "The corresponding actions are: step 1: CLICK: (131, 706)\nstep 2: CLICK: (150, 76)\nstep 3: TYPE: tiktok\nstep 4: CLICK: (156, 143)\nstep 5: CLICK: (657, 372)\nstep 6: PRESS_HOME\nstep 7: CLICK: (417, 711)\nstep 8: PRESS_HOME\nstep 9: CLICK: (136, 699)\nstep 10: CLICK: (138, 56)\nstep 11: CLICK: (980, 64)\nstep 12: TYPE: Instagram\nstep 13: CLICK: (225, 135)\nstep 14: CLICK: (540, 373)\nstep 15: CLICK: (691, 376)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (131, 706)\nstep 2: CLICK: (150, 76)\nstep 3: TYPE: tiktok\nstep 4: CLICK: (156, 143)\nstep 5: CLICK: (657, 372)\nstep 6: PRESS_HOME\nstep 7: CLICK: (417, 711)\nstep 8: PRESS_HOME\nstep 9: CLICK: (136, 699)\nstep 10: CLICK: (138, 56)\nstep 11: CLICK: (980, 64)\nstep 12: TYPE: Instagram\nstep 13: CLICK: (225, 135)\nstep 14: CLICK: (540, 373)\nstep 15: CLICK: (691, 376)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Instagram', 'Google Play Store', 'Setting']\nB: ['Gmail', 'iNaturalist', 'Applock Pro - APP Lock & Guard']\nC: ['Whatsapp', 'PlantNet', 'TradingView: Track All Markets']\nD: ['Tumblr', 'Vaulty:Hide Pictures Videos', 'Google Play Store']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_6.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_20_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Play Store', 'Chrome']\nB: ['PlantNet', 'Bing: chat with AI & GPT4']\nC: ['TradingView: Track All Markets', 'Quora']\nD: ['Contacts', 'wikiHow']\n", "question": "The corresponding actions are: step 1: CLICK: (687, 745)\nstep 2: CLICK: (358, 388)\nstep 3: TYPE: Task Manager Apps\nstep 4: CLICK: (920, 881)\nstep 5: PRESS_HOME\nstep 6: CLICK: (502, 755)\nstep 7: CLICK: (383, 84)\nstep 8: TYPE: Todoist\nstep 9: CLICK: (918, 882)\nstep 10: CLICK: (861, 465)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (687, 745)\nstep 2: CLICK: (358, 388)\nstep 3: TYPE: Task Manager Apps\nstep 4: CLICK: (920, 881)\nstep 5: PRESS_HOME\nstep 6: CLICK: (502, 755)\nstep 7: CLICK: (383, 84)\nstep 8: TYPE: Todoist\nstep 9: CLICK: (918, 882)\nstep 10: CLICK: (861, 465)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Chrome']\nB: ['PlantNet', 'Bing: chat with AI & GPT4']\nC: ['TradingView: Track All Markets', 'Quora']\nD: ['Contacts', 'wikiHow']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_21_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_21_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_21_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_21_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_21_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_21_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_21_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_21_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_21_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_21_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_21_10.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Threads', 'Chatbot AI & Smart Assistant']\nB: ['Tumblr', 'WOMBO Dream-AI Art Generator']\nC: ['Whatsapp', 'Remix:AI Image Creator']\nD: ['X', 'Picsart AI Photo Editor,Video']\n", "question": "The corresponding actions are: step 1: CLICK: (158, 514)\nstep 2: CLICK: (469, 935)\nstep 3: CLICK: (498, 315)\nstep 4: CLICK: (231, 869)\nstep 5: CLICK: (85, 509)\nstep 6: SCROLL: UP\nstep 7: CLICK: (931, 74)\nstep 8: CLICK: (775, 
83)\nstep 9: CLICK: (647, 655)\nstep 10: CLICK: (323, 772)\nstep 11: CLICK: (899, 934)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (158, 514)\nstep 2: CLICK: (469, 935)\nstep 3: CLICK: (498, 315)\nstep 4: CLICK: (231, 869)\nstep 5: CLICK: (85, 509)\nstep 6: SCROLL: UP\nstep 7: CLICK: (931, 74)\nstep 8: CLICK: (775, 83)\nstep 9: CLICK: (647, 655)\nstep 10: CLICK: (323, 772)\nstep 11: CLICK: (899, 934)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Threads', 'Chatbot AI & Smart Assistant']\nB: ['Tumblr', 'WOMBO Dream-AI Art Generator']\nC: ['Whatsapp', 'Remix:AI Image Creator']\nD: ['X', 'Picsart AI Photo Editor,Video']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_22_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_22_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_22_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_22_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_22_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_22_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_22_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_22_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_22_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_22_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_22_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_22_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Vaulty:Hide Pictures 
Videos', 'Triller', 'Tripadvisor']\nB: ['Contacts', 'Pluto TV - Live TV and Movies', 'Vaulty:Hide Pictures Videos']\nC: ['Applock Pro - APP Lock & Guard', 'Youtube', 'Picturethis']\nD: ['Google Play Store', 'Likee', 'Setting']\n", "question": "The corresponding actions are: step 1: CLICK: (830, 742)\nstep 2: CLICK: (863, 471)\nstep 3: PRESS_HOME\nstep 4: CLICK: (352, 742)\nstep 5: CLICK: (338, 465)\nstep 6: SCROLL: RIGHT\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (830, 742)\nstep 2: CLICK: (863, 471)\nstep 3: PRESS_HOME\nstep 4: CLICK: (352, 742)\nstep 5: CLICK: (338, 465)\nstep 6: SCROLL: RIGHT\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Vaulty:Hide Pictures Videos', 'Triller', 'Tripadvisor']\nB: ['Contacts', 'Pluto TV - Live TV and Movies', 'Vaulty:Hide Pictures Videos']\nC: ['Applock Pro - APP Lock & Guard', 'Youtube', 'Picturethis']\nD: ['Google Play Store', 'Likee', 'Setting']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_23_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_23_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_23_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_23_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_23_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_23_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_23_6.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Netflix', 'Audible: Audio Entertainment']\nB: ['Likee', 'Amazon Kindle']\nC: ['Tiktok', 'Google Play Books & 
Audiobooks']\nD: ['Pluto TV - Live TV and Movies', 'Libby, by OverDrive']\n", "question": "The corresponding actions are: step 1: CLICK: (402, 641)\nstep 2: CLICK: (937, 77)\nstep 3: TYPE: American Civil War\nstep 4: CLICK: (888, 73)\nstep 5: CLICK: (204, 576)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (387, 122)\nstep 9: CLICK: (385, 76)\nstep 10: TYPE: American Civil War\nstep 11: CLICK: (303, 292)\nstep 12: CLICK: (708, 313)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (402, 641)\nstep 2: CLICK: (937, 77)\nstep 3: TYPE: American Civil War\nstep 4: CLICK: (888, 73)\nstep 5: CLICK: (204, 576)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (387, 122)\nstep 9: CLICK: (385, 76)\nstep 10: TYPE: American Civil War\nstep 11: CLICK: (303, 292)\nstep 12: CLICK: (708, 313)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Netflix', 'Audible: Audio Entertainment']\nB: ['Likee', 'Amazon Kindle']\nC: ['Tiktok', 'Google Play Books & Audiobooks']\nD: ['Pluto TV - Live TV and Movies', 'Libby, by OverDrive']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_24_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_24_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_24_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_24_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_24_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_24_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_24_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_24_7.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_24_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_24_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_24_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_24_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_24_12.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Everand', 'Picturethis']\nB: ['Libby, by OverDrive', 'iNaturalist']\nC: ['Ploter - Ebook, Audiobook, PDF', 'Setting']\nD: ['Pocket FM: Audio Series', 'Tripadvisor']\n", "question": "The corresponding actions are: step 1: CLICK: (381, 534)\nstep 2: CLICK: (218, 958)\nstep 3: CLICK: (879, 656)\nstep 4: PRESS_HOME\nstep 5: CLICK: (210, 526)\nstep 6: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (381, 534)\nstep 2: CLICK: (218, 958)\nstep 3: CLICK: (879, 656)\nstep 4: PRESS_HOME\nstep 5: CLICK: (210, 526)\nstep 6: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Everand', 'Picturethis']\nB: ['Libby, by OverDrive', 'iNaturalist']\nC: ['Ploter - Ebook, Audiobook, PDF', 'Setting']\nD: ['Pocket FM: Audio Series', 'Tripadvisor']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_25_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_25_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_25_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_25_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_25_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_25_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Plantin', 'Pocket FM: Audio Series']\nB: ['Contacts', 'Ploter - Ebook, Audiobook, PDF']\nC: ['Setting', 'Libby, by OverDrive']\nD: ['Picturethis', 'Amazon Kindle']\n", "question": "The corresponding actions are: step 1: CLICK: (547, 654)\nstep 2: SCROLL: UP\nstep 3: CLICK: (477, 647)\nstep 4: CLICK: (904, 717)\nstep 5: PRESS_HOME\nstep 6: CLICK: (820, 513)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (547, 654)\nstep 2: SCROLL: UP\nstep 3: CLICK: (477, 647)\nstep 4: CLICK: (904, 717)\nstep 5: PRESS_HOME\nstep 6: CLICK: (820, 513)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Plantin', 'Pocket FM: Audio Series']\nB: ['Contacts', 'Ploter - Ebook, Audiobook, PDF']\nC: ['Setting', 'Libby, by OverDrive']\nD: ['Picturethis', 'Amazon Kindle']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_26_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_26_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_26_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_26_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_26_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_26_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_26_6.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Bing: chat with AI & GPT4', 'Google Keep']\nB: ['Quora', 'Dropbox Paper']\nC: ['Wikipedia', 'Notepad - Notes and To Do List']\nD: ['DuckDuckgo', 'Simplenote']\n", "question": "The corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (293, 118)\nstep 3: CLICK: (354, 46)\nstep 4: CLICK: (970, 68)\nstep 5: TYPE: carrot cake ingredients\nstep 6: CLICK: (167, 154)\nstep 7: PRESS_HOME\nstep 8: CLICK: (289, 417)\nstep 9: CLICK: (271, 891)\nstep 10: TYPE: shopping list for making carrot cake: baking powder, carrot, ground allspice\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (293, 118)\nstep 3: CLICK: (354, 46)\nstep 4: CLICK: (970, 68)\nstep 5: TYPE: carrot cake ingredients\nstep 6: CLICK: (167, 154)\nstep 7: PRESS_HOME\nstep 8: CLICK: (289, 417)\nstep 9: CLICK: (271, 891)\nstep 10: TYPE: shopping list for making carrot cake: baking powder, carrot, ground allspice\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Bing: chat with AI & GPT4', 'Google Keep']\nB: ['Quora', 'Dropbox Paper']\nC: ['Wikipedia', 'Notepad - Notes and To Do List']\nD: ['DuckDuckgo', 'Simplenote']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_27_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_27_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_27_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_27_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_27_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_27_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_27_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_27_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_27_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_27_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_27_10.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Contacts', 'Threads']\nB: ['Plantin', 'X']\nC: ['Setting', 'Facebook']\nD: ['Google Play Store', 'Instagram']\n", "question": "The corresponding actions are: step 1: CLICK: (936, 76)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (933, 909)\nstep 4: PRESS_HOME\nstep 5: CLICK: (375, 822)\nstep 6: PRESS_HOME\nstep 7: CLICK: (844, 838)\nstep 8: CLICK: (819, 
70)\nstep 9: CLICK: (953, 69)\nstep 10: TYPE: instagram\nstep 11: CLICK: (900, 912)\nstep 12: CLICK: (864, 323)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (936, 76)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (933, 909)\nstep 4: PRESS_HOME\nstep 5: CLICK: (375, 822)\nstep 6: PRESS_HOME\nstep 7: CLICK: (844, 838)\nstep 8: CLICK: (819, 70)\nstep 9: CLICK: (953, 69)\nstep 10: TYPE: instagram\nstep 11: CLICK: (900, 912)\nstep 12: CLICK: (864, 323)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Contacts', 'Threads']\nB: ['Plantin', 'X']\nC: ['Setting', 'Facebook']\nD: ['Google Play Store', 'Instagram']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_28_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_28_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_28_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_28_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_28_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_28_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_28_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_28_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_28_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_28_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_28_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_28_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_28_12.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": 
"GUI-Odyssey", "options": "A: ['Lightroom Photo & Video Editor', 'Tumblr']\nB: ['Adobe Express: AI Video Design', 'Whatsapp']\nC: ['Gallery-photo gallery,album', 'Instagram']\nD: ['Textify- Art Font Photo Editor', 'Facebook']\n", "question": "The corresponding actions are: step 1: CLICK: (822, 533)\nstep 2: CLICK: (342, 357)\nstep 3: SCROLL: UP\nstep 4: CLICK: (241, 641)\nstep 5: CLICK: (74, 934)\nstep 6: SCROLL: LEFT\nstep 7: CLICK: (918, 68)\nstep 8: CLICK: (938, 66)\nstep 9: CLICK: (784, 704)\nstep 10: SCROLL: UP\nstep 11: SCROLL: UP\nstep 12: CLICK: (114, 526)\nstep 13: CLICK: (400, 535)\nstep 14: CLICK: (875, 74)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (822, 533)\nstep 2: CLICK: (342, 357)\nstep 3: SCROLL: UP\nstep 4: CLICK: (241, 641)\nstep 5: CLICK: (74, 934)\nstep 6: SCROLL: LEFT\nstep 7: CLICK: (918, 68)\nstep 8: CLICK: (938, 66)\nstep 9: CLICK: (784, 704)\nstep 10: SCROLL: UP\nstep 11: SCROLL: UP\nstep 12: CLICK: (114, 526)\nstep 13: CLICK: (400, 535)\nstep 14: CLICK: (875, 74)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Lightroom Photo & Video Editor', 'Tumblr']\nB: ['Adobe Express: AI Video Design', 'Whatsapp']\nC: ['Gallery-photo gallery,album', 'Instagram']\nD: ['Textify- Art Font Photo Editor', 'Facebook']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_29_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_29_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_29_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_29_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_29_4.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_29_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_29_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_29_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_29_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_29_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_29_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_29_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_29_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_29_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_29_14.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Calculator', 'Tumblr']\nB: ['Google Drive', 'Instagram']\nC: ['Basic Calculator: GPA & Math', 'Threads']\nD: ['All-In-One Calculator', 'Messenger']\n", "question": "The corresponding actions are: step 1: CLICK: (148, 537)\nstep 2: SCROLL: UP\nstep 3: CLICK: (905, 352)\nstep 4: CLICK: (273, 525)\nstep 5: PRESS_HOME\nstep 6: CLICK: (604, 160)\nstep 7: CLICK: (884, 147)\nstep 8: CLICK: (304, 677)\nstep 9: CLICK: (166, 903)\nstep 10: TYPE: https://docs.google.com/document/d/1F1mYgUNic6AwXcwpgI0fdfotTjP8R_v4ogXTNTK6u7U/edit?usp=drivesdk\nstep 11: CLICK: (862, 431)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (148, 537)\nstep 2: SCROLL: UP\nstep 3: CLICK: (905, 352)\nstep 4: CLICK: (273, 525)\nstep 5: PRESS_HOME\nstep 6: CLICK: (604, 160)\nstep 7: CLICK: (884, 147)\nstep 8: CLICK: (304, 677)\nstep 9: CLICK: (166, 903)\nstep 10: TYPE: https://docs.google.com/document/d/1F1mYgUNic6AwXcwpgI0fdfotTjP8R_v4ogXTNTK6u7U/edit?usp=drivesdk\nstep 11: CLICK: (862, 431)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Calculator', 'Tumblr']\nB: ['Google Drive', 'Instagram']\nC: ['Basic Calculator: GPA & Math', 'Threads']\nD: ['All-In-One Calculator', 'Messenger']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_30_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_30_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_30_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_30_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_30_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_30_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_30_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_30_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_30_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_30_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_30_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_30_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Play Store', 'Triller']\nB: ['Plantin', 'Tiktok']\nC: ['iNaturalist', 'Pluto TV - Live TV and Movies']\nD: ['Setting', 'YouTube']\n", "question": "The corresponding actions are: step 1: CLICK: (846, 824)\nstep 2: CLICK: (346, 511)\nstep 3: CLICK: 
(315, 623)\nstep 4: CLICK: (824, 74)\nstep 5: TYPE: YouTube\nstep 6: CLICK: (259, 147)\nstep 7: CLICK: (340, 520)\nstep 8: CLICK: (902, 404)\nstep 9: PRESS_HOME\nstep 10: SCROLL: RIGHT\nstep 11: CLICK: (846, 658)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (846, 824)\nstep 2: CLICK: (346, 511)\nstep 3: CLICK: (315, 623)\nstep 4: CLICK: (824, 74)\nstep 5: TYPE: YouTube\nstep 6: CLICK: (259, 147)\nstep 7: CLICK: (340, 520)\nstep 8: CLICK: (902, 404)\nstep 9: PRESS_HOME\nstep 10: SCROLL: RIGHT\nstep 11: CLICK: (846, 658)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Triller']\nB: ['Plantin', 'Tiktok']\nC: ['iNaturalist', 'Pluto TV - Live TV and Movies']\nD: ['Setting', 'YouTube']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_31_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_31_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_31_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_31_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_31_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_31_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_31_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_31_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_31_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_31_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_31_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_31_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", 
"visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Calendar', 'Opera News']\nB: ['Simple Calendar - easy planner', 'NewsBreak']\nC: ['Files', 'Microsoft News']\nD: ['aCalendar', 'SmartNews:News That Matters']\n", "question": "The corresponding actions are: step 1: PRESS_HOME\nstep 2: SCROLL: LEFT\nstep 3: CLICK: (143, 141)\nstep 4: CLICK: (947, 70)\nstep 5: TYPE: ai\nstep 6: CLICK: (306, 122)\nstep 7: SCROLL: UP\nstep 8: IMPOSSIBLE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: PRESS_HOME\nstep 2: SCROLL: LEFT\nstep 3: CLICK: (143, 141)\nstep 4: CLICK: (947, 70)\nstep 5: TYPE: ai\nstep 6: CLICK: (306, 122)\nstep 7: SCROLL: UP\nstep 8: IMPOSSIBLE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Calendar', 'Opera News']\nB: ['Simple Calendar - easy planner', 'NewsBreak']\nC: ['Files', 'Microsoft News']\nD: ['aCalendar', 'SmartNews:News That Matters']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_32_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_32_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_32_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_32_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_32_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_32_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_32_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_32_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Facebook', 'Lazada']\nB: ['X', 'AliExpress']\nC: ['Instagram', 'SSENSE']\nD: ['Gmail', 
'Net-a-Porte']\n", "question": "The corresponding actions are: step 1: CLICK: (609, 497)\nstep 2: CLICK: (82, 52)\nstep 3: CLICK: (387, 54)\nstep 4: CLICK: (918, 55)\nstep 5: TYPE: portable speaker recommendation\nstep 6: CLICK: (914, 913)\nstep 7: CLICK: (576, 675)\nstep 8: PRESS_HOME\nstep 9: CLICK: (797, 116)\nstep 10: CLICK: (190, 109)\nstep 11: TYPE: Anker Soundcore\nstep 12: CLICK: (927, 919)\nstep 13: CLICK: (213, 534)\nstep 14: CLICK: (455, 939)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (609, 497)\nstep 2: CLICK: (82, 52)\nstep 3: CLICK: (387, 54)\nstep 4: CLICK: (918, 55)\nstep 5: TYPE: portable speaker recommendation\nstep 6: CLICK: (914, 913)\nstep 7: CLICK: (576, 675)\nstep 8: PRESS_HOME\nstep 9: CLICK: (797, 116)\nstep 10: CLICK: (190, 109)\nstep 11: TYPE: Anker Soundcore\nstep 12: CLICK: (927, 919)\nstep 13: CLICK: (213, 534)\nstep 14: CLICK: (455, 939)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Facebook', 'Lazada']\nB: ['X', 'AliExpress']\nC: ['Instagram', 'SSENSE']\nD: ['Gmail', 'Net-a-Porte']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_33_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_33_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_33_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_33_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_33_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_33_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_33_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_33_7.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_33_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_33_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_33_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_33_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_33_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_33_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_33_14.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['GPS', 'Vaulty:Hide Pictures Videos']\nB: ['GPS, Maps, Voice Navigation', 'Picturethis']\nC: ['Lyft', 'iNaturalist']\nD: ['Yandex Navigator', 'Google Play Store']\n", "question": "The corresponding actions are: step 1: CLICK: (661, 505)\nstep 2: CLICK: (67, 751)\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (502, 59)\nstep 5: CLICK: (719, 78)\nstep 6: TYPE: sports arena\nstep 7: CLICK: (327, 168)\nstep 8: IMPOSSIBLE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (661, 505)\nstep 2: CLICK: (67, 751)\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (502, 59)\nstep 5: CLICK: (719, 78)\nstep 6: TYPE: sports arena\nstep 7: CLICK: (327, 168)\nstep 8: IMPOSSIBLE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['GPS', 'Vaulty:Hide Pictures Videos']\nB: ['GPS, Maps, Voice Navigation', 'Picturethis']\nC: ['Lyft', 'iNaturalist']\nD: ['Yandex Navigator', 'Google Play Store']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_34_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_34_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_34_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_34_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_34_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_34_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_34_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_34_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Play Store', 'Setting']\nB: ['Plantin', 'PlantNet']\nC: ['Picturethis', 'Picturethis']\nD: ['PlantNet', 'Contacts']\n", "question": "The corresponding actions are: step 1: CLICK: (427, 137)\nstep 2: CLICK: (307, 136)\nstep 3: TYPE: Flipkart\nstep 4: CLICK: (936, 773)\nstep 5: CLICK: (263, 394)\nstep 6: CLICK: (681, 331)\nstep 7: CLICK: (630, 549)\nstep 8: PRESS_HOME\nstep 9: CLICK: (576, 146)\nstep 10: CLICK: (174, 457)\nstep 11: CLICK: (498, 804)\nstep 12: CLICK: (939, 129)\nstep 13: TYPE: Flipkart\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (427, 137)\nstep 2: CLICK: (307, 136)\nstep 3: TYPE: Flipkart\nstep 4: CLICK: (936, 773)\nstep 5: CLICK: (263, 394)\nstep 6: CLICK: (681, 331)\nstep 7: CLICK: (630, 549)\nstep 8: PRESS_HOME\nstep 9: CLICK: (576, 146)\nstep 10: CLICK: (174, 457)\nstep 11: CLICK: (498, 804)\nstep 12: CLICK: (939, 129)\nstep 13: TYPE: Flipkart\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Setting']\nB: ['Plantin', 'PlantNet']\nC: ['Picturethis', 'Picturethis']\nD: ['PlantNet', 'Contacts']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_35_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_35_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_35_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_35_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_35_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_35_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_35_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_35_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_35_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_35_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_35_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_35_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_35_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_35_13.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Keep', 'Bing: chat with AI & GPT4']\nB: ['Simplenote', 'Opera']\nC: ['Microsoft word', 'DuckDuckgo']\nD: ['WPS office', 'Quora']\n", "question": "The 
corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (567, 921)\nstep 3: CLICK: (453, 90)\nstep 4: CLICK: (959, 84)\nstep 5: TYPE: sushi ingredients\nstep 6: CLICK: (238, 170)\nstep 7: PRESS_HOME\nstep 8: CLICK: (927, 667)\nstep 9: SCROLL: RIGHT\nstep 10: SCROLL: RIGHT\nstep 11: CLICK: (583, 503)\nstep 12: CLICK: (240, 532)\nstep 13: TYPE: shopping list for sushi:rice, vinegar, wine\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (567, 921)\nstep 3: CLICK: (453, 90)\nstep 4: CLICK: (959, 84)\nstep 5: TYPE: sushi ingredients\nstep 6: CLICK: (238, 170)\nstep 7: PRESS_HOME\nstep 8: CLICK: (927, 667)\nstep 9: SCROLL: RIGHT\nstep 10: SCROLL: RIGHT\nstep 11: CLICK: (583, 503)\nstep 12: CLICK: (240, 532)\nstep 13: TYPE: shopping list for sushi:rice, vinegar, wine\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Keep', 'Bing: chat with AI & GPT4']\nB: ['Simplenote', 'Opera']\nC: ['Microsoft word', 'DuckDuckgo']\nD: ['WPS office', 'Quora']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_36_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_36_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_36_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_36_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_36_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_36_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_36_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_36_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_36_8.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_36_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_36_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_36_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_36_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_36_13.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Chrome', 'Microsoft to do']\nB: ['Wikipedia', 'Things']\nC: ['Opera', 'TickTick']\nD: ['DuckDuckGo', 'To-Do List']\n", "question": "The corresponding actions are: step 1: CLICK: (386, 135)\nstep 2: CLICK: (500, 73)\nstep 3: TYPE: when is the next super bowl game\nstep 4: CLICK: (951, 917)\nstep 5: PRESS_HOME\nstep 6: CLICK: (627, 657)\nstep 7: CLICK: (918, 853)\nstep 8: TYPE: Feb9 2025 super bowl game\nstep 9: CLICK: (90, 572)\nstep 10: TYPE: \nstep 11: CLICK: (922, 646)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (386, 135)\nstep 2: CLICK: (500, 73)\nstep 3: TYPE: when is the next super bowl game\nstep 4: CLICK: (951, 917)\nstep 5: PRESS_HOME\nstep 6: CLICK: (627, 657)\nstep 7: CLICK: (918, 853)\nstep 8: TYPE: Feb9 2025 super bowl game\nstep 9: CLICK: (90, 572)\nstep 10: TYPE: \nstep 11: CLICK: (922, 646)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Chrome', 'Microsoft to do']\nB: ['Wikipedia', 'Things']\nC: ['Opera', 'TickTick']\nD: ['DuckDuckGo', 'To-Do List']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_37_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_37_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_37_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_37_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_37_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_37_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_37_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_37_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_37_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_37_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_37_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_37_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Booking.com', 'Waze Navigation & Live Traffic']\nB: ['Shazam: Find Music & Concerts', 'Lyft']\nC: ['Apartments.com Rental Search', 'Google Map']\nD: ['Traveloka', 'Yandex Navigator']\n", "question": "The corresponding actions are: step 1: CLICK: (432, 106)\nstep 2: CLICK: (327, 362)\nstep 3: PRESS_HOME\nstep 4: CLICK: (292, 242)\nstep 5: 
CLICK: (110, 82)\nstep 6: TYPE: 2580-2590 California St\nstep 7: CLICK: (183, 182)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (432, 106)\nstep 2: CLICK: (327, 362)\nstep 3: PRESS_HOME\nstep 4: CLICK: (292, 242)\nstep 5: CLICK: (110, 82)\nstep 6: TYPE: 2580-2590 California St\nstep 7: CLICK: (183, 182)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Booking.com', 'Waze Navigation & Live Traffic']\nB: ['Shazam: Find Music & Concerts', 'Lyft']\nC: ['Apartments.com Rental Search', 'Google Map']\nD: ['Traveloka', 'Yandex Navigator']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_38_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_38_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_38_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_38_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_38_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_38_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_38_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_38_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Play Store', 'Tokopedia']\nB: ['Tripadvisor', 'Booking.com']\nC: ['Setting', 'Expedia']\nD: ['Picturethis', 'trip.com']\n", "question": "The corresponding actions are: step 1: CLICK: (215, 670)\nstep 2: CLICK: (275, 73)\nstep 3: TYPE: Tokopedia\nstep 4: CLICK: (850, 884)\nstep 5: CLICK: (726, 388)\nstep 6: CLICK: (889, 384)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI 
navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (215, 670)\nstep 2: CLICK: (275, 73)\nstep 3: TYPE: Tokopedia\nstep 4: CLICK: (850, 884)\nstep 5: CLICK: (726, 388)\nstep 6: CLICK: (889, 384)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Tokopedia']\nB: ['Tripadvisor', 'Booking.com']\nC: ['Setting', 'Expedia']\nD: ['Picturethis', 'trip.com']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_39_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_39_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_39_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_39_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_39_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_39_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_39_6.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Yahoo Sports', 'Instagram']\nB: ['AP News', 'Messenger']\nC: ['CNN Breaking US & World News', 'Whatsapp']\nD: ['NewsBreak', 'Facebook']\n", "question": "The corresponding actions are: step 1: CLICK: (929, 676)\nstep 2: CLICK: (561, 612)\nstep 3: CLICK: (516, 514)\nstep 4: CLICK: (716, 74)\nstep 5: CLICK: (356, 85)\nstep 6: TYPE: Cybersecurity Threats\nstep 7: CLICK: (856, 874)\nstep 8: CLICK: (452, 314)\nstep 9: CLICK: (724, 915)\nstep 10: CLICK: (508, 873)\nstep 11: CLICK: (733, 79)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (929, 676)\nstep 2: CLICK: (561, 612)\nstep 3: CLICK: (516, 514)\nstep 4: CLICK: (716, 74)\nstep 5: CLICK: (356, 85)\nstep 6: TYPE: Cybersecurity Threats\nstep 7: CLICK: (856, 874)\nstep 8: CLICK: (452, 314)\nstep 9: CLICK: (724, 915)\nstep 10: CLICK: (508, 873)\nstep 11: CLICK: (733, 79)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Yahoo Sports', 'Instagram']\nB: ['AP News', 'Messenger']\nC: ['CNN Breaking US & World News', 'Whatsapp']\nD: ['NewsBreak', 'Facebook']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_40_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_40_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_40_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_40_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_40_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_40_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_40_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_40_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_40_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_40_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_40_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_40_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Firefox', 'Google Docs']\nB: ['DuckDuckGo', 'Notepad - Notes and To Do List']\nC: ['Edge', 'Simplenote']\nD: ['Opera', 'Google Keep']\n", "question": "The corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (705, 223)\nstep 3: CLICK: (813, 256)\nstep 4: TYPE: weather in Shanghai tomorrow\nstep 5: CLICK: (930, 248)\nstep 
6: PRESS_HOME\nstep 7: CLICK: (858, 212)\nstep 8: CLICK: (902, 895)\nstep 9: CLICK: (824, 792)\nstep 10: TYPE: Shanghai,tomorrow Todolist: buy a flight to shanghai.\nstep 11: CLICK: (99, 96)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (705, 223)\nstep 3: CLICK: (813, 256)\nstep 4: TYPE: weather in Shanghai tomorrow\nstep 5: CLICK: (930, 248)\nstep 6: PRESS_HOME\nstep 7: CLICK: (858, 212)\nstep 8: CLICK: (902, 895)\nstep 9: CLICK: (824, 792)\nstep 10: TYPE: Shanghai,tomorrow Todolist: buy a flight to shanghai.\nstep 11: CLICK: (99, 96)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Firefox', 'Google Docs']\nB: ['DuckDuckGo', 'Notepad - Notes and To Do List']\nC: ['Edge', 'Simplenote']\nD: ['Opera', 'Google Keep']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_41_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_41_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_41_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_41_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_41_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_41_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_41_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_41_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_41_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_41_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_41_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_41_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": 
"gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['DigiCal Calendar Agenda', 'Quora']\nB: ['All-In-One Calculator', 'Firefox']\nC: ['Calendar', 'Chrome']\nD: ['Simple Calendar - easy planner', 'Edge']\n", "question": "The corresponding actions are: step 1: CLICK: (642, 790)\nstep 2: CLICK: (788, 202)\nstep 3: TYPE: the latest Transformers movie\nstep 4: CLICK: (926, 905)\nstep 5: PRESS_HOME\nstep 6: SCROLL: UP\nstep 7: CLICK: (595, 368)\nstep 8: CLICK: (836, 716)\nstep 9: CLICK: (355, 842)\nstep 10: TYPE: watch the movie Transformers: the Rise of the Beasts\nstep 11: CLICK: (890, 89)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (642, 790)\nstep 2: CLICK: (788, 202)\nstep 3: TYPE: the latest Transformers movie\nstep 4: CLICK: (926, 905)\nstep 5: PRESS_HOME\nstep 6: SCROLL: UP\nstep 7: CLICK: (595, 368)\nstep 8: CLICK: (836, 716)\nstep 9: CLICK: (355, 842)\nstep 10: TYPE: watch the movie Transformers: the Rise of the Beasts\nstep 11: CLICK: (890, 89)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['DigiCal Calendar Agenda', 'Quora']\nB: ['All-In-One Calculator', 'Firefox']\nC: ['Calendar', 'Chrome']\nD: ['Simple Calendar - easy planner', 'Edge']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_42_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_42_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_42_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_42_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_42_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_42_5.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_42_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_42_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_42_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_42_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_42_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_42_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Youtube', 'Google Play Store']\nB: ['Tubi: Movies & Live TV', 'PlantNet']\nC: ['Pluto TV - Live TV and Movies', 'Plantin']\nD: ['Tiktok', 'TradingView: Track All Markets']\n", "question": "The corresponding actions are: step 1: CLICK: (129, 808)\nstep 2: PRESS_HOME\nstep 3: CLICK: (834, 809)\nstep 4: TYPE: Adidas Training App\nstep 5: CLICK: (294, 155)\nstep 6: CLICK: (425, 603)\nstep 7: CLICK: (688, 271)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (129, 808)\nstep 2: PRESS_HOME\nstep 3: CLICK: (834, 809)\nstep 4: TYPE: Adidas Training App\nstep 5: CLICK: (294, 155)\nstep 6: CLICK: (425, 603)\nstep 7: CLICK: (688, 271)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Youtube', 'Google Play Store']\nB: ['Tubi: Movies & Live TV', 'PlantNet']\nC: ['Pluto TV - Live TV and Movies', 'Plantin']\nD: ['Tiktok', 'TradingView: Track All Markets']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_43_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_43_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_43_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_43_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_43_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_43_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_43_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_43_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Triller', 'Vaulty:Hide Pictures Videos']\nB: ['Youtube', 'Google Play Store']\nC: ['Shorts VotTak: Short Video App', 'Contacts']\nD: ['Pluto TV - Live TV and Movies', 'Picturethis']\n", "question": "The corresponding actions are: step 1: CLICK: (585, 837)\nstep 2: PRESS_HOME\nstep 3: CLICK: (835, 842)\nstep 4: CLICK: (815, 73)\nstep 5: CLICK: (927, 75)\nstep 6: TYPE: Centr App\nstep 7: CLICK: (915, 912)\nstep 8: CLICK: (860, 337)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (585, 837)\nstep 2: PRESS_HOME\nstep 3: CLICK: (835, 842)\nstep 4: CLICK: (815, 73)\nstep 5: CLICK: (927, 75)\nstep 6: TYPE: Centr App\nstep 7: CLICK: (915, 912)\nstep 8: CLICK: (860, 337)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Triller', 'Vaulty:Hide Pictures Videos']\nB: ['Youtube', 'Google Play Store']\nC: ['Shorts VotTak: Short Video App', 'Contacts']\nD: ['Pluto TV - Live TV and Movies', 'Picturethis']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_44_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_44_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_44_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_44_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_44_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_44_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_44_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_44_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_44_8.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Kobo Books - eBooks Audiobooks', 'DuckDuckGo']\nB: ['Amazon Kindle', 'Firefox']\nC: ['Audible: Audio Entertainment', 'Chrome']\nD: ['Pocket FM: Audio Series', 'wikiHow']\n", "question": "The corresponding actions are: step 1: CLICK: (802, 311)\nstep 2: CLICK: (249, 82)\nstep 3: TYPE: The Renaissance\nstep 4: CLICK: (166, 216)\nstep 5: CLICK: (170, 417)\nstep 6: PRESS_HOME\nstep 7: CLICK: (561, 311)\nstep 8: CLICK: (196, 86)\nstep 9: TYPE: The Renaissance\nstep 10: CLICK: (158, 167)\nstep 11: CLICK: (316, 327)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", 
"context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (802, 311)\nstep 2: CLICK: (249, 82)\nstep 3: TYPE: The Renaissance\nstep 4: CLICK: (166, 216)\nstep 5: CLICK: (170, 417)\nstep 6: PRESS_HOME\nstep 7: CLICK: (561, 311)\nstep 8: CLICK: (196, 86)\nstep 9: TYPE: The Renaissance\nstep 10: CLICK: (158, 167)\nstep 11: CLICK: (316, 327)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Kobo Books - eBooks Audiobooks', 'DuckDuckGo']\nB: ['Amazon Kindle', 'Firefox']\nC: ['Audible: Audio Entertainment', 'Chrome']\nD: ['Pocket FM: Audio Series', 'wikiHow']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_45_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_45_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_45_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_45_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_45_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_45_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_45_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_45_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_45_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_45_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_45_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_45_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Youtube', 'Setting']\nB: ['Triller', 'Picturethis']\nC: ['Tubi: Movies & Live TV', 'iNaturalist']\nD: ['Netflix', 'Contacts']\n", "question": "The corresponding actions are: step 1: CLICK: 
(813, 470)\nstep 2: CLICK: (211, 417)\nstep 3: PRESS_HOME\nstep 4: CLICK: (866, 607)\nstep 5: CLICK: (233, 776)\nstep 6: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (813, 470)\nstep 2: CLICK: (211, 417)\nstep 3: PRESS_HOME\nstep 4: CLICK: (866, 607)\nstep 5: CLICK: (233, 776)\nstep 6: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Youtube', 'Setting']\nB: ['Triller', 'Picturethis']\nC: ['Tubi: Movies & Live TV', 'iNaturalist']\nD: ['Netflix', 'Contacts']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_46_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_46_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_46_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_46_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_46_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_46_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Firefox', 'Google Docs']\nB: ['Quora', 'Simplenote']\nC: ['DuckDuckgo', 'Google Keep']\nD: ['Opera', 'BasicNote - Notes, Notepad']\n", "question": "The corresponding actions are: step 1: CLICK: (632, 123)\nstep 2: CLICK: (462, 80)\nstep 3: CLICK: (936, 73)\nstep 4: TYPE: sushi ingredients\nstep 5: CLICK: (502, 123)\nstep 6: PRESS_HOME\nstep 7: CLICK: (351, 524)\nstep 8: CLICK: (902, 916)\nstep 9: TYPE: rice, rice vinegar, shaoxing wine\nstep 10: CLICK: (190, 151)\nstep 11: TYPE: shopping list for making sushi\nstep 12: CLICK: (74, 86)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": 
"Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (632, 123)\nstep 2: CLICK: (462, 80)\nstep 3: CLICK: (936, 73)\nstep 4: TYPE: sushi ingredients\nstep 5: CLICK: (502, 123)\nstep 6: PRESS_HOME\nstep 7: CLICK: (351, 524)\nstep 8: CLICK: (902, 916)\nstep 9: TYPE: rice, rice vinegar, shaoxing wine\nstep 10: CLICK: (190, 151)\nstep 11: TYPE: shopping list for making sushi\nstep 12: CLICK: (74, 86)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Firefox', 'Google Docs']\nB: ['Quora', 'Simplenote']\nC: ['DuckDuckgo', 'Google Keep']\nD: ['Opera', 'BasicNote - Notes, Notepad']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_47_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_47_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_47_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_47_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_47_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_47_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_47_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_47_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_47_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_47_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_47_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_47_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_47_12.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['X', 'iHeart: Music, Radio, Podcasts']\nB: ['Instagram', 'Spotify']\nC: ['Threads', 'YT Music']\nD: 
['Messenger', 'Pandora']\n", "question": "The corresponding actions are: step 1: CLICK: (294, 293)\nstep 2: CLICK: (963, 71)\nstep 3: CLICK: (49, 598)\nstep 4: CLICK: (947, 289)\nstep 5: TYPE: Punk\nstep 6: SCROLL: UP\nstep 7: CLICK: (917, 921)\nstep 8: CLICK: (855, 924)\nstep 9: PRESS_HOME\nstep 10: CLICK: (866, 897)\nstep 11: CLICK: (811, 255)\nstep 12: CLICK: (45, 274)\nstep 13: TYPE: Punk\nstep 14: CLICK: (329, 963)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (294, 293)\nstep 2: CLICK: (963, 71)\nstep 3: CLICK: (49, 598)\nstep 4: CLICK: (947, 289)\nstep 5: TYPE: Punk\nstep 6: SCROLL: UP\nstep 7: CLICK: (917, 921)\nstep 8: CLICK: (855, 924)\nstep 9: PRESS_HOME\nstep 10: CLICK: (866, 897)\nstep 11: CLICK: (811, 255)\nstep 12: CLICK: (45, 274)\nstep 13: TYPE: Punk\nstep 14: CLICK: (329, 963)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['X', 'iHeart: Music, Radio, Podcasts']\nB: ['Instagram', 'Spotify']\nC: ['Threads', 'YT Music']\nD: ['Messenger', 'Pandora']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_48_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_48_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_48_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_48_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_48_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_48_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_48_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_48_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_48_8.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_48_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_48_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_48_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_48_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_48_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_48_14.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Kobo Books - eBooks Audiobooks', 'Contacts']\nB: ['Everand', 'iNaturalist']\nC: ['Audible: Audio Entertainment', 'Google Play Store']\nD: ['Ploter - Ebook, Audiobook, PDF', 'Setting']\n", "question": "The corresponding actions are: step 1: CLICK: (869, 616)\nstep 2: SCROLL: UP\nstep 3: CLICK: (256, 938)\nstep 4: SCROLL: UP\nstep 5: CLICK: (879, 599)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (874, 475)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (869, 616)\nstep 2: SCROLL: UP\nstep 3: CLICK: (256, 938)\nstep 4: SCROLL: UP\nstep 5: CLICK: (879, 599)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (874, 475)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Kobo Books - eBooks Audiobooks', 'Contacts']\nB: ['Everand', 'iNaturalist']\nC: ['Audible: Audio Entertainment', 'Google Play Store']\nD: ['Ploter - Ebook, Audiobook, PDF', 'Setting']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_49_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_49_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_49_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_49_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_49_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_49_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_49_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_49_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_49_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Quora', 'Any.do']\nB: ['Chrome', 'Microsoft to do']\nC: ['Firefox', 'Todoist']\nD: ['DuckDuckGo', 'To-Do List']\n", "question": "The corresponding actions are: step 1: CLICK: (319, 516)\nstep 2: CLICK: (672, 110)\nstep 3: TYPE: how to learn photography\nstep 4: CLICK: (463, 177)\nstep 5: CLICK: (447, 227)\nstep 6: CLICK: (697, 224)\nstep 7: CLICK: (529, 536)\nstep 8: CLICK: (467, 636)\nstep 9: PRESS_HOME\nstep 10: CLICK: (88, 498)\nstep 11: CLICK: (302, 888)\nstep 12: TYPE: learn photography in \nstep 13: CLICK: (585, 566)\nstep 14: CLICK: (962, 494)\nstep 15: COMPLETE\nWhich app-combination list was used in 
this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (319, 516)\nstep 2: CLICK: (672, 110)\nstep 3: TYPE: how to learn photography\nstep 4: CLICK: (463, 177)\nstep 5: CLICK: (447, 227)\nstep 6: CLICK: (697, 224)\nstep 7: CLICK: (529, 536)\nstep 8: CLICK: (467, 636)\nstep 9: PRESS_HOME\nstep 10: CLICK: (88, 498)\nstep 11: CLICK: (302, 888)\nstep 12: TYPE: learn photography in \nstep 13: CLICK: (585, 566)\nstep 14: CLICK: (962, 494)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Quora', 'Any.do']\nB: ['Chrome', 'Microsoft to do']\nC: ['Firefox', 'Todoist']\nD: ['DuckDuckGo', 'To-Do List']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_50_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_50_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_50_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_50_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_50_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_50_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_50_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_50_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_50_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_50_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_50_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_50_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_50_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_50_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_50_14.png"], "output": "A", "qwen3-vl": "image none"}, {"task": 
"gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Tiktok', 'Calculator Plus with History']\nB: ['Tubi: Movies & Live TV', 'DigiCal Calendar Agenda']\nC: ['Shorts VotTak: Short Video App', 'Calendar']\nD: ['Likee', 'Clock']\n", "question": "The corresponding actions are: step 1: CLICK: (566, 122)\nstep 2: CLICK: (398, 914)\nstep 3: CLICK: (311, 48)\nstep 4: TYPE: relaxing soundscape\nstep 5: CLICK: (602, 746)\nstep 6: CLICK: (344, 389)\nstep 7: PRESS_HOME\nstep 8: SCROLL: RIGHT\nstep 9: CLICK: (216, 427)\nstep 10: CLICK: (857, 362)\nstep 11: CLICK: (766, 651)\nstep 12: CLICK: (759, 858)\nstep 13: PRESS_RECENT\nstep 14: CLICK: (57, 261)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (566, 122)\nstep 2: CLICK: (398, 914)\nstep 3: CLICK: (311, 48)\nstep 4: TYPE: relaxing soundscape\nstep 5: CLICK: (602, 746)\nstep 6: CLICK: (344, 389)\nstep 7: PRESS_HOME\nstep 8: SCROLL: RIGHT\nstep 9: CLICK: (216, 427)\nstep 10: CLICK: (857, 362)\nstep 11: CLICK: (766, 651)\nstep 12: CLICK: (759, 858)\nstep 13: PRESS_RECENT\nstep 14: CLICK: (57, 261)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tiktok', 'Calculator Plus with History']\nB: ['Tubi: Movies & Live TV', 'DigiCal Calendar Agenda']\nC: ['Shorts VotTak: Short Video App', 'Calendar']\nD: ['Likee', 'Clock']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_51_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_51_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_51_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_51_3.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_51_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_51_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_51_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_51_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_51_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_51_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_51_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_51_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_51_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_51_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_51_14.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['iNaturalist', 'Uber']\nB: ['TradingView: Track All Markets', 'Waze Navigation & Live Traffic']\nC: ['Picturethis', 'Maps']\nD: ['Google Play Store', 'Petal Maps - GPS & Navigation']\n", "question": "The corresponding actions are: step 1: CLICK: (505, 470)\nstep 2: CLICK: (333, 568)\nstep 3: TYPE: gym\nstep 4: CLICK: (447, 251)\nstep 5: PRESS_HOME\nstep 6: CLICK: (524, 748)\nstep 7: CLICK: (333, 65)\nstep 8: CLICK: (80, 956)\nstep 9: CLICK: (252, 631)\nstep 10: CLICK: (349, 776)\nstep 11: CLICK: (488, 446)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (505, 470)\nstep 2: CLICK: (333, 568)\nstep 3: TYPE: gym\nstep 4: CLICK: (447, 251)\nstep 5: PRESS_HOME\nstep 6: CLICK: (524, 748)\nstep 7: CLICK: (333, 65)\nstep 8: CLICK: (80, 956)\nstep 9: CLICK: (252, 631)\nstep 10: CLICK: (349, 776)\nstep 11: CLICK: (488, 446)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['iNaturalist', 'Uber']\nB: ['TradingView: Track All Markets', 'Waze Navigation & Live Traffic']\nC: ['Picturethis', 'Maps']\nD: ['Google Play Store', 'Petal Maps - GPS & Navigation']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_52_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_52_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_52_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_52_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_52_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_52_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_52_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_52_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_52_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_52_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_52_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_52_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Gmail', 'Plantin']\nB: ['Instagram', 'iNaturalist']\nC: ['Threads', 'Google Play Store']\nD: ['Tumblr', 'Vaulty:Hide Pictures Videos']\n", "question": "The corresponding actions are: step 1: CLICK: (542, 654)\nstep 2: CLICK: (193, 344)\nstep 3: CLICK: (267, 248)\nstep 4: CLICK: (554, 472)\nstep 5: CLICK: 
(360, 905)\nstep 6: CLICK: (781, 695)\nstep 7: CLICK: (753, 921)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (542, 654)\nstep 2: CLICK: (193, 344)\nstep 3: CLICK: (267, 248)\nstep 4: CLICK: (554, 472)\nstep 5: CLICK: (360, 905)\nstep 6: CLICK: (781, 695)\nstep 7: CLICK: (753, 921)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Gmail', 'Plantin']\nB: ['Instagram', 'iNaturalist']\nC: ['Threads', 'Google Play Store']\nD: ['Tumblr', 'Vaulty:Hide Pictures Videos']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_53_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_53_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_53_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_53_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_53_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_53_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_53_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_53_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Wikipedia', 'Gmail']\nB: ['DuckDuckgo', 'Facebook']\nC: ['wikiHow', 'Whatsapp']\nD: ['Opera', 'Messenger']\n", "question": "The corresponding actions are: step 1: CLICK: (573, 719)\nstep 2: TYPE: Notre-Dame Cathedral in Pairs\nstep 3: CLICK: (909, 688)\nstep 4: CLICK: (468, 884)\nstep 5: CLICK: (358, 554)\nstep 6: CLICK: (372, 868)\nstep 7: CLICK: (978, 61)\nstep 8: CLICK: (843, 194)\nstep 9: CLICK: (683, 861)\nstep 10: CLICK: (686, 51)\nstep 11: COMPLETE\nWhich 
app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (573, 719)\nstep 2: TYPE: Notre-Dame Cathedral in Pairs\nstep 3: CLICK: (909, 688)\nstep 4: CLICK: (468, 884)\nstep 5: CLICK: (358, 554)\nstep 6: CLICK: (372, 868)\nstep 7: CLICK: (978, 61)\nstep 8: CLICK: (843, 194)\nstep 9: CLICK: (683, 861)\nstep 10: CLICK: (686, 51)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Wikipedia', 'Gmail']\nB: ['DuckDuckgo', 'Facebook']\nC: ['wikiHow', 'Whatsapp']\nD: ['Opera', 'Messenger']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_54_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_54_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_54_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_54_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_54_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_54_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_54_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_54_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_54_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_54_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_54_10.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Quora', 'Likee', 'Contacts']\nB: ['wikiHow', 'Shorts VotTak: Short Video App', 'Tripadvisor']\nC: ['Chrome', 'Triller', 'Google Play Store']\nD: ['Opera', 'Youtube', 'Setting']\n", "question": "The corresponding actions are: step 1: CLICK: (206, 908)\nstep 2: CLICK: (932, 
227)\nstep 3: PRESS_HOME\nstep 4: CLICK: (556, 923)\nstep 5: CLICK: (515, 417)\nstep 6: CLICK: (787, 89)\nstep 7: PRESS_HOME\nstep 8: CLICK: (311, 931)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (206, 908)\nstep 2: CLICK: (932, 227)\nstep 3: PRESS_HOME\nstep 4: CLICK: (556, 923)\nstep 5: CLICK: (515, 417)\nstep 6: CLICK: (787, 89)\nstep 7: PRESS_HOME\nstep 8: CLICK: (311, 931)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Quora', 'Likee', 'Contacts']\nB: ['wikiHow', 'Shorts VotTak: Short Video App', 'Tripadvisor']\nC: ['Chrome', 'Triller', 'Google Play Store']\nD: ['Opera', 'Youtube', 'Setting']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_55_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_55_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_55_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_55_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_55_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_55_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_55_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_55_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_55_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Chatty - AI Assistant', 'Wikipedia']\nB: ['ChatOn - AI Chat Bot Assistant', 'Chrome']\nC: ['GenZArt:Fast AI Art Generator', 'Bing: chat with AI & GPT4']\nD: ['Microsoft Copilot', 'Firefox']\n", "question": "The corresponding actions are: step 1: CLICK: (394, 
409)\nstep 2: TYPE: tell me about Theorem of Green\nstep 3: CLICK: (909, 620)\nstep 4: PRESS_HOME\nstep 5: CLICK: (386, 258)\nstep 6: CLICK: (371, 81)\nstep 7: TYPE: Theorem of Green\nstep 8: CLICK: (918, 909)\nstep 9: CLICK: (280, 652)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (394, 409)\nstep 2: TYPE: tell me about Theorem of Green\nstep 3: CLICK: (909, 620)\nstep 4: PRESS_HOME\nstep 5: CLICK: (386, 258)\nstep 6: CLICK: (371, 81)\nstep 7: TYPE: Theorem of Green\nstep 8: CLICK: (918, 909)\nstep 9: CLICK: (280, 652)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Chatty - AI Assistant', 'Wikipedia']\nB: ['ChatOn - AI Chat Bot Assistant', 'Chrome']\nC: ['GenZArt:Fast AI Art Generator', 'Bing: chat with AI & GPT4']\nD: ['Microsoft Copilot', 'Firefox']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_56_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_56_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_56_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_56_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_56_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_56_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_56_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_56_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_56_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_56_9.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Petal Maps - GPS & Navigation', 
'GPS, Maps, Voice Navigation']\nB: ['Waze Navigation & Live Traffic', 'Waze Navigation & Live Traffic']\nC: ['GPS', 'Lyft']\nD: ['Maps', 'Uber']\n", "question": "The corresponding actions are: step 1: CLICK: (155, 490)\nstep 2: CLICK: (505, 334)\nstep 3: CLICK: (283, 678)\nstep 4: TYPE: bakery\nstep 5: CLICK: (933, 884)\nstep 6: CLICK: (413, 296)\nstep 7: PRESS_HOME\nstep 8: CLICK: (374, 492)\nstep 9: CLICK: (272, 584)\nstep 10: TYPE: Yamasa\nstep 11: CLICK: (511, 431)\nstep 12: CLICK: (533, 890)\nstep 13: CLICK: (616, 893)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (155, 490)\nstep 2: CLICK: (505, 334)\nstep 3: CLICK: (283, 678)\nstep 4: TYPE: bakery\nstep 5: CLICK: (933, 884)\nstep 6: CLICK: (413, 296)\nstep 7: PRESS_HOME\nstep 8: CLICK: (374, 492)\nstep 9: CLICK: (272, 584)\nstep 10: TYPE: Yamasa\nstep 11: CLICK: (511, 431)\nstep 12: CLICK: (533, 890)\nstep 13: CLICK: (616, 893)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Petal Maps - GPS & Navigation', 'GPS, Maps, Voice Navigation']\nB: ['Waze Navigation & Live Traffic', 'Waze Navigation & Live Traffic']\nC: ['GPS', 'Lyft']\nD: ['Maps', 'Uber']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_57_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_57_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_57_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_57_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_57_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_57_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_57_6.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_57_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_57_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_57_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_57_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_57_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_57_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_57_13.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['X', 'TradingView: Track All Markets', 'Tripadvisor']\nB: ['Facebook', 'Google Play Store', 'iNaturalist']\nC: ['Instagram', 'Setting', 'Google Play Store']\nD: ['Threads', 'Picturethis', 'Contacts']\n", "question": "The corresponding actions are: step 1: CLICK: (858, 818)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (911, 889)\nstep 4: PRESS_HOME\nstep 5: CLICK: (404, 813)\nstep 6: PRESS_HOME\nstep 7: CLICK: (856, 808)\nstep 8: CLICK: (804, 69)\nstep 9: CLICK: (926, 83)\nstep 10: TYPE: ins\nstep 11: CLICK: (890, 908)\nstep 12: CLICK: (868, 348)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (858, 818)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (911, 889)\nstep 4: PRESS_HOME\nstep 5: CLICK: (404, 813)\nstep 6: PRESS_HOME\nstep 7: CLICK: (856, 808)\nstep 8: CLICK: (804, 69)\nstep 9: CLICK: (926, 83)\nstep 10: TYPE: ins\nstep 11: CLICK: (890, 908)\nstep 12: CLICK: (868, 348)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['X', 'TradingView: Track All Markets', 'Tripadvisor']\nB: ['Facebook', 'Google Play Store', 'iNaturalist']\nC: ['Instagram', 'Setting', 'Google Play Store']\nD: ['Threads', 'Picturethis', 'Contacts']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_58_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_58_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_58_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_58_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_58_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_58_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_58_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_58_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_58_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_58_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_58_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_58_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_58_12.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['BBC News', 'Threads']\nB: ['Yahoo Sports', 'Whatsapp']\nC: ['ESPN', 'Messenger']\nD: ['Opera News', 'Gmail']\n", "question": "The corresponding actions are: step 1: CLICK: (118, 493)\nstep 2: CLICK: 
(863, 907)\nstep 3: CLICK: (713, 218)\nstep 4: TYPE: Global Economic Trends\nstep 5: CLICK: (933, 871)\nstep 6: CLICK: (222, 425)\nstep 7: CLICK: (595, 153)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (486, 667)\nstep 11: CLICK: (874, 479)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (118, 493)\nstep 2: CLICK: (863, 907)\nstep 3: CLICK: (713, 218)\nstep 4: TYPE: Global Economic Trends\nstep 5: CLICK: (933, 871)\nstep 6: CLICK: (222, 425)\nstep 7: CLICK: (595, 153)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (486, 667)\nstep 11: CLICK: (874, 479)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['BBC News', 'Threads']\nB: ['Yahoo Sports', 'Whatsapp']\nC: ['ESPN', 'Messenger']\nD: ['Opera News', 'Gmail']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_59_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_59_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_59_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_59_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_59_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_59_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_59_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_59_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_59_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_59_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_59_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_59_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": 
"gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['TradingView: Track All Markets', 'Triller', 'Applock Pro - APP Lock & Guard']\nB: ['Google Play Store', 'Likee', 'Setting']\nC: ['iNaturalist', 'Tubi: Movies & Live TV', 'Tripadvisor']\nD: ['PlantNet', 'Shorts VotTak: Short Video App', 'Google Play Store']\n", "question": "The corresponding actions are: step 1: CLICK: (664, 921)\nstep 2: CLICK: (848, 430)\nstep 3: PRESS_HOME\nstep 4: CLICK: (559, 918)\nstep 5: CLICK: (509, 421)\nstep 6: CLICK: (567, 668)\nstep 7: CLICK: (927, 551)\nstep 8: CLICK: (395, 76)\nstep 9: CLICK: (471, 530)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (664, 921)\nstep 2: CLICK: (848, 430)\nstep 3: PRESS_HOME\nstep 4: CLICK: (559, 918)\nstep 5: CLICK: (509, 421)\nstep 6: CLICK: (567, 668)\nstep 7: CLICK: (927, 551)\nstep 8: CLICK: (395, 76)\nstep 9: CLICK: (471, 530)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['TradingView: Track All Markets', 'Triller', 'Applock Pro - APP Lock & Guard']\nB: ['Google Play Store', 'Likee', 'Setting']\nC: ['iNaturalist', 'Tubi: Movies & Live TV', 'Tripadvisor']\nD: ['PlantNet', 'Shorts VotTak: Short Video App', 'Google Play Store']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_60_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_60_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_60_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_60_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_60_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_60_5.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_60_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_60_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_60_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_60_9.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Contacts', 'Cash App']\nB: ['Applock Pro - APP Lock & Guard', 'PayPal - Send, Shop, Manage']\nC: ['Plantin', 'Venmo']\nD: ['Setting', 'Investing.com']\n", "question": "The corresponding actions are: step 1: CLICK: (397, 372)\nstep 2: CLICK: (237, 604)\nstep 3: CLICK: (492, 615)\nstep 4: CLICK: (789, 629)\nstep 5: CLICK: (209, 718)\nstep 6: CLICK: (527, 721)\nstep 7: CLICK: (759, 725)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (865, 738)\nstep 11: PRESS_HOME\nstep 12: CLICK: (873, 245)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (397, 372)\nstep 2: CLICK: (237, 604)\nstep 3: CLICK: (492, 615)\nstep 4: CLICK: (789, 629)\nstep 5: CLICK: (209, 718)\nstep 6: CLICK: (527, 721)\nstep 7: CLICK: (759, 725)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (865, 738)\nstep 11: PRESS_HOME\nstep 12: CLICK: (873, 245)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Contacts', 'Cash App']\nB: ['Applock Pro - APP Lock & Guard', 'PayPal - Send, Shop, Manage']\nC: ['Plantin', 'Venmo']\nD: ['Setting', 'Investing.com']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_61_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_61_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_61_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_61_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_61_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_61_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_61_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_61_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_61_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_61_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_61_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_61_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_61_12.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['wikiHow', 'Vaulty:Hide Pictures Videos']\nB: ['Quora', 'Contacts']\nC: ['DuckDuckGo', 'Picturethis']\nD: ['Chrome', 'TradingView: Track All Markets']\n", "question": "The corresponding actions are: step 1: CLICK: (439, 913)\nstep 2: CLICK: 
(310, 433)\nstep 3: TYPE: AMD's stock market news\nstep 4: CLICK: (862, 882)\nstep 5: CLICK: (265, 482)\nstep 6: CLICK: (947, 866)\nstep 7: PRESS_HOME\nstep 8: CLICK: (922, 494)\nstep 9: CLICK: (382, 76)\nstep 10: TYPE: AMD\nstep 11: CLICK: (145, 204)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (439, 913)\nstep 2: CLICK: (310, 433)\nstep 3: TYPE: AMD's stock market news\nstep 4: CLICK: (862, 882)\nstep 5: CLICK: (265, 482)\nstep 6: CLICK: (947, 866)\nstep 7: PRESS_HOME\nstep 8: CLICK: (922, 494)\nstep 9: CLICK: (382, 76)\nstep 10: TYPE: AMD\nstep 11: CLICK: (145, 204)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['wikiHow', 'Vaulty:Hide Pictures Videos']\nB: ['Quora', 'Contacts']\nC: ['DuckDuckGo', 'Picturethis']\nD: ['Chrome', 'TradingView: Track All Markets']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_62_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_62_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_62_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_62_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_62_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_62_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_62_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_62_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_62_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_62_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_62_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_62_11.png"], "output": "D", "qwen3-vl": 
"image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Play Store', 'Setting']\nB: ['PlantNet', 'Vaulty:Hide Pictures Videos']\nC: ['Vaulty:Hide Pictures Videos', 'Google Play Store']\nD: ['Plantin', 'PlantNet']\n", "question": "The corresponding actions are: step 1: CLICK: (432, 724)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (115, 758)\nstep 5: CLICK: (518, 311)\nstep 6: CLICK: (498, 372)\nstep 7: CLICK: (488, 524)\nstep 8: CLICK: (974, 68)\nstep 9: TYPE: Hindi\nstep 10: CLICK: (413, 247)\nstep 11: SCROLL: UP\nstep 12: CLICK: (822, 608)\nstep 13: PRESS_HOME\nstep 14: CLICK: (140, 718)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (432, 724)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (115, 758)\nstep 5: CLICK: (518, 311)\nstep 6: CLICK: (498, 372)\nstep 7: CLICK: (488, 524)\nstep 8: CLICK: (974, 68)\nstep 9: TYPE: Hindi\nstep 10: CLICK: (413, 247)\nstep 11: SCROLL: UP\nstep 12: CLICK: (822, 608)\nstep 13: PRESS_HOME\nstep 14: CLICK: (140, 718)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Setting']\nB: ['PlantNet', 'Vaulty:Hide Pictures Videos']\nC: ['Vaulty:Hide Pictures Videos', 'Google Play Store']\nD: ['Plantin', 'PlantNet']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_63_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_63_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_63_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_63_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_63_4.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_63_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_63_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_63_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_63_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_63_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_63_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_63_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_63_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_63_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_63_14.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Map', 'Uber']\nB: ['Lyft', 'GPS, Maps, Voice Navigation']\nC: ['Yandex Navigator', 'Citymapper']\nD: ['GPS', 'Maps']\n", "question": "The corresponding actions are: step 1: CLICK: (578, 324)\nstep 2: CLICK: (239, 90)\nstep 3: TYPE: hospital\nstep 4: CLICK: (237, 255)\nstep 5: PRESS_HOME\nstep 6: CLICK: (687, 149)\nstep 7: CLICK: (393, 227)\nstep 8: TYPE: Hospital Helipad\nstep 9: SCROLL: UP\nstep 10: CLICK: (460, 709)\nstep 11: CLICK: (435, 335)\nstep 12: CLICK: (442, 885)\nstep 13: CLICK: (437, 924)\nstep 14: CLICK: (448, 887)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (578, 324)\nstep 2: CLICK: (239, 90)\nstep 3: TYPE: hospital\nstep 4: CLICK: (237, 255)\nstep 5: PRESS_HOME\nstep 6: CLICK: (687, 149)\nstep 7: CLICK: (393, 227)\nstep 8: TYPE: Hospital Helipad\nstep 9: SCROLL: UP\nstep 10: CLICK: (460, 709)\nstep 11: CLICK: (435, 335)\nstep 12: CLICK: (442, 885)\nstep 13: CLICK: (437, 924)\nstep 14: CLICK: (448, 887)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Map', 'Uber']\nB: ['Lyft', 'GPS, Maps, Voice Navigation']\nC: ['Yandex Navigator', 'Citymapper']\nD: ['GPS', 'Maps']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_64_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_64_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_64_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_64_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_64_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_64_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_64_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_64_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_64_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_64_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_64_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_64_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_64_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_64_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_64_14.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['PlantNet', 'Gmail']\nB: ['Applock Pro - APP Lock & Guard', 
'Messenger']\nC: ['Tripadvisor', 'Facebook']\nD: ['Setting', 'Instagram']\n", "question": "The corresponding actions are: step 1: CLICK: (199, 628)\nstep 2: CLICK: (394, 820)\nstep 3: SCROLL: UP\nstep 4: CLICK: (419, 573)\nstep 5: CLICK: (799, 151)\nstep 6: TYPE: Instagram\nstep 7: CLICK: (291, 371)\nstep 8: CLICK: (266, 792)\nstep 9: CLICK: (802, 684)\nstep 10: CLICK: (80, 160)\nstep 11: CLICK: (233, 657)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (199, 628)\nstep 2: CLICK: (394, 820)\nstep 3: SCROLL: UP\nstep 4: CLICK: (419, 573)\nstep 5: CLICK: (799, 151)\nstep 6: TYPE: Instagram\nstep 7: CLICK: (291, 371)\nstep 8: CLICK: (266, 792)\nstep 9: CLICK: (802, 684)\nstep 10: CLICK: (80, 160)\nstep 11: CLICK: (233, 657)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['PlantNet', 'Gmail']\nB: ['Applock Pro - APP Lock & Guard', 'Messenger']\nC: ['Tripadvisor', 'Facebook']\nD: ['Setting', 'Instagram']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_65_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_65_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_65_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_65_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_65_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_65_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_65_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_65_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_65_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_65_9.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_65_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_65_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Pluto TV - Live TV and Movies', 'DigiCal Calendar Agenda']\nB: ['Triller', 'Calendar']\nC: ['Youtube', 'Clock']\nD: ['Tiktok', 'aCalendar']\n", "question": "The corresponding actions are: step 1: CLICK: (884, 636)\nstep 2: TYPE: nature landscape\nstep 3: CLICK: (920, 941)\nstep 4: PRESS_HOME\nstep 5: CLICK: (366, 275)\nstep 6: TYPE: 30000\nstep 7: CLICK: (462, 834)\nstep 8: SCROLL: RIGHT\nstep 9: SCROLL: RIGHT\nstep 10: CLICK: (500, 820)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (884, 636)\nstep 2: TYPE: nature landscape\nstep 3: CLICK: (920, 941)\nstep 4: PRESS_HOME\nstep 5: CLICK: (366, 275)\nstep 6: TYPE: 30000\nstep 7: CLICK: (462, 834)\nstep 8: SCROLL: RIGHT\nstep 9: SCROLL: RIGHT\nstep 10: CLICK: (500, 820)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Pluto TV - Live TV and Movies', 'DigiCal Calendar Agenda']\nB: ['Triller', 'Calendar']\nC: ['Youtube', 'Clock']\nD: ['Tiktok', 'aCalendar']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_66_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_66_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_66_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_66_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_66_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_66_5.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_66_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_66_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_66_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_66_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_66_10.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Tripadvisor', 'Netflix']\nB: ['Applock Pro - APP Lock & Guard', 'Pluto TV - Live TV and Movies']\nC: ['Vaulty:Hide Pictures Videos', 'Shorts VotTak: Short Video App']\nD: ['Google Play Store', 'Youtube']\n", "question": "The corresponding actions are: step 1: CLICK: (158, 742)\nstep 2: PRESS_HOME\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (158, 601)\nstep 5: CLICK: (708, 378)\nstep 6: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (158, 742)\nstep 2: PRESS_HOME\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (158, 601)\nstep 5: CLICK: (708, 378)\nstep 6: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tripadvisor', 'Netflix']\nB: ['Applock Pro - APP Lock & Guard', 'Pluto TV - Live TV and Movies']\nC: ['Vaulty:Hide Pictures Videos', 'Shorts VotTak: Short Video App']\nD: ['Google Play Store', 'Youtube']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_67_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_67_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_67_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_67_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_67_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_67_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Any.do', 'Tubi: Movies & Live TV']\nB: ['To-Do List', 'Youtube']\nC: ['TickTick', 'Shorts VotTak: Short Video App']\nD: ['Things', 'Likee']\n", "question": "The corresponding actions are: step 1: CLICK: (732, 236)\nstep 2: CLICK: (516, 924)\nstep 3: CLICK: (503, 334)\nstep 4: CLICK: (669, 807)\nstep 5: TYPE: Swimming in the morning\nstep 6: CLICK: (681, 908)\nstep 7: PRESS_HOME\nstep 8: CLICK: (599, 914)\nstep 9: CLICK: (972, 44)\nstep 10: TYPE: swimming tutorial\nstep 11: CLICK: (294, 111)\nstep 12: CLICK: (485, 556)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (732, 236)\nstep 2: CLICK: (516, 924)\nstep 3: CLICK: (503, 334)\nstep 4: CLICK: (669, 807)\nstep 5: TYPE: Swimming in the morning\nstep 6: CLICK: (681, 908)\nstep 7: PRESS_HOME\nstep 8: CLICK: (599, 914)\nstep 9: CLICK: (972, 44)\nstep 10: TYPE: swimming tutorial\nstep 11: CLICK: (294, 111)\nstep 12: CLICK: (485, 556)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Any.do', 'Tubi: Movies & Live TV']\nB: ['To-Do List', 'Youtube']\nC: ['TickTick', 'Shorts VotTak: Short Video App']\nD: ['Things', 'Likee']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_68_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_68_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_68_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_68_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_68_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_68_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_68_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_68_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_68_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_68_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_68_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_68_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_68_12.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Traveloka', 'Gmail']\nB: ['Booking.com', 'Instagram']\nC: ['Tokopedia', 'Whatsapp']\nD: ['TickPick - Live Event Tickets', 'Messenger']\n", "question": "The corresponding actions are: step 1: CLICK: (310, 71)\nstep 2: TYPE: 
Santorini, Greece itinerary\nstep 3: CLICK: (943, 906)\nstep 4: CLICK: (153, 237)\nstep 5: CLICK: (521, 806)\nstep 6: IMPOSSIBLE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (310, 71)\nstep 2: TYPE: Santorini, Greece itinerary\nstep 3: CLICK: (943, 906)\nstep 4: CLICK: (153, 237)\nstep 5: CLICK: (521, 806)\nstep 6: IMPOSSIBLE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Traveloka', 'Gmail']\nB: ['Booking.com', 'Instagram']\nC: ['Tokopedia', 'Whatsapp']\nD: ['TickPick - Live Event Tickets', 'Messenger']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_69_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_69_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_69_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_69_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_69_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_69_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Spotify', 'Any.do']\nB: ['iHeart: Music, Radio, Podcasts', 'TickTick']\nC: ['Amazon Music', 'To-Do List']\nD: ['Pandora', 'Things']\n", "question": "The corresponding actions are: step 1: CLICK: (363, 209)\nstep 2: CLICK: (697, 596)\nstep 3: SCROLL: UP\nstep 4: CLICK: (336, 821)\nstep 5: PRESS_HOME\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (108, 212)\nstep 8: CLICK: (841, 887)\nstep 9: CLICK: (274, 435)\nstep 10: CLICK: (580, 571)\nstep 11: TYPE: do yoga with this\nstep 12: CLICK: (488, 357)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are 
given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (363, 209)\nstep 2: CLICK: (697, 596)\nstep 3: SCROLL: UP\nstep 4: CLICK: (336, 821)\nstep 5: PRESS_HOME\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (108, 212)\nstep 8: CLICK: (841, 887)\nstep 9: CLICK: (274, 435)\nstep 10: CLICK: (580, 571)\nstep 11: TYPE: do yoga with this\nstep 12: CLICK: (488, 357)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Spotify', 'Any.do']\nB: ['iHeart: Music, Radio, Podcasts', 'TickTick']\nC: ['Amazon Music', 'To-Do List']\nD: ['Pandora', 'Things']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_70_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_70_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_70_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_70_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_70_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_70_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_70_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_70_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_70_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_70_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_70_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_70_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_70_12.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Edge', 'WPS office']\nB: ['Quora', 'Microsoft Word']\nC: ['Chrome', 'Google Keep']\nD: ['Firefox', 'Dropbox Paper']\n", "question": "The corresponding actions are: 
step 1: CLICK: (674, 739)\nstep 2: CLICK: (766, 260)\nstep 3: TYPE: 2019 Nobel-Prize Winners in Physics\nstep 4: CLICK: (508, 173)\nstep 5: CLICK: (908, 881)\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (102, 340)\nstep 9: CLICK: (830, 849)\nstep 10: CLICK: (149, 199)\nstep 11: LONG_PRESS: (141, 198)\nstep 12: CLICK: (119, 123)\nstep 13: CLICK: (247, 310)\nstep 14: TYPE: James Beebles, michel Mayor, Didier Quelvz\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (674, 739)\nstep 2: CLICK: (766, 260)\nstep 3: TYPE: 2019 Nobel-Prize Winners in Physics\nstep 4: CLICK: (508, 173)\nstep 5: CLICK: (908, 881)\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (102, 340)\nstep 9: CLICK: (830, 849)\nstep 10: CLICK: (149, 199)\nstep 11: LONG_PRESS: (141, 198)\nstep 12: CLICK: (119, 123)\nstep 13: CLICK: (247, 310)\nstep 14: TYPE: James Beebles, michel Mayor, Didier Quelvz\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Edge', 'WPS office']\nB: ['Quora', 'Microsoft Word']\nC: ['Chrome', 'Google Keep']\nD: ['Firefox', 'Dropbox Paper']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_71_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_71_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_71_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_71_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_71_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_71_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_71_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_71_7.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_71_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_71_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_71_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_71_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_71_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_71_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_71_14.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Gallery-photo gallery,album', 'Whatsapp']\nB: ['Adobe Express: AI Video Design', 'Messenger']\nC: ['Textify- Art Font Photo Editor', 'X']\nD: ['Lightroom Photo & Video Editor', 'Threads']\n", "question": "The corresponding actions are: step 1: CLICK: (124, 499)\nstep 2: CLICK: (855, 803)\nstep 3: CLICK: (477, 487)\nstep 4: CLICK: (887, 173)\nstep 5: CLICK: (741, 715)\nstep 6: CLICK: (881, 164)\nstep 7: CLICK: (931, 171)\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: SCROLL: LEFT\nstep 11: CLICK: (915, 845)\nstep 12: CLICK: (127, 681)\nstep 13: CLICK: (851, 542)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (124, 499)\nstep 2: CLICK: (855, 803)\nstep 3: CLICK: (477, 487)\nstep 4: CLICK: (887, 173)\nstep 5: CLICK: (741, 715)\nstep 6: CLICK: (881, 164)\nstep 7: CLICK: (931, 171)\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: SCROLL: LEFT\nstep 11: CLICK: (915, 845)\nstep 12: CLICK: (127, 681)\nstep 13: CLICK: (851, 542)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Gallery-photo gallery,album', 'Whatsapp']\nB: ['Adobe Express: AI Video Design', 'Messenger']\nC: ['Textify- Art Font Photo Editor', 'X']\nD: ['Lightroom Photo & Video Editor', 'Threads']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_72_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_72_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_72_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_72_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_72_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_72_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_72_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_72_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_72_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_72_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_72_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_72_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_72_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_72_13.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Trulia: Homes For Sale & Rent', 'Citymapper']\nB: ['Zillow: Homes For Sale & Rent', 'GPS']\nC: 
['Realtor.com: Buy, Sell & Rent', 'Uber']\nD: ['Redfin Houses for Sale & Rent', 'Waze Navigation & Live Traffic']\n", "question": "The corresponding actions are: step 1: CLICK: (404, 246)\nstep 2: CLICK: (731, 565)\nstep 3: PRESS_HOME\nstep 4: CLICK: (603, 522)\nstep 5: CLICK: (240, 599)\nstep 6: TYPE: 527 Mollno St\nstep 7: CLICK: (914, 911)\nstep 8: CLICK: (350, 647)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (404, 246)\nstep 2: CLICK: (731, 565)\nstep 3: PRESS_HOME\nstep 4: CLICK: (603, 522)\nstep 5: CLICK: (240, 599)\nstep 6: TYPE: 527 Mollno St\nstep 7: CLICK: (914, 911)\nstep 8: CLICK: (350, 647)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Trulia: Homes For Sale & Rent', 'Citymapper']\nB: ['Zillow: Homes For Sale & Rent', 'GPS']\nC: ['Realtor.com: Buy, Sell & Rent', 'Uber']\nD: ['Redfin Houses for Sale & Rent', 'Waze Navigation & Live Traffic']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_73_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_73_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_73_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_73_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_73_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_73_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_73_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_73_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_73_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: 
['Google Play Store', 'Investing.com']\nB: ['Picturethis', 'PayPal - Send, Shop, Manage']\nC: ['Applock Pro - APP Lock & Guard', 'Cash App']\nD: ['Tripadvisor', 'Google Wallet']\n", "question": "The corresponding actions are: step 1: CLICK: (591, 381)\nstep 2: CLICK: (226, 562)\nstep 3: CLICK: (234, 679)\nstep 4: CLICK: (218, 777)\nstep 5: CLICK: (521, 554)\nstep 6: CLICK: (511, 659)\nstep 7: CLICK: (511, 767)\nstep 8: CLICK: (879, 547)\nstep 9: PRESS_HOME\nstep 10: CLICK: (840, 533)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (591, 381)\nstep 2: CLICK: (226, 562)\nstep 3: CLICK: (234, 679)\nstep 4: CLICK: (218, 777)\nstep 5: CLICK: (521, 554)\nstep 6: CLICK: (511, 659)\nstep 7: CLICK: (511, 767)\nstep 8: CLICK: (879, 547)\nstep 9: PRESS_HOME\nstep 10: CLICK: (840, 533)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Investing.com']\nB: ['Picturethis', 'PayPal - Send, Shop, Manage']\nC: ['Applock Pro - APP Lock & Guard', 'Cash App']\nD: ['Tripadvisor', 'Google Wallet']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_74_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_74_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_74_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_74_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_74_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_74_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_74_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_74_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_74_8.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_74_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_74_10.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Netflix', 'Dropbox Paper']\nB: ['Youtube', 'Google Keep']\nC: ['Triller', 'Simplenote']\nD: ['Tubi: Movies & Live TV', 'BasicNote - Notes, Notepad']\n", "question": "The corresponding actions are: step 1: CLICK: (846, 634)\nstep 2: CLICK: (946, 46)\nstep 3: TYPE: 3D Printing Course\nstep 4: CLICK: (488, 96)\nstep 5: CLICK: (388, 343)\nstep 6: CLICK: (907, 397)\nstep 7: CLICK: (260, 844)\nstep 8: PRESS_HOME\nstep 9: CLICK: (417, 490)\nstep 10: CLICK: (877, 903)\nstep 11: CLICK: (444, 654)\nstep 12: CLICK: (145, 134)\nstep 13: TYPE: 3D Printer Course for Beginners \nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (846, 634)\nstep 2: CLICK: (946, 46)\nstep 3: TYPE: 3D Printing Course\nstep 4: CLICK: (488, 96)\nstep 5: CLICK: (388, 343)\nstep 6: CLICK: (907, 397)\nstep 7: CLICK: (260, 844)\nstep 8: PRESS_HOME\nstep 9: CLICK: (417, 490)\nstep 10: CLICK: (877, 903)\nstep 11: CLICK: (444, 654)\nstep 12: CLICK: (145, 134)\nstep 13: TYPE: 3D Printer Course for Beginners \nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Netflix', 'Dropbox Paper']\nB: ['Youtube', 'Google Keep']\nC: ['Triller', 'Simplenote']\nD: ['Tubi: Movies & Live TV', 'BasicNote - Notes, Notepad']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_75_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_75_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_75_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_75_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_75_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_75_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_75_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_75_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_75_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_75_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_75_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_75_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_75_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_75_13.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Firefox', 'Contacts']\nB: ['DuckDuckGo', 'Google Play Store']\nC: ['Wikipedia', 'PlantNet']\nD: 
['Chrome', 'TradingView: Track All Markets']\n", "question": "The corresponding actions are: step 1: CLICK: (443, 917)\nstep 2: CLICK: (205, 436)\nstep 3: TYPE: Nvidia's stock market news\nstep 4: CLICK: (855, 897)\nstep 5: CLICK: (164, 720)\nstep 6: PRESS_HOME\nstep 7: CLICK: (915, 490)\nstep 8: CLICK: (389, 82)\nstep 9: TYPE: Nvidia\nstep 10: CLICK: (185, 323)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (443, 917)\nstep 2: CLICK: (205, 436)\nstep 3: TYPE: Nvidia's stock market news\nstep 4: CLICK: (855, 897)\nstep 5: CLICK: (164, 720)\nstep 6: PRESS_HOME\nstep 7: CLICK: (915, 490)\nstep 8: CLICK: (389, 82)\nstep 9: TYPE: Nvidia\nstep 10: CLICK: (185, 323)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Firefox', 'Contacts']\nB: ['DuckDuckGo', 'Google Play Store']\nC: ['Wikipedia', 'PlantNet']\nD: ['Chrome', 'TradingView: Track All Markets']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_76_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_76_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_76_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_76_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_76_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_76_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_76_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_76_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_76_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_76_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_76_10.png"], "output": "D", 
"qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Yahoo Sports', 'Instagram']\nB: ['AP News', 'Messenger']\nC: ['Microsoft News', 'Facebook']\nD: ['Breaking News: local & Alerts', 'Gmail']\n", "question": "The corresponding actions are: step 1: CLICK: (847, 490)\nstep 2: CLICK: (915, 140)\nstep 3: TYPE: Electric Vehicles\nstep 4: CLICK: (909, 865)\nstep 5: CLICK: (481, 403)\nstep 6: CLICK: (668, 141)\nstep 7: CLICK: (127, 663)\nstep 8: CLICK: (840, 337)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (847, 490)\nstep 2: CLICK: (915, 140)\nstep 3: TYPE: Electric Vehicles\nstep 4: CLICK: (909, 865)\nstep 5: CLICK: (481, 403)\nstep 6: CLICK: (668, 141)\nstep 7: CLICK: (127, 663)\nstep 8: CLICK: (840, 337)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Yahoo Sports', 'Instagram']\nB: ['AP News', 'Messenger']\nC: ['Microsoft News', 'Facebook']\nD: ['Breaking News: local & Alerts', 'Gmail']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_77_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_77_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_77_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_77_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_77_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_77_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_77_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_77_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_77_8.png"], "output": "B", "qwen3-vl": "image 
none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Docs', 'Firefox', 'Setting']\nB: ['Dropbox Paper', 'Wikipedia', 'Picturethis']\nC: ['Simplenote', 'Opera', 'PlantNet']\nD: ['Microsoft Word', 'Edge', 'Vaulty:Hide Pictures Videos']\n", "question": "The corresponding actions are: step 1: CLICK: (382, 264)\nstep 2: CLICK: (435, 146)\nstep 3: TYPE: Bristlecone\nstep 4: CLICK: (453, 165)\nstep 5: CLICK: (409, 280)\nstep 6: CLICK: (951, 87)\nstep 7: CLICK: (741, 97)\nstep 8: PRESS_HOME\nstep 9: CLICK: (340, 396)\nstep 10: CLICK: (476, 915)\nstep 11: CLICK: (183, 885)\nstep 12: CLICK: (880, 630)\nstep 13: SCROLL: RIGHT\nstep 14: SCROLL: LEFT\nstep 15: CLICK: (476, 878)\nstep 16: IMPOSSIBLE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (382, 264)\nstep 2: CLICK: (435, 146)\nstep 3: TYPE: Bristlecone\nstep 4: CLICK: (453, 165)\nstep 5: CLICK: (409, 280)\nstep 6: CLICK: (951, 87)\nstep 7: CLICK: (741, 97)\nstep 8: PRESS_HOME\nstep 9: CLICK: (340, 396)\nstep 10: CLICK: (476, 915)\nstep 11: CLICK: (183, 885)\nstep 12: CLICK: (880, 630)\nstep 13: SCROLL: RIGHT\nstep 14: SCROLL: LEFT\nstep 15: CLICK: (476, 878)\nstep 16: IMPOSSIBLE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Docs', 'Firefox', 'Setting']\nB: ['Dropbox Paper', 'Wikipedia', 'Picturethis']\nC: ['Simplenote', 'Opera', 'PlantNet']\nD: ['Microsoft Word', 'Edge', 'Vaulty:Hide Pictures Videos']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_2.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_78_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Applock Pro - APP Lock & Guard', 'Amazon Kindle']\nB: ['Setting', 'Everand']\nC: ['iNaturalist', 'Pocket FM: Audio Series']\nD: ['Picturethis', 'Audible: Audio Entertainment']\n", "question": "The corresponding actions are: step 1: CLICK: (616, 654)\nstep 2: SCROLL: UP\nstep 3: CLICK: (318, 682)\nstep 4: CLICK: (888, 724)\nstep 5: PRESS_HOME\nstep 6: CLICK: (364, 629)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (616, 654)\nstep 2: SCROLL: UP\nstep 3: CLICK: (318, 682)\nstep 4: CLICK: (888, 724)\nstep 5: PRESS_HOME\nstep 6: CLICK: (364, 629)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Applock Pro - APP Lock & Guard', 'Amazon Kindle']\nB: ['Setting', 'Everand']\nC: ['iNaturalist', 'Pocket FM: Audio Series']\nD: ['Picturethis', 'Audible: Audio Entertainment']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_79_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_79_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_79_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_79_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_79_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_79_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_79_6.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Tubi: Movies & Live TV', 'TradingView: Track All Markets']\nB: ['Netflix', 'Contacts']\nC: ['Youtube', 'Vaulty:Hide Pictures Videos']\nD: ['Shorts VotTak: Short Video App', 'Google Play Store']\n", "question": "The corresponding actions are: step 1: CLICK: (841, 767)\nstep 2: CLICK: (772, 68)\nstep 3: CLICK: (913, 74)\nstep 4: TYPE: tiktok\nstep 5: PRESS_HOME\nstep 6: CLICK: (394, 726)\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (922, 79)\nstep 9: TYPE: vottak\nstep 10: CLICK: (880, 885)\nstep 11: CLICK: (833, 449)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (841, 767)\nstep 2: CLICK: (772, 68)\nstep 3: CLICK: (913, 74)\nstep 4: TYPE: tiktok\nstep 5: PRESS_HOME\nstep 6: CLICK: (394, 726)\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (922, 79)\nstep 9: TYPE: vottak\nstep 10: CLICK: (880, 885)\nstep 11: CLICK: (833, 449)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tubi: Movies & Live TV', 'TradingView: Track All Markets']\nB: ['Netflix', 'Contacts']\nC: ['Youtube', 'Vaulty:Hide Pictures Videos']\nD: ['Shorts VotTak: Short Video App', 'Google Play Store']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_80_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_80_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_80_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_80_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_80_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_80_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_80_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_80_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_80_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_80_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_80_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_80_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Setting', 'Shorts VotTak: Short Video App', 'Google Play Store']\nB: ['Applock Pro - APP Lock & Guard', 'Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos']\nC: ['Google Play Store', 'Pluto TV - Live TV and Movies', 'Contacts']\nD: ['PlantNet', 'Youtube', 'TradingView: Track All Markets']\n", 
"question": "The corresponding actions are: step 1: CLICK: (857, 849)\nstep 2: TYPE: vottak\nstep 3: CLICK: (324, 137)\nstep 4: CLICK: (855, 325)\nstep 5: CLICK: (913, 65)\nstep 6: CLICK: (674, 657)\nstep 7: PRESS_HOME\nstep 8: CLICK: (395, 828)\nstep 9: CLICK: (465, 531)\nstep 10: CLICK: (362, 335)\nstep 11: CLICK: (156, 413)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (857, 849)\nstep 2: TYPE: vottak\nstep 3: CLICK: (324, 137)\nstep 4: CLICK: (855, 325)\nstep 5: CLICK: (913, 65)\nstep 6: CLICK: (674, 657)\nstep 7: PRESS_HOME\nstep 8: CLICK: (395, 828)\nstep 9: CLICK: (465, 531)\nstep 10: CLICK: (362, 335)\nstep 11: CLICK: (156, 413)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Setting', 'Shorts VotTak: Short Video App', 'Google Play Store']\nB: ['Applock Pro - APP Lock & Guard', 'Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos']\nC: ['Google Play Store', 'Pluto TV - Live TV and Movies', 'Contacts']\nD: ['PlantNet', 'Youtube', 'TradingView: Track All Markets']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_81_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_81_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_81_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_81_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_81_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_81_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_81_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_81_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_81_8.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_81_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_81_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_81_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Tiktok', 'Setting']\nB: ['Youtube', 'iNaturalist']\nC: ['Shorts VotTak: Short Video App', 'PlantNet']\nD: ['Pluto TV - Live TV and Movies', 'Applock Pro - APP Lock & Guard']\n", "question": "The corresponding actions are: step 1: CLICK: (835, 522)\nstep 2: CLICK: (922, 296)\nstep 3: PRESS_HOME\nstep 4: CLICK: (393, 529)\nstep 5: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (835, 522)\nstep 2: CLICK: (922, 296)\nstep 3: PRESS_HOME\nstep 4: CLICK: (393, 529)\nstep 5: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tiktok', 'Setting']\nB: ['Youtube', 'iNaturalist']\nC: ['Shorts VotTak: Short Video App', 'PlantNet']\nD: ['Pluto TV - Live TV and Movies', 'Applock Pro - APP Lock & Guard']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_82_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_82_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_82_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_82_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_82_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Youtube', 'Google Play Store']\nB: ['Shorts VotTak: Short Video App', 'Applock Pro - APP Lock & Guard']\nC: 
['Tiktok', 'TradingView: Track All Markets']\nD: ['Triller', 'Contacts']\n", "question": "The corresponding actions are: step 1: CLICK: (614, 806)\nstep 2: PRESS_HOME\nstep 3: CLICK: (829, 815)\nstep 4: CLICK: (807, 53)\nstep 5: TYPE: Peloton App\nstep 6: CLICK: (940, 909)\nstep 7: CLICK: (843, 327)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (614, 806)\nstep 2: PRESS_HOME\nstep 3: CLICK: (829, 815)\nstep 4: CLICK: (807, 53)\nstep 5: TYPE: Peloton App\nstep 6: CLICK: (940, 909)\nstep 7: CLICK: (843, 327)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Youtube', 'Google Play Store']\nB: ['Shorts VotTak: Short Video App', 'Applock Pro - APP Lock & Guard']\nC: ['Tiktok', 'TradingView: Track All Markets']\nD: ['Triller', 'Contacts']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_83_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_83_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_83_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_83_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_83_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_83_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_83_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_83_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Pandora', 'Tumblr']\nB: ['Spotify', 'Facebook']\nC: ['Amazon Music', 'Messenger']\nD: ['YT Music', 'Instagram']\n", "question": "The corresponding actions are: step 1: CLICK: (880, 116)\nstep 2: 
CLICK: (408, 55)\nstep 3: TYPE: Electronic\nstep 4: CLICK: (912, 914)\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (862, 119)\nstep 7: CLICK: (928, 189)\nstep 8: PRESS_HOME\nstep 9: CLICK: (617, 252)\nstep 10: TYPE: Electronic\nstep 11: CLICK: (895, 600)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (880, 116)\nstep 2: CLICK: (408, 55)\nstep 3: TYPE: Electronic\nstep 4: CLICK: (912, 914)\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (862, 119)\nstep 7: CLICK: (928, 189)\nstep 8: PRESS_HOME\nstep 9: CLICK: (617, 252)\nstep 10: TYPE: Electronic\nstep 11: CLICK: (895, 600)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Pandora', 'Tumblr']\nB: ['Spotify', 'Facebook']\nC: ['Amazon Music', 'Messenger']\nD: ['YT Music', 'Instagram']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_84_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_84_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_84_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_84_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_84_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_84_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_84_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_84_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_84_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_84_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_84_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_84_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", 
"visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Setting', 'Contacts', 'Shorts VotTak: Short Video App']\nB: ['Plantin', 'PlantNet', 'Tubi: Movies & Live TV']\nC: ['iNaturalist', 'Google Play Store', 'Tiktok']\nD: ['Google Play Store', 'Setting', 'Likee']\n", "question": "The corresponding actions are: step 1: CLICK: (858, 821)\nstep 2: CLICK: (773, 244)\nstep 3: PRESS_HOME\nstep 4: CLICK: (437, 809)\nstep 5: CLICK: (279, 516)\nstep 6: CLICK: (836, 404)\nstep 7: CLICK: (77, 52)\nstep 8: CLICK: (188, 412)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (858, 821)\nstep 2: CLICK: (773, 244)\nstep 3: PRESS_HOME\nstep 4: CLICK: (437, 809)\nstep 5: CLICK: (279, 516)\nstep 6: CLICK: (836, 404)\nstep 7: CLICK: (77, 52)\nstep 8: CLICK: (188, 412)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Setting', 'Contacts', 'Shorts VotTak: Short Video App']\nB: ['Plantin', 'PlantNet', 'Tubi: Movies & Live TV']\nC: ['iNaturalist', 'Google Play Store', 'Tiktok']\nD: ['Google Play Store', 'Setting', 'Likee']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_85_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_85_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_85_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_85_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_85_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_85_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_85_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_85_7.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_85_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Chatbot AI & Smart Assistant', 'Firefox']\nB: ['Microsoft Copilot', 'Bing: chat with AI & GPT4']\nC: ['Chatty - AI Assistant', 'DuckDuckGo']\nD: ['ChatOn - AI Chat Bot Assistant', 'Opera']\n", "question": "The corresponding actions are: step 1: CLICK: (377, 377)\nstep 2: TYPE: tell me about Central Limit Theorem\nstep 3: CLICK: (896, 576)\nstep 4: PRESS_HOME\nstep 5: CLICK: (408, 235)\nstep 6: CLICK: (438, 319)\nstep 7: TYPE: Central Limit Theorem\nstep 8: CLICK: (301, 151)\nstep 9: CLICK: (133, 542)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (377, 377)\nstep 2: TYPE: tell me about Central Limit Theorem\nstep 3: CLICK: (896, 576)\nstep 4: PRESS_HOME\nstep 5: CLICK: (408, 235)\nstep 6: CLICK: (438, 319)\nstep 7: TYPE: Central Limit Theorem\nstep 8: CLICK: (301, 151)\nstep 9: CLICK: (133, 542)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Chatbot AI & Smart Assistant', 'Firefox']\nB: ['Microsoft Copilot', 'Bing: chat with AI & GPT4']\nC: ['Chatty - AI Assistant', 'DuckDuckGo']\nD: ['ChatOn - AI Chat Bot Assistant', 'Opera']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_86_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_86_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_86_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_86_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_86_4.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_86_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_86_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_86_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_86_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_86_9.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Lazada', 'Setting']\nB: ['REVOLVE', 'PlantNet']\nC: ['SSENSE', 'Google Play Store']\nD: ['Alibaba.com - B2B marketplace', 'Vaulty:Hide Pictures Videos']\n", "question": "The corresponding actions are: step 1: CLICK: (372, 667)\nstep 2: CLICK: (369, 84)\nstep 3: TYPE: SSENSE\nstep 4: CLICK: (938, 918)\nstep 5: CLICK: (753, 336)\nstep 6: CLICK: (838, 333)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (372, 667)\nstep 2: CLICK: (369, 84)\nstep 3: TYPE: SSENSE\nstep 4: CLICK: (938, 918)\nstep 5: CLICK: (753, 336)\nstep 6: CLICK: (838, 333)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Lazada', 'Setting']\nB: ['REVOLVE', 'PlantNet']\nC: ['SSENSE', 'Google Play Store']\nD: ['Alibaba.com - B2B marketplace', 'Vaulty:Hide Pictures Videos']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_87_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_87_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_87_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_87_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_87_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_87_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_87_6.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Tripadvisor', 'Netflix']\nB: ['Applock Pro - APP Lock & Guard', 'Shorts VotTak: Short Video App']\nC: ['Setting', 'Pluto TV - Live TV and Movies']\nD: ['Google Play Store', 'Youtube']\n", "question": "The corresponding actions are: step 1: CLICK: (317, 911)\nstep 2: PRESS_HOME\nstep 3: CLICK: (663, 919)\nstep 4: TYPE: Aaptiv\nstep 5: CLICK: (208, 176)\nstep 6: CLICK: (620, 431)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (317, 911)\nstep 2: PRESS_HOME\nstep 3: CLICK: (663, 919)\nstep 4: TYPE: Aaptiv\nstep 5: CLICK: (208, 176)\nstep 6: CLICK: (620, 431)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tripadvisor', 'Netflix']\nB: ['Applock Pro - APP Lock & Guard', 'Shorts VotTak: Short Video App']\nC: ['Setting', 'Pluto TV - Live TV and Movies']\nD: ['Google Play Store', 'Youtube']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_88_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_88_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_88_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_88_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_88_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_88_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_88_6.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['iNaturalist', 'Temu']\nB: ['Google Play Store', 'THE OUTNET']\nC: ['Applock Pro - APP Lock & Guard', 'Amazon']\nD: ['Vaulty:Hide Pictures Videos', 'MATCHES']\n", "question": "The corresponding actions are: step 1: CLICK: (447, 152)\nstep 2: CLICK: (295, 127)\nstep 3: TYPE: THE OUTNET\nstep 4: CLICK: (920, 742)\nstep 5: CLICK: (915, 228)\nstep 6: CLICK: (905, 297)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (447, 152)\nstep 2: CLICK: (295, 127)\nstep 3: TYPE: THE OUTNET\nstep 4: CLICK: (920, 742)\nstep 5: CLICK: (915, 228)\nstep 6: CLICK: (905, 297)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['iNaturalist', 'Temu']\nB: ['Google Play Store', 'THE OUTNET']\nC: ['Applock Pro - APP Lock & Guard', 'Amazon']\nD: ['Vaulty:Hide Pictures Videos', 'MATCHES']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_89_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_89_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_89_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_89_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_89_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_89_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_89_6.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Edge', 'Picturethis']\nB: ['Firefox', 'Contacts']\nC: ['Bing: chat with AI & GPT4', 'Google Play Store']\nD: ['Duckduckgo', 'Setting']\n", "question": "The corresponding actions are: step 1: CLICK: (188, 607)\nstep 2: CLICK: (305, 829)\nstep 3: SCROLL: UP\nstep 4: CLICK: (319, 690)\nstep 5: CLICK: (808, 154)\nstep 6: TYPE: Duckduckgo\nstep 7: CLICK: (347, 381)\nstep 8: CLICK: (333, 787)\nstep 9: CLICK: (786, 670)\nstep 10: CLICK: (86, 157)\nstep 11: CLICK: (227, 631)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (188, 607)\nstep 2: CLICK: (305, 829)\nstep 3: SCROLL: UP\nstep 4: CLICK: (319, 690)\nstep 5: CLICK: (808, 154)\nstep 6: TYPE: Duckduckgo\nstep 7: CLICK: (347, 381)\nstep 8: CLICK: (333, 787)\nstep 9: CLICK: (786, 670)\nstep 10: CLICK: (86, 157)\nstep 11: CLICK: (227, 631)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Edge', 'Picturethis']\nB: ['Firefox', 'Contacts']\nC: ['Bing: chat with AI & GPT4', 'Google Play Store']\nD: ['Duckduckgo', 'Setting']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_90_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_90_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_90_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_90_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_90_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_90_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_90_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_90_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_90_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_90_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_90_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_90_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['DeepL translate', 'Yahoo Sports', 'Microsoft Word']\nB: ['Language Translator: Translate', 'Yahoo Finance: Stock News', 'Simplenote']\nC: ['Microsoft Translator', 'CNN Breaking US & World News', 'Google Keep']\nD: ['Google Translate', 'AP News', 'Google Docs']\n", "question": "The corresponding actions are: step 1: CLICK: (659, 123)\nstep 2: 
CLICK: (409, 482)\nstep 3: PRESS_HOME\nstep 4: CLICK: (158, 261)\nstep 5: CLICK: (255, 771)\nstep 6: TYPE: Highway collapse kills dozens in southern China\nstep 7: LONG_PRESS: (413, 373)\nstep 8: PRESS_HOME\nstep 9: CLICK: (182, 397)\nstep 10: CLICK: (831, 896)\nstep 11: CLICK: (501, 652)\nstep 12: CLICK: (168, 155)\nstep 13: TYPE: Highway collapse kills dozens in southern China\nstep 14: CLICK: (83, 78)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (659, 123)\nstep 2: CLICK: (409, 482)\nstep 3: PRESS_HOME\nstep 4: CLICK: (158, 261)\nstep 5: CLICK: (255, 771)\nstep 6: TYPE: Highway collapse kills dozens in southern China\nstep 7: LONG_PRESS: (413, 373)\nstep 8: PRESS_HOME\nstep 9: CLICK: (182, 397)\nstep 10: CLICK: (831, 896)\nstep 11: CLICK: (501, 652)\nstep 12: CLICK: (168, 155)\nstep 13: TYPE: Highway collapse kills dozens in southern China\nstep 14: CLICK: (83, 78)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['DeepL translate', 'Yahoo Sports', 'Microsoft Word']\nB: ['Language Translator: Translate', 'Yahoo Finance: Stock News', 'Simplenote']\nC: ['Microsoft Translator', 'CNN Breaking US & World News', 'Google Keep']\nD: ['Google Translate', 'AP News', 'Google Docs']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_91_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_91_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_91_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_91_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_91_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_91_5.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_91_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_91_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_91_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_91_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_91_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_91_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_91_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_91_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_91_14.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Wish', 'Edge', 'Edge']\nB: ['AliExpress', 'wikiHow', 'Firefox']\nC: ['Temu', 'Firefox', 'Bing: chat with AI & GPT4']\nD: ['Amazon', 'Chrome', 'wikiHow']\n", "question": "The corresponding actions are: step 1: CLICK: (368, 361)\nstep 2: CLICK: (465, 59)\nstep 3: TYPE: book about poetry\nstep 4: CLICK: (933, 915)\nstep 5: PRESS_HOME\nstep 6: CLICK: (384, 240)\nstep 7: CLICK: (936, 56)\nstep 8: TYPE: the ode less travelled book\nstep 9: CLICK: (925, 902)\nstep 10: PRESS_HOME\nstep 11: CLICK: (137, 113)\nstep 12: CLICK: (368, 54)\nstep 13: TYPE: the ode less travelled book\nstep 14: CLICK: (928, 905)\nstep 15: SCROLL: UP\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (368, 361)\nstep 2: CLICK: (465, 59)\nstep 3: TYPE: book about poetry\nstep 4: CLICK: (933, 915)\nstep 5: PRESS_HOME\nstep 6: CLICK: (384, 240)\nstep 7: CLICK: (936, 56)\nstep 8: TYPE: the ode less travelled book\nstep 9: CLICK: (925, 902)\nstep 10: PRESS_HOME\nstep 11: CLICK: (137, 113)\nstep 12: CLICK: (368, 54)\nstep 13: TYPE: the ode less travelled book\nstep 14: CLICK: (928, 905)\nstep 15: SCROLL: UP\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Wish', 'Edge', 'Edge']\nB: ['AliExpress', 'wikiHow', 'Firefox']\nC: ['Temu', 'Firefox', 'Bing: chat with AI & GPT4']\nD: ['Amazon', 'Chrome', 'wikiHow']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_92_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": 
"gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Gmail', 'Tumblr']\nB: ['Facebook', 'Messenger']\nC: ['Instagram', 'Gmail']\nD: ['Threads', 'Instagram']\n", "question": "The corresponding actions are: step 1: CLICK: (559, 146)\nstep 2: CLICK: (668, 65)\nstep 3: CLICK: (636, 134)\nstep 4: CLICK: (324, 228)\nstep 5: TYPE: I am sad now\nstep 6: CLICK: (739, 62)\nstep 7: SCROLL: UP\nstep 8: CLICK: (718, 598)\nstep 9: CLICK: (739, 844)\nstep 10: CLICK: (365, 704)\nstep 11: CLICK: (735, 471)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (559, 146)\nstep 2: CLICK: (668, 65)\nstep 3: CLICK: (636, 134)\nstep 4: CLICK: (324, 228)\nstep 5: TYPE: I am sad now\nstep 6: CLICK: (739, 62)\nstep 7: SCROLL: UP\nstep 8: CLICK: (718, 598)\nstep 9: CLICK: (739, 844)\nstep 10: CLICK: (365, 704)\nstep 11: CLICK: (735, 471)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Gmail', 'Tumblr']\nB: ['Facebook', 'Messenger']\nC: ['Instagram', 'Gmail']\nD: ['Threads', 'Instagram']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_93_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_93_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_93_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_93_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_93_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_93_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_93_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_93_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_93_8.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_93_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_93_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_93_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Photos', 'Applock Pro - APP Lock & Guard']\nB: ['Google Photos', 'Tripadvisor']\nC: ['ABPV', 'Contacts']\nD: ['Mapillary', 'Setting']\n", "question": "The corresponding actions are: step 1: CLICK: (712, 544)\nstep 2: CLICK: (583, 577)\nstep 3: CLICK: (480, 560)\nstep 4: CLICK: (424, 555)\nstep 5: CLICK: (596, 659)\nstep 6: CLICK: (497, 664)\nstep 7: CLICK: (410, 666)\nstep 8: SCROLL: UP\nstep 9: CLICK: (675, 921)\nstep 10: PRESS_HOME\nstep 11: CLICK: (856, 710)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (712, 544)\nstep 2: CLICK: (583, 577)\nstep 3: CLICK: (480, 560)\nstep 4: CLICK: (424, 555)\nstep 5: CLICK: (596, 659)\nstep 6: CLICK: (497, 664)\nstep 7: CLICK: (410, 666)\nstep 8: SCROLL: UP\nstep 9: CLICK: (675, 921)\nstep 10: PRESS_HOME\nstep 11: CLICK: (856, 710)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Photos', 'Applock Pro - APP Lock & Guard']\nB: ['Google Photos', 'Tripadvisor']\nC: ['ABPV', 'Contacts']\nD: ['Mapillary', 'Setting']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_94_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_94_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_94_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_94_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_94_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_94_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_94_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_94_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_94_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_94_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_94_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_94_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Uber', 'Tubi: Movies & Live TV']\nB: ['GPS', 'Tiktok']\nC: ['Citymapper', 'Shorts VotTak: Short Video App']\nD: ['Google Map', 'Netflix']\n", "question": "The corresponding actions are: step 1: CLICK: (413, 273)\nstep 2: CLICK: (953, 75)\nstep 3: TYPE: the best chinese dim sum restaurant in Los Angeles\nstep 4: CLICK: (958, 922)\nstep 5: CLICK: (264, 
467)\nstep 6: PRESS_HOME\nstep 7: CLICK: (127, 154)\nstep 8: CLICK: (556, 179)\nstep 9: CLICK: (232, 832)\nstep 10: TYPE: Won Kok Restaurant \nstep 11: CLICK: (922, 912)\nstep 12: CLICK: (326, 265)\nstep 13: CLICK: (911, 916)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (413, 273)\nstep 2: CLICK: (953, 75)\nstep 3: TYPE: the best chinese dim sum restaurant in Los Angeles\nstep 4: CLICK: (958, 922)\nstep 5: CLICK: (264, 467)\nstep 6: PRESS_HOME\nstep 7: CLICK: (127, 154)\nstep 8: CLICK: (556, 179)\nstep 9: CLICK: (232, 832)\nstep 10: TYPE: Won Kok Restaurant \nstep 11: CLICK: (922, 912)\nstep 12: CLICK: (326, 265)\nstep 13: CLICK: (911, 916)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Uber', 'Tubi: Movies & Live TV']\nB: ['GPS', 'Tiktok']\nC: ['Citymapper', 'Shorts VotTak: Short Video App']\nD: ['Google Map', 'Netflix']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_95_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_95_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_95_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_95_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_95_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_95_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_95_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_95_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_95_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_95_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_95_10.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_95_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_95_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_95_13.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['TickTick', 'DuckDuckGo']\nB: ['To-Do List', 'Firefox']\nC: ['Microsoft to do', 'Chrome']\nD: ['Any.do', 'Bing: chat with AI & GPT4']\n", "question": "The corresponding actions are: step 1: CLICK: (605, 815)\nstep 2: CLICK: (795, 184)\nstep 3: TYPE: when is the next fashion week in Paris\nstep 4: CLICK: (444, 167)\nstep 5: PRESS_HOME\nstep 6: CLICK: (393, 647)\nstep 7: CLICK: (424, 937)\nstep 8: CLICK: (469, 884)\nstep 9: CLICK: (770, 362)\nstep 10: CLICK: (749, 364)\nstep 11: CLICK: (412, 588)\nstep 12: CLICK: (750, 726)\nstep 13: CLICK: (242, 891)\nstep 14: TYPE: the next fashion show in Paris\nstep 15: CLICK: (936, 902)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (605, 815)\nstep 2: CLICK: (795, 184)\nstep 3: TYPE: when is the next fashion week in Paris\nstep 4: CLICK: (444, 167)\nstep 5: PRESS_HOME\nstep 6: CLICK: (393, 647)\nstep 7: CLICK: (424, 937)\nstep 8: CLICK: (469, 884)\nstep 9: CLICK: (770, 362)\nstep 10: CLICK: (749, 364)\nstep 11: CLICK: (412, 588)\nstep 12: CLICK: (750, 726)\nstep 13: CLICK: (242, 891)\nstep 14: TYPE: the next fashion show in Paris\nstep 15: CLICK: (936, 902)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['TickTick', 'DuckDuckGo']\nB: ['To-Do List', 'Firefox']\nC: ['Microsoft to do', 'Chrome']\nD: ['Any.do', 'Bing: chat with AI & GPT4']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_96_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": 
"gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Basic Calculator: GPA & Math', 'Triller']\nB: ['Calculator', 'Netflix']\nC: ['Google Drive', 'Likee']\nD: ['Clock', 'Youtube']\n", "question": "The corresponding actions are: step 1: CLICK: (331, 912)\nstep 2: TYPE: Origami figure\nstep 3: CLICK: (873, 884)\nstep 4: CLICK: (335, 613)\nstep 5: PRESS_HOME\nstep 6: CLICK: (430, 471)\nstep 7: TYPE: 630\nstep 8: CLICK: (751, 842)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (331, 912)\nstep 2: TYPE: Origami figure\nstep 3: CLICK: (873, 884)\nstep 4: CLICK: (335, 613)\nstep 5: PRESS_HOME\nstep 6: CLICK: (430, 471)\nstep 7: TYPE: 630\nstep 8: CLICK: (751, 842)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Basic Calculator: GPA & Math', 'Triller']\nB: ['Calculator', 'Netflix']\nC: ['Google Drive', 'Likee']\nD: ['Clock', 'Youtube']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_97_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_97_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_97_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_97_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_97_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_97_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_97_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_97_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_97_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": 
"GUI-Odyssey", "options": "A: ['Edge', 'Pluto TV - Live TV and Movies']\nB: ['wikiHow', 'Likee']\nC: ['Quora', 'Tubi: Movies & Live TV']\nD: ['Opera', 'Youtube']\n", "question": "The corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (824, 126)\nstep 3: CLICK: (325, 112)\nstep 4: TYPE: free horror movie on youtube\nstep 5: CLICK: (892, 903)\nstep 6: CLICK: (169, 424)\nstep 7: CLICK: (795, 114)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: PRESS_HOME\nstep 2: CLICK: (824, 126)\nstep 3: CLICK: (325, 112)\nstep 4: TYPE: free horror movie on youtube\nstep 5: CLICK: (892, 903)\nstep 6: CLICK: (169, 424)\nstep 7: CLICK: (795, 114)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Edge', 'Pluto TV - Live TV and Movies']\nB: ['wikiHow', 'Likee']\nC: ['Quora', 'Tubi: Movies & Live TV']\nD: ['Opera', 'Youtube']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_98_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_98_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_98_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_98_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_98_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_98_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_98_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_98_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Calendar', 'Wikipedia']\nB: ['Calculator', 'DuckDuckGo']\nC: ['Clock', 'Chrome']\nD: ['ClevCalc - Calculator', 'Opera']\n", 
"question": "The corresponding actions are: step 1: CLICK: (624, 813)\nstep 2: CLICK: (466, 171)\nstep 3: CLICK: (904, 121)\nstep 4: TYPE: nature sound video\nstep 5: CLICK: (912, 905)\nstep 6: CLICK: (551, 344)\nstep 7: PRESS_HOME\nstep 8: CLICK: (598, 368)\nstep 9: CLICK: (870, 140)\nstep 10: TYPE: 10000\nstep 11: CLICK: (566, 821)\nstep 12: PRESS_RECENT\nstep 13: CLICK: (52, 182)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (624, 813)\nstep 2: CLICK: (466, 171)\nstep 3: CLICK: (904, 121)\nstep 4: TYPE: nature sound video\nstep 5: CLICK: (912, 905)\nstep 6: CLICK: (551, 344)\nstep 7: PRESS_HOME\nstep 8: CLICK: (598, 368)\nstep 9: CLICK: (870, 140)\nstep 10: TYPE: 10000\nstep 11: CLICK: (566, 821)\nstep 12: PRESS_RECENT\nstep 13: CLICK: (52, 182)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Calendar', 'Wikipedia']\nB: ['Calculator', 'DuckDuckGo']\nC: ['Clock', 'Chrome']\nD: ['ClevCalc - Calculator', 'Opera']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_99_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_99_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_99_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_99_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_99_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_99_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_99_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_99_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_99_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_99_9.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_99_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_99_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_99_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_99_13.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Amazon Kindle', 'Tripadvisor']\nB: ['Kobo Books - eBooks Audiobooks', 'Picturethis']\nC: ['Google Play Books & Audiobooks', 'Setting']\nD: ['Pocket FM: Audio Series', 'Google Play Store']\n", "question": "The corresponding actions are: step 1: CLICK: (342, 266)\nstep 2: SCROLL: UP\nstep 3: CLICK: (374, 550)\nstep 4: CLICK: (902, 652)\nstep 5: PRESS_HOME\nstep 6: CLICK: (179, 268)\nstep 7: CLICK: (444, 554)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (342, 266)\nstep 2: SCROLL: UP\nstep 3: CLICK: (374, 550)\nstep 4: CLICK: (902, 652)\nstep 5: PRESS_HOME\nstep 6: CLICK: (179, 268)\nstep 7: CLICK: (444, 554)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Amazon Kindle', 'Tripadvisor']\nB: ['Kobo Books - eBooks Audiobooks', 'Picturethis']\nC: ['Google Play Books & Audiobooks', 'Setting']\nD: ['Pocket FM: Audio Series', 'Google Play Store']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_100_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_100_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_100_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_100_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_100_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_100_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_100_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_100_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Picturethis', 'Tripadvisor', 'Netflix']\nB: ['Plantin', 'Vaulty:Hide Pictures Videos', 'Tiktok']\nC: ['Google Play Store', 'Setting', 'Shorts VotTak: Short Video App']\nD: ['iNaturalist', 'Picturethis', 'Likee']\n", "question": "The corresponding actions are: step 1: CLICK: (827, 749)\nstep 2: CLICK: (830, 468)\nstep 3: PRESS_HOME\nstep 4: CLICK: (374, 746)\nstep 5: CLICK: (208, 576)\nstep 6: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (827, 749)\nstep 2: CLICK: (830, 468)\nstep 3: PRESS_HOME\nstep 4: CLICK: (374, 746)\nstep 5: CLICK: (208, 576)\nstep 6: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Picturethis', 'Tripadvisor', 'Netflix']\nB: ['Plantin', 'Vaulty:Hide Pictures Videos', 'Tiktok']\nC: ['Google Play Store', 'Setting', 'Shorts VotTak: Short Video App']\nD: ['iNaturalist', 'Picturethis', 'Likee']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_101_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_101_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_101_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_101_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_101_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_101_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['X', 'AliExpress']\nB: ['Tumblr', 'Wish']\nC: ['Gmail', 'Temu']\nD: ['Instagram', 'SSENSE']\n", "question": "The corresponding actions are: step 1: CLICK: (237, 734)\nstep 2: CLICK: (927, 701)\nstep 3: CLICK: (63, 658)\nstep 4: TYPE: action camera recommendation\nstep 5: CLICK: (748, 96)\nstep 6: PRESS_HOME\nstep 7: CLICK: (70, 580)\nstep 8: CLICK: (69, 531)\nstep 9: CLICK: (58, 123)\nstep 10: TYPE: Insta360 One\nstep 11: CLICK: (743, 34)\nstep 12: CLICK: (716, 636)\nstep 13: CLICK: (927, 644)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (237, 734)\nstep 2: CLICK: (927, 701)\nstep 3: CLICK: (63, 658)\nstep 4: TYPE: action camera recommendation\nstep 5: CLICK: (748, 96)\nstep 6: PRESS_HOME\nstep 7: CLICK: (70, 580)\nstep 8: CLICK: (69, 531)\nstep 9: CLICK: (58, 123)\nstep 10: TYPE: Insta360 One\nstep 11: CLICK: (743, 34)\nstep 12: CLICK: (716, 636)\nstep 13: CLICK: (927, 644)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['X', 'AliExpress']\nB: ['Tumblr', 'Wish']\nC: ['Gmail', 'Temu']\nD: ['Instagram', 'SSENSE']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_102_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_102_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_102_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_102_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_102_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_102_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_102_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_102_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_102_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_102_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_102_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_102_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_102_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_102_13.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Play Store', 'Youtube']\nB: ['iNaturalist', 'Tiktok']\nC: ['PlantNet', 'Netflix']\nD: ['Vaulty:Hide Pictures Videos', 'Triller']\n", "question": "The 
corresponding actions are: step 1: CLICK: (625, 844)\nstep 2: PRESS_HOME\nstep 3: CLICK: (840, 825)\nstep 4: CLICK: (819, 76)\nstep 5: CLICK: (936, 78)\nstep 6: TYPE: Apple Fitness Plus\nstep 7: CLICK: (924, 906)\nstep 8: CLICK: (721, 552)\nstep 9: CLICK: (712, 235)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (625, 844)\nstep 2: PRESS_HOME\nstep 3: CLICK: (840, 825)\nstep 4: CLICK: (819, 76)\nstep 5: CLICK: (936, 78)\nstep 6: TYPE: Apple Fitness Plus\nstep 7: CLICK: (924, 906)\nstep 8: CLICK: (721, 552)\nstep 9: CLICK: (712, 235)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Youtube']\nB: ['iNaturalist', 'Tiktok']\nC: ['PlantNet', 'Netflix']\nD: ['Vaulty:Hide Pictures Videos', 'Triller']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_103_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_103_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_103_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_103_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_103_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_103_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_103_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_103_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_103_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_103_9.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Play Store', 'Setting']\nB: ['Picturethis', 
'Picturethis']\nC: ['Applock Pro - APP Lock & Guard', 'Contacts']\nD: ['Vaulty:Hide Pictures Videos', 'Tripadvisor']\n", "question": "The corresponding actions are: step 1: CLICK: (86, 652)\nstep 2: CLICK: (279, 93)\nstep 3: TYPE: eBay\nstep 4: CLICK: (859, 887)\nstep 5: CLICK: (612, 441)\nstep 6: CLICK: (697, 535)\nstep 7: PRESS_HOME\nstep 8: CLICK: (425, 654)\nstep 9: CLICK: (205, 475)\nstep 10: CLICK: (464, 857)\nstep 11: CLICK: (916, 83)\nstep 12: TYPE: eBay\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (86, 652)\nstep 2: CLICK: (279, 93)\nstep 3: TYPE: eBay\nstep 4: CLICK: (859, 887)\nstep 5: CLICK: (612, 441)\nstep 6: CLICK: (697, 535)\nstep 7: PRESS_HOME\nstep 8: CLICK: (425, 654)\nstep 9: CLICK: (205, 475)\nstep 10: CLICK: (464, 857)\nstep 11: CLICK: (916, 83)\nstep 12: TYPE: eBay\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Setting']\nB: ['Picturethis', 'Picturethis']\nC: ['Applock Pro - APP Lock & Guard', 'Contacts']\nD: ['Vaulty:Hide Pictures Videos', 'Tripadvisor']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_104_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_104_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_104_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_104_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_104_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_104_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_104_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_104_7.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_104_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_104_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_104_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_104_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_104_12.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Applock Pro - APP Lock & Guard', 'Google Pay']\nB: ['Setting', 'Chime - Mobile Banking']\nC: ['iNaturalist', 'Cash App']\nD: ['Plantin', 'Investing.com']\n", "question": "The corresponding actions are: step 1: CLICK: (809, 503)\nstep 2: CLICK: (210, 567)\nstep 3: CLICK: (530, 564)\nstep 4: CLICK: (816, 580)\nstep 5: CLICK: (791, 687)\nstep 6: CLICK: (478, 677)\nstep 7: CLICK: (194, 677)\nstep 8: SCROLL: UP\nstep 9: CLICK: (874, 854)\nstep 10: PRESS_HOME\nstep 11: CLICK: (105, 634)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (809, 503)\nstep 2: CLICK: (210, 567)\nstep 3: CLICK: (530, 564)\nstep 4: CLICK: (816, 580)\nstep 5: CLICK: (791, 687)\nstep 6: CLICK: (478, 677)\nstep 7: CLICK: (194, 677)\nstep 8: SCROLL: UP\nstep 9: CLICK: (874, 854)\nstep 10: PRESS_HOME\nstep 11: CLICK: (105, 634)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Applock Pro - APP Lock & Guard', 'Google Pay']\nB: ['Setting', 'Chime - Mobile Banking']\nC: ['iNaturalist', 'Cash App']\nD: ['Plantin', 'Investing.com']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_105_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_105_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_105_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_105_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_105_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_105_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_105_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_105_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_105_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_105_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_105_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_105_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['X', 'Facebook']\nB: ['Threads', 'Instagram']\nC: ['Gmail', 'X']\nD: ['Instagram', 'Whatsapp']\n", "question": "The corresponding actions are: step 1: CLICK: (554, 156)\nstep 2: CLICK: (727, 70)\nstep 3: TYPE: climate change rally\nstep 4: CLICK: (860, 887)\nstep 5: CLICK: (496, 138)\nstep 6: CLICK: (332, 240)\nstep 7: 
SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: PRESS_HOME\nstep 10: CLICK: (559, 330)\nstep 11: CLICK: (407, 180)\nstep 12: CLICK: (225, 918)\nstep 13: TYPE: Honk for Climate Action will come on JUN 1\nstep 14: CLICK: (959, 488)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (554, 156)\nstep 2: CLICK: (727, 70)\nstep 3: TYPE: climate change rally\nstep 4: CLICK: (860, 887)\nstep 5: CLICK: (496, 138)\nstep 6: CLICK: (332, 240)\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: PRESS_HOME\nstep 10: CLICK: (559, 330)\nstep 11: CLICK: (407, 180)\nstep 12: CLICK: (225, 918)\nstep 13: TYPE: Honk for Climate Action will come on JUN 1\nstep 14: CLICK: (959, 488)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['X', 'Facebook']\nB: ['Threads', 'Instagram']\nC: ['Gmail', 'X']\nD: ['Instagram', 'Whatsapp']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_106_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_106_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_106_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_106_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_106_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_106_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_106_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_106_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_106_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_106_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_106_10.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_106_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_106_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_106_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_106_14.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Vaulty:Hide Pictures Videos', 'Applock Pro - APP Lock & Guard', 'Youtube']\nB: ['PlantNet', 'Contacts', 'Likee']\nC: ['TradingView: Track All Markets', 'Plantin', 'Tubi: Movies & Live TV']\nD: ['Setting', 'Google Play Store', 'Shorts VotTak: Short Video App']\n", "question": "The corresponding actions are: step 1: CLICK: (672, 909)\nstep 2: CLICK: (872, 450)\nstep 3: CLICK: (716, 29)\nstep 4: PRESS_HOME\nstep 5: CLICK: (552, 934)\nstep 6: CLICK: (918, 543)\nstep 7: CLICK: (386, 72)\nstep 8: CLICK: (483, 511)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (672, 909)\nstep 2: CLICK: (872, 450)\nstep 3: CLICK: (716, 29)\nstep 4: PRESS_HOME\nstep 5: CLICK: (552, 934)\nstep 6: CLICK: (918, 543)\nstep 7: CLICK: (386, 72)\nstep 8: CLICK: (483, 511)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Vaulty:Hide Pictures Videos', 'Applock Pro - APP Lock & Guard', 'Youtube']\nB: ['PlantNet', 'Contacts', 'Likee']\nC: ['TradingView: Track All Markets', 'Plantin', 'Tubi: Movies & Live TV']\nD: ['Setting', 'Google Play Store', 'Shorts VotTak: Short Video App']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_107_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_107_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_107_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_107_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_107_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_107_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_107_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_107_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_107_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Setting', 'Wish']\nB: ['Tripadvisor', 'Joom']\nC: ['PlantNet', 'SHEIN']\nD: ['Google Play Store', 'Lazada']\n", "question": "The corresponding actions are: step 1: CLICK: (143, 116)\nstep 2: CLICK: (535, 933)\nstep 3: CLICK: (467, 76)\nstep 4: TYPE: Lazada\nstep 5: CLICK: (897, 904)\nstep 6: CLICK: (773, 327)\nstep 7: PRESS_HOME\nstep 8: CLICK: (374, 391)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding 
actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (143, 116)\nstep 2: CLICK: (535, 933)\nstep 3: CLICK: (467, 76)\nstep 4: TYPE: Lazada\nstep 5: CLICK: (897, 904)\nstep 6: CLICK: (773, 327)\nstep 7: PRESS_HOME\nstep 8: CLICK: (374, 391)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Setting', 'Wish']\nB: ['Tripadvisor', 'Joom']\nC: ['PlantNet', 'SHEIN']\nD: ['Google Play Store', 'Lazada']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_108_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_108_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_108_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_108_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_108_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_108_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_108_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_108_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_108_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['TradingView: Track All Markets', 'TradingView: Track All Markets']\nB: ['Google Play Store', 'Setting']\nC: ['Picturethis', 'Tripadvisor']\nD: ['Tripadvisor', 'Google Play Store']\n", "question": "The corresponding actions are: step 1: CLICK: (323, 498)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (153, 937)\nstep 5: CLICK: (569, 365)\nstep 6: CLICK: (552, 426)\nstep 7: CLICK: (519, 624)\nstep 8: CLICK: (967, 72)\nstep 9: TYPE: Danish\nstep 10: CLICK: (432, 176)\nstep 11: CLICK: (454, 167)\nstep 12: SCROLL: UP\nstep 13: CLICK: (865, 655)\nstep 14: PRESS_HOME\nstep 15: CLICK: (93, 655)\nstep 16: 
COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (323, 498)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (153, 937)\nstep 5: CLICK: (569, 365)\nstep 6: CLICK: (552, 426)\nstep 7: CLICK: (519, 624)\nstep 8: CLICK: (967, 72)\nstep 9: TYPE: Danish\nstep 10: CLICK: (432, 176)\nstep 11: CLICK: (454, 167)\nstep 12: SCROLL: UP\nstep 13: CLICK: (865, 655)\nstep 14: PRESS_HOME\nstep 15: CLICK: (93, 655)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['TradingView: Track All Markets', 'TradingView: Track All Markets']\nB: ['Google Play Store', 'Setting']\nC: ['Picturethis', 'Tripadvisor']\nD: ['Tripadvisor', 'Google Play Store']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_13.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_109_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Yandex Translate', 'Spotify']\nB: ['Microsoft Translator', 'iHeart: Music, Radio, Podcasts']\nC: ['Google Translate', 'Amazon Music']\nD: ['Language Translator: Translate', 'YT Music']\n", "question": "The corresponding actions are: step 1: CLICK: (572, 494)\nstep 2: CLICK: (72, 172)\nstep 3: CLICK: (435, 213)\nstep 4: TYPE: The Music of the Night\nstep 5: CLICK: (407, 441)\nstep 6: CLICK: (129, 592)\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (794, 655)\nstep 10: CLICK: (744, 147)\nstep 11: CLICK: (192, 306)\nstep 12: TYPE: Italian\nstep 13: CLICK: (179, 329)\nstep 14: CLICK: (126, 216)\nstep 15: TYPE: Nighttime sharpens, heightens each sensation\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (572, 494)\nstep 2: CLICK: (72, 172)\nstep 3: CLICK: (435, 213)\nstep 4: TYPE: The Music of the Night\nstep 5: CLICK: (407, 441)\nstep 6: CLICK: (129, 592)\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (794, 655)\nstep 10: CLICK: (744, 147)\nstep 11: CLICK: (192, 306)\nstep 12: TYPE: Italian\nstep 13: CLICK: (179, 329)\nstep 14: CLICK: (126, 216)\nstep 15: TYPE: Nighttime sharpens, heightens each sensation\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Yandex Translate', 'Spotify']\nB: ['Microsoft Translator', 'iHeart: Music, Radio, Podcasts']\nC: ['Google Translate', 'Amazon Music']\nD: ['Language Translator: Translate', 'YT Music']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_110_15.png"], 
"output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Duolingo', 'To-Do List']\nB: ['Rosetta Stone: Learn, Practice', 'Microsoft to do']\nC: ['Babbel - Learn Languages', 'TickTick']\nD: ['Memrise: speak a new language', 'Things']\n", "question": "The corresponding actions are: step 1: CLICK: (386, 406)\nstep 2: CLICK: (78, 65)\nstep 3: CLICK: (315, 154)\nstep 4: CLICK: (163, 490)\nstep 5: CLICK: (395, 928)\nstep 6: PRESS_HOME\nstep 7: CLICK: (817, 531)\nstep 8: CLICK: (929, 875)\nstep 9: TYPE: Italin Learning\nstep 10: CLICK: (920, 646)\nstep 11: CLICK: (306, 377)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (386, 406)\nstep 2: CLICK: (78, 65)\nstep 3: CLICK: (315, 154)\nstep 4: CLICK: (163, 490)\nstep 5: CLICK: (395, 928)\nstep 6: PRESS_HOME\nstep 7: CLICK: (817, 531)\nstep 8: CLICK: (929, 875)\nstep 9: TYPE: Italin Learning\nstep 10: CLICK: (920, 646)\nstep 11: CLICK: (306, 377)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Duolingo', 'To-Do List']\nB: ['Rosetta Stone: Learn, Practice', 'Microsoft to do']\nC: ['Babbel - Learn Languages', 'TickTick']\nD: ['Memrise: speak a new language', 'Things']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_111_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_111_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_111_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_111_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_111_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_111_5.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_111_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_111_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_111_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_111_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_111_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_111_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Uber', 'Yandex Navigator']\nB: ['Waze Navigation & Live Traffic', 'Citymapper']\nC: ['Yandex Navigator', 'Maps']\nD: ['Google Map', 'Lyft']\n", "question": "The corresponding actions are: step 1: CLICK: (422, 271)\nstep 2: CLICK: (141, 73)\nstep 3: TYPE: bakery\nstep 4: CLICK: (169, 246)\nstep 5: PRESS_HOME\nstep 6: CLICK: (583, 413)\nstep 7: CLICK: (339, 668)\nstep 8: TYPE: 19459 Stevens Creek Blvd #100\nstep 9: CLICK: (435, 281)\nstep 10: CLICK: (474, 889)\nstep 11: CLICK: (466, 903)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (422, 271)\nstep 2: CLICK: (141, 73)\nstep 3: TYPE: bakery\nstep 4: CLICK: (169, 246)\nstep 5: PRESS_HOME\nstep 6: CLICK: (583, 413)\nstep 7: CLICK: (339, 668)\nstep 8: TYPE: 19459 Stevens Creek Blvd #100\nstep 9: CLICK: (435, 281)\nstep 10: CLICK: (474, 889)\nstep 11: CLICK: (466, 903)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Uber', 'Yandex Navigator']\nB: ['Waze Navigation & Live Traffic', 'Citymapper']\nC: ['Yandex Navigator', 'Maps']\nD: ['Google Map', 'Lyft']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_112_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_112_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_112_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_112_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_112_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_112_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_112_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_112_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_112_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_112_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_112_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_112_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Facebook', 'Threads']\nB: ['Gmail', 'Whatsapp']\nC: ['X', 'Facebook']\nD: ['Threads', 'Gmail']\n", "question": "The corresponding actions are: step 1: CLICK: (384, 222)\nstep 2: CLICK: (500, 934)\nstep 3: TYPE: a\nstep 4: CLICK: (916, 592)\nstep 5: CLICK: (437, 393)\nstep 6: CLICK: (439, 915)\nstep 7: 
PRESS_HOME\nstep 8: CLICK: (366, 94)\nstep 9: CLICK: (260, 107)\nstep 10: CLICK: (474, 136)\nstep 11: CLICK: (299, 224)\nstep 12: CLICK: (586, 479)\nstep 13: CLICK: (403, 937)\nstep 14: CLICK: (474, 663)\nstep 15: CLICK: (912, 605)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (384, 222)\nstep 2: CLICK: (500, 934)\nstep 3: TYPE: a\nstep 4: CLICK: (916, 592)\nstep 5: CLICK: (437, 393)\nstep 6: CLICK: (439, 915)\nstep 7: PRESS_HOME\nstep 8: CLICK: (366, 94)\nstep 9: CLICK: (260, 107)\nstep 10: CLICK: (474, 136)\nstep 11: CLICK: (299, 224)\nstep 12: CLICK: (586, 479)\nstep 13: CLICK: (403, 937)\nstep 14: CLICK: (474, 663)\nstep 15: CLICK: (912, 605)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Facebook', 'Threads']\nB: ['Gmail', 'Whatsapp']\nC: ['X', 'Facebook']\nD: ['Threads', 'Gmail']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_10.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_113_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['TradingView: Track All Markets', 'Tubi: Movies & Live TV', 'TradingView: Track All Markets']\nB: ['iNaturalist', 'Pluto TV - Live TV and Movies', 'Vaulty:Hide Pictures Videos']\nC: ['Setting', 'Tiktok', 'Google Play Store']\nD: ['Google Play Store', 'Netflix', 'PlantNet']\n", "question": "The corresponding actions are: step 1: CLICK: (124, 705)\nstep 2: CLICK: (653, 379)\nstep 3: PRESS_HOME\nstep 4: CLICK: (434, 710)\nstep 5: SCROLL: UP\nstep 6: SCROLL: DOWN\nstep 7: CLICK: (152, 414)\nstep 8: CLICK: (483, 352)\nstep 9: CLICK: (424, 580)\nstep 10: CLICK: (936, 474)\nstep 11: CLICK: (394, 74)\nstep 12: CLICK: (478, 456)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (124, 705)\nstep 2: CLICK: (653, 379)\nstep 3: PRESS_HOME\nstep 4: CLICK: (434, 710)\nstep 5: SCROLL: UP\nstep 6: SCROLL: DOWN\nstep 7: CLICK: (152, 414)\nstep 8: CLICK: (483, 352)\nstep 9: CLICK: (424, 580)\nstep 10: CLICK: (936, 474)\nstep 11: CLICK: (394, 74)\nstep 12: CLICK: (478, 456)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['TradingView: Track All Markets', 'Tubi: Movies & Live TV', 'TradingView: Track All Markets']\nB: ['iNaturalist', 'Pluto TV - Live TV and Movies', 'Vaulty:Hide Pictures Videos']\nC: ['Setting', 'Tiktok', 'Google Play Store']\nD: ['Google Play Store', 'Netflix', 'PlantNet']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_114_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_114_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_114_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_114_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_114_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_114_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_114_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_114_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_114_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_114_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_114_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_114_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_114_12.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['wikiHow', 'Applock Pro - APP Lock & Guard']\nB: ['Opera', 'PlantNet']\nC: ['Wikipedia', 'Plantin']\nD: 
['Firefox', 'TradingView: Track All Markets']\n", "question": "The corresponding actions are: step 1: CLICK: (152, 237)\nstep 2: CLICK: (344, 56)\nstep 3: TYPE: Alibaba's stock market news\nstep 4: CLICK: (943, 908)\nstep 5: CLICK: (316, 321)\nstep 6: PRESS_HOME\nstep 7: CLICK: (127, 500)\nstep 8: CLICK: (921, 56)\nstep 9: TYPE: Alibaba\nstep 10: CLICK: (421, 157)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (152, 237)\nstep 2: CLICK: (344, 56)\nstep 3: TYPE: Alibaba's stock market news\nstep 4: CLICK: (943, 908)\nstep 5: CLICK: (316, 321)\nstep 6: PRESS_HOME\nstep 7: CLICK: (127, 500)\nstep 8: CLICK: (921, 56)\nstep 9: TYPE: Alibaba\nstep 10: CLICK: (421, 157)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['wikiHow', 'Applock Pro - APP Lock & Guard']\nB: ['Opera', 'PlantNet']\nC: ['Wikipedia', 'Plantin']\nD: ['Firefox', 'TradingView: Track All Markets']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_115_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_115_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_115_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_115_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_115_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_115_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_115_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_115_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_115_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_115_9.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_115_10.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Wikipedia', 'X']\nB: ['Firefox', 'Whatsapp']\nC: ['Chrome', 'Messenger']\nD: ['DuckDuckgo', 'Instagram']\n", "question": "The corresponding actions are: step 1: CLICK: (380, 670)\nstep 2: TYPE: sports game events\nstep 3: CLICK: (931, 927)\nstep 4: CLICK: (333, 454)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (581, 147)\nstep 8: CLICK: (922, 67)\nstep 9: CLICK: (302, 143)\nstep 10: CLICK: (141, 193)\nstep 11: CLICK: (161, 955)\nstep 12: TYPE: In the MLB,Twins:White Sox is 10:5\nstep 13: CLICK: (904, 635)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (380, 670)\nstep 2: TYPE: sports game events\nstep 3: CLICK: (931, 927)\nstep 4: CLICK: (333, 454)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (581, 147)\nstep 8: CLICK: (922, 67)\nstep 9: CLICK: (302, 143)\nstep 10: CLICK: (141, 193)\nstep 11: CLICK: (161, 955)\nstep 12: TYPE: In the MLB,Twins:White Sox is 10:5\nstep 13: CLICK: (904, 635)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Wikipedia', 'X']\nB: ['Firefox', 'Whatsapp']\nC: ['Chrome', 'Messenger']\nD: ['DuckDuckgo', 'Instagram']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_116_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_116_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_116_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_116_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_116_4.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_116_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_116_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_116_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_116_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_116_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_116_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_116_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_116_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_116_13.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Any.do', 'Tubi: Movies & Live TV']\nB: ['Todoist', 'Netflix']\nC: ['TickTick', 'Triller']\nD: ['Things', 'Tiktok']\n", "question": "The corresponding actions are: step 1: CLICK: (151, 364)\nstep 2: CLICK: (853, 922)\nstep 3: CLICK: (340, 480)\nstep 4: TYPE: do yoga in the morning\nstep 5: CLICK: (493, 367)\nstep 6: PRESS_HOME\nstep 7: CLICK: (809, 807)\nstep 8: CLICK: (238, 332)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (151, 364)\nstep 2: CLICK: (853, 922)\nstep 3: CLICK: (340, 480)\nstep 4: TYPE: do yoga in the morning\nstep 5: CLICK: (493, 367)\nstep 6: PRESS_HOME\nstep 7: CLICK: (809, 807)\nstep 8: CLICK: (238, 332)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Any.do', 'Tubi: Movies & Live TV']\nB: ['Todoist', 'Netflix']\nC: ['TickTick', 'Triller']\nD: ['Things', 'Tiktok']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_117_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_117_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_117_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_117_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_117_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_117_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_117_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_117_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_117_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Adobe Express: AI Video Design', 'Tumblr']\nB: ['Gallery-photo gallery,album', 'Whatsapp']\nC: ['Lightroom Photo & Video Editor', 'Messenger']\nD: ['Textify- Art Font Photo Editor', 'X']\n", "question": "The corresponding actions are: step 1: CLICK: (808, 503)\nstep 2: CLICK: (497, 918)\nstep 3: CLICK: (657, 572)\nstep 4: CLICK: (105, 894)\nstep 5: CLICK: (33, 464)\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (955, 22)\nstep 8: CLICK: (879, 53)\nstep 9: CLICK: (336, 654)\nstep 10: CLICK: (494, 704)\nstep 11: CLICK: (951, 911)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given 
screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (808, 503)\nstep 2: CLICK: (497, 918)\nstep 3: CLICK: (657, 572)\nstep 4: CLICK: (105, 894)\nstep 5: CLICK: (33, 464)\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (955, 22)\nstep 8: CLICK: (879, 53)\nstep 9: CLICK: (336, 654)\nstep 10: CLICK: (494, 704)\nstep 11: CLICK: (951, 911)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Adobe Express: AI Video Design', 'Tumblr']\nB: ['Gallery-photo gallery,album', 'Whatsapp']\nC: ['Lightroom Photo & Video Editor', 'Messenger']\nD: ['Textify- Art Font Photo Editor', 'X']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_118_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_118_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_118_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_118_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_118_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_118_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_118_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_118_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_118_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_118_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_118_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_118_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Chatbot AI & Smart Assistant', 'Chrome']\nB: ['WOMBO Dream-AI Art Generator', 'Firefox']\nC: ['ChatGPT', 'Edge']\nD: ['GenZArt:Fast AI Art Generator', 'Opera']\n", "question": "The corresponding 
actions are: step 1: CLICK: (429, 423)\nstep 2: CLICK: (360, 873)\nstep 3: TYPE: tell me about Binomial theorem\nstep 4: CLICK: (696, 429)\nstep 5: PRESS_HOME\nstep 6: CLICK: (511, 907)\nstep 7: CLICK: (282, 363)\nstep 8: TYPE: Binomial theorem\nstep 9: CLICK: (887, 681)\nstep 10: SCROLL: UP\nstep 11: SCROLL: UP\nstep 12: CLICK: (284, 839)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (429, 423)\nstep 2: CLICK: (360, 873)\nstep 3: TYPE: tell me about Binomial theorem\nstep 4: CLICK: (696, 429)\nstep 5: PRESS_HOME\nstep 6: CLICK: (511, 907)\nstep 7: CLICK: (282, 363)\nstep 8: TYPE: Binomial theorem\nstep 9: CLICK: (887, 681)\nstep 10: SCROLL: UP\nstep 11: SCROLL: UP\nstep 12: CLICK: (284, 839)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Chatbot AI & Smart Assistant', 'Chrome']\nB: ['WOMBO Dream-AI Art Generator', 'Firefox']\nC: ['ChatGPT', 'Edge']\nD: ['GenZArt:Fast AI Art Generator', 'Opera']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_119_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_119_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_119_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_119_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_119_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_119_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_119_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_119_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_119_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_119_9.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_119_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_119_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_119_12.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Pluto TV - Live TV and Movies', 'Simplenote']\nB: ['Youtube', 'WPS office']\nC: ['Triller', 'BasicNote - Notes, Notepad']\nD: ['Tiktok', 'Notepad - Notes and To Do List']\n", "question": "The corresponding actions are: step 1: CLICK: (592, 892)\nstep 2: CLICK: (703, 256)\nstep 3: CLICK: (101, 266)\nstep 4: PRESS_HOME\nstep 5: CLICK: (710, 397)\nstep 6: CLICK: (31, 436)\nstep 7: CLICK: (240, 498)\nstep 8: CLICK: (648, 562)\nstep 9: CLICK: (971, 472)\nstep 10: CLICK: (21, 63)\nstep 11: CLICK: (108, 396)\nstep 12: CLICK: (511, 506)\nstep 13: TYPE: Smart Cities: Technology and Urban Planning\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (592, 892)\nstep 2: CLICK: (703, 256)\nstep 3: CLICK: (101, 266)\nstep 4: PRESS_HOME\nstep 5: CLICK: (710, 397)\nstep 6: CLICK: (31, 436)\nstep 7: CLICK: (240, 498)\nstep 8: CLICK: (648, 562)\nstep 9: CLICK: (971, 472)\nstep 10: CLICK: (21, 63)\nstep 11: CLICK: (108, 396)\nstep 12: CLICK: (511, 506)\nstep 13: TYPE: Smart Cities: Technology and Urban Planning\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Pluto TV - Live TV and Movies', 'Simplenote']\nB: ['Youtube', 'WPS office']\nC: ['Triller', 'BasicNote - Notes, Notepad']\nD: ['Tiktok', 'Notepad - Notes and To Do List']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_120_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_120_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_120_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_120_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_120_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_120_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_120_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_120_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_120_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_120_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_120_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_120_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_120_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_120_13.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Bing: chat with AI & GPT4', 'Triller', 'PlantNet']\nB: 
['DuckDuckGo', 'Tiktok', 'Google Play Store']\nC: ['Opera', 'Youtube', 'Setting']\nD: ['Firefox', 'Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos']\n", "question": "The corresponding actions are: step 1: CLICK: (819, 144)\nstep 2: CLICK: (683, 73)\nstep 3: TYPE: DIY crafts blogs on youtube\nstep 4: CLICK: (924, 907)\nstep 5: CLICK: (201, 347)\nstep 6: CLICK: (851, 127)\nstep 7: PRESS_HOME\nstep 8: CLICK: (384, 839)\nstep 9: CLICK: (333, 328)\nstep 10: CLICK: (473, 84)\nstep 11: SCROLL: RIGHT\nstep 12: CLICK: (940, 324)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (819, 144)\nstep 2: CLICK: (683, 73)\nstep 3: TYPE: DIY crafts blogs on youtube\nstep 4: CLICK: (924, 907)\nstep 5: CLICK: (201, 347)\nstep 6: CLICK: (851, 127)\nstep 7: PRESS_HOME\nstep 8: CLICK: (384, 839)\nstep 9: CLICK: (333, 328)\nstep 10: CLICK: (473, 84)\nstep 11: SCROLL: RIGHT\nstep 12: CLICK: (940, 324)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Bing: chat with AI & GPT4', 'Triller', 'PlantNet']\nB: ['DuckDuckGo', 'Tiktok', 'Google Play Store']\nC: ['Opera', 'Youtube', 'Setting']\nD: ['Firefox', 'Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_121_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_121_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_121_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_121_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_121_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_121_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_121_6.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_121_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_121_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_121_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_121_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_121_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_121_12.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Pluto TV - Live TV and Movies', 'Plantin', 'iNaturalist']\nB: ['Tubi: Movies & Live TV', 'Contacts', 'Applock Pro - APP Lock & Guard']\nC: ['Tiktok', 'Applock Pro - APP Lock & Guard', 'Vaulty:Hide Pictures Videos']\nD: ['Triller', 'Google Play Store', 'Setting']\n", "question": "The corresponding actions are: step 1: CLICK: (851, 839)\nstep 2: CLICK: (848, 330)\nstep 3: PRESS_HOME\nstep 4: CLICK: (375, 836)\nstep 5: CLICK: (875, 404)\nstep 6: CLICK: (52, 78)\nstep 7: CLICK: (199, 387)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (851, 839)\nstep 2: CLICK: (848, 330)\nstep 3: PRESS_HOME\nstep 4: CLICK: (375, 836)\nstep 5: CLICK: (875, 404)\nstep 6: CLICK: (52, 78)\nstep 7: CLICK: (199, 387)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Pluto TV - Live TV and Movies', 'Plantin', 'iNaturalist']\nB: ['Tubi: Movies & Live TV', 'Contacts', 'Applock Pro - APP Lock & Guard']\nC: ['Tiktok', 'Applock Pro - APP Lock & Guard', 'Vaulty:Hide Pictures Videos']\nD: ['Triller', 'Google Play Store', 'Setting']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_122_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_122_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_122_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_122_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_122_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_122_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_122_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_122_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Docs', 'wikiHow']\nB: ['Microsoft Word', 'Bing: chat with AI & GPT4']\nC: ['Dropbox Paper', 'Opera']\nD: ['WPS office', 'Firefox']\n", "question": "The corresponding actions are: step 1: CLICK: (705, 217)\nstep 2: CLICK: (286, 251)\nstep 3: CLICK: (916, 154)\nstep 4: TYPE: 2022 nobel prize winners in physics\nstep 5: CLICK: (897, 859)\nstep 6: LONG_PRESS: (99, 549)\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (144, 489)\nstep 9: PRESS_HOME\nstep 10: CLICK: (844, 499)\nstep 11: CLICK: (236, 357)\nstep 12: CLICK: (236, 357)\nstep 13: TYPE: 2022 nobel prize winners in physics:\nstep 14: CLICK: (561, 
559)\nstep 15: CLICK: (99, 78)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (705, 217)\nstep 2: CLICK: (286, 251)\nstep 3: CLICK: (916, 154)\nstep 4: TYPE: 2022 nobel prize winners in physics\nstep 5: CLICK: (897, 859)\nstep 6: LONG_PRESS: (99, 549)\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (144, 489)\nstep 9: PRESS_HOME\nstep 10: CLICK: (844, 499)\nstep 11: CLICK: (236, 357)\nstep 12: CLICK: (236, 357)\nstep 13: TYPE: 2022 nobel prize winners in physics:\nstep 14: CLICK: (561, 559)\nstep 15: CLICK: (99, 78)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Docs', 'wikiHow']\nB: ['Microsoft Word', 'Bing: chat with AI & GPT4']\nC: ['Dropbox Paper', 'Opera']\nD: ['WPS office', 'Firefox']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_12.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_123_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Youtube', 'Google Play Store']\nB: ['Triller', 'Tripadvisor']\nC: ['Netflix', 'TradingView: Track All Markets']\nD: ['Tiktok', 'iNaturalist']\n", "question": "The corresponding actions are: step 1: CLICK: (127, 745)\nstep 2: PRESS_HOME\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (127, 604)\nstep 5: CLICK: (788, 74)\nstep 6: TYPE: Apple Fitness Plus\nstep 7: CLICK: (924, 864)\nstep 8: CLICK: (652, 693)\nstep 9: CLICK: (605, 496)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (127, 745)\nstep 2: PRESS_HOME\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (127, 604)\nstep 5: CLICK: (788, 74)\nstep 6: TYPE: Apple Fitness Plus\nstep 7: CLICK: (924, 864)\nstep 8: CLICK: (652, 693)\nstep 9: CLICK: (605, 496)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Youtube', 'Google Play Store']\nB: ['Triller', 'Tripadvisor']\nC: ['Netflix', 'TradingView: Track All Markets']\nD: ['Tiktok', 'iNaturalist']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_124_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_124_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_124_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_124_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_124_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_124_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_124_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_124_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_124_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_124_9.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Amazon Music', 'To-Do List']\nB: ['Spotify', 'TickTick']\nC: ['iHeart: Music, Radio, Podcasts', 'Things']\nD: ['Pandora', 'Microsoft to do']\n", "question": "The corresponding actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (134, 107)\nstep 3: CLICK: (431, 464)\nstep 4: CLICK: (694, 858)\nstep 5: PRESS_HOME\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (590, 387)\nstep 8: CLICK: (873, 862)\nstep 9: CLICK: (670, 690)\nstep 10: TYPE: do yoga with this\nstep 11: CLICK: (900, 632)\nstep 12: COMPLETE\nWhich app-combination list was 
used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (134, 107)\nstep 3: CLICK: (431, 464)\nstep 4: CLICK: (694, 858)\nstep 5: PRESS_HOME\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (590, 387)\nstep 8: CLICK: (873, 862)\nstep 9: CLICK: (670, 690)\nstep 10: TYPE: do yoga with this\nstep 11: CLICK: (900, 632)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Amazon Music', 'To-Do List']\nB: ['Spotify', 'TickTick']\nC: ['iHeart: Music, Radio, Podcasts', 'Things']\nD: ['Pandora', 'Microsoft to do']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_125_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_125_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_125_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_125_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_125_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_125_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_125_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_125_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_125_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_125_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_125_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_125_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Messenger', 'Instagram']\nB: ['Whatsapp', 'Whatsapp']\nC: ['Tumblr', 'Facebook']\nD: ['Instagram', 'Tumblr']\n", "question": "The corresponding actions are: step 1: CLICK: (565, 
133)\nstep 2: CLICK: (718, 78)\nstep 3: TYPE: political debate\nstep 4: CLICK: (841, 878)\nstep 5: CLICK: (497, 139)\nstep 6: CLICK: (553, 447)\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: PRESS_HOME\nstep 10: CLICK: (666, 310)\nstep 11: CLICK: (566, 907)\nstep 12: CLICK: (404, 260)\nstep 13: CLICK: (464, 839)\nstep 14: TYPE: BBC's political debate will come at 6:30PM today\nstep 15: CLICK: (763, 488)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (565, 133)\nstep 2: CLICK: (718, 78)\nstep 3: TYPE: political debate\nstep 4: CLICK: (841, 878)\nstep 5: CLICK: (497, 139)\nstep 6: CLICK: (553, 447)\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: PRESS_HOME\nstep 10: CLICK: (666, 310)\nstep 11: CLICK: (566, 907)\nstep 12: CLICK: (404, 260)\nstep 13: CLICK: (464, 839)\nstep 14: TYPE: BBC's political debate will come at 6:30PM today\nstep 15: CLICK: (763, 488)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Messenger', 'Instagram']\nB: ['Whatsapp', 'Whatsapp']\nC: ['Tumblr', 'Facebook']\nD: ['Instagram', 'Tumblr']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_7.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_126_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Facebook', 'Zoho Meeting']\nB: ['X', 'Google Meet']\nC: ['Instagram', 'ZOOM Cloud Meetings']\nD: ['Threads', 'Microsoft Teams']\n", "question": "The corresponding actions are: step 1: CLICK: (694, 653)\nstep 2: CLICK: (376, 869)\nstep 3: CLICK: (198, 250)\nstep 4: CLICK: (820, 724)\nstep 5: PRESS_HOME\nstep 6: CLICK: (572, 315)\nstep 7: CLICK: (894, 907)\nstep 8: CLICK: (360, 196)\nstep 9: CLICK: (426, 908)\nstep 10: TYPE: meet.google.com/adk-mceh-cwe\nstep 11: CLICK: (940, 485)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (694, 653)\nstep 2: CLICK: (376, 869)\nstep 3: CLICK: (198, 250)\nstep 4: CLICK: (820, 724)\nstep 5: PRESS_HOME\nstep 6: CLICK: (572, 315)\nstep 7: CLICK: (894, 907)\nstep 8: CLICK: (360, 196)\nstep 9: CLICK: (426, 908)\nstep 10: TYPE: meet.google.com/adk-mceh-cwe\nstep 11: CLICK: (940, 485)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Facebook', 'Zoho Meeting']\nB: ['X', 'Google Meet']\nC: ['Instagram', 'ZOOM Cloud Meetings']\nD: ['Threads', 'Microsoft Teams']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_127_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_127_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_127_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_127_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_127_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_127_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_127_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_127_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_127_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_127_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_127_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_127_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Instagram', 'YT Music']\nB: ['Messenger', 'Spotify']\nC: ['Threads', 'Pandora']\nD: ['Facebook', 'Amazon Music']\n", "question": "The corresponding actions are: step 1: CLICK: (630, 501)\nstep 2: CLICK: (381, 909)\nstep 3: CLICK: (469, 317)\nstep 4: TYPE: punk\nstep 5: CLICK: (941, 872)\nstep 6: CLICK: (430, 254)\nstep 7: 
CLICK: (690, 492)\nstep 8: CLICK: (909, 617)\nstep 9: PRESS_HOME\nstep 10: CLICK: (849, 150)\nstep 11: CLICK: (568, 624)\nstep 12: CLICK: (622, 892)\nstep 13: TYPE: Punk Goes 80's\nstep 14: CLICK: (926, 492)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (630, 501)\nstep 2: CLICK: (381, 909)\nstep 3: CLICK: (469, 317)\nstep 4: TYPE: punk\nstep 5: CLICK: (941, 872)\nstep 6: CLICK: (430, 254)\nstep 7: CLICK: (690, 492)\nstep 8: CLICK: (909, 617)\nstep 9: PRESS_HOME\nstep 10: CLICK: (849, 150)\nstep 11: CLICK: (568, 624)\nstep 12: CLICK: (622, 892)\nstep 13: TYPE: Punk Goes 80's\nstep 14: CLICK: (926, 492)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Instagram', 'YT Music']\nB: ['Messenger', 'Spotify']\nC: ['Threads', 'Pandora']\nD: ['Facebook', 'Amazon Music']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_128_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_128_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_128_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_128_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_128_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_128_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_128_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_128_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_128_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_128_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_128_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_128_11.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_128_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_128_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_128_14.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['iNaturalist', 'Cash App']\nB: ['Plantin', 'Google Wallet']\nC: ['Google Play Store', 'Venmo']\nD: ['Applock Pro - APP Lock & Guard', 'PayPal - Send, Shop, Manage']\n", "question": "The corresponding actions are: step 1: CLICK: (787, 324)\nstep 2: CLICK: (400, 683)\nstep 3: CLICK: (505, 695)\nstep 4: CLICK: (605, 697)\nstep 5: CLICK: (387, 882)\nstep 6: CLICK: (484, 901)\nstep 7: CLICK: (601, 898)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (727, 878)\nstep 11: PRESS_HOME\nstep 12: CLICK: (683, 498)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (787, 324)\nstep 2: CLICK: (400, 683)\nstep 3: CLICK: (505, 695)\nstep 4: CLICK: (605, 697)\nstep 5: CLICK: (387, 882)\nstep 6: CLICK: (484, 901)\nstep 7: CLICK: (601, 898)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (727, 878)\nstep 11: PRESS_HOME\nstep 12: CLICK: (683, 498)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['iNaturalist', 'Cash App']\nB: ['Plantin', 'Google Wallet']\nC: ['Google Play Store', 'Venmo']\nD: ['Applock Pro - APP Lock & Guard', 'PayPal - Send, Shop, Manage']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_129_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_129_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_129_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_129_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_129_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_129_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_129_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_129_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_129_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_129_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_129_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_129_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_129_12.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['To-Do List', 'Pandora']\nB: ['Todoist', 'Amazon Music']\nC: ['Things', 'YT Music']\nD: ['TickTick', 'Spotify']\n", "question": "The corresponding actions are: step 1: CLICK: (610, 243)\nstep 2: CLICK: (822, 504)\nstep 
3: CLICK: (85, 854)\nstep 4: PRESS_HOME\nstep 5: CLICK: (136, 381)\nstep 6: CLICK: (904, 927)\nstep 7: CLICK: (289, 473)\nstep 8: CLICK: (525, 647)\nstep 9: TYPE: do yoga with this\nstep 10: CLICK: (472, 431)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (610, 243)\nstep 2: CLICK: (822, 504)\nstep 3: CLICK: (85, 854)\nstep 4: PRESS_HOME\nstep 5: CLICK: (136, 381)\nstep 6: CLICK: (904, 927)\nstep 7: CLICK: (289, 473)\nstep 8: CLICK: (525, 647)\nstep 9: TYPE: do yoga with this\nstep 10: CLICK: (472, 431)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['To-Do List', 'Pandora']\nB: ['Todoist', 'Amazon Music']\nC: ['Things', 'YT Music']\nD: ['TickTick', 'Spotify']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_130_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_130_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_130_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_130_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_130_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_130_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_130_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_130_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_130_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_130_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_130_10.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Setting', 'Shorts VotTak: Short Video App', 
'Google Play Store']\nB: ['PlantNet', 'Youtube', 'Picturethis']\nC: ['Picturethis', 'Likee', 'Applock Pro - APP Lock & Guard']\nD: ['iNaturalist', 'Triller', 'Setting']\n", "question": "The corresponding actions are: step 1: CLICK: (140, 706)\nstep 2: CLICK: (23, 66)\nstep 3: PRESS_HOME\nstep 4: CLICK: (436, 708)\nstep 5: PRESS_HOME\nstep 6: CLICK: (142, 703)\nstep 7: CLICK: (148, 63)\nstep 8: CLICK: (977, 61)\nstep 9: TYPE: vottak\nstep 10: CLICK: (258, 128)\nstep 11: CLICK: (692, 337)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (140, 706)\nstep 2: CLICK: (23, 66)\nstep 3: PRESS_HOME\nstep 4: CLICK: (436, 708)\nstep 5: PRESS_HOME\nstep 6: CLICK: (142, 703)\nstep 7: CLICK: (148, 63)\nstep 8: CLICK: (977, 61)\nstep 9: TYPE: vottak\nstep 10: CLICK: (258, 128)\nstep 11: CLICK: (692, 337)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Setting', 'Shorts VotTak: Short Video App', 'Google Play Store']\nB: ['PlantNet', 'Youtube', 'Picturethis']\nC: ['Picturethis', 'Likee', 'Applock Pro - APP Lock & Guard']\nD: ['iNaturalist', 'Triller', 'Setting']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_131_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_131_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_131_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_131_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_131_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_131_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_131_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_131_7.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_131_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_131_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_131_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_131_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Gmail', 'Chatty - AI Assistant']\nB: ['Facebook', 'GenZArt:Fast AI Art Generator']\nC: ['Threads', 'Chatbot AI & Smart Assistant']\nD: ['Instagram', 'ChatGPT']\n", "question": "The corresponding actions are: step 1: CLICK: (161, 515)\nstep 2: CLICK: (516, 899)\nstep 3: CLICK: (393, 444)\nstep 4: TYPE: Sunflower\nstep 5: CLICK: (930, 882)\nstep 6: CLICK: (702, 787)\nstep 7: CLICK: (855, 734)\nstep 8: CLICK: (308, 829)\nstep 9: CLICK: (862, 146)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (161, 515)\nstep 2: CLICK: (516, 899)\nstep 3: CLICK: (393, 444)\nstep 4: TYPE: Sunflower\nstep 5: CLICK: (930, 882)\nstep 6: CLICK: (702, 787)\nstep 7: CLICK: (855, 734)\nstep 8: CLICK: (308, 829)\nstep 9: CLICK: (862, 146)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Gmail', 'Chatty - AI Assistant']\nB: ['Facebook', 'GenZArt:Fast AI Art Generator']\nC: ['Threads', 'Chatbot AI & Smart Assistant']\nD: ['Instagram', 'ChatGPT']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_132_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_132_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_132_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_132_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_132_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_132_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_132_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_132_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_132_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_132_9.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Target', 'Tumblr']\nB: ['Net-a-Porte', 'X']\nC: ['Amazon', 'Instagram']\nD: ['DealMoon', 'Whatsapp']\n", "question": "The corresponding actions are: step 1: CLICK: (233, 576)\nstep 2: CLICK: (360, 960)\nstep 3: CLICK: (71, 705)\nstep 4: TYPE: instant camera recommendation\nstep 5: CLICK: (745, 78)\nstep 6: CLICK: (741, 765)\nstep 7: PRESS_BACK\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: IMPOSSIBLE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are 
given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (233, 576)\nstep 2: CLICK: (360, 960)\nstep 3: CLICK: (71, 705)\nstep 4: TYPE: instant camera recommendation\nstep 5: CLICK: (745, 78)\nstep 6: CLICK: (741, 765)\nstep 7: PRESS_BACK\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: IMPOSSIBLE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Target', 'Tumblr']\nB: ['Net-a-Porte', 'X']\nC: ['Amazon', 'Instagram']\nD: ['DealMoon', 'Whatsapp']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_133_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_133_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_133_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_133_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_133_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_133_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_133_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_133_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_133_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_133_9.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Triller', 'Plantin']\nB: ['Likee', 'iNaturalist']\nC: ['Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos']\nD: ['Youtube', 'Google Play Store']\n", "question": "The corresponding actions are: step 1: CLICK: (624, 809)\nstep 2: PRESS_HOME\nstep 3: CLICK: (838, 809)\nstep 4: CLICK: (474, 65)\nstep 5: TYPE: Aaptiv\nstep 6: CLICK: (921, 912)\nstep 7: CLICK: (819, 323)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now 
you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (624, 809)\nstep 2: PRESS_HOME\nstep 3: CLICK: (838, 809)\nstep 4: CLICK: (474, 65)\nstep 5: TYPE: Aaptiv\nstep 6: CLICK: (921, 912)\nstep 7: CLICK: (819, 323)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Triller', 'Plantin']\nB: ['Likee', 'iNaturalist']\nC: ['Tubi: Movies & Live TV', 'Vaulty:Hide Pictures Videos']\nD: ['Youtube', 'Google Play Store']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_134_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_134_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_134_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_134_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_134_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_134_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_134_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_134_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Firefox', 'Tripadvisor']\nB: ['Chrome', 'TradingView: Track All Markets']\nC: ['Bing: chat with AI & GPT4', 'iNaturalist']\nD: ['Edge', 'Applock Pro - APP Lock & Guard']\n", "question": "The corresponding actions are: step 1: CLICK: (440, 914)\nstep 2: CLICK: (274, 415)\nstep 3: TYPE: Facebook's stock market news\nstep 4: CLICK: (884, 878)\nstep 5: SCROLL: UP\nstep 6: CLICK: (246, 786)\nstep 7: PRESS_HOME\nstep 8: CLICK: (913, 494)\nstep 9: CLICK: (379, 74)\nstep 10: TYPE: Facebook\nstep 11: CLICK: (153, 223)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given 
screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (440, 914)\nstep 2: CLICK: (274, 415)\nstep 3: TYPE: Facebook's stock market news\nstep 4: CLICK: (884, 878)\nstep 5: SCROLL: UP\nstep 6: CLICK: (246, 786)\nstep 7: PRESS_HOME\nstep 8: CLICK: (913, 494)\nstep 9: CLICK: (379, 74)\nstep 10: TYPE: Facebook\nstep 11: CLICK: (153, 223)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Firefox', 'Tripadvisor']\nB: ['Chrome', 'TradingView: Track All Markets']\nC: ['Bing: chat with AI & GPT4', 'iNaturalist']\nD: ['Edge', 'Applock Pro - APP Lock & Guard']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_135_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_135_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_135_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_135_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_135_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_135_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_135_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_135_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_135_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_135_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_135_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_135_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Bing: chat with AI & GPT4', 'Threads']\nB: ['Chrome', 'Gmail']\nC: ['Wikipedia', 'Facebook']\nD: ['Opera', 'Instagram']\n", "question": "The corresponding actions are: step 1: CLICK: (210, 645)\nstep 2: CLICK: 
(298, 411)\nstep 3: TYPE: food festival events\nstep 4: CLICK: (856, 897)\nstep 5: PRESS_HOME\nstep 6: CLICK: (839, 338)\nstep 7: CLICK: (31, 189)\nstep 8: CLICK: (43, 164)\nstep 9: TYPE: caba62244@gmail.com\nstep 10: CLICK: (448, 355)\nstep 11: CLICK: (352, 420)\nstep 12: TYPE: Assyrian Food Festival will come on AUG 17\nstep 13: CLICK: (904, 76)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (210, 645)\nstep 2: CLICK: (298, 411)\nstep 3: TYPE: food festival events\nstep 4: CLICK: (856, 897)\nstep 5: PRESS_HOME\nstep 6: CLICK: (839, 338)\nstep 7: CLICK: (31, 189)\nstep 8: CLICK: (43, 164)\nstep 9: TYPE: caba62244@gmail.com\nstep 10: CLICK: (448, 355)\nstep 11: CLICK: (352, 420)\nstep 12: TYPE: Assyrian Food Festival will come on AUG 17\nstep 13: CLICK: (904, 76)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Bing: chat with AI & GPT4', 'Threads']\nB: ['Chrome', 'Gmail']\nC: ['Wikipedia', 'Facebook']\nD: ['Opera', 'Instagram']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_136_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_136_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_136_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_136_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_136_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_136_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_136_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_136_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_136_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_136_9.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_136_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_136_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_136_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_136_13.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['X', 'Photos']\nB: ['Gmail', 'Google Photos']\nC: ['Tumblr', 'ABPV']\nD: ['Instagram', 'Mapillary']\n", "question": "The corresponding actions are: step 1: CLICK: (121, 519)\nstep 2: CLICK: (364, 437)\nstep 3: CLICK: (155, 936)\nstep 4: CLICK: (527, 778)\nstep 5: TYPE: caba62244@gmail.com\nstep 6: CLICK: (378, 259)\nstep 7: CLICK: (826, 63)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (121, 519)\nstep 2: CLICK: (364, 437)\nstep 3: CLICK: (155, 936)\nstep 4: CLICK: (527, 778)\nstep 5: TYPE: caba62244@gmail.com\nstep 6: CLICK: (378, 259)\nstep 7: CLICK: (826, 63)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['X', 'Photos']\nB: ['Gmail', 'Google Photos']\nC: ['Tumblr', 'ABPV']\nD: ['Instagram', 'Mapillary']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_137_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_137_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_137_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_137_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_137_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_137_5.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_137_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_137_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['X', 'Calendar']\nB: ['Instagram', 'All-In-One Calculator']\nC: ['Whatsapp', 'Simple Calendar']\nD: ['Messenger', 'Simple Calculator']\n", "question": "The corresponding actions are: step 1: CLICK: (568, 506)\nstep 2: CLICK: (256, 79)\nstep 3: TYPE: what is the latest Terminator movie\nstep 4: CLICK: (865, 889)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (561, 935)\nstep 8: CLICK: (774, 806)\nstep 9: CLICK: (187, 806)\nstep 10: TYPE: watch the movie Terminator:DarkFate\nstep 11: CLICK: (418, 196)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (568, 506)\nstep 2: CLICK: (256, 79)\nstep 3: TYPE: what is the latest Terminator movie\nstep 4: CLICK: (865, 889)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (561, 935)\nstep 8: CLICK: (774, 806)\nstep 9: CLICK: (187, 806)\nstep 10: TYPE: watch the movie Terminator:DarkFate\nstep 11: CLICK: (418, 196)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['X', 'Calendar']\nB: ['Instagram', 'All-In-One Calculator']\nC: ['Whatsapp', 'Simple Calendar']\nD: ['Messenger', 'Simple Calculator']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_138_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_138_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_138_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_138_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_138_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_138_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_138_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_138_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_138_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_138_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_138_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_138_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Pluto TV - Live TV and Movies', 'Simple Calculator']\nB: ['Youtube', 'Clock']\nC: ['Triller', 'ClevCalc - Calculator']\nD: ['Tubi: Movies & Live TV', 'Calculator']\n", "question": "The corresponding actions are: step 1: CLICK: (627, 834)\nstep 2: TYPE: Language pronunciation\nstep 3: CLICK: 
(960, 907)\nstep 4: CLICK: (371, 658)\nstep 5: PRESS_HOME\nstep 6: CLICK: (384, 258)\nstep 7: CLICK: (891, 151)\nstep 8: TYPE: 500\nstep 9: CLICK: (507, 804)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (627, 834)\nstep 2: TYPE: Language pronunciation\nstep 3: CLICK: (960, 907)\nstep 4: CLICK: (371, 658)\nstep 5: PRESS_HOME\nstep 6: CLICK: (384, 258)\nstep 7: CLICK: (891, 151)\nstep 8: TYPE: 500\nstep 9: CLICK: (507, 804)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Pluto TV - Live TV and Movies', 'Simple Calculator']\nB: ['Youtube', 'Clock']\nC: ['Triller', 'ClevCalc - Calculator']\nD: ['Tubi: Movies & Live TV', 'Calculator']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_139_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_139_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_139_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_139_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_139_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_139_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_139_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_139_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_139_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_139_9.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Tumblr', 'Pandora']\nB: ['Whatsapp', 'YT Music']\nC: ['Instagram', 'iHeart: Music, Radio, Podcasts']\nD: ['Facebook', 'Spotify']\n", "question": 
"The corresponding actions are: step 1: CLICK: (541, 479)\nstep 2: CLICK: (586, 904)\nstep 3: CLICK: (205, 92)\nstep 4: TYPE: Electronic\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (900, 164)\nstep 7: CLICK: (917, 398)\nstep 8: PRESS_HOME\nstep 9: CLICK: (686, 328)\nstep 10: CLICK: (584, 915)\nstep 11: CLICK: (445, 298)\nstep 12: CLICK: (444, 846)\nstep 13: TYPE: Electronic\nstep 14: CLICK: (757, 507)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (541, 479)\nstep 2: CLICK: (586, 904)\nstep 3: CLICK: (205, 92)\nstep 4: TYPE: Electronic\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (900, 164)\nstep 7: CLICK: (917, 398)\nstep 8: PRESS_HOME\nstep 9: CLICK: (686, 328)\nstep 10: CLICK: (584, 915)\nstep 11: CLICK: (445, 298)\nstep 12: CLICK: (444, 846)\nstep 13: TYPE: Electronic\nstep 14: CLICK: (757, 507)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tumblr', 'Pandora']\nB: ['Whatsapp', 'YT Music']\nC: ['Instagram', 'iHeart: Music, Radio, Podcasts']\nD: ['Facebook', 'Spotify']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_140_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_140_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_140_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_140_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_140_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_140_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_140_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_140_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_140_8.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_140_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_140_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_140_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_140_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_140_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_140_14.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Yahoo Sports', 'Gmail']\nB: ['NewsBreak', 'Instagram']\nC: ['BBC Sports', 'Tumblr']\nD: ['Microsoft News', 'X']\n", "question": "The corresponding actions are: step 1: CLICK: (138, 547)\nstep 2: CLICK: (968, 67)\nstep 3: TYPE: football match\nstep 4: CLICK: (972, 67)\nstep 5: TYPE: football\nstep 6: CLICK: (129, 218)\nstep 7: PRESS_HOME\nstep 8: CLICK: (732, 94)\nstep 9: CLICK: (30, 144)\nstep 10: TYPE: caba62244@gmail.com\nstep 11: CLICK: (262, 303)\nstep 12: CLICK: (238, 364)\nstep 13: TYPE: Washington:Minchigan is 13:34\nstep 14: CLICK: (938, 68)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (138, 547)\nstep 2: CLICK: (968, 67)\nstep 3: TYPE: football match\nstep 4: CLICK: (972, 67)\nstep 5: TYPE: football\nstep 6: CLICK: (129, 218)\nstep 7: PRESS_HOME\nstep 8: CLICK: (732, 94)\nstep 9: CLICK: (30, 144)\nstep 10: TYPE: caba62244@gmail.com\nstep 11: CLICK: (262, 303)\nstep 12: CLICK: (238, 364)\nstep 13: TYPE: Washington:Minchigan is 13:34\nstep 14: CLICK: (938, 68)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Yahoo Sports', 'Gmail']\nB: ['NewsBreak', 'Instagram']\nC: ['BBC Sports', 'Tumblr']\nD: ['Microsoft News', 'X']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_141_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_141_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_141_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_141_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_141_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_141_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_141_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_141_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_141_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_141_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_141_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_141_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_141_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_141_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_141_14.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['iNaturalist', 'Opera']\nB: 
['Setting', 'Edge']\nC: ['Google Play Store', 'Chrome']\nD: ['Applock Pro - APP Lock & Guard', 'DuckDuckGo']\n", "question": "The corresponding actions are: step 1: CLICK: (637, 824)\nstep 2: CLICK: (450, 274)\nstep 3: TYPE: Email Client Apps\nstep 4: CLICK: (925, 912)\nstep 5: PRESS_HOME\nstep 6: CLICK: (149, 651)\nstep 7: CLICK: (376, 56)\nstep 8: TYPE: Microsoft Outlook\nstep 9: CLICK: (906, 908)\nstep 10: CLICK: (757, 321)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (637, 824)\nstep 2: CLICK: (450, 274)\nstep 3: TYPE: Email Client Apps\nstep 4: CLICK: (925, 912)\nstep 5: PRESS_HOME\nstep 6: CLICK: (149, 651)\nstep 7: CLICK: (376, 56)\nstep 8: TYPE: Microsoft Outlook\nstep 9: CLICK: (906, 908)\nstep 10: CLICK: (757, 321)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['iNaturalist', 'Opera']\nB: ['Setting', 'Edge']\nC: ['Google Play Store', 'Chrome']\nD: ['Applock Pro - APP Lock & Guard', 'DuckDuckGo']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_142_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_142_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_142_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_142_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_142_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_142_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_142_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_142_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_142_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_142_9.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_142_10.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Amazon Kindle', 'Chrome']\nB: ['Libby, by OverDrive', 'Firefox']\nC: ['Ploter - Ebook, Audiobook, PDF', 'Edge']\nD: ['Pocket FM: Audio Series', 'DuckDuckGo']\n", "question": "The corresponding actions are: step 1: CLICK: (607, 831)\nstep 2: CLICK: (411, 284)\nstep 3: TYPE: The Vietnam War\nstep 4: CLICK: (344, 121)\nstep 5: CLICK: (463, 697)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (374, 107)\nstep 9: CLICK: (309, 62)\nstep 10: TYPE: The Vietnam War\nstep 11: CLICK: (346, 119)\nstep 12: CLICK: (436, 264)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (607, 831)\nstep 2: CLICK: (411, 284)\nstep 3: TYPE: The Vietnam War\nstep 4: CLICK: (344, 121)\nstep 5: CLICK: (463, 697)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (374, 107)\nstep 9: CLICK: (309, 62)\nstep 10: TYPE: The Vietnam War\nstep 11: CLICK: (346, 119)\nstep 12: CLICK: (436, 264)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Amazon Kindle', 'Chrome']\nB: ['Libby, by OverDrive', 'Firefox']\nC: ['Ploter - Ebook, Audiobook, PDF', 'Edge']\nD: ['Pocket FM: Audio Series', 'DuckDuckGo']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_143_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_143_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_143_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_143_3.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_143_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_143_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_143_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_143_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_143_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_143_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_143_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_143_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_143_12.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Map', 'Picturethis']\nB: ['Yandex Navigator', 'Google Play Store']\nC: ['Lyft', 'iNaturalist']\nD: ['Maps', 'Contacts']\n", "question": "The corresponding actions are: step 1: CLICK: (627, 649)\nstep 2: CLICK: (114, 941)\nstep 3: CLICK: (353, 106)\nstep 4: TYPE: swimming pool\nstep 5: CLICK: (422, 191)\nstep 6: CLICK: (159, 581)\nstep 7: PRESS_HOME\nstep 8: CLICK: (884, 821)\nstep 9: CLICK: (313, 88)\nstep 10: TYPE: fitness tracking apps\nstep 11: CLICK: (270, 137)\nstep 12: CLICK: (393, 565)\nstep 13: CLICK: (516, 323)\nstep 14: SCROLL: UP\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (627, 649)\nstep 2: CLICK: (114, 941)\nstep 3: CLICK: (353, 106)\nstep 4: TYPE: swimming pool\nstep 5: CLICK: (422, 191)\nstep 6: CLICK: (159, 581)\nstep 7: PRESS_HOME\nstep 8: CLICK: (884, 821)\nstep 9: CLICK: (313, 88)\nstep 10: TYPE: fitness tracking apps\nstep 11: CLICK: (270, 137)\nstep 12: CLICK: (393, 565)\nstep 13: CLICK: (516, 323)\nstep 14: SCROLL: UP\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Map', 'Picturethis']\nB: ['Yandex Navigator', 'Google Play Store']\nC: ['Lyft', 'iNaturalist']\nD: ['Maps', 'Contacts']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_144_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_144_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_144_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_144_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_144_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_144_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_144_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_144_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_144_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_144_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_144_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_144_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_144_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_144_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_144_14.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Wikipedia', 'Facebook']\nB: 
['Chrome', 'Threads']\nC: ['Edge', 'Gmail']\nD: ['Firefox', 'X']\n", "question": "The corresponding actions are: step 1: CLICK: (190, 689)\nstep 2: CLICK: (352, 435)\nstep 3: TYPE: cricket scores\nstep 4: CLICK: (847, 883)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (922, 136)\nstep 9: CLICK: (502, 910)\nstep 10: TYPE: RCB : DC is 20:19.1\nstep 11: CLICK: (741, 489)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (190, 689)\nstep 2: CLICK: (352, 435)\nstep 3: TYPE: cricket scores\nstep 4: CLICK: (847, 883)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (922, 136)\nstep 9: CLICK: (502, 910)\nstep 10: TYPE: RCB : DC is 20:19.1\nstep 11: CLICK: (741, 489)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Wikipedia', 'Facebook']\nB: ['Chrome', 'Threads']\nC: ['Edge', 'Gmail']\nD: ['Firefox', 'X']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_145_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_145_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_145_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_145_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_145_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_145_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_145_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_145_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_145_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_145_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_145_10.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_145_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['TradingView: Track All Markets', 'Edge']\nB: ['Applock Pro - APP Lock & Guard', 'Wikipedia']\nC: ['Picturethis', 'wikiHow']\nD: ['Google Play Store', 'Opera']\n", "question": "The corresponding actions are: step 1: CLICK: (592, 234)\nstep 2: CLICK: (331, 123)\nstep 3: TYPE: Language Learning Apps\nstep 4: CLICK: (918, 912)\nstep 5: PRESS_HOME\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (145, 629)\nstep 8: CLICK: (316, 46)\nstep 9: TYPE: Duolingo\nstep 10: CLICK: (906, 912)\nstep 11: CLICK: (846, 187)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (592, 234)\nstep 2: CLICK: (331, 123)\nstep 3: TYPE: Language Learning Apps\nstep 4: CLICK: (918, 912)\nstep 5: PRESS_HOME\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (145, 629)\nstep 8: CLICK: (316, 46)\nstep 9: TYPE: Duolingo\nstep 10: CLICK: (906, 912)\nstep 11: CLICK: (846, 187)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['TradingView: Track All Markets', 'Edge']\nB: ['Applock Pro - APP Lock & Guard', 'Wikipedia']\nC: ['Picturethis', 'wikiHow']\nD: ['Google Play Store', 'Opera']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_146_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_146_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_146_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_146_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_146_4.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_146_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_146_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_146_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_146_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_146_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_146_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_146_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['ChatGPT', 'Chrome']\nB: ['Remix:AI Image Creator', 'Opera']\nC: ['Picsart AI Photo Editor,Video', 'Bing: chat with AI & GPT4']\nD: ['Chatbot AI & Smart Assistant', 'DuckDuckGo']\n", "question": "The corresponding actions are: step 1: CLICK: (425, 401)\nstep 2: CLICK: (390, 891)\nstep 3: TYPE: tell me about Fundamental theorem of calculus\nstep 4: CLICK: (689, 419)\nstep 5: PRESS_HOME\nstep 6: CLICK: (284, 391)\nstep 7: TYPE: Fundamental theorem of calculus\nstep 8: CLICK: (883, 692)\nstep 9: CLICK: (270, 938)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (425, 401)\nstep 2: CLICK: (390, 891)\nstep 3: TYPE: tell me about Fundamental theorem of calculus\nstep 4: CLICK: (689, 419)\nstep 5: PRESS_HOME\nstep 6: CLICK: (284, 391)\nstep 7: TYPE: Fundamental theorem of calculus\nstep 8: CLICK: (883, 692)\nstep 9: CLICK: (270, 938)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['ChatGPT', 'Chrome']\nB: ['Remix:AI Image Creator', 'Opera']\nC: ['Picsart AI Photo Editor,Video', 'Bing: chat with AI & GPT4']\nD: ['Chatbot AI & Smart Assistant', 'DuckDuckGo']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_147_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_147_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_147_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_147_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_147_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_147_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_147_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_147_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_147_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_147_9.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Whatsapp', 'TradingView: Track All Markets']\nB: ['Instagram', 'Setting']\nC: ['Messenger', 'Google Play Store']\nD: ['X', 'Picturethis']\n", "question": "The corresponding actions are: step 1: CLICK: (846, 810)\nstep 2: SCROLL: UP\nstep 3: CLICK: (315, 426)\nstep 4: SCROLL: RIGHT\nstep 5: SCROLL: RIGHT\nstep 6: CLICK: (339, 300)\nstep 7: PRESS_HOME\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (610, 558)\nstep 11: 
COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (846, 810)\nstep 2: SCROLL: UP\nstep 3: CLICK: (315, 426)\nstep 4: SCROLL: RIGHT\nstep 5: SCROLL: RIGHT\nstep 6: CLICK: (339, 300)\nstep 7: PRESS_HOME\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (610, 558)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Whatsapp', 'TradingView: Track All Markets']\nB: ['Instagram', 'Setting']\nC: ['Messenger', 'Google Play Store']\nD: ['X', 'Picturethis']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_148_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_148_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_148_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_148_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_148_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_148_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_148_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_148_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_148_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_148_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_148_10.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Picsart AI Photo Editor,Video', 'Instagram']\nB: ['Microsoft Copilot', 'X']\nC: ['Chatty - AI Assistant', 'Facebook']\nD: ['GenZArt:Fast AI Art Generator', 'Messenger']\n", "question": "The corresponding actions are: step 1: CLICK: (663, 677)\nstep 2: CLICK: (490, 
899)\nstep 3: CLICK: (390, 337)\nstep 4: TYPE: backpack\nstep 5: CLICK: (848, 877)\nstep 6: CLICK: (515, 776)\nstep 7: CLICK: (725, 777)\nstep 8: CLICK: (365, 673)\nstep 9: CLICK: (725, 444)\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (663, 677)\nstep 2: CLICK: (490, 899)\nstep 3: CLICK: (390, 337)\nstep 4: TYPE: backpack\nstep 5: CLICK: (848, 877)\nstep 6: CLICK: (515, 776)\nstep 7: CLICK: (725, 777)\nstep 8: CLICK: (365, 673)\nstep 9: CLICK: (725, 444)\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Picsart AI Photo Editor,Video', 'Instagram']\nB: ['Microsoft Copilot', 'X']\nC: ['Chatty - AI Assistant', 'Facebook']\nD: ['GenZArt:Fast AI Art Generator', 'Messenger']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_149_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_149_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_149_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_149_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_149_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_149_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_149_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_149_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_149_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_149_9.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Opera', 'Messenger']\nB: ['Wikipedia', 'Instagram']\nC: ['Chrome', 'Gmail']\nD: ['DuckDuckGo', 
'Tumblr']\n", "question": "The corresponding actions are: step 1: CLICK: (869, 675)\nstep 2: CLICK: (277, 145)\nstep 3: TYPE: craft beer tasting events\nstep 4: CLICK: (904, 929)\nstep 5: PRESS_HOME\nstep 6: CLICK: (846, 147)\nstep 7: CLICK: (500, 421)\nstep 8: CLICK: (560, 946)\nstep 9: TYPE: The Mary Wallopers will come on May 3\nstep 10: CLICK: (951, 636)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (869, 675)\nstep 2: CLICK: (277, 145)\nstep 3: TYPE: craft beer tasting events\nstep 4: CLICK: (904, 929)\nstep 5: PRESS_HOME\nstep 6: CLICK: (846, 147)\nstep 7: CLICK: (500, 421)\nstep 8: CLICK: (560, 946)\nstep 9: TYPE: The Mary Wallopers will come on May 3\nstep 10: CLICK: (951, 636)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Opera', 'Messenger']\nB: ['Wikipedia', 'Instagram']\nC: ['Chrome', 'Gmail']\nD: ['DuckDuckGo', 'Tumblr']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_150_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_150_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_150_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_150_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_150_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_150_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_150_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_150_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_150_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_150_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_150_10.png"], "output": "A", 
"qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Tumblr', 'ZOOM Cloud Meetings']\nB: ['Gmail', 'Zoho Meeting']\nC: ['Threads', 'Google Meet']\nD: ['Facebook', 'Microsoft Teams']\n", "question": "The corresponding actions are: step 1: CLICK: (409, 517)\nstep 2: CLICK: (806, 79)\nstep 3: CLICK: (208, 884)\nstep 4: CLICK: (340, 845)\nstep 5: PRESS_HOME\nstep 6: CLICK: (165, 150)\nstep 7: CLICK: (261, 123)\nstep 8: CLICK: (440, 153)\nstep 9: CLICK: (304, 241)\nstep 10: CLICK: (496, 491)\nstep 11: CLICK: (605, 951)\nstep 12: TYPE: https://teams.live.com/meet/9383055761460?p=3reyMWIRXcgNuvHI\nstep 13: CLICK: (931, 638)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (409, 517)\nstep 2: CLICK: (806, 79)\nstep 3: CLICK: (208, 884)\nstep 4: CLICK: (340, 845)\nstep 5: PRESS_HOME\nstep 6: CLICK: (165, 150)\nstep 7: CLICK: (261, 123)\nstep 8: CLICK: (440, 153)\nstep 9: CLICK: (304, 241)\nstep 10: CLICK: (496, 491)\nstep 11: CLICK: (605, 951)\nstep 12: TYPE: https://teams.live.com/meet/9383055761460?p=3reyMWIRXcgNuvHI\nstep 13: CLICK: (931, 638)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tumblr', 'ZOOM Cloud Meetings']\nB: ['Gmail', 'Zoho Meeting']\nC: ['Threads', 'Google Meet']\nD: ['Facebook', 'Microsoft Teams']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_151_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_151_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_151_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_151_3.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_151_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_151_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_151_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_151_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_151_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_151_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_151_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_151_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_151_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_151_13.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Notepad - Notes and To Do List', 'Opera']\nB: ['Microsoft Word', 'Quora']\nC: ['Google Keep', 'Firefox']\nD: ['Dropbox Paper', 'Chrome']\n", "question": "The corresponding actions are: step 1: CLICK: (389, 131)\nstep 2: CLICK: (793, 195)\nstep 3: CLICK: (485, 515)\nstep 4: LONG_PRESS: (107, 287)\nstep 5: SCROLL: RIGHT\nstep 6: CLICK: (449, 256)\nstep 7: PRESS_HOME\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: CLICK: (371, 412)\nstep 11: CLICK: (913, 936)\nstep 12: CLICK: (866, 816)\nstep 13: TYPE: 2016 Nobel-Prize winners in physics\nstep 14: CLICK: (511, 178)\nstep 15: CLICK: (478, 684)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (389, 131)\nstep 2: CLICK: (793, 195)\nstep 3: CLICK: (485, 515)\nstep 4: LONG_PRESS: (107, 287)\nstep 5: SCROLL: RIGHT\nstep 6: CLICK: (449, 256)\nstep 7: PRESS_HOME\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: CLICK: (371, 412)\nstep 11: CLICK: (913, 936)\nstep 12: CLICK: (866, 816)\nstep 13: TYPE: 2016 Nobel-Prize winners in physics\nstep 14: CLICK: (511, 178)\nstep 15: CLICK: (478, 684)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Notepad - Notes and To Do List', 'Opera']\nB: ['Microsoft Word', 'Quora']\nC: ['Google Keep', 'Firefox']\nD: ['Dropbox Paper', 'Chrome']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_152_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": 
"gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['BasicNote - Notes, Notepad', 'Tiktok']\nB: ['Microsoft word', 'Youtube']\nC: ['Notepad - Notes and To Do List', 'Triller']\nD: ['Dropbox Paper', 'Shorts VotTak: Short Video App']\n", "question": "The corresponding actions are: step 1: CLICK: (316, 926)\nstep 2: CLICK: (205, 672)\nstep 3: CLICK: (101, 405)\nstep 4: PRESS_HOME\nstep 5: CLICK: (901, 479)\nstep 6: CLICK: (550, 395)\nstep 7: CLICK: (432, 584)\nstep 8: TYPE: Google's AI Course for Beginners(in 10 minutes)!\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (316, 926)\nstep 2: CLICK: (205, 672)\nstep 3: CLICK: (101, 405)\nstep 4: PRESS_HOME\nstep 5: CLICK: (901, 479)\nstep 6: CLICK: (550, 395)\nstep 7: CLICK: (432, 584)\nstep 8: TYPE: Google's AI Course for Beginners(in 10 minutes)!\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['BasicNote - Notes, Notepad', 'Tiktok']\nB: ['Microsoft word', 'Youtube']\nC: ['Notepad - Notes and To Do List', 'Triller']\nD: ['Dropbox Paper', 'Shorts VotTak: Short Video App']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_153_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_153_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_153_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_153_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_153_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_153_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_153_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_153_7.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_153_8.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Vaulty:Hide Pictures Videos', 'Amazon Kindle']\nB: ['Plantin', 'Kobo Books - eBooks Audiobooks']\nC: ['Setting', 'Libby, by OverDrive']\nD: ['Tripadvisor', 'Everand']\n", "question": "The corresponding actions are: step 1: CLICK: (334, 495)\nstep 2: SCROLL: UP\nstep 3: CLICK: (167, 703)\nstep 4: CLICK: (942, 905)\nstep 5: PRESS_HOME\nstep 6: CLICK: (672, 501)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (334, 495)\nstep 2: SCROLL: UP\nstep 3: CLICK: (167, 703)\nstep 4: CLICK: (942, 905)\nstep 5: PRESS_HOME\nstep 6: CLICK: (672, 501)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Vaulty:Hide Pictures Videos', 'Amazon Kindle']\nB: ['Plantin', 'Kobo Books - eBooks Audiobooks']\nC: ['Setting', 'Libby, by OverDrive']\nD: ['Tripadvisor', 'Everand']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_154_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_154_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_154_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_154_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_154_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_154_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_154_6.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Picsart AI Photo 
Editor,Video', 'Firefox']\nB: ['Remix:AI Image Creator', 'Quora']\nC: ['GenZArt:Fast AI Art Generator', 'DuckDuckGo']\nD: ['Chatbot AI & Smart Assistant', 'Chrome']\n", "question": "The corresponding actions are: step 1: CLICK: (119, 347)\nstep 2: CLICK: (77, 92)\nstep 3: CLICK: (488, 717)\nstep 4: CLICK: (305, 885)\nstep 5: TYPE: tell me about Pythagorean theorem\nstep 6: CLICK: (920, 482)\nstep 7: PRESS_HOME\nstep 8: CLICK: (681, 747)\nstep 9: CLICK: (352, 398)\nstep 10: TYPE: Pythagorean theorem\nstep 11: CLICK: (912, 882)\nstep 12: SCROLL: UP\nstep 13: CLICK: (729, 852)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (119, 347)\nstep 2: CLICK: (77, 92)\nstep 3: CLICK: (488, 717)\nstep 4: CLICK: (305, 885)\nstep 5: TYPE: tell me about Pythagorean theorem\nstep 6: CLICK: (920, 482)\nstep 7: PRESS_HOME\nstep 8: CLICK: (681, 747)\nstep 9: CLICK: (352, 398)\nstep 10: TYPE: Pythagorean theorem\nstep 11: CLICK: (912, 882)\nstep 12: SCROLL: UP\nstep 13: CLICK: (729, 852)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Picsart AI Photo Editor,Video', 'Firefox']\nB: ['Remix:AI Image Creator', 'Quora']\nC: ['GenZArt:Fast AI Art Generator', 'DuckDuckGo']\nD: ['Chatbot AI & Smart Assistant', 'Chrome']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_155_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_155_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_155_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_155_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_155_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_155_5.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_155_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_155_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_155_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_155_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_155_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_155_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_155_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_155_13.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Bing: chat with AI & GPT4', 'PayPal - Send, Shop, Manage']\nB: ['Firefox', 'Google Pay']\nC: ['Quora', 'Chime - Mobile Banking']\nD: ['Chrome', 'Investing.com']\n", "question": "The corresponding actions are: step 1: CLICK: (514, 901)\nstep 2: CLICK: (332, 355)\nstep 3: TYPE: Coca-Cola's stock market news\nstep 4: CLICK: (908, 688)\nstep 5: SCROLL: UP\nstep 6: CLICK: (333, 585)\nstep 7: PRESS_HOME\nstep 8: CLICK: (426, 397)\nstep 9: CLICK: (697, 69)\nstep 10: TYPE: Coca-Cola\nstep 11: CLICK: (389, 241)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (514, 901)\nstep 2: CLICK: (332, 355)\nstep 3: TYPE: Coca-Cola's stock market news\nstep 4: CLICK: (908, 688)\nstep 5: SCROLL: UP\nstep 6: CLICK: (333, 585)\nstep 7: PRESS_HOME\nstep 8: CLICK: (426, 397)\nstep 9: CLICK: (697, 69)\nstep 10: TYPE: Coca-Cola\nstep 11: CLICK: (389, 241)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Bing: chat with AI & GPT4', 'PayPal - Send, Shop, Manage']\nB: ['Firefox', 'Google Pay']\nC: ['Quora', 'Chime - Mobile Banking']\nD: ['Chrome', 'Investing.com']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_156_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_156_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_156_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_156_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_156_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_156_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_156_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_156_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_156_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_156_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_156_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_156_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Audible: Audio Entertainment', 'Applock Pro - APP Lock & Guard']\nB: ['Amazon Kindle', 'iNaturalist']\nC: ['Ploter - Ebook, Audiobook, PDF', 'Plantin']\nD: ['Google Play Books & Audiobooks', 'Setting']\n", "question": "The corresponding actions are: step 1: CLICK: (433, 710)\nstep 2: CLICK: (152, 
944)\nstep 3: CLICK: (963, 801)\nstep 4: PRESS_HOME\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (290, 113)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (433, 710)\nstep 2: CLICK: (152, 944)\nstep 3: CLICK: (963, 801)\nstep 4: PRESS_HOME\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (290, 113)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Audible: Audio Entertainment', 'Applock Pro - APP Lock & Guard']\nB: ['Amazon Kindle', 'iNaturalist']\nC: ['Ploter - Ebook, Audiobook, PDF', 'Plantin']\nD: ['Google Play Books & Audiobooks', 'Setting']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_157_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_157_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_157_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_157_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_157_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_157_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_157_6.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Quora', 'Simple Calculator']\nB: ['DuckDuckGo', 'Calculator Plus with History']\nC: ['Edge', 'All-In-One Calculator']\nD: ['Chrome', 'Clock']\n", "question": "The corresponding actions are: step 1: TYPE: nature soundscape video\nstep 2: CLICK: (899, 871)\nstep 3: CLICK: (580, 581)\nstep 4: SCROLL: UP\nstep 5: PRESS_HOME\nstep 6: CLICK: (416, 217)\nstep 7: TYPE: 8000\nstep 8: TYPE: 0\nstep 9: CLICK: (452, 717)\nstep 10: SCROLL: RIGHT\nstep 11: CLICK: (411, 543)\nstep 12: 
COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: TYPE: nature soundscape video\nstep 2: CLICK: (899, 871)\nstep 3: CLICK: (580, 581)\nstep 4: SCROLL: UP\nstep 5: PRESS_HOME\nstep 6: CLICK: (416, 217)\nstep 7: TYPE: 8000\nstep 8: TYPE: 0\nstep 9: CLICK: (452, 717)\nstep 10: SCROLL: RIGHT\nstep 11: CLICK: (411, 543)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Quora', 'Simple Calculator']\nB: ['DuckDuckGo', 'Calculator Plus with History']\nC: ['Edge', 'All-In-One Calculator']\nD: ['Chrome', 'Clock']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_158_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_158_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_158_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_158_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_158_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_158_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_158_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_158_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_158_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_158_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_158_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_158_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Tumblr', 'Meesho']\nB: ['Gmail', 'Tata Neu']\nC: ['Instagram', 'Net-a-Porte']\nD: ['Threads', 'Flipkart']\n", "question": "The corresponding actions 
are: step 1: CLICK: (412, 385)\nstep 2: CLICK: (78, 71)\nstep 3: CLICK: (900, 79)\nstep 4: TYPE: laptop recommendation\nstep 5: CLICK: (884, 907)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (529, 525)\nstep 9: PRESS_HOME\nstep 10: CLICK: (137, 255)\nstep 11: CLICK: (418, 132)\nstep 12: TYPE: HP Envy\nstep 13: CLICK: (928, 927)\nstep 14: CLICK: (584, 211)\nstep 15: CLICK: (202, 940)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (412, 385)\nstep 2: CLICK: (78, 71)\nstep 3: CLICK: (900, 79)\nstep 4: TYPE: laptop recommendation\nstep 5: CLICK: (884, 907)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (529, 525)\nstep 9: PRESS_HOME\nstep 10: CLICK: (137, 255)\nstep 11: CLICK: (418, 132)\nstep 12: TYPE: HP Envy\nstep 13: CLICK: (928, 927)\nstep 14: CLICK: (584, 211)\nstep 15: CLICK: (202, 940)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tumblr', 'Meesho']\nB: ['Gmail', 'Tata Neu']\nC: ['Instagram', 'Net-a-Porte']\nD: ['Threads', 'Flipkart']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_8.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_159_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Threads', 'BBC Sports']\nB: ['Messenger', 'SmartNews:News That Matters']\nC: ['Instagram', 'Yahoo Sports']\nD: ['Whatsapp', 'Breaking News: local & Alerts']\n", "question": "The corresponding actions are: step 1: CLICK: (133, 549)\nstep 2: CLICK: (978, 67)\nstep 3: TYPE: tennis\nstep 4: CLICK: (113, 217)\nstep 5: CLICK: (216, 209)\nstep 6: CLICK: (217, 208)\nstep 7: PRESS_HOME\nstep 8: CLICK: (583, 118)\nstep 9: CLICK: (22, 502)\nstep 10: CLICK: (110, 136)\nstep 11: CLICK: (38, 201)\nstep 12: TYPE: S.Tsitsipas:C.Ruud is 53:76\nstep 13: CLICK: (958, 446)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (133, 549)\nstep 2: CLICK: (978, 67)\nstep 3: TYPE: tennis\nstep 4: CLICK: (113, 217)\nstep 5: CLICK: (216, 209)\nstep 6: CLICK: (217, 208)\nstep 7: PRESS_HOME\nstep 8: CLICK: (583, 118)\nstep 9: CLICK: (22, 502)\nstep 10: CLICK: (110, 136)\nstep 11: CLICK: (38, 201)\nstep 12: TYPE: S.Tsitsipas:C.Ruud is 53:76\nstep 13: CLICK: (958, 446)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Threads', 'BBC Sports']\nB: ['Messenger', 'SmartNews:News That Matters']\nC: ['Instagram', 'Yahoo Sports']\nD: ['Whatsapp', 'Breaking News: local & Alerts']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_160_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_160_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_160_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_160_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_160_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_160_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_160_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_160_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_160_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_160_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_160_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_160_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_160_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_160_13.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Opera', 'Chime - Mobile Banking']\nB: ['Firefox', 'Google Wallet']\nC: ['DuckDuckGo', 'Cash 
App']\nD: ['Chrome', 'Investing.com']\n", "question": "The corresponding actions are: step 1: CLICK: (516, 901)\nstep 2: TYPE: McDonald's stock market news\nstep 3: CLICK: (896, 688)\nstep 4: SCROLL: UP\nstep 5: CLICK: (393, 411)\nstep 6: PRESS_HOME\nstep 7: CLICK: (424, 401)\nstep 8: CLICK: (702, 68)\nstep 9: TYPE: McDonald's\nstep 10: CLICK: (428, 261)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (516, 901)\nstep 2: TYPE: McDonald's stock market news\nstep 3: CLICK: (896, 688)\nstep 4: SCROLL: UP\nstep 5: CLICK: (393, 411)\nstep 6: PRESS_HOME\nstep 7: CLICK: (424, 401)\nstep 8: CLICK: (702, 68)\nstep 9: TYPE: McDonald's\nstep 10: CLICK: (428, 261)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Opera', 'Chime - Mobile Banking']\nB: ['Firefox', 'Google Wallet']\nC: ['DuckDuckGo', 'Cash App']\nD: ['Chrome', 'Investing.com']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_161_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_161_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_161_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_161_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_161_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_161_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_161_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_161_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_161_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_161_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_161_10.png"], "output": "D", "qwen3-vl": 
"image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['GPS, Maps, Voice Navigation', 'Uber']\nB: ['Yandex Navigator', 'Citymapper']\nC: ['Google Map', 'Lyft']\nD: ['Maps', 'Google Map']\n", "question": "The corresponding actions are: step 1: CLICK: (561, 308)\nstep 2: CLICK: (165, 70)\nstep 3: TYPE: book store\nstep 4: CLICK: (171, 160)\nstep 5: PRESS_HOME\nstep 6: CLICK: (549, 172)\nstep 7: CLICK: (350, 623)\nstep 8: TYPE: The Last Bookstore\nstep 9: CLICK: (434, 317)\nstep 10: CLICK: (433, 881)\nstep 11: CLICK: (394, 895)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (561, 308)\nstep 2: CLICK: (165, 70)\nstep 3: TYPE: book store\nstep 4: CLICK: (171, 160)\nstep 5: PRESS_HOME\nstep 6: CLICK: (549, 172)\nstep 7: CLICK: (350, 623)\nstep 8: TYPE: The Last Bookstore\nstep 9: CLICK: (434, 317)\nstep 10: CLICK: (433, 881)\nstep 11: CLICK: (394, 895)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['GPS, Maps, Voice Navigation', 'Uber']\nB: ['Yandex Navigator', 'Citymapper']\nC: ['Google Map', 'Lyft']\nD: ['Maps', 'Google Map']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_162_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_162_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_162_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_162_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_162_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_162_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_162_6.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_162_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_162_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_162_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_162_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_162_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Todoist', 'Memrise: speak a new language']\nB: ['Things', 'Duolingo']\nC: ['To-Do List', 'Rosetta Stone: Learn, Practice']\nD: ['Any.do', 'Babbel - Learn Languages']\n", "question": "The corresponding actions are: step 1: CLICK: (688, 145)\nstep 2: CLICK: (182, 67)\nstep 3: SCROLL: UP\nstep 4: CLICK: (267, 536)\nstep 5: CLICK: (185, 461)\nstep 6: CLICK: (124, 395)\nstep 7: CLICK: (741, 784)\nstep 8: PRESS_HOME\nstep 9: CLICK: (931, 313)\nstep 10: CLICK: (744, 789)\nstep 11: TYPE: Korean Learning\nstep 12: CLICK: (753, 498)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (688, 145)\nstep 2: CLICK: (182, 67)\nstep 3: SCROLL: UP\nstep 4: CLICK: (267, 536)\nstep 5: CLICK: (185, 461)\nstep 6: CLICK: (124, 395)\nstep 7: CLICK: (741, 784)\nstep 8: PRESS_HOME\nstep 9: CLICK: (931, 313)\nstep 10: CLICK: (744, 789)\nstep 11: TYPE: Korean Learning\nstep 12: CLICK: (753, 498)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Todoist', 'Memrise: speak a new language']\nB: ['Things', 'Duolingo']\nC: ['To-Do List', 'Rosetta Stone: Learn, Practice']\nD: ['Any.do', 'Babbel - Learn Languages']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_163_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_163_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_163_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_163_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_163_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_163_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_163_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_163_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_163_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_163_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_163_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_163_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_163_12.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Bloomberg: Finance Market News', 'TradingView: Track All Markets']\nB: ['Yahoo Finance: Stock News', 'Plantin']\nC: ['CNN Breaking US & World News', 'Setting']\nD: ['SmartNews:News That Matters', 
'Contacts']\n", "question": "The corresponding actions are: step 1: CLICK: (832, 266)\nstep 2: CLICK: (930, 70)\nstep 3: CLICK: (140, 70)\nstep 4: TYPE: Coca-Cola\nstep 5: CLICK: (264, 449)\nstep 6: PRESS_HOME\nstep 7: CLICK: (140, 398)\nstep 8: CLICK: (941, 77)\nstep 9: TYPE: Coca-Cola\nstep 10: CLICK: (350, 192)\nstep 11: IMPOSSIBLE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (832, 266)\nstep 2: CLICK: (930, 70)\nstep 3: CLICK: (140, 70)\nstep 4: TYPE: Coca-Cola\nstep 5: CLICK: (264, 449)\nstep 6: PRESS_HOME\nstep 7: CLICK: (140, 398)\nstep 8: CLICK: (941, 77)\nstep 9: TYPE: Coca-Cola\nstep 10: CLICK: (350, 192)\nstep 11: IMPOSSIBLE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Bloomberg: Finance Market News', 'TradingView: Track All Markets']\nB: ['Yahoo Finance: Stock News', 'Plantin']\nC: ['CNN Breaking US & World News', 'Setting']\nD: ['SmartNews:News That Matters', 'Contacts']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_164_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_164_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_164_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_164_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_164_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_164_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_164_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_164_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_164_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_164_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_164_10.png"], 
"output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Tubi: Movies & Live TV', 'Firefox']\nB: ['Pluto TV - Live TV and Movies', 'Opera']\nC: ['Shorts VotTak: Short Video App', 'Bing: chat with AI & GPT4']\nD: ['Netflix', 'Quora']\n", "question": "The corresponding actions are: step 1: CLICK: (624, 818)\nstep 2: CLICK: (185, 433)\nstep 3: CLICK: (819, 563)\nstep 4: CLICK: (566, 182)\nstep 5: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (624, 818)\nstep 2: CLICK: (185, 433)\nstep 3: CLICK: (819, 563)\nstep 4: CLICK: (566, 182)\nstep 5: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tubi: Movies & Live TV', 'Firefox']\nB: ['Pluto TV - Live TV and Movies', 'Opera']\nC: ['Shorts VotTak: Short Video App', 'Bing: chat with AI & GPT4']\nD: ['Netflix', 'Quora']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_165_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_165_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_165_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_165_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_165_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Quora', 'Apartment List', 'iHeart: Music, Radio, Podcasts']\nB: ['Chrome', 'StubHub', 'Amazon Music']\nC: ['Opera', 'Airbnb', 'Pandora']\nD: ['Wikipedia', 'Agoda', 'YT Music']\n", "question": "The corresponding actions are: step 1: CLICK: (140, 104)\nstep 2: CLICK: (17, 57)\nstep 3: CLICK: (228, 
367)\nstep 4: TYPE: popular pop music band now\nstep 5: CLICK: (304, 192)\nstep 6: CLICK: (308, 724)\nstep 7: PRESS_HOME\nstep 8: SCROLL: RIGHT\nstep 9: CLICK: (713, 706)\nstep 10: CLICK: (143, 308)\nstep 11: CLICK: (281, 543)\nstep 12: PRESS_HOME\nstep 13: SCROLL: LEFT\nstep 14: CLICK: (288, 103)\nstep 15: CLICK: (695, 358)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (140, 104)\nstep 2: CLICK: (17, 57)\nstep 3: CLICK: (228, 367)\nstep 4: TYPE: popular pop music band now\nstep 5: CLICK: (304, 192)\nstep 6: CLICK: (308, 724)\nstep 7: PRESS_HOME\nstep 8: SCROLL: RIGHT\nstep 9: CLICK: (713, 706)\nstep 10: CLICK: (143, 308)\nstep 11: CLICK: (281, 543)\nstep 12: PRESS_HOME\nstep 13: SCROLL: LEFT\nstep 14: CLICK: (288, 103)\nstep 15: CLICK: (695, 358)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Quora', 'Apartment List', 'iHeart: Music, Radio, Podcasts']\nB: ['Chrome', 'StubHub', 'Amazon Music']\nC: ['Opera', 'Airbnb', 'Pandora']\nD: ['Wikipedia', 'Agoda', 'YT Music']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_8.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_166_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['iNaturalist', 'Meesho']\nB: ['Google Play Store', 'Joom']\nC: ['Setting', 'YOOX']\nD: ['Picturethis', 'Amazon']\n", "question": "The corresponding actions are: step 1: SCROLL: RIGHT\nstep 2: CLICK: (96, 662)\nstep 3: CLICK: (346, 89)\nstep 4: TYPE: Joom\nstep 5: CLICK: (855, 871)\nstep 6: CLICK: (784, 425)\nstep 7: CLICK: (798, 431)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: SCROLL: RIGHT\nstep 2: CLICK: (96, 662)\nstep 3: CLICK: (346, 89)\nstep 4: TYPE: Joom\nstep 5: CLICK: (855, 871)\nstep 6: CLICK: (784, 425)\nstep 7: CLICK: (798, 431)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['iNaturalist', 'Meesho']\nB: ['Google Play Store', 'Joom']\nC: ['Setting', 'YOOX']\nD: ['Picturethis', 'Amazon']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_167_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_167_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_167_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_167_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_167_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_167_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_167_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_167_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Job Search by ZipRecruiter', 'Simplenote']\nB: ['LinkedIn: Jobs & Business News', 'Google Keep']\nC: ['Indeed Job Search', 'BasicNote - Notes, Notepad']\nD: ['Indeed Job Search', 'WPS office']\n", "question": "The corresponding actions are: step 1: CLICK: (319, 353)\nstep 2: CLICK: (344, 97)\nstep 3: TYPE: mobile app developer\nstep 4: CLICK: (920, 884)\nstep 5: PRESS_HOME\nstep 6: CLICK: (499, 232)\nstep 7: CLICK: (881, 887)\nstep 8: CLICK: (155, 169)\nstep 9: TYPE: Decker\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (319, 353)\nstep 2: CLICK: (344, 97)\nstep 3: TYPE: mobile app developer\nstep 4: CLICK: (920, 884)\nstep 5: PRESS_HOME\nstep 6: CLICK: (499, 232)\nstep 7: CLICK: (881, 887)\nstep 8: CLICK: (155, 169)\nstep 9: TYPE: Decker\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Job Search by ZipRecruiter', 'Simplenote']\nB: ['LinkedIn: Jobs & Business News', 'Google Keep']\nC: ['Indeed Job Search', 'BasicNote - Notes, Notepad']\nD: ['Indeed Job Search', 'WPS office']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_168_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_168_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_168_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_168_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_168_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_168_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_168_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_168_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_168_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_168_9.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Setting', 'Amazon Kindle']\nB: ['Vaulty:Hide Pictures Videos', 'Ploter - Ebook, Audiobook, PDF']\nC: ['Tripadvisor', 'Libby, by OverDrive']\nD: ['Applock Pro - APP Lock & Guard', 'Everand']\n", "question": "The corresponding actions are: step 1: CLICK: (834, 524)\nstep 2: SCROLL: UP\nstep 3: CLICK: (376, 627)\nstep 4: CLICK: (897, 656)\nstep 5: PRESS_HOME\nstep 6: SCROLL: LEFT\nstep 7: CLICK: (633, 136)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI 
navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (834, 524)\nstep 2: SCROLL: UP\nstep 3: CLICK: (376, 627)\nstep 4: CLICK: (897, 656)\nstep 5: PRESS_HOME\nstep 6: SCROLL: LEFT\nstep 7: CLICK: (633, 136)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Setting', 'Amazon Kindle']\nB: ['Vaulty:Hide Pictures Videos', 'Ploter - Ebook, Audiobook, PDF']\nC: ['Tripadvisor', 'Libby, by OverDrive']\nD: ['Applock Pro - APP Lock & Guard', 'Everand']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_169_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_169_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_169_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_169_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_169_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_169_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_169_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_169_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Wikipedia', 'Any.do']\nB: ['Opera', 'To-Do List']\nC: ['Firefox', 'Things']\nD: ['Chrome', 'TickTick']\n", "question": "The corresponding actions are: step 1: CLICK: (276, 277)\nstep 2: CLICK: (515, 153)\nstep 3: CLICK: (969, 901)\nstep 4: TYPE: Do Yoga Morning\nstep 5: CLICK: (687, 872)\nstep 6: PRESS_HOME\nstep 7: CLICK: (520, 916)\nstep 8: CLICK: (255, 119)\nstep 9: TYPE: Yoga video for beginners\nstep 10: CLICK: (268, 172)\nstep 11: CLICK: (361, 411)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": 
"Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (276, 277)\nstep 2: CLICK: (515, 153)\nstep 3: CLICK: (969, 901)\nstep 4: TYPE: Do Yoga Morning\nstep 5: CLICK: (687, 872)\nstep 6: PRESS_HOME\nstep 7: CLICK: (520, 916)\nstep 8: CLICK: (255, 119)\nstep 9: TYPE: Yoga video for beginners\nstep 10: CLICK: (268, 172)\nstep 11: CLICK: (361, 411)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Wikipedia', 'Any.do']\nB: ['Opera', 'To-Do List']\nC: ['Firefox', 'Things']\nD: ['Chrome', 'TickTick']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_170_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_170_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_170_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_170_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_170_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_170_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_170_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_170_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_170_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_170_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_170_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_170_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Apartment List', 'Citymapper']\nB: ['Apartments.com Rental Search', 'Lyft']\nC: ['Booking.com', 'Maps']\nD: ['TickPick - Live Event Tickets', 'Petal Maps - GPS & Navigation']\n", "question": "The corresponding actions are: step 1: CLICK: (146, 
374)\nstep 2: CLICK: (790, 658)\nstep 3: PRESS_HOME\nstep 4: CLICK: (381, 509)\nstep 5: CLICK: (381, 712)\nstep 6: TYPE: 825 E 4th St\nstep 7: CLICK: (447, 272)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (146, 374)\nstep 2: CLICK: (790, 658)\nstep 3: PRESS_HOME\nstep 4: CLICK: (381, 509)\nstep 5: CLICK: (381, 712)\nstep 6: TYPE: 825 E 4th St\nstep 7: CLICK: (447, 272)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Apartment List', 'Citymapper']\nB: ['Apartments.com Rental Search', 'Lyft']\nC: ['Booking.com', 'Maps']\nD: ['TickPick - Live Event Tickets', 'Petal Maps - GPS & Navigation']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_171_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_171_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_171_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_171_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_171_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_171_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_171_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_171_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Chrome', 'Messenger']\nB: ['Firefox', 'Tumblr']\nC: ['DuckDuckgo', 'Gmail']\nD: ['Edge', 'Whatsapp']\n", "question": "The corresponding actions are: step 1: CLICK: (203, 491)\nstep 2: TYPE: China\nstep 3: CLICK: (908, 888)\nstep 4: CLICK: (567, 636)\nstep 5: SCROLL: UP\nstep 6: SCROLL: DOWN\nstep 7: CLICK: (975, 89)\nstep 8: CLICK: 
(739, 240)\nstep 9: CLICK: (475, 688)\nstep 10: TYPE: caba62244@gmail.com\nstep 11: CLICK: (422, 333)\nstep 12: CLICK: (900, 82)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (203, 491)\nstep 2: TYPE: China\nstep 3: CLICK: (908, 888)\nstep 4: CLICK: (567, 636)\nstep 5: SCROLL: UP\nstep 6: SCROLL: DOWN\nstep 7: CLICK: (975, 89)\nstep 8: CLICK: (739, 240)\nstep 9: CLICK: (475, 688)\nstep 10: TYPE: caba62244@gmail.com\nstep 11: CLICK: (422, 333)\nstep 12: CLICK: (900, 82)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Chrome', 'Messenger']\nB: ['Firefox', 'Tumblr']\nC: ['DuckDuckgo', 'Gmail']\nD: ['Edge', 'Whatsapp']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_172_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_172_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_172_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_172_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_172_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_172_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_172_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_172_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_172_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_172_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_172_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_172_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_172_12.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", 
"visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Wikipedia', 'Whatsapp']\nB: ['DuckDuckGo', 'Facebook']\nC: ['Bing: chat with AI & GPT4', 'Messenger']\nD: ['Chrome', 'Instagram']\n", "question": "The corresponding actions are: step 1: CLICK: (138, 643)\nstep 2: CLICK: (420, 246)\nstep 3: TYPE: technology conference events\nstep 4: CLICK: (916, 900)\nstep 5: CLICK: (459, 477)\nstep 6: PRESS_HOME\nstep 7: CLICK: (615, 110)\nstep 8: CLICK: (924, 72)\nstep 9: CLICK: (418, 148)\nstep 10: CLICK: (143, 216)\nstep 11: CLICK: (260, 938)\nstep 12: TYPE: 2024 RSA Conference is on now\nstep 13: CLICK: (909, 588)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (138, 643)\nstep 2: CLICK: (420, 246)\nstep 3: TYPE: technology conference events\nstep 4: CLICK: (916, 900)\nstep 5: CLICK: (459, 477)\nstep 6: PRESS_HOME\nstep 7: CLICK: (615, 110)\nstep 8: CLICK: (924, 72)\nstep 9: CLICK: (418, 148)\nstep 10: CLICK: (143, 216)\nstep 11: CLICK: (260, 938)\nstep 12: TYPE: 2024 RSA Conference is on now\nstep 13: CLICK: (909, 588)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Wikipedia', 'Whatsapp']\nB: ['DuckDuckGo', 'Facebook']\nC: ['Bing: chat with AI & GPT4', 'Messenger']\nD: ['Chrome', 'Instagram']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_173_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_173_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_173_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_173_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_173_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_173_5.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_173_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_173_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_173_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_173_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_173_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_173_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_173_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_173_13.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Edge', 'Whatsapp']\nB: ['Opera', 'Facebook']\nC: ['DuckDuckGo', 'Tumblr']\nD: ['Bing: chat with AI & GPT4', 'Messenger']\n", "question": "The corresponding actions are: step 1: CLICK: (884, 599)\nstep 2: CLICK: (509, 249)\nstep 3: TYPE: Mississippi River\nstep 4: CLICK: (922, 877)\nstep 5: SCROLL: UP\nstep 6: CLICK: (745, 778)\nstep 7: CLICK: (915, 158)\nstep 8: CLICK: (581, 256)\nstep 9: CLICK: (870, 853)\nstep 10: CLICK: (897, 146)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (884, 599)\nstep 2: CLICK: (509, 249)\nstep 3: TYPE: Mississippi River\nstep 4: CLICK: (922, 877)\nstep 5: SCROLL: UP\nstep 6: CLICK: (745, 778)\nstep 7: CLICK: (915, 158)\nstep 8: CLICK: (581, 256)\nstep 9: CLICK: (870, 853)\nstep 10: CLICK: (897, 146)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Edge', 'Whatsapp']\nB: ['Opera', 'Facebook']\nC: ['DuckDuckGo', 'Tumblr']\nD: ['Bing: chat with AI & GPT4', 'Messenger']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_174_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_174_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_174_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_174_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_174_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_174_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_174_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_174_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_174_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_174_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_174_10.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Tiktok', 'Applock Pro - APP Lock & Guard']\nB: ['Shorts VotTak: Short Video App', 'Picturethis']\nC: ['Tubi: Movies & Live TV', 'Contacts']\nD: ['Youtube', 'Google Play Store']\n", "question": "The corresponding actions are: step 1: CLICK: (607, 813)\nstep 2: PRESS_HOME\nstep 3: CLICK: (819, 815)\nstep 4: CLICK: (802, 59)\nstep 5: CLICK: (931, 56)\nstep 6: TYPE: fiton\nstep 7: CLICK: (934, 899)\nstep 8: CLICK: (862, 329)\nstep 9: 
COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (607, 813)\nstep 2: PRESS_HOME\nstep 3: CLICK: (819, 815)\nstep 4: CLICK: (802, 59)\nstep 5: CLICK: (931, 56)\nstep 6: TYPE: fiton\nstep 7: CLICK: (934, 899)\nstep 8: CLICK: (862, 329)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tiktok', 'Applock Pro - APP Lock & Guard']\nB: ['Shorts VotTak: Short Video App', 'Picturethis']\nC: ['Tubi: Movies & Live TV', 'Contacts']\nD: ['Youtube', 'Google Play Store']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_175_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_175_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_175_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_175_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_175_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_175_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_175_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_175_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_175_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['OfferUp: Buy. Sell. 
Letgo.', 'Setting']\nB: ['SHEIN', 'Google Play Store']\nC: ['REVOLVE', 'TradingView: Track All Markets']\nD: ['Alibaba.com - B2B marketplace', 'Contacts']\n", "question": "The corresponding actions are: step 1: CLICK: (210, 684)\nstep 2: CLICK: (308, 113)\nstep 3: TYPE: SHEIN\nstep 4: CLICK: (866, 877)\nstep 5: CLICK: (738, 431)\nstep 6: CLICK: (915, 438)\nstep 7: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (210, 684)\nstep 2: CLICK: (308, 113)\nstep 3: TYPE: SHEIN\nstep 4: CLICK: (866, 877)\nstep 5: CLICK: (738, 431)\nstep 6: CLICK: (915, 438)\nstep 7: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['OfferUp: Buy. Sell. Letgo.', 'Setting']\nB: ['SHEIN', 'Google Play Store']\nC: ['REVOLVE', 'TradingView: Track All Markets']\nD: ['Alibaba.com - B2B marketplace', 'Contacts']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_176_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_176_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_176_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_176_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_176_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_176_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_176_6.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['TradingView: Track All Markets', 'X', 'Vaulty:Hide Pictures Videos']\nB: ['PlantNet', 'Tumblr', 'PlantNet']\nC: ['Picturethis', 'Gmail', 'Contacts']\nD: ['Google Play Store', 'Instagram', 'Setting']\n", "question": "The corresponding 
actions are: step 1: CLICK: (822, 817)\nstep 2: CLICK: (824, 329)\nstep 3: PRESS_HOME\nstep 4: CLICK: (381, 820)\nstep 5: CLICK: (284, 509)\nstep 6: CLICK: (834, 406)\nstep 7: PRESS_BACK\nstep 8: CLICK: (184, 402)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (822, 817)\nstep 2: CLICK: (824, 329)\nstep 3: PRESS_HOME\nstep 4: CLICK: (381, 820)\nstep 5: CLICK: (284, 509)\nstep 6: CLICK: (834, 406)\nstep 7: PRESS_BACK\nstep 8: CLICK: (184, 402)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['TradingView: Track All Markets', 'X', 'Vaulty:Hide Pictures Videos']\nB: ['PlantNet', 'Tumblr', 'PlantNet']\nC: ['Picturethis', 'Gmail', 'Contacts']\nD: ['Google Play Store', 'Instagram', 'Setting']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_177_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_177_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_177_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_177_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_177_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_177_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_177_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_177_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_177_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['iNaturalist', 'Vaulty:Hide Pictures Videos', 'Shorts VotTak: Short Video App']\nB: ['Tripadvisor', 'Applock Pro - APP Lock & Guard', 'Youtube']\nC: ['Setting', 'Google 
Play Store', 'Likee']\nD: ['PlantNet', 'Plantin', 'Tiktok']\n", "question": "The corresponding actions are: step 1: CLICK: (819, 815)\nstep 2: CLICK: (292, 79)\nstep 3: TYPE: Likee\nstep 4: CLICK: (906, 914)\nstep 5: CLICK: (865, 421)\nstep 6: PRESS_HOME\nstep 7: CLICK: (386, 824)\nstep 8: CLICK: (345, 572)\nstep 9: CLICK: (530, 347)\nstep 10: CLICK: (642, 542)\nstep 11: CLICK: (817, 440)\nstep 12: PRESS_BACK\nstep 13: CLICK: (131, 436)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (819, 815)\nstep 2: CLICK: (292, 79)\nstep 3: TYPE: Likee\nstep 4: CLICK: (906, 914)\nstep 5: CLICK: (865, 421)\nstep 6: PRESS_HOME\nstep 7: CLICK: (386, 824)\nstep 8: CLICK: (345, 572)\nstep 9: CLICK: (530, 347)\nstep 10: CLICK: (642, 542)\nstep 11: CLICK: (817, 440)\nstep 12: PRESS_BACK\nstep 13: CLICK: (131, 436)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['iNaturalist', 'Vaulty:Hide Pictures Videos', 'Shorts VotTak: Short Video App']\nB: ['Tripadvisor', 'Applock Pro - APP Lock & Guard', 'Youtube']\nC: ['Setting', 'Google Play Store', 'Likee']\nD: ['PlantNet', 'Plantin', 'Tiktok']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_178_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_178_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_178_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_178_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_178_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_178_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_178_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_178_7.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_178_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_178_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_178_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_178_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_178_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_178_13.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Bing: chat with AI & GPT4', 'Whatsapp']\nB: ['DuckDuckgo', 'Messenger']\nC: ['Wikipedia', 'X']\nD: ['Edge', 'Instagram']\n", "question": "The corresponding actions are: step 1: CLICK: (360, 669)\nstep 2: TYPE: political debate events\nstep 3: CLICK: (891, 922)\nstep 4: CLICK: (328, 242)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (819, 132)\nstep 9: CLICK: (366, 470)\nstep 10: CLICK: (523, 948)\nstep 11: TYPE: I saw an ad about NSDA\nstep 12: CLICK: (947, 645)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (360, 669)\nstep 2: TYPE: political debate events\nstep 3: CLICK: (891, 922)\nstep 4: CLICK: (328, 242)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (819, 132)\nstep 9: CLICK: (366, 470)\nstep 10: CLICK: (523, 948)\nstep 11: TYPE: I saw an ad about NSDA\nstep 12: CLICK: (947, 645)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Bing: chat with AI & GPT4', 'Whatsapp']\nB: ['DuckDuckgo', 'Messenger']\nC: ['Wikipedia', 'X']\nD: ['Edge', 'Instagram']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_179_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_179_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_179_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_179_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_179_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_179_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_179_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_179_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_179_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_179_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_179_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_179_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_179_12.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['ClevCalc - Calculator', 'iNaturalist']\nB: ['Calendar', 'Setting']\nC: ['Simple Calendar - easy planner', 'Picturethis']\nD: ['Basic Calculator: GPA & Math', 'Vaulty:Hide Pictures Videos']\n", "question": "The corresponding actions are: 
step 1: CLICK: (325, 497)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (195, 932)\nstep 5: CLICK: (567, 349)\nstep 6: CLICK: (528, 421)\nstep 7: CLICK: (535, 619)\nstep 8: CLICK: (970, 90)\nstep 9: TYPE: Spanish\nstep 10: CLICK: (504, 220)\nstep 11: SCROLL: UP\nstep 12: CLICK: (869, 646)\nstep 13: PRESS_HOME\nstep 14: CLICK: (88, 501)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (325, 497)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (195, 932)\nstep 5: CLICK: (567, 349)\nstep 6: CLICK: (528, 421)\nstep 7: CLICK: (535, 619)\nstep 8: CLICK: (970, 90)\nstep 9: TYPE: Spanish\nstep 10: CLICK: (504, 220)\nstep 11: SCROLL: UP\nstep 12: CLICK: (869, 646)\nstep 13: PRESS_HOME\nstep 14: CLICK: (88, 501)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['ClevCalc - Calculator', 'iNaturalist']\nB: ['Calendar', 'Setting']\nC: ['Simple Calendar - easy planner', 'Picturethis']\nD: ['Basic Calculator: GPA & Math', 'Vaulty:Hide Pictures Videos']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_180_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_180_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_180_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_180_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_180_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_180_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_180_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_180_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_180_8.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_180_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_180_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_180_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_180_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_180_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_180_14.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Booking.com', 'Maps']\nB: ['Apartments.com Rental Search', 'Waze Navigation & Live Traffic']\nC: ['Traveloka', 'Yandex Navigator']\nD: ['Apartment List', 'Lyft']\n", "question": "The corresponding actions are: step 1: CLICK: (566, 654)\nstep 2: CLICK: (201, 234)\nstep 3: PRESS_HOME\nstep 4: CLICK: (918, 150)\nstep 5: CLICK: (236, 695)\nstep 6: TYPE: 257 S Spring St\nstep 7: CLICK: (275, 237)\nstep 8: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (566, 654)\nstep 2: CLICK: (201, 234)\nstep 3: PRESS_HOME\nstep 4: CLICK: (918, 150)\nstep 5: CLICK: (236, 695)\nstep 6: TYPE: 257 S Spring St\nstep 7: CLICK: (275, 237)\nstep 8: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Booking.com', 'Maps']\nB: ['Apartments.com Rental Search', 'Waze Navigation & Live Traffic']\nC: ['Traveloka', 'Yandex Navigator']\nD: ['Apartment List', 'Lyft']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_181_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_181_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_181_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_181_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_181_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_181_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_181_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_181_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Instagram', 'Chrome']\nB: ['Messenger', 'Wikipedia']\nC: ['Gmail', 'Bing: chat with AI & GPT4']\nD: ['X', 'Opera']\n", "question": "The corresponding actions are: step 1: CLICK: (161, 598)\nstep 2: CLICK: (351, 379)\nstep 3: TYPE: baseball scores\nstep 4: CLICK: (912, 884)\nstep 5: PRESS_HOME\nstep 6: CLICK: (618, 149)\nstep 7: CLICK: (951, 144)\nstep 8: CLICK: (293, 639)\nstep 9: CLICK: (224, 901)\nstep 10: TYPE: Mariners : Twins is 3:6\nstep 11: CLICK: (844, 481)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (161, 598)\nstep 2: CLICK: (351, 379)\nstep 3: TYPE: baseball scores\nstep 4: CLICK: (912, 884)\nstep 5: PRESS_HOME\nstep 6: CLICK: (618, 149)\nstep 7: CLICK: (951, 144)\nstep 8: CLICK: (293, 639)\nstep 9: CLICK: (224, 901)\nstep 10: TYPE: Mariners : Twins is 3:6\nstep 11: CLICK: (844, 481)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Instagram', 'Chrome']\nB: ['Messenger', 'Wikipedia']\nC: ['Gmail', 'Bing: chat with AI & GPT4']\nD: ['X', 'Opera']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_182_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_182_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_182_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_182_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_182_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_182_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_182_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_182_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_182_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_182_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_182_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_182_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Triller', 'Simplenote']\nB: ['Pluto TV - Live TV and Movies', 'Google Keep']\nC: ['Youtube', 'Microsoft word']\nD: ['Likee', 'Google Docs']\n", "question": "The corresponding actions are: step 1: CLICK: (596, 892)\nstep 2: CLICK: (703, 228)\nstep 3: CLICK: (99, 272)\nstep 4: PRESS_HOME\nstep 5: CLICK: (858, 383)\nstep 6: CLICK: (360, 
387)\nstep 7: CLICK: (128, 387)\nstep 8: CLICK: (130, 383)\nstep 9: CLICK: (423, 502)\nstep 10: TYPE: Smart Cities: Technology and Urban Planning\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (596, 892)\nstep 2: CLICK: (703, 228)\nstep 3: CLICK: (99, 272)\nstep 4: PRESS_HOME\nstep 5: CLICK: (858, 383)\nstep 6: CLICK: (360, 387)\nstep 7: CLICK: (128, 387)\nstep 8: CLICK: (130, 383)\nstep 9: CLICK: (423, 502)\nstep 10: TYPE: Smart Cities: Technology and Urban Planning\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Triller', 'Simplenote']\nB: ['Pluto TV - Live TV and Movies', 'Google Keep']\nC: ['Youtube', 'Microsoft word']\nD: ['Likee', 'Google Docs']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_183_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_183_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_183_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_183_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_183_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_183_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_183_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_183_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_183_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_183_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_183_10.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Threads', 'Tumblr']\nB: ['X', 'X']\nC: ['Messenger', 
'Messenger']\nD: ['Instagram', 'Threads']\n", "question": "The corresponding actions are: step 1: CLICK: (604, 290)\nstep 2: CLICK: (499, 916)\nstep 3: TYPE: I am sad\nstep 4: CLICK: (847, 496)\nstep 5: CLICK: (909, 924)\nstep 6: SCROLL: UP\nstep 7: CLICK: (545, 597)\nstep 8: CLICK: (561, 882)\nstep 9: PRESS_HOME\nstep 10: CLICK: (576, 170)\nstep 11: CLICK: (894, 169)\nstep 12: CLICK: (394, 648)\nstep 13: CLICK: (341, 917)\nstep 14: TYPE: https://www.threads.net/@ct.5024/post/C60c4dFO2YQ/?xmt=AQGzuhpc3kkbQPp3APiJPjh-3Y8uPVOjnWk8g2lv940IEg\nstep 15: CLICK: (856, 447)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (604, 290)\nstep 2: CLICK: (499, 916)\nstep 3: TYPE: I am sad\nstep 4: CLICK: (847, 496)\nstep 5: CLICK: (909, 924)\nstep 6: SCROLL: UP\nstep 7: CLICK: (545, 597)\nstep 8: CLICK: (561, 882)\nstep 9: PRESS_HOME\nstep 10: CLICK: (576, 170)\nstep 11: CLICK: (894, 169)\nstep 12: CLICK: (394, 648)\nstep 13: CLICK: (341, 917)\nstep 14: TYPE: https://www.threads.net/@ct.5024/post/C60c4dFO2YQ/?xmt=AQGzuhpc3kkbQPp3APiJPjh-3Y8uPVOjnWk8g2lv940IEg\nstep 15: CLICK: (856, 447)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Threads', 'Tumblr']\nB: ['X', 'X']\nC: ['Messenger', 'Messenger']\nD: ['Instagram', 'Threads']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_5.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_184_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['PlantNet', 'Chime - Mobile Banking']\nB: ['Applock Pro - APP Lock & Guard', 'Venmo']\nC: ['Picturethis', 'Cash App']\nD: ['Setting', 'Google Pay']\n", "question": "The corresponding actions are: step 1: CLICK: (341, 393)\nstep 2: CLICK: (242, 573)\nstep 3: CLICK: (524, 567)\nstep 4: CLICK: (799, 576)\nstep 5: CLICK: (272, 664)\nstep 6: CLICK: (492, 671)\nstep 7: CLICK: (815, 671)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (876, 623)\nstep 11: PRESS_HOME\nstep 12: CLICK: (133, 527)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (341, 393)\nstep 2: CLICK: (242, 573)\nstep 3: CLICK: (524, 567)\nstep 4: CLICK: (799, 576)\nstep 5: CLICK: (272, 664)\nstep 6: CLICK: (492, 671)\nstep 7: CLICK: (815, 671)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (876, 623)\nstep 11: PRESS_HOME\nstep 12: CLICK: (133, 527)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['PlantNet', 'Chime - Mobile Banking']\nB: ['Applock Pro - APP Lock & Guard', 'Venmo']\nC: ['Picturethis', 'Cash App']\nD: ['Setting', 'Google Pay']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_185_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_185_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_185_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_185_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_185_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_185_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_185_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_185_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_185_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_185_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_185_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_185_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_185_12.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Play Store', 'Setting']\nB: ['Picturethis', 'Contacts']\nC: ['PlantNet', 'Google Play Store']\nD: ['Vaulty:Hide Pictures Videos', 'Plantin']\n", "question": "The corresponding actions are: step 1: CLICK: (391, 149)\nstep 2: 
CLICK: (391, 149)\nstep 3: TYPE: Tokopedia\nstep 4: CLICK: (933, 882)\nstep 5: CLICK: (616, 499)\nstep 6: CLICK: (324, 395)\nstep 7: CLICK: (761, 565)\nstep 8: PRESS_HOME\nstep 9: CLICK: (186, 635)\nstep 10: CLICK: (288, 823)\nstep 11: SCROLL: UP\nstep 12: CLICK: (441, 645)\nstep 13: CLICK: (802, 157)\nstep 14: TYPE: Tokopedia\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (391, 149)\nstep 2: CLICK: (391, 149)\nstep 3: TYPE: Tokopedia\nstep 4: CLICK: (933, 882)\nstep 5: CLICK: (616, 499)\nstep 6: CLICK: (324, 395)\nstep 7: CLICK: (761, 565)\nstep 8: PRESS_HOME\nstep 9: CLICK: (186, 635)\nstep 10: CLICK: (288, 823)\nstep 11: SCROLL: UP\nstep 12: CLICK: (441, 645)\nstep 13: CLICK: (802, 157)\nstep 14: TYPE: Tokopedia\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Play Store', 'Setting']\nB: ['Picturethis', 'Contacts']\nC: ['PlantNet', 'Google Play Store']\nD: ['Vaulty:Hide Pictures Videos', 'Plantin']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_186_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_186_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_186_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_186_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_186_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_186_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_186_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_186_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_186_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_186_9.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_186_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_186_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_186_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_186_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_186_14.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Simplenote', 'Youtube']\nB: ['Dropbox Paper', 'Shorts VotTak: Short Video App']\nC: ['BasicNote - Notes, Notepad', 'Tubi: Movies & Live TV']\nD: ['Microsoft Word', 'Triller']\n", "question": "The corresponding actions are: step 1: CLICK: (828, 656)\nstep 2: CLICK: (360, 503)\nstep 3: CLICK: (295, 864)\nstep 4: PRESS_HOME\nstep 5: CLICK: (605, 510)\nstep 6: CLICK: (895, 933)\nstep 7: CLICK: (317, 223)\nstep 8: CLICK: (520, 688)\nstep 9: TYPE: Google's AI Course for Beginners(in 10 minutes)!\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (828, 656)\nstep 2: CLICK: (360, 503)\nstep 3: CLICK: (295, 864)\nstep 4: PRESS_HOME\nstep 5: CLICK: (605, 510)\nstep 6: CLICK: (895, 933)\nstep 7: CLICK: (317, 223)\nstep 8: CLICK: (520, 688)\nstep 9: TYPE: Google's AI Course for Beginners(in 10 minutes)!\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Simplenote', 'Youtube']\nB: ['Dropbox Paper', 'Shorts VotTak: Short Video App']\nC: ['BasicNote - Notes, Notepad', 'Tubi: Movies & Live TV']\nD: ['Microsoft Word', 'Triller']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_187_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_187_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_187_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_187_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_187_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_187_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_187_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_187_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_187_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_187_9.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Whatsapp', 'Wikipedia']\nB: ['Threads', 'Chrome']\nC: ['Tumblr', 'Quora']\nD: ['X', 'Edge']\n", "question": "The corresponding actions are: step 1: CLICK: (127, 663)\nstep 2: CLICK: (313, 275)\nstep 3: TYPE: Steve Jobs\nstep 4: CLICK: (911, 924)\nstep 5: CLICK: (594, 820)\nstep 6: CLICK: (951, 90)\nstep 7: CLICK: (623, 514)\nstep 8: CLICK: (485, 909)\nstep 9: PRESS_HOME\nstep 10: CLICK: (815, 282)\nstep 11: CLICK: (485, 946)\nstep 12: TYPE: 
https://en.wikipedia.org/wiki/Steve_Jobs\nstep 13: CLICK: (900, 638)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (127, 663)\nstep 2: CLICK: (313, 275)\nstep 3: TYPE: Steve Jobs\nstep 4: CLICK: (911, 924)\nstep 5: CLICK: (594, 820)\nstep 6: CLICK: (951, 90)\nstep 7: CLICK: (623, 514)\nstep 8: CLICK: (485, 909)\nstep 9: PRESS_HOME\nstep 10: CLICK: (815, 282)\nstep 11: CLICK: (485, 946)\nstep 12: TYPE: https://en.wikipedia.org/wiki/Steve_Jobs\nstep 13: CLICK: (900, 638)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Whatsapp', 'Wikipedia']\nB: ['Threads', 'Chrome']\nC: ['Tumblr', 'Quora']\nD: ['X', 'Edge']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_188_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_188_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_188_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_188_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_188_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_188_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_188_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_188_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_188_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_188_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_188_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_188_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_188_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_188_13.png"], "output": "B", "qwen3-vl": 
"image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Dropbox Paper', 'Indeed Job Search']\nB: ['WPS office', 'LinkedIn: Jobs & Business News']\nC: ['Notepad - Notes and To Do List', 'Indeed Job Search']\nD: ['Google Keep', 'Job Search by ZipRecruiter']\n", "question": "The corresponding actions are: step 1: CLICK: (801, 667)\nstep 2: CLICK: (361, 95)\nstep 3: TYPE: UI/UX designer\nstep 4: CLICK: (874, 878)\nstep 5: CLICK: (485, 603)\nstep 6: PRESS_HOME\nstep 7: CLICK: (908, 482)\nstep 8: CLICK: (435, 869)\nstep 9: TYPE: Merit America\nstep 10: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (801, 667)\nstep 2: CLICK: (361, 95)\nstep 3: TYPE: UI/UX designer\nstep 4: CLICK: (874, 878)\nstep 5: CLICK: (485, 603)\nstep 6: PRESS_HOME\nstep 7: CLICK: (908, 482)\nstep 8: CLICK: (435, 869)\nstep 9: TYPE: Merit America\nstep 10: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Dropbox Paper', 'Indeed Job Search']\nB: ['WPS office', 'LinkedIn: Jobs & Business News']\nC: ['Notepad - Notes and To Do List', 'Indeed Job Search']\nD: ['Google Keep', 'Job Search by ZipRecruiter']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_189_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_189_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_189_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_189_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_189_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_189_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_189_6.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_189_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_189_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_189_9.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['TickTick', 'Spotify']\nB: ['Microsoft to do', 'Pandora']\nC: ['Any.do', 'Amazon Music']\nD: ['To-Do List', 'iHeart: Music, Radio, Podcasts']\n", "question": "The corresponding actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (395, 262)\nstep 3: CLICK: (56, 79)\nstep 4: CLICK: (844, 80)\nstep 5: CLICK: (386, 585)\nstep 6: PRESS_HOME\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (619, 403)\nstep 9: CLICK: (502, 686)\nstep 10: TYPE: do yoga with this\nstep 11: CLICK: (882, 640)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (395, 262)\nstep 3: CLICK: (56, 79)\nstep 4: CLICK: (844, 80)\nstep 5: CLICK: (386, 585)\nstep 6: PRESS_HOME\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (619, 403)\nstep 9: CLICK: (502, 686)\nstep 10: TYPE: do yoga with this\nstep 11: CLICK: (882, 640)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['TickTick', 'Spotify']\nB: ['Microsoft to do', 'Pandora']\nC: ['Any.do', 'Amazon Music']\nD: ['To-Do List', 'iHeart: Music, Radio, Podcasts']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_190_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_190_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_190_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_190_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_190_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_190_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_190_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_190_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_190_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_190_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_190_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_190_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Microsoft Copilot', 'wikiHow']\nB: ['GenZArt:Fast AI Art Generator', 'Chrome']\nC: ['Chatty - AI Assistant', 'Firefox']\nD: ['WOMBO Dream-AI Art Generator', 'DuckDuckGo']\n", "question": "The corresponding actions are: step 1: CLICK: (115, 387)\nstep 2: CLICK: (255, 870)\nstep 3: TYPE: tell me about Bayes' theorem\nstep 4: CLICK: 
(911, 590)\nstep 5: SCROLL: DOWN\nstep 6: PRESS_HOME\nstep 7: CLICK: (148, 235)\nstep 8: CLICK: (354, 60)\nstep 9: TYPE: Bayes' theorem\nstep 10: CLICK: (214, 199)\nstep 11: CLICK: (217, 488)\nstep 12: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (115, 387)\nstep 2: CLICK: (255, 870)\nstep 3: TYPE: tell me about Bayes' theorem\nstep 4: CLICK: (911, 590)\nstep 5: SCROLL: DOWN\nstep 6: PRESS_HOME\nstep 7: CLICK: (148, 235)\nstep 8: CLICK: (354, 60)\nstep 9: TYPE: Bayes' theorem\nstep 10: CLICK: (214, 199)\nstep 11: CLICK: (217, 488)\nstep 12: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Microsoft Copilot', 'wikiHow']\nB: ['GenZArt:Fast AI Art Generator', 'Chrome']\nC: ['Chatty - AI Assistant', 'Firefox']\nD: ['WOMBO Dream-AI Art Generator', 'DuckDuckGo']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_191_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_191_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_191_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_191_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_191_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_191_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_191_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_191_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_191_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_191_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_191_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_191_11.png"], "output": "C", "qwen3-vl": "image none"}, 
{"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Simple Calendar', 'Tiktok']\nB: ['Clock', 'Youtube']\nC: ['Calculator Plus with History', 'Pluto TV - Live TV and Movies']\nD: ['ClevCalc - Calculator', 'Shorts VotTak: Short Video App']\n", "question": "The corresponding actions are: step 1: CLICK: (811, 612)\nstep 2: TYPE: Makeup look\nstep 3: CLICK: (899, 885)\nstep 4: CLICK: (472, 318)\nstep 5: PRESS_HOME\nstep 6: CLICK: (386, 215)\nstep 7: TYPE: 1800\nstep 8: CLICK: (427, 751)\nstep 9: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (811, 612)\nstep 2: TYPE: Makeup look\nstep 3: CLICK: (899, 885)\nstep 4: CLICK: (472, 318)\nstep 5: PRESS_HOME\nstep 6: CLICK: (386, 215)\nstep 7: TYPE: 1800\nstep 8: CLICK: (427, 751)\nstep 9: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Simple Calendar', 'Tiktok']\nB: ['Clock', 'Youtube']\nC: ['Calculator Plus with History', 'Pluto TV - Live TV and Movies']\nD: ['ClevCalc - Calculator', 'Shorts VotTak: Short Video App']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_192_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_192_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_192_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_192_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_192_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_192_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_192_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_192_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_192_8.png"], 
"output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['DuckDuckgo', 'Setting']\nB: ['wikiHow', 'PlantNet']\nC: ['Quora', 'Contacts']\nD: ['Firefox', 'Tripadvisor']\n", "question": "The corresponding actions are: step 1: CLICK: (374, 110)\nstep 2: CLICK: (389, 77)\nstep 3: TYPE: minimalist pictures\nstep 4: CLICK: (443, 143)\nstep 5: CLICK: (238, 133)\nstep 6: CLICK: (260, 608)\nstep 7: SCROLL: UP\nstep 8: CLICK: (488, 522)\nstep 9: PRESS_HOME\nstep 10: SCROLL: LEFT\nstep 11: CLICK: (153, 246)\nstep 12: CLICK: (588, 244)\nstep 13: CLICK: (279, 402)\nstep 14: CLICK: (175, 255)\nstep 15: CLICK: (822, 76)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (374, 110)\nstep 2: CLICK: (389, 77)\nstep 3: TYPE: minimalist pictures\nstep 4: CLICK: (443, 143)\nstep 5: CLICK: (238, 133)\nstep 6: CLICK: (260, 608)\nstep 7: SCROLL: UP\nstep 8: CLICK: (488, 522)\nstep 9: PRESS_HOME\nstep 10: SCROLL: LEFT\nstep 11: CLICK: (153, 246)\nstep 12: CLICK: (588, 244)\nstep 13: CLICK: (279, 402)\nstep 14: CLICK: (175, 255)\nstep 15: CLICK: (822, 76)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['DuckDuckgo', 'Setting']\nB: ['wikiHow', 'PlantNet']\nC: ['Quora', 'Contacts']\nD: ['Firefox', 'Tripadvisor']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_4.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_193_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Zoho Meeting', 'Messenger']\nB: ['Microsoft Teams', 'Whatsapp']\nC: ['ZOOM Cloud Meetings', 'Instagram']\nD: ['Google Meet', 'Tumblr']\n", "question": "The corresponding actions are: step 1: CLICK: (406, 380)\nstep 2: CLICK: (153, 149)\nstep 3: CLICK: (401, 310)\nstep 4: CLICK: (488, 942)\nstep 5: CLICK: (547, 100)\nstep 6: PRESS_HOME\nstep 7: CLICK: (620, 121)\nstep 8: CLICK: (938, 76)\nstep 9: CLICK: (406, 419)\nstep 10: CLICK: (289, 933)\nstep 11: TYPE: 9298916954\nstep 12: SCROLL: UP\nstep 13: CLICK: (858, 594)\nstep 14: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (406, 380)\nstep 2: CLICK: (153, 149)\nstep 3: CLICK: (401, 310)\nstep 4: CLICK: (488, 942)\nstep 5: CLICK: (547, 100)\nstep 6: PRESS_HOME\nstep 7: CLICK: (620, 121)\nstep 8: CLICK: (938, 76)\nstep 9: CLICK: (406, 419)\nstep 10: CLICK: (289, 933)\nstep 11: TYPE: 9298916954\nstep 12: SCROLL: UP\nstep 13: CLICK: (858, 594)\nstep 14: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Zoho Meeting', 'Messenger']\nB: ['Microsoft Teams', 'Whatsapp']\nC: ['ZOOM Cloud Meetings', 'Instagram']\nD: ['Google Meet', 'Tumblr']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_194_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_194_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_194_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_194_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_194_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_194_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_194_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_194_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_194_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_194_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_194_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_194_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_194_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_194_13.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Tumblr', 'Google Docs']\nB: ['X', 'Dropbox Paper']\nC: ['Facebook', 'BasicNote - Notes, Notepad']\nD: ['Whatsapp', 'Notepad - Notes and To 
Do List']\n", "question": "The corresponding actions are: step 1: CLICK: (153, 377)\nstep 2: CLICK: (929, 242)\nstep 3: CLICK: (336, 445)\nstep 4: PRESS_HOME\nstep 5: CLICK: (138, 249)\nstep 6: CLICK: (624, 944)\nstep 7: CLICK: (481, 223)\nstep 8: CLICK: (352, 896)\nstep 9: TYPE: https://docs.google.com/document/d/1SxaVLphFkdlDbR8lDUqNgyrJ4X0NOi9NJZcavp01Cjo/edit?usp=drivesdk\nstep 10: CLICK: (914, 601)\nstep 11: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (153, 377)\nstep 2: CLICK: (929, 242)\nstep 3: CLICK: (336, 445)\nstep 4: PRESS_HOME\nstep 5: CLICK: (138, 249)\nstep 6: CLICK: (624, 944)\nstep 7: CLICK: (481, 223)\nstep 8: CLICK: (352, 896)\nstep 9: TYPE: https://docs.google.com/document/d/1SxaVLphFkdlDbR8lDUqNgyrJ4X0NOi9NJZcavp01Cjo/edit?usp=drivesdk\nstep 10: CLICK: (914, 601)\nstep 11: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Tumblr', 'Google Docs']\nB: ['X', 'Dropbox Paper']\nC: ['Facebook', 'BasicNote - Notes, Notepad']\nD: ['Whatsapp', 'Notepad - Notes and To Do List']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_195_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_195_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_195_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_195_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_195_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_195_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_195_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_195_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_195_8.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_195_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_195_10.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Firefox', 'Investing.com']\nB: ['Opera', 'Cash App']\nC: ['Quora', 'PayPal - Send, Shop, Manage']\nD: ['Wikipedia', 'Venmo']\n", "question": "The corresponding actions are: step 1: CLICK: (133, 259)\nstep 2: CLICK: (275, 83)\nstep 3: TYPE: Facebook's stock market news\nstep 4: CLICK: (942, 897)\nstep 5: SCROLL: UP\nstep 6: CLICK: (290, 718)\nstep 7: PRESS_HOME\nstep 8: CLICK: (393, 380)\nstep 9: CLICK: (934, 267)\nstep 10: CLICK: (911, 76)\nstep 11: TYPE: Facebook\nstep 12: CLICK: (370, 242)\nstep 13: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (133, 259)\nstep 2: CLICK: (275, 83)\nstep 3: TYPE: Facebook's stock market news\nstep 4: CLICK: (942, 897)\nstep 5: SCROLL: UP\nstep 6: CLICK: (290, 718)\nstep 7: PRESS_HOME\nstep 8: CLICK: (393, 380)\nstep 9: CLICK: (934, 267)\nstep 10: CLICK: (911, 76)\nstep 11: TYPE: Facebook\nstep 12: CLICK: (370, 242)\nstep 13: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Firefox', 'Investing.com']\nB: ['Opera', 'Cash App']\nC: ['Quora', 'PayPal - Send, Shop, Manage']\nD: ['Wikipedia', 'Venmo']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_196_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_196_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_196_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_196_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_196_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_196_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_196_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_196_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_196_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_196_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_196_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_196_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_196_12.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Setting', 'iNaturalist']\nB: ['Plantin', 'TradingView: Track All Markets']\nC: ['Google Play Store', 'Setting']\nD: ['TradingView: Track All Markets', 'Picturethis']\n", "question": "The corresponding actions are: step 1: CLICK: (423, 
707)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (123, 697)\nstep 5: CLICK: (518, 316)\nstep 6: CLICK: (473, 388)\nstep 7: CLICK: (482, 514)\nstep 8: CLICK: (978, 73)\nstep 9: TYPE: Turkish\nstep 10: CLICK: (419, 139)\nstep 11: CLICK: (413, 171)\nstep 12: SCROLL: UP\nstep 13: CLICK: (828, 608)\nstep 14: PRESS_HOME\nstep 15: CLICK: (131, 714)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (423, 707)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (123, 697)\nstep 5: CLICK: (518, 316)\nstep 6: CLICK: (473, 388)\nstep 7: CLICK: (482, 514)\nstep 8: CLICK: (978, 73)\nstep 9: TYPE: Turkish\nstep 10: CLICK: (419, 139)\nstep 11: CLICK: (413, 171)\nstep 12: SCROLL: UP\nstep 13: CLICK: (828, 608)\nstep 14: PRESS_HOME\nstep 15: CLICK: (131, 714)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Setting', 'iNaturalist']\nB: ['Plantin', 'TradingView: Track All Markets']\nC: ['Google Play Store', 'Setting']\nD: ['TradingView: Track All Markets', 'Picturethis']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_8.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_197_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Google Meet', 'Facebook']\nB: ['ZOOM Cloud Meetings', 'Gmail']\nC: ['Zoho Meeting', 'Instagram']\nD: ['Microsoft Teams', 'Threads']\n", "question": "The corresponding actions are: step 1: CLICK: (596, 555)\nstep 2: CLICK: (98, 147)\nstep 3: CLICK: (431, 291)\nstep 4: CLICK: (538, 958)\nstep 5: CLICK: (462, 93)\nstep 6: PRESS_HOME\nstep 7: SCROLL: UP\nstep 8: CLICK: (375, 151)\nstep 9: CLICK: (697, 876)\nstep 10: TYPE: caba62244@gmail.com\nstep 11: CLICK: (103, 264)\nstep 12: CLICK: (221, 312)\nstep 13: TYPE: 9198916954\nstep 14: CLICK: (846, 76)\nstep 15: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. 
The corresponding actions are: step 1: CLICK: (596, 555)\nstep 2: CLICK: (98, 147)\nstep 3: CLICK: (431, 291)\nstep 4: CLICK: (538, 958)\nstep 5: CLICK: (462, 93)\nstep 6: PRESS_HOME\nstep 7: SCROLL: UP\nstep 8: CLICK: (375, 151)\nstep 9: CLICK: (697, 876)\nstep 10: TYPE: caba62244@gmail.com\nstep 11: CLICK: (103, 264)\nstep 12: CLICK: (221, 312)\nstep 13: TYPE: 9198916954\nstep 14: CLICK: (846, 76)\nstep 15: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Google Meet', 'Facebook']\nB: ['ZOOM Cloud Meetings', 'Gmail']\nC: ['Zoho Meeting', 'Instagram']\nD: ['Microsoft Teams', 'Threads']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_198_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_198_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_198_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_198_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_198_4.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_198_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_198_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_198_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_198_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_198_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_198_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_198_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_198_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_198_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_198_14.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_app_recognition", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: ['Microsoft to do', 
'DuckDuckgo']\nB: ['To-Do List', 'Bing: chat with AI & GPT4']\nC: ['TickTick', 'Edge']\nD: ['Todoist', 'Quora']\n", "question": "The corresponding actions are: step 1: CLICK: (791, 491)\nstep 2: CLICK: (472, 82)\nstep 3: TYPE: spaceX rocket launch\nstep 4: CLICK: (192, 166)\nstep 5: CLICK: (471, 566)\nstep 6: CLICK: (714, 715)\nstep 7: CLICK: (278, 88)\nstep 8: TYPE: next spaceX rocket launch\nstep 9: CLICK: (240, 166)\nstep 10: CLICK: (408, 330)\nstep 11: PRESS_RECENT\nstep 12: CLICK: (103, 287)\nstep 13: CLICK: (477, 841)\nstep 14: TYPE: SpaceX rocket launch\nstep 15: CLICK: (956, 432)\nstep 16: COMPLETE\nWhich app-combination list was used in this GUI navigation episode?", "context": "Now you are given screenshot images and corresponding actions of a complete GUI navigation episode. The corresponding actions are: step 1: CLICK: (791, 491)\nstep 2: CLICK: (472, 82)\nstep 3: TYPE: spaceX rocket launch\nstep 4: CLICK: (192, 166)\nstep 5: CLICK: (471, 566)\nstep 6: CLICK: (714, 715)\nstep 7: CLICK: (278, 88)\nstep 8: TYPE: next spaceX rocket launch\nstep 9: CLICK: (240, 166)\nstep 10: CLICK: (408, 330)\nstep 11: PRESS_RECENT\nstep 12: CLICK: (103, 287)\nstep 13: CLICK: (477, 841)\nstep 14: TYPE: SpaceX rocket launch\nstep 15: CLICK: (956, 432)\nstep 16: COMPLETE\nPlease analyze the details and identify the app-combination list used in this episode.\nSelect from the following choices.\nA: ['Microsoft to do', 'DuckDuckgo']\nB: ['To-Do List', 'Bing: chat with AI & GPT4']\nC: ['TickTick', 'Edge']\nD: ['Todoist', 'Quora']\n", "input_image_path": ["./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_0.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_1.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_2.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_3.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_4.png", 
"./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_5.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_6.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_7.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_8.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_9.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_10.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_11.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_12.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_13.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_14.png", "./Discrete-temporal/gui_app_recognition/gui_app_recognition_199_15.png"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/gui_next_action_prediction/qwen3-vl/metadata_info.json b/results/gui_next_action_prediction/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..8688988 --- /dev/null +++ b/results/gui_next_action_prediction/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: THE OUTNET\nB: CLICK: (494, 328)\nC: PRESS_HOME\nD: CLICK: (141, 243)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (149, 240)\nstep 3: CLICK: (174, 516)\nstep 4: CLICK: (586, 708)\nstep 5: CLICK: (184, 534)\nstep 6: CLICK: (587, 714)\nstep 7: CLICK: (136, 626)\nstep 8: CLICK: (877, 777)\nstep 9: CLICK: (208, 701)\nstep 10: CLICK: (858, 899)\nI want to Utilize 'Basic Calculator: GPA & Math' to compute today's total cost by adding 67 and 4. Once calculated, record the result in 'Wallet: Budget Money Manager'. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize 'Basic Calculator: GPA & Math' to compute today's total cost by adding 67 and 4. Once calculated, record the result in 'Wallet: Budget Money Manager'.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (149, 240)\nstep 3: CLICK: (174, 516)\nstep 4: CLICK: (586, 708)\nstep 5: CLICK: (184, 534)\nstep 6: CLICK: (587, 714)\nstep 7: CLICK: (136, 626)\nstep 8: CLICK: (877, 777)\nstep 9: CLICK: (208, 701)\nstep 10: CLICK: (858, 899)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: THE OUTNET\nB: CLICK: (494, 328)\nC: PRESS_HOME\nD: CLICK: (141, 243)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_0_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_0_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_0_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_0_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_0_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_0_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_0_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_0_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_0_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_0_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_0_10.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI 
image", "source": "GUI-Odyssey", "options": "A: CLICK: (422, 378)\nB: CLICK: (712, 596)\nC: SCROLL: UP\nD: CLICK: (523, 613)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (317, 139)\nstep 2: CLICK: (205, 613)\nstep 3: CLICK: (331, 255)\nstep 4: TYPE: Business\nstep 5: CLICK: (749, 63)\nstep 6: CLICK: (588, 602)\nstep 7: PRESS_HOME\nstep 8: CLICK: (337, 147)\nstep 9: SCROLL: UP\nstep 10: CLICK: (560, 656)\nI want to Arrange a business meeting with caba62244@gmail.com, ensure to send out the invitations via Gmail, and use ZOOM Cloud Meetings for the meeting. Don't forget to set an alarm clock for the scheduled time using the Clock app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Arrange a business meeting with caba62244@gmail.com, ensure to send out the invitations via Gmail, and use ZOOM Cloud Meetings for the meeting. 
Don't forget to set an alarm clock for the scheduled time using the Clock app.\nThe historical actions are: step 1: CLICK: (317, 139)\nstep 2: CLICK: (205, 613)\nstep 3: CLICK: (331, 255)\nstep 4: TYPE: Business\nstep 5: CLICK: (749, 63)\nstep 6: CLICK: (588, 602)\nstep 7: PRESS_HOME\nstep 8: CLICK: (337, 147)\nstep 9: SCROLL: UP\nstep 10: CLICK: (560, 656)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (422, 378)\nB: CLICK: (712, 596)\nC: SCROLL: UP\nD: CLICK: (523, 613)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_1_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_1_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_1_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_1_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_1_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_1_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_1_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_1_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_1_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_1_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_1_10.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (938, 907)\nB: CLICK: (941, 892)\nC: CLICK: (949, 239)\nD: TYPE: craft beer tasting events\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (547, 135)\nstep 2: CLICK: (754, 149)\nstep 3: PRESS_HOME\nstep 4: CLICK: (908, 315)\nI want to Open Spotify and listen to a podcast episode on yoga for beginners, then use Things to create a to-do list for your tasks tomorrow. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open Spotify and listen to a podcast episode on yoga for beginners, then use Things to create a to-do list for your tasks tomorrow.\nThe historical actions are: step 1: CLICK: (547, 135)\nstep 2: CLICK: (754, 149)\nstep 3: PRESS_HOME\nstep 4: CLICK: (908, 315)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (938, 907)\nB: CLICK: (941, 892)\nC: CLICK: (949, 239)\nD: TYPE: craft beer tasting events\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_2_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_2_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_2_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_2_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_2_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: CLICK: (195, 529)\nC: COMPLETE\nD: TYPE: Weather\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (568, 506)\nstep 2: CLICK: (256, 79)\nstep 3: TYPE: what is the latest Terminator movie\nstep 4: CLICK: (865, 889)\nI want to Use app X to find the most recent Terminator movie, then consult your Calendar app to identify a free evening to watch it. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use app X to find the most recent Terminator movie, then consult your Calendar app to identify a free evening to watch it.\nThe historical actions are: step 1: CLICK: (568, 506)\nstep 2: CLICK: (256, 79)\nstep 3: TYPE: what is the latest Terminator movie\nstep 4: CLICK: (865, 889)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (195, 529)\nC: COMPLETE\nD: TYPE: Weather\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_3_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_3_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_3_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_3_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_3_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (347, 249)\nB: CLICK: (576, 247)\nC: TYPE: when is the Fashion week in Paris\nD: CLICK: (321, 153)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (694, 546)\nstep 2: TYPE: the best farmers market in San Jose\nstep 3: CLICK: (899, 686)\nstep 4: PRESS_HOME\nI want to Using Duckduckgo to find the best farmers market in your local city, and then navigate to it with Firefox. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Duckduckgo to find the best farmers market in your local city, and then navigate to it with Firefox.\nThe historical actions are: step 1: CLICK: (694, 546)\nstep 2: TYPE: the best farmers market in San Jose\nstep 3: CLICK: (899, 686)\nstep 4: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (347, 249)\nB: CLICK: (576, 247)\nC: TYPE: when is the Fashion week in Paris\nD: CLICK: (321, 153)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_4_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_4_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_4_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_4_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_4_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: I am happy \nB: CLICK: (889, 924)\nC: COMPLETE\nD: CLICK: (822, 133)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (129, 170)\nstep 2: CLICK: (679, 145)\nstep 3: CLICK: (651, 222)\nstep 4: CLICK: (422, 395)\nI want to Post your current feelings on Facebook and then share the same post with Victor Jame via Messenger. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Post your current feelings on Facebook and then share the same post with Victor Jame via Messenger.\nThe historical actions are: step 1: CLICK: (129, 170)\nstep 2: CLICK: (679, 145)\nstep 3: CLICK: (651, 222)\nstep 4: CLICK: (422, 395)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: I am happy \nB: CLICK: (889, 924)\nC: COMPLETE\nD: CLICK: (822, 133)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_5_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_5_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_5_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_5_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_5_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (824, 254)\nB: TYPE: fashion show events\nC: PRESS_HOME\nD: CLICK: (164, 720)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (443, 917)\nstep 2: CLICK: (205, 436)\nstep 3: TYPE: Nvidia's stock market news\nstep 4: CLICK: (855, 897)\nI want to Use Chrome to search for today's stock market news of the company Nvidia, and then open TradingView: Track All Markets to check the stock price trends. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Chrome to search for today's stock market news of the company Nvidia, and then open TradingView: Track All Markets to check the stock price trends.\nThe historical actions are: step 1: CLICK: (443, 917)\nstep 2: CLICK: (205, 436)\nstep 3: TYPE: Nvidia's stock market news\nstep 4: CLICK: (855, 897)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (824, 254)\nB: TYPE: fashion show events\nC: PRESS_HOME\nD: CLICK: (164, 720)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_6_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_6_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_6_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_6_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_6_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (931, 915)\nB: CLICK: (508, 366)\nC: CLICK: (680, 485)\nD: TYPE: hiking trail\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (873, 628)\nstep 2: CLICK: (494, 246)\nstep 3: TYPE: biographical movies\nstep 4: CLICK: (961, 892)\nstep 5: SCROLL: UP\nstep 6: CLICK: (461, 458)\nstep 7: CLICK: (179, 457)\nstep 8: PRESS_HOME\nstep 9: CLICK: (123, 498)\nstep 10: CLICK: (431, 144)\nstep 11: TYPE: snacks\nstep 12: CLICK: (949, 878)\nstep 13: SCROLL: UP\nstep 14: CLICK: (636, 586)\nstep 15: SCROLL: UP\nI want to Organize a movie night by selecting a biographical film on Opera, adding snacks to your Amazon cart, sending out invites to Victor James via Facebook Messenger, and setting a reminder on the Clock app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Organize a movie night by selecting a biographical film on Opera, adding snacks to your Amazon cart, sending out invites to Victor James via Facebook Messenger, and setting a reminder on the Clock app.\nThe historical actions are: step 1: CLICK: (873, 628)\nstep 2: CLICK: (494, 246)\nstep 3: TYPE: biographical movies\nstep 4: CLICK: (961, 892)\nstep 5: SCROLL: UP\nstep 6: CLICK: (461, 458)\nstep 7: CLICK: (179, 457)\nstep 8: PRESS_HOME\nstep 9: CLICK: (123, 498)\nstep 10: CLICK: (431, 144)\nstep 11: TYPE: snacks\nstep 12: CLICK: (949, 878)\nstep 13: SCROLL: UP\nstep 14: CLICK: (636, 586)\nstep 15: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (931, 915)\nB: CLICK: (508, 366)\nC: CLICK: (680, 485)\nD: TYPE: hiking trail\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_1.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_13.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_14.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_7_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (171, 160)\nB: SCROLL: UP\nC: CLICK: (264, 542)\nD: CLICK: (401, 729)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (511, 906)\nstep 2: CLICK: (406, 44)\nstep 3: CLICK: (388, 108)\nstep 4: TYPE: advanture movie\nstep 5: CLICK: (295, 263)\nstep 6: SCROLL: UP\nI want to Use Chrome to search for an Adventure movie, then watch it on Pluto TV - Live TV and Movies. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Chrome to search for an Adventure movie, then watch it on Pluto TV - Live TV and Movies.\nThe historical actions are: step 1: CLICK: (511, 906)\nstep 2: CLICK: (406, 44)\nstep 3: CLICK: (388, 108)\nstep 4: TYPE: advanture movie\nstep 5: CLICK: (295, 263)\nstep 6: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (171, 160)\nB: SCROLL: UP\nC: CLICK: (264, 542)\nD: CLICK: (401, 729)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_8_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_8_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_8_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_8_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_8_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_8_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_8_6.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (611, 358)\nB: TYPE: 2022 nobel prize winners in physics:\nC: CLICK: (430, 932)\nD: CLICK: (945, 599)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (705, 217)\nstep 2: CLICK: (286, 251)\nstep 3: CLICK: (916, 154)\nstep 4: TYPE: 2022 nobel prize winners in physics\nstep 5: CLICK: (897, 859)\nstep 6: LONG_PRESS: (99, 549)\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (144, 489)\nstep 9: PRESS_HOME\nstep 10: CLICK: (844, 499)\nstep 11: CLICK: (236, 357)\nstep 12: CLICK: (236, 357)\nI want to Using Firefox, search for the winners of the 2022 Nobel Prize in Physics and then use WPS Office to record the information. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Firefox, search for the winners of the 2022 Nobel Prize in Physics and then use WPS Office to record the information.\nThe historical actions are: step 1: CLICK: (705, 217)\nstep 2: CLICK: (286, 251)\nstep 3: CLICK: (916, 154)\nstep 4: TYPE: 2022 nobel prize winners in physics\nstep 5: CLICK: (897, 859)\nstep 6: LONG_PRESS: (99, 549)\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (144, 489)\nstep 9: PRESS_HOME\nstep 10: CLICK: (844, 499)\nstep 11: CLICK: (236, 357)\nstep 12: CLICK: (236, 357)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (611, 358)\nB: TYPE: 2022 nobel prize winners in physics:\nC: CLICK: (430, 932)\nD: CLICK: (945, 599)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_9_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_9_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_9_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_9_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_9_4.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_9_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_9_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_9_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_9_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_9_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_9_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_9_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_9_12.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (535, 201)\nB: CLICK: (798, 431)\nC: CLICK: (893, 928)\nD: CLICK: (934, 437)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (413, 363)\nstep 2: CLICK: (936, 87)\nstep 3: TYPE: the best ice cream parlor in Los Angeles\nstep 4: CLICK: (965, 62)\nstep 5: CLICK: (199, 395)\nstep 6: PRESS_HOME\nstep 7: CLICK: (190, 140)\nstep 8: CLICK: (165, 840)\nI want to Utilize GPS and Tiktok to locate the top-rated ice cream parlor in your local city and navigate to it. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize GPS and Tiktok to locate the top-rated ice cream parlor in your local city and navigate to it.\nThe historical actions are: step 1: CLICK: (413, 363)\nstep 2: CLICK: (936, 87)\nstep 3: TYPE: the best ice cream parlor in Los Angeles\nstep 4: CLICK: (965, 62)\nstep 5: CLICK: (199, 395)\nstep 6: PRESS_HOME\nstep 7: CLICK: (190, 140)\nstep 8: CLICK: (165, 840)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (535, 201)\nB: CLICK: (798, 431)\nC: CLICK: (893, 928)\nD: CLICK: (934, 437)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_10_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_10_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_10_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_10_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_10_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_10_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_10_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_10_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_10_8.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: Chinese\nB: PRESS_HOME\nC: CLICK: (572, 509)\nD: TYPE: Washington:Minchigan is 13:34\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (831, 502)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (399, 730)\nstep 5: CLICK: (368, 272)\nstep 6: CLICK: (374, 327)\nstep 7: CLICK: (344, 473)\nstep 8: CLICK: (936, 59)\nI want to First, navigate to the 'Setting' app on your phone and switch the language setting to Chinese (Simplified). Then, open the 'Photos' app to verify the change. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, navigate to the 'Setting' app on your phone and switch the language setting to Chinese (Simplified). Then, open the 'Photos' app to verify the change.\nThe historical actions are: step 1: CLICK: (831, 502)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (399, 730)\nstep 5: CLICK: (368, 272)\nstep 6: CLICK: (374, 327)\nstep 7: CLICK: (344, 473)\nstep 8: CLICK: (936, 59)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Chinese\nB: PRESS_HOME\nC: CLICK: (572, 509)\nD: TYPE: Washington:Minchigan is 13:34\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_11_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_11_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_11_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_11_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_11_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_11_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_11_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_11_7.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_11_8.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (58, 357)\nB: PRESS_HOME\nC: CLICK: (438, 929)\nD: TYPE: https://sg.docworkspace.com/d/sIDzG2tP9AcPpp7EG\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (830, 485)\nstep 2: CLICK: (318, 460)\nstep 3: CLICK: (802, 150)\nstep 4: CLICK: (926, 155)\nstep 5: CLICK: (601, 423)\nstep 6: CLICK: (783, 150)\nstep 7: CLICK: (713, 674)\nstep 8: PRESS_HOME\nstep 9: CLICK: (384, 150)\nstep 10: CLICK: (699, 817)\nstep 11: TYPE: caba62244@gmail.com\nstep 12: CLICK: (465, 496)\nstep 13: SCROLL: UP\nstep 14: CLICK: (268, 478)\nI want to Locate the working file on your phone using WPS Office and then email it to caba62244@gmail.com via Gmail. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Locate the working file on your phone using WPS Office and then email it to caba62244@gmail.com via Gmail.\nThe historical actions are: step 1: CLICK: (830, 485)\nstep 2: CLICK: (318, 460)\nstep 3: CLICK: (802, 150)\nstep 4: CLICK: (926, 155)\nstep 5: CLICK: (601, 423)\nstep 6: CLICK: (783, 150)\nstep 7: CLICK: (713, 674)\nstep 8: PRESS_HOME\nstep 9: CLICK: (384, 150)\nstep 10: CLICK: (699, 817)\nstep 11: TYPE: caba62244@gmail.com\nstep 12: CLICK: (465, 496)\nstep 13: SCROLL: UP\nstep 14: CLICK: (268, 478)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (58, 357)\nB: PRESS_HOME\nC: CLICK: (438, 929)\nD: TYPE: https://sg.docworkspace.com/d/sIDzG2tP9AcPpp7EG\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_12_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_12_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_12_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_12_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_12_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_12_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_12_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_12_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_12_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_12_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_12_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_12_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_12_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_12_13.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_12_14.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (139, 640)\nB: COMPLETE\nC: PRESS_HOME\nD: CLICK: (937, 898)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (177, 616)\nstep 2: CLICK: (355, 371)\nstep 3: TYPE: hiking trail\nstep 4: CLICK: (930, 869)\nstep 5: CLICK: (490, 761)\nstep 6: PRESS_HOME\nstep 7: CLICK: (602, 494)\nstep 8: CLICK: (923, 567)\nstep 9: TYPE: California\nI want to Use Chrome to search for a new hiking trail, check the weekend weather forecast on Windy.com-Weather Forecast, and then invite Tzhau Jau to join the hike through Instagram. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Chrome to search for a new hiking trail, check the weekend weather forecast on Windy.com-Weather Forecast, and then invite Tzhau Jau to join the hike through Instagram.\nThe historical actions are: step 1: CLICK: (177, 616)\nstep 2: CLICK: (355, 371)\nstep 3: TYPE: hiking trail\nstep 4: CLICK: (930, 869)\nstep 5: CLICK: (490, 761)\nstep 6: PRESS_HOME\nstep 7: CLICK: (602, 494)\nstep 8: CLICK: (923, 567)\nstep 9: TYPE: California\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (139, 640)\nB: COMPLETE\nC: PRESS_HOME\nD: CLICK: (937, 898)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_13_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_13_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_13_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_13_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_13_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_13_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_13_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_13_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_13_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_13_9.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: LEFT\nB: SCROLL: UP\nC: SCROLL: DOWN\nD: CLICK: (31, 960)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (603, 801)\nstep 2: CLICK: (190, 427)\nstep 3: CLICK: (834, 565)\nI want to Open the Firefox Browser to search for the best video blogs on travel vlogs. Then, go to the Setting app to turn up the brightness on your phone. Finally, open the YouTube app to follow the video blogs you found. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open the Firefox Browser to search for the best video blogs on travel vlogs. Then, go to the Setting app to turn up the brightness on your phone. Finally, open the YouTube app to follow the video blogs you found.\nThe historical actions are: step 1: CLICK: (603, 801)\nstep 2: CLICK: (190, 427)\nstep 3: CLICK: (834, 565)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: LEFT\nB: SCROLL: UP\nC: SCROLL: DOWN\nD: CLICK: (31, 960)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_14_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_14_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_14_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_14_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (537, 937)\nB: TYPE: Palo Alto Junior Museum and Zoo\nC: PRESS_HOME\nD: COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (425, 262)\nstep 2: CLICK: (161, 71)\nstep 3: TYPE: zoo\nstep 4: CLICK: (867, 684)\nstep 5: PRESS_HOME\nstep 6: CLICK: (433, 409)\nstep 7: CLICK: (350, 616)\nI want to Locate the nearest zoo using Google Map and then book a ride with Uber. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate the nearest zoo using Google Map and then book a ride with Uber.\nThe historical actions are: step 1: CLICK: (425, 262)\nstep 2: CLICK: (161, 71)\nstep 3: TYPE: zoo\nstep 4: CLICK: (867, 684)\nstep 5: PRESS_HOME\nstep 6: CLICK: (433, 409)\nstep 7: CLICK: (350, 616)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (537, 937)\nB: TYPE: Palo Alto Junior Museum and Zoo\nC: PRESS_HOME\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_15_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_15_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_15_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_15_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_15_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_15_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_15_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_15_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (516, 921)\nB: TYPE: Load\nC: CLICK: (544, 353)\nD: COMPLETE\n", "question": 
"The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (864, 553)\nstep 3: CLICK: (654, 748)\nstep 4: CLICK: (747, 758)\nstep 5: CLICK: (840, 749)\nstep 6: CLICK: (581, 751)\nstep 7: CLICK: (630, 761)\nstep 8: PRESS_HOME\nstep 9: CLICK: (709, 713)\nstep 10: CLICK: (399, 889)\nstep 11: CLICK: (506, 266)\nstep 12: CLICK: (552, 516)\nstep 13: CLICK: (484, 618)\nI want to Utilize the 'Scientific calculator plus 991' to compute the sum of 23 and 12 for today's total cost. Once calculated, document this total in the 'Monefy' app for record-keeping purposes. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize the 'Scientific calculator plus 991' to compute the sum of 23 and 12 for today's total cost. 
Once calculated, document this total in the 'Monefy' app for record-keeping purposes.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (864, 553)\nstep 3: CLICK: (654, 748)\nstep 4: CLICK: (747, 758)\nstep 5: CLICK: (840, 749)\nstep 6: CLICK: (581, 751)\nstep 7: CLICK: (630, 761)\nstep 8: PRESS_HOME\nstep 9: CLICK: (709, 713)\nstep 10: CLICK: (399, 889)\nstep 11: CLICK: (506, 266)\nstep 12: CLICK: (552, 516)\nstep 13: CLICK: (484, 618)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (516, 921)\nB: TYPE: Load\nC: CLICK: (544, 353)\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_16_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_16_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_16_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_16_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_16_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_16_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_16_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_16_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_16_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_16_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_16_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_16_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_16_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_16_13.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", 
"options": "A: SCROLL: UP\nB: CLICK: (858, 182)\nC: CLICK: (946, 903)\nD: TYPE: Fever dream high in the quiet of the night\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (846, 810)\nstep 2: SCROLL: UP\nstep 3: CLICK: (315, 426)\nstep 4: SCROLL: RIGHT\nstep 5: SCROLL: RIGHT\nstep 6: CLICK: (339, 300)\nstep 7: PRESS_HOME\nstep 8: SCROLL: UP\nI want to Open Instagram to watch a trending video and then use the Settings app to turn up the volume on your phone. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open Instagram to watch a trending video and then use the Settings app to turn up the volume on your phone.\nThe historical actions are: step 1: CLICK: (846, 810)\nstep 2: SCROLL: UP\nstep 3: CLICK: (315, 426)\nstep 4: SCROLL: RIGHT\nstep 5: SCROLL: RIGHT\nstep 6: CLICK: (339, 300)\nstep 7: PRESS_HOME\nstep 8: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (858, 182)\nC: CLICK: (946, 903)\nD: TYPE: Fever dream high in the quiet of the night\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_17_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_17_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_17_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_17_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_17_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_17_5.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_17_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_17_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_17_8.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (101, 614)\nB: CLICK: (58, 56)\nC: SCROLL: LEFT\nD: CLICK: (925, 918)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (637, 372)\nstep 3: CLICK: (64, 51)\nstep 4: CLICK: (887, 80)\nstep 5: TYPE: statue of liberty\nstep 6: CLICK: (437, 248)\nstep 7: CLICK: (836, 51)\nstep 8: CLICK: (901, 648)\nstep 9: PRESS_HOME\nstep 10: CLICK: (179, 636)\nstep 11: CLICK: (914, 904)\nstep 12: CLICK: (882, 797)\nstep 13: TYPE: a travel guide to Statue of Liberty\nstep 14: CLICK: (199, 163)\nstep 15: CLICK: (420, 654)\nI want to Using Tripadvisor, look up a travel guide for visiting the Statue of Liberty in New York and note down the resource website in Notepad - Notes and To Do List. After that, check AccuWeather to choose a rain-free day for your visit. Finally, book your flight from San Francisco through Expedia. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Tripadvisor, look up a travel guide for visiting the Statue of Liberty in New York and note down the resource website in Notepad - Notes and To Do List. After that, check AccuWeather to choose a rain-free day for your visit. 
Finally, book your flight from San Francisco through Expedia.\nThe historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (637, 372)\nstep 3: CLICK: (64, 51)\nstep 4: CLICK: (887, 80)\nstep 5: TYPE: statue of liberty\nstep 6: CLICK: (437, 248)\nstep 7: CLICK: (836, 51)\nstep 8: CLICK: (901, 648)\nstep 9: PRESS_HOME\nstep 10: CLICK: (179, 636)\nstep 11: CLICK: (914, 904)\nstep 12: CLICK: (882, 797)\nstep 13: TYPE: a travel guide to Statue of Liberty\nstep 14: CLICK: (199, 163)\nstep 15: CLICK: (420, 654)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (101, 614)\nB: CLICK: (58, 56)\nC: SCROLL: LEFT\nD: CLICK: (925, 918)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_13.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_14.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_18_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: Deebot\nB: CLICK: (916, 864)\nC: SCROLL: UP\nD: PRESS_HOME\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (524, 81)\nstep 2: CLICK: (766, 85)\nstep 3: TYPE: eclectic\nI want to Using Pinterest, locate an image that showcases an eclectic style. Once found, navigate to the Setting app on your phone to set this image as your wallpaper. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Pinterest, locate an image that showcases an eclectic style. 
Once found, navigate to the Setting app on your phone to set this image as your wallpaper.\nThe historical actions are: step 1: CLICK: (524, 81)\nstep 2: CLICK: (766, 85)\nstep 3: TYPE: eclectic\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Deebot\nB: CLICK: (916, 864)\nC: SCROLL: UP\nD: PRESS_HOME\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_19_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_19_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_19_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_19_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: https://www.threads.net/@ct.5024/post/C60c4dFO2YQ/?xmt=AQGzuhpc3kkbQPp3APiJPjh-3Y8uPVOjnWk8g2lv940IEg\nB: CLICK: (308, 253)\nC: SCROLL: UP\nD: COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (604, 290)\nstep 2: CLICK: (499, 916)\nstep 3: TYPE: I am sad\nstep 4: CLICK: (847, 496)\nstep 5: CLICK: (909, 924)\nstep 6: SCROLL: UP\nstep 7: CLICK: (545, 597)\nstep 8: CLICK: (561, 882)\nstep 9: PRESS_HOME\nstep 10: CLICK: (576, 170)\nstep 11: CLICK: (894, 169)\nstep 12: CLICK: (394, 648)\nstep 13: CLICK: (341, 917)\nI want to Use Instagram to post today's feelings, then share that post on Threads with Tzhau Jau. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Instagram to post today's feelings, then share that post on Threads with Tzhau Jau.\nThe historical actions are: step 1: CLICK: (604, 290)\nstep 2: CLICK: (499, 916)\nstep 3: TYPE: I am sad\nstep 4: CLICK: (847, 496)\nstep 5: CLICK: (909, 924)\nstep 6: SCROLL: UP\nstep 7: CLICK: (545, 597)\nstep 8: CLICK: (561, 882)\nstep 9: PRESS_HOME\nstep 10: CLICK: (576, 170)\nstep 11: CLICK: (894, 169)\nstep 12: CLICK: (394, 648)\nstep 13: CLICK: (341, 917)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: https://www.threads.net/@ct.5024/post/C60c4dFO2YQ/?xmt=AQGzuhpc3kkbQPp3APiJPjh-3Y8uPVOjnWk8g2lv940IEg\nB: CLICK: (308, 253)\nC: SCROLL: UP\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_20_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_20_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_20_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_20_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_20_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_20_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_20_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_20_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_20_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_20_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_20_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_20_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_20_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_20_13.png"], "output": "A", "qwen3-vl": "image none"}, 
{"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (418, 649)\nB: TYPE: vanilla extract\nC: CLICK: (871, 688)\nD: TYPE: 3D Printer Course for Beginners\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (311, 349)\nstep 2: CLICK: (915, 82)\nstep 3: TYPE: knitting class\nstep 4: CLICK: (924, 875)\nstep 5: CLICK: (224, 678)\nstep 6: PRESS_HOME\nstep 7: CLICK: (687, 69)\nstep 8: CLICK: (388, 66)\nstep 9: TYPE: knitting wool\nstep 10: CLICK: (944, 879)\nI want to Find a knitting class on TikTok, purchase the needed materials through SHEIN, and create a calendar event to remind you to study using your Calendar app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Find a knitting class on TikTok, purchase the needed materials through SHEIN, and create a calendar event to remind you to study using your Calendar app.\nThe historical actions are: step 1: CLICK: (311, 349)\nstep 2: CLICK: (915, 82)\nstep 3: TYPE: knitting class\nstep 4: CLICK: (924, 875)\nstep 5: CLICK: (224, 678)\nstep 6: PRESS_HOME\nstep 7: CLICK: (687, 69)\nstep 8: CLICK: (388, 66)\nstep 9: TYPE: knitting wool\nstep 10: CLICK: (944, 879)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (418, 649)\nB: TYPE: vanilla extract\nC: CLICK: (871, 688)\nD: TYPE: 3D Printer Course for Beginners\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_21_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_21_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_21_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_21_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_21_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_21_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_21_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_21_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_21_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_21_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_21_10.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: tiktok\nB: PRESS_HOME\nC: CLICK: (386, 650)\nD: CLICK: (324, 928)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (632, 797)\nstep 2: CLICK: (783, 175)\nstep 3: TYPE: bmw most popular car\nstep 4: CLICK: (427, 168)\nI want to Identify the most popular BMW vehicle and verify its price using the CarWale: Buy-Sell New/Used Car app and Chrome. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Identify the most popular BMW vehicle and verify its price using the CarWale: Buy-Sell New/Used Car app and Chrome.\nThe historical actions are: step 1: CLICK: (632, 797)\nstep 2: CLICK: (783, 175)\nstep 3: TYPE: bmw most popular car\nstep 4: CLICK: (427, 168)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: tiktok\nB: PRESS_HOME\nC: CLICK: (386, 650)\nD: CLICK: (324, 928)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_22_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_22_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_22_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_22_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_22_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: TYPE: https://www.threads.net/@ct.5024/post/C60dWMKOvyV/?xmt=AQGzrQXOCtihDH5Csh-v2jLg95hKd8qdzO3HygJBDKRrZA\nC: CLICK: (488, 522)\nD: CLICK: (519, 621)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (374, 110)\nstep 2: CLICK: (389, 77)\nstep 3: TYPE: minimalist pictures\nstep 4: CLICK: (443, 143)\nstep 5: CLICK: (238, 133)\nstep 6: CLICK: (260, 608)\nstep 7: SCROLL: UP\nI want to Using DuckDuckGo, search for an image that embodies a minimalist style. Once you have found an appropriate picture, open the Settings app on your phone and set the image as your wallpaper. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using DuckDuckGo, search for an image that embodies a minimalist style. Once you have found an appropriate picture, open the Settings app on your phone and set the image as your wallpaper.\nThe historical actions are: step 1: CLICK: (374, 110)\nstep 2: CLICK: (389, 77)\nstep 3: TYPE: minimalist pictures\nstep 4: CLICK: (443, 143)\nstep 5: CLICK: (238, 133)\nstep 6: CLICK: (260, 608)\nstep 7: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: TYPE: https://www.threads.net/@ct.5024/post/C60dWMKOvyV/?xmt=AQGzrQXOCtihDH5Csh-v2jLg95hKd8qdzO3HygJBDKRrZA\nC: CLICK: (488, 522)\nD: CLICK: (519, 621)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_23_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_23_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_23_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_23_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_23_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_23_5.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_23_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_23_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (330, 583)\nB: COMPLETE\nC: SCROLL: UP\nD: TYPE: Colosseum, Rome\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (392, 249)\nstep 3: CLICK: (628, 65)\nstep 4: TYPE: Dyson V11 Torque Drive\nstep 5: CLICK: (885, 907)\nI want to Check and compare the prices for a Dyson V11 Torque Drive across Target and AliExpress shopping apps, and make sure to add the one with the lowest price to your cart. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Check and compare the prices for a Dyson V11 Torque Drive across Target and AliExpress shopping apps, and make sure to add the one with the lowest price to your cart.\nThe historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (392, 249)\nstep 3: CLICK: (628, 65)\nstep 4: TYPE: Dyson V11 Torque Drive\nstep 5: CLICK: (885, 907)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (330, 583)\nB: COMPLETE\nC: SCROLL: UP\nD: TYPE: Colosseum, Rome\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_24_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_24_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_24_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_24_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_24_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_24_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (567, 684)\nB: PRESS_HOME\nC: CLICK: (802, 74)\nD: CLICK: (95, 962)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (674, 351)\nstep 2: CLICK: (227, 812)\nstep 3: TYPE: tell me about Fundamental theorem of calculus\nstep 4: CLICK: (902, 486)\nI want to Utilize the 'Bing: chat with AI & GPT4' app to inquire about the Fundamental Theorem of Calculus. Then, confirm the details by searching in your browser using the 'ChatOn - AI Chat Bot Assistant' app. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize the 'Bing: chat with AI & GPT4' app to inquire about the Fundamental Theorem of Calculus. Then, confirm the details by searching in your browser using the 'ChatOn - AI Chat Bot Assistant' app.\nThe historical actions are: step 1: CLICK: (674, 351)\nstep 2: CLICK: (227, 812)\nstep 3: TYPE: tell me about Fundamental theorem of calculus\nstep 4: CLICK: (902, 486)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (567, 684)\nB: PRESS_HOME\nC: CLICK: (802, 74)\nD: CLICK: (95, 962)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_25_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_25_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_25_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_25_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_25_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (922, 908)\nB: CLICK: (918, 898)\nC: CLICK: (840, 884)\nD: CLICK: (145, 623)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (853, 374)\nstep 2: CLICK: (824, 49)\nstep 3: TYPE: Video Editing Apps\nstep 4: CLICK: (918, 905)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: SCROLL: RIGHT\nI want to Investigate video editing applications, select and download one using 'Google Play Store'. You can also use 'Facebook' to seek recommendations or reviews. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Investigate video editing applications, select and download one using 'Google Play Store'. You can also use 'Facebook' to seek recommendations or reviews.\nThe historical actions are: step 1: CLICK: (853, 374)\nstep 2: CLICK: (824, 49)\nstep 3: TYPE: Video Editing Apps\nstep 4: CLICK: (918, 905)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: SCROLL: RIGHT\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (922, 908)\nB: CLICK: (918, 898)\nC: CLICK: (840, 884)\nD: CLICK: (145, 623)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_26_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_26_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_26_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_26_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_26_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_26_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_26_6.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_26_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_26_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_26_9.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (488, 921)\nB: PRESS_HOME\nC: CLICK: (223, 124)\nD: CLICK: (219, 943)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (362, 617)\nstep 2: TYPE: fantasy movies\nstep 3: CLICK: (897, 892)\nstep 4: CLICK: (443, 430)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (595, 510)\nI want to Organize a movie night by selecting a fantasy movie on DuckDuckgo, adding some snacks to your Amazon shopping cart, sending out invitations to katsunaksu via Tumblr app, and setting a reminder on Clock. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Organize a movie night by selecting a fantasy movie on DuckDuckgo, adding some snacks to your Amazon shopping cart, sending out invitations to katsunaksu via Tumblr app, and setting a reminder on Clock.\nThe historical actions are: step 1: CLICK: (362, 617)\nstep 2: TYPE: fantasy movies\nstep 3: CLICK: (897, 892)\nstep 4: CLICK: (443, 430)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (595, 510)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (488, 921)\nB: PRESS_HOME\nC: CLICK: (223, 124)\nD: CLICK: (219, 943)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_27_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_27_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_27_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_27_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_27_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_27_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_27_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_27_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_27_8.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (371, 528)\nB: CLICK: (917, 880)\nC: CLICK: (667, 672)\nD: CLICK: (609, 372)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (791, 478)\nstep 2: CLICK: (720, 69)\nstep 3: CLICK: (405, 71)\nstep 4: TYPE: cooking class\nstep 5: CLICK: (881, 880)\nstep 6: CLICK: (434, 280)\nstep 7: CLICK: (326, 446)\nstep 8: PRESS_HOME\nstep 9: CLICK: (552, 478)\nstep 10: CLICK: (277, 71)\nstep 11: TYPE: cooking pan\nstep 12: CLICK: (880, 884)\nstep 13: CLICK: (155, 507)\nstep 14: CLICK: (461, 904)\nstep 15: PRESS_HOME\nI want to Search for a Cooking class on Likee, purchase the required materials through Wish, and schedule a study reminder in Calendar. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Search for a Cooking class on Likee, purchase the required materials through Wish, and schedule a study reminder in Calendar.\nThe historical actions are: step 1: CLICK: (791, 478)\nstep 2: CLICK: (720, 69)\nstep 3: CLICK: (405, 71)\nstep 4: TYPE: cooking class\nstep 5: CLICK: (881, 880)\nstep 6: CLICK: (434, 280)\nstep 7: CLICK: (326, 446)\nstep 8: PRESS_HOME\nstep 9: CLICK: (552, 478)\nstep 10: CLICK: (277, 71)\nstep 11: TYPE: cooking pan\nstep 12: CLICK: (880, 884)\nstep 13: CLICK: (155, 507)\nstep 14: CLICK: (461, 904)\nstep 15: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (371, 528)\nB: CLICK: (917, 880)\nC: CLICK: (667, 672)\nD: CLICK: (609, 372)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_3.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_13.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_14.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_28_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: TYPE: properties of hexagon\nC: CLICK: (419, 708)\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (141, 703)\nstep 2: CLICK: (683, 368)\nstep 3: PRESS_HOME\nI want to First, access the Google Play Store to install the Tiktok app. Once installed, open Tiktok, then navigate to the device's Settings to disable notifications for the app. After adjusting the settings, reopen Tiktok to watch a video. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, access the Google Play Store to install the Tiktok app. Once installed, open Tiktok, then navigate to the device's Settings to disable notifications for the app. After adjusting the settings, reopen Tiktok to watch a video.\nThe historical actions are: step 1: CLICK: (141, 703)\nstep 2: CLICK: (683, 368)\nstep 3: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: TYPE: properties of hexagon\nC: CLICK: (419, 708)\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_29_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_29_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_29_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_29_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (527, 869)\nB: PRESS_HOME\nC: CLICK: (427, 466)\nD: CLICK: (858, 871)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (397, 499)\nstep 2: CLICK: (76, 160)\nstep 3: TYPE: Plastic Pollution Solutions\nstep 4: CLICK: (940, 888)\nI want to Use Google News to search for the latest articles on Plastic Pollution Solutions, then share the article on X with liudehu19294094. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Google News to search for the latest articles on Plastic Pollution Solutions, then share the article on X with liudehu19294094.\nThe historical actions are: step 1: CLICK: (397, 499)\nstep 2: CLICK: (76, 160)\nstep 3: TYPE: Plastic Pollution Solutions\nstep 4: CLICK: (940, 888)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (527, 869)\nB: PRESS_HOME\nC: CLICK: (427, 466)\nD: CLICK: (858, 871)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_30_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_30_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_30_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_30_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_30_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: CLICK: (671, 539)\nC: CLICK: (929, 80)\nD: CLICK: (192, 631)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (610, 833)\nstep 2: PRESS_HOME\nstep 3: CLICK: (855, 835)\nstep 4: CLICK: (828, 72)\nI want to Watch a video on YouTube that discusses recommendations for fitness tracking apps, then head over to the Google Play Store and download one of the suggested apps. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Watch a video on YouTube that discusses recommendations for fitness tracking apps, then head over to the Google Play Store and download one of the suggested apps.\nThe historical actions are: step 1: CLICK: (610, 833)\nstep 2: PRESS_HOME\nstep 3: CLICK: (855, 835)\nstep 4: CLICK: (828, 72)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (671, 539)\nC: CLICK: (929, 80)\nD: CLICK: (192, 631)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_31_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_31_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_31_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_31_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_31_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (926, 879)\nB: CLICK: (409, 132)\nC: CLICK: (666, 183)\nD: CLICK: (519, 103)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (666, 496)\nstep 3: CLICK: (124, 101)\nstep 4: CLICK: (633, 114)\nstep 5: CLICK: (894, 103)\nI want to Locate a nearby fitness training center using Waze Navigation & Live Traffic, then proceed to the Google Play Store to download a fitness tracking app for setting your goals. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate a nearby fitness training center using Waze Navigation & Live Traffic, then proceed to the Google Play Store to download a fitness tracking app for setting your goals.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (666, 496)\nstep 3: CLICK: (124, 101)\nstep 4: CLICK: (633, 114)\nstep 5: CLICK: (894, 103)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (926, 879)\nB: CLICK: (409, 132)\nC: CLICK: (666, 183)\nD: CLICK: (519, 103)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_32_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_32_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_32_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_32_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_32_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_32_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (249, 164)\nB: CLICK: (362, 71)\nC: CLICK: (271, 934)\nD: PRESS_HOME\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (384, 520)\nstep 2: CLICK: (88, 71)\nstep 3: CLICK: (624, 927)\nI want to Listen to an Electronic style album on Pandora, and then share the name of the album with katsunaksu on Tumblr. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Listen to an Electronic style album on Pandora, and then share the name of the album with katsunaksu on Tumblr.\nThe historical actions are: step 1: CLICK: (384, 520)\nstep 2: CLICK: (88, 71)\nstep 3: CLICK: (624, 927)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (249, 164)\nB: CLICK: (362, 71)\nC: CLICK: (271, 934)\nD: PRESS_HOME\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_33_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_33_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_33_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_33_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (316, 475)\nB: CLICK: (257, 257)\nC: CLICK: (559, 183)\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (154, 532)\nstep 2: CLICK: (514, 931)\nstep 3: CLICK: (250, 258)\nstep 4: TYPE: vintage camera\nstep 5: CLICK: (898, 936)\nstep 6: CLICK: (648, 852)\nstep 7: CLICK: (873, 846)\nstep 8: CLICK: (308, 790)\nstep 9: CLICK: (308, 192)\nstep 10: TYPE: caba62244@gmail.com\nI want to Use GenZArt:Fast AI Art Generator to create an image focused on a vintage camera, then share it through Gmail with caba62244@gmail.com. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use GenZArt:Fast AI Art Generator to create an image focused on a vintage camera, then share it through Gmail with caba62244@gmail.com.\nThe historical actions are: step 1: CLICK: (154, 532)\nstep 2: CLICK: (514, 931)\nstep 3: CLICK: (250, 258)\nstep 4: TYPE: vintage camera\nstep 5: CLICK: (898, 936)\nstep 6: CLICK: (648, 852)\nstep 7: CLICK: (873, 846)\nstep 8: CLICK: (308, 790)\nstep 9: CLICK: (308, 192)\nstep 10: TYPE: caba62244@gmail.com\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (316, 475)\nB: CLICK: (257, 257)\nC: CLICK: (559, 183)\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_34_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_34_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_34_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_34_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_34_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_34_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_34_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_34_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_34_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_34_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_34_10.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": 
"GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: LEFT\nB: PRESS_RECENT\nC: TYPE: fitness tracking apps\nD: CLICK: (228, 798)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (788, 340)\nstep 3: CLICK: (45, 682)\nstep 4: CLICK: (387, 241)\nstep 5: CLICK: (201, 234)\nstep 6: TYPE: boxing gym\nstep 7: CLICK: (193, 179)\nstep 8: PRESS_RECENT\nstep 9: CLICK: (94, 266)\nstep 10: CLICK: (298, 99)\nstep 11: TYPE: fitness tracking apps\nstep 12: CLICK: (214, 172)\nI want to Utilize Yandex Navigator to locate a nearby Boxing gym, and subsequently, visit the Google Play Store to download a fitness tracking app for setting your fitness goals. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize Yandex Navigator to locate a nearby Boxing gym, and subsequently, visit the Google Play Store to download a fitness tracking app for setting your fitness goals.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (788, 340)\nstep 3: CLICK: (45, 682)\nstep 4: CLICK: (387, 241)\nstep 5: CLICK: (201, 234)\nstep 6: TYPE: boxing gym\nstep 7: CLICK: (193, 179)\nstep 8: PRESS_RECENT\nstep 9: CLICK: (94, 266)\nstep 10: CLICK: (298, 99)\nstep 11: TYPE: fitness tracking apps\nstep 12: CLICK: (214, 172)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: LEFT\nB: PRESS_RECENT\nC: TYPE: fitness tracking apps\nD: CLICK: (228, 798)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_35_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_35_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_35_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_35_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_35_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_35_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_35_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_35_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_35_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_35_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_35_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_35_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_35_12.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": 
"GUI-Odyssey", "options": "A: CLICK: (148, 806)\nB: CLICK: (59, 76)\nC: SCROLL: RIGHT\nD: CLICK: (248, 121)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (629, 809)\nstep 2: CLICK: (253, 435)\nstep 3: CLICK: (836, 560)\nstep 4: PRESS_HOME\nstep 5: CLICK: (389, 806)\nstep 6: CLICK: (292, 344)\nstep 7: CLICK: (693, 76)\nstep 8: PRESS_HOME\nI want to Search for the best video blogs on DIY crafts using the Firefox Browser, then increase the brightness on your phone through the Settings app. Finally, open the Youtube app and follow along with the videos. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Search for the best video blogs on DIY crafts using the Firefox Browser, then increase the brightness on your phone through the Settings app. 
Finally, open the Youtube app and follow along with the videos.\nThe historical actions are: step 1: CLICK: (629, 809)\nstep 2: CLICK: (253, 435)\nstep 3: CLICK: (836, 560)\nstep 4: PRESS_HOME\nstep 5: CLICK: (389, 806)\nstep 6: CLICK: (292, 344)\nstep 7: CLICK: (693, 76)\nstep 8: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (148, 806)\nB: CLICK: (59, 76)\nC: SCROLL: RIGHT\nD: CLICK: (248, 121)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_36_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_36_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_36_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_36_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_36_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_36_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_36_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_36_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_36_8.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: AI\nB: CLICK: (369, 261)\nC: SCROLL: UP\nD: CLICK: (647, 519)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (399, 374)\nstep 3: CLICK: (107, 476)\nstep 4: CLICK: (345, 624)\nstep 5: CLICK: (717, 590)\nstep 6: CLICK: (321, 649)\nstep 7: CLICK: (642, 841)\nstep 8: CLICK: (610, 689)\nstep 9: CLICK: (629, 588)\nstep 10: CLICK: (895, 754)\nstep 11: CLICK: (593, 751)\nstep 12: CLICK: (185, 685)\nstep 13: CLICK: (838, 845)\nstep 14: PRESS_HOME\nI want to Use 'ClevCalc - Calculator' to compute the sum of 5.69 and 34 for today's total cost, and then log the result in 'Wallet: Budget Money Manager'. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use 'ClevCalc - Calculator' to compute the sum of 5.69 and 34 for today's total cost, and then log the result in 'Wallet: Budget Money Manager'.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (399, 374)\nstep 3: CLICK: (107, 476)\nstep 4: CLICK: (345, 624)\nstep 5: CLICK: (717, 590)\nstep 6: CLICK: (321, 649)\nstep 7: CLICK: (642, 841)\nstep 8: CLICK: (610, 689)\nstep 9: CLICK: (629, 588)\nstep 10: CLICK: (895, 754)\nstep 11: CLICK: (593, 751)\nstep 12: CLICK: (185, 685)\nstep 13: CLICK: (838, 845)\nstep 14: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: AI\nB: CLICK: (369, 261)\nC: SCROLL: UP\nD: CLICK: (647, 519)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_37_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_37_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_37_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_37_3.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_37_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_37_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_37_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_37_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_37_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_37_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_37_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_37_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_37_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_37_13.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_37_14.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (338, 431)\nB: TYPE: May 2 Cost\nC: TYPE: 'Dune' by Frank Herbert book review\nD: CLICK: (802, 64)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (613, 624)\nstep 2: CLICK: (831, 50)\nstep 3: TYPE: portable speaker recommendation\nstep 4: CLICK: (916, 922)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (634, 570)\nstep 11: PRESS_HOME\nstep 12: CLICK: (414, 109)\nstep 13: CLICK: (712, 65)\nI want to Utilize Amazon to search for and identify a highly recommended portable speaker. Once you have selected a speaker, use Amazon to make the purchase. Additionally, you can check Facebook for user reviews and recommendations before finalizing your decision. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Amazon to search for and identify a highly recommended portable speaker. Once you have selected a speaker, use Amazon to make the purchase. Additionally, you can check Facebook for user reviews and recommendations before finalizing your decision.\nThe historical actions are: step 1: CLICK: (613, 624)\nstep 2: CLICK: (831, 50)\nstep 3: TYPE: portable speaker recommendation\nstep 4: CLICK: (916, 922)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (634, 570)\nstep 11: PRESS_HOME\nstep 12: CLICK: (414, 109)\nstep 13: CLICK: (712, 65)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (338, 431)\nB: TYPE: May 2 Cost\nC: TYPE: 'Dune' by Frank Herbert book review\nD: CLICK: (802, 64)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_38_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_38_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_38_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_38_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_38_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_38_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_38_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_38_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_38_8.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_38_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_38_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_38_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_38_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_38_13.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: COMPLETE\nB: CLICK: (496, 924)\nC: CLICK: (873, 188)\nD: SCROLL: LEFT\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (410, 815)\nstep 2: CLICK: (918, 317)\nstep 3: PRESS_HOME\nstep 4: CLICK: (130, 244)\nI want to Open Triller to watch a trending video, and then use the Setting app to turn the volume on your phone to maximum. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Open Triller to watch a trending video, and then use the Setting app to turn the volume on your phone to maximum.\nThe historical actions are: step 1: CLICK: (410, 815)\nstep 2: CLICK: (918, 317)\nstep 3: PRESS_HOME\nstep 4: CLICK: (130, 244)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (496, 924)\nC: CLICK: (873, 188)\nD: SCROLL: LEFT\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_39_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_39_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_39_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_39_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_39_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (138, 93)\nB: TYPE: 'The Midnight Library' by Matt Haig book review\nC: COMPLETE\nD: CLICK: (491, 146)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (727, 547)\nstep 2: CLICK: (373, 146)\nstep 3: TYPE: Autonomous Driving Technologies\nstep 4: CLICK: (887, 689)\nI want to Using X and Microsoft News, search for the latest news articles on Autonomous Driving Technologies and share the most relevant article. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using X and Microsoft News, search for the latest news articles on Autonomous Driving Technologies and share the most relevant article.\nThe historical actions are: step 1: CLICK: (727, 547)\nstep 2: CLICK: (373, 146)\nstep 3: TYPE: Autonomous Driving Technologies\nstep 4: CLICK: (887, 689)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (138, 93)\nB: TYPE: 'The Midnight Library' by Matt Haig book review\nC: COMPLETE\nD: CLICK: (491, 146)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_40_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_40_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_40_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_40_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_40_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: cricket\nB: TYPE: Fall of the Roman Empire\nC: CLICK: (865, 738)\nD: CLICK: (245, 92)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (397, 372)\nstep 2: CLICK: (237, 604)\nstep 3: CLICK: (492, 615)\nstep 4: CLICK: (789, 629)\nstep 5: CLICK: (209, 718)\nstep 6: CLICK: (527, 721)\nstep 7: CLICK: (759, 725)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nI want to First, use Applock Pro - APP Lock & Guard to lock the PayPal - Send, Shop, Manage app. Then, open the PayPal - Send, Shop, Manage app to verify the lock by entering the PIN 123456. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, use Applock Pro - APP Lock & Guard to lock the PayPal - Send, Shop, Manage app. Then, open the PayPal - Send, Shop, Manage app to verify the lock by entering the PIN 123456.\nThe historical actions are: step 1: CLICK: (397, 372)\nstep 2: CLICK: (237, 604)\nstep 3: CLICK: (492, 615)\nstep 4: CLICK: (789, 629)\nstep 5: CLICK: (209, 718)\nstep 6: CLICK: (527, 721)\nstep 7: CLICK: (759, 725)\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: cricket\nB: TYPE: Fall of the Roman Empire\nC: CLICK: (865, 738)\nD: CLICK: (245, 92)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_41_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_41_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_41_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_41_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_41_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_41_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_41_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_41_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_41_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_41_9.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: COMPLETE\nB: CLICK: (698, 
449)\nC: CLICK: (142, 152)\nD: CLICK: (694, 431)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (795, 910)\nstep 2: PRESS_HOME\nstep 3: CLICK: (681, 909)\nstep 4: CLICK: (180, 80)\nstep 5: TYPE: Home Workout No Equipment App\nstep 6: CLICK: (864, 873)\nstep 7: CLICK: (177, 818)\nI want to Open YouTube APP and watch a video about fitness tracking app recommendations. Then, go to the Google Play Store and download one of the recommended apps. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open YouTube APP and watch a video about fitness tracking app recommendations. Then, go to the Google Play Store and download one of the recommended apps.\nThe historical actions are: step 1: CLICK: (795, 910)\nstep 2: PRESS_HOME\nstep 3: CLICK: (681, 909)\nstep 4: CLICK: (180, 80)\nstep 5: TYPE: Home Workout No Equipment App\nstep 6: CLICK: (864, 873)\nstep 7: CLICK: (177, 818)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (698, 449)\nC: CLICK: (142, 152)\nD: CLICK: (694, 431)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_42_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_42_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_42_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_42_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_42_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_42_5.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_42_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_42_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (581, 382)\nB: CLICK: (616, 383)\nC: LONG_PRESS: (420, 547)\nD: CLICK: (865, 839)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (130, 365)\nstep 3: CLICK: (199, 783)\nstep 4: CLICK: (194, 783)\nstep 5: CLICK: (889, 785)\nstep 6: CLICK: (423, 785)\nstep 7: CLICK: (423, 785)\nstep 8: CLICK: (907, 891)\nstep 9: PRESS_HOME\nI want to Use Calculator Plus with History to compute today's total cost by adding 11 and 22, then record the result in Monefy. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Calculator Plus with History to compute today's total cost by adding 11 and 22, then record the result in Monefy.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (130, 365)\nstep 3: CLICK: (199, 783)\nstep 4: CLICK: (194, 783)\nstep 5: CLICK: (889, 785)\nstep 6: CLICK: (423, 785)\nstep 7: CLICK: (423, 785)\nstep 8: CLICK: (907, 891)\nstep 9: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (581, 382)\nB: CLICK: (616, 383)\nC: LONG_PRESS: (420, 547)\nD: CLICK: (865, 839)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_43_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_43_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_43_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_43_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_43_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_43_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_43_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_43_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_43_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_43_9.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (791, 303)\nB: CLICK: (938, 385)\nC: CLICK: (138, 554)\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: SCROLL: RIGHT\nstep 2: CLICK: (831, 619)\nstep 3: CLICK: (904, 180)\nstep 4: TYPE: ford most popular car\nstep 5: CLICK: (924, 903)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (858, 622)\nstep 9: CLICK: (490, 317)\nstep 10: CLICK: (187, 129)\nstep 11: CLICK: (269, 167)\nstep 12: TYPE: f\nI want to Using Firefox, identify the most popular car product of Chevrolet and verify its price on the AutoScout24: Buy & sell cars app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Firefox, identify the most popular car product of Chevrolet and verify its price on the AutoScout24: Buy & sell cars app.\nThe historical actions are: step 1: SCROLL: RIGHT\nstep 2: CLICK: (831, 619)\nstep 3: CLICK: (904, 180)\nstep 4: TYPE: ford most popular car\nstep 5: CLICK: (924, 903)\nstep 6: PRESS_HOME\nstep 7: SCROLL: LEFT\nstep 8: CLICK: (858, 622)\nstep 9: CLICK: (490, 317)\nstep 10: CLICK: (187, 129)\nstep 11: CLICK: (269, 167)\nstep 12: TYPE: f\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (791, 303)\nB: CLICK: (938, 385)\nC: CLICK: (138, 554)\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_44_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_44_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_44_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_44_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_44_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_44_5.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_44_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_44_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_44_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_44_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_44_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_44_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_44_12.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (858, 212)\nB: CLICK: (424, 249)\nC: CLICK: (527, 894)\nD: CLICK: (807, 539)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (705, 223)\nstep 3: CLICK: (813, 256)\nstep 4: TYPE: weather in Shanghai tomorrow\nstep 5: CLICK: (930, 248)\nstep 6: PRESS_HOME\nI want to Using Firefox, find out what the weather will be like in Shanghai tomorrow and then use Google Docs to create a to-do list based on that forecast. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using Firefox, find out what the weather will be like in Shanghai tomorrow and then use Google Docs to create a to-do list based on that forecast.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (705, 223)\nstep 3: CLICK: (813, 256)\nstep 4: TYPE: weather in Shanghai tomorrow\nstep 5: CLICK: (930, 248)\nstep 6: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (858, 212)\nB: CLICK: (424, 249)\nC: CLICK: (527, 894)\nD: CLICK: (807, 539)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_45_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_45_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_45_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_45_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_45_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_45_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_45_6.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (804, 656)\nB: CLICK: (848, 406)\nC: CLICK: (344, 851)\nD: PRESS_HOME\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (154, 484)\nstep 2: CLICK: (884, 793)\nstep 3: CLICK: (547, 548)\nstep 4: CLICK: (919, 167)\nstep 5: CLICK: (766, 742)\nstep 6: CLICK: (873, 166)\nstep 7: CLICK: (806, 164)\nI want to Use Adobe Express: AI Video Design to resize a photo and then share it to Facebook Moments. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Adobe Express: AI Video Design to resize a photo and then share it to Facebook Moments.\nThe historical actions are: step 1: CLICK: (154, 484)\nstep 2: CLICK: (884, 793)\nstep 3: CLICK: (547, 548)\nstep 4: CLICK: (919, 167)\nstep 5: CLICK: (766, 742)\nstep 6: CLICK: (873, 166)\nstep 7: CLICK: (806, 164)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (804, 656)\nB: CLICK: (848, 406)\nC: CLICK: (344, 851)\nD: PRESS_HOME\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_46_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_46_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_46_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_46_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_46_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_46_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_46_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_46_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: Hindi\nB: SCROLL: UP\nC: TYPE: do yoga with this\nD: TYPE: the beach boys\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (317, 495)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\nstep 5: CLICK: (119, 643)\nstep 6: CLICK: (546, 354)\nstep 7: CLICK: (589, 438)\nstep 8: CLICK: (529, 628)\nstep 9: CLICK: (962, 75)\nI want to Switch the phone's language to Hindi, then open the 'Setting' app to confirm the change, followed by the 'Photos' app to ensure everything is working properly. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Switch the phone's language to Hindi, then open the 'Setting' app to confirm the change, followed by the 'Photos' app to ensure everything is working properly.\nThe historical actions are: step 1: CLICK: (317, 495)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\nstep 5: CLICK: (119, 643)\nstep 6: CLICK: (546, 354)\nstep 7: CLICK: (589, 438)\nstep 8: CLICK: (529, 628)\nstep 9: CLICK: (962, 75)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Hindi\nB: SCROLL: UP\nC: TYPE: do yoga with this\nD: TYPE: the beach boys\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_47_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_47_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_47_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_47_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_47_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_47_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_47_6.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_47_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_47_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_47_9.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (610, 610)\nB: PRESS_HOME\nC: CLICK: (422, 581)\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (794, 337)\nstep 2: CLICK: (395, 612)\nstep 3: CLICK: (514, 610)\nI want to Using Applock Pro - APP Lock & Guard, secure the Google Pay app. Once done, unlock Google Pay and verify its functionality using the PIN 123789. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Applock Pro - APP Lock & Guard, secure the Google Pay app. 
Once done, unlock Google Pay and verify its functionality using the PIN 123789.\nThe historical actions are: step 1: CLICK: (794, 337)\nstep 2: CLICK: (395, 612)\nstep 3: CLICK: (514, 610)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (610, 610)\nB: PRESS_HOME\nC: CLICK: (422, 581)\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_48_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_48_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_48_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_48_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (76, 299)\nB: TYPE: eBay\nC: CLICK: (598, 649)\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (185, 645)\nstep 2: CLICK: (299, 425)\nstep 3: TYPE: documentary movies\nstep 4: CLICK: (927, 897)\nstep 5: SCROLL: UP\nstep 6: CLICK: (253, 635)\nstep 7: PRESS_HOME\nstep 8: CLICK: (555, 475)\nstep 9: CLICK: (379, 156)\nstep 10: TYPE: snacks\nstep 11: CLICK: (899, 884)\nstep 12: CLICK: (827, 284)\nI want to Organize a movie night by choosing a documentary film using Chrome, adding snacks to your Ebay cart, sending invitations to liudehu19294094 via X, and setting a reminder on Clock. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Organize a movie night by choosing a documentary film using Chrome, adding snacks to your Ebay cart, sending invitations to liudehu19294094 via X, and setting a reminder on Clock.\nThe historical actions are: step 1: CLICK: (185, 645)\nstep 2: CLICK: (299, 425)\nstep 3: TYPE: documentary movies\nstep 4: CLICK: (927, 897)\nstep 5: SCROLL: UP\nstep 6: CLICK: (253, 635)\nstep 7: PRESS_HOME\nstep 8: CLICK: (555, 475)\nstep 9: CLICK: (379, 156)\nstep 10: TYPE: snacks\nstep 11: CLICK: (899, 884)\nstep 12: CLICK: (827, 284)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (76, 299)\nB: TYPE: eBay\nC: CLICK: (598, 649)\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_49_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_49_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_49_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_49_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_49_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_49_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_49_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_49_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_49_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_49_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_49_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_49_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_49_12.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", 
"options": "A: CLICK: (480, 626)\nB: PRESS_HOME\nC: COMPLETE\nD: CLICK: (127, 471)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (855, 743)\nstep 2: CLICK: (563, 248)\nstep 3: PRESS_HOME\nstep 4: CLICK: (397, 726)\nI want to First, go to the App Store and uninstall the TikTok app. Confirm that TikTok is successfully uninstalled. Next, download the Likee app from the Google Play Store. Once the download is complete, open the Likee app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, go to the App Store and uninstall the TikTok app. Confirm that TikTok is successfully uninstalled. Next, download the Likee app from the Google Play Store. 
Once the download is complete, open the Likee app.\nThe historical actions are: step 1: CLICK: (855, 743)\nstep 2: CLICK: (563, 248)\nstep 3: PRESS_HOME\nstep 4: CLICK: (397, 726)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (480, 626)\nB: PRESS_HOME\nC: COMPLETE\nD: CLICK: (127, 471)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_50_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_50_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_50_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_50_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_50_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: CLICK: (641, 277)\nC: CLICK: (926, 63)\nD: CLICK: (931, 917)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (870, 512)\nstep 2: CLICK: (378, 231)\nstep 3: CLICK: (931, 62)\nstep 4: CLICK: (593, 442)\nstep 5: CLICK: (634, 249)\nstep 6: PRESS_HOME\nstep 7: CLICK: (578, 125)\nI want to Locate the working file on your phone using Google Docs, then share it with Tzhau Jau via Instagram. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Locate the working file on your phone using Google Docs, then share it with Tzhau Jau via Instagram.\nThe historical actions are: step 1: CLICK: (870, 512)\nstep 2: CLICK: (378, 231)\nstep 3: CLICK: (931, 62)\nstep 4: CLICK: (593, 442)\nstep 5: CLICK: (634, 249)\nstep 6: PRESS_HOME\nstep 7: CLICK: (578, 125)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (641, 277)\nC: CLICK: (926, 63)\nD: CLICK: (931, 917)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_51_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_51_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_51_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_51_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_51_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_51_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_51_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_51_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (944, 930)\nB: TYPE: 2019 nobel prize winners in physics\nC: CLICK: (872, 787)\nD: CLICK: (627, 943)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (159, 646)\nstep 2: CLICK: (427, 280)\nstep 3: TYPE: Francis Crick\nI want to Open Chrome, search for an introduction about Francis Crick, and share the link to the webpage on Tumblr with katsunaksu. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open Chrome, search for an introduction about Francis Crick, and share the link to the webpage on Tumblr with katsunaksu.\nThe historical actions are: step 1: CLICK: (159, 646)\nstep 2: CLICK: (427, 280)\nstep 3: TYPE: Francis Crick\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (944, 930)\nB: TYPE: 2019 nobel prize winners in physics\nC: CLICK: (872, 787)\nD: CLICK: (627, 943)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_52_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_52_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_52_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_52_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: COMPLETE\nC: PRESS_HOME\nD: TYPE: do yoga with this\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (134, 137)\nstep 2: CLICK: (523, 194)\nstep 3: CLICK: (217, 832)\nstep 4: TYPE: bank\nstep 5: CLICK: (922, 895)\nstep 6: CLICK: (304, 154)\nstep 7: PRESS_HOME\nstep 8: CLICK: (143, 397)\nstep 9: CLICK: (121, 502)\nstep 10: TYPE: Citi\nstep 11: CLICK: (210, 317)\nstep 12: CLICK: (607, 929)\nstep 13: CLICK: (386, 944)\nI want to Locate a nearby bank using GPS and then book a ride with Uber. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate a nearby bank using GPS and then book a ride with Uber.\nThe historical actions are: step 1: CLICK: (134, 137)\nstep 2: CLICK: (523, 194)\nstep 3: CLICK: (217, 832)\nstep 4: TYPE: bank\nstep 5: CLICK: (922, 895)\nstep 6: CLICK: (304, 154)\nstep 7: PRESS_HOME\nstep 8: CLICK: (143, 397)\nstep 9: CLICK: (121, 502)\nstep 10: TYPE: Citi\nstep 11: CLICK: (210, 317)\nstep 12: CLICK: (607, 929)\nstep 13: CLICK: (386, 944)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: COMPLETE\nC: PRESS_HOME\nD: TYPE: do yoga with this\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_53_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_53_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_53_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_53_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_53_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_53_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_53_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_53_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_53_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_53_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_53_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_53_11.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_53_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_53_13.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (409, 290)\nB: CLICK: (795, 184)\nC: CLICK: (322, 228)\nD: CLICK: (485, 764)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (688, 673)\nstep 2: CLICK: (431, 898)\nstep 3: CLICK: (220, 253)\nstep 4: CLICK: (463, 884)\nstep 5: CLICK: (422, 796)\nstep 6: SCROLL: UP\nstep 7: CLICK: (831, 686)\nI want to Use Google Meet to set up an online meeting, and then share the meeting link with Victor James via Facebook. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Google Meet to set up an online meeting, and then share the meeting link with Victor James via Facebook.\nThe historical actions are: step 1: CLICK: (688, 673)\nstep 2: CLICK: (431, 898)\nstep 3: CLICK: (220, 253)\nstep 4: CLICK: (463, 884)\nstep 5: CLICK: (422, 796)\nstep 6: SCROLL: UP\nstep 7: CLICK: (831, 686)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (409, 290)\nB: CLICK: (795, 184)\nC: CLICK: (322, 228)\nD: CLICK: (485, 764)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_54_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_54_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_54_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_54_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_54_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_54_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_54_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_54_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: Amazfit BIP\nB: CLICK: (529, 912)\nC: CLICK: (252, 107)\nD: TYPE: Love story\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (128, 508)\nstep 2: CLICK: (629, 925)\nstep 3: CLICK: (885, 929)\nstep 4: CLICK: (206, 86)\nstep 5: TYPE: International Space Station\nstep 6: CLICK: (916, 891)\nstep 7: CLICK: (143, 289)\nstep 8: CLICK: (641, 54)\nstep 9: CLICK: (889, 661)\nstep 10: PRESS_HOME\nstep 11: CLICK: (403, 107)\nI want to Utilize the BBC News app to find the most recent articles related to the International Space Station, and then share one of the articles on Facebook. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize the BBC News app to find the most recent articles related to the International Space Station, and then share one of the articles on Facebook.\nThe historical actions are: step 1: CLICK: (128, 508)\nstep 2: CLICK: (629, 925)\nstep 3: CLICK: (885, 929)\nstep 4: CLICK: (206, 86)\nstep 5: TYPE: International Space Station\nstep 6: CLICK: (916, 891)\nstep 7: CLICK: (143, 289)\nstep 8: CLICK: (641, 54)\nstep 9: CLICK: (889, 661)\nstep 10: PRESS_HOME\nstep 11: CLICK: (403, 107)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Amazfit BIP\nB: CLICK: (529, 912)\nC: CLICK: (252, 107)\nD: TYPE: Love story\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_55_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_55_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_55_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_55_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_55_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_55_5.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_55_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_55_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_55_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_55_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_55_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_55_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (703, 940)\nB: CLICK: (921, 909)\nC: COMPLETE\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (889, 635)\nstep 2: CLICK: (379, 131)\nstep 3: TYPE: animation movies\nI want to Organize a movie night by first selecting an animated film on Opera, then adding some snacks to your cart on Amazon. Next, send an invitation to caba62244@gmail.com using Gmail, and finally, set a reminder on Clock to ensure you don't forget. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Organize a movie night by first selecting an animated film on Opera, then adding some snacks to your cart on Amazon. 
Next, send an invitation to caba62244@gmail.com using Gmail, and finally, set a reminder on Clock to ensure you don't forget.\nThe historical actions are: step 1: CLICK: (889, 635)\nstep 2: CLICK: (379, 131)\nstep 3: TYPE: animation movies\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (703, 940)\nB: CLICK: (921, 909)\nC: COMPLETE\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_56_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_56_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_56_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_56_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: COMPLETE\nB: TYPE: park\nC: CLICK: (875, 876)\nD: SCROLL: RIGHT\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (837, 522)\nstep 2: CLICK: (505, 466)\nstep 3: CLICK: (371, 294)\nI want to Utilize Citymapper to locate a nearby park and then visit the Google Play Store to download a fitness tracking app to set your fitness goals. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize Citymapper to locate a nearby park and then visit the Google Play Store to download a fitness tracking app to set your fitness goals.\nThe historical actions are: step 1: CLICK: (837, 522)\nstep 2: CLICK: (505, 466)\nstep 3: CLICK: (371, 294)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: TYPE: park\nC: CLICK: (875, 876)\nD: SCROLL: RIGHT\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_57_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_57_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_57_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_57_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: CLICK: (185, 645)\nC: CLICK: (865, 77)\nD: COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (843, 523)\nstep 2: CLICK: (897, 849)\nstep 3: CLICK: (131, 374)\nstep 4: CLICK: (899, 80)\nstep 5: CLICK: (731, 812)\nstep 6: CLICK: (916, 88)\nstep 7: CLICK: (809, 76)\nstep 8: CLICK: (943, 81)\nstep 9: CLICK: (297, 896)\nI want to Use Adobe Express: AI Video Design to resize a photo, then share the resized photo to Facebook moments. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Adobe Express: AI Video Design to resize a photo, then share the resized photo to Facebook moments.\nThe historical actions are: step 1: CLICK: (843, 523)\nstep 2: CLICK: (897, 849)\nstep 3: CLICK: (131, 374)\nstep 4: CLICK: (899, 80)\nstep 5: CLICK: (731, 812)\nstep 6: CLICK: (916, 88)\nstep 7: CLICK: (809, 76)\nstep 8: CLICK: (943, 81)\nstep 9: CLICK: (297, 896)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (185, 645)\nC: CLICK: (865, 77)\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_58_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_58_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_58_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_58_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_58_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_58_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_58_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_58_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_58_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_58_9.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: CLICK: (315, 298)\nC: CLICK: (174, 374)\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (165, 110)\nstep 2: CLICK: (340, 570)\nstep 3: CLICK: (206, 683)\nstep 4: CLICK: (814, 72)\nstep 5: TYPE: Uber\nstep 6: CLICK: (209, 148)\nstep 7: CLICK: (146, 541)\nstep 8: CLICK: (838, 449)\nstep 9: PRESS_HOME\nstep 10: CLICK: (386, 240)\nI want to Switch the notifications on or off for any application on your phone, and then proceed to launch the app. Make sure you use the 'Setting' app to adjust the notifications and then open the 'Uber' app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Switch the notifications on or off for any application on your phone, and then proceed to launch the app. Make sure you use the 'Setting' app to adjust the notifications and then open the 'Uber' app.\nThe historical actions are: step 1: CLICK: (165, 110)\nstep 2: CLICK: (340, 570)\nstep 3: CLICK: (206, 683)\nstep 4: CLICK: (814, 72)\nstep 5: TYPE: Uber\nstep 6: CLICK: (209, 148)\nstep 7: CLICK: (146, 541)\nstep 8: CLICK: (838, 449)\nstep 9: PRESS_HOME\nstep 10: CLICK: (386, 240)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (315, 298)\nC: CLICK: (174, 374)\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_59_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_59_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_59_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_59_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_59_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_59_5.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_59_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_59_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_59_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_59_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_59_10.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (369, 842)\nB: CLICK: (538, 106)\nC: SCROLL: UP\nD: SCROLL: LEFT\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (453, 491)\nstep 2: SCROLL: UP\nstep 3: CLICK: (160, 682)\nstep 4: SCROLL: UP\nstep 5: CLICK: (437, 672)\nstep 6: CLICK: (480, 427)\nstep 7: CLICK: (480, 427)\nstep 8: PRESS_HOME\nI want to Utilize the 'Setting' app to activate 'Do not disturb' mode on your phone and then use the 'Clock' app to set an alarm for 6:00 AM to wake you up. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize the 'Setting' app to activate 'Do not disturb' mode on your phone and then use the 'Clock' app to set an alarm for 6:00 AM to wake you up.\nThe historical actions are: step 1: CLICK: (453, 491)\nstep 2: SCROLL: UP\nstep 3: CLICK: (160, 682)\nstep 4: SCROLL: UP\nstep 5: CLICK: (437, 672)\nstep 6: CLICK: (480, 427)\nstep 7: CLICK: (480, 427)\nstep 8: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (369, 842)\nB: CLICK: (538, 106)\nC: SCROLL: UP\nD: SCROLL: LEFT\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_60_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_60_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_60_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_60_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_60_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_60_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_60_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_60_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_60_8.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: o\nB: CLICK: (795, 676)\nC: PRESS_HOME\nD: TYPE: https://docs.google.com/document/d/1F1mYgUNic6AwXcwpgI0fdfotTjP8R_v4ogXTNTK6u7U/edit?usp=drivesdk\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (150, 520)\nstep 2: CLICK: (909, 533)\nstep 3: CLICK: (297, 383)\nstep 4: PRESS_HOME\nstep 5: CLICK: (386, 272)\nstep 6: CLICK: (482, 158)\nstep 7: CLICK: (422, 948)\nI want to Locate the working file on your phone using app Google Drive, and then share it with kiudehu19294094 via X. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate the working file on your phone using app Google Drive, and then share it with kiudehu19294094 via X.\nThe historical actions are: step 1: CLICK: (150, 520)\nstep 2: CLICK: (909, 533)\nstep 3: CLICK: (297, 383)\nstep 4: PRESS_HOME\nstep 5: CLICK: (386, 272)\nstep 6: CLICK: (482, 158)\nstep 7: CLICK: (422, 948)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: o\nB: CLICK: (795, 676)\nC: PRESS_HOME\nD: TYPE: https://docs.google.com/document/d/1F1mYgUNic6AwXcwpgI0fdfotTjP8R_v4ogXTNTK6u7U/edit?usp=drivesdk\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_61_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_61_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_61_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_61_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_61_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_61_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_61_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_61_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", 
"visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (291, 331)\nB: PRESS_HOME\nC: TYPE: Healthy lunch plan\nD: CLICK: (643, 639)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (802, 598)\nstep 2: CLICK: (913, 492)\nstep 3: CLICK: (288, 781)\nstep 4: PRESS_HOME\nstep 5: CLICK: (622, 459)\nstep 6: CLICK: (897, 892)\nI want to Watch a YouTube lecture on 3D Printing and then record the course's name in Simplenote. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Watch a YouTube lecture on 3D Printing and then record the course's name in Simplenote.\nThe historical actions are: step 1: CLICK: (802, 598)\nstep 2: CLICK: (913, 492)\nstep 3: CLICK: (288, 781)\nstep 4: PRESS_HOME\nstep 5: CLICK: (622, 459)\nstep 6: CLICK: (897, 892)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (291, 331)\nB: PRESS_HOME\nC: TYPE: Healthy lunch plan\nD: CLICK: (643, 639)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_62_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_62_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_62_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_62_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_62_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_62_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_62_6.png"], "output": "A", "qwen3-vl": "image none"}, {"task": 
"gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: CLICK: (83, 78)\nC: CLICK: (763, 917)\nD: CLICK: (744, 789)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (889, 692)\nstep 2: CLICK: (488, 921)\nstep 3: CLICK: (325, 305)\nstep 4: CLICK: (370, 530)\nstep 5: CLICK: (367, 730)\nstep 6: CLICK: (489, 914)\nstep 7: CLICK: (513, 239)\nstep 8: TYPE: watermelon\nstep 9: CLICK: (740, 78)\nstep 10: CLICK: (626, 895)\nstep 11: CLICK: (319, 914)\nstep 12: CLICK: (610, 722)\nI want to Use the Remix:AI Image Creator app to design an image featuring a watermelon and then share it on Tumblr with katsunaksu. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use the Remix:AI Image Creator app to design an image featuring a watermelon and then share it on Tumblr with katsunaksu.\nThe historical actions are: step 1: CLICK: (889, 692)\nstep 2: CLICK: (488, 921)\nstep 3: CLICK: (325, 305)\nstep 4: CLICK: (370, 530)\nstep 5: CLICK: (367, 730)\nstep 6: CLICK: (489, 914)\nstep 7: CLICK: (513, 239)\nstep 8: TYPE: watermelon\nstep 9: CLICK: (740, 78)\nstep 10: CLICK: (626, 895)\nstep 11: CLICK: (319, 914)\nstep 12: CLICK: (610, 722)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (83, 78)\nC: CLICK: (763, 917)\nD: CLICK: (744, 789)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_63_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_63_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_63_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_63_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_63_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_63_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_63_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_63_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_63_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_63_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_63_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_63_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_63_12.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (151, 655)\nB: CLICK: 
(111, 572)\nC: CLICK: (428, 487)\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (386, 650)\nstep 2: CLICK: (870, 78)\nstep 3: CLICK: (477, 353)\nstep 4: CLICK: (374, 135)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (457, 695)\nstep 11: CLICK: (177, 274)\nstep 12: CLICK: (858, 537)\nstep 13: PRESS_HOME\nI want to Remove the Pandora app using the Google Play Store and then navigate to Settings to confirm that the app is no longer present in the app resources. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Remove the Pandora app using the Google Play Store and then navigate to Settings to confirm that the app is no longer present in the app resources.\nThe historical actions are: step 1: CLICK: (386, 650)\nstep 2: CLICK: (870, 78)\nstep 3: CLICK: (477, 353)\nstep 4: CLICK: (374, 135)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (457, 695)\nstep 11: CLICK: (177, 274)\nstep 12: CLICK: (858, 537)\nstep 13: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (151, 655)\nB: CLICK: (111, 572)\nC: CLICK: (428, 487)\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_64_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_64_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_64_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_64_3.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_64_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_64_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_64_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_64_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_64_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_64_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_64_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_64_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_64_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_64_13.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (886, 547)\nB: CLICK: (865, 412)\nC: CLICK: (897, 885)\nD: TYPE: Petco\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (567, 328)\nstep 2: CLICK: (192, 73)\nstep 3: TYPE: pet store\nstep 4: CLICK: (245, 173)\nstep 5: PRESS_HOME\nstep 6: CLICK: (660, 138)\nstep 7: CLICK: (438, 233)\nI want to Utilize Google Map to locate a nearby pet store and then use Uber to arrange a ride to the found location. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize Google Map to locate a nearby pet store and then use Uber to arrange a ride to the found location.\nThe historical actions are: step 1: CLICK: (567, 328)\nstep 2: CLICK: (192, 73)\nstep 3: TYPE: pet store\nstep 4: CLICK: (245, 173)\nstep 5: PRESS_HOME\nstep 6: CLICK: (660, 138)\nstep 7: CLICK: (438, 233)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (886, 547)\nB: CLICK: (865, 412)\nC: CLICK: (897, 885)\nD: TYPE: Petco\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_65_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_65_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_65_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_65_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_65_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_65_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_65_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_65_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: PRESS_HOME\nC: CLICK: (409, 247)\nD: CLICK: (595, 353)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (123, 397)\nstep 2: CLICK: (855, 944)\nstep 3: CLICK: (281, 478)\nstep 4: TYPE: do yoga in the morning\nstep 5: CLICK: (556, 384)\nstep 6: PRESS_HOME\nstep 7: CLICK: (828, 654)\nI want to Find a beginner Yoga workout video on YouTube and schedule a reminder in Things to do it tomorrow morning. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Find a beginner Yoga workout video on YouTube and schedule a reminder in Things to do it tomorrow morning.\nThe historical actions are: step 1: CLICK: (123, 397)\nstep 2: CLICK: (855, 944)\nstep 3: CLICK: (281, 478)\nstep 4: TYPE: do yoga in the morning\nstep 5: CLICK: (556, 384)\nstep 6: PRESS_HOME\nstep 7: CLICK: (828, 654)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: PRESS_HOME\nC: CLICK: (409, 247)\nD: CLICK: (595, 353)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_66_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_66_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_66_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_66_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_66_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_66_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_66_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_66_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (384, 529)\nB: CLICK: (730, 690)\nC: CLICK: (963, 904)\nD: CLICK: (491, 231)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (138, 228)\nstep 3: CLICK: (352, 78)\nstep 4: TYPE: 2022 nobel-prize winners in physics\nstep 5: CLICK: (913, 871)\nI want to Utilize DuckDuckgo to search for the 2022 Nobel-Prize winners in physics, and then document the gathered information in Google Docs. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize DuckDuckgo to search for the 2022 Nobel-Prize winners in physics, and then document the gathered information in Google Docs.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (138, 228)\nstep 3: CLICK: (352, 78)\nstep 4: TYPE: 2022 nobel-prize winners in physics\nstep 5: CLICK: (913, 871)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (384, 529)\nB: CLICK: (730, 690)\nC: CLICK: (963, 904)\nD: CLICK: (491, 231)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_67_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_67_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_67_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_67_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_67_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_67_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: DOWN\nB: CLICK: (177, 274)\nC: CLICK: (584, 780)\nD: CLICK: (559, 271)\n", "question": "The last image represents the current screenshot and the preceding images are 
historical screenshots. The historical actions are: step 1: CLICK: (386, 650)\nstep 2: CLICK: (870, 78)\nstep 3: CLICK: (477, 353)\nstep 4: CLICK: (374, 135)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (457, 695)\nI want to Remove the Pandora app using the Google Play Store and then navigate to Settings to confirm that the app is no longer present in the app resources. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Remove the Pandora app using the Google Play Store and then navigate to Settings to confirm that the app is no longer present in the app resources.\nThe historical actions are: step 1: CLICK: (386, 650)\nstep 2: CLICK: (870, 78)\nstep 3: CLICK: (477, 353)\nstep 4: CLICK: (374, 135)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: CLICK: (457, 695)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: DOWN\nB: CLICK: (177, 274)\nC: CLICK: (584, 780)\nD: CLICK: (559, 271)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_68_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_68_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_68_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_68_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_68_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_68_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_68_6.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_68_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_68_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_68_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_68_10.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: LEFT\nB: TYPE: do yoga in the morning\nC: PRESS_HOME\nD: CLICK: (496, 680)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (592, 913)\nstep 2: CLICK: (16, 58)\nstep 3: CLICK: (830, 53)\nstep 4: TYPE: virtual reality\nstep 5: CLICK: (132, 127)\nstep 6: CLICK: (443, 518)\nstep 7: CLICK: (709, 236)\nstep 8: CLICK: (98, 261)\nI want to Watch a lecture on Virtual Reality on YouTube and then record the name of the course in Microsoft Word. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Watch a lecture on Virtual Reality on YouTube and then record the name of the course in Microsoft Word.\nThe historical actions are: step 1: CLICK: (592, 913)\nstep 2: CLICK: (16, 58)\nstep 3: CLICK: (830, 53)\nstep 4: TYPE: virtual reality\nstep 5: CLICK: (132, 127)\nstep 6: CLICK: (443, 518)\nstep 7: CLICK: (709, 236)\nstep 8: CLICK: (98, 261)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: LEFT\nB: TYPE: do yoga in the morning\nC: PRESS_HOME\nD: CLICK: (496, 680)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_69_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_69_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_69_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_69_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_69_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_69_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_69_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_69_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_69_8.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (123, 349)\nB: TYPE: paintbrush\nC: CLICK: (353, 155)\nD: CLICK: (863, 613)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (141, 718)\nstep 2: CLICK: (410, 57)\nstep 3: CLICK: (985, 63)\nstep 4: TYPE: Pop\nstep 5: CLICK: (892, 674)\nstep 6: CLICK: (485, 141)\nstep 7: CLICK: (416, 327)\nI want to Start by listening to a Pop album on Spotify, then share the name of the album on Instagram. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Start by listening to a Pop album on Spotify, then share the name of the album on Instagram.\nThe historical actions are: step 1: CLICK: (141, 718)\nstep 2: CLICK: (410, 57)\nstep 3: CLICK: (985, 63)\nstep 4: TYPE: Pop\nstep 5: CLICK: (892, 674)\nstep 6: CLICK: (485, 141)\nstep 7: CLICK: (416, 327)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (123, 349)\nB: TYPE: paintbrush\nC: CLICK: (353, 155)\nD: CLICK: (863, 613)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_70_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_70_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_70_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_70_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_70_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_70_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_70_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_70_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: COMPLETE\nB: CLICK: (54, 81)\nC: 
CLICK: (518, 375)\nD: CLICK: (308, 71)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (165, 380)\nstep 2: CLICK: (571, 942)\nstep 3: CLICK: (793, 943)\nstep 4: CLICK: (401, 96)\nstep 5: TYPE: hiking trail\nstep 6: CLICK: (537, 242)\nstep 7: PRESS_HOME\nstep 8: CLICK: (338, 501)\nstep 9: CLICK: (929, 76)\nstep 10: CLICK: (479, 153)\nstep 11: TYPE: Los Angeles\nstep 12: CLICK: (457, 247)\nstep 13: SCROLL: UP\nstep 14: SCROLL: UP\nI want to Use Mapillary to find a new hiking trail, then check the weekend weather forecast using Weather & Radar. Finally, invite Victor James to join the hike through Messenger. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Mapillary to find a new hiking trail, then check the weekend weather forecast using Weather & Radar. 
Finally, invite Victor James to join the hike through Messenger.\nThe historical actions are: step 1: CLICK: (165, 380)\nstep 2: CLICK: (571, 942)\nstep 3: CLICK: (793, 943)\nstep 4: CLICK: (401, 96)\nstep 5: TYPE: hiking trail\nstep 6: CLICK: (537, 242)\nstep 7: PRESS_HOME\nstep 8: CLICK: (338, 501)\nstep 9: CLICK: (929, 76)\nstep 10: CLICK: (479, 153)\nstep 11: TYPE: Los Angeles\nstep 12: CLICK: (457, 247)\nstep 13: SCROLL: UP\nstep 14: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (54, 81)\nC: CLICK: (518, 375)\nD: CLICK: (308, 71)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_71_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_71_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_71_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_71_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_71_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_71_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_71_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_71_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_71_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_71_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_71_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_71_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_71_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_71_13.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_71_14.png"], "output": "C", "qwen3-vl": "image none"}, {"task": 
"gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (311, 467)\nB: SCROLL: UP\nC: COMPLETE\nD: CLICK: (297, 896)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (573, 704)\nstep 2: CLICK: (170, 67)\nstep 3: TYPE: Japan\nstep 4: CLICK: (903, 693)\nstep 5: CLICK: (128, 454)\nstep 6: CLICK: (983, 68)\nstep 7: CLICK: (855, 193)\nstep 8: CLICK: (706, 599)\nstep 9: PRESS_HOME\nstep 10: CLICK: (148, 254)\nstep 11: CLICK: (495, 921)\nstep 12: CLICK: (474, 492)\nstep 13: CLICK: (687, 424)\nI want to Utilize DuckDuckGo to search for an introduction about Japan and then use Threads to share the link to the webpage you find. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize DuckDuckGo to search for an introduction about Japan and then use Threads to share the link to the webpage you find.\nThe historical actions are: step 1: CLICK: (573, 704)\nstep 2: CLICK: (170, 67)\nstep 3: TYPE: Japan\nstep 4: CLICK: (903, 693)\nstep 5: CLICK: (128, 454)\nstep 6: CLICK: (983, 68)\nstep 7: CLICK: (855, 193)\nstep 8: CLICK: (706, 599)\nstep 9: PRESS_HOME\nstep 10: CLICK: (148, 254)\nstep 11: CLICK: (495, 921)\nstep 12: CLICK: (474, 492)\nstep 13: CLICK: (687, 424)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (311, 467)\nB: SCROLL: UP\nC: COMPLETE\nD: CLICK: (297, 896)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_72_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_72_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_72_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_72_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_72_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_72_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_72_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_72_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_72_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_72_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_72_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_72_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_72_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_72_13.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", 
"visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (415, 114)\nB: TYPE: do strength training in the morning\nC: CLICK: (428, 226)\nD: SCROLL: RIGHT\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (138, 253)\nstep 2: CLICK: (392, 924)\nstep 3: CLICK: (369, 81)\nstep 4: TYPE: ASMR\nstep 5: CLICK: (351, 121)\nstep 6: CLICK: (583, 212)\nstep 7: CLICK: (353, 327)\nstep 8: PRESS_HOME\nI want to Launch the Triller app to play a soothing soundscape video, then use the Clock app to set a wake-up timer. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Launch the Triller app to play a soothing soundscape video, then use the Clock app to set a wake-up timer.\nThe historical actions are: step 1: CLICK: (138, 253)\nstep 2: CLICK: (392, 924)\nstep 3: CLICK: (369, 81)\nstep 4: TYPE: ASMR\nstep 5: CLICK: (351, 121)\nstep 6: CLICK: (583, 212)\nstep 7: CLICK: (353, 327)\nstep 8: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (415, 114)\nB: TYPE: do strength training in the morning\nC: CLICK: (428, 226)\nD: SCROLL: RIGHT\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_73_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_73_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_73_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_73_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_73_4.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_73_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_73_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_73_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_73_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_RECENT\nB: CLICK: (581, 865)\nC: SCROLL: UP\nD: CLICK: (876, 341)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (695, 656)\nstep 3: CLICK: (576, 763)\nstep 4: CLICK: (566, 652)\nstep 5: CLICK: (861, 746)\nstep 6: CLICK: (635, 760)\nstep 7: CLICK: (581, 865)\nI want to Utilize the 'Scientific calculator plus 991' to compute the sum of today's expenses, which are '14+200+8', and then document the total cost in the 'Monefy' app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize the 'Scientific calculator plus 991' to compute the sum of today's expenses, which are '14+200+8', and then document the total cost in the 'Monefy' app.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (695, 656)\nstep 3: CLICK: (576, 763)\nstep 4: CLICK: (566, 652)\nstep 5: CLICK: (861, 746)\nstep 6: CLICK: (635, 760)\nstep 7: CLICK: (581, 865)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_RECENT\nB: CLICK: (581, 865)\nC: SCROLL: UP\nD: CLICK: (876, 341)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_74_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_74_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_74_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_74_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_74_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_74_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_74_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_74_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: CLICK: (367, 345)\nC: TYPE: Same Old Blues\nD: CLICK: (868, 889)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (865, 397)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (290, 801)\nstep 5: CLICK: (413, 294)\nI want to Switch your phone's language to German and then verify the change by opening either the Clock or Setting app. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Switch your phone's language to German and then verify the change by opening either the Clock or Setting app.\nThe historical actions are: step 1: CLICK: (865, 397)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (290, 801)\nstep 5: CLICK: (413, 294)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (367, 345)\nC: TYPE: Same Old Blues\nD: CLICK: (868, 889)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_75_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_75_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_75_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_75_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_75_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_75_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: COMPLETE\nB: CLICK: (420, 574)\nC: CLICK: (860, 444)\nD: TYPE: Do you want to go to Los Angeles with me for a hiking trail next Sunday?\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (160, 145)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\nI want to Open the AP News app and read an English news article. Use DeepL translate to translate the title of the article into French. Finally, record the translated title in Google Docs. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open the AP News app and read an English news article. Use DeepL translate to translate the title of the article into French. Finally, record the translated title in Google Docs.\nThe historical actions are: step 1: CLICK: (160, 145)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (420, 574)\nC: CLICK: (860, 444)\nD: TYPE: Do you want to go to Los Angeles with me for a hiking trail next Sunday?\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_76_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_76_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_76_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_76_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_76_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (785, 315)\nB: CLICK: (715, 571)\nC: PRESS_HOME\nD: COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (424, 710)\nstep 2: CLICK: (155, 934)\nstep 3: CLICK: (966, 781)\nstep 4: PRESS_HOME\nI want to Switch your device to dark mode using the Settings app and then launch the Libby, by OverDrive reading app. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Switch your device to dark mode using the Settings app and then launch the Libby, by OverDrive reading app.\nThe historical actions are: step 1: CLICK: (424, 710)\nstep 2: CLICK: (155, 934)\nstep 3: CLICK: (966, 781)\nstep 4: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (785, 315)\nB: CLICK: (715, 571)\nC: PRESS_HOME\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_77_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_77_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_77_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_77_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_77_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (401, 907)\nB: CLICK: (461, 458)\nC: CLICK: (585, 566)\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (319, 516)\nstep 2: CLICK: (672, 110)\nstep 3: TYPE: how to learn photography\nstep 4: CLICK: (463, 177)\nstep 5: CLICK: (447, 227)\nstep 6: CLICK: (697, 224)\nstep 7: CLICK: (529, 536)\nstep 8: CLICK: (467, 636)\nstep 9: PRESS_HOME\nstep 10: CLICK: (88, 498)\nstep 11: CLICK: (302, 888)\nstep 12: TYPE: learn photography in \nI want to Investigate ways to learn photography using Quora, and then use Any.do to create a reminder to start the tutorial on the website. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Investigate ways to learn photography using Quora, and then use Any.do to create a reminder to start the tutorial on the website.\nThe historical actions are: step 1: CLICK: (319, 516)\nstep 2: CLICK: (672, 110)\nstep 3: TYPE: how to learn photography\nstep 4: CLICK: (463, 177)\nstep 5: CLICK: (447, 227)\nstep 6: CLICK: (697, 224)\nstep 7: CLICK: (529, 536)\nstep 8: CLICK: (467, 636)\nstep 9: PRESS_HOME\nstep 10: CLICK: (88, 498)\nstep 11: CLICK: (302, 888)\nstep 12: TYPE: learn photography in \n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (401, 907)\nB: CLICK: (461, 458)\nC: CLICK: (585, 566)\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_78_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_78_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_78_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_78_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_78_4.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_78_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_78_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_78_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_78_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_78_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_78_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_78_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_78_12.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: Bayes' theorem\nB: CLICK: (464, 172)\nC: TYPE: Paris, tomorrow: intervals of clouds and sunshine todolist: finish th homework and wash the dirty clothes\nD: CLICK: (359, 745)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (441, 914)\nstep 2: CLICK: (269, 419)\nstep 3: TYPE: book about romance\nstep 4: CLICK: (884, 886)\nstep 5: PRESS_HOME\nstep 6: CLICK: (903, 309)\nstep 7: CLICK: (713, 67)\nstep 8: TYPE: outlander book\nstep 9: CLICK: (895, 880)\nstep 10: SCROLL: UP\nstep 11: SCROLL: UP\nstep 12: SCROLL: UP\nstep 13: SCROLL: UP\nstep 14: PRESS_HOME\nstep 15: CLICK: (918, 132)\nI want to Utilize Chrome to search for a renowned romance novel, then browse Instagram for reviews and feedback on that book, and finally, purchase the book via AliExpress. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Chrome to search for a renowned romance novel, then browse Instagram for reviews and feedback on that book, and finally, purchase the book via AliExpress.\nThe historical actions are: step 1: CLICK: (441, 914)\nstep 2: CLICK: (269, 419)\nstep 3: TYPE: book about romance\nstep 4: CLICK: (884, 886)\nstep 5: PRESS_HOME\nstep 6: CLICK: (903, 309)\nstep 7: CLICK: (713, 67)\nstep 8: TYPE: outlander book\nstep 9: CLICK: (895, 880)\nstep 10: SCROLL: UP\nstep 11: SCROLL: UP\nstep 12: SCROLL: UP\nstep 13: SCROLL: UP\nstep 14: PRESS_HOME\nstep 15: CLICK: (918, 132)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Bayes' theorem\nB: CLICK: (464, 172)\nC: TYPE: Paris, tomorrow: intervals of clouds and sunshine todolist: finish th homework and wash the dirty clothes\nD: CLICK: (359, 745)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_8.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_13.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_14.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_79_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: TYPE: sewing class\nC: CLICK: (138, 239)\nD: CLICK: (925, 920)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (599, 390)\nstep 2: CLICK: (164, 146)\nstep 3: TYPE: account manager\nI want to Using Indeed Job Search, find an account manager job and then record the company name in Google Keep. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using Indeed Job Search, find an account manager job and then record the company name in Google Keep.\nThe historical actions are: step 1: CLICK: (599, 390)\nstep 2: CLICK: (164, 146)\nstep 3: TYPE: account manager\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: TYPE: sewing class\nC: CLICK: (138, 239)\nD: CLICK: (925, 920)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_80_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_80_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_80_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_80_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: COMPLETE\nB: CLICK: (203, 290)\nC: CLICK: (498, 640)\nD: CLICK: (906, 593)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (369, 145)\nstep 2: CLICK: (420, 719)\nstep 3: PRESS_HOME\nstep 4: CLICK: (614, 265)\nstep 5: CLICK: (313, 329)\nstep 6: TYPE: Tractor-trailer dirver 'executed' in road rage shooting, fooicials say\nstep 7: CLICK: (929, 641)\nstep 8: PRESS_HOME\nstep 9: CLICK: (855, 407)\nstep 10: CLICK: (797, 824)\nstep 11: CLICK: (145, 652)\nI want to Using Opera News, read an English news article. Translate its title into Korean with DeepL translate, and then document the translated title in WPS office. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using Opera News, read an English news article. Translate its title into Korean with DeepL translate, and then document the translated title in WPS office.\nThe historical actions are: step 1: CLICK: (369, 145)\nstep 2: CLICK: (420, 719)\nstep 3: PRESS_HOME\nstep 4: CLICK: (614, 265)\nstep 5: CLICK: (313, 329)\nstep 6: TYPE: Tractor-trailer dirver 'executed' in road rage shooting, fooicials say\nstep 7: CLICK: (929, 641)\nstep 8: PRESS_HOME\nstep 9: CLICK: (855, 407)\nstep 10: CLICK: (797, 824)\nstep 11: CLICK: (145, 652)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (203, 290)\nC: CLICK: (498, 640)\nD: CLICK: (906, 593)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_81_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_81_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_81_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_81_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_81_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_81_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_81_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_81_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_81_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_81_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_81_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_81_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: CLICK: (385, 78)\nC: CLICK: (499, 460)\nD: SCROLL: 
RIGHT\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (336, 470)\nstep 3: CLICK: (247, 745)\nstep 4: CLICK: (252, 64)\nstep 5: TYPE: tennis court\nstep 6: CLICK: (922, 873)\nstep 7: CLICK: (394, 165)\nstep 8: PRESS_RECENT\nstep 9: CLICK: (66, 526)\nstep 10: CLICK: (327, 89)\nstep 11: TYPE: fitness tracking apps\nstep 12: CLICK: (511, 165)\nstep 13: CLICK: (452, 787)\nI want to First, locate a nearby tennis court using the 'GPS, Maps, Voice Navigation' app. Once you've found a suitable location, head to the 'Google Play Store' to download a fitness tracking app to set your fitness goals. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, locate a nearby tennis court using the 'GPS, Maps, Voice Navigation' app. 
Once you've found a suitable location, head to the 'Google Play Store' to download a fitness tracking app to set your fitness goals.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (336, 470)\nstep 3: CLICK: (247, 745)\nstep 4: CLICK: (252, 64)\nstep 5: TYPE: tennis court\nstep 6: CLICK: (922, 873)\nstep 7: CLICK: (394, 165)\nstep 8: PRESS_RECENT\nstep 9: CLICK: (66, 526)\nstep 10: CLICK: (327, 89)\nstep 11: TYPE: fitness tracking apps\nstep 12: CLICK: (511, 165)\nstep 13: CLICK: (452, 787)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (385, 78)\nC: CLICK: (499, 460)\nD: SCROLL: RIGHT\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_82_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_82_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_82_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_82_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_82_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_82_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_82_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_82_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_82_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_82_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_82_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_82_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_82_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_82_13.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", 
"visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: COMPLETE\nB: CLICK: (361, 243)\nC: CLICK: (824, 420)\nD: TYPE: Recipe Organizer Apps\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (474, 220)\nstep 2: CLICK: (949, 72)\nstep 3: TYPE: The Fall of the Berlin Wall\nstep 4: CLICK: (347, 153)\nstep 5: CLICK: (397, 338)\nstep 6: PRESS_HOME\nstep 7: CLICK: (869, 106)\nstep 8: CLICK: (227, 79)\nstep 9: TYPE: The Fall of the Berlin Wall\nstep 10: CLICK: (929, 879)\nstep 11: CLICK: (605, 416)\nstep 12: CLICK: (522, 633)\nstep 13: CLICK: (301, 180)\nstep 14: TYPE: The Fall of the Berlin Wall\nI want to Delve into the history of the Fall of the Berlin Wall by exploring relevant videos on YouTube, and enhance your understanding by reading or listening to a related book on Amazon Kindle. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Delve into the history of the Fall of the Berlin Wall by exploring relevant videos on YouTube, and enhance your understanding by reading or listening to a related book on Amazon Kindle.\nThe historical actions are: step 1: CLICK: (474, 220)\nstep 2: CLICK: (949, 72)\nstep 3: TYPE: The Fall of the Berlin Wall\nstep 4: CLICK: (347, 153)\nstep 5: CLICK: (397, 338)\nstep 6: PRESS_HOME\nstep 7: CLICK: (869, 106)\nstep 8: CLICK: (227, 79)\nstep 9: TYPE: The Fall of the Berlin Wall\nstep 10: CLICK: (929, 879)\nstep 11: CLICK: (605, 416)\nstep 12: CLICK: (522, 633)\nstep 13: CLICK: (301, 180)\nstep 14: TYPE: The Fall of the Berlin Wall\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (361, 243)\nC: CLICK: (824, 420)\nD: TYPE: Recipe Organizer Apps\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_83_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_83_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_83_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_83_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_83_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_83_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_83_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_83_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_83_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_83_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_83_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_83_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_83_12.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_83_13.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_83_14.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: TYPE: 3D Printer Course for Beginners\nC: CLICK: (307, 490)\nD: CLICK: (159, 646)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (159, 349)\nstep 3: CLICK: (287, 922)\nstep 4: CLICK: (191, 156)\nstep 5: TYPE: Tokyo, Japan itinerary\nstep 6: CLICK: (890, 864)\nstep 7: CLICK: (537, 614)\nI want to Using Threads for research, find an itinerary for visiting Tokyo, Japan, and then proceed to book accommodations through Airbnb. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using Threads for research, find an itinerary for visiting Tokyo, Japan, and then proceed to book accommodations through Airbnb.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (159, 349)\nstep 3: CLICK: (287, 922)\nstep 4: CLICK: (191, 156)\nstep 5: TYPE: Tokyo, Japan itinerary\nstep 6: CLICK: (890, 864)\nstep 7: CLICK: (537, 614)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: TYPE: 3D Printer Course for Beginners\nC: CLICK: (307, 490)\nD: CLICK: (159, 646)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_84_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_84_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_84_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_84_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_84_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_84_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_84_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_84_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: how to make vegetable stir fry\nB: CLICK: (409, 649)\nC: CLICK: (387, 66)\nD: COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (194, 478)\nstep 2: CLICK: (281, 439)\nstep 3: CLICK: (161, 286)\nstep 4: CLICK: (333, 908)\nstep 5: CLICK: (308, 664)\nstep 6: CLICK: (859, 917)\nI want to Open 'Gallery-photo gallery, album', select a photo, and share it on the social app 'X' with liudehu19294094. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open 'Gallery-photo gallery, album', select a photo, and share it on the social app 'X' with liudehu19294094.\nThe historical actions are: step 1: CLICK: (194, 478)\nstep 2: CLICK: (281, 439)\nstep 3: CLICK: (161, 286)\nstep 4: CLICK: (333, 908)\nstep 5: CLICK: (308, 664)\nstep 6: CLICK: (859, 917)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: how to make vegetable stir fry\nB: CLICK: (409, 649)\nC: CLICK: (387, 66)\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_85_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_85_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_85_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_85_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_85_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_85_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_85_6.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (631, 285)\nB: CLICK: (856, 710)\nC: SCROLL: UP\nD: TYPE: caba62244@gmail.com\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (712, 544)\nstep 2: CLICK: (583, 577)\nstep 3: CLICK: (480, 560)\nstep 4: CLICK: (424, 555)\nstep 5: CLICK: (596, 659)\nstep 6: CLICK: (497, 664)\nstep 7: CLICK: (410, 666)\nstep 8: SCROLL: UP\nstep 9: CLICK: (675, 921)\nstep 10: PRESS_HOME\nI want to Use Applock Pro - APP Lock & Guard to lock the Photos app, then open Photos to verify the lock with PIN 321654. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Applock Pro - APP Lock & Guard to lock the Photos app, then open Photos to verify the lock with PIN 321654.\nThe historical actions are: step 1: CLICK: (712, 544)\nstep 2: CLICK: (583, 577)\nstep 3: CLICK: (480, 560)\nstep 4: CLICK: (424, 555)\nstep 5: CLICK: (596, 659)\nstep 6: CLICK: (497, 664)\nstep 7: CLICK: (410, 666)\nstep 8: SCROLL: UP\nstep 9: CLICK: (675, 921)\nstep 10: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (631, 285)\nB: CLICK: (856, 710)\nC: SCROLL: UP\nD: TYPE: caba62244@gmail.com\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_86_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_86_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_86_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_86_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_86_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_86_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_86_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_86_7.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_86_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_86_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_86_10.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: SCROLL: RIGHT\nC: PRESS_HOME\nD: TYPE: NBA game\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: LEFT\nstep 3: CLICK: (415, 127)\nstep 4: CLICK: (947, 76)\nstep 5: TYPE: NBA\nstep 6: SCROLL: LEFT\nstep 7: CLICK: (697, 135)\nstep 8: PRESS_HOME\nstep 9: CLICK: (192, 147)\nstep 10: CLICK: (625, 576)\nstep 11: CLICK: (933, 917)\nI want to Using ESPN, find the details for the next NBA game and then set a reminder for it in Microsoft To Do. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using ESPN, find the details for the next NBA game and then set a reminder for it in Microsoft To Do.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: LEFT\nstep 3: CLICK: (415, 127)\nstep 4: CLICK: (947, 76)\nstep 5: TYPE: NBA\nstep 6: SCROLL: LEFT\nstep 7: CLICK: (697, 135)\nstep 8: PRESS_HOME\nstep 9: CLICK: (192, 147)\nstep 10: CLICK: (625, 576)\nstep 11: CLICK: (933, 917)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: SCROLL: RIGHT\nC: PRESS_HOME\nD: TYPE: NBA game\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_87_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_87_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_87_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_87_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_87_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_87_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_87_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_87_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_87_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_87_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_87_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_87_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (859, 584)\nB: COMPLETE\nC: TYPE: Beautifuiil realised and quietil beguiling,a vivid evocation of nature.\nD: TYPE: hiking trail\n", "question": "The last image represents the current screenshot and the 
preceding images are historical screenshots. The historical actions are: step 1: CLICK: (140, 269)\nstep 2: CLICK: (485, 88)\nstep 3: TYPE: Shopify's stock market news\nstep 4: CLICK: (490, 181)\nstep 5: CLICK: (369, 338)\nstep 6: PRESS_HOME\nstep 7: CLICK: (607, 273)\nstep 8: CLICK: (944, 78)\nstep 9: TYPE: Shopify\nstep 10: CLICK: (218, 220)\nI want to Using Firefox, search for today's stock market news related to Shopify. After gathering the news, open Investing.com to check the current stock price trends of Shopify. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Firefox, search for today's stock market news related to Shopify. After gathering the news, open Investing.com to check the current stock price trends of Shopify.\nThe historical actions are: step 1: CLICK: (140, 269)\nstep 2: CLICK: (485, 88)\nstep 3: TYPE: Shopify's stock market news\nstep 4: CLICK: (490, 181)\nstep 5: CLICK: (369, 338)\nstep 6: PRESS_HOME\nstep 7: CLICK: (607, 273)\nstep 8: CLICK: (944, 78)\nstep 9: TYPE: Shopify\nstep 10: CLICK: (218, 220)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (859, 584)\nB: COMPLETE\nC: TYPE: Beautifuiil realised and quietil beguiling,a vivid evocation of nature.\nD: TYPE: hiking trail\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_88_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_88_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_88_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_88_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_88_4.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_88_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_88_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_88_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_88_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_88_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_88_10.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (777, 842)\nB: CLICK: (920, 261)\nC: COMPLETE\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (791, 914)\nstep 2: PRESS_HOME\nstep 3: CLICK: (903, 136)\nI want to Using the Firefox Browser, search for a popular K-Pop music band, then listen to their latest album on Pandora. Finally, check on TickPick - Live Event Tickets to see if you can purchase a ticket for an upcoming concert. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using the Firefox Browser, search for a popular K-Pop music band, then listen to their latest album on Pandora. 
Finally, check on TickPick - Live Event Tickets to see if you can purchase a ticket for an upcoming concert.\nThe historical actions are: step 1: CLICK: (791, 914)\nstep 2: PRESS_HOME\nstep 3: CLICK: (903, 136)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (777, 842)\nB: CLICK: (920, 261)\nC: COMPLETE\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_89_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_89_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_89_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_89_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (191, 520)\nB: SCROLL: UP\nC: COMPLETE\nD: TYPE: technology conference events\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (785, 660)\nstep 2: CLICK: (241, 434)\nstep 3: CLICK: (415, 479)\nstep 4: CLICK: (514, 73)\nstep 5: PRESS_HOME\nstep 6: CLICK: (557, 313)\nstep 7: CLICK: (373, 174)\nstep 8: CLICK: (330, 911)\nstep 9: TYPE: 9298916954\nI want to Set up an online meeting using ZOOM Cloud Meetings and share the meeting link with liudehu19294094 via app X. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Set up an online meeting using ZOOM Cloud Meetings and share the meeting link with liudehu19294094 via app X.\nThe historical actions are: step 1: CLICK: (785, 660)\nstep 2: CLICK: (241, 434)\nstep 3: CLICK: (415, 479)\nstep 4: CLICK: (514, 73)\nstep 5: PRESS_HOME\nstep 6: CLICK: (557, 313)\nstep 7: CLICK: (373, 174)\nstep 8: CLICK: (330, 911)\nstep 9: TYPE: 9298916954\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (191, 520)\nB: SCROLL: UP\nC: COMPLETE\nD: TYPE: technology conference events\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_90_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_90_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_90_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_90_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_90_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_90_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_90_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_90_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_90_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_90_9.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: 2013 nobel prize winners in physics\nB: CLICK: (616, 133)\nC: TYPE: statue of liberty\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (370, 126)\nstep 3: CLICK: (352, 72)\nstep 4: TYPE: properties of circle\nstep 5: CLICK: (469, 145)\nstep 6: CLICK: (255, 276)\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nI want to Utilize DuckDuckgo to gather information on the properties of a Circle, and then compile your findings into a brief document using Microsoft Word. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize DuckDuckgo to gather information on the properties of a Circle, and then compile your findings into a brief document using Microsoft Word.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (370, 126)\nstep 3: CLICK: (352, 72)\nstep 4: TYPE: properties of circle\nstep 5: CLICK: (469, 145)\nstep 6: CLICK: (255, 276)\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: 2013 nobel prize winners in physics\nB: CLICK: (616, 133)\nC: TYPE: statue of liberty\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_91_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_91_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_91_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_91_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_91_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_91_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_91_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_91_7.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_91_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: Instagram\nB: TYPE: GoPro HERO10\nC: COMPLETE\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (595, 913)\nstep 2: CLICK: (717, 234)\nstep 3: CLICK: (95, 261)\nstep 4: PRESS_HOME\nstep 5: CLICK: (717, 412)\nstep 6: CLICK: (235, 487)\nstep 7: CLICK: (81, 398)\nstep 8: CLICK: (470, 498)\nstep 9: TYPE: VR Development Full Course: Oculus Quest\nI want to Watch a lecture on Virtual Reality on YouTube and then record the course name in the WPS Office notes app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Watch a lecture on Virtual Reality on YouTube and then record the course name in the WPS Office notes app.\nThe historical actions are: step 1: CLICK: (595, 913)\nstep 2: CLICK: (717, 234)\nstep 3: CLICK: (95, 261)\nstep 4: PRESS_HOME\nstep 5: CLICK: (717, 412)\nstep 6: CLICK: (235, 487)\nstep 7: CLICK: (81, 398)\nstep 8: CLICK: (470, 498)\nstep 9: TYPE: VR Development Full Course: Oculus Quest\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Instagram\nB: TYPE: GoPro HERO10\nC: COMPLETE\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_92_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_92_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_92_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_92_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_92_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_92_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_92_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_92_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_92_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_92_9.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (158, 742)\nB: CLICK: (907, 242)\nC: CLICK: (264, 272)\nD: CLICK: (471, 264)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (140, 641)\nstep 2: CLICK: (945, 70)\nstep 3: TYPE: World War I\nstep 4: CLICK: (937, 901)\nI want to Delve into a historical event from World War I and either read a related book on Amazon Kindle or listen to a related video on YouTube. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Delve into a historical event from World War I and either read a related book on Amazon Kindle or listen to a related video on YouTube.\nThe historical actions are: step 1: CLICK: (140, 641)\nstep 2: CLICK: (945, 70)\nstep 3: TYPE: World War I\nstep 4: CLICK: (937, 901)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (158, 742)\nB: CLICK: (907, 242)\nC: CLICK: (264, 272)\nD: CLICK: (471, 264)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_93_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_93_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_93_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_93_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_93_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: COMPLETE\nC: PRESS_HOME\nD: SCROLL: DOWN\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (791, 921)\nstep 2: TYPE: popular k-pop music band\nstep 3: CLICK: (863, 872)\nstep 4: PRESS_HOME\nstep 5: CLICK: (557, 128)\nstep 6: CLICK: (73, 169)\nstep 7: CLICK: (403, 215)\nstep 8: TYPE: BTS\nstep 9: CLICK: (882, 895)\nstep 10: CLICK: (352, 236)\nstep 11: CLICK: (949, 457)\nI want to Use Firefox Browser to find a well-known K-Pop band, listen to their latest album on Spotify, and see if you can purchase a concert ticket on StubHub. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Firefox Browser to find a well-known K-Pop band, listen to their latest album on Spotify, and see if you can purchase a concert ticket on StubHub.\nThe historical actions are: step 1: CLICK: (791, 921)\nstep 2: TYPE: popular k-pop music band\nstep 3: CLICK: (863, 872)\nstep 4: PRESS_HOME\nstep 5: CLICK: (557, 128)\nstep 6: CLICK: (73, 169)\nstep 7: CLICK: (403, 215)\nstep 8: TYPE: BTS\nstep 9: CLICK: (882, 895)\nstep 10: CLICK: (352, 236)\nstep 11: CLICK: (949, 457)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: COMPLETE\nC: PRESS_HOME\nD: SCROLL: DOWN\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_94_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_94_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_94_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_94_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_94_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_94_5.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_94_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_94_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_94_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_94_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_94_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_94_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (966, 604)\nB: CLICK: (97, 165)\nC: TYPE: Swimming Morning\nD: CLICK: (309, 608)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (722, 254)\nstep 2: CLICK: (523, 912)\nstep 3: CLICK: (680, 807)\nI want to Search for a beginner swimming workout video on Likee, and then use the To-Do List app to set a reminder to perform the workout tomorrow morning. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Search for a beginner swimming workout video on Likee, and then use the To-Do List app to set a reminder to perform the workout tomorrow morning.\nThe historical actions are: step 1: CLICK: (722, 254)\nstep 2: CLICK: (523, 912)\nstep 3: CLICK: (680, 807)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (966, 604)\nB: CLICK: (97, 165)\nC: TYPE: Swimming Morning\nD: CLICK: (309, 608)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_95_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_95_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_95_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_95_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (534, 608)\nB: CLICK: (494, 485)\nC: CLICK: (457, 569)\nD: COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (499, 346)\nstep 3: CLICK: (45, 71)\nstep 4: CLICK: (45, 71)\nstep 5: CLICK: (45, 71)\nstep 6: CLICK: (808, 71)\nstep 7: TYPE: 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid book review\nstep 8: CLICK: (916, 868)\nstep 9: SCROLL: UP\nstep 10: CLICK: (886, 507)\nstep 11: PRESS_HOME\nI want to Please read a book review online on Quora for the book 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid. Afterwards, purchase either the ebook or the physical copy using Google Play Books & Audiobooks. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Please read a book review online on Quora for the book 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid. Afterwards, purchase either the ebook or the physical copy using Google Play Books & Audiobooks.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (499, 346)\nstep 3: CLICK: (45, 71)\nstep 4: CLICK: (45, 71)\nstep 5: CLICK: (45, 71)\nstep 6: CLICK: (808, 71)\nstep 7: TYPE: 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid book review\nstep 8: CLICK: (916, 868)\nstep 9: SCROLL: UP\nstep 10: CLICK: (886, 507)\nstep 11: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (534, 608)\nB: CLICK: (494, 485)\nC: CLICK: (457, 569)\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_96_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_96_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_96_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_96_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_96_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_96_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_96_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_96_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_96_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_96_9.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_96_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_96_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid book review\nB: TYPE: 314 6th St Unit 608\nC: PRESS_HOME\nD: CLICK: (361, 735)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (499, 346)\nstep 3: CLICK: (45, 71)\nstep 4: CLICK: (45, 71)\nstep 5: CLICK: (45, 71)\nstep 6: CLICK: (808, 71)\nI want to Please read a book review online on Quora for the book 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid. Afterwards, purchase either the ebook or the physical copy using Google Play Books & Audiobooks. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Please read a book review online on Quora for the book 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid. 
Afterwards, purchase either the ebook or the physical copy using Google Play Books & Audiobooks.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (499, 346)\nstep 3: CLICK: (45, 71)\nstep 4: CLICK: (45, 71)\nstep 5: CLICK: (45, 71)\nstep 6: CLICK: (808, 71)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: 'The Seven Husbands of Evelyn Hugo' by Taylor Jenkins Reid book review\nB: TYPE: 314 6th St Unit 608\nC: PRESS_HOME\nD: CLICK: (361, 735)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_97_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_97_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_97_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_97_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_97_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_97_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_97_6.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: 9298916954\nB: PRESS_BACK\nC: COMPLETE\nD: PRESS_HOME\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (158, 103)\nstep 2: CLICK: (234, 830)\nstep 3: CLICK: (532, 205)\nstep 4: CLICK: (481, 758)\nstep 5: TYPE: shopping mall\nstep 6: CLICK: (895, 896)\nstep 7: CLICK: (527, 153)\nI want to Utilize the GPS app to locate a nearby shopping mall, and then use the Uber app to book a ride to that location. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize the GPS app to locate a nearby shopping mall, and then use the Uber app to book a ride to that location.\nThe historical actions are: step 1: CLICK: (158, 103)\nstep 2: CLICK: (234, 830)\nstep 3: CLICK: (532, 205)\nstep 4: CLICK: (481, 758)\nstep 5: TYPE: shopping mall\nstep 6: CLICK: (895, 896)\nstep 7: CLICK: (527, 153)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: 9298916954\nB: PRESS_BACK\nC: COMPLETE\nD: PRESS_HOME\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_98_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_98_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_98_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_98_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_98_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_98_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_98_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_98_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: TYPE: Coca-Cola's stock market news\nC: TYPE: the best Italian in San Jose\nD: TYPE: Tokopedia\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (561, 152)\nstep 2: CLICK: (341, 138)\nstep 3: TYPE: Healthy lunch plan\nstep 4: CLICK: (900, 85)\nI want to Choose a healthy lunch plan for the next day, document it, and watch a video on how to prepare one of the dishes. Use TikTok for the video, Opera browser with AI for research, and Google Docs for taking notes. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Choose a healthy lunch plan for the next day, document it, and watch a video on how to prepare one of the dishes. Use TikTok for the video, Opera browser with AI for research, and Google Docs for taking notes.\nThe historical actions are: step 1: CLICK: (561, 152)\nstep 2: CLICK: (341, 138)\nstep 3: TYPE: Healthy lunch plan\nstep 4: CLICK: (900, 85)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: TYPE: Coca-Cola's stock market news\nC: TYPE: the best Italian in San Jose\nD: TYPE: Tokopedia\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_99_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_99_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_99_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_99_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_99_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: RIGHT\nB: CLICK: (877, 862)\nC: COMPLETE\nD: CLICK: (824, 73)\n", "question": "The last image represents the current screenshot and the preceding images 
are historical screenshots. The historical actions are: step 1: CLICK: (846, 368)\nstep 2: CLICK: (837, 42)\nstep 3: TYPE: Task Manager Apps\nstep 4: CLICK: (906, 902)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (137, 439)\nstep 9: PRESS_HOME\nI want to Investigate Task Manager applications and choose one to download from the Google Play Store. Share your experience or findings on Facebook. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Investigate Task Manager applications and choose one to download from the Google Play Store. Share your experience or findings on Facebook.\nThe historical actions are: step 1: CLICK: (846, 368)\nstep 2: CLICK: (837, 42)\nstep 3: TYPE: Task Manager Apps\nstep 4: CLICK: (906, 902)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (137, 439)\nstep 9: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: RIGHT\nB: CLICK: (877, 862)\nC: COMPLETE\nD: CLICK: (824, 73)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_100_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_100_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_100_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_100_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_100_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_100_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_100_6.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_100_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_100_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_100_9.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (906, 915)\nB: SCROLL: UP\nC: CLICK: (327, 960)\nD: CLICK: (741, 172)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (851, 131)\nstep 3: CLICK: (460, 71)\nstep 4: TYPE: the best bookstore in San Jose\nstep 5: CLICK: (913, 909)\nstep 6: PRESS_HOME\nstep 7: CLICK: (168, 125)\nstep 8: CLICK: (567, 211)\nstep 9: CLICK: (297, 847)\nstep 10: TYPE: Recyle Bookstore\nstep 11: CLICK: (902, 924)\nstep 12: CLICK: (395, 158)\nI want to Utilize Firefox to search for the top-rated bookstore in your local city, and then use GPS to navigate to that location. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize Firefox to search for the top-rated bookstore in your local city, and then use GPS to navigate to that location.\nThe historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (851, 131)\nstep 3: CLICK: (460, 71)\nstep 4: TYPE: the best bookstore in San Jose\nstep 5: CLICK: (913, 909)\nstep 6: PRESS_HOME\nstep 7: CLICK: (168, 125)\nstep 8: CLICK: (567, 211)\nstep 9: CLICK: (297, 847)\nstep 10: TYPE: Recyle Bookstore\nstep 11: CLICK: (902, 924)\nstep 12: CLICK: (395, 158)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (906, 915)\nB: SCROLL: UP\nC: CLICK: (327, 960)\nD: CLICK: (741, 172)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_101_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_101_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_101_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_101_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_101_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_101_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_101_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_101_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_101_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_101_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_101_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_101_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_101_12.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: 
(318, 558)\nB: CLICK: (234, 868)\nC: CLICK: (139, 557)\nD: CLICK: (819, 942)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (509, 910)\nstep 3: CLICK: (978, 57)\nstep 4: CLICK: (295, 116)\nstep 5: TYPE: 'Where the Crawdads Sing' by Delia Owens book review\nstep 6: CLICK: (879, 693)\nstep 7: SCROLL: UP\nstep 8: CLICK: (335, 622)\nstep 9: CLICK: (672, 746)\nstep 10: PRESS_HOME\nI want to Open Chrome and search for a book review of 'Where the Crawdads Sing' by Delia Owens. After reading the review, use OfferUp: Buy. Sell. Letgo. to purchase either the ebook or a physical copy of the book. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open Chrome and search for a book review of 'Where the Crawdads Sing' by Delia Owens. After reading the review, use OfferUp: Buy. Sell. Letgo. 
to purchase either the ebook or a physical copy of the book.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (509, 910)\nstep 3: CLICK: (978, 57)\nstep 4: CLICK: (295, 116)\nstep 5: TYPE: 'Where the Crawdads Sing' by Delia Owens book review\nstep 6: CLICK: (879, 693)\nstep 7: SCROLL: UP\nstep 8: CLICK: (335, 622)\nstep 9: CLICK: (672, 746)\nstep 10: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (318, 558)\nB: CLICK: (234, 868)\nC: CLICK: (139, 557)\nD: CLICK: (819, 942)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_102_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_102_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_102_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_102_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_102_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_102_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_102_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_102_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_102_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_102_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_102_10.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (861, 391)\nB: CLICK: (559, 927)\nC: SCROLL: UP\nD: PRESS_HOME\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (802, 916)\nstep 2: CLICK: (920, 147)\nstep 3: CLICK: (692, 558)\nstep 4: PRESS_HOME\nI want to Look up the top video blogs on DIY crafts using Firefox Browser, adjust the phone's brightness through Settings, then open YouTube to watch and follow the content. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Look up the top video blogs on DIY crafts using Firefox Browser, adjust the phone's brightness through Settings, then open YouTube to watch and follow the content.\nThe historical actions are: step 1: CLICK: (802, 916)\nstep 2: CLICK: (920, 147)\nstep 3: CLICK: (692, 558)\nstep 4: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (861, 391)\nB: CLICK: (559, 927)\nC: SCROLL: UP\nD: PRESS_HOME\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_103_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_103_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_103_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_103_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_103_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (787, 511)\nB: CLICK: (352, 398)\nC: PRESS_HOME\nD: CLICK: (184, 234)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: LEFT\nstep 3: CLICK: (846, 123)\nstep 4: SCROLL: UP\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: DOWN\nstep 8: SCROLL: DOWN\nstep 9: PRESS_HOME\nstep 10: SCROLL: RIGHT\nI want to Utilize DuckDuckgo to search for the 2023 Nobel-Prize winners in physics, and then document the gathered information in Google Docs. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize DuckDuckgo to search for the 2023 Nobel-Prize winners in physics, and then document the gathered information in Google Docs.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: LEFT\nstep 3: CLICK: (846, 123)\nstep 4: SCROLL: UP\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: DOWN\nstep 8: SCROLL: DOWN\nstep 9: PRESS_HOME\nstep 10: SCROLL: RIGHT\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (787, 511)\nB: CLICK: (352, 398)\nC: PRESS_HOME\nD: CLICK: (184, 234)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_104_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_104_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_104_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_104_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_104_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_104_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_104_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_104_7.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_104_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_104_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_104_10.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (965, 62)\nB: CLICK: (432, 324)\nC: CLICK: (904, 88)\nD: CLICK: (491, 314)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (929, 488)\nstep 2: CLICK: (335, 347)\nstep 3: CLICK: (413, 328)\nstep 4: CLICK: (310, 901)\nstep 5: CLICK: (327, 680)\nstep 6: CLICK: (325, 281)\nstep 7: TYPE: caba62244@gmail.com\nstep 8: CLICK: (360, 364)\nI want to Find a photo in Gallery-photo gallery,album, and share it through Gmail with caba62244@gmail.com. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Find a photo in Gallery-photo gallery,album, and share it through Gmail with caba62244@gmail.com.\nThe historical actions are: step 1: CLICK: (929, 488)\nstep 2: CLICK: (335, 347)\nstep 3: CLICK: (413, 328)\nstep 4: CLICK: (310, 901)\nstep 5: CLICK: (327, 680)\nstep 6: CLICK: (325, 281)\nstep 7: TYPE: caba62244@gmail.com\nstep 8: CLICK: (360, 364)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (965, 62)\nB: CLICK: (432, 324)\nC: CLICK: (904, 88)\nD: CLICK: (491, 314)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_105_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_105_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_105_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_105_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_105_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_105_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_105_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_105_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_105_8.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: CLICK: (288, 890)\nC: LONG_PRESS: (146, 398)\nD: CLICK: (423, 927)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (638, 383)\nstep 2: TYPE: book about biography\nstep 3: CLICK: (909, 913)\nI want to Utilize DuckDuckGo to search for a renowned biography book, then use Instagram to read reviews about it, and finally, head over to Amazon to purchase the book. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize DuckDuckGo to search for a renowned biography book, then use Instagram to read reviews about it, and finally, head over to Amazon to purchase the book.\nThe historical actions are: step 1: CLICK: (638, 383)\nstep 2: TYPE: book about biography\nstep 3: CLICK: (909, 913)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (288, 890)\nC: LONG_PRESS: (146, 398)\nD: CLICK: (423, 927)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_106_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_106_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_106_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_106_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: COMPLETE\nB: CLICK: (114, 175)\nC: CLICK: (913, 878)\nD: CLICK: (250, 303)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (619, 836)\nstep 2: CLICK: (802, 190)\nstep 3: TYPE: Pad Thai ingredients\nstep 4: CLICK: (221, 183)\nstep 5: CLICK: (940, 582)\nstep 6: PRESS_HOME\nstep 7: CLICK: (844, 396)\nstep 8: CLICK: (875, 817)\nstep 9: CLICK: (172, 650)\nI want to Utilize Chrome to search for the ingredients required for Pad Thai. Once you have found the main ingredients, use WPS Office to compile and create a shopping list of these essential items. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Chrome to search for the ingredients required for Pad Thai. Once you have found the main ingredients, use WPS Office to compile and create a shopping list of these essential items.\nThe historical actions are: step 1: CLICK: (619, 836)\nstep 2: CLICK: (802, 190)\nstep 3: TYPE: Pad Thai ingredients\nstep 4: CLICK: (221, 183)\nstep 5: CLICK: (940, 582)\nstep 6: PRESS_HOME\nstep 7: CLICK: (844, 396)\nstep 8: CLICK: (875, 817)\nstep 9: CLICK: (172, 650)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (114, 175)\nC: CLICK: (913, 878)\nD: CLICK: (250, 303)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_107_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_107_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_107_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_107_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_107_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_107_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_107_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_107_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_107_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_107_9.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: 
(872, 74)\nB: COMPLETE\nC: TYPE: https://share.newsbreak.com/6uadf7u2\nD: CLICK: (411, 243)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (188, 658)\nstep 2: CLICK: (303, 580)\nstep 3: CLICK: (288, 677)\nstep 4: CLICK: (804, 89)\nstep 5: TYPE: Todoist\nI want to Enable or disable notifications for any application on the phone and subsequently launch the app. Please ensure you use 'Todoist' and 'Setting' to accomplish this. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Enable or disable notifications for any application on the phone and subsequently launch the app. Please ensure you use 'Todoist' and 'Setting' to accomplish this.\nThe historical actions are: step 1: CLICK: (188, 658)\nstep 2: CLICK: (303, 580)\nstep 3: CLICK: (288, 677)\nstep 4: CLICK: (804, 89)\nstep 5: TYPE: Todoist\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (872, 74)\nB: COMPLETE\nC: TYPE: https://share.newsbreak.com/6uadf7u2\nD: CLICK: (411, 243)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_108_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_108_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_108_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_108_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_108_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_108_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", 
"visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: digital marketing class\nB: CLICK: (326, 274)\nC: CLICK: (517, 942)\nD: CLICK: (374, 62)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (360, 522)\nstep 2: CLICK: (824, 67)\nstep 3: TYPE: basketball\nI want to Use ESPN to find the latest basketball game score, then send the result to caba62244@gmail.com via Gmail. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use ESPN to find the latest basketball game score, then send the result to caba62244@gmail.com via Gmail.\nThe historical actions are: step 1: CLICK: (360, 522)\nstep 2: CLICK: (824, 67)\nstep 3: TYPE: basketball\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: digital marketing class\nB: CLICK: (326, 274)\nC: CLICK: (517, 942)\nD: CLICK: (374, 62)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_109_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_109_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_109_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_109_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (677, 588)\nB: TYPE: Italin Learning\nC: TYPE: Deep Sea Exploration\nD: COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (386, 406)\nstep 2: CLICK: (78, 65)\nstep 3: CLICK: (315, 154)\nstep 4: CLICK: (163, 490)\nstep 5: CLICK: (395, 928)\nstep 6: PRESS_HOME\nstep 7: CLICK: (817, 531)\nstep 8: CLICK: (929, 875)\nI want to Engage in an Italian language lesson on Duolingo and create a comprehensive learning plan using To-Do List. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Engage in an Italian language lesson on Duolingo and create a comprehensive learning plan using To-Do List.\nThe historical actions are: step 1: CLICK: (386, 406)\nstep 2: CLICK: (78, 65)\nstep 3: CLICK: (315, 154)\nstep 4: CLICK: (163, 490)\nstep 5: CLICK: (395, 928)\nstep 6: PRESS_HOME\nstep 7: CLICK: (817, 531)\nstep 8: CLICK: (929, 875)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (677, 588)\nB: TYPE: Italin Learning\nC: TYPE: Deep Sea Exploration\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_110_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_110_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_110_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_110_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_110_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_110_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_110_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_110_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_110_8.png"], "output": 
"B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (920, 767)\nB: CLICK: (239, 90)\nC: CLICK: (508, 892)\nD: CLICK: (163, 832)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (376, 730)\nstep 3: CLICK: (52, 94)\nstep 4: TYPE: 'The Book Thief' by Markus Zusak book review\nstep 5: CLICK: (735, 97)\nstep 6: CLICK: (299, 65)\nstep 7: PRESS_HOME\nstep 8: CLICK: (76, 886)\nI want to Browse Quora to find a book review for 'The Book Thief' by Markus Zusak and then head over to eBay to buy either the ebook or a physical copy. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Browse Quora to find a book review for 'The Book Thief' by Markus Zusak and then head over to eBay to buy either the ebook or a physical copy.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (376, 730)\nstep 3: CLICK: (52, 94)\nstep 4: TYPE: 'The Book Thief' by Markus Zusak book review\nstep 5: CLICK: (735, 97)\nstep 6: CLICK: (299, 65)\nstep 7: PRESS_HOME\nstep 8: CLICK: (76, 886)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (920, 767)\nB: CLICK: (239, 90)\nC: CLICK: (508, 892)\nD: CLICK: (163, 832)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_111_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_111_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_111_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_111_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_111_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_111_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_111_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_111_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_111_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: SCROLL: UP\nC: CLICK: (495, 672)\nD: CLICK: (875, 910)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (865, 126)\nstep 2: CLICK: (90, 671)\nstep 3: TYPE: Garmin Fenix 6\nstep 4: CLICK: (836, 94)\nstep 5: SCROLL: LEFT\nstep 6: SCROLL: LEFT\nstep 7: SCROLL: LEFT\nI want to Research the prices for a Garmin Fenix 6 across various shopping platforms, specifically Alibaba.com - B2B marketplace and Flipkart, and make sure to add the most affordable option to your cart. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Research the prices for a Garmin Fenix 6 across various shopping platforms, specifically Alibaba.com - B2B marketplace and Flipkart, and make sure to add the most affordable option to your cart.\nThe historical actions are: step 1: CLICK: (865, 126)\nstep 2: CLICK: (90, 671)\nstep 3: TYPE: Garmin Fenix 6\nstep 4: CLICK: (836, 94)\nstep 5: SCROLL: LEFT\nstep 6: SCROLL: LEFT\nstep 7: SCROLL: LEFT\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: SCROLL: UP\nC: CLICK: (495, 672)\nD: CLICK: (875, 910)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_112_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_112_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_112_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_112_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_112_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_112_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_112_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_112_7.png"], 
"output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: how to make egg salad sandwich\nB: CLICK: (100, 485)\nC: COMPLETE\nD: CLICK: (919, 74)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (858, 818)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (877, 890)\nstep 4: PRESS_HOME\nstep 5: CLICK: (374, 806)\nstep 6: PRESS_HOME\nstep 7: CLICK: (858, 818)\nstep 8: CLICK: (802, 74)\nI want to Begin by uninstalling the TikTok app using the Google Play Store. Once done, navigate to Settings to verify that TikTok has been successfully uninstalled. After confirmation, download a similar app such as Triller from the Google Play Store, and then proceed to open the Triller app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Begin by uninstalling the TikTok app using the Google Play Store. Once done, navigate to Settings to verify that TikTok has been successfully uninstalled. 
After confirmation, download a similar app such as Triller from the Google Play Store, and then proceed to open the Triller app.\nThe historical actions are: step 1: CLICK: (858, 818)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (877, 890)\nstep 4: PRESS_HOME\nstep 5: CLICK: (374, 806)\nstep 6: PRESS_HOME\nstep 7: CLICK: (858, 818)\nstep 8: CLICK: (802, 74)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: how to make egg salad sandwich\nB: CLICK: (100, 485)\nC: COMPLETE\nD: CLICK: (919, 74)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_113_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_113_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_113_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_113_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_113_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_113_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_113_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_113_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_113_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: SCROLL: UP\nC: CLICK: (875, 379)\nD: CLICK: (264, 873)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (797, 486)\nstep 2: CLICK: (153, 894)\nstep 3: TYPE: Love story\nstep 4: CLICK: (86, 252)\nstep 5: CLICK: (390, 433)\nstep 6: CLICK: (546, 892)\nstep 7: PRESS_HOME\nstep 8: CLICK: (789, 660)\nstep 9: CLICK: (746, 144)\nstep 10: SCROLL: UP\nI want to Using Amazon Music, play the song 'Love Story' and then utilize Yandex Translate to convert the first line of the lyrics into Dutch. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Amazon Music, play the song 'Love Story' and then utilize Yandex Translate to convert the first line of the lyrics into Dutch.\nThe historical actions are: step 1: CLICK: (797, 486)\nstep 2: CLICK: (153, 894)\nstep 3: TYPE: Love story\nstep 4: CLICK: (86, 252)\nstep 5: CLICK: (390, 433)\nstep 6: CLICK: (546, 892)\nstep 7: PRESS_HOME\nstep 8: CLICK: (789, 660)\nstep 9: CLICK: (746, 144)\nstep 10: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: SCROLL: UP\nC: CLICK: (875, 379)\nD: CLICK: (264, 873)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_114_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_114_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_114_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_114_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_114_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_114_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_114_6.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_114_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_114_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_114_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_114_10.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: Razer\nB: CLICK: (174, 731)\nC: CLICK: (637, 110)\nD: TYPE: Duolingo\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: RIGHT\nstep 3: CLICK: (313, 590)\nstep 4: CLICK: (166, 459)\nstep 5: CLICK: (316, 632)\nstep 6: CLICK: (855, 590)\nI want to Using 'ClevCalc - Calculator', compute the sum of 1.5 and 98, then document today's total cost in 'Wallet: Budget Money Manager'. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using 'ClevCalc - Calculator', compute the sum of 1.5 and 98, then document today's total cost in 'Wallet: Budget Money Manager'.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: RIGHT\nstep 3: CLICK: (313, 590)\nstep 4: CLICK: (166, 459)\nstep 5: CLICK: (316, 632)\nstep 6: CLICK: (855, 590)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Razer\nB: CLICK: (174, 731)\nC: CLICK: (637, 110)\nD: TYPE: Duolingo\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_115_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_115_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_115_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_115_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_115_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_115_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_115_6.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (436, 484)\nB: TYPE: caba62244@gmail.com\nC: PRESS_HOME\nD: CLICK: (886, 895)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (331, 912)\nstep 2: TYPE: Hairstyling technique\nstep 3: CLICK: (869, 891)\nstep 4: CLICK: (675, 628)\nstep 5: PRESS_HOME\nI want to Watch a makeup tutorial on YouTube and practice the look while keeping track of the time with the Clock app. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Watch a makeup tutorial on YouTube and practice the look while keeping track of the time with the Clock app.\nThe historical actions are: step 1: CLICK: (331, 912)\nstep 2: TYPE: Hairstyling technique\nstep 3: CLICK: (869, 891)\nstep 4: CLICK: (675, 628)\nstep 5: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (436, 484)\nB: TYPE: caba62244@gmail.com\nC: PRESS_HOME\nD: CLICK: (886, 895)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_116_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_116_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_116_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_116_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_116_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_116_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (494, 311)\nB: TYPE: Business\nC: TYPE: yoga for beginners\nD: TYPE: tiktok\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (426, 714)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (123, 697)\nI want to Update your phone's language settings to Danish and then launch the Photos app to verify the change. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Update your phone's language settings to Danish and then launch the Photos app to verify the change.\nThe historical actions are: step 1: CLICK: (426, 714)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: CLICK: (123, 697)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (494, 311)\nB: TYPE: Business\nC: TYPE: yoga for beginners\nD: TYPE: tiktok\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_117_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_117_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_117_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_117_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_117_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (605, 885)\nB: CLICK: (555, 585)\nC: CLICK: (524, 578)\nD: CLICK: (938, 885)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (608, 515)\nstep 2: CLICK: (572, 668)\nstep 3: TYPE: hotel\nstep 4: CLICK: (919, 885)\nstep 5: PRESS_HOME\nstep 6: CLICK: (422, 492)\nstep 7: CLICK: (397, 612)\nstep 8: TYPE: Russ Hotel\nstep 9: CLICK: (366, 432)\nstep 10: CLICK: (608, 867)\nI want to Locate a nearby hotel using Waze Navigation & Live Traffic, and then request a ride through Lyft. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate a nearby hotel using Waze Navigation & Live Traffic, and then request a ride through Lyft.\nThe historical actions are: step 1: CLICK: (608, 515)\nstep 2: CLICK: (572, 668)\nstep 3: TYPE: hotel\nstep 4: CLICK: (919, 885)\nstep 5: PRESS_HOME\nstep 6: CLICK: (422, 492)\nstep 7: CLICK: (397, 612)\nstep 8: TYPE: Russ Hotel\nstep 9: CLICK: (366, 432)\nstep 10: CLICK: (608, 867)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (605, 885)\nB: CLICK: (555, 585)\nC: CLICK: (524, 578)\nD: CLICK: (938, 885)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_118_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_118_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_118_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_118_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_118_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_118_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_118_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_118_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_118_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_118_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_118_10.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", 
"options": "A: CLICK: (304, 305)\nB: CLICK: (855, 603)\nC: PRESS_HOME\nD: CLICK: (905, 892)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (130, 142)\nstep 2: CLICK: (813, 164)\nstep 3: TYPE: the best boutique hotel in Los Angeles\nI want to Utilize Duckduckgo to search for the top-rated boutique hotel in your local city, and then use Lyft to navigate to it. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Duckduckgo to search for the top-rated boutique hotel in your local city, and then use Lyft to navigate to it.\nThe historical actions are: step 1: CLICK: (130, 142)\nstep 2: CLICK: (813, 164)\nstep 3: TYPE: the best boutique hotel in Los Angeles\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (304, 305)\nB: CLICK: (855, 603)\nC: PRESS_HOME\nD: CLICK: (905, 892)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_119_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_119_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_119_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_119_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: COMPLETE\nB: TYPE: how to make garden salad\nC: TYPE: AirPods Pro 2\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (400, 655)\nstep 3: CLICK: (842, 74)\nstep 4: TYPE: 'The Night Circus' by Erin Morgenstern book review\nstep 5: CLICK: (916, 919)\nstep 6: CLICK: (342, 116)\nstep 7: CLICK: (858, 237)\nstep 8: PRESS_HOME\nstep 9: CLICK: (618, 116)\nstep 10: CLICK: (43, 71)\nstep 11: CLICK: (630, 80)\nstep 12: CLICK: (817, 77)\nstep 13: TYPE: 'The Night Circus' by Erin Morgenstern\nstep 14: CLICK: (939, 919)\nstep 15: CLICK: (210, 259)\nI want to Look up a book review for 'The Night Circus' by Erin Morgenstern online and then buy either the ebook or a physical copy using Ebay or Facebook. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Look up a book review for 'The Night Circus' by Erin Morgenstern online and then buy either the ebook or a physical copy using Ebay or Facebook.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (400, 655)\nstep 3: CLICK: (842, 74)\nstep 4: TYPE: 'The Night Circus' by Erin Morgenstern book review\nstep 5: CLICK: (916, 919)\nstep 6: CLICK: (342, 116)\nstep 7: CLICK: (858, 237)\nstep 8: PRESS_HOME\nstep 9: CLICK: (618, 116)\nstep 10: CLICK: (43, 71)\nstep 11: CLICK: (630, 80)\nstep 12: CLICK: (817, 77)\nstep 13: TYPE: 'The Night Circus' by Erin Morgenstern\nstep 14: CLICK: (939, 919)\nstep 15: CLICK: (210, 259)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: TYPE: how to make garden salad\nC: TYPE: AirPods Pro 2\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_1.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_13.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_14.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_120_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: 'Educated' by Tara Westover\nB: CLICK: (463, 651)\nC: CLICK: (26, 971)\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (594, 732)\nstep 2: PRESS_HOME\nstep 3: CLICK: (594, 328)\nstep 4: CLICK: (555, 615)\nstep 5: CLICK: (852, 893)\nstep 6: TYPE: Chocolate chip cookies: 150g salted butter, softened 80g light brown muscovado sugar, 80g granulated sugar, 2 tsp vanilla extract, 1 large egg, 225g plain flour, 1/2 tsp bicarbonate of soda, 1/4 tsp salt, 200g plain chocolate chips or chunks\nstep 7: CLICK: (883, 317)\nstep 8: PRESS_HOME\nstep 9: CLICK: (611, 462)\nstep 10: CLICK: (588, 74)\nstep 11: CLICK: (919, 84)\nstep 12: TYPE: vanilla extract\nstep 13: CLICK: (888, 889)\nI want to Open Firefox Browser and search for a recipe for Chocolate chip cookies. Once you've found a suitable recipe, use Microsoft To Do to create a shopping list of the main ingredients. Finally, add these ingredients to your cart on Amazon. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open Firefox Browser and search for a recipe for Chocolate chip cookies. Once you've found a suitable recipe, use Microsoft To Do to create a shopping list of the main ingredients. 
Finally, add these ingredients to your cart on Amazon.\nThe historical actions are: step 1: CLICK: (594, 732)\nstep 2: PRESS_HOME\nstep 3: CLICK: (594, 328)\nstep 4: CLICK: (555, 615)\nstep 5: CLICK: (852, 893)\nstep 6: TYPE: Chocolate chip cookies: 150g salted butter, softened 80g light brown muscovado sugar, 80g granulated sugar, 2 tsp vanilla extract, 1 large egg, 225g plain flour, 1/2 tsp bicarbonate of soda, 1/4 tsp salt, 200g plain chocolate chips or chunks\nstep 7: CLICK: (883, 317)\nstep 8: PRESS_HOME\nstep 9: CLICK: (611, 462)\nstep 10: CLICK: (588, 74)\nstep 11: CLICK: (919, 84)\nstep 12: TYPE: vanilla extract\nstep 13: CLICK: (888, 889)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: 'Educated' by Tara Westover\nB: CLICK: (463, 651)\nC: CLICK: (26, 971)\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_121_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_121_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_121_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_121_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_121_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_121_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_121_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_121_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_121_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_121_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_121_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_121_11.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_121_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_121_13.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (306, 498)\nB: SCROLL: UP\nC: SCROLL: LEFT\nD: CLICK: (308, 71)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (580, 215)\nstep 2: PRESS_HOME\nstep 3: CLICK: (330, 231)\nstep 4: CLICK: (111, 78)\nstep 5: CLICK: (602, 917)\nI want to Using the Firefox Browser, find a well-known K-Pop band, listen to their most recent album on Pandora, and determine if concert tickets are available for purchase through StubHub. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using the Firefox Browser, find a well-known K-Pop band, listen to their most recent album on Pandora, and determine if concert tickets are available for purchase through StubHub.\nThe historical actions are: step 1: CLICK: (580, 215)\nstep 2: PRESS_HOME\nstep 3: CLICK: (330, 231)\nstep 4: CLICK: (111, 78)\nstep 5: CLICK: (602, 917)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (306, 498)\nB: SCROLL: UP\nC: SCROLL: LEFT\nD: CLICK: (308, 71)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_122_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_122_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_122_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_122_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_122_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_122_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: LEFT\nB: CLICK: (504, 927)\nC: CLICK: (305, 219)\nD: COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (872, 117)\nstep 3: CLICK: (280, 56)\nstep 4: TYPE: Nvidia RTX 3080\nstep 5: CLICK: (907, 904)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (143, 102)\nstep 10: CLICK: (240, 61)\nstep 11: TYPE: Nvidia RTX 3080\nstep 12: CLICK: (954, 907)\nstep 13: SCROLL: UP\nstep 14: PRESS_HOME\nstep 15: CLICK: (842, 118)\nI want to Look up the prices for an Nvidia RTX 3080 across different shopping platforms, specifically Amazon and AliExpress, then add the most affordable option to your cart. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Look up the prices for an Nvidia RTX 3080 across different shopping platforms, specifically Amazon and AliExpress, then add the most affordable option to your cart.\nThe historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (872, 117)\nstep 3: CLICK: (280, 56)\nstep 4: TYPE: Nvidia RTX 3080\nstep 5: CLICK: (907, 904)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (143, 102)\nstep 10: CLICK: (240, 61)\nstep 11: TYPE: Nvidia RTX 3080\nstep 12: CLICK: (954, 907)\nstep 13: SCROLL: UP\nstep 14: PRESS_HOME\nstep 15: CLICK: (842, 118)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: LEFT\nB: CLICK: (504, 927)\nC: CLICK: (305, 219)\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_9.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_13.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_14.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_123_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: 2012 Nobel-Prize winners in physics\nB: CLICK: (876, 895)\nC: CLICK: (593, 514)\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (140, 626)\nstep 2: CLICK: (400, 288)\nstep 3: TYPE: hiking trail\nstep 4: CLICK: (916, 904)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (581, 604)\nstep 9: SCROLL: UP\nstep 10: PRESS_HOME\nI want to Use Chrome to find a new hiking trail, check the weekend weather forecast using Weather & Radar, and then invite katsunaksu to join the adventure through Tumblr. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Chrome to find a new hiking trail, check the weekend weather forecast using Weather & Radar, and then invite katsunaksu to join the adventure through Tumblr.\nThe historical actions are: step 1: CLICK: (140, 626)\nstep 2: CLICK: (400, 288)\nstep 3: TYPE: hiking trail\nstep 4: CLICK: (916, 904)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (581, 604)\nstep 9: SCROLL: UP\nstep 10: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: 2012 Nobel-Prize winners in physics\nB: CLICK: (876, 895)\nC: CLICK: (593, 514)\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_124_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_124_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_124_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_124_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_124_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_124_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_124_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_124_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_124_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_124_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_124_10.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (913, 74)\nB: CLICK: (555, 475)\nC: SCROLL: UP\nD: TYPE: business\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (858, 505)\nstep 2: CLICK: (524, 563)\nstep 3: CLICK: (64, 54)\nI want to Locate a recent business-related news story using Google News and then add this event to your Calendar. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate a recent business-related news story using Google News and then add this event to your Calendar.\nThe historical actions are: step 1: CLICK: (858, 505)\nstep 2: CLICK: (524, 563)\nstep 3: CLICK: (64, 54)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (913, 74)\nB: CLICK: (555, 475)\nC: SCROLL: UP\nD: TYPE: business\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_125_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_125_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_125_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_125_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: how to train strength\nB: CLICK: (284, 307)\nC: PRESS_HOME\nD: CLICK: (408, 867)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (573, 244)\nstep 2: CLICK: (950, 909)\nstep 3: TYPE: strength training in the morning\nstep 4: CLICK: (974, 843)\nstep 5: PRESS_HOME\nstep 6: CLICK: (576, 903)\nstep 7: CLICK: (973, 44)\nI want to Search for a beginner-friendly strength training workout video on YouTube and create a reminder in Microsoft To Do to perform it tomorrow morning. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Search for a beginner-friendly strength training workout video on YouTube and create a reminder in Microsoft To Do to perform it tomorrow morning.\nThe historical actions are: step 1: CLICK: (573, 244)\nstep 2: CLICK: (950, 909)\nstep 3: TYPE: strength training in the morning\nstep 4: CLICK: (974, 843)\nstep 5: PRESS_HOME\nstep 6: CLICK: (576, 903)\nstep 7: CLICK: (973, 44)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: how to train strength\nB: CLICK: (284, 307)\nC: PRESS_HOME\nD: CLICK: (408, 867)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_126_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_126_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_126_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_126_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_126_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_126_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_126_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_126_7.png"], "output": "A", "qwen3-vl": 
"image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: COMPLETE\nB: TYPE: football\nC: CLICK: (421, 593)\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (419, 654)\nstep 2: TYPE: family movies\nstep 3: CLICK: (913, 924)\nstep 4: CLICK: (170, 434)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nI want to Coordinate a family movie night by selecting a movie suitable for all ages on DuckDuckgo, adding snacks to your cart on Amazon, and sending invitations to Tzhau Jau via Instagram. Don't forget to set a reminder on your Clock app to ensure everything goes smoothly. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Coordinate a family movie night by selecting a movie suitable for all ages on DuckDuckgo, adding snacks to your cart on Amazon, and sending invitations to Tzhau Jau via Instagram. 
Don't forget to set a reminder on your Clock app to ensure everything goes smoothly.\nThe historical actions are: step 1: CLICK: (419, 654)\nstep 2: TYPE: family movies\nstep 3: CLICK: (913, 924)\nstep 4: CLICK: (170, 434)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: TYPE: football\nC: CLICK: (421, 593)\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_127_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_127_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_127_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_127_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_127_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_127_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_127_6.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: Cape Town,tomorrowLcloudy todolist: buy a flight to Cape Town\nB: SCROLL: UP\nC: CLICK: (411, 548)\nD: CLICK: (277, 296)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (561, 918)\nstep 2: CLICK: (523, 79)\nstep 3: CLICK: (957, 91)\nstep 4: TYPE: Cape Town weather tomorrow\nstep 5: CLICK: (883, 884)\nstep 6: PRESS_HOME\nstep 7: CLICK: (71, 331)\nstep 8: CLICK: (124, 398)\nI want to Use Daily Forecast to check the weather prediction for Cape Town for the upcoming day. Based on the forecast, create and organize a to-do list using WPS Office. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Daily Forecast to check the weather prediction for Cape Town for the upcoming day. Based on the forecast, create and organize a to-do list using WPS Office.\nThe historical actions are: step 1: CLICK: (561, 918)\nstep 2: CLICK: (523, 79)\nstep 3: CLICK: (957, 91)\nstep 4: TYPE: Cape Town weather tomorrow\nstep 5: CLICK: (883, 884)\nstep 6: PRESS_HOME\nstep 7: CLICK: (71, 331)\nstep 8: CLICK: (124, 398)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Cape Town,tomorrowLcloudy todolist: buy a flight to Cape Town\nB: SCROLL: UP\nC: CLICK: (411, 548)\nD: CLICK: (277, 296)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_128_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_128_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_128_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_128_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_128_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_128_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_128_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_128_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_128_8.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: CLICK: (916, 815)\nC: COMPLETE\nD: CLICK: (340, 328)\n", "question": "The last 
image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (884, 614)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\nstep 5: CLICK: (443, 644)\nstep 6: CLICK: (323, 408)\nstep 7: CLICK: (316, 475)\nstep 8: CLICK: (297, 674)\nstep 9: CLICK: (938, 87)\nstep 10: TYPE: Finnish\nstep 11: CLICK: (131, 193)\nstep 12: SCROLL: UP\nstep 13: CLICK: (837, 663)\nstep 14: PRESS_HOME\nstep 15: CLICK: (495, 607)\nI want to Switch the language on your phone to Finnish, then open the Settings app to verify the change. Additionally, open the Contacts app to ensure the language update is reflected there as well. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Switch the language on your phone to Finnish, then open the Settings app to verify the change. 
Additionally, open the Contacts app to ensure the language update is reflected there as well.\nThe historical actions are: step 1: CLICK: (884, 614)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\nstep 5: CLICK: (443, 644)\nstep 6: CLICK: (323, 408)\nstep 7: CLICK: (316, 475)\nstep 8: CLICK: (297, 674)\nstep 9: CLICK: (938, 87)\nstep 10: TYPE: Finnish\nstep 11: CLICK: (131, 193)\nstep 12: SCROLL: UP\nstep 13: CLICK: (837, 663)\nstep 14: PRESS_HOME\nstep 15: CLICK: (495, 607)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (916, 815)\nC: COMPLETE\nD: CLICK: (340, 328)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_13.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_14.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_129_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: digital marketing class\nB: SCROLL: UP\nC: CLICK: (441, 645)\nD: CLICK: (206, 427)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (493, 94)\nstep 3: CLICK: (574, 91)\nstep 4: TYPE: KitchenAid Artisan Stand Mixer\nstep 5: CLICK: (872, 860)\nstep 6: SCROLL: UP\nI want to Investigate the prices of a KitchenAid Artisan Stand Mixer across the shopping apps Target and Amazon, then place the most affordable option into your cart. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Investigate the prices of a KitchenAid Artisan Stand Mixer across the shopping apps Target and Amazon, then place the most affordable option into your cart.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (493, 94)\nstep 3: CLICK: (574, 91)\nstep 4: TYPE: KitchenAid Artisan Stand Mixer\nstep 5: CLICK: (872, 860)\nstep 6: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: digital marketing class\nB: SCROLL: UP\nC: CLICK: (441, 645)\nD: CLICK: (206, 427)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_130_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_130_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_130_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_130_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_130_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_130_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_130_6.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: CLICK: (957, 91)\nC: CLICK: (627, 943)\nD: CLICK: (770, 897)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (797, 482)\nstep 2: CLICK: (148, 892)\nstep 3: TYPE: Love story\nstep 4: CLICK: (105, 248)\nstep 5: CLICK: (384, 438)\nstep 6: CLICK: (565, 897)\nstep 7: PRESS_HOME\nstep 8: CLICK: (558, 667)\nstep 9: CLICK: (391, 683)\nstep 10: CLICK: (657, 909)\nI want to Use Amazon Music to listen to the song 'Love Story' and then open Microsoft Translator to translate the first line of the lyrics into Danish. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Amazon Music to listen to the song 'Love Story' and then open Microsoft Translator to translate the first line of the lyrics into Danish.\nThe historical actions are: step 1: CLICK: (797, 482)\nstep 2: CLICK: (148, 892)\nstep 3: TYPE: Love story\nstep 4: CLICK: (105, 248)\nstep 5: CLICK: (384, 438)\nstep 6: CLICK: (565, 897)\nstep 7: PRESS_HOME\nstep 8: CLICK: (558, 667)\nstep 9: CLICK: (391, 683)\nstep 10: CLICK: (657, 909)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (957, 91)\nC: CLICK: (627, 943)\nD: CLICK: (770, 897)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_131_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_131_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_131_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_131_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_131_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_131_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_131_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_131_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_131_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_131_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_131_10.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": 
"GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (274, 169)\nB: CLICK: (50, 970)\nC: CLICK: (744, 488)\nD: CLICK: (815, 73)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (585, 837)\nstep 2: PRESS_HOME\nstep 3: CLICK: (835, 842)\nI want to Open YouTube to watch a video that recommends various fitness tracking apps, and then proceed to download one of the suggested apps from the Google Play Store. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open YouTube to watch a video that recommends various fitness tracking apps, and then proceed to download one of the suggested apps from the Google Play Store.\nThe historical actions are: step 1: CLICK: (585, 837)\nstep 2: PRESS_HOME\nstep 3: CLICK: (835, 842)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (274, 169)\nB: CLICK: (50, 970)\nC: CLICK: (744, 488)\nD: CLICK: (815, 73)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_132_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_132_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_132_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_132_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (773, 544)\nB: CLICK: (819, 144)\nC: LONG_PRESS: (307, 479)\nD: TYPE: 3D Printer Course for Beginners \n", "question": "The last image represents the current screenshot and the preceding 
images are historical screenshots. The historical actions are: step 1: CLICK: (393, 666)\nstep 2: CLICK: (417, 98)\nstep 3: TYPE: Farfetch\nstep 4: CLICK: (901, 893)\nstep 5: CLICK: (483, 331)\nstep 6: CLICK: (352, 275)\nI want to Remove the Farfetch app using Google Play Store, then navigate to Settings to verify if the app resources are still present. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Remove the Farfetch app using Google Play Store, then navigate to Settings to verify if the app resources are still present.\nThe historical actions are: step 1: CLICK: (393, 666)\nstep 2: CLICK: (417, 98)\nstep 3: TYPE: Farfetch\nstep 4: CLICK: (901, 893)\nstep 5: CLICK: (483, 331)\nstep 6: CLICK: (352, 275)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (773, 544)\nB: CLICK: (819, 144)\nC: LONG_PRESS: (307, 479)\nD: TYPE: 3D Printer Course for Beginners \n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_133_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_133_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_133_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_133_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_133_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_133_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_133_6.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: CLICK: (605, 
885)\nC: LONG_PRESS: (106, 287)\nD: COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (637, 812)\nstep 2: CLICK: (797, 180)\nstep 3: CLICK: (508, 584)\nstep 4: CLICK: (912, 703)\nI want to Using Chrome, search for the 2020 Nobel-Prize winners in physics. Once you have the information, record it in Google Docs. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Chrome, search for the 2020 Nobel-Prize winners in physics. Once you have the information, record it in Google Docs.\nThe historical actions are: step 1: CLICK: (637, 812)\nstep 2: CLICK: (797, 180)\nstep 3: CLICK: (508, 584)\nstep 4: CLICK: (912, 703)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (605, 885)\nC: LONG_PRESS: (106, 287)\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_134_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_134_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_134_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_134_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_134_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (413, 83)\nB: PRESS_HOME\nC: CLICK: (184, 249)\nD: CLICK: (143, 706)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (656, 813)\nstep 2: CLICK: (486, 55)\nstep 3: TYPE: properties of rectangle\nstep 4: CLICK: (486, 114)\nstep 5: LONG_PRESS: (55, 533)\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (155, 485)\nstep 8: PRESS_HOME\nI want to Use Chrome to search for information about the properties of a Rectangle, and then use Google Docs to create a brief document summarizing your findings. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Chrome to search for information about the properties of a Rectangle, and then use Google Docs to create a brief document summarizing your findings.\nThe historical actions are: step 1: CLICK: (656, 813)\nstep 2: CLICK: (486, 55)\nstep 3: TYPE: properties of rectangle\nstep 4: CLICK: (486, 114)\nstep 5: LONG_PRESS: (55, 533)\nstep 6: SCROLL: RIGHT\nstep 7: CLICK: (155, 485)\nstep 8: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (413, 83)\nB: PRESS_HOME\nC: CLICK: (184, 249)\nD: CLICK: (143, 706)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_135_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_135_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_135_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_135_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_135_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_135_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_135_6.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_135_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_135_8.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (607, 941)\nB: COMPLETE\nC: CLICK: (914, 29)\nD: CLICK: (467, 833)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (109, 509)\nstep 2: CLICK: (483, 925)\nstep 3: CLICK: (369, 261)\nstep 4: TYPE: chessboard\nstep 5: CLICK: (118, 971)\nstep 6: CLICK: (435, 843)\nstep 7: CLICK: (861, 837)\nstep 8: SCROLL: UP\nstep 9: CLICK: (299, 772)\nstep 10: CLICK: (194, 472)\nstep 11: CLICK: (914, 23)\nI want to Use the GenZArt:Fast AI Art Generator app to create an image with a chessboard theme, and then share it on Instagram with moments. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use the GenZArt:Fast AI Art Generator app to create an image with a chessboard theme, and then share it on Instagram with moments.\nThe historical actions are: step 1: CLICK: (109, 509)\nstep 2: CLICK: (483, 925)\nstep 3: CLICK: (369, 261)\nstep 4: TYPE: chessboard\nstep 5: CLICK: (118, 971)\nstep 6: CLICK: (435, 843)\nstep 7: CLICK: (861, 837)\nstep 8: SCROLL: UP\nstep 9: CLICK: (299, 772)\nstep 10: CLICK: (194, 472)\nstep 11: CLICK: (914, 23)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (607, 941)\nB: COMPLETE\nC: CLICK: (914, 29)\nD: CLICK: (467, 833)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_136_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_136_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_136_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_136_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_136_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_136_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_136_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_136_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_136_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_136_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_136_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_136_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: CLICK: (921, 817)\nC: PRESS_HOME\nD: TYPE: snacks\n", "question": "The last image represents the current screenshot and 
the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (415, 365)\nstep 2: CLICK: (202, 72)\nstep 3: CLICK: (751, 550)\nstep 4: CLICK: (302, 292)\nstep 5: PRESS_HOME\nstep 6: CLICK: (605, 523)\nI want to Engage in an Arabic language lesson and use Todoist to set a structured learning plan, while practicing with Rosetta Stone: Learn, Practice. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Engage in an Arabic language lesson and use Todoist to set a structured learning plan, while practicing with Rosetta Stone: Learn, Practice.\nThe historical actions are: step 1: CLICK: (415, 365)\nstep 2: CLICK: (202, 72)\nstep 3: CLICK: (751, 550)\nstep 4: CLICK: (302, 292)\nstep 5: PRESS_HOME\nstep 6: CLICK: (605, 523)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (921, 817)\nC: PRESS_HOME\nD: TYPE: snacks\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_137_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_137_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_137_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_137_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_137_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_137_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_137_6.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (347, 347)\nB: CLICK: (311, 
828)\nC: COMPLETE\nD: PRESS_HOME\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (432, 721)\nstep 2: CLICK: (148, 940)\nstep 3: CLICK: (955, 781)\nstep 4: PRESS_HOME\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (442, 108)\nI want to Switch to dark mode in the Settings app, then open the Amazon Kindle reading app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Switch to dark mode in the Settings app, then open the Amazon Kindle reading app.\nThe historical actions are: step 1: CLICK: (432, 721)\nstep 2: CLICK: (148, 940)\nstep 3: CLICK: (955, 781)\nstep 4: PRESS_HOME\nstep 5: SCROLL: LEFT\nstep 6: CLICK: (442, 108)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (347, 347)\nB: CLICK: (311, 828)\nC: COMPLETE\nD: PRESS_HOME\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_138_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_138_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_138_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_138_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_138_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_138_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_138_6.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (424, 514)\nB: PRESS_HOME\nC: COMPLETE\nD: CLICK: 
(432, 93)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (639, 643)\nstep 2: SCROLL: UP\nstep 3: CLICK: (341, 714)\nstep 4: CLICK: (906, 717)\nstep 5: PRESS_HOME\nstep 6: CLICK: (136, 644)\nI want to Switch to dark mode in Setting and then launch the 'Ploter - Ebook, Audiobook, PDF' app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Switch to dark mode in Setting and then launch the 'Ploter - Ebook, Audiobook, PDF' app.\nThe historical actions are: step 1: CLICK: (639, 643)\nstep 2: SCROLL: UP\nstep 3: CLICK: (341, 714)\nstep 4: CLICK: (906, 717)\nstep 5: PRESS_HOME\nstep 6: CLICK: (136, 644)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (424, 514)\nB: PRESS_HOME\nC: COMPLETE\nD: CLICK: (432, 93)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_139_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_139_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_139_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_139_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_139_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_139_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_139_6.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: fitness tracking apps\nB: TYPE: Meeting\nC: CLICK: (42, 69)\nD: 
COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (834, 504)\nstep 3: CLICK: (447, 693)\nstep 4: TYPE: football field\nstep 5: CLICK: (571, 208)\nstep 6: PRESS_RECENT\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (524, 529)\nstep 9: CLICK: (357, 67)\nI want to Start by utilizing Petal Maps - GPS & Navigation to locate a nearby Football field. Once found, proceed to the Google Play Store to download a fitness tracking app and set your fitness goals. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Start by utilizing Petal Maps - GPS & Navigation to locate a nearby Football field. Once found, proceed to the Google Play Store to download a fitness tracking app and set your fitness goals.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (834, 504)\nstep 3: CLICK: (447, 693)\nstep 4: TYPE: football field\nstep 5: CLICK: (571, 208)\nstep 6: PRESS_RECENT\nstep 7: SCROLL: RIGHT\nstep 8: CLICK: (524, 529)\nstep 9: CLICK: (357, 67)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: fitness tracking apps\nB: TYPE: Meeting\nC: CLICK: (42, 69)\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_140_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_140_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_140_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_140_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_140_4.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_140_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_140_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_140_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_140_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_140_9.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (454, 354)\nB: SCROLL: LEFT\nC: CLICK: (288, 823)\nD: CLICK: (884, 885)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (572, 922)\nstep 3: CLICK: (459, 878)\nstep 4: CLICK: (146, 90)\nstep 5: TYPE: modern\nstep 6: CLICK: (880, 887)\nstep 7: CLICK: (514, 495)\nstep 8: CLICK: (460, 96)\nstep 9: CLICK: (217, 686)\nstep 10: PRESS_HOME\nstep 11: CLICK: (454, 508)\nstep 12: CLICK: (552, 391)\nstep 13: CLICK: (465, 271)\nI want to Use Pinterest to find a modern-style picture and set it as your phone's wallpaper through the Settings app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Pinterest to find a modern-style picture and set it as your phone's wallpaper through the Settings app.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (572, 922)\nstep 3: CLICK: (459, 878)\nstep 4: CLICK: (146, 90)\nstep 5: TYPE: modern\nstep 6: CLICK: (880, 887)\nstep 7: CLICK: (514, 495)\nstep 8: CLICK: (460, 96)\nstep 9: CLICK: (217, 686)\nstep 10: PRESS_HOME\nstep 11: CLICK: (454, 508)\nstep 12: CLICK: (552, 391)\nstep 13: CLICK: (465, 271)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (454, 354)\nB: SCROLL: LEFT\nC: CLICK: (288, 823)\nD: CLICK: (884, 885)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_141_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_141_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_141_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_141_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_141_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_141_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_141_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_141_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_141_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_141_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_141_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_141_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_141_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_141_13.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", 
"visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: Hospital Helipad\nB: CLICK: (526, 586)\nC: CLICK: (738, 62)\nD: PRESS_HOME\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (244, 749)\nstep 2: CLICK: (931, 655)\nstep 3: CLICK: (73, 796)\nstep 4: TYPE: Shanghai, China itinerary\nstep 5: CLICK: (744, 63)\nstep 6: SCROLL: LEFT\nstep 7: SCROLL: LEFT\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: SCROLL: LEFT\nstep 11: SCROLL: LEFT\nI want to Using the X app and Tripadvisor, find an itinerary for visiting Shanghai, China and book your accommodations. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using the X app and Tripadvisor, find an itinerary for visiting Shanghai, China and book your accommodations.\nThe historical actions are: step 1: CLICK: (244, 749)\nstep 2: CLICK: (931, 655)\nstep 3: CLICK: (73, 796)\nstep 4: TYPE: Shanghai, China itinerary\nstep 5: CLICK: (744, 63)\nstep 6: SCROLL: LEFT\nstep 7: SCROLL: LEFT\nstep 8: SCROLL: LEFT\nstep 9: SCROLL: LEFT\nstep 10: SCROLL: LEFT\nstep 11: SCROLL: LEFT\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Hospital Helipad\nB: CLICK: (526, 586)\nC: CLICK: (738, 62)\nD: PRESS_HOME\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_142_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_142_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_142_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_142_3.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_142_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_142_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_142_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_142_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_142_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_142_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_142_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_142_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: PURDUE:UCONN is 60:75\nB: CLICK: (97, 308)\nC: CLICK: (212, 494)\nD: CLICK: (441, 645)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (55, 660)\nstep 2: CLICK: (408, 80)\nstep 3: TYPE: Meesho\nstep 4: CLICK: (902, 873)\nstep 5: CLICK: (604, 438)\nstep 6: CLICK: (680, 514)\nstep 7: PRESS_HOME\nI want to First, go to the Google Play Store and uninstall the Meesho app. Afterward, navigate to the Settings app to verify whether the Meesho app is still listed in the app resources. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, go to the Google Play Store and uninstall the Meesho app. 
Afterward, navigate to the Settings app to verify whether the Meesho app is still listed in the app resources.\nThe historical actions are: step 1: CLICK: (55, 660)\nstep 2: CLICK: (408, 80)\nstep 3: TYPE: Meesho\nstep 4: CLICK: (902, 873)\nstep 5: CLICK: (604, 438)\nstep 6: CLICK: (680, 514)\nstep 7: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: PURDUE:UCONN is 60:75\nB: CLICK: (97, 308)\nC: CLICK: (212, 494)\nD: CLICK: (441, 645)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_143_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_143_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_143_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_143_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_143_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_143_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_143_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_143_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: AnyList\nB: CLICK: (807, 552)\nC: CLICK: (910, 691)\nD: TYPE: the best ice cream parlor in Los Angeles\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (122, 154)\nstep 2: CLICK: (836, 156)\nstep 3: CLICK: (424, 249)\nstep 4: CLICK: (813, 164)\nI want to Use Duckduckgo to identify the top ice cream parlor in your local city, then utilize GPS to navigate to it. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Duckduckgo to identify the top ice cream parlor in your local city, then utilize GPS to navigate to it.\nThe historical actions are: step 1: CLICK: (122, 154)\nstep 2: CLICK: (836, 156)\nstep 3: CLICK: (424, 249)\nstep 4: CLICK: (813, 164)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: AnyList\nB: CLICK: (807, 552)\nC: CLICK: (910, 691)\nD: TYPE: the best ice cream parlor in Los Angeles\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_144_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_144_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_144_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_144_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_144_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: CLICK: (451, 447)\nC: CLICK: (869, 890)\nD: CLICK: (634, 640)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (702, 143)\nstep 2: CLICK: (155, 287)\nstep 3: TYPE: Healthy lunch plan\nstep 4: CLICK: (577, 449)\nstep 5: SCROLL: UP\nstep 6: CLICK: (605, 771)\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (524, 262)\nI want to Pick a nutritious lunch option for tomorrow, jot it down in Simplenote, and then watch a Tiktok video on how to prepare one of the dishes. 
You can also check Quora for additional information or tips. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Pick a nutritious lunch option for tomorrow, jot it down in Simplenote, and then watch a Tiktok video on how to prepare one of the dishes. You can also check Quora for additional information or tips.\nThe historical actions are: step 1: CLICK: (702, 143)\nstep 2: CLICK: (155, 287)\nstep 3: TYPE: Healthy lunch plan\nstep 4: CLICK: (577, 449)\nstep 5: SCROLL: UP\nstep 6: CLICK: (605, 771)\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (524, 262)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (451, 447)\nC: CLICK: (869, 890)\nD: CLICK: (634, 640)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_145_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_145_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_145_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_145_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_145_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_145_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_145_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_145_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_145_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_145_9.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI 
image", "source": "GUI-Odyssey", "options": "A: TYPE: Club Factory Shopping India\nB: TYPE: learn to grow herbs indoors \nC: PRESS_HOME\nD: CLICK: (363, 151)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (836, 82)\nstep 2: CLICK: (58, 73)\nstep 3: CLICK: (258, 424)\nI want to Utilize ABPV to find a coastal style picture and then set it as the wallpaper on your phone using the Setting app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize ABPV to find a coastal style picture and then set it as the wallpaper on your phone using the Setting app.\nThe historical actions are: step 1: CLICK: (836, 82)\nstep 2: CLICK: (58, 73)\nstep 3: CLICK: (258, 424)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Club Factory Shopping India\nB: TYPE: learn to grow herbs indoors \nC: PRESS_HOME\nD: CLICK: (363, 151)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_146_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_146_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_146_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_146_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (199, 934)\nB: CLICK: (179, 909)\nC: CLICK: (892, 899)\nD: TYPE: do yoga in the morning\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (634, 813)\nstep 2: PRESS_HOME\nstep 3: CLICK: (861, 803)\nstep 4: CLICK: (810, 53)\nstep 5: TYPE: Fitbod\nI want to Watch a video on YouTube about fitness tracking app recommendations and then head over to the Google Play Store to download one of them. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Watch a video on YouTube about fitness tracking app recommendations and then head over to the Google Play Store to download one of them.\nThe historical actions are: step 1: CLICK: (634, 813)\nstep 2: PRESS_HOME\nstep 3: CLICK: (861, 803)\nstep 4: CLICK: (810, 53)\nstep 5: TYPE: Fitbod\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (199, 934)\nB: CLICK: (179, 909)\nC: CLICK: (892, 899)\nD: TYPE: do yoga in the morning\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_147_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_147_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_147_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_147_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_147_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_147_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (899, 320)\nB: TYPE: 5000\nC: CLICK: (885, 900)\nD: CLICK: (362, 810)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (855, 813)\nstep 2: CLICK: (783, 215)\nstep 3: PRESS_HOME\nI want to First, head over to the 'Google Play Store' and install the 'Tiktok' app. Once the installation is complete, open the 'Tiktok' app. Next, navigate to the 'Setting' and turn off the notifications for 'Tiktok'. Finally, reopen the 'Tiktok' app to enjoy watching a video. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, head over to the 'Google Play Store' and install the 'Tiktok' app. Once the installation is complete, open the 'Tiktok' app. Next, navigate to the 'Setting' and turn off the notifications for 'Tiktok'. Finally, reopen the 'Tiktok' app to enjoy watching a video.\nThe historical actions are: step 1: CLICK: (855, 813)\nstep 2: CLICK: (783, 215)\nstep 3: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (899, 320)\nB: TYPE: 5000\nC: CLICK: (885, 900)\nD: CLICK: (362, 810)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_148_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_148_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_148_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_148_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: CLICK: (337, 525)\nC: CLICK: (919, 58)\nD: CLICK: (347, 936)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (309, 214)\nstep 2: CLICK: (520, 230)\nstep 3: TYPE: The Music of the Night\nstep 4: CLICK: (323, 478)\nstep 5: CLICK: (383, 817)\nI want to Open Spotify and listen to the song 'The Music of the Night.' After listening, use Microsoft Translator to translate the first line of the lyrics into Dutch. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open Spotify and listen to the song 'The Music of the Night.' After listening, use Microsoft Translator to translate the first line of the lyrics into Dutch.\nThe historical actions are: step 1: CLICK: (309, 214)\nstep 2: CLICK: (520, 230)\nstep 3: TYPE: The Music of the Night\nstep 4: CLICK: (323, 478)\nstep 5: CLICK: (383, 817)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (337, 525)\nC: CLICK: (919, 58)\nD: CLICK: (347, 936)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_149_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_149_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_149_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_149_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_149_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_149_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (909, 83)\nB: COMPLETE\nC: CLICK: (402, 879)\nD: CLICK: (897, 146)\n", "question": "The last image represents the current screenshot and 
the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (146, 718)\nstep 2: CLICK: (287, 252)\nstep 3: CLICK: (982, 173)\nstep 4: TYPE: esport score\nstep 5: CLICK: (887, 687)\nstep 6: CLICK: (313, 509)\nstep 7: CLICK: (582, 478)\nstep 8: PRESS_HOME\nstep 9: CLICK: (148, 107)\nstep 10: CLICK: (562, 929)\nstep 11: CLICK: (523, 122)\nstep 12: CLICK: (362, 203)\nI want to Using Chrome, find the score of a recent eSports competition and then share the result on Tumblr. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Chrome, find the score of a recent eSports competition and then share the result on Tumblr.\nThe historical actions are: step 1: CLICK: (146, 718)\nstep 2: CLICK: (287, 252)\nstep 3: CLICK: (982, 173)\nstep 4: TYPE: esport score\nstep 5: CLICK: (887, 687)\nstep 6: CLICK: (313, 509)\nstep 7: CLICK: (582, 478)\nstep 8: PRESS_HOME\nstep 9: CLICK: (148, 107)\nstep 10: CLICK: (562, 929)\nstep 11: CLICK: (523, 122)\nstep 12: CLICK: (362, 203)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (909, 83)\nB: COMPLETE\nC: CLICK: (402, 879)\nD: CLICK: (897, 146)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_150_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_150_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_150_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_150_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_150_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_150_5.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_150_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_150_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_150_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_150_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_150_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_150_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_150_12.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: CLICK: (865, 937)\nC: CLICK: (154, 387)\nD: PRESS_HOME\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (390, 245)\nstep 2: CLICK: (186, 73)\nstep 3: TYPE: yoga class\nstep 4: CLICK: (924, 908)\nstep 5: CLICK: (252, 509)\nstep 6: PRESS_HOME\nstep 7: CLICK: (843, 104)\nstep 8: CLICK: (283, 73)\nstep 9: TYPE: yoga mat\nstep 10: CLICK: (934, 911)\nstep 11: CLICK: (927, 583)\nstep 12: CLICK: (477, 935)\nstep 13: PRESS_HOME\nstep 14: CLICK: (367, 520)\nI want to Search for a Yoga class using DuckDuckGo, purchase the necessary items for the class on SHEIN, and then set a reminder in your Calendar to study. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Search for a Yoga class using DuckDuckGo, purchase the necessary items for the class on SHEIN, and then set a reminder in your Calendar to study.\nThe historical actions are: step 1: CLICK: (390, 245)\nstep 2: CLICK: (186, 73)\nstep 3: TYPE: yoga class\nstep 4: CLICK: (924, 908)\nstep 5: CLICK: (252, 509)\nstep 6: PRESS_HOME\nstep 7: CLICK: (843, 104)\nstep 8: CLICK: (283, 73)\nstep 9: TYPE: yoga mat\nstep 10: CLICK: (934, 911)\nstep 11: CLICK: (927, 583)\nstep 12: CLICK: (477, 935)\nstep 13: PRESS_HOME\nstep 14: CLICK: (367, 520)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (865, 937)\nC: CLICK: (154, 387)\nD: PRESS_HOME\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_151_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_151_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_151_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_151_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_151_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_151_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_151_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_151_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_151_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_151_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_151_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_151_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_151_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_151_13.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_151_14.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (161, 332)\nB: CLICK: (451, 194)\nC: SCROLL: UP\nD: PRESS_HOME\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (830, 204)\nstep 2: CLICK: (297, 85)\nstep 3: TYPE: Microsoft's stock market news\nstep 4: CLICK: (912, 882)\nstep 5: CLICK: (798, 673)\nI want to Launch Firefox to search for today's stock market news regarding Microsoft. Then, open the TradingView: Track All Markets app to check the stock price trends. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Launch Firefox to search for today's stock market news regarding Microsoft. 
Then, open the TradingView: Track All Markets app to check the stock price trends.\nThe historical actions are: step 1: CLICK: (830, 204)\nstep 2: CLICK: (297, 85)\nstep 3: TYPE: Microsoft's stock market news\nstep 4: CLICK: (912, 882)\nstep 5: CLICK: (798, 673)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (161, 332)\nB: CLICK: (451, 194)\nC: SCROLL: UP\nD: PRESS_HOME\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_152_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_152_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_152_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_152_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_152_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_152_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: DOWN\nB: CLICK: (362, 812)\nC: TYPE: Disco\nD: CLICK: (116, 493)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (362, 383)\nstep 2: CLICK: (67, 76)\nstep 3: CLICK: (348, 152)\nstep 4: CLICK: (217, 436)\nstep 5: CLICK: (313, 941)\nstep 6: PRESS_HOME\nstep 7: CLICK: (165, 527)\nstep 8: CLICK: (884, 877)\nstep 9: TYPE: German Learning\nstep 10: CLICK: (174, 594)\nstep 11: TYPE: 10 mins per day\nstep 12: CLICK: (938, 645)\nI want to Engage in a German language lesson using Duolingo and organize a learning plan with the help of TickTick. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Engage in a German language lesson using Duolingo and organize a learning plan with the help of TickTick.\nThe historical actions are: step 1: CLICK: (362, 383)\nstep 2: CLICK: (67, 76)\nstep 3: CLICK: (348, 152)\nstep 4: CLICK: (217, 436)\nstep 5: CLICK: (313, 941)\nstep 6: PRESS_HOME\nstep 7: CLICK: (165, 527)\nstep 8: CLICK: (884, 877)\nstep 9: TYPE: German Learning\nstep 10: CLICK: (174, 594)\nstep 11: TYPE: 10 mins per day\nstep 12: CLICK: (938, 645)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: DOWN\nB: CLICK: (362, 812)\nC: TYPE: Disco\nD: CLICK: (116, 493)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_153_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_153_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_153_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_153_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_153_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_153_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_153_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_153_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_153_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_153_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_153_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_153_11.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_153_12.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: paintbrush\nB: CLICK: (85, 385)\nC: CLICK: (520, 524)\nD: CLICK: (319, 733)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (842, 836)\nstep 2: PRESS_HOME\nstep 3: CLICK: (127, 267)\nstep 4: CLICK: (880, 912)\nstep 5: TYPE: Chocolate chip cookies: 150g salted butter, softened 80g light brown muscovado sugar, 80g granulated sugar, 2 tsp vanilla extract, 1 large egg, 225g plain flour, 1/2 tsp bicarbonate of soda, 1/4 tsp salt, 200g plain chocolate chips or chunks\nstep 6: PRESS_HOME\nstep 7: CLICK: (159, 519)\nstep 8: CLICK: (47, 74)\nI want to Search for a Chocolate chip cookie recipe using Opera, compile a shopping list of the main ingredients in Google Keep, and then add these items to your cart on Ebay. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Search for a Chocolate chip cookie recipe using Opera, compile a shopping list of the main ingredients in Google Keep, and then add these items to your cart on Ebay.\nThe historical actions are: step 1: CLICK: (842, 836)\nstep 2: PRESS_HOME\nstep 3: CLICK: (127, 267)\nstep 4: CLICK: (880, 912)\nstep 5: TYPE: Chocolate chip cookies: 150g salted butter, softened 80g light brown muscovado sugar, 80g granulated sugar, 2 tsp vanilla extract, 1 large egg, 225g plain flour, 1/2 tsp bicarbonate of soda, 1/4 tsp salt, 200g plain chocolate chips or chunks\nstep 6: PRESS_HOME\nstep 7: CLICK: (159, 519)\nstep 8: CLICK: (47, 74)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: paintbrush\nB: CLICK: (85, 385)\nC: CLICK: (520, 524)\nD: CLICK: (319, 733)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_154_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_154_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_154_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_154_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_154_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_154_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_154_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_154_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_154_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: Chocolate chip cookies: 150g salted butter, softened 80g light brown muscovado sugar, 80g granulated sugar, 2 tsp vanilla extract, 1 large egg, 225g plain flour, 1/2 tsp bicarbonate of soda, 1/4 tsp salt, 200g 
plain chocolate chips or chunks\nB: TYPE: Nighttime sharpens, heightens each sensation\nC: CLICK: (673, 59)\nD: CLICK: (422, 123)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (716, 119)\nstep 2: CLICK: (319, 64)\nstep 3: TYPE: book about self-help\nstep 4: CLICK: (882, 697)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (287, 102)\nstep 8: CLICK: (25, 352)\nstep 9: CLICK: (172, 68)\nstep 10: TYPE: how to win friends and influence people book\nstep 11: CLICK: (901, 692)\nstep 12: CLICK: (548, 434)\nstep 13: PRESS_HOME\nstep 14: CLICK: (571, 270)\nI want to Utilize Instagram, AliExpress, and Firefox to find a renowned self-help book. Start by searching for recommendations and reviews using Instagram and Firefox. Once you have identified the book, proceed to purchase it on AliExpress. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Instagram, AliExpress, and Firefox to find a renowned self-help book. Start by searching for recommendations and reviews using Instagram and Firefox. 
Once you have identified the book, proceed to purchase it on AliExpress.\nThe historical actions are: step 1: CLICK: (716, 119)\nstep 2: CLICK: (319, 64)\nstep 3: TYPE: book about self-help\nstep 4: CLICK: (882, 697)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (287, 102)\nstep 8: CLICK: (25, 352)\nstep 9: CLICK: (172, 68)\nstep 10: TYPE: how to win friends and influence people book\nstep 11: CLICK: (901, 692)\nstep 12: CLICK: (548, 434)\nstep 13: PRESS_HOME\nstep 14: CLICK: (571, 270)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Chocolate chip cookies: 150g salted butter, softened 80g light brown muscovado sugar, 80g granulated sugar, 2 tsp vanilla extract, 1 large egg, 225g plain flour, 1/2 tsp bicarbonate of soda, 1/4 tsp salt, 200g plain chocolate chips or chunks\nB: TYPE: Nighttime sharpens, heightens each sensation\nC: CLICK: (673, 59)\nD: CLICK: (422, 123)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_155_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_155_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_155_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_155_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_155_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_155_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_155_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_155_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_155_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_155_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_155_10.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_155_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_155_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_155_13.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_155_14.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (352, 204)\nB: CLICK: (366, 897)\nC: CLICK: (139, 211)\nD: CLICK: (938, 874)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (450, 920)\nstep 2: CLICK: (858, 300)\nstep 3: TYPE: when is next comic-con international\nstep 4: CLICK: (236, 287)\nstep 5: PRESS_HOME\nstep 6: CLICK: (330, 136)\nI want to Use Chrome to search for the dates of the next Comic-Con International event and then set a reminder for it in TickTick. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Use Chrome to search for the dates of the next Comic-Con International event and then set a reminder for it in TickTick.\nThe historical actions are: step 1: CLICK: (450, 920)\nstep 2: CLICK: (858, 300)\nstep 3: TYPE: when is next comic-con international\nstep 4: CLICK: (236, 287)\nstep 5: PRESS_HOME\nstep 6: CLICK: (330, 136)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (352, 204)\nB: CLICK: (366, 897)\nC: CLICK: (139, 211)\nD: CLICK: (938, 874)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_156_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_156_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_156_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_156_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_156_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_156_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_156_6.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (163, 159)\nB: TYPE: 125\nC: CLICK: (822, 78)\nD: CLICK: (751, 90)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (863, 505)\nstep 2: CLICK: (895, 243)\nstep 3: CLICK: (54, 62)\nstep 4: CLICK: (765, 70)\nstep 5: TYPE: rock\nstep 6: CLICK: (430, 424)\nstep 7: CLICK: (656, 429)\nstep 8: PRESS_HOME\nstep 9: CLICK: (379, 104)\nstep 10: CLICK: (722, 879)\nstep 11: TYPE: caba62244@gmail.com\nstep 12: CLICK: (263, 324)\nstep 13: CLICK: (253, 341)\nstep 14: TYPE: Rocky\nI want to Listen to a Rock-style album using Pocket FM: Audio Series, then share the name of the album via Gmail with caba62244@gmail.com. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Listen to a Rock-style album using Pocket FM: Audio Series, then share the name of the album via Gmail with caba62244@gmail.com.\nThe historical actions are: step 1: CLICK: (863, 505)\nstep 2: CLICK: (895, 243)\nstep 3: CLICK: (54, 62)\nstep 4: CLICK: (765, 70)\nstep 5: TYPE: rock\nstep 6: CLICK: (430, 424)\nstep 7: CLICK: (656, 429)\nstep 8: PRESS_HOME\nstep 9: CLICK: (379, 104)\nstep 10: CLICK: (722, 879)\nstep 11: TYPE: caba62244@gmail.com\nstep 12: CLICK: (263, 324)\nstep 13: CLICK: (253, 341)\nstep 14: TYPE: Rocky\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (163, 159)\nB: TYPE: 125\nC: CLICK: (822, 78)\nD: CLICK: (751, 90)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_157_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_157_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_157_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_157_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_157_4.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_157_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_157_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_157_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_157_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_157_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_157_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_157_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_157_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_157_13.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_157_14.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (223, 415)\nB: TYPE: The Crusades\nC: CLICK: (683, 589)\nD: COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (910, 324)\nstep 2: CLICK: (764, 811)\nstep 3: CLICK: (555, 573)\nstep 4: CLICK: (746, 86)\nstep 5: CLICK: (628, 752)\nstep 6: CLICK: (751, 90)\nstep 7: CLICK: (766, 86)\nstep 8: SCROLL: LEFT\nstep 9: CLICK: (730, 850)\nstep 10: CLICK: (607, 686)\nstep 11: CLICK: (511, 585)\nstep 12: CLICK: (751, 915)\nI want to Utilize Adobe Express: AI Video Design to edit a photo and then share the edited image on Tumblr, specifically to katsunaksu. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Adobe Express: AI Video Design to edit a photo and then share the edited image on Tumblr, specifically to katsunaksu.\nThe historical actions are: step 1: CLICK: (910, 324)\nstep 2: CLICK: (764, 811)\nstep 3: CLICK: (555, 573)\nstep 4: CLICK: (746, 86)\nstep 5: CLICK: (628, 752)\nstep 6: CLICK: (751, 90)\nstep 7: CLICK: (766, 86)\nstep 8: SCROLL: LEFT\nstep 9: CLICK: (730, 850)\nstep 10: CLICK: (607, 686)\nstep 11: CLICK: (511, 585)\nstep 12: CLICK: (751, 915)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (223, 415)\nB: TYPE: The Crusades\nC: CLICK: (683, 589)\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_158_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_158_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_158_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_158_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_158_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_158_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_158_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_158_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_158_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_158_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_158_10.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_158_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_158_12.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_BACK\nB: CLICK: (169, 343)\nC: COMPLETE\nD: TYPE: goodfood recipe\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (288, 262)\nstep 2: CLICK: (21, 358)\nstep 3: CLICK: (124, 67)\nstep 4: TYPE: smart light bulbs recommendation\nstep 5: CLICK: (885, 704)\nstep 6: CLICK: (173, 556)\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nI want to Utilize Instagram to discover highly recommended smart light bulbs and then proceed to purchase one through Flipkart. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize Instagram to discover highly recommended smart light bulbs and then proceed to purchase one through Flipkart.\nThe historical actions are: step 1: CLICK: (288, 262)\nstep 2: CLICK: (21, 358)\nstep 3: CLICK: (124, 67)\nstep 4: TYPE: smart light bulbs recommendation\nstep 5: CLICK: (885, 704)\nstep 6: CLICK: (173, 556)\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_BACK\nB: CLICK: (169, 343)\nC: COMPLETE\nD: TYPE: goodfood recipe\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_159_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_159_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_159_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_159_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_159_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_159_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_159_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_159_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_159_8.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (424, 646)\nB: CLICK: (482, 292)\nC: CLICK: (379, 74)\nD: COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (440, 914)\nstep 2: CLICK: (274, 415)\nstep 3: TYPE: Facebook's stock market news\nstep 4: CLICK: (884, 878)\nstep 5: SCROLL: UP\nstep 6: CLICK: (246, 786)\nstep 7: PRESS_HOME\nstep 8: CLICK: (913, 494)\nI want to Use Chrome to search for today's stock market news about Facebook, and then open TradingView: Track All Markets to check the stock price trends. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Chrome to search for today's stock market news about Facebook, and then open TradingView: Track All Markets to check the stock price trends.\nThe historical actions are: step 1: CLICK: (440, 914)\nstep 2: CLICK: (274, 415)\nstep 3: TYPE: Facebook's stock market news\nstep 4: CLICK: (884, 878)\nstep 5: SCROLL: UP\nstep 6: CLICK: (246, 786)\nstep 7: PRESS_HOME\nstep 8: CLICK: (913, 494)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (424, 646)\nB: CLICK: (482, 292)\nC: CLICK: (379, 74)\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_160_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_160_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_160_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_160_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_160_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_160_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_160_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_160_7.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_160_8.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: COMPLETE\nB: CLICK: (113, 808)\nC: CLICK: (666, 804)\nD: CLICK: (264, 887)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (584, 553)\nstep 3: CLICK: (347, 546)\nstep 4: CLICK: (440, 809)\nstep 5: CLICK: (343, 658)\nstep 6: CLICK: (545, 896)\nstep 7: CLICK: (554, 806)\nI want to First, use the Calculator app to determine the sum of '27.3+13' for today's total cost. Once you have the total, record this amount in either a document or the Google Keep app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, use the Calculator app to determine the sum of '27.3+13' for today's total cost. 
Once you have the total, record this amount in either a document or the Google Keep app.\nThe historical actions are: step 1: SCROLL: LEFT\nstep 2: CLICK: (584, 553)\nstep 3: CLICK: (347, 546)\nstep 4: CLICK: (440, 809)\nstep 5: CLICK: (343, 658)\nstep 6: CLICK: (545, 896)\nstep 7: CLICK: (554, 806)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (113, 808)\nC: CLICK: (666, 804)\nD: CLICK: (264, 887)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_161_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_161_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_161_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_161_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_161_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_161_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_161_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_161_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: CLICK: (819, 243)\nC: CLICK: (919, 69)\nD: CLICK: (370, 319)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: RIGHT\nstep 3: SCROLL: RIGHT\nstep 4: SCROLL: LEFT\nstep 5: CLICK: (858, 114)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: SCROLL: DOWN\nstep 11: PRESS_HOME\nstep 12: SCROLL: RIGHT\nI want to Using DuckDuckGo to search for the 2019 Nobel Prize winners in Physics, and then record the gathered information in Simplenote. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using DuckDuckGo to search for the 2019 Nobel Prize winners in Physics, and then record the gathered information in Simplenote.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: RIGHT\nstep 3: SCROLL: RIGHT\nstep 4: SCROLL: LEFT\nstep 5: CLICK: (858, 114)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: SCROLL: UP\nstep 9: SCROLL: UP\nstep 10: SCROLL: DOWN\nstep 11: PRESS_HOME\nstep 12: SCROLL: RIGHT\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (819, 243)\nC: CLICK: (919, 69)\nD: CLICK: (370, 319)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_162_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_162_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_162_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_162_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_162_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_162_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_162_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_162_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_162_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_162_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_162_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_162_11.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_162_12.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: SCROLL: LEFT\nC: COMPLETE\nD: TYPE: Healthy lunch plan\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (143, 665)\nstep 2: CLICK: (947, 76)\nstep 3: CLICK: (406, 151)\nstep 4: TYPE: Moscow\nstep 5: CLICK: (270, 240)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (284, 608)\nstep 9: PRESS_HOME\nstep 10: CLICK: (411, 400)\nstep 11: CLICK: (893, 921)\nstep 12: CLICK: (893, 873)\nstep 13: TYPE: Moscow,tomorrow: cloudy and some sun todolist:buy a flight to Moscow.\nstep 14: CLICK: (69, 86)\nI want to Utilize the Weather & Radar app to check the weather forecast for Moscow tomorrow, and based on that information, create a detailed to-do list using Google Docs. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize the Weather & Radar app to check the weather forecast for Moscow tomorrow, and based on that information, create a detailed to-do list using Google Docs.\nThe historical actions are: step 1: CLICK: (143, 665)\nstep 2: CLICK: (947, 76)\nstep 3: CLICK: (406, 151)\nstep 4: TYPE: Moscow\nstep 5: CLICK: (270, 240)\nstep 6: SCROLL: UP\nstep 7: SCROLL: UP\nstep 8: CLICK: (284, 608)\nstep 9: PRESS_HOME\nstep 10: CLICK: (411, 400)\nstep 11: CLICK: (893, 921)\nstep 12: CLICK: (893, 873)\nstep 13: TYPE: Moscow,tomorrow: cloudy and some sun todolist:buy a flight to Moscow.\nstep 14: CLICK: (69, 86)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: SCROLL: LEFT\nC: COMPLETE\nD: TYPE: Healthy lunch plan\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_163_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_163_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_163_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_163_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_163_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_163_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_163_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_163_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_163_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_163_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_163_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_163_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_163_12.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_163_13.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_163_14.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (879, 693)\nB: TYPE: plain chocolate chips\nC: CLICK: (182, 236)\nD: PRESS_HOME\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (716, 106)\nstep 2: CLICK: (292, 68)\nstep 3: TYPE: book about cookbook\nstep 4: CLICK: (894, 683)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (140, 97)\nstep 9: CLICK: (668, 68)\nstep 10: TYPE: cook this book\nI want to Utilize Firefox to search for a renowned cookbook and read various reviews about it. Once decided, check Ebay for listings and proceed with the purchase. Additionally, consider browsing Facebook for any available recommendations or seller reviews before finalizing your purchase. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Firefox to search for a renowned cookbook and read various reviews about it. Once decided, check Ebay for listings and proceed with the purchase. 
Additionally, consider browsing Facebook for any available recommendations or seller reviews before finalizing your purchase.\nThe historical actions are: step 1: CLICK: (716, 106)\nstep 2: CLICK: (292, 68)\nstep 3: TYPE: book about cookbook\nstep 4: CLICK: (894, 683)\nstep 5: SCROLL: UP\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (140, 97)\nstep 9: CLICK: (668, 68)\nstep 10: TYPE: cook this book\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (879, 693)\nB: TYPE: plain chocolate chips\nC: CLICK: (182, 236)\nD: PRESS_HOME\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_164_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_164_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_164_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_164_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_164_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_164_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_164_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_164_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_164_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_164_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_164_10.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (543, 277)\nB: CLICK: (860, 887)\nC: PRESS_HOME\nD: CLICK: (931, 638)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (554, 156)\nstep 2: CLICK: (727, 70)\nstep 3: TYPE: climate change rally\nI want to Locate an event related to a climate change rally using X and Facebook, then have a discussion about it with liudehu19294094. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate an event related to a climate change rally using X and Facebook, then have a discussion about it with liudehu19294094.\nThe historical actions are: step 1: CLICK: (554, 156)\nstep 2: CLICK: (727, 70)\nstep 3: TYPE: climate change rally\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (543, 277)\nB: CLICK: (860, 887)\nC: PRESS_HOME\nD: CLICK: (931, 638)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_165_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_165_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_165_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_165_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: CLICK: (299, 229)\nC: CLICK: (920, 910)\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (141, 670)\nstep 2: CLICK: (317, 282)\nstep 3: TYPE: hiking trail\nstep 4: CLICK: (915, 919)\nstep 5: CLICK: (500, 511)\nstep 6: PRESS_HOME\nstep 7: CLICK: (148, 393)\nstep 8: SCROLL: DOWN\nstep 9: CLICK: (951, 78)\nstep 10: CLICK: (308, 153)\nstep 11: TYPE: California\nI want to Using Chrome, search for a new hiking trail. Then, check the weekend weather forecast using Weather & Radar. Finally, send an invitation to join the hike to caba62244@gmail.com through Gmail. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Using Chrome, search for a new hiking trail. Then, check the weekend weather forecast using Weather & Radar. Finally, send an invitation to join the hike to caba62244@gmail.com through Gmail.\nThe historical actions are: step 1: CLICK: (141, 670)\nstep 2: CLICK: (317, 282)\nstep 3: TYPE: hiking trail\nstep 4: CLICK: (915, 919)\nstep 5: CLICK: (500, 511)\nstep 6: PRESS_HOME\nstep 7: CLICK: (148, 393)\nstep 8: SCROLL: DOWN\nstep 9: CLICK: (951, 78)\nstep 10: CLICK: (308, 153)\nstep 11: TYPE: California\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (299, 229)\nC: CLICK: (920, 910)\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_166_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_166_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_166_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_166_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_166_4.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_166_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_166_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_166_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_166_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_166_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_166_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_166_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: SCROLL: UP\nC: TYPE: happy\nD: SCROLL: DOWN\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (415, 663)\nstep 2: CLICK: (172, 84)\nstep 3: SCROLL: UP\nstep 4: CLICK: (196, 707)\nstep 5: CLICK: (459, 327)\nI want to Attend a Portuguese language lesson using Rosetta Stone: Learn, Practice and set a learning plan in Things. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Attend a Portuguese language lesson using Rosetta Stone: Learn, Practice and set a learning plan in Things.\nThe historical actions are: step 1: CLICK: (415, 663)\nstep 2: CLICK: (172, 84)\nstep 3: SCROLL: UP\nstep 4: CLICK: (196, 707)\nstep 5: CLICK: (459, 327)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: SCROLL: UP\nC: TYPE: happy\nD: SCROLL: DOWN\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_167_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_167_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_167_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_167_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_167_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_167_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: CLICK: (23, 462)\nC: TYPE: instagram\nD: COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (567, 263)\nstep 2: CLICK: (378, 651)\nstep 3: CLICK: (899, 67)\nstep 4: LONG_PRESS: (463, 287)\nstep 5: SCROLL: LEFT\nstep 6: SCROLL: DOWN\nstep 7: CLICK: (296, 216)\nstep 8: PRESS_HOME\nstep 9: CLICK: (714, 398)\nstep 10: CLICK: (631, 564)\nstep 11: CLICK: (729, 412)\nI want to Open Opera News and read an English news article. Translate the title of the article into German using DeepL translate. Finally, record the translated title in Simplenote. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open Opera News and read an English news article. Translate the title of the article into German using DeepL translate. Finally, record the translated title in Simplenote.\nThe historical actions are: step 1: CLICK: (567, 263)\nstep 2: CLICK: (378, 651)\nstep 3: CLICK: (899, 67)\nstep 4: LONG_PRESS: (463, 287)\nstep 5: SCROLL: LEFT\nstep 6: SCROLL: DOWN\nstep 7: CLICK: (296, 216)\nstep 8: PRESS_HOME\nstep 9: CLICK: (714, 398)\nstep 10: CLICK: (631, 564)\nstep 11: CLICK: (729, 412)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (23, 462)\nC: TYPE: instagram\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_168_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_168_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_168_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_168_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_168_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_168_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_168_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_168_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_168_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_168_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_168_10.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_168_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (924, 70)\nB: CLICK: (489, 941)\nC: CLICK: (177, 756)\nD: CLICK: (376, 827)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (824, 817)\nstep 2: CLICK: (846, 212)\nstep 3: PRESS_HOME\nI want to First, use the Google Play Store to install the Tiktok app. Once installed, launch Tiktok. After launching, navigate to the Setting app to disable notifications for Tiktok. Finally, reopen Tiktok to enjoy watching a video. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, use the Google Play Store to install the Tiktok app. Once installed, launch Tiktok. After launching, navigate to the Setting app to disable notifications for Tiktok. 
Finally, reopen Tiktok to enjoy watching a video.\nThe historical actions are: step 1: CLICK: (824, 817)\nstep 2: CLICK: (846, 212)\nstep 3: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (924, 70)\nB: CLICK: (489, 941)\nC: CLICK: (177, 756)\nD: CLICK: (376, 827)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_169_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_169_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_169_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_169_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (463, 651)\nB: CLICK: (719, 278)\nC: CLICK: (399, 142)\nD: PRESS_RECENT\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (419, 106)\nstep 2: CLICK: (278, 264)\nstep 3: CLICK: (69, 189)\nstep 4: CLICK: (919, 598)\nstep 5: CLICK: (79, 888)\nstep 6: CLICK: (79, 588)\nstep 7: CLICK: (923, 692)\nstep 8: LONG_PRESS: (228, 387)\nstep 9: SCROLL: RIGHT\nstep 10: CLICK: (465, 332)\nstep 11: PRESS_HOME\nI want to Using Opera, look up the 2021 Nobel-Prize winners in physics and document the details in Microsoft Word. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using Opera, look up the 2021 Nobel-Prize winners in physics and document the details in Microsoft Word.\nThe historical actions are: step 1: CLICK: (419, 106)\nstep 2: CLICK: (278, 264)\nstep 3: CLICK: (69, 189)\nstep 4: CLICK: (919, 598)\nstep 5: CLICK: (79, 888)\nstep 6: CLICK: (79, 588)\nstep 7: CLICK: (923, 692)\nstep 8: LONG_PRESS: (228, 387)\nstep 9: SCROLL: RIGHT\nstep 10: CLICK: (465, 332)\nstep 11: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (463, 651)\nB: CLICK: (719, 278)\nC: CLICK: (399, 142)\nD: PRESS_RECENT\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_170_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_170_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_170_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_170_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_170_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_170_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_170_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_170_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_170_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_170_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_170_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_170_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: CLICK: (402, 866)\nC: CLICK: (502, 331)\nD: TYPE: Bayes' theorem\n", "question": "The last image represents the current screenshot and the 
preceding images are historical screenshots. The historical actions are: step 1: CLICK: (216, 682)\nstep 2: CLICK: (205, 425)\nstep 3: TYPE: drama movies\nstep 4: CLICK: (841, 903)\nI want to Organize a movie night by selecting a drama film using Chrome, adding snacks to your cart on eBay, sending out invitations to caba62244@gmail.com via Gmail, and setting a reminder on the Clock app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Organize a movie night by selecting a drama film using Chrome, adding snacks to your cart on eBay, sending out invitations to caba62244@gmail.com via Gmail, and setting a reminder on the Clock app.\nThe historical actions are: step 1: CLICK: (216, 682)\nstep 2: CLICK: (205, 425)\nstep 3: TYPE: drama movies\nstep 4: CLICK: (841, 903)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (402, 866)\nC: CLICK: (502, 331)\nD: TYPE: Bayes' theorem\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_171_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_171_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_171_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_171_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_171_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: CLICK: (287, 817)\nC: CLICK: (425, 575)\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical 
screenshots. The historical actions are: step 1: CLICK: (130, 362)\nstep 2: CLICK: (226, 866)\nstep 3: TYPE: tell me about Binomial theorem\nstep 4: CLICK: (906, 596)\nstep 5: CLICK: (112, 965)\nstep 6: PRESS_HOME\nstep 7: CLICK: (130, 244)\nstep 8: CLICK: (337, 56)\nstep 9: TYPE: Binomial theorem\nstep 10: CLICK: (928, 905)\nI want to Utilize ChatOn - AI Chat Bot Assistant to inquire about the Binomial theorem, then cross-verify the information using Firefox to conduct a browser search. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize ChatOn - AI Chat Bot Assistant to inquire about the Binomial theorem, then cross-verify the information using Firefox to conduct a browser search.\nThe historical actions are: step 1: CLICK: (130, 362)\nstep 2: CLICK: (226, 866)\nstep 3: TYPE: tell me about Binomial theorem\nstep 4: CLICK: (906, 596)\nstep 5: CLICK: (112, 965)\nstep 6: PRESS_HOME\nstep 7: CLICK: (130, 244)\nstep 8: CLICK: (337, 56)\nstep 9: TYPE: Binomial theorem\nstep 10: CLICK: (928, 905)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (287, 817)\nC: CLICK: (425, 575)\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_172_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_172_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_172_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_172_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_172_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_172_5.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_172_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_172_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_172_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_172_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_172_10.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (898, 69)\nB: CLICK: (323, 80)\nC: COMPLETE\nD: CLICK: (333, 127)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (691, 214)\nstep 2: TYPE: Fitness Tracker Apps\nstep 3: CLICK: (894, 878)\nstep 4: CLICK: (454, 857)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (513, 760)\nI want to Using DuckDuckGo to research various Fitness Tracker apps and then download one that suits your needs from the Google Play Store. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using DuckDuckGo to research various Fitness Tracker apps and then download one that suits your needs from the Google Play Store.\nThe historical actions are: step 1: CLICK: (691, 214)\nstep 2: TYPE: Fitness Tracker Apps\nstep 3: CLICK: (894, 878)\nstep 4: CLICK: (454, 857)\nstep 5: SCROLL: UP\nstep 6: PRESS_HOME\nstep 7: CLICK: (513, 760)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (898, 69)\nB: CLICK: (323, 80)\nC: COMPLETE\nD: CLICK: (333, 127)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_173_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_173_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_173_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_173_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_173_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_173_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_173_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_173_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (240, 779)\nB: CLICK: (532, 928)\nC: CLICK: (163, 147)\nD: CLICK: (408, 492)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (599, 911)\nstep 2: PRESS_HOME\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (124, 699)\nstep 5: TYPE: centr app\nI want to Open YouTube and watch a video about fitness tracking app recommendations, then head over to the Google Play Store and download one of the suggested apps. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open YouTube and watch a video about fitness tracking app recommendations, then head over to the Google Play Store and download one of the suggested apps.\nThe historical actions are: step 1: CLICK: (599, 911)\nstep 2: PRESS_HOME\nstep 3: SCROLL: RIGHT\nstep 4: CLICK: (124, 699)\nstep 5: TYPE: centr app\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (240, 779)\nB: CLICK: (532, 928)\nC: CLICK: (163, 147)\nD: CLICK: (408, 492)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_174_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_174_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_174_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_174_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_174_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_174_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: Meesho\nB: SCROLL: UP\nC: CLICK: (122, 161)\nD: CLICK: (234, 564)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (144, 706)\nstep 2: CLICK: (287, 112)\nstep 3: TYPE: new hiking trail\nstep 4: CLICK: (889, 683)\nstep 5: CLICK: (337, 924)\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (144, 574)\nstep 9: CLICK: (973, 69)\nstep 10: CLICK: (427, 169)\nstep 11: TYPE: Hong Kong\nstep 12: CLICK: (383, 274)\nstep 13: SCROLL: UP\nstep 14: SCROLL: UP\nI want to Browse for a new hiking trail using Chrome, check the weekend weather with Weather & Radar, and invite Victor James to join through Messenger. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Browse for a new hiking trail using Chrome, check the weekend weather with Weather & Radar, and invite Victor James to join through Messenger.\nThe historical actions are: step 1: CLICK: (144, 706)\nstep 2: CLICK: (287, 112)\nstep 3: TYPE: new hiking trail\nstep 4: CLICK: (889, 683)\nstep 5: CLICK: (337, 924)\nstep 6: SCROLL: UP\nstep 7: PRESS_HOME\nstep 8: CLICK: (144, 574)\nstep 9: CLICK: (973, 69)\nstep 10: CLICK: (427, 169)\nstep 11: TYPE: Hong Kong\nstep 12: CLICK: (383, 274)\nstep 13: SCROLL: UP\nstep 14: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: Meesho\nB: SCROLL: UP\nC: CLICK: (122, 161)\nD: CLICK: (234, 564)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_175_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_175_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_175_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_175_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_175_4.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_175_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_175_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_175_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_175_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_175_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_175_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_175_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_175_12.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_175_13.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_175_14.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (342, 154)\nB: TYPE: TickTick\nC: TYPE: the latest Spider-Man movie\nD: COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (432, 131)\nstep 3: CLICK: (761, 121)\nI want to Utilize Opera to search for the latest Spider-Man movie, and then consult aCalendar to find an available evening to watch it. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize Opera to search for the latest Spider-Man movie, and then consult aCalendar to find an available evening to watch it.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (432, 131)\nstep 3: CLICK: (761, 121)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (342, 154)\nB: TYPE: TickTick\nC: TYPE: the latest Spider-Man movie\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_176_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_176_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_176_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_176_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (581, 368)\nB: TYPE: New York City, USA itinerary\nC: CLICK: (578, 333)\nD: PRESS_HOME\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (229, 269)\nstep 3: CLICK: (59, 950)\nstep 4: CLICK: (67, 52)\nI want to Create an ideal itinerary for a trip to New York City, USA, and secure your stay using Airbnb. Additionally, utilize Threads to discuss and finalize your itinerary details. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Create an ideal itinerary for a trip to New York City, USA, and secure your stay using Airbnb. 
Additionally, utilize Threads to discuss and finalize your itinerary details.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: CLICK: (229, 269)\nstep 3: CLICK: (59, 950)\nstep 4: CLICK: (67, 52)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (581, 368)\nB: TYPE: New York City, USA itinerary\nC: CLICK: (578, 333)\nD: PRESS_HOME\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_177_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_177_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_177_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_177_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_177_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: COMPLETE\nB: PRESS_HOME\nC: CLICK: (816, 125)\nD: CLICK: (518, 892)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (171, 397)\nstep 2: CLICK: (355, 134)\nstep 3: TYPE: book about technology\nstep 4: CLICK: (915, 907)\nstep 5: PRESS_HOME\nstep 6: CLICK: (840, 265)\nstep 7: CLICK: (293, 949)\nstep 8: CLICK: (467, 87)\nstep 9: TYPE: the age of AI book\nstep 10: CLICK: (930, 916)\nstep 11: CLICK: (467, 801)\nstep 12: SCROLL: UP\nI want to Use Instagram to discover a famous technology book, then switch to Opera to read reviews about it, and finally, head over to Amazon to purchase the book. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Instagram to discover a famous technology book, then switch to Opera to read reviews about it, and finally, head over to Amazon to purchase the book.\nThe historical actions are: step 1: CLICK: (171, 397)\nstep 2: CLICK: (355, 134)\nstep 3: TYPE: book about technology\nstep 4: CLICK: (915, 907)\nstep 5: PRESS_HOME\nstep 6: CLICK: (840, 265)\nstep 7: CLICK: (293, 949)\nstep 8: CLICK: (467, 87)\nstep 9: TYPE: the age of AI book\nstep 10: CLICK: (930, 916)\nstep 11: CLICK: (467, 801)\nstep 12: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: PRESS_HOME\nC: CLICK: (816, 125)\nD: CLICK: (518, 892)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_178_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_178_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_178_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_178_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_178_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_178_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_178_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_178_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_178_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_178_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_178_10.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_178_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_178_12.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: COMPLETE\nB: CLICK: (911, 80)\nC: CLICK: (464, 812)\nD: TYPE: triller\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (670, 922)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (148, 174)\nstep 4: PRESS_HOME\nstep 5: CLICK: (537, 929)\nstep 6: PRESS_HOME\nstep 7: CLICK: (664, 916)\nstep 8: CLICK: (406, 69)\nstep 9: CLICK: (964, 76)\nI want to First, open the App Store and uninstall the TikTok app. Afterward, navigate to the Setting app to verify if TikTok has been successfully uninstalled. Then, head over to Google Play Store, download the Triller app, and finally, open the newly installed app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: First, open the App Store and uninstall the TikTok app. Afterward, navigate to the Setting app to verify if TikTok has been successfully uninstalled. 
Then, head over to Google Play Store, download the Triller app, and finally, open the newly installed app.\nThe historical actions are: step 1: CLICK: (670, 922)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (148, 174)\nstep 4: PRESS_HOME\nstep 5: CLICK: (537, 929)\nstep 6: PRESS_HOME\nstep 7: CLICK: (664, 916)\nstep 8: CLICK: (406, 69)\nstep 9: CLICK: (964, 76)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: CLICK: (911, 80)\nC: CLICK: (464, 812)\nD: TYPE: triller\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_179_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_179_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_179_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_179_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_179_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_179_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_179_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_179_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_179_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_179_9.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: CLICK: (191, 554)\nC: COMPLETE\nD: CLICK: (837, 80)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (385, 643)\nstep 2: CLICK: (440, 57)\nstep 3: TYPE: biographical movies\nstep 4: CLICK: (921, 901)\nstep 5: CLICK: (316, 235)\nstep 6: SCROLL: UP\nI want to Organize a movie night by selecting a biographical film on DuckDuckgo, adding snacks to your cart on Amazon, sending an invitation to caba62244@gmail.com via Gmail, and setting a reminder on Clock. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Organize a movie night by selecting a biographical film on DuckDuckgo, adding snacks to your cart on Amazon, sending an invitation to caba62244@gmail.com via Gmail, and setting a reminder on Clock.\nThe historical actions are: step 1: CLICK: (385, 643)\nstep 2: CLICK: (440, 57)\nstep 3: TYPE: biographical movies\nstep 4: CLICK: (921, 901)\nstep 5: CLICK: (316, 235)\nstep 6: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (191, 554)\nC: COMPLETE\nD: CLICK: (837, 80)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_180_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_180_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_180_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_180_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_180_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_180_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_180_6.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI 
image", "source": "GUI-Odyssey", "options": "A: CLICK: (922, 911)\nB: PRESS_HOME\nC: TYPE: hiking trail\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (160, 145)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\nstep 5: CLICK: (420, 574)\nstep 6: LONG_PRESS: (486, 424)\nstep 7: SCROLL: LEFT\nstep 8: SCROLL: DOWN\nstep 9: CLICK: (148, 376)\nI want to Open the AP News app and read an English news article. Use DeepL translate to translate the title of the article into French. Finally, record the translated title in Google Docs. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open the AP News app and read an English news article. Use DeepL translate to translate the title of the article into French. 
Finally, record the translated title in Google Docs.\nThe historical actions are: step 1: CLICK: (160, 145)\nstep 2: SCROLL: UP\nstep 3: SCROLL: UP\nstep 4: SCROLL: UP\nstep 5: CLICK: (420, 574)\nstep 6: LONG_PRESS: (486, 424)\nstep 7: SCROLL: LEFT\nstep 8: SCROLL: DOWN\nstep 9: CLICK: (148, 376)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (922, 911)\nB: PRESS_HOME\nC: TYPE: hiking trail\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_181_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_181_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_181_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_181_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_181_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_181_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_181_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_181_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_181_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_181_9.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: RIGHT\nB: CLICK: (931, 908)\nC: PRESS_HOME\nD: TYPE: properties of cylinder\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (377, 144)\nstep 2: CLICK: (578, 199)\nstep 3: CLICK: (938, 131)\nI want to Utilize Opera to research information on the properties of a Cylinder, then compile a brief document using WPS Office. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize Opera to research information on the properties of a Cylinder, then compile a brief document using WPS Office.\nThe historical actions are: step 1: CLICK: (377, 144)\nstep 2: CLICK: (578, 199)\nstep 3: CLICK: (938, 131)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: RIGHT\nB: CLICK: (931, 908)\nC: PRESS_HOME\nD: TYPE: properties of cylinder\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_182_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_182_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_182_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_182_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (872, 789)\nB: CLICK: (516, 901)\nC: CLICK: (880, 699)\nD: CLICK: (892, 716)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (285, 99)\nstep 2: CLICK: (131, 53)\nstep 3: TYPE: mobile app developer\nI want to Utilize LinkedIn: Jobs & Business News to find a mobile app developer job, then use WPS office to record the company name. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Utilize LinkedIn: Jobs & Business News to find a mobile app developer job, then use WPS office to record the company name.\nThe historical actions are: step 1: CLICK: (285, 99)\nstep 2: CLICK: (131, 53)\nstep 3: TYPE: mobile app developer\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (872, 789)\nB: CLICK: (516, 901)\nC: CLICK: (880, 699)\nD: CLICK: (892, 716)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_183_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_183_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_183_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_183_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (183, 82)\nB: CLICK: (347, 381)\nC: CLICK: (174, 345)\nD: CLICK: (936, 919)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (174, 237)\nstep 2: CLICK: (374, 182)\nstep 3: CLICK: (919, 104)\nstep 4: CLICK: (928, 111)\nstep 5: TYPE: when is the next Coachella music festival\nI want to Using Opera, search for the next Coachella music festival dates and then set a reminder in Microsoft To Do. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Using Opera, search for the next Coachella music festival dates and then set a reminder in Microsoft To Do.\nThe historical actions are: step 1: CLICK: (174, 237)\nstep 2: CLICK: (374, 182)\nstep 3: CLICK: (919, 104)\nstep 4: CLICK: (928, 111)\nstep 5: TYPE: when is the next Coachella music festival\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (183, 82)\nB: CLICK: (347, 381)\nC: CLICK: (174, 345)\nD: CLICK: (936, 919)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_184_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_184_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_184_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_184_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_184_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_184_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: CLICK: (815, 671)\nC: CLICK: (929, 916)\nD: CLICK: (876, 484)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (341, 393)\nstep 2: CLICK: (242, 573)\nstep 3: CLICK: (524, 567)\nstep 4: CLICK: (799, 576)\nstep 5: CLICK: (272, 664)\nstep 6: CLICK: (492, 671)\nI want to Use Applock Pro - APP Lock & Guard to secure the Venmo app with a lock, and then open Venmo to verify the lock is active. The PIN for the lock is 123456. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Applock Pro - APP Lock & Guard to secure the Venmo app with a lock, and then open Venmo to verify the lock is active. The PIN for the lock is 123456.\nThe historical actions are: step 1: CLICK: (341, 393)\nstep 2: CLICK: (242, 573)\nstep 3: CLICK: (524, 567)\nstep 4: CLICK: (799, 576)\nstep 5: CLICK: (272, 664)\nstep 6: CLICK: (492, 671)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (815, 671)\nC: CLICK: (929, 916)\nD: CLICK: (876, 484)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_185_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_185_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_185_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_185_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_185_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_185_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_185_6.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (116, 249)\nB: CLICK: (848, 916)\nC: CLICK: (487, 641)\nD: PRESS_HOME\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (429, 723)\nstep 2: CLICK: (160, 461)\nstep 3: CLICK: (502, 807)\nstep 4: CLICK: (941, 129)\nstep 5: TYPE: TickTick\nstep 6: CLICK: (488, 282)\nI want to Toggle the notifications for any application on your phone using the 'Setting' app, and then open 'TickTick'. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Toggle the notifications for any application on your phone using the 'Setting' app, and then open 'TickTick'.\nThe historical actions are: step 1: CLICK: (429, 723)\nstep 2: CLICK: (160, 461)\nstep 3: CLICK: (502, 807)\nstep 4: CLICK: (941, 129)\nstep 5: TYPE: TickTick\nstep 6: CLICK: (488, 282)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (116, 249)\nB: CLICK: (848, 916)\nC: CLICK: (487, 641)\nD: PRESS_HOME\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_186_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_186_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_186_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_186_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_186_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_186_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_186_6.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (169, 649)\nB: CLICK: (972, 582)\nC: TYPE: Los Angeles\nD: SCROLL: UP\n", "question": "The last image represents the 
current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (130, 699)\nstep 2: CLICK: (305, 109)\nstep 3: TYPE: new hiking trail\nstep 4: CLICK: (880, 693)\nstep 5: SCROLL: UP\nstep 6: CLICK: (429, 158)\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (423, 546)\nI want to Use Chrome to search for a new hiking trail, then consult Windy.com-Weather Forecast to check the weekend weather. Finally, hop onto Tumblr to invite katsunaksu to join. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Chrome to search for a new hiking trail, then consult Windy.com-Weather Forecast to check the weekend weather. Finally, hop onto Tumblr to invite katsunaksu to join.\nThe historical actions are: step 1: CLICK: (130, 699)\nstep 2: CLICK: (305, 109)\nstep 3: TYPE: new hiking trail\nstep 4: CLICK: (880, 693)\nstep 5: SCROLL: UP\nstep 6: CLICK: (429, 158)\nstep 7: SCROLL: UP\nstep 8: PRESS_HOME\nstep 9: CLICK: (423, 546)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (169, 649)\nB: CLICK: (972, 582)\nC: TYPE: Los Angeles\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_187_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_187_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_187_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_187_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_187_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_187_5.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_187_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_187_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_187_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_187_9.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (46, 499)\nB: CLICK: (955, 53)\nC: CLICK: (450, 927)\nD: CLICK: (474, 582)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (666, 833)\nstep 2: CLICK: (301, 279)\nstep 3: TYPE: The Age of Exploration\nstep 4: CLICK: (270, 139)\nI want to Research a historical event from The Age of Exploration using Chrome, and then read or listen to a related book on Amazon Kindle. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Research a historical event from The Age of Exploration using Chrome, and then read or listen to a related book on Amazon Kindle.\nThe historical actions are: step 1: CLICK: (666, 833)\nstep 2: CLICK: (301, 279)\nstep 3: TYPE: The Age of Exploration\nstep 4: CLICK: (270, 139)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (46, 499)\nB: CLICK: (955, 53)\nC: CLICK: (450, 927)\nD: CLICK: (474, 582)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_188_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_188_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_188_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_188_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_188_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: COMPLETE\nB: PRESS_HOME\nC: CLICK: (547, 605)\nD: TYPE: vintage\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (858, 818)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (877, 890)\nstep 4: PRESS_HOME\nstep 5: CLICK: (374, 806)\nstep 6: PRESS_HOME\nstep 7: CLICK: (858, 818)\nstep 8: CLICK: (802, 74)\nstep 9: CLICK: (919, 74)\nstep 10: TYPE: triller\nstep 11: CLICK: (895, 901)\nstep 12: CLICK: (838, 349)\nI want to Begin by uninstalling the TikTok app using the Google Play Store. Once done, navigate to Settings to verify that TikTok has been successfully uninstalled. After confirmation, download a similar app such as Triller from the Google Play Store, and then proceed to open the Triller app. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Begin by uninstalling the TikTok app using the Google Play Store. Once done, navigate to Settings to verify that TikTok has been successfully uninstalled. After confirmation, download a similar app such as Triller from the Google Play Store, and then proceed to open the Triller app.\nThe historical actions are: step 1: CLICK: (858, 818)\nstep 2: TYPE: tiktok\nstep 3: CLICK: (877, 890)\nstep 4: PRESS_HOME\nstep 5: CLICK: (374, 806)\nstep 6: PRESS_HOME\nstep 7: CLICK: (858, 818)\nstep 8: CLICK: (802, 74)\nstep 9: CLICK: (919, 74)\nstep 10: TYPE: triller\nstep 11: CLICK: (895, 901)\nstep 12: CLICK: (838, 349)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: PRESS_HOME\nC: CLICK: (547, 605)\nD: TYPE: vintage\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_189_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_189_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_189_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_189_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_189_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_189_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_189_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_189_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_189_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_189_9.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_189_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_189_11.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_189_12.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: COMPLETE\nB: TYPE: 2001 Colony St\nC: CLICK: (684, 542)\nD: CLICK: (496, 328)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (722, 109)\nstep 2: CLICK: (254, 727)\nstep 3: PRESS_HOME\nstep 4: CLICK: (565, 261)\nstep 5: CLICK: (355, 599)\nI want to Use Redfin Houses for Sale & Rent to find a rental property in your city. Then, utilize Uber to determine the driving distance from your current location to the property. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Redfin Houses for Sale & Rent to find a rental property in your city. 
Then, utilize Uber to determine the driving distance from your current location to the property.\nThe historical actions are: step 1: CLICK: (722, 109)\nstep 2: CLICK: (254, 727)\nstep 3: PRESS_HOME\nstep 4: CLICK: (565, 261)\nstep 5: CLICK: (355, 599)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: COMPLETE\nB: TYPE: 2001 Colony St\nC: CLICK: (684, 542)\nD: CLICK: (496, 328)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_190_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_190_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_190_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_190_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_190_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_190_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: TYPE: shopping list for making sushi\nB: TYPE: Costco\nC: TYPE: when is the next Rolling Stones concert tour\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (294, 257)\nstep 2: CLICK: (261, 652)\nstep 3: TYPE: shopping mall\nstep 4: CLICK: (898, 683)\nstep 5: CLICK: (332, 278)\nstep 6: PRESS_HOME\nstep 7: CLICK: (576, 423)\nstep 8: CLICK: (437, 679)\nI want to Use Waze Navigation & Live Traffic to locate a nearby shopping mall, then book a ride with Lyft. 
Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Use Waze Navigation & Live Traffic to locate a nearby shopping mall, then book a ride with Lyft.\nThe historical actions are: step 1: CLICK: (294, 257)\nstep 2: CLICK: (261, 652)\nstep 3: TYPE: shopping mall\nstep 4: CLICK: (898, 683)\nstep 5: CLICK: (332, 278)\nstep 6: PRESS_HOME\nstep 7: CLICK: (576, 423)\nstep 8: CLICK: (437, 679)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: TYPE: shopping list for making sushi\nB: TYPE: Costco\nC: TYPE: when is the next Rolling Stones concert tour\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_191_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_191_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_191_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_191_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_191_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_191_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_191_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_191_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_191_8.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: TYPE: Hong Kong\nC: PRESS_HOME\nD: TYPE: subaru most popular car\n", "question": "The last image represents the current screenshot and the preceding images are 
historical screenshots. The historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: RIGHT\nstep 3: CLICK: (166, 199)\nstep 4: CLICK: (297, 82)\nstep 5: CLICK: (924, 76)\nI want to Utilize AutoScout24: Buy & sell cars, and DuckDuckGo to identify the most popular Subaru car product and verify its price within the car app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Utilize AutoScout24: Buy & sell cars, and DuckDuckGo to identify the most popular Subaru car product and verify its price within the car app.\nThe historical actions are: step 1: PRESS_HOME\nstep 2: SCROLL: RIGHT\nstep 3: CLICK: (166, 199)\nstep 4: CLICK: (297, 82)\nstep 5: CLICK: (924, 76)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: TYPE: Hong Kong\nC: PRESS_HOME\nD: TYPE: subaru most popular car\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_192_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_192_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_192_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_192_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_192_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_192_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: CLICK: (301, 123)\nC: SCROLL: RIGHT\nD: COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (322, 132)\nstep 2: CLICK: (272, 590)\nstep 3: CLICK: (501, 232)\nstep 4: TYPE: Business\nstep 5: CLICK: (744, 63)\nstep 6: CLICK: (593, 605)\nstep 7: PRESS_HOME\nI want to Organize a business meeting with Gentsdgoi Setgss via ZOOM Cloud Meetings, send them an invitation through Messenger, and set an alarm clock for the meeting using the Clock app. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Organize a business meeting with Gentsdgoi Setgss via ZOOM Cloud Meetings, send them an invitation through Messenger, and set an alarm clock for the meeting using the Clock app.\nThe historical actions are: step 1: CLICK: (322, 132)\nstep 2: CLICK: (272, 590)\nstep 3: CLICK: (501, 232)\nstep 4: TYPE: Business\nstep 5: CLICK: (744, 63)\nstep 6: CLICK: (593, 605)\nstep 7: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (301, 123)\nC: SCROLL: RIGHT\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_193_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_193_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_193_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_193_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_193_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_193_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_193_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_193_7.png"], "output": "B", "qwen3-vl": "image 
none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: CLICK: (336, 404)\nC: CLICK: (413, 516)\nD: CLICK: (870, 386)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (127, 647)\nstep 2: CLICK: (501, 383)\nstep 3: CLICK: (593, 561)\nstep 4: PRESS_HOME\nI want to Engage in a Spanish language lesson using Duolingo and establish a learning plan with TickTick. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Engage in a Spanish language lesson using Duolingo and establish a learning plan with TickTick.\nThe historical actions are: step 1: CLICK: (127, 647)\nstep 2: CLICK: (501, 383)\nstep 3: CLICK: (593, 561)\nstep 4: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: CLICK: (336, 404)\nC: CLICK: (413, 516)\nD: CLICK: (870, 386)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_194_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_194_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_194_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_194_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_194_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: UP\nB: CLICK: (203, 290)\nC: COMPLETE\nD: CLICK: (454, 937)\n", "question": "The last image represents the current 
screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (632, 388)\nstep 2: CLICK: (780, 78)\nstep 3: CLICK: (302, 878)\nstep 4: CLICK: (331, 808)\nstep 5: PRESS_HOME\nstep 6: CLICK: (841, 112)\nstep 7: CLICK: (542, 411)\nstep 8: CLICK: (566, 941)\nstep 9: TYPE: https://teams.live.com/meet/9374833361011?p=xEkF25dJMsD4Fheq\nstep 10: CLICK: (916, 596)\nI want to Set up an online meeting using Microsoft Teams and share the meeting link with Victor James through Messenger. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Set up an online meeting using Microsoft Teams and share the meeting link with Victor James through Messenger.\nThe historical actions are: step 1: CLICK: (632, 388)\nstep 2: CLICK: (780, 78)\nstep 3: CLICK: (302, 878)\nstep 4: CLICK: (331, 808)\nstep 5: PRESS_HOME\nstep 6: CLICK: (841, 112)\nstep 7: CLICK: (542, 411)\nstep 8: CLICK: (566, 941)\nstep 9: TYPE: https://teams.live.com/meet/9374833361011?p=xEkF25dJMsD4Fheq\nstep 10: CLICK: (916, 596)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: UP\nB: CLICK: (203, 290)\nC: COMPLETE\nD: CLICK: (454, 937)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_195_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_195_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_195_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_195_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_195_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_195_5.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_195_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_195_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_195_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_195_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_195_10.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: SCROLL: RIGHT\nB: COMPLETE\nC: TYPE: 'Becoming' by Michelle Obama\nD: CLICK: (902, 922)\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (596, 278)\nstep 2: CLICK: (52, 73)\nstep 3: CLICK: (18, 68)\nstep 4: CLICK: (866, 72)\nstep 5: CLICK: (491, 76)\nstep 6: TYPE: how to learn to code in Python\nstep 7: CLICK: (581, 165)\nstep 8: CLICK: (886, 184)\nstep 9: CLICK: (462, 683)\nstep 10: CLICK: (308, 474)\nstep 11: PRESS_HOME\nI want to Conduct research on how to learn Python coding using Quora for information, then create a reminder in TickTick to schedule a time to start the tutorial on the website. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. 
The task is: Conduct research on how to learn Python coding using Quora for information, then create a reminder in TickTick to schedule a time to start the tutorial on the website.\nThe historical actions are: step 1: CLICK: (596, 278)\nstep 2: CLICK: (52, 73)\nstep 3: CLICK: (18, 68)\nstep 4: CLICK: (866, 72)\nstep 5: CLICK: (491, 76)\nstep 6: TYPE: how to learn to code in Python\nstep 7: CLICK: (581, 165)\nstep 8: CLICK: (886, 184)\nstep 9: CLICK: (462, 683)\nstep 10: CLICK: (308, 474)\nstep 11: PRESS_HOME\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: RIGHT\nB: COMPLETE\nC: TYPE: 'Becoming' by Michelle Obama\nD: CLICK: (902, 922)\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_196_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_196_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_196_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_196_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_196_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_196_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_196_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_196_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_196_8.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_196_9.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_196_10.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_196_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (883, 692)\nB: COMPLETE\nC: SCROLL: UP\nD: PRESS_HOME\n", 
"question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (425, 401)\nstep 2: CLICK: (390, 891)\nstep 3: TYPE: tell me about Fundamental theorem of calculus\nstep 4: CLICK: (689, 419)\nstep 5: PRESS_HOME\nstep 6: CLICK: (284, 391)\nstep 7: TYPE: Fundamental theorem of calculus\nI want to Consult the AI-related app 'Chatbot AI & Smart Assistant' to inquire about the Fundamental Theorem of Calculus, and then use the 'DuckDuckGo' browser to verify the information. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Consult the AI-related app 'Chatbot AI & Smart Assistant' to inquire about the Fundamental Theorem of Calculus, and then use the 'DuckDuckGo' browser to verify the information.\nThe historical actions are: step 1: CLICK: (425, 401)\nstep 2: CLICK: (390, 891)\nstep 3: TYPE: tell me about Fundamental theorem of calculus\nstep 4: CLICK: (689, 419)\nstep 5: PRESS_HOME\nstep 6: CLICK: (284, 391)\nstep 7: TYPE: Fundamental theorem of calculus\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (883, 692)\nB: COMPLETE\nC: SCROLL: UP\nD: PRESS_HOME\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_197_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_197_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_197_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_197_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_197_4.png", 
"./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_197_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_197_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_197_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: PRESS_HOME\nB: TYPE: Anemone\nC: COMPLETE\nD: SCROLL: UP\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (687, 220)\nstep 2: CLICK: (245, 727)\nstep 3: CLICK: (531, 724)\nstep 4: CLICK: (752, 717)\nstep 5: CLICK: (263, 821)\nstep 6: CLICK: (512, 822)\nstep 7: CLICK: (752, 825)\nstep 8: SCROLL: UP\nI want to Firstly, utilize 'Applock Pro - APP Lock & Guard' to secure 'Google Wallet'. After setting the lock, proceed to open 'Google Wallet' to verify the security settings using the PIN code 123456. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Firstly, utilize 'Applock Pro - APP Lock & Guard' to secure 'Google Wallet'. 
After setting the lock, proceed to open 'Google Wallet' to verify the security settings using the PIN code 123456.\nThe historical actions are: step 1: CLICK: (687, 220)\nstep 2: CLICK: (245, 727)\nstep 3: CLICK: (531, 724)\nstep 4: CLICK: (752, 717)\nstep 5: CLICK: (263, 821)\nstep 6: CLICK: (512, 822)\nstep 7: CLICK: (752, 825)\nstep 8: SCROLL: UP\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: PRESS_HOME\nB: TYPE: Anemone\nC: COMPLETE\nD: SCROLL: UP\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_198_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_198_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_198_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_198_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_198_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_198_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_198_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_198_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_198_8.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "gui_next_action_prediction", "visual_input_component": "GUI image", "source": "GUI-Odyssey", "options": "A: CLICK: (925, 919)\nB: CLICK: (351, 134)\nC: PRESS_HOME\nD: COMPLETE\n", "question": "The last image represents the current screenshot and the preceding images are historical screenshots. 
The historical actions are: step 1: CLICK: (133, 212)\nstep 2: CLICK: (863, 881)\nstep 3: CLICK: (316, 453)\nstep 4: TYPE: do yoga in the morning\nstep 5: CLICK: (502, 331)\nstep 6: PRESS_HOME\nstep 7: CLICK: (136, 360)\nstep 8: CLICK: (224, 371)\nI want to Locate a beginner Yoga workout video on Likee and set a reminder in Things to do it tomorrow morning. Which action should I do next?", "context": "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Locate a beginner Yoga workout video on Likee and set a reminder in Things to do it tomorrow morning.\nThe historical actions are: step 1: CLICK: (133, 212)\nstep 2: CLICK: (863, 881)\nstep 3: CLICK: (316, 453)\nstep 4: TYPE: do yoga in the morning\nstep 5: CLICK: (502, 331)\nstep 6: PRESS_HOME\nstep 7: CLICK: (136, 360)\nstep 8: CLICK: (224, 371)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: CLICK: (925, 919)\nB: CLICK: (351, 134)\nC: PRESS_HOME\nD: COMPLETE\n", "input_image_path": ["./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_199_0.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_199_1.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_199_2.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_199_3.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_199_4.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_199_5.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_199_6.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_199_7.png", "./Discrete-temporal/gui_next_action_prediction/gui_next_action_prediction_199_8.png"], "output": "D", 
"qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/handwritten_retrieval/qwen3-vl/metadata_info.json b/results/handwritten_retrieval/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..a950aaa --- /dev/null +++ b/results/handwritten_retrieval/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_0_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_0_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_0_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_0_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_0_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_1_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_1_1.png", 
"./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_1_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_1_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_1_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_2_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_2_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_2_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_2_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_2_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_3_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_3_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_3_2.png", 
"./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_3_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_3_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_4_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_4_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_4_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_4_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_4_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_5_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_5_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_5_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_5_3.png", 
"./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_5_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_6_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_6_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_6_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_6_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_6_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_7_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_7_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_7_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_7_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_7_4.png"], 
"output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_8_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_8_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_8_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_8_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_8_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_9_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_9_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_9_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_9_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_9_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", 
"visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_10_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_10_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_10_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_10_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_10_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_11_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_11_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_11_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_11_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_11_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": 
"iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_12_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_12_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_12_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_12_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_12_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_13_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_13_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_13_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_13_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_13_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third 
image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_14_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_14_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_14_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_14_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_14_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_15_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_15_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_15_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_15_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_15_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please 
retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_16_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_16_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_16_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_16_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_16_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_17_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_17_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_17_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_17_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_17_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the 
candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_18_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_18_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_18_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_18_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_18_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_19_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_19_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_19_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_19_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_19_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the 
following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_20_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_20_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_20_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_20_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_20_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_21_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_21_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_21_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_21_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_21_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth 
image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_22_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_22_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_22_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_22_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_22_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_23_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_23_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_23_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_23_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_23_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_24_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_24_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_24_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_24_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_24_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_25_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_25_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_25_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_25_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_25_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_26_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_26_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_26_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_26_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_26_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_27_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_27_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_27_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_27_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_27_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_28_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_28_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_28_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_28_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_28_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_29_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_29_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_29_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_29_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_29_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_30_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_30_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_30_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_30_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_30_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_31_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_31_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_31_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_31_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_31_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_32_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_32_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_32_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_32_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_32_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_33_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_33_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_33_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_33_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_33_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_34_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_34_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_34_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_34_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_34_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_35_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_35_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_35_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_35_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_35_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_36_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_36_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_36_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_36_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_36_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_37_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_37_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_37_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_37_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_37_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_38_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_38_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_38_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_38_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_38_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_39_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_39_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_39_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_39_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_39_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_40_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_40_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_40_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_40_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_40_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_41_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_41_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_41_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_41_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_41_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_42_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_42_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_42_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_42_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_42_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_43_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_43_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_43_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_43_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_43_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_44_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_44_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_44_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_44_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_44_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_45_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_45_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_45_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_45_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_45_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_46_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_46_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_46_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_46_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_46_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_47_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_47_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_47_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_47_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_47_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_48_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_48_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_48_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_48_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_48_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_49_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_49_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_49_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_49_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_49_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_50_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_50_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_50_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_50_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_50_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_51_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_51_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_51_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_51_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_51_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_52_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_52_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_52_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_52_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_52_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_53_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_53_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_53_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_53_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_53_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_54_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_54_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_54_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_54_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_54_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_55_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_55_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_55_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_55_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_55_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_56_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_56_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_56_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_56_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_56_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_57_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_57_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_57_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_57_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_57_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_58_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_58_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_58_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_58_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_58_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_59_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_59_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_59_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_59_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_59_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_60_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_60_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_60_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_60_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_60_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_61_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_61_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_61_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_61_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_61_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_62_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_62_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_62_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_62_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_62_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_63_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_63_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_63_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_63_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_63_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_64_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_64_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_64_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_64_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_64_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_65_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_65_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_65_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_65_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_65_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_66_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_66_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_66_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_66_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_66_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_67_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_67_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_67_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_67_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_67_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_68_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_68_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_68_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_68_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_68_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_69_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_69_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_69_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_69_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_69_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_70_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_70_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_70_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_70_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_70_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_71_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_71_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_71_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_71_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_71_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_72_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_72_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_72_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_72_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_72_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_73_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_73_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_73_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_73_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_73_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_74_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_74_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_74_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_74_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_74_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_75_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_75_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_75_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_75_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_75_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_76_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_76_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_76_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_76_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_76_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_77_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_77_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_77_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_77_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_77_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_78_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_78_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_78_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_78_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_78_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_79_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_79_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_79_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_79_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_79_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_80_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_80_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_80_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_80_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_80_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_81_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_81_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_81_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_81_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_81_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_82_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_82_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_82_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_82_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_82_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_83_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_83_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_83_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_83_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_83_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_84_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_84_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_84_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_84_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_84_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_85_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_85_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_85_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_85_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_85_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_86_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_86_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_86_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_86_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_86_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_87_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_87_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_87_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_87_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_87_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_88_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_88_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_88_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_88_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_88_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_89_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_89_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_89_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_89_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_89_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_90_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_90_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_90_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_90_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_90_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_91_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_91_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_91_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_91_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_91_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_92_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_92_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_92_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_92_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_92_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_93_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_93_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_93_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_93_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_93_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_94_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_94_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_94_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_94_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_94_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_95_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_95_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_95_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_95_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_95_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_96_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_96_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_96_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_96_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_96_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_97_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_97_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_97_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_97_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_97_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_98_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_98_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_98_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_98_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_98_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_99_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_99_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_99_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_99_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_99_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_100_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_100_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_100_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_100_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_100_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_101_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_101_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_101_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_101_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_101_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_102_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_102_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_102_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_102_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_102_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_103_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_103_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_103_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_103_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_103_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_104_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_104_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_104_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_104_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_104_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_105_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_105_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_105_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_105_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_105_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_106_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_106_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_106_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_106_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_106_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_107_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_107_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_107_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_107_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_107_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_108_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_108_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_108_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_108_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_108_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_109_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_109_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_109_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_109_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_109_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_110_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_110_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_110_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_110_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_110_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_111_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_111_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_111_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_111_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_111_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_112_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_112_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_112_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_112_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_112_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_113_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_113_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_113_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_113_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_113_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_114_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_114_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_114_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_114_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_114_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_115_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_115_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_115_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_115_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_115_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_116_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_116_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_116_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_116_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_116_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_117_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_117_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_117_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_117_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_117_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_118_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_118_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_118_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_118_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_118_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_119_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_119_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_119_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_119_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_119_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_120_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_120_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_120_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_120_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_120_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_121_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_121_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_121_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_121_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_121_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_122_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_122_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_122_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_122_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_122_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_123_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_123_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_123_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_123_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_123_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_124_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_124_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_124_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_124_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_124_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_125_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_125_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_125_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_125_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_125_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_126_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_126_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_126_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_126_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_126_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_127_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_127_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_127_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_127_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_127_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_128_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_128_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_128_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_128_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_128_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_129_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_129_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_129_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_129_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_129_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_130_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_130_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_130_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_130_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_130_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_131_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_131_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_131_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_131_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_131_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_132_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_132_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_132_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_132_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_132_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_133_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_133_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_133_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_133_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_133_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_134_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_134_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_134_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_134_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_134_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_135_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_135_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_135_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_135_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_135_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_136_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_136_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_136_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_136_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_136_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_137_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_137_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_137_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_137_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_137_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_138_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_138_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_138_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_138_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_138_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_139_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_139_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_139_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_139_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_139_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_140_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_140_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_140_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_140_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_140_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_141_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_141_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_141_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_141_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_141_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_142_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_142_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_142_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_142_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_142_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_143_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_143_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_143_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_143_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_143_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_144_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_144_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_144_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_144_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_144_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_145_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_145_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_145_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_145_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_145_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_146_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_146_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_146_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_146_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_146_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_147_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_147_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_147_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_147_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_147_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_148_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_148_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_148_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_148_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_148_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_149_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_149_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_149_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_149_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_149_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_150_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_150_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_150_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_150_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_150_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_151_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_151_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_151_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_151_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_151_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_152_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_152_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_152_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_152_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_152_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_153_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_153_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_153_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_153_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_153_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_154_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_154_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_154_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_154_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_154_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_155_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_155_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_155_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_155_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_155_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_156_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_156_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_156_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_156_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_156_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_157_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_157_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_157_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_157_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_157_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_158_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_158_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_158_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_158_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_158_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_159_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_159_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_159_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_159_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_159_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_160_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_160_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_160_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_160_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_160_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_161_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_161_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_161_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_161_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_161_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_162_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_162_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_162_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_162_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_162_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_163_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_163_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_163_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_163_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_163_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_164_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_164_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_164_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_164_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_164_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_165_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_165_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_165_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_165_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_165_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_166_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_166_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_166_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_166_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_166_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_167_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_167_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_167_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_167_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_167_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_168_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_168_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_168_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_168_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_168_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_169_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_169_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_169_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_169_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_169_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_170_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_170_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_170_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_170_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_170_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_171_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_171_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_171_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_171_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_171_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_172_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_172_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_172_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_172_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_172_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_173_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_173_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_173_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_173_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_173_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_174_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_174_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_174_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_174_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_174_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_175_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_175_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_175_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_175_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_175_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_176_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_176_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_176_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_176_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_176_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_177_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_177_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_177_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_177_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_177_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_178_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_178_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_178_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_178_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_178_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_179_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_179_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_179_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_179_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_179_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_180_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_180_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_180_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_180_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_180_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_181_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_181_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_181_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_181_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_181_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_182_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_182_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_182_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_182_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_182_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_183_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_183_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_183_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_183_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_183_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_184_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_184_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_184_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_184_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_184_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_185_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_185_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_185_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_185_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_185_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_186_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_186_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_186_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_186_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_186_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_187_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_187_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_187_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_187_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_187_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_188_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_188_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_188_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_188_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_188_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_189_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_189_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_189_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_189_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_189_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_190_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_190_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_190_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_190_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_190_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_191_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_191_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_191_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_191_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_191_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_192_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_192_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_192_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_192_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_192_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_193_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_193_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_193_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_193_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_193_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_194_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_194_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_194_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_194_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_194_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_195_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_195_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_195_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_195_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_195_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_196_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_196_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_196_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_196_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_196_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_197_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_197_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_197_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_197_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_197_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_198_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_198_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_198_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_198_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_198_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "handwritten_retrieval", "visual_input_component": "['text-rich_image']", "source": "iam_handwritten_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar handwritten text snapshot to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_199_0.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_199_1.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_199_2.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_199_3.png", "./High-level-obj-semantic/handwritten_retrieval/handwritten_retrieval_199_4.png"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/image2image_retrieval/qwen3-vl/metadata_info.json b/results/image2image_retrieval/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..25d7608 --- /dev/null +++ b/results/image2image_retrieval/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in 
the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_0_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_0_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_0_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_0_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_0_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_1_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_1_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_1_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_1_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_1_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_2_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_2_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_2_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_2_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_2_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_3_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_3_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_3_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_3_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_3_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_4_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_4_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_4_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_4_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_4_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_5_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_5_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_5_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_5_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_5_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_6_0.JPEG", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_6_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_6_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_6_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_6_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_7_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_7_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_7_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_7_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_7_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_8_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_8_1.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_8_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_8_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_8_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_9_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_9_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_9_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_9_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_9_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_10_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_10_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_10_2.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_10_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_10_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_11_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_11_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_11_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_11_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_11_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_12_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_12_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_12_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_12_3.JPEG", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_12_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_13_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_13_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_13_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_13_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_13_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_14_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_14_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_14_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_14_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_14_4.jpg"], "output": "D", "qwen3-vl": "image 
none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_15_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_15_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_15_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_15_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_15_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_16_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_16_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_16_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_16_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_16_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": 
"tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_17_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_17_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_17_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_17_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_17_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_18_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_18_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_18_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_18_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_18_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth 
image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_19_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_19_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_19_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_19_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_19_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_20_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_20_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_20_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_20_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_20_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first 
image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_21_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_21_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_21_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_21_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_21_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_22_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_22_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_22_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_22_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_22_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: 
The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_23_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_23_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_23_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_23_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_23_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_24_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_24_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_24_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_24_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_24_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_25_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_25_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_25_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_25_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_25_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_26_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_26_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_26_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_26_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_26_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_27_0.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_27_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_27_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_27_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_27_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_28_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_28_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_28_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_28_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_28_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_29_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_29_1.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_29_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_29_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_29_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_30_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_30_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_30_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_30_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_30_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_31_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_31_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_31_2.JPEG", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_31_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_31_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_32_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_32_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_32_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_32_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_32_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_33_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_33_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_33_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_33_3.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_33_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_34_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_34_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_34_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_34_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_34_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_35_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_35_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_35_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_35_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_35_4.jpg"], "output": "C", "qwen3-vl": "image 
none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_36_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_36_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_36_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_36_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_36_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_37_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_37_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_37_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_37_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_37_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": 
"places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_38_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_38_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_38_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_38_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_38_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_39_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_39_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_39_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_39_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_39_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", 
"question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_40_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_40_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_40_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_40_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_40_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_41_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_41_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_41_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_41_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_41_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", 
"context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_42_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_42_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_42_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_42_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_42_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_43_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_43_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_43_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_43_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_43_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth 
image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_44_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_44_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_44_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_44_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_44_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_45_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_45_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_45_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_45_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_45_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_46_0.JPEG", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_46_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_46_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_46_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_46_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_47_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_47_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_47_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_47_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_47_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_48_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_48_1.JPEG", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_48_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_48_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_48_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_49_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_49_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_49_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_49_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_49_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_50_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_50_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_50_2.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_50_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_50_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_51_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_51_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_51_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_51_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_51_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_52_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_52_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_52_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_52_3.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_52_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_53_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_53_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_53_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_53_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_53_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_54_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_54_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_54_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_54_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_54_4.JPEG"], "output": "D", "qwen3-vl": 
"image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_55_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_55_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_55_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_55_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_55_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_56_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_56_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_56_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_56_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_56_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": 
"tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_57_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_57_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_57_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_57_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_57_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_58_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_58_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_58_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_58_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_58_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth 
image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_59_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_59_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_59_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_59_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_59_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_60_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_60_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_60_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_60_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_60_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first 
image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_61_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_61_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_61_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_61_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_61_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_62_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_62_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_62_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_62_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_62_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: 
The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_63_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_63_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_63_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_63_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_63_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_64_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_64_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_64_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_64_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_64_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_65_0.JPEG", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_65_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_65_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_65_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_65_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_66_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_66_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_66_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_66_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_66_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_67_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_67_1.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_67_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_67_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_67_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_68_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_68_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_68_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_68_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_68_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_69_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_69_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_69_2.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_69_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_69_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_70_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_70_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_70_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_70_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_70_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_71_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_71_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_71_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_71_3.JPEG", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_71_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_72_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_72_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_72_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_72_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_72_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_73_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_73_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_73_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_73_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_73_4.jpg"], "output": "A", "qwen3-vl": "image 
none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_74_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_74_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_74_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_74_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_74_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_75_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_75_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_75_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_75_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_75_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", 
"options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_76_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_76_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_76_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_76_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_76_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_77_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_77_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_77_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_77_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_77_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the 
most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_78_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_78_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_78_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_78_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_78_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_79_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_79_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_79_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_79_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_79_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the 
following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_80_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_80_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_80_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_80_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_80_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_81_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_81_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_81_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_81_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_81_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", 
"input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_82_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_82_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_82_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_82_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_82_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_83_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_83_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_83_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_83_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_83_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_84_0.JPEG", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_84_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_84_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_84_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_84_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_85_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_85_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_85_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_85_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_85_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_86_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_86_1.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_86_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_86_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_86_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_87_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_87_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_87_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_87_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_87_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_88_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_88_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_88_2.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_88_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_88_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_89_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_89_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_89_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_89_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_89_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_90_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_90_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_90_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_90_3.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_90_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_91_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_91_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_91_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_91_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_91_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_92_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_92_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_92_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_92_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_92_4.JPEG"], "output": "B", "qwen3-vl": 
"image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_93_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_93_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_93_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_93_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_93_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_94_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_94_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_94_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_94_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_94_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": 
"places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_95_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_95_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_95_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_95_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_95_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_96_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_96_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_96_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_96_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_96_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", 
"question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_97_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_97_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_97_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_97_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_97_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_98_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_98_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_98_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_98_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_98_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_99_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_99_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_99_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_99_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_99_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_100_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_100_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_100_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_100_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_100_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", 
"input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_101_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_101_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_101_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_101_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_101_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_102_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_102_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_102_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_102_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_102_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_103_0.JPEG", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_103_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_103_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_103_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_103_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_104_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_104_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_104_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_104_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_104_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_105_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_105_1.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_105_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_105_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_105_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_106_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_106_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_106_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_106_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_106_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_107_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_107_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_107_2.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_107_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_107_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_108_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_108_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_108_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_108_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_108_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_109_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_109_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_109_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_109_3.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_109_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_110_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_110_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_110_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_110_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_110_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_111_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_111_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_111_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_111_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_111_4.JPEG"], "output": "D", 
"qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_112_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_112_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_112_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_112_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_112_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_113_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_113_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_113_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_113_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_113_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", 
"source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_114_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_114_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_114_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_114_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_114_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_115_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_115_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_115_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_115_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_115_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth 
image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_116_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_116_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_116_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_116_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_116_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_117_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_117_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_117_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_117_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_117_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The 
query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_118_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_118_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_118_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_118_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_118_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_119_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_119_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_119_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_119_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_119_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_120_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_120_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_120_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_120_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_120_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_121_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_121_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_121_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_121_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_121_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_122_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_122_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_122_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_122_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_122_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_123_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_123_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_123_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_123_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_123_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_124_0.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_124_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_124_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_124_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_124_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_125_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_125_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_125_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_125_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_125_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_126_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_126_1.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_126_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_126_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_126_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_127_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_127_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_127_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_127_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_127_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_128_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_128_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_128_2.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_128_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_128_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_129_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_129_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_129_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_129_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_129_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_130_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_130_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_130_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_130_3.JPEG", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_130_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_131_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_131_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_131_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_131_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_131_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_132_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_132_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_132_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_132_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_132_4.jpg"], "output": "A", "qwen3-vl": 
"image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_133_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_133_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_133_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_133_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_133_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_134_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_134_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_134_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_134_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_134_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": 
"places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_135_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_135_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_135_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_135_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_135_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_136_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_136_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_136_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_136_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_136_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", 
"question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_137_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_137_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_137_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_137_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_137_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_138_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_138_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_138_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_138_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_138_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first 
image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_139_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_139_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_139_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_139_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_139_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_140_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_140_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_140_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_140_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_140_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The 
fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_141_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_141_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_141_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_141_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_141_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_142_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_142_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_142_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_142_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_142_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_143_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_143_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_143_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_143_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_143_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_144_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_144_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_144_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_144_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_144_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_145_0.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_145_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_145_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_145_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_145_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_146_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_146_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_146_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_146_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_146_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_147_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_147_1.JPEG", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_147_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_147_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_147_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_148_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_148_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_148_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_148_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_148_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_149_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_149_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_149_2.JPEG", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_149_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_149_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_150_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_150_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_150_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_150_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_150_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_151_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_151_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_151_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_151_3.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_151_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_152_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_152_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_152_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_152_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_152_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_153_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_153_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_153_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_153_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_153_4.JPEG"], "output": "C", 
"qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_154_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_154_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_154_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_154_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_154_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_155_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_155_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_155_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_155_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_155_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", 
"source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_156_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_156_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_156_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_156_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_156_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_157_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_157_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_157_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_157_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_157_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The 
fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_158_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_158_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_158_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_158_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_158_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_159_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_159_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_159_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_159_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_159_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the 
first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_160_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_160_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_160_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_160_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_160_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_161_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_161_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_161_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_161_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_161_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The 
fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_162_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_162_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_162_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_162_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_162_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_163_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_163_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_163_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_163_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_163_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_164_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_164_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_164_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_164_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_164_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_165_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_165_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_165_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_165_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_165_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_166_0.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_166_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_166_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_166_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_166_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_167_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_167_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_167_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_167_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_167_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_168_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_168_1.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_168_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_168_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_168_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_169_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_169_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_169_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_169_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_169_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_170_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_170_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_170_2.JPEG", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_170_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_170_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_171_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_171_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_171_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_171_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_171_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_172_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_172_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_172_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_172_3.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_172_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_173_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_173_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_173_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_173_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_173_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_174_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_174_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_174_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_174_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_174_4.JPEG"], "output": "B", 
"qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_175_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_175_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_175_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_175_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_175_4.JPEG"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_176_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_176_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_176_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_176_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_176_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", 
"source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_177_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_177_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_177_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_177_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_177_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_178_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_178_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_178_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_178_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_178_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth 
image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_179_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_179_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_179_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_179_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_179_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_180_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_180_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_180_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_180_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_180_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is 
the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_181_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_181_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_181_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_181_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_181_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_182_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_182_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_182_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_182_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_182_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: 
The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_183_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_183_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_183_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_183_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_183_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_184_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_184_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_184_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_184_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_184_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_185_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_185_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_185_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_185_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_185_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_186_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_186_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_186_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_186_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_186_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_187_0.JPEG", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_187_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_187_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_187_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_187_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_188_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_188_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_188_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_188_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_188_4.JPEG"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_189_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_189_1.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_189_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_189_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_189_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_190_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_190_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_190_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_190_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_190_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_191_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_191_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_191_2.JPEG", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_191_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_191_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_192_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_192_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_192_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_192_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_192_4.JPEG"], "output": "C", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_193_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_193_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_193_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_193_3.jpg", 
"./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_193_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_194_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_194_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_194_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_194_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_194_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_195_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_195_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_195_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_195_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_195_4.JPEG"], "output": "C", "qwen3-vl": 
"image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_196_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_196_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_196_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_196_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_196_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_197_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_197_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_197_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_197_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_197_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": 
"tinyimagenet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_198_0.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_198_1.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_198_2.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_198_3.JPEG", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_198_4.JPEG"], "output": "A", "qwen3-vl": "image none"}, {"task": "image2image_retrieval", "visual_input_component": "['natural_image']", "source": "places365_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar scene to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_199_0.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_199_1.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_199_2.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_199_3.jpg", "./High-level-obj-semantic/image2image_retrieval/image2image_retrieval_199_4.jpg"], "output": "D", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/jigsaw_puzzle_solving/qwen3-vl/metadata_info.json b/results/jigsaw_puzzle_solving/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..0b3f8d9 
--- /dev/null +++ b/results/jigsaw_puzzle_solving/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 1, 4, 2]\nB: [2, 3, 4, 1]\nC: [2, 1, 3, 4]\nD: [4, 2, 1, 3]", "question": "The patches in the middle of the image might be disordered. Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [2, 3, 4, 1]\nC: [2, 1, 3, 4]\nD: [4, 2, 1, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_0_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_0_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_0_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_0_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_0_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 3, 4, 1]\nB: [3, 1, 2, 4]\nC: [4, 3, 1, 2]\nD: [2, 4, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [3, 1, 2, 4]\nC: [4, 3, 1, 2]\nD: [2, 4, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_1_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_1_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_1_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_1_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_1_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 4, 1, 2]\nB: [3, 4, 2, 1]\nC: [4, 3, 2, 1]\nD: [1, 2, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [3, 4, 2, 1]\nC: [4, 3, 2, 1]\nD: [1, 2, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_2_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_2_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_2_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_2_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_2_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 4, 1, 2]\nB: [4, 1, 3, 2]\nC: [1, 3, 4, 2]\nD: [3, 2, 4, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [4, 1, 3, 2]\nC: [1, 3, 4, 2]\nD: [3, 2, 4, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_3_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_3_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_3_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_3_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_3_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 1, 4, 3]\nB: [3, 1, 2, 4]\nC: [4, 3, 1, 2]\nD: [1, 4, 3, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [3, 1, 2, 4]\nC: [4, 3, 1, 2]\nD: [1, 4, 3, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_4_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_4_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_4_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_4_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_4_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 2, 4, 1]\nB: [3, 4, 2, 1]\nC: [1, 3, 4, 2]\nD: [3, 1, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [3, 4, 2, 1]\nC: [1, 3, 4, 2]\nD: [3, 1, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_5_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_5_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_5_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_5_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_5_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 2, 4, 1]\nB: [3, 4, 1, 2]\nC: [2, 1, 3, 4]\nD: [4, 2, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [3, 4, 1, 2]\nC: [2, 1, 3, 4]\nD: [4, 2, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_6_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_6_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_6_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_6_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_6_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 1, 3, 2]\nB: [4, 1, 2, 3]\nC: [2, 3, 4, 1]\nD: [2, 4, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [4, 1, 2, 3]\nC: [2, 3, 4, 1]\nD: [2, 4, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_7_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_7_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_7_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_7_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_7_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 4, 3, 1]\nB: [2, 3, 4, 1]\nC: [1, 3, 4, 2]\nD: [4, 3, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [2, 3, 4, 1]\nC: [1, 3, 4, 2]\nD: [4, 3, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_8_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_8_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_8_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_8_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_8_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 1, 2, 3]\nB: [1, 3, 2, 4]\nC: [3, 1, 2, 4]\nD: [3, 4, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [1, 3, 2, 4]\nC: [3, 1, 2, 4]\nD: [3, 4, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_9_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_9_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_9_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_9_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_9_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 2, 4, 1]\nB: [2, 1, 3, 4]\nC: [1, 3, 2, 4]\nD: [2, 3, 1, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [2, 1, 3, 4]\nC: [1, 3, 2, 4]\nD: [2, 3, 1, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_10_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_10_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_10_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_10_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_10_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 4, 3, 2]\nB: [3, 4, 2, 1]\nC: [4, 1, 3, 2]\nD: [2, 4, 1, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [3, 4, 2, 1]\nC: [4, 1, 3, 2]\nD: [2, 4, 1, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_11_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_11_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_11_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_11_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_11_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 3, 1, 2]\nB: [2, 1, 3, 4]\nC: [1, 2, 4, 3]\nD: [1, 3, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [2, 1, 3, 4]\nC: [1, 2, 4, 3]\nD: [1, 3, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_12_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_12_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_12_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_12_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_12_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 4, 3, 2]\nB: [4, 3, 1, 2]\nC: [3, 2, 1, 4]\nD: [3, 1, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [4, 3, 1, 2]\nC: [3, 2, 1, 4]\nD: [3, 1, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_13_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_13_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_13_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_13_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_13_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 3, 2, 1]\nB: [3, 4, 1, 2]\nC: [1, 4, 2, 3]\nD: [4, 3, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [3, 4, 1, 2]\nC: [1, 4, 2, 3]\nD: [4, 3, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_14_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_14_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_14_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_14_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_14_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 1, 4, 2]\nB: [4, 2, 3, 1]\nC: [1, 4, 2, 3]\nD: [3, 4, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [4, 2, 3, 1]\nC: [1, 4, 2, 3]\nD: [3, 4, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_15_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_15_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_15_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_15_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_15_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 4, 2, 1]\nB: [4, 2, 3, 1]\nC: [1, 3, 2, 4]\nD: [3, 2, 1, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [4, 2, 3, 1]\nC: [1, 3, 2, 4]\nD: [3, 2, 1, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_16_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_16_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_16_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_16_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_16_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 1, 4, 3]\nB: [4, 2, 1, 3]\nC: [4, 2, 3, 1]\nD: [3, 4, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [4, 2, 1, 3]\nC: [4, 2, 3, 1]\nD: [3, 4, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_17_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_17_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_17_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_17_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_17_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 4, 3, 1]\nB: [3, 1, 2, 4]\nC: [1, 3, 2, 4]\nD: [4, 2, 1, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [3, 1, 2, 4]\nC: [1, 3, 2, 4]\nD: [4, 2, 1, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_18_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_18_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_18_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_18_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_18_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 1, 4, 2]\nB: [3, 2, 1, 4]\nC: [2, 4, 3, 1]\nD: [1, 2, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [3, 2, 1, 4]\nC: [2, 4, 3, 1]\nD: [1, 2, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_19_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_19_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_19_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_19_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_19_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 1, 4, 3]\nB: [4, 1, 2, 3]\nC: [4, 3, 1, 2]\nD: [1, 2, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [4, 1, 2, 3]\nC: [4, 3, 1, 2]\nD: [1, 2, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_20_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_20_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_20_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_20_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_20_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 4, 3, 1]\nB: [3, 2, 1, 4]\nC: [3, 2, 4, 1]\nD: [1, 2, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [3, 2, 1, 4]\nC: [3, 2, 4, 1]\nD: [1, 2, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_21_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_21_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_21_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_21_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_21_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 1, 4, 3]\nB: [2, 3, 4, 1]\nC: [1, 4, 2, 3]\nD: [2, 1, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [2, 3, 4, 1]\nC: [1, 4, 2, 3]\nD: [2, 1, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_22_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_22_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_22_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_22_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_22_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 4, 1, 3]\nB: [4, 2, 1, 3]\nC: [2, 1, 4, 3]\nD: [1, 3, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 1, 3]\nB: [4, 2, 1, 3]\nC: [2, 1, 4, 3]\nD: [1, 3, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_23_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_23_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_23_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_23_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_23_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 1, 2, 4]\nB: [1, 3, 2, 4]\nC: [2, 1, 4, 3]\nD: [4, 2, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 2, 4]\nB: [1, 3, 2, 4]\nC: [2, 1, 4, 3]\nD: [4, 2, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_24_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_24_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_24_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_24_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_24_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 2, 3, 1]\nB: [1, 4, 3, 2]\nC: [4, 2, 1, 3]\nD: [2, 3, 1, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [1, 4, 3, 2]\nC: [4, 2, 1, 3]\nD: [2, 3, 1, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_25_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_25_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_25_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_25_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_25_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 4, 1, 2]\nB: [2, 4, 3, 1]\nC: [4, 3, 1, 2]\nD: [3, 1, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [2, 4, 3, 1]\nC: [4, 3, 1, 2]\nD: [3, 1, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_26_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_26_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_26_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_26_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_26_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 3, 2, 4]\nB: [2, 4, 3, 1]\nC: [1, 4, 2, 3]\nD: [1, 2, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 2, 4]\nB: [2, 4, 3, 1]\nC: [1, 4, 2, 3]\nD: [1, 2, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_27_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_27_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_27_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_27_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_27_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 4, 3, 2]\nB: [4, 1, 3, 2]\nC: [2, 1, 3, 4]\nD: [4, 3, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [4, 1, 3, 2]\nC: [2, 1, 3, 4]\nD: [4, 3, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_28_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_28_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_28_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_28_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_28_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 2, 1, 4]\nB: [3, 1, 2, 4]\nC: [4, 1, 2, 3]\nD: [1, 4, 3, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 1, 4]\nB: [3, 1, 2, 4]\nC: [4, 1, 2, 3]\nD: [1, 4, 3, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_29_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_29_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_29_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_29_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_29_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 2, 4, 3]\nB: [3, 2, 1, 4]\nC: [1, 3, 2, 4]\nD: [4, 3, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 4, 3]\nB: [3, 2, 1, 4]\nC: [1, 3, 2, 4]\nD: [4, 3, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_30_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_30_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_30_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_30_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_30_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 1, 4, 3]\nB: [1, 4, 3, 2]\nC: [1, 3, 4, 2]\nD: [1, 2, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [1, 4, 3, 2]\nC: [1, 3, 4, 2]\nD: [1, 2, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_31_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_31_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_31_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_31_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_31_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 3, 1, 4]\nB: [4, 3, 1, 2]\nC: [2, 3, 4, 1]\nD: [2, 1, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 1, 4]\nB: [4, 3, 1, 2]\nC: [2, 3, 4, 1]\nD: [2, 1, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_32_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_32_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_32_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_32_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_32_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 1, 4, 3]\nB: [3, 2, 4, 1]\nC: [3, 4, 1, 2]\nD: [1, 2, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [3, 2, 4, 1]\nC: [3, 4, 1, 2]\nD: [1, 2, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_33_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_33_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_33_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_33_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_33_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 2, 4, 1]\nB: [3, 1, 2, 4]\nC: [2, 3, 4, 1]\nD: [1, 3, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [3, 1, 2, 4]\nC: [2, 3, 4, 1]\nD: [1, 3, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_34_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_34_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_34_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_34_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_34_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 4, 2, 1]\nB: [1, 4, 2, 3]\nC: [2, 1, 3, 4]\nD: [2, 3, 1, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [1, 4, 2, 3]\nC: [2, 1, 3, 4]\nD: [2, 3, 1, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_35_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_35_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_35_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_35_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_35_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 3, 2, 1]\nB: [4, 1, 2, 3]\nC: [3, 2, 1, 4]\nD: [1, 3, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [4, 1, 2, 3]\nC: [3, 2, 1, 4]\nD: [1, 3, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_36_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_36_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_36_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_36_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_36_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 3, 4, 1]\nB: [4, 1, 2, 3]\nC: [3, 4, 2, 1]\nD: [3, 4, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [4, 1, 2, 3]\nC: [3, 4, 2, 1]\nD: [3, 4, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_37_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_37_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_37_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_37_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_37_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 3, 1, 2]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [3, 1, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [3, 1, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_38_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_38_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_38_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_38_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_38_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 1, 3, 2]\nB: [1, 3, 4, 2]\nC: [2, 3, 4, 1]\nD: [3, 1, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [1, 3, 4, 2]\nC: [2, 3, 4, 1]\nD: [3, 1, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_39_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_39_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_39_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_39_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_39_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 1, 4, 3]\nB: [3, 4, 1, 2]\nC: [3, 2, 4, 1]\nD: [1, 2, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [3, 4, 1, 2]\nC: [3, 2, 4, 1]\nD: [1, 2, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_40_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_40_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_40_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_40_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_40_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 4, 1, 2]\nB: [1, 2, 3, 4]\nC: [2, 4, 3, 1]\nD: [2, 3, 1, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [1, 2, 3, 4]\nC: [2, 4, 3, 1]\nD: [2, 3, 1, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_41_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_41_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_41_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_41_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_41_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 3, 2, 1]\nB: [1, 2, 4, 3]\nC: [2, 1, 3, 4]\nD: [4, 2, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [1, 2, 4, 3]\nC: [2, 1, 3, 4]\nD: [4, 2, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_42_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_42_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_42_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_42_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_42_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 2, 1, 3]\nB: [3, 1, 4, 2]\nC: [1, 2, 3, 4]\nD: [3, 4, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 1, 3]\nB: [3, 1, 4, 2]\nC: [1, 2, 3, 4]\nD: [3, 4, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_43_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_43_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_43_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_43_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_43_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 3, 4, 2]\nB: [1, 4, 3, 2]\nC: [2, 3, 4, 1]\nD: [2, 1, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 4, 2]\nB: [1, 4, 3, 2]\nC: [2, 3, 4, 1]\nD: [2, 1, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_44_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_44_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_44_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_44_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_44_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 2, 1, 3]\nB: [2, 3, 4, 1]\nC: [4, 3, 2, 1]\nD: [2, 1, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 1, 3]\nB: [2, 3, 4, 1]\nC: [4, 3, 2, 1]\nD: [2, 1, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_45_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_45_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_45_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_45_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_45_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 1, 2, 3]\nB: [1, 3, 2, 4]\nC: [2, 4, 1, 3]\nD: [2, 1, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [1, 3, 2, 4]\nC: [2, 4, 1, 3]\nD: [2, 1, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_46_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_46_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_46_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_46_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_46_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 1, 4, 3]\nB: [4, 2, 3, 1]\nC: [1, 3, 2, 4]\nD: [3, 1, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [4, 2, 3, 1]\nC: [1, 3, 2, 4]\nD: [3, 1, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_47_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_47_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_47_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_47_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_47_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 1, 3, 4]\nB: [1, 2, 4, 3]\nC: [3, 1, 2, 4]\nD: [3, 1, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 3, 4]\nB: [1, 2, 4, 3]\nC: [3, 1, 2, 4]\nD: [3, 1, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_48_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_48_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_48_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_48_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_48_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 1, 4, 3]\nB: [1, 4, 2, 3]\nC: [1, 3, 2, 4]\nD: [1, 2, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [1, 4, 2, 3]\nC: [1, 3, 2, 4]\nD: [1, 2, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_49_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_49_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_49_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_49_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_49_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 2, 4, 3]\nB: [3, 1, 2, 4]\nC: [2, 4, 1, 3]\nD: [3, 4, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 4, 3]\nB: [3, 1, 2, 4]\nC: [2, 4, 1, 3]\nD: [3, 4, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_50_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_50_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_50_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_50_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_50_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 4, 2, 1]\nB: [1, 4, 2, 3]\nC: [4, 2, 3, 1]\nD: [1, 3, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [1, 4, 2, 3]\nC: [4, 2, 3, 1]\nD: [1, 3, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_51_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_51_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_51_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_51_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_51_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 4, 1, 2]\nB: [3, 2, 4, 1]\nC: [1, 3, 4, 2]\nD: [4, 1, 2, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [3, 2, 4, 1]\nC: [1, 3, 4, 2]\nD: [4, 1, 2, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_52_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_52_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_52_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_52_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_52_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 2, 3, 1]\nB: [1, 3, 4, 2]\nC: [4, 3, 2, 1]\nD: [3, 4, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [1, 3, 4, 2]\nC: [4, 3, 2, 1]\nD: [3, 4, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_53_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_53_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_53_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_53_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_53_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 2, 3, 4]\nB: [1, 3, 4, 2]\nC: [4, 3, 1, 2]\nD: [4, 3, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [1, 3, 4, 2]\nC: [4, 3, 1, 2]\nD: [4, 3, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_54_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_54_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_54_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_54_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_54_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 2, 4, 3]\nB: [2, 1, 4, 3]\nC: [3, 1, 4, 2]\nD: [4, 2, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 4, 3]\nB: [2, 1, 4, 3]\nC: [3, 1, 4, 2]\nD: [4, 2, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_55_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_55_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_55_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_55_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_55_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 4, 3, 2]\nB: [1, 4, 2, 3]\nC: [4, 3, 2, 1]\nD: [1, 3, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [1, 4, 2, 3]\nC: [4, 3, 2, 1]\nD: [1, 3, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_56_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_56_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_56_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_56_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_56_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 4, 2, 3]\nB: [3, 2, 1, 4]\nC: [4, 2, 1, 3]\nD: [4, 3, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [3, 2, 1, 4]\nC: [4, 2, 1, 3]\nD: [4, 3, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_57_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_57_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_57_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_57_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_57_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 4, 2, 1]\nB: [2, 1, 3, 4]\nC: [4, 1, 2, 3]\nD: [2, 1, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [2, 1, 3, 4]\nC: [4, 1, 2, 3]\nD: [2, 1, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_58_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_58_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_58_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_58_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_58_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 1, 4, 2]\nB: [2, 3, 4, 1]\nC: [1, 4, 3, 2]\nD: [1, 4, 2, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [2, 3, 4, 1]\nC: [1, 4, 3, 2]\nD: [1, 4, 2, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_59_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_59_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_59_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_59_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_59_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 1, 4, 3]\nB: [3, 1, 4, 2]\nC: [2, 3, 1, 4]\nD: [3, 1, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [3, 1, 4, 2]\nC: [2, 3, 1, 4]\nD: [3, 1, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_60_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_60_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_60_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_60_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_60_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 4, 1, 2]\nB: [4, 3, 1, 2]\nC: [2, 4, 1, 3]\nD: [1, 2, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [4, 3, 1, 2]\nC: [2, 4, 1, 3]\nD: [1, 2, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_61_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_61_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_61_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_61_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_61_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 4, 2, 3]\nB: [4, 3, 1, 2]\nC: [3, 1, 4, 2]\nD: [2, 3, 4, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [4, 3, 1, 2]\nC: [3, 1, 4, 2]\nD: [2, 3, 4, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_62_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_62_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_62_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_62_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_62_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 2, 3, 4]\nB: [3, 1, 4, 2]\nC: [4, 3, 2, 1]\nD: [4, 2, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [3, 1, 4, 2]\nC: [4, 3, 2, 1]\nD: [4, 2, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_63_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_63_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_63_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_63_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_63_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 3, 4, 2]\nB: [4, 1, 2, 3]\nC: [2, 3, 1, 4]\nD: [3, 4, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 4, 2]\nB: [4, 1, 2, 3]\nC: [2, 3, 1, 4]\nD: [3, 4, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_64_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_64_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_64_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_64_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_64_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 3, 2, 1]\nB: [4, 2, 3, 1]\nC: [2, 1, 4, 3]\nD: [3, 2, 4, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [4, 2, 3, 1]\nC: [2, 1, 4, 3]\nD: [3, 2, 4, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_65_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_65_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_65_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_65_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_65_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 4, 1, 2]\nB: [4, 3, 1, 2]\nC: [1, 3, 2, 4]\nD: [1, 3, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [4, 3, 1, 2]\nC: [1, 3, 2, 4]\nD: [1, 3, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_66_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_66_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_66_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_66_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_66_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 4, 1, 3]\nB: [4, 3, 1, 2]\nC: [4, 2, 1, 3]\nD: [3, 4, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 1, 3]\nB: [4, 3, 1, 2]\nC: [4, 2, 1, 3]\nD: [3, 4, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_67_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_67_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_67_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_67_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_67_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 4, 2, 1]\nB: [1, 3, 2, 4]\nC: [4, 2, 1, 3]\nD: [3, 2, 4, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [1, 3, 2, 4]\nC: [4, 2, 1, 3]\nD: [3, 2, 4, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_68_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_68_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_68_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_68_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_68_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 1, 4, 2]\nB: [4, 2, 1, 3]\nC: [3, 2, 4, 1]\nD: [2, 1, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [4, 2, 1, 3]\nC: [3, 2, 4, 1]\nD: [2, 1, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_69_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_69_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_69_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_69_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_69_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 3, 1, 2]\nB: [1, 4, 2, 3]\nC: [1, 3, 4, 2]\nD: [4, 1, 2, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [1, 4, 2, 3]\nC: [1, 3, 4, 2]\nD: [4, 1, 2, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_70_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_70_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_70_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_70_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_70_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 2, 3, 4]\nB: [2, 4, 1, 3]\nC: [3, 4, 2, 1]\nD: [3, 2, 4, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [2, 4, 1, 3]\nC: [3, 4, 2, 1]\nD: [3, 2, 4, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_71_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_71_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_71_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_71_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_71_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 4, 3, 1]\nB: [4, 2, 3, 1]\nC: [2, 1, 3, 4]\nD: [3, 4, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [4, 2, 3, 1]\nC: [2, 1, 3, 4]\nD: [3, 4, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_72_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_72_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_72_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_72_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_72_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 1, 4, 3]\nB: [1, 4, 2, 3]\nC: [3, 1, 4, 2]\nD: [1, 3, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [1, 4, 2, 3]\nC: [3, 1, 4, 2]\nD: [1, 3, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_73_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_73_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_73_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_73_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_73_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 2, 3, 4]\nB: [4, 3, 1, 2]\nC: [1, 4, 2, 3]\nD: [4, 3, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [4, 3, 1, 2]\nC: [1, 4, 2, 3]\nD: [4, 3, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_74_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_74_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_74_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_74_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_74_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 1, 3, 2]\nB: [2, 3, 4, 1]\nC: [4, 3, 1, 2]\nD: [2, 1, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [2, 3, 4, 1]\nC: [4, 3, 1, 2]\nD: [2, 1, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_75_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_75_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_75_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_75_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_75_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 4, 2, 1]\nB: [4, 2, 1, 3]\nC: [3, 2, 1, 4]\nD: [1, 3, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [4, 2, 1, 3]\nC: [3, 2, 1, 4]\nD: [1, 3, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_76_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_76_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_76_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_76_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_76_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 4, 1, 2]\nB: [3, 1, 2, 4]\nC: [4, 1, 2, 3]\nD: [4, 3, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [3, 1, 2, 4]\nC: [4, 1, 2, 3]\nD: [4, 3, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_77_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_77_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_77_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_77_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_77_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 4, 2, 3]\nB: [4, 3, 1, 2]\nC: [4, 1, 3, 2]\nD: [2, 3, 4, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [4, 3, 1, 2]\nC: [4, 1, 3, 2]\nD: [2, 3, 4, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_78_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_78_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_78_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_78_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_78_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 1, 2, 3]\nB: [1, 3, 4, 2]\nC: [2, 1, 3, 4]\nD: [1, 4, 2, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [1, 3, 4, 2]\nC: [2, 1, 3, 4]\nD: [1, 4, 2, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_79_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_79_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_79_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_79_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_79_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 1, 2, 3]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [2, 3, 1, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [2, 3, 1, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_80_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_80_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_80_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_80_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_80_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 3, 4, 1]\nB: [2, 3, 1, 4]\nC: [4, 2, 1, 3]\nD: [4, 1, 2, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [2, 3, 1, 4]\nC: [4, 2, 1, 3]\nD: [4, 1, 2, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_81_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_81_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_81_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_81_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_81_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 4, 3, 2]\nB: [2, 1, 4, 3]\nC: [2, 3, 1, 4]\nD: [3, 1, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [2, 1, 4, 3]\nC: [2, 3, 1, 4]\nD: [3, 1, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_82_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_82_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_82_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_82_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_82_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 4, 3, 1]\nB: [1, 3, 2, 4]\nC: [4, 2, 1, 3]\nD: [4, 3, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [1, 3, 2, 4]\nC: [4, 2, 1, 3]\nD: [4, 3, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_83_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_83_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_83_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_83_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_83_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 1, 3, 4]\nB: [3, 2, 4, 1]\nC: [1, 3, 4, 2]\nD: [4, 3, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 3, 4]\nB: [3, 2, 4, 1]\nC: [1, 3, 4, 2]\nD: [4, 3, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_84_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_84_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_84_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_84_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_84_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 4, 3, 1]\nB: [1, 3, 2, 4]\nC: [3, 4, 1, 2]\nD: [2, 1, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [1, 3, 2, 4]\nC: [3, 4, 1, 2]\nD: [2, 1, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_85_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_85_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_85_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_85_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_85_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 4, 1, 3]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [3, 2, 1, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 1, 3]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [3, 2, 1, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_86_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_86_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_86_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_86_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_86_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 4, 3, 1]\nB: [1, 3, 2, 4]\nC: [1, 4, 3, 2]\nD: [1, 2, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [1, 3, 2, 4]\nC: [1, 4, 3, 2]\nD: [1, 2, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_87_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_87_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_87_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_87_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_87_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 2, 3, 1]\nB: [3, 2, 1, 4]\nC: [4, 3, 2, 1]\nD: [1, 4, 3, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [3, 2, 1, 4]\nC: [4, 3, 2, 1]\nD: [1, 4, 3, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_88_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_88_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_88_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_88_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_88_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 4, 2, 1]\nB: [3, 2, 4, 1]\nC: [4, 2, 1, 3]\nD: [4, 1, 3, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [3, 2, 4, 1]\nC: [4, 2, 1, 3]\nD: [4, 1, 3, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_89_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_89_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_89_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_89_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_89_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 3, 1, 4]\nB: [2, 1, 3, 4]\nC: [3, 4, 2, 1]\nD: [3, 4, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 1, 4]\nB: [2, 1, 3, 4]\nC: [3, 4, 2, 1]\nD: [3, 4, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_90_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_90_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_90_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_90_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_90_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 2, 1, 3]\nB: [4, 1, 3, 2]\nC: [4, 3, 2, 1]\nD: [2, 1, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 1, 3]\nB: [4, 1, 3, 2]\nC: [4, 3, 2, 1]\nD: [2, 1, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_91_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_91_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_91_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_91_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_91_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 3, 1, 2]\nB: [2, 1, 3, 4]\nC: [4, 1, 2, 3]\nD: [1, 3, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [2, 1, 3, 4]\nC: [4, 1, 2, 3]\nD: [1, 3, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_92_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_92_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_92_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_92_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_92_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [1, 4, 3, 2]\nB: [1, 3, 4, 2]\nC: [2, 4, 1, 3]\nD: [4, 2, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [1, 3, 4, 2]\nC: [2, 4, 1, 3]\nD: [4, 2, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_93_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_93_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_93_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_93_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_93_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 3, 1, 2]\nB: [1, 4, 3, 2]\nC: [1, 3, 4, 2]\nD: [1, 3, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [1, 4, 3, 2]\nC: [1, 3, 4, 2]\nD: [1, 3, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_94_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_94_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_94_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_94_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_94_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 1, 2, 4]\nB: [1, 3, 4, 2]\nC: [2, 4, 3, 1]\nD: [3, 2, 4, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 2, 4]\nB: [1, 3, 4, 2]\nC: [2, 4, 3, 1]\nD: [3, 2, 4, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_95_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_95_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_95_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_95_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_95_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 1, 3, 2]\nB: [1, 2, 3, 4]\nC: [2, 4, 3, 1]\nD: [3, 4, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [1, 2, 3, 4]\nC: [2, 4, 3, 1]\nD: [3, 4, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_96_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_96_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_96_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_96_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_96_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [4, 1, 3, 2]\nB: [2, 4, 1, 3]\nC: [3, 2, 4, 1]\nD: [4, 3, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [2, 4, 1, 3]\nC: [3, 2, 4, 1]\nD: [4, 3, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_97_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_97_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_97_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_97_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_97_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [3, 2, 4, 1]\nB: [1, 4, 2, 3]\nC: [3, 1, 2, 4]\nD: [4, 3, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [1, 4, 2, 3]\nC: [3, 1, 2, 4]\nD: [4, 3, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_98_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_98_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_98_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_98_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_98_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['natural_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_natural", "options": "A: [2, 4, 3, 1]\nB: [3, 4, 2, 1]\nC: [4, 1, 2, 3]\nD: [1, 3, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [3, 4, 2, 1]\nC: [4, 1, 2, 3]\nD: [1, 3, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_99_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_99_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_99_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_99_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_99_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 4, 3, 1]\nB: [2, 1, 4, 3]\nC: [3, 2, 1, 4]\nD: [1, 2, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [2, 1, 4, 3]\nC: [3, 2, 1, 4]\nD: [1, 2, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_100_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_100_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_100_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_100_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_100_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 4, 3, 2]\nB: [1, 3, 4, 2]\nC: [3, 1, 2, 4]\nD: [2, 4, 1, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [1, 3, 4, 2]\nC: [3, 1, 2, 4]\nD: [2, 4, 1, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_101_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_101_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_101_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_101_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_101_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 4, 2, 3]\nB: [2, 4, 1, 3]\nC: [2, 1, 4, 3]\nD: [3, 4, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [2, 4, 1, 3]\nC: [2, 1, 4, 3]\nD: [3, 4, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_102_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_102_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_102_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_102_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_102_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 3, 2, 4]\nB: [1, 3, 4, 2]\nC: [2, 4, 3, 1]\nD: [4, 3, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 2, 4]\nB: [1, 3, 4, 2]\nC: [2, 4, 3, 1]\nD: [4, 3, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_103_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_103_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_103_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_103_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_103_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 1, 2, 3]\nB: [3, 4, 1, 2]\nC: [4, 1, 3, 2]\nD: [4, 3, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [3, 4, 1, 2]\nC: [4, 1, 3, 2]\nD: [4, 3, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_104_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_104_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_104_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_104_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_104_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 3, 4, 1]\nB: [1, 4, 2, 3]\nC: [2, 3, 1, 4]\nD: [1, 3, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [1, 4, 2, 3]\nC: [2, 3, 1, 4]\nD: [1, 3, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_105_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_105_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_105_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_105_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_105_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 3, 4, 1]\nB: [3, 4, 1, 2]\nC: [2, 1, 4, 3]\nD: [2, 3, 1, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [3, 4, 1, 2]\nC: [2, 1, 4, 3]\nD: [2, 3, 1, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_106_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_106_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_106_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_106_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_106_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 4, 1, 2]\nB: [1, 4, 2, 3]\nC: [3, 2, 4, 1]\nD: [4, 2, 1, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 1, 2]\nB: [1, 4, 2, 3]\nC: [3, 2, 4, 1]\nD: [4, 2, 1, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_107_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_107_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_107_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_107_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_107_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 3, 1, 2]\nB: [3, 4, 2, 1]\nC: [3, 2, 4, 1]\nD: [4, 2, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [3, 4, 2, 1]\nC: [3, 2, 4, 1]\nD: [4, 2, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_108_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_108_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_108_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_108_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_108_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 3, 4, 2]\nB: [3, 1, 2, 4]\nC: [3, 1, 4, 2]\nD: [2, 3, 1, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 4, 2]\nB: [3, 1, 2, 4]\nC: [3, 1, 4, 2]\nD: [2, 3, 1, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_109_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_109_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_109_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_109_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_109_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 3, 2, 1]\nB: [3, 1, 2, 4]\nC: [2, 4, 3, 1]\nD: [1, 4, 3, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [3, 1, 2, 4]\nC: [2, 4, 3, 1]\nD: [1, 4, 3, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_110_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_110_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_110_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_110_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_110_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 2, 3, 4]\nB: [1, 4, 3, 2]\nC: [2, 1, 3, 4]\nD: [3, 1, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [1, 4, 3, 2]\nC: [2, 1, 3, 4]\nD: [3, 1, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_111_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_111_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_111_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_111_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_111_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 3, 2, 4]\nB: [4, 2, 1, 3]\nC: [1, 4, 3, 2]\nD: [3, 4, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 2, 4]\nB: [4, 2, 1, 3]\nC: [1, 4, 3, 2]\nD: [3, 4, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_112_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_112_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_112_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_112_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_112_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 3, 2, 4]\nB: [1, 4, 3, 2]\nC: [2, 4, 3, 1]\nD: [1, 4, 2, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 2, 4]\nB: [1, 4, 3, 2]\nC: [2, 4, 3, 1]\nD: [1, 4, 2, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_113_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_113_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_113_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_113_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_113_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 3, 2, 4]\nB: [2, 3, 1, 4]\nC: [3, 4, 2, 1]\nD: [3, 1, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 2, 4]\nB: [2, 3, 1, 4]\nC: [3, 4, 2, 1]\nD: [3, 1, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_114_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_114_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_114_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_114_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_114_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 3, 2, 1]\nB: [1, 3, 4, 2]\nC: [3, 1, 2, 4]\nD: [4, 3, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [1, 3, 4, 2]\nC: [3, 1, 2, 4]\nD: [4, 3, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_115_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_115_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_115_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_115_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_115_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 2, 1, 4]\nB: [3, 1, 4, 2]\nC: [4, 3, 2, 1]\nD: [1, 3, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 1, 4]\nB: [3, 1, 4, 2]\nC: [4, 3, 2, 1]\nD: [1, 3, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_116_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_116_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_116_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_116_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_116_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 1, 2, 3]\nB: [1, 2, 4, 3]\nC: [3, 2, 1, 4]\nD: [1, 3, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [1, 2, 4, 3]\nC: [3, 2, 1, 4]\nD: [1, 3, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_117_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_117_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_117_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_117_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_117_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 3, 4, 2]\nB: [4, 3, 2, 1]\nC: [3, 4, 2, 1]\nD: [1, 4, 3, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 4, 2]\nB: [4, 3, 2, 1]\nC: [3, 4, 2, 1]\nD: [1, 4, 3, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_118_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_118_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_118_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_118_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_118_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 3, 1, 2]\nB: [2, 3, 4, 1]\nC: [1, 3, 2, 4]\nD: [1, 3, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [2, 3, 4, 1]\nC: [1, 3, 2, 4]\nD: [1, 3, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_119_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_119_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_119_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_119_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_119_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 1, 2, 3]\nB: [1, 4, 2, 3]\nC: [3, 1, 2, 4]\nD: [3, 1, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [1, 4, 2, 3]\nC: [3, 1, 2, 4]\nD: [3, 1, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_120_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_120_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_120_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_120_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_120_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 2, 3, 4]\nB: [4, 3, 2, 1]\nC: [1, 4, 3, 2]\nD: [2, 3, 1, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [4, 3, 2, 1]\nC: [1, 4, 3, 2]\nD: [2, 3, 1, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_121_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_121_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_121_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_121_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_121_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 4, 3, 2]\nB: [1, 2, 3, 4]\nC: [4, 1, 3, 2]\nD: [1, 4, 2, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [1, 2, 3, 4]\nC: [4, 1, 3, 2]\nD: [1, 4, 2, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_122_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_122_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_122_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_122_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_122_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 4, 2, 3]\nB: [1, 2, 3, 4]\nC: [1, 3, 4, 2]\nD: [2, 4, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [1, 2, 3, 4]\nC: [1, 3, 4, 2]\nD: [2, 4, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_123_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_123_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_123_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_123_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_123_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 1, 4, 3]\nB: [2, 4, 3, 1]\nC: [1, 4, 3, 2]\nD: [1, 2, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [2, 4, 3, 1]\nC: [1, 4, 3, 2]\nD: [1, 2, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_124_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_124_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_124_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_124_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_124_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 4, 2, 3]\nB: [2, 4, 1, 3]\nC: [1, 2, 4, 3]\nD: [4, 1, 3, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [2, 4, 1, 3]\nC: [1, 2, 4, 3]\nD: [4, 1, 3, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_125_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_125_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_125_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_125_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_125_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 1, 2, 4]\nB: [4, 2, 1, 3]\nC: [3, 4, 1, 2]\nD: [2, 1, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 2, 4]\nB: [4, 2, 1, 3]\nC: [3, 4, 1, 2]\nD: [2, 1, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_126_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_126_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_126_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_126_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_126_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 4, 2, 3]\nB: [3, 2, 4, 1]\nC: [2, 4, 1, 3]\nD: [1, 4, 3, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [3, 2, 4, 1]\nC: [2, 4, 1, 3]\nD: [1, 4, 3, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_127_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_127_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_127_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_127_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_127_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 4, 3, 1]\nB: [1, 3, 4, 2]\nC: [3, 1, 2, 4]\nD: [1, 2, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [1, 3, 4, 2]\nC: [3, 1, 2, 4]\nD: [1, 2, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_128_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_128_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_128_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_128_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_128_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 1, 3, 2]\nB: [2, 4, 1, 3]\nC: [2, 4, 3, 1]\nD: [4, 3, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [2, 4, 1, 3]\nC: [2, 4, 3, 1]\nD: [4, 3, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_129_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_129_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_129_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_129_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_129_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 2, 4, 1]\nB: [2, 4, 3, 1]\nC: [2, 1, 3, 4]\nD: [1, 2, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [2, 4, 3, 1]\nC: [2, 1, 3, 4]\nD: [1, 2, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_130_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_130_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_130_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_130_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_130_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 1, 2, 4]\nB: [4, 1, 3, 2]\nC: [3, 4, 1, 2]\nD: [4, 3, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 2, 4]\nB: [4, 1, 3, 2]\nC: [3, 4, 1, 2]\nD: [4, 3, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_131_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_131_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_131_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_131_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_131_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 4, 1, 3]\nB: [4, 3, 2, 1]\nC: [4, 1, 3, 2]\nD: [2, 3, 4, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 1, 3]\nB: [4, 3, 2, 1]\nC: [4, 1, 3, 2]\nD: [2, 3, 4, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_132_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_132_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_132_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_132_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_132_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 1, 2, 3]\nB: [2, 3, 1, 4]\nC: [4, 1, 3, 2]\nD: [1, 3, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [2, 3, 1, 4]\nC: [4, 1, 3, 2]\nD: [1, 3, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_133_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_133_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_133_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_133_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_133_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 1, 4, 3]\nB: [2, 4, 3, 1]\nC: [4, 3, 1, 2]\nD: [1, 2, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [2, 4, 3, 1]\nC: [4, 3, 1, 2]\nD: [1, 2, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_134_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_134_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_134_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_134_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_134_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 4, 3, 2]\nB: [4, 3, 2, 1]\nC: [4, 1, 2, 3]\nD: [3, 1, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [4, 3, 2, 1]\nC: [4, 1, 2, 3]\nD: [3, 1, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_135_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_135_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_135_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_135_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_135_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 1, 2, 4]\nB: [4, 2, 3, 1]\nC: [1, 2, 4, 3]\nD: [3, 1, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 2, 4]\nB: [4, 2, 3, 1]\nC: [1, 2, 4, 3]\nD: [3, 1, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_136_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_136_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_136_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_136_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_136_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 4, 2, 3]\nB: [1, 3, 2, 4]\nC: [2, 4, 1, 3]\nD: [1, 4, 3, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [1, 3, 2, 4]\nC: [2, 4, 1, 3]\nD: [1, 4, 3, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_137_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_137_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_137_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_137_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_137_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 3, 1, 2]\nB: [4, 3, 2, 1]\nC: [4, 2, 3, 1]\nD: [4, 1, 2, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [4, 3, 2, 1]\nC: [4, 2, 3, 1]\nD: [4, 1, 2, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_138_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_138_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_138_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_138_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_138_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 1, 4, 3]\nB: [3, 1, 2, 4]\nC: [4, 2, 3, 1]\nD: [4, 3, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [3, 1, 2, 4]\nC: [4, 2, 3, 1]\nD: [4, 3, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_139_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_139_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_139_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_139_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_139_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 1, 3, 2]\nB: [2, 1, 4, 3]\nC: [1, 4, 2, 3]\nD: [2, 3, 4, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [2, 1, 4, 3]\nC: [1, 4, 2, 3]\nD: [2, 3, 4, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_140_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_140_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_140_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_140_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_140_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 3, 1, 2]\nB: [1, 4, 3, 2]\nC: [2, 4, 3, 1]\nD: [3, 2, 1, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [1, 4, 3, 2]\nC: [2, 4, 3, 1]\nD: [3, 2, 1, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_141_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_141_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_141_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_141_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_141_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 4, 1, 3]\nB: [1, 4, 2, 3]\nC: [2, 1, 4, 3]\nD: [4, 1, 3, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 1, 3]\nB: [1, 4, 2, 3]\nC: [2, 1, 4, 3]\nD: [4, 1, 3, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_142_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_142_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_142_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_142_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_142_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 2, 3, 1]\nB: [4, 3, 2, 1]\nC: [3, 1, 2, 4]\nD: [4, 2, 1, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [4, 3, 2, 1]\nC: [3, 1, 2, 4]\nD: [4, 2, 1, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_143_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_143_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_143_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_143_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_143_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 1, 2, 4]\nB: [3, 1, 4, 2]\nC: [3, 2, 1, 4]\nD: [2, 4, 1, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 2, 4]\nB: [3, 1, 4, 2]\nC: [3, 2, 1, 4]\nD: [2, 4, 1, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_144_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_144_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_144_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_144_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_144_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 1, 4, 2]\nB: [2, 4, 1, 3]\nC: [4, 1, 3, 2]\nD: [1, 4, 2, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [2, 4, 1, 3]\nC: [4, 1, 3, 2]\nD: [1, 4, 2, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_145_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_145_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_145_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_145_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_145_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 3, 1, 2]\nB: [4, 2, 3, 1]\nC: [1, 4, 2, 3]\nD: [2, 4, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [4, 2, 3, 1]\nC: [1, 4, 2, 3]\nD: [2, 4, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_146_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_146_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_146_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_146_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_146_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 3, 1, 4]\nB: [2, 4, 1, 3]\nC: [4, 1, 3, 2]\nD: [1, 2, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 1, 4]\nB: [2, 4, 1, 3]\nC: [4, 1, 3, 2]\nD: [1, 2, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_147_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_147_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_147_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_147_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_147_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 1, 4, 3]\nB: [2, 3, 1, 4]\nC: [3, 1, 4, 2]\nD: [2, 4, 1, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [2, 3, 1, 4]\nC: [3, 1, 4, 2]\nD: [2, 4, 1, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_148_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_148_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_148_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_148_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_148_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 2, 4, 3]\nB: [4, 1, 3, 2]\nC: [2, 3, 1, 4]\nD: [4, 2, 1, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 4, 3]\nB: [4, 1, 3, 2]\nC: [2, 3, 1, 4]\nD: [4, 2, 1, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_149_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_149_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_149_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_149_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_149_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 2, 3, 4]\nB: [2, 3, 4, 1]\nC: [4, 1, 3, 2]\nD: [3, 4, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [2, 3, 4, 1]\nC: [4, 1, 3, 2]\nD: [3, 4, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_150_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_150_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_150_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_150_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_150_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 2, 4, 1]\nB: [3, 2, 1, 4]\nC: [3, 1, 2, 4]\nD: [4, 2, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [3, 2, 1, 4]\nC: [3, 1, 2, 4]\nD: [4, 2, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_151_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_151_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_151_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_151_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_151_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 3, 1, 2]\nB: [3, 1, 4, 2]\nC: [3, 2, 1, 4]\nD: [4, 1, 3, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 1, 2]\nB: [3, 1, 4, 2]\nC: [3, 2, 1, 4]\nD: [4, 1, 3, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_152_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_152_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_152_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_152_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_152_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 2, 1, 4]\nB: [1, 3, 4, 2]\nC: [2, 3, 1, 4]\nD: [4, 2, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 1, 4]\nB: [1, 3, 4, 2]\nC: [2, 3, 1, 4]\nD: [4, 2, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_153_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_153_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_153_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_153_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_153_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 3, 2, 1]\nB: [4, 3, 1, 2]\nC: [4, 1, 2, 3]\nD: [3, 4, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [4, 3, 1, 2]\nC: [4, 1, 2, 3]\nD: [3, 4, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_154_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_154_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_154_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_154_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_154_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 3, 2, 1]\nB: [1, 4, 3, 2]\nC: [2, 1, 3, 4]\nD: [1, 3, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [1, 4, 3, 2]\nC: [2, 1, 3, 4]\nD: [1, 3, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_155_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_155_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_155_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_155_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_155_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 1, 4, 2]\nB: [3, 2, 4, 1]\nC: [4, 2, 3, 1]\nD: [2, 4, 1, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [3, 2, 4, 1]\nC: [4, 2, 3, 1]\nD: [2, 4, 1, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_156_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_156_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_156_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_156_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_156_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 2, 3, 1]\nB: [1, 3, 2, 4]\nC: [3, 4, 1, 2]\nD: [2, 3, 4, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [1, 3, 2, 4]\nC: [3, 4, 1, 2]\nD: [2, 3, 4, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_157_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_157_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_157_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_157_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_157_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 3, 4, 1]\nB: [1, 3, 2, 4]\nC: [2, 3, 1, 4]\nD: [3, 4, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [1, 3, 2, 4]\nC: [2, 3, 1, 4]\nD: [3, 4, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_158_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_158_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_158_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_158_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_158_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 3, 2, 4]\nB: [2, 1, 3, 4]\nC: [3, 4, 2, 1]\nD: [2, 1, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 2, 4]\nB: [2, 1, 3, 4]\nC: [3, 4, 2, 1]\nD: [2, 1, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_159_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_159_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_159_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_159_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_159_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 4, 2, 1]\nB: [4, 3, 1, 2]\nC: [4, 3, 2, 1]\nD: [2, 1, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [4, 3, 1, 2]\nC: [4, 3, 2, 1]\nD: [2, 1, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_160_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_160_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_160_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_160_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_160_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 1, 3, 4]\nB: [4, 2, 1, 3]\nC: [1, 4, 2, 3]\nD: [1, 2, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 3, 4]\nB: [4, 2, 1, 3]\nC: [1, 4, 2, 3]\nD: [1, 2, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_161_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_161_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_161_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_161_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_161_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 1, 2, 3]\nB: [4, 2, 1, 3]\nC: [1, 4, 2, 3]\nD: [3, 4, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [4, 2, 1, 3]\nC: [1, 4, 2, 3]\nD: [3, 4, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_162_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_162_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_162_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_162_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_162_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 1, 4, 3]\nB: [4, 2, 1, 3]\nC: [1, 2, 3, 4]\nD: [2, 3, 1, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 4, 3]\nB: [4, 2, 1, 3]\nC: [1, 2, 3, 4]\nD: [2, 3, 1, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_163_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_163_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_163_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_163_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_163_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 4, 3, 1]\nB: [3, 1, 4, 2]\nC: [3, 2, 1, 4]\nD: [2, 1, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [3, 1, 4, 2]\nC: [3, 2, 1, 4]\nD: [2, 1, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_164_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_164_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_164_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_164_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_164_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 2, 4, 1]\nB: [2, 4, 3, 1]\nC: [1, 2, 4, 3]\nD: [2, 4, 1, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 4, 1]\nB: [2, 4, 3, 1]\nC: [1, 2, 4, 3]\nD: [2, 4, 1, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_165_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_165_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_165_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_165_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_165_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 2, 4, 3]\nB: [1, 4, 2, 3]\nC: [4, 3, 1, 2]\nD: [3, 1, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 4, 3]\nB: [1, 4, 2, 3]\nC: [4, 3, 1, 2]\nD: [3, 1, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_166_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_166_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_166_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_166_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_166_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 4, 1, 3]\nB: [2, 1, 4, 3]\nC: [1, 3, 2, 4]\nD: [3, 1, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 1, 3]\nB: [2, 1, 4, 3]\nC: [1, 3, 2, 4]\nD: [3, 1, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_167_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_167_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_167_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_167_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_167_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 4, 3, 1]\nB: [4, 3, 1, 2]\nC: [4, 1, 3, 2]\nD: [3, 1, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 4, 3, 1]\nB: [4, 3, 1, 2]\nC: [4, 1, 3, 2]\nD: [3, 1, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_168_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_168_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_168_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_168_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_168_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 2, 4, 3]\nB: [3, 2, 4, 1]\nC: [4, 1, 2, 3]\nD: [4, 2, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 4, 3]\nB: [3, 2, 4, 1]\nC: [4, 1, 2, 3]\nD: [4, 2, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_169_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_169_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_169_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_169_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_169_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 2, 1, 4]\nB: [1, 4, 2, 3]\nC: [1, 4, 3, 2]\nD: [1, 2, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 1, 4]\nB: [1, 4, 2, 3]\nC: [1, 4, 3, 2]\nD: [1, 2, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_170_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_170_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_170_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_170_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_170_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 4, 3, 2]\nB: [3, 1, 2, 4]\nC: [2, 4, 3, 1]\nD: [4, 2, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [3, 1, 2, 4]\nC: [2, 4, 3, 1]\nD: [4, 2, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_171_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_171_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_171_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_171_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_171_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 1, 2, 3]\nB: [4, 2, 1, 3]\nC: [3, 1, 4, 2]\nD: [3, 2, 1, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [4, 2, 1, 3]\nC: [3, 1, 4, 2]\nD: [3, 2, 1, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_172_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_172_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_172_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_172_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_172_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 3, 4, 1]\nB: [1, 4, 3, 2]\nC: [4, 2, 3, 1]\nD: [4, 1, 3, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [1, 4, 3, 2]\nC: [4, 2, 3, 1]\nD: [4, 1, 3, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_173_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_173_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_173_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_173_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_173_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 1, 3, 4]\nB: [2, 3, 4, 1]\nC: [3, 1, 4, 2]\nD: [4, 1, 2, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 3, 4]\nB: [2, 3, 4, 1]\nC: [3, 1, 4, 2]\nD: [4, 1, 2, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_174_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_174_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_174_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_174_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_174_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 1, 3, 2]\nB: [1, 4, 2, 3]\nC: [3, 1, 4, 2]\nD: [4, 2, 1, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [1, 4, 2, 3]\nC: [3, 1, 4, 2]\nD: [4, 2, 1, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_175_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_175_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_175_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_175_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_175_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 2, 1, 4]\nB: [3, 4, 2, 1]\nC: [4, 1, 2, 3]\nD: [2, 1, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 1, 4]\nB: [3, 4, 2, 1]\nC: [4, 1, 2, 3]\nD: [2, 1, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_176_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_176_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_176_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_176_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_176_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 4, 2, 3]\nB: [1, 4, 3, 2]\nC: [2, 1, 4, 3]\nD: [3, 4, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [1, 4, 3, 2]\nC: [2, 1, 4, 3]\nD: [3, 4, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_177_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_177_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_177_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_177_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_177_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 2, 1, 3]\nB: [1, 4, 3, 2]\nC: [2, 1, 3, 4]\nD: [4, 3, 1, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 1, 3]\nB: [1, 4, 3, 2]\nC: [2, 1, 3, 4]\nD: [4, 3, 1, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_178_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_178_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_178_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_178_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_178_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 1, 2, 3]\nB: [3, 1, 4, 2]\nC: [1, 2, 4, 3]\nD: [1, 2, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 2, 3]\nB: [3, 1, 4, 2]\nC: [1, 2, 4, 3]\nD: [1, 2, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_179_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_179_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_179_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_179_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_179_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 3, 2, 1]\nB: [3, 4, 2, 1]\nC: [3, 1, 4, 2]\nD: [1, 4, 2, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [3, 4, 2, 1]\nC: [3, 1, 4, 2]\nD: [1, 4, 2, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_180_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_180_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_180_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_180_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_180_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 1, 3, 4]\nB: [4, 1, 3, 2]\nC: [1, 4, 3, 2]\nD: [3, 4, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 1, 3, 4]\nB: [4, 1, 3, 2]\nC: [1, 4, 3, 2]\nD: [3, 4, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_181_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_181_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_181_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_181_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_181_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 1, 3, 2]\nB: [3, 4, 1, 2]\nC: [2, 3, 1, 4]\nD: [3, 2, 4, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [3, 4, 1, 2]\nC: [2, 3, 1, 4]\nD: [3, 2, 4, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_182_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_182_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_182_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_182_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_182_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 2, 3, 1]\nB: [4, 1, 3, 2]\nC: [2, 1, 4, 3]\nD: [1, 4, 3, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [4, 1, 3, 2]\nC: [2, 1, 4, 3]\nD: [1, 4, 3, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_183_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_183_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_183_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_183_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_183_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 4, 2, 1]\nB: [3, 2, 4, 1]\nC: [4, 1, 2, 3]\nD: [4, 2, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 4, 2, 1]\nB: [3, 2, 4, 1]\nC: [4, 1, 2, 3]\nD: [4, 2, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_184_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_184_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_184_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_184_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_184_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 3, 2, 4]\nB: [4, 2, 3, 1]\nC: [3, 2, 1, 4]\nD: [3, 1, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 3, 2, 4]\nB: [4, 2, 3, 1]\nC: [3, 2, 1, 4]\nD: [3, 1, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_185_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_185_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_185_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_185_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_185_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 4, 2, 3]\nB: [3, 1, 4, 2]\nC: [2, 1, 4, 3]\nD: [4, 2, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 2, 3]\nB: [3, 1, 4, 2]\nC: [2, 1, 4, 3]\nD: [4, 2, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_186_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_186_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_186_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_186_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_186_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 3, 2, 1]\nB: [4, 3, 1, 2]\nC: [1, 2, 4, 3]\nD: [1, 2, 3, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 3, 2, 1]\nB: [4, 3, 1, 2]\nC: [1, 2, 4, 3]\nD: [1, 2, 3, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_187_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_187_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_187_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_187_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_187_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 2, 3, 1]\nB: [4, 3, 2, 1]\nC: [1, 3, 4, 2]\nD: [1, 3, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [4, 3, 2, 1]\nC: [1, 3, 4, 2]\nD: [1, 3, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_188_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_188_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_188_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_188_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_188_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 1, 4, 2]\nB: [3, 2, 4, 1]\nC: [1, 2, 4, 3]\nD: [1, 4, 2, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 1, 4, 2]\nB: [3, 2, 4, 1]\nC: [1, 2, 4, 3]\nD: [1, 4, 2, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_189_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_189_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_189_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_189_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_189_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 2, 3, 1]\nB: [1, 2, 3, 4]\nC: [1, 2, 4, 3]\nD: [1, 3, 4, 2]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [1, 2, 3, 4]\nC: [1, 2, 4, 3]\nD: [1, 3, 4, 2]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_190_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_190_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_190_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_190_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_190_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 3, 4, 1]\nB: [3, 1, 4, 2]\nC: [4, 3, 2, 1]\nD: [4, 1, 2, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [3, 1, 4, 2]\nC: [4, 3, 2, 1]\nD: [4, 1, 2, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_191_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_191_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_191_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_191_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_191_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [2, 3, 4, 1]\nB: [2, 4, 1, 3]\nC: [1, 4, 2, 3]\nD: [4, 2, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [2, 3, 4, 1]\nB: [2, 4, 1, 3]\nC: [1, 4, 2, 3]\nD: [4, 2, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_192_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_192_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_192_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_192_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_192_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 2, 3, 4]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [1, 4, 2, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [4, 2, 1, 3]\nC: [1, 3, 4, 2]\nD: [1, 4, 2, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_193_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_193_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_193_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_193_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_193_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 1, 3, 2]\nB: [4, 2, 1, 3]\nC: [2, 4, 1, 3]\nD: [1, 3, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 1, 3, 2]\nB: [4, 2, 1, 3]\nC: [2, 4, 1, 3]\nD: [1, 3, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_194_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_194_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_194_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_194_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_194_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 4, 3, 2]\nB: [1, 2, 3, 4]\nC: [4, 1, 2, 3]\nD: [1, 3, 2, 4]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 4, 3, 2]\nB: [1, 2, 3, 4]\nC: [4, 1, 2, 3]\nD: [1, 3, 2, 4]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_195_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_195_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_195_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_195_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_195_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 2, 4, 3]\nB: [2, 4, 3, 1]\nC: [3, 1, 4, 2]\nD: [3, 4, 2, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 4, 3]\nB: [2, 4, 3, 1]\nC: [3, 1, 4, 2]\nD: [3, 4, 2, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_196_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_196_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_196_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_196_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_196_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [3, 2, 1, 4]\nB: [3, 4, 2, 1]\nC: [4, 2, 3, 1]\nD: [2, 4, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [3, 2, 1, 4]\nB: [3, 4, 2, 1]\nC: [4, 2, 3, 1]\nD: [2, 4, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_197_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_197_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_197_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_197_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_197_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [4, 2, 3, 1]\nB: [1, 4, 2, 3]\nC: [3, 1, 4, 2]\nD: [2, 4, 3, 1]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [4, 2, 3, 1]\nB: [1, 4, 2, 3]\nC: [3, 1, 4, 2]\nD: [2, 4, 3, 1]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_198_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_198_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_198_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_198_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_198_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "jigsaw_puzzle_solving", "visual_input_component": "['painting_image', 'visual_mark']", "source": "jigsaw_puzzle_solving_painting", "options": "A: [1, 2, 3, 4]\nB: [2, 4, 1, 3]\nC: [4, 1, 3, 2]\nD: [2, 1, 4, 3]", "question": "The patches in the middle of the image might be disordered. 
Please state the correct order of the number indexes based on the given patches, following the sequence: top left, top right, bottom left, bottom right.", "context": "Your task is give a order of these given images\nSelect from the following choices.\nA: [1, 2, 3, 4]\nB: [2, 4, 1, 3]\nC: [4, 1, 3, 2]\nD: [2, 1, 4, 3]", "input_image_path": ["./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_199_0.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_199_1.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_199_2.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_199_3.jpg", "./2D-spatial/jigsaw_puzzle_solving/jigsaw_puzzle_solving_199_4.jpg"], "output": "B", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/meme_vedio_understanding/qwen3-vl/metadata_info.json b/results/meme_vedio_understanding/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..678f81e --- /dev/null +++ b/results/meme_vedio_understanding/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Tolerable by humans, intolerable by dogs.\nB: Okay for humans, not okay for dogs.\nC: Acceptable for people, unacceptable for canines.\nD: Endurable by humans, not bearable by dogs.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Tolerable by humans, intolerable by dogs.\nB: Okay for humans, not okay for dogs.\nC: Acceptable for people, unacceptable for canines.\nD: Endurable by humans, not bearable by dogs.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_2.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_0_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The woman was surprised to find a fish in the bag.\nB: A woman was feeding the fish and accidentally dropped the bag.\nC: The woman found a funny note inside the bag instead of a fish.\nD: A woman was carrying a red plastic bag and had the intention of releasing the fish inside. Unfortunately, when she opened the bag, the fish was already dead.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The woman was surprised to find a fish in the bag.\nB: A woman was feeding the fish and accidentally dropped the bag.\nC: The woman found a funny note inside the bag instead of a fish.\nD: A woman was carrying a red plastic bag and had the intention of releasing the fish inside. 
Unfortunately, when she opened the bag, the fish was already dead.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_1_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The button was strategically placed in an inconvenient location, resulting in a humorous sequence where the shoes were utilized to press it.\nB: An unexpected technical glitch caused the button to be out of reach, prompting the comical use of shoes in the video.\nC: The mishap occurred due to the person's clumsiness, leading to a comedic scene involving the use of shoes to activate the button.\nD: 
The distance was too great, making it impossible to press the button by hand, resulting in a funny situation where the shoes were used to press the button instead.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The button was strategically placed in an inconvenient location, resulting in a humorous sequence where the shoes were utilized to press it.\nB: An unexpected technical glitch caused the button to be out of reach, prompting the comical use of shoes in the video.\nC: The mishap occurred due to the person's clumsiness, leading to a comedic scene involving the use of shoes to activate the button.\nD: The distance was too great, making it impossible to press the button by hand, resulting in a funny situation where the shoes were used to press the button instead.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_12.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_2_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The person's hair turns neon green after the escape.\nB: The person miraculously grows a full head of hair after the escape.\nC: Following a miraculous escape, the individual is left with a completely bald head.\nD: The individual ends up with a stylish new haircut after the escape.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The person's hair turns neon green after the escape.\nB: The person miraculously grows a full head of hair after the escape.\nC: Following a miraculous escape, the individual is left with a completely bald head.\nD: The individual ends up with a stylish new haircut after the escape.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_9.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_3_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video is funny because the fish is actually a toy fish, and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing it.\nB: The video is funny because the fish is actually a fake fish, and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing it.\nC: The video is funny because the fish is actually a dead fish, and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing it.\nD: The video is funny because the fish is actually a live fish, and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing it.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video is funny because the fish is actually a toy fish, and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing it.\nB: The video is funny because the fish is actually a fake fish, and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing it.\nC: The video is funny because the fish is actually a dead fish, and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing it.\nD: The video is funny because the fish is actually a live fish, and the store owner wrote \"a sleeping fish\" which is a humorous way of expressing 
it.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_4_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The shot reveals a room occupied by people dressed uniformly in red apparel. A female stands at the rear exit while a young man rises to his feet, placing his right hand on his head. One of the males seated at the front turns his head before twisting it back to its original position. 
The young man who stood up, pats the male seated in front with his hand.\nB: The scene captures a group of people wearing different colored outfits, engaged in various activities as they move around the room.\nC: In the video, a man and a woman engage in a serious conversation while others around them are busy with their activities.\nD: The video features a group of individuals in casual clothing, standing in a room filled with colorful decorations.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The shot reveals a room occupied by people dressed uniformly in red apparel. A female stands at the rear exit while a young man rises to his feet, placing his right hand on his head. One of the males seated at the front turns his head before twisting it back to its original position. The young man who stood up, pats the male seated in front with his hand.\nB: The scene captures a group of people wearing different colored outfits, engaged in various activities as they move around the room.\nC: In the video, a man and a woman engage in a serious conversation while others around them are busy with their activities.\nD: The video features a group of individuals in casual clothing, standing in a room filled with colorful decorations.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_7.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_5_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The comedic nature of the video stems from the individual's portrayal of a character from a well-known game, \"League of Legends,\" in an economical way.\nB: The video is humorous because the individual is mimicking a character from League of Legends called \"Master Yi\" in a cost-effective manner.\nC: The humor in the video comes from the individual's imitation of a character from a game known as \"Master Yi\" in a cost-efficient manner.\nD: The video is funny because the person is impersonating a character from a popular video game called \"League of Legends\" in a budget-friendly way.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The comedic nature of the video stems from the individual's portrayal of a character from a well-known game, \"League of Legends,\" in an economical way.\nB: The video is humorous because the individual is mimicking a character from League of Legends called \"Master Yi\" in a cost-effective manner.\nC: The humor in the video comes from the individual's imitation of a character from a game known as \"Master Yi\" in a cost-efficient manner.\nD: 
The video is funny because the person is impersonating a character from a popular video game called \"League of Legends\" in a budget-friendly way.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_6_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A person in a yellow coat and brown hat stood near a flock of sheep. The person made a sudden gesture, causing the sheep to disperse in haste.\nB: A woman in a blue dress and white hat was surrounded by zebras. 
She shouted, causing the zebras to scatter in fear.\nC: A man wearing a green shirt and yellow cap approached a group of chickens. The man made a sudden movement, causing the chickens to run away in panic.\nD: A lady donning a red dress and a black hat is standing in front of the camera, accompanied by a few cows. The woman swiftly turns her face and scowls, resulting in the cows next to her hastily fleeing.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A person in a yellow coat and brown hat stood near a flock of sheep. The person made a sudden gesture, causing the sheep to disperse in haste.\nB: A woman in a blue dress and white hat was surrounded by zebras. She shouted, causing the zebras to scatter in fear.\nC: A man wearing a green shirt and yellow cap approached a group of chickens. The man made a sudden movement, causing the chickens to run away in panic.\nD: A lady donning a red dress and a black hat is standing in front of the camera, accompanied by a few cows. 
The woman swiftly turns her face and scowls, resulting in the cows next to her hastily fleeing.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_7_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A person enjoying a leisurely run.\nB: A person running under compulsion.\nC: A person sprinting in a race for exercise.\nD: A person participating in a marathon for fun.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A person enjoying a leisurely run.\nB: A person running under compulsion.\nC: A person 
sprinting in a race for exercise.\nD: A person participating in a marathon for fun.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_8_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Fizzy drink challenge\nB: Comparing home-made vs store-bought cola\nC: Real versus artificial cola\nD: Spot the difference: natural vs synthetic soda", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Fizzy drink challenge\nB: Comparing home-made vs store-bought cola\nC: Real versus artificial cola\nD: Spot the difference: 
natural vs synthetic soda", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_9_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The hairdresser realizes the hairdryer is not working and tries to fix it, but accidentally sucks the customer's hair into the dryer.\nB: The hairdresser accidentally sets the customer's hair on fire with the hairdryer.\nC: With the help of a hairdryer, the hairdresser blows the customer's hair, causing all of it to be blown off.\nD: The hairdresser uses the hairdryer to blow the customer's wig off.", "question": "Please generate a 
description for this meme", "context": "Select from the following choices.\nA: The hairdresser realizes the hairdryer is not working and tries to fix it, but accidentally sucks the customer's hair into the dryer.\nB: The hairdresser accidentally sets the customer's hair on fire with the hairdryer.\nC: With the help of a hairdryer, the hairdresser blows the customer's hair, causing all of it to be blown off.\nD: The hairdresser uses the hairdryer to blow the customer's wig off.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_10_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", 
"source": "fun_qa", "options": "A: What's the big deal with the belt?\nB: Feeling stressed about a belt?\nC: Why are you feeling anxious over picking up a simple belt?\nD: Why all the fuss over a belt?", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: What's the big deal with the belt?\nB: Feeling stressed about a belt?\nC: Why are you feeling anxious over picking up a simple belt?\nD: Why all the fuss over a belt?", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_11_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", 
"source": "fun_qa", "options": "A: The upbeat music and flashing lights create an energetic atmosphere, and the child's constant nodding gives the impression of a dance party.\nB: The vibrant music and flickering lights create an atmosphere reminiscent of a jumping rave, and the child's constant nodding gives the impression of an adult enjoying themselves. Therefore, in my opinion, this video is quite captivating.\nC: The slow music and steady lights create a serene atmosphere, and the child's constant nodding gives the impression of boredom.\nD: The dim music and flickering lights create a spooky atmosphere, and the child's constant nodding gives the impression of being hypnotized.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The upbeat music and flashing lights create an energetic atmosphere, and the child's constant nodding gives the impression of a dance party.\nB: The vibrant music and flickering lights create an atmosphere reminiscent of a jumping rave, and the child's constant nodding gives the impression of an adult enjoying themselves. 
Therefore, in my opinion, this video is quite captivating.\nC: The slow music and steady lights create a serene atmosphere, and the child's constant nodding gives the impression of boredom.\nD: The dim music and flickering lights create a spooky atmosphere, and the child's constant nodding gives the impression of being hypnotized.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_12_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The comical struggle of oversized head and chair\nB: The unexpected snare of the oversized head\nC: The chair's revenge 
on the oversized head\nD: The oversized head ensnared by the chair's grip.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The comical struggle of oversized head and chair\nB: The unexpected snare of the oversized head\nC: The chair's revenge on the oversized head\nD: The oversized head ensnared by the chair's grip.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_13_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Ping pong played with imaginary ball and paddle in the air.\nB: 
Hilarious attempt at playing table tennis with invisible opponents.\nC: Playing ping pong in an alternate dimension with invisible equipment.\nD: A funny game of table tennis without the table or the actual ball.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Ping pong played with imaginary ball and paddle in the air.\nB: Hilarious attempt at playing table tennis with invisible opponents.\nC: Playing ping pong in an alternate dimension with invisible equipment.\nD: A funny game of table tennis without the table or the actual ball.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_14_15.png"], "output": "A", 
"qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: As the man stripped, he revealed a funny hat instead of his bald head, leading to laughter from the woman.\nB: The man tripped and fell, causing the woman to burst into laughter and help him up.\nC: While dancing, the man's pants fell down, leading to laughter from the woman who then helped him cover up.\nD: While removing his clothes, the man accidentally lifted his wig, exposing his baldness, which caused the woman to burst into laughter. Without hesitation, she stood up and helped him secure the wig back in place.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: As the man stripped, he revealed a funny hat instead of his bald head, leading to laughter from the woman.\nB: The man tripped and fell, causing the woman to burst into laughter and help him up.\nC: While dancing, the man's pants fell down, leading to laughter from the woman who then helped him cover up.\nD: While removing his clothes, the man accidentally lifted his wig, exposing his baldness, which caused the woman to burst into laughter. 
Without hesitation, she stood up and helped him secure the wig back in place.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_15_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A Golden Retriever is running away from its owner who is trying to give it a bath.\nB: A Golden Retriever is sitting in the bathtub with its owner playing music in the background.\nC: A Golden Retriever is barking at its owner in the bathroom.\nD: A Golden Retriever is lying in front of the toilet bowl, making a sound, and its owner is patting its back from behind.", 
"question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A Golden Retriever is running away from its owner who is trying to give it a bath.\nB: A Golden Retriever is sitting in the bathtub with its owner playing music in the background.\nC: A Golden Retriever is barking at its owner in the bathroom.\nD: A Golden Retriever is lying in front of the toilet bowl, making a sound, and its owner is patting its back from behind.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_16_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": 
"fun_qa", "options": "A: A person slipped and slid down the icy stairs to the very bottom, and the second person, who observed the first person's fall, also slipped and fell.\nB: The first person managed to avoid slipping and falling down the icy stairs.\nC: The second person smoothly maneuvered around the icy stairs without any accidents.\nD: A person gracefully descended the icy stairs without any mishaps.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A person slipped and slid down the icy stairs to the very bottom, and the second person, who observed the first person's fall, also slipped and fell.\nB: The first person managed to avoid slipping and falling down the icy stairs.\nC: The second person smoothly maneuvered around the icy stairs without any accidents.\nD: A person gracefully descended the icy stairs without any mishaps.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_11.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_17_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The humorous part is that the tap is triggered by a sensor with a small time lag, and turning it on manually is too quick to wash thoroughly. This scenario appears rather amusing.\nB: The humorous aspect is the delay in the water flow caused by the sensor, leading to a comical situation.\nC: The humor comes from the unexpected water flow timing which catches the person off-guard.\nD: The comedic effect is achieved by the water faucet turning on automatically while the person's hand is still nearby.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The humorous part is that the tap is triggered by a sensor with a small time lag, and turning it on manually is too quick to wash thoroughly. 
This scenario appears rather amusing.\nB: The humorous aspect is the delay in the water flow caused by the sensor, leading to a comical situation.\nC: The humor comes from the unexpected water flow timing which catches the person off-guard.\nD: The comedic effect is achieved by the water faucet turning on automatically while the person's hand is still nearby.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_18_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A dog ran across the screen, completely ignoring the boy's laughter.\nB: The little boy 
started crying instead of laughing, which made everyone around him start crying too.\nC: A group of adults were standing in the background, looking bored and uninterested.\nD: A little boy in a suit stood in the crowd, laughing so hard that he fell to the ground and tore his pants.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A dog ran across the screen, completely ignoring the boy's laughter.\nB: The little boy started crying instead of laughing, which made everyone around him start crying too.\nC: A group of adults were standing in the background, looking bored and uninterested.\nD: A little boy in a suit stood in the crowd, laughing so hard that he fell to the ground and tore his pants.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_13.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_19_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The man's attempt to place the final tile was a failure, and he became frustrated as the tiles fell one by one.\nB: The man's attempt to place the final tile was successful, but soon after, the tiles began to fall one by one until all of them had fallen. The man found it tragic and cried.\nC: The man's attempt to place the final tile was successful, but soon after, the tiles began to fall one by one until all of them had fallen. The man found it amusing and laughed.\nD: The man's attempt to place the final tile was successful, but soon after, the tiles began to fall one by one until all of them had fallen. The man found it confusing and became silent.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The man's attempt to place the final tile was a failure, and he became frustrated as the tiles fell one by one.\nB: The man's attempt to place the final tile was successful, but soon after, the tiles began to fall one by one until all of them had fallen. The man found it tragic and cried.\nC: The man's attempt to place the final tile was successful, but soon after, the tiles began to fall one by one until all of them had fallen. The man found it amusing and laughed.\nD: The man's attempt to place the final tile was successful, but soon after, the tiles began to fall one by one until all of them had fallen. 
The man found it confusing and became silent.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_20_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video shows a cat wearing a hat with a funny expression on its face.\nB: In the frame, there are two fingers with a piece of fabric resembling eyes and a mouth, cut into three openings. 
Subsequently, the fabric is placed on the cat's head.\nC: In the video, the cat is shown playing with a toy that resembles a human face.\nD: The cat in the video has been digitally altered to look like it is talking.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video shows a cat wearing a hat with a funny expression on its face.\nB: In the frame, there are two fingers with a piece of fabric resembling eyes and a mouth, cut into three openings. Subsequently, the fabric is placed on the cat's head.\nC: In the video, the cat is shown playing with a toy that resembles a human face.\nD: The cat in the video has been digitally altered to look like it is talking.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_13.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_21_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The humor in the video relied too heavily on offensive jokes.\nB: The video failed to capture the serious nature of the wedding event.\nC: The comical element of the video was overshadowed by awkwardness and discomfort.\nD: The groomsman's inquiry was inappropriate for the occasion of the wedding, but it stirred up thoughts of a comical relationship with a friend, resulting in amusement.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The humor in the video relied too heavily on offensive jokes.\nB: The video failed to capture the serious nature of the wedding event.\nC: The comical element of the video was overshadowed by awkwardness and discomfort.\nD: The groomsman's inquiry was inappropriate for the occasion of the wedding, but it stirred up thoughts of a comical relationship with a friend, resulting in amusement.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_7.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_22_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Is Ultraman looking for a new mode of transportation?\nB: Is Ultraman practicing cow wrangling techniques?\nC: Is Ultraman a professional cow herder?\nD: Does Ultraman also require the services of a cow to be led?", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Is Ultraman looking for a new mode of transportation?\nB: Is Ultraman practicing cow wrangling techniques?\nC: Is Ultraman a professional cow herder?\nD: Does Ultraman also require the services of a cow to be led?", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_5.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_23_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A man closed the door of the bathroom.\nB: A man pushed open the door of the bathroom.\nC: A man opened the door of the bedroom.\nD: A woman opened the door of the bathroom.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A man closed the door of the bathroom.\nB: A man pushed open the door of the bathroom.\nC: A man opened the door of the bedroom.\nD: A woman opened the door of the bathroom.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_4.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_24_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The first person's hilarious fall down the stairs, one step at a time, was topped by the second person who repeated the same and fell down as well.\nB: The unexpected nature of the falls\nC: The synchronized falling of two people\nD: The well-choreographed tumble down the stairs", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The first person's hilarious fall down the stairs, one step at a time, was topped by the second person who repeated the same and fell down as well.\nB: The unexpected nature of the falls\nC: The synchronized falling of two people\nD: The well-choreographed tumble down the stairs", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_1.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_25_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: What's the point of this game?\nB: Can we just stop playing?\nC: Are we done with the game yet?\nD: Why aren't we continuing the game?", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: What's the point of this game?\nB: Can we just stop playing?\nC: Are we done with the game yet?\nD: Why aren't we continuing the game?", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_1.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_26_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Singing dog.\nB: Resting fish.\nC: Flying elephant.\nD: Dancing cat.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Singing dog.\nB: Resting fish.\nC: Flying elephant.\nD: Dancing cat.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_3.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_27_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Footwear, come and lend a hand.\nB: Boots, come and support me.\nC: Shoes, come and help me.\nD: Sneakers, come and assist me.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Footwear, come and lend a hand.\nB: Boots, come and support me.\nC: Shoes, come and help me.\nD: Sneakers, come and assist me.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_4.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_28_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Uncontrollable eruption of chuckles\nB: I had a sudden laugh of release.\nC: Unexpected burst of laughter\nD: Spontaneous outburst of giggles", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Uncontrollable eruption of chuckles\nB: I had a sudden laugh of release.\nC: Unexpected burst of laughter\nD: Spontaneous outburst of giggles", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_4.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_29_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The child's reaction to realizing the cola was different from what he expected, resulting in the humor of the video.\nB: The child mistakenly took a bite of the girl's hand instead of the cola, which caused the humor.\nC: The girl intentionally tricked the child into biting her hand instead of the cola, leading to the comedic effect.\nD: Believing that the cola in the girl's hand was the same as the one she handed him, the child took a bite only to realize it wasn't, and the situation was humorous.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The child's reaction to realizing the cola was different from what he expected, resulting in the humor of the video.\nB: The child mistakenly took a bite of the girl's hand instead of the cola, which caused the humor.\nC: The girl intentionally tricked the child into biting her 
hand instead of the cola, leading to the comedic effect.\nD: Believing that the cola in the girl's hand was the same as the one she handed him, the child took a bite only to realize it wasn't, and the situation was humorous.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_30_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Liberation? No, it's murder!\nB: Emancipation? No, it's carnage!\nC: Release? No, it's chaos!\nD: Freedom? 
No, it's mayhem!", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Liberation? No, it's murder!\nB: Emancipation? No, it's carnage!\nC: Release? No, it's chaos!\nD: Freedom? No, it's mayhem!", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_31_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Swift Sword Savior\nB: Rapid Dagger Martyr\nC: Quick Knife Sinner\nD: Fast Blade Saint", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: 
Swift Sword Savior\nB: Rapid Dagger Martyr\nC: Quick Knife Sinner\nD: Fast Blade Saint", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_32_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The fast movement of the dog.\nB: The unexpected appearance of a panda in the video.\nC: The funny dance moves of the person in the panda costume.\nD: The person at the back dressed in a panda costume cannot manage the dog in front, and hence, has to run alongside the dog.", "question": "Please generate a description for this meme", "context": "Select from the 
following choices.\nA: The fast movement of the dog.\nB: The unexpected appearance of a panda in the video.\nC: The funny dance moves of the person in the panda costume.\nD: The person at the back dressed in a panda costume cannot manage the dog in front, and hence, has to run alongside the dog.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_33_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The best man tripped and fell between the groom and the bride while carrying the rings.\nB: The best man was standing next to the groom and the bride and 
made a serious expression.\nC: The best man stands between the groom and the bride and asks the groom, \"What about me?\"\nD: The best man asked the bride instead of the groom, \"What about me?\"", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The best man tripped and fell between the groom and the bride while carrying the rings.\nB: The best man was standing next to the groom and the bride and made a serious expression.\nC: The best man stands between the groom and the bride and asks the groom, \"What about me?\"\nD: The best man asked the bride instead of the groom, \"What about me?\"", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_14.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_34_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A man trying to dance but failing miserably.\nB: A young kid who is enthusiastically bopping to the rhythm of hip-hop music.\nC: A group of elderly people doing a slow dance to classical music.\nD: A cat enthusiastically bopping to the rhythm of heavy metal music.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A man trying to dance but failing miserably.\nB: A young kid who is enthusiastically bopping to the rhythm of hip-hop music.\nC: A group of elderly people doing a slow dance to classical music.\nD: A cat enthusiastically bopping to the rhythm of heavy metal music.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_11.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_35_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: When you wash your hands at the sink, if you forget to turn off the tap, the water might keep running, leading to an unexpected wet surprise.\nB: When you wash your hands at the sink, if you select the correct faucet, the water might spill out of the adjacent faucet, causing you to overlook cleaning your hands.\nC: When you wash your hands at the sink, if you select the toilet instead of the sink, the water might spill out, causing a funny mix-up.\nD: When you wash your hands at the sink, if you select the incorrect faucet, the water might spill out of the adjacent faucet, causing you to overlook cleaning your hands.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: When you wash your hands at the sink, if you forget to turn off the tap, the water might keep running, leading to an unexpected wet surprise.\nB: When you wash your hands at the sink, if you select the correct faucet, the water might spill out of the adjacent faucet, causing you to overlook cleaning your hands.\nC: When you wash your hands at the sink, if you select the toilet instead of the sink, the water might spill out, causing a funny mix-up.\nD: When you wash your hands at the sink, if you select the incorrect faucet, the water might spill out of the adjacent faucet, causing you to overlook cleaning your hands.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_0.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_36_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A person gets their head stuck in a chair and struggles to get free\nB: The man tries to perform a stunt but fails miserably\nC: With the intention of squeezing through the gap between the chair's seats, the man finds his head tightly trapped, leading to a series of unsuccessful spins without achieving freedom.\nD: The man accidentally falls off the chair while attempting a funny move", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A person gets their head stuck in a chair and struggles to 
get free\nB: The man tries to perform a stunt but fails miserably\nC: With the intention of squeezing through the gap between the chair's seats, the man finds his head tightly trapped, leading to a series of unsuccessful spins without achieving freedom.\nD: The man accidentally falls off the chair while attempting a funny move", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_37_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The funny part is the woman's fear of the cow, which is both surprising and amusing.\nB: The humor in this comes from the 
cow's reaction, which is unexpected and hilarious.\nC: The humor in this lies in the woman's attempt to scare the cow, which is an absurd and comical act.\nD: The video's humor stems from the cow's confusion, creating a lighthearted and entertaining moment.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The funny part is the woman's fear of the cow, which is both surprising and amusing.\nB: The humor in this comes from the cow's reaction, which is unexpected and hilarious.\nC: The humor in this lies in the woman's attempt to scare the cow, which is an absurd and comical act.\nD: The video's humor stems from the cow's confusion, creating a lighthearted and entertaining moment.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_13.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_38_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: When your outfit doesn't impress the cat\nB: A cat's opinion on your fashion choices\nC: The cat is not a fan of your mask\nD: Your mask might not be appreciated by cats.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: When your outfit doesn't impress the cat\nB: A cat's opinion on your fashion choices\nC: The cat is not a fan of your mask\nD: Your mask might not be appreciated by cats.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_13.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_39_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Transporting wig parts to their end location.\nB: Shipping wig pieces to their destination.\nC: Sending wig components to their ultimate destination.\nD: Delivering hairpieces to their final stop.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Transporting wig parts to their end location.\nB: Shipping wig pieces to their destination.\nC: Sending wig components to their ultimate destination.\nD: Delivering hairpieces to their final stop.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_12.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_40_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A child and two dogs are playing with a ball on the beach.\nB: The dogs are barking at the child while playing on the beach.\nC: The child is chasing the dogs on the beach.\nD: A child and two dogs are lying on the beach. The child kicks one of the dogs with their foot, and the dog gets up and retaliates by digging sand back at the child.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A child and two dogs are playing with a ball on the beach.\nB: The dogs are barking at the child while playing on the beach.\nC: The child is chasing the dogs on the beach.\nD: A child and two dogs are lying on the beach. 
The child kicks one of the dogs with their foot, and the dog gets up and retaliates by digging sand back at the child.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_41_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The use of dramatic lighting and sound effects contributes to the humor.\nB: The unexpected appearance of a superhero in a cowboy role creates the humor.\nC: The camera captures a rural setting, where a man is seen dressed up as Ultraman and performing the actions of a cowboy. 
This contrast in the environment and the character of Ultraman makes the scene quite amusing.\nD: The use of futuristic technology in a traditional rural setting adds to the humor.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The use of dramatic lighting and sound effects contributes to the humor.\nB: The unexpected appearance of a superhero in a cowboy role creates the humor.\nC: The camera captures a rural setting, where a man is seen dressed up as Ultraman and performing the actions of a cowboy. This contrast in the environment and the character of Ultraman makes the scene quite amusing.\nD: The use of futuristic technology in a traditional rural setting adds to the humor.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_13.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_42_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Makeover for a bald head.\nB: Transformation for a bald head.\nC: Haircut for a bald head.\nD: Shaving day for a bald head.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Makeover for a bald head.\nB: Transformation for a bald head.\nC: Haircut for a bald head.\nD: Shaving day for a bald head.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_14.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_43_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Pants: Allow me to begin with a hearty laugh.\nB: Skirt: Let's kick off with a bored yawn.\nC: Shorts: Let's kick off with a hearty cry.\nD: Shirt: Let's start with a sad sigh.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Pants: Allow me to begin with a hearty laugh.\nB: Skirt: Let's kick off with a bored yawn.\nC: Shorts: Let's kick off with a hearty cry.\nD: Shirt: Let's start with a sad sigh.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_13.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_44_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The tense music in the background adds to the suspense and makes the situation funnier.\nB: The first two people thought that the bathroom door needed to be pulled open, and they pulled hard but couldn't open it. Then a man who came later easily pushed the door open and went into the bathroom to wash his hands. This contrast is very funny.\nC: The unexpected twist at the end, where the man effortlessly opens the door, creates the comedic effect.\nD: The exaggerated facial expressions of the characters make the situation comical.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The tense music in the background adds to the suspense and makes the situation funnier.\nB: The first two people thought that the bathroom door needed to be pulled open, and they pulled hard but couldn't open it. Then a man who came later easily pushed the door open and went into the bathroom to wash his hands. 
This contrast is very funny.\nC: The unexpected twist at the end, where the man effortlessly opens the door, creates the comedic effect.\nD: The exaggerated facial expressions of the characters make the situation comical.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_45_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The humor in the video arises from the Golden Retriever's thoughtful decision-making process before consuming the beer.\nB: The humor is derived from the Golden Retriever's graceful and elegant behavior while drinking beer, 
showcasing his refined taste and manners.\nC: The video's humor comes from the Golden Retriever's responsible drinking habits and his ability to handle alcohol well.\nD: The Golden Retriever started by sneakily drinking beer, but his limited capacity for alcohol led to him clutching the toilet bowl and vomiting. It was quite a comical sight to see the Golden Retriever holding onto the toilet bowl while throwing up.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The humor in the video arises from the Golden Retriever's thoughtful decision-making process before consuming the beer.\nB: The humor is derived from the Golden Retriever's graceful and elegant behavior while drinking beer, showcasing his refined taste and manners.\nC: The video's humor comes from the Golden Retriever's responsible drinking habits and his ability to handle alcohol well.\nD: The Golden Retriever started by sneakily drinking beer, but his limited capacity for alcohol led to him clutching the toilet bowl and vomiting. 
It was quite a comical sight to see the Golden Retriever holding onto the toilet bowl while throwing up.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_46_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The person's initial approach seemed like they were catching a dangerous animal, but the final outcome of catching a belt was mundane.\nB: The person's initial careful approach seemed like they were catching a live fish, but the final outcome of catching a belt was amusing.\nC: The person's initial careful approach seemed like they were catching 
a snake, but the final outcome of catching a belt was comical.\nD: The person's initial careful approach seemed like they were catching a valuable item, but the final outcome of catching a belt was disappointing.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The person's initial approach seemed like they were catching a dangerous animal, but the final outcome of catching a belt was mundane.\nB: The person's initial careful approach seemed like they were catching a live fish, but the final outcome of catching a belt was amusing.\nC: The person's initial careful approach seemed like they were catching a snake, but the final outcome of catching a belt was comical.\nD: The person's initial careful approach seemed like they were catching a valuable item, but the final outcome of catching a belt was disappointing.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_11.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_47_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The comedic effect is due to the impressive special effects used to create the illusion of a bouncing ball.\nB: The humor comes from the intense and competitive facial expressions of the people playing table tennis.\nC: The video is not comedic; it is a serious demonstration of table tennis skills.\nD: Two people were pretending to play table tennis, but there was no ball involved. The sound of the ball hitting the paddle was actually the person next to them patting their stomach, which was hilarious.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The comedic effect is due to the impressive special effects used to create the illusion of a bouncing ball.\nB: The humor comes from the intense and competitive facial expressions of the people playing table tennis.\nC: The video is not comedic; it is a serious demonstration of table tennis skills.\nD: Two people were pretending to play table tennis, but there was no ball involved. 
The sound of the ball hitting the paddle was actually the person next to them patting their stomach, which was hilarious.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_48_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A red-haired woman looked in the mirror and combed her long hair elegantly.\nB: A red-haired woman with long hair approached the mirror and was surprised to find a different hairstyle.\nC: A red-haired woman with long hair approached the mirror and lowered her head, only to find a bald patch on top of her head.\nD: A red-haired 
woman with long hair approached the mirror and laughed at her reflection.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A red-haired woman looked in the mirror and combed her long hair elegantly.\nB: A red-haired woman with long hair approached the mirror and was surprised to find a different hairstyle.\nC: A red-haired woman with long hair approached the mirror and lowered her head, only to find a bald patch on top of her head.\nD: A red-haired woman with long hair approached the mirror and laughed at her reflection.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_49_15.png"], "output": "C", "qwen3-vl": "image 
none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: As the lid of the wooden board is lifted, a cat suddenly pops out and starts meowing loudly before darting away.\nB: After lifting the lid of the wooden board, a loud horn noise is heard, and a clown pops out with confetti before disappearing.\nC: When one person lifts the lid of a wooden board, the other person immediately appears, sticking their head out. The second person shakes slightly, and then turns around, leaving the scene.\nD: One person opens the lid of a plastic container, and the other person jumps out with a scary expression before quickly running off.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: As the lid of the wooden board is lifted, a cat suddenly pops out and starts meowing loudly before darting away.\nB: After lifting the lid of the wooden board, a loud horn noise is heard, and a clown pops out with confetti before disappearing.\nC: When one person lifts the lid of a wooden board, the other person immediately appears, sticking their head out. 
The second person shakes slightly, and then turns around, leaving the scene.\nD: One person opens the lid of a plastic container, and the other person jumps out with a scary expression before quickly running off.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_50_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A man is seated on a ledge near the water, dressed in only a single pair of pants. He extends his arms, and a small dog leaps and unintentionally crashes into the man's groin area. 
Subsequently, the man doubles over, grimacing in pain as he holds his groin.\nB: The man is practicing yoga on the edge of the water and suddenly loses balance.\nC: A man is sitting calmly by the water, enjoying the peaceful scenery.\nD: The man stands up and starts dancing to the music playing in the background.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A man is seated on a ledge near the water, dressed in only a single pair of pants. He extends his arms, and a small dog leaps and unintentionally crashes into the man's groin area. Subsequently, the man doubles over, grimacing in pain as he holds his groin.\nB: The man is practicing yoga on the edge of the water and suddenly loses balance.\nC: A man is sitting calmly by the water, enjoying the peaceful scenery.\nD: The man stands up and starts dancing to the music playing in the background.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_11.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_51_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video's humor arises from the slow-motion replay of the husky's fall, exaggerating its clumsiness.\nB: The video's humor is derived from the realistic CGI effects used to make the husky's fall look convincing.\nC: The video's amusement factor stems from the husky's carefree gait causing it to tumble off the bridge, which is undoubtedly humorous.\nD: The video's humor comes from the serious music in the background creating a contrasting effect with the husky's actions.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video's humor arises from the slow-motion replay of the husky's fall, exaggerating its clumsiness.\nB: The video's humor is derived from the realistic CGI effects used to make the husky's fall look convincing.\nC: The video's amusement factor stems from the husky's carefree gait causing it to tumble off the bridge, which is undoubtedly humorous.\nD: The video's humor comes from the serious music in the background creating a contrasting effect with the husky's actions.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_3.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_52_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video lacks any humor or comedic elements, and is intended to be informative.\nB: The video is serious and educational, aimed at teaching important life lessons.\nC: The video is a heartwarming and emotional portrayal of the student-coach relationship, without any comedic elements.\nD: The coach's perplexing behavior in order to teach the student was amusing.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video lacks any humor or comedic elements, and is intended to be informative.\nB: The video is serious and educational, aimed at teaching important life lessons.\nC: The video is a heartwarming and emotional portrayal of the student-coach relationship, without any comedic elements.\nD: The coach's perplexing behavior in order to 
teach the student was amusing.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_53_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A dog is carrying a red balloon on its back while running happily.\nB: A plastic bottle emitting blue smoke is being carried on the back of a dog while it walks ahead.\nC: A cat is pulling a cart with a red bottle on it while walking leisurely.\nD: A squirrel is carrying a yellow bag on its back while scurrying around.", "question": "Please generate a description for this meme", "context": "Select from the following 
choices.\nA: A dog is carrying a red balloon on its back while running happily.\nB: A plastic bottle emitting blue smoke is being carried on the back of a dog while it walks ahead.\nC: A cat is pulling a cart with a red bottle on it while walking leisurely.\nD: A squirrel is carrying a yellow bag on its back while scurrying around.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_54_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Hardcore Sit-ups to Heavy Metal\nB: Rock and Roll Yoga Session\nC: Dance Party Workout\nD: A sit-up exercise routine 
inspired by rock and roll movements.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Hardcore Sit-ups to Heavy Metal\nB: Rock and Roll Yoga Session\nC: Dance Party Workout\nD: A sit-up exercise routine inspired by rock and roll movements.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_55_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The man's frustration with the hand sanitizer dispenser is understandable and not intended to be humorous.\nB: The man's reaction to the 
malfunctioning hand sanitizer dispenser is distressing and not funny at all.\nC: The man finds the malfunctioning hand sanitizer dispenser annoying but not in a humorous way.\nD: The man becomes infuriated as the hand sanitizer dispenser keeps dispensing the liquid after he's done cleaning his hands, causing him to toss the cloth he was using, which is quite comical.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The man's frustration with the hand sanitizer dispenser is understandable and not intended to be humorous.\nB: The man's reaction to the malfunctioning hand sanitizer dispenser is distressing and not funny at all.\nC: The man finds the malfunctioning hand sanitizer dispenser annoying but not in a humorous way.\nD: The man becomes infuriated as the hand sanitizer dispenser keeps dispensing the liquid after he's done cleaning his hands, causing him to toss the cloth he was using, which is quite comical.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_10.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_56_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The perfectly executed somersault by the child\nB: The child's impeccable balance and gracefulness\nC: The child fell down because his left hand did not support him, which he did not anticipate. This accident seemed comical to the onlookers.\nD: The child's careful and calculated movement", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The perfectly executed somersault by the child\nB: The child's impeccable balance and gracefulness\nC: The child fell down because his left hand did not support him, which he did not anticipate. 
This accident seemed comical to the onlookers.\nD: The child's careful and calculated movement", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_57_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A man in a chicken costume playing the guitar in front of a crowd.\nB: A black and white cow-patterned feline is crammed inside a water glass and wildly shaking its head.\nC: A small brown dog wearing a hat and sunglasses is riding a skateboard in the park.\nD: A group of colorful parrots singing and dancing on a tree branch.", "question": "Please generate 
a description for this meme", "context": "Select from the following choices.\nA: A man in a chicken costume playing the guitar in front of a crowd.\nB: A black and white cow-patterned feline is crammed inside a water glass and wildly shaking its head.\nC: A small brown dog wearing a hat and sunglasses is riding a skateboard in the park.\nD: A group of colorful parrots singing and dancing on a tree branch.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_58_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The man's never-ending fall is a source of 
laughter.\nB: The background music adds to the comedic effect.\nC: The sudden change in lighting creates a hilarious atmosphere.\nD: The unexpected appearance of a dancing dog steals the show.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The man's never-ending fall is a source of laughter.\nB: The background music adds to the comedic effect.\nC: The sudden change in lighting creates a hilarious atmosphere.\nD: The unexpected appearance of a dancing dog steals the show.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_59_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", 
"visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Pretending to recognize and be familiar with strangers.\nB: Pretending to know the lyrics to a song at a karaoke night.\nC: Acting like a professional chef in a cooking show.\nD: Attempting to blend in with a group of tourists.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Pretending to recognize and be familiar with strangers.\nB: Pretending to know the lyrics to a song at a karaoke night.\nC: Acting like a professional chef in a cooking show.\nD: Attempting to blend in with a group of tourists.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_14.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_60_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The secret to clean hands revealed.\nB: The problem of washing hands.\nC: The art of washing hands.\nD: A guide to perfect hand hygiene.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The secret to clean hands revealed.\nB: The problem of washing hands.\nC: The art of washing hands.\nD: A guide to perfect hand hygiene.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_14.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_61_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The chair is about to collapse due to someone sitting on it. A special effect is used to show a person struggling to hold the legs of two chairs with a rope, which looks a bit funny.\nB: The video is comedic because the chairs are perfectly stable and nothing interesting happens.\nC: The video is comedic because the person sitting on the chair is not struggling at all and everything seems normal.\nD: The video is comedic because the special effect makes the chair collapse in a serious and dangerous way.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The chair is about to collapse due to someone sitting on it. A special effect is used to show a person struggling to hold the legs of two chairs with a rope, which looks a bit funny.\nB: The video is comedic because the chairs are perfectly stable and nothing interesting happens.\nC: The video is comedic because the person sitting on the chair is not struggling at all and everything seems normal.\nD: The video is comedic because the special effect makes the chair collapse in a serious and dangerous way.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_6.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_62_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A man is standing on a pedestal with one foot on a brick, while another man attempts to break the brick with his hand. The brick unexpectedly shifts, causing the man on the pedestal to lose his balance and fall.\nB: A man is attempting a dangerous stunt on a tall pedestal\nC: Two men are engaging in a physical fight on a platform\nD: A man is performing a risky acrobatic maneuver with another person", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A man is standing on a pedestal with one foot on a brick, while another man attempts to break the brick with his hand. 
The brick unexpectedly shifts, causing the man on the pedestal to lose his balance and fall.\nB: A man is attempting a dangerous stunt on a tall pedestal\nC: Two men are engaging in a physical fight on a platform\nD: A man is performing a risky acrobatic maneuver with another person", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_63_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The umbrella was carried away gently by the wind, without causing any trouble.\nB: Both individuals were laughing and having a great time even after the umbrella was 
taken away.\nC: The two individuals were enjoying a pleasant walk under the umbrella.\nD: Two individuals were finding it difficult to walk together under one umbrella, but suddenly a strong wind carried the umbrella away. One of them gazed sadly at the disappearing umbrella.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The umbrella was carried away gently by the wind, without causing any trouble.\nB: Both individuals were laughing and having a great time even after the umbrella was taken away.\nC: The two individuals were enjoying a pleasant walk under the umbrella.\nD: Two individuals were finding it difficult to walk together under one umbrella, but suddenly a strong wind carried the umbrella away. One of them gazed sadly at the disappearing umbrella.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_12.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_64_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Old high school buddies meeting again after a lifetime.\nB: Two best friends caught in a hilarious mix-up.\nC: Long-lost cousins reunited after a decade apart.\nD: Brothers who had lost each other's whereabouts for many years.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Old high school buddies meeting again after a lifetime.\nB: Two best friends caught in a hilarious mix-up.\nC: Long-lost cousins reunited after a decade apart.\nD: Brothers who had lost each other's whereabouts for many years.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_10.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_65_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video had no humor, it was just a serious demonstration of drink balancing\nB: Every drink was oddly askew, but the cups managed to catch not even a single spill.\nC: The way the drinks were perfectly aligned and the cups caught every spill\nD: The drinks were spilled all over the place and the cups were knocked over", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video had no humor, it was just a serious demonstration of drink balancing\nB: Every drink was oddly askew, but the cups managed to catch not even a single spill.\nC: The way the drinks were perfectly aligned and the cups caught every spill\nD: The drinks were spilled all over the place and the cups were knocked over", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_6.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_66_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video's humor comes from Miller's impressive magic skills displayed in the video.\nB: The video humorously depicts Miller successfully pulling off the magic trick.\nC: The video's comical element stems from its revelation of Miller's magic trick from a different perspective, making the entire process appear foolish and amusing.\nD: The video is funny because of Miller's serious demeanor during the magic trick.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video's humor comes from Miller's impressive magic skills displayed in the video.\nB: The video humorously depicts Miller successfully pulling off the magic trick.\nC: The video's comical element stems from its revelation of Miller's magic trick from a different perspective, making the entire process appear foolish and amusing.\nD: The video is funny because of Miller's serious demeanor during the magic trick.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_0.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_67_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The male protagonist inserted the key of the tractor into the mainframe and turned it hard. 
The sound of the engine igniting was accompanied by the background, and then the computer was turned on.\nB: The male protagonist accidentally started the tractor while trying to turn on the computer.\nC: The male protagonist tried to start the tractor using his phone, but it didn't work.\nD: The female protagonist struggled to start the tractor, but eventually managed to get it running.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The male protagonist inserted the key of the tractor into the mainframe and turned it hard. The sound of the engine igniting was accompanied by the background, and then the computer was turned on.\nB: The male protagonist accidentally started the tractor while trying to turn on the computer.\nC: The male protagonist tried to start the tractor using his phone, but it didn't work.\nD: The female protagonist struggled to start the tractor, but eventually managed to get it running.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_10.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_68_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The best way to cure baldness is using the right shampoo.\nB: The usage of shampoo is not necessary for individuals who have a bald head.\nC: Using shampoo for bald people helps in growing hair back.\nD: Shampoo can make bald people look even more bald.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The best way to cure baldness is using the right shampoo.\nB: The usage of shampoo is not necessary for individuals who have a bald head.\nC: Using shampoo for bald people helps in growing hair back.\nD: Shampoo can make bald people look even more bald.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_8.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_69_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: We are distinct.\nB: We are indistinct.\nC: They are identical.\nD: Their differences are unclear.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: We are distinct.\nB: We are indistinct.\nC: They are identical.\nD: Their differences are unclear.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_9.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_70_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The man was standing still when the tomato sauce explosion happened, and he did not react at all.\nB: The man was anxiously eating his burger and then he spilled the ketchup.\nC: The man was peacefully eating his burger and nothing interesting happened.\nD: The man was comfortably munching on his burger when he was suddenly hit with a tomato sauce explosion, and his expression of disbelief was absolutely hysterical.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The man was standing still when the tomato sauce explosion happened, and he did not react at all.\nB: The man was anxiously eating his burger and then he spilled the ketchup.\nC: The man was peacefully eating his burger and nothing interesting happened.\nD: The man was comfortably munching on his burger when he was suddenly hit with a tomato sauce explosion, and his expression of disbelief was absolutely hysterical.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_3.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_71_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: When the video started, only the legs were visible, but as the camera turned, a man was seen engrossed in playing games on his mobile phone.\nB: Initially, it looked like a serious interview, but then the interviewee started singing a popular song loudly.\nC: At the beginning of the video, it seemed like a cooking show, but suddenly, a cat appeared and knocked down all the ingredients.\nD: In the opening scene, it appeared to be a wildlife documentary, but then the camera zoomed out to reveal it was just a pet cat acting as if it was in the wild.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: When the video started, only the legs were visible, but as the camera turned, a man was seen engrossed in playing games on his mobile phone.\nB: 
Initially, it looked like a serious interview, but then the interviewee started singing a popular song loudly.\nC: At the beginning of the video, it seemed like a cooking show, but suddenly, a cat appeared and knocked down all the ingredients.\nD: In the opening scene, it appeared to be a wildlife documentary, but then the camera zoomed out to reveal it was just a pet cat acting as if it was in the wild.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_72_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A canine is present on the couch while a man 
carrying a backpack and a sound system tied to his hand is hopping and jumping from left to right, mimicking a bird's movements, and the dog's head is tracking his actions.\nB: The man and the dog are both sleeping on the couch, and the sound system is playing music in the background.\nC: A man is sitting on the couch with the dog, while a bird is hopping and jumping from left to right and the man's head is tracking its movements.\nD: The man is standing still, while the dog is barking and running around the room, trying to catch the bird.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A canine is present on the couch while a man carrying a backpack and a sound system tied to his hand is hopping and jumping from left to right, mimicking a bird's movements, and the dog's head is tracking his actions.\nB: The man and the dog are both sleeping on the couch, and the sound system is playing music in the background.\nC: A man is sitting on the couch with the dog, while a bird is hopping and jumping from left to right and the man's head is tracking its movements.\nD: The man is standing still, while the dog is barking and running around the room, trying to catch the bird.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_7.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_73_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The unexpected appearance of a young woman\nB: The use of fast-paced music in the background\nC: The presence of a colorful, tropical backdrop\nD: Initially, it may be assumed that these legs are of a female, but surprisingly they belong to a bald-headed old man, which creates a comical contrast.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The unexpected appearance of a young woman\nB: The use of fast-paced music in the background\nC: The presence of a colorful, tropical backdrop\nD: Initially, it may be assumed that these legs are of a female, but surprisingly they belong to a bald-headed old man, which creates a comical contrast.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_3.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_74_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Life as a coach is a breeze!\nB: Being a coach is a challenging task.\nC: Being a coach is a walk in the park.\nD: Coaching is an easy job.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Life as a coach is a breeze!\nB: Being a coach is a challenging task.\nC: Being a coach is a walk in the park.\nD: Coaching is an easy job.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_3.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_75_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A person is holding a rope with both hands, each hand grabbing one end of the rope.\nB: A person is using a rope to tie a knot in the video.\nC: A person is holding a snake with both hands in the video.\nD: A person is holding a hose with both hands, spraying water in the video.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A person is holding a rope with both hands, each hand grabbing one end of the rope.\nB: A person is using a rope to tie a knot in the video.\nC: A person is holding a snake with both hands in the video.\nD: A person is holding a hose with both hands, spraying water in the video.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_0.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_76_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video is comedic because of the serious soundtrack playing in the background, creating a contrast with the funny visuals.\nB: The sight of dogs pushing forward as if they are wearing a jetpack on their back is amusing.\nC: The comedic effect of the video is due to the slow-motion footage of the dogs, making their movements comically exaggerated.\nD: The video is comedic because of the unexpected plot twist at the end, catching the viewers by surprise.", "question": "Please generate a description for this meme", "context": "Select from the following 
choices.\nA: The video is comedic because of the serious soundtrack playing in the background, creating a contrast with the funny visuals.\nB: The sight of dogs pushing forward as if they are wearing a jetpack on their back is amusing.\nC: The comedic effect of the video is due to the slow-motion footage of the dogs, making their movements comically exaggerated.\nD: The video is comedic because of the unexpected plot twist at the end, catching the viewers by surprise.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_77_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": 
"fun_qa", "options": "A: In Front of the Trickery\nB: Beneath the Enchantment\nC: Underneath the Illusion\nD: Behind the Magic", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: In Front of the Trickery\nB: Beneath the Enchantment\nC: Underneath the Illusion\nD: Behind the Magic", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_78_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: When washing a bald head, the shampoo applied slips down easily.\nB: The video features a person using a soap 
dispenser only to have the soap squirt directly onto their face.\nC: The video depicts a person attempting to sit on a wet chair and sliding off.\nD: The video shows a person slipping on a banana peel.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: When washing a bald head, the shampoo applied slips down easily.\nB: The video features a person using a soap dispenser only to have the soap squirt directly onto their face.\nC: The video depicts a person attempting to sit on a wet chair and sliding off.\nD: The video shows a person slipping on a banana peel.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_79_15.png"], 
"output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A man in a white shirt is standing on a skateboard and doing stunts.\nB: A child in a green shirt is playing with a ball and jumping on a trampoline.\nC: A person wearing a blue jacket is dancing in the rain with an umbrella.\nD: A child wearing an orange shirt is holding a stick in their right hand, extending their left hand, and then falling to the left on the ground.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A man in a white shirt is standing on a skateboard and doing stunts.\nB: A child in a green shirt is playing with a ball and jumping on a trampoline.\nC: A person wearing a blue jacket is dancing in the rain with an umbrella.\nD: A child wearing an orange shirt is holding a stick in their right hand, extending their left hand, and then falling to the left on the ground.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_10.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_80_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A blue car carrying a large bird with a striped fur pattern, sleeping on the car roof.\nB: A white car carrying a big cat with a rainbow fur pattern, hiding inside the car.\nC: A red car carrying a medium-sized rabbit with a polka dot fur pattern, playing with the car window.\nD: A black car carries a small dog with a white and black fur pattern, peering out from the window. The scene then shifts to a person standing on a nearby street with a white and black pattern painted on their face.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A blue car carrying a large bird with a striped fur pattern, sleeping on the car roof.\nB: A white car carrying a big cat with a rainbow fur pattern, hiding inside the car.\nC: A red car carrying a medium-sized rabbit with a polka dot fur pattern, playing with the car window.\nD: A black car carries a small dog with a white and black fur pattern, peering out from the window. 
The scene then shifts to a person standing on a nearby street with a white and black pattern painted on their face.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_81_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The sound of a female exercising can be heard, her hair falling over her face as she does sit-ups.\nB: A male voice is heard, while a person sits on the couch watching TV.\nC: There is loud music playing in the background as a person struggles to open a jar.\nD: The noise of a dog barking can be heard as a person takes a nap.", 
"question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The sound of a female exercising can be heard, her hair falling over her face as she does sit-ups.\nB: A male voice is heard, while a person sits on the couch watching TV.\nC: There is loud music playing in the background as a person struggles to open a jar.\nD: The noise of a dog barking can be heard as a person takes a nap.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_82_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Beverage: 
Roaming through the cup's entire world.\nB: Liquid travel: Exploring the cup's universe.\nC: Drink: A journey across the cup's universe.\nD: Beverage: Passing through the whole world of the cup.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Beverage: Roaming through the cup's entire world.\nB: Liquid travel: Exploring the cup's universe.\nC: Drink: A journey across the cup's universe.\nD: Beverage: Passing through the whole world of the cup.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_83_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video 
image or Natural image", "source": "fun_qa", "options": "A: The dizzying camera work creates a sense of confusion that leads to laughter\nB: It is as if humans are incapable of washing themselves entirely clean, as foam keeps reappearing in different places. The juxtaposition of the before and after states is rather amusing.\nC: The use of unrealistic special effects adds to the comedic effect\nD: The exaggerated facial expressions of the characters evoke laughter", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The dizzying camera work creates a sense of confusion that leads to laughter\nB: It is as if humans are incapable of washing themselves entirely clean, as foam keeps reappearing in different places. The juxtaposition of the before and after states is rather amusing.\nC: The use of unrealistic special effects adds to the comedic effect\nD: The exaggerated facial expressions of the characters evoke laughter", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_10.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_84_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Does it look like I am angry?\nB: Is it evident that I am happy?\nC: Do you think I am sad?\nD: Am I clearly upset?", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Does it look like I am angry?\nB: Is it evident that I am happy?\nC: Do you think I am sad?\nD: Am I clearly upset?", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_11.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_85_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The dog's action was intentional and not a mistake, making it funny.\nB: The man extended his arms, hoping to catch the small dog, but the dog misinterpreted his gesture and directly hit his groin area. It seems that the dog didn't grasp the man's intention, creating this hilarious scene.\nC: The man's painful expression added to the comedic effect.\nD: The scene was staged, and the man and dog were actually friends, which made it less funny.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The dog's action was intentional and not a mistake, making it funny.\nB: The man extended his arms, hoping to catch the small dog, but the dog misinterpreted his gesture and directly hit his groin area. 
It seems that the dog didn't grasp the man's intention, creating this hilarious scene.\nC: The man's painful expression added to the comedic effect.\nD: The scene was staged, and the man and dog were actually friends, which made it less funny.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_86_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The hidden person was actually a friend playing a prank, so it was not unexpected.\nB: The person was not startled, it was all part of a planned act.\nC: The person was startled by the person hidden under the 
cover, which was a bit unexpected.\nD: The person was not surprised by the person hidden under the cover, it was all staged.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The hidden person was actually a friend playing a prank, so it was not unexpected.\nB: The person was not startled, it was all part of a planned act.\nC: The person was startled by the person hidden under the cover, which was a bit unexpected.\nD: The person was not surprised by the person hidden under the cover, it was all staged.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_87_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": 
"meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A road-ignorant Husky.\nB: A confused snow dog.\nC: A lost Husky on the highway.\nD: A bewildered canine traveler.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A road-ignorant Husky.\nB: A confused snow dog.\nC: A lost Husky on the highway.\nD: A bewildered canine traveler.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_88_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", 
"options": "A: Instead of splitting the brick, breakdance on it.\nB: If the brick cannot be split, try juggling with it.\nC: If the brick cannot be split, do a backflip instead.\nD: When the brick is unbreakable, start a magic show with it.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Instead of splitting the brick, breakdance on it.\nB: If the brick cannot be split, try juggling with it.\nC: If the brick cannot be split, do a backflip instead.\nD: When the brick is unbreakable, start a magic show with it.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_89_15.png"], "output": "C", "qwen3-vl": "image none"}, 
{"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The \"Chicken\" is starting up its primary engine.\nB: The cow is starting up its primary engine.\nC: The cat is starting up its primary engine.\nD: The dog is starting up its primary engine.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The \"Chicken\" is starting up its primary engine.\nB: The cow is starting up its primary engine.\nC: The cat is starting up its primary engine.\nD: The dog is starting up its primary engine.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_90_15.png"], 
"output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The cat is trying to steal the human's cup.\nB: The cat is not interested in the water inside a human's cup.\nC: The cat prefers tea over water from a human's cup.\nD: According to the cat, the water inside a human's cup is the most preferable to quench its thirst.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The cat is trying to steal the human's cup.\nB: The cat is not interested in the water inside a human's cup.\nC: The cat prefers tea over water from a human's cup.\nD: According to the cat, the water inside a human's cup is the most preferable to quench its thirst.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_12.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_91_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Infinite staircase with no end in sight\nB: Staircase that goes on endlessly without any conclusion.\nC: Endless staircase with no resolution in sight\nD: Never-ending stairs leading to nowhere", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Infinite staircase with no end in sight\nB: Staircase that goes on endlessly without any conclusion.\nC: Endless staircase with no resolution in sight\nD: Never-ending stairs leading to nowhere", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_11.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_92_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A man said hi to a group of people who welcomed him warmly, and then they all hugged each other.\nB: A man walked past a group of people who were waving at him, and then he stopped to take a selfie with them.\nC: A man said hi to a group of people who welcomed him warmly, but he decided to bypass them and greeted the people after them.\nD: A man greeted a group of people and received a lot of high-fives from them before dancing with them.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A man said hi to a group of people who welcomed him warmly, and then they all hugged each other.\nB: A man walked past a group of people who were waving at him, and then he stopped to take a selfie with them.\nC: A man said hi to a group of people who welcomed him warmly, but he decided to bypass them and greeted the people after them.\nD: A man greeted a group of people and received a lot of high-fives from them before dancing with them.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_4.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_93_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The ashamed master and the collected pup.\nB: The confident master and the disobedient pup.\nC: The disappointed master and the scattered pup.\nD: The proud master and the chaotic pup.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The ashamed master and the collected pup.\nB: The confident master and the disobedient pup.\nC: The disappointed master and the scattered pup.\nD: The proud master and the chaotic pup.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_3.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_94_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A person is trying to catch something, but it slips and hits them on the face.\nB: Someone leans their face towards an object, shakes it, and the object tumbles down, hitting them on the face.\nC: Someone tries to balance an object on their head, but it falls and hits them on the face.\nD: An individual is attempting to juggle objects, but they accidentally drop one, and it hits them on the face.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A person is trying to catch something, but it slips and hits them on the face.\nB: Someone leans their face towards an object, shakes it, and the object tumbles down, hitting them on the face.\nC: Someone tries to balance an object on their head, but it falls and hits them on the face.\nD: An 
individual is attempting to juggle objects, but they accidentally drop one, and it hits them on the face.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_95_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A strong gust of wind tore the umbrella's fabric from the holders' grip, leading to a hilarious situation.\nB: The fabric of the umbrella got caught in the wind, causing a comical struggle for the individuals trying to hold onto it.\nC: The umbrella fabric got tangled in the wind and caused chaos among the people holding it.\nD: The wind 
disrupted the umbrella fabric, which was held by two people, and one of them watched the departing fabric.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A strong gust of wind tore the umbrella's fabric from the holders' grip, leading to a hilarious situation.\nB: The fabric of the umbrella got caught in the wind, causing a comical struggle for the individuals trying to hold onto it.\nC: The umbrella fabric got tangled in the wind and caused chaos among the people holding it.\nD: The wind disrupted the umbrella fabric, which was held by two people, and one of them watched the departing fabric.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_14.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_96_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The person in question mistakes the soap dispenser for a hand sanitizer and tries to use it.\nB: The person in question accidentally sprays themselves with the cleaning liquid.\nC: The person in question hurls the cleaning cloth they are holding towards the soap dispenser.\nD: The person in question slips on the wet floor and falls down.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The person in question mistakes the soap dispenser for a hand sanitizer and tries to use it.\nB: The person in question accidentally sprays themselves with the cleaning liquid.\nC: The person in question hurls the cleaning cloth they are holding towards the soap dispenser.\nD: The person in question slips on the wet floor and falls down.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_10.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_97_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video is comedic because it shows a serious battle between characters with no humor\nB: The video is comedic due to the intense and dramatic music in the background\nC: Ultraman is significantly less tall than the monster and appears a bit overweight while riding a dinosaur, which adds to the comical and amusing sight of him chasing the monster from behind.\nD: The video is comedic because it depicts a realistic and serious situation", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video is comedic because it shows a serious battle between characters with no humor\nB: The video is comedic due to the intense and dramatic music in the background\nC: Ultraman is significantly less tall than the monster and appears a bit overweight while riding a dinosaur, which adds to the comical and amusing sight of him chasing the monster from behind.\nD: The video is comedic because it depicts a realistic and serious situation", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_3.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_98_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The humor in the video comes from the customer accidentally hitting the owner instead of the targets, which leads to a series of comical mishaps.\nB: The humor in the video stems from the customer's exceptional marksmanship, hitting every target with precision. 
While this may be a tragedy for the owner, it is a source of amusement for the viewers.\nC: The video's humor is derived from the customers' lack of aim, resulting in a failed and uneventful experience for the owner.\nD: The humor arises from the customer's inability to hit any target, causing chaos and frustration for the owner.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The humor in the video comes from the customer accidentally hitting the owner instead of the targets, which leads to a series of comical mishaps.\nB: The humor in the video stems from the customer's exceptional marksmanship, hitting every target with precision. While this may be a tragedy for the owner, it is a source of amusement for the viewers.\nC: The video's humor is derived from the customers' lack of aim, resulting in a failed and uneventful experience for the owner.\nD: The humor arises from the customer's inability to hit any target, causing chaos and frustration for the owner.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_10.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_99_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Creative approach to staying dry in the rain.\nB: Innovative rain protection technique.\nC: Unexpected rain shelter solution.\nD: New way of sheltering from the rain.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Creative approach to staying dry in the rain.\nB: Innovative rain protection technique.\nC: Unexpected rain shelter solution.\nD: New way of sheltering from the rain.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_9.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_100_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The beginning occurrence was completely unplanned.\nB: Provide proof that the first instance happened coincidentally.\nC: A hilarious twist of fate led to the initial occurrence.\nD: Proof that the initial incident was a random stroke of luck.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The beginning occurrence was completely unplanned.\nB: Provide proof that the first instance happened coincidentally.\nC: A hilarious twist of fate led to the initial occurrence.\nD: Proof that the initial incident was a random stroke of luck.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_7.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_101_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A bird flying through a window.\nB: A cat being thrown by a slingshot.\nC: The dog launched into the air through a catapult mechanism.\nD: A dog jumping over a fence.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A bird flying through a window.\nB: A cat being thrown by a slingshot.\nC: The dog launched into the air through a catapult mechanism.\nD: A dog jumping over a fence.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_6.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_102_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The electric bike hit the car, causing the car to shake and lose control.\nB: A car on the road hit an electric bike, and the nearby electric bike shook and overturned.\nC: A police car arrived and stopped the car from hitting the electric bike.\nD: The electric bike swerved to avoid the car, causing it to crash into a nearby wall.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The electric bike hit the car, causing the car to shake and lose control.\nB: A car on the road hit an electric bike, and the nearby electric bike shook and overturned.\nC: A police car arrived and stopped the car from hitting the electric bike.\nD: The electric bike swerved to avoid the car, causing it to crash into a nearby wall.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_1.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_103_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The use of vibrant colors adds humor to the video\nB: The unexpected use of sound effects makes it funny\nC: The comedic element is present in two unforeseen events. Firstly, the adult finishes the strawberry, and instead of being upset, the child fakes a smile. 
Secondly, the child swiftly goes from laughing to crying, providing another twist in the story.\nD: The slow-motion effect on the adult's reaction creates the humor", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The use of vibrant colors adds humor to the video\nB: The unexpected use of sound effects makes it funny\nC: The comedic element is present in two unforeseen events. Firstly, the adult finishes the strawberry, and instead of being upset, the child fakes a smile. Secondly, the child swiftly goes from laughing to crying, providing another twist in the story.\nD: The slow-motion effect on the adult's reaction creates the humor", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_13.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_104_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: High jumping by humans.\nB: Alien invasion rehearsal\nC: A new world record in jumping\nD: Extreme sports competition", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: High jumping by humans.\nB: Alien invasion rehearsal\nC: A new world record in jumping\nD: Extreme sports competition", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_14.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_105_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The man positioned on the left kept on hitting, whereas the man on the right showed no reaction. After the man on the right gave a blow with his stick, the man on the left stumbled.\nB: The man on the left was actually the aggressor, but the man on the right surprised him with a swift blow, causing the man on the left to lose his balance.\nC: The man on the left was the one who didn't show any reaction while the man on the right continued to hit. Later, the man on the left unexpectedly retaliated and the man on the right stumbled.\nD: Initially, both men were engaged in hitting each other, but the man on the right suddenly stopped, causing the man on the left to lose balance and stumble.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The man positioned on the left kept on hitting, whereas the man on the right showed no reaction. After the man on the right gave a blow with his stick, the man on the left stumbled.\nB: The man on the left was actually the aggressor, but the man on the right surprised him with a swift blow, causing the man on the left to lose his balance.\nC: The man on the left was the one who didn't show any reaction while the man on the right continued to hit. 
Later, the man on the left unexpectedly retaliated and the man on the right stumbled.\nD: Initially, both men were engaged in hitting each other, but the man on the right suddenly stopped, causing the man on the left to lose balance and stumble.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_106_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video is funny because of the smooth and effortless running of the student without any mishaps.\nB: The video is funny because it shows a student responsibly avoiding the utility pole and 
reaching the destination safely.\nC: The video is hilarious as a student running didn't pay attention to the front and comically crashed into a roadside utility pole, and most importantly, even knocked down the pole.\nD: The video is funny because of the serious and dangerous accident involving a student and a utility pole.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video is funny because of the smooth and effortless running of the student without any mishaps.\nB: The video is funny because it shows a student responsibly avoiding the utility pole and reaching the destination safely.\nC: The video is hilarious as a student running didn't pay attention to the front and comically crashed into a roadside utility pole, and most importantly, even knocked down the pole.\nD: The video is funny because of the serious and dangerous accident involving a student and a utility pole.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_10.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_107_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The man played a game of catch with the bags, skillfully avoiding getting hit.\nB: While disposing of his trash, the man tossed a bag upwards and it landed on him by accident. He proceeded to kick another bag down to the ground, but unfortunately missed his target.\nC: The man threw the bag in the air and it landed perfectly in the garbage can.\nD: The man picked up the bags and placed them neatly by the curb.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The man played a game of catch with the bags, skillfully avoiding getting hit.\nB: While disposing of his trash, the man tossed a bag upwards and it landed on him by accident. 
He proceeded to kick another bag down to the ground, but unfortunately missed his target.\nC: The man threw the bag in the air and it landed perfectly in the garbage can.\nD: The man picked up the bags and placed them neatly by the curb.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_108_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The owner accidentally backs the car into the roadside bushes despite the reverse camera system.\nB: The car's reverse camera system fails to beep as the car approaches the roadside bushes, leading 
to a collision.\nC: The car's reverse camera system beeps erratically, causing confusion for the owner.\nD: The car's reverse camera system is constantly beeping as the car approaches the roadside bushes, but upon the owner's exit from the driver's seat and inspection, it is found that the bushes are still a considerable distance away.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The owner accidentally backs the car into the roadside bushes despite the reverse camera system.\nB: The car's reverse camera system fails to beep as the car approaches the roadside bushes, leading to a collision.\nC: The car's reverse camera system beeps erratically, causing confusion for the owner.\nD: The car's reverse camera system is constantly beeping as the car approaches the roadside bushes, but upon the owner's exit from the driver's seat and inspection, it is found that the bushes are still a considerable distance away.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_10.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_109_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video's funny moment arises from the man's overacting, as he attempts to catch a soda can with his hand but instead gets hit on the head, resulting in a tragicomic scene.\nB: The video's humor comes from the unexpected failure of the man to open the soda can, resulting in a frustrating and disappointing moment.\nC: The comedic aspect of the video is derived from the man's serious and reserved demeanor as he avoids the soda can, creating a tension-filled scene.\nD: The humor in the video is due to the man being successful in catching the soda can with his hand, leading to a heartwarming moment.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video's funny moment arises from the man's overacting, as he attempts to catch a soda can with his hand but instead gets hit on the head, resulting in a tragicomic scene.\nB: The video's humor comes from the unexpected failure of the man to open the soda can, resulting in a frustrating and disappointing moment.\nC: The comedic aspect of the video is derived from the man's serious and reserved demeanor as he avoids the soda can, creating a tension-filled scene.\nD: The humor in the video is due to the man being successful in catching the soda can with his hand, leading to a heartwarming moment.", "input_image_path": 
["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_110_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Abrupt shock.\nB: Instant shock.\nC: Sudden surprise.\nD: Unexpected shock.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Abrupt shock.\nB: Instant shock.\nC: Sudden surprise.\nD: Unexpected shock.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_1.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_111_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: From behind the glass, a spotted leopard and a young child are seen batting at each other playfully.\nB: A zebra and a little boy are engaging in a dangerous game of tag.\nC: A lion and a toddler are having a serious fight inside a cage.\nD: A monkey and a kid are seen throwing items at each other from behind a glass enclosure.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: From behind the glass, a spotted leopard and a young child are seen batting at each other playfully.\nB: A zebra and a little boy are engaging in a dangerous game of tag.\nC: A lion and a 
toddler are having a serious fight inside a cage.\nD: A monkey and a kid are seen throwing items at each other from behind a glass enclosure.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_112_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Oh no, I lost my pet rock again, where did it go?\nB: My cellular device appears to be missing, any idea where it could be?\nC: Has anyone seen my imaginary friend? 
I think they ran away again.\nD: I seem to have misplaced my invisible hat, any thoughts on its whereabouts?", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Oh no, I lost my pet rock again, where did it go?\nB: My cellular device appears to be missing, any idea where it could be?\nC: Has anyone seen my imaginary friend? I think they ran away again.\nD: I seem to have misplaced my invisible hat, any thoughts on its whereabouts?", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_113_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video 
image or Natural image", "source": "fun_qa", "options": "A: What makes the video funny is the dog using the person as a marker and peeing directly on the woman. The woman's unfortunate experience adds an element of tragicomedy to the scene.\nB: The video's humor comes from the unexpected interaction between the dog and the woman.\nC: The funny aspect of the video is the woman's unexpected encounter with the dog.\nD: The humor is created by the dog's unusual behavior towards the woman.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: What makes the video funny is the dog using the person as a marker and peeing directly on the woman. The woman's unfortunate experience adds an element of tragicomedy to the scene.\nB: The video's humor comes from the unexpected interaction between the dog and the woman.\nC: The funny aspect of the video is the woman's unexpected encounter with the dog.\nD: The humor is created by the dog's unusual behavior towards the woman.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_10.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_114_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Where was this video filmed?\nB: What time is the lunch break?\nC: How many dogs are in the video?\nD: Who is the designated security personnel?", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Where was this video filmed?\nB: What time is the lunch break?\nC: How many dogs are in the video?\nD: Who is the designated security personnel?", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_10.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_115_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: From beneath the toilet door panel, a hand is reaching out with an upward-facing palm to receive chopsticks and a spoon from someone outside.\nB: The hand is asking for help to get out of the bathroom.\nC: The hand is actually reaching out for a handshake.\nD: A person is handing over toilet paper instead of chopsticks and a spoon.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: From beneath the toilet door panel, a hand is reaching out with an upward-facing palm to receive chopsticks and a spoon from someone outside.\nB: The hand is asking for help to get out of the bathroom.\nC: The hand is actually reaching out for a handshake.\nD: A person is handing over toilet paper instead of chopsticks and a spoon.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_5.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_116_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A peaceful resolution with positive outcome.\nB: A successful attack with unexpected results.\nC: A well-executed plan that achieves the desired goal.\nD: A failed attack that yields no results.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A peaceful resolution with positive outcome.\nB: A successful attack with unexpected results.\nC: A well-executed plan that achieves the desired goal.\nD: A failed attack that yields no results.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_4.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_117_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A lady is reclining on a lounger when she sways twice, and tips over, falling to the ground.\nB: The lady jumps from the lounger and starts dancing.\nC: The lounger breaks and the lady falls abruptly.\nD: A lady is sitting calmly on a lounger and gets up gracefully.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A lady is reclining on a lounger when she sways twice, and tips over, falling to the ground.\nB: The lady jumps from the lounger and starts dancing.\nC: The lounger breaks and the lady falls abruptly.\nD: A lady is sitting calmly on a lounger and gets up gracefully.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_1.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_118_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The cat exclaimed in frustration, \"Why is it always me who has to face unlucky incidents?\"\nB: The Cat's Fortune\nC: Unlucky Cat Adventures\nD: Feline Frustration", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The cat exclaimed in frustration, \"Why is it always me who has to face unlucky incidents?\"\nB: The Cat's Fortune\nC: Unlucky Cat Adventures\nD: Feline Frustration", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_0.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_119_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video is entertaining because it shows a person teasing a small dog and the dog responds by tugging on the person's clothing, creating a humorous moment.\nB: The video is hilarious because a person is teasing a small dog by shaking their buttocks and the dog bites off their pants, which is very funny.\nC: The video is funny because a person is playing with a small dog and the dog tugs on their clothes, resulting in a humorous situation.\nD: The video is amusing because it depicts a person dancing with a small dog and the dog reacts by 
pulling on the person's clothes, which is comical.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video is entertaining because it shows a person teasing a small dog and the dog responds by tugging on the person's clothing, creating a humorous moment.\nB: The video is hilarious because a person is teasing a small dog by shaking their buttocks and the dog bites off their pants, which is very funny.\nC: The video is funny because a person is playing with a small dog and the dog tugs on their clothes, resulting in a humorous situation.\nD: The video is amusing because it depicts a person dancing with a small dog and the dog reacts by pulling on the person's clothes, which is comical.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_13.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_120_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Three guys are dancing on a bench to a popular song, trying to impress their friends.\nB: A group of people are struggling to move a heavy bench across the room.\nC: Three friends are sitting on a bench and watching a funny video on their phone.\nD: Inside a KTV, three males are sitting on a long bench, touching each other's backs, and simultaneously making rowing movements, causing the bench to move backwards.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Three guys are dancing on a bench to a popular song, trying to impress their friends.\nB: A group of people are struggling to move a heavy bench across the room.\nC: Three friends are sitting on a bench and watching a funny video on their phone.\nD: Inside a KTV, three males are sitting on a long bench, touching each other's backs, and simultaneously making rowing movements, causing the bench to move backwards.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_7.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_121_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: After drinking, the black dog is telling others not to drink anymore.\nB: The black dog is warning others about the dangers of overeating.\nC: The black dog is encouraging others to drink more alcohol.\nD: The black dog is enjoying a drinking spree with its friends.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: After drinking, the black dog is telling others not to drink anymore.\nB: The black dog is warning others about the dangers of overeating.\nC: The black dog is encouraging others to drink more alcohol.\nD: The black dog is enjoying a drinking spree with its friends.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_4.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_122_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The entire video is not funny, it's just plain awkward\nB: The behavior doesn't strike me as funny, but for some people, the fact that the man on top was unexpectedly awoken could be a source of amusement.\nC: The humor comes from the man's startled reaction\nD: The funny part is the loud noise that wakes up the man", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The entire video is not funny, it's just plain awkward\nB: The behavior doesn't strike me as funny, but for some people, the fact that the man on top was unexpectedly awoken could be a source of amusement.\nC: The humor comes from the man's startled reaction\nD: The funny part is the loud noise that wakes up the man", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_0.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_123_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The security personnel lifted their hands and conducted a check on the individual coming towards them.\nB: The security personnel danced with the individual instead of conducting a check.\nC: The security personnel ignored the individual and continued chatting with each other.\nD: The security personnel mistook the individual for someone else and waved them through without a check.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The security personnel lifted their hands and 
conducted a check on the individual coming towards them.\nB: The security personnel danced with the individual instead of conducting a check.\nC: The security personnel ignored the individual and continued chatting with each other.\nD: The security personnel mistook the individual for someone else and waved them through without a check.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_124_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The dog's determination to succeed\nB: It is often said that success breeds success, but it was 
hilarious to see this dog hit the glass on its second try after succeeding once.\nC: The glass representing a barrier for the dog\nD: The unexpected outcome of the dog hitting the glass on its second try", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The dog's determination to succeed\nB: It is often said that success breeds success, but it was hilarious to see this dog hit the glass on its second try after succeeding once.\nC: The glass representing a barrier for the dog\nD: The unexpected outcome of the dog hitting the glass on its second try", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_14.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_125_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video is comedic because of the dull and uninteresting tapping between the flower leopard and the child through the glass.\nB: The video is comedic because of the serious interaction between the flower leopard and the child through the glass.\nC: The video is comedic because of the aggressive behavior of the flower leopard towards the child through the glass.\nD: The playful tapping between the flower leopard and the child through the glass gives the impression that the leopard has transformed into an adorable and humorous child.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video is comedic because of the dull and uninteresting tapping between the flower leopard and the child through the glass.\nB: The video is comedic because of the serious interaction between the flower leopard and the child through the glass.\nC: The video is comedic because of the aggressive behavior of the flower leopard towards the child through the glass.\nD: The playful tapping between the flower leopard and the child through the glass gives the impression that the leopard has transformed into an adorable and humorous child.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_5.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_126_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The audience's expectations were completely met, and there was nothing unexpected or humorous about the situation.\nB: The child was holding a basin of snow with a serious expression, intending to pour it towards her father in a swift and standard manner. 
As a result, the full basin of snow, with the child's full force, fell back onto her own face, greatly disappointing the audience's expectations, which was really hard to hold back the laughter.\nC: The child's serious expression and intention to pour snow towards her father were not funny at all.\nD: The fall of snow onto the child's face was a tragic and painful event, not a comedic one.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The audience's expectations were completely met, and there was nothing unexpected or humorous about the situation.\nB: The child was holding a basin of snow with a serious expression, intending to pour it towards her father in a swift and standard manner. As a result, the full basin of snow, with the child's full force, fell back onto her own face, greatly disappointing the audience's expectations, which was really hard to hold back the laughter.\nC: The child's serious expression and intention to pour snow towards her father were not funny at all.\nD: The fall of snow onto the child's face was a tragic and painful event, not a comedic one.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_8.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_127_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A sudden gust of wind blows away the umbrella, leaving everyone drenched and shocked.\nB: The scene of a group of people huddled under a large umbrella to escape the rain, which has a capacity for many individuals, is quite humorous.\nC: The unexpected appearance of a clown riding a unicycle in the background adds to the comedic effect.\nD: The group of people suddenly break into a spontaneous dance party, much to the amusement of onlookers.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A sudden gust of wind blows away the umbrella, leaving everyone drenched and shocked.\nB: The scene of a group of people huddled under a large umbrella to escape the rain, which has a capacity for many individuals, is quite humorous.\nC: The unexpected appearance of a clown riding a unicycle in the background adds to the comedic effect.\nD: The group of people suddenly break into a spontaneous dance party, much to the amusement of onlookers.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_1.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_128_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Feely flip caution\nB: Touchy reverse alert\nC: Sensitive retro warning\nD: Tactile backward alarm", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Feely flip caution\nB: Touchy reverse alert\nC: Sensitive retro warning\nD: Tactile backward alarm", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_2.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_129_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The man on skis falls down after hitting a rock.\nB: The men are playing a friendly game of snow football.\nC: Two men are skiing together on the snow.\nD: Two men are pulling a squatting man on skis through the snow. The squatting man sticks out his right foot and trips the man on his right.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The man on skis falls down after hitting a rock.\nB: The men are playing a friendly game of snow football.\nC: Two men are skiing together on the snow.\nD: Two men are pulling a squatting man on skis through the snow. 
The squatting man sticks out his right foot and trips the man on his right.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_130_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The strength difference between the two men is evident. 
The man on the right exudes a Dragon Aotian-like protagonist vibe, and even a slight move from him renders the other party defenseless.\nB: The video maintains its humorous tone by incorporating slapstick comedy and clever visual effects.\nC: The video maintains its humorous tone through exaggerated facial expressions and dramatic music cues.\nD: The humorous tone is achieved through witty dialogue and unexpected plot twists.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The strength difference between the two men is evident. The man on the right exudes a Dragon Aotian-like protagonist vibe, and even a slight move from him renders the other party defenseless.\nB: The video maintains its humorous tone by incorporating slapstick comedy and clever visual effects.\nC: The video maintains its humorous tone through exaggerated facial expressions and dramatic music cues.\nD: The humorous tone is achieved through witty dialogue and unexpected plot twists.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_10.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_131_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: While walking, a man accidentally kicked a soda can which hit him in the face.\nB: A man jumped on a wooden plank and a soda can flew and hit him on the head.\nC: A man slipped on a piece of wood, causing a soda can to hit him in the head.\nD: Stepping on a wooden board, a man caused a soda can to be flung into the air from the other end. The man tried to catch it with his hand, but the can missed his grip and struck him in the head.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: While walking, a man accidentally kicked a soda can which hit him in the face.\nB: A man jumped on a wooden plank and a soda can flew and hit him on the head.\nC: A man slipped on a piece of wood, causing a soda can to hit him in the head.\nD: Stepping on a wooden board, a man caused a soda can to be flung into the air from the other end. 
The man tried to catch it with his hand, but the can missed his grip and struck him in the head.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_132_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The king of accuracy.\nB: The champion of sharpshooting.\nC: The master of archery.\nD: The ruler of marksmanship.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The king of accuracy.\nB: The champion of sharpshooting.\nC: The master of archery.\nD: The ruler of 
marksmanship.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_133_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Colossal monster is leading the way for Tiny Ultraman.\nB: Giant Ultraman is running from a tiny monster.\nC: Tiny Ultraman is in pursuit of a colossal monster.\nD: Tiny Ultraman is relaxing with the colossal monster.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Colossal monster is leading the way for Tiny Ultraman.\nB: Giant Ultraman is running from a 
tiny monster.\nC: Tiny Ultraman is in pursuit of a colossal monster.\nD: Tiny Ultraman is relaxing with the colossal monster.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_134_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: An adult is holding a child wearing a white hat. The adult holds a strawberry and reaches towards the child's mouth, then puts it into their own mouth. 
The child initially smiles but then shows a crying face.\nB: The adult and the child are sharing a strawberry happily.\nC: A child is feeding a strawberry to an adult wearing a white hat.\nD: The child is teasing the adult with a strawberry, causing the adult to make a funny face.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: An adult is holding a child wearing a white hat. The adult holds a strawberry and reaches towards the child's mouth, then puts it into their own mouth. The child initially smiles but then shows a crying face.\nB: The adult and the child are sharing a strawberry happily.\nC: A child is feeding a strawberry to an adult wearing a white hat.\nD: The child is teasing the adult with a strawberry, causing the adult to make a funny face.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_12.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_135_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Your trousers have been gnawed away.\nB: Seems like your pants were devoured.\nC: Looks like your trousers got eaten.\nD: Your pants have been chewed up.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Your trousers have been gnawed away.\nB: Seems like your pants were devoured.\nC: Looks like your trousers got eaten.\nD: Your pants have been chewed up.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_12.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_136_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Two men are walking on a sturdy bridge over a field. The last man starts jumping on the bridge, causing it to sway. The first man cannot maintain his balance, and the last man falls down, dropping his phone into the water.\nB: Four men are walking on a suspension bridge over water. The last man starts jumping on the bridge, causing it to sway. The first three men cannot maintain their balance, and the last man falls down, dropping his hat into the water.\nC: Three women are walking on a suspension bridge over water. The last two women start singing, causing the bridge to sway. The first woman cannot maintain her balance, and all women fall down, dropping their phones into the water.\nD: Three women are walking on a suspension bridge over water. The last woman starts jumping on the bridge, causing it to sway. The first two women cannot maintain their balance, and the last woman falls down, dropping her phone into the water.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Two men are walking on a sturdy bridge over a field. The last man starts jumping on the bridge, causing it to sway. The first man cannot maintain his balance, and the last man falls down, dropping his phone into the water.\nB: Four men are walking on a suspension bridge over water. The last man starts jumping on the bridge, causing it to sway. 
The first three men cannot maintain their balance, and the last man falls down, dropping his hat into the water.\nC: Three women are walking on a suspension bridge over water. The last two women start singing, causing the bridge to sway. The first woman cannot maintain her balance, and all women fall down, dropping their phones into the water.\nD: Three women are walking on a suspension bridge over water. The last woman starts jumping on the bridge, causing it to sway. The first two women cannot maintain their balance, and the last woman falls down, dropping her phone into the water.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_137_15.png"], "output": "D", 
"qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video's humor is based on the use of chopsticks and spoons, implying that the person in the video is consuming feces with these tools, which are typically used for eating.\nB: The video is comedic due to the use of advanced special effects that make the scene look realistic\nC: The video is comedic because it depicts a serious situation with dramatic music, creating a suspenseful atmosphere\nD: The video is comedic because it features famous celebrity cameos, adding a touch of glamour and sophistication", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video's humor is based on the use of chopsticks and spoons, implying that the person in the video is consuming feces with these tools, which are typically used for eating.\nB: The video is comedic due to the use of advanced special effects that make the scene look realistic\nC: The video is comedic because it depicts a serious situation with dramatic music, creating a suspenseful atmosphere\nD: The video is comedic because it features famous celebrity cameos, adding a touch of glamour and sophistication", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_7.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_138_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Unexpected collision with a roadside pole.\nB: To hit a utility pole head-on with great impact.\nC: The moment of impact with a utility pole.\nD: Car crash into a pole at full speed.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Unexpected collision with a roadside pole.\nB: To hit a utility pole head-on with great impact.\nC: The moment of impact with a utility pole.\nD: Car crash into a pole at full speed.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_6.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_139_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Four dogs are surrounding a water source, drinking. However, once the black dog finishes, it begins to dig at the water bowl with its paws, hindering the other dogs from getting a drink.\nB: One dog is drinking while the other dogs are playing around it.\nC: The dogs are enjoying a peaceful drink together without any disruptions.\nD: The dogs are fighting over a treat in the water bowl.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Four dogs are surrounding a water source, drinking. 
However, once the black dog finishes, it begins to dig at the water bowl with its paws, hindering the other dogs from getting a drink.\nB: One dog is drinking while the other dogs are playing around it.\nC: The dogs are enjoying a peaceful drink together without any disruptions.\nD: The dogs are fighting over a treat in the water bowl.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_140_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Marking is a beloved activity of dogs to indicate their territory.\nB: Dogs have a unique way of 
expressing their love for grass.\nC: Territorial disputes among dogs can be quite colorful.\nD: Dogs take pride in leaving their mark everywhere they go.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Marking is a beloved activity of dogs to indicate their territory.\nB: Dogs have a unique way of expressing their love for grass.\nC: Territorial disputes among dogs can be quite colorful.\nD: Dogs take pride in leaving their mark everywhere they go.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_141_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", 
"visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A dog stretched out its paws and scratched towards a snake. Suddenly, when the dog pulled the snake over, it jumped into the air. After falling to the ground, the dog quickly rolled over and stood up to stare at the snake.\nB: A cat meowed loudly and scared the dog away.\nC: A bird flew down and perched on the dog's nose, making it sneeze.\nD: A squirrel ran past the dog, causing it to chase after the squirrel.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A dog stretched out its paws and scratched towards a snake. Suddenly, when the dog pulled the snake over, it jumped into the air. After falling to the ground, the dog quickly rolled over and stood up to stare at the snake.\nB: A cat meowed loudly and scared the dog away.\nC: A bird flew down and perched on the dog's nose, making it sneeze.\nD: A squirrel ran past the dog, causing it to chase after the squirrel.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_10.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_142_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Two cats gracefully danced down the slope in perfect synchronization.\nB: The cats climbed the slope skillfully and reached the top without any mishaps.\nC: The cats peacefully enjoyed the view while sitting on the slope.\nD: On a smooth incline, one cat lost its balance and knocked down another cat, resulting in both cats falling off the slope while intertwined.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Two cats gracefully danced down the slope in perfect synchronization.\nB: The cats climbed the slope skillfully and reached the top without any mishaps.\nC: The cats peacefully enjoyed the view while sitting on the slope.\nD: On a smooth incline, one cat lost its balance and knocked down another cat, resulting in both cats falling off the slope while intertwined.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_5.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_143_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The audience did not find the video amusing.\nB: In an attempt to jump with height, the person collided with the pole and fell onto the mat. The series of events that followed were smooth and entertaining.\nC: The video was a serious depiction of a dangerous stunt.\nD: The person effortlessly executed the jump without any mishaps.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The audience did not find the video amusing.\nB: In an attempt to jump with height, the person collided with the pole and fell onto the mat. 
The series of events that followed were smooth and entertaining.\nC: The video was a serious depiction of a dangerous stunt.\nD: The person effortlessly executed the jump without any mishaps.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_144_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Hidden damage.\nB: Secretive injury\nC: Concealed harm\nD: Unexpected destruction", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Hidden damage.\nB: Secretive injury\nC: Concealed 
harm\nD: Unexpected destruction", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_145_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video is funny because it shows three men skydiving from a plane with serious expressions.\nB: The video is funny because three men were having a great time in KTV, pretending that the chairs were boats and paddling with funny movements.\nC: The video is funny because it captures a serious business meeting with three men sitting in an office.\nD: The video is funny because it features three men 
participating in a cooking competition with intense concentration.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video is funny because it shows three men skydiving from a plane with serious expressions.\nB: The video is funny because three men were having a great time in KTV, pretending that the chairs were boats and paddling with funny movements.\nC: The video is funny because it captures a serious business meeting with three men sitting in an office.\nD: The video is funny because it features three men participating in a cooking competition with intense concentration.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_14.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_146_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The part where he successfully kicked the garbage was the funniest moment.\nB: The video was not funny at all.\nC: The moment when the garbage that he had thrown out hit him was not funny at all.\nD: The moment when the garbage that he had thrown out hit him was amusing enough. However, the situation turned even more hilarious when he missed kicking it.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The part where he successfully kicked the garbage was the funniest moment.\nB: The video was not funny at all.\nC: The moment when the garbage that he had thrown out hit him was not funny at all.\nD: The moment when the garbage that he had thrown out hit him was amusing enough. 
However, the situation turned even more hilarious when he missed kicking it.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_147_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Two people on the lower bunk are kicking the bed board of the upper bunk with their feet, causing the person on top to be shaken awake.\nB: The person on the lower bunk is quietly reading a book while the person on the upper bunk is peacefully sleeping.\nC: The person on the lower bunk is playing a guitar, creating a calming atmosphere, while the person on 
the upper bunk is enjoying the music.\nD: Three people are having a pillow fight on the lower bunk, while the person on the upper bunk is fast asleep.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Two people on the lower bunk are kicking the bed board of the upper bunk with their feet, causing the person on top to be shaken awake.\nB: The person on the lower bunk is quietly reading a book while the person on the upper bunk is peacefully sleeping.\nC: The person on the lower bunk is playing a guitar, creating a calming atmosphere, while the person on the upper bunk is enjoying the music.\nD: Three people are having a pillow fight on the lower bunk, while the person on the upper bunk is fast asleep.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_13.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_148_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The woman's action is actually quite dangerous and not suitable for a humorous video.\nB: The woman's movement is ordinary and predictable, which makes the video boring.\nC: It's entertaining to observe the woman's unconventional way of descending to the ground, with her head facing directly downward.\nD: The video lacks any unexpected or unconventional elements, making it uninteresting.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The woman's action is actually quite dangerous and not suitable for a humorous video.\nB: The woman's movement is ordinary and predictable, which makes the video boring.\nC: It's entertaining to observe the woman's unconventional way of descending to the ground, with her head facing directly downward.\nD: The video lacks any unexpected or unconventional elements, making it uninteresting.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_7.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_149_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Library study session\nB: Funeral procession\nC: Political debate\nD: Social dance party.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Library study session\nB: Funeral procession\nC: Political debate\nD: Social dance party.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_8.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_150_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The child and the adult break into a choreographed dance routine instead of playing the prank.\nB: The child accidentally trips and falls while trying to switch off the light.\nC: The computer screen suddenly freezes, causing confusion for the child and the adult.\nD: The child who was seated in front of the computer rapidly turns it off and switches off the light. When the adult opens the door and finds nothing amiss, the child promptly switches on the lights and spots the adult standing behind them.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The child and the adult break into a choreographed dance routine instead of playing the prank.\nB: The child accidentally trips and falls while trying to switch off the light.\nC: The computer screen suddenly freezes, causing confusion for the child and the adult.\nD: The child who was seated in front of the computer rapidly turns it off and switches off the light. 
When the adult opens the door and finds nothing amiss, the child promptly switches on the lights and spots the adult standing behind them.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_151_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Upon hearing the staff member boldly remark, \"Slow down, it's a big fat guy,\" directly in front of the man, he instantly turned his head in utter shock and fixed his gaze on the staff member. His wide-eyed stare revealed a mixture of astonishment, perplexity, and dazedness. 
Without giving the man a chance to respond, the staff member ruthlessly pushed him and initiated the cable car ride.\nB: The video maintains its humorous tone through the use of dark and offensive humor, which may not be suitable for everyone.\nC: The video maintains its humorous tone by incorporating slapstick comedy and exaggerated physical gestures.\nD: The video maintains its humorous tone by emphasizing awkward social interactions and uncomfortable situations.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Upon hearing the staff member boldly remark, \"Slow down, it's a big fat guy,\" directly in front of the man, he instantly turned his head in utter shock and fixed his gaze on the staff member. His wide-eyed stare revealed a mixture of astonishment, perplexity, and dazedness. Without giving the man a chance to respond, the staff member ruthlessly pushed him and initiated the cable car ride.\nB: The video maintains its humorous tone through the use of dark and offensive humor, which may not be suitable for everyone.\nC: The video maintains its humorous tone by incorporating slapstick comedy and exaggerated physical gestures.\nD: The video maintains its humorous tone by emphasizing awkward social interactions and uncomfortable situations.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_6.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_152_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The Dance Floor Champion\nB: The Air Guitar Virtuoso\nC: The Lip-Sync Sensation\nD: The Karaoke Master", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The Dance Floor Champion\nB: The Air Guitar Virtuoso\nC: The Lip-Sync Sensation\nD: The Karaoke Master", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_7.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_153_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video maintains its humorous tone by using dark and gloomy lighting to set a serious mood.\nB: The video maintains its humorous tone by adding intense music to create tension and suspense.\nC: The small canine was hesitant to swim, so the bigger dog lifted it up and took it along, creating a humorous scene.\nD: The video maintains its humorous tone by including sad and heartwrenching soundtrack to evoke emotional response.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video maintains its humorous tone by using dark and gloomy lighting to set a serious mood.\nB: The video maintains its humorous tone by adding intense music to create tension and suspense.\nC: The small canine was hesitant to swim, so the bigger dog lifted it up and took it along, creating a humorous scene.\nD: The video maintains its humorous tone by including sad and heartwrenching soundtrack to evoke emotional response.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_0.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_154_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A blue cake is being given to two young cats by a person, but one of the cats unexpectedly reaches out and bats the cake away with its claws.\nB: A red cake is being given to two dogs by a person, but one of the dogs unexpectedly reaches out and barks at the cake.\nC: A yellow cake is being given to two kittens by a person, but one of the kittens unexpectedly reaches out and licks the cake.\nD: A green cake is being given to two puppies by a person, but one of the puppies unexpectedly reaches out and eats the cake.", "question": "Please 
generate a description for this meme", "context": "Select from the following choices.\nA: A blue cake is being given to two young cats by a person, but one of the cats unexpectedly reaches out and bats the cake away with its claws.\nB: A red cake is being given to two dogs by a person, but one of the dogs unexpectedly reaches out and barks at the cake.\nC: A yellow cake is being given to two kittens by a person, but one of the kittens unexpectedly reaches out and licks the cake.\nD: A green cake is being given to two puppies by a person, but one of the puppies unexpectedly reaches out and eats the cake.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_14.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_155_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Reflection of hands in the mirror\nB: Mystical hand movements in the reflected image\nC: Shadowy figures reflected in the glass\nD: Ghostly apparition in the mirror", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Reflection of hands in the mirror\nB: Mystical hand movements in the reflected image\nC: Shadowy figures reflected in the glass\nD: Ghostly apparition in the mirror", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_13.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_156_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Following the cleaning session, the individual tucked the broom beneath their armpit and subsequently couldn't locate it.\nB: The person comically mistook a mop for a broom and started using it as a microphone instead.\nC: After finishing the cleaning, the person accidentally tripped over the broom and fell comically.\nD: Once done with cleaning, the person used the broom as a guitar and performed a funny air guitar solo.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Following the cleaning session, the individual tucked the broom beneath their armpit and subsequently couldn't locate it.\nB: The person comically mistook a mop for a broom and started using it as a microphone instead.\nC: After finishing the cleaning, the person accidentally tripped over the broom and fell comically.\nD: Once done with cleaning, the person used the broom as a guitar and performed a funny air guitar solo.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_6.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_157_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A high-speed chase\nB: A discarded vehicle.\nC: A luxury car showroom\nD: A traffic jam on the highway", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A high-speed chase\nB: A discarded vehicle.\nC: A luxury car showroom\nD: A traffic jam on the highway", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_7.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_158_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Dogs scared of needles during vet visit\nB: Canine vaccination anxiety revealed\nC: Canine pups are also frightened of getting vaccinated.\nD: Adorable puppies facing their worst fear", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Dogs scared of needles during vet visit\nB: Canine vaccination anxiety revealed\nC: Canine pups are also frightened of getting vaccinated.\nD: Adorable puppies facing their worst fear", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_6.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_159_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The unexpected twist in the storyline creates a funny outcome.\nB: The use of exaggerated facial expressions makes the video amusing.\nC: The humorous background music adds to the comedic effect.\nD: The funny thing is the expression on the lady's face, as if she is putting in a lot of effort, which is quite comical.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The unexpected twist in the storyline creates a funny outcome.\nB: The use of exaggerated facial expressions makes the video amusing.\nC: The humorous background music adds to the comedic effect.\nD: The funny thing is the expression on the lady's face, as if she is putting in a lot of effort, which is quite comical.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_2.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_160_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: At a crowded market, a monkey was laughing loudly while stealing fruits.\nB: In a library, a cat was meowing loudly while knocking down books.\nC: On the coastal pathway, one bird was screeching with an open beak, followed by another bird's call.\nD: In a classroom, a dog was howling loudly while running around the desk.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: At a crowded market, a monkey was laughing loudly while stealing fruits.\nB: In a library, a cat was meowing loudly while knocking down books.\nC: On the coastal pathway, one bird was screeching with an open beak, followed by another bird's call.\nD: In a classroom, a dog was 
howling loudly while running around the desk.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_161_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Discovering the hidden secrets of the universe through music.\nB: A funny look at pet care techniques.\nC: Mastering the art of cooking without a recipe.\nD: The ideal applications of a stethoscope.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Discovering the hidden secrets of the universe through music.\nB: A funny look 
at pet care techniques.\nC: Mastering the art of cooking without a recipe.\nD: The ideal applications of a stethoscope.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_162_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The Joker and Batman are dancing in perfect sync.\nB: The head of Kung Fu Panda is perfectly aligned with Mickey Mouse's body, with the addition of a pair of human legs.\nC: Minnie Mouse and Donald Duck are having a tea party together.\nD: SpongeBob SquarePants is playing the piano with all four arms.", "question": 
"Please generate a description for this meme", "context": "Select from the following choices.\nA: The Joker and Batman are dancing in perfect sync.\nB: The head of Kung Fu Panda is perfectly aligned with Mickey Mouse's body, with the addition of a pair of human legs.\nC: Minnie Mouse and Donald Duck are having a tea party together.\nD: SpongeBob SquarePants is playing the piano with all four arms.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_163_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Three friends jumping on a 
trampoline.\nB: Two women are trying to balance on a seesaw.\nC: Sisters falling together from a swing.\nD: If they are sisters, they will fall together on level ground.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Three friends jumping on a trampoline.\nB: Two women are trying to balance on a seesaw.\nC: Sisters falling together from a swing.\nD: If they are sisters, they will fall together on level ground.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_164_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural 
image", "source": "fun_qa", "options": "A: The playful music in the background\nB: The comical facial expressions of the onlookers\nC: It was a funny sight to see the woman drop from the swing.\nD: The unexpectedness of the woman dropping from the swing", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The playful music in the background\nB: The comical facial expressions of the onlookers\nC: It was a funny sight to see the woman drop from the swing.\nD: The unexpectedness of the woman dropping from the swing", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_165_15.png"], "output": "C", "qwen3-vl": 
"image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A dog suddenly ran onto the mat, causing the high jumper to lose balance and fall.\nB: The high jump bar fell due to a strong wind, causing chaos in the competition.\nC: Someone jumped high and landed on the edge of the mat, resulting in the mat being overturned and the high jump apparatus being knocked down.\nD: The person attempted a somersault but failed and crashed into the audience.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A dog suddenly ran onto the mat, causing the high jumper to lose balance and fall.\nB: The high jump bar fell due to a strong wind, causing chaos in the competition.\nC: Someone jumped high and landed on the edge of the mat, resulting in the mat being overturned and the high jump apparatus being knocked down.\nD: The person attempted a somersault but failed and crashed into the audience.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_10.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_166_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Boring Haircut.\nB: Flaming Hairstyle.\nC: Soggy Hairstyle.\nD: Plain Hairstyle.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Boring Haircut.\nB: Flaming Hairstyle.\nC: Soggy Hairstyle.\nD: Plain Hairstyle.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_11.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_167_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: With its eyes covered, a donkey is being walked around a millstone, which causes it to eventually start nibbling on the grains on the plate.\nB: The millstone is spinning while the donkey watches from a distance.\nC: A donkey is leisurely walking around a millstone, observing its surroundings.\nD: The donkey is standing still, not showing any reaction to the millstone.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: With its eyes covered, a donkey is being walked around a millstone, which causes it to eventually start nibbling on the grains on the plate.\nB: The millstone is spinning while the donkey watches from a distance.\nC: A donkey is leisurely walking around a millstone, observing its surroundings.\nD: The donkey is standing still, not showing any reaction to the millstone.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_6.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_168_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A small dog jumping through a hula hoop.\nB: A little dog leaping over a bungee cord.\nC: A tiny pup skipping over an elastic cord.\nD: A miniature puppy hopping over a rubber band.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A small dog jumping through a hula hoop.\nB: A little dog leaping over a bungee cord.\nC: A tiny pup skipping over an elastic cord.\nD: A miniature puppy hopping over a rubber band.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_5.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_169_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A dump truck is driving on a highway with multiple lanes. The sun is setting in the background, casting long shadows from the surrounding trees.\nB: A delivery van is parked outside a cafe, with people passing by and clouds drifting slowly across the sky.\nC: A cement mixer truck is moving through a tunnel with a series of lights along both sides of the tunnel. The ladder at the rear of the truck produces shadows in varying locations as it moves beneath the different lights.\nD: A bicycle is traversing a bridge with colorful graffiti on the walls. The river below reflects the bridge's arches in a mesmerizing pattern.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A dump truck is driving on a highway with multiple lanes. 
The sun is setting in the background, casting long shadows from the surrounding trees.\nB: A delivery van is parked outside a cafe, with people passing by and clouds drifting slowly across the sky.\nC: A cement mixer truck is moving through a tunnel with a series of lights along both sides of the tunnel. The ladder at the rear of the truck produces shadows in varying locations as it moves beneath the different lights.\nD: A bicycle is traversing a bridge with colorful graffiti on the walls. The river below reflects the bridge's arches in a mesmerizing pattern.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_170_15.png"], "output": "C", "qwen3-vl": "image none"}, 
{"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Way off target.\nB: Bullseye!\nC: Right on target.\nD: Missed the mark.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Way off target.\nB: Bullseye!\nC: Right on target.\nD: Missed the mark.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_171_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video is funny because the man singing is very 
passionate, while the man sitting next to him appears to be bored and uninterested, reducing the humor.\nB: The video is funny because the man singing is very passionate, while the man sitting next to him looks very uncomfortable and in pain after hearing it. The contrast between the two is amusing, showing that the man's singing is actually quite good, which makes people laugh.\nC: The video is funny because the man singing is very passionate, while the man sitting next to him is also enjoying the singing, which adds to the fun atmosphere.\nD: The video is funny because the man singing is very passionate, while the man sitting next to him looks very uncomfortable and in pain after hearing it. The contrast between the two is amusing, showing that the man's singing is so bad yet he is still so into it, which makes people laugh.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video is funny because the man singing is very passionate, while the man sitting next to him appears to be bored and uninterested, reducing the humor.\nB: The video is funny because the man singing is very passionate, while the man sitting next to him looks very uncomfortable and in pain after hearing it. The contrast between the two is amusing, showing that the man's singing is actually quite good, which makes people laugh.\nC: The video is funny because the man singing is very passionate, while the man sitting next to him is also enjoying the singing, which adds to the fun atmosphere.\nD: The video is funny because the man singing is very passionate, while the man sitting next to him looks very uncomfortable and in pain after hearing it. 
The contrast between the two is amusing, showing that the man's singing is so bad yet he is still so into it, which makes people laugh.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_172_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video maintains its humorous tone by showing a person and a cat dance in unison, which is a rare sight, and the cat's happy and excited expression adds a touch of humor to the situation.\nB: Seeing a person and a dog dance in unison is a rare sight, and the little pup's confused and disoriented 
expression adds a touch of humor to the situation.\nC: The video maintains its humorous tone by showing a cat and a dog dancing in unison, which is a rare sight, and the cat's confused and disoriented expression adds a touch of humor to the situation.\nD: The video maintains its humorous tone by showing a person and a dog dance in unison, which is a common sight, and the little dog's confident and focused expression adds a touch of seriousness to the situation.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video maintains its humorous tone by showing a person and a cat dance in unison, which is a rare sight, and the cat's happy and excited expression adds a touch of humor to the situation.\nB: Seeing a person and a dog dance in unison is a rare sight, and the little pup's confused and disoriented expression adds a touch of humor to the situation.\nC: The video maintains its humorous tone by showing a cat and a dog dancing in unison, which is a rare sight, and the cat's confused and disoriented expression adds a touch of humor to the situation.\nD: The video maintains its humorous tone by showing a person and a dog dance in unison, which is a common sight, and the little dog's confident and focused expression adds a touch of seriousness to the situation.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_6.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_173_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: There are no other puppies in the video, it's just the dog shaking up and down.\nB: The dog is shaking up and down in mid-air, while its feet keep stepping on another puppy's head repeatedly. It looks like it's doing pull-ups, and the movement is very strange.\nC: The dog is actually flying and not shaking up and down.\nD: The dog is calmly sitting on the ground and not moving at all.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: There are no other puppies in the video, it's just the dog shaking up and down.\nB: The dog is shaking up and down in mid-air, while its feet keep stepping on another puppy's head repeatedly. 
It looks like it's doing pull-ups, and the movement is very strange.\nC: The dog is actually flying and not shaking up and down.\nD: The dog is calmly sitting on the ground and not moving at all.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_174_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Amidst the background noise of a person's loud laughter, two birds begin to cackle with their beaks wide open, much like human laughter. 
It appears as if they are responding to the human's merriment, making the scene quite comical.\nB: The video is humorous because the birds are actually speaking in human language and telling jokes.\nC: The comical element comes from the birds nervously laughing in response to the person's aggressive behavior.\nD: The humor arises from the birds mimicking the laughter of the person, creating an eerie and unsettling atmosphere.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Amidst the background noise of a person's loud laughter, two birds begin to cackle with their beaks wide open, much like human laughter. It appears as if they are responding to the human's merriment, making the scene quite comical.\nB: The video is humorous because the birds are actually speaking in human language and telling jokes.\nC: The comical element comes from the birds nervously laughing in response to the person's aggressive behavior.\nD: The humor arises from the birds mimicking the laughter of the person, creating an eerie and unsettling atmosphere.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_9.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_175_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The kitty is very hungry.\nB: This kitty won't eat.\nC: The cat is eagerly devouring its food.\nD: This cat is enjoying a meal.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The kitty is very hungry.\nB: This kitty won't eat.\nC: The cat is eagerly devouring its food.\nD: This cat is enjoying a meal.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_9.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_176_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The ladder's shadows looked scary and ominous in the tunnel.\nB: In the tunnel, the cement truck kept moving forward. The shadows of the ladder under different lights seemed to be dancing, making the ladder very interesting.\nC: The cement truck's movement was unpredictable and chaotic.\nD: The video was boring and lacked any comedic elements.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The ladder's shadows looked scary and ominous in the tunnel.\nB: In the tunnel, the cement truck kept moving forward. 
The shadows of the ladder under different lights seemed to be dancing, making the ladder very interesting.\nC: The cement truck's movement was unpredictable and chaotic.\nD: The video was boring and lacked any comedic elements.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_177_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A random spectator enjoying the tug-of-war match.\nB: The most exceptional player in the audience during a tug-of-war event.\nC: The unexpected hero of a sports event.\nD: The surprising winner of a 
competitive game.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A random spectator enjoying the tug-of-war match.\nB: The most exceptional player in the audience during a tug-of-war event.\nC: The unexpected hero of a sports event.\nD: The surprising winner of a competitive game.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_178_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The old man tries to use the stethoscope as a microphone.\nB: The woman uses the 
stethoscope to listen to the old man's heartbeat.\nC: The old man uses the stethoscope to listen to music.\nD: The old man puts the stethoscope on his ears, and the woman speaks into the other end.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The old man tries to use the stethoscope as a microphone.\nB: The woman uses the stethoscope to listen to the old man's heartbeat.\nC: The old man uses the stethoscope to listen to music.\nD: The old man puts the stethoscope on his ears, and the woman speaks into the other end.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_179_15.png"], "output": "D", 
"qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Geospatial disturbance.\nB: Dimensional shift.\nC: Temporal distortion.\nD: Spatial displacement.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Geospatial disturbance.\nB: Dimensional shift.\nC: Temporal distortion.\nD: Spatial displacement.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_180_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", 
"source": "fun_qa", "options": "A: The video is comedic because the person fails to jump onto the cushion and falls down clumsily.\nB: After jumping onto a cushion, the person manages to flip the entire cushion over, which is an unforeseen turn of events.\nC: The video is comedic because the person successfully jumps onto the cushion but then just walks away.\nD: The video is comedic because the cushion remains unaffected and the person looks disappointed.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video is comedic because the person fails to jump onto the cushion and falls down clumsily.\nB: After jumping onto a cushion, the person manages to flip the entire cushion over, which is an unforeseen turn of events.\nC: The video is comedic because the person successfully jumps onto the cushion but then just walks away.\nD: The video is comedic because the cushion remains unaffected and the person looks disappointed.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_10.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_181_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The video is not funny because the man's failed stunt results in a serious injury and is actually quite disturbing.\nB: The video is funny because the man successfully performs a cool stunt by leaping over a garbage bin with elegance and grace.\nC: The video is humorous because the man attempts to pull off a cool stunt by leaping over a garbage bin, but his efforts result in failure and a comical groin injury.\nD: The video is funny because the man is not attempting any stunt and is just walking casually.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The video is not funny because the man's failed stunt results in a serious injury and is actually quite disturbing.\nB: The video is funny because the man successfully performs a cool stunt by leaping over a garbage bin with elegance and grace.\nC: The video is humorous because the man attempts to pull off a cool stunt by leaping over a garbage bin, but his efforts result in failure and a comical groin injury.\nD: The video is funny because the man is not attempting any stunt and is just walking casually.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_1.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_182_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A dog ran through and caused chaos by knocking over a table.\nB: A man suddenly tripped and knocked over a stack of chairs.\nC: When a woman fell, she inadvertently caused another woman to lose her balance.\nD: Two people collided while trying to catch a flying object.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A dog ran through and caused chaos by knocking over a table.\nB: A man suddenly tripped and knocked over a stack of chairs.\nC: When a woman fell, she inadvertently caused another woman to lose her balance.\nD: Two people collided while trying to 
catch a flying object.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_183_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The sound of the puppy's bark is uncannily similar to that of a baby's weeping. 
Standard puppy barks sound like \"woof woof woof,\" but this one's barks are \"ah ah ah\" with a piercing sound.\nB: The video maintains its humorous tone by juxtaposing the adorable visuals of the puppy with the unexpected high-pitched barks, creating a humorous and surprising experience for the viewers.\nC: The video maintains its humorous tone by incorporating funny captions that highlight the unusual sounds of the puppy's barks, enhancing the comedic effect of the meme.\nD: The video maintains its humorous tone through the clever use of unexpected sound effects, creating a comical contrast between the puppy's appearance and its unique barks.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The sound of the puppy's bark is uncannily similar to that of a baby's weeping. Standard puppy barks sound like \"woof woof woof,\" but this one's barks are \"ah ah ah\" with a piercing sound.\nB: The video maintains its humorous tone by juxtaposing the adorable visuals of the puppy with the unexpected high-pitched barks, creating a humorous and surprising experience for the viewers.\nC: The video maintains its humorous tone by incorporating funny captions that highlight the unusual sounds of the puppy's barks, enhancing the comedic effect of the meme.\nD: The video maintains its humorous tone through the clever use of unexpected sound effects, creating a comical contrast between the puppy's appearance and its unique barks.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_4.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_184_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A man is playing guitar at a concert with a deep, soothing voice and standing still.\nB: A man is reciting poetry in a library with a soft, calm voice and remaining seated.\nC: A man is rapping on stage with a loud, energetic voice and dancing wildly.\nD: A man is singing at KTV with a sharp, high-pitched voice and performing a forward-leaning arrow step stance.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A man is playing guitar at a concert with a deep, soothing voice and standing still.\nB: A man is reciting poetry in a library with a soft, calm voice and remaining seated.\nC: A man is rapping on stage with a loud, energetic voice and dancing wildly.\nD: A man is singing at KTV with a sharp, high-pitched voice and performing a forward-leaning arrow step stance.", "input_image_path": 
["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_185_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The man was too scared to start the zip-line.\nB: The staff member pushed the man without any reason.\nC: The man started the zip-line without looking around.\nD: Upon hearing the words of the staff member nearby, the man who was about to start the zip-line turned to stare at them before being swiftly pushed to begin the course.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The man was too 
scared to start the zip-line.\nB: The staff member pushed the man without any reason.\nC: The man started the zip-line without looking around.\nD: Upon hearing the words of the staff member nearby, the man who was about to start the zip-line turned to stare at them before being swiftly pushed to begin the course.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_186_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: At a birthday party, a man is painting with a brush in his hand, while another person is playing the guitar with intense 
focus.\nB: Inside a library, a person is quietly reading a book, while another individual is loudly discussing a topic on their phone.\nC: In a classroom, a teacher is scolding a student for not paying attention, while a group of students is laughing and having fun.\nD: Inside a KTV, a man is singing with a microphone in his grasp, keeping both hands open. On the adjacent sofa, a man is sitting with a fierce expression, unable to look straight ahead, with a downturned mouth.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: At a birthday party, a man is painting with a brush in his hand, while another person is playing the guitar with intense focus.\nB: Inside a library, a person is quietly reading a book, while another individual is loudly discussing a topic on their phone.\nC: In a classroom, a teacher is scolding a student for not paying attention, while a group of students is laughing and having fun.\nD: Inside a KTV, a man is singing with a microphone in his grasp, keeping both hands open. 
On the adjacent sofa, a man is sitting with a fierce expression, unable to look straight ahead, with a downturned mouth.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_187_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A little dog doing pull-ups.\nB: A young bird learning to fly for the first time.\nC: A small cat napping on a tree branch.\nD: A tiny hamster running on a miniature treadmill.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A little dog doing 
pull-ups.\nB: A young bird learning to fly for the first time.\nC: A small cat napping on a tree branch.\nD: A tiny hamster running on a miniature treadmill.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_188_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The well-dressed main character exudes sophistication.\nB: The run-down look of the car appears comical.\nC: The high-quality production value enhances the realism.\nD: The dramatic music in the background adds to the serious tone.", "question": "Please generate a description 
for this meme", "context": "Select from the following choices.\nA: The well-dressed main character exudes sophistication.\nB: The run-down look of the car appears comical.\nC: The high-quality production value enhances the realism.\nD: The dramatic music in the background adds to the serious tone.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_189_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The person secured a rubber band to the puppy and initiated a game of jump rope.\nB: The puppy started dancing to a classical music 
piece.\nC: The puppy started speaking in fluent English.\nD: The person tried to ride the puppy like a bull.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The person secured a rubber band to the puppy and initiated a game of jump rope.\nB: The puppy started dancing to a classical music piece.\nC: The puppy started speaking in fluent English.\nD: The person tried to ride the puppy like a bull.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_190_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": 
"fun_qa", "options": "A: The use of dramatic music in the background adds to the comical effect.\nB: The slow motion effect used in the video enhances the comedic timing.\nC: The sudden appearance of a clown is unexpected and causes laughter.\nD: The situation of someone being unable to locate the broom despite placing it there themselves is humorous.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The use of dramatic music in the background adds to the comical effect.\nB: The slow motion effect used in the video enhances the comedic timing.\nC: The sudden appearance of a clown is unexpected and causes laughter.\nD: The situation of someone being unable to locate the broom despite placing it there themselves is humorous.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_12.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_191_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: The man and woman were about to leave the dance floor when the male bystander stopped them.\nB: The man and woman were arguing until the male bystander interrupted them.\nC: A man and woman were holding each other and dancing at a ball. A male bystander came over and tapped the man on his back, and soon after, the two men began dancing together.\nD: The man and woman were performing a traditional dance until the male bystander joined in.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: The man and woman were about to leave the dance floor when the male bystander stopped them.\nB: The man and woman were arguing until the male bystander interrupted them.\nC: A man and woman were holding each other and dancing at a ball. 
A male bystander came over and tapped the man on his back, and soon after, the two men began dancing together.\nD: The man and woman were performing a traditional dance until the male bystander joined in.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_192_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Using flames to style hair is an uncommon sight, and the man's baldness on the upper portion of his head adds a comical element.\nB: The use of water to style hair is a common sight, and the man's baldness on the upper portion of 
his head adds a serious element.\nC: The use of feathers to style hair is an uncommon sight, and the man's full head of hair adds a comical element.\nD: The use of confetti to style hair is a common sight, and the man's baldness on the upper portion of his head adds a dramatic element.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Using flames to style hair is an uncommon sight, and the man's baldness on the upper portion of his head adds a comical element.\nB: The use of water to style hair is a common sight, and the man's baldness on the upper portion of his head adds a serious element.\nC: The use of feathers to style hair is an uncommon sight, and the man's full head of hair adds a comical element.\nD: The use of confetti to style hair is a common sight, and the man's baldness on the upper portion of his head adds a dramatic element.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_11.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_193_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Using his hand to tap the mirror, the man then linked his hands together, only to reveal that he had a phone gripped in his mouth.\nB: The man used his hand to tap the mirror, then he linked his hands together, and finally he showed that he had a wallet stuck in his mouth.\nC: Initially tapping the mirror, the man then interlocked his fingers, ultimately exposing that he was holding a toy car in his mouth.\nD: After tapping the mirror with his hand, the man formed a heart shape with his fingers, leading to the reveal of a sandwich he was holding in his mouth.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Using his hand to tap the mirror, the man then linked his hands together, only to reveal that he had a phone gripped in his mouth.\nB: The man used his hand to tap the mirror, then he linked his hands together, and finally he showed that he had a wallet stuck in his mouth.\nC: Initially tapping the mirror, the man then interlocked his fingers, ultimately exposing that he was holding a toy car in his mouth.\nD: After tapping the mirror with his hand, the man formed a heart shape with his fingers, leading to the reveal of a sandwich he was holding in his mouth.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_1.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_194_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: A dog is resting atop another dog.\nB: Two dogs playing together in the park.\nC: A cat is sleeping on a dog's back.\nD: A dog is jumping over another dog.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: A dog is resting atop another dog.\nB: Two dogs playing together in the park.\nC: A cat is sleeping on a dog's back.\nD: A dog is jumping over another dog.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_1.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_195_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Martial Arts Mickey Mouse.\nB: Ninja SpongeBob SquarePants.\nC: Karate Bugs Bunny.\nD: Kung Fu Donald Duck.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Martial Arts Mickey Mouse.\nB: Ninja SpongeBob SquarePants.\nC: Karate Bugs Bunny.\nD: Kung Fu Donald Duck.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_2.png", 
"./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_196_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: There is no music playing in the video, it is silent.\nB: The men in the video are not dancing, but are standing still.\nC: The dog is not involved in the dance, but is just sitting in the background.\nD: A dog is part of the dance as two men groove to the music.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: There is no music playing in the video, it is silent.\nB: The men in the video are not dancing, but are standing still.\nC: The dog is not involved in the dance, but is just sitting in the background.\nD: A dog is part of the dance as two men groove to the music.", "input_image_path": 
["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_197_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Sitting on a swing, the woman couldn't maintain stability and ended up falling down.\nB: The woman gracefully mastered the art of swinging and looked like a professional gymnast.\nC: The swing gently rocked back and forth as the woman enjoyed a peaceful moment.\nD: The woman effortlessly balanced on the swing, demonstrating impressive acrobatic skills.", "question": "Please generate a description for this meme", "context": "Select from the following 
choices.\nA: Sitting on a swing, the woman couldn't maintain stability and ended up falling down.\nB: The woman gracefully mastered the art of swinging and looked like a professional gymnast.\nC: The swing gently rocked back and forth as the woman enjoyed a peaceful moment.\nD: The woman effortlessly balanced on the swing, demonstrating impressive acrobatic skills.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_198_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "meme_vedio_understanding", "visual_input_component": "Video image or Natural image", "source": "fun_qa", "options": "A: Blindfolded cooking experiment\nB: Unexpected blindfold challenge\nC: 
Blindfolded taste test gone wrong!\nD: The love for food persists even when blindfolded.", "question": "Please generate a description for this meme", "context": "Select from the following choices.\nA: Blindfolded cooking experiment\nB: Unexpected blindfold challenge\nC: Blindfolded taste test gone wrong!\nD: The love for food persists even when blindfolded.", "input_image_path": ["./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_0.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_1.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_2.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_3.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_4.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_5.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_6.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_7.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_8.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_9.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_10.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_11.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_12.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_13.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_14.png", "./Continuous-temporal/meme_vedio_understanding/meme_vedio_understanding_199_15.png"], "output": "D", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/mevis/qwen3-vl/metadata_info.json b/results/mevis/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..d9e9cbf --- /dev/null +++ 
b/results/mevis/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.108, 0.0, 0.425, 0.999]\nB: [0.092, 0.001, 0.409, 1.0]\nC: [0.108, 0.0, 0.383, 1.167]\nD: [0.64, 0.324, 0.772, 0.771]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. Note that the width of the input image is 1460 and the height is 864.\nCAPTION: Man kneeling and feeding pandas then standing up and moving around", "context": "Select from the following choices.\nA: [0.108, 0.0, 0.425, 0.999]\nB: [0.092, 0.001, 0.409, 1.0]\nC: [0.108, 0.0, 0.383, 1.167]\nD: [0.64, 0.324, 0.772, 0.771]", "input_image_path": ["./Continuous-temporal/mevis/mevis_0_0.jpg", "./Continuous-temporal/mevis/mevis_0_1.jpg", "./Continuous-temporal/mevis/mevis_0_2.jpg", "./Continuous-temporal/mevis/mevis_0_3.jpg", "./Continuous-temporal/mevis/mevis_0_4.jpg", "./Continuous-temporal/mevis/mevis_0_5.jpg", "./Continuous-temporal/mevis/mevis_0_6.jpg", "./Continuous-temporal/mevis/mevis_0_7.jpg", "./Continuous-temporal/mevis/mevis_0_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.459, 0.453, 0.692, 0.601]\nB: [0.499, 0.224, 0.658, 0.74]\nC: [0.546, 0.275, 0.855, 0.561]\nD: [0.648, 0.603, 0.942, 0.669]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. 
Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. Note that the width of the input image is 1280 and the height is 720.\nCAPTION: Monkey jumping to the right", "context": "Select from the following choices.\nA: [0.459, 0.453, 0.692, 0.601]\nB: [0.499, 0.224, 0.658, 0.74]\nC: [0.546, 0.275, 0.855, 0.561]\nD: [0.648, 0.603, 0.942, 0.669]", "input_image_path": ["./Continuous-temporal/mevis/mevis_1_0.jpg", "./Continuous-temporal/mevis/mevis_1_1.jpg", "./Continuous-temporal/mevis/mevis_1_2.jpg", "./Continuous-temporal/mevis/mevis_1_3.jpg", "./Continuous-temporal/mevis/mevis_1_4.jpg", "./Continuous-temporal/mevis/mevis_1_5.jpg", "./Continuous-temporal/mevis/mevis_1_6.jpg", "./Continuous-temporal/mevis/mevis_1_7.jpg", "./Continuous-temporal/mevis/mevis_1_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.471, 0.597, 0.96, 0.716]\nB: [0.104, 0.011, 0.433, 0.256]\nC: [0.204, 0.0, 0.999, 0.792]\nD: [0.0, 0.081, 0.795, 0.873]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1766 and the height is 945.\nCAPTION: Airplane moving from left to right", "context": "Select from the following choices.\nA: [0.471, 0.597, 0.96, 0.716]\nB: [0.104, 0.011, 0.433, 0.256]\nC: [0.204, 0.0, 0.999, 0.792]\nD: [0.0, 0.081, 0.795, 0.873]", "input_image_path": ["./Continuous-temporal/mevis/mevis_2_0.jpg", "./Continuous-temporal/mevis/mevis_2_1.jpg", "./Continuous-temporal/mevis/mevis_2_2.jpg", "./Continuous-temporal/mevis/mevis_2_3.jpg", "./Continuous-temporal/mevis/mevis_2_4.jpg", "./Continuous-temporal/mevis/mevis_2_5.jpg", "./Continuous-temporal/mevis/mevis_2_6.jpg", "./Continuous-temporal/mevis/mevis_2_7.jpg", "./Continuous-temporal/mevis/mevis_2_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.751, 0.411, 0.953, 0.67]\nB: [0.404, 0.633, 0.585, 0.895]\nC: [0.331, 0.387, 0.736, 0.809]\nD: [0.734, 0.306, 0.936, 0.565]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: moving bike", "context": "Select from the following choices.\nA: [0.751, 0.411, 0.953, 0.67]\nB: [0.404, 0.633, 0.585, 0.895]\nC: [0.331, 0.387, 0.736, 0.809]\nD: [0.734, 0.306, 0.936, 0.565]", "input_image_path": ["./Continuous-temporal/mevis/mevis_3_0.jpg", "./Continuous-temporal/mevis/mevis_3_1.jpg", "./Continuous-temporal/mevis/mevis_3_2.jpg", "./Continuous-temporal/mevis/mevis_3_3.jpg", "./Continuous-temporal/mevis/mevis_3_4.jpg", "./Continuous-temporal/mevis/mevis_3_5.jpg", "./Continuous-temporal/mevis/mevis_3_6.jpg", "./Continuous-temporal/mevis/mevis_3_7.jpg", "./Continuous-temporal/mevis/mevis_3_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.515, 0.0, 0.668, 0.425]\nB: [0.489, 0.0, 0.646, 0.49]\nC: [0.515, 0.0, 0.672, 0.49]\nD: [0.818, 0.565, 0.854, 0.915]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: person holding a rope", "context": "Select from the following choices.\nA: [0.515, 0.0, 0.668, 0.425]\nB: [0.489, 0.0, 0.646, 0.49]\nC: [0.515, 0.0, 0.672, 0.49]\nD: [0.818, 0.565, 0.854, 0.915]", "input_image_path": ["./Continuous-temporal/mevis/mevis_4_0.jpg", "./Continuous-temporal/mevis/mevis_4_1.jpg", "./Continuous-temporal/mevis/mevis_4_2.jpg", "./Continuous-temporal/mevis/mevis_4_3.jpg", "./Continuous-temporal/mevis/mevis_4_4.jpg", "./Continuous-temporal/mevis/mevis_4_5.jpg", "./Continuous-temporal/mevis/mevis_4_6.jpg", "./Continuous-temporal/mevis/mevis_4_7.jpg", "./Continuous-temporal/mevis/mevis_4_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.229, 0.095, 0.414, 0.583]\nB: [0.6, 0.377, 0.902, 0.706]\nC: [0.34, 0.421, 0.4, 0.718]\nD: [0.34, 0.421, 0.4, 0.663]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 512 and the height is 252.\nCAPTION: The man who pulls the horse and runs in circles", "context": "Select from the following choices.\nA: [0.229, 0.095, 0.414, 0.583]\nB: [0.6, 0.377, 0.902, 0.706]\nC: [0.34, 0.421, 0.4, 0.718]\nD: [0.34, 0.421, 0.4, 0.663]", "input_image_path": ["./Continuous-temporal/mevis/mevis_5_0.jpg", "./Continuous-temporal/mevis/mevis_5_1.jpg", "./Continuous-temporal/mevis/mevis_5_2.jpg", "./Continuous-temporal/mevis/mevis_5_3.jpg", "./Continuous-temporal/mevis/mevis_5_4.jpg", "./Continuous-temporal/mevis/mevis_5_5.jpg", "./Continuous-temporal/mevis/mevis_5_6.jpg", "./Continuous-temporal/mevis/mevis_5_7.jpg", "./Continuous-temporal/mevis/mevis_5_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.938, 0.165, 0.965, 0.472]\nB: [0.002, 0.107, 0.289, 0.46]\nC: [0.243, 0.441, 0.277, 0.48]\nD: [0.243, 0.441, 0.28, 0.48]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: Stationary truck", "context": "Select from the following choices.\nA: [0.938, 0.165, 0.965, 0.472]\nB: [0.002, 0.107, 0.289, 0.46]\nC: [0.243, 0.441, 0.277, 0.48]\nD: [0.243, 0.441, 0.28, 0.48]", "input_image_path": ["./Continuous-temporal/mevis/mevis_6_0.jpg", "./Continuous-temporal/mevis/mevis_6_1.jpg", "./Continuous-temporal/mevis/mevis_6_2.jpg", "./Continuous-temporal/mevis/mevis_6_3.jpg", "./Continuous-temporal/mevis/mevis_6_4.jpg", "./Continuous-temporal/mevis/mevis_6_5.jpg", "./Continuous-temporal/mevis/mevis_6_6.jpg", "./Continuous-temporal/mevis/mevis_6_7.jpg", "./Continuous-temporal/mevis/mevis_6_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.36, 0.405, 0.87, 0.536]\nB: [0.36, 0.405, 0.802, 0.554]\nC: [0.627, 0.3, 0.791, 0.407]\nD: [0.36, 0.405, 0.799, 0.534]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 496.\nCAPTION: plane move faster", "context": "Select from the following choices.\nA: [0.36, 0.405, 0.87, 0.536]\nB: [0.36, 0.405, 0.802, 0.554]\nC: [0.627, 0.3, 0.791, 0.407]\nD: [0.36, 0.405, 0.799, 0.534]", "input_image_path": ["./Continuous-temporal/mevis/mevis_7_0.jpg", "./Continuous-temporal/mevis/mevis_7_1.jpg", "./Continuous-temporal/mevis/mevis_7_2.jpg", "./Continuous-temporal/mevis/mevis_7_3.jpg", "./Continuous-temporal/mevis/mevis_7_4.jpg", "./Continuous-temporal/mevis/mevis_7_5.jpg", "./Continuous-temporal/mevis/mevis_7_6.jpg", "./Continuous-temporal/mevis/mevis_7_7.jpg", "./Continuous-temporal/mevis/mevis_7_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.312, 0.268, 0.537, 0.651]\nB: [0.189, 0.408, 0.598, 0.683]\nC: [0.266, 0.106, 0.491, 0.488]\nD: [0.313, 0.153, 0.539, 0.535]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 596.\nCAPTION: panda sit and eat, then walking to the leftmost", "context": "Select from the following choices.\nA: [0.312, 0.268, 0.537, 0.651]\nB: [0.189, 0.408, 0.598, 0.683]\nC: [0.266, 0.106, 0.491, 0.488]\nD: [0.313, 0.153, 0.539, 0.535]", "input_image_path": ["./Continuous-temporal/mevis/mevis_8_0.jpg", "./Continuous-temporal/mevis/mevis_8_1.jpg", "./Continuous-temporal/mevis/mevis_8_2.jpg", "./Continuous-temporal/mevis/mevis_8_3.jpg", "./Continuous-temporal/mevis/mevis_8_4.jpg", "./Continuous-temporal/mevis/mevis_8_5.jpg", "./Continuous-temporal/mevis/mevis_8_6.jpg", "./Continuous-temporal/mevis/mevis_8_7.jpg", "./Continuous-temporal/mevis/mevis_8_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.517, 0.399, 0.793, 0.95]\nB: [0.672, 0.449, 0.947, 1.0]\nC: [0.64, 0.192, 0.916, 0.743]\nD: [0.56, 0.448, 0.835, 0.999]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: fish stay still and is the final to move, then swim around", "context": "Select from the following choices.\nA: [0.517, 0.399, 0.793, 0.95]\nB: [0.672, 0.449, 0.947, 1.0]\nC: [0.64, 0.192, 0.916, 0.743]\nD: [0.56, 0.448, 0.835, 0.999]", "input_image_path": ["./Continuous-temporal/mevis/mevis_9_0.jpg", "./Continuous-temporal/mevis/mevis_9_1.jpg", "./Continuous-temporal/mevis/mevis_9_2.jpg", "./Continuous-temporal/mevis/mevis_9_3.jpg", "./Continuous-temporal/mevis/mevis_9_4.jpg", "./Continuous-temporal/mevis/mevis_9_5.jpg", "./Continuous-temporal/mevis/mevis_9_6.jpg", "./Continuous-temporal/mevis/mevis_9_7.jpg", "./Continuous-temporal/mevis/mevis_9_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.098, 0.465, 0.303, 0.585]\nB: [0.121, 0.454, 0.326, 0.574]\nC: [0.121, 0.454, 0.324, 0.579]\nD: [0.121, 0.454, 0.326, 0.594]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: black one that turns and goes left", "context": "Select from the following choices.\nA: [0.098, 0.465, 0.303, 0.585]\nB: [0.121, 0.454, 0.326, 0.574]\nC: [0.121, 0.454, 0.324, 0.579]\nD: [0.121, 0.454, 0.326, 0.594]", "input_image_path": ["./Continuous-temporal/mevis/mevis_10_0.jpg", "./Continuous-temporal/mevis/mevis_10_1.jpg", "./Continuous-temporal/mevis/mevis_10_2.jpg", "./Continuous-temporal/mevis/mevis_10_3.jpg", "./Continuous-temporal/mevis/mevis_10_4.jpg", "./Continuous-temporal/mevis/mevis_10_5.jpg", "./Continuous-temporal/mevis/mevis_10_6.jpg", "./Continuous-temporal/mevis/mevis_10_7.jpg", "./Continuous-temporal/mevis/mevis_10_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.075, 0.629, 0.129, 0.875]\nB: [0.714, 0.632, 0.746, 0.732]\nC: [0.356, 0.713, 0.678, 0.908]\nD: [0.714, 0.632, 0.74, 0.728]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: The horse running alongside the white railing.", "context": "Select from the following choices.\nA: [0.075, 0.629, 0.129, 0.875]\nB: [0.714, 0.632, 0.746, 0.732]\nC: [0.356, 0.713, 0.678, 0.908]\nD: [0.714, 0.632, 0.74, 0.728]", "input_image_path": ["./Continuous-temporal/mevis/mevis_11_0.jpg", "./Continuous-temporal/mevis/mevis_11_1.jpg", "./Continuous-temporal/mevis/mevis_11_2.jpg", "./Continuous-temporal/mevis/mevis_11_3.jpg", "./Continuous-temporal/mevis/mevis_11_4.jpg", "./Continuous-temporal/mevis/mevis_11_5.jpg", "./Continuous-temporal/mevis/mevis_11_6.jpg", "./Continuous-temporal/mevis/mevis_11_7.jpg", "./Continuous-temporal/mevis/mevis_11_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.0, 0.458, 0.149, 0.824]\nB: [0.309, 0.0, 0.473, 0.107]\nC: [0.274, 0.0, 0.438, 0.107]\nD: [0.274, 0.0, 0.436, 0.1]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 720.\nCAPTION: people standing", "context": "Select from the following choices.\nA: [0.0, 0.458, 0.149, 0.824]\nB: [0.309, 0.0, 0.473, 0.107]\nC: [0.274, 0.0, 0.438, 0.107]\nD: [0.274, 0.0, 0.436, 0.1]", "input_image_path": ["./Continuous-temporal/mevis/mevis_12_0.jpg", "./Continuous-temporal/mevis/mevis_12_1.jpg", "./Continuous-temporal/mevis/mevis_12_2.jpg", "./Continuous-temporal/mevis/mevis_12_3.jpg", "./Continuous-temporal/mevis/mevis_12_4.jpg", "./Continuous-temporal/mevis/mevis_12_5.jpg", "./Continuous-temporal/mevis/mevis_12_6.jpg", "./Continuous-temporal/mevis/mevis_12_7.jpg", "./Continuous-temporal/mevis/mevis_12_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.221, 0.11, 0.562, 0.656]\nB: [0.137, 0.273, 0.503, 0.921]\nC: [0.137, 0.273, 0.426, 0.741]\nD: [0.137, 0.273, 0.478, 0.819]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: front elephant walking to backwards", "context": "Select from the following choices.\nA: [0.221, 0.11, 0.562, 0.656]\nB: [0.137, 0.273, 0.503, 0.921]\nC: [0.137, 0.273, 0.426, 0.741]\nD: [0.137, 0.273, 0.478, 0.819]", "input_image_path": ["./Continuous-temporal/mevis/mevis_13_0.jpg", "./Continuous-temporal/mevis/mevis_13_1.jpg", "./Continuous-temporal/mevis/mevis_13_2.jpg", "./Continuous-temporal/mevis/mevis_13_3.jpg", "./Continuous-temporal/mevis/mevis_13_4.jpg", "./Continuous-temporal/mevis/mevis_13_5.jpg", "./Continuous-temporal/mevis/mevis_13_6.jpg", "./Continuous-temporal/mevis/mevis_13_7.jpg", "./Continuous-temporal/mevis/mevis_13_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.443, 0.427, 0.702, 0.837]\nB: [0.443, 0.427, 0.732, 0.895]\nC: [0.532, 0.417, 0.86, 0.458]\nD: [0.221, 0.395, 0.5, 0.881]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1460 and the height is 864.\nCAPTION: sit down and eat, then walk and stand up using the back legs", "context": "Select from the following choices.\nA: [0.443, 0.427, 0.702, 0.837]\nB: [0.443, 0.427, 0.732, 0.895]\nC: [0.532, 0.417, 0.86, 0.458]\nD: [0.221, 0.395, 0.5, 0.881]", "input_image_path": ["./Continuous-temporal/mevis/mevis_14_0.jpg", "./Continuous-temporal/mevis/mevis_14_1.jpg", "./Continuous-temporal/mevis/mevis_14_2.jpg", "./Continuous-temporal/mevis/mevis_14_3.jpg", "./Continuous-temporal/mevis/mevis_14_4.jpg", "./Continuous-temporal/mevis/mevis_14_5.jpg", "./Continuous-temporal/mevis/mevis_14_6.jpg", "./Continuous-temporal/mevis/mevis_14_7.jpg", "./Continuous-temporal/mevis/mevis_14_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.034, 0.165, 0.329, 0.683]\nB: [0.084, 0.018, 0.378, 0.536]\nC: [0.177, 0.224, 0.471, 0.742]\nD: [0.198, 0.454, 0.492, 0.972]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: The yellow truck in motion.", "context": "Select from the following choices.\nA: [0.034, 0.165, 0.329, 0.683]\nB: [0.084, 0.018, 0.378, 0.536]\nC: [0.177, 0.224, 0.471, 0.742]\nD: [0.198, 0.454, 0.492, 0.972]", "input_image_path": ["./Continuous-temporal/mevis/mevis_15_0.jpg", "./Continuous-temporal/mevis/mevis_15_1.jpg", "./Continuous-temporal/mevis/mevis_15_2.jpg", "./Continuous-temporal/mevis/mevis_15_3.jpg", "./Continuous-temporal/mevis/mevis_15_4.jpg", "./Continuous-temporal/mevis/mevis_15_5.jpg", "./Continuous-temporal/mevis/mevis_15_6.jpg", "./Continuous-temporal/mevis/mevis_15_7.jpg", "./Continuous-temporal/mevis/mevis_15_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.484, 0.053, 0.529, 0.188]\nB: [0.394, 0.024, 0.831, 0.342]\nC: [0.481, 0.01, 0.526, 0.145]\nD: [0.503, 0.0, 0.548, 0.135]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: the right one of the two sitting people in the distance", "context": "Select from the following choices.\nA: [0.484, 0.053, 0.529, 0.188]\nB: [0.394, 0.024, 0.831, 0.342]\nC: [0.481, 0.01, 0.526, 0.145]\nD: [0.503, 0.0, 0.548, 0.135]", "input_image_path": ["./Continuous-temporal/mevis/mevis_16_0.jpg", "./Continuous-temporal/mevis/mevis_16_1.jpg", "./Continuous-temporal/mevis/mevis_16_2.jpg", "./Continuous-temporal/mevis/mevis_16_3.jpg", "./Continuous-temporal/mevis/mevis_16_4.jpg", "./Continuous-temporal/mevis/mevis_16_5.jpg", "./Continuous-temporal/mevis/mevis_16_6.jpg", "./Continuous-temporal/mevis/mevis_16_7.jpg", "./Continuous-temporal/mevis/mevis_16_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.546, 0.484, 0.821, 0.866]\nB: [0.354, 0.538, 0.431, 0.933]\nC: [0.105, 0.437, 0.253, 0.914]\nD: [0.373, 0.403, 0.451, 0.797]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1567 and the height is 730.\nCAPTION: standing still eating the grass without changing position", "context": "Select from the following choices.\nA: [0.546, 0.484, 0.821, 0.866]\nB: [0.354, 0.538, 0.431, 0.933]\nC: [0.105, 0.437, 0.253, 0.914]\nD: [0.373, 0.403, 0.451, 0.797]", "input_image_path": ["./Continuous-temporal/mevis/mevis_17_0.jpg", "./Continuous-temporal/mevis/mevis_17_1.jpg", "./Continuous-temporal/mevis/mevis_17_2.jpg", "./Continuous-temporal/mevis/mevis_17_3.jpg", "./Continuous-temporal/mevis/mevis_17_4.jpg", "./Continuous-temporal/mevis/mevis_17_5.jpg", "./Continuous-temporal/mevis/mevis_17_6.jpg", "./Continuous-temporal/mevis/mevis_17_7.jpg", "./Continuous-temporal/mevis/mevis_17_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.077, 0.644, 0.468, 1.0]\nB: [0.0, 0.644, 0.391, 1.0]\nC: [0.123, 0.642, 0.514, 0.999]\nD: [0.123, 0.642, 0.574, 0.992]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 794.\nCAPTION: The bear that was pinned to the ground during the fight", "context": "Select from the following choices.\nA: [0.077, 0.644, 0.468, 1.0]\nB: [0.0, 0.644, 0.391, 1.0]\nC: [0.123, 0.642, 0.514, 0.999]\nD: [0.123, 0.642, 0.574, 0.992]", "input_image_path": ["./Continuous-temporal/mevis/mevis_18_0.jpg", "./Continuous-temporal/mevis/mevis_18_1.jpg", "./Continuous-temporal/mevis/mevis_18_2.jpg", "./Continuous-temporal/mevis/mevis_18_3.jpg", "./Continuous-temporal/mevis/mevis_18_4.jpg", "./Continuous-temporal/mevis/mevis_18_5.jpg", "./Continuous-temporal/mevis/mevis_18_6.jpg", "./Continuous-temporal/mevis/mevis_18_7.jpg", "./Continuous-temporal/mevis/mevis_18_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.899, 0.384, 1.0, 0.721]\nB: [0.865, 0.253, 0.966, 0.59]\nC: [0.865, 0.253, 0.982, 0.587]\nD: [0.815, 0.165, 0.916, 0.502]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Girl riding bicycle in a circle", "context": "Select from the following choices.\nA: [0.899, 0.384, 1.0, 0.721]\nB: [0.865, 0.253, 0.966, 0.59]\nC: [0.865, 0.253, 0.982, 0.587]\nD: [0.815, 0.165, 0.916, 0.502]", "input_image_path": ["./Continuous-temporal/mevis/mevis_19_0.jpg", "./Continuous-temporal/mevis/mevis_19_1.jpg", "./Continuous-temporal/mevis/mevis_19_2.jpg", "./Continuous-temporal/mevis/mevis_19_3.jpg", "./Continuous-temporal/mevis/mevis_19_4.jpg", "./Continuous-temporal/mevis/mevis_19_5.jpg", "./Continuous-temporal/mevis/mevis_19_6.jpg", "./Continuous-temporal/mevis/mevis_19_7.jpg", "./Continuous-temporal/mevis/mevis_19_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.387, 0.396, 0.516, 0.804]\nB: [0.384, 0.324, 0.498, 0.683]\nC: [0.384, 0.324, 0.505, 0.725]\nD: [0.384, 0.324, 0.514, 0.732]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: Elephant putting its trunk on another elephant's back", "context": "Select from the following choices.\nA: [0.387, 0.396, 0.516, 0.804]\nB: [0.384, 0.324, 0.498, 0.683]\nC: [0.384, 0.324, 0.505, 0.725]\nD: [0.384, 0.324, 0.514, 0.732]", "input_image_path": ["./Continuous-temporal/mevis/mevis_20_0.jpg", "./Continuous-temporal/mevis/mevis_20_1.jpg", "./Continuous-temporal/mevis/mevis_20_2.jpg", "./Continuous-temporal/mevis/mevis_20_3.jpg", "./Continuous-temporal/mevis/mevis_20_4.jpg", "./Continuous-temporal/mevis/mevis_20_5.jpg", "./Continuous-temporal/mevis/mevis_20_6.jpg", "./Continuous-temporal/mevis/mevis_20_7.jpg", "./Continuous-temporal/mevis/mevis_20_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.42, 0.233, 0.816, 0.666]\nB: [0.494, 0.229, 0.558, 0.489]\nC: [0.758, 0.261, 0.86, 0.384]\nD: [0.494, 0.229, 0.55, 0.444]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The monkey pulling the cart", "context": "Select from the following choices.\nA: [0.42, 0.233, 0.816, 0.666]\nB: [0.494, 0.229, 0.558, 0.489]\nC: [0.758, 0.261, 0.86, 0.384]\nD: [0.494, 0.229, 0.55, 0.444]", "input_image_path": ["./Continuous-temporal/mevis/mevis_21_0.jpg", "./Continuous-temporal/mevis/mevis_21_1.jpg", "./Continuous-temporal/mevis/mevis_21_2.jpg", "./Continuous-temporal/mevis/mevis_21_3.jpg", "./Continuous-temporal/mevis/mevis_21_4.jpg", "./Continuous-temporal/mevis/mevis_21_5.jpg", "./Continuous-temporal/mevis/mevis_21_6.jpg", "./Continuous-temporal/mevis/mevis_21_7.jpg", "./Continuous-temporal/mevis/mevis_21_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.041, 0.394, 0.379, 0.856]\nB: [0.217, 0.306, 0.414, 0.613]\nC: [0.217, 0.306, 0.447, 0.583]\nD: [0.06, 0.273, 0.247, 0.523]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 636 and the height is 480.\nCAPTION: tiger walking from distance and lying down on the ground to drink water", "context": "Select from the following choices.\nA: [0.041, 0.394, 0.379, 0.856]\nB: [0.217, 0.306, 0.414, 0.613]\nC: [0.217, 0.306, 0.447, 0.583]\nD: [0.06, 0.273, 0.247, 0.523]", "input_image_path": ["./Continuous-temporal/mevis/mevis_22_0.jpg", "./Continuous-temporal/mevis/mevis_22_1.jpg", "./Continuous-temporal/mevis/mevis_22_2.jpg", "./Continuous-temporal/mevis/mevis_22_3.jpg", "./Continuous-temporal/mevis/mevis_22_4.jpg", "./Continuous-temporal/mevis/mevis_22_5.jpg", "./Continuous-temporal/mevis/mevis_22_6.jpg", "./Continuous-temporal/mevis/mevis_22_7.jpg", "./Continuous-temporal/mevis/mevis_22_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.095, 0.373, 0.885, 1.12]\nB: [0.096, 0.374, 1.0, 1.0]\nC: [0.455, 0.355, 0.526, 0.388]\nD: [0.095, 0.373, 0.999, 0.999]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The person with the lizard in their hand.", "context": "Select from the following choices.\nA: [0.095, 0.373, 0.885, 1.12]\nB: [0.096, 0.374, 1.0, 1.0]\nC: [0.455, 0.355, 0.526, 0.388]\nD: [0.095, 0.373, 0.999, 0.999]", "input_image_path": ["./Continuous-temporal/mevis/mevis_23_0.jpg", "./Continuous-temporal/mevis/mevis_23_1.jpg", "./Continuous-temporal/mevis/mevis_23_2.jpg", "./Continuous-temporal/mevis/mevis_23_3.jpg", "./Continuous-temporal/mevis/mevis_23_4.jpg", "./Continuous-temporal/mevis/mevis_23_5.jpg", "./Continuous-temporal/mevis/mevis_23_6.jpg", "./Continuous-temporal/mevis/mevis_23_7.jpg", "./Continuous-temporal/mevis/mevis_23_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.376, 0.443, 0.562, 0.608]\nB: [0.402, 0.431, 0.588, 0.596]\nC: [0.2, 0.171, 0.318, 0.419]\nD: [0.376, 0.443, 0.567, 0.621]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: eat the food on the man's hand then walk away to the food box", "context": "Select from the following choices.\nA: [0.376, 0.443, 0.562, 0.608]\nB: [0.402, 0.431, 0.588, 0.596]\nC: [0.2, 0.171, 0.318, 0.419]\nD: [0.376, 0.443, 0.567, 0.621]", "input_image_path": ["./Continuous-temporal/mevis/mevis_24_0.jpg", "./Continuous-temporal/mevis/mevis_24_1.jpg", "./Continuous-temporal/mevis/mevis_24_2.jpg", "./Continuous-temporal/mevis/mevis_24_3.jpg", "./Continuous-temporal/mevis/mevis_24_4.jpg", "./Continuous-temporal/mevis/mevis_24_5.jpg", "./Continuous-temporal/mevis/mevis_24_6.jpg", "./Continuous-temporal/mevis/mevis_24_7.jpg", "./Continuous-temporal/mevis/mevis_24_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.0, 0.194, 0.136, 0.298]\nB: [0.0, 0.194, 0.162, 0.31]\nC: [0.26, 0.81, 0.487, 0.87]\nD: [0.0, 0.194, 0.122, 0.303]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1080 and the height is 1920.\nCAPTION: The fish in the top most.", "context": "Select from the following choices.\nA: [0.0, 0.194, 0.136, 0.298]\nB: [0.0, 0.194, 0.162, 0.31]\nC: [0.26, 0.81, 0.487, 0.87]\nD: [0.0, 0.194, 0.122, 0.303]", "input_image_path": ["./Continuous-temporal/mevis/mevis_25_0.jpg", "./Continuous-temporal/mevis/mevis_25_1.jpg", "./Continuous-temporal/mevis/mevis_25_2.jpg", "./Continuous-temporal/mevis/mevis_25_3.jpg", "./Continuous-temporal/mevis/mevis_25_4.jpg", "./Continuous-temporal/mevis/mevis_25_5.jpg", "./Continuous-temporal/mevis/mevis_25_6.jpg", "./Continuous-temporal/mevis/mevis_25_7.jpg", "./Continuous-temporal/mevis/mevis_25_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.416, 0.166, 0.608, 0.431]\nB: [0.142, 0.332, 0.275, 0.644]\nC: [0.148, 0.279, 0.225, 0.488]\nD: [0.331, 0.507, 0.795, 0.96]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 596.\nCAPTION: The giant panda that has been sitting and eating without moving its position", "context": "Select from the following choices.\nA: [0.416, 0.166, 0.608, 0.431]\nB: [0.142, 0.332, 0.275, 0.644]\nC: [0.148, 0.279, 0.225, 0.488]\nD: [0.331, 0.507, 0.795, 0.96]", "input_image_path": ["./Continuous-temporal/mevis/mevis_26_0.jpg", "./Continuous-temporal/mevis/mevis_26_1.jpg", "./Continuous-temporal/mevis/mevis_26_2.jpg", "./Continuous-temporal/mevis/mevis_26_3.jpg", "./Continuous-temporal/mevis/mevis_26_4.jpg", "./Continuous-temporal/mevis/mevis_26_5.jpg", "./Continuous-temporal/mevis/mevis_26_6.jpg", "./Continuous-temporal/mevis/mevis_26_7.jpg", "./Continuous-temporal/mevis/mevis_26_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.0, 0.0, 0.449, 0.46]\nB: [0.0, 0.0, 0.493, 0.471]\nC: [0.0, 0.0, 0.522, 0.436]\nD: [0.065, 0.0, 0.514, 0.46]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 945.\nCAPTION: turtle swimming to the left", "context": "Select from the following choices.\nA: [0.0, 0.0, 0.449, 0.46]\nB: [0.0, 0.0, 0.493, 0.471]\nC: [0.0, 0.0, 0.522, 0.436]\nD: [0.065, 0.0, 0.514, 0.46]", "input_image_path": ["./Continuous-temporal/mevis/mevis_27_0.jpg", "./Continuous-temporal/mevis/mevis_27_1.jpg", "./Continuous-temporal/mevis/mevis_27_2.jpg", "./Continuous-temporal/mevis/mevis_27_3.jpg", "./Continuous-temporal/mevis/mevis_27_4.jpg", "./Continuous-temporal/mevis/mevis_27_5.jpg", "./Continuous-temporal/mevis/mevis_27_6.jpg", "./Continuous-temporal/mevis/mevis_27_7.jpg", "./Continuous-temporal/mevis/mevis_27_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.397, 0.443, 0.607, 0.575]\nB: [0.499, 0.579, 0.999, 0.86]\nC: [0.397, 0.443, 0.599, 0.557]\nD: [0.208, 0.079, 0.706, 0.226]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: black car move and turn left", "context": "Select from the following choices.\nA: [0.397, 0.443, 0.607, 0.575]\nB: [0.499, 0.579, 0.999, 0.86]\nC: [0.397, 0.443, 0.599, 0.557]\nD: [0.208, 0.079, 0.706, 0.226]", "input_image_path": ["./Continuous-temporal/mevis/mevis_28_0.jpg", "./Continuous-temporal/mevis/mevis_28_1.jpg", "./Continuous-temporal/mevis/mevis_28_2.jpg", "./Continuous-temporal/mevis/mevis_28_3.jpg", "./Continuous-temporal/mevis/mevis_28_4.jpg", "./Continuous-temporal/mevis/mevis_28_5.jpg", "./Continuous-temporal/mevis/mevis_28_6.jpg", "./Continuous-temporal/mevis/mevis_28_7.jpg", "./Continuous-temporal/mevis/mevis_28_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.287, 0.225, 0.303, 0.372]\nB: [0.292, 0.285, 0.309, 0.432]\nC: [0.292, 0.285, 0.31, 0.431]\nD: [0.642, 0.415, 0.974, 0.576]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: man coming out and leaning against the door", "context": "Select from the following choices.\nA: [0.287, 0.225, 0.303, 0.372]\nB: [0.292, 0.285, 0.309, 0.432]\nC: [0.292, 0.285, 0.31, 0.431]\nD: [0.642, 0.415, 0.974, 0.576]", "input_image_path": ["./Continuous-temporal/mevis/mevis_29_0.jpg", "./Continuous-temporal/mevis/mevis_29_1.jpg", "./Continuous-temporal/mevis/mevis_29_2.jpg", "./Continuous-temporal/mevis/mevis_29_3.jpg", "./Continuous-temporal/mevis/mevis_29_4.jpg", "./Continuous-temporal/mevis/mevis_29_5.jpg", "./Continuous-temporal/mevis/mevis_29_6.jpg", "./Continuous-temporal/mevis/mevis_29_7.jpg", "./Continuous-temporal/mevis/mevis_29_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.0, 0.27, 0.367, 0.342]\nB: [0.0, 0.27, 0.415, 0.372]\nC: [0.0, 0.27, 0.35, 0.359]\nD: [0.0, 0.27, 0.318, 0.343]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1080 and the height is 1920.\nCAPTION: most top long fish", "context": "Select from the following choices.\nA: [0.0, 0.27, 0.367, 0.342]\nB: [0.0, 0.27, 0.415, 0.372]\nC: [0.0, 0.27, 0.35, 0.359]\nD: [0.0, 0.27, 0.318, 0.343]", "input_image_path": ["./Continuous-temporal/mevis/mevis_30_0.jpg", "./Continuous-temporal/mevis/mevis_30_1.jpg", "./Continuous-temporal/mevis/mevis_30_2.jpg", "./Continuous-temporal/mevis/mevis_30_3.jpg", "./Continuous-temporal/mevis/mevis_30_4.jpg", "./Continuous-temporal/mevis/mevis_30_5.jpg", "./Continuous-temporal/mevis/mevis_30_6.jpg", "./Continuous-temporal/mevis/mevis_30_7.jpg", "./Continuous-temporal/mevis/mevis_30_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.36, 0.0, 0.58, 0.605]\nB: [0.36, 0.0, 0.627, 0.794]\nC: [0.36, 0.0, 0.575, 0.559]\nD: [0.36, 0.0, 0.588, 0.686]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: moving boy", "context": "Select from the following choices.\nA: [0.36, 0.0, 0.58, 0.605]\nB: [0.36, 0.0, 0.627, 0.794]\nC: [0.36, 0.0, 0.575, 0.559]\nD: [0.36, 0.0, 0.588, 0.686]", "input_image_path": ["./Continuous-temporal/mevis/mevis_31_0.jpg", "./Continuous-temporal/mevis/mevis_31_1.jpg", "./Continuous-temporal/mevis/mevis_31_2.jpg", "./Continuous-temporal/mevis/mevis_31_3.jpg", "./Continuous-temporal/mevis/mevis_31_4.jpg", "./Continuous-temporal/mevis/mevis_31_5.jpg", "./Continuous-temporal/mevis/mevis_31_6.jpg", "./Continuous-temporal/mevis/mevis_31_7.jpg", "./Continuous-temporal/mevis/mevis_31_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.121, 0.024, 0.819, 0.716]\nB: [0.302, 0.163, 0.999, 0.856]\nC: [0.302, 0.163, 1.105, 0.84]\nD: [0.0, 0.0, 0.697, 0.692]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1004.\nCAPTION: The turtle that descended from above and reached the pool's depths.", "context": "Select from the following choices.\nA: [0.121, 0.024, 0.819, 0.716]\nB: [0.302, 0.163, 0.999, 0.856]\nC: [0.302, 0.163, 1.105, 0.84]\nD: [0.0, 0.0, 0.697, 0.692]", "input_image_path": ["./Continuous-temporal/mevis/mevis_32_0.jpg", "./Continuous-temporal/mevis/mevis_32_1.jpg", "./Continuous-temporal/mevis/mevis_32_2.jpg", "./Continuous-temporal/mevis/mevis_32_3.jpg", "./Continuous-temporal/mevis/mevis_32_4.jpg", "./Continuous-temporal/mevis/mevis_32_5.jpg", "./Continuous-temporal/mevis/mevis_32_6.jpg", "./Continuous-temporal/mevis/mevis_32_7.jpg", "./Continuous-temporal/mevis/mevis_32_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.8, 0.246, 0.854, 0.596]\nB: [0.8, 0.246, 0.849, 0.552]\nC: [0.168, 0.408, 0.398, 0.881]\nD: [0.8, 0.246, 0.846, 0.541]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 2340 and the height is 1080.\nCAPTION: Man clapping his hands", "context": "Select from the following choices.\nA: [0.8, 0.246, 0.854, 0.596]\nB: [0.8, 0.246, 0.849, 0.552]\nC: [0.168, 0.408, 0.398, 0.881]\nD: [0.8, 0.246, 0.846, 0.541]", "input_image_path": ["./Continuous-temporal/mevis/mevis_33_0.jpg", "./Continuous-temporal/mevis/mevis_33_1.jpg", "./Continuous-temporal/mevis/mevis_33_2.jpg", "./Continuous-temporal/mevis/mevis_33_3.jpg", "./Continuous-temporal/mevis/mevis_33_4.jpg", "./Continuous-temporal/mevis/mevis_33_5.jpg", "./Continuous-temporal/mevis/mevis_33_6.jpg", "./Continuous-temporal/mevis/mevis_33_7.jpg", "./Continuous-temporal/mevis/mevis_33_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.306, 0.732, 0.552, 0.961]\nB: [0.317, 0.557, 0.562, 0.787]\nC: [0.258, 0.651, 0.504, 0.881]\nD: [0.258, 0.651, 0.507, 0.851]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: The turtle moving to the left.", "context": "Select from the following choices.\nA: [0.306, 0.732, 0.552, 0.961]\nB: [0.317, 0.557, 0.562, 0.787]\nC: [0.258, 0.651, 0.504, 0.881]\nD: [0.258, 0.651, 0.507, 0.851]", "input_image_path": ["./Continuous-temporal/mevis/mevis_34_0.jpg", "./Continuous-temporal/mevis/mevis_34_1.jpg", "./Continuous-temporal/mevis/mevis_34_2.jpg", "./Continuous-temporal/mevis/mevis_34_3.jpg", "./Continuous-temporal/mevis/mevis_34_4.jpg", "./Continuous-temporal/mevis/mevis_34_5.jpg", "./Continuous-temporal/mevis/mevis_34_6.jpg", "./Continuous-temporal/mevis/mevis_34_7.jpg", "./Continuous-temporal/mevis/mevis_34_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.706, 0.375, 0.747, 0.484]\nB: [0.703, 0.348, 0.743, 0.456]\nC: [0.706, 0.375, 0.745, 0.492]\nD: [0.68, 0.46, 0.774, 0.86]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1567 and the height is 730.\nCAPTION: The leading cow among the group.", "context": "Select from the following choices.\nA: [0.706, 0.375, 0.747, 0.484]\nB: [0.703, 0.348, 0.743, 0.456]\nC: [0.706, 0.375, 0.745, 0.492]\nD: [0.68, 0.46, 0.774, 0.86]", "input_image_path": ["./Continuous-temporal/mevis/mevis_35_0.jpg", "./Continuous-temporal/mevis/mevis_35_1.jpg", "./Continuous-temporal/mevis/mevis_35_2.jpg", "./Continuous-temporal/mevis/mevis_35_3.jpg", "./Continuous-temporal/mevis/mevis_35_4.jpg", "./Continuous-temporal/mevis/mevis_35_5.jpg", "./Continuous-temporal/mevis/mevis_35_6.jpg", "./Continuous-temporal/mevis/mevis_35_7.jpg", "./Continuous-temporal/mevis/mevis_35_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.133, 0.533, 0.149, 0.608]\nB: [0.417, 0.292, 0.566, 0.682]\nC: [0.368, 0.36, 0.517, 0.75]\nD: [0.472, 0.233, 0.621, 0.623]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: little dog running", "context": "Select from the following choices.\nA: [0.133, 0.533, 0.149, 0.608]\nB: [0.417, 0.292, 0.566, 0.682]\nC: [0.368, 0.36, 0.517, 0.75]\nD: [0.472, 0.233, 0.621, 0.623]", "input_image_path": ["./Continuous-temporal/mevis/mevis_36_0.jpg", "./Continuous-temporal/mevis/mevis_36_1.jpg", "./Continuous-temporal/mevis/mevis_36_2.jpg", "./Continuous-temporal/mevis/mevis_36_3.jpg", "./Continuous-temporal/mevis/mevis_36_4.jpg", "./Continuous-temporal/mevis/mevis_36_5.jpg", "./Continuous-temporal/mevis/mevis_36_6.jpg", "./Continuous-temporal/mevis/mevis_36_7.jpg", "./Continuous-temporal/mevis/mevis_36_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.265, 0.246, 0.601, 0.517]\nB: [0.243, 0.441, 0.282, 0.498]\nC: [0.255, 0.417, 0.289, 0.471]\nD: [0.243, 0.441, 0.276, 0.494]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: truck park", "context": "Select from the following choices.\nA: [0.265, 0.246, 0.601, 0.517]\nB: [0.243, 0.441, 0.282, 0.498]\nC: [0.255, 0.417, 0.289, 0.471]\nD: [0.243, 0.441, 0.276, 0.494]", "input_image_path": ["./Continuous-temporal/mevis/mevis_37_0.jpg", "./Continuous-temporal/mevis/mevis_37_1.jpg", "./Continuous-temporal/mevis/mevis_37_2.jpg", "./Continuous-temporal/mevis/mevis_37_3.jpg", "./Continuous-temporal/mevis/mevis_37_4.jpg", "./Continuous-temporal/mevis/mevis_37_5.jpg", "./Continuous-temporal/mevis/mevis_37_6.jpg", "./Continuous-temporal/mevis/mevis_37_7.jpg", "./Continuous-temporal/mevis/mevis_37_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.501, 0.24, 0.833, 0.356]\nB: [0.408, 0.508, 0.505, 0.782]\nC: [0.408, 0.508, 0.521, 0.744]\nD: [0.395, 0.518, 0.492, 0.792]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: monkey sit still", "context": "Select from the following choices.\nA: [0.501, 0.24, 0.833, 0.356]\nB: [0.408, 0.508, 0.505, 0.782]\nC: [0.408, 0.508, 0.521, 0.744]\nD: [0.395, 0.518, 0.492, 0.792]", "input_image_path": ["./Continuous-temporal/mevis/mevis_38_0.jpg", "./Continuous-temporal/mevis/mevis_38_1.jpg", "./Continuous-temporal/mevis/mevis_38_2.jpg", "./Continuous-temporal/mevis/mevis_38_3.jpg", "./Continuous-temporal/mevis/mevis_38_4.jpg", "./Continuous-temporal/mevis/mevis_38_5.jpg", "./Continuous-temporal/mevis/mevis_38_6.jpg", "./Continuous-temporal/mevis/mevis_38_7.jpg", "./Continuous-temporal/mevis/mevis_38_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.352, 0.431, 0.601, 0.832]\nB: [0.435, 0.428, 0.684, 0.83]\nC: [0.532, 0.471, 0.78, 0.873]\nD: [0.435, 0.428, 0.724, 0.907]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1460 and the height is 864.\nCAPTION: The panda standing up", "context": "Select from the following choices.\nA: [0.352, 0.431, 0.601, 0.832]\nB: [0.435, 0.428, 0.684, 0.83]\nC: [0.532, 0.471, 0.78, 0.873]\nD: [0.435, 0.428, 0.724, 0.907]", "input_image_path": ["./Continuous-temporal/mevis/mevis_39_0.jpg", "./Continuous-temporal/mevis/mevis_39_1.jpg", "./Continuous-temporal/mevis/mevis_39_2.jpg", "./Continuous-temporal/mevis/mevis_39_3.jpg", "./Continuous-temporal/mevis/mevis_39_4.jpg", "./Continuous-temporal/mevis/mevis_39_5.jpg", "./Continuous-temporal/mevis/mevis_39_6.jpg", "./Continuous-temporal/mevis/mevis_39_7.jpg", "./Continuous-temporal/mevis/mevis_39_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.626, 0.397, 0.948, 0.649]\nB: [0.191, 0.181, 0.532, 0.751]\nC: [0.191, 0.181, 0.465, 0.806]\nD: [0.608, 0.556, 0.637, 0.939]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: yellow truck move forward", "context": "Select from the following choices.\nA: [0.626, 0.397, 0.948, 0.649]\nB: [0.191, 0.181, 0.532, 0.751]\nC: [0.191, 0.181, 0.465, 0.806]\nD: [0.608, 0.556, 0.637, 0.939]", "input_image_path": ["./Continuous-temporal/mevis/mevis_40_0.jpg", "./Continuous-temporal/mevis/mevis_40_1.jpg", "./Continuous-temporal/mevis/mevis_40_2.jpg", "./Continuous-temporal/mevis/mevis_40_3.jpg", "./Continuous-temporal/mevis/mevis_40_4.jpg", "./Continuous-temporal/mevis/mevis_40_5.jpg", "./Continuous-temporal/mevis/mevis_40_6.jpg", "./Continuous-temporal/mevis/mevis_40_7.jpg", "./Continuous-temporal/mevis/mevis_40_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.0, 0.191, 0.63, 0.999]\nB: [0.0, 0.192, 0.63, 1.0]\nC: [0.248, 0.0, 0.878, 0.808]\nD: [0.353, 0.572, 0.852, 0.749]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: A black horse always facing the right", "context": "Select from the following choices.\nA: [0.0, 0.191, 0.63, 0.999]\nB: [0.0, 0.192, 0.63, 1.0]\nC: [0.248, 0.0, 0.878, 0.808]\nD: [0.353, 0.572, 0.852, 0.749]", "input_image_path": ["./Continuous-temporal/mevis/mevis_41_0.jpg", "./Continuous-temporal/mevis/mevis_41_1.jpg", "./Continuous-temporal/mevis/mevis_41_2.jpg", "./Continuous-temporal/mevis/mevis_41_3.jpg", "./Continuous-temporal/mevis/mevis_41_4.jpg", "./Continuous-temporal/mevis/mevis_41_5.jpg", "./Continuous-temporal/mevis/mevis_41_6.jpg", "./Continuous-temporal/mevis/mevis_41_7.jpg", "./Continuous-temporal/mevis/mevis_41_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.377, 0.561, 0.715, 0.654]\nB: [0.125, 0.545, 0.651, 1.0]\nC: [0.13, 0.544, 0.655, 0.999]\nD: [0.158, 0.519, 0.683, 0.974]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: rabbit jumping over another rabbit", "context": "Select from the following choices.\nA: [0.377, 0.561, 0.715, 0.654]\nB: [0.125, 0.545, 0.651, 1.0]\nC: [0.13, 0.544, 0.655, 0.999]\nD: [0.158, 0.519, 0.683, 0.974]", "input_image_path": ["./Continuous-temporal/mevis/mevis_42_0.jpg", "./Continuous-temporal/mevis/mevis_42_1.jpg", "./Continuous-temporal/mevis/mevis_42_2.jpg", "./Continuous-temporal/mevis/mevis_42_3.jpg", "./Continuous-temporal/mevis/mevis_42_4.jpg", "./Continuous-temporal/mevis/mevis_42_5.jpg", "./Continuous-temporal/mevis/mevis_42_6.jpg", "./Continuous-temporal/mevis/mevis_42_7.jpg", "./Continuous-temporal/mevis/mevis_42_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.366, 0.147, 0.625, 0.316]\nB: [0.316, 0.168, 0.999, 0.911]\nC: [0.316, 0.168, 1.068, 1.012]\nD: [0.316, 0.168, 0.984, 0.77]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1004.\nCAPTION: The sea turtle that swam down from the surface and reached the bottom of the pool.", "context": "Select from the following choices.\nA: [0.366, 0.147, 0.625, 0.316]\nB: [0.316, 0.168, 0.999, 0.911]\nC: [0.316, 0.168, 1.068, 1.012]\nD: [0.316, 0.168, 0.984, 0.77]", "input_image_path": ["./Continuous-temporal/mevis/mevis_43_0.jpg", "./Continuous-temporal/mevis/mevis_43_1.jpg", "./Continuous-temporal/mevis/mevis_43_2.jpg", "./Continuous-temporal/mevis/mevis_43_3.jpg", "./Continuous-temporal/mevis/mevis_43_4.jpg", "./Continuous-temporal/mevis/mevis_43_5.jpg", "./Continuous-temporal/mevis/mevis_43_6.jpg", "./Continuous-temporal/mevis/mevis_43_7.jpg", "./Continuous-temporal/mevis/mevis_43_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.245, 0.485, 0.546, 0.903]\nB: [0.186, 0.281, 0.36, 0.562]\nC: [0.378, 0.582, 0.679, 1.0]\nD: [0.447, 0.47, 0.62, 0.658]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The first lizard to be taken and gripped by hand.", "context": "Select from the following choices.\nA: [0.245, 0.485, 0.546, 0.903]\nB: [0.186, 0.281, 0.36, 0.562]\nC: [0.378, 0.582, 0.679, 1.0]\nD: [0.447, 0.47, 0.62, 0.658]", "input_image_path": ["./Continuous-temporal/mevis/mevis_44_0.jpg", "./Continuous-temporal/mevis/mevis_44_1.jpg", "./Continuous-temporal/mevis/mevis_44_2.jpg", "./Continuous-temporal/mevis/mevis_44_3.jpg", "./Continuous-temporal/mevis/mevis_44_4.jpg", "./Continuous-temporal/mevis/mevis_44_5.jpg", "./Continuous-temporal/mevis/mevis_44_6.jpg", "./Continuous-temporal/mevis/mevis_44_7.jpg", "./Continuous-temporal/mevis/mevis_44_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.348, 0.596, 0.573, 0.807]\nB: [0.314, 0.491, 0.537, 0.721]\nC: [0.348, 0.596, 0.572, 0.825]\nD: [0.41, 0.34, 0.572, 0.592]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: Turtle turning without changing position", "context": "Select from the following choices.\nA: [0.348, 0.596, 0.573, 0.807]\nB: [0.314, 0.491, 0.537, 0.721]\nC: [0.348, 0.596, 0.572, 0.825]\nD: [0.41, 0.34, 0.572, 0.592]", "input_image_path": ["./Continuous-temporal/mevis/mevis_45_0.jpg", "./Continuous-temporal/mevis/mevis_45_1.jpg", "./Continuous-temporal/mevis/mevis_45_2.jpg", "./Continuous-temporal/mevis/mevis_45_3.jpg", "./Continuous-temporal/mevis/mevis_45_4.jpg", "./Continuous-temporal/mevis/mevis_45_5.jpg", "./Continuous-temporal/mevis/mevis_45_6.jpg", "./Continuous-temporal/mevis/mevis_45_7.jpg", "./Continuous-temporal/mevis/mevis_45_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.059, 0.252, 0.199, 0.493]\nB: [0.287, 0.193, 0.524, 0.522]\nC: [0.344, 0.539, 0.829, 0.673]\nD: [0.287, 0.193, 0.568, 0.468]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 596.\nCAPTION: panda sit and eat without any moving", "context": "Select from the following choices.\nA: [0.059, 0.252, 0.199, 0.493]\nB: [0.287, 0.193, 0.524, 0.522]\nC: [0.344, 0.539, 0.829, 0.673]\nD: [0.287, 0.193, 0.568, 0.468]", "input_image_path": ["./Continuous-temporal/mevis/mevis_46_0.jpg", "./Continuous-temporal/mevis/mevis_46_1.jpg", "./Continuous-temporal/mevis/mevis_46_2.jpg", "./Continuous-temporal/mevis/mevis_46_3.jpg", "./Continuous-temporal/mevis/mevis_46_4.jpg", "./Continuous-temporal/mevis/mevis_46_5.jpg", "./Continuous-temporal/mevis/mevis_46_6.jpg", "./Continuous-temporal/mevis/mevis_46_7.jpg", "./Continuous-temporal/mevis/mevis_46_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.76, 0.422, 0.815, 0.712]\nB: [0.76, 0.422, 0.813, 0.716]\nC: [0.764, 0.456, 0.818, 0.746]\nD: [0.76, 0.422, 0.81, 0.669]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 2340 and the height is 1080.\nCAPTION: The bear on the right hula hooping with its neck", "context": "Select from the following choices.\nA: [0.76, 0.422, 0.815, 0.712]\nB: [0.76, 0.422, 0.813, 0.716]\nC: [0.764, 0.456, 0.818, 0.746]\nD: [0.76, 0.422, 0.81, 0.669]", "input_image_path": ["./Continuous-temporal/mevis/mevis_47_0.jpg", "./Continuous-temporal/mevis/mevis_47_1.jpg", "./Continuous-temporal/mevis/mevis_47_2.jpg", "./Continuous-temporal/mevis/mevis_47_3.jpg", "./Continuous-temporal/mevis/mevis_47_4.jpg", "./Continuous-temporal/mevis/mevis_47_5.jpg", "./Continuous-temporal/mevis/mevis_47_6.jpg", "./Continuous-temporal/mevis/mevis_47_7.jpg", "./Continuous-temporal/mevis/mevis_47_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.567, 0.246, 0.699, 0.561]\nB: [0.567, 0.246, 0.697, 0.532]\nC: [0.567, 0.246, 0.723, 0.553]\nD: [0.567, 0.246, 0.678, 0.553]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: black dog play with the other dog", "context": "Select from the following choices.\nA: [0.567, 0.246, 0.699, 0.561]\nB: [0.567, 0.246, 0.697, 0.532]\nC: [0.567, 0.246, 0.723, 0.553]\nD: [0.567, 0.246, 0.678, 0.553]", "input_image_path": ["./Continuous-temporal/mevis/mevis_48_0.jpg", "./Continuous-temporal/mevis/mevis_48_1.jpg", "./Continuous-temporal/mevis/mevis_48_2.jpg", "./Continuous-temporal/mevis/mevis_48_3.jpg", "./Continuous-temporal/mevis/mevis_48_4.jpg", "./Continuous-temporal/mevis/mevis_48_5.jpg", "./Continuous-temporal/mevis/mevis_48_6.jpg", "./Continuous-temporal/mevis/mevis_48_7.jpg", "./Continuous-temporal/mevis/mevis_48_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.024, 0.223, 0.053, 0.419]\nB: [0.014, 0.286, 0.119, 0.484]\nC: [0.433, 0.522, 0.598, 0.968]\nD: [0.044, 0.236, 0.148, 0.434]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Fish swimming to the left then right", "context": "Select from the following choices.\nA: [0.024, 0.223, 0.053, 0.419]\nB: [0.014, 0.286, 0.119, 0.484]\nC: [0.433, 0.522, 0.598, 0.968]\nD: [0.044, 0.236, 0.148, 0.434]", "input_image_path": ["./Continuous-temporal/mevis/mevis_49_0.jpg", "./Continuous-temporal/mevis/mevis_49_1.jpg", "./Continuous-temporal/mevis/mevis_49_2.jpg", "./Continuous-temporal/mevis/mevis_49_3.jpg", "./Continuous-temporal/mevis/mevis_49_4.jpg", "./Continuous-temporal/mevis/mevis_49_5.jpg", "./Continuous-temporal/mevis/mevis_49_6.jpg", "./Continuous-temporal/mevis/mevis_49_7.jpg", "./Continuous-temporal/mevis/mevis_49_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.362, 0.256, 0.483, 0.664]\nB: [0.301, 0.582, 0.714, 0.651]\nC: [0.417, 0.501, 0.538, 0.91]\nD: [0.389, 0.314, 0.51, 0.722]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: elephant walking behind and putting its trunk on others ", "context": "Select from the following choices.\nA: [0.362, 0.256, 0.483, 0.664]\nB: [0.301, 0.582, 0.714, 0.651]\nC: [0.417, 0.501, 0.538, 0.91]\nD: [0.389, 0.314, 0.51, 0.722]", "input_image_path": ["./Continuous-temporal/mevis/mevis_50_0.jpg", "./Continuous-temporal/mevis/mevis_50_1.jpg", "./Continuous-temporal/mevis/mevis_50_2.jpg", "./Continuous-temporal/mevis/mevis_50_3.jpg", "./Continuous-temporal/mevis/mevis_50_4.jpg", "./Continuous-temporal/mevis/mevis_50_5.jpg", "./Continuous-temporal/mevis/mevis_50_6.jpg", "./Continuous-temporal/mevis/mevis_50_7.jpg", "./Continuous-temporal/mevis/mevis_50_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.553, 0.399, 0.586, 0.565]\nB: [0.553, 0.399, 0.588, 0.567]\nC: [0.553, 0.399, 0.582, 0.542]\nD: [0.553, 0.399, 0.588, 0.559]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: man standing near bicycles", "context": "Select from the following choices.\nA: [0.553, 0.399, 0.586, 0.565]\nB: [0.553, 0.399, 0.588, 0.567]\nC: [0.553, 0.399, 0.582, 0.542]\nD: [0.553, 0.399, 0.588, 0.559]", "input_image_path": ["./Continuous-temporal/mevis/mevis_51_0.jpg", "./Continuous-temporal/mevis/mevis_51_1.jpg", "./Continuous-temporal/mevis/mevis_51_2.jpg", "./Continuous-temporal/mevis/mevis_51_3.jpg", "./Continuous-temporal/mevis/mevis_51_4.jpg", "./Continuous-temporal/mevis/mevis_51_5.jpg", "./Continuous-temporal/mevis/mevis_51_6.jpg", "./Continuous-temporal/mevis/mevis_51_7.jpg", "./Continuous-temporal/mevis/mevis_51_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.461, 0.381, 0.531, 0.663]\nB: [0.449, 0.417, 0.527, 0.714]\nC: [0.461, 0.381, 0.539, 0.679]\nD: [0.395, 0.484, 0.838, 0.81]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 512 and the height is 252.\nCAPTION: person pull a horse", "context": "Select from the following choices.\nA: [0.461, 0.381, 0.531, 0.663]\nB: [0.449, 0.417, 0.527, 0.714]\nC: [0.461, 0.381, 0.539, 0.679]\nD: [0.395, 0.484, 0.838, 0.81]", "input_image_path": ["./Continuous-temporal/mevis/mevis_52_0.jpg", "./Continuous-temporal/mevis/mevis_52_1.jpg", "./Continuous-temporal/mevis/mevis_52_2.jpg", "./Continuous-temporal/mevis/mevis_52_3.jpg", "./Continuous-temporal/mevis/mevis_52_4.jpg", "./Continuous-temporal/mevis/mevis_52_5.jpg", "./Continuous-temporal/mevis/mevis_52_6.jpg", "./Continuous-temporal/mevis/mevis_52_7.jpg", "./Continuous-temporal/mevis/mevis_52_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.683, 0.564, 0.694, 0.605]\nB: [0.688, 0.575, 0.7, 0.612]\nC: [0.683, 0.564, 0.696, 0.601]\nD: [0.683, 0.564, 0.697, 0.607]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: The rider on the horse running beside the white railing.", "context": "Select from the following choices.\nA: [0.683, 0.564, 0.694, 0.605]\nB: [0.688, 0.575, 0.7, 0.612]\nC: [0.683, 0.564, 0.696, 0.601]\nD: [0.683, 0.564, 0.697, 0.607]", "input_image_path": ["./Continuous-temporal/mevis/mevis_53_0.jpg", "./Continuous-temporal/mevis/mevis_53_1.jpg", "./Continuous-temporal/mevis/mevis_53_2.jpg", "./Continuous-temporal/mevis/mevis_53_3.jpg", "./Continuous-temporal/mevis/mevis_53_4.jpg", "./Continuous-temporal/mevis/mevis_53_5.jpg", "./Continuous-temporal/mevis/mevis_53_6.jpg", "./Continuous-temporal/mevis/mevis_53_7.jpg", "./Continuous-temporal/mevis/mevis_53_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.588, 0.919, 0.692, 0.999]\nB: [0.588, 0.919, 0.673, 1.015]\nC: [0.573, 0.92, 0.677, 1.0]\nD: [0.346, 0.162, 0.584, 0.436]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: The rabbit that is having food in the lower right corner.", "context": "Select from the following choices.\nA: [0.588, 0.919, 0.692, 0.999]\nB: [0.588, 0.919, 0.673, 1.015]\nC: [0.573, 0.92, 0.677, 1.0]\nD: [0.346, 0.162, 0.584, 0.436]", "input_image_path": ["./Continuous-temporal/mevis/mevis_54_0.jpg", "./Continuous-temporal/mevis/mevis_54_1.jpg", "./Continuous-temporal/mevis/mevis_54_2.jpg", "./Continuous-temporal/mevis/mevis_54_3.jpg", "./Continuous-temporal/mevis/mevis_54_4.jpg", "./Continuous-temporal/mevis/mevis_54_5.jpg", "./Continuous-temporal/mevis/mevis_54_6.jpg", "./Continuous-temporal/mevis/mevis_54_7.jpg", "./Continuous-temporal/mevis/mevis_54_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.148, 0.223, 0.467, 0.894]\nB: [0.743, 0.431, 0.787, 0.731]\nC: [0.148, 0.223, 0.43, 0.912]\nD: [0.002, 0.119, 0.32, 0.79]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Kitten looking around without moving position", "context": "Select from the following choices.\nA: [0.148, 0.223, 0.467, 0.894]\nB: [0.743, 0.431, 0.787, 0.731]\nC: [0.148, 0.223, 0.43, 0.912]\nD: [0.002, 0.119, 0.32, 0.79]", "input_image_path": ["./Continuous-temporal/mevis/mevis_55_0.jpg", "./Continuous-temporal/mevis/mevis_55_1.jpg", "./Continuous-temporal/mevis/mevis_55_2.jpg", "./Continuous-temporal/mevis/mevis_55_3.jpg", "./Continuous-temporal/mevis/mevis_55_4.jpg", "./Continuous-temporal/mevis/mevis_55_5.jpg", "./Continuous-temporal/mevis/mevis_55_6.jpg", "./Continuous-temporal/mevis/mevis_55_7.jpg", "./Continuous-temporal/mevis/mevis_55_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.317, 0.46, 0.443, 0.741]\nB: [0.34, 0.559, 0.466, 0.84]\nC: [0.317, 0.46, 0.456, 0.772]\nD: [0.059, 0.752, 0.133, 0.831]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: bike move around", "context": "Select from the following choices.\nA: [0.317, 0.46, 0.443, 0.741]\nB: [0.34, 0.559, 0.466, 0.84]\nC: [0.317, 0.46, 0.456, 0.772]\nD: [0.059, 0.752, 0.133, 0.831]", "input_image_path": ["./Continuous-temporal/mevis/mevis_56_0.jpg", "./Continuous-temporal/mevis/mevis_56_1.jpg", "./Continuous-temporal/mevis/mevis_56_2.jpg", "./Continuous-temporal/mevis/mevis_56_3.jpg", "./Continuous-temporal/mevis/mevis_56_4.jpg", "./Continuous-temporal/mevis/mevis_56_5.jpg", "./Continuous-temporal/mevis/mevis_56_6.jpg", "./Continuous-temporal/mevis/mevis_56_7.jpg", "./Continuous-temporal/mevis/mevis_56_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.424, 0.525, 0.676, 0.927]\nB: [0.424, 0.525, 0.701, 1.007]\nC: [0.424, 0.525, 0.631, 0.91]\nD: [0.166, 0.488, 0.56, 0.691]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1460 and the height is 864.\nCAPTION: The panda lying down and eating", "context": "Select from the following choices.\nA: [0.424, 0.525, 0.676, 0.927]\nB: [0.424, 0.525, 0.701, 1.007]\nC: [0.424, 0.525, 0.631, 0.91]\nD: [0.166, 0.488, 0.56, 0.691]", "input_image_path": ["./Continuous-temporal/mevis/mevis_57_0.jpg", "./Continuous-temporal/mevis/mevis_57_1.jpg", "./Continuous-temporal/mevis/mevis_57_2.jpg", "./Continuous-temporal/mevis/mevis_57_3.jpg", "./Continuous-temporal/mevis/mevis_57_4.jpg", "./Continuous-temporal/mevis/mevis_57_5.jpg", "./Continuous-temporal/mevis/mevis_57_6.jpg", "./Continuous-temporal/mevis/mevis_57_7.jpg", "./Continuous-temporal/mevis/mevis_57_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.245, 0.404, 0.611, 0.725]\nB: [0.167, 0.029, 0.418, 0.199]\nC: [0.378, 0.483, 0.745, 0.804]\nD: [0.378, 0.483, 0.699, 0.8]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: the elephant that was attacked", "context": "Select from the following choices.\nA: [0.245, 0.404, 0.611, 0.725]\nB: [0.167, 0.029, 0.418, 0.199]\nC: [0.378, 0.483, 0.745, 0.804]\nD: [0.378, 0.483, 0.699, 0.8]", "input_image_path": ["./Continuous-temporal/mevis/mevis_58_0.jpg", "./Continuous-temporal/mevis/mevis_58_1.jpg", "./Continuous-temporal/mevis/mevis_58_2.jpg", "./Continuous-temporal/mevis/mevis_58_3.jpg", "./Continuous-temporal/mevis/mevis_58_4.jpg", "./Continuous-temporal/mevis/mevis_58_5.jpg", "./Continuous-temporal/mevis/mevis_58_6.jpg", "./Continuous-temporal/mevis/mevis_58_7.jpg", "./Continuous-temporal/mevis/mevis_58_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.559, 0.398, 0.82, 1.094]\nB: [0.559, 0.398, 0.867, 0.998]\nC: [0.559, 0.398, 0.923, 0.881]\nD: [0.559, 0.398, 0.924, 0.884]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 596.\nCAPTION: Panda sitting down to eat then moving to right to eat again", "context": "Select from the following choices.\nA: [0.559, 0.398, 0.82, 1.094]\nB: [0.559, 0.398, 0.867, 0.998]\nC: [0.559, 0.398, 0.923, 0.881]\nD: [0.559, 0.398, 0.924, 0.884]", "input_image_path": ["./Continuous-temporal/mevis/mevis_59_0.jpg", "./Continuous-temporal/mevis/mevis_59_1.jpg", "./Continuous-temporal/mevis/mevis_59_2.jpg", "./Continuous-temporal/mevis/mevis_59_3.jpg", "./Continuous-temporal/mevis/mevis_59_4.jpg", "./Continuous-temporal/mevis/mevis_59_5.jpg", "./Continuous-temporal/mevis/mevis_59_6.jpg", "./Continuous-temporal/mevis/mevis_59_7.jpg", "./Continuous-temporal/mevis/mevis_59_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.593, 0.415, 0.788, 0.656]\nB: [0.65, 0.351, 0.891, 0.636]\nC: [0.593, 0.415, 0.833, 0.7]\nD: [0.698, 0.435, 0.939, 0.72]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Sheep with the black head facing down to eat then walking forward", "context": "Select from the following choices.\nA: [0.593, 0.415, 0.788, 0.656]\nB: [0.65, 0.351, 0.891, 0.636]\nC: [0.593, 0.415, 0.833, 0.7]\nD: [0.698, 0.435, 0.939, 0.72]", "input_image_path": ["./Continuous-temporal/mevis/mevis_60_0.jpg", "./Continuous-temporal/mevis/mevis_60_1.jpg", "./Continuous-temporal/mevis/mevis_60_2.jpg", "./Continuous-temporal/mevis/mevis_60_3.jpg", "./Continuous-temporal/mevis/mevis_60_4.jpg", "./Continuous-temporal/mevis/mevis_60_5.jpg", "./Continuous-temporal/mevis/mevis_60_6.jpg", "./Continuous-temporal/mevis/mevis_60_7.jpg", "./Continuous-temporal/mevis/mevis_60_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.648, 0.0, 1.052, 0.953]\nB: [0.648, 0.0, 0.999, 0.929]\nC: [0.559, 0.708, 0.731, 0.883]\nD: [0.648, 0.0, 0.965, 0.952]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: cat stand and climb at right", "context": "Select from the following choices.\nA: [0.648, 0.0, 1.052, 0.953]\nB: [0.648, 0.0, 0.999, 0.929]\nC: [0.559, 0.708, 0.731, 0.883]\nD: [0.648, 0.0, 0.965, 0.952]", "input_image_path": ["./Continuous-temporal/mevis/mevis_61_0.jpg", "./Continuous-temporal/mevis/mevis_61_1.jpg", "./Continuous-temporal/mevis/mevis_61_2.jpg", "./Continuous-temporal/mevis/mevis_61_3.jpg", "./Continuous-temporal/mevis/mevis_61_4.jpg", "./Continuous-temporal/mevis/mevis_61_5.jpg", "./Continuous-temporal/mevis/mevis_61_6.jpg", "./Continuous-temporal/mevis/mevis_61_7.jpg", "./Continuous-temporal/mevis/mevis_61_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.474, 0.155, 0.787, 0.23]\nB: [0.528, 0.295, 0.707, 0.66]\nC: [0.653, 0.775, 0.872, 0.809]\nD: [0.528, 0.295, 0.729, 0.73]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The last lizard to be taken and held in hand.", "context": "Select from the following choices.\nA: [0.474, 0.155, 0.787, 0.23]\nB: [0.528, 0.295, 0.707, 0.66]\nC: [0.653, 0.775, 0.872, 0.809]\nD: [0.528, 0.295, 0.729, 0.73]", "input_image_path": ["./Continuous-temporal/mevis/mevis_62_0.jpg", "./Continuous-temporal/mevis/mevis_62_1.jpg", "./Continuous-temporal/mevis/mevis_62_2.jpg", "./Continuous-temporal/mevis/mevis_62_3.jpg", "./Continuous-temporal/mevis/mevis_62_4.jpg", "./Continuous-temporal/mevis/mevis_62_5.jpg", "./Continuous-temporal/mevis/mevis_62_6.jpg", "./Continuous-temporal/mevis/mevis_62_7.jpg", "./Continuous-temporal/mevis/mevis_62_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.0, 0.0, 0.155, 0.999]\nB: [0.058, 0.001, 0.214, 1.0]\nC: [0.0, 0.0, 0.127, 1.029]\nD: [0.01, 0.001, 0.165, 1.0]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 945.\nCAPTION: person standing behind little girl feeding rabbit", "context": "Select from the following choices.\nA: [0.0, 0.0, 0.155, 0.999]\nB: [0.058, 0.001, 0.214, 1.0]\nC: [0.0, 0.0, 0.127, 1.029]\nD: [0.01, 0.001, 0.165, 1.0]", "input_image_path": ["./Continuous-temporal/mevis/mevis_63_0.jpg", "./Continuous-temporal/mevis/mevis_63_1.jpg", "./Continuous-temporal/mevis/mevis_63_2.jpg", "./Continuous-temporal/mevis/mevis_63_3.jpg", "./Continuous-temporal/mevis/mevis_63_4.jpg", "./Continuous-temporal/mevis/mevis_63_5.jpg", "./Continuous-temporal/mevis/mevis_63_6.jpg", "./Continuous-temporal/mevis/mevis_63_7.jpg", "./Continuous-temporal/mevis/mevis_63_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.399, 0.593, 0.583, 0.76]\nB: [0.399, 0.593, 0.573, 0.762]\nC: [0.399, 0.593, 0.547, 0.764]\nD: [0.398, 0.662, 0.572, 0.832]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1080 and the height is 1152.\nCAPTION: Bear walking forward and jumping over a barricade", "context": "Select from the following choices.\nA: [0.399, 0.593, 0.583, 0.76]\nB: [0.399, 0.593, 0.573, 0.762]\nC: [0.399, 0.593, 0.547, 0.764]\nD: [0.398, 0.662, 0.572, 0.832]", "input_image_path": ["./Continuous-temporal/mevis/mevis_64_0.jpg", "./Continuous-temporal/mevis/mevis_64_1.jpg", "./Continuous-temporal/mevis/mevis_64_2.jpg", "./Continuous-temporal/mevis/mevis_64_3.jpg", "./Continuous-temporal/mevis/mevis_64_4.jpg", "./Continuous-temporal/mevis/mevis_64_5.jpg", "./Continuous-temporal/mevis/mevis_64_6.jpg", "./Continuous-temporal/mevis/mevis_64_7.jpg", "./Continuous-temporal/mevis/mevis_64_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.098, 0.372, 0.469, 0.908]\nB: [0.0, 0.428, 0.323, 1.0]\nC: [0.098, 0.372, 0.421, 0.944]\nD: [0.098, 0.372, 0.402, 1.019]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: rabbit eating and walking", "context": "Select from the following choices.\nA: [0.098, 0.372, 0.469, 0.908]\nB: [0.0, 0.428, 0.323, 1.0]\nC: [0.098, 0.372, 0.421, 0.944]\nD: [0.098, 0.372, 0.402, 1.019]", "input_image_path": ["./Continuous-temporal/mevis/mevis_65_0.jpg", "./Continuous-temporal/mevis/mevis_65_1.jpg", "./Continuous-temporal/mevis/mevis_65_2.jpg", "./Continuous-temporal/mevis/mevis_65_3.jpg", "./Continuous-temporal/mevis/mevis_65_4.jpg", "./Continuous-temporal/mevis/mevis_65_5.jpg", "./Continuous-temporal/mevis/mevis_65_6.jpg", "./Continuous-temporal/mevis/mevis_65_7.jpg", "./Continuous-temporal/mevis/mevis_65_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.434, 0.53, 0.716, 0.826]\nB: [0.419, 0.025, 0.783, 0.21]\nC: [0.45, 0.638, 0.695, 1.0]\nD: [0.434, 0.53, 0.68, 0.892]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 362.\nCAPTION: The final cow that stepped forward.", "context": "Select from the following choices.\nA: [0.434, 0.53, 0.716, 0.826]\nB: [0.419, 0.025, 0.783, 0.21]\nC: [0.45, 0.638, 0.695, 1.0]\nD: [0.434, 0.53, 0.68, 0.892]", "input_image_path": ["./Continuous-temporal/mevis/mevis_66_0.jpg", "./Continuous-temporal/mevis/mevis_66_1.jpg", "./Continuous-temporal/mevis/mevis_66_2.jpg", "./Continuous-temporal/mevis/mevis_66_3.jpg", "./Continuous-temporal/mevis/mevis_66_4.jpg", "./Continuous-temporal/mevis/mevis_66_5.jpg", "./Continuous-temporal/mevis/mevis_66_6.jpg", "./Continuous-temporal/mevis/mevis_66_7.jpg", "./Continuous-temporal/mevis/mevis_66_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.433, 0.508, 0.731, 0.844]\nB: [0.375, 0.594, 0.627, 1.0]\nC: [0.433, 0.508, 0.684, 0.914]\nD: [0.315, 0.626, 0.408, 0.671]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1460 and the height is 864.\nCAPTION: sit on the ground and eat then lay down and turn over", "context": "Select from the following choices.\nA: [0.433, 0.508, 0.731, 0.844]\nB: [0.375, 0.594, 0.627, 1.0]\nC: [0.433, 0.508, 0.684, 0.914]\nD: [0.315, 0.626, 0.408, 0.671]", "input_image_path": ["./Continuous-temporal/mevis/mevis_67_0.jpg", "./Continuous-temporal/mevis/mevis_67_1.jpg", "./Continuous-temporal/mevis/mevis_67_2.jpg", "./Continuous-temporal/mevis/mevis_67_3.jpg", "./Continuous-temporal/mevis/mevis_67_4.jpg", "./Continuous-temporal/mevis/mevis_67_5.jpg", "./Continuous-temporal/mevis/mevis_67_6.jpg", "./Continuous-temporal/mevis/mevis_67_7.jpg", "./Continuous-temporal/mevis/mevis_67_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.036, 0.0, 0.554, 1.066]\nB: [0.106, 0.0, 0.632, 0.999]\nC: [0.036, 0.0, 0.563, 0.999]\nD: [0.102, 0.04, 0.442, 0.333]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 945.\nCAPTION: little girl feeding rabbit", "context": "Select from the following choices.\nA: [0.036, 0.0, 0.554, 1.066]\nB: [0.106, 0.0, 0.632, 0.999]\nC: [0.036, 0.0, 0.563, 0.999]\nD: [0.102, 0.04, 0.442, 0.333]", "input_image_path": ["./Continuous-temporal/mevis/mevis_68_0.jpg", "./Continuous-temporal/mevis/mevis_68_1.jpg", "./Continuous-temporal/mevis/mevis_68_2.jpg", "./Continuous-temporal/mevis/mevis_68_3.jpg", "./Continuous-temporal/mevis/mevis_68_4.jpg", "./Continuous-temporal/mevis/mevis_68_5.jpg", "./Continuous-temporal/mevis/mevis_68_6.jpg", "./Continuous-temporal/mevis/mevis_68_7.jpg", "./Continuous-temporal/mevis/mevis_68_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.411, 0.501, 0.508, 0.771]\nB: [0.288, 0.192, 0.605, 0.634]\nC: [0.397, 0.621, 0.494, 0.892]\nD: [0.411, 0.501, 0.521, 0.733]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The monkey who has been sitting", "context": "Select from the following choices.\nA: [0.411, 0.501, 0.508, 0.771]\nB: [0.288, 0.192, 0.605, 0.634]\nC: [0.397, 0.621, 0.494, 0.892]\nD: [0.411, 0.501, 0.521, 0.733]", "input_image_path": ["./Continuous-temporal/mevis/mevis_69_0.jpg", "./Continuous-temporal/mevis/mevis_69_1.jpg", "./Continuous-temporal/mevis/mevis_69_2.jpg", "./Continuous-temporal/mevis/mevis_69_3.jpg", "./Continuous-temporal/mevis/mevis_69_4.jpg", "./Continuous-temporal/mevis/mevis_69_5.jpg", "./Continuous-temporal/mevis/mevis_69_6.jpg", "./Continuous-temporal/mevis/mevis_69_7.jpg", "./Continuous-temporal/mevis/mevis_69_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.642, 0.553, 0.662, 0.59]\nB: [0.642, 0.553, 0.66, 0.597]\nC: [0.642, 0.553, 0.66, 0.601]\nD: [0.644, 0.559, 0.662, 0.603]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: The equestrian on the horse running alongside the white fence.", "context": "Select from the following choices.\nA: [0.642, 0.553, 0.662, 0.59]\nB: [0.642, 0.553, 0.66, 0.597]\nC: [0.642, 0.553, 0.66, 0.601]\nD: [0.644, 0.559, 0.662, 0.603]", "input_image_path": ["./Continuous-temporal/mevis/mevis_70_0.jpg", "./Continuous-temporal/mevis/mevis_70_1.jpg", "./Continuous-temporal/mevis/mevis_70_2.jpg", "./Continuous-temporal/mevis/mevis_70_3.jpg", "./Continuous-temporal/mevis/mevis_70_4.jpg", "./Continuous-temporal/mevis/mevis_70_5.jpg", "./Continuous-temporal/mevis/mevis_70_6.jpg", "./Continuous-temporal/mevis/mevis_70_7.jpg", "./Continuous-temporal/mevis/mevis_70_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.0, 0.481, 0.272, 0.998]\nB: [0.0, 0.483, 0.272, 1.0]\nC: [0.502, 0.515, 0.584, 0.925]\nD: [0.0, 0.481, 0.32, 1.033]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 480.\nCAPTION: the one that was picked up by hand", "context": "Select from the following choices.\nA: [0.0, 0.481, 0.272, 0.998]\nB: [0.0, 0.483, 0.272, 1.0]\nC: [0.502, 0.515, 0.584, 0.925]\nD: [0.0, 0.481, 0.32, 1.033]", "input_image_path": ["./Continuous-temporal/mevis/mevis_71_0.jpg", "./Continuous-temporal/mevis/mevis_71_1.jpg", "./Continuous-temporal/mevis/mevis_71_2.jpg", "./Continuous-temporal/mevis/mevis_71_3.jpg", "./Continuous-temporal/mevis/mevis_71_4.jpg", "./Continuous-temporal/mevis/mevis_71_5.jpg", "./Continuous-temporal/mevis/mevis_71_6.jpg", "./Continuous-temporal/mevis/mevis_71_7.jpg", "./Continuous-temporal/mevis/mevis_71_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.176, 0.451, 0.526, 0.801]\nB: [0.176, 0.451, 0.481, 0.763]\nC: [0.176, 0.451, 0.493, 0.833]\nD: [0.24, 0.387, 0.557, 0.769]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The lizard that was captured and immediately held in hand.", "context": "Select from the following choices.\nA: [0.176, 0.451, 0.526, 0.801]\nB: [0.176, 0.451, 0.481, 0.763]\nC: [0.176, 0.451, 0.493, 0.833]\nD: [0.24, 0.387, 0.557, 0.769]", "input_image_path": ["./Continuous-temporal/mevis/mevis_72_0.jpg", "./Continuous-temporal/mevis/mevis_72_1.jpg", "./Continuous-temporal/mevis/mevis_72_2.jpg", "./Continuous-temporal/mevis/mevis_72_3.jpg", "./Continuous-temporal/mevis/mevis_72_4.jpg", "./Continuous-temporal/mevis/mevis_72_5.jpg", "./Continuous-temporal/mevis/mevis_72_6.jpg", "./Continuous-temporal/mevis/mevis_72_7.jpg", "./Continuous-temporal/mevis/mevis_72_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.152, 0.0, 1.001, 0.211]\nB: [0.67, 0.739, 0.865, 0.901]\nC: [0.152, 0.0, 0.948, 0.25]\nD: [0.152, 0.0, 0.999, 0.26]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1766 and the height is 945.\nCAPTION: move forward towards us", "context": "Select from the following choices.\nA: [0.152, 0.0, 1.001, 0.211]\nB: [0.67, 0.739, 0.865, 0.901]\nC: [0.152, 0.0, 0.948, 0.25]\nD: [0.152, 0.0, 0.999, 0.26]", "input_image_path": ["./Continuous-temporal/mevis/mevis_73_0.jpg", "./Continuous-temporal/mevis/mevis_73_1.jpg", "./Continuous-temporal/mevis/mevis_73_2.jpg", "./Continuous-temporal/mevis/mevis_73_3.jpg", "./Continuous-temporal/mevis/mevis_73_4.jpg", "./Continuous-temporal/mevis/mevis_73_5.jpg", "./Continuous-temporal/mevis/mevis_73_6.jpg", "./Continuous-temporal/mevis/mevis_73_7.jpg", "./Continuous-temporal/mevis/mevis_73_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.65, 0.397, 0.915, 0.586]\nB: [0.65, 0.397, 0.921, 0.608]\nC: [0.735, 0.314, 1.0, 0.504]\nD: [0.662, 0.298, 0.681, 0.551]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: silver turning and driving to left", "context": "Select from the following choices.\nA: [0.65, 0.397, 0.915, 0.586]\nB: [0.65, 0.397, 0.921, 0.608]\nC: [0.735, 0.314, 1.0, 0.504]\nD: [0.662, 0.298, 0.681, 0.551]", "input_image_path": ["./Continuous-temporal/mevis/mevis_74_0.jpg", "./Continuous-temporal/mevis/mevis_74_1.jpg", "./Continuous-temporal/mevis/mevis_74_2.jpg", "./Continuous-temporal/mevis/mevis_74_3.jpg", "./Continuous-temporal/mevis/mevis_74_4.jpg", "./Continuous-temporal/mevis/mevis_74_5.jpg", "./Continuous-temporal/mevis/mevis_74_6.jpg", "./Continuous-temporal/mevis/mevis_74_7.jpg", "./Continuous-temporal/mevis/mevis_74_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.942, 0.121, 1.0, 0.271]\nB: [0.306, 0.752, 0.469, 0.964]\nC: [0.941, 0.179, 0.999, 0.329]\nD: [0.941, 0.179, 1.006, 0.32]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The motionless black car parked.", "context": "Select from the following choices.\nA: [0.942, 0.121, 1.0, 0.271]\nB: [0.306, 0.752, 0.469, 0.964]\nC: [0.941, 0.179, 0.999, 0.329]\nD: [0.941, 0.179, 1.006, 0.32]", "input_image_path": ["./Continuous-temporal/mevis/mevis_75_0.jpg", "./Continuous-temporal/mevis/mevis_75_1.jpg", "./Continuous-temporal/mevis/mevis_75_2.jpg", "./Continuous-temporal/mevis/mevis_75_3.jpg", "./Continuous-temporal/mevis/mevis_75_4.jpg", "./Continuous-temporal/mevis/mevis_75_5.jpg", "./Continuous-temporal/mevis/mevis_75_6.jpg", "./Continuous-temporal/mevis/mevis_75_7.jpg", "./Continuous-temporal/mevis/mevis_75_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.111, 0.693, 0.399, 0.884]\nB: [0.233, 0.673, 0.51, 0.825]\nC: [0.233, 0.673, 0.521, 0.864]\nD: [0.479, 0.156, 0.73, 0.526]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: moving from middle to left", "context": "Select from the following choices.\nA: [0.111, 0.693, 0.399, 0.884]\nB: [0.233, 0.673, 0.51, 0.825]\nC: [0.233, 0.673, 0.521, 0.864]\nD: [0.479, 0.156, 0.73, 0.526]", "input_image_path": ["./Continuous-temporal/mevis/mevis_76_0.jpg", "./Continuous-temporal/mevis/mevis_76_1.jpg", "./Continuous-temporal/mevis/mevis_76_2.jpg", "./Continuous-temporal/mevis/mevis_76_3.jpg", "./Continuous-temporal/mevis/mevis_76_4.jpg", "./Continuous-temporal/mevis/mevis_76_5.jpg", "./Continuous-temporal/mevis/mevis_76_6.jpg", "./Continuous-temporal/mevis/mevis_76_7.jpg", "./Continuous-temporal/mevis/mevis_76_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.328, 0.296, 0.403, 0.708]\nB: [0.328, 0.296, 0.414, 0.676]\nC: [0.321, 0.452, 0.406, 0.833]\nD: [0.31, 0.485, 0.396, 0.866]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: people move around", "context": "Select from the following choices.\nA: [0.328, 0.296, 0.403, 0.708]\nB: [0.328, 0.296, 0.414, 0.676]\nC: [0.321, 0.452, 0.406, 0.833]\nD: [0.31, 0.485, 0.396, 0.866]", "input_image_path": ["./Continuous-temporal/mevis/mevis_77_0.jpg", "./Continuous-temporal/mevis/mevis_77_1.jpg", "./Continuous-temporal/mevis/mevis_77_2.jpg", "./Continuous-temporal/mevis/mevis_77_3.jpg", "./Continuous-temporal/mevis/mevis_77_4.jpg", "./Continuous-temporal/mevis/mevis_77_5.jpg", "./Continuous-temporal/mevis/mevis_77_6.jpg", "./Continuous-temporal/mevis/mevis_77_7.jpg", "./Continuous-temporal/mevis/mevis_77_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.329, 0.896, 0.467, 0.965]\nB: [0.0, 0.333, 0.02, 0.431]\nC: [0.0, 0.381, 0.02, 0.479]\nD: [0.0, 0.333, 0.023, 0.423]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 480.\nCAPTION: bird standing still on cage", "context": "Select from the following choices.\nA: [0.329, 0.896, 0.467, 0.965]\nB: [0.0, 0.333, 0.02, 0.431]\nC: [0.0, 0.381, 0.02, 0.479]\nD: [0.0, 0.333, 0.023, 0.423]", "input_image_path": ["./Continuous-temporal/mevis/mevis_78_0.jpg", "./Continuous-temporal/mevis/mevis_78_1.jpg", "./Continuous-temporal/mevis/mevis_78_2.jpg", "./Continuous-temporal/mevis/mevis_78_3.jpg", "./Continuous-temporal/mevis/mevis_78_4.jpg", "./Continuous-temporal/mevis/mevis_78_5.jpg", "./Continuous-temporal/mevis/mevis_78_6.jpg", "./Continuous-temporal/mevis/mevis_78_7.jpg", "./Continuous-temporal/mevis/mevis_78_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.418, 0.227, 0.47, 0.406]\nB: [0.396, 0.311, 0.448, 0.49]\nC: [0.414, 0.231, 0.466, 0.41]\nD: [0.414, 0.231, 0.456, 0.437]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Monkey stand on a cart, then walk around", "context": "Select from the following choices.\nA: [0.418, 0.227, 0.47, 0.406]\nB: [0.396, 0.311, 0.448, 0.49]\nC: [0.414, 0.231, 0.466, 0.41]\nD: [0.414, 0.231, 0.456, 0.437]", "input_image_path": ["./Continuous-temporal/mevis/mevis_79_0.jpg", "./Continuous-temporal/mevis/mevis_79_1.jpg", "./Continuous-temporal/mevis/mevis_79_2.jpg", "./Continuous-temporal/mevis/mevis_79_3.jpg", "./Continuous-temporal/mevis/mevis_79_4.jpg", "./Continuous-temporal/mevis/mevis_79_5.jpg", "./Continuous-temporal/mevis/mevis_79_6.jpg", "./Continuous-temporal/mevis/mevis_79_7.jpg", "./Continuous-temporal/mevis/mevis_79_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.113, 0.338, 0.473, 0.76]\nB: [0.49, 0.271, 0.523, 0.461]\nC: [0.505, 0.233, 0.538, 0.424]\nD: [0.505, 0.233, 0.544, 0.429]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: The monkey sitting crouched at the center of the hole.", "context": "Select from the following choices.\nA: [0.113, 0.338, 0.473, 0.76]\nB: [0.49, 0.271, 0.523, 0.461]\nC: [0.505, 0.233, 0.538, 0.424]\nD: [0.505, 0.233, 0.544, 0.429]", "input_image_path": ["./Continuous-temporal/mevis/mevis_80_0.jpg", "./Continuous-temporal/mevis/mevis_80_1.jpg", "./Continuous-temporal/mevis/mevis_80_2.jpg", "./Continuous-temporal/mevis/mevis_80_3.jpg", "./Continuous-temporal/mevis/mevis_80_4.jpg", "./Continuous-temporal/mevis/mevis_80_5.jpg", "./Continuous-temporal/mevis/mevis_80_6.jpg", "./Continuous-temporal/mevis/mevis_80_7.jpg", "./Continuous-temporal/mevis/mevis_80_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.407, 0.359, 0.537, 0.715]\nB: [0.407, 0.359, 0.534, 0.764]\nC: [0.407, 0.359, 0.517, 0.71]\nD: [0.407, 0.359, 0.539, 0.777]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1567 and the height is 730.\nCAPTION: the cow eating without moving position", "context": "Select from the following choices.\nA: [0.407, 0.359, 0.537, 0.715]\nB: [0.407, 0.359, 0.534, 0.764]\nC: [0.407, 0.359, 0.517, 0.71]\nD: [0.407, 0.359, 0.539, 0.777]", "input_image_path": ["./Continuous-temporal/mevis/mevis_81_0.jpg", "./Continuous-temporal/mevis/mevis_81_1.jpg", "./Continuous-temporal/mevis/mevis_81_2.jpg", "./Continuous-temporal/mevis/mevis_81_3.jpg", "./Continuous-temporal/mevis/mevis_81_4.jpg", "./Continuous-temporal/mevis/mevis_81_5.jpg", "./Continuous-temporal/mevis/mevis_81_6.jpg", "./Continuous-temporal/mevis/mevis_81_7.jpg", "./Continuous-temporal/mevis/mevis_81_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.809, 0.0, 0.998, 0.577]\nB: [0.58, 0.129, 0.956, 0.394]\nC: [0.811, 0.092, 1.0, 0.669]\nD: [0.777, 0.058, 0.966, 0.635]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 480.\nCAPTION: Hand of human holding food and pulling lizard up", "context": "Select from the following choices.\nA: [0.809, 0.0, 0.998, 0.577]\nB: [0.58, 0.129, 0.956, 0.394]\nC: [0.811, 0.092, 1.0, 0.669]\nD: [0.777, 0.058, 0.966, 0.635]", "input_image_path": ["./Continuous-temporal/mevis/mevis_82_0.jpg", "./Continuous-temporal/mevis/mevis_82_1.jpg", "./Continuous-temporal/mevis/mevis_82_2.jpg", "./Continuous-temporal/mevis/mevis_82_3.jpg", "./Continuous-temporal/mevis/mevis_82_4.jpg", "./Continuous-temporal/mevis/mevis_82_5.jpg", "./Continuous-temporal/mevis/mevis_82_6.jpg", "./Continuous-temporal/mevis/mevis_82_7.jpg", "./Continuous-temporal/mevis/mevis_82_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.236, 0.463, 0.27, 0.502]\nB: [0.214, 0.235, 0.507, 0.347]\nC: [0.341, 0.844, 0.493, 0.93]\nD: [0.236, 0.463, 0.275, 0.502]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: The first car moving in a straight line.", "context": "Select from the following choices.\nA: [0.236, 0.463, 0.27, 0.502]\nB: [0.214, 0.235, 0.507, 0.347]\nC: [0.341, 0.844, 0.493, 0.93]\nD: [0.236, 0.463, 0.275, 0.502]", "input_image_path": ["./Continuous-temporal/mevis/mevis_83_0.jpg", "./Continuous-temporal/mevis/mevis_83_1.jpg", "./Continuous-temporal/mevis/mevis_83_2.jpg", "./Continuous-temporal/mevis/mevis_83_3.jpg", "./Continuous-temporal/mevis/mevis_83_4.jpg", "./Continuous-temporal/mevis/mevis_83_5.jpg", "./Continuous-temporal/mevis/mevis_83_6.jpg", "./Continuous-temporal/mevis/mevis_83_7.jpg", "./Continuous-temporal/mevis/mevis_83_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.31, 0.168, 0.824, 0.958]\nB: [0.07, 0.769, 0.559, 0.911]\nC: [0.321, 0.693, 0.776, 0.816]\nD: [0.266, 0.163, 0.781, 0.954]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The horse facing right then turning and facing left", "context": "Select from the following choices.\nA: [0.31, 0.168, 0.824, 0.958]\nB: [0.07, 0.769, 0.559, 0.911]\nC: [0.321, 0.693, 0.776, 0.816]\nD: [0.266, 0.163, 0.781, 0.954]", "input_image_path": ["./Continuous-temporal/mevis/mevis_84_0.jpg", "./Continuous-temporal/mevis/mevis_84_1.jpg", "./Continuous-temporal/mevis/mevis_84_2.jpg", "./Continuous-temporal/mevis/mevis_84_3.jpg", "./Continuous-temporal/mevis/mevis_84_4.jpg", "./Continuous-temporal/mevis/mevis_84_5.jpg", "./Continuous-temporal/mevis/mevis_84_6.jpg", "./Continuous-temporal/mevis/mevis_84_7.jpg", "./Continuous-temporal/mevis/mevis_84_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.687, 0.43, 0.892, 0.709]\nB: [0.389, 0.685, 0.827, 0.74]\nC: [0.686, 0.508, 0.87, 0.754]\nD: [0.687, 0.43, 0.871, 0.675]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Bicycle moving in a circle", "context": "Select from the following choices.\nA: [0.687, 0.43, 0.892, 0.709]\nB: [0.389, 0.685, 0.827, 0.74]\nC: [0.686, 0.508, 0.87, 0.754]\nD: [0.687, 0.43, 0.871, 0.675]", "input_image_path": ["./Continuous-temporal/mevis/mevis_85_0.jpg", "./Continuous-temporal/mevis/mevis_85_1.jpg", "./Continuous-temporal/mevis/mevis_85_2.jpg", "./Continuous-temporal/mevis/mevis_85_3.jpg", "./Continuous-temporal/mevis/mevis_85_4.jpg", "./Continuous-temporal/mevis/mevis_85_5.jpg", "./Continuous-temporal/mevis/mevis_85_6.jpg", "./Continuous-temporal/mevis/mevis_85_7.jpg", "./Continuous-temporal/mevis/mevis_85_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.267, 0.388, 0.508, 0.823]\nB: [0.115, 0.0, 0.357, 0.435]\nC: [0.169, 0.184, 0.41, 0.619]\nD: [0.122, 0.181, 0.364, 0.617]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The second bird to reach the bottom of the cage.", "context": "Select from the following choices.\nA: [0.267, 0.388, 0.508, 0.823]\nB: [0.115, 0.0, 0.357, 0.435]\nC: [0.169, 0.184, 0.41, 0.619]\nD: [0.122, 0.181, 0.364, 0.617]", "input_image_path": ["./Continuous-temporal/mevis/mevis_86_0.jpg", "./Continuous-temporal/mevis/mevis_86_1.jpg", "./Continuous-temporal/mevis/mevis_86_2.jpg", "./Continuous-temporal/mevis/mevis_86_3.jpg", "./Continuous-temporal/mevis/mevis_86_4.jpg", "./Continuous-temporal/mevis/mevis_86_5.jpg", "./Continuous-temporal/mevis/mevis_86_6.jpg", "./Continuous-temporal/mevis/mevis_86_7.jpg", "./Continuous-temporal/mevis/mevis_86_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.324, 0.576, 0.95, 0.734]\nB: [0.369, 0.517, 0.994, 0.676]\nC: [0.35, 0.59, 0.976, 0.748]\nD: [0.333, 0.072, 0.674, 0.306]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1080 and the height is 1920.\nCAPTION: white fish swiming and moving a bit", "context": "Select from the following choices.\nA: [0.324, 0.576, 0.95, 0.734]\nB: [0.369, 0.517, 0.994, 0.676]\nC: [0.35, 0.59, 0.976, 0.748]\nD: [0.333, 0.072, 0.674, 0.306]", "input_image_path": ["./Continuous-temporal/mevis/mevis_87_0.jpg", "./Continuous-temporal/mevis/mevis_87_1.jpg", "./Continuous-temporal/mevis/mevis_87_2.jpg", "./Continuous-temporal/mevis/mevis_87_3.jpg", "./Continuous-temporal/mevis/mevis_87_4.jpg", "./Continuous-temporal/mevis/mevis_87_5.jpg", "./Continuous-temporal/mevis/mevis_87_6.jpg", "./Continuous-temporal/mevis/mevis_87_7.jpg", "./Continuous-temporal/mevis/mevis_87_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.324, 0.006, 0.572, 0.471]\nB: [0.765, 0.374, 0.807, 0.671]\nC: [0.765, 0.374, 0.806, 0.701]\nD: [0.026, 0.234, 0.297, 0.251]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 2340 and the height is 1080.\nCAPTION: A bear standing upright, twirling a hula hoop with its neck.", "context": "Select from the following choices.\nA: [0.324, 0.006, 0.572, 0.471]\nB: [0.765, 0.374, 0.807, 0.671]\nC: [0.765, 0.374, 0.806, 0.701]\nD: [0.026, 0.234, 0.297, 0.251]", "input_image_path": ["./Continuous-temporal/mevis/mevis_88_0.jpg", "./Continuous-temporal/mevis/mevis_88_1.jpg", "./Continuous-temporal/mevis/mevis_88_2.jpg", "./Continuous-temporal/mevis/mevis_88_3.jpg", "./Continuous-temporal/mevis/mevis_88_4.jpg", "./Continuous-temporal/mevis/mevis_88_5.jpg", "./Continuous-temporal/mevis/mevis_88_6.jpg", "./Continuous-temporal/mevis/mevis_88_7.jpg", "./Continuous-temporal/mevis/mevis_88_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.038, 0.141, 0.311, 0.554]\nB: [0.352, 0.368, 0.706, 0.46]\nC: [0.16, 0.219, 0.481, 0.745]\nD: [0.229, 0.09, 0.551, 0.617]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: The white rabbit that hasn't moved from its position.", "context": "Select from the following choices.\nA: [0.038, 0.141, 0.311, 0.554]\nB: [0.352, 0.368, 0.706, 0.46]\nC: [0.16, 0.219, 0.481, 0.745]\nD: [0.229, 0.09, 0.551, 0.617]", "input_image_path": ["./Continuous-temporal/mevis/mevis_89_0.jpg", "./Continuous-temporal/mevis/mevis_89_1.jpg", "./Continuous-temporal/mevis/mevis_89_2.jpg", "./Continuous-temporal/mevis/mevis_89_3.jpg", "./Continuous-temporal/mevis/mevis_89_4.jpg", "./Continuous-temporal/mevis/mevis_89_5.jpg", "./Continuous-temporal/mevis/mevis_89_6.jpg", "./Continuous-temporal/mevis/mevis_89_7.jpg", "./Continuous-temporal/mevis/mevis_89_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.286, 0.38, 0.393, 0.766]\nB: [0.283, 0.341, 0.39, 0.727]\nC: [0.217, 0.722, 0.513, 0.913]\nD: [0.254, 0.176, 0.36, 0.562]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: turn and walk away from us", "context": "Select from the following choices.\nA: [0.286, 0.38, 0.393, 0.766]\nB: [0.283, 0.341, 0.39, 0.727]\nC: [0.217, 0.722, 0.513, 0.913]\nD: [0.254, 0.176, 0.36, 0.562]", "input_image_path": ["./Continuous-temporal/mevis/mevis_90_0.jpg", "./Continuous-temporal/mevis/mevis_90_1.jpg", "./Continuous-temporal/mevis/mevis_90_2.jpg", "./Continuous-temporal/mevis/mevis_90_3.jpg", "./Continuous-temporal/mevis/mevis_90_4.jpg", "./Continuous-temporal/mevis/mevis_90_5.jpg", "./Continuous-temporal/mevis/mevis_90_6.jpg", "./Continuous-temporal/mevis/mevis_90_7.jpg", "./Continuous-temporal/mevis/mevis_90_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.111, 0.251, 0.29, 0.589]\nB: [0.051, 0.513, 0.218, 0.995]\nC: [0.041, 0.149, 0.22, 0.487]\nD: [0.078, 0.387, 0.43, 0.512]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The bird that arrived at the cage bottom in second place.", "context": "Select from the following choices.\nA: [0.111, 0.251, 0.29, 0.589]\nB: [0.051, 0.513, 0.218, 0.995]\nC: [0.041, 0.149, 0.22, 0.487]\nD: [0.078, 0.387, 0.43, 0.512]", "input_image_path": ["./Continuous-temporal/mevis/mevis_91_0.jpg", "./Continuous-temporal/mevis/mevis_91_1.jpg", "./Continuous-temporal/mevis/mevis_91_2.jpg", "./Continuous-temporal/mevis/mevis_91_3.jpg", "./Continuous-temporal/mevis/mevis_91_4.jpg", "./Continuous-temporal/mevis/mevis_91_5.jpg", "./Continuous-temporal/mevis/mevis_91_6.jpg", "./Continuous-temporal/mevis/mevis_91_7.jpg", "./Continuous-temporal/mevis/mevis_91_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.561, 0.561, 0.783, 0.887]\nB: [0.456, 0.659, 0.679, 0.985]\nC: [0.561, 0.561, 0.76, 0.854]\nD: [0.561, 0.561, 0.783, 0.858]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: sheep eating food from bowl then eating food from human hand", "context": "Select from the following choices.\nA: [0.561, 0.561, 0.783, 0.887]\nB: [0.456, 0.659, 0.679, 0.985]\nC: [0.561, 0.561, 0.76, 0.854]\nD: [0.561, 0.561, 0.783, 0.858]", "input_image_path": ["./Continuous-temporal/mevis/mevis_92_0.jpg", "./Continuous-temporal/mevis/mevis_92_1.jpg", "./Continuous-temporal/mevis/mevis_92_2.jpg", "./Continuous-temporal/mevis/mevis_92_3.jpg", "./Continuous-temporal/mevis/mevis_92_4.jpg", "./Continuous-temporal/mevis/mevis_92_5.jpg", "./Continuous-temporal/mevis/mevis_92_6.jpg", "./Continuous-temporal/mevis/mevis_92_7.jpg", "./Continuous-temporal/mevis/mevis_92_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.027, 0.0, 0.999, 0.999]\nB: [0.349, 0.066, 0.418, 0.457]\nC: [0.002, 0.0, 0.974, 0.999]\nD: [0.027, 0.0, 0.853, 1.096]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The individual gripping the lizard.", "context": "Select from the following choices.\nA: [0.027, 0.0, 0.999, 0.999]\nB: [0.349, 0.066, 0.418, 0.457]\nC: [0.002, 0.0, 0.974, 0.999]\nD: [0.027, 0.0, 0.853, 1.096]", "input_image_path": ["./Continuous-temporal/mevis/mevis_93_0.jpg", "./Continuous-temporal/mevis/mevis_93_1.jpg", "./Continuous-temporal/mevis/mevis_93_2.jpg", "./Continuous-temporal/mevis/mevis_93_3.jpg", "./Continuous-temporal/mevis/mevis_93_4.jpg", "./Continuous-temporal/mevis/mevis_93_5.jpg", "./Continuous-temporal/mevis/mevis_93_6.jpg", "./Continuous-temporal/mevis/mevis_93_7.jpg", "./Continuous-temporal/mevis/mevis_93_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.223, 0.608, 0.363, 1.0]\nB: [0.233, 0.388, 0.373, 0.779]\nC: [0.27, 0.458, 0.41, 0.85]\nD: [0.27, 0.458, 0.384, 0.852]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 636 and the height is 480.\nCAPTION: The tiger that moved from the right to the left.", "context": "Select from the following choices.\nA: [0.223, 0.608, 0.363, 1.0]\nB: [0.233, 0.388, 0.373, 0.779]\nC: [0.27, 0.458, 0.41, 0.85]\nD: [0.27, 0.458, 0.384, 0.852]", "input_image_path": ["./Continuous-temporal/mevis/mevis_94_0.jpg", "./Continuous-temporal/mevis/mevis_94_1.jpg", "./Continuous-temporal/mevis/mevis_94_2.jpg", "./Continuous-temporal/mevis/mevis_94_3.jpg", "./Continuous-temporal/mevis/mevis_94_4.jpg", "./Continuous-temporal/mevis/mevis_94_5.jpg", "./Continuous-temporal/mevis/mevis_94_6.jpg", "./Continuous-temporal/mevis/mevis_94_7.jpg", "./Continuous-temporal/mevis/mevis_94_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.398, 0.841, 0.564, 1.0]\nB: [0.493, 0.807, 0.658, 0.967]\nC: [0.465, 0.84, 0.661, 1.019]\nD: [0.465, 0.84, 0.63, 0.999]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: fish stay still without moving position", "context": "Select from the following choices.\nA: [0.398, 0.841, 0.564, 1.0]\nB: [0.493, 0.807, 0.658, 0.967]\nC: [0.465, 0.84, 0.661, 1.019]\nD: [0.465, 0.84, 0.63, 0.999]", "input_image_path": ["./Continuous-temporal/mevis/mevis_95_0.jpg", "./Continuous-temporal/mevis/mevis_95_1.jpg", "./Continuous-temporal/mevis/mevis_95_2.jpg", "./Continuous-temporal/mevis/mevis_95_3.jpg", "./Continuous-temporal/mevis/mevis_95_4.jpg", "./Continuous-temporal/mevis/mevis_95_5.jpg", "./Continuous-temporal/mevis/mevis_95_6.jpg", "./Continuous-temporal/mevis/mevis_95_7.jpg", "./Continuous-temporal/mevis/mevis_95_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.226, 0.56, 0.609, 0.968]\nB: [0.144, 0.591, 0.527, 0.998]\nC: [0.313, 0.592, 0.697, 1.0]\nD: [0.144, 0.591, 0.553, 0.941]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 596.\nCAPTION: Panda turning around and moving from leftmost to the middle then lying down to eat", "context": "Select from the following choices.\nA: [0.226, 0.56, 0.609, 0.968]\nB: [0.144, 0.591, 0.527, 0.998]\nC: [0.313, 0.592, 0.697, 1.0]\nD: [0.144, 0.591, 0.553, 0.941]", "input_image_path": ["./Continuous-temporal/mevis/mevis_96_0.jpg", "./Continuous-temporal/mevis/mevis_96_1.jpg", "./Continuous-temporal/mevis/mevis_96_2.jpg", "./Continuous-temporal/mevis/mevis_96_3.jpg", "./Continuous-temporal/mevis/mevis_96_4.jpg", "./Continuous-temporal/mevis/mevis_96_5.jpg", "./Continuous-temporal/mevis/mevis_96_6.jpg", "./Continuous-temporal/mevis/mevis_96_7.jpg", "./Continuous-temporal/mevis/mevis_96_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.078, 0.251, 0.358, 0.994]\nB: [0.028, 0.251, 0.308, 0.994]\nC: [0.869, 0.508, 0.948, 0.934]\nD: [0.078, 0.251, 0.383, 1.017]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 362.\nCAPTION: The walking cow that was the first to approach.", "context": "Select from the following choices.\nA: [0.078, 0.251, 0.358, 0.994]\nB: [0.028, 0.251, 0.308, 0.994]\nC: [0.869, 0.508, 0.948, 0.934]\nD: [0.078, 0.251, 0.383, 1.017]", "input_image_path": ["./Continuous-temporal/mevis/mevis_97_0.jpg", "./Continuous-temporal/mevis/mevis_97_1.jpg", "./Continuous-temporal/mevis/mevis_97_2.jpg", "./Continuous-temporal/mevis/mevis_97_3.jpg", "./Continuous-temporal/mevis/mevis_97_4.jpg", "./Continuous-temporal/mevis/mevis_97_5.jpg", "./Continuous-temporal/mevis/mevis_97_6.jpg", "./Continuous-temporal/mevis/mevis_97_7.jpg", "./Continuous-temporal/mevis/mevis_97_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.327, 0.329, 0.722, 0.667]\nB: [0.278, 0.179, 0.715, 0.517]\nC: [0.234, 0.444, 0.671, 0.781]\nD: [0.327, 0.329, 0.764, 0.667]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 636 and the height is 480.\nCAPTION: Tiger walking from right to left", "context": "Select from the following choices.\nA: [0.327, 0.329, 0.722, 0.667]\nB: [0.278, 0.179, 0.715, 0.517]\nC: [0.234, 0.444, 0.671, 0.781]\nD: [0.327, 0.329, 0.764, 0.667]", "input_image_path": ["./Continuous-temporal/mevis/mevis_98_0.jpg", "./Continuous-temporal/mevis/mevis_98_1.jpg", "./Continuous-temporal/mevis/mevis_98_2.jpg", "./Continuous-temporal/mevis/mevis_98_3.jpg", "./Continuous-temporal/mevis/mevis_98_4.jpg", "./Continuous-temporal/mevis/mevis_98_5.jpg", "./Continuous-temporal/mevis/mevis_98_6.jpg", "./Continuous-temporal/mevis/mevis_98_7.jpg", "./Continuous-temporal/mevis/mevis_98_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.595, 0.534, 0.818, 0.857]\nB: [0.297, 0.076, 0.501, 1.018]\nC: [0.259, 0.125, 0.494, 0.987]\nD: [0.297, 0.076, 0.532, 0.938]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: cat does not change position but lowered head", "context": "Select from the following choices.\nA: [0.595, 0.534, 0.818, 0.857]\nB: [0.297, 0.076, 0.501, 1.018]\nC: [0.259, 0.125, 0.494, 0.987]\nD: [0.297, 0.076, 0.532, 0.938]", "input_image_path": ["./Continuous-temporal/mevis/mevis_99_0.jpg", "./Continuous-temporal/mevis/mevis_99_1.jpg", "./Continuous-temporal/mevis/mevis_99_2.jpg", "./Continuous-temporal/mevis/mevis_99_3.jpg", "./Continuous-temporal/mevis/mevis_99_4.jpg", "./Continuous-temporal/mevis/mevis_99_5.jpg", "./Continuous-temporal/mevis/mevis_99_6.jpg", "./Continuous-temporal/mevis/mevis_99_7.jpg", "./Continuous-temporal/mevis/mevis_99_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.123, 0.256, 0.179, 0.609]\nB: [0.123, 0.256, 0.182, 0.572]\nC: [0.112, 0.396, 0.179, 0.707]\nD: [0.123, 0.256, 0.19, 0.567]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 2340 and the height is 1080.\nCAPTION: The trainer on the left, guiding the bear to perform the hula hoop trick using its mouth.", "context": "Select from the following choices.\nA: [0.123, 0.256, 0.179, 0.609]\nB: [0.123, 0.256, 0.182, 0.572]\nC: [0.112, 0.396, 0.179, 0.707]\nD: [0.123, 0.256, 0.19, 0.567]", "input_image_path": ["./Continuous-temporal/mevis/mevis_100_0.jpg", "./Continuous-temporal/mevis/mevis_100_1.jpg", "./Continuous-temporal/mevis/mevis_100_2.jpg", "./Continuous-temporal/mevis/mevis_100_3.jpg", "./Continuous-temporal/mevis/mevis_100_4.jpg", "./Continuous-temporal/mevis/mevis_100_5.jpg", "./Continuous-temporal/mevis/mevis_100_6.jpg", "./Continuous-temporal/mevis/mevis_100_7.jpg", "./Continuous-temporal/mevis/mevis_100_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.667, 0.375, 0.726, 0.688]\nB: [0.667, 0.375, 0.742, 0.667]\nC: [0.667, 0.375, 0.74, 0.696]\nD: [0.672, 0.296, 0.745, 0.617]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 480.\nCAPTION: The bird that is standing still on the wooden pole in the right cage.", "context": "Select from the following choices.\nA: [0.667, 0.375, 0.726, 0.688]\nB: [0.667, 0.375, 0.742, 0.667]\nC: [0.667, 0.375, 0.74, 0.696]\nD: [0.672, 0.296, 0.745, 0.617]", "input_image_path": ["./Continuous-temporal/mevis/mevis_101_0.jpg", "./Continuous-temporal/mevis/mevis_101_1.jpg", "./Continuous-temporal/mevis/mevis_101_2.jpg", "./Continuous-temporal/mevis/mevis_101_3.jpg", "./Continuous-temporal/mevis/mevis_101_4.jpg", "./Continuous-temporal/mevis/mevis_101_5.jpg", "./Continuous-temporal/mevis/mevis_101_6.jpg", "./Continuous-temporal/mevis/mevis_101_7.jpg", "./Continuous-temporal/mevis/mevis_101_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.338, 0.908, 0.401, 0.919]\nB: [0.232, 0.002, 1.0, 1.0]\nC: [0.232, 0.0, 1.0, 0.998]\nD: [0.231, 0.0, 0.999, 0.998]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: turtle does not chang position, eating sands then look up", "context": "Select from the following choices.\nA: [0.338, 0.908, 0.401, 0.919]\nB: [0.232, 0.002, 1.0, 1.0]\nC: [0.232, 0.0, 1.0, 0.998]\nD: [0.231, 0.0, 0.999, 0.998]", "input_image_path": ["./Continuous-temporal/mevis/mevis_102_0.jpg", "./Continuous-temporal/mevis/mevis_102_1.jpg", "./Continuous-temporal/mevis/mevis_102_2.jpg", "./Continuous-temporal/mevis/mevis_102_3.jpg", "./Continuous-temporal/mevis/mevis_102_4.jpg", "./Continuous-temporal/mevis/mevis_102_5.jpg", "./Continuous-temporal/mevis/mevis_102_6.jpg", "./Continuous-temporal/mevis/mevis_102_7.jpg", "./Continuous-temporal/mevis/mevis_102_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.318, 0.467, 0.571, 0.866]\nB: [0.435, 0.299, 0.688, 0.698]\nC: [0.122, 0.596, 0.486, 0.725]\nD: [0.435, 0.299, 0.668, 0.681]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The lizard that was finally picked up by hand.", "context": "Select from the following choices.\nA: [0.318, 0.467, 0.571, 0.866]\nB: [0.435, 0.299, 0.688, 0.698]\nC: [0.122, 0.596, 0.486, 0.725]\nD: [0.435, 0.299, 0.668, 0.681]", "input_image_path": ["./Continuous-temporal/mevis/mevis_103_0.jpg", "./Continuous-temporal/mevis/mevis_103_1.jpg", "./Continuous-temporal/mevis/mevis_103_2.jpg", "./Continuous-temporal/mevis/mevis_103_3.jpg", "./Continuous-temporal/mevis/mevis_103_4.jpg", "./Continuous-temporal/mevis/mevis_103_5.jpg", "./Continuous-temporal/mevis/mevis_103_6.jpg", "./Continuous-temporal/mevis/mevis_103_7.jpg", "./Continuous-temporal/mevis/mevis_103_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.122, 0.013, 0.556, 1.099]\nB: [0.239, 0.853, 0.337, 0.969]\nC: [0.377, 0.331, 0.759, 0.392]\nD: [0.122, 0.013, 0.617, 0.943]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The cat that first climbed the cat tree.", "context": "Select from the following choices.\nA: [0.122, 0.013, 0.556, 1.099]\nB: [0.239, 0.853, 0.337, 0.969]\nC: [0.377, 0.331, 0.759, 0.392]\nD: [0.122, 0.013, 0.617, 0.943]", "input_image_path": ["./Continuous-temporal/mevis/mevis_104_0.jpg", "./Continuous-temporal/mevis/mevis_104_1.jpg", "./Continuous-temporal/mevis/mevis_104_2.jpg", "./Continuous-temporal/mevis/mevis_104_3.jpg", "./Continuous-temporal/mevis/mevis_104_4.jpg", "./Continuous-temporal/mevis/mevis_104_5.jpg", "./Continuous-temporal/mevis/mevis_104_6.jpg", "./Continuous-temporal/mevis/mevis_104_7.jpg", "./Continuous-temporal/mevis/mevis_104_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.085, 0.445, 0.406, 1.039]\nB: [0.085, 0.445, 0.37, 0.998]\nC: [0.086, 0.446, 0.37, 1.0]\nD: [0.185, 0.292, 0.47, 0.846]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 596.\nCAPTION: Panda turning around and moving forward from leftmost to rightmost", "context": "Select from the following choices.\nA: [0.085, 0.445, 0.406, 1.039]\nB: [0.085, 0.445, 0.37, 0.998]\nC: [0.086, 0.446, 0.37, 1.0]\nD: [0.185, 0.292, 0.47, 0.846]", "input_image_path": ["./Continuous-temporal/mevis/mevis_105_0.jpg", "./Continuous-temporal/mevis/mevis_105_1.jpg", "./Continuous-temporal/mevis/mevis_105_2.jpg", "./Continuous-temporal/mevis/mevis_105_3.jpg", "./Continuous-temporal/mevis/mevis_105_4.jpg", "./Continuous-temporal/mevis/mevis_105_5.jpg", "./Continuous-temporal/mevis/mevis_105_6.jpg", "./Continuous-temporal/mevis/mevis_105_7.jpg", "./Continuous-temporal/mevis/mevis_105_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.216, 0.701, 0.324, 1.0]\nB: [0.214, 0.693, 0.322, 0.992]\nC: [0.214, 0.571, 0.322, 0.87]\nD: [0.24, 0.699, 0.348, 0.998]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The bird that reached the cage bottom before others.", "context": "Select from the following choices.\nA: [0.216, 0.701, 0.324, 1.0]\nB: [0.214, 0.693, 0.322, 0.992]\nC: [0.214, 0.571, 0.322, 0.87]\nD: [0.24, 0.699, 0.348, 0.998]", "input_image_path": ["./Continuous-temporal/mevis/mevis_106_0.jpg", "./Continuous-temporal/mevis/mevis_106_1.jpg", "./Continuous-temporal/mevis/mevis_106_2.jpg", "./Continuous-temporal/mevis/mevis_106_3.jpg", "./Continuous-temporal/mevis/mevis_106_4.jpg", "./Continuous-temporal/mevis/mevis_106_5.jpg", "./Continuous-temporal/mevis/mevis_106_6.jpg", "./Continuous-temporal/mevis/mevis_106_7.jpg", "./Continuous-temporal/mevis/mevis_106_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.587, 0.322, 0.825, 0.853]\nB: [0.272, 0.179, 0.45, 0.271]\nC: [0.63, 0.44, 0.868, 0.971]\nD: [0.587, 0.322, 0.799, 0.957]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: monkey crawling around on rocks then jumping to the left", "context": "Select from the following choices.\nA: [0.587, 0.322, 0.825, 0.853]\nB: [0.272, 0.179, 0.45, 0.271]\nC: [0.63, 0.44, 0.868, 0.971]\nD: [0.587, 0.322, 0.799, 0.957]", "input_image_path": ["./Continuous-temporal/mevis/mevis_107_0.jpg", "./Continuous-temporal/mevis/mevis_107_1.jpg", "./Continuous-temporal/mevis/mevis_107_2.jpg", "./Continuous-temporal/mevis/mevis_107_3.jpg", "./Continuous-temporal/mevis/mevis_107_4.jpg", "./Continuous-temporal/mevis/mevis_107_5.jpg", "./Continuous-temporal/mevis/mevis_107_6.jpg", "./Continuous-temporal/mevis/mevis_107_7.jpg", "./Continuous-temporal/mevis/mevis_107_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.196, 0.0, 0.481, 0.836]\nB: [0.092, 0.0, 0.377, 0.836]\nC: [0.196, 0.0, 0.463, 0.77]\nD: [0.156, 0.0, 0.442, 0.836]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: Child walking and holding dog", "context": "Select from the following choices.\nA: [0.196, 0.0, 0.481, 0.836]\nB: [0.092, 0.0, 0.377, 0.836]\nC: [0.196, 0.0, 0.463, 0.77]\nD: [0.156, 0.0, 0.442, 0.836]", "input_image_path": ["./Continuous-temporal/mevis/mevis_108_0.jpg", "./Continuous-temporal/mevis/mevis_108_1.jpg", "./Continuous-temporal/mevis/mevis_108_2.jpg", "./Continuous-temporal/mevis/mevis_108_3.jpg", "./Continuous-temporal/mevis/mevis_108_4.jpg", "./Continuous-temporal/mevis/mevis_108_5.jpg", "./Continuous-temporal/mevis/mevis_108_6.jpg", "./Continuous-temporal/mevis/mevis_108_7.jpg", "./Continuous-temporal/mevis/mevis_108_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.613, 0.489, 0.919, 0.603]\nB: [0.47, 0.017, 0.69, 0.178]\nC: [0.613, 0.489, 0.871, 0.598]\nD: [0.613, 0.489, 0.891, 0.598]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 604 and the height is 1280.\nCAPTION: dog playing with monkey", "context": "Select from the following choices.\nA: [0.613, 0.489, 0.919, 0.603]\nB: [0.47, 0.017, 0.69, 0.178]\nC: [0.613, 0.489, 0.871, 0.598]\nD: [0.613, 0.489, 0.891, 0.598]", "input_image_path": ["./Continuous-temporal/mevis/mevis_109_0.jpg", "./Continuous-temporal/mevis/mevis_109_1.jpg", "./Continuous-temporal/mevis/mevis_109_2.jpg", "./Continuous-temporal/mevis/mevis_109_3.jpg", "./Continuous-temporal/mevis/mevis_109_4.jpg", "./Continuous-temporal/mevis/mevis_109_5.jpg", "./Continuous-temporal/mevis/mevis_109_6.jpg", "./Continuous-temporal/mevis/mevis_109_7.jpg", "./Continuous-temporal/mevis/mevis_109_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.53, 0.183, 0.836, 0.911]\nB: [0.53, 0.183, 0.866, 0.99]\nC: [0.384, 0.272, 0.69, 1.0]\nD: [0.396, 0.0, 0.702, 0.728]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The darker-colored one among the two dogs playing together.", "context": "Select from the following choices.\nA: [0.53, 0.183, 0.836, 0.911]\nB: [0.53, 0.183, 0.866, 0.99]\nC: [0.384, 0.272, 0.69, 1.0]\nD: [0.396, 0.0, 0.702, 0.728]", "input_image_path": ["./Continuous-temporal/mevis/mevis_110_0.jpg", "./Continuous-temporal/mevis/mevis_110_1.jpg", "./Continuous-temporal/mevis/mevis_110_2.jpg", "./Continuous-temporal/mevis/mevis_110_3.jpg", "./Continuous-temporal/mevis/mevis_110_4.jpg", "./Continuous-temporal/mevis/mevis_110_5.jpg", "./Continuous-temporal/mevis/mevis_110_6.jpg", "./Continuous-temporal/mevis/mevis_110_7.jpg", "./Continuous-temporal/mevis/mevis_110_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.067, 0.082, 0.288, 0.254]\nB: [0.72, 0.243, 0.741, 0.544]\nC: [0.961, 0.199, 1.001, 0.396]\nD: [0.961, 0.199, 0.999, 0.386]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The black car parked without moving.", "context": "Select from the following choices.\nA: [0.067, 0.082, 0.288, 0.254]\nB: [0.72, 0.243, 0.741, 0.544]\nC: [0.961, 0.199, 1.001, 0.396]\nD: [0.961, 0.199, 0.999, 0.386]", "input_image_path": ["./Continuous-temporal/mevis/mevis_111_0.jpg", "./Continuous-temporal/mevis/mevis_111_1.jpg", "./Continuous-temporal/mevis/mevis_111_2.jpg", "./Continuous-temporal/mevis/mevis_111_3.jpg", "./Continuous-temporal/mevis/mevis_111_4.jpg", "./Continuous-temporal/mevis/mevis_111_5.jpg", "./Continuous-temporal/mevis/mevis_111_6.jpg", "./Continuous-temporal/mevis/mevis_111_7.jpg", "./Continuous-temporal/mevis/mevis_111_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.177, 0.391, 0.631, 0.549]\nB: [0.291, 0.486, 0.47, 0.681]\nC: [0.263, 0.063, 0.436, 0.385]\nD: [0.243, 0.53, 0.422, 0.724]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1080 and the height is 1152.\nCAPTION: The big bear is moving with three small bear cubs in tow across the road.", "context": "Select from the following choices.\nA: [0.177, 0.391, 0.631, 0.549]\nB: [0.291, 0.486, 0.47, 0.681]\nC: [0.263, 0.063, 0.436, 0.385]\nD: [0.243, 0.53, 0.422, 0.724]", "input_image_path": ["./Continuous-temporal/mevis/mevis_112_0.jpg", "./Continuous-temporal/mevis/mevis_112_1.jpg", "./Continuous-temporal/mevis/mevis_112_2.jpg", "./Continuous-temporal/mevis/mevis_112_3.jpg", "./Continuous-temporal/mevis/mevis_112_4.jpg", "./Continuous-temporal/mevis/mevis_112_5.jpg", "./Continuous-temporal/mevis/mevis_112_6.jpg", "./Continuous-temporal/mevis/mevis_112_7.jpg", "./Continuous-temporal/mevis/mevis_112_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.826, 0.279, 0.853, 0.622]\nB: [0.826, 0.279, 0.857, 0.632]\nC: [0.391, 0.739, 0.438, 0.837]\nD: [0.826, 0.279, 0.861, 0.641]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 2340 and the height is 1080.\nCAPTION: A man clapping and stepping back while next to the hula-hooping bear.", "context": "Select from the following choices.\nA: [0.826, 0.279, 0.853, 0.622]\nB: [0.826, 0.279, 0.857, 0.632]\nC: [0.391, 0.739, 0.438, 0.837]\nD: [0.826, 0.279, 0.861, 0.641]", "input_image_path": ["./Continuous-temporal/mevis/mevis_113_0.jpg", "./Continuous-temporal/mevis/mevis_113_1.jpg", "./Continuous-temporal/mevis/mevis_113_2.jpg", "./Continuous-temporal/mevis/mevis_113_3.jpg", "./Continuous-temporal/mevis/mevis_113_4.jpg", "./Continuous-temporal/mevis/mevis_113_5.jpg", "./Continuous-temporal/mevis/mevis_113_6.jpg", "./Continuous-temporal/mevis/mevis_113_7.jpg", "./Continuous-temporal/mevis/mevis_113_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.479, 0.202, 0.684, 0.675]\nB: [0.285, 0.381, 0.652, 0.762]\nC: [0.416, 0.167, 0.621, 0.639]\nD: [0.479, 0.202, 0.705, 0.627]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 512 and the height is 252.\nCAPTION: Horse moving around", "context": "Select from the following choices.\nA: [0.479, 0.202, 0.684, 0.675]\nB: [0.285, 0.381, 0.652, 0.762]\nC: [0.416, 0.167, 0.621, 0.639]\nD: [0.479, 0.202, 0.705, 0.627]", "input_image_path": ["./Continuous-temporal/mevis/mevis_114_0.jpg", "./Continuous-temporal/mevis/mevis_114_1.jpg", "./Continuous-temporal/mevis/mevis_114_2.jpg", "./Continuous-temporal/mevis/mevis_114_3.jpg", "./Continuous-temporal/mevis/mevis_114_4.jpg", "./Continuous-temporal/mevis/mevis_114_5.jpg", "./Continuous-temporal/mevis/mevis_114_6.jpg", "./Continuous-temporal/mevis/mevis_114_7.jpg", "./Continuous-temporal/mevis/mevis_114_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.127, 0.294, 0.187, 0.575]\nB: [0.101, 0.391, 0.16, 0.672]\nC: [0.113, 0.282, 0.162, 0.612]\nD: [0.113, 0.282, 0.173, 0.564]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 2340 and the height is 1080.\nCAPTION: Man walking backwards and taking a hoop from a bear", "context": "Select from the following choices.\nA: [0.127, 0.294, 0.187, 0.575]\nB: [0.101, 0.391, 0.16, 0.672]\nC: [0.113, 0.282, 0.162, 0.612]\nD: [0.113, 0.282, 0.173, 0.564]", "input_image_path": ["./Continuous-temporal/mevis/mevis_115_0.jpg", "./Continuous-temporal/mevis/mevis_115_1.jpg", "./Continuous-temporal/mevis/mevis_115_2.jpg", "./Continuous-temporal/mevis/mevis_115_3.jpg", "./Continuous-temporal/mevis/mevis_115_4.jpg", "./Continuous-temporal/mevis/mevis_115_5.jpg", "./Continuous-temporal/mevis/mevis_115_6.jpg", "./Continuous-temporal/mevis/mevis_115_7.jpg", "./Continuous-temporal/mevis/mevis_115_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.714, 0.632, 0.748, 0.746]\nB: [0.714, 0.632, 0.746, 0.732]\nC: [0.714, 0.632, 0.74, 0.733]\nD: [0.714, 0.632, 0.751, 0.748]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: The running horse next to the white fence.", "context": "Select from the following choices.\nA: [0.714, 0.632, 0.748, 0.746]\nB: [0.714, 0.632, 0.746, 0.732]\nC: [0.714, 0.632, 0.74, 0.733]\nD: [0.714, 0.632, 0.751, 0.748]", "input_image_path": ["./Continuous-temporal/mevis/mevis_116_0.jpg", "./Continuous-temporal/mevis/mevis_116_1.jpg", "./Continuous-temporal/mevis/mevis_116_2.jpg", "./Continuous-temporal/mevis/mevis_116_3.jpg", "./Continuous-temporal/mevis/mevis_116_4.jpg", "./Continuous-temporal/mevis/mevis_116_5.jpg", "./Continuous-temporal/mevis/mevis_116_6.jpg", "./Continuous-temporal/mevis/mevis_116_7.jpg", "./Continuous-temporal/mevis/mevis_116_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.032, 0.599, 0.339, 0.829]\nB: [0.376, 0.424, 0.446, 0.621]\nC: [0.359, 0.484, 0.43, 0.681]\nD: [0.477, 0.795, 0.566, 0.988]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: elephant in the distance moving to left", "context": "Select from the following choices.\nA: [0.032, 0.599, 0.339, 0.829]\nB: [0.376, 0.424, 0.446, 0.621]\nC: [0.359, 0.484, 0.43, 0.681]\nD: [0.477, 0.795, 0.566, 0.988]", "input_image_path": ["./Continuous-temporal/mevis/mevis_117_0.jpg", "./Continuous-temporal/mevis/mevis_117_1.jpg", "./Continuous-temporal/mevis/mevis_117_2.jpg", "./Continuous-temporal/mevis/mevis_117_3.jpg", "./Continuous-temporal/mevis/mevis_117_4.jpg", "./Continuous-temporal/mevis/mevis_117_5.jpg", "./Continuous-temporal/mevis/mevis_117_6.jpg", "./Continuous-temporal/mevis/mevis_117_7.jpg", "./Continuous-temporal/mevis/mevis_117_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.692, 0.905, 0.802, 0.999]\nB: [0.712, 0.881, 0.822, 0.976]\nC: [0.709, 0.906, 0.819, 1.0]\nD: [0.139, 0.554, 0.377, 0.619]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: black eating rabbit on the rightmost", "context": "Select from the following choices.\nA: [0.692, 0.905, 0.802, 0.999]\nB: [0.712, 0.881, 0.822, 0.976]\nC: [0.709, 0.906, 0.819, 1.0]\nD: [0.139, 0.554, 0.377, 0.619]", "input_image_path": ["./Continuous-temporal/mevis/mevis_118_0.jpg", "./Continuous-temporal/mevis/mevis_118_1.jpg", "./Continuous-temporal/mevis/mevis_118_2.jpg", "./Continuous-temporal/mevis/mevis_118_3.jpg", "./Continuous-temporal/mevis/mevis_118_4.jpg", "./Continuous-temporal/mevis/mevis_118_5.jpg", "./Continuous-temporal/mevis/mevis_118_6.jpg", "./Continuous-temporal/mevis/mevis_118_7.jpg", "./Continuous-temporal/mevis/mevis_118_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.024, 0.528, 0.244, 0.761]\nB: [0.589, 0.096, 0.719, 0.403]\nC: [0.108, 0.634, 0.328, 0.868]\nD: [0.631, 0.408, 0.876, 0.443]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: turtle swimming right", "context": "Select from the following choices.\nA: [0.024, 0.528, 0.244, 0.761]\nB: [0.589, 0.096, 0.719, 0.403]\nC: [0.108, 0.634, 0.328, 0.868]\nD: [0.631, 0.408, 0.876, 0.443]", "input_image_path": ["./Continuous-temporal/mevis/mevis_119_0.jpg", "./Continuous-temporal/mevis/mevis_119_1.jpg", "./Continuous-temporal/mevis/mevis_119_2.jpg", "./Continuous-temporal/mevis/mevis_119_3.jpg", "./Continuous-temporal/mevis/mevis_119_4.jpg", "./Continuous-temporal/mevis/mevis_119_5.jpg", "./Continuous-temporal/mevis/mevis_119_6.jpg", "./Continuous-temporal/mevis/mevis_119_7.jpg", "./Continuous-temporal/mevis/mevis_119_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.434, 0.53, 0.636, 0.854]\nB: [0.342, 0.586, 0.588, 0.948]\nC: [0.472, 0.373, 0.717, 0.735]\nD: [0.434, 0.53, 0.68, 0.892]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 362.\nCAPTION: Small cow walking to the front", "context": "Select from the following choices.\nA: [0.434, 0.53, 0.636, 0.854]\nB: [0.342, 0.586, 0.588, 0.948]\nC: [0.472, 0.373, 0.717, 0.735]\nD: [0.434, 0.53, 0.68, 0.892]", "input_image_path": ["./Continuous-temporal/mevis/mevis_120_0.jpg", "./Continuous-temporal/mevis/mevis_120_1.jpg", "./Continuous-temporal/mevis/mevis_120_2.jpg", "./Continuous-temporal/mevis/mevis_120_3.jpg", "./Continuous-temporal/mevis/mevis_120_4.jpg", "./Continuous-temporal/mevis/mevis_120_5.jpg", "./Continuous-temporal/mevis/mevis_120_6.jpg", "./Continuous-temporal/mevis/mevis_120_7.jpg", "./Continuous-temporal/mevis/mevis_120_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.0, 0.001, 0.165, 1.0]\nB: [0.61, 0.789, 0.838, 0.937]\nC: [0.03, 0.0, 0.195, 0.999]\nD: [0.0, 0.0, 0.165, 0.999]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 945.\nCAPTION: Person showing up in the final and extending arm forward", "context": "Select from the following choices.\nA: [0.0, 0.001, 0.165, 1.0]\nB: [0.61, 0.789, 0.838, 0.937]\nC: [0.03, 0.0, 0.195, 0.999]\nD: [0.0, 0.0, 0.165, 0.999]", "input_image_path": ["./Continuous-temporal/mevis/mevis_121_0.jpg", "./Continuous-temporal/mevis/mevis_121_1.jpg", "./Continuous-temporal/mevis/mevis_121_2.jpg", "./Continuous-temporal/mevis/mevis_121_3.jpg", "./Continuous-temporal/mevis/mevis_121_4.jpg", "./Continuous-temporal/mevis/mevis_121_5.jpg", "./Continuous-temporal/mevis/mevis_121_6.jpg", "./Continuous-temporal/mevis/mevis_121_7.jpg", "./Continuous-temporal/mevis/mevis_121_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.349, 0.14, 0.56, 0.604]\nB: [0.188, 0.331, 0.589, 0.633]\nC: [0.665, 0.365, 0.74, 0.694]\nD: [0.665, 0.365, 0.747, 0.71]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 480.\nCAPTION: bird standing still on bamboo pole", "context": "Select from the following choices.\nA: [0.349, 0.14, 0.56, 0.604]\nB: [0.188, 0.331, 0.589, 0.633]\nC: [0.665, 0.365, 0.74, 0.694]\nD: [0.665, 0.365, 0.747, 0.71]", "input_image_path": ["./Continuous-temporal/mevis/mevis_122_0.jpg", "./Continuous-temporal/mevis/mevis_122_1.jpg", "./Continuous-temporal/mevis/mevis_122_2.jpg", "./Continuous-temporal/mevis/mevis_122_3.jpg", "./Continuous-temporal/mevis/mevis_122_4.jpg", "./Continuous-temporal/mevis/mevis_122_5.jpg", "./Continuous-temporal/mevis/mevis_122_6.jpg", "./Continuous-temporal/mevis/mevis_122_7.jpg", "./Continuous-temporal/mevis/mevis_122_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.122, 0.308, 0.178, 0.597]\nB: [0.122, 0.308, 0.188, 0.552]\nC: [0.12, 0.26, 0.176, 0.549]\nD: [0.122, 0.308, 0.174, 0.604]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 2340 and the height is 1080.\nCAPTION: The bear on the left turning the hula hoop with its head", "context": "Select from the following choices.\nA: [0.122, 0.308, 0.178, 0.597]\nB: [0.122, 0.308, 0.188, 0.552]\nC: [0.12, 0.26, 0.176, 0.549]\nD: [0.122, 0.308, 0.174, 0.604]", "input_image_path": ["./Continuous-temporal/mevis/mevis_123_0.jpg", "./Continuous-temporal/mevis/mevis_123_1.jpg", "./Continuous-temporal/mevis/mevis_123_2.jpg", "./Continuous-temporal/mevis/mevis_123_3.jpg", "./Continuous-temporal/mevis/mevis_123_4.jpg", "./Continuous-temporal/mevis/mevis_123_5.jpg", "./Continuous-temporal/mevis/mevis_123_6.jpg", "./Continuous-temporal/mevis/mevis_123_7.jpg", "./Continuous-temporal/mevis/mevis_123_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.348, 0.444, 0.829, 0.892]\nB: [0.008, 0.375, 0.028, 0.477]\nC: [0.009, 0.29, 0.029, 0.392]\nD: [0.0, 0.329, 0.02, 0.431]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 480.\nCAPTION: The bird that is standing still on the wooden perch inside the cage on the left side.", "context": "Select from the following choices.\nA: [0.348, 0.444, 0.829, 0.892]\nB: [0.008, 0.375, 0.028, 0.477]\nC: [0.009, 0.29, 0.029, 0.392]\nD: [0.0, 0.329, 0.02, 0.431]", "input_image_path": ["./Continuous-temporal/mevis/mevis_124_0.jpg", "./Continuous-temporal/mevis/mevis_124_1.jpg", "./Continuous-temporal/mevis/mevis_124_2.jpg", "./Continuous-temporal/mevis/mevis_124_3.jpg", "./Continuous-temporal/mevis/mevis_124_4.jpg", "./Continuous-temporal/mevis/mevis_124_5.jpg", "./Continuous-temporal/mevis/mevis_124_6.jpg", "./Continuous-temporal/mevis/mevis_124_7.jpg", "./Continuous-temporal/mevis/mevis_124_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.435, 0.353, 0.534, 0.581]\nB: [0.385, 0.399, 0.498, 0.685]\nC: [0.435, 0.353, 0.548, 0.639]\nD: [0.435, 0.353, 0.545, 0.657]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: elephant walking ahead", "context": "Select from the following choices.\nA: [0.435, 0.353, 0.534, 0.581]\nB: [0.385, 0.399, 0.498, 0.685]\nC: [0.435, 0.353, 0.548, 0.639]\nD: [0.435, 0.353, 0.545, 0.657]", "input_image_path": ["./Continuous-temporal/mevis/mevis_125_0.jpg", "./Continuous-temporal/mevis/mevis_125_1.jpg", "./Continuous-temporal/mevis/mevis_125_2.jpg", "./Continuous-temporal/mevis/mevis_125_3.jpg", "./Continuous-temporal/mevis/mevis_125_4.jpg", "./Continuous-temporal/mevis/mevis_125_5.jpg", "./Continuous-temporal/mevis/mevis_125_6.jpg", "./Continuous-temporal/mevis/mevis_125_7.jpg", "./Continuous-temporal/mevis/mevis_125_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.706, 0.289, 0.802, 0.651]\nB: [0.716, 0.293, 0.812, 0.655]\nC: [0.675, 0.248, 0.771, 0.61]\nD: [0.716, 0.293, 0.802, 0.694]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: little girl riding a bicycle", "context": "Select from the following choices.\nA: [0.706, 0.289, 0.802, 0.651]\nB: [0.716, 0.293, 0.812, 0.655]\nC: [0.675, 0.248, 0.771, 0.61]\nD: [0.716, 0.293, 0.802, 0.694]", "input_image_path": ["./Continuous-temporal/mevis/mevis_126_0.jpg", "./Continuous-temporal/mevis/mevis_126_1.jpg", "./Continuous-temporal/mevis/mevis_126_2.jpg", "./Continuous-temporal/mevis/mevis_126_3.jpg", "./Continuous-temporal/mevis/mevis_126_4.jpg", "./Continuous-temporal/mevis/mevis_126_5.jpg", "./Continuous-temporal/mevis/mevis_126_6.jpg", "./Continuous-temporal/mevis/mevis_126_7.jpg", "./Continuous-temporal/mevis/mevis_126_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.139, 0.183, 0.458, 0.93]\nB: [0.421, 0.427, 0.828, 0.869]\nC: [0.139, 0.183, 0.51, 1.015]\nD: [0.19, 0.254, 0.509, 1.0]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: cat sitting on the leftmost without moving position", "context": "Select from the following choices.\nA: [0.139, 0.183, 0.458, 0.93]\nB: [0.421, 0.427, 0.828, 0.869]\nC: [0.139, 0.183, 0.51, 1.015]\nD: [0.19, 0.254, 0.509, 1.0]", "input_image_path": ["./Continuous-temporal/mevis/mevis_127_0.jpg", "./Continuous-temporal/mevis/mevis_127_1.jpg", "./Continuous-temporal/mevis/mevis_127_2.jpg", "./Continuous-temporal/mevis/mevis_127_3.jpg", "./Continuous-temporal/mevis/mevis_127_4.jpg", "./Continuous-temporal/mevis/mevis_127_5.jpg", "./Continuous-temporal/mevis/mevis_127_6.jpg", "./Continuous-temporal/mevis/mevis_127_7.jpg", "./Continuous-temporal/mevis/mevis_127_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.477, 0.429, 0.921, 0.702]\nB: [0.386, 0.437, 0.622, 0.575]\nC: [0.386, 0.437, 0.594, 0.569]\nD: [0.055, 0.35, 0.388, 0.759]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The first sheep that is eating food from the man's hand.", "context": "Select from the following choices.\nA: [0.477, 0.429, 0.921, 0.702]\nB: [0.386, 0.437, 0.622, 0.575]\nC: [0.386, 0.437, 0.594, 0.569]\nD: [0.055, 0.35, 0.388, 0.759]", "input_image_path": ["./Continuous-temporal/mevis/mevis_128_0.jpg", "./Continuous-temporal/mevis/mevis_128_1.jpg", "./Continuous-temporal/mevis/mevis_128_2.jpg", "./Continuous-temporal/mevis/mevis_128_3.jpg", "./Continuous-temporal/mevis/mevis_128_4.jpg", "./Continuous-temporal/mevis/mevis_128_5.jpg", "./Continuous-temporal/mevis/mevis_128_6.jpg", "./Continuous-temporal/mevis/mevis_128_7.jpg", "./Continuous-temporal/mevis/mevis_128_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.497, 0.478, 0.571, 0.551]\nB: [0.472, 0.45, 0.546, 0.524]\nC: [0.068, 0.333, 0.529, 0.676]\nD: [0.45, 0.46, 0.524, 0.533]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: black SUV moving directly forward across the road", "context": "Select from the following choices.\nA: [0.497, 0.478, 0.571, 0.551]\nB: [0.472, 0.45, 0.546, 0.524]\nC: [0.068, 0.333, 0.529, 0.676]\nD: [0.45, 0.46, 0.524, 0.533]", "input_image_path": ["./Continuous-temporal/mevis/mevis_129_0.jpg", "./Continuous-temporal/mevis/mevis_129_1.jpg", "./Continuous-temporal/mevis/mevis_129_2.jpg", "./Continuous-temporal/mevis/mevis_129_3.jpg", "./Continuous-temporal/mevis/mevis_129_4.jpg", "./Continuous-temporal/mevis/mevis_129_5.jpg", "./Continuous-temporal/mevis/mevis_129_6.jpg", "./Continuous-temporal/mevis/mevis_129_7.jpg", "./Continuous-temporal/mevis/mevis_129_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.692, 0.905, 0.802, 0.999]\nB: [0.692, 0.905, 0.806, 1.003]\nC: [0.482, 0.202, 0.765, 0.613]\nD: [0.692, 0.905, 0.793, 0.99]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: black rabbit eating", "context": "Select from the following choices.\nA: [0.692, 0.905, 0.802, 0.999]\nB: [0.692, 0.905, 0.806, 1.003]\nC: [0.482, 0.202, 0.765, 0.613]\nD: [0.692, 0.905, 0.793, 0.99]", "input_image_path": ["./Continuous-temporal/mevis/mevis_130_0.jpg", "./Continuous-temporal/mevis/mevis_130_1.jpg", "./Continuous-temporal/mevis/mevis_130_2.jpg", "./Continuous-temporal/mevis/mevis_130_3.jpg", "./Continuous-temporal/mevis/mevis_130_4.jpg", "./Continuous-temporal/mevis/mevis_130_5.jpg", "./Continuous-temporal/mevis/mevis_130_6.jpg", "./Continuous-temporal/mevis/mevis_130_7.jpg", "./Continuous-temporal/mevis/mevis_130_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.449, 0.012, 1.0, 0.501]\nB: [0.448, 0.0, 0.942, 0.505]\nC: [0.449, 0.0, 1.0, 0.489]\nD: [0.448, 0.0, 0.999, 0.489]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1440 and the height is 1080.\nCAPTION: puppy that overwhelms another puppy", "context": "Select from the following choices.\nA: [0.449, 0.012, 1.0, 0.501]\nB: [0.448, 0.0, 0.942, 0.505]\nC: [0.449, 0.0, 1.0, 0.489]\nD: [0.448, 0.0, 0.999, 0.489]", "input_image_path": ["./Continuous-temporal/mevis/mevis_131_0.jpg", "./Continuous-temporal/mevis/mevis_131_1.jpg", "./Continuous-temporal/mevis/mevis_131_2.jpg", "./Continuous-temporal/mevis/mevis_131_3.jpg", "./Continuous-temporal/mevis/mevis_131_4.jpg", "./Continuous-temporal/mevis/mevis_131_5.jpg", "./Continuous-temporal/mevis/mevis_131_6.jpg", "./Continuous-temporal/mevis/mevis_131_7.jpg", "./Continuous-temporal/mevis/mevis_131_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.152, 0.0, 0.999, 0.26]\nB: [0.152, 0.0, 0.941, 0.298]\nC: [0.0, 0.0, 0.848, 0.26]\nD: [0.152, 0.0, 1.129, 0.251]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1766 and the height is 945.\nCAPTION: Airplane moving forward", "context": "Select from the following choices.\nA: [0.152, 0.0, 0.999, 0.26]\nB: [0.152, 0.0, 0.941, 0.298]\nC: [0.0, 0.0, 0.848, 0.26]\nD: [0.152, 0.0, 1.129, 0.251]", "input_image_path": ["./Continuous-temporal/mevis/mevis_132_0.jpg", "./Continuous-temporal/mevis/mevis_132_1.jpg", "./Continuous-temporal/mevis/mevis_132_2.jpg", "./Continuous-temporal/mevis/mevis_132_3.jpg", "./Continuous-temporal/mevis/mevis_132_4.jpg", "./Continuous-temporal/mevis/mevis_132_5.jpg", "./Continuous-temporal/mevis/mevis_132_6.jpg", "./Continuous-temporal/mevis/mevis_132_7.jpg", "./Continuous-temporal/mevis/mevis_132_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.061, 0.222, 0.382, 0.782]\nB: [0.061, 0.222, 0.411, 0.733]\nC: [0.061, 0.222, 0.393, 0.89]\nD: [0.061, 0.222, 0.441, 0.809]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 528.\nCAPTION: lighter one of two fighting yaks", "context": "Select from the following choices.\nA: [0.061, 0.222, 0.382, 0.782]\nB: [0.061, 0.222, 0.411, 0.733]\nC: [0.061, 0.222, 0.393, 0.89]\nD: [0.061, 0.222, 0.441, 0.809]", "input_image_path": ["./Continuous-temporal/mevis/mevis_133_0.jpg", "./Continuous-temporal/mevis/mevis_133_1.jpg", "./Continuous-temporal/mevis/mevis_133_2.jpg", "./Continuous-temporal/mevis/mevis_133_3.jpg", "./Continuous-temporal/mevis/mevis_133_4.jpg", "./Continuous-temporal/mevis/mevis_133_5.jpg", "./Continuous-temporal/mevis/mevis_133_6.jpg", "./Continuous-temporal/mevis/mevis_133_7.jpg", "./Continuous-temporal/mevis/mevis_133_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.367, 0.479, 0.452, 0.642]\nB: [0.417, 0.868, 0.645, 0.895]\nC: [0.126, 0.444, 0.287, 1.0]\nD: [0.149, 0.442, 0.31, 0.999]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1567 and the height is 730.\nCAPTION: cow shaking head and looking us", "context": "Select from the following choices.\nA: [0.367, 0.479, 0.452, 0.642]\nB: [0.417, 0.868, 0.645, 0.895]\nC: [0.126, 0.444, 0.287, 1.0]\nD: [0.149, 0.442, 0.31, 0.999]", "input_image_path": ["./Continuous-temporal/mevis/mevis_134_0.jpg", "./Continuous-temporal/mevis/mevis_134_1.jpg", "./Continuous-temporal/mevis/mevis_134_2.jpg", "./Continuous-temporal/mevis/mevis_134_3.jpg", "./Continuous-temporal/mevis/mevis_134_4.jpg", "./Continuous-temporal/mevis/mevis_134_5.jpg", "./Continuous-temporal/mevis/mevis_134_6.jpg", "./Continuous-temporal/mevis/mevis_134_7.jpg", "./Continuous-temporal/mevis/mevis_134_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.473, 0.165, 0.517, 0.544]\nB: [0.311, 0.415, 0.404, 0.794]\nC: [0.35, 0.307, 0.443, 0.686]\nD: [0.36, 0.165, 0.453, 0.544]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: Man pushing bicycle around in a circle", "context": "Select from the following choices.\nA: [0.473, 0.165, 0.517, 0.544]\nB: [0.311, 0.415, 0.404, 0.794]\nC: [0.35, 0.307, 0.443, 0.686]\nD: [0.36, 0.165, 0.453, 0.544]", "input_image_path": ["./Continuous-temporal/mevis/mevis_135_0.jpg", "./Continuous-temporal/mevis/mevis_135_1.jpg", "./Continuous-temporal/mevis/mevis_135_2.jpg", "./Continuous-temporal/mevis/mevis_135_3.jpg", "./Continuous-temporal/mevis/mevis_135_4.jpg", "./Continuous-temporal/mevis/mevis_135_5.jpg", "./Continuous-temporal/mevis/mevis_135_6.jpg", "./Continuous-temporal/mevis/mevis_135_7.jpg", "./Continuous-temporal/mevis/mevis_135_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.587, 0.235, 0.684, 0.65]\nB: [0.129, 0.113, 0.32, 0.35]\nC: [0.447, 0.354, 0.567, 0.633]\nD: [0.447, 0.354, 0.592, 0.637]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: Elephant crushed by another elephant's trunk", "context": "Select from the following choices.\nA: [0.587, 0.235, 0.684, 0.65]\nB: [0.129, 0.113, 0.32, 0.35]\nC: [0.447, 0.354, 0.567, 0.633]\nD: [0.447, 0.354, 0.592, 0.637]", "input_image_path": ["./Continuous-temporal/mevis/mevis_136_0.jpg", "./Continuous-temporal/mevis/mevis_136_1.jpg", "./Continuous-temporal/mevis/mevis_136_2.jpg", "./Continuous-temporal/mevis/mevis_136_3.jpg", "./Continuous-temporal/mevis/mevis_136_4.jpg", "./Continuous-temporal/mevis/mevis_136_5.jpg", "./Continuous-temporal/mevis/mevis_136_6.jpg", "./Continuous-temporal/mevis/mevis_136_7.jpg", "./Continuous-temporal/mevis/mevis_136_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.487, 0.379, 0.941, 0.53]\nB: [0.323, 0.292, 0.682, 0.742]\nC: [0.295, 0.403, 0.71, 0.556]\nD: [0.295, 0.403, 0.748, 0.554]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 496.\nCAPTION: The model airplane with a faster moving speed.", "context": "Select from the following choices.\nA: [0.487, 0.379, 0.941, 0.53]\nB: [0.323, 0.292, 0.682, 0.742]\nC: [0.295, 0.403, 0.71, 0.556]\nD: [0.295, 0.403, 0.748, 0.554]", "input_image_path": ["./Continuous-temporal/mevis/mevis_137_0.jpg", "./Continuous-temporal/mevis/mevis_137_1.jpg", "./Continuous-temporal/mevis/mevis_137_2.jpg", "./Continuous-temporal/mevis/mevis_137_3.jpg", "./Continuous-temporal/mevis/mevis_137_4.jpg", "./Continuous-temporal/mevis/mevis_137_5.jpg", "./Continuous-temporal/mevis/mevis_137_6.jpg", "./Continuous-temporal/mevis/mevis_137_7.jpg", "./Continuous-temporal/mevis/mevis_137_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.435, 0.128, 0.633, 0.545]\nB: [0.435, 0.128, 0.597, 0.566]\nC: [0.234, 0.647, 0.36, 0.912]\nD: [0.065, 0.056, 0.176, 0.237]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: white dog play with the other dog", "context": "Select from the following choices.\nA: [0.435, 0.128, 0.633, 0.545]\nB: [0.435, 0.128, 0.597, 0.566]\nC: [0.234, 0.647, 0.36, 0.912]\nD: [0.065, 0.056, 0.176, 0.237]", "input_image_path": ["./Continuous-temporal/mevis/mevis_138_0.jpg", "./Continuous-temporal/mevis/mevis_138_1.jpg", "./Continuous-temporal/mevis/mevis_138_2.jpg", "./Continuous-temporal/mevis/mevis_138_3.jpg", "./Continuous-temporal/mevis/mevis_138_4.jpg", "./Continuous-temporal/mevis/mevis_138_5.jpg", "./Continuous-temporal/mevis/mevis_138_6.jpg", "./Continuous-temporal/mevis/mevis_138_7.jpg", "./Continuous-temporal/mevis/mevis_138_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.457, 0.498, 0.581, 0.721]\nB: [0.429, 0.533, 0.553, 0.756]\nC: [0.431, 0.441, 0.555, 0.664]\nD: [0.431, 0.441, 0.562, 0.669]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: parking bike", "context": "Select from the following choices.\nA: [0.457, 0.498, 0.581, 0.721]\nB: [0.429, 0.533, 0.553, 0.756]\nC: [0.431, 0.441, 0.555, 0.664]\nD: [0.431, 0.441, 0.562, 0.669]", "input_image_path": ["./Continuous-temporal/mevis/mevis_139_0.jpg", "./Continuous-temporal/mevis/mevis_139_1.jpg", "./Continuous-temporal/mevis/mevis_139_2.jpg", "./Continuous-temporal/mevis/mevis_139_3.jpg", "./Continuous-temporal/mevis/mevis_139_4.jpg", "./Continuous-temporal/mevis/mevis_139_5.jpg", "./Continuous-temporal/mevis/mevis_139_6.jpg", "./Continuous-temporal/mevis/mevis_139_7.jpg", "./Continuous-temporal/mevis/mevis_139_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.0, 0.105, 0.665, 0.999]\nB: [0.0, 0.105, 0.54, 1.054]\nC: [0.264, 0.106, 0.929, 1.0]\nD: [0.094, 0.106, 0.759, 1.0]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The horse whose head is being struck by the tail of another horse", "context": "Select from the following choices.\nA: [0.0, 0.105, 0.665, 0.999]\nB: [0.0, 0.105, 0.54, 1.054]\nC: [0.264, 0.106, 0.929, 1.0]\nD: [0.094, 0.106, 0.759, 1.0]", "input_image_path": ["./Continuous-temporal/mevis/mevis_140_0.jpg", "./Continuous-temporal/mevis/mevis_140_1.jpg", "./Continuous-temporal/mevis/mevis_140_2.jpg", "./Continuous-temporal/mevis/mevis_140_3.jpg", "./Continuous-temporal/mevis/mevis_140_4.jpg", "./Continuous-temporal/mevis/mevis_140_5.jpg", "./Continuous-temporal/mevis/mevis_140_6.jpg", "./Continuous-temporal/mevis/mevis_140_7.jpg", "./Continuous-temporal/mevis/mevis_140_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.263, 0.682, 0.502, 0.997]\nB: [0.246, 0.685, 0.486, 1.0]\nC: [0.263, 0.682, 0.533, 0.971]\nD: [0.423, 0.255, 0.673, 0.324]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 648.\nCAPTION: Bear chasing another bear around by walking in circle", "context": "Select from the following choices.\nA: [0.263, 0.682, 0.502, 0.997]\nB: [0.246, 0.685, 0.486, 1.0]\nC: [0.263, 0.682, 0.533, 0.971]\nD: [0.423, 0.255, 0.673, 0.324]", "input_image_path": ["./Continuous-temporal/mevis/mevis_141_0.jpg", "./Continuous-temporal/mevis/mevis_141_1.jpg", "./Continuous-temporal/mevis/mevis_141_2.jpg", "./Continuous-temporal/mevis/mevis_141_3.jpg", "./Continuous-temporal/mevis/mevis_141_4.jpg", "./Continuous-temporal/mevis/mevis_141_5.jpg", "./Continuous-temporal/mevis/mevis_141_6.jpg", "./Continuous-temporal/mevis/mevis_141_7.jpg", "./Continuous-temporal/mevis/mevis_141_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.356, 0.0, 0.485, 0.056]\nB: [0.516, 0.864, 0.689, 0.976]\nC: [0.433, 0.097, 0.614, 0.318]\nD: [0.008, 0.274, 0.424, 0.511]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 720.\nCAPTION: The person standing beside, observing the fight between the two dogs.", "context": "Select from the following choices.\nA: [0.356, 0.0, 0.485, 0.056]\nB: [0.516, 0.864, 0.689, 0.976]\nC: [0.433, 0.097, 0.614, 0.318]\nD: [0.008, 0.274, 0.424, 0.511]", "input_image_path": ["./Continuous-temporal/mevis/mevis_142_0.jpg", "./Continuous-temporal/mevis/mevis_142_1.jpg", "./Continuous-temporal/mevis/mevis_142_2.jpg", "./Continuous-temporal/mevis/mevis_142_3.jpg", "./Continuous-temporal/mevis/mevis_142_4.jpg", "./Continuous-temporal/mevis/mevis_142_5.jpg", "./Continuous-temporal/mevis/mevis_142_6.jpg", "./Continuous-temporal/mevis/mevis_142_7.jpg", "./Continuous-temporal/mevis/mevis_142_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.067, 0.447, 0.347, 0.492]\nB: [0.107, 0.317, 0.526, 0.785]\nC: [0.107, 0.317, 0.539, 0.871]\nD: [0.285, 0.191, 0.769, 0.504]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: Rabbit moving around and eating leaves", "context": "Select from the following choices.\nA: [0.067, 0.447, 0.347, 0.492]\nB: [0.107, 0.317, 0.526, 0.785]\nC: [0.107, 0.317, 0.539, 0.871]\nD: [0.285, 0.191, 0.769, 0.504]", "input_image_path": ["./Continuous-temporal/mevis/mevis_143_0.jpg", "./Continuous-temporal/mevis/mevis_143_1.jpg", "./Continuous-temporal/mevis/mevis_143_2.jpg", "./Continuous-temporal/mevis/mevis_143_3.jpg", "./Continuous-temporal/mevis/mevis_143_4.jpg", "./Continuous-temporal/mevis/mevis_143_5.jpg", "./Continuous-temporal/mevis/mevis_143_6.jpg", "./Continuous-temporal/mevis/mevis_143_7.jpg", "./Continuous-temporal/mevis/mevis_143_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.315, 0.147, 1.082, 0.965]\nB: [0.315, 0.147, 1.085, 0.92]\nC: [0.315, 0.147, 1.037, 1.001]\nD: [0.315, 0.147, 0.969, 0.947]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The horse, initially facing right, then turns to face left", "context": "Select from the following choices.\nA: [0.315, 0.147, 1.082, 0.965]\nB: [0.315, 0.147, 1.085, 0.92]\nC: [0.315, 0.147, 1.037, 1.001]\nD: [0.315, 0.147, 0.969, 0.947]", "input_image_path": ["./Continuous-temporal/mevis/mevis_144_0.jpg", "./Continuous-temporal/mevis/mevis_144_1.jpg", "./Continuous-temporal/mevis/mevis_144_2.jpg", "./Continuous-temporal/mevis/mevis_144_3.jpg", "./Continuous-temporal/mevis/mevis_144_4.jpg", "./Continuous-temporal/mevis/mevis_144_5.jpg", "./Continuous-temporal/mevis/mevis_144_6.jpg", "./Continuous-temporal/mevis/mevis_144_7.jpg", "./Continuous-temporal/mevis/mevis_144_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.872, 0.209, 0.98, 0.326]\nB: [0.852, 0.23, 0.959, 0.346]\nC: [0.852, 0.23, 0.947, 0.324]\nD: [0.807, 0.175, 0.914, 0.292]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: parking white car", "context": "Select from the following choices.\nA: [0.872, 0.209, 0.98, 0.326]\nB: [0.852, 0.23, 0.959, 0.346]\nC: [0.852, 0.23, 0.947, 0.324]\nD: [0.807, 0.175, 0.914, 0.292]", "input_image_path": ["./Continuous-temporal/mevis/mevis_145_0.jpg", "./Continuous-temporal/mevis/mevis_145_1.jpg", "./Continuous-temporal/mevis/mevis_145_2.jpg", "./Continuous-temporal/mevis/mevis_145_3.jpg", "./Continuous-temporal/mevis/mevis_145_4.jpg", "./Continuous-temporal/mevis/mevis_145_5.jpg", "./Continuous-temporal/mevis/mevis_145_6.jpg", "./Continuous-temporal/mevis/mevis_145_7.jpg", "./Continuous-temporal/mevis/mevis_145_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.003, 0.086, 0.25, 0.3]\nB: [0.085, 0.061, 0.332, 0.275]\nC: [0.096, 0.09, 0.343, 0.304]\nD: [0.003, 0.086, 0.254, 0.325]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The bird that arrived at the bottom of the cage first.", "context": "Select from the following choices.\nA: [0.003, 0.086, 0.25, 0.3]\nB: [0.085, 0.061, 0.332, 0.275]\nC: [0.096, 0.09, 0.343, 0.304]\nD: [0.003, 0.086, 0.254, 0.325]", "input_image_path": ["./Continuous-temporal/mevis/mevis_146_0.jpg", "./Continuous-temporal/mevis/mevis_146_1.jpg", "./Continuous-temporal/mevis/mevis_146_2.jpg", "./Continuous-temporal/mevis/mevis_146_3.jpg", "./Continuous-temporal/mevis/mevis_146_4.jpg", "./Continuous-temporal/mevis/mevis_146_5.jpg", "./Continuous-temporal/mevis/mevis_146_6.jpg", "./Continuous-temporal/mevis/mevis_146_7.jpg", "./Continuous-temporal/mevis/mevis_146_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.584, 0.354, 0.932, 0.634]\nB: [0.364, 0.439, 0.436, 0.622]\nC: [0.34, 0.475, 0.412, 0.658]\nD: [0.364, 0.439, 0.432, 0.608]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The elephant walking to the left.", "context": "Select from the following choices.\nA: [0.584, 0.354, 0.932, 0.634]\nB: [0.364, 0.439, 0.436, 0.622]\nC: [0.34, 0.475, 0.412, 0.658]\nD: [0.364, 0.439, 0.432, 0.608]", "input_image_path": ["./Continuous-temporal/mevis/mevis_147_0.jpg", "./Continuous-temporal/mevis/mevis_147_1.jpg", "./Continuous-temporal/mevis/mevis_147_2.jpg", "./Continuous-temporal/mevis/mevis_147_3.jpg", "./Continuous-temporal/mevis/mevis_147_4.jpg", "./Continuous-temporal/mevis/mevis_147_5.jpg", "./Continuous-temporal/mevis/mevis_147_6.jpg", "./Continuous-temporal/mevis/mevis_147_7.jpg", "./Continuous-temporal/mevis/mevis_147_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.31, 0.426, 0.433, 0.642]\nB: [0.31, 0.426, 0.423, 0.608]\nC: [0.31, 0.426, 0.45, 0.673]\nD: [0.309, 0.434, 0.432, 0.649]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: Bicycle being pushed around in a circle", "context": "Select from the following choices.\nA: [0.31, 0.426, 0.433, 0.642]\nB: [0.31, 0.426, 0.423, 0.608]\nC: [0.31, 0.426, 0.45, 0.673]\nD: [0.309, 0.434, 0.432, 0.649]", "input_image_path": ["./Continuous-temporal/mevis/mevis_148_0.jpg", "./Continuous-temporal/mevis/mevis_148_1.jpg", "./Continuous-temporal/mevis/mevis_148_2.jpg", "./Continuous-temporal/mevis/mevis_148_3.jpg", "./Continuous-temporal/mevis/mevis_148_4.jpg", "./Continuous-temporal/mevis/mevis_148_5.jpg", "./Continuous-temporal/mevis/mevis_148_6.jpg", "./Continuous-temporal/mevis/mevis_148_7.jpg", "./Continuous-temporal/mevis/mevis_148_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.321, 0.44, 0.514, 0.715]\nB: [0.599, 0.537, 0.865, 0.898]\nC: [0.248, 0.394, 0.442, 0.669]\nD: [0.341, 0.617, 0.824, 0.998]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 636 and the height is 480.\nCAPTION: The tiger that came to drink water last.", "context": "Select from the following choices.\nA: [0.321, 0.44, 0.514, 0.715]\nB: [0.599, 0.537, 0.865, 0.898]\nC: [0.248, 0.394, 0.442, 0.669]\nD: [0.341, 0.617, 0.824, 0.998]", "input_image_path": ["./Continuous-temporal/mevis/mevis_149_0.jpg", "./Continuous-temporal/mevis/mevis_149_1.jpg", "./Continuous-temporal/mevis/mevis_149_2.jpg", "./Continuous-temporal/mevis/mevis_149_3.jpg", "./Continuous-temporal/mevis/mevis_149_4.jpg", "./Continuous-temporal/mevis/mevis_149_5.jpg", "./Continuous-temporal/mevis/mevis_149_6.jpg", "./Continuous-temporal/mevis/mevis_149_7.jpg", "./Continuous-temporal/mevis/mevis_149_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.317, 0.473, 0.775, 0.875]\nB: [0.317, 0.473, 0.744, 0.815]\nC: [0.453, 0.381, 0.88, 0.723]\nD: [0.317, 0.473, 0.662, 0.806]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 480.\nCAPTION: The lizard that first ate the food.", "context": "Select from the following choices.\nA: [0.317, 0.473, 0.775, 0.875]\nB: [0.317, 0.473, 0.744, 0.815]\nC: [0.453, 0.381, 0.88, 0.723]\nD: [0.317, 0.473, 0.662, 0.806]", "input_image_path": ["./Continuous-temporal/mevis/mevis_150_0.jpg", "./Continuous-temporal/mevis/mevis_150_1.jpg", "./Continuous-temporal/mevis/mevis_150_2.jpg", "./Continuous-temporal/mevis/mevis_150_3.jpg", "./Continuous-temporal/mevis/mevis_150_4.jpg", "./Continuous-temporal/mevis/mevis_150_5.jpg", "./Continuous-temporal/mevis/mevis_150_6.jpg", "./Continuous-temporal/mevis/mevis_150_7.jpg", "./Continuous-temporal/mevis/mevis_150_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.0, 0.0, 0.708, 0.978]\nB: [0.291, 0.0, 0.999, 0.978]\nC: [0.265, 0.0, 0.973, 0.978]\nD: [0.291, 0.0, 0.882, 1.101]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Turtle biting at the ground then looking up", "context": "Select from the following choices.\nA: [0.0, 0.0, 0.708, 0.978]\nB: [0.291, 0.0, 0.999, 0.978]\nC: [0.265, 0.0, 0.973, 0.978]\nD: [0.291, 0.0, 0.882, 1.101]", "input_image_path": ["./Continuous-temporal/mevis/mevis_151_0.jpg", "./Continuous-temporal/mevis/mevis_151_1.jpg", "./Continuous-temporal/mevis/mevis_151_2.jpg", "./Continuous-temporal/mevis/mevis_151_3.jpg", "./Continuous-temporal/mevis/mevis_151_4.jpg", "./Continuous-temporal/mevis/mevis_151_5.jpg", "./Continuous-temporal/mevis/mevis_151_6.jpg", "./Continuous-temporal/mevis/mevis_151_7.jpg", "./Continuous-temporal/mevis/mevis_151_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.178, 0.354, 0.671, 0.415]\nB: [0.529, 0.094, 0.652, 0.226]\nC: [0.52, 0.228, 0.9, 0.739]\nD: [0.376, 0.194, 0.494, 0.64]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: The red truck in motion.", "context": "Select from the following choices.\nA: [0.178, 0.354, 0.671, 0.415]\nB: [0.529, 0.094, 0.652, 0.226]\nC: [0.52, 0.228, 0.9, 0.739]\nD: [0.376, 0.194, 0.494, 0.64]", "input_image_path": ["./Continuous-temporal/mevis/mevis_152_0.jpg", "./Continuous-temporal/mevis/mevis_152_1.jpg", "./Continuous-temporal/mevis/mevis_152_2.jpg", "./Continuous-temporal/mevis/mevis_152_3.jpg", "./Continuous-temporal/mevis/mevis_152_4.jpg", "./Continuous-temporal/mevis/mevis_152_5.jpg", "./Continuous-temporal/mevis/mevis_152_6.jpg", "./Continuous-temporal/mevis/mevis_152_7.jpg", "./Continuous-temporal/mevis/mevis_152_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.351, 0.528, 0.504, 0.777]\nB: [0.321, 0.242, 0.374, 0.512]\nC: [0.195, 0.202, 0.392, 0.886]\nD: [0.215, 0.316, 0.411, 1.0]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Cat walking forward", "context": "Select from the following choices.\nA: [0.351, 0.528, 0.504, 0.777]\nB: [0.321, 0.242, 0.374, 0.512]\nC: [0.195, 0.202, 0.392, 0.886]\nD: [0.215, 0.316, 0.411, 1.0]", "input_image_path": ["./Continuous-temporal/mevis/mevis_153_0.jpg", "./Continuous-temporal/mevis/mevis_153_1.jpg", "./Continuous-temporal/mevis/mevis_153_2.jpg", "./Continuous-temporal/mevis/mevis_153_3.jpg", "./Continuous-temporal/mevis/mevis_153_4.jpg", "./Continuous-temporal/mevis/mevis_153_5.jpg", "./Continuous-temporal/mevis/mevis_153_6.jpg", "./Continuous-temporal/mevis/mevis_153_7.jpg", "./Continuous-temporal/mevis/mevis_153_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.709, 0.471, 0.932, 0.591]\nB: [0.227, 0.343, 0.601, 0.803]\nC: [0.194, 0.624, 0.248, 0.951]\nD: [0.709, 0.471, 0.955, 0.591]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 604 and the height is 1280.\nCAPTION: Puppy running around ", "context": "Select from the following choices.\nA: [0.709, 0.471, 0.932, 0.591]\nB: [0.227, 0.343, 0.601, 0.803]\nC: [0.194, 0.624, 0.248, 0.951]\nD: [0.709, 0.471, 0.955, 0.591]", "input_image_path": ["./Continuous-temporal/mevis/mevis_154_0.jpg", "./Continuous-temporal/mevis/mevis_154_1.jpg", "./Continuous-temporal/mevis/mevis_154_2.jpg", "./Continuous-temporal/mevis/mevis_154_3.jpg", "./Continuous-temporal/mevis/mevis_154_4.jpg", "./Continuous-temporal/mevis/mevis_154_5.jpg", "./Continuous-temporal/mevis/mevis_154_6.jpg", "./Continuous-temporal/mevis/mevis_154_7.jpg", "./Continuous-temporal/mevis/mevis_154_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.374, 0.222, 0.81, 0.677]\nB: [0.374, 0.222, 0.764, 0.631]\nC: [0.464, 0.105, 0.854, 0.513]\nD: [0.374, 0.222, 0.692, 0.671]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1440 and the height is 1080.\nCAPTION: Puppy lying on the ground with its belly leaking", "context": "Select from the following choices.\nA: [0.374, 0.222, 0.81, 0.677]\nB: [0.374, 0.222, 0.764, 0.631]\nC: [0.464, 0.105, 0.854, 0.513]\nD: [0.374, 0.222, 0.692, 0.671]", "input_image_path": ["./Continuous-temporal/mevis/mevis_155_0.jpg", "./Continuous-temporal/mevis/mevis_155_1.jpg", "./Continuous-temporal/mevis/mevis_155_2.jpg", "./Continuous-temporal/mevis/mevis_155_3.jpg", "./Continuous-temporal/mevis/mevis_155_4.jpg", "./Continuous-temporal/mevis/mevis_155_5.jpg", "./Continuous-temporal/mevis/mevis_155_6.jpg", "./Continuous-temporal/mevis/mevis_155_7.jpg", "./Continuous-temporal/mevis/mevis_155_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.544, 0.351, 0.595, 0.461]\nB: [0.544, 0.351, 0.588, 0.456]\nC: [0.562, 0.326, 0.613, 0.436]\nD: [0.523, 0.379, 0.573, 0.489]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: turtle is moved by monkeys on stone", "context": "Select from the following choices.\nA: [0.544, 0.351, 0.595, 0.461]\nB: [0.544, 0.351, 0.588, 0.456]\nC: [0.562, 0.326, 0.613, 0.436]\nD: [0.523, 0.379, 0.573, 0.489]", "input_image_path": ["./Continuous-temporal/mevis/mevis_156_0.jpg", "./Continuous-temporal/mevis/mevis_156_1.jpg", "./Continuous-temporal/mevis/mevis_156_2.jpg", "./Continuous-temporal/mevis/mevis_156_3.jpg", "./Continuous-temporal/mevis/mevis_156_4.jpg", "./Continuous-temporal/mevis/mevis_156_5.jpg", "./Continuous-temporal/mevis/mevis_156_6.jpg", "./Continuous-temporal/mevis/mevis_156_7.jpg", "./Continuous-temporal/mevis/mevis_156_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.199, 0.282, 0.295, 0.529]\nB: [0.392, 0.079, 0.849, 0.268]\nC: [0.199, 0.282, 0.276, 0.526]\nD: [0.044, 0.596, 0.499, 0.908]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The bird that has not reached the bottom of the cage yet.", "context": "Select from the following choices.\nA: [0.199, 0.282, 0.295, 0.529]\nB: [0.392, 0.079, 0.849, 0.268]\nC: [0.199, 0.282, 0.276, 0.526]\nD: [0.044, 0.596, 0.499, 0.908]", "input_image_path": ["./Continuous-temporal/mevis/mevis_157_0.jpg", "./Continuous-temporal/mevis/mevis_157_1.jpg", "./Continuous-temporal/mevis/mevis_157_2.jpg", "./Continuous-temporal/mevis/mevis_157_3.jpg", "./Continuous-temporal/mevis/mevis_157_4.jpg", "./Continuous-temporal/mevis/mevis_157_5.jpg", "./Continuous-temporal/mevis/mevis_157_6.jpg", "./Continuous-temporal/mevis/mevis_157_7.jpg", "./Continuous-temporal/mevis/mevis_157_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.209, 0.29, 0.602, 0.59]\nB: [0.449, 0.438, 0.581, 0.682]\nC: [0.699, 0.158, 0.923, 0.535]\nD: [0.449, 0.438, 0.566, 0.653]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: Stationary bicycle", "context": "Select from the following choices.\nA: [0.209, 0.29, 0.602, 0.59]\nB: [0.449, 0.438, 0.581, 0.682]\nC: [0.699, 0.158, 0.923, 0.535]\nD: [0.449, 0.438, 0.566, 0.653]", "input_image_path": ["./Continuous-temporal/mevis/mevis_158_0.jpg", "./Continuous-temporal/mevis/mevis_158_1.jpg", "./Continuous-temporal/mevis/mevis_158_2.jpg", "./Continuous-temporal/mevis/mevis_158_3.jpg", "./Continuous-temporal/mevis/mevis_158_4.jpg", "./Continuous-temporal/mevis/mevis_158_5.jpg", "./Continuous-temporal/mevis/mevis_158_6.jpg", "./Continuous-temporal/mevis/mevis_158_7.jpg", "./Continuous-temporal/mevis/mevis_158_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.466, 0.0, 0.511, 0.153]\nB: [0.466, 0.0, 0.506, 0.154]\nC: [0.466, 0.0, 0.505, 0.136]\nD: [0.73, 0.279, 0.812, 0.482]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: the left one of the two sitting people in the distance", "context": "Select from the following choices.\nA: [0.466, 0.0, 0.511, 0.153]\nB: [0.466, 0.0, 0.506, 0.154]\nC: [0.466, 0.0, 0.505, 0.136]\nD: [0.73, 0.279, 0.812, 0.482]", "input_image_path": ["./Continuous-temporal/mevis/mevis_159_0.jpg", "./Continuous-temporal/mevis/mevis_159_1.jpg", "./Continuous-temporal/mevis/mevis_159_2.jpg", "./Continuous-temporal/mevis/mevis_159_3.jpg", "./Continuous-temporal/mevis/mevis_159_4.jpg", "./Continuous-temporal/mevis/mevis_159_5.jpg", "./Continuous-temporal/mevis/mevis_159_6.jpg", "./Continuous-temporal/mevis/mevis_159_7.jpg", "./Continuous-temporal/mevis/mevis_159_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.33, 0.105, 0.536, 0.932]\nB: [0.385, 0.088, 0.592, 0.916]\nC: [0.33, 0.105, 0.545, 0.894]\nD: [0.017, 0.121, 0.509, 0.611]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: cat with a yellow ring around its neck", "context": "Select from the following choices.\nA: [0.33, 0.105, 0.536, 0.932]\nB: [0.385, 0.088, 0.592, 0.916]\nC: [0.33, 0.105, 0.545, 0.894]\nD: [0.017, 0.121, 0.509, 0.611]", "input_image_path": ["./Continuous-temporal/mevis/mevis_160_0.jpg", "./Continuous-temporal/mevis/mevis_160_1.jpg", "./Continuous-temporal/mevis/mevis_160_2.jpg", "./Continuous-temporal/mevis/mevis_160_3.jpg", "./Continuous-temporal/mevis/mevis_160_4.jpg", "./Continuous-temporal/mevis/mevis_160_5.jpg", "./Continuous-temporal/mevis/mevis_160_6.jpg", "./Continuous-temporal/mevis/mevis_160_7.jpg", "./Continuous-temporal/mevis/mevis_160_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.221, 0.155, 0.509, 0.54]\nB: [0.221, 0.155, 0.524, 0.703]\nC: [0.107, 0.125, 0.37, 0.606]\nD: [0.221, 0.155, 0.483, 0.636]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 528.\nCAPTION: darker one of two fighting yaks", "context": "Select from the following choices.\nA: [0.221, 0.155, 0.509, 0.54]\nB: [0.221, 0.155, 0.524, 0.703]\nC: [0.107, 0.125, 0.37, 0.606]\nD: [0.221, 0.155, 0.483, 0.636]", "input_image_path": ["./Continuous-temporal/mevis/mevis_161_0.jpg", "./Continuous-temporal/mevis/mevis_161_1.jpg", "./Continuous-temporal/mevis/mevis_161_2.jpg", "./Continuous-temporal/mevis/mevis_161_3.jpg", "./Continuous-temporal/mevis/mevis_161_4.jpg", "./Continuous-temporal/mevis/mevis_161_5.jpg", "./Continuous-temporal/mevis/mevis_161_6.jpg", "./Continuous-temporal/mevis/mevis_161_7.jpg", "./Continuous-temporal/mevis/mevis_161_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.031, 0.234, 0.411, 0.404]\nB: [0.847, 0.229, 0.953, 0.344]\nC: [0.842, 0.222, 0.947, 0.337]\nD: [0.826, 0.244, 0.932, 0.359]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: parking black car", "context": "Select from the following choices.\nA: [0.031, 0.234, 0.411, 0.404]\nB: [0.847, 0.229, 0.953, 0.344]\nC: [0.842, 0.222, 0.947, 0.337]\nD: [0.826, 0.244, 0.932, 0.359]", "input_image_path": ["./Continuous-temporal/mevis/mevis_162_0.jpg", "./Continuous-temporal/mevis/mevis_162_1.jpg", "./Continuous-temporal/mevis/mevis_162_2.jpg", "./Continuous-temporal/mevis/mevis_162_3.jpg", "./Continuous-temporal/mevis/mevis_162_4.jpg", "./Continuous-temporal/mevis/mevis_162_5.jpg", "./Continuous-temporal/mevis/mevis_162_6.jpg", "./Continuous-temporal/mevis/mevis_162_7.jpg", "./Continuous-temporal/mevis/mevis_162_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.164, 0.276, 0.317, 0.447]\nB: [0.164, 0.276, 0.3, 0.456]\nC: [0.412, 0.111, 0.527, 0.517]\nD: [0.164, 0.276, 0.277, 0.46]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: Monkey running out of a hole to the left then right", "context": "Select from the following choices.\nA: [0.164, 0.276, 0.317, 0.447]\nB: [0.164, 0.276, 0.3, 0.456]\nC: [0.412, 0.111, 0.527, 0.517]\nD: [0.164, 0.276, 0.277, 0.46]", "input_image_path": ["./Continuous-temporal/mevis/mevis_163_0.jpg", "./Continuous-temporal/mevis/mevis_163_1.jpg", "./Continuous-temporal/mevis/mevis_163_2.jpg", "./Continuous-temporal/mevis/mevis_163_3.jpg", "./Continuous-temporal/mevis/mevis_163_4.jpg", "./Continuous-temporal/mevis/mevis_163_5.jpg", "./Continuous-temporal/mevis/mevis_163_6.jpg", "./Continuous-temporal/mevis/mevis_163_7.jpg", "./Continuous-temporal/mevis/mevis_163_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.172, 0.283, 0.258, 0.559]\nB: [0.548, 0.524, 0.785, 0.948]\nC: [0.172, 0.283, 0.271, 0.557]\nD: [0.131, 0.323, 0.23, 0.597]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: The bird that hasn't touched the cage floor so far.", "context": "Select from the following choices.\nA: [0.172, 0.283, 0.258, 0.559]\nB: [0.548, 0.524, 0.785, 0.948]\nC: [0.172, 0.283, 0.271, 0.557]\nD: [0.131, 0.323, 0.23, 0.597]", "input_image_path": ["./Continuous-temporal/mevis/mevis_164_0.jpg", "./Continuous-temporal/mevis/mevis_164_1.jpg", "./Continuous-temporal/mevis/mevis_164_2.jpg", "./Continuous-temporal/mevis/mevis_164_3.jpg", "./Continuous-temporal/mevis/mevis_164_4.jpg", "./Continuous-temporal/mevis/mevis_164_5.jpg", "./Continuous-temporal/mevis/mevis_164_6.jpg", "./Continuous-temporal/mevis/mevis_164_7.jpg", "./Continuous-temporal/mevis/mevis_164_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.214, 0.404, 0.521, 0.765]\nB: [0.322, 0.46, 0.629, 0.821]\nC: [0.257, 0.354, 0.564, 0.715]\nD: [0.3, 0.597, 0.607, 0.958]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: The elephant being assaulted and harassed by its companions.", "context": "Select from the following choices.\nA: [0.214, 0.404, 0.521, 0.765]\nB: [0.322, 0.46, 0.629, 0.821]\nC: [0.257, 0.354, 0.564, 0.715]\nD: [0.3, 0.597, 0.607, 0.958]", "input_image_path": ["./Continuous-temporal/mevis/mevis_165_0.jpg", "./Continuous-temporal/mevis/mevis_165_1.jpg", "./Continuous-temporal/mevis/mevis_165_2.jpg", "./Continuous-temporal/mevis/mevis_165_3.jpg", "./Continuous-temporal/mevis/mevis_165_4.jpg", "./Continuous-temporal/mevis/mevis_165_5.jpg", "./Continuous-temporal/mevis/mevis_165_6.jpg", "./Continuous-temporal/mevis/mevis_165_7.jpg", "./Continuous-temporal/mevis/mevis_165_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.173, 0.303, 0.32, 0.724]\nB: [0.18, 0.397, 0.327, 0.819]\nC: [0.276, 0.579, 0.423, 1.0]\nD: [0.203, 0.44, 0.349, 0.861]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Man squatting down and opening his palm", "context": "Select from the following choices.\nA: [0.173, 0.303, 0.32, 0.724]\nB: [0.18, 0.397, 0.327, 0.819]\nC: [0.276, 0.579, 0.423, 1.0]\nD: [0.203, 0.44, 0.349, 0.861]", "input_image_path": ["./Continuous-temporal/mevis/mevis_166_0.jpg", "./Continuous-temporal/mevis/mevis_166_1.jpg", "./Continuous-temporal/mevis/mevis_166_2.jpg", "./Continuous-temporal/mevis/mevis_166_3.jpg", "./Continuous-temporal/mevis/mevis_166_4.jpg", "./Continuous-temporal/mevis/mevis_166_5.jpg", "./Continuous-temporal/mevis/mevis_166_6.jpg", "./Continuous-temporal/mevis/mevis_166_7.jpg", "./Continuous-temporal/mevis/mevis_166_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.185, 0.433, 0.398, 0.844]\nB: [0.192, 0.389, 0.377, 0.817]\nC: [0.185, 0.433, 0.369, 0.861]\nD: [0.278, 0.1, 0.616, 0.4]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: person feeding sheeps", "context": "Select from the following choices.\nA: [0.185, 0.433, 0.398, 0.844]\nB: [0.192, 0.389, 0.377, 0.817]\nC: [0.185, 0.433, 0.369, 0.861]\nD: [0.278, 0.1, 0.616, 0.4]", "input_image_path": ["./Continuous-temporal/mevis/mevis_167_0.jpg", "./Continuous-temporal/mevis/mevis_167_1.jpg", "./Continuous-temporal/mevis/mevis_167_2.jpg", "./Continuous-temporal/mevis/mevis_167_3.jpg", "./Continuous-temporal/mevis/mevis_167_4.jpg", "./Continuous-temporal/mevis/mevis_167_5.jpg", "./Continuous-temporal/mevis/mevis_167_6.jpg", "./Continuous-temporal/mevis/mevis_167_7.jpg", "./Continuous-temporal/mevis/mevis_167_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.182, 0.029, 0.661, 0.508]\nB: [0.184, 0.168, 0.605, 0.803]\nC: [0.216, 0.331, 0.577, 0.948]\nD: [0.216, 0.331, 0.636, 0.967]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: cat chasing cat teaser", "context": "Select from the following choices.\nA: [0.182, 0.029, 0.661, 0.508]\nB: [0.184, 0.168, 0.605, 0.803]\nC: [0.216, 0.331, 0.577, 0.948]\nD: [0.216, 0.331, 0.636, 0.967]", "input_image_path": ["./Continuous-temporal/mevis/mevis_168_0.jpg", "./Continuous-temporal/mevis/mevis_168_1.jpg", "./Continuous-temporal/mevis/mevis_168_2.jpg", "./Continuous-temporal/mevis/mevis_168_3.jpg", "./Continuous-temporal/mevis/mevis_168_4.jpg", "./Continuous-temporal/mevis/mevis_168_5.jpg", "./Continuous-temporal/mevis/mevis_168_6.jpg", "./Continuous-temporal/mevis/mevis_168_7.jpg", "./Continuous-temporal/mevis/mevis_168_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.022, 0.206, 0.13, 0.376]\nB: [0.02, 0.272, 0.128, 0.443]\nC: [0.0, 0.228, 0.108, 0.399]\nD: [0.38, 0.124, 0.665, 0.614]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: monkey walking in and out of the gate", "context": "Select from the following choices.\nA: [0.022, 0.206, 0.13, 0.376]\nB: [0.02, 0.272, 0.128, 0.443]\nC: [0.0, 0.228, 0.108, 0.399]\nD: [0.38, 0.124, 0.665, 0.614]", "input_image_path": ["./Continuous-temporal/mevis/mevis_169_0.jpg", "./Continuous-temporal/mevis/mevis_169_1.jpg", "./Continuous-temporal/mevis/mevis_169_2.jpg", "./Continuous-temporal/mevis/mevis_169_3.jpg", "./Continuous-temporal/mevis/mevis_169_4.jpg", "./Continuous-temporal/mevis/mevis_169_5.jpg", "./Continuous-temporal/mevis/mevis_169_6.jpg", "./Continuous-temporal/mevis/mevis_169_7.jpg", "./Continuous-temporal/mevis/mevis_169_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.591, 0.321, 0.819, 0.879]\nB: [0.591, 0.321, 0.816, 0.849]\nC: [0.591, 0.321, 0.797, 0.938]\nD: [0.591, 0.321, 0.828, 0.908]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: Monkey moving around and jumping to the left", "context": "Select from the following choices.\nA: [0.591, 0.321, 0.819, 0.879]\nB: [0.591, 0.321, 0.816, 0.849]\nC: [0.591, 0.321, 0.797, 0.938]\nD: [0.591, 0.321, 0.828, 0.908]", "input_image_path": ["./Continuous-temporal/mevis/mevis_170_0.jpg", "./Continuous-temporal/mevis/mevis_170_1.jpg", "./Continuous-temporal/mevis/mevis_170_2.jpg", "./Continuous-temporal/mevis/mevis_170_3.jpg", "./Continuous-temporal/mevis/mevis_170_4.jpg", "./Continuous-temporal/mevis/mevis_170_5.jpg", "./Continuous-temporal/mevis/mevis_170_6.jpg", "./Continuous-temporal/mevis/mevis_170_7.jpg", "./Continuous-temporal/mevis/mevis_170_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.824, 0.389, 0.91, 0.531]\nB: [0.447, 0.617, 0.93, 0.662]\nC: [0.85, 0.361, 0.936, 0.503]\nD: [0.814, 0.372, 0.9, 0.514]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: The one that ran away from the right first", "context": "Select from the following choices.\nA: [0.824, 0.389, 0.91, 0.531]\nB: [0.447, 0.617, 0.93, 0.662]\nC: [0.85, 0.361, 0.936, 0.503]\nD: [0.814, 0.372, 0.9, 0.514]", "input_image_path": ["./Continuous-temporal/mevis/mevis_171_0.jpg", "./Continuous-temporal/mevis/mevis_171_1.jpg", "./Continuous-temporal/mevis/mevis_171_2.jpg", "./Continuous-temporal/mevis/mevis_171_3.jpg", "./Continuous-temporal/mevis/mevis_171_4.jpg", "./Continuous-temporal/mevis/mevis_171_5.jpg", "./Continuous-temporal/mevis/mevis_171_6.jpg", "./Continuous-temporal/mevis/mevis_171_7.jpg", "./Continuous-temporal/mevis/mevis_171_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.089, 0.316, 0.117, 0.569]\nB: [0.0, 0.0, 0.938, 0.907]\nC: [0.0, 0.0, 0.999, 0.817]\nD: [0.001, 0.006, 1.0, 0.823]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1766 and the height is 945.\nCAPTION: Aircraft moving rightward", "context": "Select from the following choices.\nA: [0.089, 0.316, 0.117, 0.569]\nB: [0.0, 0.0, 0.938, 0.907]\nC: [0.0, 0.0, 0.999, 0.817]\nD: [0.001, 0.006, 1.0, 0.823]", "input_image_path": ["./Continuous-temporal/mevis/mevis_172_0.jpg", "./Continuous-temporal/mevis/mevis_172_1.jpg", "./Continuous-temporal/mevis/mevis_172_2.jpg", "./Continuous-temporal/mevis/mevis_172_3.jpg", "./Continuous-temporal/mevis/mevis_172_4.jpg", "./Continuous-temporal/mevis/mevis_172_5.jpg", "./Continuous-temporal/mevis/mevis_172_6.jpg", "./Continuous-temporal/mevis/mevis_172_7.jpg", "./Continuous-temporal/mevis/mevis_172_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.348, 0.861, 0.555, 0.968]\nB: [0.286, 0.694, 0.507, 0.998]\nC: [0.286, 0.694, 0.47, 1.019]\nD: [0.243, 0.59, 0.464, 0.894]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 648.\nCAPTION: bear fight then sit", "context": "Select from the following choices.\nA: [0.348, 0.861, 0.555, 0.968]\nB: [0.286, 0.694, 0.507, 0.998]\nC: [0.286, 0.694, 0.47, 1.019]\nD: [0.243, 0.59, 0.464, 0.894]", "input_image_path": ["./Continuous-temporal/mevis/mevis_173_0.jpg", "./Continuous-temporal/mevis/mevis_173_1.jpg", "./Continuous-temporal/mevis/mevis_173_2.jpg", "./Continuous-temporal/mevis/mevis_173_3.jpg", "./Continuous-temporal/mevis/mevis_173_4.jpg", "./Continuous-temporal/mevis/mevis_173_5.jpg", "./Continuous-temporal/mevis/mevis_173_6.jpg", "./Continuous-temporal/mevis/mevis_173_7.jpg", "./Continuous-temporal/mevis/mevis_173_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.061, 0.194, 0.192, 0.585]\nB: [0.219, 0.111, 0.325, 0.363]\nC: [0.165, 0.65, 0.391, 0.915]\nD: [0.09, 0.733, 0.316, 0.998]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 648.\nCAPTION: The bear moving in reverse.", "context": "Select from the following choices.\nA: [0.061, 0.194, 0.192, 0.585]\nB: [0.219, 0.111, 0.325, 0.363]\nC: [0.165, 0.65, 0.391, 0.915]\nD: [0.09, 0.733, 0.316, 0.998]", "input_image_path": ["./Continuous-temporal/mevis/mevis_174_0.jpg", "./Continuous-temporal/mevis/mevis_174_1.jpg", "./Continuous-temporal/mevis/mevis_174_2.jpg", "./Continuous-temporal/mevis/mevis_174_3.jpg", "./Continuous-temporal/mevis/mevis_174_4.jpg", "./Continuous-temporal/mevis/mevis_174_5.jpg", "./Continuous-temporal/mevis/mevis_174_6.jpg", "./Continuous-temporal/mevis/mevis_174_7.jpg", "./Continuous-temporal/mevis/mevis_174_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.432, 0.037, 0.81, 0.448]\nB: [0.119, 0.0, 0.913, 0.438]\nC: [0.0, 0.0, 0.794, 0.438]\nD: [0.119, 0.0, 0.864, 0.437]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1766 and the height is 945.\nCAPTION: aircraft moving leftward", "context": "Select from the following choices.\nA: [0.432, 0.037, 0.81, 0.448]\nB: [0.119, 0.0, 0.913, 0.438]\nC: [0.0, 0.0, 0.794, 0.438]\nD: [0.119, 0.0, 0.864, 0.437]", "input_image_path": ["./Continuous-temporal/mevis/mevis_175_0.jpg", "./Continuous-temporal/mevis/mevis_175_1.jpg", "./Continuous-temporal/mevis/mevis_175_2.jpg", "./Continuous-temporal/mevis/mevis_175_3.jpg", "./Continuous-temporal/mevis/mevis_175_4.jpg", "./Continuous-temporal/mevis/mevis_175_5.jpg", "./Continuous-temporal/mevis/mevis_175_6.jpg", "./Continuous-temporal/mevis/mevis_175_7.jpg", "./Continuous-temporal/mevis/mevis_175_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.393, 0.381, 0.514, 0.996]\nB: [0.414, 0.385, 0.535, 1.0]\nC: [0.096, 0.222, 0.52, 0.615]\nD: [0.861, 0.472, 0.906, 0.813]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 512 and the height is 252.\nCAPTION: man moving to right and watching the horse running in circles", "context": "Select from the following choices.\nA: [0.393, 0.381, 0.514, 0.996]\nB: [0.414, 0.385, 0.535, 1.0]\nC: [0.096, 0.222, 0.52, 0.615]\nD: [0.861, 0.472, 0.906, 0.813]", "input_image_path": ["./Continuous-temporal/mevis/mevis_176_0.jpg", "./Continuous-temporal/mevis/mevis_176_1.jpg", "./Continuous-temporal/mevis/mevis_176_2.jpg", "./Continuous-temporal/mevis/mevis_176_3.jpg", "./Continuous-temporal/mevis/mevis_176_4.jpg", "./Continuous-temporal/mevis/mevis_176_5.jpg", "./Continuous-temporal/mevis/mevis_176_6.jpg", "./Continuous-temporal/mevis/mevis_176_7.jpg", "./Continuous-temporal/mevis/mevis_176_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.366, 0.448, 0.75, 0.929]\nB: [0.361, 0.037, 0.847, 0.281]\nC: [0.366, 0.448, 0.784, 0.86]\nD: [0.366, 0.448, 0.861, 0.896]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 480.\nCAPTION: the one who eats the food first", "context": "Select from the following choices.\nA: [0.366, 0.448, 0.75, 0.929]\nB: [0.361, 0.037, 0.847, 0.281]\nC: [0.366, 0.448, 0.784, 0.86]\nD: [0.366, 0.448, 0.861, 0.896]", "input_image_path": ["./Continuous-temporal/mevis/mevis_177_0.jpg", "./Continuous-temporal/mevis/mevis_177_1.jpg", "./Continuous-temporal/mevis/mevis_177_2.jpg", "./Continuous-temporal/mevis/mevis_177_3.jpg", "./Continuous-temporal/mevis/mevis_177_4.jpg", "./Continuous-temporal/mevis/mevis_177_5.jpg", "./Continuous-temporal/mevis/mevis_177_6.jpg", "./Continuous-temporal/mevis/mevis_177_7.jpg", "./Continuous-temporal/mevis/mevis_177_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.796, 0.477, 0.999, 0.998]\nB: [0.504, 0.03, 0.769, 0.436]\nC: [0.797, 0.478, 1.0, 1.0]\nD: [0.484, 0.49, 0.883, 0.916]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 596.\nCAPTION: panda sit down and eat, then move, then sit down and eat", "context": "Select from the following choices.\nA: [0.796, 0.477, 0.999, 0.998]\nB: [0.504, 0.03, 0.769, 0.436]\nC: [0.797, 0.478, 1.0, 1.0]\nD: [0.484, 0.49, 0.883, 0.916]", "input_image_path": ["./Continuous-temporal/mevis/mevis_178_0.jpg", "./Continuous-temporal/mevis/mevis_178_1.jpg", "./Continuous-temporal/mevis/mevis_178_2.jpg", "./Continuous-temporal/mevis/mevis_178_3.jpg", "./Continuous-temporal/mevis/mevis_178_4.jpg", "./Continuous-temporal/mevis/mevis_178_5.jpg", "./Continuous-temporal/mevis/mevis_178_6.jpg", "./Continuous-temporal/mevis/mevis_178_7.jpg", "./Continuous-temporal/mevis/mevis_178_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.0, 0.0, 0.536, 0.817]\nB: [0.527, 0.349, 0.567, 0.619]\nC: [0.028, 0.459, 0.397, 0.856]\nD: [0.0, 0.0, 0.456, 0.998]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 945.\nCAPTION: Girl holding leaf with right hand and pulling away then standing up", "context": "Select from the following choices.\nA: [0.0, 0.0, 0.536, 0.817]\nB: [0.527, 0.349, 0.567, 0.619]\nC: [0.028, 0.459, 0.397, 0.856]\nD: [0.0, 0.0, 0.456, 0.998]", "input_image_path": ["./Continuous-temporal/mevis/mevis_179_0.jpg", "./Continuous-temporal/mevis/mevis_179_1.jpg", "./Continuous-temporal/mevis/mevis_179_2.jpg", "./Continuous-temporal/mevis/mevis_179_3.jpg", "./Continuous-temporal/mevis/mevis_179_4.jpg", "./Continuous-temporal/mevis/mevis_179_5.jpg", "./Continuous-temporal/mevis/mevis_179_6.jpg", "./Continuous-temporal/mevis/mevis_179_7.jpg", "./Continuous-temporal/mevis/mevis_179_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.0, 0.0, 0.394, 0.32]\nB: [0.0, 0.0, 0.369, 0.36]\nC: [0.0, 0.0, 0.421, 0.277]\nD: [0.315, 0.304, 0.751, 0.604]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1766 and the height is 945.\nCAPTION: Airplane moving from right to left", "context": "Select from the following choices.\nA: [0.0, 0.0, 0.394, 0.32]\nB: [0.0, 0.0, 0.369, 0.36]\nC: [0.0, 0.0, 0.421, 0.277]\nD: [0.315, 0.304, 0.751, 0.604]", "input_image_path": ["./Continuous-temporal/mevis/mevis_180_0.jpg", "./Continuous-temporal/mevis/mevis_180_1.jpg", "./Continuous-temporal/mevis/mevis_180_2.jpg", "./Continuous-temporal/mevis/mevis_180_3.jpg", "./Continuous-temporal/mevis/mevis_180_4.jpg", "./Continuous-temporal/mevis/mevis_180_5.jpg", "./Continuous-temporal/mevis/mevis_180_6.jpg", "./Continuous-temporal/mevis/mevis_180_7.jpg", "./Continuous-temporal/mevis/mevis_180_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.406, 0.49, 0.778, 0.986]\nB: [0.176, 0.182, 0.362, 0.224]\nC: [0.307, 0.268, 0.679, 0.764]\nD: [0.307, 0.268, 0.648, 0.8]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: monkey jumping right to the rock then crawling around", "context": "Select from the following choices.\nA: [0.406, 0.49, 0.778, 0.986]\nB: [0.176, 0.182, 0.362, 0.224]\nC: [0.307, 0.268, 0.679, 0.764]\nD: [0.307, 0.268, 0.648, 0.8]", "input_image_path": ["./Continuous-temporal/mevis/mevis_181_0.jpg", "./Continuous-temporal/mevis/mevis_181_1.jpg", "./Continuous-temporal/mevis/mevis_181_2.jpg", "./Continuous-temporal/mevis/mevis_181_3.jpg", "./Continuous-temporal/mevis/mevis_181_4.jpg", "./Continuous-temporal/mevis/mevis_181_5.jpg", "./Continuous-temporal/mevis/mevis_181_6.jpg", "./Continuous-temporal/mevis/mevis_181_7.jpg", "./Continuous-temporal/mevis/mevis_181_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.544, 0.447, 0.603, 0.518]\nB: [0.527, 0.443, 0.586, 0.515]\nC: [0.527, 0.443, 0.584, 0.5]\nD: [0.527, 0.443, 0.583, 0.506]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: The second vehicle going straight at the crossroads.", "context": "Select from the following choices.\nA: [0.544, 0.447, 0.603, 0.518]\nB: [0.527, 0.443, 0.586, 0.515]\nC: [0.527, 0.443, 0.584, 0.5]\nD: [0.527, 0.443, 0.583, 0.506]", "input_image_path": ["./Continuous-temporal/mevis/mevis_182_0.jpg", "./Continuous-temporal/mevis/mevis_182_1.jpg", "./Continuous-temporal/mevis/mevis_182_2.jpg", "./Continuous-temporal/mevis/mevis_182_3.jpg", "./Continuous-temporal/mevis/mevis_182_4.jpg", "./Continuous-temporal/mevis/mevis_182_5.jpg", "./Continuous-temporal/mevis/mevis_182_6.jpg", "./Continuous-temporal/mevis/mevis_182_7.jpg", "./Continuous-temporal/mevis/mevis_182_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.659, 0.6, 0.999, 0.999]\nB: [0.267, 0.368, 0.319, 0.865]\nC: [0.213, 0.071, 0.666, 0.468]\nD: [0.609, 0.601, 0.949, 1.0]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 945.\nCAPTION: Rabbit on the left bending forward then hopping in a round to rightmost", "context": "Select from the following choices.\nA: [0.659, 0.6, 0.999, 0.999]\nB: [0.267, 0.368, 0.319, 0.865]\nC: [0.213, 0.071, 0.666, 0.468]\nD: [0.609, 0.601, 0.949, 1.0]", "input_image_path": ["./Continuous-temporal/mevis/mevis_183_0.jpg", "./Continuous-temporal/mevis/mevis_183_1.jpg", "./Continuous-temporal/mevis/mevis_183_2.jpg", "./Continuous-temporal/mevis/mevis_183_3.jpg", "./Continuous-temporal/mevis/mevis_183_4.jpg", "./Continuous-temporal/mevis/mevis_183_5.jpg", "./Continuous-temporal/mevis/mevis_183_6.jpg", "./Continuous-temporal/mevis/mevis_183_7.jpg", "./Continuous-temporal/mevis/mevis_183_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.122, 0.699, 0.352, 1.026]\nB: [0.077, 0.701, 0.33, 1.0]\nC: [0.122, 0.699, 0.341, 1.006]\nD: [0.122, 0.699, 0.376, 0.998]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 648.\nCAPTION: Bear being chased and walking backward", "context": "Select from the following choices.\nA: [0.122, 0.699, 0.352, 1.026]\nB: [0.077, 0.701, 0.33, 1.0]\nC: [0.122, 0.699, 0.341, 1.006]\nD: [0.122, 0.699, 0.376, 0.998]", "input_image_path": ["./Continuous-temporal/mevis/mevis_184_0.jpg", "./Continuous-temporal/mevis/mevis_184_1.jpg", "./Continuous-temporal/mevis/mevis_184_2.jpg", "./Continuous-temporal/mevis/mevis_184_3.jpg", "./Continuous-temporal/mevis/mevis_184_4.jpg", "./Continuous-temporal/mevis/mevis_184_5.jpg", "./Continuous-temporal/mevis/mevis_184_6.jpg", "./Continuous-temporal/mevis/mevis_184_7.jpg", "./Continuous-temporal/mevis/mevis_184_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.43, 0.075, 1.022, 0.917]\nB: [0.439, 0.323, 0.531, 0.685]\nC: [0.134, 0.398, 0.588, 0.696]\nD: [0.43, 0.075, 0.942, 0.878]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 362.\nCAPTION: cow waving head without moving position", "context": "Select from the following choices.\nA: [0.43, 0.075, 1.022, 0.917]\nB: [0.439, 0.323, 0.531, 0.685]\nC: [0.134, 0.398, 0.588, 0.696]\nD: [0.43, 0.075, 0.942, 0.878]", "input_image_path": ["./Continuous-temporal/mevis/mevis_185_0.jpg", "./Continuous-temporal/mevis/mevis_185_1.jpg", "./Continuous-temporal/mevis/mevis_185_2.jpg", "./Continuous-temporal/mevis/mevis_185_3.jpg", "./Continuous-temporal/mevis/mevis_185_4.jpg", "./Continuous-temporal/mevis/mevis_185_5.jpg", "./Continuous-temporal/mevis/mevis_185_6.jpg", "./Continuous-temporal/mevis/mevis_185_7.jpg", "./Continuous-temporal/mevis/mevis_185_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.381, 0.381, 0.479, 0.889]\nB: [0.381, 0.381, 0.484, 0.996]\nC: [0.762, 0.214, 0.949, 0.595]\nD: [0.41, 0.226, 0.514, 0.841]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 512 and the height is 252.\nCAPTION: person walk forward", "context": "Select from the following choices.\nA: [0.381, 0.381, 0.479, 0.889]\nB: [0.381, 0.381, 0.484, 0.996]\nC: [0.762, 0.214, 0.949, 0.595]\nD: [0.41, 0.226, 0.514, 0.841]", "input_image_path": ["./Continuous-temporal/mevis/mevis_186_0.jpg", "./Continuous-temporal/mevis/mevis_186_1.jpg", "./Continuous-temporal/mevis/mevis_186_2.jpg", "./Continuous-temporal/mevis/mevis_186_3.jpg", "./Continuous-temporal/mevis/mevis_186_4.jpg", "./Continuous-temporal/mevis/mevis_186_5.jpg", "./Continuous-temporal/mevis/mevis_186_6.jpg", "./Continuous-temporal/mevis/mevis_186_7.jpg", "./Continuous-temporal/mevis/mevis_186_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.505, 0.235, 0.539, 0.421]\nB: [0.502, 0.315, 0.537, 0.501]\nC: [0.505, 0.235, 0.534, 0.415]\nD: [0.505, 0.235, 0.534, 0.406]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1280 and the height is 720.\nCAPTION: The monkey squatting in the middle of the cave entrance.", "context": "Select from the following choices.\nA: [0.505, 0.235, 0.539, 0.421]\nB: [0.502, 0.315, 0.537, 0.501]\nC: [0.505, 0.235, 0.534, 0.415]\nD: [0.505, 0.235, 0.534, 0.406]", "input_image_path": ["./Continuous-temporal/mevis/mevis_187_0.jpg", "./Continuous-temporal/mevis/mevis_187_1.jpg", "./Continuous-temporal/mevis/mevis_187_2.jpg", "./Continuous-temporal/mevis/mevis_187_3.jpg", "./Continuous-temporal/mevis/mevis_187_4.jpg", "./Continuous-temporal/mevis/mevis_187_5.jpg", "./Continuous-temporal/mevis/mevis_187_6.jpg", "./Continuous-temporal/mevis/mevis_187_7.jpg", "./Continuous-temporal/mevis/mevis_187_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.531, 0.29, 0.714, 0.687]\nB: [0.527, 0.434, 0.703, 0.857]\nC: [0.449, 0.314, 0.633, 0.711]\nD: [0.527, 0.434, 0.71, 0.831]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: still bike", "context": "Select from the following choices.\nA: [0.531, 0.29, 0.714, 0.687]\nB: [0.527, 0.434, 0.703, 0.857]\nC: [0.449, 0.314, 0.633, 0.711]\nD: [0.527, 0.434, 0.71, 0.831]", "input_image_path": ["./Continuous-temporal/mevis/mevis_188_0.jpg", "./Continuous-temporal/mevis/mevis_188_1.jpg", "./Continuous-temporal/mevis/mevis_188_2.jpg", "./Continuous-temporal/mevis/mevis_188_3.jpg", "./Continuous-temporal/mevis/mevis_188_4.jpg", "./Continuous-temporal/mevis/mevis_188_5.jpg", "./Continuous-temporal/mevis/mevis_188_6.jpg", "./Continuous-temporal/mevis/mevis_188_7.jpg", "./Continuous-temporal/mevis/mevis_188_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.156, 0.229, 0.402, 0.967]\nB: [0.295, 0.05, 0.639, 0.536]\nC: [0.156, 0.229, 0.389, 0.956]\nD: [0.156, 0.229, 0.366, 0.89]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 362.\nCAPTION: Cow walking to the front first", "context": "Select from the following choices.\nA: [0.156, 0.229, 0.402, 0.967]\nB: [0.295, 0.05, 0.639, 0.536]\nC: [0.156, 0.229, 0.389, 0.956]\nD: [0.156, 0.229, 0.366, 0.89]", "input_image_path": ["./Continuous-temporal/mevis/mevis_189_0.jpg", "./Continuous-temporal/mevis/mevis_189_1.jpg", "./Continuous-temporal/mevis/mevis_189_2.jpg", "./Continuous-temporal/mevis/mevis_189_3.jpg", "./Continuous-temporal/mevis/mevis_189_4.jpg", "./Continuous-temporal/mevis/mevis_189_5.jpg", "./Continuous-temporal/mevis/mevis_189_6.jpg", "./Continuous-temporal/mevis/mevis_189_7.jpg", "./Continuous-temporal/mevis/mevis_189_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.577, 0.806, 0.624, 0.838]\nB: [0.39, 0.0, 0.999, 0.999]\nC: [0.391, 0.0, 1.0, 0.999]\nD: [0.208, 0.041, 0.483, 0.497]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: cat climbing on cat tree", "context": "Select from the following choices.\nA: [0.577, 0.806, 0.624, 0.838]\nB: [0.39, 0.0, 0.999, 0.999]\nC: [0.391, 0.0, 1.0, 0.999]\nD: [0.208, 0.041, 0.483, 0.497]", "input_image_path": ["./Continuous-temporal/mevis/mevis_190_0.jpg", "./Continuous-temporal/mevis/mevis_190_1.jpg", "./Continuous-temporal/mevis/mevis_190_2.jpg", "./Continuous-temporal/mevis/mevis_190_3.jpg", "./Continuous-temporal/mevis/mevis_190_4.jpg", "./Continuous-temporal/mevis/mevis_190_5.jpg", "./Continuous-temporal/mevis/mevis_190_6.jpg", "./Continuous-temporal/mevis/mevis_190_7.jpg", "./Continuous-temporal/mevis/mevis_190_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.576, 0.004, 0.609, 0.46]\nB: [0.141, 0.397, 0.406, 0.802]\nC: [0.199, 0.353, 0.465, 0.758]\nD: [0.199, 0.353, 0.412, 0.679]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 512 and the height is 252.\nCAPTION: horse run in circles", "context": "Select from the following choices.\nA: [0.576, 0.004, 0.609, 0.46]\nB: [0.141, 0.397, 0.406, 0.802]\nC: [0.199, 0.353, 0.465, 0.758]\nD: [0.199, 0.353, 0.412, 0.679]", "input_image_path": ["./Continuous-temporal/mevis/mevis_191_0.jpg", "./Continuous-temporal/mevis/mevis_191_1.jpg", "./Continuous-temporal/mevis/mevis_191_2.jpg", "./Continuous-temporal/mevis/mevis_191_3.jpg", "./Continuous-temporal/mevis/mevis_191_4.jpg", "./Continuous-temporal/mevis/mevis_191_5.jpg", "./Continuous-temporal/mevis/mevis_191_6.jpg", "./Continuous-temporal/mevis/mevis_191_7.jpg", "./Continuous-temporal/mevis/mevis_191_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.459, 0.0, 0.998, 0.925]\nB: [0.459, 0.0, 1.07, 1.094]\nC: [0.291, 0.075, 0.83, 1.0]\nD: [0.459, 0.0, 0.994, 1.006]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 640 and the height is 362.\nCAPTION: Cow looking around without moving", "context": "Select from the following choices.\nA: [0.459, 0.0, 0.998, 0.925]\nB: [0.459, 0.0, 1.07, 1.094]\nC: [0.291, 0.075, 0.83, 1.0]\nD: [0.459, 0.0, 0.994, 1.006]", "input_image_path": ["./Continuous-temporal/mevis/mevis_192_0.jpg", "./Continuous-temporal/mevis/mevis_192_1.jpg", "./Continuous-temporal/mevis/mevis_192_2.jpg", "./Continuous-temporal/mevis/mevis_192_3.jpg", "./Continuous-temporal/mevis/mevis_192_4.jpg", "./Continuous-temporal/mevis/mevis_192_5.jpg", "./Continuous-temporal/mevis/mevis_192_6.jpg", "./Continuous-temporal/mevis/mevis_192_7.jpg", "./Continuous-temporal/mevis/mevis_192_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.334, 0.0, 0.448, 0.665]\nB: [0.334, 0.0, 0.444, 0.569]\nC: [0.334, 0.0, 0.444, 0.528]\nD: [0.334, 0.0, 0.46, 0.55]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Person moving around holding leashes", "context": "Select from the following choices.\nA: [0.334, 0.0, 0.448, 0.665]\nB: [0.334, 0.0, 0.444, 0.569]\nC: [0.334, 0.0, 0.444, 0.528]\nD: [0.334, 0.0, 0.46, 0.55]", "input_image_path": ["./Continuous-temporal/mevis/mevis_193_0.jpg", "./Continuous-temporal/mevis/mevis_193_1.jpg", "./Continuous-temporal/mevis/mevis_193_2.jpg", "./Continuous-temporal/mevis/mevis_193_3.jpg", "./Continuous-temporal/mevis/mevis_193_4.jpg", "./Continuous-temporal/mevis/mevis_193_5.jpg", "./Continuous-temporal/mevis/mevis_193_6.jpg", "./Continuous-temporal/mevis/mevis_193_7.jpg", "./Continuous-temporal/mevis/mevis_193_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.662, 0.542, 0.828, 0.8]\nB: [0.455, 0.368, 0.599, 0.715]\nC: [0.455, 0.368, 0.595, 0.741]\nD: [0.455, 0.368, 0.585, 0.759]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 544.\nCAPTION: Dog walking then turn around", "context": "Select from the following choices.\nA: [0.662, 0.542, 0.828, 0.8]\nB: [0.455, 0.368, 0.599, 0.715]\nC: [0.455, 0.368, 0.595, 0.741]\nD: [0.455, 0.368, 0.585, 0.759]", "input_image_path": ["./Continuous-temporal/mevis/mevis_194_0.jpg", "./Continuous-temporal/mevis/mevis_194_1.jpg", "./Continuous-temporal/mevis/mevis_194_2.jpg", "./Continuous-temporal/mevis/mevis_194_3.jpg", "./Continuous-temporal/mevis/mevis_194_4.jpg", "./Continuous-temporal/mevis/mevis_194_5.jpg", "./Continuous-temporal/mevis/mevis_194_6.jpg", "./Continuous-temporal/mevis/mevis_194_7.jpg", "./Continuous-temporal/mevis/mevis_194_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.542, 0.445, 0.727, 0.843]\nB: [0.478, 0.457, 0.662, 0.855]\nC: [0.505, 0.394, 0.689, 0.792]\nD: [0.465, 0.516, 0.649, 0.913]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1920 and the height is 1080.\nCAPTION: Stationary bicycle", "context": "Select from the following choices.\nA: [0.542, 0.445, 0.727, 0.843]\nB: [0.478, 0.457, 0.662, 0.855]\nC: [0.505, 0.394, 0.689, 0.792]\nD: [0.465, 0.516, 0.649, 0.913]", "input_image_path": ["./Continuous-temporal/mevis/mevis_195_0.jpg", "./Continuous-temporal/mevis/mevis_195_1.jpg", "./Continuous-temporal/mevis/mevis_195_2.jpg", "./Continuous-temporal/mevis/mevis_195_3.jpg", "./Continuous-temporal/mevis/mevis_195_4.jpg", "./Continuous-temporal/mevis/mevis_195_5.jpg", "./Continuous-temporal/mevis/mevis_195_6.jpg", "./Continuous-temporal/mevis/mevis_195_7.jpg", "./Continuous-temporal/mevis/mevis_195_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.866, 0.0, 0.99, 0.22]\nB: [0.722, 0.25, 0.802, 0.602]\nC: [0.866, 0.0, 0.973, 0.201]\nD: [0.51, 0.383, 0.97, 0.458]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 960 and the height is 528.\nCAPTION: yak standing near two fighting yaks", "context": "Select from the following choices.\nA: [0.866, 0.0, 0.99, 0.22]\nB: [0.722, 0.25, 0.802, 0.602]\nC: [0.866, 0.0, 0.973, 0.201]\nD: [0.51, 0.383, 0.97, 0.458]", "input_image_path": ["./Continuous-temporal/mevis/mevis_196_0.jpg", "./Continuous-temporal/mevis/mevis_196_1.jpg", "./Continuous-temporal/mevis/mevis_196_2.jpg", "./Continuous-temporal/mevis/mevis_196_3.jpg", "./Continuous-temporal/mevis/mevis_196_4.jpg", "./Continuous-temporal/mevis/mevis_196_5.jpg", "./Continuous-temporal/mevis/mevis_196_6.jpg", "./Continuous-temporal/mevis/mevis_196_7.jpg", "./Continuous-temporal/mevis/mevis_196_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.747, 0.274, 0.786, 0.366]\nB: [0.762, 0.793, 0.96, 0.873]\nC: [0.743, 0.214, 0.783, 0.305]\nD: [0.753, 0.242, 0.793, 0.334]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1567 and the height is 730.\nCAPTION: The cow taking the lead in the herd.", "context": "Select from the following choices.\nA: [0.747, 0.274, 0.786, 0.366]\nB: [0.762, 0.793, 0.96, 0.873]\nC: [0.743, 0.214, 0.783, 0.305]\nD: [0.753, 0.242, 0.793, 0.334]", "input_image_path": ["./Continuous-temporal/mevis/mevis_197_0.jpg", "./Continuous-temporal/mevis/mevis_197_1.jpg", "./Continuous-temporal/mevis/mevis_197_2.jpg", "./Continuous-temporal/mevis/mevis_197_3.jpg", "./Continuous-temporal/mevis/mevis_197_4.jpg", "./Continuous-temporal/mevis/mevis_197_5.jpg", "./Continuous-temporal/mevis/mevis_197_6.jpg", "./Continuous-temporal/mevis/mevis_197_7.jpg", "./Continuous-temporal/mevis/mevis_197_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.225, 0.104, 0.586, 0.619]\nB: [0.401, 0.539, 0.679, 0.902]\nC: [0.479, 0.144, 0.7, 0.563]\nD: [0.103, 0.0, 0.464, 0.515]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: standing still and eating white rabbit", "context": "Select from the following choices.\nA: [0.225, 0.104, 0.586, 0.619]\nB: [0.401, 0.539, 0.679, 0.902]\nC: [0.479, 0.144, 0.7, 0.563]\nD: [0.103, 0.0, 0.464, 0.515]", "input_image_path": ["./Continuous-temporal/mevis/mevis_198_0.jpg", "./Continuous-temporal/mevis/mevis_198_1.jpg", "./Continuous-temporal/mevis/mevis_198_2.jpg", "./Continuous-temporal/mevis/mevis_198_3.jpg", "./Continuous-temporal/mevis/mevis_198_4.jpg", "./Continuous-temporal/mevis/mevis_198_5.jpg", "./Continuous-temporal/mevis/mevis_198_6.jpg", "./Continuous-temporal/mevis/mevis_198_7.jpg", "./Continuous-temporal/mevis/mevis_198_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "mevis", "visual_input_component": "Video image or Natural image", "source": "MeViS", "options": "A: [0.223, 0.415, 0.498, 0.772]\nB: [0.223, 0.415, 0.496, 0.722]\nC: [0.223, 0.415, 0.53, 0.769]\nD: [0.12, 0.554, 0.443, 0.631]", "question": "I have provided several frames from a video, and I will also provide a caption. Please provide the bounding box coordinates for the described object or area using the format [x1, y1, x2, y2]. Here, [x1, y1] represent the top-left coordinates and [x2, y2] the bottom-right coordinates within a normalized range of 0 to 1, where [0, 0] is the top-left corner and [1, 1] is the bottom-right corner of the image. 
Note that the width of the input image is 1652 and the height is 1080.\nCAPTION: rabbit being jumped by another rabbit", "context": "Select from the following choices.\nA: [0.223, 0.415, 0.498, 0.772]\nB: [0.223, 0.415, 0.496, 0.722]\nC: [0.223, 0.415, 0.53, 0.769]\nD: [0.12, 0.554, 0.443, 0.631]", "input_image_path": ["./Continuous-temporal/mevis/mevis_199_0.jpg", "./Continuous-temporal/mevis/mevis_199_1.jpg", "./Continuous-temporal/mevis/mevis_199_2.jpg", "./Continuous-temporal/mevis/mevis_199_3.jpg", "./Continuous-temporal/mevis/mevis_199_4.jpg", "./Continuous-temporal/mevis/mevis_199_5.jpg", "./Continuous-temporal/mevis/mevis_199_6.jpg", "./Continuous-temporal/mevis/mevis_199_7.jpg", "./Continuous-temporal/mevis/mevis_199_8.jpg"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/multiple_image_captioning/qwen3-vl/metadata_info.json b/results/multiple_image_captioning/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..7fe4e32 --- /dev/null +++ b/results/multiple_image_captioning/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Man types on computer Woman talks to man while typing Woman smirks at something Winters day in england Blizzard hits england\nB: Man types on computer Woman talks to man while typing Woman smiles at something Winters day in england Blizzard hits england\nC: Man types on computer Woman talks to man while typing Woman smirks at something Winters day in england Snowfall hits england\nD: Man types on computer Woman talks to man while typing Woman smirks at something Winters day in england Blizzard hits russia", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Man types on computer Woman talks to man while typing Woman smirks at something Winters day in england Blizzard hits england\nB: Man types on computer Woman 
talks to man while typing Woman smiles at something Winters day in england Blizzard hits england\nC: Man types on computer Woman talks to man while typing Woman smirks at something Winters day in england Snowfall hits england\nD: Man types on computer Woman talks to man while typing Woman smirks at something Winters day in england Blizzard hits russia", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_0_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_0_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_0_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_0_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_0_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: [male] is excited about his new job in the city and decides to explore it on foot. He finds himself drawn to the beach and feels a deep connection to the sea.\nB: [female] is bored with her life , she wants to explore new places all by herself .she views the city and realizes she needs to get away and go away someplace different .the next morning she catches a bus that takes her out of the city .the bus drops her off at a port where she gets on a boat which will sail away .the boat takes her to the beach , which she feels heals her soul . she loves her new surroundings .\nC: [female] feels trapped in her routine and decides to take a plane to a foreign country. The new environment rejuvenates her, and she feels a sense of freedom and excitement.\nD: [male] is tired of the city life and decides to hike to the top of a mountain. 
The expansive view takes his breath away, and he feels a profound sense of peace and serenity.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: [male] is excited about his new job in the city and decides to explore it on foot. He finds himself drawn to the beach and feels a deep connection to the sea.\nB: [female] is bored with her life , she wants to explore new places all by herself .she views the city and realizes she needs to get away and go away someplace different .the next morning she catches a bus that takes her out of the city .the bus drops her off at a port where she gets on a boat which will sail away .the boat takes her to the beach , which she feels heals her soul . she loves her new surroundings .\nC: [female] feels trapped in her routine and decides to take a plane to a foreign country. The new environment rejuvenates her, and she feels a sense of freedom and excitement.\nD: [male] is tired of the city life and decides to hike to the top of a mountain. 
The expansive view takes his breath away, and he feels a profound sense of peace and serenity.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_1_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_1_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_1_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_1_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_1_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: we rode our atv to the mountain top ...and , suddenly , we were back on the beach and viewing a wondering opening .i asked my husband to take a picture ; however , he preferred to stand behind the camera .we then moved back to the mountain and saw a beautiful inlet..the inlet showed the strong power of the water moving in from the ocean . we took our picture and were ready for bed .\nB: we walked to the mountain top and took some pictures. then, we went to the beach and found a secluded spot. later, we observed the force of the ocean waves. finally, we slept under the stars.\nC: we drove our car to the mountain top and enjoyed the view. then, we went to the beach and saw a beautiful sunset. later, we captured the waves crashing on the shore. finally, we felt tired and went to bed.\nD: we hiked to the mountain top and felt the refreshing breeze. then, we visited the beach and admired the waves. later, we saw a natural wonder. 
finally, we fell asleep in our tent.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: we rode our atv to the mountain top ...and , suddenly , we were back on the beach and viewing a wondering opening .i asked my husband to take a picture ; however , he preferred to stand behind the camera .we then moved back to the mountain and saw a beautiful inlet..the inlet showed the strong power of the water moving in from the ocean . we took our picture and were ready for bed .\nB: we walked to the mountain top and took some pictures. then, we went to the beach and found a secluded spot. later, we observed the force of the ocean waves. finally, we slept under the stars.\nC: we drove our car to the mountain top and enjoyed the view. then, we went to the beach and saw a beautiful sunset. later, we captured the waves crashing on the shore. finally, we felt tired and went to bed.\nD: we hiked to the mountain top and felt the refreshing breeze. then, we visited the beach and admired the waves. later, we saw a natural wonder. finally, we fell asleep in our tent.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_2_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_2_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_2_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_2_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_2_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A couple rowing a boat in the lake\nB: A group of people having a picnic near the lake\nC: Some children are on top of a rock watching the fish in the lake. They have jumped into the lake and are playing in the lake. 
Some girls are diving to the bottom of the lake There are several waterfalls far from the lake where people are cooling off. Nearby there is a trail where people are walking in the middle of nature.\nD: A man fishing in the lake", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A couple rowing a boat in the lake\nB: A group of people having a picnic near the lake\nC: Some children are on top of a rock watching the fish in the lake. They have jumped into the lake and are playing in the lake. Some girls are diving to the bottom of the lake There are several waterfalls far from the lake where people are cooling off. Nearby there is a trail where people are walking in the middle of nature.\nD: A man fishing in the lake", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_3_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_3_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_3_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_3_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_3_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the football match was so exhilarating.\nB: the basketball game was thrilling.\nC: i was really excited to see my first hockey game .the players made their way to the ice .the game started and it was so amazing to see .the players went by so quickly .it was exhilarating .\nD: i was energized when i saw the soccer match.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the football match was so exhilarating.\nB: the basketball game was thrilling.\nC: i was really excited to see my first hockey game .the players made 
their way to the ice .the game started and it was so amazing to see .the players went by so quickly .it was exhilarating .\nD: i was energized when i saw the soccer match.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_4_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_4_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_4_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_4_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_4_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: we made a trip out to visit the location location bridge in location location .our friends from back home also came along with us on the trip .we had drinks on the beach .and played football and catch as well .everyone had a good time at the beach that day .\nB: we went to a zoo and saw some animals.\nC: we visited a park in location location .\nD: we had a picnic in the park.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: we made a trip out to visit the location location bridge in location location .our friends from back home also came along with us on the trip .we had drinks on the beach .and played football and catch as well .everyone had a good time at the beach that day .\nB: we went to a zoo and saw some animals.\nC: we visited a park in location location .\nD: we had a picnic in the park.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_5_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_5_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_5_2.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_5_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_5_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Many reporters are unimpressed by the lack of palm trees and bushes in the place.\nB: Some people like to visit old buildings that share the space with the green of nature. Some reporters are impressed by the amount of palm trees and bushes in the place. People also love to walk along the paths and observe all kinds of trees. There are also transports on site that can take visitors to more distant locations. People are also delighted with the rooms decorated with beautiful maps and period objects and tables\nC: People often dislike walking along the paths and observing all kinds of trees.\nD: Few people dislike visiting old buildings that are surrounded by nature.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Many reporters are unimpressed by the lack of palm trees and bushes in the place.\nB: Some people like to visit old buildings that share the space with the green of nature. Some reporters are impressed by the amount of palm trees and bushes in the place. People also love to walk along the paths and observe all kinds of trees. There are also transports on site that can take visitors to more distant locations. 
People are also delighted with the rooms decorated with beautiful maps and period objects and tables\nC: People often dislike walking along the paths and observing all kinds of trees.\nD: Few people dislike visiting old buildings that are surrounded by nature.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_6_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_6_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_6_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_6_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_6_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: everyone came to help clean up the swamp .even jethro , the most bitter participant of them all , helped out .it was a good chance to catch up with family .everyone ate hot dogs .they all worked together and had a good time .\nB: nobody came to help clean up the swamp .even jethro , the most bitter participant of them all , did not help out .it was a bad chance to catch up with family .nobody ate hot dogs .they all worked alone and had a bad time .\nC: some people came to help clean up the swamp .even jethro , the most bitter participant of them all , helped out only a little bit .it was a mediocre chance to catch up with family .some people ate hot dogs .they all worked together but had a mediocre time .\nD: only a few people came to help clean up the swamp .even jethro , the most bitter participant of them all , did not help out .it was not a chance to catch up with family .only a few people ate hot dogs .they all worked together but had a mediocre time .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: everyone came to help 
clean up the swamp .even jethro , the most bitter participant of them all , helped out .it was a good chance to catch up with family .everyone ate hot dogs .they all worked together and had a good time .\nB: nobody came to help clean up the swamp .even jethro , the most bitter participant of them all , did not help out .it was a bad chance to catch up with family .nobody ate hot dogs .they all worked alone and had a bad time .\nC: some people came to help clean up the swamp .even jethro , the most bitter participant of them all , helped out only a little bit .it was a mediocre chance to catch up with family .some people ate hot dogs .they all worked together but had a mediocre time .\nD: only a few people came to help clean up the swamp .even jethro , the most bitter participant of them all , did not help out .it was not a chance to catch up with family .only a few people ate hot dogs .they all worked together but had a mediocre time .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_7_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_7_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_7_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_7_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_7_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Midday there was a ship that anchored at a pier. At the pier, a sailor was folding a gangway. He looked inside his tool box to see if there is any tool that could help him with the operation. After a couple of hours passed by, another ship anchored. When the ship anchored, the gangway was unfolded, for the passengers\nB: Late in the evening there was a helicopter that took off from a helipad. 
At the helipad, a woman was unfolding a helicopter blade. She looked inside her tool box to see if there is any tool that could help her with the operation. After a couple of hours passed by, another helicopter took off. When the helicopter took off, the helicopter blade was folded.\nC: In the afternoon there was a train that arrived at a platform. At the platform, a conductor was folding a platform bridge. He looked inside his tool box to see if there is any tool that could help him with the operation. After a couple of hours passed by, another train arrived. When the train arrived, the platform bridge was unfolded, for the passengers\nD: Early in the morning there was an airplane the landed on an airport. On the air port, a men was folding a jet bridge. He looked inside his tool box to see if there is any tool the could help him with the operation. After a couple of hours passed by, another plane landed. When the plane landed, the jet bridge was unfolded, for the passengers", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Midday there was a ship that anchored at a pier. At the pier, a sailor was folding a gangway. He looked inside his tool box to see if there is any tool that could help him with the operation. After a couple of hours passed by, another ship anchored. When the ship anchored, the gangway was unfolded, for the passengers\nB: Late in the evening there was a helicopter that took off from a helipad. At the helipad, a woman was unfolding a helicopter blade. She looked inside her tool box to see if there is any tool that could help her with the operation. After a couple of hours passed by, another helicopter took off. When the helicopter took off, the helicopter blade was folded.\nC: In the afternoon there was a train that arrived at a platform. At the platform, a conductor was folding a platform bridge. 
He looked inside his tool box to see if there is any tool that could help him with the operation. After a couple of hours passed by, another train arrived. When the train arrived, the platform bridge was unfolded, for the passengers\nD: Early in the morning there was an airplane the landed on an airport. On the air port, a men was folding a jet bridge. He looked inside his tool box to see if there is any tool the could help him with the operation. After a couple of hours passed by, another plane landed. When the plane landed, the jet bridge was unfolded, for the passengers", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_8_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_8_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_8_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_8_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_8_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The images depict a marketplace, not a forest or hillside scene.\nB: No, I haven't heard about the Forest Hill side sales on green bananas.\nC: There are no people in the images, only bananas.\nD: Hey, have you heard about the Forest Hill side sales on green bananas? One dude is just staring at the camera. Dude, like, more peeps saw the camera. Hey, let's boost those bike sales in Banana! 
More people sell bananas on bikes.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The images depict a marketplace, not a forest or hillside scene.\nB: No, I haven't heard about the Forest Hill side sales on green bananas.\nC: There are no people in the images, only bananas.\nD: Hey, have you heard about the Forest Hill side sales on green bananas? One dude is just staring at the camera. Dude, like, more peeps saw the camera. Hey, let's boost those bike sales in Banana! More people sell bananas on bikes.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_9_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_9_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_9_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_9_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_9_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: it is a busy marketplace with colorful stalls and shops.\nB: it is an amazing colorful palace, to relax and learn about his ancient palace. The site has so many sites and gardens people enjoy the peaceful stroll thru the castle. Tourist enjoy a clean place to stroll thru the green colorful gardens of this beautiful palace. The view is magnificent and the museum is very clean place to visit. 
The place has entertaining maps and exhibitions, making sure you don't cross the velvet red ropes.\nC: it is a quiet and serene beach with crystal clear water.\nD: it is a modern skyscraper with a bustling city around it.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: it is a busy marketplace with colorful stalls and shops.\nB: it is an amazing colorful palace, to relax and learn about his ancient palace. The site has so many sites and gardens people enjoy the peaceful stroll thru the castle. Tourist enjoy a clean place to stroll thru the green colorful gardens of this beautiful palace. The view is magnificent and the museum is very clean place to visit. The place has entertaining maps and exhibitions, making sure you don't cross the velvet red ropes.\nC: it is a quiet and serene beach with crystal clear water.\nD: it is a modern skyscraper with a bustling city around it.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_10_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_10_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_10_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_10_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_10_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: we were anxious for our fun to halt .a ride on the swings was nerve-wracking .a little tension from the roller coaster was just what we feared .of course , we still had time to try out our aiming skills .we had so much pressure that we decided to leave in the evening .\nB: we were unsure about our fun from the start .a ride on the swings was boring .a little disappointment from the roller coaster was just 
what we didn't need .of course , we still had time to attempt our aiming skills .we had so much boredom that we left in the evening .\nC: we were ready for our fun to begin .a ride on the swings was exhilarating .a little thrill from the roller coaster was just what we needed .of course , we still had time to test out our shooting skills .we had so much fun that we made sure to stay into the evening .\nD: we were unprepared for our fun to end .a ride on the swings was terrifying .a little scare from the roller coaster was just what we dreaded .of course , we still had time to practice our aiming skills .we had so much stress that we were forced to leave in the evening .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: we were anxious for our fun to halt .a ride on the swings was nerve-wracking .a little tension from the roller coaster was just what we feared .of course , we still had time to try out our aiming skills .we had so much pressure that we decided to leave in the evening .\nB: we were unsure about our fun from the start .a ride on the swings was boring .a little disappointment from the roller coaster was just what we didn't need .of course , we still had time to attempt our aiming skills .we had so much boredom that we left in the evening .\nC: we were ready for our fun to begin .a ride on the swings was exhilarating .a little thrill from the roller coaster was just what we needed .of course , we still had time to test out our shooting skills .we had so much fun that we made sure to stay into the evening .\nD: we were unprepared for our fun to end .a ride on the swings was terrifying .a little scare from the roller coaster was just what we dreaded .of course , we still had time to practice our aiming skills .we had so much stress that we were forced to leave in the evening .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_11_0.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_11_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_11_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_11_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_11_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: During a soccer game, a group of girls line up, getting ready to play. The referee blows the whistle, and the girls start sprinting at different speeds.\nB: The coach instructs his players on when the game will begin. A group of girls line up, waiting for the game to start.\nC: A group of students gather in the gym, preparing for a relay race. The students take off and run at varying speeds. One of the girls in pink falls behind and eventually stops running.\nD: In PE class, a gym teacher instructs her students on when a race will begin. A group of girls line up, waiting for the race to start. Off they go, and the girls sprint at different paces. A girl in pink runs but notices that she is falling behind. The girl stops, is out of breath, and decides to not run anymore.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: During a soccer game, a group of girls line up, getting ready to play. The referee blows the whistle, and the girls start sprinting at different speeds.\nB: The coach instructs his players on when the game will begin. A group of girls line up, waiting for the game to start.\nC: A group of students gather in the gym, preparing for a relay race. The students take off and run at varying speeds. One of the girls in pink falls behind and eventually stops running.\nD: In PE class, a gym teacher instructs her students on when a race will begin. 
A group of girls line up, waiting for the race to start. Off they go, and the girls sprint at different paces. A girl in pink runs but notices that she is falling behind. The girl stops, is out of breath, and decides to not run anymore.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_12_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_12_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_12_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_12_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_12_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the soccer player took a penalty kick .the referee checked the time on the scoreboard .the ball hit the goalpost and missed the goal . the other team's player tried to intercept the ball . the goalkeeper dived to save the ball .\nB: the basketball player made a dunk shot .the scoreboard displayed the team scores .the player was fouled and given free throws . he missed the free throws .he collided with another player and got injured .the game was delayed for medical attention .\nC: the baseball pitcher threw a curveball .the scoreboard showed the inning and outs .the batter hit a home run . the fans cheered and waved their banners .the team celebrated with high fives and hugs .\nD: the tennis player got ready to serve the ball .the board showed the score of the two players .it was the other player 's turn to serve . 
she was about to serve the ball .she accidentally fell down mid serve , and had to get help .the court was cleared off for the next match .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the soccer player took a penalty kick .the referee checked the time on the scoreboard .the ball hit the goalpost and missed the goal . the other team's player tried to intercept the ball . the goalkeeper dived to save the ball .\nB: the basketball player made a dunk shot .the scoreboard displayed the team scores .the player was fouled and given free throws . he missed the free throws .he collided with another player and got injured .the game was delayed for medical attention .\nC: the baseball pitcher threw a curveball .the scoreboard showed the inning and outs .the batter hit a home run . the fans cheered and waved their banners .the team celebrated with high fives and hugs .\nD: the tennis player got ready to serve the ball .the board showed the score of the two players .it was the other player 's turn to serve . 
she was about to serve the ball .she accidentally fell down mid serve , and had to get help .the court was cleared off for the next match .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_13_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_13_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_13_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_13_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_13_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: we had an outing at the theme park with [male] .[male] got to sit in his own part of the train .he was really cheerful to be there .he kept asking what ride we would go on next .when the ride was over though , he wanted to go again .\nB: we had a day at the kid park with [female] .[female] got to sit in her own part of the train .she was really happy to be there .she kept asking what ride we would go on next .when the ride was over though , she wanted to go again .\nC: we spent a day at the amusement park with [female] .[female] sat in her own part of the train .she was really happy to be there .she kept asking what ride we would attend next .when the ride was over though , she wanted to go again .\nD: they had a day at the water park with [male] .[male] got to sit in his own part of the train .he was really happy to be there .he kept asking what ride we would go on next .when the ride was over though , he wanted to go again .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: we had an outing at the theme park with [male] .[male] got to sit in his own part of the train .he was really cheerful to be there .he kept asking what ride we would go 
on next .when the ride was over though , he wanted to go again .\nB: we had a day at the kid park with [female] .[female] got to sit in her own part of the train .she was really happy to be there .she kept asking what ride we would go on next .when the ride was over though , she wanted to go again .\nC: we spent a day at the amusement park with [female] .[female] sat in her own part of the train .she was really happy to be there .she kept asking what ride we would attend next .when the ride was over though , she wanted to go again .\nD: they had a day at the water park with [male] .[male] got to sit in his own part of the train .he was really happy to be there .he kept asking what ride we would go on next .when the ride was over though , he wanted to go again .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_14_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_14_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_14_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_14_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_14_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The family got lost at the airport. The young girl and her mother were confused to see their dad. They wandered around and looked lost. They decided to go to the beach on a cloudy day. Someone from the park took a picture of them. They looked puzzled as the person took the picture.\nB: The family had a fight at the airport. The young girl and her mother were angry to see their dad. They argued with each other and looked mad. They decided to go to the movies on a rainy day. Someone from the park took a picture of them. 
They scowled as the person took the picture.\nC: The family finally met at the airport. The young girl and her mother were happy to see their dad. They hugged each other and smiled. They decided to head to the park on a beautiful sunny day. Someone from the park took a picture of them. They smiled as the person took the picture.\nD: The family missed their flight at the airport. The young girl and her mother were upset to see their dad. They cried and looked unhappy. They decided to stay home on a gloomy day. Someone from the park took a picture of them. They frowned as the person took the picture.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The family got lost at the airport. The young girl and her mother were confused to see their dad. They wandered around and looked lost. They decided to go to the beach on a cloudy day. Someone from the park took a picture of them. They looked puzzled as the person took the picture.\nB: The family had a fight at the airport. The young girl and her mother were angry to see their dad. They argued with each other and looked mad. They decided to go to the movies on a rainy day. Someone from the park took a picture of them. They scowled as the person took the picture.\nC: The family finally met at the airport. The young girl and her mother were happy to see their dad. They hugged each other and smiled. They decided to head to the park on a beautiful sunny day. Someone from the park took a picture of them. They smiled as the person took the picture.\nD: The family missed their flight at the airport. The young girl and her mother were upset to see their dad. They cried and looked unhappy. They decided to stay home on a gloomy day. Someone from the park took a picture of them. 
They frowned as the person took the picture.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_15_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_15_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_15_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_15_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_15_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: [male] was all set for the beach .he drove to the beach in his fancy car .he arrived at his fancy hotel .and he looked out the hotel window .still, the beach turned out to be quite ordinary .\nB: [male] was prepared for the beach trip in his old car at his ordinary hotel .he gazed out of his hotel window .nonetheless, the beach he intended to visit was not mundane at all .\nC: [female] was ready for the trip to the beach .he jumped in his luxury car .he made it to his luxury hotel .and he looked out his luxury window .however , the beach he planned to go to , was not luxurious at all .\nD: [female] was getting ready to go to the beach .she drove her car to her hotel .she had a view of the beach from her hotel .however, the beach looked disappointing .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: [male] was all set for the beach .he drove to the beach in his fancy car .he arrived at his fancy hotel .and he looked out the hotel window .still, the beach turned out to be quite ordinary .\nB: [male] was prepared for the beach trip in his old car at his ordinary hotel .he gazed out of his hotel window .nonetheless, the beach he intended to visit was not mundane at all .\nC: [female] was ready for the trip to the beach .he jumped in his 
luxury car .he made it to his luxury hotel .and he looked out his luxury window .however , the beach he planned to go to , was not luxurious at all .\nD: [female] was getting ready to go to the beach .she drove her car to her hotel .she had a view of the beach from her hotel .however, the beach looked disappointing .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_16_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_16_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_16_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_16_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_16_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the day at location location for the jones ' started with the tilt-a-whirl .next they went on the cyclone , a famous coaster that set the dad 's teeth on edge .the kids begged to go on the drop dive , which delivers a sheer drop from 70 feet .tired , they searched for a bathroom but all were out of order .finally , they found some public restrooms near the beach , and settled in for the afternoon .\nB: the day at location location for the smiths ' started with the merry-go-round .next they went on the giant drop, a thrilling ride that made the dad scream .the kids begged to go on the log flume, which splashes down from a great height .tired, they searched for a drink stand but all were sold out .finally, they found some lemonade stands near the beach, and refreshed themselves for the afternoon .\nC: the day at location location for the jones ' started with the carousel .next they went on the ferris wheel, a popular ride that made the dad feel dizzy .the kids begged to go on the roller coaster, which has loops and twists 
.tired, they searched for a food stall but all were closed .finally, they found some ice cream stalls near the beach, and treated themselves for the afternoon .\nD: the day at location location for the parkers ' started with the bumper cars .next they went on the haunted house, a spooky attraction that made the dad jump .the kids begged to go on the pirate ship, which swings back and forth .tired, they searched for a souvenir shop but all were closed .finally, they found some beach shops near the beach, and shopped for the afternoon .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the day at location location for the jones ' started with the tilt-a-whirl .next they went on the cyclone , a famous coaster that set the dad 's teeth on edge .the kids begged to go on the drop dive , which delivers a sheer drop from 70 feet .tired , they searched for a bathroom but all were out of order .finally , they found some public restrooms near the beach , and settled in for the afternoon .\nB: the day at location location for the smiths ' started with the merry-go-round .next they went on the giant drop, a thrilling ride that made the dad scream .the kids begged to go on the log flume, which splashes down from a great height .tired, they searched for a drink stand but all were sold out .finally, they found some lemonade stands near the beach, and refreshed themselves for the afternoon .\nC: the day at location location for the jones ' started with the carousel .next they went on the ferris wheel, a popular ride that made the dad feel dizzy .the kids begged to go on the roller coaster, which has loops and twists .tired, they searched for a food stall but all were closed .finally, they found some ice cream stalls near the beach, and treated themselves for the afternoon .\nD: the day at location location for the parkers ' started with the bumper cars .next they went on the haunted house, a spooky attraction that made the dad 
jump .the kids begged to go on the pirate ship, which swings back and forth .tired, they searched for a souvenir shop but all were closed .finally, they found some beach shops near the beach, and shopped for the afternoon .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_17_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_17_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_17_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_17_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_17_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: it was the annual meeting for pinoymac .employees celebrated the occasion .even those who could n't get away from the computer had fun .family and friends were invited .everyone got a free sweatshirt to commemorate the occasion .\nB: it was the birthday celebration for pinoymac .employees celebrated the occasion .even those who could n't get away from the computer had fun .family and friends were invited .everyone got a free sweatshirt to commemorate the occasion .\nC: it was the anniversary party for pinoymac .employees celebrated the occasion .even those who couldn't get away from the computer had fun .family and friends were invited .everyone got a free sweatshirt to commemorate the occasion .\nD: it was the farewell party for pinoymac .employees celebrated the occasion .even those who could n't get away from the computer had fun .family and friends were invited .everyone got a free sweatshirt to commemorate the occasion .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: it was the annual meeting for pinoymac .employees celebrated the occasion .even those 
who could n't get away from the computer had fun .family and friends were invited .everyone got a free sweatshirt to commemorate the occasion .\nB: it was the birthday celebration for pinoymac .employees celebrated the occasion .even those who could n't get away from the computer had fun .family and friends were invited .everyone got a free sweatshirt to commemorate the occasion .\nC: it was the anniversary party for pinoymac .employees celebrated the occasion .even those who couldn't get away from the computer had fun .family and friends were invited .everyone got a free sweatshirt to commemorate the occasion .\nD: it was the farewell party for pinoymac .employees celebrated the occasion .even those who could n't get away from the computer had fun .family and friends were invited .everyone got a free sweatshirt to commemorate the occasion .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_18_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_18_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_18_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_18_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_18_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: Everyone was optimistic about the future despite knowing the end was near.\nB: The group was filled with despair and sadness as they prepared for the end.\nC: we had all finally gathered together to make the plan come true .all of our history we talked about . we knew what needed to be done , but instead of somberness we found joy in the tasks .we sat together for the last time for the last meal we would have with each other .[male] was helping with the cooking tonight . 
his job was to make sure the special ingredient was added .after eating we all knew the end was coming . [female] touched my face , and we kissed the kiss of the damned .\nD: The group was somber and serious as they discussed their plan.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Everyone was optimistic about the future despite knowing the end was near.\nB: The group was filled with despair and sadness as they prepared for the end.\nC: we had all finally gathered together to make the plan come true .all of our history we talked about . we knew what needed to be done , but instead of somberness we found joy in the tasks .we sat together for the last time for the last meal we would have with each other .[male] was helping with the cooking tonight . his job was to make sure the special ingredient was added .after eating we all knew the end was coming . [female] touched my face , and we kissed the kiss of the damned .\nD: The group was somber and serious as they discussed their plan.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_19_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_19_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_19_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_19_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_19_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The girls\u2019 track coach gathered the team in front of her. A couple of the Orange Hills Crosscountry girls were not sure the meant them. Coach dismissed them, letting them both know that the training was not for their specialty. 
She then turned to the remaining three girls and gave them their training assignment. The three girls headed down the paved path, running together.\nB: The boys\u2019 track coach dismissed the team\nC: The girls were not interested in the training\nD: No one knew what the coach meant", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The girls\u2019 track coach gathered the team in front of her. A couple of the Orange Hills Crosscountry girls were not sure the meant them. Coach dismissed them, letting them both know that the training was not for their specialty. She then turned to the remaining three girls and gave them their training assignment. The three girls headed down the paved path, running together.\nB: The boys\u2019 track coach dismissed the team\nC: The girls were not interested in the training\nD: No one knew what the coach meant", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_20_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_20_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_20_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_20_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_20_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A quiet day at the beach. The fishermen didn't catch much initially, but later they spotted some fish. The fish were quite large in size. Let's see how the day unfolds.\nB: A busy day at the park. The fishermen were disappointed at first, but later they were able to catch some bigger fish. The fish were of various sizes. Let's see what happens next.\nC: A fun day at the river. The fishermen found many fish at the beginning. 
Later, they spotted some large fish. The fish were enormous. Let's see what happens later.\nD: Another day at the lake. There were not very many fish seen at first. Eventually, the guys started to see some activity. The fish were pretty small. They will see what the rest of the day brings.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A quiet day at the beach. The fishermen didn't catch much initially, but later they spotted some fish. The fish were quite large in size. Let's see how the day unfolds.\nB: A busy day at the park. The fishermen were disappointed at first, but later they were able to catch some bigger fish. The fish were of various sizes. Let's see what happens next.\nC: A fun day at the river. The fishermen found many fish at the beginning. Later, they spotted some large fish. The fish were enormous. Let's see what happens later.\nD: Another day at the lake. There were not very many fish seen at first. Eventually, the guys started to see some activity. The fish were pretty small. They will see what the rest of the day brings.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_21_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_21_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_21_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_21_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_21_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A look at a gorgeous lake on a cold winters day. One man decides its the perfect condition for fishing. The man caught a huge fish hes about to pack up and take home. 
After packing the fish in a cooler he looks back at it admiring his catch. The man then returns to his car and gives his dog a pet and a loving gaze before heading out.\nB: A view of a beautiful garden on a sunny afternoon. One woman decides its the perfect condition for gardening. The woman caught a butterfly and is about to release it.\nC: A view of a crowded beach on a hot summer day. One woman decides its the perfect condition for sunbathing. The woman caught a huge wave and is about to go surfing.\nD: A look at a cloudy sky on a rainy day. One man decides its the perfect condition for a hike. The man found a treasure and is about to take it home.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A look at a gorgeous lake on a cold winters day. One man decides its the perfect condition for fishing. The man caught a huge fish hes about to pack up and take home. After packing the fish in a cooler he looks back at it admiring his catch. The man then returns to his car and gives his dog a pet and a loving gaze before heading out.\nB: A view of a beautiful garden on a sunny afternoon. One woman decides its the perfect condition for gardening. The woman caught a butterfly and is about to release it.\nC: A view of a crowded beach on a hot summer day. One woman decides its the perfect condition for sunbathing. The woman caught a huge wave and is about to go surfing.\nD: A look at a cloudy sky on a rainy day. One man decides its the perfect condition for a hike. 
The man found a treasure and is about to take it home.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_22_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_22_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_22_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_22_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_22_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: a couple enjoying a picnic in the park\nB: a group of people taking a photo in front of a building\nC: a lady selfie shot with balloons and man a man see looks a ocean another selfie shot take with man a man click the photo riding boat in ocean men and women enjoying and take selfie at boat\nD: a man playing the guitar on the beach", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: a couple enjoying a picnic in the park\nB: a group of people taking a photo in front of a building\nC: a lady selfie shot with balloons and man a man see looks a ocean another selfie shot take with man a man click the photo riding boat in ocean men and women enjoying and take selfie at boat\nD: a man playing the guitar on the beach", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_23_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_23_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_23_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_23_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_23_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": 
"multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: [male] was performing at a large concert hall with a band. The audience was cheering and clapping loudly. He felt proud of his performance.\nB: [female] was performing at a small gathering of people . she played her guitar and sang .the people did not go near her and stayed at the far end of the room .[female] wondered was her singing bad ? she decided to just play the guitar .she knew a lot of songs and played them all .when [female] was finished , it was location 's turn to play guitar for the crowd .\nC: [female] was sitting alone in her room, playing the guitar and singing. She enjoyed the peaceful atmosphere and the sound of her music.\nD: A group of people were having a dance party in a spacious club. The DJ was playing energetic music and everyone was dancing enthusiastically.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: [male] was performing at a large concert hall with a band. The audience was cheering and clapping loudly. He felt proud of his performance.\nB: [female] was performing at a small gathering of people . she played her guitar and sang .the people did not go near her and stayed at the far end of the room .[female] wondered was her singing bad ? she decided to just play the guitar .she knew a lot of songs and played them all .when [female] was finished , it was location 's turn to play guitar for the crowd .\nC: [female] was sitting alone in her room, playing the guitar and singing. She enjoyed the peaceful atmosphere and the sound of her music.\nD: A group of people were having a dance party in a spacious club. 
The DJ was playing energetic music and everyone was dancing enthusiastically.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_24_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_24_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_24_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_24_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_24_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: They are 5 Children Ready to jump water They are swim to water They are keep going to under water They are enjoying with the water fall Finely They are going to home\nB: They are 5 Children Ready to jump water They are walk to water They are keep going to under water They are enjoying with the water fall Finely They are going to home\nC: They are 5 Adults Ready to jump water They are swim to water They are keep going to under water They are enjoying with the water fall Finely They are going to home\nD: They are 5 Children Ready to jump fire They are swim to water They are keep going to under water They are enjoying with the water fall Finely They are going to home", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: They are 5 Children Ready to jump water They are swim to water They are keep going to under water They are enjoying with the water fall Finely They are going to home\nB: They are 5 Children Ready to jump water They are walk to water They are keep going to under water They are enjoying with the water fall Finely They are going to home\nC: They are 5 Adults Ready to jump water They are swim to water They are keep going to under water They are enjoying with the water fall Finely They are going to 
home\nD: They are 5 Children Ready to jump fire They are swim to water They are keep going to under water They are enjoying with the water fall Finely They are going to home", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_25_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_25_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_25_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_25_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_25_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the fans are excited to finally see the race take place .the red race car zooms past the onlookers .not far behind was another race car .the fans cheered with excitement to see the cars go by so quickly .the crowd clears out because the race is finished .\nB: the fans are disappointed to miss the race\nC: the race cars move slowly as the fans lose interest\nD: the red race car crashes into the onlookers", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the fans are excited to finally see the race take place .the red race car zooms past the onlookers .not far behind was another race car .the fans cheered with excitement to see the cars go by so quickly .the crowd clears out because the race is finished .\nB: the fans are disappointed to miss the race\nC: the race cars move slowly as the fans lose interest\nD: the red race car crashes into the onlookers", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_26_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_26_1.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_26_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_26_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_26_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A man is delivering a package on his orange motorcycle.\nB: A man is waiting for his delivery to come.\nC: A [female] woman securely places her delivery within a burlap bag hosted on her orange moped. A husband[male] and wife[female] patiently wait for their delivery to come. The [woman] wife begins to prepare for her work while she is awaiting, lighting a cauldron. The [woman] delivery driver makes sure to follow safety precautions and puts on a white helmet. Ready to go, the [woman] delivery driver rides off on her orange moped.\nD: The husband is helping the wife with her work.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A man is delivering a package on his orange motorcycle.\nB: A man is waiting for his delivery to come.\nC: A [female] woman securely places her delivery within a burlap bag hosted on her orange moped. A husband[male] and wife[female] patiently wait for their delivery to come. The [woman] wife begins to prepare for her work while she is awaiting, lighting a cauldron. The [woman] delivery driver makes sure to follow safety precautions and puts on a white helmet. 
Ready to go, the [woman] delivery driver rides off on her orange moped.\nD: The husband is helping the wife with her work.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_27_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_27_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_27_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_27_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_27_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A family enjoying a picnic in the park.\nB: A couple taking a romantic stroll by the beach.\nC: A group of friends meet up to do some biking. Nice bike! One mentions to the kid with the red bike. And they head on down the trail. Bosco [male] stops for a selfie, It's such a nice day. At the end of their journey one does a quick trick to show off before leaving.\nD: A group of students studying for an exam in the library.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A family enjoying a picnic in the park.\nB: A couple taking a romantic stroll by the beach.\nC: A group of friends meet up to do some biking. Nice bike! One mentions to the kid with the red bike. And they head on down the trail. Bosco [male] stops for a selfie, It's such a nice day. 
At the end of their journey one does a quick trick to show off before leaving.\nD: A group of students studying for an exam in the library.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_28_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_28_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_28_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_28_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_28_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The family is struggling to push the stroller in bad weather.\nB: The family is arguing and can't decide where to go next.\nC: The family is lost and doesn't know where to go next.\nD: The family decides to visit a new location and takes a selfie. The male and female are pushing a stroller on a sunny day. The family huddle up to talk about what to do next at the chosen location. The female is happy and ready to explore. The family starts walking to the next location.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The family is struggling to push the stroller in bad weather.\nB: The family is arguing and can't decide where to go next.\nC: The family is lost and doesn't know where to go next.\nD: The family decides to visit a new location and takes a selfie. The male and female are pushing a stroller on a sunny day. The family huddle up to talk about what to do next at the chosen location. The female is happy and ready to explore. 
The family starts walking to the next location.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_29_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_29_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_29_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_29_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_29_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Jessy was in bright green clothes and was calm and relaxed.\nB: hotel heaven room no 112 was packed with journalists patiently waiting for the guest. and particularly jessy in white clothes was looked edgy, to see the three sisters elena, marina, and sabrina who were her arch rivals then the guest madam mercury entered the room no 112 looking puzzled to see a mysery woman wearing a white pearl on her ear.....she was the owner of the hotel ms gomes\nC: The hotel room was empty and no one was waiting for any guest.\nD: There were no journalists in the hotel room and no one was waiting for any guest.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Jessy was in bright green clothes and was calm and relaxed.\nB: hotel heaven room no 112 was packed with journalists patiently waiting for the guest. 
and particularly jessy in white clothes was looked edgy, to see the three sisters elena, marina, and sabrina who were her arch rivals then the guest madam mercury entered the room no 112 looking puzzled to see a mysery woman wearing a white pearl on her ear.....she was the owner of the hotel ms gomes\nC: The hotel room was empty and no one was waiting for any guest.\nD: There were no journalists in the hotel room and no one was waiting for any guest.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_30_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_30_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_30_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_30_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_30_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: they were on a business trip to a wealthy country where they experienced luxurious living\nB: they explored a developed nation with modern infrastructure and high standard of living\nC: we went on a vacation to location to explore the country .we found a lot of nice people and interesting sights .the country overall was fairly poor and we felt bad at times .the rivers were wild and mostly used for fishing .many people make their living fishing\nD: they visited an affluent country with beautiful landscapes and prosperous people", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: they were on a business trip to a wealthy country where they experienced luxurious living\nB: they explored a developed nation with modern infrastructure and high standard of living\nC: we went on a vacation to location to explore the country .we found a lot of 
nice people and interesting sights .the country overall was fairly poor and we felt bad at times .the rivers were wild and mostly used for fishing .many people make their living fishing\nD: they visited an affluent country with beautiful landscapes and prosperous people", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_31_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_31_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_31_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_31_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_31_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: tourists love to take photos of the river and old buildings .lovers come to the bridge to symbolize their love with locks .the city's local art work is popular among the locals .statues are a significant part of the city's attractions .\nB: the bridge has a rustic charm that attracts visitors .lovers often leave locks on the bridge as a symbol of commitment .many tourists enjoy posing in front of the beautiful art work .the city is known for its famous statues .\nC: the bridge is a lovely part of the city .people show their love by placing locks on the bridge .people from all over like to pose in front of local art work .statues are always a welcome sight for tourists. 
& # 13 ;ice skating is a wonderful time for many .\nD: the river is the main attraction of the city .tourists love to take photos in front of the old buildings .people like to attach love locks on the buildings .the local art work draws a lot of attention .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: tourists love to take photos of the river and old buildings .lovers come to the bridge to symbolize their love with locks .the city's local art work is popular among the locals .statues are a significant part of the city's attractions .\nB: the bridge has a rustic charm that attracts visitors .lovers often leave locks on the bridge as a symbol of commitment .many tourists enjoy posing in front of the beautiful art work .the city is known for its famous statues .\nC: the bridge is a lovely part of the city .people show their love by placing locks on the bridge .people from all over like to pose in front of local art work .statues are always a welcome sight for tourists. 
& # 13 ;ice skating is a wonderful time for many .\nD: the river is the main attraction of the city .tourists love to take photos in front of the old buildings .people like to attach love locks on the buildings .the local art work draws a lot of attention .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_32_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_32_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_32_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_32_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_32_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: a group of people are gathered around outside in a gloomy and polluted winter day\nB: the villagers skate, ski, and snowboard to get to their destination in the desert\nC: there is a group of people scattered around and specific corner of the market\nD: a community of people are gathered around outside on a nice, beautiful and green summer day The villagers walk, bike, and drive to get to their destination at the market, there is a group of people gathered around and specific corner of the market The men gather around the market square the deal was on bananas", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: a group of people are gathered around outside in a gloomy and polluted winter day\nB: the villagers skate, ski, and snowboard to get to their destination in the desert\nC: there is a group of people scattered around and specific corner of the market\nD: a community of people are gathered around outside on a nice, beautiful and green summer day The villagers walk, bike, and drive to get to their destination at the market, 
there is a group of people gathered around and specific corner of the market The men gather around the market square the deal was on bananas", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_33_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_33_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_33_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_33_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_33_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: An elderly woman is spotted at the festival wearing colorful headwear.\nB: A woman is holding a child during a festival while the child looks at the camera. An elderly woman is spotted at the festival wearing colorful headwear. Another elderly woman can also be seen at the festival wearing colorful headwear. Here is a group of performers in uniform from the festival. It is a hot and sunny day and the performers are tired.\nC: A woman is holding a child during a festival while the child looks at the camera.\nD: A group of performers in uniform from the festival are excited and energized.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: An elderly woman is spotted at the festival wearing colorful headwear.\nB: A woman is holding a child during a festival while the child looks at the camera. An elderly woman is spotted at the festival wearing colorful headwear. Another elderly woman can also be seen at the festival wearing colorful headwear. Here is a group of performers in uniform from the festival. 
It is a hot and sunny day and the performers are tired.\nC: A woman is holding a child during a festival while the child looks at the camera.\nD: A group of performers in uniform from the festival are excited and energized.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_34_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_34_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_34_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_34_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_34_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A closet has a lot of sneakers within it. There is a pair of yellow sneakers on a shelf within the closet. It appears all the shelves are filled with sneakers. A pair of red sneakers has a chevron stripe on them. Someone comes into the closet to select a pair of sneakers.\nB: A shoe store with different types of shoes on display.\nC: A pantry with various food items neatly placed inside it.\nD: A wardrobe filled with clothes and accessories.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A closet has a lot of sneakers within it. There is a pair of yellow sneakers on a shelf within the closet. It appears all the shelves are filled with sneakers. A pair of red sneakers has a chevron stripe on them. 
Someone comes into the closet to select a pair of sneakers.\nB: A shoe store with different types of shoes on display.\nC: A pantry with various food items neatly placed inside it.\nD: A wardrobe filled with clothes and accessories.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_35_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_35_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_35_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_35_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_35_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: the mountains have a snowy weather the land is filled with dense forests the hills are used for rock climbing there is a river flowing between the fields the man is hiking alone\nB: the mountains have a rainy weather the land is covered with tall buildings the hills are used for grazing cattle there is no road between the fields the man is riding a bicycle\nC: the mountains have a clear weather the land is barren without any trees the hills have no farms there is no road between the fields the man is sitting in the jeep and not showing the place to the blue shirt man\nD: the mountains have a cloudy weather the land is having cut farms with trees the hills have farms for the cultivation of rice there was a road in between the fields the man is driving a jeep and showing the place to the blue shirt man", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the mountains have a snowy weather the land is filled with dense forests the hills are used for rock climbing there is a river flowing between the fields the man is hiking alone\nB: the mountains have a 
rainy weather the land is covered with tall buildings the hills are used for grazing cattle there is no road between the fields the man is riding a bicycle\nC: the mountains have a clear weather the land is barren without any trees the hills have no farms there is no road between the fields the man is sitting in the jeep and not showing the place to the blue shirt man\nD: the mountains have a cloudy weather the land is having cut farms with trees the hills have farms for the cultivation of rice there was a road in between the fields the man is driving a jeep and showing the place to the blue shirt man", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_36_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_36_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_36_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_36_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_36_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: it was an adventurous day for tavi and his friends .they had some snacks while waiting for everyone to arrive .then came the challenges , each thrilling him more .and what adventure would be complete without a trophy !at the end of the day he reminds us he is now 8 .\nB: it was a very special birthday for tavi this year .he and his friends had some snacks while waiting for everyone to arrive .then came the gifts , each exiting him more .and what birthday would be complete without a cake !at the end of the party he reminds us he is now 4 .\nC: it was an ordinary day in the park for tavi and his friends .they had some snacks while waiting for everyone to arrive .then came the games , each exciting him more .and what day would be complete 
without a song !at the end of the day he reminds us he is now 6 .\nD: it was a very boring day for tavi this year .he and his friends had some snacks while waiting for everyone to arrive .then came the homework , each exhausting him more .and what day would be complete without a nap !at the end of the day he reminds us he is now 10 .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: it was an adventurous day for tavi and his friends .they had some snacks while waiting for everyone to arrive .then came the challenges , each thrilling him more .and what adventure would be complete without a trophy !at the end of the day he reminds us he is now 8 .\nB: it was a very special birthday for tavi this year .he and his friends had some snacks while waiting for everyone to arrive .then came the gifts , each exiting him more .and what birthday would be complete without a cake !at the end of the party he reminds us he is now 4 .\nC: it was an ordinary day in the park for tavi and his friends .they had some snacks while waiting for everyone to arrive .then came the games , each exciting him more .and what day would be complete without a song !at the end of the day he reminds us he is now 6 .\nD: it was a very boring day for tavi this year .he and his friends had some snacks while waiting for everyone to arrive .then came the homework , each exhausting him more .and what day would be complete without a nap !at the end of the day he reminds us he is now 10 .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_37_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_37_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_37_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_37_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_37_4.jpg"], 
"output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: One day a family went on a road trip in their van. They set out in the early morning, took the scenic route, and made several stops at picturesque locations. As the sun set, they arrived back home, concluding their eventful road trip.\nB: One day a man took a bicycle and went on a mountain biking trip. He started his journey in the afternoon, took several turns and enjoyed the adventure. Finally, he reached a hilltop at sunset, concluding his thrilling mountain biking trip.\nC: One day a man take a car and went a trip for somewhere. He wanted to take video of his trip. Then he take his mobile phone and fix in the win shield like front glass of the car. He started his car in the morning, went on the high ways, don't take any turns. He enjoyed. In the end, he came to his car shed at night, his travel trip was the end and full of high ways.\nD: One day a woman drove her car to the beach and filmed the beautiful sunset. She enjoyed the cool breeze and the sound of the waves. As the night fell, she returned home, completing her tranquil beach trip.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: One day a family went on a road trip in their van. They set out in the early morning, took the scenic route, and made several stops at picturesque locations. As the sun set, they arrived back home, concluding their eventful road trip.\nB: One day a man took a bicycle and went on a mountain biking trip. He started his journey in the afternoon, took several turns and enjoyed the adventure. Finally, he reached a hilltop at sunset, concluding his thrilling mountain biking trip.\nC: One day a man take a car and went a trip for somewhere. He wanted to take video of his trip. Then he take his mobile phone and fix in the win shield like front glass of the car. 
He started his car in the morning, went on the high ways, don't take any turns. He enjoyed. In the end, he came to his car shed at night, his travel trip was the end and full of high ways.\nD: One day a woman drove her car to the beach and filmed the beautiful sunset. She enjoyed the cool breeze and the sound of the waves. As the night fell, she returned home, completing her tranquil beach trip.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_38_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_38_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_38_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_38_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_38_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the sisters are having drinks to celebrate .it 's our little sister 's baby shower and she 's looking at the gifts .she opened the present to find it 's a gift for a baby toy .another present is a baby 's outfit .everyone is gathered around the table to watch her open her gifts .\nB: the brothers are having a party with drinks\nC: it 's a wedding celebration with gifts and presents\nD: a group of friends are having a birthday party", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the sisters are having drinks to celebrate .it 's our little sister 's baby shower and she 's looking at the gifts .she opened the present to find it 's a gift for a baby toy .another present is a baby 's outfit .everyone is gathered around the table to watch her open her gifts .\nB: the brothers are having a party with drinks\nC: it 's a wedding celebration with gifts and presents\nD: a group of friends are 
having a birthday party", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_39_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_39_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_39_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_39_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_39_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the girl has created a powerful stick creature on the atoll\nB: the stick man is merely a symbol of the girl's creation\nC: the giant stick man will conquer all the humans .he stands mightily on his mound of pixels .this forceful center is the source all of his power .all hail the giant stick creature !it is really just atoll created by this girl .\nD: the giant stick man is controlling a small group of people", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the girl has created a powerful stick creature on the atoll\nB: the stick man is merely a symbol of the girl's creation\nC: the giant stick man will conquer all the humans .he stands mightily on his mound of pixels .this forceful center is the source all of his power .all hail the giant stick creature !it is really just atoll created by this girl .\nD: the giant stick man is controlling a small group of people", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_40_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_40_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_40_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_40_3.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_40_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the family enjoyed a peaceful day at the park .they looked at a massive aquarium .they could even see a range of animals .after the sunset, they admired a stunning view of the city .they went to the market and talked about the fantastic trip they had .\nB: a group of friends spent a relaxing day at the lake .they checked out a gigantic stadium .they could even observe numerous athletes .as the night descended, they beheld the spectacular sights of the skyscrapers .they visited the stores and reminisced about the amazing adventure they had .\nC: a couple had a pleasant outing by the river .they witnessed a colossal gathering .they even noticed all the individuals .when the night came, they witnessed the lovely hues of the cityscape .they journeyed to the malls and reflected on the wonderful outing they had .\nD: the family took a nice trip to the beach .they saw an enormous organization organization .they even got to see all the people on the beach .when the sun fell , they could see the beautiful colors on the skyline .they traveled to the shops and thought about the great day they had .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the family enjoyed a peaceful day at the park .they looked at a massive aquarium .they could even see a range of animals .after the sunset, they admired a stunning view of the city .they went to the market and talked about the fantastic trip they had .\nB: a group of friends spent a relaxing day at the lake .they checked out a gigantic stadium .they could even observe numerous athletes .as the night descended, they beheld the spectacular sights of the skyscrapers .they visited the stores and reminisced about the amazing 
adventure they had .\nC: a couple had a pleasant outing by the river .they witnessed a colossal gathering .they even noticed all the individuals .when the night came, they witnessed the lovely hues of the cityscape .they journeyed to the malls and reflected on the wonderful outing they had .\nD: the family took a nice trip to the beach .they saw an enormous organization organization .they even got to see all the people on the beach .when the sun fell , they could see the beautiful colors on the skyline .they traveled to the shops and thought about the great day they had .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_41_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_41_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_41_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_41_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_41_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: the images depict a busy market with vendors selling various items\nB: the images represent a construction site with workers building a new structure\nC: the images show a city street with a person walking a dog\nD: we are in a temple with a bell under a wooden arch painted red a woman with a red backpack is walking people walk in the aisles outside a small car is parked in front of a porch a man installs things on a large table", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the images depict a busy market with vendors selling various items\nB: the images represent a construction site with workers building a new structure\nC: the images show a city street with a person walking a dog\nD: we are in a temple 
with a bell under a wooden arch painted red a woman with a red backpack is walking people walk in the aisles outside a small car is parked in front of a porch a man installs things on a large table", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_42_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_42_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_42_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_42_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_42_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A man in a red bandana is seen talking to food merchants. Here is some food being sold at the stall. The man is sitting down with his food order and talking. The man tries his food with chopsticks. Noodles and vegetables in brown sauce between the chopsticks.\nB: A man is buying groceries at a market.\nC: A man is having a conversation with friends at a restaurant.\nD: A man is sampling different dishes at a food festival.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A man in a red bandana is seen talking to food merchants. Here is some food being sold at the stall. The man is sitting down with his food order and talking. The man tries his food with chopsticks. 
Noodles and vegetables in brown sauce between the chopsticks.\nB: A man is buying groceries at a market.\nC: A man is having a conversation with friends at a restaurant.\nD: A man is sampling different dishes at a food festival.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_43_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_43_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_43_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_43_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_43_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: one of my favorite eateries from vacation this year was boudin . this bakery is not popular and not recommended.\nB: one of my favorite eateries from vacation this year was boudin . this bakery is found in many locations in the location .sculpted bread is a favorite , especially with our kids ! look at the cute turtle .the breads are made from sourdough , hearth breads , and other specialties and the prices are reasonable .we even had the opportunity to watch the bakers in action .if you visit a town with a organization organization organization organization organization , be sure to stop in !\nC: The breads are not fresh and made from low-quality ingredients.\nD: This bakery is only found in one location and it's not very good.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: one of my favorite eateries from vacation this year was boudin . this bakery is not popular and not recommended.\nB: one of my favorite eateries from vacation this year was boudin . 
this bakery is found in many locations in the location .sculpted bread is a favorite , especially with our kids ! look at the cute turtle .the breads are made from sourdough , hearth breads , and other specialties and the prices are reasonable .we even had the opportunity to watch the bakers in action .if you visit a town with a organization organization organization organization organization , be sure to stop in !\nC: The breads are not fresh and made from low-quality ingredients.\nD: This bakery is only found in one location and it's not very good.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_44_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_44_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_44_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_44_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_44_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: they all went to a fancy dinner at a restaurant\nB: we all met at the club to celebrate his big night .she was sure happy to see me there .we started to take goofy pictures after a few drinks .i think he was even happier to see his friend .it was a great time had by all .\nC: they went to the beach and played volleyball\nD: it was a quiet and boring evening at home", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: they all went to a fancy dinner at a restaurant\nB: we all met at the club to celebrate his big night .she was sure happy to see me there .we started to take goofy pictures after a few drinks .i think he was even happier to see his friend .it was a great time had by all .\nC: they went to the beach and played 
volleyball\nD: it was a quiet and boring evening at home", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_45_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_45_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_45_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_45_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_45_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: we went to the beach for a picnic .the weather was perfect .we played games and enjoyed the sunshine .it was a great day .\nB: we gathered at the state park for the annual fireworks show .it started with a bang ! literally .the show was breathtaking . the kids loved every minute .is that a heart in there ? i think it was supposed to be .all the oohs and aahs finally came to an end . the kids wanted more !\nC: the family had a barbecue in the backyard .we grilled burgers and hot dogs .there were lots of laughs and good food .it was a memorable evening .\nD: we visited the museum and saw amazing exhibits .the kids were fascinated by the artifacts .we learned a lot and had a great time . it was a lovely experience .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: we went to the beach for a picnic .the weather was perfect .we played games and enjoyed the sunshine .it was a great day .\nB: we gathered at the state park for the annual fireworks show .it started with a bang ! literally .the show was breathtaking . the kids loved every minute .is that a heart in there ? i think it was supposed to be .all the oohs and aahs finally came to an end . 
the kids wanted more !\nC: the family had a barbecue in the backyard .we grilled burgers and hot dogs .there were lots of laughs and good food .it was a memorable evening .\nD: we visited the museum and saw amazing exhibits .the kids were fascinated by the artifacts .we learned a lot and had a great time . it was a lovely experience .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_46_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_46_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_46_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_46_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_46_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: this photo was taken on a boring day with nothing special to see\nB: i don't really like any of these pictures, they're all pretty boring\nC: these photos are not worth sharing, they're not interesting at all\nD: i had to post some of these amazing photos from vacationi love this photo of the rocks out in the oceanwe spent this day walking the beach looking for shellsthere are some amazing view of the beautiful waterthis is one of my favorites on the last day visiting", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: this photo was taken on a boring day with nothing special to see\nB: i don't really like any of these pictures, they're all pretty boring\nC: these photos are not worth sharing, they're not interesting at all\nD: i had to post some of these amazing photos from vacationi love this photo of the rocks out in the oceanwe spent this day walking the beach looking for shellsthere are some amazing view of the beautiful waterthis is one of my 
favorites on the last day visiting", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_47_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_47_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_47_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_47_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_47_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: here is the car that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . we had such a memorable , fun trip .\nB: here is the plane that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . we had such a memorable , fun trip .\nC: here is the train that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . we had such a memorable , fun trip .\nD: here is the boat that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . 
we had such a memorable , fun trip .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: here is the car that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . we had such a memorable , fun trip .\nB: here is the plane that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . we had such a memorable , fun trip .\nC: here is the train that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . we had such a memorable , fun trip .\nD: here is the boat that we took to the national park ! how cool is that ?we arrived safely at location location location , and were excited to start our trip .here is a beautiful little waterfall .the views were so spectacular ! look at those mountains !we got some great fly-fishing in . 
we had such a memorable , fun trip .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_48_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_48_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_48_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_48_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_48_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: [child] is opening presents for their birthday.\nB: [male] is pregnant and having a baby shower .her first gift looks like a big one .there 's an organization book in it .she also pulls out a onesie .and finally , some little hand booties so the baby does not scratch their cheeks .\nC: [mother] is shopping for baby clothes and accessories.\nD: [female] is attending a baby shower and receiving gifts.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: [child] is opening presents for their birthday.\nB: [male] is pregnant and having a baby shower .her first gift looks like a big one .there 's an organization book in it .she also pulls out a onesie .and finally , some little hand booties so the baby does not scratch their cheeks .\nC: [mother] is shopping for baby clothes and accessories.\nD: [female] is attending a baby shower and receiving gifts.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_49_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_49_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_49_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_49_3.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_49_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A male subject hads food on a plate and two other males are watching. Food is being prepaired on a grill with the use of tongs. Several food items are being displayed from above on a table. A group of children are gathered around a table with food located in front of them. Two females are seated on a couch on is talking and the other is eating.\nB: A female subject is cooking on a grill while others are watching.\nC: Two males are seated on a couch, one is talking and the other is eating.\nD: A group of adults are gathered around a table with food located in front of them.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A male subject hads food on a plate and two other males are watching. Food is being prepaired on a grill with the use of tongs. Several food items are being displayed from above on a table. A group of children are gathered around a table with food located in front of them. 
Two females are seated on a couch on is talking and the other is eating.\nB: A female subject is cooking on a grill while others are watching.\nC: Two males are seated on a couch, one is talking and the other is eating.\nD: A group of adults are gathered around a table with food located in front of them.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_50_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_50_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_50_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_50_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_50_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The couple was dancing on the beach. They practised some dance moves. They stopped dancing. They decided to go for a swim. The couple started to run towards the ocean.\nB: The couple was shopping in the city. They visited different stores. They bought souvenirs. They ended the day with a nice dinner.\nC: The couple was having a picnic on the beach. They enjoyed some food. They flew a kite. They relaxed on the sand.\nD: The couple was hiking in the mountains. They explored the trails. They took photos of the scenery. They enjoyed the fresh air.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The couple was dancing on the beach. They practised some dance moves. They stopped dancing. They decided to go for a swim. The couple started to run towards the ocean.\nB: The couple was shopping in the city. They visited different stores. They bought souvenirs. They ended the day with a nice dinner.\nC: The couple was having a picnic on the beach. They enjoyed some food. They flew a kite. 
They relaxed on the sand.\nD: The couple was hiking in the mountains. They explored the trails. They took photos of the scenery. They enjoyed the fresh air.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_51_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_51_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_51_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_51_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_51_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: One day I woke up really tired and stayed in bed all day. I didn't feel like doing anything. I spent the day alone and didn't have anyone to talk to. It was a really boring and uneventful day.\nB: One day I woke up really hungry and headed downtown to my favorite cafe. The food was really good. I met up with my friends and their kid and we all had some laughs. We said our goodbyes and then headed our separate ways. As I drove back home I reflected on the beautiful day and felt appreciation for my life.\nC: One day I woke up feeling unwell and decided to stay home. I watched movies all day and ordered some food delivery. I didn't feel like going out at all and just wanted to rest.\nD: One day I woke up early and went for a long walk in the park. I enjoyed the fresh air and the beautiful scenery. I saw some cute animals and took some photos. It was a peaceful and relaxing day.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: One day I woke up really tired and stayed in bed all day. I didn't feel like doing anything. I spent the day alone and didn't have anyone to talk to. 
It was a really boring and uneventful day.\nB: One day I woke up really hungry and headed downtown to my favorite cafe. The food was really good. I met up with my friends and their kid and we all had some laughs. We said our goodbyes and then headed our separate ways. As I drove back home I reflected on the beautiful day and felt appreciation for my life.\nC: One day I woke up feeling unwell and decided to stay home. I watched movies all day and ordered some food delivery. I didn't feel like going out at all and just wanted to rest.\nD: One day I woke up early and went for a long walk in the park. I enjoyed the fresh air and the beautiful scenery. I saw some cute animals and took some photos. It was a peaceful and relaxing day.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_52_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_52_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_52_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_52_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_52_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A woman holding a bag of apples\nB: A group of people walking in the city\nC: A male carrying bananas. A person riding a bike with a tree on it. A beautiful sky up in the mountains. Trail up in the mountains. Several people riding bikes with trees on their backs.\nD: A car driving through a desert", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A woman holding a bag of apples\nB: A group of people walking in the city\nC: A male carrying bananas. A person riding a bike with a tree on it. A beautiful sky up in the mountains. Trail up in the mountains. 
Several people riding bikes with trees on their backs.\nD: A car driving through a desert", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_53_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_53_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_53_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_53_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_53_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A family picnicking in a park\nB: Hiking in a snowstorm\nC: A small group of people prepare to climb a rocky hill. A person in a red jacket is the first to climb. Another person that is climbing is wearing a green jacket. The person in the red jacket is wearing black boots. One of the members of the small group is wearing a scarf to cover most of their face.\nD: Rock climbing in a desert", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A family picnicking in a park\nB: Hiking in a snowstorm\nC: A small group of people prepare to climb a rocky hill. A person in a red jacket is the first to climb. Another person that is climbing is wearing a green jacket. The person in the red jacket is wearing black boots. 
One of the members of the small group is wearing a scarf to cover most of their face.\nD: Rock climbing in a desert", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_54_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_54_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_54_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_54_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_54_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: they went for a hike in the mountains\nB: they spent the day shopping in the city\nC: it was a fun day at the beach\nD: it was time for a little bike ridingfirst it was one on onethen it become two on twoit was a great time at the racethey enjoyed dinner afterwards", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: they went for a hike in the mountains\nB: they spent the day shopping in the city\nC: it was a fun day at the beach\nD: it was time for a little bike ridingfirst it was one on onethen it become two on twoit was a great time at the racethey enjoyed dinner afterwards", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_55_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_55_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_55_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_55_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_55_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or 
Natural image", "source": "SSID", "options": "A: The man and woman were cooking dinner at home.\nB: The man and woman were attending a formal dinner event.\nC: The couple were having a picnic in the park.\nD: The woman and man were eating some street food. The food was on the stick. They dipped the stick in the broth. The man enjoyed the meal. A woman smiled at him.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The man and woman were cooking dinner at home.\nB: The man and woman were attending a formal dinner event.\nC: The couple were having a picnic in the park.\nD: The woman and man were eating some street food. The food was on the stick. They dipped the stick in the broth. The man enjoyed the meal. A woman smiled at him.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_56_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_56_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_56_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_56_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_56_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A guy is waving at a family. The family look back at him. A kid is trying to sell something. The guys buy from that family. The kid does book keeping.\nB: A guy is yelling at a family. The family ignore him. A kid is trying to sell something. The guys buy from that family. The kid does book keeping.\nC: A guy is waving at a family. The family look back at him. A kid is trying to sell something. The guys ignore the family. The kid does book keeping.\nD: A guy is waving at a family. The family look back at him. A kid is playing with toys. 
The guys buy from that family. The kid does book keeping.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A guy is waving at a family. The family look back at him. A kid is trying to sell something. The guys buy from that family. The kid does book keeping.\nB: A guy is yelling at a family. The family ignore him. A kid is trying to sell something. The guys buy from that family. The kid does book keeping.\nC: A guy is waving at a family. The family look back at him. A kid is trying to sell something. The guys ignore the family. The kid does book keeping.\nD: A guy is waving at a family. The family look back at him. A kid is playing with toys. The guys buy from that family. The kid does book keeping.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_57_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_57_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_57_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_57_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_57_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: a crowded hotel in a beautiful location with well-maintained surroundings.\nB: an upscale resort with luxurious accommodations and stunning views.\nC: a touristy area with clean streets and high-end shopping.\nD: my friend and i went traveling around location last summer .we walked through the market and met a man selling meat .we stayed in this dumpy hotel .the view from the hotel was a graffitied out abandoned building .there was an artist who set up toys on the ground outside our hotel .", "question": "Describe this set of images briefly.", "context": "Select from the following 
choices.\nA: a crowded hotel in a beautiful location with well-maintained surroundings.\nB: an upscale resort with luxurious accommodations and stunning views.\nC: a touristy area with clean streets and high-end shopping.\nD: my friend and i went traveling around location last summer .we walked through the market and met a man selling meat .we stayed in this dumpy hotel .the view from the hotel was a graffitied out abandoned building .there was an artist who set up toys on the ground outside our hotel .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_58_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_58_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_58_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_58_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_58_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the album was in the basement\nB: there were no old photos in the album\nC: i found the album at the store\nD: i was going through an old album last week .there were so many old pieces of memorabilia in there .i had a lot of fun looking at all of the old photos .they all reminded me of good times .i was very happy to have found the album in the attic .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the album was in the basement\nB: there were no old photos in the album\nC: i found the album at the store\nD: i was going through an old album last week .there were so many old pieces of memorabilia in there .i had a lot of fun looking at all of the old photos .they all reminded me of good times .i was very happy to have found the album in the attic .", "input_image_path": 
["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_59_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_59_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_59_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_59_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_59_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A male is standing eating a hamburger. The male with the hot dog has a red ball cap on.\nB: A male has answered the door and a female in a black shirt is standing on the other side.\nC: A female has answered the door and a male in a black shirt is standing on the other side. Another male is standing eating a hot dog. The male with the hot dog has a blue ball cap on. A female is peering out from behind a window from the inside of her home. Two women are working in an office.\nD: A female is peering out from behind a door from the inside of her home. Two men are working in an office.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A male is standing eating a hamburger. The male with the hot dog has a red ball cap on.\nB: A male has answered the door and a female in a black shirt is standing on the other side.\nC: A female has answered the door and a male in a black shirt is standing on the other side. Another male is standing eating a hot dog. The male with the hot dog has a blue ball cap on. A female is peering out from behind a window from the inside of her home. Two women are working in an office.\nD: A female is peering out from behind a door from the inside of her home. 
Two men are working in an office.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_60_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_60_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_60_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_60_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_60_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: They are at a music concert. They are dancing and singing along with the crowd.\nB: They are in a off road bike rally. They enjoying their weekend in this event. A small boy also running his cycle in this place. They bring their bikes for repairing in the workshop. All girls are enjoying the race and this one girl is drinking the juice.\nC: They are at a beach party. They are playing volleyball and sunbathing.\nD: They are at a car show. They are admiring the vintage cars.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: They are at a music concert. They are dancing and singing along with the crowd.\nB: They are in a off road bike rally. They enjoying their weekend in this event. A small boy also running his cycle in this place. They bring their bikes for repairing in the workshop. All girls are enjoying the race and this one girl is drinking the juice.\nC: They are at a beach party. They are playing volleyball and sunbathing.\nD: They are at a car show. 
They are admiring the vintage cars.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_61_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_61_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_61_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_61_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_61_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: The family enjoys a peaceful evening\nB: our family always does a fireworks display for the 4th of july .my brother shot one off of the balcony that nearly caught the house on fire .we finally got the hang of it by the third shot fired .they lit the sky up in magnificent beauty .i had to be careful though because the fireworks were a bit faulty and were not wanting to burn right all night long .\nC: The family enjoys a bonfire on 4th of July\nD: The family does not celebrate any festival", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The family enjoys a peaceful evening\nB: our family always does a fireworks display for the 4th of july .my brother shot one off of the balcony that nearly caught the house on fire .we finally got the hang of it by the third shot fired .they lit the sky up in magnificent beauty .i had to be careful though because the fireworks were a bit faulty and were not wanting to burn right all night long .\nC: The family enjoys a bonfire on 4th of July\nD: The family does not celebrate any festival", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_62_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_62_1.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_62_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_62_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_62_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: this area is barren with no natural beauty .the ocean water is murky and polluted .the rocks are eroded and unattractive .the beach is surrounded by unsightly buildings .\nB: this area is filled with nature 's beauty , like the clean sand and blue ocean water .the rocks carved by the waves are stunning to look at .the naturally formed arched rock feature is especially stunning to look at .the rock formation perfectly frames the ocean waves .such a beautiful beach is starkly contrasted by the asphalt road nearby .\nC: this area is a desert with no water or vegetation .the rocks are featureless and dull .the beach is littered with garbage and waste .the asphalt road is the only notable feature .\nD: this area is filled with industrial structures and pollution .the rocks are carved with graffiti and vandalism .the rock formation looks ordinary and unimpressive .the beach is crowded with litter and debris .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: this area is barren with no natural beauty .the ocean water is murky and polluted .the rocks are eroded and unattractive .the beach is surrounded by unsightly buildings .\nB: this area is filled with nature 's beauty , like the clean sand and blue ocean water .the rocks carved by the waves are stunning to look at .the naturally formed arched rock feature is especially stunning to look at .the rock formation perfectly frames the ocean waves .such a beautiful beach is starkly contrasted by the asphalt road nearby .\nC: this area is 
a desert with no water or vegetation .the rocks are featureless and dull .the beach is littered with garbage and waste .the asphalt road is the only notable feature .\nD: this area is filled with industrial structures and pollution .the rocks are carved with graffiti and vandalism .the rock formation looks ordinary and unimpressive .the beach is crowded with litter and debris .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_63_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_63_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_63_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_63_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_63_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: this weekend we went to a serene beach .the sandy beach was calm , and peaceful .the ocean was quiet and tranquil , it was soothing to swim in the water !there was a pier for fishing .the wooden and serene pier was relaxing .\nB: this weekend we went to a bustling city .the crowded streets were lively, and noisy .the market was busy and crowded , it was chaotic to walk in the crowd !there was a shopping complex for entertainment .the vibrant and colorful shops were eye-catching .\nC: this weekend we went to a peace garden .the colorful flowers were beautiful , and smelled wonderful .the gazebo was quiet and peaceful , it was relaxing to sit in the shade !there was a religious garden for meditation .the bright and colorful flowers were breathtaking .\nD: this weekend we went to a busy park .the green trees were beautiful , and smelled fresh .the playground was noisy and crowded , it was tiring to play in the sun !there was a fountain for photography .the clear 
and beautiful water was refreshing .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: this weekend we went to a serene beach .the sandy beach was calm , and peaceful .the ocean was quiet and tranquil , it was soothing to swim in the water !there was a pier for fishing .the wooden and serene pier was relaxing .\nB: this weekend we went to a bustling city .the crowded streets were lively, and noisy .the market was busy and crowded , it was chaotic to walk in the crowd !there was a shopping complex for entertainment .the vibrant and colorful shops were eye-catching .\nC: this weekend we went to a peace garden .the colorful flowers were beautiful , and smelled wonderful .the gazebo was quiet and peaceful , it was relaxing to sit in the shade !there was a religious garden for meditation .the bright and colorful flowers were breathtaking .\nD: this weekend we went to a busy park .the green trees were beautiful , and smelled fresh .the playground was noisy and crowded , it was tiring to play in the sun !there was a fountain for photography .the clear and beautiful water was refreshing .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_64_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_64_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_64_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_64_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_64_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A peaceful village in the countryside. Children playing in the meadows. Cows grazing in the fields.\nB: A rainy city street in an Asian city. Two girls walk into a ship in an Asian City. 
Femals walking around a shop with lights hanging from the ceiling. A bunny eating grass out of a wooden box. Someone petting a hedgehog with a glove on.\nC: An office building in a busy city. Employees rushing in and out of the building. Traffic jam on the road.\nD: A sunny beach in a tropical island. Coconut trees sway in the breeze. People surfing in the clear blue water.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A peaceful village in the countryside. Children playing in the meadows. Cows grazing in the fields.\nB: A rainy city street in an Asian city. Two girls walk into a ship in an Asian City. Femals walking around a shop with lights hanging from the ceiling. A bunny eating grass out of a wooden box. Someone petting a hedgehog with a glove on.\nC: An office building in a busy city. Employees rushing in and out of the building. Traffic jam on the road.\nD: A sunny beach in a tropical island. Coconut trees sway in the breeze. 
People surfing in the clear blue water.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_65_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_65_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_65_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_65_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_65_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the staff members were genuinely friendly towards each other.\nB: the school function was a disaster.\nC: the kids were bored and uninterested in the activities.\nD: the bingo party was going well at the school function !the staff members even pretended to be friendly for pictures when they really all hated each other !mothers and fathers could only really cope with being at the function through alcohol .the kids were having fun though , even if they did n't want to be there in the first place .the spelling bee went well ! the school took first place !", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the staff members were genuinely friendly towards each other.\nB: the school function was a disaster.\nC: the kids were bored and uninterested in the activities.\nD: the bingo party was going well at the school function !the staff members even pretended to be friendly for pictures when they really all hated each other !mothers and fathers could only really cope with being at the function through alcohol .the kids were having fun though , even if they did n't want to be there in the first place .the spelling bee went well ! 
the school took first place !", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_66_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_66_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_66_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_66_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_66_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the family was excited for their first vacation together .first stop was to the crowded beach .next they went to the amusement park for some rest .then they headed towards the countryside for dinner .they had a common dessert to cap off their first day of relaxation .\nB: the couple was not prepared for their first vacation together .first stop was to the unpopular beach .next they went to the countryside for some rest .then they headed towards the office for dinner .they had a typical dessert to cap off their first day of work .\nC: the couple was ready for their first vacation together .first stop was to the landmark beach .next they went to the boardwalk for some rest .then they headed towards town for dinner .they had a unique dessert to cap off their first day of vacation .\nD: the friends were ready for their first day of work together .first stop was to the unimpressive park .next they went to the shopping mall for some rest .then they headed towards the city for dinner .they had an ordinary dessert to cap off their first day of work .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the family was excited for their first vacation together .first stop was to the crowded beach .next they went to the amusement park for some rest .then they 
headed towards the countryside for dinner .they had a common dessert to cap off their first day of relaxation .\nB: the couple was not prepared for their first vacation together .first stop was to the unpopular beach .next they went to the countryside for some rest .then they headed towards the office for dinner .they had a typical dessert to cap off their first day of work .\nC: the couple was ready for their first vacation together .first stop was to the landmark beach .next they went to the boardwalk for some rest .then they headed towards town for dinner .they had a unique dessert to cap off their first day of vacation .\nD: the friends were ready for their first day of work together .first stop was to the unimpressive park .next they went to the shopping mall for some rest .then they headed towards the city for dinner .they had an ordinary dessert to cap off their first day of work .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_67_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_67_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_67_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_67_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_67_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: everyone entered the room , ready for the presentation .there were great speakers , who provided good education .the camera man made sure to document the important event .the audience enjoyed the presentation and company of others .they were satisfied with the material presented .\nB: some people left the room , unprepared for the display .there were mediocre presenters , who delivered average training .the videographer made sure to record the 
insignificant event .the viewers endured the presentation and solitude of others .they were dissatisfied with the information presented .\nC: few people entered the room , prepared for the speech .there were outstanding orators , who offered excellent guidance .the photographer made sure to capture the significant occasion .the spectators relished the talk and socialization with others .they were content with the content presented .\nD: nobody arrived in the room , waiting for the discussion .there were terrible lecturers , who provided bad instruction .the photographer made sure to neglect the unimportant event .the participants disliked the presentation and absence of others .they were displeased with the material presented .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: everyone entered the room , ready for the presentation .there were great speakers , who provided good education .the camera man made sure to document the important event .the audience enjoyed the presentation and company of others .they were satisfied with the material presented .\nB: some people left the room , unprepared for the display .there were mediocre presenters , who delivered average training .the videographer made sure to record the insignificant event .the viewers endured the presentation and solitude of others .they were dissatisfied with the information presented .\nC: few people entered the room , prepared for the speech .there were outstanding orators , who offered excellent guidance .the photographer made sure to capture the significant occasion .the spectators relished the talk and socialization with others .they were content with the content presented .\nD: nobody arrived in the room , waiting for the discussion .there were terrible lecturers , who provided bad instruction .the photographer made sure to neglect the unimportant event .the participants disliked the presentation and absence of others .they were displeased 
with the material presented .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_68_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_68_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_68_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_68_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_68_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: She was smiling at the camera.\nB: She was looking down. She was curious about something. She was putting something on her food. While he looked on. She was looking down at something. She was stirring the object in the bowl.\nC: She was playing with a dog.\nD: She was walking on the beach.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: She was smiling at the camera.\nB: She was looking down. She was curious about something. She was putting something on her food. While he looked on. She was looking down at something. 
She was stirring the object in the bowl.\nC: She was playing with a dog.\nD: She was walking on the beach.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_69_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_69_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_69_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_69_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_69_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: people having a picnic in the park\nB: getting help with the kayak for a day on the water .friends and family having fun on the lake .what great exercise while skiing on the water .friends and family having fun on a large raft on the water .teaching baby how to swim while making sure he 's safe .\nC: children playing in the snow\nD: individuals riding horses on the beach", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: people having a picnic in the park\nB: getting help with the kayak for a day on the water .friends and family having fun on the lake .what great exercise while skiing on the water .friends and family having fun on a large raft on the water .teaching baby how to swim while making sure he 's safe .\nC: children playing in the snow\nD: individuals riding horses on the beach", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_70_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_70_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_70_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_70_3.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_70_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: he woke up extra early that morning , for his first day of his new job .he was very nervous , but could n't manage to wake up on his own .he had a few coffees on his way out .he had dressed his best , but the coffee had kept him jittery .at his desk , he found that he had a lot of paperwork to fill out just for his first day . what a pain .\nB: he slept in and missed his first day of work\nC: he woke up late and missed his first day of work\nD: he was calm and relaxed on his first day of work", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: he woke up extra early that morning , for his first day of his new job .he was very nervous , but could n't manage to wake up on his own .he had a few coffees on his way out .he had dressed his best , but the coffee had kept him jittery .at his desk , he found that he had a lot of paperwork to fill out just for his first day . 
what a pain .\nB: he slept in and missed his first day of work\nC: he woke up late and missed his first day of work\nD: he was calm and relaxed on his first day of work", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_71_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_71_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_71_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_71_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_71_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the boat party was just what the [male] 's family needed .the elderly women gathered in the corner to talk .mr. rodriguez sat in the corner alone for a while until his wife showed up .his wife [female] brought their son and they had fun .i took a picture of the interior of the boat just for memory 's sake .\nB: the boat party was just what the [female] 's family needed .the elderly men gathered in the corner to talk .mr. rodriguez sat in the corner alone for a while until his wife showed up .his wife [female] brought their daughter and they had fun .i took a picture of the exterior of the boat just for memory 's sake .\nC: the boat party was crowded with people .the elderly men gathered in the corner to talk .mr. rodriguez sat in the corner alone for a while until his wife showed up .his wife brought their son and they had fun .i took a picture of the sunset just for memory 's sake .\nD: the boat party was boring .the elderly men gathered in the corner to talk .mr. 
rodriguez was alone in the corner until his wife showed up .his wife brought their daughter and they had fun .i took a picture of the exterior of the boat just for memory 's sake .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the boat party was just what the [male] 's family needed .the elderly women gathered in the corner to talk .mr. rodriguez sat in the corner alone for a while until his wife showed up .his wife [female] brought their son and they had fun .i took a picture of the interior of the boat just for memory 's sake .\nB: the boat party was just what the [female] 's family needed .the elderly men gathered in the corner to talk .mr. rodriguez sat in the corner alone for a while until his wife showed up .his wife [female] brought their daughter and they had fun .i took a picture of the exterior of the boat just for memory 's sake .\nC: the boat party was crowded with people .the elderly men gathered in the corner to talk .mr. rodriguez sat in the corner alone for a while until his wife showed up .his wife brought their son and they had fun .i took a picture of the sunset just for memory 's sake .\nD: the boat party was boring .the elderly men gathered in the corner to talk .mr. 
rodriguez was alone in the corner until his wife showed up .his wife brought their daughter and they had fun .i took a picture of the exterior of the boat just for memory 's sake .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_72_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_72_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_72_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_72_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_72_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: a group of children visit an amusement park for a birthday celebration .they ride on different attractions and have a delicious lunch .the birthday child hugs one of their friends tightly after the meal .the children are thrilled to receive presents .the group rides the carousel before leaving the park .\nB: a boy rides a bike to the park .he meets his friends and they all have a picnic lunch .the boy shakes hands with his friend after the meal .the boy is eager to open his birthday presents .the boy opens his presents and thanks his friends before they leave .\nC: a family goes to the zoo for the day .they see many animals and have a tasty lunch .the family takes a group photo after their meal .the young girl is excited to see a lion .the family leaves the zoo after a fun day .\nD: a girl goes to chucky cheese 's for her birthday party .many of the girl 's friends show up to her party and enjoy a nice meal .the birthday girl hugs one of her friends tightly after dinner .the birthday girl is very excited about her cake .before leaving , the birthday girl blows out her candle and prepares to eat her cake .", "question": "Describe this set of images 
briefly.", "context": "Select from the following choices.\nA: a group of children visit an amusement park for a birthday celebration .they ride on different attractions and have a delicious lunch .the birthday child hugs one of their friends tightly after the meal .the children are thrilled to receive presents .the group rides the carousel before leaving the park .\nB: a boy rides a bike to the park .he meets his friends and they all have a picnic lunch .the boy shakes hands with his friend after the meal .the boy is eager to open his birthday presents .the boy opens his presents and thanks his friends before they leave .\nC: a family goes to the zoo for the day .they see many animals and have a tasty lunch .the family takes a group photo after their meal .the young girl is excited to see a lion .the family leaves the zoo after a fun day .\nD: a girl goes to chucky cheese 's for her birthday party .many of the girl 's friends show up to her party and enjoy a nice meal .the birthday girl hugs one of her friends tightly after dinner .the birthday girl is very excited about her cake .before leaving , the birthday girl blows out her candle and prepares to eat her cake .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_73_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_73_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_73_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_73_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_73_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: we got into location on july 4th .we had to ride these long escalators to get to the exit .some of the advertisements in the terminal caught our eye , in particular this 
one for the tourism board ...and this one advertising the beauty of location location .as we left , these friendly police officers wished us a good vacation !\nB: we arrived at the destination on july 4th .we had to take these long elevators to reach the exit .some of the billboards in the terminal attracted our attention , especially this one for the travel bureau ...and this one promoting the charm of destination destination .as we departed , these amiable security personnel bid us a pleasant holiday !\nC: we arrived at the venue on july 4th .we had to climb these long stairs to get to the exit .some of the banners in the terminal caught our attention , in particular this one for the travel agency ...and this one promoting the attractiveness of destination destination .as we left , these friendly security guards wished us a great holiday !\nD: we entered the place on july 4th .we had to use these tall stairs to access the exit .some of the posters in the terminal grabbed our attention , particularly this one for the sightseeing committee ...and this one marketing the allure of place place .as we departed , these affable law enforcement officers wished us a wonderful trip !", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: we got into location on july 4th .we had to ride these long escalators to get to the exit .some of the advertisements in the terminal caught our eye , in particular this one for the tourism board ...and this one advertising the beauty of location location .as we left , these friendly police officers wished us a good vacation !\nB: we arrived at the destination on july 4th .we had to take these long elevators to reach the exit .some of the billboards in the terminal attracted our attention , especially this one for the travel bureau ...and this one promoting the charm of destination destination .as we departed , these amiable security personnel bid us a pleasant holiday !\nC: we arrived at 
the venue on july 4th .we had to climb these long stairs to get to the exit .some of the banners in the terminal caught our attention , in particular this one for the travel agency ...and this one promoting the attractiveness of destination destination .as we left , these friendly security guards wished us a great holiday !\nD: we entered the place on july 4th .we had to use these tall stairs to access the exit .some of the posters in the terminal grabbed our attention , particularly this one for the sightseeing committee ...and this one marketing the allure of place place .as we departed , these affable law enforcement officers wished us a wonderful trip !", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_74_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_74_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_74_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_74_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_74_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: jen was all dressed up for her 30th birthday party .friends and coworkers joined in the celebration .[male] and [male] were their usual crazy selves .missy made this beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !\nB: jen was all dressed up for her 30th birthday party .friends and coworkers joined in the celebration .[female] and [female] were their usual crazy selves .missy made this beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !\nC: jen was all dressed up for her 25th birthday party .friends and 
coworkers joined in the celebration .[male] and [male] were their usual crazy selves .missy made this beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !\nD: jen was all dressed up for her 30th birthday party .friends and coworkers joined in the celebration .[male] and [female] were their usual crazy selves .missy made this beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: jen was all dressed up for her 30th birthday party .friends and coworkers joined in the celebration .[male] and [male] were their usual crazy selves .missy made this beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !\nB: jen was all dressed up for her 30th birthday party .friends and coworkers joined in the celebration .[female] and [female] were their usual crazy selves .missy made this beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !\nC: jen was all dressed up for her 25th birthday party .friends and coworkers joined in the celebration .[male] and [male] were their usual crazy selves .missy made this beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !\nD: jen was all dressed up for her 30th birthday party .friends and coworkers joined in the celebration .[male] and [female] were their usual crazy selves .missy made this beautiful cake , accented with her favorite flower - gerbera daisies .it was a shame to cut the masterpiece , but everyone was hungry !", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_75_0.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_75_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_75_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_75_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_75_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A young boy and his dog explore a forest trail. The boy climbs over a fallen tree on the path. The boy stops to play with his dog. Then they continue their exploration and come across a hidden pond. Feeling tired, they take a rest and relax by the water.\nB: A young boy and his dog go for a walk in the park. The boy jumps over a small branch on the path. The boy stops to pick up some flowers. After that, the boy continues his walk and finds a beautiful garden. Tired from the walk, he sits down and enjoys the view.\nC: A young boy and his dog go for a run down a dirt path. The boy hops a dead log that is blocking the dirt path. Boy stops at dead log and takes a moment to reflect. After reflecting the boy continues along the path and discovers a body of water. Exhausted from the adventure, he lays down on the bank and enjoys the afternoon.\nD: A young boy and his dog take a stroll in the countryside. The boy jumps over a small obstacle on the path. The boy stops to take a selfie. Then they continue their walk and find a hidden waterfall. Feeling tired, they sit down and admire the scenery.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A young boy and his dog explore a forest trail. The boy climbs over a fallen tree on the path. The boy stops to play with his dog. Then they continue their exploration and come across a hidden pond. 
Feeling tired, they take a rest and relax by the water.\nB: A young boy and his dog go for a walk in the park. The boy jumps over a small branch on the path. The boy stops to pick up some flowers. After that, the boy continues his walk and finds a beautiful garden. Tired from the walk, he sits down and enjoys the view.\nC: A young boy and his dog go for a run down a dirt path. The boy hops a dead log that is blocking the dirt path. Boy stops at dead log and takes a moment to reflect. After reflecting the boy continues along the path and discovers a body of water. Exhausted from the adventure, he lays down on the bank and enjoys the afternoon.\nD: A young boy and his dog take a stroll in the countryside. The boy jumps over a small obstacle on the path. The boy stops to take a selfie. Then they continue their walk and find a hidden waterfall. Feeling tired, they sit down and admire the scenery.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_76_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_76_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_76_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_76_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_76_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A group of children were playing on the beach.\nB: The [female] was with her mouth wide open in shock. The [female] walked in the [location] with two other [females] Five people walked up the stairs to the airplane. The airplane started to take off down the runway. 
The male smiled while working.\nC: The boy was playing with his toys in the park.\nD: The man was singing loudly on stage.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A group of children were playing on the beach.\nB: The [female] was with her mouth wide open in shock. The [female] walked in the [location] with two other [females] Five people walked up the stairs to the airplane. The airplane started to take off down the runway. The male smiled while working.\nC: The boy was playing with his toys in the park.\nD: The man was singing loudly on stage.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_77_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_77_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_77_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_77_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_77_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A family picnic in the mountains\nB: A man is helping a child climb the mountain. A group of people climbing a snowy mountain. A group of people climbing a mountain for an adventure. A picture of a person using a stick and snowboots to climb a dangerous mountain. The face of a person exhausted from climbing a dangerous mountain.\nC: A leisurely walk in the park\nD: A group of hikers exploring a forest trail", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A family picnic in the mountains\nB: A man is helping a child climb the mountain. A group of people climbing a snowy mountain. A group of people climbing a mountain for an adventure. 
A picture of a person using a stick and snowboots to climb a dangerous mountain. The face of a person exhausted from climbing a dangerous mountain.\nC: A leisurely walk in the park\nD: A group of hikers exploring a forest trail", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_78_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_78_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_78_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_78_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_78_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: it was a terrible night full of darkness\nB: the night was dim and uneventful\nC: it was a great night full of lightsthe night shined bright throughout the citythe buildings were amazing to look atand the food was just as goodthis was the perfect night for a night out\nD: the buildings were unimpressive and dull", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: it was a terrible night full of darkness\nB: the night was dim and uneventful\nC: it was a great night full of lightsthe night shined bright throughout the citythe buildings were amazing to look atand the food was just as goodthis was the perfect night for a night out\nD: the buildings were unimpressive and dull", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_79_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_79_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_79_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_79_3.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_79_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Silvia and Grace are at a party, dancing and having fun. Grace feels confident with her solo and is happy to brag about this. Silvia is excited and seems unable to hide her joy. Grace stays away from the party and watches Silvia flaunt her moves. Only time will tell, there is 6 hours till the show.\nB: Grace and Silvia are in the studio, practising for the show. Grace feels confident with her solo and is happy to brag about this. Silvia is very nervous and seems unable to hide her fear. Silvia stays away from the practice and watches Grace flaunt her moves. Only time will tell, there is 6 hours till the show.\nC: Grace and Silvia are at the beach, relaxing and enjoying the sun. Grace feels confident with her solo and is happy to brag about this. Silvia is very nervous and seems unable to hide her fear. Silvia stays away from the beach and watches Grace flaunt her moves. Only time will tell, there is 6 hours till the show.\nD: Grace and Silvia are in the studio, practising for the show. Grace is nervous and seems unable to hide her fear. Grace stays away from the practice and watches Silvia flaunt her moves. Only time will tell, there is 6 hours till the show.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Silvia and Grace are at a party, dancing and having fun. Grace feels confident with her solo and is happy to brag about this. Silvia is excited and seems unable to hide her joy. Grace stays away from the party and watches Silvia flaunt her moves. Only time will tell, there is 6 hours till the show.\nB: Grace and Silvia are in the studio, practising for the show. Grace feels confident with her solo and is happy to brag about this. 
Silvia is very nervous and seems unable to hide her fear. Silvia stays away from the practice and watches Grace flaunt her moves. Only time will tell, there is 6 hours till the show.\nC: Grace and Silvia are at the beach, relaxing and enjoying the sun. Grace feels confident with her solo and is happy to brag about this. Silvia is very nervous and seems unable to hide her fear. Silvia stays away from the beach and watches Grace flaunt her moves. Only time will tell, there is 6 hours till the show.\nD: Grace and Silvia are in the studio, practising for the show. Grace is nervous and seems unable to hide her fear. Grace stays away from the practice and watches Silvia flaunt her moves. Only time will tell, there is 6 hours till the show.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_80_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_80_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_80_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_80_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_80_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: on a camping trip in the forest . the kids were worn out from the long hike .we set up a campfire in the woods . roasted marshmallows and hot dogs with the family .we had a lot of fun fishing in the river .here i am taking a selfie by the campfire .afterwards i went hiking to the waterfall . it was a lot of fun .\nB: at an amusement park . the children were tired from the roller coaster rides .we set up a picnic area in the park . 
burgers and sodas with the family .we had a lot of fun playing games and riding the merry-go-round .here i am taking a selfie on the ferris wheel .afterwards i went on the giant water slide . it was a lot of fun .\nC: on our way to the beach today . the boys passed out from the long drive .we set up on the beach . food and drinks with the family .we had a lot of fun setting up in the sand .here i am taking a selfie in the sun .afterwards i went surfing in the water . it was a lot of fun .\nD: on a snowy day in the mountains . the kids were exhausted from the long hike .we set up a tent in the snow . hot cocoa and marshmallows with the family .we enjoyed building a snowman together .here i am skiing down the slope .afterwards i went ice skating on the frozen lake . it was a lot of fun .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: on a camping trip in the forest . the kids were worn out from the long hike .we set up a campfire in the woods . roasted marshmallows and hot dogs with the family .we had a lot of fun fishing in the river .here i am taking a selfie by the campfire .afterwards i went hiking to the waterfall . it was a lot of fun .\nB: at an amusement park . the children were tired from the roller coaster rides .we set up a picnic area in the park . burgers and sodas with the family .we had a lot of fun playing games and riding the merry-go-round .here i am taking a selfie on the ferris wheel .afterwards i went on the giant water slide . it was a lot of fun .\nC: on our way to the beach today . the boys passed out from the long drive .we set up on the beach . food and drinks with the family .we had a lot of fun setting up in the sand .here i am taking a selfie in the sun .afterwards i went surfing in the water . it was a lot of fun .\nD: on a snowy day in the mountains . the kids were exhausted from the long hike .we set up a tent in the snow . 
hot cocoa and marshmallows with the family .we enjoyed building a snowman together .here i am skiing down the slope .afterwards i went ice skating on the frozen lake . it was a lot of fun .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_81_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_81_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_81_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_81_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_81_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: I took a cab to return to the hotel\nB: the front of the mall was somewhat crowded .i ran past them and took the escalator down .after shopping for a few hours , i returned to the street .i tried to catch a cab but a bush blocked me .i decided to just walk back to my hotel .\nC: the mall was empty and I took the stairs up\nD: I quickly caught a bus to my hotel", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: I took a cab to return to the hotel\nB: the front of the mall was somewhat crowded .i ran past them and took the escalator down .after shopping for a few hours , i returned to the street .i tried to catch a cab but a bush blocked me .i decided to just walk back to my hotel .\nC: the mall was empty and I took the stairs up\nD: I quickly caught a bus to my hotel", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_82_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_82_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_82_2.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_82_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_82_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: there were many sharks in the water .we were scared .\nB: we went to the beach today .there was a lot of seals .then we saw a castle .there was a lot of cool decorations .we had a really good day .\nC: we went to a mountain and climbed to the top .\nD: we visited the zoo and saw some tigers .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: there were many sharks in the water .we were scared .\nB: we went to the beach today .there was a lot of seals .then we saw a castle .there was a lot of cool decorations .we had a really good day .\nC: we went to a mountain and climbed to the top .\nD: we visited the zoo and saw some tigers .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_83_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_83_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_83_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_83_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_83_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: on the last day of our vacation we decided to visit snow mountain .there were a lot of beautiful snow-capped peaks .we noticed as we got there there were skiers enjoying the slopes .one skier was carrying skis .he told us part of the mountain was closed due to an avalanche so we ended up leaving .\nB: on the 
last day of our vacation we decided to visit rock mountain .there were a lot of beautiful mountains .we noticed as we got there there were workers doing work .one worker was carrying a huge rock .he told us part of the park was closed due to the construction so we ended up leaving .\nC: on the first day of our vacation we decided to visit beach mountain .there were a lot of beautiful beaches .we noticed as we got there there were lifeguards on duty .one lifeguard was carrying a surfboard .he told us part of the beach was closed due to the high waves so we ended up leaving .\nD: on the last day of our vacation we decided to visit forest mountain .there were a lot of beautiful trees and wildlife .we noticed as we got there there were rangers patrolling .one ranger was carrying a backpack .he told us part of the forest was closed due to the fire risk so we ended up leaving .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: on the last day of our vacation we decided to visit snow mountain .there were a lot of beautiful snow-capped peaks .we noticed as we got there there were skiers enjoying the slopes .one skier was carrying skis .he told us part of the mountain was closed due to an avalanche so we ended up leaving .\nB: on the last day of our vacation we decided to visit rock mountain .there were a lot of beautiful mountains .we noticed as we got there there were workers doing work .one worker was carrying a huge rock .he told us part of the park was closed due to the construction so we ended up leaving .\nC: on the first day of our vacation we decided to visit beach mountain .there were a lot of beautiful beaches .we noticed as we got there there were lifeguards on duty .one lifeguard was carrying a surfboard .he told us part of the beach was closed due to the high waves so we ended up leaving .\nD: on the last day of our vacation we decided to visit forest mountain .there were a lot of beautiful trees and 
wildlife .we noticed as we got there there were rangers patrolling .one ranger was carrying a backpack .he told us part of the forest was closed due to the fire risk so we ended up leaving .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_84_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_84_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_84_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_84_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_84_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the family went on a snowy hike .the youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .\nB: the family went on a beach picnic .the youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .\nC: the family went on a mountain climbing trip .the youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .\nD: the family went on a tropical vacation .the youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the family went on a snowy hike .the 
youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .\nB: the family went on a beach picnic .the youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .\nC: the family went on a mountain climbing trip .the youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .\nD: the family went on a tropical vacation .the youngest child was having fun , dressed in red .he explored the trees in the brisk air .they held their hands out with sunflower seeds to feed the birds .later , the young boy hitched a ride on the dads back .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_85_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_85_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_85_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_85_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_85_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: We spotted a small fishing boat.\nB: The water was crystal clear and blue today.\nC: The cargo ship was brand new.\nD: We saw the coolest thing today while on the water, it was a cargo ship house! The water today resembled a murky green color. The cargo ship we saw today on the ocean was huge! Its crazy to think how it floats on the ocean. 
Today we saw a crane remove the accommodation from the cargo ship! The cargo ship seemed to have rusted over the years.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: We spotted a small fishing boat.\nB: The water was crystal clear and blue today.\nC: The cargo ship was brand new.\nD: We saw the coolest thing today while on the water, it was a cargo ship house! The water today resembled a murky green color. The cargo ship we saw today on the ocean was huge! Its crazy to think how it floats on the ocean. Today we saw a crane remove the accommodation from the cargo ship! The cargo ship seemed to have rusted over the years.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_86_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_86_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_86_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_86_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_86_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: i never ride roller coasters\nB: i am scared of amusement parks\nC: i prefer to stay at home on my birthday\nD: i always ride the brain buster roller coaster on my birthday . a broken transmission is n't going to stop me .the only other transportation available at my home is designed for a 2-year-old .nothing was going to stop me , so i walked the train tracks .it took forever but i finally made it to the amusement park .that 's me , in the second car , turning green and about to lose my lunch . 
i ca n't imagine a better way to spend my birthday .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: i never ride roller coasters\nB: i am scared of amusement parks\nC: i prefer to stay at home on my birthday\nD: i always ride the brain buster roller coaster on my birthday . a broken transmission is n't going to stop me .the only other transportation available at my home is designed for a 2-year-old .nothing was going to stop me , so i walked the train tracks .it took forever but i finally made it to the amusement park .that 's me , in the second car , turning green and about to lose my lunch . i ca n't imagine a better way to spend my birthday .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_87_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_87_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_87_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_87_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_87_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A group of friends go on a hike in the mountains. They have a picnic at a scenic spot and take photos. They spot a deer in the distance and watch it as it wanders off. Later, they recount their adventure to their families.\nB: A man loads his dogs in the car for a trip to the beach. The man has made it to the beach and has some coffee while taking a walk. He stops to look at a seal on a rock, who is surrounded by seagulls. The seal dives into the water as the birds swim around it. The man is back home, telling his viewers about his trip.\nC: A family goes on a road trip to visit a zoo. 
They enjoy observing the various animals and have a picnic lunch. They see a monkey swinging in the trees and a peacock displaying its feathers. They reminisce about their fun day during dinner.\nD: A woman takes her cats to a park for a picnic. She enjoys her lunch while sitting on a bench and watching the ducks in the pond. She notices a turtle on a log, which quickly disappears into the water. She returns home and shares her experience with her friends.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A group of friends go on a hike in the mountains. They have a picnic at a scenic spot and take photos. They spot a deer in the distance and watch it as it wanders off. Later, they recount their adventure to their families.\nB: A man loads his dogs in the car for a trip to the beach. The man has made it to the beach and has some coffee while taking a walk. He stops to look at a seal on a rock, who is surrounded by seagulls. The seal dives into the water as the birds swim around it. The man is back home, telling his viewers about his trip.\nC: A family goes on a road trip to visit a zoo. They enjoy observing the various animals and have a picnic lunch. They see a monkey swinging in the trees and a peacock displaying its feathers. They reminisce about their fun day during dinner.\nD: A woman takes her cats to a park for a picnic. She enjoys her lunch while sitting on a bench and watching the ducks in the pond. She notices a turtle on a log, which quickly disappears into the water. 
She returns home and shares her experience with her friends.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_88_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_88_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_88_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_88_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_88_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A man riding a bike in the mountains.\nB: A family having a picnic in the woods.\nC: In the beautiful sea, a man take his boat with his dog. He enjoyed to ride the boat and the end he came to sea shore for stop the boat. He had a tattoos on his full of hand. He wants to take a pictures of his tattoos. He had fun the trip with his dog.\nD: A woman walking her cat in a park.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A man riding a bike in the mountains.\nB: A family having a picnic in the woods.\nC: In the beautiful sea, a man take his boat with his dog. He enjoyed to ride the boat and the end he came to sea shore for stop the boat. He had a tattoos on his full of hand. He wants to take a pictures of his tattoos. 
He had fun the trip with his dog.\nD: A woman walking her cat in a park.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_89_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_89_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_89_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_89_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_89_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: these images capture the peaceful serenity of natural landscapes and scenic views\nB: these images illustrate a bustling city with various attractions and activities to explore\nC: we have always enjoyed travelling to far away places .with so many opportunities to enjoy unique experiences .many times , even the places we stayed provided new experiences , like sleeping in bunk beds .we also enjoyed staying in places where we were able to gather with other visitors and share stories .with so many sites to see , it always seems like we have to leave far too soon .\nD: these images depict a cozy home where we can relax and unwind after a long day of exploring", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: these images capture the peaceful serenity of natural landscapes and scenic views\nB: these images illustrate a bustling city with various attractions and activities to explore\nC: we have always enjoyed travelling to far away places .with so many opportunities to enjoy unique experiences .many times , even the places we stayed provided new experiences , like sleeping in bunk beds .we also enjoyed staying in places where we were able to gather with other visitors and share stories .with so many sites to see 
, it always seems like we have to leave far too soon .\nD: these images depict a cozy home where we can relax and unwind after a long day of exploring", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_90_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_90_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_90_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_90_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_90_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: a political rally with a famous speaker\nB: the statue was finally in place ...the crowd began to gather to hear the donation ceremony and the speech .finally , pastor smith took the stage and began to speak .the veterans that served with the colonel were all present .and , they could not have been happier with the beautiful statue that honored their friend .\nC: a group of people gathering for a wedding ceremony\nD: a memorial service for a fallen soldier", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: a political rally with a famous speaker\nB: the statue was finally in place ...the crowd began to gather to hear the donation ceremony and the speech .finally , pastor smith took the stage and began to speak .the veterans that served with the colonel were all present .and , they could not have been happier with the beautiful statue that honored their friend .\nC: a group of people gathering for a wedding ceremony\nD: a memorial service for a fallen soldier", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_91_0.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_91_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_91_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_91_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_91_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: we arrived at our destination as the moon was going down . that was fine because we were wide awake from the adventurous trip .after a restless night we went adventuring and came across a deserted pier . we meandered down the pier for a bit .when we reached the end we found ourselves on a pebbly beach where we spent some time looking for starfish .in the distance the clouds gathered and we thought we would get caught in a storm .fortunately , the clouds dispersed and we enjoyed the rest of our day at the seashore .\nB: we arrived at our destination as the sun was going down . that was fine because we were pretty tired from the long trip .after a good nights sleep we went exploring and came across a long the dock . we walked down the dock for a while .when we came to the end we found ourselves on a rocky beach where we spent some time looking for crabs .in the distance the clouds rolled in and we thought we would get caught in the rain .fortunately , the clouds passed and we enjoyed the rest of our day at the seashore .\nC: we arrived at our destination as the sun was setting . that was fine because we were exhausted from the lengthy trip .after a rough night's sleep we went exploring and came across a wooden dock . 
we trudged down the dock for a bit .when we came to the end we found ourselves on a sandy beach where we spent some time looking for shells .in the distance the clouds dissipated and we thought we would have clear skies .unfortunately , the clouds gathered and we got caught in the rain .fortunately , the clouds dispersed and we enjoyed the rest of our day at the beach .\nD: we arrived at our destination as the sun was coming up . that was fine because we were well-rested from the short trip .after a sleepless night we went wandering and came across a short pier . we strolled down the pier for a short time .when we reached the end we found ourselves on a sandy beach where we spent some time looking for seashells .in the distance the clouds dispersed and we thought we would have clear skies .unfortunately , the clouds gathered and we got caught in the rain .fortunately , the clouds passed and we enjoyed the rest of our day at the seashore .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: we arrived at our destination as the moon was going down . that was fine because we were wide awake from the adventurous trip .after a restless night we went adventuring and came across a deserted pier . we meandered down the pier for a bit .when we reached the end we found ourselves on a pebbly beach where we spent some time looking for starfish .in the distance the clouds gathered and we thought we would get caught in a storm .fortunately , the clouds dispersed and we enjoyed the rest of our day at the seashore .\nB: we arrived at our destination as the sun was going down . that was fine because we were pretty tired from the long trip .after a good nights sleep we went exploring and came across a long the dock . 
we walked down the dock for a while .when we came to the end we found ourselves on a rocky beach where we spent some time looking for crabs .in the distance the clouds rolled in and we thought we would get caught in the rain .fortunately , the clouds passed and we enjoyed the rest of our day at the seashore .\nC: we arrived at our destination as the sun was setting . that was fine because we were exhausted from the lengthy trip .after a rough night's sleep we went exploring and came across a wooden dock . we trudged down the dock for a bit .when we came to the end we found ourselves on a sandy beach where we spent some time looking for shells .in the distance the clouds dissipated and we thought we would have clear skies .unfortunately , the clouds gathered and we got caught in the rain .fortunately , the clouds dispersed and we enjoyed the rest of our day at the beach .\nD: we arrived at our destination as the sun was coming up . that was fine because we were well-rested from the short trip .after a sleepless night we went wandering and came across a short pier . 
we strolled down the pier for a short time .when we reached the end we found ourselves on a sandy beach where we spent some time looking for seashells .in the distance the clouds dispersed and we thought we would have clear skies .unfortunately , the clouds gathered and we got caught in the rain .fortunately , the clouds passed and we enjoyed the rest of our day at the seashore .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_92_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_92_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_92_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_92_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_92_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Getting in an argument\nB: Hearing a happy news\nC: Family members visiting for a small celebration in an old age home\nD: Kids being grounded", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Getting in an argument\nB: Hearing a happy news\nC: Family members visiting for a small celebration in an old age home\nD: Kids being grounded", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_93_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_93_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_93_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_93_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_93_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": 
"Video image or Natural image", "source": "SSID", "options": "A: The girls father never arrives after he found out where she was. the three friends stood calmly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger males decides against her by saying something to the father\nB: The girls mother finally arrives after he found out where she was. the three friends stood defiantly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger males decides for her by saying something to the mother\nC: The girls father finally arrives after he found out where she was. the three friends stood defiantly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger males decides for her by saying something to the father\nD: The boys father finally arrives after he found out where he was. the three friends stood defiantly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger females decides for her by saying something to the father", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The girls father never arrives after he found out where she was. 
the three friends stood calmly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger males decides against her by saying something to the father\nB: The girls mother finally arrives after he found out where she was. the three friends stood defiantly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger males decides for her by saying something to the mother\nC: The girls father finally arrives after he found out where she was. the three friends stood defiantly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger males decides for her by saying something to the father\nD: The boys father finally arrives after he found out where he was. 
the three friends stood defiantly against the older male The older male rephrases his first sentence about a disciplinary action he will take if she doesnt g the female reflects on her prior decision making and furture consequences One of the younger females decides for her by saying something to the father", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_94_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_94_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_94_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_94_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_94_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The man is cooking in the kitchen. His son is also helping him. They bake a delicious cake!\nB: The man is swimming in the pool. His son is also swimming with him. They find a lost toy!\nC: The man is fishing on the dock. His son is also fishing with him. They catch a fish! The son is super excited to have caught his first fish. He then catches another fish right away!\nD: The man is gardening in the backyard. His son is also helping him. They plant a new tree!", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The man is cooking in the kitchen. His son is also helping him. They bake a delicious cake!\nB: The man is swimming in the pool. His son is also swimming with him. They find a lost toy!\nC: The man is fishing on the dock. His son is also fishing with him. They catch a fish! The son is super excited to have caught his first fish. He then catches another fish right away!\nD: The man is gardening in the backyard. His son is also helping him. 
They plant a new tree!", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_95_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_95_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_95_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_95_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_95_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: a rocky beach in a black and white photograph .the waves come crashing onto the beach .a bench in the middle of the beach .a couple walk hand in hand along the rocky coastline .sunset over a forgotten beach .\nB: a busy city street with tall buildings\nC: a sunny beach with palm trees and clear blue water\nD: a snowy mountain peak with skiers", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: a rocky beach in a black and white photograph .the waves come crashing onto the beach .a bench in the middle of the beach .a couple walk hand in hand along the rocky coastline .sunset over a forgotten beach .\nB: a busy city street with tall buildings\nC: a sunny beach with palm trees and clear blue water\nD: a snowy mountain peak with skiers", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_96_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_96_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_96_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_96_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_96_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": 
"multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: woman cooking in the living room while the children eat\nB: man playing video games in the kitchen while children watch TV\nC: man prepare a food in the kitchen and grill some bacon and meet to prepare arrange the food to serve children's are eat the food in the table with happyness and the mom and grandma watching the children's with happy ness\nD: no one in the kitchen while children prepare their own food", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: woman cooking in the living room while the children eat\nB: man playing video games in the kitchen while children watch TV\nC: man prepare a food in the kitchen and grill some bacon and meet to prepare arrange the food to serve children's are eat the food in the table with happyness and the mom and grandma watching the children's with happy ness\nD: no one in the kitchen while children prepare their own food", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_97_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_97_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_97_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_97_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_97_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the couples posed for a picture in the crowdmany of the couples gathered to celebrate achievements in the communitythey all gathered inside to socializemany pictures were taken between friendsand also solo pictures were taken\nB: pictures of friends taken at an event\nC: a gathering of people for 
socializing and photography\nD: the people gathered for a group photo at a partycelebrations to mark community successes", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the couples posed for a picture in the crowdmany of the couples gathered to celebrate achievements in the communitythey all gathered inside to socializemany pictures were taken between friendsand also solo pictures were taken\nB: pictures of friends taken at an event\nC: a gathering of people for socializing and photography\nD: the people gathered for a group photo at a partycelebrations to mark community successes", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_98_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_98_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_98_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_98_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_98_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The kid looks bery excited about the wedding. The guy looks pretty shocked about something. Shes trying to figure out what dress she would like to have for her wedding. She seem to be happy about the dress she picked out. All of her family memebers seem to like it as well.\nB: She's confused about what dress she wants for the wedding.\nC: The kid looks bored at the wedding.\nD: The guy looks amused by something.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The kid looks bery excited about the wedding. The guy looks pretty shocked about something. Shes trying to figure out what dress she would like to have for her wedding. 
She seem to be happy about the dress she picked out. All of her family memebers seem to like it as well.\nB: She's confused about what dress she wants for the wedding.\nC: The kid looks bored at the wedding.\nD: The guy looks amused by something.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_99_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_99_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_99_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_99_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_99_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: They are create new one They mom seen this They mom looking some surprice The mom angry with they The little boy afraid here\nB: The family is having a picnic\nC: The boy is happy and excited\nD: The mom is peacefully reading a book", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: They are create new one They mom seen this They mom looking some surprice The mom angry with they The little boy afraid here\nB: The family is having a picnic\nC: The boy is happy and excited\nD: The mom is peacefully reading a book", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_100_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_100_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_100_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_100_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_100_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": 
"multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: There was no river in the hill station.\nB: A man is driving in the car.\nC: The car was moving back from the hill station.\nD: A woman is driving in the car. The car was moving ahead to the hill station. There was a river in the hill station which makes the hill station beautiful. She going to the river. She is close to the river.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: There was no river in the hill station.\nB: A man is driving in the car.\nC: The car was moving back from the hill station.\nD: A woman is driving in the car. The car was moving ahead to the hill station. There was a river in the hill station which makes the hill station beautiful. She going to the river. She is close to the river.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_101_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_101_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_101_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_101_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_101_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: i have been working hard all day .time to have a nice meal to end my day .then some drinks with family .it is nice enough day to take a drive .then try on some new shoes at the store .\nB: i have been working hard all day .time to have a nice meal to end my day .then some drinks alone .it is nice enough day to take a drive .then try on some new shoes at the store .\nC: i have been working hard all day .time to have a plain meal to end my day .then some 
drinks with family .it is nice enough day to take a drive .then try on some old shoes at the store .\nD: i have been relaxing all day .time to have a nice meal to end my day .then some drinks with family .it is nice enough day to take a drive .then try on some new shoes at the store .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: i have been working hard all day .time to have a nice meal to end my day .then some drinks with family .it is nice enough day to take a drive .then try on some new shoes at the store .\nB: i have been working hard all day .time to have a nice meal to end my day .then some drinks alone .it is nice enough day to take a drive .then try on some new shoes at the store .\nC: i have been working hard all day .time to have a plain meal to end my day .then some drinks with family .it is nice enough day to take a drive .then try on some old shoes at the store .\nD: i have been relaxing all day .time to have a nice meal to end my day .then some drinks with family .it is nice enough day to take a drive .then try on some new shoes at the store .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_102_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_102_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_102_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_102_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_102_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The climbers are struggling with extreme cold and harsh weather conditions, making it difficult for them to continue their journey.\nB: The climbers are enjoying a leisurely hike on a sunny day with no 
obstacles.\nC: The climbers are at the base of the mountain and are just starting their ascent.\nD: The climbers seems to be almost at the zenith of the daring mountain. They struggle to overcome changes in elevation thru the ice filled rocky terrain. Both climbers seem exhausted from the adventure and the weight they are carrying. However, they make it to the top in an outstanding show of dedication to enjoy the view. The rest of the journey awaits as they begin to trek down the slippery slopes.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The climbers are struggling with extreme cold and harsh weather conditions, making it difficult for them to continue their journey.\nB: The climbers are enjoying a leisurely hike on a sunny day with no obstacles.\nC: The climbers are at the base of the mountain and are just starting their ascent.\nD: The climbers seems to be almost at the zenith of the daring mountain. They struggle to overcome changes in elevation thru the ice filled rocky terrain. Both climbers seem exhausted from the adventure and the weight they are carrying. However, they make it to the top in an outstanding show of dedication to enjoy the view. 
The rest of the journey awaits as they begin to trek down the slippery slopes.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_103_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_103_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_103_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_103_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_103_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A group of friends visited a zoo and saw various animals\nB: A family went to the beach for a picnic\nC: On a regular day, two boys went to a library\nD: On the weekend, five lovely girls went to a playground park. They arrived with a guide who is knowledgeable about the park. Before entering the park, she gave the playing role instructions. They had a wonderful time and participated in a variety of activities. They played golf, which was incredibly interesting to them. finally, they immensely enjoyed it.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A group of friends visited a zoo and saw various animals\nB: A family went to the beach for a picnic\nC: On a regular day, two boys went to a library\nD: On the weekend, five lovely girls went to a playground park. They arrived with a guide who is knowledgeable about the park. Before entering the park, she gave the playing role instructions. They had a wonderful time and participated in a variety of activities. They played golf, which was incredibly interesting to them. 
finally, they immensely enjoyed it.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_104_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_104_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_104_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_104_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_104_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A gathering takes place in the market of people hailing from numerous countries in Africa. As the number of people present increases, a growing number of them are also working alongside one another in order to make a living. Some are trying to make a living by selling food, cleaning clothing, and providing for their family through these activities. When it comes to discovering a wide variety of opportunities, Africa is among the most prominent regions to be located in. Those who are willing to keep pushing the limit through increasingly difficult circumstances experience severe hunger.\nB: A group of people at a concert enjoying music.\nC: A group of people sitting in a classroom studying.\nD: A group of people having a picnic in a park.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A gathering takes place in the market of people hailing from numerous countries in Africa. As the number of people present increases, a growing number of them are also working alongside one another in order to make a living. Some are trying to make a living by selling food, cleaning clothing, and providing for their family through these activities. 
When it comes to discovering a wide variety of opportunities, Africa is among the most prominent regions to be located in. Those who are willing to keep pushing the limit through increasingly difficult circumstances experience severe hunger.\nB: A group of people at a concert enjoying music.\nC: A group of people sitting in a classroom studying.\nD: A group of people having a picnic in a park.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_105_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_105_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_105_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_105_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_105_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A group of locals visited a foreigner and asked for recommendations on what to do in the area.\nB: A traveler visited a market and bought some local food to try.\nC: A foreigner visited locals ina restaurant. He asked what was good to eat. They suggested some options for them. They discussed this as a group. He decided on what he wanted.\nD: A local visited foreigners in a restaurant and asked for recommendations on what to eat.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A group of locals visited a foreigner and asked for recommendations on what to do in the area.\nB: A traveler visited a market and bought some local food to try.\nC: A foreigner visited locals ina restaurant. He asked what was good to eat. They suggested some options for them. They discussed this as a group. 
He decided on what he wanted.\nD: A local visited foreigners in a restaurant and asked for recommendations on what to eat.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_106_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_106_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_106_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_106_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_106_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: These people are going for a picnic in the park.\nB: These people are gathering to go ride their bikes. They took a selfie on their bikes before riding. They rode down a steep hill on their bikes. The man takes another selfie of himself with his gear on. Then he pops a wheelie on his bike.\nC: These people are having a barbecue in their backyard.\nD: These people are preparing to go swimming at the beach.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: These people are going for a picnic in the park.\nB: These people are gathering to go ride their bikes. They took a selfie on their bikes before riding. They rode down a steep hill on their bikes. The man takes another selfie of himself with his gear on. 
Then he pops a wheelie on his bike.\nC: These people are having a barbecue in their backyard.\nD: These people are preparing to go swimming at the beach.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_107_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_107_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_107_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_107_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_107_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: this is a snapshot of a busy city street.\nB: this is a photo of a crowded marketplace.\nC: this is our first preview of the new restaurant .the decorations did n't seem like much .we had heard that this restaurant would be good though .we had a view of the train out of the window .the walls were plastered with these designs .\nD: this is a collection of ancient artifacts.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: this is a snapshot of a busy city street.\nB: this is a photo of a crowded marketplace.\nC: this is our first preview of the new restaurant .the decorations did n't seem like much .we had heard that this restaurant would be good though .we had a view of the train out of the window .the walls were plastered with these designs .\nD: this is a collection of ancient artifacts.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_108_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_108_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_108_2.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_108_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_108_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: The graffiti in our city is not good. The paintings on the building are mediocre. The people and statues are not impressive.\nB: The graffiti in our city is terrible and ugly. The paintings depict ugly people and statues.\nC: our city is really a lovely place .even the graffiti is done in good taste .paintings on the building depicting beautiful people .and gorgeous statues .and who could resist the baby penguin named [female] ?\nD: The graffiti in this city is so-so. The building looks unimpressive. The people and statues are nothing special.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The graffiti in our city is not good. The paintings on the building are mediocre. The people and statues are not impressive.\nB: The graffiti in our city is terrible and ugly. The paintings depict ugly people and statues.\nC: our city is really a lovely place .even the graffiti is done in good taste .paintings on the building depicting beautiful people .and gorgeous statues .and who could resist the baby penguin named [female] ?\nD: The graffiti in this city is so-so. The building looks unimpressive. 
The people and statues are nothing special.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_109_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_109_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_109_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_109_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_109_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Woman with blonde hair in front of a purple wall. Woman with black hair in front of what seems to be a painting. White woman serving food to a hispanic man in mid-twenties. Woman with strong (greek) nose looking down. Hands pouring soup in front of water.\nB: Woman with brown hair in front of a purple wall. Woman with black hair in front of what seems to be a painting. Black man serving food to a hispanic man in mid-twenties. Woman with strong (greek) nose looking down. Hands pouring soup in front of water.\nC: Man with blonde hair in front of a purple wall. Woman with black hair in front of what seems to be a painting. White woman serving food to a hispanic man in mid-twenties. Woman with strong (greek) nose looking down. Hands pouring soup in front of water.\nD: Woman with blonde hair in front of a yellow wall. Woman with black hair in front of what seems to be a painting. White woman serving food to an asian man in mid-twenties. Woman with strong (greek) nose looking down. Hands pouring soup in front of water.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Woman with blonde hair in front of a purple wall. Woman with black hair in front of what seems to be a painting. 
White woman serving food to a hispanic man in mid-twenties. Woman with strong (greek) nose looking down. Hands pouring soup in front of water.\nB: Woman with brown hair in front of a purple wall. Woman with black hair in front of what seems to be a painting. Black man serving food to a hispanic man in mid-twenties. Woman with strong (greek) nose looking down. Hands pouring soup in front of water.\nC: Man with blonde hair in front of a purple wall. Woman with black hair in front of what seems to be a painting. White woman serving food to a hispanic man in mid-twenties. Woman with strong (greek) nose looking down. Hands pouring soup in front of water.\nD: Woman with blonde hair in front of a yellow wall. Woman with black hair in front of what seems to be a painting. White woman serving food to an asian man in mid-twenties. Woman with strong (greek) nose looking down. Hands pouring soup in front of water.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_110_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_110_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_110_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_110_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_110_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: I took my daughter out to eat for her birthday. Then we went to the arcade and we played some games.\nB: I took my son out to eat for his birthday. Then we went to the arcade and we played some games. He picked out some prizes to get with the tickets he won. After that we played some arcade video games together. We both enjoyed playing duck hunt the most!\nC: I took my son out to eat for his birthday. 
Then we went to the movies and we watched a movie.\nD: I took my son out to eat for his birthday. Then we went to the park and we played on the swings.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: I took my daughter out to eat for her birthday. Then we went to the arcade and we played some games.\nB: I took my son out to eat for his birthday. Then we went to the arcade and we played some games. He picked out some prizes to get with the tickets he won. After that we played some arcade video games together. We both enjoyed playing duck hunt the most!\nC: I took my son out to eat for his birthday. Then we went to the movies and we watched a movie.\nD: I took my son out to eat for his birthday. Then we went to the park and we played on the swings.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_111_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_111_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_111_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_111_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_111_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: The images capture a scenic mountain hike.\nB: The images depict a casual walk in the park.\nC: The images show a day at the beach with friends.\nD: today started off like an ordinary work day . little did i know that today would be different than all the rest !i walked along the crowded location location streets until i came to my office building . i went in and started to work .at the end of the day , i finally got the courage to ask my coworker out . 
we walked the town and talked a lot .we had a great dinner in a restaurant here . we really hit it off .we talked until dawn , and then i walked her home to her apartment . i ca n't wait to see her again !", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The images capture a scenic mountain hike.\nB: The images depict a casual walk in the park.\nC: The images show a day at the beach with friends.\nD: today started off like an ordinary work day . little did i know that today would be different than all the rest !i walked along the crowded location location streets until i came to my office building . i went in and started to work .at the end of the day , i finally got the courage to ask my coworker out . we walked the town and talked a lot .we had a great dinner in a restaurant here . we really hit it off .we talked until dawn , and then i walked her home to her apartment . i ca n't wait to see her again !", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_112_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_112_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_112_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_112_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_112_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: there is a birthday party celebration with vegetarian dishes being served and family members are bored\nB: there is a family party preparation going on there non veg dishes in the party like expensive party family members are enjoying kids are enjoying so much\nC: the images show a formal event with only vegetarian dishes and the guests seem uninterested\nD: the pictures 
depict a casual gathering with no food and kids are getting bored", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: there is a birthday party celebration with vegetarian dishes being served and family members are bored\nB: there is a family party preparation going on there non veg dishes in the party like expensive party family members are enjoying kids are enjoying so much\nC: the images show a formal event with only vegetarian dishes and the guests seem uninterested\nD: the pictures depict a casual gathering with no food and kids are getting bored", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_113_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_113_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_113_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_113_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_113_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the fireworks were a great addition ,to the annual celebration .[female] is holding up her vegetables on sticks ,while her uncle cooks on the grill .the view was breathtaking .\nB: the fireworks are a great addition ,to the annual celebration .[female] is holding up her meat on sticks ,while her uncle cooks on the grill .the view was breathtaking .\nC: the parade was a great addition ,to the annual celebration .[female] is holding up her meat on sticks ,while her uncle cooks on the grill .the view was breathtaking .\nD: the picnic was a great addition ,to the annual celebration .[male] is holding up his meat on sticks ,while his nephew cooks on the grill .the view was breathtaking .", "question": "Describe this set of images 
briefly.", "context": "Select from the following choices.\nA: the fireworks were a great addition ,to the annual celebration .[female] is holding up her vegetables on sticks ,while her uncle cooks on the grill .the view was breathtaking .\nB: the fireworks are a great addition ,to the annual celebration .[female] is holding up her meat on sticks ,while her uncle cooks on the grill .the view was breathtaking .\nC: the parade was a great addition ,to the annual celebration .[female] is holding up her meat on sticks ,while her uncle cooks on the grill .the view was breathtaking .\nD: the picnic was a great addition ,to the annual celebration .[male] is holding up his meat on sticks ,while his nephew cooks on the grill .the view was breathtaking .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_114_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_114_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_114_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_114_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_114_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: The guests were not happy with the food.\nB: today is a good day to invite family over .it is such a nice day today , and we finished cleaning up the yard .we should water all the grass before people come over .some of the guests made a campfire to cook some of the food .everyone is having a great time , the meal was great today .\nC: It is raining heavily today.\nD: The yard is not clean.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The guests were not happy with the food.\nB: today is a good day to invite family over .it is 
such a nice day today , and we finished cleaning up the yard .we should water all the grass before people come over .some of the guests made a campfire to cook some of the food .everyone is having a great time , the meal was great today .\nC: It is raining heavily today.\nD: The yard is not clean.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_115_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_115_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_115_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_115_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_115_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: she had a terrible time\nB: she fell only once\nC: i taught my daughter how to ride her bike today .she had a great time .she only fell four times .she was okay though .afterward i bought a cake for her .\nD: she was injured", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: she had a terrible time\nB: she fell only once\nC: i taught my daughter how to ride her bike today .she had a great time .she only fell four times .she was okay though .afterward i bought a cake for her .\nD: she was injured", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_116_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_116_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_116_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_116_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_116_4.jpg"], "output": "C", 
"qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: heres a certificate for our baby , told ya mom he was real !the whole family met to celebrate , generations all under one roof .mom loves taking pictures with us , she cant let go of the baby .grandma too had to have some pictures . we didnt mind though , we are one big happy family .[male] [male] might be too young for his cake but we all loved it for him .\nB: A couple sharing a special moment with their newborn baby.\nC: A family reunion with multiple generations coming together.\nD: A group of friends celebrating a birthday party.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: heres a certificate for our baby , told ya mom he was real !the whole family met to celebrate , generations all under one roof .mom loves taking pictures with us , she cant let go of the baby .grandma too had to have some pictures . 
we didnt mind though , we are one big happy family .[male] [male] might be too young for his cake but we all loved it for him .\nB: A couple sharing a special moment with their newborn baby.\nC: A family reunion with multiple generations coming together.\nD: A group of friends celebrating a birthday party.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_117_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_117_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_117_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_117_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_117_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the evening started out relaxed and laid-back, everyone was confident. they had a short warm journey ahead of them. they had to walk a long distance to get to their destination. once there, everyone was reserved and cautious. the d.j.`s started to set up their booth for a casual setting and soothing music.\nB: the night started out calm and peaceful, everyone was on the same page. they had a short warm journey ahead of them. they had to drive a short distance to get to their destination. once there, everyone stayed reserved and stayed formal. the d.j.`s started to set up their booth for a calm atmosphere and slow music.\nC: the day started out a bit hectic , everyone was a little confused .they had a long cold journey ahead of them .they had to walk a bit of a distance to get to their destination .once there , everyone pretty much let loose and got comfortable .the d.j.`s started to set up their booth for a good time and good music .\nD: the morning started out exciting and energetic, everyone was highly motivated. 
they had a short hot journey ahead of them. they had to run a long distance to get to their destination. once there, everyone eagerly got to work and got busy. the d.j.`s started to set up their booth for an early start and hyped music.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the evening started out relaxed and laid-back, everyone was confident. they had a short warm journey ahead of them. they had to walk a long distance to get to their destination. once there, everyone was reserved and cautious. the d.j.`s started to set up their booth for a casual setting and soothing music.\nB: the night started out calm and peaceful, everyone was on the same page. they had a short warm journey ahead of them. they had to drive a short distance to get to their destination. once there, everyone stayed reserved and stayed formal. the d.j.`s started to set up their booth for a calm atmosphere and slow music.\nC: the day started out a bit hectic , everyone was a little confused .they had a long cold journey ahead of them .they had to walk a bit of a distance to get to their destination .once there , everyone pretty much let loose and got comfortable .the d.j.`s started to set up their booth for a good time and good music .\nD: the morning started out exciting and energetic, everyone was highly motivated. they had a short hot journey ahead of them. they had to run a long distance to get to their destination. once there, everyone eagerly got to work and got busy. 
the d.j.`s started to set up their booth for an early start and hyped music.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_118_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_118_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_118_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_118_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_118_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the vacation spot was empty .nobody wanted to spend memorial weekend here .the pool was dirty and unappealing .the garden was an eyesore .the canyon was a boring place to visit .\nB: the vacation spot was lively and vibrant .many people came out here to spend memorial weekend .the pool was a crowded and noisy place to relax .this garden was a beautiful and picturesque sight .a few miles away from the vacation spot there was a quiet woodland people liked to visit .\nC: the vacation spot was crowded .everyone came out here to spend memorial weekend .the pool was a refreshing way to relax .this garden was an attractive sight .a few miles away from the vacation spot there was a canyon people liked to visit .\nD: the vacation spot was peaceful and serene .hardly anyone came out here to spend memorial weekend .the pool was closed and inaccessible .this garden was a dull and uninteresting sight .a few miles away from the vacation spot there was a bustling city people liked to visit .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the vacation spot was empty .nobody wanted to spend memorial weekend here .the pool was dirty and unappealing .the garden was an eyesore .the canyon was a boring place to 
visit .\nB: the vacation spot was lively and vibrant .many people came out here to spend memorial weekend .the pool was a crowded and noisy place to relax .this garden was a beautiful and picturesque sight .a few miles away from the vacation spot there was a quiet woodland people liked to visit .\nC: the vacation spot was crowded .everyone came out here to spend memorial weekend .the pool was a refreshing way to relax .this garden was an attractive sight .a few miles away from the vacation spot there was a canyon people liked to visit .\nD: the vacation spot was peaceful and serene .hardly anyone came out here to spend memorial weekend .the pool was closed and inaccessible .this garden was a dull and uninteresting sight .a few miles away from the vacation spot there was a bustling city people liked to visit .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_119_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_119_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_119_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_119_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_119_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the family went to the beach to watch the sunrise\nB: the family went to the beach to see the sunset .it was a beautiful day !there were a lot of people sitting in the sand .our dog enjoyed the trip , too !then we came home and had a delicious dinner !\nC: the family went to the mountains for a hike\nD: the family went to the park to have a picnic", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the family went to the beach to watch the sunrise\nB: the family went to 
the beach to see the sunset .it was a beautiful day !there were a lot of people sitting in the sand .our dog enjoyed the trip , too !then we came home and had a delicious dinner !\nC: the family went to the mountains for a hike\nD: the family went to the park to have a picnic", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_120_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_120_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_120_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_120_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_120_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: She appeared nervous and unprepared for the tough competition, lacking the confidence to give her best swing.\nB: She looked indifferent about bowling, uninterested in the tough competition, and unwilling to give her best swing.\nC: She seemed disinterested in bowling and unprepared for the tough competition, showing no intention to give her best swing.\nD: She looked excited to be bowling, she knew the competition was tough, but she loved a challenege. She was about to gove her best swing, bevause we all wish for a 20 pin knock out. The anticipation as you watch the ball roll down the runway, although it only takes a second Its the competition throw now, i hope he doesnt roll as good as me! 
You Question how many oins can you STRIKE OUT", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: She appeared nervous and unprepared for the tough competition, lacking the confidence to give her best swing.\nB: She looked indifferent about bowling, uninterested in the tough competition, and unwilling to give her best swing.\nC: She seemed disinterested in bowling and unprepared for the tough competition, showing no intention to give her best swing.\nD: She looked excited to be bowling, she knew the competition was tough, but she loved a challenege. She was about to gove her best swing, bevause we all wish for a 20 pin knock out. The anticipation as you watch the ball roll down the runway, although it only takes a second Its the competition throw now, i hope he doesnt roll as good as me! You Question how many oins can you STRIKE OUT", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_121_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_121_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_121_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_121_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_121_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: a good day to fish at lake a guy is ready to fish he catches one and puts in cooler he closes the cooler he greets dog\nB: a woman is feeding birds in the park\nC: a man is dancing in the rain with his dog\nD: a group of friends having a picnic by the beach", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: a good day to fish at lake a guy is ready to fish he catches one and puts in cooler he 
closes the cooler he greets dog\nB: a woman is feeding birds in the park\nC: a man is dancing in the rain with his dog\nD: a group of friends having a picnic by the beach", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_122_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_122_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_122_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_122_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_122_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Shows a wooden house from the bottom of the deck a man is using a fire with logs to make something the smoke exits through a gap in the cieling theres a ring around the fire the house is held up by beams\nB: Features a serene beach with crystal clear water and palm trees\nC: Portrays a snow-covered mountain peak with skiers in the distance\nD: Depicts a modern cityscape with tall skyscrapers and bustling streets", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Shows a wooden house from the bottom of the deck a man is using a fire with logs to make something the smoke exits through a gap in the cieling theres a ring around the fire the house is held up by beams\nB: Features a serene beach with crystal clear water and palm trees\nC: Portrays a snow-covered mountain peak with skiers in the distance\nD: Depicts a modern cityscape with tall skyscrapers and bustling streets", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_123_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_123_1.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_123_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_123_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_123_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A group of friends enjoying a scooter ride in the mountains.\nB: A group of professional off-road cyclists.\nC: A family enjoying a day out in the countryside.\nD: That is the best scooterist in Poland So sick! Off roading, I didn't even think it was possible. Tandem off roading, oh my god! Kids! Cabbage soup is ready!", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A group of friends enjoying a scooter ride in the mountains.\nB: A group of professional off-road cyclists.\nC: A family enjoying a day out in the countryside.\nD: That is the best scooterist in Poland So sick! Off roading, I didn't even think it was possible. Tandem off roading, oh my god! Kids! 
Cabbage soup is ready!", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_124_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_124_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_124_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_124_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_124_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: we waited in line for a long time and it was frustrating !\nB: the field was small and unimpressive !\nC: we were on the bus and ready to have some fun !we waited in line , but only for a little while !it was time to go and enjoy ourselves !while in the stadium , we gazed at all of the awesome sites !the field was huge and beautiful ! we had fun !\nD: we were bored and didn't enjoy the stadium at all !", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: we waited in line for a long time and it was frustrating !\nB: the field was small and unimpressive !\nC: we were on the bus and ready to have some fun !we waited in line , but only for a little while !it was time to go and enjoy ourselves !while in the stadium , we gazed at all of the awesome sites !the field was huge and beautiful ! 
we had fun !\nD: we were bored and didn't enjoy the stadium at all !", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_125_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_125_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_125_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_125_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_125_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: we first went at the shores to get a closer look at the incredible view of the city .we can see the apartment we rented for the week . it has a great view .i took this pictures of the mallards . they enjoying the light breeze of the afternoon .i asked the local about this log , apparently it came from a old crusade boat that sinked here .this is where we sat and admired the vast location sea .\nB: The city view from the shores was disappointing\nC: The apartment we rented had a terrible view\nD: The mallards were scared of the light breeze", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: we first went at the shores to get a closer look at the incredible view of the city .we can see the apartment we rented for the week . it has a great view .i took this pictures of the mallards . 
they enjoying the light breeze of the afternoon .i asked the local about this log , apparently it came from a old crusade boat that sinked here .this is where we sat and admired the vast location sea .\nB: The city view from the shores was disappointing\nC: The apartment we rented had a terrible view\nD: The mallards were scared of the light breeze", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_126_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_126_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_126_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_126_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_126_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Diana is getting ready for a party. Marzia warns him about the offensive poster. Marzia approaches Diana and asks him to remove the poster. Marzia apologizes to everyone in the room. They both remove the manifest.\nB: Diana is planning for a meeting. Marzia advises him to change the poster displayed. Marzia whispers something in Diana's ear and suggests removing the poster. Marzia is pleased to reconcile with everyone in the room. They both take down the poster.\nC: Diana is preparing for a rally. Marzia approaches him to warn that the poster displayed is offensive Marzia approaches Diana's ear and suggests that he remove the poster Marzia is happy to apologize with everyone in the room They both remove the manifest.\nD: Diana is packing for a trip. Marzia informs him about an offensive poster. Marzia talks to Diana and advises him to take down the poster. Marzia is seen apologizing to everyone in the room. 
They both remove the manifesto.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Diana is getting ready for a party. Marzia warns him about the offensive poster. Marzia approaches Diana and asks him to remove the poster. Marzia apologizes to everyone in the room. They both remove the manifest.\nB: Diana is planning for a meeting. Marzia advises him to change the poster displayed. Marzia whispers something in Diana's ear and suggests removing the poster. Marzia is pleased to reconcile with everyone in the room. They both take down the poster.\nC: Diana is preparing for a rally. Marzia approaches him to warn that the poster displayed is offensive Marzia approaches Diana's ear and suggests that he remove the poster Marzia is happy to apologize with everyone in the room They both remove the manifest.\nD: Diana is packing for a trip. Marzia informs him about an offensive poster. Marzia talks to Diana and advises him to take down the poster. Marzia is seen apologizing to everyone in the room. They both remove the manifesto.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_127_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_127_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_127_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_127_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_127_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: a busy street market in Japan\nB: a fast-food restaurant serving traditional Japanese cuisine\nC: as a japanese exchange student i 'm asked quite often about what i find different .actually , we are quite the same . 
we have our version of fast-food restaurants .we also have our traditional restaurants and markets .miso soup ! our steaming hot fast-food eaten rapidly with ohashi ( chop sticks ) .my car . see , we are quite similar to you . : )\nD: a group of friends eating sushi at a traditional restaurant", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: a busy street market in Japan\nB: a fast-food restaurant serving traditional Japanese cuisine\nC: as a japanese exchange student i 'm asked quite often about what i find different .actually , we are quite the same . we have our version of fast-food restaurants .we also have our traditional restaurants and markets .miso soup ! our steaming hot fast-food eaten rapidly with ohashi ( chop sticks ) .my car . see , we are quite similar to you . : )\nD: a group of friends eating sushi at a traditional restaurant", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_128_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_128_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_128_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_128_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_128_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: we saw terrible sand sculptures\nB: we had a terrible weekend together\nC: the weather was terrible for hanging out at the beach\nD: we had a great weekend together .before hitting the beach , we stopped at a burger stand .the weather was great for hanging out at the beach .we saw awesome sand sculptures .we ca n't wait to come back here !", "question": "Describe this set of images briefly.", "context": "Select from the following 
choices.\nA: we saw terrible sand sculptures\nB: we had a terrible weekend together\nC: the weather was terrible for hanging out at the beach\nD: we had a great weekend together .before hitting the beach , we stopped at a burger stand .the weather was great for hanging out at the beach .we saw awesome sand sculptures .we ca n't wait to come back here !", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_129_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_129_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_129_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_129_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_129_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: At the park on a sunny day, the Thompson family enjoyed a picnic. They played games and had a great time.\nB: Late one Friday night, the Smith family gathered for dinner. The kids were well-behaved and polite.\nC: Early one Sunday morning, the Sandler family gathered for breakfast. Of course, Brody made a mess while eating his cereal. The girls were gabbing. The family planned their day. Dad suggested going to the zoo.\nD: On a quiet Saturday afternoon, the Johnson family cleaned their house together.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: At the park on a sunny day, the Thompson family enjoyed a picnic. They played games and had a great time.\nB: Late one Friday night, the Smith family gathered for dinner. The kids were well-behaved and polite.\nC: Early one Sunday morning, the Sandler family gathered for breakfast. Of course, Brody made a mess while eating his cereal. The girls were gabbing. 
The family planned their day. Dad suggested going to the zoo.\nD: On a quiet Saturday afternoon, the Johnson family cleaned their house together.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_130_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_130_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_130_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_130_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_130_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The man was selling bananas at a fruit market\nB: The man was walking in the mountains\nC: The man was riding a bicycle in the city\nD: The man was admiring the bunches of bananas he had picked. He had his load of bananas tied to his sbike as he road them on the curved road. The moutains where ahead and you could see the grey sky promising rain. The mountains went higher and higher with trees and dirt. Soon the man was joined with others on bikes with bananas as well on the road.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The man was selling bananas at a fruit market\nB: The man was walking in the mountains\nC: The man was riding a bicycle in the city\nD: The man was admiring the bunches of bananas he had picked. He had his load of bananas tied to his sbike as he road them on the curved road. The moutains where ahead and you could see the grey sky promising rain. The mountains went higher and higher with trees and dirt. 
Soon the man was joined with others on bikes with bananas as well on the road.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_131_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_131_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_131_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_131_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_131_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The male is standing out of his vehicle. They are with their partner looking happy. They head down the road to see the sites. Whilst in Canada they see boats and the lakes. They are excited for their adventures.\nB: The couple is sitting inside the car. They look tired and frustrated. They are stuck in traffic and unable to move. While in Canada they are stressed about the long journey. They worry about the delays.\nC: The female is alone in the car. She looks bored and uninterested. She drives to a deserted place. While in Canada she is lost and confused. She is afraid of the unknown.\nD: There are no people in the images. Only empty vehicles and a desolate road. The surroundings are gloomy and unwelcoming. There is no sense of excitement or adventure. The atmosphere is dull and depressing.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The male is standing out of his vehicle. They are with their partner looking happy. They head down the road to see the sites. Whilst in Canada they see boats and the lakes. They are excited for their adventures.\nB: The couple is sitting inside the car. They look tired and frustrated. They are stuck in traffic and unable to move. 
While in Canada they are stressed about the long journey. They worry about the delays.\nC: The female is alone in the car. She looks bored and uninterested. She drives to a deserted place. While in Canada she is lost and confused. She is afraid of the unknown.\nD: There are no people in the images. Only empty vehicles and a desolate road. The surroundings are gloomy and unwelcoming. There is no sense of excitement or adventure. The atmosphere is dull and depressing.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_132_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_132_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_132_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_132_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_132_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: local art gallery showcasing professional artists' work.\nB: local art museum to host children 's art ,artwork done by 5th grader that reminded her of spring .8th grade student created sculptures to capture winter 's ice .12th grade student converted moms van into a shaded oasis and a great view of hills .10th grader who was amazed by organisms in the body , drew this for health class .\nC: exhibition of historical artifacts and ancient relics.\nD: collection of artwork from famous painters around the world.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: local art gallery showcasing professional artists' work.\nB: local art museum to host children 's art ,artwork done by 5th grader that reminded her of spring .8th grade student created sculptures to capture winter 's ice .12th grade student converted moms van 
into a shaded oasis and a great view of hills .10th grader who was amazed by organisms in the body , drew this for health class .\nC: exhibition of historical artifacts and ancient relics.\nD: collection of artwork from famous painters around the world.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_133_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_133_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_133_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_133_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_133_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: these pictures showcase the experience of a day at the amusement park\nB: here we are on the first day of our trip to the beach .we were so excited that we both had to take pictures .we took a short break from the beach , but we got lost .however , we found more beach and it was more peaceful .we finally got in the water after a while .\nC: this is a collection of photos from a hiking trip in the mountains\nD: these images capture the excitement of a city tour", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: these pictures showcase the experience of a day at the amusement park\nB: here we are on the first day of our trip to the beach .we were so excited that we both had to take pictures .we took a short break from the beach , but we got lost .however , we found more beach and it was more peaceful .we finally got in the water after a while .\nC: this is a collection of photos from a hiking trip in the mountains\nD: these images capture the excitement of a city tour", "input_image_path": 
["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_134_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_134_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_134_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_134_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_134_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: Nobody enjoyed themselves\nB: Some people were sleeping\nC: The party was boring\nD: i went to my friend 's party last night .some of the guest were dressed up .i had a lot of fun talking to everyone there .we spent some time playing games in the living room .after a few hours everyone was very tired .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Nobody enjoyed themselves\nB: Some people were sleeping\nC: The party was boring\nD: i went to my friend 's party last night .some of the guest were dressed up .i had a lot of fun talking to everyone there .we spent some time playing games in the living room .after a few hours everyone was very tired .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_135_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_135_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_135_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_135_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_135_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Rain was 
pouring on the swim school sign. Inside the school there were swimwears lined up. The boy was practicing swimming. The instructor was teaching the strokes. They started to swim.\nB: Moon was shining on the gym sign. Inside the gym there were yoga mats lined up. The woman was practicing yoga. The instructor was demonstrating the poses. They started to meditate.\nC: The sun was setting on the football field. Inside the field there were football jerseys lined up. The boy was practicing football. The coach was explaining the game plan. They started to play.\nD: Sun was shining on the ballet school sign. Inside the school there was dresses lined up. The girl was practising ballet. The trainer was showing them the moves. They started to dance.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Rain was pouring on the swim school sign. Inside the school there were swimwears lined up. The boy was practicing swimming. The instructor was teaching the strokes. They started to swim.\nB: Moon was shining on the gym sign. Inside the gym there were yoga mats lined up. The woman was practicing yoga. The instructor was demonstrating the poses. They started to meditate.\nC: The sun was setting on the football field. Inside the field there were football jerseys lined up. The boy was practicing football. The coach was explaining the game plan. They started to play.\nD: Sun was shining on the ballet school sign. Inside the school there was dresses lined up. The girl was practising ballet. The trainer was showing them the moves. 
They started to dance.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_136_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_136_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_136_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_136_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_136_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The family were cleaning. They set up cleaning supplies.\nB: The family were playing games. They set up a board game.\nC: The family were gardening. They planted new flowers.\nD: The family were preparing food. They set up the grill. Food was ready to be eaten. The kids were happy that food was ready. Everyone started eating.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The family were cleaning. They set up cleaning supplies.\nB: The family were playing games. They set up a board game.\nC: The family were gardening. They planted new flowers.\nD: The family were preparing food. They set up the grill. Food was ready to be eaten. The kids were happy that food was ready. 
Everyone started eating.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_137_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_137_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_137_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_137_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_137_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The climbers are struggling with extreme cold and harsh weather conditions, making it difficult for them to continue their journey.\nB: The climbers are enjoying a leisurely hike on a sunny day with no obstacles.\nC: The climbers seems to be almost at the zenith of the daring mountain. They struggle to overcome changes in elevation thru the ice filled rocky terrain. Both climbers seem exhausted from the adventure and the weight they are carrying. However, they make it to the top in an outstanding show of dedication to enjoy the view. The rest of the journey awaits as they begin to trek down the slippery slopes.\nD: The climbers are at the base of the mountain and are just starting their ascent.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The climbers are struggling with extreme cold and harsh weather conditions, making it difficult for them to continue their journey.\nB: The climbers are enjoying a leisurely hike on a sunny day with no obstacles.\nC: The climbers seems to be almost at the zenith of the daring mountain. They struggle to overcome changes in elevation thru the ice filled rocky terrain. Both climbers seem exhausted from the adventure and the weight they are carrying. 
However, they make it to the top in an outstanding show of dedication to enjoy the view. The rest of the journey awaits as they begin to trek down the slippery slopes.\nD: The climbers are at the base of the mountain and are just starting their ascent.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_138_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_138_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_138_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_138_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_138_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: The nature walk was not enjoyable.\nB: on our nature walk we explored nature on a wooden bridge .the beautiful greenery covered a bright sunshine .fences had ivy growing up them , and were noticed surrounding the park .we had treasures to take home from our nature walk .i picked some plants to bring home and place in to my garden .\nC: There were no fences and ivy in the park.\nD: The wooden bridge was covered in snow and ice.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The nature walk was not enjoyable.\nB: on our nature walk we explored nature on a wooden bridge .the beautiful greenery covered a bright sunshine .fences had ivy growing up them , and were noticed surrounding the park .we had treasures to take home from our nature walk .i picked some plants to bring home and place in to my garden .\nC: There were no fences and ivy in the park.\nD: The wooden bridge was covered in snow and ice.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_139_0.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_139_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_139_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_139_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_139_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The view is set in front of a modern office with a red door garage. A woman is talking on her phone. She looks surprised. Enters another female figure. She's looking at the woman talking on her phone. The second female figure (working) goes to the coffee machine and makes a cup. The second female figure returns to the talking first with the cup.\nB: The view is set in front of an old home with a white door garage. A woman is reading a book. She looks relaxed. Entering another female figure. She's looking around the room. The second female figure (working) goes to the fridge and takes out a sandwich. The second female figure returns to the relaxed first with the sandwich.\nC: The view is set in front of a modern home with a brown door garage. A man is typing on his laptop. He looks focused. Enters another male figure. He's looking over at the man who is focusing on his laptop. The second male figure (nonworking) goes to the fridge and cracks it open. The second male figure returns to the working first empty handed.\nD: The view is set in front of a traditional house with a wooden door garage. A woman is painting on a canvas. She looks artistic. Enters another male figure. He's looking at the woman painting. The second male figure (working) goes to the shelf and picks up a book. 
The second male figure returns to the painting first with the book.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The view is set in front of a modern office with a red door garage. A woman is talking on her phone. She looks surprised. Enters another female figure. She's looking at the woman talking on her phone. The second female figure (working) goes to the coffee machine and makes a cup. The second female figure returns to the talking first with the cup.\nB: The view is set in front of an old home with a white door garage. A woman is reading a book. She looks relaxed. Entering another female figure. She's looking around the room. The second female figure (working) goes to the fridge and takes out a sandwich. The second female figure returns to the relaxed first with the sandwich.\nC: The view is set in front of a modern home with a brown door garage. A man is typing on his laptop. He looks focused. Enters another male figure. He's looking over at the man who is focusing on his laptop. The second male figure (nonworking) goes to the fridge and cracks it open. The second male figure returns to the working first empty handed.\nD: The view is set in front of a traditional house with a wooden door garage. A woman is painting on a canvas. She looks artistic. Enters another male figure. He's looking at the woman painting. The second male figure (working) goes to the shelf and picks up a book. 
The second male figure returns to the painting first with the book.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_140_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_140_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_140_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_140_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_140_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The little girl is unhappy with her outfit choice.\nB: I, the little girl, want to show Mom my outfit for the day so i will start with my black dress. Mom looks impressed sitting on little girls bed. Little girl likes the reaction from mom and feels good about the choice. Little girl now decides to dress up in something different and ganders in the mirror. Little girl turns to strike a pose and blow a kiss.\nC: The little girl wants to show her friend her new dress.\nD: The little girl wants to show Dad her new dress.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The little girl is unhappy with her outfit choice.\nB: I, the little girl, want to show Mom my outfit for the day so i will start with my black dress. Mom looks impressed sitting on little girls bed. Little girl likes the reaction from mom and feels good about the choice. Little girl now decides to dress up in something different and ganders in the mirror. 
Little girl turns to strike a pose and blow a kiss.\nC: The little girl wants to show her friend her new dress.\nD: The little girl wants to show Dad her new dress.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_141_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_141_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_141_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_141_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_141_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The boy jumped into the clear blue ocean. The boy who had jumped into the ocean lookrd out for his friend. His friend finally joined him on the shoreline where they both donned swim trunks. The boys realized the water was warm as theybstared at eachother in surprise. The boys took each others hands and jumped into the water as they played all day.\nB: The girl ran into the stormy ocean. The girl who had run into the ocean watched out for her friend. Her friend finally joined her on the shoreline where they both wore raincoats. The girls realized the water was cold as they looked at each other in surprise. The girls took each others hands and jumped into the water as they played all day.\nC: The boy walked away from the dull ocean. The boy who had walked away from the ocean looked out for his friend. His friend finally joined him on the shoreline where they both wore heavy jackets. The boys realized the water was cold as they stared at each other in surprise. The boys took each others hands and jumped into the water as they played all day.\nD: The boy sat by the calm blue ocean. The boy who had sat by the ocean looked out for his friend. 
His friend finally joined him on the shoreline where they both wore sandals. The boys realized the water was cold as they looked at each other in disappointment. The boys took each others hands and jumped into the water as they played all day.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The boy jumped into the clear blue ocean. The boy who had jumped into the ocean lookrd out for his friend. His friend finally joined him on the shoreline where they both donned swim trunks. The boys realized the water was warm as theybstared at eachother in surprise. The boys took each others hands and jumped into the water as they played all day.\nB: The girl ran into the stormy ocean. The girl who had run into the ocean watched out for her friend. Her friend finally joined her on the shoreline where they both wore raincoats. The girls realized the water was cold as they looked at each other in surprise. The girls took each others hands and jumped into the water as they played all day.\nC: The boy walked away from the dull ocean. The boy who had walked away from the ocean looked out for his friend. His friend finally joined him on the shoreline where they both wore heavy jackets. The boys realized the water was cold as they stared at each other in surprise. The boys took each others hands and jumped into the water as they played all day.\nD: The boy sat by the calm blue ocean. The boy who had sat by the ocean looked out for his friend. His friend finally joined him on the shoreline where they both wore sandals. The boys realized the water was cold as they looked at each other in disappointment. 
The boys took each others hands and jumped into the water as they played all day.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_142_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_142_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_142_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_142_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_142_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: we ended up in a crowded garden with modern statues and no sight of the beach or pier.\nB: they took us to a dirty garden with broken statues and no beach or pier in sight.\nC: they dropped us in an immaculate garden . filled with bronzed statues ...like the protector of the camp , a strong man with a spear .we then walked to the beach and saw the gorgeous coastline..and , the first concrete pier i had ever seen .that is where i took my final photo of the trip .\nD: we were left in a messy garden with old statues and no beach or pier nearby.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: we ended up in a crowded garden with modern statues and no sight of the beach or pier.\nB: they took us to a dirty garden with broken statues and no beach or pier in sight.\nC: they dropped us in an immaculate garden . 
filled with bronzed statues ...like the protector of the camp , a strong man with a spear .we then walked to the beach and saw the gorgeous coastline..and , the first concrete pier i had ever seen .that is where i took my final photo of the trip .\nD: we were left in a messy garden with old statues and no beach or pier nearby.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_143_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_143_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_143_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_143_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_143_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: they went to the beach .they relaxed on the shore .this was a familiar structure .the waves crashed peacefully .finally, they got a worse view of the waters .\nB: they explored the city .they admired the cityscape .this was a well-known structure .the waves were calm and serene .at last, they got a great view of the city .\nC: they went on a forest hike .they took a break to rest .this was a famous structure .the leaves rustled softly .in the end, they got an amazing view of the forest .\nD: we took a trip to the mountains .we took a break to enjoy the view .this was a structure we were unfamiliar with .the waves crashed violently .lastly , we got a better view of the waters .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: they went to the beach .they relaxed on the shore .this was a familiar structure .the waves crashed peacefully .finally, they got a worse view of the waters .\nB: they explored the city .they admired the cityscape .this was a 
well-known structure .the waves were calm and serene .at last, they got a great view of the city .\nC: they went on a forest hike .they took a break to rest .this was a famous structure .the leaves rustled softly .in the end, they got an amazing view of the forest .\nD: we took a trip to the mountains .we took a break to enjoy the view .this was a structure we were unfamiliar with .the waves crashed violently .lastly , we got a better view of the waters .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_144_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_144_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_144_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_144_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_144_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the girls got silly after a few beers .\nB: the deer head was looking at us from the wall of the bar .\nC: the deer head stared at us from the wall of the bar .after a few beers the girls got a little sillyjust a bunch of friends out for a good time .the all enjoyed the drinks and the company .this man was planning for after he and his buxom lady got home .\nD: a group of friends out for a good time .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the girls got silly after a few beers .\nB: the deer head was looking at us from the wall of the bar .\nC: the deer head stared at us from the wall of the bar .after a few beers the girls got a little sillyjust a bunch of friends out for a good time .the all enjoyed the drinks and the company .this man was planning for after he and his buxom lady got home .\nD: a group of 
friends out for a good time .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_145_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_145_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_145_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_145_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_145_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Group of females standing with green hills jerseys standing in front of a city\nB: Females standing happily in front of a mountain range\nC: Female standing normally with a tent behind her Female unhappy with an orange hills jersey on and a field behind her Group of females standing with orange hills jerseys standing in front of a field Same female from the first picture standing happy with a tent behind her Same female from second picture with orange jersey with field behind her\nD: Male standing with a tent behind him", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Group of females standing with green hills jerseys standing in front of a city\nB: Females standing happily in front of a mountain range\nC: Female standing normally with a tent behind her Female unhappy with an orange hills jersey on and a field behind her Group of females standing with orange hills jerseys standing in front of a field Same female from the first picture standing happy with a tent behind her Same female from second picture with orange jersey with field behind her\nD: Male standing with a tent behind him", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_146_0.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_146_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_146_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_146_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_146_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Take the moment to celebrate this special date.\nB: At this event we are pleased to introduce Nadi! This artist who has been very successful in recent years. At the end of the event, anyone who wants to take a photo will be available. Take the moment to celebrate this special date. And our dishes are delicious.\nC: This artist who has been very successful in recent years.\nD: At the end of the event, anyone who wants to take a photo will be available.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Take the moment to celebrate this special date.\nB: At this event we are pleased to introduce Nadi! This artist who has been very successful in recent years. At the end of the event, anyone who wants to take a photo will be available. Take the moment to celebrate this special date. 
And our dishes are delicious.\nC: This artist who has been very successful in recent years.\nD: At the end of the event, anyone who wants to take a photo will be available.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_147_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_147_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_147_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_147_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_147_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Asked a question by a foreigner. both of them watching the ice and raw fish. Another question asked by foreigner. Cutting the fish. Frying the fish on fire.\nB: Both of them watching a movie. Asked a question by a friend. Cutting the vegetables. Frying the vegetables on fire.\nC: Both of them watching TV. Asked a question by a local. Cutting the bread. Frying the bread on fire.\nD: Both of them watching the game. Asked a question by a tourist. Cutting the meat. Frying the meat on fire.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Asked a question by a foreigner. both of them watching the ice and raw fish. Another question asked by foreigner. Cutting the fish. Frying the fish on fire.\nB: Both of them watching a movie. Asked a question by a friend. Cutting the vegetables. Frying the vegetables on fire.\nC: Both of them watching TV. Asked a question by a local. Cutting the bread. Frying the bread on fire.\nD: Both of them watching the game. Asked a question by a tourist. Cutting the meat. 
Frying the meat on fire.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_148_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_148_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_148_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_148_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_148_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: i hate animals and they knew i would hate the dog frisbee competition as part of the entertainment for the day\nB: i thought they might have forgotten but they really surprised me when we got home with a birthday cake just for me but of course i will eat it all\nC: i thought they might have forgotten and they really did forget when we got home with a birthday cake just for me and i didn't have to share it\nD: they told me it was my day , they were taking me out for a good time and that i deserved it .there is nothing like good friends/family , carnival style food and good entertainment to really lift a girl 's spirits .i love animals and they knew i would just love the dog frisbee competition as part of the entertainment for the day .i thought they might have forgotten but they really surprised me when we got home with a birthday cake just for me but of course i will share it .this really was my day and i am so lucky to have such special family and friends to spend time with that would surprise me with this cake which is just so pretty to me that i had to take a good picture of it for my scrap book so i never forget it .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: i hate animals and they knew i would hate the dog frisbee competition as part of 
the entertainment for the day\nB: i thought they might have forgotten but they really surprised me when we got home with a birthday cake just for me but of course i will eat it all\nC: i thought they might have forgotten and they really did forget when we got home with a birthday cake just for me and i didn't have to share it\nD: they told me it was my day , they were taking me out for a good time and that i deserved it .there is nothing like good friends/family , carnival style food and good entertainment to really lift a girl 's spirits .i love animals and they knew i would just love the dog frisbee competition as part of the entertainment for the day .i thought they might have forgotten but they really surprised me when we got home with a birthday cake just for me but of course i will share it .this really was my day and i am so lucky to have such special family and friends to spend time with that would surprise me with this cake which is just so pretty to me that i had to take a good picture of it for my scrap book so i never forget it .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_149_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_149_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_149_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_149_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_149_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: it 's a fun night out and this guy just wants to have fun .he meets with his friend who looks very disinterested in hanging out .he meets someone else who shows more interest in hanging with him .he pops a cigar in his mouth and smiles .time for a group picture while continuing to smoke his 
cigar .\nB: it 's a casual night out and this guy just wants to relax .he meets with his friend who looks interested in hanging out .he meets someone else who shows more interest in hanging with him .he pops a cigar in his mouth and frowns .time for a group picture while putting away his cigar .\nC: it 's a boring night out and this guy just wants to be alone .he meets with his friend who looks very disinterested in hanging out .he meets someone else who shows no interest in hanging with him .he pops a cigar in his mouth and frowns .time for a group picture while continuing to smoke his cigar .\nD: it 's a exciting night out and this guy just wants to have fun .he meets with his friend who looks very interested in hanging out .he meets someone else who shows more interest in hanging with him .he pops a cigarette in his mouth and smiles .time for a group picture while continuing to smoke his cigar .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: it 's a fun night out and this guy just wants to have fun .he meets with his friend who looks very disinterested in hanging out .he meets someone else who shows more interest in hanging with him .he pops a cigar in his mouth and smiles .time for a group picture while continuing to smoke his cigar .\nB: it 's a casual night out and this guy just wants to relax .he meets with his friend who looks interested in hanging out .he meets someone else who shows more interest in hanging with him .he pops a cigar in his mouth and frowns .time for a group picture while putting away his cigar .\nC: it 's a boring night out and this guy just wants to be alone .he meets with his friend who looks very disinterested in hanging out .he meets someone else who shows no interest in hanging with him .he pops a cigar in his mouth and frowns .time for a group picture while continuing to smoke his cigar .\nD: it 's a exciting night out and this guy just wants to have fun .he meets with his 
friend who looks very interested in hanging out .he meets someone else who shows more interest in hanging with him .he pops a cigarette in his mouth and smiles .time for a group picture while continuing to smoke his cigar .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_150_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_150_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_150_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_150_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_150_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: we stayed at a desert with sand dunes\nB: we visited a big city with skyscrapers\nC: on our vacation we traveled through several small towns , each one of them unique .outside of the towns we often saw rolling meadows lined with flowers .the meadows , in turn , were lined with rocky mountain cliffs , such as this one .laying between the mountains and the meadows were some beautiful rivers .finally after passing over one last mountain , we arrived at the beach . our vacation destination was achieved .\nD: we explored a forest with dense foliage", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: we stayed at a desert with sand dunes\nB: we visited a big city with skyscrapers\nC: on our vacation we traveled through several small towns , each one of them unique .outside of the towns we often saw rolling meadows lined with flowers .the meadows , in turn , were lined with rocky mountain cliffs , such as this one .laying between the mountains and the meadows were some beautiful rivers .finally after passing over one last mountain , we arrived at the beach . 
our vacation destination was achieved .\nD: we explored a forest with dense foliage", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_151_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_151_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_151_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_151_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_151_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: occasionally , the whole family gets together for a camping trip .some of the family have small campers .others have huge rvs .all that really matters is that they get to spend time together .they even do activities like ride bikes through the woods .\nB: every year , the whole family gets together for a camping trip .some of the family have small campers .others have huge rvs .all that really matters is that they get to spend time together .they even do activities like ride bikes through the woods .\nC: each year , the entire family gets together for a beach trip .some of the family have small tents .others have big houses .all that really matters is that they get to spend time together .they even do activities like play volleyball on the sand .\nD: every year , the whole family gets together for a hiking trip .some of the family have small tents .others have large cabins .all that really matters is that they get to spend time together .they even do activities like go for a swim in the river .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: occasionally , the whole family gets together for a camping trip .some of the family have small campers .others have huge rvs .all that really matters 
is that they get to spend time together .they even do activities like ride bikes through the woods .\nB: every year , the whole family gets together for a camping trip .some of the family have small campers .others have huge rvs .all that really matters is that they get to spend time together .they even do activities like ride bikes through the woods .\nC: each year , the entire family gets together for a beach trip .some of the family have small tents .others have big houses .all that really matters is that they get to spend time together .they even do activities like play volleyball on the sand .\nD: every year , the whole family gets together for a hiking trip .some of the family have small tents .others have large cabins .all that really matters is that they get to spend time together .they even do activities like go for a swim in the river .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_152_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_152_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_152_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_152_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_152_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: i went for a bike ride in the suburbs .then i discovered a bustling city .all the paths in the city were quite busy .later i spotted some historic mail boxes .finally i got to a crowded playground .\nB: i went for a run in the mountains .then i found a deserted town .all the pathways in the town were extremely dusty .next i saw some modern mail boxes .finally i arrived at a big park .\nC: i went for a swim in the city .then i stumbled upon a big city .all the streets in the city were 
very clean .later i found some new mail boxes .finally i reached a large stadium .\nD: i took a walk on in the country .then i came across a small neighborhood .all the roads in the neighborhood were really muddy .after that i came across some really old mail boxes .at the end of my walk i came to a small playground .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: i went for a bike ride in the suburbs .then i discovered a bustling city .all the paths in the city were quite busy .later i spotted some historic mail boxes .finally i got to a crowded playground .\nB: i went for a run in the mountains .then i found a deserted town .all the pathways in the town were extremely dusty .next i saw some modern mail boxes .finally i arrived at a big park .\nC: i went for a swim in the city .then i stumbled upon a big city .all the streets in the city were very clean .later i found some new mail boxes .finally i reached a large stadium .\nD: i took a walk on in the country .then i came across a small neighborhood .all the roads in the neighborhood were really muddy .after that i came across some really old mail boxes .at the end of my walk i came to a small playground .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_153_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_153_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_153_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_153_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_153_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: boo , what a great place he told them .\nB: wow , what a terrible place he told them .\nC: eh , what a mediocre 
place he told them .\nD: wow , what a great place he told them .casino ' .live entertainment .excellent libraries .and exotic foods .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: boo , what a great place he told them .\nB: wow , what a terrible place he told them .\nC: eh , what a mediocre place he told them .\nD: wow , what a great place he told them .casino ' .live entertainment .excellent libraries .and exotic foods .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_154_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_154_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_154_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_154_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_154_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A black car is parked outside of the cabin and it's sunny.\nB: A white van is waiting outside of the cabin and its raining.Behind the van lots of trees can be seen A woman sitting on the floor of the cabin, making bun of her hair and light is on. The woman is listening very keenly what is going outside of the cabin. She is trying to open the door, as if she is being locked in this cabin. 
She is bending her head and thinking how to escape from this cabin.\nC: A man is standing outside of the cabin and the lights are off.\nD: A woman is dancing inside the cabin and there is no door.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A black car is parked outside of the cabin and it's sunny.\nB: A white van is waiting outside of the cabin and its raining.Behind the van lots of trees can be seen A woman sitting on the floor of the cabin, making bun of her hair and light is on. The woman is listening very keenly what is going outside of the cabin. She is trying to open the door, as if she is being locked in this cabin. She is bending her head and thinking how to escape from this cabin.\nC: A man is standing outside of the cabin and the lights are off.\nD: A woman is dancing inside the cabin and there is no door.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_155_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_155_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_155_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_155_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_155_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: This is a grocery shop there are variety of fruits here This is all fresh fruits The red apple is so nice They bought green apple\nB: This a shirt shop there are variet yof shirts here This is all second-hand shirt The red color shirt is so nice They bought blue color shirt\nC: This a shoe shop there are verity of shoe here This is all brand new shoe The red color shoe is so nice They bought black color shoe\nD: This is a furniture shop with a variety of 
tables and chairs This is all used furniture The red color chair is so nice They bought green color table", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: This is a grocery shop there are variety of fruits here This is all fresh fruits The red apple is so nice They bought green apple\nB: This a shirt shop there are variet yof shirts here This is all second-hand shirt The red color shirt is so nice They bought blue color shirt\nC: This a shoe shop there are verity of shoe here This is all brand new shoe The red color shoe is so nice They bought black color shoe\nD: This is a furniture shop with a variety of tables and chairs This is all used furniture The red color chair is so nice They bought green color table", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_156_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_156_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_156_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_156_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_156_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: a pink , purple and orange cake is the desert of choice at the gathering .guests sat at green tablecloth tables and ate their dinner .several people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .\nB: a blue , white and yellow cake is the desert of choice at the gathering .guests sat at red tablecloth tables and ate their dinner .few people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .\nC: a green , black 
and orange cake is the desert of choice at the gathering .guests sat at blue tablecloth tables and ate their dinner .some people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .\nD: a brown , white and purple cake is the desert of choice at the gathering .guests sat at white tablecloth tables and ate their dinner .many people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: a pink , purple and orange cake is the desert of choice at the gathering .guests sat at green tablecloth tables and ate their dinner .several people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .\nB: a blue , white and yellow cake is the desert of choice at the gathering .guests sat at red tablecloth tables and ate their dinner .few people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .\nC: a green , black and orange cake is the desert of choice at the gathering .guests sat at blue tablecloth tables and ate their dinner .some people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .\nD: a brown , white and purple cake is the desert of choice at the gathering .guests sat at white tablecloth tables and ate their dinner .many people enjoyed the chocolate fountain .a guest speaker stole the show before the cake was served .kids and adults danced after dinner .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_157_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_157_1.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_157_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_157_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_157_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A man exploring the city with his family and capturing memorable moments\nB: A photographer capturing the natural beauty of the city with stunning landscapes\nC: One woman wanted to film the city's beauty abroad. She hopes everyone likes her page. She started with people videos. After that, watch the movie on the buildings, paying particular attention to the architecture. Finally, she posts statue videos to her profile and becomes popular.\nD: A group of tourists enjoying the local cuisine and cultural activities", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A man exploring the city with his family and capturing memorable moments\nB: A photographer capturing the natural beauty of the city with stunning landscapes\nC: One woman wanted to film the city's beauty abroad. She hopes everyone likes her page. She started with people videos. After that, watch the movie on the buildings, paying particular attention to the architecture. 
Finally, she posts statue videos to her profile and becomes popular.\nD: A group of tourists enjoying the local cuisine and cultural activities", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_158_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_158_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_158_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_158_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_158_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the town square was a deserted place to shop .townspeople walked through the square to get to their boats in the morning .fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a living .this coastal city provided a way of life for the residents that kept everyone fed and productive .\nB: the town square was a vibrant place to shop .townspeople walked through the square to get to their boats in the morning .fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a living .this coastal city provided a way of life for the residents that kept everyone fed and productive .\nC: the town square was a crowded place to shop .townspeople walked through the square to get to their boats in the morning .fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a living .this coastal city provided a way of life for the residents that kept everyone fed and productive .\nD: the town square was a quiet place to shop .townspeople walked through the square to get to their boats in the morning .fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a 
living .this coastal city provided a way of life for the residents that kept everyone fed and productive .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the town square was a deserted place to shop .townspeople walked through the square to get to their boats in the morning .fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a living .this coastal city provided a way of life for the residents that kept everyone fed and productive .\nB: the town square was a vibrant place to shop .townspeople walked through the square to get to their boats in the morning .fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a living .this coastal city provided a way of life for the residents that kept everyone fed and productive .\nC: the town square was a crowded place to shop .townspeople walked through the square to get to their boats in the morning .fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a living .this coastal city provided a way of life for the residents that kept everyone fed and productive .\nD: the town square was a quiet place to shop .townspeople walked through the square to get to their boats in the morning .fishermen usually left the docks in the morning to work .the ocean was a fruitful way to make a living .this coastal city provided a way of life for the residents that kept everyone fed and productive .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_159_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_159_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_159_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_159_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_159_4.jpg"], 
"output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Sitting in a park on a sunny day enjoying a picnic\nB: Walking through a forest and looking at the tall trees\nC: Sitting on the river side on a beautiful day fishing my life away. Got one hooked now to real it in nice and slow to not rip the fishes lip. Caught just a little baby fish but its okay because we catch and release. Releasing the fish because I don't see the point in keeping it when I am still fishing. Throw your line back in and just keep on catching them.\nD: Standing on the beach on a cloudy day watching the waves", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Sitting in a park on a sunny day enjoying a picnic\nB: Walking through a forest and looking at the tall trees\nC: Sitting on the river side on a beautiful day fishing my life away. Got one hooked now to real it in nice and slow to not rip the fishes lip. Caught just a little baby fish but its okay because we catch and release. Releasing the fish because I don't see the point in keeping it when I am still fishing. 
Throw your line back in and just keep on catching them.\nD: Standing on the beach on a cloudy day watching the waves", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_160_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_160_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_160_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_160_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_160_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Kat visited her best friend alone\nB: They all went to a restaurant\nC: Kat took her little sister Julie along to visit her best friend. They arrived at Michelle's house and Julie knocked on the door. Michelle opened the door and greeted them. Kat explained how Julie wanted to tag along. They all went into the living room to plan on what to do.\nD: They went to Michelle's house but no one was home", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Kat visited her best friend alone\nB: They all went to a restaurant\nC: Kat took her little sister Julie along to visit her best friend. They arrived at Michelle's house and Julie knocked on the door. Michelle opened the door and greeted them. Kat explained how Julie wanted to tag along. 
They all went into the living room to plan on what to do.\nD: They went to Michelle's house but no one was home", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_161_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_161_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_161_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_161_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_161_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Friend supporting the girl boss\nB: Girl showing respect to her elders\nC: Girl boss wants to show off that she is better even better than this old Mom person in here This other Friend of hers says something to her It seems to be something mean because the Mom is here too Then another girls shows up as being the best one\nD: Mom scolding the girl boss", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Friend supporting the girl boss\nB: Girl showing respect to her elders\nC: Girl boss wants to show off that she is better even better than this old Mom person in here This other Friend of hers says something to her It seems to be something mean because the Mom is here too Then another girls shows up as being the best one\nD: Mom scolding the girl boss", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_162_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_162_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_162_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_162_3.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_162_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The captain gives flight time as flight 997 leaves Hartfield-Jackson Airport on its way to Seattle One of the window seat passengers is amazed by birds' eye view' About mid-flight Angel, one of the attendants get ready to prepare the meals for the passengers. Angel checks with the passengers to see who is ready for a meal and explains what she has. Angel and Tam move aside in the kitchen so Charles, another attendant could get through.\nB: The passengers are preparing for a sports event.\nC: The passengers are getting ready for a music performance.\nD: The passengers are enjoying a movie on the flight.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The captain gives flight time as flight 997 leaves Hartfield-Jackson Airport on its way to Seattle One of the window seat passengers is amazed by birds' eye view' About mid-flight Angel, one of the attendants get ready to prepare the meals for the passengers. Angel checks with the passengers to see who is ready for a meal and explains what she has. 
Angel and Tam move aside in the kitchen so Charles, another attendant could get through.\nB: The passengers are preparing for a sports event.\nC: The passengers are getting ready for a music performance.\nD: The passengers are enjoying a movie on the flight.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_163_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_163_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_163_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_163_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_163_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Someone they dont know is walking up to their house. The stranger asked to take a selfie. The men took a selfie together. The man left abruptly so everyone was confused. The 2 men discussed the man leaving abruptly.\nB: A group of friends celebrating a birthday at a restaurant.\nC: A family having a picnic in the park.\nD: A couple shopping for groceries at the supermarket.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Someone they dont know is walking up to their house. The stranger asked to take a selfie. The men took a selfie together. The man left abruptly so everyone was confused. 
The 2 men discussed the man leaving abruptly.\nB: A group of friends celebrating a birthday at a restaurant.\nC: A family having a picnic in the park.\nD: A couple shopping for groceries at the supermarket.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_164_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_164_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_164_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_164_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_164_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: excited to be at the birthday party .waiting for the cake to be cut .sitting and watching the balloons on the ground .running in the hallway .ready to blow out the candles and have some cake .\nB: so happy to be at the birthday party .waiting for the cake to be cut .enjoying running through the balloons on the ground .playing hide and seek in the hallway .ready to blow out the candles and have some cake .\nC: feeling bored at the party .waiting for the cake to be cut .tripping on the balloons on the ground .hiding in the hallway .ready to blow out the candles and have some cake .\nD: thrilled to be at the birthday party .waiting for the cake to be cut .popping the balloons on the ground .standing around in the hallway .ready to blow out the candles and have some cake .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: excited to be at the birthday party .waiting for the cake to be cut .sitting and watching the balloons on the ground .running in the hallway .ready to blow out the candles and have some cake .\nB: so happy to be at the birthday party .waiting for the cake 
to be cut .enjoying running through the balloons on the ground .playing hide and seek in the hallway .ready to blow out the candles and have some cake .\nC: feeling bored at the party .waiting for the cake to be cut .tripping on the balloons on the ground .hiding in the hallway .ready to blow out the candles and have some cake .\nD: thrilled to be at the birthday party .waiting for the cake to be cut .popping the balloons on the ground .standing around in the hallway .ready to blow out the candles and have some cake .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_165_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_165_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_165_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_165_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_165_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A woman and a man are at a car dealership test driving different cars. They try out a few models before deciding on a red convertible. They then negotiate the price with the salesperson before completing the purchase.\nB: A group of friends are at an amusement park trying out different rides. They first go on the roller coaster and then the Ferris wheel. They finally end their day with some snacks and decide to go home.\nC: A mother and son are at the toys store looking for toys that the kid may enjoy. He is looking through the aisles looking at whatever he might be interested in. He takes notice of the car type toys and he's very interested in driving it. The mother wants to have a slight bit of fun so she hops in the car with him to have fun. 
They both decide on which toys to get and the proceed to the checkout to buy them.\nD: A father and daughter are at the bakery looking for a cake to buy. The daughter looks through the cakes and chooses a chocolate cake. The father wants to surprise her, so he decides to buy a strawberry cake instead.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A woman and a man are at a car dealership test driving different cars. They try out a few models before deciding on a red convertible. They then negotiate the price with the salesperson before completing the purchase.\nB: A group of friends are at an amusement park trying out different rides. They first go on the roller coaster and then the Ferris wheel. They finally end their day with some snacks and decide to go home.\nC: A mother and son are at the toys store looking for toys that the kid may enjoy. He is looking through the aisles looking at whatever he might be interested in. He takes notice of the car type toys and he's very interested in driving it. The mother wants to have a slight bit of fun so she hops in the car with him to have fun. They both decide on which toys to get and the proceed to the checkout to buy them.\nD: A father and daughter are at the bakery looking for a cake to buy. The daughter looks through the cakes and chooses a chocolate cake. 
The father wants to surprise her, so he decides to buy a strawberry cake instead.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_166_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_166_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_166_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_166_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_166_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: it was a quiet day with only a few people at the stadium\nB: the stadium was old and in poor condition\nC: it was opening day and the crowd was lining up to cheer their team onthe new stadium was a great undertaking but it came out looking really goodit was a sold out ball game for the opening day and the weather was beautifulthe crowd roared as the first pitch was thrownafter the ball game the crowd quickly dispersed to their local watering holes\nD: the game was cancelled due to bad weather", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: it was a quiet day with only a few people at the stadium\nB: the stadium was old and in poor condition\nC: it was opening day and the crowd was lining up to cheer their team onthe new stadium was a great undertaking but it came out looking really goodit was a sold out ball game for the opening day and the weather was beautifulthe crowd roared as the first pitch was thrownafter the ball game the crowd quickly dispersed to their local watering holes\nD: the game was cancelled due to bad weather", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_167_0.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_167_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_167_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_167_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_167_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A man is talking to a man. The men are not responding. They are having a conversation. They are drinking their coffee. The guy is sitting on the chair.\nB: A man is asking the woman something. The women are not responding. They are having a gathering. They are eating their food. The guy is around the table.\nC: A woman is talking to a man. The man is not responding. They are having a meeting. They are discussing a project. The lady is sitting at the desk.\nD: A woman is talking to a woman. The women are not responding. They are having a chat. They are reading their books. The lady is standing by the window.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A man is talking to a man. The men are not responding. They are having a conversation. They are drinking their coffee. The guy is sitting on the chair.\nB: A man is asking the woman something. The women are not responding. They are having a gathering. They are eating their food. The guy is around the table.\nC: A woman is talking to a man. The man is not responding. They are having a meeting. They are discussing a project. The lady is sitting at the desk.\nD: A woman is talking to a woman. The women are not responding. They are having a chat. They are reading their books. 
The lady is standing by the window.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_168_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_168_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_168_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_168_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_168_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: he had put up the decorations for the celebration .he was looking forward to a day off .he thought he 'd go for a hike .maybe do a little fishing .if the fish were n't biting he could always go hunting .\nB: he was getting ready for a party .he wanted to spend the day with friends .he was excited to socialize .\nC: he was preparing for a quiet day at home .he was not interested in going out .he was planning to stay indoors .\nD: he was disappointed with the decorations .he was not satisfied with the arrangements .he was frustrated with the celebration .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: he had put up the decorations for the celebration .he was looking forward to a day off .he thought he 'd go for a hike .maybe do a little fishing .if the fish were n't biting he could always go hunting .\nB: he was getting ready for a party .he wanted to spend the day with friends .he was excited to socialize .\nC: he was preparing for a quiet day at home .he was not interested in going out .he was planning to stay indoors .\nD: he was disappointed with the decorations .he was not satisfied with the arrangements .he was frustrated with the celebration .", "input_image_path": 
["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_169_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_169_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_169_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_169_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_169_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: if you work in television and do a good job , you 've probably been invited to our awards night .there are some real legends that show up every year .i loved meeting the cast of `` wait , wait , do n't tell me ! '' what a thrill !there 's a lot of meeting and greeting time too -- -it 's a good networking opportunity .the coveted golden television award is always the highlight of the night .\nB: if you work in radio and do a good job , you 've probably been invited to our awards night .there are some fake legends that show up every year .i hated meeting the cast of `` wait , wait , do n't tell me ! '' what a thrill !there 's not a lot of meeting and greeting time too -- -it 's not a good networking opportunity .the dreaded golden radio award is always the highlight of the night .\nC: if you work in radio and do a bad job , you 've probably been invited to our awards night .there are some real legends that show up every year .i loved meeting the cast of `` wait , wait , do n't tell me ! 
'' what a thrill !there 's a lot of meeting and greeting time too -- -it 's a good networking opportunity .the coveted golden radio award is always the highlight of the night .\nD: if you work in radio and do a good job , you 've probably been invited to our awards night .there are some real legends that show up every year .i loved meeting the cast of `` wait , wait , do n't tell me ! '' what a thrill !there 's a lot of meeting and greeting time too -- -it 's a good networking opportunity .the coveted golden radio award is always the highlight of the night .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: if you work in television and do a good job , you 've probably been invited to our awards night .there are some real legends that show up every year .i loved meeting the cast of `` wait , wait , do n't tell me ! '' what a thrill !there 's a lot of meeting and greeting time too -- -it 's a good networking opportunity .the coveted golden television award is always the highlight of the night .\nB: if you work in radio and do a good job , you 've probably been invited to our awards night .there are some fake legends that show up every year .i hated meeting the cast of `` wait , wait , do n't tell me ! '' what a thrill !there 's not a lot of meeting and greeting time too -- -it 's not a good networking opportunity .the dreaded golden radio award is always the highlight of the night .\nC: if you work in radio and do a bad job , you 've probably been invited to our awards night .there are some real legends that show up every year .i loved meeting the cast of `` wait , wait , do n't tell me ! 
'' what a thrill !there 's a lot of meeting and greeting time too -- -it 's a good networking opportunity .the coveted golden radio award is always the highlight of the night .\nD: if you work in radio and do a good job , you 've probably been invited to our awards night .there are some real legends that show up every year .i loved meeting the cast of `` wait , wait , do n't tell me ! '' what a thrill !there 's a lot of meeting and greeting time too -- -it 's a good networking opportunity .the coveted golden radio award is always the highlight of the night .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_170_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_170_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_170_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_170_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_170_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: two men talk next to a bus stop three kids play on a dock a man records with a little brown-haired girl on the sand people cross the pathway the family walks on the pavement\nB: two women discuss next to a parking meter three adults walk on a pontoon a woman makes a video with a little blonde boy on the sand people cross the street the family walks on the sidewalk\nC: two men discuss next to a parking meter three children walk on a pontoon a man makes a video with a little blonde girl on the sand people cross the road the family walks on the side of the road\nD: two men argue next to a parking meter three teenagers walk on a bridge a woman takes a photo with a little brunette girl on the sand people cross the intersection the family walks on the edge of the road", 
"question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: two men talk next to a bus stop three kids play on a dock a man records with a little brown-haired girl on the sand people cross the pathway the family walks on the pavement\nB: two women discuss next to a parking meter three adults walk on a pontoon a woman makes a video with a little blonde boy on the sand people cross the street the family walks on the sidewalk\nC: two men discuss next to a parking meter three children walk on a pontoon a man makes a video with a little blonde girl on the sand people cross the road the family walks on the side of the road\nD: two men argue next to a parking meter three teenagers walk on a bridge a woman takes a photo with a little brunette girl on the sand people cross the intersection the family walks on the edge of the road", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_171_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_171_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_171_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_171_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_171_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: i waited for the bus to arrive to take us to the beach .it was a beautiful day for the beach .after the beach we went for a walk .there were a lot of people on the pier walking as well .after our walk we got back in the bus . 
we were very tired .\nB: people waiting at a train station\nC: sunny day at a ski resort\nD: crowded city street", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: i waited for the bus to arrive to take us to the beach .it was a beautiful day for the beach .after the beach we went for a walk .there were a lot of people on the pier walking as well .after our walk we got back in the bus . we were very tired .\nB: people waiting at a train station\nC: sunny day at a ski resort\nD: crowded city street", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_172_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_172_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_172_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_172_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_172_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The guests at the wedding stand up from their tables. The couple makes a speech. The couple make a joke. The guests laugh at the joke. The guests await the marriage ceremony.\nB: There is no speech or joke at the wedding.\nC: The guests are not laughing at any joke at the wedding.\nD: The guests are sitting at their tables during the wedding.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The guests at the wedding stand up from their tables. The couple makes a speech. The couple make a joke. The guests laugh at the joke. 
The guests await the marriage ceremony.\nB: There is no speech or joke at the wedding.\nC: The guests are not laughing at any joke at the wedding.\nD: The guests are sitting at their tables during the wedding.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_173_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_173_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_173_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_173_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_173_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The teammates are supportive and cheering for the girl with a ponytail.\nB: The coach is happy with her team's performance.\nC: The girl with a ponytail is content with her coach.\nD: The blonde coach looks disapprovingly at her team. The girl with a ponytail is standing by herself and she is not happy with her coach. The girl's three teammates are disappointed with what the coach is saying, but they stay quiet. The blonde coach begins her dressing down of the team. The girl on her own talks back to her coach in a disrespectful manner.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The teammates are supportive and cheering for the girl with a ponytail.\nB: The coach is happy with her team's performance.\nC: The girl with a ponytail is content with her coach.\nD: The blonde coach looks disapprovingly at her team. The girl with a ponytail is standing by herself and she is not happy with her coach. The girl's three teammates are disappointed with what the coach is saying, but they stay quiet. The blonde coach begins her dressing down of the team. 
The girl on her own talks back to her coach in a disrespectful manner.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_174_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_174_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_174_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_174_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_174_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: i took a selfie when my husband and i went to the beach .he made the decision to play pool after the beach .i did n't realize how well he played and he picked a great bar to play at .we had a good time laughing and playing .i took another selfie at the end of the day because i was so happy with the time i had .\nB: we went to the movies and watched a comedy.\nC: i went to the beach with a friend and we played volleyball.\nD: my husband and i went to a restaurant and had a nice dinner.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: i took a selfie when my husband and i went to the beach .he made the decision to play pool after the beach .i did n't realize how well he played and he picked a great bar to play at .we had a good time laughing and playing .i took another selfie at the end of the day because i was so happy with the time i had .\nB: we went to the movies and watched a comedy.\nC: i went to the beach with a friend and we played volleyball.\nD: my husband and i went to a restaurant and had a nice dinner.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_175_0.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_175_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_175_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_175_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_175_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: this girl is happy and beautiful !this llama looks crazy !what a great time at the fair !this clown is so funny and entertaining !what a beautiful and lit up sight !\nB: this girl looks sad and dull !this llama looks calm !what a boring and gloomy time at the fair !this clown is serious and uninteresting !what a ugly and dark sight !\nC: this girl looks worried and plain !this llama looks normal !what a mediocre and uninteresting time at the fair !this clown is not funny and unentertaining !what a dull and unexciting sight !\nD: this girl looks angry and ugly !this llama looks peaceful !what a terrible and scary time at the fair !this clown is boring and dull !what a horrible and dull sight !", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: this girl is happy and beautiful !this llama looks crazy !what a great time at the fair !this clown is so funny and entertaining !what a beautiful and lit up sight !\nB: this girl looks sad and dull !this llama looks calm !what a boring and gloomy time at the fair !this clown is serious and uninteresting !what a ugly and dark sight !\nC: this girl looks worried and plain !this llama looks normal !what a mediocre and uninteresting time at the fair !this clown is not funny and unentertaining !what a dull and unexciting sight !\nD: this girl looks angry and ugly !this llama looks peaceful !what a terrible and scary time at the fair !this clown is 
boring and dull !what a horrible and dull sight !", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_176_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_176_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_176_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_176_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_176_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: A woman is shown looking through a crowd. Women sitting next to each other do not face the camera. People are watching someone present. People are gather at long dining tables to eat food. A man and woman are happily looking towards the front of the room.\nB: A woman is shown looking through a crowd. Women sitting next to each other turn and face the camera. People are watching someone present. People are gather at long dining tables to eat food. A man and woman are happily looking towards the front of the room.\nC: A man is shown looking through a crowd. Women sitting next to each other turn and face the camera. People are watching someone present. People are gather at long dining tables to eat food. A woman and man are happily looking towards the front of the room.\nD: A woman is shown looking through a crowd. Men sitting next to each other turn and face the camera. People are watching someone present. People are gather at long dining tables to eat food. A man and woman are happily looking towards the front of the room.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: A woman is shown looking through a crowd. Women sitting next to each other do not face the camera. People are watching someone present. 
People are gather at long dining tables to eat food. A man and woman are happily looking towards the front of the room.\nB: A woman is shown looking through a crowd. Women sitting next to each other turn and face the camera. People are watching someone present. People are gather at long dining tables to eat food. A man and woman are happily looking towards the front of the room.\nC: A man is shown looking through a crowd. Women sitting next to each other turn and face the camera. People are watching someone present. People are gather at long dining tables to eat food. A woman and man are happily looking towards the front of the room.\nD: A woman is shown looking through a crowd. Men sitting next to each other turn and face the camera. People are watching someone present. People are gather at long dining tables to eat food. A man and woman are happily looking towards the front of the room.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_177_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_177_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_177_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_177_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_177_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The images reveal a small and unremarkable garden.\nB: The images depict a crowded and untidy garden.\nC: The images show a dreary and uninviting garden.\nD: i am very fond of garden we spand some hover over these flowers there are many beautiful places we have a very beautiful garden in our house we gather twice a week in this room", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The 
images reveal a small and unremarkable garden.\nB: The images depict a crowded and untidy garden.\nC: The images show a dreary and uninviting garden.\nD: i am very fond of garden we spand some hover over these flowers there are many beautiful places we have a very beautiful garden in our house we gather twice a week in this room", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_178_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_178_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_178_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_178_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_178_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The family was at a barbecue. They were roasting marshmallows on the fire. The dad was grilling. His kids were playing in the background. Mom joined them with a smile.\nB: The family was hiking. They were roasting marshmallows on the fire. The dad was cooking. His kids were fighting in the background. Mom joined them with a smile.\nC: The family was at a picnic. They were roasting marshmallows on the fire. The dad was cooking. His kids were reading in the background. Mom ignored them with a frown.\nD: The family was camping. They were grilling meat on the fire. The dad was eating. His kids were playing in the background. Mom joined them with a smile.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The family was at a barbecue. They were roasting marshmallows on the fire. The dad was grilling. His kids were playing in the background. Mom joined them with a smile.\nB: The family was hiking. They were roasting marshmallows on the fire. 
The dad was cooking. His kids were fighting in the background. Mom joined them with a smile.\nC: The family was at a picnic. They were roasting marshmallows on the fire. The dad was cooking. His kids were reading in the background. Mom ignored them with a frown.\nD: The family was camping. They were grilling meat on the fire. The dad was eating. His kids were playing in the background. Mom joined them with a smile.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_179_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_179_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_179_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_179_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_179_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: a group picture on the beach with beach toys and a picnic basket\nB: a scenic view of the mountains with a river flowing through\nC: a family gathering at a park with a barbecue grill\nD: our first breakfast on the cruise ship .we enjoyed a little 7up while eating .then we took a stroll on the deck .we checked out the cars in the garage of the shipwe asked a bystander to take this photo of the three of us", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: a group picture on the beach with beach toys and a picnic basket\nB: a scenic view of the mountains with a river flowing through\nC: a family gathering at a park with a barbecue grill\nD: our first breakfast on the cruise ship .we enjoyed a little 7up while eating .then we took a stroll on the deck .we checked out the cars in the garage of the shipwe asked a bystander to take this photo of the three of us", 
"input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_180_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_180_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_180_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_180_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_180_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Two young women in green are seen looking at the screen of their mobile devices.\nB: A young woman dressed in blue is seen looking at the screen of her mobile device. She is accompanied by a man who is dressed entirely in purple, and they are currently strolling around a city.\nC: A young man and a young woman are seen glancing at the screen of their mobile devices.\nD: A young man dressed in purple is seen glancing at the screen of his mobile device. He is accompanied by another man who is dressed entirely in purple, and they are currently strolling around a city. They are located on a street in a coastal city that is directly across from a beach. As they stroll alongside one another, they are pondering the question, \"Where are we going?\n\u201d. The first person in the pair looks through a trash can to see if there's anything they can use.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Two young women in green are seen looking at the screen of their mobile devices.\nB: A young woman dressed in blue is seen looking at the screen of her mobile device. 
She is accompanied by a man who is dressed entirely in purple, and they are currently strolling around a city.\nC: A young man and a young woman are seen glancing at the screen of their mobile devices.\nD: A young man dressed in purple is seen glancing at the screen of his mobile device. He is accompanied by another man who is dressed entirely in purple, and they are currently strolling around a city. They are located on a street in a coastal city that is directly across from a beach. As they stroll alongside one another, they are pondering the question, \"Where are we going?\n\u201d. The first person in the pair looks through a trash can to see if there's anything they can use.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_181_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_181_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_181_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_181_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_181_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Two men are sitting on a bench waiting for their friends to arrive.\nB: Three men are standing by the car chatting happily.\nC: Three men are standing in front of a building waiting for someone meeting them there. A man shows up with news and two of the men step closer to hear him better. He tells the 3 men that their other friend couldn't come with him, because his mom needed him today. One man thinks that is so funny he stops and asks Are you serious man? 
with a clownish face He can't b it and asks the other men if they believed their friend had to stay home with mom.\nD: A group of people are posing for a picture in front of a building.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Two men are sitting on a bench waiting for their friends to arrive.\nB: Three men are standing by the car chatting happily.\nC: Three men are standing in front of a building waiting for someone meeting them there. A man shows up with news and two of the men step closer to hear him better. He tells the 3 men that their other friend couldn't come with him, because his mom needed him today. One man thinks that is so funny he stops and asks Are you serious man? with a clownish face He can't b it and asks the other men if they believed their friend had to stay home with mom.\nD: A group of people are posing for a picture in front of a building.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_182_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_182_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_182_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_182_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_182_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The flight attendants are not providing any services.\nB: A flight attendant addresses someone who is out of screen shot. A flight attendant jokes with another person in the galley. A view out the window shows another jet at the airport. The airplane is crowded. 
Flight attendants demonstrate safety equipment.\nC: The airplane is empty and not ready for a flight.\nD: The passengers are quiet and seated comfortably.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The flight attendants are not providing any services.\nB: A flight attendant addresses someone who is out of screen shot. A flight attendant jokes with another person in the galley. A view out the window shows another jet at the airport. The airplane is crowded. Flight attendants demonstrate safety equipment.\nC: The airplane is empty and not ready for a flight.\nD: The passengers are quiet and seated comfortably.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_183_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_183_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_183_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_183_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_183_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The man is fixing a car.\nB: The two women are arguing about money.\nC: We are in our room. We are talking about cleaning. The woman wants to read instead. The old woman talks to us. We grow old together.\nD: They are outdoors having a picnic.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The man is fixing a car.\nB: The two women are arguing about money.\nC: We are in our room. We are talking about cleaning. The woman wants to read instead. The old woman talks to us. 
We grow old together.\nD: They are outdoors having a picnic.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_184_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_184_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_184_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_184_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_184_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the sea stretched before us in an endless ocean of blue .in the evening , the sun began to set in the west over the water .rock climbing was an interesting experience .i was afraid to fall , but took a chance and waved .the water is so blue , it almost looks like ice .\nB: the city skyline was stunning as the sun set .i was mesmerized by the beautiful colors of the sky .i couldn't take my eyes off the horizon .the buildings were silhouetted against the orange and pink sky .\nC: we climbed a huge mountain and saw a breathtaking view .the sky was clear and the sun was shining brightly .i couldn't believe the stunning scenery .i felt alive and free at the top .\nD: the sky was clear and the water was warm .it was a perfect day for swimming .i was amazed by the beautiful sunset .the water looked so inviting .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the sea stretched before us in an endless ocean of blue .in the evening , the sun began to set in the west over the water .rock climbing was an interesting experience .i was afraid to fall , but took a chance and waved .the water is so blue , it almost looks like ice .\nB: the city skyline was stunning as the sun set .i was mesmerized by the beautiful colors of 
the sky .i couldn't take my eyes off the horizon .the buildings were silhouetted against the orange and pink sky .\nC: we climbed a huge mountain and saw a breathtaking view .the sky was clear and the sun was shining brightly .i couldn't believe the stunning scenery .i felt alive and free at the top .\nD: the sky was clear and the water was warm .it was a perfect day for swimming .i was amazed by the beautiful sunset .the water looked so inviting .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_185_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_185_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_185_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_185_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_185_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: we decided to go to city on vacation this year .city was absolutely beautiful , i love how the buildings sit right on the street .the architecture was amazing , we do n't have buildings like these in the states .we took a taxi ride down the road , it was very romatic .the food was very different than i 'm used to , i never was quite sure what i was eating .\nB: we decided to go to beach on vacation this year .beach was absolutely beautiful , i love how the buildings sit right on the sand .the architecture was amazing , we do n't have buildings like these in the states .we took a boat ride down the river , it was very romatic .the food was very different than i 'm used to , i never was quite sure what i was eating .\nC: we decided to go to village on vacation this year .village was absolutely beautiful , i love how the buildings sit right on the hill .the architecture was amazing , we 
do n't have buildings like these in the states .we took a bicycle ride down the path , it was very romatic .the food was very different than i 'm used to , i never was quite sure what i was eating .\nD: we decided to go to location on vacation this year .location was absolutely beautiful , i love how the buildings sit right on the water .the architecture was amazing , we do n't have buildings like these in the states .we took a gondola down the canal , it was very romatic .the food was very different than i 'm used to , i never was quite sure what i was eating .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: we decided to go to city on vacation this year .city was absolutely beautiful , i love how the buildings sit right on the street .the architecture was amazing , we do n't have buildings like these in the states .we took a taxi ride down the road , it was very romatic .the food was very different than i 'm used to , i never was quite sure what i was eating .\nB: we decided to go to beach on vacation this year .beach was absolutely beautiful , i love how the buildings sit right on the sand .the architecture was amazing , we do n't have buildings like these in the states .we took a boat ride down the river , it was very romatic .the food was very different than i 'm used to , i never was quite sure what i was eating .\nC: we decided to go to village on vacation this year .village was absolutely beautiful , i love how the buildings sit right on the hill .the architecture was amazing , we do n't have buildings like these in the states .we took a bicycle ride down the path , it was very romatic .the food was very different than i 'm used to , i never was quite sure what i was eating .\nD: we decided to go to location on vacation this year .location was absolutely beautiful , i love how the buildings sit right on the water .the architecture was amazing , we do n't have buildings like these in the states .we 
took a gondola down the canal , it was very romatic .the food was very different than i 'm used to , i never was quite sure what i was eating .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_186_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_186_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_186_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_186_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_186_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The city was quiet one morning. The cage was barely visible. A strange man appeared with a tour guide. A cat emerged to be pet by the new strange man. The guide was in awe of the stranger petting the cat.\nB: The city was loud one morning. The cage was clearly visible. A familiar man appeared with a tour guide. A dog emerged to be pet by the new familiar man. The guide was uninterested in the stranger petting the dog.\nC: The city was empty one morning. The cage was invisible. A mysterious man appeared with a tour guide. A mouse emerged to be pet by the new mysterious man. The guide was scared of the stranger petting the mouse.\nD: The city was bustling one morning. The cage was completely hidden. A typical man appeared with a tour guide. A bird emerged to be pet by the new typical man. The guide was skeptical of the stranger petting the bird.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The city was quiet one morning. The cage was barely visible. A strange man appeared with a tour guide. A cat emerged to be pet by the new strange man. The guide was in awe of the stranger petting the cat.\nB: The city was loud one morning. 
The cage was clearly visible. A familiar man appeared with a tour guide. A dog emerged to be pet by the new familiar man. The guide was uninterested in the stranger petting the dog.\nC: The city was empty one morning. The cage was invisible. A mysterious man appeared with a tour guide. A mouse emerged to be pet by the new mysterious man. The guide was scared of the stranger petting the mouse.\nD: The city was bustling one morning. The cage was completely hidden. A typical man appeared with a tour guide. A bird emerged to be pet by the new typical man. The guide was skeptical of the stranger petting the bird.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_187_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_187_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_187_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_187_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_187_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: we worked the game that day .the crowd was so excited for the team .the show at halftime was entertaining .the marcee displayed many important players for the event .the evening ended with fireworks .\nB: they sang the anthem before the game .the crowd was disappointed with the team .the halftime show was boring .the marcee was not well-prepared for the event .the evening ended quietly .\nC: the players competed in the game .the crowd was silent during the game .there was no halftime show .the marcee was not present at the event .the evening ended with no special event .\nD: the players practiced before the game .the crowd was indifferent to the team .there was no halftime show .the marcee was not present at the event .the 
evening ended with no special event .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: we worked the game that day .the crowd was so excited for the team .the show at halftime was entertaining .the marcee displayed many important players for the event .the evening ended with fireworks .\nB: they sang the anthem before the game .the crowd was disappointed with the team .the halftime show was boring .the marcee was not well-prepared for the event .the evening ended quietly .\nC: the players competed in the game .the crowd was silent during the game .there was no halftime show .the marcee was not present at the event .the evening ended with no special event .\nD: the players practiced before the game .the crowd was indifferent to the team .there was no halftime show .the marcee was not present at the event .the evening ended with no special event .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_188_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_188_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_188_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_188_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_188_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: it was a boring night with nothing much happening .people didn't enjoy themselves .broken relationships remained broken .no fun stories were told .and a boring speech was heard by all .\nB: it was a chaotic night with no sense of accomplishment .people had a terrible time together .broken relationships remained broken .many sad stories were told .and a terrible speech was heard by all .\nC: it was a quiet night with no excitement 
.people had a boring time together .lost relationships stayed lost .no stories were told .and a quiet speech was heard by all .\nD: it was a night to celebrate great accomplishments .people had a great time together .lost relationships were rekindled .many fun stories were told .and a great speech was heard by all .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: it was a boring night with nothing much happening .people didn't enjoy themselves .broken relationships remained broken .no fun stories were told .and a boring speech was heard by all .\nB: it was a chaotic night with no sense of accomplishment .people had a terrible time together .broken relationships remained broken .many sad stories were told .and a terrible speech was heard by all .\nC: it was a quiet night with no excitement .people had a boring time together .lost relationships stayed lost .no stories were told .and a quiet speech was heard by all .\nD: it was a night to celebrate great accomplishments .people had a great time together .lost relationships were rekindled .many fun stories were told .and a great speech was heard by all .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_189_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_189_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_189_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_189_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_189_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the karaoke bar was empty when [female] stepped in\nB: the karaoke bar was a mess when [female] stepped in .there was a fat man trying to sing a [female] [male] song 
.[female] decided to go up on the table and fight for the microphone .it did n't work because some other fat man won it over .[female] gave up after the man in blue decided that he was eminem .\nC: the karaoke bar was quiet when [female] stepped in\nD: the karaoke bar was crowded when [female] stepped in", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the karaoke bar was empty when [female] stepped in\nB: the karaoke bar was a mess when [female] stepped in .there was a fat man trying to sing a [female] [male] song .[female] decided to go up on the table and fight for the microphone .it did n't work because some other fat man won it over .[female] gave up after the man in blue decided that he was eminem .\nC: the karaoke bar was quiet when [female] stepped in\nD: the karaoke bar was crowded when [female] stepped in", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_190_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_190_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_190_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_190_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_190_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: a group of people playing beach volleyball by the sea.\nB: a person swimming in a tropical ocean with colorful fish.\nC: there are two people racing in the snow. they are enjoying the snow skating. then, they go on a mountain top to take a snowy risk at the highest peaks. The mountain peak is high and beautiful. 
This picture expresses the suffering of this person between snow and mountains\nD: a couple hiking in the desert under the hot sun.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: a group of people playing beach volleyball by the sea.\nB: a person swimming in a tropical ocean with colorful fish.\nC: there are two people racing in the snow. they are enjoying the snow skating. then, they go on a mountain top to take a snowy risk at the highest peaks. The mountain peak is high and beautiful. This picture expresses the suffering of this person between snow and mountains\nD: a couple hiking in the desert under the hot sun.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_191_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_191_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_191_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_191_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_191_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: Two boys are playing in the park.\nB: A group of students are sitting in a classroom.\nC: A girl is walking in school uniform. In a lush green walk way surrounded with green mountain also, two boys are on the way to school in uniform with a girl holding school bag and walking in school uniform.\nD: A girl is riding a bicycle in a city street.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: Two boys are playing in the park.\nB: A group of students are sitting in a classroom.\nC: A girl is walking in school uniform. 
In a lush green walk way surrounded with green mountain also, two boys are on the way to school in uniform with a girl holding school bag and walking in school uniform.\nD: A girl is riding a bicycle in a city street.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_192_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_192_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_192_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_192_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_192_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: After reaching the snow coated area, we immediately turned back\nB: During the climb, we were hit by an unexpected snowstorm\nC: At the snow covered area, we decided to turn back\nD: As i approach the snow coated area I climbed the snow covered rocks But we had to stop to put on our gloves and ensure safety measures We then reach our destination, the hot water springs After spending time there , we head home", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: After reaching the snow coated area, we immediately turned back\nB: During the climb, we were hit by an unexpected snowstorm\nC: At the snow covered area, we decided to turn back\nD: As i approach the snow coated area I climbed the snow covered rocks But we had to stop to put on our gloves and ensure safety measures We then reach our destination, the hot water springs After spending time there , we head home", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_193_0.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_193_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_193_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_193_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_193_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: The group arrived at the coffee table. The group was discussing the various drinks available. Some drinks were served into the cups. The group was chatting and enjoying their drinks.\nB: The female arrived at the dining table. The female was talking with the two males about how good the soup was. The female had served the males some soup into each one of their bowls. The males were happily enjoying their soup. The female sat down to join the males and eat her own bowl of soup.\nC: The male arrived at the dining table. The male was talking with the two females about how good the soup was. The male had served the females some soup into each one of their bowls. The females were happily enjoying their soup. The male sat down to join the females and eat his own bowl of soup.\nD: The male arrived at the dining table. The male was talking with the two females about how good the salad was. The male had served the females some salad into each one of their bowls. The females were happily enjoying their salad. The male sat down to join the females and eat his own bowl of salad.", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: The group arrived at the coffee table. The group was discussing the various drinks available. Some drinks were served into the cups. The group was chatting and enjoying their drinks.\nB: The female arrived at the dining table. 
The female was talking with the two males about how good the soup was. The female had served the males some soup into each one of their bowls. The males were happily enjoying their soup. The female sat down to join the males and eat her own bowl of soup.\nC: The male arrived at the dining table. The male was talking with the two females about how good the soup was. The male had served the females some soup into each one of their bowls. The females were happily enjoying their soup. The male sat down to join the females and eat his own bowl of soup.\nD: The male arrived at the dining table. The male was talking with the two females about how good the salad was. The male had served the females some salad into each one of their bowls. The females were happily enjoying their salad. The male sat down to join the females and eat his own bowl of salad.", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_194_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_194_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_194_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_194_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_194_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "SSID", "options": "A: this people seem to have fun on top of these horses they are going trough these mountains they are going to have to pass through out some little weird doors it seems to be like a very long ride and at the end they just come back to these little houses\nB: these people are skiing down the mountain\nC: these people are riding bicycles in the city\nD: these people are hiking in the forest", "question": "Describe this set of images briefly.", "context": "Select from the following 
choices.\nA: this people seem to have fun on top of these horses they are going trough these mountains they are going to have to pass through out some little weird doors it seems to be like a very long ride and at the end they just come back to these little houses\nB: these people are skiing down the mountain\nC: these people are riding bicycles in the city\nD: these people are hiking in the forest", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_195_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_195_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_195_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_195_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_195_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the sun is rising behind the hills on a beautiful day .\nB: the boat is seen in the distance in front of the hills .the sun is attempting to peak out from the clouds .the sun is hidden by the clouds .the waterway and hillside are blended as one .the sun is setting behind the hills on a beautiful night .\nC: the boat is seen in the distance in front of the mountains .the sun is attempting to peak out from the clouds .the sun is hidden by the clouds .the waterway and mountain are blended as one .the sun is setting behind the mountains on a beautiful night .\nD: the boat is seen up close in front of the hills .the sun is hidden by the clouds .the waterway and hillside are blended as one .the sun is setting behind the hills on a beautiful night .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the sun is rising behind the hills on a beautiful day .\nB: the boat is seen in the 
distance in front of the hills .the sun is attempting to peak out from the clouds .the sun is hidden by the clouds .the waterway and hillside are blended as one .the sun is setting behind the hills on a beautiful night .\nC: the boat is seen in the distance in front of the mountains .the sun is attempting to peak out from the clouds .the sun is hidden by the clouds .the waterway and mountain are blended as one .the sun is setting behind the mountains on a beautiful night .\nD: the boat is seen up close in front of the hills .the sun is hidden by the clouds .the waterway and hillside are blended as one .the sun is setting behind the hills on a beautiful night .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_196_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_196_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_196_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_196_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_196_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: a mountain trip out to the island .lunch at the mountain top . had a terrible time , do go .the mountain restaurant had awful food .the water was polluted , and the seagulls were aggressivenext time we will go on a nature hike .\nB: a day trip out to the island .lunch at the location location location . got indigestion , do n't go .the harbor restaurant had better food .but the water smelled , and the seagulls were pestsnext time we will go on a historical battlefield tour .\nC: a night trip out to the island .dinner at the location location location . 
had a great time , do go .the harbor restaurant had terrible food .the water smelled , and the seagulls were a nuisancenext time we will go on a shopping spree .\nD: a beach trip out to the island .lunch at the beach bar . had a great time , do n't go .the beach restaurant had delicious food .but the water was clear , and the seagulls were friendlynext time we will go on a boat cruise .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: a mountain trip out to the island .lunch at the mountain top . had a terrible time , do go .the mountain restaurant had awful food .the water was polluted , and the seagulls were aggressivenext time we will go on a nature hike .\nB: a day trip out to the island .lunch at the location location location . got indigestion , do n't go .the harbor restaurant had better food .but the water smelled , and the seagulls were pestsnext time we will go on a historical battlefield tour .\nC: a night trip out to the island .dinner at the location location location . had a great time , do go .the harbor restaurant had terrible food .the water smelled , and the seagulls were a nuisancenext time we will go on a shopping spree .\nD: a beach trip out to the island .lunch at the beach bar . 
had a great time , do n't go .the beach restaurant had delicious food .but the water was clear , and the seagulls were friendlynext time we will go on a boat cruise .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_197_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_197_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_197_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_197_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_197_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: a child refusing to eat his meal\nB: a group of strangers in the park playing games\nC: our loving family enjoyed our pool play day !father and son enjoy the water activities !here our little guy is getting ready to eat .he is digging into his delicious meal here !he is having a great time eating the cake !\nD: a solo man enjoying some water activities", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: a child refusing to eat his meal\nB: a group of strangers in the park playing games\nC: our loving family enjoyed our pool play day !father and son enjoy the water activities !here our little guy is getting ready to eat .he is digging into his delicious meal here !he is having a great time eating the cake !\nD: a solo man enjoying some water activities", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_198_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_198_1.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_198_2.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_198_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_198_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "multiple_image_captioning", "visual_input_component": "Video image or Natural image", "source": "VIST", "options": "A: the weather was terrible !we had to stay indoors the whole time .the food was awful and we were all sick by the end .\nB: our family trip around location was so awesome !we saw many wild animals , including big crocodiles .the historical sites were interesting .there were so many things to see , it was hard choosing !finally we went back to the airport to go home .\nC: the scenery was beautiful !we saw amazing landscapes and breathtaking views .the people were so friendly and welcoming .we had a truly wonderful time !\nD: it was a boring trip .there was nothing exciting to see or do at all .we regretted going there .", "question": "Describe this set of images briefly.", "context": "Select from the following choices.\nA: the weather was terrible !we had to stay indoors the whole time .the food was awful and we were all sick by the end .\nB: our family trip around location was so awesome !we saw many wild animals , including big crocodiles .the historical sites were interesting .there were so many things to see , it was hard choosing !finally we went back to the airport to go home .\nC: the scenery was beautiful !we saw amazing landscapes and breathtaking views .the people were so friendly and welcoming .we had a truly wonderful time !\nD: it was a boring trip .there was nothing exciting to see or do at all .we regretted going there .", "input_image_path": ["./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_199_0.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_199_1.jpg", 
"./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_199_2.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_199_3.jpg", "./High-level-sub-semantic/multiple_image_captioning/multiple_image_captioning_199_4.jpg"], "output": "B", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/next_img_prediction/qwen3-vl/metadata_info.json b/results/next_img_prediction/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..6660338 --- /dev/null +++ b/results/next_img_prediction/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_0_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_0_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_0_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_0_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_0_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_1_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_1_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_1_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_1_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_1_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_2_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_2_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_2_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_2_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_2_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_3_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_3_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_3_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_3_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_3_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_4_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_4_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_4_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_4_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_4_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_5_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_5_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_5_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_5_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_5_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_6_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_6_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_6_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_6_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_6_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_7_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_7_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_7_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_7_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_7_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_8_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_8_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_8_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_8_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_8_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_9_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_9_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_9_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_9_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_9_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_10_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_10_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_10_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_10_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_10_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_11_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_11_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_11_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_11_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_11_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_12_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_12_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_12_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_12_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_12_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_13_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_13_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_13_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_13_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_13_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_14_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_14_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_14_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_14_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_14_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_15_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_15_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_15_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_15_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_15_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_16_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_16_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_16_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_16_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_16_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_17_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_17_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_17_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_17_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_17_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_18_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_18_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_18_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_18_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_18_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_19_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_19_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_19_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_19_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_19_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_20_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_20_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_20_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_20_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_20_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_21_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_21_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_21_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_21_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_21_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_22_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_22_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_22_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_22_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_22_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_23_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_23_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_23_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_23_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_23_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_24_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_24_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_24_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_24_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_24_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_25_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_25_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_25_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_25_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_25_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_26_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_26_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_26_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_26_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_26_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_27_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_27_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_27_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_27_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_27_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_28_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_28_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_28_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_28_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_28_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_29_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_29_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_29_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_29_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_29_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_30_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_30_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_30_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_30_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_30_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_31_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_31_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_31_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_31_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_31_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_32_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_32_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_32_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_32_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_32_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_33_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_33_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_33_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_33_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_33_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_34_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_34_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_34_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_34_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_34_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_35_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_35_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_35_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_35_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_35_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_36_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_36_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_36_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_36_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_36_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_37_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_37_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_37_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_37_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_37_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_38_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_38_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_38_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_38_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_38_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_39_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_39_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_39_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_39_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_39_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_40_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_40_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_40_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_40_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_40_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_41_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_41_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_41_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_41_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_41_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_42_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_42_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_42_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_42_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_42_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_43_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_43_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_43_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_43_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_43_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_44_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_44_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_44_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_44_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_44_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_45_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_45_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_45_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_45_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_45_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_46_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_46_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_46_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_46_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_46_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_47_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_47_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_47_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_47_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_47_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_48_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_48_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_48_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_48_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_48_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_49_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_49_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_49_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_49_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_49_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_50_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_50_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_50_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_50_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_50_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_51_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_51_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_51_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_51_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_51_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_52_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_52_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_52_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_52_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_52_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_53_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_53_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_53_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_53_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_53_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_54_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_54_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_54_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_54_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_54_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_55_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_55_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_55_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_55_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_55_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_56_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_56_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_56_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_56_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_56_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_57_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_57_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_57_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_57_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_57_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_58_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_58_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_58_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_58_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_58_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_59_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_59_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_59_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_59_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_59_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_60_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_60_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_60_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_60_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_60_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_61_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_61_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_61_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_61_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_61_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_62_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_62_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_62_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_62_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_62_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_63_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_63_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_63_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_63_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_63_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_64_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_64_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_64_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_64_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_64_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_65_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_65_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_65_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_65_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_65_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_66_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_66_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_66_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_66_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_66_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_67_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_67_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_67_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_67_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_67_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_68_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_68_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_68_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_68_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_68_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_69_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_69_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_69_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_69_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_69_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_70_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_70_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_70_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_70_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_70_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_71_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_71_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_71_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_71_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_71_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_72_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_72_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_72_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_72_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_72_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_73_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_73_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_73_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_73_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_73_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_74_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_74_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_74_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_74_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_74_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_75_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_75_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_75_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_75_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_75_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_76_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_76_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_76_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_76_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_76_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_77_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_77_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_77_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_77_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_77_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_78_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_78_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_78_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_78_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_78_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_79_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_79_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_79_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_79_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_79_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_80_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_80_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_80_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_80_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_80_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_81_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_81_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_81_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_81_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_81_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_82_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_82_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_82_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_82_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_82_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_83_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_83_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_83_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_83_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_83_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_84_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_84_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_84_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_84_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_84_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_85_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_85_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_85_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_85_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_85_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_86_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_86_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_86_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_86_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_86_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_87_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_87_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_87_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_87_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_87_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_88_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_88_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_88_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_88_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_88_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_89_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_89_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_89_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_89_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_89_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_90_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_90_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_90_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_90_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_90_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_91_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_91_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_91_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_91_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_91_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_92_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_92_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_92_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_92_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_92_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_93_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_93_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_93_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_93_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_93_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_94_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_94_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_94_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_94_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_94_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_95_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_95_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_95_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_95_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_95_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_96_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_96_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_96_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_96_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_96_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_97_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_97_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_97_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_97_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_97_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_98_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_98_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_98_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_98_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_98_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_99_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_99_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_99_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_99_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_99_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_100_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_100_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_100_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_100_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_100_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_101_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_101_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_101_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_101_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_101_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_102_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_102_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_102_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_102_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_102_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_103_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_103_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_103_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_103_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_103_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_104_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_104_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_104_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_104_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_104_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_105_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_105_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_105_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_105_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_105_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_106_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_106_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_106_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_106_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_106_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_107_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_107_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_107_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_107_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_107_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_108_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_108_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_108_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_108_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_108_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_109_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_109_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_109_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_109_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_109_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_110_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_110_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_110_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_110_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_110_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_111_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_111_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_111_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_111_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_111_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_112_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_112_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_112_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_112_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_112_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_113_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_113_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_113_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_113_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_113_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_114_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_114_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_114_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_114_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_114_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_115_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_115_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_115_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_115_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_115_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_116_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_116_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_116_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_116_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_116_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_117_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_117_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_117_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_117_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_117_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_118_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_118_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_118_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_118_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_118_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_119_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_119_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_119_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_119_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_119_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_120_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_120_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_120_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_120_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_120_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_121_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_121_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_121_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_121_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_121_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_122_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_122_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_122_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_122_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_122_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_123_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_123_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_123_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_123_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_123_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_124_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_124_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_124_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_124_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_124_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_125_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_125_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_125_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_125_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_125_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_126_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_126_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_126_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_126_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_126_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_127_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_127_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_127_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_127_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_127_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_128_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_128_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_128_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_128_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_128_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_129_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_129_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_129_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_129_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_129_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_130_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_130_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_130_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_130_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_130_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_131_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_131_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_131_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_131_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_131_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_132_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_132_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_132_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_132_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_132_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_133_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_133_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_133_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_133_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_133_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_134_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_134_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_134_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_134_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_134_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_135_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_135_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_135_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_135_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_135_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_136_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_136_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_136_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_136_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_136_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_137_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_137_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_137_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_137_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_137_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_138_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_138_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_138_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_138_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_138_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_139_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_139_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_139_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_139_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_139_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_140_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_140_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_140_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_140_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_140_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_141_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_141_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_141_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_141_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_141_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_142_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_142_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_142_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_142_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_142_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_143_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_143_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_143_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_143_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_143_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_144_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_144_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_144_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_144_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_144_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_145_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_145_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_145_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_145_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_145_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_146_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_146_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_146_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_146_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_146_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_147_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_147_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_147_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_147_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_147_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_148_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_148_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_148_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_148_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_148_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_149_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_149_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_149_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_149_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_149_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_150_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_150_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_150_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_150_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_150_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_151_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_151_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_151_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_151_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_151_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_152_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_152_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_152_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_152_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_152_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_153_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_153_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_153_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_153_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_153_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_154_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_154_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_154_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_154_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_154_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_155_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_155_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_155_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_155_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_155_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_156_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_156_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_156_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_156_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_156_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_157_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_157_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_157_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_157_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_157_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_158_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_158_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_158_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_158_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_158_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_159_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_159_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_159_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_159_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_159_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_160_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_160_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_160_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_160_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_160_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_161_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_161_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_161_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_161_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_161_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_162_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_162_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_162_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_162_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_162_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_163_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_163_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_163_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_163_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_163_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_164_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_164_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_164_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_164_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_164_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_165_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_165_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_165_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_165_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_165_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_166_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_166_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_166_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_166_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_166_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_167_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_167_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_167_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_167_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_167_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_168_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_168_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_168_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_168_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_168_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_169_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_169_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_169_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_169_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_169_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_170_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_170_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_170_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_170_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_170_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_171_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_171_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_171_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_171_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_171_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_172_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_172_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_172_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_172_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_172_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_173_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_173_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_173_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_173_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_173_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_174_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_174_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_174_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_174_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_174_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_175_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_175_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_175_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_175_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_175_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_176_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_176_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_176_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_176_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_176_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_177_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_177_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_177_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_177_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_177_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_178_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_178_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_178_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_178_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_178_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_179_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_179_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_179_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_179_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_179_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_180_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_180_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_180_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_180_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_180_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_181_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_181_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_181_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_181_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_181_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_182_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_182_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_182_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_182_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_182_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_183_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_183_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_183_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_183_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_183_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_184_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_184_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_184_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_184_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_184_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_185_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_185_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_185_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_185_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_185_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_186_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_186_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_186_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_186_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_186_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_187_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_187_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_187_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_187_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_187_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_188_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_188_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_188_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_188_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_188_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_189_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_189_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_189_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_189_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_189_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_190_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_190_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_190_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_190_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_190_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_191_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_191_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_191_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_191_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_191_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_192_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_192_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_192_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_192_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_192_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_193_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_193_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_193_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_193_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_193_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_194_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_194_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_194_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_194_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_194_4.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_195_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_195_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_195_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_195_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_195_4.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_196_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_196_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_196_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_196_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_196_4.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_197_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_197_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_197_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_197_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_197_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_198_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_198_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_198_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_198_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_198_4.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "next_img_prediction", "visual_input_component": "Video image or Natural image", "source": "MovingMNIST", "options": "A: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "question": "Please predict the last 10 frames of the video based on the first 10 frames of the input video. 
Note that the order is from left to right.The input first 10 frames is the first image", "context": "Select from the following choices.\nA: The second image \nB: The third image \nC: The forth image \nD: The fifth image", "input_image_path": ["./Continuous-temporal/next_img_prediction/next_img_prediction_199_0.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_199_1.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_199_2.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_199_3.png", "./Continuous-temporal/next_img_prediction/next_img_prediction_199_4.png"], "output": "D", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/person_reid/qwen3-vl/metadata_info.json b/results/person_reid/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..4a31882 --- /dev/null +++ b/results/person_reid/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_0_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_0_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_0_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_0_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_0_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in 
the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_1_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_1_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_1_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_1_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_1_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_2_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_2_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_2_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_2_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_2_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_3_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_3_1.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_3_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_3_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_3_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_4_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_4_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_4_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_4_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_4_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_5_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_5_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_5_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_5_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_5_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third 
image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_6_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_6_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_6_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_6_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_6_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_7_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_7_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_7_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_7_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_7_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/person_reid/person_reid_8_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_8_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_8_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_8_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_8_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_9_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_9_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_9_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_9_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_9_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_10_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_10_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_10_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_10_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_10_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": 
"person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_11_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_11_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_11_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_11_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_11_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_12_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_12_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_12_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_12_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_12_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_13_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_13_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_13_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_13_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_13_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_14_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_14_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_14_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_14_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_14_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_15_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_15_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_15_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_15_3.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_15_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_16_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_16_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_16_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_16_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_16_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_17_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_17_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_17_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_17_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_17_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in 
the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_18_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_18_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_18_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_18_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_18_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_19_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_19_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_19_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_19_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_19_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_20_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_20_1.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_20_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_20_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_20_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_21_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_21_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_21_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_21_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_21_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_22_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_22_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_22_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_22_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_22_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_23_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_23_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_23_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_23_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_23_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_24_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_24_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_24_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_24_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_24_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/person_reid/person_reid_25_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_25_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_25_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_25_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_25_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_26_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_26_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_26_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_26_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_26_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_27_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_27_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_27_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_27_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_27_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": 
"person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_28_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_28_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_28_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_28_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_28_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_29_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_29_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_29_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_29_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_29_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_30_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_30_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_30_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_30_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_30_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_31_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_31_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_31_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_31_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_31_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_32_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_32_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_32_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_32_3.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_32_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_33_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_33_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_33_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_33_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_33_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_34_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_34_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_34_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_34_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_34_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in 
the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_35_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_35_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_35_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_35_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_35_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_36_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_36_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_36_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_36_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_36_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_37_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_37_1.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_37_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_37_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_37_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_38_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_38_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_38_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_38_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_38_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_39_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_39_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_39_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_39_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_39_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_40_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_40_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_40_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_40_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_40_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_41_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_41_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_41_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_41_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_41_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/person_reid/person_reid_42_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_42_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_42_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_42_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_42_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_43_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_43_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_43_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_43_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_43_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_44_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_44_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_44_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_44_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_44_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": 
"person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_45_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_45_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_45_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_45_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_45_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_46_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_46_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_46_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_46_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_46_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_47_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_47_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_47_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_47_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_47_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_48_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_48_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_48_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_48_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_48_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_49_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_49_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_49_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_49_3.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_49_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_50_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_50_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_50_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_50_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_50_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_51_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_51_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_51_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_51_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_51_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in 
the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_52_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_52_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_52_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_52_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_52_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_53_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_53_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_53_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_53_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_53_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_54_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_54_1.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_54_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_54_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_54_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_55_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_55_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_55_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_55_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_55_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_56_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_56_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_56_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_56_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_56_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_57_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_57_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_57_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_57_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_57_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_58_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_58_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_58_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_58_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_58_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/person_reid/person_reid_59_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_59_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_59_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_59_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_59_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_60_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_60_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_60_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_60_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_60_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_61_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_61_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_61_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_61_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_61_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": 
"person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_62_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_62_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_62_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_62_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_62_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_63_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_63_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_63_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_63_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_63_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_64_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_64_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_64_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_64_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_64_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_65_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_65_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_65_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_65_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_65_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_66_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_66_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_66_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_66_3.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_66_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_67_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_67_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_67_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_67_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_67_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_68_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_68_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_68_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_68_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_68_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in 
the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_69_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_69_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_69_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_69_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_69_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_70_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_70_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_70_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_70_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_70_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_71_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_71_1.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_71_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_71_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_71_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_72_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_72_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_72_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_72_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_72_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_73_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_73_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_73_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_73_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_73_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_74_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_74_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_74_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_74_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_74_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_75_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_75_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_75_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_75_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_75_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/person_reid/person_reid_76_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_76_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_76_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_76_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_76_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_77_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_77_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_77_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_77_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_77_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_78_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_78_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_78_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_78_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_78_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": 
"person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_79_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_79_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_79_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_79_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_79_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_80_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_80_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_80_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_80_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_80_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_81_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_81_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_81_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_81_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_81_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_82_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_82_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_82_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_82_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_82_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_83_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_83_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_83_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_83_3.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_83_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_84_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_84_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_84_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_84_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_84_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_85_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_85_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_85_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_85_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_85_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in 
the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_86_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_86_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_86_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_86_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_86_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_87_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_87_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_87_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_87_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_87_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_88_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_88_1.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_88_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_88_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_88_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_89_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_89_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_89_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_89_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_89_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_90_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_90_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_90_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_90_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_90_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The 
third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_91_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_91_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_91_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_91_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_91_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_92_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_92_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_92_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_92_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_92_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/person_reid/person_reid_93_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_93_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_93_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_93_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_93_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_94_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_94_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_94_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_94_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_94_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_95_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_95_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_95_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_95_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_95_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": 
"person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_96_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_96_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_96_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_96_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_96_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_97_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_97_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_97_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_97_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_97_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_98_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_98_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_98_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_98_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_98_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_99_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_99_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_99_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_99_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_99_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_100_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_100_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_100_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_100_3.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_100_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_101_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_101_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_101_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_101_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_101_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_102_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_102_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_102_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_102_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_102_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to 
the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_103_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_103_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_103_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_103_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_103_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_104_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_104_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_104_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_104_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_104_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_105_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_105_1.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_105_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_105_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_105_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_106_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_106_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_106_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_106_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_106_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_107_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_107_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_107_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_107_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_107_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_108_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_108_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_108_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_108_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_108_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_109_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_109_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_109_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_109_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_109_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/person_reid/person_reid_110_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_110_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_110_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_110_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_110_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_111_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_111_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_111_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_111_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_111_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_112_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_112_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_112_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_112_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_112_4.jpg"], "output": "C", "qwen3-vl": "image 
none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_113_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_113_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_113_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_113_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_113_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_114_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_114_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_114_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_114_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_114_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the 
following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_115_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_115_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_115_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_115_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_115_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_116_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_116_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_116_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_116_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_116_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_117_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_117_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_117_2.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_117_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_117_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_118_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_118_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_118_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_118_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_118_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_119_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_119_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_119_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_119_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_119_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth 
image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_120_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_120_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_120_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_120_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_120_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_121_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_121_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_121_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_121_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_121_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_122_0.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_122_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_122_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_122_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_122_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_123_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_123_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_123_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_123_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_123_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_124_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_124_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_124_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_124_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_124_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": 
"['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_125_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_125_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_125_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_125_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_125_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_126_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_126_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_126_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_126_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_126_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: 
The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_127_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_127_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_127_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_127_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_127_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_128_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_128_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_128_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_128_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_128_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_129_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_129_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_129_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_129_3.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_129_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_130_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_130_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_130_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_130_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_130_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_131_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_131_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_131_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_131_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_131_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to 
the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_132_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_132_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_132_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_132_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_132_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_133_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_133_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_133_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_133_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_133_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_134_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_134_1.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_134_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_134_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_134_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_135_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_135_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_135_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_135_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_135_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_136_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_136_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_136_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_136_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_136_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_137_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_137_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_137_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_137_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_137_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_138_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_138_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_138_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_138_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_138_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/person_reid/person_reid_139_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_139_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_139_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_139_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_139_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_140_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_140_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_140_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_140_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_140_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_141_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_141_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_141_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_141_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_141_4.jpg"], "output": "B", "qwen3-vl": "image 
none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_142_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_142_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_142_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_142_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_142_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_143_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_143_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_143_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_143_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_143_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the 
following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_144_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_144_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_144_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_144_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_144_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_145_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_145_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_145_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_145_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_145_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_146_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_146_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_146_2.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_146_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_146_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_147_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_147_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_147_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_147_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_147_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_148_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_148_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_148_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_148_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_148_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth 
image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_149_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_149_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_149_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_149_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_149_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_150_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_150_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_150_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_150_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_150_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_151_0.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_151_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_151_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_151_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_151_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_152_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_152_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_152_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_152_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_152_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_153_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_153_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_153_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_153_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_153_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": 
"['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_154_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_154_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_154_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_154_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_154_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_155_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_155_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_155_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_155_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_155_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: 
The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_156_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_156_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_156_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_156_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_156_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_157_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_157_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_157_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_157_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_157_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_158_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_158_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_158_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_158_3.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_158_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_159_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_159_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_159_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_159_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_159_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_160_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_160_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_160_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_160_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_160_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to 
the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_161_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_161_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_161_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_161_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_161_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_162_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_162_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_162_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_162_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_162_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_163_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_163_1.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_163_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_163_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_163_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_164_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_164_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_164_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_164_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_164_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_165_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_165_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_165_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_165_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_165_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_166_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_166_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_166_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_166_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_166_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_167_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_167_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_167_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_167_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_167_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/person_reid/person_reid_168_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_168_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_168_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_168_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_168_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_169_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_169_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_169_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_169_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_169_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_170_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_170_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_170_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_170_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_170_4.jpg"], "output": "C", "qwen3-vl": "image 
none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_171_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_171_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_171_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_171_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_171_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_172_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_172_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_172_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_172_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_172_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the 
following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_173_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_173_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_173_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_173_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_173_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_174_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_174_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_174_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_174_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_174_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_175_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_175_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_175_2.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_175_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_175_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_176_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_176_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_176_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_176_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_176_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_177_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_177_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_177_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_177_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_177_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth 
image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_178_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_178_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_178_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_178_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_178_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_179_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_179_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_179_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_179_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_179_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_180_0.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_180_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_180_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_180_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_180_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_181_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_181_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_181_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_181_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_181_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_182_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_182_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_182_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_182_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_182_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": 
"['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_183_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_183_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_183_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_183_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_183_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_184_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_184_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_184_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_184_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_184_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: 
The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_185_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_185_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_185_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_185_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_185_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_186_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_186_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_186_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_186_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_186_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_187_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_187_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_187_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_187_3.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_187_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_188_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_188_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_188_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_188_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_188_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_189_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_189_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_189_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_189_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_189_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to 
the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_190_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_190_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_190_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_190_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_190_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_191_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_191_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_191_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_191_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_191_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_192_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_192_1.jpg", 
"./High-level-obj-semantic/person_reid/person_reid_192_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_192_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_192_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_193_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_193_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_193_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_193_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_193_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_194_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_194_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_194_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_194_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_194_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second 
image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_195_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_195_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_195_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_195_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_195_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_196_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_196_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_196_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_196_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_196_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/person_reid/person_reid_197_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_197_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_197_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_197_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_197_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_198_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_198_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_198_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_198_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_198_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "person_reid", "visual_input_component": "['natural_image']", "source": "market_1501", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar person to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/person_reid/person_reid_199_0.jpg", "./High-level-obj-semantic/person_reid/person_reid_199_1.jpg", "./High-level-obj-semantic/person_reid/person_reid_199_2.jpg", "./High-level-obj-semantic/person_reid/person_reid_199_3.jpg", "./High-level-obj-semantic/person_reid/person_reid_199_4.jpg"], "output": "D", "qwen3-vl": "image 
none"}] \ No newline at end of file diff --git a/results/point_tracking/qwen3-vl/metadata_info.json b/results/point_tracking/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..c3746a4 --- /dev/null +++ b/results/point_tracking/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.627, 0.2]\nB: [0.166, 0.657]\nC: [0.95, 0.907]\nD: [0.328, 0.477]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.627, 0.2]\nB: [0.166, 0.657]\nC: [0.95, 0.907]\nD: [0.328, 0.477]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_0_0.jpg", "./2D-spatial/point_tracking/point_tracking_0_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.368, 0.265]\nB: [0.925, 0.128]\nC: [0.133, 0.261]\nD: [0.488, 0.101]", "question": "What is the position coordinates of the point with coordinates ([0.366, 0.265]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.368, 0.265]\nB: [0.925, 0.128]\nC: [0.133, 0.261]\nD: [0.488, 0.101]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_1_0.jpg", "./2D-spatial/point_tracking/point_tracking_1_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.398, 0.165]\nB: [0.606, 0.999]\nC: [0.955, 0.756]\nD: [0.976, 0.964]", "question": "What is the position coordinates of the point with coordinates ([0.488, -0.073]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.398, 0.165]\nB: [0.606, 0.999]\nC: [0.955, 0.756]\nD: [0.976, 0.964]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_2_0.jpg", "./2D-spatial/point_tracking/point_tracking_2_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.411, 0.483]\nB: [0.624, 0.13]\nC: [0.256, 0.845]\nD: [0.393, 0.328]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.411, 0.483]\nB: [0.624, 0.13]\nC: [0.256, 0.845]\nD: [0.393, 0.328]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_3_0.jpg", "./2D-spatial/point_tracking/point_tracking_3_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.851, 0.69]\nB: [0.112, 0.164]\nC: [0.561, 0.3]\nD: [0.69, 0.205]", "question": "What is the position coordinates of the point with coordinates ([0.572, 0.294]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.851, 0.69]\nB: [0.112, 0.164]\nC: [0.561, 0.3]\nD: [0.69, 0.205]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_4_0.jpg", "./2D-spatial/point_tracking/point_tracking_4_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.71, 0.765]\nB: [0.039, 0.565]\nC: [0.599, 0.897]\nD: [0.077, 0.037]", "question": "What is the position coordinates of the point with coordinates ([0.127, 0.205]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.71, 0.765]\nB: [0.039, 0.565]\nC: [0.599, 0.897]\nD: [0.077, 0.037]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_5_0.jpg", "./2D-spatial/point_tracking/point_tracking_5_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.0, 0.0]\nB: [0.75, 0.266]\nC: [0.658, 0.765]\nD: [0.825, 0.377]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.75, 0.266]\nC: [0.658, 0.765]\nD: [0.825, 0.377]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_6_0.jpg", "./2D-spatial/point_tracking/point_tracking_6_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.458, 0.112]\nB: [0.522, 0.216]\nC: [0.672, 0.493]\nD: [0.435, 0.891]", "question": "What is the position coordinates of the point with coordinates ([0.392, 0.15]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.458, 0.112]\nB: [0.522, 0.216]\nC: [0.672, 0.493]\nD: [0.435, 0.891]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_7_0.jpg", "./2D-spatial/point_tracking/point_tracking_7_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.0, 0.0]\nB: [0.055, 0.212]\nC: [0.926, 0.897]\nD: [0.088, 0.69]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.055, 0.212]\nC: [0.926, 0.897]\nD: [0.088, 0.69]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_8_0.jpg", "./2D-spatial/point_tracking/point_tracking_8_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.572, 0.347]\nB: [0.822, 0.524]\nC: [0.668, 0.975]\nD: [0.228, 0.421]", "question": "What is the position coordinates of the point with coordinates ([0.84, 0.359]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.572, 0.347]\nB: [0.822, 0.524]\nC: [0.668, 0.975]\nD: [0.228, 0.421]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_9_0.jpg", "./2D-spatial/point_tracking/point_tracking_9_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.55, 0.157]\nB: [0.225, 0.407]\nC: [0.428, 0.202]\nD: [0.848, 0.045]", "question": "What is the position coordinates of the point with coordinates ([0.195, 0.402]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.55, 0.157]\nB: [0.225, 0.407]\nC: [0.428, 0.202]\nD: [0.848, 0.045]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_10_0.jpg", "./2D-spatial/point_tracking/point_tracking_10_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.606, 0.811]\nB: [0.463, 0.023]\nC: [0.307, 0.429]\nD: [0.789, 0.214]", "question": "What is the position coordinates of the point with coordinates ([0.793, 0.216]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.606, 0.811]\nB: [0.463, 0.023]\nC: [0.307, 0.429]\nD: [0.789, 0.214]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_11_0.jpg", "./2D-spatial/point_tracking/point_tracking_11_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.276, 0.532]\nB: [0.401, 0.534]\nC: [0.28, 0.157]\nD: [0.0, 0.0]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.276, 0.532]\nB: [0.401, 0.534]\nC: [0.28, 0.157]\nD: [0.0, 0.0]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_12_0.jpg", "./2D-spatial/point_tracking/point_tracking_12_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.447, 0.29]\nB: [0.574, 0.304]\nC: [0.111, 0.034]\nD: [0.966, 0.262]", "question": "What is the position coordinates of the point with coordinates ([0.574, 0.304]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.447, 0.29]\nB: [0.574, 0.304]\nC: [0.111, 0.034]\nD: [0.966, 0.262]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_13_0.jpg", "./2D-spatial/point_tracking/point_tracking_13_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.769, 0.374]\nB: [0.69, -0.054]\nC: [0.182, 0.457]\nD: [0.423, 0.809]", "question": "What is the position coordinates of the point with coordinates ([0.723, -0.019]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.769, 0.374]\nB: [0.69, -0.054]\nC: [0.182, 0.457]\nD: [0.423, 0.809]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_14_0.jpg", "./2D-spatial/point_tracking/point_tracking_14_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.861, 0.924]\nB: [0.976, 0.801]\nC: [0.63, 0.946]\nD: [0.457, 0.566]", "question": "What is the position coordinates of the point with coordinates ([0.491, 0.572]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.861, 0.924]\nB: [0.976, 0.801]\nC: [0.63, 0.946]\nD: [0.457, 0.566]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_15_0.jpg", "./2D-spatial/point_tracking/point_tracking_15_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.737, 0.324]\nB: [0.346, 0.386]\nC: [0.464, 0.662]\nD: [0.24, 0.833]", "question": "What is the position coordinates of the point with coordinates ([0.24, 0.833]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.737, 0.324]\nB: [0.346, 0.386]\nC: [0.464, 0.662]\nD: [0.24, 0.833]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_16_0.jpg", "./2D-spatial/point_tracking/point_tracking_16_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.902, 0.889]\nB: [0.734, 0.179]\nC: [0.695, 0.313]\nD: [0.552, 0.586]", "question": "What is the position coordinates of the point with coordinates ([0.552, 0.586]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.902, 0.889]\nB: [0.734, 0.179]\nC: [0.695, 0.313]\nD: [0.552, 0.586]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_17_0.jpg", "./2D-spatial/point_tracking/point_tracking_17_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.015, 0.757]\nB: [0.493, 0.371]\nC: [0.002, 0.142]\nD: [0.438, 0.698]", "question": "What is the position coordinates of the point with coordinates ([0.496, 0.371]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.015, 0.757]\nB: [0.493, 0.371]\nC: [0.002, 0.142]\nD: [0.438, 0.698]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_18_0.jpg", "./2D-spatial/point_tracking/point_tracking_18_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.629, 0.185]\nB: [0.357, 0.413]\nC: [0.521, 0.95]\nD: [0.591, 0.415]", "question": "What is the position coordinates of the point with coordinates ([0.598, 0.417]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.629, 0.185]\nB: [0.357, 0.413]\nC: [0.521, 0.95]\nD: [0.591, 0.415]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_19_0.jpg", "./2D-spatial/point_tracking/point_tracking_19_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.244, 0.498]\nB: [0.749, 0.317]\nC: [0.76, 0.581]\nD: [0.806, 0.63]", "question": "What is the position coordinates of the point with coordinates ([0.678, 0.324]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.244, 0.498]\nB: [0.749, 0.317]\nC: [0.76, 0.581]\nD: [0.806, 0.63]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_20_0.jpg", "./2D-spatial/point_tracking/point_tracking_20_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.786, 0.763]\nB: [0.139, 0.661]\nC: [0.549, 0.391]\nD: [0.901, 0.478]", "question": "What is the position coordinates of the point with coordinates ([0.577, 0.479]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.786, 0.763]\nB: [0.139, 0.661]\nC: [0.549, 0.391]\nD: [0.901, 0.478]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_21_0.jpg", "./2D-spatial/point_tracking/point_tracking_21_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.84, 0.364]\nB: [0.664, 0.326]\nC: [0.643, 0.579]\nD: [0.486, 0.458]", "question": "What is the position coordinates of the point with coordinates ([0.836, 0.364]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.84, 0.364]\nB: [0.664, 0.326]\nC: [0.643, 0.579]\nD: [0.486, 0.458]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_22_0.jpg", "./2D-spatial/point_tracking/point_tracking_22_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.667, 0.104]\nB: [0.801, 0.792]\nC: [0.271, 0.317]\nD: [0.699, 0.539]", "question": "What is the position coordinates of the point with coordinates ([0.631, 0.551]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.667, 0.104]\nB: [0.801, 0.792]\nC: [0.271, 0.317]\nD: [0.699, 0.539]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_23_0.jpg", "./2D-spatial/point_tracking/point_tracking_23_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.433, 0.435]\nB: [0.509, 0.298]\nC: [0.517, 0.969]\nD: [0.096, 0.626]", "question": "What is the position coordinates of the point with coordinates ([0.517, 0.969]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.433, 0.435]\nB: [0.509, 0.298]\nC: [0.517, 0.969]\nD: [0.096, 0.626]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_24_0.jpg", "./2D-spatial/point_tracking/point_tracking_24_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.572, 0.447]\nB: [0.317, 0.394]\nC: [0.276, 0.148]\nD: [0.404, 0.225]", "question": "What is the position coordinates of the point with coordinates ([0.571, 0.446]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.572, 0.447]\nB: [0.317, 0.394]\nC: [0.276, 0.148]\nD: [0.404, 0.225]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_25_0.jpg", "./2D-spatial/point_tracking/point_tracking_25_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.082, 0.932]\nB: [0.086, 0.159]\nC: [0.711, 0.457]\nD: [0.056, 0.373]", "question": "What is the position coordinates of the point with coordinates ([0.082, 0.932]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.082, 0.932]\nB: [0.086, 0.159]\nC: [0.711, 0.457]\nD: [0.056, 0.373]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_26_0.jpg", "./2D-spatial/point_tracking/point_tracking_26_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.151, 0.743]\nB: [0.746, 0.222]\nC: [0.439, 0.384]\nD: [0.367, 0.888]", "question": "What is the position coordinates of the point with coordinates ([0.717, 0.34]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.151, 0.743]\nB: [0.746, 0.222]\nC: [0.439, 0.384]\nD: [0.367, 0.888]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_27_0.jpg", "./2D-spatial/point_tracking/point_tracking_27_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.215, 0.968]\nB: [0.558, 0.522]\nC: [0.967, 0.723]\nD: [0.212, 0.809]", "question": "What is the position coordinates of the point with coordinates ([0.584, 0.596]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.215, 0.968]\nB: [0.558, 0.522]\nC: [0.967, 0.723]\nD: [0.212, 0.809]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_28_0.jpg", "./2D-spatial/point_tracking/point_tracking_28_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.413, 0.07]\nB: [0.437, 0.318]\nC: [0.155, 0.833]\nD: [0.607, 0.498]", "question": "What is the position coordinates of the point with coordinates ([0.525, 0.482]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.413, 0.07]\nB: [0.437, 0.318]\nC: [0.155, 0.833]\nD: [0.607, 0.498]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_29_0.jpg", "./2D-spatial/point_tracking/point_tracking_29_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.438, 0.097]\nB: [0.631, 0.018]\nC: [0.215, 0.313]\nD: [0.263, 0.723]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.438, 0.097]\nB: [0.631, 0.018]\nC: [0.215, 0.313]\nD: [0.263, 0.723]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_30_0.jpg", "./2D-spatial/point_tracking/point_tracking_30_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.896, 0.061]\nB: [0.167, 0.451]\nC: [0.216, 0.513]\nD: [0.57, 0.361]", "question": "What is the position coordinates of the point with coordinates ([0.569, 0.361]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.896, 0.061]\nB: [0.167, 0.451]\nC: [0.216, 0.513]\nD: [0.57, 0.361]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_31_0.jpg", "./2D-spatial/point_tracking/point_tracking_31_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.917, 0.582]\nB: [0.858, 0.833]\nC: [0.962, 0.955]\nD: [0.285, 0.385]", "question": "What is the position coordinates of the point with coordinates ([0.285, 0.385]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.917, 0.582]\nB: [0.858, 0.833]\nC: [0.962, 0.955]\nD: [0.285, 0.385]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_32_0.jpg", "./2D-spatial/point_tracking/point_tracking_32_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.65, 0.016]\nB: [0.761, 0.985]\nC: [0.538, 0.359]\nD: [0.842, 0.025]", "question": "What is the position coordinates of the point with coordinates ([0.537, 0.35]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.65, 0.016]\nB: [0.761, 0.985]\nC: [0.538, 0.359]\nD: [0.842, 0.025]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_33_0.jpg", "./2D-spatial/point_tracking/point_tracking_33_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.9, 0.904]\nB: [0.664, 0.466]\nC: [0.273, 0.03]\nD: [0.393, 0.275]", "question": "What is the position coordinates of the point with coordinates ([0.427, 0.335]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.9, 0.904]\nB: [0.664, 0.466]\nC: [0.273, 0.03]\nD: [0.393, 0.275]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_34_0.jpg", "./2D-spatial/point_tracking/point_tracking_34_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.051, 0.768]\nB: [0.363, 0.364]\nC: [0.376, 0.685]\nD: [0.454, 0.177]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.051, 0.768]\nB: [0.363, 0.364]\nC: [0.376, 0.685]\nD: [0.454, 0.177]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_35_0.jpg", "./2D-spatial/point_tracking/point_tracking_35_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.086, 0.538]\nB: [0.209, 0.589]\nC: [0.727, 0.366]\nD: [0.529, 0.299]", "question": "What is the position coordinates of the point with coordinates ([0.789, 0.359]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.086, 0.538]\nB: [0.209, 0.589]\nC: [0.727, 0.366]\nD: [0.529, 0.299]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_36_0.jpg", "./2D-spatial/point_tracking/point_tracking_36_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.5, 0.007]\nB: [0.731, 0.113]\nC: [0.636, 0.642]\nD: [0.325, 0.315]", "question": "What is the position coordinates of the point with coordinates ([0.5, 0.007]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.5, 0.007]\nB: [0.731, 0.113]\nC: [0.636, 0.642]\nD: [0.325, 0.315]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_37_0.jpg", "./2D-spatial/point_tracking/point_tracking_37_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.0, 0.0]\nB: [0.032, 0.829]\nC: [0.507, 0.48]\nD: [0.697, 0.839]", "question": "What is the position coordinates of the point with coordinates ([0.296, 0.358]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.032, 0.829]\nC: [0.507, 0.48]\nD: [0.697, 0.839]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_38_0.jpg", "./2D-spatial/point_tracking/point_tracking_38_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.773, 0.291]\nB: [0.256, 0.091]\nC: [0.561, 0.908]\nD: [0.572, 0.294]", "question": "What is the position coordinates of the point with coordinates ([0.572, 0.294]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.773, 0.291]\nB: [0.256, 0.091]\nC: [0.561, 0.908]\nD: [0.572, 0.294]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_39_0.jpg", "./2D-spatial/point_tracking/point_tracking_39_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.278, 0.564]\nB: [0.995, 0.367]\nC: [0.923, 0.335]\nD: [0.942, 0.46]", "question": "What is the position coordinates of the point with coordinates ([0.995, 0.367]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.278, 0.564]\nB: [0.995, 0.367]\nC: [0.923, 0.335]\nD: [0.942, 0.46]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_40_0.jpg", "./2D-spatial/point_tracking/point_tracking_40_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.791, 0.587]\nB: [0.006, 0.092]\nC: [0.454, 0.459]\nD: [0.339, 0.211]", "question": "What is the position coordinates of the point with coordinates ([0.339, 0.211]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.791, 0.587]\nB: [0.006, 0.092]\nC: [0.454, 0.459]\nD: [0.339, 0.211]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_41_0.jpg", "./2D-spatial/point_tracking/point_tracking_41_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.834, 0.042]\nB: [0.0, 0.0]\nC: [0.657, 0.031]\nD: [0.366, 0.215]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.834, 0.042]\nB: [0.0, 0.0]\nC: [0.657, 0.031]\nD: [0.366, 0.215]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_42_0.jpg", "./2D-spatial/point_tracking/point_tracking_42_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.522, 0.936]\nB: [0.431, 0.505]\nC: [0.056, 0.43]\nD: [0.445, 0.055]", "question": "What is the position coordinates of the point with coordinates ([0.465, 0.516]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.522, 0.936]\nB: [0.431, 0.505]\nC: [0.056, 0.43]\nD: [0.445, 0.055]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_43_0.jpg", "./2D-spatial/point_tracking/point_tracking_43_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.441, 0.076]\nB: [0.638, 0.275]\nC: [0.844, 0.793]\nD: [0.485, 0.944]", "question": "What is the position coordinates of the point with coordinates ([0.638, 0.276]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.441, 0.076]\nB: [0.638, 0.275]\nC: [0.844, 0.793]\nD: [0.485, 0.944]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_44_0.jpg", "./2D-spatial/point_tracking/point_tracking_44_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.464, 0.801]\nB: [0.453, 0.251]\nC: [0.254, 0.642]\nD: [0.099, 0.252]", "question": "What is the position coordinates of the point with coordinates ([0.254, 0.642]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.464, 0.801]\nB: [0.453, 0.251]\nC: [0.254, 0.642]\nD: [0.099, 0.252]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_45_0.jpg", "./2D-spatial/point_tracking/point_tracking_45_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.281, 0.178]\nB: [0.162, 0.715]\nC: [0.761, 0.046]\nD: [0.557, 0.001]", "question": "What is the position coordinates of the point with coordinates ([0.571, 0.033]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.281, 0.178]\nB: [0.162, 0.715]\nC: [0.761, 0.046]\nD: [0.557, 0.001]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_46_0.jpg", "./2D-spatial/point_tracking/point_tracking_46_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [1.0, 0.279]\nB: [0.584, 0.204]\nC: [0.191, 0.877]\nD: [0.563, 0.267]", "question": "What is the position coordinates of the point with coordinates ([0.582, 0.204]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [1.0, 0.279]\nB: [0.584, 0.204]\nC: [0.191, 0.877]\nD: [0.563, 0.267]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_47_0.jpg", "./2D-spatial/point_tracking/point_tracking_47_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.002, 0.108]\nB: [0.549, 0.37]\nC: [0.846, 0.072]\nD: [0.502, 0.698]", "question": "What is the position coordinates of the point with coordinates ([0.552, 0.368]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.002, 0.108]\nB: [0.549, 0.37]\nC: [0.846, 0.072]\nD: [0.502, 0.698]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_48_0.jpg", "./2D-spatial/point_tracking/point_tracking_48_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.509, 0.858]\nB: [0.643, 0.572]\nC: [0.432, 0.735]\nD: [0.542, 0.338]", "question": "What is the position coordinates of the point with coordinates ([0.542, 0.339]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.509, 0.858]\nB: [0.643, 0.572]\nC: [0.432, 0.735]\nD: [0.542, 0.338]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_49_0.jpg", "./2D-spatial/point_tracking/point_tracking_49_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.048, 0.391]\nB: [0.787, 0.747]\nC: [0.518, 0.517]\nD: [0.507, 0.833]", "question": "What is the position coordinates of the point with coordinates ([0.515, 0.514]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.048, 0.391]\nB: [0.787, 0.747]\nC: [0.518, 0.517]\nD: [0.507, 0.833]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_50_0.jpg", "./2D-spatial/point_tracking/point_tracking_50_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.528, 0.45]\nB: [0.74, 0.315]\nC: [0.482, 0.584]\nD: [0.088, 0.042]", "question": "What is the position coordinates of the point with coordinates ([0.723, 0.386]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.528, 0.45]\nB: [0.74, 0.315]\nC: [0.482, 0.584]\nD: [0.088, 0.042]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_51_0.jpg", "./2D-spatial/point_tracking/point_tracking_51_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.04, 0.904]\nB: [0.4, 0.187]\nC: [0.134, 0.465]\nD: [0.294, 0.45]", "question": "What is the position coordinates of the point with coordinates ([0.056, 0.907]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.04, 0.904]\nB: [0.4, 0.187]\nC: [0.134, 0.465]\nD: [0.294, 0.45]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_52_0.jpg", "./2D-spatial/point_tracking/point_tracking_52_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.232, 0.071]\nB: [0.57, 0.335]\nC: [0.206, 0.4]\nD: [0.554, 0.081]", "question": "What is the position coordinates of the point with coordinates ([0.585, 0.342]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.232, 0.071]\nB: [0.57, 0.335]\nC: [0.206, 0.4]\nD: [0.554, 0.081]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_53_0.jpg", "./2D-spatial/point_tracking/point_tracking_53_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.043, 0.026]\nB: [0.536, 0.287]\nC: [0.878, 0.179]\nD: [0.519, 0.466]", "question": "What is the position coordinates of the point with coordinates ([0.537, 0.483]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 910 and the height is 480.", "context": "Select from the following choices.\nA: [0.043, 0.026]\nB: [0.536, 0.287]\nC: [0.878, 0.179]\nD: [0.519, 0.466]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_54_0.jpg", "./2D-spatial/point_tracking/point_tracking_54_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.712, 0.402]\nB: [0.937, 0.199]\nC: [0.286, 0.017]\nD: [0.843, 0.865]", "question": "What is the position coordinates of the point with coordinates ([0.309, -0.011]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.712, 0.402]\nB: [0.937, 0.199]\nC: [0.286, 0.017]\nD: [0.843, 0.865]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_55_0.jpg", "./2D-spatial/point_tracking/point_tracking_55_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.014, 0.882]\nB: [0.728, 0.689]\nC: [0.088, 0.375]\nD: [0.554, 0.511]", "question": "What is the position coordinates of the point with coordinates ([0.6, 0.808]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.014, 0.882]\nB: [0.728, 0.689]\nC: [0.088, 0.375]\nD: [0.554, 0.511]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_56_0.jpg", "./2D-spatial/point_tracking/point_tracking_56_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.109, 0.199]\nB: [0.021, 0.741]\nC: [0.0, 0.0]\nD: [0.405, 0.69]", "question": "What is the position coordinates of the point with coordinates ([0.572, 0.171]) in Image 1 within the Image 2? Note that the width of the input RGB image is 910 and the height is 480.", "context": "Select from the following choices.\nA: [0.109, 0.199]\nB: [0.021, 0.741]\nC: [0.0, 0.0]\nD: [0.405, 0.69]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_57_0.jpg", "./2D-spatial/point_tracking/point_tracking_57_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.38, 0.932]\nB: [0.47, 0.409]\nC: [0.528, 0.936]\nD: [0.533, 0.686]", "question": "What is the position coordinates of the point with coordinates ([0.459, 0.412]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.38, 0.932]\nB: [0.47, 0.409]\nC: [0.528, 0.936]\nD: [0.533, 0.686]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_58_0.jpg", "./2D-spatial/point_tracking/point_tracking_58_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.092, 0.225]\nB: [0.605, 0.232]\nC: [0.39, 0.458]\nD: [0.377, 0.065]", "question": "What is the position coordinates of the point with coordinates ([0.064, 0.24]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.092, 0.225]\nB: [0.605, 0.232]\nC: [0.39, 0.458]\nD: [0.377, 0.065]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_59_0.jpg", "./2D-spatial/point_tracking/point_tracking_59_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.307, 0.516]\nB: [0.508, 0.388]\nC: [0.368, 0.937]\nD: [0.527, 0.106]", "question": "What is the position coordinates of the point with coordinates ([0.513, 0.478]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.307, 0.516]\nB: [0.508, 0.388]\nC: [0.368, 0.937]\nD: [0.527, 0.106]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_60_0.jpg", "./2D-spatial/point_tracking/point_tracking_60_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.621, 0.322]\nB: [0.757, 0.909]\nC: [0.765, 0.887]\nD: [0.485, 0.282]", "question": "What is the position coordinates of the point with coordinates ([0.543, 0.573]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.621, 0.322]\nB: [0.757, 0.909]\nC: [0.765, 0.887]\nD: [0.485, 0.282]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_61_0.jpg", "./2D-spatial/point_tracking/point_tracking_61_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.148, 0.593]\nB: [0.867, 0.594]\nC: [0.363, 0.725]\nD: [0.988, 0.381]", "question": "What is the position coordinates of the point with coordinates ([0.363, 0.725]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.148, 0.593]\nB: [0.867, 0.594]\nC: [0.363, 0.725]\nD: [0.988, 0.381]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_62_0.jpg", "./2D-spatial/point_tracking/point_tracking_62_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.538, 0.67]\nB: [0.113, 0.312]\nC: [0.781, 0.017]\nD: [0.78, 0.124]", "question": "What is the position coordinates of the point with coordinates ([0.113, 0.312]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.538, 0.67]\nB: [0.113, 0.312]\nC: [0.781, 0.017]\nD: [0.78, 0.124]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_63_0.jpg", "./2D-spatial/point_tracking/point_tracking_63_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.409, 0.184]\nB: [0.327, 0.555]\nC: [0.304, 0.166]\nD: [0.398, 0.141]", "question": "What is the position coordinates of the point with coordinates ([0.318, 0.204]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.409, 0.184]\nB: [0.327, 0.555]\nC: [0.304, 0.166]\nD: [0.398, 0.141]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_64_0.jpg", "./2D-spatial/point_tracking/point_tracking_64_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.663, 0.685]\nB: [0.5, 0.562]\nC: [0.628, 0.094]\nD: [0.876, 0.492]", "question": "What is the position coordinates of the point with coordinates ([0.5, 0.546]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.663, 0.685]\nB: [0.5, 0.562]\nC: [0.628, 0.094]\nD: [0.876, 0.492]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_65_0.jpg", "./2D-spatial/point_tracking/point_tracking_65_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.148, 0.822]\nB: [0.654, 0.462]\nC: [0.274, 0.087]\nD: [0.294, 0.87]", "question": "What is the position coordinates of the point with coordinates ([0.225, -0.034]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.148, 0.822]\nB: [0.654, 0.462]\nC: [0.274, 0.087]\nD: [0.294, 0.87]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_66_0.jpg", "./2D-spatial/point_tracking/point_tracking_66_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.443, 0.32]\nB: [0.907, 0.404]\nC: [0.451, 0.543]\nD: [0.775, 0.465]", "question": "What is the position coordinates of the point with coordinates ([0.775, 0.465]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.443, 0.32]\nB: [0.907, 0.404]\nC: [0.451, 0.543]\nD: [0.775, 0.465]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_67_0.jpg", "./2D-spatial/point_tracking/point_tracking_67_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.912, 0.423]\nB: [0.477, 0.187]\nC: [0.439, 0.609]\nD: [0.127, 0.162]", "question": "What is the position coordinates of the point with coordinates ([0.475, 0.188]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.912, 0.423]\nB: [0.477, 0.187]\nC: [0.439, 0.609]\nD: [0.127, 0.162]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_68_0.jpg", "./2D-spatial/point_tracking/point_tracking_68_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.348, 0.247]\nB: [0.53, 0.395]\nC: [0.894, 0.004]\nD: [0.561, 0.958]", "question": "What is the position coordinates of the point with coordinates ([0.528, 0.394]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.348, 0.247]\nB: [0.53, 0.395]\nC: [0.894, 0.004]\nD: [0.561, 0.958]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_69_0.jpg", "./2D-spatial/point_tracking/point_tracking_69_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.451, 0.01]\nB: [0.588, 0.525]\nC: [0.542, 0.784]\nD: [0.271, 0.069]", "question": "What is the position coordinates of the point with coordinates ([0.21, 0.024]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.451, 0.01]\nB: [0.588, 0.525]\nC: [0.542, 0.784]\nD: [0.271, 0.069]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_70_0.jpg", "./2D-spatial/point_tracking/point_tracking_70_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.545, 0.343]\nB: [0.872, 0.767]\nC: [0.848, 0.331]\nD: [0.082, 0.655]", "question": "What is the position coordinates of the point with coordinates ([0.546, 0.344]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.545, 0.343]\nB: [0.872, 0.767]\nC: [0.848, 0.331]\nD: [0.082, 0.655]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_71_0.jpg", "./2D-spatial/point_tracking/point_tracking_71_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.246, 0.558]\nB: [0.213, 0.365]\nC: [0.605, 0.491]\nD: [0.56, -0.031]", "question": "What is the position coordinates of the point with coordinates ([0.527, 0.136]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.246, 0.558]\nB: [0.213, 0.365]\nC: [0.605, 0.491]\nD: [0.56, -0.031]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_72_0.jpg", "./2D-spatial/point_tracking/point_tracking_72_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.497, 0.448]\nB: [0.783, 0.271]\nC: [0.406, 0.738]\nD: [0.416, 0.886]", "question": "What is the position coordinates of the point with coordinates ([0.783, 0.271]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.497, 0.448]\nB: [0.783, 0.271]\nC: [0.406, 0.738]\nD: [0.416, 0.886]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_73_0.jpg", "./2D-spatial/point_tracking/point_tracking_73_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.939, 0.844]\nB: [0.453, 0.842]\nC: [0.019, 0.701]\nD: [0.33, 0.019]", "question": "What is the position coordinates of the point with coordinates ([0.382, 0.074]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.939, 0.844]\nB: [0.453, 0.842]\nC: [0.019, 0.701]\nD: [0.33, 0.019]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_74_0.jpg", "./2D-spatial/point_tracking/point_tracking_74_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.712, 0.468]\nB: [0.757, 0.203]\nC: [0.602, 0.149]\nD: [0.624, 0.442]", "question": "What is the position coordinates of the point with coordinates ([0.625, 0.442]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.712, 0.468]\nB: [0.757, 0.203]\nC: [0.602, 0.149]\nD: [0.624, 0.442]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_75_0.jpg", "./2D-spatial/point_tracking/point_tracking_75_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.775, 0.586]\nB: [0.403, 0.947]\nC: [0.0, 0.0]\nD: [0.095, 0.525]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 910 and the height is 480.", "context": "Select from the following choices.\nA: [0.775, 0.586]\nB: [0.403, 0.947]\nC: [0.0, 0.0]\nD: [0.095, 0.525]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_76_0.jpg", "./2D-spatial/point_tracking/point_tracking_76_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.306, 0.517]\nB: [0.404, 0.704]\nC: [0.0, 0.0]\nD: [0.389, 0.429]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.306, 0.517]\nB: [0.404, 0.704]\nC: [0.0, 0.0]\nD: [0.389, 0.429]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_77_0.jpg", "./2D-spatial/point_tracking/point_tracking_77_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.421, 0.202]\nB: [0.936, 0.193]\nC: [0.836, 0.093]\nD: [0.892, 0.905]", "question": "What is the position coordinates of the point with coordinates ([0.603, 0.295]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.421, 0.202]\nB: [0.936, 0.193]\nC: [0.836, 0.093]\nD: [0.892, 0.905]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_78_0.jpg", "./2D-spatial/point_tracking/point_tracking_78_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.415, 0.336]\nB: [0.147, 0.444]\nC: [0.469, 0.996]\nD: [0.759, 0.125]", "question": "What is the position coordinates of the point with coordinates ([0.415, 0.336]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.415, 0.336]\nB: [0.147, 0.444]\nC: [0.469, 0.996]\nD: [0.759, 0.125]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_79_0.jpg", "./2D-spatial/point_tracking/point_tracking_79_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.0, 0.0]\nB: [0.001, 0.519]\nC: [0.21, 0.901]\nD: [0.72, 0.872]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.001, 0.519]\nC: [0.21, 0.901]\nD: [0.72, 0.872]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_80_0.jpg", "./2D-spatial/point_tracking/point_tracking_80_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.188, 0.294]\nB: [0.08, 0.837]\nC: [0.878, 0.923]\nD: [0.39, 0.215]", "question": "What is the position coordinates of the point with coordinates ([0.39, 0.215]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.188, 0.294]\nB: [0.08, 0.837]\nC: [0.878, 0.923]\nD: [0.39, 0.215]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_81_0.jpg", "./2D-spatial/point_tracking/point_tracking_81_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.725, 0.505]\nB: [0.825, 0.634]\nC: [0.772, 0.85]\nD: [0.521, 0.137]", "question": "What is the position coordinates of the point with coordinates ([0.423, 0.126]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.725, 0.505]\nB: [0.825, 0.634]\nC: [0.772, 0.85]\nD: [0.521, 0.137]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_82_0.jpg", "./2D-spatial/point_tracking/point_tracking_82_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.937, 0.437]\nB: [0.932, 0.955]\nC: [0.443, 0.473]\nD: [0.57, -0.021]", "question": "What is the position coordinates of the point with coordinates ([0.379, -0.029]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.937, 0.437]\nB: [0.932, 0.955]\nC: [0.443, 0.473]\nD: [0.57, -0.021]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_83_0.jpg", "./2D-spatial/point_tracking/point_tracking_83_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.482, 0.199]\nB: [0.043, 0.981]\nC: [0.419, 0.373]\nD: [0.325, 0.861]", "question": "What is the position coordinates of the point with coordinates ([0.482, 0.199]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.482, 0.199]\nB: [0.043, 0.981]\nC: [0.419, 0.373]\nD: [0.325, 0.861]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_84_0.jpg", "./2D-spatial/point_tracking/point_tracking_84_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.27, 0.844]\nB: [0.082, -0.197]\nC: [0.942, 0.56]\nD: [0.625, 0.212]", "question": "What is the position coordinates of the point with coordinates ([0.314, -0.235]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.27, 0.844]\nB: [0.082, -0.197]\nC: [0.942, 0.56]\nD: [0.625, 0.212]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_85_0.jpg", "./2D-spatial/point_tracking/point_tracking_85_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.493, 0.952]\nB: [0.403, 0.455]\nC: [0.764, 0.389]\nD: [0.3, 0.08]", "question": "What is the position coordinates of the point with coordinates ([0.442, 0.361]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.493, 0.952]\nB: [0.403, 0.455]\nC: [0.764, 0.389]\nD: [0.3, 0.08]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_86_0.jpg", "./2D-spatial/point_tracking/point_tracking_86_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [-0.038, 0.253]\nB: [0.77, 0.338]\nC: [0.766, 0.061]\nD: [0.958, 0.882]", "question": "What is the position coordinates of the point with coordinates ([0.028, 0.3]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [-0.038, 0.253]\nB: [0.77, 0.338]\nC: [0.766, 0.061]\nD: [0.958, 0.882]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_87_0.jpg", "./2D-spatial/point_tracking/point_tracking_87_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.005, 0.571]\nB: [0.168, 0.518]\nC: [0.523, 0.466]\nD: [0.784, 0.541]", "question": "What is the position coordinates of the point with coordinates ([0.553, 0.401]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.005, 0.571]\nB: [0.168, 0.518]\nC: [0.523, 0.466]\nD: [0.784, 0.541]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_88_0.jpg", "./2D-spatial/point_tracking/point_tracking_88_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.256, 0.018]\nB: [0.492, 0.583]\nC: [0.579, 0.753]\nD: [0.756, 0.803]", "question": "What is the position coordinates of the point with coordinates ([0.536, 0.384]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.256, 0.018]\nB: [0.492, 0.583]\nC: [0.579, 0.753]\nD: [0.756, 0.803]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_89_0.jpg", "./2D-spatial/point_tracking/point_tracking_89_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.4, 0.819]\nB: [0.315, 0.418]\nC: [0.695, 0.574]\nD: [0.934, 0.028]", "question": "What is the position coordinates of the point with coordinates ([0.315, 0.418]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.4, 0.819]\nB: [0.315, 0.418]\nC: [0.695, 0.574]\nD: [0.934, 0.028]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_90_0.jpg", "./2D-spatial/point_tracking/point_tracking_90_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.161, 0.915]\nB: [0.55, -0.109]\nC: [0.025, 0.306]\nD: [0.859, 0.383]", "question": "What is the position coordinates of the point with coordinates ([0.556, -0.112]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.161, 0.915]\nB: [0.55, -0.109]\nC: [0.025, 0.306]\nD: [0.859, 0.383]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_91_0.jpg", "./2D-spatial/point_tracking/point_tracking_91_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.796, 0.095]\nB: [0.902, 0.871]\nC: [0.454, 0.805]\nD: [0.399, 0.254]", "question": "What is the position coordinates of the point with coordinates ([0.336, 0.127]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.796, 0.095]\nB: [0.902, 0.871]\nC: [0.454, 0.805]\nD: [0.399, 0.254]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_92_0.jpg", "./2D-spatial/point_tracking/point_tracking_92_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.78, 0.578]\nB: [0.586, 0.492]\nC: [0.362, 0.862]\nD: [0.308, 0.418]", "question": "What is the position coordinates of the point with coordinates ([0.516, 0.501]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.78, 0.578]\nB: [0.586, 0.492]\nC: [0.362, 0.862]\nD: [0.308, 0.418]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_93_0.jpg", "./2D-spatial/point_tracking/point_tracking_93_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.771, 0.142]\nB: [0.516, 0.41]\nC: [0.068, 0.844]\nD: [0.331, 0.532]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.771, 0.142]\nB: [0.516, 0.41]\nC: [0.068, 0.844]\nD: [0.331, 0.532]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_94_0.jpg", "./2D-spatial/point_tracking/point_tracking_94_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.0, 0.0]\nB: [0.53, 0.297]\nC: [0.365, 0.027]\nD: [0.781, 0.768]", "question": "What is the position coordinates of the point with coordinates ([0.372, 0.327]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.53, 0.297]\nC: [0.365, 0.027]\nD: [0.781, 0.768]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_95_0.jpg", "./2D-spatial/point_tracking/point_tracking_95_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.198, 0.526]\nB: [0.435, 0.603]\nC: [0.508, 0.551]\nD: [0.55, 0.363]", "question": "What is the position coordinates of the point with coordinates ([0.508, 0.551]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.198, 0.526]\nB: [0.435, 0.603]\nC: [0.508, 0.551]\nD: [0.55, 0.363]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_96_0.jpg", "./2D-spatial/point_tracking/point_tracking_96_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.12, 0.762]\nB: [0.674, 0.29]\nC: [0.557, 0.641]\nD: [0.055, 0.586]", "question": "What is the position coordinates of the point with coordinates ([0.597, 0.28]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.12, 0.762]\nB: [0.674, 0.29]\nC: [0.557, 0.641]\nD: [0.055, 0.586]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_97_0.jpg", "./2D-spatial/point_tracking/point_tracking_97_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.064, 0.785]\nB: [0.378, 0.667]\nC: [0.522, 0.235]\nD: [0.437, 0.118]", "question": "What is the position coordinates of the point with coordinates ([0.378, 0.667]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.064, 0.785]\nB: [0.378, 0.667]\nC: [0.522, 0.235]\nD: [0.437, 0.118]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_98_0.jpg", "./2D-spatial/point_tracking/point_tracking_98_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.875, 0.931]\nB: [0.087, 0.702]\nC: [0.508, 0.69]\nD: [0.046, 0.524]", "question": "What is the position coordinates of the point with coordinates ([0.251, 0.5]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.875, 0.931]\nB: [0.087, 0.702]\nC: [0.508, 0.69]\nD: [0.046, 0.524]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_99_0.jpg", "./2D-spatial/point_tracking/point_tracking_99_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.449, 0.349]\nB: [0.497, 0.606]\nC: [0.545, 0.303]\nD: [0.125, 0.458]", "question": "What is the position coordinates of the point with coordinates ([0.527, 0.379]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.449, 0.349]\nB: [0.497, 0.606]\nC: [0.545, 0.303]\nD: [0.125, 0.458]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_100_0.jpg", "./2D-spatial/point_tracking/point_tracking_100_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.276, 0.406]\nB: [0.94, 0.417]\nC: [0.807, 0.617]\nD: [0.151, 0.326]", "question": "What is the position coordinates of the point with coordinates ([0.266, 0.607]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.276, 0.406]\nB: [0.94, 0.417]\nC: [0.807, 0.617]\nD: [0.151, 0.326]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_101_0.jpg", "./2D-spatial/point_tracking/point_tracking_101_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.851, 0.365]\nB: [0.558, 0.074]\nC: [0.378, 0.002]\nD: [0.075, 0.676]", "question": "What is the position coordinates of the point with coordinates ([0.852, 0.365]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.851, 0.365]\nB: [0.558, 0.074]\nC: [0.378, 0.002]\nD: [0.075, 0.676]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_102_0.jpg", "./2D-spatial/point_tracking/point_tracking_102_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.0, 0.0]\nB: [0.515, 0.178]\nC: [0.197, 0.534]\nD: [0.536, 0.497]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 910 and the height is 480.", "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.515, 0.178]\nC: [0.197, 0.534]\nD: [0.536, 0.497]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_103_0.jpg", "./2D-spatial/point_tracking/point_tracking_103_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.133, 0.966]\nB: [0.167, 0.473]\nC: [0.808, 0.497]\nD: [0.597, 0.39]", "question": "What is the position coordinates of the point with coordinates ([0.502, 0.304]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.133, 0.966]\nB: [0.167, 0.473]\nC: [0.808, 0.497]\nD: [0.597, 0.39]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_104_0.jpg", "./2D-spatial/point_tracking/point_tracking_104_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.635, 0.971]\nB: [0.243, 0.351]\nC: [0.0, 0.0]\nD: [0.995, 0.403]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.635, 0.971]\nB: [0.243, 0.351]\nC: [0.0, 0.0]\nD: [0.995, 0.403]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_105_0.jpg", "./2D-spatial/point_tracking/point_tracking_105_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.0, 0.0]\nB: [0.766, 0.625]\nC: [0.702, 0.537]\nD: [0.4, 0.901]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.766, 0.625]\nC: [0.702, 0.537]\nD: [0.4, 0.901]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_106_0.jpg", "./2D-spatial/point_tracking/point_tracking_106_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.04, 0.521]\nB: [0.013, 0.863]\nC: [0.041, 0.677]\nD: [0.471, 0.865]", "question": "What is the position coordinates of the point with coordinates ([0.554, 0.401]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.04, 0.521]\nB: [0.013, 0.863]\nC: [0.041, 0.677]\nD: [0.471, 0.865]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_107_0.jpg", "./2D-spatial/point_tracking/point_tracking_107_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.422, 0.094]\nB: [0.036, 0.241]\nC: [0.832, 0.759]\nD: [0.084, 0.371]", "question": "What is the position coordinates of the point with coordinates ([0.016, 0.253]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.422, 0.094]\nB: [0.036, 0.241]\nC: [0.832, 0.759]\nD: [0.084, 0.371]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_108_0.jpg", "./2D-spatial/point_tracking/point_tracking_108_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.435, 0.035]\nB: [0.758, 0.725]\nC: [0.428, 0.944]\nD: [0.191, 0.586]", "question": "What is the position coordinates of the point with coordinates ([0.758, 0.725]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.435, 0.035]\nB: [0.758, 0.725]\nC: [0.428, 0.944]\nD: [0.191, 0.586]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_109_0.jpg", "./2D-spatial/point_tracking/point_tracking_109_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.486, 0.472]\nB: [0.0, 0.409]\nC: [0.679, 0.71]\nD: [0.474, 0.443]", "question": "What is the position coordinates of the point with coordinates ([0.383, 0.481]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.486, 0.472]\nB: [0.0, 0.409]\nC: [0.679, 0.71]\nD: [0.474, 0.443]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_110_0.jpg", "./2D-spatial/point_tracking/point_tracking_110_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.832, 0.16]\nB: [0.767, 0.295]\nC: [0.238, 0.998]\nD: [0.231, 0.345]", "question": "What is the position coordinates of the point with coordinates ([0.278, 0.312]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.832, 0.16]\nB: [0.767, 0.295]\nC: [0.238, 0.998]\nD: [0.231, 0.345]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_111_0.jpg", "./2D-spatial/point_tracking/point_tracking_111_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.98, 0.565]\nB: [0.053, 0.674]\nC: [0.564, 0.876]\nD: [0.452, 0.539]", "question": "What is the position coordinates of the point with coordinates ([0.98, 0.565]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.98, 0.565]\nB: [0.053, 0.674]\nC: [0.564, 0.876]\nD: [0.452, 0.539]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_112_0.jpg", "./2D-spatial/point_tracking/point_tracking_112_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.335, 0.563]\nB: [0.88, 0.001]\nC: [0.119, 0.693]\nD: [0.484, 0.412]", "question": "What is the position coordinates of the point with coordinates ([0.521, 0.426]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.335, 0.563]\nB: [0.88, 0.001]\nC: [0.119, 0.693]\nD: [0.484, 0.412]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_113_0.jpg", "./2D-spatial/point_tracking/point_tracking_113_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.626, 0.275]\nB: [0.815, 0.877]\nC: [0.004, 0.083]\nD: [0.871, 0.172]", "question": "What is the position coordinates of the point with coordinates ([0.631, 0.278]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.626, 0.275]\nB: [0.815, 0.877]\nC: [0.004, 0.083]\nD: [0.871, 0.172]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_114_0.jpg", "./2D-spatial/point_tracking/point_tracking_114_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.017, 0.757]\nB: [0.637, 0.134]\nC: [0.823, 0.303]\nD: [0.415, 0.038]", "question": "What is the position coordinates of the point with coordinates ([0.475, 0.03]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.017, 0.757]\nB: [0.637, 0.134]\nC: [0.823, 0.303]\nD: [0.415, 0.038]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_115_0.jpg", "./2D-spatial/point_tracking/point_tracking_115_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.278, 0.07]\nB: [0.287, 0.704]\nC: [0.387, 0.197]\nD: [0.443, 0.105]", "question": "What is the position coordinates of the point with coordinates ([0.414, -0.045]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.278, 0.07]\nB: [0.287, 0.704]\nC: [0.387, 0.197]\nD: [0.443, 0.105]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_116_0.jpg", "./2D-spatial/point_tracking/point_tracking_116_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.243, 0.44]\nB: [0.089, 0.367]\nC: [0.322, 0.069]\nD: [0.126, 0.424]", "question": "What is the position coordinates of the point with coordinates ([0.126, 0.424]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.243, 0.44]\nB: [0.089, 0.367]\nC: [0.322, 0.069]\nD: [0.126, 0.424]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_117_0.jpg", "./2D-spatial/point_tracking/point_tracking_117_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.878, 0.91]\nB: [0.509, 0.022]\nC: [0.259, 0.162]\nD: [0.213, 0.977]", "question": "What is the position coordinates of the point with coordinates ([0.265, 0.16]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.878, 0.91]\nB: [0.509, 0.022]\nC: [0.259, 0.162]\nD: [0.213, 0.977]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_118_0.jpg", "./2D-spatial/point_tracking/point_tracking_118_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.451, 0.674]\nB: [0.529, 0.336]\nC: [0.137, 0.847]\nD: [0.081, 0.187]", "question": "What is the position coordinates of the point with coordinates ([0.529, 0.336]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.451, 0.674]\nB: [0.529, 0.336]\nC: [0.137, 0.847]\nD: [0.081, 0.187]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_119_0.jpg", "./2D-spatial/point_tracking/point_tracking_119_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.354, 0.137]\nB: [0.831, 0.926]\nC: [0.473, 0.743]\nD: [0.228, 0.73]", "question": "What is the position coordinates of the point with coordinates ([0.473, 0.743]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.354, 0.137]\nB: [0.831, 0.926]\nC: [0.473, 0.743]\nD: [0.228, 0.73]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_120_0.jpg", "./2D-spatial/point_tracking/point_tracking_120_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.703, 0.241]\nB: [0.985, 0.235]\nC: [0.439, 0.494]\nD: [0.614, 0.184]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.703, 0.241]\nB: [0.985, 0.235]\nC: [0.439, 0.494]\nD: [0.614, 0.184]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_121_0.jpg", "./2D-spatial/point_tracking/point_tracking_121_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.035, 0.992]\nB: [0.994, 0.321]\nC: [0.839, 0.258]\nD: [0.414, 0.367]", "question": "What is the position coordinates of the point with coordinates ([0.306, 0.334]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.035, 0.992]\nB: [0.994, 0.321]\nC: [0.839, 0.258]\nD: [0.414, 0.367]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_122_0.jpg", "./2D-spatial/point_tracking/point_tracking_122_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.613, 0.309]\nB: [0.457, 0.931]\nC: [0.669, 0.383]\nD: [0.938, 0.837]", "question": "What is the position coordinates of the point with coordinates ([0.602, 0.319]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.613, 0.309]\nB: [0.457, 0.931]\nC: [0.669, 0.383]\nD: [0.938, 0.837]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_123_0.jpg", "./2D-spatial/point_tracking/point_tracking_123_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.533, 0.568]\nB: [0.394, 0.545]\nC: [0.429, 0.604]\nD: [0.299, 0.66]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.533, 0.568]\nB: [0.394, 0.545]\nC: [0.429, 0.604]\nD: [0.299, 0.66]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_124_0.jpg", "./2D-spatial/point_tracking/point_tracking_124_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.418, 0.337]\nB: [0.703, 0.614]\nC: [0.256, 0.811]\nD: [0.753, 0.192]", "question": "What is the position coordinates of the point with coordinates ([0.419, 0.337]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.418, 0.337]\nB: [0.703, 0.614]\nC: [0.256, 0.811]\nD: [0.753, 0.192]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_125_0.jpg", "./2D-spatial/point_tracking/point_tracking_125_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.451, 0.63]\nB: [0.161, 0.672]\nC: [0.117, 0.38]\nD: [0.918, 0.717]", "question": "What is the position coordinates of the point with coordinates ([0.161, 0.672]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.451, 0.63]\nB: [0.161, 0.672]\nC: [0.117, 0.38]\nD: [0.918, 0.717]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_126_0.jpg", "./2D-spatial/point_tracking/point_tracking_126_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.487, 0.784]\nB: [0.155, 0.004]\nC: [0.336, 0.564]\nD: [0.045, 0.917]", "question": "What is the position coordinates of the point with coordinates ([0.616, 0.544]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.487, 0.784]\nB: [0.155, 0.004]\nC: [0.336, 0.564]\nD: [0.045, 0.917]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_127_0.jpg", "./2D-spatial/point_tracking/point_tracking_127_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.352, 0.409]\nB: [0.959, 0.481]\nC: [0.373, 0.245]\nD: [0.977, 0.091]", "question": "What is the position coordinates of the point with coordinates ([0.352, 0.409]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.352, 0.409]\nB: [0.959, 0.481]\nC: [0.373, 0.245]\nD: [0.977, 0.091]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_128_0.jpg", "./2D-spatial/point_tracking/point_tracking_128_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.859, 0.311]\nB: [0.589, 0.682]\nC: [0.306, 0.308]\nD: [0.219, 0.979]", "question": "What is the position coordinates of the point with coordinates ([0.304, 0.307]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.859, 0.311]\nB: [0.589, 0.682]\nC: [0.306, 0.308]\nD: [0.219, 0.979]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_129_0.jpg", "./2D-spatial/point_tracking/point_tracking_129_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.472, 0.938]\nB: [0.873, 0.948]\nC: [0.511, 0.28]\nD: [0.829, 0.346]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.472, 0.938]\nB: [0.873, 0.948]\nC: [0.511, 0.28]\nD: [0.829, 0.346]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_130_0.jpg", "./2D-spatial/point_tracking/point_tracking_130_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.446, 0.638]\nB: [0.628, 0.456]\nC: [0.455, 0.627]\nD: [0.379, 0.405]", "question": "What is the position coordinates of the point with coordinates ([0.56, 0.467]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.446, 0.638]\nB: [0.628, 0.456]\nC: [0.455, 0.627]\nD: [0.379, 0.405]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_131_0.jpg", "./2D-spatial/point_tracking/point_tracking_131_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.322, 0.321]\nB: [0.15, 0.133]\nC: [0.989, 0.972]\nD: [0.16, 0.862]", "question": "What is the position coordinates of the point with coordinates ([0.232, 0.188]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.322, 0.321]\nB: [0.15, 0.133]\nC: [0.989, 0.972]\nD: [0.16, 0.862]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_132_0.jpg", "./2D-spatial/point_tracking/point_tracking_132_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.931, 0.959]\nB: [0.506, 0.286]\nC: [0.391, 0.531]\nD: [0.469, 0.383]", "question": "What is the position coordinates of the point with coordinates ([0.465, 0.381]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.931, 0.959]\nB: [0.506, 0.286]\nC: [0.391, 0.531]\nD: [0.469, 0.383]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_133_0.jpg", "./2D-spatial/point_tracking/point_tracking_133_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.109, 0.806]\nB: [0.197, 0.457]\nC: [0.203, 0.114]\nD: [0.135, 0.938]", "question": "What is the position coordinates of the point with coordinates ([0.2, 0.047]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.109, 0.806]\nB: [0.197, 0.457]\nC: [0.203, 0.114]\nD: [0.135, 0.938]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_134_0.jpg", "./2D-spatial/point_tracking/point_tracking_134_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.24, 0.833]\nB: [0.308, 0.425]\nC: [0.339, 0.639]\nD: [0.077, 0.998]", "question": "What is the position coordinates of the point with coordinates ([0.24, 0.833]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.24, 0.833]\nB: [0.308, 0.425]\nC: [0.339, 0.639]\nD: [0.077, 0.998]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_135_0.jpg", "./2D-spatial/point_tracking/point_tracking_135_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.0, 0.0]\nB: [0.721, 0.606]\nC: [0.121, 0.428]\nD: [0.252, 0.486]", "question": "What is the position coordinates of the point with coordinates ([0.496, 0.539]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.721, 0.606]\nC: [0.121, 0.428]\nD: [0.252, 0.486]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_136_0.jpg", "./2D-spatial/point_tracking/point_tracking_136_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.675, 0.84]\nB: [0.087, 0.791]\nC: [0.736, 0.705]\nD: [0.092, 0.465]", "question": "What is the position coordinates of the point with coordinates ([0.131, 0.527]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.675, 0.84]\nB: [0.087, 0.791]\nC: [0.736, 0.705]\nD: [0.092, 0.465]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_137_0.jpg", "./2D-spatial/point_tracking/point_tracking_137_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.188, 0.925]\nB: [0.922, 0.115]\nC: [0.894, 0.22]\nD: [0.022, 0.091]", "question": "What is the position coordinates of the point with coordinates ([0.195, 0.928]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.188, 0.925]\nB: [0.922, 0.115]\nC: [0.894, 0.22]\nD: [0.022, 0.091]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_138_0.jpg", "./2D-spatial/point_tracking/point_tracking_138_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.962, 0.897]\nB: [0.754, 0.628]\nC: [0.384, 0.96]\nD: [0.784, 0.178]", "question": "What is the position coordinates of the point with coordinates ([0.384, 0.96]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.962, 0.897]\nB: [0.754, 0.628]\nC: [0.384, 0.96]\nD: [0.784, 0.178]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_139_0.jpg", "./2D-spatial/point_tracking/point_tracking_139_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.021, 0.739]\nB: [0.0, 0.0]\nC: [0.701, 0.818]\nD: [0.335, 0.057]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.021, 0.739]\nB: [0.0, 0.0]\nC: [0.701, 0.818]\nD: [0.335, 0.057]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_140_0.jpg", "./2D-spatial/point_tracking/point_tracking_140_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.167, 0.345]\nB: [0.618, 0.201]\nC: [0.805, 0.514]\nD: [0.027, 0.731]", "question": "What is the position coordinates of the point with coordinates ([0.609, 0.209]) in Image 1 within the Image 2? Note that the width of the input RGB image is 910 and the height is 480.", "context": "Select from the following choices.\nA: [0.167, 0.345]\nB: [0.618, 0.201]\nC: [0.805, 0.514]\nD: [0.027, 0.731]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_141_0.jpg", "./2D-spatial/point_tracking/point_tracking_141_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.373, 0.459]\nB: [0.416, 0.278]\nC: [0.662, 0.648]\nD: [0.304, 0.781]", "question": "What is the position coordinates of the point with coordinates ([0.443, 0.285]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.373, 0.459]\nB: [0.416, 0.278]\nC: [0.662, 0.648]\nD: [0.304, 0.781]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_142_0.jpg", "./2D-spatial/point_tracking/point_tracking_142_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.53, 0.42]\nB: [0.638, 0.766]\nC: [0.517, 0.984]\nD: [0.344, 0.268]", "question": "What is the position coordinates of the point with coordinates ([0.345, 0.268]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.53, 0.42]\nB: [0.638, 0.766]\nC: [0.517, 0.984]\nD: [0.344, 0.268]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_143_0.jpg", "./2D-spatial/point_tracking/point_tracking_143_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.603, 0.414]\nB: [0.61, 0.464]\nC: [0.292, 0.626]\nD: [0.062, 0.813]", "question": "What is the position coordinates of the point with coordinates ([0.602, 0.412]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.603, 0.414]\nB: [0.61, 0.464]\nC: [0.292, 0.626]\nD: [0.062, 0.813]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_144_0.jpg", "./2D-spatial/point_tracking/point_tracking_144_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.493, 0.798]\nB: [0.963, 0.818]\nC: [0.245, 0.105]\nD: [0.982, 0.515]", "question": "What is the position coordinates of the point with coordinates ([0.169, 0.075]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.493, 0.798]\nB: [0.963, 0.818]\nC: [0.245, 0.105]\nD: [0.982, 0.515]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_145_0.jpg", "./2D-spatial/point_tracking/point_tracking_145_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.84, 0.083]\nB: [0.114, 0.077]\nC: [0.273, 0.23]\nD: [0.485, 0.534]", "question": "What is the position coordinates of the point with coordinates ([0.443, 0.524]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.84, 0.083]\nB: [0.114, 0.077]\nC: [0.273, 0.23]\nD: [0.485, 0.534]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_146_0.jpg", "./2D-spatial/point_tracking/point_tracking_146_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.345, 0.262]\nB: [0.512, 0.224]\nC: [0.657, 0.276]\nD: [0.166, 0.841]", "question": "What is the position coordinates of the point with coordinates ([0.501, 0.22]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.345, 0.262]\nB: [0.512, 0.224]\nC: [0.657, 0.276]\nD: [0.166, 0.841]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_147_0.jpg", "./2D-spatial/point_tracking/point_tracking_147_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.357, 0.196]\nB: [0.42, 0.234]\nC: [0.718, 0.336]\nD: [0.573, 0.896]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.357, 0.196]\nB: [0.42, 0.234]\nC: [0.718, 0.336]\nD: [0.573, 0.896]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_148_0.jpg", "./2D-spatial/point_tracking/point_tracking_148_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.793, 0.03]\nB: [0.879, 0.871]\nC: [0.781, 0.418]\nD: [0.549, 0.338]", "question": "What is the position coordinates of the point with coordinates ([0.53, 0.332]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.793, 0.03]\nB: [0.879, 0.871]\nC: [0.781, 0.418]\nD: [0.549, 0.338]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_149_0.jpg", "./2D-spatial/point_tracking/point_tracking_149_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.342, 0.072]\nB: [0.574, 0.028]\nC: [0.795, 0.301]\nD: [0.752, 0.99]", "question": "What is the position coordinates of the point with coordinates ([0.342, 0.072]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.342, 0.072]\nB: [0.574, 0.028]\nC: [0.795, 0.301]\nD: [0.752, 0.99]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_150_0.jpg", "./2D-spatial/point_tracking/point_tracking_150_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.082, 0.932]\nB: [0.262, 0.046]\nC: [0.434, 0.576]\nD: [0.686, 0.437]", "question": "What is the position coordinates of the point with coordinates ([0.082, 0.932]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.082, 0.932]\nB: [0.262, 0.046]\nC: [0.434, 0.576]\nD: [0.686, 0.437]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_151_0.jpg", "./2D-spatial/point_tracking/point_tracking_151_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.053, 0.623]\nB: [0.624, 0.428]\nC: [0.518, 0.784]\nD: [0.141, 0.376]", "question": "What is the position coordinates of the point with coordinates ([0.624, 0.428]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.053, 0.623]\nB: [0.624, 0.428]\nC: [0.518, 0.784]\nD: [0.141, 0.376]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_152_0.jpg", "./2D-spatial/point_tracking/point_tracking_152_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.059, 0.533]\nB: [0.697, 0.415]\nC: [0.114, 0.313]\nD: [0.328, 0.618]", "question": "What is the position coordinates of the point with coordinates ([0.113, 0.313]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.059, 0.533]\nB: [0.697, 0.415]\nC: [0.114, 0.313]\nD: [0.328, 0.618]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_153_0.jpg", "./2D-spatial/point_tracking/point_tracking_153_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.488, 0.838]\nB: [0.287, 0.106]\nC: [0.472, 0.074]\nD: [0.079, 0.354]", "question": "What is the position coordinates of the point with coordinates ([0.572, -0.121]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.488, 0.838]\nB: [0.287, 0.106]\nC: [0.472, 0.074]\nD: [0.079, 0.354]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_154_0.jpg", "./2D-spatial/point_tracking/point_tracking_154_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.631, 0.352]\nB: [0.646, 0.557]\nC: [0.682, 0.502]\nD: [0.586, 0.751]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.631, 0.352]\nB: [0.646, 0.557]\nC: [0.682, 0.502]\nD: [0.586, 0.751]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_155_0.jpg", "./2D-spatial/point_tracking/point_tracking_155_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.125, 0.593]\nB: [0.518, 0.506]\nC: [0.515, 0.327]\nD: [0.285, 0.07]", "question": "What is the position coordinates of the point with coordinates ([0.588, 0.496]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.125, 0.593]\nB: [0.518, 0.506]\nC: [0.515, 0.327]\nD: [0.285, 0.07]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_156_0.jpg", "./2D-spatial/point_tracking/point_tracking_156_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.176, 0.766]\nB: [0.337, 0.765]\nC: [0.905, 0.67]\nD: [0.04, 0.456]", "question": "What is the position coordinates of the point with coordinates ([0.04, 0.456]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.176, 0.766]\nB: [0.337, 0.765]\nC: [0.905, 0.67]\nD: [0.04, 0.456]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_157_0.jpg", "./2D-spatial/point_tracking/point_tracking_157_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.232, 0.766]\nB: [0.161, 0.72]\nC: [0.323, 0.222]\nD: [0.795, 0.138]", "question": "What is the position coordinates of the point with coordinates ([0.361, 0.266]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.232, 0.766]\nB: [0.161, 0.72]\nC: [0.323, 0.222]\nD: [0.795, 0.138]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_158_0.jpg", "./2D-spatial/point_tracking/point_tracking_158_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.333, 0.389]\nB: [0.691, 0.301]\nC: [0.868, 0.47]\nD: [0.649, 0.094]", "question": "What is the position coordinates of the point with coordinates ([0.333, 0.389]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.333, 0.389]\nB: [0.691, 0.301]\nC: [0.868, 0.47]\nD: [0.649, 0.094]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_159_0.jpg", "./2D-spatial/point_tracking/point_tracking_159_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.412, 0.254]\nB: [0.803, 0.989]\nC: [0.898, 0.497]\nD: [0.43, 0.295]", "question": "What is the position coordinates of the point with coordinates ([0.392, 0.302]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.412, 0.254]\nB: [0.803, 0.989]\nC: [0.898, 0.497]\nD: [0.43, 0.295]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_160_0.jpg", "./2D-spatial/point_tracking/point_tracking_160_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.0, 0.0]\nB: [0.864, 0.427]\nC: [0.189, 0.222]\nD: [0.86, 0.108]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.864, 0.427]\nC: [0.189, 0.222]\nD: [0.86, 0.108]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_161_0.jpg", "./2D-spatial/point_tracking/point_tracking_161_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.961, 0.515]\nB: [0.312, 0.682]\nC: [0.209, 0.16]\nD: [0.943, 0.395]", "question": "What is the position coordinates of the point with coordinates ([0.181, 0.189]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.961, 0.515]\nB: [0.312, 0.682]\nC: [0.209, 0.16]\nD: [0.943, 0.395]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_162_0.jpg", "./2D-spatial/point_tracking/point_tracking_162_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.606, 0.797]\nB: [0.0, 0.0]\nC: [0.538, 0.287]\nD: [0.14, 0.104]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.606, 0.797]\nB: [0.0, 0.0]\nC: [0.538, 0.287]\nD: [0.14, 0.104]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_163_0.jpg", "./2D-spatial/point_tracking/point_tracking_163_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.289, 0.952]\nB: [0.872, 0.205]\nC: [0.0, 0.0]\nD: [0.633, 0.427]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.289, 0.952]\nB: [0.872, 0.205]\nC: [0.0, 0.0]\nD: [0.633, 0.427]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_164_0.jpg", "./2D-spatial/point_tracking/point_tracking_164_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.47, 0.37]\nB: [0.35, 0.4]\nC: [0.042, 0.785]\nD: [0.081, 0.262]", "question": "What is the position coordinates of the point with coordinates ([0.351, 0.401]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.47, 0.37]\nB: [0.35, 0.4]\nC: [0.042, 0.785]\nD: [0.081, 0.262]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_165_0.jpg", "./2D-spatial/point_tracking/point_tracking_165_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.161, 0.071]\nB: [0.948, 0.753]\nC: [0.387, 0.629]\nD: [0.408, 0.774]", "question": "What is the position coordinates of the point with coordinates ([0.718, 0.256]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.161, 0.071]\nB: [0.948, 0.753]\nC: [0.387, 0.629]\nD: [0.408, 0.774]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_166_0.jpg", "./2D-spatial/point_tracking/point_tracking_166_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.18, 0.986]\nB: [0.148, 0.12]\nC: [0.474, 0.356]\nD: [0.634, 0.061]", "question": "What is the position coordinates of the point with coordinates ([0.551, 0.394]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.18, 0.986]\nB: [0.148, 0.12]\nC: [0.474, 0.356]\nD: [0.634, 0.061]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_167_0.jpg", "./2D-spatial/point_tracking/point_tracking_167_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.642, 0.55]\nB: [0.894, 0.525]\nC: [0.887, 0.681]\nD: [0.583, 0.912]", "question": "What is the position coordinates of the point with coordinates ([0.724, 0.512]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.642, 0.55]\nB: [0.894, 0.525]\nC: [0.887, 0.681]\nD: [0.583, 0.912]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_168_0.jpg", "./2D-spatial/point_tracking/point_tracking_168_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.987, 0.403]\nB: [0.465, 0.446]\nC: [0.05, 0.858]\nD: [0.457, 0.194]", "question": "What is the position coordinates of the point with coordinates ([0.504, 0.202]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.987, 0.403]\nB: [0.465, 0.446]\nC: [0.05, 0.858]\nD: [0.457, 0.194]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_169_0.jpg", "./2D-spatial/point_tracking/point_tracking_169_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.852, 0.571]\nB: [0.771, 0.593]\nC: [0.19, 0.794]\nD: [0.512, 0.314]", "question": "What is the position coordinates of the point with coordinates ([0.513, 0.314]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.852, 0.571]\nB: [0.771, 0.593]\nC: [0.19, 0.794]\nD: [0.512, 0.314]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_170_0.jpg", "./2D-spatial/point_tracking/point_tracking_170_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.998, 0.808]\nB: [0.0, 0.0]\nC: [0.98, 0.396]\nD: [0.419, 0.553]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.998, 0.808]\nB: [0.0, 0.0]\nC: [0.98, 0.396]\nD: [0.419, 0.553]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_171_0.jpg", "./2D-spatial/point_tracking/point_tracking_171_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.52, 0.284]\nB: [0.475, 0.251]\nC: [0.321, 0.629]\nD: [0.432, 0.371]", "question": "What is the position coordinates of the point with coordinates ([0.427, 0.372]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.52, 0.284]\nB: [0.475, 0.251]\nC: [0.321, 0.629]\nD: [0.432, 0.371]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_172_0.jpg", "./2D-spatial/point_tracking/point_tracking_172_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.0, 0.0]\nB: [0.781, 0.578]\nC: [0.642, 0.382]\nD: [0.679, 0.324]", "question": "What is the position coordinates of the point with coordinates ([0.751, 0.277]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.781, 0.578]\nC: [0.642, 0.382]\nD: [0.679, 0.324]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_173_0.jpg", "./2D-spatial/point_tracking/point_tracking_173_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.525, 0.662]\nB: [0.774, 0.504]\nC: [0.263, 0.754]\nD: [0.896, 0.303]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.525, 0.662]\nB: [0.774, 0.504]\nC: [0.263, 0.754]\nD: [0.896, 0.303]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_174_0.jpg", "./2D-spatial/point_tracking/point_tracking_174_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.336, 0.241]\nB: [0.754, 0.592]\nC: [0.711, 0.154]\nD: [0.814, 0.269]", "question": "What is the position coordinates of the point with coordinates ([0.711, 0.154]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.336, 0.241]\nB: [0.754, 0.592]\nC: [0.711, 0.154]\nD: [0.814, 0.269]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_175_0.jpg", "./2D-spatial/point_tracking/point_tracking_175_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.357, 0.26]\nB: [0.145, 0.457]\nC: [0.26, 0.791]\nD: [0.896, 0.054]", "question": "What is the position coordinates of the point with coordinates ([0.357, 0.259]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.357, 0.26]\nB: [0.145, 0.457]\nC: [0.26, 0.791]\nD: [0.896, 0.054]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_176_0.jpg", "./2D-spatial/point_tracking/point_tracking_176_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.0, 0.0]\nB: [0.249, 0.178]\nC: [0.969, 0.236]\nD: [0.363, 0.049]", "question": "What is the position coordinates of the point with coordinates ([0.509, 0.617]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.0, 0.0]\nB: [0.249, 0.178]\nC: [0.969, 0.236]\nD: [0.363, 0.049]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_177_0.jpg", "./2D-spatial/point_tracking/point_tracking_177_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.396, 0.165]\nB: [0.966, 0.511]\nC: [0.101, 0.549]\nD: [0.871, 0.899]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 910 and the height is 480.", "context": "Select from the following choices.\nA: [0.396, 0.165]\nB: [0.966, 0.511]\nC: [0.101, 0.549]\nD: [0.871, 0.899]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_178_0.jpg", "./2D-spatial/point_tracking/point_tracking_178_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.335, 0.835]\nB: [0.526, 0.468]\nC: [0.441, 0.847]\nD: [0.584, 0.202]", "question": "What is the position coordinates of the point with coordinates ([0.491, 0.453]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.335, 0.835]\nB: [0.526, 0.468]\nC: [0.441, 0.847]\nD: [0.584, 0.202]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_179_0.jpg", "./2D-spatial/point_tracking/point_tracking_179_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.352, 0.43]\nB: [0.396, 0.842]\nC: [0.544, 0.168]\nD: [0.755, 0.432]", "question": "What is the position coordinates of the point with coordinates ([0.352, 0.43]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.352, 0.43]\nB: [0.396, 0.842]\nC: [0.544, 0.168]\nD: [0.755, 0.432]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_180_0.jpg", "./2D-spatial/point_tracking/point_tracking_180_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.738, 0.079]\nB: [0.295, 0.566]\nC: [0.04, 0.229]\nD: [0.771, 0.673]", "question": "What is the position coordinates of the point with coordinates ([0.292, 0.642]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.738, 0.079]\nB: [0.295, 0.566]\nC: [0.04, 0.229]\nD: [0.771, 0.673]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_181_0.jpg", "./2D-spatial/point_tracking/point_tracking_181_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.888, 0.387]\nB: [0.016, 0.294]\nC: [0.918, 0.591]\nD: [0.308, 0.501]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.888, 0.387]\nB: [0.016, 0.294]\nC: [0.918, 0.591]\nD: [0.308, 0.501]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_182_0.jpg", "./2D-spatial/point_tracking/point_tracking_182_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.383, 0.798]\nB: [0.668, 0.133]\nC: [0.133, 0.739]\nD: [0.192, 0.076]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.383, 0.798]\nB: [0.668, 0.133]\nC: [0.133, 0.739]\nD: [0.192, 0.076]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_183_0.jpg", "./2D-spatial/point_tracking/point_tracking_183_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.634, 0.284]\nB: [0.0, 0.0]\nC: [0.315, 0.604]\nD: [0.141, 0.357]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.634, 0.284]\nB: [0.0, 0.0]\nC: [0.315, 0.604]\nD: [0.141, 0.357]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_184_0.jpg", "./2D-spatial/point_tracking/point_tracking_184_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.691, 0.879]\nB: [0.362, 0.72]\nC: [0.157, 0.764]\nD: [0.272, 0.551]", "question": "What is the position coordinates of the point with coordinates ([0.272, 0.551]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.691, 0.879]\nB: [0.362, 0.72]\nC: [0.157, 0.764]\nD: [0.272, 0.551]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_185_0.jpg", "./2D-spatial/point_tracking/point_tracking_185_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.448, 0.266]\nB: [0.5, 0.567]\nC: [0.943, 0.037]\nD: [0.019, 0.535]", "question": "What is the position coordinates of the point with coordinates ([0.448, 0.266]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.448, 0.266]\nB: [0.5, 0.567]\nC: [0.943, 0.037]\nD: [0.019, 0.535]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_186_0.jpg", "./2D-spatial/point_tracking/point_tracking_186_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.664, 0.291]\nB: [0.629, 0.96]\nC: [0.638, 0.438]\nD: [0.072, 0.128]", "question": "What is the position coordinates of the point with coordinates ([0.59, 0.45]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.664, 0.291]\nB: [0.629, 0.96]\nC: [0.638, 0.438]\nD: [0.072, 0.128]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_187_0.jpg", "./2D-spatial/point_tracking/point_tracking_187_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.628, 0.379]\nB: [0.793, 0.079]\nC: [0.084, 0.828]\nD: [0.959, 0.595]", "question": "What is the position coordinates of the point with coordinates ([0.959, 0.595]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.628, 0.379]\nB: [0.793, 0.079]\nC: [0.084, 0.828]\nD: [0.959, 0.595]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_188_0.jpg", "./2D-spatial/point_tracking/point_tracking_188_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.086, 0.897]\nB: [0.891, 0.598]\nC: [0.731, 0.612]\nD: [0.338, -0.004]", "question": "What is the position coordinates of the point with coordinates ([0.417, 0.005]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.086, 0.897]\nB: [0.891, 0.598]\nC: [0.731, 0.612]\nD: [0.338, -0.004]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_189_0.jpg", "./2D-spatial/point_tracking/point_tracking_189_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.314, 0.635]\nB: [0.437, 0.344]\nC: [0.11, 0.731]\nD: [0.763, 0.089]", "question": "What is the position coordinates of the point with coordinates ([0.437, 0.344]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.314, 0.635]\nB: [0.437, 0.344]\nC: [0.11, 0.731]\nD: [0.763, 0.089]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_190_0.jpg", "./2D-spatial/point_tracking/point_tracking_190_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.303, 0.199]\nB: [0.353, 0.651]\nC: [0.302, 0.987]\nD: [0.305, 0.316]", "question": "What is the position coordinates of the point with coordinates ([0.0, 0.0]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.303, 0.199]\nB: [0.353, 0.651]\nC: [0.302, 0.987]\nD: [0.305, 0.316]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_191_0.jpg", "./2D-spatial/point_tracking/point_tracking_191_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.145, 0.87]\nB: [0.947, 0.301]\nC: [0.046, 0.995]\nD: [0.0, 0.0]", "question": "What is the position coordinates of the point with coordinates ([0.465, 0.564]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.145, 0.87]\nB: [0.947, 0.301]\nC: [0.046, 0.995]\nD: [0.0, 0.0]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_192_0.jpg", "./2D-spatial/point_tracking/point_tracking_192_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.622, 0.432]\nB: [0.421, 0.201]\nC: [0.707, 0.491]\nD: [0.55, 0.329]", "question": "What is the position coordinates of the point with coordinates ([0.513, 0.53]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.622, 0.432]\nB: [0.421, 0.201]\nC: [0.707, 0.491]\nD: [0.55, 0.329]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_193_0.jpg", "./2D-spatial/point_tracking/point_tracking_193_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.556, 0.744]\nB: [0.085, 0.886]\nC: [0.475, 0.451]\nD: [0.417, 0.52]", "question": "What is the position coordinates of the point with coordinates ([0.735, 0.381]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.556, 0.744]\nB: [0.085, 0.886]\nC: [0.475, 0.451]\nD: [0.417, 0.52]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_194_0.jpg", "./2D-spatial/point_tracking/point_tracking_194_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['natural_image']", "source": "tapvid_davis", "options": "A: [0.089, 0.936]\nB: [0.642, 0.328]\nC: [0.611, 0.959]\nD: [0.166, 0.377]", "question": "What is the position coordinates of the point with coordinates ([0.114, 0.302]) in Image 1 within the Image 2? Note that the width of the input RGB image is 854 and the height is 480.", "context": "Select from the following choices.\nA: [0.089, 0.936]\nB: [0.642, 0.328]\nC: [0.611, 0.959]\nD: [0.166, 0.377]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_195_0.jpg", "./2D-spatial/point_tracking/point_tracking_195_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.443, 0.616]\nB: [0.663, 0.356]\nC: [0.079, 0.21]\nD: [0.586, -0.124]", "question": "What is the position coordinates of the point with coordinates ([0.733, -0.02]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.443, 0.616]\nB: [0.663, 0.356]\nC: [0.079, 0.21]\nD: [0.586, -0.124]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_196_0.jpg", "./2D-spatial/point_tracking/point_tracking_196_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.385, 0.321]\nB: [0.931, 0.242]\nC: [0.011, 0.867]\nD: [0.917, 0.788]", "question": "What is the position coordinates of the point with coordinates ([0.385, 0.321]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.385, 0.321]\nB: [0.931, 0.242]\nC: [0.011, 0.867]\nD: [0.917, 0.788]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_197_0.jpg", "./2D-spatial/point_tracking/point_tracking_197_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.757, 0.024]\nB: [0.333, -0.045]\nC: [0.773, 0.154]\nD: [0.253, 0.821]", "question": "What is the position coordinates of the point with coordinates ([0.314, -0.005]) in Image 1 within the Image 2? 
Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.757, 0.024]\nB: [0.333, -0.045]\nC: [0.773, 0.154]\nD: [0.253, 0.821]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_198_0.jpg", "./2D-spatial/point_tracking/point_tracking_198_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "point_tracking", "visual_input_component": "['synthetic_image']", "source": "tapvid_rgb_stacking", "options": "A: [0.627, 0.508]\nB: [0.71, 0.649]\nC: [0.888, 0.125]\nD: [0.302, 0.307]", "question": "What is the position coordinates of the point with coordinates ([0.302, 0.306]) in Image 1 within the Image 2? Note that the width of the input RGB image is 256 and the height is 256.", "context": "Select from the following choices.\nA: [0.627, 0.508]\nB: [0.71, 0.649]\nC: [0.888, 0.125]\nD: [0.302, 0.307]", "input_image_path": ["./2D-spatial/point_tracking/point_tracking_199_0.jpg", "./2D-spatial/point_tracking/point_tracking_199_1.jpg"], "output": "D", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/ravens_progressive_matrices/qwen3-vl/metadata_info.json b/results/ravens_progressive_matrices/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..f1aac3f --- /dev/null +++ b/results/ravens_progressive_matrices/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: 
The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_0_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: 
The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_1_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images 
for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_2_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the 
images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_3_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem 
matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_4_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and 
analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_5_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 
16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_6_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 
13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_7_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 
10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_8_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", 
"source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_9_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": 
"ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_14.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_10_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_13.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_11_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_12.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_12_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_11.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_13_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_10.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_14_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_9.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_15_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_8.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_16_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_7.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_17_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_6.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_18_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_5.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_19_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_4.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_20_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_3.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_21_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_2.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_22_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_1.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_23_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_0.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_24_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": 
["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_25_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 
14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_26_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th 
image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_27_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the 
choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_28_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 
are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_29_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best 
completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_30_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", 
"question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_31_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 
13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_32_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th 
image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_33_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": 
"['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_34_15.png"], "output": "G", "qwen3-vl": "image 
none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_14.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_35_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_13.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_36_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_12.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_37_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_11.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_38_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_10.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_39_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_9.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_40_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_8.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_41_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_7.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_42_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_6.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_43_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_5.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_44_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_4.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_45_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_3.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_46_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_2.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_47_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_1.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_48_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_0.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_49_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": 
["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_50_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 
14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_51_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th 
image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_52_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the 
choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_53_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 
are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_54_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best 
completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_55_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", 
"question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_56_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 
13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_57_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th 
image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_58_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": 
"['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_59_15.png"], "output": "A", "qwen3-vl": "image 
none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_14.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_60_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_13.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_61_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_12.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_62_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_11.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_63_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_10.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_64_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_9.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_65_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_8.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_66_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_7.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_67_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_6.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_68_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_5.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_69_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_4.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_70_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_3.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_71_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_2.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_72_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_1.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_73_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_0.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_74_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": 
["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_75_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 
14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_76_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th 
image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_77_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the 
choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_78_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 
are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_79_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best 
completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_80_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", 
"question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_81_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 
13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_82_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th 
image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_83_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": 
"['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_84_15.png"], "output": "G", "qwen3-vl": "image 
none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_14.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_85_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_13.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_86_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_12.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_87_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_11.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_88_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_10.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_89_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_9.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_90_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_8.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_91_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_7.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_92_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_6.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_93_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_5.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_94_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_4.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_95_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_3.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_96_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_2.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_97_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_1.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_98_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_0.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_99_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": 
["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_100_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 
13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_101_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following 
choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_102_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from 
the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_103_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes 
the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_104_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", 
"question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_105_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 
12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_106_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": 
"RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_107_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": 
"ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_14.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_108_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_12.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_109_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_10.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_110_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_8.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_111_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_6.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_112_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_4.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_113_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_2.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_114_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_0.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_115_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", 
"input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_116_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 
12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_117_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from 
the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_118_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are 
the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_119_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image 
best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_120_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 
16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_121_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th 
image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_122_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", 
"source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_123_15.png"], "output": "C", "qwen3-vl": "image none"}, 
{"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_14.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_124_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_12.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_125_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_10.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_126_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_8.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_127_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_6.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_128_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_4.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_129_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_2.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_130_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_0.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_131_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", 
"input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_132_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 
12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_133_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from 
the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_134_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are 
the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_135_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image 
best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_136_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 
16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_137_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th 
image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_138_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", 
"source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_139_15.png"], "output": "C", "qwen3-vl": "image none"}, 
{"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_14.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_140_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_12.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_141_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_10.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_142_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_8.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_143_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_6.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_144_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_4.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_145_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_2.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_146_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_0.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_147_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", 
"input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_148_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 
12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_149_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from 
the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_150_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are 
the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_151_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image 
best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_152_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 
16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_153_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th 
image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_154_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", 
"source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_155_15.png"], "output": "D", "qwen3-vl": "image none"}, 
{"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_14.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_156_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_12.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_157_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_10.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_158_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_8.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_159_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_6.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_160_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_4.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_161_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_2.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_162_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_0.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_163_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", 
"input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_164_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 
12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_165_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from 
the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_166_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are 
the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_167_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image 
best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_168_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 
16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_169_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th 
image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_170_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", 
"source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_171_15.png"], "output": "C", "qwen3-vl": "image none"}, 
{"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_14.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_172_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_12.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_173_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_10.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_174_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_8.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_175_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_6.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_176_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_4.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_177_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_2.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_178_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_0.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_179_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", 
"input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_180_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 
12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_181_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from 
the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_182_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are 
the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_183_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image 
best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_184_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 
16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_185_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th 
image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_186_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", 
"source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_187_15.png"], "output": "F", "qwen3-vl": "image none"}, 
{"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_14.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_188_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_12.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_189_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_10.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_190_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_8.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_191_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_6.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_192_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_4.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_193_15.png"], "output": "H", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_2.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_194_15.png"], "output": "G", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_0.png", 
"./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_195_15.png"], "output": "F", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", 
"input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_196_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 
12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_197_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are the images from the question, and the last 8 are the images for the choices.Select from 
the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_198_15.png"], "output": "E", "qwen3-vl": "image none"}, {"task": "ravens_progressive_matrices", "visual_input_component": "['synthetic image']", "source": "RAVEN_10000", "options": "A: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "question": "Following the structural and analogical relations, which image best completes the problem matrix?", "context": "In the input images, the first 8 are 
the images from the question, and the last 8 are the images for the choices.Select from the following choices.\nA: The 9th image\nB: The 10th image\nC: The 11th image\nD: The 12th image\nE: The 13th image\nF: The 14th image\nG: The 15th image\nH: The 16th image", "input_image_path": ["./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_0.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_1.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_2.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_3.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_4.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_5.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_6.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_7.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_8.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_9.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_10.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_11.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_12.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_13.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_14.png", "./2D-spatial/ravens_progressive_matrices/ravens_progressive_matrices_199_15.png"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/semantic_correspondence_blink/qwen3-vl/metadata_info.json b/results/semantic_correspondence_blink/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..8e29b04 --- /dev/null +++ b/results/semantic_correspondence_blink/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": 
"semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_0_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_0_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. 
You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_1_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_1_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_2_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_2_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_3_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_3_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_4_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_4_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_5_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_5_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_6_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_6_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_7_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_7_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_8_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_8_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_9_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_9_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_10_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_10_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_11_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_11_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_12_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_12_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_13_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_13_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_14_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_14_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_15_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_15_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_16_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_16_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_17_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_17_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_18_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_18_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_19_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_19_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_20_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_20_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_21_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_21_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_22_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_22_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_23_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_23_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_24_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_24_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_25_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_25_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_26_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_26_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_27_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_27_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_28_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_28_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_29_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_29_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_30_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_30_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_31_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_31_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_32_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_32_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_33_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_33_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_34_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_34_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_35_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_35_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_36_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_36_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_37_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_37_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_38_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_38_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_39_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_39_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_40_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_40_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_41_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_41_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_42_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_42_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_43_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_43_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_44_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_44_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_45_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_45_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_46_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_46_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_47_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_47_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_48_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_48_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_49_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_49_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_50_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_50_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_51_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_51_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_52_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_52_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_53_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_53_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_54_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_54_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_55_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_55_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_56_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_56_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_57_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_57_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_58_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_58_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_59_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_59_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_60_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_60_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_61_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_61_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_62_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_62_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_63_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_63_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_64_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_64_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_65_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_65_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_66_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_66_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_67_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_67_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_68_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_68_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_69_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_69_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_70_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_70_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_71_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_71_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_72_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_72_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_73_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_73_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_74_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_74_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_75_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_75_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_76_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_76_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_77_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_77_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_78_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_78_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_79_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_79_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_80_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_80_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_81_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_81_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_82_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_82_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_83_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_83_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_84_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_84_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_85_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_85_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_86_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_86_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_87_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_87_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_88_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_88_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_89_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_89_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_90_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_90_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_91_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_91_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_92_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_92_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_93_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_93_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_94_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_94_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_95_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_95_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_96_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_96_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_97_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_97_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_98_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_98_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_99_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_99_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_100_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_100_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_101_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_101_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_102_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_102_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_103_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_103_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_104_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_104_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_105_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_105_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_106_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_106_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_107_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_107_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_108_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_108_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_109_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_109_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_110_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_110_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_111_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_111_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_112_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_112_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_113_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_113_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_114_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_114_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_115_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_115_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_116_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_116_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_117_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_117_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_118_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_118_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_119_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_119_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_120_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_120_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_121_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_121_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_122_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_122_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_123_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_123_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_124_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_124_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_125_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_125_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_126_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_126_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_127_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_127_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_128_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_128_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_129_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_129_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_130_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_130_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_131_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_131_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_132_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_132_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_133_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_133_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_134_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_134_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_135_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_135_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_136_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_136_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_137_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_137_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_138_0.jpg", "./High-level-obj-semantic/semantic_correspondence_blink/semantic_correspondence_blink_138_1.jpg"], "output": "B", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/semantic_correspondence_misc210k/qwen3-vl/metadata_info.json b/results/semantic_correspondence_misc210k/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..4090231 --- /dev/null +++ b/results/semantic_correspondence_misc210k/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_0_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_0_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_1_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_1_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_2_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_2_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_3_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_3_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_4_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_4_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_5_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_5_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_6_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_6_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_7_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_7_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_8_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_8_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_9_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_9_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_10_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_10_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_11_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_11_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_12_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_12_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_13_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_13_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_14_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_14_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_15_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_15_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_16_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_16_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_17_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_17_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_18_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_18_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_19_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_19_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_20_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_20_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_21_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_21_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_22_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_22_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_23_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_23_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_24_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_24_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_25_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_25_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_26_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_26_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_27_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_27_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_28_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_28_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_29_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_29_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_30_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_30_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_31_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_31_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_32_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_32_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_33_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_33_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_34_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_34_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_35_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_35_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_36_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_36_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_37_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_37_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_38_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_38_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_39_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_39_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_40_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_40_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_41_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_41_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_42_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_42_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_43_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_43_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_44_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_44_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_45_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_45_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_46_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_46_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_47_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_47_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_48_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_48_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_49_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_49_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_50_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_50_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_51_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_51_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_52_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_52_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_53_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_53_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_54_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_54_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_55_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_55_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_56_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_56_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_57_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_57_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_58_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_58_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_59_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_59_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_60_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_60_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_61_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_61_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_62_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_62_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_63_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_63_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_64_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_64_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_65_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_65_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_66_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_66_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_67_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_67_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_68_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_68_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_69_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_69_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_70_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_70_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_71_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_71_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_72_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_72_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_73_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_73_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_74_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_74_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_75_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_75_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_76_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_76_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_77_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_77_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_78_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_78_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_79_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_79_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_80_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_80_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_81_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_81_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_82_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_82_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_83_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_83_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_84_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_84_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_85_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_85_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_86_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_86_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_87_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_87_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_88_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_88_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_89_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_89_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_90_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_90_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_91_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_91_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_92_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_92_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_93_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_93_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_94_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_94_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_95_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_95_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_96_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_96_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_97_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_97_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_98_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_98_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_99_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_99_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_100_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_100_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_101_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_101_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_102_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_102_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_103_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_103_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_104_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_104_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_105_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_105_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_106_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_106_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_107_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_107_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_108_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_108_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_109_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_109_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_110_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_110_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_111_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_111_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_112_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_112_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_113_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_113_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_114_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_114_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_115_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_115_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_116_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_116_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_117_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_117_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_118_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_118_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_119_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_119_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_120_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_120_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_121_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_121_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_122_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_122_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_123_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_123_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_124_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_124_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_125_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_125_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_126_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_126_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_127_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_127_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_128_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_128_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_129_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_129_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_130_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_130_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_131_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_131_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_132_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_132_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_133_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_133_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_134_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_134_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_135_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_135_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_136_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_136_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_137_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_137_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_138_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_138_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_139_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_139_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_140_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_140_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_141_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_141_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_142_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_142_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_143_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_143_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_144_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_144_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_145_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_145_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_146_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_146_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_147_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_147_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_148_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_148_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_149_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_149_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_150_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_150_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_151_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_151_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_152_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_152_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_153_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_153_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_154_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_154_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_155_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_155_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_156_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_156_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_157_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_157_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_158_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_158_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_159_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_159_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_160_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_160_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_161_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_161_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_162_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_162_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_163_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_163_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_164_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_164_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_165_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_165_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_166_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_166_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_167_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_167_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_168_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_168_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_169_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_169_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_170_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_170_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_171_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_171_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_172_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_172_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_173_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_173_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_174_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_174_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_175_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_175_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_176_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_176_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_177_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_177_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_178_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_178_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_179_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_179_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_180_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_180_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_181_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_181_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_182_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_182_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_183_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_183_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_184_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_184_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_185_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_185_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_186_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_186_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_187_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_187_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_188_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_188_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_189_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_189_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_190_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_190_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_191_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_191_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_192_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_192_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_193_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_193_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_194_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_194_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_195_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_195_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_196_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_196_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_197_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_197_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_198_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_198_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "semantic_correspondence_misc210k", "visual_input_component": "2 natural images", "source": "misc210k", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "Humans can find corresponding points for different objects in the same category. For instance, if there are images of two different cats, then the left ear tip of one cat corresponds to the left ear tip of the other cat, and the right front paw of one cat corresponds to the right front paw of the other cat.\nGiven the following two images, a reference point is annotated on the first image, labeled with REF. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Select between the choices on the second image and find the corresponding point for the reference point. 
Which point is corresponding to the reference point?\nSelect from the following choices.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_199_0.jpg", "./High-level-obj-semantic/semantic_correspondence_misc210k/semantic_correspondence_misc210k_199_1.jpg"], "output": "C", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/single_object_tracking/qwen3-vl/metadata_info.json b/results/single_object_tracking/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..a52445a --- /dev/null +++ b/results/single_object_tracking/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.105, 0.0, 0.539, 1.0]\nB: [0.231, 0.444, 0.698, 0.771]\nC: [0.204, 0.496, 0.49, 0.761]\nD: [0.105, 0.0, 0.624, 0.922]", "question": "Here is an object ([0.166, 0.0, 0.589, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.105, 0.0, 0.539, 1.0]\nB: [0.231, 0.444, 0.698, 0.771]\nC: [0.204, 0.496, 0.49, 0.761]\nD: [0.105, 0.0, 0.624, 0.922]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_0_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_0_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.37, 0.132, 0.788, 0.61]\nB: [0.457, 0.328, 0.655, 0.681]\nC: [0.457, 0.328, 0.673, 0.635]\nD: [0.457, 0.328, 0.656, 0.582]", "question": "Here is an object ([0.326, 0.224, 0.691, 0.644]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.37, 0.132, 0.788, 0.61]\nB: [0.457, 0.328, 0.655, 0.681]\nC: [0.457, 0.328, 0.673, 0.635]\nD: [0.457, 0.328, 0.656, 0.582]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_1_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_1_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.546, 0.242, 0.943, 1.0]\nB: [0.173, 0.0, 0.57, 0.758]\nC: [0.516, 0.2, 0.912, 0.958]\nD: [0.367, 0.242, 0.764, 1.0]", "question": "Here is an object ([0.358, 0.26, 0.744, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.546, 0.242, 0.943, 1.0]\nB: [0.173, 0.0, 0.57, 0.758]\nC: [0.516, 0.2, 0.912, 0.958]\nD: [0.367, 0.242, 0.764, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_2_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_2_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.302, 0.299, 0.571, 1.0]\nB: [0.802, 0.301, 0.919, 0.514]\nC: [0.302, 0.299, 0.607, 0.882]\nD: [0.255, 0.124, 0.525, 0.825]", "question": "Here is an object ([0.649, 0.335, 0.85, 0.992]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.302, 0.299, 0.571, 1.0]\nB: [0.802, 0.301, 0.919, 0.514]\nC: [0.302, 0.299, 0.607, 0.882]\nD: [0.255, 0.124, 0.525, 0.825]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_3_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_3_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.901, 0.0, 1.0, 0.304]\nB: [0.394, 0.317, 0.758, 0.617]\nC: [0.389, 0.294, 0.495, 0.551]\nD: [0.901, 0.0, 0.994, 0.3]", "question": "Here is an object ([0.832, 0.0, 0.977, 0.472]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.901, 0.0, 1.0, 0.304]\nB: [0.394, 0.317, 0.758, 0.617]\nC: [0.389, 0.294, 0.495, 0.551]\nD: [0.901, 0.0, 0.994, 0.3]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_4_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_4_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.987, 0.792, 0.998, 0.876]\nB: [0.987, 0.792, 0.998, 0.871]\nC: [0.987, 0.792, 1.0, 0.892]\nD: [0.987, 0.792, 1.002, 0.901]", "question": "Here is an object ([0.952, 0.703, 1.0, 0.904]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.987, 0.792, 0.998, 0.876]\nB: [0.987, 0.792, 0.998, 0.871]\nC: [0.987, 0.792, 1.0, 0.892]\nD: [0.987, 0.792, 1.002, 0.901]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_5_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_5_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.287, 0.453, 0.617, 0.774]\nB: [0.384, 0.432, 0.713, 0.753]\nC: [0.287, 0.453, 0.623, 0.828]\nD: [0.26, 0.356, 0.59, 0.676]", "question": "Here is an object ([0.284, 0.369, 0.636, 0.674]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.287, 0.453, 0.617, 0.774]\nB: [0.384, 0.432, 0.713, 0.753]\nC: [0.287, 0.453, 0.623, 0.828]\nD: [0.26, 0.356, 0.59, 0.676]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_6_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_6_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.282, 0.0, 1.0, 0.736]\nB: [0.282, 0.0, 1.047, 0.79]\nC: [0.248, 0.156, 0.966, 0.892]\nD: [0.186, 0.067, 0.904, 0.803]", "question": "Here is an object ([0.312, 0.0, 1.0, 0.736]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.282, 0.0, 1.0, 0.736]\nB: [0.282, 0.0, 1.047, 0.79]\nC: [0.248, 0.156, 0.966, 0.892]\nD: [0.186, 0.067, 0.904, 0.803]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_7_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_7_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.863, 0.157, 0.884, 0.624]\nB: [0.159, 0.19, 0.68, 1.0]\nC: [0.159, 0.19, 0.737, 1.156]\nD: [0.159, 0.19, 0.72, 1.014]", "question": "Here is an object ([0.174, 0.19, 0.691, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.863, 0.157, 0.884, 0.624]\nB: [0.159, 0.19, 0.68, 1.0]\nC: [0.159, 0.19, 0.737, 1.156]\nD: [0.159, 0.19, 0.72, 1.014]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_8_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_8_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.309, 0.435, 0.693, 1.0]\nB: [0.786, 0.633, 0.927, 0.776]\nC: [0.263, 0.193, 0.647, 0.758]\nD: [0.263, 0.193, 0.617, 0.678]", "question": "Here is an object ([0.227, 0.218, 0.607, 0.787]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.309, 0.435, 0.693, 1.0]\nB: [0.786, 0.633, 0.927, 0.776]\nC: [0.263, 0.193, 0.647, 0.758]\nD: [0.263, 0.193, 0.617, 0.678]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_9_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_9_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.013, 0.201, 0.494, 0.7]\nB: [0.177, 0.003, 0.868, 1.0]\nC: [0.129, 0.61, 0.238, 0.793]\nD: [0.243, 0.0, 0.934, 0.997]", "question": "Here is an object ([0.125, 0.0, 0.804, 0.988]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.013, 0.201, 0.494, 0.7]\nB: [0.177, 0.003, 0.868, 1.0]\nC: [0.129, 0.61, 0.238, 0.793]\nD: [0.243, 0.0, 0.934, 0.997]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_10_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_10_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.095, 0.672, 0.513, 0.838]\nB: [0.0, 0.257, 0.27, 1.056]\nC: [0.096, 0.265, 0.371, 1.0]\nD: [0.0, 0.257, 0.275, 0.992]", "question": "Here is an object ([0.0, 0.261, 0.28, 0.997]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.095, 0.672, 0.513, 0.838]\nB: [0.0, 0.257, 0.27, 1.056]\nC: [0.096, 0.265, 0.371, 1.0]\nD: [0.0, 0.257, 0.275, 0.992]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_11_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_11_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.173, 0.05, 0.833, 1.1]\nB: [0.173, 0.05, 0.754, 1.04]\nC: [0.173, 0.05, 0.789, 1.0]\nD: [0.173, 0.05, 0.712, 0.936]", "question": "Here is an object ([0.223, 0.032, 0.773, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.173, 0.05, 0.833, 1.1]\nB: [0.173, 0.05, 0.754, 1.04]\nC: [0.173, 0.05, 0.789, 1.0]\nD: [0.173, 0.05, 0.712, 0.936]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_12_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_12_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.0, 0.042, 0.523, 1.0]\nB: [0.0, 0.042, 0.434, 0.851]\nC: [0.205, 0.0, 0.728, 0.958]\nD: [0.259, 0.042, 0.782, 1.0]", "question": "Here is an object ([0.116, 0.024, 0.778, 0.988]) in the Image 1. 
Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.042, 0.523, 1.0]\nB: [0.0, 0.042, 0.434, 0.851]\nC: [0.205, 0.0, 0.728, 0.958]\nD: [0.259, 0.042, 0.782, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_13_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_13_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.177, 0.135, 0.627, 0.492]\nB: [0.442, 0.412, 0.921, 0.872]\nC: [0.249, 0.269, 0.701, 0.599]\nD: [0.241, 0.326, 0.693, 0.656]", "question": "Here is an object ([0.232, 0.326, 0.684, 0.656]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.177, 0.135, 0.627, 0.492]\nB: [0.442, 0.412, 0.921, 0.872]\nC: [0.249, 0.269, 0.701, 0.599]\nD: [0.241, 0.326, 0.693, 0.656]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_14_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_14_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.124, 0.142, 0.57, 0.851]\nB: [0.124, 0.142, 0.712, 1.079]\nC: [0.124, 0.142, 0.643, 0.931]\nD: [0.445, 0.485, 0.785, 0.751]", "question": "Here is an object ([0.123, 0.161, 0.635, 0.931]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.124, 0.142, 0.57, 0.851]\nB: [0.124, 0.142, 0.712, 1.079]\nC: [0.124, 0.142, 0.643, 0.931]\nD: [0.445, 0.485, 0.785, 0.751]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_15_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_15_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.337, 0.263, 0.374, 0.406]\nB: [0.325, 0.269, 0.362, 0.412]\nC: [0.325, 0.269, 0.362, 0.392]\nD: [0.265, 0.403, 0.642, 0.54]", "question": "Here is an object ([0.311, 0.271, 0.363, 0.412]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.337, 0.263, 0.374, 0.406]\nB: [0.325, 0.269, 0.362, 0.412]\nC: [0.325, 0.269, 0.362, 0.392]\nD: [0.265, 0.403, 0.642, 0.54]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_16_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_16_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.07, 0.04, 0.201, 0.126]\nB: [0.326, 0.0, 0.671, 0.593]\nC: [0.326, 0.0, 0.797, 0.729]\nD: [0.326, 0.0, 0.73, 0.738]", "question": "Here is an object ([0.37, 0.0, 0.701, 0.883]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.07, 0.04, 0.201, 0.126]\nB: [0.326, 0.0, 0.671, 0.593]\nC: [0.326, 0.0, 0.797, 0.729]\nD: [0.326, 0.0, 0.73, 0.738]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_17_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_17_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.467, 0.647, 0.685, 0.889]\nB: [0.494, 0.543, 0.712, 0.785]\nC: [0.649, 0.751, 0.892, 0.776]\nD: [0.494, 0.543, 0.677, 0.764]", "question": "Here is an object ([0.523, 0.457, 0.773, 0.708]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.467, 0.647, 0.685, 0.889]\nB: [0.494, 0.543, 0.712, 0.785]\nC: [0.649, 0.751, 0.892, 0.776]\nD: [0.494, 0.543, 0.677, 0.764]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_18_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_18_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.0, 0.0, 0.879, 0.878]\nB: [0.0, 0.0, 0.892, 0.821]\nC: [0.0, 0.0, 0.992, 0.739]\nD: [0.0, 0.0, 0.894, 1.05]", "question": "Here is an object ([0.0, 0.0, 0.883, 0.86]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.0, 0.879, 0.878]\nB: [0.0, 0.0, 0.892, 0.821]\nC: [0.0, 0.0, 0.992, 0.739]\nD: [0.0, 0.0, 0.894, 1.05]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_19_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_19_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.005, 0.846, 0.023, 0.993]\nB: [0.005, 0.846, 0.023, 0.994]\nC: [0.005, 0.846, 0.02, 0.965]\nD: [0.311, 0.061, 0.434, 0.287]", "question": "Here is an object ([0.0, 0.8, 0.043, 0.996]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.005, 0.846, 0.023, 0.993]\nB: [0.005, 0.846, 0.023, 0.994]\nC: [0.005, 0.846, 0.02, 0.965]\nD: [0.311, 0.061, 0.434, 0.287]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_20_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_20_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.323, 0.332, 0.442, 0.565]\nB: [0.323, 0.332, 0.47, 0.558]\nC: [0.323, 0.332, 0.495, 0.537]\nD: [0.323, 0.332, 0.477, 0.526]", "question": "Here is an object ([0.0, 0.05, 0.374, 1.0]) in the Image 1. 
Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.323, 0.332, 0.442, 0.565]\nB: [0.323, 0.332, 0.47, 0.558]\nC: [0.323, 0.332, 0.495, 0.537]\nD: [0.323, 0.332, 0.477, 0.526]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_21_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_21_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.292, 0.053, 0.432, 0.251]\nB: [0.27, 0.118, 0.41, 0.317]\nC: [0.27, 0.049, 0.409, 0.247]\nD: [0.689, 0.281, 0.863, 0.306]", "question": "Here is an object ([0.245, 0.086, 0.4, 0.275]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.292, 0.053, 0.432, 0.251]\nB: [0.27, 0.118, 0.41, 0.317]\nC: [0.27, 0.049, 0.409, 0.247]\nD: [0.689, 0.281, 0.863, 0.306]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_22_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_22_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.322, 0.804, 0.577, 0.947]\nB: [0.353, 0.024, 0.68, 0.975]\nC: [0.353, 0.024, 0.669, 0.874]\nD: [0.733, 0.014, 0.774, 0.058]", "question": "Here is an object ([0.0, 0.0, 0.524, 0.828]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.322, 0.804, 0.577, 0.947]\nB: [0.353, 0.024, 0.68, 0.975]\nC: [0.353, 0.024, 0.669, 0.874]\nD: [0.733, 0.014, 0.774, 0.058]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_23_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_23_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.184, 0.0, 0.909, 0.993]\nB: [0.184, 0.0, 1.027, 0.949]\nC: [0.199, 0.0, 0.924, 0.993]\nD: [0.184, 0.0, 1.048, 0.854]", "question": "Here is an object ([0.086, 0.0, 0.87, 0.919]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.184, 0.0, 0.909, 0.993]\nB: [0.184, 0.0, 1.027, 0.949]\nC: [0.199, 0.0, 0.924, 0.993]\nD: [0.184, 0.0, 1.048, 0.854]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_24_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_24_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.047, 0.869, 0.259, 0.904]\nB: [0.443, 0.679, 0.677, 1.011]\nC: [0.443, 0.679, 0.708, 0.976]\nD: [0.566, 0.703, 0.831, 1.0]", "question": "Here is an object ([0.441, 0.71, 0.689, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.047, 0.869, 0.259, 0.904]\nB: [0.443, 0.679, 0.677, 1.011]\nC: [0.443, 0.679, 0.708, 0.976]\nD: [0.566, 0.703, 0.831, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_25_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_25_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.0, 0.001, 0.45, 1.0]\nB: [0.173, 0.001, 0.623, 1.0]\nC: [0.0, 0.001, 0.441, 1.01]\nD: [0.0, 0.001, 0.377, 1.032]", "question": "Here is an object ([0.0, 0.0, 0.402, 0.996]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.001, 0.45, 1.0]\nB: [0.173, 0.001, 0.623, 1.0]\nC: [0.0, 0.001, 0.441, 1.01]\nD: [0.0, 0.001, 0.377, 1.032]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_26_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_26_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.285, 0.181, 0.881, 0.531]\nB: [0.325, 0.09, 0.738, 0.329]\nC: [0.6, 0.492, 0.78, 0.658]\nD: [0.285, 0.181, 0.992, 0.608]", "question": "Here is an object ([0.277, 0.196, 0.994, 0.61]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.285, 0.181, 0.881, 0.531]\nB: [0.325, 0.09, 0.738, 0.329]\nC: [0.6, 0.492, 0.78, 0.658]\nD: [0.285, 0.181, 0.992, 0.608]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_27_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_27_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.596, 0.311, 0.855, 0.971]\nB: [0.656, 0.222, 0.916, 0.882]\nC: [0.181, 0.185, 0.651, 0.349]\nD: [0.57, 0.24, 0.83, 0.9]", "question": "Here is an object ([0.67, 0.219, 0.91, 0.886]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.596, 0.311, 0.855, 0.971]\nB: [0.656, 0.222, 0.916, 0.882]\nC: [0.181, 0.185, 0.651, 0.349]\nD: [0.57, 0.24, 0.83, 0.9]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_28_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_28_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.33, 0.006, 0.963, 0.889]\nB: [0.427, 0.299, 0.457, 0.435]\nC: [0.323, 0.642, 0.555, 0.656]\nD: [0.33, 0.006, 0.966, 1.0]", "question": "Here is an object ([0.304, 0.001, 0.951, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.33, 0.006, 0.963, 0.889]\nB: [0.427, 0.299, 0.457, 0.435]\nC: [0.323, 0.642, 0.555, 0.656]\nD: [0.33, 0.006, 0.966, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_29_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_29_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.298, 0.661, 0.788, 0.753]\nB: [0.0, 0.0, 0.955, 0.996]\nC: [0.502, 0.29, 0.769, 0.553]\nD: [0.0, 0.004, 0.955, 1.0]", "question": "Here is an object ([0.0, 0.0, 0.893, 0.999]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.298, 0.661, 0.788, 0.753]\nB: [0.0, 0.0, 0.955, 0.996]\nC: [0.502, 0.29, 0.769, 0.553]\nD: [0.0, 0.004, 0.955, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_30_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_30_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.378, 0.415, 0.458, 0.821]\nB: [0.378, 0.415, 0.467, 0.843]\nC: [0.378, 0.415, 0.462, 0.858]\nD: [0.378, 0.415, 0.473, 0.764]", "question": "Here is an object ([0.366, 0.428, 0.459, 0.826]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.378, 0.415, 0.458, 0.821]\nB: [0.378, 0.415, 0.467, 0.843]\nC: [0.378, 0.415, 0.462, 0.858]\nD: [0.378, 0.415, 0.473, 0.764]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_31_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_31_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.266, 0.356, 0.714, 0.89]\nB: [0.255, 0.425, 0.769, 0.932]\nC: [0.266, 0.356, 0.78, 0.863]\nD: [0.33, 0.275, 0.54, 0.724]", "question": "Here is an object ([0.268, 0.399, 0.774, 0.89]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.266, 0.356, 0.714, 0.89]\nB: [0.255, 0.425, 0.769, 0.932]\nC: [0.266, 0.356, 0.78, 0.863]\nD: [0.33, 0.275, 0.54, 0.724]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_32_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_32_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.878, 0.772, 0.995, 0.833]\nB: [0.134, 0.675, 0.455, 0.933]\nC: [0.397, 0.556, 0.869, 0.714]\nD: [0.134, 0.675, 0.518, 0.892]", "question": "Here is an object ([0.108, 0.626, 0.434, 0.892]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.878, 0.772, 0.995, 0.833]\nB: [0.134, 0.675, 0.455, 0.933]\nC: [0.397, 0.556, 0.869, 0.714]\nD: [0.134, 0.675, 0.518, 0.892]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_33_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_33_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.564, 0.332, 0.63, 0.44]\nB: [0.545, 0.307, 0.611, 0.415]\nC: [0.528, 0.263, 0.595, 0.371]\nD: [0.547, 0.319, 0.613, 0.428]", "question": "Here is an object ([0.593, 0.332, 0.659, 0.447]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.564, 0.332, 0.63, 0.44]\nB: [0.545, 0.307, 0.611, 0.415]\nC: [0.528, 0.263, 0.595, 0.371]\nD: [0.547, 0.319, 0.613, 0.428]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_34_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_34_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.217, 0.24, 0.541, 0.761]\nB: [0.119, 0.435, 0.442, 0.956]\nC: [0.217, 0.24, 0.478, 0.786]\nD: [0.138, 0.474, 0.461, 0.994]", "question": "Here is an object ([0.23, 0.247, 0.55, 0.715]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.217, 0.24, 0.541, 0.761]\nB: [0.119, 0.435, 0.442, 0.956]\nC: [0.217, 0.24, 0.478, 0.786]\nD: [0.138, 0.474, 0.461, 0.994]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_35_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_35_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.003, 0.11, 0.245, 0.188]\nB: [0.0, 0.0, 0.334, 0.435]\nC: [0.0, 0.0, 0.31, 0.529]\nD: [0.0, 0.0, 0.304, 0.487]", "question": "Here is an object ([0.0, 0.0, 0.263, 0.576]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.003, 0.11, 0.245, 0.188]\nB: [0.0, 0.0, 0.334, 0.435]\nC: [0.0, 0.0, 0.31, 0.529]\nD: [0.0, 0.0, 0.304, 0.487]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_36_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_36_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.0, 0.317, 1.094, 0.986]\nB: [0.428, 0.043, 0.832, 0.14]\nC: [0.0, 0.368, 1.0, 0.989]\nD: [0.0, 0.317, 1.0, 0.938]", "question": "Here is an object ([0.609, 0.0, 0.853, 0.433]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.317, 1.094, 0.986]\nB: [0.428, 0.043, 0.832, 0.14]\nC: [0.0, 0.368, 1.0, 0.989]\nD: [0.0, 0.317, 1.0, 0.938]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_37_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_37_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.477, 0.165, 0.613, 0.461]\nB: [0.477, 0.083, 0.614, 0.379]\nC: [0.486, 0.0, 0.623, 0.296]\nD: [0.18, 0.044, 0.549, 0.249]", "question": "Here is an object ([0.469, 0.09, 0.59, 0.317]) in the Image 1. 
Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.477, 0.165, 0.613, 0.461]\nB: [0.477, 0.083, 0.614, 0.379]\nC: [0.486, 0.0, 0.623, 0.296]\nD: [0.18, 0.044, 0.549, 0.249]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_38_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_38_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.204, 0.226, 0.796, 0.667]\nB: [0.85, 0.326, 0.93, 0.539]\nC: [0.317, 0.108, 0.652, 0.579]\nD: [0.204, 0.226, 0.846, 0.713]", "question": "Here is an object ([0.187, 0.107, 0.821, 0.719]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.204, 0.226, 0.796, 0.667]\nB: [0.85, 0.326, 0.93, 0.539]\nC: [0.317, 0.108, 0.652, 0.579]\nD: [0.204, 0.226, 0.846, 0.713]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_39_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_39_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.222, 0.174, 0.717, 1.0]\nB: [0.369, 0.033, 0.864, 0.86]\nC: [0.403, 0.0, 0.898, 0.826]\nD: [0.105, 0.729, 0.198, 0.839]", "question": "Here is an object ([0.263, 0.168, 0.714, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.222, 0.174, 0.717, 1.0]\nB: [0.369, 0.033, 0.864, 0.86]\nC: [0.403, 0.0, 0.898, 0.826]\nD: [0.105, 0.729, 0.198, 0.839]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_40_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_40_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.101, 0.694, 0.357, 0.807]\nB: [0.592, 0.454, 0.698, 0.651]\nC: [0.592, 0.454, 0.694, 0.631]\nD: [0.34, 0.282, 0.835, 0.693]", "question": "Here is an object ([0.541, 0.482, 0.603, 0.624]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.101, 0.694, 0.357, 0.807]\nB: [0.592, 0.454, 0.698, 0.651]\nC: [0.592, 0.454, 0.694, 0.631]\nD: [0.34, 0.282, 0.835, 0.693]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_41_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_41_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.398, 0.606, 0.528, 0.767]\nB: [0.398, 0.606, 0.509, 0.774]\nC: [0.398, 0.606, 0.507, 0.794]\nD: [0.384, 0.192, 0.498, 0.551]", "question": "Here is an object ([0.359, 0.608, 0.466, 0.8]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.398, 0.606, 0.528, 0.767]\nB: [0.398, 0.606, 0.509, 0.774]\nC: [0.398, 0.606, 0.507, 0.794]\nD: [0.384, 0.192, 0.498, 0.551]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_42_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_42_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.216, 0.225, 0.783, 0.904]\nB: [0.216, 0.225, 0.738, 1.065]\nC: [0.216, 0.225, 0.701, 1.0]\nD: [0.216, 0.225, 0.73, 1.015]", "question": "Here is an object ([0.226, 0.208, 0.703, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.216, 0.225, 0.783, 0.904]\nB: [0.216, 0.225, 0.738, 1.065]\nC: [0.216, 0.225, 0.701, 1.0]\nD: [0.216, 0.225, 0.73, 1.015]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_43_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_43_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.425, 0.474, 0.487, 0.668]\nB: [0.425, 0.474, 0.493, 0.706]\nC: [0.425, 0.474, 0.496, 0.647]\nD: [0.439, 0.428, 0.502, 0.622]", "question": "Here is an object ([0.417, 0.481, 0.48, 0.7]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.425, 0.474, 0.487, 0.668]\nB: [0.425, 0.474, 0.493, 0.706]\nC: [0.425, 0.474, 0.496, 0.647]\nD: [0.439, 0.428, 0.502, 0.622]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_44_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_44_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.059, 0.0, 0.673, 0.483]\nB: [0.0, 0.0, 0.576, 0.44]\nC: [0.123, 0.692, 0.539, 0.775]\nD: [0.059, 0.0, 0.634, 0.44]", "question": "Here is an object ([0.203, 0.0, 0.616, 0.404]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.059, 0.0, 0.673, 0.483]\nB: [0.0, 0.0, 0.576, 0.44]\nC: [0.123, 0.692, 0.539, 0.775]\nD: [0.059, 0.0, 0.634, 0.44]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_45_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_45_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.541, 0.742, 0.866, 0.985]\nB: [0.602, 0.447, 0.882, 0.914]\nC: [0.471, 0.222, 0.751, 0.689]\nD: [0.471, 0.222, 0.738, 0.765]", "question": "Here is an object ([0.479, 0.236, 0.73, 0.683]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.541, 0.742, 0.866, 0.985]\nB: [0.602, 0.447, 0.882, 0.914]\nC: [0.471, 0.222, 0.751, 0.689]\nD: [0.471, 0.222, 0.738, 0.765]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_46_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_46_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.201, 0.0, 0.727, 0.497]\nB: [0.201, 0.0, 0.724, 0.519]\nC: [0.201, 0.0, 0.67, 0.574]\nD: [0.201, 0.0, 0.666, 0.542]", "question": "Here is an object ([0.152, 0.0, 0.666, 0.521]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.201, 0.0, 0.727, 0.497]\nB: [0.201, 0.0, 0.724, 0.519]\nC: [0.201, 0.0, 0.67, 0.574]\nD: [0.201, 0.0, 0.666, 0.542]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_47_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_47_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.03, 0.099, 0.872, 1.0]\nB: [0.093, 0.028, 0.167, 0.235]\nC: [0.158, 0.0, 1.0, 0.901]\nD: [0.158, 0.099, 1.0, 1.0]", "question": "Here is an object ([0.044, 0.067, 0.886, 0.978]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.03, 0.099, 0.872, 1.0]\nB: [0.093, 0.028, 0.167, 0.235]\nC: [0.158, 0.0, 1.0, 0.901]\nD: [0.158, 0.099, 1.0, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_48_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_48_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.285, 0.124, 0.72, 0.76]\nB: [0.463, 0.321, 0.897, 0.957]\nC: [0.372, 0.103, 0.648, 0.493]\nD: [0.285, 0.124, 0.778, 0.754]", "question": "Here is an object ([0.282, 0.122, 0.711, 0.85]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 960 and the height is 720.", "context": "Select from the following choices.\nA: [0.285, 0.124, 0.72, 0.76]\nB: [0.463, 0.321, 0.897, 0.957]\nC: [0.372, 0.103, 0.648, 0.493]\nD: [0.285, 0.124, 0.778, 0.754]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_49_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_49_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.259, 0.365, 0.986, 0.932]\nB: [0.01, 0.172, 0.737, 0.739]\nC: [0.273, 0.2, 1.0, 0.767]\nD: [0.091, 0.126, 0.818, 0.693]", "question": "Here is an object ([0.291, 0.342, 0.989, 0.933]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.259, 0.365, 0.986, 0.932]\nB: [0.01, 0.172, 0.737, 0.739]\nC: [0.273, 0.2, 1.0, 0.767]\nD: [0.091, 0.126, 0.818, 0.693]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_50_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_50_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.152, 0.16, 0.498, 0.782]\nB: [0.28, 0.342, 0.627, 0.964]\nC: [0.582, 0.299, 0.674, 0.576]\nD: [0.314, 0.397, 0.733, 0.424]", "question": "Here is an object ([0.072, 0.21, 0.463, 0.842]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.152, 0.16, 0.498, 0.782]\nB: [0.28, 0.342, 0.627, 0.964]\nC: [0.582, 0.299, 0.674, 0.576]\nD: [0.314, 0.397, 0.733, 0.424]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_51_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_51_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.269, 0.0, 1.13, 0.608]\nB: [0.163, 0.639, 0.546, 0.861]\nC: [0.069, 0.075, 0.8, 0.803]\nD: [0.269, 0.0, 1.0, 0.728]", "question": "Here is an object ([0.222, 0.0, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.269, 0.0, 1.13, 0.608]\nB: [0.163, 0.639, 0.546, 0.861]\nC: [0.069, 0.075, 0.8, 0.803]\nD: [0.269, 0.0, 1.0, 0.728]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_52_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_52_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.62, 0.253, 0.788, 0.403]\nB: [0.0, 0.097, 0.658, 0.765]\nC: [0.0, 0.0, 0.658, 0.668]\nD: [0.0, 0.097, 0.552, 0.667]", "question": "Here is an object ([0.0, 0.143, 0.421, 0.783]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.62, 0.253, 0.788, 0.403]\nB: [0.0, 0.097, 0.658, 0.765]\nC: [0.0, 0.0, 0.658, 0.668]\nD: [0.0, 0.097, 0.552, 0.667]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_53_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_53_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.582, 0.371, 1.0, 1.0]\nB: [0.544, 0.328, 0.894, 0.85]\nC: [0.205, 0.11, 0.595, 0.439]\nD: [0.544, 0.328, 0.962, 0.957]", "question": "Here is an object ([0.555, 0.332, 0.999, 0.976]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.582, 0.371, 1.0, 1.0]\nB: [0.544, 0.328, 0.894, 0.85]\nC: [0.205, 0.11, 0.595, 0.439]\nD: [0.544, 0.328, 0.962, 0.957]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_54_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_54_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.391, 0.686, 0.805, 0.978]\nB: [0.07, 0.022, 0.341, 0.492]\nC: [0.041, 0.082, 0.181, 0.229]\nD: [0.289, 0.708, 0.703, 1.0]", "question": "Here is an object ([0.306, 0.303, 0.735, 0.643]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.391, 0.686, 0.805, 0.978]\nB: [0.07, 0.022, 0.341, 0.492]\nC: [0.041, 0.082, 0.181, 0.229]\nD: [0.289, 0.708, 0.703, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_55_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_55_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.277, 0.308, 0.294, 0.386]\nB: [0.463, 0.725, 0.892, 0.968]\nC: [0.277, 0.274, 0.294, 0.351]\nD: [0.113, 0.29, 0.595, 0.572]", "question": "Here is an object ([0.277, 0.307, 0.298, 0.386]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.277, 0.308, 0.294, 0.386]\nB: [0.463, 0.725, 0.892, 0.968]\nC: [0.277, 0.274, 0.294, 0.351]\nD: [0.113, 0.29, 0.595, 0.572]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_56_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_56_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.296, 0.147, 0.609, 0.639]\nB: [0.296, 0.147, 0.559, 0.657]\nC: [0.296, 0.147, 0.628, 0.557]\nD: [0.296, 0.147, 0.656, 0.542]", "question": "Here is an object ([0.292, 0.154, 0.622, 0.629]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 960 and the height is 720.", "context": "Select from the following choices.\nA: [0.296, 0.147, 0.609, 0.639]\nB: [0.296, 0.147, 0.559, 0.657]\nC: [0.296, 0.147, 0.628, 0.557]\nD: [0.296, 0.147, 0.656, 0.542]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_57_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_57_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.2, 0.249, 0.502, 0.331]\nB: [0.226, 0.244, 0.654, 0.808]\nC: [0.296, 0.436, 0.724, 1.0]\nD: [0.289, 0.436, 0.717, 1.0]", "question": "Here is an object ([0.207, 0.207, 0.639, 0.775]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.2, 0.249, 0.502, 0.331]\nB: [0.226, 0.244, 0.654, 0.808]\nC: [0.296, 0.436, 0.724, 1.0]\nD: [0.289, 0.436, 0.717, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_58_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_58_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.133, 0.075, 0.634, 1.0]\nB: [0.228, 0.069, 0.748, 0.904]\nC: [0.011, 0.0, 0.512, 0.925]\nD: [0.228, 0.069, 0.73, 0.994]", "question": "Here is an object ([0.227, 0.072, 0.729, 0.996]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.133, 0.075, 0.634, 1.0]\nB: [0.228, 0.069, 0.748, 0.904]\nC: [0.011, 0.0, 0.512, 0.925]\nD: [0.228, 0.069, 0.73, 0.994]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_59_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_59_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.616, 0.381, 0.995, 0.599]\nB: [0.62, 0.293, 1.0, 0.511]\nC: [0.62, 0.276, 1.0, 0.494]\nD: [0.616, 0.381, 0.992, 0.575]", "question": "Here is an object ([0.579, 0.451, 0.773, 0.635]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.616, 0.381, 0.995, 0.599]\nB: [0.62, 0.293, 1.0, 0.511]\nC: [0.62, 0.276, 1.0, 0.494]\nD: [0.616, 0.381, 0.992, 0.575]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_60_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_60_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.452, 0.315, 0.981, 0.608]\nB: [0.471, 0.414, 1.0, 0.707]\nC: [0.645, 0.575, 0.883, 0.993]\nD: [0.471, 0.414, 1.002, 0.722]", "question": "Here is an object ([0.427, 0.403, 1.0, 0.747]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.452, 0.315, 0.981, 0.608]\nB: [0.471, 0.414, 1.0, 0.707]\nC: [0.645, 0.575, 0.883, 0.993]\nD: [0.471, 0.414, 1.002, 0.722]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_61_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_61_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.473, 0.0, 0.755, 0.89]\nB: [0.411, 0.11, 0.694, 1.0]\nC: [0.411, 0.11, 0.677, 1.015]\nD: [0.411, 0.11, 0.737, 0.967]", "question": "Here is an object ([0.456, 0.044, 0.677, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.473, 0.0, 0.755, 0.89]\nB: [0.411, 0.11, 0.694, 1.0]\nC: [0.411, 0.11, 0.677, 1.015]\nD: [0.411, 0.11, 0.737, 0.967]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_62_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_62_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.31, 0.115, 0.683, 1.0]\nB: [0.216, 0.115, 0.589, 1.0]\nC: [0.367, 0.115, 0.74, 1.0]\nD: [0.455, 0.0, 0.827, 0.885]", "question": "Here is an object ([0.31, 0.121, 0.82, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.31, 0.115, 0.683, 1.0]\nB: [0.216, 0.115, 0.589, 1.0]\nC: [0.367, 0.115, 0.74, 1.0]\nD: [0.455, 0.0, 0.827, 0.885]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_63_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_63_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.252, 0.497, 0.337, 0.736]\nB: [0.29, 0.536, 0.373, 0.743]\nC: [0.442, 0.25, 0.609, 0.683]\nD: [0.252, 0.497, 0.334, 0.704]", "question": "Here is an object ([0.245, 0.492, 0.323, 0.704]) in the Image 1. 
Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.252, 0.497, 0.337, 0.736]\nB: [0.29, 0.536, 0.373, 0.743]\nC: [0.442, 0.25, 0.609, 0.683]\nD: [0.252, 0.497, 0.334, 0.704]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_64_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_64_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.389, 0.19, 0.691, 0.878]\nB: [0.389, 0.19, 0.753, 0.963]\nC: [0.442, 0.512, 0.885, 0.811]\nD: [0.063, 0.44, 0.373, 0.589]", "question": "Here is an object ([0.397, 0.182, 0.833, 0.95]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.389, 0.19, 0.691, 0.878]\nB: [0.389, 0.19, 0.753, 0.963]\nC: [0.442, 0.512, 0.885, 0.811]\nD: [0.063, 0.44, 0.373, 0.589]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_65_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_65_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.529, 0.0, 0.904, 0.494]\nB: [0.529, 0.0, 0.939, 0.551]\nC: [0.52, 0.0, 0.93, 0.551]\nD: [0.331, 0.583, 0.744, 0.786]", "question": "Here is an object ([0.49, 0.0, 0.793, 0.537]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.529, 0.0, 0.904, 0.494]\nB: [0.529, 0.0, 0.939, 0.551]\nC: [0.52, 0.0, 0.93, 0.551]\nD: [0.331, 0.583, 0.744, 0.786]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_66_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_66_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.217, 0.232, 0.977, 1.054]\nB: [0.217, 0.232, 1.0, 1.0]\nC: [0.457, 0.535, 0.472, 0.564]\nD: [0.217, 0.232, 1.141, 1.086]", "question": "Here is an object ([0.18, 0.047, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.217, 0.232, 0.977, 1.054]\nB: [0.217, 0.232, 1.0, 1.0]\nC: [0.457, 0.535, 0.472, 0.564]\nD: [0.217, 0.232, 1.141, 1.086]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_67_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_67_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.53, 0.529, 0.565, 0.629]\nB: [0.536, 0.481, 0.57, 0.581]\nC: [0.53, 0.554, 0.564, 0.654]\nD: [0.514, 0.487, 0.548, 0.588]", "question": "Here is an object ([0.497, 0.794, 0.542, 0.875]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.53, 0.529, 0.565, 0.629]\nB: [0.536, 0.481, 0.57, 0.581]\nC: [0.53, 0.554, 0.564, 0.654]\nD: [0.514, 0.487, 0.548, 0.588]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_68_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_68_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.546, 0.211, 0.739, 1.0]\nB: [0.359, 0.875, 0.488, 0.932]\nC: [0.701, 0.114, 0.773, 0.361]\nD: [0.546, 0.211, 0.75, 1.131]", "question": "Here is an object ([0.492, 0.349, 0.672, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.546, 0.211, 0.739, 1.0]\nB: [0.359, 0.875, 0.488, 0.932]\nC: [0.701, 0.114, 0.773, 0.361]\nD: [0.546, 0.211, 0.75, 1.131]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_69_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_69_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.338, 0.222, 0.376, 0.406]\nB: [0.332, 0.132, 0.37, 0.315]\nC: [0.338, 0.222, 0.373, 0.375]\nD: [0.461, 0.596, 0.902, 0.999]", "question": "Here is an object ([0.28, 0.2, 0.309, 0.4]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.338, 0.222, 0.376, 0.406]\nB: [0.332, 0.132, 0.37, 0.315]\nC: [0.338, 0.222, 0.373, 0.375]\nD: [0.461, 0.596, 0.902, 0.999]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_70_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_70_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.016, 0.403, 0.025, 0.493]\nB: [0.373, 0.369, 0.523, 0.775]\nC: [0.373, 0.369, 0.531, 0.815]\nD: [0.328, 0.226, 0.478, 0.632]", "question": "Here is an object ([0.359, 0.286, 0.48, 0.728]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.016, 0.403, 0.025, 0.493]\nB: [0.373, 0.369, 0.523, 0.775]\nC: [0.373, 0.369, 0.531, 0.815]\nD: [0.328, 0.226, 0.478, 0.632]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_71_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_71_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.489, 0.412, 0.794, 0.653]\nB: [0.0, 0.0, 1.0, 1.0]\nC: [0.0, 0.0, 1.031, 1.006]\nD: [0.0, 0.0, 0.987, 1.135]", "question": "Here is an object ([0.0, 0.0, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.489, 0.412, 0.794, 0.653]\nB: [0.0, 0.0, 1.0, 1.0]\nC: [0.0, 0.0, 1.031, 1.006]\nD: [0.0, 0.0, 0.987, 1.135]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_72_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_72_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.607, 0.454, 0.948, 0.86]\nB: [0.589, 0.282, 0.745, 0.399]\nC: [0.095, 0.358, 0.78, 1.0]\nD: [0.0, 0.358, 0.686, 1.0]", "question": "Here is an object ([0.0, 0.194, 0.704, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.607, 0.454, 0.948, 0.86]\nB: [0.589, 0.282, 0.745, 0.399]\nC: [0.095, 0.358, 0.78, 1.0]\nD: [0.0, 0.358, 0.686, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_73_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_73_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.412, 0.367, 0.697, 0.811]\nB: [0.287, 0.19, 0.572, 0.635]\nC: [0.38, 0.192, 0.665, 0.636]\nD: [0.237, 0.568, 0.317, 0.772]", "question": "Here is an object ([0.397, 0.174, 0.659, 0.717]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.412, 0.367, 0.697, 0.811]\nB: [0.287, 0.19, 0.572, 0.635]\nC: [0.38, 0.192, 0.665, 0.636]\nD: [0.237, 0.568, 0.317, 0.772]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_74_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_74_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.367, 0.144, 0.611, 0.964]\nB: [0.337, 0.181, 0.58, 1.0]\nC: [0.367, 0.144, 0.632, 1.079]\nD: [0.031, 0.693, 0.361, 0.975]", "question": "Here is an object ([0.369, 0.153, 0.609, 0.965]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.367, 0.144, 0.611, 0.964]\nB: [0.337, 0.181, 0.58, 1.0]\nC: [0.367, 0.144, 0.632, 1.079]\nD: [0.031, 0.693, 0.361, 0.975]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_75_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_75_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.066, 0.762, 0.502, 0.981]\nB: [0.448, 0.158, 0.876, 0.285]\nC: [0.0, 0.782, 0.437, 1.0]\nD: [0.158, 0.832, 0.645, 0.868]", "question": "Here is an object ([0.0, 0.443, 0.603, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.066, 0.762, 0.502, 0.981]\nB: [0.448, 0.158, 0.876, 0.285]\nC: [0.0, 0.782, 0.437, 1.0]\nD: [0.158, 0.832, 0.645, 0.868]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_76_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_76_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.232, 0.325, 0.599, 0.582]\nB: [0.321, 0.0, 0.77, 1.0]\nC: [0.286, 0.044, 0.386, 0.461]\nD: [0.321, 0.0, 0.858, 0.894]", "question": "Here is an object ([0.394, 0.001, 0.947, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.232, 0.325, 0.599, 0.582]\nB: [0.321, 0.0, 0.77, 1.0]\nC: [0.286, 0.044, 0.386, 0.461]\nD: [0.321, 0.0, 0.858, 0.894]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_77_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_77_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.673, 0.375, 1.052, 0.929]\nB: [0.673, 0.375, 1.0, 1.0]\nC: [0.673, 0.375, 1.007, 0.979]\nD: [0.248, 0.358, 0.261, 0.589]", "question": "Here is an object ([0.532, 0.296, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.673, 0.375, 1.052, 0.929]\nB: [0.673, 0.375, 1.0, 1.0]\nC: [0.673, 0.375, 1.007, 0.979]\nD: [0.248, 0.358, 0.261, 0.589]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_78_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_78_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.327, 0.242, 0.479, 0.401]\nB: [0.397, 0.235, 0.548, 0.394]\nC: [0.669, 0.562, 0.69, 0.632]\nD: [0.766, 0.242, 0.848, 0.442]", "question": "Here is an object ([0.326, 0.249, 0.481, 0.422]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.327, 0.242, 0.479, 0.401]\nB: [0.397, 0.235, 0.548, 0.394]\nC: [0.669, 0.562, 0.69, 0.632]\nD: [0.766, 0.242, 0.848, 0.442]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_79_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_79_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.072, 0.064, 0.542, 0.249]\nB: [0.645, 0.408, 0.869, 0.582]\nC: [0.428, 0.0, 0.695, 0.919]\nD: [0.301, 0.0, 0.568, 0.919]", "question": "Here is an object ([0.31, 0.074, 0.576, 0.826]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.072, 0.064, 0.542, 0.249]\nB: [0.645, 0.408, 0.869, 0.582]\nC: [0.428, 0.0, 0.695, 0.919]\nD: [0.301, 0.0, 0.568, 0.919]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_80_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_80_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.08, 0.778, 0.391, 1.0]\nB: [0.508, 0.303, 0.577, 0.432]\nC: [0.155, 0.778, 0.466, 1.0]\nD: [0.046, 0.778, 0.357, 1.0]", "question": "Here is an object ([0.044, 0.793, 0.334, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.08, 0.778, 0.391, 1.0]\nB: [0.508, 0.303, 0.577, 0.432]\nC: [0.155, 0.778, 0.466, 1.0]\nD: [0.046, 0.778, 0.357, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_81_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_81_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.0, 0.0, 1.0, 0.999]\nB: [0.0, 0.001, 1.0, 1.0]\nC: [0.411, 0.328, 0.752, 0.585]\nD: [0.525, 0.542, 0.97, 0.881]", "question": "Here is an object ([0.0, 0.001, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.0, 1.0, 0.999]\nB: [0.0, 0.001, 1.0, 1.0]\nC: [0.411, 0.328, 0.752, 0.585]\nD: [0.525, 0.542, 0.97, 0.881]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_82_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_82_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.304, 0.521, 0.474, 0.738]\nB: [0.18, 0.349, 0.421, 0.765]\nC: [0.18, 0.349, 0.401, 0.797]\nD: [0.282, 0.438, 0.726, 0.922]", "question": "Here is an object ([0.183, 0.338, 0.426, 0.754]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.304, 0.521, 0.474, 0.738]\nB: [0.18, 0.349, 0.421, 0.765]\nC: [0.18, 0.349, 0.401, 0.797]\nD: [0.282, 0.438, 0.726, 0.922]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_83_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_83_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.084, 0.0, 0.97, 0.975]\nB: [0.588, 0.258, 0.977, 0.639]\nC: [0.465, 0.197, 0.775, 0.597]\nD: [0.0, 0.0, 0.886, 0.975]", "question": "Here is an object ([0.0, 0.0, 0.884, 0.967]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.084, 0.0, 0.97, 0.975]\nB: [0.588, 0.258, 0.977, 0.639]\nC: [0.465, 0.197, 0.775, 0.597]\nD: [0.0, 0.0, 0.886, 0.975]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_84_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_84_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.096, 0.446, 1.0, 1.0]\nB: [0.089, 0.375, 0.936, 0.829]\nC: [0.089, 0.375, 0.993, 0.929]\nD: [0.096, 0.436, 1.0, 0.99]", "question": "Here is an object ([0.084, 0.376, 0.99, 0.903]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 406 and the height is 720.", "context": "Select from the following choices.\nA: [0.096, 0.446, 1.0, 1.0]\nB: [0.089, 0.375, 0.936, 0.829]\nC: [0.089, 0.375, 0.993, 0.929]\nD: [0.096, 0.436, 1.0, 0.99]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_85_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_85_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.121, 0.364, 0.447, 1.0]\nB: [0.17, 0.364, 0.495, 1.0]\nC: [0.26, 0.364, 0.586, 1.0]\nD: [0.149, 0.364, 0.475, 1.0]", "question": "Here is an object ([0.291, 0.444, 0.606, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.121, 0.364, 0.447, 1.0]\nB: [0.17, 0.364, 0.495, 1.0]\nC: [0.26, 0.364, 0.586, 1.0]\nD: [0.149, 0.364, 0.475, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_86_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_86_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.687, 0.894, 0.72, 0.951]\nB: [0.17, 0.021, 0.422, 0.332]\nC: [0.263, 0.164, 0.547, 0.483]\nD: [0.263, 0.164, 0.514, 0.475]", "question": "Here is an object ([0.247, 0.165, 0.501, 0.479]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.687, 0.894, 0.72, 0.951]\nB: [0.17, 0.021, 0.422, 0.332]\nC: [0.263, 0.164, 0.547, 0.483]\nD: [0.263, 0.164, 0.514, 0.475]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_87_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_87_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.528, 0.439, 0.573, 0.536]\nB: [0.45, 0.644, 0.761, 0.756]\nC: [0.528, 0.439, 0.577, 0.55]\nD: [0.542, 0.478, 0.588, 0.575]", "question": "Here is an object ([0.536, 0.414, 0.573, 0.528]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 640 and the height is 360.", "context": "Select from the following choices.\nA: [0.528, 0.439, 0.573, 0.536]\nB: [0.45, 0.644, 0.761, 0.756]\nC: [0.528, 0.439, 0.577, 0.55]\nD: [0.542, 0.478, 0.588, 0.575]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_88_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_88_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.598, 0.618, 0.884, 0.681]\nB: [0.141, 0.233, 0.448, 1.0]\nC: [0.473, 0.306, 0.578, 0.575]\nD: [0.141, 0.233, 0.455, 1.097]", "question": "Here is an object ([0.13, 0.26, 0.435, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.598, 0.618, 0.884, 0.681]\nB: [0.141, 0.233, 0.448, 1.0]\nC: [0.473, 0.306, 0.578, 0.575]\nD: [0.141, 0.233, 0.455, 1.097]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_89_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_89_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.0, 0.25, 0.742, 1.0]\nB: [0.258, 0.14, 1.0, 0.89]\nC: [0.016, 0.108, 0.757, 0.858]\nD: [0.809, 0.283, 0.925, 0.317]", "question": "Here is an object ([0.065, 0.108, 1.0, 0.822]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1080 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.25, 0.742, 1.0]\nB: [0.258, 0.14, 1.0, 0.89]\nC: [0.016, 0.108, 0.757, 0.858]\nD: [0.809, 0.283, 0.925, 0.317]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_90_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_90_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.508, 0.017, 0.915, 0.2]\nB: [0.0, 0.004, 0.701, 0.935]\nC: [0.0, 0.004, 0.752, 1.0]\nD: [0.248, 0.004, 1.0, 1.0]", "question": "Here is an object ([0.0, 0.021, 0.759, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.508, 0.017, 0.915, 0.2]\nB: [0.0, 0.004, 0.701, 0.935]\nC: [0.0, 0.004, 0.752, 1.0]\nD: [0.248, 0.004, 1.0, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_91_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_91_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.518, 0.521, 0.582, 0.715]\nB: [0.512, 0.44, 0.566, 0.604]\nC: [0.518, 0.521, 0.573, 0.685]\nD: [0.518, 0.521, 0.578, 0.675]", "question": "Here is an object ([0.504, 0.521, 0.551, 0.662]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.518, 0.521, 0.582, 0.715]\nB: [0.512, 0.44, 0.566, 0.604]\nC: [0.518, 0.521, 0.573, 0.685]\nD: [0.518, 0.521, 0.578, 0.675]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_92_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_92_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.275, 0.518, 0.76, 1.0]\nB: [0.275, 0.518, 0.763, 0.928]\nC: [0.275, 0.518, 0.738, 1.083]\nD: [0.131, 0.343, 0.616, 0.825]", "question": "Here is an object ([0.677, 0.49, 0.845, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.275, 0.518, 0.76, 1.0]\nB: [0.275, 0.518, 0.763, 0.928]\nC: [0.275, 0.518, 0.738, 1.083]\nD: [0.131, 0.343, 0.616, 0.825]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_93_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_93_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.748, 0.189, 0.783, 0.533]\nB: [0.038, 0.267, 0.163, 0.346]\nC: [0.064, 0.235, 0.188, 0.314]\nD: [0.071, 0.322, 0.296, 0.649]", "question": "Here is an object ([0.109, 0.24, 0.23, 0.322]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.748, 0.189, 0.783, 0.533]\nB: [0.038, 0.267, 0.163, 0.346]\nC: [0.064, 0.235, 0.188, 0.314]\nD: [0.071, 0.322, 0.296, 0.649]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_94_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_94_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.436, 0.774, 0.797, 0.901]\nB: [0.478, 0.286, 0.601, 0.464]\nC: [0.439, 0.328, 0.561, 0.506]\nD: [0.652, 0.426, 0.946, 0.767]", "question": "Here is an object ([0.449, 0.339, 0.614, 0.582]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 960 and the height is 720.", "context": "Select from the following choices.\nA: [0.436, 0.774, 0.797, 0.901]\nB: [0.478, 0.286, 0.601, 0.464]\nC: [0.439, 0.328, 0.561, 0.506]\nD: [0.652, 0.426, 0.946, 0.767]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_95_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_95_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.0, 0.217, 0.427, 1.0]\nB: [0.0, 0.217, 0.466, 0.968]\nC: [0.156, 0.217, 0.584, 1.0]\nD: [0.0, 0.217, 0.461, 0.944]", "question": "Here is an object ([0.0, 0.206, 0.405, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.217, 0.427, 1.0]\nB: [0.0, 0.217, 0.466, 0.968]\nC: [0.156, 0.217, 0.584, 1.0]\nD: [0.0, 0.217, 0.461, 0.944]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_96_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_96_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.412, 0.321, 0.501, 0.774]\nB: [0.047, 0.485, 0.552, 1.0]\nC: [0.119, 0.485, 0.623, 1.0]\nD: [0.119, 0.485, 0.693, 0.969]", "question": "Here is an object ([0.133, 0.522, 0.686, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.412, 0.321, 0.501, 0.774]\nB: [0.047, 0.485, 0.552, 1.0]\nC: [0.119, 0.485, 0.623, 1.0]\nD: [0.119, 0.485, 0.693, 0.969]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_97_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_97_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.416, 0.165, 0.517, 0.535]\nB: [0.43, 0.064, 0.532, 0.433]\nC: [0.422, 0.135, 0.523, 0.504]\nD: [0.422, 0.135, 0.505, 0.537]", "question": "Here is an object ([0.439, 0.157, 0.559, 0.557]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.416, 0.165, 0.517, 0.535]\nB: [0.43, 0.064, 0.532, 0.433]\nC: [0.422, 0.135, 0.523, 0.504]\nD: [0.422, 0.135, 0.505, 0.537]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_98_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_98_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.14, 0.257, 0.775, 0.714]\nB: [0.066, 0.125, 0.656, 0.619]\nC: [0.14, 0.257, 0.826, 0.689]\nD: [0.14, 0.257, 0.73, 0.751]", "question": "Here is an object ([0.154, 0.225, 0.735, 0.743]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.14, 0.257, 0.775, 0.714]\nB: [0.066, 0.125, 0.656, 0.619]\nC: [0.14, 0.257, 0.826, 0.689]\nD: [0.14, 0.257, 0.73, 0.751]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_99_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_99_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.321, 0.429, 0.502, 0.562]\nB: [0.321, 0.429, 0.493, 0.571]\nC: [0.399, 0.408, 0.58, 0.542]\nD: [0.287, 0.482, 0.467, 0.615]", "question": "Here is an object ([0.313, 0.362, 0.605, 0.611]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.321, 0.429, 0.502, 0.562]\nB: [0.321, 0.429, 0.493, 0.571]\nC: [0.399, 0.408, 0.58, 0.542]\nD: [0.287, 0.482, 0.467, 0.615]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_100_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_100_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.145, 0.1, 1.0, 1.0]\nB: [0.81, 0.383, 0.819, 0.604]\nC: [0.145, 0.0, 1.0, 0.9]\nD: [0.145, 0.1, 0.912, 0.946]", "question": "Here is an object ([0.15, 0.078, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.145, 0.1, 1.0, 1.0]\nB: [0.81, 0.383, 0.819, 0.604]\nC: [0.145, 0.0, 1.0, 0.9]\nD: [0.145, 0.1, 0.912, 0.946]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_101_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_101_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.077, 0.319, 0.395, 0.588]\nB: [0.563, 0.496, 0.853, 0.843]\nC: [0.576, 0.646, 0.911, 0.826]\nD: [0.498, 0.457, 0.788, 0.804]", "question": "Here is an object ([0.535, 0.507, 0.81, 0.825]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.077, 0.319, 0.395, 0.588]\nB: [0.563, 0.496, 0.853, 0.843]\nC: [0.576, 0.646, 0.911, 0.826]\nD: [0.498, 0.457, 0.788, 0.804]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_102_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_102_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.099, 0.274, 0.831, 1.0]\nB: [0.268, 0.156, 0.948, 0.758]\nC: [0.268, 0.156, 1.018, 0.989]\nD: [0.268, 0.156, 1.0, 0.882]", "question": "Here is an object ([0.295, 0.115, 0.986, 0.876]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 960 and the height is 720.", "context": "Select from the following choices.\nA: [0.099, 0.274, 0.831, 1.0]\nB: [0.268, 0.156, 0.948, 0.758]\nC: [0.268, 0.156, 1.018, 0.989]\nD: [0.268, 0.156, 1.0, 0.882]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_103_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_103_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.159, 0.364, 0.504, 0.894]\nB: [0.192, 0.314, 0.537, 0.844]\nC: [0.198, 0.429, 0.423, 0.867]\nD: [0.72, 0.679, 0.87, 0.814]", "question": "Here is an object ([0.155, 0.296, 0.512, 0.847]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.159, 0.364, 0.504, 0.894]\nB: [0.192, 0.314, 0.537, 0.844]\nC: [0.198, 0.429, 0.423, 0.867]\nD: [0.72, 0.679, 0.87, 0.814]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_104_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_104_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.63, 0.36, 0.953, 0.414]\nB: [0.463, 0.172, 0.638, 0.432]\nC: [0.355, 0.146, 0.53, 0.406]\nD: [0.409, 0.218, 0.584, 0.478]", "question": "Here is an object ([0.372, 0.129, 0.613, 0.461]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.63, 0.36, 0.953, 0.414]\nB: [0.463, 0.172, 0.638, 0.432]\nC: [0.355, 0.146, 0.53, 0.406]\nD: [0.409, 0.218, 0.584, 0.478]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_105_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_105_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.712, 0.625, 0.841, 0.796]\nB: [0.351, 0.718, 0.397, 0.847]\nC: [0.364, 0.769, 0.41, 0.899]\nD: [0.409, 0.537, 0.505, 0.747]", "question": "Here is an object ([0.334, 0.714, 0.382, 0.814]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.712, 0.625, 0.841, 0.796]\nB: [0.351, 0.718, 0.397, 0.847]\nC: [0.364, 0.769, 0.41, 0.899]\nD: [0.409, 0.537, 0.505, 0.747]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_106_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_106_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.322, 0.412, 0.575, 0.818]\nB: [0.242, 0.253, 0.484, 0.642]\nC: [0.322, 0.412, 0.563, 0.801]\nD: [0.306, 0.432, 0.548, 0.821]", "question": "Here is an object ([0.298, 0.354, 0.506, 0.793]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.322, 0.412, 0.575, 0.818]\nB: [0.242, 0.253, 0.484, 0.642]\nC: [0.322, 0.412, 0.563, 0.801]\nD: [0.306, 0.432, 0.548, 0.821]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_107_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_107_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.48, 0.347, 0.798, 0.393]\nB: [0.207, 0.154, 0.544, 0.531]\nC: [0.207, 0.154, 0.597, 0.501]\nD: [0.332, 0.514, 0.696, 0.872]", "question": "Here is an object ([0.229, 0.156, 0.602, 0.49]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.48, 0.347, 0.798, 0.393]\nB: [0.207, 0.154, 0.544, 0.531]\nC: [0.207, 0.154, 0.597, 0.501]\nD: [0.332, 0.514, 0.696, 0.872]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_108_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_108_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.222, 0.832, 0.42, 0.985]\nB: [0.277, 0.832, 0.502, 1.0]\nC: [0.222, 0.832, 0.447, 1.0]\nD: [0.222, 0.832, 0.476, 1.031]", "question": "Here is an object ([0.0, 0.457, 0.234, 0.799]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.222, 0.832, 0.42, 0.985]\nB: [0.277, 0.832, 0.502, 1.0]\nC: [0.222, 0.832, 0.447, 1.0]\nD: [0.222, 0.832, 0.476, 1.031]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_109_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_109_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.0, 0.507, 1.0, 0.747]\nB: [0.0, 0.59, 1.0, 0.831]\nC: [0.0, 0.507, 1.165, 0.767]\nD: [0.72, 0.235, 0.856, 0.468]", "question": "Here is an object ([0.0, 0.514, 1.0, 0.725]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 960 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.507, 1.0, 0.747]\nB: [0.0, 0.59, 1.0, 0.831]\nC: [0.0, 0.507, 1.165, 0.767]\nD: [0.72, 0.235, 0.856, 0.468]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_110_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_110_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.198, 0.206, 0.652, 0.844]\nB: [0.374, 0.235, 0.77, 0.818]\nC: [0.626, 0.379, 0.905, 0.808]\nD: [0.198, 0.206, 0.594, 0.789]", "question": "Here is an object ([0.207, 0.212, 0.609, 0.786]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.198, 0.206, 0.652, 0.844]\nB: [0.374, 0.235, 0.77, 0.818]\nC: [0.626, 0.379, 0.905, 0.808]\nD: [0.198, 0.206, 0.594, 0.789]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_111_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_111_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.305, 0.0, 0.776, 0.582]\nB: [0.343, 0.211, 0.813, 0.793]\nC: [0.399, 0.029, 0.635, 0.49]\nD: [0.305, 0.0, 0.734, 0.481]", "question": "Here is an object ([0.302, 0.0, 0.73, 0.333]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.305, 0.0, 0.776, 0.582]\nB: [0.343, 0.211, 0.813, 0.793]\nC: [0.399, 0.029, 0.635, 0.49]\nD: [0.305, 0.0, 0.734, 0.481]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_112_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_112_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.322, 0.349, 1.0, 0.732]\nB: [0.127, 0.397, 0.805, 0.781]\nC: [0.314, 0.597, 0.748, 0.897]\nD: [0.003, 0.468, 0.254, 0.578]", "question": "Here is an object ([0.306, 0.381, 1.0, 0.722]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.322, 0.349, 1.0, 0.732]\nB: [0.127, 0.397, 0.805, 0.781]\nC: [0.314, 0.597, 0.748, 0.897]\nD: [0.003, 0.468, 0.254, 0.578]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_113_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_113_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.285, 0.218, 0.616, 0.626]\nB: [0.42, 0.044, 0.645, 0.375]\nC: [0.285, 0.218, 0.609, 0.671]\nD: [0.285, 0.218, 0.62, 0.713]", "question": "Here is an object ([0.395, 0.212, 0.702, 0.669]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.285, 0.218, 0.616, 0.626]\nB: [0.42, 0.044, 0.645, 0.375]\nC: [0.285, 0.218, 0.609, 0.671]\nD: [0.285, 0.218, 0.62, 0.713]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_114_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_114_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.861, 0.214, 0.959, 0.562]\nB: [0.861, 0.214, 0.968, 0.524]\nC: [0.893, 0.111, 1.0, 0.421]\nD: [0.147, 0.603, 0.412, 0.931]", "question": "Here is an object ([0.87, 0.222, 0.975, 0.528]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.861, 0.214, 0.959, 0.562]\nB: [0.861, 0.214, 0.968, 0.524]\nC: [0.893, 0.111, 1.0, 0.421]\nD: [0.147, 0.603, 0.412, 0.931]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_115_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_115_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.18, 0.113, 0.617, 0.567]\nB: [0.057, 0.256, 0.484, 0.771]\nC: [0.427, 0.164, 0.723, 0.478]\nD: [0.18, 0.113, 0.608, 0.628]", "question": "Here is an object ([0.164, 0.11, 0.591, 0.624]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.18, 0.113, 0.617, 0.567]\nB: [0.057, 0.256, 0.484, 0.771]\nC: [0.427, 0.164, 0.723, 0.478]\nD: [0.18, 0.113, 0.608, 0.628]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_116_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_116_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.362, 0.272, 0.645, 0.729]\nB: [0.362, 0.272, 0.713, 0.839]\nC: [0.241, 0.231, 0.585, 0.494]\nD: [0.604, 0.682, 0.843, 0.971]", "question": "Here is an object ([0.323, 0.211, 0.684, 0.831]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.362, 0.272, 0.645, 0.729]\nB: [0.362, 0.272, 0.713, 0.839]\nC: [0.241, 0.231, 0.585, 0.494]\nD: [0.604, 0.682, 0.843, 0.971]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_117_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_117_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.035, 0.06, 1.0, 1.0]\nB: [0.035, 0.06, 1.012, 1.072]\nC: [0.035, 0.06, 1.058, 1.111]\nD: [0.035, 0.06, 1.018, 0.933]", "question": "Here is an object ([0.105, 0.153, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.035, 0.06, 1.0, 1.0]\nB: [0.035, 0.06, 1.012, 1.072]\nC: [0.035, 0.06, 1.058, 1.111]\nD: [0.035, 0.06, 1.018, 0.933]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_118_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_118_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.452, 0.0, 1.0, 0.858]\nB: [0.434, 0.0, 0.982, 0.858]\nC: [0.277, 0.025, 0.845, 0.994]\nD: [0.277, 0.025, 0.824, 0.883]", "question": "Here is an object ([0.275, 0.033, 0.816, 0.889]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.452, 0.0, 1.0, 0.858]\nB: [0.434, 0.0, 0.982, 0.858]\nC: [0.277, 0.025, 0.845, 0.994]\nD: [0.277, 0.025, 0.824, 0.883]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_119_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_119_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.0, 0.267, 0.72, 0.754]\nB: [0.044, 0.375, 0.259, 0.868]\nC: [0.0, 0.267, 0.838, 0.692]\nD: [0.0, 0.239, 0.838, 0.664]", "question": "Here is an object ([0.0, 0.268, 0.805, 0.74]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.267, 0.72, 0.754]\nB: [0.044, 0.375, 0.259, 0.868]\nC: [0.0, 0.267, 0.838, 0.692]\nD: [0.0, 0.239, 0.838, 0.664]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_120_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_120_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.0, 0.001, 0.979, 0.851]\nB: [0.0, 0.001, 1.0, 1.0]\nC: [0.0, 0.0, 1.0, 0.999]\nD: [0.0, 0.0, 1.0, 0.999]", "question": "Here is an object ([0.302, 0.026, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 480 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.001, 0.979, 0.851]\nB: [0.0, 0.001, 1.0, 1.0]\nC: [0.0, 0.0, 1.0, 0.999]\nD: [0.0, 0.0, 1.0, 0.999]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_121_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_121_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.281, 0.379, 0.819, 0.546]\nB: [0.018, 0.447, 0.457, 0.604]\nC: [0.018, 0.447, 0.555, 0.614]\nD: [0.414, 0.225, 0.912, 0.421]", "question": "Here is an object ([0.025, 0.489, 0.583, 0.636]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.281, 0.379, 0.819, 0.546]\nB: [0.018, 0.447, 0.457, 0.604]\nC: [0.018, 0.447, 0.555, 0.614]\nD: [0.414, 0.225, 0.912, 0.421]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_122_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_122_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.502, 0.561, 0.619, 0.736]\nB: [0.462, 0.122, 0.881, 0.621]\nC: [0.517, 0.637, 0.634, 0.812]\nD: [0.502, 0.561, 0.606, 0.724]", "question": "Here is an object ([0.515, 0.581, 0.582, 0.721]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.502, 0.561, 0.619, 0.736]\nB: [0.462, 0.122, 0.881, 0.621]\nC: [0.517, 0.637, 0.634, 0.812]\nD: [0.502, 0.561, 0.606, 0.724]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_123_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_123_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.2, 0.082, 0.523, 1.019]\nB: [0.413, 0.683, 0.617, 0.865]\nC: [0.2, 0.082, 0.583, 1.0]\nD: [0.178, 0.69, 0.47, 0.832]", "question": "Here is an object ([0.189, 0.138, 0.595, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.2, 0.082, 0.523, 1.019]\nB: [0.413, 0.683, 0.617, 0.865]\nC: [0.2, 0.082, 0.583, 1.0]\nD: [0.178, 0.69, 0.47, 0.832]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_124_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_124_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.36, 0.119, 0.56, 0.476]\nB: [0.095, 0.053, 0.541, 0.535]\nC: [0.36, 0.119, 0.557, 0.432]\nD: [0.36, 0.119, 0.534, 0.429]", "question": "Here is an object ([0.371, 0.131, 0.545, 0.589]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.36, 0.119, 0.56, 0.476]\nB: [0.095, 0.053, 0.541, 0.535]\nC: [0.36, 0.119, 0.557, 0.432]\nD: [0.36, 0.119, 0.534, 0.429]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_125_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_125_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.156, 0.181, 0.855, 1.158]\nB: [0.534, 0.085, 0.951, 0.11]\nC: [0.63, 0.921, 0.958, 0.972]\nD: [0.156, 0.181, 0.795, 1.0]", "question": "Here is an object ([0.303, 0.033, 0.923, 0.899]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.156, 0.181, 0.855, 1.158]\nB: [0.534, 0.085, 0.951, 0.11]\nC: [0.63, 0.921, 0.958, 0.972]\nD: [0.156, 0.181, 0.795, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_126_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_126_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.0, 0.142, 0.242, 0.715]\nB: [0.0, 0.142, 0.28, 0.631]\nC: [0.749, 0.119, 0.961, 0.493]\nD: [0.0, 0.142, 0.267, 0.637]", "question": "Here is an object ([0.0, 0.143, 0.256, 0.608]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.142, 0.242, 0.715]\nB: [0.0, 0.142, 0.28, 0.631]\nC: [0.749, 0.119, 0.961, 0.493]\nD: [0.0, 0.142, 0.267, 0.637]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_127_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_127_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.131, 0.228, 0.717, 0.76]\nB: [0.219, 0.322, 0.315, 0.586]\nC: [0.131, 0.228, 0.64, 0.701]\nD: [0.648, 0.182, 0.732, 0.421]", "question": "Here is an object ([0.113, 0.224, 0.618, 0.713]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 406 and the height is 720.", "context": "Select from the following choices.\nA: [0.131, 0.228, 0.717, 0.76]\nB: [0.219, 0.322, 0.315, 0.586]\nC: [0.131, 0.228, 0.64, 0.701]\nD: [0.648, 0.182, 0.732, 0.421]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_128_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_128_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.531, 0.565, 0.643, 0.656]\nB: [0.008, 0.514, 0.429, 0.842]\nC: [0.531, 0.565, 0.629, 0.65]\nD: [0.077, 0.757, 0.463, 0.997]", "question": "Here is an object ([0.548, 0.553, 0.65, 0.622]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.531, 0.565, 0.643, 0.656]\nB: [0.008, 0.514, 0.429, 0.842]\nC: [0.531, 0.565, 0.629, 0.65]\nD: [0.077, 0.757, 0.463, 0.997]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_129_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_129_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.292, 0.122, 0.642, 0.711]\nB: [0.312, 0.0, 0.662, 0.589]\nC: [0.291, 0.154, 0.641, 0.743]\nD: [0.462, 0.358, 0.812, 0.947]", "question": "Here is an object ([0.295, 0.146, 0.641, 0.739]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.292, 0.122, 0.642, 0.711]\nB: [0.312, 0.0, 0.662, 0.589]\nC: [0.291, 0.154, 0.641, 0.743]\nD: [0.462, 0.358, 0.812, 0.947]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_130_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_130_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.416, 0.371, 0.48, 0.562]\nB: [0.404, 0.312, 0.468, 0.504]\nC: [0.241, 0.174, 0.517, 0.3]\nD: [0.426, 0.392, 0.49, 0.583]", "question": "Here is an object ([0.431, 0.4, 0.509, 0.603]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.416, 0.371, 0.48, 0.562]\nB: [0.404, 0.312, 0.468, 0.504]\nC: [0.241, 0.174, 0.517, 0.3]\nD: [0.426, 0.392, 0.49, 0.583]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_131_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_131_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.0, 0.597, 0.102, 0.949]\nB: [0.0, 0.597, 0.112, 1.035]\nC: [0.0, 0.597, 0.096, 0.986]\nD: [0.493, 0.408, 0.527, 0.66]", "question": "Here is an object ([0.0, 0.621, 0.077, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.597, 0.102, 0.949]\nB: [0.0, 0.597, 0.112, 1.035]\nC: [0.0, 0.597, 0.096, 0.986]\nD: [0.493, 0.408, 0.527, 0.66]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_132_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_132_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.089, 0.829, 0.118, 0.868]\nB: [0.54, 0.044, 0.668, 0.643]\nC: [0.537, 0.218, 0.666, 0.817]\nD: [0.54, 0.044, 0.655, 0.714]", "question": "Here is an object ([0.595, 0.092, 0.691, 0.7]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.089, 0.829, 0.118, 0.868]\nB: [0.54, 0.044, 0.668, 0.643]\nC: [0.537, 0.218, 0.666, 0.817]\nD: [0.54, 0.044, 0.655, 0.714]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_133_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_133_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.395, 0.317, 0.602, 0.821]\nB: [0.504, 0.408, 0.513, 0.686]\nC: [0.484, 0.439, 0.69, 0.943]\nD: [0.313, 0.244, 0.52, 0.749]", "question": "Here is an object ([0.429, 0.154, 0.625, 0.786]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.395, 0.317, 0.602, 0.821]\nB: [0.504, 0.408, 0.513, 0.686]\nC: [0.484, 0.439, 0.69, 0.943]\nD: [0.313, 0.244, 0.52, 0.749]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_134_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_134_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.13, 0.0, 0.852, 1.0]\nB: [0.071, 0.0, 0.793, 1.0]\nC: [0.095, 0.306, 0.59, 0.322]\nD: [0.98, 0.435, 0.996, 0.803]", "question": "Here is an object ([0.063, 0.0, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.13, 0.0, 0.852, 1.0]\nB: [0.071, 0.0, 0.793, 1.0]\nC: [0.095, 0.306, 0.59, 0.322]\nD: [0.98, 0.435, 0.996, 0.803]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_135_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_135_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.277, 0.0, 0.519, 0.45]\nB: [0.395, 0.013, 0.637, 0.463]\nC: [0.497, 0.199, 0.843, 0.696]\nD: [0.281, 0.114, 0.523, 0.564]", "question": "Here is an object ([0.264, 0.0, 0.491, 0.404]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.277, 0.0, 0.519, 0.45]\nB: [0.395, 0.013, 0.637, 0.463]\nC: [0.497, 0.199, 0.843, 0.696]\nD: [0.281, 0.114, 0.523, 0.564]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_136_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_136_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.901, 0.401, 0.985, 1.051]\nB: [0.901, 0.401, 1.0, 1.0]\nC: [0.504, 0.157, 0.877, 0.589]\nD: [0.901, 0.206, 1.0, 0.804]", "question": "Here is an object ([0.934, 0.432, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.901, 0.401, 0.985, 1.051]\nB: [0.901, 0.401, 1.0, 1.0]\nC: [0.504, 0.157, 0.877, 0.589]\nD: [0.901, 0.206, 1.0, 0.804]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_137_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_137_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.0, 0.267, 0.299, 0.561]\nB: [0.0, 0.267, 0.309, 0.537]\nC: [0.0, 0.267, 0.323, 0.568]\nD: [0.0, 0.171, 0.323, 0.472]", "question": "Here is an object ([0.0, 0.246, 0.424, 0.611]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.267, 0.299, 0.561]\nB: [0.0, 0.267, 0.309, 0.537]\nC: [0.0, 0.267, 0.323, 0.568]\nD: [0.0, 0.171, 0.323, 0.472]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_138_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_138_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.0, 0.0, 0.606, 1.0]\nB: [0.502, 0.601, 0.622, 0.924]\nC: [0.287, 0.311, 0.747, 0.39]\nD: [0.0, 0.0, 0.535, 1.157]", "question": "Here is an object ([0.0, 0.0, 0.923, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.0, 0.606, 1.0]\nB: [0.502, 0.601, 0.622, 0.924]\nC: [0.287, 0.311, 0.747, 0.39]\nD: [0.0, 0.0, 0.535, 1.157]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_139_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_139_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.05, 0.728, 0.195, 0.956]\nB: [0.193, 0.054, 0.217, 0.426]\nC: [0.434, 0.371, 0.787, 1.0]\nD: [0.519, 0.371, 0.872, 1.0]", "question": "Here is an object ([0.529, 0.507, 0.775, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.05, 0.728, 0.195, 0.956]\nB: [0.193, 0.054, 0.217, 0.426]\nC: [0.434, 0.371, 0.787, 1.0]\nD: [0.519, 0.371, 0.872, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_140_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_140_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.409, 0.479, 0.546, 0.554]\nB: [0.409, 0.479, 0.537, 0.55]\nC: [0.429, 0.487, 0.557, 0.558]\nD: [0.409, 0.479, 0.516, 0.56]", "question": "Here is an object ([0.455, 0.471, 0.564, 0.543]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.409, 0.479, 0.546, 0.554]\nB: [0.409, 0.479, 0.537, 0.55]\nC: [0.429, 0.487, 0.557, 0.558]\nD: [0.409, 0.479, 0.516, 0.56]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_141_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_141_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.514, 0.244, 0.854, 0.649]\nB: [0.601, 0.221, 1.0, 0.662]\nC: [0.514, 0.244, 0.913, 0.686]\nD: [0.601, 0.308, 1.0, 0.75]", "question": "Here is an object ([0.589, 0.235, 0.943, 0.722]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.514, 0.244, 0.854, 0.649]\nB: [0.601, 0.221, 1.0, 0.662]\nC: [0.514, 0.244, 0.913, 0.686]\nD: [0.601, 0.308, 1.0, 0.75]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_142_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_142_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.488, 0.207, 0.569, 0.358]\nB: [0.469, 0.228, 0.549, 0.379]\nC: [0.432, 0.458, 0.816, 0.517]\nD: [0.019, 0.432, 0.448, 0.564]", "question": "Here is an object ([0.496, 0.242, 0.566, 0.381]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.488, 0.207, 0.569, 0.358]\nB: [0.469, 0.228, 0.549, 0.379]\nC: [0.432, 0.458, 0.816, 0.517]\nD: [0.019, 0.432, 0.448, 0.564]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_143_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_143_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.0, 0.161, 0.049, 0.542]\nB: [0.699, 0.242, 0.79, 0.568]\nC: [0.0, 0.099, 0.049, 0.479]\nD: [0.0, 0.101, 0.049, 0.482]", "question": "Here is an object ([0.0, 0.094, 0.1, 0.554]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.161, 0.049, 0.542]\nB: [0.699, 0.242, 0.79, 0.568]\nC: [0.0, 0.099, 0.049, 0.479]\nD: [0.0, 0.101, 0.049, 0.482]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_144_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_144_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.377, 0.336, 1.024, 0.835]\nB: [0.377, 0.336, 0.956, 0.956]\nC: [0.377, 0.336, 1.061, 1.003]\nD: [0.101, 0.381, 0.68, 1.0]", "question": "Here is an object ([0.433, 0.271, 0.981, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.377, 0.336, 1.024, 0.835]\nB: [0.377, 0.336, 0.956, 0.956]\nC: [0.377, 0.336, 1.061, 1.003]\nD: [0.101, 0.381, 0.68, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_145_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_145_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.364, 0.487, 0.55, 0.693]\nB: [0.364, 0.487, 0.529, 0.668]\nC: [0.273, 0.447, 0.459, 0.653]\nD: [0.378, 0.558, 0.564, 0.764]", "question": "Here is an object ([0.342, 0.415, 0.542, 0.607]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.364, 0.487, 0.55, 0.693]\nB: [0.364, 0.487, 0.529, 0.668]\nC: [0.273, 0.447, 0.459, 0.653]\nD: [0.378, 0.558, 0.564, 0.764]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_146_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_146_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.116, 0.26, 0.833, 0.936]\nB: [0.116, 0.26, 0.734, 1.0]\nC: [0.0, 0.26, 0.619, 1.0]\nD: [0.116, 0.626, 0.322, 0.66]", "question": "Here is an object ([0.113, 0.256, 0.725, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.116, 0.26, 0.833, 0.936]\nB: [0.116, 0.26, 0.734, 1.0]\nC: [0.0, 0.26, 0.619, 1.0]\nD: [0.116, 0.626, 0.322, 0.66]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_147_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_147_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.409, 0.407, 0.471, 0.524]\nB: [0.402, 0.449, 0.465, 0.565]\nC: [0.404, 0.357, 0.466, 0.474]\nD: [0.137, 0.357, 0.261, 0.697]", "question": "Here is an object ([0.479, 0.539, 0.527, 0.662]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.409, 0.407, 0.471, 0.524]\nB: [0.402, 0.449, 0.465, 0.565]\nC: [0.404, 0.357, 0.466, 0.474]\nD: [0.137, 0.357, 0.261, 0.697]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_148_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_148_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.453, 0.503, 0.507, 0.681]\nB: [0.128, 0.867, 0.552, 0.899]\nC: [0.276, 0.35, 0.747, 0.397]\nD: [0.453, 0.503, 0.503, 0.706]", "question": "Here is an object ([0.487, 0.506, 0.544, 0.672]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.453, 0.503, 0.507, 0.681]\nB: [0.128, 0.867, 0.552, 0.899]\nC: [0.276, 0.35, 0.747, 0.397]\nD: [0.453, 0.503, 0.503, 0.706]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_149_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_149_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.22, 0.242, 0.862, 0.635]\nB: [0.161, 0.114, 0.634, 0.354]\nC: [0.562, 0.422, 0.925, 0.835]\nD: [0.359, 0.388, 1.0, 0.781]", "question": "Here is an object ([0.209, 0.215, 0.863, 0.618]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.22, 0.242, 0.862, 0.635]\nB: [0.161, 0.114, 0.634, 0.354]\nC: [0.562, 0.422, 0.925, 0.835]\nD: [0.359, 0.388, 1.0, 0.781]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_150_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_150_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.285, 0.511, 1.0, 0.756]\nB: [0.606, 0.539, 0.62, 0.972]\nC: [0.22, 0.585, 0.935, 0.829]\nD: [0.285, 0.511, 1.085, 0.719]", "question": "Here is an object ([0.435, 0.412, 1.0, 0.749]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.285, 0.511, 1.0, 0.756]\nB: [0.606, 0.539, 0.62, 0.972]\nC: [0.22, 0.585, 0.935, 0.829]\nD: [0.285, 0.511, 1.085, 0.719]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_151_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_151_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.209, 0.343, 0.797, 0.886]\nB: [0.028, 0.369, 0.616, 0.912]\nC: [0.0, 0.146, 0.588, 0.689]\nD: [0.337, 0.056, 0.549, 0.196]", "question": "Here is an object ([0.021, 0.375, 0.605, 0.915]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.209, 0.343, 0.797, 0.886]\nB: [0.028, 0.369, 0.616, 0.912]\nC: [0.0, 0.146, 0.588, 0.689]\nD: [0.337, 0.056, 0.549, 0.196]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_152_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_152_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.36, 0.068, 0.727, 0.486]\nB: [0.048, 0.221, 0.545, 0.911]\nC: [0.116, 0.31, 0.613, 1.0]\nD: [0.116, 0.31, 0.68, 1.039]", "question": "Here is an object ([0.116, 0.312, 0.606, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.36, 0.068, 0.727, 0.486]\nB: [0.048, 0.221, 0.545, 0.911]\nC: [0.116, 0.31, 0.613, 1.0]\nD: [0.116, 0.31, 0.68, 1.039]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_153_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_153_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.251, 0.228, 1.0, 1.0]\nB: [0.0, 0.0, 0.749, 0.772]\nC: [0.0, 0.228, 0.749, 1.0]\nD: [0.0, 0.113, 0.749, 0.885]", "question": "Here is an object ([0.0, 0.119, 0.75, 0.885]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 960 and the height is 720.", "context": "Select from the following choices.\nA: [0.251, 0.228, 1.0, 1.0]\nB: [0.0, 0.0, 0.749, 0.772]\nC: [0.0, 0.228, 0.749, 1.0]\nD: [0.0, 0.113, 0.749, 0.885]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_154_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_154_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.084, 0.149, 0.391, 0.438]\nB: [0.0, 0.071, 0.836, 1.133]\nC: [0.0, 0.071, 0.905, 1.0]\nD: [0.095, 0.0, 1.0, 0.929]", "question": "Here is an object ([0.0, 0.001, 0.894, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.084, 0.149, 0.391, 0.438]\nB: [0.0, 0.071, 0.836, 1.133]\nC: [0.0, 0.071, 0.905, 1.0]\nD: [0.095, 0.0, 1.0, 0.929]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_155_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_155_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.066, 0.203, 0.189, 0.603]\nB: [0.204, 0.146, 0.611, 1.0]\nC: [0.03, 0.146, 0.437, 1.0]\nD: [0.03, 0.146, 0.445, 0.939]", "question": "Here is an object ([0.034, 0.21, 0.511, 1.0]) in the Image 1. 
Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.066, 0.203, 0.189, 0.603]\nB: [0.204, 0.146, 0.611, 1.0]\nC: [0.03, 0.146, 0.437, 1.0]\nD: [0.03, 0.146, 0.445, 0.939]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_156_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_156_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.3, 0.251, 0.613, 1.0]\nB: [0.39, 0.0, 0.712, 0.697]\nC: [0.708, 0.621, 0.739, 0.844]\nD: [0.39, 0.0, 0.703, 0.749]", "question": "Here is an object ([0.242, 0.0, 0.613, 0.656]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.3, 0.251, 0.613, 1.0]\nB: [0.39, 0.0, 0.712, 0.697]\nC: [0.708, 0.621, 0.739, 0.844]\nD: [0.39, 0.0, 0.703, 0.749]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_157_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_157_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.408, 0.212, 0.655, 0.739]\nB: [0.17, 0.383, 0.197, 0.639]\nC: [0.408, 0.212, 0.661, 0.754]\nD: [0.408, 0.212, 0.665, 0.856]", "question": "Here is an object ([0.403, 0.207, 0.651, 0.767]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.408, 0.212, 0.655, 0.739]\nB: [0.17, 0.383, 0.197, 0.639]\nC: [0.408, 0.212, 0.661, 0.754]\nD: [0.408, 0.212, 0.665, 0.856]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_158_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_158_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.594, 0.279, 0.91, 0.968]\nB: [0.486, 0.013, 0.765, 0.59]\nC: [0.446, 0.122, 0.805, 0.543]\nD: [0.594, 0.279, 0.872, 0.857]", "question": "Here is an object ([0.596, 0.289, 0.867, 0.853]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1270 and the height is 720.", "context": "Select from the following choices.\nA: [0.594, 0.279, 0.91, 0.968]\nB: [0.486, 0.013, 0.765, 0.59]\nC: [0.446, 0.122, 0.805, 0.543]\nD: [0.594, 0.279, 0.872, 0.857]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_159_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_159_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.002, 0.087, 0.223, 0.472]\nB: [0.194, 0.114, 0.683, 0.775]\nC: [0.069, 0.221, 0.233, 0.621]\nD: [0.179, 0.339, 0.668, 1.0]", "question": "Here is an object ([0.228, 0.0, 0.719, 0.607]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.002, 0.087, 0.223, 0.472]\nB: [0.194, 0.114, 0.683, 0.775]\nC: [0.069, 0.221, 0.233, 0.621]\nD: [0.179, 0.339, 0.668, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_160_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_160_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.459, 0.085, 0.476, 0.549]\nB: [0.248, 0.667, 0.747, 0.828]\nC: [0.512, 0.371, 0.652, 0.542]\nD: [0.512, 0.371, 0.626, 0.522]", "question": "Here is an object ([0.509, 0.357, 0.635, 0.535]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.459, 0.085, 0.476, 0.549]\nB: [0.248, 0.667, 0.747, 0.828]\nC: [0.512, 0.371, 0.652, 0.542]\nD: [0.512, 0.371, 0.626, 0.522]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_161_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_161_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.32, 0.046, 0.584, 0.879]\nB: [0.177, 0.0, 0.491, 0.917]\nC: [0.494, 0.643, 0.716, 0.814]\nD: [0.32, 0.046, 0.634, 0.963]", "question": "Here is an object ([0.324, 0.046, 0.635, 0.968]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.32, 0.046, 0.584, 0.879]\nB: [0.177, 0.0, 0.491, 0.917]\nC: [0.494, 0.643, 0.716, 0.814]\nD: [0.32, 0.046, 0.634, 0.963]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_162_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_162_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.43, 0.485, 0.783, 0.656]\nB: [0.502, 0.41, 0.579, 0.64]\nC: [0.463, 0.338, 0.54, 0.568]\nD: [0.488, 0.294, 0.566, 0.525]", "question": "Here is an object ([0.476, 0.335, 0.562, 0.568]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.43, 0.485, 0.783, 0.656]\nB: [0.502, 0.41, 0.579, 0.64]\nC: [0.463, 0.338, 0.54, 0.568]\nD: [0.488, 0.294, 0.566, 0.525]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_163_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_163_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.461, 0.357, 0.537, 0.714]\nB: [0.461, 0.357, 0.526, 0.771]\nC: [0.095, 0.572, 0.489, 0.808]\nD: [0.465, 0.401, 0.541, 0.758]", "question": "Here is an object ([0.466, 0.358, 0.545, 0.706]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.461, 0.357, 0.537, 0.714]\nB: [0.461, 0.357, 0.526, 0.771]\nC: [0.095, 0.572, 0.489, 0.808]\nD: [0.465, 0.401, 0.541, 0.758]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_164_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_164_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.442, 0.604, 0.598, 0.832]\nB: [0.49, 0.487, 0.658, 0.771]\nC: [0.442, 0.604, 0.61, 0.887]\nD: [0.409, 0.69, 0.577, 0.974]", "question": "Here is an object ([0.455, 0.621, 0.626, 0.886]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.442, 0.604, 0.598, 0.832]\nB: [0.49, 0.487, 0.658, 0.771]\nC: [0.442, 0.604, 0.61, 0.887]\nD: [0.409, 0.69, 0.577, 0.974]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_165_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_165_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.416, 0.133, 0.897, 0.514]\nB: [0.416, 0.133, 0.995, 0.537]\nC: [0.433, 0.497, 0.685, 0.806]\nD: [0.421, 0.0, 1.0, 0.404]", "question": "Here is an object ([0.436, 0.083, 0.995, 0.561]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 406 and the height is 720.", "context": "Select from the following choices.\nA: [0.416, 0.133, 0.897, 0.514]\nB: [0.416, 0.133, 0.995, 0.537]\nC: [0.433, 0.497, 0.685, 0.806]\nD: [0.421, 0.0, 1.0, 0.404]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_166_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_166_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.228, 0.108, 0.396, 0.479]\nB: [0.171, 0.0, 0.923, 0.742]\nC: [0.171, 0.093, 1.0, 0.824]\nD: [0.171, 0.0, 1.0, 0.731]", "question": "Here is an object ([0.165, 0.0, 1.0, 0.726]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.228, 0.108, 0.396, 0.479]\nB: [0.171, 0.0, 0.923, 0.742]\nC: [0.171, 0.093, 1.0, 0.824]\nD: [0.171, 0.0, 1.0, 0.731]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_167_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_167_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.074, 0.186, 0.488, 1.0]\nB: [0.058, 0.151, 0.472, 0.965]\nC: [0.159, 0.186, 0.639, 0.935]\nD: [0.159, 0.186, 0.573, 1.0]", "question": "Here is an object ([0.179, 0.022, 0.554, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.074, 0.186, 0.488, 1.0]\nB: [0.058, 0.151, 0.472, 0.965]\nC: [0.159, 0.186, 0.639, 0.935]\nD: [0.159, 0.186, 0.573, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_168_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_168_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.0, 0.287, 0.342, 0.665]\nB: [0.078, 0.428, 0.42, 0.806]\nC: [0.34, 0.412, 0.643, 0.438]\nD: [0.0, 0.287, 0.341, 0.682]", "question": "Here is an object ([0.0, 0.297, 0.397, 0.665]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.287, 0.342, 0.665]\nB: [0.078, 0.428, 0.42, 0.806]\nC: [0.34, 0.412, 0.643, 0.438]\nD: [0.0, 0.287, 0.341, 0.682]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_169_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_169_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.464, 0.276, 0.727, 1.0]\nB: [0.464, 0.276, 0.745, 0.993]\nC: [0.517, 0.276, 0.78, 1.0]\nD: [0.464, 0.276, 0.692, 0.875]", "question": "Here is an object ([0.455, 0.276, 0.688, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.464, 0.276, 0.727, 1.0]\nB: [0.464, 0.276, 0.745, 0.993]\nC: [0.517, 0.276, 0.78, 1.0]\nD: [0.464, 0.276, 0.692, 0.875]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_170_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_170_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.534, 0.237, 0.687, 0.515]\nB: [0.534, 0.237, 0.662, 0.522]\nC: [0.534, 0.237, 0.641, 0.497]\nD: [0.499, 0.261, 0.628, 0.546]", "question": "Here is an object ([0.58, 0.235, 0.755, 0.518]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.534, 0.237, 0.687, 0.515]\nB: [0.534, 0.237, 0.662, 0.522]\nC: [0.534, 0.237, 0.641, 0.497]\nD: [0.499, 0.261, 0.628, 0.546]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_171_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_171_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.081, 0.196, 0.637, 1.131]\nB: [0.081, 0.196, 0.748, 1.113]\nC: [0.081, 0.196, 0.658, 0.994]\nD: [0.611, 0.761, 0.737, 0.843]", "question": "Here is an object ([0.136, 0.15, 0.672, 0.881]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.081, 0.196, 0.637, 1.131]\nB: [0.081, 0.196, 0.748, 1.113]\nC: [0.081, 0.196, 0.658, 0.994]\nD: [0.611, 0.761, 0.737, 0.843]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_172_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_172_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.23, 0.069, 0.931, 1.0]\nB: [0.23, 0.069, 0.792, 1.121]\nC: [0.218, 0.069, 0.919, 1.0]\nD: [0.457, 0.265, 0.69, 0.581]", "question": "Here is an object ([0.231, 0.124, 0.86, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.23, 0.069, 0.931, 1.0]\nB: [0.23, 0.069, 0.792, 1.121]\nC: [0.218, 0.069, 0.919, 1.0]\nD: [0.457, 0.265, 0.69, 0.581]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_173_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_173_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.159, 0.225, 0.294, 0.533]\nB: [0.218, 0.453, 0.636, 0.631]\nC: [0.292, 0.406, 0.459, 0.643]\nD: [0.292, 0.406, 0.456, 0.7]", "question": "Here is an object ([0.29, 0.426, 0.471, 0.7]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.159, 0.225, 0.294, 0.533]\nB: [0.218, 0.453, 0.636, 0.631]\nC: [0.292, 0.406, 0.459, 0.643]\nD: [0.292, 0.406, 0.456, 0.7]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_174_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_174_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.143, 0.454, 0.249, 0.654]\nB: [0.077, 0.669, 0.136, 0.985]\nC: [0.145, 0.525, 0.252, 0.725]\nD: [0.143, 0.454, 0.266, 0.657]", "question": "Here is an object ([0.12, 0.461, 0.237, 0.653]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.143, 0.454, 0.249, 0.654]\nB: [0.077, 0.669, 0.136, 0.985]\nC: [0.145, 0.525, 0.252, 0.725]\nD: [0.143, 0.454, 0.266, 0.657]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_175_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_175_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.602, 0.0, 0.946, 0.739]\nB: [0.468, 0.376, 0.48, 0.842]\nC: [0.44, 0.261, 0.783, 1.0]\nD: [0.393, 0.261, 0.736, 1.0]", "question": "Here is an object ([0.446, 0.211, 0.622, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 960 and the height is 720.", "context": "Select from the following choices.\nA: [0.602, 0.0, 0.946, 0.739]\nB: [0.468, 0.376, 0.48, 0.842]\nC: [0.44, 0.261, 0.783, 1.0]\nD: [0.393, 0.261, 0.736, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_176_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_176_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.0, 0.001, 0.722, 1.126]\nB: [0.0, 0.001, 0.598, 1.193]\nC: [0.0, 0.001, 0.724, 0.999]\nD: [0.0, 0.001, 0.738, 1.196]", "question": "Here is an object ([0.0, 0.0, 0.755, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.001, 0.722, 1.126]\nB: [0.0, 0.001, 0.598, 1.193]\nC: [0.0, 0.001, 0.724, 0.999]\nD: [0.0, 0.001, 0.738, 1.196]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_177_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_177_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.183, 0.761, 0.235, 0.919]\nB: [0.683, 0.257, 0.857, 0.718]\nC: [0.351, 0.0, 1.0, 1.0]\nD: [0.351, 0.0, 0.877, 0.803]", "question": "Here is an object ([0.313, 0.0, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.183, 0.761, 0.235, 0.919]\nB: [0.683, 0.257, 0.857, 0.718]\nC: [0.351, 0.0, 1.0, 1.0]\nD: [0.351, 0.0, 0.877, 0.803]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_178_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_178_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.334, 0.014, 1.103, 1.108]\nB: [0.2, 0.281, 0.454, 0.629]\nC: [0.334, 0.014, 0.926, 0.993]\nD: [0.334, 0.014, 1.0, 1.0]", "question": "Here is an object ([0.235, 0.001, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.334, 0.014, 1.103, 1.108]\nB: [0.2, 0.281, 0.454, 0.629]\nC: [0.334, 0.014, 0.926, 0.993]\nD: [0.334, 0.014, 1.0, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_179_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_179_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.356, 0.011, 0.83, 0.357]\nB: [0.183, 0.207, 0.581, 1.011]\nC: [0.183, 0.207, 0.68, 0.996]\nD: [0.183, 0.207, 0.616, 1.11]", "question": "Here is an object ([0.211, 0.165, 0.67, 0.982]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.356, 0.011, 0.83, 0.357]\nB: [0.183, 0.207, 0.581, 1.011]\nC: [0.183, 0.207, 0.68, 0.996]\nD: [0.183, 0.207, 0.616, 1.11]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_180_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_180_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.479, 0.108, 0.884, 0.665]\nB: [0.552, 0.097, 0.956, 0.654]\nC: [0.317, 0.204, 0.722, 0.761]\nD: [0.479, 0.108, 0.859, 0.699]", "question": "Here is an object ([0.457, 0.218, 0.777, 0.725]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.479, 0.108, 0.884, 0.665]\nB: [0.552, 0.097, 0.956, 0.654]\nC: [0.317, 0.204, 0.722, 0.761]\nD: [0.479, 0.108, 0.859, 0.699]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_181_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_181_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.603, 0.522, 0.715, 0.79]\nB: [0.531, 0.461, 0.641, 0.671]\nC: [0.523, 0.396, 0.632, 0.606]\nD: [0.537, 0.519, 0.702, 0.668]", "question": "Here is an object ([0.584, 0.392, 0.634, 0.551]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.603, 0.522, 0.715, 0.79]\nB: [0.531, 0.461, 0.641, 0.671]\nC: [0.523, 0.396, 0.632, 0.606]\nD: [0.537, 0.519, 0.702, 0.668]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_182_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_182_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.0, 0.013, 0.871, 1.0]\nB: [0.129, 0.013, 1.047, 0.982]\nC: [0.696, 0.489, 0.793, 0.943]\nD: [0.129, 0.013, 1.0, 1.0]", "question": "Here is an object ([0.059, 0.0, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.013, 0.871, 1.0]\nB: [0.129, 0.013, 1.047, 0.982]\nC: [0.696, 0.489, 0.793, 0.943]\nD: [0.129, 0.013, 1.0, 1.0]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_183_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_183_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.42, 0.268, 0.625, 0.971]\nB: [0.42, 0.268, 0.636, 0.778]\nC: [0.42, 0.268, 0.66, 0.865]\nD: [0.42, 0.268, 0.639, 0.919]", "question": "Here is an object ([0.411, 0.272, 0.654, 0.865]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 960 and the height is 720.", "context": "Select from the following choices.\nA: [0.42, 0.268, 0.625, 0.971]\nB: [0.42, 0.268, 0.636, 0.778]\nC: [0.42, 0.268, 0.66, 0.865]\nD: [0.42, 0.268, 0.639, 0.919]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_184_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_184_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.174, 0.0, 0.691, 0.558]\nB: [0.483, 0.21, 1.0, 0.768]\nC: [0.382, 0.046, 0.899, 0.604]\nD: [0.432, 0.364, 0.76, 0.779]", "question": "Here is an object ([0.384, 0.018, 0.968, 0.479]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.174, 0.0, 0.691, 0.558]\nB: [0.483, 0.21, 1.0, 0.768]\nC: [0.382, 0.046, 0.899, 0.604]\nD: [0.432, 0.364, 0.76, 0.779]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_185_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_185_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.595, 0.536, 0.865, 0.782]\nB: [0.595, 0.536, 0.829, 0.744]\nC: [0.074, 0.478, 0.275, 0.861]\nD: [0.059, 0.325, 0.287, 0.339]", "question": "Here is an object ([0.487, 0.554, 0.705, 0.758]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.595, 0.536, 0.865, 0.782]\nB: [0.595, 0.536, 0.829, 0.744]\nC: [0.074, 0.478, 0.275, 0.861]\nD: [0.059, 0.325, 0.287, 0.339]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_186_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_186_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.384, 0.524, 0.459, 0.842]\nB: [0.0, 0.0, 0.77, 0.999]\nC: [0.126, 0.49, 0.423, 0.603]\nD: [0.0, 0.0, 0.685, 0.894]", "question": "Here is an object ([0.0, 0.0, 0.784, 0.999]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.384, 0.524, 0.459, 0.842]\nB: [0.0, 0.0, 0.77, 0.999]\nC: [0.126, 0.49, 0.423, 0.603]\nD: [0.0, 0.0, 0.685, 0.894]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_187_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_187_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.0, 0.046, 0.939, 0.84]\nB: [0.057, 0.29, 0.25, 0.646]\nC: [0.578, 0.11, 0.852, 0.163]\nD: [0.0, 0.046, 0.89, 0.84]", "question": "Here is an object ([0.0, 0.001, 0.961, 0.874]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.046, 0.939, 0.84]\nB: [0.057, 0.29, 0.25, 0.646]\nC: [0.578, 0.11, 0.852, 0.163]\nD: [0.0, 0.046, 0.89, 0.84]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_188_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_188_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.334, 0.31, 0.566, 0.938]\nB: [0.275, 0.312, 0.504, 1.0]\nC: [0.334, 0.31, 0.563, 0.997]\nD: [0.591, 0.644, 0.888, 0.765]", "question": "Here is an object ([0.262, 0.143, 0.509, 0.997]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.334, 0.31, 0.566, 0.938]\nB: [0.275, 0.312, 0.504, 1.0]\nC: [0.334, 0.31, 0.563, 0.997]\nD: [0.591, 0.644, 0.888, 0.765]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_189_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_189_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.0, 0.565, 0.12, 0.9]\nB: [0.0, 0.565, 0.126, 0.917]\nC: [0.055, 0.589, 0.181, 0.94]\nD: [0.825, 0.094, 0.94, 0.535]", "question": "Here is an object ([0.0, 0.05, 1.0, 0.86]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.0, 0.565, 0.12, 0.9]\nB: [0.0, 0.565, 0.126, 0.917]\nC: [0.055, 0.589, 0.181, 0.94]\nD: [0.825, 0.094, 0.94, 0.535]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_190_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_190_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.644, 0.44, 0.805, 0.861]\nB: [0.587, 0.544, 0.748, 0.965]\nC: [0.644, 0.44, 0.811, 0.821]\nD: [0.644, 0.44, 0.801, 0.908]", "question": "Here is an object ([0.572, 0.41, 0.747, 0.842]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.644, 0.44, 0.805, 0.861]\nB: [0.587, 0.544, 0.748, 0.965]\nC: [0.644, 0.44, 0.811, 0.821]\nD: [0.644, 0.44, 0.801, 0.908]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_191_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_191_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.6, 0.292, 0.677, 0.412]\nB: [0.747, 0.479, 0.991, 1.056]\nC: [0.747, 0.479, 1.0, 1.0]\nD: [0.042, 0.16, 0.117, 0.547]", "question": "Here is an object ([0.755, 0.472, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.6, 0.292, 0.677, 0.412]\nB: [0.747, 0.479, 0.991, 1.056]\nC: [0.747, 0.479, 1.0, 1.0]\nD: [0.042, 0.16, 0.117, 0.547]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_192_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_192_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.268, 0.356, 0.665, 1.0]\nB: [0.384, 0.329, 0.781, 0.974]\nC: [0.5, 0.258, 0.897, 0.903]\nD: [0.466, 0.153, 0.863, 0.797]", "question": "Here is an object ([0.386, 0.329, 0.791, 0.968]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.268, 0.356, 0.665, 1.0]\nB: [0.384, 0.329, 0.781, 0.974]\nC: [0.5, 0.258, 0.897, 0.903]\nD: [0.466, 0.153, 0.863, 0.797]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_193_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_193_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.243, 0.956, 0.284, 0.975]\nB: [0.382, 0.301, 0.875, 0.646]\nC: [0.382, 0.301, 1.019, 0.606]\nD: [0.382, 0.301, 0.919, 0.646]", "question": "Here is an object ([0.411, 0.268, 0.728, 0.903]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 960 and the height is 720.", "context": "Select from the following choices.\nA: [0.243, 0.956, 0.284, 0.975]\nB: [0.382, 0.301, 0.875, 0.646]\nC: [0.382, 0.301, 1.019, 0.606]\nD: [0.382, 0.301, 0.919, 0.646]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_194_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_194_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.388, 0.347, 0.992, 0.839]\nB: [0.388, 0.347, 0.977, 0.91]\nC: [0.477, 0.579, 0.912, 0.9]\nD: [0.388, 0.347, 1.089, 0.938]", "question": "Here is an object ([0.386, 0.367, 0.98, 0.921]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.388, 0.347, 0.992, 0.839]\nB: [0.388, 0.347, 0.977, 0.91]\nC: [0.477, 0.579, 0.912, 0.9]\nD: [0.388, 0.347, 1.089, 0.938]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_195_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_195_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.228, 0.114, 0.254, 0.601]\nB: [0.327, 0.138, 1.0, 1.0]\nC: [0.327, 0.138, 1.021, 0.939]\nD: [0.327, 0.0, 1.0, 0.863]", "question": "Here is an object ([0.332, 0.122, 1.0, 1.0]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.228, 0.114, 0.254, 0.601]\nB: [0.327, 0.138, 1.0, 1.0]\nC: [0.327, 0.138, 1.021, 0.939]\nD: [0.327, 0.0, 1.0, 0.863]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_196_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_196_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.426, 0.447, 0.531, 0.756]\nB: [0.426, 0.447, 0.534, 0.776]\nC: [0.426, 0.447, 0.53, 0.783]\nD: [0.867, 0.138, 0.923, 0.214]", "question": "Here is an object ([0.431, 0.433, 0.585, 0.769]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.426, 0.447, 0.531, 0.756]\nB: [0.426, 0.447, 0.534, 0.776]\nC: [0.426, 0.447, 0.53, 0.783]\nD: [0.867, 0.138, 0.923, 0.214]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_197_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_197_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "youtubevis2019_sot", "options": "A: [0.341, 0.492, 0.753, 0.8]\nB: [0.152, 0.436, 0.563, 0.744]\nC: [0.168, 0.04, 0.502, 0.061]\nD: [0.593, 0.619, 0.761, 0.656]", "question": "Here is an object ([0.366, 0.504, 0.786, 0.806]) in the Image 1. Please give the coordinations of this object in the Image 2. 
The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.341, 0.492, 0.753, 0.8]\nB: [0.152, 0.436, 0.563, 0.744]\nC: [0.168, 0.04, 0.502, 0.061]\nD: [0.593, 0.619, 0.761, 0.656]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_198_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_198_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "single_object_tracking", "visual_input_component": "['synthetic_image']", "source": "ovis_sot", "options": "A: [0.049, 0.143, 0.895, 0.719]\nB: [0.049, 0.143, 0.806, 0.606]\nC: [0.049, 0.143, 0.788, 0.667]\nD: [0.246, 0.05, 0.512, 0.212]", "question": "Here is an object ([0.056, 0.144, 0.791, 0.665]) in the Image 1. Please give the coordinations of this object in the Image 2. The bounding box coordinates are in the format [x1, y1, x2, y2], where [x1, y1] are the top-left coordinates and [x2, y2] are the bottom-right coordinates of the target object's bounding box. 
Note that the width of the input RGB image is 1280 and the height is 720.", "context": "Select from the following choices.\nA: [0.049, 0.143, 0.895, 0.719]\nB: [0.049, 0.143, 0.806, 0.606]\nC: [0.049, 0.143, 0.788, 0.667]\nD: [0.246, 0.05, 0.512, 0.212]", "input_image_path": ["./2D-spatial/single_object_tracking/single_object_tracking_199_0.jpg", "./2D-spatial/single_object_tracking/single_object_tracking_199_1.jpg"], "output": "C", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/sketch2image_retrieval/qwen3-vl/metadata_info.json b/results/sketch2image_retrieval/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..2334c0d --- /dev/null +++ b/results/sketch2image_retrieval/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_0_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_0_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_0_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_0_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: 
The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_1_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_1_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_1_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_1_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_2_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_2_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_2_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_2_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_3_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_3_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_3_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_3_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_4_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_4_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_4_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_4_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_5_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_5_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_5_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_5_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", 
"source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_6_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_6_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_6_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_6_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_7_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_7_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_7_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_7_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second 
image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_8_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_8_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_8_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_8_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_9_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_9_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_9_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_9_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_10_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_10_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_10_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_10_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_11_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_11_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_11_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_11_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_12_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_12_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_12_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_12_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 
'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_13_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_13_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_13_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_13_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_14_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_14_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_14_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_14_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_15_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_15_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_15_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_15_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_16_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_16_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_16_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_16_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_17_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_17_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_17_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_17_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_18_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_18_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_18_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_18_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_19_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_19_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_19_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_19_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 
'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_20_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_20_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_20_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_20_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_21_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_21_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_21_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_21_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_22_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_22_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_22_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_22_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_23_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_23_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_23_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_23_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_24_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_24_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_24_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_24_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_25_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_25_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_25_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_25_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_26_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_26_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_26_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_26_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 
'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_27_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_27_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_27_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_27_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_28_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_28_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_28_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_28_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_29_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_29_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_29_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_29_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_30_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_30_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_30_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_30_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_31_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_31_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_31_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_31_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_32_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_32_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_32_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_32_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_33_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_33_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_33_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_33_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 
'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_34_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_34_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_34_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_34_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_35_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_35_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_35_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_35_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_36_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_36_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_36_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_36_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_37_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_37_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_37_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_37_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_38_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_38_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_38_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_38_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_39_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_39_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_39_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_39_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_40_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_40_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_40_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_40_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 
'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_41_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_41_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_41_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_41_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_42_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_42_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_42_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_42_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_43_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_43_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_43_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_43_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_44_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_44_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_44_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_44_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_45_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_45_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_45_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_45_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_46_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_46_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_46_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_46_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_47_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_47_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_47_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_47_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 
'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_48_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_48_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_48_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_48_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_49_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_49_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_49_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_49_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_50_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_50_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_50_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_50_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_51_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_51_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_51_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_51_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_52_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_52_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_52_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_52_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_53_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_53_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_53_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_53_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_54_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_54_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_54_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_54_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 
'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_55_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_55_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_55_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_55_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_56_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_56_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_56_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_56_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_57_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_57_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_57_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_57_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_58_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_58_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_58_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_58_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_59_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_59_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_59_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_59_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_60_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_60_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_60_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_60_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_61_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_61_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_61_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_61_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 
'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_62_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_62_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_62_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_62_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_63_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_63_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_63_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_63_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_64_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_64_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_64_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_64_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_65_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_65_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_65_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_65_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_66_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_66_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_66_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_66_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_67_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_67_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_67_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_67_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_68_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_68_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_68_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_68_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 
'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_69_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_69_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_69_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_69_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_70_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_70_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_70_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_70_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_71_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_71_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_71_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_71_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_72_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_72_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_72_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_72_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_73_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_73_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_73_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_73_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_74_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_74_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_74_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_74_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_75_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_75_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_75_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_75_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 
'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_76_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_76_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_76_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_76_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_77_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_77_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_77_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_77_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_78_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_78_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_78_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_78_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_79_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_79_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_79_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_79_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_80_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_80_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_80_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_80_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_81_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_81_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_81_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_81_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_82_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_82_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_82_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_82_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 
'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_83_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_83_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_83_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_83_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_84_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_84_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_84_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_84_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_85_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_85_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_85_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_85_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_86_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_86_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_86_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_86_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_87_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_87_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_87_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_87_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_88_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_88_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_88_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_88_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_89_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_89_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_89_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_89_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 
'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_90_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_90_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_90_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_90_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_91_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_91_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_91_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_91_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_92_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_92_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_92_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_92_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_93_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_93_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_93_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_93_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_94_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_94_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_94_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_94_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_95_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_95_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_95_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_95_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_96_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_96_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_96_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_96_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 
'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_97_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_97_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_97_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_97_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_98_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_98_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_98_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_98_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following 
choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_99_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_99_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_99_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_99_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_100_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_100_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_100_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_100_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_101_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_101_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_101_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_101_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_102_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_102_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_102_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_102_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_103_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_103_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_103_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_103_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": 
"['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_104_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_104_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_104_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_104_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_105_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_105_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_105_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_105_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_106_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_106_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_106_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_106_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_107_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_107_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_107_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_107_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_108_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_108_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_108_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_108_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_109_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_109_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_109_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_109_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_110_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_110_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_110_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_110_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": 
"['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_111_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_111_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_111_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_111_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_112_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_112_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_112_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_112_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_113_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_113_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_113_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_113_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_114_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_114_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_114_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_114_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_115_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_115_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_115_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_115_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_116_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_116_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_116_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_116_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_117_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_117_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_117_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_117_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": 
"['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_118_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_118_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_118_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_118_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_119_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_119_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_119_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_119_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_120_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_120_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_120_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_120_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_121_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_121_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_121_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_121_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_122_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_122_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_122_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_122_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_123_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_123_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_123_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_123_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_124_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_124_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_124_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_124_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": 
"['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_125_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_125_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_125_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_125_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_126_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_126_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_126_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_126_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_127_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_127_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_127_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_127_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_128_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_128_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_128_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_128_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_129_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_129_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_129_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_129_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_130_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_130_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_130_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_130_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_131_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_131_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_131_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_131_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": 
"['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_132_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_132_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_132_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_132_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_133_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_133_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_133_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_133_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_134_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_134_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_134_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_134_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_135_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_135_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_135_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_135_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_136_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_136_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_136_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_136_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_137_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_137_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_137_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_137_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_138_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_138_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_138_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_138_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": 
"['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_139_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_139_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_139_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_139_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_140_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_140_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_140_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_140_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_141_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_141_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_141_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_141_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_142_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_142_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_142_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_142_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_143_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_143_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_143_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_143_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_144_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_144_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_144_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_144_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_145_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_145_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_145_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_145_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": 
"['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_146_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_146_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_146_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_146_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_147_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_147_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_147_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_147_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_148_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_148_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_148_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_148_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_149_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_149_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_149_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_149_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_150_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_150_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_150_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_150_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_151_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_151_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_151_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_151_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_152_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_152_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_152_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_152_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": 
"['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_153_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_153_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_153_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_153_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_154_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_154_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_154_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_154_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_155_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_155_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_155_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_155_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_156_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_156_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_156_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_156_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_157_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_157_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_157_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_157_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_158_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_158_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_158_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_158_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_159_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_159_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_159_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_159_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": 
"['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_160_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_160_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_160_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_160_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_161_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_161_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_161_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_161_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_162_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_162_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_162_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_162_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_163_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_163_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_163_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_163_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_164_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_164_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_164_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_164_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_165_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_165_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_165_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_165_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_166_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_166_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_166_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_166_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": 
"['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_167_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_167_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_167_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_167_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_168_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_168_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_168_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_168_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_169_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_169_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_169_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_169_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_170_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_170_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_170_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_170_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_171_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_171_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_171_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_171_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_172_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_172_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_172_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_172_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_173_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_173_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_173_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_173_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": 
"['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_174_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_174_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_174_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_174_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_175_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_175_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_175_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_175_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_176_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_176_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_176_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_176_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_177_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_177_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_177_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_177_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_178_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_178_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_178_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_178_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_179_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_179_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_179_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_179_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_180_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_180_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_180_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_180_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": 
"['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_181_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_181_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_181_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_181_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_182_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_182_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_182_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_182_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_183_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_183_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_183_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_183_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_184_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_184_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_184_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_184_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_185_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_185_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_185_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_185_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_186_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_186_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_186_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_186_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_187_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_187_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_187_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_187_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": 
"['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_188_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_188_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_188_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_188_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_189_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_189_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_189_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_189_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_190_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_190_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_190_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_190_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_191_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_191_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_191_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_191_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_192_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_192_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_192_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_192_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_193_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_193_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_193_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_193_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_194_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_194_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_194_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_194_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": 
"['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_195_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_195_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_195_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_195_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "quickdraw_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_196_0.png", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_196_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_196_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_196_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": 
"Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_197_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_197_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_197_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_197_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_198_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_198_1.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_198_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_198_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "sketch2image_retrieval", "visual_input_component": "['natural_image', 'sketch_image']", "source": "DomainNet_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image", "question": "Please retrieve the most similar image to the Query Image in the candidate Images.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image", "input_image_path": ["./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_199_0.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_199_1.jpg", 
"./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_199_2.jpg", "./High-level-obj-semantic/sketch2image_retrieval/sketch2image_retrieval_199_3.jpg"], "output": "C", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/spot_the_diff/qwen3-vl/metadata_info.json b/results/spot_the_diff/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..8038220 --- /dev/null +++ b/results/spot_the_diff/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the guy is no longer by his car he is further towards the middle of the parking lot\nB: The man is now riding a bicycle in the park.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: the guy is no longer by his car he is further towards the middle of the parking lot\nB: The man is now riding a bicycle in the park.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_0_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_0_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the 2 people on the sidewalk are gone\nB: A colorful mural of a cityscape fills the entire wall, with no people in sight.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: the 2 people on the sidewalk are gone\nB: A colorful mural of a cityscape fills the entire wall, with no people in sight.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_1_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_1_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: Empty streets with no signs of activity\nB: less cars parked", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: Empty streets with no signs of activity\nB: less cars parked", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_2_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_2_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of friends enjoying a picnic in the park\nB: the silver car left the parking lot", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: A group of friends enjoying a picnic in the park\nB: the silver car left the parking lot", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_3_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_3_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: two pepole standing by building is no longer there\nB: A colorful hot air balloon floating in the sky over a peaceful lake with a small cabin on the shore.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: two pepole standing by building is no longer there\nB: A colorful hot air balloon floating in the sky over a peaceful lake with a small cabin on the shore.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_4_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_4_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The second picture features a diverse collection of plants from various climates and regions.\nB: there are more people in the second picture", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The second picture features a diverse collection of plants from various climates and regions.\nB: there are more people in the second picture", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_5_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_5_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there are now two people by the yellow poles\nB: A colorful parrot sits on a tree branch, preening its feathers.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: there are now two people by the yellow poles\nB: A colorful parrot sits on a tree branch, preening its feathers.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_6_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_6_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there s one more car in the group of cars on the far left of the parking lot and also in the group in the center of the lot\nB: A flock of birds is flying in the clear blue sky above the parking lot", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there s one more car in the group of cars on the far left of the parking lot and also in the group in the center of the lot\nB: A flock of birds is flying in the clear blue sky above the parking lot", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_7_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_7_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The images appear to be in motion due to the blur effect, creating a sense of dynamism and energy.\nB: there are some people in the right image", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The images appear to be in motion due to the blur effect, creating a sense of dynamism and energy.\nB: there are some people in the right image", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_8_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_8_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The color of the sky is a vibrant shade of purple, and the ground is covered in a shimmering layer of silver dust.\nB: there are 6 people on the right side and none on the left", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The color of the sky is a vibrant shade of purple, and the ground is covered in a shimmering layer of silver dust.\nB: there are 6 people on the right side and none on the left", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_9_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_9_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The people in the left picture are wearing hats.\nB: there are four more people in the right picture", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The people in the left picture are wearing hats.\nB: there are four more people in the right picture", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_10_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_10_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: Both images show a beautiful landscape with colorful flowers and lush greenery. In the distance, a majestic mountain range rises against the horizon, creating a stunning backdrop for the scene.\nB: there are more people in the one on the right than on the left", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: Both images show a beautiful landscape with colorful flowers and lush greenery. 
In the distance, a majestic mountain range rises against the horizon, creating a stunning backdrop for the scene.\nB: there are more people in the one on the right than on the left", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_11_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_11_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The images show a difference in the weather between them.\nB: there are less people in the after image", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The images show a difference in the weather between them.\nB: there are less people in the after image", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_12_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_12_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The road is now empty with no signs of any vehicles or people.\nB: a person on a motor cycle is no longer in the image", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The road is now empty with no signs of any vehicles or people.\nB: a person on a motor cycle is no longer in the image", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_13_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_13_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: four people have appeared on the picture\nB: The image shows a colorful landscape with a beautiful sunset.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: four people have appeared on the picture\nB: The image shows a colorful landscape with a beautiful sunset.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_14_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_14_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The colors in the first image sure are vibrant! I love how the light hits the scenery.\nB: the picture on the right has less people than the one on the left", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The colors in the first image sure are vibrant! 
I love how the light hits the scenery.\nB: the picture on the right has less people than the one on the left", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_15_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_15_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The colorful mural on the wall seems to come to life as the vibrant hues blend together in a mesmerizing dance.\nB: the people on the stairs are closer", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The colorful mural on the wall seems to come to life as the vibrant hues blend together in a mesmerizing dance.\nB: the people on the stairs are closer", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_16_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_16_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: more pedestrians visiable\nB: A group of colorful hot air balloons soaring through the sky", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: more pedestrians visiable\nB: A group of colorful hot air balloons soaring through the sky", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_17_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_17_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there was six people standing around and now there are two\nB: The room was brightly lit with colorful decorations and lively music playing in the background.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: there was six people standing around and now there are two\nB: The room was brightly lit with colorful decorations and lively music playing in the background.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_18_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_18_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: no noticeable differences\nB: A red apple sits on a wooden table, next to a stack of books.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: no noticeable differences\nB: A red apple sits on a wooden table, next to a stack of books.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_19_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_19_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A colorful meadow with a blue sky and fluffy clouds.\nB: there is no difference", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A colorful meadow with a blue sky and fluffy clouds.\nB: there is no difference", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_20_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_20_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: no noticeable changes have occurred\nB: An interesting juxtaposition of colors and shadows is evident.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: no noticeable changes have occurred\nB: An interesting juxtaposition of colors and shadows is evident.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_21_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_21_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the white car is no longer on the road adjacent to the car parking lot\nB: A group of people are playing on a sunny beach near the ocean.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: the white car is no longer on the road adjacent to the car parking lot\nB: A group of people are playing on a sunny beach near the ocean.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_22_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_22_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A cat is sitting on a tree branch\nB: there are two people standing in the street", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: A cat is sitting on a tree branch\nB: there are two people standing in the street", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_23_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_23_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A small red house sits on top of a hill with a beautiful sunset in the background.\nB: the after image contains two individuals standing near the yellow poles in the upper left hand quadrant of the image", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A small red house sits on top of a hill with a beautiful sunset in the background.\nB: the after image contains two individuals standing near the yellow poles in the upper left hand quadrant of the image", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_24_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_24_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the left picture has people standing around talking\nB: The left picture shows a colorful array of geometric shapes floating in mid-air.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: the left picture has people standing around talking\nB: The left picture shows a colorful array of geometric shapes floating in mid-air.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_25_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_25_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The sky is filled with colorful hot air balloons.\nB: the man is road", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The sky is filled with colorful hot air balloons.\nB: the man is road", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_26_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_26_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: more cars in picture\nB: A group of people playing frisbee in a park.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: more cars in picture\nB: A group of people playing frisbee in a park.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_27_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_27_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: one person has left the image from the before to after image\nB: The color blue dominates the scene, with various shades blending together in an abstract pattern.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: one person has left the image from the before to after image\nB: The color blue dominates the scene, with various shades blending together in an abstract pattern.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_28_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_28_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: all the people are new\nB: The room is decorated with vintage furniture and colorful murals.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: all the people are new\nB: The room is decorated with vintage furniture and colorful murals.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_29_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_29_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The left image features a beautiful sunset, while the right image captures a bustling city at night.\nB: there are only 3 people in the left image but the right image has 7 people", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The left image features a beautiful sunset, while the right image captures a bustling city at night.\nB: there are only 3 people in the left image but the right image has 7 people", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_30_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_30_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there are less pedestrians\nB: The buildings in the background have a unique architecture that stands out in the cityscape.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there are less pedestrians\nB: The buildings in the background have a unique architecture that stands out in the cityscape.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_31_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_31_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The blue car is now the only one in the parking lot\nB: the red car is no longer there", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The blue car is now the only one in the parking lot\nB: the red car is no longer there", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_32_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_32_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the group of people has moved to the left\nB: A colorful mural adorns the side of the building, depicting various abstract shapes and patterns.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: the group of people has moved to the left\nB: A colorful mural adorns the side of the building, depicting various abstract shapes and patterns.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_33_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_33_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the white truck is not there anumore\nB: A group of birds is flying in the clear blue sky.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: the white truck is not there anumore\nB: A group of birds is flying in the clear blue sky.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_34_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_34_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The clouds overhead resemble an elaborate maze, with no clear path to navigate.\nB: the people in the lot have only moved slightly", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The clouds overhead resemble an elaborate maze, with no clear path to navigate.\nB: the people in the lot have only moved slightly", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_35_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_35_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: nothing in the second shot appears to have changed\nB: The second image features a completely different setting and subject matter than the first image", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: nothing in the second shot appears to have changed\nB: The second image features a completely different setting and subject matter than the first image", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_36_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_36_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of people are enjoying a picnic in a park\nB: the van has backed out the parking space it was in", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: A group of people are enjoying a picnic in a park\nB: the van has backed out the parking space it was in", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_37_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_37_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the white car is no longer there\nB: An unexpected gathering of birds occurred in the vicinity.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: the white car is no longer there\nB: An unexpected gathering of birds occurred in the vicinity.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_38_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_38_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The van seems to be covered in graffiti and shows signs of wear and tear, while a squirrel is perched on top of it.\nB: there appears to be a person standing next to the van which is parked closest to the building", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The van seems to be covered in graffiti and shows signs of wear and tear, while a squirrel is perched on top of it.\nB: there appears to be a person standing next to the van which is parked closest to the building", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_39_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_39_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the group of people have changed places in the circle\nB: The sun is setting, casting an orange glow across the landscape.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: the group of people have changed places in the circle\nB: The sun is setting, casting an orange glow across the landscape.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_40_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_40_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of birds is flying over the lake between the trees.\nB: there are two fewer people standing near the nellow poles", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: A group of birds is flying over the lake between the trees.\nB: there are two fewer people standing near the nellow poles", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_41_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_41_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: people are in the photo in after\nB: The photo features a vibrant sunset over a peaceful lake with a lone boat drifting across the water.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: people are in the photo in after\nB: The photo features a vibrant sunset over a peaceful lake with a lone boat drifting across the water.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_42_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_42_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the two people are no longer there near the dividers\nB: The empty chairs overlook the vast desert landscape.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: the two people are no longer there near the dividers\nB: The empty chairs overlook the vast desert landscape.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_43_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_43_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there are 6 people rather than 2 in the drive\nB: The picture shows a scenic mountain landscape with a lake, instead of an urban cityscape.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: there are 6 people rather than 2 in the drive\nB: The picture shows a scenic mountain landscape with a lake, instead of an urban cityscape.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_44_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_44_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the left image has two people in it and the right image has the top of one person in it\nB: The left image features a beautiful sunset over the ocean, while the right image shows a busy city street with tall buildings.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: the left image has two people in it and the right image has the top of one person in it\nB: The left image features a beautiful sunset over the ocean, while the right image shows a busy city street with tall buildings.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_45_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_45_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The birds are gathering on the roof to form a plan for world domination.\nB: a truck in the corner has dissapeared", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The birds are gathering on the roof to form a plan for world domination.\nB: a truck in the corner has dissapeared", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_46_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_46_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of friends enjoying a picnic in the park\nB: only two people outside of parking lot", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: A group of friends enjoying a picnic in the park\nB: only two people outside of parking lot", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_47_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_47_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A flock of birds is flying high in the sky, casting shadows on the ground.\nB: a car is leaving the parking lot near the top of the picture to the left of the white box truck", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A flock of birds is flying high in the sky, casting shadows on the ground.\nB: a car is leaving the parking lot near the top of the picture to the left of the white box truck", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_48_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_48_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: in the before photo there appear to be 5 people huddled close together and in the after photo there are about 7 people a little more spread out with some beginning to walk away from the others\nB: In the first image, a butterfly is resting on a flower, while in the second image, a squirrel is scampering up a tree.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: in the before photo there appear to be 5 people huddled close together and in the after photo there are about 7 people a little more spread out with some beginning to walk away from the others\nB: In the first image, a butterfly is resting on a flower, while in the second image, a squirrel is scampering up a tree.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_49_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_49_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of colorful balloons floating in the sky\nB: the three humanis availble", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A group of colorful balloons floating in the sky\nB: the three humanis availble", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_50_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_50_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The second image features a vibrant and diverse collection of flowers in a garden, with a rainbow visible in the background.\nB: there are less people on 2nd photo also they are located off the photo", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The second image features a vibrant and diverse collection of flowers in a garden, with a rainbow visible in the background.\nB: there are less people on 2nd photo also they are located off the photo", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_51_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_51_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The colors of the sunset are vibrant and striking, painting the sky in a beautiful array of hues.\nB: the car driving is not there anymore", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The colors of the sunset are vibrant and striking, painting the sky in a beautiful array of hues.\nB: the car driving is not there anymore", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_52_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_52_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The colorful balloons filled the room as the clown performed magic tricks for the delighted audience.\nB: the kids appear again to play a game in the lot", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The colorful balloons filled the room as the clown performed magic tricks for the delighted audience.\nB: the kids appear again to play a game in the lot", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_53_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_53_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of people are having a picnic on the grass near the parking lot.\nB: there is a red car parked at the far end of the middle row of cars", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A group of people are having a picnic on the grass near the parking lot.\nB: there is a red car parked at the far end of the middle row of cars", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_54_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_54_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The yellow sun sets behind the mountains, casting a warm glow over the landscape.\nB: group is gone", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The yellow sun sets behind the mountains, casting a warm glow over the landscape.\nB: group is gone", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_55_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_55_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: before picture truck leaving after picture car entering before picture people beside red car before picture white car between blue and grey car after picture more cars after picture person walking across parking lot\nB: The photos capture the transformation of a barren landscape into a vibrant garden with colorful flowers and trees.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: before picture truck leaving after picture car entering before picture people beside red car before picture white car between blue and grey car after picture more cars after picture person walking across parking lot\nB: The photos capture the transformation of a barren landscape into a vibrant garden with colorful flowers and trees.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_56_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_56_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the cars in the intersection have moved ahead\nB: The traffic lights in the intersection are all green", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: the cars in the intersection have moved ahead\nB: The traffic lights in the intersection are all green", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_57_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_57_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A flock of colorful birds flies across the clear blue sky.\nB: there is a person walking with a red umbrella", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A flock of colorful birds flies across the clear blue sky.\nB: there is a person walking with a red umbrella", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_58_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_58_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: a silver car has pulled up near the dumpster\nB: A group of friends are enjoying a picnic in a sunny park", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: a silver car has pulled up near the dumpster\nB: A group of friends are enjoying a picnic in a sunny park", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_59_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_59_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The painting on the wall seems to be incomplete, with just one corner colored in.\nB: the people in the stairs are in different locations", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The painting on the wall seems to be incomplete, with just one corner colored in.\nB: the people in the stairs are in different locations", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_60_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_60_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: no changes were observed\nB: A swirling vortex of colorful shapes and patterns filled the frame, creating a hypnotic and mesmerizing effect.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: no changes were observed\nB: A swirling vortex of colorful shapes and patterns filled the frame, creating a hypnotic and mesmerizing effect.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_61_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_61_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: In the new image, the background has been completely transformed into a lush green forest with a waterfall in the distance.\nB: the group of people on the right hand image has changed to where the person to the far left is in a red shirt and the other image it is a blue shirt", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: In the new image, the background has been completely transformed into a lush green forest with a waterfall in the distance.\nB: the group of people on the right hand image has changed to where the person to the far left is in a red shirt and the other image it is a blue shirt", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_62_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_62_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there is more cars\nB: The sky is filled with vibrant colors and swirling patterns while birds fly in formation.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there is more cars\nB: The sky is filled with vibrant colors and swirling patterns while birds fly in formation.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_63_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_63_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the crowd of people in the parking lot have shifted slightly\nB: A flock of seagulls flew overhead, casting long shadows across the deserted beach.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: the crowd of people in the parking lot have shifted slightly\nB: A flock of seagulls flew overhead, casting long shadows across the deserted beach.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_64_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_64_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The color of the sky has changed from blue to purple in the blink of an eye\nB: the people have moved slightly", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The color of the sky has changed from blue to purple in the blink of an eye\nB: the people have moved slightly", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_65_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_65_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the black has moved slightly down the road\nB: A thick layer of fog envelops the entire city, creating an eerie and mysterious atmosphere.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: the black has moved slightly down the road\nB: A thick layer of fog envelops the entire city, creating an eerie and mysterious atmosphere.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_66_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_66_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The image does not feature a group of people playing sports on a grass field.\nB: the picture on the right does not have a dark colored vehicle backing out of a parking spot", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The image does not feature a group of people playing sports on a grass field.\nB: the picture on the right does not have a dark colored vehicle backing out of a parking spot", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_67_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_67_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of colorful hot air balloons floating in the sky.\nB: the people at the end of th elot have moved", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A group of colorful hot air balloons floating in the sky.\nB: the people at the end of th elot have moved", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_68_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_68_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there is three more people\nB: The room is filled with colorful balloons and streamers, creating a festive atmosphere.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there is three more people\nB: The room is filled with colorful balloons and streamers, creating a festive atmosphere.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_69_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_69_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: people have moved towards the left frame with some leaving the frame\nB: The sunlight creates interesting patterns on the ground, with shadows forming unique shapes.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: people have moved towards the left frame with some leaving the frame\nB: The sunlight creates interesting patterns on the ground, with shadows forming unique shapes.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_70_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_70_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: in the after image the people walking in the lot are in a different location than in the before image\nB: The before and after images show a comparison of different types of flowers blooming in a garden.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: in the after image the people walking in the lot are in a different location than in the before image\nB: The before and after images show a comparison of different types of flowers blooming in a garden.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_71_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_71_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there is now a person walking on the left hand side of the lot\nB: A group of birds is perched on the building's roof.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: there is now a person walking on the left hand side of the lot\nB: A group of birds is perched on the building's roof.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_72_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_72_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The second image features a vibrant display of colorful flowers and plants, with a soothing waterfall in the background.\nB: there is four more people standing around in the second photo", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The second image features a vibrant display of colorful flowers and plants, with a soothing waterfall in the background.\nB: there is four more people standing around in the second photo", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_73_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_73_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there are people now in the front\nB: The image shows a vibrant city skyline with a beautiful sunset casting colorful hues across the sky.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: there are people now in the front\nB: The image shows a vibrant city skyline with a beautiful sunset casting colorful hues across the sky.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_74_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_74_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there are two cars on the road on the right side of the screen\nB: A group of people is playing beach volleyball under a clear blue sky.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there are two cars on the road on the right side of the screen\nB: A group of people is playing beach volleyball under a clear blue sky.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_75_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_75_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A small kitten playing with a ball of yarn\nB: a group of people walking", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A small kitten playing with a ball of yarn\nB: a group of people walking", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_76_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_76_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there are now 6 people standing near the yellow poles that weren t there before\nB: A group of birds are perched on top of the yellow poles, looking out over the water below.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there are now 6 people standing near the yellow poles that weren t there before\nB: A group of birds are perched on top of the yellow poles, looking out over the water below.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_77_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_77_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there is a person walking next to a car\nB: A pair of colorful parrots are perched on a tree branch.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: there is a person walking next to a car\nB: A pair of colorful parrots are perched on a tree branch.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_78_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_78_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the picture on the right contains more people near the yellow poles\nB: The image depicts a bustling city with colorful umbrellas scattered throughout the scene, giving it a vibrant and lively atmosphere.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: the picture on the right contains more people near the yellow poles\nB: The image depicts a bustling city with colorful umbrellas scattered throughout the scene, giving it a vibrant and lively atmosphere.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_79_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_79_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there ar cars on the side road\nB: A group of people are gathering on the beach and enjoying a bonfire.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: there ar cars on the side road\nB: A group of people are gathering on the beach and enjoying a bonfire.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_80_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_80_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The left picture shows a beautiful garden with colorful flowers and a small pond in the middle.\nB: right picture has a few less people standing around", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The left picture shows a beautiful garden with colorful flowers and a small pond in the middle.\nB: right picture has a few less people standing around", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_81_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_81_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The image depicts a variety of colorful flowers arranged in an aesthetically pleasing pattern.\nB: there are people visible in different sections of the frame", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The image depicts a variety of colorful flowers arranged in an aesthetically pleasing pattern.\nB: there are people visible in different sections of the frame", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_82_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_82_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: bus is no longer there\nB: The sky is painted green in this image.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: bus is no longer there\nB: The sky is painted green in this image.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_83_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_83_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: this picture about two by the stairs and one by the red door\nB: The ravishing orange sunset illuminates the tranquil lake, casting a mesmerizing reflection in the water.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: this picture about two by the stairs and one by the red door\nB: The ravishing orange sunset illuminates the tranquil lake, casting a mesmerizing reflection in the water.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_84_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_84_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The first image captures the beautiful sunrise over a calm lake, while the second image showcases a colorful street market filled with vibrant flowers and produce.\nB: the photo on the left has 2 people and the photo on the right has 6", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The first image captures the beautiful sunrise over a calm lake, while the second image showcases a colorful street market filled with vibrant flowers and produce.\nB: the photo on the left has 2 people and the photo on the right has 6", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_85_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_85_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The first image showcases a busy city street with cars and pedestrians, while the second image features a serene beach with crashing waves and a clear blue sky.\nB: their is 6 people in the first photo and none in the other", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The first image showcases a busy city street with cars and pedestrians, while the second image features a serene beach with crashing waves and a clear blue sky.\nB: their is 6 people in the first photo and none in the other", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_86_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_86_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A cat is sleeping on a windowsill in the sunlight.\nB: the group of people have moved slightly", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: A cat is sleeping on a windowsill in the sunlight.\nB: the group of people have moved slightly", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_87_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_87_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: more people and the driver is walking away from his car\nB: A colorful hot air balloon is floating in the sky above a vast green field.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: more people and the driver is walking away from his car\nB: A colorful hot air balloon is floating in the sky above a vast green field.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_88_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_88_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of people are having a picnic in the park in the first picture\nB: there is a black car driving in the parking lot in the second picture", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: A group of people are having a picnic in the park in the first picture\nB: there is a black car driving in the parking lot in the second picture", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_89_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_89_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The photo captured a beautiful sunset over the ocean with vibrant colors reflecting off the water.\nB: 5 people that were in photo are now gone", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The photo captured a beautiful sunset over the ocean with vibrant colors reflecting off the water.\nB: 5 people that were in photo are now gone", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_90_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_90_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the black car is backed up on the first picture and not the 2nd picture\nB: There are tall buildings in the background of both pictures, but in the first picture, they are painted with vibrant colors, while in the second picture, they are painted with neutral colors.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: the black car is backed up on the first picture and not the 2nd picture\nB: There are tall buildings in the background of both pictures, but in the first picture, they are painted with vibrant colors, while in the second picture, they are painted with neutral colors.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_91_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_91_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The artwork features an abundance of vibrant colors and textures.\nB: there is more people", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The artwork features an abundance of vibrant colors and textures.\nB: there is more people", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_92_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_92_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The after image features a vibrant array of colors and patterns that create a sense of movement and energy.\nB: in the after image there are fewer people", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The after image features a vibrant array of colors and patterns that create a sense of movement and energy.\nB: in the after image there are fewer people", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_93_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_93_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there are now three people standing walking in the parking lot\nB: The parking lot is full of colorful balloons and streamers, creating a festive atmosphere.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: there are now three people standing walking in the parking lot\nB: The parking lot is full of colorful balloons and streamers, creating a festive atmosphere.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_94_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_94_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A flock of seagulls is flying above the ocean at sunset.\nB: the men aren t in the picture anymore by the yellow concrete posts", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: A flock of seagulls is flying above the ocean at sunset.\nB: the men aren t in the picture anymore by the yellow concrete posts", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_95_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_95_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the people are not there anymore\nB: The sky is filled with bright colors, creating a dazzling display.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: the people are not there anymore\nB: The sky is filled with bright colors, creating a dazzling display.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_96_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_96_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The after image captures a vibrant sunset over a calm lake, with silhouettes of trees lining the shore.\nB: the after image shows three people further down the steps compared to the before image", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The after image captures a vibrant sunset over a calm lake, with silhouettes of trees lining the shore.\nB: the after image shows three people further down the steps compared to the before image", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_97_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_97_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of people is having a picnic in the park\nB: there is no car driving around in the parking lot", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A group of people is having a picnic in the park\nB: there is no car driving around in the parking lot", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_98_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_98_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there are no people in the second photo\nB: The second photo features a tranquil sunset over a calm lake.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there are no people in the second photo\nB: The second photo features a tranquil sunset over a calm lake.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_99_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_99_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A red car is parked in front of a brick building.\nB: on the stairs case there is two people walking up", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A red car is parked in front of a brick building.\nB: on the stairs case there is two people walking up", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_100_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_100_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: In the left picture, a group of animals is roaming freely in the wilderness, while in the right picture, a lone figure stands on a mountaintop, gazing into the distance.\nB: there is more people standing in the right picture then the left", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: In the left picture, a group of animals is roaming freely in the wilderness, while in the right picture, a lone figure stands on a mountaintop, gazing into the distance.\nB: there is more people standing in the right picture then the left", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_101_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_101_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A lively marketplace with vendors selling colorful wares under the bright summer sun.\nB: the number of people congregating in the group has gone down", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A lively marketplace with vendors selling colorful wares under the bright summer sun.\nB: the number of people congregating in the group has gone down", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_102_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_102_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The colorful flowers bloom in the garden, attracting bees and butterflies.\nB: the people get into a tight group to have a conversation", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The colorful flowers bloom in the garden, attracting bees and butterflies.\nB: the people get into a tight group to have a conversation", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_103_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_103_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the people walking around in the lot have left\nB: The colorful balloons are slowly deflating as they hang on the back of the chairs.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: the people walking around in the lot have left\nB: The colorful balloons are slowly deflating as they hang on the back of the chairs.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_104_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_104_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The image appears to be taken during the evening, with a reddish hue dominating the scene.\nB: there are less people and they have moved to the left", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The image appears to be taken during the evening, with a reddish hue dominating the scene.\nB: there are less people and they have moved to the left", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_105_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_105_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The sunset painted the sky with vibrant streaks of purple and orange, creating a mesmerizing backdrop for the tranquil lake.\nB: there are now two people there", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The sunset painted the sky with vibrant streaks of purple and orange, creating a mesmerizing backdrop for the tranquil lake.\nB: there are now two people there", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_106_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_106_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: in the before image there are 4 people visible 2 by the car and 2 by the cones while in the after image there are 6 people visible 3 at the cones 1 at the car and 3 leaving the scene\nB: The images show a transformation from day to night, with the sky changing from blue to vibrant purple, and the surroundings shifting from bright to dark.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: in the before image there are 4 people visible 2 by the car and 2 by the cones while in the after image there are 6 people visible 3 at the cones 1 at the car and 3 leaving the scene\nB: The images show a transformation from day to night, with the sky changing from blue to vibrant purple, and the surroundings shifting from bright to dark.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_107_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_107_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The after picture shows a small park with several benches and a fountain in the center.\nB: a group of young men has gathered in the after picture on the sidewalk", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The after picture shows a small park with several benches and a fountain in the center.\nB: a group of young men has gathered in the after picture on the sidewalk", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_108_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_108_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: right image shows three vehicles traveling in same direction while left image shows to vehicles traveling in opposite directions\nB: The images depict various patterns of traffic flow in urban and rural settings.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: right image shows three vehicles traveling in same direction while left image shows to vehicles traveling in opposite directions\nB: The images depict various patterns of traffic flow in urban and rural settings.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_109_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_109_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The image captures a surreal landscape with swirling colors and distorted shapes.\nB: there are no visible people in the frame", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The image captures a surreal landscape with swirling colors and distorted shapes.\nB: there are no visible people in the frame", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_110_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_110_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: In the first picture, there is a small child playing with a dog while in the second picture, there are several people riding bikes.\nB: in the second picture there is one big group of people as compare to two groups in the first picture", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: In the first picture, there is a small child playing with a dog while in the second picture, there are several people riding bikes.\nB: in the second picture there is one big group of people as compare to two groups in the first picture", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_111_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_111_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there are people walking towards the sidewalk\nB: The vibrant colors of the street art create a dynamic contrast against the dull cityscape.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: there are people walking towards the sidewalk\nB: The vibrant colors of the street art create a dynamic contrast against the dull cityscape.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_112_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_112_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the picture on the right has more people\nB: The composition of the image is dominated by bright colors and geometric shapes, creating a visual contrast between order and chaos.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: the picture on the right has more people\nB: The composition of the image is dominated by bright colors and geometric shapes, creating a visual contrast between order and chaos.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_113_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_113_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: people are grouped closer together\nB: A clear blue sky with white fluffy clouds", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: people are grouped closer together\nB: A clear blue sky with white fluffy clouds", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_114_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_114_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there are people now where it was empty before\nB: The sun was shining brightly and casting long shadows across the scene.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there are people now where it was empty before\nB: The sun was shining brightly and casting long shadows across the scene.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_115_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_115_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there is a person in a blue shirt and black pants walking on the left towards the bottom of the photo\nB: A colorful parrot is perched on a branch with vibrant feathers.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: there is a person in a blue shirt and black pants walking on the left towards the bottom of the photo\nB: A colorful parrot is perched on a branch with vibrant feathers.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_116_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_116_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A colorful display of art and nature intertwining.\nB: group is spreaded", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: A colorful display of art and nature intertwining.\nB: group is spreaded", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_117_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_117_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The images depict a busy city intersection with cars and buildings, and a park with a fountain, and people exercising\nB: there is one person in the first picture and in the second there is two more who are walking and then another towards the bottom of the screen and the man in the first picture has moved towards the car", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The images depict a busy city intersection with cars and buildings, and a park with a fountain, and people exercising\nB: there is one person in the first picture and in the second there is two more who are walking and then another towards the bottom of the screen and the man in the first picture has moved towards the car", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_118_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_118_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there is many more people that are in the 2nd picture\nB: The second image features a vibrant display of colorful flowers in a field, with a clear blue sky in the background.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there is many more people that are in the 2nd picture\nB: The second image features a vibrant display of colorful flowers in a field, with a clear blue sky in the background.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_119_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_119_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The image appears to have been taken in the evening, but the lighting suggests it is morning.\nB: the group near the center of the image are now moved slightly from their original positions but still in roughly the same places", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The image appears to have been taken in the evening, but the lighting suggests it is morning.\nB: the group near the center of the image are now moved slightly from their original positions but still in roughly the same places", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_120_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_120_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The truck on the left is transporting a load of colorful balloons.\nB: truck on right is now where cars were", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The truck on the left is transporting a load of colorful balloons.\nB: truck on right is now where cars were", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_121_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_121_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there are two people by the poles\nB: The sunrise casts a warm glow over the serene landscape", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: there are two people by the poles\nB: The sunrise casts a warm glow over the serene landscape", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_122_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_122_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: after image does not show three people and group by the tree are in a different position\nB: The photograph captures a serene mountain landscape with a winding river cutting through the valley.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: after image does not show three people and group by the tree are in a different position\nB: The photograph captures a serene mountain landscape with a winding river cutting through the valley.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_123_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_123_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there is a car exiting the lot on the far left\nB: A group of people are playing frisbee in the park", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: there is a car exiting the lot on the far left\nB: A group of people are playing frisbee in the park", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_124_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_124_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: three people are standing in the parking lot before and are not after\nB: A flock of seagulls is flying over a quiet beach on a sunny day.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: three people are standing in the parking lot before and are not after\nB: A flock of seagulls is flying over a quiet beach on a sunny day.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_125_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_125_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The trees are greener and taller.\nB: people are more spread out", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The trees are greener and taller.\nB: people are more spread out", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_126_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_126_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The person on the hand truck is balancing on one foot while juggling colorful balls.\nB: the person pulling a hand truck has moved position", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The person on the hand truck is balancing on one foot while juggling colorful balls.\nB: the person pulling a hand truck has moved position", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_127_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_127_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: bus is driving down lot\nB: A group of people are having a picnic near a lake", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: bus is driving down lot\nB: A group of people are having a picnic near a lake", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_128_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_128_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: to the upper right of the parking spaces the group of people seems to have moved or dispersed with only 3 people left who have moved to the left\nB: In the top left corner of the image, a flock of seagulls appears to be gathering and organizing themselves into a precise formation.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: to the upper right of the parking spaces the group of people seems to have moved or dispersed with only 3 people left who have moved to the left\nB: In the top left corner of the image, a flock of seagulls appears to be gathering and organizing themselves into a precise formation.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_129_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_129_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the position of the trolley has been changed\nB: A group of people are having a discussion near the trolley.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: the position of the trolley has been changed\nB: A group of people are having a discussion near the trolley.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_130_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_130_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The image depicts a colorful mural on the side of a building with various geometric patterns and shapes.\nB: the after image shows a group of four people on a side walk with a man in a white t shirt and jeans walking towards them", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The image depicts a colorful mural on the side of a building with various geometric patterns and shapes.\nB: the after image shows a group of four people on a side walk with a man in a white t shirt and jeans walking towards them", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_131_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_131_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: In the after picture, a colorful hot air balloon is flying over the landscape.\nB: there is a care driving towards the picture in the before picture", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: In the after picture, a colorful hot air balloon is flying over the landscape.\nB: there is a care driving towards the picture in the before picture", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_132_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_132_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: a group of people slightly spread out as another member joins the group\nB: A crowd of people gathered around a performance on a stage", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: a group of people slightly spread out as another member joins the group\nB: A crowd of people gathered around a performance on a stage", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_133_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_133_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The sky is a beautiful shade of purple, and the trees are covered in glittering fairy lights.\nB: there are more cars and one more person", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The sky is a beautiful shade of purple, and the trees are covered in glittering fairy lights.\nB: there are more cars and one more person", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_134_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_134_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of people hiking in the mountains\nB: silver care with trunk open", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: A group of people hiking in the mountains\nB: silver care with trunk open", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_135_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_135_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the car has driven forward a little near the intersection\nB: A group of pedestrians are walking on the sidewalk near a park", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: the car has driven forward a little near the intersection\nB: A group of pedestrians are walking on the sidewalk near a park", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_136_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_136_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The images showcase a scenic countryside landscape with a beautiful river flowing through the valley.\nB: the before photograph appears to have three people whereas the after photograph appears to have two people", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The images showcase a scenic countryside landscape with a beautiful river flowing through the valley.\nB: the before photograph appears to have three people whereas the after photograph appears to have two people", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_137_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_137_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the people has change position\nB: The colors in the image are inverted", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: the people has change position\nB: The colors in the image are inverted", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_138_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_138_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The left picture features a colorful garden, while the right picture showcases a bustling city street.\nB: on the left picture there are couple more people than what s showing on the right", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The left picture features a colorful garden, while the right picture showcases a bustling city street.\nB: on the left picture there are couple more people than what s showing on the right", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_139_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_139_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: in the right hand image the group of people has moved further towards the left side of the sidewalk\nB: In the left image, there is a large bird flying in the background while the group of people is walking on the street.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: in the right hand image the group of people has moved further towards the left side of the sidewalk\nB: In the left image, there is a large bird flying in the background while the group of people is walking on the street.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_140_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_140_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: black car is gone\nB: The moon is shining brightly in the night sky", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: black car is gone\nB: The moon is shining brightly in the night sky", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_141_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_141_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: six people are gathered in the parking lot on the right image\nB: The sunset casts a warm orange glow over the city skyline in the left image.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: six people are gathered in the parking lot on the right image\nB: The sunset casts a warm orange glow over the city skyline in the left image.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_142_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_142_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: two people are shown in the after image\nB: A beautiful sunset is reflected in the still waters of a lake, creating a stunning mirror image.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: two people are shown in the after image\nB: A beautiful sunset is reflected in the still waters of a lake, creating a stunning mirror image.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_143_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_143_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The color of the sky in the before photo is different from the after photo.\nB: there is a car driving in the before photo and no car in the after photo", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The color of the sky in the before photo is different from the after photo.\nB: there is a car driving in the before photo and no car in the after photo", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_144_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_144_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there were three people standing in the street now there are two people behind the yellow poles\nB: The orange cat is sitting on the windowsill while the sun sets over the distant mountains.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there were three people standing in the street now there are two people behind the yellow poles\nB: The orange cat is sitting on the windowsill while the sun sets over the distant mountains.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_145_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_145_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A large spaceship is preparing for a launch\nB: four new people have arrived", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A large spaceship is preparing for a launch\nB: four new people have arrived", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_146_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_146_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the group of people have grouped closer together\nB: The group of people is standing in a circle, each wearing a different colored hat.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: the group of people have grouped closer together\nB: The group of people is standing in a circle, each wearing a different colored hat.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_147_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_147_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A colorful mural with abstract shapes and patterns\nB: 6 people in picture", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A colorful mural with abstract shapes and patterns\nB: 6 people in picture", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_148_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_148_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the motor cycle is in a different area now near the light blue truck\nB: In front of the fire station, a group of people are practicing tai chi in the park.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: the motor cycle is in a different area now near the light blue truck\nB: In front of the fire station, a group of people are practicing tai chi in the park.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_149_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_149_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: people have changed initial location in the group on the side walk\nB: A dog is chasing a butterfly in a sunny field.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: people have changed initial location in the group on the side walk\nB: A dog is chasing a butterfly in a sunny field.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_150_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_150_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the black van is not there anymore\nB: A group of colorful balloons is floating in the sky", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: the black van is not there anymore\nB: A group of colorful balloons is floating in the sky", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_151_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_151_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the people have shifted positions\nB: The colors have inverted and become brighter", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: the people have shifted positions\nB: The colors have inverted and become brighter", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_152_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_152_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the people in the image on the left seem to have moved positions in the right image\nB: The lighting in both images seems to have been altered using filters to create a different atmosphere.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: the people in the image on the left seem to have moved positions in the right image\nB: The lighting in both images seems to have been altered using filters to create a different atmosphere.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_153_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_153_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A colorful butterfly landed on the flower in the center of the image, showcasing its vibrant wings.\nB: a person got out of the silver car in the right foreground of the picture and opened it s trunk", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A colorful butterfly landed on the flower in the center of the image, showcasing its vibrant wings.\nB: a person got out of the silver car in the right foreground of the picture and opened it s trunk", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_154_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_154_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there are more people now\nB: The colors in the image seem to have a different hue than in real life.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there are more people now\nB: The colors in the image seem to have a different hue than in real life.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_155_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_155_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of people are having a picnic on the beach.\nB: the person is walking on the grass", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A group of people are having a picnic on the beach.\nB: the person is walking on the grass", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_156_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_156_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A flock of birds is perched on the yellow poles.\nB: the group of people by the yellow poles are no longer there", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: A flock of birds is perched on the yellow poles.\nB: the group of people by the yellow poles are no longer there", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_157_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_157_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of people are having a picnic near the lake.\nB: there is a car in the back that is no longer there", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A group of people are having a picnic near the lake.\nB: there is a car in the back that is no longer there", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_158_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_158_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: less people\nB: Abandoned amusement park", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: less people\nB: Abandoned amusement park", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_159_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_159_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there are three guys in the first image and two in the second and they moved\nB: The first image features a colorful abstract painting, and the second image shows a close-up of a flower petal.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: there are three guys in the first image and two in the second and they moved\nB: The first image features a colorful abstract painting, and the second image shows a close-up of a flower petal.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_160_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_160_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there is a red car in the back now\nB: The image features a serene lake surrounded by tall mountains and a clear blue sky overhead.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there is a red car in the back now\nB: The image features a serene lake surrounded by tall mountains and a clear blue sky overhead.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_161_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_161_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The after image features a vibrant color palette with dynamic patterns and shapes, invoking a sense of energy and movement.\nB: there are more people in the after image than the before", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The after image features a vibrant color palette with dynamic patterns and shapes, invoking a sense of energy and movement.\nB: there are more people in the after image than the before", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_162_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_162_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The image features a bustling cityscape with tall buildings and busy traffic on the streets.\nB: the picture on the right contains two people near the black suv near the bottom right corner", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The image features a bustling cityscape with tall buildings and busy traffic on the streets.\nB: the picture on the right contains two people near the black suv near the bottom right corner", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_163_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_163_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A colorful parrot is perched on a branch in a lush green jungle.\nB: the positions of the people standing in the group have changed", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A colorful parrot is perched on a branch in a lush green jungle.\nB: the positions of the people standing in the group have changed", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_164_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_164_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there appears to be no difference\nB: The image shows a serene countryside with a clear blue sky and lush green trees.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there appears to be no difference\nB: The image shows a serene countryside with a clear blue sky and lush green trees.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_165_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_165_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The building in the first image appears to have a sloped roof with large windows, and in the second image, there are several people walking by a brick building.\nB: there is not a person walking by the ivory colored building in the second picture", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The building in the first image appears to have a sloped roof with large windows, and in the second image, there are several people walking by a brick building.\nB: there is not a person walking by the ivory colored building in the second picture", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_166_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_166_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: i do not see anything different within these pictures\nB: A picturesque scene of a bustling city street filled with people and lively activity.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: i do not see anything different within these pictures\nB: A picturesque scene of a bustling city street filled with people and lively activity.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_167_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_167_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: number of kids and their location\nB: The painting depicts a colorful garden filled with vibrant flowers and exotic animals.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: number of kids and their location\nB: The painting depicts a colorful garden filled with vibrant flowers and exotic animals.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_168_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_168_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The image shows a beautiful landscape with colorful flowers and a clear blue sky.\nB: there are more visible people in the frame", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The image shows a beautiful landscape with colorful flowers and a clear blue sky.\nB: there are more visible people in the frame", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_169_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_169_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The after image features a group of people engaging in a lively discussion.\nB: a lot less guys in the after image", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The after image features a group of people engaging in a lively discussion.\nB: a lot less guys in the after image", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_170_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_170_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: no differences\nB: A colorful bouquet of flowers sitting on a table", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: no differences\nB: A colorful bouquet of flowers sitting on a table", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_171_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_171_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of bicyclists are passing by while a street musician performs on the corner.\nB: the car on the right is slightly further away and there is no pedestrian in the street", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A group of bicyclists are passing by while a street musician performs on the corner.\nB: the car on the right is slightly further away and there is no pedestrian in the street", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_172_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_172_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of friends enjoying a picnic in the park\nB: car driving down the street", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: A group of friends enjoying a picnic in the park\nB: car driving down the street", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_173_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_173_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The parking lot is filled with colorful balloons floating in the air.\nB: there are no people walking in the parking lot", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The parking lot is filled with colorful balloons floating in the air.\nB: there are no people walking in the parking lot", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_174_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_174_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The people in the image are all wearing green shirts and blue jeans.\nB: there are now six people in the group instead of two", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The people in the image are all wearing green shirts and blue jeans.\nB: there are now six people in the group instead of two", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_175_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_175_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The images show different weather conditions, with one being sunny and the other being cloudy.\nB: there are more people in the right image", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The images show different weather conditions, with one being sunny and the other being cloudy.\nB: there are more people in the right image", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_176_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_176_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of birds are flying above the building\nB: the people are now next to the building", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: A group of birds are flying above the building\nB: the people are now next to the building", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_177_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_177_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: two people not walking in front of picture\nB: A mesmerizing sunset over a calm lake with a vibrant display of colors reflecting in the water.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: two people not walking in front of picture\nB: A mesmerizing sunset over a calm lake with a vibrant display of colors reflecting in the water.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_178_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_178_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: Several colorful balloons floating in the sky\nB: group of pepole walking", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: Several colorful balloons floating in the sky\nB: group of pepole walking", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_179_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_179_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The photo shows a colorful landscape with a tranquil lake, surrounded by tall trees and a clear blue sky.\nB: there is one more person in the after photo", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The photo shows a colorful landscape with a tranquil lake, surrounded by tall trees and a clear blue sky.\nB: there is one more person in the after photo", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_180_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_180_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A colorful parrot is perched on a branch in a lush, tropical rainforest.\nB: the people are not there anymore in the front", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: A colorful parrot is perched on a branch in a lush, tropical rainforest.\nB: the people are not there anymore in the front", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_181_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_181_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there is more people\nB: The color palette is very vibrant and cheerful, with a mix of bold and pastel shades.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: there is more people\nB: The color palette is very vibrant and cheerful, with a mix of bold and pastel shades.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_182_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_182_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A colorful hot air balloon is floating in the sky.\nB: theres a group of people", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: A colorful hot air balloon is floating in the sky.\nB: theres a group of people", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_183_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_183_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: One image depicts a bustling city skyline at sunset, while the other shows a tranquil beach scene with palm trees and a vibrant ocean.\nB: these images are the same", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: One image depicts a bustling city skyline at sunset, while the other shows a tranquil beach scene with palm trees and a vibrant ocean.\nB: these images are the same", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_184_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_184_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: the group of people in the parking lot have moved out of view for the most part\nB: The colorful balloons are floating high in the sky, creating a beautiful display of color and movement.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: the group of people in the parking lot have moved out of view for the most part\nB: The colorful balloons are floating high in the sky, creating a beautiful display of color and movement.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_185_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_185_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A group of rare flowers is blooming in the field.\nB: the people are less visible", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A group of rare flowers is blooming in the field.\nB: the people are less visible", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_186_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_186_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The sun is setting over the horizon, casting a beautiful orange glow across the landscape.\nB: there are more people near each other now", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The sun is setting over the horizon, casting a beautiful orange glow across the landscape.\nB: there are more people near each other now", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_187_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_187_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: people have changed initial location\nB: The landscape has been altered by recent weather patterns.", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: people have changed initial location\nB: The landscape has been altered by recent weather patterns.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_188_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_188_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there is an extra red car in picture 2\nB: A group of people are gathered around a large bonfire in the middle of the field.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there is an extra red car in picture 2\nB: A group of people are gathered around a large bonfire in the middle of the field.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_189_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_189_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: bigger group of pepole\nB: A colorful hot air balloon floating over a mountain range", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: bigger group of pepole\nB: A colorful hot air balloon floating over a mountain range", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_190_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_190_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: In the second image, a landscape painted with bright, vibrant colors is the focal point.\nB: th there is less people in the 2nd pic", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: In the second image, a landscape painted with bright, vibrant colors is the focal point.\nB: th there is less people in the 2nd pic", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_191_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_191_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: A cat sleeping on a windowsill\nB: people on the stairs", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: A cat sleeping on a windowsill\nB: people on the stairs", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_192_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_192_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The purple and grey individuals are participating in a synchronized dance routine.\nB: the individual in purple has traded spots with the one in grey", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: The purple and grey individuals are participating in a synchronized dance routine.\nB: the individual in purple has traded spots with the one in grey", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_193_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_193_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The color of the sky is bright pink.\nB: there is more people", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The color of the sky is bright pink.\nB: there is more people", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_194_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_194_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there were about 5 boys congregated in the parking lot before and now there seems to be 7 of them\nB: The sun was setting behind the skyscrapers, casting a warm orange glow over the city streets.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there were about 5 boys congregated in the parking lot before and now there seems to be 7 of them\nB: The sun was setting behind the skyscrapers, casting a warm orange glow over the city streets.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_195_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_195_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The yellow taxi drove away at high speed\nB: red bus is gone", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The yellow taxi drove away at high speed\nB: red bus is gone", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_196_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_196_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: there are 2 people on the picture on the left but at least 6 on the right\nB: The picture features a landscape with a large body of water on the left and a dense forest on the right.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: there are 2 people on the picture on the left but at least 6 on the right\nB: The picture features a landscape with a large body of water on the left and a dense forest on the right.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_197_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_197_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: The scene is bathed in a warm, orange glow, with the setting sun creating long shadows and a feeling of tranquility.\nB: there is more people", "question": "The following is a description of the differences between two pictures. Which one is incorrect?", "context": "Select from the following choices.\nA: The scene is bathed in a warm, orange glow, with the setting sun creating long shadows and a feeling of tranquility.\nB: there is more people", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_198_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_198_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_diff", "visual_input_component": "Video image or Natural image", "source": "spot_the_diff", "options": "A: in the first photo you see two groups of men in the after you can only see the top of one persons head and the rest of the men are gone\nB: The photo captures a beautiful sunset over the ocean with vibrant colors reflecting in the water.", "question": "The following is a description of the differences between two pictures. 
Which one is incorrect?", "context": "Select from the following choices.\nA: in the first photo you see two groups of men in the after you can only see the top of one persons head and the rest of the men are gone\nB: The photo captures a beautiful sunset over the ocean with vibrant colors reflecting in the water.", "input_image_path": ["./High-level-obj-semantic/spot_the_diff/spot_the_diff_199_0.png", "./High-level-obj-semantic/spot_the_diff/spot_the_diff_199_1.png"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/spot_the_similarity/qwen3-vl/metadata_info.json b/results/spot_the_similarity/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..e1381c0 --- /dev/null +++ b/results/spot_the_similarity/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_0_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_0_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_1_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_1_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", 
"context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_2_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_2_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_3_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_3_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_4_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_4_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_5_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_5_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following 
choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_6_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_6_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_7_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_7_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_8_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_8_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_9_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_9_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_10_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_10_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_11_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_11_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_12_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_12_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_13_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_13_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_14_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_14_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_15_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_15_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_16_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_16_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_17_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_17_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_18_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_18_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_19_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_19_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_20_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_20_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_21_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_21_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_22_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_22_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_23_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_23_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_24_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_24_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_25_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_25_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_26_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_26_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_27_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_27_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_28_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_28_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_29_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_29_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_30_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_30_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_31_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_31_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_32_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_32_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_33_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_33_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_34_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_34_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_35_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_35_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_36_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_36_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_37_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_37_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_38_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_38_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_39_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_39_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_40_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_40_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_41_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_41_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_42_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_42_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_43_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_43_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_44_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_44_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_45_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_45_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_46_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_46_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_47_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_47_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_48_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_48_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_49_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_49_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_50_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_50_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_51_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_51_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_52_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_52_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_53_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_53_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_54_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_54_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_55_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_55_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_56_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_56_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_57_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_57_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_58_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_58_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_59_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_59_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_60_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_60_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_61_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_61_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_62_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_62_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_63_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_63_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_64_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_64_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_65_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_65_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_66_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_66_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_67_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_67_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_68_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_68_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_69_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_69_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_70_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_70_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_71_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_71_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_72_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_72_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_73_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_73_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_74_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_74_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_75_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_75_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_76_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_76_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_77_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_77_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_78_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_78_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_79_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_79_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_80_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_80_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_81_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_81_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_82_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_82_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_83_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_83_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_84_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_84_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_85_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_85_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_86_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_86_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_87_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_87_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_88_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_88_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_89_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_89_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_90_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_90_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_91_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_91_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_92_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_92_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_93_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_93_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_94_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_94_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_95_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_95_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_96_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_96_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_97_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_97_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_98_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_98_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_99_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_99_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_100_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_100_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_101_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_101_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_102_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_102_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_103_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_103_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_104_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_104_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_105_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_105_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_106_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_106_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_107_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_107_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_108_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_108_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_109_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_109_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_110_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_110_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_111_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_111_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_112_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_112_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_113_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_113_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_114_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_114_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_115_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_115_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_116_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_116_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_117_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_117_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_118_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_118_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_119_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_119_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_120_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_120_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_121_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_121_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_122_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_122_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_123_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_123_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_124_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_124_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_125_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_125_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_126_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_126_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_127_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_127_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_128_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_128_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_129_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_129_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_130_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_130_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_131_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_131_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_132_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_132_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_133_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_133_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_134_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_134_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_135_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_135_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_136_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_136_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_137_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_137_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_138_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_138_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_139_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_139_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_140_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_140_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_141_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_141_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_142_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_142_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_143_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_143_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_144_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_144_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_145_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_145_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_146_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_146_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_147_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_147_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_148_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_148_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_149_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_149_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_150_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_150_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_151_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_151_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_152_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_152_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_153_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_153_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_154_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_154_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_155_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_155_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_156_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_156_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_157_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_157_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_158_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_158_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_159_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_159_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_160_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_160_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_161_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_161_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_162_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_162_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_163_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_163_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_164_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_164_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_165_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_165_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_166_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_166_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_167_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_167_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_168_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_168_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_169_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_169_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_170_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_170_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_171_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_171_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_172_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_172_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_173_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_173_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_174_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_174_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_175_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_175_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_176_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_176_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_177_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_177_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_178_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_178_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_179_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_179_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_180_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_180_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_181_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_181_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_182_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_182_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_183_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_183_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_184_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_184_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_185_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_185_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_186_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_186_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_187_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_187_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_188_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_188_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_189_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_189_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_190_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_190_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_191_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_191_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_192_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_192_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_193_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_193_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_194_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_194_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_195_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_195_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: Yes\nB: No", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: Yes\nB: No", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_196_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_196_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "image_alike", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_197_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_197_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": 
["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_198_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_198_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "spot_the_similarity", "visual_input_component": "['natural_image']", "source": "Totally_Looks_Like_Data", "options": "A: No\nB: Yes", "question": "Are there any similarities between the two pictures?", "context": "Select from the following choices.\nA: No\nB: Yes", "input_image_path": ["./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_199_0.jpg", "./High-level-obj-semantic/spot_the_similarity/spot_the_similarity_199_1.jpg"], "output": "B", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/temporal_localization/qwen3-vl/metadata_info.json b/results/temporal_localization/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..a383f26 --- /dev/null +++ b/results/temporal_localization/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: BaseballPitch.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_0_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_0_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_0_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_0_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify 
the image consistent with the text description: GolfSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_1_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_1_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_1_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_1_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_2_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_2_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_2_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_2_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: HighJump.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_3_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_3_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_3_2.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_3_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_4_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_4_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_4_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_4_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: HighJump.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_5_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_5_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_5_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_5_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: 
Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_6_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_6_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_6_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_6_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: BaseballPitch.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_7_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_7_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_7_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_7_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_8_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_8_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_8_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_8_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", 
"visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: JavelinThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_9_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_9_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_9_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_9_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_10_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_10_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_10_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_10_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_11_0.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_11_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_11_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_11_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: LongJump.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_12_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_12_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_12_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_12_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: HighJump.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_13_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_13_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_13_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_13_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": 
"Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_14_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_14_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_14_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_14_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_15_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_15_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_15_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_15_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_16_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_16_1.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_16_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_16_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: JavelinThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_17_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_17_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_17_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_17_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_18_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_18_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_18_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_18_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent 
with the text description: CricketShot.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_19_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_19_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_19_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_19_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_20_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_20_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_20_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_20_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_21_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_21_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_21_2.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_21_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_22_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_22_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_22_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_22_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_23_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_23_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_23_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_23_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", "context": "Select from the following 
choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_24_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_24_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_24_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_24_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CliffDiving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_25_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_25_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_25_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_25_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_26_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_26_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_26_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_26_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": 
"temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_27_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_27_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_27_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_27_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: LongJump.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_28_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_28_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_28_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_28_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: JavelinThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": 
["./Continuous-temporal/temporal_localization/temporal_localization_29_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_29_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_29_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_29_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_30_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_30_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_30_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_30_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_31_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_31_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_31_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_31_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": 
"THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: LongJump.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_32_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_32_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_32_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_32_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_33_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_33_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_33_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_33_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_34_0.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_34_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_34_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_34_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_35_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_35_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_35_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_35_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_36_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_36_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_36_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_36_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": 
"Given the sequence of images, please identify the image consistent with the text description: HighJump.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_37_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_37_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_37_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_37_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: BaseballPitch.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_38_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_38_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_38_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_38_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_39_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_39_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_39_2.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_39_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_40_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_40_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_40_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_40_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_41_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_41_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_41_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_41_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", "context": "Select from the following 
choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_42_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_42_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_42_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_42_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_43_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_43_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_43_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_43_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CliffDiving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_44_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_44_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_44_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_44_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": 
"temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_45_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_45_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_45_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_45_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_46_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_46_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_46_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_46_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": 
["./Continuous-temporal/temporal_localization/temporal_localization_47_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_47_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_47_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_47_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_48_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_48_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_48_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_48_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CliffDiving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_49_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_49_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_49_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_49_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": 
"THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: FrisbeeCatch.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_50_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_50_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_50_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_50_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: FrisbeeCatch.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_51_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_51_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_51_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_51_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_52_0.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_52_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_52_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_52_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_53_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_53_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_53_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_53_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_54_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_54_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_54_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_54_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", 
"question": "Given the sequence of images, please identify the image consistent with the text description: CliffDiving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_55_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_55_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_55_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_55_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_56_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_56_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_56_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_56_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_57_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_57_1.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_57_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_57_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_58_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_58_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_58_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_58_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_59_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_59_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_59_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_59_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image 
consistent with the text description: BaseballPitch.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_60_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_60_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_60_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_60_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_61_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_61_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_61_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_61_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_62_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_62_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_62_2.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_62_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_63_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_63_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_63_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_63_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_64_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_64_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_64_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_64_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: LongJump.", "context": "Select from the following 
choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_65_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_65_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_65_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_65_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: HighJump.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_66_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_66_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_66_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_66_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: LongJump.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_67_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_67_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_67_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_67_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": 
"temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_68_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_68_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_68_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_68_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_69_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_69_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_69_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_69_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: JavelinThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": 
["./Continuous-temporal/temporal_localization/temporal_localization_70_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_70_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_70_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_70_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_71_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_71_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_71_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_71_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_72_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_72_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_72_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_72_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": 
"THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: HighJump.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_73_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_73_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_73_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_73_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_74_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_74_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_74_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_74_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: BaseballPitch.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_75_0.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_75_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_75_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_75_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_76_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_76_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_76_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_76_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_77_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_77_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_77_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_77_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": 
"Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_78_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_78_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_78_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_78_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_79_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_79_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_79_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_79_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: FrisbeeCatch.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_80_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_80_1.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_80_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_80_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_81_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_81_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_81_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_81_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_82_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_82_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_82_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_82_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent 
with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_83_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_83_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_83_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_83_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_84_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_84_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_84_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_84_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_85_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_85_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_85_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_85_3.png"], 
"output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_86_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_86_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_86_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_86_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_87_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_87_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_87_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_87_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": 
["./Continuous-temporal/temporal_localization/temporal_localization_88_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_88_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_88_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_88_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_89_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_89_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_89_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_89_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_90_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_90_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_90_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_90_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", 
"source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_91_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_91_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_91_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_91_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_92_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_92_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_92_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_92_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_93_0.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_93_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_93_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_93_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_94_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_94_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_94_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_94_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_95_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_95_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_95_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_95_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", 
"question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_96_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_96_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_96_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_96_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_97_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_97_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_97_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_97_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_98_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_98_1.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_98_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_98_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_99_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_99_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_99_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_99_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_100_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_100_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_100_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_100_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image 
consistent with the text description: HighJump.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_101_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_101_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_101_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_101_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_102_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_102_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_102_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_102_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_103_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_103_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_103_2.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_103_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_104_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_104_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_104_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_104_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_105_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_105_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_105_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_105_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", "context": "Select from the 
following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_106_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_106_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_106_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_106_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_107_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_107_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_107_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_107_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_108_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_108_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_108_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_108_3.png"], "output": "A", "qwen3-vl": "image 
none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_109_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_109_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_109_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_109_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_110_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_110_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_110_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_110_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": 
["./Continuous-temporal/temporal_localization/temporal_localization_111_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_111_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_111_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_111_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_112_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_112_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_112_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_112_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_113_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_113_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_113_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_113_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", 
"source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_114_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_114_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_114_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_114_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_115_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_115_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_115_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_115_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: JavelinThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_116_0.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_116_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_116_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_116_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CliffDiving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_117_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_117_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_117_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_117_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CliffDiving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_118_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_118_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_118_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_118_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 
3", "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_119_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_119_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_119_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_119_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_120_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_120_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_120_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_120_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: JavelinThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_121_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_121_1.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_121_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_121_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_122_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_122_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_122_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_122_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_123_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_123_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_123_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_123_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image 
consistent with the text description: GolfSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_124_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_124_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_124_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_124_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_125_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_125_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_125_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_125_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_126_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_126_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_126_2.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_126_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: BaseballPitch.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_127_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_127_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_127_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_127_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_128_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_128_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_128_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_128_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: BaseballPitch.", "context": "Select from 
the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_129_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_129_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_129_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_129_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_130_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_130_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_130_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_130_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_131_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_131_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_131_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_131_3.png"], "output": "A", "qwen3-vl": "image 
none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_132_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_132_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_132_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_132_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: FrisbeeCatch.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_133_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_133_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_133_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_133_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": 
["./Continuous-temporal/temporal_localization/temporal_localization_134_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_134_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_134_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_134_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_135_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_135_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_135_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_135_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_136_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_136_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_136_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_136_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", 
"source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: LongJump.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_137_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_137_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_137_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_137_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: BaseballPitch.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_138_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_138_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_138_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_138_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_139_0.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_139_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_139_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_139_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_140_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_140_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_140_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_140_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_141_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_141_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_141_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_141_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", 
"question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_142_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_142_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_142_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_142_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_143_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_143_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_143_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_143_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_144_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_144_1.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_144_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_144_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_145_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_145_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_145_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_145_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: FrisbeeCatch.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_146_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_146_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_146_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_146_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image 
consistent with the text description: Billiards.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_147_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_147_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_147_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_147_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_148_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_148_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_148_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_148_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_149_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_149_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_149_2.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_149_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_150_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_150_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_150_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_150_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CliffDiving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_151_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_151_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_151_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_151_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following 
choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_152_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_152_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_152_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_152_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_153_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_153_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_153_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_153_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: BaseballPitch.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_154_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_154_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_154_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_154_3.png"], "output": "C", "qwen3-vl": "image none"}, 
{"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_155_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_155_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_155_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_155_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_156_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_156_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_156_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_156_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": 
["./Continuous-temporal/temporal_localization/temporal_localization_157_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_157_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_157_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_157_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_158_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_158_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_158_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_158_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_159_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_159_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_159_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_159_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural 
image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_160_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_160_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_160_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_160_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_161_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_161_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_161_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_161_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_162_0.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_162_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_162_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_162_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketBowling.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_163_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_163_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_163_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_163_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: JavelinThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_164_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_164_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_164_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_164_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: 
Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_165_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_165_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_165_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_165_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_166_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_166_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_166_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_166_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_167_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_167_1.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_167_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_167_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: FrisbeeCatch.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_168_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_168_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_168_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_168_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: PoleVault.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_169_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_169_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_169_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_169_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image 
consistent with the text description: HammerThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_170_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_170_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_170_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_170_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_171_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_171_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_171_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_171_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: GolfSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_172_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_172_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_172_2.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_172_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: SoccerPenalty.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_173_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_173_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_173_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_173_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: HighJump.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_174_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_174_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_174_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_174_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: ThrowDiscus.", "context": "Select from the 
following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_175_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_175_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_175_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_175_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: JavelinThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_176_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_176_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_176_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_176_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CliffDiving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_177_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_177_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_177_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_177_3.png"], "output": "C", "qwen3-vl": "image none"}, 
{"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_178_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_178_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_178_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_178_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_179_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_179_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_179_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_179_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Diving.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": 
["./Continuous-temporal/temporal_localization/temporal_localization_180_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_180_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_180_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_180_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: VolleyballSpiking.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_181_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_181_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_181_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_181_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_182_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_182_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_182_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_182_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural 
image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: CleanAndJerk.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_183_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_183_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_183_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_183_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: BasketballDunk.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_184_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_184_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_184_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_184_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Shotput.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_185_0.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_185_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_185_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_185_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: HighJump.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_186_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_186_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_186_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_186_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: TennisSwing.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_187_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_187_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_187_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_187_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", 
"question": "Given the sequence of images, please identify the image consistent with the text description: CricketShot.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_188_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_188_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_188_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_188_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_189_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_189_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_189_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_189_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: LongJump.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_190_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_190_1.png", 
"./Continuous-temporal/temporal_localization/temporal_localization_190_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_190_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: HammerThrow.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_191_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_191_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_191_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_191_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_localization", "visual_input_component": "Video image or Natural image", "source": "THUMOS14", "options": "A: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "question": "Given the sequence of images, please identify the image consistent with the text description: Billiards.", "context": "Select from the following choices.\nA: Image 0\nB: Image 1\nC: Image 2\nD: Image 3", "input_image_path": ["./Continuous-temporal/temporal_localization/temporal_localization_192_0.png", "./Continuous-temporal/temporal_localization/temporal_localization_192_1.png", "./Continuous-temporal/temporal_localization/temporal_localization_192_2.png", "./Continuous-temporal/temporal_localization/temporal_localization_192_3.png"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/temporal_ordering/qwen3-vl/metadata_info.json b/results/temporal_ordering/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..f00d75b --- /dev/null +++ 
b/results/temporal_ordering/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 0, 1, 3]\nB: [1, 0, 3, 2]\nC: [3, 1, 0, 2]\nD: [0, 2, 1, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 0, 1, 3]\nB: [1, 0, 3, 2]\nC: [3, 1, 0, 2]\nD: [0, 2, 1, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_0_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_0_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_0_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_0_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [10, 0, 8, 2, 3, 11, 7, 5, 6, 1, 4, 9]\nB: [1, 8, 6, 5, 3, 2, 10, 9, 0, 4, 11, 7]\nC: [4, 6, 10, 8, 1, 3, 2, 5, 0, 9, 7, 11]\nD: [10, 9, 0, 5, 8, 11, 6, 3, 1, 7, 4, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [10, 0, 8, 2, 3, 11, 7, 5, 6, 1, 4, 9]\nB: [1, 8, 6, 5, 3, 2, 10, 9, 0, 4, 11, 7]\nC: [4, 6, 10, 8, 1, 3, 2, 5, 0, 9, 7, 11]\nD: [10, 9, 0, 5, 8, 11, 6, 3, 1, 7, 4, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_1_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_1_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_1_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_1_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_1_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_1_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_1_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_1_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_1_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_1_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_1_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_1_11.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 0, 3, 1]\nB: [2, 1, 0, 3]\nC: [2, 1, 3, 0]\nD: [1, 2, 3, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 0, 3, 1]\nB: [2, 1, 0, 3]\nC: [2, 1, 3, 0]\nD: [1, 2, 3, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_2_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_2_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_2_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_2_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 5, 4, 3, 8, 2, 1, 6, 7]\nB: [3, 6, 4, 5, 2, 8, 0, 1, 7]\nC: [6, 1, 0, 3, 8, 5, 7, 4, 2]\nD: [2, 8, 0, 7, 5, 6, 3, 4, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 5, 4, 3, 8, 2, 1, 6, 7]\nB: [3, 6, 4, 5, 2, 8, 0, 1, 7]\nC: [6, 1, 0, 3, 8, 5, 7, 4, 2]\nD: [2, 8, 0, 7, 5, 6, 3, 4, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_3_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_3_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_3_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_3_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_3_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_3_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_3_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_3_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_3_8.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 1, 3, 2]\nB: [0, 3, 2, 1]\nC: [2, 1, 0, 3]\nD: [2, 3, 1, 0]", 
"question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 1, 3, 2]\nB: [0, 3, 2, 1]\nC: [2, 1, 0, 3]\nD: [2, 3, 1, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_4_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_4_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_4_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_4_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 1, 0, 3]\nB: [1, 0, 3, 2]\nC: [3, 1, 2, 0]\nD: [2, 0, 1, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 1, 0, 3]\nB: [1, 0, 3, 2]\nC: [3, 1, 2, 0]\nD: [2, 0, 1, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_5_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_5_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_5_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_5_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 1, 0, 2]\nB: [1, 2, 0, 3]\nC: [1, 3, 0, 2]\nD: [0, 2, 3, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 1, 0, 2]\nB: [1, 2, 0, 3]\nC: [1, 3, 0, 2]\nD: [0, 2, 3, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_6_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_6_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_6_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_6_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [15, 11, 17, 9, 12, 10, 4, 7, 0, 8, 5, 1, 14, 3, 2, 16, 13, 6]\nB: [5, 4, 13, 11, 1, 17, 16, 12, 0, 10, 2, 3, 9, 15, 8, 14, 6, 7]\nC: [4, 5, 11, 1, 16, 12, 10, 13, 8, 7, 15, 3, 9, 14, 17, 2, 6, 0]\nD: [7, 9, 0, 3, 2, 12, 1, 17, 4, 10, 13, 6, 14, 8, 16, 15, 5, 11]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [15, 11, 17, 9, 12, 10, 4, 7, 0, 8, 5, 1, 14, 3, 2, 16, 13, 6]\nB: [5, 4, 13, 11, 1, 17, 16, 12, 0, 10, 2, 3, 9, 15, 8, 14, 6, 7]\nC: [4, 5, 11, 1, 16, 12, 10, 13, 8, 7, 15, 3, 9, 14, 17, 2, 6, 0]\nD: [7, 9, 0, 3, 2, 12, 1, 17, 4, 10, 13, 6, 14, 8, 16, 15, 5, 11]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_7_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_11.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_12.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_13.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_14.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_15.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_16.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_7_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 0, 5, 2, 1, 4]\nB: [0, 2, 1, 4, 5, 3]\nC: [1, 2, 4, 5, 0, 3]\nD: [0, 3, 1, 4, 2, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential 
index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 0, 5, 2, 1, 4]\nB: [0, 2, 1, 4, 5, 3]\nC: [1, 2, 4, 5, 0, 3]\nD: [0, 3, 1, 4, 2, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_8_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_8_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_8_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_8_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_8_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_8_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 3, 1, 2]\nB: [2, 1, 3, 0]\nC: [0, 1, 3, 2]\nD: [0, 3, 2, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 3, 1, 2]\nB: [2, 1, 3, 0]\nC: [0, 1, 3, 2]\nD: [0, 3, 2, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_9_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_9_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_9_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_9_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 5, 1, 2, 0, 4]\nB: [4, 5, 0, 2, 3, 1]\nC: [1, 2, 0, 4, 5, 3]\nD: [2, 1, 0, 3, 5, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 5, 1, 2, 0, 4]\nB: [4, 5, 0, 2, 3, 1]\nC: [1, 2, 0, 4, 5, 3]\nD: [2, 1, 0, 3, 5, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_10_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_10_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_10_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_10_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_10_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_10_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 1, 0, 3]\nB: [0, 2, 1, 3]\nC: [1, 2, 3, 0]\nD: [3, 2, 0, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 1, 0, 3]\nB: [0, 2, 1, 3]\nC: [1, 2, 3, 0]\nD: [3, 2, 0, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_11_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_11_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_11_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_11_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [4, 2, 0, 1, 3]\nB: [3, 0, 2, 1, 4]\nC: [4, 0, 2, 1, 3]\nD: [4, 3, 2, 1, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 2, 0, 1, 3]\nB: [3, 0, 2, 1, 4]\nC: [4, 0, 2, 1, 3]\nD: [4, 3, 2, 1, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_12_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_12_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_12_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_12_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_12_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 1, 2, 0]\nB: [1, 2, 3, 0]\nC: [2, 0, 3, 1]\nD: [0, 3, 1, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 1, 2, 0]\nB: [1, 2, 3, 0]\nC: [2, 0, 3, 1]\nD: [0, 3, 1, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_13_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_13_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_13_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_13_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 4, 2, 0, 3]\nB: [0, 3, 4, 2, 1]\nC: [3, 0, 4, 1, 2]\nD: [1, 2, 0, 3, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 4, 2, 0, 3]\nB: [0, 3, 4, 2, 1]\nC: [3, 0, 4, 1, 2]\nD: [1, 2, 0, 3, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_14_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_14_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_14_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_14_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_14_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [1, 3, 0, 2]\nD: [1, 3, 2, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [1, 3, 0, 2]\nD: [1, 3, 2, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_15_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_15_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_15_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_15_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 1, 3, 0, 4]\nB: [3, 4, 0, 1, 2]\nC: [1, 4, 3, 0, 2]\nD: [0, 3, 2, 1, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 1, 3, 0, 4]\nB: [3, 4, 0, 1, 2]\nC: [1, 4, 3, 0, 2]\nD: [0, 3, 2, 1, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_16_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_16_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_16_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_16_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_16_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [3, 2, 0, 1]\nD: [2, 0, 3, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [3, 2, 0, 1]\nD: [2, 0, 3, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_17_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_17_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_17_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_17_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 3, 0, 1]\nB: [1, 0, 3, 2]\nC: [3, 2, 1, 0]\nD: [1, 3, 2, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 3, 0, 1]\nB: [1, 0, 3, 2]\nC: [3, 2, 1, 0]\nD: [1, 3, 2, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_18_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_18_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_18_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_18_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 2, 1, 0]\nB: [0, 2, 3, 1]\nC: [2, 0, 1, 3]\nD: [1, 2, 0, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 2, 1, 0]\nB: [0, 2, 3, 1]\nC: [2, 0, 1, 3]\nD: [1, 2, 0, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_19_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_19_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_19_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_19_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 3, 2, 0]\nB: [0, 2, 1, 3]\nC: [3, 2, 0, 1]\nD: [3, 1, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 3, 2, 0]\nB: [0, 2, 1, 3]\nC: [3, 2, 0, 1]\nD: [3, 1, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_20_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_20_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_20_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_20_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 10, 3, 17, 4, 6, 8, 7, 1, 12, 2, 14, 5, 19, 15, 20, 9, 23, 11, 24, 22, 16, 13, 21, 18]\nB: [23, 3, 10, 16, 20, 2, 17, 19, 21, 7, 15, 11, 0, 9, 12, 24, 4, 13, 18, 5, 1, 22, 6, 8, 14]\nC: [15, 23, 6, 13, 17, 18, 7, 1, 16, 10, 8, 11, 20, 0, 21, 2, 9, 22, 14, 5, 24, 3, 4, 12, 19]\nD: [14, 7, 3, 5, 4, 6, 10, 21, 23, 18, 16, 22, 0, 19, 1, 12, 17, 8, 24, 11, 13, 9, 20, 15, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 10, 3, 17, 4, 6, 8, 7, 1, 12, 2, 14, 5, 19, 15, 20, 9, 23, 11, 24, 22, 16, 13, 21, 18]\nB: [23, 3, 10, 16, 20, 2, 17, 19, 21, 7, 15, 11, 0, 9, 12, 24, 4, 13, 18, 5, 1, 22, 6, 8, 14]\nC: [15, 23, 6, 13, 17, 18, 7, 1, 16, 10, 8, 11, 20, 0, 21, 2, 9, 22, 14, 5, 24, 3, 4, 12, 19]\nD: [14, 7, 3, 5, 4, 6, 10, 21, 23, 18, 16, 22, 0, 19, 1, 12, 17, 8, 24, 11, 13, 9, 20, 15, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_21_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_11.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_12.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_13.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_14.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_15.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_16.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_17.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_18.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_19.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_20.jpg", 
"./Continuous-temporal/temporal_ordering/temporal_ordering_21_21.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_22.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_23.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_21_24.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 5, 6, 0, 3, 2, 4]\nB: [6, 4, 5, 0, 2, 3, 1]\nC: [4, 6, 1, 3, 2, 0, 5]\nD: [6, 3, 1, 0, 2, 5, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 5, 6, 0, 3, 2, 4]\nB: [6, 4, 5, 0, 2, 3, 1]\nC: [4, 6, 1, 3, 2, 0, 5]\nD: [6, 3, 1, 0, 2, 5, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_22_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_22_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_22_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_22_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_22_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_22_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_22_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 5, 0, 2, 1, 4, 6]\nB: [0, 5, 1, 2, 4, 6, 3]\nC: [0, 1, 3, 2, 5, 6, 4]\nD: [3, 1, 0, 5, 4, 2, 6]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 5, 0, 2, 1, 4, 6]\nB: [0, 5, 1, 2, 4, 6, 3]\nC: [0, 1, 3, 2, 5, 6, 4]\nD: [3, 1, 0, 5, 4, 2, 6]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_23_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_23_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_23_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_23_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_23_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_23_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_23_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [7, 2, 1, 9, 8, 3, 5, 4, 6, 0]\nB: [8, 0, 2, 1, 6, 5, 9, 7, 4, 3]\nC: [7, 9, 5, 4, 3, 6, 8, 1, 0, 2]\nD: [5, 6, 8, 4, 0, 9, 1, 3, 7, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [7, 2, 1, 9, 8, 3, 5, 4, 6, 0]\nB: [8, 0, 2, 1, 6, 5, 9, 7, 4, 3]\nC: [7, 9, 5, 4, 3, 6, 8, 1, 0, 2]\nD: [5, 6, 8, 4, 0, 9, 1, 3, 7, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_24_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_24_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_24_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_24_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_24_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_24_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_24_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_24_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_24_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_24_9.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 3, 4, 1, 0]\nB: [1, 3, 2, 0, 4]\nC: [2, 1, 0, 4, 3]\nD: [4, 2, 3, 1, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 3, 4, 1, 0]\nB: [1, 3, 2, 0, 4]\nC: [2, 1, 0, 4, 3]\nD: [4, 2, 3, 1, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_25_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_25_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_25_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_25_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_25_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [11, 14, 9, 7, 0, 6, 2, 3, 13, 1, 8, 10, 5, 4, 12]\nB: [5, 6, 10, 2, 9, 0, 14, 8, 13, 3, 1, 12, 7, 4, 11]\nC: [13, 9, 14, 8, 6, 10, 11, 4, 1, 2, 7, 12, 3, 0, 5]\nD: [10, 7, 13, 12, 3, 5, 1, 9, 6, 14, 8, 11, 0, 4, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [11, 14, 9, 7, 0, 6, 2, 3, 13, 1, 8, 10, 5, 4, 12]\nB: [5, 6, 10, 2, 9, 0, 14, 8, 13, 3, 1, 12, 7, 4, 11]\nC: [13, 9, 14, 8, 6, 10, 11, 4, 1, 2, 7, 12, 3, 0, 5]\nD: [10, 7, 13, 12, 3, 5, 1, 9, 6, 14, 8, 11, 0, 4, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_26_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_26_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_26_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_26_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_26_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_26_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_26_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_26_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_26_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_26_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_26_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_26_11.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_26_12.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_26_13.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_26_14.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [6, 5, 3, 1, 2, 4, 0]\nB: [4, 6, 1, 5, 0, 3, 2]\nC: [3, 0, 6, 5, 1, 4, 2]\nD: [5, 2, 6, 1, 4, 3, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [6, 5, 3, 1, 2, 4, 0]\nB: [4, 6, 1, 5, 0, 3, 2]\nC: [3, 0, 6, 5, 1, 4, 2]\nD: [5, 2, 6, 1, 4, 3, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_27_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_27_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_27_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_27_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_27_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_27_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_27_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 1, 2, 0]\nB: [1, 2, 3, 0]\nC: [0, 2, 1, 3]\nD: [2, 1, 0, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 1, 2, 0]\nB: [1, 2, 3, 0]\nC: [0, 2, 1, 3]\nD: [2, 1, 0, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_28_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_28_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_28_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_28_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [7, 8, 2, 3, 1, 0, 5, 4, 6]\nB: [1, 6, 2, 4, 5, 7, 0, 8, 3]\nC: [8, 0, 6, 3, 7, 1, 4, 2, 5]\nD: [8, 6, 2, 7, 0, 4, 1, 3, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [7, 8, 2, 3, 1, 0, 5, 4, 6]\nB: [1, 6, 2, 4, 5, 7, 0, 8, 3]\nC: [8, 0, 6, 3, 7, 1, 4, 2, 5]\nD: [8, 6, 2, 7, 0, 4, 1, 3, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_29_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_29_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_29_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_29_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_29_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_29_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_29_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_29_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_29_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 5, 0, 4, 1, 3]\nB: [0, 3, 5, 4, 2, 1]\nC: [5, 1, 0, 3, 4, 2]\nD: [5, 4, 3, 1, 2, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 5, 0, 4, 1, 3]\nB: [0, 3, 5, 4, 2, 1]\nC: [5, 1, 0, 3, 4, 2]\nD: [5, 4, 3, 1, 2, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_30_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_30_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_30_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_30_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_30_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_30_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 0, 4, 2, 5, 6, 1]\nB: [0, 6, 3, 1, 2, 5, 4]\nC: [6, 5, 0, 3, 2, 4, 1]\nD: [3, 6, 5, 0, 4, 1, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 0, 4, 2, 5, 6, 1]\nB: [0, 6, 3, 1, 2, 5, 4]\nC: [6, 5, 0, 3, 2, 4, 1]\nD: [3, 6, 5, 0, 4, 1, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_31_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_31_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_31_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_31_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_31_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_31_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_31_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 0, 1, 3]\nB: [3, 0, 2, 1]\nC: [0, 1, 3, 2]\nD: [2, 1, 3, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 0, 1, 3]\nB: [3, 0, 2, 1]\nC: [0, 1, 3, 2]\nD: [2, 1, 3, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_32_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_32_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_32_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_32_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 1, 4, 2, 3]\nB: [3, 0, 1, 2, 4]\nC: [4, 2, 0, 3, 1]\nD: [1, 0, 4, 3, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 1, 4, 2, 3]\nB: [3, 0, 1, 2, 4]\nC: [4, 2, 0, 3, 1]\nD: [1, 0, 4, 3, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_33_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_33_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_33_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_33_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_33_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 2, 1, 0, 4, 5]\nB: [5, 0, 4, 1, 2, 3]\nC: [5, 0, 1, 3, 4, 2]\nD: [1, 0, 5, 4, 3, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 2, 1, 0, 4, 5]\nB: [5, 0, 4, 1, 2, 3]\nC: [5, 0, 1, 3, 4, 2]\nD: [1, 0, 5, 4, 3, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_34_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_34_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_34_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_34_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_34_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_34_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 0, 2, 1]\nB: [0, 2, 1, 3]\nC: [0, 3, 2, 1]\nD: [1, 3, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 0, 2, 1]\nB: [0, 2, 1, 3]\nC: [0, 3, 2, 1]\nD: [1, 3, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_35_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_35_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_35_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_35_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 2, 0, 1]\nB: [1, 3, 0, 2]\nC: [0, 2, 1, 3]\nD: [1, 3, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 2, 0, 1]\nB: [1, 3, 0, 2]\nC: [0, 2, 1, 3]\nD: [1, 3, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_36_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_36_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_36_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_36_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [4, 1, 2, 0, 3]\nB: [0, 3, 2, 1, 4]\nC: [3, 2, 4, 0, 1]\nD: [1, 3, 4, 2, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 1, 2, 0, 3]\nB: [0, 3, 2, 1, 4]\nC: [3, 2, 4, 0, 1]\nD: [1, 3, 4, 2, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_37_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_37_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_37_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_37_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_37_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 2, 3, 0]\nB: [3, 0, 2, 1]\nC: [1, 3, 0, 2]\nD: [1, 0, 2, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 2, 3, 0]\nB: [3, 0, 2, 1]\nC: [1, 3, 0, 2]\nD: [1, 0, 2, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_38_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_38_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_38_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_38_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 4, 3, 2, 1]\nB: [1, 0, 3, 4, 2]\nC: [4, 3, 2, 0, 1]\nD: [3, 1, 2, 0, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 4, 3, 2, 1]\nB: [1, 0, 3, 4, 2]\nC: [4, 3, 2, 0, 1]\nD: [3, 1, 2, 0, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_39_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_39_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_39_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_39_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_39_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [7, 1, 6, 0, 2, 5, 3, 4]\nB: [4, 2, 7, 6, 0, 3, 5, 1]\nC: [2, 0, 5, 1, 4, 3, 7, 6]\nD: [5, 3, 1, 2, 4, 7, 6, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [7, 1, 6, 0, 2, 5, 3, 4]\nB: [4, 2, 7, 6, 0, 3, 5, 1]\nC: [2, 0, 5, 1, 4, 3, 7, 6]\nD: [5, 3, 1, 2, 4, 7, 6, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_40_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_40_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_40_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_40_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_40_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_40_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_40_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_40_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 2, 3, 0]\nB: [1, 2, 3, 0]\nC: [2, 0, 3, 1]\nD: [3, 2, 0, 1]", 
"question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 2, 3, 0]\nB: [1, 2, 3, 0]\nC: [2, 0, 3, 1]\nD: [3, 2, 0, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_41_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_41_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_41_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_41_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 1, 0, 4, 2]\nB: [4, 2, 1, 0, 3]\nC: [2, 3, 0, 1, 4]\nD: [2, 3, 4, 1, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 1, 0, 4, 2]\nB: [4, 2, 1, 0, 3]\nC: [2, 3, 0, 1, 4]\nD: [2, 3, 4, 1, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_42_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_42_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_42_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_42_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_42_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 0, 1, 2]\nB: [0, 1, 2, 3]\nC: [1, 3, 2, 0]\nD: [1, 3, 2, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 0, 1, 2]\nB: [0, 1, 2, 3]\nC: [1, 3, 2, 0]\nD: [1, 3, 2, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_43_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_43_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_43_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_43_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [9, 0, 2, 5, 8, 3, 1, 6, 10, 4, 7]\nB: [9, 2, 1, 4, 7, 8, 3, 0, 10, 5, 6]\nC: [2, 5, 0, 9, 6, 4, 10, 7, 3, 1, 8]\nD: [6, 0, 2, 7, 5, 10, 3, 1, 9, 8, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [9, 0, 2, 5, 8, 3, 1, 6, 10, 4, 7]\nB: [9, 2, 1, 4, 7, 8, 3, 0, 10, 5, 6]\nC: [2, 5, 0, 9, 6, 4, 10, 7, 3, 1, 8]\nD: [6, 0, 2, 7, 5, 10, 3, 1, 9, 8, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_44_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_44_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_44_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_44_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_44_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_44_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_44_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_44_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_44_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_44_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_44_10.jpg"], "output": "B", "qwen3-vl": "image 
none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [4, 2, 1, 0, 3]\nB: [3, 4, 0, 1, 2]\nC: [0, 1, 4, 3, 2]\nD: [3, 0, 4, 1, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 2, 1, 0, 3]\nB: [3, 4, 0, 1, 2]\nC: [0, 1, 4, 3, 2]\nD: [3, 0, 4, 1, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_45_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_45_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_45_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_45_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_45_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [4, 2, 3, 1, 5, 0]\nB: [1, 5, 0, 2, 4, 3]\nC: [3, 1, 0, 4, 5, 2]\nD: [2, 3, 1, 4, 0, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 2, 3, 1, 5, 0]\nB: [1, 5, 0, 2, 4, 3]\nC: [3, 1, 0, 4, 5, 2]\nD: [2, 3, 1, 4, 0, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_46_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_46_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_46_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_46_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_46_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_46_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 2, 3, 1]\nB: [2, 3, 0, 1]\nC: [3, 2, 1, 0]\nD: [1, 2, 3, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 2, 3, 1]\nB: [2, 3, 0, 1]\nC: [3, 2, 1, 0]\nD: [1, 2, 3, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_47_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_47_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_47_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_47_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [8, 3, 1, 4, 9, 5, 6, 10, 7, 0, 2]\nB: [8, 6, 3, 0, 9, 4, 7, 5, 10, 2, 1]\nC: [0, 2, 9, 10, 6, 7, 8, 3, 4, 1, 5]\nD: [7, 3, 1, 10, 6, 2, 0, 8, 5, 9, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [8, 3, 1, 4, 9, 5, 6, 10, 7, 0, 2]\nB: [8, 6, 3, 0, 9, 4, 7, 5, 10, 2, 1]\nC: [0, 2, 9, 10, 6, 7, 8, 3, 4, 1, 5]\nD: [7, 3, 1, 10, 6, 2, 0, 8, 5, 9, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_48_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_48_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_48_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_48_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_48_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_48_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_48_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_48_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_48_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_48_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_48_10.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [4, 6, 12, 5, 11, 23, 16, 21, 9, 25, 0, 8, 20, 13, 1, 10, 7, 3, 24, 14, 19, 17, 22, 15, 18, 2]\nB: [7, 1, 14, 9, 13, 20, 0, 3, 10, 21, 6, 16, 17, 22, 25, 2, 15, 24, 5, 23, 4, 11, 12, 18, 8, 19]\nC: [11, 7, 8, 12, 2, 13, 25, 21, 17, 15, 10, 1, 9, 23, 4, 19, 3, 6, 20, 16, 22, 14, 24, 5, 18, 0]\nD: [0, 3, 15, 18, 12, 4, 5, 24, 19, 1, 16, 9, 25, 22, 8, 2, 13, 14, 20, 6, 21, 23, 17, 10, 7, 11]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 6, 12, 5, 11, 23, 16, 21, 9, 25, 0, 8, 20, 13, 1, 10, 7, 3, 24, 14, 19, 17, 22, 15, 18, 2]\nB: [7, 1, 14, 9, 13, 20, 0, 3, 10, 21, 6, 16, 17, 22, 25, 2, 15, 24, 5, 23, 4, 11, 12, 18, 8, 19]\nC: [11, 7, 8, 12, 2, 13, 25, 21, 17, 15, 10, 1, 9, 23, 4, 19, 3, 6, 20, 16, 22, 14, 24, 5, 18, 0]\nD: [0, 3, 15, 18, 12, 4, 5, 24, 19, 1, 16, 9, 25, 22, 8, 2, 13, 14, 20, 6, 21, 23, 17, 10, 7, 11]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_49_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_11.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_12.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_13.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_14.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_15.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_16.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_17.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_18.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_19.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_20.jpg", 
"./Continuous-temporal/temporal_ordering/temporal_ordering_49_21.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_22.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_23.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_24.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_49_25.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 8, 2, 6, 5, 0, 9, 7, 4, 3]\nB: [9, 5, 4, 2, 3, 7, 1, 8, 0, 6]\nC: [5, 0, 1, 2, 7, 8, 3, 4, 6, 9]\nD: [1, 4, 3, 5, 6, 8, 9, 0, 7, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 8, 2, 6, 5, 0, 9, 7, 4, 3]\nB: [9, 5, 4, 2, 3, 7, 1, 8, 0, 6]\nC: [5, 0, 1, 2, 7, 8, 3, 4, 6, 9]\nD: [1, 4, 3, 5, 6, 8, 9, 0, 7, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_50_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_50_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_50_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_50_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_50_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_50_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_50_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_50_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_50_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_50_9.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [6, 1, 0, 5, 2, 3, 4]\nB: [5, 4, 0, 3, 6, 2, 1]\nC: [2, 4, 3, 1, 5, 0, 
6]\nD: [5, 4, 6, 1, 3, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [6, 1, 0, 5, 2, 3, 4]\nB: [5, 4, 0, 3, 6, 2, 1]\nC: [2, 4, 3, 1, 5, 0, 6]\nD: [5, 4, 6, 1, 3, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_51_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_51_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_51_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_51_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_51_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_51_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_51_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 3, 2, 1, 4]\nB: [4, 1, 0, 3, 2]\nC: [3, 4, 2, 1, 0]\nD: [4, 0, 1, 3, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 3, 2, 1, 4]\nB: [4, 1, 0, 3, 2]\nC: [3, 4, 2, 1, 0]\nD: [4, 0, 1, 3, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_52_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_52_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_52_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_52_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_52_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 1, 0, 2]\nB: [1, 0, 2, 3]\nC: [0, 1, 2, 3]\nD: [1, 3, 2, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 1, 0, 2]\nB: [1, 0, 2, 3]\nC: [0, 1, 2, 3]\nD: [1, 3, 2, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_53_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_53_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_53_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_53_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [7, 6, 0, 2, 1, 3, 4, 5]\nB: [6, 7, 2, 4, 1, 3, 0, 5]\nC: [3, 4, 6, 7, 1, 2, 5, 0]\nD: [2, 7, 5, 0, 6, 1, 4, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [7, 6, 0, 2, 1, 3, 4, 5]\nB: [6, 7, 2, 4, 1, 3, 0, 5]\nC: [3, 4, 6, 7, 1, 2, 5, 0]\nD: [2, 7, 5, 0, 6, 1, 4, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_54_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_54_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_54_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_54_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_54_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_54_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_54_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_54_7.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 3, 0, 5, 6, 1, 8, 4, 7]\nB: [2, 5, 8, 3, 0, 6, 1, 7, 4]\nC: [1, 0, 6, 7, 5, 4, 2, 3, 8]\nD: [5, 8, 0, 3, 4, 1, 6, 7, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 3, 0, 5, 6, 1, 8, 4, 7]\nB: [2, 5, 8, 3, 0, 6, 1, 7, 4]\nC: [1, 0, 6, 7, 5, 4, 2, 3, 8]\nD: [5, 8, 0, 3, 4, 1, 6, 7, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_55_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_55_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_55_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_55_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_55_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_55_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_55_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_55_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_55_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [4, 1, 3, 5, 2, 0]\nB: [5, 2, 4, 3, 0, 1]\nC: [5, 3, 2, 0, 1, 4]\nD: [4, 1, 2, 3, 5, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 1, 3, 5, 2, 0]\nB: [5, 2, 4, 3, 0, 1]\nC: [5, 3, 2, 0, 1, 4]\nD: [4, 1, 2, 3, 5, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_56_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_56_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_56_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_56_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_56_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_56_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 3, 4, 0, 2, 5]\nB: [5, 2, 0, 1, 3, 4]\nC: [3, 1, 2, 4, 5, 0]\nD: [0, 1, 5, 3, 2, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 3, 4, 0, 2, 5]\nB: [5, 2, 0, 1, 3, 4]\nC: [3, 1, 2, 4, 5, 0]\nD: [0, 1, 5, 3, 2, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_57_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_57_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_57_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_57_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_57_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_57_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 3, 1, 0]\nB: [0, 2, 3, 1]\nC: [0, 2, 3, 1]\nD: [0, 2, 3, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. 
This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 3, 1, 0]\nB: [0, 2, 3, 1]\nC: [0, 2, 3, 1]\nD: [0, 2, 3, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_58_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_58_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_58_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_58_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 3, 5, 15, 12, 11, 2, 13, 1, 10, 6, 14, 9, 4, 8, 7]\nB: [9, 3, 12, 11, 6, 0, 5, 14, 4, 10, 15, 8, 7, 13, 2, 1]\nC: [5, 1, 2, 3, 14, 7, 0, 13, 11, 15, 8, 9, 12, 4, 10, 6]\nD: [13, 6, 14, 10, 2, 3, 5, 12, 7, 15, 9, 8, 4, 1, 0, 11]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 3, 5, 15, 12, 11, 2, 13, 1, 10, 6, 14, 9, 4, 8, 7]\nB: [9, 3, 12, 11, 6, 0, 5, 14, 4, 10, 15, 8, 7, 13, 2, 1]\nC: [5, 1, 2, 3, 14, 7, 0, 13, 11, 15, 8, 9, 12, 4, 10, 6]\nD: [13, 6, 14, 10, 2, 3, 5, 12, 7, 15, 9, 8, 4, 1, 0, 11]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_59_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_59_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_59_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_59_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_59_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_59_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_59_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_59_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_59_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_59_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_59_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_59_11.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_59_12.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_59_13.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_59_14.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_59_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [9, 8, 4, 6, 2, 5, 1, 7, 0, 10, 3, 11]\nB: [11, 4, 2, 5, 3, 6, 8, 10, 9, 7, 1, 0]\nC: [7, 8, 1, 6, 11, 5, 4, 3, 0, 2, 9, 10]\nD: [4, 6, 8, 2, 5, 0, 3, 1, 10, 7, 9, 11]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [9, 8, 4, 6, 2, 5, 1, 7, 0, 10, 3, 11]\nB: [11, 4, 2, 5, 3, 6, 8, 10, 9, 7, 1, 0]\nC: [7, 8, 1, 6, 11, 5, 4, 3, 0, 2, 9, 10]\nD: [4, 6, 8, 2, 5, 0, 3, 1, 10, 7, 9, 11]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_60_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_60_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_60_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_60_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_60_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_60_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_60_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_60_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_60_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_60_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_60_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_60_11.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 4, 1, 2, 5, 3]\nB: [4, 2, 5, 0, 3, 1]\nC: [3, 1, 0, 2, 5, 4]\nD: [0, 2, 3, 1, 4, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 4, 1, 2, 5, 3]\nB: [4, 2, 5, 0, 3, 1]\nC: [3, 1, 0, 2, 5, 4]\nD: [0, 2, 3, 1, 4, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_61_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_61_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_61_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_61_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_61_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_61_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 2, 0, 1]\nB: [0, 3, 2, 1]\nC: [1, 2, 0, 3]\nD: [1, 3, 2, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 2, 0, 1]\nB: [0, 3, 2, 1]\nC: [1, 2, 0, 3]\nD: [1, 3, 2, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_62_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_62_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_62_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_62_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "pouring", "options": "A: [8, 1, 0, 6, 5, 7, 2, 3, 4]\nB: [2, 6, 8, 7, 4, 0, 1, 5, 3]\nC: [5, 8, 2, 7, 1, 3, 0, 4, 6]\nD: [1, 3, 2, 6, 5, 8, 0, 7, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [8, 1, 0, 6, 5, 7, 2, 3, 4]\nB: [2, 6, 8, 7, 4, 0, 1, 5, 3]\nC: [5, 8, 2, 7, 1, 3, 0, 4, 6]\nD: [1, 3, 2, 6, 5, 8, 0, 7, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_63_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_63_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_63_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_63_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_63_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_63_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_63_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_63_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_63_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 4, 2, 5, 0, 1]\nB: [4, 5, 1, 2, 0, 3]\nC: [4, 2, 0, 1, 3, 5]\nD: [0, 4, 3, 1, 2, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 4, 2, 5, 0, 1]\nB: [4, 5, 1, 2, 0, 3]\nC: [4, 2, 0, 1, 3, 5]\nD: [0, 4, 3, 1, 2, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_64_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_64_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_64_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_64_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_64_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_64_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 4, 0, 2, 1]\nB: [4, 1, 0, 2, 3]\nC: [1, 0, 2, 3, 4]\nD: [0, 1, 3, 4, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 4, 0, 2, 1]\nB: [4, 1, 0, 2, 3]\nC: [1, 0, 2, 3, 4]\nD: [0, 1, 3, 4, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_65_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_65_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_65_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_65_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_65_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 5, 4, 3, 2, 1]\nB: [1, 3, 2, 4, 5, 0]\nC: [4, 2, 5, 1, 0, 3]\nD: [4, 0, 3, 5, 2, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 5, 4, 3, 2, 1]\nB: [1, 3, 2, 4, 5, 0]\nC: [4, 2, 5, 1, 0, 3]\nD: [4, 0, 3, 5, 2, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_66_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_66_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_66_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_66_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_66_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_66_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 1, 5, 4, 2, 3]\nB: [0, 5, 1, 2, 4, 3]\nC: [1, 3, 0, 4, 5, 2]\nD: [4, 0, 3, 1, 2, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 1, 5, 4, 2, 3]\nB: [0, 5, 1, 2, 4, 3]\nC: [1, 3, 0, 4, 5, 2]\nD: [4, 0, 3, 1, 2, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_67_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_67_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_67_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_67_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_67_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_67_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 6, 2, 5, 7, 3, 1, 4]\nB: [5, 1, 0, 6, 4, 7, 3, 2]\nC: [6, 4, 2, 5, 7, 1, 0, 3]\nD: [0, 2, 4, 7, 3, 5, 6, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 6, 2, 5, 7, 3, 1, 4]\nB: [5, 1, 0, 6, 4, 7, 3, 2]\nC: [6, 4, 2, 5, 7, 1, 0, 3]\nD: [0, 2, 4, 7, 3, 5, 6, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_68_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_68_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_68_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_68_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_68_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_68_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_68_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_68_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 1, 2, 0]\nB: [3, 1, 2, 0]\nC: [0, 1, 2, 3]\nD: [1, 3, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 1, 2, 0]\nB: [3, 1, 2, 0]\nC: [0, 1, 2, 3]\nD: [1, 3, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_69_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_69_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_69_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_69_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 2, 0, 3]\nB: [1, 3, 0, 2]\nC: [2, 0, 1, 3]\nD: [0, 1, 3, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 2, 0, 3]\nB: [1, 3, 0, 2]\nC: [2, 0, 1, 3]\nD: [0, 1, 3, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_70_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_70_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_70_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_70_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 0, 2, 1]\nB: [3, 0, 2, 1]\nC: [3, 2, 0, 1]\nD: [2, 1, 3, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 0, 2, 1]\nB: [3, 0, 2, 1]\nC: [3, 2, 0, 1]\nD: [2, 1, 3, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_71_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_71_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_71_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_71_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [5, 3, 1, 0, 2, 4]\nB: [5, 4, 0, 2, 1, 3]\nC: [3, 5, 4, 0, 2, 1]\nD: [2, 0, 3, 5, 1, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [5, 3, 1, 0, 2, 4]\nB: [5, 4, 0, 2, 1, 3]\nC: [3, 5, 4, 0, 2, 1]\nD: [2, 0, 3, 5, 1, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_72_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_72_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_72_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_72_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_72_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_72_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 1, 3, 2]\nB: [3, 2, 0, 1]\nC: [3, 1, 2, 0]\nD: [3, 0, 2, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 1, 3, 2]\nB: [3, 2, 0, 1]\nC: [3, 1, 2, 0]\nD: [3, 0, 2, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_73_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_73_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_73_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_73_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "pouring", "options": "A: [3, 4, 5, 6, 2, 1, 0]\nB: [2, 6, 3, 0, 1, 5, 4]\nC: [5, 2, 4, 6, 3, 1, 0]\nD: [4, 0, 6, 1, 2, 5, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 4, 5, 6, 2, 1, 0]\nB: [2, 6, 3, 0, 1, 5, 4]\nC: [5, 2, 4, 6, 3, 1, 0]\nD: [4, 0, 6, 1, 2, 5, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_74_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_74_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_74_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_74_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_74_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_74_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_74_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 5, 3, 4, 1, 0, 6]\nB: [5, 6, 1, 0, 4, 2, 3]\nC: [0, 4, 2, 5, 1, 6, 3]\nD: [4, 3, 2, 6, 1, 5, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 5, 3, 4, 1, 0, 6]\nB: [5, 6, 1, 0, 4, 2, 3]\nC: [0, 4, 2, 5, 1, 6, 3]\nD: [4, 3, 2, 6, 1, 5, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_75_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_75_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_75_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_75_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_75_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_75_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_75_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [5, 2, 0, 6, 3, 4, 1]\nB: [5, 4, 3, 6, 2, 0, 1]\nC: [4, 6, 3, 2, 5, 0, 1]\nD: [3, 4, 2, 5, 0, 1, 6]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [5, 2, 0, 6, 3, 4, 1]\nB: [5, 4, 3, 6, 2, 0, 1]\nC: [4, 6, 3, 2, 5, 0, 1]\nD: [3, 4, 2, 5, 0, 1, 6]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_76_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_76_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_76_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_76_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_76_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_76_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_76_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 3, 2, 1]\nB: [2, 1, 0, 3]\nC: [1, 0, 2, 3]\nD: [1, 0, 2, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 3, 2, 1]\nB: [2, 1, 0, 3]\nC: [1, 0, 2, 3]\nD: [1, 0, 2, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_77_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_77_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_77_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_77_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "pouring", "options": "A: [7, 9, 0, 8, 4, 2, 5, 6, 1, 3]\nB: [3, 4, 8, 9, 2, 1, 6, 7, 0, 5]\nC: [4, 0, 9, 1, 6, 7, 3, 5, 8, 2]\nD: [7, 2, 1, 9, 4, 6, 5, 3, 8, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [7, 9, 0, 8, 4, 2, 5, 6, 1, 3]\nB: [3, 4, 8, 9, 2, 1, 6, 7, 0, 5]\nC: [4, 0, 9, 1, 6, 7, 3, 5, 8, 2]\nD: [7, 2, 1, 9, 4, 6, 5, 3, 8, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_78_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_78_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_78_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_78_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_78_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_78_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_78_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_78_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_78_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_78_9.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 2, 3, 0]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [3, 2, 1, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 2, 3, 0]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [3, 2, 1, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_79_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_79_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_79_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_79_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [12, 19, 11, 1, 15, 16, 5, 3, 7, 6, 4, 14, 9, 17, 2, 10, 0, 8, 18, 13]\nB: [8, 19, 1, 11, 9, 14, 10, 4, 0, 7, 6, 12, 15, 13, 17, 2, 5, 16, 18, 3]\nC: [13, 2, 0, 15, 10, 17, 11, 7, 4, 1, 19, 16, 5, 18, 9, 8, 6, 14, 12, 3]\nD: [19, 11, 3, 18, 7, 8, 12, 16, 17, 2, 0, 10, 6, 15, 1, 13, 14, 5, 9, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [12, 19, 11, 1, 15, 16, 5, 3, 7, 6, 4, 14, 9, 17, 2, 10, 0, 8, 18, 13]\nB: [8, 19, 1, 11, 9, 14, 10, 4, 0, 7, 6, 12, 15, 13, 17, 2, 5, 16, 18, 3]\nC: [13, 2, 0, 15, 10, 17, 11, 7, 4, 1, 19, 16, 5, 18, 9, 8, 6, 14, 12, 3]\nD: [19, 11, 3, 18, 7, 8, 12, 16, 17, 2, 0, 10, 6, 15, 1, 13, 14, 5, 9, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_80_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_11.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_12.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_13.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_14.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_15.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_16.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_17.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_18.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_80_19.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 
3, 2, 1]\nB: [1, 2, 3, 0]\nC: [2, 3, 1, 0]\nD: [1, 3, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 3, 2, 1]\nB: [1, 2, 3, 0]\nC: [2, 3, 1, 0]\nD: [1, 3, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_81_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_81_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_81_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_81_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "pouring", "options": "A: [3, 5, 1, 9, 10, 2, 4, 6, 0, 8, 7]\nB: [3, 7, 6, 8, 2, 4, 9, 5, 0, 1, 10]\nC: [0, 9, 6, 7, 10, 1, 2, 3, 4, 5, 8]\nD: [3, 9, 10, 0, 4, 8, 1, 5, 7, 2, 6]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 5, 1, 9, 10, 2, 4, 6, 0, 8, 7]\nB: [3, 7, 6, 8, 2, 4, 9, 5, 0, 1, 10]\nC: [0, 9, 6, 7, 10, 1, 2, 3, 4, 5, 8]\nD: [3, 9, 10, 0, 4, 8, 1, 5, 7, 2, 6]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_82_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_82_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_82_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_82_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_82_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_82_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_82_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_82_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_82_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_82_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_82_10.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "pouring", "options": "A: [8, 6, 1, 7, 0, 5, 3, 4, 2, 9]\nB: [2, 0, 9, 8, 6, 3, 4, 5, 7, 1]\nC: [4, 7, 3, 0, 1, 2, 5, 6, 8, 9]\nD: [9, 5, 2, 3, 6, 0, 1, 7, 4, 8]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [8, 6, 1, 7, 0, 5, 3, 4, 2, 9]\nB: [2, 0, 9, 8, 6, 3, 4, 5, 7, 1]\nC: [4, 7, 3, 0, 1, 2, 5, 6, 8, 9]\nD: [9, 5, 2, 3, 6, 0, 1, 7, 4, 8]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_83_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_83_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_83_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_83_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_83_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_83_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_83_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_83_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_83_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_83_9.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [4, 1, 6, 2, 5, 0, 3]\nB: [6, 5, 3, 0, 4, 2, 1]\nC: [3, 5, 6, 2, 4, 0, 1]\nD: [5, 3, 4, 1, 0, 6, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 1, 6, 2, 5, 0, 3]\nB: [6, 5, 3, 0, 4, 2, 1]\nC: [3, 5, 6, 2, 4, 0, 1]\nD: [5, 3, 4, 1, 0, 6, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_84_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_84_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_84_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_84_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_84_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_84_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_84_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [7, 0, 5, 6, 3, 4, 1, 2]\nB: [2, 6, 7, 0, 5, 1, 3, 4]\nC: [0, 2, 3, 5, 4, 6, 7, 1]\nD: [0, 4, 2, 5, 3, 7, 1, 6]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [7, 0, 5, 6, 3, 4, 1, 2]\nB: [2, 6, 7, 0, 5, 1, 3, 4]\nC: [0, 2, 3, 5, 4, 6, 7, 1]\nD: [0, 4, 2, 5, 3, 7, 1, 6]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_85_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_85_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_85_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_85_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_85_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_85_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_85_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_85_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [5, 1, 6, 2, 0, 7, 3, 4]\nB: [6, 0, 3, 1, 5, 2, 7, 4]\nC: [1, 5, 6, 0, 4, 7, 3, 2]\nD: [0, 4, 7, 3, 2, 6, 1, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [5, 1, 6, 2, 0, 7, 3, 4]\nB: [6, 0, 3, 1, 5, 2, 7, 4]\nC: [1, 5, 6, 0, 4, 7, 3, 2]\nD: [0, 4, 7, 3, 2, 6, 1, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_86_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_86_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_86_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_86_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_86_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_86_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_86_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_86_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 5, 0, 4, 3, 2, 6]\nB: [3, 6, 1, 5, 0, 2, 4]\nC: [5, 6, 4, 1, 0, 2, 3]\nD: [6, 2, 1, 5, 4, 0, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 5, 0, 4, 3, 2, 6]\nB: [3, 6, 1, 5, 0, 2, 4]\nC: [5, 6, 4, 1, 0, 2, 3]\nD: [6, 2, 1, 5, 4, 0, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_87_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_87_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_87_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_87_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_87_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_87_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_87_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 13, 28, 14, 21, 5, 27, 1, 24, 23, 2, 12, 18, 22, 29, 9, 17, 30, 6, 11, 10, 3, 16, 8, 25, 7, 19, 15, 20, 4, 26]\nB: [5, 2, 10, 30, 8, 15, 29, 22, 27, 12, 13, 3, 24, 11, 0, 25, 19, 16, 23, 17, 18, 21, 7, 20, 6, 1, 28, 14, 26, 9, 4]\nC: [1, 2, 29, 6, 0, 22, 19, 21, 9, 13, 12, 18, 23, 3, 30, 5, 7, 20, 8, 25, 16, 28, 10, 11, 26, 15, 14, 4, 24, 27, 17]\nD: [21, 24, 8, 10, 5, 29, 13, 19, 26, 17, 28, 20, 7, 16, 14, 25, 15, 0, 6, 9, 3, 2, 4, 1, 22, 12, 18, 30, 23, 27, 11]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 13, 28, 14, 21, 5, 27, 1, 24, 23, 2, 12, 18, 22, 29, 9, 17, 30, 6, 11, 10, 3, 16, 8, 25, 7, 19, 15, 20, 4, 26]\nB: [5, 2, 10, 30, 8, 15, 29, 22, 27, 12, 13, 3, 24, 11, 0, 25, 19, 16, 23, 17, 18, 21, 7, 20, 6, 1, 28, 14, 26, 9, 4]\nC: [1, 2, 29, 6, 0, 22, 19, 21, 9, 13, 12, 18, 23, 3, 30, 5, 7, 20, 8, 25, 16, 28, 10, 11, 26, 15, 14, 4, 24, 27, 17]\nD: [21, 24, 8, 10, 5, 29, 13, 19, 26, 17, 28, 20, 7, 16, 14, 25, 15, 0, 6, 9, 3, 2, 4, 1, 22, 12, 18, 30, 23, 27, 11]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_88_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_11.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_12.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_13.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_14.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_15.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_16.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_17.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_18.jpg", 
"./Continuous-temporal/temporal_ordering/temporal_ordering_88_19.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_20.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_21.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_22.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_23.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_24.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_25.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_26.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_27.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_28.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_29.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_88_30.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 2, 1, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 1, 0]\nD: [2, 3, 0, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 2, 1, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 1, 0]\nD: [2, 3, 0, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_89_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_89_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_89_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_89_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 1, 3, 2]\nB: [1, 0, 2, 3]\nC: [3, 0, 1, 2]\nD: [0, 2, 1, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 1, 3, 2]\nB: [1, 0, 2, 3]\nC: [3, 0, 1, 2]\nD: [0, 2, 1, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_90_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_90_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_90_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_90_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 1, 3, 0]\nB: [1, 0, 3, 2]\nC: [0, 1, 3, 2]\nD: [0, 1, 3, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 1, 3, 0]\nB: [1, 0, 3, 2]\nC: [0, 1, 3, 2]\nD: [0, 1, 3, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_91_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_91_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_91_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_91_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 2, 0, 3]\nB: [3, 2, 0, 1]\nC: [0, 2, 3, 1]\nD: [2, 0, 1, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 2, 0, 3]\nB: [3, 2, 0, 1]\nC: [0, 2, 3, 1]\nD: [2, 0, 1, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_92_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_92_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_92_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_92_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [4, 2, 1, 0, 3]\nB: [2, 3, 4, 1, 0]\nC: [3, 1, 4, 0, 2]\nD: [4, 3, 0, 2, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 2, 1, 0, 3]\nB: [2, 3, 4, 1, 0]\nC: [3, 1, 4, 0, 2]\nD: [4, 3, 0, 2, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_93_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_93_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_93_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_93_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_93_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 4, 1, 5, 2, 0]\nB: [0, 2, 4, 3, 1, 5]\nC: [5, 1, 2, 0, 3, 4]\nD: [0, 3, 4, 5, 1, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 4, 1, 5, 2, 0]\nB: [0, 2, 4, 3, 1, 5]\nC: [5, 1, 2, 0, 3, 4]\nD: [0, 3, 4, 5, 1, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_94_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_94_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_94_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_94_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_94_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_94_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 2, 1, 3]\nB: [0, 2, 3, 1]\nC: [0, 1, 3, 2]\nD: [3, 1, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 2, 1, 3]\nB: [0, 2, 3, 1]\nC: [0, 1, 3, 2]\nD: [3, 1, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_95_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_95_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_95_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_95_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [4, 1, 0, 3, 2]\nB: [0, 1, 3, 4, 2]\nC: [1, 4, 3, 2, 0]\nD: [2, 4, 0, 3, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 1, 0, 3, 2]\nB: [0, 1, 3, 4, 2]\nC: [1, 4, 3, 2, 0]\nD: [2, 4, 0, 3, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_96_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_96_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_96_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_96_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_96_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 0, 4, 2, 6, 1, 5]\nB: [1, 6, 4, 0, 2, 5, 3]\nC: [4, 6, 3, 1, 0, 2, 5]\nD: [3, 1, 5, 2, 6, 0, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 0, 4, 2, 6, 1, 5]\nB: [1, 6, 4, 0, 2, 5, 3]\nC: [4, 6, 3, 1, 0, 2, 5]\nD: [3, 1, 5, 2, 6, 0, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_97_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_97_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_97_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_97_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_97_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_97_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_97_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "pouring", "options": "A: [5, 3, 6, 0, 2, 1, 4]\nB: [3, 0, 2, 4, 5, 6, 1]\nC: [3, 6, 5, 1, 0, 2, 4]\nD: [3, 1, 2, 0, 4, 6, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [5, 3, 6, 0, 2, 1, 4]\nB: [3, 0, 2, 4, 5, 6, 1]\nC: [3, 6, 5, 1, 0, 2, 4]\nD: [3, 1, 2, 0, 4, 6, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_98_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_98_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_98_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_98_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_98_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_98_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_98_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 3, 4, 2, 1]\nB: [1, 0, 4, 2, 3]\nC: [2, 1, 4, 3, 0]\nD: [2, 0, 4, 1, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 3, 4, 2, 1]\nB: [1, 0, 4, 2, 3]\nC: [2, 1, 4, 3, 0]\nD: [2, 0, 4, 1, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_99_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_99_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_99_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_99_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_99_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [5, 2, 3, 0, 4, 1]\nB: [3, 5, 4, 1, 2, 0]\nC: [1, 0, 4, 5, 2, 3]\nD: [0, 1, 5, 3, 4, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [5, 2, 3, 0, 4, 1]\nB: [3, 5, 4, 1, 2, 0]\nC: [1, 0, 4, 5, 2, 3]\nD: [0, 1, 5, 3, 4, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_100_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_100_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_100_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_100_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_100_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_100_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 1, 3, 0]\nB: [3, 0, 2, 1]\nC: [0, 1, 2, 3]\nD: [3, 1, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 1, 3, 0]\nB: [3, 0, 2, 1]\nC: [0, 1, 2, 3]\nD: [3, 1, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_101_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_101_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_101_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_101_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 2, 1, 0]\nB: [3, 2, 1, 0]\nC: [1, 2, 3, 0]\nD: [1, 3, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 2, 1, 0]\nB: [3, 2, 1, 0]\nC: [1, 2, 3, 0]\nD: [1, 3, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_102_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_102_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_102_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_102_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "pouring", "options": "A: [3, 1, 2, 0, 6, 4, 5]\nB: [6, 2, 4, 3, 5, 1, 0]\nC: [3, 4, 2, 6, 5, 1, 0]\nD: [5, 0, 1, 6, 2, 4, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 1, 2, 0, 6, 4, 5]\nB: [6, 2, 4, 3, 5, 1, 0]\nC: [3, 4, 2, 6, 5, 1, 0]\nD: [5, 0, 1, 6, 2, 4, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_103_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_103_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_103_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_103_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_103_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_103_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_103_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [8, 2, 0, 5, 7, 4, 6, 3, 1]\nB: [7, 8, 1, 0, 6, 4, 5, 3, 2]\nC: [7, 1, 6, 5, 4, 2, 8, 0, 3]\nD: [0, 4, 5, 1, 2, 6, 3, 7, 8]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [8, 2, 0, 5, 7, 4, 6, 3, 1]\nB: [7, 8, 1, 0, 6, 4, 5, 3, 2]\nC: [7, 1, 6, 5, 4, 2, 8, 0, 3]\nD: [0, 4, 5, 1, 2, 6, 3, 7, 8]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_104_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_104_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_104_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_104_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_104_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_104_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_104_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_104_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_104_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 2, 3, 0]\nB: [2, 0, 3, 1]\nC: [1, 2, 3, 0]\nD: [0, 3, 2, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 2, 3, 0]\nB: [2, 0, 3, 1]\nC: [1, 2, 3, 0]\nD: [0, 3, 2, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_105_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_105_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_105_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_105_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 3, 1, 2]\nB: [2, 3, 0, 1]\nC: [1, 0, 3, 2]\nD: [1, 0, 2, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 3, 1, 2]\nB: [2, 3, 0, 1]\nC: [1, 0, 3, 2]\nD: [1, 0, 2, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_106_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_106_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_106_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_106_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 4, 7, 10, 2, 6, 8, 11, 0, 9, 12, 13, 5, 1]\nB: [8, 1, 12, 0, 9, 2, 11, 13, 5, 6, 4, 3, 10, 7]\nC: [5, 11, 12, 4, 13, 6, 3, 0, 9, 10, 1, 2, 8, 7]\nD: [4, 13, 1, 11, 2, 8, 10, 9, 6, 5, 0, 7, 3, 12]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 4, 7, 10, 2, 6, 8, 11, 0, 9, 12, 13, 5, 1]\nB: [8, 1, 12, 0, 9, 2, 11, 13, 5, 6, 4, 3, 10, 7]\nC: [5, 11, 12, 4, 13, 6, 3, 0, 9, 10, 1, 2, 8, 7]\nD: [4, 13, 1, 11, 2, 8, 10, 9, 6, 5, 0, 7, 3, 12]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_107_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_107_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_107_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_107_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_107_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_107_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_107_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_107_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_107_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_107_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_107_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_107_11.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_107_12.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_107_13.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 2, 9, 10, 6, 11, 5, 3, 8, 13, 4, 7, 1, 12]\nB: [1, 6, 10, 7, 9, 13, 2, 0, 5, 3, 4, 8, 12, 11]\nC: [6, 12, 3, 8, 5, 11, 4, 10, 1, 0, 13, 2, 7, 9]\nD: [2, 7, 9, 4, 10, 11, 3, 8, 0, 13, 6, 12, 1, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 2, 9, 10, 6, 11, 5, 3, 8, 13, 4, 7, 1, 12]\nB: [1, 6, 10, 7, 9, 13, 2, 0, 5, 3, 4, 8, 12, 11]\nC: [6, 12, 3, 8, 5, 11, 4, 10, 1, 0, 13, 2, 7, 9]\nD: [2, 7, 9, 4, 10, 11, 3, 8, 0, 13, 6, 12, 1, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_108_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_108_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_108_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_108_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_108_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_108_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_108_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_108_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_108_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_108_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_108_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_108_11.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_108_12.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_108_13.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [7, 5, 6, 2, 0, 1, 3, 4]\nB: [7, 6, 4, 0, 1, 3, 5, 2]\nC: [4, 5, 2, 1, 6, 7, 0, 3]\nD: [4, 5, 6, 2, 7, 1, 0, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [7, 5, 6, 2, 0, 1, 3, 4]\nB: [7, 6, 4, 0, 1, 3, 5, 2]\nC: [4, 5, 2, 1, 6, 7, 0, 3]\nD: [4, 5, 6, 2, 7, 1, 0, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_109_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_109_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_109_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_109_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_109_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_109_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_109_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_109_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 2, 1, 0]\nB: [3, 0, 2, 1]\nC: [0, 1, 3, 2]\nD: [3, 0, 2, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 2, 1, 0]\nB: [3, 0, 2, 1]\nC: [0, 1, 3, 2]\nD: [3, 0, 2, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_110_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_110_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_110_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_110_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 4, 0, 3, 2]\nB: [2, 3, 1, 0, 4]\nC: [4, 1, 0, 3, 2]\nD: [3, 0, 2, 4, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. 
This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 4, 0, 3, 2]\nB: [2, 3, 1, 0, 4]\nC: [4, 1, 0, 3, 2]\nD: [3, 0, 2, 4, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_111_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_111_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_111_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_111_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_111_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "pouring", "options": "A: [4, 0, 1, 7, 5, 2, 3, 8, 6]\nB: [4, 2, 3, 1, 7, 8, 0, 5, 6]\nC: [5, 6, 7, 1, 4, 0, 3, 2, 8]\nD: [3, 1, 8, 7, 5, 6, 0, 4, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 0, 1, 7, 5, 2, 3, 8, 6]\nB: [4, 2, 3, 1, 7, 8, 0, 5, 6]\nC: [5, 6, 7, 1, 4, 0, 3, 2, 8]\nD: [3, 1, 8, 7, 5, 6, 0, 4, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_112_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_112_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_112_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_112_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_112_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_112_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_112_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_112_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_112_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 1, 3, 0, 4]\nB: [4, 2, 0, 1, 3]\nC: [2, 0, 4, 3, 1]\nD: [1, 3, 2, 0, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 1, 3, 0, 4]\nB: [4, 2, 0, 1, 3]\nC: [2, 0, 4, 3, 1]\nD: [1, 3, 2, 0, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_113_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_113_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_113_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_113_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_113_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 2, 0, 1, 4, 5]\nB: [3, 0, 1, 5, 2, 4]\nC: [5, 3, 0, 4, 2, 1]\nD: [4, 1, 5, 2, 3, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 2, 0, 1, 4, 5]\nB: [3, 0, 1, 5, 2, 4]\nC: [5, 3, 0, 4, 2, 1]\nD: [4, 1, 5, 2, 3, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_114_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_114_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_114_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_114_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_114_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_114_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 0, 3, 2]\nB: [0, 3, 2, 1]\nC: [2, 0, 3, 1]\nD: [1, 3, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 0, 3, 2]\nB: [0, 3, 2, 1]\nC: [2, 0, 3, 1]\nD: [1, 3, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_115_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_115_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_115_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_115_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 0, 2, 4, 3]\nB: [4, 3, 1, 0, 2]\nC: [0, 2, 4, 1, 3]\nD: [4, 3, 2, 1, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 0, 2, 4, 3]\nB: [4, 3, 1, 0, 2]\nC: [0, 2, 4, 1, 3]\nD: [4, 3, 2, 1, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_116_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_116_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_116_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_116_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_116_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 4, 3, 0, 1]\nB: [1, 0, 3, 2, 4]\nC: [1, 2, 4, 0, 3]\nD: [2, 4, 3, 1, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 4, 3, 0, 1]\nB: [1, 0, 3, 2, 4]\nC: [1, 2, 4, 0, 3]\nD: [2, 4, 3, 1, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_117_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_117_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_117_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_117_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_117_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 3, 0, 1, 4, 6, 8, 5, 7]\nB: [3, 1, 7, 4, 0, 8, 2, 5, 6]\nC: [3, 5, 8, 1, 7, 0, 2, 4, 6]\nD: [5, 7, 0, 1, 4, 6, 3, 8, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 3, 0, 1, 4, 6, 8, 5, 7]\nB: [3, 1, 7, 4, 0, 8, 2, 5, 6]\nC: [3, 5, 8, 1, 7, 0, 2, 4, 6]\nD: [5, 7, 0, 1, 4, 6, 3, 8, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_118_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_118_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_118_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_118_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_118_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_118_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_118_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_118_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_118_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", 
"source": "penn_action", "options": "A: [2, 4, 0, 3, 1]\nB: [2, 4, 3, 1, 0]\nC: [0, 3, 4, 1, 2]\nD: [1, 2, 4, 3, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 4, 0, 3, 1]\nB: [2, 4, 3, 1, 0]\nC: [0, 3, 4, 1, 2]\nD: [1, 2, 4, 3, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_119_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_119_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_119_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_119_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_119_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 0, 3, 2]\nB: [1, 3, 2, 0]\nC: [1, 2, 3, 0]\nD: [2, 0, 1, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 0, 3, 2]\nB: [1, 3, 2, 0]\nC: [1, 2, 3, 0]\nD: [2, 0, 1, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_120_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_120_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_120_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_120_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 2, 3, 0]\nB: [0, 1, 3, 2]\nC: [1, 2, 3, 0]\nD: [3, 1, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. 
This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 2, 3, 0]\nB: [0, 1, 3, 2]\nC: [1, 2, 3, 0]\nD: [3, 1, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_121_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_121_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_121_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_121_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 2, 4, 1, 0]\nB: [1, 3, 0, 4, 2]\nC: [4, 0, 1, 2, 3]\nD: [1, 4, 2, 0, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 2, 4, 1, 0]\nB: [1, 3, 0, 4, 2]\nC: [4, 0, 1, 2, 3]\nD: [1, 4, 2, 0, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_122_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_122_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_122_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_122_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_122_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [5, 3, 4, 1, 7, 12, 13, 10, 2, 6, 0, 9, 11, 8]\nB: [11, 10, 2, 4, 13, 8, 5, 3, 0, 6, 7, 9, 12, 1]\nC: [4, 6, 12, 1, 5, 11, 2, 8, 13, 3, 0, 9, 10, 7]\nD: [6, 2, 5, 7, 12, 10, 8, 0, 3, 13, 9, 11, 1, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [5, 3, 4, 1, 7, 12, 13, 10, 2, 6, 0, 9, 11, 8]\nB: [11, 10, 2, 4, 13, 8, 5, 3, 0, 6, 7, 9, 12, 1]\nC: [4, 6, 12, 1, 5, 11, 2, 8, 13, 3, 0, 9, 10, 7]\nD: [6, 2, 5, 7, 12, 10, 8, 0, 3, 13, 9, 11, 1, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_123_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_123_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_123_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_123_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_123_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_123_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_123_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_123_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_123_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_123_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_123_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_123_11.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_123_12.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_123_13.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 2, 3, 0]\nB: [2, 3, 1, 0]\nC: [3, 1, 2, 0]\nD: [1, 2, 0, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 2, 3, 0]\nB: [2, 3, 1, 0]\nC: [3, 1, 2, 0]\nD: [1, 2, 0, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_124_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_124_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_124_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_124_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 5, 3, 1, 2, 10, 19, 16, 8, 7, 21, 15, 22, 18, 17, 4, 14, 13, 20, 11, 9, 6, 12]\nB: [19, 2, 1, 20, 8, 14, 18, 7, 15, 12, 13, 17, 10, 5, 9, 3, 21, 4, 6, 11, 16, 22, 0]\nC: [3, 0, 7, 10, 15, 21, 14, 6, 22, 11, 1, 8, 18, 5, 17, 2, 4, 12, 19, 13, 20, 9, 16]\nD: [3, 11, 7, 6, 22, 15, 1, 9, 20, 19, 4, 18, 10, 21, 12, 2, 13, 17, 16, 0, 8, 14, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 5, 3, 1, 2, 10, 19, 16, 8, 7, 21, 15, 22, 18, 17, 4, 14, 13, 20, 11, 9, 6, 12]\nB: [19, 2, 1, 20, 8, 14, 18, 7, 15, 12, 13, 17, 10, 5, 9, 3, 21, 4, 6, 11, 16, 22, 0]\nC: [3, 0, 7, 10, 15, 21, 14, 6, 22, 11, 1, 8, 18, 5, 17, 2, 4, 12, 19, 13, 20, 9, 16]\nD: [3, 11, 7, 6, 22, 15, 1, 9, 20, 19, 4, 18, 10, 21, 12, 2, 13, 17, 16, 0, 8, 14, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_125_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_11.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_12.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_13.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_14.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_15.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_16.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_17.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_18.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_19.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_20.jpg", 
"./Continuous-temporal/temporal_ordering/temporal_ordering_125_21.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_125_22.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 2, 1, 0, 4]\nB: [4, 1, 3, 0, 2]\nC: [1, 4, 2, 3, 0]\nD: [1, 3, 4, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 2, 1, 0, 4]\nB: [4, 1, 3, 0, 2]\nC: [1, 4, 2, 3, 0]\nD: [1, 3, 4, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_126_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_126_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_126_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_126_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_126_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 7, 6, 9, 5, 8, 3, 0, 4, 2]\nB: [4, 2, 5, 1, 0, 3, 7, 6, 8, 9]\nC: [0, 9, 2, 7, 1, 3, 5, 8, 4, 6]\nD: [6, 7, 4, 2, 9, 0, 5, 1, 8, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 7, 6, 9, 5, 8, 3, 0, 4, 2]\nB: [4, 2, 5, 1, 0, 3, 7, 6, 8, 9]\nC: [0, 9, 2, 7, 1, 3, 5, 8, 4, 6]\nD: [6, 7, 4, 2, 9, 0, 5, 1, 8, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_127_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_127_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_127_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_127_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_127_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_127_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_127_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_127_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_127_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_127_9.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [0, 2, 3, 1]\nD: [3, 1, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [0, 2, 3, 1]\nD: [3, 1, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_128_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_128_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_128_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_128_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [4, 5, 0, 7, 1, 3, 6, 2]\nB: [2, 6, 3, 1, 0, 4, 5, 7]\nC: [7, 1, 6, 0, 5, 2, 4, 3]\nD: [4, 3, 7, 6, 2, 0, 5, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 5, 0, 7, 1, 3, 6, 2]\nB: [2, 6, 3, 1, 0, 4, 5, 7]\nC: [7, 1, 6, 0, 5, 2, 4, 3]\nD: [4, 3, 7, 6, 2, 0, 5, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_129_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_129_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_129_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_129_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_129_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_129_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_129_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_129_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 2, 3, 0]\nB: [2, 3, 1, 0]\nC: [0, 1, 2, 3]\nD: [1, 3, 0, 2]", "question": "Please predict the order of the following pictures, and give each 
picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 2, 3, 0]\nB: [2, 3, 1, 0]\nC: [0, 1, 2, 3]\nD: [1, 3, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_130_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_130_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_130_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_130_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 2, 0, 3]\nB: [3, 0, 1, 2]\nC: [1, 3, 0, 2]\nD: [0, 2, 1, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 2, 0, 3]\nB: [3, 0, 1, 2]\nC: [1, 3, 0, 2]\nD: [0, 2, 1, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_131_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_131_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_131_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_131_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 3, 2, 0]\nB: [2, 3, 0, 1]\nC: [3, 0, 2, 1]\nD: [1, 2, 0, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 3, 2, 0]\nB: [2, 3, 0, 1]\nC: [3, 0, 2, 1]\nD: [1, 2, 0, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_132_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_132_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_132_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_132_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 4, 2, 5, 1, 0, 6]\nB: [4, 6, 3, 2, 5, 0, 1]\nC: [4, 5, 3, 1, 6, 2, 0]\nD: [5, 0, 2, 6, 3, 1, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 4, 2, 5, 1, 0, 6]\nB: [4, 6, 3, 2, 5, 0, 1]\nC: [4, 5, 3, 1, 6, 2, 0]\nD: [5, 0, 2, 6, 3, 1, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_133_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_133_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_133_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_133_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_133_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_133_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_133_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 1, 0, 2]\nB: [1, 3, 0, 2]\nC: [0, 2, 3, 1]\nD: [3, 1, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 1, 0, 2]\nB: [1, 3, 0, 2]\nC: [0, 2, 3, 1]\nD: [3, 1, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_134_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_134_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_134_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_134_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [7, 6, 4, 8, 3, 2, 0, 1, 5]\nB: [0, 1, 7, 8, 3, 4, 5, 2, 6]\nC: [3, 2, 7, 4, 1, 8, 6, 0, 5]\nD: [5, 7, 2, 1, 6, 0, 8, 4, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [7, 6, 4, 8, 3, 2, 0, 1, 5]\nB: [0, 1, 7, 8, 3, 4, 5, 2, 6]\nC: [3, 2, 7, 4, 1, 8, 6, 0, 5]\nD: [5, 7, 2, 1, 6, 0, 8, 4, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_135_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_135_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_135_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_135_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_135_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_135_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_135_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_135_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_135_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 2, 1, 3]\nB: [0, 1, 2, 3]\nC: [3, 0, 2, 
1]\nD: [3, 1, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 2, 1, 3]\nB: [0, 1, 2, 3]\nC: [3, 0, 2, 1]\nD: [3, 1, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_136_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_136_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_136_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_136_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [20, 10, 5, 3, 21, 12, 19, 16, 9, 7, 2, 8, 11, 6, 1, 13, 17, 0, 18, 15, 14, 4]\nB: [19, 10, 12, 21, 3, 17, 11, 1, 20, 18, 9, 14, 5, 8, 6, 13, 15, 2, 4, 0, 16, 7]\nC: [19, 6, 4, 2, 11, 13, 17, 5, 10, 7, 20, 8, 1, 15, 16, 21, 18, 9, 12, 3, 0, 14]\nD: [5, 15, 18, 21, 1, 19, 14, 16, 9, 3, 12, 2, 8, 20, 7, 4, 10, 13, 11, 17, 0, 6]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [20, 10, 5, 3, 21, 12, 19, 16, 9, 7, 2, 8, 11, 6, 1, 13, 17, 0, 18, 15, 14, 4]\nB: [19, 10, 12, 21, 3, 17, 11, 1, 20, 18, 9, 14, 5, 8, 6, 13, 15, 2, 4, 0, 16, 7]\nC: [19, 6, 4, 2, 11, 13, 17, 5, 10, 7, 20, 8, 1, 15, 16, 21, 18, 9, 12, 3, 0, 14]\nD: [5, 15, 18, 21, 1, 19, 14, 16, 9, 3, 12, 2, 8, 20, 7, 4, 10, 13, 11, 17, 0, 6]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_137_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_11.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_12.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_13.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_14.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_15.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_16.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_17.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_18.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_19.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_137_20.jpg", 
"./Continuous-temporal/temporal_ordering/temporal_ordering_137_21.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [4, 3, 1, 6, 2, 0, 5]\nB: [5, 1, 2, 0, 6, 3, 4]\nC: [1, 6, 2, 4, 0, 5, 3]\nD: [0, 5, 3, 1, 2, 4, 6]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 3, 1, 6, 2, 0, 5]\nB: [5, 1, 2, 0, 6, 3, 4]\nC: [1, 6, 2, 4, 0, 5, 3]\nD: [0, 5, 3, 1, 2, 4, 6]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_138_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_138_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_138_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_138_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_138_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_138_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_138_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 3, 0, 1]\nB: [2, 1, 0, 3]\nC: [0, 1, 2, 3]\nD: [2, 3, 1, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 3, 0, 1]\nB: [2, 1, 0, 3]\nC: [0, 1, 2, 3]\nD: [2, 3, 1, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_139_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_139_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_139_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_139_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 0, 1, 2]\nB: [0, 1, 2, 3]\nC: [2, 0, 1, 3]\nD: [1, 3, 2, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 0, 1, 2]\nB: [0, 1, 2, 3]\nC: [2, 0, 1, 3]\nD: [1, 3, 2, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_140_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_140_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_140_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_140_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [6, 4, 1, 5, 3, 2, 0]\nB: [3, 6, 5, 0, 2, 4, 1]\nC: [5, 3, 2, 1, 6, 0, 4]\nD: [0, 4, 6, 2, 1, 3, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [6, 4, 1, 5, 3, 2, 0]\nB: [3, 6, 5, 0, 2, 4, 1]\nC: [5, 3, 2, 1, 6, 0, 4]\nD: [0, 4, 6, 2, 1, 3, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_141_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_141_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_141_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_141_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_141_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_141_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_141_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [5, 2, 7, 3, 1, 6, 0, 4]\nB: [3, 5, 6, 0, 7, 4, 2, 1]\nC: [0, 1, 5, 4, 3, 7, 6, 2]\nD: [1, 0, 3, 5, 7, 6, 2, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [5, 2, 7, 3, 1, 6, 0, 4]\nB: [3, 5, 6, 0, 7, 4, 2, 1]\nC: [0, 1, 5, 4, 3, 7, 6, 2]\nD: [1, 0, 3, 5, 7, 6, 2, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_142_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_142_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_142_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_142_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_142_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_142_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_142_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_142_7.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 3, 2, 0]\nB: [2, 0, 3, 1]\nC: [2, 3, 1, 0]\nD: [0, 2, 1, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 3, 2, 0]\nB: [2, 0, 3, 1]\nC: [2, 3, 1, 0]\nD: [0, 2, 1, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_143_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_143_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_143_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_143_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "pouring", "options": "A: [0, 4, 2, 5, 1, 3]\nB: [3, 2, 0, 5, 1, 4]\nC: [1, 3, 0, 4, 5, 2]\nD: [3, 5, 4, 2, 0, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. 
This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 4, 2, 5, 1, 3]\nB: [3, 2, 0, 5, 1, 4]\nC: [1, 3, 0, 4, 5, 2]\nD: [3, 5, 4, 2, 0, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_144_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_144_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_144_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_144_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_144_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_144_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "pouring", "options": "A: [7, 6, 4, 2, 5, 1, 3, 9, 8, 0]\nB: [5, 8, 6, 1, 9, 3, 4, 0, 2, 7]\nC: [7, 8, 5, 1, 0, 4, 9, 6, 3, 2]\nD: [7, 9, 6, 3, 5, 0, 8, 2, 1, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [7, 6, 4, 2, 5, 1, 3, 9, 8, 0]\nB: [5, 8, 6, 1, 9, 3, 4, 0, 2, 7]\nC: [7, 8, 5, 1, 0, 4, 9, 6, 3, 2]\nD: [7, 9, 6, 3, 5, 0, 8, 2, 1, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_145_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_145_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_145_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_145_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_145_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_145_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_145_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_145_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_145_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_145_9.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 0, 1, 2]\nB: [1, 3, 2, 0]\nC: [0, 1, 3, 2]\nD: [3, 2, 1, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 0, 1, 2]\nB: [1, 3, 2, 0]\nC: [0, 1, 3, 2]\nD: [3, 2, 1, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_146_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_146_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_146_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_146_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 3, 1, 0]\nB: [1, 0, 2, 3]\nC: [1, 2, 0, 3]\nD: [1, 3, 2, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 3, 1, 0]\nB: [1, 0, 2, 3]\nC: [1, 2, 0, 3]\nD: [1, 3, 2, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_147_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_147_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_147_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_147_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 0, 3, 4, 2]\nB: [1, 3, 4, 2, 0]\nC: [0, 3, 4, 1, 2]\nD: [1, 3, 4, 0, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 0, 3, 4, 2]\nB: [1, 3, 4, 2, 0]\nC: [0, 3, 4, 1, 2]\nD: [1, 3, 4, 0, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_148_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_148_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_148_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_148_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_148_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 3, 2, 1]\nB: [3, 2, 0, 1]\nC: [2, 1, 3, 0]\nD: [2, 3, 0, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 3, 2, 1]\nB: [3, 2, 0, 1]\nC: [2, 1, 3, 0]\nD: [2, 3, 0, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_149_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_149_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_149_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_149_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [5, 3, 1, 2, 4, 0, 6]\nB: [1, 3, 0, 6, 2, 5, 4]\nC: [4, 5, 3, 0, 1, 6, 2]\nD: [2, 1, 0, 6, 4, 5, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [5, 3, 1, 2, 4, 0, 6]\nB: [1, 3, 0, 6, 2, 5, 4]\nC: [4, 5, 3, 0, 1, 6, 2]\nD: [2, 1, 0, 6, 4, 5, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_150_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_150_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_150_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_150_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_150_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_150_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_150_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [4, 6, 0, 1, 2, 3, 5]\nB: [5, 1, 0, 6, 2, 4, 3]\nC: [2, 6, 5, 4, 0, 1, 3]\nD: [0, 4, 1, 3, 6, 2, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 6, 0, 1, 2, 3, 5]\nB: [5, 1, 0, 6, 2, 4, 3]\nC: [2, 6, 5, 4, 0, 1, 3]\nD: [0, 4, 1, 3, 6, 2, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_151_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_151_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_151_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_151_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_151_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_151_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_151_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 0, 2, 1]\nB: [0, 3, 2, 1]\nC: [1, 2, 3, 0]\nD: [1, 0, 2, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 0, 2, 1]\nB: [0, 3, 2, 1]\nC: [1, 2, 3, 0]\nD: [1, 0, 2, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_152_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_152_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_152_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_152_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 7, 12, 10, 2, 5, 3, 9, 6, 4, 13, 8, 11, 1]\nB: [11, 13, 7, 3, 1, 8, 5, 0, 4, 6, 12, 9, 10, 2]\nC: [0, 2, 11, 13, 12, 9, 10, 1, 8, 4, 3, 6, 7, 5]\nD: [10, 5, 4, 13, 11, 12, 0, 6, 8, 7, 2, 9, 1, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 7, 12, 10, 2, 5, 3, 9, 6, 4, 13, 8, 11, 1]\nB: [11, 13, 7, 3, 1, 8, 5, 0, 4, 6, 12, 9, 10, 2]\nC: [0, 2, 11, 13, 12, 9, 10, 1, 8, 4, 3, 6, 7, 5]\nD: [10, 5, 4, 13, 11, 12, 0, 6, 8, 7, 2, 9, 1, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_153_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_153_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_153_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_153_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_153_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_153_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_153_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_153_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_153_8.jpg", 
"./Continuous-temporal/temporal_ordering/temporal_ordering_153_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_153_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_153_11.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_153_12.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_153_13.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 3, 0, 1, 4]\nB: [1, 0, 2, 3, 4]\nC: [4, 2, 3, 1, 0]\nD: [0, 1, 2, 4, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 3, 0, 1, 4]\nB: [1, 0, 2, 3, 4]\nC: [4, 2, 3, 1, 0]\nD: [0, 1, 2, 4, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_154_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_154_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_154_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_154_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_154_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 3, 1, 2]\nB: [0, 1, 2, 3]\nC: [1, 0, 3, 2]\nD: [0, 2, 1, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 3, 1, 2]\nB: [0, 1, 2, 3]\nC: [1, 0, 3, 2]\nD: [0, 2, 1, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_155_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_155_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_155_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_155_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 3, 0, 1]\nB: [0, 1, 2, 3]\nC: [0, 1, 2, 3]\nD: [0, 2, 1, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 3, 0, 1]\nB: [0, 1, 2, 3]\nC: [0, 1, 2, 3]\nD: [0, 2, 1, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_156_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_156_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_156_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_156_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 4, 3, 0, 2]\nB: [2, 4, 0, 3, 1]\nC: [1, 4, 2, 0, 3]\nD: [4, 1, 0, 2, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 4, 3, 0, 2]\nB: [2, 4, 0, 3, 1]\nC: [1, 4, 2, 0, 3]\nD: [4, 1, 0, 2, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_157_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_157_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_157_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_157_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_157_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 4, 1, 2, 3, 5]\nB: [1, 5, 2, 4, 3, 0]\nC: [3, 0, 1, 2, 4, 5]\nD: [3, 2, 1, 4, 0, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 4, 1, 2, 3, 5]\nB: [1, 5, 2, 4, 3, 0]\nC: [3, 0, 1, 2, 4, 5]\nD: [3, 2, 1, 4, 0, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_158_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_158_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_158_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_158_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_158_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_158_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [5, 1, 0, 8, 9, 4, 2, 6, 3, 7]\nB: [5, 1, 7, 6, 0, 4, 9, 3, 8, 2]\nC: [2, 1, 8, 6, 0, 3, 7, 9, 4, 5]\nD: [5, 7, 9, 8, 2, 0, 6, 4, 3, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. 
This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [5, 1, 0, 8, 9, 4, 2, 6, 3, 7]\nB: [5, 1, 7, 6, 0, 4, 9, 3, 8, 2]\nC: [2, 1, 8, 6, 0, 3, 7, 9, 4, 5]\nD: [5, 7, 9, 8, 2, 0, 6, 4, 3, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_159_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_159_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_159_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_159_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_159_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_159_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_159_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_159_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_159_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_159_9.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 2, 6, 7, 4, 5, 10, 1, 8, 3, 9]\nB: [8, 4, 9, 2, 5, 0, 6, 3, 1, 10, 7]\nC: [6, 8, 3, 10, 2, 7, 5, 4, 9, 0, 1]\nD: [5, 7, 10, 0, 9, 4, 3, 6, 1, 2, 8]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 2, 6, 7, 4, 5, 10, 1, 8, 3, 9]\nB: [8, 4, 9, 2, 5, 0, 6, 3, 1, 10, 7]\nC: [6, 8, 3, 10, 2, 7, 5, 4, 9, 0, 1]\nD: [5, 7, 10, 0, 9, 4, 3, 6, 1, 2, 8]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_160_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_160_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_160_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_160_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_160_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_160_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_160_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_160_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_160_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_160_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_160_10.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 0, 2, 1]\nB: [2, 0, 1, 3]\nC: [0, 2, 1, 3]\nD: [1, 3, 2, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 0, 2, 1]\nB: [2, 0, 1, 3]\nC: [0, 2, 1, 3]\nD: [1, 3, 2, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_161_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_161_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_161_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_161_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 3, 1, 4, 5, 2]\nB: [3, 2, 4, 0, 5, 1]\nC: [3, 2, 5, 0, 1, 4]\nD: [5, 1, 2, 3, 4, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 3, 1, 4, 5, 2]\nB: [3, 2, 4, 0, 5, 1]\nC: [3, 2, 5, 0, 1, 4]\nD: [5, 1, 2, 3, 4, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_162_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_162_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_162_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_162_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_162_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_162_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 1, 5, 2, 4, 3]\nB: [3, 1, 5, 4, 0, 2]\nC: [5, 0, 4, 3, 2, 1]\nD: [1, 3, 5, 2, 0, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 1, 5, 2, 4, 3]\nB: [3, 1, 5, 4, 0, 2]\nC: [5, 0, 4, 3, 2, 1]\nD: [1, 3, 5, 2, 0, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_163_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_163_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_163_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_163_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_163_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_163_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 1, 2, 9, 4, 7, 6, 5, 10, 8, 3]\nB: [5, 3, 2, 1, 0, 4, 6, 8, 9, 10, 7]\nC: [9, 5, 8, 6, 10, 2, 1, 3, 0, 4, 7]\nD: [4, 10, 3, 2, 8, 7, 0, 9, 5, 6, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 1, 2, 9, 4, 7, 6, 5, 10, 8, 3]\nB: [5, 3, 2, 1, 0, 4, 6, 8, 9, 10, 7]\nC: [9, 5, 8, 6, 10, 2, 1, 3, 0, 4, 7]\nD: [4, 10, 3, 2, 8, 7, 0, 9, 5, 6, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_164_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_164_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_164_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_164_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_164_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_164_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_164_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_164_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_164_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_164_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_164_10.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 2, 1, 4, 0]\nB: [1, 4, 3, 2, 0]\nC: [1, 4, 3, 0, 2]\nD: [2, 4, 1, 3, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 2, 1, 4, 0]\nB: [1, 4, 3, 2, 0]\nC: [1, 4, 3, 0, 2]\nD: [2, 4, 1, 3, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_165_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_165_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_165_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_165_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_165_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "pouring", "options": "A: [4, 0, 6, 2, 7, 3, 5, 1]\nB: [4, 1, 0, 7, 3, 6, 5, 2]\nC: [5, 7, 4, 6, 2, 0, 3, 1]\nD: [7, 2, 5, 4, 1, 0, 6, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 0, 6, 2, 7, 3, 5, 1]\nB: [4, 1, 0, 7, 3, 6, 5, 2]\nC: [5, 7, 4, 6, 2, 0, 3, 1]\nD: [7, 2, 5, 4, 1, 0, 6, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_166_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_166_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_166_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_166_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_166_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_166_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_166_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_166_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 10, 6, 0, 12, 1, 13, 4, 9, 8, 3, 7, 5, 11, 14]\nB: [2, 7, 5, 
6, 11, 3, 1, 14, 8, 4, 12, 10, 0, 13, 9]\nC: [4, 6, 7, 1, 5, 10, 12, 2, 0, 14, 13, 8, 9, 3, 11]\nD: [6, 2, 5, 14, 9, 4, 11, 1, 10, 12, 7, 13, 8, 3, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 10, 6, 0, 12, 1, 13, 4, 9, 8, 3, 7, 5, 11, 14]\nB: [2, 7, 5, 6, 11, 3, 1, 14, 8, 4, 12, 10, 0, 13, 9]\nC: [4, 6, 7, 1, 5, 10, 12, 2, 0, 14, 13, 8, 9, 3, 11]\nD: [6, 2, 5, 14, 9, 4, 11, 1, 10, 12, 7, 13, 8, 3, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_167_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_167_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_167_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_167_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_167_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_167_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_167_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_167_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_167_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_167_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_167_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_167_11.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_167_12.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_167_13.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_167_14.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 2, 3, 1]\nB: [1, 2, 0, 3]\nC: [2, 0, 1, 3]\nD: [1, 0, 3, 2]", "question": "Please predict the order of the following pictures, and 
give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 2, 3, 1]\nB: [1, 2, 0, 3]\nC: [2, 0, 1, 3]\nD: [1, 0, 3, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_168_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_168_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_168_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_168_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 5, 1, 3, 4, 2]\nB: [0, 5, 1, 3, 2, 4]\nC: [3, 0, 4, 1, 2, 5]\nD: [2, 4, 1, 0, 5, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 5, 1, 3, 4, 2]\nB: [0, 5, 1, 3, 2, 4]\nC: [3, 0, 4, 1, 2, 5]\nD: [2, 4, 1, 0, 5, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_169_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_169_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_169_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_169_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_169_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_169_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "pouring", "options": "A: [8, 0, 7, 5, 1, 2, 3, 4, 6]\nB: [0, 7, 8, 4, 1, 6, 5, 3, 2]\nC: [1, 5, 3, 0, 4, 2, 6, 7, 8]\nD: [7, 2, 4, 1, 5, 0, 3, 6, 8]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [8, 0, 7, 5, 1, 2, 3, 4, 6]\nB: [0, 7, 8, 4, 1, 6, 5, 3, 2]\nC: [1, 5, 3, 0, 4, 2, 6, 7, 8]\nD: [7, 2, 4, 1, 5, 0, 3, 6, 8]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_170_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_170_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_170_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_170_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_170_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_170_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_170_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_170_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_170_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 4, 3, 2, 1, 5]\nB: [3, 5, 0, 1, 4, 2]\nC: [2, 5, 3, 1, 4, 0]\nD: [4, 2, 0, 5, 1, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 4, 3, 2, 1, 5]\nB: [3, 5, 0, 1, 4, 2]\nC: [2, 5, 3, 1, 4, 0]\nD: [4, 2, 0, 5, 1, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_171_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_171_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_171_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_171_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_171_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_171_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [4, 5, 2, 3, 1, 0]\nB: [2, 1, 5, 0, 4, 3]\nC: [0, 5, 2, 1, 3, 4]\nD: [1, 0, 3, 4, 2, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 5, 2, 3, 1, 0]\nB: [2, 1, 5, 0, 4, 3]\nC: [0, 5, 2, 1, 3, 4]\nD: [1, 0, 3, 4, 2, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_172_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_172_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_172_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_172_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_172_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_172_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 1, 0, 2]\nB: [3, 2, 1, 0]\nC: [2, 1, 3, 0]\nD: [3, 2, 0, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 1, 0, 2]\nB: [3, 2, 1, 0]\nC: [2, 1, 3, 0]\nD: [3, 2, 0, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_173_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_173_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_173_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_173_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [6, 2, 1, 0, 4, 5, 3]\nB: [5, 3, 1, 0, 2, 4, 6]\nC: [1, 0, 5, 3, 4, 6, 2]\nD: [4, 6, 5, 0, 1, 2, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [6, 2, 1, 0, 4, 5, 3]\nB: [5, 3, 1, 0, 2, 4, 6]\nC: [1, 0, 5, 3, 4, 6, 2]\nD: [4, 6, 5, 0, 1, 2, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_174_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_174_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_174_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_174_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_174_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_174_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_174_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 4, 5, 2, 3, 0]\nB: [5, 0, 1, 2, 4, 3]\nC: [2, 4, 3, 0, 1, 5]\nD: [5, 2, 0, 4, 3, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 4, 5, 2, 3, 0]\nB: [5, 0, 1, 2, 4, 3]\nC: [2, 4, 3, 0, 1, 5]\nD: [5, 2, 0, 4, 3, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_175_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_175_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_175_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_175_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_175_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_175_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 1, 0, 3]\nB: [0, 1, 3, 2]\nC: [2, 1, 3, 0]\nD: [3, 2, 0, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 1, 0, 3]\nB: [0, 1, 3, 2]\nC: [2, 1, 3, 0]\nD: [3, 2, 0, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_176_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_176_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_176_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_176_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 1, 4, 5, 0, 2]\nB: [2, 4, 3, 5, 1, 0]\nC: [5, 3, 4, 0, 2, 1]\nD: [2, 3, 0, 4, 1, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 1, 4, 5, 0, 2]\nB: [2, 4, 3, 5, 1, 0]\nC: [5, 3, 4, 0, 2, 1]\nD: [2, 3, 0, 4, 1, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_177_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_177_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_177_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_177_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_177_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_177_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 0, 3, 2]\nB: [3, 2, 1, 0]\nC: [1, 2, 0, 3]\nD: [2, 3, 0, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 0, 3, 2]\nB: [3, 2, 1, 0]\nC: [1, 2, 0, 3]\nD: [2, 3, 0, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_178_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_178_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_178_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_178_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [5, 0, 3, 2, 1, 4]\nB: [4, 1, 5, 3, 2, 0]\nC: [0, 3, 4, 2, 5, 1]\nD: [5, 4, 2, 0, 1, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [5, 0, 3, 2, 1, 4]\nB: [4, 1, 5, 3, 2, 0]\nC: [0, 3, 4, 2, 5, 1]\nD: [5, 4, 2, 0, 1, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_179_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_179_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_179_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_179_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_179_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_179_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 1, 2, 0]\nB: [0, 2, 1, 3]\nC: [3, 1, 0, 2]\nD: [2, 3, 1, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 1, 2, 0]\nB: [0, 2, 1, 3]\nC: [3, 1, 0, 2]\nD: [2, 3, 1, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_180_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_180_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_180_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_180_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 1, 3, 0]\nB: [1, 3, 2, 0]\nC: [3, 2, 1, 0]\nD: [0, 1, 3, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 1, 3, 0]\nB: [1, 3, 2, 0]\nC: [3, 2, 1, 0]\nD: [0, 1, 3, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_181_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_181_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_181_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_181_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [11, 1, 2, 5, 15, 3, 0, 12, 6, 10, 8, 9, 14, 7, 13, 4]\nB: [9, 5, 6, 7, 2, 3, 4, 8, 0, 11, 12, 10, 15, 13, 14, 1]\nC: [10, 5, 6, 4, 15, 14, 12, 9, 11, 3, 13, 2, 0, 1, 8, 7]\nD: [10, 11, 7, 3, 0, 14, 4, 12, 8, 2, 1, 13, 5, 6, 15, 9]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [11, 1, 2, 5, 15, 3, 0, 12, 6, 10, 8, 9, 14, 7, 13, 4]\nB: [9, 5, 6, 7, 2, 3, 4, 8, 0, 11, 12, 10, 15, 13, 14, 1]\nC: [10, 5, 6, 4, 15, 14, 12, 9, 11, 3, 13, 2, 0, 1, 8, 7]\nD: [10, 11, 7, 3, 0, 14, 4, 12, 8, 2, 1, 13, 5, 6, 15, 9]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_182_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_182_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_182_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_182_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_182_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_182_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_182_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_182_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_182_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_182_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_182_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_182_11.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_182_12.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_182_13.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_182_14.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_182_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 5, 0, 7, 4, 2, 3, 6]\nB: [6, 4, 7, 1, 3, 0, 5, 2]\nC: [0, 3, 6, 7, 1, 4, 2, 5]\nD: [5, 0, 3, 7, 2, 1, 6, 4]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 5, 0, 7, 4, 2, 3, 6]\nB: [6, 4, 7, 1, 3, 0, 5, 2]\nC: [0, 3, 6, 7, 1, 4, 2, 5]\nD: [5, 0, 3, 7, 2, 1, 6, 4]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_183_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_183_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_183_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_183_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_183_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_183_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_183_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_183_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [2, 4, 0, 3, 1]\nB: [1, 0, 4, 3, 2]\nC: [0, 3, 1, 2, 4]\nD: [4, 0, 3, 2, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [2, 4, 0, 3, 1]\nB: [1, 0, 4, 3, 2]\nC: [0, 3, 1, 2, 4]\nD: [4, 0, 3, 2, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_184_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_184_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_184_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_184_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_184_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 1, 9, 4, 10, 2, 7, 8, 5, 6, 3]\nB: [7, 1, 10, 6, 8, 3, 0, 9, 4, 2, 5]\nC: [9, 10, 8, 0, 2, 1, 4, 3, 7, 5, 6]\nD: [10, 5, 0, 3, 8, 9, 4, 2, 1, 7, 6]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 1, 9, 4, 10, 2, 7, 8, 5, 6, 3]\nB: [7, 1, 10, 6, 8, 3, 0, 9, 4, 2, 5]\nC: [9, 10, 8, 0, 2, 1, 4, 3, 7, 5, 6]\nD: [10, 5, 0, 3, 8, 9, 4, 2, 1, 7, 6]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_185_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_185_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_185_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_185_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_185_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_185_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_185_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_185_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_185_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_185_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_185_10.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [6, 0, 2, 7, 8, 4, 1, 3, 5, 9, 10]\nB: [9, 5, 1, 3, 10, 4, 2, 0, 7, 6, 8]\nC: [5, 10, 1, 8, 4, 2, 3, 6, 9, 7, 0]\nD: [5, 7, 6, 0, 4, 3, 1, 2, 9, 8, 10]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [6, 0, 2, 7, 8, 4, 1, 3, 5, 9, 10]\nB: [9, 5, 1, 3, 10, 4, 2, 0, 7, 6, 8]\nC: [5, 10, 1, 8, 4, 2, 3, 6, 9, 7, 0]\nD: [5, 7, 6, 0, 4, 3, 1, 2, 9, 8, 10]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_186_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_186_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_186_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_186_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_186_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_186_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_186_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_186_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_186_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_186_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_186_10.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [4, 0, 1, 2, 3]\nB: [4, 3, 2, 1, 0]\nC: [1, 0, 3, 2, 4]\nD: [0, 4, 1, 2, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 0, 1, 2, 3]\nB: [4, 3, 2, 1, 0]\nC: [1, 0, 3, 2, 4]\nD: [0, 4, 1, 2, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_187_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_187_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_187_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_187_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_187_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 0, 1, 2]\nB: [1, 3, 0, 2]\nC: [2, 3, 0, 1]\nD: [3, 1, 2, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 0, 1, 2]\nB: [1, 3, 0, 2]\nC: [2, 3, 0, 1]\nD: [3, 1, 2, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_188_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_188_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_188_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_188_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "pouring", "options": "A: [5, 4, 2, 6, 3, 1, 0]\nB: [5, 2, 3, 0, 6, 4, 1]\nC: [5, 1, 4, 2, 3, 0, 6]\nD: [2, 4, 0, 5, 6, 1, 3]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [5, 4, 2, 6, 3, 1, 0]\nB: [5, 2, 3, 0, 6, 4, 1]\nC: [5, 1, 4, 2, 3, 0, 6]\nD: [2, 4, 0, 5, 6, 1, 3]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_189_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_189_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_189_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_189_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_189_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_189_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_189_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 2, 0, 1]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 3, 1, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 2, 0, 1]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 3, 1, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_190_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_190_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_190_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_190_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [10, 0, 8, 5, 11, 4, 1, 7, 3, 2, 9, 6]\nB: [10, 5, 7, 2, 3, 4, 1, 6, 11, 8, 0, 9]\nC: [11, 9, 4, 10, 7, 6, 8, 1, 5, 2, 0, 3]\nD: [2, 0, 1, 4, 9, 10, 5, 6, 8, 3, 11, 7]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [10, 0, 8, 5, 11, 4, 1, 7, 3, 2, 9, 6]\nB: [10, 5, 7, 2, 3, 4, 1, 6, 11, 8, 0, 9]\nC: [11, 9, 4, 10, 7, 6, 8, 1, 5, 2, 0, 3]\nD: [2, 0, 1, 4, 9, 10, 5, 6, 8, 3, 11, 7]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_191_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_191_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_191_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_191_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_191_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_191_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_191_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_191_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_191_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_191_9.jpg", 
"./Continuous-temporal/temporal_ordering/temporal_ordering_191_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_191_11.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [5, 2, 4, 0, 6, 1, 3]\nB: [1, 4, 6, 0, 3, 2, 5]\nC: [6, 2, 3, 4, 0, 5, 1]\nD: [6, 2, 3, 4, 0, 1, 5]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [5, 2, 4, 0, 6, 1, 3]\nB: [1, 4, 6, 0, 3, 2, 5]\nC: [6, 2, 3, 4, 0, 5, 1]\nD: [6, 2, 3, 4, 0, 1, 5]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_192_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_192_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_192_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_192_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_192_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_192_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_192_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [3, 0, 2, 1]\nB: [0, 2, 1, 3]\nC: [0, 3, 1, 2]\nD: [2, 3, 1, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [3, 0, 2, 1]\nB: [0, 2, 1, 3]\nC: [0, 3, 1, 2]\nD: [2, 3, 1, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_193_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_193_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_193_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_193_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [6, 5, 3, 1, 2, 4, 0]\nB: [3, 0, 6, 5, 1, 4, 2]\nC: [4, 6, 1, 5, 0, 3, 2]\nD: [5, 2, 6, 1, 4, 3, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [6, 5, 3, 1, 2, 4, 0]\nB: [3, 0, 6, 5, 1, 4, 2]\nC: [4, 6, 1, 5, 0, 3, 2]\nD: [5, 2, 6, 1, 4, 3, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_194_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_194_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_194_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_194_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_194_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_194_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_194_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [2, 3, 1, 0]\nD: [2, 3, 1, 0]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [2, 3, 1, 0]\nD: [2, 3, 1, 0]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_195_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_195_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_195_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_195_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [4, 3, 2, 0, 1]\nB: [3, 0, 1, 2, 4]\nC: [0, 2, 4, 1, 3]\nD: [0, 3, 4, 1, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. The larger the index, the later the order.", "context": "Select from the following choices.\nA: [4, 3, 2, 0, 1]\nB: [3, 0, 1, 2, 4]\nC: [0, 2, 4, 1, 3]\nD: [0, 3, 4, 1, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_196_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_196_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_196_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_196_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_196_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [6, 0, 1, 7, 5, 4, 3, 2]\nB: [5, 0, 1, 6, 4, 7, 3, 2]\nC: [2, 1, 6, 7, 4, 3, 0, 5]\nD: [3, 6, 0, 1, 7, 4, 5, 2]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [6, 0, 1, 7, 5, 4, 3, 2]\nB: [5, 0, 1, 6, 4, 7, 3, 2]\nC: [2, 1, 6, 7, 4, 3, 0, 5]\nD: [3, 6, 0, 1, 7, 4, 5, 2]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_197_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_197_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_197_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_197_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_197_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_197_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_197_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_197_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [1, 2, 3, 4, 0]\nB: [3, 2, 1, 0, 4]\nC: [4, 1, 0, 2, 3]\nD: [0, 3, 2, 4, 1]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [1, 2, 3, 4, 0]\nB: [3, 2, 1, 0, 4]\nC: [4, 1, 0, 2, 3]\nD: [0, 3, 2, 4, 1]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_198_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_198_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_198_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_198_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_198_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "temporal_ordering", "visual_input_component": "Video image or Natural image", "source": "penn_action", "options": "A: [0, 8, 2, 6, 5, 4, 7, 3, 9, 1, 11, 10]\nB: [8, 7, 1, 2, 11, 10, 5, 9, 4, 6, 3, 0]\nC: [1, 4, 11, 9, 3, 0, 10, 5, 7, 6, 2, 8]\nD: [3, 8, 2, 11, 1, 7, 5, 10, 0, 4, 9, 6]", "question": "Please predict the order of the following pictures, and give each picture a sequential index. This index starts from 0. 
The larger the index, the later the order.", "context": "Select from the following choices.\nA: [0, 8, 2, 6, 5, 4, 7, 3, 9, 1, 11, 10]\nB: [8, 7, 1, 2, 11, 10, 5, 9, 4, 6, 3, 0]\nC: [1, 4, 11, 9, 3, 0, 10, 5, 7, 6, 2, 8]\nD: [3, 8, 2, 11, 1, 7, 5, 10, 0, 4, 9, 6]", "input_image_path": ["./Continuous-temporal/temporal_ordering/temporal_ordering_199_0.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_199_1.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_199_2.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_199_3.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_199_4.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_199_5.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_199_6.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_199_7.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_199_8.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_199_9.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_199_10.jpg", "./Continuous-temporal/temporal_ordering/temporal_ordering_199_11.jpg"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/text2image_retrieval/qwen3-vl/metadata_info.json b/results/text2image_retrieval/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..07be070 --- /dev/null +++ b/results/text2image_retrieval/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is entirely black and has a pointed black beak.\na black bird with slick feathers, and a black bill.\nthis is a black bird with a white eye and a large black beak.\nthis bird is black 
with green eyes and has a long, pointy beak.\nthis bird is almost all black with the exception of yellow eyes.\nsolid black bird with a medium beak and a yellow eye.\nthis bird has wings that are black and has yellow eyes\nthis bird is solid black, with a penetrating gaze and a sharp bill.\nthis particular bird has a belly that is black with white eye rings\nthis bird has a black bill and crown and black breast, belly, and wings.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_0_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_0_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_0_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_0_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with a very long wing span and a long pointed beak.\nthe long-beaked bird has a white body with long brown wings.\nthis is a white bird with brown wings and a large pointy beak.\nthis large bird has long bill, a white breast, belly & head and a black back & wings.\nbird has an extremely long wingspan with a darker top and white belly and head.\nthis bird has wings that are brown and has a white belly\nthis bird has extended wings and a white head and body.\nthis bird is white and brown in color, with a long curved beak.\nthis white and grey bird has an enormous wing span.\nthis bird has wings that are brown and has a white body\n", "context": "Select from the following choices.\nA: The first image\nB: The 
second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_1_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_1_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_1_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_1_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals on this flower are numerous colors: green white, purple, and brown.\nthis unusual flower has light green pointed petals with green and yellow stamens, purple sigma and filamented blue and white petals on the inside.\nthis flower has visible sepals, petals, corona filaments that are frilly and three pronged stamen at the top making it a very unique flower.\nthis flower is green white and blue in color, with petals that are oval shaped.\nthe petals of this flower are white and green with a long stigma\nthe many petals of this flower are white and purple and the pedicel is green\nthis flower has ten evenly spaced petals protruding from its center.\nthe petals of the flower are a solid white color, and the thin stamens are a blue color, with white at the base.\nthis flower has petals that are white and has purple stamen\nthis flower is white, purple and green in color, and has petals that are oval shaped.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_2_0.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_2_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_2_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_2_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this small bird has a very shiny black body and feathers, a tiny black bill and bright yellow eyes.\nthis is a blue bird with a white eye and a pointy beak.\nthis bird, with a very prominent yellow eye, is all black but with light, the fur can appear dark bluish or purple.\nthis is a medium sized bird with a black head, black beak, black wings and black feet, the eye is a bright white and the black has an iridescent sheen.\na medium sized bird that has shiny feathers and a narrow pointed bill\nthis particular bird has a black breast with metallic blue wingbars\na bird with a black bill, black crown and black secondaries.\nthe bird has a black beak and yellow eyes and a very color ful gradient on its body.\na very small black bird with a small black bill, it has long legs for its body which are black as well.\nblack head blue body, with black wings.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_3_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_3_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_3_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_3_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": 
"text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: purple flower with curvy string-like petals and a group of large yellow stamen in the center.\nthis purple flower has unusual parts and consists of perianth segmants, corona filaments, and a three pronged stigma.\nthe petals of this flower are purple with a long stigma\nreally cool squiggly lavender petals with purple stigma and white stamen.\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower has purple petals as well as a green pistil.\nthis flower has purple petals and purple and white stamen.\nthis flower is purple and white in color, and has petals that are oval shaped.\nthis flower has purple peyals that has long and stringy stamen\nthe flower on this particular picture has petals as well as a pistil.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_4_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_4_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_4_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_4_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has large green-yellow stamens and two types of purple 
petals.\nthe flower has medium size petals that are purple and smaller skinny white petals.\nlight purple and white petals white and dark purple middle petals green and yellow middle dark green leaves\nthis flower has petals that are pink with purple stringy stamen\nthis flower is pink and white in color, with petals that are oval shaped.\nthis group of two flowers have pink petals that bend backward exposing large light green stamen in the center.\nthe flower is so big with petals that are pink and arranged in disc like manner below the disc of stamen\nthis flower has thick purple petals under a layer of white fringe.\nthis flower is pink, white, and green in color, and has petals that are oval shaped.\nthis flower has petals that are pink and has grteen stamen\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_5_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_5_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_5_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_5_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has protruding green stamen and several layers of distinctly-shaped light purple petals.\nthis flower is purple and white in color, and has petals that are oddly shaped and skinny.\nthe petals of the flower are purple in color and have a center made of yellow stigmas.\nthin light purple petals with frayed light purple, white and darker purple petals above them and a 
white, pink, green pistil.\nthe stringy petals are purple with the green pollen tubes in the middle\na flower with long and narrow pistils that are purple.\nthe long, narrow lavender petals are overlaid with spaghetti-like lavender strands, and striped white and purple in the center near the pistil.\nthis flower has petals that are purple and has stringy stamern\nthis flower has a back row of light violet petals with a second row of thin petals with purple and white stripes.\nthis flower has light purple petals and a corona of striped purple and white filaments between the petals and the stamens.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_6_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_6_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_6_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_6_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a white bird with grey wings and a dark brown head.\nthis is a white bird with grey wings and a long dark beak.\nthis bird has a white belly and breast, with a brown crown and a long blunt bill.\nthis bird is white, brown, black in color with a curved black beak, and black eye rings.\na bird with a brown nape and a white back, with black inner rectrices.\nthis bird has a dark bill and brown head along with a pale colored body and tan to grey wings.\nthis bird has wings that are brown and has a long black bill\nthis is a gray bird with a dark 
brown head and webbed pink feet.\nthis bird is brown in color, with a black beak.\nthis particular bird has a belly that is white and brown\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_7_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_7_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_7_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_7_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a white body with a grey rump and a yellow bill.\nthis bird is white, with a large beak and black wings.\nthis bird is white with black wings, and has a long, orange beak.\nthis bird has a white head, the bill is long and curved, with a white belly and black wings.\na medium sized bird that is mostly white with a very large hooked bill\na very large bird with mostly white body, and a large beak.\nthis bird has wings that are black and has a yellow bill\nthe bird has black wings, with a white breast, a white neck, white head, and a yellow beak.\nthe bird has white feathers on its body and black wing feathers, it has a thick orange bill.\nthis bird has a snow white breast color and a long curved bill\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_8_0.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_8_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_8_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_8_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: outer petals are light purple in color and klarger,inner petals are needle shaped\nthis flower has a layer of white petals on bottom, a layer of light purple petals in the middle, and a layer of very thing purple petals on top.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has petals that are purple with stringy stamen\nthis is a strange flower with purple and white petals and green stamen.\nthe flower has five pale purple petals, five with petals and an oddly shaped green pistil.\nthis flower has rounded green and purple petals and a fringe of purple hairs.\nthis flower has petals that are purple and white and has stringy stamen\na flower with long and narrow petals that are purple.\nthis flower has a row of long green and purple petals under a row of long needle petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_9_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_9_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_9_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_9_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", 
"visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a grey bird with a black beak and a white eye.\nbird with goofy round head, wide curved bill and black all over its body.\nthis is a dark grey bird with a white eye and a large black beak.\nthis bird is white and grey in color with a curved black beak, and white eye rings.\na gray bird with a wide beak and webbed feet.\na small bird with a grey head and black nape, with blue and grey covering the rest of its body\nthis bird is black and gray in color, with a large curved beak.\nthis bird is gray and black in color, with a large black beak.\nthis small bird has a black flat bill, fuzzy black feathers and small feet.\nthis bird has wings that are lack and has white eyes\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_10_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_10_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_10_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_10_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a purple flower with long purple anthers on it.\nthe petals of this flower are purple with a long stigma\nthis flower has petals that are purple with purple 
stamen\nthis flower is white and purple in color, with petals that are oval shaped.\nthis is a large flower with purple petals and white stigma.\nthe flower is so big with petals that are soft, smooth and separately arranged in single disc layer below the layer of blue curly stamen\nthis flower has purple petals as well as a green stamen.\nthis flower is purple and white in color, and has petals that are oval shaped.\nthis flower has petals that are purple and has stringy purple stamen\nit has very frilly leaves!\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_11_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_11_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_11_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_11_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is purple and white in color, with petals that are oval shaped.\nthe flower petals is spiked and lite purple and dark purple\nthis flower has long purple petals and long purple stamen in the middle\nthe flower is so big and large with disc like arrangements of petals and stamen with stamen disc on top of petals disc\nthis flower has purple petals and green pistil as its main features\nthis flower is white and purple in color, and has petals that are oval shaped.\nthe petals are purple and white and the stamens are green and yellow with brown spots.\nthis flower has very long purple filaments on top of a layer of 
flat purple petals with bright yellow stamens and green pistils.\nthis flower has petals that are white and has stringy stamen\nthe petals on this flower are mostly stringy purple, and yellow, green is the color of the stamen.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_12_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_12_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_12_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_12_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has a long wide beak and feathers over the eyes like lashes.\nthis bird has a yellow orange bill with white eyebrow and grey breast.\nthis bird has a black body with an orange beak\nthe bird as a grey belly, the bill is short and pointed, with black and grey covering the rest of its body\nthis waterbird features a rather large, yellow beak and small, red eyes.\nthis bird has wings that are black wtih a short yellow bill\nthis bird has wings that are black and has a yellow bill\na medium size bird with a short, pointed, orange beak.\ngrey chest, black head, yellow beak\nthis bird has wings that are black and has a yellow bill\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_13_0.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_13_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_13_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_13_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a black and brown bird with small brown feet with a small bill\na black bird with a long tail and a large gray beak\nthis is a black bird with a large pointy grey beak.\nthis crested black bird has spiky plumage, long tail feathers, and a short, thick gray beak.\nthis bird is completely black with a thick blunt bill.\nthis bird has wings that are black with long tail feathers\nthis bird is mostly black with a long tail and a larger beak.\na bird with a very large grey beak and black and white feathers.\nthis bird has wings that are black and has a short bill\nthis bird has a black crown with black belly and black sides.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_14_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_14_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_14_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_14_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most 
relevant picture among the candidate images for this description.\nDescription: a black bird with a yellow tarsus and a tall black bill.\nthis bird is black with a super thick and short beak.\nthis is a black bird with black feet and a large black beak.\nthis bird is black in color with a large black beak, and black eye rings.\nlarge black bird with long tail feathers and a thick stout black beak.\na black bird with a short, slightly curved beak, a somewhat elongated body and tapering, black tail feathers.\nthis bird has a large wide beak and black feathers covering the rest of its body.\na very tall black bird with a large black beak.\nlong black bird with a short and fat beak.\nthis bird has wings that are black and has a thick bill\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_15_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_15_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_15_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_15_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a small orange bill that is stubbed.\na black and white bird with a short blunt orange beak.\na small bird with black feathers covering its nape, back, and wings, along with a white and grey speckled throat.\nthis bird is white and black in color with a red beak, and white eye rings.\nthis is a black bird with a white belly and a red beka.\nthis bird has a white belly, black wings and crown, and 
a small red bill.\nthis is a black bird with a white spot on the head and a white breast with an orange beak.\nthis particular bird has a belly that is white with black spots\na bird with black wings and back, white belly and breast, the bill is short and flat\nthis bird is black and white in color, and has a bright orange beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_16_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_16_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_16_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_16_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: lower petals are white in color,and larger in size,inner petals are purple in clor\nwhite petals with blue white and purple petals purple green and yellow middle brown scam and green leaves\nthis flower has petals that are white, with purple and white filaments, and green stigma.\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has petals that are white with purple stringy stamen\nthe petals of this flower are white and arranged in a star formation around a blue pistil and green stamen.\nthis flower has petals that are white and has purple stringy stamen\nthis flower has rounded green petals under a purple and white fringe and thick stamen.\nthis flower is white and purple in color, and has petals that are oval shaped.\nthis flower has purple thin petals and large light 
green sepals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_17_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_17_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_17_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_17_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower has white sepal with purple and green pollen tubes\nthis flower has large green stamen and pistil surrounded by a purple fringe and white petals with rounded tips.\nthis flower has long white petals and long purple stamen in the middle\nthe white and purple flower has waxy leaves.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has white petals and large purple, white, and blue stamen.\nthis flower has white and purple petals and a green pedicel\nthis flower is white, blue, and green in color, and has petals that are oval shaped.\nthis flower has white petals that has long stringy and purple stamen\na flower with long and narrow petals that are white.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_18_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_18_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_18_2.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_18_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are purple with stringy purple stamen\nthis flower has purple petals with the inner ones very thin and stringy with purple pollen tube and a green stigma\nthe violet flower has petals that are soft, smooth and separately arranged in disc like manner that is below the disc of stamens that are curly through out their length\nthis flower is white and purple in color, with petals that are oval shaped.\nthe flower has petals that are lavender, with wavy lavender filaments and white anther.\nthis flower has flat oblong purple petals with a wavy layer of filaments and a tall stamen and pistil.\nthis flower has petals that are purple and has yellow stamen\nthis flower has purple petals as well as a yellow pistil.\nthis flower has long rounded petals under a fringe of frizzy purple hairs.\nthis flower is purple and white in color, and has petals that are oval shaped.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_19_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_19_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_19_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_19_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", 
"options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are purple with green steman\nthis flower is white and purple in color, with petals that are pointed.\nthis flower has purple petals and a lot of purple stamen coming out\nthis pale purple flower has string like petals that are above light purple rounded petals below.\nthe flower shown has green pollen tubes with stringy petals and a purple center\nthis flower has tall yellow stamen and green pistils, curvy white filaments, and white petals.\nthe pale purple flowers are long and thin, with darker purple encircling the base of each petal.\nthis flower has petals that are white and has purple stringy stamen\nthe flower has a lavendar petals with thin purple stamens around the green pollen tube\nthis flower has skinny, curly-looking light purple petals and large anther.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_20_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_20_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_20_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_20_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is white in color, with petals that are oval shaped.\nthis flower's leaves appear to be large and pointy at the ends. 
a burnt orange color makes it exquisite to look at. the stamen is yellow in color, easy to see, and easy to get to for pollination. the pollen tube is green in color and long, sticking up above flower. the whole pistil is green in color and easy to see and get to.\nthis flower has protruding pollen tube and stamen surrounded by several slightly pointed white petals.\nthis unique white flower has lots of thin blue petals with a dark green center and purple lines.\nthe flower is so big and has petals that are white, separated, thick and arranged in a disc like manner below the stamen which is also arranged in a disclike manner that are blue, white and brown\nthis flower has ten white petals and a corona of fine, vivid purple appendages between the petals and the light green stamens.\nthis flower has white petals with blue and white filaments arranged in a flat disk formation.\nthis flower has thick pale green petals under thick purple and white fringe.\nthis flower has petals that are white and has a stringy stamen\nthis flower is white, green, and purple in color, with oval shaped petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_21_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_21_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_21_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_21_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a long black bill that is somewhat 
curved.\nthe bird has a long black bill that is curved as well.\nthis grey bird has an impressive wingspan, a grey bill, and a white stripe that surrounds the feathers near the bill.\nthis large bird is mostly grey with a long hooked bill.\nbird with long fat beak that is curved at tip, and the head is proportional to its body size with whole body covered in black\nlarge bird that is complete brown, with white stripes littering it's wings and a long blunted bill.\na black bird with very long wings and short tail, black beak with some white markings on the face around the beak and eyes\nthis bird is all black and has a long, pointy beak.\nthis appears to be a large bird that is almost completely black. it also has a very large black bill with slight white on the face and crown.\nthis bird has long triangular wings and a thick heavy beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_22_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_22_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_22_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_22_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bottom layer of the flower is purple and the middle is light purple made of strings.\na purple flower with little stands curling from the bottom and large petals on top.\nthis flower has a white pistil with green pedicel and purple petals\nthis flower has petals that are purple with purple 
stamen\nthis flower is purple in color, with petals that are oval shaped.\nthis flower has purple petals and a purplish-white colored stamen.\nthis flower is purple and white in color, and has petals that are oval shaped.\nthis flower has petals that are purple and has stringy stamen\na flower that has long and curly pistils that is purple.\nthis flower has long purple petals bent back from a center with many long fringes of purple and an elaborate center.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_23_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_23_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_23_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_23_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is completely black.\na medium bird with all black body, tarsus, and beak.\nthis bird is black in color, with a black beak and a black eye ring.\nall black bird with white eye ring, black tarsus and feet.\na bird with a black crown and a black body.\nthis bird is all black and has a long, pointy beak.\nthis bird is large and black with white eyes and black pupil and long black tail.\nthe bird is pitch black including its feet and beak, it has yellow eyes.\nthis muted black bird unveils a distinctive yellow eye behind a long pointed beak.\nthis bird has a jet black body, a short beak and bright white eyes.\n", "context": "Select from the following choices.\nA: The first 
image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_24_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_24_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_24_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_24_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: prominent purple stigma,petals are white inc olor\nthis flower has bright purple, spiky petals, and greenish sepals below them.\nthis flower has a row of white petals with the multi-colored stamens and a pistil at the centre\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has petals that are green with stringy purple stamen\nthis flower has flat elongated creamy petals around a fringe of purple white and brown petals and large stamen.\nthis flower has petals that are white and has purple stamen\nthis flower has blue petals as well as a green and purple pistil.\nthis flower is purple, blue and white in color, and has stamen that are very long and skinny.\nthis flower has white oblong petals with blue filaments, purple pistils, and green stamen.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_25_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_25_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_25_2.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_25_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: an all black bird with small beady eyes.\nbird has black body feathers, black breast feather, and black beak\nthis is a black bird with a long black tail wing and a pointy beak.\nthese two birds are black all over and have very long retrices and gray bills.\nthis bird is all black with a very long tail and the area around its eye is lacking feathers.\nthis bird has feathers that are black and has a thick black bill\na small black bird, with a flat tail, and a short bill.\na small bird with a black color covering and long tail.\nthis bird is dark black and featherless around is eyes, and has a short black beak.\nthis bird has wings that are black and has a thick bill\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_26_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_26_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_26_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_26_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is white and 
blue in color, with petals that are oval shaped.\ninner petals are needle shaped and are purple incolor\nthis flower has large green pistil and skinny purple and white petals as its main features\nthe petals of this flower are green with a long stigma\nthis flower has petals that are white with purple stamen\nthis flower has yellow anthers and green filaments and purple petals.\nthis purple flower has many pointed petals and a yellow and purple stamen.\nthis flower has petals that are white and has yellow stamen and\nthin, needle-like, purple petals, yellow-green anthers and a green stigma.\nthis unique flower has a lower row of white petals and an upper row of long, thin purple petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_27_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_27_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_27_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_27_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are green wiht purple stiamne\nthe center of the flower is of various colors such as purple, maroon, white and yellow.\nthe flower has many colors such as blue white purple with a green style\nthis flower has white petals with a white and purple colored stigma.\na flower with a lot of different colors with a large ovary\nflower that has long, skinny, fringed petals with a white stigma and dark brown anther.\nthis flower 
has blue petals as well as a green pistil.\nthis flower is white and blue in color, with oval shaped petals.\nthis flower has green petals with long and string purple stamen\na flower that has long skinny petals that are purple and white.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_28_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_28_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_28_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_28_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a flower with very thing purple, white, and red petals with very large red anther and filaments\nthe petals of this flower are green with a long stigma\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has petals that are yellow with stringy stamen\na star shaped flower with long multiple colored stamen surrounded by green and white flat petals.\nthis flower has petals that are white and has purple stringy stamen\nthis flower has very prominent green stamen and purple pistils that stand upright in contrast to a flat ring of blue filaments and white petals.\nthis flower is white, green, and purple in color, and has petals that are oval shaped.\na flower with long and pointed pistils that are blue and yellow.\nthis flower has large white petals with a light green and brown pistil.\n", "context": "Select from the following choices.\nA: The first image\nB: The 
second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_29_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_29_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_29_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_29_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: leaves are green in color,outer petals are white green in color\nthis flower is white and purple in color, with petals that are oval shaped.\nthis purple flower has regular petals and noodle-shaped petals accompanied by one big pistil.\nthe petals of this flower are green with a long stigma\nthis flower has petals that are green with purple stamen\nthis flower has thick green petals surrounding a layer of thin hairlike purple petals.\nthis petal has purple and light green colors throughout its long, string-like petals.\na flower with long and narrow pedals that are white.\nan odd looking pinwheel shaped flower with wavy stringy pedals and a large center.\nthis flower has white petals that has longy stringy and purple stamen\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_30_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_30_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_30_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_30_3.jpg"], "output": "C", 
"qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has head and beak proportional to its body and the bird is brown in color.\nthis bird has grey neck, head, wings and back, it has white around its bill, and a long tall bill that is curved and black at its tip.\nthis bird is brown in color with a long curved beak and dark eye rings.\nthis bird is grey with some white and has a long, pointy beak.\nthis is a solid brown bird with webbed feet and a long slightly hooked bill.\nthis particular bird has a brown body and brown bill\nthe brown colored albatross has white ring at the base of its beak, white undertail and white eyebrow.\nthis bird has wings that are brown and has a big bill\nthis bird is brown in color, with a large curved beak.\na large bird with a grey coloring and long beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_31_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_31_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_31_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_31_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is stark black from outer rectrices to 
beak, with bright orange wing bars and yellow secondaries.\nthis is a black bird with orange wings and a pointy black beak.\nthis is a black bird that has orange coverts and a yellow wingbar.\nthis bird is black and orange in color with a black beak, and black eye rings.\nblack bird with bright orange stripes on the wing bars and black eyes.\nthis is a bird with a black body, head and beak and it has red and yellow patches on both of it's wings.\nthis bird has wings that are black and has an orange and yellow patch\nthis bird is black with red and has a very short beak.\nthis bird is black with red and has a long, pointy beak.\na small black bird, with 1 yellow bar, and a sharp bill.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_32_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_32_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_32_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_32_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this purple flower has thin string like petals and green stamen.\nthis flower has long rounded white petals on the outer row and long, thin, wavy purple petals on the inner row.\nthis flower is purple and yellow in color with skinny wavy petals.\nthis is a flower with light purple petals and flowing flaments.\nthis flower has lavender peddles with thorns and hair like purple stigmas and white pistil.\nthis flower has petals that are purple and has 
stringy stamen\nthe flower has long, thin purple petals and long green stamens.\nthis flower has white petals, purple stamen and yellow pistil\nthis purple flower has pointed petals and pale purple sepals.\nthis flower has purple petals and purple stigma in a flat circle shape.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_33_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_33_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_33_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_33_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with a meedium sized, medium width pointed bill, all black feathers, a small head, and yellowish eyes\nthis bird has a black pointed beak, and a black body.\nthis is a black bird with black feet and a pointy black beak.\nthis bird is completely black, with a short pointed bill and yellow eyes.\nthis bird is all black, except for a region around its head that is so black it almost looks blue, and its eyes are yellow.\nthis is a black scary looking bird with beading eyes.\nthis bird has feathers that are black and has a black bill\nthis bird has wings that are black and has a black bill\nthis bird has wings that are black and has a thick bill\nthis bird has a deep black crown and a back bill and black wings.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": 
["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_34_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_34_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_34_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_34_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower has a green prominent pisil and stamen that are green in color\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has petals that are white with very stringy stamen\nthe flower has five anthers with skinny purple petals.\nthis flower has white petals as well as an interior row of thin, purple petals.\nthis flower has white petals with purple stamen and green stigma in the center.\nthe flower is so big with petals that are soft, smooth and arranged separately in disc like manner below the the disc of stamen\nthe petals are white in color underneath small, thin petals that are purple and large green stigma\nthe purple and white petals of the flower, are hidden by the purple stamens surrounding the petals.\nthis flower is characterized by its white-purple outer petals, deep purple, stringy inner petals as well as green stamen and purple pistil at the center.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_35_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_35_1.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_35_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_35_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower has white petals and a flat center with purple and white filaments and green stamen.\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has the white petals arranged in the bottom and the top the purple shaded long pistils closely arranged in the circle order\nthe flower has light green petals with thin purple petals on top.\nthis flower has petals that are white with long purple steman\nthis flower has white petals with flat blue filaments and green stamen.\nthis flower has green symetrical filaments and long thin petals.\nthe stamens of the flower are of a hair like texture, and have a distinctive color pattern.\nthis flower has petals that are white and has stringy stamen\nthis flower is green, white, and purple in color, and has stamen that are very long and skinny.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_36_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_36_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_36_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_36_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", 
"options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a large bird with a mohawk, and gray feathers, and an orange beak.\nthis bird has an orange bill, with solid brown feathers covering the rest of its body except for the small cheek patch which is cream and speckled with light brown.\nthis is a brown bird with a small orange beak.\nthis is a brown and grey bird with a small orange beak.\nthis bird has a short orange pointed bill and a brown/black mottled breast & body.\nthis bird has wings that are grey and has an orange bill\nthis bird is brown with a white line coming from its eye, it has a curled feather that comes up before its orange beak.\nthis bird is grey with white and has a very short beak.\nthis bird is gray and brown in color, and has a bright orange beak.\nthis bird has a dark orange bill, with a brown back.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_37_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_37_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_37_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_37_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals of this flower are magenta with a long stigma\nthis flower has large green stamen and pollen tube, and fringed purple hairlike petals, surrounded by 
longer white and purple petals with rounded edges.\nthis flower has petals that are purple with many stamen\nthis flower is white and purple in color, with petals that are oval shaped.\na flower with lavender petals and lavender squiggly pistils showing.\nthis flower has rounded pale purple petals and a fringe of frizzy purple hairs.\nthis flower has flat purple petals with purple stringy stamen forming from the center.\nthis flower is purple in color, and has petals that are oval shaped.\nthis flower has purple petals that has stringy stamen and as yellow style\nthe petals are long and thin and purple and form a flat flower.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_38_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_38_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_38_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_38_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: flower with white long white petals and very long purple stamen\nthis flower is white and pink in color, with petals that are oval shaped.\nthis flower has petals that are green with stringy purple stamen\na flower with stringy looking purple and yellow petals and a green and yellow center.\nthis flower has long white petals beneath a row of slender white and lavender petals surrounding a large erect pistil of green filaments topped with white anthers.\nthis flower has tall green stamen on top of a layer of 
wavy blue filaments and oblong white petals.\nthis flower is white, purple and yellow in color, and has petals that are oval shaped.\nthe flower on this particular picture has petals as well as a stamen.\nthis flower has petals that are white and has stringy purple stamen\nthe petals of this flower are stringy purple and white and the pistil is green\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_39_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_39_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_39_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_39_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: flower with purple petals and very long purple stamen\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower has a large amount of long and very thin purple petals that look like they are wiggling.\nthis flower has petals that are purple with long stringy stamen\nthis flower has long stringy light purple petals , with alternating layers of white and dark purple, with a yellow receptacle.\nthe flower is so big with petals that are soft, smooth and arranged in disc like manner below the disc of curly stamens\nthis flower is purple and white in color, and has petals that are oval shaped.\nthis ornate flower has a white, wheel like center, with indigo feathers, and a very unique indigo styles that string out like spaghetti.\nthis flower has a bottom layer 
of oblong purple petals followed by wavy purple filaments and tall white stamens and pistil.\nthis flower has petals that are purple and has stringy stamen\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_40_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_40_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_40_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_40_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has grey webbed feet and a bright orange beak.\nbird has gray body feathers,white breast feather, and orange beak\na grey bird with webbed feet, a short and blunt orange bill, grey head and wings and has white eyes, a white stripe behind its eyes and white belly and breast.\nthis black and white bird has a short and fat body with a small orange beak.\nthis bird is white with black on its head and has a very short beak.\nthis bird has feathers that are black and has an orange bill\nthis bird has wings that are black and has an orange bill\nthis bird is white with black and has a very short beak.\nthis bird is white and gray in color, and has a vivid orange beak.\nthis bird has a small snubbed orange bill with a bright white eyering\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_41_0.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_41_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_41_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_41_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is purple and white in color, with petals that are oval shaped.\nthis flower has a first layer of white petals with thin purple brush like petals inside along with a large green style.\nthis flower has a lower layer of white petals with an upper layer of very long and thin purple petals\nthis flower has petals in the shape of a circle and are purple\nthere are large white petals with thin purple filaments and green stamen.\nthis flower is white and purple in color, and has petals that are oval shaped.\nthis flower has petals that are white and has purple stringy stamen\nthis unique flower has long slender purple petals with stamen and stigma standing straight up from the middle.\nthe flower is so big with petals that are so soft, smooth and arranged separately below the disc of separately arranged purple stamens\nthis flower has rounded green petals under a thick fringe colored dark purple.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_42_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_42_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_42_2.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_42_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has a completely black body and white eyes.\na fat and shiny all black bird with white eyes.\na large chested black bird with white eyes.\nthis bird is mostly black with a bright yellow eye.\nthis is a large black bird with a white eye and a pointy black beak.\nthis particular bird has a puffy black breast and belly and yellow eyering\nthis round, black bird's small head sticks out prominently, and its yellow eye rings stand out from the the rest of the head.\nthis black bird has pure white color eye ring bulged belly and a sharp pointed beak\nthis bird is black in color, and has a black beak.\nthis bird is all black and has a long, pointy beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_43_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_43_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_43_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_43_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with a very large, hooked bill with 
black tip and all white plumage across its body.\na large white bird with a long curved bill. an all white body, black eye rings, and black wing feathers.\na larger sized bird with a glowing white body and a large orange beak.\nthis large white bird has a large yellow beak which points down towards the end.\nthe bird is very large and has a white belly, breast, and head with a long orange beak.\na long beaked bird with mostly white and black feathers.\nthe bird has a white belly, long peach bill that is curved and a white crown.\nthis bird has wings that are black and has a long bill\na large bird with a white breast, throat, and head with black eye rings and a large pointed beak.\na pale beak and smokey looking eyebrows on this white breasted bird are striking.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_44_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_44_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_44_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_44_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with a short, rounded beak which ends in a point, stark white eyes, and white throat.\nthis bird has a black eyering, a bright red bill, and a white throat.\nthis bird looks black and white lines drawn on it, it's small beak is blood red and it's eyes are alert, white with a small black pupil.\nthis bird has a speckled belly and breast with a blunt orange bill.\na black and 
white bird with white eyes and a short beak.\nthis bird has a white throat and a short orange bill\nthe bill of the bird is short, puffy, and a distinctive red color.\nthis bird has a long neck and has a red bill\nthis bird has a stubby red bill and white throat with a black crown and white eyes.\nthis bird has a black and dark orange bill, with white eyes.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_45_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_45_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_45_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_45_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a black bird with orange and yellow wingbars and black eyes.\nthis bird is all-black except for a blaze of red on the coverts with a short, pointy black beak and black eyes.\nbird has black body feathers, black breast feather, and pointed beak\nthe black bird has a bright orange stripe on its wings and a large tail.\nthis bird is black in color with a black beak, and black eye rings.\nthis bird is black with red and has a long, pointy beak.\nthis bird is all black with a little bit of red and yellow on it's wings and a very sharp beak.\nthis black bird has a black bill and a red patch on its wings.\na dark black bird with a short black pointed beak.\nthis bird is black and red in color, and has a black beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The 
second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_46_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_46_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_46_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_46_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are pink with stringy white stamen\nthe petals of the flower are pink in color and have green leaves.\nthe petals of this flower are pink with a long stigma\nthis flower is pink and white in color, with petals that are oval shaped.\nthe flower is big and has disc of petals that are soft, smooth and has disc of stamens in the above that are white\nthis flower has purple petals as well as a green pistil.\nthis flower has petals that are pink and has stringy stamen\nthis flower is pink and white in color, and has petals that are oval shaped.\nthe main color of the pedals are deep pink with white filaments poking out.\nthis flower has long, skinny purple petals and white and dark purple stamen.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_47_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_47_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_47_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_47_3.jpg"], "output": "A", 
"qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with a pointed bill, red and white wingbars, and with black covering the rest of its body.\nthis bird is black with a red and white spot on its wing and has a long, pointy beak.\na bird that is covered in solid black feathers with the exception of its wings which have a few orange and light yellow feathers on it.\nthis black bird features brief areas of red and white on its wings, and it has a sharp, modest size beak.\nthis bird is nearly all black with a red and white covert and short pointy bill.\nthis black bird has a red and white patch on its wing, along with a sharply pointed small black beak.\nthis bird is black white and red in color, with a black beak.\nthe bird is almost entirely jet black with a sharp bright patch of red and yellow on the wings.\nthis bird has wings that are black and has a short bill\nthis bird has a black bill and crown and breast and black wings with a white and red wingbar.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_48_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_48_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_48_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_48_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth 
image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this imposing bird is all black including its eyes, feet, and sharp pointed bill and it has longer tail feathers.\nthis bird is all black.\na small bird covered in black feathers from head to tail, with a sharp but short pointed beak.\nthe wholly black bird features a strong, thick beak and beady black eyes.\na black bird that is medium in size very long outer rectrices.\nthis all black bird has long rectrices compared to the rest of its body and a short black bill.\nthis medium sized bird appears to be all black.\nthis bird is all black and has a very short beak.\nthe bird is small with a pointed bill, and the belly is black.\nthe bird has a black belly, black back and a black bill.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_49_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_49_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_49_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_49_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has thick, white feet and tarsi with a striking red bill.\nthis is a bird with a white belly, black wings and a white eye.\nthis bird has a white eye ring, belly and vent along with a white with grey speckled throat and breast.\nthis bird has a white belly and breast with a black wing and crown.\na black bird with a wide orange 
beak, white chest, short tarsus and elongated foot.\nthis bird has wings that are black and has a orange bill\nblack back and crown with very long feet.\nthis bird has wings that are black and has a white belly\nthis bird has a black back, wings and head, with a white belly and speckled nape.\na small black bird, with a white belly, and webbed feet.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_50_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_50_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_50_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_50_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the gray bird has a light grey head and grey webbed feet.\nthis bird has a large, straight bill, large black feet, and a white and gray crown.\na large flying bird with and all grey body, long dark wings, webbed feet, and a long sharp bill.\na medium bird with a gray body, feet,wings and bill.\nthis bird is black with white on its tail and has a long, pointy beak.\nthis is a very large charcoal colored bird with a huge wing span and webbed feet.\nthis bird has wings that are brown and has a long bill\nthis large black bird has a long wingspan and webbed feet.\nthis bird has large feet and a broad wingspan with all grey coloration\nthis bird has large, black, webbed feet, and is covered in gray plumage.\n", "context": "Select from the following choices.\nA: The first image\nB: The second 
image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_51_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_51_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_51_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_51_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are purple with long stamen\nthe pedals of this flower are purple and magenta with a long stigma\nthis flower has long and skinny purple petals followed by even thinner petals above them.\nthis vibrant purple bloom features a ring of coronal filaments over the rounded sepals and a tall pistil ringed with purple filaments, each capped with a bold yellow anther.\nthis flower is yellow and purple in color, with petals that are oval shaped.\nthis flower has long purple petals with a yellow and pink pistil.\nthe petals of this flower are purple and the pedicel is green\nthis flower has bright purple oblong petals underneath a layer of long blue filaments with curly ends and tall yellow stamen with purple pistils.\nthis flower has petals that are purple and has stringy stamen\nthis unquie flower has a lot f strange looking purple petals to it\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_52_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_52_1.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_52_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_52_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a bird with a white belly and a black back\nwhite and black bird with a short orange beak and white eyes.\nthis bird has yellow abdomen and breast, black coverts and orange bill.\nbird has orange beak white belly the rest of the bird is black.\nthis goofy looking creature has a white belly and breast, black head, and short bright orange bill.\nthis particular bird has a white belly and a black breast\nthis bird is black with white and has a very short beak.\na black bird with a white breast and black feet.\nthis bird has a black crown, black primaries, and a white belly.\na dark black bird with a white belly and flank.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_53_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_53_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_53_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_53_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this 
description.\nDescription: this flower has prominent green stamen surrounded by a hairlike purple fringe and long rounded white and purple petals.\nthe petals on this flower are pink with a long stigma\nthis flower is white and pink in color, with petals that are oval shaped.\nthis flower has petals that are purple with stringy stamen\nthe petals on this flower are pink with an elaborate pistil.\nthis flower has a double row of elongated petals under a row of needle shape petals.\nthis flower has pink and white petals and has purple stamen\nthis flower has bright green stamen, with an inner layer of dark purple petals and an outer layer of light pink petals.\nthis flower is white and pink in color, and has petals that are oval shaped.\nthis flower has purple and green petals and a hair-like purple fringe in its center.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_54_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_54_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_54_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_54_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals of the flower are pink and have short white filaments with green leaves.\nthis flower has petals that are purple with stringy stamen\nthis flower's petals are thing and long, changing from white to maroon at the edges, and they surround a pair of long, red stamens.\na pink petal flower with white filaments 
and yellow and green anther.\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower has petals that are pink and has stringy stamen\nthis flower has long pinkish petals with white stripes and white filaments.\nthe flower on this particular picture has petals as well as a sepal.\nthis flower has a flat layer of long pink petals underneath another flat layer of white filaments with tall green stamen and purple pistils at the center.\na large flower with a long purple and white pedals and a green center.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_55_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_55_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_55_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_55_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has green anther and a purple stigma on top of white and purple petals.\nthis flower has long purple petals and long purple stamen coming out of it\nthis flower has petals that are light blue with purple and stringy stamen\nthis flower is purple in color, with petals that are oval shaped.\nthis flower has petals that are lavender with wavy filaments and white anthers.\nthe petals of the flower are light purple in color with broad green leaves.\nthe petals are long and thin and purple with long pointy pistil.\na flat purple flower with purple petals in two layers surrounding many green 
pistils.\nthis flower is purple in color, and has petals that are oval shaped.\nthis flower has petals that are purple and has stringy stamen\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_56_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_56_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_56_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_56_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has star shaped white petals as its main feature.\nthis flower has prominent green and purple stamen and pollen tube surrounded by two layers of thin purple and wide white petals.\nthe petals of this flower are white with a long stigma\nthe petals of this flower are white with long purple stigma\nthis flower is white and blue in color, with petals that are oval shaped.\na large pinwheel shaped and gray flower with a large colorful center.\na flower with white petals and purple stamen.\nthis flower is white, purple, and green in color, and has petals that are oval shaped.\nthis flower has petals that are white and has stringy stamen\nthe flower has white long thin petals and purple anthers.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_57_0.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_57_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_57_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_57_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a long curved black bill and a black eyering.\nthis a dark grey bird with a brown head and a white eye.\na small bird with a brown head and brown nape, with brown covering the rest of its body, and the head is small compared to the body.\nthis is a dark grey bird with a white eye and a large beak.\nthis bird is black with long wings and has a long, pointy beak.\na large brown bird with long wings, a long blunt beak and white around the eyes.\nthis is a bird with a long tail and beak with a reddish head and black wings.\nthis brown bird has large eyes.\nthis bird has wings that are black and has a long bill\nslate grey smooth feathered bird with a large head and a wide wingspan.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_58_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_58_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_58_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_58_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second 
image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals on this flower are purple with an elaborate pistil.\nthe flower has purple petals on it and the stamen is seen.\nthis flower has long, finger like purple petals and long, white and purple stamen.\nthis flower is purple and yellow in color, with petals that are oval shaped.\nthis flower has petals that are purple with stringy stamen\nthis flower is purple and white in color, and has petals that are oval shaped.\na flower with long and narrow petals that are light purple.\nthe petals on this flower are mostly pink and purple, with yellow stamen.\nthis flower has thick purple petals under a fringe of purple and thick stamen.\nthis flower has petals that are pink and has stringy stamen\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_59_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_59_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_59_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_59_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are green with stringy purple stamen\nthis flower is white and blue in color, with petals that are oval shaped.\nthe petals on this flower are white with an elaborate pistil.\nthe flower is unique because the petals aren't separated and they have a round 
tip\nthis flower has blue petals as well as a green and purple pistil.\nthis flower has thick and pale green petals under a thick fringe of purple and white.\nthis flower has petals that are white and has stringy stamen\nthis flower has white oblong petals and white flat filaments.\na flower with long and narrow petals that are whtie.\na flower with long and narrow petals that are whtie.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_60_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_60_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_60_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_60_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals on this flower are white with an elaborate pistil.\nthis flower has petals that are green with purple stringy stgma\ninnr petals are needle shaped and are puple inc olor\nthis flower is white and blue in color, with petals that are oval shaped.\na white pedaled flower with wavy blue pedals coming from the green stigma.\nthis flower has blue petals as well as a green and purple pistil.\nthis white, purple, and blue flower has pointed petals and light green sepals.\nthis flower has long white petals and a light green and yellow pistil.\nthis flower has petals that are white and has stringy purple stamen\na flower with many white petals and long purple and white stamen at it's core.\n", "context": "Select from the following choices.\nA: The 
first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_61_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_61_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_61_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_61_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird is completely black with thick feather and a thick beak.\nthis bird has a black back, a black outer rectrices, and a short bill\nmedium sized fully black bird with thin white strips on the tail feathers, and short stubby black beak.\nthis bird is all black with a large, curved beak and long, narrow tail feathers.\nthis is a black bird with tan on the crown and wings.\nthis bird is black with white and has a very short beak.\nthis is a black bird with a long tail and a thick, short beak.\nthis bird is all black and has a long, pointy beak.\nthis bird is jet black in color with a long tail in comparison to it's body length\nthis bird has wings that are black and has a thick bill\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_62_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_62_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_62_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_62_3.jpg"], "output": "A", "qwen3-vl": "image none"}, 
{"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a dark grey bird with white eyes and a very short red beak.\na small bird with an orange bill and grey crown and breast.\nthe bird is grey and white with a white eye and white throat.\nthis is a black and white spotted bird with a white throat, eye and red beak.\nthis bird is black and white in color with a red beak, and black eye rings.\nthis bird is black with white and has a very short beak.\nthis bird is black with white and has a very short beak.\nthis mottled white and black bird has white eyes, a white throat, and a stubby dark orange beak.\nthis bird has black eyes with a white throat and a red beak.\nthis bird is black with white and has a very short beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_63_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_63_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_63_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_63_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is red and white in color, with petals that are oval shaped.\na flower with small crimson petals with flowing red and white stamen leading 
into a central manyfold pollen tube cluster.\nthis flower has red thin like petals and white and yellow pistil.\nthis flower has a large red petal and a bright white anther filament\nthe flower has red petals with yellow and white pollen tubes in the center\nthis burgundy flower has 8 prominent stamen surrounded by a layer thin filament like petals and a layer of fuller petals.\nthis flower has petals that are red and has yellow stamen\nthe flower is has petals that are maroon in color and a center that is white in color.\nthis flower has maroon petals underneath and then long hair-like petals on the top.\nthe flower has large red pedals with long red and white needle like pedals and bright yellow stamen\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_64_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_64_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_64_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_64_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the stamen are longer with larger brown in color anthers\nthe petals of this flower are blue with a large stigma\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower has petals that are green with stringy purple stamen\nthis is a strange, multicolored flower with long, skinny petals.\nthe thin blue petals point outward while the stigma droops over the stamen.\npurple flower with interesting 
center and long petals\nthis flower has large white petals and a strange shaped pistil.\nthis flower is white and purple in color, and has petals that are oval shaped.\nthis flower has petals that are green and has purple stamen\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_65_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_65_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_65_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_65_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has black feathers, white feet, and a long black beak.\na blue blackish large bird with a yellow striped long beak, and a yellow rings outlining the eyes.\nthis bird is an almost gray/brown on its nape, back, breast, and belly, it's wings and rump are black, and it has white a white eyering.\na large dark smoky grey body white and black eyes beak is thick\nthis bird has a really long black bill and gray feathers\ngrey bird with white eye ring and beak is black with yellow line in it, feet is white color, and tail is black color.\nthis medium-sized bird is solid dark grey, almost black, with a large, thick bill and a white ring around black eyes.\nthis hefty gray bird has white eyerings and a long black bill.\nthis bird is grey with black and has a long, pointy beak.\na bird with a white eyering, all gray chest and belly and a black and yellow bill.\n", "context": "Select from the following choices.\nA: 
The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_66_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_66_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_66_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_66_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are white with purple stringy stamen\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has green pollen tubes and purple stringy petals as its main features\nthe petals of this flower are purple with a logn stigma\nthe plants has white petals with many more violet stamens\nthis flower has petals that are white and has purple stringy stamen\nthis flower has long white petals and wavy purple filaments.\nthis flower has long purple petals with a large white pistil.\nthis flower has rounded pale green petals under a fringe of paler purple.\nthis flower has white petals with stringy purple stamen in the middle of it.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_67_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_67_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_67_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_67_3.jpg"], "output": "D", "qwen3-vl": "image 
none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird had a white and grey speckled chest with a short orange beak.\nthis bird has a black crown and back, with small accents, as well as a black and white spotted breast.\nthe bird has a grey body and a white and grey speckled chest along with an orange beak.\nthe birds has a white throat, breast, belly, side, abdomen and vent area.\nthis bird is white with black on its head and has a very short beak.\na small white and black bird. it has nearly completely white eyes, and a white breast with light black spots. the throat is completely white.\nthis bird is white with black and has a very short beak.\nthis bird has a black head with a white breast that is speckled with black.\nthis bird has a small orange bill with grey crown and white and grey spotted belly.\nthe small bird has a bright white eye, a short black and orange bill, and a spotted belly.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_68_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_68_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_68_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_68_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate 
images for this description.\nDescription: this bird has gray tarsus and feet with an orange beak and white eyes.\nthis mostly black bird has a bright orange bill, white eyes, a white stripe leading from his eyes down his head and a feathery lifted crown.\nthis insane looking bird has a greyish-black body, a very short, crushed orange beak, and pinhole pupils.\na bird with a short rounded orange bill, stark whtie eye with white brown, and feathered point coming off its superciliary.\nthe bird has an orange bill with feathers sticking up in front of it as well as an entirely black body.\na brown medium size bird with a short orange beak.\nthis bird has black primaries with a black crown and black belly.\nthe bird has black feathers and an orange bill. it also has a line of white feathers behind its eye and a large black tuft of feathers between its eyes.\nthis particular bird has a belly that is black with white eye rings\nthis bird has wings that are black and has an orange bill\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_69_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_69_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_69_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_69_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a white bird with black tipped wings and a long grey beak.\nthis bird has a white chest with a long pointed white beak.\nthis bird has a curved white bill, a 
white belly, and black primaries.\nthis white bird has black along the ends of its wings and a pale, long beak.\nthis medium sized to large bird has a white belly, breast, head and tail with a long, pointed beak.\nthe bird has a long white bill and long black secondaries.\nthe large bird has a long light colored bill, a white rump, and a white belly.\nthis large bird has a white head and belly, white wings with black on the ends of the feathers, and a white tail.\nthis bird has wings that are black and white and has a long bill\nthis bird is white with black and has a long, pointy beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_70_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_70_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_70_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_70_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a strange flower with purple petals and a white stigma.\nthe petals of this flower are purple with a long stigma\nthis flower is purple and white in color, with petals that are oval shaped.\nthe petals on this flower are purple with an elaborate pistil.\nthis flower has petals that are purple with yellow stamen\nthis flower has flat purple petals, blue stamen, and stigma with yellow and white coloring.\nthis flower has green sepals, purple petals stamen arranged alternate pattern\nthis flower has purple petals with long purple pistils that go 
outwards.\nthis flower has thick purple petals under a round fringe of darker purple.\nthis flower has petals that are purple and has stringy stamen\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_71_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_71_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_71_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_71_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this mostly black bird has a red and white spot on its shoulder\nblack bird shaped like a cylinder with red and orange wing bars\nthe bird has a small black bill, a black eyering and black belly.\nthis small, all black bird's only coloring is on its shoulders. it has a small patch of bright red and white.\nthis bird has a black crown, a black bill, and a red spot on the wing\nblack bird with red and white strip on the shoulder of the wing. he has a black beak. 
he is standing on a blade of beige straw\nthis bird has wings that are black and has a red and yellow patch\nthis bird has a black crown as well as a orange wingbar\nthis bird is black with red and has a long, pointy beak.\nthis bird is black with red and white on it wing, a long tail, long black legs, a small head, and pointy beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_72_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_72_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_72_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_72_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has white petals with a green ovule and stigma.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has petals that are green with string stamen\nthis flower has long white petals and long white stamen in the middle and a green stigma\nthis flower has three green prominent stigma over a base of cream colored narrow petals with purple bases, and fine entangled stamen.\na flower with long and narrow petals that are light purple.\nthis flower is white, purple, and green in color, with oval shaped petals.\nthis flower has white petals that have long ad stringy stamen with a green style\nthe petals of the flower are light white in color with an inner ring that is purple.\nthis unique flower has purple and white petals and a prominent pistil.\n", "context": 
"Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_73_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_73_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_73_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_73_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the color of the flower is purple and has a stamen.\nthis flower has petals that are purple with purple stringy stamen\nthis flower is purple and yellow in color, with petals that are oval shaped.\nthe petals on this flower are purple with an elaborate pistil.\na large purple and white pedaled flower with a large green and purple stigma.\nthis flower has purple petals with purple stamen in the center of it .\nthe pointed leaves of this flower are white graduating to a purple outline and purple rays in the center.\nthis flower is green and purple in color, and has petals that are oval shaped.\nthis flower has petals that are purple and has stringy purple stamen\nthis flower has greenish yellow stamen, purple filaments and magenta petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_74_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_74_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_74_2.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_74_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: flower has larger anthers that are green in color with purple petals\nthis flower has long purple petals and long white stamen in the center\nthe sepal on this flower is purple with white stamen\nthis flower has petals that are purple with white stamen\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has light purple oblong petals with long white filaments that lay flat over the petals.\nthis flower has petals that are purple and has stringy stamen\nthe flower has a purple petals with many white stamens around the light green pollen tube\nthis flower has purple petals with many white stamen in the middle, the stigma seems to be split into three and is also white.\nthis flower has a row pf purple petals under a row of long white needle shaped petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_75_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_75_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_75_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_75_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", 
"question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are yellow with purple stamen\nthe petals of the flower are white in color and have thin white filaments with a yellow center.\nthis flower has petals that are white with purple filaments and green anthers.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis white petal flower has a green stigma and purple and white filament.\nthis flower is white and purple in color, and has petals that are oval shaped.\nthe petals on this flower are white and purple and the pedicel is green\nthis flower has petals that are white and has purple stamen\nthe flower shown has a large green pollen tube with white sepal and white and purple petals.\nthe busy stamens on the white and purple petals around the green pollen tube\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_76_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_76_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_76_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_76_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a medium sized bird that has a white belly and a very short stout bill\nthis is a black bird with a white eyering and a white belly and a orange bill\nthis is a bird with a white belly, black back and an orange beak.\na large goofy looking bird with a blunt beak and 
white stripe behind its eye.\nblack bird with white belly and breast. distinctive orange rounded small bill and white eyebrow.\nthis bird is black with white and has a very short beak.\nthis bird is white and black in color, with a bright colored beak.\nbird with a black back and white belly with a distinctive white superciliary.\nthis bird is white and black in color, and has a orange beak.\nthis is a large bird with a white breast and belly, black, crown, throat, wings, back, an unusual short orange bill, and white eyes.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_77_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_77_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_77_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_77_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a black bird with white feathers like whiskers and a white stripe on the side of its head.\nthis bird has a long beak with a black and white body.\na big rounded yellow bill, long white eyebrow, eyering, long neck.\na black bird with white eyebrows, white whiskers, and a horn-like structure at the base of its big orange beak.\nthis black headed bird has a pale yellow eye and white eyebrow and white malar stripe.\nthis particular bird has a white cheek patch and white eyebrows\nthis bird has a striped head and an orange bill\nthis bird has black plumage, with a large orange beak and white stripes on its head.\nwhite 
eyebrows and a bright orange bill with a black outline are a stark contrast to the black feathers on the birds head.\na bird with a black head with a white stripe on its eyebrow and cheek, it has a medium length broad orange bill.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_78_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_78_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_78_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_78_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is purple and yellow in color, with petals that are oval shaped.\nthis flower has a white pollen tube, white stigma, purple petals, and purple anther and filament.\nthe flower is so big and has petals that are soft, thick, smooth and separately arranged in a disc like manner below the disc of stamen that has curly tip\nthe flower has petals that are lavender with purple filaments and large center with white stigma.\nthe petals of the flower are purple with a yellow center and have thin filaments coming from the petals.\nthis flower has long white petals and a white pistil.\nthe petals of this flower are purple and stringy and the pedicel is green\nthis flower is purple and yellow in color, and has petals that are oval shaped.\nthis flower has white petals laying under lavender hairy like stamen.\nthis flower has petals that are white and has purple stringy stamen\n", "context": "Select from the 
following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_79_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_79_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_79_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_79_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: small bird with jet black body and beak and bright white eyes.\nbird is totally black with small beak and long retrices.\nthis bird is almost entirely black, except for a grey patch on the breast.\nthis bird has all black feathers and yellow eyes, and black feet.\nthis bird has an all black body and black feet.\nthis is a black bird with iridescent feathers on its breast.\nthis bird is shiny black in color, with a black beak.\na large black bodied bird with a small head that is a darker obsidian.\nthis particular bird has a belly that is black and gray\nthis bird has wings that are black and has a thick bill\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_80_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_80_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_80_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_80_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", 
"visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are white with purple filaments and pale green stamen.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has petals that are white with string stamen\na yellow and purple flower that has large rounded petals on the bottom and skinny purple filament.\nthe petals on this flower are white with an elaborate pistil.\nthis flower has outer cream petals and inner light purple straight line petals that become darker purple towards the center that displays enlarged yellow stamens.\nthis flower has purple petals as well as a yellow pistil.\nthis flower has petals that are green and has stringy stamen\nthe magenta filament and anthers fan out from the yellow pistil region to extend over the petals.\nthe petals of the flower are white in color with light purple inner stringy petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_81_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_81_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_81_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_81_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: 
this flower has thick star shaped maroon petals as its main feature.\na red petal flower with white filament and yellow anther.\nthis flower is white and red in color, with petals that are oval shaped.\ninteresting red petals and red filament with white anthers.\nflower has petals that are burgundy with white stamen and burgundy filaments.\nthis red flower has pointed petals, white stamen and a green pedicel.\nthis purple flower has petals as well as a stamen.\nthis flower has petals that are red and has stringy stamen\na flower with long and narrow petals that are purple and long pistils.\nthis flower is red and white in color, and has petals that are oval shaped.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_82_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_82_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_82_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_82_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a small bird has a short neck with a black breast, a black crown, and black secondaries.\nsmall black bird with a short black beak and bright orange wingbars.\nthis is a black bird with an orange wing and a pointy beak.\nthe bird has a black body, wings, head and tail with a orange spot on side.\na black fat looking bird with a black beak and a fully black body and a red patch on its' upper wing.\nthis bird is black with red and has a very short beak.\na small bird black 
breast, crown, wing, orange patch near covert.\nthis small bird is primarily dark gray, with a short black beak, a and a bright orange spot on it's breast.\nthis bird is black and orange in color, with a black beak.\na black bird with an orange spot on its wing.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_83_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_83_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_83_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_83_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a strange flower with multicolored petals and green stamen.\nthis flower has petals that are purple with purple stringy stamen\nthis flower has different shades of purple and a large green pistil.\nthe petals on this flower are purple with an elaborate pistil.\nthis flower is white and pink in color, with petals that are oval shaped.\nthe big flower has petals that are so soft, smooth and arranged separately forming disc like shape below the disc of purple stamens\nthis flower has large pink and white petals with a prominent green pistil.\nthis flower has tapered lavender petals surrounding a later of dark purple petals which surround the green stamen, pistil, and ovary.\nthis flower has petals that are pink and white and has stringy stamen\nthis flower has light purple and white petals with blue filaments.\n", "context": "Select from the following choices.\nA: The 
first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_84_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_84_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_84_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_84_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has prominent green and purple stamen and pollen tube, surrounded by two layers of thin purple and wide white petals.\nthe flower has green sepal with green anther and purple filament\nthis flower is white and purple in color, with petals that are oval shaped.\nthe flower shown has light petals and purple, white and burgundy as its main feature.\nthis flower has large yellow petals and long and blue stamen in the middle\nthis flower is blue and yellow in color, and has petals that are oval shaped.\nthis flower has petals that are green and has purple stamen\nthis flower has beautiful purple and black petals and the pistil is green\nthis unique flower has a lower row of white petals and an upper row of long, thin purple petals.\nthis flower has a open row of green petals under a row of lavender white and maroon needle shaped petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_85_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_85_1.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_85_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_85_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this colorful bird has a black head and black coverts , an orange flat beak and white body.\nthis bird has a long black neck, black crown and back and a wide, orange bill.\na small bird with a white belly and orange beak.\nthis bird has a white breast, black crown, and short bright orange bill.\na bird with a black head, white eyerings and white cheek patches. the bill is short, round and orange. the neck is black and grey and the belly is light grey. the coverts are brown.\nthis bird is black with white and has a long, pointy beak.\nthis bird is white and black in color, with a bright orange beak.\nthe bird has a black head with an orange beak.\nthis bird has a white belly with white eyes and an orange beak.\npretty bird with a round orange bill, a white eye and white superciliary, a white throat, chest and belly, and a black head.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_86_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_86_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_86_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_86_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": 
"Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals of the flowers are white and the leaves are green in color.\nthis flower is purple and white in color, with petals that are oval shaped.\na purple and white flower with white and green filaments and anther.\nthis flower has stringy purple petals and green pistil as its main features\nthis pale purple flower has very prominent stamen and stigma with long, skinny petals.\nthis flower has white petals that have stringy purple stamen\na flower with long and curly pistils that are pale purple.\nthe flower on this particular picture has petals as well as a stamen.\nthis unique flower has purple petals and a very prominent pistol and stamen.\nthis flower has blue petals as well as a green pedicel.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_87_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_87_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_87_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_87_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this birds color is blue on his body and a gray long tail.\nsmall black bird with long tail feathers and a short beak\na black bird with relatively long tail feathers and a short but pronounced beak.\nthis 
small bird has a black & blue colored body, and a black bill.\nthe small black bird has a short, stout beak and beady black eyes.\nthis bird has wings that are black and has a thick bill\nthe bird has a black eyering, long black outer rectrices, and black back.\nthe long tail on the black body bird with a grey bill\nthis bird has a short, downward-curved grey bill, a long tail, and black plumage covering its body.\nthis bird has wings that are black and has a white belly\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_88_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_88_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_88_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_88_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a medium sized bird with black feathers and an orange beak.\nthis bird has a brown crown, a pointed bill, and a brown back.\nthis is a brown and black bird with a yellow eye and an orange beak.\na medium sized bird that has tones of dark brown with a large sized bill\nthis bird is black in color with a orange beak, and black eye rings.\nthis bird is black with white and has a very short beak.\nthis bird has a yellow bill, dark grey crown with white superciliary and cheek patch, dark primaries and secondaries\nthis bird has wings that are brown and has a yellow belly\nthis bird has an orange beak with a mostly black body.\na bird with an orange colored downward-curved 
beak, also has a black colored crown with a stripe of white.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_89_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_89_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_89_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_89_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower is so big and large and has petals that appears like sepals and also it has stamen and petals arranged in disc like manner one above the other\nthis flower has long white petals, long white stamen and a tall green stigma on it\nthis flower has petals that are white with purple and stringy stamen\nthis flower has a bottom layer of white petals with an upper layer of very thin and long petals that are dark purple at the base, white in the center and light purple at the ends.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has petals that are white and has stringy stamen\nthis flower has rounded pale green lower petals surrounding a layer of thin hair-like purple upper petals.\nthis flower is white and purple in color, with oval shaped petals.\nthis white flower has one layer of flat white petals layered under a broader row of small needle like white and purple tipped petals.\nthis flower has white petals arranged in a disk formation with purple stigma and green stamen.\n", "context": "Select from the following choices.\nA: 
The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_90_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_90_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_90_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_90_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has webbed feet that are pale pink as well as skinny tarsus.\na large sea bird with blue feat and a large orange bill. it's body and head are white and wings are dark grey to black.\nthis white bird has a long, curved-at-the-end beak and webbed feet.\nthis large sea bird is white with black wings, has a long orange bill that curves downward at the end with a black tip, and its feet are pink.\nthis bird has a white crown, a long neck, and an orange bill\nthis bird has wings that are black and has a yellow bill\nthis bird has wings that are black and has a white belly\nthis large bird is mostly white with dark grey wings, light purplish feet, and a long orange beak.\nthis large bird has white feathers and webbed feet.\nthis white and brown bird has a long bill and webbed feet.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_91_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_91_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_91_2.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_91_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the stamen are towering over the stigma which cannot be seen.\nthe flower is so and has disc of petals below the disc of stamens that are blue, white and violet\nthe petals of this flower are blue and white with a long stigma\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has petals that are white with purple stamen\nthere is a single layer of brilliant white long oval petals in a star configuration below a single layer of bright blue and white bristle like petals surrounding bright green stamen and a deep purple pistil.\nthis flower has white petals with a second row of purple and white striped needle-like petals on top.\nthis flower has blue petals as well as a green and purple pistil.\nthis flower has petals that are whgite and has purple stamen\nthis blue white and purple flower has pointed petals and lime green and purple stamen.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_92_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_92_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_92_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_92_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first 
image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has a bright white head, throat, breast and belly, dark grey wings and tail, and a long beak with a hooked upper beak, the area in front of the eye is black.\na large bird with a white head, neck, nape, throat, and breast, with black feathers covering the rest of its body except for white tips on some feathers.\nthis bird is black with white on its chest and head and has a long, pointy beak.\nthis bird has a white head and breast, with black covering the rest of its body.\nthis bird has a white head and chest and a beak that curves down.\nthis bird has wings that are black with a white belly\nthis bird is white and black in color, with a large white beak.\na white and black bird with black eyes sitting on the ground.\nbird with white beak and curved at the end, crown, nape, throat, breast, belly and abdomen are white, primaries and secondaries are black.\nthis bird has a white breast with a long bill.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_93_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_93_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_93_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_93_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a white chest and belly and a 
black body with a red bill.\nthis is a black and white bird with webbed feet and a thick beak.\nthis bird has a white belly, with webbed feet, beady white eyes and a small bill.\na medium sized bird with distinctive webbed feet and small tarsi with a bright orange, short bill and white belly with black head and wings.\na black bird with webbed feet, a white belly, and a short orange bill.\nthis black bird has a white breast and belly, gray webbed feet and a short, thick pink beak.\na white bellied bird with a black body and red beak.\nthis bird has a short bright orange bill, white eyes, and is black across the top and white across the bottom.\nthis bird has a black crown and wings with a white breast and webbed feet.\nthis bird has wings that are black and has an orange bill\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_94_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_94_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_94_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_94_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals of the flower are purple in color with thin purple filaments.\nthis flower is purple and green in color, with petals that are oval shaped.\nthe petals on this flower are purple with an elaborate pistil.\nthis flower has purple medium size petals topped by tooth-picked shape petals and green stamens\nthe flower is so big and has a disc of separate 
petals below a disc of separate stamen\nthis flower is pink, purple, and green in color, and has petals that are oval shaped.\nthis flower has petals that are pink and has purple stamen\nthis flower's ovule has a unique and characteristic design. the pedals are purple and splay out in a non conformed pattern.\nthis flower has light purple and grey petals around with dark purple stamen spread in front of the petals in a sun ray shape and a green huge pistol and stigma that have a unique shape.\nthis flower has white and pink petals with bright blue filaments and yellow stamen.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_95_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_95_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_95_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_95_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals of this flower are green with a long stigma\nthis flower has petals that are green with purple filaments and green anthers.\nthe flower has petals that are green, thick and separately arranged in disc like formation and also has a disc of stamens on top of it\nthis flower is blue and green in color, with petals that are oval shaped.\nthis flower has petals that are green with purple and stringy stamen\nthis flower has petals that are green and has stringy purple stamen\nthis large flower has long, thin purple petals surrounded by wider green petals 
with rounded edges.\nthis flower has light green petals with light blue filaments along with green stamens.\nthick green pollen tubes sit on top of the purple, white and blue stick like petals.\nthis flower has petals as well as a pistil. it is green and purple\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_96_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_96_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_96_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_96_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has white eyes as well as a white breast and sides with black tarsus.\na small sized bird that has a grey belly and a very short pointed bill\na small bird with white throat, breast and belly, black small head and short bill, black feet and tarsus.\na small white and black duck with white eye and short black beak.\nwhite bellied bird with black crown and bill with webbed feet.\na bird with a white breast and a black crown and black webbed feet.\nthe bird has a white eyering, large white belly and grey back.\nthe bird has a head that is grey in color and a chest that is white.\nthis bird has wings that are black and has a white belly\nthis bird has wings that are black and has a white belly\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": 
["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_97_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_97_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_97_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_97_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this large bird has a bright orange bill, a white colored belly, and white eyebrows and cheek patches.\nthe bird has a white side and belly, with a yellow bill and black coverts.\nthis bird has a curved orange bill, a white cheek patch, and a white breast.\nthis bird has an orange bill and a white chest.\nthis bird has a large orange bill,a white belly, a brown & black side, and a white supercilliary.\nthis bird has an orange bill, a white belly and white eyebrows.\nthis bird has wings that are brown and has a white belly\nthis bird is white with black and has a very short beak.\nthis bird has an orange beak and a white belly.\nthis bird is brown with white and has a very short beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_98_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_98_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_98_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_98_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": 
"CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this small bird has mostly black featrues with white speckles through out and a dab of red at the stem of his wings.\nthis is a black bird with white spots and a small beak.\na small sized black bird that has white spots and a short pointed bill\nthe bird has a black crown and a small black eyering.\nthis is a small, black bird with white spots on the nape and wingbars.\nthis bird has wings that are black and has a rotund body\nthis bird has wings that are black and has a small bill\nthis particular bird has a belly that is black with white patches\nthis bird has a black belly with black feathers and a yellowish black beak.\nthe bird has a black bill and a black back and crown.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_99_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_99_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_99_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_99_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: large flower with fading purple-white petals, stamens yellow-green, at the receptacle are sprouting numerous white ,long filaments\nthis pale purple flower with pointed medium sized petals and arranged in leaves.\nthe flower is 
large with big petals that are mostly pink and has stamen forming a disc like layer on top of the petals around the pistil\nthis flower has purple petals that are long and pointed with long white petals coming from the receptacle and yellow stamens.\nthis flower is purple and white in color, and has petals that are long and multi colored.\nthe petals of this flower are purple and white and the stigma is light green.\nthis flower is pink and white in color, and has petals that are oval shaped.\na flower that has purple and white petals with green stamen.\nthis flower has pink and white petals that have stringy stigma\nthis flower has white and lavender petals with white stamen in the center and a green stigma.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_100_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_100_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_100_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_100_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the large brown bird has a big bill and white throat\nthis medium sized bird is primarily black and has a large wingspan and a long black bill with a strip of white at the beginning of it.\nthis bird has crown, a black bill, and a large wingspan\nthis bird features a broad wingspan and a slightly curved, dark bill.\nthis larger bird is black and has a large black beak\nthis bird is mostly black with white around the base of the 
large curved bill.\nthis is a mostly black and grey bird with a spectrum of white and grey secondaries and wingbars.\nthis bird is all black and has a long, pointy beak.\na medium sized bird with a long bill and brown wings\nthis bird is black with white and has a long, pointy beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_101_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_101_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_101_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_101_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a white and tan belly with a brown head.\nthis small, white-bellied bird has a brown head and a red-tipped beak.\nthis bird is black with a white chest and belly and has a long neck\na medium sized bird that has a white belly that has black spots on it\na medium bird with a white chest and light eyes.\nthis particular bird has a white belly and breasts and black head and back\nthis bird has wings that are black and has a white belly\nthis bird is white and brown in color, with a stubby beak.\nthis bird has wings that are brown and has a white belly\nthis bird has a white breast and belly with grey wings and a grey crown.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": 
["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_102_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_102_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_102_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_102_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the outer petals of the flower are white while the spiky petals are blue, white, and maroon.\nthis distinctive flower features an inner layer of purple and white petals framed by a layer of white elongated petals.\nthis flower has a very unuqie look and color\nthis plant has multiple blue and white stamens that hang just above the white petals.\nthis flower has white petals and many filaments that are colored purple, white, and blue.\nthis is a light blue flower, with thick and long petals on the outside and thin, and short ombre colored petals on the inside.\nthis flower has rounded green outer petals and pale, thin, pointed purple and white inner petals.\nthis flower has two large yellow pistils, several bright pink and green stamen, a layer of bright purple, blue, and white petals, and an outer layer of light green petals.\nthis flower has long white green petals and blue tip anthers.\nthis flower has petals that are white and has string stamen\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_103_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_103_1.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_103_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_103_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is white and purple in color, with petals that are oval shaped.\nthis flower has long white petals and long purple stamen in the center of it\nthis flower has petals that are white with long purple steman\nthe flower shown has white petals as its main feature with green and purple stamen\nthis strange flower has pointed petals and is white ,purple and green.\nthe flower has long oval white petals and purple and green stamen and pistil.\nthis flower has white petals with purple stamen and a big style\nthis flower has white petals as well as a green and purple pistil.\na flower with white petals and white stamen.\nthis flower has white petals with fuzzy white and yellow stamen in the center of it.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_104_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_104_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_104_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_104_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": 
"Please find the most relevant picture among the candidate images for this description.\nDescription: the bird is black with a yellow eye and long black tarsals.\nthis is an all black bird with a pointy beak and a white eye.\nthis is a black and gray bird with a yellow bill and a black crown\nthis bird is all black with a short beak and buggy yellow and black eyes.\nthe small bird is entirely colored black. it's black bill is short and pointed. it's tarsus and foot are also black.\nthis bird has a yellow eye ring, and black feathers covering the rest of its body.\nthis bird has wings that are black and has a short bill\nthis bird has dark black, slightly iridescent feathers all over its body, with bright yellow eyes and a thin, pointed beak.\nthis bird is black in color, and has a black beak.\nthis bird has wings that are black and has white eyes\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_105_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_105_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_105_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_105_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with a slightly downward curved bill, which also has a white point coming off of it, stark orange eyes, and gray covering its body.\nthe bird is grey and black with an orange bill and white eyes.\na black water-bird with stout orange beak and eyes, and has a white tooth-like structure 
at the origin of the beak.\nthe bird has an orange bill that is outlined in black.\nthis is a grey bird with an orange bill and black on the wingbars.\nthis bird has feathers that are black and has a long yellow bill\nthis is a black bird with a thick orange beak that has a white piece at the base of it.\na large black bird with a dull range curved beak.\na bird with a black wings, throat and crown and the bill is short and curved\nan all black bird with a orange short slightly round bill\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_106_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_106_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_106_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_106_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this dark bird has an orange beak, light orange tarsus and feet, dark gray wings, whitish breast and belly, and grey eyes.\nthis is a black bird with a white belly and a large orange beak.\nthis is a black and white bird with a orange bill and long wingbars\na bird with an orange thick bill and a black coat with white at its throat.\nthis bird has a orange beak, black throat, and a black and white belly.\nthis bird is black with grey and has a long, pointy beak.\nthe color of the bird is black with an orange beak and a grey belly.\nthe bird has a black belly that is long and two large wingbars.\nthis is a large all dark colored bird with a yellow 
beak\na medium-sized black bird with a white superciliary, white malar stripe, and small orange beak\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_107_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_107_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_107_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_107_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has black crown and nape, gray throat, breast, belly and abdomen, the tarsus and feet are light brown.\nthis bird has a black face with white long feathers sticking out of its head at random points of location, a grey throat, belly and tarsus, and black feathers covering the rest of its body.\na larger bird with a black and grey body, a curved orange beak, and yellow claws.\nthis is an all grey bird with a light grey breast and a bright orange bill.\na grey bird with an orange beak and distinctive feathering on the upper and lower face.\nthis bird has wings that are black and has a white belly\nthis bird has wings that are black and has a white bill\na large grey orange billed bird, with long beige cheek patch feathers.\nthis particular bird has a belly that is white with gray patches\na small bird with black back feathers, grey belly and throat, white malar stripe, and a small orange beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", 
"input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_108_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_108_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_108_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_108_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has star shaped purple petals as its main feature.\nthis flower has petals that are white with purple stamen\nthis flower is purple and whte in color, with petals that are oval shaped.\nthe flower has several boat shaped purple petals, and pale yellow stamen\nthe flower is so big and has petals that are soft, smooth, thick and are separately arranged separately around the sepal below the disc of separate of stamen with curly tip\nthis flower is purple, and green in color, and has petals that are oval shaped.\nthis flower has white petals, purple stamen and yellow pistils\nthis purple flower has pointed petals, yellow stamen, and green sepals.\nthis flower has petals that are white and has purple stringy stamen\nthis is a purple flower with a green pistil and long curvy petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_109_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_109_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_109_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_109_3.jpg"], 
"output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: black feathers on the top of the bird with gray feathers on the breast and underside of bird orange color on the face of bird and long gray claws\na large and tall bird that is gray and black in color, and an orange mouth.\nthis goofy looking, large bird has a bright orange break with a musky gray body and charcoal wing feathers.\nthis bird is grey in color with a vibrant orange beak, and white eye rings.\nthis bird has a hairy crown, a gray belly, breast, throat, tarsus & feet, and a bright orange area surrounding its bill.\na very distinctive gray bird with a black fringe on its crown, white eyes that contrast with its darker gray head coloring, and vibrant orange coloring surrounding its beak.\na gray and white body bird with a small head in comparison to the body.\nthis bird has lovely orange colored throat with white colored beak\nthis bird is great it has a feather mohock on it had it has a very bright orange face and a white colored beak\nthis bird has a black plum raising from its head, with grey crown and bright orange moon shape markings from eye to eye, with dirty grey white belly colors.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_110_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_110_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_110_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_110_3.jpg"], "output": "D", "qwen3-vl": "image 
none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is black all over its body with orange in its beak.\nthis is a grey bird with a white eye and a large orange beak.\nthe crown of this exotic bird is made to attract eligible females.\nthis black bird has a orange bill with hair coming out of it, small pupils, and a white line across its face.\nthis bird resembles one in cartoons, sleek and shiny black with a white check patch to match it's large round white eyes and a very colorful beak that has a long feather patch resting upon it.\nthis bird has wings that are black and has an orange bill\nthis particular bird has a belly that is gray and black\nthis bird is distinct due to it's single hair part coming off the head then over the beak.\nthis bird has a long black crest and an orange beak.\na small black bird with white eyes, a white malar stripe, and a small orange beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_111_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_111_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_111_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_111_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for 
this description.\nDescription: this flower has protuberant green and purple stamen and pollen tube surrounded by fringed thin purple petals, which are in turn surrounded by slightly pointed wide white petals.\nthe flower shown has white petals and purple and red anthers as its main feature.\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has a feathered purple receptacle and white leaves.\nthis flower has a circular shaped sepal as its main feature.\nthis flower has white petals with a layer of thin purple petals.\nthis flower has light green petals surrounding a hair-like purple fringe and prominent green stamen.\nthis flower has petals that are white and has purple stamen\nthe stamens of the flower have a distinctive color pattern, and in a circle formation.\nthe petals of this flower are white and purple and the pedicel is green\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_112_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_112_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_112_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_112_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the crown and most of the body is black with patches of white on the wings and under tail.\nthe bird has a black overall body color aside from several white patches all over it.\nthe bird has a white eyering and a black small bill.\nthis bird has beautiful black 
feathers with some white spots and a white undertail covets\nthis bird has a black overall body color except from several white patches.\nthis bird is black with white and has a very short beak.\nthis bird is black with white and has a very short beak.\nthis bird is a bigger bird, it has a yellow wand brown belly, the top is mainly black and brown.\nthis bird has a black crown and bill with black wings flecked in grey spots.\nthe bird has a full belly and a black and white back.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_113_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_113_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_113_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_113_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with stark white eyes, a rounded, snubbed orange bill, and very large feet.\nthis small bird has a curved, orange bill, a black head and back and wings and a white breast and belly.\nthe bird has white breast and abdomen with black crown and coverts as well as an orange colored bill.\nthis bird has a speckled belly and breast with a short orange bill.\nthis bird is black and white in color with a red beak, and white eye rings.\nthis strange bird has a while belly and black back with a white cheek patch and an orange bill.\na medium-sized bird with a yellow belly, blue-grey feet, a speckled grey and white breast, and black back and head.\nthis bird is 
white with black and has a very short beak.\na large and plump bird with an unique orange beak.\na medium sized black bird, with a white belly, and webbed feet.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_114_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_114_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_114_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_114_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has prominent green and purple stamen and pollen tube surrounded by two layers of thin purple and wide white petals.\nthe petals of this flower are purple with a long stigma\nthis flower is white and blue in color, with petals that are oval shaped.\nthe flower has tapered white petals and tiny thin blue petals and a large three part stigma.\nthis complex flower has white petals, a purple pollen tube and purple and white filaments.\nthis flower is white, blue, purple, and yellow in color, and has petals that are oval shaped.\nthis flower has a wheel of white oblong petals underneath a layer of flat blue filaments with a purple pistil and green stamen.\nthis flower has petals that are white and has purple stamen\nthis flower is made up of long white petals under a row of lavender white and purple needle shaped petals.\nthis blue and white flower has pointed petals and green sepals.\n", "context": "Select from the following choices.\nA: The first image\nB: The 
second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_115_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_115_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_115_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_115_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has purple sepal and white pistil as its main features\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower has petals that are purple with stringy stamen\nthis flower has the row of blue petals and a row of string like structures with the prominent white stamens at the middle\nthis flower has large purple petals under a fringe of long thin purple hairs.\npurple sepal surround the large yellow pollen tubes on this flower.\na flower with long and narrow petals that round at the top.\nthis flower has petals that are purple and has stringy stamen\nthis flower has light blue petals with long and flat purple filaments with curvy ends.\nthis flower has light blue petals with long and flat purple filaments with curvy ends.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_116_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_116_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_116_2.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_116_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are green with purple and white stamen\nthis purple blue and white flower had larger petals underneath thin wispy petals.\nlong white petals with multicolored blue, white, and purple stamen\nthis flower has a lower layer of white petals with an upper layer of very long and thin purple petals that are white at the base.\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has petals that are green and has purple stamen\nthis flower has long rounded white petals with skinny white and blue petals on top of those.\nthis flower has a green ovule, green stamen , a purple pollen tube and pure white leaves.\nthis flower has white petals in a ring followed by a ring of blue filaments.\nthere are needle like blue and white petals, a purple stamen, oily leaves, and wide sepals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_117_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_117_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_117_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_117_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The 
third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower shown has stringy purple petals which are its dominant feature\nthe inner petals are light purple in color and are needle shaped\nthe petals of this flower are purple with a large stigma\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has petals that are purple with purple stamen\nthis purple flower has a base of string-like leaves with a white stamen in a criss-cross pattern.\nthis flower has purple petals and a wide fringe of hair-like purple.\nthis flower has petals that are purple and has stringy stamen\nthis flower has purple petals as well as a yellow pistil.\nthis flower has very fine, curl ended vivid purple petals and yellow stamen.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_118_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_118_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_118_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_118_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has thick green stamen and purple fringe surrounded by wide white petals with pointed tips.\nthis flower has thin wiry petals that are dark purple from the center and white on the outer edges.\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower 
has petals that are green with white and purple stamen\nthis is a strange flower with white petals and purple near the ovary.\nthe flower is so big with petals that are soft, smooth and arranged in disc like manner below the disc of curly white disc layer of stamens\na flower with long and narrow pistils that are curly.\nthis flower has petals that are white and has a big green style\nthis flower has a large upright green pistils and yellow stamen with a layer of wavy white filaments and oblong petals.\na wavy flower with a green long stigma in the center.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_119_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_119_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_119_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_119_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird is black with a white detail on the wing and speck of orange on the wingbar.\nthis is a black bird with orange in the wingbar with black feet\na completely black bird except a white and orange wingbar.\nthis is a small bird with black fur and feathers.\na black bird with long legs and a white and orange stripe on its wing.\nthe body of the bird is black while the wingbars are white and orange.\nthis striking bird is entirely black with the exception of its orange and white wingbars.\nthis bird is black with red and has a long, pointy beak.\nblack bird with a thin, pointy 
bill and a distinct red and white stripe on the wing,\nthis bird has a pointed black bill with a black back.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_120_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_120_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_120_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_120_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: there are several shapes, sizes, and colors of petals on this complex flower.\nthe stamen are seen and the filaments pointing out.\nthis flower is purple and white in color, with petals that are oval shaped.\nthe petals are very slim and wavy in shape and are white and purple-striped in color.\nthis flower has petals that are green with stringy purple stamen\nthe flower has long, stringy purple petals with long green stamens and green sepals.\nthis flower has petals that are white and has a stringy stamen\nthis flower has purple petals as well as a green pistil.\nthis purple flower's petals are like thick purple threads surrounding a pale yellow center with pale green stamen.\nthis flower is white, purple and yellow in color, and has petals that are multi shaped.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_121_0.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_121_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_121_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_121_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a large bird is black and has a bright patch on it's coverts, a sharp black bill, and a black crown.\na black bird that has red and yellow spots on its wingbars.\na large contrast between the black body and orange/red spots on the wings with long legs and average sized bill and eyes.\nthis black bird has beautiful orange-yellow wingbars, with hints of orange peaking through the black primary feathers on its back.\nthis is a black bird and gray feet with a orange and yellow on the coverts\nthis bird is mostly black with a red covert and yellow wingbar.\nthis bird has wings that are black and has a orange and yellow patch\nthis bird has wings that are black and has a red and yellow patch\nthe bird is small with a pointed bill, black except for a bit of red and yellow on the covert area.\nthis bird has wings that are black and has a red and yellow patchj\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_122_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_122_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_122_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_122_3.jpg"], "output": "A", "qwen3-vl": "image none"}, 
{"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the blossom has a layer of rounded purple and white petals topped by a layer of fringed purple petals.\nthis flower has a long pink petal and a lot of blot stamen in the center\nthe flower has purple petals as well as a green stigma surrounded by purple.\nthis flower is pink, white, and purple in color, and has petals that are very skinny.\nthe flower has purple and white petals and green stigma.\nthis flower has light purple and white petals with blue filaments in a disk formation.\nthis flower has petals that are pink and white and has purple stamen\nthis flower has a purple fringe surrounded by white and green petals.\nthe anthers are very large, and the stigmas are very large.\nthis flower is white, pink, and purple in color, and has petals that are oval shaped.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_123_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_123_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_123_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_123_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is mostly black, shows significant 
head, a short, pointy black bill, and red coverts with white tipping.\nthis bird is black with red on its wing and has a very short beak.\na bird with an all black body with bright red coverts, and black tarsus.\nthis large bird is black with small accents of red and yellow on its wings.\nthis bird is mostly black and has a red and white covert.\nthis particular bird has a black body with a red patch on its coverts\na black bird with bright red on its wing, its head is small as well as its beak, the beak is pointy.\nthis is a black bird that has red and yellow coverts.\nthis bird is predominantly black but either the covert or the secondary colors on the wings has red and the tip is white or orange/yellow.\nthis bird's body, bill, and feet are entirely black, except for a splash of bright red on the wings and a yellow wingbar.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_124_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_124_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_124_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_124_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this black bird has red and yellow on its wings and a long black beak.\nthe bird has a small black bill and black wingbar.\nthis bird is nearly all black with orange coverts, and yellow wingbars.\nblack bird with red and yellow on wing coverts\nthis bird is black with red on its wing and has a long, pointy 
beak.\nthis is a large black bird with a red-orange and yellow coverts.\nthis is a large mostly black bird with a red and yellow marking on its wing.\na medium size bird with a black covering and orange coverts.\nthe bird is jet black in color and has a side feather that is orange in color.\na black bird with red coverts on it's wings.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_125_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_125_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_125_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_125_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this small bird has a white and brown speckled breast and belly with a bright orange beak.\na brown and white auklet with orange is has white colored spotted with brown on its ventral side and has brown wings.\nthis bird is brown and white with speckling on the lighter-colored belly, white eyes and a short beak.\nthis bird has a speckled belly and breast with a short pointy bill.\nthis bird has a white eye, an orange bill, and a breast that is spotted\nthis bird is brown with white and has a very short beak.\nthis particular bird has a belly that is gray and white\nthis brown bird has a white speckled belly and breast and a short orange and brown bill\na medium sized bird with a short bill, and white eyes\nthis bird has a grey crown and small orange beak with grey back.\n", "context": "Select from the 
following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_126_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_126_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_126_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_126_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a bird with a curved and hooked, large bill, black eyes, white breast and brown primaries and secondaries, and a large head.\nthis large bird is white with black wings that have orange accents, along with an orange beak and touch of black on the tips of the tail feathers.\nthe bird is white, though the wings and tip of the tail are brown and grey.\nthis white bird features brown wings and a long, orange beak with black shading around its black eyes.\nthis bird is large with a white head and chest and brown wings and an orange beak.\nthis bird has wings that are grey and has a long orange bill\nthis bird is white with grey and has a long, pointy beak.\nthis bird has wings that are brown and has a white belly\nthis bird has a white crown as well as a orange bill\nthis bird has a white body and head, brown wings, and a long slightly curved orange beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_127_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_127_1.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_127_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_127_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has noodle like petals that are dull pink.\na flower with white petals and squiggly pistils accompanied by large anther filaments.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis is a strange flower with purple petals and yellow stigma.\nthis flower has petals that are white with purple stamen\nthis flower has long white petals with a wavy layer of filaments.\nthis flower has blue petals as well as a green pistil.\nthe petals are thin and string with purple and white stripes and the filament are green with yellow anther.\nthis flower has petals that are white and has purple string stamen\nthis flower has long white petals and a large white pistil.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_128_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_128_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_128_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_128_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please 
find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are purple and very stringy\nthe petals on this flower are purple with an elaborate pistil.\nthis flower is purple and white in color, with petals that are oval shaped.\nthis is strange flower with purple petals and yellow stigma.\ninner petal;s are needle shped and are purple in color\nthe flower on this particular picture has petals as well as a sepal.\nthis flower has purple petals as well as a white stamen.\nthis flower has pink petals that have long, stringy and purple stamen\na large plant with purple pedals and a white tip and a white stigma.\na flower with long and narrow petals that are purple.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_129_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_129_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_129_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_129_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower is so big and has disc of petals below the disc of blue, white and violet stamens\nthis flower has petals that are white with purple filaments and green anthers.\npetals are white in color,inner petals are needle shpaed\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has petals that are green with purple and stringy stamen\nthe beautiful big flower has petals 
that are soft, smooth and separately arranged in single layer forming disc like shape below the layer of disc like arranged stamens\nthis flower is white and blue in color, and has petals that are oval shaped.\na multi petaled white flower with visible pistons, stamens, and numerous blue filaments in the center.\nthis flower has white oblong petals underneath a layer of bright blue filaments with upright stamen and pistils.\nthis flower has petals that are pink and has purple stringy stamen\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_130_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_130_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_130_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_130_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is blue and white in color, with petals that are oval shaped.\nthis is a flower with green sepal, purple and white spiky petals, purple style, and green stamen.\nthe flower has petals that are white with purple filaments.\nthis flower has an outer layer of white petals with an inner layer of blue and white petals surrounding stamen that are in alternating colors of purple and green.\nthis dramatic and complex flower displays a geometrical arrangement of purple and green stamens and pistils at the center, surrounded by many petals like filaments in stripes of purple, white and periwinkle, complemented by ten sepals whose inner 
surface is pale green.\nthis flower has blue petals as well as a green and purple pistil.\nthis flower has oval creamy whitish green petals with an inner layer of fringed petals of white and blue and very large stamen.\nthis flower has large white petals with a dark purple stigma.\nthis flower is white, and blue in color, and has petals that are oval shaped.\nthis flower has petals that are green and has purple stamen\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_131_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_131_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_131_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_131_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: here we have a bird soaring above the water and it's color is dark brown, he seems to have a long dark colored bill with a white ring around it on the upper part near the bill.\nthis large bird is black all over, with a large flattened bill.\na large bird covered n green feathers except for the bits of white on the edges of its wings, and the circle around its bill.\nthis is a large grey bird with a large grey beak.\nthis bird is brown and has a black bill with a white bit surrounding it.\nthis bird has uniformly brown plumage, with a white ring around the base of its long, brown beak.\nthis bird has wings that are brown and has a thick bill\nthis dark brown bird has long, angular wings, short rectrices, and a blunt, 
medium length black bill.\nthis bird has a large beak and is brown with a white ring on its face.\na large brown and black bird with large thick bill has white stripe along bill.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_132_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_132_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_132_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_132_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a black bird with long tail feathers and a large curved beak.\nthis bird has a black crown, a short and thick bill and a black belly\nthe bird is black with a thick black hooked beak and a long black tail.\na long bodied bird that is entirely black with a large beak\nthis bird is black in color with a black beak, and black eye rings.\nthis bird is all black and has a very short beak.\nthis bird is black in color, with a black beak.\nthis bird has wings that are black and has a thick bill\nthis is a black bird with a long tail and a thick beak that curves downward.\nthis large black bird has a large, thick and blunted black beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_133_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_133_1.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_133_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_133_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower is so big and has stamens that are arranged in a disc like manner with curly tips above the petals that are soft, smooth and arranged separately forming a disc\nthe flower shown has yellow and green pistil with purple petals\nthe flower has petals that are purple with green stigma and wavy filaments.\nthis flower is purple in color, with petals that are oval shaped.\nthe flower has oval purple petals and skinny purple petals on top.\nthis flower has purple petals, with lavender stamen in the center.\nthe flower has elongated purple petals with purple squiggly needle shaped on top.\nthis flower has large purple petals under a thick purple fringe and green stamen.\nthis flower has purple petals as well as a purple sepal.\nthis flower has petals that are purple and a big green style\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_134_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_134_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_134_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_134_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The 
first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower shown has green pollen tubes with stringy white petals\nwhite petals green white yellow and purple middle green and light green leaves\nthis flower has petals that are white and very stringy\nthe petals are tendril like and purple in color and stamen are visible.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has long white petals and a light green pistil.\nthis flower has a ring of oblong white petals topped with a layer of wavy filaments with tall pistils and stamen at the center.\nthis flower has a row of green petals under a row of very long curvy needle petals on top.\nthis flower has many stringy fibers sticking out from the pistil in the center.\nthis flower has petals that are white and has stringy stamen\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_135_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_135_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_135_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_135_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has long thin petals that are black at the base, white at the center, and blue at the tips.\nthis flower is white and blue in color, with petals that are oval 
shaped.\nthis blue and green flower has a distinctive stigma and what looks like fringe along the inner style.\nthis pretty flower has long thin petal that are white and blue.\nthis flower has one row of white petals and an inner row of tri-colored petals, the inner row is medium blue at the tips, white in the middle, and purple toward the center of the flower.\nthis flower is white, blue, and green in color, and has petals that are oval shaped.\nthis flower has petals that are green and has purple stringy stamen\nthis is a beautiful unique flowers with multiple colors of petals and stems that is easy on the eyes.\nthis flower is characterized by its light green petals, vibrant blue and white stamen, and ornate stigma.\nthis flower has large upright green stamen and purple pistils along with white petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_136_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_136_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_136_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_136_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has white petals with white anthers and a yellow and green stigma.\nthis flower has petals that are green with long stigma\nthis flower is white, purple, and yellow in color, with petals that are oval shaped.\nthis flower has the simple row of white petals at the bottom with the double colored flattened stamens in 
the circle order\nthis strange looking flower is green and has pedal with a point\na flower with a pinwheel shape base and a large wavy and multicolored stigma.\nthis flower has rounded pale green petals underneath a fringe of white and purple.\nthis flower has petals that are white and has green stamen\nthis flower has has a lower layer of white petals, a second layer of thin strands of wavy, white petals and thick yellow stigma.\nthis flower has white oblong petals and long white filaments with curly ends.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_137_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_137_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_137_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_137_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a dark grey and white bird with a wide speckled breast and webbed feet.\nthis bird has a gray speckled appearance with large gray webbed feet.\na bird with stark white eyes, webbed feet, and small orange tipped bill.\nblack back, wings, neck, nape, crown, and feet. with white spotted breast. 
white streaks on eyebrows and superciliary\nthis black and white bird is mostly black with flecks of white on the head, chest, throat and feet.\nmedium sized dark grey bird with white spots, webbed feet and white eyes.\nthis unique bird is mottled black and white, with striking white eyering, a large breast, and a tiny bill.\nthis web-footed bird has a short beak, white eyes, and grey and white mottled feathers.\nthe wide the feet are gray, the eye is white and black, the corvets are black and gray\na medium size bird with a black and white mixture color.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_138_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_138_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_138_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_138_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the wings are brown, long and narrow, and have white markings on the secondaries, the bill is blunt tipped and black, the head is light brown with a white thin marking between the eye and the beak.\ngrey bird with black flat beak with grey and white big wings\nthe dark brown bird has black eye ring and black rectrices.\nthis bird's most distinct feature is its long, flat beak as well as its large wingspan.\na bird with a large black bill with downward curve, white superciliaries and brown plumage.\nthis bird has a large black bill with a white ring around the base of the bill.\nthis 
bird has wings that are black and has a long black bill\nthe bird has a curved black bill and two large brown wingbars.\nthis bird has a wide wing span covered in brown, grey and white feathers with a broad, blunt beak.\nthis bird has a brown crown, brown primaries, and a brown throat.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_139_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_139_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_139_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_139_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has blue and white petals which circle the green and purple pollen tubes\nthis unique flower features a strange arrangement of white-blue petals and alien-like pistil.\nthe flower is big and disc shaped with petals and stamen forming two layers around the green pistil\na flower with little pistil and is surrounded by leaves\nthis colorful flower has star shaped petals and a bright blue and white pistil.\nthis flower is white, blue, and purple in color, and has petals that are oval shaped.\na flower with large shiny white oval sepal, blue and white bristle like petals, large green stamen, and a deep purple pistil.\nthis flower has petals that are white and has purple stamen\nthis flower has white petals with blue, white, and purple filaments.\na flower that has long narrow pistils that are white and blue.\n", "context": "Select from the 
following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_140_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_140_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_140_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_140_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: some sort of web-footed bird with an orange beak sits atop a rock.\na black bird with wide black feet, a white eye, and bright orange bill.\nthe bird has a white eyering and a black throat that is medium.\nthis bird is mostly grey with webbed feet, and blunt orange bill.\na multi-toned orange and white beak, webbed feet, and white eyeing make up the dramatic characteristics of this small bird.\nthis is a gray bird with webbed feet, white eye, an orange beak and a feather on its crown that stands straight up.\nthis is a medium sized black bird, with white yes, a short bill and webbed feet.\nthis bird is black with black webbed feet, a short tail, a pale eye, a black plume on its forehead, and a short thick beak with bright orange blotches.\na black bird with a short orange beak, a white eye, and a black feather on the top of its head.\na small bird with blue feet and black wings with a orange rounded beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_141_0.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_141_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_141_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_141_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is blue and green in color, with petals that are oval shaped.\nthe petals of the flower are bright blue with white lines, and the stamen is bright green with black dots.\nthe flower has stamen and the petals are green in color.\nthis flower has green petals and purple and green stamen.\nthe flower shown has green pollen tubes with green sepal and blue petals\na large flower with neon colors and a large green stigma.\nthis blue and white flower has pointed petals and green sepals.\nthis flower has a flat row of pointed white petals and a flat row of thin blue filament on top of that.\nthis flower has petals that are green and has purple stringy stamen\nthis flower has large green petals under a fringed set of purple and white quills.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_142_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_142_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_142_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_142_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": 
"CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a black crown, belly, wings, the color covers its entire body.\nthe bird has a black belly, wings and a black tail.\nthis bird is black in color with a broad tail longer in length compared to the rest of its body.\nthis bird is black in color with a black beak, and black eye rings.\na black bird with a large, broad tail and a broad bill.\na completly black bird, with a rounded bill, and long tail.\nthis bird is black in color, with a curved black beak.\nthis bird is entirely black with a wide retrices and large top bill.\nthis all black bird has a long tail and a medium sized black bill.\na medium sized black bird, with a tail that is large for its body.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_143_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_143_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_143_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_143_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower has very thin pointy petals colored in blue white and purple\nthis flower is white and blue in color, with petals that are oval shaped.\nthis medium white flower has rows of thin blue petals and thick stamen.\nthis flower has 
petals and sepals shaped like a plate with an exposed pistil\nthis flower has prominent green and purple stamen and pollen tube surrounded by two layers of thin purple and wide white petals.\na clear pedal base flower with a white and purple stigma.\nthis flower is green, blue, and white in color, and has petals that are oval shaped.\nthe flower has a white petals with many stamen around the green pollen tube with a tan receptacle\nthis flower has light white petals with a small amount of green, shooting stamen that are white at the root and purple at the tips\nthis flower has petals that are green and has purple stringy stamen\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_144_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_144_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_144_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_144_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird is grey with a white body and small red beak.\nthis is a bird with a white belly, black back and an orange beak.\nthis is a bird with black and white feathers and a small straight beak.\na white bodied bird with black top feathers and small white eyes.\na small bird with a white spotted belly, black feet, black back and crown and bright yellow eyes.\nthis bird has wings that are black and white with a short orange beak\nthis bird has wings that are black and has an orange bill\nthis bird has a short 
orange bill, white breast and belly, and black crown and webbed feet.\nthis bird is black with a white throat, breast and belly that have some gray spots, the black feet are webbed and beak is pink with dark gray.\nthis bird has a grey crown with a small orange beak and grey feet.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_145_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_145_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_145_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_145_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a white breast and belly as well as a curved bill.\nthis is a white bird with a black wing and a large beak.\nthis is a tall white bird with a brown inner retrices and a long bill\na medium sized bird with a bill that curves downwards, and a white belly\nthe bird has a white body, with black primary and secondary wings, and black retrices.\nthis white and black bird has a long beak which curves downward.\nthis is a white bird with brown wings and a beak that curves downwards.\nthis bird has black winds and a white body with a long curved beak\na tall white bird with black wings, black eyes and a long curved bill with a black tip.\nthis bird is white with black and has a long, pointy beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": 
["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_146_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_146_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_146_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_146_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is all black with a slight blue tinge, and has a puffed body.\nthis bird is black with blue iridescent throughout and yellow eyes.\nthis bird is black with long wings and a very short beak.\nmedium black and blue bird with short tarsus and medium black beak\nthis bird is black with flecks of deep blue throughout, round yellow eyes with black pupils, and a beak that comes to a straight point.\nthis bird is black with blue and has a very short beak.\nthis bird is mostly black with sort of iridescent green and blue to the wings and body.\nthis bird has wings that are black and has yellow eyes\nthis bird is black with brown and has a very short beak.\na small black bird, with white eyes, and a sharp bill.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_147_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_147_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_147_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_147_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": 
"['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is white and purple in color, with petals that are oval shaped.\nthe petals of the flower are mostly white while the inner layer is detailed with purple.\nthis flower has a white petal and a lot of purple anthers surrounding the petals\nthis flower has a large pistil with several stamen, purple and white frills, and white, skinny petals.\nthis medium white flower has rows of thin blue petals and thick stamen.\nthis flower has wide rounded pale petals surrounding a fringe of hair-like purple petals.\nthe purple and white petals are thin and the sepals are green and wrinkly.\nthis flower has petals that are white and has stringy stamen\nthis flower has a lower row of white petals, an upper row of long, pointed purple petals and a prominent stamen and pistil.\na small flower with thin purple and white petals surrounded by white broad leaf petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_148_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_148_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_148_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_148_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a large bird 
has a stumpy bill, large tufts of black feathers on its breast, and a black crown.\na small bird with a black eye, black head, and dark bill.\na solid black bird with long tail feathers and a rounded beak that looks vey unusual.\nmedium black white and brown bird with medium black tarsus and medium black and white beak\nall black bird with a small bird and all black eyes.\nthis particular bird has all black feathers and a black bill and black eyes\nthe bird is completely black with a small head and rounded beak that blends into the head.\nthis bird has shiny black feathers and a curved, short beak.\nthis bird is all black and has a very short beak.\nthis bird has wings that are black and has a thick bill\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_149_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_149_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_149_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_149_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is pink in color, with petals that are oval shaped.\nthis light purple flower has long oval petals with light green stamen.\na purple and green flower with thick filament and anther.\nflower with star shaped purple petals and long blue stamen.\nthis flower has green pistil and purple petals as its main features\nthis flower has petals that are pink and has stringy stamen\nthis flower has purple petals, purple and 
white stamen and green anther filaments.\nthis particular flower has petals that are long and purple with a light green center\nthis flower can be characterized by its beautiful purple petals, blue stamen organzied in a circle around the flower, as well as its ornate stigma jetting out of the flower.\nthis flower has tall purple pistils, tall green stamen, purple petals and filaments.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_150_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_150_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_150_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_150_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has a gray belly and breast with darker gray head, and wings.\ndark gray bird with extremely long wings, large pointed beak, and no neck\nthis is a grey bird with a white back and a long pointed beak.\nthe bird has a white eyering and long secondaries that are dark grey.\nbird is really big with medium bill, it has dark grey and black feathers.\nthis bird has a long wingspan, and smooth feathers that are light gray tapering to dark gray on its head.\nthe bird has long black wingbars, a black eyering and curved black bill.\nthis dark grey bird has a wide wingspan with a white back and bright light eyes.\nthis bird has a black crown with a black bill and long wings with black secondaries.\nthis bird has very long wings that are mostly dark with 
some white in there. small eyes and small beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_151_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_151_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_151_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_151_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a green breast, black belly, and yellow eyering.\nthis bird is completely black and has a very short beak.\nthis bird has a large, curved, black bill, a blue throat, and a yellow eyering.\nthis bird is mostly black with a blue irdescent ring around it's neck.\nthis bird has a shiny black body and long black tail feathers, a pointy black bill and bright yellow eyes.\na small bird containing all black feathers except for the splash of teal feathers along its neck.\nthe bird has a yellow eyering, long outer rectrice, and black back.\nthis bird is mostly colored a very dark green (almost black) and has a blue throat, short bill, and black feet.\nthis bird is black and blue in color, with a black beak.\nthis bird has a blue and black breast coloration with a bright yellow eyering\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_152_0.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_152_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_152_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_152_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is white and purple in color, with petals that are oval shaped.\nthe petals on this flower are white with an elaborate pistil.\nthis flower has bright white petals with purple filaments and purple anthers.\nthis flower has wide rounded pale green petals and a hair-like fringe of purple.\nthis flower is white and purple in color, and has petals that are oval shaped.\nthis flower has petals that are white and has purple stringy stamen\nthis flower has white petals as well as a green pistil.\nthis flower has long light green petals under needle like lavender white and purple petals.\nthis flower has long light green petals under needle like lavender white and purple petals.\nthis flower has long light green petals under needle like lavender white and purple petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_153_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_153_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_153_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_153_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": 
"['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a larger black and grey bird with an orange beak.\nmedium black bird with medium black and orange beak and a small orange eye\nthis large bird has a large head compared to the body size, black wings, and a grey throat, belly and side.\nthis large bird is mostly black with a long blunt bill.\nmedium to large grey and black bird with medium black and orange beak\nthis bird has wings that are black and has an orange bill\na strange looking bird with a curved beak and small head in proportion to its body.\nthis bird is black with white and has a long, pointy beak.\nthis bird has wings that are black and has an orange bill\na black bird with a large orange beak and white eyebrows.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_154_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_154_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_154_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_154_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals are purple, the flower is completely open reveling the off white stamen.\nthis flower has petals that are purple with stringy purple stamen\nthe flower shown has purple petals with a 
yellow pistil\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower has a bottom layer of dark purple petals with a top layer of very long and thin purple petals that look like they are wiggling.\nthis flower has petals that are white and has stringy stamen\nthe petals of the flower has a hair like texture, and consist of various shades or purple and blue.\nthis flower has short petals and hairy anthers.\nthis flower has long purple petals and a layer of purple filaments with wavy ends.\nthis flower has long purple petals and a large pale yellow pistil.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_155_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_155_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_155_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_155_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals of this flower are green with a long stigma\nthis flower is purple and white in color, with petals that are oval shaped.\nthis flower has petals that are yellow with purple stamen\nthis flower has thick green stamen and purple fringe surrounded by wide white petals with pointed tips.\nan odd looking flower with string like purple petals over long white petals which surround large white stigma.\nthe variegated purple and white petals look like fringe.\nthis flower has petals that are white and has purple stamen\nthis flower is purple 
and white in color, and has petals that are oval shaped.,\nthis flower has white petals with stringy purple stamen overlapping it.\nthis flower has a bottom layer of white petals followed by a layer of wavy purple filaments.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_156_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_156_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_156_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_156_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: gray and charcoal bird with a large wing span and pointed beak\na medium bird with a gray body, wings, and dark gray face and bill.\nthe bird has a white eyering and a long grey bill.\nthis bird is grey with white on its chest and has a long, pointy beak.\nthe bird has a white belly and back with a black head and striped wings and tail.\nthis bird is grey with black and has a long, pointy beak.\nthis bird has a bright white eye ring, black head, and grey and white feathers covering the rest of its body.\nthis bird has no neck and a long beak and is gray.\nthis bird is gray it has a very white eye-it has a long beak and big wings\nthis bird has a black crown with a grey belly and long grey wings.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": 
["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_157_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_157_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_157_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_157_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower has white and purple petals, with the yellow stamen clearly visible.\nthis flower is purple and white in color, with petals that are oval shaped.\nthe flower has stringy purple petals with green pollen tubes in the middle\nthis flower has petals that are purple with green stamen\nthis flower has protuberant green and purple stamen and pollen tube surrounded by fringed thin purple petals, which are in turn surrounded by slightly pointed wide white petals.\nthis flower has petals that are purple and has stringy purple stamen\nthis flower has white petals with blue filaments and light yellow pistils.\nthis flower has purple and green petals under a purple fringe and thick green stamen.\nthis flower is white and purple in color, and has petals that are multi shaped.\nthere is a bottom row of pale purple petals, and an inner row with numerous thin, dark purple spikes, and a prominent pale green pistol.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_158_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_158_1.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_158_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_158_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: small grey bird with white feathers, white stripe and orange beak.\nthis bird is grey in color with a orange short beak and white eye ring.\nthis bird is mainly all grey in color except for the white spots all over its body, and its white vent.\nthis small bird has an orange bill and a brown body with white spots.\nthis is a black bird with a white eye and an orange bill.\nthis bird has wings that are brown and has an orange bill\nthis bird has a short rounded orange bill, grey to dark grey crown, white cheek patch, and grey with white spots brease.\nthis bird who is swimming in the water has an orange beak, white streaks going down its head, and a dark gray body.\nthis bird has wings that are grey and has an orange vbill\nthis is a swimming bird and has an orange beck with white eyes and gray and white body.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_159_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_159_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_159_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_159_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", 
"options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the flower shown has purple petals with purple anther and filament and a green pollen tube\nthis unique white flower has long thin purple petals and a strange green center.\nthis strange flower has light-pink and purple petals with a ridiculous set of green stamen and pistil.\nthe petals of the flower are light purple and the stamen is pale green.\nlower fleshy pale pink petals with upper filament corolla and prominent stamen and stigma.\nthis flower has light and dark purple petals and the pedicel is green\nthis flower has flat elongated oval petals of a creamy lavender with central petals that are a spiky purple and large greenish stamen.\nthis flower has petals that are pink and has purple stamen\nthe pistal is very busy with lavendar petals at the bottom and many stamens and pollen tube\na flower with long and narrow petals that are purple.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_160_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_160_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_160_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_160_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: white petals on the outside purple white blue pedals inside purple yellow and 
white middle dark green leaves\nthe flower has large petals that are white and thin blue petals.\nthis flower is white and blue in color, with petals that are oval shaped.\nthis flower has petals that are white with stringy purple stamen\nthis flower has white petals and green and purple stamen.\nthis flower is whit , blue and green in color, and has petals that are oval shaped.\nthis flower has petals that are white and has stringy purple stamen\na clear pedaled flower base with a white and blue with a long stigma.\na flower with long and narrow petals that are white.\nthis flower has rounded pale-green petals under a thick fringe of quill-like white and purple.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_161_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_161_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_161_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_161_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the petals of the flower are pink in color and have filaments that are pink in color.\na purple flower with stringy petals on top and large petals underneath.\nthe flower has waved petals that are lavender and very thin.\nthis flower is purple and green in color, with petals that are oval shaped.\nthe petals on this flower are purple with an elaborate pistil\nthis flower has elongated purple petals under a ring of squiggley lavender white and plumb needle like 
petals.\nthis flower is purple in color, and has petals that are oval shaped.\nthis flower has light lavender petals layered with purple fringe petals and a large pistil.\nthis flower has horizontal lavender petals with many squiggly stamen laying sideways.\nthis flower has petals that are purple and has stringy stamen\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_162_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_162_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_162_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_162_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a white eyering and a small red bill.\na small bird with black and white markings and bright white eyes.\nthis is a black bird with a white spotted belly, a white eye and a red beak.\nthis funny looking medium sized bird has a red beak and is grey and white\nthis bird has a large white and grey breast, with a red beak and round eyes.\nthis bird is black with white and has a very short beak.\nthe small bird has a large white eye, a short orange and gray bill, and dark colored secondaries.\nthis is a highly unusual bird with a white eye with a small black pupil, and a short red beak.\nthis erect bird has big, almost white eyes, a white belly spotted with black, and white spotting at the crown.\nthis colorful bird has red beak white eyes ring gray all over\n", "context": "Select from the following choices.\nA: 
The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_163_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_163_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_163_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_163_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has light greens sepals and thin lavender petals.\nthis flower is purple and green in color, with petals that are oval shaped.\nthis flower has petals that are green with purple stamen\nthe flower has large oval white petals and thin purple petals.\nthe petals on this flower are green with an elaborate pistil.\nthis flower has wide green petals beneath a round layer of purple hairy fringe.\nthis flower has petals that are green and has stringy purple stamen\nthe stamens of the flower are in the shape of a circle, and have various shades of purple, white, and maroon throughout.\nthis flower is white, green, and purple in color, and has petals that are oval shaped.\nthis flower has large open green petals topped by needle shaped lavender white and maroon petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_164_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_164_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_164_2.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_164_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a small bird which is black all over with a long tail and a very fat black bill.\nthe all black large bird has a large bill and small eyerings.\nthis is a black bird with a large crooked bill.\na small black bird with a large black bill and a spiked crown.\nthis bird is black in color with a black beak, and black eye rings.\nmedium to large black bird with large black beak and medium black eyes\nthis particular bird has a black body with a short black bill\na black body bird with a regularly sized head in comparison to the body.\na black bird with a large beak.\nthis bird is black it has a very large beak it looks to be a large bird\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_165_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_165_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_165_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_165_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: outer petals are green in color and klarger,inner petals 
are needle shaped\nthe flower has petals that are purple and white, with spread purple filaments, and green stamen.\nthis flower has long purple petals and long purple stamen in the middle\nthe pretty flower has large light purple petals with long thin stamen in it center.\nthis flower is pink and purple in color, with petals that are oval shaped.\nthis flower has flat petals that are light purple and white in color along with a layer of flat, blue filaments.\na flower with long and pointed pistils that are dark puruple.\nthis flower has pink and white petals with long stringy and purple stamen\nthis flower is pink and purple in color, and has petals that are oval shaped.\na light purple flower with thick stigma and dark purple stamen hiding under green leaves with purple veins.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_166_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_166_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_166_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_166_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: brown headed bird with a long, black, curved bill and darker tail feathers.\nthis is a medium sized grey bird with light coverts and dark primary and secondary wings.\na large grey bird with black cheek patches and a black bill.\nthis is a larger type bird with a gray body and a darker face.\nvery large bird with a long large black beak and white eyes.\nthis 
bird has wings that are grey and has a black bill\na brown bird with black eyebrow, the bill is long and curved and the eyebrow is black\nthis bird has brown waterproof plumage and a curved black bill.\nthis bird is grey with darker brown and black tint on the face and ends of the wings and tail feathers.\na medium size bird with grey coloring and black beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_167_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_167_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_167_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_167_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird is dark gray in color, has a stubby bill with some type of feather growths coming out of the top of it.\nthis bird has feathers pointing upward just before its beak, small white eyes, and brown feathers covering the rest of its body.\na grayish black bird with white eyes and orange stubby beak that has a tuft of gray feathers on it.\nthis magnificent specimen is mostly brown, with a white superciliary and large plumes on top of it's bill.\nan exotic looking brown bird with a white highlight on its face and orange beak.\nsmall bird with a black body, white eyes, orange beak, and black comb.\nthis bird has wings that are brown and has a long orange bill\nthis black colored bird has a bright orange beak and white eyes.\na medium size bird with a white eye and thick, orange 
beak.\nthis bird has wings that are brown and has an orange belly\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_168_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_168_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_168_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_168_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this unique flower has a strange set of pistil and stamen with white-pink petals.\nthis flower pale purple and has lost of petals around a green center.\nthe stamen of this purple flower appear to be a purple color as well.\nthis flower has green anther and filaments surmounting a cluster of fine purple petals and larger leaf shaped mauve petals.\nthe flower features light purple petals and dark, vibrant purple stamen surrounding many green style.\nthis flower has pale purple petals with many purple stamen and green stigma in the center.\nthis flower has petals that are pink and has stringy stamen\nthis flower has a layer of flat oblong white petals underneath a separate layer of flat blue filaments with upright stamen and pistils.\nthis flower has wide purple and green petals surrounding thick green stamen and a purple fringe.\na flower with long and narrow petals that are light purple.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": 
["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_169_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_169_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_169_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_169_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this unique bird has webbed feed, is all-around black and, with a sharp malar stripe from its striking white eyes, leading from a bright orange bill and wispy, thin black feathers adorning the front of the face stemming from its crown.\na large bird with a white eye and orange bill, with a gray breast and gray belly.\nthis bird is black in color, with a vibrant orange beak and a white eye ring.\nthis distinct black bird has a bright orange bill and a feathered hat on its nose.\nthis is a dark gray bird with a white eye and a small orange bill.\nthis medium sized bird has a very long neck, a bright orange beak and a tall feather on it's crown.\nthis bird has a white eye ring and eye brow, a grey chest, belly and vent and black feathers on the rest of its body.\nthe bird has black feathers and a bright orange bill. 
it has white eyes and an odd black plume near it's bill.\nthis bird has a long neck covered in black feathers with white eyes and a white stripe behind his eyes and feathers standing vertical to it's short orange beak.\nthis bird has wings that are black and has an orange bill\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_170_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_170_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_170_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_170_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has a spiky black crown and white eyebrow. 
the rest of the bird is gray and it has webbed feet.\nthis bird is mostly gray, with a short orange bill and white superciliary.\nsmall black bird with large grey feet, grey breast and belly, small white cheek patch and malar stripe, black feathered crown, and short rounded orange beak.\na small bird with a grey belly and black back with a feather on its beak.\nthis sautty bird has a grey chest and breast area and white eyes.\nthis distinctive bird has white eyes, a small orange beak, a grayish black body, and black feathers on its bill.\nthis particular bird has a gray belly and breasts and a short orange bill\na black bird with a gray breast and white eyes.\nthis bird has a yellow bill, and a tuft of feathers that stand up right at the top of the bill.\nthis bird is black and gray in color, and has a bright orange beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_171_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_171_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_171_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_171_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is pink and purple in color, with petals that are oval shaped.\nthe petals on this flower are purple with green stamen.\nthis flower has petals that are pink with purple stamen\nthis flower is bright purple with purple anthers and filaments and yellow stigma.\nthis flower has large green stamen 
surrounded by a purple fringe and wide purple and white petals with rounded edges.\nthe petals are long and light purple and the stamens are purple with green anther.\nthis flower has petals that are pink and has stringy purple stamen\nthis flower has long tapered lavender petals that surround long, thin purple petals and surround yellow stamen with a dark purple stigma and pollen tube.\nthis purple and pink flower has many pointed petals with green and yellow anthers.\nthis flower is purple in color, and has petals that are oval shaped .,\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_172_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_172_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_172_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_172_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this dark grey bird has a orange bill with white eyes and a feather hanging over its bill.\nthis bird is all black, with black webbed feet, a black plume, and orange beak.\nthis bird is black with an orange, short, stubby beak.\nthis bird is mostly gray with a short orange bill and webbed feet.\nthis tall black bird has an orange bill and a long feather protruding from its face.\nthis bird has wings that are black with an orange beak\nthis bird is all black and has a very short beak.\nthis black bird has white eyes and black plumage on top of a bright orange shortened beak.\nthis bird has wings that 
are black and has an orange bill\nthis bird has wings that are black and has an orange bill\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_173_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_173_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_173_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_173_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the medium sized bird has a dark grey color, a black downward curved beak, and long wings.\nthe bird is dark grey brown with a thick curved bill and a flat shaped tail.\nbird has brown body feathers, white breast feathers and black beak\nthis bird has a dark brown overall body color, with a small white patch around the base of the bill.\nthe bird has very long and large brown wings, as well as a black body and a long black beak.\nit is a type of albatross with black wings, tail, back and beak, and has a white ring at the base of its beak.\nthis bird has brown plumage and a white ring at the base of its long, curved brown beak.\nthe entire body is dark brown, as is the bill, with a white band encircling where the bill meets the head.\nthis bird is gray in color, with a large curved beak.\na large gray bird with a long wingspan and a long black beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": 
["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_174_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_174_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_174_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_174_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is white and purple in color, with petals that are oval shaped.\nthe flower has big and long skinny petals that are light purple.\nthe petals on this flower are purple with an elaborate pistil.\nthis flower has petals that are purple with stringy stamen\nthis flower is bright purple with purple petals and anthers and a yellow stigma.\nthis flower is purple and white in color, and has petals that are oval shaped.\nthis flower has oblong shaped purple petals covered by long and wavy purple filaments.\nthis flower has a bottom row of lavender rounded petals and a top row of hair-like curly lavender petals with white and dark purple stripes.\nthis flower has petals that are purple and has stringy stamen\nthis flower has dozens of stringy light purple petals that have alternating white and dark purple rings towards the ovule.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_175_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_175_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_175_2.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_175_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is brown with a lighter brown crest.\naquatic large bird with long hooked bill, white face, and brown body.\nbird has brown body feathers, brown breast feathers, and brown beak\nthis bird has a white superciliary and brown all around its body with a long bill\nthis is a brown bird with a white face and a long downward pointing beak.\nthis bird is brown with white and has a long, pointy beak.\nbrown duck playing on the lake making a poodle\nthis bird has wings that are brown and has a long bill\nthis bird has long brown bill, with a brown body.\nthis is a medium sized brown bird, with a long pointed bill.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_176_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_176_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_176_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_176_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the anthers are on think filaments that are curved, with tiny purple 
petals.\nthis flower is white and blue in color, with petals that are oval shaped.\nthis blossom has very large sepals, the pedals are long and very narrow, the ovary and pistil are very large with elaborate shapes.\na odd shaped flower with a center series of flower petals in the shape of a star with long purple, yellow and green stamen.\nthis flower has petals that are green with stringy stamen\nthis flower is white, yellow, and blue in color, and has petals that are oval shaped.\nthis flower has green petals as well as a purple and green pistil.\nthis flower is multicolored, with light green wedge shaped petals and blue-tipped stigma.\nthis flower has petals that are green and has purple stringy stamen\nthis blue, white, and purple flower has pointed pedals and green sepals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_177_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_177_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_177_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_177_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a bird, which is rather large and is white and black in color flying through the sky.\nthe bird has a white underbelly, black feathers in the wings, a large wingspan, and a white beak.\nthis bird is a large bird with dark gray wings with hints of light gray its body and head all all white, the beak is also white but with a black tip.\nthis bird has a 
white breast and crown, yellow bill and black tipped primaries.\nthis large black bird has a white throat, breast, and abdomen.\nthis bird has a long wingspan, a white belly, and a white crown\nthis bird has a white crown, throat, belly, and abdomen with black inner rectrices.\nthis particular bird has a white belly and breasts with a large black wingspan\nthis particular bird has a belly that is white and has black patches on it\nthis bird is black with white and has a long, pointy beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_178_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_178_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_178_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_178_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the pistil area of this flower appears similar to a venus fly trap in design.\na purple and white flower with skinny long petals and a very large pistil.\nthere are many purple stamens and one large pistil with two colored overlapiing petals that are white and purple in color\nthis flower is purple and white in color, with petals that are oval shaped.\nthe flower shown has purple and white petals with a green pollen tube\nthis flower has petals that are pink and white and has purple stamen\nthis flower has a tall green pistil, dark blue filaments, and purple and white petals.\nthis flower is white, pink, and purple in color, and has petals that 
are oval shaped.\nthis flower has purple petals as well as a green pistil.\nthis flower has a row of alternating purple and white petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_179_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_179_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_179_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_179_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: a unique looking flower that has multiple colors and long petals\nthe flower has white thin petals around blue stamen with green pistil in its centre\na very unique flower with white and lavender petals and purple anther filaments.\nthis flower is white and purple in color, and has petals that are round and long.\nthe stigma is purple and the stamen are purple whereas the petals are white.\nthe flower on this particular picture has petals as well as a pistil.\nthis flower is green white and purple in color, and has petals that are oval shaped.\nthis flower has petals that are white and has stringy purple stamen\nthis flower has white and purple petals and a green pedicel\nthis flower has white petals arranged in a disk type of shape and light blue filaments on top of the petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": 
["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_180_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_180_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_180_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_180_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals in the shape of a circle and are purple and stringy\nthis flower has violet petals with light green stamen and bell-shaped anthers.\nthe flower is so big and has petals that are soft, smooth, thick and separately arranged around the stamen forming a bowl like shape\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower is very unique as it has different kinds of petals and three branches of stamen and pistil.\nthis flower has petals that are purple and has stringy stamen\na flower with long and narrow petals that are white.\nthis flower has a ghostly lavender petals surrounding curly green stamen.\nthe flower on this particular picture has petals as well as a stamen.\nthis flower has elongate green and purple petals below a ring of purple and white needle like petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_181_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_181_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_181_2.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_181_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has an orange and brown bill and a grey breast.\na larger bird with a large mostly yellow beak and two toned brown feathers.\nthis is a black bird with a grey breast and an orange beak.\nthis is a water bird with an orange bill, brown feathers and yellow eyes.\nthis is a brown, swimming bird with an orange bill and tan on the breast.\nthis large bird is solid dark gray color with a bit of white mixed in on its belly.\nthis bird has wings that are black and has a big orange bill\nthis particular bird has a belly that is brown and black\nthis bird has a black crown as well as a yellow bill\nthis large bird has a predator's beak, with a big head, black body, and brown chest,\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_182_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_182_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_182_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_182_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has 
a dark grey color, with a large bill and long wingspan.\nthis grey bird has an impressive wingspan, a black head, and comparatively medium-sized black bill.\nthis is a grey bird with black wings and a black head and beak.\nthis bird has a gray body with a black head and very long darker gray wings.\nthis bird has large black wings and head, black bill and white abdomen.\nthis is a bird with a light grey body, darker grey wings, a black bill and lighter grey wingbars.\nthe large bird has a dark colored bill, long dark wings, and a gray back.\nthis bird is very small with a black crown and point black beak, but has a very expansive wingspan.\nbird has black crown and beak, grey back, long black wings and tail.\nthis bird has a wide wing span, with a long black bill.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_183_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_183_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_183_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_183_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird has a long beak, white face, brown wings and white legs.\na large bird with a large wingspan, covered in brown feathers from its back, to its wings, with a white head, and rump.\na bird with a large downward curved bill, white throat and head, brown breast and white abdomen.\na brown winged bird with a white rump and head, a brown tail and a long light yellow beak 
with a slightly curved tip.\nthis seabird has a white head and brown wings, with a nice shapely yellow beak.\nthe head of the bird is white in color and the body is grey.\nthis bird has a long pointy bill and a white head.\nthis bird has wings that are brown and has a white belly\nthis bird has a white crown, brown primaries, and a white throat.\na cigar-shaped white bird with long brown wings and hooked long bill.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_184_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_184_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_184_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_184_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird is mostly gray with a bright orange bill.\na bird with a gray body and wings, a white eye with cheek patch, and orange bill.\na large bird with an all dark grey body, grey and black wing feathers, and a curved bright orange bill.\nthis bird is mostly grey with a short bright orange bill.\nthis is a gray bird with a white eyering and a large gray wingbar\nthis bird is black with white and has a very short beak.\na black bird with white eyes and a orange bill.\nthis is a medium sized black bird, with a short yellow bill.\nthis bird has an ashy black coloring with a streak of white behind the eyes and a small pink beak.\na medium sized bird with black feet and a black breast and belly with a orange and black 
beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_185_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_185_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_185_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_185_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has green pistil and purple and white petals as its main features\nthis flower is blue and white in color, with petals that are oval shaped.\nthe petals of this flower are purple with a long stigma\nthis flower has petals that are are white with purple filaments and purple anthers.\ninner petals are purple in color and are needle shaped,outer petals are white in color\nthis blue purple and white flower has pointed petals and white sepals.\nthis flower has very thin stamen that are colored dark purple, lavender, and white.\nthis flower has petals that are white and has purple stamen\nthis flower has pale green rounded petals with a fringe of purple and white quills.\nthis flower has a single row of white oblong petals followed by a row of flat blue filaments.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_186_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_186_1.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_186_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_186_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flying bird has long brown wings and a black bill with a white stripe around it.\nbird has brown body feathers, brown breast feathers, and brown thick beak\na large brown bird with white secondaries, a black bill and yellow eyes.\nthis bird is black with brown on its stomach and has a long, pointy beak.\nthis is a brown bird with a white eye and a long and pointy bill.\nthis bird is mostly dark grey and has a white ring around its bill.\nthis is a brown bird with an incredible wingspan and an extended bill for its size.\na large long bill, wide wing span dark brown bird with light beige tipped primaries.\nthe large bird has a black bill, crown and nape and long wings that have brown secondaries and black converts together with a brown belly and tail.\nthis bird is brown with black and has a long, pointy beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_187_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_187_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_187_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_187_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", 
"options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: two layers of pale violet petals are present, including a lower ring of pointed petals and an upper ring of wavy, filament-like petals, surrounding a pale green stamen.\nthe flowers has needles petals around the the stigmas and stamens.\nthis flower is white and purple in color, with petals that are oval shaped.\nthis flower has long white stamen and petals that are white in color\nthe outer petals are green in color,inner peals are purple and needle shaped\nthis purple and white flower has pointed petals and green yellow stamen.\nthe flower has bright green and purple petals with green and yellow stamens.\nthis flower has hair like petals with a yellow stamen in the center of it.\nthis flower has petals that are white and has purple stringy stamen\nthis flower is purple, white, and yellow in color, and has petals that are oval shaped.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_188_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_188_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_188_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_188_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this bird's coloration is varying shades of gray and has dark primaries, a dark crown and a 
long, slender bill.\na larger bird that is gray and white with a large wingspan.\nthe bird has a white and grey body with a grey beak and grey wings.\na large gray bird with a dark gray beak and gray wings.\nthis large bird has a gray body, white eyering, and long hooked bill.\nthis bird has a white and gray back, a white eyering and a black bill; the rest of its body is varying shades of gray.\nthis particular bird has a white back and gray secondaries\nthe bird has a white eyering and two large black wingbars.\nthis bird has a very large wing span, and a long black bill.\nthis bird is gray and black in color, and has a long black beak.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_189_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_189_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_189_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_189_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has an elaborate golden stamen and two different types of purple petals.\nthis flower is purple and pink in color, with petals that are oval shaped.\nthis flower has purple petals with blue and purple filaments and a yellow and green stigma.\nthe petals of this flower are purple with a long stigma\nthis flower has petals that are pink with purple stamen\na flower with long and narrow petals that are light purple.\nthis multi-purple flower has large petals ranging in 2 shades of 
purple with a stigma that is the color of light green and yellow.\nthe flower has white and pink petals and the pedicel is green\nthis flower has petals that are pink and has purple stamen\nthis flower has wide purple and green petals surrounding green stamen and a purple hair-like fringe.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_190_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_190_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_190_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_190_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a black crown, and has long white feathers on its cheek patches.\na close up of a bird with black crown, white eyebrow, white malar stripe, a blueish throat and brast, and a yellow beak with a blakened top to the bill.\nthis is a grey bird with a black head and a pointy orange beak.\nlarge bird with a short orange beak with a curve on it; has white whiskers.\nthis bird is black with white on its feathers and has a long, pointy beak.\nthis particular bird has a gray neck and black head with white cheek patches\nthis is a large bird with a light blue body dark blue wings and a darker head with white on it and a yellow beak.\nthis bird has wings that are black and has a yellow bill\nthis gray bird has a dark gray head, white cheek patches, and an orange bill.\nthis bird is grey with black and has a long, pointy beak.\n", "context": "Select 
from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_191_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_191_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_191_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_191_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this weird flower has a dense arrangement of blue-white petals and an ornate pistil-stamen arrangement.\na unique blue and white flower with green anther filaments.\nthis unusual flower has long thin blue and white petals and a green center.\nthe flower is big with petals and stamen formed like layers of discs with pistil sticking out in the centre\nthis flower is white and purple in color, and has petals that are light green.\nthis flower is white and purple in color, with oval shaped petals.\na unique flower with large white and green petals, long purple stamen sprouting from an ovary covered pistil.\nthis flower has petals that are green and has stringy purple stamen\na flower with long and narrow pistils that are blue with white centers.\na very beautiful yellow flower with exquisite petals and striking beauty of a plant at its finest moment\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_192_0.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_192_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_192_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_192_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower is white and purple in color, with petals that are oval shaped.\ninner petals are needle shaped,and are purple in color\nthis flower has petals that are white with purple stamen\nthe petals on this flower are white with an elaborate pistil.\nthis flower has large white petals and a light green pistil.\nthis flower is purple, white, and green in color, and has petals that are oval shaped.\nthis flower has white petals and has long and stringy stamen\nthe flower has thin long white petals with green stamen.\nthis flower has a row of elongated greenish petals under a row of needle like lavender white and purple petals.\nthis flower has a row of elongated greenish petals under a row of needle like lavender white and purple petals.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_193_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_193_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_193_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_193_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": 
"CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this large bird has a buff colored belly, a long heavy beak on a white-fronted face , and long, dark brown wings.\nthis bird has a large, curved, gray bill, a white cheek patch, and a gray breast and belly.\na medium sized bird with a grey body and a bill that curves down wards\nthis gray bird has black wings and a white head, and a long beak.\nthis bird is black and brown in color with a curved black beak, and black eye rings.\nthe black wings have brown wingbars, the bill is short and pointed, and the head is small compared to the body.\nthis bird has brown and white wings, grey breast, belly and vent, and a white ring around its bill.\nthis bird has wings that are grey and has a long black bill\na large bird with large wings and bill.\nthis bird has a slight hook shaped beak and a wide wing span, it's body is a light brown color.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_194_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_194_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_194_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_194_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this is a gray bird with a whitish belly and a short orange bill.\nan 
average sized bird, its black and white body blends in with the rock it is sitting on\na large footed bird with a short, blunt, orange bill, distinct long white feathers coming from its eye, is white from breast down around to its undertail coverts, and grey from its head, on its wings and to its tail.\nbird has gray body feathers,white breast feather, and orange beak\nthis bird has a short orange bill, a white belly & breast, white tarsus & feet, and a gray crown.\nthis bird is grey with white on its chest and has a very short beak.\nthis stout bird has a white belly, a bright orange bill, a white eyering, with a gray on the side and the wings.\na small bird with a white belly, a small orange beak, and a white feather sticking out from its eye.\nthis bird has wings that are black and has a white belly\nthis bird has wings that are black and has a white belly\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_195_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_195_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_195_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_195_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this flower has petals that are white with purple stringy stamen\nthis flower is blue and white in color, with petals that are oval shaped.\nthe flower is so big and has disc of petals below the disc of blue and white stamens\nseapls are green in color,petals are white in 
color and inner petals are purple\nthis flower has petals that are white with purple filaments and green anthers.\nthis flower has pure white petals with lavender stigma and a green stamen.\nthis flower has white petals with purple and white anthers.\nthis flower has thin white petals and stringy purple stamen\nthis flower has white petals with bright blue filaments, purple pistils, and green stamens.\nthis flower has rounded pale green petals and a layer of thick purple and white quills.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_196_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_196_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_196_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_196_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the bird has a white eyering and a white breast and belly.\na large bird with a long black bill, white back, breast and belly, and white and grey wings.\nthis big bird has a white belly and back, black wings and head, and a blue bill.\nthis bird has a black crown, a flat bill, and a white breast\na medium bird with a white belly and back, gray rump and a large gray head and bill.\nthis bird has wings that are black and has a white belly and chest\nthis bird is white and black in color, with a large black beak.\nthis bird has a large black and white body, a large black head, a long black beak that curves downward.\nlarge bird with a white chest and 
brownish grey wings and head. its beak is rounded and somewhat long.\nthis bird has wings that are black and has a white belly\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_197_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_197_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_197_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_197_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "Oxford_102_flower_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: the pistil is white and is very noticeable, and the petals are purple.\nthis flower is purple and white in color, and has petals that are very skinny like strings.\nthe flower shown has green sepal and lots of purple and white anther\nthe rounded and notched bright green leaves of this plant surround a vibrant purple bloom that features curling lavender petals, rounded lavender sepals, and a tall, white pistil.\nthis purple flower has two different types of petals, one type is stink like and the others are oval shaped.\nthis flower has large purple petals and a white pistil.\nthis flower has many purple petals as well as some strange curly hair-like whitish-purple things.\npurple string like petals above another wider purple set. 
bright yellow pistil and stamen.\nthis flower has petals that are purple and has stringy stamen\nthis flower has stringy purple petals and a green pedicel.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_198_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_198_1.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_198_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_198_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "text2image_retrieval", "visual_input_component": "['natural_image']", "source": "CUB220_2011_retrieval", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Please find the most relevant picture among the candidate images for this description.\nDescription: this longtailed black bird has a black and white spotted breast.\nthis bird is shiny blue in color with a small black beak and black eye rings.\nthis is a dark blue bird with white eyes and a small beak.\nthis bird has solid black wings and a solid black head.\nthis bird has a rounded breast, a small bill, and a short neck\nthis is a jet black bird with mottled black and white belly and long black tail feathers.\nblack bird with long tail sitting on a rail.\nthis bird has a black pointed beak, with yellow eyes.\nthe bird has a small bill and a black back and belly.\nthis bird has a black pointed bill, with a black breast.\n", "context": "Select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_199_0.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_199_1.jpg", 
"./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_199_2.jpg", "./High-level-obj-semantic/text2image_retrieval/text2image_retrieval_199_3.jpg"], "output": "D", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/textual_cloze/qwen3-vl/metadata_info.json b/results/textual_cloze/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..d3b1425 --- /dev/null +++ b/results/textual_cloze/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Baconpancakes\nB: 7. Con't Sweet & Sour Sauce\nC: 8. Thicken the Sauce\nD: Ingredients", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Garlic & Quorn', 'Bake', 'Freezing and Serving']", "context": "Here is the context of these images:\n. Quorn 600gr - or ground meat or just veggies .. perhaps fish could work?\nOnions, garlic, corn, baked beans, paprika powder and everything else you need to make a filling. \nSet the oven for medium heat. I use 220'C. . Add oil and butter at medium heat to a big pot. Do not burn the butter. . Add chilli flakes to the pot and remove from heat. Chop some onions and add to the pot. Put the heat on again and stir. Do not burn. \nI find that adding the chilli in this way releases the flavour more evenly. . Add chopped garlic and quorn to the pot and stir. Let it fry a bit. . Add tomato sauce , taco sauce, water , spices, paprika powder and top it off with some mustard. Stir and let simmer for a while. . Make a basic sweet white dough. I'm doing a double load in the bread maker , enough for two loaves. My basic recepie is ;\nFlour 1,8 liter\nWater 0,6 liter\nOil 0,2 liter\nSalt 1tablespoon\nSugar 8 tablespoons\nDry jeast 1 packet\nYou'll need twice this for this much sauce. . Divide you dough into 16 pieces. Roll a piece round and flatten it to a disc. Put the disc on a bakingplate with a cookie sheet on. 
\nAdd a bit of filling, cover with a slice of cheese and brush the edges with water. Fold the edges up to make a small package. \nRepeat. . Bake for 10-15 minutes at 220'C or until slightly brown. . If you made them to freeze like I do make the sauce a bit spicier since it looses some after freezing. \nHeat from frozen to edible in a microwave at full power for three minutes. Serve with a salad for a complete meal. \nHeat on a BBQ by letting them thaw out to room temperature first. \nRead the question below and select from the following choices.\nA: Baconpancakes\nB: 7. Con't Sweet & Sour Sauce\nC: 8. Thicken the Sauce\nD: Ingredients", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_0_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_0_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_0_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_0_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_0_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_0_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_0_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_0_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_0_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_0_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_0_10.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Outback \"Copy Cat Recipe\" From Aunt Jo\nB: Next, Make a Bath\nC: Fold Down Retention Band\nD: Fold and Crimp", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Cut Top Off', 'Cut the Retention Band', 'Measure and Cut Flaps', '@placeholder']", "context": "Here is the context of these images:\n. You'll need one standard milk cartonbox cutterpencil or permanent markerruler. Measure 1 cm down from the top edge of the carton body. Mark an horizontal line on each side. 
Be careful, knives are sharp! Cut along these lines through all four sides. Remove the top and discard.. Measure down 2.5 cm from the new top edge. Mark an horizontal line at this point on three sides. Cut along line through three sides. Leave band attached to one side of carton body.. Measure up 9 cm from bottom of carton, make a mark at corner junction. Make a cut from this mark, through the corner up to the band.. There should be three flaps and one flap with an attached band.. Fold three side flaps down evenly into the cavity of the carton.. Finally fold down the last flap with the retention band. Work the band down around the side of the carton to form the closure. And Voila!This is my very first instructable! Please comment! And I hope you will enjoy!\nRead the question below and select from the following choices.\nA: Outback \"Copy Cat Recipe\" From Aunt Jo\nB: Next, Make a Bath\nC: Fold Down Retention Band\nD: Fold and Crimp", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_1_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_1_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_1_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_1_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_1_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_1_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_1_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: How to Make Marmalade\nB: How to Make Brioche\nC: BONUS\nD: Store", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather Walnuts From Ground', 'There Are Two Methods to Remove the Green Skin of Walnuts', 'Protect Your Hands With Gloves Or...', '@placeholder']", "context": "Here is the context of these images:\n. 
After we hit the walnuts from walnut tree, walnuts dropt to the ground, and we filled our sacks with walnuts. . There are two methods to remove the green skin of walnuts:1. Use knife to cut directly.2. Use crevises to divide the green shell with your hands with gloves. (Protect your hands not to get coloured)While my father is cutting walnuts, I prefered to use second option to peel walnuts. . Use your hands to extract all walnuts one by one to peel.. Although I used gloves to protect my hands, still I get colour changes on my skins... I can't understand how my gloves unable to protect my hands from walnut's shell.. Next time, I will use two gloves each to my hands.. Check for subtitles for your native language. Video is Turkish. .That's all !. This year, we collected a little bit early, but I think just in time because next week it rained, and made tree and walnuts wet. We protected walnuts from wet to prevent walnuts become moldy. I used three gloves on my hand, but still got the reddish color on my hands :) \nRead the question below and select from the following choices.\nA: How to Make Marmalade\nB: How to Make Brioche\nC: BONUS\nD: Store", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_2_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_2_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_2_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_2_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_2_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_2_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_2_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_2_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_2_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_2_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_2_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_2_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_2_12.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_2_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_2_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_2_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_2_16.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Homemade in the Jar Pickles\nB: Homemade Pumpkin Spice\nC: Screw on Blade\nD: You Will Need", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather Your Ingredients', 'Limes', '@placeholder', 'Eat']", "context": "Here is the context of these images:\n. The food:\n1 14.5 oz can of peeled whole tomatoes\n1 can of chipotle peppers in adobo sauce (can size doesn\u2019t matter as you need less than one pepper)\nLimes (you\u2019ll need \u00bc cup of freshly squeezed lime juice, so you\u2019ll need either 1 large lime or 2 small limes)\n1 clove of garlic\n\u00bc teaspoon of salt\n\u00bc cup (overflowing) of cilantroThe tools:\nBlender\nKnife\nCan opener\nCutting board\nWhile I use a Magic Bullet when I make my salsa, any blender will work. This recipe was just designed to perfectly fill a Magic Bullet cup.. Using your can opener, open your can of tomatoes and pour the can\u2019s contents into the blender.Warning: Be very careful on this step as the can lid will be very sharp after you cut it.Holly\u2019s Helpful Hints:\nI recommend washing the top of the can before you cut it. As you cut the can, some tomato juice will get on the lid, and who knows where that lid has been.. Open your can of chipotle peppers and take out one pepper. 
On your cutting board, cut the pepper in half and put one of the halves in the blender with the tomatoes.Warning:\u00a0 Be very careful again on this step because, like the previous step, after you open the can, the lid will be very sharp.Holly\u2019s Helpful Hints:\nBe sure to wash this lid too because you never know who could have touched it before you.\nAlso, the point of the chipotle pepper is to add some spice your salsa. If you can handle the heat, add more than half. If you prefer your food milder, add a little less or no pepper at all.\nYou can usually find chipotle peppers in the same aisle in the grocery store as the canned tomatoes.. On your cutting board, slice your lime into quarters. Then squeeze each quarter into your \u00bc cup. Keep squeezing your limes until you have a full \u00bc cup. Then pour the lime juice into your blender.Holly\u2019s Helpful Hints:\nCutting the limes into quarters makes it easier to squeeze and get the most juice out of every lime.\nAlso, I recommend washing your lime before cutting it to make sure all of the pesticides and dirt are off it.. Take your garlic and peel off the outer, papery layer. Then choose one of the smaller cloves, cut both ends off the clove, and peel off its outer layer until you reach the smooth, shiny skin of the clove. Put this clove in the blender.Holly\u2019s Helpful Hints:\nThe easiest way to peel garlic is with a knife. Use the sharp edge of the knife to help you fray the edges of the papery skin, and then pinch the skin with the blade and your finger and pull off the skin.\nGarlic cloves have green roots inside them. They can\u2019t hurt you, but I always think it is better to remove them. Cut your clove in half, and with the blade of the knife pull out the root.. Grab your salt and fill up \u00bc teaspoon. Then pour the salt into your blender container.Holly\u2019s Helpful Hints:\nIf you\u2019re trying to watch your weight, use less salt. 
Salt makes your body store more water, which increases your weight.\nSea salt or regular salt are equally acceptable in this recipe. Use whichever you prefer!. Grab a bushel of cilantro and wash it thoroughly. Cilantro commonly has dirt and (if it\u2019s not organic) pesticides on it, so you need to make sure your cilantro is thoroughly washed. Then start pulling off the cilantro leaves. You don\u2019t want the stalk of the cilantro, so be sure to pull off the leaves with as little stalk as possible. Fill up an overflowing \u00bc cup of cilantro, and I mean really overflowing. Then empty the cup into the blender.Holly\u2019s Helpful Hints:\nEven if you get the freshest cilantro at the store, it\u2019s still going to have a few leaves that are black or yellow. Just discard those. You only want leaves that are a beautiful, healthy green color.. Screw the blade onto the blender container. Make sure it\u2019s nice and tight as to not lose any of your delicious salsa!. Start blending. With my blender, I let it blend for anywhere from 5 to 10 seconds. It all depends on the power of your blender.Holly\u2019s Helpful Hints:\nIf you like chunky salsa, make it chunky! If you like smoother salsa, make it smooth. The less time you blend your salsa, the chunkier it is going to be.. Take off the blender blade, grab a bag of tortilla chips, and enjoy!. This step is completely optional. I just wanted to suggest some ingredients to make this salsa even more your own.Holly\u2019s Helpful Hints:\nTo make this salsa even more personalized, add some different flavors to it. My favorite extra ingredient to add is corn. Cut the corn right off the husk and add it to your already-blended salsa. The corn gives the salsa an added sweetness that makes this salsa even more irresistible. Frozen corn works well too. Take the bag out of the freezer, and without letting it defrost, toss some kernels in your salsa, just makes sure you don\u2019t add a bunch of ice crystals too. 
I don\u2019t recommend blending the corn because blended corn really dilutes the corn flavor.Other ingredients to try:\nMango \u2013 Cut up fresh mango into tiny chunks and add them to your blended salsa. Or try blending the mango with the salsa. It will add that sweet, savory taste to the entire batch of salsa.\nRaspberries or Blackberries \u2013 Another way to add some sweet to your spice! With either of these berries, I\u2019d put them in before you blend your salsa. If you add them after blending, the whole berry might be too chunky since neither slices well.\nJalapenos - Try substituting jalapenos for chipotle peppers. Jalapenos are the more classic ingredient used to add spice to salsa, but they have a much stronger taste than chipotle peppers. But like I've said, it's all about the flavor you want.\nRead the question below and select from the following choices.\nA: Homemade in the Jar Pickles\nB: Homemade Pumpkin Spice\nC: Screw on Blade\nD: You Will Need", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_3_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_15.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_3_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_3_27.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Strawberries & Lemon Cheesecake\nB: Materials Needed\nC: Dissolve Gelatin\nD: Recipe", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Click to Watch Video Instructions', 'List of Ingredient', '@placeholder', 'Serve Chilled']", "context": "Here is the context of these images:\n. This cheesecake needs at least 8 hours to set in the refrigerator, plan ahead.. Ingredients:1 oz (28 g) powdered gelatin 1 cup water (at room temperature) 1 lb (454 g) cream cheese (at room temperature) 8 sachets sugar substitute (or to taste) Lemon juice & zest (of 1 lemon, or to taste) Pinch of saltFor garnishing:Some lemon slices & shredded lemon zest. Soak and gently heat the gelatin until dissolved.. Combine all ingredients and beat until well blended.. Use loose base cake pan for easy removal. Keep refrigerated for at least 8 hours or overnight.. 
\nRead the question below and select from the following choices.\nA: Strawberries & Lemon Cheesecake\nB: Materials Needed\nC: Dissolve Gelatin\nD: Recipe", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_4_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_4_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_4_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_4_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_4_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_4_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Slutty Brownies\nB: Making Dough\nC: Ingradients\nD: Take Brownies Out of Oven", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Get Ingredients', 'Mix Together', 'Cook Brownies', '@placeholder']", "context": "Here is the context of these images:\n. 1) 3 Tablespoons of Water 2) 1/2 Cup Vegetable Oil 3) 2 Eggs . Mix all ingredients together and poor into 9x13 pan . Cook for 26-28 minutes . Let cool. Cut brownies into desired size and Enjoy! 
\nRead the question below and select from the following choices.\nA: Slutty Brownies\nB: Making Dough\nC: Ingradients\nD: Take Brownies Out of Oven", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_5_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_5_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_5_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_5_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_5_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_5_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_5_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Ingredients\nB: Perfect Pizza Dough Recipe\nC: \u200bTools\nD: Mix", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Add the Flour', 'Mix in the Milk', 'Fold in the Chocolate Chips + Additional Mix in Ideas']", "context": "Here is the context of these images:\n. 1/2 cup (1 stick) unsalted butter, softened3/4 cup brown sugar (light or dark - whatever you prefer)1 teaspoon vanilla extract1/2 teaspoon salt1 cup all purpose flour2 tablespoons milk1 cup chocolate chips of choiceBoom. Easy peasy!This amount of ingredients will make 25-30 cookie dough truffles. Recipe adapted from CenterCutCook.com.. Cream the butter and sugar together until nice and fluffy.Then add in the vanilla and salt and mix until well combined.. Mix the flour in until you can't see any dry spots and it's well incoporated.The dough will be very crumbly at this point and that's okay - we'll fix it on the next step.. Now add the two tablespoons of milk and mix again.The dough will get nice and creamy and look just like a typical cookie dough at this point. :). Once the dough is completed, you can add chocolate chips or whatever else you desire!This dough can accept 1 to 1 1/2 cups mix-ins - it all just depends on what it is. 
Quick cook oats, chopped nuts, shredded coconut or dried fruit would all be lovely. :D. I recommend rolling them into about 1.5 inch balls - you can freeze them on a cookie sheet and then transfer them to a freezer bag. That way you'll have bite size cookie dough whenever you want. They'll keep in the freezer about 3 months. You can also roll them in cocoa powder, coconut, or whatever other topping you like. Yay quick truffles! :D\nRead the question below and select from the following choices.\nA: Ingredients\nB: Perfect Pizza Dough Recipe\nC: \u200bTools\nD: Mix", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_6_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_6_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_6_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_6_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_6_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_6_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_6_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_6_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_6_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Rainbow Rasgullas\nB: Layer Your Fruit\nC: Transfer\nD: Add Your Liquids", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Butter Your Pan', 'Marshmallows', '@placeholder', 'Let Cool']", "context": "Here is the context of these images:\n. You'll need the following:4 Cups of Froot Loops* (or cereal of your choice...Trix? Lucky Charms?)1 Stick of Butter1 bag of Mini MarshmallowsTools:SaucepanSpoonMeasuring CupWax PaperBaking Dish*NOTE: I picked Froot Loops for the color aspect. Try other cereals too. Trix or Lucky Charms could be fun ones to experiment with. Ideally, when you are picking your cereal, you'd like to pick a crunchy corn or rice based cereal. 
This will allow for you to have a krispies treat with some crunch. . The whole process of making these delicious treats happens relatively fast, so you'll want to prep your pan first so that when the marshmallow coating on the cereal is still gooey you can transfer it easily. Butter your baking dish thoroughly. Make sure to butter the sides as well as the bottom. This will make it so your krispie treats will slide out when they are done instead of sticking to the pan. . Place your stick of butter in your sauce pan on your stove. Melt the butter over low heat so that you do not burn your butter. . Once the butter has melted you can add your marshmallows. I used the entire bag of mini marshmallows for this Instructable. Stir constantly, insuring even mixing of your melting marshmallows and the butter. Keep heating until the marshmallows have melted completely and you can no longer distinguish single marshmallows. . Once you have a uniform mixture of butter and marshmallows, add your Froot Loops. Mix gently with a wooden spoon until the cereal is coated in the marshmallow mixture. Since Froot Loops are much bigger than regular Rice Krispies, you'll want to be careful when mixing so that you don't break the loops up. . Once adequately mixed, transfer your cereal marshmallow mixture to your buttered baking dish. Then, using a piece of wax paper, press your krispies down so that they have a uniform shape and top. . Let your krispies treats cool for at least 10 minutes, allowing the marshmallow to harden. After they have cooled you can remove them from a pan (they should slide right out with the buttering you did earlier). Cut with a sharp knife and serve!If you plan on storing them, place them in an air tight container. They will keep for a few days. . Enjoy your new twist on the classic rice krispies treat! Take them to potlucks, parties, and wherever else you need a little rainbow marshmallow goodness. 
\nRead the question below and select from the following choices.\nA: Rainbow Rasgullas\nB: Layer Your Fruit\nC: Transfer\nD: Add Your Liquids", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_7_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_7_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_7_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_7_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_7_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_7_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_7_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_7_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_7_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_7_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_7_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_7_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_7_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_7_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_7_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_7_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Fresh Pumpkin Pie\nB: Prep\nC: Instructions\nD: Blend", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['The Pumpkins', '@placeholder', 'Roast', 'Puree']", "context": "Here is the context of these images:\n. Sugar pumpkins are usually recommended for baking because they aren't as stringy as larger pumpkins, they're also supposed to be sweeter too. I've baked with purees made from both and haven't noticed a big difference aside from sugar pumpkins being easier to manage due to their smaller size. However, depending on how many cups of puree you need, one big pumpkin might be cheaper than a couple small sugar pumpkins. 1 sugar pumpkin can usually yield 3-4 cups of puree. . 
Preheat oven to 400.I'm in the habit of washing all my fruits and vegetables before use, even though you don't use the skin of the pumpkin in puree, I still wash it. Cut your pumpkins down the middle so you have two even sides. Scoop out all of the pumpkin guts and seeds. (Seeds can be saved for roasting). On a lined baking sheet, place your pumpkin halves and stick them in the oven. How long the pumpkins take to roast depends on how big they are, but after 25 minutes I start poking them with a fork every 5-10 minutes until the fork goes in and out smoothly (kind of like a baked potato). When the pumpkins are done, remove from oven and let cool. . Pumpkin flesh is easier to remove when it's still hot, but also harder to handle. So as soon as you're able, start scooping all of the flesh from the pumpkin and put it in your blender. Once all of the pumpkin flesh is scooped, blend the flesh until you have a puree. Sometimes I will add a 1/8th cup of water or more, but this isn't usually needed.. Now that your puree is done you can use it right away in a recipe, store in the fridge up to 7 days, or in the freezer for about 3 months. 
\nRead the question below and select from the following choices.\nA: Fresh Pumpkin Pie\nB: Prep\nC: Instructions\nD: Blend", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_8_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_8_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_8_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_8_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_8_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_8_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_8_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_8_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_8_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_8_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_8_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_8_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_8_12.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Minecraft Chocolate Chip Cookies IRL\nB: Healthy Oatmeal Chocolate Chip Cookies\nC: Finish\nD: BAKING AND COOLING OF CHOCOLATE CHIP COOKIES", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['ADDING AND BEATING OF EGGS', 'ADDING AND BLENDING THE CHOCOLATE CHIPS', '@placeholder', 'HOW WOULD I EAT THESE FRESH CHOCOLATE CHIP COOKIE']", "context": "Here is the context of these images:\n. List of ingredients for Chocolate Chip Cookies 2 \u00a0 \u00a0 \u00a0 cups (12 oz package) of NESTLE TOLL HOUSE\u00a0Semi-Sweet Chocolate Morsels 2 1/4 cup all-purpose flour 1 \u00a0 \u00a0 \u00a0 \u00a0teaspoon baking soda 1 \u00a0 \u00a0 \u00a0 \u00a0teaspoon soda 1 \u00a0 \u00a0 \u00a0 \u00a0cup (2 sticks) butter 3/4 \u00a0 \u00a0 cup brown sugar (packed) 3/4 \u00a0 \u00a0 cup granulated sugar 1 \u00a0 \u00a0 \u00a0 \u00a0 teaspoon vanilla 2 \u00a0 \u00a0 \u00a0 \u00a0 large eggs. 
In a small mixing bowl combine: \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 \u00a0\u00a02\u00a01/4 \u00a0 \u00a0cups all-purpose flour \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 teaspoon salt \u00a0 \u00a0 \u00a0 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 \u00a0 1 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 teaspoon baking soda After placing the above dry ingredients in a bowl, use whisk (pictured) or large spoon and mix ingredients thoroughly together. \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0. In a large mixing bowl place the following ingredients: \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 3/4 \u00a0cup \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 brown sugar (packed) \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a03/4 \u00a0cup \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 granulated sugar \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a01\u00a0\u00a0 \u00a0cup \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0butter ( 2 sticks, softened) \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a01 \u00a0 \u00a0teaspoon \u00a0 vanilla extract\u00a0 After combining the above ingredients, blend until creamy with a hand mixer (shown) or a table top mixer (not shown).. Add eggs to mixture, one at a time. Make sure first egg is completely mixed in before adding the second egg. Do the same with the second egg before going onto the next step. \u00a0. Pour 2 cups or 1 package (12 oz) \u00a0of NESTLE TOLL HOUSE Semi-Sweet Chocolate Morsels into the mixing bowl, then blend with a large spoon.. Using a tablespoon, drop a rounded spoonful of cookie dough onto a cookie sheet (about an inch apart). \u00a0Place cookie sheet in a preheated oven of 375 degrees and bake for 9 to 11 minutes or until golden brown. 
\u00a0Allow cookies to cool on baking sheet for 2-4 minutes before moving to cooling rack. \u00a0. After making these chocolate chip cookies I like to kick back with a large glass of cold milk and down a few warm fresh delicious cookies. \u00a0Or I could be talked into eating them with a bowl of my favorite ice cream. \u00a0I would eat them with Jello or pudding. In fact I would eat them just by themselves. That's how I would eat these delightful cookies. \u00a0How would you eat these delicious cookies.. \u00a0\u00a0\nRead the question below and select from the following choices.\nA: Minecraft Chocolate Chip Cookies IRL\nB: Healthy Oatmeal Chocolate Chip Cookies\nC: Finish\nD: BAKING AND COOLING OF CHOCOLATE CHIP COOKIES", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_9_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_9_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_9_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_9_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_9_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_9_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_9_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_9_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_9_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_9_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_9_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_9_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_9_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_9_13.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Soundboard Cake With Working Volt Meters\nB: Final Product\nC: Prepare Your Cake Pans\nD: Step Five", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Fill and Crumb Coat Your Cakes', 'Prepare for Your 
Fondant', 'Create and Attach Your Girl With a Pearl Earring']", "context": "Here is the context of these images:\n. For this particular design, I decided to create a very tall 6\" cake which required 3 pans. If you want to make a slightly shorter, yet still very generously sized cake, use 2 pans. Prior to baking my cake, I had to prep my pans by greasing them with Crisco and lining them with parchment paper. I also made sure to preheat my oven to 350 degrees.Supplies: 3 round 6\" pans Crisco parchment paper pencil scissorsSteps:Preheat oven to 350 degrees. Generously apply Crisco to the inside of your pans, spread it on the bottom and sides. Place the parchment paper over your pans and lightly trace the rim. Cut out each circle you traced and cut about 1/4 of an inch inside of the line you drew. (This will allow for your parchment paper to fit inside the pan). Press your parchment paper to the bottom of your pan and smooth it out. Set aside your pans and move on to baking your cakes.. This cake is going to be a delicious vanilla cake made with a doctored cake mix. For those of you who don't know, a doctored mix is just a regular store bought cake mix that is adjusted so that it has more of a homemade taste and sturdier structure in case you want to use it for decorative purposes, like a fondant cake. This cake will come out moist and delicious and does not require as much work as a homemade cake.IMPORTANT NOTE: As I mentioned in the previous slide, I wanted to make this a very tall 6\" cake. Since I decided to make it so tall, I had to make two batches of the cake recipe for my three pans. (If I tried making both batches at once it would be too much for my mixer). 
If you want to make a slightly shorter cake using just two pans, make just one batch of the cake recipe.Ingredients:1 box yellow (or white) cake mix1/2 cup water 1/3 cup oil 1 small package vanilla instant pudding IMPORTANT NOTE: If using a cake mix that already has pudding in it (like Betty Crocker or Pillsbury) DO NOT add in the instant pudding1 cup sour creamSupplies:greased cake pans food scale (if you have one)Steps:Notes: Though I will give you the order I used to mix my cake, the order of how you mix this does not matter. Be sure to combine each ingredient individually before moving on to the next one. If you have a standing mixer that is great, if not, a handheld will work just fine. Pour your cake mix into your mixer bowl. Pour your egg whites over the cake mix and mix on low until just combined, you should still see small lumps of cake mix when you stop the mixer. Dump in your sour cream (use a spatula to scrap all of it in). Once again, mix it until it is just combined. Pour in your packet of instant vanilla pudding and mix until just combined. Pour in your oil and, again, mix on low until combined. Pour in your water and using your spatula, scrape down the sides and bottom of the bowl. Use your spatula to combine the batter with the water (this will prevent the water from splattering when you turn on the mixer). Start the mixer on low for 5 seconds and then put it on high for 2 minutes to thoroughly mix your batter. Pour 1 pound of batter into each pan. (If you do not have a scale, fill the pans about halfway). Tap your pans against the counter in order to settle the batter and remove any air bubbles. . Every oven is different so be sure to watch your cakes and test them when baking. Supplies:toothpicks knife 3 round 6\" cake boardsSteps: Bake your cakes at 350 degrees for 35 minutes Using a toothpick, test your cake. If a toothpick inserted in the middle comes out clean, your cake is ready. 
If the toothpick has wet batter on it, put the cakes back in for 5 minutes at a time until the toothpick comes out clean. Let your cakes cool COMPLETELY in their pans. (If you try to handle the cakes while they are still warm they will break apart). Run a knife around the sides of your cake to make sure that it is loose. Quickly flip your cake over onto a clean counter space or a sheet of parchment paper. It should fall out easily. (If it doesn't you can gently tap the bottom of the pan or flip it over and use the knife again to loosen the cake.) Once you pull the pan away, the cake should remain on the counter. Gently peel away the parchment paper. Place a cake board on the bottom of each cake Quickly flip you cakes over so the cake boards are underneath Place your cakes in the fridge in order to let them chill. You can choose to wrap them in plastic wrap or not. If you will not be working on them for a while, you should wrap them. . Before your begin to level, split, and fill your cake, you should gather the necessary supplies and prepare your frosting.Supplies: 1 round 8\" cake board (You will notice that I am using a 10x4 inch board in my pictures. It doesn't matter what size board you use as long as it is bigger than your cake so you can easily carry it without touching the cake)1 round 6\" cake boardscotch tapebig serrated knifecake leveler (If you do not have one, a serrated knife will work fine.)3 decorating bags4 1/2 cups of frosting (I made my own Italian Meringue Buttercream BUT you can use any frosting you want - I suggest about 2-3 cans of frosting for this project)If you would like a quick, easy, and delicious frosting you can make at home, do the following: (1) Combine 2 softened sticks of butter, with one teaspoon of vanilla extract until light and fluffy. (2) Then add in 4 cups of powdered sugar (1 cup at a time) and mix thoroughly after each cup. 
(3) Add 2-3 tablespoons of milk (one tablespoon at a time) and beat on high speed until you get the consistency you would like. This is a delicious basic American Buttercream recipe. red food coloringblue food coloringtoothpicks3 bowlsoffset spatulabench scraper1/2 C measuring cupturntableSteps: Put 1/2 cup of frosting in the first bowl. Leave it white.Put 2 cups of frosting in the second bowl. Dye it red by using a toothpick to add a glob of red color and begin to mix it in with a toothpick. Then switch over to a spoon to make sure the frosting and color are thoroughly mixed together.Put two more cups of frosting in the third bowl and dye it blue using the same method you used for the red.Bring out your chilled cakesPlace some scotch tape on the bottom of your final 6\" round cake board.Attach the 6\" cake board to the wrong side of the 8\" cake board (we will refer to this as your work board from now on). You want the colors to contrast so you can see where your cake ends and the work board begins.Note: In my picture, I am using a rectangular workboard.Make sure that the board is firmly attached and place it onto your turntable.. You have three options as to how you level your cakes. You can use a cake leveler, which is what I use because it's easier and requires less effort; you could use a ruler and a serrated knife, or you could eyeball your cake and use a serrated knife. I am going to go over how to do this with a leveler but for those of you who have a serrated knife, all you have to do is measure how high you want your cake to be and cut off the excess on top so the cakes are all the same height. You can also reference the pictures above of how I eyeball my cakes and use the serrated knife to split them. Supplies:3 baked and chilled cakes cake leveler or serrated knife bowl for scrapsSteps:Choose the appropriate height on your leveler that will allow all of your cakes to be the same height. For me, that was level 5. 
Make sure that you have enough table space to run your leveler across without it slipping off the sides. That could lead to a very uneven cake. Place your hand on top of your cake and gently move your leveler across the cake in a back and forth motion. Take your time. Remove the pieces you cut off and place the excess in a bowl, you can eat it to taste test your cake or if you have enough you could make cake pops. Repeat this process with the remaining two cakes. Set your cakes aside. Return to the work board that you taped your cake board to and place a dab of frosting in the middle. Spread the frosting out so there is a thin even layer in the middle of your cake board. Create a tic-tac-toe board in the frosting on your cake board. The lines you create will allow air to move underneath the cake you are going to place on this board and ensure it sticks properly. Grab one of your chilled cakes and gently flip it over onto your hand. Gently remove the board attached to the bottom. You may have to pull a little. Flip your cake back over and firmly press it onto the frosted cake board that is attached to your work board. Eyeball your cake and determine where the middle is on the side, you want to split your cake parallel to the base. Once you find the middle, place your knife there and gently begin to cut into your cake. Do not try to cut straight through your cake in one try. Instead, run your knife along the side of your cake, keeping it level, and gently turn your cake on the turntable. (Think of it like slicing a bagel in half.) As you turn your cake, continue to cut deeper and deeper until you cut it all the way through. Separate your two cake pieces then place the top part back on the bottom and set the cake aside. Repeat this process for the other two cakes but DO NOT attach the other two cakes to their boards with frosting. . Now we are getting to the fun part. Try not to eat too much of your frosting as you do this. 
It really is delicious.Supplies:3 decorating bags 3 bowls of frosting (red, white, and blue) with spoons in each color scissors bag ties or rubber bands (or you can go without this) turntable offset spatulaSteps: Get a decorating bag (or a sandwich bag) and open it up. Fold over the top so it creates an opening for the frosting. Using a spoon, fill your bag with your first color of frosting. Close the top of the bag and twist it shut. Fold over the twisted portion and secure it with a bag tie (or rubber band, or just hold it in place when you use the bag) Repeat this process for the remaining two frostings. Cut off the tip of the decorating bag so you have a penny sized hole. Starting with your first split cake, remove the top layer and pipe a layer of frosting on the cake. Use your turntable so you can pipe one consistent layer. Use the same pressure throughout and take your time. Note: My frosting pattern is blue, red, white, red, blue.Once you have piped your layer of frosting, use your offset spatula to spread the frosting evenly. Don't worry if the frosting squeezes out of the sides. Take the next cake layer and stack it on top of your first frosted layer. Gently press down and make sure it is secure before piping your next layer of frosting. Make sure to wipe your spatula between colors so your frosting doesn't blend together. Repeat this process of piping, spreading, and stacking until you have stacked all your cake layers. Once your cake is completely stacked, use the remaining frosting to crumb coat your cake. Begin by piping lines of frosting vertically down the sides of the cake. (Don't worry about the colors, you can mix them now.) Pipe some frosting on top of the cake as well. Using your spatula, begin to spread the frosting on top of the cake. Since this is a crumb coat, you want to make sure the frosting presses into the cake and seals the crumbs in. Be sure to scrape off any excess frosting into a bowl. 
Run your offset spatula along the side of your cake as you turn the turntable. This will allow you to spread the frosting evenly and cover the entire cake. Continue to remove any excess frosting and repeat the process of running the spatula across the cake as many times as necessary until it is completely covered. Be sure to cover all parts of the cake and press in the crumbs. Set your crumb coated cake into the fridge to chill for at least 30 minutes. . Once your cake is nice and chilled, meaning the frosting is firm, it is time to give it a good final coat of frosting. Prior to frosting your cake, you will want to remove your 6\" cake board from the work board and place it on a cake base. You can use a large offset spatula to remove the cake from its workboard and place it on the cake base. You can also apply a dab of frosting (or tape) to the cake base in order to help secure the cake board once you place it. Supplies:Chilled and crumb coated cake A decorator bag filled with two cups of white frosting turntable bench scraper offset spatulaSteps:Place two cups of white frosting into the same decorator bag you used earlier (or if you want to, you can use a new one). While turning your cake on the turntable, gently pipe around the entire cake a nice thick layer of frosting. Pipe an additional layer of frosting on top of the cake. Place your bench scraper at a 90-degree angle to your cake. Gently turn the cake on the turntable and allow your bench scraper to smooth out your frosting, let the turntable do most of the work. Smooth out the top of your cake as well, make sure to scrape away any excess frosting into a bowl. You can always add more later but you want to make the cake as smooth as possible so it may be necessary to remove some frosting. As you smooth your cake, if you notice any holes, just fill them in with the piping bag and smooth it over again. 
Take your time, be patient, and repeat the process as many times as necessary until your cake is as smooth as possible. I still struggle to get it super smooth so I did the best I could. Set your cake in the fridge to chill for at least 30 minutes. . I chose to get my colors ready for my Girl with a Pearl Earring before covering my cake with black fondant. I honestly could have covered my cake first and it would have worked too. I gathered all the supplies I would need to both cover my cake and create my girl, before focusing on each step individually.Supplies:One box of black fondant (I used Wilton's decorator preferred)fondant roller (or regular rolling pin - it really doesn't matter)rulerpowdered sugar (or cornstarch)sharp knifefondant cutter (if you have one, if not, keep using the sharp knife)fondant smoother (not pictured)about 2 ounces of white fondant (once again, I used Wilton's decorator preferred)Exacto knife (not pictured)a printout of the Girl with a Pearl Earring that is the exact size you want to use on your cakescissorscutting boardsome vodka (not pictured)7 small containers (bowls, espresso cups, ramekin, etc.)orange, red, brown, copper, blue, black, yellow, green food coloringsilver pearl luster dustpaintbrushes*If you are like me and like to have things thoroughly planned out in advance, you could print out another picture of a Girl with a Pearl Earring and use it to write down the colors you need to mix in order to achieve the right shade for each part of the painting. **I also pulled up a picture of a Girl with a Pearl Earring on my laptop since my printer didn't print the colors properly. I wanted to get my version as close to the actual painting as possible. Steps:As part of my preparation, and as you can see in the pictures above, I mixed my colors in advance. I had to play around with it a little and test it on a spare piece of white fondant before determining which combination I liked best, I suggest you do the same. 
Some of these colors I mixed as a base and eventually darkened to add depth to the overall painting. Don't worry about this yet, I will explain it more thoroughly in the following steps. As you can see in the picture above, these are the colors I mixed for each part:For her skin: orange, red, and brownFor her clothes: copper and brown For the blue portion of her turban: just blueFor the yellow portion of her turban: yellow and brownFor her pearl earring: silver luster dust For the blue and yellow portion of her turban: I did not create a color For her lips: mostly red with a hint of copperFor her eyes: mostly green with a hint of black to make them darker . This part of the cake can be a little frustrating but do not let it defeat you! Initially, I tried to cover my cake the right way by rolling out one piece of fondant that would completely cover the cake smoothly. However, since the fondant I used tends to be a bit dry, it immediately began to rip when I tried to put it on my cake. I could have rolled it out thicker but I really didn't want a thick layer of fondant on my cake. So instead, I decided to panel my fondant which is what I'm going to describe here. Supplies: box of black fondant frosted and chilled cake fondant roller fondant smoothers ruler powdered sugar turntable fondant cutter (if you have one) sharp knife Steps:Measure your cake's diameter and height Calculate the circumference of your cake. 
spread some powdered sugar on your clean work surface (you may want some on your hands too) take out your fondant and begin kneading it until it's pliable mold a piece of fondant (not the whole thing) into a rectangular shape (make sure all the folds and creases are on the bottom) roll your fondant (to about 1/4-1/8 inch) out until it is as long as (or longer than) half of your circumference and as high as your cake (you may want to give yourself an extra inch both ways, just in case) Using your ruler, trim your fondant so it is exactly as long and as high as you need it Stick your fondant to one side of your cake and use the fondant smoother to firmly attach it Use your sharp knife to cut off any excess fondant that may be sticking out on top Repeat steps 3-9 to cover the other side of your cake Take a piece of fondant and mold it into a circle with all the creases and folds underneath roll out your fondant (to about 1/4-1/8 inch) until it is big enough to cover the top of your cake Place your fondant on the top of your cake and firmly attach it with your fondant smoother Using your fondant cutter (or sharp knife) carefully trim away the excess fondant hanging over the top Use a paintbrush to brush away any powdered sugar on your fondant.. Now you are on your final, and in my opinion most fun, part! You get to be an artist and paint your own Girl with a Pearl Earring! My biggest piece of advice for this part is to have fun with it. 
Unless you are extremely artistically talented, you probably aren't going to recreate Vermeer's work but I'm sure you can pull off an awesome likeness.Supplies:your mixed colorscovered cakepaintbrushes (I only used a very fine tip one)2 ounces of white fondantprint out of Girl with a Pearl Earring that is the exact size you want it to be on your cakeExacto knifesharp knifecutting boardfondant rollerSteps:If you haven't done so already, cut out your print out of a Girl with a Pearl EarringSpread some powdered sugar on your clean work surfaceKnead your white fondantRoll out your fondant (about 1/8 inch thick) so that it is big enough to fit the cut out of the GirlIf at any point you need to leave your fondant alone, cover it with plastic wrap so it does not dry outPlace your fondant on a cutting board and place the print out on topCarefully cut your fondant in the shape of your girl by tracing it with an Exacto knifeOnce you have a solid silhouette of your Girl, place your print out back on the fondant (make sure it lines up perfectly)Using a toothpick or sculpting tool (like the one I have pictured) carefully trace the key features of the girl so you have guidelines for when you are painting.Do not press down too hard or you will leave indentsOutline the eyes, ears, nose, mouth, turban, clothes, earringThough it may be a little difficult to see, you will have a good impression on your fondant that will guide your paintingBegin painting slowly and lightly, it is much easier to add more color than it is to remove.remember that her face is illuminated on one side so do not paint the entire face the same colorPaint a light base of her face, clothes, and blue portion of her turbanPaint a light base of her lips and the yellow portion of her turbanat this point, I added some concentrated (directly from the container) blue to the middle portion of her turban and added more brown to the yellow to create shadows in her yellow portionI also painted the ends of her turban and 
her pearl earringThen I added black to blue color so I could paint the back of her blue turban and began coloring in her eyesI then darkened her skin color with more brown so I could create shadows in her face and I darkened her clothes color with more brown so I could add distinction to her clothesI gave her eyebrows and gave her a very light (by adding vodka to her skin color container) sweep of color on the lighter side of her faceI added concentrated (directly from the container) red to her lipsI added black to her eyesAt any point, if I added to much color, I used vodka to \"erase\" my mistake. Be careful though because too much vodka can ruin the fondant.Experiment with this part and do whatever makes you happy.Once you are happy with your Girl, take a knife and lightly make some impressions all over her (if you look closely at the painting you will notice that it appears cracked)Let her dry for a few minutes then carefully pick her up (without touching the food coloring) and using a broader paintbrush, paint some vodka on the area of the cake you want to place herquickly secure her to the vodka (it dries quickly)Add more vodka strokes to any areas that are not attached yet. DO NOT USE THE FONDANT SMOOTHER - it will ruin your art.Step back, admire your work...and eat your cake! 
=)*If you want nice clean slices like the one pictured at the beginning, use a sharp chef's knife and wipe the blade in-between each slice.\nRead the question below and select from the following choices.\nA: Soundboard Cake With Working Volt Meters\nB: Final Product\nC: Prepare Your Cake Pans\nD: Step Five", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_10_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_26.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_10_27.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_29.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_30.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_31.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_32.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_33.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_34.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_35.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_36.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_37.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_38.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_39.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_40.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_41.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_42.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_43.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_44.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_45.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_46.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_47.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_48.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_49.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_50.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_51.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_52.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_53.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_54.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_55.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_10_56.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Mince the Pepper & 
Ginger\nB: Hot Pepper Jelly\nC: Make the Cake/Cupcakes\nD: Frosting Ingredients", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Add the Pectin Mixture', 'Fill Sterile Jars']", "context": "Here is the context of these images:\n. \n\tPLEASE PLEASEPLEASE \n\t\u2022 Use gloves when handling hot peppers.\n\t\u2022 If the Capsaicin (which is what makes the heat in Chilies) gets in your eyes you will be miserable for a while.\n\tCapsaicin affects epithelial tissue especially in the non keratinized epithelium in the mouth, esophagus, nose, and eyes.What increases the heat?\n\t\t\u2022 Water washes away the oils or mucus that protects tissues and so will increase the heat from Capsaicin.\n\t\t\u2022 Anything that is salty or contains alcohol will increase the heat as well.What decreases the heat?\n\t\t\u2022 The fat in Cold milk bring the Capsaicin into solution and thus decrease a burning sensation (and according to Wikipedia caseins in milk have a detergent effect bringing capsaicin into solution to disolve it).\n\t\t\u2022 Cold sugar solution (10%) at 20 \u00b0C (68 \u00b0F) is almost as effective.\n\t\t\u00a0. \n\t\t3/4 pound of washed and chunked mixed hot Peppers\n\t\t1/3 cup fresh peeled Ginger\n\t\t4 cups sugar (I often mix 2 cups sugar with the Stevia equivalent of 2 cups sugar)\n\t\t2 cups of 5% apple cider vinegar\n\t\t2 packets of low sugar dry pectin. In a food processor finely mince the peppers with the ginger and set aside. Mix the dry pectin with about 1/2 cup of sugar and set aside.. \u2022 Mix the vinegar and remaining sugar\n\u2022 Add the minced pepper & ginger to the pot\n\u2022 Boil for 10 minutes over medium heat, while stirring periodically, to prevent burning.. \u2022 Remove the pot from heat\n\u2022 Add the pectin sugar mixture to the pot and stir briskly,\n\u2022 Return the mix to the heat and boil hard for 1 minute, stirring constantly. 
\u2022 chill a metal tablespoon by sitting it in an ice water bath,\n\u2022 Take a half spoonful of the pepper mix and let it cool on top of the ice to room temp\nIf it thickens up to the consistency of jelly it is ready. If not, mix in a little more pectin (about 1/3 to 1/2 of another package) and bring to a boil for 1 minute or cook a bit longer.. For the pepper jelly I use 8-12 ounce jars. I prepare the jars by running them and their their caps through the dishwasher. They can also be boiled in a large pot prior to filling\n\u2022 Fill jars to within 1/8-inch of the top and screw on covers tightly\n\u2022 Place in boiling bath 10 min and cool\nOnce cooled, the caps should be concave.ENJOY!\nRead the question below and select from the following choices.\nA: Mince the Pepper & Ginger\nB: Hot Pepper Jelly\nC: Make the Cake/Cupcakes\nD: Frosting Ingredients", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_11_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_11_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_11_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_11_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_11_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_11_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_11_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Vegan Coconut Chocolate Ice Cream\nB: and OPTIONS\nC: Making the Enchilada\nD: Ice Cream Spaghetti", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Mix Ingredients & Set Aside for 2 Hours', 'Add Ice Cream Filling', '@placeholder']", "context": "Here is the context of these images:\n. 
Chocolate Tortilla\n1/2 cup flour\n1/2 cup sugar\n3 tablespoons unsweetened cocoa\n1/4 cup milk\n2 eggs \n1/4 cup vegetable oil\n1 teaspoon vanilla\ndash of salt\nFilling\nice cream (any flavor)\nGarnish\nchocolate syrup or sauce\nfresh strawberries. Combine ingredients and hand mix for about 5 minutes until texture is smooth.\nCover and store in refrigerator for about 2 hours.\n. Heat a nonstick skillet over medium heat.\nPour 1/4 cup mixture in center and tilt pan to spread batter into a circle.\nLet cook for about 2 minutes then flip to cook other side for another 1-2 minutes.\nBe careful not to burn or else the tortilla will be too stiff for folding.\n. Add 3 to 4 heaping spoons of your favorite ice cream on top of the chocolate tortilla.\nI used strawberry. \n. Fold both sides of the chocolate tortilla to wrap the ice cream filling.. Drizzle with chocolate syrup and garnish with whole or cut strawberries.\nRead the question below and select from the following choices.\nA: Vegan Coconut Chocolate Ice Cream\nB: and OPTIONS\nC: Making the Enchilada\nD: Ice Cream Spaghetti", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_12_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_12_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_12_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_12_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_12_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_12_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_12_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_12_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: 3.14 Quick and Easy Mini Pies\nB: Assembling the Crust\nC: Get Ready With the Pastry\nD: Things You Will Need", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather the Elements', '@placeholder', 'Assemble Your 
Mince Pies....', 'Bake, and Enjoy']", "context": "Here is the context of these images:\n. You need (for 12 mince pies like these)1 x ready rolled puff pastry - take out of the fridge and allow to get to room temperature1 x jar mincemeat (actually you only need 12 tea-spoonsfull - about 120g or 4Oz)1 x egg (for optional egg wash)a baking tray, and an oven... pre-heat to gas mark 6, 200C, 400F, 'moderately hot', (180C or equivalent if its a fan oven)and a couple of tools.. Allow around 10 minutes to reach room temperatureCollect your tools : - a teaspoon and a knife work for me, with a fork to beat the egg.....Beat the egg while the pastry, and the oven, warm up.. Open out the pastry sheetUse the knife to turn into 12 rectanglesAdd a tea-spoonful of mincemeat to one end of each rectangle, leaving enough space to seal the pastry when you fold it over.Make a parcel, and press the sides together to keep the mincemeat in the pastry.Prick each parcel, to let steam escapeApply the eggwash - as uniformly as you like..... The pastry comes on its own baking paper which can go into the oven on a baking tray.Cook for 20-25 minutes (see below)leave to cool on a rackEnjoy!I have used supermarket (Aldi) pastry, and JusRol. The Aldi pastry is a slightly bigger sheet, so the pies are bigger and take slightly longer to cook. 
They taste just as good!\nRead the question below and select from the following choices.\nA: 3.14 Quick and Easy Mini Pies\nB: Assembling the Crust\nC: Get Ready With the Pastry\nD: Things You Will Need", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_13_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_13_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Eggs Florentine\nB: Crack Egg\nC: Do Your Homework\nD: Prep and Cooking", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Get a Coop', 'Put Them Outside', 'Meet the Girls']", "context": "Here is the context of these images:\n. 
Are chickens allowed in your neighborhood/city?Do your neighbors mind if you have chickens?Do you have a feed store nearby for chicken feed?Do you have room for chickens?If you can answer yes to these questions, then ask:What kind of chickens do I want? All of this will cause you to ask more questions and then you can ask your significant other, \"Honey, do you mind if we get some chickens?\". There are several types of chicken coops available to build or buy. Purina offers a great free chicken coop plan on their site that I really liked. But since my husband was going to be the one building it and I was going to have to buy the materials, I spent a couple weeks trolling craigslist until I found one just like if for $75 and rented a trailer. My husband was more than happy to spend a morning picking one up instead of two weekends building one from scratch and I saved about $150 in building materials.. There are several places to order chicks from. I recommend you visit sites and read the reviews about the temperament and laying habits of the different breeds to find out which will suit you best. I used Murray McMurray Hatchery in Iowa. They shipped me my one day old chicks on October 1. They arrived at the post office the next day. The US Post Office was excellent in giving me a call to let me know they were there. I could hear them chirping in the backroom when I arrived.When they arrive Murray McMurray has awesome support tips on their website to ensure you can take care of them. Basically you dip their beaks in food and water and they figure out the rest. Be sure to put a lamp on them as they need to stay over 90 degrees. Here in Texas we still have 90 degree weather in Oct, but we kept a lamp on them anyway. . Pretty soon, like in 3 - 4 weeks they start hopping out of the box (or brooder) and they need to go out to the coop. Plus they stink. . The next 3 months are spent trying to determine how the heck you can keep them from soiling their food and water. 
After several debates we decided on a 4 inch PVC pipe with a Y connection on the bottom strapped inside the coop for their food. Once they could get out of the coop we left their water outside. . The last thing you want is drama in the hen house. Any stress will cause the gals not to lay and lead to bad relations all around. Anyone that is too noisy will cause your neighbors to call you in as owning a nuisance pet. These birds make better dinner guests.. Then on February 8 we got our first eggs. The first two held four yolks. Now most of them have one yolk and they give us five eggs a day. . We kept five hens, Lily, Ivory, Ebony, Ruby and Dotty. They will have to do their own instructable about how they make the eggs. It amazes me everyday that they just create food. They are very sweet, get along with the dogs and greet me when I come home from work. They eat bugs out of the yard and are just easy going nice pets that offer food too!\nRead the question below and select from the following choices.\nA: Eggs Florentine\nB: Crack Egg\nC: Do Your Homework\nD: Prep and Cooking", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_14_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_13.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_14_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_14_21.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: RECIPE | MANGO AVOCADO SALAD\nB: Frozen Peas\nC: Make the Dressing\nD: Subscibe/Follow Us to Get the Latest Updates", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Cut the Potatoes', 'Combining + Serving']", "context": "Here is the context of these images:\n. 2-3 pounds golf ball sized new potatoes1/3 cup olive oil2 shallots, finely diced3 tablespoons white vinegar1 tablespoon dijon mustardpinch of sugar1 teaspoon salt (plus more for salting the cooking water!)black pepper to taste1/2 cup fresh herbs - I'm using half parsley, half dillThe recipe above is fairly salty - if you're sensitive to salt, maybe do 1/2 teaspoon mixed with the dressing at first and then try more if you like it. :DThis recipe adapted from a CHOW recipe found here. . Place the potatoes in a large pot and cover with water. Add a couple huge pinches of salt.Bring to a boil and let cook for 10-15 minutes. Then check a few of the largest poatoes with a paring knife - if you can easily insert into the potatoes you are good to go! If not, set the timer for a couple more minutes and check again. Try not to overcook them too much - they'll burst their skins!I think mine took right around 15 minutes to cook.Once they're done cooking, drain them and set them aside to cool. 
Speed up the cooling by laying them out on a baking sheet. :). As the potatoes cool, dice the shallots pretty finely. Throw them into a bowl large enough to toss the potatoes. Add the vinegar, olive oil, salt, dijon mustard and a pinch of sugar. Whisk everything together until it's emulsified and taste test. Add whatever else you'd like! Now set it aside until the potatoes are nice and cool.. Once the potatoes are cool, cut them into bite sized pieces. Some I cut right in half, larger ones I cut into three slices.. Add the potatoes into the bowl on top of the dressing. Chop the herbs finely and add them on top. I normally add a crack of black pepper to the top too. Use a spatula to stir everything gently so all the potatoes get covered with the dressing. It's fabulous just as it is right now, but I think it tastes even better after a night in the fridge. I recommend making it one day ahead of when you want to serve it. You can serve it cold or bring it to room temp - both are tasty! The dill really comes through after a bit of time mingling. 
:D\nRead the question below and select from the following choices.\nA: RECIPE | MANGO AVOCADO SALAD\nB: Frozen Peas\nC: Make the Dressing\nD: Subscibe/Follow Us to Get the Latest Updates", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_15_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_15_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_15_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_15_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_15_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_15_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_15_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Elvis Ice Cream\nB: Shake Vigorously (up and Down AND Side to Side to Allow for Even Distribution of the Cold) for Approximately 10 Minutes, or Until the Mixture in the Smaller Bag Thickens Into Ice Cream.\nC: Ingredients\nD: Seal the Smaller Bag Tightly, and Place It Inside the Bigger Bag. Then, Seal the Bigger Bag Tightly, Ensuring the Smaller Bag Is Inside.", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Ice Cream Mix (1)', 'Ice Cream Mix (2)', 'Finishing the Ice Cream']", "context": "Here is the context of these images:\n. We will need:1 1/2 cup of milk whole or non-fat (I used whole milk) 3/4 cup of sugar1 cup of whipping cream 4 egg yolks4 tablespoons of vanila flavorYellow colorant (optional)A medium bowlA large bowlElectric mixerA medium saucepan. 1. Place the whipping cream and the large bowl in the freezer for 30 - 45 minutes.2. Meanwhile, pour the milk into the sauce pan and add the 4 vanila flavor tablespoons. Heat until 55 - 60 C (130 - 140 F). Set aside and let it cool until 40 C (104 F)3. In the small bowl stir the egg yolks until light. 4. Add sugar gradually and continue mixing until spreading consistency5. 
Add the milk to the yolks and continue mixing until you have an homogeneus mix.Note: These are extra steps added to prevent any Salmonella risks due to the raw egg and is strongly recommended to do it. Moreover, if you are going to make ice cream for your kids you must do it. However, if you feel comfortable knowing that you are going to consume raw egg, you can skip the following steps but do not tell me later that I didn't warn you about the risk.6. Heat the mix up to 65 C (140 F) for 10 minutes. Do not forget to stir the mix with a spoon while heating and do no let the temperature rise too much. Try to keep the temperature constant. A tip is to heat up to 75 C (170 F), then turn off the stove and continue stirring with the spoon by 10 minutes. After 10 minutes the temperature should be more or less 65 C.7. Cool the mix up to room temperature. Do not forget to continue stirring with the spoon while cooling. . It's time to use the large bowl and the whipping cream we have on the frezzer1. Pour a cup of the whipping cream into the bowl and beat it with an electric mixer until stiff peaks form. 2. Fold milk mixture into whipped cream3. Continue mixing at a low speed for five minutes more.4. Cover the bowl and freeze it for two hours . After two hours, the edges should be hard. 1. Using a spoon, break the edges and incorporate it into the mix.2. Stir gentle with the electric mixer and freeze again After an hour, repeat the above steps and continue freezing for 4 hours more or until the ice cream hardens completely.Congratulations, you have made ice cream. Enjoy it!A final note: The freezing times may depend on your freezer. It is advisable to let the ice cream rip for 24 hours before serving. However if you do not want to wait, you can enjoy it as soon is hard enough for scooping. 
\nRead the question below and select from the following choices.\nA: Elvis Ice Cream\nB: Shake Vigorously (up and Down AND Side to Side to Allow for Even Distribution of the Cold) for Approximately 10 Minutes, or Until the Mixture in the Smaller Bag Thickens Into Ice Cream.\nC: Ingredients\nD: Seal the Smaller Bag Tightly, and Place It Inside the Bigger Bag. Then, Seal the Bigger Bag Tightly, Ensuring the Smaller Bag Is Inside.", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_16_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_16_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_16_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_16_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_16_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_16_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_16_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_16_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_16_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_16_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_16_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_16_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_16_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_16_13.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Cous Cous and Halloumi Cheese\nB: Griddle and Flip\nC: Enjoy\nD: The Final Product", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Dice', 'Shape It', '@placeholder']", "context": "Here is the context of these images:\n. 
1 small Camembert (125g - that equals about 4.5 ounces)1,5 heaped teaspoons of powdered sweet paprika0,5 teaspoon caraway seeds1 tablespoon of fine diced onion (I cut them as small as possible, the smaller they are the better they integrate / dissolve into the spread...)3 heaped tablespoons cream cheese (we used a chili spiced variety but any cream cheese will do the job - you can also use sour cream, your obatzer may just become a little softer)1 or 2 dashes of salt and pepperThe Camembert should be ripe/aged and soft. There is no need to use fancy imported French Camembert for this recipe, I always use the cheaper German Camembert. You could use other Camembert-like cheese like Brie as well.I don't recommend using low fat cheese for this recipe, it's consistency isn't creamy enough to get the Obatzda right.I highly recommend to allow the Camembert to come down to room temperature. If you take the cheese just from the fridge it will be much sturdier and harder to mash (also the aroma evolves at room temperature).My mother used to add butter (about two tablespoons) and also sometimes used processed cheese (about a tablespoon) and therefore she reduced the cream cheese to about one tablespoon.But my grandma should watch her weight so \"our\" recipe replaces the butter with cream cheese - and because processed cheese gives me itchy teeth we don't add it either... This recipe is pretty forgiving, in fact before I wrote this 'ible I always just eyeballed the amounts and always turned out nice. So feel free to experiment with the different ingredients and find your own favorite mixture.. Place your Camembert on a plate and find a pretty Hand model who cuts the Camembert into little dices. No need to be especially accurate, it will be mashed soon anyway... . Add 1,5 teaspoons of sweet powdered paprika, half a teaspoon on caraway seeds and about a tablespoon of super fine dices onions. Mix everything together.. Add about three tablespoons of cream cheese. 
Continue mishmashing squishsquashing everything together. Use a tool according to your preferences: My grandma prefers to use a knife, I like to use a fork. I never used a food processor or a hand mixer for this process (nor did my mother) I'm not sure if machines are a able to \"batz\" as nicely as humans do.... Mix and mash and squash until the spread reaches your desired consistency. It usually takes about three to five minutes to get it right.. Traditionally the Obatzda is served on a plate and shaped like a dome, but my grandma prefers to shape it like a disk. Use a knife to shape the cheese spread according to your styling preferences. We use the same plate for mixing and serving, you can use a paper towel to clean up the rim of the plate. And of course you could as well serve your Obatzda in a bowl.We like to sprinkle some chive on top - it looks nice and also the aroma fits well.... I enjoy Obatzda the most one or two hours after it's mashed together, I think the flavors develop even better after a little resting time. But you can consume it right away as well. (Obatzda tastes the best the day it is made, the next day the flavors are even more advanced - which is still fine for my grandma, but maybe not for everyone)In the third picture you can see my grandmother and me enjoying Obatzda with some fresh pretzels ;). This rustic hearty snack goes very well with a glass of beer or Radler (beer and lemonade).Pretzels and lye rolls are a perfect base for this spread. Rustic dark bread fits as well.You can serve it with fresh radishes, they are a nice company for Obatzda.You may sprinkle some chives on top enhance visual appearance and flavor. If you don't like onions just skip them, but I think onionless Obatzda is just half as nice as the original... 
Another option might be to slightly stew the onions to melt down their harsh flavor.Some people like to garnish the Obatzda with onion rings instead of integrating diced ones, this may be a good option if an onion lover and an onion hater want to share a portion of this spread.Some people like to use stronger soft cheeses (German Romadour or Limburger). If Camembert isn't hearty enough for you you might to try those.Some Obbatzda recipes integrate a splash of beer into the mix but I've never tried this myself. As long as you don't share your Obatzda with children you might give this it a try. (In Frankonia they add a splash wine instead of beer - feel free to experiment)Fun Fact: Originally Obaztda was just used as a way to use up overaged cheese. \nRead the question below and select from the following choices.\nA: Cous Cous and Halloumi Cheese\nB: Griddle and Flip\nC: Enjoy\nD: The Final Product", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_17_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_16.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_17_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_17_21.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Graveyard Brownies\nB: Mix the Dough\nC: Bake the Brownies\nD: Mix Up the Brownies", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Blend the Brownies', 'Place the Base', 'Enjoy!']", "context": "Here is the context of these images:\n. Preheat the oven to 350\u00b0 Fahrenheit to prepare for the baking of your deliciously decadent creation. Then, spray PAM in each cup of the cupcake pan and wipe cooking spray evenly with a napkin throughout.. You will then prepare the cookie dough. I always prefer and use Betty Crocker\u2019s Chocolate Chip Cookie mix for the base of the brownie because it's just my favorite tasting cookie dough. Simply follow the instructions on the packaging to prepare the dough by adding 1 stick of softened (not melted) butter and 1 large egg to the dry mix and mixing well together.. Once the cookie dough is mixed, you will begin to prepare the brownie mix. I chose to use Pillsbury Chocolate Fudge Brownie mix. Again, you will follow those instructions by adding the dry mix with 2 eggs, 1/3 cup of water, and 1/3 cup of vegetable oil and blending.. Place a little smaller than a spoonful of cookie dough into one of the cups of the pan so that there is about a half inch thick layer of cookie dough that covers the base of the cup. Repeat this until all cups have the same layer of cookie dough filled.. Next, you will add one Oreo to the center of each cup and press firmly so the cookie stays in place while baking. 
Be sure not to press too hard, otherwise the cookie will break!. Use a spoon or ladle and fill the remaining area of the cups with brownie mix, but be careful not to fill them directly to the brim because they will overflow; just a little below the brim is perfect.. Place the cupcake pans in the oven set at 350\u00b0 Fahrenheit and let bake for about 18 minutes. Once cooled, take a knife to carefully separate the brownie from the pan and remove onto a plate.. Take a bite and enjoy the sweet taste of your delicious Slutty Brownies!\nRead the question below and select from the following choices.\nA: Graveyard Brownies\nB: Mix the Dough\nC: Bake the Brownies\nD: Mix Up the Brownies", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_18_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_18_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_18_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_18_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_18_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_18_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_18_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_18_7.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Pizza Sauce\nB: A Simple and Delicious Pizza Sauce\nC: Make the Alfredo Sauce\nD: Make the Fettuccine Noodles", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Supplies and Ingredients', '@placeholder', 'Pizza Dough', 'Putting Your Pizza Together']", "context": "Here is the context of these images:\n. 
\n Supplies for Pizza Sauce: Medium skillet or saucepan Knife Measuring spoons Measuring cups Stove Supplies for Pizza Dough: Large bowl Kitchen towel Timer Baking sheet Parchment paper Measuring spoons Measuring cups Rolling pin Oven Ingredients for Pizza Sauce: 2-tablespoons olive oil \u00bd-cup onion, chopped 1-teaspoon sugar 1 garlic clove, minced (or minced garlic out of jar - 1/2 teaspoon = 1 clove) 15oz can tomato sauce 1-tablespoon italian seasoning Salt and pepper to taste Ingredients for Pizza Dough: 1-teaspoon sugar 2 \u00bc-teaspoon yeast (one store bought package) 1-cup warm water 2 \u00bc-cup flour, all-purpose (2 cups of flour go into dough, other 1/4-cup is for flouring surfaces and rolling pin) 1/4-teaspoon salt Cooking spray Supplies for Safety: Apron or other clothing you don't mind getting dirty Potholders Note: One pizza should yield about 12 pieces (squares). The number of pieces will depend on how thick or thin you want the crust. You will yield almost 2 cups of tomato sauce. If not all sauce is used in one pizza prep you can certainly freeze the sauce for next time.\n . Safety First: Please be careful. Sauce will be hot. If children are helping you cook, please supervise them at all times.\u00a0 Time: 25 minutes \u00a0 Heat olive oil in a small skillet or saucepan over medium heat. Add the onions, sprinkle with the sugar, then lower the heat. Cook the onions and sugar over medium-low heat\u00a0for 10 minutes. Add garlic and cook one minute longer. Add tomato sauce and\u00a0italian seasoning. Cook on low, uncovered until thick (about 15 minutes - the longer you let it go the better it is). Season to taste with salt and pepper. . Suggestion: Wear an apron or other item of clothing you don't mind getting dirty. The flour tends to get everywhere! Time: 1 hour, 15 minutes \u00a0 Dissolve sugar and yeast in warm water in large bowl, let stand 5 minutes. Add 1-cup flour and \u00bc-teaspoon salt to yeast, mix well. Add 1-cup flour, stirring well. 
Turn dough out of bowl\u00a0onto lightly floured surface. Knead until smooth and elastic, about 10 minutes. (If you have a mixer with a dough hook you can let the machine do the work for you.)\u00a0Add additional flour as necessary to keep dough from sticking. Place dough in large bowl coated with cooking spray, turning to coat. Cover bowl with kitchen towel and let rise 45 minutes. Punch dough down; cover and let rest 5 minutes. . Safety First: Please be careful. Pizza will be hot. If children are helping you cook, please supervise them at all times. Use potholders to protect hands. Time: 20 minutes \u00a0 Preheat oven to 450-degrees. Roll dough into desired size and shape (smaller = thicker crust, larger = thinner crust - round or rectangular shape is up to you) Place dough on baking sheet covered with parchment. Top with desired sauce and other toppings. Place in oven for 12-15 minutes (the thicker the crust the longer the bake time), until crust is golden brown and cheese is melted. 
\nRead the question below and select from the following choices.\nA: Pizza Sauce\nB: A Simple and Delicious Pizza Sauce\nC: Make the Alfredo Sauce\nD: Make the Fettuccine Noodles", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_19_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_19_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_19_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_19_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_19_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_19_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_19_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_19_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_19_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_19_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_19_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_19_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_19_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_19_13.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Cambodian Beef Salad (Lok Lak)\nB: Satay\nC: Chop Vegetables\nD: Peanut Sauce", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Add Water, Boil', 'Let Cool for Beef or Chicken Broth and Slim Fat Before Storage', 'Store and Use Later']", "context": "Here is the context of these images:\n. The main ingredients are simple...OnionGarlicCeleryCarrotBell PepperWaterAdd to this some meaty beef bones, or chicken and simmer for a couple of hours and you have it!. I save everything in the freezer. Veggie trimmings, chicken bones and skin, meaty bones for the pot of beef broth. Not much waste in our house. You can of course use a whole chicken, but I have enough \"scrap\" to make mine. 
I do buy beef - normally neck bones and stew meat for this because we don't normally get a lot of beef trimmings in our normal menu items.. Just chunk all the veggies up, no peeling required, no precision chopping. Just get it in the pot.. I have an herb garden outside my kitchen so I add them to most everything! Not a required ingredient at all, but a few bay leaves and some parsley just add a little something. I also add thyme to mine, just because I can. I also throw in a good amount of whole peppercorns.. Once all your ingredients are in the pot, cover completely with water, bring to a boil, reduce to a simmer and let it go for at least an hour. I usually let mine go for a couple of hours just because I get easily distracted by shiny things, but that is a whole different story!!!!. Strain the liquid from the solids and throw them away, they have done their job. All the flavor has been cooked out and the liquid you have left is a rich, golden color, full of flavor!. I always chill the completed beef and chicken broth, that makes it super easy to skim off any fat that has hardened, giving you a fat free, sodium free broth to add to your dishes. It makes me feel better knowing that I am in control of the salt in my diet. And no preservatives either!. I got these awesome containers and use them for EVERYTHING! I freeze the broth and pull it out to use as needed. Label everything, once frozen it's hard to tell veggie from chicken broth. I thaw a container and keep it in the fridge to add to dishes. You could also freeze some in ice cube trays for those times when you just need a splash of something. I hope you enjoy!. 
Watch more videos at Jeanna's Good Lookin' Cookin' on Youtube!Also visit my Facebook page Jeanna's Good Lookin' Cookin' for quick tips and ideas!\nRead the question below and select from the following choices.\nA: Cambodian Beef Salad (Lok Lak)\nB: Satay\nC: Chop Vegetables\nD: Peanut Sauce", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_20_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_20_20.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Spag Bol With a Thai Twist.\nB: Additional Notes/nutrition\nC: Vegetable Preparations.\nD: Potato Leek Soup With Cheerio Croutons", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Sauteing the Vegetables 
in Butter.', 'Adding Spice', 'Allow to Cook']", "context": "Here is the context of these images:\n. I like to have the vegetables chopped up first and leave them neatly in bowls so that i can use them later. all of the ingredients that go in to the soup are eventually going to be blended. This means that you don't need to have all the vegetables neatly chopped, just give them a rough chop but try to ensure that they are all the same size.Picture 1. Chop the garlic to a puree consistency, it doesn't need to be perfect, but roughly equal is fine.Picture 2. Same goes for the onions Chop them finely in to pieces in to roughly the same size.Picture 3. Finally chop the carrot up, the larger pieces that you chop this up in to the longer it will take to cook in the pan.. You can chose to use just normal oil at this stage, but i wanted to give this dish a lovely rich flavour and so decided to saute all my vegetables in butter instead. Using just normal oil would make the soup suitable for vegans.Picture 1 Melt the 250g of salted butter in the pan until the butter browns slightly. Picture 2. Add the onions and garlic in to the pan and allow to soften. this will often take about 5 minutes or so on a medium heat.. Adding the spices at this I think allows the flavours of the spices to integrate better. Once you have added the spices to the onions mix them in thoroughly and then allow the onions to cook in the spice mix for a few minutes. Picture 1-3 Add in 1Tbsp of Cumin, Coriander powder and Turmeric Picture 4. Combine all of the ingredients with the onions.Picture 5. Allow to cook together for a few more minutes. . Time to form the base of the soup with the bulk of the ingredients, you want to lightly cook the chopped carrot in the hot butter before adding the stock. Picture 1. Pour the 900g of chopped carrots in to the pot containing your cooked onions/ garlic with spices.Picture 2. 
Stir in the carrots as to cover them in the onions and spices, cook them slightly at this stage in the hot butter they only need a few minutes.Picture 3. After you have cooked them for a few minutes time to add your stock. . The carrots need to cook entirely before you can move on to the blending stage, for this they need to simmer gently on a low heat until the carrots are completely soft i would recommend around 20 minutes then check upon them to see if they need slightly longer. Picture 1. Cook the carrots in the stock for around 20 minutes check they are cooked before you blend them.. This is going to be a thick soup, spend your time blending it entirely. You can if you would like pass this soup through a sieve to get a finer texture, but your stick blender should be able to sufficiently blend up the soup.Picture 1. Once the carrots have cooked until soft and they are soft enough to be crushed against the side of the pot with a fork, the soup is ready to be blended.Picture 2. Blend the ingredients until smooth.. I have chosen to garnish this dish with a poached egg, you can garnish with other ingredients if you wish. When poaching always ensure that you use fresh eggs, the older the egg the less of the egg white is going to stay around the egg yolk when you cook it. Picture 1. In a small pot of water boil seasoned water with a touch of olive oil. Picture 2. Always put the egg in a cup, opposed to cracking it directly in to the pot this. this allows you to pour the egg in slower are more gently. Picture 3. Once the water is boiling create a gentle spin in the water and pour the egg in the middle. the spin will pull the egg around its self. once the egg has risen to the top your poached egg is cooked this will take around 2 minutes or so. . I have garnished this soup with a small amount of paprika, pepper, olive oil a poached egg and coriander leaves. 
I would recommend eating this soup on the day that it has been cooked, soup tends to be rather difficult to store in the fridge. This soup can also been frozen but i get i wouldn't recommend it as you would will lose flavour heating it to a high temperature in order to make it safe to eat again. Thank you for reading , if you have any comments or queries please contact me and I will try to answer them as soon as possible.\nRead the question below and select from the following choices.\nA: Spag Bol With a Thai Twist.\nB: Additional Notes/nutrition\nC: Vegetable Preparations.\nD: Potato Leek Soup With Cheerio Croutons", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_21_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_21.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_21_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_21_23.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Adding Pure and Simple H20!!!\nB: Hibiscus Tea\nC: Final Measurements and Glue\nD: The \"Real\" Persian Tea", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Strange Brew!!!', 'Boil Me Tender...', 'When Life Gives You Lemons...', '@placeholder']", "context": "Here is the context of these images:\n. The ingredients in this wonderful and OFFICIAL Peet's recipe are as follows:1/4 lb. of Hibiscus C tea12 oz. of boiling water12 oz. lemonade1/2 cup of white sugar4 oz. cold waterIce cubes to coolY'all will also need a tea-kettle, an iced-tea pitcher, and a tea pot of some sort. Boil 12 oz. of water, DO NOT REMOVE KETTLE UNTIL THE WHISTLE BLOWS.. Pour the boiling water over the Hibiscus C tea into a teapot (or other heat-safe container, in our case we used a coffee presspot) and let steep for 5 minutes. (If you prefer a stronger tea taste, feel free to let it steep a bit longer). After the tea has steeped for 5 minutes or so, use a strainer to separate the hot liquid from the loose tea into an iced-tea pitcher, and immediately afterward add the 1/2 cup of sugar. This is critical to do directly after the tea has steeped so the sugar can dissolve in the hot liquid. Gently stir to ensure that all sugar is dissolved.. ... pour lemonade into the mix y'all!After the sugar is dissolved into the concentrated tea, pour 12 oz. of cold lemonade into the pitch.Continue to stir the mixture. This step is simple y'all, while stirring, pour 4 oz. of cold fresh water into the pitcher. yep, that's all for this step.. So what would an iced tea cooler be without the ice, right? Once the mixture is completely stirred together, add a few handfuls of ice cubes to chill the drink. 
If you really want to get festive, you can use fun ice cube shapes...we used puzzle and triangle ice-cube molds. Special ice shapes are the perfect mundane detail to dazzle your friends and show up Martha!!!. Add some more flare to this fabulous drink by pouring it into your favorite cocktail glass and adding a colorful garnish like a slice of lime or a lemon twist.Your friends and dog will love it!!. This drink is best served chilled on a hot day. Add some banana loungers, Girl Talk's \"Feed the Animals\" album and a friend or two and you have an instant party!!\nRead the question below and select from the following choices.\nA: Adding Pure and Simple H20!!!\nB: Hibiscus Tea\nC: Final Measurements and Glue\nD: The \"Real\" Persian Tea", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_22_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_22_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_22_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_22_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_22_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_22_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_22_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_22_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_22_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_22_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_22_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_22_11.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Easy Banana Bread\nB: Making the Dough\nC: Oil and Spices\nD: Ingredients", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Prepare the Bread', '@placeholder', 'Bake the Croutons', 'Done!']", "context": "Here is the context of these images:\n. The first step is to remove the crust from the bread. 
This bread I got from a cook, who couldn\u2019t serve it anymore as it was slightly dry. The crust was delicious, but it\u2019s easier to give the croutons an even look without it. However if you\u2019re strongly against wasting food it\u2019s possible to leave the crust on.When the crust has been removed, cut the bread into little squares. Mine were around 2 x 2 x 2 cm. Then add the little bread cubes to a bowl.. The next step is to add oil and spices. I chose to add thyme, but other spices can be used as well. (Sometimes garlic is great). I added the oil first, so that the spices had something to stick to. Then I added salt and pepper \u2013 just add what you feel like here. Then I mixed the croutons very well, to make sure that the spices and oil was evenly spread and on all the little bread cubes. It might be necessary to add a bit more oil, but be careful not to add too much.. All that\u2019sleft now is to bake the croutons in the oven. Spread the croutons on a baking tray, and make sure they\u2019re not covering each other.Bake the croutons in the oven at 200\u00b0C for 8 \u2013 10 min. They are ready when they\u2019re golden brown and crunchy.. Now your croutons are ready to eat! They can lest for a couple of weeks if kept in a closed container. As mentioned in the intro they taste great, and (as you've seen now) are easy to make. :) I hope you'll enjoy this instructable, stay creative out there! 
:D \nRead the question below and select from the following choices.\nA: Easy Banana Bread\nB: Making the Dough\nC: Oil and Spices\nD: Ingredients", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_23_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_23_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_23_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_23_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_23_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_23_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_23_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_23_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_23_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_23_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_23_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_23_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_23_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_23_13.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Make Delicious Haleem\nB: Ingredients\nC: Leipziger Lerchen (typical Saxon Speciality)\nD: Time for a Coffee Break", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Add Some', 'Garlic We Need Garlic', '@placeholder', 'The Last Step']", "context": "Here is the context of these images:\n. To make this delicous Ajam Ketjap you will need:500 grams of chicken breast (Ajam)a splash of oil 2 large onions 6 cloves garlic 1 small can of tomato puree 1 theespoon of salt 6 tablespoons of Sweet soy sauce (Ketjap manis)You also need a cutting board and knife a stirrer and a large pan.. Cut the onion into pieces, put a little bit of oil in your pan and add the sliced onion and tomato puree together in the pan and fry until the onions are translucent. 
(it is very importent to put them in together, for the taste of the end product). Whille you fry the unions an tomato puree, Cut the chicken breasts in dices, when the unions are translucent add the chicken and fry it until the chicken is brown.. crush the garlic and put it in the pan stir and fry for 1 or 2 minutes. (Some times people say that 6 cloves is to much and there breath will be terible afterwards. But you do not have to be afraid this wont hapen.). Now add the Theespoon of salt and 6 tablesppoons of Sweet soy sauce also called Ketjap manis, stir it and add about 1 cup of water ( the chicken has to be covered with the sauce you made.. Put the lid on youre pan and let it simmer for about 15 minutes occasionaly stir it, this is a good time to get yourself a nice cup of coffee.. After about 15 minutes get the lid off of your pan and let it simer for another 5 to 10 minutes depending on the amount of watehr that was added in step 5, this has to be done for 2 very important reasons, first of all the excess liquid wil vaporize and second every body in the house will come towards the kitchen wondering what it is that smells so good.You can eat this with bread or rice, both is delicious.Enjoy your meal!\nRead the question below and select from the following choices.\nA: Make Delicious Haleem\nB: Ingredients\nC: Leipziger Lerchen (typical Saxon Speciality)\nD: Time for a Coffee Break", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_24_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_24_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_24_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_24_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_24_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_24_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_24_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_24_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_24_8.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_24_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_24_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_24_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_24_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_24_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_24_14.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: How to Make Baka Bana\nB: \nC: The Final Product!!!\nD: Eat!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Grating the Cheese!', 'Pasta Cooking and Butter Melting.', 'Making the Sauce!', '@placeholder']", "context": "Here is the context of these images:\n. \n For this recipe, you will need:1 cup heavy cream1 pound fettuccine6 tbs. unsalted butter2 cups grated parmigiano reggiano cheesesalt and pepper to tasteYou'll also need a large pot for boiling the pasta, a grater, a strainer, and a large frying or saute pan to toss everything in and melt the butter. . I do this the old fashioned way because I lack money for fancy food processors and I like the physcial act of grating the cheese. It smells amazing!Unwrap your cheese and cut the rind off. Put the rind in a plastic bag. You can keep this in the freezer and use it in soups! I recommend putting it into a herby bean soup - those are the best!Then, cut your cheese into two smaller pieces if it's especially large.At this point, either tear off a sheet of parchment paper or use a large plate to set the grater on. That way, you'll be grating onto a surface that will allow you to dump the cheese right into the pot. So grate away! Eyeball the amount. Use less or more according to your level of cheese desire. . Fill the big pot with water (I'm using a stockpot) and throw in a couple of generous pinches of salt. 
Bring this to a boil.As soon as the water comes to a boil, dump your pasta in. You need to cook the pasta so that it is slightly underdone. You want it to still be able to accept some liquid so that it'll soak up the cream later!Put the 6 tbs. of butter into the saute pan over medium/low heat and start it melting while the pasta cooks. As soon as it's melted, turn off the heat. You don't want the butter to brown.When the pasta is done cooking, drain it and we'll move on to the next step. . When the pasta is draining, reheat the butter. You'll want the heat to be medium high. Now, turn the pasta into the pan and pour the cream over top. You'll fold this mixture together for a few minutes until the pasta soaks up nearly all of the cream.Then, add the cheese and fold again. It'll take a little elbow grease to get it all combined.Once it's combined, taste test and add pepper and salt as desired. I like a ton of pepper on mine. :D. Enjoy your pasta. :D\nAnd maybe invite some friends or family over to help you finish it off?\nRead the question below and select from the following choices.\nA: How to Make Baka Bana\nB: \nC: The Final Product!!!\nD: Eat!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_25_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_12.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_25_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_25_20.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: ( Pumkin Part )\nB: Bruin Beskuit (Multigrain Rusks)\nC: Cooking of the Pancakes\nD: Preparing for Cooking of the Pancakes", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', '( Beatroot Part )', '( Beatroot Part )', '( Beatroot Part )']", "context": "Here is the context of these images:\n. - Two table spoon Ghee - 300 ml Milk - 100 gms grated Pumkin- 100 gms grated Beatroot- 100 gms sugar- Pinch of chilly flakes . - Heat 1 tablespoon Ghee in a pan at medium flame. - Add 100gms of grated Pumkin and stir it continuously becomes soft ( 4 to 5 mins approx ).. Add 150ml of Milk and continue to stir till the milk evaporates ( 4 to 5 mins approx ). Add 50gms of sugar and continue to stir it till it becomes Yellow Shiny.Place the Yellow Shiny Pumkin dessert in a bowl.. Heat 1 tablespoon of Ghee in a pan at medium flame. Add 100gms of Beatroot and stir it continuously till it becomes soft on medium flame. ( 4 to 5 mins approx ) . Add 150 ml of Milk and continue to stir it till Milk evaporates. . Add 50gms of Sugar and continue to stir till it becomes shiny red ( Approx 4 to 5 mins ). 
Sprinkle pinch of chilly flakes and serve the Dessert ( Hot ) in a plate or bowl.- 2 Servings\nRead the question below and select from the following choices.\nA: ( Pumkin Part )\nB: Bruin Beskuit (Multigrain Rusks)\nC: Cooking of the Pancakes\nD: Preparing for Cooking of the Pancakes", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_26_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_26_16.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: How to Make Coffee at TechShop\nB: Make Chipped Ice\nC: Streaming the Milk (Leche).\nD: Coffee Is Cooking.", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Make Yogurt Mixture', 'Pour the Coffee Already Made to the Glass']", "context": "Here is the context of these images:\n. A jar of yogurt Two teaspoons of condensed milk Chipped ice A teaspoon of coconut milk 1 slice of lemon 20ml of coffee (see here) A blender. 
Firstly, you use blender to make some chipped ice then put them into a glass.. Put condensed milk, yogurt, coconut milk and a few drops of lemon juice then blend it for 3 minutes.Then pour this mixture onto the chipped ice.. You can make a cup of milk coffee or black coffee. . Finally you pour 20ml of coffee onto the chipped ice and yogurt mixture. In the summer, Vietnamese people often drink this kind of drink.Enjoy the great taste. \nRead the question below and select from the following choices.\nA: How to Make Coffee at TechShop\nB: Make Chipped Ice\nC: Streaming the Milk (Leche).\nD: Coffee Is Cooking.", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_27_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_27_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_27_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_27_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_27_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Pumpkin Butter\nB: Processing\nC: Smear\nD: Storing", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Blend', '@placeholder', 'Top', 'Eat']", "context": "Here is the context of these images:\n. Combine the walnuts, olive oil, salt, and honey and blend them. You may have to add more honey or olive oil or even a small amount of water for texture or taste. You should end up with the texture of peanut butter but with little pieces of walnuts.. At the restaurant we tried this at they have us four simple steps to eating this walnut bitter! So after making the butter there are only a few simple steps left until this delicious snack is in your mouth! Smear the walnut butter on any type of bread! It is really good on whole wheat which is also very healthy!. Cut strawberries into small slices and add them on top of the bread and walnut butter.. 
Drizzle some honey onto the strawberries.. Finally it's time to enjoy your delicious snack! Refrigerate the extra walnut butter for about a week at the most if you can make it last that long!\nRead the question below and select from the following choices.\nA: Pumpkin Butter\nB: Processing\nC: Smear\nD: Storing", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_28_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_28_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_28_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_28_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_28_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Peanut Butter Pie\nB: Don't Make the Oven Too Hot! 275*\nC: Bake the Cookies\nD: Wet Ingredients", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Whip It Whip It Good!', 'Fill Me Up Buttercup!', '@placeholder', 'ENJOY!']", "context": "Here is the context of these images:\n. Crust:1 cup of roasted peanuts2 Packages of Oreos (I have three because that way when I eat some I wont run out)Full size Reese's PB cups 10-12 or so1 stick of butter (I'm from the south cannot help myself)A spring-form pan. Filling:3 Blocks of cream cheese1 8oz sour cream4 Eggs3/4th cup of Brown Sugar1 cup of Peanut butter (if you can find Reese PB get it)1 Teaspoon of Vanilla extract1/2 cup of heavy whipping creamone package of mini Reese's PB cups. First remember do not use all of the crust in the pan because you will want some for the top.Take the first package of Oreos and pulverize it to dust with the 1 cup of Roasted peanuts. 
This will be the base or bottom of the pan crust.Hand chop 10 - 12 of the regular cups place in bowlHand chop or hand crush the other package of the Oreos it takes time but its worth it if you refrigerate them before hand they will crush and chop with ease.Mix the hand chopped items with some of the powdered crust.Melt a stick of butter.Mix melted butter with both parts of crust. No need to be 100% exact just make both parts pliable. Butter up the pan with some left over butter or some margarine.Form the bottom of the pan with a thin layer of crumbed crust. Mix the leftovers with the cut up chunks.Form a thick wall around the pan. Be sure to push on the bottom edges so they meet.Place crust in freezer.. Mix each of the blocks of cream cheese and the 8 ozs of sour cream together one at a time on medium low. Its easier to use a bowl scraper attachment but my bowl is bent so I cannot really do that... Just scrape the bowl in between for an even mix. you will know when you are done when its all smooth and creamy.. Turn up to a medium setting like 5-6For the brown sugar make sure you grate it blend it or do whatever you need to to make sure there are no clumps. Nothing is worse in a cheesecake than hard clumps.Mix each egg individually until each is incorporated fully in.Pretty much just dump that cup of Peanut Butter right in there. One TSP or tablespoon of Vanilla extract. what ever is to your liking. I normally do a tablespoon because PB is such a strong flavor.Hand chop the minis really the amount is whatever is to your liking. I normally do 16-20 thrown them in and mix.. 1/2 cup of whipping cream then mix on high for 25 seconds to get that air in there.After this you can fold more PB cups into the batter. Really its up to you its your cheesecake.. Take the shell out of the freezer and pour the batter in the pan. Try and make sure there is about 1 in of wiggle room for the Cheesecake to rise.. Important part. Put a pan or pot of water on lower rack. 
I'm not all about that water bath.Oven should be set to 275* you are now probably asking hey why not 350 everyone is all about the 350. Eggs that's why. Eggs expand rapidly when exposed to high temps at don't quote me but 325. Rapidly expanding means cracked cheesecake and less creamy cheesecake. Bake for 1 to 1 and 1/2 hours depends on your oven. You know when it is done because the top of the cheesecake will be far from the sides of the crust. Be sure to clean the bowls and kitchen while this is baking. Also I normally do some sort of cardio or pushups and sit ups to prepare myself for the Calories I am about to consume.. Here is where you preform a self marshmallow test. Leave in oven while off for 20 minutes Open door for 20 minutesPlace on cooling rack for another hour until cool. Normally it takes about 2 hours for the whole process. Stages are because cheesecakes frighten easy and are prone to temperature shock. Which will have the effect like popping a balloon. When it is completely room temperature and not warm to the touch sprinkle the rest of the crust on top generously. Cling wrap it and place in fridge for minimum of 9 hours.. Here are some NSFD photos.Use a knife warmed by hot water to cut and enjoy.Make sure if you take this to a party one put a paper towel under the pan before you open it crumbs are crummy gifts to give.Also make sure no one with peanut allergies will be there. Because this \"may contain nuts\"\nRead the question below and select from the following choices.\nA: Peanut Butter Pie\nB: Don't Make the Oven Too Hot! 
275*\nC: Bake the Cookies\nD: Wet Ingredients", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_29_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_27.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_29_29.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural 
image", "source": "RecipeQA", "options": "A: Sheet Pan Mediterranean Chicken and Potatoes\nB: Fill 'er Up!\nC: Oh Yea.. Forgot the Shameless Pandering\nD: Ingredients", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Cooking', 'Enjoy', 'Food Science']", "context": "Here is the context of these images:\n. I went with 6 Russet potatoes (mostly because that was what we had in the pantry) but you can use any spud of your choosing one large white onion three bell peppers one package mild Italian sausage one package hot Italian sausage. before I detail each delicious layer of yummy goodness .. i just need to say ... all hail the mighty mandolin slicer :) It makes food prep so easy.apply a light coat of olive oil to the panlayer in your potato slices to cover the bottom of the pan (can you think of a better way to soak up all the flavor as it drips down through the layers?)generously distribute your assorted peppers for maximum color enhancementlayer in the onions ... or as I like to call it .. the sausage suspension systemand now .. the final piece of the puzzle ... the sausage. Don't be afraid to cram the pieces in tightly, remember there will be shrinkage during cooking.. Set your oven to 375 degrees Cover the pan with foil Bake for an hour and 40 minutes Uncover the pan Bake for one more hour. Now comes the hard part ....\u00a0 deciding how you want to serve your meal. Do you delicately disassemble your creation and serve each tasty layer in its own glory? OR Do you just ladle a heaping spoonful of heaven into a roll, top it with some steaming hot tomato sauce and chow down? Choose your path ... AND ENJOY!!!. Sensory Analysis (Affective Testing) - Also known as consumer testing, this type of testing is concerned with obtaining subjective data, or how well products are likely to be accepted. 
Using a focus group of 8 people covering the ages 2 to 55 demographic I was able to obtain the following reaction results very flavorful .. especially the potatoes *two thumbs up* Terrific Scrumptions Delicious Thanks for cooking so I didn't have to yummy it was very good .. thank youFood Physics \u2013 the physical aspects of foods (such as viscosity, creaminess, and texture) By baking the dish uncovered for the last hour, the sausages browned nicely. \u00a0 There was a 15-20% size reduction, which is to be expected. The peppers and onions softened nicely but retained more of their flavor than i was expecting. by far the nicest surprise of the dish had to be the potatoes.\u00a0 Because of their location at the base of the dish they soaked in an amazing blend of flavors.\u00a0 \nRead the question below and select from the following choices.\nA: Sheet Pan Mediterranean Chicken and Potatoes\nB: Fill 'er Up!\nC: Oh Yea.. Forgot the Shameless Pandering\nD: Ingredients", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_30_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_30_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_30_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_30_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_30_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_30_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_30_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_30_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_30_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_30_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_30_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_30_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_30_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_30_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_30_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_30_15.jpg"], "output": "D", "qwen3-vl": 
"image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Bubur Lambuk\nB: Carving Apple Version 2\nC: It's Showtime\nD: Assembly", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather the Materials', 'Add the Cold Beverages of Your Choice', 'Build BBQ Tool Holders', '@placeholder']", "context": "Here is the context of these images:\n. The materials you'll need are...\nFor the Drink Helmet\n1. duct tape\n2. two 14.5 ounce cans\n3. about 3 feet of plastic tubing\n4. a cold beverage of your choice\n5. a general plan\nFor the Utility Belt\n5. duct tape\n6. grilling utensils\n7. condiments. Start the Drink Helmet by making a headband.\n1. Measure off about 27 inches of duct tape, then two more about 16 inches each.\n2. Fold the strips in half then tape the sticky sides together to make a band.\n3. The longer band will serve go around your head like a ball cap. The shorter bands will go over your head from ear-to-ear and from forehead to the back of your head.\n4. Secure the contact points and intersections of the band as shown in the pic.. Next, duct tape two 14.5 ounce cans to the sides of the headband. See pics.... 1. Cut 2 pieces of plastic tubing about 16 inches in length.\n2. Insert high-quality cold beverages.\n3. Insert tubing.. 1. Place Drink Helmet (with drinks) on your head.\n2. Suck.. 1. Strip off enough duct tape to go around your waist (lengths vary).\n2. Fold it in half to (a) get rid of the sticky stuff and (b) make a waistband.. 1. Pick your favorite condiments.\n2. Make holders for each by first making duct tape bands for the girth and then the circumference of each condiment.\n3. Attach and secure the intersections of each band as shown in the pics.\n4.Attach the condiment holders to the utility belt as shown in the pics.. 1. Build your tong and/or flipper holders for each side.\n2. Just make two short bands as before then attach to the belt. 
See pics...\n3. You're now ready to put it on and get to cookin'/showin' out. Video on next step.. \n For a man, grilling out is less preparing food and more communing in the age-old rite of cooking over fire. The only difference between now and the men described in the book of Leviticus is that today grilling out is an art form--a dance between man, machine, fire, and food. To wit, watch the video...\nRead the question below and select from the following choices.\nA: Bubur Lambuk\nB: Carving Apple Version 2\nC: It's Showtime\nD: Assembly", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_31_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_22.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_31_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_31_24.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Summertime Peach Melba Pie\nB: The Directions!\nC: Bake and Enjoy!\nD: Make the Meringue", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Assemble the Ingredients', 'Prepare the Crust', 'Mix the Filling', '@placeholder']", "context": "Here is the context of these images:\n. For the filling:\n5 egg yolks\n1 15 oz can of sweetened condensed milk (like Eagle Brand)\n1/2 cup dairy sour cream\n1/2 cup fresh squeezed red grapefruit juice\u00a0 OR\nfor a more intense flavor, 1 cup of grapefruit juice, simmered over low heat until reduced to 1/2 cup\n2 teaspoons grapefruit zest\n3 Tablespoons fresh lemon juice\n2 teaspoons Rose's Grenadine syrup\npinch of salt\na drop or two of red food coloring (optional)\nFor the topping:\n5 egg whites, room temperature (\"old\" eggs provide the better whites for meringue than really fresh eggs.\u00a0 It helps to leave the eggs out at room temperature for 12-24 hours before assembling your pie)\n3/4 cup white sugar\nFor the crust:\n1\u00a0 1/2 cups finely ground graham cracker crumbs\n1/2 cup finely chopped hazelnuts\n1/4 cup brown sugar\n1/2 cup (1 stick) salted butter, melted. 
Preheat oven to 350 degrees F (175 C).\nCrush the graham crackers by placing one package in a paper sack and roll with a rolling pin OR roughly break up with your hands, place in a deep bowl and use an immersion blender to pulverize OR add crackers to a conventional blender or food processer.\u00a0 Do the same with the hazelnuts.\nToss the graham cracker crumbs, hazelnuts, brown sugar and melted butter in a bowl and blend thoroughly.\u00a0 Place the crumb mixture in a 9\" pie pan and pat gently onto the bottom and up the sides of the pan.\u00a0 Brush away any loose crumbs (these may burn during baking).\nBake the pie shell at 350 degrees for 8 minutes.\u00a0 Remove from oven and cool completely.. Separate the 5 eggs, being very careful to get no yolk in the whites.\u00a0 Put the whites in a clean, deep bowl, and set aside so they may come to room temperature.\nJuice a large Ruby Red or Rio Star grapefruit;\u00a0 strain out any seeds or pulp.\u00a0 Set aside 1/2 cup of juice.\u00a0 Zest grapefruit, add 2 tsp of the zest to the juice..**on edit-\u00a0 I have made this pie twice more, and I find that you'll get a much more intense grapefruit flavor if you start with 1 full cup of freshly squeezed juice and simmer it until it reduces to 1/2 cup.\u00a0 This tends to eliminate the need for lemon juice for added tartness, as well.**\nPlace the yolks in a larger bowl and blend briefly with a hand\u00a0 or stand mixer.\u00a0 Add the sweetened condensed milk and sour cream, and blend briefly again.\u00a0 Add the grapefruit juice, and Grenadine -\u00a0 blend just long enough to thoroughly mix the ingredients.\u00a0 Do not over blend.\nTaste the filling and adjust the tartness by adding the lemon juice one Tablespoon at a time until you are satisfied. 
with the level of acid.**Taste FIRST.\u00a0 Lemon juice may not be necessary if you have used reduced grapefruit juice**\u00a0 Add a drop or two of red food coloring if you wish to enhance the pink color of the filling.\u00a0 Blend just briefly again.\nPour the filling into the cooled graham cracker crust and let pie sit for 10 minutes to set up. Bake at 350 degrees F (175 C) for 15 minutes.\u00a0 Take the pie out of the oven, assemble your meringue and top while the pie filling is still hot.. Using a stand or hand mixer, beat the room temperature egg whites on high speed until they form soft peaks.\u00a0 Slowly sprinkle in the sugar and continue to beat until the meringue is stiff, thick and glossy.\u00a0 Check to make sure the sugar is dissolved (meringue not gritty).\nSpread meringue over the still-hot pie filling, making sure that meringue reaches past the filling to the pie pan to \"seal\" the edges and prevent shrinkage.\u00a0 Create peaks in the topping with the back of your spoon\nBake the pie at 350 degrees (175 C) for 8 to 10 minutes or until meringue is delicately browned.\nCool pie completely before serving (but do not place in refrigerator).\nMay also be served with whipped cream if preferred.\nGrapefruit Pie- a trippy tropical treat!\nRead the question below and select from the following choices.\nA: Summertime Peach Melba Pie\nB: The Directions!\nC: Bake and Enjoy!\nD: Make the Meringue", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_32_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_32_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_32_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_32_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_32_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_32_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_32_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_32_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_32_8.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_32_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_32_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_32_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_32_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_32_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_32_14.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Beer Can Mug\nB: Cook the Chicken\nC: Gather Your Ingredients\nD: Preparing the Marinade", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Prepare the Chicken', 'Prepare Your Beer Can', 'Move the Chicken to the Grill.', '@placeholder']", "context": "Here is the context of these images:\n. Start your grill preheating (you only need one burner on low heat). Then gather the ingredients for your spice rub.\nAny good spice rub will work with this recipe. It's okay to use a store-bought rub if you have a chicken or rib bbq rub that you like. Or, you can mix one up using spices from your pantry. Feel free to experiment to fit your tastes!\nThe one in this picture used:\n1/4 cup paprika\n1 tbls brown sugar\n1 tbls granulated sugr\n2 tsp salt\n1 tsp onion salt\n1 tsp black pepper\n2 tsp cayenne pepper\n1 teaspoon garlic powder\n1 teaspoon onion powder\nMix all ingredients together in a bowl.. Clean out the cavity of the chicken to make sure the kidneys and giblets are removed.\nRinse the chicken in cold water, then pat with paper towels to dry.\nAnd of course make sure that you thoroughly sanitize your hands and workspaces after handling the chicken.. Sprinkle a tablespoon of spice into the cavity. Insert another tablespoon under the skin of the chicken and try and spread it out evenly. Finally, rub another tablespoon of spice all over the skin on both sides.. You will need a tall can of beer to hold your chicken. 
Any brand will do, so if you have a 20 ounce can of beer on hand, feel free to use that. Otherwise, you can buy single cans of beer in the refrigerator section of most liquor stores. I chose fosters because their keg-style can is wider and sturdier than the slightly cheaper budweiser.\nPop open the tab on the can and empty out 1/4 of the beer. I'm not into beer so I poured it down the drain, but feel free to drink it if you like!\nYou want to make a few more small holes to let the vapors escape into the chicken. You can use the pointy end of a beer bottle/can opener for this. I did not have one handy, so I tapped an awl into the top a few times.\nPour the extra spice rub into the beer can.. Keeping the can upright, place it into the cavity of the chicken.\nPlug the top of the chicken so that the vapors are sealed in. I used a peeled onion, but you could also use a potato, lemon, or lime.. Since you started preheating your grill during the first step, it should be nice and hot by now. \nMove the chicken to the grill and stand it up with the legs spread apart (this will help maintain balance). You want to cook it indirectly, so put it next to the burner that is turned on.\n. Close the grill and let the chicken cook on low indirect heat until the internal temperature reaches 185-190 degrees (F). If your chicken come with one of those handy pop-tabs it will tell you when it is done. Otherwise, you can monitor the tempetature with a meat thermometer.\nMy six pound chicken took an hour and fourty five minutes. . Like I said before, you will know your chicken has finished when it has reached the right internal temperature (185-190 F). When your chicken is finished, the outside will be crispy, but the meat will be fall-off-the-bone tender. Carefully remove the chicken from the grill. Place the finished chicken on a plate and discard the beer and can.. Let the chicken rest for 5 minutes before carving. Serve with your choice of sides; I reccomend mashed potatoes and corn. 
\nBon appetite!\nFor those of you who are interested, this cost $1.50-$2 a serving. My six serving chicken cost:\n$6 chicken\n$3 beer\n(?) spices - you are only using a little of each spice, so we'll say a couple dollars.\nRead the question below and select from the following choices.\nA: Beer Can Mug\nB: Cook the Chicken\nC: Gather Your Ingredients\nD: Preparing the Marinade", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_33_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_33_18.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Easy Rainbow Cookies\nB: Add Milk and Stir\nC: Cupcake #3\nD: Cupcake #2", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients/Tools', 'Add Butter', '@placeholder', 'Cut Out the Biscuits']", "context": "Here is the context of these images:\n. 
Ingredients:2 cups of all-purpose flour (256g)2 Tbsp. of baking powder (30g)1 tsp. salt (5g)1 Tbsp. white granulated sugar (12.5g)5 Tbsp. of cold unsalted butter (70g)1 cup of cold milk (whole milk, 1%, 2%) (240ml)Tools:SifterFood processor or Pastry blenderBowlsForkPlastic wrap or towelBiscuit Cutter or GlassBaking sheet. First let's mix all of our dry ingredients together using a sifter. If you don't have a sifter you can use a fine mesh hand strainer as well. So we will mix our flour, baking powder, salt, and sugar. . Now using a food processor or alternatively you can use a pastry cutter and fork, we will blend in our cold butter. I have the butter cut up into little pieces, which just helps to blend it in. Using the pulse setting on the food processor will work fine. . Now we just transfer the flour and butter mixture back to our large bowl and add in our cold milk. Then using a fork or spoon mix the ingredients together until a dough forms. Now if your dough is sticky, add a couple more tablespoons of flour to it. The dough should not be sticky. . Now we dust our work surface with some flour and place our biscuit dough on it. Then shape it in your hands until it is a flat rectangle, then fold it over on itself once and push down, then flatten it out again, and fold it over, etc. So essentially we are just doing a quick basic kneading of the dough. Do this for 2 or 3 times, we don't need to knead it like we would a yeast dough. Now just shape it out into a rectangle or circle about 3/4 to 1 inch thick. Then cover it with plastic wrap and let the dough rest for 20 to 30 minutes. . Let's go ahead and preheat our oven to 425 degrees F. (218 C) Now we just need to cut out our biscuits, you can use a biscuit cutter or use a glass. Take your biscuit cutter or glass and push down on the dough, don't twist it around too much, then place your cut out biscuit on an un-greased baking sheet. 
You can take the leftover pieces and gather them back up and push them together then form them back into a rectangle, in order to cut out more biscuit shapes. Once we have cut out all of our biscuits lets bake them in the oven for 10 to 15 minutes until they puff up and get nice and golden brown on the top. . All right once they come out of the oven they will look like this. They are ready to be eaten! However you would like to eat them. One of my favorite ways is to cut them in half and then butter them and add honey. Mmm yummy! Enjoy!. Now watch those steps in action by checking out the video tutorial!\nRead the question below and select from the following choices.\nA: Easy Rainbow Cookies\nB: Add Milk and Stir\nC: Cupcake #3\nD: Cupcake #2", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_34_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_19.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_34_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_27.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_29.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_34_30.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Easy Homemade Ice Cream\nB: Bottom Layer\nC: Stack the Ice Cream Sandwiches\nD: Add Fruit.", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Pour Into Pan', 'Freeze']", "context": "Here is the context of these images:\n. For this dessert you will need the following:-Bryer 's Vanilla Ice Cream-12 oz of raspberries, fresh or frozen-Raspberry Jell-O (or any flavor gelatin in a 6 oz. package.). I couldn't find a small packet of Raspberry Jell-O, so I measured out half the packet on a kitchen scale. It was a little bit less than a half cup. *****REVISION USE ALL 6 OZ OF JELLO! IT COMES OUT WAY BETTER!*****I heated up a little more than a cup of water in the microwave, and stirred the gelatin into the water.. Allow the ice cream to soften for 30 minutes before this step.Once the ice cream is soft, place it in a metal mixing bowl and mash it with a whisk or a potato masher. Slowly mix in the hot gelatin mixture with the ice cream.. This part is fun. Pick up the raspberries and squish them in your hands. Give it a quick stir to coat all the raspberries and then let it sit for about 3 minutes.. 
I like to use glass or ceramic baking dishes for this dessert - they hold the cold in a little bit better than a metal square pan could. . This generally takes a little more than an hour to set-up. But once its firm to the touch, it's pretty much ready.. If you let this freeze long enough, it stays pretty solid. It's easy to serve with just a knife, and goes fast. It's a great frozen treat in the summer, and you can try all different kinds of fruits and Jell-O flavors.Enjoy!\nRead the question below and select from the following choices.\nA: Easy Homemade Ice Cream\nB: Bottom Layer\nC: Stack the Ice Cream Sandwiches\nD: Add Fruit.", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_35_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_21.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_35_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_35_23.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Slime Juice\nB: Add the Juice\nC: Pour the Kombucha!\nD: Waiting...", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Preparing Berries', 'Acid and Water', '@placeholder', 'Filtering']", "context": "Here is the context of these images:\n. Clean the berries and crush them in a bucket.It is easier to crush them in small batches. Make sure that most of the berries are crushed. Some berries can be whole, we will later on re-use the leftovers and crush them again.. Tartaric acid is crystallic white powder. Dissolve 25g of acid to about half a litre of water.Add the acid to the bucket with the crushed berries.Add water until you have approximately 3,5 litres to 4 litres of berry-water mix.. Store your berry-water mix in a cool place covered with lid. Wait for 3 days and your juice is ready for filtering.. You can start by mixing the water-berry mixture a bit.Place sieve to the other bucket and place the filtering cloth in it.Scoop the berry mixture to the sieve and let it filter. This is slow process and easier with bit smaller batches.Collect all filtered berry mash for re-use.. Now you should have about 2 litres of juice.You need 1/2 kg of sugar for each litre of juice. So if you have 2 litres of juice add 1 kg of sugar and mix until the sugar is dissolved.Clean bottles and use funnel to fill them.Cap the bottles and your juice is ready!Store the juice in cool place and enjoy your vitamins through the long dark winter.Bonus round:You can re-use the collected berry mash. Just follow the same instructions with once used berry mash and you get almost as good lingonberry juice for the second run. 
It will probably have slightly lighter color, but the taste is there.\nRead the question below and select from the following choices.\nA: Slime Juice\nB: Add the Juice\nC: Pour the Kombucha!\nD: Waiting...", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_36_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_36_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_36_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_36_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_36_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_36_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_36_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_36_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_36_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_36_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_36_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_36_11.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Caramel Apple Cheesecake\nB: Pie Assembly Part 2\nC: Prepare the Garnishes\nD: Pour Your Cooled Apples on Top", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Making the Pie Filling', 'MEANWHILE... Let the Creativity Flow', '@placeholder', 'Put Some Heat to the Creativity']", "context": "Here is the context of these images:\n. I decided to make pretty much everything from scratch for this pie, including the crust. If you wish to cut time by buying pre-made crust, then by all means GO AHEAD. This is 2014. But just keep in mind it may not be as tasty. TIME: 20 MIN total for crust Crust is not too painful to make. You will need for crust 1 1/2 cups of Gram Cracker crumbs or 12 Gram Crackers made to crumbs. 
3 tbsp (tablespoon) of sugar 3/4 cup of non salted butter melted 1 tsp (teaspoon) of cinnamon Springform pan 9\" or 10\" pie plate Turn your oven on to 375 degrees Whisk or with a fork mix together the graham cracker crumbs, sugar and cinnamon. (The cinnamon will give it some nice flavor.) Melt the unsalted butter in the microwave and use a fork to mix the butter with the crumb mixture until they are all are moistened. It will look clumpy and much darker and that's a good thing. Spread the crumbs into a 9-inch spring-form pan OR a 10-inch pie plate and press them firmly into an even layer over the bottom and half of the way up the sides of the pan. Bake for 6 to 8 minutes, or until golden brown. Let it sit for 8 minutes to cool, or just stick it in the fridge to save time. By baking the crust you will get a more crunchy crust. Which will go beautifully with the crunchy top I have planned for this pie =). The secret is toasting the nuts!! Forgive me Grandma!! Haha, I'm just kidding. No, but seriously. Any time you have a dish with nuts, the secret to ultimate flavor is to toast them. It only takes 5 minutes, and enhances the flavors so much! TIME for Sauce: 8 MINWhat you will need for the Special Caramel sauce. 1 packet of apple-cinnamon granola from Quaker Oats. 3/4 cup of chopped pecans 1 cup salted caramel sauce. I used sugar-free in order to not go overboard with the sugar. small cooking sheet for toasting the nut mixture in the oven. Open the packet of granola and pour in a nut chopper as well as the pecans. You could also break them up yourself by putting them on a cooking sheet and breaking with a spoon, but it may get messy. Since the oven is already going because the crust was just made toss the nuts in! After 5 min of toasting pull them out. They should smell amazing. Take the crust out of the fridge. It should be cooled by now. Pour the caramel on top of the crust and sprinkle the toasted nut mixture on top of the caramel. 
Place the springform pan into the fridge to chill out. MAKE SURE YOU SAVE SOME TOASTED NUTS FOR LATER. ;) You will use them as a garnish. The Infinity pie is based off an apple cheesecake pie. So making the apple pie part is very much like making a regular apple pie as you would have guessed. You can either BUY (it's 2014) your apple pie filling OR you can make it. I chose to make it because I want a delicious pie this time. Dedicating something to my hero only deserves the best! ;) NOTE: if you are using a can of apple pie filling you only need to use half!! TIME for Pie filling 40-50 min (depending of if you have an apple corer)What you will need for Apple pie filling 5 small granny smith apples. They must be peeled and cored and cut them thinly (slave work) 3/4 cup of unsalted butter 1/2 cup of light brown sugar 2 tsp of cinnamon a couple dashes of salt a large pan for cooking on the stove I DON'T have an apple corer. So this part took extra long.... my boyfriend wasn't too thrilled. But it's only 5 little apples. While you are peeling apples, put the butter on the stove and begin melting it. It will only take a few minutes. When it's melted add the brown sugar and cinnamon to the butter and mix until gently bubbling. Again it only takes a few minutes so you probably won't be done with your apples. The\" brown apple syndrome\" will happen and it's alright. These apples are destined to go into a brown sugar liquid and cooked extremely soft. No harm so don't stress! ;) when you're finished with the apples slide them in the large cooking pan and coat them well with the liquid. Put a lid on the pan and stir occasionally for 10 min. Remove the lid and up the temperature to med-hi to boil off most of the remaining liquid. Throw a few dashes of salt in. After another 15 min the apples should be very very soft and that's what you're looking for. LET SIT FOR 20 min to cool before adding to your pie crust. Getting tired yet??. 
You can turn the stove off if you want to save electricity for 20 min while the apple pie filling cools.... But OK, you have 20 min to make a design to top your Infinity pie. Me, because I didn't want to have to make a batch just for crust I broke down and bought my pre-made crust. FORGIVE ME GRANDMA. ;) haha Pre-made crust is very easy to work with. You just unroll and cut out whatever design you want. I see pie design tops (much like pumpkins today) as a big fad soon. It is taking off but not like I think it will soon. But anyways, cut out whatever your heart desires! If you mess up, crust is easy to erase... just flatten out and try again. For stencils, I just found shooting stars online, printed them out, and laid them over the dough and cut it. Easy as pie. My shooting star is dedicated to Carl Sagan and the infinite universe. =). Exactly as the title says... pour the cooled apples on top of the cooled caramel mixture that's been chilling in the fridge. This is the easiest step! ;). I love cheesecake. If it were me I'd put cheesecake in everything. But I probably wouldn't live long. Anyways, again, this is only technically half a cheesecake so the ingredients aren't as heavy. Turn that stove back on to 350 degreesWhat you will need for cheesecake topping: 8 ounces of soft at room temperature cream cheese 1/2 cup of granulated sugar 1 egg medium sized 1 tsp of vanilla extract 1tbsp of lemon juice lemon wedge for lemon zest electric mixer and a medium sized bowl First you will need to beat the cream cheese and sugar together on medium speed for about a minute. They must be well mixed. Then add the egg and beat it in until it is combined for about a minute. Then add the lemon juice and vanilla extract and beat for another minute. Zest the lemon wedge in. Just a few times is all it needs. Pour the cheesecake batter over the apples in the pan, smoothing it into an even layer as much as you can. Bake until the cheesecake is set about 25-30 minutes. 
While this is happening, as you will see in the next step, coat your design you made with the pie crust with egg whites and bake at the same time in the oven with the pie.. Because pie crust is usually not belonging on cheesecake, I decided to bake it separately on a cooking sheet. I coated it with an egg white to give it shine and baked it next to the cheesecake for 5-8 min. When it was done, I pulled it out and sprinkled it with sugar while it was still hot to give it some sweetness. The cheesecake should be done within 30 min. Transfer the cheesecake pan to a wire rack to cool, the cheesecake must refrigerate for at least 4 hours or overnight. (For me, since it was already midnight when we were finished... lol, we ended up just chilling over night.). Before you put your infinity pie in the fridge...., ta dahhhh, the toasted crust design goes on top of the cheesecake like a crowning jewel! Then, add some of the remaining crunchy toasted nuts on top and along the outsides to bring it to life. Then, put that sucker in the fridge overnight. I know it will be REAL HARD. But trust me, it needs to cool for at least 4 hours. When serving your Infinity pie, put some caramel on the plate along with the special crunchy nut mixture. It will definitely knock someone's socks off! Pair with vanilla bean ice cream for a real desert! Be sure to refrigerate any leftovers.\"The sky calls to us; if we do not destroy ourselves. We will one day, venture to the stars\" -Carl Sagan This one's for you Carl! 
Enjoy your Infinity pie everyone =) PLEASE let me know if you make it!\nRead the question below and select from the following choices.\nA: Caramel Apple Cheesecake\nB: Pie Assembly Part 2\nC: Prepare the Garnishes\nD: Pour Your Cooled Apples on Top", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_37_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_27.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_37_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_29.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_30.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_37_31.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: pumpkins / Pokemon Rice Crispy Ball\nB: Cover With Chocolate (four Coats!)\nC: Supplies\nD: Melting", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['To Start', 'Make Rice Crispy Treats & Melt the Chocolate', '@placeholder', 'Done!']", "context": "Here is the context of these images:\n. Ingredients:\n3 tablespoons butter\n10 oz (1 package) mini marshmallows - separate out about 12 for the mini marshmallow centers\n6 cups rice crispy cereal\n1\u00a0\u00bc oz almond paste (optional)\njumbo marshmallows (optional - for the large surprise center)Toppings:\n21 oz good quality chocolate (six 3.5 oz bars)\ncoconut flakes (optional)\nchocolate powder (optional)Special Tools:Lollipop sticks\nNon stick aluminium foil\nSpray cooking oil (for your hands)\nStyrofoam for sticking pops into\nI\u00a0re-purposed\u00a0a pasta drying stand with binder clips to hang the pops to set\na double boiler to melt the chocolate in (or a metal bowl that can fit atop a small pot)\nI made 3 different sizes and got 22 \u00a0pops from this recipe. \u00a0\n(2 jumbo pops, 8 small plain bite size, 12 with marshmallow centers). 
In a large pot over medium/low heat:\nmelt 3 tablespoons butter\nadd 10 ounces mini marshmallows\nadd 1.25 ounces almond paste (optional) - the almond taste was barely noticeable next time I may add more.\nstir until melted and smooth\nstir in 6 cups rice crispy cereal\nPour out onto a a cookie sheet lined with non-stick aluminium foil.\nLet cool for a few minutes before forming into balls.\nMeanwhile heat up a small amount of water in your double boiler or pot. \u00a0The higher quality chocolate you use the easier it melts so the water should not be boiling hot. \u00a0When the water is steamy hot remove it from the stove and place the metal bowl atop the pot and break your chocolate into small pieces into the bowl. \u00a0The chocolate will melt and the stay liquid for quite a long time without having to be put back onto the stove. \u00a0If your chocolate does start to harden just heat up the water more without the bowl on top. \u00a0If the chocolate gets too hot it may seize up and you'll have to start over with new chocolate.\u00a0\nNow you can chocolate cover some mini marshmallows for the center of the pops.\nI stuck the marshmallow onto a toothpick to dip it into the chocolate and another toothpick to remove it. \u00a0Put them into the freezer for a few minutes so they set up quickly and can be handled when forming the rice crispy balls.. Once the treats have cooled enough to be handled (a few minutes) spray your hands lightly with cooking oil. \u00a0Just a little bit. \u00a0This will help so much and your hands will be so soft afterwards.\nForm the rice crispy treats into balls.\nJust form the rice cereal like a bowl around a marshmallow to make a surprise center.\nLet the balls sit for a few minutes to firm up.\nOnce they feel firm push the lollipop stick in and then remove it and fill the hole with a bit of chocolate to \"glue\" the stick in. 
\u00a0\nPlace in the freezer for a few minutes to set up quickly.Note: \u00a0I used different colors of pen to mark the bottoms of the sticks so I knew which ones had which centers.. Coat each pop with chocolate and let the excess drip off.\u00a0\nI used binder clips on the bowl to hold the pops so the excess could drain back into the bowl.\nThe picture shows only one binder clip set but I made three sets so by the time I covered the third pop with chocolate the first could be taken out and hung from the pasta dryer to set for a few minutes.\nThis was done to keep the excess chocolate from dripping down the stick. \u00a0If you use candy coating instead of real chocolate then I think you can just skip this and place them into the styrofoam sheet.\nAfter 8 pops were done they were placed into a styrofoam sheet and put into the freezer for a few minutes to set up quickly.\nI repeated this four times for each pop so they have a THICK coating of chocolate (I love chocolate).\nBefore the last coat dries completely add any additional toppings: coconut flakes, chocolate powder, etc.\nI used a chocolate mocha hot chocolate powder to coat one of the pops. \u00a0Sweet chocolate with coffee flavor - yum!. As I added each coat of chocolate I lost the perfect roundness of my rice crispy balls, especially the one with the jumbo marshmallow center. \u00a0It's heavy & lopsided so I had to eat it myself :)\nSince I used real chocolate and used a freezer to set the chocolate quickly and not candy coating melts I suggest keeping these in the refrigerator to keep the chocolate from blooming - which isn't pretty but still tasty. 
\u00a0\nTake them out of the fridge and come to room temperature before serving for the rice crispy treat to soften up to eat.\nRead the question below and select from the following choices.\nA: pumpkins / Pokemon Rice Crispy Ball\nB: Cover With Chocolate (four Coats!)\nC: Supplies\nD: Melting", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_38_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_38_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_38_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_38_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_38_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_38_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_38_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_38_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_38_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_38_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_38_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_38_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_38_12.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Classic Chicken Noodle Soup\nB: Prep Work\nC: Enjoy!\nD: Prep Work", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ramen Noodle Breadcrumbs', 'Breading', 'Cooking', '@placeholder']", "context": "Here is the context of these images:\n. This recipe serves two people or one extremely hungry student :)Ingredientsone chicken breast ($1.50)one ramen noodle packet with seasoning ($0.25)one egg ($0.20)Total Cost: $1.95That's all the ingredients that there is to it! Feel free to double or even triple the recipe if there are more people.. 1. Start off buy putting your ramen in a plastic bag. Gallon bags work best but are not necessary. 2. Pour as much seasoning as you like. 
I usually use 3/4 the packet but add more or less to taste. Seal the plastic bag so that you don't make a big mess.3. Put your textbooks to work by using them to crush the ramen into small bits. Make the pieces bigger for crunchier nuggets or extremely fine for a softer nugget.4. Pour the ramen breadcrumbs into a bowl. If the plastic baggie you used to crush the noodles hasn't broken, you don't have to do this step, but most likely there will be a few small tears from where the textbook has stabbed it.. 1. Take your chicken breast and use a knife (ones that have broken and have been repaired by duct tape are fine) to cut it into bite sized pieces like in the second photo.2. Crack one egg into a bowl and whisk it using a fork/chopsticks/whatever.. 1. Start off by making sure you have a plate close by to put the breaded chicken on. Then place some chicken bits into the egg mixture making sure that every bit is coated.2. Place the eggy chicken into your crushed ramen noodles and use your hand to make sure every part of the chicken is coated and that there are no bare spots.3. Put the nuggets onto a place and get ready to cook!Tip: Have one hand do the wet stuff (coating the chicken with egg) and your other hand do the dry stuff (coating the chicken with ramen noodle, placing nuggets onto a plate). . 1. Pour some oil onto a pan. Doesn't really matter what kind of oil or what kind of pan, whatever you have. I used olive oil for this demonstration. Also, the more oil you use, the more tender and generally tastier the nuggets will be. Heat the pan on medium until the oil is hot.2. Place all of the nuggets in an even layer on the pan. Don't worry if some of the ramen noodle coating falls off, you can pick those up later.3. Cook until the bottom of the nuggets are a golden brown. The nugget in the third photo isn't done yet, it needs to cook for longer.4. Once the nuggets are golden brown like in the last photo, turn them over so that the other side can cook. 
Once you can see that the other side is also golden, remove the nuggets from the pan and transfer them to a plate (or just eat them out of the pan, less dishes amirite?)Tip: Don't put the cover on the pan! Condensation will form and drip onto your chicken nuggets making them soggy and wet.. Eat your nuggets when they are still warm and enjoy your delicious meal! If you liked this Instructible, please take a second to vote for me in the DIY University Contest! It would mean the world to me!Have a fantastic day,thederpyninja\nRead the question below and select from the following choices.\nA: Classic Chicken Noodle Soup\nB: Prep Work\nC: Enjoy!\nD: Prep Work", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_39_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_39_16.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Hot Jenever Toddy\nB: Eat and Enjoy!\nC: What You Need\nD: Blitz It!", "question": "Choose the best title for the @placeholder to 
correctly complete the recipe.['Mix It Up', 'Baking', 'Spread on the Butter', '@placeholder']", "context": "Here is the context of these images:\n. . Knead the dough in a mixer, or by hand for 5 min., until it's smooth, soft, and pretty slack. Then dust it with flour and put it in a plastic bag. Close the bag, (leave room for the dough to expand) and Let it rest for 30 min.. Split the dough into 8 equal pieces. Let the pieces rest, not covered, for 5 minutes.. Roll the pieces of dough into a thin rope, about 22 inches long. Twist each rope into a pretzel shape. Brush each of the pretzels with the warm water mixed with a teaspoon of sugar and set them on the baking sheets. Sprinkle them with a little salt. Then let them rest for 10 min. uncovered.. Bake the pretzels in a preheated, 500F oven for 8 to 10 min. or until they're golden brown. But don't forget to reverse the baking sheets half way through.. Take the pretzels out of the oven, and thoroughly brush on three tablespoons of melted butter on the pretzels. It may seem like a lot, but that's what gives them their yummy flavor.. I hope you enjoy the pretzels! 
They taste best when eaten warm.\nRead the question below and select from the following choices.\nA: Hot Jenever Toddy\nB: Eat and Enjoy!\nC: What You Need\nD: Blitz It!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_40_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_40_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_40_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_40_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_40_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_40_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_40_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_40_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_40_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_40_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_40_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_40_11.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Azul Camaron Mariposa\nB: The Final Product\nC: Mash Browns\nD: Place on the Grill", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients List', \"Assemble Your ATB's\", '@placeholder', 'Remove From Grill']", "context": "Here is the context of these images:\n. 5 large Jalape\u00f1os 1 Pack of Bacon 1 container of cream cheeseBBQ RubBBQ Sauce (optional). First you will need to prepare your ABT's using all the ingredients above. Start by slicing the Jalapeno in half and with a spoon cleaning out the seeds and membrane. Next you will fill the half Jalapeno with cream cheese. Make sure to fill it full. Next you are going to want to apply some of your bbq rub onto the cream cheese. Lastly you will need to wrap your stuffed jalapenos with one full slice of bacon. TIP: Make sure to wrap it firm, this will help it cook together and you won't have to use toothpicks.. 
One your Atomic Buffalo Turds have been put together you will them place them onto your grill using indirect cooking with a tempreture of around 300-325 degrees. Place a small chunk of hardwood in for smoking (optional) and then close the lid and begin cooking for 1 hour 15 minutes.. After 1hr 15 mins, your bacon wrapped jalape\u00f1os should be done. If you like your bacon more cooked feel free to leave them on for a few more minutes or until your preferred doneness. Let them cool for a few minutes to allow the cream cheese to cool a bit. Serve it up with your favourite BBQ Sauce, Ranch Dressing or blue cheese sauce and enjoy.\nRead the question below and select from the following choices.\nA: Azul Camaron Mariposa\nB: The Final Product\nC: Mash Browns\nD: Place on the Grill", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_41_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_41_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_41_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_41_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_41_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_41_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_41_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Green Mountain Gronala\nB: Ingredients.\nC: Green Bean Bundles\nD: Enjoy!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Prep Avacados', 'Add Some Flava!', 'Monster Mash', '@placeholder']", "context": "Here is the context of these images:\n. 2 Avocados\nLime Juice\nSalt\nGarlic\nOptional:\nDiced Tomato\nOnion Flakes. Cut avocados in half length wise.\nRemove pits and set aside.\nUse knife and cut slits in avocado vertically and horizontally, don't cut through skin.\nScoop out avocado meat with spoon.. Add salt to taste. 
And Yes, I melted my salt shaker to improve flavor..\nAdd lime juice to taste, fresh or concentrated.\nAdd garlic to taste.\nOptional:\nAdd onion flakes to taste, I was out :(\nAdd diced tomatoes to taste. I am mashing with a pastry cutter with works wonderfully.\nYou can also mash with fork, spoon, potato masher or fingers.\nOnce you've completed mashing, add avocado pits. They keep the guacamole from turning brown as fast!. There you have it! Some gooey, green, just darn good guacamole!!\nEven my cute critic like it!\nServe with chips, on burgers, or eat it off the spoon!\nRead the question below and select from the following choices.\nA: Green Mountain Gronala\nB: Ingredients.\nC: Green Bean Bundles\nD: Enjoy!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_42_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_42_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_42_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_42_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_42_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_42_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_42_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_42_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_42_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_42_9.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Chocolate Coverd Peanut Butter Cups (Reese's)\nB: Topping 'em Up!\nC: Enjoy\nD: Melt and Mix Ingredients", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Mix Up the Filling', 'Important Chocolate!', \"Fill 'er Up!\", '@placeholder']", "context": "Here is the context of these images:\n. Healthier-For-You Peanut Butter CupsX tbsps. 
of peanut butter (X being the number of peanut butter cups that you want)icing sugarchocolate chips (or molding chocolate wafers. I just used what I had at home at the time of the craving, which was chocolate chips!)Mix peanut butter and icing sugar to taste. Seriously, I cannot give you an exact amount, because everyone likes a different amount of sweetness. I used chunky peanut butter, but you can use all-natural or already sweetened or hey, why not almond butter! Add enough icing sugar to the peanut butter that it becomes a sort of dough. It has to be able to be rolled into little balls of about a tbsp. each. If you put in too much icing sugar, it will crack, but too little icing sugar and the peanut butter will stick to your hands. Roll the peanut butter mixture into little balls, the size of the molds. I happen to have a candy mold that I bought on sale in a craft store, but frankly, you could use the molded plastic that chocolates often come in, when they're in layers in boxes. Or even a mini muffin tin in a pinch.. Melt chocolate chips or chocolate wafers in a mug in the microwave, stirring every 30 seconds until fully melted. Paint the inside of the molds, and set the mold in the freezer until the chocolate is set.. Push a ball of the peanut butter mixture into every mold. Press down with your thumb to make sure that it fills up the space. I also made up some wine jelly bonbons. These are great for a more adult end to a nice dinner, to serve with coffee! I used some wine jelly that I had lying around the kitchen, possibly from this recipe. You follow the same instructions as above in terms of the chocolate coating, but just use the wine jelly as an alternative to the peanut butter filling. Easy-peasy homemade fancy chocolates! Can we say holiday entertaining? *grin*. Brush the top with the melted chocolate and return to freezer (technically, a fridge is better, but my cravings have no patience). 
When set, pop out the peanut butter cups from the molds, and voila! Healthier-for-you peanut butter cups!Want to fancy them up? Why not sprinkle a little something on top the chocolates before they set up? Maybe some fleur de sel? Some chopped peanuts? A candied violet for the wine bonbons? It's up to you! For more yummy and easy recipes, check out my blog, at www.approachingfood.com! You can also follow me on twitter @approachingfood, or on pinterest @approachingfood. Even on fb: www.facebook.com/approachingfood! \nRead the question below and select from the following choices.\nA: Chocolate Coverd Peanut Butter Cups (Reese's)\nB: Topping 'em Up!\nC: Enjoy\nD: Melt and Mix Ingredients", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_43_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_43_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_43_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_43_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: What You Will Need\nB: How to Make Bulgogi\nC: Melt Your Chocolate\nD: \"Paint\" the Sides of the Mold", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Add the Vegetable Stock and Mix Well With a Spoon.', 'Knead Again for About a Minute and Let Rest for 10 Minutes.', 'Cook Seitan']", "context": "Here is the context of these images:\n. Ingredients:1/4 cup flour1/4 cup corn flour (or fine grind corn meal)1/4 cup soy flour1 1/2 cup vital wheat gluten*1/2 tsp salt1/2 tsp baking powder1 1/2 cups vegetable stock**another 2 quarts vegetable stock** *Vital wheat gluten or wheat gluten can be found in regular grocery stores and health food stores. 
**The vegetable stock gives the seitan it's flavor, feel free to add other spices or seasoning to your taste ie; garlic, soy sauce, tamari etc.Equipment:measuring cups and spoonslarge bowlspoon for stirringsifterlarge cooking pot. . . . . . Knead again, this time putting pressure in the centre of the dough so that it forms a ring.\u00a0Cut apart the ring.\u00a0Then slice ~1cm thick pieces.. Bring the 2 quarts vegetable stock to a boil and add the slices of dough.\u00a0Stir occasionally so that they don't stick together.Turn the heat down and let simmer for 20 minutes.\u00a0After 20 minutes, remove pieces from the pot and place on a plate to cool.. Seitan is best sauted before eaten.\u00a0 It's yummy in stir-fries or try it in fajitas.\u00a0\u00a0 Seitan can be stored \"as is\" in the freezer, I usually divide the pieces up into meal size portions and store it in a freezers bags.\nRead the question below and select from the following choices.\nA: What You Will Need\nB: How to Make Bulgogi\nC: Melt Your Chocolate\nD: \"Paint\" the Sides of the Mold", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_44_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_14.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_44_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_27.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_44_28.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Making the Egg.\nB: Cheers to Valentine's Day!\nC: Preparing the Oven for Broiling\nD: Broiling the Biscuits", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Getting Ready and Mixing It Up.', 'Wait a Few Minutes and Start on the Egg Mcmuffin.', '@placeholder', 'Make the Sandwich.']", "context": "Here is the context of these images:\n. Ingredients 2 cups flour (all purpose or whole wheat) 1/3 cup oil 2/3 cup buttermilk 2 tsp baking powder 1 tsp baking soda Pinch of salt. 1 - 2 eggs per Mcmuffin. 1 tsp water per Mcmuffin. 1 small dab butter per Mcmuffin. 1 slice cheese per unit (or make your own) 1 slice luncheon meat per unit Mayonaise. (Tons of instructables on making home made mayonnaise) Utensils: Stove capable of supporting 425 degrees F. Microwave Microwave proof egg containers Biscuit cutter Medium to large cookie sheet pan. Large sturdy wooden spoon Rolling pin or equivalent. 
1 mini food processor Large bowl to combine measure 1 tsp measuring spoon 1 cup measuring cup 1 tbl measuring spoon. \n Preheat oven to 425 degrees Fahrenheit. In the large mixing bowl add the dry ingredients (salt, baking powder, baking soda, and flour) stir around to evenly distribute them. Take the cup and fill it one third full of oil. Does not have to be perfect but close is important. Fill the rest of the cup with buttermilk. Pour that into the flour mixture. Stir it all until it becomes a dough and is in a ball. Do not over mix. Take the ball and spread it out evenly as much as you can on a prepared counter, You can either use your hands like I do or use the rolling pin.\u00a0 Use a biscuit cutter ( I use a tin can with both ends removed) to make biscuit dough shapes Put in the baking pan. Between the oil and the buttermilk, you should not have to grease the pan. Though it would not hurt to do it is the first time you do this recipe. Note: I used whole wheat flour instead and that accounts for the darker color dough in the next step).. By this time the oven should be ready. Notice the time. Put the cookie pan in the oven. Keep an eye on the biscuits so they does not burn. Should not take more than 12 minutes at most. Watch carefully!! The edges of the biscuits will plump up and turn brown. After 10 to 12 minutes put on the cooking mitt and remove the cookie pan. Note: Biscuit cutter was made from a small mushroom can.\u00a0. Yes, put the cookie pan on top of the stove or somewhere safe from the hot bottom of the cookie pan so the biscuits can cool and rest a bit. While that is cooling, lets put together the other parts. If you want to make your own cheese, see https://www.instructables.com/id/Our-pizza/. I will probably use just a slice of store bought cheese for this set up. Take the slice of cheese and cut it with the biscuit cutter. Do the same with the luncheon meat. Note: I probably could of made the biscuits a bit bigger and thicker.. 
Get your microwave safe little bowl and crack one egg. then add one teaspoon of water plus a dab of butter. Stir well. Cover the egg container, but do not seal it closed. Cook in the microwave about 45-50 seconds per egg.\u00a0 Your microwave may vary. The egg should be well cooked and firm.. Take one of the biscuits and slice it into two disks so to speak. Add mayo, mustard, or whatever on the inside sides of the biscuit. Add the micro-waved egg, meat, and cheese. MMMM goody goody! Coming Soon: Beat the cost of a fast food hamburger.. if you like biscuits and gravy like I do, if you have time you may want to go one step further. Take several slices of bacon or other meat that will make grease when you fry it. Fry the meat, but keep the stove on. (substitution: take solidified fat, lard, or butter) Remove the meat (if you did not use the fat or butter) and put on some paper towels. Remove all the oil except for about a tablespoon and a half. Add about a tablespoon and a half of flour. mix well with a heat proof whisk. till it is light brown. Add a cup or so of milk. Stir till you get a nice thick brown slurry. Add a little black pepper and mix it in. Pour\u00a0 the mixture on the biscuits. Crumple the fried bacon on top. (if you fried the bacon) Eat and you are in high heaven.. Biscuits do not have to be round. 
You can make them any shape you want.\nRead the question below and select from the following choices.\nA: Making the Egg.\nB: Cheers to Valentine's Day!\nC: Preparing the Oven for Broiling\nD: Broiling the Biscuits", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_45_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_45_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_45_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_45_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_45_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_45_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_45_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_45_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_45_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_45_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_45_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_45_11.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Lazy Valentine's Brownie\nB: Add Marshmallow Fluff and Brownie Mix\nC: Prepare the Flour\nD: Measure", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Melt the Chocolate', '@placeholder', 'Making the Brownie Mixture', 'Baking the Brownie']", "context": "Here is the context of these images:\n. 1. Plain Flour: 1 1/4 cups 2. Baking soda: 1/2 teaspoon 3. Salt: 1/2 teaspoon 4. Chocolate chips: 1 cup 5. Unsalted Butter: 150 grams6. Vanilla extract: 3 teaspoons7. Cocoa Powder: 3 tablespoons8. Yogurt: 3/4 cup 9. Skim milk: 1/4 cup 10. Sugar: 1 cup . 1. Add the chocolate chips and butter to a microwave safe bowl and microwave for 1 min2. Mix the melted chocolate and butter together until combined3. Add the vanilla extract and cocoa powder to the chocolate mixture and combine to form a smooth batter. 1. Add the flour to a mixing bowl2. 
Add the baking soda and salt to the flour 3. Mix well and set aside . 1. Add yogurt, milk and sugar to a bowl and mix together2. Add the chocolate mixture to this and combine 3. Add the plain flour in and combine everything using a whisk to form a cakey batter . 1. Line a baking dish with baking paper and grease with butter or oil spray 2. Pour the brownie mixture into the baking dish3. Bake the brownie at 180 degrees for 35 mins, let it cool down and enjoy!\nRead the question below and select from the following choices.\nA: Lazy Valentine's Brownie\nB: Add Marshmallow Fluff and Brownie Mix\nC: Prepare the Flour\nD: Measure", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_46_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_46_20.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", 
"visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Spicy Chicken Curry\nB: Put Them Together!\nC: Prep Ingredients\nD: And Enjoy!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Put It All Together', 'Enjoy']", "context": "Here is the context of these images:\n. Here are the ingredients you will need2 Chicken breast or any meat you have 1 Onion- chopped 1 can of Chopped tomatoes or 5 large tomatoes- cubed 1 tablespoon of Tomato puree 2 Scotch bonnet (habanero) 1 Red Bell Pepper (Tatashe) 1 tablespoon of vegetable oil I clove of garlic (optional) Curry Thyme 2 stock cubes Salt to taste. Chop bell peppers and onions into cubes Finely chop 2 Habanero peppersThen using a different chopping board cut the chicken breast into bite sized chunks. Heat up the vegetable oil in a large pan (you need as little oil as possible-just enough to stop the chicken from sticking to the pan).When the pan gets hot, fry the chicken for 3 minutes or till in turns white.Then add the peppers, onions and garlic and continue to cook till the chicken is golden brown.Then add the scotch bonnet, curry, thyme and stock cubes.Then add the tomato puree and stir fry.After 1 minute of stirring, add the can of chopped tomatoes and salt to taste, then simmer on medium heat for 10 minutes or till the tomatoes are cooked.. 
And voila all done!!!Serve as a dip for chips, fries, baked potatoes or on a bed of steamed rice and enjoy :) Nemi.\nRead the question below and select from the following choices.\nA: Spicy Chicken Curry\nB: Put Them Together!\nC: Prep Ingredients\nD: And Enjoy!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_47_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_47_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_47_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_47_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_47_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_47_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_47_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_47_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_47_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_47_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_47_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_47_11.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Crispy Fried Tofu\nB: Marinade Chicken\nC: Tofu\nD: Make Dry Mixture", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Oil', 'Golden', 'Done!']", "context": "Here is the context of these images:\n. First off grab your tofu, firm or extra-firm works best. Cut it into small cubes with a sharp knife. Blot it with a paper towel to remove excess moisture. Tip: Store the leftover tofu in a container filled with fresh cold water and keep it in the fridge. Change the water every day to keep your tofu fresh.. Turn your stovetop on high and heat up some oil (enough to generously cover the bottom) in a pan (I used extra virgin olive oil). Use this time to pick out some seasonings for your tofu. 
I went with just salt, and some Japanese mixed spices (the one I used is called S&B - Nanami Togarashi and includes\u00a0chili pepper, orange peel, sesame seeds, Japanese pepper, ginger, and seaweed).. To test if your oil is hot enough drop one small piece of tofu into the pan and if it bubbles, it's ready. Drop all your tofu in and cover the pan so that oil doesn't splash everywhere and stir occasionally (stir gently in the beginning).. Add in your seasoning to taste and stir.. Eventually the tofu will start to brown. Check often and stir to make sure they aren't burning. It's ready when the cubes have shrunk to about \u00be of the original size and they are evenly golden brown on the outside. You may also prefer to cook them less.. Pour the tofu cubes into a bowl lined with paper towel and let cool slightly before serving. Enjoy!\nRead the question below and select from the following choices.\nA: Crispy Fried Tofu\nB: Marinade Chicken\nC: Tofu\nD: Make Dry Mixture", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_48_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_48_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_48_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_48_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_48_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_48_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_48_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_48_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_48_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_48_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_48_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_48_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_48_12.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Cheescake Cookies\nB: Adding Jam\nC: Ingredients and 
Supplies\nD: Shape the Dough", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Color the Dough', '@placeholder', 'Cut the Cookies', 'The Final Cookies']", "context": "Here is the context of these images:\n. I mostly love this cookie recipe because it is a one bowl cookie dough! You will be kneading and mixing in food coloring so the dough will be well mixed without needing to sift the dry ingredients separately.Icebox Sugar Cookie\n1 c. butter\n1 c. sugar\n1 egg\n1 t. vanilla\n2 c. flour\n1 1/2 t. baking powder\nIn a large mixing bowl cream butter and sugar until combined. Add egg and vanilla mixing well. Sift flour on top of wet mixture and before stirring add the baking powder. When you start to combine the wet and dry ingredients, the flour and baking powder will incorporate well enough throughout the dough\nDump dough onto a piece of waxed paper and divide dough into six equal pieces. You can obviously use more or less colors but I chose six.. After dough is divided choose food coloring colors and mix into dough using your hands. I find that there is enough butter in the recipe that your hands don't take on the food coloring but you could use gloves if you want make sure you don't have rainbow hands when you are finished!. Turn your imagination on high and start making your patterns. This is a relatively easy process but it does take some time and patience.\nTo make a bullseye shape:\n1. Choose a color and form a cylinder.\n2. Roll out another color large and long enough to wrap the cylinder.\n3. Gently press or squeeze dough to make sure the pieces stick together well.\n4. Keep wrapping with colors until you have the bullseye you want.\n5. Roll the completed bullseye into a longer log shape.\n6. Cut in half, thirds or as many as needed.\nTo make a flower shape:\n1. Choose a color that will be the center of the flower and form a cylinder.\n2. Roll out another color large and long enough to wrap the cylinder.\n3. 
Gently press or squeeze dough to make sure the pieces stick together well.\n4. Make a coil of dough and pinch the top to make a triangular shape. Repeat for amount of petals you want.\n5. Stick the triangles on the sides of the covered cylinder.\n6. Press another color of dough in between the triangles.\n7. Wrap the entire cylinder again with an outer color.\n8. Cut in half, thirds or as many as needed.\nThose are the two basic techniques I used but be creative and make anything you want!. Once you have all of the patterns of dough made, arrange them together to get the final pattern that will be the finished cookie.\nRoll patterns into longer logs if you want the pattern to be smaller in the final cookie. If necessary cut logs so they are all the same height.\nWrap in plastic wrap and freeze until hard - 2 to 3 hours.. Preheat oven to 350 degrees F.\nRemove cookie dough from the freezer and slice in 1/4 inch pieces. Repeat with all of your patterns (if you have more than one).. Bake for 7-9 minutes depending on how soft or crisp you like your cookies. Remove cookies from oven and let cool on baking sheet.\nColor will not fade while baking.. Eat and enjoy! 
Yum!\nRead the question below and select from the following choices.\nA: Cheescake Cookies\nB: Adding Jam\nC: Ingredients and Supplies\nD: Shape the Dough", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_49_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_27.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_29.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_49_30.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_49_31.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Chocolate Shadow Cake\nB: Bake the Cake\nC: Mixture\nD: Cut the Cake Board for the Top of the Cake", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Optional', \"Fill 'the Box' With Chocolates\", '@placeholder', \"Lean 'the Lid' on the Box\"]", "context": "Here is the context of these images:\n. \nrectange cake board\ncircular cake boards\nspatula\ncake mix or your favorite scratch recipe (see my\u00a0Old fashioned sour cream fudge recipe\u00a0below)\nheart shaped cake\u00a0pan\ncake release\nrolling pin\nsaran wrap\nred or pink pearl dust\nclean (new) make- up brush\npliers\ndowel\nscissors\nsharp knife\nblack marker\nroller cutter (optional)\nred gel paste food coloring (if using white fondant)\nfondant ( you can use white and color or purchase red fondant)\ncandy cups\nchocolates (at least\u00a024\u00a0 )\nfood wrap and tin foil to cover\u00a0cake board\u00a0(optional)\nyour favorite buttercream icing (see my favorite below)\ngumtex or tylose or use\u00a0gumpaste insteadOld Fashioned Sour cream fudge cake\u00a0\u00a0Ingredients:AmountIngredient2 \u00bc cupscake and pastry flour2 tsp.Baking soda\u00bd cupbutter, softened2 \u00bc cupsfirmly packed brown sugar1/2 tspsalt3eggs1 1/2 tspvanilla1 cupboiling water3 ouncesbakers unsweetened chocolate (melted01 cupsour cream (cooled)\u00a0 \u00a0 Directions:Sift together flour, baking soda and salt; set aside. Cream butter. If you use salted butter (skip the salt). Gradually add brown sugar and continue beating for 5 minutes. Add eggs one at a time, beating well after each addition. Add vanilla and chocolate. Alternately blend in flour mixture and sour cream, one third at a time, on low speed of electric mixer. 
Add boiling water; blend well. (Batter will be thin.) Pour into one greased and floured, waxed paper lined 9 \u00bd inches layer pan. Bake at 350 degrees for 35 to 40 minutes, or until cake tester inserted into center comes out clean. Cool in pans for 10 minutes. Remove and finish cooling on racks.Optional Filling: Kirsh Cream with Strawberries\n\t250 ml. Heavy cream 250 g. chopped strawberries (about 1 \u00bd cups)\n\t1 to 1 \u00bd tbsp. Kirsh cream or any other\n\tfruit liquer.\n\tBeat cream until whipped. Fold in strawberries and liquer and fill cake. \n\t\u00a0\u00a0Frosting: 5 squares Unsweetened Chocolate \u00bd cup butter, softened 1/3 cup water 3 cups icing sugar 1 egg Melt chocolate with butter and water over low heat; cool. (Mixture may appear curdled.) Add icing sugar and egg. Blend; then beat on low speed of electric mixer for 2 minutes. Chill until of spreading consistency.\u00a0Alternative Frosting (Bittersweet Chocolate Frosting): Amount is for a wedding cake therefore cut in half. 1 lb. Bittersweet chocolate, chopped \u00be cup heavy cream 3 tbsp. Unsalted butter In medium saucepan, boil water. In medium steel bowl combine approximately 2/3 of the chocolate and cream. Place bowl over saucepan and sir frequently until melted and smooth. Remove from heat and stir in remaining chocolate until smooth. Gradually beat in butter, 1 tablespoon at a time. Let stand until cooled to room temperature. \u00a0Bittersweet Chocolate Whipped Cream Buttercream IcingIngredientsPart One 1 lb. powdered sugar (sifted) 2 1/2 cups Crisco, 4 oz melted bittersweet chocolatePart Two 3/4 cup granulated sugar 1/2 tsp. salt 2 TBSP. Meringue powder (add 1 additional TBSP for slight crusting) 1/2 cup BOILING water (less 2 TBSP) 1 TBSP Vanilla (or flavor of your choice)InstructionsPart one... put crisco in bowl and gradually add powdered sugar. Beat about 5 minutes until mixture is very creamy and fluffy then add melted chocolate squares.\u00a0Set this aside.Part two... 
In a very clean bowl mix dry ingredients. Add BOILING water and immediately mix on high speed. Beat until stiff peaks form, about 8 minutes. When mixture begins to get stiff add flavoring.NOW combine both mixtures\u00a0and beat together for another 8 minutes. When finished, use a rubber spatula to down beat a little to remove some of the air bubbles. Frosting will be very light and creamy. Cover. DO NOT REFRIGERATE.The frosting may be kept at room temperature for 3 months. Whip with a spoon each time you use it to restore fluffiness.. Optional: Line the cake board with tin foil and food safe plastic wrap (this is not necessary but makes it easier to wipe messes off the board) I usually use neutral gold or silver gift wrap I purchase at Michael's , but I had run out. . Bake 2 heart shaped cakes. I always use a generous amount of Cake Release to prevent the cake from sticking. Level the cake, but cut \u00a0the one for the top (lid) of the cake a little shorter\u00a0than the bottom and place it on a circular cake board. Put it aside.\nPut the bottom cake on the main rectangular cake board. Fill the bottom cake \u00a0with filling of your choice (this is optional). Ice the cake, being sure to fill in the area where the cake was cut to fill, if you filled it. This doesn't have to be a thick layer covering everything, only a crumb coat. If a few crumbs mix in, it's not a big deal. Smooth as best as you can.\nRepeat for the top of the cake. It is important to get the top of the cake very smooth, as you will be placing fondant on top of it.\nTip: Take a metal spatula, soak it in boiling water (I use a pot on the stove)\u00a0 and wipe the\u00a0water off on a clean tea towel, then\u00a0smooth the icing with the dry hot spatula over the surface of the cake.\u00a0\u00a0Then remove excess icing off spatula. \u00a0Keep repeating until your cake is smooth. Add\u00a0about 2 Tblsp of \u00a0tylose or gumtex to\u00a0your\u00a0fondant, roll out and cut into a long strip. 
Alternatively, you can use gumpaste that can be purchase at Michael's craft store or any cake decorating store. But you will still have to color it.\nMake sure the strip is wide enough to go about a 3/4 of an inch above the cake (measure with chocolate on top) and let\u00a0the strip\u00a0dry for about 15 minutes. It needs to be dry enough so it won't sag or droop.\nCarefully\u00a0place the\u00a0strip (you will\u00a0likely need 2) \u00a0around the cake,\u00a0and close the seam at the back with a little water.\u00a0. If you have a\u00a0sugarcraft gun,\u00a0 then use the rope attachment to make the rope border.\nIf you do not, then roll out 3 narrow strips with the flat of your\u00a0hands and twist the pieces\u00a0\u00a0together. Don't worry if it doesn't go all the way around. You can do it in pieces and use a little water to 'glue' it together - it won't be noticeable.\nThen 'glue' the strips on the cake with a\u00a0 little water. Do a little strip of rope\u00a0for the seam at the back. And you will also do this for the top of the cake when the time comes. . Fill the surface of the cake with chocolates in candy cups (you can buy at Michaels;) . You will need at least a 24 chocolates. . Outline the circular cakeboard and cut to fit under the top\u00a0cake. You will need this to support the cake. . Roll out colored fondant (1/8\" thick ) and cover the top of the cake. I usually just guage how much I need by looking at it. But you can tell approximately how much you'll need be measuring the cake across and adding a couple inches all around. You can cut off the excess and reuse. If you have no idea how to smooth fondant on a cake, google it - there are lots of tutorials. Some prefer a smoother, but I use my hands (wedding ring off!)\nPlace on cake, smooth and trim.\nTip #1: Stick\u00a0the top of the cake\u00a0in the freezer for 10 minutes while you roll out your fondant - this makes it easier to cover with fondant. 
Don't leave it longer than 10 minutes!\nTip#2: To transfer the fondant, I roll it up a little with my rolling pin and gently unroll over the cake. . \nRoll out the remaining \u00a0fondant with gumtex or tylose (or gumpaste) \u00a0as thinly as possible (as least half as thin\u00a0as you rolled it to cover the cake)\n\u00a0Cut two lengths of the\u00a0fondant (or gumpaste) \u00a0the same length and width. These will form the loops. I generally cut mine around 7.5 cm/3 inches wide and about 15 cm/6 inches long. The length of these loops will determine the size of you bow, so If you want a bow about 10cms/4 inches long the loops will need to be a little more than double that length when you cut them. Its a little bit of trial and error, but the length can be adjusted after they' ve been cut quite easily.\nTurn one loop piece over and pinch the ends together, then do the same with the other end, and pinch the two ends together. Put some saran wrap in the bow pieces to set it in place.\nRoll out the tails of the bow in the same manner as the loops but make them a little thinner, maybe \u00be of the width of the loop pieces. Cut the ends at a 45 degree angle. Pinch them as you did the loop piece.\nMake the centre knot with another piece of fondant, rolled and marked in the same manner as the other pieces, but only make it about \u00bd the length of the tail pieces. The knot is just a short strip (maybe 1' by 1\") and it is just wrapped around all the other scrunched up ends so that there arent any rough edges showing. It doesnt need to go all the way around the back of the bow, just tuck the edges under so they dont show.\n\"Glue\" the pieces together with a little water on a paint brush\nCut a long, narrow strip and put directly on the cake\nDry brush on the red or pink pearl dust (I use a never used new make-up brush).\nThen place the bow on the cake on top of the narrow strip.. 
Take a wooden dowel (I use the wooden ones you can buy in art section at Michaels and boil it) , cut to size with pliers and sharpen with pencil sharpener. It should stick out about 1- 1 1/2 \"\u00a0above chocolates. Carefully place the top of the cake onto the sharpened dowel. You may need to poke a little hole in it from behind first (through the back and into the cake board.) You want it resting just above the rim of the bottom cake, so it doesn't put weight on the rim and wreck it.\nServe and enjoy!\nRead the question below and select from the following choices.\nA: Chocolate Shadow Cake\nB: Bake the Cake\nC: Mixture\nD: Cut the Cake Board for the Top of the Cake", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_50_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_20.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_50_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_27.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_29.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_30.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_31.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_50_32.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Cowboy Beans\nB: Prep the Beans\nC: Looking Good\nD: Popeye's Red Beans and Rice...Hacked!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Boil Water', 'Saute the Beans', 'Serve']", "context": "Here is the context of these images:\n. green beans (figure 1/4 pound per person)almond slivers2 tbsp unsalted buttersalt and pepper. To prepare the beans, the stems and ends of the beans must be either cut, or pulled off. Full length green beans are also a bit unwieldy to eat, so it's a good idea to cut or break them in half.Place the prepped beans into a bowl or colander and rinse under cool running water.. Bring a large pot of salted water to a rolling boil.. Cook the green beans in boiling water for 5 minutes.. Strain beans into a colander and run them under cold water. Better yet, place them in an ice bath in order to fully stop the blanching process.If you are cooking the beans ahead of time, you can take them and put them into the fridge at this point.. Saute the green beans in 2 tbsp of unsalted butter. Salt and pepper the beans to taste. 
I like to add in some (around 1/4 cup) of almond slivers just before the beans are finished cooking. . Plate the beans, serve and enjoy!\nRead the question below and select from the following choices.\nA: Cowboy Beans\nB: Prep the Beans\nC: Looking Good\nD: Popeye's Red Beans and Rice...Hacked!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_51_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_51_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_51_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_51_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_51_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_51_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_51_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_51_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_51_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_51_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_51_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_51_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_51_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_51_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_51_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_51_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: under\nB: Call Me Bloody\nC: Drain and Cool\nD: Color Me Red", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Dredge', 'Fry', '@placeholder', 'Serve']", "context": "Here is the context of these images:\n. This is probably going to be the hard part.Snakes do a fine job keeping the world free of unnecessary rodents; don't kill them unless absolutely necessary! 
That said, if you do kill a snake, or find one dead, don't let it go to waste.The snake in this Instructable was run over by a car; Eric found it a couple minutes later, its heart still beating, in the process of expiring by the side of the road. Since we knew both time and cause1 of death, and refrigerated the carcass promptly, it was safe to eat. A bit of internet research identified it as a probable Black Rat Snake, a non-poisonous Indiana resident. 1 Note that snakes can also die from eating poisoned rodents. You dont want to eat a snake dosed up with warfarin or other toxin2. Pay attention to context.2 It's apparently fine to cook and eat poisonous snakes- cooking is sufficient to inactivate any venomous residue. . Cut off the head, strip off the skin, and remove the guts as described in this Instructable.Rinse the carcass, and wipe down with a clean paper towel, then cut the body in to manageable lengths with a sharp knife or pair of poultry shears. . We're going to treat the snake much like you would a small lake fish, though you can also treat it like chicken. This is my favorite way to cook bluegill. \nI dipped the segments in a bit of egg white (milk would also do) before dredging them in a pepper and sweet cornmeal mix (actually just Jiffy mix with some extra black pepper).\nKnock off the excess.. Heat about 3/4\" of canola, vegetable, or peanut oil in a heavy frying pan (I prefer cast iron) until quite hot. A bit of dry batter should bubble nicely.\nAdd the snake pieces one at a time to avoid dropping the temperature in the pan too quickly. \nUse tongs to keep your fingers away from the sizzling hot oil, watch for dangerous splatters, and use a screen if necessary to prevent mess.\nTurn the snake pieces just as the batter begins to turn golden- by the time it starts to brown the snake will be overcooked. There's not much meat on the bones, and the muscles are thin and lean. (Yes, we mostly overcooked ours, but it was still tasty.). 
Remove the snake pieces before they're quite done- they'll continue to cook after removal from the pan- and set them on paper towels to drain and cool.\nIf you've still got more batter, chop up some veggies, dip them in the egg whites and/or milk, dredge in batter, and fry. You can also just mix the liquid into the batter and fry hushpuppies. It's all good.\nWe fried some fresh okra from the farmers' market.. Serve your fried snake bits warm, and provide napkins- this is finger food. Accompany with most anything you'd serve with fried fish. \nThere should be a line of muscle along either side of the spine; this is the thickest piece of meat on the snake's body. The ribs are quite firmly attached to the spine, so scrape your teeth over them firmly to remove the rest of the meat from the ribs.\nSince our snake was a bit overcooked it mostly tasted fried, but some of the thicker bits had a distinctive nutty snake flavor. I'm definitely looking forward to getting my hands on another (hopefully bigger) snake and trying this again!\nRead the question below and select from the following choices.\nA: under\nB: Call Me Bloody\nC: Drain and Cool\nD: Color Me Red", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_52_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_52_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_52_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_52_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_52_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_52_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_52_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_52_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_52_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_52_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_52_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_52_11.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", 
"visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The 5 Minute 35¢ Pizza\nB: Da Bac\u00f6n\nC: Moar Paper\nD: Finishing Touches", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Preparation', 'Cooking', '@placeholder']", "context": "Here is the context of these images:\n. For this pizza, you will need...\n -2 english muffins\n -Cheese of your choice (I used mozzerella and cheddar)\n -Topping (optional, I used the classic pepperoni)\n -Microwave (duh)\n -Freezer (optional)\n -Ketchup. Cut the english muffins in half by piercing the sides with forks, add the cheese and topping (whatever they may be) and arrange on a plate.. Put the pizza muffins in the microwave for 1- 1 and a half minutes. Cooking times might be different depending on the kind of microwave you have, so just try it out, and see how long it takes to melt the cheese. Once te cheese is melted, take it out of the microwave and marvel at the cheesy goodness.. I heard everyone reading step one, you all said, \"What do we need a freezer for??? Is he out of his mind?\" I assure you, I am out of my mind, but not on this subject. Put your pizza muffins in the freezer for about a minute to aid in the cooling process, it makes them cool quickly, but not too quickly.. Cut your pizza muffins in half and add ketchup to the side. Viola! Enjoy the cheesy goodness you have been craving. 
adios amigos.\nRead the question below and select from the following choices.\nA: The 5 Minute 35¢ Pizza\nB: Da Bac\u00f6n\nC: Moar Paper\nD: Finishing Touches", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_53_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_53_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_53_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_53_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_53_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_53_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Outback \"Copy Cat Recipe\" From Aunt Jo\nB: Next, Make a Bath\nC: Fold and Crimp\nD: Cut the Retention Band", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Cut Top Off', '@placeholder', 'Fold Flaps', 'Fold Down Retention Band']", "context": "Here is the context of these images:\n. You'll need one standard milk cartonbox cutterpencil or permanent markerruler. Measure 1 cm down from the top edge of the carton body. Mark an horizontal line on each side. Be careful, knives are sharp! Cut along these lines through all four sides. Remove the top and discard.. Measure down 2.5 cm from the new top edge. Mark an horizontal line at this point on three sides. Cut along line through three sides. Leave band attached to one side of carton body.. Measure up 9 cm from bottom of carton, make a mark at corner junction. Make a cut from this mark, through the corner up to the band.. There should be three flaps and one flap with an attached band.. Fold three side flaps down evenly into the cavity of the carton.. Finally fold down the last flap with the retention band. Work the band down around the side of the carton to form the closure. And Voila!This is my very first instructable! Please comment! 
And I hope you will enjoy!\nRead the question below and select from the following choices.\nA: Outback \"Copy Cat Recipe\" From Aunt Jo\nB: Next, Make a Bath\nC: Fold and Crimp\nD: Cut the Retention Band", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_54_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_54_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_54_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_54_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_54_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_54_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_54_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Classic Hot Ham and Cheese With a Garlic Italian Twist\nB: Add the Two Creams\nC: Boiling and Stirring Until Thickened\nD: Drain the Pastaand", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Preparing to Cook the Diced Chicken Breast', '@placeholder', 'Tossing to Coat Pasta With Sauce', 'Plate and Serve']", "context": "Here is the context of these images:\n. Cook 2 cups of whole wheat Penne pasta according to directions on package. Drain well and set aside. While you are boiling the pasta you can mix the spices, including the corn starch in a small bowl, then set aside. If you have allergies to corn, than you can use arrowroot starch. You will find this starch in your local \"Whole foods Market\" or other local health food store. The spices you will need: 1 tablespoon sun-dried tomatoes flaked or powder 1 teaspoon coarse sea salt 1 teaspoon marjoram 1 teaspoon dried thyme 1 teaspoon dried oregano 1 \u00bd teaspoon sugar 1 tablespoons dried onion 1 teaspoon corn starch or arrowroot powder \u00bd teaspoon black pepper \u00bc teaspoon garlic powder You may not find flaked or powdered sun-dried tomatoes. 
Just buy whole dried sun-dried tomatoes and place one or two small dried tomatoes in a food processor and process to flakes or powder. It is your preference. By the way the way the colander in the image was part of a prize I won here at Instructables. It included this colander, a Instructable T-shirt and a 257 page book called \"GLUTEN - FREE ON A Shoestring\" (prize value was worth $65.00). The book includes 125 easy to make gluten free recipes. You can view the Instructable I entered in the contest, then you can do so by linking here: Almond Flour Honey Cake. If you haven't entered a contest here at Instructables, you really need to do so. It is a lot of fun. But first on to the next step ------------------->>>. You will need 1 pound of chicken breast, and cut it into 1 inch chunks. One pound is usually 2 or 3 breast.. Melt 2 tbsp. of unsalted butter in a large skillet over medium-high heat. Add the diced chicken; cook and stir 5 minutes or until lightly browned. You should not see any pink remaining in\nthe meat before proceeding to the next step. . Next add spices, then a 14.5 oz. can of petite cut tomatoes, undrained. Next add 1 cup of cream or half and half followed by 1/2 cup of fresh grated Parmesan-Rigatino cheese. Mix all the ingredients until well incorporated. Bring\nto a boil, and stirring constantly until well blended and starts to thicken some.\nReduce the heat to low and simmer for 5 minutes.. . After the sauce has thickened stir in pasta and...... . toss gently to coat. . cover the pasta mix and let stand with heat off for 5 minutes.. \n Plate and serve with additional cheese, if desired. Prepare your favorite salad to accompany the meal. We have more recipes here at Recipes for a Healthy You as well as here at Instructables. Thanks always for taking the time to view my Instructables. 
Here is one that I entered into a contest, I would appreciate your vote: Blinding Baking Pie Crust [includes pie recipes] Follow us on Twitter Like us on Face Book Join our group at Google Check out what we've pinned at Pinterest Thanks for your time and viewing our Instructable. Eat and Be Healthy!! Regards, Randywww.savorthefood.com Click image and: \u00a0\nRead the question below and select from the following choices.\nA: Classic Hot Ham and Cheese With a Garlic Italian Twist\nB: Add the Two Creams\nC: Boiling and Stirring Until Thickened\nD: Drain the Pastaand", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_55_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_55_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_55_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_55_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_55_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_55_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_55_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_55_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Curried Sweet Potato Soup With Bacon\nB: Moroccan Three Bean and Kale Soup\nC: Clean the Kale\nD: For the Pros Out There", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Dice the Onion', 'n', 'Results']", "context": "Here is the context of these images:\n. 
1 Large\u00a0(or 2 small) Bundles\u00a0of Kale, Torn into Bit Sized Pieces\u00a0 10 Whole Red Potatoes, Sliced Thin\u00a0 1 Medium Onion, Diced\u00a0 2 Tbsp Olive Oil\u00a0 2 Cloves of Garlic- Minced 1-1/2 Pounds Italian Sausage\u00a0 1 Tbsp\u00a0Crushed Red Pepper\u00a0(adjust to taste) 1/2 Tbsp\u00a0oregano\u00a0 2 cups Chicken Broth\u00a0 2 cups Whole Milk\u00a0 4 cups Half and Half Splash of Heavy Cream\u00a0 Salt and Pepper to Taste **Ignore the flour in the picture, there is no flour in this recipe. For some reason I had a brain fart when taking this picture and added it in. **. First tear kale into bit sized pieces and rinse with cold water. Set aside.\u00a0. Thinly slice the red potatoes. Boil sliced potatoes until tender. Drain. . Heat oil over medium high heat, add minced garlic and onion and cook until slightly browned. About 3-5 minutes.\u00a0. Add italian sausage and crumble while it cooks. Drain off as much fat as possible.\u00a0. . Add Whole Milk and Half and Half, let simmer for about 30 minutes.\u00a0. . I like to only add about 1/3 of the kale and then save the rest of it for right before I heat up the left overs so I have some semi-crunchy kale in the soup as well as cooked kale.\u00a0. Add a splash of Heavy Cream at this point, unless you are making this soup ahead of time or for left overs, and then leave the added heavy cream for right before you eat your serving of soup.\u00a0. This soup is amazing for those cold winter nights. It's filling and warm and so delicious. 
Who needs to go to Olive Garden when you can whip this soup up quickly and have leftovers for many bowls to come.\u00a0\nRead the question below and select from the following choices.\nA: Curried Sweet Potato Soup With Bacon\nB: Moroccan Three Bean and Kale Soup\nC: Clean the Kale\nD: For the Pros Out There", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_56_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_56_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Soft Pretzel Bites\nB: Combining Ingredients.\nC: Ingredients\nD: Egg Wash (Optional..but Optimal!)", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'The Dough', 'Boiling Your Pretzels', 'Baking and Finishing']", "context": "Here is the context of these images:\n. 
1 package of yeast1 1/4 cup of warm water (110 degrees)1 tsp sugar3 cups of bread flour3 tablespoons brown sugar1 tablespoon butter melted1 tsp salt6 cups of water1/2 cup of baking soda1 /2 cup sugar1 tablespoon cinnamon1/2 cup butter melted. Lots of people are afraid to cook with yeast. It is really easy to do. Mix the yeast, 1 tsp of sugar and 1 1/4 cup of warm water. Make sure the water is around 110 degrees. Also check the date of the yeast. It may not work if it is old. You are going to let it rest for 10 minutes. It will start to look like it has bubbles.. Mix the yeast mixture, brown sugar, salt and 1 tablespoon of butter. Mix this with the wire whisk attachment. Now start adding the flour. When it starts to separate from the bowl, change the attachment to the hook. Now finish mixing with the hook. Finish mixing with the hook and add the rest of the flour. It will be sticky but easy to pull out of the bowl.. Put a little oil in your bowl and cover the bottom and sides. Now add the dough and cover with plastic wrap. Put the bowl in a warm spot. Let it rise for a hour.. When the dough has risen you are going to heat your water. Put 6 cups of water in a large pan with the baking soda. Let the water start to heat. Turn your oven on to 425 degrees. Also put a piece of parchment paper on your cookie sheet. Take the dough out and cut in to four pieces. Now roll out each piece in a strip that is around 30 inches long. Cut each rope into pieces an inch wide. Drop the pieces into the hot water for 10 seconds. Drain and put on a cookie sheet with parchment paper.. Bake the pretzel bites for 10 minutes. After you take them out of the oven melt the butter in a glass bowl. Put the pretzel bites in the butter and then drain and put in a separate container. Mix the sugar and cinnamon. Put the pretzel bites in a brown paper bag and pour in the sugar mixture. Shake and put them in a container with a lid. Try not to eat them all while they are warm. 
They taste just like Auntie Anne's.\nRead the question below and select from the following choices.\nA: Soft Pretzel Bites\nB: Combining Ingredients.\nC: Ingredients\nD: Egg Wash (Optional..but Optimal!)", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_57_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_57_18.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Press/form\nB: Lumberjack Cookies C. 1917\nC: Bacon Cheesecake Brownies\nD: Video Recipe and Youtube Channel", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['\"Cream\" the Bacon Fat', 'Add Flour', '@placeholder', 'Bake']", "context": "Here is the context of these images:\n. Add 1c. Bacon fat to 2 cup white sugar, 1/4 cup brown sugar, 2 tsp salt, and 4 tbs molasses. In a stand mixer, mix on high for a few minutes.. 
Ad 2 eggs and cream again. Your batter should form stiff peaks before you add spices and flour. . Add the following to the mixture:\n1 1/2 teaspoon ground cinnamon\n1 1/2 teaspoon ground ginger\n3/4 teaspoon ground cardamom\n1/4 tsp black pepper. Add 2 1/2 cup flour, mix on high. . I had a cookie press so I decided to use that for these cookies - but as long as each cookie is about the size of a tablespoon they should be the right size. They flatten out while baking.. Bake the cookies in a 350 degree oven for 8-10 minutes - you don't want the bottoms to get burnt.. Great with milk, or ice cream. They are rich! This recipe yields many cookies, so bring them into work, or share with friends. \nRead the question below and select from the following choices.\nA: Press/form\nB: Lumberjack Cookies C. 1917\nC: Bacon Cheesecake Brownies\nD: Video Recipe and Youtube Channel", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_58_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_17.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_58_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_58_21.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Potato Volcanoes\nB: Wash\nC: Volcano Potato\nD: Deep Fry Time", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Prep the Ingredients', 'Roll and Coat', '@placeholder', \"It's Serving Time\"]", "context": "Here is the context of these images:\n. For this mouth watering snack, you'll need to include the following in your shopping cart:PotatoesBeansCarrotsGreen ChiliesBread CrumbsCorn Flour SlurryGinger Garlic PasteSalt Oil. The first and the most important thing to do is to prep all the ingredients before starting to cook. This is a pretty straight forward recipe once all the ingredients are ready.Boil a few potatoes and mash them well in a bowl. Also chop down the beans and carrots into small pieces and boil them till they're soft enough. . The next thing to do is to prepare the mixture that's going to be deep fried. Add the boiled vegetables in the bowl of the mashed potatoes and mix it well. Add salt, Red Chili Powder and a Teaspoon of cumin seeds to it. Also add in about half a cup of bread crumbs to it and mix it well.. Next part is the fun part. Take the mashed veggies and roughly make it into a rough sphere in your hands. Now take small chunks from that sphere and roll them into small balls using your hands and keep them aside. It's time now for coating the balls. Dip them in the corn flour slurry and then in bread crumbs. Make sure the balls are completely covered in bread crumbs. Coating the kebab balls is a completely optional step, but this definitely adds a lot of crispiness to the lollipops when they're fried. . 
The next step is to deep fry the kebab balls. Heat some oil in a deep frying pan. Now dust off any extra bread crumbs sticking on to the balls and slowly drop them into oil and fry it until it turns golden yellow in color. Once it's all fried properly, remove it out of the oil and drain out any excess oil from it by placing it on a paper napkin.. After deep frying the balls, it can be savored directly, but what's a lollipop without a stick.So now take a few tooth picks and gently fix them into each of the deep fried balls. Now top it up with some coriander leaves and also some chili flakes if you'd like. Serve it while hot with some tomato ketchup or yogurt dip.Bon Apetit !!\nRead the question below and select from the following choices.\nA: Potato Volcanoes\nB: Wash\nC: Volcano Potato\nD: Deep Fry Time", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_59_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_18.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_59_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_59_22.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: How to Make Your Own Sprinkles at Home\nB: The Legs and Shelf\nC: The Mould\nD: Burn Off the Inside", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['The Dough', '@placeholder', 'The Mould', 'The Glaze']", "context": "Here is the context of these images:\n. I used Serious Eats' \"Best Chocolate Chip Cookie\" recipe, but bastardized it* by adding a half cup of flour and beating the dough longer than recommended, so it would be easier to roll out. Use whatever dough you want, just make sure it's stiff enough that you can manipulate it a little.\u00a0 *I did make one attempt with the original recipe--more on that towards the end.. Make sure to use mini-chips and/or hand-chopped chocolate. If your chunks are too big, you get weird holes rolling out the dough.. I used a mini popover pan lined with strips of parchment paper for the outside, and wrapped corks in foil for for the center piece.. Not sure what would hold up the best, I experimented with another centerpiece made from a paper towel tube and masking tape (which yes, is safe in the oven).. I used plenty of flour and a pizza cutter to get my lines straight.\u00a0 My first attempt was wrapping the dough around the original shot glass, but I found it more effective to use the centerpiece that would eventually go in during the baking process. And it's much easier to get the parchment paper around before you put it in the pan. Use a real shot glass to make a circle for the bottom of the cookie shot glass, the use the aluminum foil-cork-plug to smash it all together. 
Straighten out the top edge with your finger if it gets crooked. It won't fix itself while it bakes. I greased some of them and found that (with this pan, at least) it didn't make a difference.. My preferred temperature setting was basking at 375F for 10-14 minutes.\u00a0 I tried the first batch at 350F and they were meltier and puffier, while the higher temperature made the next batch take their shape more quickly. The aluminum foil was definitely more effective than the cardboard. On second batches, I filled the cardboard with foil so the cookie wouldn't puff up inside, but the cardboard middles were a lot stickier to pull out. It's best to pull out the centers while they're still warm, before they fully set, just don't burn yourself. You can see the one that I didn't give a plug to just filled right in. Cookie Shot Glass fail.. \n My obsessive tendencies won--two days later, I had to test the glaze ideas. First, wait 'til your cookie shotglasses are completely cool before glazing. Here are the two types I tested: 1) A \"Confectioners Glaze\" -- this is what I grew up knowing as \"cinnamon roll frosting.\" A little powdered sugar, splash of vanilla, and a couple teaspoons of milk. I like to make mine thick--like a thick paint. 2) A variation on Royal Icing -- an idea inspired by my days of using Royal Icing to glue together my gingerbread mansion/castle/lighthouse/city, this stuff hardens like glue when it dries. For my little test batch, I used 1 pasteurized egg white, a splash of vanilla, beat it in a mixer til frothy, and added about 3/4 c. powdered sugar. I used clean paintbrushes to brush the inside, and also tested the pour-n'-swirl method--which wasn't really as effective since my glazes were fairly thick. I recommend continuing to re-distribute the frostings with the paintbrush as they dry, since they sink to the bottom. You can also let them dry on their sides and roll em around as you see fit. For each different glaze, I tested one single coat vs. 
two coats.\u00a0 I tested one unheated test a couple hours after applying a glaze to see if it was holding at all--and seemed to work pretty well I painted a few more, and let them sit overnight.. \n In the morning, they were dry. I reheated one of each type in the oven at 350F for 5 minutes, and then added milk: The confectioner's glaze worked great for both the single and double coats, but the royal icing was not a success (which makes sense, now that I think about it).\u00a0. The regular cookie dough (which had less flour and was beaten less, making it softer and more malleable) did not yield a successful cookie shot glass. Its structure was much looser and, while still delicious, was extremely porous and was difficult to get the foil plug out of. Also, if you don't line the outside of the cookie shot glass with parchment paper, it doesn't stick, but it does spill over the edge, making a weird, muffin-top-y-style glass. Lastly, when the bottoms of my first batch turned out thicker than I would've liked, I tried putting no bottom at all on some in the second batch, hoping that the dough would melt and drip down... ...but since I switched to a higher temperature on the second batch, this didn't happen at all, so I got a cookie spyglass instead.. Edit: leaving this here so you guys can see the failure caused by not glazing. They only kind of worked. A couple of them held milk for almost a few seconds before seepage occurred, and since this article\u00a0that revealed that Dominique Ansel uses a glaze inside the cookie to keep the milk in didn't come out until I was finished with my experimentation, I haven't had a chance to try that....yet. But I'll update you once I do.. Final thoughts: for all the work, $3 a pop for Dominique Ansel's seems pretty reasonable to me if you're in NYC. I hear they come with complimentary vanilla milk refills. 
I'll definitely be checking them out when I visit in April!\nRead the question below and select from the following choices.\nA: How to Make Your Own Sprinkles at Home\nB: The Legs and Shelf\nC: The Mould\nD: Burn Off the Inside", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_60_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_27.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_60_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_29.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_30.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_31.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_32.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_33.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_34.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_35.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_36.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_37.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_60_38.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Conchiglioni With Herbs\nB: Additional Notes/nutrition\nC: Fast Asparagus\nD: Let\u00e2\u0080\u0099s Make the Cheese Spread!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Start the Cooking', '@placeholder', 'Quick Cheat', 'Plate Up']", "context": "Here is the context of these images:\n. To make this dish for 2 you will need the following:2 fillets of sea bass, your fishmonger can do this for youSalt and pepper1 knob of butter1 yellow bell pepper, thickly sliced and the seeds removedAround 10 cherry or vine tomatoes, roughly sliced in half or left whole1 bunch of asparagus with the tough ends broken off, this will be anywhere between 10 and 15 pieces, depends on the thickness of the asparagus. 
If they are a very thick then give them a little longer or slice them length ways2 Tbsp of spicy pesto, or any pesto you like, simply from a jar100ml dry white wine1 handful of flat leaf parsley, you will only need the leaves which you can just pick offoptional, a little water in case you want to loosen the saucesome mixed leaf to your liking, you could use rocket, watercress, spinach or what ever you like in any combination you like. The vegetables with the exception of the asparagus will be cooked in a griddle pan, you want charing so this needs to be as HOT as possible, put this on full wack.NOTE: do not put oil on the griddle otherwise you will be frantically opening windows and hitting the smoke detector. You wont need oil and it will only burn which will ruin lunch and no one wants that!Put another pan on a medium heat and allow to heat up while seasoning the fish, season both sides with salt and pepper. Salt on the skin will help to give you that infamous crispy skin. You could score the skin if it is a thick fillet which will actually help the heat to permeate allowing the fish to cook quicker. But there really isn't any need.TIP: Never put put your fish into a cold pan, you want it to be up to temperature first for a nice crispy skin.. Put your peppers on the griddle, these are going to char and blacken which is just what we want, they will go soft and sweeten as the natural sugars cook.In the fish pan, put a good knob of butter, let this melt down for a few seconds and move the butter around the pan. Then, gently lay the fish in the pan skin side down - do not touch the fish or the pan now, it can be tempting to mess around with the fish but you want the skin the crisp up and the meat to gently cook.TIP: Don't be tempted to move the pan around and mess with the fish, just let it cook.. 
Keep an eye on your peppers, move them around.After 4 - 5 minutes you will see the edge of the fish at the thinnest points start to turn a light white colour, when this happens it is time to turn the fish. Take a fish slice and very carefully turn the fish over, keep close to the pan so not to splash butter everywhere and keep the delicate fish in one piece. Cook the fish for 2 - 3 minutes more, keep checking it to make sure it doesn't overcook/ burn.Get some foil or a plate ready for when the fish is cooked to put it to one side.Check the fish by gently lifting it with the fish slice and peaking underneath, it should be just brown, remove from the pan and put to one side.TIP: Fish is considered tricky and many people over cook it but if you keep an eye on it then it is really easy, as soon as the fish looses it raw colour and the flakes of meat just start to come away from each other it is ready. Just be patient and as soon as it is done, get it out of the pan.. Now we are coming to the end and the last of the ingredients cook super fast.Turn the peppers again and throw the wine in the fish pan, you want to save all the delicious flavour from the pan so don't wash it first. This is called deglazing the pan.Put the asparagus in the wine and put a lid on top, the asparagus will take around 2 minutes to become tender and steaming them in wine and the fish butter will make them shinny and delicious.At the same time, put your tomatoes on the griddle, they will cook fast because of the sweet sugars and the soft flesh. They will be ready around the same time as the asparagus.. Asparagus really doesn't take very long, as soon as the stems are tender use some tongs and get them out of the pan, put to one side for plating up later.Don't throw the wine away from the fish pan, this is going to be the base for the super simple sauce - the flavours of the fish and asparagus are too good to waste.. 
When it comes to sauces there is nothing more rewarding than making your own from scratch but sometimes you want something quick and easy so there is no shame in using a nifty cheat here and there.For this one the secret is pesto (you could even make your own pesto), here we used a spicy tomato pesto. Add your pesto to the wine in the pan and mix in. You may need to add a splash of water to loosen the sauce. Add the flat leaf parsley at the end and stir in.Take the vegetables off the heat and put in a bowl, set to one side. It is best to get the veg out of the pan, the griddle is a big heavy chunk of metal and will hold the heat for a while, consequently continuing to cook the food in it.TIP: When you are making sauces, a splash of water in many cases can do wonders. If you take a sauce too far or kept it warm a little too long, reduced a little too much then a dash of water can be your saving grace.. Bring your dish together and serve with a glass of white wine, spoon the sauce on and around your perfectly cooked fish.Add a light garnish of green leaf, peppery rocket works a treat here. 
Enjoy as a great quick lunch, alfresco if you can :) \nRead the question below and select from the following choices.\nA: Conchiglioni With Herbs\nB: Additional Notes/nutrition\nC: Fast Asparagus\nD: Let\u00e2\u0080\u0099s Make the Cheese Spread!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_61_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_27.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_61_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_29.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_30.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_31.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_32.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_33.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_34.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_61_35.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Marshmellow Peanutbutter Bananabarbequeboats With Honeydip\nB: Cook Chicken, Green Pepper, and Tomato\nC: Let the Dough Rest\nD: Pizzaiola", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather Ingredients and Supplies', 'Prepare Green Pepper, Tomato, and Chicken for Cooking', '@placeholder', 'Enjoy Your Quesadillas']", "context": "Here is the context of these images:\n. 
The following ingredients will make four quesadillas.\u00a0\u00a0 One quesadilla will feed approximately one adult, so adjust the recipe accordingly for the desired number of quesadillas.\u00a0Ingredients:\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 8 \u2013 Flour Tortillas\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 4 \u2013 Thawed Chicken Breast\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 3 Cups Shredded Cheddar Cheese\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1 Packet Taco Seasoning\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1 Tomato\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1 Green Pepper\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Sour Cream\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 SalsaSupplies:\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Pizzazz Pizza Oven\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Medium Size Nonstick Frying Pan\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Steak Knife\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Plastic Spatula. 
In this step you will prepare the vegetables and the chicken for cooking.Green Pepper1.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Wash the green pepper.2.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Cut the green pepper in half and clean out the seeds and stem.\u00a03.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Chop the green pepper into pieces that are approximately \u00bd in by \u00bd in (see picture below) and set aside.\u00a0Tomato1.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Wash the tomato.2.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Cut stem and core out of the tomato and discard.3.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Chop the rest of the tomato into \u00bd in by \u00bd in pieces (see picture below) like the green pepper, and add to the green pepper.\u00a0Chicken1.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Make sure that the chicken is fully thawed.2.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Cut the chicken into small strips that are about \u00bd in by 2 in (see picture below).\u00a03.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Set the chicken aside, separate from the vegetables as you will be cooking it first.\u00a0Once you have prepared the green pepper, tomato, and chicken you are ready to begin cooking.\u00a0. 
Begin by putting the chicken in the nonstick frying pan; do not put the vegetables in at this time.\u00a0Place the frying pan on the stovetop burner and set to medium heat.\u00a0Stir and flip the chicken as needed in order to keep it from sticking.\u00a0Cook the chicken until it is cooked through, this will take about 8 minutes.\u00a0In order to test if the chicken is done, cut a piece of chicken in half.\u00a0If it is a white color inside, and all of the pink is gone it is done; if it is still pink, continue cooking until done.\u00a0Once the chicken is done, push it up the sides of the pan to make an opening in the center of the pan for the tomato and green pepper.\u00a0Turn the burner down to medium-low and add the vegetables to the center of the pan.\u00a0Stir the vegetables as needed in order to cook all sides.\u00a0Cook the vegetables for about 4 minutes or until they are warm and tender.\u00a0. Once the vegetables are done as described in Step 3, mix the chicken and the vegetables together in the frying pan.\u00a0Turn the burner down to low or simmer.\u00a0You want enough heat to keep the mixture warm, but not too much heat that it dries out and burns.\u00a0Add the taco seasoning to the center of the chicken and vegetable mixture.\u00a0Dump 1/3 cup water on top of seasoning.\u00a0Stir the seasoning and water in the chicken and vegetables until everything is evenly coated.\u00a0Continue to let the mixture simmer until you are ready to use it in Step 6.\u00a0. Place two tortilla shells on the Pizzazz, offset as in the picture below.\u00a0Since the Pizzazz isn\u2019t quite large enough, the tortilla shells will overlap.\u00a0Turn the Pizzazz on the double burner setting so heat will come from the top and the bottom.\u00a0Add 1/2 Cup Cheese to the top tortilla shell so it covers the entire shell.\u00a0Allow the Pizzazz to cook for approximately two minutes, or until the cheese begins to melt.\u00a0. 
Once the cheese has begun to melt, place one fourth of the chicken and vegetable mixture on top of the melted cheese.\u00a0Then sprinkle a little more cheese on top of the chicken and vegetable mixture on the tortilla.\u00a0Note that it may be easier to unplug the Pizzazz to keep it from rotating while adding the chicken and vegetables and the cheese.\u00a0Just remember to plug it back in after you are done.\u00a0Allow the freshly placed cheese to melt on top of the chicken and vegetables, by letting the Pizzazz cook for another minute.\u00a0\u00a0After the cheese on top is melted, take the tortilla shell that is on the bottom of the Pizzazz and place on top of the other tortilla shell, chicken, vegetables, and cheese.\u00a0Apply pressure with the spatula to get both tortilla shells to stick to the middle ingredients.\u00a0Next allow the quesadilla to cook on the Pizzazz for approximately one more minute, so it becomes a little crispy.\u00a0Remove the quesadilla from the Pizzazz using the spatula, and place on a dinner plate.\u00a0In order to make the next three quesadillas, repeat Steps 5 and 6 three more times.\u00a0Once you have used up all of the chicken and vegetable mixture, remember to turn off the stove.\u00a0Also unplug the Pizzazz once your last quesadilla is done.\u00a0. Now that the quesadillas are cooked it is time to enjoy them. I prefer to spread sour cream and salsa to the top of mine. Serve the quesadillas with chips and salsa as a side to finish of a great meal. Note that there are many possible variations to this recipe that allow you to personalize this meal to your taste. For example try adding jalapeno peppers to chicken and vegetable mixture for a little spicier version. You could also add mushrooms and onions to the chicken mixture for added flavor. Try different combinations of ingredients until you find what you like best. Most importantly, enjoy your new found recipe, and use for a Pizzazz pizza oven. 
\nRead the question below and select from the following choices.\nA: Marshmellow Peanutbutter Bananabarbequeboats With Honeydip\nB: Cook Chicken, Green Pepper, and Tomato\nC: Let the Dough Rest\nD: Pizzaiola", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_62_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_62_18.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Goi Cuon (Vietnamese Summer Rolls)\nB: Make Sugar Syrup\nC: Into the Oven\nD: Decorate and Garnish Your Dessert", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Prepare the Lemon', 'Blend Together the Drink', '@placeholder', 'Serve and Enjoy']", "context": "Here is the context of these images:\n. This dessert focuses on utilizing different parts of the lemon to create a tasty drink. 
First you will need to cut the lemon into two halves. Next you will need to cut a thin slice about a 1/4 of an inch thick from one of the lemon halves. Set this aside on a piece of paper towel. Now, juice both halves of the lemon and pour through a strainer into a bowl to remove pulp and seeds. You can now set this aside as well. Finally we will zest the juiced lemon halves on a small-holed grater, until the shiny surface of the lemon has been removed. Lay all of your lemon parts aside on piece of paper towel and store in the fridge.. For this next step you will need a blender. I would recommend gathering your ingredients ahead of time so that the ice cream does not melt in between adding them. While these are the measurements I found to work effectively you can adjust them based off of personal preference via taste.Measure and add to your blender:1 cup ice cream1/2 cup milk1 teaspoon vanilla1/4 teaspoon peppermint extract1 Tablespoon of your lemon zest2-4 Tablespoons of your lemon juice depending on personal preference. (I added 3)Blend together until mixed, but still relatively thick. I would suggest using a low blender setting. If you are not serving the dessert immediately, put it in the fridge or freezer accordingly until needed.. Use the drink that you blended in the last step and pour until brimming in a glass of your choice. Use the flat edge of a knife to level off the drink. Now sprinkle a light coat of your lemon zest on top.You will now need your lemon slice. Cut it first in half. Cut a small slit in one half and place it on the edge of your glass. Now cut the other half into four slices and cut small slits on the peel side and place on the edge of your glass as shown in the picture.Take a pinch of cinnamon powder and sprinkle it in a decorative pattern over the top of your drink.Your Drink Is Now Complete!. 
Add a straw to your drink and enjoy outside in the warm summer weather.\nRead the question below and select from the following choices.\nA: Goi Cuon (Vietnamese Summer Rolls)\nB: Make Sugar Syrup\nC: Into the Oven\nD: Decorate and Garnish Your Dessert", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_63_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_63_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_63_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_63_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_63_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_63_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_63_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Pi Is Starting to Look Like a PI!\nB: William Jones' Pi Pie (mathematically Infused)\nC: Come Together....right Now....over Me....\nD: Egghead?! Custard in the Making....", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather Your Ingredients!', 'There Are More Wonderful Things to Add to the Bowl!', '@placeholder', 'Throw Me in the Fire!']", "context": "Here is the context of these images:\n. Ingredients:\n1 pie crust (store bought or home made)\n1/2 cup pine nuts (1/4 cup chopped very finely, 1/4 chopped and reserved)\n1 tablespoon olive oil\n1 onion (chopped)\n2 cloves garlic (minced)\n2 cups spinach\n1/4 cup sun dried tomatoes (chopped)\n1/4 cup Greek olives (chopped)\n1/2 cup feta cheese (crumbled)\n2 eggs\n1/4 cup milk\n1 teaspoon sea salt\n1 teaspoon black cracked pepper\n1 teaspoon onion powder\n1/4 teaspoon cayenne pepper\n10 sheets phyllo dough (thawed)\n3-4 tablespoons butter (melted). Preheat your oven to 350 degrees.\nOn a lightly floured surface, lay out your pie dough. Sprinkle with finely chopped pine nuts and gently roll the nuts into the crust using a rolling pin.\u00a0 . 
Lay the pie dough into the bottom of a glass pie plate (9 inch). Set aside.. In a saute pan, on medium heat, add olive oil.\u00a0 once oil is hot, add onion.\u00a0 Saute for 3.14 minutes and then add garlic to the pan.\u00a0 Continue to saute for an additional 3.14 minutes.\u00a0 Add spinach to the pan and saute for an additional 3.14 minutes. Remove from heat and move mixture to a mixing bowl.\u00a0 Let cool for 3.14 minutes.. Once the spinach mix has cooled a bit, add sun dried tomatoes, olives, pine nuts and feta cheese to the spinach mix and stir to combine.. In a small bowl, lightly beat the eggs and milk together until combined. Add the salt, pepper, onion powder and cayenne pepper to the egg mixture and stir to combine.. Add the egg mixture to the spinach mixture, and stir to combine.. Pour the spinach mixture into the prepared pie plate.. Lay a sheet of phyllo dough on top of the spinach mixture and brush with a bit of melted butter.\u00a0 Layer another sheet of phyllo, and then another bit of butter.\u00a0 Continue until you have 10 layers of phyllo, and make sure you brush that top layer with butter! . Bring the pie dough up along the sides and pinch the dough over top of the phyllo, making the crust.\u00a0 . Place pie into the oven.\u00a0 Bake for 30-35 minutes or until the top is a golden brown and the filling has set.\u00a0 . Serve hot, warm or room temp! \nRead the question below and select from the following choices.\nA: Pi Is Starting to Look Like a PI!\nB: William Jones' Pi Pie (mathematically Infused)\nC: Come Together....right Now....over Me....\nD: Egghead?! 
Custard in the Making....", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_64_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_64_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_64_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_64_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_64_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_64_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_64_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_64_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_64_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_64_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_64_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_64_11.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Oven Baked Jerky\nB: Broil or Bake!\nC: Oven Grilled Chicken\nD: Filling the Dishpan", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['What You Need for This Dish', 'The Smoked Sausage', '@placeholder', 'The Magic']", "context": "Here is the context of these images:\n. 1 a package of sauerkraut contains 0,5 kg 2 bags of instant mashed potatoes (per bag you need to add 0,5 liter boiled water 3 1 large hand of raisins 4 butter to grease the dishpan 5 1 can of pineapple pieces (also about 0,5 kg) 6 1 smoked sausage 7 curry powder (these are the packages as they are\u00a0available in the Netherlands, try to get about the same content if you use other packages). Open the can with pineapple pieces and pour the juice in the small pan. Add the sauerkraut, and put the pan on the stove. Once the juice/water boils put the stove on low heat and let it boil for 5 min.. In this case you need to add water \u00e1nd milk\u00a0according to the package of the instant version. 
Since I am allergic to milk I only add water, the same amount as the milk and water would be together. So I boil 1 liter water and add half of it to my measuring bowl. I add the instant powder, mix and let it sit for a while. In the mean time I grease the dish pan.. I don't know if you ever had a change to eat it but I love it! Especially the ones HEMA sells, so if you visit the Netherlands you definitely need to visit a HEMA for their ROOKWORST. Of course this sausage doesn't taste the same, but is also very good. I slice it the way the picture shows and usually end up eating the leftover part.... Also pre-heat the oven at 180 degrees Celsius / 356 Fahrenheit.. 1st layer: mashed potatoes 2nd layer: sauerkraut, pineapple pieces and raisins 3rd layer: mashed potatoes 4th layer: sliced smoked sausage & curry powder About the 2nd layer: Once the sauerkraut has boiled for 5 min. I pour off the juice/water it boiled in and pour it in the measuring bowl. I add enough water to get to half a liter and make the second bag of mashed potatoes with it.\u00a0. After you've prepared the dish, put it in the preheated oven for 30 min. 
After that, serve and enjoy your meal.\nRead the question below and select from the following choices.\nA: Oven Baked Jerky\nB: Broil or Bake!\nC: Oven Grilled Chicken\nD: Filling the Dishpan", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_65_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_65_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_65_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_65_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_65_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_65_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Menudo\nB: Continue With a Whole Bunch of Buns\nC: Load Up Katamari\nD: Whip the Cream", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Construct Prince', 'Prince Placement', '@placeholder', 'Background']", "context": "Here is the context of these images:\n. \nA head of cabbage makes an excellent katamari and radishes are the perfect size to use as the ends...\n\t\tWash and cut radishes in half\n\t\tSlice off the very end off each half\n\t\tPush toothpicks into cabbage, leaving about 1/2\" exposed\n\t\tPress radish halves onto toothpicks\n\t\tEvenly distribute radishes around entire cabbage. 
\nThe Prince is constructed from 2 cucumbers, a baby carrot and 4 green beans...Head\n\t\tCut both ends off\u00a0 two small cucumbers\n\t\tCut one of the cucumbers in half\n\t\tTake one of the halves and carve out a rectangle from the outer peel\n\t\tRemove small band of peel from both ends\n\t\tTake two of the end pieces and attach one to each side of the head with toothpicks\n\t\tStick a baby carrot in the top of the head to make the antennaBody\n\t\tTake the other half of the cucumber and press one or two toothpick in one end\n\t\tAttach head to body using the aforementioned toothpicksLegs/Feet\n\t\tRun toothpicks through two green beans, leaving a bit of toothpick exposed on one end\n\t\tPress legs into body at toothpick end\n\t\tTake the ends cut off the second cucumber (these will be the feet)\n\t\tCut a small circle out of the middle of each foot (approximately green bean in diameter)Arms\n\t\tRun toothpicks through two green beans, leaving a bit of toothpick exposed on one end\n\t\tPress toothpick ends into body at a reasonable arm position. \nSet The Prince up next to the cabbage katamari in a rolling stance.\nNow, The Prince did remain standing for the duration of the display, but i won't lie, it was precarious.\u00a0 I recommend setting up the veggies in the same place it will remain throughout the event.\u00a0 Also, make sure that the cabbage is stable, as it provides most of the support for The Prince.. \nToothpicks and/or skewers of fruits, veggies and cheeses can now be added...\n...along with turnip flowers : ). 
\nAdd Brussels sprout bushes, mixed green grass and weird fruit trees of strawberry and melon atop artichokes (or whatever weird fruit trees you can imagine).\nAnd don't discard those rinds!\u00a0 They can be filled with dips or salsa.\u00a0 The lemon pictured here is happily holding a yogurt fruit dip.\nThe example here, while a little out of control, is a very simple example of what can be done with the Katamari theme.\u00a0 It could be applied to a wide variety of foods and/or represent different levels of the game.\u00a0 A skilled garnish maker could do an amazing representation of the flower level...\n...and yes, that is a request.\nRead the question below and select from the following choices.\nA: Menudo\nB: Continue With a Whole Bunch of Buns\nC: Load Up Katamari\nD: Whip the Cream", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_66_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_66_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_66_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_66_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_66_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_66_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_66_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_66_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_66_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_66_9.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Enjoy!\nB: How to Make a Portal Cake\nC: \nD: ", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Poke Holes Into the Cake', 'Pour Condensed Milk Onto Cake', 'Pour Chocolate Syrup on Cake', '@placeholder']", "context": "Here is the context of these images:\n. 
Gather the following list of ingredients and utensils: Mixing Bowl Mixing Spoon or Fork Can Opener Spatula Chopstick, Straw or other similar utensil for poking holes in cake 9 by 13 pan with lid Measuring cup 1 - 14 oz can Sweetened Condensed Milk 1 - Bottle of Chocolate Syrup 1 - 8 oz Tub of Whipped Topping 1 - Bag of English Toffee bits 1 - Box of Cake Mix The following ingredients vary depending on the type of cake mix used: 3 Eggs 1 1/4 Cup water 1/3 Cup Vegetable Oil Non-stick spray. 1. Bake your favorite cake in a 9 by 13 pan using the instructions on the box or your own recipe. I have only ever used chocolate cake, but vanilla cake would work as well. 2. Let the cake cool enough to be handled by bare hands before moving on to the next step. This should take around 15 mins and will help make the rest of the process easier.. Take a chopstick, straw or other utensil of similar size and shape, and poke holes into the cake spacing them approximately 1/2 inch apart. Be sure that whatever utensil you use to make the holes leaves a hole big enough that the condensed milk will be able to flow into the holes. I have found that a plastic straw works best or a utensil that is 1/4 inch in diameter.. Pour one 14-ounce can of condensed milk onto the cake evenly. It should drain into the holes and settle on top of the cake as shown in the picture.. Pour chocolate syrup onto the cake evenly making sure that some of it drains into the holes. I use approximately one cup of chocolate syrup for this step, but more or less could be used to suit personal taste.. Cover the cake and place it in a refrigerator to chill for at least 45 minutes.. Spread whipped topping on the cake using a spatula making the layer about 1/2 an inch thick. More or less whipped topping can be used to suit personal taste. Be sure that the whipped topping is completely thawed, otherwise spreading it on the cake will be a difficult and messy job.. 
Drizzle some more chocolate syrup onto the whipped topping in any pattern you want. The amount you use for this step is completely up to you and what you want your cake to look and taste like.. Sprinkle English toffee bits evenly on top of the cake.. Place the cake back in the fridge to chill some more. This step is optional as the cake is ready to eat anytime, however, the cake should be stored in the fridge to prevent the whipped topping from melting and keeping all the other ingredients from spoiling.. Cut yourself a piece and enjoy the fruits of your labor!\nRead the question below and select from the following choices.\nA: Enjoy!\nB: How to Make a Portal Cake\nC: \nD: ", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_67_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_67_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_67_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_67_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_67_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_67_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_67_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_67_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_67_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_67_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_67_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_67_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_67_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_67_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_67_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_67_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Eggless Muffins\nB: Fill 'em Up\nC: Mix It All Up\nD: Blackberries", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['In Another 
Bowl..', '@placeholder', 'Fill the Muffin Pan', \"Bakin'\"]", "context": "Here is the context of these images:\n. You will need:1 c blackberries2 1/2 c all-purpose flour2 eggs1/2 tsp salt1/2 tsp cinnamon1/2 tsp baking soda2 tbsp milk1/2 tsp ground cloves1 c yogurt (I used the light vanilla kind)1 tbsp baking powder8 tbsp melted butter1 c sugar1 tsp vanilla extract. In a large bowl, you are going to combine the flour, baking powder, baking soda, cinnamon, salt and ground cloves.. In a separate bowl combine the sugar, eggs, vanilla, milk, butter and yogurt together.. Pour the second bowl into the first and mix all the ingredients together.. Cut the blackberries in half, then sprinkle them with 2 tbsp flour. This will keep them from sinking to the bottom of the mixture.. Fold the blackberries into the mixture.. Spray the muffin pan with non-stick cooking spray. Fill the batter to the top of the pan.. Bake in an oven that's been preheated to 400\u00b0 for 17-20 minutes, or until you can stick a fork in a muffin and it comes out clean.. Thanks for checking this recipe out. 
If you make them, comment and let me know what you think!\nRead the question below and select from the following choices.\nA: Eggless Muffins\nB: Fill 'em Up\nC: Mix It All Up\nD: Blackberries", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_68_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_68_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_68_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_68_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_68_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_68_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_68_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_68_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_68_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_68_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_68_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_68_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_68_12.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Edible Rosebush\nB: Decorate\nC: Bake and Pipe\nD: Make Eyes", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Prep Crackers', 'Add Pretzel Legs', '@placeholder']", "context": "Here is the context of these images:\n. Here's what you will need to make your own spider snacks:round crackers peanut butter pretzel sticks small candies or mini chocolate chipsbutter knife. Spread some peanut butter on two round crackers. This part will be the spider's body.. Place 8 pretzels on one of the crackers, 4 on each side and sandwich the other cracker on top. The pretzel sticks are the spider's legs.. Use two small dabs of peanut butter on top as glue to stick two small candies or mini chocolate chips for eyes! I used mini M&M's.. 
Now, eat the spiders and enjoy!\nRead the question below and select from the following choices.\nA: Edible Rosebush\nB: Decorate\nC: Bake and Pipe\nD: Make Eyes", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_69_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_69_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_69_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_69_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_69_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_69_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_69_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_69_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_69_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Cake Decorating With Fondant\nB: Preparing the Cake\nC: Making Frida Kahlo Topper\nD: What You Will Make & Learn", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Why Should You Follow Along With Me?', 'About Me', 'Tools for Cake Making & Decorating']", "context": "Here is the context of these images:\n. Whether you are a total amateur or looking to start your own cake decorating business, these projects are for you and I'm so glad you're here to learn! Throughout the lessons in the collection I will offer tips and tricks, secrets of the trade, and suggestions for making cake decorating a very simple process. 
Here is a rundown of what you will learn!Dark Chocolate Cake RecipeLeveling a Cake & Cutting LayersGanache Recipe & Applying GanacheVanilla Buttercream RecipeFilling & Stacking Cake LayersDark Chocolate Truffle RecipeApplying Rolled FondantSculpting Fondant DecorationsDuring the lessons in the collection you will design and create two different cakes \u2014 a trendy, decadent drip cake topped with sweet confections and an adorable fondant covered cake decorated with flowers and birds. Both will start with a rich double chocolate cake covered in dark chocolate ganache as a sturdy (and delicious) base!Making a Drip CakeDecorating a Fondant CakeWhat is a drip cake? A drip cake is traditionally a ganache or buttercream covered cake that has thinned ganache (or glaze) dripping down the sides. Drip cakes can be decorated in any way imaginable. We will be using store-bought candies and fondant flowers to decorate with. This cake will be perfect for a small wedding, an anniversary, or a birthday party.What is a fondant cake? A fondant cake is covered with an edible sugar paste that can be molded and formed over a cake. It can also be used to create sculptural and decorative items. Fondant adds nice structure to a cake that will hold up in the elements.. There are tons of cake tutorials and classes out there, what's different about mine?When\n I first started decorating cakes there weren't many videos or classes \navailable so I learned mostly by trial-and-error. I made mistakes\u2014lots \nof them! Now, you can find a video tutorial for almost anything related \nto cake decorating. I have watched several short Vimeo and YouTube \nvideos to see how other cake decorators make various things. There is \ncertainly no lack of videos and they range from working with fondant \nmolds to making isomalt jewels but there aren't many comprehensive \nlessons/classes out there that can take you from the first ingredient to a \ncompletely finished and decorated cake! 
I look forward to any questions you might have along the way and making things as easy for you as possible. Feel free to shoot me a message on any project or a private message to my account. I can't wait to see what you create.. Hi, my name is Jen Wold. While I'm not working away at Instructables I run a cake decorating business called Clever Wren Cakes & Sweets located along the mighty Mississippi River and \nan Etsy shop called Thermies. I've been creating custom cakes for over 10 years and my kitchen is \nalways filled with the delicious aroma of cake! My other creative hobbies include: sewing, quilting, embroidery, fused glass, stained glass, weaving, and trying and creating new recipes.. Now that you have a basic overview of how these lessons are structured and what you will be learning, let's find out about the required tools and ingredients to make fun decorated cakes!If you have ever been to the cake decorating section at your local craft supply store you may have noticed that there are literally hundreds of specialty tools and supplies for cake making and decorating. Since one could spend a small fortune buying all of these items (some completely unnecessary) I will limit the amount of specialty tools used to keep costs down.REQUIRED TOOLS - All of the tools in this list are necessary to complete the lessons in this collection.Mixing bowlsMeasuring cupsMeasuring spoonsTwo - 6-inch x 2-inch round cake pansParchment paperOne - 8 inch round cake base, cake drum, or cake standOffset spatulaFour - 6 inch round cake boardsRubber spatulaBench scraper or Plastic rulerSerrated bread knifeParing knifeRolling pinHand mixerRound cookie cuttersGeometric cookie cuttersFondant smootherFood coloringSmall sauce panSpoonMasking tapePaper towelOPTIONAL TOOLS - The tools listed below will make some of the steps in these lessons easier, but none of them are required to create beautiful cakes. 
I will offer creative solutions and substitutions for the following items that you might find in your home kitchen.Turntable12-inch piping bagPiping tip (Wilton tip 2A)Large coupler Cake levelerRolling (pastry) mat, or other silicone matClear vanilla extract or vodkaStand mixerCooling rackPetal foamBall head sculpting tool1-inch cookie scoop or Melon ballerSmall flower shaped cutterSugar pearlsSheet pan\nRead the question below and select from the following choices.\nA: Cake Decorating With Fondant\nB: Preparing the Cake\nC: Making Frida Kahlo Topper\nD: What You Will Make & Learn", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_70_0.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Preparing the Fruit\nB: Salmon Carpaccio\nC: Roll Up and Slice\nD: Bake and Enjoy!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Preparing the Salmon', 'Proper Wrapping and Baking', 'Garnish and Enjoy']", "context": "Here is the context of these images:\n. \n\tAfter obtaining a pineapple, wash it well to remove any dirt and debris. Then, using a long sharp knife, cut off the top and bottom of the fruit to remove the leaves and stalk. Again use the knife to slice down the side of the pineapple, cutting roughly 4 inch wide strips, deep enough to leave no skin. Continue cutting the fruit, until the skin is completely removed. The skinned fruit can then be eaten, avoiding the circular core.\n\tTo garnish the plate, cut 6 leaves with slightly varying lengths from the foliage of the fruit, and set aside.\n\t\u00a0. 
- Set the oven to 375\n- Tear a sheet of tin foil large enough to wrap around the largest pineapple slice, and place that slice in the middle of the foil\n- Spread a dash of ginger and salt over the pineapple slice and place the thawed or fresh salmon fillet over that.\n- Spread another dash of ginger and salt over the fillet, and top with another pineapple slice.\n- Then use another pineapple slice and cut it down the middle, length-wise. Use both slices to\u00a0cover the sides of the fillet.. \n\tThis folding method is essential to keep the fish from drying out and the juices from spilling.\n\t- Fold in 2 opposite sides\u00a0of the tin foil until they meet in the middle, then pinch them together and roll them down\u00a0tight.\n\t-\u00a0Next, roll in the un-touched sides of the foil to create an enclosed envelope.\n\t- Place the foil wrap in the middle of the oven for 15 minutes, then flip to cook for another 15 minutes. *\n\t*Cook time may vary slightly due to the thickness of pineapple slices and the\u00a0salmon fillet.. - Carefully remove the foil wrap from the oven and un-roll the edges.\n-\u00a0Remove the top and side pineapple slices, and set on a plate in a stacked pyramid design.\n- Lift up the final pineapple slice with the salmon on top, and place on the plate with the pineapple leaves under it.\n- Lastly, use a toothpick to spear a triangle of fresh pineapple to the salmon.\nMore salt and ginger can be added, depending on tastes. 
The pineapple slices can be juiced onto the fish, or the fruit eaten off the skin.\nTo further complement the dish\u00a0I used homemade sweet potato fries and a shot of Barbancourt rhum (Haitian rum).\nRead the question below and select from the following choices.\nA: Preparing the Fruit\nB: Salmon Carpaccio\nC: Roll Up and Slice\nD: Bake and Enjoy!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_71_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_71_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_71_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_71_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_71_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_71_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_71_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_71_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_71_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_71_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_71_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_71_11.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Sunday Go to Meeting Meatloaf\nB: Egg Noodles\nC: Combine All Ingredients\nD: How to Cook", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Preheat Oven', 'Cut the Bacon and Onion', 'Eggs', '@placeholder']", "context": "Here is the context of these images:\n. \u2022 1/2 pound sliced bacon (diced)\u2022 1/2 medium sweet onion (chopped)\u2022 3 eggs (lightly beaten)\u20222 cups frozen shredded hash brown potatoes(thawed)\u2022 1 cup shredded cheddar cheese\u2022 3/4 cups 4% cottage cheese\u2022 5/8 cups shredded Swiss cheese6 servings. Preheat oven to 350 degrees. . Cut up the bacon and onion. Dice the bacon and chop the onion.. 
In a large skillet cook the bacon and onion on medium heat until the bacon is crisp. If you need to put the bacon in the microwave start with 30 seconds and add any additional time needed. When it is cooked drain the bacon and onion.. Lightly beat the eggs and put them in a large bowl.. Shred the potatoes or just buy shredded hash browns and put them in the large bowl.. Add the remaining ingredients into the large bowl. (Shredded cheddar cheese, cottage cheese, shredded Swiss cheese, bacon and onions). Next transfer the ingredients to a 9 inch round or square dish. Put the dish in the oven for 35-40 minutes. When done let stand for 10 minutes and enjoy your \"Go To Omish Egg Caserole\".\nRead the question below and select from the following choices.\nA: Sunday Go to Meeting Meatloaf\nB: Egg Noodles\nC: Combine All Ingredients\nD: How to Cook", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_72_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_72_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_72_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_72_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_72_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_72_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_72_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_72_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_72_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_72_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_72_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_72_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_72_12.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Sweet Potato Ravioli With Coconut Curry Sauce\nB: Add Other Ingredients\nC: Vegetable Rice Cooked With Coconut Milk\nD: Ingredients", "question": "Choose the best title for the @placeholder 
to correctly complete the recipe.['Half Boil the Potatoes', 'Stir Fry the Potato Ubes', '@placeholder', 'Add Grated Coconut']", "context": "Here is the context of these images:\n. Six to seven medium sized potatoesGrated coconut from half a piece of coconut.Salt to tasteOne teaspoon of red chili powderOne teaspoon of cumin seed powderOne tablespoon of cooking oil. Wash and place the potatoes in a pressure cooker Pour enough water to cover the potatoesPressure cook for about five minutes or one whistle. Do not over-cookRemove from stove, release pressure and pour cold water over the potatoes. Keep the potatoes in cold water and peel the skinCut potatoes into 3/4th inch cubes. Place a frying pan over medium heat and add a tablespoon of cooking oilAdd the potatoes and stir fry till the cubes turn to golden brown. Once the potato cubes turn golden brown, add a teaspoon of saltThen add one teaspoon of red chili powder and one teaspoon of cumin seed powderMix all ingredients together and cook for five more minutes. 
Once all other ingredients are properly cooked with potato cubes, add the grated coconutsStir fry till the grated coconuts turn golden brownRemove from stove and transfer to serving dishThis can be used as a side dish with rice and chapatis.\nRead the question below and select from the following choices.\nA: Sweet Potato Ravioli With Coconut Curry Sauce\nB: Add Other Ingredients\nC: Vegetable Rice Cooked With Coconut Milk\nD: Ingredients", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_73_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_73_21.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: gado and Cowboy Bread\nB: Shaping the Loaves.\nC: 
Tempering\nD: Steaming", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Method', 'Next Morning.', '@placeholder']", "context": "Here is the context of these images:\n. This recipe makes 2 large loaves and I small loaf:7\u00bd cups all purpose flour2 teaspoons instant dried yeast3 cups warm water3 Tablespoons dark brown sugar (not Demerara)1/3 cup liquid malt1 large egg (beaten)\u00bc cup oil (eg olive oil)1 teaspoon salt2\u00bd teaspoons mixed spice (usually cinnamon, ginger & cloves)2 cups raisins1 cup dried cranberriesAnd for giving a glaze to the finished loaves:2 Tablespoons milk1 Tablespoons sugarNotes: 1. You can use all white flour or switch one or two cups to whole wheat.2. You will probably need up to half a cup of extra flour to get a good dough.Malt is a type of sugar that can be found in most homemade beer supply stores. It is a thick brown liquid that adds a wonderful flavour to the bread and makes it a little chewy. You can NOT substitute molasses for malt. Molasses is a different thing completely. And do not think \u2018if 1/3 cup is called for, \u00bd cup must be better\u2019 as malt can do funny things to bread. Use in moderation is the key.If you can\u2019t find malt, you could add another tablespoon of brown sugar instead of the malt and you will have a close copy of my version of Barmbrack.. This is what I do for all my bread.1. Before going to bed get a BIG bowl and add: 3 cups white flour 2 tsp yeast 3 cups warm water (body temperature) and stir to a thin batter. Cover well and put in a warm place overnight.You will notice there is no sugar in this over night stage (called a poolish). Sugar is totally unnecessary in bread because an enzyme in flour converts the starch to sugar that the yeast can use. In my every-day bread I never use sugar at all partly because we have too much sugar in our diets already but mostly because it is unnecessary. 
This bread, being a sweet bread, has sugar added for flavor, not for the yeast\u2019s benefit.A friend once put the batter up on top of the kitchen cupboards overnight but he didn\u2019t use a big enough bowl. In the morning there was batter dripping down the cupboards and spreading out all over the counter! What a mess to clean up! So a big bowl. 2. Also before going to bed, in a second bowl pre-mix: 4\u00bd cups flour (may be 2 white and 1\u00bd whole wheat) 1 tsp salt 2\u00bd tsp spices \u00bc cup oil (OK it\u2019s not dry but this is a good time to mix it in) 3 cups dried fruit. Next morning you will find the gluten has developed all by itself and you will have a lump of gooey gluten sitting in a very watery fluid. So, now pour off some of the watery liquid into a small bowl and dissolve the sugar and malt in that before returning it to the main mixture. (Or you could add the sugar to the main bowl, but it is easier in a small bowl). Then add the contents of the second bowl and the beaten egg. Stir until you can\u2019t stir any more and then get your hands in it to make an even ball of dough. You will probably need about \u00bd cup more flour depending on humidity etc. You may work it on the kitchen counter, though on this occasion I did not. Then put the dough ball back in the bowl, covered, in its warm place for about 30 minutes. The gluten will develop during this time without, needless to say, the need to knead.This 30 minutes is a good opportunity to butter/grease your bread tins.. During the 30 minutes in your warm place the gluten develops nicely. Tip & scrape the dough onto your work surface and knead it a few times. Stretch and fold, turn, stretch and fold again. Then divide the dough into 3 pieces. This recipe made 6\u00bc pounds of dough, so for the 2 large tins I used 2\u00bc lbs (1 kg) and for the smaller tin 1\u00be lbs (800 gms). Stretch and fold each piece of the dough to make a sausage shape that will go into your tins.. 
I have an old apartment size fridge that I have converted to a warming cabinet by removing all the fridge stuff and putting a 60 watt light bulb at the bottom with a thermostat at the top. I can set whatever temperature I choose and know it will be constant.Allow the dough to rise in your warm place for 45 \u2013 60 minutes and when well risen bake at 350 degrees F (180 C) for 45 minutes. The sugar in the bread will caramelize and make a nice brown crust. Immediately the bread comes out of the oven, brush over the top with the milk/sugar syrup to give a nice glaze. Two or three coats in quick succession may be necessary to get a nice shiny glaze. Allow to cool. And then you know what to do\u2026..\nRead the question below and select from the following choices.\nA: gado and Cowboy Bread\nB: Shaping the Loaves.\nC: Tempering\nD: Steaming", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_74_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_74_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_74_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_74_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_74_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_74_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_74_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_74_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_74_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_74_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_74_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_74_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_74_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_74_13.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Materials\nB: Pineapple Upside Down Cake Updated\nC: Preparation of Cake Batter\nD: free Upside Down Cake", "question": "Choose the 
best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Pineapple Filling', 'Assembly', 'Om Nom Nom...']", "context": "Here is the context of these images:\n. For this recipe you will need:\n20 oz. can pineapple, chopped or crushed\n20 oz. can pineapple rings\n3/4 cup sugar\n3 tblsp corn starch\n1 tblsp lemon juice\n1 pre-made dbl. crust pie crust\nYou will also need:\nA cupcake pan (jumbo if you can get it; here I've used the standard size)\nMedium Saucepan\nA wooden spoon\nA spoon\nA fork\nA rolling pin\nA can opener\nMeasuring cup and spoons\nShortening to grease the cupcake pan\nA 4\" circular cookie cutter. Pour your can of crushed or cut pineapple, juice and all, into your saucepan. Add cornstarch, sugar, and lemon juice and put on medium heat until the mixture thickens.\nSet aside.. Open up your pre-made pie crust and use your cookie cutter to cut out as many circles as you can. You'll need twelve to fill the 1 cupcake pan. Six of the circles need to be rolled thinner so that you can line each entire cup.\nYou can ball up and re-roll the scraps if you need more crust, or plan to try and fill up a second pan.. Pre-heat your oven to 425 Fahrenheit and grease your pan.\nOpen and drain your pineapple rings. Place one in the bottom of each cup of your cupcake pan. If it won't fit, just cut it up so that it will. It didn't occur to me until after I'd made the pies, but you could use a smaller cookie cutter on the pineapple rings to make them fit into your cups. Either way, once you've got your bit of pineapple on the bottom, set your wider, thinner circle of crust on top of it, lining the cup.\nSpoon in your filling until you are level with the top of the pan.\nPlace your smaller circle of pie crust on top and seal the seams.\nPlace into the oven and bake for about 25 minutes.. When the pies are done, the crust will be that nice golden-brown colour indicative of all that is wonderful in the world. 
Give them a moment to cool before trying to remove them from the pan. I had no trouble just plucking them up by the edges of the crust, so don't worry about trying to flip them over onto a pan or anything like that...\nServe upside down and with anything you want. I went with a nice vanilla ice cream because I've found that pineapple and vanilla go wonderfully together, but that's a story for another Instructable.\nEnjoy!\nRead the question below and select from the following choices.\nA: Materials\nB: Pineapple Upside Down Cake Updated\nC: Preparation of Cake Batter\nD: free Upside Down Cake", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_75_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_75_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_75_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_75_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_75_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_75_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_75_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_75_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_75_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_75_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_75_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_75_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_75_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_75_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_75_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_75_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The Best Part\nB: Homemade Salted Caramel\nC: All in the Pan\nD: Stir and Boil", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Scrub a Dub', 'Prep', '@placeholder', 'Finish']", "context": "Here is the context of 
these images:\n. Add all of the caramel sauce ingredients into a sauce pot and cook on medium low heat. Check the sauce often and make sure to stir constantly to avoid the sauce from burning and sticking. Using a candy thermometer,cook the sauce until it reaches a temperature of 235 degrees OR just cook the sauce for about 30 minutes until it's thick and creamy!. While the caramel sauce is cooking, this is a great time to remove the stem of the apples and add your sticks. I used wooden cake dowels that I cut in half. You can also use Popsicle sticks, actual sticks (this looks beautiful and rustic) or perhaps a long lollipop stick. Once that's done, place your apple one by one in the hot water and allow to sit for about 10-15 seconds to melt the outside wax. Using a clean towel, gently rub away that wax. You'll know the wax is gone because the apple won't be as shiny. Without this step, the caramel might slide off. . Grab a plate or counter space and lay down some waxed or parchment paper. Apply about a tablespoon of shortening to the waxed paper or a good amount of nonstick spray to prevent the apples from sticking. If you haven't done so already, prepare all of your toppings in bowls. . As the title says, this is the best part. Now that your caramel sauce is ready, remove it from the stove and place with the rest of your setup. Carefully dip an apple (holding it by the stick) into the caramel sauce and roll the apple to coat the entire surface. Hold the apple over the sauce, allowing some to drip away. Immediately dip the apple into the toppings of your choice and place on the parchment paper. Do this for all of your apples. Be aware that the caramel sauce will start to stiffen up. Get it back to it's consistency by placing it back on the stove for a couple of minutes while stirring, then continue. . Drizzle your caramel apples with white chocolate, dark chocolate or colorful candy melts. 
For a Christmas theme, try drizzling red and/or green candy melts onto the apples. The caramel will stiffen pretty quickly, so if using sprinkle candies apply them with haste or sprinkle into a bowl to dip the apple into. Allow the apples to sit in the fridge for at least 15 minutes to ensure that the caramels holds onto those toppings. . Turn these apples into gourmet gifts by tying a silk ribbon around the stick and placing into treat bags. I highly encourage you to recreate this recipe rather than melting caramel candies. The quality of the caramel is to die for. Enjoy and Happy Holidays! \nRead the question below and select from the following choices.\nA: The Best Part\nB: Homemade Salted Caramel\nC: All in the Pan\nD: Stir and Boil", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_76_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_76_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_76_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_76_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_76_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_76_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_76_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_76_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_76_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_76_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_76_10.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Kinder Bueno Cheesecake\nB: Make the Crust\nC: Make the Filling and Bake\nD: The Cheese Filling", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['The Ingredients', 'Preparing the Tin', 'The Crust', '@placeholder']", "context": "Here is the context of these images:\n. 
ingredients:\n- 1 pack (200g) of Butterkekse\n- 100g butter\n- 330g each of cream cheese and quark\n- 1 egg\n- 180g of sugar\n- 1 pack of vanilla sugar or vanilla extract or 1 vanilla bean\n- 1 bowl (200g) of sour cream\ntools:\n- 1 springform tin (26 cm diameter)\n- handheld mixer\n- baking paper\ni am from germany and butterkekse (butter cookies) and quark are easily available. i am not completely sure how you could subsitute them. maybe graham crackers and some sugar instead of the butter cookies and the double amount of cream cheese and no quark.. take a piece of baking paper and put it over the bottom of the springform pan before puttung it together. this way it will be much easier later to remove the cake from the tin once it is finished.\npreheat the oven to 180 C.. crush the cookies:\n- either put them in a freezer bag and crush them with a rolling pin\n- or put them in a big enough bowl and crush them with something heavy (like a meat tenderizer) - this is my favored method, see pictures\n- or put them in a food processor\nmelt the butter in a pan or the microwave and mix it with the cookie crumbs.\nput the crumb mixture into the prepared tin and push it flat evenly.. with the handheld mixer in a bowl mix the quark, cream cheese, sugar, egg, and vanilla until everything is thoroughly mixed.\n(you could also use a wire wisk)\npour on top of the cookie crumbs and flatten if necessary.\nput the cake into the middle of the preheated oven and bake for 35 min.. take the cake out of the oven and spread the sour cream evenly on top. then put it back in the oven for 5 min.\nthis will not make a big difference on the taste, but the cake will have a nice white top and won't look yellowish.\ntake the cake out of the oven and leave it to cool completely.\ni like to decorate it with a couple of dried flowers. 
(see picture from the intro)\nenjoy!\nRead the question below and select from the following choices.\nA: Kinder Bueno Cheesecake\nB: Make the Crust\nC: Make the Filling and Bake\nD: The Cheese Filling", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_77_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_77_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_77_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_77_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_77_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_77_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_77_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_77_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_77_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_77_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_77_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_77_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_77_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_77_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_77_14.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Old Fashioned Cream Scones\nB: Your Cream Tea\nC: You Will Need...\nD: Make Coffee", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Rub Flour and Butter Together', 'Mix to a Ball and Rest', 'Roll Out', '@placeholder']", "context": "Here is the context of these images:\n. Pre-heat your oven to Gas #7 / 220 C / 425 FIn a chilled mixing bowl, rub together the butter cubes with the flour and the baking powder until the mixture resembles bread crumbs.. Add into the mix both the sugar and sultanas.Add both the eggs and, using a wooden spoon, mix well.If the mix is too stiff, add the milk a little at a time. On average, I end up using about 100ml of milk. 
Do not make the mixture too wet.Your mix should be able to 'clean' the bowl when mixed to form a ball - see next step. Once you have added the wet ingredients, you should be able to use your hand to mould the mix into a ball.It should not be sticky but should be able to clean the bowl of all its ingredients.Once this is done, wrap your ball in some cling film or place a tea towel over the bowl and place the mix and the bowl in the refrigerator for about 20 minutes.You can now take a rest yourself, or do the dishes . Turn the chilled mixture out onto a floured work surface and roll out so they are about 2cm thick and, using your scone cutter, cup, glass, or mug, cut shapes outIf you don't have a rolling pin, shape and pat flat before cutting.Handle as little as you possibly can.. Once rolled out, place on your greased baking tray and brush with milk. You can brush with beaten egg if you prefer.Bake in the centre of the oven for 15 mins. Allow to cool completely before eating. You will need:Tea - I adore Earl Grey, but house tea is fine. And nowhere does it say you shouldn't change the tea for a coffee.SconesClotted CreamJam (Jelly) / preserve'Proper' Cream Tea etiquette would be as follows: A lose leaf tea is bestAllow tea to brew for at least three minutesTea before milk, never milk before teaOnce stirred, your spoon should be placed on your saucerA good scone should easily break apart, you shouldn't have to cut it in halfSpoon the clotted cream and jam onto your plate before spreading onto your sconeCheck out the big debate on next step. There has always been the debate of what goes first, jam or cream?Now, etiquette would dictate that it is jam before cream. 
But you know, I'm a rebel and, as you can see, I'm a cream before jam kinda lass :)So, it is at this point that I will apologise to all the traditionalists that are reading this, but I love mine this wayWhy not let me know your preference.Most importantly - ENJOY!\nRead the question below and select from the following choices.\nA: Old Fashioned Cream Scones\nB: Your Cream Tea\nC: You Will Need...\nD: Make Coffee", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_78_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_78_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_78_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_78_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_78_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_78_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_78_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_78_7.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Clean and Chop Squid\nB: Catalina Salad\nC: INGREDIENTS (3 Entrees or 4 to 6 Side Salads)\nD: PREPARATION", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Boil Squid', 'Add Squid and Marinate', 'Serve']", "context": "Here is the context of these images:\n. Pull the head/tentacles away from the body. Scoop any remaining guts out of the body, and remove the thin plasticy quill. Rinse the body inside and out, then slice into approximately 3/4-1 inch thick rings.\nSqueeze behind the head to extrude the beak, and remove it from the center of the tentacles. Cut just below the eyes to free the tentacles, then add them to the bowl with the body rings.\nTentacles are the best part. No, really- they're fantastic.. Bring a pot of water to a boil. Add a bit of salt and a bit (1-2 Tablespoons) of wine or citrus juice. 
Drop the squid into the water in batches, removing it just as it turns opaque. This should take less than a minute, so be ready with a slotted spoon.Deposit the cooked squid on a paper towel to cool and dry.. Combine:\njuice of 2 limes\n~1 Tablespoon hot chili/garlic sauce (sriracha)\n~1 teaspoon sesame oil\n~1/2 teaspoon fish sauce (or to taste)\n~1 teaspoon rice vinegar\n1 kaffir lime leaf, finely minced (zest from those limes makes a fine substitute)\n3 finely minced shallots\n2 Tablespoons brown or palm sugar (honey or agave nectar are good substitutes)\nhandful fresh mint, finely minced\nhandful fresh cilantro, finely minced\nsalt and pepper to taste\nStir it up and taste. It should be aromatic, spicy, and acidic with a touch of sweet. Adjust the seasonings as necessary to make the sauce taste good to you.\nNote that I resisted the temptation to add a grated garlic clove to the mix- there's already garlic in the sriracha, and I didn't want to overpower the squid.. Add squid and give it a stir. Let it sit in the marinade for a bit, preferably in the refrigerator for about half an hour. More marination certainly won't hurt; you can leave it overnight if you like.. Serve cold. The longer the squid marinates the better the flavors will penetrate. This will keep for a day or two, but like any seafood it shouldn't be left to moulder in the refrigerator. We've never had any problems of this type, as this salad disappears quickly.\nGarnish with any of the herbs used in the salad and serve on funny-looking plates. 
For best results, make sure all the tentacles are showing.\nRead the question below and select from the following choices.\nA: Clean and Chop Squid\nB: Catalina Salad\nC: INGREDIENTS (3 Entrees or 4 to 6 Side Salads)\nD: PREPARATION", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_79_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_79_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_79_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_79_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_79_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_79_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_79_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_79_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_79_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_79_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_79_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_79_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_79_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_79_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_79_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_79_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Make It Cheezy\nB: Macaroni and Cheese\nC: Ritz Cracker Topping Ingredients\nD: Ingredients", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Start the Macaroni Water', 'Cook the Pasta, Preheat the Oven', 'Make the Roux', '@placeholder']", "context": "Here is the context of these images:\n. Clear off your kitchen counterspace to give you enough room for safe food preparation. 
If you have a dishrack or dishwasher full of clean dishes, put them away so there's room for dirty dishes in your way.Once you have the space, get your your supplies ready: 2 medium-large saucepans (pots for the stove, about 4 quart capacity)Heatproof kitchen colander (usually made of metal)Heat-resistant spoon (such as a wooden spoon)Heat-resistant whisk (such as a metal whisk)Kitchen scissorsLiquid measuring cup (2 cup capacity)Measuring spoons. Fill the larger of your two pots 3/4 full of water, or 8 cups, whichever is smaller. Place it on a burner and turn the burner on to medium-high. . With a kitchen knife, cut 1/4 cup of butter (use the lines on the butter package - it will likely be 1/2 a cube.) Place the butter in your second medium-large saucepan (do not turn on burner yet.)Measure out 3 Tablespoons of flour, place in a small bowl near your workspace.Measure out 2 cups of milk, ideally in a liquid measuring pourer, keep near your workplace.Measure out 1 cup of breadcrumbs, set aside.Be sure your salt, pepper, and mustard containers are nearby.Get out a 8\"x11\" baking dish (lasagne pan or similar), and spray the bottom and sides with oil. If you don't have spray oil, pour about 1 tsp of oil in the base of the baking dish and using a paper towel rub the oil all along the inside surface.. As the water is boiling, check the package of your pasta to learn how long it takes to cook. You'll see this package indicates 9 - 11 minutes. Once the water is boiling, carefully pour 12 oz of your pasta into the boiling water. Then, set a kitchen timer for the number of minutes needed for cooking (I recommend the lowest of the times, in this case 9 minutes.) Let the noodles boil, without a lid (uncovered.)While the pasta is boiling, turn on your oven to 400 degrees.When your timer goes off, drain your pasta in a metal collander. 
Be very careful - the pot will be heavy, if in doubt ask an adult or older sibling to help with this step.Once the pasta has drained, place it back into the cooking pot you just emptied, and mix in a little bit (1 teaspoon) of oil to make sure the pasta doesn't stick to itself. . Place your second saucepan (the one that should have 1/4 cup butter in it) on a burner, turn to medium heat.The butter will start melting. When it is fully melted, add your pre-measured flour and stir with a wooden or heat-resistant spoon. It'll be sticky, keep mixing until you can't see white bits of flour.Then, keep mixing for about 5 minutes, over medium heat. You should notice the mixture turns slightly yellow, maybe even a little brown, and get a little thinner.Once it has turned this color & texture, get your whisk ready. Slowly and steadily, pour the milk into the saucepan and whisk it into the flour & butter mixture. It'll sizzle a little at first. Stir with the whisk quickly, with the heat still on.As you stir with your whisk (it's called 'whisking'), you should notice that after a few minutes the mixture will suddenly get thick. This is the flour-butter mixture reacting with the milk. Once you notice this thickness, turn the burner off.. As soon as you turn your burner off, mix in the 12 oz of grated cheese. The warmth of the roux will melt it in.Then, measure out & add 1 Tablespoon mustard1/2 teaspoon salt1/2 teaspoon pepper. You can measure each over a plate to the side of the pot, then once measured carefully pour directly into the pot. That way, if you spill by mistake the spilt ingredient won't over-season what is already in the pot.You should now have a gooey sauce in your saucepan.. You should now have one pot of macaroni, and one pot of cheesey sauce. Pour whichever is in the smaller pot into the larger pot (e.g. hopefully the sauce is in the smaller pot, so you'll pour the sauce into the pot with the pasta). 
If it looks too small, you can get a large bowl and put the contents of both pots into one bowl. Mix well with your wooden spoon, then pour into the oiled baking dish.Top with pre-measured 1 cup of breadcrumbs.. Carefully place in the pre-heated oven, and set a timer for 15 minutes.While the Macaroni & Cheese bakes, set your table with the following items per person:Large plateForkGlass for beverage of choiceIf you are serving with a side salad, now's a great time to put that on the table too. A fruit salad or a green salad goes well with this meal.When the timer goes off, check to see if the Macaroni & Cheese looks done. It should be a little brown on top, and/or bubbling a little on the sides. If it doesn't look ready, set your timer for another 5 minutes.When done, get your oven mitts or potholders and carefully pull out of the oven. If your family is careful, you can put the dish on a trivet or two potholders on your dining table and serve at the table.Bon Apetit!. Now that you've made Macaroni and Cheese once, you can get creative!Try using other cheeses! You can hand-grate your favorites, or chop into small cubes (the smaller they are, the better they will melt) This recipe is a great way to use extra cheese you have in the refrigerator. Add in other ingredients: after the cheese has melted you can add many other items. Cooked bacon, sliced sausages, even some chopped-up vegetables can be tasty. 
Try different spices: instead of 1 Tablespoon of mustard, try:1 teaspoon cumin and 1/2 tsp hot sauce for a Mexican-style flavor1 teaspoon Italian seasoning (or oregano, basil, or thyme) and 2 Tablespoons grated parmesan cheese for an Italian-style flavor\nRead the question below and select from the following choices.\nA: Make It Cheezy\nB: Macaroni and Cheese\nC: Ritz Cracker Topping Ingredients\nD: Ingredients", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_80_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_80_24.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_80_25.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: FOLLOW, COMMENT AND SUGGEST\nB: Epic Twice Baked Potatoes\nC: Second Baking\nD: Tortilla Chips and Tostada Bases", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Watch the Video for This Recipe', 'Ingredients', 'Directions', '@placeholder']", "context": "Here is the context of these images:\n. www.youtube.com/watch?v=rkOF_wMFo6I. 4 Large Potatoes300g (10.5 oz) Tomatoes300g (10.5 oz) Leeks150g (5.3 oz) Chorizo100g (3.5 oz) Cheddar2 Tablespoons Worcestershire Sauce1 Tablespoon Butter for Frying. Wash potatoes and place them onto a baking tray lined with baking paper. Bake in a preheated oven at 200C/400F for 50 minutes or until soft in the middle. Help the potatoes roast faster by piercing them with a skewer about 20 minutes into baking.Meanwhile: Wash the leeks & slice them thin. Throw them into a frying pan with melted butter and saut\u00e9 2-3 minutes before adding chopped tomatoes. Saut\u00e9 for 2 more minutes or until the tomatoes have softened. Add finely diced chorizo and saut\u00e9 for a further 3-4 minutes. Turn off the heat and set aside.When the potatoes are done, let them cool completely before cutting them in half (lengthwise).Scoop out the inside of each potato. In a bowl, mash the potatoes with fork. Throw in the chorizo mixture, Worcestershire sauce and mix until well combined. Taste it and season with salt & pepper, if needed.Fill the potato shells with the mixture. Top with cheddar cheese & bake in a preheated oven at 200C/400F for 15 minutes.Serve as a main, started or side!. \u25ba DON\u2019T FORGET TO FOLLOW ME ON INSTRUCTABLES, OVER 200 RECIPES AND TUTORIALS!LEAVE YOUR COMMENTS, QUESTIONS, IDEAS AND SUGGESTIONS! 
\u25ba Website: www.happyfoodstube.com\u25ba Pinterest: https://www.pinterest.com/happyfoodstube\u25ba YouTube: www.youtube.com/happyfoodstube\u25ba Google+: https://plus.google.com/+happyfoodstube\u25ba Facebook: https://www.facebook.com/happyfoodstube\u25ba Instagram: http://instagram.com/happyfoodstube\u25ba Twitter: https://twitter.com/happyfoodstube\nRead the question below and select from the following choices.\nA: FOLLOW, COMMENT AND SUGGEST\nB: Epic Twice Baked Potatoes\nC: Second Baking\nD: Tortilla Chips and Tostada Bases", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_81_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_81_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_81_2.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Mint Chip Chocolate Cake Batter Cookies\nB: Bake\nC: Mixing in the Oats\nD: Mixing the Dough", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Preperation', '@placeholder', 'Storing']", "context": "Here is the context of these images:\n. 3 large egg whites - let them warm up to room temperature for best results\n1/4 teaspoon cream of tartar\n1 cup superfine granulated sugar\n1/2 tsp peppermint extract\n1/2 cup miniature semi-sweet chocolate chips\nNote:\nIf you cannot find superfine granulated sugar, measure a little over 1 cup of granulated table sugar into a food processor and process about 2 minutes.\u00a0 . Chill mixing bowl and beaters for 15 minutes.\nPreheat oven to 250 F.\nLine cookie sheets with parchment paper.\nBeat egg whites and cream of tartar on high speed until soft peaks form. (this is the part where that first prize Kitchen aid stand mixer would be ooooohhhh so convenient *hint hint*)\nAdd sugar a tablespoon at a time until all sugar is incorporated and melted into the meringue. 
The meringue should be shiny and form stiff peaks when you lift your beaters out.\nyou need to be careful not to overbeat your mix,\u00a0 if your peaks start to soften .. stop mixing immediately.\u00a0 At that point the more you mix the mushier it will get.\nFold in mint or peppermint extract, miniature chocolate chips.\nSome people like their minty confections to be green so if you feel you need to color code your goodies now is when you would add green food coloring.\u00a0 About 3 drops should do the trick. (don't worry the color will lighten up as they cook)\nUsing a pastry bag (or a teaspoon) drop small dollops of cookie mix onto prepared cookie sheets, placing cookies about 1 inch apart.\nNote:\nIf you don't have a pastry bag you can easily make one by putting your filling into a sealable plastic bag and then snipping off one corner with a pair of scissors.. Bake for one hour.\u00a0 After an hour turn off the oven but leave the cookies in to cool and harden for 2 hours.. I have never had these cookies last long enough in my house to go stale, but if you think it may be a possibility then you will want to store the cookies in an airtight container.\nRead the question below and select from the following choices.\nA: Mint Chip Chocolate Cake Batter Cookies\nB: Bake\nC: Mixing in the Oats\nD: Mixing the Dough", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_82_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_82_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_82_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_82_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_82_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_82_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_82_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_82_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_82_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_82_9.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_82_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_82_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_82_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_82_13.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Fiery Pumpkin Samosas\nB: Finish Off the Chocolate\nC: Boil and Bubble, Toil and Trouble...\nD: Bag Method", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['We Need Pumpkin...', 'Filler Up!', '@placeholder', 'Putting It All Togeather..']", "context": "Here is the context of these images:\n. One of the most fun parts of this recipe is getting the \"ingredients\" out of the pumpkin... I decided that the Instructables Robot would give me added inspiration so I carved him into my pumpkin! Since I did not need a whole lot of pumpkin flesh I decided to do the \"newer\" method of carving where you shave off the outer skin to let the light shine through the flesh. I thought this would be perfect for the orange Instructables Robot.\nIf you want to know how I carved the pumpkin I created a second Instructable here: https://www.instructables.com/id/Instructables-Robot-Halloween-Pumpkin/\nOtherwise we continue with the ingredient gathering... Separate the seeds from the pumpkin guts and spread out on a cookie sheet and let dry for a bit. Pre-heat the oven to 350 degrees and sprinkle a bit of olive oil and a good amount of salt over the seeds and roast in the oven until they turn golden brown.. Fry one clove of minced garlic along with half a medium onion in butter. Once the onion is tender add the finely chopped pumpkin and cook on medium high until it is fully cooked and the fibres have broken down. Add salt, pepper, a dash of cloves & nutmeg. 
Set aside and let cool.\nTake one Won-Ton wrapper and put a small spoon full of pumpkin mix just off center. Wet the edges of the Won-Ton and fold it over to make a triangle. Press the edges with a fork to join the \"Ravioli\" and create a seal. Repeat until you use up all of the Won-Tons.. To make the Caramelized Onion \"sauce\" fry one clove of garlic in butter until slightly browned on medium / high heat. Add a whole onion that has been sliced thinly and fry until it is tender and has browned. Add more butter to the pan and cook for 5 more minutes. Remove the onions and butter from the frying pan and set aside. Do not clean the frying pan at this time (you will see why a bit later).. In a pot of boiling salted water drop a few raviolis at a time and cook until they float (the first one in the pot tends to stick to the bottom so you may want to nudge it with a fork after it has cooked for a minute). It does not take long to cook these as the pumpkin is already cooked so you are just cooking the \"Pasta\". When the raviolis are done cooking they will float to the top of the pot just remove them with a strainer and let them drain.\nPut the frying pan (that you fried the onions in) back on the heat and quickly fry each ravioli in the left-over butter. This will add additional flavor and texture to the raviolis and give it some nice color.. Place the Raviolis on a plate and with a spoon drop the fried onions and butter over the top of each one. Take a handful of the pumpkin seeds and sprinkle over the plate. Lastly shave some Parmigianino Reggiano over everything. It is amazing how much flavor is in the pumpkin. 
This dish is a perfect example of sweet / salty & soft / crunchy all working together it is like a party in your mouth!\nAll that is left to do is to light your Jack-O-Lantern (https://www.instructables.com/id/Instructables-Robot-Halloween-Pumpkin/)\u00a0turn down the lights and enjoy dinner!Full RecipeIngredients\n1 Package Won-Ton Wrappers\n2 Cups Pumpkin\n1\u00a01/2\u00a0Onions\n2 Cloves Garlic\n3 Teaspoons Salt\n1 Teaspoon Pepper\n1 Teaspoon Cloves (Ground)\n1 Teaspoon Nutmeg (Ground)\n5 Tablespoons Butter\n1 Teaspoon Olive OilDirections:Filling:\n1) In a saucepan fry 1 clove of garlic in butter on medium high heat.\n2) When garlic is browned add 1/2 onion finely chopped.\n3) Add 2 cups pumpkin and cook until soft about 30 minutes.\n4) Add 1 teaspoon salt, pepper, nutmeg & cloves and stir.\n5) Take off the heat and set aside to cool.Roasted Pumpkin Seeds:\n1) Spread pumpkin seed on cookie sheet and let dry for 30 minutes.\n2) Preheat oven to 325 degrees.\n3) Sprinkle Olive Oil and 2 teaspoons of salt over the seeds.\n4) Roast seeds until the turn golden brown about 40 minutes.Raviolis:\n1) Separate the Won-Ton wrappers.\n2) Place a small spoonful of Pumpkin on the wrapper.\n3) Wet the edges of the wrapper and fold over into a triangle.\n4) With a fork press edges to seal and create decorative edge.\n5) Boil in salted water until the raviolis float about 7 minutes.\n6) Remove from pot and strain.Caramelized Onions:\n1) In a saucepan fry 1 clove of garlic in butter on medium high heat.\n2) When garlic is browned add 1 onion thinly sliced.\n3) Cook until onion is soft and browned about 20 minutes.\n4) Add rest of butter and cook down for another 5 minutes.Plating:\n1) Fry raviolis in the pan used to fry the onions 2 minutes each.\n2) Place on plate and spoon over caramelized onions and butter.\n3) Sprinkle with roasted pumpkin seeds and shaved parmigianino reggiano.\nRead the question below and select from the following choices.\nA: Fiery Pumpkin Samosas\nB: 
Finish Off the Chocolate\nC: Boil and Bubble, Toil and Trouble...\nD: Bag Method", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_83_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_83_25.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Plumb Smoothie\nB: Things You Will Need\nC: Wash Hands, Vegetables, and All Food Preparation Surfaces\nD: Cut Into Pieces", "question": "Choose the best title for the 
@placeholder to correctly complete the recipe.['Remove Seeds', 'Cut Into Slices, Peel', '@placeholder', 'Serve']", "context": "Here is the context of these images:\n. Cut the honeydew in half and then remove and discard the seeds using a metal spoon.. Cut the honeydew melon into slices and then peel the skin off of the individual slices using a vegetable peeler.. \nCut the honeydew melon into roughly 2 x 3 inch chunks. Place half of the pieces in the freezer, and the other half in the refrigerator. Let them chill for 1 hour.. Remove the honeydew pieces from the freezer and refrigerator and place half of them them in a blender with half of the other ingredients. Put the lid on the blender and blend on high until the mixture is completely blended, about 45 seconds. Pour the mixture into glasses, then repeat this process with the other half of the ingredients.. \nServe immediately with straws and a small slices of fresh honeydew, if desired. Makes about 4 servings.\nRead the question below and select from the following choices.\nA: Plumb Smoothie\nB: Things You Will Need\nC: Wash Hands, Vegetables, and All Food Preparation Surfaces\nD: Cut Into Pieces", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_84_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_84_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_84_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_84_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_84_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: How to Make Bread Without an Oven!\nB: Removing Idli From the Mould\nC: The Ingredients\nD: Use Sugru to Make an Ice Mould", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Prepare the Egg and Sugar Mixture', 'Churn', 'Serve Up and Eat']", "context": "Here is the context of these images:\n. 
\n You will need:\n375ml full cream milk220g sugar4 bourbon vanilla beans8 drops food grade bergamot oil8 egg yolks600ml double cream. Add the milk to a saucepan.With each vanilla bean, slice it down the length and scrape the seeds out. Add the seeds and the beans to the milk.Add 8 drops of bergamot oil.Very slowly heat up the mixture till it almost boils.. \n\tWhilst the milk mixture is heating up, add the egg yolks and sugar to a mixing bowl. Beat these until the mixture is thick and pale.. \n\tSlowly beat in the milk mixture to the egg/sugar mixture in the mixing bowl. Discard the vanilla beans at the bottom of the saucepan, but keep any seeds that may have accumulated at the bottom.\n\tReturn the mixture to the saucepan and heat at a medium heat, continually stirring. Keep on stirring till the mixture thickens up, then take it off the heat.. \n\tReturn the mixture to the mixing bowl and beat it lightly to release a bit of heat. Place the mixing bowl in an ice bath and let it cool, stirring it every 3 or 4 minutes.\n\tWhen it has cooled down (10-15 minutes), add the double cream and stir it through.. \n\tPlace the mixing bowl in the freezer, taking it out every hour or so to beat it. This can be done with a stand mixer or by hand.\n\t\u00a0\n\tWhen the mixture is sufficiently thick (5-8 hours), transfer it to a storage container, (e.g. icecream tub or metal tin) then let it freeze overnight.. Use your imagination. 
I served it up with some toffee, but if you had some bergamot fruit you could make a syrupy marmalade and serve it with that.\nRead the question below and select from the following choices.\nA: How to Make Bread Without an Oven!\nB: Removing Idli From the Mould\nC: The Ingredients\nD: Use Sugru to Make an Ice Mould", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_85_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_85_16.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: In\nB: Go to Store\nC: Check Your Cupboards\nD: Dry Ingredients", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Melt Some Butter', 'Finish the Batter', 'Prep the Pan']", "context": "Here is the context of these images:\n. 
You will need:2 cups flour4 tbsp sugar5 tsp baking powder1 tsp salt2-1/2 cups milk6 tbsp butter 2 eggsQuart of blueberriesSemisweet Chocolate Chipsvegetable oilPowdered sugar (optional)Maple syrup (optional). In a large bowl mix together 2 cups of flour, 4 tablespoons of sugar, 5 teaspoons of baking powder and 1 teaspoon of salt salt.. Melt 6 tablespoons of butter in a small bowl. I typically microwave it for a minute, but you can do it the old fashioned way if you are patient.Be careful when taking it out of the microwave because some dishes tend to heat up. . Put two eggs in a small bowl and lightly beat them together. . Mix 2-1/2 cups of milk into the flour mixture. Follow this with the melted butter and the two eggs. Mix until the flour is completely wet and the mixture has an even color in the bowl. Don't over mix it. . Turn on the burner to a medium to a medium-high flame. Pour a little bit of oil into a large frying pan and spread it around to coat the bottom. Or if you have a griddle, just use that. One neat trick I learned recently was to wipe up the excess oil with a paper towel and set is aside somewhere safe. Between pancakes, you can use this paper towel to simultaneously wipe the pan clean and re-grease it. . Stir a generous amount of blueberries into your batter. Also, stir in a few handfuls of chocolate chips, but keep in mind that these have a tendency to sink to the bottom. Make certain that you have some leftover blueberries and chocolate chips lying around so that, when you start cooking, you can ensure an optimal \"Blueberry to Chocolate Chip\" ratio. . Pour some batter into your pan. Through visual measurement, make certain that there is roughly an even number of blueberries to chocolate chips by volume in the pancake you are cooking. 
This 1:1 relationship is considered an optimal \"Blueberry to Chocolate Chip Ratio.\" If after you pour some batter into the frying pan, you feel that the \"Blueberry to Chocolate Chip Ratio\"is imbalanced, you can fix it by adding either blueberries or chocolate chips as appropriate.After your pancake has started to cook on one side, you will see the wet batter on the other side start to bubble. After it has been bubbling for a short while, this is indication to flip it over. Slide your spatula underneath and gently flip it onto the other side.If left to their own devices, the blueberries will elevate the pancake off the surface of the pan and cause improper browning. I find it ideal to use the spatula to push down upon the pancake to flatten it out. This may cause some blueberries to hiss and explode. Don't worry about that.\u00a0They had it coming. Once the pancake has been sitting there for a minute or two flip it over once more. Make certain that the underside had cooked and then use your spatula and set it aside on a plateRepeat this process until all the batter has been used up. Serve immediately with powdered sugar and maple syrup. 
\nRead the question below and select from the following choices.\nA: In\nB: Go to Store\nC: Check Your Cupboards\nD: Dry Ingredients", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_86_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_27.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_29.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_86_30.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_86_31.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Form the Crust\nB: Banana Pudding\nC: Ingredients\nD: Magically Make Powder Into Pudding", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Now Whip It', 'Finished']", "context": "Here is the context of these images:\n. Gather and measure out the following ingredients: * 3 cups of cold milk* 1 TSP of vanilla extract* 1 (8 ounce) package cream cheese* 1 (14 ounce) can sweetened condensed milk* 1 (5 ounce) package instant vanilla pudding* 1 (8 ounce) container frozen whipped topping, thawedAdditional ingredients required to set aside:* 5 large bananas, sliced* 1 (12 ounce) package vanilla wafersServings: 12 . Take the block of cream cheese and blend in a large mixing bowl until fluffy. After reaching a fluffy consistency, gather the additional ingredients that make up the filling: condensed milk, pudding mix, cold milk and vanilla extract into your large mixing bowl. . Continue mixing until you notice a beautiful spreadable texture. I like to use a 5-Speed Hand Mixer and gradually go from a medium speed to high. . Line the bottom of a 9x13 inch dish with vanilla wafers. It's important to line the dish completely with wafers leaving very little open space. This bottom layer of wafers eventually serves as the crust upon the application of the additional layers to come.. Take the filling from your large mixing bowl and slowly spread over the wafers with an even distribution from right to left.. Take the the other half of the remaining whipped topping and spread over the filling and for smooth results try using a cake spatula. After this layer is applied, there should still be a little whipped cream left over. 
Set aside the remaining whipped topping left in the container. . Arrange sliced bananas evenly around the perimeter of the pan and place them row by row across the entire dish.. Finally take the last bit of whipped cream and spread across the layer of bananas. The bananas will tend to darken the second day so this coating is also protective. Additionally, everything tastes better with a little whipped cream on top. Next, you will need to take a handful of wafers crush them all the way up and sprinkle on top. Quick Tip: I like to use my NutriBullet Blender for a refined cookie crumble.Chill for three hours until serving.*Additional Tips*A friend told told me if you soak the bananas in acidulated water it prevents them from turning brown.For a reduced fat version, you can also opt for light cream cheese, 1% milk, sugar free pudding and light whipped cream.. Voil\u00e0, all done!\nRead the question below and select from the following choices.\nA: Form the Crust\nB: Banana Pudding\nC: Ingredients\nD: Magically Make Powder Into Pudding", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_87_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_87_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_87_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_87_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_87_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_87_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_87_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_87_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_87_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_87_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_87_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_87_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_87_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_87_13.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", 
"visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Khara Boondi (Savory Fried Balls)\nB: Fry It Up and Serve Warm\nC: Serve and Enjoy!\nD: The Fried Chicken", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['For the Filling', 'Microwave Mochi', 'Put It Together', '@placeholder']", "context": "Here is the context of these images:\n. (If you're using premade fillings like anko, red bean paste, skip this step)Wash, Peel, Chop, and put the Potato in Boiling Water (Boil for 15-20min)Mash the PotatoAdd the flavoring you want while the potato is still hot to fit your taste(pinch of salt, lil bit of coconut milk...go crazy)Then set it aside. In a Microwavable Bowl, mix together the Mochiko, Water, and Sugar. You can add food coloring if you'd like at this point.Microwave for 2-4min With the Cornstarch, spread it on a large plate or working surface and scoop the mochi on top (ITS REALLY HOT so take precaution) Sprinkle cornstarch on top of the mochi, cutting knife, and also your hands (cornstarch helps keep the mochi from being too sticky)Cut the mochi into 4 equal pieces. Stretch out one of the mochi pieces a little and fill with 1/2 - 1 tbsp. of filling.Wrap and pinch to close Brush off cornstarch (Note* You can stop at this point if you want regular mochi treat)Flatten the mochi to a 1/2inch circular disc.Repeat with the rest. 
On medium heat, put the oil in a large frying pan Fry till golden brown, 3-5min, then flip and fry the other side Serve warm/hot\nRead the question below and select from the following choices.\nA: Khara Boondi (Savory Fried Balls)\nB: Fry It Up and Serve Warm\nC: Serve and Enjoy!\nD: The Fried Chicken", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_88_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_88_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_88_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_88_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_88_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_88_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_88_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_88_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_88_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_88_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_88_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_88_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_88_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_88_13.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Coriander Sweet Potato\nB: Simmer and Enjoy!\nC: Serving\nD: Sweet Potato Dough", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Cook Down the Onion and Sweet Potato', 'Add in the Coconut Milk and Chickpeas', '@placeholder']", "context": "Here is the context of these images:\n. 
1 sweet potato cut into 1/2 inch chunks1 onion, diced 3 cloves garlic, minced1 inch ginger, grated 2+ tablespoons curry powder/garam masala of choice hot pepper of choice to taste - I'm using chipotle pepper powder 1/2 teaspoon+ amchoor powder 14.5 oz can chickpeas + their liquid 14.5 oz can coconut milkturmeric for color (optional) brown sugar to tastesalt to tastefresh limes for serving and seasoningrice for servingparsley and cilantro for servingcoconut oil or other oil for cookingFor this recipe, I suggest using a bright and spicy curry powder - nothing too sweet! The coconut milk and sweet potatoes are already quite sweet, so you want to balance it out. I used mostly curry powder and a bit of garam masala to round it out. I'm using amchoor powder here for tartness, but you can also just add fresh lime juice near the end of cooking. I normally do that any way. :). Heat a pan over medium heat. Add a bit of coconut oil to your pan and let it melt. Add in the sweet potatoes and onions and a pinch of salt. Let this cook for 5-10 minutes, or until the onions have softened and are the sweet potatoes start to darken in color a bit. . Add in garlic, ginger and dry spices. You may need to add more coconut oil if it's very dry!Mix this around and let it cook for a few minutes, just until everything smells awesome. At this point I normally add about a 1/2 teaspoon of chipotle pepper powder, too! The smokiness works really well with the coconut milk and sweet potatoes. But go easy if you're not too into spicy food - do less and work your way up!. Pour in the coconut milk and chickpeas and give it a good stir. Sometimes I'll add a good pinch of turmeric at this point just to give it extra color. :)This curry will darken slightly as it cooks, but turmeric gives it a nice boost!. Once the liquids are in, bring the curry to a boil and then reduce to a simmer. I normally simmer it covered for 15 minutes, and then with the lid off for 15 minutes. 
The first 15 minutes, it's all about getting the potatoes nice and soft. After that, you'll want to reduce the cooking liquid so it's nice and thick. :)After it's been a half hour, turn the heat all the way down and start tasting. If it needs to be more sweet, add a little of the brown sugar! If it's too sweet, try adding more salt or some lime juice. Salt with also boost the flavors of the spices, so if it seems a little bland, add salt!Keep adding and tasting until it's perfect for you. I like to serve it over rice with fresh cilantro and parsley over the top and a bit of lime on the side so you can season it just right! :D\nRead the question below and select from the following choices.\nA: Coriander Sweet Potato\nB: Simmer and Enjoy!\nC: Serving\nD: Sweet Potato Dough", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_89_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_89_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_89_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_89_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_89_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_89_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_89_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: How to Make Mickey Mouse Cookies\nB: Flatten Dough\nC: Roll It, Shape It and Bake It\nD: Make Cookie Dough", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Roll the Dough', '@placeholder', 'Bake the Cookies', 'My Finished Cookies']", "context": "Here is the context of these images:\n. Roll the dough into eight dough balls. For cute snack time cookies make the dough\u00a0balls\u00a0smaller. . Flatten the dough slightly.. Grease a baking tray then place the cookies on and bake at 180 degrees c/ 350 degrees F/ Fan 160 degrees c/ Gas mark 4 for 10 to 12 minutes.. 
Here are some photos of the finished cookies and the baby snack cookies as they proccesed.\nRead the question below and select from the following choices.\nA: How to Make Mickey Mouse Cookies\nB: Flatten Dough\nC: Roll It, Shape It and Bake It\nD: Make Cookie Dough", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_90_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_90_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_90_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_90_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_90_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_90_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Delicious Colombian Arepa\nB: Add Oil and Vanilla\nC: Eat/serve\nD: Mix Well", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['What You Will Need', 'Chopping Up and Preparing the Pizza.', 'The Outline.', '@placeholder']", "context": "Here is the context of these images:\n. \u2022Scissors \n\u2022Knife\n\u2022Pepperoni\n\u2022tomato sauce (pasta type works fine)\n\u2022two plain pizzas\n\u2022orange peppers \n\u2022 pineapples \n\u2022Can opener \n\u2022Cheddar or mozzarella cheese. (Or in my case both.\n\nWhen you have all these things ready move in to the next step.\n\n\n. For the yellow birds eyebrow you will want the pepper to be quite thick as well as flat.\n\nYou will want to quarter the olives because in a minute you will use them to make the outline....\n\nFor eyes just half them.\n\n\n\nPizza: \nGet your sauce and spread it all over the pizza. And add all the cheese you want (I would use mozzarella if you have both as it is white.)\n\n. You may want an image with you for this or just do it off memory like I did. \n\ngrab all of the quartered olives and begin to place them around the pizza until you get an outline like mine. \n\n. 
First of all I would begin by adding the cheese at the bottom of the bird then work your way up from there. Then slighty above half way begin working on the eyes and eyebrows. Note - for the white parts of the eye you may want to use mozzarella. \n\nI was trying to use a flat piece of cheese for a beak but I kind of messed up.... But I'm sure you clever people will find a way around that. (<-_->)\n\nBy the way guys - your pizzas will nearly definitely end up better looking then mine... I'm not a very artistic person :D. I just whacked both of the pizzas in the oven at about 200 degrees for ten minutes. The red one came out much better than the yellow one. \n\nBut it's on your judgement to decide wether it's cooked or not.. There you go guys thanks for looking at this instructable (my first of many hopefully) and I hope your pizza is delicious and artistic - and better than mine. If any of you guys do the beak successfully then please leave a comment :D\n\nAnd don't forget to favourite! I would really like a pizza oven! 
:D\nRead the question below and select from the following choices.\nA: Delicious Colombian Arepa\nB: Add Oil and Vanilla\nC: Eat/serve\nD: Mix Well", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_91_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_91_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_91_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_91_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_91_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_91_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_91_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_91_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Form the Mac'N'Cheese Bites\nB: Spanakopita Bites\nC: All Equipment Used\nD: Mix All Dry Ingredients Together", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Make the Atomic Filling', '@placeholder', 'Coat the Macaroni', \"Fry 'em Up!\"]", "context": "Here is the context of these images:\n. Mix your macaroni and cheese, hot sauce, cheese, cayenne, and 2 tbs of flour in a medium bowl until evenly distributed. . Take about 2 tbs of the mixture and squeeze it to compress the filling. Roll it around to form a ball and place on a sheet pan. If you find it's sticking to your hands or not rolling nicely, dampen your hands with a touch of cold water. It's okay if a piece of macaroni is sticking out a bit, it doesn't need to be perfect! Repeat this process with the remaining mixture. You should make about 16 - 1 1/2 in balls. Chill the balls in the refrigerator for 1 hour to let them set up. . You will need 3 small bowls. In the first bowl put the remaining 1/2 cup of flour. In the second, whisk the egg with the milk. Finally, in the third bowl mix the breadcrumbs, salt and pepper. Arrange the bowls in a line: flour, egg, breadcrumbs.. 
Remove the macaroni balls from the refrigerator. Take one ball and give it a quick squeeze and roll to finalize the shape. Roll it in the flour, shaking off the excess. Then, dunk it into the egg wash. Finally, roll it into the bread crumbs and place back on the sheet. Repeat with the remaining balls. TIP!: Use one hand for the flour and breadcrumbs, and the other for the egg wash or you will end up with super coated finger tips!. Heat a deep fryer to 325 degrees. If you don't have a deep fryer, you can fill a small sauce pan 2/3 full with oil and monitor the temperature with a candy thermometer. Fry the balls in batches for 2-3 minutes until the coating reaches a deep golden color. Remove from oil and drain on paper towels for 5 minutes. Now, eat them!! If they are a bit too spicy for your liking, you can dip them in a little ranch or blue cheese dressing.\nRead the question below and select from the following choices.\nA: Form the Mac'N'Cheese Bites\nB: Spanakopita Bites\nC: All Equipment Used\nD: Mix All Dry Ingredients Together", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_92_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_92_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_92_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_92_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_92_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_92_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_92_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_92_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_92_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_92_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_92_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_92_11.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Bacon Infused Venison Burgers!\nB: Bacon Flavored Caramel 
Syrup!\nC: Cut Bacon\nD: Put a Lid on It and Chill Out.", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Cook Up Bacon', 'Remove Bacon Fat', 'Filter Out the Bacon Bits']", "context": "Here is the context of these images:\n. Open up your package of bacon, and begin to slice it into smallish pieces, about 1/2\" x 1/2\". Cook up the bacon in a skillet. Pour the bacon grease and bits into a glass pitcher or sizable equivalent. Mix in the vodka!!!! . Put the vodka into the freezer for at least 30 minutes. Take the vodka out of the freezer, and remove what I like to call the moon pie of bacon fat that has congealed at the top of the pitcher. . Filter the bacon bits and other floaty pieces out of the vodka by pouring it through a strainer of some sort. I started with a mesh strainer to remove all the bigger pieces, then I started pouring it through a finer filter to get out the small stuff. I used the grease guard that I normally put over skillets when I'm cooking things that shoot off hot grease. I recommend repeating this step several times to remove as many particulates as you possibly can. . Pour your vodka into a fancy glass container, and then enjoy! 
\nRead the question below and select from the following choices.\nA: Bacon Infused Venison Burgers!\nB: Bacon Flavored Caramel Syrup!\nC: Cut Bacon\nD: Put a Lid on It and Chill Out.", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_93_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_93_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_93_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_93_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_93_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_93_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_93_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_93_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_93_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_93_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_93_10.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: minute Tea Kettle Meal for One\nB: Dressing\nC: Mix Up the Jello!\nD: Refrigerate and Add Cheese!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Saute Celery and Onions', '@placeholder', 'Assemble Chicken', 'Serve']", "context": "Here is the context of these images:\n. Ingredients for the chicken:2 Chicken wings (make sure they are right and left side) I forgot and did not save the correct wings to match the body of the chicken.2 chicken legs2 Tablespoons of butter2 Tablespoons of olive oilPoultry seasoning to tasteThyme to tasteYour choice of salt To tasteFresh ground black pepper to tasteIngredients for the Stuffing for the chicken body:1 box of stuffing mix or make the recipe below.5 cups soft old bread crumbs cubed.2 Celery stalks chopped.1 Medium onion chopped.Approximately 1- 1 1/2 cups chicken stock canned or homemade. 
Please note: I started out with 1/2 cup and it was not enough so I am not sure how much more I had to add. This is an estimate.Craisins to taste optional4 Tablespoons real butter and (extra for cooking celery and onions) add olive oil to the butter to prevent the butter from browning.Poultry seasoning, dried or fresh thyme, salt and fresh ground black pepper to taste. I recommend pink salt or sea salt to receive the best nutritional benefits from the natural minerals.I modified Betty Crocker's stuffing recipe by reducing the ingredients to make a much smaller portion and added Craisins, olive oil replacing a ton of butter and replaced the water with homemade chicken broth. Extra Ingredients for the chicken body:1 medium size russet potato.1/4 Cup frozen peas, corn,or diced carrots. Garnishes you prefer:I used an orange, small tomatoes,cooked squash, fresh rosemary,and fresh dark greens.Utensils:Basic utensils were used: Cast iron skillet with oven safe lid but any multi purpose cooker would probably work, crock pot, or I used the stove top and oven,aluminium foil, spoons,fork, knife, bowls, cutting board, scissors to cut the skewer, tooth picks, and skewers to attach the legs.. Procedure:Pre-heat oven to 350 degrees F. Measure ingredients.Wash potato and poke a hole in it to vent it.Place the potato in the oven with or without aluminium foil. Bake 30-45 minutes. About half way done.If you have not already washed and chopped the celery and onions do so now.Place raw chicken on a plate and dry it with a paper towel or napkin. This will make the skins crispy.Place butter and olive oil into a skillet and melt the butter. . Method:Saute the celery and onions in the butter and olive oil and cook for several minutes then . . . 
Add the chicken broth or water to the mixture and season according to taste.Cook until the onions and celery are translucent.While this mixture is cooking rub olive oil all over the chicken legs and wings.If the skins are not covering the legs, use toothpicks to secure the loose skin for appearance. I should have waited to season the chicken until after I used the toothpicks. Season the legs and wings.Remove the celery and onions from the heat without the broth and set aside. Place the broth into a separate bowl. If you decide to cook the stuffing and chicken all at the same time, then go on to the next step and season the chicken how you like . . . otherwise follow these instructions the way I made our chicken.In a clean skillet, melt the butter and olive oil. Add the raw chicken. Start cooking the chicken browning on both sides. You may wish to add more seasonings to the chicken along the way.When the chicken is half way cooked remove it from the heat. . . and to attach them to the chicken body. If it takes longer than 5 or ten minutes to add it to the body of the chicken, I would go ahead and fully cook the chicken to prevent Salmonella. You can add the legs and wings to the chicken's body after the dressing has been baked, by using the skewers and the tin foil for support. Remove the tooth picks. It does not take very long at all to position the chicken pieces to shape the body of the chicken and as soon as the body is formed it goes immediately back into the hot oven. Be safe and be smart, pay close attention to how long the chicken has been out of the oven. It is best to cook the chicken all the way through than to risk getting sick.. 
Procedure:If you will be making a box mix just follow the instructions for making the mix except you will use the stuffing to form the shape of a chicken and then bake it in the oven or multi purpose cookware.If you will be making the stuffing recipe, the instructions are:4-5 cups of soft bread crumbs (cubed) preferably older bread.Gently mix the celery and onion mixture with the bread crumbs. Add water or chicken stock, a little at a time until the dressing will form a ball without falling apart. Add more stock or more bread as needed to form a ball that will keep its form. I made my own stock from scratch.Form the chicken body in the next step.Check on the potato if you haven't already, you need it for the next step.. Method: Pre-heat oven to 350 degrees FRemove some of the bake potato using a spoon as shown in the picture.Place aluminium foil over a cutting board or heavy piece of cardboard overlapping the sides. The foil will firm up the chicken's body so it can be transferred into a skillet or multi purpose cookware using the edges of the foil for hand grips.Place the potato in the center of the cutting board over the foil. Pick up a handful of the dressing and begin to form the chicken's body with the dressing mixture.Lay the chicken legs and wings along the side of the chicken dressing form; making sure you have the right and left wings on the correct side.Place a skewer through the chicken legs and push the legs slightly into the body of the chicken.Trim off the extra length of the skewer; so it won't be in the way of the tin foil when you position the chicken into the skillet.Tie the legs together using the bakers twine for a nice presentation.Position the chicken wings in place and press up against the body of the chicken so that they are not going to move. Position the wings pretty. Stabilize the wings with a skewer if needed. 
I did not use anything.Carefully lift the chicken into the skillet and position the wings if needed; making sure they are attached to the sides of the chicken so during the cooking process the dressing will act as a bond to keep the wings into place when you remove the chicken from the skillet.Cover the skillet with foil or an oven proof lid that won't mash the bird.If baking squash, place it into the oven. I did not use a baking dish for it and placed it directly on the oven rack. Bake until it is done and then remove it from the oven. Timing is the key element for making this recipe so everything is finished baking at the same time. My turkey legs were too large for the amount of dressing I made so I had to make an adjustment by making chicken legs which were much smaller or increasing the amount of dressing I would be making. The chicken legs were perfect for this recipe.Set the timer for 20 minutes and check the bird. Check the inside temperature of the dressing and the meat every fifteen minutes until done. If I am correct the internal temperature should be at least 165-170 degrees F for several minutes to destroy any bacteria.Remove the aluminium foil the last fifteen minutes of the cooking time to brown the chicken and dressing. The dressing takes about 30 minutes covered and 15 minutes uncovered to cook. This is a good base point to estimate how long to cook a chicken leg according to your preference. We cook ours very well done so I can't give you an idea other than what I mentioned. I usually cook whole chickens.. Presentation:Lay the platter on the table.Position the dark greens along the outer edges of the platter as shown.Cut the garnishes into pretty shapes but do not arrange them on the platter until after the chicken is centered on the platter.When the chicken is done, allow it to cool for a few minutes.Carefully center the chicken on the platter and tuck the aluminium foil under the chicken so it does not show. 
Remove the skewers and double check to make sure the tooth picks are removed. I did not get any pictures when I removed the skewers. Add the rest of the garnishes if you are using them.. Crank up some festive music and light the candles and enjoy your holiday meal~ . This recipe is easy to make and does not require a lot of ingredients to make it a fancy affair. It is healthy and delicious~ perfect for a single person or couple who do not wish to make a full holiday meal with all the trimmings. Even a college student who wants to impress their date and has limited kitchen resources could make this if they had a multi cooker. I will be making another turkey platter before Christmas and will share how I made it with you if it is a success. This was such a fun cooking project.Thanks so much for stopping by . . . be safe and happy~sunshiine~\nRead the question below and select from the following choices.\nA: minute Tea Kettle Meal for One\nB: Dressing\nC: Mix Up the Jello!\nD: Refrigerate and Add Cheese!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_94_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_14.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_94_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_27.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_29.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_30.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_31.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_32.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_33.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_34.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_35.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_36.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_94_37.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Sugar Skull Cake\nB: R2D2 Projector Cake\nC: Cake Stand\nD: Gather Your Ingredients", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Making the Skull', 'Skull Candy', 'The Finishing Touches']", "context": "Here is the context of these images:\n. 
For this cake you will need:1 Cup unsalted butter, at room temperature1 1/2 Cups sugar4 large eggs, room temperature1 Teaspoons vanilla extract1 Teaspoon almond extract2 Tablespoons espresso powder1/2 Cup sour cream1/2 Cup whole milk2 Cups plus 4 Tablespoons flour1 Cup unsweetened cocoa powder (I prefer the special dark)1 Teaspoons salt1 Teaspoons baking sodaTo decorate the cake you will need:White chocolate skull (recipe to follow)1 can of chocolate frosting2 cups of Nilla wafers2 cups of chocolate cookie waferscocoa powdergreen food coloringblack food coloringyellow food coloringMint leavesCooking sprayTo make the skull you will need:3 bags of white candy meltscocoa powdercooking sprayvodkaYou will also need a standard 9X9 cake pan and a skull mold. An optional food safe airbrush is also suggested.A few weeks ago I found this absolutely incredible mold online. It\u2019s a bit pricey, but if you can swing it, I can\u2019t recommend it enough. It\u2019s a bit of a beast to mold, but I\u2019ll walk you through it.. Start by first setting your oven to 350F/175C and allowing it to pre-heat.Prep your cake pan by thoroughly buttering and flouring it.Cream your butter and sugar until light and fluffy. Add in your eggs one at a time, scraping down the sides of your bowl as you mix. Add in your vanilla, almond, and espresso powder.In a separate bowl, whisk together your sour cream and milk until well incorporated and then add to your butter and sugar mixture.Sift together your flour, cocoa powder, salt and baking soda and add the mixture slowly to your wet ingredients. Blend until thoroughly mixed but don\u2019t overwork.Pour your batter into your cake pan and bake in your oven for 30-35 minutes or until a toothpick inserted comes out clean.Allow to cool for at least 10 minutes before attempting to remove from the pan.. Next we\u2019ll make our chocolate skull. For the purposesof this recipe, we will only be using the cranial section of the mold. 
The jaw (which is a separate piece altogether) will not be used. The first thing you want to do with this mold is to make sure it\u2019s good and greased. Normally you don\u2019t have to oil up a silicone mold, but I\u2019ve found with trial and error on this beast that everything you can possibly do to make it release your chocolate works in your favor.I spray the whole thing down with cooking spray and then go back over it again with a pastry brush to make sure the spray is in the deep nooks and crannies. The brush also helps to spread out any areas where it might pool. You want a thin coat of spray\u2026Assemble the two halves of the upper cranium and secure. I placed mine inside a box that just happens to be almost the perfect size to hold the two halves together. I brace the sides with a little extra foam to keep it from wiggling.Melt down one bag of candy melts. You can do this either by placing them in a crock pot or electric fondue pot set to low, or by zapping in the microwave for 30 seconds at a time and stirring between cookings.Once your candy melts are melted and smooth, pour the entire pot into the half of your mold that makes up the upper cranium. Tilt the mold back and forth to make sure you get an even coat on all sides. A pastry brush can also assist in getting the chocolate into the grooves and spots that might be a bit tougher to reach just by tilting. Set this aside and allow the chocolate to cool.Melt your second bag and repeat the process with the lower portion of the mold, again allowing it to cool and harden.Melt your third and final bag of candy melts, but this time allow it to cool almost to room temperature. 
You want to be able to pour it into your mold without having it melt through the layer you\u2019ve already poured.When it\u2019s cooled down enough, pour the entire bag into the upper half of your cranium and then assemble the mold, placing the two halves together.Now comes the fun part\u2026rotational casting.Make sure your mold halves are secured together. I use a strap wrapped around the entire thing to make sure all the pieces stay where they are supposed to stay.. Carefully start rotating your mold around 360 degrees. You want to make sure that the liquid chocolate inside the mold fully coats and covers every inch of the mold which means you have to turn it upside down and all around.Do this for a good 20 minutes. It\u2019s a workout, but worth it.Now place your mold in the fridge. Every two minutes for the next 30 minutes, rotate your mold by flipping it onto each side.At the end of those thirty minutes, turn the whole thing upside down and leave it alone for 2 hours! WALK AWAY. Go watch a movie. Take a stroll. Do whatever you want, but leave the mold alone.When it comes time to open the mold, do it carefully. Gently rock the silicone pieces back and forth to help release their hold on your chocolate.Be prepared, you\u2019re going to have breaks. It happens\u2026but for this cake, it\u2019s okay\u2026it\u2019s supposed to look worn and old. If it happens, save the pieces and you can either glue it back together using more liquid candy melt, or simply leave it broken and tell everyone you meant to do that. It\u2019s art\u2026it\u2019s subjective. Do what makes you happy.Now that your skull is out of the mold, it\u2019s time to age it down.. For this project, I decided to inscribe it with ancient Welsh symbols for love. I used a skewer and carved them into the chocolate and then brushed the whole thing with cocoa powder mixed with vodka to give it an aged and worn look.Now that that\u2019s done, it\u2019s time to begin assembling.. 
Gently press your skull into your cake where you would like to have it rest. You want to push hard enough to leave a dent or mark, but not so hard that you run the risk of crushing either the cake or the skull. Now remove your skull and set it aside while we prep the cake.With a sharp knife, carve out the areas where the skull was pressed into the cake.Frost the entire cake with a thick layer of your dark chocolate frosting. Don\u2019t worry about filling in the holes we just carved. The frosting will act like a glue and help hold the skull in place.Crumble up your dark chocolate wafer cookies. You can do this either in a food processor or in a Ziplock bag using a rolling pin.Sprinkle this down on top of your frosting\u2026it will be your dirt layer. Once you are happy with your dirt, add in your skull.Crumble up your Nilla wafers the same way. You want as fine a powder as you can possibly get.When your Nilla wafers are good and pulverized, add in your green food coloring to the crumbs and either pulse in your food processor to coat evenly or place in a Ziplock bag and knead until all the green coats your cookie crumbs. This will be your moss.Sprinkle your moss crumbs down over your dirt and your skull. You can use a bit of frosting or more vodka to wet down the skull to help the moss stick. A little cocoa powder can also help add more depth and contrast.I admit that I used my airbrush to add in more color. This is purely optional and doesn\u2019t have to be done\u2026a paintbrush and food coloring works just as well.You can see how the addition of more green, yellow and a little black helps add to the aged look of the skull and helps sell the realism.. Finally, garnish with your edible mint leaves, arranging them as though they\u2019re naturally growing out and around your embedded chocolate skull.And there you have it\u2026your mossy skull cake is complete! 
If you want even more creepy recipes like this for Halloween, swing by my main Instructables page or check out my horror themed food blog, The Necro Nom-nom-nomicon.Bone appetite!\nRead the question below and select from the following choices.\nA: Sugar Skull Cake\nB: R2D2 Projector Cake\nC: Cake Stand\nD: Gather Your Ingredients", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_95_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_95_22.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Sweet Dragon Wings With Spicy Peanut Sauce\nB: Process\nC: Slice Your Veggies\nD: Making the Sauce", 
"question": "Choose the best title for the @placeholder to correctly complete the recipe.['The Ingredients & Equipment', '@placeholder', 'Cook the Chorizo & Scallops', 'Plate Up Your Meal']", "context": "Here is the context of these images:\n. This recipe will serve 2 as a starter, and should only take less than 20 minutes to cook if you're prepared\nYou could make the sauce even a day in advance and just warm it through if you were cooking for guests, so you'd just need 5 minutes to cook the scallops and you're ready to go!Ingredients:\n6 - 10 Scallops, mine were already shelled, but fresh in the shell would have been even better\nChorizo, preferably raw but cured will do as wellFor the Sauce:\n1 small onion\n1 clove garlic\n150ml passata\n1/3 - 1/2 red bell pepper, or any sweet red pepper\nWorcestershire sauce ( a good dash to taste)\nHot pepper sauce ( depending how hot you like it)\n1 tsp Dijon mustard\n1tsp honey\n1 tsp Paprika\n2 tsp tomato ketchup\n1 tbsp soft brown sugar\nsome Olive oil or flavourless oil (such as vegetable, sunflower etc)\na small knob of butter\nSome salad leaves to serveEquipment:\nSmall Saucepan\nFrying pan\nKnife\nChopping board\nA heat sauce\nKitchen utensils for stirring and flipping the scallops, serving the sauce etc.\nA hand blender ( if you dont have one chop the veggies fine and serve as a slightly chunky sauce)\n2 smart plates to serve. Method:\n\t\tChop the garlic, onion and pepper. I went for fine dice, but if you're blending the sauce chunky is fine.\n\t\tGently fry to onion in the saucepan with a little oil until it starts to go translucent, then add the garlic and pepper. (don't add the garlic with the onion or it will cook to quickly, burn and make the sauce a little bitter) cook until the veggies have softened.\n\t\tAdd all the other sauce ingredients, stir thoroughly, bring to a gentle simmer for 5-10 minutes or so until slightly reduced.\n\t\tBlend the sauce with a hand blender. 
Preparing the Scallops & Chorizo:\n\t\tIf they're in the shell you will need to open them and take them out\n\t\tremove the coral and the grey membrane around the main meat (the corals can be saved to use for a sauce, if you like)\n\t\tRinse the scallop meats under a running tap then pat dry with kitchen paper\n\t\tChop the Chorizo into small dice, or fine discs\n\t\tFeed the scraps to your cat if she's hassling you for attenton/foodCooking the meat:\n\t\tPut some oil and butter into the frying pan over a medium high heat (the oil will stop the butter burning to quickly)\n\t\tAdd the chorizo and fry until the oils are released and the chorizo is starting to brown, then remove the chorizo and drain on kitchen paper.\n\t\tIn the same pan you now add the Scallops to the oil, which is now flavoured with the Chorizo. Fry the Scallops for 30secs-2mins on each side, depending how well cooked you like them. don't overcook them, or they will be like little rubber bullets.\nThat's it, we're ready to serve up!\n . \n Plating your food is an art in it's own right. You may like my presentation, or you may wish to do something even fancier. 
I've just started beautifying my food after reading 'Working the Plate' by Christoper Styler.\nHow I plated my dish\n\n\t\tStart with a pile of small mixed salad leaves and shredded beetroot off centre\n\t\tPlace 3 - 5 teaspoon dollops of the sauce around the salad leaves\n\t\tCarefully put a cooked scallop on top of each dollop\n\t\tSprinkle the chorizo around and on top of the scallops\nPresent your dish to your guests, and bask in their praise.\nFeel free to vary the recipe, you may fancy replacing the Chorizo with Black Pudding, cripsy Prosciutto or bacon for example\n \nRead the question below and select from the following choices.\nA: Sweet Dragon Wings With Spicy Peanut Sauce\nB: Process\nC: Slice Your Veggies\nD: Making the Sauce", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_96_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_96_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_96_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_96_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_96_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_96_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_96_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_96_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_96_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_96_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_96_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_96_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_96_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_96_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_96_14.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Carrot Lentil Soup\nB: Add Stock and Simmer\nC: Stir, Top With Herbs, and Serve!\nD: Vegetable Beef Soup", "question": "Choose the best title for the @placeholder to 
correctly complete the recipe.['Chop & Soften Onions & Celery', '@placeholder', 'Eat the Bone Marrow', 'Finish and Serve']", "context": "Here is the context of these images:\n. Ingredients:2 pounds grass-fed beef shanks (usually 3-4 depending on size)2 pounds carrots, shredded2 pounds parsnips, shredded (select small parsnips, or remove woody center from larger ones)2 medium onions, chopped4 stalks celery, chopped2 Tablespoons butter or oil of your preference2 bay leaves8 cups homemade beef stock (or 8 cups water plus 8 1-cup bullion cubes) (can swap out other stock - I used some chicken here)watersalt & pepperthyme and/or oregano (fresh or dried) - mostly optional but I find Thyme a necessary flavor1/2 teaspoon Worcestershire sauce (optional)handful parsley, choppedfull-fat sour cream (serve at the table)shredded cheddar cheese (optional and can distract from the flavor, but kids often like it)Tools:8 quart dutch oven or soup potgrater or food processor with shredding disc (see intro note about shredding ahead of time for travel!)knifecutting boardwooden spoon or spatula. Heat the pot on over medium heat, and drop in 2T butter or the fat of your choice.Chop onions and celery into small bite-size bits, and dump into the butter with a pinch of salt to speed breakdown. Break two bay leaves in half, and add to the pot. Cook, stirring as needed, until the vegetables are soft.. Increase heat to medium/high, scrape vegetables aside or remove from the pot temporarily, and place beef shanks directly on bottom of pan. You can scoop the veggies over top while they cook, but ensure the meat is directly in contact with the bottom of the pan for best browning. I'm too lazy to scoop the veggies out and get another plate dirty, so tend to do it this way.When they start to get brown and curl up, scoot the veggies again and flip your shanks, again ensuring the meat is in direct contact with the pan bottom. 
If the meat starts to stick or burn, add a bit more butter to the pan and/or lower the heat. You can also deglaze with sherry or the booze of your choice before adding the root veg and stock.. Wash, peel, and grate your root vegetables, then add them to the pot with the browned meat and onion/celery mix. If the pot is hot and dry, you may want to add water or stock at the same time to avoid burning.Note that big parsnips become woody in the center - select the smallest parsnips available. The best ones are similar in size to carrots. If you must deal with giant parsnips, you may need to cut out the centers. Note that this will change your weights - you want roughly the same amount of carrot and parsnip in your soup.I generally prep my veggies and run them through the food processor while the meat is browning, or deputize the 3-year-old to do the shredding. As noted in the intro, you can pre-shred the veggies and vacuum-pack with a bit of olive oil to prevent oxidation if you're traveling and want the veggies ready when you arrive. Highly recommended.. Add stock or water/boullion to cover, and simmer on low/medium, stirring occasionally, for roughly 3 hours. Make sure the liquid always covers the vegetables - add more if needed.. Remove your beef shanks, and let them cool on the cutting board for a few minutes. Use a sharp knife to cut all meat and cartilage from the bone, then chop into small bite-size pieces. Return meat, cartilage, and bones to the pot and continue simmering for approximately another hour. Chop and add fresh thyme or oregano if you have them, as well as a half-teaspoon of Worcestershire sauce.The meat will likely still be a bit tough, but further cooking will soften it up the rest of the way. . Before returning the bones to the pot, pop out any bone marrow. Let it cool a bit, sprinkle with salt, and eat. You're making everyone else delicious soup - this is your treat. 
Consider sharing with your kids (their little brains are growing and could use the excellent fat!) but make sure you get at least one piece for yourself.. When the beef is tender, adjust seasonings. Make sure there's enough salt, pepper, thyme, and oregano for your taste. If you need a bit more umami kick, add a bit more worchestershire sauce.Remove bones and bay leaves, stir in a handful of chopped fresh parsley, and serve. I like to provide a tub of good (read: full-fat) sour cream for people to add to their soup bowls at the table. The fat is a great flavor binder, and it helps cool the hot soup more quickly.This soup saves well and reheats beautifully. Microwave some for lunch the next day.\nRead the question below and select from the following choices.\nA: Carrot Lentil Soup\nB: Add Stock and Simmer\nC: Stir, Top With Herbs, and Serve!\nD: Vegetable Beef Soup", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_97_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_97_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_97_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_97_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_97_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_97_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_97_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_97_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_97_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_97_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_97_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_97_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_97_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_97_13.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Stir Frying!\nB: CHICKEN FONTINA\nC: Prepare Avocados\nD: Prepare Serrano Pepper", "question": "Choose the 
best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Do Your Prep Work', '@placeholder', 'Add the Basil and Serve!']", "context": "Here is the context of these images:\n. \n\t\t1 pound ground chicken\n\t\t2 tablespoons grated or minced ginger\n\t\t2 tablespoons minced garlic\n\t\t1 tablespoon sesame oil\n\t\t1 1/2 tablespoons hoisin sauce\n\t\t2 tablespoons soy sauce\n\t\t1 tablespoon rice vinegar\n\t\ta jalapeno or serrano pepper, minced, de-seed it if you want to!\n\t\thandful of basil\n\t\t1 tablespoon chili/garlic sauce (optional)\nIf you can find Thai basil, use that! I have a hard time finding it at my local grocery store, so I go for normal basil. I think that this is best when it's spicy, so if you'd like a little kick, add some chili/garlic sauce at the end. It'll also give it nice color. :)\n . Because everything comes together so quick, make sure sure have everything minced and measured out.\nPut the pepper, garlic and ginger in a bowl together and measure out the soy sauce, hoisin and rice vinegar into another bowl and mix.. Heat a large skillet over medium high heat and pour in a tablespoon of sesame oil. When it's really hot, dump in the peppers, ginger and garlic.\nStir these around for a minute or so, until nice and fragrant. Then add in the ground chicken and break it up into smaller pieces. You won't want to cook it through all the way here, because we're about to simmer it for a few minutes. Cooking it all the way through here will make it tough!\nAs soon as the chicken is all in 1/2 inch or smaller pieces, turn the heat down to medium low. . Add in the liquids, stir it well, and let it simmer for a few minutes. I like to cover it but you don't have to. Stir it every so often, until most of the liquid is gone.. Turn the heat back up to medium high, and add in a handful of basil. 
(I've attached a photo of what I mean by \"handful of basil\") I like to rip it into slightly smaller pieces!\nStir this around just until the basil is wilted and then turn off the heat. Add in your chili garlic sauce now if you want it. Serve it hot over rice or noodles - it's perfect that way :D\nRead the question below and select from the following choices.\nA: Stir Frying!\nB: CHICKEN FONTINA\nC: Prepare Avocados\nD: Prepare Serrano Pepper", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_98_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_98_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_98_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_98_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_98_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_98_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_98_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_98_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Deathberry Scones\nB: Cook\nC: Finished\nD: Ingredients and Equipment", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Weigh the Dry Ingredients', 'Roll and Cut', 'Eat!']", "context": "Here is the context of these images:\n. This recipe makes around a dozen 3 inch diameter scones.For the scones:3 cups self raising flour1/2 tsp salt1 1/2 tsp dried lavender, ground finely (buy culinary or organic lavender to make sure it is free of pesticides & other unwanted chemicals)170 g cold unsalted butter1 large egg, lightly beaten3/4 cup of buttermilk (or you can make an easy buttermilk substitute by taking 3/4 cup of milk and stirring in the juice of half a lemon. 
Leave it to sour for 15 minutes, then use as the recipe directs)1/4 cup honey1/2 tsp bicarbonate of sodaFor the glaze:2 tbsp honey1 tbsp water1 tsp dried lavenderEquipment:Mortar and pestle, or similar for grinding the lavenderScales or measuring cupsMixing bowlSmall bowlsSpoons, knives, spatulas etc.WhiskMicrowave/stovetopSieveRolling pinRound scone/cookie cutterBaking traysBaking parchmentPreheat your oven to 180C/350F/Gas 4(For anyone without scales, a good weight to volume conversion guide can be found here.). Measure the flour into a large mixing bowl. Add the bicarbonate of soda and salt, and give the whole thing a good whisk to aerate the flour and combine the ingredients thoroughly.. Cut your butter into small cubes and add it to the flour mix. Rub the butter into the flour with your fingertips until there are no large pieces left and the mixture resembles breadcrumbs. Stir in the ground lavender.. Make a dip in the centre of your flour mixture and pour in the egg, honey and 1/2 a cup of buttermilk.Stir the mixture together with a flat bladed knife until a rough dough forms. If the the mixture seems too dry, add more buttermilk, a little at a time, until you have incorporated all of the flour.Tip the dough out onto a well floured surface and knead it gently and briefly to bring it together into a ball. Handle the dough gently to keep your scones light and airy.. Roll out your dough to around 3/4 of an inch thick. The dough will be fairly soft but should't be too sticky. If you have problems rolling it out put the dough in the fridge for half an hour or so to firm it up.Cut out the scones with your cutter of choice, it's a good idea to dip the cutter in flour every so often to stop it getting too sticky with dough. Cut out as many scones as you can, then re-roll the scraps to cut out some more. 
Again, treating the dough gently is the best way to get fluffy scones, so use a light touch with the rolling pin, and try not to re-roll the scraps too many times.Place your scones on a lined baking sheet, and bake for 12-15 minutes. While the scones are cooking, prepare the glaze.. For the glaze measure the 2 tbsps of honey and 1 tbsp of water into a bowl. Stir in the dried lavender. Microwave the bowl in a couple of 20 second bursts until the glaze is hot and steaming. (You can also heat the glaze on the stovetop.)Once the glaze is hot leave it to stand for five minutes to let the lavender infuse, then strain out the flowers.. When the scones are nearly done, after 12-15 minutes in the oven when they are nicely golden, remove the tray from the oven and brush the scones with the glaze.Return the scones to the oven for another 2-3 minutes to set the glaze, then take them out and leave to cool a little.. Scones are best served warm, so eat soon after baking, or rewarm them in the oven or microwave. To serve, split the scones in half and spread with clotted cream, whipped cream, or whipped butter (with some added honey of you're feeling decadent). They are also great filled with lemon curd. 
Eat them for breakfast, brunch or an afternoon snack with a nice cup of tea.\nRead the question below and select from the following choices.\nA: Deathberry Scones\nB: Cook\nC: Finished\nD: Ingredients and Equipment", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_99_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_27.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_28.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_99_29.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_99_30.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Chocolate Covered Matzo\nB: Chocolate Croquembouche\nC: Chocolate Pastry Cream\nD: Arrange Your Matzo", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Melt the Butter and Sugar', 'Start Cracking!']", "context": "Here is the context of these images:\n. Recipe adapted from the queen of all things delicious: Smitten Kitchen You will need: 4 - 6 Matzos 12 oz. semi-sweet chocolate 2 sticks of butter (I use salted) 1 cup light brown sugar, packed A handful of slivered almonds or chopped walnuts. Place your matzo on the baking sheet so that it fits. You can break it up and pieces can overlap to cover the area. We also wrapped the baking sheet in aluminum foil for easy clean up.. Preheat the oven to 350 degrees. Place the butter and sugar in a saucepan over medium heat. Melt and stir until the mixture starts to thicken and bubble. Once it begins to bubble, leave it on the heat, continuing to stir, for a few more minutes. At the last minute, add about a half teaspoon vanilla (if you desire) and stir in.. Pour the mixture on top of your matzo and spread quickly before it begins to set. Once it is thoroughly distributed, place the sheet in the oven and bake for about 15 minutes. Keep an eye on it and turn the heat down if it starts to bubble or brown too much.. Once your matzo and toffee have set in the oven, remove it and immediately sprinkle on the chocolate chips, distributing evenly. Let them sit to melt on the hot toffee for a few minutes, then spread the chocolate with a spatula. When the chocolate is still melted, add your almonds or walnuts (if desired). Let the entire sheet cool. 
You can put it in the fridge to the speed up the process, it should take about an hour in the fridge. . We just used our hands to crack the matzo deliciousness into smaller pieces. They'll be uneven, but trying to cut them with a knife proved unsuccessful. They are just as delightful, no matter what shape they are. . This is the part where you have to stop yourself. It's too good. Share your matzo crack with some friends. \nRead the question below and select from the following choices.\nA: Chocolate Covered Matzo\nB: Chocolate Croquembouche\nC: Chocolate Pastry Cream\nD: Arrange Your Matzo", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_100_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_100_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_100_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_100_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_100_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_100_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_100_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_100_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_100_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_100_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_100_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_100_11.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Barbecued Honey Seafood\nB: Not Quite Yet!\nC: Prepare the Ingredients\nD: Bread", "question": "Choose the best title for the @placeholder to correctly complete the recipe.[\"Let's Get Ingredients\", '@placeholder', 'Prepare the Other Ingredients', 'Make the Sauce']", "context": "Here is the context of these images:\n. It doesn't matter how you get these, whether it be through a grocery store heist, or an Indiana Jones style quest. \u2022 1 pound of shrimp. 
These shrimp should be uncooked and gray. If they're already cooked they'll be tough when you re-cook them. \u20221/2 pound of scallops. \u20221/2 pound of squid tubes. These are basically squid scalps. They're the skin on the outside of the squids head. They look like white rubbery wizard hats.\u20221 pound of clams and oysters. There won't be a pound of clam meat, because the shells make up a lot of the weight, but a pound should be enough, even including the shells.\u2022Linguini. One package will probably be enough. It expands a lot when it's boiled.\u2022butter and olive oil. \u2022heavy cream. \u2022Take the shrimp. Chop off the tail, and peel the thin shell off the creature. Play death metal to set the mood. Now take a fish knife and cut a small slit in the shrimps belly. You should see a black line running through it. This is either it's spine or some type of vein. Either way it often has sand in it, so make sure and clean it thoroughly, removing the line. After all your shrimp have been thoroughly eviscerated be sure and wash them under water.\u2022 Depending on the size of your scallops you may need to chop them into smaller, more bite size pieces. This helps it to cook better and to mix with the rest of the dish.\u2022Take the squid tubes and chop them up, so that they form rings of squid. If you've ever had calamari shaped like rings, this is how they do it.. \u2022Take your clams and oysters and set them in a steamer and put it over the stove to boil. If you don't own a steamer, take a colander and put it onto a pot of water. Set the clams in the colander, but make sure that the water doesn't touch them. The steam will cause them to open up, but we don't want them to be cooked just yet. Now pry them out of the shell with your thumb and wash them\u2022Take the box of linguine noodles. Set it on the table and allow it to continue to be linguine. We don't need it just yet.. Take a pan and mix butter and olive oil together. 
Heat up the mixture until everything is completely warm and melted. Toss in the seafood and let it cook. Fry to taste, but make sure the shrimp turns pink before you take it off.Now boil some water and toss in the linguine. . I chose to do a white sauce. Take some butter, and some heavy cream and mix it in the pan (after you remove the seafood). Let it melt, and toss in some chives, or lemongrass. Onions go well with this sauce too, but make sure they're very finely chopped.You could choose other sauces as well. I hear white wine sauce goes well with this type of dish.. Take the linguine and the seafood and mix them thoroughly. Leave the sauce in the pan so that diners can take enough sauce to suit their preferences.Have a linguine party! \nRead the question below and select from the following choices.\nA: Barbecued Honey Seafood\nB: Not Quite Yet!\nC: Prepare the Ingredients\nD: Bread", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_101_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_101_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_101_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_101_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: How to Make a Vampie\nB: Prepare\nC: Add Pudding and Whipped Cream, Then Repeat\nD: How to Make a BLT", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Chop Cake and Layer', 'Drizzle With Liqueur', 'Add Fruit', '@placeholder']", "context": "Here is the context of these images:\n. 
You'll need:angelfood cake (made from a box is dandy; purchased pre-made is also fine, and faster) 2 boxes french vanilla pudding (I used Jell-O instant) 4 cups whole milk (I often substitute half-and-half for extra richness)frozen fruit (I used cherries, blueberries, and raspberries from Trader Joe's) liqueur(s) of your choice (I like amaretto, irish cream, and any fruit-flavored liqueur) 2 cups heavy/whipping cream (make this fresh)1 teaspoon vanilla extractpinch of saltbrown sugar to taste (I use about 1/4 cup)grated chocolate on top (optional)Go ahead and bake your angelfood cake, mix up your pudding, and whip your cream. These can be stored until you're ready to assemble the trifle.I've made angelfood cake from a box and from scratch, and the difference isn't major- here you're combining it with so many other flavors that any such distinction would be lost. Purchased pre-made angelfood cakes will do, but tend to be a bit dry and sometimes taste off. Pound cake may be traditional, but I prefer angelfood because it's lighter, more absorbent, and complements the berries better.I used a trifle bowl (purchased at Target for $14), but you can use any bowl. Glass is preferable, because then you can see all the pretty layers.. Chop your cake into smaller pieces, about 1 inch thick, and spread them in a dense layer across the bottom of your trifle dish or bowl. The white interior of the cake looks best facing outwards for contrast with the fruit, so keep the darker edge pieces facing up. They'll disappear into the layers.. Pick the liqueur of your choice to drizzle over the cake pieces. You can use a different liqueur for each layer if you like- that worked quite nicely for me this time. This time I used plum brandy on the bottom layer, marsala in the middle layer, and kirschwasser (cherry) on the top layer. Other good choices: amaretto (almond), goldschlager (cinnamon), chambord (raspberry), triple sec (orange), irish cream (one of my favorites), or kahlua. 
Sherry is traditional, but doesn't add much flavor. Adjust quantity and proof of liqueur to your preferences.You can skip this step if you don't want the alcohol, or add a bit of fruit juice to help soften the cake. There are enough wet ingredients that everything will turn out well anyway. . Cover the cake layer with the fruit of your choice.I used frozen cherries, raspberries, and blueberries: you can use most any fresh fruit available, though I find berries best complement the texture. Don't worry about thawing frozen fruit, as it will thoroughly melt while the trifle sits.. Cover the berries with a layer of vanilla pudding, then a thin layer of whipped cream.. Don't worry about 100% coverage or being tidy- everything will get layered over, so just dump it on there. Keep an eye on the sides to get a nice layered look.Now add another layer of cake, drizzle it with liqueur, sprinkle with berries, and add more pudding and whipped cream. Continue until you've filled your trifle bowl or run out of ingredients. My trifle bowl held 3 layers of cake and fruit, with two layers of pudding and cream. Depending on the size of your trifle bowl, you'll likely have leftovers. Grab a glass bowl or some wine glasses, and make more little trifles with the extras, then stash them in the back of your fridge. They'll make excellent leftovers.Cover the top in a final layer of whipped cream, then grate chocolate over the top if you like for bonus style points.. Cover the finished trifle in plastic wrap, and store it in the refrigerator until ready to serve. 
It can easily be made the night before, or earlier in the day, as the flavors only improve upon sitting and mingling.Garnish with a couple of fresh berries, a dusting of spice, or a sprig of mint if you're feeling particularly giddy, then just add a large spoon and step back to avoid the ravening hordes.\nRead the question below and select from the following choices.\nA: How to Make a Vampie\nB: Prepare\nC: Add Pudding and Whipped Cream, Then Repeat\nD: How to Make a BLT", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_102_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_102_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_102_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_102_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_102_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_102_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_102_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_102_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Broccoli and Potato Soup\nB: Saute the Veggies\nC: Add Chicken Broth, Bring to a Boil, Add Rice\nD: Eat It", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Start', 'Blendeer', '@placeholder']", "context": "Here is the context of these images:\n. 1 Head of broccoli.\n1 Head of\u00a0cauliflower.\n2 Onions (you can use two white onions if you like I use a red one because I have one).\n3 stalks of celery (this one is to your liking).\nhalf stick of butter.\n3 cloves of Garlic.\nenough chicken stock to cook all the ingredients.. First chop the celery and the onions, you want medium size\u00a0pieces\u00a0not to thin.\nSecond turn on the stove and melt the butter in a frying pan large enough onion and celery, when the butter melts add the vegetables and saute.. 
While you saute the\u00a0vegetables prepare the chicken stock and check the salt level, this is the moment to add more water or salt if required, next cut the broccoli and\u00a0cauliflower you want\u00a0medium\u00a0to big pieces and remember remove the stalk from the broccoli.\u00a0\nWhen the onions and the celery are\u00a0sauteed add then to the stock with the chopped broccoli and cauliflower\u00a0and\u00a0let them cook for a 15 - 20 minutes.. When you fell the broccoli and the cauliflower tender turn off the stove and carefully\u00a0(Because that soup is really hot)\u00a0place the soup in the blender until you have the right consistency, do it little by little and \u00a1voil\u00e0!\u00a0you finish the\u00a0preparation.. Serve a little on a plate and enjoy, you can garnish\u00a0with a\u00a0little\u00a0of laurel or other green herb like coriander but remember this is for decoration only.\nRead the question below and select from the following choices.\nA: Broccoli and Potato Soup\nB: Saute the Veggies\nC: Add Chicken Broth, Bring to a Boil, Add Rice\nD: Eat It", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_103_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_103_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_103_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_103_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_103_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_103_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_103_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_103_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_103_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_103_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_103_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_103_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_103_12.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural 
image", "source": "RecipeQA", "options": "A: Mixing Molasses Syrup With Popcorn\nB: Popcorn Ball Brains\nC: Cover the Snowmen\nD: Stack the Snowmen", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Getting Your Work Space Ready', '@placeholder', 'Cleanup, Storage, and Later Eating']", "context": "Here is the context of these images:\n. Remember, this is for a pretty large batch. I would recommend halving this recipe.\n*12 1/2\u00a0 qts of popped popcorn (for this I used ~1 cup of popcorn kernels\nvegetable oil\n3 c. light molasses\n2 c. white karo syrup (I think this is just corn syrup)\n1 1/2\u00a0 c. white table sugar\n4 tbsp. butter + 1/4 c. butter at room temperature\nSpecial Equipment:\nCandy thermometer\nLARGE bowl (heat proof)\nmedium bowl (heat proof)\nLarge pot\n* 1 quart = 4 cups. This is not hard but it may be a little strange for some people who have never cooked popcorn without a microwave. Follow the instructions and everything will be fine!\n1. Place 3-4 corn kernels in your large pot and pour enough oil in the pot to come up about half way to three quarters of the way up the kernels. Turn on the stove to medium heat and wait for your 3 kernels to pop. They will tell you when the oil is hot enough. PLACE THE LID ON YOUR POT. You do NOT want hot kernels or oil jumping out of the pot and hurting you, your loved ones, or fuzzy friends.\n2. Once your 3-4 kernels have popped place enough popcorn kernels in the pot to cover the bottom of the pot and not stack on top of each other. Put the lid back on the pot.\n3. The popping will start out slowly and quicken. Once your popcorn has slowed its popping (but has not stopped) remove it from the heat and pour into a medium sized heat-proof bowl.\n4. Use a heat proof utensil to scrape anything left in the pot that you don't want burning when you do the next batch of popcorn.\n5. Place 3-4 more kernals in the pot and put more oil in as you did before.\n6. 
While oil is heating up shake your bowl with the popcorn in it a little bit so the kernels that didn't pop move down to the bottom.\n7. CAREFULLY scoop handfuls of popcorn out of the bowl its in and into your very large heat-proof bowl. Try to avoid unpopped kernals because they can be VERY hot for a little while after you pour them into the bowl. Do not try to reuse these unpopped kernals.\n8. Once the oil is heated add more popcorn kernals to it as you did before in and repeat steps #2 - #8. You will most likely make 3-4 batches of popcorn if you are using the proportions in this recipe. . This may seem like a strange step but when you mix the molasses syrup with the popcorn and then form the balls you're going to take up a bit of space.\nWherever you're going to do this make sure that you put wax paper down on your work space (see picture)\nTake your 1/4 cup of butter at room temperature and put it on a plate. This will be the only thing that keeps your hands from sticky madness.. Ok! The popcorn is made and your work area is ready!\n9. Pour the molasses, karo syrup, and sugar into your pot. Stir it up until mostly homogeneous. Put the lids back on your molasses jars and karo syrup container and flip it upside down so that the syrup pools to the top of the container. This way you get everything you paid for! Pour the rest of the syrups into the pot.\n10. Set your candy thermometer in pot. Turn on the heat medium or medium high.DO NOT STIR YOUR SYRUP AT ALL ONCE IT HAS STARTED HEATING! RESIST THE URGE!\n11. Once your syrup starts boiling throw in 1 tbsp of butter and DO NOT STIR.\n12. When the syrup reaches your desired temperature (discussed below) remove it from heat and add the other 3 tbsp of butter and let it melt on top of the syrup. 
Once it is completely melted it probably wont hurt anything to gently stir it in so it's mixed well.TEMPERATURE:\nPLEASE don't let this section intimidate you.\nNana has the syrup go to 238 degrees in Tucson, AZ (Elevation: ~2,389') according to google)\nI looked on a website and it said that you should reduce the cooking temperature by 1 degree every 500' increase in elevation. I currently live in Colorado at an elevation of 8437'. The difference in elevation is ~6000' so by this rule I should bring the syrup down 12 degrees to 226 degrees, which is what I did. I might take it down 1-2 degrees next time but it still came out very well. If you would like to figure out what temperature you should cook it to I would use this 1 degree per 500' rule. Don't forget that if you live at a LOWER elevation than Tucson you will add 1 degree per 500'. To find your elevation you can check google. I hope this makes sense. . This is a very messy part. There will be lots of popcorn that spilled over your bowl. That's okay - I always like to eat the jumpers while someone mixes the popcorn (if I am lucky enough to have someone else mix it for me).\nAgain, splitting your popcorn into two bowls is not ideal because of this part. If you put too much syrup in one bowl the other is not going to have a very nice coating of molasses.\n13. Pour about 1/2 of your molasses batch over the popcorn.\n14. Mix the popcorn gently by scooping spoons/spatulas down the sides of the bowl to the bottom. Gently bring the spoons up through the popcorn so that the molasses that drips to the bottom is brought up to the top of the popcorn. Repeat this step many times until most of the molasses is coating the popcorn and not pooling at the bottom. The motions are the same as if you were tossing a garden salad. This may take a little while. Be patient. Then pour more molasses over and mix again. Repeat until all of the molasses has been mixed into the popcorn.. This step is my favorite part.\n15. 
Once the molasses syrup is mixed thoroughly into the popcorn WASH YOUR HANDS.\n16. Once your hands are clean and sanitized scoop up some of the room temperature butter (remember, you placed 1/4 cup on a small plate in your work area). Smear it all over the inside of your hands and between your fingers if need be. This will keep the popcorn from sticking to you.\n17. Grab some popcorn (make sure its not too hot to touch) and form it into balls. It may fall apart if its too warm or the cooking temperature was too low. That's ok. Wait for it to cool a little longer and then reform them. Don't be too rough - you don't want to squish the popcorn!\n18. At this point you can eat or store the popcorn balls. For information on storage see the next step. ENJOY!\nNOM NOM NOM!. The bits of popcorn left in the bowl are good to eat and help you postpone tapping into your popcorn ball stash (even if it's for an hour). If you let your bowls and utensils soak in hot water for a little while you will have no trouble removing the molasses syrup.\nWe always freeze about 2/3 of the popcorn balls in ziploc bags. When you want to eat one you can just take it out and microwave it for ~30 seconds.\nThe ones that you don't freeze can just be kept in a ziploc on the counter or stacked on a cake holder with a lid.\nOne last thing - if you have sensitive teeth or you just want to have your popcorn ball softer just throw it in the microwave for 10-20 seconds. They're not hard but you may just want yours a little softer.\nI hope you enjoy these. 
Have a wonderful day!\nRead the question below and select from the following choices.\nA: Mixing Molasses Syrup With Popcorn\nB: Popcorn Ball Brains\nC: Cover the Snowmen\nD: Stack the Snowmen", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_104_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_104_20.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Tenderizing the Veggies\nB: Easy Shepherd\u2019s Pie Recipe\nC: Easy Croissants Recipe\nD: Conclusion", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Preparing the Meat Broth', 'Preparing the Veggies', '@placeholder']", "context": "Here is the context of these images:\n. 
1 lb of boneless meat (beef sirloin, pork loin or chops) 2 large onions (white or red) 3 bay leaves 4 medium-sized beets 1-2 Tbsp lemon juice 3-4 medium-sized potatoes, preferably Yukon Gold 2 carrots 4 celery sticks 2 red bell peppers 1/4 - 1/2 of small head of cabbage 4 tomatoes 2 Tbsp olive oil 1 can of beans, preferably Cannellini white, but black or red will also work 1 bunch of parsley 1/2 - 1 bunch of dill (optional) 5 cloves of garlic Salt, pepper, sugar Sour cream (optional) Tomato paste (optional). Note: I recommend completing this step a day in advance, since it takes a while.Caution - Meat Handling: Keep meat frozen when storing. Move it to the refrigerator at least a few hours before cooking, or defrost. Wash your hands and equipment after handling raw meat.Cut meat into about 1/2-inch cubes and toss into a medium-sized pot Fill the pot 3/4 of the way with water Peel the skin off of 1 onion and add the whole onion to the pot Add bay leaves, and salt and pepper to taste Bring the pot to a boil; then lower the heat to medium and let it simmer for about 2 hours When the meat is ready (soft and chewy), throw away the used onion Set aside the finished broth and let it cool; Refrigerate until ready to use. Cut off the leaves and stems, keeping only the heads of the beets Peel the beets Place beets into a pot, add enough water to cover them, and cook until you see bubbles start accumulating. That means the beets are about to boil.Note: Do not boil or the beets will lose their bright color Strain the beets over a separate container to keep the liquid Add lemon juice to the liquid to preserve color. Dice the onions Cut the potatoes and tomatoes into bite-sized cubes Cut carrots, celery, bell peppers, cabbage, and cooked beets into thin strips Chop garlic into tiny pieces or grate it. Place the large pot over medium heat and add 2 Tbsp. 
of olive oil Add onions, celery and carrots and saut\u00e9 (cook them in the oil) for 4 - 5 minutes Add cabbage, strained beets, potatoes, bell peppers, 1.5 - 2 cups of the beet liquid prepared earlier, salt, and pepper Cover with a lid and cook on low heat until the veggies are tenderNote: The veggies should still have some texture and crunch; otherwise they're too soft Lift the lid about every 10 minutes, mix, and check if the veggies are ready--it should take about 30 minutes Add tomatoes, salt, pepper, and sugar as needed, and cook for a few more minutes. Add beans, the rest of the beet liquid, and meat with broth. If needed to fill up the pot, add some water and bring to boil. Let simmer for a few minutesTaste the dish and add extra salt, pepper, and sugar as desired. It should taste sour-sweet (not too sour or too sweet)Note: Actually taste the dish and see if you like it at this step--this is the most important step Add chopped parsley and dill and mix it with the rest of the ingredients and bring it to boil again. 
Turn off the heat completely immediately after it starts boiling or the greens will overcook Add garlic right after turning off the heat Partially cover the pot with the lid, so the veggies can keep cooking in the hot water Let it cool, then serve Add a teaspoon of sour cream to your bowl if you want and enjoy :)\nRead the question below and select from the following choices.\nA: Tenderizing the Veggies\nB: Easy Shepherd\u2019s Pie Recipe\nC: Easy Croissants Recipe\nD: Conclusion", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_105_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_105_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: 100% Whole Wheat Focaccia\nB: Knead Dough, Shape, Cover, & Let Rise\nC: Ingredients\nD: Egg Wash & Cover With Remaining Mixed Seeds", "question": "Choose the best title for the @placeholder to correctly complete the 
recipe.['@placeholder', 'Grate!', 'Stir Em Up', \"It's Ready for Perfection\"]", "context": "Here is the context of these images:\n. Gather your Ingredients. 2 Cups Whole Wheat Flour 1/2 Cup White Flour 1/2 Cup Flax- You can use ground if your family doesnt like flax or omit it and just add a extra 1/2 cup of white flour 1 1/2 cups \u00a0Salted Butter- Cold 1 Tbsp Vinegar 4 Tbsp Cold Water. 1 Egg. Add your flours, and flax into a bowl. Grate your Butter into the Bowl. Grating it on the largest setting on your Grater is a wonderful was to get the perfect size of Butter pieces without trying to mix it into pea sizes pieces. \u00a0. Add remaining ingredients and mix until it forms a ball.\u00a0 Stick that bad boy in the fridge for at least a half hour before using.\u00a0. now use it to your hearts desire! \u00a0\u00a0 i used mine to make mini Meat Pies!\nRead the question below and select from the following choices.\nA: 100% Whole Wheat Focaccia\nB: Knead Dough, Shape, Cover, & Let Rise\nC: Ingredients\nD: Egg Wash & Cover With Remaining Mixed Seeds", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_106_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_106_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_106_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_106_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Green Chile Cheeseburgers\nB: Frying the Onion Rings\nC: Making the Patties\nD: Onion Bacon Cheeseball", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Chilling the Meat', '@placeholder', 'Time to Serve', 'Enjoying Your Meal']", "context": "Here is the context of these images:\n. 
For two pounds of burgers:1/2 cup of fresh minced green onion1 tablespoon of Worcestershire sauce3 cloves of minced garlic1 teaspoon of salt1/2 teaspoon of black pepper1 teaspoon of Italian seasoning. Add the 2 pounds of venison burger to the bowl and mix thoroughly. . For best flavor, it is advised that you let the meat rest in the refrigerator for at least an hour. . After letting the meat cool down in the refrigerator, it is time to make the patties. I made 8 1/4 pound patties with 2 pounds of meet. . Grill the patties for 3 minutes on each side. There should be an amazing smell and grill marks on your patties. . This part is optional. After grilling for 6 minutes, I turned the heat off and put cheese slices on top for some delicious cheesy taste. I would recommend to leave at least one without cheese so you can have a full and unaltered taste of the burger. . I toasted kaiser buns and regular buns. It's optional but I enjoy the buttery crunch. You can treat these like a cheeseburger and put on lettuce, tomato, onions, etc. . With everything done, it's time to dig in! I went traditional with my sides. I ate the burgers with baked beans and a salad. It was a great meal. 
I hope you enjoyed!\nRead the question below and select from the following choices.\nA: Green Chile Cheeseburgers\nB: Frying the Onion Rings\nC: Making the Patties\nD: Onion Bacon Cheeseball", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_107_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_107_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_107_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_107_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_107_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_107_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_107_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_107_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_107_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_107_9.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Campfire Boiled Eggs\nB: Check the Rice\nC: Done\nD: The Triple Rinse", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Purchasing Raw Dried Peanuts', '@placeholder', 'Seasoning and Cooking', 'Enjoy and Spread That Peanut Love']", "context": "Here is the context of these images:\n. For this, the ingredients are simple.1.5 pounds of raw, dried peanuts1/2 cup of saltPressure CookerWater. For folks not in the southern United States, green peanuts are very difficult (or impossible) to find. Raw dried peanuts, though not found in every grocery store, can be found in specialty and health stores. It's important to note that dried roasted peanuts are completely different. They will not work! For this, I bought about 1.5 lbs from the local health food store, and paid $3.00. The peanut on the left is a raw, dried peanut. The right is a dried roasted peanut.. Peanuts grow from the ground, therefore they're, well...dirty. The first three steps are rinse. rinse. rinse. 
I collect the water from my rinses so that I can compare the cloudiness of each rinse to the next. Three solid rinses should have the peanuts ready to boil.. Boiled peanuts require time. With few ingredients, they need time to soak in whatever seasonings they're boiling with. I prefer the original method, plain old salt and water. For this round I purchased about 1.5 lbs of raw, dried peanuts, and I set aside about a 1/2 cup of salt. After rinsing thoroughly three times, I put my peanuts into a pressure cooker, filled to the fill line with water. It's important that you don't overfill with water! I then added about 1/3 of the salt that I set aside.Pressure cookers are great because they cook about 8X faster than a pot on a stovetop, and you don't need to continuously add water.Place on high heat until boiling, the rocker on the top of the pressure cooker will begin to shake furiously. Once it does, lower the heat until you get a soft, side to side rock from the rocker at the top. After an hour and a half remove from heat for about 5-10 minutes to let the pressure subside, then slowly twist off the top of the pressure cooker. Add the rest of the salt, stir it up, twist the lid back on and place back onto high heat and bring to boil, repeating the simmer process. After about an hour and a half remove from heat, but this time let it cool completely without removing the lid. When you do, you'll notice an oily sheen on top of the water, that's normal!The rest is up to you! Like your peanuts a little more firm? Remove from the water once cooled. If you like them on the mushy side, leave them in for as long as you'd like. If the peanuts aren't salty enough DON'T add more salt, just let them sit and soak.. After your labor of love comes the very best step, the eating! Enjoy your boiled peanuts much like you enjoy sunflower seeds, and toss the shells aside. Great boiled peanuts require a bit of trial and error, but once you get it right you can never go wrong! 
Share them with a friend and spread that peanut love wherever you are!\nRead the question below and select from the following choices.\nA: Campfire Boiled Eggs\nB: Check the Rice\nC: Done\nD: The Triple Rinse", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_108_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_108_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_108_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_108_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_108_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_108_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_108_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: (perfect) Lemon Meringue Pie\nB: Continued Crust\nC: Making of the Crust\nD: Ingredients & Equipment", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Baking the Cupcakes', 'Filling the Cupcakes', 'Making the Meringue']", "context": "Here is the context of these images:\n. \n For the cupcakes:\n\t\t215 g self raising flour\n\t\t60 g caster sugar\n\t\t1 egg\n\t\t1 egg yolk\n\t\t170 ml milk\n\t\t90 unsalted butter (melted)\n\t\t1 tsp finely grated lemon zest\n\t\t1/2 tsp vanilla extract\nFor the filling:\n\t\tLemon curd (shop bought or home made)\nFor the meringue topping:\n\t\t125 g caster sugar\n\t\t2 egg whites\nPreheat your oven to 180 C (350 F / Gas 4)\nFor the Americans or those without scales, here is a good guide for conversion of weight to cups.\nYou'll also need bowls, spoons, and general kitchen equipment, a 12 cup cupcake tin, and an electric whisk will really save your arm when it comes to making the meringue. A piping bag will also come in handy.. To make the cupcakes, first stir together the flour, sugar and lemon zest in a large bowl. 
Make a well in the centre of the dry ingredients.\nPut the egg, egg yolk, milk and vanilla into a medium bowl and pour in the melted butter. Mix together and then pour the liquid into the flour well.\nFold the wet ingredients through the dry ingredients until just combined.. Spoon the batter into the cupcake cases. Put the tin in the oven (180 C / 350 F / Gas 4) and bake for around fifteen minutes, until risen and golden.\nRemove the cakes from the oven and allow to cool for ten minutes. Leave the oven on for the meringue topping later.. Once the cakes have cooled a little, you need to hollow out the middle to create a space to put the lemon curd. If you have an apple corer this will be a breeze, or you can do as I did and just cut a small cone out of the centre with a small, sharp knife. (The leftover cake middles are the bakers perks)\nFill the hole with lemon curd. A piping bag will make this easier, or you can carefully spoon it in.. The final stage is to make the meringue topping.\u00a0\nPut the two egg whites in a clean, dry bowl (any grease or dirt will prevent them from whipping up), and whisk them until they form a firm peak. This means that when you remove the whisk the mixture will cling to it in fluffy clouds, and peaks will stand up from the bowl like small snowy mountains.\nAt this stage you can start adding the sugar. Do this gradually, in about four lots, making sure to beat well in between to incorporate the sugar properly. When the last of the sugar has been added keep beating until you get stiff, glossy peaks. These should stand firm, and the mixture will have a silky shine.\n(A classic test of your meringue is to hold the bowl upside down over your head and if ready it will not fall out. This requires extreme confidence in your meringue, or you'll be washing sticky sugar out of your hair for the next half hour).\nNow you need to put the meringue topping on your cupcakes.. 
Once the meringue is ready, pipe or spread a little on top of each of the cupcakes, making sure to cover all of the lemon curd.\nSprinkle the tops of the meringue with a litte caster sugar, return the cakes to the oven and bake for about five minutes, until the meringues are slightly golden brown.\nLeave the cakes to cool for around ten minutes (red hot sugar is not kind to the tongue), and then devour!\nRead the question below and select from the following choices.\nA: (perfect) Lemon Meringue Pie\nB: Continued Crust\nC: Making of the Crust\nD: Ingredients & Equipment", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_109_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_109_17.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Make Coffee Using the French Press\nB: Making the Meringue\nC: Macaronage\nD: Clean the Carafe", "question": "Choose the best title for the 
@placeholder to correctly complete the recipe.['Dispose of Grounds', '@placeholder', 'Clean the Filter', 'Reassemble Your Press']", "context": "Here is the context of these images:\n. 1.A - Remove the lid and plunger (and don't worry about grounds on the side of the carafe)1.B - Fill about half full with water 1.C - Dump contents into a wire mesh strainer (over the sink). The water will run clear and the grounds will be kept out of the sink. Again, don't worry about a few grounds on the side of the carafe. If there is still a lot of grounds, repeat 1.B and 1.C.1.D Dump strained grounds into the compost or trash. Tap the strainer if needed to dislodge any remaining grounds. . 2.A - Rinse the filter from above to knock off most of the grounds2.B - Add warm soapy water to fill half the carafe2.C - Use the plunger up and down a few times to clean the carafe. This helps get all of the oils out of carafe. 2.D - Rinse clear. 3.A - Remove the filter from the plunger - usually it just screws on near the base. 3.B - Now separate the filter parts and wash them wish soapy water. 3.C - Reassemble. 
Go grind some beans, because you have a clean press ready to make a fresh pot of coffee!\nRead the question below and select from the following choices.\nA: Make Coffee Using the French Press\nB: Making the Meringue\nC: Macaronage\nD: Clean the Carafe", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_110_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_110_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_110_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_110_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_110_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_110_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_110_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_110_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_110_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_110_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_110_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_110_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_110_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_110_13.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Adding the Basket\nB: Spiced Up Jalapeno Poppers!\nC: In/out Cupcake Chess Set\nD: Ingredients", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Connect the Drive Unit.', '@placeholder', 'Strainer & Cutter', 'The Lid & Juice']", "context": "Here is the context of these images:\n. Remove the accessories port cover on the kitchen-aid. Take the juicer base and turn it until the drive shaft slides into the slot. Then turn the drive base until it lines up with the accessory port notch. Tighten the lock knob to hold it in place. . Slide the basket down onto the base assembly. 
You should feel some resistance as you do this from the the gasket sealing the central shaft and the basket juice holder. Add the basket wiper into the basket. It should simply insert and remain loosely in place. . Choose the basket that you want to use based on the type of juice you want. The hole size on the strainer denotes how thick the finished juice will be. The smaller the holes the thinner the juice. Larger hole strainers are recommended for making sauces and soups. Place the strainer in the center and slide down in place on the drive shaft. Twist the basket so that the arrows line up on the basket and the strainer. Insert the cutter insert on top of the drive shaft and push down into place. The outside of the cutter should line up even with the basket. . Add the lid to the top of the basket. Line the unlock area up with the left top of the basket until it fits over top the basket. Once lined up twist the basket right until it slides completely into the locked position. This takes a considerable amount of force to get in place because the lid pushes down on a safety switch that allows the whole system to rotate. If the unit does not spin when the mixer is on, then the lid is not completely locked in place. Place the juice pitcher under the juice spout, and the pulp container under the pulp shoot. Turn the mixer on to setting 10, and add what you want to juice. 
You are Done!\nRead the question below and select from the following choices.\nA: Adding the Basket\nB: Spiced Up Jalapeno Poppers!\nC: In/out Cupcake Chess Set\nD: Ingredients", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_111_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_111_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_111_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_111_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_111_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_111_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_111_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_111_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_111_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_111_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_111_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_111_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_111_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_111_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_111_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_111_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Add Tomatoes\nB: Curry Omurice\nC: Choping\nD: Mash It Up", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Dry Roast Masala Paste Ingredients', 'Add Chopped Onions', '@placeholder', 'Add Sauted Brinjal Slices']", "context": "Here is the context of these images:\n. -cut brinjals to long slices and keep ready-Take a pan add little oil in it.Add sliced brinjals and shallow fry until they turn little brown color. -Main flavour of this dish is with masala paste.For preparing masala paste take 1tbsp sesame seeds,2 tbsp dry dessicated coconut and few peanuts in blender. 
-Take a pan add dry roast sesame seeds,peanuts and dry dessicated coconut on low flame.. -Add above roasted ingredients in blender and blend by adding water to paste. -chop onions,tomatoes and one potato and keep ready-Take all spices as per mesurement in ingredient list. -First step is to take a wide vessel and add little oil.when is heated add cumin,mustard seeds.-when they crackle add split black gram and few curry leaves. -Next add finely chopped onions and saute for few minutes-Once onions are little cooked add one diced potato .. -Add one or 2 chopped tomatoes.choose tomatoes which are fully ripen or add mushy tomato puree.-Add tomatoes and saute until they turn mushy.. -Next step is to add blended masala paste and mix with cooked onion,tomatoes.. -Add a pinch of turmeric powder and 2tsp coriander powder and mix. -Even add tsp red chilli powder and salt as per taste.Mix spices properly with masala paste. -Finally add shallow fried brinjal slices and mix every thing- If the gravy is too thick add little water,cover and cook for some time. 
- Garnish with spring onions and serve curry hot with biryani or chapathi\nRead the question below and select from the following choices.\nA: Add Tomatoes\nB: Curry Omurice\nC: Choping\nD: Mash It Up", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_112_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_112_24.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Chicken Ballotine\nB: Heat Things Up.\nC: Prepare the Chicken\nD: Chop the Onions 
Lengthwise", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['More Chopping.', 'Relax Your Chicken!', '@placeholder', 'Recipe']", "context": "Here is the context of these images:\n. Soak the pot and saucer in cool water for at least 15 minutes prior to putting it in the oven. Prepare all your food before and during the soak, so you can quickly fill it and put it in the oven Your oven MUST be cold when you begin or the rapid change in temp may crack your clay pot. Let the pot heat up gradually with the oven.. Chop your vegetables while the pot is soaking. You can use whatever you like for this, root vegetables mixed with onions are always a nice base. This time I used leeks, bell peppers, garlic and red onions. . Chop everything up and set aside. To avoid crying while chopping onions people suggest sucking on a piece of white bread, slicing under cool water, breathing through your nose... none of these work however, because it is a gas that is in the air coming in contact with your eyes, not the fine mist of onion juice going in your mouth or nose. Try wearing a gas mask, which can be purchased from most army supply stores.. Massage your chicken until it is very relaxed. Coat the chicken liberally with seasalt, fresh black pepper and your favorite poultry spice rub (my sister swears by Paul Prudhommes Poulty Seasoning). Pour all your veggies in the saucer, plop your bird on top of that cozy nest... . Cover the whole party with the flowerpot, and pop it in the COLD oven. Close the door and put the temp at 325f. degrees for 1 hour. You can also drop a remote thermometer sensor down through the hole and into the thickest part of your meat, whatever it may be, and set it to go off when it is about 10 degrees lower than your target temp. I put a pizza tray or cookie sheet under it to catch juices. You will likely have to remove all your oven racks to fit it in.. 
Grate a mountain (about 1 loose cup) of your favorite hard cheese (asiago, romano, parmesan etc.) using a microplane if you have one, a fine shredder of any sort will do. A decent food processor will save you some time here. \nWhen the hour is up (or your temp alarm goes off) open the oven, pull the pot out far enough to remove the top (using heavy duty burn protection, not just a kitchen towel, please). Sprinkle the cheese over the bird and cook uncovered for ten minutes more. . You should end up with something like this at the end. Refrain from tearing into it immediately. Let it cool for about ten minutes so the juices don't squirt out. Your rice should be done just when it is time to cut the chicken.. Mmmmm! Scrumptious Delights! Be sure to drizzle some of that gravy onto your rice as well.. Add a side of asparagus, an artichoke or some other favorite green vegetable and you have yourself a simple, succulent feast! Ala cuisine!. For those who work better with a detailed recipe, this is my recipe from the first time I used this flower pot at my father's house when I dug it out of the pile of dirt behind the shed. It varies from the instructable only in the extra vegetables used as a \"nest\" but the process and seasoning is very close.Rupa's Flowerpot Chicken Geyserville1 chicken, approx 4 lbs. 2 lbs red or white new potatoes1 lb plum or roma tomatoes2 med. onions (approximately 2 cups chopped)2 med. 
green bell peppers3-7 cloves of garlic (depending upon your taste)1 tsp marjoram and/or thyme1 tsp salt1 tsp fresh ground black pepper (double if using preground)1/4 tsp cayenne pepper1/8 tsp nutmeg2 sprigs fresh rosemary3/4 cup red wine1/4 cup Parmesan, asiago or other hard cheese, gratedInstructions:In advance if possible set your chicken in an cold brine to soak--at least 30 minutes per pound, but not more than 8 hours total.1 quart cool water1/2 cup kosher Salt1/2 cup sugar12 peppercorns6 allspice berries3 whole cloves1 sprig fresh rosemaryMix the salt and sugar in the water, add the whole spices and submurge the chicken in the pot, cover and place in refrigerator's bottom shelf. Be careful not to let the water drip on anything! Place the pot in a shallow dish such as a pie plate lined with a few layers of paper towels to be extra safe.Make a rub mixture of all dry spices (marjoram and or thyme, salt, black pepper, cayenne pepper and nutmeg).Lay onions, tomatoes, garlic and green peppers into pre-soaked flower pot and pour 1/4 cup red wine over the veg mix.Remove chicken from brine and dry with paper towels from under the pot, then lightly coat with olive oil and rub chicken liberally with spice mix, being sure to coat all over, inside and out!Place chicken on the bed of vegies in the saucer, add a sprig of fresh rosemary and cover with the flower pot. Place the whole thing into cold oven, turn temperature to 350* and bake for one hour without interuption.At one hour open and remove top. Baste liberally with juices from under chicken. 
Turn oven temp up to 500*Sprinkle with fresh grated parmesan or other hard cheese of your choice and cook for another ten minutes uncovered.Serve over brown rice.\nRead the question below and select from the following choices.\nA: Chicken Ballotine\nB: Heat Things Up.\nC: Prepare the Chicken\nD: Chop the Onions Lengthwise", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_113_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_113_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_113_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_113_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_113_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_113_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_113_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_113_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_113_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Icing\nB: Dragon Bowl\nC: Modular Cheeseball\nD: Eat and Enjoy!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Supplies', 'Assembly', '@placeholder', 'Wings & Fire']", "context": "Here is the context of these images:\n. You will need-\n2 9-inch cakes\n6 cups of frosting, your choice of colour (more or less, depending on how thick you like your icing. \n8 Keebler chocolate covered graham crackers\n8 Sunkist fruit gems\n2 Sunkist fruit slices \n2 Blue Fruit Roll-Ups\n1 Red Fruit Roll-Up\n1 Yellow Fruit Roll-Up\n1 Marshmallow\n2 Chocolate Chips\n2 Hershey's Kisses\n2 Wooden Skewers\nYou will want all of your fruit gems, slices, and icing to be all colour co-ordinated-I chose orange candies and yellow icing, but you can do what ever colours will match the plates or napkins you will have at the party.\n. 
Take a serrated bread knife and cut the middle part out of all the rectangular crackers so that you have 2 triangular pieces out of each one. Next, slice all of the Fruit Gems in half.. Now, let's start assembling!Take one of your 9 inch round cakes and slice it in half. Take your frosting and spread some on one half, then place the other half on top, and place the whole thing on a cardboard cake board or aluminum foil covered cardboard.Next, you will cut up the remaining cake. The diagram below shows exactly how to do it. Don't worry if it isn't exact-you can always make a smallish piece bigger looking with icing. ;-). Next, assemble the cake pieces according to the picture below, trimming any pieces if necessary.\nNow, you will frost Mr. Dragon. I find that using a flat icing tip in your full icing bag works wonders on those difficult, moist and crummy parts, and once you've covered up the crumbs, you can use your spatula to smooth things over. This method keeps those crumbs from showing up in your icing.. Now, you will decorate the Dragon!\nFirst, to make the Dragon look like he is scaly, use a child's marker cap to imprint the design onto the creamy frosting. Arrange the Graham crackers along his back and tail; place Fruit Gems and Slices on his toes and head. Press the Hershey's Kisses pointy side in onto the end of his snout, and cut the marshmallow in half and place the chocolate chips on top.. To make his wings, you'll need a skewer and a blue fruit roll-up. Trim off a corner of the roll-up to keep them from looking too bulky, and after rolling it up the skewer, trim off the edges bat wing style. Repeat for other wing. Make sure not to stick these in until right before serving, because they are heavy and will sag over time.\nFor the fire, trim your yellow and red roll-ups into curvy, twisty pieces, and position near mouth.\nAnd, there you have it!! 
ENJOY!!\nRead the question below and select from the following choices.\nA: Icing\nB: Dragon Bowl\nC: Modular Cheeseball\nD: Eat and Enjoy!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_114_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_114_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_114_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_114_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_114_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_114_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_114_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_114_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_114_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_114_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_114_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_114_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_114_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_114_13.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: free Pizza\nB: Suggestions\nC: Toppings\nD: Sauces", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Add Base', 'Make It Meaty', 'Cheese Please!', '@placeholder']", "context": "Here is the context of these images:\n. Tip #1 Recycle leftovers into new dishes! Heat some olive oil in a skillet and add in diced onions and garlic.\u00a0 Let onions and garlic cook until soft. Take a low carb tortilla and spread a layer of pesto to coat the base. Finally add on the garlic and onions.. Next add on your leftover meat!\u00a0 I had leftover meatballs and sausage, but anything will work!\u00a0 Maybe some grilled chicken or some roasted veggies.. 
Now add on the cheese!\u00a0 I used muenster because I had no mozzarella, but it actually melted really well and I would use it again!\u00a0 Choose whichever cheese you prefer; for a Greek twist try some feta with kalamata olives, or maybe some goat cheese and figs.\u00a0 Now add on some extra marinara sauce, fresh basil leaves, garlic powder, red pepper flakes, and oregano. Put it in the oven on 350 and let it cook until edges are golden brown and cheese is bubbly!. Suggestions: Serve with a side salad and some red wine. Try out all of the different options!\u00a0 Greek, Italian, maybe even try a Mexican style pizza. Use the marinara recipe from my food blog everythingbutfish.tumblr.com\nRead the question below and select from the following choices.\nA: free Pizza\nB: Suggestions\nC: Toppings\nD: Sauces", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_115_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_115_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_115_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_115_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Cut the Metal Strips\nB: Make Your Own Colored Decorating Sugars!\nC: Hide\nD: Make Your Own Kahlua!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Materials', '@placeholder', 'Bend', 'Finis']", "context": "Here is the context of these images:\n. sheet metal (hobby store)\ntin snips\npliers for bending metal\nfile for smoothing edges\nnuts and bolts\ndrill\nclamps\nsafety glasses. cut metal strips about 3/4\" thick or greater.\nsmooth the rough edges with a file. bend the metal into the desired shape. secure the closure with clamps.\nalign the metal edges so the interior of the curve is smoothest. 
Put the screw hole near the edge so the amount of metal overlapping in the interior is minimized.\ndrill a hole for the screw to fit\nassemble so the head of the screw is inside of the form and the remainder of the screw extends outward. Secure with a bolt.\n. tada! your own custom cookie shapes. pretty cool\nRead the question below and select from the following choices.\nA: Cut the Metal Strips\nB: Make Your Own Colored Decorating Sugars!\nC: Hide\nD: Make Your Own Kahlua!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_116_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_116_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_116_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_116_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_116_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_116_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_116_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_116_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_116_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_116_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_116_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_116_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_116_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_116_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_116_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_116_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Fresh Veggie Tart\nB: Gather Your Ingredients and Supplies...\nC: Make the Tart Crust...\nD: Arrange the Strawberries & Blueberries", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Make the Pastry Cream', 'Place Strawberries & Fill W/ Pastry Cream', '@placeholder', 'Enjoy!']", "context": "Here is the context of these 
images:\n. Here's a list of the ingredients that you'll need: For the fruit topping: strawberries blueberries 1 nectarine or 2 plums (I have made it with both) For the pastry cream: 2 cups half-and-half 1/2 cup sugar A pinch of salt 5 large egg yolks 3 tbsp. cornstarch 4 tbsp. cold unsalted butter, cut into 4 pieces 1 1/2 tsp. vanilla extract (we modeled our recipe after this one) 1 pie crust We didn't have very much time to make this so we ended up purchasing a graham cracker crust, like one you would use for a cheesecake.. To make the pastry cream, combine egg yolks and 2 tablespoons of sugar in a medium bowl and whisk until the sugar has begun to dissolve and the mixture is creamy, about 15 seconds. \u00a0Then, whisk in the cornstarch until the mixture is pale yellow and thick, about 30 seconds. On the stove, heat the half-and-half, 6 tablespoons of sugar and salt in a saucepan over medium-high heat until simmering, stirring occasionally to dissolve the sugar.\u00a0When the half-and-half mixture has reached a simmer, slowly add it to the egg yolk mixture, whisking constantly.\u00a0 Return the mixture to a simmer over medium heat, whisking constantly, until a few bubbles burst on the surface and the mixture is thickened and glossy, about 30 seconds.\u00a0 Off the heat, whisk in the butter and vanilla. \u00a0 Strain the pastry cream through a\u00a0 fine mesh sieve. You can use a spatula or a spoon to push the pastry cream through the mesh. After you've finished, place plastic wrap directly on the surface of the pastry cream. This will help to prevent a skin from forming. Refrigerate the pastry cream until it's cold and set. We didn't have much time so we took ours out after only about 2 hours, other recipes recommend at least 3 hours and up to 2 days.. Slice the plum into thin slivers and cut up the strawberries into flat pieces. You can arrange the plum slices in a ring to get an idea for how it'll look once it's assembled on the tart.. 
Line the bottom of the crust with the sliced strawberries. Most people won't expect to find fruit at the bottom so it'll be a delicious surprise! Then, evenly fill the crust with pastry cream. Smooth out the surface so that it's relatively flat.. This is the most fun part! creating the fruit topping. First, start by arranging the slivers of nectarine around the border of the tart. Leave a little space between each one. Try as best as you can to keep a circular hole in the center of the tart that isn't covered in nectarine. You can see that we didn't do the best job at this. No worries if it isn't looking quite right. Later on it'll be covered in strawberries and blueberries.. After you're happy with how the nectarines look, arrange the strawberries in a floral shape in the center of the tart. (unfortunately I was so focused on this step that I forgot to take pictures!) When arranging the strawberries, use the larger pieces for the outer ring and work your way inwards, placing one on top of another. Layer them in an alternating pattern. Then, where there's just a little space in the center add the blueberries until there's a small pile of them.. Step back and admire your work! 
It's delicious too.\nRead the question below and select from the following choices.\nA: Fresh Veggie Tart\nB: Gather Your Ingredients and Supplies...\nC: Make the Tart Crust...\nD: Arrange the Strawberries & Blueberries", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_117_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_117_18.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Prepare the Filling\nB: Gol Guppas\nC: Set Up the Workspace\nD: Melt the Dark Cocoa Candy Melts", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Making a Fractal', 'Making Cheese Dough', '@placeholder', 'Fill, Arrange, Bake!']", "context": "Here is the context of these images:\n. 
The crust design is an Apollonian Gasket.\u00a0Wikipedia has a great article all about Apollonian Gaskets\nhttp://en.wikipedia.org/wiki/Apollonian_gasket\nThe essential things to know are:\n- An Apollonian Gasket is a space-filling fractal. In theory you could make many more circles to continue to fill the top of the pie, but in practice I found that 16 provided a nice design and kept circles at a reasonable size to work with.\n- Curvature is how \"sharp\" a curve is, and is inversely proportional to the radius of curvature . A straight line would have an infinite radius of curvature, the edge of a large circle would curve slowly (low curvature, large radius), while a small circle would curve very quickly (high curvature, small radius). The designs shown on Wikipedia list the relative curvatures of the circles within the fractal (with the first, negative, number listed being the radius of the largest \"frame circle\"), so we need to do a little number-crunching to figure out the actual radii we want to use.\nI decided to use the {-12, 25, 25, 28, 48} pattern that is shown on Wikipedia (I prefer the almost-D3 symmetry). To calculate the radii of the circles you will use, you need to take the radius of your pie dish (my 9.5 in dish has a 4.25 in radius), and multiply that radius by the first number in the pattern (in my case 12, the negative number) then divide by the curvature in question.\nThe pattern I chose to follow\nhttp://en.wikipedia.org/wiki/File:ApollonianGasket-12_25_25_28-Labels.png\nFor example, to find the radius of the \"25\" circle in the pattern, I used my spreadsheet to multiply 4.25 in * 12 / 25 = 2.28 in, or approximately 2 + 4/16 inches. 
If you are using a different sized pie dish, you can take each number below, multiply it by the diameter of your pie dish, and divide by 9.5\nThe radii, in inches, for the sixteen circles I used are as follows\nCurvature \u00a0 Radius (decimal) \u00a0 \u00a0Radius (fraction)\n25 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a02.280 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 2 \u00a04/16\n25 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a02.280 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 2 \u00a04/16\n28 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a02.036 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 2 \u00a01/16\n48 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a01.188 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 1 \u00a03/16\n57 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a01.000 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 1 \u00a00/16\n57 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a01.000 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 1 \u00a00/16\n97 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.588 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a09/16\n97 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.588 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a09/16\n112 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.509 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a08/16\n112 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.509 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a08/16\n121 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.471 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 
\u00a0 \u00a0 \u00a0 \u00a0 0 \u00a08/16\n121 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.471 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a08/16\n168 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.339 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a05/16\n208 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.274 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a04/16\n232 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.246 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a04/16\n232 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a00.246 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 \u00a0 0 \u00a04/16\nOnce you calculate the radii of the circles you intend to cut out, use your trusty compass to carefully measure off each radius on the ruler, then draw the circles on a large sheet of parchment paper. Do NOT cut them out yet. I also found it helpful to label each circle by its curvature, for reference later.... You will need enough cheese dough to line the pie dish as well as cover the pie, so if you feel like buying pre-made dough, just get enough for two 9.5 inch dishes.\nTo make your own dough:\nIn a large bowl, mix the flour, sugar, salt, and cheese.\nDice the frozen butter into small cubes, then pulse in a food processor with the dry ingredients until just mixed. Alternatively, cut the butter into the flour mix by hand. The dough should still be powdery and there should be small balls of butter throughout.\u00a0\nAdd 6 tablespoons of ice water to the mix to moisten. If the dough still seems too dry, add 2-3 more tablespoons until you can form the dough into a thin disk.\nWrap the disk in plastic and refrigerate for 30 minutes.. Once your dough has chilled, separate it into two even portions. One for the dish, one for the design. 
Make sure to flour your cutting board for easy removal of the dough once it is rolled.\nHeat your oven to 400 degrees Fahrenheit.\nFor the dish, roll the dough out until it is a large circle 1/4 inch thick, then press the dough into a 9.5 inch pie dish. Dock the dough with a fork.\nFor the crust design, roll the dough out to 1/4 inch thick, and lightly press the parchment paper with the circles on to the rolled dough. With the tip of a sharp knife, carefully cut out each circle, and leave the marked paper pressed on to each circle of dough for easy labeling and transport. Place the circles, parchment side down, on a large cookie sheet.\nIf you have left over dough, cut it into fun shapes and cook it along side the circles. Cheese dough is quite tasty on its own.\nFreeze the dish and the circles for 10 minutes to set the butter. Then cover the dish with foil or parchment, and add pie weights or beans to help the dish hold its shape. Cover the circles with one large sheet of foil as well.\nBake both the dish and sheet of circles for 10 minutes. Remove the weights and bake both for 10 more minutes until the crust is lightly brown.\nLet dough chill while you prepare the filling.. Prepare your filling in a heavy skillet over medium heat.\nIf you are using bacon, start by cooking the bacon first, until crisp. Then add the onions and apples and saute them with the bacon until they soften.\nIf you are using prosciutto, saute the onions and apples first in good olive oil (a couple of tablespoons), then add the prosciutto at the end for just a few minutes to add to the mix.\nIn a bowl, whisk the milk and eggs with the spices, while the pan filling cools a bit. A little spice goes a long way!\nGrate another 3/4 cup cheese to go on the filling.. 
Add the sauted filling to the pie dish first.\nNext, top the filling with the grated cheese.\nNext, pour the egg and milk mixture over everything.\nLastly, carefully arrange the dough circles according to the Apollonian Gasket pattern you are following. The circles should absolutely touch each other, and may still overlap the edge of the pie a little.\u00a0\nBake at 400F for 30-35 minutes or until the filling is golden.\nAllow to cool for 10 minutes before serving. It tastes great warm or chilled from the refrigerator and keeps for a few days.\nEnjoy!\nRead the question below and select from the following choices.\nA: Prepare the Filling\nB: Gol Guppas\nC: Set Up the Workspace\nD: Melt the Dark Cocoa Candy Melts", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_118_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_19.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_118_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_118_23.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Fig Newt Gingriches\nB: Mitt Hominy (a.k.a. Grit Romney)\nC: Making the Frangipane\nD: Prawn Pauls", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Making the Frangipane', 'Chilling the Layers', 'Trimming']", "context": "Here is the context of these images:\n. Frangipane (Almond Cake)One of the secrets to good baking is weighing your ingredients. Weighing rather than measuring ingredients by volume gives more consistent results. If you don't have a food scale, you can pick them up fairly cheap online. We had gotten ours originally for weighing coffee, but it's been great for baking as well. I did try to put an estimate of the quantity in parenthesis because it definitely makes it easier when you're shopping.Also, you'll see a couple of ingredients listed twice. This is because you will use them at different times, so it's just easier to weigh them separately. Note: All your cake ingredients should be at room temperature13 ounce Almond Paste (Almost two boxes) 4 ounce Sugar 1 ounce Egg (You'll use around 9-10 large eggs total for this recipe.) 9 ounce Sugar 6.5 ounce Unsalted Butter (Around a stick and a quarter) 6.5 ounce Shortening (About a cup) 12 ounce Eggs 1/4 ounce Vanilla extract (About 1/8 a cup) 6 ounce Cake Flour (Sifted) (Around a cup)Bottom CoveringMarzipan Dough - Chilled (We used a 7 oz. box, but you could always make your own.)FillingA jar of your favorite preserves (You could also make your own fruit filling. Just make sure it's isn't too chunky. 
We thinned ours with a little warm water, just to make it more spreadable.) Ganache Coating / Filling16 oz Dark Chocolate (Four big Ghirardelli-sized bars of chocolate) 16 oz Heavy Cream (One pint)DecorationYour favorite buttercream icing, royal icing, fondant, etc. You could also use dragees (the little hard shelled silver candies) or sprinklesNote: You'll need 8 to 10 pounds of weights (for compressing the cake pans later).. Pre-heat your oven to 375\u00b0F. If you're like us, you have half-sheet baking pans (13\" x 18\"). This recipe fills a full-sheet pan or two half-sheets. If you're using half-sheets, you'll also need a third one (you don't have to flour / grease it) for transferring. If you're using full-sheets you'll need a second one. You'll want to use a sheet with a lip.Take a baking sheet and grease it with shortening or butter. Lay a sheet of parchment on top of the greased baking sheet and smooth it down. The first bit of grease will ensure the parchment sticks nice and flat to the pan. Now, grease the parchment so the cake won't stick to it. Sift a little flour into the pan and shake it around making sure it's well-coated before pouring off the excess.This seems like a lot of work, but when your cake practically slides off the pan onto your table it will all pay off.. Blend together all the almond paste (13 ounces) and sugar (4 ounces). Break up the almond paste and get the whole thing looking like sand. We used the paddle attachment on the mixer, but a food processor would have worked too. If you used a food processor, you will definitely want to transfer it to a mixer after you've combined the paste and sugar. Your end goal is to create a smooth lump-free mix. Lightly whisk the egg (1 ounce) and then slowly add it to the almond paste / sugar mix and blend until smooth. While you want a smooth mix, you don't want to over mix. Add the sugar (9 ounces), butter (6.5 ounces), and shortening (6.5 ounces) and mix together until light and fluffy. 
Slowly add the rest of the eggs (12 ounces) and vanilla (1/4 ounce). Slowly add in the sifted cake flour (6 ounces) and blend until smooth and creamy. Like with the almond mixture, you don't want to over mix.. Spread the batter into the lined baking sheet. The smoother you make the top the smoother your final cake will be. A cake spatula is the perfect tool for this.Bake at 375\u00b0F for 10 to 12 minutes. You want the cake to be firm but don't let the edges get dry. Dry = crumbly = wasted cake!. Ganache is super simple to make. You'll make it twice for this recipe. Once as a filling and once as a coating. If you make the ganache before you cook the frangipane, it will have enough time to cool to a spreadable consistency. For coating the cakes, you'll want to be ready to dip.For the filling, we used two bars and 8 ounces of the heavy cream. Break up your chocolate (you can use a food processor, just don't melt it). The finer the chocolate is chopped the easier it will melt. Using a double boiler, bring the cream to a barely a boil. We use medium-high heat on my stove. Pour the cream over the chocolate. Let stand for ten minutes. Gently stir until the chocolate and cream are smoothly mixed. You don't want to overwork it. The results should be smooth and glossy.For the filling, let it cool until it's spreadable.. Lay parchment on the back of your clean pan and lay it on top of the finished cake. Flip it over to transfer the cake from the pan to the back of the sheet. If you're using a half-sheet, you're going to want to cut the cake into thirds. (You need three equal-sized pieces of cake to make the \"sandwich.\") Don't worry about being pretty during this step, we'll trim the edges later on.Lay one sheet of cake on the back of a parchment-lined baking pan, and spread a 1/8\" thin layer of your jam (we thinned the jam with a little warm water). Top with the second sheet of cake and add your chocolate. 
Top with the third and final layer of cake, but this time spread a very thin layer of jam or chocolate (your choice) on top.. Roll out a 1/16\" thick sheet of marzipan about the same size as the cake. Roll it loosely around the rolling pin and unroll it on top of the cake. Run the rolling pin over the top (carefully, you don't want to pick the marzipan back up). This will become the bottom of the cake. The marzipan keeps the cake moist and it gives it a smooth bottom.. Put a piece of parchment on top of the marzipan and put your second baking pan on top.Use that pan to flip the entire cake upside down. Remove the original pan, and wrap the entire cake and bottom pan with plastic wrap. Now place the empty baking pan on top. Put weights on top of this pan. A couple of hand weights (no more than 10 pounds total) will work. (We used a big bowl of left over chili.) This squishes the cake layers together and makes sure everything gets sealed down. Put the whole thing in the refrigerator and chill the cake overnight.. Now comes the fun part. Trim the cake so you have clean, smooth edges. (The trimmings are also delicious. Especially with a bowl of ice cream.) In the photos, we're using a pizza cutter. However, a serrated knife would have been a better choice for less crumbs and a smoother profile. Cut the cake into 1\" x 1\" pieces.. Follow the same recipe for ganache as before, but don't let it cool down. Traditionally, the cakes sit on a wire mesh rack and the ganache gets poured over them. Our wire racks were a little too wide for the cakes, so we skewered the cakes and twirled them in the ganache.We also took a few shortcuts when it came to the decorating. You can do the same, or you can lose yourself in the world of decorating. (Here\u2019s a collection of my favorite cake decorating instructables.) In the end, I chose to go with simple off-the-shelf cans of icing because I didn\u2019t have a lot of time. 
My decorations were very minimal, but I really think you do need some kind of decoration to make the cakes really pop.\nRead the question below and select from the following choices.\nA: Fig Newt Gingriches\nB: Mitt Hominy (a.k.a. Grit Romney)\nC: Making the Frangipane\nD: Prawn Pauls", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_119_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_26.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_119_27.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_29.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_30.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_31.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_32.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_33.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_34.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_35.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_36.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_37.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_119_38.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Blue Cheese Palmiers\nB: Deep Fry\nC: Making the Patties\nD: Roll Them Up", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['What You Will Need', '@placeholder', 'Cooking the Patties', 'Cleanup']", "context": "Here is the context of these images:\n. 1 1/2 lbs ground beef (leanness is your preference. Less lean will be juicier and more lean healthier. I used 85% lean)2oz. blue cheese1/4 cup chives (chopped)1/8 teaspoon hot sauce1/2 teaspoon Worcestershire sauce1/2 teaspoon pepper3/4 teaspoon salt1/2 teaspoon dry mustardBuns (something hearty, like pretzel). Large plastic bowlMisc. measuring spoons and cups (standard)Cutting boardKnife (for chopping chives)Plastic wrap. WARNING: Raw beef may contain bacteria and cause food poisoning. Wash your hands immediately after handlingthe beef1. Using the cutting board and knife chop \u00bc cup worth of fresh chives2. Combine all ingredients into the bowl3. Mix thoroughly (hands are best used to mix evenly)4. Leave in the bowl and cover in plastic wrap and let set in the refrigerator for around 2 hours5. 
After the mixture has set, divide into individual patties (makes 4-6 patties.). Grill the patties until cooked to your liking (well done, medium, etc.). If you do not have a grill, broil them in a large Pyrex dish and flip them after 5-10 minutes (I used this method personally, works great)Caution: Keep patties on a lower rack in the oven if broiling. This will keep them from cooking too quickly on the outside and leaving the inside under cooked. Serve on the buns and enjoy. Thoroughly wash all utensils and surfaces, particularly those that came into contact with the beef. Use either disinfectant wipes or soap and hot water to clean any drops or spills.\nRead the question below and select from the following choices.\nA: Blue Cheese Palmiers\nB: Deep Fry\nC: Making the Patties\nD: Roll Them Up", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_120_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_120_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Second Layer of Chips\nB: CARTS Nachos\nC: Ingredients\nD: Start Filling", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['First Layer of Chips', 'First Layer of Cheese', '@placeholder', 'Heating Up the Nachos']", "context": "Here is the context of these images:\n. So you take the chips and put them on the plate. Put as much as u want . Now put the first layer of cheese on top of the chips again put as much as u want remember it's only the first layer. Now it's time to put the second layer of chips so just put chips on top of the first layer . It's time to up the second layer of cheese on top of the second layer of chips and u can put as much cheese because this is ur last layer. Now heat up the nachos for 30 seconds in the microwave . 
Now you get to eat it!!!You can add different toppings if u want it's totally up to you \nRead the question below and select from the following choices.\nA: Second Layer of Chips\nB: CARTS Nachos\nC: Ingredients\nD: Start Filling", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_121_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_121_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_121_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_121_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_121_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_121_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Make Your Own Yogurt\nB: Save the Bones\nC: Heat 1 Cup of Water on Medium Heat in a Saucepan\nD: Add Brown Sugar", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Add Ingrediences to Crock Pot', 'Separate Liquid From Solids', 'Strain Liquid']", "context": "Here is the context of these images:\n. Whenever you cook a chicken, whichever way you make it, there will be bones left over. Don't throw them away! Put them in the refrigerator or freeze them. Don't let them spoil. When you are ready to use them,place the bones on a cookie sheet. Heat oven to 400 degrees F. Place the cookie sheet on the top shelf of the oven and bake for 30 minutes.. I use a 4 quart crock pot. Add vegetables such as celery, carrots, onion, garlic and parsley, if you like.. Add water to fill. Turn the heat to low. Cook for 8-10 hours. The temperature on high will cook faster. . When complete, place a colander in a large pan and empty the contents into the colander. The liquid is extremely hot at this point. Be careful not to burn yourself. Discard the depleted bones and vegetables.. Place a strainer in a funnel and pour liquid from the pan into quart jars. 
If planning to freeze, only fill to the shoulder of the jar, otherwise the jar will break when the ice expands. I keep one jar in the refrigerator and freeze the rest. Some sediment may get through the strainer. If sediment is not wanted, cover the strainer with cheese cloth to filter this out.Making your own broth creates a for better tasting soup or gravy than any commercial product I have ever tried. It is worth the effort!. There will be a lot of chicken fat on all the surfaces. Use lots of detergent or run them in the dishwasher.. \nRead the question below and select from the following choices.\nA: Make Your Own Yogurt\nB: Save the Bones\nC: Heat 1 Cup of Water on Medium Heat in a Saucepan\nD: Add Brown Sugar", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_122_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_122_19.jpg"], "output": "B", 
"qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Faux Bordeaux Candy\nB: Ruby Gem Candy\nC: Gather Your Ingredients\nD: Wrap Thinks Up...", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Hulling the Pods', 'Making the Pieces', 'Enjoy']", "context": "Here is the context of these images:\n. Here is what you will need:\n1 pound Tamarind pods\n1 to 1 1/2 cups of water\n2 1/2 cups sugar (save a 1/2 cup for later)\n1 teaspoon salt\n1 to 2 tablespoons Chile de arbol (optional and to taste) or cayenne. This step probably takes the longest. You have to remove the outer shell and inner strings to get to the sticky fruit. After you remove the shell, break them apart in smaller pieces. You can either leave the seeds or remove them. I normally just leave them as that is how this candy is normally made in Mexico.\nGive them a quick rinse with some water to remove any debris from open pods or stuck shell pieces. Make sure its a quick rinse.\u00a0. Pour half of the water over the fruit and cook it over a low to medium heat while stirring until the fruit breaks down and looks more like a paste. You can help it along by mashing the fruit as it cooks. If needed you can add water little by little to get a thick consistancy.\u00a0. Once you have a nice thick paste, add the salt and start adding the sugar. Add 1 cup of sugar at a time until it is completely incorporated. You can add the Chile de Arbol at this time as well if you choose to do so. Turn up the heat a little, stirring constantly until it boils.. Make sure you use something that will fit in your Fridge.\nUsing a cookie sheet and some wax paper I dropped a little more than a tablespoon for each. I then placed them in the fridge till they cooled and somewhat hardened. About 1 to 2 hours.. After they cooled, I used the rest of the sugar and rolled each piece. 
I find it useful to use one hand to grab the candy and the other to roll it in the sugar.. I made some with the Chile de Arbol and some without. The sweet, sour and spicy together is great.\nRead the question below and select from the following choices.\nA: Faux Bordeaux Candy\nB: Ruby Gem Candy\nC: Gather Your Ingredients\nD: Wrap Thinks Up...", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_123_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_123_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_123_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_123_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_123_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_123_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_123_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_123_7.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Scary (ok, Cute) Spider Pumpkin Cupcakes\nB: Pumpkin Top Cupcakes\nC: OREO PUMPKIN CUPCAKES!\nD: Supplies", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Supplies', 'Dunking Oreos', 'Putting Together Your Pumpkin Patch', '@placeholder']", "context": "Here is the context of these images:\n. Edible Supplies:\t\tCupcake Mix (any flavor)\t\tFrosting (any flavor, but you'll need it to be green)Mini Oreos (I bought 2 $1 packs from Target and that was plenty)\t\tNormal Sized Oreos Brown Candy Melts (you'll need very few) Green Candy Melts (you'll need very few, way less than pictured) Orange Candy Melts (I used a whole bag to dunk all of the mini oreos shown and three big oreos) Brown SprinklesSupplies You Shouldn't Eat:\t\tCupcake Pan\t\tCupcake Liners\t\tContainer to melt melts\t\tCling Wrap\t\tZip Lock Bag\t\tWilton Piping Tips, I used the #2 round and #5 round\t\tWax Paper. You'll need to make your cupcakes. 
\u00a0There is nothing special there, just make them and give them time to cool.\nTo get ready to dunk your Oreos, get a piece of wax paper out. \u00a0You will need a pretty big piece if you do as many as I did. \u00a0Then, in a container,\u00a0carefully\u00a0melt your candy melts. \u00a0Gather your Oreos and you are ready to go!\nMy initial idea for dunking these Oreos, was to stick toothpicks in the cream and then dunk them like that so I could easily remove the toothpick after they were dunked. \u00a0This does not work. \u00a0Because the candy melts are so dense, they cause the toothpick to act as a lever and instead of dunking it in and taking it out all nicely, it pries your Oreo apart and causes a mess.\nSo, to dunk my oreos, I just threw them in the candy melts and used a fork to get them out. \u00a0Sometimes they had too much coating and when they did I would lightly press them against the side of the bowl to get off some of the excess. \u00a0Once they were dunked, I carefully set them on the wax paper. \u00a0I put them so they were standing on their side if I could.. Use the same method for melting candy melts and getting them ready with the wilton tips as I did with the Skeleton Cupcakes\u00a0(Step 3). \u00a0You will not need many candy melts of green or brown at all. \u00a0You are only doing small details and it goes a long way. \u00a0I had extra after I did everything and so I drew out chocolate bats and did green vines, which I did use later.Stems:\nHeat up your chocolate candy melts first. \u00a0Prepare a ziplock bag and you will be using a #4 round tip. \u00a0I show in the pictures above how I did the stems. \u00a0It's fairly simple. \u00a0All I really tried to make sure I did was got a nice thick stem that sort of stuck up. \u00a0Their stems aren't always that long, so you just need a little stubby one on top.Vines:\nHeat up your green candy melts for your stems. \u00a0I used 12 pieces and it was\u00a0definitely\u00a0enough. 
\u00a0Now just draw some vines on your pumpkins. \u00a0I did a couple leaves using the same method as the stems, except, in stead of pulling up and away from the pumpkin, I kinda of went along the pumpkin. \u00a0You can see a little leaf in Photo 5. \u00a0With your extra green, draw some vines on your wax paper. \u00a0I put these on some of the cupcakes later, just for a little extra something, something.\n*Tip: Since you don't really get the zip lock dirty because the candy melts are wrapped in cling wrap, you can use both corners of the bag. \u00a0Then you only need one bag to do the stems and the vines.\n**Another Tip: Make sure when you put the melts in the cling wrap, that you really twist the ends and get the candy melts all grouped in the middle. \u00a0Otherwise they will spread out in the cling wrap as they melt and as you smush them.. Now all you need to do is frost up your cupcakes. \u00a0Throw on some sprinkles and put on a pumpkin or too. \u00a0Do not press the pumpkin in like you did with the bones in the Skeleton Cupcakes. \u00a0This won't push them in the cupcake because the pumpkins are too fat. \u00a0This will just make a mess of the frosting. \u00a0Just set them on top. \u00a0They should stay fairly well. \u00a0The more frosting you use the better,\u00a0because\u00a0while they won't push into the cupcake, you can bury them in the frosting. \u00a0I put some more sprinkles around the base of the pumpkin once it was on the cupcake.\nFor the Great Pumpkin, you are going to need to cut a slice out of the pumpkin. \u00a0See photos 8 - 10. \u00a0Once you cut out the slice and frost it, make sure you remember where it is because it is hard to tell once the cupcake is frosted :)\nNow you can put your pumpkins on all of your cupcakes and throw some vines in as well. \u00a0I tried to make it look like the vines were coming from under the pumpkins (though, I know the vines would be around the stems).. 
I always take so many pictures of my\u00a0finished\u00a0projects to get just the right one. \u00a0So I am sharing a bunch with you here :)\nRead the question below and select from the following choices.\nA: Scary (ok, Cute) Spider Pumpkin Cupcakes\nB: Pumpkin Top Cupcakes\nC: OREO PUMPKIN CUPCAKES!\nD: Supplies", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_124_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_124_21.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: cream\nB: How to Make Papaya Ice Cream\nC: How to Turn Royal Icing Into Homemade Sprinkles\nD: Cooking", "question": "Choose the best title for the @placeholder 
to correctly complete the recipe.['Ingredients', 'Mixture', 'Packaging', '@placeholder']", "context": "Here is the context of these images:\n. Milk - 200 mlHalf&Half - 200 mlSugar - 5 tbspZip Lock Bag - 2 unitsIce and Salt. We'll put 200 ml of milk and 200 ml of Half & Half. Then we'll put 5 tablespoon of sugar and mix it. You can mixed it in a bag. . We take zip lock bag and put it mixture in zip lock bag. Mixture you don't have much air in your zip lock bag.. Then we'll put ice in other bag and put a salt. Next we'll close bag and shaking. Then you want a gloves if you don't want freeze your hands. Next we'll put mixture in bag with ice about 5-10 minutes. Then you want mixing and shaking around.In 6 minutes the ice cream froze.. We put it in a bowl. If you want make more ice cream you just double everything. It's very delicious homemade ice cream. Quick and simple recipe.Thank you for watching! =)\nRead the question below and select from the following choices.\nA: cream\nB: How to Make Papaya Ice Cream\nC: How to Turn Royal Icing Into Homemade Sprinkles\nD: Cooking", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_125_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_125_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_125_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_125_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_125_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_125_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_125_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_125_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_125_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_125_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_125_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_125_11.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: 
Chocolate Mousse\nB: Chocolate Mehndi Mousse Cakes\nC: Time for the White Chocolate\nD: Add the Whipped Topping", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Prepare the Wantons', '@placeholder', 'Filling the Wantons']", "context": "Here is the context of these images:\n. You will need:1 Package of Wantons 1 Tablespoon Butter1/4 Cup Sugar 1 Tablespoon Cocoa 2 Tablespoons Powdered Sugar1/2 Cup Heavy Whipping Cream. Start by melting the butter in the microwave. Lay a wanton on a plate or clean counter. Brush on the melted butter and then sprinkle a pinch of sugar evenly over it. Flip it over and do the same on that side. Repeat with 23 more wantons.. Center a wanton over a hole in a mini muffin tin. Gently press the center down to the bottom of the tin. The sides of the wanton should start to fold toward the center. Pinch them lightly and then press them against the sides of the tin. Repeat with the rest. The first one may be a bit tricky, but after you finish one, the others should only take a few seconds each. Bake at 375\u00b0F for 6 minutes, or until the corners are golden brown. Let them sit in the pan for a minute and then place them on a cooling rack.. While the Wantons cool, make the chocolate mousse. Add the heavy cream, cocoa, and powdered sugar to a mixing bowl. Beat with a hand mixer on medium speed until stiff peaks form. . Make sure the wantons are cool before filling. Scoop the mousse into a piping bag and cut the tip off of it, about a 1/4\" up. Pipe the mousse into the wanton cups. 
Eat them immediately or keep refrigerated up to 1 day.The mousse won't store for very long, but you can make the wanton cups and store for several days.\nRead the question below and select from the following choices.\nA: Chocolate Mousse\nB: Chocolate Mehndi Mousse Cakes\nC: Time for the White Chocolate\nD: Add the Whipped Topping", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_126_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_126_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_126_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_126_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_126_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_126_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_126_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_126_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_126_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: White Chocolate Chip and Macadamia Nut Cookies !!\nB: Ingredients\nC: Vegan Spelt Chocolate Chip Creamcheese Cookies\nD: Optional (add Chocolate Chips)", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Cream the Bown Sugar With the Egg', '@placeholder', 'Fill Up the Paper Cups', 'Video Tutorial']", "context": "Here is the context of these images:\n. First we will go ahead and preheat our oven to 350 degrees F. Now let's cream the brown sugar with the egg.. Next let's go ahead and pour the almond milk in, and mix. . Now let's combine our gluten-free flour mix, ground flaxseed, baking powder, cinnamon, and nutmeg. Now stir it all together with a spoon or wooden spoon. . And if you like sweets like the girls, and when I wasn't looking, they added a cup of dark chocolate chips (dairy free).. 
After the chocolate chips are folded in, fill up the paper cups about 2/3 to 3/4 depending on the size you want your muffins to be. Then put them in the oven and bake them for 30 minutes at 350 degrees F. . Leah wanted to have a dance party while we waited for the muffins to be done. Feel free to have one as well. :) or you know, just watch some tv or something. . Now eat them and enjoy!\nRead the question below and select from the following choices.\nA: White Chocolate Chip and Macadamia Nut Cookies !!\nB: Ingredients\nC: Vegan Spelt Chocolate Chip Creamcheese Cookies\nD: Optional (add Chocolate Chips)", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_127_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_127_16.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Chocolate Mice\nB: Beautiful*\nC: Spun Sugar\nD: Chocolate Sled", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Melting 
Chocolate', 'Strawberries! Yum', 'Petals', '@placeholder']", "context": "Here is the context of these images:\n. You will need the following:\n> 8-12oz of chocolate chips(this makes about two roses)> 1/2cup of corn syrup\n> strawberries (as many as the amount of roses you wants to make)> water> plastic bag> plastic container\n> bamboo sticks or any strong wood\u00a0sticks\n> pair of hands:D. there are two ways to melt chocolate in the microwave or on the stove. I chose the stove because it the easiest way not scorch or burn it.\n\u00a0\u00a0\u00a0 to melt on the stove put the fire on low(if you put it on high its most likely gonna burn)\n\u00a0\u00a0\u00a0 then boil the water in a saucepan and put a plate on top (make sure the bottom of the plate doesnt touch the boiling water)\n\u00a0\u00a0\u00a0 put chocolate chips inthe plate little at a time\n\u00a0\u00a0\u00a0 stir continuously\u00a0\u00a0\u00a0\n\u00a0\u00a0\u00a0 when its completely liquid turn off the stove\nTo melt chocolate in the micro wave:\n\u00a0\u00a0\u00a0\u00a0 choose a microwavable container\n\u00a0\u00a0\u00a0\u00a0 put in half of the amount of chocolate and put as much time as it\nneeds to melt(if it burns or scoches add the remaining chocolate and stir.. \n\tAfter melting the chocolate add 1/2 a cup of corn syrup and mix untill the glossy coat is gone.when your done mixing pour into a plastic container and put it into the freezer for 15 min. after the 15 minutes take out of the freezer and put into plastic wrap (i used a plastic bag its the same) and put it in the refrigerator\u00a0for half\u00a0an hour to harden a bit more.(\u00a0the chocolate should come out of the plastic container easily like soft clay). While the chocolate hardens, wash the strawberries and chop off leaves(or if you want leaves for the rose leave them on). 
the strawberries have to be perfect so if they are a bit freeformed shaped then with a knife shape them perfectly.after shaping place them in a cup of ice because we dont want the chocolate to melt.\u00a0since the chocolate will fall off if it is wet it is preferable if you dry them with a dry towel. after cutting,cooling and drying wash you hands with freezing water and dry completely and get ready to work with the chocolate.. For this step you will have to take the chocolate out of the refrigerator and cover the strawberries with it. First, take a chunk of chocolate and play with it alittle untill it is moldable. Then, cover the strawberry completely (make sure the chocolate cover is not to thin or it will tear and also make sure not to work too much w/ the chocolate or it will melt). After that let the chocolate cool before you work on the petals.. for the petals you will have to be super careful(they tear easily). First make a roll of chocolate to make it easier to make petals. with a knife slice a piece of the roll about 2cm wide.repeat that about 15 times\u00a0. when you have finished that pat them down around the edges so the inside is fatter than the outside and begin placing them around the chocolate covered strawberry. place them to were you think looks best.. The only thing left to do is to\u00a0insert a bamboo stick on the bottom of the rose and put it in the freezer to chill and TAHDAH!!!!!you have a beatiful flower!!!you can decorate it with edible paint or glitter. you can also put it on a cake for a great topper!!! 
i\u00a0would really love\u00a0to see your flowers so dont forget to send me pictures of them!!!<3\nRead the question below and select from the following choices.\nA: Chocolate Mice\nB: Beautiful*\nC: Spun Sugar\nD: Chocolate Sled", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_128_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_128_26.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", 
"visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: White Chocolate Mousse\nB: Temper Chocolate\nC: Chill\nD: Fill the Molds", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Add the Chocolate', 'Add the Whites to the Chocolate Mixture', 'Add the Whipping Cream to the Chocolate Mixture', '@placeholder']", "context": "Here is the context of these images:\n. Gather your ingredients. -Approximately 9 ounces of dark or semi-sweet chocolate-Four large eggs-1/2 pint of heavy whipping cream (one cup)-One cup of baker's sugar (regular sugar will work; more or less to taste)You will also need four medium or large bowls and a mixing utensil (preferably flexible in order to scoop up the most chocolaty goodness possible). . Separate the eggs into two bowls. Put the whites in one bowl and set aside for later. Put the yolks in another bowl. We will be using the yolks in the next step.Separate the whites from the yolks by splitting the egg in half and carefully sliding the yolk from one half to the other so that the whites will fall into the bowl but the yolk will stay in the egg. . Just as the title implies, add one cup of sugar to the yolks and beat until the yolks are very light. The new mixture should be an off-white or cream color. While you are beating the yolks, you should put the chocolate into the microwave for two and a half to three minutes at 40% power. Stir the chocolate until it is smooth and allow it to cool while you continue beating the yolks. . Add the chocolate to the egg yolks and fold until it is well mixed. After you have mixed the chocolate and egg yolks, you should start beating the egg whites. . The egg whites should be beaten until very stiff. Once the egg whites are stiff, they should be folded into the chocolate mixture. After you have thoroughly mixed the whites into the chocolate mixture, you can start beating the whipping cream. . 
When you are finished beating the whipping cream, it should be stiff; even more stiff than the egg whites were. Fold the whipped cream into the chocolate mixture. . After you have folded the whipped cream into the chocolate mixture, you should let it cool in the fridge for about two hours. You could eat the mousse now, but if you let it chill first it will stiffen and the bits of chocolate will harden and provide a better texture.You can add flavors, such as a teaspoon of instant coffee or some white chocolate, to add some variety to your dessert. \nRead the question below and select from the following choices.\nA: White Chocolate Mousse\nB: Temper Chocolate\nC: Chill\nD: Fill the Molds", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_129_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_129_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_129_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_129_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_129_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_129_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_129_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_129_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_129_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_129_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_129_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_129_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_129_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_129_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_129_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_129_15.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: How to Bedazzle Your Pimp Cup!\nB: Cook Until Golden Brown\nC: Heat the Oil to Medium Heat\nD: Get Ready to Print", "question": "Choose the best 
title for the @placeholder to correctly complete the recipe.['Find an Image', 'Edit the Image', '@placeholder', 'Print Already']", "context": "Here is the context of these images:\n. Find the image you want to etch onto your piece of candy. The ring maker in the aforementioned video chose to replicate The King's face. I decided to replicate Lebron's logo, an L and J with a crown on top. Google Images is the best.. An image by itself is boring, right? Use Adobe Illustrator or CorelDRAW to edit your image so it's only black and white. It's also important to keep it simple. Remember, you're etching onto a small surface so don't overdo it. I added Lebron's last name and number to this jem.. If using a 60 watt Epilog laser such as the one I used at TechShop San Francisco, set your speed to 100% and power to 35%. It worked for me, so obviously it will work wonders for you.. Print your awesome design! This 8th wonder of the world took just 7 seconds to appear. That's more time than Lebron needs to sink a game winning field goal.. Show off your one of a kind piece of candy to kings and friends alike. 
Which ring do you like better?\nRead the question below and select from the following choices.\nA: How to Bedazzle Your Pimp Cup!\nB: Cook Until Golden Brown\nC: Heat the Oil to Medium Heat\nD: Get Ready to Print", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_130_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_130_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_130_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_130_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_130_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: You Will Need...\nB: Easy One Egg Omelet\nC: Assemble\nD: \"Coquito\" Puerto Rican Egg Nog.", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Technique', 'Cooking Egg Foo Yung', 'Turning the Egg Foo Yung']", "context": "Here is the context of these images:\n. For this recipe you will need:1/2 cup Cooked [chopped]meat (i'm using turkey but you can really use anything or no meat at all)1 cup Cooked Vegetables1/4 cup chopped cooked onion3 Eggs3 Tablespoons Soy SauceCooked RiceCooking spray or oil Gravy (optional)cooking mold (optional)Saute the onions (and the veggies if you're using raw vegetables). Combine the cooked onion with chopped meat, veggies, onion, eggs and soy sauce.. This dish till rise or fall on your cooking technique, so pay close attention! Get your skillet REALLY HOT. If this is your first time making egg foo yung consider using a cooking mold. I've been known to use the lid of a mason jar as a mold. If you happen to have cooking mold GREAT! If you don't have anything to use as a mold, don't panic. In lieu of a cooling mold use a 1/3 c measuring cup.. Once your skillet is VERY HOT spray the cooking mold and skillet with cooking spray (or drizzle it with oil). Immediately spoon the egg mixture into the mold. 
If you're using a measuring cup (instead of a mold), pour 1/3 cup of the egg mixture into the hot skillet. Without a mold you'll need to use the edge of your spatula to keep the eggs from running.Allow the mixture to cook for approx 3 minutes or until the bottom of the egg foo yung is golden brown and the egg is not runny.. Once the first side is brown, remove the cooking mold and turn the egg foo yung over. Cook the second side until golden brown. Press each egg foo yung with a spatula to make sure that all of the egg is cooked at the center. . Heat leftover rice and gravy. Serve the egg foo yung over rice.Enjoy!for more recipes check me out at www.OneBrownMom.com\nRead the question below and select from the following choices.\nA: You Will Need...\nB: Easy One Egg Omelet\nC: Assemble\nD: \"Coquito\" Puerto Rican Egg Nog.", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_131_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_131_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_131_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_131_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_131_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_131_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_131_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_131_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_131_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_131_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_131_10.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: the Countryside, Doublewide, BLT Grilled Cheese\nB: Finish Cooking and Cut\nC: Build the Sandwich\nD: Combining the Two Pieces of Bread", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Preheat the Oven', 'Place the Bread in the Frypan', '@placeholder', 'Brown the Bread']", 
"context": "Here is the context of these images:\n. Ingredients:1 1/2 tbsp. of butter 2 slices of white bread 1 slice of muenster cheese 1 slice of gouda cheese 1 slice of American cheeseSupplies:Stove Frypan Butter Knife Spatula Plate. Preheat a burner on the stove to medium heat. Place the frypan on the burner to be used.WARNING: The burner and frypan will be hot. Avoid touching both of these.. Use the knife to place 1/2 tbsp. of butter in the frypan. Try to melt the butter evenly throughout the frypan. Margarine can be used instead if preferred.. Use a butter knife to apply 1/4 tbsp. of butter to only one side of the white bread. Apply the butter in multiple areas so that it melts evenly. Repeat the previous steps for the other piece of bread. Again, margarine can be used instead if preferred. Also, other varieties of bread can be used instead.. Make sure the butter in the frypan has fully melted. Place the bread in the frypan with the buttered side face down.. Remove all plastic or paper from the 3 cheese slices. Place the gouda cheese slice on a slice of bread. Place the American cheese slice on the other slice of bread. Place the muenster cheese slice on top of the American one.. Wait approximately 2-4 minutes until the cheese has melted. Use the spatula to place the 2 slices of bread together so that the all of the cheeses are now touching.. Use the spatula to flip the sandwich periodically to brown the sides. Perform this step for however long you prefer.The longer you keep the sandwich cooking, the more burnt the bread is.. Transfer the grilled cheese from the frypan to a plate using the spatula. Wait a minute to let the sandwich cool down and then enjoy!. The Grilled Three Cheese is a quick, delicious meal that will have you wanting more. While it already is tasty, feel free to experiment with it by possibly adding ham. It's simple enough to make for an amateur at cooking while also delicious enough for anyone. 
I hope you enjoy your Grilled Three Cheese!\nRead the question below and select from the following choices.\nA: the Countryside, Doublewide, BLT Grilled Cheese\nB: Finish Cooking and Cut\nC: Build the Sandwich\nD: Combining the Two Pieces of Bread", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_132_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_132_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_132_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_132_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_132_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_132_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_132_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_132_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_132_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: How to Make Oatmeal Chocolate Chip Cookies\nB: Ingredients\nC: Choosing the Right Drink\nD: Recipe", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Add Chocolate', \"Don't Overcook Your Cookies\", 'Rock Out With Your Cookie']", "context": "Here is the context of these images:\n. Dry ingredients: - 1 cup flour - 1 tsp baking soda - 1/4 tsp salt Stir in small bowlWet ingredients: - 1 cup light brown sugar, packed* - 1 cup crunchy peanut butter - 1 stick (1/4 pound or 8 tbsp) unsalted butter, room temperature - 1 tsp vanilla extract - 1 large egg - 1/4 cup honey Blend in medium bowl Stir the dry ingredients into the wet ingredients in two additions.*Yes, brown sugar is technically dry, but since it's mixed with all of the rest of the wet ingredients I just put it there. I've used chocolate chunks hacked from a huge chocolate bar, broken up smaller chocolate bars, and semi-sweet chocolate chips. All worked out great.In these photos I'm using 6 oz. 
semi-sweet chocolate chips. Stir them into the mix.Cover the bowl and put in the fridge for 30-40 minutes so it's easier to handle.. Ovens lie. It's a fact of life that some ovens will tell you that they're at the temperature you want and be off by a full 80 degrees. In my previous apartment, the oven was nice and new and had lovely digital controls. It was also a horrible liar hell-bent on ruining my baking attempts. If I wanted the oven to be at 450, it would let me know it was ready when it was only 370. Five minutes later it would stabilize at 430. That's a full five minutes of your dough being cooked at the wrong temperature, completely throwing you off. So long story short, buy a thermometer and stick it in the oven. It doesn't have to be pretty, it just has to work so you'll know what the true temperature is. For this recipe, you'll want it at 350 Fahrenheit or 175 Celsius.. I roll my dough into balls about 1.25\" (3cm) wide. You can go a little bigger if you want. I've found that a little bit of variation does not have a noticeable effect on the cooking time. Put them onto a buttered cookie tray.The trays I use are insulated ones. Getting fancy cookie trays may seem a bit extreme, but there was a period last year where I got a bit obsessed and these have been totally worth it.So now that you're sure your oven is at 350F/175C and it's stable at that temperature, put the dough in for 12 minutes. . After 12 minutes the cookies will not look ready, but take them out anyway and put the whole pan out on the counter to cool. Do not touch them!Oh, you'll want to touch them. You'll want to put one in your mouth right away, your tender tongue be damned, but don't do it. Really.OK, so you did touch one and it seems fragile and undercooked. That's why you need to let them cool for 5 minutes before moving them to a rack or a plate. . Yay! You waited for the cookies to cool and now they're good to go! You can eat them now or save some for later. 
These are good on their own or combined with other sugary treats. Like ice cream. Ice cream loves these cookies and vice versa. Let the love flow. To get the fresh baked warmth and goodness feeling back, quickly zap in a microwave or briefly toss into a toaster. Eat.\nRead the question below and select from the following choices.\nA: How to Make Oatmeal Chocolate Chip Cookies\nB: Ingredients\nC: Choosing the Right Drink\nD: Recipe", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_133_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_133_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_133_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_133_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_133_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_133_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_133_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: How to Cook Papadums\nB: Wait...\nC: Peel the Peppers\nD: Mix", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Prepare the Cooksite', 'Put the Turkey Under the Can', '@placeholder', 'The Moment of Truth...']", "context": "Here is the context of these images:\n. The turkey needs a small amount of preparation. The giblets and neck should be removed. Bend the turkey wings behind the bird (i.e. put the bird in a full-nelson). If you are injecting with marinade, inject the night before up to an hour or so before cooking. Salt and pepper the outside and inside of bird. Just before cooking, divide the stick of butter in half. Shove a half between the skin and meat over each breast.. Prepare the cooksite about 30 minutes to one hour before starting to cook.Crumple some news paper and stuff into the bottom of the charcoal chimneys. Set down upright on a non-flammable, out of the way surface. 
Fill each chimney with charcoal. Douse with lighter fluid to your level of risk tolerance. Light the paper. In my experience, if I can get the kindling to light well, the charcoal takes about 30 minutes to get going.While the charcoal gets going, lay out about a 2 ft x 2 ft square of heavy duty aluminum foil on the ground. I usually have to fold together two pieces of foil. Weight the corners with rocks or bricks. Drive the stake into the center, so that 12 to 18 inches is above the foil. There should be just enough above the ground so that the turkey's legs just touch the foil when hanged on, and the paint can will invert over the bird and sit firmly on the ground.. Hang the turkey on the stake. The legs of the turkey should just touch the aluminum foil, but the turkey must be low enough that, when the can is inverted over the bird, it sits firmly on the ground. Adjust the stake for optimal bird placement.Invert the paint can over the bird. Be certain the opening of the can sits flat on the ground. Dump the now whitish charcoals in the chimneys around the can. Be sure to wear the welding gloves when you do this! Use the charcoal tongs to evenly spread the coals around the can.When you are initially sighting the turkey cooking spot, chose a place away from any structures, and in an inconspicuous place. The heat of the charcoal will scorch any grass under the foil, so chose an out of the way place (in front of the front door is probably not a great idea).. Now we wait...1 hour and 50 minutes, to be exact. Do not peek. Do not raise the can. If it is cool or windy (less than 50 F, steady wind), you might want to add another chimney of charcoal after 1 hour. This is a judgment call.So, just relax. My uncle says the bird takes a 6 pack of beers to cook. If my aunt is within earshot, he says it takes three beers.. After 1 hour and 50 minutes have passed, you can remove the can. You should hear the bird sizzling. Put on the welders gloves. 
Use the charcoal tongs to pull back the charcoal from the can.Fetch all of your skeptical guests...Remove the can and bask in the oohs and ahhs.. Cover the coals with another piece of foil, or fold over the foil on the ground to cover the coals. Place a large pan near the bird hanging on the stake.Wearing the welders gloves, carefully remove the bird from the stake. At this point, if this is your first time, you will notice that the bird is very VERY tender. It will have a tendency to fall apart, and into the coals if you let it. Put the pan close, cover the coals as best as you can, remove the bird as swiftly and cleanly as possible, and pray.Nothing stifles those oohs and ahhs more quickly than a bird dropped in the charcoal. If you do drop the bird, raise your arms high and shout, \"Fear not, I am uninjured.\" This may distract your guests just long enough to brush the ash off the turkey...Take the bird to the kitchen or picnic table, carve, and serve. This is the most tender, juicy turkey. You will rule the day. Everyone will want to be your friend. Enjoy your moment in the sun!Clean up is pretty easy. I usually wash the stake in the dishwasher. I scrub some of the cooked on ash and fat off of the paint can with a steel wool pad. 
When the can is fairly clean, I rub the can inside with some vegetable oil, to keep the rust down.\nRead the question below and select from the following choices.\nA: How to Cook Papadums\nB: Wait...\nC: Peel the Peppers\nD: Mix", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_134_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_134_16.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Chewy Chocolate Chip Cookies\nB: Make Those Cookies!!\nC: Ingredients\nD: Dough", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Mix Dry Ingredients', 'Bake It Up!!', 'Eat Those Cookies!!']", "context": "Here is the context of these images:\n. 
Yield: 4 3/4 dozen cookies \u00a0 \u00a0 Prep Time: 15 minutes \u00a0 \u00a0Cook Time: 10-13 minutes\nIngredients:\n2 1/2 cups all-purpose flour\n1 teaspoon baking soda\n1/4 teaspoon salt\n3/4 cup cocoa\n1 cup butter, at room temperature\n1 cup granulated sugar\n1 cup brown sugar\n2 large eggs\n2 teaspoons vanilla extract\n1 cup semi-sweet chocolate chips\n1 cup nuts, I used a mix of walnuts and pecans\n1 cup Jet-Puffed Mallow Bits\nTools:\n1 large bowl\n1 medium bowl\nwhisk or sifter\nmeasuring cups and spoons\nbaking sheet\nparchment paper or silicone baking mat\nmixer or wooden spoon\nmini ice cream scoop\nI used a stand mixer for this recipe, but you can mix it up with a hand mixer or a wooden spoon as well. For the best results use parchment paper or a silicone baking mat. You don't need to replace the parchment paper for each batch, just continue to use parchment until all the dough has been baked. \u00a0I also use a mini ice cream scooper to make perfectly portioned cookies.. Preheat oven to 350 degrees F.\u00a0\nSpread nuts evenly on a baking sheet. Bake 10 to 15 minutes until nuts are lightly toasted.\nWhen nuts are cool, chop roughly.\nLine a baking sheet with parchment or silicone baking mat and set aside.. In a medium bowl, combine flour, baking soda, salt and cocoa.\nWhisk well or sift together with a sifter. Set aside.. With a mixer, cream butter. \u00a0\nAdd sugars and mix until smooth.\nAdd in eggs, one at a time.\nNext, add in vanilla extract and mix until blended.. Slowly add flour mixture to sugar mixture a little at a time until flour disappears. Scraping the sides of the bowl\u00a0occasionally.\nStir in chocolate chips, nuts, and mallow bits.. Drop cookie dough by rounded tablespoons or using a mini ice cream scooper onto prepared baking sheet, about 2 inches apart.\nBake cookies for 10 to 13\u00a0minutes, or until the cookies are set around the edges, but still soft in the center. 
When I bake 2 cookie sheets at a time I rotate the sheets and switch shelves halfway through the baking time.\nRemove from oven and let sit on baking sheet for 3 to 5 minutes. Move to a cooling rack and cool completely.. Mmmmm chocolaty gooey goodness!! Pour yourself a glass of milk and munch away!\nThanks for checking out my instructable! I hope you enjoy this recipe!\nRead the question below and select from the following choices.\nA: Chewy Chocolate Chip Cookies\nB: Make Those Cookies!!\nC: Ingredients\nD: Dough", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_135_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_22.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_135_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_135_24.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Slow Cooked Italian Beef!\nB: Finish\nC: Add Chicken and Spices to Slow Cooker\nD: Shred the Chicken", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['What You Will Need', 'Cut the Tofu in Half', 'Cut the Mushrooms', '@placeholder']", "context": "Here is the context of these images:\n. 1 block of extra firm tofu1.5 cups of mushrooms (baby Bella mushrooms pictured)2 cups of your favorite sauce (teriyaki pictured)1 can of nonstick cooking spray1 cutting knife1 cutting board1 waffle iron1 slow cookerPrep time: 30 minutesCook time: Flexible. Carefully, use your cutting knife to half the tofu block so you are left with two ~3/4 inch thick slabs. Note: This thickness was chosen to optimize the surface area to internal-tofu ratio on my waffle iron. Your personal taste or waffle iron may call for different thickness tofu slabs.. The tofu is cooked on the waffle iron to increase the surface area and remove some of the water in the tofu to make room for the flavorful fluids to come in and later explode with deliciousness.Preheat the waffle iron completely. Spray the waffle iron with nonstick spray and quickly add the tofu then carefully close the waffle iron. After approximately 5 minutes when the tofu is lightly browned and crispy to the touch on both sides, remove it from the waffle iron and set to the side to rest. Repeat with all remaining slabs of tofu.Note: You may find that slight pressure is required to close the waffle iron completely.. Cut the mushrooms as desired. I prefer ~1/4 inch slices.Note: These instructions use mushrooms; however, you can use whatever you want: broccoli, bok choy, carrots, meat, nothing, etc.. 
Cut the tofu into bit size (or larger) morsels.If the waffle pattern is similar to that shown, I suggest cutting so each waffle well is quartered. Do so by repeating the cut seen on the left twice to receive tofu blocks similiar to those on the right. Consider trying a block of the tofu now to compare the before and after of the slow cooking.. Combine ingredients in slow cooker and cover with desired sauce. These instructions used a generic store bought teriyaki sauce from Trader Joes; however, any sauce will do. If necessary use broth to increase the fluid to solid ratio, but keep in mind that mushrooms and tofu both tend to release some fluid and cook down.Turn on high heat for and cook for a minimum of 1-3 hours depending on your preferred texture.. Cook a carbohydrate to go with your delicious tofu dish and absorb extra sauce. I recommend short grained rice or soba noodles. Seen above is short grain brown rice cooked in a rice cooker for ~90 minutes. Cooking times will vary so plan accordingly.. 
Plate your food in your favorite bowl or plate and enjoy the delicious meal in front of you.\nRead the question below and select from the following choices.\nA: Slow Cooked Italian Beef!\nB: Finish\nC: Add Chicken and Spices to Slow Cooker\nD: Shred the Chicken", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_136_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_136_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_136_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_136_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_136_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_136_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_136_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_136_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_136_8.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Minute Boiled Dumplings\nB: Bring Water to a Boil.\nC: Boiled Beef Tongue\nD: Acquire Tongue", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Salt to Water Ratio', 'Soak!', '@placeholder', 'Are They Done Yet?']", "context": "Here is the context of these images:\n. For this recipe, you only need the following:PeanutsSaltWaterBig ol' stock pot.Yup. That's it. Leave it to southerners to figure out this amazing snack, with such simple stuff. They know.. From what I had experienced, I was certain that the magic was in how salty the water is. I did a little digging around with otherfoodexperts, to figure out how much salt and water they were using.What the experts said varied, but I ended up going with a ratio of 3 Tbs of salt for every 5 quarts of water - and maybe adding a little bit more as it cooks.This snack is designed to sit in this water after cooking is complete, so it will get some of it's salt from remaining in the brine.. 
The raw peanuts are really dirty. After rinsing them a few times, it's a good idea to let them soak in a warm bath for about 45 minutes, stirring occasionally.I was impressed how much silt came off of them. (Warning: When I made these for a second time, I was a little lax on soak time - and they came out kind of gritty. YUCK.). In a big 15 quart stock pot, I brought about 10 quarts of water and the salt to a boil - it took a while to really get ripping, so maybe boil the water as the peanuts soak.. Add the peanuts to the salt water, and try and maintain a rolling boil. Keep a wooden spoon near by, and stir every 20 minutes. Keep covered when not stirring.Do NOT Simmer - you want this guy to be bubbling throughout the entire cook time. Depending on your range, you'll have to figure out what setting to cook these at, but the flames were about medium-high here in our test kitchen.. When they are done, the husks are soft and the peanuts inside are not crunchy at all. It should have the texture similar to a refried bean :)It should take anywhere between 3 and 4 hours to get them this mushy and perfectly brined.. When they are cool enough to scoop with a slatted spoon, serve in small bowls, and maybe offer a second bowl for shells.I knew these would go fast, but I wasn't expecting people around the office to be as voracious as they were - mikeasaurus proclaimed \"this is my new favorite thing\", JON-A-TRON and jessyratfink - the actual southerners I work with were stoked, and others never knew that a peanut could be cooked and served in this fashion. 
My roommate asked \"are you sure this isn't crack?\"Success!\nRead the question below and select from the following choices.\nA: Minute Boiled Dumplings\nB: Bring Water to a Boil.\nC: Boiled Beef Tongue\nD: Acquire Tongue", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_137_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_137_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_137_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_137_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_137_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_137_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_137_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_137_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_137_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_137_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_137_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_137_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_137_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_137_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_137_14.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Cheesecake Pops\nB: Supplies\nC: Make Fingers\nD: Haggis Pops", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Make', 'Cover and Stick', 'Decorate and Enjoy']", "context": "Here is the context of these images:\n. 
\n So, what are you going to need?Supplies:\n\n\t\tCake Mix of ChoiceMarshmallow Fondant\u00a0- I made a double batch for the Valentine Rice Krispie Treats and had about half left (which is the 8oz recipe), this seemed good for coving the cake pops, but if you want to decorate them with more fondant, you are going to need to make a bit more.\n\t\tBag of Marshmallows (I used 8 oz)\n\t\tSticks\n\t\tSprinkles if you would likeHeart Cookie Cutters (or you can freehand it) \u00a0I like to get a lot of use out of things I buy so it's nice to invest if you can be inventive :). Most know the procedure, but if you don't, here it is :)\nBake a cake.\nLet it cool. \u00a0I broke it into 6 pieces to try to speed the cooling.\nCrumble the cake.\nHeat up marshmallows in microwave. \u00a0I went about a minute at a time at half power and then stirred and heated again till melted.\nMix melted marshmallow and cake. \u00a0You can put this in the fridge for a bit, but I found it wasn't necessarily helpful.\nTime to form them!\u00a0\nI went with a similar method as the Rice Krispie Treats. \u00a0I shoved the mixture into the cookie cutter, but then I heaped it a bit to make it a bit more round. \u00a0 Photo 7 shows this.\nDon't put these in the fridge or freezer if you can help it. \u00a0If the cake mixture is cold, when you put the marshmallow fondant on them it gets all wet and sticky.. Time to cover them in fondant. \u00a0I didn't really find a best way to do this, but I'll talk about some ways I tried.\nUsing the next size up cookie cutter, I cut out two hearts in fondant. \u00a0I then wrapped this around the heart. 
\u00a0I think this would have worked nicely except the dip in the top of the hearts didn't meet up very well at the top of the heart.\nNext I treated it like the rice krispie treats and just wrapped it around starting from the front and closed it up in the back.\nAnother way I tried that I didn't photograph is I wrapped it around the heart (like ^) but wrapped it around the top and then cut the excess of the sides of the heart.\nI'd love to hear any tips on covering cake pops in fondant.\nOnce it is covered, stick a stick in the bottom.. You can decorate them if you would like. \u00a0I tried out a few things. \u00a0\nCut out smaller hearts to decorate them. \u00a0Use corn syrup to stick them to each other.\nUs corn syrup to stick sprinkles to the pop. \u00a0You can do a specific design like I did with the pearl sprinkles on the pink one in the back right or just completely cover part of the pop with corn syrup and drench it in sprinkles like with the white one in the middle back.\nNow eat them!\nRead the question below and select from the following choices.\nA: Cheesecake Pops\nB: Supplies\nC: Make Fingers\nD: Haggis Pops", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_138_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_138_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_138_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_138_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_138_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_138_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_138_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_138_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_138_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_138_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_138_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_138_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_138_12.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_138_13.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Wiring Your Home Brewery\nB: Attempt to Eat\nC: Play With Your Food\nD: Harvest Those Hops!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['How to Know When Your Hops Are Ready to Harvest', '@placeholder', 'Dry Hopping With Wet Hops', 'Dry Hopping or Adding to Boil']", "context": "Here is the context of these images:\n. Native to Europe, Asia and North America- hops now grow on many continents and in many countries. In the Pacific NW many feral Cascade hop plants can be found growing on telephone poles, shared fences etc. Please always be courteous to the people that grow or cultivate hops and ask them before you harvest hops about town unless it is clear that they are truly \"wild\" plants.I grow many different varietals of hops (they are all the same species, just different cultivated varietals, just like wine grapes) in my own yard and have many friends that grow them for the pleasant shade they provide when allowed to grow on pergolas and fences. It takes about two years to have a crop of hop flowers large enough to make a batch or two of homebrewed beer from, but after that the plants can be very prolific growers and producers. About this time of year I end up with a large hop harvest from my own hops (Cascade, Galena, Golding, Sterling, Willamette and Zeus) and extra Cascade from my neighbors. To know that the hops are ready to be harvested, they should be fully grown (more than an inch long for most varietals) and the blades or petals of the cone should be a little bit papery. Some of the hops may already have some browning on the tips. They are not ready if they are very springy and wet feeling, they are over-ready if they have opened up into full bloom and have turned yellow/brown. 
Please wear gloves that go as far up your wrists as possible when picking hops or pulling down bines because they cause \"hop-rash.\" They seriously do, and it's no fun.. Get a good helper like I did and fill some paper bags with fresh hops-Then get them in the fridge or in your brewing beer asap!. Well... this certainly sounds like an oxymoron.Fresh hops right off of the bine (yes, bine- not vine) must be used quickly or they will start to mold or go all cheesy and gross-tasting due to the oxidation of the oils in the hops. They are called both \"Fresh Hops\" and \"Wet Hops\" interchangeably. Using wet hops or fresh hops is super fun and makes for some amazingly floral beers, but if you've got more than you can use in a harvest, don't throw them away, see my other instructable for building a Hop Drying Oast. . Let's talk about avoiding bacteria or wild yeast infection a bit-Depending on where you are with your brewing process you can either:1. Add the fresh hops into the boil of a new batch of beer to make it a fresh hop beer. In this method the boil kills the bad bugs you don't want infecting your beer.2. Add the fresh hops to the secondary fermentation vessel (in this case a glass carboy) after the vigorous primary fermentation stage and enough alcohol and CO2 has been created to kill the nasty bugs you don't want in your beer. 3. 
Add them into a filter cartridge or what is called a Randall between your keg's output line and the tapMany people add fresh or wet hops to other stages of beer making like after flame-out as a steeped hop addition for aromatics, but try at your own risk of spoilage :)\nRead the question below and select from the following choices.\nA: Wiring Your Home Brewery\nB: Attempt to Eat\nC: Play With Your Food\nD: Harvest Those Hops!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_139_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_139_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_139_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_139_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_139_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Chicken Yakisoba\nB: Sterilize Jars\nC: Serve and Enjoy\nD: white Chicken", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Finish Jar Prep', 'Wrapping Up', 'Conclusions']", "context": "Here is the context of these images:\n. If you are using old jars, it's a good idea to sterilize them. If they are new jars, it still wouldn't hurt, but you are probably fine to skip this step.Fill a pot with about 1/2 to 1 inch of water and let it boil. Then place the jars upside-down in the pot for about 5 minutes. Then remove them from the pot to cool down.. Cut the chicken into medium-largeish sized pieces and stuff them into the jars leaving about 1/4 inch of room at the top.. Clean around the opening of the jars with a clean, wet wash towel. The main reason for this is to prevent the jars from sticking when you finally want to eat the chicken.Put new lids on the jars with fitted rings.(IMPORTANT!!! The lids need to be new or the seal won't work! The fitted rings can be as old as you want). 
Fill the pressure cooker with:5 quarts of water1 Tablespoon Vinegar (this prevents your pressure cooker from turning black)Then place your jars in face up (jars should not be completely submerged, the water should be somewhere below the fitted rings).. Seal off the pressure cooker and place over high heat. Let the water boil, which will cause steam to escape from the top.10 minutes after it began to steam, skillfully place the 5 pound weight on top.Now set your timer for 1 hour and 30 minutes. Once the weight starts spinning \"quickly,\" turn down the heat a little bit.. After the timer goes off, turn off the heat.Let the pressure cooker cool before removing the lid (there should be a pressure indicator on your cooker, once that goes down it means the pressure inside the cooker is the same as the pressure outside the cooker). This is important for two reasons, one is your safety (lots of really hot steam will shoot at you) and the quick change in pressure can break the jars. So just be patient.Remove the jars and let them cool.Store the chicken until hungry.. Chicken does not need to be refrigerated. This stuff will last for a long time (at least a year, maybe more).The main reason we do this is for quick and easy meals. The chicken is already cooked so we just throw it in our favorite recipe and heat it up. 
The chicken is very tender and easy to shred after this process.What can you make with Canned Chicken:Any meal that requires shredded chicken Chicken Salad Chicken Taco Soup Chicken Enchiladas Hawaiian Haystacks Chicken Sandwiches Chicken and Ritz Casserole (probably our absolute favorite, so we made an instructable for this meal!\nRead the question below and select from the following choices.\nA: Chicken Yakisoba\nB: Sterilize Jars\nC: Serve and Enjoy\nD: white Chicken", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_140_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_140_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_140_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_140_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_140_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_140_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_140_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_140_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_140_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_140_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_140_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_140_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_140_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_140_13.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Bruin Beskuit (Multigrain Rusks)\nB: Items\nC: Bubur Lambuk\nD: Cooking", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients (all 4 of Them!)', '@placeholder', 'A Bit of History', 'Bonus Round!']", "context": "Here is the context of these images:\n. Ingredient 1: Potatoes - How many depends on both the size of the taters and the number of people you want to feed. One tater per person is a good rule of thumb. 
In my case I had some big ol' russets and just 4 taters was enough for 6 people.\u00a0Ingredient 2: Barley Flour - Don't sweat it, you can find this at most grocery stores. Regular all-purpose is fine too, if you have allergies to look out for then by all means. Barley flour is just what we've always used.Ingredient 3: Salt Pork - Call it sailors' meat, call it white bacon, call it whatever you like. This sodium rich swine is just what's needed to give these taters some real flavour. A small package goes a long way.Ingredient 4:\u00a0It's a secret! - but don't worry, I'll spill the beans in due course. Bear with me.. Step 1:Peel your tatersStep 2: Shred your taters - I've always used a cheese grater for this but presumably there are other tools that would do the job just as well.Step 3:Add the secret ingredient\u00a0- Caraway seeds!\u00a0What a revelation! I sprinkled in about 1 tablespoon worth to achieve a present but not overwhelming flavour.\u00a0Step 4: Add flour and mix about\u00a0- Stir in a 1/4 cup at a time until you reach a wet dough consistency (or until your arm falls off, whichever comes first). The idea is to soak up some of the water from the potatoes and thicken the mixture, but not so much that it becomes dry or crumbly. Some potatoes are more watery than others so the amount of flour needed will vary. I used just shy of two cups for this batch. After you've achieved the desired consistency, or something close too it (believe me it's not a precision process), you can set it aside.\n*note*\u00a0If the mixture looks thoroughly unappetizing at this point you're doing it right.Step 5:Cut salt pork - I made 1inch cubes, but a little bigger or a little smaller is fine. Set them aside when your done.Step 6: Form your raspekaker\u00a0-First wet your hands, then grab a tennis ball-sized amount of the potato mixture and form it into a ball. Next grab one of your salt pork cubes and push it into the middle of the ball. 
Reform the surface of the ball\u00a0concealing the meat inside. Set the ball aside on a plate and continue to form raspekaker until you've used the entire mixture. It's advisable to clean off your hands after forming each ball as the sticky mixture can accumulate making the process more difficult than it has to be.. First get out your biggest pot, fill it with water and bring to a boil.\u00a0Next begin dropping your raspekaker into the pot (not on top of each other but beside). Use a spoon or laddle for this operation if you want to avoid spashling hot water around when you plop them in the pot.\nSimmer for 1 hour. It's a good idea to use a spoon push them around the pot once in a while, just to make sure they're not stuck to the bottom.. While it's cooking I'll take a moment here to recount the history of the humble raspekaker within my family. It entered our diet by way of my father, who picked it up from his mother who learned it up from her husband who was himself Norwegian. As the recipe traveled from one kitchen to the next so to did it evolve. My father recalls eating it with cooked bacon in the middle instead of salt pork, and the drippings from the bacon was saved and drizzled over the raspekaker after they'd finished cooking. On special occasions, namely Christmas morning, they would drape slices of\u00a0gjetost over the steaming raspekaker, which would melt and create a delicious cheesy coating. Gjetost, another little piece of Norwegian influence that my grandfather brought with him, is a brown goat cheese... the flavour is very unique.\u00a0\n*note* When I say brown goat cheese I don't mean cheese from a brown goat, but brown cheese from a ordinary coloured goat (brown being but one possibility in the spectrum of goat colours).\nFor me raspekaker is something of a comfort food but it can also be a very practical part of your diet. I find 1 ball makes for an adequate breakfast. They freeze well and for their size pack a lot of calories. 
It's a nice change up to oatmeal or breakfast cereal in the morning. They also lend themselves well to experimentation. At its most basic it's just potatoes and flour; a blank slate. This is probably why if you look around everybody is making raspekaker in slightly different ways. You can pick what meat and seasoning you like best, or forgo such complications and just dress it up after cooking with a sauce or spead of your choice.\nAnyways, that's enough of me expounding on the virtues of raspekaker, lets get back to the kitchen and see what we've got.. Use a ladle to retrieve the raspekaker and let the water drip off them as you do so. Now they're ready to serve.\nI like to cut them in half exposing the\u00a0succulent\u00a0porcine core. Then I drizzle some melted butter on top for added flavour and sprinkle it with a bit of parsley for looks. That's that! Eat up.. It's almost always the case that you'll have leftover raspekaker, but if you think that means you made too much think again. Arguably the leftovers are the best part. Just pop 'em in the fridge or freezer until you're ready for them. When it's time for more raspekaker just bust out the frying pan and brown 'em up in some butter. Oh boy do they taste great like this!\nWell folks, that's all for this instructable. 
From my family to yours, happy cooking.\nRead the question below and select from the following choices.\nA: Bruin Beskuit (Multigrain Rusks)\nB: Items\nC: Bubur Lambuk\nD: Cooking", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_141_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_141_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_141_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_141_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_141_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_141_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_141_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_141_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_141_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_141_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_141_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_141_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_141_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_141_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_141_14.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Watermelon Limeade With Chervil Infusion\nB: Make Watermelon and Chervil PopCorn\nC: Milk\nD: Watermelon Cucumber Smoothie", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Sugar', 'Chervil', 'Drink and Enjoy!']", "context": "Here is the context of these images:\n. Serves 2.- 1 and a half cups of roughly chopped watermelon flesh- 2 and a quarter cups of milk- 2 tablespoons of cream- 2 teaspoons of white sugar- 1 teaspoon of finely chopped chervil. Roughly chop enough watermelon to equal 2 cups. Remove seeds and place in blender.. Measure out 2 and a quarter cups of icy cold milk and add to blender along with 2 tablespoons of cream.. 
To this add 2 teaspoons of white sugar.. Finely chop enough chervil to fill a teaspoon and add to blender mix.. Hold down lid of blender and hit the start button. Blend until all ingredients are combined and mixture becomes light and frothy.. Warning: this milkshake is sooo good you mightn't want to share!Pour blended mixture into glasses and enjoy!\nRead the question below and select from the following choices.\nA: Watermelon Limeade With Chervil Infusion\nB: Make Watermelon and Chervil PopCorn\nC: Milk\nD: Watermelon Cucumber Smoothie", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_142_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_142_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_142_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_142_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_142_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Bake!\nB: Simple Bread Dough\nC: \nD: ", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Mix the Dries, Add the Wets.', 'You Need to Knead.', 'Cut and Roll', '@placeholder']", "context": "Here is the context of these images:\n. Take your 2 1/4 Cups warm water and add your sugar to it. I used brown sugar, which is why it looks dark.Add your yeast, stir it in, and set it aside while you work on the rest of it. You can do this before setting up for baking, because it takes a little time. During this time, set your oven on low if you are using it for rising. Proving the yeast allows the little yeastie beasties to wake up and get busy reproducing and converting the sugars. After a couple minutes, you should have some foam on top. Sometimes I get real good foam, sometimes weak foam, but I think it's usually temperature, and the bread comes out fine unless the yeastie beasties are deceasedie. . 
Mix most of the flour (5 cups or a little more) and the 1 T salt. A mixer is not necessary, but I've got one, you know?Add the proved yeast, add the 1/4C vegetable or olive oil, and mix it up! Saving some of the flour is so that it doesn't get too dry, because measurements sometimes change and it's better to have wet dough for rising. If it's really wet and sticky, add more of the flour until you reach the full 6 cups. . Once it's well mixed and still kinda sticky, turn it onto a floured counter and knead it until nice and smooth. Add flour so it stops sticking to you, but err on the wet side. The picture is pre-kneading. It will turn into a beautiful ball afterwards. . Get a big bowl with a little oil, roll your dough in the oil, and set it in a warm spot. If using the oven, you will want to preheat it on its lowest setting and turn it off before putting the dough in. I forget to turn it off sometimes, so I just leave the door open for a couple minutes to let it cool to a nice warm cozy temperature for the yeasties. Leave it for 60-90 minutes, until about doubled in size. . After the dough has risen take it out of the oven, and turn the oven to 475 F (245 C). Put your baking pan or cookie sheet in the oven as it heats, you want it to be hot!For this size recipe, you can cut the dough into 16 equal pieces, make them into little balls, and roll them flat. Give them a good coat of flour so they don't stick to the pan. Roll them about 6 inches in diameter, so that you can hold one in a hand fairly flat. I only rolled them two at a time to save counter space. . Hold a flat piece in one hand. With an oven mitt on the other hand, open the oven, and pull out the hot pan just enough to flop your bread onto it, nice and flat. You can grab and pull it if it folds, but don't burn yourself!It'll take about 5 minutes for each one, and you'll see them puff up a bit. If you're lucky, you'll get full balloon-type pitas, but the others are good too! 
Bake a little longer if you want some darker crust. That's it!These go well with just about anything. Bake/microwave with some cheese, eat it with chili, dip it in your borscht, or just heat it up and give it some butter. Enjoy!\nRead the question below and select from the following choices.\nA: Bake!\nB: Simple Bread Dough\nC: \nD: ", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_143_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_143_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_143_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_143_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_143_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_143_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_143_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Saos and Cream Cheese {yum}\nB: Fry the Bread\nC: The Mix\nD: Cous Cous and Halloumi Cheese", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Lets Go Shopping', 'Pre Step.. Cuttiing the Cheese.......', '@placeholder', 'Serve It Up']", "context": "Here is the context of these images:\n. A simple line up of ingredients done to my taste. I like velvetta, a processed cheese, as well as chedder. Miricle Whip is a bit more heart friendly than mayonaise but its kind of sacrilidges in this dish. For the kicked up spice I'm using jalapeno's in adobo sauce.\n\u00a0Velvetta cheese\n\u00a0 Chedder cheese (Im using white extra sharp)\n\u00a0 Jalapenos in adobo sauce\n\u00a0 Grater\n\u00a0 Container with lid\n\u00a0 Knife. Velvetta cheese is very soft and difficult to grate, I cut off a chunk, stick it in a plastic bag and into the freezer for about a half hour. 
For a very simple tool a grater can be\u00a0 very dangerous to your fingertips and knuckles so be careful here unless you like blood in your food and bandages on your fingers.\n\u00a0Grate your cheese or cheeses into the container your going to mix and store the cheese in, this will save on washing up. take a couple of your jalapenos and some adobo sauce, put it all on a plate or other cutting surface. Chop, slice, dice, mash or smash the peppers. scrape, spoon or otherwise get the peppers and sauce from the previous step into the bowl of grated cheese. Add your mayonaise or miricle whip and\u00a0stir. Does it look to dry? (pic2) add more miricle whip and stir some more until it looks like pic 4. There's a lot of ways to enjoy pimento cheese, the basic but very popular spread on white bread, as a dip with dorito's, corn chips or pita crisps, on crackers or as Im doing here on lightly toasted whole grain. It also makes an awesome grilled cheese sandwich\nRead the question below and select from the following choices.\nA: Saos and Cream Cheese {yum}\nB: Fry the Bread\nC: The Mix\nD: Cous Cous and Halloumi Cheese", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_144_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_144_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_144_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_144_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_144_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_144_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_144_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_144_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_144_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_144_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_144_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_144_11.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", 
"source": "RecipeQA", "options": "A: Professor Phineas' Infamous Ginger Beer\nB: Chip Cookies\nC: Measure Butter\nD: Ingredients and Equipment", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Flour and Spices', 'Putting It All Together', 'Bake']", "context": "Here is the context of these images:\n. \u00bd cup butter\n1 cup packed golden or dark brown sugar\n1 large egg\n\u00bc cup molasses\n1 tsp vanilla\n3-6\u00a0 pieces candied ginger, chopped\n2 cups all-purpose flour\n1 tbsp fresh grated ginger\n1 tbsp ground ginger\n1 tsp ground cinnamon\n\u00bd tsp ground cloves\npinch ground cardmom\n2 tsp baking soda\n\u00bc tsp salt\ngranulated sugar for rolling. I don't like measuring butter after it's softened like you're supposed to do for cookie recipes, when it's already marked out in handy increments on the wrapper. Take the butter out of the fridge and cut off \u00bd cup, which is a quarter of a brick, one stick, 113 grams or 4 oz, however you want to quantify it.. Measuring brown sugar is my favourite part of cookie baking, next to sampling the result! Spoon sugar into your measuring cup, pressing down with the spoon as you go. When the cup is full, unmold the cute little cake of sugar into the bowl with the butter. If you packed it enough, the sugar won't fall apart when you do this.. Now that the butter and sugar are together, you'll have to wait for the butter to soften enough to be able to cream them. Cutting the block of butter into small pieces will help it warm up faster while you get the rest of the ingredients ready.. Put the two cups of flour into another bowl, and stir in all the ground spices, baking soda, and salt. I like to save the cardamom seeds from the cardamom pods I get in expensive tea blends after I've drunk the tea, and grind them into whatever I'm baking.. Chop the candied ginger and peel the bottom inch or inch and a half of the fresh ginger. 
Don't cut the peeled part off, you'll have something to hold on to when you grate it.. By now, your butter should have softened enough to mix. If not, take a tea break, or zap it in the microwave for a few seconds if you're impatient and using a microwave safe bowl.\nBeat the butter and sugar together on low speed until thoroughly mixed. Then add the egg, molasses, and vanilla. Grate the ginger and add that too. Beat again until smooth and creamy.. This is the part where I clean the beaters on the mixer because I won't be needing it anymore. When combining the wet and dry mixtures in any baking, you don't want to overmix it because that will develop the gluten in the flour and make your baked goods tough. I'm mixing this with a spatula.\nAdd half the flour mixture and the chopped ginger to the creamed mixture. Stir just until mixed. Add the rest of the flour and stir just until mixed. A little bit of dry stuff around the edges is ok.. Now, set up a little manufacturing station with your dough, a saucer of sugar, and your baking sheet. I found that using parchment on the baking sheet makes for chewy cookies. Baking them directly on the greased sheet results in crisp cookies. I like my cookies crisp on the outside and chewy inside, so I will be baking them on parchment.\nPreheat your oven to 350\u00baF now, and start rolling the dough into 1\" balls. I like making the cookies small so I can eat more of them.\nI do most of the forming with the teaspoons to avoid handling the dough too much.\u00a0 Sorry there aren't more pictures of this process, but it's a two-handed job. What I do is scoop up a small lump of dough with one spoon, and then scrape it into the other spoon. Scrape it back into the first spoon. Repeat this a few times until the lump of dough is rounder and more compressed. Then I give it a quick roll between my hands and drop it into the dish of sugar. 
Roll it in the sugar until all sides are covered, and place on the cookie sheet.\nIf you stagger the lines of cookies, you can fit more of them on the sheet.\nKeep on rolling cookies until the oven is ready. After I put the sheet in, I start putting the cookies-in-waiting on a plate.. Bake for 8-10 minutes. When you take the cookies out, they will be puffy, but they will flatten when they cool. While they are baking, finish rolling the rest of the dough.\nIdeally, you should have two cookie sheets, so the next one will loaded with cookies and ready to go in the oven by the time the first one is done. I only have one sheet, so I move all the cookies off it after a minute or two, and then let the sheet cool before putting on the next batch of cookies. You don't have to let the sheet cool all the way - if you can pick up the sheet with your bare hands then it's ready for more cookies.. You're done! This recipe makes about 60 cookies.\nThese cookies travel very well, so share them with your friends. 
Or don't, and keep them all for your teatime.\nRead the question below and select from the following choices.\nA: Professor Phineas' Infamous Ginger Beer\nB: Chip Cookies\nC: Measure Butter\nD: Ingredients and Equipment", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_145_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_27.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_145_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_29.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_30.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_31.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_32.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_33.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_34.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_145_35.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Cake Base\nB: Rainbow Layer Cake\nC: How Many Guests Do You Need to Feed?\nD: Make Your Cakes, Icing, Fondant, Etc.", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Apple Layer', 'Poppy Seeds Layer', 'Walnut Layer']", "context": "Here is the context of these images:\n. 500 g flour250 g butter100 g icing sugar3 yolks1 dl white wine of your choiceMix dry ingredientsAdd in yolks and butter, work until mixture is crumblyPour in wine and kneadWrap dough with cling wrap and chill in the fridge for 2 hMeanwhile, prepare the other layers. 800 g green apples (tart but sweet)2 tbsp honey1 lime, juiced and grated the skinenough ground cloves and cinnamon powder (to taste)Skin and grate apple, then mix with the rest of the ingredients. 200 g poppy seeds, ground until fine100 g sugar50 g raisins1 dl white wine of your choiceMix everything in a pot, heat over low-medium heat until thickened (sugar dissolved)Cool to room temperature. 200 g ground walnut100 g sugar50 g raisins1 dl white wine of your choiceMix everything in a pot and heat over low-medium heat til thickenedCool to room temperature. 
Divide dough into 4 portions, roll flat each of themPlace one layer as bottom, spread walnut filling, top with another layer of dough, spread poppy seeds layer, then top again with another dough, spread with apple layer and top with the last doughBrush top dough with an egg wash and bake at 170 C for 60-65 minutesJust before slicing, sprinkle top with generous icing sugarEnjoy :)PS: You can place them into mini cupcake liners \nRead the question below and select from the following choices.\nA: Cake Base\nB: Rainbow Layer Cake\nC: How Many Guests Do You Need to Feed?\nD: Make Your Cakes, Icing, Fondant, Etc.", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_146_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_146_18.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Making the Pie Crust\nB: Scratch Pumpkin 
Pie\nC: Roll, Dry, and Cut the Dough\nD: Mix the Dough", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Scooping Your Pumpkin', 'Pumpkin Mixture', '@placeholder', 'Filling and Baking']", "context": "Here is the context of these images:\n. Sugar pumpkins are the best, butternut squash is a great substitute. First wash the pumpkin then cut in half vertically. Remove the seeds and strings(reserve the seeds for toasting). Place on a baking sheet shell side up and bake at 325 deg. for 1 hour or more depending on size. Bake until tender.( I bought this pumpkin from Home Depot.... This is the first time the shell did not collapse, usually the shell gets soft so don't get worried if it gets soft ...that's about the time its done. ). When baked tender scoop out all pulp and place into a processor or blender, blend until smooth. In a mixing bowl mix well 2 cups pumpkin puree.1 1/2 cups evaporated milk1/4 cup dark brown sugar1\\2 cup white sugar1/2 tsp. salt1 tsp. cinnamon1/2 tsp. ground ginger1\\4 tsp. ground nutmeg 1\\8 tsp. ground cloves2 slightly beaten eggs. I use Sweetzels spiced wafers for the shell. in a processor add 2, 7 oz. boxes of wafers. and blend until fine.add: 2 tbsp. light brown sugar and a dash of salt. With the processor still running add 6 tbsp melted butter until combined.Line a 9 inch pie pan with the crumb mixture about 1\\8 inch thick all around..Bake at 350 deg. for 6 to 8 minutes and let cool.. Fill the pie shell with the pumpkin mixture. very carefully place into a 425 deg. oven for 15 min. Reduce heat to 350. deg. and bake for 45 minutes more or until an inserted knife comes out clean. Any extra mixture can baked in a small ramekin . Enjoy!!!! 
\nRead the question below and select from the following choices.\nA: Making the Pie Crust\nB: Scratch Pumpkin Pie\nC: Roll, Dry, and Cut the Dough\nD: Mix the Dough", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_147_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_147_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_147_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_147_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_147_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_147_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_147_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_147_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_147_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_147_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_147_10.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Chicken Shwarma\nB: Relax Your Chicken!\nC: More Chopping.\nD: Enjoy!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Preparing the Samosa Sheets', 'Pasting in to Triangle', 'Frying', '@placeholder']", "context": "Here is the context of these images:\n. For the\u00a0 filling\nChicken pieces with out bones (1/2 Kg)\nOnions(double the quantity of the chicken)\nGarlic\nGinger\nLime\n3-4 cloves\nFew green chillies (option)\nSalt\nChili powder/Paprika powder\nOil\nCurry powder\nTurmeric PowderFor the dough\nFlour\nSalt\nOil\nWater. Combine salt and water together mix and knead well. 
With your floured hands roll out ,make small balls in to pairs,double the size of a marble and flatten them with your finger tips.Apply oil to one side and place another flatten dough on top.Press them together ,roll out a bit.\nDust a little flour on the working board and roll them again\u00a0 to oval shape to about 7-8 inches in length.If you want you can roll it a little bit longer\u00a0 ,so that you can have bigger Samosa's. Have them all ready, so that you can warm it in the\u00a0 pan one by one. Toss them in to the heated non stick pan for few seconds.Just one side half done would be fine\nDon't wait until it is fully done.Take it out and slowly remove the two pasted sheets.Since I applied oil in between the doughs once it is heated slightly, it removes with out any problem\nYou can make the Samosa sheets as much as you want and freeze them for later use..Defrost\u00a0 very well before you use.. Make a paste by mixing flour and water.\nFold from\u00a0 the\u00a0 right side of the dough, with your finger tips apply the paste marked along the dots.(see picture)Press well and paste.\nNow fold from the left side and paste on top of the first fold.Se detailed\u00a0 picture.Once you do the pasting it should be in a\u00a0 shape of a triangle.. Combine chicken,salt and turmeric powder,cook\u00a0 on slow fire until is done.Drain the water out and chop the chicken in to tiny pieces.\nChop the onions and make the garlic and ginger paste.\nHeat oil and stir fry the garlic first,and add the ginger paste,when it is a bit golden\u00a0 in color add the onions ,salt. 
and curry leaves.\nIf you like to have hot ,add green chillies cut in to small rounds.The onions will reduce to half of the quantity once it is cooked\u00a0 on slow fire.Then add the chicken,stir well,add chili powder ,turmeric powder,curry powder one by one on slow fire.\nWhen it is fully cooked add half lime juice ,stir,cover and let it simmer in slow fire.Time to time m last mix well .This gives a very nice flavor.. I filled up all the triangle cones with the yummy chicken filling.\nFollow the rest of the pictures you will understand.\nHave them all ready in a try to be fried.. Depending\u00a0 on how big your frying pan, put a couple of the Samosa's in hot oil and fry them until golden brown.Turn them on both sides while frying.Transfer it to a sieve for the oil to drip and then to a kitchen tissue.Semi fry if you want to freeze them for later use.\nWhen it is cool pack them up and freeze it..Defrost two hours before and re fry to golden color.. Have them hot hot as it is very crispy.I love this Chicken Samosa so much that I end up eating about 8-10 at\u00a0 one go.\nEnjoy\u00a0 and thanks for reading.Note:\nI have posted this in the Hurriecane contest too.\nI am a day care mom who is working full time,and do not have enough time for my sewing and crafting work which I love it very much.If ever I win the laser cutter I will give up my full time job or reduce it to a couple of hours(because I do a lot of craft work\u00a0 with the kids and they will be dissapointed to see me no more)and start my own business at\u00a0 home as I have enough and more sewing orders which I can not do it on time.Every instructable I do with the help of the laser cutter,I will definetly post it on this great site.\nRead the question below and select from the following choices.\nA: Chicken Shwarma\nB: Relax Your Chicken!\nC: More Chopping.\nD: Enjoy!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_148_0.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_148_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_148_26.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Blue Hawaii Parfait\nB: You Will Need...\nC: Just a Note...\nD: Snow Cone Surip", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Raspberries', 'Water', '@placeholder', 'Molding']", "context": "Here is 
the context of these images:\n. First you'll need 17 raspberries strawberry jello mix sugar Hawaiian bunch snow cone syrup and Popsicle molds. . Take the raspberries and crush them in a bowl. Take one table spoon of jello mix and put it with the raspberries. The jello will help it harden quicker.. Add 1/4 a cup of warm water I the bowl and mix.. Add 1/4 a cup of snow cone mix and mix.. Once mixed pour the mix in the molds and freeze and in 15-20 minutes and enjoy \nRead the question below and select from the following choices.\nA: Blue Hawaii Parfait\nB: You Will Need...\nC: Just a Note...\nD: Snow Cone Surip", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_149_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_149_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_149_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_149_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_149_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_149_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_149_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_149_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_149_8.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Man Cupcake Cake\nB: The Cake\nC: Spongebob Cake\nD: Decorate/Eat", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather Your Supplies', 'Vanilla Extract', 'Microwave', '@placeholder']", "context": "Here is the context of these images:\n. For this project you will needA coffee cupA 700 or 1000 watt microwaveTwo forks or One fork and One whisk5 Tablespoons, or one tablespoon you continue to whip off as you goA small bowl . For this recipe you will need: FlourSugarVanilla ExtractAn eggOil, (vegetable preferred)Milk. 
Measure out four tablespoons of flour and four tablespoons of sugar into your coffee cup, then mix them together with your fork. . Whisk an egg with either another fork or a whisk in a separate bowl. Then pour it into the cup with your flour and sugar. Next, stir all three ingredients together evenly. . Now, measure out three tablespoons of milk and three tablespoons oil and add them to your coffee cup. Blend them evenly with the other ingredients. . Add one table spoon of vanilla extract and stir it in. . It's time to microwave.If it's a 700 wattIt needs to cook 4 minutes.If it's a 1000 wattIt needs to cook 3 minutes.. If you would like you can gather frosting or other toppings and decorate your 'Cup' Cake before you eat it. I, personally, did not and just ate it plain. \nRead the question below and select from the following choices.\nA: Man Cupcake Cake\nB: The Cake\nC: Spongebob Cake\nD: Decorate/Eat", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_150_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_150_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_150_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_150_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_150_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_150_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Pancake Mix to Banana Pancakes\nB: To Cover or Not to Cover\nC: Enjoy\nD: Clean and a Word From Emma", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Spray and Pour', 'Wait and Take Out', 'Serve and Enjoy!', '@placeholder']", "context": "Here is the context of these images:\n. Add...\u2022 Your 1 cup of pancake mix\u2022 3/4 cup of water\u2022 And the optional sprinkles Mix...\u2022 Mix all ingredients until you have a good mix- not watery and not lumpy . 
Spray...Take your bakers spray and spray the top and bottom of your makerPour...Pour your batter in each of the donut holes. Wait...Wait until each donut has fully cooked threw-until golden and you checked with a tooth pickTake Out...Take the donuts out when you checked them all to see if they are cooked threw. Serve... Serve the donuts on a plate of course!Enjoy...Then enjoy your donuts! If you like put syrup and/or whipped cream in a bowl to dip in! (I look bad!). Cleaning...\u2022After you are done wipe the counters, maker....\u2022Put all of your ingredients away_____________________________________And you are done!Hope you enjoyed!Like, and comment and other donut flavors that I should try!- Emma (iluvmy2pets)Btw this was my first DIY on here!\nRead the question below and select from the following choices.\nA: Pancake Mix to Banana Pancakes\nB: To Cover or Not to Cover\nC: Enjoy\nD: Clean and a Word From Emma", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_151_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_151_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_151_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_151_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_151_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_151_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_151_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_151_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Slow Cooker Potato Soup\nB: Heat Milk & Chocolate\nC: Whisk Milk\nD: ins", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Mix All Your Ingredients in Your Slow Cooker', 'Let the Slow Cooker Do Its Thing', '@placeholder', 'Travel to the Party and Enjoy']", "context": "Here is the context of these images:\n. 
To make the hot chocolate you will need: 1 bag of semi-sweet chocolate chips (~2 cups) 1 carton of cream (16oz) 1 can sweetened condensed milk (14oz) 6 cups of milk (I used 2% because that is what we drink) 1 Tablespoon of vanilla Mix everything together in the slow cooker. Alternatively you can use whatever your favorite hot chocolate recipe is and mix it all in the slow cooker. Subbing your favorite type of chocolate in for the semisweet chips is also a way to mix it up (white chocolate chips also make a very yummy hot chocolate).. Once everything is mix set the slow cooker to low and let it heat for about 2 hours. I whisked the mixture after about 1 hour and then 1.5 hours in. After 2 hours I whisked again to make sure all was incorporated, after 2 hrs all the chocolate should have melted and it should be a delicious hot chocolate.. The hot chocolate on it's own is pretty awesome, but everyone loves a little extra too! The event I am bring this to has both drinker and non-drinkers so I left everything to the side so people could choose what they want to add! Ideas for non-alcoholic mix-ins Mini marshmallows (or big ones) whipped cream caramel sauce Ideas for alcoholic mix-ins Peppermint Schnappps Butterscotch Schnapps Irish Cream RumChata Kahlua Infused vodkas Alcohol infused creams. Once the hot chocolate is made you can just transport your slow cooker to where ever the party is, plug it in and have it set to warm or low. People can ladle hot chocolate into cups and mix with all the extras! Yum! 
I forgot to take a picture before I went and there wasn't any left afterwards to take a picture of!\nRead the question below and select from the following choices.\nA: Slow Cooker Potato Soup\nB: Heat Milk & Chocolate\nC: Whisk Milk\nD: ins", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_152_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_152_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_152_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_152_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_152_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Bubur Lambuk\nB: Gut Stuffing Prep\nC: Garlic & Quorn\nD: Sauce", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Bloody Sundried Tomato Oil', '@placeholder', 'Boiling Pasta', 'Finish Them Guts']", "context": "Here is the context of these images:\n. \n This instructable requires basic cooking and cutting skills.\nYou will be needing\u200b\n250g Spiral Pasta1 diced onion6 diced garlic cloves1 diced carrot1 stalk diced celery1 cup frozen peas3 sun dried tomatos18 large basil leafssalt and pepper100g Pancetta1 egg100ml cream100g parmesan cheese50ml olive oil (plus extra)1 length 30mm Sausage casingsFor the sausage casings, I did not want to have to recook the meal so I used non animal based casings. These required being soaked in warm water for at least 45 minutes. \u00a0After which I rinsed and let sit in bowl.\n . This step is easy, I used three Sundried tomatoes and a bit of reserved oil from the jar. \u00a0Dice up the tomatoes finely and mix with the oil... taadaaa!! Next step, the Sausage Stuffing Device.. I wish I had a proper sausage stuffing horn, but I don't. \u00a0So instead I made one. \u00a0I found a cylindrical plastic wine glass and cut off the bottom. 
\u00a0You can see a marking on the picture where the I measured a cut. \u00a0I aimed to make it around 25mm wide.\nFrom there I took a rinsed sausage casing and tied a not in the base of it, pretty simple ey'. \u00a0The other end I carefully slid on my new homemade sausage casing horn...\u00a0. \n Another relatively simple step. \u00a0\nGet a large pot and put in heaps of water, pasta likes to be able to swim around a bit...Get that water hot, I do this by igniting the small burning element underneath the pot.Salt your water, you should aim to make your water as salty as the ocean...\u00a0Take note, you will need about a cup of water from this step for the next step, either reserve a bit at the end or start the next step at the same time like a champ.Add pasta and give it a stir for a few seconds so that it doesn't clump together. \u00a0Notice the wooded spoon on top, that is to help prevent the water from boiling over.After around 5-7 minutes I begin testing the pasta, to do this I carefully scoop out a noodle and take a bit. \u00a0You want the pasta to be quite soft for this meal, a bit past al dente. If it is not done, wait a few more minutes and try again.When the pasta is swell, pour it into a strainer and give it a quick rinse. \u00a0I want the pasta to be warm still since I will be serving immediately.\n\u00a0\n . I place the pancetta in a pan on low with a tablespoon of olive oil then\u00a0let it slowly warm up, this encourages as much of the delicious fats to ooze out of this delectable Italian style bacon.After about 10 minutes of occasionally stirring I add the onions, garlic, carrots, and celery.Increase the heat to medium and continue cooking for another 5-10 minutes occasionally stirring.After the onions begin to brown and carrots soften, I begin to take some of the pasta water out and pour it into the pan. \u00a0This is a process known as deglazing. 
\u00a0You may have noticed lots of browned bits in the bottom of the pan, but adding some liquid and stirring with a wooden spoon was scrape those delicious bits off and create a bit of a sauce at the same time.I next add the cream and the peas.. \n By now you may be drooling a bit... That is to be expected. \u00a0The scent of garlic and fatty cured pork belly does that sort of thing.\nI next add the basil... thats it, just put the basil in the pan.Now I temper the eggs. \u00a0This is a process to slowly warm up eggs so that they do not curdle when added to a sauce. \u00a0To do this I mix the egg in a small bowl then add a few spoonfulls of the carbonara sauce.Next I pour the sauce into a bowl on top of the pasta...\u00a0Now throw on the parmesan cheese and egg mix. \u00a0The heat from the sauce and pasta will be enough to cook the egg and thicken the sauce a bit...Yeah, now it is getting cheesy and delicious...You can eat this now, or you can continue this instructable and make it look like disgusting intestines... your choice.. Now comes the fun and giggles...\nBe careful, I waited about 10 minutes for the pasta to cool down a bit so I did not burn myself.\nTo do this, I add a spoon full of pasta into the sausage horn and cram it down with the end of a spoon. \u00a0Not the most sophisticated of methods, but it works... \u00a0Do pay attention to the lovely noises the video makes, doesn't that sound delicious.\nKeep adding a bit more and cramming it down into the the casing. \u00a0Make sure you make a lot of childish jokes and unleash your true immaturity during this part.\nEventually you should have about a meter or so of dinner, tie off the other end and repeat the process until you are out of pasta. \u00a0I was able to make about 3 lengths of intestines with this.... Mix with the bloody oil, light some candles, and unleash your inner zombie.\nThese look disgusting, they really do... 
I am kinda cringing about making something that looks so gross, but it does taste quite delicious... so bon appetite... unless you are the walking dead, in which case I say aarrghh, yarrgghh, aaarrhhghgh.\nRead the question below and select from the following choices.\nA: Bubur Lambuk\nB: Gut Stuffing Prep\nC: Garlic & Quorn\nD: Sauce", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_153_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_25.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_153_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_153_27.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Salmon Carpaccio\nB: Cooking on the Stove\nC: Prepare Coronet Batter\nD: Greens", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Prepare Creme Fraiche', 'Prepare Salmon Tartare', 'Assemble and Eat']", "context": "Here is the context of these images:\n. To properly roll the cookies into their ice cream cone shapes, you'll need a conic form that can withstand some time in a 400F oven.\nThe cookbook suggests a #35 4 1/2\" coronet mold, but since this was a one-off for a French Laundry themed party we decided to make our own out of paper.\nAfter some rummaging, I found a 4\" diameter circular object for tracing (the base of a pitcher) and made some circles on a manila folder. I also made one on a sheet of glossy paper, the thick stock used as the cover of an expensive yuppie magazine we magically get for free. Note that I'm NOT putting the glossy stuff into the oven for fear of toxic bleeding or outgassing; that's what the manila folder is for.\nDraw another circle on the glossy paper ~1/2\" outside the original circle, and add a tab. Now cut around the outside circle and inside of the 4\" circle to make a 4\" diameter stencil.\nCut out the manila circles; I used 5. These need to be shaped into cones for use as your forms, so you've got to get them nice and tight. I wanted to staple them into position, but they're too small to successfully staple. We also nixed glue, tape, and rubber bands as unable to stand up to oven conditions. Pinning sounded good in theory, but probably would have ended in tears. I finally ended up sewing them in place, which was surprisingly fast. 
The key is to pass the thread directly THROUGH the cone, then wrap around the flap as you prepare for your next pass. After three or so stabs across the cone, exit next to the original knot (you should have made a BIG knot, and left an inch or so of tail) and tie off with the tail. These worked beautifully, and looked sort of spooky. . Ingredients:\n1/4c + 3T all-purpose flou\n1T + 1t sugar\n1t kosher salt\n8T (1 stick) unsalted butter, soft but still cool\n2 large egg whites, cold\n2T black sesame seeds\nMix flour, sugar, and salt together. Separately, whisk butter until it's completely smooth; I used my Kitchenaid with the whisk attachment. Add egg whites to the dry ingredients, and mix thoroughly with a stiff spatula. Dump the egg mixture into the butter, and whisk until batter is creamy and without lumps.\nI don't have a picture of the bowl of pasty goo, so here's some of it in the stencil.. Get out your Silpat. If you don't have one, head to any kitchen store and shell out $15. Once you have a Silpat you'll find a million uses for it.\nPlace the stencil on the Silpat, and scoop some batter into the center. Use the sharp-edged spatula of your choice to spread the batter in an even layer over the stencil; scoop off any extra. If it's grossly uneven you'll get localized browning/burning. Don't leave any holes. Lift stencil and repeat. I did five coronets per sheet, which seemed like plenty. Also, I only had the patience to sew five molds- don't lay down more coronets than you have molds.\nSprinkle black sesame seeds over the top of each coronet.. Put the Silpat on a baking sheet, and transfer to your preheated 400F oven. Cook for 4-6 minutes, until the batter is just set and you can see the batter ripple a bit. They'll start sliding around on little melted-butter trails if your baking sheet isn't entirely flat, but this is easily fixable.\nPull the sheet out and sit it on the open oven door to keep warm while you work. 
Hold the top of your paper mold with your off hand, and use a tool to manipulate the coronet with your dominant hand. Be careful- the coronet is hot and greasy; you REALLY don't want to touch it directly. Roll the coronet around the mold as tightly as you can, and finish with the seam side down. Roll the other coronets and place them up against each other to prevent unrolling.\nPop the sheet of rolled coronets back into the oven for 3-4 minutes to set the seams and let them color up a bit. The French Laundry seems to make coronets that are entirely golden-brown, but I took mine out earlier for fear of burning. This worked just fine.\nLet the coronets cool/solidify on paper towels for a few minutes before removing the paper forms.. Ingredients:\n1T finely minced red onions\n1/2c creme fraiche\n1/4t kosher salt, or to taste\nfreshly ground white pepper to taste\nRinse red onions in a sieve under cold water, then dry on paper towels. Whisk creme fraiche in a small metal bowl for 30sec-1minute, or until it holds soft peaks when you lift the whisk. Fold in onions, then season with salt and pepper. Refrigerate until ready to serve, up to 6 hours.\nI never got the creme fraiche to reach soft peaks, so shoved it in the fridge and hoped for the best. It gets a bit more solid as it chills, but... not a lot. Also, wash more than 1T onions as some get lost in the sieve; measure the 1T off of the paper towels.. Ingredients:\n4oz sashimi-grade salmon fillet (belly preferred), skin and any pin bones removed and very finely minced\n3/4t extra virgin olive oil\n3/4t lemon oil (zest is a potential substitute)\n1 1/2t finely minced chives\n1 1/2t finely minced shallots\n1/2t kosher salt, or to taste\npinch freshly ground white pepper, or to taste\nFind a nice big SHARP knife to mince the heck out of the salmon fillet. They claim a food processor would ruin the texture; it would certainly be less fun. Mix in remaining ingredients, then chill for 30 min to 12 hours.. 
Assembly is easy: a dollop of each ingredient, presented like an ice cream cone. They recommend serving them in a lucite holder, but I got lazy and it wouldn't have worked anyway (see below). If you can't get at a laser cutter or machine tools, you could wedge the cones in rock salt, peppercorns, or the like for a snazzy presentation.\nFirst, scoop a bit of the creme fraiche into the top of the coronet. Pipe it in with a pastry bag for bonus points. Apparently if you prepared it properly, it will be thick enough to stick at the top of the cone; mine chose to be too runny for this to work. Thus, the horizontal cone trick: I poured the creme fraiche in, then kept it as close to level as possible while adding the salmon, and served it lying on a plate.\nYou can use a melonballer to create cute little salmon scoops, or just do it quickly with a small spoon and/or clean fingers. Stick a chive tip out the top of the salmon ball to look extra classy, or possibly more like a Teletubby. Eat immediately if not sooner.\nEither way, they were fantastically tasty. If I do this again, I'd probably skip the cones and just plop the half-baked coronet rounds into mini-muffin pans to make non-leaky shells to hold my ingredients. I'd probably substitute a mix of cream cheese with either sour cream or yogurt for the creme fraiche, as it's a lot cheaper, and it mainly provides a fatty foil for the salmon. 
Could be made lower-fat if you care about these things.\nCertainly worthy of a repeat, though.\nThis made approximately 20 coronets.\nRead the question below and select from the following choices.\nA: Salmon Carpaccio\nB: Cooking on the Stove\nC: Prepare Coronet Batter\nD: Greens", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_154_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_154_23.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: cooked Rice & Chicken Recipe\nB: Rinse Rice & Lentil\nC: Chicken Pot 
Pie Recipe\nD: Ingredients", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Add Onion', 'Add Rice, Red Lentil and Water', 'Serve']", "context": "Here is the context of these images:\n. Step 1: Take 2 tbsp of lentil and 1/2 tbsp of rice into a bowl. Rinse couple of times and keep aside. Step 2 : Heat a sauce pan and pour 1/2 tsp of oil... Step 3 : Add ginger and saute for 20 seconds... Step 4: Add the Onion and saute for a minute... Step 5: Add the chopped chicken, spice powders, salt and cook for 2 minutes on medium-high heat... Step 6: Add the shredded vegetables, coriander leaves and cook for 1 minute... Step 7 : Add the rice, red lentil, 1 cup water and mix... Step 8: Put the heat and bring it to a boil. Step 9: When it boils, reduce the heat to low and cook for 20 minutes... To Serve your toddler: Take 1 Cup of cooked meal in a bowl, shred the chicken into tiny pieces and serve when warm :)..Hope your little-one enjoys this meal :)\nRead the question below and select from the following choices.\nA: cooked Rice & Chicken Recipe\nB: Rinse Rice & Lentil\nC: Chicken Pot Pie Recipe\nD: Ingredients", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_155_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_12.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_155_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_155_18.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Heck of a Mellony Dish\nB: Icing Time\nC: Flavor Combination #1\nD: Chilling and Storage", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['villi Orange Cake', 'Crumb Coating', '@placeholder', 'The Final Cakes']", "context": "Here is the context of these images:\n. The first stage of this cake is to make a lemon Victoria sponge.IngredientsMargarine (469g) Caster sugar (469g) Eggs (7 eggs) Self raising flour (469g) Baking powder (1.5 teaspoons) Lemons (zest and juice of two lemons)To get a great Victoria sponge, weigh the eggs in their shells and then match that weight with each of the other ingredients. This cake was a 7 egg sponge and these weighed 469g. Pre-heat the oven at 180C and line a rectangular tin. First, cream together the margarine and caster sugar until pale. Slowly add the egg to the mixture. Next add the flour and 1.5 teaspoons of baking powder. Once everything is combined add the zest and juice to the mixture. Put into the tin and smooth out and pop into the oven. This will take about 30-40 minutes. Try and resist the urge to open the oven as this will make the cake collapse. At the end of the time, check the cake is ready, pop a knife in the middle, if it comes out clean then it is done. If not, leave it for another 5 minutes. Leave to cool.. Now it's time for the gluten free damaged orange cake. 
This is adapted from a recipe from Nigella Lawson https://www.nigella.com/recipes/lemon-polenta-cake It was increased to being a 4 egg cake.IngredientsSoft butter (266g) Caster sugar (266g) Ground almonds (250g) Polenta (150g) Baking powder (1.5 teaspoons) Eggs 4 Zest of two oranges and the juice of 1Preheat the oven to 180C and line the tin. Cream the butter and sugar together until pale. Combine the ground almonds, polenta and baking powder together. Add 1/3 to the butter and sugar mixture and then add some of the egg. Alternate the dry goods and the eggs. At the end add the zest and the juice together. Pop this in the tin and then in the oven for 40 minutes. Check that it is done in the same way as before. This cake will not rise as much as the sponge.. Once the lemon cake is cooled, cut it in half.Cut one half of the cake into strips length ways approximately 3 cm wide. These will make the beginning of the villi.Now cut the strips into smaller rectangles and carve off the top of the rectangles to make cylinder shapes.. An important part of decorative cake making is the crumb coat of icing. This is a thin layer of buttercream icing which catches up all the crumbs of a cake which you have cut. First whip up some buttercream.Buttercream icingSoft butter (160g) Icing sugar (500g) Vanilla essence (0.5 teaspoons) Splash of milkAllow the butter to be room temperature, often I will leave it out from when I start the cake making process. Mix the butter with a little bit of the icing sugar. Slowly incorporate the icing sugar and add the vanilla essence. Once all the icing sugar is incorporated, add a dash of milk and mix for at least 5 minutes. This allows the buttercream to become very soft and easy to work with.To spread it over the cake, take some hot water and place a knife in it. Using a warm knife allows you to spread the icing like butter! 
Cover both cakes with a thin layer of icing, keep popping the knife into the water to help spread the icing.Once covered, place the cakes into the fridge for at least 30 minutes to allow the icing to get firm.. As with the cakes, now add a crumb coat to each of the healthy villi. To attach these to the cake, place a cocktail stick in the bottom of the villi and then pop on the cake. This will give added security. Place the villi on the cake in a random fashion.For the damaged villi, these are made out of fondant icing. Roll a palm full of icing into a ball and then flatten to create disks of icing. Now place on the cake in a similar distribution as the healthy villi.. This is may favourite part of cake decorating. Making the icing the right colour, it's a bit like playing with playdoh!I use just off the shelf fondant icing. First, you need to work the fondant. I add a little cornflour to my hands so the fondant doesn't stick to me. Work the fondant for a minute or two until it is soft. For this cake I used 1.5kg of icing. 1/3 of that was used for the darker pink for the sides of the cake and the other 2/3 were to cover the top of the cake. When you are not using the fondant make sure you cover it in cling film so it doesn't dry out.To colour them I use gel colours, they are a bit more expensive but they last a long time and a little goes a long way. With a cocktail stick, add a little of the colouring to the fondant icing. The covering was a pink colour and the outside used a chestnut colour as this can give a realistic skin colour.I roll my icing out on a non-stick mat covered in cornflour. If you do not have this I would recommend clingfilm or non-stick baking paper. . Roll out the icing for the top of the flattened villi cake first, this is the easier of the two cakes to cover. 
Try not to roll this too thin otherwise you will not be able to lift it off the rolling mat.Once it is big enough to cover the cake, place the rolling pin in the middle of the fondant and flip on side of the icing over the pin. This will enable you to then lift the icing up and place it carefully on top of the cake. Slowly rub the icing down over the flattened villi until the icing is draped over the sides. Now cut the excess icing away. Roll out the darker skin colour and cut to fit the sides. Press this onto the cake and leave for about 10-20 mins so that the icing hardens somewhat and then trim off the bottom excess.If at any point you get a small split or crack in the icing, a top tip is using Trex (a solid vegetable fat) to smooth things out.. For the healthy villi cake, start in the same way as the flattened cake. Roll out the pink icing and place it on top of the cake. Work the icing down the villi, do not worry if the icing rips. This is where the Trex comes into its own. Mix it with some of the coloured icing and you can use it like plaster to cover up the cracks and patch up the cake. Add the sides to the cake.. 
Display your proud villi cakes!Now Enjoy!\nRead the question below and select from the following choices.\nA: Heck of a Mellony Dish\nB: Icing Time\nC: Flavor Combination #1\nD: Chilling and Storage", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_156_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_27.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_156_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_29.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_30.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_31.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_32.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_33.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_34.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_156_35.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Mix Your Lemonade\nB: Lemonade Slushy\nC: Mix and Serve!\nD: Basil Lemonade", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['What You Need', 'Juicy!', 'Prepare the Water', '@placeholder']", "context": "Here is the context of these images:\n. The ingredients' list is very simple:5 big lemons (mine are from Puglia, South Italy :D)4-5 stems of fresh rosemaryGrinded cinnamon1.5 liters (50.70oz) of bottled mineral waterOrganic honeyOther tools and equipment:A big jarA squeezer (electric or manual)Knife, spoonPotAs you can see, nothing special!Let's move on.. Start by cutting the lemons in half with a long knife.Don't scratch your eyes while cutting lemons!!! I've learned it the hard way XDThen, squeeze all the juice you can with your squeezers.My beloved Kenwood blender has a special accessory, which transform it in a electric and powerful squeezer. Very useful!At the end, you should have from 400ml to 600ml of fresh, 100% pure lemon juice!Pour it in the jar, you can help yourselves with a funnel, like I did.. The next step consist in boiling the bottled water.Why, you ask? 
Two reasons:help the melting of honey and cinnamonextract the essence from the rosemarySo, start by pouring the water in a pot, with a strong fire under it.When the water is hot (but still not boiling!), add 3 big spoons of organic honey!If you like your lemonade more sweet, add a 4th spoon.Mix everything until it's homogeneous.When the water starts boiling, it's time to add 2-3 stem of rosemary to the mix.This step is fundamental to add that special flavor to our lemonade!Boil everything for nothing more than 3 minutes!We want a drink, not a broth.Stop the fire and pour a full spoon of cinnamon into the hot water, and then mix everything until you have a light brow mixture.Wait for the mix to be at room temperature.. Now, it's time to remove the boiled rosemary.It look like an algae, now! :DIt's not a good idea leaving a boiled, squishy plant inside a fresh lemonade.So, we are replacing the cooked one with a new, fresh stem of rosemary, that we can put directly into the jar with lemons juice.Now, take you mix (be sure it's at room temperature!) and pour it into the jar.You can always get a help from the funnel! ;)Stir the drink with a long spoon for a minute or two.Then, let it rest in the fridge for some hours.. After some hours of fridge, your special lemonade is ready!Enjoy this new taste, feel the rosemary in the background and the strong lemons flavor on your tongue!Refreshing and thirst quenching!This lemonade has a very strong flavor, due to the high amount of lemon juice.It's also very healthy, because the only sugar we are putting in is from honey!Also, you can transform it:fill a mug with your drink and microwave it for 2 minutes. Now you have a purifying and digestive tea!I hope you like this recipe, and get the most out of summer!Like always, from Italy, this is Filippo!Ciao a tutti! 
:D\nRead the question below and select from the following choices.\nA: Mix Your Lemonade\nB: Lemonade Slushy\nC: Mix and Serve!\nD: Basil Lemonade", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_157_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_157_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_157_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_157_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_157_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_157_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_157_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_157_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_157_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_157_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_157_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_157_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_157_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_157_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_157_14.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Ingredients\nB: Millet, Rye and Beer Bread\nC: Rise and Knead.\nD: Mixing the Sponge", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'stuff', 'Dry Ingredients', 'BAKE']", "context": "Here is the context of these images:\n. You will need the following ingredients:\nFlour\nBaking Powder\nSalt\nSugar\nBeer (Shock Top or Blue Moon are my favorite)\nButter. 1. Preheat oven to 375 degrees\n2. Grease bread pan.\u00a0 I always use part of the butter I have set aside for the recipe.\n3. Sift flour. 
Mix Dry Ingredients\n\u00a0\u00a0\u00a0\u00a0\u00a0 3 cups flour- sifted\n\u00a0\u00a0\u00a0\u00a0\u00a0 3 teaspoons of baking powder\n\u00a0\u00a0\u00a0\u00a0\u00a0 1 teaspoon salt\n\u00a0\u00a0\u00a0\u00a0\u00a0 1/4 cup sugar. \nAdd room temperature Belgian White Beer and mix with dry ingredients.\u00a0 It will be slightly lumpy.. \n1. Pour the dough into your greased bread pan.\n2. Melt 6 tablespoons of butter.\n3. Pour melted butter over the bread dough in the pan.. \n1. Bake the dough in the preheated 375 degree oven for 1 hour.\n2. Remove from oven and remove from pan.\n3. Cool for 15 minutes, serve and enjoy.\nRead the question below and select from the following choices.\nA: Ingredients\nB: Millet, Rye and Beer Bread\nC: Rise and Knead.\nD: Mixing the Sponge", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_158_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_158_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_158_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_158_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_158_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_158_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_158_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_158_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_158_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_158_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_158_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_158_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_158_12.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Simple Tomato Basil Sauce\nB: Soak Seeds\nC: Spoon Onto Bread If Included Tomatoes, Otherwise Dip Bread Into Mixture\nD: Gently Stir With Spoon to Keep Tomato Chunks Intact", "question": "Choose the best title for the @placeholder to correctly complete 
the recipe.['INGREDIENTS and Equipment', '@placeholder', 'Food Processor', 'Dehydrating']", "context": "Here is the context of these images:\n. 1 cup flax seeds (ground optional for better nutrient digestion) soaked in\n 2 1/2 cups filtered water\n1/4 cup chia seeds soaked in\n 1 1/4 cup filtered water\n1/2 cup almonds raw (soaked and peeled optional)\n1 1/2 cups cashews raw \n1 cup sunflower seeds raw\n1/2 cup pumpkin seeds raw\n1 large or 2 medium red bell peppers large dice\n1 medium red onion half large dice half small dice\n2-3 cloves garlic\n1 large lemon juiced\n4 Tablespoons nutritional yeast\n1 Tablespoon apple cider vinegar \n1 teaspoon salt\n1 bunch leafy greens chopped small\n10-15 large basil leaves chopped small\n1 cup tomatoes medium dice\n3 green onions small dice\n7-10 sheets nori seaweed \n\nEquipment\n\nFood processor\nRubber spatula or tool for spreading\nDehydrator . Soak flax seeds and chia seeds in water separately. Cover and let stand for 10-24 hours on counter top. When seeds have soaked, put them together and set aside. De-seed and chop bell peppers. Add to the food processor. . Cut onion in half. Roughly chop half and small dice the other half. Add rough chop to the food processor saving small diced onion for later. . Add the cashews, lemon juice, apple cider vinegar, nutritional yeast and salt to the bell pepper, onion and garlic already in the food processor. Blend until mostly smooth or desired texture. . Add almonds and pulse until they are chopped up. It should yield about 2 cups . Combine the vegetable and nut mixture with the seed slurry until thoroughly mixed. Add the sunflower and pumpkin seeds and mix well. Add the remaining ingredients saving the tomatoes for last. Take your nori sheets and cut them in half. Lay the rough side up on your dehydrator. Be sure to do the next steps on your dehydrator trays because you will not be able to move them one assembled. . Place a nice amount in a line down your nori. . 
Spread evenly making sure to get all the corners. You can do as thin a layer as you like but the thicker you go the longer it will take to dehydrate. I tend to do a 1/4 inch with this recipe. \n. When done assembling, cover and dehydrate until desired crunchiness. 24 hours is usually enough but on occasion 48 hours is needed. . 1. If you have an aversion to seaweed or prefer a lighter cracker you can use the plastic inlay that comes with some dehydrators and spread the mixture directly on it. \n\n2. To spice it up a little I sometimes put a tablespoon or two of Harrissa in the mixture. \n. These crackers are wonderful with a soft cheese like brie. Also hummus, harissa and sliced cucumbers are a favorite. Pretty much anything you would put on a traditional cracker will be nice with your tomato basil flax crackers. \nThank you, enjoy \nand eat your way to healthy\nRead the question below and select from the following choices.\nA: Simple Tomato Basil Sauce\nB: Soak Seeds\nC: Spoon Onto Bread If Included Tomatoes, Otherwise Dip Bread Into Mixture\nD: Gently Stir With Spoon to Keep Tomato Chunks Intact", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_159_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_159_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_159_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_159_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_159_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_159_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_159_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_159_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_159_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_159_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_159_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_159_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_159_12.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": 
"textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Peel and Cut\nB: Sonoran Hot Dog\nC: Put Them on the Grill\nD: Cook Them Until They Are Done", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Supplies!', '@placeholder', 'Grill and Sautee!', 'Add the Works!']", "context": "Here is the context of these images:\n. So, these can be Vegan dogs...but perfect for tricking kids! You will need:CarrotsPeeler/KnifeMarinade: 1/4 cup soy sauce1/2 cup water1 Tablespoon Worcestershire sauce1 Tablespoon rice vinegar1/2 Tablespoon apple cider vinegar few shakes of pepper and garlic powder. First, peel and cut big carrots to the same shape and length as hot dogs.Use a knife to cut a star pattern on the ends too...Boil in water for 5 minutes. Quickly remove them and put in cold water to cease cooking.Add them to your mixed up marinade in a plastic container.Put carrots in the marinade...completely submersed overnight in the fridge. (This marinade will turn them brown, so make sure they are submerged completely)Overnight is best...but even 4 hours is great!. Next day, grill them up to give them some grill lines...just a few minutes. (optional)Then add them to a frying pan with 1/2 cup of the marinade and cook until browned.They look like any other hot dog I've ever cooked! They have about the same texture as a hot dog.They taste salty and a little \"meaty\".... Now just add the fixings you want! 
Presentation is everything!!!We even did this same technique and roasted the carrots on the fire.They are awesome for a low calorie version!Check out my blog Doodlecraft for more awesome ideas!Please vote for me in the Pranks contest!\nRead the question below and select from the following choices.\nA: Peel and Cut\nB: Sonoran Hot Dog\nC: Put Them on the Grill\nD: Cook Them Until They Are Done", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_160_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_160_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_160_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_160_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_160_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_160_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_160_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_160_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Bacon Wrapped Weenies\nB: Boil the Pretzel Dogs\nC: Grill\nD: Bake Pretzel Dogs", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Slice the Jalapenos', 'Bacon!!', '@placeholder', 'Mmmmmmm']", "context": "Here is the context of these images:\n. Ingredients:\nHot dogs\nBacon\nCanned/pickled Jalapenos\nSliced cheese\nHot dog buns\nFavorite condiments\nCooking spray\nMaterials:\nTongs\nGrill\nPlates\nSpray bottle. \nSlice the jalapenos so that they are little rings.\u00a0 Then cut open the rings. Remove seeds if desired.\u00a0 Place sliced jalapenos on hotdogs.\n**warning: wash hands after working with jalapenos.\u00a0 DO NOT TOUCH EYES. (cuts are probably a bad thing to touch as well.). \nCut or tear cheese slices in half.\u00a0 Place 1 or 2 on each hotdog, on top of the jalapenos.. 
Wrap the hotdog in bacon, making sure to not let the cheese or jalapenos fall out.\u00a0 We used 2 slices of bacon per hotdog.\u00a0 Adjust as desired.. Preheat grill for 5 minutes, then turn the heat to medium.\nCarefully place bacon wrapped hotdogs onto grill using tongs.\nFlip (or carefully roll) the dogs every 7-10 minutes until bacon is crispy and delicious.\nSpray any little fire with water to prevent the dogs from burning.\n** It might be wise to spray grill with cooking spray beforehand.\u00a0 oops.. Remove from grill when finished cooking.\nPlace hotdogs on buns.\nAdd favorite condiments.\u00a0 (We recommend Goulden's spicy brown mustard)\nEAT EAT EAT.\nRead the question below and select from the following choices.\nA: Bacon Wrapped Weenies\nB: Boil the Pretzel Dogs\nC: Grill\nD: Bake Pretzel Dogs", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_161_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_161_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_161_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_161_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_161_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_161_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_161_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_161_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_161_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_161_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_161_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_161_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_161_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_161_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_161_14.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Start the Maceration\nB: Homemade Ardennes Pate\nC: Rolling\nD: Ta Da!", "question": 
"Choose the best title for the @placeholder to correctly complete the recipe.['Prepare the Botanicals', '@placeholder', 'Fitrate Macerate', 'Gin Distillation']", "context": "Here is the context of these images:\n. There are nine ingredients. Per 1 L of grain spirit (36-40% ABV) prepare:12 g juniper berry/cone (Juniperis communis)12 g coriander seeds (Coriandrum sativum)12 g cucumber slices (Cucumis sativus)3 g angelica root (Angelica archangelica)3 g rose petals (Rosa x centifolia)2 g cubeb pepper (Piper cubeba)1\u00d74 cm slice orange peel/zest1\u00d74 cm slice lemon peel/zest0.5 g caraway (Carum carvi)Weight out the ingredients and put into a jar.When preparing orange and lemon peel avoid the white on the back. I.e. only use the zest. If possible, use fresh cucumber from the garden or market. Its aroma is more intense. All ingredients are regular spices.. Transfer all ingredients into a jar. Fill up the jar with grain spirit (36-40% ABV).Close the jar. Store it for one week in the dark.. After one week the macerate coloured nicely. On opening the jar, beware of the explosion of flavours. I love that bouquet. It is something between summer flowers, herbal pharmacy, and gin.Pass the macerate through a fine mesh or coffee filter to remove the spices.. The macerate is now filtrated. At this moment it is way to strong to serve as regular gin. It needs to be distilled to obtain its mild flavour.Dilute 1 part Macerate with 3 parts potable water. The resulting wine will have about 8 % ABV. That is just perfect to be distilled in the microstill.Collect the spirit from the microstill and indulge your Egon Gin! 
To your health!\nRead the question below and select from the following choices.\nA: Start the Maceration\nB: Homemade Ardennes Pate\nC: Rolling\nD: Ta Da!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_162_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_162_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_162_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_162_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_162_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_162_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Vegetarian Meatloaf\nB: Dice\nC: Make the Broth\nD: Dice", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', '@placeholder', 'Prepare the Fixins', 'Finish It']", "context": "Here is the context of these images:\n. Broth\n60 Oz Vegetarian Broth\n8 star anise\n1-2 sticks cinnamon\n8 cloves\n1 thumb sized piece of ginger, sliced\n8 cloves of garlic, quartered\n1-2 tablespoons soy sauce\n2 onions, quartered\n3 shallots, halved\n6 cups waterVeggies (just some suggestions)\ncarrots\nbok choy\nbroccoli\ncauliflowerNoodles\n1 package thin rice noodlesPho Fixins\nfried tofu\njalapeno\nlime\nbean sprouts\ncilantro\nbasil\nmint\nSriacha hot chili sauce. - Heat a large pot over medium high heat.\n- Add the garlic, onions, star anise, cloves, ginger to to pot. Stir over heat until it begins to brown.\n- Add the broth, water and soy sauce.\n- Bring to a boil. Reduce heat and simmer for 30 minutes.\n- Strain the broth into a new pot, reheat.. - Wash the cilantro, basil, bean sprouts and mint.\n- Slice the jalapeno.\n- Quarter the lime.\n- Arrange everything on a plate.. Prepare the noodles as described on the package. The one I used were immersed in boiling water for 10-15 seconds, then rinsed.. 
- Add about a cup of noodles to the bottom of each bowl.\n- Pour hot broth over the noodles.\n- Add vegetables (carrots, broccoli, etc), allow to sit for a few minutes.\n- Add tofu.\n- Serve with chopsticks, a spoon, Sriacha hot chili sauce and the plate of fixins.\n- Everyone should add the fixins to their soup as desired, remove the jalapenos when it starts getting too spicy.\nRead the question below and select from the following choices.\nA: Vegetarian Meatloaf\nB: Dice\nC: Make the Broth\nD: Dice", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_163_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_163_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_163_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_163_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_163_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_163_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_163_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_163_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_163_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_163_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_163_10.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Place on Rack\nB: On the Bbq\nC: Baloney Sandwich...\nD: Mouthpeice", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Begin Your Flavor Layering.', '@placeholder', 'Fill Up Rack', 'Plate of Caramelized Teriyaki Bacon']", "context": "Here is the context of these images:\n. Line a baking sheet with foil. Spray your rack with olive oil spray\u00a0 - or canola spray -- \u00a0on both sides of rack. You wil throw away the foil so no need to spray it. If you don't use foil, you will have caramelized sugar forever stuck to your pan !. 
Begin preparing your bacon flavor layering by dipping the bacon in sauce.\u00a0 I used Kikoman's Low Sodium Teriyaki sauce to flavor the bacon.\n.. Ordinary light brown sugar is used in this recipe.\u00a0\nBreak up the lumps, if any, with a whisk or a pastry cutter or a fork. I started with about 1 1/2 cups of brown sugar, but I did add some more later so it is really about 2 cups that you need for this recipe.. Pour some teriyaki sauce in bowl. I started with about 3/4 cup... but I didn't actually measure it.\u00a0 You can always add more to your bowl, if you need it. . Meaty, thick-cut sliced bacon, like this package here, works GREAT.\u00a0. Pull out 6 or 7 slices. Using kitchen shears, cut slices in half. Why ? Because that will make them the correct size to lay on your hamburger or sandwich later on when finished.\u00a0. Assembly line: Teriyaki sauce, brown sugar, then on to rack.. First, dunk each slab of bacon into your teriyaki sauce to coat both sides.\u00a0\u00a0 Coat it well for really good flavor.. Next, dredge the sauce-coated bacon piece in your brow sugar.\u00a0 Flip and coat both sides.\u00a0. Place your sugar-coated bacon pieces on your sprayed rack.\u00a0\u00a0 Be SURE you sprayed your rack with canola oil or some sort of non-stick spray, other wise your bacon may become stuck to it permanently .\u00a0\u00a0. Fill up the rack\u00a0with coated bacon pieces.\u00a0\n(... lots of reflection going on in this photo from the over head light, the foil, \u00a0and the camera flash.. almost looks like the side of my pan has flames going, but it does not.\u00a0)\u00a0. 
Properly wrap and store the rest of your bacon in the refrigerator.\u00a0 We both know that thick bacon is not cheaply priced, so treat the remainder like it is gold and store it properly so you can repeat this recipe next week for your buddies.\u00a0 After you've told them how delicious this is and that YOU made it, you know they will be wanting you to make some for them too.\u00a0\u00a0\u00a0. Baking and caramelizing in the oven. Cook for at least 20 minutes, then take out, turn over, and put back in for another 12 to 15 minutes.\u00a0 YES, this takes a LOT of time. No wonder the burger shoppes charge an arm and a leg $ for this on your burger. LOL. Holy Cow that looks GOOD !!\u00a0 YUMMY !!. Out of the oven. Cooled on rack for a little bit, then transfered to a glass plate that I lightly sprayed with the olive oil cooking spray. Let bacon cool.\nHowever, if it still does not look \"done'' then by all means put it back into the oven until it does get \"done'' .. it should \"crisp up\" as it cools if properly cooked.. 
WOW, does that look GOOD !!\u00a0\u00a0 This is now ready to add to your burger or sandwich, or eaten plain.\u00a0\u00a0 We put our Caramelized Teriyaki Bacon on grilled turkey burgers with pineapple slices and red onions, all on a whole wheat bun.\u00a0 YUMMM-O !!\u00a0\u00a0\nRead the question below and select from the following choices.\nA: Place on Rack\nB: On the Bbq\nC: Baloney Sandwich...\nD: Mouthpeice", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_164_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_164_17.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Roll Each Piece in Your Hands to Form a Ball\nB: Delicious Colombian Arepa\nC: Baking Time\nD: Cheese", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Add the Milk Mixture to the Yeast in the Large Bowl', '@placeholder', '\\u200bPlace the Rolls With Smooth Side 
Facing Up', '20 Minutes']", "context": "Here is the context of these images:\n. First pour 1/4 cup warm water into a large bowl. Empty the yeast package into the bowl of water. Stir until the yeast dissolves (see images) and leave this alone for about 10 minutes so it can get foamy.Note: \"Proofing\" is just a fancy term for dissolving active dry yeast into warm water in order to activate it.. Stir this mixture until the sugar dissolves and until the butter is completely melted (see image). Allow the mixture to completely cool down.Caution: Failure to let this mixture cool down could cause the egg (used in step 4) to scramble or be cooked when added.. . Be sure the egg is fully mixed in (see images).Caution: Be careful not to get any of the egg shell into the mixture when cracking the egg.. Be sure to only mix in 1 cup of flour at a time, until all 4 cups are mixed in. Mix until this forms a soft dough (see images). The dough may not be fully mixed yet. . It is important that the surface is floured well so that the dough doesn't stick to the counter (see image).. Kneading dough involves folding the dough in half towards you and then using the palm of your hands to push down into the dough (see images). Add more flour to the dough if it feels too sticky. The dough should feel soft and elastic when done kneading.. This step is to prevent the dough from sticking to the bowl (see image).. Cover the bowl with a clean kitchen towel or cloth and let the dough rise until it has doubled in size (see images).Troubleshooting: If the dough doesn't rise enough, it will not make light and fluffy rolls, so you may need to restart.. Punch down the dough and follow step 7 on how to knead the dough. Knead for about 4-5 minutes (see images).Caution:The dough should be slightly sticky. Limit the amount of flour on the surface in order to prevent the dough from getting too tough or dry. The rolls won't be as light and fluffy if this happens.. 
This is important to prevent the rolls from sticking to the pan (see image).. First cut the dough into four large pieces with a butter knife (see image). Then cut each piece into four more pieces so that you end up with 16.. This will help you get the dough into the shape of a roll (see image).. This helps the roll have a smoother surface (see image). Repeat this step for all 16 pieces of dough.. Note: I only made 15 rolls as the picture shows (see image), but 16 can be made.. This may take about take 30 - 45 minutes. The reason it is important to let the rolls re-rise is because in step 10, the dough was re-kneaded which pushed all the air out of the dough (see image).. This helps the rolls have a crispy outside after being baked. . The rolls should be golden brown when pulling them out of the oven (see image in next step).. You have just successfully made fluffy and buttery dinner rolls! Whether you choose to eat them with a family holiday meal or just as a snack in your lunch, they will satisfy your craving for delicious rolls! You can eat these rolls with butter or dipping them in your soup. My personal favorite is eating them with honey. 
I'd advise trying every possibility until you find your favorite!\nRead the question below and select from the following choices.\nA: Roll Each Piece in Your Hands to Form a Ball\nB: Delicious Colombian Arepa\nC: Baking Time\nD: Cheese", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_165_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_165_27.jpg"], "output": 
"A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: How to Make Flourless Chocolate Cake\nB: How to Make an Amazing Chocolate Vegan Cake\nC: Decorate the Cake!!!!\nD: Have Fun With Chocolate Part 1", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Difference Between Chocolate', '@placeholder', 'Have Fun With Chocolate Part 2', 'Have Fun With Chocolate Part 3']", "context": "Here is the context of these images:\n. In general, there are two types of chocolate: couverture and compound. Couverture chocolate is made with cocoa mass and cocoa butter, while in compound chocolate, the cocoa butter is substituted with vegetable oil. Compound chocolates are cheap and don't need tempering, however it doesn't taste good, and the vegetable oil inside is also not good for your body, so it's also known as the \"fake chocolate\", some cheap bakery shops will use them as they will lower the expense for the shop.Couverture chocolates are the real chocolates with cocoa butter, they taste really nice and melt in the mouth. However cocoa butter is very sensitive to temperature, if you handled it wrongly it might not set, and will probably melt too fast in room temperature.. Well it's a bit hard to understand for people without any experience. We all know that chocolate will melt at a high temperature and will set at a low temperature. But some chocolate have white oil stripes on it, or taste grainy, or melt so fast that you cannot even touch it with bare hands, or don't shine well, while other good quality chocolates will have a snap when break it, and touch dry to the finger, hold shape better, have perfect shine and melt in the mouth perfectly.That's the difference between non-tempered chocolate and tempered chocolate. We temper chocolate for cake decorations to have a better shape, better taste, better look and longer shelf life. . 
For dark chocolate, milk chocolate and white chocolate, the temperature requirement is different. In general, the higher the milk content, the lower temperature it needs for tempering.There are also a few ways to do it, in commercial kitchen we always use tabling method-- to spread melted chocolate on the marble surface or bench to cool it down. However for home bakers, it's quite messy and needs more tools, so at home I always use seeding method, which is to add cold chocolate bits into melted chocolate. So I have some white cooking chocolates, I'll put some in a glass bowl, and the others I'll chop them into pieces for later use. 1. First melt the chocolate in the glass bowl. You can use a double boiler, but microwave will do the job just fine. Heat for 20 seconds, stir, another 10 seconds, stir, and repeat until it's completely melted. DO NOT OVERHEAT THE CHOCOLATE! It will burn very easily in the microwave and you won't be able to use a burnt chocolate.2. Add chopped chocolate pieces into the melted chocolate gradually, stir everytime until it melts. Check the temperature of the chocolate and keep adding cold chocolate pieces until the temperature drops to 28-29 Celsius Degree (about 83\u00b0F). 3. Now your chocolate is tempered and ready to use! Make sure you work fast, if the chocolates sets before you finish your work, you can reheat it on a double boiler and make sure the temperature doesn't go above 83\u00b0F, otherwise you'll have to temper it again. For milk chocolate, the temperature to work is about 29-30\u00b0C (84-85\u00b0F), and for dark chocolate, it's 30-31\u00b0C (86-87\u00b0F). . This is the easiest way to work with tempered chocolate. Just simply fill the tempered chocolate in any mold you like, scrap the excess and once it's set, you can just use it. There are a lot of cute molds available online and in store, so use your imagination and find your favorite! I used a sea shell mold. 
You can use plastic mold or silicon mold, they both works for tempered chocolate. . You can fill a small piping bag or a paper piping bag with tempered chocolate, and paint whatever you like on a silicon paper or silicon mat. Once the chocolate is set, you can take out the pattern you just painted. Here I showed how to make some snowflakes which will look perfect on a Christmas cupcake. You can definitely pipe anything you like! If you pipe letters, make sure they connects to each other, or it will be impossible to take out the whole design without break apart. . This is another technique that you need to work with chocolate when it's half dry. Most of the fancy decorations are made this way. Here I'm using a chocolate feather as an example.1. Pipe the basic shape of the decoration you want to make on a piece of silicon paper (or acetate).2. Use another sheet of silicon paper to put on top, shape it with thumb and take the top sheet away to make it thin and even (you can also use small spatula). 3. Wait till the surface is touch dry but not completely set yet. This step is crucial as if you didn't wait enough time, the chocolate is still wet, and it will be very messy if you start to work with it. However if the chocolate is completely set, it will crack easily and will be impossible to work with any more. 4. Make some cuts and lines with the tip of a toothpick.5. Pipe a line in the middle as the \"bone\" of the feather. . Here are some more ideas of chocolate decorations with half dry method. For the first cake with three hoops, I applied tempered chocolate on a rectangular shape silicon paper, and make the both ends connect to each other when it's half dry. Then I use chocolate to stick all three circles together to make the design. 
For the second cake with molded chocolate, I also made a chocolate \"coconut shell\" by applying two layers of chocolate: dark chocolate on the bottom, and white chocolate on top with coconut flakes.For the third cake, I used chocolate cigarette and used round cutter to make the design. Just apply tempered chocolate on a piece of silicon paper, and use cookie cutters (or round cutters) to cut out the shape you like.You can also use a comb to scrap half the chocolate off, and you will get chocolate strips afterwards.Sky is the limit, and use your imagination!Hope you will find this instructable useful! If you like it, please vote for me in the \"cake decoration\" contest, thanks a lot!\nRead the question below and select from the following choices.\nA: How to Make Flourless Chocolate Cake\nB: How to Make an Amazing Chocolate Vegan Cake\nC: Decorate the Cake!!!!\nD: Have Fun With Chocolate Part 1", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_166_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_16.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_166_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_166_19.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Bonus\nB: Ice Cream Cake\nC: Ingredients\nD: Banana Ice Cream", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Tools', 'Gingerbread', 'Store', '@placeholder']", "context": "Here is the context of these images:\n. \u00d75 dl (2.1 cups) of cream \u00d7a tin can of condensed milk, 2 dl (0.8 cups) \u00d720 pieces of gingerbread or more! \u00d7A bowl\u00d7Electric mixer \u00d7Spatula \u00d7Bread form\u00d7Freezer\u00d74 hours of time in feeezer. Fill the bowl with 5 dl (2.1 cups) of cream and whisk them with an electric mixer. After give some to your dog! . Add a tin can of condensed milk, about 2 dl (0.8 cups). Mix it around with a spatula. . Crush 20 pieces or more if you like, into the bowl. Mix it with the spatula. . Fill the ice cream into a bread form. Put some gingerbread crumbs on top. Store it in freezer for 4 hours minimum to make it hard ice cream. . After 4 hours of freezer time it's done! . The other day I did the same recepie exchanging gingerbread to blueberries. 
\nRead the question below and select from the following choices.\nA: Bonus\nB: Ice Cream Cake\nC: Ingredients\nD: Banana Ice Cream", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_167_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_167_22.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: How to Preserve Cucumbers\nB: How to Make Sauerkraut\nC: Pack the Jalapenos and Carrots Into the Pickling Jar\nD: Ingredients", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Prep Work Time!', '@placeholder', 'Heat the Pickling Liquid 
and Pour It Over', 'Cooling + Storing']", "context": "Here is the context of these images:\n. Ingredients:6-10 jalape\u00f1o peppers (this will all depend on the size of the jalape\u00f1os - start with six and work your way up!)1 medium carrot1/4 red onion1 cup white vinegar 1 cup water2 cloves garlic, peeled and lightly crushed1-2 tablespoons salt (start with 1 tablespoon, 2 is pretty dang salty.)1-2 teaspoons sugar1/2 teaspoon dried oreganoa few black peppercorns, lightly crushed1 dried bay leafYou can absolutely add more sugar if you like sweeter pickles, but the carrots add a little bit of sweetness so I scaled the sugar down from the original recipe. Tools:32 ounce jar for canninggloves wouldn't be a bad idea either, you will have jalape\u00f1o hands for two days after this without them.. Slice the jalape\u00f1os into rounds as thin as you like, I normally do 1/4 inch. Test the jalape\u00f1os and remove the seeds and membranes if you think they'll be too spicy for you - this is your chance to make it as spicy as you like! You could also throw in a habanero (pierced with a knife) or some serranos to increase the heat. Slice the carrots as well - if you like them softer, go thinner than 1/4 inch. Slice the onion thinly, and crush the garlic cloves and remove the skins. Now you'll want to combine the water, vinegar, oregano, peppercorns, bay leaf, garlic, salt and sugar in a microwave-safe measuring cup. . Wash your jar and hands VERY well with hot water and dry well. You want everything to be as clean as possible.Put as much of the prepped jalapenos, onions and carrots into the jar that you can. Really push it down! . Heat the pickling mix in the microwave for 3-4 minutes, or until it's hot enough to boil.Pour it over the jalape\u00f1os and carrots you've packed in the jar. It might not all fit, but it should be close!IMPORTANT NOTE:This method results in slightly firmer pickles than the original recipe. 
If you like them to be softer, follow the method in the original recipe by heating up the pickling liquid in a pan until it boils, and then add in the jalapenos, carrots and onions and cook it for a minute before taking it off the heat. Then you'll need to pull the veggies out of the liquid, pack them in the jar, and pour the liquid over. . Once the veggies and liquid is in the jar, let it sit out open until it cools down to room temp. Now you'll want to seal the jar and place it in the fridge.The original recipe states that these pickles keep very well in the fridge for a month, but after that they're just not as tasty. Though I have to say we go through one of these 32 ounce jars in about two weeks, so I can't confirm that. We've pretty much been eating these with every meal since I started making them. ;)\nRead the question below and select from the following choices.\nA: How to Preserve Cucumbers\nB: How to Make Sauerkraut\nC: Pack the Jalapenos and Carrots Into the Pickling Jar\nD: Ingredients", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_168_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_168_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_168_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_168_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_168_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_168_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_168_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Tortellini Kale Soup\nB: Add the Golden Goodness\nC: Soups On!\nD: Spice Things Up", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Garlic Saute', 'Add Stocks', '@placeholder']", "context": "Here is the context of these images:\n. 
\n Use ingredients \"off-the-shelf\".\u00a0 Quantities reflect typical units of sale.\u00a0 Standard measurements are included for convenience.\n\t\t(3 tbsp) Olive Oil\n\t\t(1 tbsp) Chopped Garlic\n\t\t50 fl-oz. can of Chicken Broth (~8 cups) \n\t\t15 fl-oz. can of Chopped Stew Tomatoes( ~21/2 cups) \n\t\t10 oz. package of\u00a0Chopped Frozen Spinach(~2 cups) \n\t\t(1 tsp) Basil\n\t\t8 oz. package of Dry Cheese Tortellini(~11/2 cups) \n\t\t(1/4 cup) Grated Parmesan CheeseIngredients #'s 1,2,6 and 8 are only sold in units larger than necessary.. Put the oil#1 in a large soup pot on medium heat.Open the can of Chicken Stock#3, but leave aside for now.Add in the garlic#2 and saute until brown.\u00a0 This will happen fast, so take a precious extra minute to avoid burning the garlic.Quench! the saute by dumping the Chicken Stock#3 into the pot.. Add the stew tomatoes#4.Add the spinach#5.Add the basil#6.\n\t\tBring to a boil.Cover and reduce to simmer for 30 minutes.\u00a0 Avoid boiling off the soup.. Add the tortellini#7.Add the Parmesan cheese#8.Recover and simmer another 20-30 minutes, until the tortellini is finished (rehydrated).. \n Done, but before eating, allow the soup to cool.\u00a0 The spinach and oil conspire to trap super-heated water.\u00a0 A bit of venting is necessary to release the trapped energy.\nEnjoy with hearty bread and rich butter. Skip the sour cream.\nServe extra Parmesan cheese on the side, rather than adding during cooking.\nYou can use more tortellini, but you're risking a stew, in my experience.Reduce the amount of sodium in the soup by using (in order of magnitude):\n- - - - Sodium-free chicken stock,\u00a0 - - - Provolone cheese,\u00a0 \u00a0 - - Natural stew tomatoes,\u00a0 \u00a0 \u00a0 - Fresh or Frozen Tortellini\n\tThis soup does not freeze well, but will refrigerate for a few days. 
The flavors improve a bit after 24 hours.\u00a0 The real genius of this recipe- it's a \"kit\".\u00a0 Stock the ingredients and enjoy fresh when you want.\u00a0\n\tCooking... proactively, what every busy person wants to do!\nRead the question below and select from the following choices.\nA: Tortellini Kale Soup\nB: Add the Golden Goodness\nC: Soups On!\nD: Spice Things Up", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_169_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_169_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_169_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_169_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_169_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_169_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: How to Eat for Free Everyday (at a University)\nB: Casseroles\nC: A Little Off the Top...\nD: How to Open and Enjoy a Bottle of Wine.", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Plan the Types of Meals You Will Make', 'Gather Supplies', 'Lunch Sandwiches', '@placeholder']", "context": "Here is the context of these images:\n. Before getting started cooking I researched the types of foods that would freeze and reheat well. Fortunately, there are lots of options for delicious meals you can make and freeze In advance! Unfortunately, potatoes and cream sauces are not the best opinions, so I chose recipes that weren't cream sauce based and also avoided potatoes; it's probably healthier this way, but that's of little consolation...It's also important to consider the date you'll start preparing your meals and the date you'll need them to last through. Some foods freeze well for several months, while others only do well for about a month or less frozen. 
I tried to stick with meals that would hold up well in the freezer for 3 months.Overall, the meals I prepped fit into the following categories:- Grab and Go Breakfasts- Lunch Sandwiches- Stews - Casseroles- Slow Cooker Meals- Family Favorites (meatballs, chili, Salisbury steak). You'll need:(1) Containers to freeze foods in. I used a combination of: - Glass Storage Containers- Canning Jars- Disposable Aluminum Pans- Tin Foil- Plastic Wrap- Plastic Freezer Bags* Lots of people freeze their foods directly in plastic freezer bags. Whenever possible I avoid storing my food directly in plastic, opting instead to use it primarily as an outer layer of extra freezer protection.* Also consider the portion sizes you'll want to store your food in when deciding what types of storage containers will work best for you! (2) Recipes!! Gather up your personal favorites, or search online for some exciting new options!! (3) Groceries - For me, I've been gradually adding to my freezer stash for the past five weeks. This has been the right pace for me. You might be crazy ambitious and want to do all your cooking and freezer prep work in one massive batch, but that wasn't my approach. Before you clear out the grocery store, be realistic about when you'll do your cooking and only buy supplies for the meals you'll prep right away. You won't save any money letting a massive stockpile of food slowly go bad in the fridge.. 30 Breakfast Meals in No Time!!I focused mainly on breakfast sandwiches and smoothies. Pancakes and waffles are also great choices to make in advance and freeze. I would make a huge batch of pancakes if I had any freezer space remaining! **BREAKFAST SANDWICHES ABD BURRITOS**- Baked a big pan of eggs. I used a dozen eggs whisked with a cup of milk and baked at 375F for about 30 minutes. - While my eggs baked, I cooked my meats (bacon, sausage). 
- Set up an assembly line of supplies: aluminum foil squares for sandwiches, English muffin on each square (or flour tortilla for the burritos), get your cheese out and ready...- When the eggs were done, I used an upside down drinking glass about the same size as my English Muffins to cut egg circles. (Hint: the extra eggs after cutting out circles work perfectly in breakfast burritos!!)- Placed an egg circle on each muffin, added meat and cheese and muffin top- Wrapped muffin in aluminum foil- Labeled a plastic freezer bag and placed 4 individually wrapped muffins in each bag- Froze bags of muffins/burritosWhen you're ready to eat a sandwich, simply take one out and heat it in your microwave for ~2 min, or toaster oven / oven for ~15 min.Through this process, I made...8 - Bacon, Egg & Cheese English muffins 3 - Ham, Egg & Cheese Burritos7 - Sausage, Egg & Cheese burritos4 - Ham Egg and Cheese English Muffins2 - Egg and Cheese English muffins**BREAKFAST SMOOTHIES**Did you know you can make smoothies in advance and freeze them?! I did not know this, but it's true! They're good and you can make lots of them at once and clean your blender out one time vs. daily!! Woo hoo!!I made 6 smoothies and stored them in glass containers. You could make your smoothies with whatever you want, here are the ingredients I used: yogurt, milk, apple juice, kale, chia seeds, apple, banana, blueberry. The trick to freezer smoothies is to make them, pour into your storage containers and then put them in the fridge for at least 4 hours BEFORE putting in your freezer.You do need to thaw a smoothie slightly before you eat it. You can move one to the fridge from the freezer the night before or thaw in cold water if you need it sooner!I made 6 smoothies through this process.. Hot sandwiches are the perfect addition to a freezer stockpile! And they're something you can eat with one hand and are easy to eat on the go!Really you should customize these to your liking. 
I kept them very simple, just meat and cheese on a bun. We can add condiments when we heat them up.The process is very similar to the breakfast sandwiches. Spread any condiments you want onto your bun (or don't, and add these when you're ready to eat), add your meat, add your cheese and wrap the sandwich in its foil. Label the outside of the foil with the sandwich contents and also label a plastic freezer bag. I bagged my sandwiches 4 to a bag. Then freeze -that's it!When you're ready to eat, microwave for ~2 minutes, or place in the toaster oven or oven for ~12-15 minutes and you'll have a delicious hot sandwich!I followed this process to make:10 - ham & cheddar sandwiches6 - roast beef & provolone sandwiches. Stews freeze exceptionally well and really retain their flavor when reheated. I made a few different options from recipes I found online that noted they freeze well. If you have a favorite stew recipe, make a great big batch of it and divide it out into containers portioned for what your family eats in a meal; freeze it and reheat it when you're your ready for an easy no effort home cooked meal!I followed the following recipes to make my freezer meals:- Made 3 freezer packages each with 2 servings of Sunday Stew (recipe from The Pioneer Woman)http://thepioneerwoman.com/cooking/sunday-night-stew/- Made 4 freezer packages with 1 serving each of Hungarian Meatball Stew (recipe from Rachel Ray)http://www.rachaelraymag.com/Recipes/rachael-ray-magazine-recipe-search/dinner-recipes/hungarian-meatball-stew- Made 2 freezer packages with 3 servings each of Braised Beef with Sweet Potatoes (recipe from Real Simple)http://www.realsimple.com/food-recipes/browse-all-recipes/spiced-braised-beef-sweet-potatoes. It wouldn't be freezer cooking without a couple casseroles! Casseroles are classic freezer staples because they freeze and reheat really well. 
Make a couple pans of you favorite lasagna recipe or pretty much any other pasta dish and stash them away in your freezer for a low effort meal!I made 2 trays of Lasagna Primavera, each with about 8 servings. These will be perfect to pop in the oven for visitors! I used the following recipe from Martha Stewart. I haven't tried this yet, but it looked and smelled delicious while prepping!http://www.marthastewart.com/340876/freeze-ahead-lasagna-primaveraI also made 4 containers with about 3-4 servings per container of Baked Ziti, using our favorite recipe from The Pioneer Woman:http://thepioneerwoman.com/cooking/baked-ziti/. For fresh cooked food straight from your freezer, slow cooker meals are an excellent option. Basically, you package together the ingredients of your slow cooker recipe and freeze them. The night before you want to prepare your easy home cooked meal, defrost them in your fridge. In the morning dump the ingredients into your slow cooker and cook! The benefit is you have all the prep work done (ingredients gathered, vegetables and meats prepped, spices added, etc) and all you have to do is let your slow cooker do the rest of the work! These meals come out tasting like you slaved away in the kitchen all day! These meals are a great option for when you'll have visitors over or when you just want an extra special dinner!I have the following slow cooker meals prepped and ready for a fresh from the freezer meal:Crock Pot Mongolian beef (approximately 4 servings)http://whoneedsacape.com/2012/11/easy-crockpot-mongolian-beef/Honey Teriyaki Chicken (approximately 6 servings)http://www.twindragonflydesigns.com/crock-pot-freeSalsa Chicken (approximately 6 servings)http://www.twindragonflydesigns.com/crock-pot-freeCrockpot beef vegetable soup (approximately 6 servings)Spiced Braised Beef with Sweet Potatoes (approximately 6 servings)http://www.realsimple.com/food-recipes/browse-all-recipes/spiced-braised-beef-sweet-potatoes. 
I also wanted to be sure we had plenty of small portions of our family staples frozen and ready to be reheated. Think about your standby meals and how they'll hold up when frozen and reheated to customize these options with what will work best for you. I made:Chili: I made a big pot and divvied it into 6 freezer containers with about 2 servings eachMeatballs: I filled 5 freezer containers each packed with 2-3 servingsChicken Pot Pies in Pocket form (for easy eating on the go): I made 11 pocketsSloppy Joes: I made 3 containers each with 5-6 servingsSalisbury Steaks: I made 6 \"steaks\" and packaged in 3 freezer packages of 2 steaks each. Finally, don't forget to stock up on side dishes and anything else you may need to accompany your freezer meals! I added several packages of buns to my freezer supply (so we have something to put all that sloppy joe meat on). I also bought some loaves of frozen garlic bread to go with our Baked Ziti and Lasagna Primavera. In addition, I made sure we have several bags of frozen vegetables to use as sides. I also made sure our pantry supply includes enough pasta for all those yummy meatballs, and other grain choices like Quinoa, rice and couscous.. * Package your freezer meals leaving as little air as possible in the container. * Remember water expands when frozen!! This is especially important when freezing meals in glass!! So when freezing a meal in glass containers, leave head room of about 1\" in the jar before freezing, this will ensure you don't open your freezer to see a cracked glass and a meal you worked on destroyed...* Never put hot meals straight into the freezer (again, super critical when freezing in glass, and a good best practice no matter what type of container you are freezing in).* Always label your meals before freezing so you know what everything is. 
Include instructions on reheating, suggestions for side dishes or any ingredients to add when cooking or reheating.* Make double or triple whatever you're cooking for dinner and freeze the rest for an easy way to build up your freezer stockpile!!* Look for sales on meats or vegetables you eat regularly and stock up then! Use these opportunities to prepare several portions of your favorites!* Keep notes on the recipes you used. As you heat your meals and serve them, add to your notes your own rating of the meal and any adjustments you'd incorporate if you prepared that meal again. This way, over time, you'll develop your own twists on recipes and a staple supply of your own freezer meal personal favorites!I hope this Instructable gave you some ideas for how you might stock up your own freezer in advance of a big event!! It was such an easy process, this just might become my new approach to cooking all the time!!\nRead the question below and select from the following choices.\nA: How to Eat for Free Everyday (at a University)\nB: Casseroles\nC: A Little Off the Top...\nD: How to Open and Enjoy a Bottle of Wine.", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_170_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_170_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_170_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_170_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_170_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_170_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_170_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_170_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_170_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_170_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_170_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_170_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_170_12.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_170_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_170_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_170_15.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Summer Fruit Iced Tea\nB: Top Layer\nC: Peel and Cut the Mango\nD: Top Layer", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Peel and Cut the Watermelon', 'Peel and Cut the Cucumber', 'Remove Kiwi Skin and Cut the Kiwi', '@placeholder']", "context": "Here is the context of these images:\n. You will need the following materials/ingredients:-1 2 liter bottle of Sprite- 1/2 mini watermelon- 1 large mango- 1 cucumber- 2 kiwis- Tajin chili powder- Pitcher- Knife- Cutting board- Fruit peeler- Spoon. . Place your Sprite in the freezer until right before it starts to freeze. You do not want to add ice to your drink because it will water down the flavor.. Using a fruit peeler, remove the entire peel from the cucumber and remove about 1/2 inch from each end. Cut the cucumber into several long strips. Then, cut the strips into small cubes. Place the diced cucumber into a pitcher.. The easiest way to dice the kiwis are to first cut the kiwis in half. Then, using a spoon, scoop the kiwi fruit out of it's peel. After scooping the kiwi out, dice the kiwis into equal sized pieces. Finally, place all of the diced pieces into the pitcher. . Carefully peel the skin off the mango using a fruit peeler. Dice the mango as shown on the pictures. For presentation purposes only, try to dice the mango into even pieces. Then, placed the dices mango into the pitcher. . Season the diced fruit with Tajin chili powder.. Pour the entire 2-liter bottle of Sprite into the pitcher with the diced fruit.. 
Grab a spoon and enjoy a refreshing and delicious summer treat!\nRead the question below and select from the following choices.\nA: Summer Fruit Iced Tea\nB: Top Layer\nC: Peel and Cut the Mango\nD: Top Layer", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_171_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_27.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_171_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_29.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_30.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_31.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_32.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_33.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_34.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_35.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_171_36.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Add Flour Mixture\nB: Sweet Delicious Banana Bread\nC: Final Product\nD: Easy Homeade Banana Bread", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Coat Loaf Pan', '@placeholder', 'Bake', 'Cool']", "context": "Here is the context of these images:\n. Coat a 9-by-5-inch loaf pan with butter and dust it with flour, tapping out the excess.. Whisk together the measured flour, baking powder, salt, baking soda, and cinnamon in a large bowl to aerate and break up any lumps. Set aside.. Place the sugar, eggs, oil, and vanilla in the bowl of a stand mixer fitted with a paddle attachment and beat on medium speed until thoroughly combined, about 2 minutes.. Add the bananas and sour cream and mix until just combined.. Scrape down the sides of the bowl, add the flour mixture, and mix until just combined. Turn the batter into the prepared loaf pan.. Preheat oven to 350\u00b0F. Bake until a toothpick inserted in the center comes out clean, the top is golden brown, and the bread is pulling away from the sides of the pan, about 50-60 minutes. . Transfer to a wire rack to cool for 10 minutes. 
Slide a knife around the perimeter of the pan, invert to release the bread, and cool completely on the wire rack before serving.\nRead the question below and select from the following choices.\nA: Add Flour Mixture\nB: Sweet Delicious Banana Bread\nC: Final Product\nD: Easy Homeade Banana Bread", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_172_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_172_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_172_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_172_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_172_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_172_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_172_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_172_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Add Corn Syrup or Sugar\nB: Chewy Chocolate Chip and Walnut Cookies\nC: Bake the Cookies\nD: Scooping and Baking!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Gather Your Supplies', '@placeholder', 'Add More Ingredients', 'Bake Until Golden']", "context": "Here is the context of these images:\n. You're going to need a little more stuff than you normally would to achieve freshly-baked chocolate chip cookie perfection. Why is that? Chewy cookies hold more moisture than crispy cookies, so therefore we need to use ingredients that hold on to more moisture than your average cookie recipe will have you use. Ingredients like corn starch and corn syrup hold on to more water when they bake, whereas more traditional ingredients bake-up crispier (you can also make cookies crispier by baking them longer and at a lower temperature.)So what's the secret to that long-lasting \"freshly-baked\" texture that so many store-bought cookies seem to have? You need to mix it up. 
You will need to make two batches of dough. One traditional batch with \"crispy\" ingredients, and one batch that substitutes the traditional ingredients for \"chewy\" ingredients. This is a trick that can be used with any recipe. The specifics of our recipe can be found on page 282 of Cooking for Geeks.Here's what you'll needButterBrown sugarWhite sugarCorn starchCorn syrupEggFlourOatmealSaltVanillaChocolate ChipsLemon juice. Using two separate bowls (one \"chewy\" and one \"crispy\"), cobble-together your ingredients. Add the butter and brown sugar per the recipe you are using to each bowl. This is where the batches will differ. Next you will add the white sugar to the traditional, \"crispy\" bowl and mix To the \"chewy\" bowl, you will add corn syrup and mix. Next, you will add the following ingredients to both batches and mix:vanilla lemon juice egg oatmeal flour salt. Add your cornstarch to the \"chewy\" batch and mix.. Add the rest of your ingredientschocolate chips walnuts (optional). To create your fresh-baked cookies that will stay as freshly-baked as they were out of the oven, here is how you put them together. Scoop a bit of \"crispy\" dough onto your parchment-lined baking tray Now take a slightly smaller scoop of \"chewy\" dough and press it into the middle of the \"crispy\" dough. Bake your cookies until they reach golden cookie perfection. Your cookies will be crispy on the outside and chewy in the center, and they should stay that way!. For the specifics on this and other scientifically-minded cookery, check out my book, Cooking for Geeks! This recipe appears on page 282 of the book. You can try it for yourself by clicking here and reading two chapters for free!If you liked this Instructable, you can like and subscribe to my YouTube channel. 
Cooking for Geeks is available on Amazon\nRead the question below and select from the following choices.\nA: Add Corn Syrup or Sugar\nB: Chewy Chocolate Chip and Walnut Cookies\nC: Bake the Cookies\nD: Scooping and Baking!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_173_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_173_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_173_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_173_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_173_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_173_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_173_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_173_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_173_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_173_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_173_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_173_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_173_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_173_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_173_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_173_15.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Vegan Rainbow Cupcakes\nB: Haunted Grave Cupcakes\nC: Add the Decorations\nD: Veganir Butterycreamy", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Bakus Cakus!', 'Colouria Batterun', '@placeholder', 'Finitus Delicious']", "context": "Here is the context of these images:\n. 
IngredientsCake 250ml soy/vegan milk 1tsp cider vinegar 350g self raising flour 2 & 1/2 tbsp Corn flour 2 & 1/2 tbsp Icing sugar 80ml vegetable oil 180g Caster sugar Red/Blue/Green/Yellow food colouringButtercream 100g Veggie Shortening 100g Vegan butter 500g Icing sugar Any food colouring that isn't Red/Blue/Green/yellow. *WARNING* Images used may contain magical apparatusStart by pre-heating your oven to 175\u00baC/350\u00baF/gas 4 Get 250ml of soy/vegan milk and add a tsp of cider vinegar to it. Leave this to one side. In a bowl sieve in 350g of self raising flour, add 2& 1/2 tbsp of corn flour and 2 & 1/2 tbsp icing sugar. Mix together.. Add to it 180g of caster sugar and mix together. Add in your milk/vinegar mix and mix again. Pour 80ml of rapeseed or vegetable oil. You guessed it, mix again. Add 2 tsp vanilla extract or any other flavour you wish. Take out 4x50-60g of the mixture and put into their own bowls. Colour them the house colours (Red, Blue, Green & Yellow). Take an ice-cream scoop and scoop the plain mixture into your cases, just under one scoop should be enough for each case. Take a teaspoon of any colour and put into the centre of the mixture by cork-screwing the teaspoon around in the middle. This will give the core the hidden house colour.Put into your oven for 10-15 minutes or until a knife comes out clean. I like to put tinfoil on the top to stop Hornwoggles stealing my dreams. But also, it makes them rise more evenly. Take the tinfoil off for the last couple of minutes baking to let the top solidify. Once ready, take out and leave to cool on a wire rack.. Now to make your buttercream. Its vegan buttercream, so that works well with the cakes. Take 100g of vegan butter and 100g of vegetable shortening and mix this in a bowl with 500g of icing to make it a good stiff consistency. Add Vanilla extract or any other flavour- for... flavour. Add some colouring to make it a colour, any colour you want as long as it isn't one of the house colours.. 
Take 200g of fondant icing and colour it brown to match the sorting hat. You can also use the brown food colouring and a paintbrush to add some texture to the hats. To sculpt the hat, take a small piece around the size of a chick pea, and turn it into a flat disc. It doesn't have to be perfectly round, as the Sorting Hat is a wise, old hat who has seen many things. Take another piece and roll it into a ball. The size can vary, but as long as the ball sits in the middle of the disc you just made with enough room around the outside for a brim. Roll one part of the ball up so it becomes tear drop shaped. Sculpt the eyes and mouth just above half way up, and add some indents to the tear drop. Push the whole thing down gently into the middle of the disc, and fold over the pointy end of the tear drop so the hat looks like its folded over.. Spoon the buttercream into a piping bag and pipe onto the cupcakes. Place the hats on top of each cupcake. When you bite into them, you will find out which house you have been sorted into. Make a game of it, or eat them all!Let me know what houses you get. 
Watch the video for my first Muggle Baking class and another way to learn.See you soon Colin\nRead the question below and select from the following choices.\nA: Vegan Rainbow Cupcakes\nB: Haunted Grave Cupcakes\nC: Add the Decorations\nD: Veganir Butterycreamy", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_174_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_174_26.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": 
"textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The Slices of the Lamb\nB: Coconut Curry Lamb\nC: Its Ready?\nD: Pack", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Spices and Sauce', '@placeholder', 'Tay', 'Ring the Dinner Bell']", "context": "Here is the context of these images:\n. SpicesGinger - about a 2\" piece, skin removedGarlic - 5 cloves or more to suit youBlack Cumin - 1 teaspoon (tsp)Cumin - 1 tablespoon (TB)Coriander - 2 tspCardamom - 1/2 tspBlack Pepper - 1 tspRed Pepper - 1 tsp to 1 TB depending on how hot you want it.Turmeric - 2 tspFenugreek Leaves - 1 TBAll of these spices can be adjusted to whatever you feel like. Add different ones, leave out some, make it your own.SauceSkin the onions and cut them into quarters and toss them into a blender. Add 1 cup of yogurt and dump in all of your spices. If the mixture has trouble getting ground up in the blender just add a little water or milk to get it going. NotesSome of my spices I just left whole. I knew this particular blender would pulverize them and make a smooth blend of it all.. I used lamb cuts from the leg. I rarely cook a whole leg of lamb so I had the butchers cut the legs into 3/4\" steaks. I used about 2.5 pounds. Cut the meat up in to 1 to 2 inch pieces and toss them into your cooking vessel with a tablespoon of salt. . Add the sauce from the blender to the meat in the pot and add a cup of cream. Mix all of that together and turn on the heat. Bring it up to a simmer and then cover and cook covered on low to medium-low for 1 1/2 hours. Give it a stir occasionally to make sure nothing is sticking to the bottom.. While your meat is cooking away on the stovetop grab some red or gold potatoes (1 1/2 to 2 pounds total) and cut them into quarters. I like the smaller potatoes for this. 
Peel them if you wish, I don't.When you reach the end of the 1 1/2 hours of cooking add the potatoes and cook for another 30 to 45 minutes until the potatoes are tender.. While your curry is cooking away and making your house smell like an Indian street bazaar you should probably make basmati rice and dice up some cilantro to top your dish. Store bought naan is a lovely thing to keep around too.. Serve it up on the rice with some hot, buttered naan and sprinkle with cilantro. It doesn't get much better. Tender chunks of lamb in a thick and flavorful sauce.For those that are weird about eating lamb you can substitute whatever you like. If you use chicken though you'll want to cook the sauce separately for a while to reduce down and mix the flavors and then add your chicken later so you don't end up with chewy hockey pucks in your curry.\nRead the question below and select from the following choices.\nA: The Slices of the Lamb\nB: Coconut Curry Lamb\nC: Its Ready?\nD: Pack", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_175_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_15.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_175_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_175_20.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Ingredients\nB: Halloween Cake\nC: Yummy Ready\nD: Almost Ready", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Batter Up!', 'Prepare Your Frosting', 'Go to Town!']", "context": "Here is the context of these images:\n. ToolsYou'll want some basic cupcake making devices for this endeavor:Cupcake panCupcake linersHand or stand mixerRubber spatulaSpoonsMeasuring spoons and cupsLag screws or marbles - something like that.\u00a0 You know, to make the skull shapes.Best Cupcake Recipe EVAROf course you can use a box mix to speed up this process, or your own stand-by recipe for perfect cupcakes.\u00a0 But if you're looking for something new, look no further.\u00a0 With a whole new technique and tangy taste, these vanilla cupcakes will blow your socks off!3 cups (360g) of flour1 cup (2 sticks / 227g) of salted butter - room temperature2 cups (110g) of sugar1 TBSP +1 tsp (20mL) baking powder4 large eggs1/2 cup (4 oz / 237mL) whole milk1/2 cup (4 oz\u00a0 / 237 mL)\u00a0of plain yogurt2 tsp. 
(10mL) vanilla extractCan't Believe It's So Good and So Simple Buttercream Frosting RecipeThat's about all I\u00a0need to say about it.\u00a0 Only four ingredients and five minutes stand between you and frosting you'll actually want to eat with a spoon (this coming from someone who scrapes off the frosting to get to the good stuff!)1 cup (2 sticks / 227 g) of unsalted butter, room temperature4 cups(1 lb / 448 g)\u00a0 of powdered sugar1/2 cup (4 oz / 237mL) of milk2 tsp (10mL) of vanilla extractOpt.\u00a0 Chocolate chipsOpt. Food ColoringTasty ToppingsThis can be left up to your imagination and will depend on what devilish devices you have planned!I used M&Ms to create eyes, nostrils and beaks, Junior Mints for skully eye sockets, mini Oreos for owly eyes and red hots for special zing where needed.\u00a0 Get creative with what you can find!. \n Let's make some cupcakes!\u00a0 Now pay attention, because this isn't going to go down like you expect.First, place cupcake liners in the pan. \u00a0 Okay, you probably expected that.Now, preheat your oven to 500oF (260oC)What??\u00a0Scooch have you lost your mind?\u00a0 See, I\u00a0know what you're thinking.\u00a0 But no, I have not lost my mind!\u00a0 This tip comes to me from a.nony.mouse via Cupcake Cones:if you want the crowned tops to be as high as possible, pre-heat the oven to 500 degrees F instead, and as soon as you put the cupcakes in there, lower the temperature to 350 and cook, it gets the crown nice and high! (it's the trick they use at bakeries on those giant cupcakes and muffins you always wished you could re-create!)Now for some mixed-up mixing!You don't need to sift the flour if you measured it properly like I\u00a0taught you!\u00a0 Stir in sugar and baking powder to combine well.\u00a0Chop up your butter into small pieces and add to the flour mixture.\u00a0Say what?\u00a0 That's right. 
Add your butter to your dry ingredients.\u00a0 Now blend, blend, blend until it resembles the texture of breadcrumbs.\u00a0In a separate bowl, blend together eggs, milk, yogurt and vanilla extract. You have a whisk, you say?\u00a0 Even better!\u00a0 I\u00a0just used my regular mixer beaters.\u00a0 Get it nice and frothy.Add this to the bowl with the dry ingredients and beat until just combined.\u00a0Try to abstain from licking the beaters.\u00a0 I can't condone eating uncooked eggs.\u00a0 But if you slip and get some in your mouth by mistake - YUM!!\u00a0 You'll notice right away what a unique tang the plain yogurt adds.\u00a0 This flavor will mellow out in the baking process, but I kind of wish it didn't!\n . \n As you know, you need bolts to make good skulls.Well, skull-shaped cupcakes.Not just bolts, I guess. Could be lag screws like I had on hand.\u00a0 Could be marbles as I've seen used before.\u00a0 You know, whatever you got that isn't going to melt or smell funny while baking.For Skull-shaped cupcakes, drop your implements into the cupcake pans on the OUTSIDE\u00a0of the cupcake papers where you want the jaws to form.\u00a0For non-skull-shaped cakelets, fill the cups 2/3 full, as is.Place cupcakes in preheated oven and immediately lower the temperatureto 350oF (180oC)Bake for 15-20 minutes, until the tops spring back when you poke 'em.. 
This technique for the most amazing buttercream recipe is going to require both your faith in me and some patience.\u00a0Can we agree to this?\u00a0 Prepare to be filled with disbelief before you are filled with awe.\u00a0 Let's go.You know by now that cutting your room-temperature butter into chunks is essential to successful baking, so I\u00a0won't even condescend to mention it to you.Add your butter (1 cup, remember?)\u00a0 to your powdered sugar (AKA confectioner's sugar, AKA icing sugar, AKA\u00a010x sugar - never really understood that last one) (4 cups / 1 lb / one whole entire box, yes).\u00a0Stir to fold the butter and sugar together.\u00a0 This is only a preventative step to keep from creating an enormous dust storm when you introduce the mixer.\u00a0 Not sure if it will work for you.\u00a0 Never does for me, but heck, give it a go and let me know how it goes.Mix on low speed until well blended and then increase speed to medium and beat for another 3 minutes.This is the point where your faith will be tested.\u00a0 At some point during minute two or three, you'll be ready to throw in the towel and crack open your can of Duncan Heinz.\u00a0 DON'T\u00a0DO\u00a0IT.\u00a0 Nothing can compare to the magic you are about to make when you just keep the mixer on for a moment more. Once these two ingredients have been magically transformed into something that actually resembles frosting instead of the dusty, sticky mess is did two seconds ago, it's time to add the rest.Add milk (1/2 c)\u00a0and vanilla (2 tsp)\u00a0and continue to blend on medium speed for one minute more. \u00a0To make chocolate frosting:Melt chocolate chips in a bowl - 14 oz to turn the full batch into chocolatey goodness, less if you're divvying it up.Once the chips have cooled, stir into prepared buttercream frosting.\u00a0 Adjust to taste!To make colored frosting:Add food coloring to suit your needs.\u00a0 I didn't really need to make a whole step about that, did I.. 
\n On your cupcakes, I\u00a0mean. Not literally.\u00a0 Unless you need to pick something up from the market to finish them.\u00a0 That's annoying when that happens, isn't it?\u00a0 You end up using lag screws in your baking just to avoid another trip to the store!For Skullcakes:Junior MInts make great, quirky eye sockets, halved M&Ms stand in for nostrils, and sliced chocolate chips define teeth.\u00a0\u00a0For Owlcakes:Halved mini-Oreos make great eyes with M&M pupils.\u00a0 M's also make adorable beaks.\u00a0 Pipe extra chocolate frosting into expressive furrows.\u00a0\u00a0\u00a0For Mummies:Red hots are great for spooky red eyes (M&Ms provide other colors), and white and chocolate buttercream frosting do the rest!\u00a0For Braaaaiiiiinnnnnns:Mix red and green food coloring into the white butercream frosting.\u00a0 Pipe zig-zag lines through the cut tip of a plastic baggie or piping bag.\u00a0 If you have it, I\u00a0imagine a flat icing tip would do wonders here.\u00a0Make sure to post pictures of your own fiendish creations.\u00a0 Enjoy!\nRead the question below and select from the following choices.\nA: Ingredients\nB: Halloween Cake\nC: Yummy Ready\nD: Almost Ready", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_176_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_11.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_176_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_176_18.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Douwe Egbert Coffee Maker Hack\nB: Drink the Micky's\nC: Drill a Pilot Hole\nD: Get Your Cat and Start Building!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Drill Holes for the Test Tubes', 'Stain or Seal', 'Make Some Coffee!']", "context": "Here is the context of these images:\n. 3/8\u201d Iron Pipe I used 3/8\" pipes and fittings from the plumbing department at my local Home Depot. The straight pipes, T-fittings, 90 degree elbows and couplings are all standard. The nuts are standard as well, but I've been to a few Home Depots that don't have them in stock. If you have a hard time finding the nuts, you can always order them from McMasters. I used the following 3/8\" diameter pipes and fittings: 1 T-Fitting 1 90\u02daElbow 1 2\" Pipe 1 5.5\" Pipe 1 1.5\" Pipe 1 Coupling 2 LocknutsFunnel I used a 100mm glass funnel to hold the coffee filters.2x6 I used a short piece of scrap 2x6 to make the base. A piece of 2x8 or 2x10 would also work.Chemex Coffee Filters Chemex coffee filters work well. If you get ones that are too big, it's easy to cut them down with scissors.RYOBI 18 Volt Cordless DrillRYOBI 18 Volt Circular SawRYOBI Orbital Sander. I cut a 16\" long section of 2x6 to use as the base.. I drilled a 1/16\" pilot hole to serve as a guide for the larger holes I will drill to recess the pipes.. 
I used a 1 1/2\" diameter drill bit to drill a hole a little less than halfway through the 2x6. Then I centered the bit on the pre-drilled pilot hole. This hole will accommodate the lock nut and pipe end while allowing the 2x6 to sit flush on a table or shelf.. Flip the board over and drill a 3/4\" diameter hole centered on the pilot hole. This hole should go all the way through the board.. Select a drill bit that has a slightly larger diameter than the test tubes. You don\u2019t want a fit that is too tight or you might break the test tubes. I used a piece of blue painter's tape to mark 1\" from the tip of the drill bit. When I drilled the holes, I used this tape as a marker for knowing when I had drilled deep enough.. I used an orbital sander to sand the 2x6. I started with 100 grit sandpaper to round down the edges and then finished with 220 grit.. I used Danish oil to finish the 2x6. Bioshield or an acrylic finish would have also worked and would offer a bit more protection.. I used the 2 locknuts and the 1.5\" long piece of pipe to clamp on to the 2x6. The 3/8\" diameter pipe fits through the 3/4\" diameter hole and then a locknut on each side secures the pipe to the wood. The large 1 1/2\" diameter hole on the underside of the board hides the bottom locknut.. Insert the glass funnel into the top T-fitting and you're ready to add a filter, coffee grounds and hot water! I used Chemex filters, but accidentally bought the large ones, so I had to use scissors to cut them down to size.. Good luck making your own pipe coffee maker and please email or tweet photos to @benuyeda or ben@homemade-modern.com. 
For more DIY ideas and projects, visit us at HomeMade Modern.\nRead the question below and select from the following choices.\nA: Douwe Egbert Coffee Maker Hack\nB: Drink the Micky's\nC: Drill a Pilot Hole\nD: Get Your Cat and Start Building!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_177_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_177_16.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Prepping Your Meat\nB: How to Make an Arnold Palmer Mocktail\nC: Eating the Core\nD: Almost Finished...", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['The Smoker', 'The Slab', '@placeholder', 'Smoking (finally!)']", "context": "Here is the context of these images:\n. Smoking is a unique method of cooking since it uses such low temperatures. 
Most ways of cooking meat focus on developing flavors through heavy use of the Maillard Reaction and Caramelization (don't worry too much about what that first one is it's kind of complex, but if you do want to look into it, it's actually really interesting). They do this by heating the target food to high temperatures for relatively short periods of time. These methods work great, but there's more to cooking than just slightly burning things, and you can't say you've lived until you've experienced that first hand. The key to smoking meat is maintaining a lower temperature -- usually around 200 - 300 Fahrenheit or 93 - 150 Celsius -- for long periods of time (and I do mean long). This will help to break down connective tissues in the meat (namely collagen) and render fats. The collagen is very important to meat because it becomes gelatin when it's broken down; gelatin gives meat a nice tender texture and fuller flavor. But, because of the length of time it takes and the volume of dry air that passes by the meat, it can become very dry, so the fat needs to render into the meat to keep it moist. Of course the big difference between roasting and smoking is smoke.Smoke is a combination of gasses, liquids, and very fine particles. Most smoke you see is very white and puffy; this is because of the amount of large ash particles. Ash is not something you want in your food. When you're smoking meat, you usually want to barely be able to see the smoke. This means there's a relatively low amount of ash and that the particles in the smoke are very fine. The best smokes are pale blue in color, so try and stay away from those towering pillars you might imagine coming out of the smoker. Ah, yes the smoker, well, that's a whole different issue.... 
At this point you might be saying to yourself \"Gee whiz, this whole smoking business sure does sound swell, but, shucks, I can't afford to go out and buy one of those newfangled contraptions to cook it!\" This is a common sentiment among would-be pitmasters, and it's one that has undoubtedly finished many great careers before they've even had a chance to begin. The real shame is that you don't need a smoker to smoke. All you need is a good sized grill with enough space between the grates and the bottom of the grill. If you don't have a good grill and you're on a tight budget, you can get away with just a kettle grill like a Weber. But smokers are designed with smoking in mind, so if you can afford one, I suggest you buy one.If you're in the market for a smoker, you'll find there are three main kinds: electric, gas, and wood. At the end of the day, real barbecue comes from a wood smoker, but they can be kind of fickle. I don't know much about electric smokers, but I've used gas smokers a few times when I'm visiting my father. All I can say about them is to save your money. Every time I've used a gas smoker, it can't reliably get cold enough to properly smoke meat. I'm not going to come out and say that gas smokers are all evil, I'm just going to advise you to not buy one. I'll talk a bit more about wood smokers since all that talk of gas smokers has left a bad taste in my mouth.As far as wood smokers go, you'll find there are three kinds: cabinets, horizontal drums, and vertical drums. I personally use a humble horizontal drum with an offset smoke box -- I like having the wood and the meat in different areas; it just makes my brain happy to have things organized like that -- but the vertical drum smokers are very popular. I won't go into too much detail here, because I'm sure it's not why most of you are reading this. This is a great page detailing the choices that are out there. Now that we have that out of the way, we can get down to the meat of this problem.. 
As I said in the first section, the two key elements in smoked meat are collagen and fat. These are the things you need to keep in mind when you're buying a slab of meat. You'll typically get a large slab of meat like a pork butt which is used for pulled pork. I also said in the first section that smoking and roasting are very similar, so anything labeled \"roast\" will work as well. You want to see good marbling in the meat because you need fat to keep it tender. Ribs are always a good choice, of course, but be aware of what kind of ribs you're getting. If you like to know more about ribs, you can go here and read about them, but I'm going to just breeze over them for now. All in all, just look for something that looks tasty and is pretty cheap. Barbecue isn't about expensive meats and fancy cooking, after all.I've included some pictures of good and bad choices for meats I saw while shopping at Walmart you can look at for reference. I must apologize, though; since I wasn't planning on taking the pictures and just used my phone and they are kind of hard to see. I'm going to be showing you pictures of some pork ribs as I talk more about the process of smoking, since that's what I've cooked most recently.. In the world of smoking, there's actually a lot of debate about wood. How much difference does the type of wood actually make? How big an impact does the origin play in the flavor of woods? Is it really necessary to soak wood before using it? These are just a few of the topics pitmasters argue about. I'll try and stay away from these issues, since there's a lot of very strong opinions out there and not too much evidence either way. Besides, most of these pertain only to high level competitive smokers. That being said, I still kind of need to talk about wood considering this Instructable is about smoking, so I'll try and not step on too many toes.Woods have different flavors depending on what species they are. 
Some woods, like hickory and mesquite, have very strong flavors, while others, like cherry or pecan, have pretty mild flavors. Once you start talking about pairings, people start to get opinionated. I understand that different people do things differently, so if you don't want to listen to me be my guest and skip to the next paragraph. I like the flavor of meat; there aren't too many things I like more than a nice medium-rare steak. Woods with strong flavors can drown out the nice meaty tones of beef, so I usually use milder woods on red meats and game. I'll save apple and the like for my pork since, frankly, pork doesn't have too much of it's own flavor. As far as poultry and fish are concerned, I like to play around with woods that don't get used as often like alder or acacia; there's no particular reason for this, I just don't care that much what happens as much as I do with meat. A lot of people disagree with me very strongly on this, saying that red meat is best suited for the intense flavors of hickory and the like. This is fine, but I want you to know that these people can't be trusted.Once you've picked out a wood, you're pretty much set. I do want to bring up a point, though, about soaking your wood. This is another topic people like to argue about. It's conventional wisdom to soak your wood before you use it, but there's some evidence saying that it's not a necessary step. The idea behind soaking your wood is to make it harder to ignite, releasing nasty carcinogens into your food. The thing is that soaking doesn't add all that much water to the wood, so it might not be an important step. I personally do soak my wood, since I live in the Mojave Desert where it's usually under 10% humidity and over 100 degrees Fahrenheit.If it were just about necessity I'd probably urge you to soak your wood anyways, but it might actually be beneficial to not soak your wood. If you soak your wood for, say, 12 hours, you'll see that the water has turned a brown color. 
It's not widely known what's soaked out of the wood, but some people think that certain aromatic chemicals may seep out of wood while you soak it. So give some thought to what wood you might want to use, and play around with soaking it; it's important you form your own opinions about this stuff instead of just listening to people like me on the internet.. You don't need to do much to most cuts of meat to get them ready to smoke. Certain cuts, however, like a beef tenderloin, have a tough membrane on them called silverskin. Silverskin is made of elastin, a type of tissue which is very tough and won't break down when cooked. It looks like you might imagine it would; it's a white patch of oddly metallic tissue. It's easy enough to remove, though and just takes a bit of care to not mutilate the meat. You need to get under the elastin and just run your knife through at an angle pointing into the meat (so you don't cut out of the silverskin and have to start over). Once your meat is elastin free, you need to look at the shape of the cut. The parts of the meat which are kind of skinny will cook and dry out faster than the rest of the meat, leaving you with dried up bits of jerky sticking out of your dinner.If you're dealing with pork or poultry, now's the time to prepare your brine. Brining cuts of meat helps to retain moisture and adds flavor. A brine is a salt-water solution, usually with some sugar and spices, that you soak the meat in. Salt is know far and wide for it's ability to dry out meats, so how is this supposed to make the meat more moist? The answer is a process you've probably forgotten since high school biology called Osmosis. The brine has a much higher concentration of salt than the meat, so the brine moves into the meat to balance out the concentrations. This is good because you'll wind up with more liquid in the meat than you started with. The Salt in the brine will also help break down proteins and make the meat more tender. 
This is a great brine for smoked pork. You can see from that page that brines do take a good amount of time (12 hours for that one), so plan ahead.Before you put your meat in the smoker, you'll probably want to rub down the meat. Rubs are mixtures of seasonings that get applied to the outside of meat. You typically want to use fairly strong spices for rubs since they're just on the outside of the meat. I usually just grab whatever sounds good for the cut, but if you aren't that comfortable with spices, you can find lots of great recipes online. To actually apply the rub, you'll usually want to put down a layer of sauce or mustard or maple syrup or really any kind of saucy condiment (less tender cuts like beef ribs don't necessarily need this extra layer, but it's still a good opportunity to get more flavor into the meat). This will add more flavor and keep the bark from becoming too hard (you can see on the pictures I skipped this on my last batch of ribs. That was a mistake; I could have gotten away with it on beef, but the pork ribs were just way too tender). Once you've worked the sauce into the meat, you need to sprinkle on a generous amount of rub. Now, the name rub can be deceiving, because you don't want to actually rub the rub. You want to massage the rub into the meat. If you rub the rub, it will ball up in the sauce and you won't get good coverage with the rub. This will need to rest for a little bit so let's go get the smoker going.. Even though we aren't going to cook over one, you're going to need to start a fire in your smoker. You can start it however you like, but I'd suggest not using lighter fluid because it tastes disgusting (don't ask how I know that). Personally, I put down a layer of newspaper, a layer of kindling, and a layer of charcoal. Once you get the newspaper going, each layer lights the one above it. 
Charcoal chimneys are very nice pieces of equipment and they usually don't cost too much; in fact, you can probably find some here on Instructables for free. I don't use one just because of my smoker set up; I'm not comfortable needing to reach into my side-box, since I've lost a lot of arm hair doing that. Once your fire is started, wait it out until you're just above your target temperature.Once you're nearly at the temp you want (usually 210-230 for pork, 210-240 for beef, 225-250 for poultry, and 150ish for fish) , you can add some wood chunks. Just put them in there on the embers so they can start smoldering. We aren't waiting for it to cool down all the way to the cooking temp because we want the wood to get going before we start cooking, and your temperature can do different things when you add wood (it depends on the wood's structure and whether you've soaked it and for how long). You can go fetch your meat as soon as the temperature is right and arrange it on the grate so that each piece will get a good coverage of smoke. You may see some recipes out there that say to wrap the meat in foil part of the way through the cooking. This is called the 'Texas Crutch' and it sacrifices a lot of flavor for a relatively small amount of moisture. I don't suggest doing this; you can get the same moisture out of a cut of meat through proper preparation (brines are great) and still get the flavor you lose with the crutch.Now, we wait. You'll want to keep an eye on the smoke. If it looks white and fluffy, you have a problem, probably a fire. If it looks grey, you probably have a fire. If it looks black, you probably have a fire. Really, if you can see it without straining, you want to check it. Remember that the best smoke is barely visible and a light blue-grey. Of course, you also need to monitor temperature; most smokers have the little spinning vents you can close to limit oxygen if it gets too hot. 
This is really the test of a good pitmaster; you need to have the patience to just wait for hours on end and not take your eye off the smoker. Good luck.. Well, I think that's just about all I have to say, so enjoy your meal and take pride in the fact that you've created it out of a piece of muscle and a chunk of a tree. If you want to know anything, post a comment and I'll try and respond to you quickly; I can't make any promises, though, I'm going to be awfully busy coming up here. Good luck, and Good cooking. \nRead the question below and select from the following choices.\nA: Prepping Your Meat\nB: How to Make an Arnold Palmer Mocktail\nC: Eating the Core\nD: Almost Finished...", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_178_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_178_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_178_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_178_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_178_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_178_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_178_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_178_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_178_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Changing Martini\nB: Add Ice Cream and Milk to Bullet or Blender\nC: \u200bIn Skillet\nD: Pour Your Liquid Smiles Into Your Prepared Glass", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Add Vermouth to Glass', 'Fill the Shaker With Goodness', '@placeholder', 'Enjoy Resonsibly']", "context": "Here is the context of these images:\n. -A good quality Gin. If you put it in the freezer and it becomes a block, then it is not good quality.\nI use Seagrams gin. There is much better stuff, but it is not bad on a budget.\n-Vermouth. 
Get a nice dry vermouth. Gallo works but it is all I can really find easily. There are better options, but lets not get all snooty over ingredients.\n-A nice clean, simple martini glass. Leave the bendy, bubbly, goofy and eccentric ones in the cabinet. Those are for the goofy new fangled martinis that girls like. We are men here, we don't want no stinkin' chocolate in our martini, let alone vodka. (don't get me started)\n-Shaker. Use what you have. Not everybody has 15 shakers in their house like me. . I use the plain ole big ice cubes you make in your freezer. Add them to a martini glass. If it is already cold it is a bonus.. Pour the vermouth over the ice cubes. Just use a splash. Maybe a half fingers worth or less.. Fill your shaker with ice. Then pour your clear deliciousness into the shaker. \"How much gin do I use? It really depends on your shaker size and how much you plan on drinking. I am using a medium sized shaker full of ice and I filled it maybe halfway.. Shake shake shake.\nI wrap a towel around the shaker since I will be shaking it for about 2 minutes. The metal gets really cold and frosty and will get you from shaking it long enough. Especially if your fingers go numb and fall off.\nRemember kids, protect your fingers from cold stuff when making dad a martini.. Take that vermouth in the ice cubed glass....\ndump it out.\nyes....dump it out.\nSpin the glass a bit so the vermouth kind of coats the glass, almost like a vermouthy glaze.\nGive it a few flicks so there is very little vermouth at all in the glass at all.\nThe less vermouth, the dryer the martini. Some people actually don't use vermouth at all. Most bars will give you about 20 times too much.\nExperiment with quantities to find your pleasure zone.. Unpop your shaker top and pour that liquid happiness into your freshly made glass. I like it when there are little tiny shards of ice suspended in the drink. You mouth never feels them, but they look cool in the glass for about a minute.. 
Now, try out your new found Martini making knowledge and make a few drinks. Be sure to bring a few people who have problems with keeping their clothes on to taste test for you, and you will assuredly have a good time.\nRead the question below and select from the following choices.\nA: Changing Martini\nB: Add Ice Cream and Milk to Bullet or Blender\nC: \u200bIn Skillet\nD: Pour Your Liquid Smiles Into Your Prepared Glass", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_179_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_179_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_179_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_179_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_179_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_179_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_179_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_179_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_179_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_179_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_179_10.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Chop Chop Chop N Grate !!\nB: Plum Tarts for a Wannabe Fancy Pants\nC: Ahh, the Romance.\nD: Ingredients", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients & Supplies.', \"Let's Start With the Curd !!\", '@placeholder', 'Mix the Veggies, Barbeque It Then Go NOM Nom NOM Nom !']", "context": "Here is the context of these images:\n. Ingredients: 1) 1/2 kg Yogurt 2) 1 tablespoon Garlic 3) 1 tablespoon Ginger 4) 3/4th tablespoon Green Chilies 5) Half bowl chopped Mint leaves 6) Salt to taste 7) 1/2 teaspoon Cumin Powder 8) 1/4 teaspoon Garam Masala ( It is readily available in market ) 9) 1/4 teaspoon Dhaniya-jeera powder (Coriander - Cumin Spice Blend .... 
Now we make this at home but it is also readily available) 10)1/4 teaspoon Chili powder 11) 1/4 teaspoon Rock salt Supplies: For the Yoghurt: 1) Muslin cloth preferred (but you can use others) For the Rest: 1) Knife 2) Kitchen Grater/ shredder 3) a bowl to mix all ingredients 4) a big spoon ( to help mixing) This makes approx. 7 -8 skewers. Also, you can Increase or decrease the no. of spices as per your taste.. This is a very simple yet an important step. Pour the yogurt on the muslin cloth and tie it up like shown. Then let it hang for an hour or two. Usually, we hang it on the sink tap. This process allows the water in the yoghurt to drain out, leading to a nice solid yoghurt mass. The importance of letting the water drain is so that the final mixture sticks to all the vegetables nicely, which would otherwise lead to a liquidy marination causing it to drip during the BBQ process.. I like to keep all my ingredients ready before i start the mixing process and that's y... Finely chop the Mint leaves & Green Chilies. Finely grate Ginger & Garlic cloves.. Place the curd/yoghurt into the bowl and mix it with a spoon. Then add all the ingredients to it and mix it again. Your marinate is ready !!. Mix all the vegetables in the marinate and refrigerate it for a couple of hours. This step, helps in enhancing the flavor a lot. I used onions, Green chili peppers, Mushrooms And paneer(cottage cheese). You can add potatoes too, but i do not prefer them. Put the veggies, in any order u like, on the skewers. We have a small grill, so i keep them short. Now go ahead and BBQ them!! Do not forget to let me know how it turned out !! Feel free to make any changes to the recipe. As i was saying, you can add chicken pieces to it too. 
Serve it with a soda/ beer or orange juice..Happiii Barbecuing !!\nRead the question below and select from the following choices.\nA: Chop Chop Chop N Grate !!\nB: Plum Tarts for a Wannabe Fancy Pants\nC: Ahh, the Romance.\nD: Ingredients", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_180_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_180_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_180_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_180_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_180_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_180_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_180_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_180_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_180_8.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: How to Make Rice\nB: Filling Stuffing\nC: Preparation\nD: Making Carrot Parantha", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Preparation', '@placeholder', 'Cook Rice', 'Mix Cooked Rice With Ingredients in the Pan']", "context": "Here is the context of these images:\n. One raw Mango2 Medium sized Onions5 to 6 Garlic clovesOne inch long piece of Ginger2 to 3 Green chillies2 TomatoesBunch of Mint leavesCoriander leavesHandful of Curry leavesOne teaspoon of Mustard seeds mixed with split black gram lentilsOne teaspoon of Turmeric powderHalf a teaspoon of Asafoetida powder2 to 3 Dried Red chilliesOne teaspoon of Cumin seeds2 cups of Rice2 Tablespoons of Cooking OilSpices in small quantities (Optional)Star AniseClovesCinnamonCardamon pods. Cut both ends of raw mango and peel off the skinGrate the mango and take about one cup of it. 
Finely chop the onionsDice the tomatoes into small piecesShuck Garlic and remove ginger skinMake Ginger Garlic paste using a mixer grinderSplit Green Chillies in halves. Heat a frying pan over medium flame and add 2 teaspoons of cooking oilAdd mustard seeds and cumin seeds to the oilBreak the spices like cloves, star anise, cinnamon and cardamom pods into pieces and add to the oilBreak the dried Red chillies into pieces and add to the oilAdd handful of curry leaves and split green chillies to oil and saute for few seconds. Add chopped onions and Ginger-Garlic paste to the pan and stir fry till raw smell disappearsAdd handful of mint leaves and mix well. Once the raw smell from onions and ginger-garlic paste disappears, add the chopped tomatoes to the panCook till water content evaporates from the mix and oil starts oozing out at sidesNow add the grated raw mango to the pan and mix wellYou can add little amount of salt to the mix. Most people add salt while cooking rice also, so take care not to add too much of salt to the mix. In the mean time, you can also cook the rice in a pressure cooker and keep aside till the pressure drops to normal.Here, the rice should be slightly under-cooked, other wise it will make a paste-like mix with the ingredients in the pan.. Take 2 cups of cooked rice and add to the ingredients in the frying pan.Mix everything together over low flameOnce it is properly mixed, transfer to a serving bowlGarnish with coriander leaves and serve hotNo side dishes are required with the Mango rice. However you can use potato chips or fried crispies with it. 
This dish is also good for taking along during outings / picnic with children\nRead the question below and select from the following choices.\nA: How to Make Rice\nB: Filling Stuffing\nC: Preparation\nD: Making Carrot Parantha", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_181_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_27.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_181_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_29.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_30.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_31.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_32.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_33.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_34.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_35.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_36.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_181_37.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Rolly Pollies\nB: Topping and Finishing the Basketballizza\nC: Croque Madame\nD: Fried Eggs", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Supplies and Ingredients', 'Hamburger and Rice', 'Brown Gravy', '@placeholder']", "context": "Here is the context of these images:\n. For this recipe you will need the following:4 cups cooked rice (I cooked my rice in beef broth, ground thyme, 1 tablespoon margarine, 1 tsp salt & 1 tsp minced onion) 1 pound lean ground hamburger, 4 large eggs, brown gravy mix.. Cook the rice according to package directions. While the rice is cooking, divide the hamburger meat into 4 equal parts, form into patties and cook in a frying pan over medium heat until done. 3 or 4 minutes on each side, until no longer pink in the center.. Empty the brown gravy mix into a skillet and add water. cook according to package directions, continue stirring with a whisk until thickened.. Place 2 Tablespoons vegetable oil in a skillet, cook eggs over-easy. Place 1 cup of cooked rice on each plate. Top with a hamburger patty and gravy. Carefully place an egg on top of this and there you go! Loco Moco. Delicious and filling. 
Serves 4.Enjoy!\nRead the question below and select from the following choices.\nA: Rolly Pollies\nB: Topping and Finishing the Basketballizza\nC: Croque Madame\nD: Fried Eggs", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_182_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_182_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_182_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_182_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_182_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Preparing the Mixture to Freeze\nB: Sangria Ice Cream\nC: Homemade Ice Cream\nD: Mix an Freeze", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Mixing the Main Ingredient', 'Adding the Second Ingredient', '@placeholder', 'After 12 Hours']", "context": "Here is the context of these images:\n. List of ingredients:1 can of sweetened condensed milk of 396 g.1 pint of Heavy Whipping Cream (473 ml or 16 FL OZ)1 can of HERSEHEY'S COCOA 100% CACAO & Natural Unsweetened of 8 OZ. Take a plastic container of 1 gallon more or less like the blue one of the photo.. Add the Heavy Whipping Cream inside the blue container.. Once the Heavy Whipping Cream has been mixed, you will have a soft cream.. Now, you should add the can of condensed milk and mix it with the cream done.. Add 4-Tbsp of HERSHEY'S COCOA mixing it with the rest of the ingredients.. Take a refractory glass so that you can pour the mixture made previously. Then get plastic wrapping to cover it and so you can freeze during 12 hours. . 
After 12 hours, you will have a delicious homemade chocolate ice cream.\nRead the question below and select from the following choices.\nA: Preparing the Mixture to Freeze\nB: Sangria Ice Cream\nC: Homemade Ice Cream\nD: Mix an Freeze", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_183_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_183_27.jpg"], "output": "A", 
"qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Quickie Homemade Mayonnaise\nB: Flavorize It...\nC: You Have Mayonnaise!\nD: Top It Off!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Your Ingredients', 'Tools You Need', 'Add Sunflower Oil While Mixing', '@placeholder']", "context": "Here is the context of these images:\n. These are the ingredients you need:1/2 cup of soy milk (100 ml) 1 cup of sunflower oil (250 ml) 1 tablespoon of apple cider vinegar 1 teaspoon of mustard A pinch of salt & some pepper to tastePlease make sure you buy unsweetened soy milk. I like my soy milk made solely from water & soy beans, without any additives.Or you can make your own soy milk at home! I favored this great instructable from jen7714https://www.instructables.com/id/How-to-Make-Homema...You can replace the sunflower oil by any vegetable oil, but I believe the sunflower oil gives the best results. Some people like to use extra vierge olive oil.. Grab your hand-held blender! Never tried it with an upright blender, but it should work just fine as well.. Pour the soy milk + mustard + apple cider vinegar + salt + pepper in a jug.Blend for 3 seconds, just to get everything mixed.. I like to pour the sunflower oil slooowly into the mixture while blending...But that is because I like some kitchen drama. If you add the oil all in once, it still works. Just blend and watch the magic happen!It takes about 30 seconds to get your mayonnaise.. All done! Stored in a jar, it keeps up to 2 weeks in your fridge. Top tip: add some grated garlic, spring onion, rosemary or curry to your mayonnaise. Or my favorite, fresh dill!And guys, there is a Superbowl this Sunday, so I heard? Perfect, try your snacks with this mayonnaise. You won't be disappointed! 
Enjoy!\nRead the question below and select from the following choices.\nA: Quickie Homemade Mayonnaise\nB: Flavorize It...\nC: You Have Mayonnaise!\nD: Top It Off!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_184_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_184_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_184_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_184_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_184_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Green Beans\nB: BEEF ENCHILADAS\nC: Defrost/shred the Potatoes\nD: Preheat and Combine!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Hamburger Meat', '@placeholder', 'Stewed Tomatoes', 'Shredded Cheese']", "context": "Here is the context of these images:\n. one pound hamburger. Place hamburger into a pot with 1/3 cup of water, season with salt and pepper. Once the hamburger is browned and broken up.Add 1/3 cup diced onions 1/4 cup diced celery and continue cooking until done. While the hamburger is being browned In a separate pot boil potatoes, to make mashed potatoes. place the cooked hamburger into an oven proof baking dishtop with canned french style green beans that have been drained drain and mash the potatoes in a separate pan. now top with Stewed Tomatoesadd salt and pepper to taste. layer the mashed potatoes nextand cover with 1 cup shredded cheese. 
all that is left to do is heat at 350 degrees for 30 minutes, just to heat it through and melt the cheese Enjoy your one dish beef casserole\nRead the question below and select from the following choices.\nA: Green Beans\nB: BEEF ENCHILADAS\nC: Defrost/shred the Potatoes\nD: Preheat and Combine!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_185_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_185_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_185_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_185_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_185_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_185_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_185_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_185_7.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Cheesy Jalapeno Bread\nB: Ingredients\nC: Make Dough Balls\nD: Melt the Garlic Butter and Brush It on the Top of the Bread.", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Cut and Grate', '@placeholder', 'Add Fixing', 'Bake']", "context": "Here is the context of these images:\n. Get the following ingredients ready to use:Whole wheat flourAll purpose white flourActive dry yeastTable saltMozzarella cheeseFresh herbs (I used parsley, dill, cilantro, basil and chives) . 300grams water (about 1 1/4 cups)1 tsp salt1/2 tsp yeast100grams whole wheat flour (about 3/4 cup)300grams all-purpose white flour (about 2 1/2 cups)Mix together, leave it to rest for 3 hours and then mix again. Leave it again for another 2 hours and then sprinkle with about a tablespoon of flour.Remove from the bowl and knead on a clean surface to work out the bubbles. . Chop up the herbs into fine bits. Grate 100 grams of cheese. . 
Weigh out about 50 grams of dough.Sprinkle your surface with a bit of four and roll each bun dough blog into a flat blob. . Add a small amount of cheese and herbs to the flat dough blob. Roll up the blob ensuring the end sticks together so it doesn't unroll. . Arrange the buns on parchment paper and cover with a tinfoil lid. Note/tip: the tinfoil top can easily be made using a large round bowl. Let the buns rise for 15 minutes before baking at 222C (430F) for 22 minutes. Remove the foil cap and broil at 233C (450F) for 5 minutes to make the top golden. Let cool, tear the buns apart and enjoy. \nRead the question below and select from the following choices.\nA: Cheesy Jalapeno Bread\nB: Ingredients\nC: Make Dough Balls\nD: Melt the Garlic Butter and Brush It on the Top of the Bread.", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_186_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_186_16.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", 
"options": "A: Cheesecake Jello\nB: Get a Good Ginger Grater\nC: Marinate With Brown Sugar and Garlic\nD: Cool", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Milk', 'Sweetener', 'Unflavored Gelatin', '@placeholder']", "context": "Here is the context of these images:\n. Boil the milk medium heat. Turn off heat and add two tablespoons of sweetener stevia. Remember that this substitute has the feature to sweeten twice.. Add the warm water to hydrate the unflavored gelatin until a paste, then pour in the milk and stir until dissolved.. Finally, we must add the teaspoon of vanilla extract and the dye color of your choice and pour into glasses or moldsNote: It is important to use vanilla extract, not confused with the essence because if it is used not desired to be achieved vanilla. You can also change the extract by any flavor of your choice.. It should be refrigerated about 4 hours.Ready to eat!\nRead the question below and select from the following choices.\nA: Cheesecake Jello\nB: Get a Good Ginger Grater\nC: Marinate With Brown Sugar and Garlic\nD: Cool", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_187_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_187_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_187_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_187_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_187_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_187_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_187_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_187_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_187_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_187_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_187_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_187_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_187_12.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": 
"textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Chicken Divan Soup\nB: Add Chicken Broth, Bring to a Boil, Add Rice\nC: The Shopping List\nD: Chicken Soup Bones", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Physical Benefits of Chicken Soup', 'Cooking the Soup', '@placeholder', \"Sunshiine's Final Thoughts\"]", "context": "Here is the context of these images:\n. Homemade soup served by the hands of a loving wife or mother has been used as a home remedy with remarkable results. It\u00a0soothes\u00a0the throat and warms the body. \u00a0Canning large batches will serve you well when a family member comes home sick. \u00a0. \u00a0How many of you have ever tasted home canned food? The truth is home canned foods \u00a0have more nutritional value than store bought foods. You control what goes into the foods you can. It is very beneficial to can what you grow yourself, \u00a0because most farmers use harmful chemicals on their fields. If you can't grow it yourself consider buying produce that is grown organically. The flavor of home grown and canned produce is amazing!\u00a0 I grew up in a time when many people were still growing and canning their own produce. I know what a real dill pickle taste like and what you buy in the stores today don't even come close!\u00a0 Canning\u00a0 takes \u00a0time but if your time is limited consider growing your own garden and freezing what you grow. The benefits are worth the extra effort.\nIn this guide I have canned Grannie's soup recipe the lazy way. I canned the soup but have frozen it instead of using the pressure canner or pressure cooker method. This is an inexpensive way to get started and see if it is something you might be interested in doing. From there you will gain confidence and may decide to go for the real deal. I personally have canned fruits and jellies but have never attempted canning meats. 
Canning some foods require education because of the dangers involved if you don't do it properly.. \n\tThis is what you will need to make the soup:\n\t1 Boiled whole chicken adding only salt when cooking it.\n\tSave all the chicken broth.\n\tRemove the meat using a strainer if you have one, save and freeze the skins and bones if you have dogs or cats. I will show what to do with them later.\u00a0\n\tCut chicken in small bite size pieces.\n\t1 cup peeled chopped carrots.\u00a0\n\t1 Cup chopped celery.\n\t1 Cup chopped onion.\n\t1 Chopped jalapeno.\n\t4 garlic cloves.\n\t1 Lemon juiced. This is to add to the soup after it is cooked.\n\t2 Cups of fresh chopped tomatoes.\n\t1 Cup chives I used the tops of 6 green onions because I did not have chives.\n\t2 Chicken Bouillon cubes.\n\tI used curly noodles but you can add egg noodles as well.\u00a0\n\tThe secret to this recipe is use as many green spices as you can. I use what I have on hand.\u00a0You can add just about any kind of vegetable to this recipe and receive benefits from it.\u00a0 This is the recipe we have used for a very long time.\u00a0 I often use what ever I have at the time.\u00a0 Nothing is in stone.\u00a0 You can add parsnips, sweet potato and turnips for even better results.\u00a0 I did not have any on hand.\u00a0\n\tSpices:\u00a0 I adjusted my recipe for a larger group of taste buds.\u00a0 I like mine more seasoned and with more pepper.\u00a0Taste it after you add everything and adjust it for your taste buds.\u00a0 The more spices the better it works.\u00a0\n\t1/8Th Teaspoon of each of the following as desired:\n\tBasil\n\tParsley\n\tOregano\n\tPaprika\n\tChili Powder\n\tBay Leaves\n\tSage\n\tCumin\n\tRed pepper\n\tCilantro\n\tItalian seasoning\n\tDill weed\n\tCinnamon\n\tNutmeg\n\tSea salt\n\tPepper if desired\n\tYou may omit the peppers if your family is sensitive to it. 
Peppers help clean out the sinuses.\n\tUtensils:\n\t1 Large stock pot\n\t1 Large spoon\n\t1 Medium funnel with large opening\n\t1 Sharp knife\n\t1 Cutting board\n\tMixing bowls\n\tFood strainer if you have one.\n\tClean canning jars or heavy jars and lids with wide mouths. If this is your first time freezing in a jar just can/freeze a few to get the feel of it.\u00a0\n\tPlastic bags the number of jars you will be freezing.\n\tPlease note:\u00a0 If you are a\u00a0vegetarian you may substitute the chicken broth for a vegetarian broth and add rice and beans to make a complete protein.\u00a0\n\t\u00a0. Place the broth in the stock pot or cook it in a crock pot.\u00a0\nAdd all the spices.\nAdd the chicken.\nAdd all the vegetables reserving\u00a01 cup of\u00a0the tomatoes and a few green onion tops or chives for garnish.\nStir well.\nTurn on the burner and cook until the carrots are done but not over cooked.\nAdd the lemon juice to the cooked mixture.. Add the remaining tomatoes and chives to the jars.\nDo not fill the jars above the neck line. Leave at least 1 inch at the top for small jars and 2 inches for larger jars to allow for expansion. If you don't allow enough the jars could break. As it turned out my jars did not expand that much but it is best to be safe than sorry.\nLadle the soup into the jars.\nAllow to cool completely to ovoid breakage.\nWhen they are cooled completely carefully place them in the freezer with the lids off!\u00a0 As a safety measure: Place the jars into the plastic bags to prevent any glass from getting on other foods if the jar breaks.\nAfter they are completely frozen place the lids on the jars and screw down the lids.\nPut back in the freezer. 
There is no need to place them back into the plastic bags because they are frozen and there is no danger in them breaking.\nThat is all there is to it!\nWhen you thaw out the soup allow it to thaw in a bowl with cool water if you will be around to start cooking it when it is thawed.\u00a0 I personally feel safer defrosting it in the fridge. Avoid rapid thawing to prevent breakage.. I\u00a0promised\u00a0that I would add the link to my chicken soup bones recipe. \u00a0I made a completely different tutorial about how to cook the chicken bones to feed you dog/cat. \u00a0I had been visiting my sister and she was feeding her dogs chicken bones. \u00a0I never knew you could actually safely give them dog bones and they are very good for them. This tutorial also gives tips on how to potty train your dog and useful grooming tips on\u00a0\u00a0friendly products. Step 4 is about the dog food. \u00a0 \u00a0Here is the link on how to safely do that: \u00a0https://www.instructables.com/id/Potty-Training-Grooming-Nutrition-And-Choosing-/. I have pictures here of ways you can package the soup for gift ideas. You can begin to make the soup now and avoid that last minute holiday rush. It is important to place a large note on the package and tell them that the jar must be placed in the freezer or fridge asap or eaten within a few days. I know this is a repeat but it is very important and you would sure hate to find out that someone got sick on the soup you canned. The jars are not sealed so they need to be frozen until they will be used. Do not let them sit on the counter all day because bacteria can make you very ill. Thaw them in a bowl of cool water if you are going to be around to check on it often. Otherwise thaw in the fridge. Cook frozen soup as soon as you can remove it safely from the jar.\nFor a care package\u00a0 just add stuff one would take for a cold along with the soup. You can add a little or add a lot. 
You could make a family package because a lot of times everyone in the family gets sick. You can make the soup in a crock pot and take the entire pot to a sick family. Many different options you could do for this type of gift. Add bath salts recipe here: https://www.instructables.com/id/How-To-Make-Bath-Bombs/\u00a0\u00a0\u00a0 Lip balm: https://www.instructables.com/id/Delicious-Chocolate-Chapstick-Honey-Balm/, \u00a0candle, cough drops how to here:\u00a0https://www.instructables.com/id/Cough-Drops/ , Vapor rub\u00a0\u00a0, Orange juice, Vitamin C, Tea, Get well rock, Throat spray, or footie's just to name a few.\nThere are people who have concerns of storing foods in plastic containers or bags and this is a good alternative for them.\u00a0 You can use plastic to store them in and that is an option you might consider.\u00a0 This is a great way to get you comfortable putting up your own food.\u00a0To freeze broth simply place the broth in the fridge until the fat settles to the top.\u00a0 Skim off the fat and pour\u00a0the broth into a freezer bag and work out the air.\u00a0 Lay flat single layered on the freezer shelf.\u00a0 After it is completely frozen you may stack it to make more room in the freezer.\u00a0\nI am currently working on an Instructable using chicken bones for cat/dog treats. 
\u00a0When it is finished I will add a link here.\u00a0\nThank you for stopping by and have a super day!\nRead the question below and select from the following choices.\nA: Chicken Divan Soup\nB: Add Chicken Broth, Bring to a Boil, Add Rice\nC: The Shopping List\nD: Chicken Soup Bones", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_188_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_26.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_188_27.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_188_29.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Framing the Rose!\nB: Watermelon Shark\nC: Recipe for You to Print\nD: Watermelon Sherbet Watermelon", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Clean It Up!', 'Outer Petals Pt. 1', '@placeholder', 'Finishing Touches!']", "context": "Here is the context of these images:\n. You'll need: One watermelon Paring knife Medium circle cookie cutter. Start by using your paring knife to trace a circle on the side of the watermelon you want to carve. Once you have cut out a circle template to follow, use the paring knife to peel away the green part of the watermelon rind, leaving the white fleshy part exposed.Note: Save some of the bigger pieces of green that you peel off. Those will be needed down the line!. After peeling away the green bits, you'll see that there are darker parts of the rind that you might have missed due to uneven cutting. No problem, just grab the paring knife and clean it up by peeling away as much of the greenish coloring as possible until only white rind is showing. It doesn't have to be perfect though so don't spend hours on it or you'll end up carving away all the white part and hit the fruit instead!. So remember those green parts I had you save! Grab those and cut out a few leaves. (I made 3 total for mine) Then carve out some detail and put those aside again for later. At this point you can dispose of all the rest of the unwanted rind. . To start the rose, grab your circle cookie cutter and press it gently into the watermelon, no more than 1/3 of the cookie cutter in. You aren't cutting out shapes, just making an outline to follow. 
After you cut the circle, use the paring knife to cut a ring around the circle and carve out some of the watermelon around it, making it appear more 3D.. Take your paring knife and cut a small sliver off the circle in the shape of a crescent. Make this first cut straight down. Next cut a sliver in front of the one you previously cut, but make this one at an angle towards the first cut.\u00a0 Pull the second sliver you made out and you have your first petal made. Repeat this process around the circle for your first row of petals. For a better understanding, take a look at the pictures posted that shows this process.. For the second row, you are going to continue cutting a thin crescent shaped sliver straight down, then cutting a second thin crescent sliver parallel to in at an angle towards your first cut sliver. Pull the second angled sliver out to complete the petal. Repeat this process around the circle to complete your second row. Again, for a better understanding, refer to the photos I posted. . Keep using the sliver cutting method further into the circle to create more rows of petals, getting smaller with each row closer to the center. . When you get to the point of just a tiny bit of uncut circle in the middle. Use your paring knife to cut a hole in the middle of the last bit of the center. The center of your rose is now finished, time to move on to the outer layers!. Unlike the inner layers, all cuts will be angled in this stage. The first cut you make should be angled at about 45 degrees. Make a wavy/curvy petal shaped cut. Then behind that first cut ,at about a 20 degree angle, slice a rounded cut encircling your first cut. Pull this sliver out to make the first cut you made pop out from the melon. This sounds more complicated than it really is, for a better understanding see the photos.. Continue making wavy/curvy petal cuts and cutting away part of the melon behind them to make them appear 3D. 
The further from the center you get, the bigger the cuts and petals should become. See photos for reference!. \n Go as far out as you would like, making the rose as big as you want. You can have it take up the whole melon, or part of it so you have multiple roses on one melon. In the photos you can see where I stopped, this is how big I wanted this rose. I included a side perspective so you can see the angles and such. . Once you have completed the rose by making as many petals as you want. Finish it by framing it. Like the sunflowers, cut a large circle around the flower at an angle and pull out the bit of melon. This will make the rose look like it is popping out from the watermelon even more than it already does. You can repeat this process to make more roses on your melon, or maybe mix it up with sunflowers and other things. Your choice! For this example I made 2 roses on my watermelon.. Now that you have your roses all carved up, grab the leaves you made from the watermelon rind earlier and place them where you would like on the roses. Just slide them into the gaps. If you need to, use your paring knife to deepen the gaps so the leaves don't fall out and you are finished! Show off to friends and family, then slice it up and serve! Also included an additional photo of a recent melon I carved that does include using the roses and my first attempt at flames. 
\nRead the question below and select from the following choices.\nA: Framing the Rose!\nB: Watermelon Shark\nC: Recipe for You to Print\nD: Watermelon Sherbet Watermelon", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_189_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_27.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_28.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_189_29.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_30.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_31.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_32.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_33.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_34.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_35.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_36.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_37.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_38.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_39.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_40.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_41.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_42.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_43.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_44.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_45.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_46.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_47.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_48.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_49.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_50.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_51.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_52.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_53.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_54.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_55.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_56.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_57.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_58.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_59.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_189_60.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_189_61.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Bake It\nB: How to Make Homemade Bacon\nC: Important Information on Canning\nD: Getting Started", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Boil 1 Lb. of Dried Shell Pasta', 'Fry 10 Pieces of Thick Cut Hardwood Smoked Bacon', \"Assemble Bacon Mac N' Cheese\", '@placeholder']", "context": "Here is the context of these images:\n. You will need to gather the following ingredients:1 lb dried shell pasta4.5 cups of whole milk5 tbsp butter4 tbsp all-purpose flour8 oz. of New York Sharp Cheddar cheese, shredded7 oz. Mozzarella cheese, shredded8 oz. of Monterrey Jack cheese, shredded1/2 tsp rosemary1/2 tsp thyme2 tbsp basil2 tbsp of fresh green onion, chopped2 cloves fresh garlic, minced nutmegcayenne pepper saltground black pepper 10 slices of thick cut hardwood smoked bacon Seasoned breadcrumbs Caution:Do not use pre-shredded cheese for this meal. Pre-shredded cheese is covered in an anti-caking powder that will cause the cheese not to melt well. This will cause the cheese sauce to have a weird consistency.Tip: I recommend gathering, chopping, and shredding all ingredients prior to starting.Other materials or objects you will need are:A sharp chefs knifeA cutting boardA whiskA spoonA measuring utensils including a measuring cup, 1/2 teaspoon, and 1/2 tablespoonA large potA large panA 13\"X9\" glass baking panA cheese graterA noodle strainerPam cooking sprayPaper towelsA good pair of oven mitts. Fill a large pot with water.Put a pinch of salt in the water.Bring the water to a boil.Insert pasta into the boiling water.Strain the noodles once they are soft and tender.Warning: Boiling water can cause severe burns. 
Pour your noodles into the strainer with care.Rinse the strained noodles under cold tap water to stop them from cooking any more.Caution: Not running cold tap water might cause the noodles to over cook and become mushy.Set the noodles aside until later.Tip: Save a cup of the starch water you used to cook your pasta in. This may or may not be used later.. Heat up a large pan on medium high.Fry bacon in the pan flipping occasionally until semi crispy.Place the bacon onto of a paper towel.Blot the bacon with another paper towel to remove the grease.Caution: Not blotting the grease away will affect the consistency of the cheese sauce.Cut the bacon into thinly sliced strips.Warning: Fried bacon will be hot. Wait for the bacon to cool before handling.Set the bacon aside until later.. Heat up the same pot you used to boil the noodles on medium heat.Preheat your oven to 400 degrees Fahrenheit.Melt 5 tbsp of butter until it starts to bubble or simmer.Add 4 tbsp of all purpose flower.Whisk vigorously until a smooth even texture is achieved.Note: for steps 6-21 continue whisking vigorously to achieve and smooth even texture.Caution: Not whisking vigorously for steps 6-21 may case flower to sink, clump, and burn at the bottom of the pot.Let the mixture cook for 1-2 minutes to get rid of the raw flour taste.Add 1 cup of milk.Add 1/2 a tsp of rosemary.Add 1/2 a tsp of thyme.Add 2 tbsp of basil.Add 2 generous pinches of nutmeg.Add 2 generous pinches of cayenne pepper.Add the last 3.5 cups of milk.Add 2 tbsp of fresh chopped green onions.Add 2 cloves of fresh minced garlic.Add salt and/or black pepper as desired.Tip: taste it! adjust the seasoning accordingly.Increase the heat to high to thicken the sauce.Note: The sauce should be thick enough to add the cheese when it will coat the back of a utensil.Lower the heat back down to medium.Add 8 oz. of shredded New York Cheddar cheese.Add 7 oz. of shredded Mozzarella cheese.Add 8 oz. 
of shredded Monterrey Jack cheese.Turn off heat.Note: The sauce should be thick and creamy. If you believe the sauce is too thick, add some of the starch water you saved from step 2.. Lightly grease a glass baking pan with Pam. Other cooking spays or butter will also work.Pour 1/3 of the noodles in the bottom of the pan.Place 1/3 of the chopped bacon.Pour 1/3 of the sauce onto the bacon and noodles.Mix around with a spoon.Repeat 2 through 5 until all the ingredients are used up.Note: There should be an even distribution of bacon and noodles throughout the pasta.Cover the top with a thick layer of bread crumbs.. The oven should be at 400 degrees Fahrenheit due to you preheating it in step 4.Place the glass pan filled with mac n' cheese on the center rack.Bake for 35-35 minutes or until the sauce is burbling and the bread crumbs are golden brown.Remove the pan from the oven.Warning: The pan and mac n' cheese will be extremely hot! It is extremely recommended that you use a pair of good oven mitts.Let sit on the counter for 5-10 minutes to cool.Warning: The product will be extremely hot. Make sure it is cool enough before eating or you may burn your mouth.You now have a delicious meal to serve to your family or horde to yourself if you so choose. You may experiment using different types of cheeses to get different tastes, or add different seasonings. 
For example I sometimes use Pepper Jack instead of Monterrey Jack and add some hot sauce into the cheese sauce.\nRead the question below and select from the following choices.\nA: Bake It\nB: How to Make Homemade Bacon\nC: Important Information on Canning\nD: Getting Started", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_190_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_190_20.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Chocolate Chip Cookies for Dummies\nB: White Chocolate Chip Matcha Cookies\nC: Decorate\nD: Sugar Mixture", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Eggs and Butter', '@placeholder', 'Flour 
Mixture and Chocolate Chips']", "context": "Here is the context of these images:\n. Ingredients:\n- 2 eggs\n- 1 cup of butter (2 sticks) softened\n- 3/4 cup brown sugar (packed firmly)\n- 1/4 cup white sugar\n- 1 tsp. vanilla extract\n- 1 package of vanilla pudding\n- 2 1/4 cup white flour\n- 1 tsp. baking soda\n- 1 tsp. salt\n- 1 1/2 \u00a0chocolate chips\n- 1 cup chopped pecans (optional). Combine the eggs and butter and mix until well blended. Mix together sugars, vanilla extract and pudding. The pudding mix is what really makes this recipe different as well as better than others, although in a pinch you can substitute it with an extra 1/2 cup of white sugar. Beat together with egg mixture for about 2 minutes or until well blended.. Combine flour, baking soda, and salt in a separate bowl. Gradually add in to sugar and egg mixture until just blended. Add in chocolate chips and pecans if desired. DO NOT OVER MIX.. Drop by rounded teaspoonfuls onto 2 baking sheets. I have found that it usually works best when I put them in a 4x6 grid. .Bake at \u00a0 375\u00a0\u00a0for 8-10 minutes. Let cool for 5-7 minutes. 
Enjoy!\nRead the question below and select from the following choices.\nA: Chocolate Chip Cookies for Dummies\nB: White Chocolate Chip Matcha Cookies\nC: Decorate\nD: Sugar Mixture", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_191_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_191_24.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Easy Brioche Recipe\nB: \u200bIF USING POWDERED CHILES, Start Here\nC: Optional Step\nD: \u200bIF USING 
DRIED CHILES, Start Here", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['The Ingredients', 'Add All of the Dry Ingredients.', 'Putting It Into the Microwave', '@placeholder']", "context": "Here is the context of these images:\n. Ingredients: - 35g / 1/4 cup / 60ml of Self-Raising Flour - 55g / 1/4 cup / 60ml of Sugar - 2 Tablespoons of Cocoa Powder - 2 Tablespoons of Cooking Oil (doesn't have to be sunflower) - 3 Tablespoons of Water - Ice Cream (optional)Utensils: - A Spatula - A Wooden Spoon - A Mixing Bowl - A Mug - A Serving Plate (optional) - A Tablespoon - A Kitchen Scale or a Measuring Cup - Something to Eat With. - A sieve. The Dry Ingredients: Put all of the dry ingredients (35g flour, 55g sugar, 2 tablespoons of cocoa powder) into the bowl and mix them with a wooden spoon! It is important that you mix them otherwise it will not rise properly (I forgot to mix them...) It is recommended that you put them into the bowl flour first, then sugar, then sieve the cocoa powder to get rid of lumps.. The Rest Put all of the other ingredients (3 tablespoons of water, 2 tablespoons of oil) into the bowl with the mixed dry ingredients. Once you have done this, mix it with a spoon. You should mix until the mixture becomes quite thick and consistent (smooth). Try and get all the annoying ingredients that get stuck to the edge of the bowl in as well.. Putting the Mixture in the Mug Use your spoon or spatula to scrape out all of the mixture from the bowl and into the mug.. Microwave Set the microwave to 1 minute 40 and put in the mug. Press start!!. Microwave Take out the mug form the microwave (careful, it's hot!) If you're messy (like me) then the inside edge of the mug will have battle scars, but it doesn't matter unless you really want perfect presentation, in which case I'm not sure how to help. 
You can serve it with ice cream, or just eat it :) The next step is completely optional.Thank you for looking at this mug-brownie recipe. If you made it, please put it in the comments and if you didn't enjoy it, tell me why and I'll try to help. Thanks!. Optional Step This step is if you want to put your mug-brownie on a plate. Simply turn the mug upside-down onto the plate. If it doesn't seem to be working, put it back in the microwave for 30 seconds. However, putting it in the microwave hardens it up and makes the edge chewy.\nRead the question below and select from the following choices.\nA: Easy Brioche Recipe\nB: \u200bIF USING POWDERED CHILES, Start Here\nC: Optional Step\nD: \u200bIF USING DRIED CHILES, Start Here", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_192_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_19.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_192_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_192_21.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Enjoy\nB: SOB Sprite\nC: Line 'Em Up!\nD: It's Flour Time", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Sort the Candy', 'What to Do With the Two Red Flavors', 'Add the Sprite/7Up', '@placeholder']", "context": "Here is the context of these images:\n. Make sure you wash and dry the vials thoroughly. . Separate the Jolly Rancher flavors into the five small bowls.. Add four green apples to the first vial. This will be your green potion.Add four blue raspberry to the second vial. This will be your blue potion.Add four grape to the third vial. This will be your purple potion.Go To Next Step To Learn What To Do With The Two Red Flavors. Take two cherry flavored JRs and put them in the fourth vial.Add two watermelon flavored JRs to the vial.This will be your red potion.. You can use either Sprite or 7Up for this step.Use a funnel to pour it into the vials.. Fill the vials to the desired height.Place the lids or corks on LOOSELY. Be careful with them as you transfer them from the counter to the fridge. . CAREFULLY place drinks in the fridge. Make sure the lids are LOOSE as to keep the tops from popping off.. It may take a whole night for the candies to dissolve.When they do, you can keep them in the fridge or serve them immediately. 
If you are making these for an event, you will have to make them a day or two ahead.\nRead the question below and select from the following choices.\nA: Enjoy\nB: SOB Sprite\nC: Line 'Em Up!\nD: It's Flour Time", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_193_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_193_18.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Top Hem\nB: Coffee Tea Bag\nC: Mis En Place\nD: Making the Ganache", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Draw String', 'Weights', \"Sunshiine's Final Thoughts\"]", "context": "Here is the context of these images:\n. 
What you will need: \n\t\tScraps of muslin, sheer, lace, or Netting cut into 5 X 5 squares\n\t\tEmbroidery thread\n\t\tSewing needle\n\t\tScissors\n\t\tRuler\n\t\tMarking pen\n\t\tLoose tea leaves\n\t\tGlue\u00a0\n\t\tButtons\n\t\tGlass jewels\u00a0\n\t\tTea spoonPlease note: \u00a0I liked the glass jewels but buttons glued together will be sufficient weight. \u00a0\n . Cutting out the pattern:\n\t\tYou will need to cut a square piece of fabric 5X5 for each weighted tea bag.. Seams:\n\t\tFold the square in half with right sides together.\n\t\tThread the needle and sew along the bottom and the side as shown.\n\t\tTie it off.\n\t\tClip the threads.. Hemming the top:\n\t\tFold the top edge down (about the 1 1/2 inches\u00a0and:\n\t\tTurn it under the same amount again.\u00a0\n\t\tYou will sew it in the next step.\n\t\tThe pictures show one sewn and one not sewn. \u00a0. Method: This is the draw string.\n\n\t\tCarefully turn the bag right side out without disturbing the hem.\n\t\tBegin sewing at the seam ( through all the layers, \u00a0leaving the bag open as you sew,) \u00a0across the top just below the top edge.\n\t\tStitch all the way around until you meet up with the beginning stitch. \u00a0Don't trim the threads\u00a0or back stitch.\n\t\t\u00e2\u0080\u008b Mine are a little too long so I would cut them about 9 inches.\n\t\tTrim off what you don't want.\u00a0\n\t\tTie each end\u00a0in a knot.\u00a0. Method:\n\t\tSpread the draw strings out as shown.\n\t\tDab some glue on the center of the glass jewels.\n\t\tStick the string in the glue.\n\t\tCenter the button over the string and press.. Filling\u00a0the tea bag:\n\t\tFill the desired amount of tea into the bag.\n\t\tAdd 1 or 2 \u00a0glass jewels.\u00a0\n\t\tClose the bag.\nPlease note: I used 1 glass jewel but later added another one to balance it out better.\n . I have been wanting to make these weights for sometime. 
\u00a0I do not like removing the string from my tea that has fallen into the cup and how the tea bag rises to the top of the water. \u00a0These are easy to clean by shaking out the leaves, rinse\u00a0the bag under running water and let them air dry. \u00a0It reminds me of the old days. \u00a0I am\u00a0using\u00a0 these decorated cans to store them in and for gift packaging. \u00a0I thought they were very stylish. \u00a0If I were to win the Shopbot challenge I would make my own wooden buttons and many things that would require precision cutting. \u00a0I can't draw but I sure could use a pattern to cut my own stuff. \u00a0\nThank you for stopping by and do have a safe and happy December.\nSunshiine\nRead the question below and select from the following choices.\nA: Top Hem\nB: Coffee Tea Bag\nC: Mis En Place\nD: Making the Ganache", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_194_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_17.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_194_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_194_19.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Husk and Halve the 3 Large Garlic Cloves.\nB: How to Make Salsa Verde\nC: Combine and Blend\nD: How to Make Salsa", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Open the Canned Tomatoes', 'Peal the Red Bell Pepper and Jalapeno', 'Blend and Add Salt and Red Pepper Flake to Blender']", "context": "Here is the context of these images:\n. For this recipe you will need:Ingredients:\n\u00a0\u00a0 \u00a0 \u2022 1 Red Bell Pepper\n\u00a0\u00a0 \u00a0 \u2022 1 Jalapeno\n\u00a0\u00a0 \u00a0 \u2022 \u00bd White Onion\n\u00a0\u00a0 \u00a0 \u2022 3 Large Cloves of Garlic\n\u00a0\u00a0 \u00a0 \u2022 1 Handful of Cilantro\n\u00a0\u00a0 \u00a0 \u2022 42 ounces of Canned Tomatoes\n\u00a0\u00a0 \u00a0 \u2022 1 Lime\n\u00a0\u00a0 \u00a0 \u2022 Salt\n\u00a0\u00a0 \u00a0 \u2022 Crushed Red PepperUtensils:\n\u00a0\u00a0 \u00a0 \u2022 Oven\n\u00a0\u00a0 \u00a0 \u2022 Aluminum Foil\n\u00a0\u00a0 \u00a0 \u2022 Cutting Board\n\u00a0\u00a0 \u00a0 \u2022 Large Kitchen Knife\n\u00a0\u00a0 \u00a0 \u2022 Can Opener\n\u00a0\u00a0 \u00a0 \u2022 Blender/Food Processor\n\u00a0\u00a0 \u00a0 \u2022 Bowl\n\u00a0. Place the red bell pepper and the jalapeno on the aluminum foil and set on the bottom oven rack. \u00a0Turn the oven to broil. Warning: Ovens produce heat quickly so when handling the red bell pepper and jalapeno be careful to not touch inside of the oven.Note: \u00a0For this recipe we want the red bell pepper and jalapeno to be \u2018Fire Roasted\u2019 so we will place them in the oven to perform this task. 
Broiling consists of cooking directly under heat so on your oven it only turns the top burner on and cooks from this heat, in contrast to regular cooking which cooks by heating up the air inside the oven. You will need to check on the red bell pepper and jalapeno approximately every 5 minutes rotating them to a new side when one side has blistered.. Cut the white onion in half.\na) Remove outside skin and inner sprout. Warning: When using a knife keep fingers clear of blade.Note: \u00a0As a general rule when cooking, if it doesn\u2019t look like something you want to eat then throw it out. This rule applies to the onion; there is a sprout at the core of the onion remove that from the onion. Likewise the outside dry skin of an onion will alter the taste and you should remove that as well. For this recipe we will only need half of the onion unless you really like onion then add more as you see fit.\nb) Mince the onion into many fine pieces.Note: \u00a0Mincing simply means to chop into many small pieces. So go ahead and chop the onion up into many small pieces.\nc) Deposit the minced onion into the blender/food processor.\n\u00a0. Husk and Halve the 3 Large Garlic Cloves\na) Husk the Garlic.Note: \u00a0For those of you that don\u2019t know how to husk a clove of garlic, simply place the broad side of your large kitchen knife on the garlic and smash down the knife with your hand. This will break the clove and you can now remove the husk.\nb) Halve the GarlicNote: \u00a0Cut the clove of garlic in half and remove the sprout in the middle. This is typically an off white color, slender stem inside the garlic clove.\nc) Cut off the end.Note: On both of the pointed ends of the garlic a woody stem needs to be removed.\nd) Mince the garlic and add to blender.. Chop the Cilantro in Half\na) Chop the cilantro in half.Note: \u00a0Take a large handful of cilantro and place on the cutting board. 
You want to remove the bottom portion of the stem, from the leaves down and throw this away.\nb) Mince the leafy part of the cilantro and add to blender.. Remove the red bell pepper and jalapeno from the oven and place in a bowl. Cover the top of the bowl with the aluminum foil. Allow the red bell pepper and jalapeno to steam for approximately eight minutes inside covered bowl.\u00a0. Open and drain the water from the canned tomatoes then add to the blender. \u00a0Warning: Using the can opener leaves a rough edge on the can so be careful not to cut yourself holding the can lid down.. I like a stronger lime taste so I squeeze both halves of the lime into my salsa.\nTip: If you have a firm lime, microwave it for 10 seconds to soften it and make it easier to squeeze.. a) Remove the aluminum foil from the bowl and place the red bell pepper and jalapeno on the cutting board. The skin should be partially separated from the inside of the pepper.\nb) Cut the skin away from the red bell pepper and jalapeno and discard the skin. Warning: There is hot juice inside the red bell pepper and jalapeno and when you cut it open it will squirt and potentially burn you.\nc) Remove the seeds from the red bell pepper and discard the seeds.\nd) Chop the red bell pepper and jalapeno into small pieces and place in the blender.. Taste test and add salt and red pepper flake until you reach desired flavor.. Now that you have made the salsa you should have the confidence to make it again. All of the measurements for the ingredients are subject to my own specific taste and you should play around and alter them to fit your liking. The secret to great salsa begins with incorporating the right ratio of each ingredient for your taste. 
Once you know how you like it, you can make any amount following your own ratio guide.\nRead the question below and select from the following choices.\nA: Husk and Halve the 3 Large Garlic Cloves.\nB: How to Make Salsa Verde\nC: Combine and Blend\nD: How to Make Salsa", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_195_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_195_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_195_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_195_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_195_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_195_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_195_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_195_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_195_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_195_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_195_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_195_11.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Fo'shizzle Peanut Butter Cookies\nB: Add Flour\nC: Add Oats and Vanilla\nD: Ingredients", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Make the Batter', 'Chocolate Topping', 'Add the Ganache and Serve!']", "context": "Here is the context of these images:\n. For this recipe you will need:a package of brownie mix or the dry ingredients from your favorite brownie recipe1/4 cup peanut oil1/4 cup water1 cup heavy cream1 egg1 1/4 cup smooth peanut butter12 oz. chocolate chips. Preheat your oven to 325 degrees F and lightly coat a 9\"x9\" pan with peanut oil or other cooking spray.Take your package of brownie mix and add the egg, 1/4 cup heavy cream, 1/4 cup water, 1/4 cup peanut oil, and mix it all in until the batter is smooth and there are no lumps in it. 
Then take 1 cup of peanut butter and gently fold it into the batter. You don't want to mix it in all the way, but just mix it enough so that it's evenly distributed. The light color of the peanut butter looks beautiful marbled with the dark color of the brownie batter.. Put the batter in your prepared pan in an even layer and bake for 40-50 minutes at 325 degrees. Check after 40 minutes and if the middle is still really really soup-y put them in for 10 more minutes.Once it's done, remove from oven and set aside to cool.. The chocolate ganache layer on top really makes this dessert special. It might looks intimidating, but it's super easy. Here we go!Take the rest of your heavy cream and put it in a small saucepan over low-medium heat and wait for it to boil. Keep a close eye on it because you don't want it to burn or boil over. As soon as the cream starts boiling, add it to the package of chocolate chips in a heat safe bowl and whisk it together until all the chocolate chips have disolved and the mixture is smooth. It will look kind of weird at first, but that's ok, just keep whisking!. Once your ganache is all smooth, pour it over your cooled brownies and smooth it into an even layer. To make the pretty pattern, take the rest of your peanut butter and put it in the microwave for 20-30 seconds. Use a piping bag or even just a measuring cup with a spout to pour thin lines of liquid peanut butter diagonally across the pan. Then take a toothpick or skewer and drag through the lines of peanut butter in the opposite diagonal. Then do it again in the other direction to get the chevron pattern. Set it aside to cool a bit and let the ganache solidify a bit and you're done! 
Serve them and impress your friends and and win over your enemies!\nRead the question below and select from the following choices.\nA: Fo'shizzle Peanut Butter Cookies\nB: Add Flour\nC: Add Oats and Vanilla\nD: Ingredients", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_196_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_196_16.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Orange and Peach Trifle\nB: Pie Assembly\nC: Relax, Eat, and Enjoy!!!\nD: Pie Assembly", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['Ingredients', 'Pie Crust and Maceration', 'Filling the Tarts', '@placeholder']", "context": "Here is the context of these images:\n. 
1 Single Pie Crust Rolled Flat (I made mine using this recipe, A Healthier Flaky Double Pie Crust w/ Coconut Oil) 4 Ripe Peaches - Pared and Sliced1/4 Cup Granulated Sugar1/4 Cup Brown Sugar 1/2 Tablespoon Lemon Juice1 Table spoon Cornstarch 1/16 Teaspoon Salt (literally a pinch)* For the pie crust, because I only needed a single crust, so I divided the ingredients by 50% and it worked like a charm. . I highly recommend making your own crust, for this recipe. It's super easy and 100% worth it in the end, plus there are tons of great recipes out there. I use A Healthier Flaky Double Pie Crust w/ Coconut Oil for all of my pies and tarts. It takes 5 minutes to pull together and only an hour to chill. The final results are melt in your mouth flaky.While the pie crust is chilling, combine the sliced peaches, granulated sugar, and brown sugar in a medium sized bowl. Cover with plastic wrap and set it in the refrigerator for 1 hour.* I call this prep work, because both of these items will need to be done, well in advance and will need to sit for at least an hour. ** Every 20 minutes or so, I give the peaches a toss in the sugar mixture as they macerate.. Once the pie crust has chilled, remove it and let the dough disk rest on a floured surface for about 10 minutes.Gently, roll out the dough evenly in all directions to form a 14\"x14\" sheet. Now cut 6 - 6\" rounds out of the dough sheet (I used the lid of a pot, which worked great, however any round object will do as a template). 
You may need to reshape the dough disk and roll it out a second time, to get all 6 rounds.Next, place each of the dough rounds in to the muffin pan, carefully working each round into the shape of the cup (if any small holes develop, you can easily patch them with a small piece of the the scrap dough).Finally, to top the tarts, using a small cookie cutter, stamp out 6 shapes (be creative, there are a million cool cookie cutters out there...I used a star shape) and transfer them to a foil lined baking sheet.Cover both the baking sheet and the muffin pan with plastic wrap and put them back into the refrigerator to chill.. Remove the macerated peach slices from the refrigerator and drain well, reserving the liquid in a medium sized pot and returning the peach slices to the bowl. Next, add lemon juice, cornstarch, and salt to the pot with the reserved peach juice. Bring to a rolling boil over medium heat, stirring constantly until the mixture begins to thicken (5-6 minutes).Once thickened to your desired consistency (I stir for about 10-12 minutes) , pour it back into the peach slices and stir until combined.. Preheat the oven to 425 degrees and move the oven rack to the lowest position.Next, spoon the peach tart filling into the prepared crusts and top with your decoration of choice.Finally, bake at 425 degrees until the edge of the crusts are a light golden brown. Reduce the temperature to 375 degrees and continue baking until the edge crusts are golden brown. . Remove the tarts from the oven and allow to cool in the pan for 4-5 minutes, until set. Then remove the tarts from the muffin pan and cool on a wire rack for 1 hour. Now it's time to sit back, relax, and enjoy. I promise these tarts will not disappoint. Plus, they are the perfect size to hold 1 scoop of vanilla ice cream on top, for a prefect peaches and cream experience. Warm or cold, they are delectable...In fact, I wish I had one right now (seriously). I really hope you've enjoyed this Instructable! 
Happy Baking!!!Just a note: Please, when ever possible, support your local growers. Not everyone is lucky enough to have access to locally grown produce, if you do, it's important to help keep it alive. Thanks! \nRead the question below and select from the following choices.\nA: Orange and Peach Trifle\nB: Pie Assembly\nC: Relax, Eat, and Enjoy!!!\nD: Pie Assembly", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_197_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_13.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_197_17.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Simple Caramel Apples\nB: Materials & Ingredients\nC: Caramel Sauce!\nD: Sift 1/2 Cup Powdered Sugar, Then Stir Into the Sauce.", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Apple Prep', 'The Caramel', 'Paint It Black']", "context": "Here is the context of these images:\n. 
-Apples-3-4 drops black food gel-1 bag of caramels-1/4-1/2 cup of cream-SticksThis recipe is for 6-8 small apples. . Wash your apples as you normally would and dry them. If you like, you can also use a piece of clean sandpaper and lightly sand the outside of the apples---this is just for better caramel adherence. Remove any stems and stab your apples with whatever sticks you plan to use. I found these great Halloween themed candy sticks at Michaels and had to get them. They come in brown or black. The green and white sticks I found at Walmart for $1.84. While these do appear fairly sturdy, pre-stab your apple before putting the stick in, otherwise the stick will bend. . You could make your own, but I find buying a bag of caramels to be easier and less time consuming. Unwrap all of the caramels and put into a pot. I used my smallest pot for this, figuring it would be easier to keep things from getting messy. Add cream. The more cream you add the thinner your caramel coating. Thin or thick, there's no wrong way to do this, it's just preference. Additionally, the thinner your caramel, the more there is to go around. I cooked the caramel over medium heat, stirring often and rarely taking my eyes off of the pot. . As soon as all the caramels seemed to be melted---maybe 5 minutes, I added the black food gel.A little goes a long way with black food gel, start with a little and add more as needed. ***Warning: too much gel and your mouth will turn black***One by one, take one of your apples by the stick and carefully swirl it around in the pot of caramel. I left the burner on low, so the caramel wouldn't cool off too quickly. Once you are satisfied with the caramel coverage on your apple, let the excess drip off (or have a friend help you wipe it off) and place on a sheet of parchment paper. 
Continue until all of your apples and coated.\nRead the question below and select from the following choices.\nA: Simple Caramel Apples\nB: Materials & Ingredients\nC: Caramel Sauce!\nD: Sift 1/2 Cup Powdered Sugar, Then Stir Into the Sauce.", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_198_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_198_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_198_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_198_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_198_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_198_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_198_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_198_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_198_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_198_9.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "textual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: Omlette in a Bag!\nB: Other Ingredients\nC: Pate a Choux\nD: Mise En Place!", "question": "Choose the best title for the @placeholder to correctly complete the recipe.['@placeholder', 'Making the Spice Paste', 'Casing', 'Fire']", "context": "Here is the context of these images:\n. Five-Spice is one of the main ingredients in making Beggar's Chicken. It is a roasted, powdered blend of spices like Cinnamon, Fennel seeds, Clove, Star Anise and Sichuan Pepper. It is mainly used in Chinese cooking as well in most of the Asian countries. All these spices are growing in our area too. Though I got all raw spices, I could get Sichuan pepper in powdered form only. Thanks to\u00a0lmnopeas for her instructable on \"Chinese Five Spice Powder\" at:\u00a0https://www.instructables.com/id/Chinese-Five-Spice-Powder/ I followed her instructable in making the Five-spice powder for the Beggar's Chicken recipe.. 
\n Ingredients required: I have slightly modified the ingredients required for the original Beggar's Chicken Recipe. The following are the ingredients I have used: One kg of Chicken cleaned Two tablespoons of Five-Spice Powder Three tablespoons of Soy Sauce Four medium sized Onions One full Garlic bulb One inch Ginger piece Three tablespoons of cooking oil One\u00a0tablespoon Sugar Salt to taste One\u00a0tablespoon Red Chilly powder Two Tomatoes Four Green Chilies (for the aroma) . Making the Spice Paste Peeled Ginger and Garlic and Made\u00a0paste in a Mixer / Grinder Peeled Onion and made\u00a0a paste Pureed two tomatoes Mixed all ingredients including Five-Spice Powder,\u00a0Soy sauce, Sugar, Salt, Cooking oil and Red Chilly Powder together \u00a0along with above pastes in a bowl (other than the Green Chilies) The spice paste is ready.\u00a0I have not added any water to the mix.. Placed the cleaned chicken in a large sized bowl and applied the spice paste. Most of the spice paste went between the skin and flesh. Also applied the spice paste on the outer and inner sides of the chicken. Sliced green chilies were inserted behind the skin for added aroma.\u00a0 Now the chicken is ready to be wrapped.. The Chinese version of the recipe caters for a Lotus leaf for wrapping the spiced chicken. Here I have used a large sized tender Banana leaf in lieu of the Lotus leaf as\u00a0wrapping with banana leaves may confer an aroma to the chicken. The fresh Banana leaf tends to tear down in places while wrapping. Curing the leaf over low flame will prevent it from tearing down. Placed the spiced chicken in the cured Banana leaf, wrapped it around the chicken and tied it with cotton thread. The spiced chicken is now ready for Mud-casing. Mud-Casing the wrapped chicken is done prior to roasting it in open fire. I have dug out some red soil about six inches below the surface after clearing the top soil from our garden. 
Added little water to the soil and applied the casing over the banana leaf wrapped spiced chicken. Now we need to collect fire wood and make an open fire to roast it.. We made an open fire with old wood collected from the garden. Dried out leaves of coconut tree also helped us to keep the fire going. Initially, we placed the mud casing over an aluminium plate on a metal stand. When the bottom side of the casing was almost hard, we removed the plate and roasted it on open fire. Then turned it upside down to make it cooked evenly on all sides. It took us about two hours to roast and make the outer mud casing hard on all sides.. Removed the roasted Chicken from the fire and allowed it to cool for some time, then broke the hardened mud-casing. Untied the cotton thread and removed the banana leaf wrapping from around the cooked chicken. The roasted chicken was rightly done, neither overdone nor under-cooked. Now the Beggar's Chicken is ready to be served.\nRead the question below and select from the following choices.\nA: Omlette in a Bag!\nB: Other Ingredients\nC: Pate a Choux\nD: Mise En Place!", "input_image_path": ["./Discrete-temporal/textual_cloze/textual_cloze_199_0.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_1.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_2.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_3.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_4.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_5.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_6.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_7.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_8.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_9.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_10.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_11.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_12.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_13.jpg", 
"./Discrete-temporal/textual_cloze/textual_cloze_199_14.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_15.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_16.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_17.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_18.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_19.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_20.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_21.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_22.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_23.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_24.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_25.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_26.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_27.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_28.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_29.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_30.jpg", "./Discrete-temporal/textual_cloze/textual_cloze_199_31.jpg"], "output": "B", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/threeD_Depth_Estimation/qwen3-vl/metadata_info.json b/results/threeD_Depth_Estimation/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..ae0cbff --- /dev/null +++ b/results/threeD_Depth_Estimation/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.891251, 0.378307, -0.25011], [0.443048, 0.608538, -0.658323], [-0.096846, -0.697542, -0.709969]] and translation vector: [4.935522, 3.588868, 1.45033], and the second color image view of the same scene with the 
corresponding camera pose, i.e., rotation matrix: [[-0.887006, 0.383874, -0.256633], [0.452131, 0.60913, -0.651566], [-0.093796, -0.693975, -0.713864]] and translation vector: [4.940225, 3.582454, 1.45688], please estimate the depth map for the first view of the RGB image. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_0_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_0_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_0_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_0_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_0_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_0_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.997112, 0.02462, 0.071841], [-0.04661, 0.548461, -0.834876], [-0.059957, -0.835814, -0.545729]] and translation vector: [4.834615, 3.436689, 1.398379], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.998397, 0.025746, 0.050402], [-0.028149, 0.546702, -0.836854], [-0.0491, -0.836932, -0.545101]] and translation vector: [4.839047, 3.434593, 1.400064], please estimate the depth map for the 
first view of the RGB image. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_1_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_1_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_1_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_1_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_1_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_1_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.485844, -0.617081, 0.619005], [-0.873216, -0.311825, 0.374512], [-0.038083, -0.722479, -0.690343]] and translation vector: [-0.164865, 3.073333, 1.323993], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.482952, -0.621872, 0.616468], [-0.874972, -0.315096, 0.367612], [-0.034361, -0.716931, -0.696297]] and translation vector: [-0.16601, 3.069565, 1.320265], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_2_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_2_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_2_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_2_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_2_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_2_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.934582, -0.143102, 0.325696], [-0.355737, 0.383069, -0.852473], [-0.002774, -0.912568, -0.408916]] and translation vector: [2.694367, 2.483235, 1.465763], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.935747, -0.141154, 0.323191], [-0.352667, 0.379116, -0.85551], [-0.001768, -0.91452, -0.404537]] and translation vector: [2.694351, 2.483417, 1.465522], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_3_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_3_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_3_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_3_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_3_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_3_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.08083, -0.463089, 0.882618], [-0.994842, 0.091929, -0.042874], [-0.061284, -0.881531, -0.468131]] and translation vector: [4.543997, 3.147744, 1.235262], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.097623, -0.477164, 0.873375], [-0.993778, 0.094019, -0.059714], [-0.05362, -0.873771, -0.483373]] and translation vector: [4.550471, 3.148599, 1.246367], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_4_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_4_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_4_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_4_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_4_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_4_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.877021, 0.121711, -0.464779], [0.46491, 0.459041, -0.75706], [0.12121, -0.880038, -0.459173]] and translation vector: [3.922419, 3.230202, 1.747047], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.876473, 0.11975, -0.466322], [0.465798, 0.455895, -0.758415], [0.121773, -0.881941, -0.455359]] and translation vector: [3.923546, 3.227255, 1.740959], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_5_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_5_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_5_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_5_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_5_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_5_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.246516, -0.470365, 0.847341], [-0.959136, 0.006886, 0.282862], [-0.138884, -0.882445, -0.449446]] and translation vector: [3.043058, 2.955299, 1.551102], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.243276, -0.470143, 0.8484], [-0.960213, 0.006937, 0.279182], [-0.13714, -0.882563, -0.44975]] and translation vector: [3.042024, 2.954946, 1.550413], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_6_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_6_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_6_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_6_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_6_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_6_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.236277, -0.452541, 0.859872], [-0.970097, 0.160455, -0.182119], [-0.055554, -0.877189, -0.47692]] and translation vector: [1.575898, 1.961144, 1.314442], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.238966, -0.451212, 0.859828], [-0.9694, 0.162109, -0.184349], [-0.056205, -0.87757, -0.476143]] and translation vector: [1.575219, 1.960128, 1.313122], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_7_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_7_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_7_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_7_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_7_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_7_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.633294, -0.360819, 0.684652], [-0.773758, -0.312806, 0.550863], [0.015401, -0.878613, -0.477285]] and translation vector: [3.241882, 3.386626, 1.367882], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.618852, -0.359339, 0.698497], [-0.785116, -0.311057, 0.535572], [0.02482, -0.87984, -0.47462]] and translation vector: [3.234923, 3.400149, 1.365622], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_8_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_8_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_8_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_8_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_8_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_8_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.974605, -0.106498, 0.196986], [-0.223762, -0.428932, 0.875185], [-0.008712, -0.897037, -0.44187]] and translation vector: [2.006689, 0.552817, 1.711334], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.976991, -0.101609, 0.187523], [-0.213093, -0.42809, 0.878254], [-0.008962, -0.898006, -0.439892]] and translation vector: [2.014877, 0.551422, 1.700123], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_9_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_9_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_9_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_9_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_9_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_9_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.874867, -0.0675, 0.479638], [-0.482919, 0.197999, -0.852987], [-0.037391, -0.977875, -0.205819]] and translation vector: [2.397274, 1.722858, 1.486845], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.874077, -0.063653, 0.4816], [-0.484123, 0.196153, -0.852731], [-0.040189, -0.978505, -0.202269]] and translation vector: [2.402604, 1.721845, 1.489477], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_10_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_10_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_10_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_10_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_10_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_10_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.606497, 0.359513, -0.709163], [0.793947, -0.321582, 0.515978], [-0.042553, -0.875977, -0.480473]] and translation vector: [5.898605, 1.464963, 1.329018], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.603336, 0.358994, -0.712116], [0.79647, -0.316333, 0.515334], [-0.040264, -0.878098, -0.476783]] and translation vector: [5.91512, 1.4588, 1.326343], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_11_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_11_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_11_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_11_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_11_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_11_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.810147, -0.229725, 0.539341], [-0.586224, 0.314131, -0.746769], [0.002128, -0.921167, -0.389162]] and translation vector: [3.108561, 2.950706, 1.466118], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.798041, -0.241673, 0.552019], [-0.602539, 0.306626, -0.736836], [0.00881, -0.920638, -0.390318]] and translation vector: [3.094201, 2.939754, 1.46817], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_12_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_12_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_12_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_12_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_12_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_12_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.994136, 0.036629, -0.101745], [0.107123, -0.462198, 0.880283], [-0.014782, -0.88602, -0.463411]] and translation vector: [3.8191, 1.340951, 1.354002], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.994264, 0.034625, -0.101195], [0.105882, -0.452335, 0.885541], [-0.015112, -0.891176, -0.453407]] and translation vector: [3.821174, 1.339834, 1.359098], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_13_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_13_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_13_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_13_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_13_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_13_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.199941, 0.263531, -0.943703], [0.979453, -0.027844, 0.19974], [0.026362, -0.964249, -0.263683]] and translation vector: [3.611549, 3.757055, 1.562045], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.20075, 0.267793, -0.94233], [0.97934, -0.030969, 0.199834], [0.024331, -0.962979, -0.268477]] and translation vector: [3.608934, 3.756757, 1.557843], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_14_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_14_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_14_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_14_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_14_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_14_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.24604, -0.551346, 0.797171], [-0.968826, -0.115295, 0.219278], [-0.028988, -0.826271, -0.562526]] and translation vector: [1.704247, 2.057158, 1.361636], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.236706, -0.55071, 0.800431], [-0.971342, -0.115817, 0.207564], [-0.021604, -0.826623, -0.562342]] and translation vector: [1.70792, 2.062619, 1.364929], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_15_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_15_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_15_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_15_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_15_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_15_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.695296, -0.421579, 0.582095], [-0.717067, -0.351947, 0.601622], [-0.048765, -0.835707, -0.547007]] and translation vector: [2.470866, 0.652559, 1.473924], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.695871, -0.418819, 0.583399], [-0.716734, -0.353708, 0.600986], [-0.045352, -0.83635, -0.546317]] and translation vector: [2.469546, 0.651931, 1.473078], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_16_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_16_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_16_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_16_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_16_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_16_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.937403, 0.174354, -0.301457], [0.34768, 0.517889, -0.781607], [0.019845, -0.837491, -0.54609]] and translation vector: [1.513881, 1.499843, 1.388066], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.93698, 0.17766, -0.300842], [0.348874, 0.522274, -0.77815], [0.018876, -0.834067, -0.551341]] and translation vector: [1.515168, 1.503997, 1.385631], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_17_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_17_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_17_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_17_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_17_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_17_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.693623, 0.392298, -0.604144], [0.720137, 0.397492, -0.568686], [0.017048, -0.82952, -0.558217]] and translation vector: [2.706242, 2.586761, 1.453005], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.690051, 0.396658, -0.605386], [0.723517, 0.399766, -0.56277], [0.018785, -0.826347, -0.562848]] and translation vector: [2.704536, 2.590014, 1.45316], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_18_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_18_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_18_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_18_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_18_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_18_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.530794, 0.426739, -0.732224], [0.841151, 0.159702, -0.516681], [-0.10355, -0.890162, -0.443721]] and translation vector: [5.418979, 4.373359, 1.385162], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.532043, 0.421439, -0.734384], [0.841755, 0.169492, -0.512564], [-0.091542, -0.890877, -0.444925]] and translation vector: [5.415919, 4.39552, 1.38299], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_19_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_19_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_19_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_19_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_19_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_19_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.484778, 0.389748, -0.782998], [0.874059, -0.248441, 0.417491], [-0.031813, -0.886777, -0.461102]] and translation vector: [2.948564, 2.712566, 1.480667], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.484062, 0.388161, -0.784229], [0.874419, -0.248162, 0.416902], [-0.03279, -0.887551, -0.459542]] and translation vector: [2.949191, 2.711738, 1.477649], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_20_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_20_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_20_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_20_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_20_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_20_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.998134, -0.025826, -0.055325], [0.04389, 0.326427, -0.944203], [0.042444, -0.94487, -0.324684]] and translation vector: [2.355182, 2.984659, 1.395898], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.998605, -0.022906, -0.047579], [0.037628, 0.323493, -0.945482], [0.037048, -0.945953, -0.32218]] and translation vector: [2.345251, 2.98743, 1.391141], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_21_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_21_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_21_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_21_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_21_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_21_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.473704, -0.275929, 0.836342], [-0.879436, -0.198746, 0.432542], [0.046868, -0.940406, -0.336809]] and translation vector: [2.984934, 2.048073, 1.446683], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.466625, -0.271085, 0.841888], [-0.8831, -0.195475, 0.426525], [0.048943, -0.942498, -0.330608]] and translation vector: [2.979092, 2.049407, 1.446378], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_22_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_22_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_22_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_22_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_22_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_22_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.566304, -0.590941, 0.574533], [-0.823945, 0.423135, -0.376925], [-0.020365, -0.686838, -0.726526]] and translation vector: [2.143516, 1.760119, 1.343188], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.561614, -0.596242, 0.57366], [-0.827171, 0.420904, -0.372329], [-0.019457, -0.683619, -0.729579]] and translation vector: [2.147258, 1.761594, 1.344016], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_23_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_23_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_23_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_23_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_23_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_23_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.054781, -0.427281, 0.902458], [-0.998013, -0.051617, 0.036143], [0.031139, -0.902644, -0.429259]] and translation vector: [1.328526, 0.849821, 1.501181], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.086578, -0.407933, 0.908898], [-0.995883, -0.060028, 0.067922], [0.026852, -0.911036, -0.41145]] and translation vector: [1.314662, 0.836147, 1.492068], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_24_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_24_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_24_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_24_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_24_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_24_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.941243, -0.209403, 0.264975], [-0.336113, 0.504116, -0.795548], [0.033012, -0.837865, -0.544878]] and translation vector: [4.828751, 9.008894, 1.463441], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.939528, -0.206646, 0.273103], [-0.341818, 0.516505, -0.785101], [0.021179, -0.830976, -0.555906]] and translation vector: [4.819307, 9.009376, 1.463735], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_25_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_25_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_25_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_25_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_25_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_25_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.68967, 0.288211, -0.664297], [0.724122, -0.27239, 0.633602], [0.001663, -0.918008, -0.396559]] and translation vector: [2.530043, 2.005069, 1.437417], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.68921, 0.288518, -0.66464], [0.724561, -0.273014, 0.632831], [0.001127, -0.917726, -0.397212]] and translation vector: [2.5334, 2.008455, 1.44069], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_26_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_26_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_26_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_26_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_26_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_26_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.994446, -0.078697, 0.06988], [-0.104992, -0.787844, 0.606859], [0.007297, -0.610826, -0.791731]] and translation vector: [1.305105, 0.510448, 1.183315], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.994112, -0.083607, 0.068931], [-0.10831, -0.785774, 0.608956], [0.003251, -0.612836, -0.790203]] and translation vector: [1.308194, 0.508844, 1.184721], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_27_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_27_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_27_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_27_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_27_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_27_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.928375, -0.17783, 0.326339], [-0.371449, 0.415395, -0.830345], [0.012101, -0.892089, -0.451697]] and translation vector: [2.096006, 1.919092, 1.36174], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.929206, -0.177937, 0.323905], [-0.369314, 0.414969, -0.83151], [0.013546, -0.892266, -0.451307]] and translation vector: [2.095672, 1.922099, 1.363168], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_28_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_28_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_28_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_28_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_28_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_28_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.684823, -0.326379, 0.651532], [-0.728707, -0.304485, 0.613413], [-0.001823, -0.894855, -0.446353]] and translation vector: [2.86358, 2.414664, 1.549631], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.684506, -0.325468, 0.652321], [-0.729004, -0.308374, 0.611113], [0.002261, -0.893855, -0.448351]] and translation vector: [2.864701, 2.413023, 1.547001], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_29_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_29_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_29_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_29_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_29_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_29_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.928108, -0.125197, 0.35063], [-0.371823, 0.3599, -0.855699], [-0.019061, -0.924553, -0.380577]] and translation vector: [5.296664, 4.137775, 1.856988], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.930637, -0.119308, 0.34595], [-0.365378, 0.355543, -0.860284], [-0.020361, -0.927014, -0.374474]] and translation vector: [5.29653, 4.126579, 1.856014], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_30_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_30_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_30_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_30_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_30_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_30_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.399387, 0.327689, -0.856218], [0.9115, 0.041819, -0.409169], [-0.098274, -0.94386, -0.315391]] and translation vector: [4.88233, 2.963563, 1.403722], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.394763, 0.316878, -0.86241], [0.913367, 0.033579, -0.40575], [-0.099614, -0.947872, -0.302681]] and translation vector: [4.88409, 2.965299, 1.400614], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_31_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_31_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_31_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_31_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_31_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_31_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.119369, -0.433868, 0.893034], [-0.990549, 0.113242, -0.077387], [-0.067553, -0.893832, -0.443285]] and translation vector: [3.407035, 4.679209, 1.397058], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.120544, -0.432859, 0.893366], [-0.990306, 0.115004, -0.077902], [-0.06902, -0.894096, -0.442526]] and translation vector: [3.401289, 4.681283, 1.397495], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_32_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_32_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_32_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_32_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_32_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_32_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.264492, -0.222038, 0.938479], [-0.962334, 0.002714, 0.271857], [-0.062909, -0.975034, -0.212957]] and translation vector: [0.925816, 4.784833, 1.497389], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.263009, -0.220134, 0.939344], [-0.962729, 0.003779, 0.270443], [-0.063084, -0.975462, -0.210935]] and translation vector: [0.925807, 4.784041, 1.498483], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_33_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_33_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_33_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_33_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_33_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_33_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.711391, -0.463973, 0.527875], [-0.700286, 0.531398, -0.476672], [-0.059349, -0.708763, -0.702945]] and translation vector: [2.53321, 4.394931, 1.530427], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.710702, -0.465347, 0.527594], [-0.701175, 0.5294, -0.477586], [-0.057065, -0.709357, -0.702536]] and translation vector: [2.526067, 4.393322, 1.526345], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_34_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_34_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_34_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_34_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_34_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_34_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.524333, 0.441188, -0.728305], [0.848808, -0.202677, 0.488311], [0.067827, -0.874228, -0.480754]] and translation vector: [3.10696, 1.250425, 1.344077], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.531491, 0.437044, -0.72561], [0.844432, -0.205894, 0.494513], [0.066725, -0.875557, -0.478485]] and translation vector: [3.107462, 1.25329, 1.344278], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_35_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_35_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_35_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_35_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_35_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_35_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.205964, -0.505778, 0.837716], [-0.978495, 0.11627, -0.170378], [-0.011228, -0.854792, -0.518849]] and translation vector: [2.901534, 4.292832, 1.280844], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.204012, -0.504726, 0.838827], [-0.978841, 0.118998, -0.166463], [-0.0158, -0.855039, -0.518324]] and translation vector: [2.909629, 4.290413, 1.285823], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_36_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_36_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_36_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_36_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_36_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_36_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.999847, -0.004634, 0.01689], [-0.017397, -0.374134, 0.927211], [0.002023, -0.927363, -0.374157]] and translation vector: [3.310194, 3.16458, 1.506432], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.999774, -0.010896, 0.018284], [-0.021018, -0.369724, 0.928904], [-0.003361, -0.929078, -0.369869]] and translation vector: [3.316631, 3.168954, 1.519748], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_37_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_37_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_37_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_37_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_37_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_37_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.951558, 0.16536, -0.259218], [0.307283, -0.481983, 0.820531], [0.010744, -0.860436, -0.509446]] and translation vector: [2.919862, 3.428013, 1.521081], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.951326, 0.167996, -0.258374], [0.307875, -0.4803, 0.821295], [0.013877, -0.860866, -0.508643]] and translation vector: [2.920042, 3.428186, 1.518811], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_38_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_38_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_38_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_38_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_38_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_38_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.567127, -0.123224, 0.81436], [-0.823556, -0.071568, 0.562702], [-0.011056, -0.989795, -0.14207]] and translation vector: [0.249561, 0.967409, 1.634127], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.566682, -0.123694, 0.814599], [-0.82386, -0.07149, 0.562268], [-0.011313, -0.989742, -0.142418]] and translation vector: [0.249762, 0.967631, 1.633273], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_39_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_39_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_39_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_39_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_39_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_39_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.464707, 0.496079, -0.733453], [0.882598, 0.326106, -0.338639], [0.071191, -0.804711, -0.589382]] and translation vector: [2.864701, 0.868861, 1.204561], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.473617, 0.501904, -0.723726], [0.878064, 0.332992, -0.343688], [0.068496, -0.798254, -0.598414]] and translation vector: [2.869803, 0.866998, 1.20304], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_40_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_40_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_40_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_40_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_40_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_40_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.943065, -0.17817, 0.280864], [-0.332105, 0.550897, -0.765649], [-0.018311, -0.815333, -0.578703]] and translation vector: [2.74599, 1.673222, 1.294065], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.942639, -0.173012, 0.285478], [-0.332909, 0.550136, -0.765848], [-0.024551, -0.816957, -0.576177]] and translation vector: [2.737266, 1.663808, 1.300966], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_41_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_41_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_41_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_41_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_41_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_41_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.590232, -0.352789, 0.726062], [-0.807221, -0.252962, 0.533296], [-0.004475, -0.900861, -0.434086]] and translation vector: [2.518124, 2.463328, 1.346668], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.586587, -0.358769, 0.726086], [-0.809845, -0.250747, 0.530356], [-0.008212, -0.899117, -0.437632]] and translation vector: [2.520116, 2.462175, 1.344964], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_42_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_42_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_42_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_42_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_42_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_42_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.454685, 0.144673, -0.878824], [0.890085, 0.109034, -0.442562], [0.031795, -0.983454, -0.178347]] and translation vector: [3.311996, 2.119304, 1.59409], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.453171, 0.138778, -0.880555], [0.890847, 0.10604, -0.441756], [0.032068, -0.98463, -0.171684]] and translation vector: [3.314367, 2.120091, 1.591769], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_43_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_43_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_43_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_43_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_43_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_43_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.86482, -0.183466, 0.467362], [-0.501092, -0.256948, 0.826368], [-0.031523, -0.948851, -0.314147]] and translation vector: [3.012278, 2.022242, 1.442339], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.863867, -0.189194, 0.466839], [-0.502557, -0.260784, 0.824274], [-0.034203, -0.946677, -0.320364]] and translation vector: [3.015002, 2.018446, 1.436262], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_44_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_44_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_44_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_44_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_44_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_44_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.880278, -0.246293, 0.405524], [-0.473973, 0.417832, -0.775091], [0.021459, -0.874503, -0.484545]] and translation vector: [3.281806, 2.754624, 1.352781], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.883446, -0.241464, 0.401521], [-0.467927, 0.41107, -0.782347], [0.023856, -0.879043, -0.476146]] and translation vector: [3.2823, 2.745028, 1.352692], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_45_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_45_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_45_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_45_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_45_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_45_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.133825, -0.39571, 0.908573], [-0.990975, -0.046263, 0.125813], [-0.007752, -0.91721, -0.398329]] and translation vector: [4.990516, 4.227292, 1.32289], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.168071, -0.388121, 0.906153], [-0.985699, -0.054747, 0.159375], [-0.012247, -0.919981, -0.391772]] and translation vector: [4.987841, 4.19209, 1.32312], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_46_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_46_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_46_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_46_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_46_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_46_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.48142, 0.335029, -0.809933], [0.872625, 0.096524, -0.478757], [-0.08222, -0.937251, -0.338823]] and translation vector: [4.429162, 2.287411, 1.464776], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.484328, 0.331289, -0.809737], [0.871134, 0.09698, -0.481374], [-0.080946, -0.938532, -0.335568]] and translation vector: [4.432656, 2.285767, 1.465956], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_47_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_47_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_47_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_47_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_47_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_47_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.672393, -0.274439, 0.687438], [-0.739855, -0.221079, 0.635404], [-0.022402, -0.935846, -0.351697]] and translation vector: [3.802358, 2.110255, 1.494557], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.672432, -0.275262, 0.687071], [-0.739825, -0.222066, 0.635095], [-0.022242, -0.93537, -0.35297]] and translation vector: [3.806542, 2.108163, 1.497405], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_48_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_48_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_48_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_48_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_48_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_48_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.176261, -0.039155, 0.983564], [-0.983722, -0.028492, -0.177423], [0.03497, -0.998827, -0.033496]] and translation vector: [3.054739, 2.437738, 1.503838], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.18153, -0.048874, 0.98217], [-0.982778, -0.026092, -0.182941], [0.034567, -0.998464, -0.043296]] and translation vector: [3.061021, 2.450195, 1.498681], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_49_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_49_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_49_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_49_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_49_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_49_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.467192, 0.317292, -0.825262], [0.883302, -0.126478, 0.451421], [0.038855, -0.939856, -0.339354]] and translation vector: [2.723032, 3.168159, 1.438168], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.467636, 0.312306, -0.826911], [0.883318, -0.130557, 0.450227], [0.03265, -0.940968, -0.336919]] and translation vector: [2.722188, 3.168039, 1.441817], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_50_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_50_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_50_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_50_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_50_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_50_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.51864, -0.44867, 0.727811], [-0.853934, -0.229463, 0.467059], [-0.04255, -0.863738, -0.502143]] and translation vector: [1.002297, 1.98866, 1.344191], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.519607, -0.444592, 0.729621], [-0.853432, -0.229314, 0.468049], [-0.040778, -0.865883, -0.498582]] and translation vector: [1.000441, 1.985865, 1.344846], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_51_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_51_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_51_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_51_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_51_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_51_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.688084, 0.423256, -0.589401], [0.725514, -0.415863, 0.54835], [-0.013017, -0.80493, -0.593227]] and translation vector: [3.968163, 0.8771, 1.421607], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.688048, 0.420794, -0.591205], [0.725576, -0.411726, 0.551381], [-0.011397, -0.80834, -0.588605]] and translation vector: [3.964529, 0.870938, 1.417962], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_52_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_52_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_52_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_52_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_52_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_52_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.042655, 0.409797, -0.911179], [0.998036, -0.024411, -0.0577], [-0.045888, -0.91185, -0.40795]] and translation vector: [2.423933, 1.356295, 3.282493], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.032887, 0.418885, -0.907444], [0.998611, -0.023628, -0.047098], [-0.041169, -0.907732, -0.417526]] and translation vector: [2.425306, 1.358764, 3.278826], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_53_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_53_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_53_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_53_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_53_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_53_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.925351, 0.122106, -0.358909], [0.376741, 0.190476, -0.906524], [-0.042329, -0.974068, -0.222259]] and translation vector: [4.735593, 2.732706, 1.21643], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.924788, 0.125024, -0.359357], [0.377675, 0.187086, -0.906841], [-0.046146, -0.974355, -0.220234]] and translation vector: [4.740286, 2.733964, 1.218072], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_54_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_54_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_54_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_54_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_54_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_54_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.908726, 0.150598, -0.389277], [0.406624, 0.108936, -0.907078], [-0.094198, -0.982575, -0.16023]] and translation vector: [8.822721, 3.830595, 1.476402], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.908663, 0.151907, -0.388916], [0.40641, 0.108245, -0.907256], [-0.09572, -0.98245, -0.160095]] and translation vector: [8.818814, 3.832555, 1.475788], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_55_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_55_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_55_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_55_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_55_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_55_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.984594, -0.069457, 0.160469], [-0.174127, -0.305795, 0.936039], [-0.015944, -0.949561, -0.313178]] and translation vector: [3.941113, 2.817773, 1.559826], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.984592, -0.069572, 0.160429], [-0.174152, -0.307406, 0.935507], [-0.015768, -0.949032, -0.314785]] and translation vector: [3.94407, 2.817183, 1.553188], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_56_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_56_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_56_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_56_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_56_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_56_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.677945, 0.409221, -0.610679], [0.735109, 0.38004, -0.561413], [0.00234, -0.829523, -0.558468]] and translation vector: [3.092599, 2.044437, 1.437429], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.678782, 0.408186, -0.610442], [0.734335, 0.380383, -0.562193], [0.002723, -0.829875, -0.557943]] and translation vector: [3.0892, 2.043949, 1.440375], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_57_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_57_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_57_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_57_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_57_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_57_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.330673, -0.328207, 0.884837], [-0.942686, -0.070458, 0.326157], [-0.044703, -0.941975, -0.332694]] and translation vector: [3.753276, 4.481459, 1.345242], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.306694, -0.326667, 0.893995], [-0.950878, -0.063631, 0.302957], [-0.04208, -0.942995, -0.330136]] and translation vector: [3.754864, 4.497246, 1.34429], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_58_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_58_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_58_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_58_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_58_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_58_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.305635, -0.390507, 0.868385], [-0.952144, 0.122302, -0.280116], [0.003183, -0.91244, -0.409198]] and translation vector: [4.266061, 1.773856, 1.285079], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.300987, -0.399102, 0.866097], [-0.953628, 0.125052, -0.273781], [0.00096, -0.908339, -0.418234]] and translation vector: [4.263163, 1.772832, 1.291083], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_59_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_59_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_59_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_59_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_59_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_59_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.853196, -0.330732, 0.403328], [-0.517406, -0.438892, 0.734619], [-0.065945, -0.835458, -0.545584]] and translation vector: [2.734716, 6.775187, 1.412962], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.853022, -0.336855, 0.398601], [-0.516617, -0.436898, 0.736361], [-0.0739, -0.834056, -0.546708]] and translation vector: [2.728871, 6.767794, 1.411126], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_60_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_60_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_60_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_60_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_60_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_60_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.207785, -0.462455, 0.861952], [-0.977184, 0.13779, -0.161637], [-0.044019, -0.875871, -0.480534]] and translation vector: [2.720584, 1.654419, 1.522448], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.211008, -0.462778, 0.860995], [-0.976592, 0.137438, -0.165466], [-0.04176, -0.875755, -0.480946]] and translation vector: [2.717844, 1.649691, 1.521912], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_61_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_61_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_61_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_61_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_61_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_61_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.624751, -0.31057, 0.716403], [-0.780527, -0.273701, 0.562018], [0.021534, -0.910293, -0.413403]] and translation vector: [-0.212106, 0.775797, 1.619325], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.624146, -0.312612, 0.716042], [-0.781019, -0.274551, 0.56092], [0.02124, -0.909338, -0.415515]] and translation vector: [-0.212874, 0.777223, 1.616059], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_62_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_62_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_62_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_62_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_62_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_62_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.964843, 0.186346, -0.185345], [0.252505, 0.461537, -0.850426], [-0.07293, -0.867329, -0.492364]] and translation vector: [3.779865, 2.337391, 1.461827], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.966867, 0.182729, -0.178267], [0.244986, 0.467845, -0.849178], [-0.071768, -0.864715, -0.49711]] and translation vector: [3.779708, 2.335608, 1.46105], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_63_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_63_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_63_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_63_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_63_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_63_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.935902, 0.160482, -0.313582], [0.351212, -0.493772, 0.795512], [-0.027173, -0.854655, -0.518485]] and translation vector: [4.465, -0.226232, 1.550028], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.933656, 0.161027, -0.319933], [0.356818, -0.495752, 0.791777], [-0.03111, -0.853405, -0.520319]] and translation vector: [4.478531, -0.229773, 1.540292], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_64_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_64_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_64_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_64_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_64_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_64_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.955421, 0.119616, -0.269932], [0.295248, 0.388339, -0.872939], [0.000408, -0.91372, -0.406343]] and translation vector: [2.65583, 2.981598, 1.368648], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.951595, 0.120375, -0.282803], [0.307283, 0.392547, -0.866882], [0.006663, -0.91182, -0.410535]] and translation vector: [2.655525, 2.981353, 1.361859], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_65_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_65_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_65_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_65_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_65_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_65_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.409087, -0.112571, 0.905525], [-0.910894, 0.109148, -0.397943], [-0.05404, -0.987631, -0.147191]] and translation vector: [4.421403, 3.579741, 1.526424], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.417977, -0.10834, 0.901974], [-0.906895, 0.107978, -0.407287], [-0.053267, -0.988232, -0.143386]] and translation vector: [4.418822, 3.582731, 1.526625], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_66_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_66_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_66_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_66_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_66_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_66_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.931668, 0.072515, -0.356001], [0.362912, -0.231685, 0.902561], [-0.017031, -0.970084, -0.24217]] and translation vector: [5.886859, 3.543659, 1.354971], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.931979, 0.073028, -0.355079], [0.362119, -0.233112, 0.902513], [-0.016864, -0.969704, -0.2437]] and translation vector: [5.882501, 3.543666, 1.354317], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_67_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_67_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_67_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_67_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_67_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_67_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.081815, 0.638296, -0.765431], [0.996577, -0.061545, 0.055199], [-0.011875, -0.767327, -0.641146]] and translation vector: [3.004073, 1.570726, 1.431248], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.083332, 0.64082, -0.763155], [0.996457, -0.062303, 0.056492], [-0.011346, -0.765159, -0.643742]] and translation vector: [3.00242, 1.571458, 1.432065], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_68_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_68_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_68_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_68_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_68_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_68_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.606468, -0.360414, 0.70873], [-0.789578, -0.16805, 0.590192], [-0.093612, -0.91753, -0.386492]] and translation vector: [2.373669, 6.226582, 1.48631], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.603564, -0.356146, 0.713352], [-0.791899, -0.163667, 0.588311], [-0.092772, -0.919986, -0.380815]] and translation vector: [2.370215, 6.229294, 1.484576], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_69_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_69_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_69_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_69_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_69_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_69_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.481759, -0.460793, 0.745371], [-0.875469, 0.290199, -0.386444], [-0.038235, -0.838722, -0.543216]] and translation vector: [3.08436, 2.075189, 1.468295], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.482142, -0.463533, 0.743422], [-0.87538, 0.289132, -0.387445], [-0.035354, -0.83758, -0.54517]] and translation vector: [3.085865, 2.079347, 1.468915], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_70_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_70_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_70_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_70_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_70_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_70_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.731293, 0.384445, -0.563394], [0.682011, 0.401944, -0.610984], [-0.008437, -0.831049, -0.556135]] and translation vector: [5.176627, 2.209938, 1.427488], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.733453, 0.387758, -0.558292], [0.679719, 0.411882, -0.606907], [-0.005383, -0.82462, -0.565663]] and translation vector: [5.175584, 2.209993, 1.422561], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_71_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_71_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_71_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_71_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_71_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_71_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.414473, -0.491559, 0.765887], [-0.909569, 0.196057, -0.366396], [0.029948, -0.848488, -0.528367]] and translation vector: [0.955419, 3.497842, 1.497559], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.410009, -0.490704, 0.768832], [-0.911757, 0.198024, -0.359841], [0.024328, -0.848526, -0.528594]] and translation vector: [0.937857, 3.503192, 1.495427], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_72_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_72_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_72_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_72_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_72_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_72_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.82141, -0.124481, 0.556588], [-0.562763, -0.33543, 0.755503], [0.092651, -0.933805, -0.345579]] and translation vector: [1.795382, 2.457259, 1.379582], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.820332, -0.124179, 0.558243], [-0.564621, -0.330977, 0.75608], [0.090876, -0.935432, -0.341626]] and translation vector: [1.795684, 2.460531, 1.380001], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_73_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_73_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_73_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_73_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_73_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_73_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.993805, -0.057016, 0.095394], [-0.110597, -0.423109, 0.899304], [-0.010913, -0.904283, -0.426794]] and translation vector: [3.282054, 2.568905, 1.512321], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.993106, -0.061381, 0.099861], [-0.116562, -0.427194, 0.896615], [-0.012375, -0.902074, -0.431404]] and translation vector: [3.283498, 2.568158, 1.509645], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_74_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_74_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_74_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_74_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_74_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_74_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.348231, 0.123124, -0.929288], [0.936413, -1.6e-05, 0.350899], [0.043189, -0.992391, -0.1153]] and translation vector: [2.712005, 2.075202, 1.464169], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.348319, 0.120186, -0.929639], [0.93641, 0.000395, 0.350907], [0.042542, -0.992751, -0.112406]] and translation vector: [2.712393, 2.076758, 1.463984], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_75_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_75_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_75_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_75_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_75_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_75_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.482968, -0.397392, 0.78027], [-0.874514, 0.173759, -0.452807], [0.044362, -0.901048, -0.431445]] and translation vector: [8.974016, 2.795387, 1.945192], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.496352, -0.388832, 0.776173], [-0.867003, 0.176647, -0.465943], [0.044064, -0.904216, -0.424797]] and translation vector: [8.98292, 2.792107, 1.939625], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_76_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_76_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_76_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_76_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_76_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_76_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.079918, -0.690871, 0.718547], [-0.996802, 0.055321, -0.057677], [9.6e-05, -0.720858, -0.693082]] and translation vector: [1.142658, 0.968078, 1.385987], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.080635, -0.691404, 0.717954], [-0.996742, 0.054488, -0.059473], [0.002, -0.72041, -0.693545]] and translation vector: [1.144302, 0.967344, 1.387927], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_77_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_77_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_77_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_77_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_77_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_77_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.725417, 0.297171, -0.620854], [0.687848, -0.279954, 0.669695], [0.025203, -0.912861, -0.407492]] and translation vector: [3.434752, 3.057745, 1.556519], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.722045, 0.303192, -0.621873], [0.691238, -0.278447, 0.666827], [0.029018, -0.911341, -0.410629]] and translation vector: [3.433538, 3.052318, 1.549734], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_78_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_78_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_78_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_78_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_78_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_78_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.156961, 0.257294, -0.953501], [0.986843, 0.002956, -0.161652], [-0.038773, -0.966329, -0.254373]] and translation vector: [1.838324, 1.205476, 1.480452], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.155829, 0.255617, -0.954137], [0.987039, 0.002796, -0.160453], [-0.038347, -0.966774, -0.252739]] and translation vector: [1.83996, 1.205416, 1.474648], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_79_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_79_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_79_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_79_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_79_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_79_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.922168, 0.178823, -0.342969], [0.38661, 0.453076, -0.803278], [0.011746, -0.873352, -0.486947]] and translation vector: [3.207336, 1.959871, 1.267555], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.914921, 0.180426, -0.361063], [0.403188, 0.450583, -0.796502], [0.018979, -0.874312, -0.484993]] and translation vector: [3.204391, 1.957541, 1.273759], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_80_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_80_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_80_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_80_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_80_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_80_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.241978, -0.427128, 0.871211], [-0.963615, 0.210861, -0.164264], [-0.113543, -0.879261, -0.462611]] and translation vector: [2.164319, 10.11033, 1.716674], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.23973, -0.426819, 0.871983], [-0.964754, 0.205144, -0.16482], [-0.108534, -0.880762, -0.460955]] and translation vector: [2.164643, 10.108889, 1.726434], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_81_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_81_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_81_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_81_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_81_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_81_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.424269, -0.366439, 0.828081], [-0.894198, -0.025281, 0.446957], [-0.142848, -0.930098, -0.338395]] and translation vector: [2.638367, 6.760901, 1.41712], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.432512, -0.37625, 0.819371], [-0.890339, -0.034872, 0.45396], [-0.14223, -0.925862, -0.350073]] and translation vector: [2.640049, 6.763855, 1.420073], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_82_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_82_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_82_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_82_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_82_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_82_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.645842, -0.099101, 0.757012], [-0.761541, -0.013148, 0.647984], [-0.054263, -0.994991, -0.083961]] and translation vector: [3.729951, 1.432448, 1.733539], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.649827, -0.099601, 0.753528], [-0.757797, -0.00807, 0.652441], [-0.058903, -0.994995, -0.080722]] and translation vector: [3.727943, 1.43259, 1.731865], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_83_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_83_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_83_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_83_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_83_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_83_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.686341, -0.358824, 0.632599], [-0.727213, -0.35045, 0.590209], [0.009912, -0.865119, -0.50147]] and translation vector: [2.486494, 4.601647, 1.455454], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.681394, -0.352774, 0.64129], [-0.731846, -0.340576, 0.590263], [0.010179, -0.871527, -0.490243]] and translation vector: [2.480601, 4.595852, 1.449959], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_84_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_84_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_84_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_84_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_84_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_84_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.515401, -0.339121, 0.786994], [-0.847541, -0.337435, 0.40965], [0.126638, -0.878143, -0.461333]] and translation vector: [4.776819, 1.138867, 1.280463], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.495978, -0.33911, 0.799381], [-0.859276, -0.324304, 0.395565], [0.125103, -0.88308, -0.452237]] and translation vector: [4.773187, 1.14016, 1.284317], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_85_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_85_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_85_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_85_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_85_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_85_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.799511, 0.533863, -0.275266], [0.600541, 0.71925, -0.349328], [0.011492, -0.4446, -0.895656]] and translation vector: [2.031323, 2.312379, 1.200993], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.794986, 0.540559, -0.275306], [0.606553, 0.715482, -0.346669], [0.009582, -0.442584, -0.896676]] and translation vector: [2.031011, 2.313572, 1.199732], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_86_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_86_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_86_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_86_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_86_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_86_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.830629, 0.239867, -0.502514], [0.556756, 0.37214, -0.742654], [0.008867, -0.896647, -0.442658]] and translation vector: [4.849209, 2.614689, 1.447477], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.826514, 0.239564, -0.509396], [0.562778, 0.371773, -0.738286], [0.012512, -0.89688, -0.442097]] and translation vector: [4.848542, 2.612423, 1.449706], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_87_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_87_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_87_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_87_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_87_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_87_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.977181, 0.077241, -0.197866], [0.211774, -0.426158, 0.879512], [-0.016388, -0.901345, -0.432791]] and translation vector: [0.977323, 0.877303, 1.40232], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.979446, 0.063797, -0.19135], [0.200663, -0.404476, 0.892263], [-0.020472, -0.912321, -0.408965]] and translation vector: [0.961423, 0.875672, 1.418643], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_88_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_88_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_88_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_88_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_88_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_88_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.977514, -0.102294, 0.184398], [-0.210796, -0.497303, 0.841578], [0.005613, -0.861525, -0.507684]] and translation vector: [3.555602, 1.207732, 1.356493], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.976582, -0.105336, 0.187593], [-0.215087, -0.498001, 0.840079], [0.00493, -0.860755, -0.508995]] and translation vector: [3.555365, 1.207812, 1.356155], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_89_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_89_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_89_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_89_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_89_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_89_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.869565, 0.231948, -0.435955], [0.492522, 0.471291, -0.731647], [0.035758, -0.850932, -0.524058]] and translation vector: [2.750575, 3.154689, 1.290553], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.871211, 0.246607, -0.424472], [0.49036, 0.478017, -0.72873], [0.023195, -0.843022, -0.53738]] and translation vector: [2.712538, 3.137298, 1.287246], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_90_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_90_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_90_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_90_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_90_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_90_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.991592, 0.052224, -0.118397], [0.1292, -0.348306, 0.928435], [0.007248, -0.935925, -0.352124]] and translation vector: [2.177373, 2.142725, 1.46728], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.992093, 0.047571, -0.11614], [0.125441, -0.346386, 0.929667], [0.003996, -0.936885, -0.349615]] and translation vector: [2.181058, 2.142908, 1.465582], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_91_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_91_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_91_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_91_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_91_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_91_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.987126, 0.106622, -0.119219], [0.159938, -0.652529, 0.740693], [0.00118, -0.750225, -0.661181]] and translation vector: [4.64166, 4.052867, 1.404314], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.987387, 0.107853, -0.115912], [0.158278, -0.654013, 0.73974], [0.003975, -0.748756, -0.662834]] and translation vector: [4.649776, 4.051806, 1.400746], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_92_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_92_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_92_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_92_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_92_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_92_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.892065, -0.360019, 0.273141], [-0.443019, -0.577417, 0.685801], [-0.089185, -0.732786, -0.674589]] and translation vector: [2.898737, 2.45906, 1.649541], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.888376, -0.366176, 0.276954], [-0.450762, -0.581088, 0.677606], [-0.087189, -0.726809, -0.681283]] and translation vector: [2.873446, 2.440832, 1.651115], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_93_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_93_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_93_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_93_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_93_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_93_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.610102, 0.375008, -0.697958], [0.791763, 0.255448, -0.554849], [-0.029781, -0.891132, -0.452767]] and translation vector: [2.349929, 1.419923, 1.358478], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.607496, 0.374505, -0.700496], [0.793845, 0.255679, -0.551759], [-0.027534, -0.891277, -0.452623]] and translation vector: [2.354864, 1.421781, 1.358478], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_94_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_94_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_94_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_94_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_94_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_94_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.386761, -0.304254, 0.870543], [-0.920043, 0.191539, -0.34181], [-0.062746, -0.933136, -0.354007]] and translation vector: [2.082368, 4.008438, 1.845888], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.387201, -0.298257, 0.872421], [-0.919947, 0.188025, -0.344013], [-0.061432, -0.935783, -0.347183]] and translation vector: [2.08001, 4.010775, 1.842824], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_95_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_95_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_95_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_95_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_95_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_95_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.993306, 0.029023, -0.111812], [0.110831, -0.512349, 0.851596], [-0.032571, -0.858287, -0.512136]] and translation vector: [2.482234, 1.391135, 1.348064], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.992702, 0.031717, -0.116349], [0.116167, -0.510508, 0.85199], [-0.032374, -0.859288, -0.510467]] and translation vector: [2.48213, 1.388715, 1.34704], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_96_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_96_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_96_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_96_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_96_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_96_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.660671, 0.426343, -0.617856], [0.749322, -0.423957, 0.508701], [-0.045063, -0.799057, -0.599565]] and translation vector: [1.739014, 2.260029, 1.323145], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.661948, 0.412501, -0.625834], [0.748146, -0.41469, 0.517987], [-0.045857, -0.811095, -0.583114]] and translation vector: [1.741474, 2.257287, 1.327618], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_97_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_97_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_97_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_97_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_97_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_97_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.283698, -0.38675, 0.877463], [-0.95878, 0.129662, -0.252839], [-0.015988, -0.913024, -0.407593]] and translation vector: [3.69525, 3.551647, 1.352095], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.292652, -0.378333, 0.878191], [-0.956147, 0.127043, -0.2639], [-0.011726, -0.91691, -0.398922]] and translation vector: [3.694781, 3.553972, 1.346799], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_98_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_98_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_98_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_98_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_98_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_98_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.988959, -0.006087, -0.148062], [0.148117, 0.009943, 0.98892], [-0.004548, -0.999932, 0.010735]] and translation vector: [3.911582, 2.672538, 1.565046], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.987297, -0.007995, -0.158684], [0.158774, 0.012251, 0.987239], [-0.005949, -0.999893, 0.013365]] and translation vector: [3.955948, 2.679338, 1.574419], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_99_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_99_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_99_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_99_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_99_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_99_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.95695, -0.100486, 0.272304], [-0.288986, 0.24231, -0.92616], [0.027085, -0.964981, -0.260918]] and translation vector: [1.227478, 4.879099, 1.55452], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.957752, -0.097454, 0.27058], [-0.286469, 0.240112, -0.927514], [0.025421, -0.965841, -0.257885]] and translation vector: [1.221714, 4.885019, 1.554874], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_100_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_100_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_100_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_100_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_100_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_100_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.895509, 0.17248, -0.410263], [0.444823, 0.375965, -0.812886], [0.014038, -0.91044, -0.413402]] and translation vector: [2.818061, 5.409916, 1.54775], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.895274, 0.172164, -0.410907], [0.445264, 0.376844, -0.812237], [0.01501, -0.910136, -0.414037]] and translation vector: [2.819061, 5.407142, 1.548651], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_101_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_101_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_101_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_101_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_101_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_101_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.355681, -0.20797, 0.911175], [-0.934036, 0.113197, -0.338769], [-0.032689, -0.971563, -0.234514]] and translation vector: [0.539195, 4.841905, 1.636959], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.354881, -0.205091, 0.912139], [-0.934375, 0.110848, -0.338608], [-0.031664, -0.972446, -0.230969]] and translation vector: [0.533365, 4.84225, 1.627512], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_102_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_102_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_102_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_102_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_102_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_102_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.506976, -0.449046, 0.735753], [-0.861802, 0.247713, -0.442646], [0.016513, -0.858485, -0.512574]] and translation vector: [1.568574, 4.423309, 1.333385], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.503836, -0.444181, 0.740846], [-0.863753, 0.25025, -0.437385], [0.008882, -0.860278, -0.509748]] and translation vector: [1.576928, 4.418399, 1.331934], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_103_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_103_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_103_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_103_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_103_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_103_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.221984, 0.421429, -0.879273], [0.97466, 0.121427, -0.187867], [0.027595, -0.898695, -0.437705]] and translation vector: [3.155292, 0.483793, 1.35371], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.224547, 0.416482, -0.880978], [0.973822, 0.128715, -0.187361], [0.035363, -0.899986, -0.434482]] and translation vector: [3.157119, 0.483672, 1.354178], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_104_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_104_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_104_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_104_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_104_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_104_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.997074, 0.061747, -0.045056], [0.074474, 0.651998, -0.754554], [-0.017215, -0.755702, -0.654689]] and translation vector: [1.815792, 5.369752, 1.288561], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.994543, 0.080066, -0.066881], [0.102674, 0.63762, -0.763478], [-0.018484, -0.766179, -0.642361]] and translation vector: [1.819087, 5.36055, 1.286161], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_105_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_105_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_105_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_105_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_105_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_105_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.476704, 0.41796, -0.773345], [0.878176, 0.186897, -0.440314], [-0.039498, -0.889033, -0.456137]] and translation vector: [2.405627, 4.675593, 1.276166], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.455958, 0.42895, -0.779811], [0.88909, 0.179883, -0.420905], [-0.040272, -0.885237, -0.463394]] and translation vector: [2.408911, 4.675395, 1.276879], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_106_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_106_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_106_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_106_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_106_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_106_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.573165, 0.475287, -0.667521], [0.819422, -0.337921, 0.462988], [-0.005517, -0.81235, -0.583144]] and translation vector: [4.230747, 1.597944, 1.425469], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.580595, 0.472456, -0.663095], [0.814187, -0.339873, 0.470729], [-0.002969, -0.813186, -0.581996]] and translation vector: [4.228813, 1.597838, 1.42741], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_107_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_107_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_107_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_107_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_107_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_107_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.218501, -0.721835, 0.656667], [-0.97193, -0.10083, 0.212566], [-0.087226, -0.684681, -0.723605]] and translation vector: [2.10902, 2.428258, 1.386435], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.218569, -0.722397, 0.656026], [-0.971546, -0.098231, 0.215522], [-0.091251, -0.684466, -0.723312]] and translation vector: [2.107975, 2.430531, 1.385643], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_108_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_108_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_108_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_108_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_108_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_108_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.819759, -0.274444, 0.502669], [-0.572709, 0.39303, -0.719397], [-0.00013, -0.877615, -0.479366]] and translation vector: [2.765326, 1.370172, 1.355227], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.819555, -0.26888, 0.505998], [-0.572993, 0.389095, -0.721307], [-0.002936, -0.881084, -0.472951]] and translation vector: [2.765196, 1.369276, 1.358405], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_109_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_109_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_109_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_109_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_109_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_109_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.769532, -0.429513, 0.472588], [-0.615738, -0.302759, 0.727464], [-0.169375, -0.850797, -0.49745]] and translation vector: [2.184386, 2.253813, 1.283805], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.76638, -0.428136, 0.478917], [-0.620171, -0.298738, 0.725357], [-0.167481, -0.85291, -0.494464]] and translation vector: [2.185226, 2.257666, 1.286817], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_110_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_110_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_110_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_110_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_110_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_110_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.493838, -0.420518, 0.76111], [-0.864926, -0.147366, 0.479777], [-0.089593, -0.895236, -0.436493]] and translation vector: [0.736944, 2.108944, 1.402726], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.487676, -0.423405, 0.763479], [-0.869284, -0.154634, 0.469504], [-0.080731, -0.892646, -0.443471]] and translation vector: [0.733117, 2.095654, 1.39687], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_111_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_111_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_111_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_111_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_111_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_111_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.341382, 0.594812, -0.727775], [0.932196, 0.11517, -0.343142], [-0.120287, -0.795572, -0.593798]] and translation vector: [7.151203, 3.587152, 1.581923], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.344041, 0.585523, -0.734029], [0.930897, 0.110501, -0.348168], [-0.122749, -0.803089, -0.583079]] and translation vector: [7.150104, 3.60012, 1.584136], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_112_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_112_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_112_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_112_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_112_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_112_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.060487, 0.154719, -0.986105], [0.998165, 0.006603, -0.060191], [-0.002801, -0.987936, -0.154835]] and translation vector: [6.630666, 2.572317, 1.44523], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.062036, 0.175232, -0.982571], [0.998074, 0.011306, -0.060998], [0.00042, -0.984462, -0.175596]] and translation vector: [6.62843, 2.567178, 1.442285], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_113_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_113_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_113_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_113_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_113_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_113_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.207705, 0.494542, -0.843971], [0.97739, -0.069996, 0.199524], [0.039599, -0.866331, -0.497898]] and translation vector: [4.53083, 2.291093, 1.52739], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.209269, 0.494574, -0.843566], [0.977066, -0.071037, 0.200739], [0.039356, -0.866228, -0.498097]] and translation vector: [4.529976, 2.291335, 1.526507], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_114_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_114_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_114_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_114_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_114_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_114_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.778266, 0.076502, -0.623257], [0.626532, 0.028295, -0.778882], [-0.041951, -0.996668, -0.069952]] and translation vector: [4.354075, 2.27787, 1.510689], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.774603, 0.078895, -0.627508], [0.631084, 0.031306, -0.775082], [-0.041505, -0.996391, -0.074039]] and translation vector: [4.353431, 2.276987, 1.507071], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_115_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_115_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_115_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_115_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_115_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_115_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.982764, 0.054289, -0.17671], [0.184841, -0.27426, 0.943724], [0.002769, -0.960122, -0.279568]] and translation vector: [4.072058, 1.220293, 1.47625], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.982485, 0.057917, -0.177113], [0.186218, -0.270474, 0.944546], [0.0068, -0.960984, -0.276522]] and translation vector: [4.071517, 1.218265, 1.477941], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_116_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_116_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_116_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_116_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_116_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_116_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.286652, 0.220257, -0.932372], [0.958024, -0.061246, 0.28007], [0.004584, -0.973517, -0.228568]] and translation vector: [3.76659, 1.676076, 1.452194], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.299829, 0.216367, -0.929133], [0.953977, -0.07366, 0.290693], [-0.005544, -0.973529, -0.228495]] and translation vector: [3.753121, 1.670498, 1.452776], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_117_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_117_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_117_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_117_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_117_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_117_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.848489, -0.131122, 0.512712], [-0.527579, 0.133483, -0.838954], [0.041567, -0.982339, -0.182436]] and translation vector: [2.702568, 1.718074, 1.602473], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.851363, -0.128939, 0.508484], [-0.523333, 0.142037, -0.840207], [0.036112, -0.981428, -0.188403]] and translation vector: [2.706553, 1.721294, 1.602035], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_118_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_118_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_118_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_118_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_118_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_118_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.764638, 0.028658, -0.643823], [0.64431, -0.055554, 0.762744], [-0.013909, -0.998044, -0.060944]] and translation vector: [3.061982, 3.98913, 1.495508], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.765028, 0.027801, -0.643396], [0.643825, -0.056098, 0.763114], [-0.014878, -0.998038, -0.060816]] and translation vector: [3.064652, 3.991985, 1.487138], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_119_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_119_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_119_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_119_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_119_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_119_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.935878, -0.161972, 0.312885], [-0.352322, 0.433116, -0.829627], [-0.001139, -0.886666, -0.46241]] and translation vector: [1.123681, 2.231354, 1.408983], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.935522, -0.159, 0.315466], [-0.353249, 0.430874, -0.830399], [-0.003893, -0.888294, -0.459258]] and translation vector: [1.123559, 2.231523, 1.408322], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_120_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_120_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_120_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_120_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_120_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_120_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.45377, -0.425062, 0.783208], [-0.891046, 0.227634, -0.392708], [-0.01136, -0.876074, -0.482043]] and translation vector: [2.25004, 3.862298, 1.519108], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.453547, -0.422981, 0.784463], [-0.891155, 0.226808, -0.392938], [-0.011717, -0.877294, -0.47981]] and translation vector: [2.249275, 3.861866, 1.519019], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_121_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_121_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_121_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_121_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_121_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_121_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.205292, 0.226186, -0.952205], [0.97316, -0.150555, 0.174048], [-0.103992, -0.962379, -0.251024]] and translation vector: [4.876985, 2.837537, 1.671042], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.210488, 0.22021, -0.952472], [0.971775, -0.153305, 0.17931], [-0.106533, -0.96333, -0.246263]] and translation vector: [4.87733, 2.840179, 1.675237], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_122_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_122_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_122_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_122_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_122_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_122_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.037281, 0.595041, -0.80283], [0.998378, -0.012419, -0.055566], [-0.043034, -0.803599, -0.593613]] and translation vector: [3.95675, 2.244474, 1.442954], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.038109, 0.594465, -0.803218], [0.998341, -0.012073, -0.056302], [-0.043167, -0.80403, -0.593019]] and translation vector: [3.957906, 2.244142, 1.441716], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_123_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_123_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_123_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_123_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_123_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_123_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.070416, -0.411804, 0.908548], [-0.99671, 0.065705, -0.047468], [-0.040148, -0.908901, -0.415075]] and translation vector: [2.214543, 1.806687, 1.391502], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.072195, -0.409813, 0.909308], [-0.996578, 0.066438, -0.049181], [-0.040258, -0.909747, -0.413207]] and translation vector: [2.216063, 1.808517, 1.395188], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_124_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_124_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_124_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_124_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_124_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_124_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.086843, 0.425015, -0.901011], [0.995696, 0.066429, -0.064634], [0.032383, -0.902745, -0.428955]] and translation vector: [4.261571, 5.85756, 1.66629], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.086953, 0.422316, -0.902268], [0.995713, 0.06553, -0.065286], [0.031554, -0.904077, -0.426204]] and translation vector: [4.260677, 5.865657, 1.669414], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_125_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_125_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_125_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_125_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_125_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_125_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.882784, 0.25224, -0.396318], [0.469583, -0.498211, 0.728888], [-0.013595, -0.829554, -0.55826]] and translation vector: [3.463734, 1.394934, 1.262723], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.883097, 0.250738, -0.396574], [0.468931, -0.499833, 0.728197], [-0.015634, -0.829034, -0.558979]] and translation vector: [3.462241, 1.393432, 1.262782], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_126_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_126_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_126_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_126_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_126_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_126_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.079656, -0.319192, 0.944337], [-0.994012, 0.096527, -0.051219], [-0.074805, -0.942762, -0.324969]] and translation vector: [4.3352, 2.935251, 1.464921], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.08136, -0.319768, 0.943996], [-0.993796, 0.098086, -0.052427], [-0.075828, -0.942405, -0.325765]] and translation vector: [4.335558, 2.933583, 1.460394], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_127_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_127_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_127_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_127_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_127_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_127_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.255252, -0.433184, 0.864406], [-0.966562, 0.137073, -0.216725], [-0.024605, -0.890821, -0.453687]] and translation vector: [1.468232, 3.881342, 1.432686], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.253329, -0.437174, 0.862962], [-0.967015, 0.138948, -0.213484], [-0.026577, -0.888579, -0.457953]] and translation vector: [1.469363, 3.879031, 1.438972], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_128_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_128_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_128_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_128_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_128_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_128_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.999403, 0.004498, 0.03425], [-0.034232, -0.004158, 0.999405], [0.004638, -0.999981, -0.004001]] and translation vector: [2.393484, 5.775056, 1.371464], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.998454, -0.001139, 0.055575], [-0.055569, 0.004857, 0.998443], [-0.001408, -0.999988, 0.004786]] and translation vector: [2.356134, 5.774678, 1.367739], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_129_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_129_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_129_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_129_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_129_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_129_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.233902, -0.58763, 0.774584], [-0.967246, -0.059828, 0.246692], [-0.098622, -0.806915, -0.582377]] and translation vector: [0.860343, 3.117731, 1.418568], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.233684, -0.587102, 0.775051], [-0.967496, -0.061159, 0.24538], [-0.096661, -0.8072, -0.58231]] and translation vector: [0.859973, 3.119137, 1.418853], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_130_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_130_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_130_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_130_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_130_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_130_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.767458, -0.265442, 0.583565], [-0.640543, 0.35536, -0.680752], [-0.026676, -0.896248, -0.442751]] and translation vector: [3.343537, 3.697402, 1.375352], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.780866, -0.263741, 0.566294], [-0.624403, 0.357431, -0.694525], [-0.019236, -0.895926, -0.443786]] and translation vector: [3.344022, 3.709659, 1.376654], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_131_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_131_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_131_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_131_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_131_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_131_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.623567, 0.536294, -0.568817], [0.781209, -0.455034, 0.427384], [-0.029628, -0.710867, -0.702702]] and translation vector: [1.790477, 1.816361, 1.229059], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.636074, 0.528408, -0.562313], [0.771074, -0.462894, 0.437235], [-0.029252, -0.711698, -0.701876]] and translation vector: [1.794875, 1.819226, 1.230937], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_132_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_132_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_132_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_132_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_132_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_132_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.140295, 0.625342, -0.767636], [0.990108, -0.090149, 0.107516], [-0.001967, -0.775126, -0.631804]] and translation vector: [3.410891, 3.073526, 1.198756], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.148525, 0.612201, -0.776627], [0.988818, -0.102561, 0.108258], [-0.013376, -0.784022, -0.620589]] and translation vector: [3.421496, 3.097678, 1.206193], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_133_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_133_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_133_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_133_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_133_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_133_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.14018, 0.443083, -0.885453], [0.989985, -0.07783, 0.117782], [-0.016727, -0.893096, -0.449556]] and translation vector: [3.549726, 0.935059, 1.485921], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.140682, 0.443565, -0.885132], [0.989931, -0.077142, 0.11868], [-0.015638, -0.892916, -0.449951]] and translation vector: [3.549777, 0.934132, 1.483108], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_134_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_134_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_134_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_134_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_134_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_134_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.631332, 0.312126, -0.709927], [0.775472, -0.26347, 0.573784], [-0.007951, -0.912776, -0.408382]] and translation vector: [1.600176, 0.624978, 1.327739], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.627277, 0.311053, -0.713982], [0.778666, -0.267257, 0.567673], [-0.014241, -0.912041, -0.409851]] and translation vector: [1.601099, 0.627571, 1.328079], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_135_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_135_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_135_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_135_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_135_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_135_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.963317, 0.154363, -0.219528], [0.260086, 0.335369, -0.905474], [-0.066149, -0.929355, -0.363214]] and translation vector: [5.972451, 2.818726, 1.468896], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.963149, 0.154275, -0.220326], [0.260736, 0.334417, -0.905639], [-0.066037, -0.929712, -0.362318]] and translation vector: [5.973901, 2.819783, 1.467855], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_136_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_136_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_136_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_136_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_136_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_136_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.983299, 0.047874, -0.175588], [0.180439, -0.382417, 0.9062], [-0.023764, -0.922749, -0.384668]] and translation vector: [2.208684, 3.483128, 1.468268], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.982577, 0.045136, -0.18029], [0.183889, -0.376806, 0.907856], [-0.026957, -0.925192, -0.378541]] and translation vector: [2.211137, 3.481059, 1.465482], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_137_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_137_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_137_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_137_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_137_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_137_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.824719, -0.175736, 0.537546], [-0.564369, 0.316962, -0.762249], [-0.036427, -0.932015, -0.360584]] and translation vector: [4.397487, 4.054199, 1.411764], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.821778, -0.181799, 0.540028], [-0.568729, 0.319986, -0.757731], [-0.035047, -0.929816, -0.366351]] and translation vector: [4.391561, 4.044915, 1.406417], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_138_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_138_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_138_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_138_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_138_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_138_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.804945, -0.278842, 0.523748], [-0.593014, 0.407765, -0.694307], [-0.019964, -0.869468, -0.493585]] and translation vector: [4.871809, 2.494869, 1.402737], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.804444, -0.274614, 0.526742], [-0.593612, 0.404842, -0.695506], [-0.022252, -0.872176, -0.488687]] and translation vector: [4.863627, 2.491699, 1.400121], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_139_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_139_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_139_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_139_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_139_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_139_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.000188, -0.47362, 0.88073], [-0.997828, 0.057931, 0.031365], [-0.065877, -0.878822, -0.47258]] and translation vector: [4.366519, 5.511691, 1.307889], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.002248, -0.465195, 0.885205], [-0.998254, 0.053289, 0.02547], [-0.05902, -0.883603, -0.464503]] and translation vector: [4.36891, 5.516212, 1.317108], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_140_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_140_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_140_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_140_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_140_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_140_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.881415, -0.308012, 0.3581], [-0.47008, 0.646119, -0.601294], [-0.046169, -0.698325, -0.71429]] and translation vector: [3.147524, 1.689608, 1.273114], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.879224, -0.311908, 0.360109], [-0.474637, 0.638627, -0.605703], [-0.041052, -0.703469, -0.709539]] and translation vector: [3.141599, 1.689583, 1.27073], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_141_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_141_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_141_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_141_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_141_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_141_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.954506, 0.05554, -0.292973], [0.288831, -0.41644, 0.862064], [-0.074127, -0.907465, -0.413536]] and translation vector: [2.66447, 1.005586, 1.476015], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.956668, 0.052296, -0.286448], [0.280824, -0.425753, 0.860158], [-0.076973, -0.903327, -0.42199]] and translation vector: [2.657996, 1.004761, 1.470821], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_142_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_142_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_142_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_142_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_142_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_142_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.804414, -0.195207, 0.561082], [-0.593456, -0.306943, 0.74404], [0.026978, -0.931494, -0.362756]] and translation vector: [4.397897, 1.805397, 1.263968], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.81043, -0.19082, 0.553888], [-0.585149, -0.309439, 0.749566], [0.028363, -0.931577, -0.362436]] and translation vector: [4.406421, 1.797547, 1.276681], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_143_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_143_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_143_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_143_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_143_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_143_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.924593, 0.219455, -0.311397], [0.371095, 0.334047, -0.86643], [-0.086121, -0.916653, -0.390296]] and translation vector: [7.650298, 2.745242, 1.444521], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.925403, 0.221817, -0.30729], [0.368562, 0.337876, -0.866026], [-0.088274, -0.914679, -0.394425]] and translation vector: [7.650829, 2.747432, 1.442508], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_144_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_144_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_144_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_144_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_144_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_144_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.173351, 0.592298, -0.78685], [0.984858, -0.105806, 0.137329], [-0.001913, -0.798742, -0.601671]] and translation vector: [3.264189, 1.940071, 1.28435], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.172933, 0.589263, -0.789217], [0.98493, -0.105695, 0.136901], [-0.002745, -0.800998, -0.598661]] and translation vector: [3.267153, 1.942133, 1.284021], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_145_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_145_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_145_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_145_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_145_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_145_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.861262, 0.35211, -0.366398], [0.508128, 0.60504, -0.61297], [0.005853, -0.714105, -0.700014]] and translation vector: [3.145762, 3.637784, 1.437024], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.859655, 0.347273, -0.374693], [0.510745, 0.600786, -0.614977], [0.011546, -0.720041, -0.693836]] and translation vector: [3.145171, 3.63531, 1.440385], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_146_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_146_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_146_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_146_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_146_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_146_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.927869, -0.125596, 0.351119], [-0.372891, -0.32108, 0.870551], [0.003399, -0.938687, -0.344754]] and translation vector: [5.442723, 4.031985, 1.348893], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.928984, -0.124208, 0.348657], [-0.370086, -0.32475, 0.870387], [0.005117, -0.937609, -0.347654]] and translation vector: [5.438782, 4.038163, 1.363364], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_147_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_147_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_147_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_147_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_147_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_147_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.32152, -0.4706, 0.821681], [-0.946681, 0.178549, -0.268172], [-0.020508, -0.864092, -0.502915]] and translation vector: [2.120097, 2.367636, 1.494245], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.324752, -0.471365, 0.819971], [-0.945715, 0.173395, -0.274877], [-0.012612, -0.864725, -0.502087]] and translation vector: [2.101204, 2.346659, 1.492081], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_148_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_148_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_148_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_148_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_148_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_148_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.857694, 0.203115, -0.472341], [0.513544, 0.293426, -0.806333], [-0.025181, -0.934155, -0.355978]] and translation vector: [3.161674, 3.662206, 1.335287], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.856666, 0.203827, -0.473897], [0.515344, 0.296604, -0.804019], [-0.023321, -0.932995, -0.359132]] and translation vector: [3.164327, 3.659025, 1.330704], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_149_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_149_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_149_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_149_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_149_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_149_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.975982, 0.033782, -0.215214], [0.215389, -0.297687, 0.930048], [-0.032648, -0.954066, -0.297814]] and translation vector: [2.838751, 1.414222, 1.664536], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.976127, 0.034525, -0.21444], [0.21483, -0.298963, 0.929769], [-0.03201, -0.95364, -0.299243]] and translation vector: [2.83798, 1.414721, 1.663024], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_150_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_150_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_150_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_150_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_150_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_150_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.844798, -0.442354, 0.301064], [-0.534849, 0.714819, -0.450523], [-0.015916, -0.541624, -0.84047]] and translation vector: [3.085932, 7.995926, 1.934485], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.833593, -0.457276, 0.309873], [-0.552243, 0.702368, -0.449118], [-0.012274, -0.545507, -0.838017]] and translation vector: [3.091993, 8.002051, 1.93396], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_151_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_151_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_151_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_151_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_151_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_151_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.956223, -0.170898, 0.237554], [-0.292595, -0.544035, 0.786393], [-0.005155, -0.821474, -0.570223]] and translation vector: [1.275326, 2.834272, 1.3185], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.956815, -0.170774, 0.235249], [-0.290631, -0.544392, 0.786875], [-0.00631, -0.821263, -0.570514]] and translation vector: [1.276568, 2.833979, 1.318089], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_152_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_152_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_152_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_152_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_152_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_152_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.852441, 0.228219, -0.470383], [0.522431, 0.337001, -0.78326], [-0.020235, -0.913426, -0.406502]] and translation vector: [1.798405, 5.320803, 1.619482], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.850776, 0.231102, -0.471988], [0.52508, 0.336676, -0.781627], [-0.021728, -0.91282, -0.407783]] and translation vector: [1.793927, 5.32593, 1.618758], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_153_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_153_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_153_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_153_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_153_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_153_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.999494, 0.005595, 0.031322], [-0.029883, 0.172936, -0.98448], [-0.010925, -0.984917, -0.172681]] and translation vector: [6.687301, 5.436423, 1.742894], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.999393, 0.00615, 0.034285], [-0.032681, 0.175053, -0.984017], [-0.012053, -0.98454, -0.174746]] and translation vector: [6.681215, 5.427393, 1.75699], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_154_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_154_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_154_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_154_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_154_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_154_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.99336, -0.011945, -0.114427], [0.103059, -0.349694, 0.931178], [-0.051137, -0.936788, -0.346141]] and translation vector: [2.948285, 4.432959, 1.460427], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.99314, -0.016022, -0.115825], [0.102925, -0.35027, 0.930977], [-0.055486, -0.936512, -0.346218]] and translation vector: [2.949102, 4.433566, 1.463483], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_155_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_155_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_155_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_155_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_155_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_155_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.40936, -0.486807, 0.77165], [-0.912164, 0.236459, -0.334729], [-0.019515, -0.840896, -0.540844]] and translation vector: [1.412713, 1.214489, 1.390939], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.417972, -0.487805, 0.766384], [-0.908352, 0.237425, -0.344277], [-0.014019, -0.840045, -0.542336]] and translation vector: [1.411881, 1.212071, 1.390231], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_156_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_156_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_156_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_156_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_156_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_156_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.442667, -0.46733, 0.765277], [-0.896368, 0.253361, -0.363776], [-0.023888, -0.847001, -0.531054]] and translation vector: [2.453469, 1.905797, 1.451684], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.441405, -0.472001, 0.763136], [-0.897015, 0.253848, -0.361837], [-0.022933, -0.844261, -0.535442]] and translation vector: [2.45238, 1.90449, 1.449179], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_157_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_157_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_157_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_157_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_157_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_157_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.88123, -0.188698, 0.433389], [-0.470321, -0.258404, 0.843816], [-0.047237, -0.947428, -0.316462]] and translation vector: [1.061636, 1.321782, 1.457525], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.879526, -0.187337, 0.437423], [-0.473303, -0.249401, 0.844857], [-0.049179, -0.950107, -0.308022]] and translation vector: [1.052651, 1.315727, 1.459226], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_158_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_158_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_158_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_158_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_158_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_158_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.15851, 0.420096, -0.893529], [0.981106, -0.034663, -0.190342], [-0.110934, -0.906817, -0.406664]] and translation vector: [4.004256, 0.910349, 2.578562], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.153085, 0.419732, -0.894645], [0.982322, -0.034068, -0.184071], [-0.107739, -0.907009, -0.407097]] and translation vector: [4.005316, 0.908549, 2.574668], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_159_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_159_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_159_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_159_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_159_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_159_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.565317, -0.50256, 0.654103], [-0.824719, 0.328974, -0.460017], [0.016003, -0.799506, -0.600445]] and translation vector: [4.07549, 5.065369, 1.281872], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.538132, -0.502349, 0.676801], [-0.842747, 0.30749, -0.441846], [0.013851, -0.808143, -0.588824]] and translation vector: [4.054681, 5.042427, 1.283033], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_160_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_160_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_160_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_160_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_160_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_160_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.998162, -0.007354, -0.06016], [0.055338, 0.294228, -0.954132], [0.024717, -0.955707, -0.293281]] and translation vector: [1.687981, 4.43329, 1.569003], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.998237, -0.004775, -0.059163], [0.055295, 0.287523, -0.956176], [0.021577, -0.957762, -0.286752]] and translation vector: [1.687716, 4.435163, 1.571974], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_161_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_161_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_161_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_161_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_161_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_161_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.30056, -0.511506, 0.805], [-0.953151, 0.130866, -0.272721], [0.034151, -0.849256, -0.526876]] and translation vector: [-0.281614, 2.924112, 1.306122], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.318531, -0.50267, 0.803655], [-0.947336, 0.139247, -0.288383], [0.033055, -0.85319, -0.520551]] and translation vector: [-0.284617, 2.924129, 1.305331], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_162_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_162_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_162_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_162_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_162_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_162_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.815869, 0.244354, -0.524069], [0.578211, -0.336271, 0.743367], [0.005416, -0.909513, -0.415641]] and translation vector: [2.358014, 1.230078, 1.369842], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.817563, 0.244526, -0.521342], [0.575764, -0.332513, 0.746947], [0.009295, -0.910847, -0.41264]] and translation vector: [2.355037, 1.229076, 1.372478], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_163_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_163_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_163_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_163_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_163_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_163_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.971613, -0.06682, 0.226943], [-0.235147, 0.378036, -0.89543], [-0.02596, -0.923376, -0.383017]] and translation vector: [2.775299, 4.618156, 1.427592], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.969099, -0.066923, 0.237421], [-0.244849, 0.377786, -0.892932], [-0.029937, -0.923471, -0.382498]] and translation vector: [2.770648, 4.620754, 1.418404], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_164_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_164_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_164_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_164_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_164_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_164_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.117057, -0.769276, 0.628102], [-0.987232, -0.021336, 0.157855], [-0.108033, -0.638561, -0.761951]] and translation vector: [1.032686, 1.226834, 2.186959], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.111522, -0.769903, 0.628341], [-0.98843, -0.020525, 0.150284], [-0.102807, -0.637831, -0.763284]] and translation vector: [1.037875, 1.232625, 2.186027], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_165_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_165_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_165_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_165_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_165_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_165_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.053762, 0.423971, -0.904079], [0.99709, -0.071809, 0.025618], [-0.05406, -0.902825, -0.426597]] and translation vector: [3.696534, 7.381392, 1.65485], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.059051, 0.424044, -0.903714], [0.996629, -0.076693, 0.029136], [-0.056954, -0.902388, -0.427143]] and translation vector: [3.693501, 7.384472, 1.654036], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_166_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_166_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_166_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_166_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_166_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_166_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.255196, -0.436856, 0.862573], [-0.966393, 0.143834, -0.213066], [-0.030988, -0.887958, -0.45888]] and translation vector: [1.734999, 0.744851, 1.432124], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.254375, -0.435236, 0.863634], [-0.966628, 0.142475, -0.21291], [-0.03038, -0.888972, -0.456953]] and translation vector: [1.735377, 0.747301, 1.433656], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_167_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_167_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_167_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_167_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_167_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_167_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.299058, 0.37418, -0.877812], [0.95368, -0.085842, 0.288314], [0.032528, -0.923375, -0.38252]] and translation vector: [3.908031, 4.993837, 1.41318], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.301871, 0.365699, -0.880419], [0.952911, -0.087746, 0.290279], [0.028901, -0.926588, -0.374966]] and translation vector: [3.903484, 4.991583, 1.422828], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_168_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_168_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_168_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_168_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_168_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_168_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.349467, 0.022881, -0.936669], [0.936944, -0.011774, 0.349282], [-0.003037, -0.999669, -0.025553]] and translation vector: [3.08553, 2.787215, 1.609269], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.348555, 0.021762, -0.937036], [0.937279, -0.012701, 0.34835], [-0.00432, -0.999682, -0.024824]] and translation vector: [3.086167, 2.787834, 1.610474], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_169_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_169_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_169_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_169_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_169_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_169_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.187285, -0.627824, 0.755488], [-0.982305, 0.118515, -0.145025], [0.001514, -0.76928, -0.63891]] and translation vector: [1.001752, 1.17634, 1.437838], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.187139, -0.630563, 0.75324], [-0.982328, 0.117514, -0.14568], [0.003345, -0.767191, -0.64141]] and translation vector: [1.00191, 1.178201, 1.437088], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_170_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_170_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_170_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_170_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_170_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_170_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.052123, 0.492225, -0.868906], [0.996177, 0.08671, -0.010637], [0.070107, -0.866138, -0.494863]] and translation vector: [3.27549, 2.071379, 1.287401], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.035278, 0.492309, -0.869705], [0.997133, 0.075637, 0.002369], [0.066948, -0.867128, -0.493566]] and translation vector: [3.286684, 2.076202, 1.285681], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_171_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_171_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_171_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_171_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_171_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_171_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.721847, -0.019511, -0.691778], [0.690918, -0.036893, 0.721991], [-0.039608, -0.999129, -0.013151]] and translation vector: [1.871862, 0.815296, 1.594356], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.723033, -0.022358, -0.690452], [0.689637, -0.034974, 0.723311], [-0.04032, -0.999138, -0.009869]] and translation vector: [1.872181, 0.815734, 1.596287], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_172_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_172_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_172_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_172_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_172_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_172_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.112591, -0.547395, 0.829266], [-0.992672, 0.098819, -0.069547], [-0.043877, -0.83102, -0.55451]] and translation vector: [1.18498, 1.814175, 1.496605], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.111637, -0.546351, 0.830083], [-0.992679, 0.100057, -0.067648], [-0.046096, -0.831558, -0.553521]] and translation vector: [1.186424, 1.810214, 1.495373], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_173_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_173_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_173_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_173_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_173_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_173_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.59597, 0.482312, -0.642025], [0.802979, -0.35126, 0.4815], [0.006716, -0.802491, -0.596626]] and translation vector: [3.449961, 1.112515, 1.412234], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.596047, 0.483799, -0.640833], [0.802896, -0.349913, 0.482617], [0.009254, -0.802184, -0.597005]] and translation vector: [3.451157, 1.111087, 1.411899], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_174_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_174_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_174_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_174_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_174_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_174_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.132001, -0.567775, 0.812532], [-0.991224, 0.069667, -0.112349], [0.007182, -0.820231, -0.571988]] and translation vector: [2.407685, 4.450429, 1.359714], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.130918, -0.563466, 0.8157], [-0.991376, 0.069526, -0.111087], [0.005882, -0.823209, -0.567709]] and translation vector: [2.40989, 4.444678, 1.359228], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_175_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_175_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_175_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_175_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_175_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_175_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.443363, -0.325026, 0.835337], [-0.895367, 0.117125, -0.429651], [0.041809, -0.938424, -0.342946]] and translation vector: [2.190343, 3.392878, 1.594635], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.439336, -0.32163, 0.838772], [-0.897253, 0.111545, -0.427195], [0.043838, -0.940272, -0.337589]] and translation vector: [2.183471, 3.393708, 1.586874], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_176_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_176_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_176_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_176_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_176_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_176_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.985254, -0.134646, 0.105573], [-0.142287, -0.302097, 0.942599], [-0.095024, -0.94372, -0.3168]] and translation vector: [1.134605, 1.549487, 1.505245], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.985752, -0.13049, 0.106142], [-0.141062, -0.297585, 0.944216], [-0.091624, -0.945736, -0.311752]] and translation vector: [1.131707, 1.551058, 1.506377], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_177_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_177_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_177_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_177_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_177_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_177_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.753053, 0.123809, -0.646206], [0.619922, -0.462608, 0.633791], [-0.220471, -0.877875, -0.42512]] and translation vector: [4.259223, 3.769218, 1.505729], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.760823, 0.125761, -0.636658], [0.611756, -0.466381, 0.638939], [-0.216572, -0.875599, -0.431768]] and translation vector: [4.257898, 3.775608, 1.505422], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_178_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_178_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_178_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_178_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_178_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_178_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.924746, 0.145405, -0.351715], [0.379908, 0.407811, -0.830277], [0.022707, -0.901414, -0.432362]] and translation vector: [3.891577, 4.106122, 1.335216], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.925289, 0.144931, -0.350479], [0.378485, 0.412032, -0.828842], [0.024284, -0.899569, -0.436102]] and translation vector: [3.892777, 4.104329, 1.336806], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_179_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_179_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_179_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_179_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_179_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_179_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.748873, -0.374013, 0.547087], [-0.662404, -0.447673, 0.600675], [0.020256, -0.812221, -0.582998]] and translation vector: [3.709567, 4.406117, 1.261793], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.747082, -0.370975, 0.551585], [-0.664465, -0.440253, 0.603874], [0.018814, -0.817652, -0.575405]] and translation vector: [3.708719, 4.403161, 1.261416], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_180_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_180_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_180_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_180_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_180_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_180_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.934222, -0.219071, 0.281493], [-0.356558, -0.595286, 0.72007], [0.009823, -0.773073, -0.634241]] and translation vector: [0.331108, 1.989283, 1.551545], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.93341, -0.222981, 0.281114], [-0.358788, -0.589093, 0.724045], [0.004154, -0.776691, -0.629868]] and translation vector: [0.338532, 1.98258, 1.554168], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_181_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_181_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_181_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_181_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_181_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_181_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.52463, -0.231347, 0.819293], [-0.850589, 0.102279, -0.515789], [0.03553, -0.96748, -0.25044]] and translation vector: [5.897326, 2.792535, 1.553822], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.52763, -0.228151, 0.818263], [-0.84888, 0.105585, -0.517933], [0.03177, -0.967884, -0.249382]] and translation vector: [5.897463, 2.790525, 1.551499], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_182_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_182_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_182_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_182_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_182_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_182_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.699126, -0.324611, 0.637064], [-0.713802, 0.265353, -0.648131], [0.041344, -0.907863, -0.417224]] and translation vector: [0.050403, 3.78209, 1.506908], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.698648, -0.327666, 0.636024], [-0.713993, 0.262294, -0.649166], [0.045885, -0.907654, -0.417203]] and translation vector: [0.047406, 3.786517, 1.504266], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_183_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_183_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_183_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_183_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_183_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_183_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.990268, -0.101591, 0.095124], [-0.135934, -0.559426, 0.817658], [-0.029851, -0.822631, -0.567792]] and translation vector: [6.679901, 2.488796, 1.402653], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.989948, -0.105417, 0.094292], [-0.137296, -0.556168, 0.819651], [-0.033963, -0.824357, -0.565051]] and translation vector: [6.681146, 2.493639, 1.408598], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_184_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_184_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_184_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_184_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_184_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_184_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.311411, -0.45253, 0.835607], [-0.948656, 0.199362, -0.245576], [-0.055457, -0.869179, -0.491379]] and translation vector: [2.299133, 2.388773, 1.459468], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.314195, -0.454542, 0.833471], [-0.947818, 0.20019, -0.248124], [-0.05407, -0.867937, -0.493722]] and translation vector: [2.299448, 2.389842, 1.45904], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_185_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_185_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_185_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_185_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_185_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_185_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.789457, 0.162095, -0.592016], [0.613764, 0.197318, -0.764434], [-0.007096, -0.966846, -0.255262]] and translation vector: [5.114759, 3.17533, 1.386193], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.785271, 0.158609, -0.598492], [0.619131, 0.193201, -0.761151], [-0.005096, -0.968255, -0.249915]] and translation vector: [5.11251, 3.170745, 1.383731], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_186_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_186_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_186_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_186_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_186_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_186_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.408988, -0.323891, 0.853126], [-0.912443, -0.158736, 0.37716], [0.013263, -0.932683, -0.360453]] and translation vector: [3.672612, 2.990265, 1.494339], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.403714, -0.307769, 0.861564], [-0.914697, -0.154884, 0.373283], [0.018558, -0.93877, -0.344045]] and translation vector: [3.67724, 2.998002, 1.501107], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_187_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_187_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_187_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_187_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_187_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_187_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.831143, 0.312948, -0.459636], [0.555586, 0.43327, -0.709649], [-0.022937, -0.845187, -0.533978]] and translation vector: [2.360292, 3.05803, 1.315354], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.8108, 0.328121, -0.484706], [0.584922, 0.423558, -0.691711], [-0.021664, -0.844355, -0.535346]] and translation vector: [2.374215, 3.08026, 1.318953], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_188_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_188_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_188_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_188_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_188_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_188_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.802837, 0.056561, -0.593509], [0.596192, 0.071654, -0.799638], [-0.002701, -0.995825, -0.091248]] and translation vector: [2.583219, 4.008804, 1.439254], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.802466, 0.056012, -0.594063], [0.59669, 0.070227, -0.799393], [-0.003056, -0.995957, -0.089777]] and translation vector: [2.583684, 4.008714, 1.434935], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_189_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_189_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_189_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_189_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_189_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_189_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.032646, 0.194727, -0.980314], [0.998594, -0.034636, -0.040135], [-0.04177, -0.980246, -0.193322]] and translation vector: [3.506056, 2.493951, 1.706783], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.038857, 0.192835, -0.980462], [0.998032, -0.040846, -0.047587], [-0.049225, -0.980381, -0.190868]] and translation vector: [3.502031, 2.499079, 1.701362], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_190_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_190_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_190_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_190_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_190_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_190_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.573389, -0.355745, 0.738018], [-0.818965, 0.223754, -0.528424], [0.02285, -0.907403, -0.419641]] and translation vector: [2.061407, 3.857203, 1.382209], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.569689, -0.351701, 0.742806], [-0.821614, 0.221591, -0.525212], [0.020118, -0.909508, -0.4152]] and translation vector: [2.058259, 3.848013, 1.384733], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_191_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_191_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_191_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_191_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_191_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_191_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.863619, -0.252896, 0.436126], [-0.502889, 0.371124, -0.780621], [0.03556, -0.893482, -0.447688]] and translation vector: [2.007098, 3.82416, 1.536992], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.862677, -0.255046, 0.436739], [-0.504412, 0.370978, -0.779707], [0.036841, -0.892932, -0.448682]] and translation vector: [2.007321, 3.81907, 1.542811], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_192_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_192_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_192_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_192_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_192_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_192_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.986418, -0.051155, 0.156087], [-0.152905, 0.633099, -0.758819], [-0.060001, -0.772379, -0.632322]] and translation vector: [2.055195, 1.600374, 1.268236], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.986809, -0.050817, 0.15371], [-0.151071, 0.630346, -0.761474], [-0.058194, -0.77465, -0.629707]] and translation vector: [2.054364, 1.600927, 1.26836], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_193_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_193_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_193_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_193_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_193_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_193_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.436119, -0.427186, 0.79203], [-0.89981, 0.218659, -0.377532], [-0.011909, -0.877326, -0.479747]] and translation vector: [1.992302, 3.72193, 1.553249], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.436462, -0.426736, 0.792084], [-0.899636, 0.219226, -0.377618], [-0.012502, -0.877403, -0.47959]] and translation vector: [1.991236, 3.722176, 1.553282], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_194_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_194_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_194_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_194_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_194_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_194_5.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.996429, -0.081152, -0.023325], [-0.01119, 0.400709, -0.916137], [0.083693, -0.912604, -0.400187]] and translation vector: [7.365378, 2.610504, 1.343957], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.997089, -0.075007, -0.013671], [-0.016913, 0.392439, -0.919623], [0.074343, -0.916715, -0.392565]] and translation vector: [7.36531, 2.61944, 1.344548], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_195_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_195_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_195_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_195_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_195_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_195_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.896132, -0.052356, 0.440688], [-0.436974, -0.277444, 0.855616], [0.07747, -0.959314, -0.271505]] and translation vector: [3.211431, 3.110947, 1.584554], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.889709, -0.065096, 0.451863], [-0.451099, -0.277541, 0.848222], [0.070195, -0.958506, -0.276295]] and translation vector: [3.215954, 3.116336, 1.570817], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_196_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_196_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_196_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_196_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_196_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_196_5.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.643628, -0.362528, 0.674031], [-0.765241, -0.290748, 0.574345], [-0.012243, -0.88546, -0.464555]] and translation vector: [2.632762, 2.243425, 1.452714], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.642371, -0.361874, 0.675579], [-0.76623, -0.285016, 0.575898], [-0.015852, -0.887589, -0.460364]] and translation vector: [2.634792, 2.237319, 1.452971], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_197_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_197_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_197_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_197_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_197_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_197_5.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[-0.612656, -0.411508, 0.674769], [-0.789543, 0.280105, -0.546043], [0.035694, -0.867296, -0.496511]] and translation vector: [1.897828, 2.372103, 1.388776], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[-0.615876, -0.406578, 0.674826], [-0.787242, 0.284147, -0.547275], [0.03076, -0.868305, -0.495075]] and translation vector: [1.892345, 2.36762, 1.390764], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_198_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_198_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_198_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_198_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_198_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_198_5.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Depth_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_depth_estimation", "options": "A: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "question": "Given the first color image view of the scene with the corresponding camera pose, i.e., rotation matrix: [[0.752445, 0.275595, -0.598225], [0.657828, -0.35994, 0.661593], [-0.032994, -0.891342, -0.452129]] and translation vector: [2.633805, 2.70906, 1.31733], and the second color image view of the same scene with the corresponding camera pose, i.e., rotation matrix: [[0.746128, 0.269733, -0.608718], [0.664676, -0.35493, 0.657443], [-0.038718, -0.895136, -0.444108]] and translation vector: [2.667176, 2.689206, 1.310347], please estimate the depth map for the first view of the RGB image. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to estimate the depth map for a color image based on two color images captured from two viewpoints, along with the corresponding camera poses.The input images are the first 2 images\nSelect from the following choices.\nA: The 3th image\nB: The 4th image\nC: The 5th image\nD: The 6th image", "input_image_path": ["./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_199_0.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_199_1.jpg", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_199_2.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_199_3.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_199_4.png", "./3D-spatial/threeD_Depth_Estimation/threeD_Depth_Estimation_199_5.png"], "output": "C", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/threeD_Object_Detection/qwen3-vl/metadata_info.json b/results/threeD_Object_Detection/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..06540ed --- /dev/null +++ b/results/threeD_Object_Detection/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.097, -2.343, -0.119, 0.31, 0.062, 0.564], [-0.682, 2.2, 0.854, 0.245, 0.21, 0.887], [1.912, 1.405, 1.111, 0.468, 0.452, 0.649], [1.666, 2.151, 1.319, 0.566, 0.243, 0.586], [1.68, 1.827, 0.754, -0.048, 0.681, 0.119], [1.224, 1.284, 0.947, -0.031, 0.464, 0.194], [1.776, 1.335, 0.376, 0.886, 0.055, 0.335], [1.559, 1.179, 1.34, -0.063, -0.136, -0.017], [0.95, 0.255, 1.046, -0.137, 0.36, -0.187], [1.987, 0.956, 0.62, 0.226, -0.239, 0.309], [1.652, 0.525, 1.179, 0.104, 0.539, -0.125], [1.409, 0.711, 0.681, 0.259, 0.215, 0.643], [1.086, -0.177, -0.123, 0.706, 0.621, 0.551], [1.203, 1.982, 0.324, 0.158, 
0.718, -0.1], [-1.373, -0.521, 0.674, -0.225, 0.726, 0.299], [-0.949, 2.103, 0.551, 0.314, 0.298, 0.513], [-0.76, 1.491, 2.338, 0.158, -0.084, 0.396], [1.233, -0.382, 0.447, 0.091, 0.059, -0.038], [1.9, -1.181, 1.39, 0.188, -0.114, -0.011]]\nB: [[-0.029, -1.923, 0.096, 0.249, 0.113, 0.317], [-0.92, 1.815, 0.646, 0.107, 0.184, 0.516], [1.611, 1.619, 0.954, 0.253, 0.266, 0.249], [1.544, 1.826, 0.949, 0.149, 0.178, 0.182], [1.432, 1.79, 0.87, 0.251, 0.301, 0.054], [1.423, 1.326, 0.897, 0.087, 0.104, 0.188], [1.554, 0.837, 0.836, 0.609, 0.432, 0.17], [1.27, 0.844, 0.842, 0.093, 0.107, 0.179], [1.248, 0.676, 0.764, 0.25, 0.269, 0.064], [1.848, 0.645, 0.825, 0.074, 0.102, 0.07], [1.69, 0.454, 0.837, 0.248, 0.167, 0.182], [1.715, 0.276, 0.86, 0.335, 0.399, 0.17], [1.4, -0.193, 0.037, 0.353, 0.275, 0.161], [1.684, 1.781, 0.29, 0.433, 0.395, 0.339], [-1.62, -0.65, 0.742, 0.272, 0.321, 0.114], [-0.975, 1.832, 0.171, 0.262, 0.195, 0.125], [-1.021, 1.599, 2.098, 0.217, 0.273, 0.221], [1.433, 0.084, 0.074, 0.398, 0.344, 0.176], [1.733, -1.208, 1.076, 0.125, 0.108, 0.327]]\nC: [[0.373, -2.148, -0.291, 0.339, 0.018, 0.694], [-0.499, 2.165, 0.993, 0.276, 0.335, 0.775], [1.74, 1.478, 1.323, -0.069, 0.758, 0.607], [1.532, 2.302, 1.262, 0.285, 0.22, -0.252], [1.323, 1.537, 0.593, 0.351, 0.467, 0.392], [1.364, 1.041, 1.236, 0.12, 0.57, 0.444], [1.442, 1.263, 1.284, 1.004, 0.007, 0.304], [1.115, 0.536, 0.672, -0.113, -0.219, -0.082], [1.743, 0.762, 0.395, 0.159, 0.41, 0.323], [2.121, 0.573, 0.527, -0.324, 0.247, 0.462], [1.447, 0.752, 1.299, 0.299, 0.347, 0.233], [1.92, 0.62, 0.769, -0.13, 0.686, -0.059], [0.942, 0.049, -0.066, 0.316, 0.607, 0.459], [2.077, 2.024, 0.781, 0.373, -0.058, 0.752], [-1.491, -0.599, 0.622, 0.707, -0.171, -0.319], [-1.023, 1.772, -0.236, 0.203, 0.47, 0.117], [-0.596, 1.76, 1.726, 0.197, 0.073, 0.18], [1.574, 0.398, 0.118, 0.732, 0.235, 0.24], [1.654, -1.081, 1.126, -0.043, 0.128, 0.085]]\nD: [[-0.016, -2.182, 0.529, 0.012, -0.234, 0.082], [-0.638, 1.532, 
1.107, 0.49, 0.648, 0.861], [1.27, 1.262, 1.438, 0.461, 0.457, 0.658], [1.731, 1.955, 1.411, 0.079, 0.038, 0.636], [1.916, 1.8, 0.455, 0.749, 0.555, 0.441], [1.273, 0.97, 0.909, 0.183, -0.155, 0.402], [1.478, 1.046, 1.305, 0.37, 0.729, 0.224], [1.279, 0.48, 0.354, 0.143, -0.211, 0.086], [1.055, 0.494, 1.055, -0.029, 0.559, -0.151], [2.256, 0.151, 1.167, -0.326, -0.138, 0.075], [1.326, 0.605, 0.815, -0.119, 0.42, 0.177], [2.172, 0.341, 0.688, 0.742, 0.292, 0.566], [1.621, -0.605, 0.175, 0.538, -0.117, 0.628], [1.477, 1.542, -0.082, 0.684, 0.168, -0.065], [-1.675, -0.211, 0.417, 0.169, -0.09, -0.164], [-1.277, 1.624, 0.657, -0.231, 0.334, -0.097], [-0.751, 1.371, 1.707, -0.044, 0.702, 0.452], [1.41, -0.207, 0.284, 0.114, 0.651, -0.312], [1.447, -0.888, 1.553, -0.369, 0.402, 0.174]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[0.86482, -0.183466, 0.467362], [-0.501092, -0.256948, 0.826368], [-0.031523, -0.948851, -0.314147]]; the translation vector: [3.012278, 2.022242, 1.442339], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.097, -2.343, -0.119, 0.31, 0.062, 0.564], [-0.682, 2.2, 0.854, 0.245, 0.21, 0.887], [1.912, 1.405, 1.111, 0.468, 0.452, 0.649], [1.666, 2.151, 1.319, 0.566, 0.243, 0.586], [1.68, 1.827, 0.754, -0.048, 0.681, 0.119], [1.224, 1.284, 0.947, -0.031, 0.464, 0.194], [1.776, 1.335, 0.376, 0.886, 0.055, 0.335], [1.559, 1.179, 1.34, -0.063, -0.136, -0.017], [0.95, 0.255, 1.046, -0.137, 0.36, -0.187], [1.987, 0.956, 0.62, 0.226, -0.239, 0.309], [1.652, 0.525, 1.179, 0.104, 0.539, -0.125], [1.409, 0.711, 0.681, 0.259, 0.215, 0.643], [1.086, -0.177, -0.123, 0.706, 0.621, 0.551], [1.203, 1.982, 0.324, 0.158, 0.718, -0.1], [-1.373, -0.521, 0.674, -0.225, 0.726, 0.299], [-0.949, 2.103, 0.551, 0.314, 0.298, 0.513], [-0.76, 1.491, 2.338, 0.158, -0.084, 0.396], [1.233, -0.382, 0.447, 0.091, 0.059, -0.038], [1.9, -1.181, 1.39, 0.188, -0.114, -0.011]]\nB: [[-0.029, -1.923, 0.096, 0.249, 0.113, 0.317], [-0.92, 1.815, 0.646, 0.107, 0.184, 0.516], [1.611, 1.619, 0.954, 0.253, 0.266, 0.249], [1.544, 1.826, 0.949, 0.149, 0.178, 0.182], [1.432, 1.79, 0.87, 0.251, 0.301, 0.054], [1.423, 1.326, 0.897, 0.087, 0.104, 0.188], [1.554, 0.837, 0.836, 0.609, 0.432, 0.17], [1.27, 0.844, 0.842, 0.093, 0.107, 0.179], [1.248, 0.676, 0.764, 0.25, 0.269, 0.064], [1.848, 0.645, 0.825, 0.074, 0.102, 0.07], [1.69, 0.454, 0.837, 0.248, 0.167, 0.182], [1.715, 0.276, 0.86, 0.335, 0.399, 0.17], [1.4, -0.193, 0.037, 0.353, 0.275, 0.161], [1.684, 1.781, 0.29, 0.433, 0.395, 0.339], [-1.62, -0.65, 0.742, 0.272, 0.321, 0.114], [-0.975, 1.832, 0.171, 0.262, 0.195, 0.125], [-1.021, 1.599, 2.098, 0.217, 0.273, 0.221], [1.433, 0.084, 0.074, 0.398, 0.344, 0.176], [1.733, -1.208, 1.076, 0.125, 0.108, 0.327]]\nC: [[0.373, -2.148, -0.291, 0.339, 0.018, 0.694], [-0.499, 2.165, 0.993, 0.276, 0.335, 0.775], [1.74, 1.478, 1.323, -0.069, 0.758, 0.607], [1.532, 2.302, 1.262, 0.285, 0.22, -0.252], [1.323, 1.537, 0.593, 0.351, 0.467, 0.392], [1.364, 1.041, 1.236, 0.12, 0.57, 0.444], 
[1.442, 1.263, 1.284, 1.004, 0.007, 0.304], [1.115, 0.536, 0.672, -0.113, -0.219, -0.082], [1.743, 0.762, 0.395, 0.159, 0.41, 0.323], [2.121, 0.573, 0.527, -0.324, 0.247, 0.462], [1.447, 0.752, 1.299, 0.299, 0.347, 0.233], [1.92, 0.62, 0.769, -0.13, 0.686, -0.059], [0.942, 0.049, -0.066, 0.316, 0.607, 0.459], [2.077, 2.024, 0.781, 0.373, -0.058, 0.752], [-1.491, -0.599, 0.622, 0.707, -0.171, -0.319], [-1.023, 1.772, -0.236, 0.203, 0.47, 0.117], [-0.596, 1.76, 1.726, 0.197, 0.073, 0.18], [1.574, 0.398, 0.118, 0.732, 0.235, 0.24], [1.654, -1.081, 1.126, -0.043, 0.128, 0.085]]\nD: [[-0.016, -2.182, 0.529, 0.012, -0.234, 0.082], [-0.638, 1.532, 1.107, 0.49, 0.648, 0.861], [1.27, 1.262, 1.438, 0.461, 0.457, 0.658], [1.731, 1.955, 1.411, 0.079, 0.038, 0.636], [1.916, 1.8, 0.455, 0.749, 0.555, 0.441], [1.273, 0.97, 0.909, 0.183, -0.155, 0.402], [1.478, 1.046, 1.305, 0.37, 0.729, 0.224], [1.279, 0.48, 0.354, 0.143, -0.211, 0.086], [1.055, 0.494, 1.055, -0.029, 0.559, -0.151], [2.256, 0.151, 1.167, -0.326, -0.138, 0.075], [1.326, 0.605, 0.815, -0.119, 0.42, 0.177], [2.172, 0.341, 0.688, 0.742, 0.292, 0.566], [1.621, -0.605, 0.175, 0.538, -0.117, 0.628], [1.477, 1.542, -0.082, 0.684, 0.168, -0.065], [-1.675, -0.211, 0.417, 0.169, -0.09, -0.164], [-1.277, 1.624, 0.657, -0.231, 0.334, -0.097], [-0.751, 1.371, 1.707, -0.044, 0.702, 0.452], [1.41, -0.207, 0.284, 0.114, 0.651, -0.312], [1.447, -0.888, 1.553, -0.369, 0.402, 0.174]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_0_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_0_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.373, -1.08, 0.874, 0.298, 4.385, 1.982], [2.17, -0.036, 1.086, 0.309, 6.903, 1.887], [0.087, 4.155, 1.455, 2.931, 0.24, 1.054], [-2.394, 2.523, 0.998, 0.208, 1.864, 1.096]]\nB: [[-2.011, -0.956, 1.222, 0.755, 
4.764, 2.451], [2.068, 0.255, 0.587, 0.436, 7.258, 1.991], [0.329, 4.55, 1.265, 3.421, 0.529, 1.367], [-2.081, 2.555, 0.715, 0.343, 1.488, 1.021]]\nC: [[-2.751, -0.591, 0.65, -0.115, 4.495, 2.351], [1.978, 0.331, 1.034, 0.171, 7.03, 2.051], [0.03, 3.84, 1.693, 3.348, 0.554, 1.247], [-2.636, 2.957, 1.408, -0.018, 1.435, 1.132]]\nD: [[-2.401, -1.35, 0.706, 0.08, 4.387, 2.134], [2.651, -0.401, 0.766, 0.612, 6.557, 1.511], [0.041, 3.914, 1.655, 3.173, 0.701, 0.821], [-2.147, 2.752, 0.898, 0.388, 2.028, 1.081]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.060487, 0.154719, -0.986105], [0.998165, 0.006603, -0.060191], [-0.002801, -0.987936, -0.154835]]; the translation vector: [6.630666, 2.572317, 1.44523], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.373, -1.08, 0.874, 0.298, 4.385, 1.982], [2.17, -0.036, 1.086, 0.309, 6.903, 1.887], [0.087, 4.155, 1.455, 2.931, 0.24, 1.054], [-2.394, 2.523, 0.998, 0.208, 1.864, 1.096]]\nB: [[-2.011, -0.956, 1.222, 0.755, 4.764, 2.451], [2.068, 0.255, 0.587, 0.436, 7.258, 1.991], [0.329, 4.55, 1.265, 3.421, 0.529, 1.367], [-2.081, 2.555, 0.715, 0.343, 1.488, 1.021]]\nC: [[-2.751, -0.591, 0.65, -0.115, 4.495, 2.351], [1.978, 0.331, 1.034, 0.171, 7.03, 2.051], [0.03, 3.84, 1.693, 3.348, 0.554, 1.247], [-2.636, 2.957, 1.408, -0.018, 1.435, 1.132]]\nD: [[-2.401, -1.35, 0.706, 0.08, 4.387, 2.134], [2.651, -0.401, 0.766, 0.612, 6.557, 1.511], [0.041, 3.914, 1.655, 3.173, 0.701, 0.821], [-2.147, 2.752, 0.898, 0.388, 2.028, 1.081]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_1_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_1_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.552, 0.743, 0.817, 1.009, -0.013, 1.113], [0.943, 1.174, 0.929, -0.18, 0.779, 1.068]]\nB: [[0.748, 0.782, 0.621, 0.556, 0.127, 0.839], [1.697, 1.131, 0.375, -0.273, 1.039, 1.495]]\nC: [[0.612, 1.202, 0.708, 0.529, 0.114, 0.821], [1.612, 1.184, 0.198, 0.419, 0.977, 0.647]]\nD: [[0.368, 1.181, 0.615, 0.989, 0.028, 1.271], [1.284, 0.726, 0.504, 0.067, 0.905, 1.037]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the doorframe in the scene. The camera pose information includes: the rotation matrix: [[-0.610102, 0.375008, -0.697958], [0.791763, 0.255448, -0.554849], [-0.029781, -0.891132, -0.452767]]; the translation vector: [2.349929, 1.419923, 1.358478], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.552, 0.743, 0.817, 1.009, -0.013, 1.113], [0.943, 1.174, 0.929, -0.18, 0.779, 1.068]]\nB: [[0.748, 0.782, 0.621, 0.556, 0.127, 0.839], [1.697, 1.131, 0.375, -0.273, 1.039, 1.495]]\nC: [[0.612, 1.202, 0.708, 0.529, 0.114, 0.821], [1.612, 1.184, 0.198, 0.419, 0.977, 0.647]]\nD: [[0.368, 1.181, 0.615, 0.989, 0.028, 1.271], [1.284, 0.726, 0.504, 0.067, 0.905, 1.037]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_2_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_2_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.47, 0.453, 0.894, 0.2, 0.52, 0.291], [1.542, -0.676, 0.862, 0.217, 0.405, 0.289], [-1.666, -1.034, 0.158, 0.332, 0.363, 0.294]]\nB: [[1.471, 0.336, 1.375, -0.216, 0.786, 0.736], [1.469, -0.24, 1.152, 0.44, 0.255, 0.196], [-1.84, -1.112, 0.203, 0.586, 0.569, 0.159]]\nC: [[1.315, 0.861, 1.294, -0.145, 0.334, 0.615], [1.166, -0.686, 1.016, 0.2, 0.258, 0.346], [-2.029, -1.236, -0.071, 0.818, 0.37, 0.684]]\nD: [[1.59, 0.904, 1.331, 0.394, 0.302, 0.781], [1.626, -0.262, 1.266, -0.178, 0.337, 0.326], [-1.34, -1.047, 0.45, -0.149, 0.438, 0.179]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the speaker in the scene. 
The camera pose information includes: the rotation matrix: [[-0.283698, -0.38675, 0.877463], [-0.95878, 0.129662, -0.252839], [-0.015988, -0.913024, -0.407593]]; the translation vector: [3.69525, 3.551647, 1.352095], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.47, 0.453, 0.894, 0.2, 0.52, 0.291], [1.542, -0.676, 0.862, 0.217, 0.405, 0.289], [-1.666, -1.034, 0.158, 0.332, 0.363, 0.294]]\nB: [[1.471, 0.336, 1.375, -0.216, 0.786, 0.736], [1.469, -0.24, 1.152, 0.44, 0.255, 0.196], [-1.84, -1.112, 0.203, 0.586, 0.569, 0.159]]\nC: [[1.315, 0.861, 1.294, -0.145, 0.334, 0.615], [1.166, -0.686, 1.016, 0.2, 0.258, 0.346], [-2.029, -1.236, -0.071, 0.818, 0.37, 0.684]]\nD: [[1.59, 0.904, 1.331, 0.394, 0.302, 0.781], [1.626, -0.262, 1.266, -0.178, 0.337, 0.326], [-1.34, -1.047, 0.45, -0.149, 0.438, 0.179]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_3_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_3_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.415, 0.474, 0.662, 0.608, 0.318, 0.607], [0.137, 1.366, 0.214, 0.809, 0.687, 0.563], [-0.119, -1.513, 0.412, 0.991, 0.469, 0.436], [0.958, -1.756, 0.374, 0.393, 0.461, 0.273]]\nB: [[0.097, 0.337, 0.367, 0.736, 0.669, 0.76], [-0.118, 0.915, 0.406, 0.54, 0.71, 0.78], [0.039, -1.273, 0.366, 0.52, 0.703, 0.787], [0.484, -2.107, 0.393, 0.516, 0.773, 0.731]]\nC: 
[[-0.145, 0.33, 0.215, 0.329, 0.397, 1.235], [-0.041, 1.377, -0.008, 0.173, 0.698, 1.043], [0.354, -0.822, 0.479, 0.306, 0.474, 0.987], [0.885, -2.559, 0.576, 0.548, 1.045, 0.546]]\nD: [[-0.062, 0.608, 0.646, 1.11, 1.056, 0.374], [0.266, 1.23, 0.893, 0.95, 0.801, 1.268], [-0.368, -1.641, 0.003, 0.257, 0.709, 0.427], [0.563, -2.189, 0.486, 0.531, 1.025, 0.714]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[-0.857694, 0.203115, -0.472341], [0.513544, 0.293426, -0.806333], [-0.025181, -0.934155, -0.355978]]; the translation vector: [3.161674, 3.662206, 1.335287], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.415, 0.474, 0.662, 0.608, 0.318, 0.607], [0.137, 1.366, 0.214, 0.809, 0.687, 0.563], [-0.119, -1.513, 0.412, 0.991, 0.469, 0.436], [0.958, -1.756, 0.374, 0.393, 0.461, 0.273]]\nB: [[0.097, 0.337, 0.367, 0.736, 0.669, 0.76], [-0.118, 0.915, 0.406, 0.54, 0.71, 0.78], [0.039, -1.273, 0.366, 0.52, 0.703, 0.787], [0.484, -2.107, 0.393, 0.516, 0.773, 0.731]]\nC: [[-0.145, 0.33, 0.215, 0.329, 0.397, 1.235], [-0.041, 1.377, -0.008, 0.173, 0.698, 1.043], [0.354, -0.822, 0.479, 0.306, 0.474, 0.987], [0.885, -2.559, 0.576, 0.548, 1.045, 0.546]]\nD: [[-0.062, 0.608, 0.646, 1.11, 1.056, 0.374], [0.266, 1.23, 0.893, 0.95, 0.801, 1.268], [-0.368, -1.641, 0.003, 0.257, 0.709, 0.427], [0.563, -2.189, 0.486, 0.531, 1.025, 0.714]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_4_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_4_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.3, -0.383, 0.923, -0.356, 3.02, 2.635], [-2.115, 1.383, 0.841, 0.469, 0.927, 1.856], [-0.097, 1.484, 1.187, 3.044, -0.119, 1.807], [-1.377, 2.117, 0.899, -0.033, 1.21, 1.808], [1.159, -1.198, 1.874, 0.555, 0.176, 1.325]]\nB: [[1.941, -0.755, 1.598, -0.121, 2.674, 2.518], [-2.423, 1.211, 1.402, 0.192, 0.868, 2.164], [0.169, 1.203, 1.223, 2.673, -0.21, 2.157], [-1.688, 1.565, 0.939, 0.535, 1.926, 1.811], [1.734, -1.649, 1.385, 0.714, 0.109, 1.38]]\nC: [[1.696, -0.287, 1.129, 0.119, 2.8, 2.268], [-2.577, 1.535, 1.204, 0.613, 1.32, 2.198], [0.155, 1.133, 1.131, 3.032, 0.104, 2.204], [-1.285, 1.824, 1.154, 0.225, 1.437, 2.219], [1.488, -1.695, 1.419, 0.405, 0.098, 1.172]]\nD: [[1.453, -0.695, 1.348, 0.547, 3.272, 2.434], [-2.777, 1.417, 1.534, 1.007, 1.19, 1.834], [-0.034, 0.735, 1.208, 2.75, 0.551, 2.499], [-1.311, 1.97, 1.046, -0.151, 1.928, 1.721], [1.211, -2.136, 1.037, 
0.829, -0.033, 1.518]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.199941, 0.263531, -0.943703], [0.979453, -0.027844, 0.19974], [0.026362, -0.964249, -0.263683]]; the translation vector: [3.611549, 3.757055, 1.562045], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.3, -0.383, 0.923, -0.356, 3.02, 2.635], [-2.115, 1.383, 0.841, 0.469, 0.927, 1.856], [-0.097, 1.484, 1.187, 3.044, -0.119, 1.807], [-1.377, 2.117, 0.899, -0.033, 1.21, 1.808], [1.159, -1.198, 1.874, 0.555, 0.176, 1.325]]\nB: [[1.941, -0.755, 1.598, -0.121, 2.674, 2.518], [-2.423, 1.211, 1.402, 0.192, 0.868, 2.164], [0.169, 1.203, 1.223, 2.673, -0.21, 2.157], [-1.688, 1.565, 0.939, 0.535, 1.926, 1.811], [1.734, -1.649, 1.385, 0.714, 0.109, 1.38]]\nC: [[1.696, -0.287, 1.129, 0.119, 2.8, 2.268], [-2.577, 1.535, 1.204, 0.613, 1.32, 2.198], [0.155, 1.133, 1.131, 3.032, 0.104, 2.204], [-1.285, 1.824, 1.154, 0.225, 1.437, 2.219], [1.488, -1.695, 1.419, 0.405, 0.098, 1.172]]\nD: [[1.453, -0.695, 1.348, 0.547, 3.272, 2.434], [-2.777, 1.417, 1.534, 1.007, 1.19, 1.834], [-0.034, 0.735, 1.208, 2.75, 0.551, 2.499], [-1.311, 1.97, 1.046, -0.151, 1.928, 1.721], [1.211, -2.136, 1.037, 0.829, -0.033, 1.518]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_5_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_5_1.png"], "output": "C", "qwen3-vl": "image none"}, 
{"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[2.228, -1.379, 0.968, 0.05, 0.868, 0.876]]\nB: [[1.972, -1.363, 0.684, 0.094, 1.002, 1.369]]\nC: [[2.421, -1.068, 1.034, 0.177, 1.167, 1.478]]\nD: [[2.274, -1.598, 1.168, 0.52, 1.421, 1.228]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. The camera pose information includes: the rotation matrix: [[-0.156961, 0.257294, -0.953501], [0.986843, 0.002956, -0.161652], [-0.038773, -0.966329, -0.254373]]; the translation vector: [1.838324, 1.205476, 1.480452], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[2.228, -1.379, 0.968, 0.05, 0.868, 0.876]]\nB: [[1.972, -1.363, 0.684, 0.094, 1.002, 1.369]]\nC: [[2.421, -1.068, 1.034, 0.177, 1.167, 1.478]]\nD: [[2.274, -1.598, 1.168, 0.52, 1.421, 1.228]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_6_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_6_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.821, -1.627, 0.918, 0.317, 0.52, -0.34]]\nB: [[-1.196, -1.892, 0.539, 0.329, 0.374, 0.146]]\nC: [[-1.472, -2.353, 0.745, 0.703, 0.302, -0.352]]\nD: [[-1.33, -1.906, 0.911, -0.061, 0.797, -0.104]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the jacket in the scene. The camera pose information includes: the rotation matrix: [[0.999847, -0.004634, 0.01689], [-0.017397, -0.374134, 0.927211], [0.002023, -0.927363, -0.374157]]; the translation vector: [3.310194, 3.16458, 1.506432], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.821, -1.627, 0.918, 0.317, 0.52, -0.34]]\nB: [[-1.196, -1.892, 0.539, 0.329, 0.374, 0.146]]\nC: [[-1.472, -2.353, 0.745, 0.703, 0.302, -0.352]]\nD: [[-1.33, -1.906, 0.911, -0.061, 0.797, -0.104]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_7_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_7_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.441, 1.064, 0.78, 0.6, 1.194, 1.759]]\nB: [[0.806, 1.175, 0.723, 1.145, 0.857, 1.307]]\nC: [[1.47, 1.204, 0.738, 1.161, 1.149, 1.298]]\nD: [[1.013, 1.023, 0.774, 0.913, 1.329, 1.578]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the desk in the scene. The camera pose information includes: the rotation matrix: [[0.977181, 0.077241, -0.197866], [0.211774, -0.426158, 0.879512], [-0.016388, -0.901345, -0.432791]]; the translation vector: [0.977323, 0.877303, 1.40232], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.441, 1.064, 0.78, 0.6, 1.194, 1.759]]\nB: [[0.806, 1.175, 0.723, 1.145, 0.857, 1.307]]\nC: [[1.47, 1.204, 0.738, 1.161, 1.149, 1.298]]\nD: [[1.013, 1.023, 0.774, 0.913, 1.329, 1.578]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_8_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_8_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.32, 0.548, 0.015, 3.907, 5.219, 0.432]]\nB: [[-0.869, 0.661, -0.307, 3.672, 4.614, -0.069]]\nC: [[-0.885, 0.436, 0.066, 3.44, 4.871, 0.305]]\nD: [[-1.228, 0.813, -0.025, 3.355, 4.94, 0.54]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. The camera pose information includes: the rotation matrix: [[-0.30056, -0.511506, 0.805], [-0.953151, 0.130866, -0.272721], [0.034151, -0.849256, -0.526876]]; the translation vector: [-0.281614, 2.924112, 1.306122], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.32, 0.548, 0.015, 3.907, 5.219, 0.432]]\nB: [[-0.869, 0.661, -0.307, 3.672, 4.614, -0.069]]\nC: [[-0.885, 0.436, 0.066, 3.44, 4.871, 0.305]]\nD: [[-1.228, 0.813, -0.025, 3.355, 4.94, 0.54]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_9_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_9_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.236, 0.438, 2.571, 1.376, 1.585, 0.296]]\nB: [[0.105, 0.071, 2.209, 1.375, 1.269, -0.259]]\nC: [[-0.082, 0.088, 2.553, 1.257, 1.665, 0.102]]\nD: [[-0.566, -0.056, 2.979, 1.134, 1.904, -0.21]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the ceiling in the scene. The camera pose information includes: the rotation matrix: [[-0.255196, -0.436856, 0.862573], [-0.966393, 0.143834, -0.213066], [-0.030988, -0.887958, -0.45888]]; the translation vector: [1.734999, 0.744851, 1.432124], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.236, 0.438, 2.571, 1.376, 1.585, 0.296]]\nB: [[0.105, 0.071, 2.209, 1.375, 1.269, -0.259]]\nC: [[-0.082, 0.088, 2.553, 1.257, 1.665, 0.102]]\nD: [[-0.566, -0.056, 2.979, 1.134, 1.904, -0.21]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_10_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_10_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.043, 0.444, 0.066, 3.645, 4.94, 0.241]]\nB: [[0.071, 0.025, -0.197, 3.897, 4.771, 0.696]]\nC: [[-0.523, 0.143, -0.403, 3.288, 4.973, 0.134]]\nD: [[0.378, 0.809, 0.444, 3.764, 4.725, 0.036]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. The camera pose information includes: the rotation matrix: [[-0.436119, -0.427186, 0.79203], [-0.89981, 0.218659, -0.377532], [-0.011909, -0.877326, -0.479747]]; the translation vector: [1.992302, 3.72193, 1.553249], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.043, 0.444, 0.066, 3.645, 4.94, 0.241]]\nB: [[0.071, 0.025, -0.197, 3.897, 4.771, 0.696]]\nC: [[-0.523, 0.143, -0.403, 3.288, 4.973, 0.134]]\nD: [[0.378, 0.809, 0.444, 3.764, 4.725, 0.036]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_11_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_11_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.786, -1.469, 1.112, 0.86, 0.176, 1.647]]\nB: [[-0.914, -1.825, 1.495, 0.394, 0.281, 2.141]]\nC: [[-1.155, -1.563, 0.935, 0.81, 0.246, 1.235]]\nD: [[-0.398, -1.079, 1.275, 0.659, -0.007, 1.932]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the curtain in the scene. The camera pose information includes: the rotation matrix: [[-0.112591, -0.547395, 0.829266], [-0.992672, 0.098819, -0.069547], [-0.043877, -0.83102, -0.55451]]; the translation vector: [1.18498, 1.814175, 1.496605], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.786, -1.469, 1.112, 0.86, 0.176, 1.647]]\nB: [[-0.914, -1.825, 1.495, 0.394, 0.281, 2.141]]\nC: [[-1.155, -1.563, 0.935, 0.81, 0.246, 1.235]]\nD: [[-0.398, -1.079, 1.275, 0.659, -0.007, 1.932]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_12_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_12_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.187, -2.136, 1.49, 0.407, 0.4, 0.612], [0.6, -1.205, 1.939, 0.176, 0.133, -0.205]]\nB: [[0.434, -1.704, 1.717, 0.327, 0.549, 0.278], [0.752, -1.616, 1.803, 0.403, 0.362, 0.211]]\nC: [[0.158, -1.92, 1.36, -0.055, 0.096, 0.484], [0.44, -1.879, 1.563, 0.594, 0.374, 0.673]]\nD: [[-0.017, -1.973, 1.957, -0.127, 0.324, 0.483], [0.88, -1.365, 2.154, 0.664, 0.083, -0.049]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the box in the scene. The camera pose information includes: the rotation matrix: [[0.645842, -0.099101, 0.757012], [-0.761541, -0.013148, 0.647984], [-0.054263, -0.994991, -0.083961]]; the translation vector: [3.729951, 1.432448, 1.733539], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.187, -2.136, 1.49, 0.407, 0.4, 0.612], [0.6, -1.205, 1.939, 0.176, 0.133, -0.205]]\nB: [[0.434, -1.704, 1.717, 0.327, 0.549, 0.278], [0.752, -1.616, 1.803, 0.403, 0.362, 0.211]]\nC: [[0.158, -1.92, 1.36, -0.055, 0.096, 0.484], [0.44, -1.879, 1.563, 0.594, 0.374, 0.673]]\nD: [[-0.017, -1.973, 1.957, -0.127, 0.324, 0.483], [0.88, -1.365, 2.154, 0.664, 0.083, -0.049]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_13_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_13_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.798, -1.611, 1.132, 0.285, 0.531, 1.165]]\nB: [[1.687, -1.332, 1.2, 0.199, 0.988, 0.799]]\nC: [[1.357, -0.901, 1.518, -0.194, 0.606, 1.017]]\nD: [[1.876, -1.168, 0.737, 0.277, 1.058, 1.074]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the window in the scene. The camera pose information includes: the rotation matrix: [[0.14018, 0.443083, -0.885453], [0.989985, -0.07783, 0.117782], [-0.016727, -0.893096, -0.449556]]; the translation vector: [3.549726, 0.935059, 1.485921], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.798, -1.611, 1.132, 0.285, 0.531, 1.165]]\nB: [[1.687, -1.332, 1.2, 0.199, 0.988, 0.799]]\nC: [[1.357, -0.901, 1.518, -0.194, 0.606, 1.017]]\nD: [[1.876, -1.168, 0.737, 0.277, 1.058, 1.074]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_14_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_14_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.819, -0.006, 0.434, 0.452, 1.821, 0.691], [-2.563, 0.098, 0.464, 0.939, 2.679, 0.721]]\nB: [[-1.198, -0.018, -0.225, 0.953, 2.14, 0.57], [-3.038, 0.583, 0.16, 0.212, 2.66, 1.39]]\nC: [[-1.115, -0.366, 0.124, 1.037, 1.922, 0.126], [-3.074, 0.098, -0.014, 0.214, 2.71, 0.451]]\nD: [[-0.889, -0.312, 0.236, 0.943, 2.266, 0.443], [-3.042, 0.305, 0.458, 0.511, 3.034, 0.927]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the table in the scene. The camera pose information includes: the rotation matrix: [[0.988959, -0.006087, -0.148062], [0.148117, 0.009943, 0.98892], [-0.004548, -0.999932, 0.010735]]; the translation vector: [3.911582, 2.672538, 1.565046], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.819, -0.006, 0.434, 0.452, 1.821, 0.691], [-2.563, 0.098, 0.464, 0.939, 2.679, 0.721]]\nB: [[-1.198, -0.018, -0.225, 0.953, 2.14, 0.57], [-3.038, 0.583, 0.16, 0.212, 2.66, 1.39]]\nC: [[-1.115, -0.366, 0.124, 1.037, 1.922, 0.126], [-3.074, 0.098, -0.014, 0.214, 2.71, 0.451]]\nD: [[-0.889, -0.312, 0.236, 0.943, 2.266, 0.443], [-3.042, 0.305, 0.458, 0.511, 3.034, 0.927]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_15_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_15_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.003, 1.71, 2.094, 0.541, 0.263, 0.118], [1.203, 1.68, 0.477, 1.133, 0.414, 1.172]]\nB: [[1.233, 1.735, 1.724, 0.591, 0.58, 0.487], [0.701, 1.63, 0.882, 0.862, 0.077, 0.96]]\nC: [[0.897, 1.469, 2.006, 0.732, 0.251, 0.23], [0.896, 1.321, 0.509, 1.078, 0.563, 0.956]]\nD: [[1.067, 1.911, 1.889, 0.58, 0.392, -0.26], [1.382, 1.503, 0.381, 1.083, 0.31, 0.478]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shelf in the scene. The camera pose information includes: the rotation matrix: [[-0.767458, -0.265442, 0.583565], [-0.640543, 0.35536, -0.680752], [-0.026676, -0.896248, -0.442751]]; the translation vector: [3.343537, 3.697402, 1.375352], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.003, 1.71, 2.094, 0.541, 0.263, 0.118], [1.203, 1.68, 0.477, 1.133, 0.414, 1.172]]\nB: [[1.233, 1.735, 1.724, 0.591, 0.58, 0.487], [0.701, 1.63, 0.882, 0.862, 0.077, 0.96]]\nC: [[0.897, 1.469, 2.006, 0.732, 0.251, 0.23], [0.896, 1.321, 0.509, 1.078, 0.563, 0.956]]\nD: [[1.067, 1.911, 1.889, 0.58, 0.392, -0.26], [1.382, 1.503, 0.381, 1.083, 0.31, 0.478]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_16_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_16_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.138, 0.124, 0.676, -0.24, 4.388, 1.6], [-0.991, 0.479, 1.092, 0.412, 4.521, 1.772], [0.357, 1.729, 0.48, 2.481, 0.664, 1.17]]\nB: [[1.22, 0.284, 0.605, 0.161, 4.409, 2.133], [-1.442, 0.481, 1.313, -0.024, 4.063, 1.777], [-0.203, 2.498, 0.224, 1.968, 0.299, 1.17]]\nC: [[0.938, 0.113, 0.929, -0.179, 3.724, 1.454], [-0.685, 0.25, 0.853, -0.017, 3.941, 2.536], [-0.076, 2.502, 0.394, 2.11, -0.271, 0.712]]\nD: [[1.264, 0.297, 0.846, 0.212, 3.931, 1.71], [-1.088, 0.197, 1.004, 0.293, 4.063, 2.062], [0.059, 2.138, 0.489, 2.4, 0.176, 0.937]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.311411, -0.45253, 0.835607], [-0.948656, 0.199362, -0.245576], [-0.055457, -0.869179, -0.491379]]; the translation vector: [2.299133, 2.388773, 1.459468], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.138, 0.124, 0.676, -0.24, 4.388, 1.6], [-0.991, 0.479, 1.092, 0.412, 4.521, 1.772], [0.357, 1.729, 0.48, 2.481, 0.664, 1.17]]\nB: [[1.22, 0.284, 0.605, 0.161, 4.409, 2.133], [-1.442, 0.481, 1.313, -0.024, 4.063, 1.777], [-0.203, 2.498, 0.224, 1.968, 0.299, 1.17]]\nC: [[0.938, 0.113, 0.929, -0.179, 3.724, 1.454], [-0.685, 0.25, 0.853, -0.017, 3.941, 2.536], [-0.076, 2.502, 0.394, 2.11, -0.271, 0.712]]\nD: [[1.264, 0.297, 0.846, 0.212, 3.931, 1.71], [-1.088, 0.197, 1.004, 0.293, 4.063, 2.062], [0.059, 2.138, 0.489, 2.4, 0.176, 0.937]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_17_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_17_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.286, 0.023, 0.43, 0.149, 0.955, 0.837], [-1.223, 1.506, 0.654, 0.172, 1.002, 1.099]]\nB: [[-1.716, 0.263, -0.049, 0.591, 1.354, 0.711], [-1.546, 1.706, 0.156, 0.078, 0.869, 0.807]]\nC: [[-0.94, -0.207, 0.36, 0.417, 1.226, 0.947], [-1.061, 1.683, 0.653, 0.491, 0.617, 1.297]]\nD: [[-1.505, -0.176, 0.438, -0.283, 0.675, 1.3], [-1.566, 1.679, 1.013, -0.274, 0.726, 0.998]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the doorframe in the scene. 
The camera pose information includes: the rotation matrix: [[-0.305635, -0.390507, 0.868385], [-0.952144, 0.122302, -0.280116], [0.003183, -0.91244, -0.409198]]; the translation vector: [4.266061, 1.773856, 1.285079], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.286, 0.023, 0.43, 0.149, 0.955, 0.837], [-1.223, 1.506, 0.654, 0.172, 1.002, 1.099]]\nB: [[-1.716, 0.263, -0.049, 0.591, 1.354, 0.711], [-1.546, 1.706, 0.156, 0.078, 0.869, 0.807]]\nC: [[-0.94, -0.207, 0.36, 0.417, 1.226, 0.947], [-1.061, 1.683, 0.653, 0.491, 0.617, 1.297]]\nD: [[-1.505, -0.176, 0.438, -0.283, 0.675, 1.3], [-1.566, 1.679, 1.013, -0.274, 0.726, 0.998]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_18_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_18_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.301, -0.248, -0.183, 3.399, 5.209, 0.282]]\nB: [[0.089, -0.015, -0.009, 3.337, 5.518, 0.258]]\nC: [[0.396, -0.159, 0.362, 3.257, 5.951, -0.2]]\nD: [[0.093, 0.115, -0.474, 3.284, 5.168, 0.122]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. 
The camera pose information includes: the rotation matrix: [[-0.934582, -0.143102, 0.325696], [-0.355737, 0.383069, -0.852473], [-0.002774, -0.912568, -0.408916]]; the translation vector: [2.694367, 2.483235, 1.465763], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.301, -0.248, -0.183, 3.399, 5.209, 0.282]]\nB: [[0.089, -0.015, -0.009, 3.337, 5.518, 0.258]]\nC: [[0.396, -0.159, 0.362, 3.257, 5.951, -0.2]]\nD: [[0.093, 0.115, -0.474, 3.284, 5.168, 0.122]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_19_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_19_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.264, 0.68, 0.463, 0.725, 0.55, 0.967], [1.225, 0.142, 2.156, 0.68, 2.62, 0.669]]\nB: [[0.179, 0.182, 0.661, 0.23, 1.022, 0.617], [1.51, -0.031, 1.909, 0.955, 2.219, 0.647]]\nC: [[0.069, 0.653, 0.766, 0.622, 0.263, 0.911], [0.908, 0.073, 1.898, 0.615, 2.442, 0.579]]\nD: [[-0.094, 1.133, 0.833, 0.562, 0.551, 0.493], [1.073, 0.074, 2.645, 0.407, 2.814, 0.994]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the cabinet in the scene. 
The camera pose information includes: the rotation matrix: [[-0.928375, -0.17783, 0.326339], [-0.371449, 0.415395, -0.830345], [0.012101, -0.892089, -0.451697]]; the translation vector: [2.096006, 1.919092, 1.36174], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.264, 0.68, 0.463, 0.725, 0.55, 0.967], [1.225, 0.142, 2.156, 0.68, 2.62, 0.669]]\nB: [[0.179, 0.182, 0.661, 0.23, 1.022, 0.617], [1.51, -0.031, 1.909, 0.955, 2.219, 0.647]]\nC: [[0.069, 0.653, 0.766, 0.622, 0.263, 0.911], [0.908, 0.073, 1.898, 0.615, 2.442, 0.579]]\nD: [[-0.094, 1.133, 0.833, 0.562, 0.551, 0.493], [1.073, 0.074, 2.645, 0.407, 2.814, 0.994]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_20_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_20_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.718, -0.44, 1.96, 0.228, 0.897, 0.293], [-1.706, -1.293, 1.868, 0.22, 0.846, 0.362], [-1.707, -1.314, 0.762, 0.375, 0.826, 0.302], [-1.691, 1.543, 1.626, 0.337, 0.697, 0.437], [-1.573, 1.406, 1.291, 0.181, 0.564, 0.313]]\nB: [[-1.988, -0.706, 1.585, 0.615, 0.861, 0.547], [-1.403, -1.309, 1.785, -0.129, 1.049, -0.092], [-1.749, -1.25, 0.92, 0.869, 1.088, 0.428], [-1.92, 1.941, 1.694, 0.669, 1.107, 0.403], [-1.483, 1.056, 1.615, 0.417, 0.387, 0.739]]\nC: [[-1.546, -0.182, 1.499, 0.527, 1.029, 0.605], [-1.401, -1.244, 2.308, -0.222, 
0.467, 0.206], [-2.193, -1.361, 0.929, 0.133, 0.525, 0.091], [-1.321, 1.865, 1.781, -0.053, 0.666, 0.358], [-1.503, 1.488, 1.689, 0.165, 0.203, 0.17]]\nD: [[-1.94, -0.345, 2.215, 0.028, 0.642, 0.092], [-2.035, -1.654, 1.937, 0.612, 1.134, -0.102], [-1.249, -1.385, 0.367, 0.613, 1.003, 0.682], [-2.01, 1.507, 1.513, 0.614, 0.573, 0.003], [-1.183, 1.016, 0.985, -0.012, 0.86, 0.417]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the books in the scene. The camera pose information includes: the rotation matrix: [[0.725417, 0.297171, -0.620854], [0.687848, -0.279954, 0.669695], [0.025203, -0.912861, -0.407492]]; the translation vector: [3.434752, 3.057745, 1.556519], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.718, -0.44, 1.96, 0.228, 0.897, 0.293], [-1.706, -1.293, 1.868, 0.22, 0.846, 0.362], [-1.707, -1.314, 0.762, 0.375, 0.826, 0.302], [-1.691, 1.543, 1.626, 0.337, 0.697, 0.437], [-1.573, 1.406, 1.291, 0.181, 0.564, 0.313]]\nB: [[-1.988, -0.706, 1.585, 0.615, 0.861, 0.547], [-1.403, -1.309, 1.785, -0.129, 1.049, -0.092], [-1.749, -1.25, 0.92, 0.869, 1.088, 0.428], [-1.92, 1.941, 1.694, 0.669, 1.107, 0.403], [-1.483, 1.056, 1.615, 0.417, 0.387, 0.739]]\nC: [[-1.546, -0.182, 1.499, 0.527, 1.029, 0.605], [-1.401, -1.244, 2.308, -0.222, 0.467, 0.206], [-2.193, -1.361, 0.929, 0.133, 0.525, 0.091], [-1.321, 1.865, 1.781, -0.053, 0.666, 0.358], [-1.503, 1.488, 1.689, 0.165, 0.203, 0.17]]\nD: [[-1.94, -0.345, 2.215, 0.028, 0.642, 0.092], [-2.035, -1.654, 1.937, 0.612, 1.134, -0.102], [-1.249, -1.385, 0.367, 0.613, 1.003, 0.682], [-2.01, 1.507, 1.513, 0.614, 0.573, 0.003], [-1.183, 1.016, 0.985, -0.012, 0.86, 0.417]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_21_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_21_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.266, 0.688, 1.427, 0.645, 4.144, 1.891], [1.793, 0.377, 0.356, 0.744, 3.81, 1.61], [-0.941, -1.334, 0.579, 2.335, 0.039, 1.673], [0.362, -1.428, 0.335, 0.484, 0.146, 1.101], [0.403, -1.616, 0.92, 1.722, 0.842, 1.057], [1.194, 3.045, 0.588, 0.013, 0.929, 1.121], [1.761, 2.38, 0.509, 0.767, -0.159, 0.975]]\nB: [[-2.307, 0.099, 1.206, 0.234, 3.804, 1.593], [2.401, 0.098, 0.732, 0.485, 3.642, 1.662], [-1.38, -1.427, 1.26, 2.485, 0.117, 1.419], [0.047, -1.412, 0.542, 0.454, 0.56, 1.109], [0.451, -2.029, 0.885, 1.313, 0.841, 0.961], [1.821, 2.284, 0.965, -0.181, 0.842, 1.118], [1.675, 2.553, 0.956, 0.491, 0.104, 1.591]]\nC: [[-1.548, 0.69, 1.589, 0.293, 3.816, 1.624], [2.411, 0.459, 
0.334, 0.543, 4.513, 1.336], [-0.832, -2.051, 0.999, 1.925, 0.593, 1.075], [-0.217, -1.618, 0.815, -0.295, 0.494, 0.985], [1.068, -1.834, 1.273, 1.646, 0.657, 0.959], [1.898, 2.458, 0.543, 0.44, 1.518, 1.452], [2.071, 1.877, 0.293, 1.12, 0.358, 1.716]]\nD: [[-1.964, 0.397, 1.135, 0.305, 4.04, 1.813], [2.143, 0.114, 0.673, 0.413, 4.08, 1.439], [-0.926, -1.676, 0.892, 2.284, 0.231, 1.529], [0.195, -1.875, 0.811, 0.153, 0.424, 1.364], [0.78, -1.998, 0.788, 1.226, 0.36, 1.309], [1.439, 2.685, 0.692, 0.249, 1.216, 1.435], [1.802, 2.098, 0.616, 0.629, 0.105, 1.315]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.205292, 0.226186, -0.952205], [0.97316, -0.150555, 0.174048], [-0.103992, -0.962379, -0.251024]]; the translation vector: [4.876985, 2.837537, 1.671042], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.266, 0.688, 1.427, 0.645, 4.144, 1.891], [1.793, 0.377, 0.356, 0.744, 3.81, 1.61], [-0.941, -1.334, 0.579, 2.335, 0.039, 1.673], [0.362, -1.428, 0.335, 0.484, 0.146, 1.101], [0.403, -1.616, 0.92, 1.722, 0.842, 1.057], [1.194, 3.045, 0.588, 0.013, 0.929, 1.121], [1.761, 2.38, 0.509, 0.767, -0.159, 0.975]]\nB: [[-2.307, 0.099, 1.206, 0.234, 3.804, 1.593], [2.401, 0.098, 0.732, 0.485, 3.642, 1.662], [-1.38, -1.427, 1.26, 2.485, 0.117, 1.419], [0.047, -1.412, 0.542, 0.454, 0.56, 1.109], [0.451, -2.029, 0.885, 1.313, 0.841, 0.961], [1.821, 2.284, 0.965, -0.181, 0.842, 1.118], [1.675, 2.553, 0.956, 0.491, 0.104, 1.591]]\nC: [[-1.548, 0.69, 1.589, 0.293, 3.816, 1.624], [2.411, 0.459, 0.334, 0.543, 4.513, 1.336], [-0.832, -2.051, 0.999, 1.925, 0.593, 1.075], [-0.217, -1.618, 0.815, -0.295, 0.494, 0.985], [1.068, -1.834, 1.273, 1.646, 0.657, 0.959], [1.898, 2.458, 0.543, 0.44, 1.518, 1.452], [2.071, 1.877, 0.293, 1.12, 0.358, 1.716]]\nD: [[-1.964, 0.397, 1.135, 0.305, 4.04, 1.813], [2.143, 0.114, 0.673, 0.413, 4.08, 1.439], [-0.926, -1.676, 0.892, 2.284, 0.231, 1.529], [0.195, -1.875, 0.811, 0.153, 0.424, 1.364], [0.78, -1.998, 0.788, 1.226, 0.36, 1.309], [1.439, 2.685, 0.692, 0.249, 1.216, 1.435], [1.802, 2.098, 0.616, 0.629, 0.105, 1.315]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_22_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_22_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.121, -0.501, -0.221, 0.439, 0.387, 0.089], [1.109, -1.077, -0.06, 0.221, 0.081, 0.556]]\nB: [[-0.019, -0.497, 0.389, 1.016, 0.944, 0.39], [1.279, -1.542, -0.239, 0.195, 0.216, 0.872]]\nC: [[0.21, -0.049, -0.032, 0.947, 0.552, 0.204], [0.617, -0.987, 0.392, 0.284, 0.553, 0.828]]\nD: [[0.392, -0.219, 0.176, 0.595, 0.63, 0.481], [0.882, -1.099, 0.197, 
0.524, 0.524, 0.466]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the ottoman in the scene. The camera pose information includes: the rotation matrix: [[0.133825, -0.39571, 0.908573], [-0.990975, -0.046263, 0.125813], [-0.007752, -0.91721, -0.398329]]; the translation vector: [4.990516, 4.227292, 1.32289], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.121, -0.501, -0.221, 0.439, 0.387, 0.089], [1.109, -1.077, -0.06, 0.221, 0.081, 0.556]]\nB: [[-0.019, -0.497, 0.389, 1.016, 0.944, 0.39], [1.279, -1.542, -0.239, 0.195, 0.216, 0.872]]\nC: [[0.21, -0.049, -0.032, 0.947, 0.552, 0.204], [0.617, -0.987, 0.392, 0.284, 0.553, 0.828]]\nD: [[0.392, -0.219, 0.176, 0.595, 0.63, 0.481], [0.882, -1.099, 0.197, 0.524, 0.524, 0.466]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_23_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_23_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.923, 3.072, 1.641, 0.406, 0.224, 0.224], [0.852, 2.684, 1.628, 0.411, 0.369, 0.342], [0.952, 2.353, 1.591, 0.332, 0.315, 0.303], [0.919, 1.934, 1.549, 0.278, 0.356, 0.3], [0.991, 1.596, 1.521, 0.302, 0.285, 0.248], [1.083, 1.197, 1.51, 0.2, 0.428, 0.292], [1.067, 0.874, 1.479, 0.258, 0.387, 0.349], [1.029, 0.682, 1.414, 0.27, 0.238, 0.229], [1.041, 0.446, 1.386, 0.31, 0.355, 0.267], 
[1.007, 0.119, 1.367, 0.313, 0.297, 0.251], [1.072, -0.152, 1.331, 0.368, 0.301, 0.196], [0.978, -0.542, 1.366, 0.293, 0.411, 0.344], [1.038, -0.846, 1.349, 0.398, 0.352, 0.371], [0.995, -1.285, 1.277, 0.273, 0.319, 0.287], [1.051, -1.623, 1.317, 0.372, 0.433, 0.346], [1.016, -1.909, 1.267, 0.375, 0.379, 0.355], [1.01, -2.206, 1.239, 0.32, 0.305, 0.33], [1.021, -2.389, 1.248, 0.292, 0.375, 0.256], [0.945, -2.669, 1.168, 0.312, 0.307, 0.249], [0.986, -2.904, 1.157, 0.265, 0.331, 0.203]]\nB: [[1.16, 2.801, 1.566, 0.581, 0.486, -0.268], [0.756, 2.458, 1.347, -0.04, 0.522, 0.189], [0.535, 2.032, 1.866, 0.051, 0.318, 0.012], [0.896, 2.321, 1.823, -0.143, 0.711, 0.696], [1.3, 1.485, 1.216, 0.089, 0.474, 0.726], [1.333, 1.63, 1.281, 0.587, 0.639, -0.131], [1.034, 0.752, 1.496, 0.694, 0.45, 0.002], [1.132, 0.488, 1.903, -0.121, -0.068, 0.586], [1.244, 0.056, 1.06, 0.343, 0.366, 0.492], [0.523, 0.369, 1.091, 0.036, 0.297, 0.341], [0.945, -0.379, 1.231, -0.009, 0.698, 0.282], [0.742, -0.538, 1.804, 0.143, 0.887, 0.377], [1.245, -0.568, 1.71, 0.143, 0.603, 0.41], [1.356, -0.879, 1.397, 0.576, 0.048, 0.554], [1.47, -2.036, 1.112, 0.54, 0.795, 0.096], [1.472, -1.52, 0.829, 0.648, 0.598, 0.49], [0.775, -2.633, 1.506, -0.16, -0.139, -0.099], [0.838, -2.702, 1.211, 0.137, 0.331, -0.011], [1.261, -2.818, 1.474, 0.679, -0.005, 0.352], [0.793, -2.949, 1.566, -0.008, 0.477, 0.693]]\nC: [[1.056, 2.871, 1.196, 0.82, -0.168, 0.476], [1.086, 3.168, 1.177, -0.05, 0.768, 0.624], [1.078, 2.314, 1.991, 0.481, -0.014, 0.382], [0.899, 1.855, 1.409, -0.073, 0.065, 0.078], [0.796, 1.846, 1.026, -0.008, 0.461, 0.294], [0.96, 0.751, 1.316, 0.52, 0.805, 0.752], [1.18, 1.031, 1.766, 0.673, 0.119, 0.034], [1.398, 0.505, 1.118, -0.168, 0.16, -0.249], [0.838, 0.65, 1.392, 0.173, 0.458, 0.332], [1.111, -0.328, 1.396, 0.558, 0.481, 0.366], [0.597, -0.355, 1.146, 0.623, 0.368, 0.632], [0.691, -0.514, 1.338, -0.157, 0.304, -0.124], [0.696, -1.125, 1.476, 0.501, 0.757, 0.356], [0.907, -0.859, 1.385, 0.656, 
0.571, -0.029], [1.035, -1.127, 1.219, 0.093, 0.841, 0.704], [0.635, -1.763, 1.501, -0.076, -0.097, 0.162], [0.614, -1.848, 1.062, 0.328, 0.483, 0.674], [0.692, -2.453, 1.556, 0.665, 0.718, 0.625], [1.074, -2.937, 1.026, 0.776, 0.224, 0.639], [0.852, -3.222, 1.01, 0.571, -0.139, 0.12]]\nD: [[1.022, 3.318, 1.189, 0.205, -0.146, 0.042], [0.815, 2.622, 1.239, 0.213, 0.653, 0.265], [1.051, 2.623, 1.858, 0.743, -0.174, 0.425], [1.36, 1.47, 1.216, -0.071, -0.098, -0.074], [1.312, 2.017, 2.002, -0.015, 0.439, 0.124], [0.798, 1.663, 1.184, 0.218, 0.773, 0.512], [1.438, 0.663, 1.321, 0.334, 0.497, 0.799], [1.496, 1.067, 1.009, 0.492, 0.69, -0.197], [0.673, 0.916, 1.137, 0.692, -0.115, 0.537], [0.588, 0.319, 1.507, 0.723, 0.486, 0.106], [0.938, -0.596, 1.384, 0.378, 0.487, -0.284], [0.718, -0.867, 0.941, 0.405, 0.388, -0.074], [1.365, -0.417, 1.613, 0.897, 0.508, -0.003], [1.124, -1.228, 1.16, 0.374, 0.651, 0.692], [0.872, -1.666, 1.25, 0.857, 0.612, -0.1], [0.693, -1.777, 1.038, 0.754, 0.733, 0.072], [1.133, -1.714, 1.626, 0.475, -0.192, 0.478], [1.392, -2.804, 1.671, -0.124, 0.18, 0.524], [1.024, -2.671, 1.235, 0.602, 0.29, 0.162], [0.636, -2.621, 1.52, -0.11, 0.64, 0.18]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the book in the scene. The camera pose information includes: the rotation matrix: [[0.999403, 0.004498, 0.03425], [-0.034232, -0.004158, 0.999405], [0.004638, -0.999981, -0.004001]]; the translation vector: [2.393484, 5.775056, 1.371464], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.923, 3.072, 1.641, 0.406, 0.224, 0.224], [0.852, 2.684, 1.628, 0.411, 0.369, 0.342], [0.952, 2.353, 1.591, 0.332, 0.315, 0.303], [0.919, 1.934, 1.549, 0.278, 0.356, 0.3], [0.991, 1.596, 1.521, 0.302, 0.285, 0.248], [1.083, 1.197, 1.51, 0.2, 0.428, 0.292], [1.067, 0.874, 1.479, 0.258, 0.387, 0.349], [1.029, 0.682, 1.414, 0.27, 0.238, 0.229], [1.041, 0.446, 1.386, 0.31, 0.355, 0.267], [1.007, 0.119, 1.367, 0.313, 0.297, 0.251], [1.072, -0.152, 1.331, 0.368, 0.301, 0.196], [0.978, -0.542, 1.366, 0.293, 0.411, 0.344], [1.038, -0.846, 1.349, 0.398, 0.352, 0.371], [0.995, -1.285, 1.277, 0.273, 0.319, 0.287], [1.051, -1.623, 1.317, 0.372, 0.433, 0.346], [1.016, -1.909, 1.267, 0.375, 0.379, 0.355], [1.01, -2.206, 1.239, 0.32, 0.305, 0.33], [1.021, -2.389, 1.248, 0.292, 0.375, 0.256], [0.945, -2.669, 1.168, 0.312, 0.307, 0.249], [0.986, -2.904, 1.157, 0.265, 0.331, 0.203]]\nB: [[1.16, 2.801, 1.566, 0.581, 0.486, -0.268], [0.756, 2.458, 1.347, -0.04, 0.522, 0.189], [0.535, 2.032, 1.866, 0.051, 0.318, 0.012], [0.896, 2.321, 1.823, -0.143, 0.711, 0.696], [1.3, 1.485, 1.216, 0.089, 0.474, 0.726], [1.333, 1.63, 1.281, 0.587, 0.639, -0.131], [1.034, 0.752, 1.496, 0.694, 0.45, 0.002], [1.132, 0.488, 1.903, -0.121, -0.068, 0.586], [1.244, 0.056, 1.06, 0.343, 0.366, 0.492], [0.523, 0.369, 1.091, 0.036, 0.297, 0.341], [0.945, -0.379, 1.231, -0.009, 0.698, 0.282], [0.742, -0.538, 1.804, 0.143, 0.887, 0.377], [1.245, -0.568, 1.71, 0.143, 0.603, 0.41], [1.356, -0.879, 1.397, 0.576, 0.048, 0.554], [1.47, -2.036, 1.112, 0.54, 0.795, 0.096], [1.472, -1.52, 0.829, 0.648, 0.598, 0.49], [0.775, -2.633, 1.506, -0.16, -0.139, -0.099], [0.838, -2.702, 1.211, 0.137, 0.331, -0.011], [1.261, -2.818, 1.474, 0.679, -0.005, 0.352], [0.793, -2.949, 1.566, -0.008, 0.477, 0.693]]\nC: [[1.056, 2.871, 1.196, 0.82, -0.168, 0.476], [1.086, 3.168, 1.177, -0.05, 0.768, 0.624], [1.078, 2.314, 1.991, 0.481, -0.014, 0.382], [0.899, 1.855, 1.409, -0.073, 0.065, 0.078], 
[0.796, 1.846, 1.026, -0.008, 0.461, 0.294], [0.96, 0.751, 1.316, 0.52, 0.805, 0.752], [1.18, 1.031, 1.766, 0.673, 0.119, 0.034], [1.398, 0.505, 1.118, -0.168, 0.16, -0.249], [0.838, 0.65, 1.392, 0.173, 0.458, 0.332], [1.111, -0.328, 1.396, 0.558, 0.481, 0.366], [0.597, -0.355, 1.146, 0.623, 0.368, 0.632], [0.691, -0.514, 1.338, -0.157, 0.304, -0.124], [0.696, -1.125, 1.476, 0.501, 0.757, 0.356], [0.907, -0.859, 1.385, 0.656, 0.571, -0.029], [1.035, -1.127, 1.219, 0.093, 0.841, 0.704], [0.635, -1.763, 1.501, -0.076, -0.097, 0.162], [0.614, -1.848, 1.062, 0.328, 0.483, 0.674], [0.692, -2.453, 1.556, 0.665, 0.718, 0.625], [1.074, -2.937, 1.026, 0.776, 0.224, 0.639], [0.852, -3.222, 1.01, 0.571, -0.139, 0.12]]\nD: [[1.022, 3.318, 1.189, 0.205, -0.146, 0.042], [0.815, 2.622, 1.239, 0.213, 0.653, 0.265], [1.051, 2.623, 1.858, 0.743, -0.174, 0.425], [1.36, 1.47, 1.216, -0.071, -0.098, -0.074], [1.312, 2.017, 2.002, -0.015, 0.439, 0.124], [0.798, 1.663, 1.184, 0.218, 0.773, 0.512], [1.438, 0.663, 1.321, 0.334, 0.497, 0.799], [1.496, 1.067, 1.009, 0.492, 0.69, -0.197], [0.673, 0.916, 1.137, 0.692, -0.115, 0.537], [0.588, 0.319, 1.507, 0.723, 0.486, 0.106], [0.938, -0.596, 1.384, 0.378, 0.487, -0.284], [0.718, -0.867, 0.941, 0.405, 0.388, -0.074], [1.365, -0.417, 1.613, 0.897, 0.508, -0.003], [1.124, -1.228, 1.16, 0.374, 0.651, 0.692], [0.872, -1.666, 1.25, 0.857, 0.612, -0.1], [0.693, -1.777, 1.038, 0.754, 0.733, 0.072], [1.133, -1.714, 1.626, 0.475, -0.192, 0.478], [1.392, -2.804, 1.671, -0.124, 0.18, 0.524], [1.024, -2.671, 1.235, 0.602, 0.29, 0.162], [0.636, -2.621, 1.52, -0.11, 0.64, 0.18]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_24_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_24_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.127, 1.263, 0.842, 1.099, 0.165, 
0.151], [0.899, 0.349, 0.833, 0.078, 0.633, 0.087]]\nB: [[-0.285, 1.099, 0.515, 1.523, 0.256, -0.319], [0.56, 0.838, 0.875, -0.327, 0.985, 0.228]]\nC: [[0.446, 1.442, 1.259, 1.427, 0.331, 0.006], [0.446, 0.556, 0.643, 0.276, 0.563, -0.341]]\nD: [[-0.356, 1.631, 0.612, 0.864, 0.511, -0.226], [0.523, 0.674, 0.57, 0.567, 0.594, 0.019]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the rail in the scene. The camera pose information includes: the rotation matrix: [[0.631332, 0.312126, -0.709927], [0.775472, -0.26347, 0.573784], [-0.007951, -0.912776, -0.408382]]; the translation vector: [1.600176, 0.624978, 1.327739], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.127, 1.263, 0.842, 1.099, 0.165, 0.151], [0.899, 0.349, 0.833, 0.078, 0.633, 0.087]]\nB: [[-0.285, 1.099, 0.515, 1.523, 0.256, -0.319], [0.56, 0.838, 0.875, -0.327, 0.985, 0.228]]\nC: [[0.446, 1.442, 1.259, 1.427, 0.331, 0.006], [0.446, 0.556, 0.643, 0.276, 0.563, -0.341]]\nD: [[-0.356, 1.631, 0.612, 0.864, 0.511, -0.226], [0.523, 0.674, 0.57, 0.567, 0.594, 0.019]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_25_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_25_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.787, 2.876, 1.457, 2.063, -0.079, 1.518], [-0.953, 1.998, 0.997, 0.069, 3.579, 2.369], [0.693, -0.876, 1.265, 0.33, 4.219, 2.072], [0.345, -2.632, 0.88, 1.928, -0.067, 1.565], [-0.455, -2.068, 1.068, 0.08, 1.617, 2.382], [-0.899, -1.196, 0.623, 0.651, 0.095, 2.066]]\nB: [[0.116, 3.687, 0.955, 2.711, 0.51, 1.785], [-0.801, 1.888, 0.628, 0.573, 3.732, 2.205], [1.093, -0.187, 0.521, -0.148, 5.045, 2.538], [0.42, -2.481, 1.79, 1.307, 0.36, 0.947], [-0.514, -1.855, 0.567, 0.468, 1.76, 1.511], [-1.181, -1.155, 0.753, 0.267, -0.268, 1.847]]\nC: [[0.302, 3.207, 1.219, 2.255, 0.306, 1.414], [-0.871, 1.59, 0.966, 0.239, 3.492, 2.066], [0.732, -0.454, 0.961, 0.242, 4.576, 2.069], [-0.078, -2.664, 1.355, 1.624, 0.192, 1.303], [-0.886, -1.849, 0.913, 0.175, 1.703, 1.972], [-1.091, -1.016, 0.816, 0.518, 0.228, 1.826]]\nD: [[0.349, 2.764, 1.178, 2.112, -0.17, 1.603], [-1.032, 1.356, 0.914, 0.415, 3.877, 2.097], [0.285, -0.798, 1.205, 0.303, 4.409, 2.223], [-0.233, -2.574, 1.577, 1.241, 0.359, 1.513], [-1.059, -2.05, 1.259, 0.503, 1.807, 1.753], [-1.108, -1.393, 0.584, 0.052, -0.001, 1.469]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. 
The camera pose information includes: the rotation matrix: [[-0.386761, -0.304254, 0.870543], [-0.920043, 0.191539, -0.34181], [-0.062746, -0.933136, -0.354007]]; the translation vector: [2.082368, 4.008438, 1.845888], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.787, 2.876, 1.457, 2.063, -0.079, 1.518], [-0.953, 1.998, 0.997, 0.069, 3.579, 2.369], [0.693, -0.876, 1.265, 0.33, 4.219, 2.072], [0.345, -2.632, 0.88, 1.928, -0.067, 1.565], [-0.455, -2.068, 1.068, 0.08, 1.617, 2.382], [-0.899, -1.196, 0.623, 0.651, 0.095, 2.066]]\nB: [[0.116, 3.687, 0.955, 2.711, 0.51, 1.785], [-0.801, 1.888, 0.628, 0.573, 3.732, 2.205], [1.093, -0.187, 0.521, -0.148, 5.045, 2.538], [0.42, -2.481, 1.79, 1.307, 0.36, 0.947], [-0.514, -1.855, 0.567, 0.468, 1.76, 1.511], [-1.181, -1.155, 0.753, 0.267, -0.268, 1.847]]\nC: [[0.302, 3.207, 1.219, 2.255, 0.306, 1.414], [-0.871, 1.59, 0.966, 0.239, 3.492, 2.066], [0.732, -0.454, 0.961, 0.242, 4.576, 2.069], [-0.078, -2.664, 1.355, 1.624, 0.192, 1.303], [-0.886, -1.849, 0.913, 0.175, 1.703, 1.972], [-1.091, -1.016, 0.816, 0.518, 0.228, 1.826]]\nD: [[0.349, 2.764, 1.178, 2.112, -0.17, 1.603], [-1.032, 1.356, 0.914, 0.415, 3.877, 2.097], [0.285, -0.798, 1.205, 0.303, 4.409, 2.223], [-0.233, -2.574, 1.577, 1.241, 0.359, 1.513], [-1.059, -2.05, 1.259, 0.503, 1.807, 1.753], [-1.108, -1.393, 0.584, 0.052, -0.001, 1.469]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_26_0.jpg", 
"./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_26_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.506, 0.209, 0.255, 0.924, 1.928, 0.478], [0.463, -1.087, 0.626, 1.179, 0.62, 0.996], [2.049, 0.799, -0.061, 0.618, 1.041, 1.191]]\nB: [[-2.054, 0.6, 0.688, 1.044, 1.508, 0.567], [0.964, -1.042, 0.495, 1.122, 0.573, 0.421], [2.453, 0.513, 0.739, 0.463, 1.578, 0.424]]\nC: [[-2.686, -0.003, 0.374, 0.562, 1.486, 0.489], [1.084, -0.733, 0.31, 1.073, 1.131, 0.967], [1.7, 0.788, 0.091, 0.433, 1.461, 1.21]]\nD: [[-2.225, 0.184, 0.565, 0.652, 1.867, 0.966], [0.89, -0.986, 0.428, 1.537, 0.782, 0.844], [2.106, 0.485, 0.423, 0.73, 1.476, 0.84]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the couch in the scene. The camera pose information includes: the rotation matrix: [[-0.565317, -0.50256, 0.654103], [-0.824719, 0.328974, -0.460017], [0.016003, -0.799506, -0.600445]]; the translation vector: [4.07549, 5.065369, 1.281872], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.506, 0.209, 0.255, 0.924, 1.928, 0.478], [0.463, -1.087, 0.626, 1.179, 0.62, 0.996], [2.049, 0.799, -0.061, 0.618, 1.041, 1.191]]\nB: [[-2.054, 0.6, 0.688, 1.044, 1.508, 0.567], [0.964, -1.042, 0.495, 1.122, 0.573, 0.421], [2.453, 0.513, 0.739, 0.463, 1.578, 0.424]]\nC: [[-2.686, -0.003, 0.374, 0.562, 1.486, 0.489], [1.084, -0.733, 0.31, 1.073, 1.131, 0.967], [1.7, 0.788, 0.091, 0.433, 1.461, 1.21]]\nD: [[-2.225, 0.184, 0.565, 0.652, 1.867, 0.966], [0.89, -0.986, 0.428, 1.537, 0.782, 0.844], [2.106, 0.485, 0.423, 0.73, 1.476, 0.84]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_27_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_27_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.787, -0.535, 0.927, 0.017, -0.194, -0.206]]\nB: [[-1.049, -0.444, 0.739, 0.127, 0.097, 0.179]]\nC: [[-1.148, -0.307, 0.649, -0.194, 0.004, 0.501]]\nD: [[-1.423, -0.784, 0.923, 0.285, 0.539, 0.33]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the water bottle in the scene. The camera pose information includes: the rotation matrix: [[0.684823, -0.326379, 0.651532], [-0.728707, -0.304485, 0.613413], [-0.001823, -0.894855, -0.446353]]; the translation vector: [2.86358, 2.414664, 1.549631], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.787, -0.535, 0.927, 0.017, -0.194, -0.206]]\nB: [[-1.049, -0.444, 0.739, 0.127, 0.097, 0.179]]\nC: [[-1.148, -0.307, 0.649, -0.194, 0.004, 0.501]]\nD: [[-1.423, -0.784, 0.923, 0.285, 0.539, 0.33]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_28_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_28_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.125, -0.371, 0.52, 0.921, 0.949, 1.032], [-0.05, 0.47, 0.51, 0.929, 1.055, 1.018]]\nB: [[-0.03, 0.021, 0.629, 1.294, 0.744, 0.853], [0.141, 0.523, 0.057, 0.461, 0.601, 1.102]]\nC: [[-0.027, -0.543, 0.255, 1.392, 0.459, 1.351], [-0.542, 0.241, 0.854, 1.099, 1.281, 1.01]]\nD: [[-0.353, -0.617, 0.621, 0.568, 1.229, 1.321], [-0.327, 0.58, 0.56, 0.835, 0.644, 0.683]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the table in the scene. The camera pose information includes: the rotation matrix: [[0.935902, 0.160482, -0.313582], [0.351212, -0.493772, 0.795512], [-0.027173, -0.854655, -0.518485]]; the translation vector: [4.465, -0.226232, 1.550028], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.125, -0.371, 0.52, 0.921, 0.949, 1.032], [-0.05, 0.47, 0.51, 0.929, 1.055, 1.018]]\nB: [[-0.03, 0.021, 0.629, 1.294, 0.744, 0.853], [0.141, 0.523, 0.057, 0.461, 0.601, 1.102]]\nC: [[-0.027, -0.543, 0.255, 1.392, 0.459, 1.351], [-0.542, 0.241, 0.854, 1.099, 1.281, 1.01]]\nD: [[-0.353, -0.617, 0.621, 0.568, 1.229, 1.321], [-0.327, 0.58, 0.56, 0.835, 0.644, 0.683]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_29_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_29_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.113, 1.152, 0.299, 1.212, 0.824, 1.479], [-0.739, -1.761, 0.838, 1.759, 0.908, 0.436]]\nB: [[-1.686, 0.962, 0.402, 1.418, 0.984, 0.915], [-0.524, -1.303, 0.377, 1.429, 0.342, 0.995]]\nC: [[-1.37, 1.148, 0.616, 1.114, 0.537, 1.159], [-0.283, -1.543, 0.412, 1.531, 0.506, 0.887]]\nD: [[-1.358, 1.603, 0.665, 1.495, 0.045, 1.488], [0.077, -1.171, 0.113, 1.245, 0.683, 1.338]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the dresser in the scene. The camera pose information includes: the rotation matrix: [[0.993306, 0.029023, -0.111812], [0.110831, -0.512349, 0.851596], [-0.032571, -0.858287, -0.512136]]; the translation vector: [2.482234, 1.391135, 1.348064], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.113, 1.152, 0.299, 1.212, 0.824, 1.479], [-0.739, -1.761, 0.838, 1.759, 0.908, 0.436]]\nB: [[-1.686, 0.962, 0.402, 1.418, 0.984, 0.915], [-0.524, -1.303, 0.377, 1.429, 0.342, 0.995]]\nC: [[-1.37, 1.148, 0.616, 1.114, 0.537, 1.159], [-0.283, -1.543, 0.412, 1.531, 0.506, 0.887]]\nD: [[-1.358, 1.603, 0.665, 1.495, 0.045, 1.488], [0.077, -1.171, 0.113, 1.245, 0.683, 1.338]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_30_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_30_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.429, 0.564, 1.259, 0.514, 4.432, 2.586], [-1.998, 0.309, 1.385, 0.292, 3.896, 2.792], [0.693, 2.704, 1.079, 1.949, 0.124, 2.2]]\nB: [[1.111, 0.098, 1.082, 0.466, 4.575, 2.917], [-1.93, 0.083, 1.425, -0.025, 4.078, 2.389], [0.372, 3.074, 1.309, 1.613, 0.349, 2.653]]\nC: [[1.746, 0.141, 1.259, 0.14, 4.199, 2.418], [-1.8, 0.062, 1.744, -0.163, 3.558, 2.447], [0.931, 3.17, 1.18, 1.489, -0.095, 2.336]]\nD: [[1.116, 0.433, 1.412, 0.515, 4.324, 2.69], [-1.509, 0.174, 1.744, -0.053, 3.532, 2.532], [0.744, 2.248, 0.965, 1.964, 0.231, 1.764]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.32152, -0.4706, 0.821681], [-0.946681, 0.178549, -0.268172], [-0.020508, -0.864092, -0.502915]]; the translation vector: [2.120097, 2.367636, 1.494245], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.429, 0.564, 1.259, 0.514, 4.432, 2.586], [-1.998, 0.309, 1.385, 0.292, 3.896, 2.792], [0.693, 2.704, 1.079, 1.949, 0.124, 2.2]]\nB: [[1.111, 0.098, 1.082, 0.466, 4.575, 2.917], [-1.93, 0.083, 1.425, -0.025, 4.078, 2.389], [0.372, 3.074, 1.309, 1.613, 0.349, 2.653]]\nC: [[1.746, 0.141, 1.259, 0.14, 4.199, 2.418], [-1.8, 0.062, 1.744, -0.163, 3.558, 2.447], [0.931, 3.17, 1.18, 1.489, -0.095, 2.336]]\nD: [[1.116, 0.433, 1.412, 0.515, 4.324, 2.69], [-1.509, 0.174, 1.744, -0.053, 3.532, 2.532], [0.744, 2.248, 0.965, 1.964, 0.231, 1.764]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_31_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_31_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.189, -0.394, 0.453, 1.615, 0.833, 0.943]]\nB: [[-0.04, -0.278, 0.23, 1.326, 1.046, 0.463]]\nC: [[-0.492, -0.1, 0.679, 1.535, 0.67, -0.014]]\nD: [[0.006, 0.067, 0.535, 1.038, 1.473, 0.446]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the coffee table in the scene. The camera pose information includes: the rotation matrix: [[-0.799511, 0.533863, -0.275266], [0.600541, 0.71925, -0.349328], [0.011492, -0.4446, -0.895656]]; the translation vector: [2.031323, 2.312379, 1.200993], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.189, -0.394, 0.453, 1.615, 0.833, 0.943]]\nB: [[-0.04, -0.278, 0.23, 1.326, 1.046, 0.463]]\nC: [[-0.492, -0.1, 0.679, 1.535, 0.67, -0.014]]\nD: [[0.006, 0.067, 0.535, 1.038, 1.473, 0.446]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_32_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_32_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.534, -3.167, 1.655, 0.929, 0.443, 2.126]]\nB: [[1.524, -3.177, 0.91, 0.507, 0.601, 2.272]]\nC: [[1.265, -3.361, 1.281, 0.587, 0.91, 2.343]]\nD: [[1.106, -3.397, 1.033, 0.365, 0.531, 2.023]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shower walls in the scene. The camera pose information includes: the rotation matrix: [[0.590232, -0.352789, 0.726062], [-0.807221, -0.252962, 0.533296], [-0.004475, -0.900861, -0.434086]]; the translation vector: [2.518124, 2.463328, 1.346668], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.534, -3.167, 1.655, 0.929, 0.443, 2.126]]\nB: [[1.524, -3.177, 0.91, 0.507, 0.601, 2.272]]\nC: [[1.265, -3.361, 1.281, 0.587, 0.91, 2.343]]\nD: [[1.106, -3.397, 1.033, 0.365, 0.531, 2.023]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_33_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_33_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.432, -0.058, 1.349, -0.208, 0.669, 1.766], [0.527, -0.253, 0.616, -0.259, 1.051, 2.58]]\nB: [[-1.145, -0.538, 0.911, 0.071, 0.71, 1.954], [0.803, -0.422, 1.032, 0.108, 0.84, 2.211]]\nC: [[-1.363, -0.409, 0.647, 0.052, 0.929, 2.359], [0.332, 0.057, 1.462, -0.091, 0.807, 2.526]]\nD: [[-1.139, -0.369, 0.72, -0.007, 0.535, 2.292], [0.44, -0.22, 1.3, 0.54, 1.144, 2.107]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. The camera pose information includes: the rotation matrix: [[-0.464707, 0.496079, -0.733453], [0.882598, 0.326106, -0.338639], [0.071191, -0.804711, -0.589382]]; the translation vector: [2.864701, 0.868861, 1.204561], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.432, -0.058, 1.349, -0.208, 0.669, 1.766], [0.527, -0.253, 0.616, -0.259, 1.051, 2.58]]\nB: [[-1.145, -0.538, 0.911, 0.071, 0.71, 1.954], [0.803, -0.422, 1.032, 0.108, 0.84, 2.211]]\nC: [[-1.363, -0.409, 0.647, 0.052, 0.929, 2.359], [0.332, 0.057, 1.462, -0.091, 0.807, 2.526]]\nD: [[-1.139, -0.369, 0.72, -0.007, 0.535, 2.292], [0.44, -0.22, 1.3, 0.54, 1.144, 2.107]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_34_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_34_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.366, -0.589, 0.493, 0.271, 1.157, 0.396]]\nB: [[-2.148, -0.107, 0.643, 0.495, 1.354, 0.165]]\nC: [[-2.396, -0.378, 0.719, 0.293, 1.134, 0.807]]\nD: [[-2.162, -0.271, 0.293, -0.116, 0.802, 0.089]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the books in the scene. The camera pose information includes: the rotation matrix: [[0.467192, 0.317292, -0.825262], [0.883302, -0.126478, 0.451421], [0.038855, -0.939856, -0.339354]]; the translation vector: [2.723032, 3.168159, 1.438168], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.366, -0.589, 0.493, 0.271, 1.157, 0.396]]\nB: [[-2.148, -0.107, 0.643, 0.495, 1.354, 0.165]]\nC: [[-2.396, -0.378, 0.719, 0.293, 1.134, 0.807]]\nD: [[-2.162, -0.271, 0.293, -0.116, 0.802, 0.089]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_35_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_35_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.721, -0.518, 0.621, 0.489, 0.194, 0.671], [0.219, 1.2, 0.605, 0.561, 1.11, 0.032]]\nB: [[1.127, -0.237, 0.575, 0.571, 0.442, 0.463], [0.315, 0.86, 0.589, 0.436, 0.639, 0.436]]\nC: [[1.534, -0.019, 0.554, 1.064, 0.929, 0.813], [0.238, 0.541, 0.519, 0.085, 0.619, 0.329]]\nD: [[0.67, 0.235, 1.051, 0.639, -0.039, 0.084], [0.187, 0.38, 0.829, 0.452, 0.327, 0.898]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[0.473704, -0.275929, 0.836342], [-0.879436, -0.198746, 0.432542], [0.046868, -0.940406, -0.336809]]; the translation vector: [2.984934, 2.048073, 1.446683], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.721, -0.518, 0.621, 0.489, 0.194, 0.671], [0.219, 1.2, 0.605, 0.561, 1.11, 0.032]]\nB: [[1.127, -0.237, 0.575, 0.571, 0.442, 0.463], [0.315, 0.86, 0.589, 0.436, 0.639, 0.436]]\nC: [[1.534, -0.019, 0.554, 1.064, 0.929, 0.813], [0.238, 0.541, 0.519, 0.085, 0.619, 0.329]]\nD: [[0.67, 0.235, 1.051, 0.639, -0.039, 0.084], [0.187, 0.38, 0.829, 0.452, 0.327, 0.898]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_36_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_36_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.092, -0.678, 1.584, -0.059, 1.814, 1.264], [-1.925, -0.058, 1.478, -0.34, 2.948, 1.581]]\nB: [[1.208, -0.318, 1.322, 0.047, 1.935, 1.314], [-2.088, -0.879, 1.441, 0.235, 2.296, 1.085]]\nC: [[1.41, -0.38, 1.574, 0.141, 1.666, 1.41], [-1.712, -0.407, 1.364, 0.152, 2.69, 1.496]]\nD: [[1.415, -0.841, 1.953, -0.188, 1.625, 1.182], [-1.333, -0.183, 1.414, 0.172, 2.326, 1.539]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the blackboard in the scene. The camera pose information includes: the rotation matrix: [[0.24604, -0.551346, 0.797171], [-0.968826, -0.115295, 0.219278], [-0.028988, -0.826271, -0.562526]]; the translation vector: [1.704247, 2.057158, 1.361636], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.092, -0.678, 1.584, -0.059, 1.814, 1.264], [-1.925, -0.058, 1.478, -0.34, 2.948, 1.581]]\nB: [[1.208, -0.318, 1.322, 0.047, 1.935, 1.314], [-2.088, -0.879, 1.441, 0.235, 2.296, 1.085]]\nC: [[1.41, -0.38, 1.574, 0.141, 1.666, 1.41], [-1.712, -0.407, 1.364, 0.152, 2.69, 1.496]]\nD: [[1.415, -0.841, 1.953, -0.188, 1.625, 1.182], [-1.333, -0.183, 1.414, 0.172, 2.326, 1.539]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_37_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_37_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.294, -3.518, 1.054, 3.936, 0.361, 0.915], [0.879, 3.786, 1.41, 2.097, 0.63, 1.328]]\nB: [[-0.76, -3.309, 1.31, 3.985, 0.372, 1.047], [0.904, 3.311, 1.519, 1.8, 0.243, 1.41]]\nC: [[-0.969, -3.07, 1.797, 3.572, 0.172, 1.293], [1.021, 3.539, 1.127, 2.014, -0.169, 1.491]]\nD: [[-0.614, -3.4, 1.295, 4.114, 0.42, 0.553], [0.711, 2.902, 1.12, 1.656, 0.643, 1.016]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the whiteboard in the scene. The camera pose information includes: the rotation matrix: [[-0.852441, 0.228219, -0.470383], [0.522431, 0.337001, -0.78326], [-0.020235, -0.913426, -0.406502]]; the translation vector: [1.798405, 5.320803, 1.619482], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.294, -3.518, 1.054, 3.936, 0.361, 0.915], [0.879, 3.786, 1.41, 2.097, 0.63, 1.328]]\nB: [[-0.76, -3.309, 1.31, 3.985, 0.372, 1.047], [0.904, 3.311, 1.519, 1.8, 0.243, 1.41]]\nC: [[-0.969, -3.07, 1.797, 3.572, 0.172, 1.293], [1.021, 3.539, 1.127, 2.014, -0.169, 1.491]]\nD: [[-0.614, -3.4, 1.295, 4.114, 0.42, 0.553], [0.711, 2.902, 1.12, 1.656, 0.643, 1.016]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_38_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_38_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.694, -2.027, 0.441, 1.326, 1.107, 0.898], [-0.288, -2.078, 0.474, 1.039, 1.539, 0.924]]\nB: [[1.654, -2.115, 0.692, 1.098, 1.029, 0.654], [-0.011, -1.968, 0.288, 1.388, 1.994, 1.185]]\nC: [[2.035, -2.378, 0.613, 1.604, 1.492, 1.161], [-0.68, -1.93, 0.48, 0.656, 1.897, 0.701]]\nD: [[1.361, -2.093, 0.306, 1.061, 0.846, 0.974], [-0.065, -1.942, 0.682, 1.519, 1.648, 1.035]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the foosball table in the scene. The camera pose information includes: the rotation matrix: [[-0.699126, -0.324611, 0.637064], [-0.713802, 0.265353, -0.648131], [0.041344, -0.907863, -0.417224]]; the translation vector: [0.050403, 3.78209, 1.506908], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.694, -2.027, 0.441, 1.326, 1.107, 0.898], [-0.288, -2.078, 0.474, 1.039, 1.539, 0.924]]\nB: [[1.654, -2.115, 0.692, 1.098, 1.029, 0.654], [-0.011, -1.968, 0.288, 1.388, 1.994, 1.185]]\nC: [[2.035, -2.378, 0.613, 1.604, 1.492, 1.161], [-0.68, -1.93, 0.48, 0.656, 1.897, 0.701]]\nD: [[1.361, -2.093, 0.306, 1.061, 0.846, 0.974], [-0.065, -1.942, 0.682, 1.519, 1.648, 1.035]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_39_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_39_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.934, -0.844, -0.178, -0.06, 0.379, 0.377]]\nB: [[-2.118, -0.866, 0.424, -0.166, 0.218, 0.556]]\nC: [[-2.075, -0.928, -0.19, 0.558, 0.471, 0.431]]\nD: [[-1.78, -0.879, 0.057, 0.14, 0.194, 0.118]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[-0.079918, -0.690871, 0.718547], [-0.996802, 0.055321, -0.057677], [9.6e-05, -0.720858, -0.693082]]; the translation vector: [1.142658, 0.968078, 1.385987], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.934, -0.844, -0.178, -0.06, 0.379, 0.377]]\nB: [[-2.118, -0.866, 0.424, -0.166, 0.218, 0.556]]\nC: [[-2.075, -0.928, -0.19, 0.558, 0.471, 0.431]]\nD: [[-1.78, -0.879, 0.057, 0.14, 0.194, 0.118]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_40_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_40_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.004, -0.056, 0.156, 0.548, 2.574, 0.973], [0.863, 1.538, 0.8, 1.622, 0.932, 0.511]]\nB: [[-0.752, -0.451, 0.479, 0.974, 2.169, 0.971], [0.505, 1.322, 0.592, 1.774, 0.902, 0.995]]\nC: [[-0.502, -0.659, 0.53, 0.847, 2.257, 0.624], [0.069, 1.171, 0.213, 2.015, 1.277, 1.24]]\nD: [[-0.413, -0.371, 0.765, 1.102, 2.094, 1.312], [0.364, 1.532, 0.25, 2.233, 1.243, 0.916]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the couch in the scene. The camera pose information includes: the rotation matrix: [[-0.861262, 0.35211, -0.366398], [0.508128, 0.60504, -0.61297], [0.005853, -0.714105, -0.700014]]; the translation vector: [3.145762, 3.637784, 1.437024], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.004, -0.056, 0.156, 0.548, 2.574, 0.973], [0.863, 1.538, 0.8, 1.622, 0.932, 0.511]]\nB: [[-0.752, -0.451, 0.479, 0.974, 2.169, 0.971], [0.505, 1.322, 0.592, 1.774, 0.902, 0.995]]\nC: [[-0.502, -0.659, 0.53, 0.847, 2.257, 0.624], [0.069, 1.171, 0.213, 2.015, 1.277, 1.24]]\nD: [[-0.413, -0.371, 0.765, 1.102, 2.094, 1.312], [0.364, 1.532, 0.25, 2.233, 1.243, 0.916]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_41_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_41_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.464, -1.008, 0.704, 0.548, 0.603, 1.02], [-0.19, -0.382, 0.144, 0.733, 0.716, 0.646], [-0.404, 0.288, 0.436, 0.98, 0.357, 1.05], [1.664, -1.226, -0.006, 0.561, 0.79, 0.588], [0.842, 1.15, 0.487, 0.536, 0.705, 0.632], [0.433, 0.35, -0.074, 0.642, 0.688, 0.335], [1.494, 3.097, 0.552, 0.862, 0.855, 0.649], [-1.799, -1.668, 0.843, 1.135, 1.012, 0.533], [2.071, -0.103, 0.082, 0.7, 0.467, 1.183], [2.558, 1.11, 0.748, 0.486, 0.458, 0.736], [-0.954, 2.892, -0.06, 0.296, 0.278, 1.194], [-1.303, 2.083, 0.061, 0.236, 0.278, 0.444], [-1.569, 0.894, 0.218, 0.482, 1.049, 0.471]]\nB: [[0.772, -0.719, 0.389, 0.713, 0.789, 0.818], [-0.024, -0.745, 0.397, 0.693, 0.69, 0.791], [-0.445, -0.009, 0.396, 0.704, 0.6, 0.798], [1.881, -0.924, 0.405, 0.629, 0.643, 0.773], [0.681, 0.918, 0.401, 0.691, 0.741, 0.776], [0.646, 0.122, 0.392, 0.618, 0.697, 0.804], [1.675, 2.694, 0.343, 0.794, 0.824, 0.712], [-1.741, -1.918, 0.384, 0.689, 0.734, 0.793], [1.972, 0.182, 0.329, 0.798, 0.905, 0.759], [2.104, 1.432, 0.601, 0.176, 0.467, 0.305], [-1.26, 2.803, 0.397, 0.519, 0.618, 0.85], [-1.699, 1.837, 0.379, 0.732, 0.671, 0.798], [-1.685, 1.314, 0.409, 0.719, 0.764, 0.815]]\nC: [[0.533, -0.974, 0.234, 0.918, 0.378, 0.964], [-0.355, -1.19, 0.156, 0.302, 
0.635, 0.774], [-0.597, 0.157, 0.288, 1.05, 0.184, 0.298], [2.072, -0.909, 0.536, 0.468, 0.691, 0.463], [0.786, 1.284, 0.692, 1.11, 1.012, 1.207], [0.407, 0.333, 0.418, 0.195, 0.858, 0.97], [1.968, 3.191, -0.153, 0.695, 1.269, 0.454], [-1.257, -1.997, 0.349, 0.303, 0.286, 0.552], [2.317, 0.459, 0.175, 0.403, 1.116, 1.213], [2.141, 1.823, 0.68, -0.29, 0.059, -0.035], [-1.354, 3.299, 0.362, 0.406, 0.802, 0.98], [-2.092, 2.265, 0.732, 1.224, 0.725, 0.93], [-1.784, 1.414, 0.713, 0.316, 1.116, 0.675]]\nD: [[0.989, -0.333, 0.223, 0.813, 0.656, 0.519], [0.19, -0.985, 0.389, 0.303, 0.729, 1.121], [-0.625, 0.156, 0.665, 1.074, 0.926, 0.429], [2.366, -0.669, 0.862, 0.551, 0.718, 0.409], [1.078, 0.548, 0.472, 1.129, 0.587, 0.295], [0.268, -0.298, 0.199, 0.384, 0.582, 0.724], [1.775, 3.124, 0.353, 0.87, 1.306, 0.424], [-2.119, -2.015, 0.712, 0.444, 0.613, 1.097], [2.125, 0.536, -0.025, 0.783, 0.67, 0.385], [2.16, 1.441, 0.464, 0.575, 0.443, 0.108], [-1.127, 3.006, 0.402, 0.226, 0.819, 0.552], [-1.981, 2.007, -0.054, 1.127, 0.372, 0.846], [-1.217, 1.009, -0.072, 0.967, 0.351, 1.126]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[0.951558, 0.16536, -0.259218], [0.307283, -0.481983, 0.820531], [0.010744, -0.860436, -0.509446]]; the translation vector: [2.919862, 3.428013, 1.521081], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.464, -1.008, 0.704, 0.548, 0.603, 1.02], [-0.19, -0.382, 0.144, 0.733, 0.716, 0.646], [-0.404, 0.288, 0.436, 0.98, 0.357, 1.05], [1.664, -1.226, -0.006, 0.561, 0.79, 0.588], [0.842, 1.15, 0.487, 0.536, 0.705, 0.632], [0.433, 0.35, -0.074, 0.642, 0.688, 0.335], [1.494, 3.097, 0.552, 0.862, 0.855, 0.649], [-1.799, -1.668, 0.843, 1.135, 1.012, 0.533], [2.071, -0.103, 0.082, 0.7, 0.467, 1.183], [2.558, 1.11, 0.748, 0.486, 0.458, 0.736], [-0.954, 2.892, -0.06, 0.296, 0.278, 1.194], [-1.303, 2.083, 0.061, 0.236, 0.278, 0.444], [-1.569, 0.894, 0.218, 0.482, 1.049, 0.471]]\nB: [[0.772, -0.719, 0.389, 0.713, 0.789, 0.818], [-0.024, -0.745, 0.397, 0.693, 0.69, 0.791], [-0.445, -0.009, 0.396, 0.704, 0.6, 0.798], [1.881, -0.924, 0.405, 0.629, 0.643, 0.773], [0.681, 0.918, 0.401, 0.691, 0.741, 0.776], [0.646, 0.122, 0.392, 0.618, 0.697, 0.804], [1.675, 2.694, 0.343, 0.794, 0.824, 0.712], [-1.741, -1.918, 0.384, 0.689, 0.734, 0.793], [1.972, 0.182, 0.329, 0.798, 0.905, 0.759], [2.104, 1.432, 0.601, 0.176, 0.467, 0.305], [-1.26, 2.803, 0.397, 0.519, 0.618, 0.85], [-1.699, 1.837, 0.379, 0.732, 0.671, 0.798], [-1.685, 1.314, 0.409, 0.719, 0.764, 0.815]]\nC: [[0.533, -0.974, 0.234, 0.918, 0.378, 0.964], [-0.355, -1.19, 0.156, 0.302, 0.635, 0.774], [-0.597, 0.157, 0.288, 1.05, 0.184, 0.298], [2.072, -0.909, 0.536, 0.468, 0.691, 0.463], [0.786, 1.284, 0.692, 1.11, 1.012, 1.207], [0.407, 0.333, 0.418, 0.195, 0.858, 0.97], [1.968, 3.191, -0.153, 0.695, 1.269, 0.454], [-1.257, -1.997, 0.349, 0.303, 0.286, 0.552], [2.317, 0.459, 0.175, 0.403, 1.116, 1.213], [2.141, 1.823, 0.68, -0.29, 0.059, -0.035], [-1.354, 3.299, 0.362, 0.406, 0.802, 0.98], [-2.092, 2.265, 0.732, 1.224, 0.725, 0.93], [-1.784, 1.414, 0.713, 0.316, 1.116, 0.675]]\nD: [[0.989, -0.333, 0.223, 0.813, 0.656, 0.519], [0.19, -0.985, 0.389, 0.303, 0.729, 1.121], [-0.625, 0.156, 0.665, 1.074, 0.926, 0.429], [2.366, -0.669, 0.862, 0.551, 0.718, 0.409], [1.078, 0.548, 0.472, 1.129, 
0.587, 0.295], [0.268, -0.298, 0.199, 0.384, 0.582, 0.724], [1.775, 3.124, 0.353, 0.87, 1.306, 0.424], [-2.119, -2.015, 0.712, 0.444, 0.613, 1.097], [2.125, 0.536, -0.025, 0.783, 0.67, 0.385], [2.16, 1.441, 0.464, 0.575, 0.443, 0.108], [-1.127, 3.006, 0.402, 0.226, 0.819, 0.552], [-1.981, 2.007, -0.054, 1.127, 0.372, 0.846], [-1.217, 1.009, -0.072, 0.967, 0.351, 1.126]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_42_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_42_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.467, 4.66, 0.479, 1.188, 0.953, 0.41], [-0.153, 2.813, 0.487, 0.892, 1.061, 0.54], [0.179, 1.102, 0.898, 0.937, 1.029, 0.418], [1.771, 1.676, 0.076, 1.486, 0.58, 0.338], [1.933, -1.484, -0.005, 1.579, 0.28, 0.413], [-0.13, 5.403, 0.908, 0.195, 1.153, 0.536]]\nB: [[2.334, 4.242, 0.72, 1.63, 0.144, 0.491], [-0.388, 2.622, 0.373, 0.028, 1.059, 0.86], [0.405, 0.837, 0.295, 0.037, 1.048, 0.862], [2.233, 1.663, 0.713, 1.024, 0.482, 0.49], [1.733, -1.171, 0.524, 1.296, 0.141, 0.721], [-0.04, 4.994, 0.079, 0.373, 0.629, 0.062]]\nC: [[2.181, 4.661, 0.338, 1.164, 0.599, 0.637], [-0.181, 2.466, 0.019, 0.362, 0.849, 0.165], [-0.035, 1.291, 0.403, 0.392, 0.795, 0.638], [2.338, 1.45, 0.464, 0.895, 0.891, 0.816], [2.039, -1.264, 0.768, 1.237, 0.686, -0.053], [-0.249, 5.084, 0.593, 0.24, 0.421, 0.492]]\nD: [[1.918, 4.662, 0.478, 1.328, 0.546, 0.414], [0.093, 2.502, 0.4, 0.472, 0.776, 0.619], [0.138, 1.203, 0.414, 0.446, 0.869, 0.461], [1.918, 1.858, 0.513, 1.358, 0.507, 0.451], [2.021, -1.528, 0.41, 1.371, 0.472, 0.431], [0.209, 5.284, 0.463, 0.428, 0.778, 0.322]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the bench in the scene. 
The camera pose information includes: the rotation matrix: [[-0.482968, -0.397392, 0.78027], [-0.874514, 0.173759, -0.452807], [0.044362, -0.901048, -0.431445]]; the translation vector: [8.974016, 2.795387, 1.945192], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.467, 4.66, 0.479, 1.188, 0.953, 0.41], [-0.153, 2.813, 0.487, 0.892, 1.061, 0.54], [0.179, 1.102, 0.898, 0.937, 1.029, 0.418], [1.771, 1.676, 0.076, 1.486, 0.58, 0.338], [1.933, -1.484, -0.005, 1.579, 0.28, 0.413], [-0.13, 5.403, 0.908, 0.195, 1.153, 0.536]]\nB: [[2.334, 4.242, 0.72, 1.63, 0.144, 0.491], [-0.388, 2.622, 0.373, 0.028, 1.059, 0.86], [0.405, 0.837, 0.295, 0.037, 1.048, 0.862], [2.233, 1.663, 0.713, 1.024, 0.482, 0.49], [1.733, -1.171, 0.524, 1.296, 0.141, 0.721], [-0.04, 4.994, 0.079, 0.373, 0.629, 0.062]]\nC: [[2.181, 4.661, 0.338, 1.164, 0.599, 0.637], [-0.181, 2.466, 0.019, 0.362, 0.849, 0.165], [-0.035, 1.291, 0.403, 0.392, 0.795, 0.638], [2.338, 1.45, 0.464, 0.895, 0.891, 0.816], [2.039, -1.264, 0.768, 1.237, 0.686, -0.053], [-0.249, 5.084, 0.593, 0.24, 0.421, 0.492]]\nD: [[1.918, 4.662, 0.478, 1.328, 0.546, 0.414], [0.093, 2.502, 0.4, 0.472, 0.776, 0.619], [0.138, 1.203, 0.414, 0.446, 0.869, 0.461], [1.918, 1.858, 0.513, 1.358, 0.507, 0.451], [2.021, -1.528, 0.41, 1.371, 0.472, 0.431], [0.209, 5.284, 0.463, 0.428, 0.778, 0.322]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_43_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_43_1.png"], "output": 
"D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.653, 1.297, 0.818, 0.22, 0.426, 1.095]]\nB: [[-1.511, 1.726, 0.43, 0.986, 0.39, 0.401]]\nC: [[-0.81, 1.586, -0.129, 0.278, 0.94, 0.466]]\nD: [[-1.238, 1.344, 0.361, 0.491, 0.703, 0.77]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the toilet in the scene. The camera pose information includes: the rotation matrix: [[0.573165, 0.475287, -0.667521], [0.819422, -0.337921, 0.462988], [-0.005517, -0.81235, -0.583144]]; the translation vector: [4.230747, 1.597944, 1.425469], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.653, 1.297, 0.818, 0.22, 0.426, 1.095]]\nB: [[-1.511, 1.726, 0.43, 0.986, 0.39, 0.401]]\nC: [[-0.81, 1.586, -0.129, 0.278, 0.94, 0.466]]\nD: [[-1.238, 1.344, 0.361, 0.491, 0.703, 0.77]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_44_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_44_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.21, -0.485, 1.162, 0.745, 0.977, 0.286], [0.409, 1.308, 0.507, 0.07, 0.653, 0.254], [-1.112, -1.013, 0.307, 0.502, 0.872, 0.567], [-0.178, 2.738, 0.98, 0.567, 0.888, 0.53], [0.062, 1.603, 0.573, 0.692, 0.195, 0.048], [0.203, -0.892, 0.691, 0.653, 0.429, 0.535], [0.028, 1.784, 0.14, 1.147, 0.406, 0.851], [-1.458, 1.116, 0.904, -0.122, 0.156, 0.773], [-1.192, -0.149, 0.71, 0.375, 0.509, 0.581], [1.914, -2.0, 0.71, 0.384, 0.176, -0.331], [1.687, -2.156, 1.387, -0.058, 0.551, 0.368], [0.793, -1.724, 1.309, 1.148, 0.62, 0.588], [1.399, -0.955, 1.401, 0.399, 0.543, 0.388], [-0.785, -3.035, 1.174, 0.319, 0.082, 0.789], [-0.814, -2.329, 0.623, 0.245, 0.091, 0.496], [-1.62, -3.469, 0.316, 0.527, 0.537, -0.091]]\nB: [[-0.08, -0.154, 1.025, 0.591, 0.905, 0.775], [0.195, 0.928, 0.983, 0.833, 0.216, 0.071], [-0.777, -0.244, 0.921, 0.352, 0.434, 0.837], [-0.104, 1.99, 0.831, 0.825, 0.625, 0.159], [1.019, 2.186, 0.505, 0.763, 0.5, 0.673], [0.085, -0.695, 1.038, 0.323, 0.449, 0.684], [-0.014, 1.677, 0.448, 0.846, 0.305, -0.088], [-0.906, 1.351, 0.456, 0.541, 1.066, 0.626], [-1.282, 0.246, 0.87, 0.842, 0.096, -0.15], [1.42, -1.945, 0.918, 0.762, 0.341, 0.254], [1.009, -1.899, 1.409, -0.041, 0.531, 0.04], [0.874, -1.746, 1.047, 0.664, 0.437, 0.465], [0.723, -1.178, 0.705, 0.411, 0.715, 0.301], [-0.325, -2.808, 0.799, 0.443, 0.515, -0.023], [-1.586, -1.764, 0.236, 0.308, 0.382, 0.158], [-1.704, -3.657, 0.202, 
0.579, -0.129, 0.217]]\nC: [[0.074, -0.449, 0.793, 0.473, 0.548, 0.492], [0.224, 1.038, 0.781, 0.486, 0.546, 0.522], [-0.941, -0.689, 0.578, 0.69, 0.615, 0.42], [-0.002, 2.387, 0.73, 0.7, 0.637, 0.471], [0.54, 1.814, 0.876, 0.434, 0.466, 0.544], [-0.295, -1.043, 0.729, 0.48, 0.533, 0.501], [-0.372, 1.676, 0.602, 0.676, 0.567, 0.405], [-1.148, 1.569, 0.485, 0.349, 0.639, 0.495], [-0.821, 0.09, 0.693, 0.409, 0.4, 0.269], [1.644, -1.897, 1.107, 0.364, 0.18, 0.12], [1.307, -2.14, 1.134, 0.414, 0.578, 0.428], [0.763, -1.797, 0.957, 0.648, 0.49, 0.409], [1.155, -1.373, 0.909, 0.317, 0.441, 0.117], [-0.563, -2.579, 0.735, 0.309, 0.521, 0.447], [-1.263, -2.059, 0.721, 0.472, 0.232, 0.232], [-1.688, -3.278, 0.604, 0.595, 0.313, 0.401]]\nD: [[0.369, -0.147, 1.222, 0.22, 0.106, 0.249], [0.37, 1.261, 1.11, 0.14, 1.02, 0.894], [-0.639, -0.96, 0.333, 0.677, 0.877, 0.601], [0.112, 1.921, 0.621, 0.682, 0.214, 0.04], [0.061, 1.445, 0.485, 0.375, 0.738, 0.414], [-0.478, -0.871, 0.684, 0.362, 0.566, 0.762], [-0.314, 1.927, 0.136, 0.42, 0.773, 0.685], [-1.086, 1.078, 0.616, 0.363, 0.796, 0.02], [-0.78, 0.455, 1.075, -0.039, 0.211, 0.125], [1.409, -1.503, 1.252, 0.797, 0.258, -0.146], [1.115, -1.981, 0.929, 0.053, 0.518, 0.484], [1.215, -2.2, 1.257, 0.76, 0.293, 0.427], [1.189, -1.058, 0.631, 0.369, 0.328, -0.119], [-0.365, -2.692, 1.041, -0.142, 0.542, 0.05], [-0.833, -2.437, 0.641, 0.718, 0.012, 0.121], [-2.016, -3.644, 1.062, 0.946, -0.031, -0.016]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[-0.844798, -0.442354, 0.301064], [-0.534849, 0.714819, -0.450523], [-0.015916, -0.541624, -0.84047]]; the translation vector: [3.085932, 7.995926, 1.934485], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.21, -0.485, 1.162, 0.745, 0.977, 0.286], [0.409, 1.308, 0.507, 0.07, 0.653, 0.254], [-1.112, -1.013, 0.307, 0.502, 0.872, 0.567], [-0.178, 2.738, 0.98, 0.567, 0.888, 0.53], [0.062, 1.603, 0.573, 0.692, 0.195, 0.048], [0.203, -0.892, 0.691, 0.653, 0.429, 0.535], [0.028, 1.784, 0.14, 1.147, 0.406, 0.851], [-1.458, 1.116, 0.904, -0.122, 0.156, 0.773], [-1.192, -0.149, 0.71, 0.375, 0.509, 0.581], [1.914, -2.0, 0.71, 0.384, 0.176, -0.331], [1.687, -2.156, 1.387, -0.058, 0.551, 0.368], [0.793, -1.724, 1.309, 1.148, 0.62, 0.588], [1.399, -0.955, 1.401, 0.399, 0.543, 0.388], [-0.785, -3.035, 1.174, 0.319, 0.082, 0.789], [-0.814, -2.329, 0.623, 0.245, 0.091, 0.496], [-1.62, -3.469, 0.316, 0.527, 0.537, -0.091]]\nB: [[-0.08, -0.154, 1.025, 0.591, 0.905, 0.775], [0.195, 0.928, 0.983, 0.833, 0.216, 0.071], [-0.777, -0.244, 0.921, 0.352, 0.434, 0.837], [-0.104, 1.99, 0.831, 0.825, 0.625, 0.159], [1.019, 2.186, 0.505, 0.763, 0.5, 0.673], [0.085, -0.695, 1.038, 0.323, 0.449, 0.684], [-0.014, 1.677, 0.448, 0.846, 0.305, -0.088], [-0.906, 1.351, 0.456, 0.541, 1.066, 0.626], [-1.282, 0.246, 0.87, 0.842, 0.096, -0.15], [1.42, -1.945, 0.918, 0.762, 0.341, 0.254], [1.009, -1.899, 1.409, -0.041, 0.531, 0.04], [0.874, -1.746, 1.047, 0.664, 0.437, 0.465], [0.723, -1.178, 0.705, 0.411, 0.715, 0.301], [-0.325, -2.808, 0.799, 0.443, 0.515, -0.023], [-1.586, -1.764, 0.236, 0.308, 0.382, 0.158], [-1.704, -3.657, 0.202, 0.579, -0.129, 0.217]]\nC: [[0.074, -0.449, 0.793, 0.473, 0.548, 0.492], [0.224, 1.038, 0.781, 0.486, 0.546, 0.522], [-0.941, -0.689, 0.578, 0.69, 0.615, 0.42], [-0.002, 2.387, 0.73, 0.7, 0.637, 0.471], [0.54, 1.814, 0.876, 0.434, 0.466, 0.544], 
[-0.295, -1.043, 0.729, 0.48, 0.533, 0.501], [-0.372, 1.676, 0.602, 0.676, 0.567, 0.405], [-1.148, 1.569, 0.485, 0.349, 0.639, 0.495], [-0.821, 0.09, 0.693, 0.409, 0.4, 0.269], [1.644, -1.897, 1.107, 0.364, 0.18, 0.12], [1.307, -2.14, 1.134, 0.414, 0.578, 0.428], [0.763, -1.797, 0.957, 0.648, 0.49, 0.409], [1.155, -1.373, 0.909, 0.317, 0.441, 0.117], [-0.563, -2.579, 0.735, 0.309, 0.521, 0.447], [-1.263, -2.059, 0.721, 0.472, 0.232, 0.232], [-1.688, -3.278, 0.604, 0.595, 0.313, 0.401]]\nD: [[0.369, -0.147, 1.222, 0.22, 0.106, 0.249], [0.37, 1.261, 1.11, 0.14, 1.02, 0.894], [-0.639, -0.96, 0.333, 0.677, 0.877, 0.601], [0.112, 1.921, 0.621, 0.682, 0.214, 0.04], [0.061, 1.445, 0.485, 0.375, 0.738, 0.414], [-0.478, -0.871, 0.684, 0.362, 0.566, 0.762], [-0.314, 1.927, 0.136, 0.42, 0.773, 0.685], [-1.086, 1.078, 0.616, 0.363, 0.796, 0.02], [-0.78, 0.455, 1.075, -0.039, 0.211, 0.125], [1.409, -1.503, 1.252, 0.797, 0.258, -0.146], [1.115, -1.981, 0.929, 0.053, 0.518, 0.484], [1.215, -2.2, 1.257, 0.76, 0.293, 0.427], [1.189, -1.058, 0.631, 0.369, 0.328, -0.119], [-0.365, -2.692, 1.041, -0.142, 0.542, 0.05], [-0.833, -2.437, 0.641, 0.718, 0.012, 0.121], [-2.016, -3.644, 1.062, 0.946, -0.031, -0.016]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_45_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_45_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.564, -1.252, 0.695, 0.857, 0.171, 1.483], [-0.805, -0.829, 1.211, 0.152, 2.16, 2.47], [-1.232, 0.204, 0.072, 0.257, 0.055, 0.162], [0.074, 0.233, 1.112, 1.831, 0.158, 2.315], [0.903, -0.43, 1.266, 0.188, 1.396, 2.012], [0.949, -1.643, 0.661, 0.101, 0.755, 1.399], [0.614, -1.995, 0.572, 0.724, 0.056, 1.162]]\nB: [[0.352, -1.05, 0.8, 0.589, 0.069, 1.588], [-0.381, -1.063, 0.88, -0.027, 2.004, 2.387], [-0.929, 0.409, 0.362, 0.309, 0.339, 
-0.301], [-0.26, 0.432, 1.078, 1.853, 0.513, 2.721], [1.171, -0.028, 1.724, -0.263, 0.948, 2.304], [1.072, -2.041, 1.024, -0.297, 0.869, 1.517], [0.352, -2.32, 0.85, 0.916, -0.424, 1.2]]\nC: [[0.424, -1.563, 1.009, 0.591, -0.023, 1.935], [-0.442, -0.344, 1.695, 0.23, 2.524, 2.736], [-1.205, 0.414, 0.154, -0.209, -0.177, -0.009], [-0.098, 0.328, 1.36, 1.735, 0.101, 1.922], [0.835, -0.195, 1.265, 0.532, 0.907, 2.267], [1.354, -1.455, 1.149, 0.399, 0.893, 1.521], [0.733, -1.909, 0.585, 1.055, -0.351, 1.621]]\nD: [[0.598, -1.618, 0.741, 0.612, 0.383, 1.422], [-0.532, -0.954, 1.597, 0.537, 2.362, 2.085], [-0.843, 0.31, -0.092, 0.065, -0.048, 0.556], [0.212, 0.31, 0.904, 1.605, 0.458, 1.973], [0.493, -0.221, 1.142, 0.015, 1.45, 2.441], [1.383, -2.104, 0.997, -0.035, 0.835, 1.803], [0.664, -2.077, 1.046, 1.1, 0.235, 1.396]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.982764, 0.054289, -0.17671], [0.184841, -0.27426, 0.943724], [0.002769, -0.960122, -0.279568]]; the translation vector: [4.072058, 1.220293, 1.47625], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.564, -1.252, 0.695, 0.857, 0.171, 1.483], [-0.805, -0.829, 1.211, 0.152, 2.16, 2.47], [-1.232, 0.204, 0.072, 0.257, 0.055, 0.162], [0.074, 0.233, 1.112, 1.831, 0.158, 2.315], [0.903, -0.43, 1.266, 0.188, 1.396, 2.012], [0.949, -1.643, 0.661, 0.101, 0.755, 1.399], [0.614, -1.995, 0.572, 0.724, 0.056, 1.162]]\nB: [[0.352, -1.05, 0.8, 0.589, 0.069, 1.588], [-0.381, -1.063, 0.88, -0.027, 2.004, 2.387], [-0.929, 0.409, 0.362, 0.309, 0.339, -0.301], [-0.26, 0.432, 1.078, 1.853, 0.513, 2.721], [1.171, -0.028, 1.724, -0.263, 0.948, 2.304], [1.072, -2.041, 1.024, -0.297, 0.869, 1.517], [0.352, -2.32, 0.85, 0.916, -0.424, 1.2]]\nC: [[0.424, -1.563, 1.009, 0.591, -0.023, 1.935], [-0.442, -0.344, 1.695, 0.23, 2.524, 2.736], [-1.205, 0.414, 0.154, -0.209, -0.177, -0.009], [-0.098, 0.328, 1.36, 1.735, 0.101, 1.922], [0.835, -0.195, 1.265, 0.532, 0.907, 2.267], [1.354, -1.455, 1.149, 0.399, 0.893, 1.521], [0.733, -1.909, 0.585, 1.055, -0.351, 1.621]]\nD: [[0.598, -1.618, 0.741, 0.612, 0.383, 1.422], [-0.532, -0.954, 1.597, 0.537, 2.362, 2.085], [-0.843, 0.31, -0.092, 0.065, -0.048, 0.556], [0.212, 0.31, 0.904, 1.605, 0.458, 1.973], [0.493, -0.221, 1.142, 0.015, 1.45, 2.441], [1.383, -2.104, 0.997, -0.035, 0.835, 1.803], [0.664, -2.077, 1.046, 1.1, 0.235, 1.396]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_46_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_46_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.784, -1.767, 0.476, 0.231, 0.24, 0.946], [-0.366, -0.774, 1.245, 0.024, 1.212, 0.879], [0.003, -1.957, 1.089, 0.298, 0.345, 0.685], [0.22, -0.648, 1.189, -0.053, 1.037, 0.737]]\nB: [[0.318, -1.739, 0.9, 0.365, 0.659, 0.502], [-0.143, -0.934, 0.904, 0.311, 0.754, 0.487], [-0.263, -1.46, 0.926, 0.248, 0.697, 0.452], [0.319, -1.069, 0.941, 
0.277, 0.615, 0.5]]\nC: [[0.289, -1.409, 1.144, 0.67, 0.233, 0.02], [0.068, -0.634, 0.752, -0.119, 1.056, 0.899], [0.211, -1.754, 1.05, -0.206, 0.931, 0.732], [-0.148, -1.524, 1.046, -0.083, 1.07, 0.467]]\nD: [[0.118, -1.502, 0.988, 0.826, 0.676, 0.125], [-0.431, -1.392, 0.927, 0.243, 0.317, 0.128], [-0.041, -1.634, 0.476, -0.222, 0.764, 0.802], [0.199, -1.056, 1.182, 0.1, 0.287, 0.626]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the monitor in the scene. The camera pose information includes: the rotation matrix: [[-0.481759, -0.460793, 0.745371], [-0.875469, 0.290199, -0.386444], [-0.038235, -0.838722, -0.543216]]; the translation vector: [3.08436, 2.075189, 1.468295], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.784, -1.767, 0.476, 0.231, 0.24, 0.946], [-0.366, -0.774, 1.245, 0.024, 1.212, 0.879], [0.003, -1.957, 1.089, 0.298, 0.345, 0.685], [0.22, -0.648, 1.189, -0.053, 1.037, 0.737]]\nB: [[0.318, -1.739, 0.9, 0.365, 0.659, 0.502], [-0.143, -0.934, 0.904, 0.311, 0.754, 0.487], [-0.263, -1.46, 0.926, 0.248, 0.697, 0.452], [0.319, -1.069, 0.941, 0.277, 0.615, 0.5]]\nC: [[0.289, -1.409, 1.144, 0.67, 0.233, 0.02], [0.068, -0.634, 0.752, -0.119, 1.056, 0.899], [0.211, -1.754, 1.05, -0.206, 0.931, 0.732], [-0.148, -1.524, 1.046, -0.083, 1.07, 0.467]]\nD: [[0.118, -1.502, 0.988, 0.826, 0.676, 0.125], [-0.431, -1.392, 0.927, 0.243, 0.317, 0.128], [-0.041, -1.634, 0.476, -0.222, 0.764, 0.802], [0.199, -1.056, 1.182, 0.1, 0.287, 0.626]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_47_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_47_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.668, 1.082, 1.323, 0.017, 1.605, 2.936], [2.048, 0.314, 0.42, 0.03, 0.539, 1.761], [1.696, -0.749, 1.568, 0.626, 3.055, 2.333], [0.245, -2.087, 0.863, 3.306, 0.237, 1.624], [-1.623, -0.389, 0.569, -0.197, 4.462, 2.242], [-0.209, 2.531, 1.246, 3.173, 0.694, 2.668]]\nB: [[2.245, 1.07, 1.564, -0.254, 1.675, 2.343], [1.954, 0.142, 1.271, 0.081, 0.135, 1.547], [1.76, -0.992, 1.136, -0.068, 2.332, 2.454], [-0.223, -2.118, 0.902, 3.53, 0.211, 2.156], [-1.973, 0.007, 0.511, -0.091, 4.007, 2.11], [0.095, 1.842, 1.661, 2.856, 0.289, 2.599]]\nC: [[1.954, 0.955, 1.134, 0.469, 1.906, 3.136], [2.108, 0.777, 1.024, 0.49, 0.432, 1.736], [1.804, -1.161, 1.159, 0.579, 2.431, 2.741], [-0.075, -2.215, 1.141, 4.022, 0.647, 1.726], [-1.259, -0.097, 0.765, 0.259, 4.315, 1.492], [0.085, 2.397, 1.377, 2.831, 0.531, 2.459]]\nD: [[1.757, 1.207, 1.277, 0.171, 1.443, 2.662], [1.938, 0.553, 
0.792, 0.372, 0.083, 1.699], [2.057, -0.732, 1.221, 0.308, 2.678, 2.532], [0.254, -2.161, 1.253, 3.68, 0.191, 2.014], [-1.595, -0.042, 0.831, 0.273, 4.363, 1.747], [0.19, 2.052, 1.297, 3.302, 0.408, 2.639]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.935878, -0.161972, 0.312885], [-0.352322, 0.433116, -0.829627], [-0.001139, -0.886666, -0.46241]]; the translation vector: [1.123681, 2.231354, 1.408983], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.668, 1.082, 1.323, 0.017, 1.605, 2.936], [2.048, 0.314, 0.42, 0.03, 0.539, 1.761], [1.696, -0.749, 1.568, 0.626, 3.055, 2.333], [0.245, -2.087, 0.863, 3.306, 0.237, 1.624], [-1.623, -0.389, 0.569, -0.197, 4.462, 2.242], [-0.209, 2.531, 1.246, 3.173, 0.694, 2.668]]\nB: [[2.245, 1.07, 1.564, -0.254, 1.675, 2.343], [1.954, 0.142, 1.271, 0.081, 0.135, 1.547], [1.76, -0.992, 1.136, -0.068, 2.332, 2.454], [-0.223, -2.118, 0.902, 3.53, 0.211, 2.156], [-1.973, 0.007, 0.511, -0.091, 4.007, 2.11], [0.095, 1.842, 1.661, 2.856, 0.289, 2.599]]\nC: [[1.954, 0.955, 1.134, 0.469, 1.906, 3.136], [2.108, 0.777, 1.024, 0.49, 0.432, 1.736], [1.804, -1.161, 1.159, 0.579, 2.431, 2.741], [-0.075, -2.215, 1.141, 4.022, 0.647, 1.726], [-1.259, -0.097, 0.765, 0.259, 4.315, 1.492], [0.085, 2.397, 1.377, 2.831, 0.531, 2.459]]\nD: [[1.757, 1.207, 1.277, 0.171, 1.443, 2.662], [1.938, 0.553, 0.792, 0.372, 0.083, 1.699], [2.057, -0.732, 1.221, 0.308, 2.678, 2.532], [0.254, -2.161, 1.253, 3.68, 0.191, 2.014], [-1.595, -0.042, 0.831, 0.273, 4.363, 1.747], [0.19, 2.052, 1.297, 3.302, 0.408, 2.639]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_48_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_48_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.874, 0.432, 0.675, 0.547, 4.466, 2.765], [2.077, 1.025, 0.96, 0.435, 5.53, 2.428], [-0.302, -2.355, 0.907, 2.255, -0.132, 2.065], [1.309, 2.97, 0.649, 2.105, 0.601, 1.706], [1.394, -2.794, 0.572, 0.929, 0.444, 0.269]]\nB: [[-1.774, -0.336, 1.363, 0.137, 4.327, 2.118], [2.024, 0.331, 0.678, 0.299, 6.038, 2.411], [-0.973, -2.453, 1.173, 2.209, -0.12, 1.889], [0.907, 3.079, 0.375, 1.445, 0.297, 1.527], [1.457, -2.684, 0.632, 0.896, -0.39, 0.289]]\nC: [[-2.285, -0.2, 1.099, -0.248, 4.908, 2.313], [2.096, 0.355, 1.235, 
0.363, 5.974, 2.044], [-1.085, -2.082, 0.91, 2.454, 0.239, 1.438], [0.732, 3.157, 0.493, 1.665, 0.182, 1.592], [1.088, -2.467, -0.003, 0.673, 0.233, 0.167]]\nD: [[-1.815, -0.066, 1.137, 0.19, 4.502, 2.283], [1.738, 0.547, 0.94, 0.42, 5.701, 2.065], [-0.777, -2.286, 1.047, 2.091, 0.123, 1.885], [1.023, 3.389, 0.735, 1.699, 0.123, 1.528], [1.495, -2.301, 0.226, 0.722, 0.028, 0.613]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.934222, -0.219071, 0.281493], [-0.356558, -0.595286, 0.72007], [0.009823, -0.773073, -0.634241]]; the translation vector: [0.331108, 1.989283, 1.551545], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.874, 0.432, 0.675, 0.547, 4.466, 2.765], [2.077, 1.025, 0.96, 0.435, 5.53, 2.428], [-0.302, -2.355, 0.907, 2.255, -0.132, 2.065], [1.309, 2.97, 0.649, 2.105, 0.601, 1.706], [1.394, -2.794, 0.572, 0.929, 0.444, 0.269]]\nB: [[-1.774, -0.336, 1.363, 0.137, 4.327, 2.118], [2.024, 0.331, 0.678, 0.299, 6.038, 2.411], [-0.973, -2.453, 1.173, 2.209, -0.12, 1.889], [0.907, 3.079, 0.375, 1.445, 0.297, 1.527], [1.457, -2.684, 0.632, 0.896, -0.39, 0.289]]\nC: [[-2.285, -0.2, 1.099, -0.248, 4.908, 2.313], [2.096, 0.355, 1.235, 0.363, 5.974, 2.044], [-1.085, -2.082, 0.91, 2.454, 0.239, 1.438], [0.732, 3.157, 0.493, 1.665, 0.182, 1.592], [1.088, -2.467, -0.003, 0.673, 0.233, 0.167]]\nD: [[-1.815, -0.066, 1.137, 0.19, 4.502, 2.283], [1.738, 0.547, 0.94, 0.42, 5.701, 2.065], [-0.777, -2.286, 1.047, 2.091, 0.123, 1.885], [1.023, 3.389, 0.735, 1.699, 0.123, 1.528], [1.495, -2.301, 0.226, 0.722, 0.028, 0.613]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_49_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_49_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.355, 0.849, 0.484, 0.583, 0.026, -0.166]]\nB: [[-0.954, 0.48, 0.115, 0.22, 0.221, 0.246]]\nC: [[-0.886, 0.23, -0.323, 0.388, 0.524, 0.544]]\nD: [[-0.877, -0.009, -0.082, -0.196, 0.347, 0.57]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the trash can in the scene. The camera pose information includes: the rotation matrix: [[-0.986418, -0.051155, 0.156087], [-0.152905, 0.633099, -0.758819], [-0.060001, -0.772379, -0.632322]]; the translation vector: [2.055195, 1.600374, 1.268236], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.355, 0.849, 0.484, 0.583, 0.026, -0.166]]\nB: [[-0.954, 0.48, 0.115, 0.22, 0.221, 0.246]]\nC: [[-0.886, 0.23, -0.323, 0.388, 0.524, 0.544]]\nD: [[-0.877, -0.009, -0.082, -0.196, 0.347, 0.57]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_50_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_50_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.231, -1.891, 1.007, 2.627, 2.266, 2.036], [-0.307, -0.367, 0.926, 1.513, 1.063, 1.867]]\nB: [[0.718, -2.089, 0.687, 2.342, 2.265, 1.579], [-0.634, -0.345, 1.176, 1.11, 1.202, 2.075]]\nC: [[0.464, -1.663, 1.056, 2.135, 2.464, 2.098], [-0.781, -0.219, 1.071, 1.18, 1.361, 1.522]]\nD: [[-0.112, -2.192, 0.852, 2.525, 1.965, 2.377], [0.128, -0.805, 0.888, 1.396, 1.418, 2.338]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the bathroom stall in the scene. The camera pose information includes: the rotation matrix: [[-0.255252, -0.433184, 0.864406], [-0.966562, 0.137073, -0.216725], [-0.024605, -0.890821, -0.453687]]; the translation vector: [1.468232, 3.881342, 1.432686], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.231, -1.891, 1.007, 2.627, 2.266, 2.036], [-0.307, -0.367, 0.926, 1.513, 1.063, 1.867]]\nB: [[0.718, -2.089, 0.687, 2.342, 2.265, 1.579], [-0.634, -0.345, 1.176, 1.11, 1.202, 2.075]]\nC: [[0.464, -1.663, 1.056, 2.135, 2.464, 2.098], [-0.781, -0.219, 1.071, 1.18, 1.361, 1.522]]\nD: [[-0.112, -2.192, 0.852, 2.525, 1.965, 2.377], [0.128, -0.805, 0.888, 1.396, 1.418, 2.338]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_51_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_51_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.293, -1.301, 1.261, 0.557, 0.076, 0.181]]\nB: [[-2.3, -0.603, 0.539, 0.144, 0.291, 0.744]]\nC: [[-2.289, -1.004, 0.913, 0.094, 0.463, 0.318]]\nD: [[-2.447, -0.778, 0.56, -0.086, 0.586, 0.687]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[0.140295, 0.625342, -0.767636], [0.990108, -0.090149, 0.107516], [-0.001967, -0.775126, -0.631804]]; the translation vector: [3.410891, 3.073526, 1.198756], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-2.293, -1.301, 1.261, 0.557, 0.076, 0.181]]\nB: [[-2.3, -0.603, 0.539, 0.144, 0.291, 0.744]]\nC: [[-2.289, -1.004, 0.913, 0.094, 0.463, 0.318]]\nD: [[-2.447, -0.778, 0.56, -0.086, 0.586, 0.687]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_52_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_52_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.354, 0.454, 0.921, -0.335, 0.498, 0.874], [0.185, -0.574, 0.751, 0.09, -0.38, 0.911], [0.667, 0.586, 2.143, 0.441, 0.23, 0.505]]\nB: [[-1.09, 0.059, 1.019, 0.117, 0.263, 0.377], [0.279, -1.061, 0.877, 0.477, 0.116, 0.622], [0.666, 0.093, 1.789, 0.132, 0.373, 0.347]]\nC: [[-1.434, -0.263, 0.532, 0.445, 0.024, 0.383], [-0.034, -1.535, 0.533, 0.655, 0.426, 0.876], [0.704, 0.231, 1.687, 0.279, -0.11, 0.575]]\nD: [[-0.897, -0.345, 1.454, 0.607, 0.705, 0.804], [0.146, -1.337, 0.587, 0.096, 0.382, 0.839], [0.567, -0.339, 1.673, 0.166, 0.534, 0.522]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the towel in the scene. The camera pose information includes: the rotation matrix: [[-0.221984, 0.421429, -0.879273], [0.97466, 0.121427, -0.187867], [0.027595, -0.898695, -0.437705]]; the translation vector: [3.155292, 0.483793, 1.35371], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.354, 0.454, 0.921, -0.335, 0.498, 0.874], [0.185, -0.574, 0.751, 0.09, -0.38, 0.911], [0.667, 0.586, 2.143, 0.441, 0.23, 0.505]]\nB: [[-1.09, 0.059, 1.019, 0.117, 0.263, 0.377], [0.279, -1.061, 0.877, 0.477, 0.116, 0.622], [0.666, 0.093, 1.789, 0.132, 0.373, 0.347]]\nC: [[-1.434, -0.263, 0.532, 0.445, 0.024, 0.383], [-0.034, -1.535, 0.533, 0.655, 0.426, 0.876], [0.704, 0.231, 1.687, 0.279, -0.11, 0.575]]\nD: [[-0.897, -0.345, 1.454, 0.607, 0.705, 0.804], [0.146, -1.337, 0.587, 0.096, 0.382, 0.839], [0.567, -0.339, 1.673, 0.166, 0.534, 0.522]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_53_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_53_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.872, -1.053, 1.714, -0.4, 0.547, -0.388]]\nB: [[-0.641, -0.865, 2.002, 0.06, 0.688, 0.05]]\nC: [[-0.24, -0.538, 2.349, -0.015, 0.604, 0.452]]\nD: [[-0.437, -0.89, 1.743, -0.382, 0.608, -0.394]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shower curtain rod in the scene. The camera pose information includes: the rotation matrix: [[0.173351, 0.592298, -0.78685], [0.984858, -0.105806, 0.137329], [-0.001913, -0.798742, -0.601671]]; the translation vector: [3.264189, 1.940071, 1.28435], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.872, -1.053, 1.714, -0.4, 0.547, -0.388]]\nB: [[-0.641, -0.865, 2.002, 0.06, 0.688, 0.05]]\nC: [[-0.24, -0.538, 2.349, -0.015, 0.604, 0.452]]\nD: [[-0.437, -0.89, 1.743, -0.382, 0.608, -0.394]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_54_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_54_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.74, 0.949, 0.805, 2.053, 0.051, 1.667], [-1.281, -0.065, 0.899, 0.112, 2.004, 1.85], [0.446, -1.039, 0.627, 1.399, 0.094, 1.27], [1.134, -0.792, 0.678, 0.041, 0.539, 1.365], [-1.434, 2.505, 0.652, 0.518, 0.203, 1.213]]\nB: [[1.029, 0.71, 0.916, 2.029, -0.176, 1.869], [-1.683, -0.445, 1.099, -0.259, 2.235, 1.713], [0.848, -1.08, 0.506, 1.798, 0.259, 1.153], [1.268, -0.42, 0.271, 0.287, 0.751, 1.048], [-1.043, 2.825, 0.333, 0.321, -0.246, 1.582]]\nC: [[0.966, 1.169, 0.637, 2.193, -0.193, 1.801], [-0.869, -0.535, 1.386, 0.092, 1.727, 2.164], [0.169, -1.108, 0.224, 1.056, -0.222, 1.304], [0.91, -1.037, 1.17, -0.025, 0.5, 1.639], [-0.958, 2.714, 0.971, 0.285, -0.285, 1.316]]\nD: [[0.741, 1.382, 0.663, 1.864, -0.249, 2.055], [-1.139, 0.311, 1.207, -0.23, 2.288, 2.067], [0.431, -1.158, 0.998, 1.247, 0.194, 1.309], [0.658, -1.111, 1.067, 0.365, 0.642, 0.899], [-1.437, 2.999, 0.509, 0.702, 0.182, 1.021]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. 
The camera pose information includes: the rotation matrix: [[0.660671, 0.426343, -0.617856], [0.749322, -0.423957, 0.508701], [-0.045063, -0.799057, -0.599565]]; the translation vector: [1.739014, 2.260029, 1.323145], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.74, 0.949, 0.805, 2.053, 0.051, 1.667], [-1.281, -0.065, 0.899, 0.112, 2.004, 1.85], [0.446, -1.039, 0.627, 1.399, 0.094, 1.27], [1.134, -0.792, 0.678, 0.041, 0.539, 1.365], [-1.434, 2.505, 0.652, 0.518, 0.203, 1.213]]\nB: [[1.029, 0.71, 0.916, 2.029, -0.176, 1.869], [-1.683, -0.445, 1.099, -0.259, 2.235, 1.713], [0.848, -1.08, 0.506, 1.798, 0.259, 1.153], [1.268, -0.42, 0.271, 0.287, 0.751, 1.048], [-1.043, 2.825, 0.333, 0.321, -0.246, 1.582]]\nC: [[0.966, 1.169, 0.637, 2.193, -0.193, 1.801], [-0.869, -0.535, 1.386, 0.092, 1.727, 2.164], [0.169, -1.108, 0.224, 1.056, -0.222, 1.304], [0.91, -1.037, 1.17, -0.025, 0.5, 1.639], [-0.958, 2.714, 0.971, 0.285, -0.285, 1.316]]\nD: [[0.741, 1.382, 0.663, 1.864, -0.249, 2.055], [-1.139, 0.311, 1.207, -0.23, 2.288, 2.067], [0.431, -1.158, 0.998, 1.247, 0.194, 1.309], [0.658, -1.111, 1.067, 0.365, 0.642, 0.899], [-1.437, 2.999, 0.509, 0.702, 0.182, 1.021]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_55_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_55_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": 
"A: [[2.289, 1.091, 1.235, 0.312, 0.445, 0.181], [2.614, -0.282, 1.115, 0.396, 0.952, 0.37], [2.915, -1.131, 1.323, 0.416, 1.127, 0.582]]\nB: [[2.865, 1.27, 1.277, 0.662, 1.126, 0.413], [2.634, -0.253, 1.185, 0.725, 0.57, 0.723], [2.702, -0.759, 1.232, 0.075, 0.776, 0.187]]\nC: [[2.596, 1.198, 1.179, 0.402, 0.868, 0.166], [2.565, 0.04, 1.202, 0.364, 0.895, 0.33], [2.601, -1.116, 1.104, 0.457, 0.792, 0.155]]\nD: [[2.413, 1.176, 1.278, 0.589, 0.574, -0.073], [2.181, 0.298, 1.094, 0.783, 1.368, 0.634], [2.395, -1.077, 1.082, 0.598, 1.002, 0.39]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the windowsill in the scene. The camera pose information includes: the rotation matrix: [[0.606468, -0.360414, 0.70873], [-0.789578, -0.16805, 0.590192], [-0.093612, -0.91753, -0.386492]]; the translation vector: [2.373669, 6.226582, 1.48631], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[2.289, 1.091, 1.235, 0.312, 0.445, 0.181], [2.614, -0.282, 1.115, 0.396, 0.952, 0.37], [2.915, -1.131, 1.323, 0.416, 1.127, 0.582]]\nB: [[2.865, 1.27, 1.277, 0.662, 1.126, 0.413], [2.634, -0.253, 1.185, 0.725, 0.57, 0.723], [2.702, -0.759, 1.232, 0.075, 0.776, 0.187]]\nC: [[2.596, 1.198, 1.179, 0.402, 0.868, 0.166], [2.565, 0.04, 1.202, 0.364, 0.895, 0.33], [2.601, -1.116, 1.104, 0.457, 0.792, 0.155]]\nD: [[2.413, 1.176, 1.278, 0.589, 0.574, -0.073], [2.181, 0.298, 1.094, 0.783, 1.368, 0.634], [2.395, -1.077, 1.082, 0.598, 1.002, 0.39]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_56_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_56_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.001, 2.948, 0.696, 1.051, 0.653, 1.017], [-1.218, 2.566, 0.334, 0.65, 1.143, 0.576], [-1.049, 4.68, 0.409, 0.92, 0.545, 1.203], [-0.848, -3.875, 0.54, 0.615, 0.936, 1.037], [1.041, 2.41, 0.443, 0.89, 0.787, 0.94], [1.624, -1.213, 0.004, 1.481, 1.102, 0.478], [1.287, 1.232, 0.545, 1.204, 1.228, 0.841], [1.818, -0.113, 0.532, 0.874, 0.706, 1.086], [0.092, -5.089, 0.693, 0.961, 0.819, 0.322], [-0.468, -1.491, 0.774, 0.97, 1.2, 1.024]]\nB: [[-0.126, 2.459, 0.086, 0.932, 1.114, 1.123], [-0.966, 2.226, 0.359, 1.537, 0.757, 0.462], [-1.21, 4.954, 0.104, 1.239, 0.624, 0.543], [-0.516, -4.249, 0.544, 1.157, 1.197, 1.269], [1.294, 2.428, 0.861, 1.276, 0.579, 0.451], [1.569, -1.608, 0.36, 0.726, 1.508, 0.636], [0.656, 1.11, -0.004, 0.679, 1.224, 0.752], [0.999, -0.375, 0.707, 0.664, 1.131, 0.788], [0.529, -5.125, 0.2, 0.899, 0.951, 0.927], [-1.058, -1.898, 0.447, 0.976, 1.149, 0.369]]\nC: [[0.237, 2.908, 0.463, 0.898, 0.83, 0.718], [-0.876, 2.53, 0.52, 1.072, 0.924, 0.781], [-1.088, 4.721, 0.492, 0.991, 0.901, 0.767], [-0.583, -3.833, 0.327, 0.92, 0.918, 
0.773], [1.47, 1.953, 0.456, 0.894, 0.96, 0.795], [1.829, -1.442, 0.405, 1.045, 1.024, 0.748], [1.035, 0.766, 0.48, 0.857, 0.923, 0.799], [1.416, -0.318, 0.434, 1.021, 0.961, 0.774], [0.375, -5.051, 0.244, 0.861, 0.856, 0.761], [-0.588, -1.854, 0.411, 0.932, 0.952, 0.73]]\nD: [[0.31, 2.967, 0.642, 1.326, 0.654, 0.284], [-1.165, 2.181, 0.237, 1.304, 0.639, 0.395], [-0.721, 4.769, 0.925, 1.219, 0.928, 0.661], [-1.026, -3.416, 0.149, 0.806, 0.901, 0.778], [1.652, 1.761, 0.169, 1.115, 0.472, 1.022], [2.158, -1.036, 0.663, 0.749, 0.724, 1.014], [0.591, 0.853, 0.97, 1.294, 0.724, 0.816], [1.34, 0.03, 0.19, 1.304, 0.703, 0.552], [0.387, -4.975, 0.689, 0.413, 1.29, 0.685], [-0.424, -1.902, 0.121, 1.041, 0.562, 0.86]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the armchair in the scene. The camera pose information includes: the rotation matrix: [[0.974605, -0.106498, 0.196986], [-0.223762, -0.428932, 0.875185], [-0.008712, -0.897037, -0.44187]]; the translation vector: [2.006689, 0.552817, 1.711334], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.001, 2.948, 0.696, 1.051, 0.653, 1.017], [-1.218, 2.566, 0.334, 0.65, 1.143, 0.576], [-1.049, 4.68, 0.409, 0.92, 0.545, 1.203], [-0.848, -3.875, 0.54, 0.615, 0.936, 1.037], [1.041, 2.41, 0.443, 0.89, 0.787, 0.94], [1.624, -1.213, 0.004, 1.481, 1.102, 0.478], [1.287, 1.232, 0.545, 1.204, 1.228, 0.841], [1.818, -0.113, 0.532, 0.874, 0.706, 1.086], [0.092, -5.089, 0.693, 0.961, 0.819, 0.322], [-0.468, -1.491, 0.774, 0.97, 1.2, 1.024]]\nB: [[-0.126, 2.459, 0.086, 0.932, 1.114, 1.123], [-0.966, 2.226, 0.359, 1.537, 0.757, 0.462], [-1.21, 4.954, 0.104, 1.239, 0.624, 0.543], [-0.516, -4.249, 0.544, 1.157, 1.197, 1.269], [1.294, 2.428, 0.861, 1.276, 0.579, 0.451], [1.569, -1.608, 0.36, 0.726, 1.508, 0.636], [0.656, 1.11, -0.004, 0.679, 1.224, 0.752], [0.999, -0.375, 0.707, 0.664, 1.131, 0.788], [0.529, -5.125, 0.2, 0.899, 0.951, 0.927], [-1.058, -1.898, 0.447, 0.976, 1.149, 0.369]]\nC: [[0.237, 2.908, 0.463, 0.898, 0.83, 0.718], [-0.876, 2.53, 0.52, 1.072, 0.924, 0.781], [-1.088, 4.721, 0.492, 0.991, 0.901, 0.767], [-0.583, -3.833, 0.327, 0.92, 0.918, 0.773], [1.47, 1.953, 0.456, 0.894, 0.96, 0.795], [1.829, -1.442, 0.405, 1.045, 1.024, 0.748], [1.035, 0.766, 0.48, 0.857, 0.923, 0.799], [1.416, -0.318, 0.434, 1.021, 0.961, 0.774], [0.375, -5.051, 0.244, 0.861, 0.856, 0.761], [-0.588, -1.854, 0.411, 0.932, 0.952, 0.73]]\nD: [[0.31, 2.967, 0.642, 1.326, 0.654, 0.284], [-1.165, 2.181, 0.237, 1.304, 0.639, 0.395], [-0.721, 4.769, 0.925, 1.219, 0.928, 0.661], [-1.026, -3.416, 0.149, 0.806, 0.901, 0.778], [1.652, 1.761, 0.169, 1.115, 0.472, 1.022], [2.158, -1.036, 0.663, 0.749, 0.724, 1.014], [0.591, 0.853, 0.97, 1.294, 0.724, 0.816], [1.34, 0.03, 0.19, 1.304, 0.703, 0.552], [0.387, -4.975, 0.689, 0.413, 1.29, 0.685], [-0.424, -1.902, 0.121, 1.041, 0.562, 0.86]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_57_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_57_1.png"], 
"output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.945, -0.877, -0.123, 1.546, 1.347, 0.112]]\nB: [[0.857, -1.188, 0.118, 1.684, 1.025, 0.263]]\nC: [[0.995, -0.398, -0.499, 1.213, 1.119, -0.036]]\nD: [[0.539, -0.587, -0.275, 1.351, 1.1, -0.319]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. The camera pose information includes: the rotation matrix: [[-0.693623, 0.392298, -0.604144], [0.720137, 0.397492, -0.568686], [0.017048, -0.82952, -0.558217]]; the translation vector: [2.706242, 2.586761, 1.453005], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.945, -0.877, -0.123, 1.546, 1.347, 0.112]]\nB: [[0.857, -1.188, 0.118, 1.684, 1.025, 0.263]]\nC: [[0.995, -0.398, -0.499, 1.213, 1.119, -0.036]]\nD: [[0.539, -0.587, -0.275, 1.351, 1.1, -0.319]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_58_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_58_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.143, 1.575, 0.04, 0.27, 0.312, 0.15], [-0.377, 3.615, 0.346, 0.295, 0.569, 0.473], [-1.45, 0.655, 0.569, 0.701, 0.954, 0.318], [2.369, -0.854, 0.716, 0.861, 0.372, 0.384], [3.488, -0.639, 0.515, 0.374, 0.717, 0.642], [3.31, -2.144, 0.849, 0.423, 1.039, 0.351], [3.753, -1.021, 0.778, 0.709, 0.641, 0.792], [-1.949, 2.854, 0.124, 0.704, 0.146, 0.084]]\nB: [[-1.76, 1.841, 0.53, 0.73, 0.674, 0.544], [-0.679, 3.307, 0.476, 0.669, 0.734, 0.506], [-1.7, 0.568, 0.465, 0.694, 0.62, 0.52], [2.474, -1.195, 0.409, 0.606, 0.509, 0.689], [3.174, -0.614, 0.339, 0.542, 0.599, 0.782], [3.186, -2.158, 0.546, 0.503, 0.633, 0.516], [3.901, -1.236, 0.485, 0.592, 0.545, 0.635], [-1.787, 2.437, 0.508, 0.713, 0.589, 0.468]]\nC: [[-2.143, 1.685, 0.995, 0.615, 0.904, 0.263], [-1.005, 3.628, 0.394, 0.466, 0.405, 0.998], [-2.179, 0.615, 0.333, 0.233, 0.298, 0.889], [2.014, -1.057, 0.599, 0.68, 0.338, 0.974], [2.918, -0.471, 0.1, 0.575, 0.71, 0.376], [3.127, -2.436, 0.498, 0.497, 0.327, 0.902], [3.486, -1.558, 0.63, 0.593, 0.23, 0.81], [-1.407, 2.857, 0.881, 0.499, 1.07, 0.68]]\nD: [[-1.261, 2.229, 0.998, 1.215, 1.048, 0.703], [-1.092, 3.457, -0.005, 0.668, 1.114, 0.663], [-1.477, 0.865, 0.817, 0.301, 0.363, 0.292], [2.93, -1.308, 0.561, 1.073, 0.232, 1.069], [3.634, -0.503, -0.085, 0.796, 0.476, 0.342], [3.396, -2.322, 0.932, 0.945, 0.812, 0.616], [4.075, -1.495, 0.312, 0.703, 0.562, 0.973], [-2.275, 2.728, 0.786, 
0.449, 0.77, 0.134]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[-0.891251, 0.378307, -0.25011], [0.443048, 0.608538, -0.658323], [-0.096846, -0.697542, -0.709969]]; the translation vector: [4.935522, 3.588868, 1.45033], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-2.143, 1.575, 0.04, 0.27, 0.312, 0.15], [-0.377, 3.615, 0.346, 0.295, 0.569, 0.473], [-1.45, 0.655, 0.569, 0.701, 0.954, 0.318], [2.369, -0.854, 0.716, 0.861, 0.372, 0.384], [3.488, -0.639, 0.515, 0.374, 0.717, 0.642], [3.31, -2.144, 0.849, 0.423, 1.039, 0.351], [3.753, -1.021, 0.778, 0.709, 0.641, 0.792], [-1.949, 2.854, 0.124, 0.704, 0.146, 0.084]]\nB: [[-1.76, 1.841, 0.53, 0.73, 0.674, 0.544], [-0.679, 3.307, 0.476, 0.669, 0.734, 0.506], [-1.7, 0.568, 0.465, 0.694, 0.62, 0.52], [2.474, -1.195, 0.409, 0.606, 0.509, 0.689], [3.174, -0.614, 0.339, 0.542, 0.599, 0.782], [3.186, -2.158, 0.546, 0.503, 0.633, 0.516], [3.901, -1.236, 0.485, 0.592, 0.545, 0.635], [-1.787, 2.437, 0.508, 0.713, 0.589, 0.468]]\nC: [[-2.143, 1.685, 0.995, 0.615, 0.904, 0.263], [-1.005, 3.628, 0.394, 0.466, 0.405, 0.998], [-2.179, 0.615, 0.333, 0.233, 0.298, 0.889], [2.014, -1.057, 0.599, 0.68, 0.338, 0.974], [2.918, -0.471, 0.1, 0.575, 0.71, 0.376], [3.127, -2.436, 0.498, 0.497, 0.327, 0.902], [3.486, -1.558, 0.63, 0.593, 0.23, 0.81], [-1.407, 2.857, 0.881, 0.499, 1.07, 0.68]]\nD: [[-1.261, 2.229, 0.998, 1.215, 1.048, 0.703], 
[-1.092, 3.457, -0.005, 0.668, 1.114, 0.663], [-1.477, 0.865, 0.817, 0.301, 0.363, 0.292], [2.93, -1.308, 0.561, 1.073, 0.232, 1.069], [3.634, -0.503, -0.085, 0.796, 0.476, 0.342], [3.396, -2.322, 0.932, 0.945, 0.812, 0.616], [4.075, -1.495, 0.312, 0.703, 0.562, 0.973], [-2.275, 2.728, 0.786, 0.449, 0.77, 0.134]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_59_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_59_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.205, 1.797, 1.094, 1.154, 0.654, 1.112], [0.658, 1.295, 1.132, 1.277, 0.037, 0.601]]\nB: [[-0.63, 1.531, 1.06, 1.175, 0.329, 0.727], [0.734, 1.578, 0.984, 1.15, 0.266, 0.361]]\nC: [[-0.726, 1.106, 1.434, 1.522, 0.658, 0.308], [1.201, 1.481, 1.246, 0.828, 0.067, 0.371]]\nD: [[-0.719, 1.086, 1.264, 0.78, 0.793, 0.35], [0.868, 1.98, 0.75, 1.049, 0.201, 0.363]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the window in the scene. The camera pose information includes: the rotation matrix: [[0.081815, 0.638296, -0.765431], [0.996577, -0.061545, 0.055199], [-0.011875, -0.767327, -0.641146]]; the translation vector: [3.004073, 1.570726, 1.431248], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.205, 1.797, 1.094, 1.154, 0.654, 1.112], [0.658, 1.295, 1.132, 1.277, 0.037, 0.601]]\nB: [[-0.63, 1.531, 1.06, 1.175, 0.329, 0.727], [0.734, 1.578, 0.984, 1.15, 0.266, 0.361]]\nC: [[-0.726, 1.106, 1.434, 1.522, 0.658, 0.308], [1.201, 1.481, 1.246, 0.828, 0.067, 0.371]]\nD: [[-0.719, 1.086, 1.264, 0.78, 0.793, 0.35], [0.868, 1.98, 0.75, 1.049, 0.201, 0.363]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_60_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_60_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.288, 3.663, 0.885, 1.742, 0.266, 1.508], [-1.19, -3.017, 0.715, 0.179, 0.346, 1.183], [-2.568, -0.991, 0.862, 0.362, 2.842, 1.652], [-2.356, 0.313, 1.087, 0.573, 0.323, 1.151], [-2.078, 0.891, 0.947, 0.102, 1.17, 1.498]]\nB: [[1.699, 3.879, 0.807, 1.466, 0.26, 1.658], [-1.338, -3.016, 0.481, -0.091, 0.125, 1.346], [-2.791, -0.651, 0.722, 0.588, 2.782, 1.444], [-2.133, -0.174, 1.179, 0.831, 0.459, 1.476], [-2.564, 1.303, 0.485, 0.444, 1.6, 1.79]]\nC: [[1.384, 3.837, 1.191, 2.116, 0.64, 1.217], [-1.185, -3.083, 1.042, 0.674, 0.205, 0.788], [-2.424, -0.728, 0.743, -0.005, 2.436, 1.937], [-2.645, -0.046, 0.933, 0.095, 0.125, 1.323], [-2.483, 0.961, 0.887, 0.154, 0.979, 1.595]]\nD: [[1.755, 3.461, 0.788, 1.786, 0.256, 1.208], [-1.596, -3.184, 0.789, 0.372, -0.041, 1.319], [-2.923, -1.052, 1.266, 0.216, 3.322, 1.837], [-2.525, 0.237, 1.346, 0.938, 0.473, 0.759], [-2.569, 1.257, 0.568, 0.003, 1.424, 1.337]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. 
The camera pose information includes: the rotation matrix: [[-0.963317, 0.154363, -0.219528], [0.260086, 0.335369, -0.905474], [-0.066149, -0.929355, -0.363214]]; the translation vector: [5.972451, 2.818726, 1.468896], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.288, 3.663, 0.885, 1.742, 0.266, 1.508], [-1.19, -3.017, 0.715, 0.179, 0.346, 1.183], [-2.568, -0.991, 0.862, 0.362, 2.842, 1.652], [-2.356, 0.313, 1.087, 0.573, 0.323, 1.151], [-2.078, 0.891, 0.947, 0.102, 1.17, 1.498]]\nB: [[1.699, 3.879, 0.807, 1.466, 0.26, 1.658], [-1.338, -3.016, 0.481, -0.091, 0.125, 1.346], [-2.791, -0.651, 0.722, 0.588, 2.782, 1.444], [-2.133, -0.174, 1.179, 0.831, 0.459, 1.476], [-2.564, 1.303, 0.485, 0.444, 1.6, 1.79]]\nC: [[1.384, 3.837, 1.191, 2.116, 0.64, 1.217], [-1.185, -3.083, 1.042, 0.674, 0.205, 0.788], [-2.424, -0.728, 0.743, -0.005, 2.436, 1.937], [-2.645, -0.046, 0.933, 0.095, 0.125, 1.323], [-2.483, 0.961, 0.887, 0.154, 0.979, 1.595]]\nD: [[1.755, 3.461, 0.788, 1.786, 0.256, 1.208], [-1.596, -3.184, 0.789, 0.372, -0.041, 1.319], [-2.923, -1.052, 1.266, 0.216, 3.322, 1.837], [-2.525, 0.237, 1.346, 0.938, 0.473, 0.759], [-2.569, 1.257, 0.568, 0.003, 1.424, 1.337]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_61_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_61_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", 
"options": "A: [[-1.195, 2.616, 0.764, 0.381, 0.904, 1.179], [1.178, 2.791, 1.06, 0.523, 1.876, 0.795], [2.137, -1.926, -0.065, 1.185, 1.084, 0.312], [-0.599, -1.923, 0.519, 0.294, 0.992, 1.206]]\nB: [[-1.068, 2.261, 1.292, 0.971, 1.029, 0.824], [1.489, 3.189, 0.594, 0.774, 1.161, 1.156], [1.843, -2.018, 0.792, 0.473, 1.372, 0.616], [-0.424, -2.012, 0.649, 0.681, 1.809, 0.928]]\nC: [[-0.884, 2.735, 0.809, 0.721, 1.274, 0.906], [1.496, 2.941, 0.628, 0.833, 1.571, 0.871], [1.868, -1.949, 0.395, 0.878, 0.892, 0.788], [-0.793, -2.3, 0.359, 0.741, 1.335, 0.757]]\nD: [[-1.225, 2.279, 0.349, 0.295, 0.789, 0.542], [1.131, 2.464, 0.4, 0.503, 1.911, 0.903], [1.57, -1.598, -0.016, 1.245, 1.391, 0.466], [-0.41, -2.691, 0.26, 1.21, 1.681, 0.98]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the couch in the scene. The camera pose information includes: the rotation matrix: [[-0.824719, -0.175736, 0.537546], [-0.564369, 0.316962, -0.762249], [-0.036427, -0.932015, -0.360584]]; the translation vector: [4.397487, 4.054199, 1.411764], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.195, 2.616, 0.764, 0.381, 0.904, 1.179], [1.178, 2.791, 1.06, 0.523, 1.876, 0.795], [2.137, -1.926, -0.065, 1.185, 1.084, 0.312], [-0.599, -1.923, 0.519, 0.294, 0.992, 1.206]]\nB: [[-1.068, 2.261, 1.292, 0.971, 1.029, 0.824], [1.489, 3.189, 0.594, 0.774, 1.161, 1.156], [1.843, -2.018, 0.792, 0.473, 1.372, 0.616], [-0.424, -2.012, 0.649, 0.681, 1.809, 0.928]]\nC: [[-0.884, 2.735, 0.809, 0.721, 1.274, 0.906], [1.496, 2.941, 0.628, 0.833, 1.571, 0.871], [1.868, -1.949, 0.395, 0.878, 0.892, 0.788], [-0.793, -2.3, 0.359, 0.741, 1.335, 0.757]]\nD: [[-1.225, 2.279, 0.349, 0.295, 0.789, 0.542], [1.131, 2.464, 0.4, 0.503, 1.911, 0.903], [1.57, -1.598, -0.016, 1.245, 1.391, 0.466], [-0.41, -2.691, 0.26, 1.21, 1.681, 0.98]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_62_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_62_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.115, -1.562, 0.911, 0.99, 0.176, 0.93], [0.903, -1.409, 1.128, 0.969, 0.316, 0.989]]\nB: [[0.419, -1.51, 1.351, 0.838, 0.104, 1.371], [0.797, -1.27, 0.95, 0.885, 0.56, 0.709]]\nC: [[-0.056, -1.325, 0.584, 1.397, 0.105, 0.436], [0.94, -1.633, 1.178, 0.648, 0.434, 1.044]]\nD: [[0.609, -1.088, 0.429, 1.463, 0.186, 0.54], [0.51, -1.323, 0.699, 1.115, 0.814, 1.432]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the window in the scene. The camera pose information includes: the rotation matrix: [[0.993805, -0.057016, 0.095394], [-0.110597, -0.423109, 0.899304], [-0.010913, -0.904283, -0.426794]]; the translation vector: [3.282054, 2.568905, 1.512321], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.115, -1.562, 0.911, 0.99, 0.176, 0.93], [0.903, -1.409, 1.128, 0.969, 0.316, 0.989]]\nB: [[0.419, -1.51, 1.351, 0.838, 0.104, 1.371], [0.797, -1.27, 0.95, 0.885, 0.56, 0.709]]\nC: [[-0.056, -1.325, 0.584, 1.397, 0.105, 0.436], [0.94, -1.633, 1.178, 0.648, 0.434, 1.044]]\nD: [[0.609, -1.088, 0.429, 1.463, 0.186, 0.54], [0.51, -1.323, 0.699, 1.115, 0.814, 1.432]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_63_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_63_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.753, 0.465, 1.403, 0.46, 4.996, 2.959], [-1.738, -1.218, 1.272, 1.062, 1.528, 2.501], [-0.405, 2.797, 1.433, 4.292, 0.332, 2.875], [-2.525, 2.379, 1.355, 0.074, 0.839, 1.664], [-2.109, 0.693, 1.971, 0.227, 2.364, 1.533], [0.282, -2.054, 1.197, 3.118, 0.28, 2.272], [0.151, -2.857, 1.262, 0.294, 1.776, 2.355]]\nB: [[1.644, 0.604, 1.06, 0.766, 5.332, 3.344], [-2.097, -1.683, 0.861, 1.264, 1.832, 2.762], [-0.249, 2.988, 1.171, 4.734, 0.777, 3.234], [-2.915, 2.214, 1.5, 0.285, 1.098, 1.997], [-1.7, 0.54, 1.692, 0.479, 2.794, 1.178], [0.193, -1.942, 1.679, 3.173, -0.143, 2.182], [0.278, -3.151, 1.749, -0.197, 1.898, 2.594]]\nC: [[1.329, 0.268, 1.849, 0.784, 4.719, 2.961], [-2.126, -1.458, 1.073, 0.788, 1.484, 2.789], [0.005, 2.714, 1.367, 3.948, 0.242, 2.522], [-2.545, 2.463, 1.604, 0.21, 1.144, 1.521], [-1.811, 0.332, 2.299, -0.123, 1.943, 
1.085], [0.387, -2.373, 0.727, 2.861, -0.215, 2.059], [0.369, -3.057, 1.007, 0.316, 1.439, 1.965]]\nD: [[2.224, 0.601, 1.81, 0.297, 5.34, 2.544], [-1.441, -1.038, 1.648, 1.209, 1.768, 2.642], [-0.455, 2.785, 1.909, 4.119, 0.083, 3.179], [-2.32, 2.048, 1.417, -0.178, 0.398, 1.998], [-1.992, 0.619, 1.973, 0.251, 2.119, 1.3], [0.593, -1.72, 1.138, 2.733, 0.423, 2.378], [0.595, -2.424, 1.148, 0.122, 2.12, 2.512]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.994136, 0.036629, -0.101745], [0.107123, -0.462198, 0.880283], [-0.014782, -0.88602, -0.463411]]; the translation vector: [3.8191, 1.340951, 1.354002], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.753, 0.465, 1.403, 0.46, 4.996, 2.959], [-1.738, -1.218, 1.272, 1.062, 1.528, 2.501], [-0.405, 2.797, 1.433, 4.292, 0.332, 2.875], [-2.525, 2.379, 1.355, 0.074, 0.839, 1.664], [-2.109, 0.693, 1.971, 0.227, 2.364, 1.533], [0.282, -2.054, 1.197, 3.118, 0.28, 2.272], [0.151, -2.857, 1.262, 0.294, 1.776, 2.355]]\nB: [[1.644, 0.604, 1.06, 0.766, 5.332, 3.344], [-2.097, -1.683, 0.861, 1.264, 1.832, 2.762], [-0.249, 2.988, 1.171, 4.734, 0.777, 3.234], [-2.915, 2.214, 1.5, 0.285, 1.098, 1.997], [-1.7, 0.54, 1.692, 0.479, 2.794, 1.178], [0.193, -1.942, 1.679, 3.173, -0.143, 2.182], [0.278, -3.151, 1.749, -0.197, 1.898, 2.594]]\nC: [[1.329, 0.268, 1.849, 0.784, 4.719, 2.961], [-2.126, -1.458, 1.073, 0.788, 1.484, 2.789], [0.005, 2.714, 1.367, 3.948, 0.242, 2.522], [-2.545, 2.463, 1.604, 0.21, 1.144, 1.521], [-1.811, 0.332, 2.299, -0.123, 1.943, 1.085], [0.387, -2.373, 0.727, 2.861, -0.215, 2.059], [0.369, -3.057, 1.007, 0.316, 1.439, 1.965]]\nD: [[2.224, 0.601, 1.81, 0.297, 5.34, 2.544], [-1.441, -1.038, 1.648, 1.209, 1.768, 2.642], [-0.455, 2.785, 1.909, 4.119, 0.083, 3.179], [-2.32, 2.048, 1.417, -0.178, 0.398, 1.998], [-1.992, 0.619, 1.973, 0.251, 2.119, 1.3], [0.593, -1.72, 1.138, 2.733, 0.423, 2.378], [0.595, -2.424, 1.148, 0.122, 2.12, 2.512]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_64_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_64_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.229, -0.625, 1.789, 0.384, 0.619, 0.577], [-0.619, -2.375, 0.661, 0.567, 0.451, 0.111], [1.272, -2.43, 0.132, 0.447, 0.727, 0.3], [1.644, -2.684, 0.788, 0.577, 0.322, 0.308], [-0.576, -2.651, 0.078, 0.434, 0.523, 0.246], [0.194, -2.629, 0.081, 0.411, 0.494, 0.232], [0.495, -2.465, 0.021, 0.42, 0.459, 0.089], [-0.19, -2.648, 0.019, 0.384, 
0.512, 0.116], [0.653, -2.701, 0.729, 0.282, 0.339, 0.17], [0.952, -2.696, 0.744, 0.349, 0.322, 0.22], [1.255, -2.766, 0.79, 0.368, 0.486, 0.32], [0.1, -2.746, 0.715, 0.314, 0.34, 0.156], [0.399, -2.748, 0.688, 0.296, 0.348, 0.103], [-0.127, -2.741, 0.687, 0.255, 0.368, 0.104], [-0.41, -2.745, 0.703, 0.341, 0.369, 0.132], [-1.782, -2.695, 0.512, 0.572, 0.485, 0.241], [-1.365, -2.686, 0.5, 0.492, 0.488, 0.277], [-1.027, -2.616, 0.39, 0.417, 0.378, 0.292], [-2.221, -0.682, 1.308, 0.347, 0.518, 0.497]]\nB: [[-2.177, -0.167, 1.791, 0.429, 0.305, 0.461], [-0.533, -2.428, 0.376, 0.625, 0.83, -0.181], [0.833, -2.571, -0.115, 0.273, 1.148, 0.126], [1.611, -2.253, 0.787, 0.359, 0.551, -0.134], [-1.058, -2.229, -0.315, 0.638, 0.268, -0.067], [0.333, -2.804, -0.071, 0.337, 0.161, 0.002], [0.886, -2.763, 0.464, 0.54, 0.824, 0.171], [0.083, -2.871, 0.059, 0.444, 0.352, 0.054], [0.665, -2.763, 0.558, 0.057, 0.308, 0.039], [0.563, -2.607, 1.101, 0.044, -0.169, 0.664], [1.085, -2.593, 0.464, 0.42, 0.951, 0.013], [-0.365, -2.365, 0.619, 0.59, 0.077, 0.369], [0.543, -2.864, 0.581, 0.554, 0.644, -0.05], [0.36, -3.102, 0.746, 0.301, -0.13, -0.221], [-0.22, -2.771, 1.165, 0.154, 0.295, 0.195], [-1.434, -2.444, 0.547, 0.734, 0.246, -0.108], [-0.997, -2.269, 0.094, 0.441, 0.845, 0.283], [-1.137, -2.213, 0.312, 0.148, 0.309, 0.772], [-2.498, -0.603, 1.369, 0.752, 0.555, 0.615]]\nC: [[-2.33, -1.056, 1.723, -0.065, 0.432, 0.415], [-0.968, -1.986, 0.569, 0.909, 0.497, 0.486], [1.32, -2.346, -0.114, 0.554, 0.588, 0.715], [1.949, -3.024, 0.857, 1.07, 0.018, 0.558], [-0.719, -2.255, 0.515, 0.899, 0.995, 0.643], [0.265, -2.28, -0.308, 0.384, 0.12, 0.468], [0.651, -2.056, -0.288, 0.45, 0.167, 0.402], [-0.058, -2.555, -0.352, 0.064, 0.242, 0.36], [0.516, -2.537, 1.033, 0.148, 0.192, 0.352], [0.983, -2.457, 0.904, -0.004, -0.102, -0.046], [1.601, -2.407, 0.354, 0.85, 0.773, 0.225], [0.09, -3.129, 0.278, 0.778, 0.065, 0.089], [0.498, -3.096, 0.49, 0.127, 0.025, 0.421], [0.282, -2.893, 0.585, 0.538, 
-0.078, 0.192], [-0.775, -2.875, 0.541, 0.822, 0.042, 0.614], [-1.444, -2.829, 0.956, 0.56, 0.015, 0.186], [-1.857, -2.941, 0.896, 0.404, 0.313, 0.437], [-1.23, -2.427, -0.01, 0.121, 0.029, 0.052], [-2.105, -0.861, 1.621, 0.843, 0.939, 0.137]]\nD: [[-2.187, -0.629, 2.131, 0.702, 0.488, 0.299], [-1.084, -2.454, 0.389, 0.263, 0.376, -0.0], [0.958, -2.794, -0.355, 0.189, 0.618, 0.078], [2.015, -2.977, 0.616, 0.785, -0.119, 0.807], [-0.732, -2.52, -0.405, 0.133, 0.556, -0.136], [0.099, -2.242, 0.21, 0.448, 0.703, 0.555], [0.296, -2.758, -0.175, 0.146, 0.559, 0.119], [0.107, -2.903, 0.259, 0.508, 0.683, 0.189], [0.807, -2.213, 0.988, -0.022, 0.827, 0.39], [1.137, -2.436, 0.849, 0.615, -0.156, -0.078], [1.291, -2.816, 0.462, 0.333, 0.002, 0.188], [-0.057, -2.486, 0.271, 0.707, 0.496, -0.343], [0.312, -2.462, 0.382, 0.486, 0.393, -0.299], [-0.367, -3.213, 1.027, 0.397, 0.32, -0.33], [-0.12, -2.591, 0.295, 0.767, -0.13, -0.295], [-1.934, -2.605, 0.653, 0.958, 0.354, 0.257], [-1.101, -2.538, 0.202, 0.148, 0.769, 0.141], [-0.928, -2.714, 0.387, 0.917, 0.787, 0.443], [-1.94, -0.799, 1.262, 0.381, 0.02, 0.723]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the box in the scene. The camera pose information includes: the rotation matrix: [[0.983299, 0.047874, -0.175588], [0.180439, -0.382417, 0.9062], [-0.023764, -0.922749, -0.384668]]; the translation vector: [2.208684, 3.483128, 1.468268], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.229, -0.625, 1.789, 0.384, 0.619, 0.577], [-0.619, -2.375, 0.661, 0.567, 0.451, 0.111], [1.272, -2.43, 0.132, 0.447, 0.727, 0.3], [1.644, -2.684, 0.788, 0.577, 0.322, 0.308], [-0.576, -2.651, 0.078, 0.434, 0.523, 0.246], [0.194, -2.629, 0.081, 0.411, 0.494, 0.232], [0.495, -2.465, 0.021, 0.42, 0.459, 0.089], [-0.19, -2.648, 0.019, 0.384, 0.512, 0.116], [0.653, -2.701, 0.729, 0.282, 0.339, 0.17], [0.952, -2.696, 0.744, 0.349, 0.322, 0.22], [1.255, -2.766, 0.79, 0.368, 0.486, 0.32], [0.1, -2.746, 0.715, 0.314, 0.34, 0.156], [0.399, -2.748, 0.688, 0.296, 0.348, 0.103], [-0.127, -2.741, 0.687, 0.255, 0.368, 0.104], [-0.41, -2.745, 0.703, 0.341, 0.369, 0.132], [-1.782, -2.695, 0.512, 0.572, 0.485, 0.241], [-1.365, -2.686, 0.5, 0.492, 0.488, 0.277], [-1.027, -2.616, 0.39, 0.417, 0.378, 0.292], [-2.221, -0.682, 1.308, 0.347, 0.518, 0.497]]\nB: [[-2.177, -0.167, 1.791, 0.429, 0.305, 0.461], [-0.533, -2.428, 0.376, 0.625, 0.83, -0.181], [0.833, -2.571, -0.115, 0.273, 1.148, 0.126], [1.611, -2.253, 0.787, 0.359, 0.551, -0.134], [-1.058, -2.229, -0.315, 0.638, 0.268, -0.067], [0.333, -2.804, -0.071, 0.337, 0.161, 0.002], [0.886, -2.763, 0.464, 0.54, 0.824, 0.171], [0.083, -2.871, 0.059, 0.444, 0.352, 0.054], [0.665, -2.763, 0.558, 0.057, 0.308, 0.039], [0.563, -2.607, 1.101, 0.044, -0.169, 0.664], [1.085, -2.593, 0.464, 0.42, 0.951, 0.013], [-0.365, -2.365, 0.619, 0.59, 0.077, 0.369], [0.543, -2.864, 0.581, 0.554, 0.644, -0.05], [0.36, -3.102, 0.746, 0.301, -0.13, -0.221], [-0.22, -2.771, 1.165, 0.154, 0.295, 0.195], [-1.434, -2.444, 0.547, 0.734, 0.246, -0.108], [-0.997, -2.269, 0.094, 0.441, 0.845, 0.283], [-1.137, -2.213, 0.312, 0.148, 0.309, 0.772], [-2.498, -0.603, 1.369, 0.752, 0.555, 0.615]]\nC: [[-2.33, -1.056, 1.723, -0.065, 0.432, 0.415], [-0.968, -1.986, 0.569, 0.909, 0.497, 0.486], [1.32, -2.346, -0.114, 0.554, 0.588, 0.715], [1.949, -3.024, 0.857, 1.07, 0.018, 0.558], [-0.719, -2.255, 0.515, 0.899, 0.995, 0.643], 
[0.265, -2.28, -0.308, 0.384, 0.12, 0.468], [0.651, -2.056, -0.288, 0.45, 0.167, 0.402], [-0.058, -2.555, -0.352, 0.064, 0.242, 0.36], [0.516, -2.537, 1.033, 0.148, 0.192, 0.352], [0.983, -2.457, 0.904, -0.004, -0.102, -0.046], [1.601, -2.407, 0.354, 0.85, 0.773, 0.225], [0.09, -3.129, 0.278, 0.778, 0.065, 0.089], [0.498, -3.096, 0.49, 0.127, 0.025, 0.421], [0.282, -2.893, 0.585, 0.538, -0.078, 0.192], [-0.775, -2.875, 0.541, 0.822, 0.042, 0.614], [-1.444, -2.829, 0.956, 0.56, 0.015, 0.186], [-1.857, -2.941, 0.896, 0.404, 0.313, 0.437], [-1.23, -2.427, -0.01, 0.121, 0.029, 0.052], [-2.105, -0.861, 1.621, 0.843, 0.939, 0.137]]\nD: [[-2.187, -0.629, 2.131, 0.702, 0.488, 0.299], [-1.084, -2.454, 0.389, 0.263, 0.376, -0.0], [0.958, -2.794, -0.355, 0.189, 0.618, 0.078], [2.015, -2.977, 0.616, 0.785, -0.119, 0.807], [-0.732, -2.52, -0.405, 0.133, 0.556, -0.136], [0.099, -2.242, 0.21, 0.448, 0.703, 0.555], [0.296, -2.758, -0.175, 0.146, 0.559, 0.119], [0.107, -2.903, 0.259, 0.508, 0.683, 0.189], [0.807, -2.213, 0.988, -0.022, 0.827, 0.39], [1.137, -2.436, 0.849, 0.615, -0.156, -0.078], [1.291, -2.816, 0.462, 0.333, 0.002, 0.188], [-0.057, -2.486, 0.271, 0.707, 0.496, -0.343], [0.312, -2.462, 0.382, 0.486, 0.393, -0.299], [-0.367, -3.213, 1.027, 0.397, 0.32, -0.33], [-0.12, -2.591, 0.295, 0.767, -0.13, -0.295], [-1.934, -2.605, 0.653, 0.958, 0.354, 0.257], [-1.101, -2.538, 0.202, 0.148, 0.769, 0.141], [-0.928, -2.714, 0.387, 0.917, 0.787, 0.443], [-1.94, -0.799, 1.262, 0.381, 0.02, 0.723]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_65_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_65_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.897, 0.522, 0.459, 3.876, 6.999, 0.483]]\nB: [[-1.103, 0.265, -0.086, 3.353, 6.924, 0.299]]\nC: [[-1.473, 0.764, 0.085, 3.499, 6.843, 
0.726]]\nD: [[-1.038, 0.389, 0.109, 3.778, 6.648, 0.286]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. The camera pose information includes: the rotation matrix: [[0.643628, -0.362528, 0.674031], [-0.765241, -0.290748, 0.574345], [-0.012243, -0.88546, -0.464555]]; the translation vector: [2.632762, 2.243425, 1.452714], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.897, 0.522, 0.459, 3.876, 6.999, 0.483]]\nB: [[-1.103, 0.265, -0.086, 3.353, 6.924, 0.299]]\nC: [[-1.473, 0.764, 0.085, 3.499, 6.843, 0.726]]\nD: [[-1.038, 0.389, 0.109, 3.778, 6.648, 0.286]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_66_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_66_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.591, 1.483, 1.058, 1.607, 0.601, 2.492], [-2.097, -1.408, 0.835, 0.5, 0.336, 2.389]]\nB: [[0.134, 2.153, 0.957, 1.663, 0.233, 2.443], [-2.254, -1.53, 0.609, 0.554, 0.988, 1.584]]\nC: [[-0.108, 1.926, 1.025, 1.195, 0.255, 2.095], [-1.989, -1.419, 0.985, 0.159, 0.822, 1.991]]\nD: [[0.21, 1.976, 1.369, 1.051, 0.491, 1.606], [-2.131, -1.497, 0.9, 0.432, 0.96, 1.514]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. 
The camera pose information includes: the rotation matrix: [[-0.925351, 0.122106, -0.358909], [0.376741, 0.190476, -0.906524], [-0.042329, -0.974068, -0.222259]]; the translation vector: [4.735593, 2.732706, 1.21643], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.591, 1.483, 1.058, 1.607, 0.601, 2.492], [-2.097, -1.408, 0.835, 0.5, 0.336, 2.389]]\nB: [[0.134, 2.153, 0.957, 1.663, 0.233, 2.443], [-2.254, -1.53, 0.609, 0.554, 0.988, 1.584]]\nC: [[-0.108, 1.926, 1.025, 1.195, 0.255, 2.095], [-1.989, -1.419, 0.985, 0.159, 0.822, 1.991]]\nD: [[0.21, 1.976, 1.369, 1.051, 0.491, 1.606], [-2.131, -1.497, 0.9, 0.432, 0.96, 1.514]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_67_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_67_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.347, 0.112, 0.25, 1.296, 0.747, 0.377], [-0.83, 0.341, 0.317, 1.198, 1.401, 0.615], [-0.794, -1.292, 0.479, 1.485, 1.035, 1.271], [0.127, 1.653, 0.718, 0.98, 0.518, 0.445]]\nB: [[1.529, 0.501, 0.301, 0.944, 1.585, 0.652], [-1.117, 0.064, 0.625, 0.722, 0.832, 1.296], [-1.406, -0.586, 0.491, 0.928, 1.591, 0.802], [0.516, 1.562, 0.345, 0.823, 1.417, 0.468]]\nC: [[1.382, -0.298, 0.162, 0.586, 1.271, 1.153], [-1.729, -0.043, 0.911, 1.507, 1.118, 1.281], [-0.888, -0.525, -0.057, 1.572, 1.192, 0.468], [-0.205, 1.524, 0.606, 0.689, 
0.914, 0.382]]\nD: [[1.322, 0.194, 0.453, 1.016, 1.117, 0.863], [-1.253, 0.172, 0.421, 1.029, 1.045, 0.876], [-1.049, -0.979, 0.44, 1.12, 1.131, 0.861], [0.221, 1.294, 0.424, 0.876, 0.92, 0.832]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the armchair in the scene. The camera pose information includes: the rotation matrix: [[0.748873, -0.374013, 0.547087], [-0.662404, -0.447673, 0.600675], [0.020256, -0.812221, -0.582998]]; the translation vector: [3.709567, 4.406117, 1.261793], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.347, 0.112, 0.25, 1.296, 0.747, 0.377], [-0.83, 0.341, 0.317, 1.198, 1.401, 0.615], [-0.794, -1.292, 0.479, 1.485, 1.035, 1.271], [0.127, 1.653, 0.718, 0.98, 0.518, 0.445]]\nB: [[1.529, 0.501, 0.301, 0.944, 1.585, 0.652], [-1.117, 0.064, 0.625, 0.722, 0.832, 1.296], [-1.406, -0.586, 0.491, 0.928, 1.591, 0.802], [0.516, 1.562, 0.345, 0.823, 1.417, 0.468]]\nC: [[1.382, -0.298, 0.162, 0.586, 1.271, 1.153], [-1.729, -0.043, 0.911, 1.507, 1.118, 1.281], [-0.888, -0.525, -0.057, 1.572, 1.192, 0.468], [-0.205, 1.524, 0.606, 0.689, 0.914, 0.382]]\nD: [[1.322, 0.194, 0.453, 1.016, 1.117, 0.863], [-1.253, 0.172, 0.421, 1.029, 1.045, 0.876], [-1.049, -0.979, 0.44, 1.12, 1.131, 0.861], [0.221, 1.294, 0.424, 0.876, 0.92, 0.832]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_68_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_68_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.579, -0.488, 0.759, 0.356, 0.729, 0.206], [-1.432, 0.41, 0.224, 0.816, -0.16, 0.054], [-1.128, 1.211, 0.876, 0.072, 0.472, -0.431], [-0.056, 1.335, 1.059, 0.219, -0.158, 0.294], [0.39, 0.373, 0.895, 0.659, 0.538, 0.377], [-1.237, 2.65, 0.314, 0.655, 0.335, -0.177]]\nB: [[-1.898, -0.166, 1.244, 0.693, 0.01, 0.135], [-2.054, 0.428, 0.961, 0.919, 0.356, 0.407], [-1.294, 1.065, 0.511, 0.811, -0.08, -0.323], [0.085, 0.558, 1.04, 0.703, -0.22, -0.384], [1.147, 0.956, 0.305, 0.157, 0.461, -0.367], [-1.796, 2.739, 0.408, 0.015, 0.305, -0.245]]\nC: [[-1.472, -0.634, 0.769, 0.41, 0.312, 0.075], [-1.766, 0.861, 0.684, 0.449, 0.16, 0.051], [-0.868, 0.879, 0.668, 0.414, 0.211, 0.046], [-0.148, 0.874, 0.644, 0.427, 0.151, 0.056], [0.744, 0.838, 0.607, 0.528, 0.174, 0.072], [-1.369, 2.612, 0.558, 0.426, 0.186, 0.029]]\nD: [[-1.326, -0.492, 0.759, 0.773, 0.113, -0.399], [-1.742, 
0.884, 0.249, 0.825, 0.051, -0.219], [-0.59, 0.654, 0.814, 0.491, -0.041, -0.171], [-0.618, 1.322, 0.366, 0.807, 0.377, 0.225], [1.165, 1.152, 0.365, 0.032, 0.059, 0.012], [-1.206, 2.669, 0.552, 0.305, 0.052, 0.19]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the keyboard in the scene. The camera pose information includes: the rotation matrix: [[0.053762, 0.423971, -0.904079], [0.99709, -0.071809, 0.025618], [-0.05406, -0.902825, -0.426597]]; the translation vector: [3.696534, 7.381392, 1.65485], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.579, -0.488, 0.759, 0.356, 0.729, 0.206], [-1.432, 0.41, 0.224, 0.816, -0.16, 0.054], [-1.128, 1.211, 0.876, 0.072, 0.472, -0.431], [-0.056, 1.335, 1.059, 0.219, -0.158, 0.294], [0.39, 0.373, 0.895, 0.659, 0.538, 0.377], [-1.237, 2.65, 0.314, 0.655, 0.335, -0.177]]\nB: [[-1.898, -0.166, 1.244, 0.693, 0.01, 0.135], [-2.054, 0.428, 0.961, 0.919, 0.356, 0.407], [-1.294, 1.065, 0.511, 0.811, -0.08, -0.323], [0.085, 0.558, 1.04, 0.703, -0.22, -0.384], [1.147, 0.956, 0.305, 0.157, 0.461, -0.367], [-1.796, 2.739, 0.408, 0.015, 0.305, -0.245]]\nC: [[-1.472, -0.634, 0.769, 0.41, 0.312, 0.075], [-1.766, 0.861, 0.684, 0.449, 0.16, 0.051], [-0.868, 0.879, 0.668, 0.414, 0.211, 0.046], [-0.148, 0.874, 0.644, 0.427, 0.151, 0.056], [0.744, 0.838, 0.607, 0.528, 0.174, 0.072], [-1.369, 2.612, 0.558, 0.426, 0.186, 0.029]]\nD: [[-1.326, -0.492, 0.759, 0.773, 0.113, -0.399], [-1.742, 0.884, 0.249, 0.825, 0.051, -0.219], [-0.59, 0.654, 0.814, 0.491, -0.041, -0.171], [-0.618, 1.322, 0.366, 0.807, 0.377, 0.225], [1.165, 1.152, 0.365, 0.032, 0.059, 0.012], [-1.206, 2.669, 0.552, 0.305, 0.052, 0.19]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_69_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_69_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.062, 0.255, 0.974, 0.478, 0.305, 1.9]]\nB: [[0.289, 0.114, 0.997, 0.421, 0.269, 2.332]]\nC: [[-0.529, -0.167, 1.248, 0.711, 0.631, 1.869]]\nD: [[-0.117, 0.693, 1.129, 0.484, 0.656, 2.156]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shower curtain in the scene. 
The camera pose information includes: the rotation matrix: [[-0.95695, -0.100486, 0.272304], [-0.288986, 0.24231, -0.92616], [0.027085, -0.964981, -0.260918]]; the translation vector: [1.227478, 4.879099, 1.55452], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.062, 0.255, 0.974, 0.478, 0.305, 1.9]]\nB: [[0.289, 0.114, 0.997, 0.421, 0.269, 2.332]]\nC: [[-0.529, -0.167, 1.248, 0.711, 0.631, 1.869]]\nD: [[-0.117, 0.693, 1.129, 0.484, 0.656, 2.156]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_70_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_70_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.525, -2.231, 0.892, 0.349, 0.481, 0.091]]\nB: [[1.636, -2.317, 0.937, 0.292, 0.774, -0.26]]\nC: [[1.735, -2.218, 1.132, -0.039, 0.012, 0.228]]\nD: [[1.335, -2.53, 1.027, 0.634, 0.978, -0.278]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the book in the scene. The camera pose information includes: the rotation matrix: [[-0.863619, -0.252896, 0.436126], [-0.502889, 0.371124, -0.780621], [0.03556, -0.893482, -0.447688]]; the translation vector: [2.007098, 3.82416, 1.536992], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.525, -2.231, 0.892, 0.349, 0.481, 0.091]]\nB: [[1.636, -2.317, 0.937, 0.292, 0.774, -0.26]]\nC: [[1.735, -2.218, 1.132, -0.039, 0.012, 0.228]]\nD: [[1.335, -2.53, 1.027, 0.634, 0.978, -0.278]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_71_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_71_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-3.713, -2.322, 1.093, 0.437, 3.072, 2.045], [-1.081, -3.837, 1.495, 4.867, 0.501, 2.558], [2.258, -2.553, 1.174, 1.342, 1.564, 2.568], [3.141, 1.147, 1.716, 0.254, 5.221, 3.492], [1.44, -2.565, 1.73, 0.493, 2.338, 2.511], [1.459, -1.994, 0.755, 0.967, 0.884, 1.183], [1.27, 3.225, 1.429, 3.362, 0.06, 2.461], [2.687, -1.112, 0.928, 2.314, 0.606, 3.137], [3.573, 2.12, 0.945, -0.323, 1.165, 0.653]]\nB: [[-3.835, -1.629, 1.168, -0.265, 2.747, 2.333], [-1.318, -2.989, 1.688, 4.412, 0.48, 2.388], [2.689, -2.933, 1.545, 1.676, 2.18, 2.201], [3.228, 1.403, 1.452, 0.635, 4.562, 3.257], [1.389, -2.608, 0.976, 1.337, 2.222, 2.449], [1.683, -1.57, 0.448, 0.488, 1.125, 1.219], [1.286, 3.78, 1.634, 2.717, 0.735, 2.673], [2.077, -1.004, 0.831, 1.778, 0.571, 2.523], [3.367, 1.994, 0.998, 0.165, 1.01, 0.878]]\nC: [[-3.518, -1.854, 1.546, 0.215, 3.24, 2.228], [-1.249, -3.369, 1.199, 4.514, 0.422, 2.472], [2.581, -2.461, 1.261, 1.576, 1.946, 2.535], [3.098, 1.012, 1.522, 0.435, 4.946, 2.999], [1.343, -2.44, 1.234, 0.869, 1.985, 
2.527], [1.357, -2.033, 0.708, 0.777, 1.087, 1.434], [1.727, 3.433, 1.218, 3.174, 0.459, 2.415], [2.388, -1.448, 1.321, 1.857, 0.151, 2.689], [3.207, 2.39, 1.139, 0.116, 1.457, 0.447]]\nD: [[-3.315, -1.725, 1.076, -0.072, 3.369, 2.316], [-1.509, -2.948, 1.263, 4.588, 0.454, 2.009], [2.958, -2.942, 0.867, 1.911, 2.392, 2.127], [3.566, 0.671, 1.618, 0.253, 5.112, 3.1], [1.816, -2.011, 1.094, 0.402, 1.679, 2.148], [0.957, -1.668, 0.579, 1.105, 0.683, 1.586], [1.676, 3.716, 1.075, 3.204, 0.902, 2.406], [2.655, -1.717, 0.827, 1.883, 0.155, 2.358], [3.511, 2.562, 1.175, -0.348, 1.486, 0.553]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.831143, 0.312948, -0.459636], [0.555586, 0.43327, -0.709649], [-0.022937, -0.845187, -0.533978]]; the translation vector: [2.360292, 3.05803, 1.315354], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-3.713, -2.322, 1.093, 0.437, 3.072, 2.045], [-1.081, -3.837, 1.495, 4.867, 0.501, 2.558], [2.258, -2.553, 1.174, 1.342, 1.564, 2.568], [3.141, 1.147, 1.716, 0.254, 5.221, 3.492], [1.44, -2.565, 1.73, 0.493, 2.338, 2.511], [1.459, -1.994, 0.755, 0.967, 0.884, 1.183], [1.27, 3.225, 1.429, 3.362, 0.06, 2.461], [2.687, -1.112, 0.928, 2.314, 0.606, 3.137], [3.573, 2.12, 0.945, -0.323, 1.165, 0.653]]\nB: [[-3.835, -1.629, 1.168, -0.265, 2.747, 2.333], [-1.318, -2.989, 1.688, 4.412, 0.48, 2.388], [2.689, -2.933, 1.545, 1.676, 2.18, 2.201], [3.228, 1.403, 1.452, 0.635, 4.562, 3.257], [1.389, -2.608, 0.976, 1.337, 2.222, 2.449], [1.683, -1.57, 0.448, 0.488, 1.125, 1.219], [1.286, 3.78, 1.634, 2.717, 0.735, 2.673], [2.077, -1.004, 0.831, 1.778, 0.571, 2.523], [3.367, 1.994, 0.998, 0.165, 1.01, 0.878]]\nC: [[-3.518, -1.854, 1.546, 0.215, 3.24, 2.228], [-1.249, -3.369, 1.199, 4.514, 0.422, 2.472], [2.581, -2.461, 1.261, 1.576, 1.946, 2.535], [3.098, 1.012, 1.522, 0.435, 4.946, 2.999], [1.343, -2.44, 1.234, 0.869, 1.985, 2.527], [1.357, -2.033, 0.708, 0.777, 1.087, 1.434], [1.727, 3.433, 1.218, 3.174, 0.459, 2.415], [2.388, -1.448, 1.321, 1.857, 0.151, 2.689], [3.207, 2.39, 1.139, 0.116, 1.457, 0.447]]\nD: [[-3.315, -1.725, 1.076, -0.072, 3.369, 2.316], [-1.509, -2.948, 1.263, 4.588, 0.454, 2.009], [2.958, -2.942, 0.867, 1.911, 2.392, 2.127], [3.566, 0.671, 1.618, 0.253, 5.112, 3.1], [1.816, -2.011, 1.094, 0.402, 1.679, 2.148], [0.957, -1.668, 0.579, 1.105, 0.683, 1.586], [1.676, 3.716, 1.075, 3.204, 0.902, 2.406], [2.655, -1.717, 0.827, 1.883, 0.155, 2.358], [3.511, 2.562, 1.175, -0.348, 1.486, 0.553]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_72_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_72_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", 
"options": "A: [[1.046, -0.307, 0.362, 0.784, -0.024, 0.785], [1.165, -2.351, 1.138, 0.213, 0.736, 0.353], [1.15, -2.093, 0.722, 0.4, 0.33, 0.205], [1.084, -0.85, 1.131, 0.451, -0.085, 0.317], [1.331, -1.435, 0.691, 0.675, 0.723, 0.254], [-1.236, 0.563, -0.088, 0.27, -0.102, 0.794]]\nB: [[1.265, -0.056, 0.282, 0.326, 0.027, 0.886], [1.533, -2.203, 0.449, 0.341, 0.914, 0.835], [0.973, -1.818, 0.452, -0.205, -0.0, 0.557], [1.212, -0.809, 0.364, 0.233, 0.14, 0.279], [0.952, -0.74, 0.435, -0.133, 0.174, 0.554], [-1.162, 0.16, 0.691, 0.327, -0.202, 0.736]]\nC: [[1.057, -0.394, 0.235, 0.507, 0.4, 0.47], [1.152, -1.942, 0.923, 0.249, 0.43, 0.441], [1.185, -1.67, 0.793, 0.195, 0.105, 0.183], [0.815, -0.905, 0.823, 0.231, 0.165, 0.244], [0.988, -0.991, 0.818, 0.253, 0.25, 0.209], [-1.265, 0.61, 0.238, 0.204, 0.16, 0.435]]\nD: [[1.051, -0.65, -0.171, 0.578, 0.483, 0.109], [0.936, -1.859, 0.474, -0.087, 0.06, 0.148], [1.334, -2.107, 0.81, 0.465, 0.412, 0.633], [0.554, -0.966, 0.763, 0.354, 0.344, 0.116], [1.173, -0.543, 0.619, 0.486, 0.296, 0.039], [-1.019, 0.12, 0.267, -0.232, -0.155, 0.735]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[0.264492, -0.222038, 0.938479], [-0.962334, 0.002714, 0.271857], [-0.062909, -0.975034, -0.212957]]; the translation vector: [0.925816, 4.784833, 1.497389], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.046, -0.307, 0.362, 0.784, -0.024, 0.785], [1.165, -2.351, 1.138, 0.213, 0.736, 0.353], [1.15, -2.093, 0.722, 0.4, 0.33, 0.205], [1.084, -0.85, 1.131, 0.451, -0.085, 0.317], [1.331, -1.435, 0.691, 0.675, 0.723, 0.254], [-1.236, 0.563, -0.088, 0.27, -0.102, 0.794]]\nB: [[1.265, -0.056, 0.282, 0.326, 0.027, 0.886], [1.533, -2.203, 0.449, 0.341, 0.914, 0.835], [0.973, -1.818, 0.452, -0.205, -0.0, 0.557], [1.212, -0.809, 0.364, 0.233, 0.14, 0.279], [0.952, -0.74, 0.435, -0.133, 0.174, 0.554], [-1.162, 0.16, 0.691, 0.327, -0.202, 0.736]]\nC: [[1.057, -0.394, 0.235, 0.507, 0.4, 0.47], [1.152, -1.942, 0.923, 0.249, 0.43, 0.441], [1.185, -1.67, 0.793, 0.195, 0.105, 0.183], [0.815, -0.905, 0.823, 0.231, 0.165, 0.244], [0.988, -0.991, 0.818, 0.253, 0.25, 0.209], [-1.265, 0.61, 0.238, 0.204, 0.16, 0.435]]\nD: [[1.051, -0.65, -0.171, 0.578, 0.483, 0.109], [0.936, -1.859, 0.474, -0.087, 0.06, 0.148], [1.334, -2.107, 0.81, 0.465, 0.412, 0.633], [0.554, -0.966, 0.763, 0.354, 0.344, 0.116], [1.173, -0.543, 0.619, 0.486, 0.296, 0.039], [-1.019, 0.12, 0.267, -0.232, -0.155, 0.735]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_73_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_73_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.085, -0.215, 0.983, 0.671, 0.944, 0.637]]\nB: [[-1.195, -0.19, 1.175, 0.471, 1.343, 0.221]]\nC: [[-1.17, -0.298, 0.934, 0.962, 1.213, 0.413]]\nD: [[-1.39, -0.221, 0.693, 0.182, 1.277, 0.167]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the sink in the scene. 
The camera pose information includes: the rotation matrix: [[-0.409087, -0.112571, 0.905525], [-0.910894, 0.109148, -0.397943], [-0.05404, -0.987631, -0.147191]]; the translation vector: [4.421403, 3.579741, 1.526424], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.085, -0.215, 0.983, 0.671, 0.944, 0.637]]\nB: [[-1.195, -0.19, 1.175, 0.471, 1.343, 0.221]]\nC: [[-1.17, -0.298, 0.934, 0.962, 1.213, 0.413]]\nD: [[-1.39, -0.221, 0.693, 0.182, 1.277, 0.167]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_74_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_74_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.095, 1.592, 1.222, 1.568, 0.744, 2.142]]\nB: [[-0.877, 2.359, 1.301, 1.758, 0.807, 2.272]]\nC: [[-0.883, 2.133, 0.636, 0.867, 0.763, 2.547]]\nD: [[-1.101, 1.96, 1.128, 1.33, 0.454, 2.075]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the mirror doors in the scene. The camera pose information includes: the rotation matrix: [[-0.998134, -0.025826, -0.055325], [0.04389, 0.326427, -0.944203], [0.042444, -0.94487, -0.324684]]; the translation vector: [2.355182, 2.984659, 1.395898], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.095, 1.592, 1.222, 1.568, 0.744, 2.142]]\nB: [[-0.877, 2.359, 1.301, 1.758, 0.807, 2.272]]\nC: [[-0.883, 2.133, 0.636, 0.867, 0.763, 2.547]]\nD: [[-1.101, 1.96, 1.128, 1.33, 0.454, 2.075]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_75_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_75_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.917, 0.769, 0.393, 0.162, 0.916, 0.83], [2.158, 0.091, 1.885, 0.64, 0.369, 0.373]]\nB: [[1.7, 0.645, 0.863, 0.067, 1.221, 0.849], [2.328, 0.352, 2.246, 0.627, 0.498, 0.253]]\nC: [[1.798, 1.202, -0.093, 0.135, 0.516, 1.131], [2.029, 0.523, 2.037, 0.813, 0.35, 0.6]]\nD: [[1.675, 0.61, 0.316, -0.227, 0.481, 0.35], [2.253, 0.494, 1.51, 1.013, 0.177, 0.842]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the kitchen cabinet in the scene. The camera pose information includes: the rotation matrix: [[-0.399387, 0.327689, -0.856218], [0.9115, 0.041819, -0.409169], [-0.098274, -0.94386, -0.315391]]; the translation vector: [4.88233, 2.963563, 1.403722], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.917, 0.769, 0.393, 0.162, 0.916, 0.83], [2.158, 0.091, 1.885, 0.64, 0.369, 0.373]]\nB: [[1.7, 0.645, 0.863, 0.067, 1.221, 0.849], [2.328, 0.352, 2.246, 0.627, 0.498, 0.253]]\nC: [[1.798, 1.202, -0.093, 0.135, 0.516, 1.131], [2.029, 0.523, 2.037, 0.813, 0.35, 0.6]]\nD: [[1.675, 0.61, 0.316, -0.227, 0.481, 0.35], [2.253, 0.494, 1.51, 1.013, 0.177, 0.842]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_76_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_76_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.494, 1.559, 0.938, 0.656, 5.217, 2.076], [2.053, 1.519, 1.142, 0.022, 3.126, 1.696], [1.526, 0.314, 1.467, 0.388, 0.432, 1.965], [1.119, -0.324, 1.442, 0.427, 0.845, 2.198], [1.576, -0.035, 0.321, 0.045, 0.534, 1.117], [1.692, -0.958, 0.006, 0.205, 2.737, 0.651], [2.137, -2.631, 1.077, 0.184, 0.904, 1.508], [1.333, -3.255, 1.466, 0.791, -0.082, 1.662], [1.459, -3.425, 1.79, -0.348, -0.024, 0.359], [0.387, -3.416, 1.314, 3.646, -0.097, 1.995], [-1.825, -3.194, 1.168, 0.422, 1.404, 1.529], [-0.304, 4.179, 0.452, 3.264, -0.007, 1.066], [1.999, 3.872, 0.716, 0.498, 0.487, 1.358]]\nB: [[-1.693, 1.424, 1.03, 0.376, 5.083, 2.034], [1.765, 1.957, 1.138, 0.161, 3.199, 2.18], [1.589, 0.333, 0.987, 0.355, 0.095, 1.877], [1.425, 0.157, 1.015, 0.112, 0.477, 1.967], [1.63, -0.081, 0.672, 0.331, 0.259, 1.339], [1.705, -1.447, 0.484, 0.238, 2.779, 0.873], [1.951, -2.837, 1.012, 0.146, 0.69, 1.445], [1.797, -3.186, 1.022, 0.444, 0.092, 1.424], [1.591, -3.334, 
1.384, 0.106, 0.324, 0.652], [-0.022, -3.519, 0.892, 3.311, 0.275, 1.699], [-1.705, -2.728, 0.676, 0.126, 1.402, 1.204], [-0.01, 3.839, 0.745, 3.147, 0.481, 1.347], [1.63, 3.568, 0.892, 0.411, 0.327, 1.532]]\nC: [[-1.26, 1.503, 0.893, 0.629, 5.544, 1.914], [2.175, 1.47, 1.47, 0.109, 3.382, 1.686], [1.982, -0.011, 0.916, 0.426, -0.326, 1.566], [1.181, 0.067, 1.21, 0.067, 0.005, 2.351], [1.524, 0.001, 0.471, 0.286, 0.408, 1.265], [1.238, -1.52, 0.419, 0.599, 3.184, 1.176], [1.553, -3.177, 0.653, 0.32, 0.427, 1.885], [1.383, -3.363, 1.432, 0.865, -0.009, 1.444], [1.288, -3.498, 1.769, -0.257, 0.218, 1.054], [0.393, -3.522, 1.337, 3.619, 0.242, 1.594], [-1.576, -3.113, 0.753, 0.379, 1.777, 1.195], [-0.268, 3.894, 0.852, 2.983, 0.721, 1.393], [1.465, 3.133, 0.435, 0.617, 0.63, 1.96]]\nD: [[-1.946, 1.454, 1.304, 0.285, 4.759, 1.584], [1.735, 2.118, 1.431, 0.5, 3.38, 2.198], [2.01, 0.269, 1.406, 0.118, -0.362, 2.255], [1.665, -0.294, 0.623, -0.295, 0.208, 2.363], [1.918, 0.112, 1.078, 0.599, 0.597, 0.896], [1.655, -1.698, 0.75, 0.063, 2.896, 0.441], [2.382, -2.981, 1.161, 0.203, 0.379, 1.162], [1.828, -2.97, 0.979, 0.706, -0.194, 1.801], [1.717, -3.159, 1.188, 0.204, 0.385, 0.448], [0.303, -3.389, 1.008, 3.649, 0.715, 1.331], [-1.467, -2.443, 0.641, 0.545, 0.903, 1.371], [-0.151, 3.761, 0.508, 3.288, 0.802, 1.225], [1.797, 3.579, 1.179, 0.009, 0.008, 1.708]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.810147, -0.229725, 0.539341], [-0.586224, 0.314131, -0.746769], [0.002128, -0.921167, -0.389162]]; the translation vector: [3.108561, 2.950706, 1.466118], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.494, 1.559, 0.938, 0.656, 5.217, 2.076], [2.053, 1.519, 1.142, 0.022, 3.126, 1.696], [1.526, 0.314, 1.467, 0.388, 0.432, 1.965], [1.119, -0.324, 1.442, 0.427, 0.845, 2.198], [1.576, -0.035, 0.321, 0.045, 0.534, 1.117], [1.692, -0.958, 0.006, 0.205, 2.737, 0.651], [2.137, -2.631, 1.077, 0.184, 0.904, 1.508], [1.333, -3.255, 1.466, 0.791, -0.082, 1.662], [1.459, -3.425, 1.79, -0.348, -0.024, 0.359], [0.387, -3.416, 1.314, 3.646, -0.097, 1.995], [-1.825, -3.194, 1.168, 0.422, 1.404, 1.529], [-0.304, 4.179, 0.452, 3.264, -0.007, 1.066], [1.999, 3.872, 0.716, 0.498, 0.487, 1.358]]\nB: [[-1.693, 1.424, 1.03, 0.376, 5.083, 2.034], [1.765, 1.957, 1.138, 0.161, 3.199, 2.18], [1.589, 0.333, 0.987, 0.355, 0.095, 1.877], [1.425, 0.157, 1.015, 0.112, 0.477, 1.967], [1.63, -0.081, 0.672, 0.331, 0.259, 1.339], [1.705, -1.447, 0.484, 0.238, 2.779, 0.873], [1.951, -2.837, 1.012, 0.146, 0.69, 1.445], [1.797, -3.186, 1.022, 0.444, 0.092, 1.424], [1.591, -3.334, 1.384, 0.106, 0.324, 0.652], [-0.022, -3.519, 0.892, 3.311, 0.275, 1.699], [-1.705, -2.728, 0.676, 0.126, 1.402, 1.204], [-0.01, 3.839, 0.745, 3.147, 0.481, 1.347], [1.63, 3.568, 0.892, 0.411, 0.327, 1.532]]\nC: [[-1.26, 1.503, 0.893, 0.629, 5.544, 1.914], [2.175, 1.47, 1.47, 0.109, 3.382, 1.686], [1.982, -0.011, 0.916, 0.426, -0.326, 1.566], [1.181, 0.067, 1.21, 0.067, 0.005, 2.351], [1.524, 0.001, 0.471, 0.286, 0.408, 1.265], [1.238, -1.52, 0.419, 0.599, 3.184, 1.176], [1.553, -3.177, 0.653, 0.32, 0.427, 1.885], [1.383, -3.363, 1.432, 0.865, -0.009, 1.444], [1.288, -3.498, 1.769, -0.257, 0.218, 1.054], [0.393, -3.522, 1.337, 3.619, 0.242, 1.594], [-1.576, -3.113, 0.753, 0.379, 1.777, 1.195], 
[-0.268, 3.894, 0.852, 2.983, 0.721, 1.393], [1.465, 3.133, 0.435, 0.617, 0.63, 1.96]]\nD: [[-1.946, 1.454, 1.304, 0.285, 4.759, 1.584], [1.735, 2.118, 1.431, 0.5, 3.38, 2.198], [2.01, 0.269, 1.406, 0.118, -0.362, 2.255], [1.665, -0.294, 0.623, -0.295, 0.208, 2.363], [1.918, 0.112, 1.078, 0.599, 0.597, 0.896], [1.655, -1.698, 0.75, 0.063, 2.896, 0.441], [2.382, -2.981, 1.161, 0.203, 0.379, 1.162], [1.828, -2.97, 0.979, 0.706, -0.194, 1.801], [1.717, -3.159, 1.188, 0.204, 0.385, 0.448], [0.303, -3.389, 1.008, 3.649, 0.715, 1.331], [-1.467, -2.443, 0.641, 0.545, 0.903, 1.371], [-0.151, 3.761, 0.508, 3.288, 0.802, 1.225], [1.797, 3.579, 1.179, 0.009, 0.008, 1.708]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_77_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_77_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.793, 1.247, 0.29, 0.296, 0.279, -0.014], [1.019, 0.024, 1.569, 0.553, 0.236, 0.679]]\nB: [[-0.837, 1.73, 0.172, 0.311, 0.446, 0.446], [0.579, -0.45, 1.284, 0.394, 0.372, 0.858]]\nC: [[-0.983, 2.19, 0.493, -0.03, 0.329, 0.928], [0.864, -0.587, 1.773, 0.118, 0.794, 0.799]]\nD: [[-0.553, 2.216, 0.459, 0.267, 0.459, 0.522], [0.806, 0.026, 1.267, 0.403, 0.702, 0.558]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the clothes in the scene. The camera pose information includes: the rotation matrix: [[-0.187285, -0.627824, 0.755488], [-0.982305, 0.118515, -0.145025], [0.001514, -0.76928, -0.63891]]; the translation vector: [1.001752, 1.17634, 1.437838], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.793, 1.247, 0.29, 0.296, 0.279, -0.014], [1.019, 0.024, 1.569, 0.553, 0.236, 0.679]]\nB: [[-0.837, 1.73, 0.172, 0.311, 0.446, 0.446], [0.579, -0.45, 1.284, 0.394, 0.372, 0.858]]\nC: [[-0.983, 2.19, 0.493, -0.03, 0.329, 0.928], [0.864, -0.587, 1.773, 0.118, 0.794, 0.799]]\nD: [[-0.553, 2.216, 0.459, 0.267, 0.459, 0.522], [0.806, 0.026, 1.267, 0.403, 0.702, 0.558]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_78_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_78_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.858, -1.05, -0.049, 0.631, 1.092, 1.022], [0.969, 2.457, 0.703, 0.33, 0.355, 0.535], [1.292, 0.687, 0.943, 0.724, 0.324, 1.126], [1.537, -0.024, 0.37, 0.738, 0.769, 0.806], [2.91, -1.195, 1.375, 0.242, 1.166, 0.582], [2.799, -1.708, 0.863, 0.877, 0.364, 0.812], [2.158, -1.992, 0.634, 0.411, 0.065, 1.19], [-2.861, 0.973, 1.098, 0.744, 0.232, 0.595], [-3.055, 1.702, 0.901, 0.639, 0.173, 0.718], [3.451, -0.934, 1.096, 0.502, 0.89, 0.387]]\nB: [[-2.77, -0.712, 0.41, 0.782, 0.713, 0.859], [1.367, 2.116, 0.842, 0.257, 0.504, 0.248], [1.716, 0.519, 0.519, 0.661, 0.573, 0.903], [1.577, -0.324, 0.811, 0.462, 0.54, 0.431], [3.037, -1.452, 0.953, 0.581, 0.687, 0.531], [2.669, -1.872, 0.986, 0.552, 0.48, 0.568], [2.211, -1.887, 0.725, 0.677, 0.554, 1.018], [-2.956, 0.672, 0.826, 0.436, 0.319, 0.465], [-2.626, 1.651, 0.53, 0.537, 0.47, 0.924], [2.995, -0.435, 0.615, 0.566, 0.706, 0.886]]\nC: [[-2.925, -0.243, 0.295, 0.519, 0.44, 0.711], [1.485, 
1.766, 1.018, 0.081, 0.848, 0.483], [1.717, 0.68, 0.214, 0.236, 1.037, 0.434], [1.205, -0.323, 1.125, 0.097, 0.642, 0.242], [3.189, -1.068, 0.599, 0.36, 1.144, 0.939], [2.418, -1.941, 1.167, 0.598, 0.698, 0.702], [1.723, -2.159, 0.821, 0.484, 0.884, 0.696], [-3.03, 0.47, 1.025, 0.789, 0.045, 0.278], [-2.913, 1.461, 0.819, 0.202, 0.085, 1.03], [2.826, -0.221, 0.951, 0.339, 0.752, 1.266]]\nD: [[-3.135, -0.575, -0.082, 0.411, 0.399, 1.112], [1.76, 1.636, 0.661, -0.118, 0.316, 0.196], [2.067, 0.976, 0.67, 0.22, 0.315, 1.158], [1.439, -0.283, 0.584, 0.087, 0.218, 0.206], [2.848, -1.357, 1.295, 0.653, 0.266, 0.059], [2.99, -1.86, 1.333, 0.578, 0.108, 0.112], [2.118, -1.567, 1.178, 0.323, 0.289, 0.96], [-3.43, 1.005, 1.071, 0.331, 0.71, 0.959], [-3.114, 1.972, 0.571, 0.075, 0.864, 0.441], [2.987, 0.022, 0.923, 0.173, 0.274, 0.482]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[0.515401, -0.339121, 0.786994], [-0.847541, -0.337435, 0.40965], [0.126638, -0.878143, -0.461333]]; the translation vector: [4.776819, 1.138867, 1.280463], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.858, -1.05, -0.049, 0.631, 1.092, 1.022], [0.969, 2.457, 0.703, 0.33, 0.355, 0.535], [1.292, 0.687, 0.943, 0.724, 0.324, 1.126], [1.537, -0.024, 0.37, 0.738, 0.769, 0.806], [2.91, -1.195, 1.375, 0.242, 1.166, 0.582], [2.799, -1.708, 0.863, 0.877, 0.364, 0.812], [2.158, -1.992, 0.634, 0.411, 0.065, 1.19], [-2.861, 0.973, 1.098, 0.744, 0.232, 0.595], [-3.055, 1.702, 0.901, 0.639, 0.173, 0.718], [3.451, -0.934, 1.096, 0.502, 0.89, 0.387]]\nB: [[-2.77, -0.712, 0.41, 0.782, 0.713, 0.859], [1.367, 2.116, 0.842, 0.257, 0.504, 0.248], [1.716, 0.519, 0.519, 0.661, 0.573, 0.903], [1.577, -0.324, 0.811, 0.462, 0.54, 0.431], [3.037, -1.452, 0.953, 0.581, 0.687, 0.531], [2.669, -1.872, 0.986, 0.552, 0.48, 0.568], [2.211, -1.887, 0.725, 0.677, 0.554, 1.018], [-2.956, 0.672, 0.826, 0.436, 0.319, 0.465], [-2.626, 1.651, 0.53, 0.537, 0.47, 0.924], [2.995, -0.435, 0.615, 0.566, 0.706, 0.886]]\nC: [[-2.925, -0.243, 0.295, 0.519, 0.44, 0.711], [1.485, 1.766, 1.018, 0.081, 0.848, 0.483], [1.717, 0.68, 0.214, 0.236, 1.037, 0.434], [1.205, -0.323, 1.125, 0.097, 0.642, 0.242], [3.189, -1.068, 0.599, 0.36, 1.144, 0.939], [2.418, -1.941, 1.167, 0.598, 0.698, 0.702], [1.723, -2.159, 0.821, 0.484, 0.884, 0.696], [-3.03, 0.47, 1.025, 0.789, 0.045, 0.278], [-2.913, 1.461, 0.819, 0.202, 0.085, 1.03], [2.826, -0.221, 0.951, 0.339, 0.752, 1.266]]\nD: [[-3.135, -0.575, -0.082, 0.411, 0.399, 1.112], [1.76, 1.636, 0.661, -0.118, 0.316, 0.196], [2.067, 0.976, 0.67, 0.22, 0.315, 1.158], [1.439, -0.283, 0.584, 0.087, 0.218, 0.206], [2.848, -1.357, 1.295, 0.653, 0.266, 0.059], [2.99, -1.86, 1.333, 0.578, 0.108, 0.112], [2.118, -1.567, 1.178, 0.323, 0.289, 0.96], [-3.43, 1.005, 1.071, 0.331, 0.71, 0.959], [-3.114, 1.972, 0.571, 0.075, 0.864, 0.441], [2.987, 0.022, 0.923, 0.173, 0.274, 0.482]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_79_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_79_1.png"], 
"output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.456, -1.689, 0.986, 1.238, 0.639, 1.143], [-1.189, -1.791, 0.864, 0.611, 1.379, 1.148]]\nB: [[1.709, -1.624, 1.232, 0.531, 0.409, 1.263], [-0.537, -2.035, 0.965, 0.162, 1.273, 1.399]]\nC: [[1.92, -1.614, 0.415, 0.301, 0.956, 1.133], [-0.648, -1.783, 0.191, 0.47, 1.3, 1.09]]\nD: [[1.863, -1.557, 0.74, 0.792, 0.462, 1.459], [-0.873, -1.717, 0.611, 0.169, 1.157, 1.341]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. The camera pose information includes: the rotation matrix: [[0.348231, 0.123124, -0.929288], [0.936413, -1.6e-05, 0.350899], [0.043189, -0.992391, -0.1153]]; the translation vector: [2.712005, 2.075202, 1.464169], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.456, -1.689, 0.986, 1.238, 0.639, 1.143], [-1.189, -1.791, 0.864, 0.611, 1.379, 1.148]]\nB: [[1.709, -1.624, 1.232, 0.531, 0.409, 1.263], [-0.537, -2.035, 0.965, 0.162, 1.273, 1.399]]\nC: [[1.92, -1.614, 0.415, 0.301, 0.956, 1.133], [-0.648, -1.783, 0.191, 0.47, 1.3, 1.09]]\nD: [[1.863, -1.557, 0.74, 0.792, 0.462, 1.459], [-0.873, -1.717, 0.611, 0.169, 1.157, 1.341]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_80_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_80_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.91, 0.435, 1.317, 0.162, -0.028, 0.372], [-1.612, 0.781, 1.119, -0.027, 0.477, 0.779], [-0.879, 0.442, 1.028, 0.015, 0.023, 0.191], [-1.689, 1.721, 1.33, 0.202, 0.203, 0.899]]\nB: [[-1.22, 0.565, 1.527, 0.13, 0.316, 0.334], [-1.214, 0.573, 1.041, 0.138, 0.311, 0.395], [-1.241, 0.926, 1.496, 0.134, 0.334, 0.376], [-1.254, 1.276, 1.499, 0.14, 0.375, 0.407]]\nC: [[-0.897, 0.321, 1.25, -0.192, -0.085, 0.628], [-1.027, 0.54, 0.746, 0.155, 0.593, 0.872], [-1.661, 1.141, 1.852, 0.038, 0.687, 0.36], [-1.716, 1.739, 1.744, 0.171, 0.366, 0.735]]\nD: [[-0.881, 0.818, 1.879, -0.183, 0.463, 0.205], [-0.767, 0.607, 0.616, 0.203, 0.246, 0.191], [-0.822, 0.77, 1.534, -0.248, 0.163, 0.71], [-1.508, 0.961, 1.625, -0.148, 0.39, 0.839]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the rack in the scene. The camera pose information includes: the rotation matrix: [[-0.937403, 0.174354, -0.301457], [0.34768, 0.517889, -0.781607], [0.019845, -0.837491, -0.54609]]; the translation vector: [1.513881, 1.499843, 1.388066], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.91, 0.435, 1.317, 0.162, -0.028, 0.372], [-1.612, 0.781, 1.119, -0.027, 0.477, 0.779], [-0.879, 0.442, 1.028, 0.015, 0.023, 0.191], [-1.689, 1.721, 1.33, 0.202, 0.203, 0.899]]\nB: [[-1.22, 0.565, 1.527, 0.13, 0.316, 0.334], [-1.214, 0.573, 1.041, 0.138, 0.311, 0.395], [-1.241, 0.926, 1.496, 0.134, 0.334, 0.376], [-1.254, 1.276, 1.499, 0.14, 0.375, 0.407]]\nC: [[-0.897, 0.321, 1.25, -0.192, -0.085, 0.628], [-1.027, 0.54, 0.746, 0.155, 0.593, 0.872], [-1.661, 1.141, 1.852, 0.038, 0.687, 0.36], [-1.716, 1.739, 1.744, 0.171, 0.366, 0.735]]\nD: [[-0.881, 0.818, 1.879, -0.183, 0.463, 0.205], [-0.767, 0.607, 0.616, 0.203, 0.246, 0.191], [-0.822, 0.77, 1.534, -0.248, 0.163, 0.71], [-1.508, 0.961, 1.625, -0.148, 0.39, 0.839]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_81_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_81_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.79, -0.98, 1.163, 0.352, 0.978, 2.049]]\nB: [[1.303, -0.943, 0.81, 0.085, 1.431, 2.157]]\nC: [[0.918, -1.038, 0.78, -0.022, 1.276, 1.887]]\nD: [[1.132, -1.26, 0.803, 0.268, 1.192, 2.17]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. 
The camera pose information includes: the rotation matrix: [[-0.15851, 0.420096, -0.893529], [0.981106, -0.034663, -0.190342], [-0.110934, -0.906817, -0.406664]]; the translation vector: [4.004256, 0.910349, 2.578562], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.79, -0.98, 1.163, 0.352, 0.978, 2.049]]\nB: [[1.303, -0.943, 0.81, 0.085, 1.431, 2.157]]\nC: [[0.918, -1.038, 0.78, -0.022, 1.276, 1.887]]\nD: [[1.132, -1.26, 0.803, 0.268, 1.192, 2.17]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_82_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_82_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.027, 0.959, 0.116, 0.065, 0.01, 0.668], [-1.502, 1.757, 0.887, 0.168, -0.298, 0.404], [-1.464, 1.693, 1.349, 0.508, -0.033, 0.751], [-1.515, 1.819, 1.174, 0.619, -0.056, 0.931], [-1.32, 1.579, 1.138, 0.221, -0.036, 0.586]]\nB: [[-1.555, 1.321, -0.005, -0.086, -0.066, 0.68], [-1.503, 1.647, 1.497, 0.094, 0.629, 0.772], [-1.545, 2.057, 0.682, 0.091, -0.365, -0.177], [-1.817, 2.125, 0.639, 0.421, 0.176, 0.148], [-2.148, 2.167, 0.268, 0.654, -0.085, 0.81]]\nC: [[-1.921, 0.926, 0.476, 0.205, 0.401, 1.004], [-1.317, 1.461, 1.183, 0.482, -0.087, -0.114], [-0.981, 1.858, 0.937, -0.085, -0.01, 0.117], [-1.804, 1.654, 1.126, 0.091, 0.345, 0.125], [-2.134, 1.498, 0.297, 0.016, 0.463, 0.232]]\nD: [[-2.011, 1.284, 
0.385, 0.186, 0.39, 0.566], [-1.266, 1.943, 1.101, 0.313, 0.196, 0.371], [-1.224, 1.994, 0.869, 0.351, 0.116, 0.277], [-1.583, 1.923, 1.035, 0.381, 0.288, 0.498], [-1.707, 1.925, 0.764, 0.426, 0.259, 0.583]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the bag in the scene. The camera pose information includes: the rotation matrix: [[0.82141, -0.124481, 0.556588], [-0.562763, -0.33543, 0.755503], [0.092651, -0.933805, -0.345579]]; the translation vector: [1.795382, 2.457259, 1.379582], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.027, 0.959, 0.116, 0.065, 0.01, 0.668], [-1.502, 1.757, 0.887, 0.168, -0.298, 0.404], [-1.464, 1.693, 1.349, 0.508, -0.033, 0.751], [-1.515, 1.819, 1.174, 0.619, -0.056, 0.931], [-1.32, 1.579, 1.138, 0.221, -0.036, 0.586]]\nB: [[-1.555, 1.321, -0.005, -0.086, -0.066, 0.68], [-1.503, 1.647, 1.497, 0.094, 0.629, 0.772], [-1.545, 2.057, 0.682, 0.091, -0.365, -0.177], [-1.817, 2.125, 0.639, 0.421, 0.176, 0.148], [-2.148, 2.167, 0.268, 0.654, -0.085, 0.81]]\nC: [[-1.921, 0.926, 0.476, 0.205, 0.401, 1.004], [-1.317, 1.461, 1.183, 0.482, -0.087, -0.114], [-0.981, 1.858, 0.937, -0.085, -0.01, 0.117], [-1.804, 1.654, 1.126, 0.091, 0.345, 0.125], [-2.134, 1.498, 0.297, 0.016, 0.463, 0.232]]\nD: [[-2.011, 1.284, 0.385, 0.186, 0.39, 0.566], [-1.266, 1.943, 1.101, 0.313, 0.196, 0.371], [-1.224, 1.994, 0.869, 0.351, 0.116, 0.277], [-1.583, 1.923, 1.035, 0.381, 0.288, 0.498], [-1.707, 1.925, 0.764, 0.426, 0.259, 0.583]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_83_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_83_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.036, 1.866, 1.489, 0.307, 0.157, 2.451], [-1.162, -0.721, 0.524, 0.061, 0.505, 0.591], [-1.692, -0.087, 1.909, 0.08, 0.301, 0.1], [-1.275, -0.78, -0.299, 0.662, 0.631, -0.319]]\nB: [[-1.306, 1.944, 1.27, 0.242, 0.415, 1.919], [-1.846, -0.095, 0.652, 0.668, -0.011, -0.065], [-1.708, -0.182, 1.324, -0.259, 0.382, 0.757], [-0.989, -0.521, 0.267, 0.114, 0.569, -0.144]]\nC: [[-1.606, 1.5, 1.094, 0.082, 0.444, 2.163], [-1.349, -0.456, 0.266, 0.226, 0.434, 0.139], [-1.295, -0.266, 1.634, 0.118, 0.05, 0.32], [-1.418, -0.408, 0.197, 0.3, 0.329, 0.161]]\nD: [[-1.696, 1.738, 0.967, 0.285, -0.051, 2.413], [-0.922, -0.569, 0.642, 0.33, 0.259, -0.242], [-0.853, -0.408, 2.052, 0.046, 0.488, 
0.615], [-1.165, -0.273, 0.19, 0.107, 0.57, 0.605]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the book in the scene. The camera pose information includes: the rotation matrix: [[0.954506, 0.05554, -0.292973], [0.288831, -0.41644, 0.862064], [-0.074127, -0.907465, -0.413536]]; the translation vector: [2.66447, 1.005586, 1.476015], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-2.036, 1.866, 1.489, 0.307, 0.157, 2.451], [-1.162, -0.721, 0.524, 0.061, 0.505, 0.591], [-1.692, -0.087, 1.909, 0.08, 0.301, 0.1], [-1.275, -0.78, -0.299, 0.662, 0.631, -0.319]]\nB: [[-1.306, 1.944, 1.27, 0.242, 0.415, 1.919], [-1.846, -0.095, 0.652, 0.668, -0.011, -0.065], [-1.708, -0.182, 1.324, -0.259, 0.382, 0.757], [-0.989, -0.521, 0.267, 0.114, 0.569, -0.144]]\nC: [[-1.606, 1.5, 1.094, 0.082, 0.444, 2.163], [-1.349, -0.456, 0.266, 0.226, 0.434, 0.139], [-1.295, -0.266, 1.634, 0.118, 0.05, 0.32], [-1.418, -0.408, 0.197, 0.3, 0.329, 0.161]]\nD: [[-1.696, 1.738, 0.967, 0.285, -0.051, 2.413], [-0.922, -0.569, 0.642, 0.33, 0.259, -0.242], [-0.853, -0.408, 2.052, 0.046, 0.488, 0.615], [-1.165, -0.273, 0.19, 0.107, 0.57, 0.605]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_84_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_84_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: 
[[0.105, -2.287, 1.298, 5.216, 0.87, 2.807], [1.522, -1.359, 1.295, 0.692, 3.211, 2.519], [-1.482, 0.95, 0.954, -0.118, 3.651, 2.351], [2.186, 1.699, 1.104, -0.162, 3.237, 1.812], [-2.278, -2.05, 0.889, 0.84, 1.577, 2.045], [1.673, 0.416, 0.493, 2.003, 0.505, 2.036], [0.961, 0.497, 0.688, -0.227, 0.745, 1.241], [-2.744, -1.281, 0.568, 1.033, 0.219, 1.748], [-1.203, 3.13, 0.749, 0.354, 1.039, 2.281], [0.453, 3.938, 1.358, 3.143, -0.207, 1.101], [1.577, -0.17, 0.647, 1.504, 0.197, 0.91], [0.55, 0.958, 0.74, -0.022, 0.508, 1.915], [-0.28, -2.346, 2.125, 4.551, 0.291, 0.411], [-2.245, 3.374, 2.167, 0.315, 0.57, 1.021], [-1.883, 4.031, 0.791, 0.575, 0.114, 1.338]]\nB: [[-0.045, -2.933, 0.729, 5.273, 0.38, 1.87], [1.815, -0.872, 1.2, 0.381, 3.199, 2.451], [-1.835, 0.525, 0.883, 0.663, 4.217, 2.299], [1.852, 1.563, 0.117, 0.236, 3.143, 0.945], [-2.89, -2.063, 0.836, 0.206, 1.672, 2.238], [1.326, 0.326, 0.98, 1.535, 0.601, 1.665], [0.245, 0.603, 0.825, -0.004, 1.064, 1.817], [-1.862, -0.49, 1.467, 1.026, 0.012, 1.363], [-1.379, 3.112, 1.213, 0.486, 0.543, 1.682], [0.858, 3.952, 1.318, 3.11, 0.53, 1.733], [1.684, 0.251, 1.226, 1.531, 0.586, 0.576], [0.354, 1.015, 0.82, 0.415, 0.222, 1.857], [-0.273, -2.214, 2.258, 4.248, 0.77, 0.29], [-2.467, 2.65, 1.797, 0.149, 0.618, 1.025], [-2.164, 4.048, 1.03, 0.675, 0.141, 1.166]]\nC: [[-0.372, -2.705, 1.171, 4.784, 0.513, 2.321], [1.95, -1.245, 1.075, 0.31, 2.969, 2.221], [-1.736, 0.974, 1.065, 0.251, 4.086, 2.141], [2.079, 1.895, 0.614, 0.176, 3.361, 1.364], [-2.712, -1.857, 1.256, 0.344, 1.764, 2.405], [1.315, 0.187, 0.814, 1.511, 0.109, 1.639], [0.561, 0.619, 0.751, 0.075, 0.858, 1.522], [-2.331, -0.947, 0.996, 1.029, 0.105, 1.863], [-0.884, 3.244, 0.946, 0.263, 0.884, 1.941], [0.617, 3.626, 1.612, 2.853, 0.244, 1.318], [1.37, 0.273, 0.995, 1.377, 0.117, 0.425], [0.697, 0.781, 1.082, 0.286, 0.193, 2.222], [-0.516, -2.505, 2.299, 4.488, 0.335, 0.203], [-2.543, 2.889, 1.67, 0.173, 0.883, 0.685], [-1.977, 3.626, 1.246, 0.551, 0.106, 
1.503]]\nD: [[0.035, -2.86, 0.861, 4.728, 0.233, 1.918], [1.546, -1.395, 0.841, 0.548, 3.249, 1.737], [-1.897, 0.52, 1.01, -0.225, 4.45, 2.412], [1.773, 2.047, 0.256, 0.066, 3.328, 1.231], [-3.131, -2.108, 0.842, 0.6, 1.535, 2.741], [1.218, -0.29, 0.461, 1.245, -0.153, 2.098], [0.922, 0.65, 1.084, -0.181, 0.59, 1.506], [-2.276, -0.909, 0.599, 1.207, -0.285, 1.54], [-0.665, 3.431, 1.123, 0.223, 0.621, 1.641], [0.797, 3.806, 2.013, 2.472, 0.677, 1.495], [1.111, 0.293, 1.457, 1.431, 0.551, 0.85], [0.877, 1.185, 1.451, 0.625, -0.09, 2.43], [-0.138, -2.632, 2.484, 4.711, -0.137, 0.648], [-3.024, 2.792, 1.538, -0.201, 1.018, 0.323], [-2.319, 3.937, 1.522, 0.199, 0.289, 1.095]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.99336, -0.011945, -0.114427], [0.103059, -0.349694, 0.931178], [-0.051137, -0.936788, -0.346141]]; the translation vector: [2.948285, 4.432959, 1.460427], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.105, -2.287, 1.298, 5.216, 0.87, 2.807], [1.522, -1.359, 1.295, 0.692, 3.211, 2.519], [-1.482, 0.95, 0.954, -0.118, 3.651, 2.351], [2.186, 1.699, 1.104, -0.162, 3.237, 1.812], [-2.278, -2.05, 0.889, 0.84, 1.577, 2.045], [1.673, 0.416, 0.493, 2.003, 0.505, 2.036], [0.961, 0.497, 0.688, -0.227, 0.745, 1.241], [-2.744, -1.281, 0.568, 1.033, 0.219, 1.748], [-1.203, 3.13, 0.749, 0.354, 1.039, 2.281], [0.453, 3.938, 1.358, 3.143, -0.207, 1.101], [1.577, -0.17, 0.647, 1.504, 0.197, 0.91], [0.55, 0.958, 0.74, -0.022, 0.508, 1.915], [-0.28, -2.346, 2.125, 4.551, 0.291, 0.411], [-2.245, 3.374, 2.167, 0.315, 0.57, 1.021], [-1.883, 4.031, 0.791, 0.575, 0.114, 1.338]]\nB: [[-0.045, -2.933, 0.729, 5.273, 0.38, 1.87], [1.815, -0.872, 1.2, 0.381, 3.199, 2.451], [-1.835, 0.525, 0.883, 0.663, 4.217, 2.299], [1.852, 1.563, 0.117, 0.236, 3.143, 0.945], [-2.89, -2.063, 0.836, 0.206, 1.672, 2.238], [1.326, 0.326, 0.98, 1.535, 0.601, 1.665], [0.245, 0.603, 0.825, -0.004, 1.064, 1.817], [-1.862, -0.49, 1.467, 1.026, 0.012, 1.363], [-1.379, 3.112, 1.213, 0.486, 0.543, 1.682], [0.858, 3.952, 1.318, 3.11, 0.53, 1.733], [1.684, 0.251, 1.226, 1.531, 0.586, 0.576], [0.354, 1.015, 0.82, 0.415, 0.222, 1.857], [-0.273, -2.214, 2.258, 4.248, 0.77, 0.29], [-2.467, 2.65, 1.797, 0.149, 0.618, 1.025], [-2.164, 4.048, 1.03, 0.675, 0.141, 1.166]]\nC: [[-0.372, -2.705, 1.171, 4.784, 0.513, 2.321], [1.95, -1.245, 1.075, 0.31, 2.969, 2.221], [-1.736, 0.974, 1.065, 0.251, 4.086, 2.141], [2.079, 1.895, 0.614, 0.176, 3.361, 1.364], [-2.712, -1.857, 1.256, 0.344, 1.764, 2.405], [1.315, 0.187, 0.814, 1.511, 0.109, 1.639], [0.561, 0.619, 0.751, 0.075, 0.858, 1.522], [-2.331, -0.947, 0.996, 1.029, 0.105, 1.863], [-0.884, 3.244, 0.946, 0.263, 0.884, 1.941], [0.617, 3.626, 1.612, 2.853, 0.244, 1.318], [1.37, 0.273, 0.995, 1.377, 0.117, 0.425], [0.697, 0.781, 1.082, 0.286, 0.193, 2.222], [-0.516, -2.505, 2.299, 4.488, 0.335, 0.203], [-2.543, 2.889, 1.67, 0.173, 0.883, 
0.685], [-1.977, 3.626, 1.246, 0.551, 0.106, 1.503]]\nD: [[0.035, -2.86, 0.861, 4.728, 0.233, 1.918], [1.546, -1.395, 0.841, 0.548, 3.249, 1.737], [-1.897, 0.52, 1.01, -0.225, 4.45, 2.412], [1.773, 2.047, 0.256, 0.066, 3.328, 1.231], [-3.131, -2.108, 0.842, 0.6, 1.535, 2.741], [1.218, -0.29, 0.461, 1.245, -0.153, 2.098], [0.922, 0.65, 1.084, -0.181, 0.59, 1.506], [-2.276, -0.909, 0.599, 1.207, -0.285, 1.54], [-0.665, 3.431, 1.123, 0.223, 0.621, 1.641], [0.797, 3.806, 2.013, 2.472, 0.677, 1.495], [1.111, 0.293, 1.457, 1.431, 0.551, 0.85], [0.877, 1.185, 1.451, 0.625, -0.09, 2.43], [-0.138, -2.632, 2.484, 4.711, -0.137, 0.648], [-3.024, 2.792, 1.538, -0.201, 1.018, 0.323], [-2.319, 3.937, 1.522, 0.199, 0.289, 1.095]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_85_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_85_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.057, -0.804, 1.454, 0.442, 9.194, 2.993], [-0.24, 3.939, 1.662, 3.88, 0.819, 2.915], [1.686, 1.778, 1.614, 0.375, 4.14, 2.879], [1.518, -0.292, 1.39, 0.502, 0.183, 1.29], [1.407, -0.606, 1.045, 0.392, 0.791, 2.018], [1.569, -2.479, 1.01, 0.478, 3.37, 1.883]]\nB: [[-1.712, -0.852, 0.991, 0.707, 9.653, 3.103], [0.145, 4.4, 1.88, 4.062, 1.231, 2.667], [1.473, 2.151, 1.876, 0.37, 4.413, 3.184], [1.75, -0.649, 1.384, 0.602, -0.213, 1.435], [1.139, -0.573, 1.304, 0.885, 0.718, 2.242], [1.077, -2.453, 0.735, 0.583, 3.786, 1.438]]\nC: [[-1.586, -0.802, 1.264, 0.785, 8.752, 2.813], [-0.226, 4.305, 1.323, 3.698, 1.086, 3.015], [1.969, 1.342, 1.623, -0.075, 3.888, 3.299], [1.213, -0.465, 1.751, 0.015, 0.594, 1.001], [0.993, -0.822, 1.254, 0.504, 1.181, 1.943], [1.069, -2.03, 1.336, 0.651, 3.224, 1.602]]\nD: [[-2.191, -0.396, 1.663, 0.009, 8.751, 3.114], [0.038, 3.888, 1.488, 4.056, 0.477, 3.26], [2.082, 1.991, 1.998, 
-0.123, 3.891, 2.467], [1.903, -0.079, 0.895, 0.439, 0.291, 0.791], [1.022, -0.776, 0.73, 0.121, 0.449, 1.843], [1.7, -2.034, 1.291, 0.089, 3.481, 2.087]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.908726, 0.150598, -0.389277], [0.406624, 0.108936, -0.907078], [-0.094198, -0.982575, -0.16023]]; the translation vector: [8.822721, 3.830595, 1.476402], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-2.057, -0.804, 1.454, 0.442, 9.194, 2.993], [-0.24, 3.939, 1.662, 3.88, 0.819, 2.915], [1.686, 1.778, 1.614, 0.375, 4.14, 2.879], [1.518, -0.292, 1.39, 0.502, 0.183, 1.29], [1.407, -0.606, 1.045, 0.392, 0.791, 2.018], [1.569, -2.479, 1.01, 0.478, 3.37, 1.883]]\nB: [[-1.712, -0.852, 0.991, 0.707, 9.653, 3.103], [0.145, 4.4, 1.88, 4.062, 1.231, 2.667], [1.473, 2.151, 1.876, 0.37, 4.413, 3.184], [1.75, -0.649, 1.384, 0.602, -0.213, 1.435], [1.139, -0.573, 1.304, 0.885, 0.718, 2.242], [1.077, -2.453, 0.735, 0.583, 3.786, 1.438]]\nC: [[-1.586, -0.802, 1.264, 0.785, 8.752, 2.813], [-0.226, 4.305, 1.323, 3.698, 1.086, 3.015], [1.969, 1.342, 1.623, -0.075, 3.888, 3.299], [1.213, -0.465, 1.751, 0.015, 0.594, 1.001], [0.993, -0.822, 1.254, 0.504, 1.181, 1.943], [1.069, -2.03, 1.336, 0.651, 3.224, 1.602]]\nD: [[-2.191, -0.396, 1.663, 0.009, 8.751, 3.114], [0.038, 3.888, 1.488, 4.056, 0.477, 3.26], [2.082, 1.991, 1.998, -0.123, 3.891, 2.467], [1.903, -0.079, 0.895, 0.439, 0.291, 
0.791], [1.022, -0.776, 0.73, 0.121, 0.449, 1.843], [1.7, -2.034, 1.291, 0.089, 3.481, 2.087]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_86_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_86_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.406, -0.499, 0.127, 1.547, 0.014, 1.403], [1.954, 0.044, 0.911, 0.502, 4.499, 1.21], [0.82, -1.82, 0.463, 3.057, 0.123, 1.62], [-1.403, -1.396, 1.129, 0.78, 1.058, 1.608], [-1.077, 2.261, 0.337, 0.558, -0.028, 0.985]]\nB: [[-1.494, -0.876, 0.44, 1.936, 0.339, 1.286], [1.687, 0.61, 0.241, 0.363, 4.812, 0.869], [0.73, -1.681, 0.677, 2.541, -0.283, 1.252], [-1.274, -1.409, 0.377, 0.698, 1.296, 0.85], [-1.137, 2.544, 0.383, 0.033, 0.761, 1.018]]\nC: [[-2.268, -0.745, 0.912, 1.628, 0.264, 0.904], [1.958, 0.486, 0.503, -0.211, 4.549, 1.566], [-0.072, -2.097, 0.667, 2.893, 0.559, 1.549], [-0.724, -1.085, 0.723, 0.553, 1.77, 1.227], [-0.926, 2.697, 1.076, 0.821, 0.34, 1.204]]\nD: [[-1.791, -0.394, 0.511, 1.684, 0.11, 0.995], [1.786, 0.422, 0.664, 0.168, 4.577, 1.321], [0.388, -1.877, 0.715, 2.729, 0.147, 1.176], [-1.044, -1.126, 0.648, 0.431, 1.546, 1.277], [-1.081, 2.203, 0.66, 0.386, 0.403, 0.882]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.997112, 0.02462, 0.071841], [-0.04661, 0.548461, -0.834876], [-0.059957, -0.835814, -0.545729]]; the translation vector: [4.834615, 3.436689, 1.398379], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.406, -0.499, 0.127, 1.547, 0.014, 1.403], [1.954, 0.044, 0.911, 0.502, 4.499, 1.21], [0.82, -1.82, 0.463, 3.057, 0.123, 1.62], [-1.403, -1.396, 1.129, 0.78, 1.058, 1.608], [-1.077, 2.261, 0.337, 0.558, -0.028, 0.985]]\nB: [[-1.494, -0.876, 0.44, 1.936, 0.339, 1.286], [1.687, 0.61, 0.241, 0.363, 4.812, 0.869], [0.73, -1.681, 0.677, 2.541, -0.283, 1.252], [-1.274, -1.409, 0.377, 0.698, 1.296, 0.85], [-1.137, 2.544, 0.383, 0.033, 0.761, 1.018]]\nC: [[-2.268, -0.745, 0.912, 1.628, 0.264, 0.904], [1.958, 0.486, 0.503, -0.211, 4.549, 1.566], [-0.072, -2.097, 0.667, 2.893, 0.559, 1.549], [-0.724, -1.085, 0.723, 0.553, 1.77, 1.227], [-0.926, 2.697, 1.076, 0.821, 0.34, 1.204]]\nD: [[-1.791, -0.394, 0.511, 1.684, 0.11, 0.995], [1.786, 0.422, 0.664, 0.168, 4.577, 1.321], [0.388, -1.877, 0.715, 2.729, 0.147, 1.176], [-1.044, -1.126, 0.648, 0.431, 1.546, 1.277], [-1.081, 2.203, 0.66, 0.386, 0.403, 0.882]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_87_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_87_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.113, 0.087, 0.564, 0.343, 0.527, 0.305], [0.507, 0.467, 0.458, 0.596, 0.504, 0.317], [0.58, 0.988, 0.644, 0.601, 0.651, 0.477], [0.182, 1.04, 0.677, 0.777, 0.505, 0.512], [1.732, 0.733, 0.527, 0.634, 0.573, 0.263], [1.609, 1.049, 0.659, 0.686, 0.387, 0.426]]\nB: [[0.129, 0.521, 0.187, 0.313, 0.856, 0.592], [0.981, 0.918, 0.313, 0.429, 0.812, 0.551], [0.233, 0.816, 0.228, 0.26, 0.574, 0.165], [-0.257, 0.76, 
1.031, 0.337, 0.304, 1.005], [1.703, 1.1, 0.991, 1.058, 0.84, 0.596], [1.167, 0.943, 0.538, 0.487, 0.187, 0.143]]\nC: [[0.577, 0.356, 0.8, 0.107, 0.25, -0.032], [0.156, 0.937, 0.399, 0.676, 0.726, 0.633], [0.215, 0.658, 0.629, 0.763, 0.937, 0.472], [0.377, 0.594, 0.698, 1.038, 0.047, 0.378], [1.421, 1.109, 0.213, 0.954, 0.857, -0.124], [1.144, 1.512, 0.746, 0.326, 0.254, -0.001]]\nD: [[-0.375, 0.568, 0.757, 0.525, 0.71, 0.684], [0.596, 0.141, 0.679, 0.896, 0.714, 0.623], [0.506, 1.007, 0.844, 0.63, 0.899, 0.696], [-0.281, 1.187, 1.15, 1.186, 0.539, 1.005], [1.823, 0.702, 0.5, 0.724, 0.202, 0.553], [1.882, 1.516, 0.881, 1.085, 0.712, 0.444]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the pillow in the scene. The camera pose information includes: the rotation matrix: [[-0.971613, -0.06682, 0.226943], [-0.235147, 0.378036, -0.89543], [-0.02596, -0.923376, -0.383017]]; the translation vector: [2.775299, 4.618156, 1.427592], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.113, 0.087, 0.564, 0.343, 0.527, 0.305], [0.507, 0.467, 0.458, 0.596, 0.504, 0.317], [0.58, 0.988, 0.644, 0.601, 0.651, 0.477], [0.182, 1.04, 0.677, 0.777, 0.505, 0.512], [1.732, 0.733, 0.527, 0.634, 0.573, 0.263], [1.609, 1.049, 0.659, 0.686, 0.387, 0.426]]\nB: [[0.129, 0.521, 0.187, 0.313, 0.856, 0.592], [0.981, 0.918, 0.313, 0.429, 0.812, 0.551], [0.233, 0.816, 0.228, 0.26, 0.574, 0.165], [-0.257, 0.76, 1.031, 0.337, 0.304, 1.005], [1.703, 1.1, 0.991, 1.058, 0.84, 0.596], [1.167, 0.943, 0.538, 0.487, 0.187, 0.143]]\nC: [[0.577, 0.356, 0.8, 0.107, 0.25, -0.032], [0.156, 0.937, 0.399, 0.676, 0.726, 0.633], [0.215, 0.658, 0.629, 0.763, 0.937, 0.472], [0.377, 0.594, 0.698, 1.038, 0.047, 0.378], [1.421, 1.109, 0.213, 0.954, 0.857, -0.124], [1.144, 1.512, 0.746, 0.326, 0.254, -0.001]]\nD: [[-0.375, 0.568, 0.757, 0.525, 0.71, 0.684], [0.596, 0.141, 0.679, 0.896, 0.714, 0.623], [0.506, 1.007, 0.844, 0.63, 0.899, 0.696], [-0.281, 1.187, 1.15, 1.186, 0.539, 1.005], [1.823, 0.702, 0.5, 0.724, 0.202, 0.553], [1.882, 1.516, 0.881, 1.085, 0.712, 0.444]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_88_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_88_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.081, -2.379, 1.113, 0.296, 0.447, 1.93], [-2.179, -2.328, 1.113, 1.155, -0.055, 2.191], [0.187, -2.593, 1.0, 1.525, -0.011, 1.61]]\nB: [[0.156, -1.688, 0.672, 1.099, 0.512, 1.787], [-1.601, -2.386, 1.059, 0.494, 0.257, 2.331], [1.089, -2.9, 1.408, 0.896, 0.19, 1.392]]\nC: [[-0.153, -1.917, 0.934, 0.637, 0.572, 1.999], [-2.071, -2.511, 0.942, 0.893, 0.199, 2.089], [0.673, -2.564, 1.392, 1.046, 0.131, 1.552]]\nD: [[0.141, -1.904, 0.579, 0.681, 0.731, 2.01], [-1.907, -2.369, 0.924, 0.512, 0.574, 2.053], [0.733, -2.632, 1.326, 0.995, 0.386, 
1.64]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. The camera pose information includes: the rotation matrix: [[-0.086843, 0.425015, -0.901011], [0.995696, 0.066429, -0.064634], [0.032383, -0.902745, -0.428955]]; the translation vector: [4.261571, 5.85756, 1.66629], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.081, -2.379, 1.113, 0.296, 0.447, 1.93], [-2.179, -2.328, 1.113, 1.155, -0.055, 2.191], [0.187, -2.593, 1.0, 1.525, -0.011, 1.61]]\nB: [[0.156, -1.688, 0.672, 1.099, 0.512, 1.787], [-1.601, -2.386, 1.059, 0.494, 0.257, 2.331], [1.089, -2.9, 1.408, 0.896, 0.19, 1.392]]\nC: [[-0.153, -1.917, 0.934, 0.637, 0.572, 1.999], [-2.071, -2.511, 0.942, 0.893, 0.199, 2.089], [0.673, -2.564, 1.392, 1.046, 0.131, 1.552]]\nD: [[0.141, -1.904, 0.579, 0.681, 0.731, 2.01], [-1.907, -2.369, 0.924, 0.512, 0.574, 2.053], [0.733, -2.632, 1.326, 0.995, 0.386, 1.64]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_89_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_89_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.066, -4.092, 0.322, 1.809, 0.144, 0.711], [-0.452, -3.998, 0.399, -0.022, -0.207, 0.313]]\nB: [[0.967, -4.137, 0.415, 1.862, 0.896, 0.809], [-0.541, -3.922, 0.752, 0.892, 0.695, 1.151]]\nC: [[0.859, -4.189, 1.178, 1.424, -0.037, 
1.276], [-0.399, -4.209, 0.397, 0.399, 0.232, 1.02]]\nD: [[0.733, -4.146, 0.771, 1.808, 0.42, 0.818], [-0.752, -4.266, 0.836, 0.396, 0.285, 0.778]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the window in the scene. The camera pose information includes: the rotation matrix: [[0.504428, 0.479717, -0.717931], [0.860003, -0.204862, 0.467362], [0.077124, -0.853173, -0.515896]]; the translation vector: [4.973708, 0.412451, 1.573636], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.066, -4.092, 0.322, 1.809, 0.144, 0.711], [-0.452, -3.998, 0.399, -0.022, -0.207, 0.313]]\nB: [[0.967, -4.137, 0.415, 1.862, 0.896, 0.809], [-0.541, -3.922, 0.752, 0.892, 0.695, 1.151]]\nC: [[0.859, -4.189, 1.178, 1.424, -0.037, 1.276], [-0.399, -4.209, 0.397, 0.399, 0.232, 1.02]]\nD: [[0.733, -4.146, 0.771, 1.808, 0.42, 0.818], [-0.752, -4.266, 0.836, 0.396, 0.285, 0.778]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_90_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_90_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.356, -1.033, 0.487, 1.053, 0.829, 1.337]]\nB: [[-1.861, -0.729, 0.172, 1.066, 1.25, 1.068]]\nC: [[-2.075, -0.604, 0.467, 1.418, 0.63, 1.288]]\nD: [[-2.244, -1.03, 0.539, 1.266, 0.775, 0.934]]", "question": "Given a RGB image and a depth image, please detect the 
3D bounding box of the cabinet in the scene. The camera pose information includes: the rotation matrix: [[-0.132001, -0.567775, 0.812532], [-0.991224, 0.069667, -0.112349], [0.007182, -0.820231, -0.571988]]; the translation vector: [2.407685, 4.450429, 1.359714], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-2.356, -1.033, 0.487, 1.053, 0.829, 1.337]]\nB: [[-1.861, -0.729, 0.172, 1.066, 1.25, 1.068]]\nC: [[-2.075, -0.604, 0.467, 1.418, 0.63, 1.288]]\nD: [[-2.244, -1.03, 0.539, 1.266, 0.775, 0.934]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_91_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_91_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.569, 0.211, 0.319, 0.687, 0.401, 0.55], [-0.378, 2.451, 0.757, 1.108, 0.785, 1.152], [-0.442, -3.047, 0.599, 0.595, 0.53, 0.698], [-0.671, -2.103, 0.492, 0.589, 0.785, 1.436], [-0.536, -2.312, 0.381, 0.676, 0.927, 0.8], [0.694, -2.162, -0.024, 0.318, 0.238, 1.069], [0.8, -2.531, 0.157, 0.887, 0.472, 0.605], [-0.017, 0.764, 0.766, 0.464, 0.143, 1.084]]\nB: [[-0.14, -0.504, 0.958, 0.996, 0.333, 0.616], [-0.523, 2.406, 0.116, 1.014, 1.032, 0.584], [-1.041, -3.534, 0.221, 1.124, 0.509, 0.64], [-1.178, -1.955, 0.316, 0.454, 0.967, 0.762], [-0.074, -2.655, 0.057, 0.407, 0.341, 0.817], [0.498, -1.8, 0.525, 0.171, 1.003, 0.793], [0.349, -2.636, 0.785, 0.651, 
0.822, 0.565], [-0.067, 1.46, 0.267, 0.865, 0.829, 0.524]]\nC: [[0.244, -0.138, 0.489, 0.688, 0.662, 1.02], [-0.663, 2.462, 0.398, 0.618, 0.647, 0.654], [-0.762, -3.211, 0.433, 0.631, 0.73, 0.899], [-0.866, -2.412, 0.459, 0.652, 0.663, 0.995], [-0.182, -2.73, 0.386, 0.664, 0.667, 0.841], [0.386, -2.023, 0.44, 0.586, 0.689, 0.943], [0.543, -2.581, 0.583, 0.445, 0.548, 0.641], [0.339, 1.261, 0.575, 0.571, 0.572, 0.783]]\nD: [[0.09, 0.046, 0.862, 0.335, 0.771, 1.401], [-0.263, 2.607, 0.862, 0.364, 1.092, 0.886], [-1.02, -3.334, 0.931, 1.001, 0.759, 0.875], [-0.888, -2.153, 0.017, 0.223, 0.261, 0.633], [-0.543, -2.555, 0.32, 1.086, 0.816, 0.575], [0.862, -2.2, 0.258, 0.465, 0.987, 0.866], [0.065, -2.865, 0.495, 0.697, 0.945, 0.331], [0.317, 1.592, 1.019, 0.326, 0.876, 0.791]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the office chair in the scene. The camera pose information includes: the rotation matrix: [[0.672393, -0.274439, 0.687438], [-0.739855, -0.221079, 0.635404], [-0.022402, -0.935846, -0.351697]]; the translation vector: [3.802358, 2.110255, 1.494557], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.569, 0.211, 0.319, 0.687, 0.401, 0.55], [-0.378, 2.451, 0.757, 1.108, 0.785, 1.152], [-0.442, -3.047, 0.599, 0.595, 0.53, 0.698], [-0.671, -2.103, 0.492, 0.589, 0.785, 1.436], [-0.536, -2.312, 0.381, 0.676, 0.927, 0.8], [0.694, -2.162, -0.024, 0.318, 0.238, 1.069], [0.8, -2.531, 0.157, 0.887, 0.472, 0.605], [-0.017, 0.764, 0.766, 0.464, 0.143, 1.084]]\nB: [[-0.14, -0.504, 0.958, 0.996, 0.333, 0.616], [-0.523, 2.406, 0.116, 1.014, 1.032, 0.584], [-1.041, -3.534, 0.221, 1.124, 0.509, 0.64], [-1.178, -1.955, 0.316, 0.454, 0.967, 0.762], [-0.074, -2.655, 0.057, 0.407, 0.341, 0.817], [0.498, -1.8, 0.525, 0.171, 1.003, 0.793], [0.349, -2.636, 0.785, 0.651, 0.822, 0.565], [-0.067, 1.46, 0.267, 0.865, 0.829, 0.524]]\nC: [[0.244, -0.138, 0.489, 0.688, 0.662, 1.02], [-0.663, 2.462, 0.398, 0.618, 0.647, 0.654], [-0.762, -3.211, 0.433, 0.631, 0.73, 0.899], [-0.866, -2.412, 0.459, 0.652, 0.663, 0.995], [-0.182, -2.73, 0.386, 0.664, 0.667, 0.841], [0.386, -2.023, 0.44, 0.586, 0.689, 0.943], [0.543, -2.581, 0.583, 0.445, 0.548, 0.641], [0.339, 1.261, 0.575, 0.571, 0.572, 0.783]]\nD: [[0.09, 0.046, 0.862, 0.335, 0.771, 1.401], [-0.263, 2.607, 0.862, 0.364, 1.092, 0.886], [-1.02, -3.334, 0.931, 1.001, 0.759, 0.875], [-0.888, -2.153, 0.017, 0.223, 0.261, 0.633], [-0.543, -2.555, 0.32, 1.086, 0.816, 0.575], [0.862, -2.2, 0.258, 0.465, 0.987, 0.866], [0.065, -2.865, 0.495, 0.697, 0.945, 0.331], [0.317, 1.592, 1.019, 0.326, 0.876, 0.791]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_92_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_92_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.126, -1.376, 0.347, 0.441, 1.044, 0.747]]\nB: [[-0.707, -1.056, 0.436, 0.481, 0.775, 0.862]]\nC: [[-1.072, -0.581, 0.729, 0.634, 0.411, 0.815]]\nD: [[-1.2, -0.714, 
0.073, 0.598, 1.239, 1.356]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the toilet in the scene. The camera pose information includes: the rotation matrix: [[-0.943065, -0.17817, 0.280864], [-0.332105, 0.550897, -0.765649], [-0.018311, -0.815333, -0.578703]]; the translation vector: [2.74599, 1.673222, 1.294065], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.126, -1.376, 0.347, 0.441, 1.044, 0.747]]\nB: [[-0.707, -1.056, 0.436, 0.481, 0.775, 0.862]]\nC: [[-1.072, -0.581, 0.729, 0.634, 0.411, 0.815]]\nD: [[-1.2, -0.714, 0.073, 0.598, 1.239, 1.356]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_93_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_93_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.746, -0.676, 1.548, 0.354, 0.507, 0.554], [1.278, -0.21, 2.039, 0.253, 0.159, 0.277], [1.354, -0.174, 2.085, 0.187, 0.25, 0.284], [1.365, 0.302, 2.07, 0.178, 0.146, 0.195], [1.395, 1.775, 0.709, 0.116, 0.082, 0.239], [0.108, -1.232, 0.61, 0.37, 0.243, 0.232]]\nB: [[-2.116, -0.405, 1.974, 0.197, 0.992, 0.793], [1.595, -0.115, 1.898, 0.53, -0.095, 0.207], [1.74, -0.462, 1.811, 0.459, 0.366, 0.195], [1.756, -0.03, 2.139, 0.506, -0.218, -0.14], [1.496, 1.894, 0.22, -0.344, -0.274, 0.329], [0.12, -1.361, 0.247, 0.677, 0.431, 0.41]]\nC: [[-2.099, -0.677, 1.826, 
0.111, 0.048, 0.88], [1.179, -0.084, 2.064, 0.353, -0.335, 0.047], [1.283, -0.017, 2.251, 0.548, 0.539, -0.139], [1.054, -0.131, 1.995, -0.052, 0.135, -0.266], [1.813, 1.809, 0.298, 0.268, -0.092, 0.575], [0.507, -1.135, 0.122, 0.102, 0.682, -0.107]]\nD: [[-2.013, -0.781, 2.031, 0.552, 0.053, 0.962], [1.49, 0.048, 1.694, 0.076, -0.303, 0.184], [1.646, 0.043, 2.403, 0.082, 0.014, 0.773], [1.068, 0.187, 2.309, 0.672, -0.201, 0.291], [1.861, 1.412, 0.913, 0.343, -0.022, 0.312], [-0.111, -1.095, 0.386, 0.723, 0.064, 0.108]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[0.493838, -0.420518, 0.76111], [-0.864926, -0.147366, 0.479777], [-0.089593, -0.895236, -0.436493]]; the translation vector: [0.736944, 2.108944, 1.402726], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.746, -0.676, 1.548, 0.354, 0.507, 0.554], [1.278, -0.21, 2.039, 0.253, 0.159, 0.277], [1.354, -0.174, 2.085, 0.187, 0.25, 0.284], [1.365, 0.302, 2.07, 0.178, 0.146, 0.195], [1.395, 1.775, 0.709, 0.116, 0.082, 0.239], [0.108, -1.232, 0.61, 0.37, 0.243, 0.232]]\nB: [[-2.116, -0.405, 1.974, 0.197, 0.992, 0.793], [1.595, -0.115, 1.898, 0.53, -0.095, 0.207], [1.74, -0.462, 1.811, 0.459, 0.366, 0.195], [1.756, -0.03, 2.139, 0.506, -0.218, -0.14], [1.496, 1.894, 0.22, -0.344, -0.274, 0.329], [0.12, -1.361, 0.247, 0.677, 0.431, 0.41]]\nC: [[-2.099, -0.677, 1.826, 0.111, 0.048, 0.88], [1.179, -0.084, 2.064, 0.353, -0.335, 0.047], [1.283, -0.017, 2.251, 0.548, 0.539, -0.139], [1.054, -0.131, 1.995, -0.052, 0.135, -0.266], [1.813, 1.809, 0.298, 0.268, -0.092, 0.575], [0.507, -1.135, 0.122, 0.102, 0.682, -0.107]]\nD: [[-2.013, -0.781, 2.031, 0.552, 0.053, 0.962], [1.49, 0.048, 1.694, 0.076, -0.303, 0.184], [1.646, 0.043, 2.403, 0.082, 0.014, 0.773], [1.068, 0.187, 2.309, 0.672, -0.201, 0.291], [1.861, 1.412, 0.913, 0.343, -0.022, 0.312], [-0.111, -1.095, 0.386, 0.723, 0.064, 0.108]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_94_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_94_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.095, 0.369, 0.896, 1.864, 0.389, -0.037]]\nB: [[0.821, 1.024, 0.461, 1.589, 1.059, 0.417]]\nC: [[0.235, 0.419, 0.494, 1.232, 0.977, 0.5]]\nD: [[0.531, 0.805, 0.846, 1.569, 0.745, 0.229]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the counter in the scene. 
The camera pose information includes: the rotation matrix: [[0.882784, 0.25224, -0.396318], [0.469583, -0.498211, 0.728888], [-0.013595, -0.829554, -0.55826]]; the translation vector: [3.463734, 1.394934, 1.262723], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.095, 0.369, 0.896, 1.864, 0.389, -0.037]]\nB: [[0.821, 1.024, 0.461, 1.589, 1.059, 0.417]]\nC: [[0.235, 0.419, 0.494, 1.232, 0.977, 0.5]]\nD: [[0.531, 0.805, 0.846, 1.569, 0.745, 0.229]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_95_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_95_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.71, -0.427, 1.41, -0.084, 0.707, -0.208]]\nB: [[-1.108, -0.854, 1.201, 0.423, 0.471, 0.68]]\nC: [[-1.305, -0.718, 1.12, 0.437, 0.021, 0.653]]\nD: [[-1.106, -0.393, 0.937, 0.241, 0.317, 0.242]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the tray in the scene. The camera pose information includes: the rotation matrix: [[-0.998162, -0.007354, -0.06016], [0.055338, 0.294228, -0.954132], [0.024717, -0.955707, -0.293281]]; the translation vector: [1.687981, 4.43329, 1.569003], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.71, -0.427, 1.41, -0.084, 0.707, -0.208]]\nB: [[-1.108, -0.854, 1.201, 0.423, 0.471, 0.68]]\nC: [[-1.305, -0.718, 1.12, 0.437, 0.021, 0.653]]\nD: [[-1.106, -0.393, 0.937, 0.241, 0.317, 0.242]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_96_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_96_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.882, 1.56, 0.388, 0.537, 1.676, 0.722], [-0.93, 1.385, 0.286, 0.589, 0.589, 0.521], [-0.937, -1.858, 0.442, 0.583, 0.58, 0.542]]\nB: [[1.943, 1.267, 0.682, 0.711, 1.577, 0.374], [-1.208, 1.812, -0.196, 1.059, 0.169, 0.521], [-1.321, -1.601, 0.071, 0.85, 0.083, 0.59]]\nC: [[2.195, 1.182, 0.758, 0.43, 1.952, 0.35], [-1.23, 1.71, 0.54, 0.173, 0.389, 0.39], [-0.765, -1.788, 0.133, 0.882, 0.65, 0.803]]\nD: [[1.615, 1.264, -0.077, 0.87, 1.187, 0.662], [-0.905, 1.561, 0.641, 0.894, 0.612, 0.112], [-0.628, -2.319, 0.352, 0.102, 0.924, 0.919]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the table in the scene. The camera pose information includes: the rotation matrix: [[-0.530794, 0.426739, -0.732224], [0.841151, 0.159702, -0.516681], [-0.10355, -0.890162, -0.443721]]; the translation vector: [5.418979, 4.373359, 1.385162], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.882, 1.56, 0.388, 0.537, 1.676, 0.722], [-0.93, 1.385, 0.286, 0.589, 0.589, 0.521], [-0.937, -1.858, 0.442, 0.583, 0.58, 0.542]]\nB: [[1.943, 1.267, 0.682, 0.711, 1.577, 0.374], [-1.208, 1.812, -0.196, 1.059, 0.169, 0.521], [-1.321, -1.601, 0.071, 0.85, 0.083, 0.59]]\nC: [[2.195, 1.182, 0.758, 0.43, 1.952, 0.35], [-1.23, 1.71, 0.54, 0.173, 0.389, 0.39], [-0.765, -1.788, 0.133, 0.882, 0.65, 0.803]]\nD: [[1.615, 1.264, -0.077, 0.87, 1.187, 0.662], [-0.905, 1.561, 0.641, 0.894, 0.612, 0.112], [-0.628, -2.319, 0.352, 0.102, 0.924, 0.919]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_97_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_97_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.419, -1.385, 0.468, 0.603, 0.548, -0.002]]\nB: [[0.568, -0.742, 0.118, 0.677, 0.387, 0.514]]\nC: [[0.186, -1.693, 0.012, 1.09, 0.395, 0.456]]\nD: [[0.461, -1.208, 0.23, 0.711, 0.358, 0.459]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the toilet in the scene. The camera pose information includes: the rotation matrix: [[0.695296, -0.421579, 0.582095], [-0.717067, -0.351947, 0.601622], [-0.048765, -0.835707, -0.547007]]; the translation vector: [2.470866, 0.652559, 1.473924], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.419, -1.385, 0.468, 0.603, 0.548, -0.002]]\nB: [[0.568, -0.742, 0.118, 0.677, 0.387, 0.514]]\nC: [[0.186, -1.693, 0.012, 1.09, 0.395, 0.456]]\nD: [[0.461, -1.208, 0.23, 0.711, 0.358, 0.459]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_98_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_98_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.786, 0.016, 0.222, 1.217, 0.653, 0.431]]\nB: [[-1.037, -0.227, 0.31, 1.564, 0.876, 0.857]]\nC: [[-1.111, -0.46, 0.292, 1.65, 0.975, 0.105]]\nD: [[-0.725, -0.136, -0.167, 0.877, 0.479, 0.631]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the coffee table in the scene. The camera pose information includes: the rotation matrix: [[0.408988, -0.323891, 0.853126], [-0.912443, -0.158736, 0.37716], [0.013263, -0.932683, -0.360453]]; the translation vector: [3.672612, 2.990265, 1.494339], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.786, 0.016, 0.222, 1.217, 0.653, 0.431]]\nB: [[-1.037, -0.227, 0.31, 1.564, 0.876, 0.857]]\nC: [[-1.111, -0.46, 0.292, 1.65, 0.975, 0.105]]\nD: [[-0.725, -0.136, -0.167, 0.877, 0.479, 0.631]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_99_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_99_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.78, -0.369, 0.924, 0.56, 5.207, 1.466], [-1.469, 1.84, 1.529, 0.1, 3.46, 1.026], [1.603, 3.517, 0.975, 0.924, 0.045, 0.875], [-1.453, 3.239, 1.664, 0.75, -0.019, 1.939], [-1.132, -0.282, 1.01, 0.6, 1.619, 1.743], [-0.943, -1.737, 1.065, 0.938, -0.129, 1.932], [-0.462, -1.272, 0.976, 0.13, 0.263, 2.111], [-0.647, -3.37, 0.86, -0.005, 1.127, 1.632], [0.759, -3.788, 0.905, 0.472, 0.723, 1.787], [1.365, -2.765, 1.105, 0.336, 0.467, 1.277]]\nB: [[1.074, -0.516, 1.118, -0.086, 5.725, 2.158], [-1.396, 2.343, 1.47, -0.008, 3.692, 1.57], [1.315, 3.398, 1.101, 1.035, 0.612, 0.867], [-0.97, 3.198, 1.01, 0.853, 0.483, 1.503], [-1.961, -0.579, 0.799, 0.47, 0.959, 1.354], [-0.792, -1.093, 0.831, 0.98, 0.15, 1.422], [-0.764, -1.052, 0.538, 0.169, -0.099, 1.83], [-0.948, -3.534, 0.813, 0.512, 1.974, 2.262], [1.309, -3.86, 1.13, 0.074, 1.177, 0.95], [0.842, -2.685, 1.111, 0.225, 0.622, 1.342]]\nC: [[1.398, -0.078, 0.847, 0.238, 5.699, 1.741], [-1.453, 1.912, 1.74, 0.206, 3.243, 1.354], [1.514, 3.636, 0.972, 1.079, 0.266, 0.762], [-1.064, 3.584, 1.382, 0.689, 0.248, 1.654], [-1.552, -0.739, 0.879, 0.227, 1.257, 1.692], [-1.211, -1.342, 0.86, 0.655, 0.096, 1.73], [-0.902, -1.484, 0.9, 0.087, 0.331, 1.816], [-0.874, -3.114, 1.006, 0.184, 1.508, 2.084], [0.921, -3.404, 0.668, 0.136, 1.137, 1.434], [1.157, -2.863, 0.703, 0.531, 0.128, 1.521]]\nD: [[1.025, -0.536, 0.699, 0.592, 5.958, 2.064], [-1.605, 1.792, 
2.153, -0.235, 3.185, 1.084], [1.02, 3.68, 1.082, 1.526, 0.082, 0.582], [-1.08, 3.95, 0.986, 0.299, -0.139, 1.856], [-1.893, -0.998, 0.689, 0.259, 1.727, 1.918], [-1.034, -1.551, 0.605, 0.948, 0.46, 1.541], [-1.095, -1.908, 1.355, 0.164, 0.298, 1.555], [-0.914, -3.165, 0.928, -0.077, 1.779, 1.639], [0.568, -3.209, 0.575, 0.598, 1.246, 1.226], [1.226, -3.252, 0.43, 0.831, 0.263, 1.38]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.52463, -0.231347, 0.819293], [-0.850589, 0.102279, -0.515789], [0.03553, -0.96748, -0.25044]]; the translation vector: [5.897326, 2.792535, 1.553822], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.78, -0.369, 0.924, 0.56, 5.207, 1.466], [-1.469, 1.84, 1.529, 0.1, 3.46, 1.026], [1.603, 3.517, 0.975, 0.924, 0.045, 0.875], [-1.453, 3.239, 1.664, 0.75, -0.019, 1.939], [-1.132, -0.282, 1.01, 0.6, 1.619, 1.743], [-0.943, -1.737, 1.065, 0.938, -0.129, 1.932], [-0.462, -1.272, 0.976, 0.13, 0.263, 2.111], [-0.647, -3.37, 0.86, -0.005, 1.127, 1.632], [0.759, -3.788, 0.905, 0.472, 0.723, 1.787], [1.365, -2.765, 1.105, 0.336, 0.467, 1.277]]\nB: [[1.074, -0.516, 1.118, -0.086, 5.725, 2.158], [-1.396, 2.343, 1.47, -0.008, 3.692, 1.57], [1.315, 3.398, 1.101, 1.035, 0.612, 0.867], [-0.97, 3.198, 1.01, 0.853, 0.483, 1.503], [-1.961, -0.579, 0.799, 0.47, 0.959, 1.354], [-0.792, -1.093, 0.831, 0.98, 0.15, 1.422], [-0.764, -1.052, 0.538, 0.169, -0.099, 1.83], [-0.948, -3.534, 0.813, 0.512, 1.974, 2.262], [1.309, -3.86, 1.13, 0.074, 1.177, 0.95], [0.842, -2.685, 1.111, 0.225, 0.622, 1.342]]\nC: [[1.398, -0.078, 0.847, 0.238, 5.699, 1.741], [-1.453, 1.912, 1.74, 0.206, 3.243, 1.354], [1.514, 3.636, 0.972, 1.079, 0.266, 0.762], [-1.064, 3.584, 1.382, 0.689, 0.248, 1.654], [-1.552, -0.739, 0.879, 0.227, 1.257, 1.692], [-1.211, -1.342, 0.86, 0.655, 0.096, 1.73], [-0.902, -1.484, 0.9, 0.087, 0.331, 1.816], [-0.874, -3.114, 1.006, 0.184, 1.508, 2.084], [0.921, -3.404, 0.668, 0.136, 1.137, 1.434], [1.157, -2.863, 0.703, 0.531, 0.128, 1.521]]\nD: [[1.025, -0.536, 0.699, 0.592, 5.958, 2.064], [-1.605, 1.792, 2.153, -0.235, 3.185, 1.084], [1.02, 3.68, 1.082, 1.526, 0.082, 0.582], [-1.08, 3.95, 0.986, 0.299, -0.139, 1.856], [-1.893, -0.998, 0.689, 0.259, 1.727, 1.918], [-1.034, -1.551, 0.605, 0.948, 0.46, 1.541], [-1.095, -1.908, 1.355, 0.164, 0.298, 1.555], [-0.914, -3.165, 0.928, -0.077, 1.779, 1.639], [0.568, -3.209, 0.575, 0.598, 1.246, 1.226], [1.226, -3.252, 0.43, 0.831, 0.263, 1.38]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_100_0.jpg", 
"./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_100_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.346, -1.632, 0.468, 0.228, 0.435, 0.463], [1.166, -1.379, 0.353, 0.04, 0.901, -0.25], [1.058, -1.308, -0.427, 0.416, 0.148, -0.261], [1.588, -1.671, -0.302, 0.296, 0.611, 0.478], [1.29, -1.513, 0.294, 0.468, 0.683, 0.487]]\nB: [[1.331, -1.83, 0.338, 0.317, 0.297, 0.192], [1.086, -1.365, 0.034, 0.38, 0.508, 0.129], [1.22, -1.567, 0.058, 0.382, 0.375, 0.145], [1.153, -2.04, 0.055, 0.29, 0.371, 0.11], [1.391, -1.481, 0.041, 0.386, 0.621, 0.13]]\nC: [[1.118, -1.374, 0.329, -0.089, 0.113, 0.27], [1.322, -1.418, -0.243, 0.677, 0.961, -0.031], [1.042, -1.495, -0.402, 0.189, 0.317, 0.229], [1.027, -2.005, 0.379, 0.337, 0.077, -0.062], [1.617, -1.294, 0.41, -0.08, 0.836, 0.171]]\nD: [[1.601, -1.491, 0.468, 0.181, 0.51, -0.093], [0.965, -1.654, 0.463, 0.875, 0.478, 0.252], [1.604, -1.87, -0.185, 0.098, 0.676, 0.612], [1.637, -2.272, -0.12, 0.307, 0.185, 0.124], [1.563, -1.727, 0.204, 0.781, 0.373, 0.021]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shoes in the scene. The camera pose information includes: the rotation matrix: [[-0.079656, -0.319192, 0.944337], [-0.994012, 0.096527, -0.051219], [-0.074805, -0.942762, -0.324969]]; the translation vector: [4.3352, 2.935251, 1.464921], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.346, -1.632, 0.468, 0.228, 0.435, 0.463], [1.166, -1.379, 0.353, 0.04, 0.901, -0.25], [1.058, -1.308, -0.427, 0.416, 0.148, -0.261], [1.588, -1.671, -0.302, 0.296, 0.611, 0.478], [1.29, -1.513, 0.294, 0.468, 0.683, 0.487]]\nB: [[1.331, -1.83, 0.338, 0.317, 0.297, 0.192], [1.086, -1.365, 0.034, 0.38, 0.508, 0.129], [1.22, -1.567, 0.058, 0.382, 0.375, 0.145], [1.153, -2.04, 0.055, 0.29, 0.371, 0.11], [1.391, -1.481, 0.041, 0.386, 0.621, 0.13]]\nC: [[1.118, -1.374, 0.329, -0.089, 0.113, 0.27], [1.322, -1.418, -0.243, 0.677, 0.961, -0.031], [1.042, -1.495, -0.402, 0.189, 0.317, 0.229], [1.027, -2.005, 0.379, 0.337, 0.077, -0.062], [1.617, -1.294, 0.41, -0.08, 0.836, 0.171]]\nD: [[1.601, -1.491, 0.468, 0.181, 0.51, -0.093], [0.965, -1.654, 0.463, 0.875, 0.478, 0.252], [1.604, -1.87, -0.185, 0.098, 0.676, 0.612], [1.637, -2.272, -0.12, 0.307, 0.185, 0.124], [1.563, -1.727, 0.204, 0.781, 0.373, 0.021]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_101_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_101_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.756, -0.319, 1.042, 0.429, 0.322, 0.448]]\nB: [[-0.968, 0.035, 0.911, 0.64, 0.607, 0.13]]\nC: [[-0.447, -0.667, 1.37, -0.048, 0.547, 0.66]]\nD: [[-0.868, -0.191, 0.853, 0.715, 0.451, 0.897]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the coffee maker in the scene. The camera pose information includes: the rotation matrix: [[-0.848489, -0.131122, 0.512712], [-0.527579, 0.133483, -0.838954], [0.041567, -0.982339, -0.182436]]; the translation vector: [2.702568, 1.718074, 1.602473], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.756, -0.319, 1.042, 0.429, 0.322, 0.448]]\nB: [[-0.968, 0.035, 0.911, 0.64, 0.607, 0.13]]\nC: [[-0.447, -0.667, 1.37, -0.048, 0.547, 0.66]]\nD: [[-0.868, -0.191, 0.853, 0.715, 0.451, 0.897]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_102_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_102_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.912, -2.332, 1.905, 0.812, 1.477, 1.176], [1.567, 2.82, 1.32, 0.327, 1.345, 2.178]]\nB: [[1.7, -2.606, 1.565, 0.678, 1.073, 1.573], [1.645, 3.122, 1.233, 0.724, 1.059, 2.428]]\nC: [[2.101, -2.524, 1.207, 0.883, 0.819, 1.727], [1.696, 3.351, 1.474, 0.479, 1.235, 2.058]]\nD: [[1.681, -2.688, 1.507, 0.221, 0.833, 1.776], [1.4, 2.653, 1.693, 1.075, 1.288, 2.071]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the cabinet in the scene. The camera pose information includes: the rotation matrix: [[0.606497, 0.359513, -0.709163], [0.793947, -0.321582, 0.515978], [-0.042553, -0.875977, -0.480473]]; the translation vector: [5.898605, 1.464963, 1.329018], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.912, -2.332, 1.905, 0.812, 1.477, 1.176], [1.567, 2.82, 1.32, 0.327, 1.345, 2.178]]\nB: [[1.7, -2.606, 1.565, 0.678, 1.073, 1.573], [1.645, 3.122, 1.233, 0.724, 1.059, 2.428]]\nC: [[2.101, -2.524, 1.207, 0.883, 0.819, 1.727], [1.696, 3.351, 1.474, 0.479, 1.235, 2.058]]\nD: [[1.681, -2.688, 1.507, 0.221, 0.833, 1.776], [1.4, 2.653, 1.693, 1.075, 1.288, 2.071]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_103_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_103_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.693, -1.201, 1.187, 0.828, 0.813, 1.996]]\nB: [[-1.283, -1.49, 1.157, 1.223, 0.635, 2.337]]\nC: [[-1.607, -1.608, 0.733, 1.415, 0.912, 2.422]]\nD: [[-1.367, -1.969, 1.373, 1.253, 1.096, 1.909]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the cabinets in the scene. The camera pose information includes: the rotation matrix: [[0.349467, 0.022881, -0.936669], [0.936944, -0.011774, 0.349282], [-0.003037, -0.999669, -0.025553]]; the translation vector: [3.08553, 2.787215, 1.609269], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.693, -1.201, 1.187, 0.828, 0.813, 1.996]]\nB: [[-1.283, -1.49, 1.157, 1.223, 0.635, 2.337]]\nC: [[-1.607, -1.608, 0.733, 1.415, 0.912, 2.422]]\nD: [[-1.367, -1.969, 1.373, 1.253, 1.096, 1.909]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_104_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_104_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.858, -0.632, 0.828, 0.126, 1.643, 1.687], [-1.33, 0.028, 0.915, 0.226, 2.888, 1.864], [-0.174, -1.42, 0.865, 2.224, 0.121, 1.722], [0.61, 1.413, 0.874, 4.003, 0.17, 1.77], [2.563, 1.11, 0.788, 0.118, 0.484, 1.649]]\nB: [[1.405, -0.208, 0.598, -0.114, 2.093, 1.602], [-1.061, 0.394, 1.019, -0.16, 3.193, 1.369], [-0.359, -0.986, 0.414, 1.802, -0.111, 1.429], [1.035, 1.154, 1.154, 3.812, 0.204, 2.113], [2.12, 1.579, 1.171, -0.054, 0.234, 1.478]]\nC: [[1.89, -0.153, 0.406, 0.028, 1.816, 1.93], [-1.451, -0.417, 1.393, -0.113, 3.307, 1.683], [-0.295, -1.25, 0.577, 1.985, -0.098, 1.447], [0.348, 1.382, 0.753, 3.885, 0.441, 1.993], [2.183, 0.625, 0.617, 0.117, 0.723, 1.324]]\nD: [[2.301, -0.62, 1.122, 0.26, 2.124, 2.126], [-0.834, -0.412, 1.071, -0.118, 2.484, 1.498], [-0.094, -1.494, 0.531, 2.098, -0.018, 2.208], [0.226, 1.164, 1.047, 4.422, 0.121, 1.595], [2.644, 1.556, 0.635, 0.354, 0.125, 1.662]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. 
The camera pose information includes: the rotation matrix: [[0.991592, 0.052224, -0.118397], [0.1292, -0.348306, 0.928435], [0.007248, -0.935925, -0.352124]]; the translation vector: [2.177373, 2.142725, 1.46728], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.858, -0.632, 0.828, 0.126, 1.643, 1.687], [-1.33, 0.028, 0.915, 0.226, 2.888, 1.864], [-0.174, -1.42, 0.865, 2.224, 0.121, 1.722], [0.61, 1.413, 0.874, 4.003, 0.17, 1.77], [2.563, 1.11, 0.788, 0.118, 0.484, 1.649]]\nB: [[1.405, -0.208, 0.598, -0.114, 2.093, 1.602], [-1.061, 0.394, 1.019, -0.16, 3.193, 1.369], [-0.359, -0.986, 0.414, 1.802, -0.111, 1.429], [1.035, 1.154, 1.154, 3.812, 0.204, 2.113], [2.12, 1.579, 1.171, -0.054, 0.234, 1.478]]\nC: [[1.89, -0.153, 0.406, 0.028, 1.816, 1.93], [-1.451, -0.417, 1.393, -0.113, 3.307, 1.683], [-0.295, -1.25, 0.577, 1.985, -0.098, 1.447], [0.348, 1.382, 0.753, 3.885, 0.441, 1.993], [2.183, 0.625, 0.617, 0.117, 0.723, 1.324]]\nD: [[2.301, -0.62, 1.122, 0.26, 2.124, 2.126], [-0.834, -0.412, 1.071, -0.118, 2.484, 1.498], [-0.094, -1.494, 0.531, 2.098, -0.018, 2.208], [0.226, 1.164, 1.047, 4.422, 0.121, 1.595], [2.644, 1.556, 0.635, 0.354, 0.125, 1.662]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_105_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_105_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: 
[[1.401, -1.054, 0.005, 0.193, -0.189, 0.608], [-1.764, -0.727, 0.562, 0.549, 0.36, 0.374], [-2.181, 0.328, -0.167, 0.351, 0.064, 0.119]]\nB: [[1.152, -0.296, 0.418, 0.864, 0.385, 0.356], [-2.324, -0.32, 0.424, 0.485, 0.66, -0.082], [-1.955, 0.121, 0.148, 0.369, 0.415, 0.131]]\nC: [[1.282, -0.743, 0.129, 0.493, 0.257, 0.293], [-1.968, -0.763, 0.156, 0.467, 0.241, 0.31], [-1.95, 0.267, 0.16, 0.231, 0.318, 0.302]]\nD: [[1.109, -0.73, -0.038, 0.564, 0.587, 0.172], [-2.259, -0.589, 0.46, 0.771, -0.144, -0.09], [-1.478, 0.494, 0.535, 0.374, 0.223, 0.643]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the trash can in the scene. The camera pose information includes: the rotation matrix: [[-0.789457, 0.162095, -0.592016], [0.613764, 0.197318, -0.764434], [-0.007096, -0.966846, -0.255262]]; the translation vector: [5.114759, 3.17533, 1.386193], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.401, -1.054, 0.005, 0.193, -0.189, 0.608], [-1.764, -0.727, 0.562, 0.549, 0.36, 0.374], [-2.181, 0.328, -0.167, 0.351, 0.064, 0.119]]\nB: [[1.152, -0.296, 0.418, 0.864, 0.385, 0.356], [-2.324, -0.32, 0.424, 0.485, 0.66, -0.082], [-1.955, 0.121, 0.148, 0.369, 0.415, 0.131]]\nC: [[1.282, -0.743, 0.129, 0.493, 0.257, 0.293], [-1.968, -0.763, 0.156, 0.467, 0.241, 0.31], [-1.95, 0.267, 0.16, 0.231, 0.318, 0.302]]\nD: [[1.109, -0.73, -0.038, 0.564, 0.587, 0.172], [-2.259, -0.589, 0.46, 0.771, -0.144, -0.09], [-1.478, 0.494, 0.535, 0.374, 0.223, 0.643]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_106_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_106_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.628, -0.574, 0.244, 0.629, 0.377, 0.613]]\nB: [[-0.255, -0.118, 0.331, 1.064, 0.829, 0.169]]\nC: [[-0.907, -0.799, 0.595, 1.106, -0.043, 0.376]]\nD: [[-0.149, -0.775, 0.103, 0.329, 0.393, 1.004]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the toilet in the scene. The camera pose information includes: the rotation matrix: [[-0.881415, -0.308012, 0.3581], [-0.47008, 0.646119, -0.601294], [-0.046169, -0.698325, -0.71429]]; the translation vector: [3.147524, 1.689608, 1.273114], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.628, -0.574, 0.244, 0.629, 0.377, 0.613]]\nB: [[-0.255, -0.118, 0.331, 1.064, 0.829, 0.169]]\nC: [[-0.907, -0.799, 0.595, 1.106, -0.043, 0.376]]\nD: [[-0.149, -0.775, 0.103, 0.329, 0.393, 1.004]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_107_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_107_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.984, -0.791, 0.71, -0.025, 0.146, 1.95], [-2.845, 0.826, 0.79, -0.018, 2.045, 2.096], [-3.127, 1.767, 1.058, 0.668, 0.751, 1.034], [-2.598, 2.204, 1.456, 0.337, 0.226, 0.622], [-1.681, 2.399, 0.33, 0.897, -0.155, 1.345], [-3.032, -0.879, 1.701, 0.543, 0.324, 1.619], [-2.871, -1.012, 1.112, 0.372, 0.525, 1.953], [-1.989, -1.505, 1.05, 1.013, 0.57, 2.119], [-2.342, -1.032, 1.573, 0.099, 0.665, 2.113], [-1.812, -1.549, 0.936, 0.03, 0.702, 1.331], [-0.897, -1.334, 1.032, -0.375, 0.838, 1.222], [1.073, -1.243, 1.222, 0.074, 0.103, 1.889], [0.684, -1.361, 0.842, 0.81, 0.134, 2.122], [1.681, -0.985, 0.915, -0.15, 0.046, 2.017], [1.499, -1.143, 1.225, 1.125, 0.255, 2.053], [2.518, -0.611, 1.251, 0.147, 0.376, 1.463], [2.78, -0.555, 1.623, 0.321, 0.5, 1.453], [2.607, 0.422, 0.93, 0.428, 2.768, 1.865], [2.929, 2.246, 1.615, 0.791, 0.45, 0.84], [2.894, 2.607, 0.679, 0.418, 0.185, 1.771], [1.91, 1.95, 0.631, 1.792, 0.498, 2.141], [0.968, 2.657, -0.02, 0.142, 0.513, 1.049], [2.739, 1.468, 0.752, 0.297, 1.124, 0.649], [1.999, -0.528, 0.795, 0.19, 0.625, 0.528]]\nB: [[-1.697, -1.101, 1.32, 0.519, 0.607, 2.31], [-2.793, 0.346, 1.419, 0.299, 1.423, 2.149], [-2.764, 1.638, 1.178, 0.604, 0.448, 0.518], [-2.94, 2.09, 1.372, -0.13, -0.272, 0.296], [-1.734, 2.804, 0.582, 1.396, 0.541, 0.939], [-2.549, -0.196, 1.12, 0.785, 0.411, 1.926], [-2.871, -1.014, 0.799, 0.56, 0.597, 1.935], [-2.659, -0.762, 1.356, 
0.825, 0.021, 2.649], [-1.977, -1.011, 1.131, 0.465, 0.035, 2.324], [-1.526, -1.598, 1.392, 0.441, -0.118, 2.102], [-1.353, -0.868, 0.591, 0.125, 0.493, 1.476], [1.173, -1.254, 0.599, -0.335, 0.938, 1.499], [1.444, -1.618, 1.332, 0.376, 0.369, 1.68], [1.582, -1.255, 0.456, -0.034, -0.048, 2.138], [2.452, -1.152, 1.16, 0.41, -0.305, 2.162], [2.246, -1.101, 0.993, 0.065, 0.725, 2.256], [2.414, -0.99, 1.12, 0.836, 0.744, 1.026], [2.607, 0.594, 0.728, -0.103, 2.445, 1.796], [2.075, 1.78, 1.433, 0.826, 1.27, 1.569], [2.842, 2.47, 1.179, 0.437, 0.717, 1.714], [2.073, 1.959, 0.513, 1.293, -0.057, 1.28], [0.684, 2.546, 0.647, 0.281, 0.423, 0.403], [1.985, 2.256, 0.609, 0.323, 0.304, 0.186], [1.904, -0.439, 0.116, 0.205, 0.913, 1.076]]\nC: [[-0.882, -0.996, 0.828, 0.066, -0.03, 2.259], [-2.906, 0.486, 0.584, 0.338, 1.448, 2.228], [-2.335, 1.79, 1.402, 0.799, 0.604, 0.979], [-2.492, 2.696, 0.852, 0.385, -0.119, 0.551], [-2.113, 2.369, 0.634, 1.634, -0.378, 1.33], [-2.9, -0.382, 1.544, 0.229, 0.561, 1.896], [-2.46, -0.938, 0.92, 0.562, 0.836, 1.812], [-2.236, -1.122, 1.385, 0.806, -0.301, 1.756], [-1.859, -1.439, 0.978, -0.087, 0.007, 2.232], [-1.477, -1.605, 1.119, -0.203, 0.225, 1.352], [-0.558, -1.702, 0.427, 0.133, 0.668, 1.46], [0.818, -0.885, 1.161, 0.455, 0.101, 1.667], [0.552, -1.308, 0.707, 0.978, 0.615, 1.676], [2.109, -1.305, 1.008, -0.007, 0.224, 2.013], [2.016, -1.577, 1.004, 0.572, 0.061, 2.141], [1.754, -1.027, 1.286, 0.147, 0.165, 1.509], [2.849, -0.613, 0.987, 0.617, 1.099, 1.162], [2.281, 0.428, 1.287, 0.612, 2.792, 1.8], [2.1, 1.909, 1.627, 0.042, 0.641, 1.338], [2.025, 1.994, 0.97, 0.816, 0.372, 1.93], [1.678, 2.705, 1.241, 1.93, -0.063, 1.837], [0.582, 2.314, 0.279, 0.554, 0.013, 1.151], [2.245, 1.504, 0.631, 0.05, 1.008, 1.066], [2.09, -0.514, 0.622, -0.006, 1.061, 1.06]]\nD: [[-1.212, -1.13, 1.017, 0.465, 0.161, 2.011], [-2.56, 0.64, 0.971, 0.201, 1.804, 1.935], [-2.744, 1.914, 1.197, 0.349, 0.771, 0.66], [-2.606, 2.363, 1.087, 0.038, 0.219, 0.424], 
[-1.931, 2.472, 0.667, 1.366, 0.094, 1.255], [-2.729, -0.603, 1.38, 0.39, 0.792, 1.541], [-2.531, -0.93, 1.084, 0.175, 0.507, 2.172], [-2.227, -1.13, 1.087, 0.723, 0.142, 2.167], [-1.887, -1.279, 1.074, 0.181, 0.413, 2.133], [-1.395, -1.301, 1.124, 0.117, 0.289, 1.814], [-0.99, -1.313, 0.763, 0.122, 0.477, 1.511], [0.768, -1.372, 0.865, 0.144, 0.573, 1.696], [0.958, -1.124, 0.866, 0.51, 0.163, 1.704], [1.687, -1.284, 0.89, 0.172, 0.422, 1.772], [1.97, -1.137, 0.897, 0.705, 0.139, 1.81], [2.237, -1.017, 0.895, 0.302, 0.335, 1.807], [2.506, -0.615, 1.189, 0.456, 0.783, 1.228], [2.295, 0.463, 0.865, 0.248, 2.384, 1.746], [2.549, 1.9, 1.178, 0.43, 0.874, 1.13], [2.396, 2.329, 0.87, 0.323, 0.269, 1.739], [1.621, 2.45, 0.875, 1.651, 0.189, 1.735], [0.789, 2.563, 0.425, 0.113, 0.177, 0.782], [2.336, 1.911, 0.338, 0.211, 0.688, 0.678], [2.287, -0.576, 0.338, 0.14, 0.728, 0.71]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.731293, 0.384445, -0.563394], [0.682011, 0.401944, -0.610984], [-0.008437, -0.831049, -0.556135]]; the translation vector: [5.176627, 2.209938, 1.427488], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.984, -0.791, 0.71, -0.025, 0.146, 1.95], [-2.845, 0.826, 0.79, -0.018, 2.045, 2.096], [-3.127, 1.767, 1.058, 0.668, 0.751, 1.034], [-2.598, 2.204, 1.456, 0.337, 0.226, 0.622], [-1.681, 2.399, 0.33, 0.897, -0.155, 1.345], [-3.032, -0.879, 1.701, 0.543, 0.324, 1.619], [-2.871, -1.012, 1.112, 0.372, 0.525, 1.953], [-1.989, -1.505, 1.05, 1.013, 0.57, 2.119], [-2.342, -1.032, 1.573, 0.099, 0.665, 2.113], [-1.812, -1.549, 0.936, 0.03, 0.702, 1.331], [-0.897, -1.334, 1.032, -0.375, 0.838, 1.222], [1.073, -1.243, 1.222, 0.074, 0.103, 1.889], [0.684, -1.361, 0.842, 0.81, 0.134, 2.122], [1.681, -0.985, 0.915, -0.15, 0.046, 2.017], [1.499, -1.143, 1.225, 1.125, 0.255, 2.053], [2.518, -0.611, 1.251, 0.147, 0.376, 1.463], [2.78, -0.555, 1.623, 0.321, 0.5, 1.453], [2.607, 0.422, 0.93, 0.428, 2.768, 1.865], [2.929, 2.246, 1.615, 0.791, 0.45, 0.84], [2.894, 2.607, 0.679, 0.418, 0.185, 1.771], [1.91, 1.95, 0.631, 1.792, 0.498, 2.141], [0.968, 2.657, -0.02, 0.142, 0.513, 1.049], [2.739, 1.468, 0.752, 0.297, 1.124, 0.649], [1.999, -0.528, 0.795, 0.19, 0.625, 0.528]]\nB: [[-1.697, -1.101, 1.32, 0.519, 0.607, 2.31], [-2.793, 0.346, 1.419, 0.299, 1.423, 2.149], [-2.764, 1.638, 1.178, 0.604, 0.448, 0.518], [-2.94, 2.09, 1.372, -0.13, -0.272, 0.296], [-1.734, 2.804, 0.582, 1.396, 0.541, 0.939], [-2.549, -0.196, 1.12, 0.785, 0.411, 1.926], [-2.871, -1.014, 0.799, 0.56, 0.597, 1.935], [-2.659, -0.762, 1.356, 0.825, 0.021, 2.649], [-1.977, -1.011, 1.131, 0.465, 0.035, 2.324], [-1.526, -1.598, 1.392, 0.441, -0.118, 2.102], [-1.353, -0.868, 0.591, 0.125, 0.493, 1.476], [1.173, -1.254, 0.599, -0.335, 0.938, 1.499], [1.444, -1.618, 1.332, 0.376, 0.369, 1.68], [1.582, -1.255, 0.456, -0.034, -0.048, 2.138], [2.452, -1.152, 1.16, 0.41, -0.305, 2.162], [2.246, -1.101, 0.993, 0.065, 0.725, 2.256], [2.414, -0.99, 1.12, 0.836, 0.744, 1.026], [2.607, 0.594, 0.728, -0.103, 2.445, 1.796], [2.075, 1.78, 1.433, 0.826, 1.27, 1.569], [2.842, 2.47, 1.179, 0.437, 
0.717, 1.714], [2.073, 1.959, 0.513, 1.293, -0.057, 1.28], [0.684, 2.546, 0.647, 0.281, 0.423, 0.403], [1.985, 2.256, 0.609, 0.323, 0.304, 0.186], [1.904, -0.439, 0.116, 0.205, 0.913, 1.076]]\nC: [[-0.882, -0.996, 0.828, 0.066, -0.03, 2.259], [-2.906, 0.486, 0.584, 0.338, 1.448, 2.228], [-2.335, 1.79, 1.402, 0.799, 0.604, 0.979], [-2.492, 2.696, 0.852, 0.385, -0.119, 0.551], [-2.113, 2.369, 0.634, 1.634, -0.378, 1.33], [-2.9, -0.382, 1.544, 0.229, 0.561, 1.896], [-2.46, -0.938, 0.92, 0.562, 0.836, 1.812], [-2.236, -1.122, 1.385, 0.806, -0.301, 1.756], [-1.859, -1.439, 0.978, -0.087, 0.007, 2.232], [-1.477, -1.605, 1.119, -0.203, 0.225, 1.352], [-0.558, -1.702, 0.427, 0.133, 0.668, 1.46], [0.818, -0.885, 1.161, 0.455, 0.101, 1.667], [0.552, -1.308, 0.707, 0.978, 0.615, 1.676], [2.109, -1.305, 1.008, -0.007, 0.224, 2.013], [2.016, -1.577, 1.004, 0.572, 0.061, 2.141], [1.754, -1.027, 1.286, 0.147, 0.165, 1.509], [2.849, -0.613, 0.987, 0.617, 1.099, 1.162], [2.281, 0.428, 1.287, 0.612, 2.792, 1.8], [2.1, 1.909, 1.627, 0.042, 0.641, 1.338], [2.025, 1.994, 0.97, 0.816, 0.372, 1.93], [1.678, 2.705, 1.241, 1.93, -0.063, 1.837], [0.582, 2.314, 0.279, 0.554, 0.013, 1.151], [2.245, 1.504, 0.631, 0.05, 1.008, 1.066], [2.09, -0.514, 0.622, -0.006, 1.061, 1.06]]\nD: [[-1.212, -1.13, 1.017, 0.465, 0.161, 2.011], [-2.56, 0.64, 0.971, 0.201, 1.804, 1.935], [-2.744, 1.914, 1.197, 0.349, 0.771, 0.66], [-2.606, 2.363, 1.087, 0.038, 0.219, 0.424], [-1.931, 2.472, 0.667, 1.366, 0.094, 1.255], [-2.729, -0.603, 1.38, 0.39, 0.792, 1.541], [-2.531, -0.93, 1.084, 0.175, 0.507, 2.172], [-2.227, -1.13, 1.087, 0.723, 0.142, 2.167], [-1.887, -1.279, 1.074, 0.181, 0.413, 2.133], [-1.395, -1.301, 1.124, 0.117, 0.289, 1.814], [-0.99, -1.313, 0.763, 0.122, 0.477, 1.511], [0.768, -1.372, 0.865, 0.144, 0.573, 1.696], [0.958, -1.124, 0.866, 0.51, 0.163, 1.704], [1.687, -1.284, 0.89, 0.172, 0.422, 1.772], [1.97, -1.137, 0.897, 0.705, 0.139, 1.81], [2.237, -1.017, 0.895, 0.302, 0.335, 1.807], [2.506, 
-0.615, 1.189, 0.456, 0.783, 1.228], [2.295, 0.463, 0.865, 0.248, 2.384, 1.746], [2.549, 1.9, 1.178, 0.43, 0.874, 1.13], [2.396, 2.329, 0.87, 0.323, 0.269, 1.739], [1.621, 2.45, 0.875, 1.651, 0.189, 1.735], [0.789, 2.563, 0.425, 0.113, 0.177, 0.782], [2.336, 1.911, 0.338, 0.211, 0.688, 0.678], [2.287, -0.576, 0.338, 0.14, 0.728, 0.71]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_108_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_108_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.196, -0.211, 0.68, 0.711, 0.576, 2.155]]\nB: [[-0.409, 0.533, 1.267, -0.113, 0.263, 1.631]]\nC: [[-0.799, 0.234, 0.962, 0.275, 0.234, 1.923]]\nD: [[-1.167, 0.457, 0.799, -0.179, 0.573, 2.357]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shower curtain in the scene. The camera pose information includes: the rotation matrix: [[-0.506976, -0.449046, 0.735753], [-0.861802, 0.247713, -0.442646], [0.016513, -0.858485, -0.512574]]; the translation vector: [1.568574, 4.423309, 1.333385], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.196, -0.211, 0.68, 0.711, 0.576, 2.155]]\nB: [[-0.409, 0.533, 1.267, -0.113, 0.263, 1.631]]\nC: [[-0.799, 0.234, 0.962, 0.275, 0.234, 1.923]]\nD: [[-1.167, 0.457, 0.799, -0.179, 0.573, 2.357]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_109_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_109_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.94, 1.68, 0.837, 0.663, 0.508, 0.307]]\nB: [[-1.567, 0.924, 0.596, -0.078, 0.24, 0.881]]\nC: [[-1.847, 1.274, 0.842, 0.196, 0.441, 0.778]]\nD: [[-2.041, 1.755, 1.288, 0.168, 0.884, 0.741]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[0.233902, -0.58763, 0.774584], [-0.967246, -0.059828, 0.246692], [-0.098622, -0.806915, -0.582377]]; the translation vector: [0.860343, 3.117731, 1.418568], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.94, 1.68, 0.837, 0.663, 0.508, 0.307]]\nB: [[-1.567, 0.924, 0.596, -0.078, 0.24, 0.881]]\nC: [[-1.847, 1.274, 0.842, 0.196, 0.441, 0.778]]\nD: [[-2.041, 1.755, 1.288, 0.168, 0.884, 0.741]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_110_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_110_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.505, -0.116, 0.747, 0.409, 0.695, 0.297], [-1.441, 0.909, 0.606, 0.695, 0.528, 0.24], [1.536, 0.64, 0.715, 0.483, 0.851, 0.175], [1.546, -0.374, 0.796, 0.374, 0.77, 0.35], [-1.45, 0.754, 0.484, 0.85, 0.766, 0.215]]\nB: [[1.607, 0.304, 0.83, 0.898, 0.58, 0.697], [-1.406, 0.619, 0.763, 0.933, 0.149, 0.108], [1.448, 0.861, 0.699, 0.254, 0.441, 0.026], [1.945, -0.851, 0.97, 0.08, 1.051, 0.781], [-1.319, 0.842, 0.31, 1.314, 0.811, 0.161]]\nC: [[1.451, -0.224, 1.202, 0.474, 0.259, 0.177], [-1.303, 1.145, 0.291, 1.141, 0.346, 0.272], [1.763, 0.401, 0.944, 0.92, 1.062, -0.044], [1.663, -0.056, 0.805, 0.848, 1.189, 0.211], [-1.93, 0.603, 0.76, 0.741, 0.586, -0.206]]\nD: [[1.888, 0.164, 1.08, 0.295, 0.332, 0.729], [-1.781, 1.348, 0.164, 0.674, 0.738, 0.722], [1.997, 0.742, 0.991, 0.029, 0.449, -0.1], [1.487, 0.076, 0.6, 0.156, 0.445, 0.145], [-1.75, 1.16, 0.275, 0.799, 1.235, 0.304]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the pillow in the scene. The camera pose information includes: the rotation matrix: [[0.484778, 0.389748, -0.782998], [0.874059, -0.248441, 0.417491], [-0.031813, -0.886777, -0.461102]]; the translation vector: [2.948564, 2.712566, 1.480667], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.505, -0.116, 0.747, 0.409, 0.695, 0.297], [-1.441, 0.909, 0.606, 0.695, 0.528, 0.24], [1.536, 0.64, 0.715, 0.483, 0.851, 0.175], [1.546, -0.374, 0.796, 0.374, 0.77, 0.35], [-1.45, 0.754, 0.484, 0.85, 0.766, 0.215]]\nB: [[1.607, 0.304, 0.83, 0.898, 0.58, 0.697], [-1.406, 0.619, 0.763, 0.933, 0.149, 0.108], [1.448, 0.861, 0.699, 0.254, 0.441, 0.026], [1.945, -0.851, 0.97, 0.08, 1.051, 0.781], [-1.319, 0.842, 0.31, 1.314, 0.811, 0.161]]\nC: [[1.451, -0.224, 1.202, 0.474, 0.259, 0.177], [-1.303, 1.145, 0.291, 1.141, 0.346, 0.272], [1.763, 0.401, 0.944, 0.92, 1.062, -0.044], [1.663, -0.056, 0.805, 0.848, 1.189, 0.211], [-1.93, 0.603, 0.76, 0.741, 0.586, -0.206]]\nD: [[1.888, 0.164, 1.08, 0.295, 0.332, 0.729], [-1.781, 1.348, 0.164, 0.674, 0.738, 0.722], [1.997, 0.742, 0.991, 0.029, 0.449, -0.1], [1.487, 0.076, 0.6, 0.156, 0.445, 0.145], [-1.75, 1.16, 0.275, 0.799, 1.235, 0.304]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_111_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_111_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.148, -1.819, 0.681, 1.73, 0.9, 0.465], [-1.44, 2.208, 0.821, 0.893, 2.303, 0.634], [0.765, 1.362, 0.255, 2.131, 1.052, 0.327], [-1.998, -1.691, 0.062, 1.757, 1.835, 0.718]]\nB: [[1.542, -1.233, 0.854, 2.268, 1.021, 0.755], [-2.098, 1.815, 0.076, 0.977, 1.531, 0.579], [1.499, 1.894, 0.799, 1.364, 1.243, 0.606], [-1.591, 
-1.777, -0.089, 1.375, 2.302, 0.818]]\nC: [[1.019, -1.513, 0.012, 1.939, 1.04, 0.603], [-1.397, 1.894, 0.192, 1.788, 2.263, 0.963], [0.794, 1.72, 0.728, 1.503, 1.344, 0.994], [-1.899, -1.035, 0.107, 1.802, 1.941, 0.705]]\nD: [[1.181, -1.566, 0.434, 1.91, 1.342, 0.847], [-1.636, 1.86, 0.387, 1.322, 1.894, 0.782], [1.234, 1.651, 0.4, 1.847, 1.393, 0.784], [-1.767, -1.535, 0.407, 1.331, 1.981, 0.802]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the table in the scene. The camera pose information includes: the rotation matrix: [[0.996822, -0.027813, -0.074656], [0.056495, -0.413943, 0.908548], [-0.056173, -0.909878, -0.411056]]; the translation vector: [4.405487, 5.403347, 1.494535], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.148, -1.819, 0.681, 1.73, 0.9, 0.465], [-1.44, 2.208, 0.821, 0.893, 2.303, 0.634], [0.765, 1.362, 0.255, 2.131, 1.052, 0.327], [-1.998, -1.691, 0.062, 1.757, 1.835, 0.718]]\nB: [[1.542, -1.233, 0.854, 2.268, 1.021, 0.755], [-2.098, 1.815, 0.076, 0.977, 1.531, 0.579], [1.499, 1.894, 0.799, 1.364, 1.243, 0.606], [-1.591, -1.777, -0.089, 1.375, 2.302, 0.818]]\nC: [[1.019, -1.513, 0.012, 1.939, 1.04, 0.603], [-1.397, 1.894, 0.192, 1.788, 2.263, 0.963], [0.794, 1.72, 0.728, 1.503, 1.344, 0.994], [-1.899, -1.035, 0.107, 1.802, 1.941, 0.705]]\nD: [[1.181, -1.566, 0.434, 1.91, 1.342, 0.847], [-1.636, 1.86, 0.387, 1.322, 1.894, 0.782], [1.234, 1.651, 0.4, 1.847, 1.393, 0.784], [-1.767, -1.535, 0.407, 1.331, 1.981, 0.802]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_112_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_112_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[2.632, 2.861, 0.973, 0.851, 0.88, 0.553]]\nB: [[2.217, 3.039, 0.859, 0.578, 0.679, 0.811]]\nC: [[2.372, 2.508, 1.395, 0.466, 0.758, 0.941]]\nD: [[2.418, 3.313, 1.363, 0.462, 1.217, 0.869]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the tv in the scene. The camera pose information includes: the rotation matrix: [[-0.869565, 0.231948, -0.435955], [0.492522, 0.471291, -0.731647], [0.035758, -0.850932, -0.524058]]; the translation vector: [2.750575, 3.154689, 1.290553], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[2.632, 2.861, 0.973, 0.851, 0.88, 0.553]]\nB: [[2.217, 3.039, 0.859, 0.578, 0.679, 0.811]]\nC: [[2.372, 2.508, 1.395, 0.466, 0.758, 0.941]]\nD: [[2.418, 3.313, 1.363, 0.462, 1.217, 0.869]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_113_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_113_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.523, -0.73, 1.669, 0.473, 3.389, 1.19]]\nB: [[-2.737, -0.956, 1.441, 0.102, 2.891, 0.9]]\nC: [[-2.415, -1.042, 1.71, -0.167, 2.518, 1.307]]\nD: [[-3.121, -1.319, 1.73, 0.166, 2.406, 0.532]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the board in the scene. The camera pose information includes: the rotation matrix: [[0.896132, -0.052356, 0.440688], [-0.436974, -0.277444, 0.855616], [0.07747, -0.959314, -0.271505]]; the translation vector: [3.211431, 3.110947, 1.584554], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.523, -0.73, 1.669, 0.473, 3.389, 1.19]]\nB: [[-2.737, -0.956, 1.441, 0.102, 2.891, 0.9]]\nC: [[-2.415, -1.042, 1.71, -0.167, 2.518, 1.307]]\nD: [[-3.121, -1.319, 1.73, 0.166, 2.406, 0.532]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_114_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_114_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.919, 2.881, 0.5, 1.076, 0.198, 0.512], [-0.387, 3.021, 0.763, 1.206, 0.18, 1.044]]\nB: [[0.967, 3.235, 0.454, 1.103, -0.268, 0.912], [-0.093, 2.837, 0.491, 1.606, 0.643, 1.265]]\nC: [[1.146, 2.813, 0.895, 1.333, -0.231, 0.884], [-0.108, 2.697, 0.646, 1.144, -0.245, 0.801]]\nD: [[1.405, 2.769, 0.583, 0.816, -0.053, 0.839], [-0.646, 2.953, 0.434, 1.464, 0.436, 0.68]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the mirror in the scene. The camera pose information includes: the rotation matrix: [[-0.880278, -0.246293, 0.405524], [-0.473973, 0.417832, -0.775091], [0.021459, -0.874503, -0.484545]]; the translation vector: [3.281806, 2.754624, 1.352781], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.919, 2.881, 0.5, 1.076, 0.198, 0.512], [-0.387, 3.021, 0.763, 1.206, 0.18, 1.044]]\nB: [[0.967, 3.235, 0.454, 1.103, -0.268, 0.912], [-0.093, 2.837, 0.491, 1.606, 0.643, 1.265]]\nC: [[1.146, 2.813, 0.895, 1.333, -0.231, 0.884], [-0.108, 2.697, 0.646, 1.144, -0.245, 0.801]]\nD: [[1.405, 2.769, 0.583, 0.816, -0.053, 0.839], [-0.646, 2.953, 0.434, 1.464, 0.436, 0.68]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_115_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_115_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.099, -1.623, 0.8, 1.091, 0.185, 1.674]]\nB: [[0.028, -1.324, 1.283, 0.847, -0.251, 1.976]]\nC: [[0.008, -1.165, 1.014, 1.132, -0.028, 1.19]]\nD: [[0.219, -1.325, 0.313, 1.01, 0.321, 1.757]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the doorframe in the scene. The camera pose information includes: the rotation matrix: [[-0.874867, -0.0675, 0.479638], [-0.482919, 0.197999, -0.852987], [-0.037391, -0.977875, -0.205819]]; the translation vector: [2.397274, 1.722858, 1.486845], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.099, -1.623, 0.8, 1.091, 0.185, 1.674]]\nB: [[0.028, -1.324, 1.283, 0.847, -0.251, 1.976]]\nC: [[0.008, -1.165, 1.014, 1.132, -0.028, 1.19]]\nD: [[0.219, -1.325, 0.313, 1.01, 0.321, 1.757]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_116_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_116_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.712, -1.245, 0.91, 1.048, 1.199, 2.013]]\nB: [[0.626, -1.611, 1.221, 1.09, 1.245, 2.069]]\nC: [[1.138, -1.446, 0.77, 0.846, 1.373, 1.96]]\nD: [[0.371, -1.441, 0.499, 0.655, 1.441, 2.321]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shower in the scene. The camera pose information includes: the rotation matrix: [[-0.612656, -0.411508, 0.674769], [-0.789543, 0.280105, -0.546043], [0.035694, -0.867296, -0.496511]]; the translation vector: [1.897828, 2.372103, 1.388776], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.712, -1.245, 0.91, 1.048, 1.199, 2.013]]\nB: [[0.626, -1.611, 1.221, 1.09, 1.245, 2.069]]\nC: [[1.138, -1.446, 0.77, 0.846, 1.373, 1.96]]\nD: [[0.371, -1.441, 0.499, 0.655, 1.441, 2.321]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_117_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_117_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.851, -0.281, 1.012, 0.232, 0.838, 2.123]]\nB: [[-0.647, 0.167, 1.047, -0.111, 0.572, 1.688]]\nC: [[-0.968, -0.496, 1.046, -0.014, 1.192, 1.751]]\nD: [[-0.616, -0.07, 1.075, 0.231, 1.203, 1.991]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the doorframe in the scene. The camera pose information includes: the rotation matrix: [[-0.48142, 0.335029, -0.809933], [0.872625, 0.096524, -0.478757], [-0.08222, -0.937251, -0.338823]]; the translation vector: [4.429162, 2.287411, 1.464776], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.851, -0.281, 1.012, 0.232, 0.838, 2.123]]\nB: [[-0.647, 0.167, 1.047, -0.111, 0.572, 1.688]]\nC: [[-0.968, -0.496, 1.046, -0.014, 1.192, 1.751]]\nD: [[-0.616, -0.07, 1.075, 0.231, 1.203, 1.991]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_118_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_118_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.76, 1.613, 0.501, 0.748, 1.477, 2.283], [-1.256, 0.486, 0.695, 0.261, -0.004, 1.392]]\nB: [[1.66, 0.843, 1.041, 1.024, 1.548, 1.575], [-0.68, 1.177, 0.879, 0.467, 0.635, 2.319]]\nC: [[1.906, 1.059, 1.056, 0.263, 1.047, 1.4], [-0.793, 1.238, 0.654, 0.903, 0.438, 1.901]]\nD: [[1.788, 1.153, 0.954, 0.56, 1.154, 1.881], [-0.939, 0.896, 0.911, 0.636, 0.225, 1.837]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. The camera pose information includes: the rotation matrix: [[-0.414473, -0.491559, 0.765887], [-0.909569, 0.196057, -0.366396], [0.029948, -0.848488, -0.528367]]; the translation vector: [0.955419, 3.497842, 1.497559], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.76, 1.613, 0.501, 0.748, 1.477, 2.283], [-1.256, 0.486, 0.695, 0.261, -0.004, 1.392]]\nB: [[1.66, 0.843, 1.041, 1.024, 1.548, 1.575], [-0.68, 1.177, 0.879, 0.467, 0.635, 2.319]]\nC: [[1.906, 1.059, 1.056, 0.263, 1.047, 1.4], [-0.793, 1.238, 0.654, 0.903, 0.438, 1.901]]\nD: [[1.788, 1.153, 0.954, 0.56, 1.154, 1.881], [-0.939, 0.896, 0.911, 0.636, 0.225, 1.837]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_119_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_119_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.054, 1.184, 0.861, 1.846, 0.937, 1.341]]\nB: [[0.486, 0.802, 0.412, 1.751, 1.322, 0.856]]\nC: [[0.138, 0.31, 0.136, 1.361, 1.636, 1.27]]\nD: [[0.461, 1.003, 0.863, 1.591, 0.946, 0.97]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the bed in the scene. The camera pose information includes: the rotation matrix: [[-0.778266, 0.076502, -0.623257], [0.626532, 0.028295, -0.778882], [-0.041951, -0.996668, -0.069952]]; the translation vector: [4.354075, 2.27787, 1.510689], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.054, 1.184, 0.861, 1.846, 0.937, 1.341]]\nB: [[0.486, 0.802, 0.412, 1.751, 1.322, 0.856]]\nC: [[0.138, 0.31, 0.136, 1.361, 1.636, 1.27]]\nD: [[0.461, 1.003, 0.863, 1.591, 0.946, 0.97]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_120_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_120_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.119, 2.382, -1.353, 2.688, 3.135, 0.067], [0.13, -2.518, 2.241, 3.948, 1.36, 0.679], [-0.552, 1.97, 3.469, 0.872, 1.36, 0.237]]\nB: [[1.56, 2.895, -0.877, 2.03, 3.162, 0.481], [-0.012, -2.383, 2.434, 3.863, 1.73, 0.8], [-1.053, 2.303, 3.432, 0.863, 1.12, -0.287]]\nC: [[1.156, 2.743, -1.086, 2.211, 3.278, 0.076], [-0.143, -2.063, 2.035, 4.283, 1.757, 0.379], [-1.038, 2.35, 3.4, 1.326, 1.515, 0.161]]\nD: [[1.559, 2.394, -0.855, 1.997, 3.635, -0.357], [-0.381, -2.532, 1.718, 3.949, 1.906, 0.055], [-0.752, 2.698, 2.911, 0.92, 1.137, -0.299]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. The camera pose information includes: the rotation matrix: [[0.485844, -0.617081, 0.619005], [-0.873216, -0.311825, 0.374512], [-0.038083, -0.722479, -0.690343]]; the translation vector: [-0.164865, 3.073333, 1.323993], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.119, 2.382, -1.353, 2.688, 3.135, 0.067], [0.13, -2.518, 2.241, 3.948, 1.36, 0.679], [-0.552, 1.97, 3.469, 0.872, 1.36, 0.237]]\nB: [[1.56, 2.895, -0.877, 2.03, 3.162, 0.481], [-0.012, -2.383, 2.434, 3.863, 1.73, 0.8], [-1.053, 2.303, 3.432, 0.863, 1.12, -0.287]]\nC: [[1.156, 2.743, -1.086, 2.211, 3.278, 0.076], [-0.143, -2.063, 2.035, 4.283, 1.757, 0.379], [-1.038, 2.35, 3.4, 1.326, 1.515, 0.161]]\nD: [[1.559, 2.394, -0.855, 1.997, 3.635, -0.357], [-0.381, -2.532, 1.718, 3.949, 1.906, 0.055], [-0.752, 2.698, 2.911, 0.92, 1.137, -0.299]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_121_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_121_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.806, 0.963, 2.18, -0.15, 0.284, 0.747], [1.781, 1.911, 0.25, 0.639, -0.36, 0.423], [0.587, 2.601, 0.863, 0.066, -0.232, 0.753], [-0.114, 0.242, -0.467, -0.049, 0.569, 0.595], [1.888, 2.229, 0.499, -0.382, 0.046, 0.309], [1.509, 1.846, 0.444, -0.228, 0.075, -0.285]]\nB: [[2.196, 0.809, 1.96, 0.418, -0.322, 0.279], [1.469, 1.583, 0.002, 0.664, -0.01, 0.038], [-0.066, 2.119, 1.533, 0.754, 0.595, 0.72], [0.503, 0.542, -0.003, 0.788, 0.94, -0.045], [2.066, 1.491, 0.9, -0.225, 0.433, 0.456], [1.952, 1.903, 0.373, -0.138, 0.52, 0.69]]\nC: [[2.185, 1.438, 1.612, -0.033, 0.189, 0.171], [2.033, 1.77, 0.709, 0.481, 0.536, -0.285], [-0.228, 2.34, 1.714, 0.595, 0.3, -0.06], [-0.139, 0.192, -0.291, 0.431, 0.48, -0.343], [2.39, 1.84, 0.691, -0.012, 0.252, 0.48], [1.891, 1.81, 0.417, -0.052, -0.296, 0.438]]\nD: [[2.211, 1.285, 1.775, 0.127, 0.174, 0.292], [1.829, 1.683, 0.248, 0.278, 0.134, 0.131], [0.255, 2.241, 1.304, 0.333, 0.221, 0.253], [0.094, 0.321, -0.047, 0.34, 0.473, 0.108], [1.975, 1.944, 0.507, 0.101, 0.048, 0.175], [1.799, 1.959, 0.282, 0.261, 
0.114, 0.195]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[-0.877021, 0.121711, -0.464779], [0.46491, 0.459041, -0.75706], [0.12121, -0.880038, -0.459173]]; the translation vector: [3.922419, 3.230202, 1.747047], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.806, 0.963, 2.18, -0.15, 0.284, 0.747], [1.781, 1.911, 0.25, 0.639, -0.36, 0.423], [0.587, 2.601, 0.863, 0.066, -0.232, 0.753], [-0.114, 0.242, -0.467, -0.049, 0.569, 0.595], [1.888, 2.229, 0.499, -0.382, 0.046, 0.309], [1.509, 1.846, 0.444, -0.228, 0.075, -0.285]]\nB: [[2.196, 0.809, 1.96, 0.418, -0.322, 0.279], [1.469, 1.583, 0.002, 0.664, -0.01, 0.038], [-0.066, 2.119, 1.533, 0.754, 0.595, 0.72], [0.503, 0.542, -0.003, 0.788, 0.94, -0.045], [2.066, 1.491, 0.9, -0.225, 0.433, 0.456], [1.952, 1.903, 0.373, -0.138, 0.52, 0.69]]\nC: [[2.185, 1.438, 1.612, -0.033, 0.189, 0.171], [2.033, 1.77, 0.709, 0.481, 0.536, -0.285], [-0.228, 2.34, 1.714, 0.595, 0.3, -0.06], [-0.139, 0.192, -0.291, 0.431, 0.48, -0.343], [2.39, 1.84, 0.691, -0.012, 0.252, 0.48], [1.891, 1.81, 0.417, -0.052, -0.296, 0.438]]\nD: [[2.211, 1.285, 1.775, 0.127, 0.174, 0.292], [1.829, 1.683, 0.248, 0.278, 0.134, 0.131], [0.255, 2.241, 1.304, 0.333, 0.221, 0.253], [0.094, 0.321, -0.047, 0.34, 0.473, 0.108], [1.975, 1.944, 0.507, 0.101, 0.048, 0.175], [1.799, 1.959, 0.282, 0.261, 0.114, 0.195]]", "input_image_path": 
["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_122_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_122_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.687, 1.332, 0.035, 0.175, 2.444, 0.931], [-0.771, -0.087, 1.874, 0.351, 2.708, 1.076], [1.073, -0.054, 0.17, 0.668, 1.99, 0.53], [0.962, 0.659, 2.221, 0.281, 1.867, 1.287]]\nB: [[-0.793, 1.344, 0.264, -0.144, 2.319, 0.303], [-0.892, -0.228, 1.276, 0.428, 2.505, 1.37], [0.859, 0.126, 0.332, 0.439, 1.529, 0.603], [0.425, 0.012, 2.016, 0.908, 2.008, 0.841]]\nC: [[-1.133, 0.424, 0.86, -0.054, 2.382, 0.943], [-1.282, -0.466, 1.739, 0.288, 2.29, 1.182], [0.76, 0.578, 0.124, 0.797, 1.631, 0.597], [0.974, 0.003, 1.857, 0.274, 1.983, 0.737]]\nD: [[-0.66, 0.92, 0.369, 0.068, 2.758, 0.803], [-0.938, -0.063, 1.743, 0.134, 2.421, 0.981], [0.672, 0.378, 0.36, 0.646, 1.681, 0.828], [0.776, 0.348, 1.743, 0.449, 1.71, 0.968]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the kitchen cabinets in the scene. The camera pose information includes: the rotation matrix: [[0.815869, 0.244354, -0.524069], [0.578211, -0.336271, 0.743367], [0.005416, -0.909513, -0.415641]]; the translation vector: [2.358014, 1.230078, 1.369842], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.687, 1.332, 0.035, 0.175, 2.444, 0.931], [-0.771, -0.087, 1.874, 0.351, 2.708, 1.076], [1.073, -0.054, 0.17, 0.668, 1.99, 0.53], [0.962, 0.659, 2.221, 0.281, 1.867, 1.287]]\nB: [[-0.793, 1.344, 0.264, -0.144, 2.319, 0.303], [-0.892, -0.228, 1.276, 0.428, 2.505, 1.37], [0.859, 0.126, 0.332, 0.439, 1.529, 0.603], [0.425, 0.012, 2.016, 0.908, 2.008, 0.841]]\nC: [[-1.133, 0.424, 0.86, -0.054, 2.382, 0.943], [-1.282, -0.466, 1.739, 0.288, 2.29, 1.182], [0.76, 0.578, 0.124, 0.797, 1.631, 0.597], [0.974, 0.003, 1.857, 0.274, 1.983, 0.737]]\nD: [[-0.66, 0.92, 0.369, 0.068, 2.758, 0.803], [-0.938, -0.063, 1.743, 0.134, 2.421, 0.981], [0.672, 0.378, 0.36, 0.646, 1.681, 0.828], [0.776, 0.348, 1.743, 0.449, 1.71, 0.968]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_123_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_123_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.177, -0.077, 0.773, 0.974, 8.45, 1.181]]\nB: [[1.408, -0.085, 0.96, 1.256, 8.826, 1.391]]\nC: [[1.29, 0.138, 0.989, 1.682, 8.854, 1.495]]\nD: [[1.087, -0.264, 0.505, 1.705, 9.131, 0.904]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the blinds in the scene. The camera pose information includes: the rotation matrix: [[0.117057, -0.769276, 0.628102], [-0.987232, -0.021336, 0.157855], [-0.108033, -0.638561, -0.761951]]; the translation vector: [1.032686, 1.226834, 2.186959], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.177, -0.077, 0.773, 0.974, 8.45, 1.181]]\nB: [[1.408, -0.085, 0.96, 1.256, 8.826, 1.391]]\nC: [[1.29, 0.138, 0.989, 1.682, 8.854, 1.495]]\nD: [[1.087, -0.264, 0.505, 1.705, 9.131, 0.904]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_124_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_124_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.333, 1.449, 0.466, 0.575, 0.885, 0.579], [-0.819, 0.179, 1.256, -0.187, 0.08, 0.261], [-0.881, -0.764, 1.817, 0.04, 0.55, 0.119], [-0.782, -0.821, 1.039, -0.465, -0.079, -0.041]]\nB: [[0.913, 1.406, 0.914, 0.154, 0.729, 0.951], [-0.918, 0.236, 1.614, 0.027, 0.343, 0.415], [-0.932, -0.471, 1.376, 0.043, 0.42, 0.318], [-0.937, -1.266, 1.202, 0.021, 0.397, 0.404]]\nC: [[0.638, 1.511, 1.273, 0.574, 0.958, 0.746], [-1.165, 0.389, 1.897, 0.474, -0.02, 0.527], [-0.474, 0.021, 1.802, 0.289, 0.006, -0.062], [-1.35, -1.672, 1.153, 0.07, 0.246, 0.557]]\nD: [[0.615, 1.775, 1.082, 0.394, 0.94, 1.366], [-0.883, -0.231, 1.634, -0.385, 0.134, 0.914], [-0.757, -0.827, 1.097, 0.253, 0.741, 0.546], [-1.013, -1.459, 1.475, -0.37, 0.862, 0.783]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the picture in the scene. 
The camera pose information includes: the rotation matrix: [[-0.042655, 0.409797, -0.911179], [0.998036, -0.024411, -0.0577], [-0.045888, -0.91185, -0.40795]]; the translation vector: [2.423933, 1.356295, 3.282493], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.333, 1.449, 0.466, 0.575, 0.885, 0.579], [-0.819, 0.179, 1.256, -0.187, 0.08, 0.261], [-0.881, -0.764, 1.817, 0.04, 0.55, 0.119], [-0.782, -0.821, 1.039, -0.465, -0.079, -0.041]]\nB: [[0.913, 1.406, 0.914, 0.154, 0.729, 0.951], [-0.918, 0.236, 1.614, 0.027, 0.343, 0.415], [-0.932, -0.471, 1.376, 0.043, 0.42, 0.318], [-0.937, -1.266, 1.202, 0.021, 0.397, 0.404]]\nC: [[0.638, 1.511, 1.273, 0.574, 0.958, 0.746], [-1.165, 0.389, 1.897, 0.474, -0.02, 0.527], [-0.474, 0.021, 1.802, 0.289, 0.006, -0.062], [-1.35, -1.672, 1.153, 0.07, 0.246, 0.557]]\nD: [[0.615, 1.775, 1.082, 0.394, 0.94, 1.366], [-0.883, -0.231, 1.634, -0.385, 0.134, 0.914], [-0.757, -0.827, 1.097, 0.253, 0.741, 0.546], [-1.013, -1.459, 1.475, -0.37, 0.862, 0.783]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_125_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_125_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.696, 2.84, 0.243, 0.913, 0.14, 0.965], [-2.524, -1.89, 1.517, 0.263, 1.169, 1.097]]\nB: [[-2.032, 3.081, 0.673, 0.874, 0.207, 1.282], [-2.435, -2.167, 1.207, 0.214, 
0.953, 0.8]]\nC: [[-2.083, 2.817, 0.915, 0.407, -0.083, 1.119], [-2.185, -1.778, 0.77, 0.561, 0.888, 0.902]]\nD: [[-1.79, 3.485, 0.577, 0.547, 0.315, 1.286], [-2.516, -2.509, 1.071, 0.577, 1.197, 0.616]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the window in the scene. The camera pose information includes: the rotation matrix: [[0.299058, 0.37418, -0.877812], [0.95368, -0.085842, 0.288314], [0.032528, -0.923375, -0.38252]]; the translation vector: [3.908031, 4.993837, 1.41318], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.696, 2.84, 0.243, 0.913, 0.14, 0.965], [-2.524, -1.89, 1.517, 0.263, 1.169, 1.097]]\nB: [[-2.032, 3.081, 0.673, 0.874, 0.207, 1.282], [-2.435, -2.167, 1.207, 0.214, 0.953, 0.8]]\nC: [[-2.083, 2.817, 0.915, 0.407, -0.083, 1.119], [-2.185, -1.778, 0.77, 0.561, 0.888, 0.902]]\nD: [[-1.79, 3.485, 0.577, 0.547, 0.315, 1.286], [-2.516, -2.509, 1.071, 0.577, 1.197, 0.616]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_126_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_126_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.147, 0.119, 0.251, 0.463, 0.502, 0.493], [1.142, -0.546, 0.997, 0.457, 0.597, 0.473], [1.198, 0.632, 0.925, 0.452, 0.473, 0.432], [1.163, 0.092, 1.069, 0.44, 0.432, 0.506]]\nB: [[0.939, -0.362, 0.676, 0.67, 0.041, 0.58], [0.766, -0.402, 0.786, 0.189, 1.052, 0.915], [1.684, 0.428, 1.283, 0.635, 0.353, 0.864], [1.275, -0.104, 1.385, 0.008, 0.054, 0.956]]\nC: [[1.248, 0.165, 0.549, 0.255, 0.722, 0.454], [1.139, -0.967, 1.065, 0.247, 0.425, 0.531], [0.839, 1.106, 1.224, 0.271, 0.846, 0.671], [0.954, 0.329, 1.422, 0.774, 0.624, 0.313]]\nD: [[1.328, 0.233, 0.409, 0.859, 0.672, 0.071], [1.492, -0.434, 0.743, 0.731, 0.907, 0.382], [1.626, 0.478, 0.601, 0.312, 0.631, 0.904], [1.629, 0.385, 0.684, 0.845, 0.492, 0.801]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the printer in the scene. The camera pose information includes: the rotation matrix: [[0.985254, -0.134646, 0.105573], [-0.142287, -0.302097, 0.942599], [-0.095024, -0.94372, -0.3168]]; the translation vector: [1.134605, 1.549487, 1.505245], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.147, 0.119, 0.251, 0.463, 0.502, 0.493], [1.142, -0.546, 0.997, 0.457, 0.597, 0.473], [1.198, 0.632, 0.925, 0.452, 0.473, 0.432], [1.163, 0.092, 1.069, 0.44, 0.432, 0.506]]\nB: [[0.939, -0.362, 0.676, 0.67, 0.041, 0.58], [0.766, -0.402, 0.786, 0.189, 1.052, 0.915], [1.684, 0.428, 1.283, 0.635, 0.353, 0.864], [1.275, -0.104, 1.385, 0.008, 0.054, 0.956]]\nC: [[1.248, 0.165, 0.549, 0.255, 0.722, 0.454], [1.139, -0.967, 1.065, 0.247, 0.425, 0.531], [0.839, 1.106, 1.224, 0.271, 0.846, 0.671], [0.954, 0.329, 1.422, 0.774, 0.624, 0.313]]\nD: [[1.328, 0.233, 0.409, 0.859, 0.672, 0.071], [1.492, -0.434, 0.743, 0.731, 0.907, 0.382], [1.626, 0.478, 0.601, 0.312, 0.631, 0.904], [1.629, 0.385, 0.684, 0.845, 0.492, 0.801]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_127_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_127_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.978, 2.218, 0.88, 0.413, 0.1, 0.702], [-1.917, 2.1, 1.145, 0.538, 0.288, 0.643]]\nB: [[-1.584, 2.193, 0.205, 0.199, 0.268, 0.839], [-1.535, 2.333, 0.994, 0.342, 0.187, 0.134]]\nC: [[-1.966, 2.066, 0.622, 0.287, 0.189, 0.88], [-1.737, 2.041, 0.848, 0.173, 0.149, 0.382]]\nD: [[-1.998, 2.157, 0.963, 0.629, -0.078, 1.235], [-1.653, 2.214, 0.646, 0.156, 0.285, 0.243]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the towel in the scene. 
The camera pose information includes: the rotation matrix: [[0.686341, -0.358824, 0.632599], [-0.727213, -0.35045, 0.590209], [0.009912, -0.865119, -0.50147]]; the translation vector: [2.486494, 4.601647, 1.455454], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.978, 2.218, 0.88, 0.413, 0.1, 0.702], [-1.917, 2.1, 1.145, 0.538, 0.288, 0.643]]\nB: [[-1.584, 2.193, 0.205, 0.199, 0.268, 0.839], [-1.535, 2.333, 0.994, 0.342, 0.187, 0.134]]\nC: [[-1.966, 2.066, 0.622, 0.287, 0.189, 0.88], [-1.737, 2.041, 0.848, 0.173, 0.149, 0.382]]\nD: [[-1.998, 2.157, 0.963, 0.629, -0.078, 1.235], [-1.653, 2.214, 0.646, 0.156, 0.285, 0.243]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_128_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_128_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.047, -0.588, -0.114, 0.759, 2.976, 0.961]]\nB: [[-1.511, -0.608, 0.081, 1.102, 2.98, 1.011]]\nC: [[-1.203, -0.385, 0.359, 0.756, 2.647, 0.817]]\nD: [[-1.37, -0.358, 0.323, 0.77, 2.437, 0.409]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the desk in the scene. 
The camera pose information includes: the rotation matrix: [[-0.802837, 0.056561, -0.593509], [0.596192, 0.071654, -0.799638], [-0.002701, -0.995825, -0.091248]]; the translation vector: [2.583219, 4.008804, 1.439254], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.047, -0.588, -0.114, 0.759, 2.976, 0.961]]\nB: [[-1.511, -0.608, 0.081, 1.102, 2.98, 1.011]]\nC: [[-1.203, -0.385, 0.359, 0.756, 2.647, 0.817]]\nD: [[-1.37, -0.358, 0.323, 0.77, 2.437, 0.409]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_129_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_129_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.904, -0.732, 0.092, 0.243, 0.59, 0.859], [-1.501, -0.76, 0.757, 1.132, 0.498, 0.756]]\nB: [[1.126, -0.366, 0.392, 0.688, 0.942, 0.802], [-1.375, -0.274, 0.471, 1.076, 0.886, 0.947]]\nC: [[0.868, -0.772, 0.151, 0.633, 1.223, 0.791], [-1.775, -0.718, 0.331, 1.093, 0.846, 1.4]]\nD: [[1.114, -0.309, 0.254, 0.953, 0.846, 0.427], [-1.752, 0.101, 0.877, 0.811, 1.045, 0.651]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the dresser in the scene. 
The camera pose information includes: the rotation matrix: [[-0.442667, -0.46733, 0.765277], [-0.896368, 0.253361, -0.363776], [-0.023888, -0.847001, -0.531054]]; the translation vector: [2.453469, 1.905797, 1.451684], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.904, -0.732, 0.092, 0.243, 0.59, 0.859], [-1.501, -0.76, 0.757, 1.132, 0.498, 0.756]]\nB: [[1.126, -0.366, 0.392, 0.688, 0.942, 0.802], [-1.375, -0.274, 0.471, 1.076, 0.886, 0.947]]\nC: [[0.868, -0.772, 0.151, 0.633, 1.223, 0.791], [-1.775, -0.718, 0.331, 1.093, 0.846, 1.4]]\nD: [[1.114, -0.309, 0.254, 0.953, 0.846, 0.427], [-1.752, 0.101, 0.877, 0.811, 1.045, 0.651]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_130_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_130_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.61, -0.414, 0.408, 4.655, 4.102, -0.244]]\nB: [[0.311, -0.524, 0.039, 4.829, 4.569, 0.162]]\nC: [[0.203, -0.323, 0.325, 5.217, 4.229, 0.65]]\nD: [[0.089, -0.712, 0.114, 5.287, 4.148, 0.629]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. 
The camera pose information includes: the rotation matrix: [[0.633294, -0.360819, 0.684652], [-0.773758, -0.312806, 0.550863], [0.015401, -0.878613, -0.477285]]; the translation vector: [3.241882, 3.386626, 1.367882], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.61, -0.414, 0.408, 4.655, 4.102, -0.244]]\nB: [[0.311, -0.524, 0.039, 4.829, 4.569, 0.162]]\nC: [[0.203, -0.323, 0.325, 5.217, 4.229, 0.65]]\nD: [[0.089, -0.712, 0.114, 5.287, 4.148, 0.629]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_131_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_131_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[2.917, -0.639, 0.621, 0.336, 6.104, 1.183], [0.361, -2.884, 0.637, 5.668, -0.024, 1.879], [-2.761, -1.433, 0.573, -0.228, 0.551, 1.578], [-3.018, -2.023, 1.21, 0.696, 0.033, 1.511], [-2.944, 0.173, 0.693, 0.375, 4.326, 1.889]]\nB: [[2.88, 0.258, 0.921, 0.635, 5.466, 1.974], [-0.061, -2.646, 0.552, 6.114, -0.006, 1.775], [-3.069, -1.6, 0.804, 0.521, 0.433, 1.489], [-3.084, -1.953, 1.232, 0.742, 0.11, 1.43], [-2.84, 1.121, 0.562, 0.204, 4.902, 1.824]]\nC: [[3.129, -0.248, 1.092, 0.28, 6.006, 1.69], [0.277, -3.248, 1.229, 5.639, 0.457, 1.83], [-2.943, -1.702, 1.206, 0.61, 0.818, 1.511], [-2.632, -1.423, 0.42, 0.373, 0.138, 1.635], [-3.02, 0.349, 0.427, 0.566, 4.15, 1.781]]\nD: [[3.003, -0.173, 0.772, 
0.324, 5.743, 1.505], [-0.052, -3.097, 0.827, 6.005, 0.286, 1.553], [-3.164, -1.839, 0.77, 0.192, 0.577, 1.362], [-2.872, -1.562, 0.743, 0.498, 0.153, 1.361], [-2.619, 0.636, 0.832, 0.279, 4.454, 1.688]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.924593, 0.219455, -0.311397], [0.371095, 0.334047, -0.86643], [-0.086121, -0.916653, -0.390296]]; the translation vector: [7.650298, 2.745242, 1.444521], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[2.917, -0.639, 0.621, 0.336, 6.104, 1.183], [0.361, -2.884, 0.637, 5.668, -0.024, 1.879], [-2.761, -1.433, 0.573, -0.228, 0.551, 1.578], [-3.018, -2.023, 1.21, 0.696, 0.033, 1.511], [-2.944, 0.173, 0.693, 0.375, 4.326, 1.889]]\nB: [[2.88, 0.258, 0.921, 0.635, 5.466, 1.974], [-0.061, -2.646, 0.552, 6.114, -0.006, 1.775], [-3.069, -1.6, 0.804, 0.521, 0.433, 1.489], [-3.084, -1.953, 1.232, 0.742, 0.11, 1.43], [-2.84, 1.121, 0.562, 0.204, 4.902, 1.824]]\nC: [[3.129, -0.248, 1.092, 0.28, 6.006, 1.69], [0.277, -3.248, 1.229, 5.639, 0.457, 1.83], [-2.943, -1.702, 1.206, 0.61, 0.818, 1.511], [-2.632, -1.423, 0.42, 0.373, 0.138, 1.635], [-3.02, 0.349, 0.427, 0.566, 4.15, 1.781]]\nD: [[3.003, -0.173, 0.772, 0.324, 5.743, 1.505], [-0.052, -3.097, 0.827, 6.005, 0.286, 1.553], [-3.164, -1.839, 0.77, 0.192, 0.577, 1.362], [-2.872, -1.562, 0.743, 0.498, 0.153, 1.361], [-2.619, 0.636, 0.832, 0.279, 4.454, 1.688]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_132_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_132_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.133, 0.902, 0.422, 0.039, 0.22, 0.632]]\nB: [[-0.076, 0.973, 0.415, -0.004, 1.174, 1.248]]\nC: [[0.144, 0.321, 0.705, -0.021, 0.284, 1.035]]\nD: [[0.355, 0.535, 0.346, 0.07, 0.677, 0.805]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the dishwasher in the scene. The camera pose information includes: the rotation matrix: [[0.975982, 0.033782, -0.215214], [0.215389, -0.297687, 0.930048], [-0.032648, -0.954066, -0.297814]]; the translation vector: [2.838751, 1.414222, 1.664536], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.133, 0.902, 0.422, 0.039, 0.22, 0.632]]\nB: [[-0.076, 0.973, 0.415, -0.004, 1.174, 1.248]]\nC: [[0.144, 0.321, 0.705, -0.021, 0.284, 1.035]]\nD: [[0.355, 0.535, 0.346, 0.07, 0.677, 0.805]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_133_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_133_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.578, -0.219, 1.033, 0.257, 3.748, 1.935], [2.209, 0.671, 0.788, -0.095, 3.689, 2.508], [0.223, -2.291, 0.584, 0.625, 0.246, 1.596], [0.409, -2.777, 1.049, -0.283, 0.131, 1.81]]\nB: [[-2.04, 0.586, 0.772, 0.304, 3.812, 1.848], [1.619, 0.488, 0.655, 0.555, 3.765, 1.94], [0.355, -2.984, 1.136, -0.107, 0.055, 1.74], [0.752, -2.78, 0.749, 0.33, 0.188, 1.815]]\nC: [[-1.581, 0.188, 1.09, 0.283, 3.526, 2.183], [1.935, 0.185, 1.045, 0.157, 3.57, 2.128], [0.384, -2.556, 0.863, 0.244, 0.135, 1.758], [0.278, -2.37, 1.022, 0.1, 0.539, 2.045]]\nD: [[-1.492, -0.235, 1.434, 0.171, 3.146, 1.819], [2.128, 0.651, 1.233, 0.526, 3.819, 1.664], [0.295, -2.822, 1.218, -0.087, 0.184, 1.532], [0.45, -1.956, 1.009, 0.299, 0.644, 2.242]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. 
The camera pose information includes: the rotation matrix: [[-0.037281, 0.595041, -0.80283], [0.998378, -0.012419, -0.055566], [-0.043034, -0.803599, -0.593613]]; the translation vector: [3.95675, 2.244474, 1.442954], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.578, -0.219, 1.033, 0.257, 3.748, 1.935], [2.209, 0.671, 0.788, -0.095, 3.689, 2.508], [0.223, -2.291, 0.584, 0.625, 0.246, 1.596], [0.409, -2.777, 1.049, -0.283, 0.131, 1.81]]\nB: [[-2.04, 0.586, 0.772, 0.304, 3.812, 1.848], [1.619, 0.488, 0.655, 0.555, 3.765, 1.94], [0.355, -2.984, 1.136, -0.107, 0.055, 1.74], [0.752, -2.78, 0.749, 0.33, 0.188, 1.815]]\nC: [[-1.581, 0.188, 1.09, 0.283, 3.526, 2.183], [1.935, 0.185, 1.045, 0.157, 3.57, 2.128], [0.384, -2.556, 0.863, 0.244, 0.135, 1.758], [0.278, -2.37, 1.022, 0.1, 0.539, 2.045]]\nD: [[-1.492, -0.235, 1.434, 0.171, 3.146, 1.819], [2.128, 0.651, 1.233, 0.526, 3.819, 1.664], [0.295, -2.822, 1.218, -0.087, 0.184, 1.532], [0.45, -1.956, 1.009, 0.299, 0.644, 2.242]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_134_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_134_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.199, 1.125, 0.399, 0.214, 0.05, 0.589]]\nB: [[0.02, 1.322, 0.476, 0.689, 0.454, 0.768]]\nC: [[0.504, 0.831, 0.74, 0.202, 0.254, 0.39]]\nD: [[0.93, 1.224, 1.103, 0.115, -0.143, 
0.862]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shelf in the scene. The camera pose information includes: the rotation matrix: [[0.994446, -0.078697, 0.06988], [-0.104992, -0.787844, 0.606859], [0.007297, -0.610826, -0.791731]]; the translation vector: [1.305105, 0.510448, 1.183315], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.199, 1.125, 0.399, 0.214, 0.05, 0.589]]\nB: [[0.02, 1.322, 0.476, 0.689, 0.454, 0.768]]\nC: [[0.504, 0.831, 0.74, 0.202, 0.254, 0.39]]\nD: [[0.93, 1.224, 1.103, 0.115, -0.143, 0.862]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_135_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_135_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.403, 0.709, 0.086, 0.284, 0.032, 0.061]]\nB: [[1.358, 0.357, -0.003, 0.163, 0.142, 0.021]]\nC: [[1.451, 0.553, 0.13, 0.387, 0.236, 0.338]]\nD: [[1.592, 0.722, 0.492, 0.54, 0.067, 0.402]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the trash can in the scene. 
The camera pose information includes: the rotation matrix: [[-0.573389, -0.355745, 0.738018], [-0.818965, 0.223754, -0.528424], [0.02285, -0.907403, -0.419641]]; the translation vector: [2.061407, 3.857203, 1.382209], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.403, 0.709, 0.086, 0.284, 0.032, 0.061]]\nB: [[1.358, 0.357, -0.003, 0.163, 0.142, 0.021]]\nC: [[1.451, 0.553, 0.13, 0.387, 0.236, 0.338]]\nD: [[1.592, 0.722, 0.492, 0.54, 0.067, 0.402]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_136_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_136_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[2.019, 1.716, -0.199, 0.133, 0.443, 0.08]]\nB: [[1.539, 1.317, 0.169, 1.021, 0.789, 0.672]]\nC: [[1.691, 1.543, 0.248, 0.524, 0.565, 0.475]]\nD: [[1.676, 1.663, -0.114, 0.76, 0.881, 0.004]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the footrest in the scene. The camera pose information includes: the rotation matrix: [[-0.752388, 0.33007, -0.570058], [0.655329, 0.287372, -0.698542], [-0.066749, -0.89915, -0.43252]]; the translation vector: [3.814293, 2.583141, 1.394159], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[2.019, 1.716, -0.199, 0.133, 0.443, 0.08]]\nB: [[1.539, 1.317, 0.169, 1.021, 0.789, 0.672]]\nC: [[1.691, 1.543, 0.248, 0.524, 0.565, 0.475]]\nD: [[1.676, 1.663, -0.114, 0.76, 0.881, 0.004]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_137_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_137_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[2.123, 1.117, 1.321, 0.743, 0.381, 0.635], [2.682, 1.29, 0.828, 0.463, 1.156, 0.705], [2.834, 1.547, 0.399, 0.032, 0.368, 0.16], [3.096, 0.047, 0.87, -0.078, 0.326, -0.125], [2.563, 0.993, 1.356, 0.811, 0.73, 0.229], [2.143, 0.969, 1.037, 0.642, 1.052, 0.055], [2.659, -0.615, 0.836, 0.248, 1.054, 0.022], [3.223, -0.649, 1.168, 0.253, 1.288, 0.671], [2.454, -0.393, 0.682, 0.657, 1.137, 0.691], [-3.644, 1.213, 1.46, 0.605, 1.274, 0.706], [-3.657, -0.185, 0.669, 0.323, 1.16, 0.8], [-3.228, -0.103, 1.329, 0.441, 0.997, 0.754], [-3.305, 0.306, 0.543, 0.056, 1.942, 0.326], [-3.554, -0.503, 0.414, 0.642, 0.665, 0.745], [-2.947, -0.695, 0.368, 0.59, 0.436, 0.372], [-3.593, -0.106, 0.806, 0.216, 0.592, 0.301]]\nB: [[2.989, 1.126, 0.794, -0.179, 0.245, 0.369], [2.962, 1.061, 0.623, -0.057, 0.36, 0.431], [2.845, 1.185, 0.945, 0.308, 0.535, 0.574], [2.424, 0.962, 1.637, -0.272, 0.494, 0.77], [3.085, 0.394, 0.93, 0.245, 0.901, 0.482], [2.87, 0.321, 0.254, 0.308, 0.264, 0.679], [2.834, -0.509, 1.34, 0.641, 0.49, 0.271], 
[2.993, -0.295, 0.769, -0.075, 1.002, 0.589], [3.132, -0.129, 0.78, 0.069, 1.025, 0.007], [-2.917, 1.638, 1.353, 0.35, 0.736, 0.591], [-2.828, -0.168, 1.186, 0.057, 1.347, 0.51], [-3.297, -0.456, 0.362, 0.307, 0.654, 0.781], [-3.301, 0.612, 0.703, 0.328, 1.414, 0.306], [-2.89, -0.213, 0.298, -0.086, 1.058, 0.488], [-2.855, -0.016, -0.219, -0.168, 0.422, -0.035], [-3.555, 0.252, 0.516, -0.109, 1.029, 0.664]]\nC: [[2.568, 1.418, 1.271, 0.257, 0.709, 0.306], [2.646, 1.448, 0.95, 0.305, 0.76, 0.302], [2.592, 1.461, 0.636, 0.212, 0.718, 0.28], [2.65, 0.514, 1.213, 0.222, 0.814, 0.309], [2.738, 0.497, 0.888, 0.381, 0.863, 0.308], [2.639, 0.563, 0.627, 0.305, 0.736, 0.188], [2.693, -0.392, 1.14, 0.281, 0.891, 0.334], [2.727, -0.372, 0.833, 0.29, 0.926, 0.3], [2.691, -0.383, 0.563, 0.264, 0.854, 0.201], [-3.22, 1.231, 1.017, 0.313, 0.915, 0.346], [-3.273, 0.289, 0.923, 0.23, 1.204, 0.355], [-3.222, -0.487, 0.833, 0.334, 0.747, 0.368], [-3.341, 0.627, 0.626, 0.449, 1.466, 0.437], [-3.265, -0.411, 0.526, 0.337, 0.641, 0.343], [-3.203, -0.328, 0.27, 0.175, 0.592, 0.204], [-3.277, 0.365, 0.338, 0.332, 0.934, 0.242]]\nD: [[2.244, 1.249, 1.196, 0.59, 0.671, 0.591], [3.002, 1.584, 0.459, 0.732, 0.625, -0.064], [2.803, 1.399, 0.195, 0.554, 0.24, -0.185], [2.948, 0.428, 1.564, 0.649, 0.642, 0.076], [2.502, 0.944, 1.279, 0.724, 1.079, 0.788], [3.063, 0.247, 0.912, 0.247, 0.578, 0.126], [2.848, -0.809, 0.778, 0.441, 1.15, 0.263], [2.483, -0.756, 0.605, 0.63, 1.407, 0.292], [2.369, -0.586, 0.732, 0.348, 0.461, 0.12], [-3.238, 0.78, 0.778, 0.212, 1.143, -0.102], [-3.116, 0.426, 0.879, 0.248, 1.646, 0.306], [-2.875, -0.393, 1.087, 0.035, 1.245, 0.038], [-3.308, 0.845, 1.118, 0.472, 1.582, 0.109], [-3.33, -0.848, 0.583, 0.088, 1.108, -0.004], [-3.371, -0.081, 0.236, -0.02, 0.647, 0.543], [-3.267, -0.114, -0.13, -0.134, 1.197, -0.109]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the books in the scene. 
The camera pose information includes: the rotation matrix: [[0.892065, -0.360019, 0.273141], [-0.443019, -0.577417, 0.685801], [-0.089185, -0.732786, -0.674589]]; the translation vector: [2.898737, 2.45906, 1.649541], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[2.123, 1.117, 1.321, 0.743, 0.381, 0.635], [2.682, 1.29, 0.828, 0.463, 1.156, 0.705], [2.834, 1.547, 0.399, 0.032, 0.368, 0.16], [3.096, 0.047, 0.87, -0.078, 0.326, -0.125], [2.563, 0.993, 1.356, 0.811, 0.73, 0.229], [2.143, 0.969, 1.037, 0.642, 1.052, 0.055], [2.659, -0.615, 0.836, 0.248, 1.054, 0.022], [3.223, -0.649, 1.168, 0.253, 1.288, 0.671], [2.454, -0.393, 0.682, 0.657, 1.137, 0.691], [-3.644, 1.213, 1.46, 0.605, 1.274, 0.706], [-3.657, -0.185, 0.669, 0.323, 1.16, 0.8], [-3.228, -0.103, 1.329, 0.441, 0.997, 0.754], [-3.305, 0.306, 0.543, 0.056, 1.942, 0.326], [-3.554, -0.503, 0.414, 0.642, 0.665, 0.745], [-2.947, -0.695, 0.368, 0.59, 0.436, 0.372], [-3.593, -0.106, 0.806, 0.216, 0.592, 0.301]]\nB: [[2.989, 1.126, 0.794, -0.179, 0.245, 0.369], [2.962, 1.061, 0.623, -0.057, 0.36, 0.431], [2.845, 1.185, 0.945, 0.308, 0.535, 0.574], [2.424, 0.962, 1.637, -0.272, 0.494, 0.77], [3.085, 0.394, 0.93, 0.245, 0.901, 0.482], [2.87, 0.321, 0.254, 0.308, 0.264, 0.679], [2.834, -0.509, 1.34, 0.641, 0.49, 0.271], [2.993, -0.295, 0.769, -0.075, 1.002, 0.589], [3.132, -0.129, 0.78, 0.069, 1.025, 0.007], [-2.917, 1.638, 1.353, 0.35, 0.736, 0.591], [-2.828, -0.168, 1.186, 0.057, 1.347, 0.51], [-3.297, -0.456, 0.362, 0.307, 0.654, 0.781], 
[-3.301, 0.612, 0.703, 0.328, 1.414, 0.306], [-2.89, -0.213, 0.298, -0.086, 1.058, 0.488], [-2.855, -0.016, -0.219, -0.168, 0.422, -0.035], [-3.555, 0.252, 0.516, -0.109, 1.029, 0.664]]\nC: [[2.568, 1.418, 1.271, 0.257, 0.709, 0.306], [2.646, 1.448, 0.95, 0.305, 0.76, 0.302], [2.592, 1.461, 0.636, 0.212, 0.718, 0.28], [2.65, 0.514, 1.213, 0.222, 0.814, 0.309], [2.738, 0.497, 0.888, 0.381, 0.863, 0.308], [2.639, 0.563, 0.627, 0.305, 0.736, 0.188], [2.693, -0.392, 1.14, 0.281, 0.891, 0.334], [2.727, -0.372, 0.833, 0.29, 0.926, 0.3], [2.691, -0.383, 0.563, 0.264, 0.854, 0.201], [-3.22, 1.231, 1.017, 0.313, 0.915, 0.346], [-3.273, 0.289, 0.923, 0.23, 1.204, 0.355], [-3.222, -0.487, 0.833, 0.334, 0.747, 0.368], [-3.341, 0.627, 0.626, 0.449, 1.466, 0.437], [-3.265, -0.411, 0.526, 0.337, 0.641, 0.343], [-3.203, -0.328, 0.27, 0.175, 0.592, 0.204], [-3.277, 0.365, 0.338, 0.332, 0.934, 0.242]]\nD: [[2.244, 1.249, 1.196, 0.59, 0.671, 0.591], [3.002, 1.584, 0.459, 0.732, 0.625, -0.064], [2.803, 1.399, 0.195, 0.554, 0.24, -0.185], [2.948, 0.428, 1.564, 0.649, 0.642, 0.076], [2.502, 0.944, 1.279, 0.724, 1.079, 0.788], [3.063, 0.247, 0.912, 0.247, 0.578, 0.126], [2.848, -0.809, 0.778, 0.441, 1.15, 0.263], [2.483, -0.756, 0.605, 0.63, 1.407, 0.292], [2.369, -0.586, 0.732, 0.348, 0.461, 0.12], [-3.238, 0.78, 0.778, 0.212, 1.143, -0.102], [-3.116, 0.426, 0.879, 0.248, 1.646, 0.306], [-2.875, -0.393, 1.087, 0.035, 1.245, 0.038], [-3.308, 0.845, 1.118, 0.472, 1.582, 0.109], [-3.33, -0.848, 0.583, 0.088, 1.108, -0.004], [-3.371, -0.081, 0.236, -0.02, 0.647, 0.543], [-3.267, -0.114, -0.13, -0.134, 1.197, -0.109]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_138_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_138_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.568, 1.594, 2.336, 3.002, 
0.215, 1.78], [-1.527, -0.451, 1.258, 0.103, 4.121, 1.756], [-1.386, -2.636, 1.043, -0.207, -0.22, 1.193], [1.834, 0.934, 1.595, 0.321, 1.764, 2.84], [1.018, -0.319, 0.583, 1.01, 0.681, 1.687]]\nB: [[0.072, 1.537, 1.845, 2.689, 0.191, 1.622], [-1.273, -0.316, 0.956, 0.156, 3.767, 1.891], [-1.14, -2.18, 0.679, 0.246, 0.067, 1.34], [1.381, 0.651, 1.354, 0.135, 1.692, 2.602], [0.889, -0.737, 0.87, 1.059, 1.122, 1.773]]\nC: [[0.21, 1.662, 1.825, 2.75, -0.252, 2.024], [-1.56, -0.058, 0.561, 0.054, 3.741, 2.333], [-1.055, -2.665, 0.535, 0.196, 0.05, 1.825], [1.164, 0.58, 1.628, 0.045, 1.482, 2.195], [1.198, -0.291, 1.331, 0.727, 1.34, 1.309]]\nD: [[-0.147, 1.793, 1.85, 3.103, 0.596, 1.69], [-1.538, -0.388, 0.463, 0.445, 3.441, 1.475], [-1.625, -1.946, 0.934, 0.072, -0.182, 1.409], [1.247, 1.123, 0.994, 0.033, 1.379, 2.521], [0.847, -0.38, 0.424, 0.888, 1.469, 2.148]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.721847, -0.019511, -0.691778], [0.690918, -0.036893, 0.721991], [-0.039608, -0.999129, -0.013151]]; the translation vector: [1.871862, 0.815296, 1.594356], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.568, 1.594, 2.336, 3.002, 0.215, 1.78], [-1.527, -0.451, 1.258, 0.103, 4.121, 1.756], [-1.386, -2.636, 1.043, -0.207, -0.22, 1.193], [1.834, 0.934, 1.595, 0.321, 1.764, 2.84], [1.018, -0.319, 0.583, 1.01, 0.681, 1.687]]\nB: [[0.072, 1.537, 1.845, 2.689, 0.191, 1.622], [-1.273, -0.316, 0.956, 0.156, 3.767, 1.891], [-1.14, -2.18, 0.679, 0.246, 0.067, 1.34], [1.381, 0.651, 1.354, 0.135, 1.692, 2.602], [0.889, -0.737, 0.87, 1.059, 1.122, 1.773]]\nC: [[0.21, 1.662, 1.825, 2.75, -0.252, 2.024], [-1.56, -0.058, 0.561, 0.054, 3.741, 2.333], [-1.055, -2.665, 0.535, 0.196, 0.05, 1.825], [1.164, 0.58, 1.628, 0.045, 1.482, 2.195], [1.198, -0.291, 1.331, 0.727, 1.34, 1.309]]\nD: [[-0.147, 1.793, 1.85, 3.103, 0.596, 1.69], [-1.538, -0.388, 0.463, 0.445, 3.441, 1.475], [-1.625, -1.946, 0.934, 0.072, -0.182, 1.409], [1.247, 1.123, 0.994, 0.033, 1.379, 2.521], [0.847, -0.38, 0.424, 0.888, 1.469, 2.148]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_139_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_139_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.013, -1.238, 0.461, 0.493, 0.288, 0.575], [-1.147, 3.348, 0.377, 0.564, 1.212, 0.508], [0.233, -3.522, 0.086, 0.938, 1.233, 0.371], [1.742, -2.531, 0.357, 1.04, 1.008, 0.183], [-0.823, -2.335, 0.1, 1.256, 0.934, 0.241], [-1.592, 0.953, 0.372, 1.131, 0.478, 0.788], [-0.027, 4.368, 1.235, 0.272, -0.107, 0.356], [2.853, 3.344, 0.339, 0.665, 0.157, 0.567], [2.027, 4.048, 1.116, 0.515, 0.345, 0.52], [0.87, 3.839, 0.8, 0.696, -0.287, 0.501], [1.369, 2.326, 0.538, 0.245, 0.786, 0.243], [0.357, 3.043, 0.662, 0.778, 0.111, 0.513], [1.447, 2.656, 0.359, 0.141, 0.33, 0.84]]\nB: [[0.068, -1.042, 0.544, 0.886, 0.779, 0.545], [-1.479, 3.034, 0.552, 0.898, 0.801, 0.497], [-0.06, -3.112, 0.543, 0.84, 0.784, 
0.511], [1.274, -2.138, 0.543, 0.738, 0.855, 0.547], [-0.786, -2.2, 0.536, 0.806, 0.879, 0.474], [-1.39, 1.148, 0.549, 0.822, 0.745, 0.54], [0.444, 4.003, 0.791, 0.485, 0.139, 0.082], [2.511, 3.843, 0.762, 0.448, 0.131, 0.083], [1.884, 3.916, 0.775, 0.46, 0.149, 0.083], [1.153, 3.946, 0.791, 0.453, 0.166, 0.098], [1.053, 2.651, 0.606, 0.523, 0.61, 0.485], [0.449, 2.899, 0.606, 0.476, 0.557, 0.467], [1.688, 2.596, 0.605, 0.503, 0.592, 0.451]]\nC: [[0.102, -0.947, 0.484, 1.245, 0.79, 0.775], [-1.118, 3.375, 0.842, 0.401, 1.069, 0.196], [0.407, -2.782, 0.934, 1.07, 0.467, 0.067], [1.541, -2.237, 0.403, 0.888, 1.246, 0.245], [-0.917, -1.889, 0.628, 0.956, 1.204, 0.523], [-1.021, 1.176, 0.814, 0.368, 0.456, 0.678], [0.573, 4.084, 1.228, 0.815, 0.355, 0.385], [2.848, 3.659, 0.488, 0.047, 0.047, 0.092], [1.907, 4.123, 0.733, 0.026, 0.33, -0.009], [1.212, 4.443, 1.139, 0.078, -0.234, 0.21], [0.892, 2.632, 1.105, 0.392, 1.061, 0.435], [0.166, 3.349, 0.352, 0.282, 0.481, 0.755], [1.529, 2.634, 0.397, 0.324, 0.54, 0.072]]\nD: [[-0.194, -1.122, 0.104, 1.378, 1.12, 0.253], [-1.005, 3.518, 0.745, 0.428, 0.792, 0.08], [0.214, -2.901, 0.412, 0.728, 0.43, 0.91], [1.573, -2.219, 0.557, 0.934, 1.13, 0.876], [-0.782, -2.154, 0.858, 0.543, 1.135, 0.108], [-1.448, 1.097, 0.92, 1.197, 0.497, 0.181], [0.045, 3.571, 0.423, 0.736, -0.143, -0.417], [2.244, 4.297, 0.746, 0.101, 0.473, -0.26], [1.879, 3.692, 0.375, 0.596, -0.051, -0.206], [1.372, 4.096, 0.929, 0.827, -0.125, 0.334], [1.326, 2.984, 0.19, 0.493, 0.248, 0.576], [0.74, 2.996, 0.477, 0.655, 0.254, 0.849], [2.003, 3.037, 0.818, 0.844, 0.675, 0.272]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. 
The camera pose information includes: the rotation matrix: [[0.853196, -0.330732, 0.403328], [-0.517406, -0.438892, 0.734619], [-0.065945, -0.835458, -0.545584]]; the translation vector: [2.734716, 6.775187, 1.412962], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.013, -1.238, 0.461, 0.493, 0.288, 0.575], [-1.147, 3.348, 0.377, 0.564, 1.212, 0.508], [0.233, -3.522, 0.086, 0.938, 1.233, 0.371], [1.742, -2.531, 0.357, 1.04, 1.008, 0.183], [-0.823, -2.335, 0.1, 1.256, 0.934, 0.241], [-1.592, 0.953, 0.372, 1.131, 0.478, 0.788], [-0.027, 4.368, 1.235, 0.272, -0.107, 0.356], [2.853, 3.344, 0.339, 0.665, 0.157, 0.567], [2.027, 4.048, 1.116, 0.515, 0.345, 0.52], [0.87, 3.839, 0.8, 0.696, -0.287, 0.501], [1.369, 2.326, 0.538, 0.245, 0.786, 0.243], [0.357, 3.043, 0.662, 0.778, 0.111, 0.513], [1.447, 2.656, 0.359, 0.141, 0.33, 0.84]]\nB: [[0.068, -1.042, 0.544, 0.886, 0.779, 0.545], [-1.479, 3.034, 0.552, 0.898, 0.801, 0.497], [-0.06, -3.112, 0.543, 0.84, 0.784, 0.511], [1.274, -2.138, 0.543, 0.738, 0.855, 0.547], [-0.786, -2.2, 0.536, 0.806, 0.879, 0.474], [-1.39, 1.148, 0.549, 0.822, 0.745, 0.54], [0.444, 4.003, 0.791, 0.485, 0.139, 0.082], [2.511, 3.843, 0.762, 0.448, 0.131, 0.083], [1.884, 3.916, 0.775, 0.46, 0.149, 0.083], [1.153, 3.946, 0.791, 0.453, 0.166, 0.098], [1.053, 2.651, 0.606, 0.523, 0.61, 0.485], [0.449, 2.899, 0.606, 0.476, 0.557, 0.467], [1.688, 2.596, 0.605, 0.503, 0.592, 0.451]]\nC: [[0.102, -0.947, 0.484, 1.245, 0.79, 0.775], [-1.118, 3.375, 0.842, 0.401, 1.069, 0.196], 
[0.407, -2.782, 0.934, 1.07, 0.467, 0.067], [1.541, -2.237, 0.403, 0.888, 1.246, 0.245], [-0.917, -1.889, 0.628, 0.956, 1.204, 0.523], [-1.021, 1.176, 0.814, 0.368, 0.456, 0.678], [0.573, 4.084, 1.228, 0.815, 0.355, 0.385], [2.848, 3.659, 0.488, 0.047, 0.047, 0.092], [1.907, 4.123, 0.733, 0.026, 0.33, -0.009], [1.212, 4.443, 1.139, 0.078, -0.234, 0.21], [0.892, 2.632, 1.105, 0.392, 1.061, 0.435], [0.166, 3.349, 0.352, 0.282, 0.481, 0.755], [1.529, 2.634, 0.397, 0.324, 0.54, 0.072]]\nD: [[-0.194, -1.122, 0.104, 1.378, 1.12, 0.253], [-1.005, 3.518, 0.745, 0.428, 0.792, 0.08], [0.214, -2.901, 0.412, 0.728, 0.43, 0.91], [1.573, -2.219, 0.557, 0.934, 1.13, 0.876], [-0.782, -2.154, 0.858, 0.543, 1.135, 0.108], [-1.448, 1.097, 0.92, 1.197, 0.497, 0.181], [0.045, 3.571, 0.423, 0.736, -0.143, -0.417], [2.244, 4.297, 0.746, 0.101, 0.473, -0.26], [1.879, 3.692, 0.375, 0.596, -0.051, -0.206], [1.372, 4.096, 0.929, 0.827, -0.125, 0.334], [1.326, 2.984, 0.19, 0.493, 0.248, 0.576], [0.74, 2.996, 0.477, 0.655, 0.254, 0.849], [2.003, 3.037, 0.818, 0.844, 0.675, 0.272]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_140_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_140_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.478, -1.621, 1.471, 0.04, 0.607, 0.77], [-0.622, -0.28, 0.778, 0.596, 0.791, 0.172], [0.68, -2.333, 1.441, 0.527, 0.774, 0.878], [0.47, -3.455, 1.236, 0.145, 0.605, 0.702], [-1.07, -2.607, 1.289, 0.421, 0.201, 0.117], [-0.819, -3.811, 1.501, 0.378, 0.459, 0.305], [-0.289, -1.598, 0.306, 0.193, 0.374, 1.351], [-0.144, -0.28, 1.064, 0.516, 0.462, 1.314], [-0.472, 1.134, 0.95, 0.612, 0.425, 0.242], [0.292, 1.559, -0.0, 1.024, 0.739, 0.637], [1.632, 1.532, 0.377, 0.961, 0.147, 0.54], [1.253, 1.503, 0.223, 0.356, 0.173, 0.917], [2.079, 0.639, 0.524, 0.497, 0.63, 1.101], [1.452, 
-0.032, 0.35, 1.029, 0.429, 0.469], [1.964, -1.067, 0.351, 1.202, 1.067, 0.649], [1.915, -1.339, 0.962, 0.392, 0.481, -0.02]]\nB: [[-1.05, -1.003, 1.329, 0.32, 0.076, 0.616], [-0.594, -0.426, 0.767, 0.622, 0.307, 0.007], [0.602, -2.305, 1.043, 0.218, 0.243, 0.681], [0.924, -3.409, 0.98, 0.773, 0.471, 1.089], [-0.329, -2.354, 0.789, 0.408, 0.875, 0.623], [-0.349, -3.787, 1.449, 0.31, 0.976, 0.266], [0.652, -1.018, 1.006, 0.796, 0.883, 0.697], [0.628, -0.604, 0.772, 0.114, 0.996, 0.953], [-1.118, 1.128, 0.061, 0.216, 0.338, 0.764], [0.65, 1.585, 0.323, 0.699, 0.859, 0.499], [1.631, 1.493, 0.088, 1.244, 0.636, 1.121], [1.187, 0.927, 0.824, 0.22, 0.275, 0.894], [1.693, 0.178, 0.2, 0.357, 0.96, 0.555], [1.798, -0.426, 0.556, 0.111, 1.016, 0.592], [1.891, -0.692, 0.467, 0.91, 1.42, 0.916], [1.538, -2.029, 0.941, 0.82, 1.037, 0.527]]\nC: [[-0.797, -1.314, 1.076, 0.174, 0.538, 0.297], [-0.804, -0.643, 0.993, 0.18, 0.483, 0.317], [0.265, -2.771, 0.976, 0.564, 0.483, 0.724], [0.443, -3.263, 1.105, 0.404, 0.755, 0.654], [-0.786, -2.701, 1.224, 0.235, 0.623, 0.423], [-0.579, -3.467, 1.386, 0.149, 0.491, 0.284], [0.195, -1.173, 0.617, 0.439, 0.594, 0.981], [0.152, -0.693, 0.576, 0.363, 0.648, 0.901], [-0.836, 1.438, 0.551, 0.438, 0.598, 0.552], [0.258, 1.345, 0.466, 0.561, 0.507, 0.736], [1.246, 1.609, 0.396, 0.752, 0.566, 0.883], [1.646, 1.19, 0.575, 0.611, 0.592, 0.619], [1.73, 0.493, 0.521, 0.445, 0.583, 0.771], [1.766, -0.179, 0.551, 0.536, 0.58, 0.774], [1.864, -0.697, 0.533, 0.816, 1.199, 0.994], [1.74, -1.667, 0.652, 0.516, 0.607, 0.36]]\nD: [[-0.49, -1.289, 0.9, 0.491, 0.951, 0.59], [-1.107, -1.021, 1.479, 0.523, 0.505, 0.09], [-0.233, -2.971, 1.208, 0.309, 0.946, 0.617], [0.587, -2.842, 0.811, 0.828, 0.821, 0.621], [-0.674, -2.976, 1.257, -0.139, 0.206, 0.639], [-0.269, -3.606, 1.299, -0.169, 0.133, 0.486], [0.033, -0.697, 1.063, 0.567, 1.022, 1.265], [0.252, -0.714, 0.426, 0.514, 0.322, 1.359], [-1.161, 1.486, 0.647, 0.683, 0.314, 0.187], [-0.11, 1.173, 0.725, 0.462, 
0.264, 1.138], [1.341, 1.682, 0.277, 0.312, 0.356, 0.94], [1.815, 1.188, 0.624, 1.015, 0.174, 0.508], [1.714, 0.423, 0.79, 0.889, 0.659, 0.533], [1.648, -0.367, 0.718, 0.468, 1.049, 0.941], [2.335, -0.44, 0.71, 1.148, 1.407, 0.783], [1.632, -1.945, 0.223, 0.453, 0.239, 0.703]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[-0.476704, 0.41796, -0.773345], [0.878176, 0.186897, -0.440314], [-0.039498, -0.889033, -0.456137]]; the translation vector: [2.405627, 4.675593, 1.276166], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.478, -1.621, 1.471, 0.04, 0.607, 0.77], [-0.622, -0.28, 0.778, 0.596, 0.791, 0.172], [0.68, -2.333, 1.441, 0.527, 0.774, 0.878], [0.47, -3.455, 1.236, 0.145, 0.605, 0.702], [-1.07, -2.607, 1.289, 0.421, 0.201, 0.117], [-0.819, -3.811, 1.501, 0.378, 0.459, 0.305], [-0.289, -1.598, 0.306, 0.193, 0.374, 1.351], [-0.144, -0.28, 1.064, 0.516, 0.462, 1.314], [-0.472, 1.134, 0.95, 0.612, 0.425, 0.242], [0.292, 1.559, -0.0, 1.024, 0.739, 0.637], [1.632, 1.532, 0.377, 0.961, 0.147, 0.54], [1.253, 1.503, 0.223, 0.356, 0.173, 0.917], [2.079, 0.639, 0.524, 0.497, 0.63, 1.101], [1.452, -0.032, 0.35, 1.029, 0.429, 0.469], [1.964, -1.067, 0.351, 1.202, 1.067, 0.649], [1.915, -1.339, 0.962, 0.392, 0.481, -0.02]]\nB: [[-1.05, -1.003, 1.329, 0.32, 0.076, 0.616], [-0.594, -0.426, 0.767, 0.622, 0.307, 0.007], [0.602, -2.305, 1.043, 0.218, 0.243, 0.681], [0.924, -3.409, 0.98, 0.773, 0.471, 1.089], [-0.329, -2.354, 0.789, 0.408, 0.875, 0.623], [-0.349, -3.787, 1.449, 0.31, 0.976, 0.266], [0.652, -1.018, 1.006, 0.796, 0.883, 0.697], [0.628, -0.604, 0.772, 0.114, 0.996, 0.953], [-1.118, 1.128, 0.061, 0.216, 0.338, 0.764], [0.65, 1.585, 0.323, 0.699, 0.859, 0.499], [1.631, 1.493, 0.088, 1.244, 0.636, 1.121], [1.187, 0.927, 0.824, 0.22, 0.275, 0.894], [1.693, 0.178, 0.2, 0.357, 0.96, 0.555], [1.798, -0.426, 0.556, 0.111, 1.016, 0.592], [1.891, -0.692, 0.467, 0.91, 1.42, 0.916], [1.538, -2.029, 0.941, 0.82, 1.037, 0.527]]\nC: [[-0.797, -1.314, 1.076, 0.174, 0.538, 0.297], [-0.804, -0.643, 0.993, 0.18, 0.483, 0.317], [0.265, -2.771, 0.976, 0.564, 0.483, 0.724], [0.443, -3.263, 1.105, 0.404, 0.755, 0.654], [-0.786, -2.701, 1.224, 0.235, 0.623, 0.423], [-0.579, -3.467, 1.386, 0.149, 0.491, 0.284], [0.195, -1.173, 0.617, 0.439, 0.594, 0.981], [0.152, -0.693, 0.576, 0.363, 0.648, 0.901], [-0.836, 1.438, 0.551, 0.438, 0.598, 0.552], [0.258, 1.345, 0.466, 0.561, 0.507, 0.736], [1.246, 1.609, 0.396, 0.752, 0.566, 0.883], [1.646, 1.19, 0.575, 0.611, 
0.592, 0.619], [1.73, 0.493, 0.521, 0.445, 0.583, 0.771], [1.766, -0.179, 0.551, 0.536, 0.58, 0.774], [1.864, -0.697, 0.533, 0.816, 1.199, 0.994], [1.74, -1.667, 0.652, 0.516, 0.607, 0.36]]\nD: [[-0.49, -1.289, 0.9, 0.491, 0.951, 0.59], [-1.107, -1.021, 1.479, 0.523, 0.505, 0.09], [-0.233, -2.971, 1.208, 0.309, 0.946, 0.617], [0.587, -2.842, 0.811, 0.828, 0.821, 0.621], [-0.674, -2.976, 1.257, -0.139, 0.206, 0.639], [-0.269, -3.606, 1.299, -0.169, 0.133, 0.486], [0.033, -0.697, 1.063, 0.567, 1.022, 1.265], [0.252, -0.714, 0.426, 0.514, 0.322, 1.359], [-1.161, 1.486, 0.647, 0.683, 0.314, 0.187], [-0.11, 1.173, 0.725, 0.462, 0.264, 1.138], [1.341, 1.682, 0.277, 0.312, 0.356, 0.94], [1.815, 1.188, 0.624, 1.015, 0.174, 0.508], [1.714, 0.423, 0.79, 0.889, 0.659, 0.533], [1.648, -0.367, 0.718, 0.468, 1.049, 0.941], [2.335, -0.44, 0.71, 1.148, 1.407, 0.783], [1.632, -1.945, 0.223, 0.453, 0.239, 0.703]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_141_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_141_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.214, 2.51, 1.365, 3.041, 0.194, 2.765], [-0.05, -2.478, 0.301, 2.986, 0.3, 0.641], [-1.526, -0.161, 1.336, 0.225, 4.64, 2.734]]\nB: [[0.048, 2.151, 1.156, 2.76, 0.655, 2.821], [0.225, -2.827, 0.677, 3.049, -0.043, 0.254], [-1.759, -0.568, 1.729, -0.249, 5.02, 2.969]]\nC: [[0.558, 2.736, 1.619, 3.45, -0.161, 2.854], [-0.367, -2.102, 0.777, 2.594, 0.161, 0.236], [-1.277, -0.254, 1.397, -0.136, 4.615, 2.411]]\nD: [[0.699, 2.21, 1.721, 3.445, -0.097, 2.767], [0.274, -2.743, -0.017, 2.983, 0.564, 0.816], [-1.467, -0.193, 1.628, 0.718, 4.962, 2.711]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. 
The camera pose information includes: the rotation matrix: [[-0.207785, -0.462455, 0.861952], [-0.977184, 0.13779, -0.161637], [-0.044019, -0.875871, -0.480534]]; the translation vector: [2.720584, 1.654419, 1.522448], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.214, 2.51, 1.365, 3.041, 0.194, 2.765], [-0.05, -2.478, 0.301, 2.986, 0.3, 0.641], [-1.526, -0.161, 1.336, 0.225, 4.64, 2.734]]\nB: [[0.048, 2.151, 1.156, 2.76, 0.655, 2.821], [0.225, -2.827, 0.677, 3.049, -0.043, 0.254], [-1.759, -0.568, 1.729, -0.249, 5.02, 2.969]]\nC: [[0.558, 2.736, 1.619, 3.45, -0.161, 2.854], [-0.367, -2.102, 0.777, 2.594, 0.161, 0.236], [-1.277, -0.254, 1.397, -0.136, 4.615, 2.411]]\nD: [[0.699, 2.21, 1.721, 3.445, -0.097, 2.767], [0.274, -2.743, -0.017, 2.983, 0.564, 0.816], [-1.467, -0.193, 1.628, 0.718, 4.962, 2.711]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_142_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_142_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.781, -0.092, 1.437, 0.297, 5.42, 2.838], [-0.225, -2.203, -0.028, 3.04, -0.15, 0.159], [0.563, 2.457, 1.86, 3.628, 0.174, 3.621], [1.294, -0.516, 1.217, 0.088, 5.292, 2.913], [-1.437, -2.905, 0.867, 0.166, -0.17, 0.831]]\nB: [[-1.712, -0.169, 1.937, -0.166, 5.172, 3.249], [0.409, -2.556, 0.634, 3.096, 0.671, 0.11], [-0.287, 2.175, 1.704, 
3.703, 0.15, 2.806], [2.158, 0.248, 1.0, 0.669, 5.195, 2.453], [-1.117, -2.267, 1.561, 0.31, -0.422, 1.139]]\nC: [[-1.796, -0.26, 1.071, 0.363, 4.986, 2.747], [-0.333, -2.47, 0.362, 3.532, -0.124, 0.597], [0.208, 2.122, 1.319, 3.656, -0.186, 2.723], [1.521, -0.537, 0.986, 0.704, 5.101, 2.943], [-1.457, -2.856, 0.86, 0.281, 0.313, 0.878]]\nD: [[-1.474, 0.024, 1.526, 0.216, 4.974, 3.09], [0.118, -2.408, 0.332, 3.201, 0.275, 0.54], [0.144, 2.522, 1.535, 3.347, 0.23, 3.137], [1.788, -0.144, 1.382, 0.213, 5.326, 2.779], [-1.437, -2.464, 1.35, 0.243, 0.036, 0.743]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.45377, -0.425062, 0.783208], [-0.891046, 0.227634, -0.392708], [-0.01136, -0.876074, -0.482043]]; the translation vector: [2.25004, 3.862298, 1.519108], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.781, -0.092, 1.437, 0.297, 5.42, 2.838], [-0.225, -2.203, -0.028, 3.04, -0.15, 0.159], [0.563, 2.457, 1.86, 3.628, 0.174, 3.621], [1.294, -0.516, 1.217, 0.088, 5.292, 2.913], [-1.437, -2.905, 0.867, 0.166, -0.17, 0.831]]\nB: [[-1.712, -0.169, 1.937, -0.166, 5.172, 3.249], [0.409, -2.556, 0.634, 3.096, 0.671, 0.11], [-0.287, 2.175, 1.704, 3.703, 0.15, 2.806], [2.158, 0.248, 1.0, 0.669, 5.195, 2.453], [-1.117, -2.267, 1.561, 0.31, -0.422, 1.139]]\nC: [[-1.796, -0.26, 1.071, 0.363, 4.986, 2.747], [-0.333, -2.47, 0.362, 3.532, -0.124, 0.597], [0.208, 2.122, 1.319, 3.656, -0.186, 2.723], [1.521, -0.537, 0.986, 0.704, 5.101, 2.943], [-1.457, -2.856, 0.86, 0.281, 0.313, 0.878]]\nD: [[-1.474, 0.024, 1.526, 0.216, 4.974, 3.09], [0.118, -2.408, 0.332, 3.201, 0.275, 0.54], [0.144, 2.522, 1.535, 3.347, 0.23, 3.137], [1.788, -0.144, 1.382, 0.213, 5.326, 2.779], [-1.437, -2.464, 1.35, 0.243, 0.036, 0.743]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_143_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_143_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[2.29, -0.316, 1.432, 0.063, 4.92, 2.21], [-1.382, -1.33, 2.175, 0.695, 0.538, 0.2], [-1.806, -2.363, 0.83, 0.352, 1.114, 1.836], [-1.306, -1.788, 0.815, 0.84, 0.3, 1.933], [-2.082, -0.191, 1.078, 0.269, 3.774, 1.876], [-1.331, 2.183, 1.747, 0.179, -0.216, 1.553], [-1.305, 2.378, 1.485, 0.225, 1.117, 1.988], [0.552, 2.732, 1.097, 2.615, 0.213, 2.449]]\nB: [[2.019, 0.108, 1.002, 0.215, 5.208, 2.059], [-1.022, -1.729, 2.165, 0.645, 0.146, 0.247], [-1.383, -1.998, 1.17, 0.215, 0.925, 2.24], [-1.526, -1.582, 1.263, 0.348, 0.152, 2.156], [-1.694, 0.207, 1.164, 0.178, 3.686, 2.357], [-1.644, 1.973, 1.343, 0.176, 0.151, 1.146], [-1.605, 2.692, 1.055, 0.124, 1.358, 1.982], [0.625, 2.804, 0.995, 2.8, 
0.361, 2.1]]\nC: [[1.702, 0.103, 1.118, 0.347, 5.169, 2.205], [-0.755, -1.506, 2.319, 1.022, 0.542, -0.064], [-1.248, -1.952, 1.27, 0.08, 1.199, 2.239], [-1.042, -1.657, 1.027, 0.155, -0.197, 2.421], [-1.513, 0.045, 1.167, -0.103, 3.723, 2.465], [-1.23, 1.582, 1.115, -0.014, -0.31, 1.511], [-1.196, 2.213, 1.364, -0.205, 1.046, 1.714], [0.962, 2.867, 0.955, 2.429, 0.313, 2.593]]\nD: [[2.083, 0.385, 1.347, 0.273, 5.186, 1.86], [-0.546, -1.555, 1.851, 0.975, 0.412, 0.638], [-1.077, -1.883, 1.417, -0.014, 0.602, 2.249], [-1.395, -1.99, 1.177, -0.094, -0.079, 2.003], [-1.5, 0.548, 1.221, 0.453, 3.489, 2.126], [-2.081, 1.694, 1.43, -0.163, 0.443, 1.038], [-1.256, 2.343, 0.839, 0.584, 1.506, 1.621], [0.155, 3.04, 0.757, 2.991, 0.014, 2.136]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.804414, -0.195207, 0.561082], [-0.593456, -0.306943, 0.74404], [0.026978, -0.931494, -0.362756]]; the translation vector: [4.397897, 1.805397, 1.263968], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[2.29, -0.316, 1.432, 0.063, 4.92, 2.21], [-1.382, -1.33, 2.175, 0.695, 0.538, 0.2], [-1.806, -2.363, 0.83, 0.352, 1.114, 1.836], [-1.306, -1.788, 0.815, 0.84, 0.3, 1.933], [-2.082, -0.191, 1.078, 0.269, 3.774, 1.876], [-1.331, 2.183, 1.747, 0.179, -0.216, 1.553], [-1.305, 2.378, 1.485, 0.225, 1.117, 1.988], [0.552, 2.732, 1.097, 2.615, 0.213, 2.449]]\nB: [[2.019, 0.108, 1.002, 0.215, 5.208, 2.059], [-1.022, -1.729, 2.165, 0.645, 0.146, 0.247], [-1.383, -1.998, 1.17, 0.215, 0.925, 2.24], [-1.526, -1.582, 1.263, 0.348, 0.152, 2.156], [-1.694, 0.207, 1.164, 0.178, 3.686, 2.357], [-1.644, 1.973, 1.343, 0.176, 0.151, 1.146], [-1.605, 2.692, 1.055, 0.124, 1.358, 1.982], [0.625, 2.804, 0.995, 2.8, 0.361, 2.1]]\nC: [[1.702, 0.103, 1.118, 0.347, 5.169, 2.205], [-0.755, -1.506, 2.319, 1.022, 0.542, -0.064], [-1.248, -1.952, 1.27, 0.08, 1.199, 2.239], [-1.042, -1.657, 1.027, 0.155, -0.197, 2.421], [-1.513, 0.045, 1.167, -0.103, 3.723, 2.465], [-1.23, 1.582, 1.115, -0.014, -0.31, 1.511], [-1.196, 2.213, 1.364, -0.205, 1.046, 1.714], [0.962, 2.867, 0.955, 2.429, 0.313, 2.593]]\nD: [[2.083, 0.385, 1.347, 0.273, 5.186, 1.86], [-0.546, -1.555, 1.851, 0.975, 0.412, 0.638], [-1.077, -1.883, 1.417, -0.014, 0.602, 2.249], [-1.395, -1.99, 1.177, -0.094, -0.079, 2.003], [-1.5, 0.548, 1.221, 0.453, 3.489, 2.126], [-2.081, 1.694, 1.43, -0.163, 0.443, 1.038], [-1.256, 2.343, 0.839, 0.584, 1.506, 1.621], [0.155, 3.04, 0.757, 2.991, 0.014, 2.136]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_144_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_144_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.219, -0.964, 1.58, 0.354, 3.912, 2.347], [0.255, 0.996, 1.173, 3.495, 1.006, 2.248], [1.513, 0.137, 0.79, 0.63, 3.182, 2.432]]\nB: [[-1.818, -0.647, 1.066, 
0.433, 4.095, 1.902], [0.12, 0.784, 1.447, 3.712, 0.386, 2.623], [1.292, -0.011, 0.89, 0.451, 3.017, 2.105]]\nC: [[-1.598, -0.539, 1.125, 0.503, 3.791, 2.392], [-0.019, 1.26, 1.209, 3.332, 0.548, 2.478], [1.708, -0.009, 1.196, 0.447, 2.783, 2.468]]\nD: [[-1.147, -0.143, 1.224, 0.476, 4.202, 2.039], [-0.033, 1.197, 1.039, 3.572, 0.489, 2.65], [1.648, -0.334, 1.403, 0.735, 3.031, 2.541]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.218501, -0.721835, 0.656667], [-0.97193, -0.10083, 0.212566], [-0.087226, -0.684681, -0.723605]]; the translation vector: [2.10902, 2.428258, 1.386435], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.219, -0.964, 1.58, 0.354, 3.912, 2.347], [0.255, 0.996, 1.173, 3.495, 1.006, 2.248], [1.513, 0.137, 0.79, 0.63, 3.182, 2.432]]\nB: [[-1.818, -0.647, 1.066, 0.433, 4.095, 1.902], [0.12, 0.784, 1.447, 3.712, 0.386, 2.623], [1.292, -0.011, 0.89, 0.451, 3.017, 2.105]]\nC: [[-1.598, -0.539, 1.125, 0.503, 3.791, 2.392], [-0.019, 1.26, 1.209, 3.332, 0.548, 2.478], [1.708, -0.009, 1.196, 0.447, 2.783, 2.468]]\nD: [[-1.147, -0.143, 1.224, 0.476, 4.202, 2.039], [-0.033, 1.197, 1.039, 3.572, 0.489, 2.65], [1.648, -0.334, 1.403, 0.735, 3.031, 2.541]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_145_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_145_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.089, 0.93, 0.906, 1.952, 6.847, 0.759], [2.364, 2.183, -0.461, 0.089, -0.064, -0.412], [0.126, -4.445, 0.264, 1.544, 0.593, 0.846], [1.593, -4.523, 0.942, 1.504, 1.198, 0.582]]\nB: [[0.288, 1.03, -0.052, 1.738, 7.022, 0.872], [3.315, 2.231, -0.328, 0.592, 0.336, -0.023], [0.311, -4.176, 1.057, 1.806, 0.812, 1.384], [1.759, -3.771, 0.974, 2.086, 0.713, 1.164]]\nC: [[0.167, 0.689, 0.442, 1.571, 6.663, 0.887], [2.849, 2.011, -0.011, 0.132, 0.183, 0.035], [-0.085, -4.074, 0.615, 1.543, 0.713, 0.958], [1.39, -4.168, 0.506, 1.716, 0.715, 0.966]]\nD: [[0.313, 0.252, 0.284, 1.649, 6.826, 1.244], [2.392, 1.917, -0.34, 0.488, -0.05, 0.218], [0.064, -3.679, 0.658, 2.001, 0.36, 1.007], [1.092, -4.59, 0.839, 1.267, 0.336, 1.034]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the table in the scene. 
The camera pose information includes: the rotation matrix: [[-0.241978, -0.427128, 0.871211], [-0.963615, 0.210861, -0.164264], [-0.113543, -0.879261, -0.462611]]; the translation vector: [2.164319, 10.11033, 1.716674], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.089, 0.93, 0.906, 1.952, 6.847, 0.759], [2.364, 2.183, -0.461, 0.089, -0.064, -0.412], [0.126, -4.445, 0.264, 1.544, 0.593, 0.846], [1.593, -4.523, 0.942, 1.504, 1.198, 0.582]]\nB: [[0.288, 1.03, -0.052, 1.738, 7.022, 0.872], [3.315, 2.231, -0.328, 0.592, 0.336, -0.023], [0.311, -4.176, 1.057, 1.806, 0.812, 1.384], [1.759, -3.771, 0.974, 2.086, 0.713, 1.164]]\nC: [[0.167, 0.689, 0.442, 1.571, 6.663, 0.887], [2.849, 2.011, -0.011, 0.132, 0.183, 0.035], [-0.085, -4.074, 0.615, 1.543, 0.713, 0.958], [1.39, -4.168, 0.506, 1.716, 0.715, 0.966]]\nD: [[0.313, 0.252, 0.284, 1.649, 6.826, 1.244], [2.392, 1.917, -0.34, 0.488, -0.05, 0.218], [0.064, -3.679, 0.658, 2.001, 0.36, 1.007], [1.092, -4.59, 0.839, 1.267, 0.336, 1.034]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_146_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_146_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.766, -1.392, 1.241, 0.359, 0.114, 0.509]]\nB: [[0.818, -0.933, 0.887, 0.454, 0.574, 0.13]]\nC: [[0.354, -0.503, 0.874, 0.511, 0.736, 0.517]]\nD: [[1.154, -1.214, 1.131, 
0.407, 0.409, -0.18]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the paper cutter in the scene. The camera pose information includes: the rotation matrix: [[0.624751, -0.31057, 0.716403], [-0.780527, -0.273701, 0.562018], [0.021534, -0.910293, -0.413403]]; the translation vector: [-0.212106, 0.775797, 1.619325], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.766, -1.392, 1.241, 0.359, 0.114, 0.509]]\nB: [[0.818, -0.933, 0.887, 0.454, 0.574, 0.13]]\nC: [[0.354, -0.503, 0.874, 0.511, 0.736, 0.517]]\nD: [[1.154, -1.214, 1.131, 0.407, 0.409, -0.18]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_147_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_147_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.354, -1.662, 1.012, 0.018, 0.115, 0.103]]\nB: [[-0.792, -1.485, 1.441, 0.393, -0.105, 0.505]]\nC: [[-0.528, -1.745, 1.201, 0.1, 0.492, -0.087]]\nD: [[-0.815, -1.664, 1.36, -0.019, -0.178, -0.353]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the light switch in the scene. 
The camera pose information includes: the rotation matrix: [[-0.677945, 0.409221, -0.610679], [0.735109, 0.38004, -0.561413], [0.00234, -0.829523, -0.558468]]; the translation vector: [3.092599, 2.044437, 1.437429], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.354, -1.662, 1.012, 0.018, 0.115, 0.103]]\nB: [[-0.792, -1.485, 1.441, 0.393, -0.105, 0.505]]\nC: [[-0.528, -1.745, 1.201, 0.1, 0.492, -0.087]]\nD: [[-0.815, -1.664, 1.36, -0.019, -0.178, -0.353]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_148_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_148_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.747, -2.431, 0.458, 6.984, 5.717, 0.813], [-0.691, 2.68, 0.322, 9.392, 3.093, 0.827]]\nB: [[0.057, -2.429, 0.573, 7.86, 5.95, 0.072], [-0.235, 2.427, 0.641, 8.832, 2.983, 0.523]]\nC: [[0.397, -2.595, 0.294, 7.23, 6.005, 0.625], [-0.735, 2.197, 0.57, 9.4, 2.894, 0.932]]\nD: [[0.26, -2.542, 0.108, 7.4, 6.111, 0.419], [-0.69, 2.286, 0.483, 9.253, 2.675, 0.512]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. 
The camera pose information includes: the rotation matrix: [[-0.928108, -0.125197, 0.35063], [-0.371823, 0.3599, -0.855699], [-0.019061, -0.924553, -0.380577]]; the translation vector: [5.296664, 4.137775, 1.856988], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.747, -2.431, 0.458, 6.984, 5.717, 0.813], [-0.691, 2.68, 0.322, 9.392, 3.093, 0.827]]\nB: [[0.057, -2.429, 0.573, 7.86, 5.95, 0.072], [-0.235, 2.427, 0.641, 8.832, 2.983, 0.523]]\nC: [[0.397, -2.595, 0.294, 7.23, 6.005, 0.625], [-0.735, 2.197, 0.57, 9.4, 2.894, 0.932]]\nD: [[0.26, -2.542, 0.108, 7.4, 6.111, 0.419], [-0.69, 2.286, 0.483, 9.253, 2.675, 0.512]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_149_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_149_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.374, -0.417, 0.072, 5.927, 4.746, 0.404], [1.639, 0.598, 0.366, 1.118, 4.293, 0.368], [2.9, -1.425, 0.632, 1.428, 0.28, 0.254]]\nB: [[0.799, -0.748, 0.531, 6.317, 4.867, 0.446], [1.703, 0.912, 0.069, 1.474, 3.826, 0.769], [3.325, -1.637, 0.524, 1.447, 0.08, -0.103]]\nC: [[0.109, -0.618, 0.303, 6.211, 4.572, 0.089], [2.121, 0.402, 0.274, 1.014, 4.79, 0.298], [2.929, -1.018, 0.279, 1.903, -0.107, 0.311]]\nD: [[-0.053, -0.73, 0.366, 5.716, 4.946, 0.696], [1.94, 0.414, 0.125, 0.997, 4.139, 0.213], [2.533, -1.383, 0.826, 1.711, 0.446, 
0.207]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. The camera pose information includes: the rotation matrix: [[-0.052123, 0.492225, -0.868906], [0.996177, 0.08671, -0.010637], [0.070107, -0.866138, -0.494863]]; the translation vector: [3.27549, 2.071379, 1.287401], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.374, -0.417, 0.072, 5.927, 4.746, 0.404], [1.639, 0.598, 0.366, 1.118, 4.293, 0.368], [2.9, -1.425, 0.632, 1.428, 0.28, 0.254]]\nB: [[0.799, -0.748, 0.531, 6.317, 4.867, 0.446], [1.703, 0.912, 0.069, 1.474, 3.826, 0.769], [3.325, -1.637, 0.524, 1.447, 0.08, -0.103]]\nC: [[0.109, -0.618, 0.303, 6.211, 4.572, 0.089], [2.121, 0.402, 0.274, 1.014, 4.79, 0.298], [2.929, -1.018, 0.279, 1.903, -0.107, 0.311]]\nD: [[-0.053, -0.73, 0.366, 5.716, 4.946, 0.696], [1.94, 0.414, 0.125, 0.997, 4.139, 0.213], [2.533, -1.383, 0.826, 1.711, 0.446, 0.207]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_150_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_150_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.627, -3.376, 0.855, 6.372, 0.045, 1.93], [-4.559, -0.814, 0.815, 0.23, 4.236, 2.315], [2.457, -1.253, 1.135, 0.502, 4.3, 1.849], [2.763, 1.048, 0.711, 0.538, 0.174, 2.307], [2.637, 1.438, 0.737, -0.26, 0.359, 1.655], [2.838, 1.209, 
1.15, -0.136, 0.413, 2.018], [2.547, 3.388, 0.7, 0.133, 3.272, 2.033], [1.842, 5.509, 1.228, 1.639, 0.473, 2.533], [1.462, 4.688, 1.401, -0.073, 1.6, 2.078], [3.735, 2.121, 1.265, 0.57, 0.876, 2.223], [3.212, 2.741, 1.782, 1.502, 0.578, -0.004]]\nB: [[-0.903, -3.755, 0.874, 6.176, 0.003, 1.35], [-4.554, -0.741, 1.026, -0.065, 4.304, 1.597], [2.299, -1.131, 0.865, 0.391, 4.838, 2.495], [2.651, 0.847, 1.18, 0.317, 0.031, 1.522], [1.968, 0.985, 1.174, 0.018, 0.505, 1.538], [2.371, 1.973, 0.767, 0.103, 0.399, 1.514], [2.144, 3.832, 1.306, -0.158, 3.954, 2.582], [1.91, 5.239, 0.926, 1.148, 0.013, 2.526], [1.285, 4.859, 0.706, 0.551, 0.734, 2.356], [3.523, 2.358, 1.085, 0.08, 1.478, 2.236], [3.224, 3.037, 2.38, 1.03, -0.257, 0.188]]\nC: [[-1.266, -3.485, 0.564, 6.835, 0.324, 1.764], [-3.629, -0.962, 1.016, -0.055, 4.598, 2.313], [2.437, -1.124, 1.463, -0.109, 4.49, 2.527], [2.113, 1.183, 0.667, 0.701, -0.094, 1.699], [2.045, 1.707, 0.812, -0.196, 0.311, 1.762], [2.268, 1.659, 0.591, -0.094, 0.192, 1.734], [2.994, 3.302, 1.563, 0.269, 3.696, 2.084], [1.688, 4.818, 0.728, 1.225, 0.665, 2.09], [0.999, 4.388, 1.202, -0.003, 0.99, 2.125], [3.205, 2.357, 1.438, 0.088, 1.176, 2.548], [3.215, 2.712, 1.754, 0.977, -0.046, 0.843]]\nD: [[-0.786, -3.408, 0.812, 6.62, 0.23, 1.627], [-4.064, -0.926, 0.97, 0.26, 4.308, 1.916], [2.527, -1.13, 1.126, 0.243, 4.695, 2.21], [2.347, 1.178, 0.964, 0.317, 0.117, 1.88], [2.227, 1.442, 1.014, 0.145, 0.618, 2.004], [2.348, 1.638, 0.879, 0.351, 0.101, 1.748], [2.53, 3.466, 1.165, 0.288, 3.589, 2.389], [1.982, 5.141, 1.204, 1.32, 0.458, 2.309], [1.319, 4.76, 1.151, 0.233, 1.14, 2.044], [3.664, 2.514, 1.17, 0.338, 1.259, 2.387], [3.264, 3.152, 2.173, 1.061, 0.098, 0.435]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. 
The camera pose information includes: the rotation matrix: [[0.688084, 0.423256, -0.589401], [0.725514, -0.415863, 0.54835], [-0.013017, -0.80493, -0.593227]]; the translation vector: [3.968163, 0.8771, 1.421607], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.627, -3.376, 0.855, 6.372, 0.045, 1.93], [-4.559, -0.814, 0.815, 0.23, 4.236, 2.315], [2.457, -1.253, 1.135, 0.502, 4.3, 1.849], [2.763, 1.048, 0.711, 0.538, 0.174, 2.307], [2.637, 1.438, 0.737, -0.26, 0.359, 1.655], [2.838, 1.209, 1.15, -0.136, 0.413, 2.018], [2.547, 3.388, 0.7, 0.133, 3.272, 2.033], [1.842, 5.509, 1.228, 1.639, 0.473, 2.533], [1.462, 4.688, 1.401, -0.073, 1.6, 2.078], [3.735, 2.121, 1.265, 0.57, 0.876, 2.223], [3.212, 2.741, 1.782, 1.502, 0.578, -0.004]]\nB: [[-0.903, -3.755, 0.874, 6.176, 0.003, 1.35], [-4.554, -0.741, 1.026, -0.065, 4.304, 1.597], [2.299, -1.131, 0.865, 0.391, 4.838, 2.495], [2.651, 0.847, 1.18, 0.317, 0.031, 1.522], [1.968, 0.985, 1.174, 0.018, 0.505, 1.538], [2.371, 1.973, 0.767, 0.103, 0.399, 1.514], [2.144, 3.832, 1.306, -0.158, 3.954, 2.582], [1.91, 5.239, 0.926, 1.148, 0.013, 2.526], [1.285, 4.859, 0.706, 0.551, 0.734, 2.356], [3.523, 2.358, 1.085, 0.08, 1.478, 2.236], [3.224, 3.037, 2.38, 1.03, -0.257, 0.188]]\nC: [[-1.266, -3.485, 0.564, 6.835, 0.324, 1.764], [-3.629, -0.962, 1.016, -0.055, 4.598, 2.313], [2.437, -1.124, 1.463, -0.109, 4.49, 2.527], [2.113, 1.183, 0.667, 0.701, -0.094, 1.699], [2.045, 1.707, 0.812, -0.196, 0.311, 1.762], [2.268, 1.659, 0.591, -0.094, 0.192, 1.734], 
[2.994, 3.302, 1.563, 0.269, 3.696, 2.084], [1.688, 4.818, 0.728, 1.225, 0.665, 2.09], [0.999, 4.388, 1.202, -0.003, 0.99, 2.125], [3.205, 2.357, 1.438, 0.088, 1.176, 2.548], [3.215, 2.712, 1.754, 0.977, -0.046, 0.843]]\nD: [[-0.786, -3.408, 0.812, 6.62, 0.23, 1.627], [-4.064, -0.926, 0.97, 0.26, 4.308, 1.916], [2.527, -1.13, 1.126, 0.243, 4.695, 2.21], [2.347, 1.178, 0.964, 0.317, 0.117, 1.88], [2.227, 1.442, 1.014, 0.145, 0.618, 2.004], [2.348, 1.638, 0.879, 0.351, 0.101, 1.748], [2.53, 3.466, 1.165, 0.288, 3.589, 2.389], [1.982, 5.141, 1.204, 1.32, 0.458, 2.309], [1.319, 4.76, 1.151, 0.233, 1.14, 2.044], [3.664, 2.514, 1.17, 0.338, 1.259, 2.387], [3.264, 3.152, 2.173, 1.061, 0.098, 0.435]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_151_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_151_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[2.24, 0.127, 1.561, 0.17, 6.452, 1.322]]\nB: [[1.808, 0.445, 1.097, -0.163, 6.206, 1.162]]\nC: [[2.076, -0.046, 1.133, 0.371, 6.595, 0.908]]\nD: [[2.152, 0.397, 1.838, 0.141, 6.038, 1.758]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the whiteboard in the scene. The camera pose information includes: the rotation matrix: [[-0.176261, -0.039155, 0.983564], [-0.983722, -0.028492, -0.177423], [0.03497, -0.998827, -0.033496]]; the translation vector: [3.054739, 2.437738, 1.503838], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[2.24, 0.127, 1.561, 0.17, 6.452, 1.322]]\nB: [[1.808, 0.445, 1.097, -0.163, 6.206, 1.162]]\nC: [[2.076, -0.046, 1.133, 0.371, 6.595, 0.908]]\nD: [[2.152, 0.397, 1.838, 0.141, 6.038, 1.758]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_152_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_152_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.377, 1.75, 0.313, 0.637, 1.084, 1.086], [-0.879, 2.594, 0.422, 0.742, 0.234, 0.85]]\nB: [[-0.13, 2.575, 0.337, 1.18, 0.166, 1.328], [-0.585, 2.455, 0.693, 0.879, 0.311, 1.309]]\nC: [[-0.219, 1.756, 0.406, 1.056, 0.511, 0.967], [-0.363, 1.945, 0.501, 1.166, 0.939, 0.804]]\nD: [[-0.109, 2.202, 0.796, 0.742, 0.65, 0.918], [-0.778, 2.232, 0.83, 0.777, 0.638, 0.953]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the sofa chair in the scene. The camera pose information includes: the rotation matrix: [[0.753053, 0.123809, -0.646206], [0.619922, -0.462608, 0.633791], [-0.220471, -0.877875, -0.42512]]; the translation vector: [4.259223, 3.769218, 1.505729], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.377, 1.75, 0.313, 0.637, 1.084, 1.086], [-0.879, 2.594, 0.422, 0.742, 0.234, 0.85]]\nB: [[-0.13, 2.575, 0.337, 1.18, 0.166, 1.328], [-0.585, 2.455, 0.693, 0.879, 0.311, 1.309]]\nC: [[-0.219, 1.756, 0.406, 1.056, 0.511, 0.967], [-0.363, 1.945, 0.501, 1.166, 0.939, 0.804]]\nD: [[-0.109, 2.202, 0.796, 0.742, 0.65, 0.918], [-0.778, 2.232, 0.83, 0.777, 0.638, 0.953]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_153_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_153_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.251, 0.156, -0.128, -0.051, 0.628, 0.239], [0.767, -0.47, 0.609, 0.494, 0.504, 0.166], [-0.094, 1.499, 1.301, 0.409, 0.637, -0.038], [-0.218, 1.599, 2.043, 0.222, -0.051, -0.084], [0.774, 1.201, 1.813, 0.251, -0.38, 0.063]]\nB: [[-0.408, 0.627, 0.343, 0.188, 0.542, 0.081], [0.501, -0.329, 0.635, 0.154, 0.177, 0.084], [0.343, 1.237, 1.788, 0.265, 0.262, 0.091], [0.216, 1.167, 1.709, 0.275, 0.094, 0.094], [0.467, 1.14, 1.723, 0.259, 0.069, 0.108]]\nC: [[-0.807, 0.141, 0.815, 0.53, 0.607, 0.51], [0.917, -0.099, 0.427, -0.321, -0.28, 0.334], [0.531, 1.399, 2.196, 0.43, 0.03, 0.076], [0.47, 0.735, 1.914, -0.093, 0.374, 0.55], [-0.032, 1.587, 2.029, 0.611, -0.009, -0.144]]\nD: [[-0.225, 0.433, 0.214, 0.523, 1.033, -0.125], [0.497, -0.466, 0.903, 0.572, 0.328, -0.033], [0.249, 0.868, 1.316, 0.58, 0.558, -0.337], [0.688, 0.673, 1.442, -0.064, -0.139, -0.391], [0.045, 1.256, 1.359, -0.021, 0.452, 0.403]]", "question": "Given a RGB image and a 
depth image, please detect the 3D bounding box of the towel in the scene. The camera pose information includes: the rotation matrix: [[0.956223, -0.170898, 0.237554], [-0.292595, -0.544035, 0.786393], [-0.005155, -0.821474, -0.570223]]; the translation vector: [1.275326, 2.834272, 1.3185], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.251, 0.156, -0.128, -0.051, 0.628, 0.239], [0.767, -0.47, 0.609, 0.494, 0.504, 0.166], [-0.094, 1.499, 1.301, 0.409, 0.637, -0.038], [-0.218, 1.599, 2.043, 0.222, -0.051, -0.084], [0.774, 1.201, 1.813, 0.251, -0.38, 0.063]]\nB: [[-0.408, 0.627, 0.343, 0.188, 0.542, 0.081], [0.501, -0.329, 0.635, 0.154, 0.177, 0.084], [0.343, 1.237, 1.788, 0.265, 0.262, 0.091], [0.216, 1.167, 1.709, 0.275, 0.094, 0.094], [0.467, 1.14, 1.723, 0.259, 0.069, 0.108]]\nC: [[-0.807, 0.141, 0.815, 0.53, 0.607, 0.51], [0.917, -0.099, 0.427, -0.321, -0.28, 0.334], [0.531, 1.399, 2.196, 0.43, 0.03, 0.076], [0.47, 0.735, 1.914, -0.093, 0.374, 0.55], [-0.032, 1.587, 2.029, 0.611, -0.009, -0.144]]\nD: [[-0.225, 0.433, 0.214, 0.523, 1.033, -0.125], [0.497, -0.466, 0.903, 0.572, 0.328, -0.033], [0.249, 0.868, 1.316, 0.58, 0.558, -0.337], [0.688, 0.673, 1.442, -0.064, -0.139, -0.391], [0.045, 1.256, 1.359, -0.021, 0.452, 0.403]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_154_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_154_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", 
"visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.306, -0.014, 1.979, 4.019, 7.414, 0.307]]\nB: [[0.103, 0.292, 1.906, 4.176, 7.558, 0.724]]\nC: [[0.489, 0.437, 1.928, 3.337, 7.327, 0.317]]\nD: [[0.278, 0.096, 1.983, 3.8, 7.07, 0.334]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the ceiling in the scene. The camera pose information includes: the rotation matrix: [[-0.443363, -0.325026, 0.835337], [-0.895367, 0.117125, -0.429651], [0.041809, -0.938424, -0.342946]]; the translation vector: [2.190343, 3.392878, 1.594635], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.306, -0.014, 1.979, 4.019, 7.414, 0.307]]\nB: [[0.103, 0.292, 1.906, 4.176, 7.558, 0.724]]\nC: [[0.489, 0.437, 1.928, 3.337, 7.327, 0.317]]\nD: [[0.278, 0.096, 1.983, 3.8, 7.07, 0.334]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_155_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_155_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.28, 0.304, -0.341, 2.294, 3.044, -0.02], [-1.331, 0.139, -0.157, 1.326, 1.929, 0.397]]\nB: [[0.062, -0.149, 0.038, 2.484, 2.88, 0.127], [-1.587, -0.328, 0.005, 1.461, 1.909, 0.087]]\nC: [[0.029, -0.482, -0.368, 2.94, 2.904, 0.128], [-1.213, -0.173, 0.109, 1.128, 1.645, 0.334]]\nD: [[0.293, -0.453, -0.316, 2.555, 2.944, -0.31], [-1.538, -0.329, -0.099, 1.121, 2.246, -0.19]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the floor in the scene. The camera pose information includes: the rotation matrix: [[0.59597, 0.482312, -0.642025], [0.802979, -0.35126, 0.4815], [0.006716, -0.802491, -0.596626]]; the translation vector: [3.449961, 1.112515, 1.412234], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.28, 0.304, -0.341, 2.294, 3.044, -0.02], [-1.331, 0.139, -0.157, 1.326, 1.929, 0.397]]\nB: [[0.062, -0.149, 0.038, 2.484, 2.88, 0.127], [-1.587, -0.328, 0.005, 1.461, 1.909, 0.087]]\nC: [[0.029, -0.482, -0.368, 2.94, 2.904, 0.128], [-1.213, -0.173, 0.109, 1.128, 1.645, 0.334]]\nD: [[0.293, -0.453, -0.316, 2.555, 2.944, -0.31], [-1.538, -0.329, -0.099, 1.121, 2.246, -0.19]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_156_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_156_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.985, 1.36, 0.323, 0.802, 0.423, -0.124], [2.422, 0.684, 0.754, 0.948, 0.198, 0.546], [-0.687, -2.501, -0.196, 0.73, 0.321, 0.703]]\nB: [[1.365, 1.983, 0.034, 0.504, 0.741, 0.248], [1.92, 1.004, 0.632, 0.401, 0.542, 0.139], [-0.839, -2.927, -0.163, 0.351, 0.752, 0.304]]\nC: [[1.454, 1.792, 0.377, 0.63, 0.637, 0.263], [2.367, 0.546, 0.29, 0.458, 0.434, 0.427], [-1.072, -2.953, 0.222, 0.398, 0.377, 0.406]]\nD: [[1.15, 1.795, -0.026, 0.476, 0.371, 0.563], [2.092, 0.112, 0.084, 0.025, 0.591, 0.3], [-1.071, -3.278, 0.072, 0.031, 0.342, 0.689]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the seat in the scene. The camera pose information includes: the rotation matrix: [[0.000188, -0.47362, 0.88073], [-0.997828, 0.057931, 0.031365], [-0.065877, -0.878822, -0.47258]]; the translation vector: [4.366519, 5.511691, 1.307889], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.985, 1.36, 0.323, 0.802, 0.423, -0.124], [2.422, 0.684, 0.754, 0.948, 0.198, 0.546], [-0.687, -2.501, -0.196, 0.73, 0.321, 0.703]]\nB: [[1.365, 1.983, 0.034, 0.504, 0.741, 0.248], [1.92, 1.004, 0.632, 0.401, 0.542, 0.139], [-0.839, -2.927, -0.163, 0.351, 0.752, 0.304]]\nC: [[1.454, 1.792, 0.377, 0.63, 0.637, 0.263], [2.367, 0.546, 0.29, 0.458, 0.434, 0.427], [-1.072, -2.953, 0.222, 0.398, 0.377, 0.406]]\nD: [[1.15, 1.795, -0.026, 0.476, 0.371, 0.563], [2.092, 0.112, 0.084, 0.025, 0.591, 0.3], [-1.071, -3.278, 0.072, 0.031, 0.342, 0.689]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_157_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_157_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.531, 1.811, 0.438, 0.96, -0.248, 1.075], [-1.41, -1.002, 0.77, 0.47, 1.246, 1.71]]\nB: [[-0.951, 1.545, 0.971, 1.428, 0.629, 1.104], [-2.05, -1.043, 0.695, 0.095, 1.11, 1.626]]\nC: [[-1.029, 1.273, 0.377, 1.0, 0.707, 1.651], [-1.265, -0.353, 1.355, -0.02, 0.917, 2.297]]\nD: [[-1.312, 1.674, 0.691, 1.103, 0.228, 1.421], [-1.753, -0.603, 0.956, 0.349, 1.091, 2.04]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. 
The camera pose information includes: the rotation matrix: [[0.927869, -0.125596, 0.351119], [-0.372891, -0.32108, 0.870551], [0.003399, -0.938687, -0.344754]]; the translation vector: [5.442723, 4.031985, 1.348893], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.531, 1.811, 0.438, 0.96, -0.248, 1.075], [-1.41, -1.002, 0.77, 0.47, 1.246, 1.71]]\nB: [[-0.951, 1.545, 0.971, 1.428, 0.629, 1.104], [-2.05, -1.043, 0.695, 0.095, 1.11, 1.626]]\nC: [[-1.029, 1.273, 0.377, 1.0, 0.707, 1.651], [-1.265, -0.353, 1.355, -0.02, 0.917, 2.297]]\nD: [[-1.312, 1.674, 0.691, 1.103, 0.228, 1.421], [-1.753, -0.603, 0.956, 0.349, 1.091, 2.04]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_158_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_158_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.1, -0.155, 0.488, 0.884, 0.762, 1.139]]\nB: [[-1.609, -0.239, 0.938, 0.096, 1.426, 1.078]]\nC: [[-1.861, -0.273, 0.81, 0.598, 0.82, 0.783]]\nD: [[-1.644, -0.605, 0.583, 0.402, 1.231, 1.185]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the shelf in the scene. 
The camera pose information includes: the rotation matrix: [[-0.070416, -0.411804, 0.908548], [-0.99671, 0.065705, -0.047468], [-0.040148, -0.908901, -0.415075]]; the translation vector: [2.214543, 1.806687, 1.391502], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-2.1, -0.155, 0.488, 0.884, 0.762, 1.139]]\nB: [[-1.609, -0.239, 0.938, 0.096, 1.426, 1.078]]\nC: [[-1.861, -0.273, 0.81, 0.598, 0.82, 0.783]]\nD: [[-1.644, -0.605, 0.583, 0.402, 1.231, 1.185]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_159_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_159_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.605, -0.075, 2.599, 6.78, 7.188, 0.753]]\nB: [[-0.136, -0.074, 2.664, 7.091, 7.331, 0.624]]\nC: [[-0.11, -0.067, 2.645, 6.713, 7.047, 0.627]]\nD: [[-0.59, -0.552, 3.048, 6.673, 7.52, 0.884]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the ceiling in the scene. The camera pose information includes: the rotation matrix: [[-0.955421, 0.119616, -0.269932], [0.295248, 0.388339, -0.872939], [0.000408, -0.91372, -0.406343]]; the translation vector: [2.65583, 2.981598, 1.368648], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.605, -0.075, 2.599, 6.78, 7.188, 0.753]]\nB: [[-0.136, -0.074, 2.664, 7.091, 7.331, 0.624]]\nC: [[-0.11, -0.067, 2.645, 6.713, 7.047, 0.627]]\nD: [[-0.59, -0.552, 3.048, 6.673, 7.52, 0.884]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_160_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_160_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.773, 0.999, 2.176, 3.762, 2.16, 0.713]]\nB: [[1.402, 0.542, 2.42, 3.544, 2.145, 0.268]]\nC: [[0.962, 0.894, 1.956, 3.984, 2.23, 0.213]]\nD: [[1.508, 0.544, 2.14, 3.898, 2.009, 0.701]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the ceiling in the scene. The camera pose information includes: the rotation matrix: [[-0.454685, 0.144673, -0.878824], [0.890085, 0.109034, -0.442562], [0.031795, -0.983454, -0.178347]]; the translation vector: [3.311996, 2.119304, 1.59409], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.773, 0.999, 2.176, 3.762, 2.16, 0.713]]\nB: [[1.402, 0.542, 2.42, 3.544, 2.145, 0.268]]\nC: [[0.962, 0.894, 1.956, 3.984, 2.23, 0.213]]\nD: [[1.508, 0.544, 2.14, 3.898, 2.009, 0.701]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_161_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_161_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.445, -1.591, 0.401, 1.208, 0.362, 0.709], [0.85, 1.846, 0.287, 0.495, 1.135, -0.015]]\nB: [[1.318, -1.383, 0.256, 0.782, 0.724, 0.542], [1.339, 2.155, 0.239, 0.765, 0.899, 0.445]]\nC: [[1.731, -1.727, 0.665, 0.715, 0.694, 0.718], [0.941, 2.531, -0.018, 1.235, 0.51, 0.14]]\nD: [[1.454, -1.414, -0.024, 0.443, 0.46, 0.088], [0.989, 2.555, 0.477, 1.003, 1.282, 0.286]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the coffee table in the scene. The camera pose information includes: the rotation matrix: [[0.990268, -0.101591, 0.095124], [-0.135934, -0.559426, 0.817658], [-0.029851, -0.822631, -0.567792]]; the translation vector: [6.679901, 2.488796, 1.402653], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.445, -1.591, 0.401, 1.208, 0.362, 0.709], [0.85, 1.846, 0.287, 0.495, 1.135, -0.015]]\nB: [[1.318, -1.383, 0.256, 0.782, 0.724, 0.542], [1.339, 2.155, 0.239, 0.765, 0.899, 0.445]]\nC: [[1.731, -1.727, 0.665, 0.715, 0.694, 0.718], [0.941, 2.531, -0.018, 1.235, 0.51, 0.14]]\nD: [[1.454, -1.414, -0.024, 0.443, 0.46, 0.088], [0.989, 2.555, 0.477, 1.003, 1.282, 0.286]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_162_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_162_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.849, -1.336, 0.075, 0.117, 0.119, 0.051]]\nB: [[1.779, -1.476, 0.442, 0.485, 0.615, 0.395]]\nC: [[1.904, -1.448, 0.152, 0.474, 0.108, -0.174]]\nD: [[1.427, -1.203, 0.391, 0.059, -0.193, -0.208]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[0.246516, -0.470365, 0.847341], [-0.959136, 0.006886, 0.282862], [-0.138884, -0.882445, -0.449446]]; the translation vector: [3.043058, 2.955299, 1.551102], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.849, -1.336, 0.075, 0.117, 0.119, 0.051]]\nB: [[1.779, -1.476, 0.442, 0.485, 0.615, 0.395]]\nC: [[1.904, -1.448, 0.152, 0.474, 0.108, -0.174]]\nD: [[1.427, -1.203, 0.391, 0.059, -0.193, -0.208]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_163_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_163_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.487, 1.645, 0.382, 0.302, 0.482, 0.482], [0.998, 1.485, 0.258, 0.64, 0.279, 0.771], [0.516, 1.799, 0.05, 0.639, 0.719, 0.616], [0.027, 2.307, 0.334, 0.459, 0.957, 1.087], [-0.766, 2.114, 0.879, 0.637, 0.926, 1.186], [-1.637, 1.716, 0.195, 0.889, 0.178, 0.887], [-1.878, 1.498, 0.042, 1.014, 0.211, 0.465], [-2.168, 2.812, 0.748, 0.089, 0.133, 0.111], [-2.511, -0.128, 0.278, 0.262, 0.627, 1.122], [-3.252, -1.403, 0.351, 0.543, 0.875, 0.964], [-2.67, -1.518, 0.047, 0.142, 0.368, 0.87], [-0.679, -1.008, 0.86, 0.74, 0.76, 1.005], [-0.054, -1.776, 0.013, 0.602, 0.383, 0.483], [0.442, -1.702, 0.129, 0.294, 0.491, 1.264], [-0.906, -1.527, 0.588, 0.773, 1.129, 1.323], [1.797, -0.711, 0.306, 0.186, 0.995, 1.019], [1.694, -1.236, 0.46, 0.778, 1.151, 1.284], [1.24, -1.901, 0.667, 0.364, 1.023, 0.918], [2.398, -1.858, 0.981, 0.284, 0.726, 0.83], [3.224, -1.673, 0.285, 0.245, 0.491, 0.975], [1.768, 2.317, 0.9, 0.325, 1.027, 1.062], [1.496, 2.661, 0.93, -0.007, 0.819, 0.169]]\nB: [[1.146, 0.948, 0.122, 0.95, 1.005, 0.549], [0.748, 0.973, 0.155, 0.6, 0.484, 0.381], [0.549, 2.17, 0.091, 0.666, 0.717, 0.866], [0.554, 2.811, 0.815, 0.733, 0.125, 0.7], [-0.925, 2.187, 0.558, 0.622, 0.213, 0.874], [-1.744, 1.277, 0.001, 0.646, 0.973, 0.704], [-1.809, 2.353, 0.808, 0.048, 0.797, 0.7], [-1.256, 2.641, 1.036, 0.522, 0.609, 0.158], [-1.879, -0.087, 0.596, 0.81, 0.571, 0.463], [-3.345, -0.961, 0.298, 0.354, 0.59, 
1.207], [-2.802, -1.895, 0.135, 0.976, 1.183, 0.764], [-0.847, -1.618, 0.508, 0.783, 0.348, 1.292], [-0.511, -1.056, 0.376, 0.73, 0.392, 1.159], [-0.024, -2.057, 0.759, 0.532, 0.455, 0.817], [-0.251, -2.214, 0.173, 1.127, 0.862, 0.708], [2.174, -0.228, 0.822, 0.364, 0.554, 0.827], [1.928, -1.877, 0.198, 0.653, 1.131, 1.053], [1.218, -2.319, 0.663, 0.163, 0.153, 0.793], [2.951, -1.156, 0.405, 1.011, 0.624, 0.772], [3.153, -1.986, 0.421, 0.263, 0.33, 0.7], [1.367, 2.28, 0.547, 1.058, 0.935, 1.287], [2.006, 2.966, 0.782, 0.332, 0.619, 0.04]]\nC: [[1.346, 1.054, 0.767, 0.951, 0.758, 0.769], [0.659, 1.706, 0.684, 0.913, 0.914, 1.319], [0.805, 2.288, 0.288, 0.155, 0.839, 0.635], [0.287, 2.236, 0.545, 0.587, 0.976, 0.783], [-1.118, 2.319, 0.772, 1.192, 0.851, 0.415], [-1.552, 1.463, 0.231, 0.636, 0.79, 0.457], [-1.992, 2.15, 0.851, 0.919, 1.11, 0.624], [-1.923, 2.253, 1.26, 0.407, 0.257, 0.58], [-2.064, -0.023, 0.196, 0.32, 0.999, 0.859], [-3.224, -1.369, 0.324, 1.046, 0.849, 0.941], [-2.953, -2.153, 0.953, 0.315, 0.426, 0.39], [-0.29, -1.189, 0.464, 0.368, 1.039, 1.28], [-0.098, -2.0, 0.391, 0.817, 0.212, 1.036], [0.365, -1.822, 0.164, 0.214, 0.365, 0.378], [-0.847, -2.191, 0.875, 0.968, 0.479, 0.553], [2.332, -0.438, 0.431, 0.138, 0.956, 1.041], [1.345, -2.004, 0.538, 0.439, 0.287, 0.73], [1.393, -1.64, 0.88, 0.322, 0.297, 1.16], [3.052, -1.259, 0.761, 0.943, 0.828, 0.5], [2.735, -2.476, 0.875, 0.335, 1.087, 0.495], [1.336, 2.088, 0.504, 0.673, 1.053, 0.469], [1.386, 2.116, 0.605, 0.191, 0.61, 0.101]]\nD: [[1.518, 1.271, 0.394, 0.605, 0.594, 0.849], [0.943, 1.353, 0.378, 0.619, 0.666, 0.828], [0.701, 1.955, 0.404, 0.648, 0.696, 0.84], [0.523, 2.479, 0.454, 0.541, 0.563, 0.792], [-1.051, 2.117, 0.448, 0.79, 0.709, 0.804], [-1.341, 1.248, 0.462, 0.57, 0.622, 0.853], [-1.574, 1.994, 0.519, 0.538, 0.675, 0.779], [-1.737, 2.403, 0.858, 0.16, 0.317, 0.168], [-2.078, -0.466, 0.495, 0.568, 0.586, 0.801], [-2.925, -1.082, 0.538, 0.66, 0.66, 0.803], [-3.037, -1.752, 0.519, 0.574, 
0.705, 0.845], [-0.539, -1.191, 0.375, 0.64, 0.637, 0.843], [-0.068, -1.536, 0.384, 0.646, 0.641, 0.825], [-0.052, -2.09, 0.408, 0.661, 0.773, 0.824], [-0.669, -1.919, 0.395, 0.676, 0.647, 0.832], [2.151, -0.689, 0.438, 0.636, 0.62, 0.802], [1.695, -1.528, 0.421, 0.589, 0.733, 0.82], [1.703, -2.028, 0.457, 0.561, 0.65, 0.798], [2.65, -1.483, 0.534, 0.701, 0.712, 0.852], [2.844, -2.087, 0.588, 0.548, 0.714, 0.804], [1.775, 1.985, 0.459, 0.664, 0.67, 0.811], [1.768, 2.603, 0.602, 0.329, 0.514, 0.537]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[0.424269, -0.366439, 0.828081], [-0.894198, -0.025281, 0.446957], [-0.142848, -0.930098, -0.338395]]; the translation vector: [2.638367, 6.760901, 1.41712], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[1.487, 1.645, 0.382, 0.302, 0.482, 0.482], [0.998, 1.485, 0.258, 0.64, 0.279, 0.771], [0.516, 1.799, 0.05, 0.639, 0.719, 0.616], [0.027, 2.307, 0.334, 0.459, 0.957, 1.087], [-0.766, 2.114, 0.879, 0.637, 0.926, 1.186], [-1.637, 1.716, 0.195, 0.889, 0.178, 0.887], [-1.878, 1.498, 0.042, 1.014, 0.211, 0.465], [-2.168, 2.812, 0.748, 0.089, 0.133, 0.111], [-2.511, -0.128, 0.278, 0.262, 0.627, 1.122], [-3.252, -1.403, 0.351, 0.543, 0.875, 0.964], [-2.67, -1.518, 0.047, 0.142, 0.368, 0.87], [-0.679, -1.008, 0.86, 0.74, 0.76, 1.005], [-0.054, -1.776, 0.013, 0.602, 0.383, 0.483], [0.442, -1.702, 0.129, 0.294, 0.491, 1.264], [-0.906, -1.527, 0.588, 0.773, 1.129, 1.323], [1.797, -0.711, 0.306, 0.186, 0.995, 1.019], [1.694, -1.236, 0.46, 0.778, 1.151, 1.284], [1.24, -1.901, 0.667, 0.364, 1.023, 0.918], [2.398, -1.858, 0.981, 0.284, 0.726, 0.83], [3.224, -1.673, 0.285, 0.245, 0.491, 0.975], [1.768, 2.317, 0.9, 0.325, 1.027, 1.062], [1.496, 2.661, 0.93, -0.007, 0.819, 0.169]]\nB: [[1.146, 0.948, 0.122, 0.95, 1.005, 0.549], [0.748, 0.973, 0.155, 0.6, 0.484, 0.381], [0.549, 2.17, 0.091, 0.666, 0.717, 0.866], [0.554, 2.811, 0.815, 0.733, 0.125, 0.7], [-0.925, 2.187, 0.558, 0.622, 0.213, 0.874], [-1.744, 1.277, 0.001, 0.646, 0.973, 0.704], [-1.809, 2.353, 0.808, 0.048, 0.797, 0.7], [-1.256, 2.641, 1.036, 0.522, 0.609, 0.158], [-1.879, -0.087, 0.596, 0.81, 0.571, 0.463], [-3.345, -0.961, 0.298, 0.354, 0.59, 1.207], [-2.802, -1.895, 0.135, 0.976, 1.183, 0.764], [-0.847, -1.618, 0.508, 0.783, 0.348, 1.292], [-0.511, -1.056, 0.376, 0.73, 0.392, 1.159], [-0.024, -2.057, 0.759, 0.532, 0.455, 0.817], [-0.251, -2.214, 0.173, 1.127, 0.862, 0.708], [2.174, -0.228, 0.822, 0.364, 0.554, 0.827], [1.928, -1.877, 0.198, 0.653, 1.131, 1.053], [1.218, -2.319, 0.663, 0.163, 0.153, 0.793], [2.951, -1.156, 0.405, 1.011, 0.624, 0.772], [3.153, -1.986, 0.421, 0.263, 0.33, 0.7], [1.367, 2.28, 0.547, 1.058, 0.935, 1.287], [2.006, 2.966, 0.782, 0.332, 0.619, 
0.04]]\nC: [[1.346, 1.054, 0.767, 0.951, 0.758, 0.769], [0.659, 1.706, 0.684, 0.913, 0.914, 1.319], [0.805, 2.288, 0.288, 0.155, 0.839, 0.635], [0.287, 2.236, 0.545, 0.587, 0.976, 0.783], [-1.118, 2.319, 0.772, 1.192, 0.851, 0.415], [-1.552, 1.463, 0.231, 0.636, 0.79, 0.457], [-1.992, 2.15, 0.851, 0.919, 1.11, 0.624], [-1.923, 2.253, 1.26, 0.407, 0.257, 0.58], [-2.064, -0.023, 0.196, 0.32, 0.999, 0.859], [-3.224, -1.369, 0.324, 1.046, 0.849, 0.941], [-2.953, -2.153, 0.953, 0.315, 0.426, 0.39], [-0.29, -1.189, 0.464, 0.368, 1.039, 1.28], [-0.098, -2.0, 0.391, 0.817, 0.212, 1.036], [0.365, -1.822, 0.164, 0.214, 0.365, 0.378], [-0.847, -2.191, 0.875, 0.968, 0.479, 0.553], [2.332, -0.438, 0.431, 0.138, 0.956, 1.041], [1.345, -2.004, 0.538, 0.439, 0.287, 0.73], [1.393, -1.64, 0.88, 0.322, 0.297, 1.16], [3.052, -1.259, 0.761, 0.943, 0.828, 0.5], [2.735, -2.476, 0.875, 0.335, 1.087, 0.495], [1.336, 2.088, 0.504, 0.673, 1.053, 0.469], [1.386, 2.116, 0.605, 0.191, 0.61, 0.101]]\nD: [[1.518, 1.271, 0.394, 0.605, 0.594, 0.849], [0.943, 1.353, 0.378, 0.619, 0.666, 0.828], [0.701, 1.955, 0.404, 0.648, 0.696, 0.84], [0.523, 2.479, 0.454, 0.541, 0.563, 0.792], [-1.051, 2.117, 0.448, 0.79, 0.709, 0.804], [-1.341, 1.248, 0.462, 0.57, 0.622, 0.853], [-1.574, 1.994, 0.519, 0.538, 0.675, 0.779], [-1.737, 2.403, 0.858, 0.16, 0.317, 0.168], [-2.078, -0.466, 0.495, 0.568, 0.586, 0.801], [-2.925, -1.082, 0.538, 0.66, 0.66, 0.803], [-3.037, -1.752, 0.519, 0.574, 0.705, 0.845], [-0.539, -1.191, 0.375, 0.64, 0.637, 0.843], [-0.068, -1.536, 0.384, 0.646, 0.641, 0.825], [-0.052, -2.09, 0.408, 0.661, 0.773, 0.824], [-0.669, -1.919, 0.395, 0.676, 0.647, 0.832], [2.151, -0.689, 0.438, 0.636, 0.62, 0.802], [1.695, -1.528, 0.421, 0.589, 0.733, 0.82], [1.703, -2.028, 0.457, 0.561, 0.65, 0.798], [2.65, -1.483, 0.534, 0.701, 0.712, 0.852], [2.844, -2.087, 0.588, 0.548, 0.714, 0.804], [1.775, 1.985, 0.459, 0.664, 0.67, 0.811], [1.768, 2.603, 0.602, 0.329, 0.514, 0.537]]", "input_image_path": 
["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_164_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_164_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[2.682, 0.363, 0.619, 1.02, 0.762, 0.226], [0.295, -1.101, 1.206, 0.33, 0.677, 0.052], [-2.902, 0.765, -0.248, 0.755, 0.487, 0.427]]\nB: [[2.599, 0.763, 0.355, 0.291, 0.253, 0.563], [0.364, -0.942, 0.366, 0.823, 0.285, 0.293], [-3.251, -0.039, 0.534, 0.204, 0.315, 0.125]]\nC: [[2.754, 0.716, 0.728, 0.739, 0.536, 0.095], [-0.308, -0.319, 1.147, 0.102, 0.805, 0.177], [-3.102, -0.022, 0.312, 0.658, 0.474, 0.358]]\nD: [[2.461, 0.569, 0.328, 0.546, 0.491, 0.37], [-0.048, -0.818, 0.757, 0.462, 0.439, 0.351], [-2.848, 0.285, 0.131, 0.472, 0.5, 0.37]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the box in the scene. The camera pose information includes: the rotation matrix: [[0.764638, 0.028658, -0.643823], [0.64431, -0.055554, 0.762744], [-0.013909, -0.998044, -0.060944]]; the translation vector: [3.061982, 3.98913, 1.495508], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[2.682, 0.363, 0.619, 1.02, 0.762, 0.226], [0.295, -1.101, 1.206, 0.33, 0.677, 0.052], [-2.902, 0.765, -0.248, 0.755, 0.487, 0.427]]\nB: [[2.599, 0.763, 0.355, 0.291, 0.253, 0.563], [0.364, -0.942, 0.366, 0.823, 0.285, 0.293], [-3.251, -0.039, 0.534, 0.204, 0.315, 0.125]]\nC: [[2.754, 0.716, 0.728, 0.739, 0.536, 0.095], [-0.308, -0.319, 1.147, 0.102, 0.805, 0.177], [-3.102, -0.022, 0.312, 0.658, 0.474, 0.358]]\nD: [[2.461, 0.569, 0.328, 0.546, 0.491, 0.37], [-0.048, -0.818, 0.757, 0.462, 0.439, 0.351], [-2.848, 0.285, 0.131, 0.472, 0.5, 0.37]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_165_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_165_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.116, -0.769, 0.665, 0.588, 1.197, 1.025], [-0.796, -1.178, 0.171, 0.576, 0.604, 0.431], [-0.822, 1.221, 0.066, 0.696, 1.356, 0.542], [0.425, 0.671, 0.344, 0.824, 1.179, 0.866]]\nB: [[0.608, -0.936, 0.413, 0.854, 0.8, 0.77], [-0.425, -0.856, 0.339, 0.897, 0.767, 0.762], [-0.451, 1.126, 0.358, 0.838, 0.91, 0.764], [0.774, 1.047, 0.416, 0.815, 0.841, 0.775]]\nC: [[0.288, -1.283, 0.655, 0.817, 0.674, 0.566], [-0.233, -0.57, 0.023, 0.569, 0.942, 1.169], [-0.037, 0.77, 0.308, 0.824, 1.383, 0.685], [0.662, 1.515, 0.896, 0.594, 0.416, 0.9]]\nD: [[1.018, -1.113, 0.375, 0.665, 0.803, 1.039], [-0.701, -1.19, 0.042, 0.611, 0.648, 0.566], [-0.236, 1.15, 0.63, 1.12, 1.165, 0.969], [0.285, 0.606, 0.443, 1.268, 0.881, 0.591]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. 
The camera pose information includes: the rotation matrix: [[-0.711391, -0.463973, 0.527875], [-0.700286, 0.531398, -0.476672], [-0.059349, -0.708763, -0.702945]]; the translation vector: [2.53321, 4.394931, 1.530427], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.116, -0.769, 0.665, 0.588, 1.197, 1.025], [-0.796, -1.178, 0.171, 0.576, 0.604, 0.431], [-0.822, 1.221, 0.066, 0.696, 1.356, 0.542], [0.425, 0.671, 0.344, 0.824, 1.179, 0.866]]\nB: [[0.608, -0.936, 0.413, 0.854, 0.8, 0.77], [-0.425, -0.856, 0.339, 0.897, 0.767, 0.762], [-0.451, 1.126, 0.358, 0.838, 0.91, 0.764], [0.774, 1.047, 0.416, 0.815, 0.841, 0.775]]\nC: [[0.288, -1.283, 0.655, 0.817, 0.674, 0.566], [-0.233, -0.57, 0.023, 0.569, 0.942, 1.169], [-0.037, 0.77, 0.308, 0.824, 1.383, 0.685], [0.662, 1.515, 0.896, 0.594, 0.416, 0.9]]\nD: [[1.018, -1.113, 0.375, 0.665, 0.803, 1.039], [-0.701, -1.19, 0.042, 0.611, 0.648, 0.566], [-0.236, 1.15, 0.63, 1.12, 1.165, 0.969], [0.285, 0.606, 0.443, 1.268, 0.881, 0.591]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_166_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_166_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.097, 1.124, 0.669, 0.502, 0.516, 0.549], [-0.719, 0.622, 0.51, 0.696, 0.696, 1.008], [0.747, 0.329, 0.449, 0.568, 0.565, 0.934], [0.72, 0.839, 0.522, 0.626, 0.707, 0.997], 
[-0.373, -0.636, 0.467, 0.582, 0.551, 0.906]]\nB: [[0.297, 0.852, 0.7, 0.103, 0.44, 0.966], [-0.93, 0.904, 0.062, 0.986, 0.828, 0.767], [0.468, 0.69, 0.657, 0.758, 0.619, 1.108], [0.682, 0.702, 0.346, 0.75, 0.569, 0.847], [-0.423, -0.68, 0.291, 0.082, 0.385, 1.192]]\nC: [[0.512, 0.853, 0.312, 0.021, 0.921, 0.339], [-0.518, 0.57, 0.844, 1.067, 0.275, 1.347], [0.721, 0.423, 0.574, 0.387, 0.991, 1.286], [0.648, 0.46, 0.149, 0.657, 0.835, 0.53], [-0.541, -0.731, 0.203, 0.127, 0.654, 0.996]]\nD: [[0.168, 0.81, 1.159, 0.247, 0.182, 0.73], [-0.91, 0.423, 0.9, 0.946, 0.519, 0.547], [1.221, 0.571, 0.284, 0.571, 0.987, 1.376], [1.146, 0.534, 0.507, 0.778, 0.702, 1.372], [-0.13, -0.402, 0.492, 0.884, 0.774, 1.331]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[-0.236277, -0.452541, 0.859872], [-0.970097, 0.160455, -0.182119], [-0.055554, -0.877189, -0.47692]]; the translation vector: [1.575898, 1.961144, 1.314442], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.097, 1.124, 0.669, 0.502, 0.516, 0.549], [-0.719, 0.622, 0.51, 0.696, 0.696, 1.008], [0.747, 0.329, 0.449, 0.568, 0.565, 0.934], [0.72, 0.839, 0.522, 0.626, 0.707, 0.997], [-0.373, -0.636, 0.467, 0.582, 0.551, 0.906]]\nB: [[0.297, 0.852, 0.7, 0.103, 0.44, 0.966], [-0.93, 0.904, 0.062, 0.986, 0.828, 0.767], [0.468, 0.69, 0.657, 0.758, 0.619, 1.108], [0.682, 0.702, 0.346, 0.75, 0.569, 0.847], [-0.423, -0.68, 0.291, 0.082, 0.385, 1.192]]\nC: [[0.512, 0.853, 0.312, 0.021, 0.921, 0.339], [-0.518, 0.57, 0.844, 1.067, 0.275, 1.347], [0.721, 0.423, 0.574, 0.387, 0.991, 1.286], [0.648, 0.46, 0.149, 0.657, 0.835, 0.53], [-0.541, -0.731, 0.203, 0.127, 0.654, 0.996]]\nD: [[0.168, 0.81, 1.159, 0.247, 0.182, 0.73], [-0.91, 0.423, 0.9, 0.946, 0.519, 0.547], [1.221, 0.571, 0.284, 0.571, 0.987, 1.376], [1.146, 0.534, 0.507, 0.778, 0.702, 1.372], [-0.13, -0.402, 0.492, 0.884, 0.774, 1.331]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_167_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_167_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[2.094, 0.511, 1.124, 0.123, 2.188, 0.539]]\nB: [[2.081, 0.516, 0.947, 0.355, 2.545, 0.592]]\nC: [[1.732, 0.343, 0.947, 0.511, 2.586, 0.308]]\nD: [[1.989, 0.949, 0.649, -0.276, 2.128, 0.539]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the whiteboard in the scene. The camera pose information includes: the rotation matrix: [[-0.997074, 0.061747, -0.045056], [0.074474, 0.651998, -0.754554], [-0.017215, -0.755702, -0.654689]]; the translation vector: [1.815792, 5.369752, 1.288561], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[2.094, 0.511, 1.124, 0.123, 2.188, 0.539]]\nB: [[2.081, 0.516, 0.947, 0.355, 2.545, 0.592]]\nC: [[1.732, 0.343, 0.947, 0.511, 2.586, 0.308]]\nD: [[1.989, 0.949, 0.649, -0.276, 2.128, 0.539]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_168_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_168_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.228, -1.39, 0.131, 0.85, 0.551, 0.683]]\nB: [[-1.305, -1.508, 0.232, 0.822, 0.566, 0.435]]\nC: [[-0.824, -1.786, 0.652, 1.27, 0.727, -0.053]]\nD: [[-0.844, -1.175, 0.453, 0.328, 0.627, 0.359]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the piano bench in the scene. The camera pose information includes: the rotation matrix: [[-0.804945, -0.278842, 0.523748], [-0.593014, 0.407765, -0.694307], [-0.019964, -0.869468, -0.493585]]; the translation vector: [4.871809, 2.494869, 1.402737], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.228, -1.39, 0.131, 0.85, 0.551, 0.683]]\nB: [[-1.305, -1.508, 0.232, 0.822, 0.566, 0.435]]\nC: [[-0.824, -1.786, 0.652, 1.27, 0.727, -0.053]]\nD: [[-0.844, -1.175, 0.453, 0.328, 0.627, 0.359]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_169_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_169_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.141, 1.242, 1.15, 2.629, 0.737, 2.324], [-1.855, -0.469, 1.171, 0.539, 3.779, 2.772], [0.841, 0.114, 0.787, 0.691, 3.588, 2.223], [0.571, -1.772, 1.52, 1.679, 0.887, 2.594]]\nB: [[-0.097, 1.504, 1.106, 3.029, 0.358, 2.218], [-1.545, 0.258, 1.211, 0.132, 3.756, 2.16], [0.905, 0.283, 1.397, -0.093, 3.002, 2.004], [0.394, -1.669, 1.139, 2.247, 0.169, 2.208]]\nC: [[-0.253, 1.653, 1.522, 3.078, 0.478, 2.791], [-1.503, -0.37, 1.376, 0.691, 4.127, 2.703], [1.422, -0.022, 0.986, 0.339, 3.887, 2.497], [-0.134, -1.344, 0.891, 2.344, 0.714, 2.451]]\nD: [[-0.058, 1.533, 1.269, 2.876, 0.624, 2.668], [-1.389, 0.007, 1.251, 0.231, 3.638, 2.637], [1.275, 0.042, 1.086, 0.289, 3.412, 2.272], [0.358, -1.537, 1.122, 1.906, 0.425, 2.129]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.032646, 0.194727, -0.980314], [0.998594, -0.034636, -0.040135], [-0.04177, -0.980246, -0.193322]]; the translation vector: [3.506056, 2.493951, 1.706783], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.141, 1.242, 1.15, 2.629, 0.737, 2.324], [-1.855, -0.469, 1.171, 0.539, 3.779, 2.772], [0.841, 0.114, 0.787, 0.691, 3.588, 2.223], [0.571, -1.772, 1.52, 1.679, 0.887, 2.594]]\nB: [[-0.097, 1.504, 1.106, 3.029, 0.358, 2.218], [-1.545, 0.258, 1.211, 0.132, 3.756, 2.16], [0.905, 0.283, 1.397, -0.093, 3.002, 2.004], [0.394, -1.669, 1.139, 2.247, 0.169, 2.208]]\nC: [[-0.253, 1.653, 1.522, 3.078, 0.478, 2.791], [-1.503, -0.37, 1.376, 0.691, 4.127, 2.703], [1.422, -0.022, 0.986, 0.339, 3.887, 2.497], [-0.134, -1.344, 0.891, 2.344, 0.714, 2.451]]\nD: [[-0.058, 1.533, 1.269, 2.876, 0.624, 2.668], [-1.389, 0.007, 1.251, 0.231, 3.638, 2.637], [1.275, 0.042, 1.086, 0.289, 3.412, 2.272], [0.358, -1.537, 1.122, 1.906, 0.425, 2.129]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_170_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_170_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.22, -0.005, 0.817, 1.09, 2.894, 0.662], [-2.657, -0.552, 1.11, 0.97, 1.155, 0.566]]\nB: [[-1.955, 0.127, 0.536, 1.492, 2.79, 1.348], [-2.155, 0.305, 0.66, 0.159, 1.455, 0.072]]\nC: [[-1.433, 0.186, 1.02, 1.626, 2.332, 1.534], [-2.448, -0.356, 0.853, 0.017, 1.445, 0.618]]\nD: [[-1.798, 0.428, 0.571, 1.201, 2.441, 1.114], [-2.518, -0.083, 1.081, 0.488, 1.535, 0.157]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the couch in the scene. 
The camera pose information includes: the rotation matrix: [[-0.205964, -0.505778, 0.837716], [-0.978495, 0.11627, -0.170378], [-0.011228, -0.854792, -0.518849]]; the translation vector: [2.901534, 4.292832, 1.280844], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-2.22, -0.005, 0.817, 1.09, 2.894, 0.662], [-2.657, -0.552, 1.11, 0.97, 1.155, 0.566]]\nB: [[-1.955, 0.127, 0.536, 1.492, 2.79, 1.348], [-2.155, 0.305, 0.66, 0.159, 1.455, 0.072]]\nC: [[-1.433, 0.186, 1.02, 1.626, 2.332, 1.534], [-2.448, -0.356, 0.853, 0.017, 1.445, 0.618]]\nD: [[-1.798, 0.428, 0.571, 1.201, 2.441, 1.114], [-2.518, -0.083, 1.081, 0.488, 1.535, 0.157]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_171_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_171_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.442, -1.133, 0.562, 0.636, 0.657, 0.49], [-0.931, -0.023, 0.592, 0.548, 0.635, 0.449], [1.185, -0.67, 0.523, 0.55, 0.618, 0.447], [-0.778, 1.905, 0.84, 0.606, 0.538, 0.514], [-0.723, -1.153, 0.514, 0.657, 0.632, 0.473], [-1.434, -0.489, 0.591, 0.567, 0.545, 0.458], [-1.479, -1.704, 0.547, 0.555, 0.643, 0.506], [-1.06, 0.579, 0.646, 0.554, 0.57, 0.426], [1.728, -0.095, 0.592, 0.547, 0.596, 0.473], [-1.358, 1.889, 0.774, 0.643, 0.662, 0.446], [2.187, 1.992, 0.739, 0.592, 0.503, 0.463], [-0.349, 1.313, 0.568, 0.481, 0.31, 
0.827], [0.659, 1.035, 0.643, 0.561, 0.458, 0.449], [1.351, 1.116, 0.663, 0.567, 0.545, 0.469], [1.67, 0.521, 0.73, 0.179, 0.508, 0.285], [0.482, -0.974, 0.492, 0.592, 0.586, 0.475]]\nB: [[-1.049, -1.529, 1.034, 0.201, 0.822, 0.539], [-0.9, 0.339, 0.327, 0.273, 0.766, 0.553], [0.737, -1.05, 0.211, 0.082, 0.504, 0.933], [-1.047, 2.226, 0.838, 0.996, 0.859, 0.972], [-0.719, -0.678, 0.784, 0.49, 0.145, 0.261], [-1.882, -0.392, 0.818, 0.955, 0.143, 0.713], [-1.551, -2.013, 0.366, 0.53, 0.75, 0.368], [-1.315, 0.463, 0.891, 0.81, 0.604, 0.638], [2.147, -0.334, 0.803, 0.499, 0.844, 0.692], [-1.677, 2.042, 0.864, 0.402, 1.157, 0.639], [1.976, 2.077, 0.904, 0.918, 0.711, 0.254], [-0.187, 1.603, 0.781, 0.267, -0.088, 1.027], [0.26, 0.795, 0.514, 0.847, -0.04, 0.297], [1.756, 1.456, 0.644, 0.597, 0.817, 0.47], [1.242, 0.068, 0.373, 0.448, 0.149, 0.381], [0.319, -0.553, 0.655, 0.691, 0.359, 0.589]]\nC: [[-0.988, -1.282, 0.732, 0.336, 0.483, 0.927], [-0.442, -0.073, 0.808, 0.229, 0.772, 0.639], [1.548, -1.036, 0.108, 0.525, 0.245, 0.035], [-1.266, 1.685, 1.335, 0.956, 0.747, 0.267], [-1.079, -1.607, 1.01, 0.83, 1.062, 0.521], [-1.264, -0.925, 0.343, 1.047, 0.715, 0.269], [-1.458, -1.958, 0.337, 0.66, 0.161, 0.546], [-0.733, 0.312, 0.474, 0.521, 0.178, -0.061], [2.105, 0.263, 0.727, 0.39, 0.976, 0.108], [-1.707, 1.787, 0.496, 0.472, 1.062, 0.821], [2.45, 1.544, 0.321, 1.018, 0.15, 0.075], [-0.837, 1.59, 0.268, 0.538, 0.245, 0.497], [0.297, 1.19, 0.423, 0.185, 0.686, 0.323], [0.857, 1.058, 0.937, 0.887, 0.209, 0.519], [1.802, 0.184, 0.797, 0.22, 0.094, 0.637], [0.094, -0.987, 0.725, 0.553, 1.059, 0.036]]\nD: [[-1.227, -0.819, 0.642, 0.301, 0.736, 0.894], [-1.335, 0.35, 0.132, 0.881, 0.202, 0.441], [1.374, -0.345, 0.698, 0.363, 1.089, 0.667], [-0.963, 1.843, 0.91, 0.493, 0.498, 0.35], [-1.186, -1.506, 0.169, 0.581, 0.638, 0.951], [-1.772, -0.025, 0.967, 0.473, 0.884, -0.032], [-1.614, -1.94, 0.374, 0.725, 0.441, 0.512], [-1.408, 0.285, 1.05, 0.486, 0.297, 0.835], [2.021, -0.535, 
0.654, 0.219, 0.759, 0.901], [-1.57, 2.203, 0.527, 0.16, 0.291, 0.718], [1.825, 2.298, 0.457, 1.052, 0.655, 0.73], [-0.153, 1.778, 0.354, 0.514, 0.609, 0.42], [0.512, 1.223, 0.597, 0.407, 0.628, 0.692], [1.022, 1.172, 0.206, 0.702, 0.301, 0.176], [1.64, 0.74, 0.55, 0.197, 0.956, 0.52], [0.38, -0.727, 0.278, 0.877, 0.781, 0.837]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[-0.830629, 0.239867, -0.502514], [0.556756, 0.37214, -0.742654], [0.008867, -0.896647, -0.442658]]; the translation vector: [4.849209, 2.614689, 1.447477], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.442, -1.133, 0.562, 0.636, 0.657, 0.49], [-0.931, -0.023, 0.592, 0.548, 0.635, 0.449], [1.185, -0.67, 0.523, 0.55, 0.618, 0.447], [-0.778, 1.905, 0.84, 0.606, 0.538, 0.514], [-0.723, -1.153, 0.514, 0.657, 0.632, 0.473], [-1.434, -0.489, 0.591, 0.567, 0.545, 0.458], [-1.479, -1.704, 0.547, 0.555, 0.643, 0.506], [-1.06, 0.579, 0.646, 0.554, 0.57, 0.426], [1.728, -0.095, 0.592, 0.547, 0.596, 0.473], [-1.358, 1.889, 0.774, 0.643, 0.662, 0.446], [2.187, 1.992, 0.739, 0.592, 0.503, 0.463], [-0.349, 1.313, 0.568, 0.481, 0.31, 0.827], [0.659, 1.035, 0.643, 0.561, 0.458, 0.449], [1.351, 1.116, 0.663, 0.567, 0.545, 0.469], [1.67, 0.521, 0.73, 0.179, 0.508, 0.285], [0.482, -0.974, 0.492, 0.592, 0.586, 0.475]]\nB: [[-1.049, -1.529, 1.034, 0.201, 0.822, 0.539], [-0.9, 0.339, 0.327, 0.273, 0.766, 0.553], [0.737, -1.05, 0.211, 0.082, 0.504, 0.933], [-1.047, 2.226, 0.838, 0.996, 0.859, 0.972], [-0.719, -0.678, 0.784, 0.49, 0.145, 0.261], [-1.882, -0.392, 0.818, 0.955, 0.143, 0.713], [-1.551, -2.013, 0.366, 0.53, 0.75, 0.368], [-1.315, 0.463, 0.891, 0.81, 0.604, 0.638], [2.147, -0.334, 0.803, 0.499, 0.844, 0.692], [-1.677, 2.042, 0.864, 0.402, 1.157, 0.639], [1.976, 2.077, 0.904, 0.918, 0.711, 0.254], [-0.187, 1.603, 0.781, 0.267, -0.088, 1.027], [0.26, 0.795, 0.514, 0.847, -0.04, 0.297], [1.756, 1.456, 0.644, 0.597, 0.817, 0.47], [1.242, 0.068, 0.373, 0.448, 0.149, 0.381], [0.319, -0.553, 0.655, 0.691, 0.359, 0.589]]\nC: [[-0.988, -1.282, 0.732, 0.336, 0.483, 0.927], [-0.442, -0.073, 0.808, 0.229, 0.772, 0.639], [1.548, -1.036, 0.108, 0.525, 0.245, 0.035], [-1.266, 1.685, 1.335, 0.956, 0.747, 0.267], [-1.079, -1.607, 1.01, 0.83, 1.062, 0.521], [-1.264, -0.925, 0.343, 1.047, 0.715, 0.269], [-1.458, -1.958, 0.337, 0.66, 0.161, 0.546], [-0.733, 0.312, 0.474, 0.521, 0.178, -0.061], [2.105, 0.263, 0.727, 0.39, 0.976, 0.108], [-1.707, 1.787, 0.496, 0.472, 1.062, 0.821], [2.45, 1.544, 0.321, 1.018, 0.15, 0.075], [-0.837, 1.59, 0.268, 0.538, 
0.245, 0.497], [0.297, 1.19, 0.423, 0.185, 0.686, 0.323], [0.857, 1.058, 0.937, 0.887, 0.209, 0.519], [1.802, 0.184, 0.797, 0.22, 0.094, 0.637], [0.094, -0.987, 0.725, 0.553, 1.059, 0.036]]\nD: [[-1.227, -0.819, 0.642, 0.301, 0.736, 0.894], [-1.335, 0.35, 0.132, 0.881, 0.202, 0.441], [1.374, -0.345, 0.698, 0.363, 1.089, 0.667], [-0.963, 1.843, 0.91, 0.493, 0.498, 0.35], [-1.186, -1.506, 0.169, 0.581, 0.638, 0.951], [-1.772, -0.025, 0.967, 0.473, 0.884, -0.032], [-1.614, -1.94, 0.374, 0.725, 0.441, 0.512], [-1.408, 0.285, 1.05, 0.486, 0.297, 0.835], [2.021, -0.535, 0.654, 0.219, 0.759, 0.901], [-1.57, 2.203, 0.527, 0.16, 0.291, 0.718], [1.825, 2.298, 0.457, 1.052, 0.655, 0.73], [-0.153, 1.778, 0.354, 0.514, 0.609, 0.42], [0.512, 1.223, 0.597, 0.407, 0.628, 0.692], [1.022, 1.172, 0.206, 0.702, 0.301, 0.176], [1.64, 0.74, 0.55, 0.197, 0.956, 0.52], [0.38, -0.727, 0.278, 0.877, 0.781, 0.837]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_172_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_172_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.285, 2.094, 0.223, 0.8, -0.126, 0.725], [0.89, 2.46, 0.447, -0.137, 0.338, 0.037]]\nB: [[0.378, 1.664, -0.029, 0.245, 0.007, 0.518], [0.551, 2.153, -0.045, 0.073, 0.825, 0.641]]\nC: [[0.842, 1.796, 0.181, 0.339, 0.338, 0.37], [0.768, 2.073, 0.205, 0.294, 0.394, 0.403]]\nD: [[0.562, 2.17, 0.079, 0.501, 0.638, 0.525], [0.42, 1.956, 0.647, 0.731, 0.278, 0.487]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the bucket in the scene. 
The camera pose information includes: the rotation matrix: [[-0.819759, -0.274444, 0.502669], [-0.572709, 0.39303, -0.719397], [-0.00013, -0.877615, -0.479366]]; the translation vector: [2.765326, 1.370172, 1.355227], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.285, 2.094, 0.223, 0.8, -0.126, 0.725], [0.89, 2.46, 0.447, -0.137, 0.338, 0.037]]\nB: [[0.378, 1.664, -0.029, 0.245, 0.007, 0.518], [0.551, 2.153, -0.045, 0.073, 0.825, 0.641]]\nC: [[0.842, 1.796, 0.181, 0.339, 0.338, 0.37], [0.768, 2.073, 0.205, 0.294, 0.394, 0.403]]\nD: [[0.562, 2.17, 0.079, 0.501, 0.638, 0.525], [0.42, 1.956, 0.647, 0.731, 0.278, 0.487]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_173_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_173_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.351, 1.709, 0.416, 3.301, 3.462, -0.205]]\nB: [[0.748, 1.385, 0.703, 3.676, 3.587, 0.247]]\nC: [[0.285, 1.079, 0.707, 4.151, 3.525, -0.098]]\nD: [[0.437, 1.63, 0.992, 3.864, 3.856, 0.472]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the table in the scene. 
The camera pose information includes: the rotation matrix: [[-0.119369, -0.433868, 0.893034], [-0.990549, 0.113242, -0.077387], [-0.067553, -0.893832, -0.443285]]; the translation vector: [3.407035, 4.679209, 1.397058], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.351, 1.709, 0.416, 3.301, 3.462, -0.205]]\nB: [[0.748, 1.385, 0.703, 3.676, 3.587, 0.247]]\nC: [[0.285, 1.079, 0.707, 4.151, 3.525, -0.098]]\nD: [[0.437, 1.63, 0.992, 3.864, 3.856, 0.472]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_174_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_174_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.171, -0.049, 0.811, 0.067, 1.74, 1.931], [2.378, 0.912, 0.851, 0.596, 2.524, 1.349], [0.415, -1.4, 0.688, 3.856, 0.243, 1.685], [2.428, -1.301, 0.364, 0.307, 0.039, 1.588], [-1.851, -0.771, 1.131, 0.437, 0.879, 1.981]]\nB: [[-2.124, 0.402, 0.972, 0.336, 1.814, 2.055], [2.714, 0.714, 0.306, -0.022, 2.698, 1.287], [0.218, -0.935, 0.625, 3.775, 0.411, 1.982], [2.74, -0.719, 0.42, -0.08, 0.24, 0.945], [-2.083, -0.772, 1.329, 0.652, 0.37, 2.117]]\nC: [[-2.229, 0.152, 1.164, 0.205, 1.859, 2.109], [2.442, 0.667, 0.678, 0.238, 2.976, 1.311], [0.131, -1.186, 0.807, 4.198, 0.217, 1.596], [2.311, -0.918, 0.648, 0.343, 0.478, 1.211], [-2.036, -0.925, 1.234, 0.571, 0.559, 1.867]]\nD: [[-2.234, 0.572, 0.912, 
0.309, 1.595, 2.084], [2.569, 0.682, 1.117, -0.182, 2.613, 1.651], [-0.058, -0.811, 0.697, 4.093, -0.122, 2.058], [2.792, -1.246, 0.764, 0.753, 0.799, 0.76], [-1.948, -0.836, 1.559, 0.97, 0.14, 2.126]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.924746, 0.145405, -0.351715], [0.379908, 0.407811, -0.830277], [0.022707, -0.901414, -0.432362]]; the translation vector: [3.891577, 4.106122, 1.335216], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.171, -0.049, 0.811, 0.067, 1.74, 1.931], [2.378, 0.912, 0.851, 0.596, 2.524, 1.349], [0.415, -1.4, 0.688, 3.856, 0.243, 1.685], [2.428, -1.301, 0.364, 0.307, 0.039, 1.588], [-1.851, -0.771, 1.131, 0.437, 0.879, 1.981]]\nB: [[-2.124, 0.402, 0.972, 0.336, 1.814, 2.055], [2.714, 0.714, 0.306, -0.022, 2.698, 1.287], [0.218, -0.935, 0.625, 3.775, 0.411, 1.982], [2.74, -0.719, 0.42, -0.08, 0.24, 0.945], [-2.083, -0.772, 1.329, 0.652, 0.37, 2.117]]\nC: [[-2.229, 0.152, 1.164, 0.205, 1.859, 2.109], [2.442, 0.667, 0.678, 0.238, 2.976, 1.311], [0.131, -1.186, 0.807, 4.198, 0.217, 1.596], [2.311, -0.918, 0.648, 0.343, 0.478, 1.211], [-2.036, -0.925, 1.234, 0.571, 0.559, 1.867]]\nD: [[-2.234, 0.572, 0.912, 0.309, 1.595, 2.084], [2.569, 0.682, 1.117, -0.182, 2.613, 1.651], [-0.058, -0.811, 0.697, 4.093, -0.122, 2.058], [2.792, -1.246, 0.764, 0.753, 0.799, 0.76], [-1.948, -0.836, 1.559, 0.97, 0.14, 2.126]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_175_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_175_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.059, -1.275, 0.883, 0.298, 1.018, 2.01], [0.236, 1.462, 0.631, 0.873, 0.486, 1.281], [1.1, -1.001, 0.952, 1.349, 0.172, 2.338], [-0.839, 2.079, 0.96, 0.395, 0.842, 1.983], [1.955, -4.485, 1.036, 0.141, 0.98, 2.593], [-0.255, -0.481, 0.905, 0.161, 0.928, 1.969]]\nB: [[-0.954, -1.046, 0.462, 0.658, 0.618, 2.138], [-0.213, 1.899, 0.694, 0.708, 0.841, 1.686], [1.362, -1.133, 1.001, 1.465, -0.084, 2.501], [-0.862, 2.364, 0.854, 0.124, 0.853, 2.159], [2.413, -4.964, 0.774, -0.345, 1.16, 3.015], [0.156, -0.554, 0.434, -0.07, 0.695, 2.392]]\nC: [[-0.732, -1.166, 0.44, 0.739, 0.991, 1.593], [0.338, 1.95, 0.672, 0.941, 0.589, 1.757], [0.743, -0.963, 1.147, 1.448, -0.135, 2.517], [-1.129, 2.483, 
1.375, 0.132, 1.054, 2.43], [1.587, -4.551, 0.847, 0.35, 0.965, 2.767], [-0.661, -0.507, 0.612, -0.243, 0.847, 1.515]]\nD: [[-0.907, -1.27, 0.42, -0.059, 1.138, 1.561], [0.626, 1.256, 1.105, 1.202, 0.216, 1.006], [0.877, -0.877, 1.149, 0.987, -0.045, 2.737], [-0.783, 1.691, 0.606, 0.081, 0.643, 2.205], [2.363, -4.049, 1.139, 0.229, 0.955, 2.439], [-0.196, -0.854, 0.721, 0.566, 0.583, 2.254]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. The camera pose information includes: the rotation matrix: [[-0.996429, -0.081152, -0.023325], [-0.01119, 0.400709, -0.916137], [0.083693, -0.912604, -0.400187]]; the translation vector: [7.365378, 2.610504, 1.343957], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.059, -1.275, 0.883, 0.298, 1.018, 2.01], [0.236, 1.462, 0.631, 0.873, 0.486, 1.281], [1.1, -1.001, 0.952, 1.349, 0.172, 2.338], [-0.839, 2.079, 0.96, 0.395, 0.842, 1.983], [1.955, -4.485, 1.036, 0.141, 0.98, 2.593], [-0.255, -0.481, 0.905, 0.161, 0.928, 1.969]]\nB: [[-0.954, -1.046, 0.462, 0.658, 0.618, 2.138], [-0.213, 1.899, 0.694, 0.708, 0.841, 1.686], [1.362, -1.133, 1.001, 1.465, -0.084, 2.501], [-0.862, 2.364, 0.854, 0.124, 0.853, 2.159], [2.413, -4.964, 0.774, -0.345, 1.16, 3.015], [0.156, -0.554, 0.434, -0.07, 0.695, 2.392]]\nC: [[-0.732, -1.166, 0.44, 0.739, 0.991, 1.593], [0.338, 1.95, 0.672, 0.941, 0.589, 1.757], [0.743, -0.963, 1.147, 1.448, -0.135, 2.517], [-1.129, 2.483, 1.375, 0.132, 1.054, 2.43], [1.587, -4.551, 0.847, 0.35, 0.965, 2.767], [-0.661, -0.507, 0.612, -0.243, 0.847, 1.515]]\nD: [[-0.907, -1.27, 0.42, -0.059, 1.138, 1.561], [0.626, 1.256, 1.105, 1.202, 0.216, 1.006], [0.877, -0.877, 1.149, 0.987, -0.045, 2.737], [-0.783, 1.691, 0.606, 0.081, 0.643, 2.205], [2.363, -4.049, 1.139, 0.229, 0.955, 2.439], [-0.196, -0.854, 0.721, 0.566, 0.583, 2.254]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_176_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_176_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.662, -1.551, 0.955, 1.174, 1.083, 1.423]]\nB: [[0.488, -1.177, 0.89, 1.089, 0.729, 1.751]]\nC: [[0.483, -0.736, 0.965, 0.958, 0.277, 1.886]]\nD: [[0.649, -1.283, 0.609, 1.143, 1.139, 2.193]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the desk in the scene. 
The camera pose information includes: the rotation matrix: [[0.51864, -0.44867, 0.727811], [-0.853934, -0.229463, 0.467059], [-0.04255, -0.863738, -0.502143]]; the translation vector: [1.002297, 1.98866, 1.344191], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.662, -1.551, 0.955, 1.174, 1.083, 1.423]]\nB: [[0.488, -1.177, 0.89, 1.089, 0.729, 1.751]]\nC: [[0.483, -0.736, 0.965, 0.958, 0.277, 1.886]]\nD: [[0.649, -1.283, 0.609, 1.143, 1.139, 2.193]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_177_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_177_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.285, -1.171, 0.463, 0.532, 0.611, 0.923], [1.223, 1.763, 0.56, 0.667, 0.631, 0.966], [-0.307, 1.833, 0.5, 0.619, 0.589, 0.922], [-0.406, -0.951, 0.39, 0.539, 0.611, 0.908], [0.655, 1.709, 0.525, 0.699, 0.634, 0.947], [1.269, -2.956, 0.836, 0.629, 0.557, 0.393], [1.192, 0.557, 0.457, 0.623, 0.615, 0.94], [0.416, -2.765, 0.677, 0.614, 0.452, 0.612], [-0.522, 0.635, 0.418, 0.585, 0.574, 0.905], [-0.363, -3.094, 0.495, 0.715, 0.612, 0.912], [0.336, 0.66, 0.438, 0.602, 0.623, 0.904], [-2.007, -0.347, 0.408, 0.521, 0.585, 0.891], [0.411, -1.036, 0.417, 0.682, 0.634, 0.922], [-2.039, -2.805, 0.495, 0.561, 0.631, 0.9], [-1.956, -1.834, 0.436, 0.597, 0.728, 0.922], [-2.754, 1.479, 0.509, 0.58, 0.603, 
0.892]]\nB: [[0.94, -0.734, 0.86, 0.847, 0.174, 1.366], [1.047, 1.724, 0.337, 1.114, 0.725, 1.18], [-0.446, 1.839, 0.399, 1.0, 0.211, 0.928], [-0.224, -0.996, 0.671, 0.902, 0.396, 0.957], [0.648, 2.199, 0.865, 0.644, 0.899, 0.978], [1.601, -3.15, 1.071, 0.541, 0.264, 0.224], [0.709, 0.399, 0.396, 0.628, 0.643, 1.257], [0.103, -2.816, 0.184, 1.095, 0.871, 0.909], [-0.735, 1.113, 0.158, 0.968, 0.355, 1.244], [-0.793, -3.536, 0.957, 0.881, 0.306, 1.233], [-0.114, 0.863, 0.498, 0.236, 0.716, 1.116], [-1.845, -0.397, 0.53, 0.528, 0.958, 0.727], [0.156, -0.653, 0.083, 0.658, 1.129, 0.686], [-2.166, -2.74, 0.163, 0.166, 0.842, 0.447], [-2.421, -1.954, 0.206, 0.882, 0.734, 0.761], [-3.119, 1.809, 0.685, 0.543, 0.98, 1.284]]\nC: [[0.87, -1.386, 0.953, 0.148, 0.539, 1.241], [0.822, 1.276, 0.128, 0.239, 0.572, 1.227], [-0.508, 2.214, 0.373, 0.683, 0.2, 1.183], [-0.547, -1.349, -0.07, 0.231, 0.312, 1.389], [0.457, 1.367, 0.965, 0.768, 0.185, 1.088], [1.563, -2.649, 0.498, 0.756, 0.364, 0.362], [1.083, 0.345, 0.921, 0.769, 0.695, 1.386], [0.143, -3.095, 0.202, 0.278, 0.051, 0.502], [-0.474, 0.978, 0.872, 0.559, 0.082, 1.262], [-0.01, -3.401, 0.115, 1.005, 0.452, 1.143], [-0.106, 1.086, 0.284, 0.105, 0.131, 0.844], [-2.44, -0.304, -0.054, 0.667, 0.457, 0.703], [0.747, -1.031, -0.051, 0.551, 0.84, 0.909], [-2.101, -2.554, 0.473, 1.017, 0.994, 1.065], [-1.883, -2.033, 0.423, 0.644, 1.201, 0.726], [-3.109, 1.24, 0.812, 0.728, 1.099, 0.829]]\nD: [[1.585, -0.899, 0.099, 0.724, 0.912, 0.466], [0.799, 2.074, 0.967, 0.764, 0.821, 0.506], [-0.531, 1.393, 0.134, 0.737, 1.022, 1.024], [-0.008, -0.984, -0.095, 0.085, 0.528, 0.524], [0.362, 2.074, 0.189, 0.835, 0.387, 0.74], [1.446, -2.709, 0.927, 0.329, 0.916, 0.373], [1.078, 0.299, 0.482, 0.303, 0.612, 0.521], [0.439, -2.308, 0.3, 0.788, 0.517, 0.416], [-0.314, 0.386, 0.749, 0.588, 0.522, 1.244], [-0.739, -2.845, 0.766, 0.695, 1.017, 0.779], [-0.116, 0.704, 0.487, 0.148, 0.185, 0.776], [-1.607, 0.118, 0.862, 0.934, 0.609, 0.752], [-0.074, 
-0.593, 0.062, 0.851, 0.522, 0.762], [-2.188, -3.11, 0.134, 0.427, 0.414, 0.637], [-1.911, -1.906, 0.292, 0.873, 0.728, 0.955], [-2.793, 1.335, 0.084, 0.946, 0.494, 0.463]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[0.931668, 0.072515, -0.356001], [0.362912, -0.231685, 0.902561], [-0.017031, -0.970084, -0.24217]]; the translation vector: [5.886859, 3.543659, 1.354971], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.285, -1.171, 0.463, 0.532, 0.611, 0.923], [1.223, 1.763, 0.56, 0.667, 0.631, 0.966], [-0.307, 1.833, 0.5, 0.619, 0.589, 0.922], [-0.406, -0.951, 0.39, 0.539, 0.611, 0.908], [0.655, 1.709, 0.525, 0.699, 0.634, 0.947], [1.269, -2.956, 0.836, 0.629, 0.557, 0.393], [1.192, 0.557, 0.457, 0.623, 0.615, 0.94], [0.416, -2.765, 0.677, 0.614, 0.452, 0.612], [-0.522, 0.635, 0.418, 0.585, 0.574, 0.905], [-0.363, -3.094, 0.495, 0.715, 0.612, 0.912], [0.336, 0.66, 0.438, 0.602, 0.623, 0.904], [-2.007, -0.347, 0.408, 0.521, 0.585, 0.891], [0.411, -1.036, 0.417, 0.682, 0.634, 0.922], [-2.039, -2.805, 0.495, 0.561, 0.631, 0.9], [-1.956, -1.834, 0.436, 0.597, 0.728, 0.922], [-2.754, 1.479, 0.509, 0.58, 0.603, 0.892]]\nB: [[0.94, -0.734, 0.86, 0.847, 0.174, 1.366], [1.047, 1.724, 0.337, 1.114, 0.725, 1.18], [-0.446, 1.839, 0.399, 1.0, 0.211, 0.928], [-0.224, -0.996, 0.671, 0.902, 0.396, 0.957], [0.648, 2.199, 0.865, 0.644, 0.899, 0.978], [1.601, -3.15, 1.071, 0.541, 
0.264, 0.224], [0.709, 0.399, 0.396, 0.628, 0.643, 1.257], [0.103, -2.816, 0.184, 1.095, 0.871, 0.909], [-0.735, 1.113, 0.158, 0.968, 0.355, 1.244], [-0.793, -3.536, 0.957, 0.881, 0.306, 1.233], [-0.114, 0.863, 0.498, 0.236, 0.716, 1.116], [-1.845, -0.397, 0.53, 0.528, 0.958, 0.727], [0.156, -0.653, 0.083, 0.658, 1.129, 0.686], [-2.166, -2.74, 0.163, 0.166, 0.842, 0.447], [-2.421, -1.954, 0.206, 0.882, 0.734, 0.761], [-3.119, 1.809, 0.685, 0.543, 0.98, 1.284]]\nC: [[0.87, -1.386, 0.953, 0.148, 0.539, 1.241], [0.822, 1.276, 0.128, 0.239, 0.572, 1.227], [-0.508, 2.214, 0.373, 0.683, 0.2, 1.183], [-0.547, -1.349, -0.07, 0.231, 0.312, 1.389], [0.457, 1.367, 0.965, 0.768, 0.185, 1.088], [1.563, -2.649, 0.498, 0.756, 0.364, 0.362], [1.083, 0.345, 0.921, 0.769, 0.695, 1.386], [0.143, -3.095, 0.202, 0.278, 0.051, 0.502], [-0.474, 0.978, 0.872, 0.559, 0.082, 1.262], [-0.01, -3.401, 0.115, 1.005, 0.452, 1.143], [-0.106, 1.086, 0.284, 0.105, 0.131, 0.844], [-2.44, -0.304, -0.054, 0.667, 0.457, 0.703], [0.747, -1.031, -0.051, 0.551, 0.84, 0.909], [-2.101, -2.554, 0.473, 1.017, 0.994, 1.065], [-1.883, -2.033, 0.423, 0.644, 1.201, 0.726], [-3.109, 1.24, 0.812, 0.728, 1.099, 0.829]]\nD: [[1.585, -0.899, 0.099, 0.724, 0.912, 0.466], [0.799, 2.074, 0.967, 0.764, 0.821, 0.506], [-0.531, 1.393, 0.134, 0.737, 1.022, 1.024], [-0.008, -0.984, -0.095, 0.085, 0.528, 0.524], [0.362, 2.074, 0.189, 0.835, 0.387, 0.74], [1.446, -2.709, 0.927, 0.329, 0.916, 0.373], [1.078, 0.299, 0.482, 0.303, 0.612, 0.521], [0.439, -2.308, 0.3, 0.788, 0.517, 0.416], [-0.314, 0.386, 0.749, 0.588, 0.522, 1.244], [-0.739, -2.845, 0.766, 0.695, 1.017, 0.779], [-0.116, 0.704, 0.487, 0.148, 0.185, 0.776], [-1.607, 0.118, 0.862, 0.934, 0.609, 0.752], [-0.074, -0.593, 0.062, 0.851, 0.522, 0.762], [-2.188, -3.11, 0.134, 0.427, 0.414, 0.637], [-1.911, -1.906, 0.292, 0.873, 0.728, 0.955], [-2.793, 1.335, 0.084, 0.946, 0.494, 0.463]]", "input_image_path": 
["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_178_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_178_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.185, -1.981, 0.71, 1.746, 1.931, 1.092]]\nB: [[0.263, -1.622, 0.402, 1.389, 1.64, 0.804]]\nC: [[0.151, -1.735, 0.574, 1.767, 1.715, 0.631]]\nD: [[0.262, -2.035, 0.645, 1.006, 2.054, 1.049]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the table in the scene. The camera pose information includes: the rotation matrix: [[0.987126, 0.106622, -0.119219], [0.159938, -0.652529, 0.740693], [0.00118, -0.750225, -0.661181]]; the translation vector: [4.64166, 4.052867, 1.404314], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.185, -1.981, 0.71, 1.746, 1.931, 1.092]]\nB: [[0.263, -1.622, 0.402, 1.389, 1.64, 0.804]]\nC: [[0.151, -1.735, 0.574, 1.767, 1.715, 0.631]]\nD: [[0.262, -2.035, 0.645, 1.006, 2.054, 1.049]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_179_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_179_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.648, -0.593, 0.364, 0.758, 0.748, 0.835], [-1.189, -0.998, 0.388, 0.711, 0.664, 0.751], [-0.106, -0.14, 0.366, 0.681, 0.668, 0.806], [-0.467, -1.537, 0.381, 0.682, 0.66, 0.781]]\nB: [[0.715, -1.041, 0.651, 0.471, 0.834, 0.809], [-1.555, -0.972, 0.209, 0.39, 0.675, 1.08], [0.331, 0.337, 0.451, 0.906, 1.083, 1.138], [-0.367, -1.309, -0.086, 0.84, 1.029, 0.958]]\nC: [[0.76, -0.428, 0.328, 0.718, 0.602, 0.917], [-1.301, -1.169, 0.677, 0.824, 0.61, 0.712], [0.1, -0.045, 0.084, 0.878, 0.367, 0.431], [-0.14, -1.88, 0.43, 0.418, 0.474, 0.77]]\nD: [[0.587, -1.036, 0.299, 1.076, 1.171, 0.475], [-1.011, -1.458, 0.499, 0.276, 1.067, 0.759], [-0.487, -0.498, 0.17, 1.114, 0.58, 1.041], [-0.116, -1.819, 0.569, 0.961, 0.364, 1.204]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the armchair in the scene. The camera pose information includes: the rotation matrix: [[0.68967, 0.288211, -0.664297], [0.724122, -0.27239, 0.633602], [0.001663, -0.918008, -0.396559]]; the translation vector: [2.530043, 2.005069, 1.437417], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. 
Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.648, -0.593, 0.364, 0.758, 0.748, 0.835], [-1.189, -0.998, 0.388, 0.711, 0.664, 0.751], [-0.106, -0.14, 0.366, 0.681, 0.668, 0.806], [-0.467, -1.537, 0.381, 0.682, 0.66, 0.781]]\nB: [[0.715, -1.041, 0.651, 0.471, 0.834, 0.809], [-1.555, -0.972, 0.209, 0.39, 0.675, 1.08], [0.331, 0.337, 0.451, 0.906, 1.083, 1.138], [-0.367, -1.309, -0.086, 0.84, 1.029, 0.958]]\nC: [[0.76, -0.428, 0.328, 0.718, 0.602, 0.917], [-1.301, -1.169, 0.677, 0.824, 0.61, 0.712], [0.1, -0.045, 0.084, 0.878, 0.367, 0.431], [-0.14, -1.88, 0.43, 0.418, 0.474, 0.77]]\nD: [[0.587, -1.036, 0.299, 1.076, 1.171, 0.475], [-1.011, -1.458, 0.499, 0.276, 1.067, 0.759], [-0.487, -0.498, 0.17, 1.114, 0.58, 1.041], [-0.116, -1.819, 0.569, 0.961, 0.364, 1.204]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_180_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_180_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.678, -1.667, 1.218, 1.055, -0.26, 2.418], [-1.464, 1.094, 0.83, 1.262, -0.304, 1.673], [0.614, 2.399, 0.708, 1.662, -0.241, 1.035], [0.965, -0.058, 1.477, 0.773, 4.136, 2.578], [-1.154, 1.558, 1.248, -0.057, 1.091, 2.617], [-1.717, -0.255, 1.603, -0.126, 2.491, 0.5]]\nB: [[-1.352, -1.046, 1.599, 1.44, 0.136, 2.792], [-1.654, 0.816, 0.791, 1.207, 0.132, 1.881], [0.521, 2.259, 1.273, 1.597, -0.332, 0.736], [0.806, 0.159, 1.62, 0.393, 4.568, 2.678], [-1.081, 1.507, 1.378, -0.253, 1.151, 2.742], [-1.495, -0.107, 1.278, 0.325, 1.906, 1.086]]\nC: [[-1.166, -1.418, 1.14, 1.061, 0.184, 
2.392], [-1.606, 0.642, 1.143, 0.781, 0.159, 2.172], [0.167, 2.007, 1.118, 1.797, 0.138, 0.569], [0.908, -0.132, 1.206, 0.513, 4.305, 2.258], [-1.326, 1.1, 1.19, 0.242, 0.968, 2.282], [-1.838, -0.447, 1.348, 0.372, 2.121, 0.908]]\nD: [[-0.864, -1.506, 1.526, 0.637, 0.64, 2.228], [-1.745, 0.647, 0.899, 0.933, -0.243, 2.211], [-0.21, 1.507, 1.578, 2.065, 0.587, 0.484], [1.157, 0.294, 0.85, 0.968, 4.125, 2.603], [-1.113, 0.941, 1.165, 0.239, 0.756, 2.423], [-2.32, -0.87, 1.844, 0.517, 2.303, 0.518]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.964843, 0.186346, -0.185345], [0.252505, 0.461537, -0.850426], [-0.07293, -0.867329, -0.492364]]; the translation vector: [3.779865, 2.337391, 1.461827], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.678, -1.667, 1.218, 1.055, -0.26, 2.418], [-1.464, 1.094, 0.83, 1.262, -0.304, 1.673], [0.614, 2.399, 0.708, 1.662, -0.241, 1.035], [0.965, -0.058, 1.477, 0.773, 4.136, 2.578], [-1.154, 1.558, 1.248, -0.057, 1.091, 2.617], [-1.717, -0.255, 1.603, -0.126, 2.491, 0.5]]\nB: [[-1.352, -1.046, 1.599, 1.44, 0.136, 2.792], [-1.654, 0.816, 0.791, 1.207, 0.132, 1.881], [0.521, 2.259, 1.273, 1.597, -0.332, 0.736], [0.806, 0.159, 1.62, 0.393, 4.568, 2.678], [-1.081, 1.507, 1.378, -0.253, 1.151, 2.742], [-1.495, -0.107, 1.278, 0.325, 1.906, 1.086]]\nC: [[-1.166, -1.418, 1.14, 1.061, 0.184, 2.392], [-1.606, 0.642, 1.143, 0.781, 0.159, 2.172], [0.167, 2.007, 1.118, 1.797, 0.138, 0.569], [0.908, -0.132, 1.206, 0.513, 4.305, 2.258], [-1.326, 1.1, 1.19, 0.242, 0.968, 2.282], [-1.838, -0.447, 1.348, 0.372, 2.121, 0.908]]\nD: [[-0.864, -1.506, 1.526, 0.637, 0.64, 2.228], [-1.745, 0.647, 0.899, 0.933, -0.243, 2.211], [-0.21, 1.507, 1.578, 2.065, 0.587, 0.484], [1.157, 0.294, 0.85, 0.968, 4.125, 2.603], [-1.113, 0.941, 1.165, 0.239, 0.756, 2.423], [-2.32, -0.87, 1.844, 0.517, 2.303, 0.518]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_181_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_181_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.143, 1.32, 0.902, 0.946, 0.582, 0.814], [1.314, 2.961, 1.36, 1.125, 0.499, 2.359]]\nB: [[-1.164, 1.549, 1.101, 1.224, -0.166, 1.131], [1.215, 3.687, 1.039, 0.93, -0.333, 2.264]]\nC: [[-1.48, 1.652, 0.846, 0.755, 0.321, 1.167], [1.066, 3.327, 1.091, 1.04, 0.081, 1.998]]\nD: [[-1.81, 1.616, 0.892, 0.552, 0.779, 1.45], [1.382, 3.534, 1.284, 1.152, 0.521, 2.312]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the door in the scene. 
The camera pose information includes: the rotation matrix: [[-0.08083, -0.463089, 0.882618], [-0.994842, 0.091929, -0.042874], [-0.061284, -0.881531, -0.468131]]; the translation vector: [4.543997, 3.147744, 1.235262], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.143, 1.32, 0.902, 0.946, 0.582, 0.814], [1.314, 2.961, 1.36, 1.125, 0.499, 2.359]]\nB: [[-1.164, 1.549, 1.101, 1.224, -0.166, 1.131], [1.215, 3.687, 1.039, 0.93, -0.333, 2.264]]\nC: [[-1.48, 1.652, 0.846, 0.755, 0.321, 1.167], [1.066, 3.327, 1.091, 1.04, 0.081, 1.998]]\nD: [[-1.81, 1.616, 0.892, 0.552, 0.779, 1.45], [1.382, 3.534, 1.284, 1.152, 0.521, 2.312]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_182_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_182_1.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.277, 2.29, 0.603, 1.696, 0.522, 1.643], [-1.375, -0.321, 0.794, 0.343, 5.813, 1.671], [1.159, 0.472, 1.342, 0.486, 3.254, 0.972], [0.747, -2.391, 1.046, 0.652, 1.455, 1.87], [1.085, -1.961, 0.723, 1.174, -0.018, 1.576], [0.586, 1.92, 1.575, 0.646, -0.32, 2.439], [0.415, 2.748, 1.312, -0.384, 0.485, 2.051]]\nB: [[-0.351, 2.608, 0.955, 1.541, 0.097, 1.824], [-1.089, -0.096, 0.669, 0.099, 5.464, 1.395], [1.127, 0.133, 1.411, 0.212, 3.643, 0.932], [0.392, -2.442, 0.754, 0.163, 1.609, 1.541], [0.746, -1.664, 0.806, 0.833, 0.085, 
1.637], [0.806, 1.971, 1.104, 0.829, 0.129, 2.13], [0.393, 2.271, 1.106, 0.064, 0.665, 2.126]]\nC: [[-0.013, 2.804, 0.64, 1.604, -0.241, 1.907], [-0.99, 0.257, 0.762, -0.167, 5.28, 1.868], [1.095, -0.318, 1.309, 0.698, 4.021, 0.652], [0.346, -2.805, 0.465, -0.167, 2.086, 1.213], [0.527, -1.307, 1.185, 0.733, -0.294, 1.468], [1.191, 1.911, 1.165, 0.69, 0.519, 1.853], [0.342, 2.498, 1.557, -0.047, 0.494, 2.435]]\nD: [[-0.612, 2.451, 1.013, 1.076, 0.146, 2.285], [-0.642, -0.043, 0.498, -0.353, 5.408, 1.585], [1.099, -0.043, 0.972, -0.204, 4.141, 1.05], [0.087, -2.832, 0.317, 0.167, 1.848, 1.113], [0.399, -1.498, 0.656, 1.117, 0.566, 1.989], [0.495, 2.449, 0.82, 0.411, 0.228, 2.522], [0.507, 2.378, 1.381, -0.184, 0.771, 1.769]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.286652, 0.220257, -0.932372], [0.958024, -0.061246, 0.28007], [0.004584, -0.973517, -0.228568]]; the translation vector: [3.76659, 1.676076, 1.452194], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.277, 2.29, 0.603, 1.696, 0.522, 1.643], [-1.375, -0.321, 0.794, 0.343, 5.813, 1.671], [1.159, 0.472, 1.342, 0.486, 3.254, 0.972], [0.747, -2.391, 1.046, 0.652, 1.455, 1.87], [1.085, -1.961, 0.723, 1.174, -0.018, 1.576], [0.586, 1.92, 1.575, 0.646, -0.32, 2.439], [0.415, 2.748, 1.312, -0.384, 0.485, 2.051]]\nB: [[-0.351, 2.608, 0.955, 1.541, 0.097, 1.824], [-1.089, -0.096, 0.669, 0.099, 5.464, 1.395], [1.127, 0.133, 1.411, 0.212, 3.643, 0.932], [0.392, -2.442, 0.754, 0.163, 1.609, 1.541], [0.746, -1.664, 0.806, 0.833, 0.085, 1.637], [0.806, 1.971, 1.104, 0.829, 0.129, 2.13], [0.393, 2.271, 1.106, 0.064, 0.665, 2.126]]\nC: [[-0.013, 2.804, 0.64, 1.604, -0.241, 1.907], [-0.99, 0.257, 0.762, -0.167, 5.28, 1.868], [1.095, -0.318, 1.309, 0.698, 4.021, 0.652], [0.346, -2.805, 0.465, -0.167, 2.086, 1.213], [0.527, -1.307, 1.185, 0.733, -0.294, 1.468], [1.191, 1.911, 1.165, 0.69, 0.519, 1.853], [0.342, 2.498, 1.557, -0.047, 0.494, 2.435]]\nD: [[-0.612, 2.451, 1.013, 1.076, 0.146, 2.285], [-0.642, -0.043, 0.498, -0.353, 5.408, 1.585], [1.099, -0.043, 0.972, -0.204, 4.141, 1.05], [0.087, -2.832, 0.317, 0.167, 1.848, 1.113], [0.399, -1.498, 0.656, 1.117, 0.566, 1.989], [0.495, 2.449, 0.82, 0.411, 0.228, 2.522], [0.507, 2.378, 1.381, -0.184, 0.771, 1.769]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_183_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_183_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.457, -0.683, 0.495, 0.679, 0.597, 0.903], [0.426, 0.843, 0.458, 0.566, 0.562, 0.949], [-0.336, 0.792, 0.461, 0.555, 0.539, 0.932], [0.926, -0.823, 0.62, 0.473, 0.568, 0.621], [-1.992, 0.348, 0.596, 0.635, 0.634, 0.647], [1.1, 0.858, 0.465, 0.529, 0.569, 0.952], [-0.253, -1.834, 0.689, 0.639, 0.561, 0.602], [-0.397, 2.119, 0.473, 0.759, 
0.651, 0.94], [-1.254, -0.965, 0.657, 0.558, 0.592, 0.636], [-1.564, -0.168, 0.649, 0.462, 0.615, 0.611], [-0.169, -2.406, 0.711, 0.757, 0.669, 0.597], [1.375, -1.924, 0.508, 0.658, 0.509, 0.96], [0.214, -0.572, 0.651, 0.695, 0.494, 0.584], [-2.356, 2.053, 0.675, 0.673, 0.595, 0.536], [-0.799, -2.241, 0.541, 0.662, 0.673, 0.96], [1.941, -1.89, 0.718, 0.565, 0.56, 0.52], [2.571, -0.575, 0.472, 0.572, 0.595, 0.956], [-0.865, 2.028, 0.487, 0.583, 0.461, 0.128], [0.361, -2.459, 0.78, 0.536, 0.269, 0.489], [1.938, -1.474, 0.649, 0.57, 0.554, 0.6], [0.743, -2.15, 0.817, 0.526, 0.164, 0.344], [2.542, -1.129, 0.644, 0.598, 0.586, 0.612], [1.955, 0.145, 0.785, 0.141, 0.509, 0.304], [-1.685, 2.109, 0.449, 0.566, 0.486, 0.153], [-2.437, -1.918, 0.584, 0.515, 0.45, 0.203]]\nB: [[-0.004, -0.217, 0.058, 0.956, 0.104, 0.992], [0.634, 0.895, 0.282, 0.697, 0.893, 1.438], [-0.387, 0.349, 0.019, 0.239, 0.901, 0.96], [1.064, -1.168, 0.964, 0.374, 1.0, 0.253], [-1.606, 0.8, 0.112, 0.967, 0.862, 0.256], [1.288, 0.417, 0.225, 0.427, 0.112, 1.268], [-0.365, -1.988, 0.842, 0.932, 0.117, 0.21], [0.056, 2.617, 0.694, 0.602, 0.776, 0.848], [-1.173, -1.184, 0.2, 0.567, 0.839, 0.497], [-1.108, -0.485, 0.838, 0.382, 0.723, 1.057], [0.031, -2.18, 0.477, 1.078, 0.774, 0.574], [1.32, -1.614, 0.5, 0.512, 0.791, 1.227], [0.57, -1.002, 0.878, 0.861, 0.739, 0.347], [-2.072, 2.433, 1.143, 0.504, 1.054, 0.551], [-0.569, -2.64, 0.278, 0.616, 1.122, 1.078], [2.089, -1.691, 0.769, 0.97, 0.148, 0.992], [2.274, -0.687, 0.634, 0.56, 0.654, 0.811], [-1.223, 2.009, 0.495, 1.006, -0.028, 0.186], [0.145, -2.547, 0.54, 0.793, 0.387, 0.825], [1.744, -1.228, 0.533, 0.139, 0.886, 1.027], [1.123, -2.589, 1.183, 0.079, 0.187, 0.548], [2.235, -0.842, 0.485, 0.73, 0.575, 0.903], [1.694, -0.054, 1.249, 0.468, 0.557, 0.748], [-1.216, 2.107, 0.803, 0.764, 0.267, 0.274], [-2.623, -2.077, 1.01, 0.838, 0.106, -0.148]]\nC: [[-0.042, -0.277, 0.622, 0.349, 0.954, 1.11], [0.421, 0.801, 0.437, 0.094, 0.078, 1.2], [-0.436, 0.855, 
0.625, 0.341, 0.737, 1.353], [0.599, -0.582, 0.28, 0.836, 0.717, 0.357], [-2.398, -0.135, 0.951, 0.429, 1.038, 0.502], [1.554, 0.709, 0.624, 0.144, 0.967, 1.304], [-0.666, -1.374, 0.422, 0.517, 0.122, 0.8], [-0.203, 1.908, 0.093, 1.027, 0.556, 0.76], [-1.532, -0.535, 0.437, 0.57, 0.41, 0.413], [-1.535, -0.307, 0.814, 0.936, 0.544, 1.082], [-0.39, -2.044, 0.309, 0.76, 0.801, 0.62], [1.044, -2.393, 0.932, 1.048, 0.287, 1.261], [0.664, -0.294, 1.14, 0.882, 0.176, 0.207], [-2.135, 2.211, 0.272, 0.963, 0.668, 0.76], [-1.028, -2.103, 1.016, 0.918, 0.609, 1.31], [1.579, -2.37, 0.458, 0.202, 0.159, 0.166], [2.079, -0.505, 0.945, 0.57, 0.86, 0.725], [-0.396, 2.379, 0.489, 0.77, 0.063, 0.52], [0.423, -2.492, 0.598, 0.788, 0.241, 0.406], [2.219, -1.548, 0.415, 0.429, 0.702, 0.329], [1.236, -1.961, 0.849, 0.371, 0.256, -0.039], [2.93, -1.099, 1.108, 0.393, 0.388, 0.187], [1.738, -0.099, 0.354, 0.013, 0.06, 0.667], [-1.711, 2.599, 0.36, 0.548, 0.69, -0.323], [-1.988, -1.796, 0.232, 0.609, 0.912, -0.043]]\nD: [[-0.117, -0.712, 0.165, 0.707, 0.749, 0.416], [0.663, 1.109, 0.92, 0.786, 0.382, 0.761], [-0.485, 1.276, -0.006, 0.122, 0.579, 0.562], [0.651, -1.033, 0.48, 0.012, 0.291, 0.281], [-1.67, 0.137, 0.785, 1.091, 0.142, 0.851], [1.44, 0.455, 0.476, 0.133, 0.572, 0.925], [-0.342, -1.74, 0.35, 0.646, 0.394, 0.443], [-0.793, 2.134, 0.146, 1.105, 0.456, 0.742], [-1.574, -0.65, 0.985, 0.2, 0.168, 1.102], [-1.526, 0.104, 0.427, 0.23, 0.555, 0.818], [-0.21, -2.447, 0.593, 1.166, 1.051, 0.465], [1.11, -2.085, 0.532, 0.952, 0.334, 0.936], [-0.231, -0.532, 0.895, 0.826, 0.523, 0.78], [-2.843, 1.728, 0.764, 0.92, 0.672, 0.101], [-0.845, -1.905, 0.458, 0.184, 0.635, 1.348], [1.844, -1.433, 1.033, 0.147, 0.968, 0.118], [2.166, -0.542, 0.733, 0.117, 0.957, 0.814], [-0.92, 1.743, 0.237, 0.993, 0.477, 0.227], [0.31, -2.458, 0.659, 0.782, 0.696, 0.669], [1.626, -1.353, 0.953, 0.579, 0.517, 0.513], [0.856, -2.414, 0.37, 0.927, 0.46, -0.04], [2.564, -1.512, 0.576, 0.167, 0.528, 0.323], [2.064, 
-0.007, 0.399, -0.022, 0.444, 0.68], [-1.281, 1.93, 0.706, 0.411, 0.266, -0.223], [-2.823, -2.037, 0.823, 0.649, 0.539, 0.078]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the chair in the scene. The camera pose information includes: the rotation matrix: [[-0.895509, 0.17248, -0.410263], [0.444823, 0.375965, -0.812886], [0.014038, -0.91044, -0.413402]]; the translation vector: [2.818061, 5.409916, 1.54775], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.457, -0.683, 0.495, 0.679, 0.597, 0.903], [0.426, 0.843, 0.458, 0.566, 0.562, 0.949], [-0.336, 0.792, 0.461, 0.555, 0.539, 0.932], [0.926, -0.823, 0.62, 0.473, 0.568, 0.621], [-1.992, 0.348, 0.596, 0.635, 0.634, 0.647], [1.1, 0.858, 0.465, 0.529, 0.569, 0.952], [-0.253, -1.834, 0.689, 0.639, 0.561, 0.602], [-0.397, 2.119, 0.473, 0.759, 0.651, 0.94], [-1.254, -0.965, 0.657, 0.558, 0.592, 0.636], [-1.564, -0.168, 0.649, 0.462, 0.615, 0.611], [-0.169, -2.406, 0.711, 0.757, 0.669, 0.597], [1.375, -1.924, 0.508, 0.658, 0.509, 0.96], [0.214, -0.572, 0.651, 0.695, 0.494, 0.584], [-2.356, 2.053, 0.675, 0.673, 0.595, 0.536], [-0.799, -2.241, 0.541, 0.662, 0.673, 0.96], [1.941, -1.89, 0.718, 0.565, 0.56, 0.52], [2.571, -0.575, 0.472, 0.572, 0.595, 0.956], [-0.865, 2.028, 0.487, 0.583, 0.461, 0.128], [0.361, -2.459, 0.78, 0.536, 0.269, 0.489], [1.938, -1.474, 0.649, 0.57, 0.554, 0.6], [0.743, -2.15, 0.817, 0.526, 0.164, 0.344], [2.542, -1.129, 0.644, 0.598, 0.586, 0.612], [1.955, 0.145, 0.785, 0.141, 
0.509, 0.304], [-1.685, 2.109, 0.449, 0.566, 0.486, 0.153], [-2.437, -1.918, 0.584, 0.515, 0.45, 0.203]]\nB: [[-0.004, -0.217, 0.058, 0.956, 0.104, 0.992], [0.634, 0.895, 0.282, 0.697, 0.893, 1.438], [-0.387, 0.349, 0.019, 0.239, 0.901, 0.96], [1.064, -1.168, 0.964, 0.374, 1.0, 0.253], [-1.606, 0.8, 0.112, 0.967, 0.862, 0.256], [1.288, 0.417, 0.225, 0.427, 0.112, 1.268], [-0.365, -1.988, 0.842, 0.932, 0.117, 0.21], [0.056, 2.617, 0.694, 0.602, 0.776, 0.848], [-1.173, -1.184, 0.2, 0.567, 0.839, 0.497], [-1.108, -0.485, 0.838, 0.382, 0.723, 1.057], [0.031, -2.18, 0.477, 1.078, 0.774, 0.574], [1.32, -1.614, 0.5, 0.512, 0.791, 1.227], [0.57, -1.002, 0.878, 0.861, 0.739, 0.347], [-2.072, 2.433, 1.143, 0.504, 1.054, 0.551], [-0.569, -2.64, 0.278, 0.616, 1.122, 1.078], [2.089, -1.691, 0.769, 0.97, 0.148, 0.992], [2.274, -0.687, 0.634, 0.56, 0.654, 0.811], [-1.223, 2.009, 0.495, 1.006, -0.028, 0.186], [0.145, -2.547, 0.54, 0.793, 0.387, 0.825], [1.744, -1.228, 0.533, 0.139, 0.886, 1.027], [1.123, -2.589, 1.183, 0.079, 0.187, 0.548], [2.235, -0.842, 0.485, 0.73, 0.575, 0.903], [1.694, -0.054, 1.249, 0.468, 0.557, 0.748], [-1.216, 2.107, 0.803, 0.764, 0.267, 0.274], [-2.623, -2.077, 1.01, 0.838, 0.106, -0.148]]\nC: [[-0.042, -0.277, 0.622, 0.349, 0.954, 1.11], [0.421, 0.801, 0.437, 0.094, 0.078, 1.2], [-0.436, 0.855, 0.625, 0.341, 0.737, 1.353], [0.599, -0.582, 0.28, 0.836, 0.717, 0.357], [-2.398, -0.135, 0.951, 0.429, 1.038, 0.502], [1.554, 0.709, 0.624, 0.144, 0.967, 1.304], [-0.666, -1.374, 0.422, 0.517, 0.122, 0.8], [-0.203, 1.908, 0.093, 1.027, 0.556, 0.76], [-1.532, -0.535, 0.437, 0.57, 0.41, 0.413], [-1.535, -0.307, 0.814, 0.936, 0.544, 1.082], [-0.39, -2.044, 0.309, 0.76, 0.801, 0.62], [1.044, -2.393, 0.932, 1.048, 0.287, 1.261], [0.664, -0.294, 1.14, 0.882, 0.176, 0.207], [-2.135, 2.211, 0.272, 0.963, 0.668, 0.76], [-1.028, -2.103, 1.016, 0.918, 0.609, 1.31], [1.579, -2.37, 0.458, 0.202, 0.159, 0.166], [2.079, -0.505, 0.945, 0.57, 0.86, 0.725], [-0.396, 2.379, 
0.489, 0.77, 0.063, 0.52], [0.423, -2.492, 0.598, 0.788, 0.241, 0.406], [2.219, -1.548, 0.415, 0.429, 0.702, 0.329], [1.236, -1.961, 0.849, 0.371, 0.256, -0.039], [2.93, -1.099, 1.108, 0.393, 0.388, 0.187], [1.738, -0.099, 0.354, 0.013, 0.06, 0.667], [-1.711, 2.599, 0.36, 0.548, 0.69, -0.323], [-1.988, -1.796, 0.232, 0.609, 0.912, -0.043]]\nD: [[-0.117, -0.712, 0.165, 0.707, 0.749, 0.416], [0.663, 1.109, 0.92, 0.786, 0.382, 0.761], [-0.485, 1.276, -0.006, 0.122, 0.579, 0.562], [0.651, -1.033, 0.48, 0.012, 0.291, 0.281], [-1.67, 0.137, 0.785, 1.091, 0.142, 0.851], [1.44, 0.455, 0.476, 0.133, 0.572, 0.925], [-0.342, -1.74, 0.35, 0.646, 0.394, 0.443], [-0.793, 2.134, 0.146, 1.105, 0.456, 0.742], [-1.574, -0.65, 0.985, 0.2, 0.168, 1.102], [-1.526, 0.104, 0.427, 0.23, 0.555, 0.818], [-0.21, -2.447, 0.593, 1.166, 1.051, 0.465], [1.11, -2.085, 0.532, 0.952, 0.334, 0.936], [-0.231, -0.532, 0.895, 0.826, 0.523, 0.78], [-2.843, 1.728, 0.764, 0.92, 0.672, 0.101], [-0.845, -1.905, 0.458, 0.184, 0.635, 1.348], [1.844, -1.433, 1.033, 0.147, 0.968, 0.118], [2.166, -0.542, 0.733, 0.117, 0.957, 0.814], [-0.92, 1.743, 0.237, 0.993, 0.477, 0.227], [0.31, -2.458, 0.659, 0.782, 0.696, 0.669], [1.626, -1.353, 0.953, 0.579, 0.517, 0.513], [0.856, -2.414, 0.37, 0.927, 0.46, -0.04], [2.564, -1.512, 0.576, 0.167, 0.528, 0.323], [2.064, -0.007, 0.399, -0.022, 0.444, 0.68], [-1.281, 1.93, 0.706, 0.411, 0.266, -0.223], [-2.823, -2.037, 0.823, 0.649, 0.539, 0.078]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_184_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_184_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.878, 0.793, 0.525, 0.307, 1.138, 0.24], [-1.741, 1.942, 0.919, 0.374, 0.284, 0.479], [-0.97, -1.167, 0.396, 0.145, 0.451, -0.19], [-1.083, -1.878, 0.816, 0.138, 0.403, 0.089], [-0.905, 
-1.314, 0.204, 0.196, 0.671, 0.614], [0.896, -0.37, -0.05, 0.065, 0.48, 0.052], [-0.311, 2.442, 0.913, 0.38, 0.489, 1.491]]\nB: [[-0.86, 0.987, 0.429, 0.753, 0.659, 0.935], [-1.179, 1.249, 1.101, 0.219, 0.461, 0.432], [-1.149, -1.648, 0.72, 0.293, 0.482, 0.205], [-0.806, -1.928, 1.378, 0.631, 0.665, 0.667], [-0.762, -1.463, 0.175, -0.154, 0.327, 0.058], [1.471, -0.937, 0.34, 0.631, 0.439, 0.05], [0.043, 1.928, 0.641, 0.637, 0.49, 0.81]]\nC: [[-0.458, 0.879, 0.858, 0.541, 0.86, 0.578], [-1.685, 1.754, 1.421, 0.241, 0.756, -0.241], [-1.527, -1.671, 0.922, 0.635, 0.013, 0.552], [-1.217, -1.15, 1.123, 0.142, 0.166, 0.441], [-1.042, -1.813, 0.54, 0.438, 0.445, 0.211], [1.239, -0.633, 0.179, 0.181, 0.23, 0.74], [-0.011, 2.472, 1.042, 0.129, 0.472, 1.438]]\nD: [[-0.55, 0.944, 0.644, 0.68, 1.046, 0.521], [-1.267, 1.712, 1.229, 0.359, 0.367, 0.165], [-1.24, -1.459, 0.828, 0.501, 0.283, 0.305], [-1.303, -1.553, 1.014, 0.399, 0.228, 0.178], [-0.73, -1.694, 0.629, 0.212, 0.251, 0.137], [1.337, -0.693, 0.283, 0.145, 0.467, 0.539], [0.135, 2.342, 0.574, 0.546, 0.62, 1.068]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the object in the scene. The camera pose information includes: the rotation matrix: [[0.330673, -0.328207, 0.884837], [-0.942686, -0.070458, 0.326157], [-0.044703, -0.941975, -0.332694]]; the translation vector: [3.753276, 4.481459, 1.345242], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.878, 0.793, 0.525, 0.307, 1.138, 0.24], [-1.741, 1.942, 0.919, 0.374, 0.284, 0.479], [-0.97, -1.167, 0.396, 0.145, 0.451, -0.19], [-1.083, -1.878, 0.816, 0.138, 0.403, 0.089], [-0.905, -1.314, 0.204, 0.196, 0.671, 0.614], [0.896, -0.37, -0.05, 0.065, 0.48, 0.052], [-0.311, 2.442, 0.913, 0.38, 0.489, 1.491]]\nB: [[-0.86, 0.987, 0.429, 0.753, 0.659, 0.935], [-1.179, 1.249, 1.101, 0.219, 0.461, 0.432], [-1.149, -1.648, 0.72, 0.293, 0.482, 0.205], [-0.806, -1.928, 1.378, 0.631, 0.665, 0.667], [-0.762, -1.463, 0.175, -0.154, 0.327, 0.058], [1.471, -0.937, 0.34, 0.631, 0.439, 0.05], [0.043, 1.928, 0.641, 0.637, 0.49, 0.81]]\nC: [[-0.458, 0.879, 0.858, 0.541, 0.86, 0.578], [-1.685, 1.754, 1.421, 0.241, 0.756, -0.241], [-1.527, -1.671, 0.922, 0.635, 0.013, 0.552], [-1.217, -1.15, 1.123, 0.142, 0.166, 0.441], [-1.042, -1.813, 0.54, 0.438, 0.445, 0.211], [1.239, -0.633, 0.179, 0.181, 0.23, 0.74], [-0.011, 2.472, 1.042, 0.129, 0.472, 1.438]]\nD: [[-0.55, 0.944, 0.644, 0.68, 1.046, 0.521], [-1.267, 1.712, 1.229, 0.359, 0.367, 0.165], [-1.24, -1.459, 0.828, 0.501, 0.283, 0.305], [-1.303, -1.553, 1.014, 0.399, 0.228, 0.178], [-0.73, -1.694, 0.629, 0.212, 0.251, 0.137], [1.337, -0.693, 0.283, 0.145, 0.467, 0.539], [0.135, 2.342, 0.574, 0.546, 0.62, 1.068]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_185_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_185_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.565, -1.185, 1.353, 0.464, 0.006, 0.118], [0.412, -0.591, 0.819, 0.036, 0.543, 0.322], [0.775, 0.247, 0.843, 0.919, 0.864, 0.389]]\nB: [[-0.648, -1.262, 0.922, 0.446, 0.433, 0.522], [0.437, -0.235, 0.949, 0.366, 0.445, 0.454], [0.764, 0.145, 0.941, 0.483, 0.409, 0.473]]\nC: [[-0.794, -1.422, 1.325, 0.646, -0.011, 0.511], [0.55, -0.124, 
0.97, 0.767, 0.276, 0.151], [0.692, 0.134, 0.818, 0.04, 0.142, 0.775]]\nD: [[-0.888, -1.602, 1.373, 0.357, 0.797, 0.596], [0.014, -0.496, 0.808, 0.816, 0.004, 0.14], [0.932, 0.14, 0.871, 0.799, 0.355, 0.358]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the monitor in the scene. The camera pose information includes: the rotation matrix: [[0.054781, -0.427281, 0.902458], [-0.998013, -0.051617, 0.036143], [0.031139, -0.902644, -0.429259]]; the translation vector: [1.328526, 0.849821, 1.501181], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.565, -1.185, 1.353, 0.464, 0.006, 0.118], [0.412, -0.591, 0.819, 0.036, 0.543, 0.322], [0.775, 0.247, 0.843, 0.919, 0.864, 0.389]]\nB: [[-0.648, -1.262, 0.922, 0.446, 0.433, 0.522], [0.437, -0.235, 0.949, 0.366, 0.445, 0.454], [0.764, 0.145, 0.941, 0.483, 0.409, 0.473]]\nC: [[-0.794, -1.422, 1.325, 0.646, -0.011, 0.511], [0.55, -0.124, 0.97, 0.767, 0.276, 0.151], [0.692, 0.134, 0.818, 0.04, 0.142, 0.775]]\nD: [[-0.888, -1.602, 1.373, 0.357, 0.797, 0.596], [0.014, -0.496, 0.808, 0.816, 0.004, 0.14], [0.932, 0.14, 0.871, 0.799, 0.355, 0.358]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_186_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_186_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.353, -1.905, 0.542, 0.198, 0.811, 0.866]]\nB: [[-1.69, -2.015, 0.887, 0.014, 0.72, 0.41]]\nC: [[-1.178, -2.25, 0.868, 0.547, 0.466, 0.935]]\nD: [[-1.26, -1.838, 0.523, -0.212, 0.311, 0.619]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the dishwasher in the scene. The camera pose information includes: the rotation matrix: [[0.752445, 0.275595, -0.598225], [0.657828, -0.35994, 0.661593], [-0.032994, -0.891342, -0.452129]]; the translation vector: [2.633805, 2.70906, 1.31733], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.353, -1.905, 0.542, 0.198, 0.811, 0.866]]\nB: [[-1.69, -2.015, 0.887, 0.014, 0.72, 0.41]]\nC: [[-1.178, -2.25, 0.868, 0.547, 0.466, 0.935]]\nD: [[-1.26, -1.838, 0.523, -0.212, 0.311, 0.619]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_187_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_187_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.575, -1.33, 0.913, 0.422, 0.222, 1.282], [-1.133, -0.569, 0.691, 0.502, 1.733, 0.362], [-1.177, -0.378, 1.14, 0.427, 2.109, 0.616], [-1.073, 1.114, 1.403, 0.582, 1.219, 1.213], [-0.205, 1.471, 1.068, 1.327, 0.559, 1.11], [0.394, 0.971, 1.401, 0.368, 1.532, 0.97], [0.465, 0.918, 0.399, 0.657, 1.295, 0.944], [-1.033, 0.855, 0.494, 0.561, 0.838, 0.96]]\nB: [[0.612, -1.146, 1.029, 0.876, 0.249, 1.097], [-1.186, -0.558, 0.781, 0.832, 1.793, 0.11], [-0.68, -0.259, 1.327, 0.321, 2.085, 0.39], [-0.751, 1.462, 1.244, 1.064, 1.13, 1.071], [-0.245, 1.694, 1.448, 1.271, 0.405, 0.826], [0.501, 1.197, 1.032, 0.635, 1.295, 1.137], [0.575, 1.298, 0.738, 0.961, 1.68, 0.895], [-1.135, 0.586, 0.775, 0.711, 1.079, 0.526]]\nC: [[0.678, -1.206, 0.812, 0.848, -0.174, 1.369], [-1.023, -0.705, 0.492, 0.502, 1.434, -0.09], [-1.388, -0.068, 1.103, 0.59, 1.707, 0.559], [-1.152, 1.027, 1.347, 0.752, 0.971, 1.412], [-0.198, 1.443, 1.383, 1.532, 0.499, 1.267], [0.854, 0.79, 1.691, 0.351, 1.682, 0.641], [0.791, 0.546, 0.687, 0.219, 1.088, 1.252], [-0.634, 1.336, 0.286, 0.814, 1.197, 1.221]]\nD: [[0.82, -1.281, 0.96, -0.009, 0.662, 1.248], [-1.615, -0.505, 0.267, 0.866, 1.991, 0.496], [-0.964, -0.4, 1.176, 0.247, 2.442, 0.894], [-1.088, 1.358, 1.232, 0.782, 1.082, 0.821], [0.043, 1.718, 1.49, 1.508, 0.835, 1.275], [0.739, 1.084, 1.461, 0.376, 1.382, 1.444], [0.407, 0.623, 0.633, 0.434, 1.281, 1.107], [-1.311, 0.453, 
0.839, 0.768, 1.093, 0.774]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the clothes in the scene. The camera pose information includes: the rotation matrix: [[0.88123, -0.188698, 0.433389], [-0.470321, -0.258404, 0.843816], [-0.047237, -0.947428, -0.316462]]; the translation vector: [1.061636, 1.321782, 1.457525], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.575, -1.33, 0.913, 0.422, 0.222, 1.282], [-1.133, -0.569, 0.691, 0.502, 1.733, 0.362], [-1.177, -0.378, 1.14, 0.427, 2.109, 0.616], [-1.073, 1.114, 1.403, 0.582, 1.219, 1.213], [-0.205, 1.471, 1.068, 1.327, 0.559, 1.11], [0.394, 0.971, 1.401, 0.368, 1.532, 0.97], [0.465, 0.918, 0.399, 0.657, 1.295, 0.944], [-1.033, 0.855, 0.494, 0.561, 0.838, 0.96]]\nB: [[0.612, -1.146, 1.029, 0.876, 0.249, 1.097], [-1.186, -0.558, 0.781, 0.832, 1.793, 0.11], [-0.68, -0.259, 1.327, 0.321, 2.085, 0.39], [-0.751, 1.462, 1.244, 1.064, 1.13, 1.071], [-0.245, 1.694, 1.448, 1.271, 0.405, 0.826], [0.501, 1.197, 1.032, 0.635, 1.295, 1.137], [0.575, 1.298, 0.738, 0.961, 1.68, 0.895], [-1.135, 0.586, 0.775, 0.711, 1.079, 0.526]]\nC: [[0.678, -1.206, 0.812, 0.848, -0.174, 1.369], [-1.023, -0.705, 0.492, 0.502, 1.434, -0.09], [-1.388, -0.068, 1.103, 0.59, 1.707, 0.559], [-1.152, 1.027, 1.347, 0.752, 0.971, 1.412], [-0.198, 1.443, 1.383, 1.532, 0.499, 1.267], [0.854, 0.79, 1.691, 0.351, 1.682, 0.641], [0.791, 0.546, 0.687, 0.219, 1.088, 1.252], [-0.634, 1.336, 0.286, 0.814, 1.197, 1.221]]\nD: [[0.82, -1.281, 0.96, 
-0.009, 0.662, 1.248], [-1.615, -0.505, 0.267, 0.866, 1.991, 0.496], [-0.964, -0.4, 1.176, 0.247, 2.442, 0.894], [-1.088, 1.358, 1.232, 0.782, 1.082, 0.821], [0.043, 1.718, 1.49, 1.508, 0.835, 1.275], [0.739, 1.084, 1.461, 0.376, 1.382, 1.444], [0.407, 0.623, 0.633, 0.434, 1.281, 1.107], [-1.311, 0.453, 0.839, 0.768, 1.093, 0.774]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_188_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_188_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.405, 0.601, 0.764, -0.233, 0.102, -0.396]]\nB: [[-1.074, 0.387, 1.003, 0.303, 0.098, -0.079]]\nC: [[-1.416, 1.278, 0.402, -0.026, 0.472, 0.541]]\nD: [[-1.238, 0.875, 0.853, 0.207, 0.18, 0.059]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the washcloth in the scene. The camera pose information includes: the rotation matrix: [[-0.922168, 0.178823, -0.342969], [0.38661, 0.453076, -0.803278], [0.011746, -0.873352, -0.486947]]; the translation vector: [3.207336, 1.959871, 1.267555], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.405, 0.601, 0.764, -0.233, 0.102, -0.396]]\nB: [[-1.074, 0.387, 1.003, 0.303, 0.098, -0.079]]\nC: [[-1.416, 1.278, 0.402, -0.026, 0.472, 0.541]]\nD: [[-1.238, 0.875, 0.853, 0.207, 0.18, 0.059]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_189_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_189_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.762, -1.555, 0.827, 2.184, 0.001, 1.66], [-1.543, -2.246, 0.687, 2.166, 0.538, 1.705], [-0.782, 1.874, 0.612, 0.36, -0.393, 2.288], [-2.462, 0.204, 1.234, 0.129, 3.712, 1.462], [-0.383, -1.635, 0.309, 0.432, -0.146, 1.237], [2.291, 0.002, 1.16, 0.467, 3.329, 1.816], [1.638, 1.427, 0.73, 2.104, 0.102, 1.563], [-1.67, 1.91, 0.562, 0.741, 0.507, 1.714]]\nB: [[1.374, -1.714, 0.725, 2.507, 0.169, 1.413], [-1.211, -1.757, 0.826, 2.443, 0.176, 1.694], [-0.519, 1.79, 0.908, 0.294, 0.099, 1.833], [-2.419, 0.035, 0.987, 0.337, 3.555, 1.874], [0.072, -1.69, 0.634, 0.2, 0.284, 1.225], [2.688, 0.023, 0.867, 0.191, 3.55, 1.732], [1.91, 1.763, 0.852, 1.655, 0.149, 1.762], [-2.022, 1.78, 0.984, 1.051, 0.126, 1.927]]\nC: [[0.908, -1.987, 1.173, 2.894, 0.341, 1.45], [-1.176, -1.625, 1.254, 2.938, 0.258, 1.218], [-0.734, 1.406, 1.146, 0.597, 0.342, 1.626], [-2.253, 0.34, 1.308, 0.063, 3.579, 1.568], [-0.079, -1.858, 0.689, 0.18, 0.741, 0.85], [2.904, 0.375, 0.691, 0.079, 3.103, 2.186], [1.824, 1.499, 0.728, 1.255, 0.079, 1.787], [-2.124, 1.899, 1.164, 1.019, 0.481, 1.863]]\nD: [[1.462, -1.527, 0.599, 2.871, 0.537, 1.876], [-0.801, -1.454, 1.05, 2.817, -0.258, 1.392], [-0.063, 2.202, 0.566, 0.693, 0.023, 1.708], [-2.869, -0.081, 1.48, 0.816, 3.209, 2.127], [0.07, -1.284, 0.825, -0.242, 0.304, 1.406], [2.798, 0.497, 1.202, 0.386, 3.591, 2.066], [1.458, 2.138, 0.37, 1.504, 0.6, 1.542], [-2.145, 1.824, 0.999, 
1.206, 0.504, 1.867]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.984594, -0.069457, 0.160469], [-0.174127, -0.305795, 0.936039], [-0.015944, -0.949561, -0.313178]]; the translation vector: [3.941113, 2.817773, 1.559826], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.762, -1.555, 0.827, 2.184, 0.001, 1.66], [-1.543, -2.246, 0.687, 2.166, 0.538, 1.705], [-0.782, 1.874, 0.612, 0.36, -0.393, 2.288], [-2.462, 0.204, 1.234, 0.129, 3.712, 1.462], [-0.383, -1.635, 0.309, 0.432, -0.146, 1.237], [2.291, 0.002, 1.16, 0.467, 3.329, 1.816], [1.638, 1.427, 0.73, 2.104, 0.102, 1.563], [-1.67, 1.91, 0.562, 0.741, 0.507, 1.714]]\nB: [[1.374, -1.714, 0.725, 2.507, 0.169, 1.413], [-1.211, -1.757, 0.826, 2.443, 0.176, 1.694], [-0.519, 1.79, 0.908, 0.294, 0.099, 1.833], [-2.419, 0.035, 0.987, 0.337, 3.555, 1.874], [0.072, -1.69, 0.634, 0.2, 0.284, 1.225], [2.688, 0.023, 0.867, 0.191, 3.55, 1.732], [1.91, 1.763, 0.852, 1.655, 0.149, 1.762], [-2.022, 1.78, 0.984, 1.051, 0.126, 1.927]]\nC: [[0.908, -1.987, 1.173, 2.894, 0.341, 1.45], [-1.176, -1.625, 1.254, 2.938, 0.258, 1.218], [-0.734, 1.406, 1.146, 0.597, 0.342, 1.626], [-2.253, 0.34, 1.308, 0.063, 3.579, 1.568], [-0.079, -1.858, 0.689, 0.18, 0.741, 0.85], [2.904, 0.375, 0.691, 0.079, 3.103, 2.186], [1.824, 1.499, 0.728, 1.255, 0.079, 1.787], [-2.124, 1.899, 1.164, 1.019, 0.481, 1.863]]\nD: [[1.462, -1.527, 0.599, 2.871, 0.537, 
1.876], [-0.801, -1.454, 1.05, 2.817, -0.258, 1.392], [-0.063, 2.202, 0.566, 0.693, 0.023, 1.708], [-2.869, -0.081, 1.48, 0.816, 3.209, 2.127], [0.07, -1.284, 0.825, -0.242, 0.304, 1.406], [2.798, 0.497, 1.202, 0.386, 3.591, 2.066], [1.458, 2.138, 0.37, 1.504, 0.6, 1.542], [-2.145, 1.824, 0.999, 1.206, 0.504, 1.867]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_190_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_190_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.548, -0.723, 1.702, 0.029, 0.56, 0.548], [0.99, -0.353, 1.798, 0.254, 0.406, 1.307], [0.157, -0.505, 0.811, 2.866, -0.08, 2.257], [-1.059, 0.22, 1.187, 0.204, 1.96, 1.609], [-1.515, 1.374, 1.432, 0.39, 0.462, 2.145], [-1.181, 2.329, 0.518, 0.161, 0.641, 1.327], [1.31, 1.368, 1.072, 1.078, 3.27, 1.707]]\nB: [[-0.408, -0.944, 1.285, 0.383, 0.855, 1.329], [1.037, -1.048, 1.113, 0.346, 0.92, 1.337], [0.631, -0.546, 1.683, 2.674, 0.421, 2.371], [-1.406, 0.284, 0.493, 0.426, 1.745, 1.616], [-0.913, 1.243, 0.625, 0.944, -0.159, 1.495], [-0.749, 1.827, 0.664, -0.223, 0.933, 1.793], [1.27, 1.253, 1.221, 0.403, 2.724, 2.094]]\nC: [[-0.454, -0.86, 2.026, 0.451, 0.358, 1.257], [1.65, -0.511, 2.057, 0.183, 0.13, 0.645], [0.357, -0.781, 1.143, 3.085, -0.312, 2.705], [-1.785, 0.873, 0.92, 0.414, 1.805, 1.915], [-0.907, 0.946, 0.648, 1.086, 0.063, 2.046], [-0.884, 1.711, 1.057, -0.048, 0.722, 0.964], [1.337, 0.641, 0.462, 0.296, 3.312, 2.01]]\nD: [[-0.751, -0.786, 1.574, 0.095, 0.444, 1.034], [1.18, -0.773, 1.574, 0.094, 0.433, 1.033], [0.142, -0.562, 1.184, 3.142, 0.116, 2.394], [-1.437, 0.419, 0.848, 0.139, 1.974, 1.688], [-1.083, 1.379, 1.042, 0.807, 0.163, 1.776], [-0.694, 1.837, 0.766, 0.107, 0.954, 1.459], [1.355, 0.889, 0.903, 0.788, 2.903, 1.82]]", "question": "Given a RGB image and a depth image, please detect the 
3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[-0.355681, -0.20797, 0.911175], [-0.934036, 0.113197, -0.338769], [-0.032689, -0.971563, -0.234514]]; the translation vector: [0.539195, 4.841905, 1.636959], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-0.548, -0.723, 1.702, 0.029, 0.56, 0.548], [0.99, -0.353, 1.798, 0.254, 0.406, 1.307], [0.157, -0.505, 0.811, 2.866, -0.08, 2.257], [-1.059, 0.22, 1.187, 0.204, 1.96, 1.609], [-1.515, 1.374, 1.432, 0.39, 0.462, 2.145], [-1.181, 2.329, 0.518, 0.161, 0.641, 1.327], [1.31, 1.368, 1.072, 1.078, 3.27, 1.707]]\nB: [[-0.408, -0.944, 1.285, 0.383, 0.855, 1.329], [1.037, -1.048, 1.113, 0.346, 0.92, 1.337], [0.631, -0.546, 1.683, 2.674, 0.421, 2.371], [-1.406, 0.284, 0.493, 0.426, 1.745, 1.616], [-0.913, 1.243, 0.625, 0.944, -0.159, 1.495], [-0.749, 1.827, 0.664, -0.223, 0.933, 1.793], [1.27, 1.253, 1.221, 0.403, 2.724, 2.094]]\nC: [[-0.454, -0.86, 2.026, 0.451, 0.358, 1.257], [1.65, -0.511, 2.057, 0.183, 0.13, 0.645], [0.357, -0.781, 1.143, 3.085, -0.312, 2.705], [-1.785, 0.873, 0.92, 0.414, 1.805, 1.915], [-0.907, 0.946, 0.648, 1.086, 0.063, 2.046], [-0.884, 1.711, 1.057, -0.048, 0.722, 0.964], [1.337, 0.641, 0.462, 0.296, 3.312, 2.01]]\nD: [[-0.751, -0.786, 1.574, 0.095, 0.444, 1.034], [1.18, -0.773, 1.574, 0.094, 0.433, 1.033], [0.142, -0.562, 1.184, 3.142, 0.116, 2.394], [-1.437, 0.419, 0.848, 0.139, 1.974, 1.688], [-1.083, 1.379, 1.042, 0.807, 0.163, 1.776], [-0.694, 1.837, 0.766, 0.107, 
0.954, 1.459], [1.355, 0.889, 0.903, 0.788, 2.903, 1.82]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_191_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_191_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.515, -3.241, 1.128, 2.444, 0.863, 2.298], [1.577, 0.871, 1.235, 2.218, 0.709, 2.09], [1.099, 3.677, 1.424, 1.498, 0.813, 2.316], [1.686, -0.521, 1.08, 2.449, 0.774, 1.941], [1.48, 2.234, 1.312, 2.224, 0.696, 2.161], [0.71, 4.953, 0.833, 0.669, 0.644, 1.154], [1.678, -1.888, 1.095, 2.523, 0.759, 2.102]]\nB: [[1.269, -3.321, 1.076, 2.021, 0.959, 2.397], [1.664, 1.284, 1.204, 2.329, 1.065, 2.182], [1.189, 3.832, 1.394, 1.94, 1.033, 1.829], [2.066, -0.941, 0.589, 2.315, 1.169, 1.455], [1.915, 2.253, 1.321, 2.418, 0.57, 2.378], [0.213, 5.41, 0.898, 0.409, 1.093, 1.517], [1.55, -2.082, 1.024, 2.82, 0.884, 2.344]]\nC: [[1.118, -3.575, 0.993, 1.946, 0.682, 2.318], [1.412, 0.928, 1.006, 2.495, 0.73, 2.187], [0.774, 3.36, 0.968, 1.482, 0.922, 2.574], [1.295, -0.734, 1.167, 2.189, 0.383, 1.587], [1.325, 2.548, 0.999, 2.413, 1.015, 2.532], [0.98, 5.017, 0.875, 0.448, 0.455, 0.917], [2.018, -1.5, 1.046, 2.717, 0.819, 2.55]]\nD: [[1.259, -3.521, 1.143, 2.894, 0.867, 2.663], [1.362, 1.016, 1.431, 2.314, 0.878, 2.2], [0.748, 3.481, 1.025, 1.495, 1.271, 2.75], [1.85, -0.752, 1.348, 2.468, 0.657, 1.566], [1.513, 2.006, 1.345, 1.751, 0.827, 2.159], [0.635, 4.802, 1.263, 0.202, 1.111, 1.501], [1.353, -2.331, 1.563, 2.89, 1.228, 2.108]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the bookshelf in the scene. 
The camera pose information includes: the rotation matrix: [[-0.941243, -0.209403, 0.264975], [-0.336113, 0.504116, -0.795548], [0.033012, -0.837865, -0.544878]]; the translation vector: [4.828751, 9.008894, 1.463441], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.515, -3.241, 1.128, 2.444, 0.863, 2.298], [1.577, 0.871, 1.235, 2.218, 0.709, 2.09], [1.099, 3.677, 1.424, 1.498, 0.813, 2.316], [1.686, -0.521, 1.08, 2.449, 0.774, 1.941], [1.48, 2.234, 1.312, 2.224, 0.696, 2.161], [0.71, 4.953, 0.833, 0.669, 0.644, 1.154], [1.678, -1.888, 1.095, 2.523, 0.759, 2.102]]\nB: [[1.269, -3.321, 1.076, 2.021, 0.959, 2.397], [1.664, 1.284, 1.204, 2.329, 1.065, 2.182], [1.189, 3.832, 1.394, 1.94, 1.033, 1.829], [2.066, -0.941, 0.589, 2.315, 1.169, 1.455], [1.915, 2.253, 1.321, 2.418, 0.57, 2.378], [0.213, 5.41, 0.898, 0.409, 1.093, 1.517], [1.55, -2.082, 1.024, 2.82, 0.884, 2.344]]\nC: [[1.118, -3.575, 0.993, 1.946, 0.682, 2.318], [1.412, 0.928, 1.006, 2.495, 0.73, 2.187], [0.774, 3.36, 0.968, 1.482, 0.922, 2.574], [1.295, -0.734, 1.167, 2.189, 0.383, 1.587], [1.325, 2.548, 0.999, 2.413, 1.015, 2.532], [0.98, 5.017, 0.875, 0.448, 0.455, 0.917], [2.018, -1.5, 1.046, 2.717, 0.819, 2.55]]\nD: [[1.259, -3.521, 1.143, 2.894, 0.867, 2.663], [1.362, 1.016, 1.431, 2.314, 0.878, 2.2], [0.748, 3.481, 1.025, 1.495, 1.271, 2.75], [1.85, -0.752, 1.348, 2.468, 0.657, 1.566], [1.513, 2.006, 1.345, 1.751, 0.827, 2.159], [0.635, 4.802, 1.263, 0.202, 1.111, 1.501], [1.353, -2.331, 1.563, 2.89, 1.228, 2.108]]", 
"input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_192_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_192_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.126, -0.51, 1.73, -0.359, 0.479, 0.324]]\nB: [[-1.548, -0.135, 1.59, 0.021, 0.457, 0.386]]\nC: [[-1.508, -0.035, 1.589, -0.436, 0.071, 0.171]]\nD: [[-1.888, -0.563, 1.28, -0.393, 0.688, 0.046]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the picture in the scene. The camera pose information includes: the rotation matrix: [[0.623567, 0.536294, -0.568817], [0.781209, -0.455034, 0.427384], [-0.029628, -0.710867, -0.702702]]; the translation vector: [1.790477, 1.816361, 1.229059], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-1.126, -0.51, 1.73, -0.359, 0.479, 0.324]]\nB: [[-1.548, -0.135, 1.59, 0.021, 0.457, 0.386]]\nC: [[-1.508, -0.035, 1.589, -0.436, 0.071, 0.171]]\nD: [[-1.888, -0.563, 1.28, -0.393, 0.688, 0.046]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_193_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_193_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-2.08, 0.154, 1.005, 0.283, 1.414, 1.731]]\nB: [[-1.974, 0.286, 1.416, 0.341, 1.457, 1.235]]\nC: [[-1.941, 0.29, 1.24, -0.098, 1.307, 1.381]]\nD: [[-1.581, 0.374, 0.521, 0.311, 1.136, 1.526]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the whiteboard in the scene. The camera pose information includes: the rotation matrix: [[-0.341382, 0.594812, -0.727775], [0.932196, 0.11517, -0.343142], [-0.120287, -0.795572, -0.593798]]; the translation vector: [7.151203, 3.587152, 1.581923], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-2.08, 0.154, 1.005, 0.283, 1.414, 1.731]]\nB: [[-1.974, 0.286, 1.416, 0.341, 1.457, 1.235]]\nC: [[-1.941, 0.29, 1.24, -0.098, 1.307, 1.381]]\nD: [[-1.581, 0.374, 0.521, 0.311, 1.136, 1.526]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_194_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_194_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-0.65, 1.626, 0.952, 1.426, 0.125, 1.867]]\nB: [[-0.34, 1.647, 1.105, 1.036, 0.294, 2.092]]\nC: [[-0.202, 1.219, 1.248, 1.308, -0.28, 1.829]]\nD: [[-1.114, 1.711, 0.518, 0.996, 0.291, 2.172]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the doorframe in the scene. The camera pose information includes: the rotation matrix: [[-0.40936, -0.486807, 0.77165], [-0.912164, 0.236459, -0.334729], [-0.019515, -0.840896, -0.540844]]; the translation vector: [1.412713, 1.214489, 1.390939], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[-0.65, 1.626, 0.952, 1.426, 0.125, 1.867]]\nB: [[-0.34, 1.647, 1.105, 1.036, 0.294, 2.092]]\nC: [[-0.202, 1.219, 1.248, 1.308, -0.28, 1.829]]\nD: [[-1.114, 1.711, 0.518, 0.996, 0.291, 2.172]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_195_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_195_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[-1.253, 0.185, 0.949, 0.229, 3.97, 1.922], [-0.174, 1.794, 1.044, 2.121, 0.218, 2.116], [0.873, -0.38, 1.281, 0.158, 4.352, 2.497], [0.476, -2.537, 0.593, 0.677, 0.042, 1.129], [0.122, -2.616, 0.312, 0.063, 0.188, 0.596]]\nB: [[-1.703, -0.184, 0.581, -0.118, 3.494, 2.053], [0.248, 1.316, 1.102, 2.022, 0.319, 1.655], [1.35, -0.101, 1.108, 0.315, 4.473, 2.489], [0.441, -2.72, 0.688, 0.321, 0.469, 1.1], [-0.308, -2.248, -0.131, 0.362, 0.498, 0.335]]\nC: [[-1.001, 0.387, 0.855, 0.13, 4.223, 1.808], [-0.121, 2.25, 1.058, 2.216, 0.377, 2.185], [0.489, 0.025, 0.85, -0.341, 3.971, 2.77], [0.668, -2.895, 0.381, 0.972, 0.18, 1.122], [0.223, -2.648, 0.118, -0.29, 0.288, 0.814]]\nD: [[-1.615, 0.237, 0.631, 0.113, 3.734, 2.164], [-0.111, 1.6, 1.257, 2.2, 0.658, 1.704], [0.468, -0.376, 0.97, -0.134, 3.943, 2.668], [0.083, -2.476, 0.49, 0.836, 0.329, 1.629], [-0.101, -2.949, 0.022, 0.48, 0.426, 0.711]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the wall in the scene. The camera pose information includes: the rotation matrix: [[0.977514, -0.102294, 0.184398], [-0.210796, -0.497303, 0.841578], [0.005613, -0.861525, -0.507684]]; the translation vector: [3.555602, 1.207732, 1.356493], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[-1.253, 0.185, 0.949, 0.229, 3.97, 1.922], [-0.174, 1.794, 1.044, 2.121, 0.218, 2.116], [0.873, -0.38, 1.281, 0.158, 4.352, 2.497], [0.476, -2.537, 0.593, 0.677, 0.042, 1.129], [0.122, -2.616, 0.312, 0.063, 0.188, 0.596]]\nB: [[-1.703, -0.184, 0.581, -0.118, 3.494, 2.053], [0.248, 1.316, 1.102, 2.022, 0.319, 1.655], [1.35, -0.101, 1.108, 0.315, 4.473, 2.489], [0.441, -2.72, 0.688, 0.321, 0.469, 1.1], [-0.308, -2.248, -0.131, 0.362, 0.498, 0.335]]\nC: [[-1.001, 0.387, 0.855, 0.13, 4.223, 1.808], [-0.121, 2.25, 1.058, 2.216, 0.377, 2.185], [0.489, 0.025, 0.85, -0.341, 3.971, 2.77], [0.668, -2.895, 0.381, 0.972, 0.18, 1.122], [0.223, -2.648, 0.118, -0.29, 0.288, 0.814]]\nD: [[-1.615, 0.237, 0.631, 0.113, 3.734, 2.164], [-0.111, 1.6, 1.257, 2.2, 0.658, 1.704], [0.468, -0.376, 0.97, -0.134, 3.943, 2.668], [0.083, -2.476, 0.49, 0.836, 0.329, 1.629], [-0.101, -2.949, 0.022, 0.48, 0.426, 0.711]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_196_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_196_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[1.561, -0.516, 0.43, 0.03, 0.264, 0.023]]\nB: [[1.307, -0.077, 0.927, 0.18, 0.373, 0.438]]\nC: [[1.232, 0.339, 1.368, -0.266, 0.794, 0.386]]\nD: [[1.366, 0.134, 0.662, 0.477, 0.375, 0.57]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the toilet paper holder in the scene. 
The camera pose information includes: the rotation matrix: [[-0.566304, -0.590941, 0.574533], [-0.823945, 0.423135, -0.376925], [-0.020365, -0.686838, -0.726526]]; the translation vector: [2.143516, 1.760119, 1.343188], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[1.561, -0.516, 0.43, 0.03, 0.264, 0.023]]\nB: [[1.307, -0.077, 0.927, 0.18, 0.373, 0.438]]\nC: [[1.232, 0.339, 1.368, -0.266, 0.794, 0.386]]\nD: [[1.366, 0.134, 0.662, 0.477, 0.375, 0.57]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_197_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_197_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.989, -2.867, 2.62, 3.827, 5.879, 0.215]]\nB: [[0.557, -2.629, 2.447, 3.868, 5.161, -0.064]]\nC: [[0.767, -2.57, 3.32, 4.124, 4.999, -0.179]]\nD: [[0.538, -2.391, 2.899, 4.263, 5.407, 0.187]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the ceiling in the scene. The camera pose information includes: the rotation matrix: [[-0.999494, 0.005595, 0.031322], [-0.029883, 0.172936, -0.98448], [-0.010925, -0.984917, -0.172681]]; the translation vector: [6.687301, 5.436423, 1.742894], representing the transformation from the camera coordinate system to the world coordinate system. 
For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. \nSelect from the following choices.\nA: [[0.989, -2.867, 2.62, 3.827, 5.879, 0.215]]\nB: [[0.557, -2.629, 2.447, 3.868, 5.161, -0.064]]\nC: [[0.767, -2.57, 3.32, 4.124, 4.999, -0.179]]\nD: [[0.538, -2.391, 2.899, 4.263, 5.407, 0.187]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_198_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_198_1.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Detection", "visual_input_component": "3d image", "source": "SCANNET_threed_bbox_detection", "options": "A: [[0.728, -0.216, 1.391, -0.233, 0.319, 0.888]]\nB: [[1.382, -0.434, 1.41, 0.62, 0.036, 0.847]]\nC: [[1.017, -0.314, 0.963, 0.261, 0.326, 0.441]]\nD: [[1.373, -0.033, 0.749, 0.246, 0.609, 0.097]]", "question": "Given a RGB image and a depth image, please detect the 3D bounding box of the paper towel dispenser in the scene. The camera pose information includes: the rotation matrix: [[0.207705, 0.494542, -0.843971], [0.97739, -0.069996, 0.199524], [0.039599, -0.866331, -0.497898]]; the translation vector: [4.53083, 2.291093, 1.52739], representing the transformation from the camera coordinate system to the world coordinate system. For each detected object, provide the output in this format, i.e., [x, y, z, x_size, y_size, z_size]. Here, [x, y, z] represents the gravity center of the 3D bounding boxes in the world coordinate system, [x_size, y_size, z_size] represents the width, height, and length of the 3D bounding box.", "context": "Your task is to detect objects in 3D space using a scan of RGB-Depth image pair. 
\nSelect from the following choices.\nA: [[0.728, -0.216, 1.391, -0.233, 0.319, 0.888]]\nB: [[1.382, -0.434, 1.41, 0.62, 0.036, 0.847]]\nC: [[1.017, -0.314, 0.963, 0.261, 0.326, 0.441]]\nD: [[1.373, -0.033, 0.749, 0.246, 0.609, 0.097]]", "input_image_path": ["./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_199_0.jpg", "./3D-spatial/threeD_Object_Detection/threeD_Object_Detection_199_1.png"], "output": "C", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/threeD_Object_Tracking/qwen3-vl/metadata_info.json b/results/threeD_Object_Tracking/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..bc22800 --- /dev/null +++ b/results/threeD_Object_Tracking/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[755.199, 1687.366, 0.912], [762.788, 1426.72, 1.06], [630.862, 1571.41, 1.003], [798.666, 1466.0, 0.68]]\nB: [[752.983, 1266.122, 0.837], [675.965, 1325.79, 0.95], [756.034, 1628.64, 0.801], [696.028, 1386.4, 0.67]]\nC: [[753.288, 1465.266, 0.978], [728.298, 1787.05, 0.81], [812.921, 1600.32, 0.911], [834.531, 1762.1, 0.91]]\nD: [[705.473, 1565.779, 0.995], [702.703, 1568.02, 0.92], [699.933, 1570.26, 0.845], [697.471, 1572.4, 0.77]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[755.199, 1687.366, 0.912], [762.788, 1426.72, 1.06], [630.862, 1571.41, 1.003], [798.666, 1466.0, 0.68]]\nB: [[752.983, 1266.122, 0.837], [675.965, 1325.79, 0.95], [756.034, 1628.64, 0.801], [696.028, 1386.4, 0.67]]\nC: [[753.288, 1465.266, 0.978], [728.298, 1787.05, 0.81], [812.921, 1600.32, 0.911], [834.531, 1762.1, 0.91]]\nD: [[705.473, 1565.779, 0.995], [702.703, 1568.02, 0.92], [699.933, 1570.26, 0.845], [697.471, 1572.4, 0.77]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_0_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_0_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_0_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_0_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_0_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_0_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_0_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_0_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1779.824, 2603.51, 0.357], [1779.617, 2603.65, 0.307], [1779.419, 2603.795, 0.441], [1779.221, 2603.94, 0.574]]\nB: [[1820.656, 2604.08, 0.355], [1608.069, 2300.22, 0.346], [1590.874, 2776.0, 0.366], [1586.173, 2790.75, 0.602]]\nC: [[2053.203, 2562.85, 0.348], [1922.673, 2150.26, 0.297], [1762.465, 2275.213, 0.516], [1794.318, 2966.29, 0.652]]\nD: [[1676.53, 2378.45, 0.304], [1630.8, 2506.41, 0.34], [1460.959, 2537.73, 0.431], [1807.291, 2750.98, 0.686]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. 
The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1779.824, 2603.51, 0.357], [1779.617, 2603.65, 0.307], [1779.419, 2603.795, 0.441], [1779.221, 2603.94, 0.574]]\nB: [[1820.656, 2604.08, 0.355], [1608.069, 2300.22, 0.346], [1590.874, 2776.0, 0.366], [1586.173, 2790.75, 0.602]]\nC: [[2053.203, 2562.85, 0.348], [1922.673, 2150.26, 0.297], [1762.465, 2275.213, 0.516], [1794.318, 2966.29, 0.652]]\nD: [[1676.53, 2378.45, 0.304], [1630.8, 2506.41, 0.34], [1460.959, 2537.73, 0.431], [1807.291, 2750.98, 0.686]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_1_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_1_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_1_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_1_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_1_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_1_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_1_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_1_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[648.721, 1650.064, 0.332], [648.899, 1649.775, 0.623], [649.829, 1649.485, 1.045], [649.829, 1649.485, 1.07]]\nB: [[652.771, 1330.238, 0.27], [755.559, 1907.786, 0.731], [646.182, 1892.589, 1.216], [597.495, 1779.123, 0.96]]\nC: [[699.141, 1374.83, 0.288], [751.036, 1823.862, 0.739], [640.56, 1789.673, 1.201], [595.069, 1390.425, 1.03]]\nD: [[747.646, 1793.494, 0.307], [651.728, 1395.546, 0.51], 
[557.034, 1729.201, 1.22], [743.254, 1745.25, 1.28]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[648.721, 1650.064, 0.332], [648.899, 1649.775, 0.623], [649.829, 1649.485, 1.045], [649.829, 1649.485, 1.07]]\nB: [[652.771, 1330.238, 0.27], [755.559, 1907.786, 0.731], [646.182, 1892.589, 1.216], [597.495, 1779.123, 0.96]]\nC: [[699.141, 1374.83, 0.288], [751.036, 1823.862, 0.739], [640.56, 1789.673, 1.201], [595.069, 1390.425, 1.03]]\nD: [[747.646, 1793.494, 0.307], [651.728, 1395.546, 0.51], [557.034, 1729.201, 1.22], [743.254, 1745.25, 1.28]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_2_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_2_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_2_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_2_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_2_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_2_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_2_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_2_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[372.341, 646.643, 0.41], [323.457, 728.14, 0.355], [328.402, 680.116, 0.356], [304.89, 
638.729, 0.37]]\nB: [[374.71, 547.041, 0.452], [266.865, 747.941, 0.359], [360.504, 710.201, 0.414], [289.281, 637.508, 0.34]]\nC: [[324.105, 664.423, 0.389], [324.125, 664.423, 0.395], [324.145, 664.423, 0.402], [324.165, 664.423, 0.409]]\nD: [[382.975, 542.454, 0.448], [273.435, 575.926, 0.36], [306.415, 582.477, 0.37], [367.698, 624.849, 0.412]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[372.341, 646.643, 0.41], [323.457, 728.14, 0.355], [328.402, 680.116, 0.356], [304.89, 638.729, 0.37]]\nB: [[374.71, 547.041, 0.452], [266.865, 747.941, 0.359], [360.504, 710.201, 0.414], [289.281, 637.508, 0.34]]\nC: [[324.105, 664.423, 0.389], [324.125, 664.423, 0.395], [324.145, 664.423, 0.402], [324.165, 664.423, 0.409]]\nD: [[382.975, 542.454, 0.448], [273.435, 575.926, 0.36], [306.415, 582.477, 0.37], [367.698, 624.849, 0.412]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_3_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_3_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_3_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_3_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_3_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_3_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_3_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_3_7.png"], "output": "C", 
"qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[319.582, 1213.1, 0.433], [414.088, 1032.0, 0.628], [421.328, 1137.51, 0.496], [344.955, 1253.44, 0.638]]\nB: [[363.433, 1098.33, 0.529], [363.433, 1098.33, 0.564], [363.433, 1098.33, 0.599], [363.433, 1098.33, 0.634]]\nC: [[310.015, 1243.97, 0.462], [343.153, 1122.0, 0.606], [333.209, 1019.58, 0.517], [431.855, 1307.51, 0.556]]\nD: [[300.468, 996.48, 0.537], [331.062, 1300.52, 0.537], [400.879, 1176.8, 0.602], [389.732, 1170.04, 0.637]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[319.582, 1213.1, 0.433], [414.088, 1032.0, 0.628], [421.328, 1137.51, 0.496], [344.955, 1253.44, 0.638]]\nB: [[363.433, 1098.33, 0.529], [363.433, 1098.33, 0.564], [363.433, 1098.33, 0.599], [363.433, 1098.33, 0.634]]\nC: [[310.015, 1243.97, 0.462], [343.153, 1122.0, 0.606], [333.209, 1019.58, 0.517], [431.855, 1307.51, 0.556]]\nD: [[300.468, 996.48, 0.537], [331.062, 1300.52, 0.537], [400.879, 1176.8, 0.602], [389.732, 1170.04, 0.637]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_4_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_4_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_4_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_4_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_4_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_4_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_4_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_4_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[393.191, 899.659, 0.591], [332.44, 1277.512, 0.54], [378.779, 1199.743, 0.483], [388.415, 1186.22, 0.761]]\nB: [[373.967, 1296.428, 0.56], [468.08, 1301.812, 0.52], [423.341, 1242.289, 0.478], [463.453, 1026.04, 0.769]]\nC: [[396.335, 1122.142, 0.513], [395.62, 1122.119, 0.55], [394.907, 1122.104, 0.586], [392.701, 1122.16, 0.734]]\nD: [[366.604, 1119.109, 0.592], [355.44, 1130.172, 0.57], [469.284, 957.093, 0.569], [384.2, 1040.44, 0.813]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. 
The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[393.191, 899.659, 0.591], [332.44, 1277.512, 0.54], [378.779, 1199.743, 0.483], [388.415, 1186.22, 0.761]]\nB: [[373.967, 1296.428, 0.56], [468.08, 1301.812, 0.52], [423.341, 1242.289, 0.478], [463.453, 1026.04, 0.769]]\nC: [[396.335, 1122.142, 0.513], [395.62, 1122.119, 0.55], [394.907, 1122.104, 0.586], [392.701, 1122.16, 0.734]]\nD: [[366.604, 1119.109, 0.592], [355.44, 1130.172, 0.57], [469.284, 957.093, 0.569], [384.2, 1040.44, 0.813]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_5_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_5_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_5_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_5_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_5_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_5_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_5_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_5_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1912.796, 2415.138, 0.226], [1612.76, 2000.136, 0.469], [1650.05, 2082.715, 0.705], [1870.661, 2666.852, 0.889]]\nB: [[2044.921, 2427.821, 0.251], [2197.918, 2811.408, 0.435], [1594.209, 2091.568, 0.541], [1595.884, 2911.557, 0.739]]\nC: [[1855.648, 2492.891, 0.267], [1855.098, 2493.555, 0.467], [1854.597, 2494.197, 0.634], [1854.096, 2494.841, 0.801]]\nD: [[1651.93, 2405.938, 0.246], [2153.625, 2215.89, 
0.442], [1530.771, 2046.654, 0.746], [2201.19, 2084.755, 0.722]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1912.796, 2415.138, 0.226], [1612.76, 2000.136, 0.469], [1650.05, 2082.715, 0.705], [1870.661, 2666.852, 0.889]]\nB: [[2044.921, 2427.821, 0.251], [2197.918, 2811.408, 0.435], [1594.209, 2091.568, 0.541], [1595.884, 2911.557, 0.739]]\nC: [[1855.648, 2492.891, 0.267], [1855.098, 2493.555, 0.467], [1854.597, 2494.197, 0.634], [1854.096, 2494.841, 0.801]]\nD: [[1651.93, 2405.938, 0.246], [2153.625, 2215.89, 0.442], [1530.771, 2046.654, 0.746], [2201.19, 2084.755, 0.722]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_6_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_6_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_6_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_6_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_6_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_6_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_6_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_6_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1835.457, 2530.979, -0.6], [1831.738, 2535.381, -0.475], 
[1828.016, 2539.789, -0.35], [1823.826, 2544.548, -0.226]]\nB: [[1728.159, 2657.767, -0.6], [1671.146, 2191.293, -0.456], [1889.85, 2711.258, -0.39], [1500.543, 2142.17, -0.266]]\nC: [[1868.34, 2656.949, -0.6], [1847.319, 3027.849, -0.442], [1621.372, 2206.666, -0.29], [1944.205, 2824.5, -0.259]]\nD: [[1798.206, 2853.486, -0.5], [1737.945, 2982.299, -0.415], [1782.37, 2464.903, -0.33], [2009.484, 2271.222, -0.188]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1835.457, 2530.979, -0.6], [1831.738, 2535.381, -0.475], [1828.016, 2539.789, -0.35], [1823.826, 2544.548, -0.226]]\nB: [[1728.159, 2657.767, -0.6], [1671.146, 2191.293, -0.456], [1889.85, 2711.258, -0.39], [1500.543, 2142.17, -0.266]]\nC: [[1868.34, 2656.949, -0.6], [1847.319, 3027.849, -0.442], [1621.372, 2206.666, -0.29], [1944.205, 2824.5, -0.259]]\nD: [[1798.206, 2853.486, -0.5], [1737.945, 2982.299, -0.415], [1782.37, 2464.903, -0.33], [2009.484, 2271.222, -0.188]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_7_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_7_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_7_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_7_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_7_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_7_5.png", 
"./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_7_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_7_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[2091.918, 2820.157, -0.699], [2085.742, 2471.429, -0.7], [1560.979, 2272.985, -0.534], [1615.9, 2272.87, -0.391]]\nB: [[1807.911, 2559.964, -0.854], [1804.558, 2563.859, -0.725], [1801.201, 2567.758, -0.596], [1797.7, 2572.03, -0.433]]\nC: [[2128.41, 2627.282, -0.79], [1547.739, 2837.704, -0.791], [1686.195, 2104.816, -0.492], [1645.0, 2561.72, -0.364]]\nD: [[1649.251, 2758.133, -0.686], [1533.206, 2890.142, -0.825], [2007.154, 2531.762, -0.478], [2127.3, 2070.45, -0.347]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[2091.918, 2820.157, -0.699], [2085.742, 2471.429, -0.7], [1560.979, 2272.985, -0.534], [1615.9, 2272.87, -0.391]]\nB: [[1807.911, 2559.964, -0.854], [1804.558, 2563.859, -0.725], [1801.201, 2567.758, -0.596], [1797.7, 2572.03, -0.433]]\nC: [[2128.41, 2627.282, -0.79], [1547.739, 2837.704, -0.791], [1686.195, 2104.816, -0.492], [1645.0, 2561.72, -0.364]]\nD: [[1649.251, 2758.133, -0.686], [1533.206, 2890.142, -0.825], [2007.154, 2531.762, -0.478], [2127.3, 2070.45, -0.347]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_8_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_8_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_8_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_8_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_8_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_8_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_8_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_8_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[437.202, 1086.964, 0.692], [437.221, 1087.01, 0.817], [437.244, 1087.066, 0.842], [437.244, 1087.066, 0.842]]\nB: [[357.432, 1159.623, 0.607], [351.412, 1296.28, 0.836], [516.977, 1219.588, 0.769], [425.277, 1005.318, 0.772]]\nC: [[520.991, 1274.564, 0.812], [478.068, 1065.93, 0.705], [398.533, 912.914, 0.73], [470.356, 1123.201, 0.712]]\nD: [[377.562, 951.154, 0.715], [472.017, 932.55, 0.727], [361.039, 1097.241, 0.701], [508.246, 1284.882, 0.804]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[437.202, 1086.964, 0.692], [437.221, 1087.01, 0.817], [437.244, 1087.066, 0.842], [437.244, 1087.066, 0.842]]\nB: [[357.432, 1159.623, 0.607], [351.412, 1296.28, 0.836], [516.977, 1219.588, 0.769], [425.277, 1005.318, 0.772]]\nC: [[520.991, 1274.564, 0.812], [478.068, 1065.93, 0.705], [398.533, 912.914, 0.73], [470.356, 1123.201, 0.712]]\nD: [[377.562, 951.154, 0.715], [472.017, 932.55, 0.727], [361.039, 1097.241, 0.701], [508.246, 1284.882, 0.804]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_9_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_9_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_9_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_9_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_9_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_9_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_9_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_9_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[506.527, 1099.076, 0.589], [420.185, 1298.755, 0.489], [362.914, 1033.947, 0.414], [396.859, 1002.99, 0.306]]\nB: [[424.014, 1100.606, 0.706], [424.133, 1100.728, 0.496], [424.173, 1100.769, 0.426], [424.212, 1100.81, 0.306]]\nC: [[456.889, 932.553, 0.793], [391.51, 1069.937, 0.527], [431.845, 933.545, 0.5], 
[394.898, 1320.05, 0.264]]\nD: [[378.115, 1221.413, 0.672], [347.816, 1131.373, 0.529], [364.847, 1229.038, 0.466], [397.183, 1091.0, 0.25]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[506.527, 1099.076, 0.589], [420.185, 1298.755, 0.489], [362.914, 1033.947, 0.414], [396.859, 1002.99, 0.306]]\nB: [[424.014, 1100.606, 0.706], [424.133, 1100.728, 0.496], [424.173, 1100.769, 0.426], [424.212, 1100.81, 0.306]]\nC: [[456.889, 932.553, 0.793], [391.51, 1069.937, 0.527], [431.845, 933.545, 0.5], [394.898, 1320.05, 0.264]]\nD: [[378.115, 1221.413, 0.672], [347.816, 1131.373, 0.529], [364.847, 1229.038, 0.466], [397.183, 1091.0, 0.25]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_10_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_10_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_10_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_10_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_10_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_10_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_10_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_10_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: 
[[420.731, 1013.531, 0.8], [361.227, 1037.43, 0.587], [485.796, 1006.664, 0.647], [418.217, 1072.225, 0.587]]\nB: [[374.363, 1267.963, 0.71], [402.578, 1232.818, 0.668], [434.034, 921.569, 0.52], [421.208, 1297.52, 0.506]]\nC: [[425.982, 1091.597, 0.73], [425.994, 1091.597, 0.733], [426.028, 1091.597, 0.541], [426.039, 1091.597, 0.619]]\nD: [[468.986, 997.688, 0.61], [441.053, 1239.106, 0.742], [435.348, 1170.376, 0.513], [358.562, 1151.219, 0.672]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[420.731, 1013.531, 0.8], [361.227, 1037.43, 0.587], [485.796, 1006.664, 0.647], [418.217, 1072.225, 0.587]]\nB: [[374.363, 1267.963, 0.71], [402.578, 1232.818, 0.668], [434.034, 921.569, 0.52], [421.208, 1297.52, 0.506]]\nC: [[425.982, 1091.597, 0.73], [425.994, 1091.597, 0.733], [426.028, 1091.597, 0.541], [426.039, 1091.597, 0.619]]\nD: [[468.986, 997.688, 0.61], [441.053, 1239.106, 0.742], [435.348, 1170.376, 0.513], [358.562, 1151.219, 0.672]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_11_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_11_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_11_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_11_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_11_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_11_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_11_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_11_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[2016.542, 845.631, 1.13], [2028.77, 874.497, 1.083], [1971.835, 957.221, 1.15], [1681.888, 953.919, 1.139]]\nB: [[1978.335, 863.179, 0.943], [1978.33, 863.187, 1.065], [1978.325, 863.194, 1.015], [1978.319, 863.201, 0.965]]\nC: [[1640.806, 1002.654, 1.092], [2125.94, 982.727, 1.09], [1765.046, 957.217, 1.116], [2264.988, 900.054, 0.911]]\nD: [[1688.119, 734.16, 0.877], [1887.56, 864.137, 1.092], [2139.033, 980.382, 1.191], [1969.445, 813.79, 0.775]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[2016.542, 845.631, 1.13], [2028.77, 874.497, 1.083], [1971.835, 957.221, 1.15], [1681.888, 953.919, 1.139]]\nB: [[1978.335, 863.179, 0.943], [1978.33, 863.187, 1.065], [1978.325, 863.194, 1.015], [1978.319, 863.201, 0.965]]\nC: [[1640.806, 1002.654, 1.092], [2125.94, 982.727, 1.09], [1765.046, 957.217, 1.116], [2264.988, 900.054, 0.911]]\nD: [[1688.119, 734.16, 0.877], [1887.56, 864.137, 1.092], [2139.033, 980.382, 1.191], [1969.445, 813.79, 0.775]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_12_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_12_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_12_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_12_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_12_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_12_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_12_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_12_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[639.138, 1624.989, -0.086], [636.359, 1627.431, -0.053], [632.807, 1630.318, 0.08], [629.158, 1633.096, 0.314]]\nB: [[543.626, 1367.896, -0.075], [653.208, 1574.861, -0.054], [757.25, 1346.07, 0.08], [540.23, 1650.674, 0.362]]\nC: [[537.409, 1426.609, -0.082], [626.472, 1686.779, -0.051], [691.803, 
1387.102, 0.07], [744.081, 1369.746, 0.365]]\nD: [[557.32, 1516.073, -0.08], [526.841, 1596.276, -0.06], [611.464, 1793.408, 0.1], [674.543, 1593.857, 0.364]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[639.138, 1624.989, -0.086], [636.359, 1627.431, -0.053], [632.807, 1630.318, 0.08], [629.158, 1633.096, 0.314]]\nB: [[543.626, 1367.896, -0.075], [653.208, 1574.861, -0.054], [757.25, 1346.07, 0.08], [540.23, 1650.674, 0.362]]\nC: [[537.409, 1426.609, -0.082], [626.472, 1686.779, -0.051], [691.803, 1387.102, 0.07], [744.081, 1369.746, 0.365]]\nD: [[557.32, 1516.073, -0.08], [526.841, 1596.276, -0.06], [611.464, 1793.408, 0.1], [674.543, 1593.857, 0.364]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_13_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_13_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_13_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_13_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_13_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_13_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_13_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_13_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[407.887, 1163.323, 0.511], [407.929, 1163.41, 0.511], [407.934, 1163.409, 0.524], [407.951, 1163.403, 0.537]]\nB: [[388.853, 1125.736, 0.56], [434.747, 1231.09, 0.419], [348.138, 1361.198, 0.597], [328.283, 1154.348, 0.58]]\nC: [[374.741, 1227.419, 0.46], [461.986, 1151.55, 0.428], [486.887, 1127.556, 0.491], [354.147, 1359.889, 0.505]]\nD: [[471.139, 1113.037, 0.544], [333.263, 956.23, 0.501], [355.318, 1217.053, 0.538], [456.915, 1087.324, 0.512]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[407.887, 1163.323, 0.511], [407.929, 1163.41, 0.511], [407.934, 1163.409, 0.524], [407.951, 1163.403, 0.537]]\nB: [[388.853, 1125.736, 0.56], [434.747, 1231.09, 0.419], [348.138, 1361.198, 0.597], [328.283, 1154.348, 0.58]]\nC: [[374.741, 1227.419, 0.46], [461.986, 1151.55, 0.428], [486.887, 1127.556, 0.491], [354.147, 1359.889, 0.505]]\nD: [[471.139, 1113.037, 0.544], [333.263, 956.23, 0.501], [355.318, 1217.053, 0.538], [456.915, 1087.324, 0.512]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_14_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_14_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_14_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_14_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_14_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_14_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_14_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_14_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1269.546, 1024.852, 1.042], [1269.744, 1025.178, 1.042], [1270.216, 1025.754, 0.992], [1270.837, 1026.506, 1.042]]\nB: [[1423.653, 1173.455, 1.097], [1300.351, 866.909, 0.934], [1179.097, 946.025, 1.104], [1411.454, 1138.532, 1.187]]\nC: [[1145.602, 896.06, 1.073], [1144.171, 966.324, 1.002], [1499.487, 1042.061, 0.91], [1482.233, 956.251, 1.138]]\nD: [[1137.684, 944.23, 0.905], [1316.46, 1218.835, 0.861], [1509.763, 1193.692, 1.048], [1361.774, 1108.409, 0.891]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1269.546, 1024.852, 1.042], [1269.744, 1025.178, 1.042], [1270.216, 1025.754, 0.992], [1270.837, 1026.506, 1.042]]\nB: [[1423.653, 1173.455, 1.097], [1300.351, 866.909, 0.934], [1179.097, 946.025, 1.104], [1411.454, 1138.532, 1.187]]\nC: [[1145.602, 896.06, 1.073], [1144.171, 966.324, 1.002], [1499.487, 1042.061, 0.91], [1482.233, 956.251, 1.138]]\nD: [[1137.684, 944.23, 0.905], [1316.46, 1218.835, 0.861], [1509.763, 1193.692, 1.048], [1361.774, 1108.409, 0.891]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_15_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_15_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_15_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_15_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_15_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_15_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_15_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_15_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1251.433, 1108.948, 0.433], [1176.759, 1115.714, 0.456], [1227.53, 991.616, 0.633], [1095.585, 1183.286, 0.618]]\nB: [[1509.989, 949.628, 0.539], [1350.384, 1212.22, 0.56], [1071.64, 893.308, 0.484], [1153.706, 1063.833, 0.645]]\nC: [[1298.993, 1034.258, 0.529], [1299.542, 1034.749, 
0.554], [1300.09, 1035.239, 0.579], [1300.639, 1035.729, 0.604]]\nD: [[1378.947, 975.996, 0.598], [1493.813, 900.58, 0.493], [1370.14, 1033.836, 0.656], [1047.788, 1106.271, 0.659]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1251.433, 1108.948, 0.433], [1176.759, 1115.714, 0.456], [1227.53, 991.616, 0.633], [1095.585, 1183.286, 0.618]]\nB: [[1509.989, 949.628, 0.539], [1350.384, 1212.22, 0.56], [1071.64, 893.308, 0.484], [1153.706, 1063.833, 0.645]]\nC: [[1298.993, 1034.258, 0.529], [1299.542, 1034.749, 0.554], [1300.09, 1035.239, 0.579], [1300.639, 1035.729, 0.604]]\nD: [[1378.947, 975.996, 0.598], [1493.813, 900.58, 0.493], [1370.14, 1033.836, 0.656], [1047.788, 1106.271, 0.659]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_16_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_16_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_16_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_16_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_16_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_16_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_16_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_16_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", 
"source": "NuScenes_threed_Object_Tracking", "options": "A: [[306.186, 763.667, 1.488], [378.546, 697.6, 1.528], [320.79, 550.53, 1.74], [377.634, 523.623, 1.596]]\nB: [[387.559, 726.167, 1.211], [356.987, 561.8, 1.228], [377.54, 655.25, 1.8], [372.07, 602.526, 1.352]]\nC: [[392.768, 743.908, 1.542], [292.481, 723.4, 1.31], [330.74, 682.85, 1.79], [283.31, 638.538, 1.433]]\nD: [[348.147, 646.209, 1.444], [348.144, 646.2, 1.482], [348.14, 646.19, 1.52], [348.137, 646.181, 1.559]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[306.186, 763.667, 1.488], [378.546, 697.6, 1.528], [320.79, 550.53, 1.74], [377.634, 523.623, 1.596]]\nB: [[387.559, 726.167, 1.211], [356.987, 561.8, 1.228], [377.54, 655.25, 1.8], [372.07, 602.526, 1.352]]\nC: [[392.768, 743.908, 1.542], [292.481, 723.4, 1.31], [330.74, 682.85, 1.79], [283.31, 638.538, 1.433]]\nD: [[348.147, 646.209, 1.444], [348.144, 646.2, 1.482], [348.14, 646.19, 1.52], [348.137, 646.181, 1.559]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_17_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_17_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_17_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_17_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_17_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_17_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_17_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_17_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1706.39, 1019.22, 0.455], [2191.986, 926.298, 0.316], [1675.02, 886.17, 0.299], [1941.62, 757.75, 0.341]]\nB: [[2247.24, 737.46, 0.384], [1527.442, 724.25, 0.347], [1575.02, 976.52, 0.327], [1630.08, 842.33, 0.316]]\nC: [[2075.96, 1012.24, 0.409], [1869.437, 795.581, 0.371], [2223.74, 1044.39, 0.397], [1567.73, 972.01, 0.379]]\nD: [[1895.77, 878.51, 0.433], [1895.672, 878.506, 0.338], [1895.77, 878.51, 0.343], [1895.77, 878.51, 0.393]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. 
The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1706.39, 1019.22, 0.455], [2191.986, 926.298, 0.316], [1675.02, 886.17, 0.299], [1941.62, 757.75, 0.341]]\nB: [[2247.24, 737.46, 0.384], [1527.442, 724.25, 0.347], [1575.02, 976.52, 0.327], [1630.08, 842.33, 0.316]]\nC: [[2075.96, 1012.24, 0.409], [1869.437, 795.581, 0.371], [2223.74, 1044.39, 0.397], [1567.73, 972.01, 0.379]]\nD: [[1895.77, 878.51, 0.433], [1895.672, 878.506, 0.338], [1895.77, 878.51, 0.343], [1895.77, 878.51, 0.393]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_18_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_18_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_18_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_18_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_18_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_18_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_18_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_18_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[2234.916, 722.86, 0.39], [1901.638, 1017.1, 0.487], [1734.516, 780.849, 0.344], [1885.643, 867.521, 0.263]]\nB: [[1568.94, 897.301, 0.449], [2000.828, 702.741, 0.446], [1573.358, 1014.024, 0.477], [1578.275, 964.592, 0.265]]\nC: [[2141.663, 908.252, 0.394], [1802.749, 988.498, 0.349], [1873.147, 986.016, 0.413], [2189.02, 894.117, 0.265]]\nD: [[1895.727, 877.737, 0.418], [1895.727, 877.737, 0.418], 
[1895.727, 877.737, 0.418], [1895.716, 877.802, 0.292]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[2234.916, 722.86, 0.39], [1901.638, 1017.1, 0.487], [1734.516, 780.849, 0.344], [1885.643, 867.521, 0.263]]\nB: [[1568.94, 897.301, 0.449], [2000.828, 702.741, 0.446], [1573.358, 1014.024, 0.477], [1578.275, 964.592, 0.265]]\nC: [[2141.663, 908.252, 0.394], [1802.749, 988.498, 0.349], [1873.147, 986.016, 0.413], [2189.02, 894.117, 0.265]]\nD: [[1895.727, 877.737, 0.418], [1895.727, 877.737, 0.418], [1895.727, 877.737, 0.418], [1895.716, 877.802, 0.292]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_19_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_19_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_19_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_19_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_19_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_19_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_19_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_19_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[336.27, 647.992, 0.436], [346.74, 708.566, 0.649], [354.42, 746.112, 0.69], 
[376.74, 611.59, 0.61]]\nB: [[340.58, 661.842, 0.526], [340.58, 661.842, 0.576], [340.58, 661.842, 0.626], [340.58, 661.842, 0.676]]\nC: [[387.54, 767.29, 0.509], [330.38, 600.327, 0.526], [387.34, 562.731, 0.738], [287.65, 743.046, 0.73]]\nD: [[347.27, 591.306, 0.458], [329.15, 678.06, 0.571], [380.55, 710.329, 0.52], [408.38, 545.098, 0.802]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[336.27, 647.992, 0.436], [346.74, 708.566, 0.649], [354.42, 746.112, 0.69], [376.74, 611.59, 0.61]]\nB: [[340.58, 661.842, 0.526], [340.58, 661.842, 0.576], [340.58, 661.842, 0.626], [340.58, 661.842, 0.676]]\nC: [[387.54, 767.29, 0.509], [330.38, 600.327, 0.526], [387.34, 562.731, 0.738], [287.65, 743.046, 0.73]]\nD: [[347.27, 591.306, 0.458], [329.15, 678.06, 0.571], [380.55, 710.329, 0.52], [408.38, 545.098, 0.802]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_20_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_20_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_20_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_20_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_20_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_20_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_20_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_20_7.png"], "output": "B", 
"qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[641.894, 1481.081, -0.116], [755.139, 1696.093, 0.085], [744.337, 1645.874, -0.021], [549.883, 1475.291, 0.091]]\nB: [[609.159, 1822.97, -0.114], [725.77, 1759.652, 0.076], [541.265, 1644.526, -0.022], [634.034, 1389.951, 0.08]]\nC: [[639.585, 1606.675, -0.122], [640.106, 1606.245, 0.078], [640.626, 1605.815, -0.022], [641.147, 1605.384, 0.078]]\nD: [[553.206, 1422.477, -0.138], [630.222, 1490.963, 0.087], [720.491, 1414.036, -0.022], [698.708, 1478.6, 0.08]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[641.894, 1481.081, -0.116], [755.139, 1696.093, 0.085], [744.337, 1645.874, -0.021], [549.883, 1475.291, 0.091]]\nB: [[609.159, 1822.97, -0.114], [725.77, 1759.652, 0.076], [541.265, 1644.526, -0.022], [634.034, 1389.951, 0.08]]\nC: [[639.585, 1606.675, -0.122], [640.106, 1606.245, 0.078], [640.626, 1605.815, -0.022], [641.147, 1605.384, 0.078]]\nD: [[553.206, 1422.477, -0.138], [630.222, 1490.963, 0.087], [720.491, 1414.036, -0.022], [698.708, 1478.6, 0.08]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_21_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_21_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_21_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_21_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_21_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_21_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_21_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_21_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1654.688, 731.801, 1.203], [1825.149, 800.76, 1.006], [1536.825, 955.686, 1.262], [2011.454, 920.864, 1.228]]\nB: [[1716.132, 870.368, 1.137], [1714.324, 869.208, 1.137], [1712.096, 868.352, 1.187], [1709.574, 867.934, 1.232]]\nC: [[1523.418, 951.06, 0.924], [1452.823, 761.345, 1.206], [2023.787, 900.571, 0.99], [1938.184, 774.207, 1.182]]\nD: [[1653.54, 790.02, 1.21], [1790.64, 885.935, 1.33], [1634.81, 909.54, 1.184], [1807.277, 934.183, 1.469]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1654.688, 731.801, 1.203], [1825.149, 800.76, 1.006], [1536.825, 955.686, 1.262], [2011.454, 920.864, 1.228]]\nB: [[1716.132, 870.368, 1.137], [1714.324, 869.208, 1.137], [1712.096, 868.352, 1.187], [1709.574, 867.934, 1.232]]\nC: [[1523.418, 951.06, 0.924], [1452.823, 761.345, 1.206], [2023.787, 900.571, 0.99], [1938.184, 774.207, 1.182]]\nD: [[1653.54, 790.02, 1.21], [1790.64, 885.935, 1.33], [1634.81, 909.54, 1.184], [1807.277, 934.183, 1.469]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_22_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_22_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_22_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_22_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_22_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_22_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_22_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_22_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1943.293, 1014.905, 2.05], [1962.947, 881.292, 1.464], [1771.641, 818.432, 1.783], [2024.383, 893.384, 1.855]]\nB: [[1842.723, 879.95, 1.901], [2117.17, 1006.474, 1.903], [1573.854, 942.118, 1.735], [2097.928, 1012.432, 1.953]]\nC: [[1897.834, 865.209, 1.738], [1897.834, 865.195, 1.688], [1897.833, 
865.116, 1.688], [1897.831, 865.001, 1.688]]\nD: [[1801.762, 704.249, 1.493], [1762.225, 848.144, 1.446], [1867.693, 770.539, 1.836], [2098.827, 762.104, 1.81]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1943.293, 1014.905, 2.05], [1962.947, 881.292, 1.464], [1771.641, 818.432, 1.783], [2024.383, 893.384, 1.855]]\nB: [[1842.723, 879.95, 1.901], [2117.17, 1006.474, 1.903], [1573.854, 942.118, 1.735], [2097.928, 1012.432, 1.953]]\nC: [[1897.834, 865.209, 1.738], [1897.834, 865.195, 1.688], [1897.833, 865.116, 1.688], [1897.831, 865.001, 1.688]]\nD: [[1801.762, 704.249, 1.493], [1762.225, 848.144, 1.446], [1867.693, 770.539, 1.836], [2098.827, 762.104, 1.81]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_23_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_23_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_23_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_23_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_23_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_23_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_23_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_23_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[251.423, 613.532, -0.224], [353.651, 580.245, -0.187], [307.419, 820.39, -0.19], [319.555, 661.929, -0.115]]\nB: [[288.517, 703.944, -0.206], [287.575, 632.764, -0.222], [372.62, 616.315, -0.154], [261.943, 809.962, -0.108]]\nC: [[279.61, 776.103, -0.238], [372.908, 643.544, -0.172], [347.733, 585.413, -0.159], [339.729, 666.886, -0.117]]\nD: [[311.976, 694.922, -0.216], [311.533, 694.408, -0.203], [311.103, 693.883, -0.191], [309.589, 691.756, -0.099]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[251.423, 613.532, -0.224], [353.651, 580.245, -0.187], [307.419, 820.39, -0.19], [319.555, 661.929, -0.115]]\nB: [[288.517, 703.944, -0.206], [287.575, 632.764, -0.222], [372.62, 616.315, -0.154], [261.943, 809.962, -0.108]]\nC: [[279.61, 776.103, -0.238], [372.908, 643.544, -0.172], [347.733, 585.413, -0.159], [339.729, 666.886, -0.117]]\nD: [[311.976, 694.922, -0.216], [311.533, 694.408, -0.203], [311.103, 693.883, -0.191], [309.589, 691.756, -0.099]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_24_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_24_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_24_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_24_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_24_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_24_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_24_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_24_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[428.593, 999.538, 0.927], [376.399, 1068.754, 0.828], [470.584, 1252.944, 0.961], [513.123, 1108.855, 0.935]]\nB: [[449.491, 963.875, 0.968], [378.432, 1021.223, 1.012], [349.93, 1322.277, 1.125], [411.187, 1019.406, 0.996]]\nC: [[447.511, 981.997, 0.973], [404.158, 1082.968, 0.919], [454.929, 1283.771, 0.917], [471.926, 1109.792, 0.83]]\nD: [[435.351, 1103.132, 0.814], [435.351, 1103.132, 0.964], [435.351, 1103.132, 1.014], [435.351, 1103.132, 0.989]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[428.593, 999.538, 0.927], [376.399, 1068.754, 0.828], [470.584, 1252.944, 0.961], [513.123, 1108.855, 0.935]]\nB: [[449.491, 963.875, 0.968], [378.432, 1021.223, 1.012], [349.93, 1322.277, 1.125], [411.187, 1019.406, 0.996]]\nC: [[447.511, 981.997, 0.973], [404.158, 1082.968, 0.919], [454.929, 1283.771, 0.917], [471.926, 1109.792, 0.83]]\nD: [[435.351, 1103.132, 0.814], [435.351, 1103.132, 0.964], [435.351, 1103.132, 1.014], [435.351, 1103.132, 0.989]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_25_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_25_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_25_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_25_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_25_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_25_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_25_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_25_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1289.156, 997.931, 0.17], [1528.988, 1149.542, 0.135], [1524.67, 1103.565, 0.144], [1254.51, 1059.655, 0.132]]\nB: [[1576.762, 1083.802, 0.16], [1394.53, 1020.578, 0.13], [1145.932, 1107.624, 0.169], [1436.14, 1231.523, 0.156]]\nC: [[1340.124, 1032.575, 0.154], [1340.123, 1032.575, 0.154], 
[1340.121, 1032.574, 0.154], [1340.12, 1032.574, 0.154]]\nD: [[1216.577, 1183.272, 0.123], [1258.5, 1034.393, 0.163], [1273.558, 1228.419, 0.14], [1288.46, 870.176, 0.174]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1289.156, 997.931, 0.17], [1528.988, 1149.542, 0.135], [1524.67, 1103.565, 0.144], [1254.51, 1059.655, 0.132]]\nB: [[1576.762, 1083.802, 0.16], [1394.53, 1020.578, 0.13], [1145.932, 1107.624, 0.169], [1436.14, 1231.523, 0.156]]\nC: [[1340.124, 1032.575, 0.154], [1340.123, 1032.575, 0.154], [1340.121, 1032.574, 0.154], [1340.12, 1032.574, 0.154]]\nD: [[1216.577, 1183.272, 0.123], [1258.5, 1034.393, 0.163], [1273.558, 1228.419, 0.14], [1288.46, 870.176, 0.174]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_26_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_26_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_26_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_26_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_26_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_26_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_26_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_26_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[1255.784, 1121.555, 1.33], [1075.123, 1055.841, 1.22], [1441.444, 1208.639, 1.1], [1537.429, 1076.298, 1.45]]\nB: [[1100.277, 1164.491, 1.56], [1180.448, 1259.127, 1.17], [1475.037, 1060.06, 1.36], [1311.756, 864.536, 1.05]]\nC: [[1328.793, 876.335, 1.12], [1429.236, 996.25, 1.26], [1195.871, 932.001, 1.51], [1480.133, 1028.558, 1.25]]\nD: [[1328.425, 1052.566, 1.31], [1328.425, 1052.566, 1.31], [1328.425, 1052.566, 1.31], [1328.425, 1052.566, 1.31]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1255.784, 1121.555, 1.33], [1075.123, 1055.841, 1.22], [1441.444, 1208.639, 1.1], [1537.429, 1076.298, 1.45]]\nB: [[1100.277, 1164.491, 1.56], [1180.448, 1259.127, 1.17], [1475.037, 1060.06, 1.36], [1311.756, 864.536, 1.05]]\nC: [[1328.793, 876.335, 1.12], [1429.236, 996.25, 1.26], [1195.871, 932.001, 1.51], [1480.133, 1028.558, 1.25]]\nD: [[1328.425, 1052.566, 1.31], [1328.425, 1052.566, 1.31], [1328.425, 1052.566, 1.31], [1328.425, 1052.566, 1.31]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_27_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_27_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_27_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_27_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_27_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_27_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_27_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_27_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1273.894, 1072.524, 0.908], [1273.894, 1072.524, 0.909], [1273.894, 1072.524, 0.911], [1273.893, 1072.523, 0.912]]\nB: [[1252.346, 1105.514, 0.902], [1209.789, 1085.191, 0.984], [1114.268, 935.639, 0.74], [1170.16, 987.263, 0.918]]\nC: [[1108.639, 1162.182, 1.069], [1297.456, 1226.014, 0.862], [1466.955, 1006.358, 0.987], [1135.299, 1250.877, 0.943]]\nD: [[1221.891, 927.735, 0.939], [1126.972, 1155.177, 0.838], [1313.844, 1145.354, 1.042], [1328.412, 1083.367, 0.762]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1273.894, 1072.524, 0.908], [1273.894, 1072.524, 0.909], [1273.894, 1072.524, 0.911], [1273.893, 1072.523, 0.912]]\nB: [[1252.346, 1105.514, 0.902], [1209.789, 1085.191, 0.984], [1114.268, 935.639, 0.74], [1170.16, 987.263, 0.918]]\nC: [[1108.639, 1162.182, 1.069], [1297.456, 1226.014, 0.862], [1466.955, 1006.358, 0.987], [1135.299, 1250.877, 0.943]]\nD: [[1221.891, 927.735, 0.939], [1126.972, 1155.177, 0.838], [1313.844, 1145.354, 1.042], [1328.412, 1083.367, 0.762]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_28_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_28_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_28_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_28_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_28_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_28_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_28_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_28_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[522.342, 1943.251, 0.31], [505.853, 1457.715, 0.375], [513.011, 1502.032, 0.633], [529.48, 1609.413, 0.729]]\nB: [[626.523, 1972.698, 0.374], [529.275, 1724.592, 0.459], [517.251, 1365.431, 0.651], [714.07, 1806.899, 0.579]]\nC: [[576.087, 1806.167, 0.315], [734.652, 1339.382, 
0.394], [725.143, 1697.177, 0.608], [592.16, 1326.812, 0.692]]\nD: [[622.249, 1646.081, 0.321], [621.683, 1646.405, 0.446], [621.109, 1646.715, 0.571], [620.64, 1647.021, 0.721]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[522.342, 1943.251, 0.31], [505.853, 1457.715, 0.375], [513.011, 1502.032, 0.633], [529.48, 1609.413, 0.729]]\nB: [[626.523, 1972.698, 0.374], [529.275, 1724.592, 0.459], [517.251, 1365.431, 0.651], [714.07, 1806.899, 0.579]]\nC: [[576.087, 1806.167, 0.315], [734.652, 1339.382, 0.394], [725.143, 1697.177, 0.608], [592.16, 1326.812, 0.692]]\nD: [[622.249, 1646.081, 0.321], [621.683, 1646.405, 0.446], [621.109, 1646.715, 0.571], [620.64, 1647.021, 0.721]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_29_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_29_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_29_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_29_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_29_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_29_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_29_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_29_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[1796.561, 874.996, 1.254], [1796.561, 874.982, 1.216], [1796.561, 874.969, 1.182], [1796.561, 874.957, 1.151]]\nB: [[1829.822, 1005.261, 1.194], [2129.106, 967.913, 1.335], [1439.644, 885.763, 1.155], [2034.051, 719.497, 0.987]]\nC: [[2134.229, 737.814, 1.149], [1953.993, 1047.896, 1.349], [1612.579, 940.305, 1.146], [1599.447, 982.485, 1.365]]\nD: [[1699.287, 941.961, 1.224], [1590.817, 729.191, 1.195], [1711.432, 908.722, 0.971], [1659.459, 924.897, 1.335]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1796.561, 874.996, 1.254], [1796.561, 874.982, 1.216], [1796.561, 874.969, 1.182], [1796.561, 874.957, 1.151]]\nB: [[1829.822, 1005.261, 1.194], [2129.106, 967.913, 1.335], [1439.644, 885.763, 1.155], [2034.051, 719.497, 0.987]]\nC: [[2134.229, 737.814, 1.149], [1953.993, 1047.896, 1.349], [1612.579, 940.305, 1.146], [1599.447, 982.485, 1.365]]\nD: [[1699.287, 941.961, 1.224], [1590.817, 729.191, 1.195], [1711.432, 908.722, 0.971], [1659.459, 924.897, 1.335]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_30_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_30_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_30_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_30_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_30_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_30_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_30_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_30_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[406.663, 1099.631, 0.814], [406.711, 1099.639, 0.923], [406.735, 1099.643, 0.978], [406.717, 1099.695, 0.749]]\nB: [[427.835, 1064.967, 0.714], [484.647, 916.921, 0.994], [411.142, 919.994, 1.029], [362.349, 1103.394, 0.701]]\nC: [[396.877, 1112.011, 0.828], [415.047, 1175.011, 0.772], [440.647, 980.302, 0.825], [395.393, 899.719, 0.603]]\nD: [[473.72, 956.4, 0.8], [485.155, 1094.253, 0.884], [398.711, 1081.924, 0.932], [430.802, 1000.92, 0.78]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[406.663, 1099.631, 0.814], [406.711, 1099.639, 0.923], [406.735, 1099.643, 0.978], [406.717, 1099.695, 0.749]]\nB: [[427.835, 1064.967, 0.714], [484.647, 916.921, 0.994], [411.142, 919.994, 1.029], [362.349, 1103.394, 0.701]]\nC: [[396.877, 1112.011, 0.828], [415.047, 1175.011, 0.772], [440.647, 980.302, 0.825], [395.393, 899.719, 0.603]]\nD: [[473.72, 956.4, 0.8], [485.155, 1094.253, 0.884], [398.711, 1081.924, 0.932], [430.802, 1000.92, 0.78]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_31_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_31_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_31_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_31_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_31_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_31_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_31_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_31_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1313.621, 933.434, 0.218], [1514.36, 1049.794, 0.194], [1263.349, 1108.661, 0.157], [1490.47, 980.609, 0.195]]\nB: [[1232.867, 1016.208, 0.213], [1250.875, 1010.148, 0.221], [1205.37, 1035.121, 0.184], [1092.698, 953.727, 0.188]]\nC: [[1472.729, 957.241, 0.173], [1510.795, 1241.776, 0.219], [1118.45, 
1223.791, 0.168], [1218.898, 1085.684, 0.171]]\nD: [[1337.482, 1035.208, 0.186], [1337.482, 1035.208, 0.186], [1337.482, 1035.208, 0.186], [1337.482, 1035.208, 0.186]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1313.621, 933.434, 0.218], [1514.36, 1049.794, 0.194], [1263.349, 1108.661, 0.157], [1490.47, 980.609, 0.195]]\nB: [[1232.867, 1016.208, 0.213], [1250.875, 1010.148, 0.221], [1205.37, 1035.121, 0.184], [1092.698, 953.727, 0.188]]\nC: [[1472.729, 957.241, 0.173], [1510.795, 1241.776, 0.219], [1118.45, 1223.791, 0.168], [1218.898, 1085.684, 0.171]]\nD: [[1337.482, 1035.208, 0.186], [1337.482, 1035.208, 0.186], [1337.482, 1035.208, 0.186], [1337.482, 1035.208, 0.186]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_32_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_32_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_32_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_32_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_32_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_32_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_32_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_32_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[1992.775, 875.132, 0.942], [1985.564, 874.76, 0.95], [1978.377, 874.483, 0.958], [1971.974, 874.315, 0.986]]\nB: [[1755.791, 1044.883, 0.825], [1877.163, 968.52, 1.04], [2106.974, 814.325, 0.994], [1945.338, 748.73, 1.14]]\nC: [[1656.177, 762.998, 0.871], [2009.557, 758.93, 0.8], [1914.45, 722.289, 1.067], [1703.798, 972.938, 1.065]]\nD: [[1816.649, 760.428, 1.116], [1730.801, 1023.39, 1.04], [2342.252, 816.69, 1.126], [2334.939, 947.14, 0.896]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1992.775, 875.132, 0.942], [1985.564, 874.76, 0.95], [1978.377, 874.483, 0.958], [1971.974, 874.315, 0.986]]\nB: [[1755.791, 1044.883, 0.825], [1877.163, 968.52, 1.04], [2106.974, 814.325, 0.994], [1945.338, 748.73, 1.14]]\nC: [[1656.177, 762.998, 0.871], [2009.557, 758.93, 0.8], [1914.45, 722.289, 1.067], [1703.798, 972.938, 1.065]]\nD: [[1816.649, 760.428, 1.116], [1730.801, 1023.39, 1.04], [2342.252, 816.69, 1.126], [2334.939, 947.14, 0.896]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_33_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_33_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_33_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_33_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_33_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_33_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_33_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_33_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[331.098, 1052.547, 0.565], [408.797, 1100.389, 0.637], [418.765, 1332.696, 0.625], [352.119, 1242.135, 0.631]]\nB: [[421.121, 1227.662, 0.557], [446.642, 1087.379, 0.513], [450.924, 1107.261, 0.47], [392.549, 1175.812, 0.691]]\nC: [[396.535, 1162.355, 0.498], [396.535, 1162.355, 0.534], [396.535, 1162.355, 0.571], [396.535, 1162.355, 0.608]]\nD: [[463.951, 972.839, 0.532], [365.417, 1075.626, 0.44], [381.022, 1300.867, 0.549], [368.078, 1350.532, 0.537]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[331.098, 1052.547, 0.565], [408.797, 1100.389, 0.637], [418.765, 1332.696, 0.625], [352.119, 1242.135, 0.631]]\nB: [[421.121, 1227.662, 0.557], [446.642, 1087.379, 0.513], [450.924, 1107.261, 0.47], [392.549, 1175.812, 0.691]]\nC: [[396.535, 1162.355, 0.498], [396.535, 1162.355, 0.534], [396.535, 1162.355, 0.571], [396.535, 1162.355, 0.608]]\nD: [[463.951, 972.839, 0.532], [365.417, 1075.626, 0.44], [381.022, 1300.867, 0.549], [368.078, 1350.532, 0.537]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_34_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_34_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_34_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_34_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_34_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_34_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_34_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_34_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[321.096, 668.091, 0.879], [321.112, 668.116, 0.885], [321.128, 668.141, 0.891], [321.143, 668.166, 0.897]]\nB: [[268.388, 688.723, 0.734], [302.215, 796.657, 0.989], [302.241, 565.326, 1.022], [265.213, 770.117, 0.814]]\nC: [[314.729, 566.271, 0.999], [287.802, 590.987, 1.045], [272.417, 724.544, 
0.717], [323.87, 780.287, 0.926]]\nD: [[376.158, 594.596, 0.841], [277.747, 714.363, 0.978], [382.966, 588.719, 0.996], [345.414, 561.146, 0.948]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[321.096, 668.091, 0.879], [321.112, 668.116, 0.885], [321.128, 668.141, 0.891], [321.143, 668.166, 0.897]]\nB: [[268.388, 688.723, 0.734], [302.215, 796.657, 0.989], [302.241, 565.326, 1.022], [265.213, 770.117, 0.814]]\nC: [[314.729, 566.271, 0.999], [287.802, 590.987, 1.045], [272.417, 724.544, 0.717], [323.87, 780.287, 0.926]]\nD: [[376.158, 594.596, 0.841], [277.747, 714.363, 0.978], [382.966, 588.719, 0.996], [345.414, 561.146, 0.948]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_35_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_35_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_35_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_35_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_35_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_35_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_35_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_35_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: 
[[754.263, 1891.382, 0.199], [593.138, 1372.169, 0.319], [709.86, 1582.863, 0.554], [634.519, 1531.646, 0.56]]\nB: [[729.655, 1603.012, 0.212], [526.812, 1703.833, 0.343], [552.52, 1297.518, 0.437], [592.969, 1803.518, 0.61]]\nC: [[632.049, 1352.661, 0.204], [726.247, 1377.851, 0.377], [577.44, 1302.511, 0.523], [636.437, 1877.196, 0.48]]\nD: [[655.912, 1592.667, 0.218], [655.637, 1593.173, 0.377], [655.34, 1593.667, 0.535], [654.899, 1594.227, 0.56]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[754.263, 1891.382, 0.199], [593.138, 1372.169, 0.319], [709.86, 1582.863, 0.554], [634.519, 1531.646, 0.56]]\nB: [[729.655, 1603.012, 0.212], [526.812, 1703.833, 0.343], [552.52, 1297.518, 0.437], [592.969, 1803.518, 0.61]]\nC: [[632.049, 1352.661, 0.204], [726.247, 1377.851, 0.377], [577.44, 1302.511, 0.523], [636.437, 1877.196, 0.48]]\nD: [[655.912, 1592.667, 0.218], [655.637, 1593.173, 0.377], [655.34, 1593.667, 0.535], [654.899, 1594.227, 0.56]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_36_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_36_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_36_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_36_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_36_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_36_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_36_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_36_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1863.967, 857.871, 0.65], [1863.962, 857.872, 0.65], [1863.962, 857.872, 0.65], [1863.962, 857.872, 0.65]]\nB: [[2064.908, 1013.124, 0.75], [2122.552, 822.014, 0.59], [2177.833, 1012.188, 0.75], [1595.769, 822.35, 0.73]]\nC: [[1731.702, 852.264, 0.72], [2128.868, 793.194, 0.77], [1755.246, 973.676, 0.67], [1568.102, 944.114, 0.53]]\nD: [[1764.474, 940.448, 0.74], [2091.49, 945.26, 0.67], [2118.947, 923.168, 0.72], [1633.719, 960.882, 0.7]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. 
The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1863.967, 857.871, 0.65], [1863.962, 857.872, 0.65], [1863.962, 857.872, 0.65], [1863.962, 857.872, 0.65]]\nB: [[2064.908, 1013.124, 0.75], [2122.552, 822.014, 0.59], [2177.833, 1012.188, 0.75], [1595.769, 822.35, 0.73]]\nC: [[1731.702, 852.264, 0.72], [2128.868, 793.194, 0.77], [1755.246, 973.676, 0.67], [1568.102, 944.114, 0.53]]\nD: [[1764.474, 940.448, 0.74], [2091.49, 945.26, 0.67], [2118.947, 923.168, 0.72], [1633.719, 960.882, 0.7]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_37_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_37_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_37_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_37_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_37_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_37_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_37_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_37_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1780.13, 3028.6, -0.525], [1674.509, 2149.928, -0.337], [1576.236, 2276.286, -0.134], [1537.853, 2314.916, 0.007]]\nB: [[1601.026, 2969.24, -0.541], [2094.18, 2097.632, -0.298], [2014.168, 2653.318, -0.14], [1803.211, 2667.419, 0.009]]\nC: [[1811.441, 2574.96, -0.473], [1814.647, 2570.443, -0.296], [1818.149, 2566.591, -0.119], [1820.651, 2564.035, 0.009]]\nD: [[1791.545, 2532.09, -0.471], [1731.966, 
2573.436, -0.251], [1598.687, 2327.018, -0.116], [1468.52, 2562.672, 0.009]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1780.13, 3028.6, -0.525], [1674.509, 2149.928, -0.337], [1576.236, 2276.286, -0.134], [1537.853, 2314.916, 0.007]]\nB: [[1601.026, 2969.24, -0.541], [2094.18, 2097.632, -0.298], [2014.168, 2653.318, -0.14], [1803.211, 2667.419, 0.009]]\nC: [[1811.441, 2574.96, -0.473], [1814.647, 2570.443, -0.296], [1818.149, 2566.591, -0.119], [1820.651, 2564.035, 0.009]]\nD: [[1791.545, 2532.09, -0.471], [1731.966, 2573.436, -0.251], [1598.687, 2327.018, -0.116], [1468.52, 2562.672, 0.009]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_38_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_38_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_38_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_38_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_38_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_38_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_38_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_38_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[418.17, 1093.457, 0.829], 
[405.894, 1044.702, 0.431], [420.968, 1127.645, 0.604], [406.025, 1265.181, 0.687]]\nB: [[422.18, 1093.142, 0.749], [422.146, 1093.149, 0.523], [422.164, 1093.151, 0.575], [422.182, 1093.152, 0.627]]\nC: [[424.56, 1104.052, 0.696], [456.777, 1163.284, 0.489], [355.959, 1084.822, 0.587], [353.668, 881.288, 0.749]]\nD: [[472.5, 1170.954, 0.897], [500.203, 1162.062, 0.492], [472.1, 1132.062, 0.684], [450.284, 916.311, 0.647]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[418.17, 1093.457, 0.829], [405.894, 1044.702, 0.431], [420.968, 1127.645, 0.604], [406.025, 1265.181, 0.687]]\nB: [[422.18, 1093.142, 0.749], [422.146, 1093.149, 0.523], [422.164, 1093.151, 0.575], [422.182, 1093.152, 0.627]]\nC: [[424.56, 1104.052, 0.696], [456.777, 1163.284, 0.489], [355.959, 1084.822, 0.587], [353.668, 881.288, 0.749]]\nD: [[472.5, 1170.954, 0.897], [500.203, 1162.062, 0.492], [472.1, 1132.062, 0.684], [450.284, 916.311, 0.647]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_39_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_39_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_39_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_39_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_39_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_39_5.png", 
"./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_39_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_39_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[390.675, 1148.918, 0.446], [390.675, 1148.918, 0.486], [390.675, 1148.918, 0.526], [390.675, 1148.918, 0.566]]\nB: [[325.378, 1080.282, 0.378], [401.054, 1111.492, 0.413], [443.699, 1336.224, 0.541], [437.757, 1205.106, 0.494]]\nC: [[376.096, 1180.944, 0.535], [365.879, 1297.989, 0.536], [347.139, 1107.499, 0.489], [390.705, 1129.597, 0.653]]\nD: [[319.548, 938.981, 0.435], [320.089, 1375.531, 0.568], [447.751, 1028.646, 0.524], [462.869, 953.708, 0.657]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[390.675, 1148.918, 0.446], [390.675, 1148.918, 0.486], [390.675, 1148.918, 0.526], [390.675, 1148.918, 0.566]]\nB: [[325.378, 1080.282, 0.378], [401.054, 1111.492, 0.413], [443.699, 1336.224, 0.541], [437.757, 1205.106, 0.494]]\nC: [[376.096, 1180.944, 0.535], [365.879, 1297.989, 0.536], [347.139, 1107.499, 0.489], [390.705, 1129.597, 0.653]]\nD: [[319.548, 938.981, 0.435], [320.089, 1375.531, 0.568], [447.751, 1028.646, 0.524], [462.869, 953.708, 0.657]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_40_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_40_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_40_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_40_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_40_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_40_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_40_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_40_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1843.257, 2538.618, -0.551], [1847.727, 2533.368, -0.252], [1851.69, 2528.728, 0.148], [1855.661, 2524.133, 0.447]]\nB: [[2149.832, 2452.89, -0.472], [2084.541, 3035.493, -0.262], [2202.07, 2375.125, 0.153], [1741.345, 2112.152, 0.38]]\nC: [[1481.384, 2461.292, -0.523], [1555.975, 2186.05, -0.244], [1900.07, 2064.722, 0.165], [2087.255, 2686.41, 0.442]]\nD: [[1970.321, 2572.246, -0.542], [1648.575, 2617.927, -0.295], [1998.79, 2542.913, 0.12], [2210.323, 2215.488, 0.469]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1843.257, 2538.618, -0.551], [1847.727, 2533.368, -0.252], [1851.69, 2528.728, 0.148], [1855.661, 2524.133, 0.447]]\nB: [[2149.832, 2452.89, -0.472], [2084.541, 3035.493, -0.262], [2202.07, 2375.125, 0.153], [1741.345, 2112.152, 0.38]]\nC: [[1481.384, 2461.292, -0.523], [1555.975, 2186.05, -0.244], [1900.07, 2064.722, 0.165], [2087.255, 2686.41, 0.442]]\nD: [[1970.321, 2572.246, -0.542], [1648.575, 2617.927, -0.295], [1998.79, 2542.913, 0.12], [2210.323, 2215.488, 0.469]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_41_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_41_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_41_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_41_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_41_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_41_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_41_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_41_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[405.649, 1108.528, 0.594], [405.644, 1108.505, 0.674], [405.626, 1108.416, 0.494], [405.656, 1108.482, 0.494]]\nB: [[334.296, 1327.717, 0.679], [384.849, 1314.532, 0.74], [423.319, 950.2, 0.426], [331.031, 1040.91, 0.551]]\nC: [[347.771, 1314.846, 0.498], [446.389, 1307.841, 
0.727], [399.575, 1219.724, 0.443], [426.17, 1311.828, 0.436]]\nD: [[400.335, 1102.261, 0.598], [348.445, 1284.149, 0.65], [478.752, 1133.775, 0.474], [355.374, 1236.721, 0.511]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[405.649, 1108.528, 0.594], [405.644, 1108.505, 0.674], [405.626, 1108.416, 0.494], [405.656, 1108.482, 0.494]]\nB: [[334.296, 1327.717, 0.679], [384.849, 1314.532, 0.74], [423.319, 950.2, 0.426], [331.031, 1040.91, 0.551]]\nC: [[347.771, 1314.846, 0.498], [446.389, 1307.841, 0.727], [399.575, 1219.724, 0.443], [426.17, 1311.828, 0.436]]\nD: [[400.335, 1102.261, 0.598], [348.445, 1284.149, 0.65], [478.752, 1133.775, 0.474], [355.374, 1236.721, 0.511]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_42_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_42_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_42_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_42_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_42_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_42_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_42_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_42_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[1428.27, 974.381, 1.985], [1433.79, 878.793, 1.553], [1487.91, 991.551, 2.084], [1094.85, 1099.062, 1.834]]\nB: [[1126.35, 864.162, 1.48], [1369.77, 1079.18, 2.095], [1104.67, 888.249, 1.995], [1467.5, 1079.513, 1.593]]\nC: [[1319.41, 1031.387, 1.821], [1319.41, 1031.387, 1.821], [1319.41, 1031.387, 1.821], [1319.41, 1031.387, 1.821]]\nD: [[1325.9, 922.588, 2.092], [1241.85, 1191.619, 1.687], [1156.96, 1063.21, 1.942], [1396.09, 908.012, 1.846]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1428.27, 974.381, 1.985], [1433.79, 878.793, 1.553], [1487.91, 991.551, 2.084], [1094.85, 1099.062, 1.834]]\nB: [[1126.35, 864.162, 1.48], [1369.77, 1079.18, 2.095], [1104.67, 888.249, 1.995], [1467.5, 1079.513, 1.593]]\nC: [[1319.41, 1031.387, 1.821], [1319.41, 1031.387, 1.821], [1319.41, 1031.387, 1.821], [1319.41, 1031.387, 1.821]]\nD: [[1325.9, 922.588, 2.092], [1241.85, 1191.619, 1.687], [1156.96, 1063.21, 1.942], [1396.09, 908.012, 1.846]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_43_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_43_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_43_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_43_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_43_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_43_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_43_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_43_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[409.465, 1216.476, 0.476], [318.714, 1226.609, 0.544], [365.342, 995.006, 0.666], [409.876, 1110.956, 0.592]]\nB: [[394.755, 1113.151, 0.528], [394.774, 1113.143, 0.578], [394.793, 1113.134, 0.628], [394.793, 1113.134, 0.703]]\nC: [[420.334, 1155.862, 0.481], [398.422, 922.217, 0.485], [347.385, 1076.56, 0.624], [333.837, 1269.244, 0.608]]\nD: [[335.019, 1099.773, 0.481], [389.242, 976.46, 0.466], [401.879, 992.855, 0.713], [331.612, 1204.414, 0.622]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[409.465, 1216.476, 0.476], [318.714, 1226.609, 0.544], [365.342, 995.006, 0.666], [409.876, 1110.956, 0.592]]\nB: [[394.755, 1113.151, 0.528], [394.774, 1113.143, 0.578], [394.793, 1113.134, 0.628], [394.793, 1113.134, 0.703]]\nC: [[420.334, 1155.862, 0.481], [398.422, 922.217, 0.485], [347.385, 1076.56, 0.624], [333.837, 1269.244, 0.608]]\nD: [[335.019, 1099.773, 0.481], [389.242, 976.46, 0.466], [401.879, 992.855, 0.713], [331.612, 1204.414, 0.622]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_44_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_44_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_44_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_44_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_44_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_44_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_44_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_44_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[361.726, 683.196, 1.397], [329.217, 690.805, 1.946], [283.896, 621.516, 1.958], [356.569, 620.728, 2.064]]\nB: [[418.211, 729.764, 1.616], [294.113, 717.923, 1.419], [354.161, 578.812, 1.657], [290.406, 708.411, 2.08]]\nC: [[294.448, 559.154, 1.483], [317.072, 572.818, 2.094], [333.21, 533.806, 
2.046], [288.729, 702.966, 1.84]]\nD: [[349.242, 634.568, 1.725], [349.228, 634.584, 1.748], [349.213, 634.601, 1.771], [349.198, 634.618, 1.794]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[361.726, 683.196, 1.397], [329.217, 690.805, 1.946], [283.896, 621.516, 1.958], [356.569, 620.728, 2.064]]\nB: [[418.211, 729.764, 1.616], [294.113, 717.923, 1.419], [354.161, 578.812, 1.657], [290.406, 708.411, 2.08]]\nC: [[294.448, 559.154, 1.483], [317.072, 572.818, 2.094], [333.21, 533.806, 2.046], [288.729, 702.966, 1.84]]\nD: [[349.242, 634.568, 1.725], [349.228, 634.584, 1.748], [349.213, 634.601, 1.771], [349.198, 634.618, 1.794]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_45_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_45_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_45_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_45_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_45_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_45_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_45_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_45_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: 
[[1342.828, 1030.123, -0.019], [1342.828, 1030.123, -0.019], [1342.828, 1030.123, -0.019], [1342.828, 1030.123, -0.019]]\nB: [[1097.533, 870.18, -0.018], [1280.182, 1133.641, -0.019], [1489.479, 998.305, -0.016], [1148.803, 1033.836, -0.018]]\nC: [[1127.233, 1005.075, -0.016], [1511.451, 847.909, -0.022], [1150.864, 1055.903, -0.02], [1443.444, 1006.94, -0.017]]\nD: [[1513.245, 1007.752, -0.022], [1331.585, 1065.932, -0.016], [1532.891, 854.441, -0.017], [1526.175, 951.171, -0.022]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1342.828, 1030.123, -0.019], [1342.828, 1030.123, -0.019], [1342.828, 1030.123, -0.019], [1342.828, 1030.123, -0.019]]\nB: [[1097.533, 870.18, -0.018], [1280.182, 1133.641, -0.019], [1489.479, 998.305, -0.016], [1148.803, 1033.836, -0.018]]\nC: [[1127.233, 1005.075, -0.016], [1511.451, 847.909, -0.022], [1150.864, 1055.903, -0.02], [1443.444, 1006.94, -0.017]]\nD: [[1513.245, 1007.752, -0.022], [1331.585, 1065.932, -0.016], [1532.891, 854.441, -0.017], [1526.175, 951.171, -0.022]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_46_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_46_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_46_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_46_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_46_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_46_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_46_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_46_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[368.243, 1147.276, 0.495], [320.188, 1220.441, 0.564], [378.673, 985.999, 0.66], [415.817, 1250.586, 0.729]]\nB: [[471.077, 1316.726, 0.65], [388.888, 998.99, 0.705], [397.125, 1213.868, 0.57], [326.301, 938.598, 0.746]]\nC: [[394.039, 1143.246, 0.615], [391.841, 1138.065, 0.615], [389.353, 1132.372, 0.64], [387.343, 1127.335, 0.765]]\nD: [[380.686, 1104.044, 0.666], [420.094, 1131.831, 0.503], [335.098, 1016.255, 0.76], [342.797, 1164.927, 0.672]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[368.243, 1147.276, 0.495], [320.188, 1220.441, 0.564], [378.673, 985.999, 0.66], [415.817, 1250.586, 0.729]]\nB: [[471.077, 1316.726, 0.65], [388.888, 998.99, 0.705], [397.125, 1213.868, 0.57], [326.301, 938.598, 0.746]]\nC: [[394.039, 1143.246, 0.615], [391.841, 1138.065, 0.615], [389.353, 1132.372, 0.64], [387.343, 1127.335, 0.765]]\nD: [[380.686, 1104.044, 0.666], [420.094, 1131.831, 0.503], [335.098, 1016.255, 0.76], [342.797, 1164.927, 0.672]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_47_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_47_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_47_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_47_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_47_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_47_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_47_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_47_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[529.721, 1110.52, 1.789], [418.305, 959.31, 1.737], [400.029, 1228.06, 2.221], [451.205, 1117.422, 2.204]]\nB: [[411.023, 1090.6, 1.799], [524.659, 1002.9, 1.558], [459.22, 1085.139, 1.901], [427.15, 911.884, 1.822]]\nC: [[503.852, 1131.29, 2.197], [402.563, 1323.31, 1.648], [532.361, 1202.739, 1.985], 
[480.913, 1034.846, 2.094]]\nD: [[456.587, 1114.23, 2.052], [448.914, 1116.73, 1.885], [448.331, 1116.811, 1.887], [446.322, 1116.881, 2.007]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[529.721, 1110.52, 1.789], [418.305, 959.31, 1.737], [400.029, 1228.06, 2.221], [451.205, 1117.422, 2.204]]\nB: [[411.023, 1090.6, 1.799], [524.659, 1002.9, 1.558], [459.22, 1085.139, 1.901], [427.15, 911.884, 1.822]]\nC: [[503.852, 1131.29, 2.197], [402.563, 1323.31, 1.648], [532.361, 1202.739, 1.985], [480.913, 1034.846, 2.094]]\nD: [[456.587, 1114.23, 2.052], [448.914, 1116.73, 1.885], [448.331, 1116.811, 1.887], [446.322, 1116.881, 2.007]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_48_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_48_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_48_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_48_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_48_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_48_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_48_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_48_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: 
[[410.862, 1106.326, 0.665], [410.862, 1106.326, 0.59], [410.862, 1106.326, 0.553], [410.862, 1106.326, 0.415]]\nB: [[404.294, 1051.995, 0.561], [398.816, 1084.635, 0.64], [356.338, 1249.05, 0.503], [446.466, 1282.71, 0.342]]\nC: [[336.012, 1230.001, 0.749], [456.309, 1162.403, 0.66], [488.514, 919.924, 0.477], [360.602, 1191.978, 0.369]]\nD: [[407.759, 1016.766, 0.63], [366.992, 935.62, 0.5], [484.755, 1037.534, 0.603], [490.174, 1145.425, 0.38]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[410.862, 1106.326, 0.665], [410.862, 1106.326, 0.59], [410.862, 1106.326, 0.553], [410.862, 1106.326, 0.415]]\nB: [[404.294, 1051.995, 0.561], [398.816, 1084.635, 0.64], [356.338, 1249.05, 0.503], [446.466, 1282.71, 0.342]]\nC: [[336.012, 1230.001, 0.749], [456.309, 1162.403, 0.66], [488.514, 919.924, 0.477], [360.602, 1191.978, 0.369]]\nD: [[407.759, 1016.766, 0.63], [366.992, 935.62, 0.5], [484.755, 1037.534, 0.603], [490.174, 1145.425, 0.38]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_49_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_49_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_49_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_49_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_49_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_49_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_49_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_49_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[674.817, 1495.218, -0.033], [637.599, 1735.138, 0.04], [753.378, 1282.425, 0.266], [579.881, 1534.752, 0.639]]\nB: [[709.455, 1518.871, -0.042], [743.219, 1536.668, 0.05], [546.398, 1312.775, 0.212], [543.916, 1348.998, 0.477]]\nC: [[646.543, 1481.061, -0.039], [690.227, 1278.475, 0.06], [658.101, 1765.87, 0.225], [673.992, 1863.036, 0.546]]\nD: [[654.306, 1593.839, -0.039], [654.897, 1593.314, 0.05], [655.544, 1592.867, 0.238], [656.181, 1592.404, 0.554]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[674.817, 1495.218, -0.033], [637.599, 1735.138, 0.04], [753.378, 1282.425, 0.266], [579.881, 1534.752, 0.639]]\nB: [[709.455, 1518.871, -0.042], [743.219, 1536.668, 0.05], [546.398, 1312.775, 0.212], [543.916, 1348.998, 0.477]]\nC: [[646.543, 1481.061, -0.039], [690.227, 1278.475, 0.06], [658.101, 1765.87, 0.225], [673.992, 1863.036, 0.546]]\nD: [[654.306, 1593.839, -0.039], [654.897, 1593.314, 0.05], [655.544, 1592.867, 0.238], [656.181, 1592.404, 0.554]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_50_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_50_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_50_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_50_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_50_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_50_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_50_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_50_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[318.904, 699.89, -0.305], [356.863, 749.45, -0.295], [342.145, 817.48, -0.165], [362.888, 813.169, -0.152]]\nB: [[311.846, 696.05, -0.326], [311.404, 695.55, -0.298], [309.653, 693.549, -0.188], [309.251, 693.028, -0.161]]\nC: [[270.331, 808.82, -0.365], [259.897, 752.35, -0.352], [311.667, 
620.881, -0.218], [273.84, 705.279, -0.182]]\nD: [[285.234, 746.79, -0.327], [276.961, 728.57, -0.341], [274.698, 714.537, -0.21], [341.495, 714.928, -0.131]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[318.904, 699.89, -0.305], [356.863, 749.45, -0.295], [342.145, 817.48, -0.165], [362.888, 813.169, -0.152]]\nB: [[311.846, 696.05, -0.326], [311.404, 695.55, -0.298], [309.653, 693.549, -0.188], [309.251, 693.028, -0.161]]\nC: [[270.331, 808.82, -0.365], [259.897, 752.35, -0.352], [311.667, 620.881, -0.218], [273.84, 705.279, -0.182]]\nD: [[285.234, 746.79, -0.327], [276.961, 728.57, -0.341], [274.698, 714.537, -0.21], [341.495, 714.928, -0.131]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_51_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_51_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_51_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_51_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_51_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_51_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_51_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_51_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[432.564, 1385.721, 0.86], [382.527, 1126.126, 0.819], [443.57, 1262.347, 0.849], [446.802, 1286.824, 0.805]]\nB: [[370.754, 1092.547, 0.872], [331.67, 1167.043, 0.761], [399.201, 1018.42, 0.801], [365.027, 1292.343, 0.758]]\nC: [[408.524, 1190.723, 0.733], [408.524, 1190.723, 0.773], [408.524, 1190.723, 0.814], [408.524, 1190.723, 0.854]]\nD: [[402.914, 1215.467, 0.85], [450.76, 1135.126, 0.766], [461.237, 971.14, 0.851], [437.03, 1104.878, 0.788]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[432.564, 1385.721, 0.86], [382.527, 1126.126, 0.819], [443.57, 1262.347, 0.849], [446.802, 1286.824, 0.805]]\nB: [[370.754, 1092.547, 0.872], [331.67, 1167.043, 0.761], [399.201, 1018.42, 0.801], [365.027, 1292.343, 0.758]]\nC: [[408.524, 1190.723, 0.733], [408.524, 1190.723, 0.773], [408.524, 1190.723, 0.814], [408.524, 1190.723, 0.854]]\nD: [[402.914, 1215.467, 0.85], [450.76, 1135.126, 0.766], [461.237, 971.14, 0.851], [437.03, 1104.878, 0.788]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_52_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_52_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_52_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_52_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_52_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_52_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_52_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_52_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[509.54, 973.159, 0.18], [380.671, 1268.603, 0.2], [479.07, 1050.521, 0.236], [494.912, 1129.693, 0.227]]\nB: [[439.95, 1094.017, 0.17], [439.878, 1094.005, 0.2], [439.87, 1094.004, 0.204], [439.863, 1094.003, 0.207]]\nC: [[469.34, 1051.411, 0.16], [354.833, 915.321, 0.2], [409.43, 978.881, 0.18], [455.437, 1174.679, 0.197]]\nD: [[450.6, 1030.444, 0.18], [376.497, 1114.358, 0.2], [397.1, 1100.748, 0.221], [400.171, 883.327, 0.198]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. 
The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[509.54, 973.159, 0.18], [380.671, 1268.603, 0.2], [479.07, 1050.521, 0.236], [494.912, 1129.693, 0.227]]\nB: [[439.95, 1094.017, 0.17], [439.878, 1094.005, 0.2], [439.87, 1094.004, 0.204], [439.863, 1094.003, 0.207]]\nC: [[469.34, 1051.411, 0.16], [354.833, 915.321, 0.2], [409.43, 978.881, 0.18], [455.437, 1174.679, 0.197]]\nD: [[450.6, 1030.444, 0.18], [376.497, 1114.358, 0.2], [397.1, 1100.748, 0.221], [400.171, 883.327, 0.198]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_53_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_53_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_53_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_53_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_53_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_53_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_53_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_53_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[745.24, 1895.181, 1.097], [763.549, 1536.044, 0.845], [570.809, 1601.968, 0.989], [652.246, 1423.25, 1.127]]\nB: [[532.12, 1872.39, 0.899], [664.607, 1965.812, 0.921], [689.926, 1479.159, 0.981], [613.897, 1574.3, 1.137]]\nC: [[548.35, 1392.059, 1.158], [726.228, 1698.055, 1.001], [546.517, 1691.289, 0.982], [644.748, 1773.44, 1.004]]\nD: [[638.38, 1644.304, 0.969], [638.651, 1644.538, 0.969], [639.028, 
1644.741, 0.969], [639.302, 1644.98, 0.969]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[745.24, 1895.181, 1.097], [763.549, 1536.044, 0.845], [570.809, 1601.968, 0.989], [652.246, 1423.25, 1.127]]\nB: [[532.12, 1872.39, 0.899], [664.607, 1965.812, 0.921], [689.926, 1479.159, 0.981], [613.897, 1574.3, 1.137]]\nC: [[548.35, 1392.059, 1.158], [726.228, 1698.055, 1.001], [546.517, 1691.289, 0.982], [644.748, 1773.44, 1.004]]\nD: [[638.38, 1644.304, 0.969], [638.651, 1644.538, 0.969], [639.028, 1644.741, 0.969], [639.302, 1644.98, 0.969]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_54_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_54_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_54_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_54_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_54_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_54_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_54_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_54_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[498.649, 946.58, 0.483], [429.418, 1259.918, 0.62], [364.692, 1168.776, 0.667], [459.431, 
959.241, 0.827]]\nB: [[455.692, 1030.221, 0.542], [455.48, 1267.707, 0.483], [500.554, 1295.934, 0.772], [397.8, 1198.962, 0.903]]\nC: [[424.598, 1092.173, 0.591], [424.547, 1092.198, 0.561], [424.495, 1092.222, 0.732], [424.504, 1092.223, 0.809]]\nD: [[442.794, 1098.547, 0.607], [348.384, 1268.237, 0.46], [435.072, 1179.416, 0.685], [349.227, 1283.876, 0.743]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[498.649, 946.58, 0.483], [429.418, 1259.918, 0.62], [364.692, 1168.776, 0.667], [459.431, 959.241, 0.827]]\nB: [[455.692, 1030.221, 0.542], [455.48, 1267.707, 0.483], [500.554, 1295.934, 0.772], [397.8, 1198.962, 0.903]]\nC: [[424.598, 1092.173, 0.591], [424.547, 1092.198, 0.561], [424.495, 1092.222, 0.732], [424.504, 1092.223, 0.809]]\nD: [[442.794, 1098.547, 0.607], [348.384, 1268.237, 0.46], [435.072, 1179.416, 0.685], [349.227, 1283.876, 0.743]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_55_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_55_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_55_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_55_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_55_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_55_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_55_6.png", 
"./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_55_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1592.863, 1046.051, 0.228], [1890.036, 733.66, 0.198], [2193.82, 880.022, 0.24], [2269.051, 988.07, 0.246]]\nB: [[1799.697, 977.341, 0.188], [2255.207, 938.924, 0.229], [2189.995, 1042.344, 0.204], [1954.403, 990.03, 0.25]]\nC: [[1920.044, 873.356, 0.213], [1920.067, 873.333, 0.213], [1920.067, 873.333, 0.213], [1920.021, 873.38, 0.263]]\nD: [[2188.053, 984.479, 0.185], [1789.35, 823.078, 0.223], [2262.284, 775.407, 0.196], [2232.543, 929.3, 0.291]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1592.863, 1046.051, 0.228], [1890.036, 733.66, 0.198], [2193.82, 880.022, 0.24], [2269.051, 988.07, 0.246]]\nB: [[1799.697, 977.341, 0.188], [2255.207, 938.924, 0.229], [2189.995, 1042.344, 0.204], [1954.403, 990.03, 0.25]]\nC: [[1920.044, 873.356, 0.213], [1920.067, 873.333, 0.213], [1920.067, 873.333, 0.213], [1920.021, 873.38, 0.263]]\nD: [[2188.053, 984.479, 0.185], [1789.35, 823.078, 0.223], [2262.284, 775.407, 0.196], [2232.543, 929.3, 0.291]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_56_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_56_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_56_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_56_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_56_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_56_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_56_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_56_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1812.249, 2571.668, -0.038], [1816.741, 2566.203, 0.124], [1820.586, 2560.847, 0.325], [1825.127, 2555.039, 0.499]]\nB: [[1810.749, 2477.016, -0.044], [1526.117, 2495.829, 0.129], [2120.606, 2682.221, 0.369], [1940.674, 2177.131, 0.513]]\nC: [[1882.741, 2318.424, -0.045], [1487.68, 2321.211, 0.127], [2151.691, 2137.892, 0.264], [1751.426, 2963.026, 0.451]]\nD: [[1614.268, 2747.937, -0.04], [1694.976, 3075.224, 0.115], [1495.647, 3054.549, 0.349], [2186.702, 2819.745, 0.446]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1812.249, 2571.668, -0.038], [1816.741, 2566.203, 0.124], [1820.586, 2560.847, 0.325], [1825.127, 2555.039, 0.499]]\nB: [[1810.749, 2477.016, -0.044], [1526.117, 2495.829, 0.129], [2120.606, 2682.221, 0.369], [1940.674, 2177.131, 0.513]]\nC: [[1882.741, 2318.424, -0.045], [1487.68, 2321.211, 0.127], [2151.691, 2137.892, 0.264], [1751.426, 2963.026, 0.451]]\nD: [[1614.268, 2747.937, -0.04], [1694.976, 3075.224, 0.115], [1495.647, 3054.549, 0.349], [2186.702, 2819.745, 0.446]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_57_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_57_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_57_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_57_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_57_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_57_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_57_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_57_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1774.223, 2610.364, 0.996], [1774.053, 2610.352, 1.02], [1773.502, 2611.229, 1.045], [1773.029, 2612.151, 1.07]]\nB: [[1776.641, 2350.607, 1.067], [1644.964, 2423.862, 0.9], [1725.868, 3040.601, 1.103], [2124.437, 2193.747, 0.89]]\nC: [[1671.897, 2705.242, 1.029], [1872.101, 
2819.316, 0.97], [1650.995, 2602.6, 1.046], [1436.928, 2842.091, 1.11]]\nD: [[1953.59, 2775.546, 0.952], [1810.746, 2706.189, 0.97], [1565.979, 2177.88, 1.084], [1805.0, 2120.155, 0.92]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1774.223, 2610.364, 0.996], [1774.053, 2610.352, 1.02], [1773.502, 2611.229, 1.045], [1773.029, 2612.151, 1.07]]\nB: [[1776.641, 2350.607, 1.067], [1644.964, 2423.862, 0.9], [1725.868, 3040.601, 1.103], [2124.437, 2193.747, 0.89]]\nC: [[1671.897, 2705.242, 1.029], [1872.101, 2819.316, 0.97], [1650.995, 2602.6, 1.046], [1436.928, 2842.091, 1.11]]\nD: [[1953.59, 2775.546, 0.952], [1810.746, 2706.189, 0.97], [1565.979, 2177.88, 1.084], [1805.0, 2120.155, 0.92]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_58_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_58_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_58_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_58_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_58_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_58_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_58_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_58_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", 
"source": "NuScenes_threed_Object_Tracking", "options": "A: [[1736.392, 873.361, 1.079], [1734.892, 874.826, 1.069], [1733.79, 876.382, 1.059], [1732.878, 880.029, 1.041]]\nB: [[1927.606, 821.613, 1.179], [1635.106, 838.612, 1.118], [1798.04, 779.408, 1.262], [1906.044, 824.489, 1.153]]\nC: [[1655.684, 967.021, 1.001], [2054.919, 792.455, 1.056], [1489.06, 752.973, 1.09], [1601.794, 1000.219, 1.236]]\nD: [[2074.836, 711.213, 0.881], [1659.637, 744.201, 1.258], [2073.69, 988.772, 1.118], [1777.227, 826.859, 1.096]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1736.392, 873.361, 1.079], [1734.892, 874.826, 1.069], [1733.79, 876.382, 1.059], [1732.878, 880.029, 1.041]]\nB: [[1927.606, 821.613, 1.179], [1635.106, 838.612, 1.118], [1798.04, 779.408, 1.262], [1906.044, 824.489, 1.153]]\nC: [[1655.684, 967.021, 1.001], [2054.919, 792.455, 1.056], [1489.06, 752.973, 1.09], [1601.794, 1000.219, 1.236]]\nD: [[2074.836, 711.213, 0.881], [1659.637, 744.201, 1.258], [2073.69, 988.772, 1.118], [1777.227, 826.859, 1.096]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_59_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_59_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_59_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_59_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_59_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_59_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_59_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_59_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[283.606, 696.753, 1.329], [383.644, 532.364, 1.363], [332.149, 525.401, 1.473], [385.71, 734.583, 1.097]]\nB: [[383.086, 639.489, 1.063], [279.173, 585.298, 1.388], [361.433, 701.65, 1.375], [295.925, 719.625, 1.188]]\nC: [[330.789, 641.074, 1.158], [330.789, 641.074, 1.212], [330.789, 641.074, 1.267], [330.789, 641.074, 1.322]]\nD: [[306.13, 719.717, 1.368], [345.491, 726.192, 0.98], [345.908, 519.046, 1.407], [318.14, 654.942, 1.306]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. 
The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[283.606, 696.753, 1.329], [383.644, 532.364, 1.363], [332.149, 525.401, 1.473], [385.71, 734.583, 1.097]]\nB: [[383.086, 639.489, 1.063], [279.173, 585.298, 1.388], [361.433, 701.65, 1.375], [295.925, 719.625, 1.188]]\nC: [[330.789, 641.074, 1.158], [330.789, 641.074, 1.212], [330.789, 641.074, 1.267], [330.789, 641.074, 1.322]]\nD: [[306.13, 719.717, 1.368], [345.491, 726.192, 0.98], [345.908, 519.046, 1.407], [318.14, 654.942, 1.306]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_60_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_60_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_60_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_60_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_60_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_60_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_60_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_60_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[661.561, 1356.246, 0.07], [523.515, 1451.812, 0.242], [629.357, 1441.913, 0.46], [693.951, 1658.121, 0.495]]\nB: [[596.26, 1440.992, 0.068], [660.286, 1441.26, 0.267], [710.568, 1651.675, 0.41], [567.432, 1395.938, 0.5]]\nC: [[628.289, 1618.572, 0.075], [628.026, 1618.937, 0.252], [627.783, 1619.317, 0.43], [627.525, 1619.686, 0.607]]\nD: [[603.153, 1669.791, 0.074], [606.05, 1635.908, 0.212], [518.765, 
1574.758, 0.36], [706.901, 1431.785, 0.612]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[661.561, 1356.246, 0.07], [523.515, 1451.812, 0.242], [629.357, 1441.913, 0.46], [693.951, 1658.121, 0.495]]\nB: [[596.26, 1440.992, 0.068], [660.286, 1441.26, 0.267], [710.568, 1651.675, 0.41], [567.432, 1395.938, 0.5]]\nC: [[628.289, 1618.572, 0.075], [628.026, 1618.937, 0.252], [627.783, 1619.317, 0.43], [627.525, 1619.686, 0.607]]\nD: [[603.153, 1669.791, 0.074], [606.05, 1635.908, 0.212], [518.765, 1574.758, 0.36], [706.901, 1431.785, 0.612]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_61_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_61_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_61_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_61_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_61_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_61_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_61_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_61_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[330.056, 1212.866, 0.652], [437.866, 986.828, 0.616], [373.47, 964.81, 0.657], [399.422, 
1352.779, 0.635]]\nB: [[376.04, 1070.497, 0.645], [362.804, 1103.987, 0.583], [385.25, 1290.416, 0.524], [372.683, 1131.605, 0.686]]\nC: [[399.012, 1167.878, 0.547], [399.016, 1167.877, 0.567], [399.02, 1167.875, 0.588], [399.024, 1167.873, 0.609]]\nD: [[341.259, 1087.552, 0.588], [364.207, 1061.407, 0.495], [465.16, 1225.627, 0.505], [436.212, 1297.403, 0.597]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[330.056, 1212.866, 0.652], [437.866, 986.828, 0.616], [373.47, 964.81, 0.657], [399.422, 1352.779, 0.635]]\nB: [[376.04, 1070.497, 0.645], [362.804, 1103.987, 0.583], [385.25, 1290.416, 0.524], [372.683, 1131.605, 0.686]]\nC: [[399.012, 1167.878, 0.547], [399.016, 1167.877, 0.567], [399.02, 1167.875, 0.588], [399.024, 1167.873, 0.609]]\nD: [[341.259, 1087.552, 0.588], [364.207, 1061.407, 0.495], [465.16, 1225.627, 0.505], [436.212, 1297.403, 0.597]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_62_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_62_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_62_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_62_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_62_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_62_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_62_6.png", 
"./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_62_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1274.538, 1123.672, 0.2], [1346.842, 824.302, 0.2], [1292.774, 973.997, 0.26], [1054.281, 825.781, 0.247]]\nB: [[1328.749, 1065.512, 0.2], [1175.959, 833.053, 0.21], [1184.952, 824.911, 0.22], [1071.949, 1039.546, 0.188]]\nC: [[1098.549, 1020.422, 0.2], [1497.254, 1019.016, 0.22], [1382.864, 830.543, 0.21], [1100.12, 1050.184, 0.22]]\nD: [[1253.322, 1015.243, 0.2], [1253.424, 1015.978, 0.21], [1253.526, 1016.713, 0.22], [1253.637, 1017.522, 0.231]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1274.538, 1123.672, 0.2], [1346.842, 824.302, 0.2], [1292.774, 973.997, 0.26], [1054.281, 825.781, 0.247]]\nB: [[1328.749, 1065.512, 0.2], [1175.959, 833.053, 0.21], [1184.952, 824.911, 0.22], [1071.949, 1039.546, 0.188]]\nC: [[1098.549, 1020.422, 0.2], [1497.254, 1019.016, 0.22], [1382.864, 830.543, 0.21], [1100.12, 1050.184, 0.22]]\nD: [[1253.322, 1015.243, 0.2], [1253.424, 1015.978, 0.21], [1253.526, 1016.713, 0.22], [1253.637, 1017.522, 0.231]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_63_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_63_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_63_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_63_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_63_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_63_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_63_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_63_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1875.125, 875.415, 0.935], [1875.191, 875.416, 0.935], [1875.193, 875.318, 0.985], [1875.195, 875.264, 0.952]]\nB: [[2028.088, 883.086, 0.815], [1648.373, 903.844, 1.102], [1520.401, 917.022, 1.065], [1867.355, 949.61, 0.806]]\nC: [[1847.629, 795.419, 1.055], [2059.556, 721.579, 0.94], [1862.226, 915.16, 1.105], [1795.239, 771.992, 0.777]]\nD: [[1908.891, 972.995, 0.987], [1782.46, 894.481, 0.828], [1850.807, 745.233, 1.094], [1559.966, 967.549, 0.89]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1875.125, 875.415, 0.935], [1875.191, 875.416, 0.935], [1875.193, 875.318, 0.985], [1875.195, 875.264, 0.952]]\nB: [[2028.088, 883.086, 0.815], [1648.373, 903.844, 1.102], [1520.401, 917.022, 1.065], [1867.355, 949.61, 0.806]]\nC: [[1847.629, 795.419, 1.055], [2059.556, 721.579, 0.94], [1862.226, 915.16, 1.105], [1795.239, 771.992, 0.777]]\nD: [[1908.891, 972.995, 0.987], [1782.46, 894.481, 0.828], [1850.807, 745.233, 1.094], [1559.966, 967.549, 0.89]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_64_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_64_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_64_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_64_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_64_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_64_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_64_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_64_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[582.71, 1411.821, -0.354], [557.0, 1488.19, -0.247], [651.447, 1741.537, -0.237], [520.268, 1329.718, -0.188]]\nB: [[635.447, 1620.546, -0.326], [637.445, 1618.566, -0.238], [639.933, 1616.457, -0.267], [642.736, 1614.065, -0.196]]\nC: [[522.996, 1413.245, -0.379], [659.983, 1928.523, -0.204], 
[766.979, 1315.798, -0.304], [599.616, 1825.248, -0.224]]\nD: [[534.996, 1707.261, -0.292], [614.808, 1704.145, -0.211], [523.563, 1883.81, -0.302], [672.146, 1371.116, -0.218]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[582.71, 1411.821, -0.354], [557.0, 1488.19, -0.247], [651.447, 1741.537, -0.237], [520.268, 1329.718, -0.188]]\nB: [[635.447, 1620.546, -0.326], [637.445, 1618.566, -0.238], [639.933, 1616.457, -0.267], [642.736, 1614.065, -0.196]]\nC: [[522.996, 1413.245, -0.379], [659.983, 1928.523, -0.204], [766.979, 1315.798, -0.304], [599.616, 1825.248, -0.224]]\nD: [[534.996, 1707.261, -0.292], [614.808, 1704.145, -0.211], [523.563, 1883.81, -0.302], [672.146, 1371.116, -0.218]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_65_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_65_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_65_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_65_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_65_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_65_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_65_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_65_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", 
"source": "NuScenes_threed_Object_Tracking", "options": "A: [[1680.561, 2863.302, 2.542], [2169.563, 2794.924, 2.265], [1707.842, 2939.301, 2.152], [1632.945, 2507.032, 1.945]]\nB: [[1547.261, 2391.341, 2.475], [1833.795, 2236.842, 2.065], [1997.767, 2445.687, 1.715], [2056.371, 2356.627, 2.0]]\nC: [[1904.106, 2453.654, 2.215], [1897.838, 2460.219, 2.156], [1892.616, 2465.688, 2.107], [1887.387, 2471.164, 2.057]]\nD: [[2182.661, 2279.405, 1.852], [1930.577, 2763.231, 2.146], [1909.158, 2677.265, 2.297], [1596.829, 2331.093, 2.251]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1680.561, 2863.302, 2.542], [2169.563, 2794.924, 2.265], [1707.842, 2939.301, 2.152], [1632.945, 2507.032, 1.945]]\nB: [[1547.261, 2391.341, 2.475], [1833.795, 2236.842, 2.065], [1997.767, 2445.687, 1.715], [2056.371, 2356.627, 2.0]]\nC: [[1904.106, 2453.654, 2.215], [1897.838, 2460.219, 2.156], [1892.616, 2465.688, 2.107], [1887.387, 2471.164, 2.057]]\nD: [[2182.661, 2279.405, 1.852], [1930.577, 2763.231, 2.146], [1909.158, 2677.265, 2.297], [1596.829, 2331.093, 2.251]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_66_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_66_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_66_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_66_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_66_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_66_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_66_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_66_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[325.896, 989.839, 0.854], [326.517, 1140.353, 0.792], [331.478, 1339.384, 0.825], [382.05, 1348.154, 0.905]]\nB: [[393.357, 1149.173, 0.741], [392.945, 1148.426, 0.766], [392.836, 1148.208, 0.791], [392.641, 1147.242, 0.816]]\nC: [[349.533, 1155.715, 0.667], [378.661, 1084.815, 0.825], [431.355, 1125.036, 0.69], [366.861, 940.522, 0.728]]\nD: [[370.793, 1354.659, 0.611], [315.047, 1147.297, 0.791], [387.351, 947.719, 0.922], [465.223, 1022.515, 0.919]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[325.896, 989.839, 0.854], [326.517, 1140.353, 0.792], [331.478, 1339.384, 0.825], [382.05, 1348.154, 0.905]]\nB: [[393.357, 1149.173, 0.741], [392.945, 1148.426, 0.766], [392.836, 1148.208, 0.791], [392.641, 1147.242, 0.816]]\nC: [[349.533, 1155.715, 0.667], [378.661, 1084.815, 0.825], [431.355, 1125.036, 0.69], [366.861, 940.522, 0.728]]\nD: [[370.793, 1354.659, 0.611], [315.047, 1147.297, 0.791], [387.351, 947.719, 0.922], [465.223, 1022.515, 0.919]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_67_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_67_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_67_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_67_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_67_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_67_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_67_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_67_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[2150.649, 802.858, 0.294], [2285.669, 868.672, 0.285], [1699.227, 714.027, 0.212], [1879.379, 823.249, 0.262]]\nB: [[2057.083, 982.769, 0.286], [2194.778, 989.034, 0.224], [1802.969, 943.191, 0.277], [2004.998, 886.268, 0.304]]\nC: [[1585.591, 914.605, 0.27], [1552.179, 1019.735, 0.316], [1997.522, 
917.351, 0.271], [2167.86, 906.565, 0.376]]\nD: [[1926.398, 878.499, 0.267], [1926.397, 878.517, 0.277], [1926.355, 878.551, 0.259], [1926.373, 878.505, 0.317]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[2150.649, 802.858, 0.294], [2285.669, 868.672, 0.285], [1699.227, 714.027, 0.212], [1879.379, 823.249, 0.262]]\nB: [[2057.083, 982.769, 0.286], [2194.778, 989.034, 0.224], [1802.969, 943.191, 0.277], [2004.998, 886.268, 0.304]]\nC: [[1585.591, 914.605, 0.27], [1552.179, 1019.735, 0.316], [1997.522, 917.351, 0.271], [2167.86, 906.565, 0.376]]\nD: [[1926.398, 878.499, 0.267], [1926.397, 878.517, 0.277], [1926.355, 878.551, 0.259], [1926.373, 878.505, 0.317]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_68_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_68_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_68_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_68_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_68_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_68_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_68_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_68_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[510.539, 1910.445, 0.57], [644.708, 1390.766, 0.487], [528.12, 1465.53, 0.743], [684.489, 1755.997, 0.708]]\nB: [[618.712, 1315.402, 0.58], [707.938, 1490.825, 0.655], [678.962, 1715.92, 0.704], [561.864, 1423.394, 0.869]]\nC: [[644.567, 1930.873, 0.501], [662.364, 1327.012, 0.538], [649.501, 1573.33, 0.633], [563.587, 1732.779, 0.975]]\nD: [[612.719, 1632.142, 0.491], [612.166, 1632.636, 0.566], [611.613, 1633.13, 0.641], [611.127, 1633.567, 0.816]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[510.539, 1910.445, 0.57], [644.708, 1390.766, 0.487], [528.12, 1465.53, 0.743], [684.489, 1755.997, 0.708]]\nB: [[618.712, 1315.402, 0.58], [707.938, 1490.825, 0.655], [678.962, 1715.92, 0.704], [561.864, 1423.394, 0.869]]\nC: [[644.567, 1930.873, 0.501], [662.364, 1327.012, 0.538], [649.501, 1573.33, 0.633], [563.587, 1732.779, 0.975]]\nD: [[612.719, 1632.142, 0.491], [612.166, 1632.636, 0.566], [611.613, 1633.13, 0.641], [611.127, 1633.567, 0.816]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_69_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_69_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_69_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_69_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_69_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_69_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_69_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_69_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[439.78, 1227.349, 1.05], [426.44, 1268.51, 1.02], [325.982, 1240.255, 1.035], [445.556, 1380.946, 1.053]]\nB: [[383.14, 979.995, 1.108], [307.405, 1164.897, 0.89], [394.307, 1087.049, 1.131], [410.908, 1074.81, 0.998]]\nC: [[310.31, 1005.482, 1.062], [353.18, 1020.342, 0.94], [407.431, 1247.448, 1.209], [366.856, 956.543, 1.01]]\nD: [[376.13, 1158.507, 0.938], [376.399, 1159.165, 0.98], [376.667, 1159.822, 1.022], [376.878, 1160.357, 1.013]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. 
The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[439.78, 1227.349, 1.05], [426.44, 1268.51, 1.02], [325.982, 1240.255, 1.035], [445.556, 1380.946, 1.053]]\nB: [[383.14, 979.995, 1.108], [307.405, 1164.897, 0.89], [394.307, 1087.049, 1.131], [410.908, 1074.81, 0.998]]\nC: [[310.31, 1005.482, 1.062], [353.18, 1020.342, 0.94], [407.431, 1247.448, 1.209], [366.856, 956.543, 1.01]]\nD: [[376.13, 1158.507, 0.938], [376.399, 1159.165, 0.98], [376.667, 1159.822, 1.022], [376.878, 1160.357, 1.013]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_70_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_70_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_70_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_70_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_70_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_70_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_70_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_70_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1700.614, 979.18, 0.431], [1688.381, 1025.064, 0.508], [2269.256, 744.117, 0.545], [2084.454, 772.655, 0.492]]\nB: [[2210.59, 1003.05, 0.548], [1727.203, 861.604, 0.538], [1904.192, 830.147, 0.392], [1890.542, 842.708, 0.427]]\nC: [[1895.763, 879.04, 0.501], [1895.752, 879.076, 0.488], [1895.741, 879.112, 0.476], [1895.739, 879.116, 0.464]]\nD: [[1616.91, 819.62, 0.433], [2209.262, 739.792, 0.446], 
[2172.452, 852.21, 0.474], [2247.291, 966.129, 0.485]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1700.614, 979.18, 0.431], [1688.381, 1025.064, 0.508], [2269.256, 744.117, 0.545], [2084.454, 772.655, 0.492]]\nB: [[2210.59, 1003.05, 0.548], [1727.203, 861.604, 0.538], [1904.192, 830.147, 0.392], [1890.542, 842.708, 0.427]]\nC: [[1895.763, 879.04, 0.501], [1895.752, 879.076, 0.488], [1895.741, 879.112, 0.476], [1895.739, 879.116, 0.464]]\nD: [[1616.91, 819.62, 0.433], [2209.262, 739.792, 0.446], [2172.452, 852.21, 0.474], [2247.291, 966.129, 0.485]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_71_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_71_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_71_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_71_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_71_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_71_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_71_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_71_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[492.434, 1780.838, 1.16], [577.631, 1809.358, 1.725], [545.194, 1817.556, 
1.603], [496.388, 1841.178, 2.172]]\nB: [[528.48, 1765.528, 1.61], [545.13, 1628.646, 1.326], [540.257, 1713.355, 1.75], [658.49, 1577.115, 2.147]]\nC: [[560.672, 1608.19, 1.54], [649.701, 1442.289, 1.318], [663.362, 1707.871, 2.131], [530.811, 1383.352, 2.114]]\nD: [[582.374, 1660.997, 1.38], [577.424, 1663.687, 1.585], [572.406, 1666.247, 1.789], [567.347, 1668.872, 2.039]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[492.434, 1780.838, 1.16], [577.631, 1809.358, 1.725], [545.194, 1817.556, 1.603], [496.388, 1841.178, 2.172]]\nB: [[528.48, 1765.528, 1.61], [545.13, 1628.646, 1.326], [540.257, 1713.355, 1.75], [658.49, 1577.115, 2.147]]\nC: [[560.672, 1608.19, 1.54], [649.701, 1442.289, 1.318], [663.362, 1707.871, 2.131], [530.811, 1383.352, 2.114]]\nD: [[582.374, 1660.997, 1.38], [577.424, 1663.687, 1.585], [572.406, 1666.247, 1.789], [567.347, 1668.872, 2.039]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_72_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_72_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_72_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_72_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_72_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_72_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_72_6.png", 
"./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_72_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[752.506, 1329.917, 0.448], [704.23, 1387.5, 0.595], [747.496, 1349.796, 0.742], [584.265, 1616.56, 0.8]]\nB: [[803.078, 1389.349, 0.388], [620.34, 1764.02, 0.522], [772.362, 1430.724, 0.537], [790.895, 1648.104, 0.8]]\nC: [[607.176, 1721.605, 0.459], [602.85, 1587.2, 0.548], [708.169, 1620.023, 0.647], [790.325, 1466.233, 0.7]]\nD: [[672.574, 1595.791, 0.388], [670.89, 1597.24, 0.625], [669.207, 1598.689, 0.663], [667.523, 1600.138, 0.7]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[752.506, 1329.917, 0.448], [704.23, 1387.5, 0.595], [747.496, 1349.796, 0.742], [584.265, 1616.56, 0.8]]\nB: [[803.078, 1389.349, 0.388], [620.34, 1764.02, 0.522], [772.362, 1430.724, 0.537], [790.895, 1648.104, 0.8]]\nC: [[607.176, 1721.605, 0.459], [602.85, 1587.2, 0.548], [708.169, 1620.023, 0.647], [790.325, 1466.233, 0.7]]\nD: [[672.574, 1595.791, 0.388], [670.89, 1597.24, 0.625], [669.207, 1598.689, 0.663], [667.523, 1600.138, 0.7]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_73_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_73_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_73_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_73_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_73_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_73_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_73_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_73_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[430.291, 1175.505, 0.588], [429.167, 1272.84, 0.836], [415.87, 1257.143, 0.882], [372.618, 1001.865, 0.689]]\nB: [[410.066, 1196.767, 0.656], [410.072, 1196.78, 0.706], [410.08, 1196.795, 0.756], [410.101, 1196.811, 0.756]]\nC: [[386.139, 1116.452, 0.72], [464.376, 1221.72, 0.778], [364.35, 1233.741, 0.755], [330.159, 1270.327, 0.687]]\nD: [[409.837, 984.781, 0.668], [440.225, 1048.51, 0.572], [446.2, 1257.02, 0.834], [482.338, 985.937, 0.624]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. 
The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[430.291, 1175.505, 0.588], [429.167, 1272.84, 0.836], [415.87, 1257.143, 0.882], [372.618, 1001.865, 0.689]]\nB: [[410.066, 1196.767, 0.656], [410.072, 1196.78, 0.706], [410.08, 1196.795, 0.756], [410.101, 1196.811, 0.756]]\nC: [[386.139, 1116.452, 0.72], [464.376, 1221.72, 0.778], [364.35, 1233.741, 0.755], [330.159, 1270.327, 0.687]]\nD: [[409.837, 984.781, 0.668], [440.225, 1048.51, 0.572], [446.2, 1257.02, 0.834], [482.338, 985.937, 0.624]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_74_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_74_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_74_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_74_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_74_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_74_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_74_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_74_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[392.514, 1072.298, 0.999], [329.808, 1223.789, 1.084], [463.813, 1302.961, 0.914], [451.609, 1167.4, 1.154]]\nB: [[401.879, 1242.697, 0.728], [341.309, 990.577, 1.02], [450.947, 906.714, 1.117], [328.165, 960.327, 1.095]]\nC: [[342.127, 1107.321, 0.98], [447.285, 926.593, 0.964], [394.127, 898.801, 1.186], [403.682, 1324.015, 1.131]]\nD: [[391.204, 1112.576, 0.863], [391.204, 1112.576, 0.913], 
[391.208, 1112.586, 1.013], [391.212, 1112.595, 0.993]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[392.514, 1072.298, 0.999], [329.808, 1223.789, 1.084], [463.813, 1302.961, 0.914], [451.609, 1167.4, 1.154]]\nB: [[401.879, 1242.697, 0.728], [341.309, 990.577, 1.02], [450.947, 906.714, 1.117], [328.165, 960.327, 1.095]]\nC: [[342.127, 1107.321, 0.98], [447.285, 926.593, 0.964], [394.127, 898.801, 1.186], [403.682, 1324.015, 1.131]]\nD: [[391.204, 1112.576, 0.863], [391.204, 1112.576, 0.913], [391.208, 1112.586, 1.013], [391.212, 1112.595, 0.993]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_75_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_75_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_75_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_75_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_75_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_75_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_75_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_75_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1321.516, 1033.801, 1.008], [1321.517, 1033.801, 1.008], [1321.517, 1033.8, 
1.008], [1321.518, 1033.8, 1.008]]\nB: [[1066.616, 1131.355, 0.921], [1219.6, 1098.492, 0.864], [1282.161, 961.0, 1.081], [1197.931, 1177.0, 1.016]]\nC: [[1190.352, 917.033, 1.028], [1155.143, 1161.133, 1.153], [1394.211, 959.1, 0.834], [1188.323, 1016.1, 1.08]]\nD: [[1067.426, 888.354, 0.843], [1441.448, 1176.105, 0.843], [1087.955, 967.6, 0.966], [1191.947, 906.7, 0.967]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1321.516, 1033.801, 1.008], [1321.517, 1033.801, 1.008], [1321.517, 1033.8, 1.008], [1321.518, 1033.8, 1.008]]\nB: [[1066.616, 1131.355, 0.921], [1219.6, 1098.492, 0.864], [1282.161, 961.0, 1.081], [1197.931, 1177.0, 1.016]]\nC: [[1190.352, 917.033, 1.028], [1155.143, 1161.133, 1.153], [1394.211, 959.1, 0.834], [1188.323, 1016.1, 1.08]]\nD: [[1067.426, 888.354, 0.843], [1441.448, 1176.105, 0.843], [1087.955, 967.6, 0.966], [1191.947, 906.7, 0.967]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_76_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_76_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_76_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_76_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_76_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_76_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_76_6.png", 
"./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_76_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[438.849, 951.711, 0.795], [314.542, 1143.97, 0.648], [362.865, 1209.907, 0.806], [438.018, 933.567, 0.84]]\nB: [[388.688, 1111.433, 0.677], [388.691, 1111.43, 0.695], [388.695, 1111.428, 0.713], [388.698, 1111.426, 0.716]]\nC: [[316.274, 909.79, 0.674], [451.235, 958.84, 0.605], [365.204, 1239.893, 0.608], [426.654, 1268.736, 0.816]]\nD: [[452.526, 1172.998, 0.585], [454.014, 1000.44, 0.724], [336.213, 1132.703, 0.811], [313.791, 1218.829, 0.612]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[438.849, 951.711, 0.795], [314.542, 1143.97, 0.648], [362.865, 1209.907, 0.806], [438.018, 933.567, 0.84]]\nB: [[388.688, 1111.433, 0.677], [388.691, 1111.43, 0.695], [388.695, 1111.428, 0.713], [388.698, 1111.426, 0.716]]\nC: [[316.274, 909.79, 0.674], [451.235, 958.84, 0.605], [365.204, 1239.893, 0.608], [426.654, 1268.736, 0.816]]\nD: [[452.526, 1172.998, 0.585], [454.014, 1000.44, 0.724], [336.213, 1132.703, 0.811], [313.791, 1218.829, 0.612]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_77_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_77_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_77_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_77_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_77_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_77_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_77_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_77_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[625.166, 1621.071, -0.136], [624.733, 1621.463, -0.074], [624.328, 1621.884, -0.011], [624.034, 1622.205, 0.176]]\nB: [[673.606, 1641.602, -0.151], [650.83, 1385.101, -0.066], [545.785, 1758.678, -0.013], [623.635, 1668.753, 0.151]]\nC: [[612.227, 1304.459, -0.119], [594.602, 1728.678, -0.08], [714.884, 1584.229, -0.012], [716.369, 1325.064, 0.2]]\nD: [[677.007, 1319.892, -0.145], [590.033, 1617.266, -0.079], [508.485, 1809.42, -0.012], [584.009, 1851.902, 0.196]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[625.166, 1621.071, -0.136], [624.733, 1621.463, -0.074], [624.328, 1621.884, -0.011], [624.034, 1622.205, 0.176]]\nB: [[673.606, 1641.602, -0.151], [650.83, 1385.101, -0.066], [545.785, 1758.678, -0.013], [623.635, 1668.753, 0.151]]\nC: [[612.227, 1304.459, -0.119], [594.602, 1728.678, -0.08], [714.884, 1584.229, -0.012], [716.369, 1325.064, 0.2]]\nD: [[677.007, 1319.892, -0.145], [590.033, 1617.266, -0.079], [508.485, 1809.42, -0.012], [584.009, 1851.902, 0.196]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_78_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_78_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_78_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_78_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_78_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_78_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_78_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_78_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[500.254, 1116.383, 0.109], [528.871, 1208.17, 0.09], [370.111, 1189.599, 0.128], [490.724, 1065.954, 0.15]]\nB: [[424.937, 1288.953, 0.124], [494.735, 1135.49, 0.095], [377.247, 1056.094, 0.129], [454.303, 1168.649, 0.16]]\nC: [[445.198, 1091.608, 0.107], [445.269, 1091.74, 0.084], 
[445.269, 1091.738, 0.117], [445.269, 1091.735, 0.15]]\nD: [[518.8, 1113.376, 0.123], [424.179, 929.73, 0.097], [415.562, 1089.363, 0.113], [387.889, 1032.784, 0.17]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[500.254, 1116.383, 0.109], [528.871, 1208.17, 0.09], [370.111, 1189.599, 0.128], [490.724, 1065.954, 0.15]]\nB: [[424.937, 1288.953, 0.124], [494.735, 1135.49, 0.095], [377.247, 1056.094, 0.129], [454.303, 1168.649, 0.16]]\nC: [[445.198, 1091.608, 0.107], [445.269, 1091.74, 0.084], [445.269, 1091.738, 0.117], [445.269, 1091.735, 0.15]]\nD: [[518.8, 1113.376, 0.123], [424.179, 929.73, 0.097], [415.562, 1089.363, 0.113], [387.889, 1032.784, 0.17]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_79_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_79_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_79_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_79_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_79_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_79_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_79_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_79_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[1484.746, 911.994, 1.171], [1137.15, 1003.822, 1.118], [1492.05, 930.243, 1.121], [1124.198, 1141.212, 1.261]]\nB: [[1219.797, 981.822, 1.234], [1419.86, 1093.926, 0.969], [1395.832, 917.571, 1.104], [1330.08, 1062.03, 1.216]]\nC: [[1453.95, 988.704, 0.898], [1551.29, 1210.843, 1.28], [1428.034, 1104.909, 1.233], [1371.047, 908.624, 1.137]]\nD: [[1328.982, 1049.561, 1.089], [1328.99, 1049.562, 1.089], [1328.997, 1049.563, 1.089], [1329.005, 1049.565, 1.089]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1484.746, 911.994, 1.171], [1137.15, 1003.822, 1.118], [1492.05, 930.243, 1.121], [1124.198, 1141.212, 1.261]]\nB: [[1219.797, 981.822, 1.234], [1419.86, 1093.926, 0.969], [1395.832, 917.571, 1.104], [1330.08, 1062.03, 1.216]]\nC: [[1453.95, 988.704, 0.898], [1551.29, 1210.843, 1.28], [1428.034, 1104.909, 1.233], [1371.047, 908.624, 1.137]]\nD: [[1328.982, 1049.561, 1.089], [1328.99, 1049.562, 1.089], [1328.997, 1049.563, 1.089], [1329.005, 1049.565, 1.089]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_80_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_80_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_80_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_80_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_80_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_80_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_80_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_80_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[456.432, 1282.157, 0.891], [479.945, 1042.78, 0.822], [407.218, 1084.054, 0.7], [446.375, 997.916, 0.683]]\nB: [[462.967, 894.315, 0.986], [392.745, 966.59, 0.805], [391.102, 1018.399, 0.622], [493.885, 1286.081, 0.965]]\nC: [[466.287, 926.158, 0.882], [352.099, 1212.35, 0.658], [429.631, 1077.672, 0.822], [411.455, 1150.981, 0.801]]\nD: [[430.242, 1089.779, 1.026], [430.279, 1089.87, 0.776], [430.299, 1089.898, 0.776], [430.321, 1089.952, 0.817]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[456.432, 1282.157, 0.891], [479.945, 1042.78, 0.822], [407.218, 1084.054, 0.7], [446.375, 997.916, 0.683]]\nB: [[462.967, 894.315, 0.986], [392.745, 966.59, 0.805], [391.102, 1018.399, 0.622], [493.885, 1286.081, 0.965]]\nC: [[466.287, 926.158, 0.882], [352.099, 1212.35, 0.658], [429.631, 1077.672, 0.822], [411.455, 1150.981, 0.801]]\nD: [[430.242, 1089.779, 1.026], [430.279, 1089.87, 0.776], [430.299, 1089.898, 0.776], [430.321, 1089.952, 0.817]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_81_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_81_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_81_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_81_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_81_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_81_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_81_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_81_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1945.526, 876.296, 0.419], [1945.526, 876.242, 0.469], [1945.526, 876.177, 0.469], [1945.526, 876.26, 0.519]]\nB: [[2158.266, 800.911, 0.453], [2106.839, 1043.586, 0.501], [2049.112, 832.682, 0.434], [2030.483, 957.49, 0.61]]\nC: [[2028.562, 929.081, 0.457], [1728.295, 771.666, 0.406], [2125.198, 
983.306, 0.535], [2151.856, 925.1, 0.483]]\nD: [[2333.669, 1007.109, 0.449], [1683.52, 730.695, 0.511], [2240.73, 776.757, 0.511], [1717.598, 731.99, 0.548]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1945.526, 876.296, 0.419], [1945.526, 876.242, 0.469], [1945.526, 876.177, 0.469], [1945.526, 876.26, 0.519]]\nB: [[2158.266, 800.911, 0.453], [2106.839, 1043.586, 0.501], [2049.112, 832.682, 0.434], [2030.483, 957.49, 0.61]]\nC: [[2028.562, 929.081, 0.457], [1728.295, 771.666, 0.406], [2125.198, 983.306, 0.535], [2151.856, 925.1, 0.483]]\nD: [[2333.669, 1007.109, 0.449], [1683.52, 730.695, 0.511], [2240.73, 776.757, 0.511], [1717.598, 731.99, 0.548]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_82_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_82_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_82_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_82_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_82_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_82_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_82_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_82_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[1937.491, 914.639, 0.22], [1901.969, 968.51, 0.273], [1716.987, 958.003, 0.262], [1808.463, 962.345, 0.288]]\nB: [[2007.196, 817.175, 0.254], [1773.432, 871.003, 0.284], [1939.918, 1002.574, 0.304], [2252.629, 811.34, 0.262]]\nC: [[2285.927, 936.67, 0.263], [1604.253, 825.974, 0.25], [2118.153, 905.274, 0.26], [1884.079, 918.838, 0.296]]\nD: [[1926.631, 877.571, 0.228], [1926.631, 877.571, 0.252], [1926.626, 877.593, 0.255], [1926.638, 877.538, 0.303]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1937.491, 914.639, 0.22], [1901.969, 968.51, 0.273], [1716.987, 958.003, 0.262], [1808.463, 962.345, 0.288]]\nB: [[2007.196, 817.175, 0.254], [1773.432, 871.003, 0.284], [1939.918, 1002.574, 0.304], [2252.629, 811.34, 0.262]]\nC: [[2285.927, 936.67, 0.263], [1604.253, 825.974, 0.25], [2118.153, 905.274, 0.26], [1884.079, 918.838, 0.296]]\nD: [[1926.631, 877.571, 0.228], [1926.631, 877.571, 0.252], [1926.626, 877.593, 0.255], [1926.638, 877.538, 0.303]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_83_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_83_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_83_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_83_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_83_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_83_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_83_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_83_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[680.728, 1749.077, -0.375], [745.701, 1474.694, -0.299], [610.56, 1380.492, -0.27], [635.92, 1915.689, -0.136]]\nB: [[660.851, 1604.404, -0.423], [657.771, 1607.079, -0.332], [654.69, 1609.754, -0.24], [651.61, 1612.428, -0.148]]\nC: [[647.562, 1445.984, -0.429], [659.321, 1909.729, -0.283], [754.69, 1382.093, -0.26], [549.55, 1888.817, -0.122]]\nD: [[751.514, 1476.27, -0.49], [654.016, 1488.662, -0.332], [753.33, 1931.072, -0.2], [574.93, 1792.093, -0.171]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[680.728, 1749.077, -0.375], [745.701, 1474.694, -0.299], [610.56, 1380.492, -0.27], [635.92, 1915.689, -0.136]]\nB: [[660.851, 1604.404, -0.423], [657.771, 1607.079, -0.332], [654.69, 1609.754, -0.24], [651.61, 1612.428, -0.148]]\nC: [[647.562, 1445.984, -0.429], [659.321, 1909.729, -0.283], [754.69, 1382.093, -0.26], [549.55, 1888.817, -0.122]]\nD: [[751.514, 1476.27, -0.49], [654.016, 1488.662, -0.332], [753.33, 1931.072, -0.2], [574.93, 1792.093, -0.171]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_84_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_84_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_84_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_84_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_84_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_84_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_84_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_84_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[356.068, 1144.504, 0.82], [356.64, 1144.191, 0.795], [358.929, 1142.941, 0.822], [359.501, 1142.629, 0.839]]\nB: [[401.199, 1094.551, 0.83], [308.99, 1334.228, 0.943], [415.452, 921.574, 0.753], [392.805, 1225.338, 0.965]]\nC: [[395.4, 1321.544, 0.95], [322.87, 1045.667, 0.91], [342.828, 
1295.35, 0.695], [397.067, 940.796, 0.768]]\nD: [[418.406, 1138.796, 0.82], [416.34, 1311.233, 0.684], [355.451, 1305.707, 0.882], [410.239, 1120.033, 0.971]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[356.068, 1144.504, 0.82], [356.64, 1144.191, 0.795], [358.929, 1142.941, 0.822], [359.501, 1142.629, 0.839]]\nB: [[401.199, 1094.551, 0.83], [308.99, 1334.228, 0.943], [415.452, 921.574, 0.753], [392.805, 1225.338, 0.965]]\nC: [[395.4, 1321.544, 0.95], [322.87, 1045.667, 0.91], [342.828, 1295.35, 0.695], [397.067, 940.796, 0.768]]\nD: [[418.406, 1138.796, 0.82], [416.34, 1311.233, 0.684], [355.451, 1305.707, 0.882], [410.239, 1120.033, 0.971]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_85_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_85_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_85_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_85_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_85_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_85_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_85_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_85_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", 
"options": "A: [[1753.666, 1018.404, 0.365], [1524.0, 883.794, 0.327], [2021.935, 879.24, 0.307], [1882.989, 791.594, 0.35]]\nB: [[2170.362, 809.726, 0.373], [2168.605, 703.253, 0.314], [1918.642, 995.58, 0.329], [1602.549, 910.935, 0.29]]\nC: [[2178.785, 988.248, 0.285], [2227.998, 705.37, 0.287], [1566.17, 877.23, 0.318], [1931.258, 826.324, 0.31]]\nD: [[1902.434, 878.055, 0.343], [1902.434, 878.055, 0.293], [1902.429, 878.07, 0.302], [1902.423, 878.086, 0.31]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1753.666, 1018.404, 0.365], [1524.0, 883.794, 0.327], [2021.935, 879.24, 0.307], [1882.989, 791.594, 0.35]]\nB: [[2170.362, 809.726, 0.373], [2168.605, 703.253, 0.314], [1918.642, 995.58, 0.329], [1602.549, 910.935, 0.29]]\nC: [[2178.785, 988.248, 0.285], [2227.998, 705.37, 0.287], [1566.17, 877.23, 0.318], [1931.258, 826.324, 0.31]]\nD: [[1902.434, 878.055, 0.343], [1902.434, 878.055, 0.293], [1902.429, 878.07, 0.302], [1902.423, 878.086, 0.31]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_86_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_86_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_86_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_86_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_86_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_86_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_86_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_86_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[378.922, 1013.673, 0.734], [494.488, 1078.797, 0.456], [393.735, 912.986, 0.581], [419.965, 958.358, 0.767]]\nB: [[433.059, 1088.732, 0.713], [433.043, 1088.668, 0.553], [433.039, 1088.652, 0.513], [433.055, 1088.681, 0.703]]\nC: [[426.779, 1210.184, 0.683], [374.724, 1199.914, 0.579], [356.893, 998.508, 0.494], [512.758, 1067.304, 0.691]]\nD: [[351.961, 935.844, 0.571], [386.254, 1200.68, 0.61], [480.449, 1191.815, 0.483], [412.037, 930.978, 0.833]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[378.922, 1013.673, 0.734], [494.488, 1078.797, 0.456], [393.735, 912.986, 0.581], [419.965, 958.358, 0.767]]\nB: [[433.059, 1088.732, 0.713], [433.043, 1088.668, 0.553], [433.039, 1088.652, 0.513], [433.055, 1088.681, 0.703]]\nC: [[426.779, 1210.184, 0.683], [374.724, 1199.914, 0.579], [356.893, 998.508, 0.494], [512.758, 1067.304, 0.691]]\nD: [[351.961, 935.844, 0.571], [386.254, 1200.68, 0.61], [480.449, 1191.815, 0.483], [412.037, 930.978, 0.833]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_87_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_87_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_87_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_87_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_87_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_87_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_87_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_87_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[2045.551, 727.654, 1.102], [1833.302, 769.345, 1.055], [1827.43, 963.776, 1.306], [1702.867, 738.416, 0.98]]\nB: [[1661.503, 737.862, 0.962], [1861.333, 965.185, 0.908], [1821.87, 992.03, 0.919], [2075.341, 874.239, 1.077]]\nC: [[1741.077, 864.895, 1.109], [1745.181, 865.139, 1.105], [1748.91, 
865.361, 1.102], [1752.336, 865.549, 1.096]]\nD: [[1791.757, 846.823, 0.925], [1982.741, 721.703, 1.059], [1406.89, 829.542, 1.078], [1577.41, 813.096, 1.053]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[2045.551, 727.654, 1.102], [1833.302, 769.345, 1.055], [1827.43, 963.776, 1.306], [1702.867, 738.416, 0.98]]\nB: [[1661.503, 737.862, 0.962], [1861.333, 965.185, 0.908], [1821.87, 992.03, 0.919], [2075.341, 874.239, 1.077]]\nC: [[1741.077, 864.895, 1.109], [1745.181, 865.139, 1.105], [1748.91, 865.361, 1.102], [1752.336, 865.549, 1.096]]\nD: [[1791.757, 846.823, 0.925], [1982.741, 721.703, 1.059], [1406.89, 829.542, 1.078], [1577.41, 813.096, 1.053]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_88_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_88_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_88_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_88_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_88_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_88_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_88_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_88_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[505.05, 877.226, 0.203], [445.953, 1244.021, 0.202], [392.706, 1145.406, 0.277], [366.753, 1183.669, 0.29]]\nB: [[457.03, 1257.26, 0.228], [463.22, 1274.147, 0.225], [370.296, 997.948, 0.259], [365.28, 1022.072, 0.29]]\nC: [[434.02, 1096.492, 0.241], [434.019, 1096.492, 0.222], [434.019, 1096.492, 0.236], [434.018, 1096.493, 0.25]]\nD: [[422.09, 929.091, 0.197], [407.921, 1195.795, 0.198], [469.259, 1267.695, 0.23], [354.798, 1155.602, 0.26]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[505.05, 877.226, 0.203], [445.953, 1244.021, 0.202], [392.706, 1145.406, 0.277], [366.753, 1183.669, 0.29]]\nB: [[457.03, 1257.26, 0.228], [463.22, 1274.147, 0.225], [370.296, 997.948, 0.259], [365.28, 1022.072, 0.29]]\nC: [[434.02, 1096.492, 0.241], [434.019, 1096.492, 0.222], [434.019, 1096.492, 0.236], [434.018, 1096.493, 0.25]]\nD: [[422.09, 929.091, 0.197], [407.921, 1195.795, 0.198], [469.259, 1267.695, 0.23], [354.798, 1155.602, 0.26]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_89_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_89_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_89_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_89_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_89_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_89_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_89_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_89_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[382.439, 944.308, 1.097], [282.072, 1346.475, 1.314], [317.768, 1056.456, 0.981], [328.142, 1067.671, 0.969]]\nB: [[348.689, 1130.152, 1.122], [348.689, 1130.152, 1.122], [348.689, 1130.152, 1.122], [348.689, 1130.152, 1.122]]\nC: [[290.351, 1111.854, 1.019], [383.245, 975.676, 1.11], [292.501, 1319.267, 0.953], [293.662, 1130.698, 0.975]]\nD: [[344.597, 1317.41, 1.004], [391.599, 1063.24, 1.128], [415.864, 1014.121, 0.9], [383.217, 1223.267, 1.321]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. 
The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[382.439, 944.308, 1.097], [282.072, 1346.475, 1.314], [317.768, 1056.456, 0.981], [328.142, 1067.671, 0.969]]\nB: [[348.689, 1130.152, 1.122], [348.689, 1130.152, 1.122], [348.689, 1130.152, 1.122], [348.689, 1130.152, 1.122]]\nC: [[290.351, 1111.854, 1.019], [383.245, 975.676, 1.11], [292.501, 1319.267, 0.953], [293.662, 1130.698, 0.975]]\nD: [[344.597, 1317.41, 1.004], [391.599, 1063.24, 1.128], [415.864, 1014.121, 0.9], [383.217, 1223.267, 1.321]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_90_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_90_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_90_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_90_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_90_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_90_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_90_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_90_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[445.635, 1070.359, 0.563], [380.207, 1209.252, 0.55], [503.803, 1202.552, 0.618], [402.841, 878.242, 0.916]]\nB: [[355.09, 926.566, 0.532], [425.031, 931.852, 0.428], [396.378, 1283.26, 0.637], [419.782, 1021.681, 0.797]]\nC: [[353.198, 1052.673, 0.552], [387.581, 1075.215, 0.55], [453.134, 889.143, 0.766], [432.989, 976.738, 0.712]]\nD: [[435.434, 1087.782, 0.612], [435.403, 1087.706, 
0.533], [435.405, 1087.711, 0.695], [435.407, 1087.716, 0.846]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[445.635, 1070.359, 0.563], [380.207, 1209.252, 0.55], [503.803, 1202.552, 0.618], [402.841, 878.242, 0.916]]\nB: [[355.09, 926.566, 0.532], [425.031, 931.852, 0.428], [396.378, 1283.26, 0.637], [419.782, 1021.681, 0.797]]\nC: [[353.198, 1052.673, 0.552], [387.581, 1075.215, 0.55], [453.134, 889.143, 0.766], [432.989, 976.738, 0.712]]\nD: [[435.434, 1087.782, 0.612], [435.403, 1087.706, 0.533], [435.405, 1087.711, 0.695], [435.407, 1087.716, 0.846]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_91_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_91_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_91_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_91_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_91_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_91_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_91_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_91_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1858.977, 3005.32, 0.085], [1613.167, 2545.921, 0.166], [1534.084, 
2595.697, 0.482], [1872.638, 2228.595, 0.519]]\nB: [[1516.054, 2338.945, 0.097], [1891.297, 2428.151, 0.236], [1796.827, 2149.677, 0.559], [1658.932, 2766.556, 0.382]]\nC: [[2005.599, 2965.349, 0.12], [1626.186, 2645.937, 0.181], [1937.717, 2253.069, 0.541], [1779.108, 2893.005, 0.435]]\nD: [[1824.199, 2571.318, 0.101], [1824.516, 2570.899, 0.205], [1825.469, 2569.639, 0.518], [1825.787, 2569.219, 0.434]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1858.977, 3005.32, 0.085], [1613.167, 2545.921, 0.166], [1534.084, 2595.697, 0.482], [1872.638, 2228.595, 0.519]]\nB: [[1516.054, 2338.945, 0.097], [1891.297, 2428.151, 0.236], [1796.827, 2149.677, 0.559], [1658.932, 2766.556, 0.382]]\nC: [[2005.599, 2965.349, 0.12], [1626.186, 2645.937, 0.181], [1937.717, 2253.069, 0.541], [1779.108, 2893.005, 0.435]]\nD: [[1824.199, 2571.318, 0.101], [1824.516, 2570.899, 0.205], [1825.469, 2569.639, 0.518], [1825.787, 2569.219, 0.434]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_92_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_92_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_92_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_92_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_92_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_92_5.png", 
"./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_92_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_92_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[598.103, 1642.075, 1.029], [598.744, 1641.829, 1.029], [599.384, 1641.583, 1.029], [600.026, 1641.338, 1.179]]\nB: [[701.368, 1416.38, 0.896], [530.801, 1778.726, 1.056], [579.309, 1558.364, 0.977], [683.542, 1838.774, 1.325]]\nC: [[715.233, 1896.029, 0.968], [530.58, 1520.538, 0.944], [596.209, 1472.502, 0.856], [536.626, 1453.346, 1.289]]\nD: [[626.221, 1751.17, 1.049], [568.701, 1547.296, 1.076], [640.532, 1458.354, 1.122], [626.284, 1959.943, 1.094]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[598.103, 1642.075, 1.029], [598.744, 1641.829, 1.029], [599.384, 1641.583, 1.029], [600.026, 1641.338, 1.179]]\nB: [[701.368, 1416.38, 0.896], [530.801, 1778.726, 1.056], [579.309, 1558.364, 0.977], [683.542, 1838.774, 1.325]]\nC: [[715.233, 1896.029, 0.968], [530.58, 1520.538, 0.944], [596.209, 1472.502, 0.856], [536.626, 1453.346, 1.289]]\nD: [[626.221, 1751.17, 1.049], [568.701, 1547.296, 1.076], [640.532, 1458.354, 1.122], [626.284, 1959.943, 1.094]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_93_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_93_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_93_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_93_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_93_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_93_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_93_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_93_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[413.954, 1081.236, 0.717], [451.481, 1023.032, 0.495], [397.92, 951.148, 0.604], [328.441, 1181.445, 0.598]]\nB: [[457.154, 1119.531, 0.668], [402.874, 923.594, 0.435], [447.684, 1012.752, 0.547], [341.799, 1237.225, 0.728]]\nC: [[354.36, 1174.911, 0.52], [371.321, 1042.76, 0.471], [448.856, 1142.068, 0.628], [427.925, 1261.83, 0.519]]\nD: [[389.399, 1112.311, 0.629], [389.356, 1112.334, 0.529], [389.379, 1112.321, 0.579], [389.403, 1112.309, 0.629]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[413.954, 1081.236, 0.717], [451.481, 1023.032, 0.495], [397.92, 951.148, 0.604], [328.441, 1181.445, 0.598]]\nB: [[457.154, 1119.531, 0.668], [402.874, 923.594, 0.435], [447.684, 1012.752, 0.547], [341.799, 1237.225, 0.728]]\nC: [[354.36, 1174.911, 0.52], [371.321, 1042.76, 0.471], [448.856, 1142.068, 0.628], [427.925, 1261.83, 0.519]]\nD: [[389.399, 1112.311, 0.629], [389.356, 1112.334, 0.529], [389.379, 1112.321, 0.579], [389.403, 1112.309, 0.629]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_94_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_94_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_94_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_94_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_94_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_94_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_94_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_94_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[611.892, 1806.578, 0.268], [743.567, 1822.952, 0.264], [555.945, 1619.632, 0.282], [736.456, 1610.791, 0.275]]\nB: [[647.522, 1603.835, 0.243], [647.522, 1603.835, 0.293], [647.522, 1603.835, 0.318], [647.522, 1603.835, 0.343]]\nC: [[613.269, 1753.144, 0.259], [701.513, 1670.735, 0.335], [698.245, 
1622.138, 0.321], [547.954, 1706.296, 0.366]]\nD: [[518.452, 1481.659, 0.23], [579.958, 1410.188, 0.243], [523.377, 1912.789, 0.276], [661.654, 1701.9, 0.34]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[611.892, 1806.578, 0.268], [743.567, 1822.952, 0.264], [555.945, 1619.632, 0.282], [736.456, 1610.791, 0.275]]\nB: [[647.522, 1603.835, 0.243], [647.522, 1603.835, 0.293], [647.522, 1603.835, 0.318], [647.522, 1603.835, 0.343]]\nC: [[613.269, 1753.144, 0.259], [701.513, 1670.735, 0.335], [698.245, 1622.138, 0.321], [547.954, 1706.296, 0.366]]\nD: [[518.452, 1481.659, 0.23], [579.958, 1410.188, 0.243], [523.377, 1912.789, 0.276], [661.654, 1701.9, 0.34]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_95_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_95_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_95_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_95_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_95_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_95_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_95_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_95_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[631.815, 1636.973, 0.074], [631.58, 1636.905, 0.174], [631.313, 1636.901, 0.224], [631.183, 1636.842, 0.29]]\nB: [[559.113, 1943.842, 0.076], [518.03, 1864.19, 0.151], [546.229, 1683.354, 0.205], [539.475, 1389.243, 0.24]]\nC: [[689.175, 1624.485, 0.066], [688.09, 1571.158, 0.178], [563.905, 1790.085, 0.19], [581.151, 1421.06, 0.29]]\nD: [[705.525, 1667.251, 0.065], [604.36, 1921.35, 0.181], [722.147, 1476.341, 0.225], [572.745, 1584.256, 0.3]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[631.815, 1636.973, 0.074], [631.58, 1636.905, 0.174], [631.313, 1636.901, 0.224], [631.183, 1636.842, 0.29]]\nB: [[559.113, 1943.842, 0.076], [518.03, 1864.19, 0.151], [546.229, 1683.354, 0.205], [539.475, 1389.243, 0.24]]\nC: [[689.175, 1624.485, 0.066], [688.09, 1571.158, 0.178], [563.905, 1790.085, 0.19], [581.151, 1421.06, 0.29]]\nD: [[705.525, 1667.251, 0.065], [604.36, 1921.35, 0.181], [722.147, 1476.341, 0.225], [572.745, 1584.256, 0.3]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_96_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_96_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_96_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_96_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_96_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_96_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_96_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_96_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1782.298, 752.391, 0.81], [2097.954, 882.859, 0.769], [1957.145, 911.959, 1.057], [1675.514, 745.849, 0.876]]\nB: [[1869.593, 872.653, 0.94], [1517.943, 982.023, 0.755], [2107.729, 753.584, 0.782], [1585.194, 934.333, 0.784]]\nC: [[1792.225, 846.971, 0.887], [1792.225, 846.971, 0.887], [1792.225, 846.971, 0.887], [1792.225, 846.971, 0.887]]\nD: [[1767.979, 682.569, 0.962], [2081.075, 907.416, 0.768], [2002.738, 790.434, 0.955], [1720.834, 852.507, 1.032]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1782.298, 752.391, 0.81], [2097.954, 882.859, 0.769], [1957.145, 911.959, 1.057], [1675.514, 745.849, 0.876]]\nB: [[1869.593, 872.653, 0.94], [1517.943, 982.023, 0.755], [2107.729, 753.584, 0.782], [1585.194, 934.333, 0.784]]\nC: [[1792.225, 846.971, 0.887], [1792.225, 846.971, 0.887], [1792.225, 846.971, 0.887], [1792.225, 846.971, 0.887]]\nD: [[1767.979, 682.569, 0.962], [2081.075, 907.416, 0.768], [2002.738, 790.434, 0.955], [1720.834, 852.507, 1.032]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_97_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_97_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_97_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_97_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_97_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_97_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_97_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_97_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[724.326, 1495.604, -0.327], [676.702, 1884.979, 0.087], [764.302, 1335.092, 0.381], [548.436, 1731.853, 0.371]]\nB: [[651.934, 1624.096, -0.297], [652.686, 1623.474, 0.103], [653.181, 1623.053, 0.328], [653.687, 1622.622, 0.353]]\nC: [[607.922, 1820.17, -0.295], [753.684, 1433.034, 0.106], 
[678.243, 1447.392, 0.296], [570.652, 1420.87, 0.385]]\nD: [[751.414, 1685.804, -0.244], [578.476, 1684.213, 0.096], [590.679, 1441.453, 0.28], [536.506, 1378.646, 0.32]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[724.326, 1495.604, -0.327], [676.702, 1884.979, 0.087], [764.302, 1335.092, 0.381], [548.436, 1731.853, 0.371]]\nB: [[651.934, 1624.096, -0.297], [652.686, 1623.474, 0.103], [653.181, 1623.053, 0.328], [653.687, 1622.622, 0.353]]\nC: [[607.922, 1820.17, -0.295], [753.684, 1433.034, 0.106], [678.243, 1447.392, 0.296], [570.652, 1420.87, 0.385]]\nD: [[751.414, 1685.804, -0.244], [578.476, 1684.213, 0.096], [590.679, 1441.453, 0.28], [536.506, 1378.646, 0.32]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_98_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_98_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_98_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_98_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_98_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_98_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_98_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_98_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[1335.014, 862.316, 0.287], [1100.87, 1037.094, 0.23], [1485.551, 931.776, 0.26], [1291.897, 1074.94, 0.272]]\nB: [[1414.862, 952.185, 0.246], [1191.79, 1180.934, 0.227], [1337.485, 931.666, 0.225], [1205.041, 976.826, 0.257]]\nC: [[1365.108, 1014.952, 0.254], [1365.101, 1014.929, 0.254], [1365.094, 1014.907, 0.254], [1365.086, 1014.885, 0.254]]\nD: [[1286.094, 1146.653, 0.233], [1369.879, 1146.619, 0.278], [1377.756, 963.077, 0.259], [1621.927, 877.672, 0.256]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1335.014, 862.316, 0.287], [1100.87, 1037.094, 0.23], [1485.551, 931.776, 0.26], [1291.897, 1074.94, 0.272]]\nB: [[1414.862, 952.185, 0.246], [1191.79, 1180.934, 0.227], [1337.485, 931.666, 0.225], [1205.041, 976.826, 0.257]]\nC: [[1365.108, 1014.952, 0.254], [1365.101, 1014.929, 0.254], [1365.094, 1014.907, 0.254], [1365.086, 1014.885, 0.254]]\nD: [[1286.094, 1146.653, 0.233], [1369.879, 1146.619, 0.278], [1377.756, 963.077, 0.259], [1621.927, 877.672, 0.256]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_99_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_99_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_99_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_99_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_99_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_99_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_99_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_99_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[365.047, 598.348, 1.241], [286.54, 738.564, 1.05], [387.446, 604.928, 1.198], [391.006, 671.391, 1.378]]\nB: [[377.966, 628.41, 1.434], [361.11, 614.334, 1.35], [284.927, 755.854, 1.174], [403.902, 539.249, 1.302]]\nC: [[341.337, 715.12, 1.189], [372.63, 619.563, 1.39], [402.819, 670.746, 1.313], [340.745, 536.458, 1.343]]\nD: [[345.848, 655.799, 1.196], [343.13, 656.562, 1.18], [340.412, 657.325, 1.165], [337.693, 658.088, 1.149]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. 
The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[365.047, 598.348, 1.241], [286.54, 738.564, 1.05], [387.446, 604.928, 1.198], [391.006, 671.391, 1.378]]\nB: [[377.966, 628.41, 1.434], [361.11, 614.334, 1.35], [284.927, 755.854, 1.174], [403.902, 539.249, 1.302]]\nC: [[341.337, 715.12, 1.189], [372.63, 619.563, 1.39], [402.819, 670.746, 1.313], [340.745, 536.458, 1.343]]\nD: [[345.848, 655.799, 1.196], [343.13, 656.562, 1.18], [340.412, 657.325, 1.165], [337.693, 658.088, 1.149]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_100_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_100_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_100_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_100_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_100_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_100_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_100_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_100_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[423.365, 1081.357, 1.92], [358.32, 1065.751, 2.221], [393.86, 1013.258, 1.856], [426.12, 1121.332, 2.203]]\nB: [[374.803, 1125.969, 1.58], [323.517, 1358.518, 1.869], [421.71, 1325.966, 2.205], [374.867, 1307.159, 2.25]]\nC: [[345.831, 1321.961, 2.14], [438.53, 1243.074, 1.588], [406.44, 1418.879, 2.198], [347.216, 1381.306, 1.978]]\nD: [[382.736, 1209.839, 1.88], [383.093, 1209.198, 1.931], [383.45, 
1208.557, 1.982], [383.786, 1207.915, 1.982]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[423.365, 1081.357, 1.92], [358.32, 1065.751, 2.221], [393.86, 1013.258, 1.856], [426.12, 1121.332, 2.203]]\nB: [[374.803, 1125.969, 1.58], [323.517, 1358.518, 1.869], [421.71, 1325.966, 2.205], [374.867, 1307.159, 2.25]]\nC: [[345.831, 1321.961, 2.14], [438.53, 1243.074, 1.588], [406.44, 1418.879, 2.198], [347.216, 1381.306, 1.978]]\nD: [[382.736, 1209.839, 1.88], [383.093, 1209.198, 1.931], [383.45, 1208.557, 1.982], [383.786, 1207.915, 1.982]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_101_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_101_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_101_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_101_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_101_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_101_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_101_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_101_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[633.909, 1608.489, 1.038], [670.109, 1891.19, 1.125], [522.041, 1396.622, 1.098], 
[541.68, 1745.168, 1.117]]\nB: [[635.642, 1415.988, 1.081], [633.621, 1654.57, 0.946], [601.731, 1438.83, 1.36], [652.901, 1593.526, 1.066]]\nC: [[555.337, 1356.247, 1.199], [627.708, 1668.4, 0.904], [707.793, 1894.062, 1.109], [480.816, 1651.213, 1.309]]\nD: [[583.549, 1656.391, 1.267], [587.422, 1654.32, 1.126], [591.257, 1652.222, 1.146], [594.995, 1650.206, 1.166]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[633.909, 1608.489, 1.038], [670.109, 1891.19, 1.125], [522.041, 1396.622, 1.098], [541.68, 1745.168, 1.117]]\nB: [[635.642, 1415.988, 1.081], [633.621, 1654.57, 0.946], [601.731, 1438.83, 1.36], [652.901, 1593.526, 1.066]]\nC: [[555.337, 1356.247, 1.199], [627.708, 1668.4, 0.904], [707.793, 1894.062, 1.109], [480.816, 1651.213, 1.309]]\nD: [[583.549, 1656.391, 1.267], [587.422, 1654.32, 1.126], [591.257, 1652.222, 1.146], [594.995, 1650.206, 1.166]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_102_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_102_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_102_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_102_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_102_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_102_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_102_6.png", 
"./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_102_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[697.83, 1848.354, 0.41], [586.56, 1505.654, 0.534], [677.42, 1687.731, 0.544], [605.12, 1621.517, 0.808]]\nB: [[519.11, 1562.82, 0.383], [612.23, 1842.267, 0.582], [524.07, 1920.47, 0.561], [598.47, 1708.973, 0.9]]\nC: [[723.89, 1578.062, 0.473], [519.71, 1405.785, 0.584], [581.29, 1953.42, 0.735], [668.56, 1675.091, 0.868]]\nD: [[619.03, 1648.941, 0.413], [618.43, 1649.273, 0.538], [617.83, 1649.605, 0.663], [617.17, 1649.888, 0.813]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[697.83, 1848.354, 0.41], [586.56, 1505.654, 0.534], [677.42, 1687.731, 0.544], [605.12, 1621.517, 0.808]]\nB: [[519.11, 1562.82, 0.383], [612.23, 1842.267, 0.582], [524.07, 1920.47, 0.561], [598.47, 1708.973, 0.9]]\nC: [[723.89, 1578.062, 0.473], [519.71, 1405.785, 0.584], [581.29, 1953.42, 0.735], [668.56, 1675.091, 0.868]]\nD: [[619.03, 1648.941, 0.413], [618.43, 1649.273, 0.538], [617.83, 1649.605, 0.663], [617.17, 1649.888, 0.813]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_103_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_103_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_103_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_103_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_103_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_103_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_103_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_103_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[403.885, 998.16, 0.544], [424.075, 1007.073, 0.82], [461.305, 1015.923, 0.726], [400.111, 1095.775, 0.878]]\nB: [[391.661, 1114.07, 0.663], [391.696, 1114.047, 0.738], [391.688, 1114.052, 0.813], [391.697, 1114.047, 0.818]]\nC: [[386.99, 1297.58, 0.743], [409.207, 1135.586, 0.659], [357.708, 1116.073, 0.868], [392.676, 1300.061, 0.744]]\nD: [[402.241, 1225.3, 0.56], [375.81, 983.912, 0.615], [412.224, 1111.715, 0.708], [393.052, 1232.005, 0.961]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. 
The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[403.885, 998.16, 0.544], [424.075, 1007.073, 0.82], [461.305, 1015.923, 0.726], [400.111, 1095.775, 0.878]]\nB: [[391.661, 1114.07, 0.663], [391.696, 1114.047, 0.738], [391.688, 1114.052, 0.813], [391.697, 1114.047, 0.818]]\nC: [[386.99, 1297.58, 0.743], [409.207, 1135.586, 0.659], [357.708, 1116.073, 0.868], [392.676, 1300.061, 0.744]]\nD: [[402.241, 1225.3, 0.56], [375.81, 983.912, 0.615], [412.224, 1111.715, 0.708], [393.052, 1232.005, 0.961]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_104_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_104_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_104_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_104_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_104_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_104_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_104_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_104_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1277.031, 1033.186, 0.322], [1277.662, 1033.929, 0.322], [1279.057, 1035.823, 0.322], [1280.749, 1038.066, 0.372]]\nB: [[1421.88, 1233.909, 0.31], [1157.401, 1129.096, 0.349], [1356.001, 893.496, 0.351], [1288.688, 983.139, 0.356]]\nC: [[1125.382, 1211.613, 0.317], [1176.913, 1001.679, 0.291], [1346.252, 1080.898, 0.373], [1066.545, 1136.811, 0.352]]\nD: [[1059.6, 1024.583, 0.258], 
[1367.51, 878.274, 0.29], [1278.315, 1180.834, 0.347], [1136.279, 1162.583, 0.374]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1277.031, 1033.186, 0.322], [1277.662, 1033.929, 0.322], [1279.057, 1035.823, 0.322], [1280.749, 1038.066, 0.372]]\nB: [[1421.88, 1233.909, 0.31], [1157.401, 1129.096, 0.349], [1356.001, 893.496, 0.351], [1288.688, 983.139, 0.356]]\nC: [[1125.382, 1211.613, 0.317], [1176.913, 1001.679, 0.291], [1346.252, 1080.898, 0.373], [1066.545, 1136.811, 0.352]]\nD: [[1059.6, 1024.583, 0.258], [1367.51, 878.274, 0.29], [1278.315, 1180.834, 0.347], [1136.279, 1162.583, 0.374]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_105_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_105_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_105_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_105_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_105_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_105_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_105_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_105_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[415.331, 1104.242, 0.613], 
[415.326, 1104.24, 0.64], [415.316, 1104.236, 0.695], [415.31, 1104.234, 0.723]]\nB: [[345.131, 1192.788, 0.505], [371.883, 1269.14, 0.69], [447.888, 1224.599, 0.614], [392.44, 1170.864, 0.783]]\nC: [[454.914, 1278.297, 0.71], [434.859, 1136.62, 0.63], [416.021, 1211.093, 0.643], [337.73, 960.185, 0.669]]\nD: [[480.222, 1069.404, 0.56], [365.517, 970.37, 0.54], [386.514, 975.732, 0.743], [393.7, 923.805, 0.642]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[415.331, 1104.242, 0.613], [415.326, 1104.24, 0.64], [415.316, 1104.236, 0.695], [415.31, 1104.234, 0.723]]\nB: [[345.131, 1192.788, 0.505], [371.883, 1269.14, 0.69], [447.888, 1224.599, 0.614], [392.44, 1170.864, 0.783]]\nC: [[454.914, 1278.297, 0.71], [434.859, 1136.62, 0.63], [416.021, 1211.093, 0.643], [337.73, 960.185, 0.669]]\nD: [[480.222, 1069.404, 0.56], [365.517, 970.37, 0.54], [386.514, 975.732, 0.743], [393.7, 923.805, 0.642]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_106_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_106_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_106_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_106_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_106_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_106_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_106_6.png", 
"./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_106_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[407.576, 1163.308, 0.729], [407.573, 1163.324, 0.746], [407.57, 1163.34, 0.762], [407.569, 1163.357, 0.779]]\nB: [[387.473, 1137.771, 0.644], [384.287, 1365.683, 0.681], [390.89, 1137.17, 0.617], [457.784, 1284.967, 0.839]]\nC: [[360.164, 1015.983, 0.678], [381.237, 1053.29, 0.859], [457.22, 1360.88, 0.63], [408.603, 1334.048, 0.816]]\nD: [[392.585, 1372.768, 0.686], [426.374, 1363.72, 0.752], [443.3, 955.82, 0.704], [326.364, 1211.631, 0.769]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[407.576, 1163.308, 0.729], [407.573, 1163.324, 0.746], [407.57, 1163.34, 0.762], [407.569, 1163.357, 0.779]]\nB: [[387.473, 1137.771, 0.644], [384.287, 1365.683, 0.681], [390.89, 1137.17, 0.617], [457.784, 1284.967, 0.839]]\nC: [[360.164, 1015.983, 0.678], [381.237, 1053.29, 0.859], [457.22, 1360.88, 0.63], [408.603, 1334.048, 0.816]]\nD: [[392.585, 1372.768, 0.686], [426.374, 1363.72, 0.752], [443.3, 955.82, 0.704], [326.364, 1211.631, 0.769]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_107_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_107_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_107_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_107_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_107_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_107_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_107_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_107_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1183.549, 1091.134, 0.407], [1449.921, 972.434, 0.409], [1115.763, 1023.251, 0.366], [1402.669, 992.802, 0.432]]\nB: [[1243.405, 864.452, 0.467], [1368.17, 1085.65, 0.361], [1076.736, 1221.575, 0.333], [1435.133, 1172.523, 0.468]]\nC: [[1295.125, 1032.757, 0.415], [1295.611, 1033.251, 0.415], [1296.187, 1033.665, 0.415], [1296.747, 1033.991, 0.415]]\nD: [[1335.953, 913.089, 0.398], [1461.39, 864.58, 0.459], [1483.452, 900.406, 0.383], [1264.928, 1038.725, 0.375]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1183.549, 1091.134, 0.407], [1449.921, 972.434, 0.409], [1115.763, 1023.251, 0.366], [1402.669, 992.802, 0.432]]\nB: [[1243.405, 864.452, 0.467], [1368.17, 1085.65, 0.361], [1076.736, 1221.575, 0.333], [1435.133, 1172.523, 0.468]]\nC: [[1295.125, 1032.757, 0.415], [1295.611, 1033.251, 0.415], [1296.187, 1033.665, 0.415], [1296.747, 1033.991, 0.415]]\nD: [[1335.953, 913.089, 0.398], [1461.39, 864.58, 0.459], [1483.452, 900.406, 0.383], [1264.928, 1038.725, 0.375]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_108_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_108_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_108_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_108_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_108_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_108_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_108_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_108_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[339.449, 659.894, 0.573], [339.446, 659.895, 0.607], [339.443, 659.896, 0.64], [339.44, 659.897, 0.674]]\nB: [[398.935, 645.734, 0.599], [298.581, 729.947, 0.634], [401.409, 592.555, 0.67], [389.37, 745.064, 0.776]]\nC: [[317.88, 666.567, 0.669], [318.154, 636.677, 0.526], 
[319.442, 702.387, 0.7], [331.82, 647.551, 0.682]]\nD: [[348.987, 658.876, 0.645], [370.001, 591.87, 0.551], [346.212, 591.313, 0.75], [291.32, 620.068, 0.565]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[339.449, 659.894, 0.573], [339.446, 659.895, 0.607], [339.443, 659.896, 0.64], [339.44, 659.897, 0.674]]\nB: [[398.935, 645.734, 0.599], [298.581, 729.947, 0.634], [401.409, 592.555, 0.67], [389.37, 745.064, 0.776]]\nC: [[317.88, 666.567, 0.669], [318.154, 636.677, 0.526], [319.442, 702.387, 0.7], [331.82, 647.551, 0.682]]\nD: [[348.987, 658.876, 0.645], [370.001, 591.87, 0.551], [346.212, 591.313, 0.75], [291.32, 620.068, 0.565]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_109_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_109_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_109_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_109_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_109_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_109_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_109_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_109_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", 
"options": "A: [[1348.14, 1074.93, 0.373], [1537.25, 1072.09, 0.5], [1196.975, 870.732, 0.55], [1269.511, 1097.657, 0.609]]\nB: [[1369.98, 1210.47, 0.291], [1383.17, 1209.316, 0.44], [1098.297, 933.023, 0.49], [1055.724, 1184.093, 0.615]]\nC: [[1279.19, 1030.84, 0.349], [1282.49, 1034.214, 0.43], [1285.285, 1037.189, 0.51], [1288.217, 1040.319, 0.591]]\nD: [[1424.53, 1145.06, 0.417], [1294.63, 1198.674, 0.47], [1368.216, 886.452, 0.51], [1389.846, 1124.768, 0.48]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1348.14, 1074.93, 0.373], [1537.25, 1072.09, 0.5], [1196.975, 870.732, 0.55], [1269.511, 1097.657, 0.609]]\nB: [[1369.98, 1210.47, 0.291], [1383.17, 1209.316, 0.44], [1098.297, 933.023, 0.49], [1055.724, 1184.093, 0.615]]\nC: [[1279.19, 1030.84, 0.349], [1282.49, 1034.214, 0.43], [1285.285, 1037.189, 0.51], [1288.217, 1040.319, 0.591]]\nD: [[1424.53, 1145.06, 0.417], [1294.63, 1198.674, 0.47], [1368.216, 886.452, 0.51], [1389.846, 1124.768, 0.48]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_110_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_110_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_110_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_110_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_110_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_110_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_110_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_110_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1180.699, 1025.35, 0.352], [1360.818, 1139.597, 0.397], [1152.166, 1159.568, 0.296], [1106.717, 1234.187, 0.313]]\nB: [[1378.182, 1100.4, 0.333], [1294.85, 1232.299, 0.398], [1173.547, 969.988, 0.388], [1171.591, 1158.384, 0.396]]\nC: [[1086.537, 1116.193, 0.36], [1109.417, 1116.907, 0.31], [1478.169, 1103.822, 0.341], [1122.704, 957.886, 0.337]]\nD: [[1275.412, 1026.886, 0.336], [1278.054, 1029.742, 0.336], [1280.696, 1032.599, 0.336], [1283.018, 1035.321, 0.336]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1180.699, 1025.35, 0.352], [1360.818, 1139.597, 0.397], [1152.166, 1159.568, 0.296], [1106.717, 1234.187, 0.313]]\nB: [[1378.182, 1100.4, 0.333], [1294.85, 1232.299, 0.398], [1173.547, 969.988, 0.388], [1171.591, 1158.384, 0.396]]\nC: [[1086.537, 1116.193, 0.36], [1109.417, 1116.907, 0.31], [1478.169, 1103.822, 0.341], [1122.704, 957.886, 0.337]]\nD: [[1275.412, 1026.886, 0.336], [1278.054, 1029.742, 0.336], [1280.696, 1032.599, 0.336], [1283.018, 1035.321, 0.336]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_111_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_111_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_111_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_111_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_111_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_111_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_111_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_111_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1567.635, 999.11, 0.216], [1581.604, 1034.59, 0.201], [2215.085, 945.14, 0.266], [1822.791, 878.16, 0.212]]\nB: [[1547.391, 1001.36, 0.216], [1923.868, 772.06, 0.229], [1775.081, 857.84, 0.257], [1976.165, 741.51, 0.198]]\nC: [[1924.297, 873.96, 0.189], [1924.297, 873.96, 
0.206], [1924.297, 873.96, 0.223], [1924.297, 873.96, 0.239]]\nD: [[1739.859, 831.14, 0.169], [1930.99, 1015.96, 0.221], [1891.889, 1021.13, 0.241], [2233.369, 854.3, 0.277]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1567.635, 999.11, 0.216], [1581.604, 1034.59, 0.201], [2215.085, 945.14, 0.266], [1822.791, 878.16, 0.212]]\nB: [[1547.391, 1001.36, 0.216], [1923.868, 772.06, 0.229], [1775.081, 857.84, 0.257], [1976.165, 741.51, 0.198]]\nC: [[1924.297, 873.96, 0.189], [1924.297, 873.96, 0.206], [1924.297, 873.96, 0.223], [1924.297, 873.96, 0.239]]\nD: [[1739.859, 831.14, 0.169], [1930.99, 1015.96, 0.221], [1891.889, 1021.13, 0.241], [2233.369, 854.3, 0.277]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_112_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_112_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_112_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_112_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_112_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_112_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_112_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_112_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[389.455, 1221.754, 1.957], [389.017, 1221.738, 1.957], [388.246, 1221.696, 2.082], [387.407, 1221.65, 2.007]]\nB: [[430.695, 1244.614, 1.598], [332.727, 1219.984, 2.062], [451.568, 1172.545, 2.304], [431.932, 1447.12, 2.075]]\nC: [[434.759, 1360.34, 1.798], [320.818, 1065.151, 2.275], [403.374, 995.774, 1.782], [399.338, 1318.27, 2.25]]\nD: [[338.306, 1065.478, 2.175], [359.176, 1170.276, 2.145], [422.221, 1295.741, 2.146], [318.234, 1189.1, 1.616]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[389.455, 1221.754, 1.957], [389.017, 1221.738, 1.957], [388.246, 1221.696, 2.082], [387.407, 1221.65, 2.007]]\nB: [[430.695, 1244.614, 1.598], [332.727, 1219.984, 2.062], [451.568, 1172.545, 2.304], [431.932, 1447.12, 2.075]]\nC: [[434.759, 1360.34, 1.798], [320.818, 1065.151, 2.275], [403.374, 995.774, 1.782], [399.338, 1318.27, 2.25]]\nD: [[338.306, 1065.478, 2.175], [359.176, 1170.276, 2.145], [422.221, 1295.741, 2.146], [318.234, 1189.1, 1.616]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_113_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_113_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_113_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_113_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_113_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_113_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_113_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_113_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[236.0, 747.95, 0.854], [256.234, 709.537, 0.697], [304.474, 800.037, 0.56], [331.877, 561.326, 0.514]]\nB: [[314.28, 598.25, 0.849], [236.191, 740.031, 0.728], [250.825, 688.597, 0.635], [341.366, 613.896, 0.475]]\nC: [[246.64, 693.8, 0.858], [290.257, 567.854, 0.783], [293.745, 750.544, 0.62], [309.807, 562.559, 0.531]]\nD: [[289.28, 669.01, 0.775], [291.627, 672.377, 0.668], [293.977, 675.748, 0.562], [296.214, 678.956, 0.455]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. 
The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[236.0, 747.95, 0.854], [256.234, 709.537, 0.697], [304.474, 800.037, 0.56], [331.877, 561.326, 0.514]]\nB: [[314.28, 598.25, 0.849], [236.191, 740.031, 0.728], [250.825, 688.597, 0.635], [341.366, 613.896, 0.475]]\nC: [[246.64, 693.8, 0.858], [290.257, 567.854, 0.783], [293.745, 750.544, 0.62], [309.807, 562.559, 0.531]]\nD: [[289.28, 669.01, 0.775], [291.627, 672.377, 0.668], [293.977, 675.748, 0.562], [296.214, 678.956, 0.455]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_114_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_114_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_114_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_114_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_114_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_114_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_114_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_114_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[533.8, 1735.728, -0.009], [518.05, 1383.369, 0.045], [743.409, 1439.406, 0.374], [553.203, 1712.789, 0.68]]\nB: [[653.646, 1831.884, -0.01], [745.339, 1445.929, 0.044], [684.645, 1812.914, 0.333], [569.065, 1458.696, 0.754]]\nC: [[572.656, 1841.565, -0.01], [747.719, 1494.494, 0.038], [688.766, 1558.475, 0.402], [740.666, 1414.102, 0.689]]\nD: [[637.791, 1636.674, -0.011], [637.381, 1637.067, 0.039], 
[636.158, 1638.241, 0.389], [635.756, 1638.659, 0.689]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[533.8, 1735.728, -0.009], [518.05, 1383.369, 0.045], [743.409, 1439.406, 0.374], [553.203, 1712.789, 0.68]]\nB: [[653.646, 1831.884, -0.01], [745.339, 1445.929, 0.044], [684.645, 1812.914, 0.333], [569.065, 1458.696, 0.754]]\nC: [[572.656, 1841.565, -0.01], [747.719, 1494.494, 0.038], [688.766, 1558.475, 0.402], [740.666, 1414.102, 0.689]]\nD: [[637.791, 1636.674, -0.011], [637.381, 1637.067, 0.039], [636.158, 1638.241, 0.389], [635.756, 1638.659, 0.689]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_115_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_115_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_115_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_115_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_115_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_115_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_115_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_115_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[357.199, 1105.26, 0.925], [357.199, 1105.26, 0.874], [357.199, 
1105.26, 0.901], [357.199, 1105.26, 1.083]]\nB: [[405.104, 1231.8, 0.941], [321.418, 916.12, 1.011], [382.371, 913.36, 0.794], [428.391, 1299.88, 1.177]]\nC: [[352.491, 1140.82, 0.829], [377.607, 964.69, 0.939], [341.493, 1094.81, 0.997], [329.979, 894.62, 0.879]]\nD: [[368.993, 920.78, 0.953], [328.671, 1054.8, 1.001], [426.057, 1241.84, 0.874], [319.642, 1019.55, 1.122]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[357.199, 1105.26, 0.925], [357.199, 1105.26, 0.874], [357.199, 1105.26, 0.901], [357.199, 1105.26, 1.083]]\nB: [[405.104, 1231.8, 0.941], [321.418, 916.12, 1.011], [382.371, 913.36, 0.794], [428.391, 1299.88, 1.177]]\nC: [[352.491, 1140.82, 0.829], [377.607, 964.69, 0.939], [341.493, 1094.81, 0.997], [329.979, 894.62, 0.879]]\nD: [[368.993, 920.78, 0.953], [328.671, 1054.8, 1.001], [426.057, 1241.84, 0.874], [319.642, 1019.55, 1.122]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_116_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_116_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_116_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_116_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_116_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_116_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_116_6.png", 
"./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_116_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[390.292, 1280.196, 0.538], [390.276, 1322.072, 0.606], [337.427, 1327.052, 0.501], [388.832, 1049.397, 0.585]]\nB: [[441.456, 1017.908, 0.543], [440.359, 1177.164, 0.491], [371.894, 1012.041, 0.514], [347.529, 1236.907, 0.616]]\nC: [[398.584, 1179.211, 0.555], [316.744, 1033.547, 0.547], [377.513, 1090.27, 0.445], [332.149, 1080.471, 0.473]]\nD: [[393.298, 1155.018, 0.485], [393.298, 1155.017, 0.514], [393.298, 1155.016, 0.542], [393.297, 1155.015, 0.571]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[390.292, 1280.196, 0.538], [390.276, 1322.072, 0.606], [337.427, 1327.052, 0.501], [388.832, 1049.397, 0.585]]\nB: [[441.456, 1017.908, 0.543], [440.359, 1177.164, 0.491], [371.894, 1012.041, 0.514], [347.529, 1236.907, 0.616]]\nC: [[398.584, 1179.211, 0.555], [316.744, 1033.547, 0.547], [377.513, 1090.27, 0.445], [332.149, 1080.471, 0.473]]\nD: [[393.298, 1155.018, 0.485], [393.298, 1155.017, 0.514], [393.298, 1155.016, 0.542], [393.297, 1155.015, 0.571]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_117_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_117_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_117_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_117_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_117_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_117_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_117_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_117_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[436.159, 952.778, 0.85], [380.614, 1063.82, 0.774], [459.351, 1286.857, 0.672], [356.13, 1196.862, 0.73]]\nB: [[393.174, 1367.574, 0.683], [466.835, 1298.26, 0.635], [356.883, 1226.503, 0.681], [446.634, 1121.248, 0.813]]\nC: [[399.863, 1143.574, 0.738], [398.996, 1141.132, 0.738], [398.116, 1138.632, 0.738], [397.624, 1136.322, 0.738]]\nD: [[344.514, 1172.922, 0.671], [413.852, 1079.671, 0.613], [361.577, 1132.234, 0.863], [334.055, 1043.733, 0.866]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[436.159, 952.778, 0.85], [380.614, 1063.82, 0.774], [459.351, 1286.857, 0.672], [356.13, 1196.862, 0.73]]\nB: [[393.174, 1367.574, 0.683], [466.835, 1298.26, 0.635], [356.883, 1226.503, 0.681], [446.634, 1121.248, 0.813]]\nC: [[399.863, 1143.574, 0.738], [398.996, 1141.132, 0.738], [398.116, 1138.632, 0.738], [397.624, 1136.322, 0.738]]\nD: [[344.514, 1172.922, 0.671], [413.852, 1079.671, 0.613], [361.577, 1132.234, 0.863], [334.055, 1043.733, 0.866]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_118_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_118_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_118_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_118_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_118_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_118_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_118_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_118_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[379.59, 1225.256, 1.8], [411.498, 1212.145, 1.432], [439.173, 1051.415, 1.512], [378.757, 1170.606, 1.817]]\nB: [[385.93, 1201.138, 1.613], [385.521, 1201.641, 1.663], [384.966, 1202.306, 1.763], [384.443, 1202.903, 1.763]]\nC: [[447.45, 996.224, 1.641], [321.511, 1225.058, 1.654], [320.686, 
1029.24, 1.737], [312.326, 1161.223, 1.53]]\nD: [[395.57, 1047.807, 1.499], [340.373, 1260.222, 1.497], [439.995, 1104.894, 1.86], [369.975, 1070.189, 1.508]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[379.59, 1225.256, 1.8], [411.498, 1212.145, 1.432], [439.173, 1051.415, 1.512], [378.757, 1170.606, 1.817]]\nB: [[385.93, 1201.138, 1.613], [385.521, 1201.641, 1.663], [384.966, 1202.306, 1.763], [384.443, 1202.903, 1.763]]\nC: [[447.45, 996.224, 1.641], [321.511, 1225.058, 1.654], [320.686, 1029.24, 1.737], [312.326, 1161.223, 1.53]]\nD: [[395.57, 1047.807, 1.499], [340.373, 1260.222, 1.497], [439.995, 1104.894, 1.86], [369.975, 1070.189, 1.508]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_119_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_119_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_119_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_119_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_119_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_119_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_119_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_119_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[2078.323, 958.565, 0.161], [1559.705, 959.242, 0.151], [2197.62, 763.907, 0.152], [1804.38, 767.638, 0.197]]\nB: [[2138.05, 913.396, 0.132], [2178.71, 1013.596, 0.176], [1703.36, 733.089, 0.191], [1798.969, 734.79, 0.2]]\nC: [[1835.477, 1025.448, 0.137], [1705.515, 1015.928, 0.153], [1802.73, 873.747, 0.182], [1809.594, 823.179, 0.19]]\nD: [[1926.648, 875.886, 0.141], [1926.639, 875.864, 0.154], [1926.63, 875.841, 0.166], [1926.627, 875.833, 0.179]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[2078.323, 958.565, 0.161], [1559.705, 959.242, 0.151], [2197.62, 763.907, 0.152], [1804.38, 767.638, 0.197]]\nB: [[2138.05, 913.396, 0.132], [2178.71, 1013.596, 0.176], [1703.36, 733.089, 0.191], [1798.969, 734.79, 0.2]]\nC: [[1835.477, 1025.448, 0.137], [1705.515, 1015.928, 0.153], [1802.73, 873.747, 0.182], [1809.594, 823.179, 0.19]]\nD: [[1926.648, 875.886, 0.141], [1926.639, 875.864, 0.154], [1926.63, 875.841, 0.166], [1926.627, 875.833, 0.179]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_120_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_120_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_120_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_120_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_120_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_120_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_120_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_120_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1640.64, 2308.693, 0.719], [2017.236, 2113.119, 0.967], [1994.76, 2513.033, 1.14], [1810.618, 2225.686, 1.546]]\nB: [[1973.19, 2747.385, 0.918], [1843.455, 2503.49, 1.052], [1630.78, 2460.524, 1.38], [1987.593, 2630.677, 1.375]]\nC: [[1576.98, 2536.869, 0.66], [2075.147, 2055.992, 1.144], [1827.84, 2639.901, 1.45], [2070.073, 2767.351, 1.167]]\nD: [[1866.27, 2481.021, 0.817], [1865.675, 2481.739, 1.031], [1865.18, 2482.337, 1.21], [1864.684, 2482.936, 1.389]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1640.64, 2308.693, 0.719], [2017.236, 2113.119, 0.967], [1994.76, 2513.033, 1.14], [1810.618, 2225.686, 1.546]]\nB: [[1973.19, 2747.385, 0.918], [1843.455, 2503.49, 1.052], [1630.78, 2460.524, 1.38], [1987.593, 2630.677, 1.375]]\nC: [[1576.98, 2536.869, 0.66], [2075.147, 2055.992, 1.144], [1827.84, 2639.901, 1.45], [2070.073, 2767.351, 1.167]]\nD: [[1866.27, 2481.021, 0.817], [1865.675, 2481.739, 1.031], [1865.18, 2482.337, 1.21], [1864.684, 2482.936, 1.389]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_121_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_121_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_121_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_121_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_121_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_121_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_121_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_121_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1323.534, 1070.968, 0.039], [1242.872, 853.737, 0.042], [1492.766, 1030.756, 0.057], [1309.345, 1096.471, 0.046]]\nB: [[1556.573, 856.346, 0.057], [1108.062, 1183.213, 0.047], [1303.053, 903.29, 0.05], [1529.898, 1191.182, 0.045]]\nC: [[1351.279, 1022.468, 0.048], [1351.279, 
1022.468, 0.048], [1351.279, 1022.468, 0.048], [1351.279, 1022.468, 0.048]]\nD: [[1160.276, 1004.554, 0.048], [1454.931, 1040.919, 0.039], [1167.504, 987.892, 0.046], [1457.735, 818.113, 0.042]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1323.534, 1070.968, 0.039], [1242.872, 853.737, 0.042], [1492.766, 1030.756, 0.057], [1309.345, 1096.471, 0.046]]\nB: [[1556.573, 856.346, 0.057], [1108.062, 1183.213, 0.047], [1303.053, 903.29, 0.05], [1529.898, 1191.182, 0.045]]\nC: [[1351.279, 1022.468, 0.048], [1351.279, 1022.468, 0.048], [1351.279, 1022.468, 0.048], [1351.279, 1022.468, 0.048]]\nD: [[1160.276, 1004.554, 0.048], [1454.931, 1040.919, 0.039], [1167.504, 987.892, 0.046], [1457.735, 818.113, 0.042]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_122_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_122_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_122_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_122_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_122_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_122_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_122_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_122_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image 
and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[398.992, 1279.087, 0.118], [456.215, 1152.193, 0.133], [407.776, 1279.429, 0.095], [527.113, 1246.616, 0.136]]\nB: [[448.696, 1090.248, 0.117], [448.695, 1090.246, 0.117], [448.686, 1090.224, 0.115], [448.685, 1090.222, 0.114]]\nC: [[435.77, 875.144, 0.115], [440.962, 1303.479, 0.129], [413.225, 1290.42, 0.105], [511.071, 1036.309, 0.122]]\nD: [[438.596, 955.475, 0.137], [464.97, 1295.34, 0.118], [386.42, 1095.841, 0.125], [437.592, 1200.522, 0.127]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[398.992, 1279.087, 0.118], [456.215, 1152.193, 0.133], [407.776, 1279.429, 0.095], [527.113, 1246.616, 0.136]]\nB: [[448.696, 1090.248, 0.117], [448.695, 1090.246, 0.117], [448.686, 1090.224, 0.115], [448.685, 1090.222, 0.114]]\nC: [[435.77, 875.144, 0.115], [440.962, 1303.479, 0.129], [413.225, 1290.42, 0.105], [511.071, 1036.309, 0.122]]\nD: [[438.596, 955.475, 0.137], [464.97, 1295.34, 0.118], [386.42, 1095.841, 0.125], [437.592, 1200.522, 0.127]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_123_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_123_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_123_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_123_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_123_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_123_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_123_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_123_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1835.491, 834.002, 0.48], [2125.794, 969.393, 0.677], [1614.306, 1027.608, 0.583], [1983.247, 970.603, 0.541]]\nB: [[1651.575, 740.269, 0.56], [1919.293, 887.545, 0.629], [1867.876, 908.887, 0.565], [1937.748, 943.609, 0.511]]\nC: [[1784.634, 874.597, 0.596], [1784.597, 874.576, 0.596], [1784.564, 874.558, 0.596], [1784.764, 874.582, 0.596]]\nD: [[1674.888, 950.802, 0.589], [2065.024, 902.619, 0.528], [2130.173, 1019.966, 0.552], [2067.829, 931.775, 0.63]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1835.491, 834.002, 0.48], [2125.794, 969.393, 0.677], [1614.306, 1027.608, 0.583], [1983.247, 970.603, 0.541]]\nB: [[1651.575, 740.269, 0.56], [1919.293, 887.545, 0.629], [1867.876, 908.887, 0.565], [1937.748, 943.609, 0.511]]\nC: [[1784.634, 874.597, 0.596], [1784.597, 874.576, 0.596], [1784.564, 874.558, 0.596], [1784.764, 874.582, 0.596]]\nD: [[1674.888, 950.802, 0.589], [2065.024, 902.619, 0.528], [2130.173, 1019.966, 0.552], [2067.829, 931.775, 0.63]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_124_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_124_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_124_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_124_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_124_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_124_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_124_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_124_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[276.129, 765.803, 0.706], [239.259, 629.205, 0.8], [265.678, 619.0, 0.624], [272.857, 563.448, 0.52]]\nB: [[267.858, 571.717, 0.774], [277.532, 772.037, 0.819], [265.677, 626.589, 0.69], [303.026, 678.599, 0.635]]\nC: [[307.434, 646.641, 0.793], [279.193, 720.372, 0.75], [342.062, 
733.991, 0.756], [275.316, 788.349, 0.594]]\nD: [[287.863, 668.522, 0.723], [289.106, 670.134, 0.687], [290.511, 671.955, 0.702], [292.583, 674.718, 0.593]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[276.129, 765.803, 0.706], [239.259, 629.205, 0.8], [265.678, 619.0, 0.624], [272.857, 563.448, 0.52]]\nB: [[267.858, 571.717, 0.774], [277.532, 772.037, 0.819], [265.677, 626.589, 0.69], [303.026, 678.599, 0.635]]\nC: [[307.434, 646.641, 0.793], [279.193, 720.372, 0.75], [342.062, 733.991, 0.756], [275.316, 788.349, 0.594]]\nD: [[287.863, 668.522, 0.723], [289.106, 670.134, 0.687], [290.511, 671.955, 0.702], [292.583, 674.718, 0.593]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_125_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_125_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_125_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_125_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_125_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_125_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_125_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_125_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", 
"options": "A: [[622.35, 1624.018, -0.016], [621.599, 1624.59, 0.068], [620.967, 1625.056, 0.201], [620.312, 1625.598, 0.284]]\nB: [[715.37, 1646.519, -0.015], [571.805, 1818.41, 0.061], [556.199, 1828.057, 0.206], [719.663, 1568.216, 0.242]]\nC: [[619.74, 1639.913, -0.014], [696.763, 1306.55, 0.056], [512.97, 1560.484, 0.186], [567.584, 1424.02, 0.237]]\nD: [[676.88, 1409.501, -0.016], [537.018, 1735.64, 0.057], [546.621, 1339.978, 0.22], [568.0, 1888.129, 0.245]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[622.35, 1624.018, -0.016], [621.599, 1624.59, 0.068], [620.967, 1625.056, 0.201], [620.312, 1625.598, 0.284]]\nB: [[715.37, 1646.519, -0.015], [571.805, 1818.41, 0.061], [556.199, 1828.057, 0.206], [719.663, 1568.216, 0.242]]\nC: [[619.74, 1639.913, -0.014], [696.763, 1306.55, 0.056], [512.97, 1560.484, 0.186], [567.584, 1424.02, 0.237]]\nD: [[676.88, 1409.501, -0.016], [537.018, 1735.64, 0.057], [546.621, 1339.978, 0.22], [568.0, 1888.129, 0.245]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_126_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_126_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_126_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_126_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_126_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_126_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_126_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_126_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[627.008, 1617.877, -0.387], [626.7, 1618.657, -0.137], [626.332, 1619.431, 0.163], [626.034, 1619.837, 0.363]]\nB: [[514.156, 1782.736, -0.333], [575.6, 1636.318, -0.131], [743.355, 1576.589, 0.179], [505.541, 1559.477, 0.32]]\nC: [[712.578, 1866.613, -0.344], [558.1, 1427.09, -0.154], [677.337, 1665.044, 0.133], [550.249, 1826.976, 0.376]]\nD: [[618.87, 1499.776, -0.427], [647.6, 1861.481, -0.148], [699.281, 1872.065, 0.164], [640.722, 1817.452, 0.342]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[627.008, 1617.877, -0.387], [626.7, 1618.657, -0.137], [626.332, 1619.431, 0.163], [626.034, 1619.837, 0.363]]\nB: [[514.156, 1782.736, -0.333], [575.6, 1636.318, -0.131], [743.355, 1576.589, 0.179], [505.541, 1559.477, 0.32]]\nC: [[712.578, 1866.613, -0.344], [558.1, 1427.09, -0.154], [677.337, 1665.044, 0.133], [550.249, 1826.976, 0.376]]\nD: [[618.87, 1499.776, -0.427], [647.6, 1861.481, -0.148], [699.281, 1872.065, 0.164], [640.722, 1817.452, 0.342]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_127_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_127_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_127_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_127_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_127_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_127_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_127_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_127_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[353.535, 1093.572, 0.8], [325.884, 1175.321, 0.708], [298.264, 1160.539, 1.06], [403.515, 1196.443, 0.896]]\nB: [[298.277, 1279.808, 0.8], [334.872, 990.495, 0.719], [317.907, 1145.582, 0.785], [428.226, 1134.096, 1.096]]\nC: [[361.234, 1127.159, 0.743], [361.244, 1127.193, 0.761], 
[361.254, 1127.227, 0.979], [361.252, 1127.231, 1.019]]\nD: [[351.808, 979.748, 0.733], [299.091, 972.477, 0.91], [422.591, 1328.277, 1.109], [373.924, 1003.202, 0.826]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[353.535, 1093.572, 0.8], [325.884, 1175.321, 0.708], [298.264, 1160.539, 1.06], [403.515, 1196.443, 0.896]]\nB: [[298.277, 1279.808, 0.8], [334.872, 990.495, 0.719], [317.907, 1145.582, 0.785], [428.226, 1134.096, 1.096]]\nC: [[361.234, 1127.159, 0.743], [361.244, 1127.193, 0.761], [361.254, 1127.227, 0.979], [361.252, 1127.231, 1.019]]\nD: [[351.808, 979.748, 0.733], [299.091, 972.477, 0.91], [422.591, 1328.277, 1.109], [373.924, 1003.202, 0.826]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_128_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_128_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_128_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_128_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_128_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_128_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_128_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_128_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[467.084, 1013.697, 1.484], [410.637, 1087.555, 1.47], [503.896, 994.189, 1.833], [375.598, 1172.16, 1.996]]\nB: [[403.645, 1031.052, 1.679], [463.451, 943.028, 1.451], [499.44, 1242.514, 2.094], [468.957, 1220.61, 2.107]]\nC: [[479.961, 1178.515, 1.846], [421.912, 1195.377, 1.945], [395.807, 904.258, 1.58], [479.041, 963.66, 1.573]]\nD: [[443.949, 1116.592, 1.729], [443.607, 1116.621, 1.729], [442.518, 1116.448, 1.879], [442.143, 1116.34, 1.929]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[467.084, 1013.697, 1.484], [410.637, 1087.555, 1.47], [503.896, 994.189, 1.833], [375.598, 1172.16, 1.996]]\nB: [[403.645, 1031.052, 1.679], [463.451, 943.028, 1.451], [499.44, 1242.514, 2.094], [468.957, 1220.61, 2.107]]\nC: [[479.961, 1178.515, 1.846], [421.912, 1195.377, 1.945], [395.807, 904.258, 1.58], [479.041, 963.66, 1.573]]\nD: [[443.949, 1116.592, 1.729], [443.607, 1116.621, 1.729], [442.518, 1116.448, 1.879], [442.143, 1116.34, 1.929]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_129_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_129_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_129_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_129_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_129_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_129_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_129_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_129_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1044.891, 1237.212, 0.684], [1071.2, 1248.461, 0.639], [1210.008, 933.973, 0.707], [1328.735, 877.082, 0.748]]\nB: [[1117.371, 1205.206, 0.822], [1089.2, 940.984, 0.629], [1072.282, 905.107, 0.824], [1173.176, 946.517, 0.885]]\nC: [[1227.559, 936.208, 0.663], [1471.6, 1143.386, 0.863], [1177.563, 842.525, 0.712], [1310.648, 1103.801, 0.83]]\nD: [[1267.451, 1047.078, 0.822], [1266.5, 1047.564, 0.754], [1265.609, 1047.998, 0.762], [1257.032, 1054.386, 0.759]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1044.891, 1237.212, 0.684], [1071.2, 1248.461, 0.639], [1210.008, 933.973, 0.707], [1328.735, 877.082, 0.748]]\nB: [[1117.371, 1205.206, 0.822], [1089.2, 940.984, 0.629], [1072.282, 905.107, 0.824], [1173.176, 946.517, 0.885]]\nC: [[1227.559, 936.208, 0.663], [1471.6, 1143.386, 0.863], [1177.563, 842.525, 0.712], [1310.648, 1103.801, 0.83]]\nD: [[1267.451, 1047.078, 0.822], [1266.5, 1047.564, 0.754], [1265.609, 1047.998, 0.762], [1257.032, 1054.386, 0.759]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_130_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_130_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_130_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_130_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_130_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_130_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_130_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_130_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1275.825, 1026.459, 0.275], [1278.063, 1029.09, 0.375], [1280.981, 1032.367, 0.325], [1283.902, 1035.648, 0.374]]\nB: [[1030.171, 1170.182, 0.27], [1123.271, 830.3, 0.339], [1164.96, 938.971, 0.375], [1317.327, 864.217, 0.318]]\nC: [[1058.02, 1197.606, 0.255], [1412.723, 1041.94, 
0.385], [1413.334, 1081.562, 0.344], [1284.333, 1092.197, 0.438]]\nD: [[1469.903, 1189.502, 0.313], [1314.332, 1032.81, 0.399], [1118.592, 1102.621, 0.281], [1269.138, 1091.852, 0.359]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1275.825, 1026.459, 0.275], [1278.063, 1029.09, 0.375], [1280.981, 1032.367, 0.325], [1283.902, 1035.648, 0.374]]\nB: [[1030.171, 1170.182, 0.27], [1123.271, 830.3, 0.339], [1164.96, 938.971, 0.375], [1317.327, 864.217, 0.318]]\nC: [[1058.02, 1197.606, 0.255], [1412.723, 1041.94, 0.385], [1413.334, 1081.562, 0.344], [1284.333, 1092.197, 0.438]]\nD: [[1469.903, 1189.502, 0.313], [1314.332, 1032.81, 0.399], [1118.592, 1102.621, 0.281], [1269.138, 1091.852, 0.359]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_131_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_131_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_131_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_131_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_131_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_131_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_131_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_131_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural 
image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[421.972, 1212.966, 0.431], [433.3, 1273.552, 0.507], [464.634, 1128.046, 0.702], [349.601, 1228.731, 0.578]]\nB: [[400.984, 1376.129, 0.581], [391.113, 1165.646, 0.7], [457.469, 1280.832, 0.616], [442.522, 1062.927, 0.701]]\nC: [[449.392, 986.304, 0.601], [473.649, 1081.286, 0.52], [358.026, 1320.626, 0.568], [395.395, 1377.932, 0.573]]\nD: [[399.773, 1169.799, 0.536], [399.773, 1169.799, 0.586], [399.773, 1169.799, 0.636], [399.773, 1169.799, 0.681]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[421.972, 1212.966, 0.431], [433.3, 1273.552, 0.507], [464.634, 1128.046, 0.702], [349.601, 1228.731, 0.578]]\nB: [[400.984, 1376.129, 0.581], [391.113, 1165.646, 0.7], [457.469, 1280.832, 0.616], [442.522, 1062.927, 0.701]]\nC: [[449.392, 986.304, 0.601], [473.649, 1081.286, 0.52], [358.026, 1320.626, 0.568], [395.395, 1377.932, 0.573]]\nD: [[399.773, 1169.799, 0.536], [399.773, 1169.799, 0.586], [399.773, 1169.799, 0.636], [399.773, 1169.799, 0.681]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_132_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_132_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_132_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_132_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_132_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_132_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_132_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_132_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[417.658, 1171.585, 1.02], [344.402, 1146.41, 1.185], [367.351, 1178.349, 1.028], [487.83, 1043.697, 0.98]]\nB: [[445.605, 1419.113, 0.98], [352.2, 1288.12, 1.051], [459.196, 983.633, 1.107], [368.22, 1292.263, 1.32]]\nC: [[450.048, 1344.51, 1.16], [403.323, 1079.3, 1.248], [335.599, 1292.674, 1.335], [385.64, 1056.834, 1.11]]\nD: [[419.296, 1191.476, 1.11], [418.846, 1191.58, 1.143], [418.293, 1191.727, 1.176], [417.52, 1191.939, 1.21]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[417.658, 1171.585, 1.02], [344.402, 1146.41, 1.185], [367.351, 1178.349, 1.028], [487.83, 1043.697, 0.98]]\nB: [[445.605, 1419.113, 0.98], [352.2, 1288.12, 1.051], [459.196, 983.633, 1.107], [368.22, 1292.263, 1.32]]\nC: [[450.048, 1344.51, 1.16], [403.323, 1079.3, 1.248], [335.599, 1292.674, 1.335], [385.64, 1056.834, 1.11]]\nD: [[419.296, 1191.476, 1.11], [418.846, 1191.58, 1.143], [418.293, 1191.727, 1.176], [417.52, 1191.939, 1.21]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_133_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_133_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_133_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_133_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_133_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_133_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_133_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_133_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[364.455, 946.84, 0.64], [334.117, 1088.62, 0.558], [343.529, 1215.1, 0.571], [332.561, 991.64, 0.55]]\nB: [[398.222, 1166.03, 0.56], [398.222, 1166.03, 0.577], [398.222, 1166.03, 0.594], [398.222, 1166.03, 0.61]]\nC: [[452.892, 1109.11, 0.63], [362.882, 1081.56, 0.574], [328.005, 1052.37, 0.65], [326.765, 
997.91, 0.68]]\nD: [[389.913, 1383.18, 0.51], [334.65, 1310.36, 0.682], [445.091, 1036.45, 0.591], [404.94, 1152.47, 0.57]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[364.455, 946.84, 0.64], [334.117, 1088.62, 0.558], [343.529, 1215.1, 0.571], [332.561, 991.64, 0.55]]\nB: [[398.222, 1166.03, 0.56], [398.222, 1166.03, 0.577], [398.222, 1166.03, 0.594], [398.222, 1166.03, 0.61]]\nC: [[452.892, 1109.11, 0.63], [362.882, 1081.56, 0.574], [328.005, 1052.37, 0.65], [326.765, 997.91, 0.68]]\nD: [[389.913, 1383.18, 0.51], [334.65, 1310.36, 0.682], [445.091, 1036.45, 0.591], [404.94, 1152.47, 0.57]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_134_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_134_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_134_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_134_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_134_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_134_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_134_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_134_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[627.271, 1619.557, -0.161], 
[627.006, 1619.87, -0.051], [626.747, 1620.187, 0.11], [626.52, 1620.528, 0.17]]\nB: [[569.336, 1657.23, -0.136], [526.963, 1384.47, -0.061], [669.247, 1891.64, 0.11], [671.16, 1857.428, 0.15]]\nC: [[684.005, 1527.275, -0.146], [739.824, 1494.52, -0.06], [521.003, 1884.978, 0.09], [553.11, 1840.593, 0.19]]\nD: [[532.728, 1841.748, -0.144], [536.854, 1368.26, -0.059], [622.506, 1400.948, 0.12], [562.38, 1942.023, 0.18]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[627.271, 1619.557, -0.161], [627.006, 1619.87, -0.051], [626.747, 1620.187, 0.11], [626.52, 1620.528, 0.17]]\nB: [[569.336, 1657.23, -0.136], [526.963, 1384.47, -0.061], [669.247, 1891.64, 0.11], [671.16, 1857.428, 0.15]]\nC: [[684.005, 1527.275, -0.146], [739.824, 1494.52, -0.06], [521.003, 1884.978, 0.09], [553.11, 1840.593, 0.19]]\nD: [[532.728, 1841.748, -0.144], [536.854, 1368.26, -0.059], [622.506, 1400.948, 0.12], [562.38, 1942.023, 0.18]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_135_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_135_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_135_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_135_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_135_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_135_5.png", 
"./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_135_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_135_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1276.426, 1070.932, 0.876], [1276.425, 1070.932, 0.877], [1276.424, 1070.932, 0.878], [1276.423, 1070.932, 0.879]]\nB: [[1298.613, 1048.861, 0.963], [1211.744, 1284.0, 0.977], [1133.349, 1252.098, 0.958], [1442.465, 1081.694, 0.942]]\nC: [[1136.57, 1184.933, 0.959], [1263.407, 1137.283, 0.74], [1237.716, 1079.234, 0.996], [1254.286, 1092.816, 1.0]]\nD: [[1156.908, 984.436, 0.862], [1293.574, 1008.462, 0.755], [1072.394, 1109.853, 0.763], [1158.181, 1086.592, 0.975]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1276.426, 1070.932, 0.876], [1276.425, 1070.932, 0.877], [1276.424, 1070.932, 0.878], [1276.423, 1070.932, 0.879]]\nB: [[1298.613, 1048.861, 0.963], [1211.744, 1284.0, 0.977], [1133.349, 1252.098, 0.958], [1442.465, 1081.694, 0.942]]\nC: [[1136.57, 1184.933, 0.959], [1263.407, 1137.283, 0.74], [1237.716, 1079.234, 0.996], [1254.286, 1092.816, 1.0]]\nD: [[1156.908, 984.436, 0.862], [1293.574, 1008.462, 0.755], [1072.394, 1109.853, 0.763], [1158.181, 1086.592, 0.975]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_136_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_136_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_136_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_136_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_136_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_136_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_136_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_136_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1863.527, 866.104, 1.229], [1863.553, 866.65, 1.085], [1863.585, 867.332, 1.016], [1863.611, 868.023, 1.0]]\nB: [[1741.116, 973.52, 1.473], [1586.126, 927.91, 1.219], [1837.83, 816.557, 1.029], [1765.354, 1012.863, 0.8]]\nC: [[2109.608, 749.973, 1.352], [2151.463, 723.35, 1.155], [2081.946, 774.988, 1.039], [1584.067, 819.061, 1.0]]\nD: [[1619.868, 705.702, 1.426], [2015.736, 882.4, 1.072], [1611.742, 1030.157, 1.001], [1809.71, 882.281, 0.9]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1863.527, 866.104, 1.229], [1863.553, 866.65, 1.085], [1863.585, 867.332, 1.016], [1863.611, 868.023, 1.0]]\nB: [[1741.116, 973.52, 1.473], [1586.126, 927.91, 1.219], [1837.83, 816.557, 1.029], [1765.354, 1012.863, 0.8]]\nC: [[2109.608, 749.973, 1.352], [2151.463, 723.35, 1.155], [2081.946, 774.988, 1.039], [1584.067, 819.061, 1.0]]\nD: [[1619.868, 705.702, 1.426], [2015.736, 882.4, 1.072], [1611.742, 1030.157, 1.001], [1809.71, 882.281, 0.9]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_137_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_137_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_137_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_137_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_137_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_137_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_137_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_137_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[370.828, 1093.568, 0.573], [438.229, 1239.617, 0.497], [355.298, 1015.971, 0.498], [469.743, 1216.196, 0.622]]\nB: [[334.534, 1298.472, 0.487], [330.759, 1369.516, 0.441], [394.543, 1079.174, 0.619], [471.577, 1146.247, 0.639]]\nC: [[394.842, 1158.711, 0.487], [394.842, 1158.711, 0.521], [394.842, 
1158.711, 0.554], [394.842, 1158.711, 0.587]]\nD: [[370.596, 976.597, 0.509], [364.598, 996.341, 0.435], [427.969, 1274.101, 0.549], [391.146, 1206.744, 0.606]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[370.828, 1093.568, 0.573], [438.229, 1239.617, 0.497], [355.298, 1015.971, 0.498], [469.743, 1216.196, 0.622]]\nB: [[334.534, 1298.472, 0.487], [330.759, 1369.516, 0.441], [394.543, 1079.174, 0.619], [471.577, 1146.247, 0.639]]\nC: [[394.842, 1158.711, 0.487], [394.842, 1158.711, 0.521], [394.842, 1158.711, 0.554], [394.842, 1158.711, 0.587]]\nD: [[370.596, 976.597, 0.509], [364.598, 996.341, 0.435], [427.969, 1274.101, 0.549], [391.146, 1206.744, 0.606]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_138_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_138_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_138_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_138_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_138_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_138_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_138_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_138_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[350.427, 1144.305, 0.623], [349.868, 1144.535, 0.69], [349.308, 1144.766, 0.756], [348.749, 1144.996, 0.823]]\nB: [[364.104, 961.597, 0.533], [321.289, 1034.564, 0.81], [289.738, 1178.466, 0.654], [369.278, 927.402, 0.746]]\nC: [[301.407, 1085.027, 0.561], [353.922, 1230.167, 0.74], [385.078, 1056.365, 0.831], [353.967, 1321.653, 0.933]]\nD: [[332.499, 1323.247, 0.603], [328.44, 1217.95, 0.71], [304.408, 1248.393, 0.704], [312.725, 1041.977, 0.788]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[350.427, 1144.305, 0.623], [349.868, 1144.535, 0.69], [349.308, 1144.766, 0.756], [348.749, 1144.996, 0.823]]\nB: [[364.104, 961.597, 0.533], [321.289, 1034.564, 0.81], [289.738, 1178.466, 0.654], [369.278, 927.402, 0.746]]\nC: [[301.407, 1085.027, 0.561], [353.922, 1230.167, 0.74], [385.078, 1056.365, 0.831], [353.967, 1321.653, 0.933]]\nD: [[332.499, 1323.247, 0.603], [328.44, 1217.95, 0.71], [304.408, 1248.393, 0.704], [312.725, 1041.977, 0.788]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_139_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_139_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_139_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_139_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_139_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_139_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_139_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_139_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1098.816, 1040.956, 2.25], [1517.027, 1201.267, 2.159], [1287.68, 1114.311, 1.84], [1553.384, 936.614, 1.891]]\nB: [[1537.939, 1003.593, 1.619], [1107.39, 826.486, 1.866], [1160.283, 877.892, 2.283], [1427.746, 1058.395, 2.165]]\nC: [[1325.647, 1026.158, 1.972], [1325.647, 1026.158, 1.972], [1325.647, 1026.158, 1.972], [1325.647, 1026.158, 1.972]]\nD: [[1454.649, 935.993, 2.155], [1581.869, 969.794, 1.581], [1203.456, 996.196, 2.063], [1385.658, 925.079, 2.322]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1098.816, 1040.956, 2.25], [1517.027, 1201.267, 2.159], [1287.68, 1114.311, 1.84], [1553.384, 936.614, 1.891]]\nB: [[1537.939, 1003.593, 1.619], [1107.39, 826.486, 1.866], [1160.283, 877.892, 2.283], [1427.746, 1058.395, 2.165]]\nC: [[1325.647, 1026.158, 1.972], [1325.647, 1026.158, 1.972], [1325.647, 1026.158, 1.972], [1325.647, 1026.158, 1.972]]\nD: [[1454.649, 935.993, 2.155], [1581.869, 969.794, 1.581], [1203.456, 996.196, 2.063], [1385.658, 925.079, 2.322]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_140_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_140_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_140_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_140_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_140_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_140_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_140_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_140_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[436.144, 1095.893, 0.64], [378.481, 1266.824, 0.483], [426.128, 1098.819, 0.65], [324.34, 949.634, 0.625]]\nB: [[397.389, 1164.192, 0.54], [397.389, 1164.192, 0.565], [397.389, 1164.192, 0.59], [397.389, 1164.192, 0.615]]\nC: [[380.365, 1356.637, 0.44], [339.063, 1111.83, 0.512], 
[337.584, 979.936, 0.64], [437.254, 1203.389, 0.683]]\nD: [[323.527, 1041.167, 0.63], [470.94, 1158.877, 0.637], [366.836, 1001.327, 0.61], [420.137, 1320.47, 0.577]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[436.144, 1095.893, 0.64], [378.481, 1266.824, 0.483], [426.128, 1098.819, 0.65], [324.34, 949.634, 0.625]]\nB: [[397.389, 1164.192, 0.54], [397.389, 1164.192, 0.565], [397.389, 1164.192, 0.59], [397.389, 1164.192, 0.615]]\nC: [[380.365, 1356.637, 0.44], [339.063, 1111.83, 0.512], [337.584, 979.936, 0.64], [437.254, 1203.389, 0.683]]\nD: [[323.527, 1041.167, 0.63], [470.94, 1158.877, 0.637], [366.836, 1001.327, 0.61], [420.137, 1320.47, 0.577]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_141_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_141_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_141_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_141_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_141_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_141_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_141_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_141_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[345.303, 927.986, 0.6], [322.702, 926.62, 0.836], [322.388, 1098.752, 0.715], [411.956, 1203.533, 1.062]]\nB: [[360.095, 1122.376, 0.7], [360.061, 1122.39, 0.773], [360.027, 1122.404, 0.846], [359.993, 1122.417, 0.918]]\nC: [[325.719, 970.767, 0.8], [352.264, 988.9, 0.891], [370.173, 1212.852, 0.847], [306.427, 1052.878, 0.831]]\nD: [[408.76, 1154.201, 0.8], [384.285, 1027.05, 0.647], [381.462, 1131.647, 0.827], [348.63, 1106.215, 0.947]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[345.303, 927.986, 0.6], [322.702, 926.62, 0.836], [322.388, 1098.752, 0.715], [411.956, 1203.533, 1.062]]\nB: [[360.095, 1122.376, 0.7], [360.061, 1122.39, 0.773], [360.027, 1122.404, 0.846], [359.993, 1122.417, 0.918]]\nC: [[325.719, 970.767, 0.8], [352.264, 988.9, 0.891], [370.173, 1212.852, 0.847], [306.427, 1052.878, 0.831]]\nD: [[408.76, 1154.201, 0.8], [384.285, 1027.05, 0.647], [381.462, 1131.647, 0.827], [348.63, 1106.215, 0.947]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_142_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_142_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_142_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_142_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_142_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_142_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_142_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_142_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[342.004, 1040.88, 0.744], [329.607, 1045.03, 0.684], [379.212, 1250.647, 0.868], [326.815, 1264.503, 0.847]]\nB: [[356.724, 1113.785, 0.625], [356.749, 1113.855, 0.775], [356.778, 1113.889, 0.975], [356.785, 1113.897, 1.025]]\nC: [[412.74, 910.82, 0.676], [365.411, 1210.523, 0.664], [303.937, 1114.862, 0.873], [419.175, 1333.448, 0.84]]\nD: [[295.038, 1240.739, 0.504], [341.219, 1044.42, 0.812], [352.463, 1064.815, 1.125], [371.869, 1069.702, 0.977]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[342.004, 1040.88, 0.744], [329.607, 1045.03, 0.684], [379.212, 1250.647, 0.868], [326.815, 1264.503, 0.847]]\nB: [[356.724, 1113.785, 0.625], [356.749, 1113.855, 0.775], [356.778, 1113.889, 0.975], [356.785, 1113.897, 1.025]]\nC: [[412.74, 910.82, 0.676], [365.411, 1210.523, 0.664], [303.937, 1114.862, 0.873], [419.175, 1333.448, 0.84]]\nD: [[295.038, 1240.739, 0.504], [341.219, 1044.42, 0.812], [352.463, 1064.815, 1.125], [371.869, 1069.702, 0.977]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_143_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_143_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_143_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_143_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_143_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_143_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_143_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_143_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1394.419, 833.646, 0.583], [1328.905, 1140.762, 0.779], [1153.453, 1205.926, 0.657], [1146.98, 1211.025, 0.624]]\nB: [[1313.096, 1036.989, 0.652], [1313.096, 1036.989, 0.652], [1313.096, 1036.989, 0.652], [1313.096, 1036.989, 0.652]]\nC: [[1268.974, 982.967, 0.746], [1564.315, 845.479, 
0.727], [1340.978, 1034.082, 0.715], [1495.256, 1214.335, 0.578]]\nD: [[1459.081, 1060.013, 0.734], [1278.902, 880.579, 0.542], [1345.294, 988.27, 0.706], [1466.974, 993.897, 0.65]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1394.419, 833.646, 0.583], [1328.905, 1140.762, 0.779], [1153.453, 1205.926, 0.657], [1146.98, 1211.025, 0.624]]\nB: [[1313.096, 1036.989, 0.652], [1313.096, 1036.989, 0.652], [1313.096, 1036.989, 0.652], [1313.096, 1036.989, 0.652]]\nC: [[1268.974, 982.967, 0.746], [1564.315, 845.479, 0.727], [1340.978, 1034.082, 0.715], [1495.256, 1214.335, 0.578]]\nD: [[1459.081, 1060.013, 0.734], [1278.902, 880.579, 0.542], [1345.294, 988.27, 0.706], [1466.974, 993.897, 0.65]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_144_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_144_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_144_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_144_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_144_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_144_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_144_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_144_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural 
image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[483.59, 928.44, 0.641], [429.525, 1137.919, 0.585], [431.636, 1075.281, 0.417], [368.098, 942.18, 0.409]]\nB: [[506.828, 1075.62, 0.78], [523.808, 941.549, 0.581], [482.43, 968.916, 0.331], [419.206, 1128.103, 0.389]]\nC: [[440.798, 1086.59, 0.718], [440.809, 1086.616, 0.568], [440.809, 1086.616, 0.368], [440.809, 1086.616, 0.368]]\nD: [[495.844, 942.38, 0.674], [400.153, 884.092, 0.599], [357.631, 872.728, 0.373], [508.512, 889.858, 0.32]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[483.59, 928.44, 0.641], [429.525, 1137.919, 0.585], [431.636, 1075.281, 0.417], [368.098, 942.18, 0.409]]\nB: [[506.828, 1075.62, 0.78], [523.808, 941.549, 0.581], [482.43, 968.916, 0.331], [419.206, 1128.103, 0.389]]\nC: [[440.798, 1086.59, 0.718], [440.809, 1086.616, 0.568], [440.809, 1086.616, 0.368], [440.809, 1086.616, 0.368]]\nD: [[495.844, 942.38, 0.674], [400.153, 884.092, 0.599], [357.631, 872.728, 0.373], [508.512, 889.858, 0.32]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_145_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_145_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_145_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_145_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_145_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_145_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_145_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_145_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[431.795, 1089.293, 0.732], [431.79, 1089.298, 0.611], [431.763, 1089.236, 0.548], [431.76, 1089.235, 0.741]]\nB: [[430.055, 1135.006, 0.834], [362.83, 1266.13, 0.535], [510.725, 963.311, 0.449], [354.76, 1199.812, 0.852]]\nC: [[374.034, 1232.506, 0.835], [446.62, 1198.857, 0.654], [454.385, 1036.14, 0.539], [461.62, 1215.977, 0.65]]\nD: [[426.107, 1134.166, 0.869], [469.51, 941.329, 0.702], [490.047, 990.374, 0.543], [356.18, 1025.654, 0.728]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. 
The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[431.795, 1089.293, 0.732], [431.79, 1089.298, 0.611], [431.763, 1089.236, 0.548], [431.76, 1089.235, 0.741]]\nB: [[430.055, 1135.006, 0.834], [362.83, 1266.13, 0.535], [510.725, 963.311, 0.449], [354.76, 1199.812, 0.852]]\nC: [[374.034, 1232.506, 0.835], [446.62, 1198.857, 0.654], [454.385, 1036.14, 0.539], [461.62, 1215.977, 0.65]]\nD: [[426.107, 1134.166, 0.869], [469.51, 941.329, 0.702], [490.047, 990.374, 0.543], [356.18, 1025.654, 0.728]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_146_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_146_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_146_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_146_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_146_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_146_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_146_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_146_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1042.405, 896.49, 0.459], [1248.26, 1026.303, 0.519], [1554.895, 994.696, 0.5], [1480.656, 1055.338, 0.476]]\nB: [[1168.242, 1176.462, 0.406], [1219.42, 1198.448, 0.487], [1280.042, 1223.429, 0.5], [1301.768, 1040.639, 0.468]]\nC: [[1293.229, 1033.246, 0.388], [1296.13, 1035.001, 0.465], [1296.744, 1035.285, 0.5], [1297.358, 1035.569, 0.535]]\nD: [[1205.688, 1036.352, 0.443], [1283.41, 
852.956, 0.509], [1152.749, 895.819, 0.4], [1317.874, 1012.868, 0.584]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1042.405, 896.49, 0.459], [1248.26, 1026.303, 0.519], [1554.895, 994.696, 0.5], [1480.656, 1055.338, 0.476]]\nB: [[1168.242, 1176.462, 0.406], [1219.42, 1198.448, 0.487], [1280.042, 1223.429, 0.5], [1301.768, 1040.639, 0.468]]\nC: [[1293.229, 1033.246, 0.388], [1296.13, 1035.001, 0.465], [1296.744, 1035.285, 0.5], [1297.358, 1035.569, 0.535]]\nD: [[1205.688, 1036.352, 0.443], [1283.41, 852.956, 0.509], [1152.749, 895.819, 0.4], [1317.874, 1012.868, 0.584]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_147_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_147_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_147_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_147_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_147_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_147_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_147_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_147_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[2143.23, 880.092, 1.463], [2237.79, 816.687, 
1.792], [2021.67, 1004.854, 1.441], [2013.58, 762.901, 1.623]]\nB: [[2030.68, 939.082, 1.556], [1707.64, 827.801, 1.543], [2187.71, 957.754, 1.703], [1734.52, 730.487, 1.539]]\nC: [[2157.8, 869.883, 1.527], [1960.08, 1008.147, 1.759], [1779.88, 929.643, 1.772], [1538.45, 884.771, 1.889]]\nD: [[1907.37, 864.452, 1.647], [1907.37, 864.452, 1.647], [1907.37, 864.452, 1.647], [1907.37, 864.452, 1.647]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[2143.23, 880.092, 1.463], [2237.79, 816.687, 1.792], [2021.67, 1004.854, 1.441], [2013.58, 762.901, 1.623]]\nB: [[2030.68, 939.082, 1.556], [1707.64, 827.801, 1.543], [2187.71, 957.754, 1.703], [1734.52, 730.487, 1.539]]\nC: [[2157.8, 869.883, 1.527], [1960.08, 1008.147, 1.759], [1779.88, 929.643, 1.772], [1538.45, 884.771, 1.889]]\nD: [[1907.37, 864.452, 1.647], [1907.37, 864.452, 1.647], [1907.37, 864.452, 1.647], [1907.37, 864.452, 1.647]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_148_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_148_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_148_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_148_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_148_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_148_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_148_6.png", 
"./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_148_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[333.441, 1111.848, 0.881], [358.68, 1279.096, 0.848], [365.642, 1024.723, 1.135], [416.842, 1221.204, 1.061]]\nB: [[359.553, 1105.178, 0.934], [359.553, 1105.178, 1.045], [359.553, 1105.178, 1.082], [359.553, 1105.178, 1.005]]\nC: [[401.411, 1302.101, 0.9], [401.716, 1050.934, 1.205], [369.114, 1004.72, 0.951], [298.807, 954.194, 1.162]]\nD: [[326.721, 1063.94, 1.102], [316.366, 1058.028, 0.884], [321.691, 954.121, 1.273], [352.284, 1064.278, 1.172]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[333.441, 1111.848, 0.881], [358.68, 1279.096, 0.848], [365.642, 1024.723, 1.135], [416.842, 1221.204, 1.061]]\nB: [[359.553, 1105.178, 0.934], [359.553, 1105.178, 1.045], [359.553, 1105.178, 1.082], [359.553, 1105.178, 1.005]]\nC: [[401.411, 1302.101, 0.9], [401.716, 1050.934, 1.205], [369.114, 1004.72, 0.951], [298.807, 954.194, 1.162]]\nD: [[326.721, 1063.94, 1.102], [316.366, 1058.028, 0.884], [321.691, 954.121, 1.273], [352.284, 1064.278, 1.172]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_149_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_149_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_149_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_149_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_149_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_149_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_149_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_149_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1914.957, 873.014, 0.241], [1914.951, 872.993, 0.241], [1914.944, 872.972, 0.241], [1914.937, 872.951, 0.241]]\nB: [[2029.589, 880.527, 0.258], [1539.423, 812.215, 0.227], [2219.176, 984.894, 0.264], [2240.588, 737.815, 0.2]]\nC: [[2084.127, 771.252, 0.23], [1889.253, 1036.727, 0.264], [1713.036, 984.106, 0.254], [1867.742, 808.403, 0.199]]\nD: [[1951.845, 910.003, 0.272], [2067.414, 726.492, 0.226], [1574.866, 827.537, 0.223], [2080.281, 1029.475, 0.221]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1914.957, 873.014, 0.241], [1914.951, 872.993, 0.241], [1914.944, 872.972, 0.241], [1914.937, 872.951, 0.241]]\nB: [[2029.589, 880.527, 0.258], [1539.423, 812.215, 0.227], [2219.176, 984.894, 0.264], [2240.588, 737.815, 0.2]]\nC: [[2084.127, 771.252, 0.23], [1889.253, 1036.727, 0.264], [1713.036, 984.106, 0.254], [1867.742, 808.403, 0.199]]\nD: [[1951.845, 910.003, 0.272], [2067.414, 726.492, 0.226], [1574.866, 827.537, 0.223], [2080.281, 1029.475, 0.221]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_150_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_150_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_150_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_150_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_150_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_150_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_150_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_150_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[775.145, 1609.526, 0.275], [728.049, 1548.593, 0.225], [726.281, 1889.554, 0.225], [701.777, 1569.883, 0.285]]\nB: [[635.753, 1733.477, 0.272], [720.552, 1845.796, 0.237], [810.826, 1848.81, 0.272], [673.334, 1345.738, 0.236]]\nC: [[563.331, 1620.514, 0.207], [692.265, 1578.918, 0.262], 
[633.015, 1756.886, 0.242], [605.445, 1415.589, 0.246]]\nD: [[696.721, 1578.786, 0.244], [696.693, 1578.758, 0.244], [696.674, 1578.723, 0.244], [696.664, 1578.684, 0.244]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[775.145, 1609.526, 0.275], [728.049, 1548.593, 0.225], [726.281, 1889.554, 0.225], [701.777, 1569.883, 0.285]]\nB: [[635.753, 1733.477, 0.272], [720.552, 1845.796, 0.237], [810.826, 1848.81, 0.272], [673.334, 1345.738, 0.236]]\nC: [[563.331, 1620.514, 0.207], [692.265, 1578.918, 0.262], [633.015, 1756.886, 0.242], [605.445, 1415.589, 0.246]]\nD: [[696.721, 1578.786, 0.244], [696.693, 1578.758, 0.244], [696.674, 1578.723, 0.244], [696.664, 1578.684, 0.244]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_151_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_151_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_151_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_151_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_151_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_151_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_151_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_151_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[650.646, 1603.342, 0.03], [650.62, 1603.362, 0.13], [650.688, 1603.329, 0.308], [650.756, 1603.295, 0.485]]\nB: [[567.786, 1547.911, 0.04], [679.87, 1773.392, 0.15], [682.07, 1467.32, 0.353], [776.72, 1507.964, 0.528]]\nC: [[765.242, 1777.09, 0.03], [719.37, 1669.37, 0.15], [581.778, 1910.816, 0.336], [602.521, 1760.712, 0.392]]\nD: [[522.813, 1797.126, 0.03], [721.02, 1459.791, 0.11], [758.676, 1452.614, 0.344], [628.48, 1519.176, 0.474]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[650.646, 1603.342, 0.03], [650.62, 1603.362, 0.13], [650.688, 1603.329, 0.308], [650.756, 1603.295, 0.485]]\nB: [[567.786, 1547.911, 0.04], [679.87, 1773.392, 0.15], [682.07, 1467.32, 0.353], [776.72, 1507.964, 0.528]]\nC: [[765.242, 1777.09, 0.03], [719.37, 1669.37, 0.15], [581.778, 1910.816, 0.336], [602.521, 1760.712, 0.392]]\nD: [[522.813, 1797.126, 0.03], [721.02, 1459.791, 0.11], [758.676, 1452.614, 0.344], [628.48, 1519.176, 0.474]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_152_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_152_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_152_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_152_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_152_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_152_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_152_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_152_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1344.72, 1281.191, 3.09], [1075.39, 972.846, 3.969], [1227.93, 967.953, 2.882], [1313.85, 1191.672, 2.766]]\nB: [[1340.58, 1102.095, 3.358], [1340.58, 1102.095, 3.358], [1340.56, 1102.069, 3.358], [1340.56, 1102.069, 3.358]]\nC: [[1595.5, 1117.589, 3.115], [1363.06, 1051.692, 3.599], [1333.05, 1281.644, 3.54], [1459.61, 1297.063, 3.45]]\nD: [[1406.68, 1037.893, 3.818], [1555.01, 1107.104, 3.069], [1222.57, 1178.434, 3.387], [1139.88, 1136.126, 3.194]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1344.72, 1281.191, 3.09], [1075.39, 972.846, 3.969], [1227.93, 967.953, 2.882], [1313.85, 1191.672, 2.766]]\nB: [[1340.58, 1102.095, 3.358], [1340.58, 1102.095, 3.358], [1340.56, 1102.069, 3.358], [1340.56, 1102.069, 3.358]]\nC: [[1595.5, 1117.589, 3.115], [1363.06, 1051.692, 3.599], [1333.05, 1281.644, 3.54], [1459.61, 1297.063, 3.45]]\nD: [[1406.68, 1037.893, 3.818], [1555.01, 1107.104, 3.069], [1222.57, 1178.434, 3.387], [1139.88, 1136.126, 3.194]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_153_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_153_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_153_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_153_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_153_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_153_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_153_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_153_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[366.659, 1011.101, 1.22], [364.315, 1040.812, 1.078], [399.107, 1299.752, 1.185], [296.881, 916.886, 1.47]]\nB: [[363.259, 1094.238, 1.247], [363.277, 1094.229, 1.276], [363.296, 1094.221, 1.306], [363.315, 1094.212, 1.335]]\nC: [[349.227, 1179.598, 1.249], [388.454, 911.85, 1.32], [338.754, 
1093.699, 1.127], [427.119, 948.416, 1.102]]\nD: [[420.809, 960.294, 1.148], [382.372, 1303.064, 1.394], [429.821, 1184.841, 1.121], [362.615, 1271.967, 1.2]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[366.659, 1011.101, 1.22], [364.315, 1040.812, 1.078], [399.107, 1299.752, 1.185], [296.881, 916.886, 1.47]]\nB: [[363.259, 1094.238, 1.247], [363.277, 1094.229, 1.276], [363.296, 1094.221, 1.306], [363.315, 1094.212, 1.335]]\nC: [[349.227, 1179.598, 1.249], [388.454, 911.85, 1.32], [338.754, 1093.699, 1.127], [427.119, 948.416, 1.102]]\nD: [[420.809, 960.294, 1.148], [382.372, 1303.064, 1.394], [429.821, 1184.841, 1.121], [362.615, 1271.967, 1.2]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_154_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_154_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_154_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_154_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_154_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_154_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_154_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_154_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[1954.841, 997.5, 0.236], [1604.826, 1018.173, 0.282], [2056.441, 825.549, 0.202], [2053.148, 806.675, 0.206]]\nB: [[1911.473, 872.92, 0.247], [1911.473, 872.927, 0.247], [1911.473, 872.935, 0.247], [1911.473, 872.912, 0.247]]\nC: [[1627.59, 704.51, 0.221], [1743.247, 820.718, 0.239], [1954.356, 974.622, 0.27], [1682.124, 823.985, 0.25]]\nD: [[1859.848, 855.57, 0.227], [1677.344, 943.885, 0.289], [1603.535, 883.7, 0.263], [2177.549, 800.573, 0.251]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1954.841, 997.5, 0.236], [1604.826, 1018.173, 0.282], [2056.441, 825.549, 0.202], [2053.148, 806.675, 0.206]]\nB: [[1911.473, 872.92, 0.247], [1911.473, 872.927, 0.247], [1911.473, 872.935, 0.247], [1911.473, 872.912, 0.247]]\nC: [[1627.59, 704.51, 0.221], [1743.247, 820.718, 0.239], [1954.356, 974.622, 0.27], [1682.124, 823.985, 0.25]]\nD: [[1859.848, 855.57, 0.227], [1677.344, 943.885, 0.289], [1603.535, 883.7, 0.263], [2177.549, 800.573, 0.251]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_155_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_155_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_155_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_155_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_155_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_155_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_155_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_155_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[405.086, 1259.825, 0.988], [341.109, 1024.661, 1.196], [400.072, 1270.26, 0.973], [361.016, 1307.675, 1.222]]\nB: [[493.579, 972.185, 0.834], [476.692, 1075.558, 0.912], [401.433, 969.577, 0.981], [403.596, 1269.671, 1.032]]\nC: [[431.9, 1414.092, 1.036], [482.304, 1353.899, 0.879], [441.439, 1311.735, 0.916], [400.616, 1079.275, 1.136]]\nD: [[417.374, 1192.132, 0.961], [416.718, 1192.286, 1.011], [416.058, 1192.412, 1.061], [415.392, 1192.512, 1.111]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[405.086, 1259.825, 0.988], [341.109, 1024.661, 1.196], [400.072, 1270.26, 0.973], [361.016, 1307.675, 1.222]]\nB: [[493.579, 972.185, 0.834], [476.692, 1075.558, 0.912], [401.433, 969.577, 0.981], [403.596, 1269.671, 1.032]]\nC: [[431.9, 1414.092, 1.036], [482.304, 1353.899, 0.879], [441.439, 1311.735, 0.916], [400.616, 1079.275, 1.136]]\nD: [[417.374, 1192.132, 0.961], [416.718, 1192.286, 1.011], [416.058, 1192.412, 1.061], [415.392, 1192.512, 1.111]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_156_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_156_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_156_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_156_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_156_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_156_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_156_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_156_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[326.56, 1024.467, 0.519], [375.34, 1251.894, 0.454], [324.82, 1119.168, 0.508], [372.44, 1074.004, 0.7]]\nB: [[446.37, 1235.831, 0.437], [419.37, 934.045, 0.644], [380.01, 1072.298, 0.649], [417.35, 1264.432, 0.693]]\nC: [[438.81, 1043.645, 0.444], [435.77, 1350.631, 0.617], [421.85, 
1208.417, 0.602], [334.9, 991.841, 0.557]]\nD: [[387.52, 1143.568, 0.508], [387.52, 1143.568, 0.541], [387.52, 1143.568, 0.575], [387.52, 1143.568, 0.608]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[326.56, 1024.467, 0.519], [375.34, 1251.894, 0.454], [324.82, 1119.168, 0.508], [372.44, 1074.004, 0.7]]\nB: [[446.37, 1235.831, 0.437], [419.37, 934.045, 0.644], [380.01, 1072.298, 0.649], [417.35, 1264.432, 0.693]]\nC: [[438.81, 1043.645, 0.444], [435.77, 1350.631, 0.617], [421.85, 1208.417, 0.602], [334.9, 991.841, 0.557]]\nD: [[387.52, 1143.568, 0.508], [387.52, 1143.568, 0.541], [387.52, 1143.568, 0.575], [387.52, 1143.568, 0.608]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_157_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_157_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_157_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_157_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_157_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_157_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_157_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_157_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", 
"options": "A: [[482.206, 1161.483, 0.923], [362.768, 1002.877, 1.176], [464.329, 933.744, 1.058], [365.042, 1087.466, 0.59]]\nB: [[395.933, 960.032, 0.91], [390.823, 1104.508, 1.171], [437.592, 1073.569, 0.945], [481.154, 1034.808, 0.54]]\nC: [[418.967, 1094.306, 1.038], [418.951, 1094.348, 1.008], [418.987, 1094.368, 1.068], [418.873, 1094.555, 0.56]]\nD: [[425.42, 1279.316, 1.07], [448.65, 1259.388, 1.115], [445.464, 1156.231, 1.185], [492.603, 965.675, 0.64]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[482.206, 1161.483, 0.923], [362.768, 1002.877, 1.176], [464.329, 933.744, 1.058], [365.042, 1087.466, 0.59]]\nB: [[395.933, 960.032, 0.91], [390.823, 1104.508, 1.171], [437.592, 1073.569, 0.945], [481.154, 1034.808, 0.54]]\nC: [[418.967, 1094.306, 1.038], [418.951, 1094.348, 1.008], [418.987, 1094.368, 1.068], [418.873, 1094.555, 0.56]]\nD: [[425.42, 1279.316, 1.07], [448.65, 1259.388, 1.115], [445.464, 1156.231, 1.185], [492.603, 965.675, 0.64]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_158_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_158_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_158_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_158_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_158_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_158_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_158_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_158_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[2133.097, 774.816, 1.225], [1748.933, 832.577, 1.031], [1567.379, 834.66, 1.265], [1502.433, 691.935, 1.105]]\nB: [[1806.535, 861.266, 1.066], [1807.133, 859.654, 1.066], [1807.563, 858.13, 1.066], [1807.849, 856.657, 1.021]]\nC: [[2099.2, 882.41, 1.014], [2094.1, 791.51, 0.857], [1724.174, 770.83, 0.904], [1636.953, 895.976, 1.042]]\nD: [[1853.024, 785.737, 0.887], [1793.267, 868.356, 1.224], [1854.012, 828.75, 1.266], [2127.184, 793.379, 1.141]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[2133.097, 774.816, 1.225], [1748.933, 832.577, 1.031], [1567.379, 834.66, 1.265], [1502.433, 691.935, 1.105]]\nB: [[1806.535, 861.266, 1.066], [1807.133, 859.654, 1.066], [1807.563, 858.13, 1.066], [1807.849, 856.657, 1.021]]\nC: [[2099.2, 882.41, 1.014], [2094.1, 791.51, 0.857], [1724.174, 770.83, 0.904], [1636.953, 895.976, 1.042]]\nD: [[1853.024, 785.737, 0.887], [1793.267, 868.356, 1.224], [1854.012, 828.75, 1.266], [2127.184, 793.379, 1.141]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_159_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_159_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_159_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_159_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_159_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_159_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_159_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_159_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1039.96, 1235.697, 1.849], [1056.893, 1292.385, 1.963], [1073.252, 1262.199, 2.07], [1057.534, 1122.452, 1.706]]\nB: [[1248.78, 1187.636, 2.366], [1189.102, 1066.566, 2.081], [1235.361, 1032.631, 1.92], [1111.357, 1241.726, 2.171]]\nC: [[1147.17, 1172.553, 1.994], [1344.188, 1301.314, 1.937], 
[974.406, 1174.168, 1.55], [1159.003, 1238.282, 2.257]]\nD: [[1181.87, 1122.359, 2.116], [1185.363, 1119.943, 2.248], [1188.856, 1117.528, 1.93], [1192.349, 1115.113, 1.954]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1039.96, 1235.697, 1.849], [1056.893, 1292.385, 1.963], [1073.252, 1262.199, 2.07], [1057.534, 1122.452, 1.706]]\nB: [[1248.78, 1187.636, 2.366], [1189.102, 1066.566, 2.081], [1235.361, 1032.631, 1.92], [1111.357, 1241.726, 2.171]]\nC: [[1147.17, 1172.553, 1.994], [1344.188, 1301.314, 1.937], [974.406, 1174.168, 1.55], [1159.003, 1238.282, 2.257]]\nD: [[1181.87, 1122.359, 2.116], [1185.363, 1119.943, 2.248], [1188.856, 1117.528, 1.93], [1192.349, 1115.113, 1.954]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_160_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_160_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_160_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_160_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_160_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_160_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_160_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_160_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", 
"source": "NuScenes_threed_Object_Tracking", "options": "A: [[407.79, 1169.713, 1.047], [438.535, 914.451, 0.817], [474.944, 1174.579, 0.711], [407.788, 1311.155, 0.697]]\nB: [[411.096, 1097.949, 1.026], [411.096, 1097.949, 0.806], [411.096, 1097.949, 0.756], [411.181, 1097.914, 0.706]]\nC: [[380.059, 1294.905, 1.061], [462.302, 957.203, 0.843], [360.056, 1036.95, 0.635], [377.308, 1174.326, 0.803]]\nD: [[453.527, 1100.533, 1.002], [399.919, 928.877, 0.842], [458.797, 1077.795, 0.803], [333.443, 1294.271, 0.8]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[407.79, 1169.713, 1.047], [438.535, 914.451, 0.817], [474.944, 1174.579, 0.711], [407.788, 1311.155, 0.697]]\nB: [[411.096, 1097.949, 1.026], [411.096, 1097.949, 0.806], [411.096, 1097.949, 0.756], [411.181, 1097.914, 0.706]]\nC: [[380.059, 1294.905, 1.061], [462.302, 957.203, 0.843], [360.056, 1036.95, 0.635], [377.308, 1174.326, 0.803]]\nD: [[453.527, 1100.533, 1.002], [399.919, 928.877, 0.842], [458.797, 1077.795, 0.803], [333.443, 1294.271, 0.8]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_161_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_161_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_161_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_161_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_161_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_161_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_161_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_161_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[377.184, 977.334, 0.742], [375.911, 1030.014, 0.731], [428.729, 1127.517, 0.56], [402.816, 1027.37, 0.555]]\nB: [[447.851, 943.564, 0.851], [507.44, 1079.2, 0.691], [496.497, 1228.261, 0.42], [422.118, 1234.56, 0.566]]\nC: [[427.284, 1091.127, 0.774], [427.286, 1091.126, 0.707], [427.292, 1091.122, 0.507], [427.294, 1091.12, 0.628]]\nD: [[480.047, 995.531, 0.924], [418.676, 923.092, 0.59], [412.858, 1042.957, 0.421], [343.915, 1040.26, 0.623]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[377.184, 977.334, 0.742], [375.911, 1030.014, 0.731], [428.729, 1127.517, 0.56], [402.816, 1027.37, 0.555]]\nB: [[447.851, 943.564, 0.851], [507.44, 1079.2, 0.691], [496.497, 1228.261, 0.42], [422.118, 1234.56, 0.566]]\nC: [[427.284, 1091.127, 0.774], [427.286, 1091.126, 0.707], [427.292, 1091.122, 0.507], [427.294, 1091.12, 0.628]]\nD: [[480.047, 995.531, 0.924], [418.676, 923.092, 0.59], [412.858, 1042.957, 0.421], [343.915, 1040.26, 0.623]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_162_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_162_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_162_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_162_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_162_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_162_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_162_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_162_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1470.303, 1200.503, -0.117], [1211.565, 906.028, -0.109], [1265.671, 860.24, -0.139], [1225.185, 861.066, -0.166]]\nB: [[1121.534, 848.02, -0.101], [1300.088, 882.168, -0.116], [1372.338, 987.41, -0.161], [1121.468, 1062.994, -0.117]]\nC: [[1239.215, 1012.078, -0.106], [1239.169, 1012.039, -0.126], 
[1239.146, 1012.02, -0.136], [1239.123, 1012.001, -0.146]]\nD: [[1353.598, 917.259, -0.096], [1218.603, 1014.25, -0.105], [1110.323, 1116.23, -0.13], [1224.972, 1200.395, -0.138]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1470.303, 1200.503, -0.117], [1211.565, 906.028, -0.109], [1265.671, 860.24, -0.139], [1225.185, 861.066, -0.166]]\nB: [[1121.534, 848.02, -0.101], [1300.088, 882.168, -0.116], [1372.338, 987.41, -0.161], [1121.468, 1062.994, -0.117]]\nC: [[1239.215, 1012.078, -0.106], [1239.169, 1012.039, -0.126], [1239.146, 1012.02, -0.136], [1239.123, 1012.001, -0.146]]\nD: [[1353.598, 917.259, -0.096], [1218.603, 1014.25, -0.105], [1110.323, 1116.23, -0.13], [1224.972, 1200.395, -0.138]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_163_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_163_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_163_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_163_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_163_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_163_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_163_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_163_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and 
natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[390.311, 1121.642, 0.818], [389.934, 1120.974, 0.748], [389.624, 1120.4, 0.778], [389.38, 1119.804, 0.858]]\nB: [[315.313, 963.621, 0.732], [377.113, 912.889, 0.734], [351.418, 1317.0, 0.875], [422.52, 1203.28, 0.858]]\nC: [[416.232, 1151.901, 0.879], [421.322, 1109.197, 0.81], [452.884, 1049.1, 0.63], [415.15, 1098.518, 0.99]]\nD: [[432.466, 1326.016, 0.887], [398.73, 1272.899, 0.773], [453.388, 964.3, 0.838], [398.14, 1308.497, 0.972]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[390.311, 1121.642, 0.818], [389.934, 1120.974, 0.748], [389.624, 1120.4, 0.778], [389.38, 1119.804, 0.858]]\nB: [[315.313, 963.621, 0.732], [377.113, 912.889, 0.734], [351.418, 1317.0, 0.875], [422.52, 1203.28, 0.858]]\nC: [[416.232, 1151.901, 0.879], [421.322, 1109.197, 0.81], [452.884, 1049.1, 0.63], [415.15, 1098.518, 0.99]]\nD: [[432.466, 1326.016, 0.887], [398.73, 1272.899, 0.773], [453.388, 964.3, 0.838], [398.14, 1308.497, 0.972]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_164_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_164_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_164_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_164_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_164_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_164_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_164_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_164_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[288.389, 823.36, 1.04], [243.336, 770.263, 1.17], [329.163, 556.194, 0.924], [278.167, 724.049, 1.175]]\nB: [[303.148, 635.853, 0.872], [255.423, 572.035, 1.073], [262.191, 559.079, 0.89], [295.029, 679.27, 0.862]]\nC: [[250.538, 640.039, 0.892], [355.139, 799.298, 1.133], [280.545, 705.593, 1.08], [285.606, 739.161, 1.054]]\nD: [[301.073, 691.921, 1.008], [300.166, 690.701, 1.008], [299.259, 689.481, 1.008], [298.351, 688.262, 1.008]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. 
The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[288.389, 823.36, 1.04], [243.336, 770.263, 1.17], [329.163, 556.194, 0.924], [278.167, 724.049, 1.175]]\nB: [[303.148, 635.853, 0.872], [255.423, 572.035, 1.073], [262.191, 559.079, 0.89], [295.029, 679.27, 0.862]]\nC: [[250.538, 640.039, 0.892], [355.139, 799.298, 1.133], [280.545, 705.593, 1.08], [285.606, 739.161, 1.054]]\nD: [[301.073, 691.921, 1.008], [300.166, 690.701, 1.008], [299.259, 689.481, 1.008], [298.351, 688.262, 1.008]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_165_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_165_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_165_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_165_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_165_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_165_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_165_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_165_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[374.966, 918.231, 1.798], [317.815, 1228.212, 1.721], [398.89, 875.569, 1.814], [428.79, 1016.473, 1.875]]\nB: [[360.439, 1086.932, 1.56], [381.745, 1179.649, 1.447], [395.985, 1168.142, 1.848], [396.108, 960.271, 1.709]]\nC: [[358.757, 1084.221, 1.582], [358.757, 1084.221, 1.575], [358.757, 1084.221, 1.585], [358.757, 1084.221, 1.664]]\nD: [[320.557, 873.472, 1.863], [409.377, 874.584, 1.304], 
[376.574, 953.049, 1.86], [336.743, 903.665, 1.608]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[374.966, 918.231, 1.798], [317.815, 1228.212, 1.721], [398.89, 875.569, 1.814], [428.79, 1016.473, 1.875]]\nB: [[360.439, 1086.932, 1.56], [381.745, 1179.649, 1.447], [395.985, 1168.142, 1.848], [396.108, 960.271, 1.709]]\nC: [[358.757, 1084.221, 1.582], [358.757, 1084.221, 1.575], [358.757, 1084.221, 1.585], [358.757, 1084.221, 1.664]]\nD: [[320.557, 873.472, 1.863], [409.377, 874.584, 1.304], [376.574, 953.049, 1.86], [336.743, 903.665, 1.608]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_166_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_166_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_166_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_166_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_166_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_166_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_166_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_166_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[2192.318, 842.128, 0.202], [2028.569, 1002.104, 0.362], [1823.353, 915.088, 
0.434], [1848.897, 1015.376, 0.66]]\nB: [[2200.193, 887.932, 0.279], [2035.641, 762.88, 0.308], [1865.863, 809.786, 0.584], [1497.967, 1016.346, 0.579]]\nC: [[1842.467, 871.854, 0.249], [1837.266, 871.727, 0.362], [1831.484, 871.587, 0.487], [1825.702, 871.447, 0.612]]\nD: [[1510.791, 938.627, 0.271], [1853.106, 942.739, 0.426], [1927.734, 739.554, 0.479], [1666.087, 996.74, 0.708]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[2192.318, 842.128, 0.202], [2028.569, 1002.104, 0.362], [1823.353, 915.088, 0.434], [1848.897, 1015.376, 0.66]]\nB: [[2200.193, 887.932, 0.279], [2035.641, 762.88, 0.308], [1865.863, 809.786, 0.584], [1497.967, 1016.346, 0.579]]\nC: [[1842.467, 871.854, 0.249], [1837.266, 871.727, 0.362], [1831.484, 871.587, 0.487], [1825.702, 871.447, 0.612]]\nD: [[1510.791, 938.627, 0.271], [1853.106, 942.739, 0.426], [1927.734, 739.554, 0.479], [1666.087, 996.74, 0.708]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_167_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_167_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_167_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_167_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_167_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_167_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_167_6.png", 
"./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_167_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[342.032, 761.086, -0.593], [347.646, 679.25, -0.401], [364.231, 569.092, -0.328], [289.053, 634.842, -0.205]]\nB: [[301.298, 665.695, -0.633], [284.255, 584.96, -0.505], [255.024, 783.231, -0.315], [245.543, 830.729, -0.175]]\nC: [[311.284, 836.779, -0.649], [370.864, 573.4, -0.497], [360.873, 715.839, -0.301], [249.474, 613.179, -0.21]]\nD: [[312.445, 705.589, -0.612], [310.539, 703.33, -0.479], [308.633, 701.071, -0.346], [306.729, 698.815, -0.214]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[342.032, 761.086, -0.593], [347.646, 679.25, -0.401], [364.231, 569.092, -0.328], [289.053, 634.842, -0.205]]\nB: [[301.298, 665.695, -0.633], [284.255, 584.96, -0.505], [255.024, 783.231, -0.315], [245.543, 830.729, -0.175]]\nC: [[311.284, 836.779, -0.649], [370.864, 573.4, -0.497], [360.873, 715.839, -0.301], [249.474, 613.179, -0.21]]\nD: [[312.445, 705.589, -0.612], [310.539, 703.33, -0.479], [308.633, 701.071, -0.346], [306.729, 698.815, -0.214]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_168_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_168_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_168_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_168_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_168_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_168_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_168_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_168_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[363.209, 1123.747, 1.102], [363.143, 1123.593, 1.085], [363.102, 1123.498, 1.068], [363.137, 1123.58, 1.052]]\nB: [[378.885, 1276.718, 1.19], [435.59, 1222.123, 1.009], [430.113, 989.283, 1.158], [375.367, 990.91, 0.956]]\nC: [[413.521, 1104.065, 1.115], [428.494, 917.554, 1.251], [320.18, 1021.737, 0.933], [318.09, 1005.13, 1.022]]\nD: [[427.259, 1339.216, 1.291], [315.569, 1079.127, 0.936], [304.042, 1194.504, 1.038], [323.022, 982.56, 0.905]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[363.209, 1123.747, 1.102], [363.143, 1123.593, 1.085], [363.102, 1123.498, 1.068], [363.137, 1123.58, 1.052]]\nB: [[378.885, 1276.718, 1.19], [435.59, 1222.123, 1.009], [430.113, 989.283, 1.158], [375.367, 990.91, 0.956]]\nC: [[413.521, 1104.065, 1.115], [428.494, 917.554, 1.251], [320.18, 1021.737, 0.933], [318.09, 1005.13, 1.022]]\nD: [[427.259, 1339.216, 1.291], [315.569, 1079.127, 0.936], [304.042, 1194.504, 1.038], [323.022, 982.56, 0.905]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_169_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_169_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_169_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_169_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_169_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_169_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_169_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_169_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1333.186, 1040.597, 0.635], [1333.186, 1040.597, 0.635], [1333.186, 1040.597, 0.635], [1333.186, 1040.597, 0.635]]\nB: [[1514.635, 950.684, 0.564], [1066.743, 1156.105, 0.658], [1204.49, 1019.642, 0.74], [1089.793, 1017.943, 0.569]]\nC: [[1321.867, 926.129, 0.741], [1261.374, 1241.754, 0.725], 
[1359.131, 1028.017, 0.564], [1440.895, 941.11, 0.737]]\nD: [[1471.054, 874.495, 0.591], [1148.049, 1089.103, 0.508], [1489.63, 929.92, 0.603], [1503.98, 1037.54, 0.682]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1333.186, 1040.597, 0.635], [1333.186, 1040.597, 0.635], [1333.186, 1040.597, 0.635], [1333.186, 1040.597, 0.635]]\nB: [[1514.635, 950.684, 0.564], [1066.743, 1156.105, 0.658], [1204.49, 1019.642, 0.74], [1089.793, 1017.943, 0.569]]\nC: [[1321.867, 926.129, 0.741], [1261.374, 1241.754, 0.725], [1359.131, 1028.017, 0.564], [1440.895, 941.11, 0.737]]\nD: [[1471.054, 874.495, 0.591], [1148.049, 1089.103, 0.508], [1489.63, 929.92, 0.603], [1503.98, 1037.54, 0.682]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_170_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_170_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_170_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_170_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_170_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_170_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_170_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_170_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", 
"source": "NuScenes_threed_Object_Tracking", "options": "A: [[587.007, 1451.429, 1.011], [592.203, 1759.483, 1.117], [647.108, 1795.963, 1.03], [638.355, 1540.243, 0.93]]\nB: [[640.141, 1888.738, 0.846], [726.767, 1525.298, 0.951], [759.989, 1292.629, 0.84], [736.789, 1660.505, 0.979]]\nC: [[666.472, 1849.208, 0.854], [731.041, 1481.554, 0.835], [803.226, 1800.553, 1.04], [593.45, 1762.889, 1.042]]\nD: [[671.323, 1578.674, 0.957], [671.316, 1578.671, 0.964], [671.308, 1578.668, 0.97], [671.301, 1578.665, 0.976]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[587.007, 1451.429, 1.011], [592.203, 1759.483, 1.117], [647.108, 1795.963, 1.03], [638.355, 1540.243, 0.93]]\nB: [[640.141, 1888.738, 0.846], [726.767, 1525.298, 0.951], [759.989, 1292.629, 0.84], [736.789, 1660.505, 0.979]]\nC: [[666.472, 1849.208, 0.854], [731.041, 1481.554, 0.835], [803.226, 1800.553, 1.04], [593.45, 1762.889, 1.042]]\nD: [[671.323, 1578.674, 0.957], [671.316, 1578.671, 0.964], [671.308, 1578.668, 0.97], [671.301, 1578.665, 0.976]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_171_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_171_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_171_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_171_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_171_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_171_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_171_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_171_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[291.787, 679.069, -0.469], [277.905, 732.394, -0.336], [252.638, 766.267, -0.161], [332.611, 636.857, -0.009]]\nB: [[309.014, 658.598, -0.45], [335.072, 790.615, -0.333], [250.147, 699.934, -0.154], [283.681, 656.435, -0.007]]\nC: [[307.569, 699.998, -0.459], [305.228, 697.097, -0.309], [302.888, 694.195, -0.159], [300.547, 691.293, -0.008]]\nD: [[306.127, 672.26, -0.442], [332.611, 777.126, -0.361], [293.54, 777.55, -0.16], [277.878, 827.191, -0.008]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[291.787, 679.069, -0.469], [277.905, 732.394, -0.336], [252.638, 766.267, -0.161], [332.611, 636.857, -0.009]]\nB: [[309.014, 658.598, -0.45], [335.072, 790.615, -0.333], [250.147, 699.934, -0.154], [283.681, 656.435, -0.007]]\nC: [[307.569, 699.998, -0.459], [305.228, 697.097, -0.309], [302.888, 694.195, -0.159], [300.547, 691.293, -0.008]]\nD: [[306.127, 672.26, -0.442], [332.611, 777.126, -0.361], [293.54, 777.55, -0.16], [277.878, 827.191, -0.008]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_172_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_172_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_172_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_172_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_172_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_172_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_172_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_172_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[410.021, 1418.263, 1.006], [335.263, 1143.6, 0.762], [373.402, 1388.34, 1.144], [360.742, 1297.733, 1.1]]\nB: [[401.282, 1193.478, 0.862], [402.619, 1193.23, 0.782], [404.132, 1193.11, 1.001], [405.307, 1192.971, 1.1]]\nC: [[476.438, 1152.023, 0.752], [338.739, 1127.79, 0.816], [436.589, 
1274.03, 1.097], [419.84, 1106.209, 1.0]]\nD: [[421.529, 1380.4, 0.797], [464.928, 1028.31, 0.794], [411.148, 983.52, 1.058], [384.373, 1245.097, 1.2]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[410.021, 1418.263, 1.006], [335.263, 1143.6, 0.762], [373.402, 1388.34, 1.144], [360.742, 1297.733, 1.1]]\nB: [[401.282, 1193.478, 0.862], [402.619, 1193.23, 0.782], [404.132, 1193.11, 1.001], [405.307, 1192.971, 1.1]]\nC: [[476.438, 1152.023, 0.752], [338.739, 1127.79, 0.816], [436.589, 1274.03, 1.097], [419.84, 1106.209, 1.0]]\nD: [[421.529, 1380.4, 0.797], [464.928, 1028.31, 0.794], [411.148, 983.52, 1.058], [384.373, 1245.097, 1.2]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_173_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_173_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_173_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_173_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_173_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_173_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_173_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_173_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", 
"options": "A: [[720.915, 1310.639, 0.073], [578.541, 1908.08, -0.015], [537.43, 1833.644, 0.24], [531.48, 1353.149, 0.504]]\nB: [[621.807, 1622.951, 0.061], [622.306, 1622.44, -0.014], [623.24, 1621.439, 0.236], [623.71, 1620.935, 0.436]]\nC: [[724.675, 1443.65, 0.065], [727.047, 1304.33, -0.013], [681.05, 1445.626, 0.193], [593.86, 1727.459, 0.372]]\nD: [[667.376, 1736.543, 0.069], [736.258, 1753.41, -0.016], [704.18, 1743.04, 0.25], [644.2, 1366.127, 0.492]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[720.915, 1310.639, 0.073], [578.541, 1908.08, -0.015], [537.43, 1833.644, 0.24], [531.48, 1353.149, 0.504]]\nB: [[621.807, 1622.951, 0.061], [622.306, 1622.44, -0.014], [623.24, 1621.439, 0.236], [623.71, 1620.935, 0.436]]\nC: [[724.675, 1443.65, 0.065], [727.047, 1304.33, -0.013], [681.05, 1445.626, 0.193], [593.86, 1727.459, 0.372]]\nD: [[667.376, 1736.543, 0.069], [736.258, 1753.41, -0.016], [704.18, 1743.04, 0.25], [644.2, 1366.127, 0.492]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_174_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_174_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_174_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_174_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_174_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_174_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_174_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_174_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[2009.229, 1014.34, 0.361], [1586.06, 754.207, 0.245], [2016.84, 904.343, 0.311], [2259.58, 852.389, 0.226]]\nB: [[1902.189, 877.268, 0.309], [1902.179, 877.284, 0.296], [1902.17, 877.299, 0.284], [1902.16, 877.315, 0.271]]\nC: [[1742.017, 880.394, 0.345], [2183.098, 837.873, 0.305], [2257.96, 877.436, 0.227], [1592.71, 1021.048, 0.257]]\nD: [[1584.307, 954.942, 0.354], [1730.467, 891.446, 0.254], [1805.84, 870.388, 0.252], [2140.35, 875.505, 0.278]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[2009.229, 1014.34, 0.361], [1586.06, 754.207, 0.245], [2016.84, 904.343, 0.311], [2259.58, 852.389, 0.226]]\nB: [[1902.189, 877.268, 0.309], [1902.179, 877.284, 0.296], [1902.17, 877.299, 0.284], [1902.16, 877.315, 0.271]]\nC: [[1742.017, 880.394, 0.345], [2183.098, 837.873, 0.305], [2257.96, 877.436, 0.227], [1592.71, 1021.048, 0.257]]\nD: [[1584.307, 954.942, 0.354], [1730.467, 891.446, 0.254], [1805.84, 870.388, 0.252], [2140.35, 875.505, 0.278]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_175_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_175_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_175_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_175_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_175_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_175_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_175_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_175_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[318.894, 1055.632, 0.551], [364.825, 1096.642, 0.53], [434.434, 1265.049, 0.623], [381.624, 937.181, 0.532]]\nB: [[473.765, 1023.067, 0.591], [358.411, 1214.0, 0.58], [399.844, 969.318, 0.568], [346.878, 1295.895, 0.701]]\nC: [[396.557, 1112.412, 0.545], [396.557, 1112.412, 0.57], [396.557, 
1112.412, 0.595], [396.559, 1112.411, 0.612]]\nD: [[448.687, 1322.501, 0.469], [405.927, 1315.306, 0.58], [394.036, 899.204, 0.523], [445.892, 1047.925, 0.674]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[318.894, 1055.632, 0.551], [364.825, 1096.642, 0.53], [434.434, 1265.049, 0.623], [381.624, 937.181, 0.532]]\nB: [[473.765, 1023.067, 0.591], [358.411, 1214.0, 0.58], [399.844, 969.318, 0.568], [346.878, 1295.895, 0.701]]\nC: [[396.557, 1112.412, 0.545], [396.557, 1112.412, 0.57], [396.557, 1112.412, 0.595], [396.559, 1112.411, 0.612]]\nD: [[448.687, 1322.501, 0.469], [405.927, 1315.306, 0.58], [394.036, 899.204, 0.523], [445.892, 1047.925, 0.674]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_176_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_176_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_176_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_176_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_176_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_176_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_176_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_176_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[616.635, 1647.01, 0.068], [712.669, 1327.311, 0.03], [653.979, 1570.408, -0.012], [669.754, 1866.231, 0.168]]\nB: [[575.735, 1633.265, 0.067], [652.707, 1894.197, 0.04], [585.605, 1401.243, -0.011], [517.999, 1818.195, 0.152]]\nC: [[619.603, 1624.655, 0.071], [620.215, 1624.227, 0.03], [620.828, 1623.798, -0.012], [621.449, 1623.383, 0.146]]\nD: [[697.585, 1370.261, 0.08], [634.839, 1685.003, 0.02], [620.085, 1806.807, -0.014], [537.801, 1756.717, 0.169]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[616.635, 1647.01, 0.068], [712.669, 1327.311, 0.03], [653.979, 1570.408, -0.012], [669.754, 1866.231, 0.168]]\nB: [[575.735, 1633.265, 0.067], [652.707, 1894.197, 0.04], [585.605, 1401.243, -0.011], [517.999, 1818.195, 0.152]]\nC: [[619.603, 1624.655, 0.071], [620.215, 1624.227, 0.03], [620.828, 1623.798, -0.012], [621.449, 1623.383, 0.146]]\nD: [[697.585, 1370.261, 0.08], [634.839, 1685.003, 0.02], [620.085, 1806.807, -0.014], [537.801, 1756.717, 0.169]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_177_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_177_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_177_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_177_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_177_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_177_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_177_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_177_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[321.608, 1134.254, 0.47], [376.579, 1092.364, 0.561], [328.646, 1331.008, 0.5], [429.422, 1013.513, 0.542]]\nB: [[366.01, 1353.528, 0.41], [346.536, 1032.294, 0.437], [456.295, 1314.242, 0.49], [462.86, 1106.077, 0.501]]\nC: [[394.87, 1020.543, 0.46], [447.385, 1022.557, 0.479], [328.398, 1293.308, 0.63], [370.045, 1320.086, 0.474]]\nD: [[395.651, 1160.538, 0.51], [395.651, 1160.538, 0.535], [395.651, 1160.538, 0.56], [395.651, 1160.538, 0.585]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[321.608, 1134.254, 0.47], [376.579, 1092.364, 0.561], [328.646, 1331.008, 0.5], [429.422, 1013.513, 0.542]]\nB: [[366.01, 1353.528, 0.41], [346.536, 1032.294, 0.437], [456.295, 1314.242, 0.49], [462.86, 1106.077, 0.501]]\nC: [[394.87, 1020.543, 0.46], [447.385, 1022.557, 0.479], [328.398, 1293.308, 0.63], [370.045, 1320.086, 0.474]]\nD: [[395.651, 1160.538, 0.51], [395.651, 1160.538, 0.535], [395.651, 1160.538, 0.56], [395.651, 1160.538, 0.585]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_178_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_178_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_178_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_178_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_178_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_178_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_178_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_178_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[705.601, 1670.358, 1.653], [759.064, 1611.115, 1.64], [821.344, 1702.722, 1.626], [808.425, 1696.718, 1.401]]\nB: [[729.457, 1296.83, 1.225], [673.831, 1849.549, 1.6], [822.222, 1475.002, 1.59], [568.262, 1723.288, 1.529]]\nC: [[780.777, 1370.477, 1.602], [703.571, 1274.284, 1.37], [714.428, 
1801.829, 1.419], [746.711, 1436.885, 1.447]]\nD: [[710.384, 1565.299, 1.507], [708.536, 1567.037, 1.44], [706.623, 1568.784, 1.524], [704.701, 1570.522, 1.457]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[705.601, 1670.358, 1.653], [759.064, 1611.115, 1.64], [821.344, 1702.722, 1.626], [808.425, 1696.718, 1.401]]\nB: [[729.457, 1296.83, 1.225], [673.831, 1849.549, 1.6], [822.222, 1475.002, 1.59], [568.262, 1723.288, 1.529]]\nC: [[780.777, 1370.477, 1.602], [703.571, 1274.284, 1.37], [714.428, 1801.829, 1.419], [746.711, 1436.885, 1.447]]\nD: [[710.384, 1565.299, 1.507], [708.536, 1567.037, 1.44], [706.623, 1568.784, 1.524], [704.701, 1570.522, 1.457]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_179_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_179_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_179_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_179_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_179_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_179_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_179_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_179_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[320.577, 1284.53, 0.932], [414.562, 975.368, 0.94], [402.331, 1119.979, 0.929], [395.463, 906.274, 0.959]]\nB: [[360.972, 1107.73, 0.916], [361.016, 1107.522, 0.816], [361.016, 1107.522, 0.816], [360.975, 1107.716, 1.016]]\nC: [[398.669, 1187.71, 1.007], [399.141, 1252.975, 0.956], [420.325, 1170.467, 0.694], [429.664, 938.089, 1.209]]\nD: [[297.101, 1040.56, 0.77], [316.568, 1235.213, 0.681], [321.404, 1279.995, 0.669], [348.852, 1148.111, 0.837]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[320.577, 1284.53, 0.932], [414.562, 975.368, 0.94], [402.331, 1119.979, 0.929], [395.463, 906.274, 0.959]]\nB: [[360.972, 1107.73, 0.916], [361.016, 1107.522, 0.816], [361.016, 1107.522, 0.816], [360.975, 1107.716, 1.016]]\nC: [[398.669, 1187.71, 1.007], [399.141, 1252.975, 0.956], [420.325, 1170.467, 0.694], [429.664, 938.089, 1.209]]\nD: [[297.101, 1040.56, 0.77], [316.568, 1235.213, 0.681], [321.404, 1279.995, 0.669], [348.852, 1148.111, 0.837]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_180_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_180_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_180_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_180_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_180_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_180_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_180_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_180_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[438.16, 1086.715, 0.696], [438.167, 1086.722, 0.834], [438.221, 1086.776, 0.858], [438.222, 1086.778, 0.851]]\nB: [[507.3, 1112.979, 0.562], [428.69, 1220.639, 0.948], [389.443, 1140.615, 0.714], [399.989, 1095.621, 0.887]]\nC: [[354.14, 1079.897, 0.71], [359.261, 923.891, 0.834], [367.923, 883.883, 0.807], [495.344, 1285.646, 0.707]]\nD: [[403.04, 1151.47, 0.76], [405.751, 1286.822, 0.742], [364.368, 898.997, 0.983], [413.957, 1207.042, 0.986]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[438.16, 1086.715, 0.696], [438.167, 1086.722, 0.834], [438.221, 1086.776, 0.858], [438.222, 1086.778, 0.851]]\nB: [[507.3, 1112.979, 0.562], [428.69, 1220.639, 0.948], [389.443, 1140.615, 0.714], [399.989, 1095.621, 0.887]]\nC: [[354.14, 1079.897, 0.71], [359.261, 923.891, 0.834], [367.923, 883.883, 0.807], [495.344, 1285.646, 0.707]]\nD: [[403.04, 1151.47, 0.76], [405.751, 1286.822, 0.742], [364.368, 898.997, 0.983], [413.957, 1207.042, 0.986]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_181_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_181_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_181_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_181_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_181_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_181_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_181_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_181_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1355.191, 1159.351, -0.246], [1395.66, 1152.84, -0.228], [1240.863, 981.042, -0.233], [1262.588, 994.473, -0.268]]\nB: [[1217.511, 1111.827, -0.25], [1078.13, 1020.65, -0.204], [1571.618, 1018.832, -0.231], [1379.544, 1182.701, -0.209]]\nC: [[1345.406, 1027.941, -0.246], [1345.406, 1027.941, 
-0.246], [1345.406, 1027.941, -0.246], [1345.406, 1027.941, -0.246]]\nD: [[1421.753, 850.708, -0.231], [1362.386, 1149.899, -0.264], [1385.927, 906.596, -0.27], [1544.629, 853.12, -0.225]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1355.191, 1159.351, -0.246], [1395.66, 1152.84, -0.228], [1240.863, 981.042, -0.233], [1262.588, 994.473, -0.268]]\nB: [[1217.511, 1111.827, -0.25], [1078.13, 1020.65, -0.204], [1571.618, 1018.832, -0.231], [1379.544, 1182.701, -0.209]]\nC: [[1345.406, 1027.941, -0.246], [1345.406, 1027.941, -0.246], [1345.406, 1027.941, -0.246], [1345.406, 1027.941, -0.246]]\nD: [[1421.753, 850.708, -0.231], [1362.386, 1149.899, -0.264], [1385.927, 906.596, -0.27], [1544.629, 853.12, -0.225]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_182_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_182_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_182_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_182_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_182_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_182_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_182_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_182_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth 
image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1187.979, 1094.431, 0.392], [1347.424, 1034.148, 0.436], [1538.003, 902.017, 0.373], [1212.196, 1182.796, 0.356]]\nB: [[1108.77, 1168.85, 0.455], [1257.172, 1008.091, 0.364], [1442.783, 1234.413, 0.459], [1312.575, 918.52, 0.361]]\nC: [[1079.007, 1042.862, 0.339], [1523.203, 1091.192, 0.449], [1147.362, 1204.835, 0.348], [1308.995, 1207.074, 0.355]]\nD: [[1335.042, 1037.999, 0.407], [1335.042, 1037.999, 0.407], [1335.042, 1037.999, 0.407], [1335.042, 1037.999, 0.407]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1187.979, 1094.431, 0.392], [1347.424, 1034.148, 0.436], [1538.003, 902.017, 0.373], [1212.196, 1182.796, 0.356]]\nB: [[1108.77, 1168.85, 0.455], [1257.172, 1008.091, 0.364], [1442.783, 1234.413, 0.459], [1312.575, 918.52, 0.361]]\nC: [[1079.007, 1042.862, 0.339], [1523.203, 1091.192, 0.449], [1147.362, 1204.835, 0.348], [1308.995, 1207.074, 0.355]]\nD: [[1335.042, 1037.999, 0.407], [1335.042, 1037.999, 0.407], [1335.042, 1037.999, 0.407], [1335.042, 1037.999, 0.407]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_183_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_183_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_183_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_183_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_183_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_183_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_183_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_183_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[390.593, 1112.966, 0.519], [390.606, 1112.958, 0.557], [390.618, 1112.95, 0.594], [390.631, 1112.942, 0.632]]\nB: [[391.396, 942.794, 0.438], [401.44, 920.349, 0.468], [426.581, 916.13, 0.599], [338.619, 1261.56, 0.586]]\nC: [[374.135, 1327.311, 0.499], [345.631, 969.283, 0.639], [364.128, 913.86, 0.609], [339.45, 1013.046, 0.638]]\nD: [[447.459, 924.256, 0.485], [384.158, 1293.211, 0.601], [467.686, 901.08, 0.57], [450.581, 1198.823, 0.72]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[390.593, 1112.966, 0.519], [390.606, 1112.958, 0.557], [390.618, 1112.95, 0.594], [390.631, 1112.942, 0.632]]\nB: [[391.396, 942.794, 0.438], [401.44, 920.349, 0.468], [426.581, 916.13, 0.599], [338.619, 1261.56, 0.586]]\nC: [[374.135, 1327.311, 0.499], [345.631, 969.283, 0.639], [364.128, 913.86, 0.609], [339.45, 1013.046, 0.638]]\nD: [[447.459, 924.256, 0.485], [384.158, 1293.211, 0.601], [467.686, 901.08, 0.57], [450.581, 1198.823, 0.72]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_184_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_184_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_184_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_184_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_184_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_184_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_184_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_184_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1529.469, 775.588, 0.372], [1793.42, 970.233, 0.339], [2009.523, 974.973, 0.358], [1610.13, 949.492, 0.279]]\nB: [[1591.041, 893.016, 0.43], [1803.48, 918.619, 0.324], [1586.65, 1006.415, 0.274], [2124.951, 926.48, 0.338]]\nC: [[1905.651, 875.006, 0.371], [1905.64, 875.027, 0.347], [1905.629, 875.048, 
0.323], [1905.617, 875.069, 0.299]]\nD: [[1763.75, 935.01, 0.421], [2101.6, 837.057, 0.378], [1529.348, 1031.722, 0.334], [1741.729, 804.694, 0.321]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1529.469, 775.588, 0.372], [1793.42, 970.233, 0.339], [2009.523, 974.973, 0.358], [1610.13, 949.492, 0.279]]\nB: [[1591.041, 893.016, 0.43], [1803.48, 918.619, 0.324], [1586.65, 1006.415, 0.274], [2124.951, 926.48, 0.338]]\nC: [[1905.651, 875.006, 0.371], [1905.64, 875.027, 0.347], [1905.629, 875.048, 0.323], [1905.617, 875.069, 0.299]]\nD: [[1763.75, 935.01, 0.421], [2101.6, 837.057, 0.378], [1529.348, 1031.722, 0.334], [1741.729, 804.694, 0.321]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_185_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_185_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_185_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_185_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_185_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_185_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_185_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_185_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": 
"NuScenes_threed_Object_Tracking", "options": "A: [[1102.52, 1189.44, 1.15], [1247.531, 856.838, 0.847], [1591.837, 1070.296, 0.973], [1281.673, 1018.706, 0.914]]\nB: [[1378.756, 954.378, 1.005], [1282.349, 1065.012, 1.03], [1288.645, 985.253, 1.088], [1368.77, 1206.494, 1.161]]\nC: [[1330.066, 1046.334, 0.969], [1330.066, 1046.334, 0.969], [1330.066, 1046.334, 0.969], [1330.066, 1046.334, 0.969]]\nD: [[1490.554, 938.979, 1.093], [1292.466, 1009.582, 0.816], [1257.859, 968.601, 0.966], [1535.083, 927.732, 1.127]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1102.52, 1189.44, 1.15], [1247.531, 856.838, 0.847], [1591.837, 1070.296, 0.973], [1281.673, 1018.706, 0.914]]\nB: [[1378.756, 954.378, 1.005], [1282.349, 1065.012, 1.03], [1288.645, 985.253, 1.088], [1368.77, 1206.494, 1.161]]\nC: [[1330.066, 1046.334, 0.969], [1330.066, 1046.334, 0.969], [1330.066, 1046.334, 0.969], [1330.066, 1046.334, 0.969]]\nD: [[1490.554, 938.979, 1.093], [1292.466, 1009.582, 0.816], [1257.859, 968.601, 0.966], [1535.083, 927.732, 1.127]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_186_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_186_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_186_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_186_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_186_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_186_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_186_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_186_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[417.399, 1193.55, 0.994], [416.858, 1193.723, 1.044], [416.129, 1193.95, 1.119], [415.49, 1194.12, 1.144]]\nB: [[409.271, 1043.71, 0.802], [478.793, 1282.181, 1.098], [347.872, 1358.99, 1.243], [485.83, 1248.86, 0.927]]\nC: [[335.7, 1322.48, 1.133], [434.978, 1371.089, 0.975], [462.262, 1332.7, 0.913], [375.81, 1227.16, 1.032]]\nD: [[403.098, 1282.9, 0.834], [475.947, 968.443, 0.984], [492.927, 1029.78, 1.321], [332.54, 962.76, 1.165]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[417.399, 1193.55, 0.994], [416.858, 1193.723, 1.044], [416.129, 1193.95, 1.119], [415.49, 1194.12, 1.144]]\nB: [[409.271, 1043.71, 0.802], [478.793, 1282.181, 1.098], [347.872, 1358.99, 1.243], [485.83, 1248.86, 0.927]]\nC: [[335.7, 1322.48, 1.133], [434.978, 1371.089, 0.975], [462.262, 1332.7, 0.913], [375.81, 1227.16, 1.032]]\nD: [[403.098, 1282.9, 0.834], [475.947, 968.443, 0.984], [492.927, 1029.78, 1.321], [332.54, 962.76, 1.165]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_187_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_187_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_187_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_187_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_187_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_187_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_187_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_187_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1542.554, 1185.251, 1.339], [1408.096, 1145.054, 1.499], [1475.525, 890.578, 1.509], [1249.888, 965.619, 1.875]]\nB: [[1536.937, 1095.924, 1.505], [1199.25, 1245.415, 1.478], [1539.555, 1123.021, 1.621], [1138.513, 1047.666, 1.622]]\nC: [[1308.987, 1052.606, 1.402], [1310.095, 1053.915, 1.502], [1311.245, 
1055.225, 1.598], [1312.378, 1056.412, 1.693]]\nD: [[1455.025, 1135.652, 1.649], [1113.162, 854.503, 1.292], [1160.009, 1154.438, 1.809], [1412.034, 1067.9, 1.717]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1542.554, 1185.251, 1.339], [1408.096, 1145.054, 1.499], [1475.525, 890.578, 1.509], [1249.888, 965.619, 1.875]]\nB: [[1536.937, 1095.924, 1.505], [1199.25, 1245.415, 1.478], [1539.555, 1123.021, 1.621], [1138.513, 1047.666, 1.622]]\nC: [[1308.987, 1052.606, 1.402], [1310.095, 1053.915, 1.502], [1311.245, 1055.225, 1.598], [1312.378, 1056.412, 1.693]]\nD: [[1455.025, 1135.652, 1.649], [1113.162, 854.503, 1.292], [1160.009, 1154.438, 1.809], [1412.034, 1067.9, 1.717]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_188_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_188_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_188_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_188_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_188_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_188_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_188_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_188_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", 
"source": "NuScenes_threed_Object_Tracking", "options": "A: [[1136.4, 1251.375, 0.964], [1098.34, 926.608, 0.962], [1167.04, 1063.589, 0.748], [1241.17, 957.965, 0.728]]\nB: [[1133.35, 933.225, 0.869], [1467.73, 1116.69, 0.711], [1336.16, 1216.158, 0.757], [1461.61, 1223.596, 0.736]]\nC: [[1339.79, 1239.08, 0.933], [1509.78, 946.412, 0.694], [1165.99, 1097.514, 0.848], [1235.1, 968.837, 0.75]]\nD: [[1264.56, 1079.653, 0.835], [1264.56, 1079.653, 0.836], [1264.56, 1079.653, 0.837], [1264.56, 1079.653, 0.837]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1136.4, 1251.375, 0.964], [1098.34, 926.608, 0.962], [1167.04, 1063.589, 0.748], [1241.17, 957.965, 0.728]]\nB: [[1133.35, 933.225, 0.869], [1467.73, 1116.69, 0.711], [1336.16, 1216.158, 0.757], [1461.61, 1223.596, 0.736]]\nC: [[1339.79, 1239.08, 0.933], [1509.78, 946.412, 0.694], [1165.99, 1097.514, 0.848], [1235.1, 968.837, 0.75]]\nD: [[1264.56, 1079.653, 0.835], [1264.56, 1079.653, 0.836], [1264.56, 1079.653, 0.837], [1264.56, 1079.653, 0.837]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_189_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_189_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_189_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_189_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_189_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_189_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_189_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_189_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[352.109, 1115.839, 0.635], [352.109, 1115.839, 0.814], [352.109, 1115.839, 0.993], [352.109, 1115.839, 0.801]]\nB: [[403.261, 1293.127, 0.727], [350.132, 1278.347, 0.888], [358.28, 1088.995, 1.055], [317.737, 1172.102, 0.72]]\nC: [[293.311, 1012.839, 0.541], [366.92, 1035.95, 0.854], [356.294, 946.374, 0.819], [390.703, 900.089, 0.781]]\nD: [[318.426, 1207.534, 0.602], [341.429, 1071.396, 0.779], [376.574, 1288.812, 1.059], [313.83, 1243.677, 0.781]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[352.109, 1115.839, 0.635], [352.109, 1115.839, 0.814], [352.109, 1115.839, 0.993], [352.109, 1115.839, 0.801]]\nB: [[403.261, 1293.127, 0.727], [350.132, 1278.347, 0.888], [358.28, 1088.995, 1.055], [317.737, 1172.102, 0.72]]\nC: [[293.311, 1012.839, 0.541], [366.92, 1035.95, 0.854], [356.294, 946.374, 0.819], [390.703, 900.089, 0.781]]\nD: [[318.426, 1207.534, 0.602], [341.429, 1071.396, 0.779], [376.574, 1288.812, 1.059], [313.83, 1243.677, 0.781]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_190_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_190_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_190_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_190_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_190_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_190_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_190_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_190_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[405.833, 996.81, 0.95], [355.845, 1117.198, 1.04], [354.65, 947.797, 0.81], [358.045, 1006.86, 0.9]]\nB: [[464.397, 1283.6, 0.75], [328.65, 936.28, 0.941], [365.855, 941.15, 0.91], [332.144, 978.284, 0.89]]\nC: [[390.721, 1120.16, 0.88], [390.397, 1119.603, 0.905], [390.144, 1119.015, 0.93], 
[389.874, 1118.388, 1.08]]\nD: [[338.901, 1032.25, 0.92], [452.113, 1110.634, 1.086], [401.774, 1235.48, 1.09], [348.321, 1273.502, 1.06]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[405.833, 996.81, 0.95], [355.845, 1117.198, 1.04], [354.65, 947.797, 0.81], [358.045, 1006.86, 0.9]]\nB: [[464.397, 1283.6, 0.75], [328.65, 936.28, 0.941], [365.855, 941.15, 0.91], [332.144, 978.284, 0.89]]\nC: [[390.721, 1120.16, 0.88], [390.397, 1119.603, 0.905], [390.144, 1119.015, 0.93], [389.874, 1118.388, 1.08]]\nD: [[338.901, 1032.25, 0.92], [452.113, 1110.634, 1.086], [401.774, 1235.48, 1.09], [348.321, 1273.502, 1.06]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_191_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_191_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_191_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_191_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_191_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_191_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_191_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_191_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[310.056, 
702.514, -0.534], [308.348, 700.438, -0.379], [306.644, 698.366, -0.226], [299.451, 689.618, 0.29]]\nB: [[311.979, 810.175, -0.565], [260.058, 694.023, -0.342], [272.393, 835.115, -0.249], [357.323, 587.99, 0.25]]\nC: [[344.874, 770.786, -0.524], [341.8, 753.476, -0.452], [326.128, 633.003, -0.265], [319.66, 701.069, 0.24]]\nD: [[365.685, 814.947, -0.568], [301.272, 761.237, -0.383], [347.211, 755.567, -0.186], [320.282, 634.704, 0.25]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[310.056, 702.514, -0.534], [308.348, 700.438, -0.379], [306.644, 698.366, -0.226], [299.451, 689.618, 0.29]]\nB: [[311.979, 810.175, -0.565], [260.058, 694.023, -0.342], [272.393, 835.115, -0.249], [357.323, 587.99, 0.25]]\nC: [[344.874, 770.786, -0.524], [341.8, 753.476, -0.452], [326.128, 633.003, -0.265], [319.66, 701.069, 0.24]]\nD: [[365.685, 814.947, -0.568], [301.272, 761.237, -0.383], [347.211, 755.567, -0.186], [320.282, 634.704, 0.25]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_192_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_192_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_192_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_192_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_192_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_192_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_192_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_192_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[410.458, 1100.432, 0.572], [415.671, 1229.197, 0.401], [335.629, 978.571, 0.674], [395.098, 1300.552, 0.478]]\nB: [[401.269, 1173.484, 0.482], [401.265, 1173.449, 0.482], [401.261, 1173.415, 0.582], [401.213, 1173.399, 0.569]]\nC: [[344.262, 1103.389, 0.405], [334.337, 967.834, 0.523], [322.213, 1142.996, 0.661], [438.778, 1322.064, 0.463]]\nD: [[444.209, 1082.14, 0.539], [424.03, 1030.374, 0.393], [373.133, 1212.622, 0.685], [326.49, 1003.549, 0.642]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[410.458, 1100.432, 0.572], [415.671, 1229.197, 0.401], [335.629, 978.571, 0.674], [395.098, 1300.552, 0.478]]\nB: [[401.269, 1173.484, 0.482], [401.265, 1173.449, 0.482], [401.261, 1173.415, 0.582], [401.213, 1173.399, 0.569]]\nC: [[344.262, 1103.389, 0.405], [334.337, 967.834, 0.523], [322.213, 1142.996, 0.661], [438.778, 1322.064, 0.463]]\nD: [[444.209, 1082.14, 0.539], [424.03, 1030.374, 0.393], [373.133, 1212.622, 0.685], [326.49, 1003.549, 0.642]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_193_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_193_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_193_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_193_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_193_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_193_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_193_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_193_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1266.882, 1077.846, 0.755], [1266.882, 1077.846, 0.755], [1266.882, 1077.846, 0.755], [1266.882, 1077.846, 0.755]]\nB: [[1271.739, 1048.404, 0.906], [1320.835, 1026.02, 0.819], [1396.815, 1235.0, 0.773], [1277.754, 1106.806, 0.618]]\nC: [[1317.337, 1260.195, 0.841], [1225.296, 1191.372, 
0.797], [1170.961, 1158.338, 0.874], [1386.483, 1261.547, 0.715]]\nD: [[1218.794, 899.374, 0.833], [1062.39, 1230.912, 0.743], [1031.716, 1194.236, 0.798], [1438.585, 1159.315, 0.751]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1266.882, 1077.846, 0.755], [1266.882, 1077.846, 0.755], [1266.882, 1077.846, 0.755], [1266.882, 1077.846, 0.755]]\nB: [[1271.739, 1048.404, 0.906], [1320.835, 1026.02, 0.819], [1396.815, 1235.0, 0.773], [1277.754, 1106.806, 0.618]]\nC: [[1317.337, 1260.195, 0.841], [1225.296, 1191.372, 0.797], [1170.961, 1158.338, 0.874], [1386.483, 1261.547, 0.715]]\nD: [[1218.794, 899.374, 0.833], [1062.39, 1230.912, 0.743], [1031.716, 1194.236, 0.798], [1438.585, 1159.315, 0.751]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_194_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_194_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_194_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_194_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_194_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_194_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_194_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_194_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and 
natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1416.446, 917.476, 0.369], [1133.347, 1173.31, 0.385], [1116.37, 899.476, 0.388], [1512.12, 1114.073, 0.426]]\nB: [[1278.607, 1028.824, 0.314], [1281.473, 1031.847, 0.364], [1285.19, 1035.681, 0.414], [1288.26, 1038.767, 0.464]]\nC: [[1110.832, 1041.63, 0.259], [1144.062, 983.891, 0.355], [1120.76, 991.772, 0.443], [1528.04, 941.905, 0.384]]\nD: [[1057.696, 870.984, 0.36], [1077.37, 1211.58, 0.316], [1049.97, 1110.459, 0.384], [1427.8, 977.845, 0.531]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[1416.446, 917.476, 0.369], [1133.347, 1173.31, 0.385], [1116.37, 899.476, 0.388], [1512.12, 1114.073, 0.426]]\nB: [[1278.607, 1028.824, 0.314], [1281.473, 1031.847, 0.364], [1285.19, 1035.681, 0.414], [1288.26, 1038.767, 0.464]]\nC: [[1110.832, 1041.63, 0.259], [1144.062, 983.891, 0.355], [1120.76, 991.772, 0.443], [1528.04, 941.905, 0.384]]\nD: [[1057.696, 870.984, 0.36], [1077.37, 1211.58, 0.316], [1049.97, 1110.459, 0.384], [1427.8, 977.845, 0.531]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_195_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_195_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_195_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_195_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_195_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_195_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_195_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_195_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[617.725, 1759.21, -0.3], [551.267, 1929.565, -0.167], [709.584, 1522.655, -0.14], [704.467, 1583.292, -0.112]]\nB: [[650.737, 1625.23, -0.3], [651.288, 1624.989, -0.175], [651.844, 1624.758, -0.15], [652.374, 1624.474, -0.125]]\nC: [[757.657, 1791.25, -0.3], [596.205, 1710.113, -0.196], [594.666, 1578.119, -0.17], [645.182, 1918.374, -0.147]]\nD: [[672.039, 1335.14, -0.3], [676.19, 1821.439, -0.205], [711.246, 1830.07, -0.17], [653.306, 1617.612, -0.115]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. 
In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[617.725, 1759.21, -0.3], [551.267, 1929.565, -0.167], [709.584, 1522.655, -0.14], [704.467, 1583.292, -0.112]]\nB: [[650.737, 1625.23, -0.3], [651.288, 1624.989, -0.175], [651.844, 1624.758, -0.15], [652.374, 1624.474, -0.125]]\nC: [[757.657, 1791.25, -0.3], [596.205, 1710.113, -0.196], [594.666, 1578.119, -0.17], [645.182, 1918.374, -0.147]]\nD: [[672.039, 1335.14, -0.3], [676.19, 1821.439, -0.205], [711.246, 1830.07, -0.17], [653.306, 1617.612, -0.115]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_196_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_196_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_196_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_196_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_196_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_196_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_196_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_196_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[437.74, 1143.27, 0.674], [451.18, 1139.1, 0.835], [366.614, 1128.63, 0.978], [348.424, 1174.882, 0.985]]\nB: [[462.69, 887.14, 0.851], [342.59, 1042.19, 0.759], [433.546, 1124.124, 0.983], [359.663, 1221.9, 0.894]]\nC: [[398.14, 1103.34, 0.828], [398.08, 1103.37, 0.834], [398.005, 
1103.406, 0.891], [398.065, 1103.387, 0.875]]\nD: [[378.04, 1027.98, 0.775], [428.89, 1003.86, 0.777], [374.706, 954.311, 1.028], [402.006, 1268.493, 0.732]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[437.74, 1143.27, 0.674], [451.18, 1139.1, 0.835], [366.614, 1128.63, 0.978], [348.424, 1174.882, 0.985]]\nB: [[462.69, 887.14, 0.851], [342.59, 1042.19, 0.759], [433.546, 1124.124, 0.983], [359.663, 1221.9, 0.894]]\nC: [[398.14, 1103.34, 0.828], [398.08, 1103.37, 0.834], [398.005, 1103.406, 0.891], [398.065, 1103.387, 0.875]]\nD: [[378.04, 1027.98, 0.775], [428.89, 1003.86, 0.777], [374.706, 954.311, 1.028], [402.006, 1268.493, 0.732]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_197_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_197_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_197_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_197_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_197_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_197_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_197_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_197_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", 
"options": "A: [[312.42, 996.53, 0.734], [392.368, 1071.586, 1.006], [362.03, 997.691, 1.059], [395.055, 894.894, 0.96]]\nB: [[292.47, 1178.83, 0.536], [385.591, 1223.459, 0.952], [347.65, 1157.122, 1.11], [325.064, 1035.946, 1.008]]\nC: [[424.62, 956.05, 0.586], [414.477, 1277.953, 1.04], [360.719, 916.865, 1.121], [431.128, 1236.879, 1.07]]\nD: [[364.33, 1100.33, 0.667], [364.332, 1100.333, 0.879], [364.336, 1100.342, 1.067], [364.336, 1100.342, 0.917]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. 
\nSelect from the following choices.\nA: [[312.42, 996.53, 0.734], [392.368, 1071.586, 1.006], [362.03, 997.691, 1.059], [395.055, 894.894, 0.96]]\nB: [[292.47, 1178.83, 0.536], [385.591, 1223.459, 0.952], [347.65, 1157.122, 1.11], [325.064, 1035.946, 1.008]]\nC: [[424.62, 956.05, 0.586], [414.477, 1277.953, 1.04], [360.719, 916.865, 1.121], [431.128, 1236.879, 1.07]]\nD: [[364.33, 1100.33, 0.667], [364.332, 1100.333, 0.879], [364.336, 1100.342, 1.067], [364.336, 1100.342, 0.917]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_198_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_198_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_198_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_198_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_198_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_198_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_198_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_198_7.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Object_Tracking", "visual_input_component": "LiDAR depth image and natural image", "source": "NuScenes_threed_Object_Tracking", "options": "A: [[1802.663, 947.701, 0.425], [1936.882, 764.775, 0.32], [1669.337, 970.661, 0.35], [2083.327, 843.762, 0.422]]\nB: [[1895.725, 877.102, 0.355], [1895.725, 877.102, 0.34], [1895.725, 877.102, 0.39], [1895.773, 877.087, 0.415]]\nC: [[2260.495, 1033.212, 0.352], [1682.669, 730.505, 0.36], [1808.702, 968.32, 0.35], [2274.387, 847.273, 0.38]]\nD: [[2228.008, 726.803, 0.308], [1759.634, 872.261, 0.3], [2044.819, 846.131, 0.41], [2237.11, 733.52, 0.386]]", "question": "Given a sequence of RGB and LiDAR depth images capturing object motion over time, please track the movement of the object outlined in the RGB images. In the LiDAR depth images, LiDAR points were projected back to the corresponding RGB images. 
The output should be in the format of a sequence of 3D positions, i.e., [x, y, z], which represents the gravity center of the 3D bounding boxes in meters of the obejct, with respect to the global coordinate system.", "context": "Your task is to track the movement of objects in 3D space across multiple frames. \nSelect from the following choices.\nA: [[1802.663, 947.701, 0.425], [1936.882, 764.775, 0.32], [1669.337, 970.661, 0.35], [2083.327, 843.762, 0.422]]\nB: [[1895.725, 877.102, 0.355], [1895.725, 877.102, 0.34], [1895.725, 877.102, 0.39], [1895.773, 877.087, 0.415]]\nC: [[2260.495, 1033.212, 0.352], [1682.669, 730.505, 0.36], [1808.702, 968.32, 0.35], [2274.387, 847.273, 0.38]]\nD: [[2228.008, 726.803, 0.308], [1759.634, 872.261, 0.3], [2044.819, 846.131, 0.41], [2237.11, 733.52, 0.386]]", "input_image_path": ["./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_199_0.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_199_1.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_199_2.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_199_3.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_199_4.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_199_5.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_199_6.png", "./3D-spatial/threeD_Object_Tracking/threeD_Object_Tracking_199_7.png"], "output": "B", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/threeD_Pose_Estimation/qwen3-vl/metadata_info.json b/results/threeD_Pose_Estimation/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..d595b0a --- /dev/null +++ b/results/threeD_Pose_Estimation/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999860986788529, -0.0029206102888043426, 0.004512150995140729], [0.0029125077264400548, 
0.9999942525374551, 0.0017791916930172049], [-0.004515929331115238, -0.0017660484162836249, 0.9999884136529374]], 'translation vector': [0.00035121537956284143, -0.00211204147587285, 0.0015269971399166637]}\nB: {'rotation matrix': [[0.992252, 0.033516, -0.119639], [0.120006, -0.507929, 0.852999], [-0.032179, -0.860747, -0.508015]], 'translation vector': [2.483829, 1.386735, 1.351847]}\nC: {'rotation matrix': [[0.992393, 0.03365, -0.118424], [0.118928, -0.510671, 0.851511], [-0.031822, -0.859118, -0.510788]], 'translation vector': [2.483625, 1.389348, 1.348027]}\nD: {'rotation matrix': [[0.992358, 0.033913, -0.118638], [0.11923, -0.511103, 0.85121], [-0.031769, -0.85885, -0.511241]], 'translation vector': [2.484339, 1.38954, 1.351903]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999860986788529, -0.0029206102888043426, 0.004512150995140729], [0.0029125077264400548, 0.9999942525374551, 0.0017791916930172049], [-0.004515929331115238, -0.0017660484162836249, 0.9999884136529374]], 'translation vector': [0.00035121537956284143, -0.00211204147587285, 0.0015269971399166637]}\nB: {'rotation matrix': [[0.992252, 0.033516, -0.119639], [0.120006, -0.507929, 0.852999], [-0.032179, -0.860747, -0.508015]], 'translation vector': [2.483829, 1.386735, 1.351847]}\nC: {'rotation matrix': [[0.992393, 0.03365, -0.118424], [0.118928, -0.510671, 0.851511], [-0.031822, -0.859118, -0.510788]], 'translation vector': [2.483625, 1.389348, 1.348027]}\nD: {'rotation matrix': [[0.992358, 0.033913, -0.118638], [0.11923, -0.511103, 0.85121], [-0.031769, -0.85885, -0.511241]], 'translation vector': [2.484339, 1.38954, 1.351903]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_0_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_0_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_0_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_0_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.947387, 0.126025, -0.294239], [0.319939, 0.401221, -0.858289], [0.009889, -0.90727, -0.420431]], 'translation vector': [2.649368, 2.97856, 1.365403]}\nB: {'rotation matrix': [[0.9999009689053274, -0.0005834477726320058, -0.014081260252404066], [0.0005185811040898338, 0.9999892566550749, -0.004650674323837413], [0.014082812752834694, 0.004642499383147887, 0.9998897887510193]], 'translation vector': [-0.0001697198246333187, -0.006057464737030116, -0.0030857621071840313]}\nC: {'rotation matrix': [[-0.946914, 0.131611, -0.293313], [0.321456, 0.400409, -0.858102], [0.004509, -0.906836, 
-0.42146]], 'translation vector': [2.644349, 2.98006, 1.361572]}\nD: {'rotation matrix': [[-0.946851, 0.128282, -0.294988], [0.321573, 0.400396, -0.858064], [0.008037, -0.907318, -0.420367]], 'translation vector': [2.647634, 2.978188, 1.36466]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.947387, 0.126025, -0.294239], [0.319939, 0.401221, -0.858289], [0.009889, -0.90727, -0.420431]], 'translation vector': [2.649368, 2.97856, 1.365403]}\nB: {'rotation matrix': [[0.9999009689053274, -0.0005834477726320058, -0.014081260252404066], [0.0005185811040898338, 0.9999892566550749, -0.004650674323837413], [0.014082812752834694, 0.004642499383147887, 0.9998897887510193]], 'translation vector': [-0.0001697198246333187, -0.006057464737030116, -0.0030857621071840313]}\nC: {'rotation matrix': [[-0.946914, 0.131611, -0.293313], [0.321456, 0.400409, -0.858102], [0.004509, -0.906836, -0.42146]], 'translation vector': [2.644349, 2.98006, 1.361572]}\nD: {'rotation matrix': [[-0.946851, 0.128282, -0.294988], [0.321573, 0.400396, -0.858064], [0.008037, -0.907318, -0.420367]], 'translation vector': [2.647634, 2.978188, 1.36466]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_1_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_1_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_1_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_1_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation 
matrix': [[0.932535, 0.164547, -0.321407], [0.359784, -0.498771, 0.788533], [-0.030558, -0.850971, -0.524323]], 'translation vector': [4.48804, -0.229774, 1.538571]}\nB: {'rotation matrix': [[0.999974189934778, 0.00023654440835110308, 0.007205305592304459], [-0.00021900350048315587, 0.9999974076812913, -0.002394096534537237], [-0.00720576870446999, 0.0023924811533001236, 0.9999711271279352]], 'translation vector': [-0.011672688463027825, -0.012243066982587869, 0.0020668703552249035]}\nC: {'rotation matrix': [[0.930699, 0.167887, -0.324983], [0.364431, -0.502007, 0.784334], [-0.031464, -0.848412, -0.5284]], 'translation vector': [4.497419, -0.228559, 1.538943]}\nD: {'rotation matrix': [[0.928253, 0.171766, -0.329913], [0.370592, -0.502789, 0.780939], [-0.031738, -0.847172, -0.53037]], 'translation vector': [4.506209, -0.230888, 1.537021]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.932535, 0.164547, -0.321407], [0.359784, -0.498771, 0.788533], [-0.030558, -0.850971, -0.524323]], 'translation vector': [4.48804, -0.229774, 1.538571]}\nB: {'rotation matrix': [[0.999974189934778, 0.00023654440835110308, 0.007205305592304459], [-0.00021900350048315587, 0.9999974076812913, -0.002394096534537237], [-0.00720576870446999, 0.0023924811533001236, 0.9999711271279352]], 'translation vector': [-0.011672688463027825, -0.012243066982587869, 0.0020668703552249035]}\nC: {'rotation matrix': [[0.930699, 0.167887, -0.324983], [0.364431, -0.502007, 0.784334], [-0.031464, -0.848412, -0.5284]], 'translation vector': [4.497419, -0.228559, 1.538943]}\nD: {'rotation matrix': [[0.928253, 0.171766, -0.329913], [0.370592, -0.502789, 0.780939], [-0.031738, -0.847172, -0.53037]], 'translation vector': [4.506209, -0.230888, 1.537021]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_2_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_2_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_2_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_2_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999922186975153, -0.0004929414354074684, -0.003990843308659959], [0.0004944153894448021, 1.0000000788246688, 0.0002887878542439764], [0.0039912978026401735, -0.00029052347125931846, 0.9999922538344134]], 'translation vector': [0.0015230291799757656, -0.0023232322897525567, 0.004482115182110835]}\nB: {'rotation matrix': [[-0.597501, 0.375338, -0.7086], [0.801649, 0.25893, -0.538808], [-0.018758, -0.889987, -0.4556]], 'translation vector': [2.357092, 1.421442, 1.358509]}\nC: {'rotation matrix': [[-0.595396, 0.37569, -0.710183], [0.803242, 0.259116, -0.536341], [-0.017478, -0.889784, 
-0.456047]], 'translation vector': [2.35612, 1.420569, 1.361782]}\nD: {'rotation matrix': [[-0.600812, 0.375021, -0.705963], [0.799114, 0.258529, -0.542752], [-0.021031, -0.890237, -0.455012]], 'translation vector': [2.356618, 1.42274, 1.357666]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999922186975153, -0.0004929414354074684, -0.003990843308659959], [0.0004944153894448021, 1.0000000788246688, 0.0002887878542439764], [0.0039912978026401735, -0.00029052347125931846, 0.9999922538344134]], 'translation vector': [0.0015230291799757656, -0.0023232322897525567, 0.004482115182110835]}\nB: {'rotation matrix': [[-0.597501, 0.375338, -0.7086], [0.801649, 0.25893, -0.538808], [-0.018758, -0.889987, -0.4556]], 'translation vector': [2.357092, 1.421442, 1.358509]}\nC: {'rotation matrix': [[-0.595396, 0.37569, -0.710183], [0.803242, 0.259116, -0.536341], [-0.017478, -0.889784, -0.456047]], 'translation vector': [2.35612, 1.420569, 1.361782]}\nD: {'rotation matrix': [[-0.600812, 0.375021, -0.705963], [0.799114, 0.258529, -0.542752], [-0.021031, -0.890237, -0.455012]], 'translation vector': [2.356618, 1.42274, 1.357666]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_3_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_3_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_3_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_3_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.690346, 0.288159, -0.663616], [0.723477, -0.272947, 0.634098], [0.001589, -0.917858, -0.396905]], 'translation vector': [2.536332, 2.010734, 1.438743]}\nB: {'rotation matrix': [[0.691208, 0.288183, -0.662708], [0.722652, -0.27257, 0.635201], [0.00242, -0.917963, -0.396658]], 'translation vector': [2.535653, 2.009964, 1.439474]}\nC: {'rotation matrix': [[0.9999995587474457, 0.00024022036499647738, 0.0007957711644081506], [-0.00023933007530568237, 0.9999998633783928, -0.0007108069241564525], [-0.0007964539455043593, 0.0007109455644655081, 1.000000560983507]], 'translation vector': [-0.004770728985455719, 0.002959587174171885, 0.0013885111462622612]}\nD: {'rotation matrix': [[0.690426, 0.287793, -0.663692], [0.723401, -0.272862, 0.634222], [0.001429, -0.917999, -0.396581]], 'translation vector': [2.53477, 2.009069, 1.43814]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.690346, 0.288159, -0.663616], [0.723477, -0.272947, 0.634098], [0.001589, -0.917858, -0.396905]], 'translation vector': [2.536332, 2.010734, 1.438743]}\nB: {'rotation matrix': [[0.691208, 0.288183, -0.662708], [0.722652, -0.27257, 0.635201], [0.00242, -0.917963, -0.396658]], 'translation vector': [2.535653, 2.009964, 1.439474]}\nC: {'rotation matrix': [[0.9999995587474457, 0.00024022036499647738, 0.0007957711644081506], [-0.00023933007530568237, 0.9999998633783928, -0.0007108069241564525], [-0.0007964539455043593, 0.0007109455644655081, 1.000000560983507]], 'translation vector': [-0.004770728985455719, 0.002959587174171885, 0.0013885111462622612]}\nD: {'rotation matrix': [[0.690426, 0.287793, -0.663692], [0.723401, -0.272862, 0.634222], [0.001429, -0.917999, -0.396581]], 'translation vector': [2.53477, 2.009069, 1.43814]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_4_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_4_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_4_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_4_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999658530200795, -0.008162385048605073, -0.000989578606081076], [0.008166400906506959, 0.9999603758313507, 0.0035456278045154508], [0.000960099553273771, -0.0035536229934212678, 0.9999930725472598]], 'translation vector': [0.0005115289741049189, -0.00032414464705918244, 0.0017902118924140176]}\nB: {'rotation matrix': [[-0.221487, 0.417059, -0.881479], [0.974313, 0.13239, -0.182174], [0.040721, -0.899186, -0.435668]], 'translation vector': [3.156802, 0.483491, 1.355875]}\nC: {'rotation matrix': [[-0.223193, 0.415497, -0.881786], [0.973999, 0.131126, -0.184746], [0.038864, -0.900094, 
-0.43396]], 'translation vector': [3.157208, 0.483314, 1.355186]}\nD: {'rotation matrix': [[-0.22378, 0.416079, -0.881363], [0.973939, 0.129755, -0.18603], [0.036958, -0.900023, -0.434273]], 'translation vector': [3.157156, 0.483591, 1.355072]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999658530200795, -0.008162385048605073, -0.000989578606081076], [0.008166400906506959, 0.9999603758313507, 0.0035456278045154508], [0.000960099553273771, -0.0035536229934212678, 0.9999930725472598]], 'translation vector': [0.0005115289741049189, -0.00032414464705918244, 0.0017902118924140176]}\nB: {'rotation matrix': [[-0.221487, 0.417059, -0.881479], [0.974313, 0.13239, -0.182174], [0.040721, -0.899186, -0.435668]], 'translation vector': [3.156802, 0.483491, 1.355875]}\nC: {'rotation matrix': [[-0.223193, 0.415497, -0.881786], [0.973999, 0.131126, -0.184746], [0.038864, -0.900094, -0.43396]], 'translation vector': [3.157208, 0.483314, 1.355186]}\nD: {'rotation matrix': [[-0.22378, 0.416079, -0.881363], [0.973939, 0.129755, -0.18603], [0.036958, -0.900023, -0.434273]], 'translation vector': [3.157156, 0.483591, 1.355072]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_5_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_5_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_5_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_5_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation 
matrix': [[-0.294979, -0.395497, 0.869811], [-0.955406, 0.135138, -0.26256], [-0.013703, -0.908472, -0.417722]], 'translation vector': [4.231627, 1.757554, 1.314948]}\nB: {'rotation matrix': [[0.9999859365705687, 3.1558862291102445e-05, 0.005361241460385626], [2.1367515200134227e-05, 0.9999509201368397, -0.009913096398898577], [-0.0053616455101250845, 0.009912181817668633, 0.9999368747200875]], 'translation vector': [-0.00185453480108011, 0.004425119632380792, 0.004740673653586214]}\nC: {'rotation matrix': [[-0.295231, -0.385219, 0.874325], [-0.955253, 0.136423, -0.262452], [-0.018176, -0.912686, -0.408258]], 'translation vector': [4.225714, 1.76129, 1.315325]}\nD: {'rotation matrix': [[-0.297898, -0.402478, 0.865603], [-0.954572, 0.132313, -0.266996], [-0.007071, -0.905817, -0.42361]], 'translation vector': [4.239912, 1.761582, 1.310375]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.294979, -0.395497, 0.869811], [-0.955406, 0.135138, -0.26256], [-0.013703, -0.908472, -0.417722]], 'translation vector': [4.231627, 1.757554, 1.314948]}\nB: {'rotation matrix': [[0.9999859365705687, 3.1558862291102445e-05, 0.005361241460385626], [2.1367515200134227e-05, 0.9999509201368397, -0.009913096398898577], [-0.0053616455101250845, 0.009912181817668633, 0.9999368747200875]], 'translation vector': [-0.00185453480108011, 0.004425119632380792, 0.004740673653586214]}\nC: {'rotation matrix': [[-0.295231, -0.385219, 0.874325], [-0.955253, 0.136423, -0.262452], [-0.018176, -0.912686, -0.408258]], 'translation vector': [4.225714, 1.76129, 1.315325]}\nD: {'rotation matrix': [[-0.297898, -0.402478, 0.865603], [-0.954572, 0.132313, -0.266996], [-0.007071, -0.905817, -0.42361]], 'translation vector': [4.239912, 1.761582, 1.310375]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_6_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_6_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_6_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_6_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.43634, -0.426945, 0.792039], [-0.899692, 0.219464, -0.377346], [-0.012718, -0.877242, -0.47988]], 'translation vector': [1.991026, 3.721216, 1.553809]}\nB: {'rotation matrix': [[0.9999994163271791, 0.0007055386419518741, -5.183687097418444e-05], [-0.0007048159396394524, 1.0000001700794185, 0.00017964949547958028], [5.299300853873026e-05, -0.00017925701598013347, 1.000000048079148]], 'translation vector': [-0.0002435455336966541, -0.00047987538941862695, 0.0009530826592798469]}\nC: {'rotation matrix': [[-0.436198, -0.427205, 0.791977], [-0.899763, 0.219364, -0.377235], [-0.012574, 
-0.877141, -0.480069]], 'translation vector': [1.990491, 3.720783, 1.55354]}\nD: {'rotation matrix': [[-0.436159, -0.427335, 0.791928], [-0.899792, 0.218686, -0.377559], [-0.011839, -0.877246, -0.479894]], 'translation vector': [1.98993, 3.720837, 1.552023]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.43634, -0.426945, 0.792039], [-0.899692, 0.219464, -0.377346], [-0.012718, -0.877242, -0.47988]], 'translation vector': [1.991026, 3.721216, 1.553809]}\nB: {'rotation matrix': [[0.9999994163271791, 0.0007055386419518741, -5.183687097418444e-05], [-0.0007048159396394524, 1.0000001700794185, 0.00017964949547958028], [5.299300853873026e-05, -0.00017925701598013347, 1.000000048079148]], 'translation vector': [-0.0002435455336966541, -0.00047987538941862695, 0.0009530826592798469]}\nC: {'rotation matrix': [[-0.436198, -0.427205, 0.791977], [-0.899763, 0.219364, -0.377235], [-0.012574, -0.877141, -0.480069]], 'translation vector': [1.990491, 3.720783, 1.55354]}\nD: {'rotation matrix': [[-0.436159, -0.427335, 0.791928], [-0.899792, 0.218686, -0.377559], [-0.011839, -0.877246, -0.479894]], 'translation vector': [1.98993, 3.720837, 1.552023]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_7_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_7_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_7_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_7_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[0.9999987261495074, 0.0004975354064711766, -0.0014612465343166485], [-0.0004956714882147159, 0.9999976114101617, 0.002168145916521647], [0.0014629580715452147, -0.002166812264256858, 0.9999963843096157]], 'translation vector': [-0.0006911004437073487, 0.0010681685362672333, 0.00045340774962232544]}\nB: {'rotation matrix': [[0.254029, -0.222698, 0.941209], [-0.965413, 0.000689, 0.260725], [-0.058712, -0.974887, -0.21482]], 'translation vector': [0.927676, 4.785758, 1.499229]}\nC: {'rotation matrix': [[0.261058, -0.219751, 0.939978], [-0.963311, 0.003543, 0.268366], [-0.062304, -0.97555, -0.210763]], 'translation vector': [0.925951, 4.784105, 1.497862]}\nD: {'rotation matrix': [[0.253006, -0.222602, 0.941507], [-0.965684, 0.00092, 0.259721], [-0.058681, -0.974909, -0.21473]], 'translation vector': [0.928139, 4.78494, 1.499076]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999987261495074, 0.0004975354064711766, -0.0014612465343166485], [-0.0004956714882147159, 0.9999976114101617, 0.002168145916521647], [0.0014629580715452147, -0.002166812264256858, 0.9999963843096157]], 'translation vector': [-0.0006911004437073487, 0.0010681685362672333, 0.00045340774962232544]}\nB: {'rotation matrix': [[0.254029, -0.222698, 0.941209], [-0.965413, 0.000689, 0.260725], [-0.058712, -0.974887, -0.21482]], 'translation vector': [0.927676, 4.785758, 1.499229]}\nC: {'rotation matrix': [[0.261058, -0.219751, 0.939978], [-0.963311, 0.003543, 0.268366], [-0.062304, -0.97555, -0.210763]], 'translation vector': [0.925951, 4.784105, 1.497862]}\nD: {'rotation matrix': [[0.253006, -0.222602, 0.941507], [-0.965684, 0.00092, 0.259721], [-0.058681, -0.974909, -0.21473]], 'translation vector': [0.928139, 4.78494, 1.499076]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_8_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_8_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_8_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_8_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.298773, 0.351612, -0.887189], [0.953749, -0.077747, 0.290375], [0.033123, -0.932912, -0.358578]], 'translation vector': [3.912279, 4.982921, 1.420651]}\nB: {'rotation matrix': [[0.29932, 0.353357, -0.88631], [0.953697, -0.082092, 0.289349], [0.029485, -0.93188, -0.361567]], 'translation vector': [3.9112, 4.98563, 1.419169]}\nC: {'rotation matrix': [[0.999988451223679, 0.004467367701975065, -0.0013038134525021694], [-0.00445692043483639, 0.9999572940857223, 0.008125240965736606], [0.0013398420675200973, -0.008118916964061989, 0.9999668442244075]], 'translation vector': 
[0.0032416245875248606, 0.010404768814489485, 0.0002686970709979697]}\nD: {'rotation matrix': [[0.298213, 0.352721, -0.886937], [0.953989, -0.07977, 0.289034], [0.031197, -0.932323, -0.36028]], 'translation vector': [3.912466, 4.985029, 1.419803]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.298773, 0.351612, -0.887189], [0.953749, -0.077747, 0.290375], [0.033123, -0.932912, -0.358578]], 'translation vector': [3.912279, 4.982921, 1.420651]}\nB: {'rotation matrix': [[0.29932, 0.353357, -0.88631], [0.953697, -0.082092, 0.289349], [0.029485, -0.93188, -0.361567]], 'translation vector': [3.9112, 4.98563, 1.419169]}\nC: {'rotation matrix': [[0.999988451223679, 0.004467367701975065, -0.0013038134525021694], [-0.00445692043483639, 0.9999572940857223, 0.008125240965736606], [0.0013398420675200973, -0.008118916964061989, 0.9999668442244075]], 'translation vector': [0.0032416245875248606, 0.010404768814489485, 0.0002686970709979697]}\nD: {'rotation matrix': [[0.298213, 0.352721, -0.886937], [0.953989, -0.07977, 0.289034], [0.031197, -0.932323, -0.36028]], 'translation vector': [3.912466, 4.985029, 1.419803]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_9_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_9_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_9_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_9_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': 
[[0.9999651962705811, 0.003066448737415877, 0.007690547235694275], [-0.003089438182273661, 0.9999915450229314, 0.002959839315488226], [-0.007681501308992471, -0.0029840343790842917, 0.999966014383214]], 'translation vector': [-0.009845168086086709, -0.005623772397939042, 0.0006148134083248102]}\nB: {'rotation matrix': [[-0.998744, -0.022866, -0.044595], [0.034706, 0.326335, -0.944617], [0.036152, -0.944977, -0.325132]], 'translation vector': [2.332638, 2.988529, 1.390534]}\nC: {'rotation matrix': [[-0.998733, -0.022769, -0.044885], [0.035006, 0.326505, -0.944547], [0.036161, -0.944921, -0.325294]], 'translation vector': [2.335994, 2.987912, 1.391848]}\nD: {'rotation matrix': [[-0.998702, -0.02238, -0.045764], [0.035975, 0.326219, -0.94461], [0.03607, -0.945029, -0.32499]], 'translation vector': [2.340556, 2.987934, 1.391904]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999651962705811, 0.003066448737415877, 0.007690547235694275], [-0.003089438182273661, 0.9999915450229314, 0.002959839315488226], [-0.007681501308992471, -0.0029840343790842917, 0.999966014383214]], 'translation vector': [-0.009845168086086709, -0.005623772397939042, 0.0006148134083248102]}\nB: {'rotation matrix': [[-0.998744, -0.022866, -0.044595], [0.034706, 0.326335, -0.944617], [0.036152, -0.944977, -0.325132]], 'translation vector': [2.332638, 2.988529, 1.390534]}\nC: {'rotation matrix': [[-0.998733, -0.022769, -0.044885], [0.035006, 0.326505, -0.944547], [0.036161, -0.944921, -0.325294]], 'translation vector': [2.335994, 2.987912, 1.391848]}\nD: {'rotation matrix': [[-0.998702, -0.02238, -0.045764], [0.035975, 0.326219, -0.94461], [0.03607, -0.945029, -0.32499]], 'translation vector': [2.340556, 2.987934, 1.391904]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_10_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_10_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_10_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_10_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.088289, -0.769037, 0.633078], [-0.992448, -0.013575, 0.121917], [-0.085165, -0.63906, -0.764427]], 'translation vector': [1.06143, 1.251586, 2.183495]}\nB: {'rotation matrix': [[0.095281, -0.770575, 0.630187], [-0.991458, -0.016816, 0.129342], [-0.08907, -0.637128, -0.765594]], 'translation vector': [1.056131, 1.246655, 2.184574]}\nC: {'rotation matrix': [[0.101903, -0.771131, 0.628469], [-0.990357, -0.019031, 0.13723], [-0.093862, -0.636392, -0.765634]], 'translation vector': [1.04909, 1.241123, 2.18482]}\nD: {'rotation matrix': [[0.9999704077161121, 0.0009465902067394159, 
-0.007647096819155797], [-0.0009526969531251086, 0.9999995333260298, -0.0008216172460913287], [0.00764682863028495, 0.0008292870802229541, 0.9999701657998578]], 'translation vector': [0.005049491112872229, 0.003519427946364395, -0.004842133831311157]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.088289, -0.769037, 0.633078], [-0.992448, -0.013575, 0.121917], [-0.085165, -0.63906, -0.764427]], 'translation vector': [1.06143, 1.251586, 2.183495]}\nB: {'rotation matrix': [[0.095281, -0.770575, 0.630187], [-0.991458, -0.016816, 0.129342], [-0.08907, -0.637128, -0.765594]], 'translation vector': [1.056131, 1.246655, 2.184574]}\nC: {'rotation matrix': [[0.101903, -0.771131, 0.628469], [-0.990357, -0.019031, 0.13723], [-0.093862, -0.636392, -0.765634]], 'translation vector': [1.04909, 1.241123, 2.18482]}\nD: {'rotation matrix': [[0.9999704077161121, 0.0009465902067394159, -0.007647096819155797], [-0.0009526969531251086, 0.9999995333260298, -0.0008216172460913287], [0.00764682863028495, 0.0008292870802229541, 0.9999701657998578]], 'translation vector': [0.005049491112872229, 0.003519427946364395, -0.004842133831311157]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_11_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_11_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_11_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_11_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.236859, -0.585227, 0.775504], [-0.967029, -0.065142, 0.246196], [-0.093563, -0.808248, -0.581361]], 'translation vector': [0.85633, 3.124968, 1.418476]}\nB: {'rotation matrix': [[0.9999986227118945, -0.0014386707321173084, -0.0013720086731618905], [0.0014396405048904127, 0.9999985343757846, 0.0002480174905685849], [0.0013722163701430706, -0.00024858511187529484, 0.9999990039875143]], 'translation vector': [0.0014743108378150183, 9.881450519233503e-05, 0.00010772367212419365]}\nC: {'rotation matrix': [[0.234228, -0.586349, 0.775456], [-0.967526, -0.06262, 0.244894], [-0.095034, -0.807635, -0.581975]], 'translation vector': [0.858687, 3.12069, 1.418757]}\nD: {'rotation matrix': [[0.234642, -0.58546, 0.776002], [-0.967537, -0.063552, 0.24461], [-0.093893, -0.808206, -0.581366]], 'translation vector': [0.856906, 3.122666, 1.417663]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.236859, -0.585227, 0.775504], [-0.967029, -0.065142, 0.246196], [-0.093563, -0.808248, -0.581361]], 'translation vector': [0.85633, 3.124968, 1.418476]}\nB: {'rotation matrix': [[0.9999986227118945, -0.0014386707321173084, -0.0013720086731618905], [0.0014396405048904127, 0.9999985343757846, 0.0002480174905685849], [0.0013722163701430706, -0.00024858511187529484, 0.9999990039875143]], 'translation vector': [0.0014743108378150183, 9.881450519233503e-05, 0.00010772367212419365]}\nC: {'rotation matrix': [[0.234228, -0.586349, 0.775456], [-0.967526, -0.06262, 0.244894], [-0.095034, -0.807635, -0.581975]], 'translation vector': [0.858687, 3.12069, 1.418757]}\nD: {'rotation matrix': [[0.234642, -0.58546, 0.776002], [-0.967537, -0.063552, 0.24461], [-0.093893, -0.808206, -0.581366]], 'translation vector': [0.856906, 3.122666, 1.417663]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_12_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_12_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_12_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_12_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.582104, 0.470868, -0.662901], [0.81311, -0.339656, 0.472743], [-0.002559, -0.814197, -0.580583]], 'translation vector': [4.229822, 1.596572, 1.425168]}\nB: {'rotation matrix': [[0.9999560146697009, 0.003230475520535795, -0.00886892770497088], [-0.0032175833964298243, 0.9999942676924473, 0.0014734023416888033], [0.008873986014087454, -0.001444906579376565, 0.9999590747869875]], 'translation vector': [0.0012149381321875374, 0.0024560981455157282, -0.00010287317760537817]}\nC: {'rotation matrix': [[0.582444, 0.471641, -0.662053], [0.812867, -0.340629, 0.472461], [-0.002682, 
-0.813343, -0.581779]], 'translation vector': [4.230144, 1.598887, 1.426125]}\nD: {'rotation matrix': [[0.583525, 0.471082, -0.661499], [0.812092, -0.340805, 0.473665], [-0.002307, -0.813593, -0.58143]], 'translation vector': [4.230429, 1.59898, 1.426046]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.582104, 0.470868, -0.662901], [0.81311, -0.339656, 0.472743], [-0.002559, -0.814197, -0.580583]], 'translation vector': [4.229822, 1.596572, 1.425168]}\nB: {'rotation matrix': [[0.9999560146697009, 0.003230475520535795, -0.00886892770497088], [-0.0032175833964298243, 0.9999942676924473, 0.0014734023416888033], [0.008873986014087454, -0.001444906579376565, 0.9999590747869875]], 'translation vector': [0.0012149381321875374, 0.0024560981455157282, -0.00010287317760537817]}\nC: {'rotation matrix': [[0.582444, 0.471641, -0.662053], [0.812867, -0.340629, 0.472461], [-0.002682, -0.813343, -0.581779]], 'translation vector': [4.230144, 1.598887, 1.426125]}\nD: {'rotation matrix': [[0.583525, 0.471082, -0.661499], [0.812092, -0.340805, 0.473665], [-0.002307, -0.813593, -0.58143]], 'translation vector': [4.230429, 1.59898, 1.426046]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_13_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_13_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_13_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_13_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": 
"A: {'rotation matrix': [[0.990893, 0.057008, -0.121987], [0.134304, -0.353431, 0.92577], [0.009662, -0.933722, -0.357869]], 'translation vector': [2.186028, 2.144782, 1.462596]}\nB: {'rotation matrix': [[0.991569, 0.053062, -0.11822], [0.129357, -0.351493, 0.927211], [0.007646, -0.934686, -0.355393]], 'translation vector': [2.183204, 2.143093, 1.462234]}\nC: {'rotation matrix': [[0.9999874902969159, 0.004379197439594465, -0.002403723493325247], [-0.004372650351021444, 0.9999870005118716, 0.0026700250405810233], [0.0024158589744238553, -0.0026609890925621414, 0.9999939599581709]], 'translation vector': [-0.003672033476809222, -0.0017027412429904132, -0.0003357999980959647]}\nD: {'rotation matrix': [[0.991257, 0.055775, -0.119575], [0.131602, -0.352888, 0.926364], [0.009471, -0.934002, -0.357143]], 'translation vector': [2.184101, 2.143995, 1.46179]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.990893, 0.057008, -0.121987], [0.134304, -0.353431, 0.92577], [0.009662, -0.933722, -0.357869]], 'translation vector': [2.186028, 2.144782, 1.462596]}\nB: {'rotation matrix': [[0.991569, 0.053062, -0.11822], [0.129357, -0.351493, 0.927211], [0.007646, -0.934686, -0.355393]], 'translation vector': [2.183204, 2.143093, 1.462234]}\nC: {'rotation matrix': [[0.9999874902969159, 0.004379197439594465, -0.002403723493325247], [-0.004372650351021444, 0.9999870005118716, 0.0026700250405810233], [0.0024158589744238553, -0.0026609890925621414, 0.9999939599581709]], 'translation vector': [-0.003672033476809222, -0.0017027412429904132, -0.0003357999980959647]}\nD: {'rotation matrix': [[0.991257, 0.055775, -0.119575], [0.131602, -0.352888, 0.926364], [0.009471, -0.934002, -0.357143]], 'translation vector': [2.184101, 2.143995, 1.46179]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_14_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_14_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_14_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_14_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999986691328494, -0.001073261840895818, -0.0008802624854347336], [0.001080335550453663, 0.9999723432097648, 0.00736012496129083], [0.0008730903166935182, -0.007359768837914202, 0.9999729147693108]], 'translation vector': [0.0010486658094048806, -0.004009939681768326, 0.0017973568614269853]}\nB: {'rotation matrix': [[-0.386299, -0.298688, 0.872673], [-0.920393, 0.186791, -0.343491], [-0.060411, -0.935893, -0.347067]], 'translation vector': [2.08048, 4.009937, 1.840847]}\nC: {'rotation matrix': [[-0.383122, -0.307436, 0.871034], [-0.921947, 0.185316, -0.340108], [-0.056855, -0.933349, 
-0.354438]], 'translation vector': [2.080896, 4.009106, 1.847586]}\nD: {'rotation matrix': [[-0.384424, -0.301178, 0.872645], [-0.921297, 0.185141, -0.341959], [-0.058572, -0.935422, -0.348647]], 'translation vector': [2.077995, 4.010322, 1.837904]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999986691328494, -0.001073261840895818, -0.0008802624854347336], [0.001080335550453663, 0.9999723432097648, 0.00736012496129083], [0.0008730903166935182, -0.007359768837914202, 0.9999729147693108]], 'translation vector': [0.0010486658094048806, -0.004009939681768326, 0.0017973568614269853]}\nB: {'rotation matrix': [[-0.386299, -0.298688, 0.872673], [-0.920393, 0.186791, -0.343491], [-0.060411, -0.935893, -0.347067]], 'translation vector': [2.08048, 4.009937, 1.840847]}\nC: {'rotation matrix': [[-0.383122, -0.307436, 0.871034], [-0.921947, 0.185316, -0.340108], [-0.056855, -0.933349, -0.354438]], 'translation vector': [2.080896, 4.009106, 1.847586]}\nD: {'rotation matrix': [[-0.384424, -0.301178, 0.872645], [-0.921297, 0.185141, -0.341959], [-0.058572, -0.935422, -0.348647]], 'translation vector': [2.077995, 4.010322, 1.837904]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_15_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_15_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_15_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_15_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": 
"A: {'rotation matrix': [[0.9999787823940977, 0.006347499783107699, -0.0013436878206843045], [-0.006341718624184496, 0.9999688828294876, 0.004625015892328648], [0.0013735202687803576, -0.004616292980016383, 0.9999878757602079]], 'translation vector': [-0.006298849286503927, 0.01405890593176995, 0.0007444533799123576]}\nB: {'rotation matrix': [[0.999733, -0.006694, 0.022129], [-0.023039, -0.368118, 0.929494], [0.001924, -0.929755, -0.368173]], 'translation vector': [3.317142, 3.173762, 1.523565]}\nC: {'rotation matrix': [[0.999731, -0.010083, 0.02088], [-0.023127, -0.369367, 0.928996], [-0.001654, -0.929229, -0.3695]], 'translation vector': [3.314788, 3.169853, 1.521514]}\nD: {'rotation matrix': [[0.999712, -0.007131, 0.022924], [-0.023946, -0.364324, 0.930964], [0.001713, -0.931245, -0.36439]], 'translation vector': [3.320507, 3.174599, 1.524876]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999787823940977, 0.006347499783107699, -0.0013436878206843045], [-0.006341718624184496, 0.9999688828294876, 0.004625015892328648], [0.0013735202687803576, -0.004616292980016383, 0.9999878757602079]], 'translation vector': [-0.006298849286503927, 0.01405890593176995, 0.0007444533799123576]}\nB: {'rotation matrix': [[0.999733, -0.006694, 0.022129], [-0.023039, -0.368118, 0.929494], [0.001924, -0.929755, -0.368173]], 'translation vector': [3.317142, 3.173762, 1.523565]}\nC: {'rotation matrix': [[0.999731, -0.010083, 0.02088], [-0.023127, -0.369367, 0.928996], [-0.001654, -0.929229, -0.3695]], 'translation vector': [3.314788, 3.169853, 1.521514]}\nD: {'rotation matrix': [[0.999712, -0.007131, 0.022924], [-0.023946, -0.364324, 0.930964], [0.001713, -0.931245, -0.36439]], 'translation vector': [3.320507, 3.174599, 1.524876]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_16_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_16_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_16_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_16_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.998283, -0.004041, -0.058434], [0.054839, 0.286044, -0.956646], [0.020581, -0.958208, -0.285332]], 'translation vector': [1.688122, 4.435732, 1.572228]}\nB: {'rotation matrix': [[0.9999952600112642, 0.0029895442027471574, 0.0009673920050112699], [-0.0029953662693526914, 0.9999719574782809, 0.006845684012961962], [-0.000945954294544353, -0.006847856485920328, 0.9999767653082445]], 'translation vector': [-0.00043220364465224037, 0.0023057137872921907, 0.0026271806076847426]}\nC: {'rotation matrix': [[-0.998336, -0.002848, -0.057597], [0.054423, 0.283794, -0.95734], [0.019072, -0.958881, 
-0.283167]], 'translation vector': [1.687961, 4.436946, 1.571062]}\nD: {'rotation matrix': [[-0.998358, -0.001309, -0.057275], [0.054546, 0.284027, -0.957264], [0.017521, -0.958815, -0.283489]], 'translation vector': [1.688286, 4.43679, 1.571851]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.998283, -0.004041, -0.058434], [0.054839, 0.286044, -0.956646], [0.020581, -0.958208, -0.285332]], 'translation vector': [1.688122, 4.435732, 1.572228]}\nB: {'rotation matrix': [[0.9999952600112642, 0.0029895442027471574, 0.0009673920050112699], [-0.0029953662693526914, 0.9999719574782809, 0.006845684012961962], [-0.000945954294544353, -0.006847856485920328, 0.9999767653082445]], 'translation vector': [-0.00043220364465224037, 0.0023057137872921907, 0.0026271806076847426]}\nC: {'rotation matrix': [[-0.998336, -0.002848, -0.057597], [0.054423, 0.283794, -0.95734], [0.019072, -0.958881, -0.283167]], 'translation vector': [1.687961, 4.436946, 1.571062]}\nD: {'rotation matrix': [[-0.998358, -0.001309, -0.057275], [0.054546, 0.284027, -0.957264], [0.017521, -0.958815, -0.283489]], 'translation vector': [1.688286, 4.43679, 1.571851]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_17_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_17_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_17_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_17_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.083515, 0.422666, -0.902429], [0.995888, 0.067297, -0.060645], [0.035099, -0.903783, -0.426549]], 'translation vector': [4.26049, 5.866284, 1.66918]}\nB: {'rotation matrix': [[-0.080848, 0.422553, -0.902725], [0.996028, 0.068154, -0.057302], [0.037311, -0.903772, -0.426385]], 'translation vector': [4.26043, 5.866841, 1.668667]}\nC: {'rotation matrix': [[-0.081468, 0.422714, -0.902594], [0.995995, 0.068006, -0.058049], [0.036844, -0.903708, -0.426561]], 'translation vector': [4.260486, 5.864969, 1.669529]}\nD: {'rotation matrix': [[0.9999996666026792, 0.0007024902365725143, 0.00045309895052521656], [-0.0007041385629428506, 0.9999945142012453, 0.003060691886503368], [-0.00045146230549075186, -0.0030616184704849174, 0.9999957671575359]], 'translation vector': [-0.008238592634347341, 0.0026712907326988944, 0.0010534554726602252]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.083515, 0.422666, -0.902429], [0.995888, 0.067297, -0.060645], [0.035099, -0.903783, -0.426549]], 'translation vector': [4.26049, 5.866284, 1.66918]}\nB: {'rotation matrix': [[-0.080848, 0.422553, -0.902725], [0.996028, 0.068154, -0.057302], [0.037311, -0.903772, -0.426385]], 'translation vector': [4.26043, 5.866841, 1.668667]}\nC: {'rotation matrix': [[-0.081468, 0.422714, -0.902594], [0.995995, 0.068006, -0.058049], [0.036844, -0.903708, -0.426561]], 'translation vector': [4.260486, 5.864969, 1.669529]}\nD: {'rotation matrix': [[0.9999996666026792, 0.0007024902365725143, 0.00045309895052521656], [-0.0007041385629428506, 0.9999945142012453, 0.003060691886503368], [-0.00045146230549075186, -0.0030616184704849174, 0.9999957671575359]], 'translation vector': [-0.008238592634347341, 0.0026712907326988944, 0.0010534554726602252]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_18_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_18_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_18_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_18_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.192624, -0.379717, 0.904826], [-0.981148, -0.059825, 0.183766], [-0.015648, -0.923166, -0.384082]], 'translation vector': [4.984646, 4.164808, 1.32267]}\nB: {'rotation matrix': [[0.9993897560101446, -0.009672092115768357, 0.033569058676964116], [0.00944513640094131, 0.9999317047122397, 0.006930507743012298], [-0.033634032991460915, -0.006610261888877057, 0.9994127816547865]], 'translation vector': [-0.03424616147212767, -0.0027538632482175807, 0.008124405533084023]}\nC: {'rotation matrix': [[0.180272, -0.384554, 0.905329], [-0.983511, -0.056947, 0.171651], [-0.014453, -0.921344, 
-0.388479]], 'translation vector': [4.987018, 4.177592, 1.323464]}\nD: {'rotation matrix': [[0.205405, -0.377617, 0.902892], [-0.978531, -0.0633, 0.196139], [-0.016912, -0.923796, -0.382512]], 'translation vector': [4.985321, 4.152791, 1.324267]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.192624, -0.379717, 0.904826], [-0.981148, -0.059825, 0.183766], [-0.015648, -0.923166, -0.384082]], 'translation vector': [4.984646, 4.164808, 1.32267]}\nB: {'rotation matrix': [[0.9993897560101446, -0.009672092115768357, 0.033569058676964116], [0.00944513640094131, 0.9999317047122397, 0.006930507743012298], [-0.033634032991460915, -0.006610261888877057, 0.9994127816547865]], 'translation vector': [-0.03424616147212767, -0.0027538632482175807, 0.008124405533084023]}\nC: {'rotation matrix': [[0.180272, -0.384554, 0.905329], [-0.983511, -0.056947, 0.171651], [-0.014453, -0.921344, -0.388479]], 'translation vector': [4.987018, 4.177592, 1.323464]}\nD: {'rotation matrix': [[0.205405, -0.377617, 0.902892], [-0.978531, -0.0633, 0.196139], [-0.016912, -0.923796, -0.382512]], 'translation vector': [4.985321, 4.152791, 1.324267]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_19_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_19_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_19_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_19_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.084118, -0.329466, 0.940413], [-0.993483, 0.100574, -0.05363], [-0.076912, -0.938795, -0.335779]], 'translation vector': [4.338453, 2.933071, 1.462896]}\nB: {'rotation matrix': [[0.9999984493090167, 0.0015295352178733802, -0.0012886089849350254], [-0.0015301527829556035, 0.9999993364831051, -0.0007407082576259022], [0.0012866367449453698, 0.0007417730730382566, 0.99999926268211]], 'translation vector': [-0.001971799758651027, -0.003988184523042726, -0.0019001345524003455]}\nC: {'rotation matrix': [[-0.084181, -0.324678, 0.942071], [-0.993543, 0.09952, -0.054482], [-0.076066, -0.940574, -0.330959]], 'translation vector': [4.337488, 2.935505, 1.461639]}\nD: {'rotation matrix': [[-0.083371, -0.331462, 0.939778], [-0.993645, 0.099215, -0.053156], [-0.075621, -0.938238, -0.337627]], 'translation vector': [4.338066, 2.933557, 1.453891]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.084118, -0.329466, 0.940413], [-0.993483, 0.100574, -0.05363], [-0.076912, -0.938795, -0.335779]], 'translation vector': [4.338453, 2.933071, 1.462896]}\nB: {'rotation matrix': [[0.9999984493090167, 0.0015295352178733802, -0.0012886089849350254], [-0.0015301527829556035, 0.9999993364831051, -0.0007407082576259022], [0.0012866367449453698, 0.0007417730730382566, 0.99999926268211]], 'translation vector': [-0.001971799758651027, -0.003988184523042726, -0.0019001345524003455]}\nC: {'rotation matrix': [[-0.084181, -0.324678, 0.942071], [-0.993543, 0.09952, -0.054482], [-0.076066, -0.940574, -0.330959]], 'translation vector': [4.337488, 2.935505, 1.461639]}\nD: {'rotation matrix': [[-0.083371, -0.331462, 0.939778], [-0.993645, 0.099215, -0.053156], [-0.075621, -0.938238, -0.337627]], 'translation vector': [4.338066, 2.933557, 1.453891]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_20_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_20_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_20_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_20_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.486704, 0.327719, -0.809765], [0.869935, 0.097394, -0.483454], [-0.079571, -0.939742, -0.332496]], 'translation vector': [4.437128, 2.283443, 1.465507]}\nB: {'rotation matrix': [[0.9999935589617881, -0.0023123151478069903, 0.002638162918517237], [0.0023036775758451737, 0.9999925417720922, 0.0032437850296420778], [-0.0026449296307575294, -0.0032387699437697024, 0.9999912244700261]], 'translation vector': [0.0032199017210038927, 0.0001093759992842891, 0.0024338106134420556]}\nC: {'rotation matrix': [[-0.494127, 0.32769, -0.805269], [0.866163, 0.105829, -0.488427], [-0.074832, 
-0.938839, -0.336126]], 'translation vector': [4.441189, 2.279036, 1.469096]}\nD: {'rotation matrix': [[-0.489836, 0.32797, -0.807773], [0.868301, 0.100425, -0.485767], [-0.078196, -0.939335, -0.333968]], 'translation vector': [4.439312, 2.280933, 1.467607]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.486704, 0.327719, -0.809765], [0.869935, 0.097394, -0.483454], [-0.079571, -0.939742, -0.332496]], 'translation vector': [4.437128, 2.283443, 1.465507]}\nB: {'rotation matrix': [[0.9999935589617881, -0.0023123151478069903, 0.002638162918517237], [0.0023036775758451737, 0.9999925417720922, 0.0032437850296420778], [-0.0026449296307575294, -0.0032387699437697024, 0.9999912244700261]], 'translation vector': [0.0032199017210038927, 0.0001093759992842891, 0.0024338106134420556]}\nC: {'rotation matrix': [[-0.494127, 0.32769, -0.805269], [0.866163, 0.105829, -0.488427], [-0.074832, -0.938839, -0.336126]], 'translation vector': [4.441189, 2.279036, 1.469096]}\nD: {'rotation matrix': [[-0.489836, 0.32797, -0.807773], [0.868301, 0.100425, -0.485767], [-0.078196, -0.939335, -0.333968]], 'translation vector': [4.439312, 2.280933, 1.467607]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_21_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_21_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_21_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_21_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[0.591474, -0.360427, 0.721284], [-0.806244, -0.251769, 0.535335], [-0.011352, -0.898167, -0.439507]], 'translation vector': [2.523668, 2.4613, 1.342936]}\nB: {'rotation matrix': [[0.9999823901895873, 0.005315913984669213, -0.0024241384709724934], [-0.005325670093989841, 0.9999780230897936, -0.003916850861463004], [0.002402806763812986, 0.0039304838949891525, 0.9999899783862463]], 'translation vector': [-0.0021162233882717763, -0.0011065369325464758, -0.0015805869003076012]}\nC: {'rotation matrix': [[0.588358, -0.362651, 0.722717], [-0.808515, -0.250803, 0.532355], [-0.0118, -0.897542, -0.440771]], 'translation vector': [2.523157, 2.461525, 1.343416]}\nD: {'rotation matrix': [[0.586933, -0.361149, 0.724625], [-0.809565, -0.249931, 0.531168], [-0.010725, -0.898391, -0.439067]], 'translation vector': [2.521696, 2.461699, 1.342706]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.591474, -0.360427, 0.721284], [-0.806244, -0.251769, 0.535335], [-0.011352, -0.898167, -0.439507]], 'translation vector': [2.523668, 2.4613, 1.342936]}\nB: {'rotation matrix': [[0.9999823901895873, 0.005315913984669213, -0.0024241384709724934], [-0.005325670093989841, 0.9999780230897936, -0.003916850861463004], [0.002402806763812986, 0.0039304838949891525, 0.9999899783862463]], 'translation vector': [-0.0021162233882717763, -0.0011065369325464758, -0.0015805869003076012]}\nC: {'rotation matrix': [[0.588358, -0.362651, 0.722717], [-0.808515, -0.250803, 0.532355], [-0.0118, -0.897542, -0.440771]], 'translation vector': [2.523157, 2.461525, 1.343416]}\nD: {'rotation matrix': [[0.586933, -0.361149, 0.724625], [-0.809565, -0.249931, 0.531168], [-0.010725, -0.898391, -0.439067]], 'translation vector': [2.521696, 2.461699, 1.342706]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_22_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_22_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_22_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_22_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.519941, -0.4438, 0.729866], [-0.853216, -0.228773, 0.468706], [-0.041038, -0.866432, -0.497605]], 'translation vector': [1.000289, 1.985685, 1.347635]}\nB: {'rotation matrix': [[0.520738, -0.4401, 0.731535], [-0.852723, -0.226811, 0.470553], [-0.041171, -0.868832, -0.493393]], 'translation vector': [0.998782, 1.983781, 1.347411]}\nC: {'rotation matrix': [[0.9999982219593319, -0.002079266464163152, 4.9620397493296405e-05], [0.0020793800784517404, 0.99998989374647, 0.004115140408478336], [-5.7016409592928054e-05, -0.004114590657101431, 0.9999914687195216]], 'translation vector': 
[-0.0013942399199289301, -0.0008989415392872679, 0.0029889416631920795]}\nD: {'rotation matrix': [[0.521192, -0.438092, 0.732417], [-0.852373, -0.224319, 0.472378], [-0.04265, -0.870492, -0.490332]], 'translation vector': [0.999181, 1.981126, 1.348386]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.519941, -0.4438, 0.729866], [-0.853216, -0.228773, 0.468706], [-0.041038, -0.866432, -0.497605]], 'translation vector': [1.000289, 1.985685, 1.347635]}\nB: {'rotation matrix': [[0.520738, -0.4401, 0.731535], [-0.852723, -0.226811, 0.470553], [-0.041171, -0.868832, -0.493393]], 'translation vector': [0.998782, 1.983781, 1.347411]}\nC: {'rotation matrix': [[0.9999982219593319, -0.002079266464163152, 4.9620397493296405e-05], [0.0020793800784517404, 0.99998989374647, 0.004115140408478336], [-5.7016409592928054e-05, -0.004114590657101431, 0.9999914687195216]], 'translation vector': [-0.0013942399199289301, -0.0008989415392872679, 0.0029889416631920795]}\nD: {'rotation matrix': [[0.521192, -0.438092, 0.732417], [-0.852373, -0.224319, 0.472378], [-0.04265, -0.870492, -0.490332]], 'translation vector': [0.999181, 1.981126, 1.348386]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_23_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_23_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_23_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_23_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": 
"A: {'rotation matrix': [[0.961526, 0.04991, -0.270143], [0.263115, -0.450039, 0.853367], [-0.078983, -0.891613, -0.445856]], 'translation vector': [2.643601, 1.008587, 1.47483]}\nB: {'rotation matrix': [[0.9999611816262259, 0.006037262376624097, -0.006357733639992428], [-0.0060929360747264, 0.9999425771984013, -0.008789441445278661], [0.006305603526763411, 0.00882761527567942, 0.9999410260999323]], 'translation vector': [0.006025344459442472, -0.004704561758730241, -0.003336645842906716]}\nC: {'rotation matrix': [[0.958799, 0.0516, -0.27936], [0.272826, -0.441359, 0.85485], [-0.079187, -0.895846, -0.437252]], 'translation vector': [2.65219, 1.005876, 1.472401]}\nD: {'rotation matrix': [[0.963523, 0.050371, -0.262843], [0.256662, -0.452159, 0.854211], [-0.075819, -0.890514, -0.448594]], 'translation vector': [2.637859, 1.00927, 1.478429]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.961526, 0.04991, -0.270143], [0.263115, -0.450039, 0.853367], [-0.078983, -0.891613, -0.445856]], 'translation vector': [2.643601, 1.008587, 1.47483]}\nB: {'rotation matrix': [[0.9999611816262259, 0.006037262376624097, -0.006357733639992428], [-0.0060929360747264, 0.9999425771984013, -0.008789441445278661], [0.006305603526763411, 0.00882761527567942, 0.9999410260999323]], 'translation vector': [0.006025344459442472, -0.004704561758730241, -0.003336645842906716]}\nC: {'rotation matrix': [[0.958799, 0.0516, -0.27936], [0.272826, -0.441359, 0.85485], [-0.079187, -0.895846, -0.437252]], 'translation vector': [2.65219, 1.005876, 1.472401]}\nD: {'rotation matrix': [[0.963523, 0.050371, -0.262843], [0.256662, -0.452159, 0.854211], [-0.075819, -0.890514, -0.448594]], 'translation vector': [2.637859, 1.00927, 1.478429]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_24_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_24_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_24_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_24_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999982465629441, -0.0013465983865406563, -0.0009053084488285325], [0.0013501875456354963, 0.999982864157121, 0.005743205099670796], [0.0008980715136351007, -0.0057437521444019864, 0.9999827649954101]], 'translation vector': [0.006929822597295576, -0.003954296383870348, -0.0008962942027399556]}\nB: {'rotation matrix': [[0.678055, 0.431256, -0.595198], [0.734977, -0.40565, 0.543375], [-0.007108, -0.805894, -0.592017]], 'translation vector': [3.965842, 0.866337, 1.41271]}\nC: {'rotation matrix': [[0.680551, 0.428937, -0.594024], [0.732652, -0.407746, 0.544944], [-0.008465, -0.806075, -0.591754]], 
'translation vector': [3.965306, 0.868392, 1.416605]}\nD: {'rotation matrix': [[0.681867, 0.427349, -0.593659], [0.731402, -0.409894, 0.545013], [-0.010426, -0.805829, -0.592057]], 'translation vector': [3.966104, 0.870012, 1.418402]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999982465629441, -0.0013465983865406563, -0.0009053084488285325], [0.0013501875456354963, 0.999982864157121, 0.005743205099670796], [0.0008980715136351007, -0.0057437521444019864, 0.9999827649954101]], 'translation vector': [0.006929822597295576, -0.003954296383870348, -0.0008962942027399556]}\nB: {'rotation matrix': [[0.678055, 0.431256, -0.595198], [0.734977, -0.40565, 0.543375], [-0.007108, -0.805894, -0.592017]], 'translation vector': [3.965842, 0.866337, 1.41271]}\nC: {'rotation matrix': [[0.680551, 0.428937, -0.594024], [0.732652, -0.407746, 0.544944], [-0.008465, -0.806075, -0.591754]], 'translation vector': [3.965306, 0.868392, 1.416605]}\nD: {'rotation matrix': [[0.681867, 0.427349, -0.593659], [0.731402, -0.409894, 0.545013], [-0.010426, -0.805829, -0.592057]], 'translation vector': [3.966104, 0.870012, 1.418402]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_25_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_25_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_25_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_25_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation 
matrix': [[0.999985554212974, -0.0031855347936376403, 0.004407056654662203], [0.003212953146819932, 0.9999747868858251, -0.006356723963664241], [-0.004386364338204691, 0.006371176808258376, 0.999970327730571]], 'translation vector': [-0.010337331339885125, -0.003078319020026643, -0.0057934529197571916]}\nB: {'rotation matrix': [[-0.816952, -0.193331, 0.543335], [-0.575587, 0.331994, -0.747315], [-0.035905, -0.923257, -0.382502]], 'translation vector': [4.389139, 4.029859, 1.398995]}\nC: {'rotation matrix': [[-0.817965, -0.190324, 0.542871], [-0.574258, 0.326013, -0.750961], [-0.034057, -0.926009, -0.375963]], 'translation vector': [4.389857, 4.037429, 1.401592]}\nD: {'rotation matrix': [[-0.817754, -0.196252, 0.541077], [-0.574392, 0.338327, -0.745392], [-0.036776, -0.920337, -0.389394]], 'translation vector': [4.391615, 4.02441, 1.397694]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.999985554212974, -0.0031855347936376403, 0.004407056654662203], [0.003212953146819932, 0.9999747868858251, -0.006356723963664241], [-0.004386364338204691, 0.006371176808258376, 0.999970327730571]], 'translation vector': [-0.010337331339885125, -0.003078319020026643, -0.0057934529197571916]}\nB: {'rotation matrix': [[-0.816952, -0.193331, 0.543335], [-0.575587, 0.331994, -0.747315], [-0.035905, -0.923257, -0.382502]], 'translation vector': [4.389139, 4.029859, 1.398995]}\nC: {'rotation matrix': [[-0.817965, -0.190324, 0.542871], [-0.574258, 0.326013, -0.750961], [-0.034057, -0.926009, -0.375963]], 'translation vector': [4.389857, 4.037429, 1.401592]}\nD: {'rotation matrix': [[-0.817754, -0.196252, 0.541077], [-0.574392, 0.338327, -0.745392], [-0.036776, -0.920337, -0.389394]], 'translation vector': [4.391615, 4.02441, 1.397694]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_26_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_26_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_26_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_26_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.989636, -0.107174, 0.095574], [-0.139302, -0.554918, 0.820159], [-0.034864, -0.824973, -0.564096]], 'translation vector': [6.683643, 2.494903, 1.406773]}\nB: {'rotation matrix': [[0.989623, -0.107386, 0.095468], [-0.139407, -0.55662, 0.818988], [-0.034808, -0.823798, -0.565814]], 'translation vector': [6.681599, 2.49535, 1.408922]}\nC: {'rotation matrix': [[0.989755, -0.10674, 0.094822], [-0.138471, -0.555821, 0.819688], [-0.034789, -0.824421, -0.564907]], 'translation vector': [6.681521, 2.493315, 1.407658]}\nD: {'rotation matrix': [[0.9999901303029469, 0.004176228929177566, 
0.0011903596205295832], [-0.00418098989555596, 0.9999858303738082, 0.003280298331639201], [-0.0011769495287118517, -0.003284936107684076, 0.9999936907012513]], 'translation vector': [-0.000365649748907515, 0.007725567810238587, -0.0007277349140568656]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.989636, -0.107174, 0.095574], [-0.139302, -0.554918, 0.820159], [-0.034864, -0.824973, -0.564096]], 'translation vector': [6.683643, 2.494903, 1.406773]}\nB: {'rotation matrix': [[0.989623, -0.107386, 0.095468], [-0.139407, -0.55662, 0.818988], [-0.034808, -0.823798, -0.565814]], 'translation vector': [6.681599, 2.49535, 1.408922]}\nC: {'rotation matrix': [[0.989755, -0.10674, 0.094822], [-0.138471, -0.555821, 0.819688], [-0.034789, -0.824421, -0.564907]], 'translation vector': [6.681521, 2.493315, 1.407658]}\nD: {'rotation matrix': [[0.9999901303029469, 0.004176228929177566, 0.0011903596205295832], [-0.00418098989555596, 0.9999858303738082, 0.003280298331639201], [-0.0011769495287118517, -0.003284936107684076, 0.9999936907012513]], 'translation vector': [-0.000365649748907515, 0.007725567810238587, -0.0007277349140568656]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_27_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_27_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_27_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_27_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.9999990866555979, 0.0004093298689293289, 0.0008391460833273652], [-0.00041250820264666677, 0.9999930945433543, 0.0037605519712407454], [-0.0008378351814566854, -0.0037610473264969298, 0.9999924864591837]], 'translation vector': [-0.002544400937407598, -0.0017921618995201394, 0.0018599883369136982]}\nB: {'rotation matrix': [[0.155491, 0.600889, -0.784063], [0.987779, -0.103232, 0.116776], [-0.010771, -0.792638, -0.609597]], 'translation vector': [3.280226, 1.958162, 1.281368]}\nC: {'rotation matrix': [[0.159827, 0.598569, -0.784966], [0.987096, -0.104834, 0.121042], [-0.009839, -0.794182, -0.6076]], 'translation vector': [3.27763, 1.954194, 1.282551]}\nD: {'rotation matrix': [[0.164916, 0.595071, -0.786571], [0.986276, -0.105924, 0.126651], [-0.007951, -0.796662, -0.604372]], 'translation vector': [3.274219, 1.949482, 1.285722]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999990866555979, 0.0004093298689293289, 0.0008391460833273652], [-0.00041250820264666677, 0.9999930945433543, 0.0037605519712407454], [-0.0008378351814566854, -0.0037610473264969298, 0.9999924864591837]], 'translation vector': [-0.002544400937407598, -0.0017921618995201394, 0.0018599883369136982]}\nB: {'rotation matrix': [[0.155491, 0.600889, -0.784063], [0.987779, -0.103232, 0.116776], [-0.010771, -0.792638, -0.609597]], 'translation vector': [3.280226, 1.958162, 1.281368]}\nC: {'rotation matrix': [[0.159827, 0.598569, -0.784966], [0.987096, -0.104834, 0.121042], [-0.009839, -0.794182, -0.6076]], 'translation vector': [3.27763, 1.954194, 1.282551]}\nD: {'rotation matrix': [[0.164916, 0.595071, -0.786571], [0.986276, -0.105924, 0.126651], [-0.007951, -0.796662, -0.604372]], 'translation vector': [3.274219, 1.949482, 1.285722]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_28_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_28_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_28_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_28_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999748122492941, 0.0038384551465654687, -0.005906369455983883], [-0.0037983225864595374, 0.9999703256171942, 0.006729367156156027], [0.005932617650240982, -0.006707131114408277, 0.9999600560348555]], 'translation vector': [-0.00423530822573337, 0.003061759875670811, -0.009950114450137715]}\nB: {'rotation matrix': [[-0.937821, -0.115212, 0.32744], [-0.346749, 0.354456, -0.868405], [-0.016013, -0.927948, -0.372366]], 'translation vector': [5.30238, 4.116027, 1.850731]}\nC: {'rotation matrix': [[-0.932005, -0.116649, 0.343162], [-0.36182, 0.355063, -0.861984], [-0.021294, -0.927536, 
-0.373127]], 'translation vector': [5.291139, 4.11983, 1.856331]}\nD: {'rotation matrix': [[-0.934388, -0.115649, 0.336964], [-0.355745, 0.353624, -0.865099], [-0.01911, -0.928211, -0.371563]], 'translation vector': [5.294776, 4.11946, 1.854234]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999748122492941, 0.0038384551465654687, -0.005906369455983883], [-0.0037983225864595374, 0.9999703256171942, 0.006729367156156027], [0.005932617650240982, -0.006707131114408277, 0.9999600560348555]], 'translation vector': [-0.00423530822573337, 0.003061759875670811, -0.009950114450137715]}\nB: {'rotation matrix': [[-0.937821, -0.115212, 0.32744], [-0.346749, 0.354456, -0.868405], [-0.016013, -0.927948, -0.372366]], 'translation vector': [5.30238, 4.116027, 1.850731]}\nC: {'rotation matrix': [[-0.932005, -0.116649, 0.343162], [-0.36182, 0.355063, -0.861984], [-0.021294, -0.927536, -0.373127]], 'translation vector': [5.291139, 4.11983, 1.856331]}\nD: {'rotation matrix': [[-0.934388, -0.115649, 0.336964], [-0.355745, 0.353624, -0.865099], [-0.01911, -0.928211, -0.371563]], 'translation vector': [5.294776, 4.11946, 1.854234]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_29_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_29_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_29_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_29_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.431582, -0.098037, 0.896731], [-0.900431, 0.106787, -0.421688], [-0.054418, -0.989437, -0.134363]], 'translation vector': [4.412532, 3.596741, 1.526323]}\nB: {'rotation matrix': [[-0.43275, -0.095778, 0.896412], [-0.899777, 0.107595, -0.422878], [-0.055947, -0.989571, -0.132741]], 'translation vector': [4.410773, 3.601486, 1.526138]}\nC: {'rotation matrix': [[0.9999520333968362, 0.0006753791567632332, -0.009755571516398104], [-0.0006310509478896895, 0.999990797204237, 0.004384447971913257], [0.009758133830260066, -0.004378955576420755, 0.9999427994349811]], 'translation vector': [0.0016435229369795579, -0.00040384651884517453, 0.0035746064241082287]}\nD: {'rotation matrix': [[-0.433914, -0.093907, 0.896047], [-0.899123, 0.108519, -0.42403], [-0.057419, -0.989649, -0.131522]], 'translation vector': [4.40951, 3.606652, 1.52516]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.431582, -0.098037, 0.896731], [-0.900431, 0.106787, -0.421688], [-0.054418, -0.989437, -0.134363]], 'translation vector': [4.412532, 3.596741, 1.526323]}\nB: {'rotation matrix': [[-0.43275, -0.095778, 0.896412], [-0.899777, 0.107595, -0.422878], [-0.055947, -0.989571, -0.132741]], 'translation vector': [4.410773, 3.601486, 1.526138]}\nC: {'rotation matrix': [[0.9999520333968362, 0.0006753791567632332, -0.009755571516398104], [-0.0006310509478896895, 0.999990797204237, 0.004384447971913257], [0.009758133830260066, -0.004378955576420755, 0.9999427994349811]], 'translation vector': [0.0016435229369795579, -0.00040384651884517453, 0.0035746064241082287]}\nD: {'rotation matrix': [[-0.433914, -0.093907, 0.896047], [-0.899123, 0.108519, -0.42403], [-0.057419, -0.989649, -0.131522]], 'translation vector': [4.40951, 3.606652, 1.52516]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_30_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_30_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_30_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_30_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999917661523451, 0.003965604575831933, -0.00020113541169654973], [-0.0039642406405361414, 0.9999897644646852, 0.002088386147537849], [0.00020867644615939134, -0.0020887952983848278, 0.9999975122366389]], 'translation vector': [0.003940681703816118, 0.0007777989134077623, 0.003188885648093276]}\nB: {'rotation matrix': [[-0.926146, 0.120999, -0.357228], [0.374267, 0.177659, -0.910144], [-0.046662, -0.976625, -0.209824]], 'translation vector': [4.737155, 2.737478, 1.223721]}\nC: {'rotation matrix': [[-0.926101, 0.124421, -0.356169], [0.374063, 0.179874, -0.909792], [-0.049131, 
-0.975789, -0.213123]], 'translation vector': [4.73486, 2.737298, 1.223615]}\nD: {'rotation matrix': [[-0.927631, 0.118543, -0.354186], [0.370581, 0.173865, -0.912382], [-0.046576, -0.977609, -0.205212]], 'translation vector': [4.731637, 2.739449, 1.226493]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999917661523451, 0.003965604575831933, -0.00020113541169654973], [-0.0039642406405361414, 0.9999897644646852, 0.002088386147537849], [0.00020867644615939134, -0.0020887952983848278, 0.9999975122366389]], 'translation vector': [0.003940681703816118, 0.0007777989134077623, 0.003188885648093276]}\nB: {'rotation matrix': [[-0.926146, 0.120999, -0.357228], [0.374267, 0.177659, -0.910144], [-0.046662, -0.976625, -0.209824]], 'translation vector': [4.737155, 2.737478, 1.223721]}\nC: {'rotation matrix': [[-0.926101, 0.124421, -0.356169], [0.374063, 0.179874, -0.909792], [-0.049131, -0.975789, -0.213123]], 'translation vector': [4.73486, 2.737298, 1.223615]}\nD: {'rotation matrix': [[-0.927631, 0.118543, -0.354186], [0.370581, 0.173865, -0.912382], [-0.046576, -0.977609, -0.205212]], 'translation vector': [4.731637, 2.739449, 1.226493]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_31_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_31_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_31_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_31_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[0.854414, -0.337949, 0.394674], [-0.51408, -0.439475, 0.736602], [-0.075485, -0.832257, -0.549227]], 'translation vector': [2.728753, 6.764147, 1.410515]}\nB: {'rotation matrix': [[0.857663, -0.338131, 0.387404], [-0.508133, -0.441807, 0.739329], [-0.078832, -0.830948, -0.550737]], 'translation vector': [2.730525, 6.755143, 1.407191]}\nC: {'rotation matrix': [[0.856314, -0.338309, 0.390222], [-0.510605, -0.441176, 0.738002], [-0.077516, -0.83121, -0.550528]], 'translation vector': [2.731703, 6.760056, 1.408417]}\nD: {'rotation matrix': [[0.9999681707679642, 0.006356789386196493, 0.004849115684302276], [-0.0063488135736310385, 0.9999779524760675, -0.0017687198088092734], [-0.004860696689611514, 0.0017388114569044306, 0.9999869431575653]], 'translation vector': [0.001030885579985874, -0.006730226347642976, 0.006769981822561277]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.854414, -0.337949, 0.394674], [-0.51408, -0.439475, 0.736602], [-0.075485, -0.832257, -0.549227]], 'translation vector': [2.728753, 6.764147, 1.410515]}\nB: {'rotation matrix': [[0.857663, -0.338131, 0.387404], [-0.508133, -0.441807, 0.739329], [-0.078832, -0.830948, -0.550737]], 'translation vector': [2.730525, 6.755143, 1.407191]}\nC: {'rotation matrix': [[0.856314, -0.338309, 0.390222], [-0.510605, -0.441176, 0.738002], [-0.077516, -0.83121, -0.550528]], 'translation vector': [2.731703, 6.760056, 1.408417]}\nD: {'rotation matrix': [[0.9999681707679642, 0.006356789386196493, 0.004849115684302276], [-0.0063488135736310385, 0.9999779524760675, -0.0017687198088092734], [-0.004860696689611514, 0.0017388114569044306, 0.9999869431575653]], 'translation vector': [0.001030885579985874, -0.006730226347642976, 0.006769981822561277]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_32_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_32_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_32_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_32_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.230447, -0.471956, 0.850971], [-0.964157, 0.007445, 0.265227], [-0.131511, -0.881591, -0.453324]], 'translation vector': [3.039354, 2.955346, 1.549151]}\nB: {'rotation matrix': [[0.228449, -0.472123, 0.851417], [-0.96468, 0.008044, 0.2633], [-0.131159, -0.881496, -0.45361]], 'translation vector': [3.038737, 2.954341, 1.548813]}\nC: {'rotation matrix': [[0.9999932374461685, -2.1218966376535376e-05, -0.003832905020754294], [2.3397413103181216e-05, 0.9999998755264938, 0.00025510731414015743], [0.003833858200182055, -0.0002556535302727228, 0.9999922097121886]], 'translation vector': 
[-0.00018189815458224956, -0.001091765193039329, 0.0006659190149727046]}\nD: {'rotation matrix': [[0.234859, -0.471403, 0.850071], [-0.962925, 0.006583, 0.269689], [-0.132728, -0.881894, -0.452379]], 'translation vector': [3.04024, 2.955162, 1.549553]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.230447, -0.471956, 0.850971], [-0.964157, 0.007445, 0.265227], [-0.131511, -0.881591, -0.453324]], 'translation vector': [3.039354, 2.955346, 1.549151]}\nB: {'rotation matrix': [[0.228449, -0.472123, 0.851417], [-0.96468, 0.008044, 0.2633], [-0.131159, -0.881496, -0.45361]], 'translation vector': [3.038737, 2.954341, 1.548813]}\nC: {'rotation matrix': [[0.9999932374461685, -2.1218966376535376e-05, -0.003832905020754294], [2.3397413103181216e-05, 0.9999998755264938, 0.00025510731414015743], [0.003833858200182055, -0.0002556535302727228, 0.9999922097121886]], 'translation vector': [-0.00018189815458224956, -0.001091765193039329, 0.0006659190149727046]}\nD: {'rotation matrix': [[0.234859, -0.471403, 0.850071], [-0.962925, 0.006583, 0.269689], [-0.132728, -0.881894, -0.452379]], 'translation vector': [3.04024, 2.955162, 1.549553]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_33_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_33_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_33_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_33_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": 
"A: {'rotation matrix': [[0.9999722451927703, 0.006302215099590525, -0.00403981735739725], [-0.006327345454891822, 0.9999605703041478, -0.0062233586210535905], [0.004001166188993241, 0.006248748688806741, 0.9999728906669906]], 'translation vector': [0.005093628494140745, -0.0003734020522905279, 0.0005966377475724594]}\nB: {'rotation matrix': [[-0.852779, -0.130984, 0.505581], [-0.521088, 0.148208, -0.840537], [0.035166, -0.980244, -0.194643]], 'translation vector': [2.708243, 1.722235, 1.600397]}\nC: {'rotation matrix': [[-0.85558, -0.133703, 0.500106], [-0.51643, 0.153622, -0.842437], [0.035809, -0.979042, -0.200484]], 'translation vector': [2.710987, 1.723705, 1.596351]}\nD: {'rotation matrix': [[-0.853917, -0.132599, 0.503232], [-0.519221, 0.151792, -0.841052], [0.035136, -0.979478, -0.198466]], 'translation vector': [2.709099, 1.722802, 1.598917]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999722451927703, 0.006302215099590525, -0.00403981735739725], [-0.006327345454891822, 0.9999605703041478, -0.0062233586210535905], [0.004001166188993241, 0.006248748688806741, 0.9999728906669906]], 'translation vector': [0.005093628494140745, -0.0003734020522905279, 0.0005966377475724594]}\nB: {'rotation matrix': [[-0.852779, -0.130984, 0.505581], [-0.521088, 0.148208, -0.840537], [0.035166, -0.980244, -0.194643]], 'translation vector': [2.708243, 1.722235, 1.600397]}\nC: {'rotation matrix': [[-0.85558, -0.133703, 0.500106], [-0.51643, 0.153622, -0.842437], [0.035809, -0.979042, -0.200484]], 'translation vector': [2.710987, 1.723705, 1.596351]}\nD: {'rotation matrix': [[-0.853917, -0.132599, 0.503232], [-0.519221, 0.151792, -0.841052], [0.035136, -0.979478, -0.198466]], 'translation vector': [2.709099, 1.722802, 1.598917]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_34_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_34_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_34_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_34_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.478873, -0.428944, 0.765955], [-0.87533, -0.166797, 0.453846], [-0.066915, -0.887798, -0.455343]], 'translation vector': [0.725473, 2.084639, 1.401624]}\nB: {'rotation matrix': [[0.9999321559246817, -0.004700941469421, -0.010648795235394873], [0.0046277052826301434, 0.9999657024420984, -0.006814383513752801], [0.010680686020790284, 0.006765103656469935, 0.9999199949357819]], 'translation vector': [-0.010159202650746213, -0.00890278579572934, 0.006564575177659959]}\nC: {'rotation matrix': [[0.476891, -0.427829, 0.767814], [-0.876452, -0.165482, 0.452159], [-0.066387, -0.888582, 
-0.453888]], 'translation vector': [0.720453, 2.082574, 1.402557]}\nD: {'rotation matrix': [[0.480806, -0.429519, 0.764421], [-0.874127, -0.166433, 0.456292], [-0.068761, -0.887589, -0.455476]], 'translation vector': [0.729586, 2.089959, 1.401763]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.478873, -0.428944, 0.765955], [-0.87533, -0.166797, 0.453846], [-0.066915, -0.887798, -0.455343]], 'translation vector': [0.725473, 2.084639, 1.401624]}\nB: {'rotation matrix': [[0.9999321559246817, -0.004700941469421, -0.010648795235394873], [0.0046277052826301434, 0.9999657024420984, -0.006814383513752801], [0.010680686020790284, 0.006765103656469935, 0.9999199949357819]], 'translation vector': [-0.010159202650746213, -0.00890278579572934, 0.006564575177659959]}\nC: {'rotation matrix': [[0.476891, -0.427829, 0.767814], [-0.876452, -0.165482, 0.452159], [-0.066387, -0.888582, -0.453888]], 'translation vector': [0.720453, 2.082574, 1.402557]}\nD: {'rotation matrix': [[0.480806, -0.429519, 0.764421], [-0.874127, -0.166433, 0.456292], [-0.068761, -0.887589, -0.455476]], 'translation vector': [0.729586, 2.089959, 1.401763]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_35_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_35_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_35_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_35_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.745148, -0.37119, 0.554052], [-0.66666, -0.436838, 0.603934], [0.017857, -0.819385, -0.572966]], 'translation vector': [3.707678, 4.401502, 1.259793]}\nB: {'rotation matrix': [[0.9999956925108978, 0.0027628800894689116, -0.0013776131907199199], [-0.002750831230441356, 0.9999530244932063, 0.009285447903977378], [0.0014027365353670037, -0.00928147923100416, 0.9999562334978788]], 'translation vector': [-0.001323540742452639, -0.0019242276069570963, 0.0020358659652854882]}\nC: {'rotation matrix': [[0.745353, -0.370803, 0.554034], [-0.666429, -0.436771, 0.604239], [0.017933, -0.819595, -0.572662]], 'translation vector': [3.707908, 4.40198, 1.260519]}\nD: {'rotation matrix': [[0.746372, -0.37052, 0.55285], [-0.665272, -0.438418, 0.60432], [0.018468, -0.818844, -0.57372]], 'translation vector': [3.708833, 4.402057, 1.261367]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.745148, -0.37119, 0.554052], [-0.66666, -0.436838, 0.603934], [0.017857, -0.819385, -0.572966]], 'translation vector': [3.707678, 4.401502, 1.259793]}\nB: {'rotation matrix': [[0.9999956925108978, 0.0027628800894689116, -0.0013776131907199199], [-0.002750831230441356, 0.9999530244932063, 0.009285447903977378], [0.0014027365353670037, -0.00928147923100416, 0.9999562334978788]], 'translation vector': [-0.001323540742452639, -0.0019242276069570963, 0.0020358659652854882]}\nC: {'rotation matrix': [[0.745353, -0.370803, 0.554034], [-0.666429, -0.436771, 0.604239], [0.017933, -0.819595, -0.572662]], 'translation vector': [3.707908, 4.40198, 1.260519]}\nD: {'rotation matrix': [[0.746372, -0.37052, 0.55285], [-0.665272, -0.438418, 0.60432], [0.018468, -0.818844, -0.57372]], 'translation vector': [3.708833, 4.402057, 1.261367]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_36_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_36_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_36_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_36_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.116268, -0.545929, 0.829725], [-0.992164, 0.102308, -0.071715], [-0.045736, -0.831562, -0.553546]], 'translation vector': [1.188241, 1.804719, 1.496587]}\nB: {'rotation matrix': [[-0.114381, -0.546538, 0.829586], [-0.992382, 0.101334, -0.070067], [-0.045771, -0.83128, -0.553966]], 'translation vector': [1.18804, 1.806907, 1.497044]}\nC: {'rotation matrix': [[-0.116275, -0.545912, 0.829735], [-0.992183, 0.101947, -0.071965], [-0.045303, -0.831617, -0.553499]], 'translation vector': [1.188215, 1.807271, 1.496983]}\nD: {'rotation matrix': [[0.9999972656726313, 0.0013206868291442259, 
0.0020218952923470846], [-0.0013233096218697464, 0.999999141112598, 0.0010787718934225417], [-0.0020206374390544144, -0.0010806530653509267, 0.9999977188154618]], 'translation vector': [-0.0038275910671821123, 0.000160776135250007, -0.0021485328355081296]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.116268, -0.545929, 0.829725], [-0.992164, 0.102308, -0.071715], [-0.045736, -0.831562, -0.553546]], 'translation vector': [1.188241, 1.804719, 1.496587]}\nB: {'rotation matrix': [[-0.114381, -0.546538, 0.829586], [-0.992382, 0.101334, -0.070067], [-0.045771, -0.83128, -0.553966]], 'translation vector': [1.18804, 1.806907, 1.497044]}\nC: {'rotation matrix': [[-0.116275, -0.545912, 0.829735], [-0.992183, 0.101947, -0.071965], [-0.045303, -0.831617, -0.553499]], 'translation vector': [1.188215, 1.807271, 1.496983]}\nD: {'rotation matrix': [[0.9999972656726313, 0.0013206868291442259, 0.0020218952923470846], [-0.0013233096218697464, 0.999999141112598, 0.0010787718934225417], [-0.0020206374390544144, -0.0010806530653509267, 0.9999977188154618]], 'translation vector': [-0.0038275910671821123, 0.000160776135250007, -0.0021485328355081296]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_37_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_37_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_37_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_37_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[0.997087, -0.025415, -0.071922], [0.054952, -0.414609, 0.908339], [-0.052905, -0.909645, -0.412004]], 'translation vector': [4.407682, 5.403047, 1.49649]}\nB: {'rotation matrix': [[1.0000003154168173, -0.00026066196587795827, 0.0009499731630029714], [0.00026257414143552785, 0.9999962311537921, -0.0026991488717855007], [-0.0009496510997198135, 0.002699452079922895, 0.9999961867215906]], 'translation vector': [-0.0035058102061285012, -0.0001503164701972537, 0.00022028015795205746]}\nC: {'rotation matrix': [[0.996877, -0.026735, -0.074307], [0.056551, -0.415107, 0.908013], [-0.055122, -0.909379, -0.412299]], 'translation vector': [4.407921, 5.402507, 1.494552]}\nD: {'rotation matrix': [[0.997138, -0.02412, -0.07165], [0.055312, -0.413324, 0.908903], [-0.051537, -0.910265, -0.410807]], 'translation vector': [4.410345, 5.401881, 1.497987]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.997087, -0.025415, -0.071922], [0.054952, -0.414609, 0.908339], [-0.052905, -0.909645, -0.412004]], 'translation vector': [4.407682, 5.403047, 1.49649]}\nB: {'rotation matrix': [[1.0000003154168173, -0.00026066196587795827, 0.0009499731630029714], [0.00026257414143552785, 0.9999962311537921, -0.0026991488717855007], [-0.0009496510997198135, 0.002699452079922895, 0.9999961867215906]], 'translation vector': [-0.0035058102061285012, -0.0001503164701972537, 0.00022028015795205746]}\nC: {'rotation matrix': [[0.996877, -0.026735, -0.074307], [0.056551, -0.415107, 0.908013], [-0.055122, -0.909379, -0.412299]], 'translation vector': [4.407921, 5.402507, 1.494552]}\nD: {'rotation matrix': [[0.997138, -0.02412, -0.07165], [0.055312, -0.413324, 0.908903], [-0.051537, -0.910265, -0.410807]], 'translation vector': [4.410345, 5.401881, 1.497987]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_38_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_38_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_38_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_38_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.317061, -0.465845, 0.826112], [-0.947162, 0.200121, -0.250671], [-0.048548, -0.86194, -0.504681]], 'translation vector': [2.298134, 2.388596, 1.453916]}\nB: {'rotation matrix': [[-0.317304, -0.461983, 0.828185], [-0.946993, 0.200626, -0.250908], [-0.05024, -0.863899, -0.501153]], 'translation vector': [2.298876, 2.392571, 1.455489]}\nC: {'rotation matrix': [[0.99999454935107, 0.0002197888751369094, -0.0032128422811739327], [-0.00022937596583946223, 0.9999976816384528, -0.0024943946424807743], [0.0032131009034445414, 0.0024955910558726972, 0.9999912821080568]], 'translation 
vector': [0.001089045909415276, -0.0004423003337574727, -0.0002086110960047849]}\nD: {'rotation matrix': [[-0.314906, -0.456701, 0.83202], [-0.947639, 0.200286, -0.248728], [-0.053048, -0.866781, -0.49586]], 'translation vector': [2.297376, 2.389925, 1.457247]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.317061, -0.465845, 0.826112], [-0.947162, 0.200121, -0.250671], [-0.048548, -0.86194, -0.504681]], 'translation vector': [2.298134, 2.388596, 1.453916]}\nB: {'rotation matrix': [[-0.317304, -0.461983, 0.828185], [-0.946993, 0.200626, -0.250908], [-0.05024, -0.863899, -0.501153]], 'translation vector': [2.298876, 2.392571, 1.455489]}\nC: {'rotation matrix': [[0.99999454935107, 0.0002197888751369094, -0.0032128422811739327], [-0.00022937596583946223, 0.9999976816384528, -0.0024943946424807743], [0.0032131009034445414, 0.0024955910558726972, 0.9999912821080568]], 'translation vector': [0.001089045909415276, -0.0004423003337574727, -0.0002086110960047849]}\nD: {'rotation matrix': [[-0.314906, -0.456701, 0.83202], [-0.947639, 0.200286, -0.248728], [-0.053048, -0.866781, -0.49586]], 'translation vector': [2.297376, 2.389925, 1.457247]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_39_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_39_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_39_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_39_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": 
"SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.887986, -0.072993, 0.45404], [-0.454573, -0.288743, 0.84261], [0.069596, -0.95462, -0.28958]], 'translation vector': [3.216625, 3.12153, 1.569232]}\nB: {'rotation matrix': [[0.881294, -0.091283, 0.463668], [-0.468392, -0.298889, 0.831429], [0.06269, -0.949912, -0.306165]], 'translation vector': [3.22503, 3.133041, 1.572641]}\nC: {'rotation matrix': [[0.9998534926564413, 0.011234421547280734, -0.012941695383424894], [-0.01131150095792877, 0.9999182350466784, -0.005915937307264116], [0.012873405071388795, 0.006062261057420049, 0.9998987281608317]], 'translation vector': [-0.0006289172879170302, -0.011376901257172278, -0.010410317713380746]}\nD: {'rotation matrix': [[0.883743, -0.084646, 0.460254], [-0.463409, -0.29531, 0.83549], [0.065197, -0.951644, -0.300204]], 'translation vector': [3.211292, 3.12843, 1.571525]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.887986, -0.072993, 0.45404], [-0.454573, -0.288743, 0.84261], [0.069596, -0.95462, -0.28958]], 'translation vector': [3.216625, 3.12153, 1.569232]}\nB: {'rotation matrix': [[0.881294, -0.091283, 0.463668], [-0.468392, -0.298889, 0.831429], [0.06269, -0.949912, -0.306165]], 'translation vector': [3.22503, 3.133041, 1.572641]}\nC: {'rotation matrix': [[0.9998534926564413, 0.011234421547280734, -0.012941695383424894], [-0.01131150095792877, 0.9999182350466784, -0.005915937307264116], [0.012873405071388795, 0.006062261057420049, 0.9998987281608317]], 'translation vector': [-0.0006289172879170302, -0.011376901257172278, -0.010410317713380746]}\nD: {'rotation matrix': [[0.883743, -0.084646, 0.460254], [-0.463409, -0.29531, 0.83549], [0.065197, -0.951644, -0.300204]], 'translation vector': [3.211292, 3.12843, 1.571525]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_40_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_40_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_40_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_40_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999922215867046, 0.00022785537094286064, 0.0038428059210989744], [-0.0002767566400686762, 0.9999180395239896, 0.0128255569964717], [-0.0038395911947271166, -0.01282698821636408, 0.9999095074136637]], 'translation vector': [-0.012178319620715694, 0.0009877403245357463, 0.004998693428563072]}\nB: {'rotation matrix': [[-0.349791, 0.571502, -0.742315], [0.927295, 0.098467, -0.361147], [-0.133303, -0.814672, -0.564394]], 'translation vector': [7.153554, 3.625007, 1.584927]}\nC: {'rotation matrix': [[-0.352738, 0.563812, -0.746788], [0.92567, 0.093586, -0.366575], [-0.13679, -0.820584, -0.554915]], 
'translation vector': [7.154875, 3.637451, 1.583088]}\nD: {'rotation matrix': [[-0.346362, 0.577228, -0.739487], [0.929238, 0.103006, -0.354833], [-0.128648, -0.81006, -0.57206]], 'translation vector': [7.154236, 3.613202, 1.583063]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999922215867046, 0.00022785537094286064, 0.0038428059210989744], [-0.0002767566400686762, 0.9999180395239896, 0.0128255569964717], [-0.0038395911947271166, -0.01282698821636408, 0.9999095074136637]], 'translation vector': [-0.012178319620715694, 0.0009877403245357463, 0.004998693428563072]}\nB: {'rotation matrix': [[-0.349791, 0.571502, -0.742315], [0.927295, 0.098467, -0.361147], [-0.133303, -0.814672, -0.564394]], 'translation vector': [7.153554, 3.625007, 1.584927]}\nC: {'rotation matrix': [[-0.352738, 0.563812, -0.746788], [0.92567, 0.093586, -0.366575], [-0.13679, -0.820584, -0.554915]], 'translation vector': [7.154875, 3.637451, 1.583088]}\nD: {'rotation matrix': [[-0.346362, 0.577228, -0.739487], [0.929238, 0.103006, -0.354833], [-0.128648, -0.81006, -0.57206]], 'translation vector': [7.154236, 3.613202, 1.583063]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_41_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_41_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_41_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_41_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': 
[[0.295342, -0.337479, 0.893802], [-0.954338, -0.060275, 0.292587], [-0.044868, -0.939402, -0.33987]], 'translation vector': [3.757863, 4.507889, 1.342911]}\nB: {'rotation matrix': [[0.300826, -0.332026, 0.894015], [-0.952739, -0.063015, 0.297182], [-0.042336, -0.941163, -0.33529]], 'translation vector': [3.757184, 4.502328, 1.344268]}\nC: {'rotation matrix': [[0.280244, -0.341756, 0.897032], [-0.959084, -0.060487, 0.276585], [-0.040265, -0.93784, -0.344724]], 'translation vector': [3.749212, 4.541941, 1.346336]}\nD: {'rotation matrix': [[0.9996750004430802, 0.005976407862532351, -0.024761409167148862], [-0.005880950295372731, 0.9999756244730406, 0.0039275528492273585], [0.024784884729748376, -0.0037817785482656863, 0.9996858791488856]], 'translation vector': [0.014484406993793275, 0.0006255655612603661, -0.006516735656635575]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.295342, -0.337479, 0.893802], [-0.954338, -0.060275, 0.292587], [-0.044868, -0.939402, -0.33987]], 'translation vector': [3.757863, 4.507889, 1.342911]}\nB: {'rotation matrix': [[0.300826, -0.332026, 0.894015], [-0.952739, -0.063015, 0.297182], [-0.042336, -0.941163, -0.33529]], 'translation vector': [3.757184, 4.502328, 1.344268]}\nC: {'rotation matrix': [[0.280244, -0.341756, 0.897032], [-0.959084, -0.060487, 0.276585], [-0.040265, -0.93784, -0.344724]], 'translation vector': [3.749212, 4.541941, 1.346336]}\nD: {'rotation matrix': [[0.9996750004430802, 0.005976407862532351, -0.024761409167148862], [-0.005880950295372731, 0.9999756244730406, 0.0039275528492273585], [0.024784884729748376, -0.0037817785482656863, 0.9996858791488856]], 'translation vector': [0.014484406993793275, 0.0006255655612603661, -0.006516735656635575]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_42_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_42_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_42_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_42_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.408105, -0.298824, 0.862644], [-0.912691, -0.155393, 0.377953], [0.021107, -0.941572, -0.33615]], 'translation vector': [3.68854, 2.987475, 1.504179]}\nB: {'rotation matrix': [[0.414415, -0.280989, 0.865625], [-0.909796, -0.152002, 0.386221], [0.023053, -0.947597, -0.318634]], 'translation vector': [3.695469, 2.977012, 1.528306]}\nC: {'rotation matrix': [[0.9999691985275884, -0.0028723049102575577, -0.007257311467463551], [0.0029984988555096107, 0.9998441859378951, 0.01739980616527257], [0.007206986340996781, -0.017422228683580315, 0.9998224150329578]], 'translation vector': 
[0.005083024714500617, 0.008976294562036191, -0.004546920528859744]}\nD: {'rotation matrix': [[0.410301, -0.290948, 0.864293], [-0.911655, -0.15496, 0.38062], [0.02319, -0.944106, -0.328824]], 'translation vector': [3.694328, 2.984669, 1.517045]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.408105, -0.298824, 0.862644], [-0.912691, -0.155393, 0.377953], [0.021107, -0.941572, -0.33615]], 'translation vector': [3.68854, 2.987475, 1.504179]}\nB: {'rotation matrix': [[0.414415, -0.280989, 0.865625], [-0.909796, -0.152002, 0.386221], [0.023053, -0.947597, -0.318634]], 'translation vector': [3.695469, 2.977012, 1.528306]}\nC: {'rotation matrix': [[0.9999691985275884, -0.0028723049102575577, -0.007257311467463551], [0.0029984988555096107, 0.9998441859378951, 0.01739980616527257], [0.007206986340996781, -0.017422228683580315, 0.9998224150329578]], 'translation vector': [0.005083024714500617, 0.008976294562036191, -0.004546920528859744]}\nD: {'rotation matrix': [[0.410301, -0.290948, 0.864293], [-0.911655, -0.15496, 0.38062], [0.02319, -0.944106, -0.328824]], 'translation vector': [3.694328, 2.984669, 1.517045]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_43_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_43_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_43_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_43_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation 
matrix': [[-0.794148, 0.542055, -0.274783], [0.607642, 0.715697, -0.34431], [0.010026, -0.440403, -0.897744]], 'translation vector': [2.029685, 2.312871, 1.199782]}\nB: {'rotation matrix': [[-0.789274, 0.545355, -0.282196], [0.613822, 0.713018, -0.338862], [0.016411, -0.440674, -0.897518]], 'translation vector': [2.029754, 2.312013, 1.198812]}\nC: {'rotation matrix': [[-0.792558, 0.542991, -0.277512], [0.609667, 0.714954, -0.342268], [0.01256, -0.440457, -0.897686]], 'translation vector': [2.028831, 2.312793, 1.199579]}\nD: {'rotation matrix': [[0.9999690183091173, 0.00758939386438217, -0.0016348482581436392], [-0.007592849178659918, 0.9999686021169869, -0.0023322818814674375], [0.0016172805944689142, 0.0023449024761050914, 0.9999961697739403]], 'translation vector': [-0.0009595698380654716, -0.001243015152278204, -0.0008030280503015241]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.794148, 0.542055, -0.274783], [0.607642, 0.715697, -0.34431], [0.010026, -0.440403, -0.897744]], 'translation vector': [2.029685, 2.312871, 1.199782]}\nB: {'rotation matrix': [[-0.789274, 0.545355, -0.282196], [0.613822, 0.713018, -0.338862], [0.016411, -0.440674, -0.897518]], 'translation vector': [2.029754, 2.312013, 1.198812]}\nC: {'rotation matrix': [[-0.792558, 0.542991, -0.277512], [0.609667, 0.714954, -0.342268], [0.01256, -0.440457, -0.897686]], 'translation vector': [2.028831, 2.312793, 1.199579]}\nD: {'rotation matrix': [[0.9999690183091173, 0.00758939386438217, -0.0016348482581436392], [-0.007592849178659918, 0.9999686021169869, -0.0023322818814674375], [0.0016172805944689142, 0.0023449024761050914, 0.9999961697739403]], 'translation vector': [-0.0009595698380654716, -0.001243015152278204, -0.0008030280503015241]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_44_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_44_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_44_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_44_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.595869, 0.488846, -0.637158], [0.802996, -0.351087, 0.481596], [0.011728, -0.798604, -0.601743]], 'translation vector': [3.453696, 1.113575, 1.412785]}\nB: {'rotation matrix': [[0.596116, 0.489381, -0.636516], [0.802814, -0.351792, 0.481386], [0.01166, -0.797966, -0.60259]], 'translation vector': [3.45192, 1.112521, 1.411639]}\nC: {'rotation matrix': [[0.9999972585421386, -0.0019706644639079346, -0.0016038162580140711], [0.0019696489330632795, 0.9999981939869151, -0.0004898637930363391], [0.00160459924260016, 0.0004861135599392089, 0.9999985246041514]], 'translation vector': 
[0.0004367632436044211, -0.0013470306629629059, 0.001255614697354801]}\nD: {'rotation matrix': [[0.596167, 0.487305, -0.638059], [0.802791, -0.351322, 0.481768], [0.010604, -0.799442, -0.60065]], 'translation vector': [3.452477, 1.114933, 1.412574]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.595869, 0.488846, -0.637158], [0.802996, -0.351087, 0.481596], [0.011728, -0.798604, -0.601743]], 'translation vector': [3.453696, 1.113575, 1.412785]}\nB: {'rotation matrix': [[0.596116, 0.489381, -0.636516], [0.802814, -0.351792, 0.481386], [0.01166, -0.797966, -0.60259]], 'translation vector': [3.45192, 1.112521, 1.411639]}\nC: {'rotation matrix': [[0.9999972585421386, -0.0019706644639079346, -0.0016038162580140711], [0.0019696489330632795, 0.9999981939869151, -0.0004898637930363391], [0.00160459924260016, 0.0004861135599392089, 0.9999985246041514]], 'translation vector': [0.0004367632436044211, -0.0013470306629629059, 0.001255614697354801]}\nD: {'rotation matrix': [[0.596167, 0.487305, -0.638059], [0.802791, -0.351322, 0.481768], [0.010604, -0.799442, -0.60065]], 'translation vector': [3.452477, 1.114933, 1.412574]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_45_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_45_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_45_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_45_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.454351, -0.425578, 0.782591], [-0.890802, 0.223047, -0.395881], [-0.006077, -0.877003, -0.480447]], 'translation vector': [2.248463, 3.862178, 1.517095]}\nB: {'rotation matrix': [[-0.455273, -0.423684, 0.783083], [-0.890312, 0.224946, -0.395908], [-0.008411, -0.877434, -0.479623]], 'translation vector': [2.248543, 3.862554, 1.517483]}\nC: {'rotation matrix': [[0.9999998220095317, 0.00019344203418205028, 0.0003904002954544705], [-0.00019414720576528636, 0.99999726822095, 0.002542005647158053], [-0.00038905761812178927, -0.002542402173661647, 0.9999965389998547]], 'translation vector': [-0.0007329855911066829, -0.0003036787606989222, 0.00038766167203613255]}\nD: {'rotation matrix': [[-0.455182, -0.424042, 0.782942], [-0.890372, 0.223522, -0.39658], [-0.006838, -0.877625, -0.479299]], 'translation vector': [2.247845, 3.863035, 1.516836]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.454351, -0.425578, 0.782591], [-0.890802, 0.223047, -0.395881], [-0.006077, -0.877003, -0.480447]], 'translation vector': [2.248463, 3.862178, 1.517095]}\nB: {'rotation matrix': [[-0.455273, -0.423684, 0.783083], [-0.890312, 0.224946, -0.395908], [-0.008411, -0.877434, -0.479623]], 'translation vector': [2.248543, 3.862554, 1.517483]}\nC: {'rotation matrix': [[0.9999998220095317, 0.00019344203418205028, 0.0003904002954544705], [-0.00019414720576528636, 0.99999726822095, 0.002542005647158053], [-0.00038905761812178927, -0.002542402173661647, 0.9999965389998547]], 'translation vector': [-0.0007329855911066829, -0.0003036787606989222, 0.00038766167203613255]}\nD: {'rotation matrix': [[-0.455182, -0.424042, 0.782942], [-0.890372, 0.223522, -0.39658], [-0.006838, -0.877625, -0.479299]], 'translation vector': [2.247845, 3.863035, 1.516836]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_46_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_46_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_46_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_46_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.503003, -0.389292, 0.771648], [-0.863104, 0.179601, -0.472011], [0.045161, -0.903435, -0.426339]], 'translation vector': [8.991447, 2.792113, 1.935809]}\nB: {'rotation matrix': [[0.9998820105783071, 0.006892269339613531, -0.013715431860878731], [-0.006800370949834618, 0.9999536838119013, 0.00673862053023821], [0.013761408703211613, -0.006644159929287518, 0.9998834142169373]], 'translation vector': [0.0018210588463203337, -0.0009922093443961444, -0.010804184818734797]}\nC: {'rotation matrix': [[-0.507392, -0.392271, 0.767253], [-0.860549, 0.184347, -0.474839], [0.044825, 
-0.901188, -0.431104]], 'translation vector': [8.996889, 2.787546, 1.938329]}\nD: {'rotation matrix': [[-0.511945, -0.391945, 0.76439], [-0.857826, 0.186392, -0.47895], [0.045246, -0.900909, -0.431643]], 'translation vector': [9.004251, 2.788493, 1.934378]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.503003, -0.389292, 0.771648], [-0.863104, 0.179601, -0.472011], [0.045161, -0.903435, -0.426339]], 'translation vector': [8.991447, 2.792113, 1.935809]}\nB: {'rotation matrix': [[0.9998820105783071, 0.006892269339613531, -0.013715431860878731], [-0.006800370949834618, 0.9999536838119013, 0.00673862053023821], [0.013761408703211613, -0.006644159929287518, 0.9998834142169373]], 'translation vector': [0.0018210588463203337, -0.0009922093443961444, -0.010804184818734797]}\nC: {'rotation matrix': [[-0.507392, -0.392271, 0.767253], [-0.860549, 0.184347, -0.474839], [0.044825, -0.901188, -0.431104]], 'translation vector': [8.996889, 2.787546, 1.938329]}\nD: {'rotation matrix': [[-0.511945, -0.391945, 0.76439], [-0.857826, 0.186392, -0.47895], [0.045246, -0.900909, -0.431643]], 'translation vector': [9.004251, 2.788493, 1.934378]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_47_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_47_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_47_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_47_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[0.347731, 0.021734, -0.937343], [0.937583, -0.012995, 0.347518], [-0.004627, -0.999679, -0.024896]], 'translation vector': [3.086271, 2.7877, 1.609772]}\nB: {'rotation matrix': [[0.349639, 0.022266, -0.93662], [0.936876, -0.012692, 0.349433], [-0.004108, -0.999671, -0.025298]], 'translation vector': [3.085923, 2.787744, 1.608445]}\nC: {'rotation matrix': [[0.9999983503028658, 0.0012588328917477426, 0.0010046121758577645], [-0.001258491549862339, 0.999999582567488, 0.0007249187267349242], [-0.0010031229358240259, -0.0007259449856603071, 0.9999988669360486]], 'translation vector': [-0.0007969980165536406, 0.0011986171600251172, 0.00041117594518969014]}\nD: {'rotation matrix': [[0.348071, 0.022212, -0.937205], [0.937459, -0.012734, 0.347863], [-0.004208, -0.999672, -0.025256]], 'translation vector': [3.0862, 2.78781, 1.60897]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.347731, 0.021734, -0.937343], [0.937583, -0.012995, 0.347518], [-0.004627, -0.999679, -0.024896]], 'translation vector': [3.086271, 2.7877, 1.609772]}\nB: {'rotation matrix': [[0.349639, 0.022266, -0.93662], [0.936876, -0.012692, 0.349433], [-0.004108, -0.999671, -0.025298]], 'translation vector': [3.085923, 2.787744, 1.608445]}\nC: {'rotation matrix': [[0.9999983503028658, 0.0012588328917477426, 0.0010046121758577645], [-0.001258491549862339, 0.999999582567488, 0.0007249187267349242], [-0.0010031229358240259, -0.0007259449856603071, 0.9999988669360486]], 'translation vector': [-0.0007969980165536406, 0.0011986171600251172, 0.00041117594518969014]}\nD: {'rotation matrix': [[0.348071, 0.022212, -0.937205], [0.937459, -0.012734, 0.347863], [-0.004208, -0.999672, -0.025256]], 'translation vector': [3.0862, 2.78781, 1.60897]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_48_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_48_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_48_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_48_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.671878, -0.274721, 0.687829], [-0.740313, -0.220562, 0.635051], [-0.022753, -0.935885, -0.35157]], 'translation vector': [3.807358, 2.10759, 1.500018]}\nB: {'rotation matrix': [[0.9999995287042943, -0.00016620863165950158, -1.0138120576769867e-05], [0.00016705647891749608, 0.9999995162619548, -0.0013603662243301747], [1.062657980571793e-05, 0.001360311886111166, 0.9999990686101642]], 'translation vector': [-0.004297820144825049, 0.003351067382226791, -0.0005408272137925607]}\nC: {'rotation matrix': [[0.670129, -0.272494, 0.690416], [-0.741875, -0.216557, 0.634606], [-0.023412, -0.93747, 
-0.347278]], 'translation vector': [3.805446, 2.107442, 1.49456]}\nD: {'rotation matrix': [[0.670813, -0.272809, 0.689627], [-0.741265, -0.217599, 0.634962], [-0.023161, -0.937137, -0.348192]], 'translation vector': [3.805646, 2.107794, 1.49708]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.671878, -0.274721, 0.687829], [-0.740313, -0.220562, 0.635051], [-0.022753, -0.935885, -0.35157]], 'translation vector': [3.807358, 2.10759, 1.500018]}\nB: {'rotation matrix': [[0.9999995287042943, -0.00016620863165950158, -1.0138120576769867e-05], [0.00016705647891749608, 0.9999995162619548, -0.0013603662243301747], [1.062657980571793e-05, 0.001360311886111166, 0.9999990686101642]], 'translation vector': [-0.004297820144825049, 0.003351067382226791, -0.0005408272137925607]}\nC: {'rotation matrix': [[0.670129, -0.272494, 0.690416], [-0.741875, -0.216557, 0.634606], [-0.023412, -0.93747, -0.347278]], 'translation vector': [3.805446, 2.107442, 1.49456]}\nD: {'rotation matrix': [[0.670813, -0.272809, 0.689627], [-0.741265, -0.217599, 0.634962], [-0.023161, -0.937137, -0.348192]], 'translation vector': [3.805646, 2.107794, 1.49708]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_49_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_49_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_49_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_49_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.479262, 0.501356, -0.720382], [0.874939, 0.337636, -0.347107], [0.069203, -0.796646, -0.600472]], 'translation vector': [2.874844, 0.864648, 1.19894]}\nB: {'rotation matrix': [[-0.476726, 0.503154, -0.720812], [0.876238, 0.337562, -0.343889], [0.070289, -0.795543, -0.601806]], 'translation vector': [2.872792, 0.865184, 1.200293]}\nC: {'rotation matrix': [[-0.480917, 0.499307, -0.720702], [0.874259, 0.335212, -0.351147], [0.066258, -0.798953, -0.597732]], 'translation vector': [2.877507, 0.861745, 1.198945]}\nD: {'rotation matrix': [[0.9999458075169825, -0.0037298030767581817, 0.009658870905228063], [0.003830453413616424, 0.9999377550296561, -0.010410573829210937], [-0.009619405173591757, 0.010446297554888797, 0.999899528124321]], 'translation vector': [0.004156407549993579, -0.0031544955662062835, 0.0021419719379069946]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.479262, 0.501356, -0.720382], [0.874939, 0.337636, -0.347107], [0.069203, -0.796646, -0.600472]], 'translation vector': [2.874844, 0.864648, 1.19894]}\nB: {'rotation matrix': [[-0.476726, 0.503154, -0.720812], [0.876238, 0.337562, -0.343889], [0.070289, -0.795543, -0.601806]], 'translation vector': [2.872792, 0.865184, 1.200293]}\nC: {'rotation matrix': [[-0.480917, 0.499307, -0.720702], [0.874259, 0.335212, -0.351147], [0.066258, -0.798953, -0.597732]], 'translation vector': [2.877507, 0.861745, 1.198945]}\nD: {'rotation matrix': [[0.9999458075169825, -0.0037298030767581817, 0.009658870905228063], [0.003830453413616424, 0.9999377550296561, -0.010410573829210937], [-0.009619405173591757, 0.010446297554888797, 0.999899528124321]], 'translation vector': [0.004156407549993579, -0.0031544955662062835, 0.0021419719379069946]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_50_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_50_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_50_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_50_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.408839, -0.490635, 0.769499], [-0.912309, 0.19818, -0.358354], [0.023322, -0.848529, -0.528634]], 'translation vector': [0.933751, 3.504875, 1.495928]}\nB: {'rotation matrix': [[-0.405517, -0.49009, 0.771601], [-0.913834, 0.197477, -0.354839], [0.02153, -0.849008, -0.527941]], 'translation vector': [0.92311, 3.508826, 1.494794]}\nC: {'rotation matrix': [[-0.406934, -0.490269, 0.770741], [-0.913168, 0.197092, -0.356762], [0.023003, -0.848994, -0.527902]], 'translation vector': [0.928096, 3.507151, 1.495325]}\nD: {'rotation matrix': [[0.9999720790780128, 0.0021449080565870337, 
0.007189565686557225], [-0.0021449387549180303, 0.9999977939892998, -4.554193667967342e-05], [-0.007190098560018757, 3.036958542106401e-05, 0.9999745575458397]], 'translation vector': [-0.0022708049168844724, -0.01148622047209824, 0.014300413115339472]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.408839, -0.490635, 0.769499], [-0.912309, 0.19818, -0.358354], [0.023322, -0.848529, -0.528634]], 'translation vector': [0.933751, 3.504875, 1.495928]}\nB: {'rotation matrix': [[-0.405517, -0.49009, 0.771601], [-0.913834, 0.197477, -0.354839], [0.02153, -0.849008, -0.527941]], 'translation vector': [0.92311, 3.508826, 1.494794]}\nC: {'rotation matrix': [[-0.406934, -0.490269, 0.770741], [-0.913168, 0.197092, -0.356762], [0.023003, -0.848994, -0.527902]], 'translation vector': [0.928096, 3.507151, 1.495325]}\nD: {'rotation matrix': [[0.9999720790780128, 0.0021449080565870337, 0.007189565686557225], [-0.0021449387549180303, 0.9999977939892998, -4.554193667967342e-05], [-0.007190098560018757, 3.036958542106401e-05, 0.9999745575458397]], 'translation vector': [-0.0022708049168844724, -0.01148622047209824, 0.014300413115339472]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_51_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_51_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_51_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_51_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": 
"A: {'rotation matrix': [[-0.812183, -0.277025, 0.513436], [-0.583176, 0.410049, -0.70126], [-0.016267, -0.868975, -0.494589]], 'translation vector': [4.864513, 2.490983, 1.398342]}\nB: {'rotation matrix': [[-0.813534, -0.276094, 0.511796], [-0.581281, 0.411223, -0.702146], [-0.016604, -0.868716, -0.495032]], 'translation vector': [4.865518, 2.490622, 1.399591]}\nC: {'rotation matrix': [[0.999997135129029, 0.001605805084179667, 0.0018057529572375504], [-0.0016158596045621908, 0.9999825699875163, 0.0055795127618636095], [-0.0017979244064656866, -0.005583390464990996, 0.9999826090372864]], 'translation vector': [-0.008521917625347264, -0.003245150959530152, 0.000826648743016356]}\nD: {'rotation matrix': [[-0.814102, -0.273924, 0.512058], [-0.580426, 0.411969, -0.702416], [-0.018543, -0.86905, -0.494377]], 'translation vector': [4.865249, 2.49003, 1.4009]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.812183, -0.277025, 0.513436], [-0.583176, 0.410049, -0.70126], [-0.016267, -0.868975, -0.494589]], 'translation vector': [4.864513, 2.490983, 1.398342]}\nB: {'rotation matrix': [[-0.813534, -0.276094, 0.511796], [-0.581281, 0.411223, -0.702146], [-0.016604, -0.868716, -0.495032]], 'translation vector': [4.865518, 2.490622, 1.399591]}\nC: {'rotation matrix': [[0.999997135129029, 0.001605805084179667, 0.0018057529572375504], [-0.0016158596045621908, 0.9999825699875163, 0.0055795127618636095], [-0.0017979244064656866, -0.005583390464990996, 0.9999826090372864]], 'translation vector': [-0.008521917625347264, -0.003245150959530152, 0.000826648743016356]}\nD: {'rotation matrix': [[-0.814102, -0.273924, 0.512058], [-0.580426, 0.411969, -0.702416], [-0.018543, -0.86905, -0.494377]], 'translation vector': [4.865249, 2.49003, 1.4009]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_52_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_52_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_52_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_52_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.065966, 0.424992, -0.90279], [0.996327, -0.077558, 0.03629], [-0.054596, -0.901868, -0.428547]], 'translation vector': [3.688125, 7.382969, 1.65232]}\nB: {'rotation matrix': [[0.06445, 0.42596, -0.902444], [0.996336, -0.078418, 0.034141], [-0.056225, -0.901337, -0.429453]], 'translation vector': [3.691079, 7.385597, 1.655249]}\nC: {'rotation matrix': [[0.065269, 0.425434, -0.902633], [0.996301, -0.078461, 0.035062], [-0.055905, -0.901582, -0.428981]], 'translation vector': [3.688166, 7.384302, 1.653993]}\nD: {'rotation matrix': [[0.9999823018718307, 0.004888113491700664, 
-0.003558790677819051], [-0.004889637706536031, 0.9999888180833659, -0.00037767294265026564], [0.0035570250944470354, 0.00039493287161776724, 0.9999941352091379]], 'translation vector': [-0.0029368758378494064, 0.0007877967404965047, -0.003178400438716089]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.065966, 0.424992, -0.90279], [0.996327, -0.077558, 0.03629], [-0.054596, -0.901868, -0.428547]], 'translation vector': [3.688125, 7.382969, 1.65232]}\nB: {'rotation matrix': [[0.06445, 0.42596, -0.902444], [0.996336, -0.078418, 0.034141], [-0.056225, -0.901337, -0.429453]], 'translation vector': [3.691079, 7.385597, 1.655249]}\nC: {'rotation matrix': [[0.065269, 0.425434, -0.902633], [0.996301, -0.078461, 0.035062], [-0.055905, -0.901582, -0.428981]], 'translation vector': [3.688166, 7.384302, 1.653993]}\nD: {'rotation matrix': [[0.9999823018718307, 0.004888113491700664, -0.003558790677819051], [-0.004889637706536031, 0.9999888180833659, -0.00037767294265026564], [0.0035570250944470354, 0.00039493287161776724, 0.9999941352091379]], 'translation vector': [-0.0029368758378494064, 0.0007877967404965047, -0.003178400438716089]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_53_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_53_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_53_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_53_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[0.664692, -0.099755, 0.740428], [-0.744592, -0.007027, 0.667483], [-0.061382, -0.994987, -0.078948]], 'translation vector': [3.729187, 1.43308, 1.737059]}\nB: {'rotation matrix': [[0.660717, -0.100231, 0.743913], [-0.748068, -0.006018, 0.663595], [-0.062036, -0.994946, -0.078955]], 'translation vector': [3.728547, 1.433503, 1.735599]}\nC: {'rotation matrix': [[0.9999743209792964, 0.004172766279522147, 0.005832278992731541], [-0.004189646737158265, 0.9999872000751636, 0.0029117442423506165], [-0.0058200970417569145, -0.0029366540837721536, 0.9999788200004149]], 'translation vector': [0.0013138555211342773, -0.001864474585307807, 0.0012853107981345424]}\nD: {'rotation matrix': [[0.656146, -0.099388, 0.74806], [-0.75226, -0.007571, 0.658823], [-0.059815, -0.99502, -0.079733]], 'translation vector': [3.729275, 1.433124, 1.734442]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.664692, -0.099755, 0.740428], [-0.744592, -0.007027, 0.667483], [-0.061382, -0.994987, -0.078948]], 'translation vector': [3.729187, 1.43308, 1.737059]}\nB: {'rotation matrix': [[0.660717, -0.100231, 0.743913], [-0.748068, -0.006018, 0.663595], [-0.062036, -0.994946, -0.078955]], 'translation vector': [3.728547, 1.433503, 1.735599]}\nC: {'rotation matrix': [[0.9999743209792964, 0.004172766279522147, 0.005832278992731541], [-0.004189646737158265, 0.9999872000751636, 0.0029117442423506165], [-0.0058200970417569145, -0.0029366540837721536, 0.9999788200004149]], 'translation vector': [0.0013138555211342773, -0.001864474585307807, 0.0012853107981345424]}\nD: {'rotation matrix': [[0.656146, -0.099388, 0.74806], [-0.75226, -0.007571, 0.658823], [-0.059815, -0.99502, -0.079733]], 'translation vector': [3.729275, 1.433124, 1.734442]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_54_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_54_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_54_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_54_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.482382, -0.62548, 0.613257], [-0.875317, -0.317346, 0.364844], [-0.033588, -0.712788, -0.700575]], 'translation vector': [-0.164493, 3.070356, 1.320176]}\nB: {'rotation matrix': [[0.482432, -0.626604, 0.612068], [-0.875266, -0.317577, 0.364766], [-0.034185, -0.711697, -0.701654]], 'translation vector': [-0.163574, 3.070977, 1.321051]}\nC: {'rotation matrix': [[0.9999881393517817, -0.00035757344474597234, -0.005016643883040467], [0.0003166564592981544, 0.999967468385718, -0.008020454596773861], [0.005019965204090123, 0.008019005544894866, 0.9999547455108736]], 'translation vector': 
[-0.0028720129328290156, -0.004572041684123063, -0.0005047793498673403]}\nD: {'rotation matrix': [[0.482883, -0.62302, 0.615362], [-0.875067, -0.316913, 0.36582], [-0.032897, -0.715131, -0.698216]], 'translation vector': [-0.165581, 3.069752, 1.319227]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.482382, -0.62548, 0.613257], [-0.875317, -0.317346, 0.364844], [-0.033588, -0.712788, -0.700575]], 'translation vector': [-0.164493, 3.070356, 1.320176]}\nB: {'rotation matrix': [[0.482432, -0.626604, 0.612068], [-0.875266, -0.317577, 0.364766], [-0.034185, -0.711697, -0.701654]], 'translation vector': [-0.163574, 3.070977, 1.321051]}\nC: {'rotation matrix': [[0.9999881393517817, -0.00035757344474597234, -0.005016643883040467], [0.0003166564592981544, 0.999967468385718, -0.008020454596773861], [0.005019965204090123, 0.008019005544894866, 0.9999547455108736]], 'translation vector': [-0.0028720129328290156, -0.004572041684123063, -0.0005047793498673403]}\nD: {'rotation matrix': [[0.482883, -0.62302, 0.615362], [-0.875067, -0.316913, 0.36582], [-0.032897, -0.715131, -0.698216]], 'translation vector': [-0.165581, 3.069752, 1.319227]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_55_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_55_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_55_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_55_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[-0.254741, -0.436809, 0.862731], [-0.966661, 0.138975, -0.215064], [-0.025957, -0.888754, -0.457649]], 'translation vector': [1.470391, 3.880589, 1.437084]}\nB: {'rotation matrix': [[-0.255547, -0.436424, 0.862688], [-0.966448, 0.139272, -0.215827], [-0.025956, -0.888897, -0.457372]], 'translation vector': [1.471235, 3.880077, 1.436326]}\nC: {'rotation matrix': [[-0.25517, -0.435877, 0.863076], [-0.966551, 0.138838, -0.215646], [-0.025832, -0.889233, -0.456724]], 'translation vector': [1.470861, 3.880012, 1.43644]}\nD: {'rotation matrix': [[0.9999966837298163, 0.0008619563673921012, 0.0026549103476475175], [-0.0008486493068450321, 0.999987344007734, -0.004872285830769469], [-0.0026592683333448467, 0.0048702326102797195, 0.9999843899240968]], 'translation vector': [-0.0017811924174484517, 0.006401158448355426, 0.0014093231794466143]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.254741, -0.436809, 0.862731], [-0.966661, 0.138975, -0.215064], [-0.025957, -0.888754, -0.457649]], 'translation vector': [1.470391, 3.880589, 1.437084]}\nB: {'rotation matrix': [[-0.255547, -0.436424, 0.862688], [-0.966448, 0.139272, -0.215827], [-0.025956, -0.888897, -0.457372]], 'translation vector': [1.471235, 3.880077, 1.436326]}\nC: {'rotation matrix': [[-0.25517, -0.435877, 0.863076], [-0.966551, 0.138838, -0.215646], [-0.025832, -0.889233, -0.456724]], 'translation vector': [1.470861, 3.880012, 1.43644]}\nD: {'rotation matrix': [[0.9999966837298163, 0.0008619563673921012, 0.0026549103476475175], [-0.0008486493068450321, 0.999987344007734, -0.004872285830769469], [-0.0026592683333448467, 0.0048702326102797195, 0.9999843899240968]], 'translation vector': [-0.0017811924174484517, 0.006401158448355426, 0.0014093231794466143]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_56_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_56_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_56_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_56_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.877903, 0.287785, -0.382709], [0.47693, 0.596815, -0.645252], [0.042713, -0.748994, -0.661199]], 'translation vector': [3.16702, 3.626466, 1.453681]}\nB: {'rotation matrix': [[0.9999791193908465, -0.001916910413648059, -0.0061771002364880544], [0.001968865784627184, 0.9999613398790298, 0.008534807689413037], [0.006161535037181589, -0.008547114556570383, 0.9999441766313318]], 'translation vector': [0.0007167215000418725, 0.004111635033621663, 0.00058908656396639]}\nC: {'rotation matrix': [[-0.874912, 0.294682, -0.384306], [0.482557, 0.597398, -0.640512], [0.040836, -0.745841, 
-0.664871]], 'translation vector': [3.163697, 3.627347, 1.450583]}\nD: {'rotation matrix': [[-0.871313, 0.303569, -0.385564], [0.489353, 0.596266, -0.636396], [0.036709, -0.743177, -0.668087]], 'translation vector': [3.163155, 3.630899, 1.446354]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.877903, 0.287785, -0.382709], [0.47693, 0.596815, -0.645252], [0.042713, -0.748994, -0.661199]], 'translation vector': [3.16702, 3.626466, 1.453681]}\nB: {'rotation matrix': [[0.9999791193908465, -0.001916910413648059, -0.0061771002364880544], [0.001968865784627184, 0.9999613398790298, 0.008534807689413037], [0.006161535037181589, -0.008547114556570383, 0.9999441766313318]], 'translation vector': [0.0007167215000418725, 0.004111635033621663, 0.00058908656396639]}\nC: {'rotation matrix': [[-0.874912, 0.294682, -0.384306], [0.482557, 0.597398, -0.640512], [0.040836, -0.745841, -0.664871]], 'translation vector': [3.163697, 3.627347, 1.450583]}\nD: {'rotation matrix': [[-0.871313, 0.303569, -0.385564], [0.489353, 0.596266, -0.636396], [0.036709, -0.743177, -0.668087]], 'translation vector': [3.163155, 3.630899, 1.446354]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_57_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_57_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_57_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_57_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.078798, -0.401808, 0.912327], [-0.996147, 0.067086, -0.056491], [-0.038506, -0.913263, -0.405546]], 'translation vector': [2.214502, 1.810217, 1.39288]}\nB: {'rotation matrix': [[-0.078108, -0.404311, 0.91128], [-0.996161, 0.067892, -0.055261], [-0.039526, -0.912098, -0.408062]], 'translation vector': [2.215161, 1.809587, 1.395775]}\nC: {'rotation matrix': [[0.9999986851865202, 0.0008401734129283168, -0.0015770679658950186], [-0.0008378992163381685, 0.9999975925523196, 0.0021246735415499604], [0.0015786984008670537, -0.002123956111471475, 0.9999965494818978]], 'translation vector': [0.0020818624690659426, 0.003854659633225399, 0.00023093351122471795]}\nD: {'rotation matrix': [[-0.078693, -0.405741, 0.910594], [-0.996048, 0.069734, -0.055005], [-0.041182, -0.911324, -0.409625]], 'translation vector': [2.217248, 1.812374, 1.391779]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.078798, -0.401808, 0.912327], [-0.996147, 0.067086, -0.056491], [-0.038506, -0.913263, -0.405546]], 'translation vector': [2.214502, 1.810217, 1.39288]}\nB: {'rotation matrix': [[-0.078108, -0.404311, 0.91128], [-0.996161, 0.067892, -0.055261], [-0.039526, -0.912098, -0.408062]], 'translation vector': [2.215161, 1.809587, 1.395775]}\nC: {'rotation matrix': [[0.9999986851865202, 0.0008401734129283168, -0.0015770679658950186], [-0.0008378992163381685, 0.9999975925523196, 0.0021246735415499604], [0.0015786984008670537, -0.002123956111471475, 0.9999965494818978]], 'translation vector': [0.0020818624690659426, 0.003854659633225399, 0.00023093351122471795]}\nD: {'rotation matrix': [[-0.078693, -0.405741, 0.910594], [-0.996048, 0.069734, -0.055005], [-0.041182, -0.911324, -0.409625]], 'translation vector': [2.217248, 1.812374, 1.391779]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_58_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_58_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_58_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_58_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.529336, -0.227144, 0.817441], [-0.847759, 0.103788, -0.520128], [0.033303, -0.968315, -0.247502]], 'translation vector': [5.896173, 2.790533, 1.549775]}\nB: {'rotation matrix': [[0.9999873040333512, 0.004505537820277509, -0.002397070922509149], [-0.004504137793554688, 0.9999890779279523, 0.0010154010037770195], [0.0024022565915601136, -0.0010036320370672355, 0.999996209316038]], 'translation vector': [-0.001560160498486951, -0.0020049110640587564, -0.0017324624821037915]}\nC: {'rotation matrix': [[-0.531472, -0.2283, 0.815731], [-0.846401, 0.104685, -0.522156], [0.033813, 
-0.967947, -0.24887]], 'translation vector': [5.895259, 2.788617, 1.559572]}\nD: {'rotation matrix': [[-0.53062, -0.226646, 0.816746], [-0.846944, 0.10358, -0.521495], [0.033596, -0.968454, -0.246918]], 'translation vector': [5.896636, 2.790495, 1.551807]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.529336, -0.227144, 0.817441], [-0.847759, 0.103788, -0.520128], [0.033303, -0.968315, -0.247502]], 'translation vector': [5.896173, 2.790533, 1.549775]}\nB: {'rotation matrix': [[0.9999873040333512, 0.004505537820277509, -0.002397070922509149], [-0.004504137793554688, 0.9999890779279523, 0.0010154010037770195], [0.0024022565915601136, -0.0010036320370672355, 0.999996209316038]], 'translation vector': [-0.001560160498486951, -0.0020049110640587564, -0.0017324624821037915]}\nC: {'rotation matrix': [[-0.531472, -0.2283, 0.815731], [-0.846401, 0.104685, -0.522156], [0.033813, -0.967947, -0.24887]], 'translation vector': [5.895259, 2.788617, 1.559572]}\nD: {'rotation matrix': [[-0.53062, -0.226646, 0.816746], [-0.846944, 0.10358, -0.521495], [0.033596, -0.968454, -0.246918]], 'translation vector': [5.896636, 2.790495, 1.551807]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_59_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_59_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_59_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_59_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[0.9999735234224584, -0.0071926018984539266, -0.00013127408012229648], [0.007193266878786128, 0.999950750728307, 0.006784512971560901], [8.152710211630049e-05, -0.006784805304509037, 0.9999770195971557]], 'translation vector': [-0.005319134943293502, -0.0035091612770564717, 0.0004269635666275251]}\nB: {'rotation matrix': [[-0.879528, -0.314344, 0.357236], [-0.47435, 0.638683, -0.605869], [-0.03771, -0.702334, -0.710848]], 'translation vector': [3.140295, 1.690182, 1.269802]}\nC: {'rotation matrix': [[-0.879673, -0.316123, 0.355304], [-0.474189, 0.640089, -0.604509], [-0.036327, -0.700252, -0.712971]], 'translation vector': [3.138628, 1.688987, 1.26968]}\nD: {'rotation matrix': [[-0.879671, -0.317176, 0.354371], [-0.474219, 0.641391, -0.603103], [-0.036001, -0.698582, -0.714624]], 'translation vector': [3.137942, 1.687445, 1.270163]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999735234224584, -0.0071926018984539266, -0.00013127408012229648], [0.007193266878786128, 0.999950750728307, 0.006784512971560901], [8.152710211630049e-05, -0.006784805304509037, 0.9999770195971557]], 'translation vector': [-0.005319134943293502, -0.0035091612770564717, 0.0004269635666275251]}\nB: {'rotation matrix': [[-0.879528, -0.314344, 0.357236], [-0.47435, 0.638683, -0.605869], [-0.03771, -0.702334, -0.710848]], 'translation vector': [3.140295, 1.690182, 1.269802]}\nC: {'rotation matrix': [[-0.879673, -0.316123, 0.355304], [-0.474189, 0.640089, -0.604509], [-0.036327, -0.700252, -0.712971]], 'translation vector': [3.138628, 1.688987, 1.26968]}\nD: {'rotation matrix': [[-0.879671, -0.317176, 0.354371], [-0.474219, 0.641391, -0.603103], [-0.036001, -0.698582, -0.714624]], 'translation vector': [3.137942, 1.687445, 1.270163]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_60_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_60_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_60_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_60_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.303413, -0.396105, 0.866627], [-0.952854, 0.129076, -0.274605], [-0.003088, -0.909088, -0.416593]], 'translation vector': [3.699021, 3.5579, 1.347225]}\nB: {'rotation matrix': [[-0.298621, -0.390383, 0.870877], [-0.954343, 0.1293, -0.26928], [-0.007482, -0.911527, -0.411171]], 'translation vector': [3.695972, 3.555829, 1.344301]}\nC: {'rotation matrix': [[-0.295385, -0.381895, 0.875731], [-0.955321, 0.128105, -0.266366], [-0.010462, -0.915284, -0.402672]], 'translation vector': [3.694636, 3.554343, 1.343555]}\nD: {'rotation matrix': [[0.9999470991216314, -8.645859163103856e-05, 
-0.010261215579303299], [0.00018576960757566105, 0.999954044961018, 0.009631176353467179], [0.010258362047762553, -0.00963308504359663, 0.9999007352405366]], 'translation vector': [0.002023687193802637, -0.005328769253949428, -0.0010872499872041086]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.303413, -0.396105, 0.866627], [-0.952854, 0.129076, -0.274605], [-0.003088, -0.909088, -0.416593]], 'translation vector': [3.699021, 3.5579, 1.347225]}\nB: {'rotation matrix': [[-0.298621, -0.390383, 0.870877], [-0.954343, 0.1293, -0.26928], [-0.007482, -0.911527, -0.411171]], 'translation vector': [3.695972, 3.555829, 1.344301]}\nC: {'rotation matrix': [[-0.295385, -0.381895, 0.875731], [-0.955321, 0.128105, -0.266366], [-0.010462, -0.915284, -0.402672]], 'translation vector': [3.694636, 3.554343, 1.343555]}\nD: {'rotation matrix': [[0.9999470991216314, -8.645859163103856e-05, -0.010261215579303299], [0.00018576960757566105, 0.999954044961018, 0.009631176353467179], [0.010258362047762553, -0.00963308504359663, 0.9999007352405366]], 'translation vector': [0.002023687193802637, -0.005328769253949428, -0.0010872499872041086]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_61_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_61_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_61_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_61_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.996466, -0.001244, 0.083992], [-0.083908, 0.032374, 0.995948], [-0.003958, -0.999475, 0.032156]], 'translation vector': [2.320184, 5.772667, 1.343957]}\nB: {'rotation matrix': [[0.994982, 0.000179, 0.100051], [-0.099963, 0.043922, 0.994021], [-0.004216, -0.999035, 0.043719]], 'translation vector': [2.304385, 5.780403, 1.335008]}\nC: {'rotation matrix': [[0.995982, -5.3e-05, 0.089553], [-0.089485, 0.038471, 0.995245], [-0.003498, -0.99926, 0.038311]], 'translation vector': [2.323582, 5.777781, 1.339158]}\nD: {'rotation matrix': [[0.9997533523434441, 0.006129226982668424, -0.021333118212536917], [-0.0059433732378316425, 0.9999427902508286, 0.008815866423587717], [0.02138545815854184, -0.008687653795007199, 0.9997333060483455]], 'translation vector': [0.03726599986839263, -0.00376568964599322, 0.0024709716040858254]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.996466, -0.001244, 0.083992], [-0.083908, 0.032374, 0.995948], [-0.003958, -0.999475, 0.032156]], 'translation vector': [2.320184, 5.772667, 1.343957]}\nB: {'rotation matrix': [[0.994982, 0.000179, 0.100051], [-0.099963, 0.043922, 0.994021], [-0.004216, -0.999035, 0.043719]], 'translation vector': [2.304385, 5.780403, 1.335008]}\nC: {'rotation matrix': [[0.995982, -5.3e-05, 0.089553], [-0.089485, 0.038471, 0.995245], [-0.003498, -0.99926, 0.038311]], 'translation vector': [2.323582, 5.777781, 1.339158]}\nD: {'rotation matrix': [[0.9997533523434441, 0.006129226982668424, -0.021333118212536917], [-0.0059433732378316425, 0.9999427902508286, 0.008815866423587717], [0.02138545815854184, -0.008687653795007199, 0.9997333060483455]], 'translation vector': [0.03726599986839263, -0.00376568964599322, 0.0024709716040858254]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_62_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_62_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_62_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_62_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.980568, 0.060238, -0.186705], [0.195333, -0.388226, 0.900625], [-0.018232, -0.919594, -0.392448]], 'translation vector': [0.950732, 0.877848, 1.428266]}\nB: {'rotation matrix': [[0.980128, 0.059665, -0.18918], [0.197203, -0.396203, 0.896735], [-0.02145, -0.916222, -0.400096]], 'translation vector': [0.955184, 0.877183, 1.426427]}\nC: {'rotation matrix': [[0.979822, 0.061197, -0.190271], [0.198756, -0.398709, 0.89528], [-0.021074, -0.915034, -0.402827]], 'translation vector': [0.958185, 0.874355, 1.42036]}\nD: {'rotation matrix': [[0.9999275128023173, 0.008592189796214917, -0.008452948749791484], 
[-0.008364555101980847, 0.9996139276735185, 0.026478714279731284], [0.008677131864842291, -0.02640733905967608, 0.9996135651618278]], 'translation vector': [0.01623466192676637, 0.01524648577924892, 0.005088344597766196]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.980568, 0.060238, -0.186705], [0.195333, -0.388226, 0.900625], [-0.018232, -0.919594, -0.392448]], 'translation vector': [0.950732, 0.877848, 1.428266]}\nB: {'rotation matrix': [[0.980128, 0.059665, -0.18918], [0.197203, -0.396203, 0.896735], [-0.02145, -0.916222, -0.400096]], 'translation vector': [0.955184, 0.877183, 1.426427]}\nC: {'rotation matrix': [[0.979822, 0.061197, -0.190271], [0.198756, -0.398709, 0.89528], [-0.021074, -0.915034, -0.402827]], 'translation vector': [0.958185, 0.874355, 1.42036]}\nD: {'rotation matrix': [[0.9999275128023173, 0.008592189796214917, -0.008452948749791484], [-0.008364555101980847, 0.9996139276735185, 0.026478714279731284], [0.008677131864842291, -0.02640733905967608, 0.9996135651618278]], 'translation vector': [0.01623466192676637, 0.01524648577924892, 0.005088344597766196]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_63_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_63_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_63_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_63_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.986729, 
-0.120978, 0.108304], [-0.140205, -0.298357, 0.944101], [-0.081902, -0.946757, -0.311359]], 'translation vector': [1.126551, 1.554919, 1.507245]}\nB: {'rotation matrix': [[0.9999934208085578, -0.0036450224249741864, 0.00012972614797144], [0.003645024213464494, 0.9999784140945303, 0.005329513561381964], [-0.0001495669882096765, -0.005329922173518521, 0.999986531390115]], 'translation vector': [0.0031820360164642736, 0.0011599203443171113, -0.0008228633488331916]}\nC: {'rotation matrix': [[0.986279, -0.125902, 0.106787], [-0.140227, -0.297512, 0.944364], [-0.087128, -0.94638, -0.311084]], 'translation vector': [1.129178, 1.552708, 1.506911]}\nD: {'rotation matrix': [[0.987067, -0.117318, 0.109254], [-0.140068, -0.29963, 0.943717], [-0.077979, -0.946815, -0.312188]], 'translation vector': [1.12401, 1.557217, 1.508026]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.986729, -0.120978, 0.108304], [-0.140205, -0.298357, 0.944101], [-0.081902, -0.946757, -0.311359]], 'translation vector': [1.126551, 1.554919, 1.507245]}\nB: {'rotation matrix': [[0.9999934208085578, -0.0036450224249741864, 0.00012972614797144], [0.003645024213464494, 0.9999784140945303, 0.005329513561381964], [-0.0001495669882096765, -0.005329922173518521, 0.999986531390115]], 'translation vector': [0.0031820360164642736, 0.0011599203443171113, -0.0008228633488331916]}\nC: {'rotation matrix': [[0.986279, -0.125902, 0.106787], [-0.140227, -0.297512, 0.944364], [-0.087128, -0.94638, -0.311084]], 'translation vector': [1.129178, 1.552708, 1.506911]}\nD: {'rotation matrix': [[0.987067, -0.117318, 0.109254], [-0.140068, -0.29963, 0.943717], [-0.077979, -0.946815, -0.312188]], 'translation vector': [1.12401, 1.557217, 1.508026]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_64_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_64_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_64_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_64_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.34785, 0.12218, -0.929555], [0.936582, -0.000241, 0.350448], [0.042594, -0.992508, -0.114515]], 'translation vector': [2.709976, 2.082475, 1.464411]}\nB: {'rotation matrix': [[0.348346, 0.120067, -0.929645], [0.936396, 0.00053, 0.350944], [0.042629, -0.992766, -0.112245]], 'translation vector': [2.711116, 2.081261, 1.464473]}\nC: {'rotation matrix': [[0.9999996813993267, 0.0006538118368534332, -8.345927351154747e-06], [-0.0006528854877680812, 0.9999955547316733, 0.002915205326606667], [1.0134396361542957e-05, -0.002915969719233099, 0.9999961918791215]], 'translation vector': 
[-0.0015843322088442413, -0.00023090688010451998, -0.00020610665531961558]}\nD: {'rotation matrix': [[0.34832, 0.118845, -0.929811], [0.936428, 0.00049, 0.350861], [0.042154, -0.992913, -0.111119]], 'translation vector': [2.712512, 2.080143, 1.464219]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.34785, 0.12218, -0.929555], [0.936582, -0.000241, 0.350448], [0.042594, -0.992508, -0.114515]], 'translation vector': [2.709976, 2.082475, 1.464411]}\nB: {'rotation matrix': [[0.348346, 0.120067, -0.929645], [0.936396, 0.00053, 0.350944], [0.042629, -0.992766, -0.112245]], 'translation vector': [2.711116, 2.081261, 1.464473]}\nC: {'rotation matrix': [[0.9999996813993267, 0.0006538118368534332, -8.345927351154747e-06], [-0.0006528854877680812, 0.9999955547316733, 0.002915205326606667], [1.0134396361542957e-05, -0.002915969719233099, 0.9999961918791215]], 'translation vector': [-0.0015843322088442413, -0.00023090688010451998, -0.00020610665531961558]}\nD: {'rotation matrix': [[0.34832, 0.118845, -0.929811], [0.936428, 0.00049, 0.350861], [0.042154, -0.992913, -0.111119]], 'translation vector': [2.712512, 2.080143, 1.464219]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_65_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_65_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_65_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_65_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.436759, -0.378371, 0.816135], [-0.888864, -0.041911, 0.45625], [-0.138426, -0.924705, -0.354625]], 'translation vector': [2.634246, 6.766675, 1.418575]}\nB: {'rotation matrix': [[0.9999585207206515, -0.003693386756648477, 0.008341273999683681], [0.0038081841800440604, 0.9998961013275416, -0.013845142600948103], [-0.008289566528701709, 0.013875695158089622, 0.9998694901286989]], 'translation vector': [0.0023225809960010224, 0.0034699322670346255, -0.0016854173948939177]}\nC: {'rotation matrix': [[0.435508, -0.378095, 0.816931], [-0.889287, -0.039919, 0.455605], [-0.139651, -0.924906, -0.35362]], 'translation vector': [2.637863, 6.764602, 1.421504]}\nD: {'rotation matrix': [[0.436668, -0.378575, 0.81609], [-0.888812, -0.041337, 0.456404], [-0.139048, -0.924647, -0.354533]], 'translation vector': [2.636727, 6.764818, 1.421436]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.436759, -0.378371, 0.816135], [-0.888864, -0.041911, 0.45625], [-0.138426, -0.924705, -0.354625]], 'translation vector': [2.634246, 6.766675, 1.418575]}\nB: {'rotation matrix': [[0.9999585207206515, -0.003693386756648477, 0.008341273999683681], [0.0038081841800440604, 0.9998961013275416, -0.013845142600948103], [-0.008289566528701709, 0.013875695158089622, 0.9998694901286989]], 'translation vector': [0.0023225809960010224, 0.0034699322670346255, -0.0016854173948939177]}\nC: {'rotation matrix': [[0.435508, -0.378095, 0.816931], [-0.889287, -0.039919, 0.455605], [-0.139651, -0.924906, -0.35362]], 'translation vector': [2.637863, 6.764602, 1.421504]}\nD: {'rotation matrix': [[0.436668, -0.378575, 0.81609], [-0.888812, -0.041337, 0.456404], [-0.139048, -0.924647, -0.354533]], 'translation vector': [2.636727, 6.764818, 1.421436]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_66_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_66_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_66_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_66_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.994032, -0.08462, 0.068848], [-0.109061, -0.785368, 0.609346], [0.002509, -0.613218, -0.78991]], 'translation vector': [1.310442, 0.50567, 1.185464]}\nB: {'rotation matrix': [[0.994117, -0.083714, 0.068724], [-0.108266, -0.785982, 0.608696], [0.003059, -0.612556, -0.790421]], 'translation vector': [1.309119, 0.507232, 1.184932]}\nC: {'rotation matrix': [[0.9999858152282176, 0.005111825032549412, 0.0011655701624676482], [-0.005114663160965653, 0.9999842406155692, 0.0025048411154926643], [-0.0011536230586058512, -0.002510397377259177, 0.9999958649343827]], 'translation vector': 
[-0.003249111441538055, -0.00014047167946484862, 0.0018748641056286486]}\nD: {'rotation matrix': [[0.99397, -0.085854, 0.068209], [-0.109644, -0.78528, 0.609356], [0.001247, -0.61316, -0.789958]], 'translation vector': [1.312051, 0.504544, 1.186353]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.994032, -0.08462, 0.068848], [-0.109061, -0.785368, 0.609346], [0.002509, -0.613218, -0.78991]], 'translation vector': [1.310442, 0.50567, 1.185464]}\nB: {'rotation matrix': [[0.994117, -0.083714, 0.068724], [-0.108266, -0.785982, 0.608696], [0.003059, -0.612556, -0.790421]], 'translation vector': [1.309119, 0.507232, 1.184932]}\nC: {'rotation matrix': [[0.9999858152282176, 0.005111825032549412, 0.0011655701624676482], [-0.005114663160965653, 0.9999842406155692, 0.0025048411154926643], [-0.0011536230586058512, -0.002510397377259177, 0.9999958649343827]], 'translation vector': [-0.003249111441538055, -0.00014047167946484862, 0.0018748641056286486]}\nD: {'rotation matrix': [[0.99397, -0.085854, 0.068209], [-0.109644, -0.78528, 0.609356], [0.001247, -0.61316, -0.789958]], 'translation vector': [1.312051, 0.504544, 1.186353]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_67_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_67_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_67_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_67_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.184838, -0.635323, 0.7498], [-0.98276, 0.116273, -0.143746], [0.004144, -0.763444, -0.645861]], 'translation vector': [1.003433, 1.175637, 1.437383]}\nB: {'rotation matrix': [[-0.184201, -0.636583, 0.748887], [-0.982879, 0.11585, -0.143279], [0.00445, -0.762457, -0.647023]], 'translation vector': [1.004527, 1.174467, 1.438164]}\nC: {'rotation matrix': [[-0.184847, -0.63596, 0.749258], [-0.982754, 0.115597, -0.144335], [0.005179, -0.763016, -0.646359]], 'translation vector': [1.003287, 1.175642, 1.437097]}\nD: {'rotation matrix': [[0.9999986055688241, -0.001502869212153915, -0.0010557523049182509], [0.0014994906185130158, 0.9999931829288878, -0.003258507979167455], [0.0010610991854197703, 0.003257203501798957, 0.999994335142242]], 'translation vector': [0.0018601875903911935, -0.0006944560630759433, -0.0003289584369169929]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.184838, -0.635323, 0.7498], [-0.98276, 0.116273, -0.143746], [0.004144, -0.763444, -0.645861]], 'translation vector': [1.003433, 1.175637, 1.437383]}\nB: {'rotation matrix': [[-0.184201, -0.636583, 0.748887], [-0.982879, 0.11585, -0.143279], [0.00445, -0.762457, -0.647023]], 'translation vector': [1.004527, 1.174467, 1.438164]}\nC: {'rotation matrix': [[-0.184847, -0.63596, 0.749258], [-0.982754, 0.115597, -0.144335], [0.005179, -0.763016, -0.646359]], 'translation vector': [1.003287, 1.175642, 1.437097]}\nD: {'rotation matrix': [[0.9999986055688241, -0.001502869212153915, -0.0010557523049182509], [0.0014994906185130158, 0.9999931829288878, -0.003258507979167455], [0.0010610991854197703, 0.003257203501798957, 0.999994335142242]], 'translation vector': [0.0018601875903911935, -0.0006944560630759433, -0.0003289584369169929]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_68_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_68_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_68_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_68_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.200306, -0.502555, 0.841021], [-0.979587, 0.117581, -0.163047], [-0.016948, -0.856512, -0.515849]], 'translation vector': [2.912164, 4.287547, 1.28791]}\nB: {'rotation matrix': [[0.9999873120379679, 0.002880995332778331, 0.004067114056895998], [-0.002882835524779029, 0.9999951519449766, 0.0005437699978073844], [-0.004064760122873662, -0.0005563594395375273, 0.9999918873961958]], 'translation vector': [-0.0006376699678316555, 0.00863085045062073, -0.004612247460962893]}\nC: {'rotation matrix': [[-0.196493, -0.497947, 0.844654], [-0.980345, 0.115377, -0.160041], [-0.017762, -0.859498, 
-0.51083]], 'translation vector': [2.914041, 4.284364, 1.288676]}\nD: {'rotation matrix': [[-0.191542, -0.494391, 0.847873], [-0.981313, 0.112624, -0.156017], [-0.018357, -0.861913, -0.506724]], 'translation vector': [2.915548, 4.280207, 1.289523]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.200306, -0.502555, 0.841021], [-0.979587, 0.117581, -0.163047], [-0.016948, -0.856512, -0.515849]], 'translation vector': [2.912164, 4.287547, 1.28791]}\nB: {'rotation matrix': [[0.9999873120379679, 0.002880995332778331, 0.004067114056895998], [-0.002882835524779029, 0.9999951519449766, 0.0005437699978073844], [-0.004064760122873662, -0.0005563594395375273, 0.9999918873961958]], 'translation vector': [-0.0006376699678316555, 0.00863085045062073, -0.004612247460962893]}\nC: {'rotation matrix': [[-0.196493, -0.497947, 0.844654], [-0.980345, 0.115377, -0.160041], [-0.017762, -0.859498, -0.51083]], 'translation vector': [2.914041, 4.284364, 1.288676]}\nD: {'rotation matrix': [[-0.191542, -0.494391, 0.847873], [-0.981313, 0.112624, -0.156017], [-0.018357, -0.861913, -0.506724]], 'translation vector': [2.915548, 4.280207, 1.289523]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_69_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_69_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_69_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_69_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.734788, 0.391108, -0.554185], [0.678289, 0.419895, -0.603003], [-0.00314, -0.818977, -0.573818]], 'translation vector': [5.172049, 2.206823, 1.423276]}\nB: {'rotation matrix': [[-0.735622, 0.390311, -0.55364], [0.677385, 0.420178, -0.603821], [-0.003051, -0.819212, -0.573483]], 'translation vector': [5.170603, 2.207022, 1.424387]}\nC: {'rotation matrix': [[0.9999900939462747, -0.004290505816425898, 0.0009195812709430778], [0.0042999758462496885, 0.9999235506202053, -0.011514550430676607], [-0.0008694094505867219, 0.011517701317526751, 0.9999331376988827]], 'translation vector': [-0.00082889643265327, -0.0036811171481734295, -0.003335935402839496]}\nD: {'rotation matrix': [[-0.734029, 0.39088, -0.55535], [0.679109, 0.418321, -0.603174], [-0.003454, -0.81989, -0.57251]], 'translation vector': [5.174113, 2.207384, 1.421993]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.734788, 0.391108, -0.554185], [0.678289, 0.419895, -0.603003], [-0.00314, -0.818977, -0.573818]], 'translation vector': [5.172049, 2.206823, 1.423276]}\nB: {'rotation matrix': [[-0.735622, 0.390311, -0.55364], [0.677385, 0.420178, -0.603821], [-0.003051, -0.819212, -0.573483]], 'translation vector': [5.170603, 2.207022, 1.424387]}\nC: {'rotation matrix': [[0.9999900939462747, -0.004290505816425898, 0.0009195812709430778], [0.0042999758462496885, 0.9999235506202053, -0.011514550430676607], [-0.0008694094505867219, 0.011517701317526751, 0.9999331376988827]], 'translation vector': [-0.00082889643265327, -0.0036811171481734295, -0.003335935402839496]}\nD: {'rotation matrix': [[-0.734029, 0.39088, -0.55535], [0.679109, 0.418321, -0.603174], [-0.003454, -0.81989, -0.57251]], 'translation vector': [5.174113, 2.207384, 1.421993]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_70_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_70_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_70_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_70_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.231441, -0.548452, 0.803514], [-0.97266, -0.114172, 0.202231], [-0.019175, -0.828351, -0.559882]], 'translation vector': [1.70644, 2.067417, 1.36452]}\nB: {'rotation matrix': [[0.226329, -0.547002, 0.805955], [-0.973947, -0.114973, 0.195472], [-0.01426, -0.829199, -0.558772]], 'translation vector': [1.704179, 2.073727, 1.363978]}\nC: {'rotation matrix': [[0.9999250411381765, -0.0006651889723176079, -0.012145965376701085], [0.0006720606381267904, 0.999998823757648, 0.0005907315128157197], [0.012145773622422931, -0.0005986694012902582, 0.9999254709423802]], 'translation vector': 
[0.004506212132421972, 0.005377300872088764, -0.0022216956686449407]}\nD: {'rotation matrix': [[0.220852, -0.547607, 0.807063], [-0.975257, -0.115603, 0.188439], [-0.009892, -0.828711, -0.559589]], 'translation vector': [1.70298, 2.078271, 1.364741]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.231441, -0.548452, 0.803514], [-0.97266, -0.114172, 0.202231], [-0.019175, -0.828351, -0.559882]], 'translation vector': [1.70644, 2.067417, 1.36452]}\nB: {'rotation matrix': [[0.226329, -0.547002, 0.805955], [-0.973947, -0.114973, 0.195472], [-0.01426, -0.829199, -0.558772]], 'translation vector': [1.704179, 2.073727, 1.363978]}\nC: {'rotation matrix': [[0.9999250411381765, -0.0006651889723176079, -0.012145965376701085], [0.0006720606381267904, 0.999998823757648, 0.0005907315128157197], [0.012145773622422931, -0.0005986694012902582, 0.9999254709423802]], 'translation vector': [0.004506212132421972, 0.005377300872088764, -0.0022216956686449407]}\nD: {'rotation matrix': [[0.220852, -0.547607, 0.807063], [-0.975257, -0.115603, 0.188439], [-0.009892, -0.828711, -0.559589]], 'translation vector': [1.70298, 2.078271, 1.364741]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_71_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_71_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_71_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_71_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.122634, -0.436101, 0.891503], [-0.989963, 0.117329, -0.078783], [-0.070242, -0.892216, -0.446113]], 'translation vector': [3.397121, 4.680246, 1.399477]}\nB: {'rotation matrix': [[-0.131936, -0.437064, 0.889701], [-0.988639, 0.123231, -0.08607], [-0.072021, -0.890949, -0.448357]], 'translation vector': [3.380324, 4.680538, 1.400463]}\nC: {'rotation matrix': [[0.9999977526314023, 0.0018483257426729351, -0.00041760047623725873], [-0.0018481932934636253, 0.9999980766573896, 0.00088155464161366], [0.00041944957876658323, -0.0008809692307902862, 0.9999991600629605]], 'translation vector': [0.0013914092466897898, -0.0023350058272084695, 0.005488229626908758]}\nD: {'rotation matrix': [[-0.127448, -0.436966, 0.890403], [-0.989339, 0.119783, -0.082825], [-0.070464, -0.891467, -0.447574]], 'translation vector': [3.388002, 4.681844, 1.400749]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.122634, -0.436101, 0.891503], [-0.989963, 0.117329, -0.078783], [-0.070242, -0.892216, -0.446113]], 'translation vector': [3.397121, 4.680246, 1.399477]}\nB: {'rotation matrix': [[-0.131936, -0.437064, 0.889701], [-0.988639, 0.123231, -0.08607], [-0.072021, -0.890949, -0.448357]], 'translation vector': [3.380324, 4.680538, 1.400463]}\nC: {'rotation matrix': [[0.9999977526314023, 0.0018483257426729351, -0.00041760047623725873], [-0.0018481932934636253, 0.9999980766573896, 0.00088155464161366], [0.00041944957876658323, -0.0008809692307902862, 0.9999991600629605]], 'translation vector': [0.0013914092466897898, -0.0023350058272084695, 0.005488229626908758]}\nD: {'rotation matrix': [[-0.127448, -0.436966, 0.890403], [-0.989339, 0.119783, -0.082825], [-0.070464, -0.891467, -0.447574]], 'translation vector': [3.388002, 4.681844, 1.400749]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_72_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_72_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_72_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_72_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.018523, 0.490425, -0.871286], [0.99775, 0.065233, 0.015507], [0.064442, -0.869039, -0.49053]], 'translation vector': [3.293662, 2.081986, 1.287803]}\nB: {'rotation matrix': [[-0.012031, 0.49001, -0.871634], [0.99805, 0.059277, 0.019548], [0.061246, -0.869699, -0.489768]], 'translation vector': [3.297196, 2.086649, 1.289788]}\nC: {'rotation matrix': [[0.9998524821662119, 0.011110486941731442, -0.013083720076090504], [-0.011104516921568186, 0.9999377181682902, 0.0005342849090069752], [0.013088749073536516, -0.0003893477266632072, 0.9999144546094512]], 'translation vector': 
[-0.004299120253105748, -0.007367168150444914, 0.008875125679755236]}\nD: {'rotation matrix': [[-0.025914, 0.492003, -0.870208], [0.997577, 0.068946, 0.009274], [0.06456, -0.867859, -0.492598]], 'translation vector': [3.28927, 2.078913, 1.287729]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.018523, 0.490425, -0.871286], [0.99775, 0.065233, 0.015507], [0.064442, -0.869039, -0.49053]], 'translation vector': [3.293662, 2.081986, 1.287803]}\nB: {'rotation matrix': [[-0.012031, 0.49001, -0.871634], [0.99805, 0.059277, 0.019548], [0.061246, -0.869699, -0.489768]], 'translation vector': [3.297196, 2.086649, 1.289788]}\nC: {'rotation matrix': [[0.9998524821662119, 0.011110486941731442, -0.013083720076090504], [-0.011104516921568186, 0.9999377181682902, 0.0005342849090069752], [0.013088749073536516, -0.0003893477266632072, 0.9999144546094512]], 'translation vector': [-0.004299120253105748, -0.007367168150444914, 0.008875125679755236]}\nD: {'rotation matrix': [[-0.025914, 0.492003, -0.870208], [0.997577, 0.068946, 0.009274], [0.06456, -0.867859, -0.492598]], 'translation vector': [3.28927, 2.078913, 1.287729]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_73_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_73_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_73_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_73_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation 
matrix': [[-0.963707, 0.155577, -0.216944], [0.257928, 0.333002, -0.906964], [-0.06886, -0.930003, -0.361044]], 'translation vector': [5.977247, 2.820638, 1.467431]}\nB: {'rotation matrix': [[-0.963568, 0.155178, -0.217846], [0.258718, 0.334184, -0.906303], [-0.067837, -0.929646, -0.362157]], 'translation vector': [5.975011, 2.821235, 1.467201]}\nC: {'rotation matrix': [[-0.963289, 0.1552, -0.219062], [0.259892, 0.33449, -0.905855], [-0.067315, -0.929532, -0.362545]], 'translation vector': [5.973778, 2.820463, 1.46621]}\nD: {'rotation matrix': [[0.9999997264810032, 0.0001398888061159318, -0.0006664023459939352], [-0.00013916119976689398, 1.0000000334978612, 0.0010113123861037441], [0.0006662597489821222, -0.0010109047313490711, 0.9999993871422703]], 'translation vector': [0.0010522232087115668, -0.0015450075589019119, 0.0008995589608247201]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.963707, 0.155577, -0.216944], [0.257928, 0.333002, -0.906964], [-0.06886, -0.930003, -0.361044]], 'translation vector': [5.977247, 2.820638, 1.467431]}\nB: {'rotation matrix': [[-0.963568, 0.155178, -0.217846], [0.258718, 0.334184, -0.906303], [-0.067837, -0.929646, -0.362157]], 'translation vector': [5.975011, 2.821235, 1.467201]}\nC: {'rotation matrix': [[-0.963289, 0.1552, -0.219062], [0.259892, 0.33449, -0.905855], [-0.067315, -0.929532, -0.362545]], 'translation vector': [5.973778, 2.820463, 1.46621]}\nD: {'rotation matrix': [[0.9999997264810032, 0.0001398888061159318, -0.0006664023459939352], [-0.00013916119976689398, 1.0000000334978612, 0.0010113123861037441], [0.0006662597489821222, -0.0010109047313490711, 0.9999993871422703]], 'translation vector': [0.0010522232087115668, -0.0015450075589019119, 0.0008995589608247201]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_74_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_74_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_74_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_74_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.608915, -0.355061, 0.709333], [-0.792506, -0.310645, 0.524818], [0.034009, -0.881721, -0.470545]], 'translation vector': [3.226315, 3.403534, 1.349898]}\nB: {'rotation matrix': [[0.615195, -0.359882, 0.701442], [-0.787888, -0.311929, 0.530973], [0.027713, -0.87931, -0.475444]], 'translation vector': [3.233385, 3.405524, 1.366998]}\nC: {'rotation matrix': [[0.613018, -0.357666, 0.704474], [-0.789518, -0.310613, 0.529321], [0.029499, -0.880678, -0.472795]], 'translation vector': [3.232441, 3.405463, 1.362879]}\nD: {'rotation matrix': [[0.9997863328706507, 0.00048772072536455783, 
-0.020639341477887214], [-0.000435326360773427, 0.9999962939853105, 0.002563461941184151], [0.020641174681034113, -0.002552850852538859, 0.9997836062327491]], 'translation vector': [0.014979793826815802, -0.00028266630712581176, -0.003454343250508085]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.608915, -0.355061, 0.709333], [-0.792506, -0.310645, 0.524818], [0.034009, -0.881721, -0.470545]], 'translation vector': [3.226315, 3.403534, 1.349898]}\nB: {'rotation matrix': [[0.615195, -0.359882, 0.701442], [-0.787888, -0.311929, 0.530973], [0.027713, -0.87931, -0.475444]], 'translation vector': [3.233385, 3.405524, 1.366998]}\nC: {'rotation matrix': [[0.613018, -0.357666, 0.704474], [-0.789518, -0.310613, 0.529321], [0.029499, -0.880678, -0.472795]], 'translation vector': [3.232441, 3.405463, 1.362879]}\nD: {'rotation matrix': [[0.9997863328706507, 0.00048772072536455783, -0.020639341477887214], [-0.000435326360773427, 0.9999962939853105, 0.002563461941184151], [0.020641174681034113, -0.002552850852538859, 0.9997836062327491]], 'translation vector': [0.014979793826815802, -0.00028266630712581176, -0.003454343250508085]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_75_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_75_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_75_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_75_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": 
"A: {'rotation matrix': [[0.9999991463733822, 0.0008702753414361648, -0.0009523887468317617], [-0.0008598677378153328, 0.999935869624125, 0.011274652427134848], [0.0009622841974666614, -0.011274638788349072, 0.9999353623727418]], 'translation vector': [-0.0018668216035679919, 0.003964358597176476, 0.0035095844812185473]}\nB: {'rotation matrix': [[0.995169, 0.04021, -0.089565], [0.098119, -0.43886, 0.893182], [-0.003392, -0.897655, -0.440686]], 'translation vector': [3.819187, 1.33594, 1.360146]}\nC: {'rotation matrix': [[0.994619, 0.036032, -0.097136], [0.10302, -0.443404, 0.890382], [-0.010989, -0.895597, -0.44473]], 'translation vector': [3.820524, 1.337409, 1.359976]}\nD: {'rotation matrix': [[0.995617, 0.045838, -0.081525], [0.09337, -0.436452, 0.89487], [0.005437, -0.898559, -0.438819]], 'translation vector': [3.821348, 1.335292, 1.36241]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999991463733822, 0.0008702753414361648, -0.0009523887468317617], [-0.0008598677378153328, 0.999935869624125, 0.011274652427134848], [0.0009622841974666614, -0.011274638788349072, 0.9999353623727418]], 'translation vector': [-0.0018668216035679919, 0.003964358597176476, 0.0035095844812185473]}\nB: {'rotation matrix': [[0.995169, 0.04021, -0.089565], [0.098119, -0.43886, 0.893182], [-0.003392, -0.897655, -0.440686]], 'translation vector': [3.819187, 1.33594, 1.360146]}\nC: {'rotation matrix': [[0.994619, 0.036032, -0.097136], [0.10302, -0.443404, 0.890382], [-0.010989, -0.895597, -0.44473]], 'translation vector': [3.820524, 1.337409, 1.359976]}\nD: {'rotation matrix': [[0.995617, 0.045838, -0.081525], [0.09337, -0.436452, 0.89487], [0.005437, -0.898559, -0.438819]], 'translation vector': [3.821348, 1.335292, 1.36241]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_76_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_76_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_76_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_76_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.882555, 0.386221, -0.268197], [0.46123, 0.600144, -0.653524], [-0.091448, -0.700472, -0.707797]], 'translation vector': [4.952963, 3.575409, 1.461658]}\nB: {'rotation matrix': [[-0.884605, 0.387053, -0.260122], [0.456774, 0.606746, -0.650551], [-0.09397, -0.694298, -0.713526]], 'translation vector': [4.944654, 3.579183, 1.459738]}\nC: {'rotation matrix': [[-0.883899, 0.386346, -0.263552], [0.458302, 0.603262, -0.652713], [-0.093182, -0.697719, -0.710286]], 'translation vector': [4.946745, 3.577697, 1.460677]}\nD: {'rotation matrix': [[0.9999447182645155, 0.005004874877878123, 
-0.00920632642357687], [-0.0050456554798214885, 0.9999777740634089, -0.004314913443862226], [0.009184864298060464, 0.004360756390993736, 0.9999481114756171]], 'translation vector': [0.007685923361448133, 0.0066471354724360054, 0.0017036092209536946]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.882555, 0.386221, -0.268197], [0.46123, 0.600144, -0.653524], [-0.091448, -0.700472, -0.707797]], 'translation vector': [4.952963, 3.575409, 1.461658]}\nB: {'rotation matrix': [[-0.884605, 0.387053, -0.260122], [0.456774, 0.606746, -0.650551], [-0.09397, -0.694298, -0.713526]], 'translation vector': [4.944654, 3.579183, 1.459738]}\nC: {'rotation matrix': [[-0.883899, 0.386346, -0.263552], [0.458302, 0.603262, -0.652713], [-0.093182, -0.697719, -0.710286]], 'translation vector': [4.946745, 3.577697, 1.460677]}\nD: {'rotation matrix': [[0.9999447182645155, 0.005004874877878123, -0.00920632642357687], [-0.0050456554798214885, 0.9999777740634089, -0.004314913443862226], [0.009184864298060464, 0.004360756390993736, 0.9999481114756171]], 'translation vector': [0.007685923361448133, 0.0066471354724360054, 0.0017036092209536946]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_77_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_77_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_77_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_77_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.625113, 0.311868, -0.715523], [0.780406, -0.266348, 0.565708], [-0.014152, -0.912029, -0.409881]], 'translation vector': [1.602076, 0.627028, 1.325196]}\nB: {'rotation matrix': [[0.622635, 0.31497, -0.716324], [0.782419, -0.264717, 0.563689], [-0.012077, -0.911438, -0.411261]], 'translation vector': [1.601839, 0.627416, 1.324643]}\nC: {'rotation matrix': [[0.624152, 0.313246, -0.715759], [0.781196, -0.26537, 0.565077], [-0.012933, -0.911842, -0.410338]], 'translation vector': [1.601807, 0.626749, 1.324787]}\nD: {'rotation matrix': [[0.999966975162773, 0.003632343802915245, 0.007281117690826753], [-0.0036218018444724924, 0.9999920045733598, -0.0017116834889060193], [-0.007286301405287572, 0.0016849146368096719, 0.9999717827304806]], 'translation vector': [-0.0025932164044990547, 0.0007159923074403496, -0.0006736212556601728]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.625113, 0.311868, -0.715523], [0.780406, -0.266348, 0.565708], [-0.014152, -0.912029, -0.409881]], 'translation vector': [1.602076, 0.627028, 1.325196]}\nB: {'rotation matrix': [[0.622635, 0.31497, -0.716324], [0.782419, -0.264717, 0.563689], [-0.012077, -0.911438, -0.411261]], 'translation vector': [1.601839, 0.627416, 1.324643]}\nC: {'rotation matrix': [[0.624152, 0.313246, -0.715759], [0.781196, -0.26537, 0.565077], [-0.012933, -0.911842, -0.410338]], 'translation vector': [1.601807, 0.626749, 1.324787]}\nD: {'rotation matrix': [[0.999966975162773, 0.003632343802915245, 0.007281117690826753], [-0.0036218018444724924, 0.9999920045733598, -0.0017116834889060193], [-0.007286301405287572, 0.0016849146368096719, 0.9999717827304806]], 'translation vector': [-0.0025932164044990547, 0.0007159923074403496, -0.0006736212556601728]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_78_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_78_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_78_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_78_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.198116, 0.271577, -0.941805], [0.979964, -0.034779, 0.196115], [0.020505, -0.961788, -0.273026]], 'translation vector': [3.606948, 3.761193, 1.556592]}\nB: {'rotation matrix': [[0.193825, 0.274451, -0.941864], [0.980909, -0.038747, 0.19057], [0.015807, -0.96082, -0.276722]], 'translation vector': [3.608205, 3.76769, 1.544741]}\nC: {'rotation matrix': [[0.999997674889758, 0.0021739401100205674, -0.00025104493249259135], [-0.002175952729401346, 0.9999848730149307, -0.004981286419615835], [0.0002396488755047073, 0.004981659902978713, 0.9999879544274658]], 'translation vector': 
[0.0009190453627176964, -0.0033553865594018184, -0.0035327864721872437]}\nD: {'rotation matrix': [[0.190217, 0.28361, -0.939884], [0.981635, -0.040777, 0.186362], [0.014528, -0.958072, -0.286158]], 'translation vector': [3.605221, 3.771751, 1.549751]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.198116, 0.271577, -0.941805], [0.979964, -0.034779, 0.196115], [0.020505, -0.961788, -0.273026]], 'translation vector': [3.606948, 3.761193, 1.556592]}\nB: {'rotation matrix': [[0.193825, 0.274451, -0.941864], [0.980909, -0.038747, 0.19057], [0.015807, -0.96082, -0.276722]], 'translation vector': [3.608205, 3.76769, 1.544741]}\nC: {'rotation matrix': [[0.999997674889758, 0.0021739401100205674, -0.00025104493249259135], [-0.002175952729401346, 0.9999848730149307, -0.004981286419615835], [0.0002396488755047073, 0.004981659902978713, 0.9999879544274658]], 'translation vector': [0.0009190453627176964, -0.0033553865594018184, -0.0035327864721872437]}\nD: {'rotation matrix': [[0.190217, 0.28361, -0.939884], [0.981635, -0.040777, 0.186362], [0.014528, -0.958072, -0.286158]], 'translation vector': [3.605221, 3.771751, 1.549751]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_79_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_79_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_79_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_79_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.631227, -0.321947, 0.705622], [-0.775405, -0.28226, 0.564869], [0.017311, -0.903703, -0.427809]], 'translation vector': [-0.207113, 0.785695, 1.605991]}\nB: {'rotation matrix': [[0.9999995913321388, 0.0005903556746734515, -0.0005873839987845575], [-0.0005915541354645518, 0.9999966980036584, -0.0023348317149788846], [0.0005870160507556152, 0.00233575631512544, 0.9999968115853851]], 'translation vector': [0.0016624461367128474, -0.0028184747771204943, -0.001607026045218174]}\nC: {'rotation matrix': [[0.628117, -0.317695, 0.71031], [-0.777888, -0.278629, 0.563255], [0.018969, -0.906331, -0.422142]], 'translation vector': [-0.210483, 0.781575, 1.607029]}\nD: {'rotation matrix': [[0.626043, -0.313657, 0.713926], [-0.779508, -0.27628, 0.562171], [0.020914, -0.908454, -0.417462]], 'translation vector': [-0.212996, 0.77858, 1.610364]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.631227, -0.321947, 0.705622], [-0.775405, -0.28226, 0.564869], [0.017311, -0.903703, -0.427809]], 'translation vector': [-0.207113, 0.785695, 1.605991]}\nB: {'rotation matrix': [[0.9999995913321388, 0.0005903556746734515, -0.0005873839987845575], [-0.0005915541354645518, 0.9999966980036584, -0.0023348317149788846], [0.0005870160507556152, 0.00233575631512544, 0.9999968115853851]], 'translation vector': [0.0016624461367128474, -0.0028184747771204943, -0.001607026045218174]}\nC: {'rotation matrix': [[0.628117, -0.317695, 0.71031], [-0.777888, -0.278629, 0.563255], [0.018969, -0.906331, -0.422142]], 'translation vector': [-0.210483, 0.781575, 1.607029]}\nD: {'rotation matrix': [[0.626043, -0.313657, 0.713926], [-0.779508, -0.27628, 0.562171], [0.020914, -0.908454, -0.417462]], 'translation vector': [-0.212996, 0.77858, 1.610364]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_80_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_80_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_80_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_80_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999006085846415, 0.014105987403183131, 0.0007516964397581732], [-0.014115634211093402, 0.9997972933579915, 0.014373215967786759], [-0.0005489990557007766, -0.014382715808085993, 0.9998969169271865]], 'translation vector': [-0.02535757146051898, 0.0018154305901567636, 0.010236799804218322]}\nB: {'rotation matrix': [[0.151948, 0.599833, -0.785565], [0.987995, -0.114601, 0.103597], [-0.027885, -0.791875, -0.610046]], 'translation vector': [3.432288, 3.133084, 1.213871]}\nC: {'rotation matrix': [[0.14922, 0.604558, -0.78246], [0.988532, -0.109774, 0.103704], [-0.023198, -0.788961, 
-0.614005]], 'translation vector': [3.429968, 3.121084, 1.211424]}\nD: {'rotation matrix': [[0.14748, 0.608832, -0.77947], [0.988883, -0.105872, 0.104407], [-0.018958, -0.786202, -0.617678]], 'translation vector': [3.426714, 3.1102, 1.209074]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999006085846415, 0.014105987403183131, 0.0007516964397581732], [-0.014115634211093402, 0.9997972933579915, 0.014373215967786759], [-0.0005489990557007766, -0.014382715808085993, 0.9998969169271865]], 'translation vector': [-0.02535757146051898, 0.0018154305901567636, 0.010236799804218322]}\nB: {'rotation matrix': [[0.151948, 0.599833, -0.785565], [0.987995, -0.114601, 0.103597], [-0.027885, -0.791875, -0.610046]], 'translation vector': [3.432288, 3.133084, 1.213871]}\nC: {'rotation matrix': [[0.14922, 0.604558, -0.78246], [0.988532, -0.109774, 0.103704], [-0.023198, -0.788961, -0.614005]], 'translation vector': [3.429968, 3.121084, 1.211424]}\nD: {'rotation matrix': [[0.14748, 0.608832, -0.77947], [0.988883, -0.105872, 0.104407], [-0.018958, -0.786202, -0.617678]], 'translation vector': [3.426714, 3.1102, 1.209074]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_81_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_81_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_81_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_81_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation 
matrix': [[0.720072, 0.306191, -0.62269], [0.693309, -0.280455, 0.663829], [0.028622, -0.909721, -0.414232]], 'translation vector': [3.433251, 3.053234, 1.552574]}\nB: {'rotation matrix': [[0.715824, 0.307759, -0.626802], [0.697706, -0.278807, 0.659904], [0.028335, -0.909698, -0.414302]], 'translation vector': [3.42786, 3.050569, 1.552797]}\nC: {'rotation matrix': [[0.99998073687791, -0.005433231351435887, 0.0028092605219069734], [0.0054430621429484745, 0.9999793374936902, -0.0033485077732525377], [-0.00279094918696337, 0.0033635449074639256, 0.9999906279011188]], 'translation vector': [0.0048247922808197785, -0.007326500694675886, 7.779843116662022e-05]}\nD: {'rotation matrix': [[0.717959, 0.306862, -0.624797], [0.695515, -0.279904, 0.66175], [0.028183, -0.909665, -0.414386]], 'translation vector': [3.431505, 3.053102, 1.552563]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.720072, 0.306191, -0.62269], [0.693309, -0.280455, 0.663829], [0.028622, -0.909721, -0.414232]], 'translation vector': [3.433251, 3.053234, 1.552574]}\nB: {'rotation matrix': [[0.715824, 0.307759, -0.626802], [0.697706, -0.278807, 0.659904], [0.028335, -0.909698, -0.414302]], 'translation vector': [3.42786, 3.050569, 1.552797]}\nC: {'rotation matrix': [[0.99998073687791, -0.005433231351435887, 0.0028092605219069734], [0.0054430621429484745, 0.9999793374936902, -0.0033485077732525377], [-0.00279094918696337, 0.0033635449074639256, 0.9999906279011188]], 'translation vector': [0.0048247922808197785, -0.007326500694675886, 7.779843116662022e-05]}\nD: {'rotation matrix': [[0.717959, 0.306862, -0.624797], [0.695515, -0.279904, 0.66175], [0.028183, -0.909665, -0.414386]], 'translation vector': [3.431505, 3.053102, 1.552563]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_82_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_82_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_82_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_82_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.863341, -0.254981, 0.435461], [-0.503057, 0.367008, -0.782457], [0.039694, -0.894589, -0.445123]], 'translation vector': [2.006748, 3.81545, 1.542323]}\nB: {'rotation matrix': [[-0.863173, -0.254818, 0.43589], [-0.503388, 0.367381, -0.782069], [0.039148, -0.894483, -0.445386]], 'translation vector': [2.007018, 3.816806, 1.542476]}\nC: {'rotation matrix': [[-0.863454, -0.255279, 0.435064], [-0.502805, 0.366433, -0.782888], [0.040433, -0.89474, -0.444754]], 'translation vector': [2.007318, 3.814646, 1.54216]}\nD: {'rotation matrix': [[0.9999972277888285, -0.0019486605164344517, 
0.0010264732410869921], [0.0019490971963479567, 0.9999969267726074, -0.0010703044149313807], [-0.0010241404575203601, 0.0010719252856060263, 0.9999989421516985]], 'translation vector': [-0.0025894589048998107, 0.007141119527829198, -0.0014552230705469071]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.863341, -0.254981, 0.435461], [-0.503057, 0.367008, -0.782457], [0.039694, -0.894589, -0.445123]], 'translation vector': [2.006748, 3.81545, 1.542323]}\nB: {'rotation matrix': [[-0.863173, -0.254818, 0.43589], [-0.503388, 0.367381, -0.782069], [0.039148, -0.894483, -0.445386]], 'translation vector': [2.007018, 3.816806, 1.542476]}\nC: {'rotation matrix': [[-0.863454, -0.255279, 0.435064], [-0.502805, 0.366433, -0.782888], [0.040433, -0.89474, -0.444754]], 'translation vector': [2.007318, 3.814646, 1.54216]}\nD: {'rotation matrix': [[0.9999972277888285, -0.0019486605164344517, 0.0010264732410869921], [0.0019490971963479567, 0.9999969267726074, -0.0010703044149313807], [-0.0010241404575203601, 0.0010719252856060263, 0.9999989421516985]], 'translation vector': [-0.0025894589048998107, 0.007141119527829198, -0.0014552230705469071]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_83_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_83_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_83_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_83_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[0.987787, 0.108072, -0.112241], [0.155697, -0.656841, 0.73778], [0.006009, -0.746244, -0.665645]], 'translation vector': [4.649458, 4.057209, 1.404581]}\nB: {'rotation matrix': [[0.988022, 0.106035, -0.112112], [0.15424, -0.656197, 0.738658], [0.004756, -0.747102, -0.664692]], 'translation vector': [4.650307, 4.057695, 1.405486]}\nC: {'rotation matrix': [[0.9999939235077703, -0.0009864268743032946, -0.003107875618402941], [0.0009789613139893545, 0.9999966009200537, -0.0022176346298812244], [0.003110843270450784, 0.0022141652882606867, 0.9999926277610204]], 'translation vector': [-0.007831508873088033, -0.00424079700623059, -0.0006393879079424902]}\nD: {'rotation matrix': [[0.987654, 0.108357, -0.113131], [0.15654, -0.65545, 0.738837], [0.005906, -0.747425, -0.66432]], 'translation vector': [4.648766, 4.054578, 1.401957]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.987787, 0.108072, -0.112241], [0.155697, -0.656841, 0.73778], [0.006009, -0.746244, -0.665645]], 'translation vector': [4.649458, 4.057209, 1.404581]}\nB: {'rotation matrix': [[0.988022, 0.106035, -0.112112], [0.15424, -0.656197, 0.738658], [0.004756, -0.747102, -0.664692]], 'translation vector': [4.650307, 4.057695, 1.405486]}\nC: {'rotation matrix': [[0.9999939235077703, -0.0009864268743032946, -0.003107875618402941], [0.0009789613139893545, 0.9999966009200537, -0.0022176346298812244], [0.003110843270450784, 0.0022141652882606867, 0.9999926277610204]], 'translation vector': [-0.007831508873088033, -0.00424079700623059, -0.0006393879079424902]}\nD: {'rotation matrix': [[0.987654, 0.108357, -0.113131], [0.15654, -0.65545, 0.738837], [0.005906, -0.747425, -0.66432]], 'translation vector': [4.648766, 4.054578, 1.401957]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_84_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_84_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_84_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_84_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9997717908276891, -0.014061705239683872, 0.016112763216433307], [0.0140800110532502, 0.9999007363199941, -0.0010460438789438157], [-0.01609611583081038, 0.0012720455953134791, 0.9998692891144138]], 'translation vector': [-0.018076948566243978, 0.0017768934909982992, 0.0006580952284183095]}\nB: {'rotation matrix': [[-0.782674, -0.257014, 0.566891], [-0.62216, 0.296092, -0.724739], [0.018416, -0.919931, -0.391647]], 'translation vector': [3.075882, 2.930909, 1.465913]}\nC: {'rotation matrix': [[-0.790591, -0.247688, 0.560015], [-0.61223, 0.301979, -0.730742], [0.011883, -0.920576, 
-0.390384]], 'translation vector': [3.085087, 2.935415, 1.467454]}\nD: {'rotation matrix': [[-0.773889, -0.266244, 0.574639], [-0.632944, 0.293814, -0.716279], [0.021868, -0.918034, -0.395897]], 'translation vector': [3.064209, 2.92712, 1.46117]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9997717908276891, -0.014061705239683872, 0.016112763216433307], [0.0140800110532502, 0.9999007363199941, -0.0010460438789438157], [-0.01609611583081038, 0.0012720455953134791, 0.9998692891144138]], 'translation vector': [-0.018076948566243978, 0.0017768934909982992, 0.0006580952284183095]}\nB: {'rotation matrix': [[-0.782674, -0.257014, 0.566891], [-0.62216, 0.296092, -0.724739], [0.018416, -0.919931, -0.391647]], 'translation vector': [3.075882, 2.930909, 1.465913]}\nC: {'rotation matrix': [[-0.790591, -0.247688, 0.560015], [-0.61223, 0.301979, -0.730742], [0.011883, -0.920576, -0.390384]], 'translation vector': [3.085087, 2.935415, 1.467454]}\nD: {'rotation matrix': [[-0.773889, -0.266244, 0.574639], [-0.632944, 0.293814, -0.716279], [0.021868, -0.918034, -0.395897]], 'translation vector': [3.064209, 2.92712, 1.46117]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_85_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_85_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_85_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_85_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.516456, 0.471348, -0.714916], [0.852952, -0.209256, 0.47821], [0.075803, -0.856763, -0.510109]], 'translation vector': [4.97866, 0.423553, 1.591931]}\nB: {'rotation matrix': [[0.514459, 0.473148, -0.715167], [0.854068, -0.208015, 0.476757], [0.076811, -0.856073, -0.511116]], 'translation vector': [4.979161, 0.423603, 1.588672]}\nC: {'rotation matrix': [[0.513176, 0.475448, -0.714563], [0.854688, -0.206948, 0.476112], [0.078489, -0.855056, -0.51256]], 'translation vector': [4.976408, 0.420953, 1.588878]}\nD: {'rotation matrix': [[0.9999760932186582, 0.0012257187804321542, -0.006834405192096566], [-0.001218782424370881, 0.999999818921041, 0.0008912149688576313], [0.006835564452074455, -0.0008827669412868106, 0.9999768915925589]], 'translation vector': [-0.008703367750305002, 0.013496314561282197, 0.004560884153690381]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.516456, 0.471348, -0.714916], [0.852952, -0.209256, 0.47821], [0.075803, -0.856763, -0.510109]], 'translation vector': [4.97866, 0.423553, 1.591931]}\nB: {'rotation matrix': [[0.514459, 0.473148, -0.715167], [0.854068, -0.208015, 0.476757], [0.076811, -0.856073, -0.511116]], 'translation vector': [4.979161, 0.423603, 1.588672]}\nC: {'rotation matrix': [[0.513176, 0.475448, -0.714563], [0.854688, -0.206948, 0.476112], [0.078489, -0.855056, -0.51256]], 'translation vector': [4.976408, 0.420953, 1.588878]}\nD: {'rotation matrix': [[0.9999760932186582, 0.0012257187804321542, -0.006834405192096566], [-0.001218782424370881, 0.999999818921041, 0.0008912149688576313], [0.006835564452074455, -0.0008827669412868106, 0.9999768915925589]], 'translation vector': [-0.008703367750305002, 0.013496314561282197, 0.004560884153690381]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_86_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_86_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_86_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_86_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.448824, 0.136877, -0.883075], [0.892984, 0.105987, -0.437432], [0.033721, -0.984902, -0.169799]], 'translation vector': [3.315047, 2.127717, 1.592265]}\nB: {'rotation matrix': [[-0.452202, 0.137197, -0.8813], [0.891317, 0.105713, -0.440885], [0.032677, -0.984887, -0.170089]], 'translation vector': [3.315698, 2.124716, 1.590659]}\nC: {'rotation matrix': [[-0.449366, 0.136914, -0.882794], [0.892692, 0.106685, -0.437859], [0.034232, -0.984821, -0.170162]], 'translation vector': [3.315906, 2.123902, 1.590809]}\nD: {'rotation matrix': [[0.9999985219155758, 3.3460926462502616e-05, 
-0.0017161305114821916], [-2.2107949587045813e-05, 0.9999776535003064, 0.006715346047559243], [0.0017166465785429835, -0.006715383734168117, 0.9999757458350781]], 'translation vector': [0.0004478029195722488, -0.00269782175769262, 0.002036977128338613]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.448824, 0.136877, -0.883075], [0.892984, 0.105987, -0.437432], [0.033721, -0.984902, -0.169799]], 'translation vector': [3.315047, 2.127717, 1.592265]}\nB: {'rotation matrix': [[-0.452202, 0.137197, -0.8813], [0.891317, 0.105713, -0.440885], [0.032677, -0.984887, -0.170089]], 'translation vector': [3.315698, 2.124716, 1.590659]}\nC: {'rotation matrix': [[-0.449366, 0.136914, -0.882794], [0.892692, 0.106685, -0.437859], [0.034232, -0.984821, -0.170162]], 'translation vector': [3.315906, 2.123902, 1.590809]}\nD: {'rotation matrix': [[0.9999985219155758, 3.3460926462502616e-05, -0.0017161305114821916], [-2.2107949587045813e-05, 0.9999776535003064, 0.006715346047559243], [0.0017166465785429835, -0.006715383734168117, 0.9999757458350781]], 'translation vector': [0.0004478029195722488, -0.00269782175769262, 0.002036977128338613]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_87_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_87_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_87_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_87_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": 
"A: {'rotation matrix': [[0.993234, -0.016226, -0.114989], [0.102235, -0.347461, 0.932104], [-0.055078, -0.937554, -0.343451]], 'translation vector': [2.952414, 4.433719, 1.459459]}\nB: {'rotation matrix': [[0.993467, -0.015486, -0.113064], [0.100672, -0.347655, 0.932202], [-0.053743, -0.937495, -0.343825]], 'translation vector': [2.95506, 4.435545, 1.464879]}\nC: {'rotation matrix': [[0.9999909736646528, 0.004123633453855257, 0.0014048261378039103], [-0.004123349991970072, 0.9999914014789152, -0.00016491391217041438], [-0.0014064329904194771, 0.0001595777583995875, 0.9999985433300184]], 'translation vector': [-0.0007043054873738797, 0.0030876829023283037, 0.0005875691155496909]}\nD: {'rotation matrix': [[0.993543, -0.018943, -0.111866], [0.098443, -0.346258, 0.93296], [-0.056408, -0.937948, -0.342158]], 'translation vector': [2.958581, 4.436487, 1.463224]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.993234, -0.016226, -0.114989], [0.102235, -0.347461, 0.932104], [-0.055078, -0.937554, -0.343451]], 'translation vector': [2.952414, 4.433719, 1.459459]}\nB: {'rotation matrix': [[0.993467, -0.015486, -0.113064], [0.100672, -0.347655, 0.932202], [-0.053743, -0.937495, -0.343825]], 'translation vector': [2.95506, 4.435545, 1.464879]}\nC: {'rotation matrix': [[0.9999909736646528, 0.004123633453855257, 0.0014048261378039103], [-0.004123349991970072, 0.9999914014789152, -0.00016491391217041438], [-0.0014064329904194771, 0.0001595777583995875, 0.9999985433300184]], 'translation vector': [-0.0007043054873738797, 0.0030876829023283037, 0.0005875691155496909]}\nD: {'rotation matrix': [[0.993543, -0.018943, -0.111866], [0.098443, -0.346258, 0.93296], [-0.056408, -0.937948, -0.342158]], 'translation vector': [2.958581, 4.436487, 1.463224]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_88_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_88_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_88_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_88_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.481483, 0.389974, -0.784917], [0.875782, -0.249176, 0.413422], [-0.034359, -0.886471, -0.461507]], 'translation vector': [2.949051, 2.713893, 1.478454]}\nB: {'rotation matrix': [[0.478541, 0.391858, -0.785777], [0.877371, -0.248969, 0.410164], [-0.034908, -0.885699, -0.462947]], 'translation vector': [2.947931, 2.717417, 1.47825]}\nC: {'rotation matrix': [[0.48013, 0.390884, -0.785293], [0.876512, -0.249161, 0.41188], [-0.034667, -0.886075, -0.462245]], 'translation vector': [2.948499, 2.715565, 1.478062]}\nD: {'rotation matrix': [[0.9999997612782662, 0.0004988640565728734, 
0.0011620670948559774], [-0.0004998749589789403, 0.9999982856757066, 0.0017162390398127588], [-0.0011605634579595124, -0.0017165704161403005, 0.9999970539013514]], 'translation vector': [0.0003215494383659312, -0.003127483043635193, -0.0005499886913977736]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.481483, 0.389974, -0.784917], [0.875782, -0.249176, 0.413422], [-0.034359, -0.886471, -0.461507]], 'translation vector': [2.949051, 2.713893, 1.478454]}\nB: {'rotation matrix': [[0.478541, 0.391858, -0.785777], [0.877371, -0.248969, 0.410164], [-0.034908, -0.885699, -0.462947]], 'translation vector': [2.947931, 2.717417, 1.47825]}\nC: {'rotation matrix': [[0.48013, 0.390884, -0.785293], [0.876512, -0.249161, 0.41188], [-0.034667, -0.886075, -0.462245]], 'translation vector': [2.948499, 2.715565, 1.478062]}\nD: {'rotation matrix': [[0.9999997612782662, 0.0004988640565728734, 0.0011620670948559774], [-0.0004998749589789403, 0.9999982856757066, 0.0017162390398127588], [-0.0011605634579595124, -0.0017165704161403005, 0.9999970539013514]], 'translation vector': [0.0003215494383659312, -0.003127483043635193, -0.0005499886913977736]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_89_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_89_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_89_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_89_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[-0.564799, -0.352124, 0.746332], [-0.825057, 0.22251, -0.519395], [0.016825, -0.909119, -0.416196]], 'translation vector': [2.054545, 3.84102, 1.387591]}\nB: {'rotation matrix': [[-0.564546, -0.353818, 0.745722], [-0.825222, 0.223074, -0.518891], [0.017242, -0.908323, -0.417914]], 'translation vector': [2.054274, 3.838, 1.389919]}\nC: {'rotation matrix': [[0.9999863571433116, 0.000569020369849223, 0.005277349616356428], [-0.0005964877721919992, 0.9999870648779766, 0.0050113119474318605], [-0.005273998232851741, -0.005013939911459714, 0.9999743290291354]], 'translation vector': [-0.009394794053914524, 0.0032248655541047277, -0.0014403793043347157]}\nD: {'rotation matrix': [[-0.566299, -0.350153, 0.746122], [-0.824022, 0.221689, -0.521385], [0.017157, -0.91008, -0.414077]], 'translation vector': [2.055187, 3.843729, 1.385575]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.564799, -0.352124, 0.746332], [-0.825057, 0.22251, -0.519395], [0.016825, -0.909119, -0.416196]], 'translation vector': [2.054545, 3.84102, 1.387591]}\nB: {'rotation matrix': [[-0.564546, -0.353818, 0.745722], [-0.825222, 0.223074, -0.518891], [0.017242, -0.908323, -0.417914]], 'translation vector': [2.054274, 3.838, 1.389919]}\nC: {'rotation matrix': [[0.9999863571433116, 0.000569020369849223, 0.005277349616356428], [-0.0005964877721919992, 0.9999870648779766, 0.0050113119474318605], [-0.005273998232851741, -0.005013939911459714, 0.9999743290291354]], 'translation vector': [-0.009394794053914524, 0.0032248655541047277, -0.0014403793043347157]}\nD: {'rotation matrix': [[-0.566299, -0.350153, 0.746122], [-0.824022, 0.221689, -0.521385], [0.017157, -0.91008, -0.414077]], 'translation vector': [2.055187, 3.843729, 1.385575]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_90_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_90_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_90_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_90_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.037266, 0.594373, -0.803326], [0.998697, -0.005895, -0.05069], [-0.034865, -0.804168, -0.593379]], 'translation vector': [3.957977, 2.244087, 1.44004]}\nB: {'rotation matrix': [[0.9999996431737382, -0.0003859815143722872, 0.000745633467728039], [0.0003848347144862704, 1.0000000520852899, 0.00069936137332727], [-0.0007460186813055825, -0.0006987638070698045, 0.9999991664939947]], 'translation vector': [0.0003220625244955144, -0.0016866012265464025, 0.00017566976974592308]}\nC: {'rotation matrix': [[-0.03699, 0.597433, -0.801066], [0.998659, -0.006964, -0.051308], [-0.036231, -0.801889, 
-0.596374]], 'translation vector': [3.95766, 2.242744, 1.440408]}\nD: {'rotation matrix': [[-0.039909, 0.596654, -0.801506], [0.998413, -0.00808, -0.055729], [-0.039727, -0.802458, -0.595385]], 'translation vector': [3.959598, 2.247142, 1.43878]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.037266, 0.594373, -0.803326], [0.998697, -0.005895, -0.05069], [-0.034865, -0.804168, -0.593379]], 'translation vector': [3.957977, 2.244087, 1.44004]}\nB: {'rotation matrix': [[0.9999996431737382, -0.0003859815143722872, 0.000745633467728039], [0.0003848347144862704, 1.0000000520852899, 0.00069936137332727], [-0.0007460186813055825, -0.0006987638070698045, 0.9999991664939947]], 'translation vector': [0.0003220625244955144, -0.0016866012265464025, 0.00017566976974592308]}\nC: {'rotation matrix': [[-0.03699, 0.597433, -0.801066], [0.998659, -0.006964, -0.051308], [-0.036231, -0.801889, -0.596374]], 'translation vector': [3.95766, 2.242744, 1.440408]}\nD: {'rotation matrix': [[-0.039909, 0.596654, -0.801506], [0.998413, -0.00808, -0.055729], [-0.039727, -0.802458, -0.595385]], 'translation vector': [3.959598, 2.247142, 1.43878]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_91_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_91_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_91_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_91_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.875953, -0.064411, 0.478078], [-0.480708, 0.199407, -0.853907], [-0.040331, -0.977798, -0.205634]], 'translation vector': [2.420033, 1.712699, 1.489589]}\nB: {'rotation matrix': [[0.9999951897323609, 0.002443562746852685, 0.0019815417907405445], [-0.0024514031523570133, 0.9999905126626161, 0.0035483645433095957], [-0.001973097402749593, -0.0035542813588159473, 0.9999914538041953]], 'translation vector': [0.004274186437530858, 0.003113397542267471, -0.0028983696999268505]}\nC: {'rotation matrix': [[-0.873782, -0.064754, 0.481987], [-0.484652, 0.197901, -0.852026], [-0.040214, -0.978081, -0.204306]], 'translation vector': [2.408279, 1.71933, 1.490834]}\nD: {'rotation matrix': [[-0.874383, -0.064777, 0.480894], [-0.483517, 0.199683, -0.852255], [-0.04082, -0.977717, -0.20592]], 'translation vector': [2.41403, 1.715424, 1.490696]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.875953, -0.064411, 0.478078], [-0.480708, 0.199407, -0.853907], [-0.040331, -0.977798, -0.205634]], 'translation vector': [2.420033, 1.712699, 1.489589]}\nB: {'rotation matrix': [[0.9999951897323609, 0.002443562746852685, 0.0019815417907405445], [-0.0024514031523570133, 0.9999905126626161, 0.0035483645433095957], [-0.001973097402749593, -0.0035542813588159473, 0.9999914538041953]], 'translation vector': [0.004274186437530858, 0.003113397542267471, -0.0028983696999268505]}\nC: {'rotation matrix': [[-0.873782, -0.064754, 0.481987], [-0.484652, 0.197901, -0.852026], [-0.040214, -0.978081, -0.204306]], 'translation vector': [2.408279, 1.71933, 1.490834]}\nD: {'rotation matrix': [[-0.874383, -0.064777, 0.480894], [-0.483517, 0.199683, -0.852255], [-0.04082, -0.977717, -0.20592]], 'translation vector': [2.41403, 1.715424, 1.490696]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_92_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_92_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_92_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_92_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.222027, -0.462333, 0.858459], [-0.974308, 0.139332, -0.176951], [-0.0378, -0.875691, -0.48139]], 'translation vector': [2.717101, 1.647348, 1.522281]}\nB: {'rotation matrix': [[-0.218431, -0.46311, 0.858963], [-0.975079, 0.138612, -0.173227], [-0.038839, -0.875395, -0.481846]], 'translation vector': [2.716881, 1.647519, 1.52132]}\nC: {'rotation matrix': [[-0.21644, -0.463451, 0.859283], [-0.975487, 0.138488, -0.171017], [-0.039742, -0.875234, -0.482065]], 'translation vector': [2.718464, 1.6518, 1.521331]}\nD: {'rotation matrix': [[0.9999923983379827, -0.0004070717907623284, 
-0.003958509569048427], [0.0004054339454520325, 1.0000004886800318, -0.0002774375137664041], [0.003959398020789269, 0.00027564706640871675, 0.9999922541189072]], 'translation vector': [-0.005217870906849331, -0.0010876072780465762, 0.0013190166897232292]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.222027, -0.462333, 0.858459], [-0.974308, 0.139332, -0.176951], [-0.0378, -0.875691, -0.48139]], 'translation vector': [2.717101, 1.647348, 1.522281]}\nB: {'rotation matrix': [[-0.218431, -0.46311, 0.858963], [-0.975079, 0.138612, -0.173227], [-0.038839, -0.875395, -0.481846]], 'translation vector': [2.716881, 1.647519, 1.52132]}\nC: {'rotation matrix': [[-0.21644, -0.463451, 0.859283], [-0.975487, 0.138488, -0.171017], [-0.039742, -0.875234, -0.482065]], 'translation vector': [2.718464, 1.6518, 1.521331]}\nD: {'rotation matrix': [[0.9999923983379827, -0.0004070717907623284, -0.003958509569048427], [0.0004054339454520325, 1.0000004886800318, -0.0002774375137664041], [0.003959398020789269, 0.00027564706640871675, 0.9999922541189072]], 'translation vector': [-0.005217870906849331, -0.0010876072780465762, 0.0013190166897232292]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_93_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_93_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_93_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_93_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[-0.937153, -0.143335, 0.318118], [-0.348912, 0.379164, -0.857027], [0.002223, -0.914161, -0.405346]], 'translation vector': [2.695696, 2.482015, 1.468683]}\nB: {'rotation matrix': [[0.9999943196573092, 0.0004252932585082039, -0.003407353327997387], [-0.0004083231357033075, 0.9999884704113977, 0.004802465167927532], [0.0034098975295442984, -0.004801063633517885, 0.9999828785878939]], 'translation vector': [4.878723511492211e-05, -0.0002916568078848991, 6.338042778675224e-05]}\nC: {'rotation matrix': [[-0.936491, -0.141969, 0.320669], [-0.350691, 0.379755, -0.856039], [-0.000244, -0.914129, -0.405424]], 'translation vector': [2.694833, 2.48135, 1.466405]}\nD: {'rotation matrix': [[-0.938082, -0.144128, 0.315008], [-0.346377, 0.376958, -0.859026], [0.005065, -0.914948, -0.40354]], 'translation vector': [2.697706, 2.481531, 1.470994]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.937153, -0.143335, 0.318118], [-0.348912, 0.379164, -0.857027], [0.002223, -0.914161, -0.405346]], 'translation vector': [2.695696, 2.482015, 1.468683]}\nB: {'rotation matrix': [[0.9999943196573092, 0.0004252932585082039, -0.003407353327997387], [-0.0004083231357033075, 0.9999884704113977, 0.004802465167927532], [0.0034098975295442984, -0.004801063633517885, 0.9999828785878939]], 'translation vector': [4.878723511492211e-05, -0.0002916568078848991, 6.338042778675224e-05]}\nC: {'rotation matrix': [[-0.936491, -0.141969, 0.320669], [-0.350691, 0.379755, -0.856039], [-0.000244, -0.914129, -0.405424]], 'translation vector': [2.694833, 2.48135, 1.466405]}\nD: {'rotation matrix': [[-0.938082, -0.144128, 0.315008], [-0.346377, 0.376958, -0.859026], [0.005065, -0.914948, -0.40354]], 'translation vector': [2.697706, 2.481531, 1.470994]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_94_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_94_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_94_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_94_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999926818767, 0.0006670318036002099, -0.0037525660402691484], [-0.0006413298116955215, 0.9999778975651503, 0.0065610016234285765], [0.0037569249243787368, -0.006558428485894876, 0.999971386151166]], 'translation vector': [0.00407147231806082, -0.002381515530327949, 0.00020808612264033854]}\nB: {'rotation matrix': [[0.598948, -0.354434, 0.718079], [-0.795274, -0.158225, 0.585238], [-0.093811, -0.921597, -0.376641]], 'translation vector': [2.366687, 6.228749, 1.483315]}\nC: {'rotation matrix': [[0.595688, -0.354051, 0.720975], [-0.797698, -0.155728, 0.582604], [-0.093996, -0.92217, 
-0.37519]], 'translation vector': [2.365015, 6.231124, 1.484416]}\nD: {'rotation matrix': [[0.602088, -0.354098, 0.715615], [-0.793005, -0.160904, 0.587582], [-0.092916, -0.921263, -0.37768]], 'translation vector': [2.370181, 6.228135, 1.483056]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999926818767, 0.0006670318036002099, -0.0037525660402691484], [-0.0006413298116955215, 0.9999778975651503, 0.0065610016234285765], [0.0037569249243787368, -0.006558428485894876, 0.999971386151166]], 'translation vector': [0.00407147231806082, -0.002381515530327949, 0.00020808612264033854]}\nB: {'rotation matrix': [[0.598948, -0.354434, 0.718079], [-0.795274, -0.158225, 0.585238], [-0.093811, -0.921597, -0.376641]], 'translation vector': [2.366687, 6.228749, 1.483315]}\nC: {'rotation matrix': [[0.595688, -0.354051, 0.720975], [-0.797698, -0.155728, 0.582604], [-0.093996, -0.92217, -0.37519]], 'translation vector': [2.365015, 6.231124, 1.484416]}\nD: {'rotation matrix': [[0.602088, -0.354098, 0.715615], [-0.793005, -0.160904, 0.587582], [-0.092916, -0.921263, -0.37768]], 'translation vector': [2.370181, 6.228135, 1.483056]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_95_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_95_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_95_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_95_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.749807, 0.343159, -0.565713], [0.658704, 0.306467, -0.687158], [-0.062432, -0.887874, -0.455832]], 'translation vector': [3.78087, 2.559782, 1.382918]}\nB: {'rotation matrix': [[0.9999937348891738, -5.242465291256345e-05, -0.003554980216800454], [4.743594033927849e-05, 0.9999987930802742, -0.0013704756295102494], [0.0035549170275783653, 0.0013698614468464884, 0.9999933438233037]], 'translation vector': [-0.006430893779766134, -0.00441205739948114, -0.013902381507369554]}\nC: {'rotation matrix': [[-0.753941, 0.344397, -0.559431], [0.653858, 0.310968, -0.68976], [-0.063586, -0.885827, -0.459639]], 'translation vector': [3.768856, 2.553297, 1.380708]}\nD: {'rotation matrix': [[-0.758857, 0.345337, -0.552158], [0.64782, 0.313263, -0.694403], [-0.066832, -0.884652, -0.461438]], 'translation vector': [3.75736, 2.54705, 1.379026]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.749807, 0.343159, -0.565713], [0.658704, 0.306467, -0.687158], [-0.062432, -0.887874, -0.455832]], 'translation vector': [3.78087, 2.559782, 1.382918]}\nB: {'rotation matrix': [[0.9999937348891738, -5.242465291256345e-05, -0.003554980216800454], [4.743594033927849e-05, 0.9999987930802742, -0.0013704756295102494], [0.0035549170275783653, 0.0013698614468464884, 0.9999933438233037]], 'translation vector': [-0.006430893779766134, -0.00441205739948114, -0.013902381507369554]}\nC: {'rotation matrix': [[-0.753941, 0.344397, -0.559431], [0.653858, 0.310968, -0.68976], [-0.063586, -0.885827, -0.459639]], 'translation vector': [3.768856, 2.553297, 1.380708]}\nD: {'rotation matrix': [[-0.758857, 0.345337, -0.552158], [0.64782, 0.313263, -0.694403], [-0.066832, -0.884652, -0.461438]], 'translation vector': [3.75736, 2.54705, 1.379026]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_96_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_96_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_96_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_96_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999813823639969, 0.005886631638650806, 0.0017617762591203541], [-0.005897799441000984, 0.9999663260190635, 0.005653603868593601], [-0.0017278375660129883, -0.005663714705693189, 0.9999828776446472]], 'translation vector': [-0.009345482457980114, -0.0002560144178671564, 0.004418422526778709]}\nB: {'rotation matrix': [[0.930353, -0.229821, 0.285704], [-0.366636, -0.593139, 0.716774], [0.004732, -0.771601, -0.636089]], 'translation vector': [0.347034, 1.978598, 1.559374]}\nC: {'rotation matrix': [[0.932658, -0.225033, 0.281975], [-0.360742, -0.590178, 0.722188], [0.003899, -0.775274, 
-0.631613]], 'translation vector': [0.341015, 1.979035, 1.553548]}\nD: {'rotation matrix': [[0.93165, -0.227962, 0.282953], [-0.363337, -0.592976, 0.718587], [0.003973, -0.772278, -0.635273]], 'translation vector': [0.345174, 1.978811, 1.55669]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999813823639969, 0.005886631638650806, 0.0017617762591203541], [-0.005897799441000984, 0.9999663260190635, 0.005653603868593601], [-0.0017278375660129883, -0.005663714705693189, 0.9999828776446472]], 'translation vector': [-0.009345482457980114, -0.0002560144178671564, 0.004418422526778709]}\nB: {'rotation matrix': [[0.930353, -0.229821, 0.285704], [-0.366636, -0.593139, 0.716774], [0.004732, -0.771601, -0.636089]], 'translation vector': [0.347034, 1.978598, 1.559374]}\nC: {'rotation matrix': [[0.932658, -0.225033, 0.281975], [-0.360742, -0.590178, 0.722188], [0.003899, -0.775274, -0.631613]], 'translation vector': [0.341015, 1.979035, 1.553548]}\nD: {'rotation matrix': [[0.93165, -0.227962, 0.282953], [-0.363337, -0.592976, 0.718587], [0.003973, -0.772278, -0.635273]], 'translation vector': [0.345174, 1.978811, 1.55669]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_97_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_97_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_97_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_97_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.805636, 0.331099, -0.491249], [0.591886, 0.41495, -0.691005], [-0.024947, -0.847461, -0.530271]], 'translation vector': [2.379813, 3.089217, 1.318416]}\nB: {'rotation matrix': [[-0.795605, 0.337599, -0.503031], [0.605104, 0.402607, -0.686846], [-0.029355, -0.850844, -0.524598]], 'translation vector': [2.393777, 3.105406, 1.314663]}\nC: {'rotation matrix': [[0.9993623988750846, 0.01800106003136589, -0.030848731804103247], [-0.01802560053443595, 0.9998375187449363, -0.0005268676766097016], [0.03083392596568167, 0.0010819486754358148, 0.9995239906208094]], 'translation vector': [-0.0016360885893116628, -0.010945290948126685, 0.024052024973950203]}\nD: {'rotation matrix': [[-0.800158, 0.334375, -0.497936], [0.599132, 0.406738, -0.689642], [-0.02807, -0.850152, -0.525789]], 'translation vector': [2.38798, 3.097038, 1.316188]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.805636, 0.331099, -0.491249], [0.591886, 0.41495, -0.691005], [-0.024947, -0.847461, -0.530271]], 'translation vector': [2.379813, 3.089217, 1.318416]}\nB: {'rotation matrix': [[-0.795605, 0.337599, -0.503031], [0.605104, 0.402607, -0.686846], [-0.029355, -0.850844, -0.524598]], 'translation vector': [2.393777, 3.105406, 1.314663]}\nC: {'rotation matrix': [[0.9993623988750846, 0.01800106003136589, -0.030848731804103247], [-0.01802560053443595, 0.9998375187449363, -0.0005268676766097016], [0.03083392596568167, 0.0010819486754358148, 0.9995239906208094]], 'translation vector': [-0.0016360885893116628, -0.010945290948126685, 0.024052024973950203]}\nD: {'rotation matrix': [[-0.800158, 0.334375, -0.497936], [0.599132, 0.406738, -0.689642], [-0.02807, -0.850152, -0.525789]], 'translation vector': [2.38798, 3.097038, 1.316188]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_98_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_98_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_98_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_98_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.991094, 0.095151, -0.093156], [0.13246, 0.632786, -0.762913], [-0.013644, -0.768458, -0.639754]], 'translation vector': [1.823914, 5.346199, 1.288239]}\nB: {'rotation matrix': [[-0.988726, 0.104422, -0.107319], [0.149216, 0.627347, -0.764311], [-0.012484, -0.771707, -0.635855]], 'translation vector': [1.82699, 5.341948, 1.287049]}\nC: {'rotation matrix': [[-0.99302, 0.087717, -0.078848], [0.116766, 0.636826, -0.762115], [-0.016638, -0.766002, -0.642623]], 'translation vector': [1.820977, 5.35315, 1.28763]}\nD: {'rotation matrix': [[0.9995983225918212, 0.019501636556654815, 
-0.020561779482543004], [-0.019155805749363774, 0.9996737108067424, 0.016883013818148294], [0.020884207942970644, -0.01648212192569651, 0.9996460764084063]], 'translation vector': [0.004177467169132809, 0.0037647300644172432, -0.008346822766605477]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.991094, 0.095151, -0.093156], [0.13246, 0.632786, -0.762913], [-0.013644, -0.768458, -0.639754]], 'translation vector': [1.823914, 5.346199, 1.288239]}\nB: {'rotation matrix': [[-0.988726, 0.104422, -0.107319], [0.149216, 0.627347, -0.764311], [-0.012484, -0.771707, -0.635855]], 'translation vector': [1.82699, 5.341948, 1.287049]}\nC: {'rotation matrix': [[-0.99302, 0.087717, -0.078848], [0.116766, 0.636826, -0.762115], [-0.016638, -0.766002, -0.642623]], 'translation vector': [1.820977, 5.35315, 1.28763]}\nD: {'rotation matrix': [[0.9995983225918212, 0.019501636556654815, -0.020561779482543004], [-0.019155805749363774, 0.9996737108067424, 0.016883013818148294], [0.020884207942970644, -0.01648212192569651, 0.9996460764084063]], 'translation vector': [0.004177467169132809, 0.0037647300644172432, -0.008346822766605477]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_99_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_99_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_99_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_99_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.821677, -0.468788, 0.324168], [-0.569738, 0.691217, -0.444543], [-0.015674, -0.549962, -0.835043]], 'translation vector': [3.090628, 8.002418, 1.936363]}\nB: {'rotation matrix': [[-0.828255, -0.463511, 0.314882], [-0.560231, 0.696605, -0.4482], [-0.011603, -0.547631, -0.83664]], 'translation vector': [3.092081, 8.003743, 1.933112]}\nC: {'rotation matrix': [[-0.825245, -0.467159, 0.317384], [-0.564664, 0.693585, -0.44732], [-0.011164, -0.548364, -0.836165]], 'translation vector': [3.09483, 8.004893, 1.934166]}\nD: {'rotation matrix': [[0.9997794014417315, -0.01936250651215195, 0.008149128878679036], [0.019327506819120155, 0.9998035065129168, 0.004379447300995329], [-0.008232031850993044, -0.004222057574502129, 0.9999569918430281]], 'translation vector': [0.008428449014058259, -0.001816843944377755, 0.00043274814993932154]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.821677, -0.468788, 0.324168], [-0.569738, 0.691217, -0.444543], [-0.015674, -0.549962, -0.835043]], 'translation vector': [3.090628, 8.002418, 1.936363]}\nB: {'rotation matrix': [[-0.828255, -0.463511, 0.314882], [-0.560231, 0.696605, -0.4482], [-0.011603, -0.547631, -0.83664]], 'translation vector': [3.092081, 8.003743, 1.933112]}\nC: {'rotation matrix': [[-0.825245, -0.467159, 0.317384], [-0.564664, 0.693585, -0.44732], [-0.011164, -0.548364, -0.836165]], 'translation vector': [3.09483, 8.004893, 1.934166]}\nD: {'rotation matrix': [[0.9997794014417315, -0.01936250651215195, 0.008149128878679036], [0.019327506819120155, 0.9998035065129168, 0.004379447300995329], [-0.008232031850993044, -0.004222057574502129, 0.9999569918430281]], 'translation vector': [0.008428449014058259, -0.001816843944377755, 0.00043274814993932154]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_100_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_100_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_100_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_100_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.141521, 0.444417, -0.884571], [0.989842, -0.075821, 0.12027], [-0.013619, -0.892605, -0.450633]], 'translation vector': [3.547713, 0.933243, 1.481136]}\nB: {'rotation matrix': [[0.9999998646080918, -0.0007460665530377528, -0.0009402946708239287], [0.0007455507104745661, 1.0000000002230558, -0.0004258129745529856], [0.0009407423454210515, 0.0004259725827619916, 0.9999998211939707]], 'translation vector': [0.0008665006475636616, -0.0026059059125004003, -0.0011105548234366935]}\nC: {'rotation matrix': [[0.140147, 0.44482, -0.884587], [0.990025, -0.076044, 0.118612], [-0.014506, 
-0.892386, -0.45104]], 'translation vector': [3.548717, 0.935529, 1.481701]}\nD: {'rotation matrix': [[0.140907, 0.444584, -0.884585], [0.989916, -0.076415, 0.11928], [-0.014565, -0.892472, -0.450868]], 'translation vector': [3.549046, 0.934745, 1.482359]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.141521, 0.444417, -0.884571], [0.989842, -0.075821, 0.12027], [-0.013619, -0.892605, -0.450633]], 'translation vector': [3.547713, 0.933243, 1.481136]}\nB: {'rotation matrix': [[0.9999998646080918, -0.0007460665530377528, -0.0009402946708239287], [0.0007455507104745661, 1.0000000002230558, -0.0004258129745529856], [0.0009407423454210515, 0.0004259725827619916, 0.9999998211939707]], 'translation vector': [0.0008665006475636616, -0.0026059059125004003, -0.0011105548234366935]}\nC: {'rotation matrix': [[0.140147, 0.44482, -0.884587], [0.990025, -0.076044, 0.118612], [-0.014506, -0.892386, -0.45104]], 'translation vector': [3.548717, 0.935529, 1.481701]}\nD: {'rotation matrix': [[0.140907, 0.444584, -0.884585], [0.989916, -0.076415, 0.11928], [-0.014565, -0.892472, -0.450868]], 'translation vector': [3.549046, 0.934745, 1.482359]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_101_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_101_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_101_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_101_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[0.9999988361601905, 0.001006375069335518, -0.0012640568426656846], [-0.0010070223030151592, 0.9999994457401622, -0.0002863718291939501], [0.00126333848912293, 0.0002872673648295308, 0.9999986249179557]], 'translation vector': [-2.298300292791211e-05, -0.0003253221346838364, -0.001208803459929797]}\nB: {'rotation matrix': [[0.209622, 0.494864, -0.843308], [0.976967, -0.070778, 0.201312], [0.039935, -0.866083, -0.498303]], 'translation vector': [4.529501, 2.292687, 1.525847]}\nC: {'rotation matrix': [[0.210084, 0.49423, -0.843565], [0.976909, -0.071791, 0.201231], [0.038894, -0.866362, -0.4979]], 'translation vector': [4.52972, 2.291977, 1.52688]}\nD: {'rotation matrix': [[0.207746, 0.495681, -0.843292], [0.977345, -0.069508, 0.199914], [0.040478, -0.865719, -0.498891]], 'translation vector': [4.528935, 2.293617, 1.525752]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999988361601905, 0.001006375069335518, -0.0012640568426656846], [-0.0010070223030151592, 0.9999994457401622, -0.0002863718291939501], [0.00126333848912293, 0.0002872673648295308, 0.9999986249179557]], 'translation vector': [-2.298300292791211e-05, -0.0003253221346838364, -0.001208803459929797]}\nB: {'rotation matrix': [[0.209622, 0.494864, -0.843308], [0.976967, -0.070778, 0.201312], [0.039935, -0.866083, -0.498303]], 'translation vector': [4.529501, 2.292687, 1.525847]}\nC: {'rotation matrix': [[0.210084, 0.49423, -0.843565], [0.976909, -0.071791, 0.201231], [0.038894, -0.866362, -0.4979]], 'translation vector': [4.52972, 2.291977, 1.52688]}\nD: {'rotation matrix': [[0.207746, 0.495681, -0.843292], [0.977345, -0.069508, 0.199914], [0.040478, -0.865719, -0.498891]], 'translation vector': [4.528935, 2.293617, 1.525752]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_102_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_102_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_102_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_102_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.08541, 0.640528, -0.76317], [0.996306, -0.061741, 0.059682], [-0.008891, -0.765449, -0.643436]], 'translation vector': [3.003591, 1.574332, 1.432793]}\nB: {'rotation matrix': [[0.08501, 0.641279, -0.762584], [0.996355, -0.060148, 0.06049], [-0.007077, -0.764946, -0.644055]], 'translation vector': [3.00634, 1.575815, 1.433934]}\nC: {'rotation matrix': [[0.9999991614836521, 0.0005695811207308883, -0.0015075373347832835], [-0.0005748316229898535, 0.9999942080654551, -0.0033643004544446934], [0.0015050239127193494, 0.003364654292322913, 0.9999927444380246]], 'translation vector': 
[-0.0005823890138008103, 0.0017300160779236684, -0.0007769099832195536]}\nD: {'rotation matrix': [[0.085438, 0.641091, -0.762694], [0.996316, -0.060644, 0.060635], [-0.00738, -0.765065, -0.64391]], 'translation vector': [3.005707, 1.574798, 1.4333]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.08541, 0.640528, -0.76317], [0.996306, -0.061741, 0.059682], [-0.008891, -0.765449, -0.643436]], 'translation vector': [3.003591, 1.574332, 1.432793]}\nB: {'rotation matrix': [[0.08501, 0.641279, -0.762584], [0.996355, -0.060148, 0.06049], [-0.007077, -0.764946, -0.644055]], 'translation vector': [3.00634, 1.575815, 1.433934]}\nC: {'rotation matrix': [[0.9999991614836521, 0.0005695811207308883, -0.0015075373347832835], [-0.0005748316229898535, 0.9999942080654551, -0.0033643004544446934], [0.0015050239127193494, 0.003364654292322913, 0.9999927444380246]], 'translation vector': [-0.0005823890138008103, 0.0017300160779236684, -0.0007769099832195536]}\nD: {'rotation matrix': [[0.085438, 0.641091, -0.762694], [0.996316, -0.060644, 0.060635], [-0.00738, -0.765065, -0.64391]], 'translation vector': [3.005707, 1.574798, 1.4333]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_103_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_103_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_103_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_103_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.9999836186953653, 0.00108451635874016, -0.00565701955892193], [-0.0010583782720287828, 0.9999885698645751, 0.004627019650109258], [0.005661574895801672, -0.004620937029738945, 0.9999737330988163]], 'translation vector': [0.0022502124816869973, 0.004079635447382657, -0.0017077678174191036]}\nB: {'rotation matrix': [[0.764916, -0.419696, 0.48863], [-0.623144, -0.290098, 0.726316], [-0.163081, -0.860057, -0.483431]], 'translation vector': [2.190224, 2.255941, 1.286466]}\nC: {'rotation matrix': [[0.764173, -0.416772, 0.492281], [-0.624792, -0.28869, 0.725461], [-0.160235, -0.861951, -0.481005]], 'translation vector': [2.189569, 2.253508, 1.282023]}\nD: {'rotation matrix': [[0.763152, -0.417481, 0.493263], [-0.626561, -0.291187, 0.722933], [-0.158179, -0.860767, -0.483797]], 'translation vector': [2.190887, 2.252149, 1.282769]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999836186953653, 0.00108451635874016, -0.00565701955892193], [-0.0010583782720287828, 0.9999885698645751, 0.004627019650109258], [0.005661574895801672, -0.004620937029738945, 0.9999737330988163]], 'translation vector': [0.0022502124816869973, 0.004079635447382657, -0.0017077678174191036]}\nB: {'rotation matrix': [[0.764916, -0.419696, 0.48863], [-0.623144, -0.290098, 0.726316], [-0.163081, -0.860057, -0.483431]], 'translation vector': [2.190224, 2.255941, 1.286466]}\nC: {'rotation matrix': [[0.764173, -0.416772, 0.492281], [-0.624792, -0.28869, 0.725461], [-0.160235, -0.861951, -0.481005]], 'translation vector': [2.189569, 2.253508, 1.282023]}\nD: {'rotation matrix': [[0.763152, -0.417481, 0.493263], [-0.626561, -0.291187, 0.722933], [-0.158179, -0.860767, -0.483797]], 'translation vector': [2.190887, 2.252149, 1.282769]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_104_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_104_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_104_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_104_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999685089712825, -0.00485492735446149, 0.006197125791337409], [0.004892366141195013, 0.9999701266660436, -0.005981156791582798], [-0.006168136508286792, 0.006011265809070663, 0.9999622632239159]], 'translation vector': [0.00022924877864394233, 0.00019097290261571587, -0.001928325440709866]}\nB: {'rotation matrix': [[-0.968997, 0.179836, -0.169422], [0.236776, 0.48002, -0.8447], [-0.070582, -0.858627, -0.507719]], 'translation vector': [3.781446, 2.333063, 1.459816]}\nC: {'rotation matrix': [[-0.967651, 0.180929, -0.175829], [0.242263, 0.471818, -0.84776], [-0.070424, -0.862933, 
-0.500388]], 'translation vector': [3.780886, 2.334988, 1.460004]}\nD: {'rotation matrix': [[-0.968244, 0.180308, -0.173186], [0.239986, 0.476144, -0.845987], [-0.070076, -0.860684, -0.504294]], 'translation vector': [3.781386, 2.333968, 1.460791]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999685089712825, -0.00485492735446149, 0.006197125791337409], [0.004892366141195013, 0.9999701266660436, -0.005981156791582798], [-0.006168136508286792, 0.006011265809070663, 0.9999622632239159]], 'translation vector': [0.00022924877864394233, 0.00019097290261571587, -0.001928325440709866]}\nB: {'rotation matrix': [[-0.968997, 0.179836, -0.169422], [0.236776, 0.48002, -0.8447], [-0.070582, -0.858627, -0.507719]], 'translation vector': [3.781446, 2.333063, 1.459816]}\nC: {'rotation matrix': [[-0.967651, 0.180929, -0.175829], [0.242263, 0.471818, -0.84776], [-0.070424, -0.862933, -0.500388]], 'translation vector': [3.780886, 2.334988, 1.460004]}\nD: {'rotation matrix': [[-0.968244, 0.180308, -0.173186], [0.239986, 0.476144, -0.845987], [-0.070076, -0.860684, -0.504294]], 'translation vector': [3.781386, 2.333968, 1.460791]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_105_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_105_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_105_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_105_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.769476, 0.035457, -0.637691], [0.638618, -0.056212, 0.767468], [-0.008634, -0.997789, -0.065897]], 'translation vector': [3.059908, 3.99174, 1.48793]}\nB: {'rotation matrix': [[0.768334, 0.034359, -0.639126], [0.639975, -0.056434, 0.766321], [-0.009738, -0.997815, -0.065349]], 'translation vector': [3.063556, 3.993645, 1.487647]}\nC: {'rotation matrix': [[0.76637, 0.032495, -0.641577], [0.642284, -0.057724, 0.764291], [-0.012198, -0.997804, -0.065109]], 'translation vector': [3.065239, 3.993527, 1.488269]}\nD: {'rotation matrix': [[0.9999994770672102, 0.0010059593388553243, -0.0005629779278299809], [-0.0010051492699491647, 0.9999992963767129, 0.00013663416634814593], [0.0005621045000587302, -0.0001358922037830382, 1.0000001627120574]], 'translation vector': [-0.004005273497495132, -0.008267648490985158, -0.0009698463604679297]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.769476, 0.035457, -0.637691], [0.638618, -0.056212, 0.767468], [-0.008634, -0.997789, -0.065897]], 'translation vector': [3.059908, 3.99174, 1.48793]}\nB: {'rotation matrix': [[0.768334, 0.034359, -0.639126], [0.639975, -0.056434, 0.766321], [-0.009738, -0.997815, -0.065349]], 'translation vector': [3.063556, 3.993645, 1.487647]}\nC: {'rotation matrix': [[0.76637, 0.032495, -0.641577], [0.642284, -0.057724, 0.764291], [-0.012198, -0.997804, -0.065109]], 'translation vector': [3.065239, 3.993527, 1.488269]}\nD: {'rotation matrix': [[0.9999994770672102, 0.0010059593388553243, -0.0005629779278299809], [-0.0010051492699491647, 0.9999992963767129, 0.00013663416634814593], [0.0005621045000587302, -0.0001358922037830382, 1.0000001627120574]], 'translation vector': [-0.004005273497495132, -0.008267648490985158, -0.0009698463604679297]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_106_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_106_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_106_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_106_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999965024273574, 0.0018995589746770465, -0.002128604659346281], [-0.0018975946490258904, 0.9999971783528935, 0.0010235202394668671], [0.0021305967266473038, -0.00101920750556101, 0.9999966975642395]], 'translation vector': [-0.001221359216795559, -0.0013000622008119134, -0.00023198476015379166]}\nB: {'rotation matrix': [[-0.247804, -0.452831, 0.856468], [-0.967446, 0.162565, -0.193963], [-0.051399, -0.876651, -0.478373]], 'translation vector': [1.577581, 1.960365, 1.31447]}\nC: {'rotation matrix': [[-0.241822, -0.452397, 0.858405], [-0.968762, 0.162689, -0.18717], [-0.054978, 
-0.876852, -0.477607]], 'translation vector': [1.575634, 1.958436, 1.314538]}\nD: {'rotation matrix': [[-0.251836, -0.455153, 0.854058], [-0.966529, 0.162962, -0.198153], [-0.048989, -0.875374, -0.480959]], 'translation vector': [1.577733, 1.957285, 1.314553]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999965024273574, 0.0018995589746770465, -0.002128604659346281], [-0.0018975946490258904, 0.9999971783528935, 0.0010235202394668671], [0.0021305967266473038, -0.00101920750556101, 0.9999966975642395]], 'translation vector': [-0.001221359216795559, -0.0013000622008119134, -0.00023198476015379166]}\nB: {'rotation matrix': [[-0.247804, -0.452831, 0.856468], [-0.967446, 0.162565, -0.193963], [-0.051399, -0.876651, -0.478373]], 'translation vector': [1.577581, 1.960365, 1.31447]}\nC: {'rotation matrix': [[-0.241822, -0.452397, 0.858405], [-0.968762, 0.162689, -0.18717], [-0.054978, -0.876852, -0.477607]], 'translation vector': [1.575634, 1.958436, 1.314538]}\nD: {'rotation matrix': [[-0.251836, -0.455153, 0.854058], [-0.966529, 0.162962, -0.198153], [-0.048989, -0.875374, -0.480959]], 'translation vector': [1.577733, 1.957285, 1.314553]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_107_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_107_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_107_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_107_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": 
"SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.502725, -0.506922, 0.700212], [-0.864292, 0.279467, -0.418207], [0.016312, -0.815431, -0.578624]], 'translation vector': [4.022235, 5.007849, 1.281956]}\nB: {'rotation matrix': [[0.9994658881112836, -0.01787238768104933, 0.027367099286839433], [0.01746068119707397, 0.9997321901556896, 0.015206345300919908], [-0.027631559291770684, -0.01472107419323857, 0.9995094632380824]], 'translation vector': [-0.030548360022831567, -0.0024606871848007472, 0.004630350985881493]}\nC: {'rotation matrix': [[-0.511887, -0.50554, 0.694551], [-0.858867, 0.284356, -0.426016], [0.017868, -0.814599, -0.579749]], 'translation vector': [4.034731, 5.018784, 1.285057]}\nD: {'rotation matrix': [[-0.524185, -0.50567, 0.685221], [-0.851455, 0.296094, -0.432844], [0.015986, -0.810325, -0.585763]], 'translation vector': [4.046806, 5.029983, 1.286514]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.502725, -0.506922, 0.700212], [-0.864292, 0.279467, -0.418207], [0.016312, -0.815431, -0.578624]], 'translation vector': [4.022235, 5.007849, 1.281956]}\nB: {'rotation matrix': [[0.9994658881112836, -0.01787238768104933, 0.027367099286839433], [0.01746068119707397, 0.9997321901556896, 0.015206345300919908], [-0.027631559291770684, -0.01472107419323857, 0.9995094632380824]], 'translation vector': [-0.030548360022831567, -0.0024606871848007472, 0.004630350985881493]}\nC: {'rotation matrix': [[-0.511887, -0.50554, 0.694551], [-0.858867, 0.284356, -0.426016], [0.017868, -0.814599, -0.579749]], 'translation vector': [4.034731, 5.018784, 1.285057]}\nD: {'rotation matrix': [[-0.524185, -0.50567, 0.685221], [-0.851455, 0.296094, -0.432844], [0.015986, -0.810325, -0.585763]], 'translation vector': [4.046806, 5.029983, 1.286514]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_108_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_108_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_108_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_108_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9997511181043846, 0.002627583195842048, -0.02210613011174363], [-0.002639480814720638, 0.9999968737080126, -0.0005594700790184048], [0.022104273186971824, 0.0006179932127646122, 0.9997554616613505]], 'translation vector': [0.008057060510321179, -0.003086615617105104, 0.008815946351156123]}\nB: {'rotation matrix': [[-0.793492, -0.269336, 0.545737], [-0.608499, 0.36581, -0.70421], [-0.009967, -0.890865, -0.454158]], 'translation vector': [3.342808, 3.719108, 1.377405]}\nC: {'rotation matrix': [[-0.799682, -0.271069, 0.535752], [-0.600405, 0.367946, -0.710021], [-0.004663, -0.889459, 
-0.456991]], 'translation vector': [3.342098, 3.723592, 1.379504]}\nD: {'rotation matrix': [[-0.786909, -0.267427, 0.556108], [-0.616914, 0.361106, -0.699299], [-0.013802, -0.893356, -0.449137]], 'translation vector': [3.343614, 3.714152, 1.377028]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9997511181043846, 0.002627583195842048, -0.02210613011174363], [-0.002639480814720638, 0.9999968737080126, -0.0005594700790184048], [0.022104273186971824, 0.0006179932127646122, 0.9997554616613505]], 'translation vector': [0.008057060510321179, -0.003086615617105104, 0.008815946351156123]}\nB: {'rotation matrix': [[-0.793492, -0.269336, 0.545737], [-0.608499, 0.36581, -0.70421], [-0.009967, -0.890865, -0.454158]], 'translation vector': [3.342808, 3.719108, 1.377405]}\nC: {'rotation matrix': [[-0.799682, -0.271069, 0.535752], [-0.600405, 0.367946, -0.710021], [-0.004663, -0.889459, -0.456991]], 'translation vector': [3.342098, 3.723592, 1.379504]}\nD: {'rotation matrix': [[-0.786909, -0.267427, 0.556108], [-0.616914, 0.361106, -0.699299], [-0.013802, -0.893356, -0.449137]], 'translation vector': [3.343614, 3.714152, 1.377028]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_109_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_109_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_109_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_109_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": 
"A: {'rotation matrix': [[0.470058, 0.310455, -0.826234], [0.882195, -0.135697, 0.450908], [0.027869, -0.940853, -0.337668]], 'translation vector': [2.719146, 3.165557, 1.444111]}\nB: {'rotation matrix': [[0.468253, 0.310351, -0.827298], [0.883071, -0.132138, 0.45025], [0.030418, -0.941394, -0.335936]], 'translation vector': [2.721684, 3.167619, 1.442076]}\nC: {'rotation matrix': [[0.9999808269927472, 0.005970992787416191, 0.00174631158613173], [-0.00597527498498062, 0.9999782630360619, 0.002650362855816552], [-0.0017306339108793102, -0.002661153861931357, 0.9999943916418343]], 'translation vector': [0.00038154468875628567, 0.0036815080791540167, 0.0005855298747257098]}\nD: {'rotation matrix': [[0.468431, 0.309409, -0.82755], [0.883026, -0.133283, 0.45], [0.028935, -0.941542, -0.33565]], 'translation vector': [2.722082, 3.167839, 1.441818]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.470058, 0.310455, -0.826234], [0.882195, -0.135697, 0.450908], [0.027869, -0.940853, -0.337668]], 'translation vector': [2.719146, 3.165557, 1.444111]}\nB: {'rotation matrix': [[0.468253, 0.310351, -0.827298], [0.883071, -0.132138, 0.45025], [0.030418, -0.941394, -0.335936]], 'translation vector': [2.721684, 3.167619, 1.442076]}\nC: {'rotation matrix': [[0.9999808269927472, 0.005970992787416191, 0.00174631158613173], [-0.00597527498498062, 0.9999782630360619, 0.002650362855816552], [-0.0017306339108793102, -0.002661153861931357, 0.9999943916418343]], 'translation vector': [0.00038154468875628567, 0.0036815080791540167, 0.0005855298747257098]}\nD: {'rotation matrix': [[0.468431, 0.309409, -0.82755], [0.883026, -0.133283, 0.45], [0.028935, -0.941542, -0.33565]], 'translation vector': [2.722082, 3.167839, 1.441818]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_110_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_110_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_110_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_110_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.886617, -0.372394, 0.274287], [-0.453667, -0.584855, 0.672407], [-0.089983, -0.720602, -0.687485]], 'translation vector': [2.862491, 2.429976, 1.648643]}\nB: {'rotation matrix': [[0.9999609940536269, 0.004336106194237876, -0.007665012330121596], [-0.0043994792450735756, 0.9999564170173506, -0.008231049758408536], [0.007628227917166668, 0.008264723677336768, 0.9999364519647637]], 'translation vector': [0.014388650201931252, -0.01870904449863442, 0.020428177278094317]}\nC: {'rotation matrix': [[0.881825, -0.378531, 0.281247], [-0.462696, -0.579299, 0.671062], [-0.091091, -0.721891, -0.685985]], 
'translation vector': [2.843046, 2.410197, 1.648909]}\nD: {'rotation matrix': [[0.88465, -0.37678, 0.274647], [-0.457061, -0.584385, 0.670514], [-0.092137, -0.718701, -0.689188]], 'translation vector': [2.852998, 2.419565, 1.649377]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.886617, -0.372394, 0.274287], [-0.453667, -0.584855, 0.672407], [-0.089983, -0.720602, -0.687485]], 'translation vector': [2.862491, 2.429976, 1.648643]}\nB: {'rotation matrix': [[0.9999609940536269, 0.004336106194237876, -0.007665012330121596], [-0.0043994792450735756, 0.9999564170173506, -0.008231049758408536], [0.007628227917166668, 0.008264723677336768, 0.9999364519647637]], 'translation vector': [0.014388650201931252, -0.01870904449863442, 0.020428177278094317]}\nC: {'rotation matrix': [[0.881825, -0.378531, 0.281247], [-0.462696, -0.579299, 0.671062], [-0.091091, -0.721891, -0.685985]], 'translation vector': [2.843046, 2.410197, 1.648909]}\nD: {'rotation matrix': [[0.88465, -0.37678, 0.274647], [-0.457061, -0.584385, 0.670514], [-0.092137, -0.718701, -0.689188]], 'translation vector': [2.852998, 2.419565, 1.649377]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_111_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_111_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_111_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_111_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation 
matrix': [[-0.937349, 0.183503, -0.296148], [0.348203, 0.521431, -0.779015], [0.011469, -0.833329, -0.552659]], 'translation vector': [1.516432, 1.509609, 1.382559]}\nB: {'rotation matrix': [[0.9999980782363276, 0.001503213640840046, -0.0005306957238598916], [-0.001506954921949503, 0.9999783283684359, -0.006294441787430057], [0.0005221180642734458, 0.006294562264384682, 0.9999799919937622]], 'translation vector': [-0.0001973645495118026, -0.004429123982855679, 0.002277103824624316]}\nC: {'rotation matrix': [[-0.936868, 0.179689, -0.299985], [0.349254, 0.523335, -0.777266], [0.017327, -0.832966, -0.553053]], 'translation vector': [1.516465, 1.50589, 1.383504]}\nD: {'rotation matrix': [[-0.936977, 0.182065, -0.298205], [0.349103, 0.5225, -0.777896], [0.014184, -0.832974, -0.55313]], 'translation vector': [1.516084, 1.508243, 1.382535]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.937349, 0.183503, -0.296148], [0.348203, 0.521431, -0.779015], [0.011469, -0.833329, -0.552659]], 'translation vector': [1.516432, 1.509609, 1.382559]}\nB: {'rotation matrix': [[0.9999980782363276, 0.001503213640840046, -0.0005306957238598916], [-0.001506954921949503, 0.9999783283684359, -0.006294441787430057], [0.0005221180642734458, 0.006294562264384682, 0.9999799919937622]], 'translation vector': [-0.0001973645495118026, -0.004429123982855679, 0.002277103824624316]}\nC: {'rotation matrix': [[-0.936868, 0.179689, -0.299985], [0.349254, 0.523335, -0.777266], [0.017327, -0.832966, -0.553053]], 'translation vector': [1.516465, 1.50589, 1.383504]}\nD: {'rotation matrix': [[-0.936977, 0.182065, -0.298205], [0.349103, 0.5225, -0.777896], [0.014184, -0.832974, -0.55313]], 'translation vector': [1.516084, 1.508243, 1.382535]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_112_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_112_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_112_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_112_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.857254, 0.207542, -0.471213], [0.514274, 0.300265, -0.803345], [-0.025239, -0.931003, -0.364137]], 'translation vector': [3.165454, 3.656282, 1.333704]}\nB: {'rotation matrix': [[0.99999640966509, -0.0010007660526934368, -0.0025990335284116336], [0.0009919500135136654, 0.9999945413184362, -0.003311856723933156], [0.0026023838793041037, 0.0033091782066769597, 0.999990415293157]], 'translation vector': [0.003805164660490079, -0.0038731744338753593, -0.0029462366598167478]}\nC: {'rotation matrix': [[-0.857583, 0.210228, -0.469422], [0.513576, 0.300052, -0.803871], [-0.028145, -0.930469, 
-0.365287]], 'translation vector': [3.164042, 3.653142, 1.337743]}\nD: {'rotation matrix': [[-0.856761, 0.210709, -0.470704], [0.514795, 0.294981, -0.804967], [-0.030765, -0.931981, -0.3612]], 'translation vector': [3.165054, 3.650114, 1.341357]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.857254, 0.207542, -0.471213], [0.514274, 0.300265, -0.803345], [-0.025239, -0.931003, -0.364137]], 'translation vector': [3.165454, 3.656282, 1.333704]}\nB: {'rotation matrix': [[0.99999640966509, -0.0010007660526934368, -0.0025990335284116336], [0.0009919500135136654, 0.9999945413184362, -0.003311856723933156], [0.0026023838793041037, 0.0033091782066769597, 0.999990415293157]], 'translation vector': [0.003805164660490079, -0.0038731744338753593, -0.0029462366598167478]}\nC: {'rotation matrix': [[-0.857583, 0.210228, -0.469422], [0.513576, 0.300052, -0.803871], [-0.028145, -0.930469, -0.365287]], 'translation vector': [3.164042, 3.653142, 1.337743]}\nD: {'rotation matrix': [[-0.856761, 0.210709, -0.470704], [0.514795, 0.294981, -0.804967], [-0.030765, -0.931981, -0.3612]], 'translation vector': [3.165054, 3.650114, 1.341357]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_113_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_113_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_113_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_113_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": 
"A: {'rotation matrix': [[-0.496753, -0.455066, 0.739021], [-0.867832, 0.250391, -0.429153], [0.010249, -0.854529, -0.519303]], 'translation vector': [1.585927, 4.408765, 1.329075]}\nB: {'rotation matrix': [[-0.500222, -0.451271, 0.739008], [-0.865841, 0.250951, -0.432832], [0.009869, -0.856375, -0.51626]], 'translation vector': [1.58204, 4.414393, 1.331803]}\nC: {'rotation matrix': [[0.9999646386789894, 0.004658434745366248, 0.0070851516010133645], [-0.004681817359291345, 0.999983580482257, 0.0033767996583214267], [-0.007069472768969729, -0.0034085438936624457, 0.9999685018088521]], 'translation vector': [-1.9089315443032717e-05, 0.003691149021725071, -0.009076217757424399]}\nD: {'rotation matrix': [[-0.49386, -0.45517, 0.740893], [-0.869462, 0.246928, -0.427859], [0.011802, -0.855481, -0.5177]], 'translation vector': [1.591466, 4.4048, 1.328646]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.496753, -0.455066, 0.739021], [-0.867832, 0.250391, -0.429153], [0.010249, -0.854529, -0.519303]], 'translation vector': [1.585927, 4.408765, 1.329075]}\nB: {'rotation matrix': [[-0.500222, -0.451271, 0.739008], [-0.865841, 0.250951, -0.432832], [0.009869, -0.856375, -0.51626]], 'translation vector': [1.58204, 4.414393, 1.331803]}\nC: {'rotation matrix': [[0.9999646386789894, 0.004658434745366248, 0.0070851516010133645], [-0.004681817359291345, 0.999983580482257, 0.0033767996583214267], [-0.007069472768969729, -0.0034085438936624457, 0.9999685018088521]], 'translation vector': [-1.9089315443032717e-05, 0.003691149021725071, -0.009076217757424399]}\nD: {'rotation matrix': [[-0.49386, -0.45517, 0.740893], [-0.869462, 0.246928, -0.427859], [0.011802, -0.855481, -0.5177]], 'translation vector': [1.591466, 4.4048, 1.328646]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_114_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_114_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_114_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_114_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.216454, 0.211748, -0.953053], [0.970078, -0.156619, 0.185523], [-0.109982, -0.964693, -0.239313]], 'translation vector': [4.876326, 2.835873, 1.673403]}\nB: {'rotation matrix': [[0.9999815116666099, 0.003828941338965109, -0.004550071168900241], [-0.0038043597787837257, 0.999978256845473, 0.005451903660299082], [0.004571630830927769, -0.005433064547306739, 0.9999746221671975]], 'translation vector': [-0.0021931397990639923, 0.004370231111501255, 0.000887941045025542]}\nC: {'rotation matrix': [[0.223921, 0.203392, -0.953148], [0.967778, -0.16198, 0.192793], [-0.115179, -0.965606, 
-0.233109]], 'translation vector': [4.877863, 2.835087, 1.676992]}\nD: {'rotation matrix': [[0.219557, 0.208101, -0.953147], [0.969026, -0.159743, 0.188338], [-0.113066, -0.964975, -0.236728]], 'translation vector': [4.875911, 2.83788, 1.674953]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.216454, 0.211748, -0.953053], [0.970078, -0.156619, 0.185523], [-0.109982, -0.964693, -0.239313]], 'translation vector': [4.876326, 2.835873, 1.673403]}\nB: {'rotation matrix': [[0.9999815116666099, 0.003828941338965109, -0.004550071168900241], [-0.0038043597787837257, 0.999978256845473, 0.005451903660299082], [0.004571630830927769, -0.005433064547306739, 0.9999746221671975]], 'translation vector': [-0.0021931397990639923, 0.004370231111501255, 0.000887941045025542]}\nC: {'rotation matrix': [[0.223921, 0.203392, -0.953148], [0.967778, -0.16198, 0.192793], [-0.115179, -0.965606, -0.233109]], 'translation vector': [4.877863, 2.835087, 1.676992]}\nD: {'rotation matrix': [[0.219557, 0.208101, -0.953147], [0.969026, -0.159743, 0.188338], [-0.113066, -0.964975, -0.236728]], 'translation vector': [4.875911, 2.83788, 1.674953]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_115_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_115_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_115_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_115_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.9999402380460055, 0.006292872204581464, 0.009000301608311233], [-0.006374858865950918, 0.9999377706786996, 0.009079354369281548], [-0.008942504139812743, -0.009135104885608418, 0.9999183041835895]], 'translation vector': [3.939302955568991e-05, -0.002970151993936021, 0.008448821166219922]}\nB: {'rotation matrix': [[-0.997375, -0.070877, -0.01485], [-0.014261, 0.393282, -0.919307], [0.070998, -0.916682, -0.393261]], 'translation vector': [7.372805, 2.63008, 1.348598]}\nC: {'rotation matrix': [[-0.997269, -0.072413, -0.014556], [-0.015372, 0.396244, -0.918017], [0.072244, -0.915286, -0.396275]], 'translation vector': [7.36901, 2.625689, 1.34671]}\nD: {'rotation matrix': [[-0.997198, -0.073599, -0.01342], [-0.016859, 0.39584, -0.918165], [0.072888, -0.915366, -0.395972]], 'translation vector': [7.365971, 2.622898, 1.345074]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999402380460055, 0.006292872204581464, 0.009000301608311233], [-0.006374858865950918, 0.9999377706786996, 0.009079354369281548], [-0.008942504139812743, -0.009135104885608418, 0.9999183041835895]], 'translation vector': [3.939302955568991e-05, -0.002970151993936021, 0.008448821166219922]}\nB: {'rotation matrix': [[-0.997375, -0.070877, -0.01485], [-0.014261, 0.393282, -0.919307], [0.070998, -0.916682, -0.393261]], 'translation vector': [7.372805, 2.63008, 1.348598]}\nC: {'rotation matrix': [[-0.997269, -0.072413, -0.014556], [-0.015372, 0.396244, -0.918017], [0.072244, -0.915286, -0.396275]], 'translation vector': [7.36901, 2.625689, 1.34671]}\nD: {'rotation matrix': [[-0.997198, -0.073599, -0.01342], [-0.016859, 0.39584, -0.918165], [0.072888, -0.915366, -0.395972]], 'translation vector': [7.365971, 2.622898, 1.345074]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_116_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_116_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_116_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_116_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.051446, 0.205786, -0.977244], [0.99734, -0.040014, -0.06093], [-0.051642, -0.977778, -0.20318]], 'translation vector': [3.492872, 2.502008, 1.69891]}\nB: {'rotation matrix': [[-0.043348, 0.1967, -0.979505], [0.997776, -0.041176, -0.052425], [-0.050644, -0.979599, -0.194477]], 'translation vector': [3.495688, 2.502278, 1.699202]}\nC: {'rotation matrix': [[-0.045349, 0.201463, -0.978446], [0.997609, -0.041995, -0.054884], [-0.052147, -0.978595, -0.199077]], 'translation vector': [3.49477, 2.503383, 1.707673]}\nD: {'rotation matrix': [[0.999952584360321, 0.006117610645767056, 
0.007552374046773563], [-0.006133995866929935, 0.9999788341295721, 0.0021300038842573614], [-0.007539309696335425, -0.0021763143472021637, 0.9999686644670424]], 'translation vector': [-0.005541149058866601, -0.004329021249491083, -0.004737026405577716]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.051446, 0.205786, -0.977244], [0.99734, -0.040014, -0.06093], [-0.051642, -0.977778, -0.20318]], 'translation vector': [3.492872, 2.502008, 1.69891]}\nB: {'rotation matrix': [[-0.043348, 0.1967, -0.979505], [0.997776, -0.041176, -0.052425], [-0.050644, -0.979599, -0.194477]], 'translation vector': [3.495688, 2.502278, 1.699202]}\nC: {'rotation matrix': [[-0.045349, 0.201463, -0.978446], [0.997609, -0.041995, -0.054884], [-0.052147, -0.978595, -0.199077]], 'translation vector': [3.49477, 2.503383, 1.707673]}\nD: {'rotation matrix': [[0.999952584360321, 0.006117610645767056, 0.007552374046773563], [-0.006133995866929935, 0.9999788341295721, 0.0021300038842573614], [-0.007539309696335425, -0.0021763143472021637, 0.9999686644670424]], 'translation vector': [-0.005541149058866601, -0.004329021249491083, -0.004737026405577716]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_117_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_117_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_117_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_117_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": 
"A: {'rotation matrix': [[-0.92969, -0.177823, 0.322577], [-0.368073, 0.414955, -0.832066], [0.014105, -0.892296, -0.451231]], 'translation vector': [2.094699, 1.923867, 1.362793]}\nB: {'rotation matrix': [[-0.929496, -0.179835, 0.322021], [-0.368436, 0.412208, -0.833271], [0.017112, -0.893165, -0.449403]], 'translation vector': [2.092189, 1.927801, 1.363214]}\nC: {'rotation matrix': [[0.9999967402226891, -0.00025435497097484245, -0.0026972206948773017], [0.0002554991423686922, 0.9999998064927808, 0.00039832318899401273], [0.0026965738295479497, -0.00039944612541857925, 0.9999956863286328]], 'translation vector': [0.0007808272698826002, -3.308771117738196e-05, 0.0032529965763865576]}\nD: {'rotation matrix': [[-0.929672, -0.179046, 0.321952], [-0.368044, 0.413573, -0.832767], [0.015953, -0.892693, -0.450384]], 'translation vector': [2.09373, 1.925922, 1.362599]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.92969, -0.177823, 0.322577], [-0.368073, 0.414955, -0.832066], [0.014105, -0.892296, -0.451231]], 'translation vector': [2.094699, 1.923867, 1.362793]}\nB: {'rotation matrix': [[-0.929496, -0.179835, 0.322021], [-0.368436, 0.412208, -0.833271], [0.017112, -0.893165, -0.449403]], 'translation vector': [2.092189, 1.927801, 1.363214]}\nC: {'rotation matrix': [[0.9999967402226891, -0.00025435497097484245, -0.0026972206948773017], [0.0002554991423686922, 0.9999998064927808, 0.00039832318899401273], [0.0026965738295479497, -0.00039944612541857925, 0.9999956863286328]], 'translation vector': [0.0007808272698826002, -3.308771117738196e-05, 0.0032529965763865576]}\nD: {'rotation matrix': [[-0.929672, -0.179046, 0.321952], [-0.368044, 0.413573, -0.832767], [0.015953, -0.892693, -0.450384]], 'translation vector': [2.09373, 1.925922, 1.362599]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_118_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_118_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_118_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_118_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.462347, -0.272387, 0.843825], [-0.885267, -0.195868, 0.421827], [0.050379, -0.942041, -0.331694]], 'translation vector': [2.976725, 2.047585, 1.44742]}\nB: {'rotation matrix': [[0.9999659967455217, 0.000730682433916761, -0.008205012696381919], [-0.0006795411337798277, 0.9999808235064653, 0.006171696278259806], [0.008208518663254718, -0.006164897409722228, 0.9999474019525653]], 'translation vector': [0.003919003542433852, -0.0016103743394202397, 0.004248482748549165]}\nC: {'rotation matrix': [[0.463845, -0.2716, 0.843257], [-0.884483, -0.196093, 0.423364], [0.050371, 
-0.942221, -0.331182]], 'translation vector': [2.976598, 2.048301, 1.445946]}\nD: {'rotation matrix': [[0.465329, -0.271694, 0.842408], [-0.883772, -0.195467, 0.425135], [0.049156, -0.942324, -0.331072]], 'translation vector': [2.978186, 2.04869, 1.446578]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.462347, -0.272387, 0.843825], [-0.885267, -0.195868, 0.421827], [0.050379, -0.942041, -0.331694]], 'translation vector': [2.976725, 2.047585, 1.44742]}\nB: {'rotation matrix': [[0.9999659967455217, 0.000730682433916761, -0.008205012696381919], [-0.0006795411337798277, 0.9999808235064653, 0.006171696278259806], [0.008208518663254718, -0.006164897409722228, 0.9999474019525653]], 'translation vector': [0.003919003542433852, -0.0016103743394202397, 0.004248482748549165]}\nC: {'rotation matrix': [[0.463845, -0.2716, 0.843257], [-0.884483, -0.196093, 0.423364], [0.050371, -0.942221, -0.331182]], 'translation vector': [2.976598, 2.048301, 1.445946]}\nD: {'rotation matrix': [[0.465329, -0.271694, 0.842408], [-0.883772, -0.195467, 0.425135], [0.049156, -0.942324, -0.331072]], 'translation vector': [2.978186, 2.04869, 1.446578]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_119_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_119_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_119_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_119_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": 
"A: {'rotation matrix': [[0.819777, 0.236537, -0.521552], [0.57264, -0.327401, 0.751593], [0.007023, -0.9148, -0.403846]], 'translation vector': [2.353185, 1.22719, 1.374303]}\nB: {'rotation matrix': [[0.999987888286007, -0.002291943522874061, -0.004318624647106336], [0.002305286110298337, 0.9999924577563948, 0.0032570259245620907], [0.0043115657326277725, -0.0032678213093349246, 0.9999859589268988]], 'translation vector': [0.0029862992996512183, 0.0027957678410703846, 0.00028412393673649117]}\nC: {'rotation matrix': [[0.818568, 0.239176, -0.522246], [0.574347, -0.327388, 0.750296], [0.008476, -0.914118, -0.405359]], 'translation vector': [2.353795, 1.227513, 1.374115]}\nD: {'rotation matrix': [[0.821096, 0.234783, -0.520267], [0.570754, -0.327501, 0.752983], [0.006399, -0.915216, -0.402913]], 'translation vector': [2.353373, 1.227232, 1.3746]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.819777, 0.236537, -0.521552], [0.57264, -0.327401, 0.751593], [0.007023, -0.9148, -0.403846]], 'translation vector': [2.353185, 1.22719, 1.374303]}\nB: {'rotation matrix': [[0.999987888286007, -0.002291943522874061, -0.004318624647106336], [0.002305286110298337, 0.9999924577563948, 0.0032570259245620907], [0.0043115657326277725, -0.0032678213093349246, 0.9999859589268988]], 'translation vector': [0.0029862992996512183, 0.0027957678410703846, 0.00028412393673649117]}\nC: {'rotation matrix': [[0.818568, 0.239176, -0.522246], [0.574347, -0.327388, 0.750296], [0.008476, -0.914118, -0.405359]], 'translation vector': [2.353795, 1.227513, 1.374115]}\nD: {'rotation matrix': [[0.821096, 0.234783, -0.520267], [0.570754, -0.327501, 0.752983], [0.006399, -0.915216, -0.402913]], 'translation vector': [2.353373, 1.227232, 1.3746]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_120_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_120_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_120_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_120_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.416416, 0.454823, -0.787232], [0.907976, 0.163592, -0.38577], [-0.046672, -0.875428, -0.481091]], 'translation vector': [2.42158, 4.677908, 1.279661]}\nB: {'rotation matrix': [[-0.425105, 0.447282, -0.786908], [0.904054, 0.167175, -0.393368], [-0.044395, -0.878631, -0.475434]], 'translation vector': [2.418032, 4.676476, 1.278379]}\nC: {'rotation matrix': [[-0.405457, 0.458134, -0.791023], [0.912898, 0.15832, -0.376234], [-0.047131, -0.87467, -0.482422]], 'translation vector': [2.427205, 4.676823, 1.279665]}\nD: {'rotation matrix': [[0.999724588457419, 0.011399918490649034, 
-0.02049681987065299], [-0.011547405749944445, 0.9999079075389897, -0.007141934731129424], [0.020413044185954875, 0.00737693223574732, 0.9997642455565067]], 'translation vector': [0.0017021170863906754, -0.0007418791262142621, 0.002807958654513776]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.416416, 0.454823, -0.787232], [0.907976, 0.163592, -0.38577], [-0.046672, -0.875428, -0.481091]], 'translation vector': [2.42158, 4.677908, 1.279661]}\nB: {'rotation matrix': [[-0.425105, 0.447282, -0.786908], [0.904054, 0.167175, -0.393368], [-0.044395, -0.878631, -0.475434]], 'translation vector': [2.418032, 4.676476, 1.278379]}\nC: {'rotation matrix': [[-0.405457, 0.458134, -0.791023], [0.912898, 0.15832, -0.376234], [-0.047131, -0.87467, -0.482422]], 'translation vector': [2.427205, 4.676823, 1.279665]}\nD: {'rotation matrix': [[0.999724588457419, 0.011399918490649034, -0.02049681987065299], [-0.011547405749944445, 0.9999079075389897, -0.007141934731129424], [0.020413044185954875, 0.00737693223574732, 0.9997642455565067]], 'translation vector': [0.0017021170863906754, -0.0007418791262142621, 0.002807958654513776]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_121_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_121_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_121_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_121_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.9999991921613425, -0.0009534576315807749, 0.0007322231548443674], [0.0009539109746141209, 0.9999998849157189, 0.0006356436900991667], [-0.0007323849744554394, -0.0006353709242038023, 0.9999990842009482]], 'translation vector': [-0.0019568440092229133, 0.0040178639573226205, -0.0007055440032766036]}\nB: {'rotation matrix': [[-0.677088, 0.408379, -0.612192], [0.735888, 0.380882, -0.559819], [0.004555, -0.829551, -0.558412]], 'translation vector': [3.089066, 2.044868, 1.438859]}\nC: {'rotation matrix': [[-0.677557, 0.408197, -0.611794], [0.735465, 0.379263, -0.561472], [0.002839, -0.830382, -0.557187]], 'translation vector': [3.090277, 2.045193, 1.438377]}\nD: {'rotation matrix': [[-0.677242, 0.408267, -0.612096], [0.73575, 0.380087, -0.56054], [0.003799, -0.829971, -0.557794]], 'translation vector': [3.089461, 2.045596, 1.437863]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999991921613425, -0.0009534576315807749, 0.0007322231548443674], [0.0009539109746141209, 0.9999998849157189, 0.0006356436900991667], [-0.0007323849744554394, -0.0006353709242038023, 0.9999990842009482]], 'translation vector': [-0.0019568440092229133, 0.0040178639573226205, -0.0007055440032766036]}\nB: {'rotation matrix': [[-0.677088, 0.408379, -0.612192], [0.735888, 0.380882, -0.559819], [0.004555, -0.829551, -0.558412]], 'translation vector': [3.089066, 2.044868, 1.438859]}\nC: {'rotation matrix': [[-0.677557, 0.408197, -0.611794], [0.735465, 0.379263, -0.561472], [0.002839, -0.830382, -0.557187]], 'translation vector': [3.090277, 2.045193, 1.438377]}\nD: {'rotation matrix': [[-0.677242, 0.408267, -0.612096], [0.73575, 0.380087, -0.56054], [0.003799, -0.829971, -0.557794]], 'translation vector': [3.089461, 2.045596, 1.437863]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_122_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_122_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_122_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_122_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999933784388596, -0.0034228660793026973, 0.0010334015378228609], [0.003443417069472685, 0.9997723135838638, -0.0210485719107263], [-0.0009619583367667721, 0.021052364637286505, 0.9997776329131831]], 'translation vector': [0.004991626612900646, -0.0024493216023662445, -0.003027628610638544]}\nB: {'rotation matrix': [[-0.068724, 0.196407, -0.978111], [0.997631, 0.016511, -0.06678], [0.003034, -0.980384, -0.197076]], 'translation vector': [6.624384, 2.565858, 1.44421]}\nC: {'rotation matrix': [[-0.062271, 0.18592, -0.98059], [0.998056, 0.014281, -0.060673], [0.002724, -0.982461, 
-0.186448]], 'translation vector': [6.625182, 2.564143, 1.442555]}\nD: {'rotation matrix': [[-0.067121, 0.19262, -0.978975], [0.997737, 0.016917, -0.065078], [0.004026, -0.981128, -0.19332]], 'translation vector': [6.625297, 2.569471, 1.443187]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999933784388596, -0.0034228660793026973, 0.0010334015378228609], [0.003443417069472685, 0.9997723135838638, -0.0210485719107263], [-0.0009619583367667721, 0.021052364637286505, 0.9997776329131831]], 'translation vector': [0.004991626612900646, -0.0024493216023662445, -0.003027628610638544]}\nB: {'rotation matrix': [[-0.068724, 0.196407, -0.978111], [0.997631, 0.016511, -0.06678], [0.003034, -0.980384, -0.197076]], 'translation vector': [6.624384, 2.565858, 1.44421]}\nC: {'rotation matrix': [[-0.062271, 0.18592, -0.98059], [0.998056, 0.014281, -0.060673], [0.002724, -0.982461, -0.186448]], 'translation vector': [6.625182, 2.564143, 1.442555]}\nD: {'rotation matrix': [[-0.067121, 0.19262, -0.978975], [0.997737, 0.016917, -0.065078], [0.004026, -0.981128, -0.19332]], 'translation vector': [6.625297, 2.569471, 1.443187]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_123_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_123_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_123_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_123_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.9997697997860496, 0.001019787082464491, -0.021429481643640898], [-0.0009730181353209979, 0.9999969701208679, 0.002158925702055335], [0.021432088446976687, -0.002138201439636503, 0.9997675908556951]], 'translation vector': [0.004448630857523561, 0.0024420113720010628, -0.001058932632110654]}\nB: {'rotation matrix': [[-0.999487, 0.010341, 0.030333], [-0.019706, 0.548122, -0.836166], [-0.025273, -0.836334, -0.547637]], 'translation vector': [4.843515, 3.430529, 1.401708]}\nC: {'rotation matrix': [[-0.998846, 0.024735, 0.04116], [-0.020973, 0.546345, -0.837298], [-0.043199, -0.837195, -0.545196]], 'translation vector': [4.840129, 3.432139, 1.401112]}\nD: {'rotation matrix': [[-0.99921, 0.020117, 0.034274], [-0.017632, 0.548494, -0.835969], [-0.035616, -0.835913, -0.547706]], 'translation vector': [4.841137, 3.430736, 1.401886]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9997697997860496, 0.001019787082464491, -0.021429481643640898], [-0.0009730181353209979, 0.9999969701208679, 0.002158925702055335], [0.021432088446976687, -0.002138201439636503, 0.9997675908556951]], 'translation vector': [0.004448630857523561, 0.0024420113720010628, -0.001058932632110654]}\nB: {'rotation matrix': [[-0.999487, 0.010341, 0.030333], [-0.019706, 0.548122, -0.836166], [-0.025273, -0.836334, -0.547637]], 'translation vector': [4.843515, 3.430529, 1.401708]}\nC: {'rotation matrix': [[-0.998846, 0.024735, 0.04116], [-0.020973, 0.546345, -0.837298], [-0.043199, -0.837195, -0.545196]], 'translation vector': [4.840129, 3.432139, 1.401112]}\nD: {'rotation matrix': [[-0.99921, 0.020117, 0.034274], [-0.017632, 0.548494, -0.835969], [-0.035616, -0.835913, -0.547706]], 'translation vector': [4.841137, 3.430736, 1.401886]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_124_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_124_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_124_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_124_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.973747, -0.109977, 0.199301], [-0.227471, -0.502961, 0.833839], [0.008537, -0.857284, -0.514773]], 'translation vector': [3.554081, 1.206281, 1.35243]}\nB: {'rotation matrix': [[0.974665, -0.108996, 0.195317], [-0.223559, -0.502331, 0.835276], [0.007072, -0.857778, -0.513971]], 'translation vector': [3.555352, 1.206811, 1.353912]}\nC: {'rotation matrix': [[0.975504, -0.107044, 0.192183], [-0.219886, -0.500464, 0.837368], [0.006546, -0.859114, -0.511742]], 'translation vector': [3.5544, 1.207723, 1.355687]}\nD: {'rotation matrix': [[0.9999906036647181, 0.002816837265478036, 
-0.0034361334791159484], [-0.0028229898836968203, 0.9999947431038866, -0.0015384023641953812], [0.0034319714995403, 0.0015489580591809052, 0.9999926274107623]], 'translation vector': [0.0002503237708082473, -0.0002760600759463827, -0.00019478740093437086]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.973747, -0.109977, 0.199301], [-0.227471, -0.502961, 0.833839], [0.008537, -0.857284, -0.514773]], 'translation vector': [3.554081, 1.206281, 1.35243]}\nB: {'rotation matrix': [[0.974665, -0.108996, 0.195317], [-0.223559, -0.502331, 0.835276], [0.007072, -0.857778, -0.513971]], 'translation vector': [3.555352, 1.206811, 1.353912]}\nC: {'rotation matrix': [[0.975504, -0.107044, 0.192183], [-0.219886, -0.500464, 0.837368], [0.006546, -0.859114, -0.511742]], 'translation vector': [3.5544, 1.207723, 1.355687]}\nD: {'rotation matrix': [[0.9999906036647181, 0.002816837265478036, -0.0034361334791159484], [-0.0028229898836968203, 0.9999947431038866, -0.0015384023641953812], [0.0034319714995403, 0.0015489580591809052, 0.9999926274107623]], 'translation vector': [0.0002503237708082473, -0.0002760600759463827, -0.00019478740093437086]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_125_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_125_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_125_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_125_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[-0.153679, 0.256881, -0.954146], [0.987333, 0.001369, -0.158656], [-0.03945, -0.966442, -0.253837]], 'translation vector': [1.842026, 1.203469, 1.473211]}\nB: {'rotation matrix': [[-0.151778, 0.257722, -0.954224], [0.987593, 0.000186, -0.157036], [-0.040294, -0.966219, -0.254553]], 'translation vector': [1.842306, 1.202322, 1.472604]}\nC: {'rotation matrix': [[-0.149914, 0.257434, -0.954596], [0.987791, -0.002361, -0.155764], [-0.042353, -0.966293, -0.253937]], 'translation vector': [1.843622, 1.201203, 1.472192]}\nD: {'rotation matrix': [[0.9999992738268638, -0.00012046241721948631, -0.0012199092118460354], [0.0001219402230910216, 0.9999989013124573, 0.0017389291337165482], [0.0012191992898708535, -0.0017382297355628953, 0.9999985296461054]], 'translation vector': [9.159323589502666e-05, -0.0060291788848427785, 8.443913047839757e-05]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.153679, 0.256881, -0.954146], [0.987333, 0.001369, -0.158656], [-0.03945, -0.966442, -0.253837]], 'translation vector': [1.842026, 1.203469, 1.473211]}\nB: {'rotation matrix': [[-0.151778, 0.257722, -0.954224], [0.987593, 0.000186, -0.157036], [-0.040294, -0.966219, -0.254553]], 'translation vector': [1.842306, 1.202322, 1.472604]}\nC: {'rotation matrix': [[-0.149914, 0.257434, -0.954596], [0.987791, -0.002361, -0.155764], [-0.042353, -0.966293, -0.253937]], 'translation vector': [1.843622, 1.201203, 1.472192]}\nD: {'rotation matrix': [[0.9999992738268638, -0.00012046241721948631, -0.0012199092118460354], [0.0001219402230910216, 0.9999989013124573, 0.0017389291337165482], [0.0012191992898708535, -0.0017382297355628953, 0.9999985296461054]], 'translation vector': [9.159323589502666e-05, -0.0060291788848427785, 8.443913047839757e-05]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_126_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_126_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_126_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_126_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.14824, 0.422945, -0.893948], [0.983241, -0.033972, -0.17912], [-0.106127, -0.905518, -0.410821]], 'translation vector': [4.004252, 0.906944, 2.572337]}\nB: {'rotation matrix': [[-0.144176, 0.428291, -0.892065], [0.983875, -0.034383, -0.175522], [-0.105847, -0.902987, -0.416427]], 'translation vector': [4.001886, 0.906293, 2.57387]}\nC: {'rotation matrix': [[0.9999802090304492, -0.0006608909445885984, -0.006377640026736867], [0.0006622709420172097, 0.9999989071934312, 0.0002893250642296396], [0.006377955510407445, -0.00029429125031086645, 0.9999796120375704]], 
'translation vector': [0.0015109144602648002, -0.0040381270140653625, -0.0009682382039999382]}\nD: {'rotation matrix': [[-0.139849, 0.432923, -0.890517], [0.98469, -0.03371, -0.171026], [-0.10406, -0.9008, -0.42158]], 'translation vector': [3.996022, 0.9047, 2.579904]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.14824, 0.422945, -0.893948], [0.983241, -0.033972, -0.17912], [-0.106127, -0.905518, -0.410821]], 'translation vector': [4.004252, 0.906944, 2.572337]}\nB: {'rotation matrix': [[-0.144176, 0.428291, -0.892065], [0.983875, -0.034383, -0.175522], [-0.105847, -0.902987, -0.416427]], 'translation vector': [4.001886, 0.906293, 2.57387]}\nC: {'rotation matrix': [[0.9999802090304492, -0.0006608909445885984, -0.006377640026736867], [0.0006622709420172097, 0.9999989071934312, 0.0002893250642296396], [0.006377955510407445, -0.00029429125031086645, 0.9999796120375704]], 'translation vector': [0.0015109144602648002, -0.0040381270140653625, -0.0009682382039999382]}\nD: {'rotation matrix': [[-0.139849, 0.432923, -0.890517], [0.98469, -0.03371, -0.171026], [-0.10406, -0.9008, -0.42158]], 'translation vector': [3.996022, 0.9047, 2.579904]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_127_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_127_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_127_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_127_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": 
"SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.95128, 0.171677, -0.256112], [0.307849, -0.482535, 0.819993], [0.017191, -0.858887, -0.511877]], 'translation vector': [2.918653, 3.427386, 1.515216]}\nB: {'rotation matrix': [[0.9999949680507799, -0.0030198797325234252, -0.001049278425799119], [0.0030204031206264347, 0.9999953358677364, 0.000916435423205514], [0.0010474297666525848, -0.0009198822713830659, 0.9999990387486941]], 'translation vector': [-0.00019299961249297226, -0.0019013116010877518, -0.0012501965874700538]}\nC: {'rotation matrix': [[0.951329, 0.168071, -0.258311], [0.307858, -0.480204, 0.821357], [0.014004, -0.860905, -0.508574]], 'translation vector': [2.920244, 3.426191, 1.515625]}\nD: {'rotation matrix': [[0.951137, 0.174865, -0.254481], [0.308106, -0.483514, 0.81932], [0.020225, -0.857693, -0.513765]], 'translation vector': [2.916759, 3.427486, 1.515303]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.95128, 0.171677, -0.256112], [0.307849, -0.482535, 0.819993], [0.017191, -0.858887, -0.511877]], 'translation vector': [2.918653, 3.427386, 1.515216]}\nB: {'rotation matrix': [[0.9999949680507799, -0.0030198797325234252, -0.001049278425799119], [0.0030204031206264347, 0.9999953358677364, 0.000916435423205514], [0.0010474297666525848, -0.0009198822713830659, 0.9999990387486941]], 'translation vector': [-0.00019299961249297226, -0.0019013116010877518, -0.0012501965874700538]}\nC: {'rotation matrix': [[0.951329, 0.168071, -0.258311], [0.307858, -0.480204, 0.821357], [0.014004, -0.860905, -0.508574]], 'translation vector': [2.920244, 3.426191, 1.515625]}\nD: {'rotation matrix': [[0.951137, 0.174865, -0.254481], [0.308106, -0.483514, 0.81932], [0.020225, -0.857693, -0.513765]], 'translation vector': [2.916759, 3.427486, 1.515303]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_128_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_128_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_128_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_128_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999948635923562, 0.0019820469436732306, 0.002154140600797888], [-0.0019903388630383317, 0.9999924231566063, 0.003542917026556551], [-0.0021468846137928286, -0.0035469945961230523, 0.999992296254944]], 'translation vector': [-5.7007563989186494e-05, -0.0006793783086549987, -0.00012474555531971632]}\nB: {'rotation matrix': [[-0.933451, -0.165748, 0.318116], [-0.358704, 0.434072, -0.826385], [-0.001114, -0.885499, -0.464639]], 'translation vector': [1.119556, 2.234202, 1.400117]}\nC: {'rotation matrix': [[-0.933995, -0.170592, 0.31393], [-0.357261, 0.435306, -0.826362], [0.004315, 
-0.883973, -0.467519]], 'translation vector': [1.117768, 2.23249, 1.399859]}\nD: {'rotation matrix': [[-0.93341, -0.169242, 0.31639], [-0.358807, 0.435851, -0.825404], [0.001794, -0.883963, -0.467553]], 'translation vector': [1.117643, 2.232584, 1.400741]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999948635923562, 0.0019820469436732306, 0.002154140600797888], [-0.0019903388630383317, 0.9999924231566063, 0.003542917026556551], [-0.0021468846137928286, -0.0035469945961230523, 0.999992296254944]], 'translation vector': [-5.7007563989186494e-05, -0.0006793783086549987, -0.00012474555531971632]}\nB: {'rotation matrix': [[-0.933451, -0.165748, 0.318116], [-0.358704, 0.434072, -0.826385], [-0.001114, -0.885499, -0.464639]], 'translation vector': [1.119556, 2.234202, 1.400117]}\nC: {'rotation matrix': [[-0.933995, -0.170592, 0.31393], [-0.357261, 0.435306, -0.826362], [0.004315, -0.883973, -0.467519]], 'translation vector': [1.117768, 2.23249, 1.399859]}\nD: {'rotation matrix': [[-0.93341, -0.169242, 0.31639], [-0.358807, 0.435851, -0.825404], [0.001794, -0.883963, -0.467553]], 'translation vector': [1.117643, 2.232584, 1.400741]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_129_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_129_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_129_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_129_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[0.748267, 0.274864, -0.603777], [0.662514, -0.356598, 0.658721], [-0.034247, -0.892909, -0.448932]], 'translation vector': [2.689408, 2.67138, 1.313352]}\nB: {'rotation matrix': [[0.746223, 0.273332, -0.606993], [0.664679, -0.356291, 0.656703], [-0.036768, -0.893502, -0.447551]], 'translation vector': [2.678885, 2.679979, 1.310144]}\nC: {'rotation matrix': [[0.9999399674236885, 0.0008968793534765223, 0.01089808273108475], [-0.000989300454985631, 0.9999632239345498, 0.008537240164508266], [-0.010889507071785043, -0.008547033142130317, 0.9999033050990314]], 'translation vector': [-0.01197293045142045, -0.02229876967815736, 0.03026515941132618]}\nD: {'rotation matrix': [[0.750155, 0.270973, -0.603193], [0.660278, -0.356699, 0.660908], [-0.03607, -0.894058, -0.446497]], 'translation vector': [2.698287, 2.659688, 1.315667]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.748267, 0.274864, -0.603777], [0.662514, -0.356598, 0.658721], [-0.034247, -0.892909, -0.448932]], 'translation vector': [2.689408, 2.67138, 1.313352]}\nB: {'rotation matrix': [[0.746223, 0.273332, -0.606993], [0.664679, -0.356291, 0.656703], [-0.036768, -0.893502, -0.447551]], 'translation vector': [2.678885, 2.679979, 1.310144]}\nC: {'rotation matrix': [[0.9999399674236885, 0.0008968793534765223, 0.01089808273108475], [-0.000989300454985631, 0.9999632239345498, 0.008537240164508266], [-0.010889507071785043, -0.008547033142130317, 0.9999033050990314]], 'translation vector': [-0.01197293045142045, -0.02229876967815736, 0.03026515941132618]}\nD: {'rotation matrix': [[0.750155, 0.270973, -0.603193], [0.660278, -0.356699, 0.660908], [-0.03607, -0.894058, -0.446497]], 'translation vector': [2.698287, 2.659688, 1.315667]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_130_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_130_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_130_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_130_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.696591, -0.332063, 0.636], [-0.715704, 0.259464, -0.648419], [0.050297, -0.90687, -0.418398]], 'translation vector': [0.055261, 3.785911, 1.510756]}\nB: {'rotation matrix': [[0.9999892202506441, -0.00432806217171397, -0.0014665396078038379], [0.0043283836537151305, 0.9999910348386083, -5.023853182741767e-05], [0.0014665058361795998, 4.44487020677531e-05, 0.9999983390207243]], 'translation vector': [0.0011882249140264811, -0.004541217231683825, 0.003677767622445316]}\nC: {'rotation matrix': [[-0.698852, -0.328674, 0.635279], [-0.713642, 0.26058, -0.65024], [0.048176, -0.907784, 
-0.416662]], 'translation vector': [0.047395, 3.788746, 1.502043]}\nD: {'rotation matrix': [[-0.698666, -0.330448, 0.634563], [-0.713793, 0.261647, -0.649647], [0.048643, -0.906832, -0.418676]], 'translation vector': [0.050863, 3.788018, 1.507423]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.696591, -0.332063, 0.636], [-0.715704, 0.259464, -0.648419], [0.050297, -0.90687, -0.418398]], 'translation vector': [0.055261, 3.785911, 1.510756]}\nB: {'rotation matrix': [[0.9999892202506441, -0.00432806217171397, -0.0014665396078038379], [0.0043283836537151305, 0.9999910348386083, -5.023853182741767e-05], [0.0014665058361795998, 4.44487020677531e-05, 0.9999983390207243]], 'translation vector': [0.0011882249140264811, -0.004541217231683825, 0.003677767622445316]}\nC: {'rotation matrix': [[-0.698852, -0.328674, 0.635279], [-0.713642, 0.26058, -0.65024], [0.048176, -0.907784, -0.416662]], 'translation vector': [0.047395, 3.788746, 1.502043]}\nD: {'rotation matrix': [[-0.698666, -0.330448, 0.634563], [-0.713793, 0.261647, -0.649647], [0.048643, -0.906832, -0.418676]], 'translation vector': [0.050863, 3.788018, 1.507423]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_131_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_131_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_131_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_131_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.9999994384686345, -0.0006788559243182462, -0.0008560007213187077], [0.0006780059132462143, 0.9999998393114027, -0.000711634963528042], [0.0008572035387077444, 0.0007099766954480035, 0.9999996039685791]], 'translation vector': [0.0021169101928586176, 0.0016932348180918044, -0.0014691902955634717]}\nB: {'rotation matrix': [[-0.895004, 0.171136, -0.411923], [0.445772, 0.376296, -0.812213], [0.016006, -0.910557, -0.413074]], 'translation vector': [2.821576, 5.408109, 1.547241]}\nC: {'rotation matrix': [[-0.895238, 0.170954, -0.411491], [0.44529, 0.377097, -0.812105], [0.016339, -0.910259, -0.413716]], 'translation vector': [2.819563, 5.407667, 1.547957]}\nD: {'rotation matrix': [[-0.895239, 0.171618, -0.411211], [0.445324, 0.376235, -0.812486], [0.015275, -0.910491, -0.413246]], 'translation vector': [2.820169, 5.40833, 1.547624]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999994384686345, -0.0006788559243182462, -0.0008560007213187077], [0.0006780059132462143, 0.9999998393114027, -0.000711634963528042], [0.0008572035387077444, 0.0007099766954480035, 0.9999996039685791]], 'translation vector': [0.0021169101928586176, 0.0016932348180918044, -0.0014691902955634717]}\nB: {'rotation matrix': [[-0.895004, 0.171136, -0.411923], [0.445772, 0.376296, -0.812213], [0.016006, -0.910557, -0.413074]], 'translation vector': [2.821576, 5.408109, 1.547241]}\nC: {'rotation matrix': [[-0.895238, 0.170954, -0.411491], [0.44529, 0.377097, -0.812105], [0.016339, -0.910259, -0.413716]], 'translation vector': [2.819563, 5.407667, 1.547957]}\nD: {'rotation matrix': [[-0.895239, 0.171618, -0.411211], [0.445324, 0.376235, -0.812486], [0.015275, -0.910491, -0.413246]], 'translation vector': [2.820169, 5.40833, 1.547624]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_132_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_132_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_132_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_132_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.637806, -0.366719, 0.67729], [-0.770059, -0.286951, 0.569797], [-0.014606, -0.884972, -0.465415]], 'translation vector': [2.635432, 2.237918, 1.453759]}\nB: {'rotation matrix': [[0.639756, -0.365353, 0.676188], [-0.768417, -0.286037, 0.572466], [-0.015738, -0.885833, -0.463738]], 'translation vector': [2.635672, 2.238828, 1.45525]}\nC: {'rotation matrix': [[0.999992450941996, 0.003938168732013472, 0.0002619981628988685], [-0.003940060772482587, 0.9999806955317443, 0.004722690751934828], [-0.00024268998302535557, -0.004722987499164323, 0.9999892225354342]], 'translation vector': 
[-0.005978537327156946, -0.0007775878287423765, 0.0022633181070532693]}\nD: {'rotation matrix': [[0.636585, -0.368058, 0.677712], [-0.771071, -0.287285, 0.568258], [-0.014455, -0.884308, -0.46668]], 'translation vector': [2.636608, 2.236841, 1.454577]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.637806, -0.366719, 0.67729], [-0.770059, -0.286951, 0.569797], [-0.014606, -0.884972, -0.465415]], 'translation vector': [2.635432, 2.237918, 1.453759]}\nB: {'rotation matrix': [[0.639756, -0.365353, 0.676188], [-0.768417, -0.286037, 0.572466], [-0.015738, -0.885833, -0.463738]], 'translation vector': [2.635672, 2.238828, 1.45525]}\nC: {'rotation matrix': [[0.999992450941996, 0.003938168732013472, 0.0002619981628988685], [-0.003940060772482587, 0.9999806955317443, 0.004722690751934828], [-0.00024268998302535557, -0.004722987499164323, 0.9999892225354342]], 'translation vector': [-0.005978537327156946, -0.0007775878287423765, 0.0022633181070532693]}\nD: {'rotation matrix': [[0.636585, -0.368058, 0.677712], [-0.771071, -0.287285, 0.568258], [-0.014455, -0.884308, -0.46668]], 'translation vector': [2.636608, 2.236841, 1.454577]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_133_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_133_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_133_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_133_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[-0.354317, -0.208867, 0.911501], [-0.934632, 0.110757, -0.337929], [-0.030372, -0.971652, -0.234457]], 'translation vector': [0.531753, 4.839624, 1.62588]}\nB: {'rotation matrix': [[-0.359065, -0.216471, 0.907862], [-0.932968, 0.109695, -0.342839], [-0.025373, -0.970107, -0.241348]], 'translation vector': [0.533016, 4.840936, 1.625213]}\nC: {'rotation matrix': [[0.9999996059479784, -0.0012011060402040321, 0.0006047856645127561], [0.0011984060508814654, 0.9999921678086844, 0.003627178785230534], [-0.000607755557205814, -0.0036253860701085153, 0.999994001644102]], 'translation vector': [-0.0020457167014393818, -0.01042060812880563, 0.003252619468668172]}\nD: {'rotation matrix': [[-0.356177, -0.213479, 0.909706], [-0.934025, 0.10958, -0.339984], [-0.027106, -0.970783, -0.238424]], 'translation vector': [0.532497, 4.839391, 1.625248]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.354317, -0.208867, 0.911501], [-0.934632, 0.110757, -0.337929], [-0.030372, -0.971652, -0.234457]], 'translation vector': [0.531753, 4.839624, 1.62588]}\nB: {'rotation matrix': [[-0.359065, -0.216471, 0.907862], [-0.932968, 0.109695, -0.342839], [-0.025373, -0.970107, -0.241348]], 'translation vector': [0.533016, 4.840936, 1.625213]}\nC: {'rotation matrix': [[0.9999996059479784, -0.0012011060402040321, 0.0006047856645127561], [0.0011984060508814654, 0.9999921678086844, 0.003627178785230534], [-0.000607755557205814, -0.0036253860701085153, 0.999994001644102]], 'translation vector': [-0.0020457167014393818, -0.01042060812880563, 0.003252619468668172]}\nD: {'rotation matrix': [[-0.356177, -0.213479, 0.909706], [-0.934025, 0.10958, -0.339984], [-0.027106, -0.970783, -0.238424]], 'translation vector': [0.532497, 4.839391, 1.625248]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_134_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_134_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_134_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_134_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999831834333464, -3.507485354961902e-05, -0.005860591935981322], [1.2894070442508321e-05, 0.999992942835117, -0.003855834283817326], [0.00585975547828403, 0.003855906716345711, 0.9999751812718962]], 'translation vector': [-9.176265998767086e-05, -0.003526493044568424, -0.0013563859537040202]}\nB: {'rotation matrix': [[-0.77208, 0.081888, -0.630228], [0.634233, 0.036058, -0.772301], [-0.040517, -0.995989, -0.079776]], 'translation vector': [4.355151, 2.275217, 1.510745]}\nC: {'rotation matrix': [[-0.769009, 0.085964, -0.633432], [0.638035, 0.042436, -0.768838], [-0.039212, 
-0.995394, -0.087482]], 'translation vector': [4.353152, 2.272772, 1.50454]}\nD: {'rotation matrix': [[-0.770144, 0.083681, -0.632358], [0.636632, 0.03909, -0.770176], [-0.03973, -0.995726, -0.083379]], 'translation vector': [4.354443, 2.273597, 1.508503]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999831834333464, -3.507485354961902e-05, -0.005860591935981322], [1.2894070442508321e-05, 0.999992942835117, -0.003855834283817326], [0.00585975547828403, 0.003855906716345711, 0.9999751812718962]], 'translation vector': [-9.176265998767086e-05, -0.003526493044568424, -0.0013563859537040202]}\nB: {'rotation matrix': [[-0.77208, 0.081888, -0.630228], [0.634233, 0.036058, -0.772301], [-0.040517, -0.995989, -0.079776]], 'translation vector': [4.355151, 2.275217, 1.510745]}\nC: {'rotation matrix': [[-0.769009, 0.085964, -0.633432], [0.638035, 0.042436, -0.768838], [-0.039212, -0.995394, -0.087482]], 'translation vector': [4.353152, 2.272772, 1.50454]}\nD: {'rotation matrix': [[-0.770144, 0.083681, -0.632358], [0.636632, 0.03909, -0.770176], [-0.03973, -0.995726, -0.083379]], 'translation vector': [4.354443, 2.273597, 1.508503]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_135_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_135_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_135_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_135_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[-0.872251, 0.269436, -0.408146], [0.489057, 0.477878, -0.729696], [-0.001562, -0.836084, -0.548599]], 'translation vector': [2.680995, 3.11951, 1.281605]}\nB: {'rotation matrix': [[0.9999175427731898, 0.009289450845399635, 0.008882740398963147], [-0.00915148449802066, 0.9998381508008006, -0.015456879997285015], [-0.00902528905222651, 0.015374430101541683, 0.9998409413313208]], 'translation vector': [-0.02453370105938546, 0.014905487027389919, -0.03059606364374634]}\nC: {'rotation matrix': [[-0.872521, 0.262383, -0.412143], [0.488544, 0.478168, -0.729849], [0.005573, -0.838159, -0.545398]], 'translation vector': [2.690634, 3.125973, 1.284562]}\nD: {'rotation matrix': [[-0.871338, 0.255355, -0.419003], [0.490471, 0.478377, -0.728419], [0.014436, -0.840208, -0.542072]], 'translation vector': [2.702949, 3.129856, 1.287257]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.872251, 0.269436, -0.408146], [0.489057, 0.477878, -0.729696], [-0.001562, -0.836084, -0.548599]], 'translation vector': [2.680995, 3.11951, 1.281605]}\nB: {'rotation matrix': [[0.9999175427731898, 0.009289450845399635, 0.008882740398963147], [-0.00915148449802066, 0.9998381508008006, -0.015456879997285015], [-0.00902528905222651, 0.015374430101541683, 0.9998409413313208]], 'translation vector': [-0.02453370105938546, 0.014905487027389919, -0.03059606364374634]}\nC: {'rotation matrix': [[-0.872521, 0.262383, -0.412143], [0.488544, 0.478168, -0.729849], [0.005573, -0.838159, -0.545398]], 'translation vector': [2.690634, 3.125973, 1.284562]}\nD: {'rotation matrix': [[-0.871338, 0.255355, -0.419003], [0.490471, 0.478377, -0.728419], [0.014436, -0.840208, -0.542072]], 'translation vector': [2.702949, 3.129856, 1.287257]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_136_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_136_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_136_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_136_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.333056, -0.473197, 0.815573], [-0.9429, 0.170506, -0.286124], [-0.003667, -0.864299, -0.502965]], 'translation vector': [2.099262, 2.343947, 1.49878]}\nB: {'rotation matrix': [[-0.341507, -0.468371, 0.814864], [-0.939879, 0.169312, -0.296582], [0.000944, -0.867158, -0.498033]], 'translation vector': [2.09227, 2.339374, 1.500507]}\nC: {'rotation matrix': [[-0.349241, -0.464358, 0.813882], [-0.937028, 0.170072, -0.305049], [0.003234, -0.869165, -0.494512]], 'translation vector': [2.088692, 2.33782, 1.505356]}\nD: {'rotation matrix': [[0.9999635418289009, -0.005130634438046651, 
-0.00688590414124517], [0.00513692191174813, 0.9999857138339602, 0.0010725741392655886], [0.00688015226508057, -0.0011072559806139443, 0.9999748317531684]], 'translation vector': [-0.026001101753266642, -0.0071394396285136, 0.008639096069164354]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.333056, -0.473197, 0.815573], [-0.9429, 0.170506, -0.286124], [-0.003667, -0.864299, -0.502965]], 'translation vector': [2.099262, 2.343947, 1.49878]}\nB: {'rotation matrix': [[-0.341507, -0.468371, 0.814864], [-0.939879, 0.169312, -0.296582], [0.000944, -0.867158, -0.498033]], 'translation vector': [2.09227, 2.339374, 1.500507]}\nC: {'rotation matrix': [[-0.349241, -0.464358, 0.813882], [-0.937028, 0.170072, -0.305049], [0.003234, -0.869165, -0.494512]], 'translation vector': [2.088692, 2.33782, 1.505356]}\nD: {'rotation matrix': [[0.9999635418289009, -0.005130634438046651, -0.00688590414124517], [0.00513692191174813, 0.9999857138339602, 0.0010725741392655886], [0.00688015226508057, -0.0011072559806139443, 0.9999748317531684]], 'translation vector': [-0.026001101753266642, -0.0071394396285136, 0.008639096069164354]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_137_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_137_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_137_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_137_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.030402, 0.425954, -0.904234], [0.998503, -0.028211, -0.04686], [-0.045469, -0.904305, -0.424459]], 'translation vector': [2.422483, 1.358004, 3.279846]}\nB: {'rotation matrix': [[-0.030422, 0.425378, -0.904504], [0.99853, -0.027681, -0.046602], [-0.044861, -0.904592, -0.42391]], 'translation vector': [2.423117, 1.357937, 3.279462]}\nC: {'rotation matrix': [[0.9999413192700902, -0.00031349790801336034, -0.010858677567769605], [0.00020550611921538246, 0.9999497154111819, -0.010006476843209223], [0.010861290904906616, 0.010003092803207396, 0.9998904023572843]], 'translation vector': [-0.0025713849698387747, -0.003845445277962156, -0.00016886354172340745]}\nD: {'rotation matrix': [[-0.029484, 0.425058, -0.904686], [0.998566, -0.027942, -0.045671], [-0.044692, -0.904735, -0.423624]], 'translation vector': [2.421348, 1.3572, 3.28135]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.030402, 0.425954, -0.904234], [0.998503, -0.028211, -0.04686], [-0.045469, -0.904305, -0.424459]], 'translation vector': [2.422483, 1.358004, 3.279846]}\nB: {'rotation matrix': [[-0.030422, 0.425378, -0.904504], [0.99853, -0.027681, -0.046602], [-0.044861, -0.904592, -0.42391]], 'translation vector': [2.423117, 1.357937, 3.279462]}\nC: {'rotation matrix': [[0.9999413192700902, -0.00031349790801336034, -0.010858677567769605], [0.00020550611921538246, 0.9999497154111819, -0.010006476843209223], [0.010861290904906616, 0.010003092803207396, 0.9998904023572843]], 'translation vector': [-0.0025713849698387747, -0.003845445277962156, -0.00016886354172340745]}\nD: {'rotation matrix': [[-0.029484, 0.425058, -0.904686], [0.998566, -0.027942, -0.045671], [-0.044692, -0.904735, -0.423624]], 'translation vector': [2.421348, 1.3572, 3.28135]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_138_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_138_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_138_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_138_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.765303, 0.126374, -0.631143], [0.606826, -0.468638, 0.641982], [-0.214648, -0.874305, -0.435336]], 'translation vector': [4.259322, 3.776065, 1.503445]}\nB: {'rotation matrix': [[0.771053, 0.12608, -0.624165], [0.599963, -0.472273, 0.645757], [-0.213359, -0.872388, -0.439791]], 'translation vector': [4.254354, 3.773882, 1.500145]}\nC: {'rotation matrix': [[0.9999284250414853, 0.0013159481143052354, -0.011853419692681406], [-0.0013716672277964584, 0.9999877220001062, -0.004620807232633075], [0.011847871043956123, 0.004636177506016426, 0.9999191533457081]], 'translation 
vector': [-0.0029675207616195465, 0.002877998549804417, -0.005058945419356364]}\nD: {'rotation matrix': [[0.774333, 0.127442, -0.619813], [0.595393, -0.478434, 0.645452], [-0.214281, -0.868826, -0.446345]], 'translation vector': [4.253978, 3.779827, 1.501383]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.765303, 0.126374, -0.631143], [0.606826, -0.468638, 0.641982], [-0.214648, -0.874305, -0.435336]], 'translation vector': [4.259322, 3.776065, 1.503445]}\nB: {'rotation matrix': [[0.771053, 0.12608, -0.624165], [0.599963, -0.472273, 0.645757], [-0.213359, -0.872388, -0.439791]], 'translation vector': [4.254354, 3.773882, 1.500145]}\nC: {'rotation matrix': [[0.9999284250414853, 0.0013159481143052354, -0.011853419692681406], [-0.0013716672277964584, 0.9999877220001062, -0.004620807232633075], [0.011847871043956123, 0.004636177506016426, 0.9999191533457081]], 'translation vector': [-0.0029675207616195465, 0.002877998549804417, -0.005058945419356364]}\nD: {'rotation matrix': [[0.774333, 0.127442, -0.619813], [0.595393, -0.478434, 0.645452], [-0.214281, -0.868826, -0.446345]], 'translation vector': [4.253978, 3.779827, 1.501383]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_139_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_139_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_139_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_139_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[-0.90788, 0.151333, -0.390964], [0.408575, 0.110464, -0.906016], [-0.093922, -0.982291, -0.162118]], 'translation vector': [8.818443, 3.831761, 1.477683]}\nB: {'rotation matrix': [[0.9999985311997388, 0.0014822343891824376, 0.0004132906130215321], [-0.0014816934035833144, 0.9999987577699637, 9.7059989439923e-05], [-0.00041331535706021807, -9.78109674475786e-05, 1.0000005199156523]], 'translation vector': [-0.004405482725633902, -0.00022188509424603264, 0.00016042860388854052]}\nC: {'rotation matrix': [[-0.90752, 0.150251, -0.392214], [0.409699, 0.111045, -0.905437], [-0.092489, -0.982392, -0.162333]], 'translation vector': [8.816371, 3.832904, 1.475888]}\nD: {'rotation matrix': [[-0.907271, 0.148469, -0.393466], [0.410673, 0.111241, -0.904972], [-0.090591, -0.982641, -0.161898]], 'translation vector': [8.814532, 3.834109, 1.474353]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.90788, 0.151333, -0.390964], [0.408575, 0.110464, -0.906016], [-0.093922, -0.982291, -0.162118]], 'translation vector': [8.818443, 3.831761, 1.477683]}\nB: {'rotation matrix': [[0.9999985311997388, 0.0014822343891824376, 0.0004132906130215321], [-0.0014816934035833144, 0.9999987577699637, 9.7059989439923e-05], [-0.00041331535706021807, -9.78109674475786e-05, 1.0000005199156523]], 'translation vector': [-0.004405482725633902, -0.00022188509424603264, 0.00016042860388854052]}\nC: {'rotation matrix': [[-0.90752, 0.150251, -0.392214], [0.409699, 0.111045, -0.905437], [-0.092489, -0.982392, -0.162333]], 'translation vector': [8.816371, 3.832904, 1.475888]}\nD: {'rotation matrix': [[-0.907271, 0.148469, -0.393466], [0.410673, 0.111241, -0.904972], [-0.090591, -0.982641, -0.161898]], 'translation vector': [8.814532, 3.834109, 1.474353]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_140_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_140_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_140_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_140_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.999990277128188, -0.004263613841972057, 0.00022215499723240068], [0.004262324646153357, 0.9999855274813817, 0.003172824698500956], [-0.00023539502566653948, -0.003171844248534696, 0.9999944918809965]], 'translation vector': [0.0008976741232249452, 0.001107833658419377, 0.002287318056557207]}\nB: {'rotation matrix': [[0.982661, 0.058297, -0.176007], [0.185241, -0.268064, 0.945425], [0.007934, -0.961636, -0.274215]], 'translation vector': [4.071507, 1.217171, 1.479186]}\nC: {'rotation matrix': [[0.982484, 0.05703, -0.177406], [0.186213, -0.264319, 0.946288], [0.007075, 
-0.962748, -0.270309]], 'translation vector': [4.071419, 1.216069, 1.480649]}\nD: {'rotation matrix': [[0.98266, 0.058843, -0.175828], [0.185228, -0.2691, 0.945133], [0.008299, -0.961313, -0.275333]], 'translation vector': [4.071304, 1.217707, 1.478697]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.999990277128188, -0.004263613841972057, 0.00022215499723240068], [0.004262324646153357, 0.9999855274813817, 0.003172824698500956], [-0.00023539502566653948, -0.003171844248534696, 0.9999944918809965]], 'translation vector': [0.0008976741232249452, 0.001107833658419377, 0.002287318056557207]}\nB: {'rotation matrix': [[0.982661, 0.058297, -0.176007], [0.185241, -0.268064, 0.945425], [0.007934, -0.961636, -0.274215]], 'translation vector': [4.071507, 1.217171, 1.479186]}\nC: {'rotation matrix': [[0.982484, 0.05703, -0.177406], [0.186213, -0.264319, 0.946288], [0.007075, -0.962748, -0.270309]], 'translation vector': [4.071419, 1.216069, 1.480649]}\nD: {'rotation matrix': [[0.98266, 0.058843, -0.175828], [0.185228, -0.2691, 0.945133], [0.008299, -0.961313, -0.275333]], 'translation vector': [4.071304, 1.217707, 1.478697]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_141_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_141_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_141_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_141_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.566945, -0.119787, 0.815], [-0.823733, -0.07511, 0.561981], [-0.006103, -0.989954, -0.141256]], 'translation vector': [0.25398, 0.970235, 1.632712]}\nB: {'rotation matrix': [[0.566333, -0.122518, 0.81502], [-0.824133, -0.073956, 0.561548], [-0.008524, -0.989707, -0.142854]], 'translation vector': [0.252647, 0.969528, 1.633147]}\nC: {'rotation matrix': [[0.565918, -0.124531, 0.815003], [-0.824401, -0.073416, 0.561226], [-0.010056, -0.989496, -0.144211]], 'translation vector': [0.251636, 0.969331, 1.634009]}\nD: {'rotation matrix': [[0.9999988126320164, 0.0003312007428265276, -0.0004969284405179408], [-0.00033114016656371985, 0.9999998966821256, -0.0003458941517538565], [0.0004968245827711156, 0.000346552992150063, 0.9999997733502279]], 'translation vector': [5.9331917010907453e-05, -0.0008045063572443834, -0.0004101833402060384]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.566945, -0.119787, 0.815], [-0.823733, -0.07511, 0.561981], [-0.006103, -0.989954, -0.141256]], 'translation vector': [0.25398, 0.970235, 1.632712]}\nB: {'rotation matrix': [[0.566333, -0.122518, 0.81502], [-0.824133, -0.073956, 0.561548], [-0.008524, -0.989707, -0.142854]], 'translation vector': [0.252647, 0.969528, 1.633147]}\nC: {'rotation matrix': [[0.565918, -0.124531, 0.815003], [-0.824401, -0.073416, 0.561226], [-0.010056, -0.989496, -0.144211]], 'translation vector': [0.251636, 0.969331, 1.634009]}\nD: {'rotation matrix': [[0.9999988126320164, 0.0003312007428265276, -0.0004969284405179408], [-0.00033114016656371985, 0.9999998966821256, -0.0003458941517538565], [0.0004968245827711156, 0.000346552992150063, 0.9999997733502279]], 'translation vector': [5.9331917010907453e-05, -0.0008045063572443834, -0.0004101833402060384]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_142_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_142_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_142_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_142_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999807090620153, 0.004568922096108109, 0.004345106491864972], [-0.004581791622211755, 0.9999843128892885, 0.0029713844388234126], [-0.00433269576861675, -0.002991018231335416, 0.9999858202835764]], 'translation vector': [-0.011188164884389007, 0.009307427071622687, -0.0007429783939219003]}\nB: {'rotation matrix': [[-0.942483, -0.17354, 0.285674], [-0.333358, 0.550552, -0.765353], [-0.024459, -0.816564, -0.576737]], 'translation vector': [2.733535, 1.660706, 1.301168]}\nC: {'rotation matrix': [[-0.942594, -0.174193, 0.284909], [-0.333124, 0.550105, -0.765776], [-0.023337, 
-0.816726, -0.576554]], 'translation vector': [2.730048, 1.657302, 1.301829]}\nD: {'rotation matrix': [[-0.942586, -0.174069, 0.285013], [-0.333135, 0.550225, -0.765686], [-0.023539, -0.816672, -0.576622]], 'translation vector': [2.726519, 1.654368, 1.301906]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999807090620153, 0.004568922096108109, 0.004345106491864972], [-0.004581791622211755, 0.9999843128892885, 0.0029713844388234126], [-0.00433269576861675, -0.002991018231335416, 0.9999858202835764]], 'translation vector': [-0.011188164884389007, 0.009307427071622687, -0.0007429783939219003]}\nB: {'rotation matrix': [[-0.942483, -0.17354, 0.285674], [-0.333358, 0.550552, -0.765353], [-0.024459, -0.816564, -0.576737]], 'translation vector': [2.733535, 1.660706, 1.301168]}\nC: {'rotation matrix': [[-0.942594, -0.174193, 0.284909], [-0.333124, 0.550105, -0.765776], [-0.023337, -0.816726, -0.576554]], 'translation vector': [2.730048, 1.657302, 1.301829]}\nD: {'rotation matrix': [[-0.942586, -0.174069, 0.285013], [-0.333135, 0.550225, -0.765686], [-0.023539, -0.816672, -0.576622]], 'translation vector': [2.726519, 1.654368, 1.301906]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_143_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_143_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_143_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_143_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": 
"SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.662528, 0.400848, -0.632754], [0.747812, -0.402283, 0.528154], [-0.042837, -0.823097, -0.566283]], 'translation vector': [1.744816, 2.25794, 1.331918]}\nB: {'rotation matrix': [[0.662009, 0.40559, -0.630271], [0.748112, -0.408664, 0.522802], [-0.045526, -0.817613, -0.573966]], 'translation vector': [1.743048, 2.25768, 1.329749]}\nC: {'rotation matrix': [[0.6641, 0.398897, -0.632339], [0.746639, -0.397674, 0.533278], [-0.038742, -0.826279, -0.561927]], 'translation vector': [1.744615, 2.258795, 1.335923]}\nD: {'rotation matrix': [[0.9999981013423613, 0.0016775919055887754, -0.0009118672263504091], [-0.0016584483846788572, 0.999788802812158, 0.020485068642363036], [0.0009453490386347853, -0.020482492872648857, 0.9997898845593165]], 'translation vector': [0.0006281413363966593, 0.0014761974203534312, 0.005568137578716104]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.662528, 0.400848, -0.632754], [0.747812, -0.402283, 0.528154], [-0.042837, -0.823097, -0.566283]], 'translation vector': [1.744816, 2.25794, 1.331918]}\nB: {'rotation matrix': [[0.662009, 0.40559, -0.630271], [0.748112, -0.408664, 0.522802], [-0.045526, -0.817613, -0.573966]], 'translation vector': [1.743048, 2.25768, 1.329749]}\nC: {'rotation matrix': [[0.6641, 0.398897, -0.632339], [0.746639, -0.397674, 0.533278], [-0.038742, -0.826279, -0.561927]], 'translation vector': [1.744615, 2.258795, 1.335923]}\nD: {'rotation matrix': [[0.9999981013423613, 0.0016775919055887754, -0.0009118672263504091], [-0.0016584483846788572, 0.999788802812158, 0.020485068642363036], [0.0009453490386347853, -0.020482492872648857, 0.9997898845593165]], 'translation vector': [0.0006281413363966593, 0.0014761974203534312, 0.005568137578716104]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_144_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_144_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_144_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_144_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.44362, -0.475187, 0.759868], [-0.895973, 0.254861, -0.363701], [-0.020835, -0.842166, -0.538816]], 'translation vector': [2.452095, 1.901161, 1.451891]}\nB: {'rotation matrix': [[-0.440436, -0.475273, 0.761664], [-0.897497, 0.254555, -0.360142], [-0.02272, -0.84221, -0.538671]], 'translation vector': [2.449051, 1.900731, 1.449924]}\nC: {'rotation matrix': [[-0.441002, -0.475612, 0.761125], [-0.897234, 0.254511, -0.360825], [-0.022102, -0.842032, -0.538975]], 'translation vector': [2.451296, 1.899939, 1.450426]}\nD: {'rotation matrix': [[0.9999985864023682, -0.0015621221187893434, 
0.0006936316269794283], [0.0015661737335896364, 0.9999849325621631, -0.005207380366478063], [-0.0006858389600734047, 0.005207931011548828, 0.9999859575854745]], 'translation vector': [-0.0017105359831024458, -0.002297103154811353, -0.000983146020886283]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.44362, -0.475187, 0.759868], [-0.895973, 0.254861, -0.363701], [-0.020835, -0.842166, -0.538816]], 'translation vector': [2.452095, 1.901161, 1.451891]}\nB: {'rotation matrix': [[-0.440436, -0.475273, 0.761664], [-0.897497, 0.254555, -0.360142], [-0.02272, -0.84221, -0.538671]], 'translation vector': [2.449051, 1.900731, 1.449924]}\nC: {'rotation matrix': [[-0.441002, -0.475612, 0.761125], [-0.897234, 0.254511, -0.360825], [-0.022102, -0.842032, -0.538975]], 'translation vector': [2.451296, 1.899939, 1.450426]}\nD: {'rotation matrix': [[0.9999985864023682, -0.0015621221187893434, 0.0006936316269794283], [0.0015661737335896364, 0.9999849325621631, -0.005207380366478063], [-0.0006858389600734047, 0.005207931011548828, 0.9999859575854745]], 'translation vector': [-0.0017105359831024458, -0.002297103154811353, -0.000983146020886283]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_145_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_145_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_145_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_145_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[0.648777, 0.514326, -0.560854], [0.760441, -0.465898, 0.452404], [-0.028617, -0.720006, -0.693378]], 'translation vector': [1.800914, 1.822078, 1.233863]}\nB: {'rotation matrix': [[0.644427, 0.520052, -0.560589], [0.76413, -0.465418, 0.446645], [-0.028629, -0.716192, -0.697315]], 'translation vector': [1.79848, 1.820985, 1.232666]}\nC: {'rotation matrix': [[0.9998704626137827, 0.011052401167760776, -0.011708701735332047], [-0.011032627408678441, 0.9999372608185829, 0.0017110617555067353], [0.011728123043277196, -0.001580982237636182, 0.9999298219541505]], 'translation vector': [-0.004951638312360451, 0.0003388210922784518, 0.0025384925972402606]}\nD: {'rotation matrix': [[0.639937, 0.524455, -0.56163], [0.767882, -0.464002, 0.441656], [-0.028969, -0.713897, -0.699651]], 'translation vector': [1.797021, 1.819882, 1.231178]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.648777, 0.514326, -0.560854], [0.760441, -0.465898, 0.452404], [-0.028617, -0.720006, -0.693378]], 'translation vector': [1.800914, 1.822078, 1.233863]}\nB: {'rotation matrix': [[0.644427, 0.520052, -0.560589], [0.76413, -0.465418, 0.446645], [-0.028629, -0.716192, -0.697315]], 'translation vector': [1.79848, 1.820985, 1.232666]}\nC: {'rotation matrix': [[0.9998704626137827, 0.011052401167760776, -0.011708701735332047], [-0.011032627408678441, 0.9999372608185829, 0.0017110617555067353], [0.011728123043277196, -0.001580982237636182, 0.9999298219541505]], 'translation vector': [-0.004951638312360451, 0.0003388210922784518, 0.0025384925972402606]}\nD: {'rotation matrix': [[0.639937, 0.524455, -0.56163], [0.767882, -0.464002, 0.441656], [-0.028969, -0.713897, -0.699651]], 'translation vector': [1.797021, 1.819882, 1.231178]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_146_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_146_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_146_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_146_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.480833, -0.463789, 0.74411], [-0.876135, 0.287476, -0.386968], [-0.034442, -0.838008, -0.54457]], 'translation vector': [3.084943, 2.078791, 1.469333]}\nB: {'rotation matrix': [[0.9999951018718091, -0.0022147244398398017, -0.0018850296839706259], [0.002209800973893798, 0.9999949398931387, -0.0022506347742728594], [0.0018898473296588083, 0.0022461573677036097, 0.9999958361862953]], 'translation vector': [0.004387368548260717, 1.4705105160661702e-05, 0.0008301488900060994]}\nC: {'rotation matrix': [[-0.476687, -0.464053, 0.746608], [-0.878377, 0.285219, -0.383541], [-0.034963, 
-0.838633, -0.543574]], 'translation vector': [3.080459, 2.078543, 1.469168]}\nD: {'rotation matrix': [[-0.479567, -0.463643, 0.745017], [-0.876821, 0.286706, -0.385985], [-0.034641, -0.838352, -0.544027]], 'translation vector': [3.083795, 2.079285, 1.469908]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.480833, -0.463789, 0.74411], [-0.876135, 0.287476, -0.386968], [-0.034442, -0.838008, -0.54457]], 'translation vector': [3.084943, 2.078791, 1.469333]}\nB: {'rotation matrix': [[0.9999951018718091, -0.0022147244398398017, -0.0018850296839706259], [0.002209800973893798, 0.9999949398931387, -0.0022506347742728594], [0.0018898473296588083, 0.0022461573677036097, 0.9999958361862953]], 'translation vector': [0.004387368548260717, 1.4705105160661702e-05, 0.0008301488900060994]}\nC: {'rotation matrix': [[-0.476687, -0.464053, 0.746608], [-0.878377, 0.285219, -0.383541], [-0.034963, -0.838633, -0.543574]], 'translation vector': [3.080459, 2.078543, 1.469168]}\nD: {'rotation matrix': [[-0.479567, -0.463643, 0.745017], [-0.876821, 0.286706, -0.385985], [-0.034641, -0.838352, -0.544027]], 'translation vector': [3.083795, 2.079285, 1.469908]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_147_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_147_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_147_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_147_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": 
"SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.724797, -0.022386, -0.688599], [0.687785, -0.034918, 0.725075], [-0.040276, -0.999139, -0.009911]], 'translation vector': [1.871804, 0.814995, 1.597738]}\nB: {'rotation matrix': [[0.729664, -0.019712, -0.683522], [0.682722, -0.035273, 0.729827], [-0.038496, -0.999183, -0.01228]], 'translation vector': [1.870321, 0.812422, 1.590842]}\nC: {'rotation matrix': [[0.9999971996131989, 0.0007346987095039795, -0.0017367305867328272], [-0.0007296724426740893, 0.9999944712211791, 0.003354984500413017], [0.0017384108444949038, -0.003353839188726123, 0.9999931577143994]], 'translation vector': [-0.0004548504518708807, 0.001951786856664972, -7.749986575322776e-05]}\nD: {'rotation matrix': [[0.728234, -0.022145, -0.684971], [0.684198, -0.033912, 0.728508], [-0.039361, -0.99918, -0.009544]], 'translation vector': [1.869489, 0.812101, 1.591189]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.724797, -0.022386, -0.688599], [0.687785, -0.034918, 0.725075], [-0.040276, -0.999139, -0.009911]], 'translation vector': [1.871804, 0.814995, 1.597738]}\nB: {'rotation matrix': [[0.729664, -0.019712, -0.683522], [0.682722, -0.035273, 0.729827], [-0.038496, -0.999183, -0.01228]], 'translation vector': [1.870321, 0.812422, 1.590842]}\nC: {'rotation matrix': [[0.9999971996131989, 0.0007346987095039795, -0.0017367305867328272], [-0.0007296724426740893, 0.9999944712211791, 0.003354984500413017], [0.0017384108444949038, -0.003353839188726123, 0.9999931577143994]], 'translation vector': [-0.0004548504518708807, 0.001951786856664972, -7.749986575322776e-05]}\nD: {'rotation matrix': [[0.728234, -0.022145, -0.684971], [0.684198, -0.033912, 0.728508], [-0.039361, -0.99918, -0.009544]], 'translation vector': [1.869489, 0.812101, 1.591189]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_148_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_148_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_148_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_148_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999774991279762, 0.0031690326046340087, -0.005996753212157376], [-0.003090362324076843, 0.9999123878108386, 0.012868971987653289], [0.006038036583570517, -0.012849517542798453, 0.9998986268025651]], 'translation vector': [-0.00016965780714706113, -0.008841569485431133, 0.004505805451807898]}\nB: {'rotation matrix': [[0.651481, -0.368876, 0.66295], [-0.758449, -0.337487, 0.557546], [0.018072, -0.866045, -0.49964]], 'translation vector': [2.471969, 4.600353, 1.449958]}\nC: {'rotation matrix': [[0.655694, -0.362412, 0.662362], [-0.754768, -0.337631, 0.562433], [0.019802, 
-0.868713, -0.49492]], 'translation vector': [2.472568, 4.599315, 1.447954]}\nD: {'rotation matrix': [[0.660006, -0.356371, 0.661356], [-0.750952, -0.338178, 0.567192], [0.021525, -0.870997, -0.490817]], 'translation vector': [2.470351, 4.598146, 1.447521]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999774991279762, 0.0031690326046340087, -0.005996753212157376], [-0.003090362324076843, 0.9999123878108386, 0.012868971987653289], [0.006038036583570517, -0.012849517542798453, 0.9998986268025651]], 'translation vector': [-0.00016965780714706113, -0.008841569485431133, 0.004505805451807898]}\nB: {'rotation matrix': [[0.651481, -0.368876, 0.66295], [-0.758449, -0.337487, 0.557546], [0.018072, -0.866045, -0.49964]], 'translation vector': [2.471969, 4.600353, 1.449958]}\nC: {'rotation matrix': [[0.655694, -0.362412, 0.662362], [-0.754768, -0.337631, 0.562433], [0.019802, -0.868713, -0.49492]], 'translation vector': [2.472568, 4.599315, 1.447954]}\nD: {'rotation matrix': [[0.660006, -0.356371, 0.661356], [-0.750952, -0.338178, 0.567192], [0.021525, -0.870997, -0.490817]], 'translation vector': [2.470351, 4.598146, 1.447521]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_149_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_149_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_149_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_149_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[-0.550599, -0.608246, 0.571732], [-0.834661, 0.412205, -0.365279], [-0.013491, -0.678325, -0.734639]], 'translation vector': [2.153644, 1.764514, 1.342866]}\nB: {'rotation matrix': [[0.9999831845983815, -0.004759897354970956, 0.0032519818043716545], [0.004774936203352942, 0.9999789289359345, -0.004542082720936493], [-0.003229136166379075, 0.004558818034209968, 0.9999846215935988]], 'translation vector': [0.0033377456840164577, 0.0021763424534348985, -0.0009933558524138353]}\nC: {'rotation matrix': [[-0.553936, -0.603859, 0.573158], [-0.832399, 0.415212, -0.36703], [-0.016348, -0.680407, -0.732652]], 'translation vector': [2.15044, 1.76409, 1.342895]}\nD: {'rotation matrix': [[-0.551929, -0.606401, 0.572409], [-0.833765, 0.413226, -0.366169], [-0.014489, -0.679354, -0.733668]], 'translation vector': [2.152355, 1.764444, 1.342815]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.550599, -0.608246, 0.571732], [-0.834661, 0.412205, -0.365279], [-0.013491, -0.678325, -0.734639]], 'translation vector': [2.153644, 1.764514, 1.342866]}\nB: {'rotation matrix': [[0.9999831845983815, -0.004759897354970956, 0.0032519818043716545], [0.004774936203352942, 0.9999789289359345, -0.004542082720936493], [-0.003229136166379075, 0.004558818034209968, 0.9999846215935988]], 'translation vector': [0.0033377456840164577, 0.0021763424534348985, -0.0009933558524138353]}\nC: {'rotation matrix': [[-0.553936, -0.603859, 0.573158], [-0.832399, 0.415212, -0.36703], [-0.016348, -0.680407, -0.732652]], 'translation vector': [2.15044, 1.76409, 1.342895]}\nD: {'rotation matrix': [[-0.551929, -0.606401, 0.572409], [-0.833765, 0.413226, -0.366169], [-0.014489, -0.679354, -0.733668]], 'translation vector': [2.152355, 1.764444, 1.342815]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_150_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_150_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_150_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_150_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.254539, -0.436075, 0.863162], [-0.966632, 0.141348, -0.213642], [-0.028842, -0.88874, -0.457503]], 'translation vector': [1.735428, 0.748356, 1.43337]}\nB: {'rotation matrix': [[-0.254681, -0.434936, 0.863695], [-0.966604, 0.140846, -0.2141], [-0.028528, -0.889378, -0.456282]], 'translation vector': [1.735372, 0.748905, 1.433089]}\nC: {'rotation matrix': [[1.0000001555675329, -0.0009323657310693132, 0.0004792288755131117], [0.0009313044295569888, 0.9999972868981549, 0.0021526068874492825], [-0.0004812492477208415, -0.0021517811624316304, 0.9999974478268276]], 'translation 
vector': [0.0025109364715583116, 0.0011773606935274739, 0.0008952278670837366]}\nD: {'rotation matrix': [[-0.254466, -0.434667, 0.863893], [-0.966638, 0.141384, -0.213593], [-0.029299, -0.889424, -0.456143]], 'translation vector': [1.735598, 0.749181, 1.433436]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.254539, -0.436075, 0.863162], [-0.966632, 0.141348, -0.213642], [-0.028842, -0.88874, -0.457503]], 'translation vector': [1.735428, 0.748356, 1.43337]}\nB: {'rotation matrix': [[-0.254681, -0.434936, 0.863695], [-0.966604, 0.140846, -0.2141], [-0.028528, -0.889378, -0.456282]], 'translation vector': [1.735372, 0.748905, 1.433089]}\nC: {'rotation matrix': [[1.0000001555675329, -0.0009323657310693132, 0.0004792288755131117], [0.0009313044295569888, 0.9999972868981549, 0.0021526068874492825], [-0.0004812492477208415, -0.0021517811624316304, 0.9999974478268276]], 'translation vector': [0.0025109364715583116, 0.0011773606935274739, 0.0008952278670837366]}\nD: {'rotation matrix': [[-0.254466, -0.434667, 0.863893], [-0.966638, 0.141384, -0.213593], [-0.029299, -0.889424, -0.456143]], 'translation vector': [1.735598, 0.749181, 1.433436]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_151_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_151_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_151_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_151_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": 
"SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.874972, 0.120483, -0.468943], [0.467945, 0.459094, -0.755156], [0.124305, -0.88018, -0.458074]], 'translation vector': [3.924764, 3.210204, 1.74232]}\nB: {'rotation matrix': [[-0.872887, 0.124749, -0.471706], [0.47258, 0.456684, -0.75373], [0.121394, -0.880839, -0.457587]], 'translation vector': [3.924155, 3.192614, 1.742181]}\nC: {'rotation matrix': [[-0.873832, 0.122242, -0.470612], [0.470279, 0.458348, -0.754158], [0.123514, -0.880326, -0.458007]], 'translation vector': [3.924731, 3.201921, 1.742944]}\nD: {'rotation matrix': [[0.9999981203399105, -2.162890380847432e-05, -0.0011856852248069943], [2.6074892574446623e-05, 0.9999907421564089, 0.004166234818762547], [0.0011853825032994, -0.0041669736288708105, 0.9999895516522684]], 'translation vector': [0.0031018447373007962, -0.004160693298826068, -0.00448172332630925]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.874972, 0.120483, -0.468943], [0.467945, 0.459094, -0.755156], [0.124305, -0.88018, -0.458074]], 'translation vector': [3.924764, 3.210204, 1.74232]}\nB: {'rotation matrix': [[-0.872887, 0.124749, -0.471706], [0.47258, 0.456684, -0.75373], [0.121394, -0.880839, -0.457587]], 'translation vector': [3.924155, 3.192614, 1.742181]}\nC: {'rotation matrix': [[-0.873832, 0.122242, -0.470612], [0.470279, 0.458348, -0.754158], [0.123514, -0.880326, -0.458007]], 'translation vector': [3.924731, 3.201921, 1.742944]}\nD: {'rotation matrix': [[0.9999981203399105, -2.162890380847432e-05, -0.0011856852248069943], [2.6074892574446623e-05, 0.9999907421564089, 0.004166234818762547], [0.0011853825032994, -0.0041669736288708105, 0.9999895516522684]], 'translation vector': [0.0031018447373007962, -0.004160693298826068, -0.00448172332630925]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_152_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_152_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_152_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_152_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.910706, 0.180669, -0.371448], [0.412445, 0.446604, -0.793999], [0.022439, -0.876301, -0.481241]], 'translation vector': [3.200821, 1.957629, 1.277707]}\nB: {'rotation matrix': [[-0.909073, 0.181004, -0.375266], [0.415983, 0.444783, -0.793175], [0.023344, -0.877158, -0.479635]], 'translation vector': [3.199567, 1.957217, 1.278564]}\nC: {'rotation matrix': [[-0.912991, 0.180032, -0.36611], [0.407432, 0.448869, -0.795309], [0.021154, -0.875275, -0.483163]], 'translation vector': [3.201129, 1.957814, 1.275703]}\nD: {'rotation matrix': [[0.9998091200876347, 0.0024899817556151226, 
-0.01932364538042797], [-0.0024534091295022355, 0.9999947392396931, 0.0019198938596762963], [0.019328984816227378, -0.0018724870057808746, 0.9998116886101109]], 'translation vector': [-0.0018727616018998638, 0.007005445282938005, 8.97167203275373e-05]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.910706, 0.180669, -0.371448], [0.412445, 0.446604, -0.793999], [0.022439, -0.876301, -0.481241]], 'translation vector': [3.200821, 1.957629, 1.277707]}\nB: {'rotation matrix': [[-0.909073, 0.181004, -0.375266], [0.415983, 0.444783, -0.793175], [0.023344, -0.877158, -0.479635]], 'translation vector': [3.199567, 1.957217, 1.278564]}\nC: {'rotation matrix': [[-0.912991, 0.180032, -0.36611], [0.407432, 0.448869, -0.795309], [0.021154, -0.875275, -0.483163]], 'translation vector': [3.201129, 1.957814, 1.275703]}\nD: {'rotation matrix': [[0.9998091200876347, 0.0024899817556151226, -0.01932364538042797], [-0.0024534091295022355, 0.9999947392396931, 0.0019198938596762963], [0.019328984816227378, -0.0018724870057808746, 0.9998116886101109]], 'translation vector': [-0.0018727616018998638, 0.007005445282938005, 8.97167203275373e-05]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_153_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_153_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_153_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_153_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": 
"A: {'rotation matrix': [[0.9999837026106017, -0.005604330268430811, -0.00017160014562389917], [0.005604883614299803, 0.9999826757832918, 0.0019021058633347495], [0.00016097085173832394, -0.0019036155522551726, 0.9999985288928479]], 'translation vector': [-0.0002532483433466126, 0.009030133518173555, 0.0039788864378584865]}\nB: {'rotation matrix': [[-0.221399, -0.409647, 0.88497], [-0.971033, 0.176243, -0.161347], [-0.089875, -0.895057, -0.436801]], 'translation vector': [2.157726, 10.114248, 1.730212]}\nC: {'rotation matrix': [[-0.233773, -0.418838, 0.877454], [-0.967259, 0.191883, -0.166106], [-0.098797, -0.887556, -0.449982]], 'translation vector': [2.163177, 10.11361, 1.729991]}\nD: {'rotation matrix': [[-0.208812, -0.406957, 0.88926], [-0.974382, 0.164243, -0.153636], [-0.083532, -0.898561, -0.430827]], 'translation vector': [2.154821, 10.118629, 1.726458]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999837026106017, -0.005604330268430811, -0.00017160014562389917], [0.005604883614299803, 0.9999826757832918, 0.0019021058633347495], [0.00016097085173832394, -0.0019036155522551726, 0.9999985288928479]], 'translation vector': [-0.0002532483433466126, 0.009030133518173555, 0.0039788864378584865]}\nB: {'rotation matrix': [[-0.221399, -0.409647, 0.88497], [-0.971033, 0.176243, -0.161347], [-0.089875, -0.895057, -0.436801]], 'translation vector': [2.157726, 10.114248, 1.730212]}\nC: {'rotation matrix': [[-0.233773, -0.418838, 0.877454], [-0.967259, 0.191883, -0.166106], [-0.098797, -0.887556, -0.449982]], 'translation vector': [2.163177, 10.11361, 1.729991]}\nD: {'rotation matrix': [[-0.208812, -0.406957, 0.88926], [-0.974382, 0.164243, -0.153636], [-0.083532, -0.898561, -0.430827]], 'translation vector': [2.154821, 10.118629, 1.726458]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_154_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_154_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_154_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_154_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.864148, -0.19236, 0.465022], [-0.502138, -0.268523, 0.822042], [-0.033259, -0.943871, -0.328635]], 'translation vector': [3.016374, 2.015361, 1.429191]}\nB: {'rotation matrix': [[0.864117, -0.192314, 0.465099], [-0.502115, -0.266283, 0.822784], [-0.034385, -0.944515, -0.326663]], 'translation vector': [3.015528, 2.015384, 1.428328]}\nC: {'rotation matrix': [[0.864639, -0.19262, 0.464001], [-0.501175, -0.266422, 0.823312], [-0.034966, -0.944414, -0.326895]], 'translation vector': [3.015996, 2.015925, 1.430969]}\nD: {'rotation matrix': [[0.9999953542146168, 
0.0030947830980374525, -0.0008131951628852412], [-0.0030994452771746766, 0.999974002426472, -0.006530521364187943], [0.00079388588123918, 0.0065324457136649635, 0.999978487244639]], 'translation vector': [-0.004468736350602409, -0.006227529584318603, -8.95755650023311e-05]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.864148, -0.19236, 0.465022], [-0.502138, -0.268523, 0.822042], [-0.033259, -0.943871, -0.328635]], 'translation vector': [3.016374, 2.015361, 1.429191]}\nB: {'rotation matrix': [[0.864117, -0.192314, 0.465099], [-0.502115, -0.266283, 0.822784], [-0.034385, -0.944515, -0.326663]], 'translation vector': [3.015528, 2.015384, 1.428328]}\nC: {'rotation matrix': [[0.864639, -0.19262, 0.464001], [-0.501175, -0.266422, 0.823312], [-0.034966, -0.944414, -0.326895]], 'translation vector': [3.015996, 2.015925, 1.430969]}\nD: {'rotation matrix': [[0.9999953542146168, 0.0030947830980374525, -0.0008131951628852412], [-0.0030994452771746766, 0.999974002426472, -0.006530521364187943], [0.00079388588123918, 0.0065324457136649635, 0.999978487244639]], 'translation vector': [-0.004468736350602409, -0.006227529584318603, -8.95755650023311e-05]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_155_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_155_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_155_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_155_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": 
"SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[1.0000001305586184, 4.504585202989606e-05, -0.0008677063367108602], [-4.562855519759043e-05, 0.9999983740715708, -0.001562168642633918], [0.0008668198225227341, 0.0015626467626293078, 0.9999978423516577]], 'translation vector': [0.004048003920170018, -0.000314296777893075, -0.001713133752151652]}\nB: {'rotation matrix': [[0.932237, 0.077212, -0.353515], [0.361617, -0.233791, 0.902538], [-0.012962, -0.969216, -0.24587]], 'translation vector': [5.874094, 3.546493, 1.351525]}\nC: {'rotation matrix': [[0.932237, 0.075134, -0.353962], [0.361548, -0.233262, 0.902703], [-0.014743, -0.969507, -0.24462]], 'translation vector': [5.877609, 3.546074, 1.353866]}\nD: {'rotation matrix': [[0.932245, 0.073757, -0.354232], [0.36147, -0.233417, 0.902694], [-0.016104, -0.969576, -0.244262]], 'translation vector': [5.878981, 3.545327, 1.355029]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[1.0000001305586184, 4.504585202989606e-05, -0.0008677063367108602], [-4.562855519759043e-05, 0.9999983740715708, -0.001562168642633918], [0.0008668198225227341, 0.0015626467626293078, 0.9999978423516577]], 'translation vector': [0.004048003920170018, -0.000314296777893075, -0.001713133752151652]}\nB: {'rotation matrix': [[0.932237, 0.077212, -0.353515], [0.361617, -0.233791, 0.902538], [-0.012962, -0.969216, -0.24587]], 'translation vector': [5.874094, 3.546493, 1.351525]}\nC: {'rotation matrix': [[0.932237, 0.075134, -0.353962], [0.361548, -0.233262, 0.902703], [-0.014743, -0.969507, -0.24462]], 'translation vector': [5.877609, 3.546074, 1.353866]}\nD: {'rotation matrix': [[0.932245, 0.073757, -0.354232], [0.36147, -0.233417, 0.902694], [-0.016104, -0.969576, -0.244262]], 'translation vector': [5.878981, 3.545327, 1.355029]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_156_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_156_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_156_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_156_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.967201, -0.070709, 0.243972], [-0.252115, 0.384385, -0.88808], [-0.030984, -0.920461, -0.389605]], 'translation vector': [2.768098, 4.614564, 1.418844]}\nB: {'rotation matrix': [[-0.968215, -0.068836, 0.240461], [-0.248279, 0.380909, -0.890655], [-0.030285, -0.922047, -0.385893]], 'translation vector': [2.770223, 4.618487, 1.418033]}\nC: {'rotation matrix': [[0.9999417292568685, -0.00016387346525461165, 0.010781387930699021], [0.00016087662233447972, 0.9999998236818622, 0.00023678654461243325], [-0.010781286291867453, -0.000235115727098794, 0.9999413646146618]], 'translation 
vector': [-0.004146218083109776, -0.009777600200150782, -9.031272009885072e-05]}\nD: {'rotation matrix': [[-0.966004, -0.071389, 0.248475], [-0.25638, 0.388148, -0.885218], [-0.03325, -0.918828, -0.393256]], 'translation vector': [2.766369, 4.610029, 1.423364]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.967201, -0.070709, 0.243972], [-0.252115, 0.384385, -0.88808], [-0.030984, -0.920461, -0.389605]], 'translation vector': [2.768098, 4.614564, 1.418844]}\nB: {'rotation matrix': [[-0.968215, -0.068836, 0.240461], [-0.248279, 0.380909, -0.890655], [-0.030285, -0.922047, -0.385893]], 'translation vector': [2.770223, 4.618487, 1.418033]}\nC: {'rotation matrix': [[0.9999417292568685, -0.00016387346525461165, 0.010781387930699021], [0.00016087662233447972, 0.9999998236818622, 0.00023678654461243325], [-0.010781286291867453, -0.000235115727098794, 0.9999413646146618]], 'translation vector': [-0.004146218083109776, -0.009777600200150782, -9.031272009885072e-05]}\nD: {'rotation matrix': [[-0.966004, -0.071389, 0.248475], [-0.25638, 0.388148, -0.885218], [-0.03325, -0.918828, -0.393256]], 'translation vector': [2.766369, 4.610029, 1.423364]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_157_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_157_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_157_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_157_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": 
"SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.819175, -0.184232, 0.543149], [-0.572951, -0.305906, 0.760361], [0.02607, -0.934066, -0.356147]], 'translation vector': [4.417876, 1.783865, 1.2754]}\nB: {'rotation matrix': [[0.815086, -0.186535, 0.548488], [-0.578729, -0.305625, 0.756086], [0.026595, -0.933701, -0.357064]], 'translation vector': [4.412413, 1.788676, 1.273151]}\nC: {'rotation matrix': [[0.9999460053456457, -0.005013531268923329, 0.00905468563745482], [0.0050087072013681256, 0.9999874429708866, 0.000635311021125049], [-0.009056848323318739, -0.0005890721337560823, 0.9999585496849335]], 'translation vector': [-0.01186208886847595, 0.011040583723211927, 0.005770402802990571]}\nD: {'rotation matrix': [[0.823616, -0.181983, 0.537158], [-0.566609, -0.305313, 0.765336], [0.024723, -0.934701, -0.354574]], 'translation vector': [4.422497, 1.777795, 1.277376]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.819175, -0.184232, 0.543149], [-0.572951, -0.305906, 0.760361], [0.02607, -0.934066, -0.356147]], 'translation vector': [4.417876, 1.783865, 1.2754]}\nB: {'rotation matrix': [[0.815086, -0.186535, 0.548488], [-0.578729, -0.305625, 0.756086], [0.026595, -0.933701, -0.357064]], 'translation vector': [4.412413, 1.788676, 1.273151]}\nC: {'rotation matrix': [[0.9999460053456457, -0.005013531268923329, 0.00905468563745482], [0.0050087072013681256, 0.9999874429708866, 0.000635311021125049], [-0.009056848323318739, -0.0005890721337560823, 0.9999585496849335]], 'translation vector': [-0.01186208886847595, 0.011040583723211927, 0.005770402802990571]}\nD: {'rotation matrix': [[0.823616, -0.181983, 0.537158], [-0.566609, -0.305313, 0.765336], [0.024723, -0.934701, -0.354574]], 'translation vector': [4.422497, 1.777795, 1.277376]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_158_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_158_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_158_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_158_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9994834257163207, -0.009826144818556364, 0.03061241068052498], [0.009193271124449974, 0.9997423555682264, 0.020759381716764332], [-0.0308087787211077, -0.020466954157546582, 0.999315737926551]], 'translation vector': [-0.012172691673165703, -0.014778681947341665, 0.009780168112213383]}\nB: {'rotation matrix': [[0.091644, -0.414787, 0.905292], [-0.995361, -0.064895, 0.071028], [0.029288, -0.907601, -0.418811]], 'translation vector': [1.315562, 0.832314, 1.493138]}\nC: {'rotation matrix': [[0.099917, -0.418908, 0.902515], [-0.994362, -0.074403, 0.07555], [0.035502, -0.904975, -0.42398]], 
'translation vector': [1.316564, 0.827844, 1.497329]}\nD: {'rotation matrix': [[0.095483, -0.419292, 0.902816], [-0.994923, -0.069178, 0.073096], [0.031806, -0.905212, -0.423769]], 'translation vector': [1.317067, 0.829593, 1.4954]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9994834257163207, -0.009826144818556364, 0.03061241068052498], [0.009193271124449974, 0.9997423555682264, 0.020759381716764332], [-0.0308087787211077, -0.020466954157546582, 0.999315737926551]], 'translation vector': [-0.012172691673165703, -0.014778681947341665, 0.009780168112213383]}\nB: {'rotation matrix': [[0.091644, -0.414787, 0.905292], [-0.995361, -0.064895, 0.071028], [0.029288, -0.907601, -0.418811]], 'translation vector': [1.315562, 0.832314, 1.493138]}\nC: {'rotation matrix': [[0.099917, -0.418908, 0.902515], [-0.994362, -0.074403, 0.07555], [0.035502, -0.904975, -0.42398]], 'translation vector': [1.316564, 0.827844, 1.497329]}\nD: {'rotation matrix': [[0.095483, -0.419292, 0.902816], [-0.994923, -0.069178, 0.073096], [0.031806, -0.905212, -0.423769]], 'translation vector': [1.317067, 0.829593, 1.4954]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_159_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_159_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_159_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_159_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': 
[[-0.529968, 0.414216, -0.739972], [0.844868, 0.182762, -0.502789], [-0.073025, -0.891641, -0.446816]], 'translation vector': [5.418643, 4.410617, 1.386009]}\nB: {'rotation matrix': [[-0.531013, 0.418177, -0.736989], [0.843374, 0.176517, -0.507507], [-0.082137, -0.89105, -0.446413]], 'translation vector': [5.416763, 4.405288, 1.382813]}\nC: {'rotation matrix': [[0.9999262982512661, -0.011126048765088865, -0.004724929479929227], [0.011121653443920318, 0.9999373200698624, -0.0008601073344245507], [0.00473500376540592, 0.0008076436796648745, 0.9999882661936769]], 'translation vector': [-0.020480989578110398, -0.004401497485728045, 0.008145336008434256]}\nD: {'rotation matrix': [[-0.533157, 0.409147, -0.740501], [0.84318, 0.185365, -0.504666], [-0.06922, -0.893442, -0.443813]], 'translation vector': [5.417671, 4.419961, 1.384383]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.529968, 0.414216, -0.739972], [0.844868, 0.182762, -0.502789], [-0.073025, -0.891641, -0.446816]], 'translation vector': [5.418643, 4.410617, 1.386009]}\nB: {'rotation matrix': [[-0.531013, 0.418177, -0.736989], [0.843374, 0.176517, -0.507507], [-0.082137, -0.89105, -0.446413]], 'translation vector': [5.416763, 4.405288, 1.382813]}\nC: {'rotation matrix': [[0.9999262982512661, -0.011126048765088865, -0.004724929479929227], [0.011121653443920318, 0.9999373200698624, -0.0008601073344245507], [0.00473500376540592, 0.0008076436796648745, 0.9999882661936769]], 'translation vector': [-0.020480989578110398, -0.004401497485728045, 0.008145336008434256]}\nD: {'rotation matrix': [[-0.533157, 0.409147, -0.740501], [0.84318, 0.185365, -0.504666], [-0.06922, -0.893442, -0.443813]], 'translation vector': [5.417671, 4.419961, 1.384383]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_160_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_160_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_160_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_160_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.848714, 0.230926, -0.475771], [0.528497, 0.337392, -0.77901], [-0.019373, -0.912601, -0.408393]], 'translation vector': [1.792868, 5.329395, 1.618046]}\nB: {'rotation matrix': [[-0.84982, 0.23191, -0.473312], [0.526645, 0.337444, -0.780241], [-0.021229, -0.912332, -0.408901]], 'translation vector': [1.792819, 5.327111, 1.618396]}\nC: {'rotation matrix': [[-0.847697, 0.231565, -0.477271], [0.530179, 0.339488, -0.776955], [-0.017888, -0.911661, -0.410554]], 'translation vector': [1.79081, 5.325803, 1.623639]}\nD: {'rotation matrix': [[0.9999931647889151, 0.002636603414451914, 
-0.0022507832348392875], [-0.002639482011584797, 0.9999963226575643, -0.0013481976836917885], [0.002247569136725343, 0.001353829445398431, 0.9999964854536705]], 'translation vector': [-0.00651758527214108, -0.0013521488456044173, 0.0015986017122768814]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.848714, 0.230926, -0.475771], [0.528497, 0.337392, -0.77901], [-0.019373, -0.912601, -0.408393]], 'translation vector': [1.792868, 5.329395, 1.618046]}\nB: {'rotation matrix': [[-0.84982, 0.23191, -0.473312], [0.526645, 0.337444, -0.780241], [-0.021229, -0.912332, -0.408901]], 'translation vector': [1.792819, 5.327111, 1.618396]}\nC: {'rotation matrix': [[-0.847697, 0.231565, -0.477271], [0.530179, 0.339488, -0.776955], [-0.017888, -0.911661, -0.410554]], 'translation vector': [1.79081, 5.325803, 1.623639]}\nD: {'rotation matrix': [[0.9999931647889151, 0.002636603414451914, -0.0022507832348392875], [-0.002639482011584797, 0.9999963226575643, -0.0013481976836917885], [0.002247569136725343, 0.001353829445398431, 0.9999964854536705]], 'translation vector': [-0.00651758527214108, -0.0013521488456044173, 0.0015986017122768814]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_161_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_161_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_161_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_161_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[0.9999401570739126, -0.0046057655246039214, 0.009916971929525087], [0.004585227566153826, 0.9999870493451988, 0.0021285393012493415], [-0.009927494824793211, -0.0020825415412263123, 0.9999492671696912]], 'translation vector': [-0.008397334377028054, -0.00983275627665292, -0.005241897912080518]}\nB: {'rotation matrix': [[0.979087, -0.093288, 0.180791], [-0.203431, -0.440264, 0.874519], [-0.001987, -0.893009, -0.450035]], 'translation vector': [1.973386, 0.601511, 1.693802]}\nC: {'rotation matrix': [[0.980067, -0.092864, 0.175631], [-0.198651, -0.445785, 0.872819], [-0.002761, -0.89031, -0.455347]], 'translation vector': [1.967233, 0.628282, 1.699375]}\nD: {'rotation matrix': [[0.978621, -0.096572, 0.18159], [-0.205599, -0.435758, 0.876267], [-0.005494, -0.894868, -0.446298]], 'translation vector': [1.993303, 0.583546, 1.694907]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999401570739126, -0.0046057655246039214, 0.009916971929525087], [0.004585227566153826, 0.9999870493451988, 0.0021285393012493415], [-0.009927494824793211, -0.0020825415412263123, 0.9999492671696912]], 'translation vector': [-0.008397334377028054, -0.00983275627665292, -0.005241897912080518]}\nB: {'rotation matrix': [[0.979087, -0.093288, 0.180791], [-0.203431, -0.440264, 0.874519], [-0.001987, -0.893009, -0.450035]], 'translation vector': [1.973386, 0.601511, 1.693802]}\nC: {'rotation matrix': [[0.980067, -0.092864, 0.175631], [-0.198651, -0.445785, 0.872819], [-0.002761, -0.89031, -0.455347]], 'translation vector': [1.967233, 0.628282, 1.699375]}\nD: {'rotation matrix': [[0.978621, -0.096572, 0.18159], [-0.205599, -0.435758, 0.876267], [-0.005494, -0.894868, -0.446298]], 'translation vector': [1.993303, 0.583546, 1.694907]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_162_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_162_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_162_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_162_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.170548, -0.038579, 0.984594], [-0.984686, -0.02999, -0.171739], [0.036154, -0.998805, -0.032873]], 'translation vector': [3.062037, 2.44503, 1.503093]}\nB: {'rotation matrix': [[-0.179914, -0.049674, 0.982427], [-0.983099, -0.025325, -0.181318], [0.033887, -0.998444, -0.044279]], 'translation vector': [3.062168, 2.448737, 1.498158]}\nC: {'rotation matrix': [[-0.175595, -0.045495, 0.983411], [-0.983814, -0.028148, -0.176969], [0.035732, -0.998568, -0.039815]], 'translation vector': [3.062089, 2.446902, 1.500845]}\nD: {'rotation matrix': [[0.9999851894661843, 
0.0005818244206442844, -0.005337253553840107], [-0.0006352685106689393, 0.9999500541572905, -0.009997050540678364], [0.005330302626435351, 0.010000486202961777, 0.9999353803458123]], 'translation vector': [0.013561096331675682, -0.004517035589624685, -0.0041143791607543]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.170548, -0.038579, 0.984594], [-0.984686, -0.02999, -0.171739], [0.036154, -0.998805, -0.032873]], 'translation vector': [3.062037, 2.44503, 1.503093]}\nB: {'rotation matrix': [[-0.179914, -0.049674, 0.982427], [-0.983099, -0.025325, -0.181318], [0.033887, -0.998444, -0.044279]], 'translation vector': [3.062168, 2.448737, 1.498158]}\nC: {'rotation matrix': [[-0.175595, -0.045495, 0.983411], [-0.983814, -0.028148, -0.176969], [0.035732, -0.998568, -0.039815]], 'translation vector': [3.062089, 2.446902, 1.500845]}\nD: {'rotation matrix': [[0.9999851894661843, 0.0005818244206442844, -0.005337253553840107], [-0.0006352685106689393, 0.9999500541572905, -0.009997050540678364], [0.005330302626435351, 0.010000486202961777, 0.9999353803458123]], 'translation vector': [0.013561096331675682, -0.004517035589624685, -0.0041143791607543]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_163_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_163_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_163_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_163_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": 
"SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.879731, -0.18442, 0.43825], [-0.471339, -0.216982, 0.854844], [-0.062558, -0.958597, -0.27781]], 'translation vector': [1.012821, 1.289106, 1.470609]}\nB: {'rotation matrix': [[0.877414, -0.186393, 0.442044], [-0.476234, -0.227312, 0.84943], [-0.057846, -0.955818, -0.288213]], 'translation vector': [1.033287, 1.302455, 1.466311]}\nC: {'rotation matrix': [[0.999991817168325, 0.002932358485082305, -0.002640226065149888], [-0.0029080462382595554, 0.9999549882789169, 0.009034252562416854], [0.0026665546966547606, -0.009026772459937056, 0.9999556430330881]], 'translation vector': [0.005120345387922942, -0.0015772155749180783, 0.009569803755594242]}\nD: {'rotation matrix': [[0.878466, -0.185862, 0.440175], [-0.473882, -0.221089, 0.852382], [-0.061107, -0.957379, -0.282296]], 'translation vector': [1.023458, 1.295782, 1.469602]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.879731, -0.18442, 0.43825], [-0.471339, -0.216982, 0.854844], [-0.062558, -0.958597, -0.27781]], 'translation vector': [1.012821, 1.289106, 1.470609]}\nB: {'rotation matrix': [[0.877414, -0.186393, 0.442044], [-0.476234, -0.227312, 0.84943], [-0.057846, -0.955818, -0.288213]], 'translation vector': [1.033287, 1.302455, 1.466311]}\nC: {'rotation matrix': [[0.999991817168325, 0.002932358485082305, -0.002640226065149888], [-0.0029080462382595554, 0.9999549882789169, 0.009034252562416854], [0.0026665546966547606, -0.009026772459937056, 0.9999556430330881]], 'translation vector': [0.005120345387922942, -0.0015772155749180783, 0.009569803755594242]}\nD: {'rotation matrix': [[0.878466, -0.185862, 0.440175], [-0.473882, -0.221089, 0.852382], [-0.061107, -0.957379, -0.282296]], 'translation vector': [1.023458, 1.295782, 1.469602]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_164_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_164_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_164_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_164_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.306256, 0.214312, -0.927512], [0.951867, -0.081783, 0.295401], [-0.012547, -0.973336, -0.229043]], 'translation vector': [3.740017, 1.664374, 1.453227]}\nB: {'rotation matrix': [[0.304194, 0.215718, -0.927864], [0.952563, -0.078625, 0.294012], [-0.009529, -0.973285, -0.229402]], 'translation vector': [3.747529, 1.6658, 1.453625]}\nC: {'rotation matrix': [[0.9998537516460376, 0.013008935130727867, -0.011104836010819949], [-0.013009085999586367, 0.9999155647701683, 0.00015274967336381323], [0.011105421497037882, -8.408899546142835e-06, 0.9999381123179228]], 'translation vector': 
[0.00936290533716333, 0.003069967953460928, -0.010760020039624951]}\nD: {'rotation matrix': [[0.309341, 0.212762, -0.926844], [0.950827, -0.084926, 0.297851], [-0.015342, -0.973406, -0.228571]], 'translation vector': [3.731516, 1.660707, 1.454311]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.306256, 0.214312, -0.927512], [0.951867, -0.081783, 0.295401], [-0.012547, -0.973336, -0.229043]], 'translation vector': [3.740017, 1.664374, 1.453227]}\nB: {'rotation matrix': [[0.304194, 0.215718, -0.927864], [0.952563, -0.078625, 0.294012], [-0.009529, -0.973285, -0.229402]], 'translation vector': [3.747529, 1.6658, 1.453625]}\nC: {'rotation matrix': [[0.9998537516460376, 0.013008935130727867, -0.011104836010819949], [-0.013009085999586367, 0.9999155647701683, 0.00015274967336381323], [0.011105421497037882, -8.408899546142835e-06, 0.9999381123179228]], 'translation vector': [0.00936290533716333, 0.003069967953460928, -0.010760020039624951]}\nD: {'rotation matrix': [[0.309341, 0.212762, -0.926844], [0.950827, -0.084926, 0.297851], [-0.015342, -0.973406, -0.228571]], 'translation vector': [3.731516, 1.660707, 1.454311]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_165_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_165_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_165_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_165_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.226506, -0.721874, 0.653906], [-0.969583, -0.103174, 0.221955], [-0.092757, -0.68429, -0.723287]], 'translation vector': [2.104302, 2.429349, 1.38499]}\nB: {'rotation matrix': [[0.223513, -0.721637, 0.655197], [-0.970242, -0.100499, 0.220298], [-0.093129, -0.684938, -0.722625]], 'translation vector': [2.105446, 2.427759, 1.384995]}\nC: {'rotation matrix': [[0.22885, -0.719341, 0.655878], [-0.96905, -0.104271, 0.223761], [-0.092571, -0.686787, -0.72094]], 'translation vector': [2.102429, 2.429695, 1.385047]}\nD: {'rotation matrix': [[0.9999909282503874, 0.0026675007233110163, 0.003038726855301391], [-0.0026681297586413737, 0.999996473984971, 2.7861207662840692e-05], [-0.0030386998838646344, -3.5943923369286046e-05, 0.9999953589686927]], 'translation vector': [0.002364456746013932, -0.0010737235666011813, -0.00037719790048196256]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.226506, -0.721874, 0.653906], [-0.969583, -0.103174, 0.221955], [-0.092757, -0.68429, -0.723287]], 'translation vector': [2.104302, 2.429349, 1.38499]}\nB: {'rotation matrix': [[0.223513, -0.721637, 0.655197], [-0.970242, -0.100499, 0.220298], [-0.093129, -0.684938, -0.722625]], 'translation vector': [2.105446, 2.427759, 1.384995]}\nC: {'rotation matrix': [[0.22885, -0.719341, 0.655878], [-0.96905, -0.104271, 0.223761], [-0.092571, -0.686787, -0.72094]], 'translation vector': [2.102429, 2.429695, 1.385047]}\nD: {'rotation matrix': [[0.9999909282503874, 0.0026675007233110163, 0.003038726855301391], [-0.0026681297586413737, 0.999996473984971, 2.7861207662840692e-05], [-0.0030386998838646344, -3.5943923369286046e-05, 0.9999953589686927]], 'translation vector': [0.002364456746013932, -0.0010737235666011813, -0.00037719790048196256]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_166_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_166_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_166_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_166_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.711418, -0.467017, 0.525147], [-0.700604, 0.529926, -0.477841], [-0.055129, -0.707865, -0.704193]], 'translation vector': [2.529564, 4.393072, 1.526695]}\nB: {'rotation matrix': [[0.9999966648975326, -0.0024113488419032422, -0.0008172418163523504], [0.0024114327512952905, 0.9999969977952643, 0.0006432367723817748], [0.0008165042296219522, -0.0006466099152102373, 0.9999990767691678]], 'translation vector': [-0.006437670952323948, -0.005367763877482813, 0.00013241538331776326]}\nC: {'rotation matrix': [[-0.711906, -0.467075, 0.524433], [-0.700166, 0.529878, -0.478536], 
[-0.054374, -0.707863, -0.704254]], 'translation vector': [2.530244, 4.39346, 1.526741]}\nD: {'rotation matrix': [[-0.711605, -0.467107, 0.524814], [-0.700478, 0.529425, -0.47858], [-0.054301, -0.708181, -0.70394]], 'translation vector': [2.529967, 4.393585, 1.525543]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.711418, -0.467017, 0.525147], [-0.700604, 0.529926, -0.477841], [-0.055129, -0.707865, -0.704193]], 'translation vector': [2.529564, 4.393072, 1.526695]}\nB: {'rotation matrix': [[0.9999966648975326, -0.0024113488419032422, -0.0008172418163523504], [0.0024114327512952905, 0.9999969977952643, 0.0006432367723817748], [0.0008165042296219522, -0.0006466099152102373, 0.9999990767691678]], 'translation vector': [-0.006437670952323948, -0.005367763877482813, 0.00013241538331776326]}\nC: {'rotation matrix': [[-0.711906, -0.467075, 0.524433], [-0.700166, 0.529878, -0.478536], [-0.054374, -0.707863, -0.704254]], 'translation vector': [2.530244, 4.39346, 1.526741]}\nD: {'rotation matrix': [[-0.711605, -0.467107, 0.524814], [-0.700478, 0.529425, -0.47858], [-0.054301, -0.708181, -0.70394]], 'translation vector': [2.529967, 4.393585, 1.525543]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_167_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_167_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_167_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_167_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": 
"SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.61412, -0.406634, 0.676392], [-0.788751, 0.286898, -0.543656], [0.027014, -0.867374, -0.496922]], 'translation vector': [1.884445, 2.364432, 1.389567]}\nB: {'rotation matrix': [[0.9999800934681147, 0.006249341845585854, -0.0009789493583592457], [-0.006247228158252656, 0.9999797870897016, 0.0016201321535261565], [0.0009895210241494333, -0.0016143397944822075, 0.9999982198013821]], 'translation vector': [-0.0069672096971418185, 0.0007707556249330061, 0.0022308491982188094]}\nC: {'rotation matrix': [[-0.614991, -0.407345, 0.675171], [-0.788025, 0.286731, -0.544795], [0.028327, -0.867096, -0.497335]], 'translation vector': [1.885989, 2.365962, 1.389016]}\nD: {'rotation matrix': [[-0.615135, -0.40626, 0.675693], [-0.787872, 0.284761, -0.546048], [0.029426, -0.868253, -0.495248]], 'translation vector': [1.88807, 2.366622, 1.388041]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.61412, -0.406634, 0.676392], [-0.788751, 0.286898, -0.543656], [0.027014, -0.867374, -0.496922]], 'translation vector': [1.884445, 2.364432, 1.389567]}\nB: {'rotation matrix': [[0.9999800934681147, 0.006249341845585854, -0.0009789493583592457], [-0.006247228158252656, 0.9999797870897016, 0.0016201321535261565], [0.0009895210241494333, -0.0016143397944822075, 0.9999982198013821]], 'translation vector': [-0.0069672096971418185, 0.0007707556249330061, 0.0022308491982188094]}\nC: {'rotation matrix': [[-0.614991, -0.407345, 0.675171], [-0.788025, 0.286731, -0.544795], [0.028327, -0.867096, -0.497335]], 'translation vector': [1.885989, 2.365962, 1.389016]}\nD: {'rotation matrix': [[-0.615135, -0.40626, 0.675693], [-0.787872, 0.284761, -0.546048], [0.029426, -0.868253, -0.495248]], 'translation vector': [1.88807, 2.366622, 1.388041]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_168_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_168_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_168_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_168_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999912398243648, -0.003460941321399837, -0.0022122658857095193], [0.003455741372162686, 0.9999910905464573, -0.0022383864983849893], [0.0022193515549835062, 0.0022307044509074837, 0.999995073124285]], 'translation vector': [-0.001957679288641462, -0.0024920290268601875, -0.000907578453192226]}\nB: {'rotation matrix': [[0.67484, -0.325973, 0.662067], [-0.73754, -0.328352, 0.590102], [0.025034, -0.886525, -0.462003]], 'translation vector': [2.869569, 2.417867, 1.545271]}\nC: {'rotation matrix': [[0.67798, -0.325694, 0.658989], [-0.734824, -0.323965, 0.595886], [0.019413, 
-0.88824, -0.45897]], 'translation vector': [2.868894, 2.415756, 1.54509]}\nD: {'rotation matrix': [[0.682626, -0.324357, 0.654839], [-0.73069, -0.316101, 0.605122], [0.010719, -0.891556, -0.452783]], 'translation vector': [2.86653, 2.411599, 1.544608]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999912398243648, -0.003460941321399837, -0.0022122658857095193], [0.003455741372162686, 0.9999910905464573, -0.0022383864983849893], [0.0022193515549835062, 0.0022307044509074837, 0.999995073124285]], 'translation vector': [-0.001957679288641462, -0.0024920290268601875, -0.000907578453192226]}\nB: {'rotation matrix': [[0.67484, -0.325973, 0.662067], [-0.73754, -0.328352, 0.590102], [0.025034, -0.886525, -0.462003]], 'translation vector': [2.869569, 2.417867, 1.545271]}\nC: {'rotation matrix': [[0.67798, -0.325694, 0.658989], [-0.734824, -0.323965, 0.595886], [0.019413, -0.88824, -0.45897]], 'translation vector': [2.868894, 2.415756, 1.54509]}\nD: {'rotation matrix': [[0.682626, -0.324357, 0.654839], [-0.73069, -0.316101, 0.605122], [0.010719, -0.891556, -0.452783]], 'translation vector': [2.86653, 2.411599, 1.544608]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_169_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_169_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_169_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_169_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.999502, 0.00746, 0.030678], [-0.028996, 0.167572, -0.985433], [-0.012492, -0.985832, -0.167272]], 'translation vector': [6.682728, 5.426456, 1.759702]}\nB: {'rotation matrix': [[-0.999516, 0.005588, 0.030613], [-0.029207, 0.171126, -0.984816], [-0.010741, -0.985233, -0.17088]], 'translation vector': [6.687027, 5.423337, 1.762554]}\nC: {'rotation matrix': [[-0.999427, 0.005452, 0.0334], [-0.031967, 0.171859, -0.984603], [-0.011109, -0.985106, -0.171586]], 'translation vector': [6.682628, 5.424977, 1.756356]}\nD: {'rotation matrix': [[0.9999959031714618, 0.0006283915085459037, 0.0029514431388234703], [-0.0006213832896176772, 0.9999971711296258, -0.0021323447476004893], [-0.0029538525457832297, 0.002130012321803009, 0.9999931612377413]], 'translation vector': [-0.006207505850969852, 0.015496225089857818, -0.006213786764486251]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.999502, 0.00746, 0.030678], [-0.028996, 0.167572, -0.985433], [-0.012492, -0.985832, -0.167272]], 'translation vector': [6.682728, 5.426456, 1.759702]}\nB: {'rotation matrix': [[-0.999516, 0.005588, 0.030613], [-0.029207, 0.171126, -0.984816], [-0.010741, -0.985233, -0.17088]], 'translation vector': [6.687027, 5.423337, 1.762554]}\nC: {'rotation matrix': [[-0.999427, 0.005452, 0.0334], [-0.031967, 0.171859, -0.984603], [-0.011109, -0.985106, -0.171586]], 'translation vector': [6.682628, 5.424977, 1.756356]}\nD: {'rotation matrix': [[0.9999959031714618, 0.0006283915085459037, 0.0029514431388234703], [-0.0006213832896176772, 0.9999971711296258, -0.0021323447476004893], [-0.0029538525457832297, 0.002130012321803009, 0.9999931612377413]], 'translation vector': [-0.006207505850969852, 0.015496225089857818, -0.006213786764486251]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_170_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_170_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_170_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_170_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.106477, -0.473799, 0.874173], [-0.992913, 0.097339, -0.068183], [-0.052786, -0.875237, -0.480805]], 'translation vector': [4.553204, 3.149855, 1.246823]}\nB: {'rotation matrix': [[-0.115243, -0.46998, 0.875122], [-0.991961, 0.100818, -0.076485], [-0.052282, -0.876901, -0.47782]], 'translation vector': [4.553743, 3.152171, 1.246409]}\nC: {'rotation matrix': [[0.999828950585133, 0.0011190525701963586, -0.018454829607259946], [-0.001416468295998283, 0.9998678811169985, -0.016145155593255522], [0.018434708990961175, 0.016168544891222898, 0.9996994385524549]], 'translation vector': 
[0.002077144261483088, 0.01271199054625649, -0.0002353155159546816]}\nD: {'rotation matrix': [[-0.120252, -0.468652, 0.87516], [-0.991418, 0.102239, -0.081478], [-0.051291, -0.877447, -0.476924]], 'translation vector': [4.555783, 3.154248, 1.246329]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.106477, -0.473799, 0.874173], [-0.992913, 0.097339, -0.068183], [-0.052786, -0.875237, -0.480805]], 'translation vector': [4.553204, 3.149855, 1.246823]}\nB: {'rotation matrix': [[-0.115243, -0.46998, 0.875122], [-0.991961, 0.100818, -0.076485], [-0.052282, -0.876901, -0.47782]], 'translation vector': [4.553743, 3.152171, 1.246409]}\nC: {'rotation matrix': [[0.999828950585133, 0.0011190525701963586, -0.018454829607259946], [-0.001416468295998283, 0.9998678811169985, -0.016145155593255522], [0.018434708990961175, 0.016168544891222898, 0.9996994385524549]], 'translation vector': [0.002077144261483088, 0.01271199054625649, -0.0002353155159546816]}\nD: {'rotation matrix': [[-0.120252, -0.468652, 0.87516], [-0.991418, 0.102239, -0.081478], [-0.051291, -0.877447, -0.476924]], 'translation vector': [4.555783, 3.154248, 1.246329]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_171_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_171_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_171_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_171_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": 
"A: {'rotation matrix': [[-0.927045, 0.223636, -0.300956], [0.364198, 0.346229, -0.864573], [-0.08915, -0.911105, -0.402418]], 'translation vector': [7.648557, 2.747808, 1.440051]}\nB: {'rotation matrix': [[0.9999939386175598, 0.0009488285827030999, 0.0032872470438982896], [-0.0009337892397183689, 0.9999879134504162, -0.004824212466184848], [-0.0032917475553783573, 0.004820430801235839, 0.9999833362205853]], 'translation vector': [-0.000493455857793812, -0.002698981007177137, 0.0012657934234763246]}\nC: {'rotation matrix': [[-0.9261, 0.223085, -0.304257], [0.366658, 0.34218, -0.865144], [-0.08889, -0.912768, -0.398689]], 'translation vector': [7.650569, 2.747621, 1.441708]}\nD: {'rotation matrix': [[-0.927432, 0.22366, -0.299744], [0.363522, 0.350792, -0.863016], [-0.087874, -0.909352, -0.406641]], 'translation vector': [7.650677, 2.747929, 1.439487]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.927045, 0.223636, -0.300956], [0.364198, 0.346229, -0.864573], [-0.08915, -0.911105, -0.402418]], 'translation vector': [7.648557, 2.747808, 1.440051]}\nB: {'rotation matrix': [[0.9999939386175598, 0.0009488285827030999, 0.0032872470438982896], [-0.0009337892397183689, 0.9999879134504162, -0.004824212466184848], [-0.0032917475553783573, 0.004820430801235839, 0.9999833362205853]], 'translation vector': [-0.000493455857793812, -0.002698981007177137, 0.0012657934234763246]}\nC: {'rotation matrix': [[-0.9261, 0.223085, -0.304257], [0.366658, 0.34218, -0.865144], [-0.08889, -0.912768, -0.398689]], 'translation vector': [7.650569, 2.747621, 1.441708]}\nD: {'rotation matrix': [[-0.927432, 0.22366, -0.299744], [0.363522, 0.350792, -0.863016], [-0.087874, -0.909352, -0.406641]], 'translation vector': [7.650677, 2.747929, 1.439487]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_172_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_172_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_172_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_172_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.598936, 0.355502, -0.717562], [0.800003, -0.305531, 0.516379], [-0.035664, -0.883329, -0.467396]], 'translation vector': [5.964795, 1.444893, 1.32602]}\nB: {'rotation matrix': [[0.600188, 0.357296, -0.715622], [0.799089, -0.307102, 0.516861], [-0.035096, -0.882059, -0.46983]], 'translation vector': [5.950611, 1.450679, 1.325211]}\nC: {'rotation matrix': [[0.9999898156973296, -0.003952939496578174, 0.0024431521094752146], [0.003942856258073881, 0.9999843026055344, 0.004096575088743528], [-0.0024598279169757275, -0.004085831486072852, 0.9999883270401599]], 'translation vector': 
[-0.0051631563465806, -0.010227260523923531, 0.01366119668418353]}\nD: {'rotation matrix': [[0.593595, 0.358896, -0.720305], [0.803944, -0.304849, 0.510628], [-0.036322, -0.882191, -0.469489]], 'translation vector': [5.978991, 1.441471, 1.326102]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.598936, 0.355502, -0.717562], [0.800003, -0.305531, 0.516379], [-0.035664, -0.883329, -0.467396]], 'translation vector': [5.964795, 1.444893, 1.32602]}\nB: {'rotation matrix': [[0.600188, 0.357296, -0.715622], [0.799089, -0.307102, 0.516861], [-0.035096, -0.882059, -0.46983]], 'translation vector': [5.950611, 1.450679, 1.325211]}\nC: {'rotation matrix': [[0.9999898156973296, -0.003952939496578174, 0.0024431521094752146], [0.003942856258073881, 0.9999843026055344, 0.004096575088743528], [-0.0024598279169757275, -0.004085831486072852, 0.9999883270401599]], 'translation vector': [-0.0051631563465806, -0.010227260523923531, 0.01366119668418353]}\nD: {'rotation matrix': [[0.593595, 0.358896, -0.720305], [0.803944, -0.304849, 0.510628], [-0.036322, -0.882191, -0.469489]], 'translation vector': [5.978991, 1.441471, 1.326102]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_173_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_173_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_173_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_173_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.983068, 0.048576, -0.176687], [0.181735, -0.381917, 0.906152], [-0.023462, -0.922919, -0.384278]], 'translation vector': [2.212073, 3.484547, 1.465708]}\nB: {'rotation matrix': [[0.982686, 0.047982, -0.178958], [0.183606, -0.38172, 0.905858], [-0.024847, -0.923032, -0.38392]], 'translation vector': [2.212621, 3.48432, 1.466163]}\nC: {'rotation matrix': [[0.983078, 0.050132, -0.176192], [0.181887, -0.381466, 0.906312], [-0.021776, -0.923023, -0.384129]], 'translation vector': [2.213536, 3.486831, 1.465259]}\nD: {'rotation matrix': [[0.9999889301916728, 0.0015926412893515843, 0.004480728220099604], [-0.001621568987345314, 0.9999775972099906, 0.006504872079739442], [-0.004470952780905997, -0.0065123882198827605, 0.9999681868018854]], 'translation vector': [-0.002104900488902217, -0.0034679151915213424, 0.0012659901948626207]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.983068, 0.048576, -0.176687], [0.181735, -0.381917, 0.906152], [-0.023462, -0.922919, -0.384278]], 'translation vector': [2.212073, 3.484547, 1.465708]}\nB: {'rotation matrix': [[0.982686, 0.047982, -0.178958], [0.183606, -0.38172, 0.905858], [-0.024847, -0.923032, -0.38392]], 'translation vector': [2.212621, 3.48432, 1.466163]}\nC: {'rotation matrix': [[0.983078, 0.050132, -0.176192], [0.181887, -0.381466, 0.906312], [-0.021776, -0.923023, -0.384129]], 'translation vector': [2.213536, 3.486831, 1.465259]}\nD: {'rotation matrix': [[0.9999889301916728, 0.0015926412893515843, 0.004480728220099604], [-0.001621568987345314, 0.9999775972099906, 0.006504872079739442], [-0.004470952780905997, -0.0065123882198827605, 0.9999681868018854]], 'translation vector': [-0.002104900488902217, -0.0034679151915213424, 0.0012659901948626207]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_174_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_174_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_174_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_174_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.9999643490106183, 0.005007713450141965, -0.006820932250517381], [-0.004994218544548531, 0.9999851347119364, 0.0020857596614642536], [0.0068306095153302035, -0.0020519868188517945, 0.9999742349784666]], 'translation vector': [-0.002699516524657053, 0.0005464771955957654, -0.0009563459127281959]}\nB: {'rotation matrix': [[0.549558, 0.430394, -0.716064], [0.833614, -0.2256, 0.504176], [0.05545, -0.873994, -0.482762]], 'translation vector': [3.109701, 1.26111, 1.347453]}\nC: {'rotation matrix': [[0.545357, 0.429527, -0.719787], [0.836166, -0.218941, 0.502882], [0.05841, -0.876111, 
-0.478557]], 'translation vector': [3.10956, 1.258833, 1.347276]}\nD: {'rotation matrix': [[0.548441, 0.430496, -0.716859], [0.834301, -0.22414, 0.503689], [0.056159, -0.874319, -0.482091]], 'translation vector': [3.109132, 1.259955, 1.347698]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.9999643490106183, 0.005007713450141965, -0.006820932250517381], [-0.004994218544548531, 0.9999851347119364, 0.0020857596614642536], [0.0068306095153302035, -0.0020519868188517945, 0.9999742349784666]], 'translation vector': [-0.002699516524657053, 0.0005464771955957654, -0.0009563459127281959]}\nB: {'rotation matrix': [[0.549558, 0.430394, -0.716064], [0.833614, -0.2256, 0.504176], [0.05545, -0.873994, -0.482762]], 'translation vector': [3.109701, 1.26111, 1.347453]}\nC: {'rotation matrix': [[0.545357, 0.429527, -0.719787], [0.836166, -0.218941, 0.502882], [0.05841, -0.876111, -0.478557]], 'translation vector': [3.10956, 1.258833, 1.347276]}\nD: {'rotation matrix': [[0.548441, 0.430496, -0.716859], [0.834301, -0.22414, 0.503689], [0.056159, -0.874319, -0.482091]], 'translation vector': [3.109132, 1.259955, 1.347698]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_175_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_175_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_175_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_175_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation 
matrix': [[0.9999664473886144, -4.097872539031611e-05, -0.00815329503789991], [4.450303663576986e-05, 1.0000000556925084, 0.0005279321342564721], [0.008153564277134814, -0.0005283064017830043, 0.99996718864845]], 'translation vector': [0.0006960777394775519, 0.0030013724924222718, -0.001027289568414247]}\nB: {'rotation matrix': [[-0.825443, 0.242757, -0.509621], [0.56437, 0.373207, -0.736345], [0.011442, -0.895425, -0.445066]], 'translation vector': [4.848658, 2.610627, 1.449985]}\nC: {'rotation matrix': [[-0.825701, 0.242217, -0.50946], [0.563992, 0.37286, -0.73681], [0.01149, -0.895716, -0.444479]], 'translation vector': [4.848603, 2.611202, 1.449781]}\nD: {'rotation matrix': [[-0.825281, 0.242861, -0.509834], [0.5646, 0.373686, -0.735925], [0.011791, -0.895197, -0.445515]], 'translation vector': [4.848519, 2.6109, 1.44995]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.9999664473886144, -4.097872539031611e-05, -0.00815329503789991], [4.450303663576986e-05, 1.0000000556925084, 0.0005279321342564721], [0.008153564277134814, -0.0005283064017830043, 0.99996718864845]], 'translation vector': [0.0006960777394775519, 0.0030013724924222718, -0.001027289568414247]}\nB: {'rotation matrix': [[-0.825443, 0.242757, -0.509621], [0.56437, 0.373207, -0.736345], [0.011442, -0.895425, -0.445066]], 'translation vector': [4.848658, 2.610627, 1.449985]}\nC: {'rotation matrix': [[-0.825701, 0.242217, -0.50946], [0.563992, 0.37286, -0.73681], [0.01149, -0.895716, -0.444479]], 'translation vector': [4.848603, 2.611202, 1.449781]}\nD: {'rotation matrix': [[-0.825281, 0.242861, -0.509834], [0.5646, 0.373686, -0.735925], [0.011791, -0.895197, -0.445515]], 'translation vector': [4.848519, 2.6109, 1.44995]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_176_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_176_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_176_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_176_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.820406, -0.123018, 0.558391], [-0.564425, -0.330389, 0.756484], [0.091425, -0.935794, -0.340487]], 'translation vector': [1.795617, 2.461673, 1.379824]}\nB: {'rotation matrix': [[0.820181, -0.122779, 0.558774], [-0.564668, -0.330702, 0.756166], [0.091946, -0.935714, -0.340565]], 'translation vector': [1.795446, 2.463577, 1.379349]}\nC: {'rotation matrix': [[0.9999965570678159, 0.002414373935934805, -0.0013910970690502594], [-0.0024092149366839086, 0.9999893972371463, 0.004094256181755705], [0.0014001974871553952, -0.0040912377586492955, 0.9999902946910975]], 'translation vector': 
[0.0015616232476808878, 0.0015124074702654866, -0.0024993421767338653]}\nD: {'rotation matrix': [[0.81953, -0.122236, 0.559847], [-0.565374, -0.331709, 0.755196], [0.093395, -0.935429, -0.340955]], 'translation vector': [1.794011, 2.466618, 1.378419]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.820406, -0.123018, 0.558391], [-0.564425, -0.330389, 0.756484], [0.091425, -0.935794, -0.340487]], 'translation vector': [1.795617, 2.461673, 1.379824]}\nB: {'rotation matrix': [[0.820181, -0.122779, 0.558774], [-0.564668, -0.330702, 0.756166], [0.091946, -0.935714, -0.340565]], 'translation vector': [1.795446, 2.463577, 1.379349]}\nC: {'rotation matrix': [[0.9999965570678159, 0.002414373935934805, -0.0013910970690502594], [-0.0024092149366839086, 0.9999893972371463, 0.004094256181755705], [0.0014001974871553952, -0.0040912377586492955, 0.9999902946910975]], 'translation vector': [0.0015616232476808878, 0.0015124074702654866, -0.0024993421767338653]}\nD: {'rotation matrix': [[0.81953, -0.122236, 0.559847], [-0.565374, -0.331709, 0.755196], [0.093395, -0.935429, -0.340955]], 'translation vector': [1.794011, 2.466618, 1.378419]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_177_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_177_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_177_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_177_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[-0.802386, 0.058378, -0.593943], [0.596799, 0.07378, -0.798992], [-0.002822, -0.995564, -0.09404]], 'translation vector': [2.583445, 4.00863, 1.432702]}\nB: {'rotation matrix': [[-0.80243, 0.05764, -0.593956], [0.596739, 0.072442, -0.799159], [-0.003036, -0.995706, -0.092526]], 'translation vector': [2.583423, 4.00901, 1.432499]}\nC: {'rotation matrix': [[-0.802147, 0.057229, -0.594377], [0.597116, 0.07095, -0.799012], [-0.003555, -0.995837, -0.091085]], 'translation vector': [2.584156, 4.008181, 1.433043]}\nD: {'rotation matrix': [[0.9999994783035012, 0.00041051927851740725, -0.0005862593604151125], [-0.00040931866521127987, 0.9999994989245719, 0.001478878293238192], [0.0005866244600769685, -0.00147872503352613, 0.9999988017009569]], 'translation vector': [0.00041364848275698973, -0.004321265289284559, -0.00018345117394513721]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.802386, 0.058378, -0.593943], [0.596799, 0.07378, -0.798992], [-0.002822, -0.995564, -0.09404]], 'translation vector': [2.583445, 4.00863, 1.432702]}\nB: {'rotation matrix': [[-0.80243, 0.05764, -0.593956], [0.596739, 0.072442, -0.799159], [-0.003036, -0.995706, -0.092526]], 'translation vector': [2.583423, 4.00901, 1.432499]}\nC: {'rotation matrix': [[-0.802147, 0.057229, -0.594377], [0.597116, 0.07095, -0.799012], [-0.003555, -0.995837, -0.091085]], 'translation vector': [2.584156, 4.008181, 1.433043]}\nD: {'rotation matrix': [[0.9999994783035012, 0.00041051927851740725, -0.0005862593604151125], [-0.00040931866521127987, 0.9999994989245719, 0.001478878293238192], [0.0005866244600769685, -0.00147872503352613, 0.9999988017009569]], 'translation vector': [0.00041364848275698973, -0.004321265289284559, -0.00018345117394513721]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_178_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_178_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_178_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_178_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.992585, -0.064418, 0.103079], [-0.120896, -0.435169, 0.892195], [-0.012617, -0.898041, -0.43973]], 'translation vector': [3.286474, 2.568909, 1.509796]}\nB: {'rotation matrix': [[0.992385, -0.067834, 0.102811], [-0.122233, -0.439432, 0.889921], [-0.015188, -0.895711, -0.444377]], 'translation vector': [3.289696, 2.56831, 1.509591]}\nC: {'rotation matrix': [[0.9999807964258803, 0.0038860910275422844, -0.004806691946462499], [-0.003909951367268014, 0.9999796651555257, -0.005033098084012006], [0.004787399082777756, 0.005051536745205018, 0.999976147031262]], 'translation vector': 
[-0.001554233624264434, -0.002644430194159053, -0.0006288644424583545]}\nD: {'rotation matrix': [[0.992758, -0.062357, 0.102681], [-0.119576, -0.430727, 0.894525], [-0.011552, -0.900325, -0.435064]], 'translation vector': [3.283188, 2.568117, 1.510042]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.992585, -0.064418, 0.103079], [-0.120896, -0.435169, 0.892195], [-0.012617, -0.898041, -0.43973]], 'translation vector': [3.286474, 2.568909, 1.509796]}\nB: {'rotation matrix': [[0.992385, -0.067834, 0.102811], [-0.122233, -0.439432, 0.889921], [-0.015188, -0.895711, -0.444377]], 'translation vector': [3.289696, 2.56831, 1.509591]}\nC: {'rotation matrix': [[0.9999807964258803, 0.0038860910275422844, -0.004806691946462499], [-0.003909951367268014, 0.9999796651555257, -0.005033098084012006], [0.004787399082777756, 0.005051536745205018, 0.999976147031262]], 'translation vector': [-0.001554233624264434, -0.002644430194159053, -0.0006288644424583545]}\nD: {'rotation matrix': [[0.992758, -0.062357, 0.102681], [-0.119576, -0.430727, 0.894525], [-0.011552, -0.900325, -0.435064]], 'translation vector': [3.283188, 2.568117, 1.510042]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_179_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_179_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_179_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_179_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[-0.422089, -0.487348, 0.764417], [-0.906483, 0.237506, -0.349114], [-0.011414, -0.840287, -0.542021]], 'translation vector': [1.410195, 1.210537, 1.389714]}\nB: {'rotation matrix': [[-0.429509, -0.487246, 0.760337], [-0.903019, 0.239935, -0.356353], [-0.0088, -0.839656, -0.543047]], 'translation vector': [1.408282, 1.210133, 1.390728]}\nC: {'rotation matrix': [[0.9999401042348363, 0.0004722608940826321, -0.010894896595028812], [-0.0004891614331012371, 0.9999986952391333, -0.0015551949611865112], [0.010893168247730851, 0.0015589513371194136, 0.9999394642924967]], 'translation vector': [-0.002554071705175298, -0.000426511698759402, -0.0005788025297613075]}\nD: {'rotation matrix': [[-0.426247, -0.487547, 0.761978], [-0.904552, 0.238954, -0.353109], [-0.00992, -0.839761, -0.542865]], 'translation vector': [1.409365, 1.209862, 1.390977]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.422089, -0.487348, 0.764417], [-0.906483, 0.237506, -0.349114], [-0.011414, -0.840287, -0.542021]], 'translation vector': [1.410195, 1.210537, 1.389714]}\nB: {'rotation matrix': [[-0.429509, -0.487246, 0.760337], [-0.903019, 0.239935, -0.356353], [-0.0088, -0.839656, -0.543047]], 'translation vector': [1.408282, 1.210133, 1.390728]}\nC: {'rotation matrix': [[0.9999401042348363, 0.0004722608940826321, -0.010894896595028812], [-0.0004891614331012371, 0.9999986952391333, -0.0015551949611865112], [0.010893168247730851, 0.0015589513371194136, 0.9999394642924967]], 'translation vector': [-0.002554071705175298, -0.000426511698759402, -0.0005788025297613075]}\nD: {'rotation matrix': [[-0.426247, -0.487547, 0.761978], [-0.904552, 0.238954, -0.353109], [-0.00992, -0.839761, -0.542865]], 'translation vector': [1.409365, 1.209862, 1.390977]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_180_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_180_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_180_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_180_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.471816, -0.325425, 0.819444], [-0.872235, -0.30807, 0.379868], [0.128827, -0.893975, -0.429199]], 'translation vector': [4.769558, 1.138603, 1.289356]}\nB: {'rotation matrix': [[0.9997418114978017, 0.011895326282976197, -0.019384512225947795], [-0.0117477819809795, 0.9999024052361882, 0.007665229365566708], [0.019474149960248398, -0.007435647706830853, 0.99978314162453]], 'translation vector': [0.0024302909823366026, 0.0025910713671302155, 0.004134808496160325]}\nC: {'rotation matrix': [[0.463265, -0.315518, 0.828151], [-0.87672, -0.299624, 0.376281], [0.129411, -0.900374, 
-0.415426]], 'translation vector': [4.764074, 1.139958, 1.290116]}\nD: {'rotation matrix': [[0.466198, -0.319071, 0.825139], [-0.875193, -0.302567, 0.377479], [0.129217, -0.898136, -0.420304]], 'translation vector': [4.766454, 1.138272, 1.288707]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.471816, -0.325425, 0.819444], [-0.872235, -0.30807, 0.379868], [0.128827, -0.893975, -0.429199]], 'translation vector': [4.769558, 1.138603, 1.289356]}\nB: {'rotation matrix': [[0.9997418114978017, 0.011895326282976197, -0.019384512225947795], [-0.0117477819809795, 0.9999024052361882, 0.007665229365566708], [0.019474149960248398, -0.007435647706830853, 0.99978314162453]], 'translation vector': [0.0024302909823366026, 0.0025910713671302155, 0.004134808496160325]}\nC: {'rotation matrix': [[0.463265, -0.315518, 0.828151], [-0.87672, -0.299624, 0.376281], [0.129411, -0.900374, -0.415426]], 'translation vector': [4.764074, 1.139958, 1.290116]}\nD: {'rotation matrix': [[0.466198, -0.319071, 0.825139], [-0.875193, -0.302567, 0.377479], [0.129217, -0.898136, -0.420304]], 'translation vector': [4.766454, 1.138272, 1.288707]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_181_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_181_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_181_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_181_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[-0.782202, 0.158811, -0.602444], [0.623011, 0.192986, -0.758033], [-0.004121, -0.968264, -0.249895]], 'translation vector': [5.112607, 3.166242, 1.386639]}\nB: {'rotation matrix': [[-0.778966, 0.157543, -0.606954], [0.627051, 0.189047, -0.75569], [-0.00431, -0.969248, -0.246049]], 'translation vector': [5.115294, 3.157473, 1.383296]}\nC: {'rotation matrix': [[-0.779462, 0.157557, -0.606313], [0.62644, 0.190695, -0.755783], [-0.003458, -0.968923, -0.247339]], 'translation vector': [5.116126, 3.162086, 1.384797]}\nD: {'rotation matrix': [[0.9999744617727225, -0.0001956738250342968, -0.007090775323277213], [0.00023566199396404743, 0.9999840487337286, 0.005570125228072977], [0.007090207625955035, -0.005572126334409686, 0.9999593181215198]], 'translation vector': [0.001060093909475146, -0.0011413036070948707, -0.0054511706559239315]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.782202, 0.158811, -0.602444], [0.623011, 0.192986, -0.758033], [-0.004121, -0.968264, -0.249895]], 'translation vector': [5.112607, 3.166242, 1.386639]}\nB: {'rotation matrix': [[-0.778966, 0.157543, -0.606954], [0.627051, 0.189047, -0.75569], [-0.00431, -0.969248, -0.246049]], 'translation vector': [5.115294, 3.157473, 1.383296]}\nC: {'rotation matrix': [[-0.779462, 0.157557, -0.606313], [0.62644, 0.190695, -0.755783], [-0.003458, -0.968923, -0.247339]], 'translation vector': [5.116126, 3.162086, 1.384797]}\nD: {'rotation matrix': [[0.9999744617727225, -0.0001956738250342968, -0.007090775323277213], [0.00023566199396404743, 0.9999840487337286, 0.005570125228072977], [0.007090207625955035, -0.005572126334409686, 0.9999593181215198]], 'translation vector': [0.001060093909475146, -0.0011413036070948707, -0.0054511706559239315]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_182_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_182_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_182_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_182_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.883351, 0.250777, -0.395983], [0.468408, -0.502792, 0.726495], [-0.016909, -0.827231, -0.561607]], 'translation vector': [3.460753, 1.393703, 1.261616]}\nB: {'rotation matrix': [[0.9999974556820361, 0.0020951454008892073, 0.0005388626757519509], [-0.0020947728022776887, 0.9999969776383335, -0.0008772711173625334], [-0.0005408474142489815, 0.0008761512281933679, 0.9999997549182328]], 'translation vector': [0.002023718546634523, -0.000327483901387704, 0.000534647049254211]}\nC: {'rotation matrix': [[0.882846, 0.250522, -0.397268], [0.469328, -0.502506, 0.726099], [-0.017726, 
-0.827482, -0.561212]], 'translation vector': [3.46034, 1.393395, 1.261018]}\nD: {'rotation matrix': [[0.883505, 0.250774, -0.395641], [0.468134, -0.50232, 0.726998], [-0.016426, -0.827519, -0.561198]], 'translation vector': [3.461493, 1.393772, 1.262191]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.883351, 0.250777, -0.395983], [0.468408, -0.502792, 0.726495], [-0.016909, -0.827231, -0.561607]], 'translation vector': [3.460753, 1.393703, 1.261616]}\nB: {'rotation matrix': [[0.9999974556820361, 0.0020951454008892073, 0.0005388626757519509], [-0.0020947728022776887, 0.9999969776383335, -0.0008772711173625334], [-0.0005408474142489815, 0.0008761512281933679, 0.9999997549182328]], 'translation vector': [0.002023718546634523, -0.000327483901387704, 0.000534647049254211]}\nC: {'rotation matrix': [[0.882846, 0.250522, -0.397268], [0.469328, -0.502506, 0.726099], [-0.017726, -0.827482, -0.561212]], 'translation vector': [3.46034, 1.393395, 1.261018]}\nD: {'rotation matrix': [[0.883505, 0.250774, -0.395641], [0.468134, -0.50232, 0.726998], [-0.016426, -0.827519, -0.561198]], 'translation vector': [3.461493, 1.393772, 1.262191]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_183_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_183_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_183_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_183_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[-0.082095, -0.690035, 0.719105], [-0.996618, 0.054259, -0.061711], [0.003565, -0.72174, -0.692155]], 'translation vector': [1.142854, 0.964299, 1.384999]}\nB: {'rotation matrix': [[-0.082522, -0.690473, 0.718636], [-0.99657, 0.052619, -0.063881], [0.006294, -0.721442, -0.692446]], 'translation vector': [1.142415, 0.962891, 1.383926]}\nC: {'rotation matrix': [[-0.081714, -0.689876, 0.719301], [-0.996653, 0.054911, -0.060558], [0.00228, -0.721842, -0.692054]], 'translation vector': [1.143872, 0.96595, 1.386324]}\nD: {'rotation matrix': [[0.9999981813065101, -0.0008741599533984917, -0.0018370380112936338], [0.0008726557083801512, 0.9999996139578026, -0.0006456924823724538], [0.0018388517886205992, 0.0006438367932182827, 0.999997821691748]], 'translation vector': [-0.0006029244698710912, 0.002574260386327465, 0.00012150794273751986]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.082095, -0.690035, 0.719105], [-0.996618, 0.054259, -0.061711], [0.003565, -0.72174, -0.692155]], 'translation vector': [1.142854, 0.964299, 1.384999]}\nB: {'rotation matrix': [[-0.082522, -0.690473, 0.718636], [-0.99657, 0.052619, -0.063881], [0.006294, -0.721442, -0.692446]], 'translation vector': [1.142415, 0.962891, 1.383926]}\nC: {'rotation matrix': [[-0.081714, -0.689876, 0.719301], [-0.996653, 0.054911, -0.060558], [0.00228, -0.721842, -0.692054]], 'translation vector': [1.143872, 0.96595, 1.386324]}\nD: {'rotation matrix': [[0.9999981813065101, -0.0008741599533984917, -0.0018370380112936338], [0.0008726557083801512, 0.9999996139578026, -0.0006456924823724538], [0.0018388517886205992, 0.0006438367932182827, 0.999997821691748]], 'translation vector': [-0.0006029244698710912, 0.002574260386327465, 0.00012150794273751986]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_184_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_184_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_184_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_184_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.885696, -0.241091, 0.396758], [-0.463478, 0.409419, -0.785852], [0.027022, -0.879915, -0.474362]], 'translation vector': [3.284311, 2.742399, 1.352773]}\nB: {'rotation matrix': [[-0.887326, -0.240072, 0.393723], [-0.460299, 0.409476, -0.787689], [0.027883, -0.880168, -0.473844]], 'translation vector': [3.284908, 2.737404, 1.354156]}\nC: {'rotation matrix': [[0.9999731444122048, 0.0012103447086951903, -0.007132060128435475], [-0.0011432251198222655, 0.9999548360017081, 0.009399220003853712], [0.007143404275372449, -0.009390869439370925, 0.9999297253861809]], 'translation 
vector': [-0.004051670502601468, 0.003985677205533222, -0.007748124103307941]}\nD: {'rotation matrix': [[-0.88833, -0.238233, 0.392575], [-0.45841, 0.409739, -0.788653], [0.02703, -0.880545, -0.473192]], 'translation vector': [3.286612, 2.733624, 1.354709]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.885696, -0.241091, 0.396758], [-0.463478, 0.409419, -0.785852], [0.027022, -0.879915, -0.474362]], 'translation vector': [3.284311, 2.742399, 1.352773]}\nB: {'rotation matrix': [[-0.887326, -0.240072, 0.393723], [-0.460299, 0.409476, -0.787689], [0.027883, -0.880168, -0.473844]], 'translation vector': [3.284908, 2.737404, 1.354156]}\nC: {'rotation matrix': [[0.9999731444122048, 0.0012103447086951903, -0.007132060128435475], [-0.0011432251198222655, 0.9999548360017081, 0.009399220003853712], [0.007143404275372449, -0.009390869439370925, 0.9999297253861809]], 'translation vector': [-0.004051670502601468, 0.003985677205533222, -0.007748124103307941]}\nD: {'rotation matrix': [[-0.88833, -0.238233, 0.392575], [-0.45841, 0.409739, -0.788653], [0.02703, -0.880545, -0.473192]], 'translation vector': [3.286612, 2.733624, 1.354709]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_185_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_185_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_185_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_185_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[-0.929629, 0.142082, -0.340003], [0.367757, 0.416138, -0.831615], [0.023331, -0.898132, -0.439106]], 'translation vector': [3.895597, 4.105544, 1.337128]}\nB: {'rotation matrix': [[0.9999978567692223, -0.002080974608083212, 0.0006913520673930054], [0.0020836393175395, 0.9999892180319079, -0.004134789557599625], [-0.0006820179303371059, 0.00413702467971305, 0.9999915554185601]], 'translation vector': [0.0017503586952924977, 0.001995171524919126, -0.0003721348284200232]}\nC: {'rotation matrix': [[-0.930698, 0.142163, -0.337032], [0.365142, 0.415816, -0.832928], [0.021732, -0.898269, -0.438909]], 'translation vector': [3.896934, 4.102128, 1.337288]}\nD: {'rotation matrix': [[-0.927672, 0.142632, -0.34508], [0.372556, 0.415512, -0.82979], [0.02503, -0.898335, -0.438597]], 'translation vector': [3.896674, 4.103256, 1.336071]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.929629, 0.142082, -0.340003], [0.367757, 0.416138, -0.831615], [0.023331, -0.898132, -0.439106]], 'translation vector': [3.895597, 4.105544, 1.337128]}\nB: {'rotation matrix': [[0.9999978567692223, -0.002080974608083212, 0.0006913520673930054], [0.0020836393175395, 0.9999892180319079, -0.004134789557599625], [-0.0006820179303371059, 0.00413702467971305, 0.9999915554185601]], 'translation vector': [0.0017503586952924977, 0.001995171524919126, -0.0003721348284200232]}\nC: {'rotation matrix': [[-0.930698, 0.142163, -0.337032], [0.365142, 0.415816, -0.832928], [0.021732, -0.898269, -0.438909]], 'translation vector': [3.896934, 4.102128, 1.337288]}\nD: {'rotation matrix': [[-0.927672, 0.142632, -0.34508], [0.372556, 0.415512, -0.82979], [0.02503, -0.898335, -0.438597]], 'translation vector': [3.896674, 4.103256, 1.336071]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_186_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_186_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_186_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_186_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.97654, 0.035326, -0.212419], [0.213134, -0.299258, 0.930064], [-0.030712, -0.953518, -0.299767]], 'translation vector': [2.83562, 1.415562, 1.663413]}\nB: {'rotation matrix': [[0.976528, 0.035613, -0.212425], [0.213211, -0.299739, 0.929891], [-0.030556, -0.953356, -0.300296]], 'translation vector': [2.836028, 1.415543, 1.663749]}\nC: {'rotation matrix': [[0.976477, 0.035047, -0.212752], [0.213359, -0.299571, 0.929912], [-0.031143, -0.95343, -0.300002]], 'translation vector': [2.836339, 1.415174, 1.663386]}\nD: {'rotation matrix': [[0.9999989734089719, -0.00043750334983729943, 
-0.0007413258006153675], [0.00043629581692446537, 0.9999991219954293, -0.0014728840089735186], [0.0007417507454709163, 0.0014731899576831099, 0.9999977641755871]], 'translation vector': [0.0005969954680278278, -0.001266102560834259, -0.001081742192190538]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.97654, 0.035326, -0.212419], [0.213134, -0.299258, 0.930064], [-0.030712, -0.953518, -0.299767]], 'translation vector': [2.83562, 1.415562, 1.663413]}\nB: {'rotation matrix': [[0.976528, 0.035613, -0.212425], [0.213211, -0.299739, 0.929891], [-0.030556, -0.953356, -0.300296]], 'translation vector': [2.836028, 1.415543, 1.663749]}\nC: {'rotation matrix': [[0.976477, 0.035047, -0.212752], [0.213359, -0.299571, 0.929912], [-0.031143, -0.95343, -0.300002]], 'translation vector': [2.836339, 1.415174, 1.663386]}\nD: {'rotation matrix': [[0.9999989734089719, -0.00043750334983729943, -0.0007413258006153675], [0.00043629581692446537, 0.9999991219954293, -0.0014728840089735186], [0.0007417507454709163, 0.0014731899576831099, 0.9999977641755871]], 'translation vector': [0.0005969954680278278, -0.001266102560834259, -0.001081742192190538]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_187_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_187_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_187_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_187_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[0.693693, -0.417889, 0.586651], [-0.719234, -0.35819, 0.595318], [-0.038645, -0.834907, -0.549033]], 'translation vector': [2.468094, 0.650908, 1.47083]}\nB: {'rotation matrix': [[0.694995, -0.417186, 0.58561], [-0.71783, -0.35584, 0.598413], [-0.041266, -0.836262, -0.546775]], 'translation vector': [2.468435, 0.652249, 1.472357]}\nC: {'rotation matrix': [[0.692825, -0.417185, 0.588176], [-0.720192, -0.359253, 0.593516], [-0.036302, -0.834802, -0.549352]], 'translation vector': [2.467356, 0.649437, 1.470088]}\nD: {'rotation matrix': [[0.9999933308482389, -0.003211931119773083, -0.0013319651843937527], [0.003213068050675667, 0.999995289918959, 0.0008982640947086999], [0.0013289957939952358, -0.000903126250102818, 0.9999984333444373]], 'translation vector': [0.0004300736920825887, -0.0014825221206589134, 0.0006853212942847797]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.693693, -0.417889, 0.586651], [-0.719234, -0.35819, 0.595318], [-0.038645, -0.834907, -0.549033]], 'translation vector': [2.468094, 0.650908, 1.47083]}\nB: {'rotation matrix': [[0.694995, -0.417186, 0.58561], [-0.71783, -0.35584, 0.598413], [-0.041266, -0.836262, -0.546775]], 'translation vector': [2.468435, 0.652249, 1.472357]}\nC: {'rotation matrix': [[0.692825, -0.417185, 0.588176], [-0.720192, -0.359253, 0.593516], [-0.036302, -0.834802, -0.549352]], 'translation vector': [2.467356, 0.649437, 1.470088]}\nD: {'rotation matrix': [[0.9999933308482389, -0.003211931119773083, -0.0013319651843937527], [0.003213068050675667, 0.999995289918959, 0.0008982640947086999], [0.0013289957939952358, -0.000903126250102818, 0.9999984333444373]], 'translation vector': [0.0004300736920825887, -0.0014825221206589134, 0.0006853212942847797]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_188_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_188_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_188_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_188_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.999996219803918, 0.0022950861591112606, 0.0016527156635626785], [-0.002306966520835195, 0.9999706555795355, 0.007290423829373102], [-0.0016351749151619617, -0.007294378220159148, 0.9999718191472952]], 'translation vector': [-0.0006106128955529755, 0.003113758643644493, 0.0009225265168792962]}\nB: {'rotation matrix': [[-0.815808, -0.262316, 0.51541], [-0.578233, 0.385672, -0.71896], [-0.010185, -0.884561, -0.466314]], 'translation vector': [2.767913, 1.370181, 1.363789]}\nC: {'rotation matrix': [[-0.81395, -0.261884, 0.518557], [-0.58082, 0.384609, -0.717443], [-0.011555, -0.885151, 
-0.46516]], 'translation vector': [2.76859, 1.370986, 1.364432]}\nD: {'rotation matrix': [[-0.813152, -0.262698, 0.519397], [-0.581967, 0.382082, -0.717863], [-0.009871, -0.886004, -0.463573]], 'translation vector': [2.770085, 1.372341, 1.364365]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.999996219803918, 0.0022950861591112606, 0.0016527156635626785], [-0.002306966520835195, 0.9999706555795355, 0.007290423829373102], [-0.0016351749151619617, -0.007294378220159148, 0.9999718191472952]], 'translation vector': [-0.0006106128955529755, 0.003113758643644493, 0.0009225265168792962]}\nB: {'rotation matrix': [[-0.815808, -0.262316, 0.51541], [-0.578233, 0.385672, -0.71896], [-0.010185, -0.884561, -0.466314]], 'translation vector': [2.767913, 1.370181, 1.363789]}\nC: {'rotation matrix': [[-0.81395, -0.261884, 0.518557], [-0.58082, 0.384609, -0.717443], [-0.011555, -0.885151, -0.46516]], 'translation vector': [2.76859, 1.370986, 1.364432]}\nD: {'rotation matrix': [[-0.813152, -0.262698, 0.519397], [-0.581967, 0.382082, -0.717863], [-0.009871, -0.886004, -0.463573]], 'translation vector': [2.770085, 1.372341, 1.364365]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_189_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_189_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_189_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_189_3.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: 
{'rotation matrix': [[0.928393, -0.117955, 0.352381], [-0.371416, -0.324296, 0.86999], [0.011656, -0.938573, -0.344885]], 'translation vector': [5.42922, 4.041657, 1.370122]}\nB: {'rotation matrix': [[0.9999934814259427, -0.0026531579456658904, 0.00224130267955375], [0.0026602242897140562, 0.9999913138748001, -0.003078123639036722], [-0.0022329200630453808, 0.0030851593249457813, 0.9999929388785541]], 'translation vector': [0.005873456268956634, 0.01508492723074184, 0.0010277199164683282]}\nC: {'rotation matrix': [[0.928402, -0.120953, 0.351341], [-0.37149, -0.322693, 0.870554], [0.008079, -0.938743, -0.344522]], 'translation vector': [5.430759, 4.038916, 1.364124]}\nD: {'rotation matrix': [[0.929388, -0.122542, 0.348169], [-0.369059, -0.323333, 0.87135], [0.005798, -0.938317, -0.345726]], 'translation vector': [5.437048, 4.036695, 1.363649]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.928393, -0.117955, 0.352381], [-0.371416, -0.324296, 0.86999], [0.011656, -0.938573, -0.344885]], 'translation vector': [5.42922, 4.041657, 1.370122]}\nB: {'rotation matrix': [[0.9999934814259427, -0.0026531579456658904, 0.00224130267955375], [0.0026602242897140562, 0.9999913138748001, -0.003078123639036722], [-0.0022329200630453808, 0.0030851593249457813, 0.9999929388785541]], 'translation vector': [0.005873456268956634, 0.01508492723074184, 0.0010277199164683282]}\nC: {'rotation matrix': [[0.928402, -0.120953, 0.351341], [-0.37149, -0.322693, 0.870554], [0.008079, -0.938743, -0.344522]], 'translation vector': [5.430759, 4.038916, 1.364124]}\nD: {'rotation matrix': [[0.929388, -0.122542, 0.348169], [-0.369059, -0.323333, 0.87135], [0.005798, -0.938317, -0.345726]], 'translation vector': [5.437048, 4.036695, 1.363649]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_190_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_190_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_190_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_190_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.953183, -0.088424, 0.289177], [-0.302167, 0.24151, -0.922154], [0.011701, -0.966361, -0.256923]], 'translation vector': [1.211269, 4.890959, 1.556685]}\nB: {'rotation matrix': [[-0.956401, -0.093459, 0.276701], [-0.291436, 0.243606, -0.925052], [0.019048, -0.965361, -0.260222]], 'translation vector': [1.215449, 4.887503, 1.555227]}\nC: {'rotation matrix': [[-0.955218, -0.090693, 0.281661], [-0.295515, 0.243707, -0.92373], [0.015133, -0.965599, -0.259595]], 'translation vector': [1.213553, 4.889249, 1.555428]}\nD: {'rotation matrix': [[0.9999952251812577, 0.0022961074705375706, 
-0.0021161445756707063], [-0.002289678755021547, 0.9999926232978033, 0.0030864959240033065], [0.002122435340409546, -0.003080853599724993, 0.9999927920546235]], 'translation vector': [-0.0038335858537008605, -0.001641279240269633, 0.007141792646779166]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.953183, -0.088424, 0.289177], [-0.302167, 0.24151, -0.922154], [0.011701, -0.966361, -0.256923]], 'translation vector': [1.211269, 4.890959, 1.556685]}\nB: {'rotation matrix': [[-0.956401, -0.093459, 0.276701], [-0.291436, 0.243606, -0.925052], [0.019048, -0.965361, -0.260222]], 'translation vector': [1.215449, 4.887503, 1.555227]}\nC: {'rotation matrix': [[-0.955218, -0.090693, 0.281661], [-0.295515, 0.243707, -0.92373], [0.015133, -0.965599, -0.259595]], 'translation vector': [1.213553, 4.889249, 1.555428]}\nD: {'rotation matrix': [[0.9999952251812577, 0.0022961074705375706, -0.0021161445756707063], [-0.002289678755021547, 0.9999926232978033, 0.0030864959240033065], [0.002122435340409546, -0.003080853599724993, 0.9999927920546235]], 'translation vector': [-0.0038335858537008605, -0.001641279240269633, 0.007141792646779166]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_191_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_191_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_191_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_191_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[0.984658, -0.071177, 0.15932], [-0.173691, -0.312158, 0.934018], [-0.016748, -0.94736, -0.319732]], 'translation vector': [3.953827, 2.817107, 1.554211]}\nB: {'rotation matrix': [[0.984585, -0.072332, 0.159251], [-0.17407, -0.316233, 0.932575], [-0.017095, -0.94592, -0.323949]], 'translation vector': [3.956161, 2.818039, 1.553922]}\nC: {'rotation matrix': [[1.000000390586898, -0.00015899006682454229, -7.860499707006931e-05], [0.0001592053293816045, 0.9999988073723325, -0.001691416563679354], [7.911078695652774e-05, 0.0016920717225864005, 0.999998246065904]], 'translation vector': [-0.0031188610741375022, -0.006275319353865605, -0.0020119857094367255]}\nD: {'rotation matrix': [[0.985021, -0.07075, 0.157254], [-0.171705, -0.318523, 0.932234], [-0.015866, -0.945271, -0.325899]], 'translation vector': [3.958103, 2.817717, 1.548612]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.984658, -0.071177, 0.15932], [-0.173691, -0.312158, 0.934018], [-0.016748, -0.94736, -0.319732]], 'translation vector': [3.953827, 2.817107, 1.554211]}\nB: {'rotation matrix': [[0.984585, -0.072332, 0.159251], [-0.17407, -0.316233, 0.932575], [-0.017095, -0.94592, -0.323949]], 'translation vector': [3.956161, 2.818039, 1.553922]}\nC: {'rotation matrix': [[1.000000390586898, -0.00015899006682454229, -7.860499707006931e-05], [0.0001592053293816045, 0.9999988073723325, -0.001691416563679354], [7.911078695652774e-05, 0.0016920717225864005, 0.999998246065904]], 'translation vector': [-0.0031188610741375022, -0.006275319353865605, -0.0020119857094367255]}\nD: {'rotation matrix': [[0.985021, -0.07075, 0.157254], [-0.171705, -0.318523, 0.932234], [-0.015866, -0.945271, -0.325899]], 'translation vector': [3.958103, 2.817717, 1.548612]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_192_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_192_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_192_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_192_3.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.324348, -0.501243, 0.802218], [-0.945427, 0.143915, -0.292328], [0.031076, -0.853255, -0.520567]], 'translation vector': [-0.28287, 2.921737, 1.307859]}\nB: {'rotation matrix': [[0.9998206713039935, 0.01088419825826216, -0.015474672154137172], [-0.010778287023678227, 0.999918107104106, 0.00690122311170722], [0.015548906823126927, -0.006732006329303681, 0.9998567482634122]], 'translation vector': [-0.0009142965758486277, -0.0021867567072129113, 0.0020065217081752795]}\nC: {'rotation matrix': [[-0.336594, -0.496252, 0.800274], [-0.941144, 0.149432, -0.30318], [0.030867, 
-0.855222, -0.517342]], 'translation vector': [-0.283556, 2.919329, 1.307566]}\nD: {'rotation matrix': [[-0.33061, -0.49822, 0.801544], [-0.943277, 0.147056, -0.297664], [0.03043, -0.854489, -0.518578]], 'translation vector': [-0.283976, 2.918767, 1.308425]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.324348, -0.501243, 0.802218], [-0.945427, 0.143915, -0.292328], [0.031076, -0.853255, -0.520567]], 'translation vector': [-0.28287, 2.921737, 1.307859]}\nB: {'rotation matrix': [[0.9998206713039935, 0.01088419825826216, -0.015474672154137172], [-0.010778287023678227, 0.999918107104106, 0.00690122311170722], [0.015548906823126927, -0.006732006329303681, 0.9998567482634122]], 'translation vector': [-0.0009142965758486277, -0.0021867567072129113, 0.0020065217081752795]}\nC: {'rotation matrix': [[-0.336594, -0.496252, 0.800274], [-0.941144, 0.149432, -0.30318], [0.030867, -0.855222, -0.517342]], 'translation vector': [-0.283556, 2.919329, 1.307566]}\nD: {'rotation matrix': [[-0.33061, -0.49822, 0.801544], [-0.943277, 0.147056, -0.297664], [0.03043, -0.854489, -0.518578]], 'translation vector': [-0.283976, 2.918767, 1.308425]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_193_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_193_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_193_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_193_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[0.988022, -0.009517, -0.154018], [0.15411, 0.009774, 0.988005], [-0.007897, -0.999907, 0.011124]], 'translation vector': [3.954252, 2.675021, 1.588509]}\nB: {'rotation matrix': [[0.9999405001439704, 0.001517544746561925, 0.010769614189192206], [-0.0015446990948080185, 0.9999954653771669, 0.0025653888781814608], [-0.01076601396546154, -0.002581837929747806, 0.999938856444669]], 'translation vector': [-0.04482632526707597, 0.009643399205063075, 0.00020168741742709884]}\nC: {'rotation matrix': [[0.989616, -0.01086, -0.14333], [0.143408, 0.0068, 0.98964], [-0.009773, -0.999918, 0.008287]], 'translation vector': [3.942101, 2.673398, 1.591243]}\nD: {'rotation matrix': [[0.990689, -0.012911, -0.13553], [0.135653, 0.009179, 0.990714], [-0.011547, -0.999875, 0.010845]], 'translation vector': [3.935715, 2.670411, 1.599032]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[0.988022, -0.009517, -0.154018], [0.15411, 0.009774, 0.988005], [-0.007897, -0.999907, 0.011124]], 'translation vector': [3.954252, 2.675021, 1.588509]}\nB: {'rotation matrix': [[0.9999405001439704, 0.001517544746561925, 0.010769614189192206], [-0.0015446990948080185, 0.9999954653771669, 0.0025653888781814608], [-0.01076601396546154, -0.002581837929747806, 0.999938856444669]], 'translation vector': [-0.04482632526707597, 0.009643399205063075, 0.00020168741742709884]}\nC: {'rotation matrix': [[0.989616, -0.01086, -0.14333], [0.143408, 0.0068, 0.98964], [-0.009773, -0.999918, 0.008287]], 'translation vector': [3.942101, 2.673398, 1.591243]}\nD: {'rotation matrix': [[0.990689, -0.012911, -0.13553], [0.135653, 0.009179, 0.990714], [-0.011547, -0.999875, 0.010845]], 'translation vector': [3.935715, 2.670411, 1.599032]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_194_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_194_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_194_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_194_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.986946, -0.051965, 0.152438], [-0.150832, 0.630041, -0.761774], [-0.056457, -0.774822, -0.629654]], 'translation vector': [2.054614, 1.600808, 1.269291]}\nB: {'rotation matrix': [[-0.986874, -0.051472, 0.153072], [-0.151005, 0.630133, -0.761663], [-0.057252, -0.774779, -0.629634]], 'translation vector': [2.054307, 1.600529, 1.268919]}\nC: {'rotation matrix': [[-0.98698, -0.05266, 0.151977], [-0.150937, 0.629701, -0.762033], [-0.055572, -0.77505, -0.629451]], 'translation vector': [2.055977, 1.600957, 1.269368]}\nD: {'rotation matrix': [[0.9999963800999353, -0.00021451754451556594, 
-0.002594557385802436], [0.0002238699679127643, 0.9999932861643056, 0.0035784816404103737], [0.002594523779822574, -0.003578355478331594, 0.9999903397449379]], 'translation vector': [-0.000729278960620583, -0.000294753198244152, 0.0006269109678853635]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.986946, -0.051965, 0.152438], [-0.150832, 0.630041, -0.761774], [-0.056457, -0.774822, -0.629654]], 'translation vector': [2.054614, 1.600808, 1.269291]}\nB: {'rotation matrix': [[-0.986874, -0.051472, 0.153072], [-0.151005, 0.630133, -0.761663], [-0.057252, -0.774779, -0.629634]], 'translation vector': [2.054307, 1.600529, 1.268919]}\nC: {'rotation matrix': [[-0.98698, -0.05266, 0.151977], [-0.150937, 0.629701, -0.762033], [-0.055572, -0.77505, -0.629451]], 'translation vector': [2.055977, 1.600957, 1.269368]}\nD: {'rotation matrix': [[0.9999963800999353, -0.00021451754451556594, -0.002594557385802436], [0.0002238699679127643, 0.9999932861643056, 0.0035784816404103737], [0.002594523779822574, -0.003578355478331594, 0.9999903397449379]], 'translation vector': [-0.000729278960620583, -0.000294753198244152, 0.0006269109678853635]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_195_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_195_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_195_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_195_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", 
"options": "A: {'rotation matrix': [[-0.68897, 0.400817, -0.603877], [0.724521, 0.403569, -0.55875], [0.01975, -0.822483, -0.568447]], 'translation vector': [2.703838, 2.593028, 1.451995]}\nB: {'rotation matrix': [[0.9999867433099353, 0.0013037371773630478, -0.00505037420137311], [-0.0013327082277047075, 0.9999835004529126, -0.005699178965962697], [0.005042267928314986, 0.005705137184663826, 0.9999709125049986]], 'translation vector': [-0.003533739342698565, -0.0004956556588801009, 0.0008851453202214365]}\nC: {'rotation matrix': [[-0.687775, 0.405276, -0.60226], [0.725687, 0.405077, -0.55614], [0.018572, -0.819551, -0.572706]], 'translation vector': [2.702493, 2.593958, 1.452821]}\nD: {'rotation matrix': [[-0.6898, 0.39893, -0.604178], [0.723736, 0.402474, -0.560554], [0.019544, -0.823935, -0.566347]], 'translation vector': [2.703783, 2.591564, 1.452902]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.68897, 0.400817, -0.603877], [0.724521, 0.403569, -0.55875], [0.01975, -0.822483, -0.568447]], 'translation vector': [2.703838, 2.593028, 1.451995]}\nB: {'rotation matrix': [[0.9999867433099353, 0.0013037371773630478, -0.00505037420137311], [-0.0013327082277047075, 0.9999835004529126, -0.005699178965962697], [0.005042267928314986, 0.005705137184663826, 0.9999709125049986]], 'translation vector': [-0.003533739342698565, -0.0004956556588801009, 0.0008851453202214365]}\nC: {'rotation matrix': [[-0.687775, 0.405276, -0.60226], [0.725687, 0.405077, -0.55614], [0.018572, -0.819551, -0.572706]], 'translation vector': [2.702493, 2.593958, 1.452821]}\nD: {'rotation matrix': [[-0.6898, 0.39893, -0.604178], [0.723736, 0.402474, -0.560554], [0.019544, -0.823935, -0.566347]], 'translation vector': [2.703783, 2.591564, 1.452902]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_196_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_196_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_196_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_196_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[0.956857, -0.169845, 0.235747], [-0.290511, -0.544648, 0.786741], [-0.005224, -0.821286, -0.570492]], 'translation vector': [1.276382, 2.833935, 1.317457]}\nB: {'rotation matrix': [[0.9999970754093099, -0.00022106845961624886, 0.002343703924443124], [0.0002217235397933884, 1.000000114885585, -0.0003702254651105004], [-0.0023436320323324275, 0.00037212161436898277, 0.9999969566117395]], 'translation vector': [-0.0012761111666008684, -0.00028494477773222116, -0.0002961069063054378]}\nC: {'rotation matrix': [[0.956588, -0.169724, 0.236925], [-0.291413, -0.54533, 0.785935], [-0.00419, 
-0.820859, -0.571116]], 'translation vector': [1.27605, 2.834144, 1.316524]}\nD: {'rotation matrix': [[0.956511, -0.169195, 0.237615], [-0.291683, -0.546399, 0.785092], [-0.003001, -0.820257, -0.571988]], 'translation vector': [1.276076, 2.834318, 1.31658]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[0.956857, -0.169845, 0.235747], [-0.290511, -0.544648, 0.786741], [-0.005224, -0.821286, -0.570492]], 'translation vector': [1.276382, 2.833935, 1.317457]}\nB: {'rotation matrix': [[0.9999970754093099, -0.00022106845961624886, 0.002343703924443124], [0.0002217235397933884, 1.000000114885585, -0.0003702254651105004], [-0.0023436320323324275, 0.00037212161436898277, 0.9999969566117395]], 'translation vector': [-0.0012761111666008684, -0.00028494477773222116, -0.0002961069063054378]}\nC: {'rotation matrix': [[0.956588, -0.169724, 0.236925], [-0.291413, -0.54533, 0.785935], [-0.00419, -0.820859, -0.571116]], 'translation vector': [1.27605, 2.834144, 1.316524]}\nD: {'rotation matrix': [[0.956511, -0.169195, 0.237615], [-0.291683, -0.546399, 0.785092], [-0.003001, -0.820257, -0.571988]], 'translation vector': [1.276076, 2.834318, 1.31658]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_197_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_197_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_197_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_197_3.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": 
"SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.13245, -0.562539, 0.816092], [-0.991146, 0.067404, -0.114398], [0.009346, -0.824018, -0.566486]], 'translation vector': [2.413971, 4.448666, 1.362137]}\nB: {'rotation matrix': [[-0.128485, -0.560739, 0.817963], [-0.991659, 0.064113, -0.111818], [0.010258, -0.825507, -0.564299]], 'translation vector': [2.417159, 4.443525, 1.361777]}\nC: {'rotation matrix': [[-0.132037, -0.563719, 0.815345], [-0.991214, 0.068574, -0.113106], [0.007848, -0.823115, -0.567821]], 'translation vector': [2.411706, 4.446467, 1.360844]}\nD: {'rotation matrix': [[0.9999987411869987, 0.0004418575380488951, 0.0016406984025160954], [-0.0004493688554351619, 0.9999862647190665, 0.005219826446688158], [-0.0016383092715178728, -0.005221150249650812, 0.9999852785117939]], 'translation vector': [-0.005409867262581081, 0.0012422036096766398, -0.002713386665627704]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. 
\nSelect from the following choices.\nA: {'rotation matrix': [[-0.13245, -0.562539, 0.816092], [-0.991146, 0.067404, -0.114398], [0.009346, -0.824018, -0.566486]], 'translation vector': [2.413971, 4.448666, 1.362137]}\nB: {'rotation matrix': [[-0.128485, -0.560739, 0.817963], [-0.991659, 0.064113, -0.111818], [0.010258, -0.825507, -0.564299]], 'translation vector': [2.417159, 4.443525, 1.361777]}\nC: {'rotation matrix': [[-0.132037, -0.563719, 0.815345], [-0.991214, 0.068574, -0.113106], [0.007848, -0.823115, -0.567821]], 'translation vector': [2.411706, 4.446467, 1.360844]}\nD: {'rotation matrix': [[0.9999987411869987, 0.0004418575380488951, 0.0016406984025160954], [-0.0004493688554351619, 0.9999862647190665, 0.005219826446688158], [-0.0016383092715178728, -0.005221150249650812, 0.9999852785117939]], 'translation vector': [-0.005409867262581081, 0.0012422036096766398, -0.002713386665627704]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_198_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_198_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_198_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_198_3.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Pose_Estimation", "visual_input_component": "3d image", "source": "SCANNET_threed_pose_estimation", "options": "A: {'rotation matrix': [[-0.394987, 0.317496, -0.862079], [0.912593, 0.027696, -0.407931], [-0.10564, -0.947855, -0.300685]], 'translation vector': [4.882912, 2.963368, 1.402415]}\nB: {'rotation matrix': [[-0.393743, 0.318728, -0.862194], [0.912946, 0.026185, -0.40724], [-0.107222, -0.947484, -0.301292]], 'translation vector': [4.884082, 2.960136, 1.407949]}\nC: {'rotation matrix': [[-0.393984, 0.318317, -0.862236], [0.913424, 0.031343, -0.405802], [-0.102149, -0.947466, -0.303106]], 'translation vector': [4.883262, 2.96182, 1.402411]}\nD: {'rotation matrix': [[0.9999866432276233, 0.002858711999338773, 
-0.0043010740025884895], [-0.0027980802651053054, 0.9998995896653632, 0.013894745582855106], [0.004339725514880082, -0.013881253175420062, 0.9998943009326048]], 'translation vector': [-0.001200424251745491, -0.0035619824296807545, 0.0012814893270478578]}", "question": "Given a pair of a RGB image and a depth image for scan 0, along with another pair of a RGB image and a depth image for scan 1, please estimate the relative camera pose from scan 0 to scan 1. The output should indicate the rotation matrix and the translation vector.", "context": "Your task is to estimate the relative camera pose between two scans. \nSelect from the following choices.\nA: {'rotation matrix': [[-0.394987, 0.317496, -0.862079], [0.912593, 0.027696, -0.407931], [-0.10564, -0.947855, -0.300685]], 'translation vector': [4.882912, 2.963368, 1.402415]}\nB: {'rotation matrix': [[-0.393743, 0.318728, -0.862194], [0.912946, 0.026185, -0.40724], [-0.107222, -0.947484, -0.301292]], 'translation vector': [4.884082, 2.960136, 1.407949]}\nC: {'rotation matrix': [[-0.393984, 0.318317, -0.862236], [0.913424, 0.031343, -0.405802], [-0.102149, -0.947466, -0.303106]], 'translation vector': [4.883262, 2.96182, 1.402411]}\nD: {'rotation matrix': [[0.9999866432276233, 0.002858711999338773, -0.0043010740025884895], [-0.0027980802651053054, 0.9998995896653632, 0.013894745582855106], [0.004339725514880082, -0.013881253175420062, 0.9998943009326048]], 'translation vector': [-0.001200424251745491, -0.0035619824296807545, 0.0012814893270478578]}", "input_image_path": ["./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_199_0.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_199_1.png", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_199_2.jpg", "./3D-spatial/threeD_Pose_Estimation/threeD_Pose_Estimation_199_3.png"], "output": "D", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/threeD_Scene_Reconstruction/qwen3-vl/metadata_info.json 
b/results/threeD_Scene_Reconstruction/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..9866bbd --- /dev/null +++ b/results/threeD_Scene_Reconstruction/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.424269, -0.366439, 0.828081], [-0.894198, -0.025281, 0.446957], [-0.142848, -0.930098, -0.338395]] and translation vector: [2.638367, 6.760901, 1.41712], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.432512, -0.37625, 0.819371], [-0.890339, -0.034872, 0.45396], [-0.14223, -0.925862, -0.350073]] and translation vector: [2.640049, 6.763855, 1.420073], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.438239, -0.392392, 0.808687], [-0.889665, -0.061011, 0.452519], [-0.128226, -0.917772, -0.375835]] and translation vector: [2.630422, 6.772062, 1.413381]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_0_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.156961, 0.257294, -0.953501], [0.986843, 0.002956, -0.161652], [-0.038773, -0.966329, -0.254373]] and translation vector: [1.838324, 1.205476, 1.480452], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.155829, 0.255617, -0.954137], [0.987039, 0.002796, -0.160453], [-0.038347, -0.966774, -0.252739]] and translation vector: [1.83996, 1.205416, 1.474648], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.143517, 0.25546, -0.956108], [0.988424, -0.011031, -0.151315], [-0.049202, -0.966757, -0.25092]] and translation vector: [1.851541, 1.18465, 1.4701]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_1_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.255252, -0.433184, 0.864406], [-0.966562, 0.137073, -0.216725], [-0.024605, -0.890821, -0.453687]] and translation vector: [1.468232, 3.881342, 1.432686], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.253329, -0.437174, 0.862962], [-0.967015, 0.138948, -0.213484], [-0.026577, -0.888579, -0.457953]] and translation vector: [1.469363, 3.879031, 1.438972], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.261321, -0.422366, 0.867939], [-0.964773, 0.142608, 
-0.221079], [-0.030398, -0.895137, -0.444754]] and translation vector: [1.471272, 3.88079, 1.429099]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_2_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.99336, -0.011945, -0.114427], [0.103059, -0.349694, 0.931178], [-0.051137, -0.936788, -0.346141]] and translation vector: [2.948285, 4.432959, 1.460427], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.99314, -0.016022, -0.115825], [0.102925, -0.35027, 0.930977], [-0.055486, -0.936512, -0.346218]] and translation vector: [2.949102, 4.433566, 1.463483], please estimate the RGB image for the 
query camera pose, i.e., rotation matrix: [[0.994232, -0.017087, -0.105881], [0.09324, -0.350155, 0.93204], [-0.053001, -0.936536, -0.346542]] and translation vector: [2.955784, 4.441682, 1.459117]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_3_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.30056, -0.511506, 0.805], [-0.953151, 0.130866, -0.272721], [0.034151, -0.849256, -0.526876]] and translation vector: [-0.281614, 2.924112, 1.306122], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.318531, -0.50267, 0.803655], [-0.947336, 0.139247, -0.288383], [0.033055, -0.85319, -0.520551]] 
and translation vector: [-0.284617, 2.924129, 1.305331], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.357195, -0.491936, 0.793984], [-0.933829, 0.17044, -0.314507], [0.019391, -0.853785, -0.520264]] and translation vector: [-0.283755, 2.908583, 1.310995]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_4_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.042655, 0.409797, -0.911179], [0.998036, -0.024411, -0.0577], [-0.045888, -0.91185, -0.40795]] and translation vector: [2.423933, 1.356295, 3.282493], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: 
[[-0.032887, 0.418885, -0.907444], [0.998611, -0.023628, -0.047098], [-0.041169, -0.907732, -0.417526]] and translation vector: [2.425306, 1.358764, 3.278826], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.041885, 0.387609, -0.920872], [0.998138, -0.024683, -0.055789], [-0.044354, -0.921493, -0.385853]] and translation vector: [2.418078, 1.34298, 3.29873]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_5_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.241978, -0.427128, 0.871211], [-0.963615, 0.210861, -0.164264], [-0.113543, -0.879261, -0.462611]] and translation vector: [2.164319, 10.11033, 1.716674], 
and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.23973, -0.426819, 0.871983], [-0.964754, 0.205144, -0.16482], [-0.108534, -0.880762, -0.460955]] and translation vector: [2.164643, 10.108889, 1.726434], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.169937, -0.42419, 0.889485], [-0.982379, 0.144175, -0.118927], [-0.077795, -0.894023, -0.441217]] and translation vector: [2.137954, 10.094281, 1.733226]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_6_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.753053, 0.123809, -0.646206], [0.619922, -0.462608, 
0.633791], [-0.220471, -0.877875, -0.42512]] and translation vector: [4.259223, 3.769218, 1.505729], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.760823, 0.125761, -0.636658], [0.611756, -0.466381, 0.638939], [-0.216572, -0.875599, -0.431768]] and translation vector: [4.257898, 3.775608, 1.505422], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.792722, 0.129689, -0.595629], [0.575941, -0.479462, 0.662124], [-0.199711, -0.867927, -0.454772]] and translation vector: [4.245731, 3.788037, 1.507869]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_7_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the 
corresponding camera pose, i.e., rotation matrix: [[-0.40936, -0.486807, 0.77165], [-0.912164, 0.236459, -0.334729], [-0.019515, -0.840896, -0.540844]] and translation vector: [1.412713, 1.214489, 1.390939], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.417972, -0.487805, 0.766384], [-0.908352, 0.237425, -0.344277], [-0.014019, -0.840045, -0.542336]] and translation vector: [1.411881, 1.212071, 1.390231], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.442659, -0.487865, 0.752356], [-0.896674, 0.245809, -0.368176], [-0.005316, -0.837595, -0.546266]] and translation vector: [1.400211, 1.203382, 1.386707]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_8_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th 
image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.999403, 0.004498, 0.03425], [-0.034232, -0.004158, 0.999405], [0.004638, -0.999981, -0.004001]] and translation vector: [2.393484, 5.775056, 1.371464], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.998454, -0.001139, 0.055575], [-0.055569, 0.004857, 0.998443], [-0.001408, -0.999988, 0.004786]] and translation vector: [2.356134, 5.774678, 1.367739], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.989764, 0.002175, 0.142698], [-0.142529, 0.066115, 0.98758], [-0.007287, -0.99781, 0.065748]] and translation vector: [2.255451, 5.785594, 1.33032]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_9_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.133825, -0.39571, 0.908573], [-0.990975, -0.046263, 0.125813], [-0.007752, -0.91721, -0.398329]] and translation vector: [4.990516, 4.227292, 1.32289], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.168071, -0.388121, 0.906153], [-0.985699, -0.054747, 0.159375], [-0.012247, -0.919981, -0.391772]] and translation vector: [4.987841, 4.19209, 1.32312], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.233014, -0.364692, 0.901501], [-0.972471, -0.085505, 0.216767], [-0.00197, -0.927194, -0.374577]] and translation vector: [4.985941, 4.092797, 1.324644]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_10_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.993306, 0.029023, -0.111812], [0.110831, -0.512349, 0.851596], [-0.032571, -0.858287, -0.512136]] and translation vector: [2.482234, 1.391135, 1.348064], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.992702, 0.031717, -0.116349], [0.116167, -0.510508, 0.85199], [-0.032374, -0.859288, -0.510467]] and translation vector: [2.48213, 1.388715, 1.34704], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.989452, 0.033499, -0.140936], [0.139029, -0.492892, 
0.858911], [-0.040694, -0.869445, -0.49235]] and translation vector: [2.480608, 1.381749, 1.351104]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_11_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.473704, -0.275929, 0.836342], [-0.879436, -0.198746, 0.432542], [0.046868, -0.940406, -0.336809]] and translation vector: [2.984934, 2.048073, 1.446683], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.466625, -0.271085, 0.841888], [-0.8831, -0.195475, 0.426525], [0.048943, -0.942498, -0.330608]] and translation vector: [2.979092, 2.049407, 1.446378], please estimate the RGB image for 
the query camera pose, i.e., rotation matrix: [[0.457049, -0.259072, 0.850875], [-0.888339, -0.18058, 0.422191], [0.044273, -0.948827, -0.312678]] and translation vector: [2.973803, 2.044357, 1.455601]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_12_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.996429, -0.081152, -0.023325], [-0.01119, 0.400709, -0.916137], [0.083693, -0.912604, -0.400187]] and translation vector: [7.365378, 2.610504, 1.343957], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.997089, -0.075007, -0.013671], [-0.016913, 0.392439, -0.919623], [0.074343, 
-0.916715, -0.392565]] and translation vector: [7.36531, 2.61944, 1.344548], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.997405, -0.064807, -0.031376], [0.004675, 0.376559, -0.926381], [0.071851, -0.924123, -0.375279]] and translation vector: [7.389543, 2.653858, 1.358479]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_13_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.994136, 0.036629, -0.101745], [0.107123, -0.462198, 0.880283], [-0.014782, -0.88602, -0.463411]] and translation vector: [3.8191, 1.340951, 1.354002], and another pair of RGB and depth images with the corresponding camera pose, 
i.e., rotation matrix: [[0.994264, 0.034625, -0.101195], [0.105882, -0.452335, 0.885541], [-0.015112, -0.891176, -0.453407]] and translation vector: [3.821174, 1.339834, 1.359098], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.998446, 0.039334, -0.039482], [0.052098, -0.407104, 0.911895], [0.019796, -0.912535, -0.408521]] and translation vector: [3.821787, 1.333543, 1.372052]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_14_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.844798, -0.442354, 0.301064], [-0.534849, 0.714819, -0.450523], [-0.015916, -0.541624, -0.84047]] and translation vector: 
[3.085932, 7.995926, 1.934485], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.833593, -0.457276, 0.309873], [-0.552243, 0.702368, -0.449118], [-0.012274, -0.545507, -0.838017]] and translation vector: [3.091993, 8.002051, 1.93396], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.810018, -0.472367, 0.347478], [-0.58602, 0.673547, -0.450461], [-0.02126, -0.56851, -0.822401]] and translation vector: [3.083665, 8.001425, 1.939036]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_15_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.937403, 0.174354, 
-0.301457], [0.34768, 0.517889, -0.781607], [0.019845, -0.837491, -0.54609]] and translation vector: [1.513881, 1.499843, 1.388066], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.93698, 0.17766, -0.300842], [0.348874, 0.522274, -0.77815], [0.018876, -0.834067, -0.551341]] and translation vector: [1.515168, 1.503997, 1.385631], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.940806, 0.177334, -0.288855], [0.338804, 0.516688, -0.786286], [0.009813, -0.837607, -0.546185]] and translation vector: [1.517717, 1.515309, 1.387193]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_16_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB 
and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.964843, 0.186346, -0.185345], [0.252505, 0.461537, -0.850426], [-0.07293, -0.867329, -0.492364]] and translation vector: [3.779865, 2.337391, 1.461827], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.966867, 0.182729, -0.178267], [0.244986, 0.467845, -0.849178], [-0.071768, -0.864715, -0.49711]] and translation vector: [3.779708, 2.335608, 1.46105], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.975115, 0.169172, -0.14329], [0.209929, 0.496761, -0.842115], [-0.071282, -0.85124, -0.519913]] and translation vector: [3.784041, 2.330569, 1.454727]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_17_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: 
The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.399387, 0.327689, -0.856218], [0.9115, 0.041819, -0.409169], [-0.098274, -0.94386, -0.315391]] and translation vector: [4.88233, 2.963563, 1.403722], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.394763, 0.316878, -0.86241], [0.913367, 0.033579, -0.40575], [-0.099614, -0.947872, -0.302681]] and translation vector: [4.88409, 2.965299, 1.400614], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.386874, 0.309114, -0.868779], [0.915474, 0.015736, -0.402069], [-0.110614, -0.950895, -0.289074]] and translation vector: [4.883719, 2.961581, 1.413125]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_18_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.830629, 0.239867, -0.502514], [0.556756, 0.37214, -0.742654], [0.008867, -0.896647, -0.442658]] and translation vector: [4.849209, 2.614689, 1.447477], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.826514, 0.239564, -0.509396], [0.562778, 0.371773, -0.738286], [0.012512, -0.89688, -0.442097]] and translation vector: [4.848542, 2.612423, 1.449706], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.822193, 0.245879, -0.513364], [0.569134, 0.369775, -0.734406], [0.009254, -0.895997, -0.443965]] and translation vector: [4.848, 2.609138, 1.450893]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_19_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.815869, 0.244354, -0.524069], [0.578211, -0.336271, 0.743367], [0.005416, -0.909513, -0.415641]] and translation vector: [2.358014, 1.230078, 1.369842], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.817563, 0.244526, -0.521342], [0.575764, -0.332513, 0.746947], [0.009295, -0.910847, -0.41264]] and translation vector: [2.355037, 1.229076, 1.372478], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.827304, 0.233324, -0.511006], [0.561698, -0.330711, 
0.758371], [0.007951, -0.914434, -0.404656]] and translation vector: [2.3528, 1.226651, 1.376959]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_20_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.880278, -0.246293, 0.405524], [-0.473973, 0.417832, -0.775091], [0.021459, -0.874503, -0.484545]] and translation vector: [3.281806, 2.754624, 1.352781], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.883446, -0.241464, 0.401521], [-0.467927, 0.41107, -0.782347], [0.023856, -0.879043, -0.476146]] and translation vector: [3.2823, 2.745028, 1.352692], please estimate the RGB image for 
the query camera pose, i.e., rotation matrix: [[-0.889317, -0.237291, 0.390907], [-0.456246, 0.402627, -0.793556], [0.030913, -0.884073, -0.466326]] and translation vector: [3.299646, 2.724283, 1.356988]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_21_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.752388, 0.33007, -0.570058], [0.655329, 0.287372, -0.698542], [-0.066749, -0.89915, -0.43252]] and translation vector: [3.814293, 2.583141, 1.394159], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.750374, 0.330815, -0.572276], [0.657793, 0.28836, -0.695813], [-0.065164, -0.89856, 
-0.433986]] and translation vector: [3.802971, 2.57897, 1.383742], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.774913, 0.365169, -0.515909], [0.625622, 0.32685, -0.708355], [-0.090045, -0.871677, -0.481738]] and translation vector: [3.702851, 2.52357, 1.379531]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_22_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.764638, 0.028658, -0.643823], [0.64431, -0.055554, 0.762744], [-0.013909, -0.998044, -0.060944]] and translation vector: [3.061982, 3.98913, 1.495508], and another pair of RGB and depth images with the corresponding camera pose, i.e., 
rotation matrix: [[0.765028, 0.027801, -0.643396], [0.643825, -0.056098, 0.763114], [-0.014878, -0.998038, -0.060816]] and translation vector: [3.064652, 3.991985, 1.487138], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.769869, 0.028995, -0.637544], [0.638044, -0.057257, 0.767869], [-0.01424, -0.997939, -0.06258]] and translation vector: [3.059477, 3.994236, 1.491082]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_23_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.436119, -0.427186, 0.79203], [-0.89981, 0.218659, -0.377532], [-0.011909, -0.877326, -0.479747]] and translation vector: [1.992302, 
3.72193, 1.553249], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.436462, -0.426736, 0.792084], [-0.899636, 0.219226, -0.377618], [-0.012502, -0.877403, -0.47959]] and translation vector: [1.991236, 3.722176, 1.553282], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.436236, -0.428201, 0.791418], [-0.899775, 0.217489, -0.37829], [-0.010141, -0.877122, -0.480161]] and translation vector: [1.989599, 3.72313, 1.552786]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_24_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.985254, -0.134646, 0.105573], 
[-0.142287, -0.302097, 0.942599], [-0.095024, -0.94372, -0.3168]] and translation vector: [1.134605, 1.549487, 1.505245], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.985752, -0.13049, 0.106142], [-0.141062, -0.297585, 0.944216], [-0.091624, -0.945736, -0.311752]] and translation vector: [1.131707, 1.551058, 1.506377], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.987724, -0.11535, 0.105339], [-0.134913, -0.289999, 0.94747], [-0.078743, -0.95005, -0.302001]] and translation vector: [1.113611, 1.565945, 1.522577]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_25_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth 
images with the corresponding camera pose, i.e., rotation matrix: [[0.567127, -0.123224, 0.81436], [-0.823556, -0.071568, 0.562702], [-0.011056, -0.989795, -0.14207]] and translation vector: [0.249561, 0.967409, 1.634127], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.566682, -0.123694, 0.814599], [-0.82386, -0.07149, 0.562268], [-0.011313, -0.989742, -0.142418]] and translation vector: [0.249762, 0.967631, 1.633273], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.570813, -0.115531, 0.812912], [-0.82106, -0.073224, 0.566127], [-0.005881, -0.990601, -0.136655]] and translation vector: [0.269192, 0.984284, 1.63838]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_26_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th 
image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.08083, -0.463089, 0.882618], [-0.994842, 0.091929, -0.042874], [-0.061284, -0.881531, -0.468131]] and translation vector: [4.543997, 3.147744, 1.235262], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.097623, -0.477164, 0.873375], [-0.993778, 0.094019, -0.059714], [-0.05362, -0.873771, -0.483373]] and translation vector: [4.550471, 3.148599, 1.246367], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.130487, -0.461277, 0.877608], [-0.991003, 0.087264, -0.101481], [-0.029773, -0.882954, -0.468514]] and translation vector: [4.556965, 3.161462, 1.2534]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_27_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.117057, -0.769276, 0.628102], [-0.987232, -0.021336, 0.157855], [-0.108033, -0.638561, -0.761951]] and translation vector: [1.032686, 1.226834, 2.186959], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.111522, -0.769903, 0.628341], [-0.98843, -0.020525, 0.150284], [-0.102807, -0.637831, -0.763284]] and translation vector: [1.037875, 1.232625, 2.186027], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.047902, -0.766247, 0.640758], [-0.996596, 0.006426, 0.082189], [-0.067095, -0.642514, -0.763331]] and translation vector: [1.085053, 1.269848, 2.178721]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_28_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.824719, -0.175736, 0.537546], [-0.564369, 0.316962, -0.762249], [-0.036427, -0.932015, -0.360584]] and translation vector: [4.397487, 4.054199, 1.411764], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.821778, -0.181799, 0.540028], [-0.568729, 0.319986, -0.757731], [-0.035047, -0.929816, -0.366351]] and translation vector: [4.391561, 4.044915, 1.406417], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.814573, -0.211319, 0.540199], [-0.579135, 
0.348873, -0.736811], [-0.032758, -0.913034, -0.406565]] and translation vector: [4.415594, 3.989866, 1.391957]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_29_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.14018, 0.443083, -0.885453], [0.989985, -0.07783, 0.117782], [-0.016727, -0.893096, -0.449556]] and translation vector: [3.549726, 0.935059, 1.485921], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.140682, 0.443565, -0.885132], [0.989931, -0.077142, 0.11868], [-0.015638, -0.892916, -0.449951]] and translation vector: [3.549777, 0.934132, 1.483108], please estimate the RGB 
image for the query camera pose, i.e., rotation matrix: [[0.137256, 0.445178, -0.88486], [0.99043, -0.074707, 0.116046], [-0.014444, -0.89232, -0.451172]] and translation vector: [3.545579, 0.936731, 1.483973]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_30_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.59597, 0.482312, -0.642025], [0.802979, -0.35126, 0.4815], [0.006716, -0.802491, -0.596626]] and translation vector: [3.449961, 1.112515, 1.412234], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.596047, 0.483799, -0.640833], [0.802896, -0.349913, 0.482617], [0.009254, 
-0.802184, -0.597005]] and translation vector: [3.451157, 1.111087, 1.411899], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.59137, 0.494753, -0.636789], [0.806303, -0.350525, 0.476453], [0.012516, -0.795205, -0.606211]] and translation vector: [3.452706, 1.109482, 1.412867]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_31_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.112591, -0.547395, 0.829266], [-0.992672, 0.098819, -0.069547], [-0.043877, -0.83102, -0.55451]] and translation vector: [1.18498, 1.814175, 1.496605], and another pair of RGB and depth images with the corresponding camera pose, 
i.e., rotation matrix: [[-0.111637, -0.546351, 0.830083], [-0.992679, 0.100057, -0.067648], [-0.046096, -0.831558, -0.553521]] and translation vector: [1.186424, 1.810214, 1.495373], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.122401, -0.542747, 0.83093], [-0.991535, 0.103412, -0.078512], [-0.043316, -0.833506, -0.55081]] and translation vector: [1.193691, 1.805185, 1.501094]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_32_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.606497, 0.359513, -0.709163], [0.793947, -0.321582, 0.515978], [-0.042553, -0.875977, -0.480473]] and translation vector: 
[5.898605, 1.464963, 1.329018], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.603336, 0.358994, -0.712116], [0.79647, -0.316333, 0.515334], [-0.040264, -0.878098, -0.476783]] and translation vector: [5.91512, 1.4588, 1.326343], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.586247, 0.38946, -0.710377], [0.809914, -0.302115, 0.502759], [-0.018811, -0.870085, -0.492543]] and translation vector: [6.035654, 1.433116, 1.31748]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_33_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.590232, -0.352789, 
0.726062], [-0.807221, -0.252962, 0.533296], [-0.004475, -0.900861, -0.434086]] and translation vector: [2.518124, 2.463328, 1.346668], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.586587, -0.358769, 0.726086], [-0.809845, -0.250747, 0.530356], [-0.008212, -0.899117, -0.437632]] and translation vector: [2.520116, 2.462175, 1.344964], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.595628, -0.375207, 0.710244], [-0.80316, -0.264233, 0.533961], [-0.012675, -0.888482, -0.458736]] and translation vector: [2.525984, 2.461792, 1.333971]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_34_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair 
of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.934582, -0.143102, 0.325696], [-0.355737, 0.383069, -0.852473], [-0.002774, -0.912568, -0.408916]] and translation vector: [2.694367, 2.483235, 1.465763], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.935747, -0.141154, 0.323191], [-0.352667, 0.379116, -0.85551], [-0.001768, -0.91452, -0.404537]] and translation vector: [2.694351, 2.483417, 1.465522], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.94215, -0.147808, 0.300842], [-0.33486, 0.375166, -0.864361], [0.014894, -0.915098, -0.402958]] and translation vector: [2.702719, 2.477868, 1.47257]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_35_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.684823, -0.326379, 0.651532], [-0.728707, -0.304485, 0.613413], [-0.001823, -0.894855, -0.446353]] and translation vector: [2.86358, 2.414664, 1.549631], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.684506, -0.325468, 0.652321], [-0.729004, -0.308374, 0.611113], [0.002261, -0.893855, -0.448351]] and translation vector: [2.864701, 2.413023, 1.547001], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.67888, -0.327994, 0.656918], [-0.733931, -0.329441, 0.593981], [0.021593, -0.885375, -0.464376]] and translation vector: [2.877256, 2.417151, 1.541322]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_36_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.882784, 0.25224, -0.396318], [0.469583, -0.498211, 0.728888], [-0.013595, -0.829554, -0.55826]] and translation vector: [3.463734, 1.394934, 1.262723], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.883097, 0.250738, -0.396574], [0.468931, -0.499833, 0.728197], [-0.015634, -0.829034, -0.558979]] and translation vector: [3.462241, 1.393432, 1.262782], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.878878, 0.250773, -0.405817], [0.476653, -0.496234, 
0.725641], [-0.019409, -0.831183, -0.55566]] and translation vector: [3.458656, 1.394662, 1.254618]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_37_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.752445, 0.275595, -0.598225], [0.657828, -0.35994, 0.661593], [-0.032994, -0.891342, -0.452129]] and translation vector: [2.633805, 2.70906, 1.31733], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.746128, 0.269733, -0.608718], [0.664676, -0.35493, 0.657443], [-0.038718, -0.895136, -0.444108]] and translation vector: [2.667176, 2.689206, 1.310347], please estimate the RGB image for the 
query camera pose, i.e., rotation matrix: [[0.736878, 0.253582, -0.626664], [0.67323, -0.359496, 0.646161], [-0.061428, -0.89803, -0.435624]] and translation vector: [2.744361, 2.610373, 1.319779]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_38_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.386761, -0.304254, 0.870543], [-0.920043, 0.191539, -0.34181], [-0.062746, -0.933136, -0.354007]] and translation vector: [2.082368, 4.008438, 1.845888], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.387201, -0.298257, 0.872421], [-0.919947, 0.188025, -0.344013], [-0.061432, -0.935783, 
-0.347183]] and translation vector: [2.08001, 4.010775, 1.842824], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.376594, -0.325714, 0.867229], [-0.924884, 0.185353, -0.332016], [-0.052601, -0.927122, -0.371051]] and translation vector: [2.082613, 4.009402, 1.837637]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_39_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.484778, 0.389748, -0.782998], [0.874059, -0.248441, 0.417491], [-0.031813, -0.886777, -0.461102]] and translation vector: [2.948564, 2.712566, 1.480667], and another pair of RGB and depth images with the corresponding camera pose, i.e., 
rotation matrix: [[0.484062, 0.388161, -0.784229], [0.874419, -0.248162, 0.416902], [-0.03279, -0.887551, -0.459542]] and translation vector: [2.949191, 2.711738, 1.477649], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.467232, 0.39177, -0.792597], [0.88347, -0.241629, 0.401368], [-0.034271, -0.887768, -0.459014]] and translation vector: [2.947397, 2.72527, 1.480424]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_40_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.493838, -0.420518, 0.76111], [-0.864926, -0.147366, 0.479777], [-0.089593, -0.895236, -0.436493]] and translation vector: [0.736944, 
2.108944, 1.402726], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.487676, -0.423405, 0.763479], [-0.869284, -0.154634, 0.469504], [-0.080731, -0.892646, -0.443471]] and translation vector: [0.733117, 2.095654, 1.39687], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.480924, -0.423346, 0.767783], [-0.872629, -0.146192, 0.465989], [-0.085031, -0.894095, -0.439732]] and translation vector: [0.701425, 2.057617, 1.397946]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_41_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.874867, -0.0675, 0.479638], 
[-0.482919, 0.197999, -0.852987], [-0.037391, -0.977875, -0.205819]] and translation vector: [2.397274, 1.722858, 1.486845], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.874077, -0.063653, 0.4816], [-0.484123, 0.196153, -0.852731], [-0.040189, -0.978505, -0.202269]] and translation vector: [2.402604, 1.721845, 1.489477], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.887879, -0.058916, 0.456289], [-0.458188, 0.203011, -0.865362], [-0.041648, -0.977402, -0.207244]] and translation vector: [2.446714, 1.689918, 1.489633]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_42_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and 
depth images with the corresponding camera pose, i.e., rotation matrix: [[0.993805, -0.057016, 0.095394], [-0.110597, -0.423109, 0.899304], [-0.010913, -0.904283, -0.426794]] and translation vector: [3.282054, 2.568905, 1.512321], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.993106, -0.061381, 0.099861], [-0.116562, -0.427194, 0.896615], [-0.012375, -0.902074, -0.431404]] and translation vector: [3.283498, 2.568158, 1.509645], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.991697, -0.07473, 0.104657], [-0.127453, -0.462749, 0.877279], [-0.017129, -0.883334, -0.468431]] and translation vector: [3.294037, 2.566846, 1.501968]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_43_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: 
The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.672393, -0.274439, 0.687438], [-0.739855, -0.221079, 0.635404], [-0.022402, -0.935846, -0.351697]] and translation vector: [3.802358, 2.110255, 1.494557], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.672432, -0.275262, 0.687071], [-0.739825, -0.222066, 0.635095], [-0.022242, -0.93537, -0.35297]] and translation vector: [3.806542, 2.108163, 1.497405], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.662943, -0.279413, 0.694575], [-0.748414, -0.223073, 0.624593], [-0.019579, -0.933899, -0.357001]] and translation vector: [3.809607, 2.112622, 1.492454]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_44_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.187285, -0.627824, 0.755488], [-0.982305, 0.118515, -0.145025], [0.001514, -0.76928, -0.63891]] and translation vector: [1.001752, 1.17634, 1.437838], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.187139, -0.630563, 0.75324], [-0.982328, 0.117514, -0.14568], [0.003345, -0.767191, -0.64141]] and translation vector: [1.00191, 1.178201, 1.437088], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.182531, -0.636948, 0.748986], [-0.983189, 0.114531, -0.142208], [0.004797, -0.762352, -0.647145]] and translation vector: [1.004145, 1.176443, 1.437678]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_45_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.977181, 0.077241, -0.197866], [0.211774, -0.426158, 0.879512], [-0.016388, -0.901345, -0.432791]] and translation vector: [0.977323, 0.877303, 1.40232], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.979446, 0.063797, -0.19135], [0.200663, -0.404476, 0.892263], [-0.020472, -0.912321, -0.408965]] and translation vector: [0.961423, 0.875672, 1.418643], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.983838, 0.068482, -0.165447], [0.178902, -0.337078, 
0.924323], [0.007531, -0.938983, -0.343882]] and translation vector: [0.935081, 0.882589, 1.453845]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_46_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.205964, -0.505778, 0.837716], [-0.978495, 0.11627, -0.170378], [-0.011228, -0.854792, -0.518849]] and translation vector: [2.901534, 4.292832, 1.280844], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.204012, -0.504726, 0.838827], [-0.978841, 0.118998, -0.166463], [-0.0158, -0.855039, -0.518324]] and translation vector: [2.909629, 4.290413, 1.285823], please estimate the RGB image 
for the query camera pose, i.e., rotation matrix: [[-0.169049, -0.47943, 0.861144], [-0.985403, 0.100042, -0.137744], [-0.020112, -0.871859, -0.489344]] and translation vector: [2.918062, 4.255744, 1.296137]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_47_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.810147, -0.229725, 0.539341], [-0.586224, 0.314131, -0.746769], [0.002128, -0.921167, -0.389162]] and translation vector: [3.108561, 2.950706, 1.466118], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.798041, -0.241673, 0.552019], [-0.602539, 0.306626, -0.736836], [0.00881, 
-0.920638, -0.390318]] and translation vector: [3.094201, 2.939754, 1.46817], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.730942, -0.298846, 0.613526], [-0.681648, 0.276413, -0.677461], [0.03287, -0.913393, -0.40575]] and translation vector: [3.008661, 2.892656, 1.463078]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_48_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.983299, 0.047874, -0.175588], [0.180439, -0.382417, 0.9062], [-0.023764, -0.922749, -0.384668]] and translation vector: [2.208684, 3.483128, 1.468268], and another pair of RGB and depth images with the corresponding camera pose, 
i.e., rotation matrix: [[0.982577, 0.045136, -0.18029], [0.183889, -0.376806, 0.907856], [-0.026957, -0.925192, -0.378541]] and translation vector: [2.211137, 3.481059, 1.465482], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.983986, 0.057121, -0.168843], [0.177826, -0.379389, 0.907988], [-0.012192, -0.923472, -0.383472]] and translation vector: [2.214237, 3.490379, 1.461581]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_49_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.053762, 0.423971, -0.904079], [0.99709, -0.071809, 0.025618], [-0.05406, -0.902825, -0.426597]] and translation vector: 
[3.696534, 7.381392, 1.65485], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.059051, 0.424044, -0.903714], [0.996629, -0.076693, 0.029136], [-0.056954, -0.902388, -0.427143]] and translation vector: [3.693501, 7.384472, 1.654036], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.076295, 0.430516, -0.899353], [0.995602, -0.082082, 0.045168], [-0.054375, -0.898843, -0.434884]] and translation vector: [3.686877, 7.38459, 1.650219]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_50_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.079656, -0.319192, 
0.944337], [-0.994012, 0.096527, -0.051219], [-0.074805, -0.942762, -0.324969]] and translation vector: [4.3352, 2.935251, 1.464921], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.08136, -0.319768, 0.943996], [-0.993796, 0.098086, -0.052427], [-0.075828, -0.942405, -0.325765]] and translation vector: [4.335558, 2.933583, 1.460394], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.082648, -0.359045, 0.929654], [-0.993327, 0.104973, -0.047767], [-0.080438, -0.927398, -0.365325]] and translation vector: [4.342546, 2.934833, 1.439448]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_51_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair 
of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.299058, 0.37418, -0.877812], [0.95368, -0.085842, 0.288314], [0.032528, -0.923375, -0.38252]] and translation vector: [3.908031, 4.993837, 1.41318], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.301871, 0.365699, -0.880419], [0.952911, -0.087746, 0.290279], [0.028901, -0.926588, -0.374966]] and translation vector: [3.903484, 4.991583, 1.422828], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.301255, 0.344295, -0.889217], [0.952977, -0.076566, 0.293211], [0.032867, -0.935734, -0.351171]] and translation vector: [3.913385, 4.973511, 1.425571]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_52_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: 
The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.769532, -0.429513, 0.472588], [-0.615738, -0.302759, 0.727464], [-0.169375, -0.850797, -0.49745]] and translation vector: [2.184386, 2.253813, 1.283805], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.76638, -0.428136, 0.478917], [-0.620171, -0.298738, 0.725357], [-0.167481, -0.85291, -0.494464]] and translation vector: [2.185226, 2.257666, 1.286817], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.752434, -0.422477, 0.505328], [-0.641308, -0.294924, 0.708339], [-0.150223, -0.857049, -0.492848]] and translation vector: [2.203988, 2.240772, 1.285116]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_53_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.255196, -0.436856, 0.862573], [-0.966393, 0.143834, -0.213066], [-0.030988, -0.887958, -0.45888]] and translation vector: [1.734999, 0.744851, 1.432124], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.254375, -0.435236, 0.863634], [-0.966628, 0.142475, -0.21291], [-0.03038, -0.888972, -0.456953]] and translation vector: [1.735377, 0.747301, 1.433656], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.252592, -0.430397, 0.866577], [-0.967061, 0.14143, -0.211638], [-0.031471, -0.891491, -0.451944]] and translation vector: [1.738514, 0.752667, 1.434948]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_54_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.721847, -0.019511, -0.691778], [0.690918, -0.036893, 0.721991], [-0.039608, -0.999129, -0.013151]] and translation vector: [1.871862, 0.815296, 1.594356], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.723033, -0.022358, -0.690452], [0.689637, -0.034974, 0.723311], [-0.04032, -0.999138, -0.009869]] and translation vector: [1.872181, 0.815734, 1.596287], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.722407, -0.014829, -0.691309], [0.690381, 
-0.040572, 0.722307], [-0.038759, -0.999067, -0.019072]] and translation vector: [1.866769, 0.812653, 1.587453]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_55_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.15851, 0.420096, -0.893529], [0.981106, -0.034663, -0.190342], [-0.110934, -0.906817, -0.406664]] and translation vector: [4.004256, 0.910349, 2.578562], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.153085, 0.419732, -0.894645], [0.982322, -0.034068, -0.184071], [-0.107739, -0.907009, -0.407097]] and translation vector: [4.005316, 0.908549, 2.574668], please estimate 
the RGB image for the query camera pose, i.e., rotation matrix: [[-0.128813, 0.432758, -0.89226], [0.986418, -0.036555, -0.160137], [-0.101917, -0.900769, -0.422171]] and translation vector: [4.005799, 0.894308, 2.560097]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_56_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.974605, -0.106498, 0.196986], [-0.223762, -0.428932, 0.875185], [-0.008712, -0.897037, -0.44187]] and translation vector: [2.006689, 0.552817, 1.711334], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.976991, -0.101609, 0.187523], [-0.213093, -0.42809, 0.878254], 
[-0.008962, -0.898006, -0.439892]] and translation vector: [2.014877, 0.551422, 1.700123], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.983342, -0.080889, 0.162776], [-0.181747, -0.450774, 0.87394], [0.002683, -0.888966, -0.457967]] and translation vector: [1.906067, 0.734394, 1.70234]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_57_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.677945, 0.409221, -0.610679], [0.735109, 0.38004, -0.561413], [0.00234, -0.829523, -0.558468]] and translation vector: [3.092599, 2.044437, 1.437429], and another pair of RGB and depth images with the corresponding 
camera pose, i.e., rotation matrix: [[-0.678782, 0.408186, -0.610442], [0.734335, 0.380383, -0.562193], [0.002723, -0.829875, -0.557943]] and translation vector: [3.0892, 2.043949, 1.440375], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.676872, 0.407734, -0.61286], [0.736083, 0.380637, -0.559729], [0.005057, -0.829981, -0.557769]] and translation vector: [3.08962, 2.045413, 1.436176]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_58_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.207705, 0.494542, -0.843971], [0.97739, -0.069996, 0.199524], [0.039599, -0.866331, -0.497898]] and translation 
vector: [4.53083, 2.291093, 1.52739], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.209269, 0.494574, -0.843566], [0.977066, -0.071037, 0.200739], [0.039356, -0.866228, -0.498097]] and translation vector: [4.529976, 2.291335, 1.526507], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.196766, 0.49564, -0.845946], [0.979799, -0.067948, 0.18809], [0.035744, -0.865866, -0.498997]] and translation vector: [4.530453, 2.296434, 1.524226]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_59_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.956223, 
-0.170898, 0.237554], [-0.292595, -0.544035, 0.786393], [-0.005155, -0.821474, -0.570223]] and translation vector: [1.275326, 2.834272, 1.3185], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.956815, -0.170774, 0.235249], [-0.290631, -0.544392, 0.786875], [-0.00631, -0.821263, -0.570514]] and translation vector: [1.276568, 2.833979, 1.318089], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.956011, -0.167954, 0.240486], [-0.293328, -0.545359, 0.785202], [-0.000727, -0.821203, -0.570635]] and translation vector: [1.277841, 2.834386, 1.31762]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_60_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given 
a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.928108, -0.125197, 0.35063], [-0.371823, 0.3599, -0.855699], [-0.019061, -0.924553, -0.380577]] and translation vector: [5.296664, 4.137775, 1.856988], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.930637, -0.119308, 0.34595], [-0.365378, 0.355543, -0.860284], [-0.020361, -0.927014, -0.374474]] and translation vector: [5.29653, 4.126579, 1.856014], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.952426, -0.118849, 0.280641], [-0.304767, 0.367704, -0.878584], [0.001226, -0.922317, -0.386432]] and translation vector: [5.320154, 4.099401, 1.857875]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_61_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.86482, -0.183466, 0.467362], [-0.501092, -0.256948, 0.826368], [-0.031523, -0.948851, -0.314147]] and translation vector: [3.012278, 2.022242, 1.442339], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.863867, -0.189194, 0.466839], [-0.502557, -0.260784, 0.824274], [-0.034203, -0.946677, -0.320364]] and translation vector: [3.015002, 2.018446, 1.436262], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.859994, -0.189108, 0.473971], [-0.509792, -0.276775, 0.81456], [-0.022856, -0.942143, -0.33443]] and translation vector: [3.018664, 2.017763, 1.427395]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_62_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.951558, 0.16536, -0.259218], [0.307283, -0.481983, 0.820531], [0.010744, -0.860436, -0.509446]] and translation vector: [2.919862, 3.428013, 1.521081], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.951326, 0.167996, -0.258374], [0.307875, -0.4803, 0.821295], [0.013877, -0.860866, -0.508643]] and translation vector: [2.920042, 3.428186, 1.518811], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.948369, 0.180855, -0.260555], [0.316485, -0.485614, 
0.814872], [0.020845, -0.85526, -0.517779]] and translation vector: [2.906806, 3.429147, 1.512746]. The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_63_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.802837, 0.056561, -0.593509], [0.596192, 0.071654, -0.799638], [-0.002701, -0.995825, -0.091248]] and translation vector: [2.583219, 4.008804, 1.439254], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.802466, 0.056012, -0.594063], [0.59669, 0.070227, -0.799393], [-0.003056, -0.995957, -0.089777]] and translation vector: [2.583684, 4.008714, 1.434935], please estimate the RGB image for 
the query camera pose, i.e., rotation matrix: [[-0.802651, 0.061565, -0.593263], [0.596422, 0.0734, -0.799308], [-0.005664, -0.995401, -0.095633]] and translation vector: [2.580812, 4.010173, 1.435745]. The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_64_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.355681, -0.20797, 0.911175], [-0.934036, 0.113197, -0.338769], [-0.032689, -0.971563, -0.234514]] and translation vector: [0.539195, 4.841905, 1.636959], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.354881, -0.205091, 0.912139], [-0.934375, 0.110848, -0.338608], [-0.031664, 
-0.972446, -0.230969]] and translation vector: [0.533365, 4.84225, 1.627512], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.357394, -0.22244, 0.907078], [-0.933778, 0.10396, -0.34242], [-0.018132, -0.969388, -0.244864]] and translation vector: [0.528036, 4.836335, 1.624936]. The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_65_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.566304, -0.590941, 0.574533], [-0.823945, 0.423135, -0.376925], [-0.020365, -0.686838, -0.726526]] and translation vector: [2.143516, 1.760119, 1.343188], and another pair of RGB and depth images with the corresponding camera 
pose, i.e., rotation matrix: [[-0.561614, -0.596242, 0.57366], [-0.827171, 0.420904, -0.372329], [-0.019457, -0.683619, -0.729579]] and translation vector: [2.147258, 1.761594, 1.344016], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.547252, -0.609389, 0.573725], [-0.836861, 0.409368, -0.363431], [-0.013394, -0.679017, -0.734001]] and translation vector: [2.154856, 1.762344, 1.343807]. The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_66_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.848489, -0.131122, 0.512712], [-0.527579, 0.133483, -0.838954], [0.041567, -0.982339, -0.182436]] and translation 
vector: [2.702568, 1.718074, 1.602473], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.851363, -0.128939, 0.508484], [-0.523333, 0.142037, -0.840207], [0.036112, -0.981428, -0.188403]] and translation vector: [2.706553, 1.721294, 1.602035], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.862925, -0.138489, 0.485985], [-0.504369, 0.176659, -0.845224], [0.031201, -0.974481, -0.222293]] and translation vector: [2.716626, 1.723908, 1.586826]. The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_67_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.205292, 
0.226186, -0.952205], [0.97316, -0.150555, 0.174048], [-0.103992, -0.962379, -0.251024]] and translation vector: [4.876985, 2.837537, 1.671042], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.210488, 0.22021, -0.952472], [0.971775, -0.153305, 0.17931], [-0.106533, -0.96333, -0.246263]] and translation vector: [4.87733, 2.840179, 1.675237], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.247756, 0.187443, -0.950517], [0.962582, -0.158806, 0.219585], [-0.109788, -0.969353, -0.219774]] and translation vector: [4.877867, 2.827038, 1.675608]. The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_68_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a 
pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.52463, -0.231347, 0.819293], [-0.850589, 0.102279, -0.515789], [0.03553, -0.96748, -0.25044]] and translation vector: [5.897326, 2.792535, 1.553822], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.52763, -0.228151, 0.818263], [-0.84888, 0.105585, -0.517933], [0.03177, -0.967884, -0.249382]] and translation vector: [5.897463, 2.790525, 1.551499], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.541576, -0.222735, 0.810608], [-0.840076, 0.107703, -0.53167], [0.031116, -0.968911, -0.245444]] and translation vector: [5.894893, 2.788883, 1.558074]. The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_69_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.988959, -0.006087, -0.148062], [0.148117, 0.009943, 0.98892], [-0.004548, -0.999932, 0.010735]] and translation vector: [3.911582, 2.672538, 1.565046], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.987297, -0.007995, -0.158684], [0.158774, 0.012251, 0.987239], [-0.005949, -0.999893, 0.013365]] and translation vector: [3.955948, 2.679338, 1.574419], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.992697, -0.03521, -0.115384], [0.116446, 0.029785, 0.99275], [-0.031518, -0.998936, 0.033668]] and translation vector: [3.907376, 2.643518, 1.623414]. 
The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_70_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.999494, 0.005595, 0.031322], [-0.029883, 0.172936, -0.98448], [-0.010925, -0.984917, -0.172681]] and translation vector: [6.687301, 5.436423, 1.742894], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.999393, 0.00615, 0.034285], [-0.032681, 0.175053, -0.984017], [-0.012053, -0.98454, -0.174746]] and translation vector: [6.681215, 5.427393, 1.75699], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.999512, 0.015203, 0.027277], [-0.02448, 0.160854, 
-0.986675], [-0.019388, -0.986861, -0.160403]] and translation vector: [6.678608, 5.424335, 1.758175]. The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_71_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.991592, 0.052224, -0.118397], [0.1292, -0.348306, 0.928435], [0.007248, -0.935925, -0.352124]] and translation vector: [2.177373, 2.142725, 1.46728], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.992093, 0.047571, -0.11614], [0.125441, -0.346386, 0.929667], [0.003996, -0.936885, -0.349615]] and translation vector: [2.181058, 2.142908, 1.465582], please estimate the RGB image for the 
query camera pose, i.e., rotation matrix: [[0.99009, 0.041581, -0.13414], [0.14016, -0.352521, 0.925248], [-0.008815, -0.93488, -0.354856]] and translation vector: [2.196626, 2.148474, 1.466161]. The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_72_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.132001, -0.567775, 0.812532], [-0.991224, 0.069667, -0.112349], [0.007182, -0.820231, -0.571988]] and translation vector: [2.407685, 4.450429, 1.359714], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.130918, -0.563466, 0.8157], [-0.991376, 0.069526, -0.111087], [0.005882, -0.823209, 
-0.567709]] and translation vector: [2.40989, 4.444678, 1.359228], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.104614, -0.562754, 0.819978], [-0.994308, 0.042438, -0.097729], [0.020199, -0.825534, -0.563991]] and translation vector: [2.433079, 4.433616, 1.362504]. The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_73_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.877021, 0.121711, -0.464779], [0.46491, 0.459041, -0.75706], [0.12121, -0.880038, -0.459173]] and translation vector: [3.922419, 3.230202, 1.747047], and another pair of RGB and depth images with the corresponding camera pose, i.e., 
rotation matrix: [[-0.876473, 0.11975, -0.466322], [0.465798, 0.455895, -0.758415], [0.121773, -0.881941, -0.455359]] and translation vector: [3.923546, 3.227255, 1.740959], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.862892, 0.148989, -0.482928], [0.494148, 0.449135, -0.744376], [0.105996, -0.880954, -0.461178]] and translation vector: [3.903725, 3.133858, 1.745573]. The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_74_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.515401, -0.339121, 0.786994], [-0.847541, -0.337435, 0.40965], [0.126638, -0.878143, -0.461333]] and translation vector: [4.776819, 
1.138867, 1.280463], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.495978, -0.33911, 0.799381], [-0.859276, -0.324304, 0.395565], [0.125103, -0.88308, -0.452237]] and translation vector: [4.773187, 1.14016, 1.284317], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.481026, -0.30789, 0.820864], [-0.867671, -0.301264, 0.395457], [0.125539, -0.902465, -0.412064]] and translation vector: [4.757284, 1.147171, 1.295988]. The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_75_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.623567, 0.536294, -0.568817], 
[0.781209, -0.455034, 0.427384], [-0.029628, -0.710867, -0.702702]] and translation vector: [1.790477, 1.816361, 1.229059], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.636074, 0.528408, -0.562313], [0.771074, -0.462894, 0.437235], [-0.029252, -0.711698, -0.701876]] and translation vector: [1.794875, 1.819226, 1.230937], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.674924, 0.4822, -0.558534], [0.737532, -0.464309, 0.49037], [-0.022876, -0.7429, -0.669012]] and translation vector: [1.813084, 1.825686, 1.243736]. The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_76_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth 
images with the corresponding camera pose, i.e., rotation matrix: [[0.999847, -0.004634, 0.01689], [-0.017397, -0.374134, 0.927211], [0.002023, -0.927363, -0.374157]] and translation vector: [3.310194, 3.16458, 1.506432], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.999774, -0.010896, 0.018284], [-0.021018, -0.369724, 0.928904], [-0.003361, -0.929078, -0.369869]] and translation vector: [3.316631, 3.168954, 1.519748], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.999711, -0.01062, 0.02156], [-0.023945, -0.363153, 0.931422], [-0.002062, -0.931669, -0.363302]] and translation vector: [3.313389, 3.184942, 1.522696]. The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_77_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th 
image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.573165, 0.475287, -0.667521], [0.819422, -0.337921, 0.462988], [-0.005517, -0.81235, -0.583144]] and translation vector: [4.230747, 1.597944, 1.425469], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.580595, 0.472456, -0.663095], [0.814187, -0.339873, 0.470729], [-0.002969, -0.813186, -0.581996]] and translation vector: [4.228813, 1.597838, 1.42741], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.590926, 0.466068, -0.658474], [0.806725, -0.340048, 0.483283], [0.00133, -0.816791, -0.576932]] and translation vector: [4.230728, 1.601094, 1.427952]. The provided camera poses represent the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_78_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.246516, -0.470365, 0.847341], [-0.959136, 0.006886, 0.282862], [-0.138884, -0.882445, -0.449446]] and translation vector: [3.043058, 2.955299, 1.551102], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.243276, -0.470143, 0.8484], [-0.960213, 0.006937, 0.279182], [-0.13714, -0.882563, -0.44975]] and translation vector: [3.042024, 2.954946, 1.550413], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.220837, -0.468715, 0.855299], [-0.967151, 0.007957, 0.254077], [-0.125896, -0.883313, -0.451561]] and translation vector: [3.035462, 2.949861, 1.549809]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_79_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.998134, -0.025826, -0.055325], [0.04389, 0.326427, -0.944203], [0.042444, -0.94487, -0.324684]] and translation vector: [2.355182, 2.984659, 1.395898], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.998605, -0.022906, -0.047579], [0.037628, 0.323493, -0.945482], [0.037048, -0.945953, -0.32218]] and translation vector: [2.345251, 2.98743, 1.391141], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.998425, -0.028903, -0.048087], [0.035665, 0.334665, 
-0.941662], [0.04331, -0.941894, -0.333107]] and translation vector: [2.317253, 2.991597, 1.388493]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_80_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.176261, -0.039155, 0.983564], [-0.983722, -0.028492, -0.177423], [0.03497, -0.998827, -0.033496]] and translation vector: [3.054739, 2.437738, 1.503838], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.18153, -0.048874, 0.98217], [-0.982778, -0.026092, -0.182941], [0.034567, -0.998464, -0.043296]] and translation vector: [3.061021, 2.450195, 1.498681], please estimate the RGB image 
for the query camera pose, i.e., rotation matrix: [[-0.163045, -0.034334, 0.986021], [-0.986093, -0.02694, -0.163995], [0.032194, -0.999047, -0.029464]] and translation vector: [3.066704, 2.437577, 1.507359]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_81_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.853196, -0.330732, 0.403328], [-0.517406, -0.438892, 0.734619], [-0.065945, -0.835458, -0.545584]] and translation vector: [2.734716, 6.775187, 1.412962], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.853022, -0.336855, 0.398601], [-0.516617, -0.436898, 0.736361], [-0.0739, 
-0.834056, -0.546708]] and translation vector: [2.728871, 6.767794, 1.411126], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.851443, -0.340578, 0.398812], [-0.519517, -0.44372, 0.730216], [-0.071735, -0.828927, -0.554738]] and translation vector: [2.722152, 6.743406, 1.39829]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_82_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.954506, 0.05554, -0.292973], [0.288831, -0.41644, 0.862064], [-0.074127, -0.907465, -0.413536]] and translation vector: [2.66447, 1.005586, 1.476015], and another pair of RGB and depth images with the corresponding camera pose, 
i.e., rotation matrix: [[0.956668, 0.052296, -0.286448], [0.280824, -0.425753, 0.860158], [-0.076973, -0.903327, -0.42199]] and translation vector: [2.657996, 1.004761, 1.470821], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.966986, 0.054866, -0.248854], [0.248498, -0.419376, 0.873139], [-0.056458, -0.906153, -0.419165]] and translation vector: [2.617702, 1.004602, 1.502791]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_83_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.804945, -0.278842, 0.523748], [-0.593014, 0.407765, -0.694307], [-0.019964, -0.869468, -0.493585]] and translation vector: 
[4.871809, 2.494869, 1.402737], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.804444, -0.274614, 0.526742], [-0.593612, 0.404842, -0.695506], [-0.022252, -0.872176, -0.488687]] and translation vector: [4.863627, 2.491699, 1.400121], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.82218, -0.26485, 0.503859], [-0.568804, 0.416386, -0.709285], [-0.021946, -0.869757, -0.492992]] and translation vector: [4.864128, 2.487759, 1.4037]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_84_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.330673, -0.328207, 
0.884837], [-0.942686, -0.070458, 0.326157], [-0.044703, -0.941975, -0.332694]] and translation vector: [3.753276, 4.481459, 1.345242], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.306694, -0.326667, 0.893995], [-0.950878, -0.063631, 0.302957], [-0.04208, -0.942995, -0.330136]] and translation vector: [3.754864, 4.497246, 1.34429], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.246991, -0.34493, 0.905549], [-0.96808, -0.046739, 0.246244], [-0.042613, -0.937464, -0.345464]] and translation vector: [3.754345, 4.564482, 1.352383]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_85_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of 
RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.119369, -0.433868, 0.893034], [-0.990549, 0.113242, -0.077387], [-0.067553, -0.893832, -0.443285]] and translation vector: [3.407035, 4.679209, 1.397058], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.120544, -0.432859, 0.893366], [-0.990306, 0.115004, -0.077902], [-0.06902, -0.894096, -0.442526]] and translation vector: [3.401289, 4.681283, 1.397495], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.162977, -0.454909, 0.875498], [-0.983038, 0.15052, -0.104785], [-0.084112, -0.877725, -0.471725]] and translation vector: [3.342063, 4.674428, 1.399173]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_86_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.767458, -0.265442, 0.583565], [-0.640543, 0.35536, -0.680752], [-0.026676, -0.896248, -0.442751]] and translation vector: [3.343537, 3.697402, 1.375352], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.780866, -0.263741, 0.566294], [-0.624403, 0.357431, -0.694525], [-0.019236, -0.895926, -0.443786]] and translation vector: [3.344022, 3.709659, 1.376654], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.822146, -0.253316, 0.509811], [-0.569276, 0.364542, -0.736908], [0.000823, -0.896069, -0.443913]] and translation vector: [3.329204, 3.745763, 1.383552]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_87_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.612656, -0.411508, 0.674769], [-0.789543, 0.280105, -0.546043], [0.035694, -0.867296, -0.496511]] and translation vector: [1.897828, 2.372103, 1.388776], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.615876, -0.406578, 0.674826], [-0.787242, 0.284147, -0.547275], [0.03076, -0.868305, -0.495075]] and translation vector: [1.892345, 2.36762, 1.390764], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.607068, -0.416916, 0.676498], [-0.79419, 
0.289362, -0.534352], [0.027027, -0.861656, -0.506773]] and translation vector: [1.87873, 2.3614, 1.391886]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_88_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.221984, 0.421429, -0.879273], [0.97466, 0.121427, -0.187867], [0.027595, -0.898695, -0.437705]] and translation vector: [3.155292, 0.483793, 1.35371], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.224547, 0.416482, -0.880978], [0.973822, 0.128715, -0.187361], [0.035363, -0.899986, -0.434482]] and translation vector: [3.157119, 0.483672, 1.354178], please estimate the RGB 
image for the query camera pose, i.e., rotation matrix: [[-0.215665, 0.423756, -0.879727], [0.975658, 0.130183, -0.176474], [0.039743, -0.896373, -0.441517]] and translation vector: [3.155366, 0.486351, 1.353433]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_89_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.955421, 0.119616, -0.269932], [0.295248, 0.388339, -0.872939], [0.000408, -0.91372, -0.406343]] and translation vector: [2.65583, 2.981598, 1.368648], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.951595, 0.120375, -0.282803], [0.307283, 0.392547, -0.866882], [0.006663, 
-0.91182, -0.410535]] and translation vector: [2.655525, 2.981353, 1.361859], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.943467, 0.154725, -0.293138], [0.331247, 0.407989, -0.850776], [-0.01204, -0.89978, -0.436177]] and translation vector: [2.636264, 2.98502, 1.345518]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_90_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.908726, 0.150598, -0.389277], [0.406624, 0.108936, -0.907078], [-0.094198, -0.982575, -0.16023]] and translation vector: [8.822721, 3.830595, 1.476402], and another pair of RGB and depth images with the corresponding camera pose, 
i.e., rotation matrix: [[-0.908663, 0.151907, -0.388916], [0.40641, 0.108245, -0.907256], [-0.09572, -0.98245, -0.160095]] and translation vector: [8.818814, 3.832555, 1.475788], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.906287, 0.145588, -0.396797], [0.413103, 0.106574, -0.904427], [-0.089385, -0.983589, -0.156729]] and translation vector: [8.811844, 3.835278, 1.478992]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_91_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.895509, 0.17248, -0.410263], [0.444823, 0.375965, -0.812886], [0.014038, -0.91044, -0.413402]] and translation vector: 
[2.818061, 5.409916, 1.54775], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.895274, 0.172164, -0.410907], [0.445264, 0.376844, -0.812237], [0.01501, -0.910136, -0.414037]] and translation vector: [2.819061, 5.407142, 1.548651], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.894314, 0.169155, -0.414233], [0.446992, 0.379174, -0.810201], [0.020016, -0.909733, -0.414712]] and translation vector: [2.82614, 5.405447, 1.545731]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_92_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.852441, 0.228219, 
-0.470383], [0.522431, 0.337001, -0.78326], [-0.020235, -0.913426, -0.406502]] and translation vector: [1.798405, 5.320803, 1.619482], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.850776, 0.231102, -0.471988], [0.52508, 0.336676, -0.781627], [-0.021728, -0.91282, -0.407783]] and translation vector: [1.793927, 5.32593, 1.618758], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.843319, 0.217806, -0.491298], [0.537393, 0.333805, -0.774456], [-0.004683, -0.917134, -0.398552]] and translation vector: [1.789976, 5.331068, 1.629155]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_93_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of 
RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.443363, -0.325026, 0.835337], [-0.895367, 0.117125, -0.429651], [0.041809, -0.938424, -0.342946]] and translation vector: [2.190343, 3.392878, 1.594635], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.439336, -0.32163, 0.838772], [-0.897253, 0.111545, -0.427195], [0.043838, -0.940272, -0.337589]] and translation vector: [2.183471, 3.393708, 1.586874], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.44052, -0.339041, 0.83126], [-0.896776, 0.123224, -0.424981], [0.041655, -0.932667, -0.358326]] and translation vector: [2.168168, 3.37614, 1.57519]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_94_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": 
"A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.236277, -0.452541, 0.859872], [-0.970097, 0.160455, -0.182119], [-0.055554, -0.877189, -0.47692]] and translation vector: [1.575898, 1.961144, 1.314442], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.238966, -0.451212, 0.859828], [-0.9694, 0.162109, -0.184349], [-0.056205, -0.87757, -0.476143]] and translation vector: [1.575219, 1.960128, 1.313122], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.271686, -0.463311, 0.843522], [-0.960992, 0.177771, -0.211879], [-0.051788, -0.868182, -0.493536]] and translation vector: [1.583445, 1.96149, 1.313418]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_95_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.931668, 0.072515, -0.356001], [0.362912, -0.231685, 0.902561], [-0.017031, -0.970084, -0.24217]] and translation vector: [5.886859, 3.543659, 1.354971], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.931979, 0.073028, -0.355079], [0.362119, -0.233112, 0.902513], [-0.016864, -0.969704, -0.2437]] and translation vector: [5.882501, 3.543666, 1.354317], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.932369, 0.086637, -0.350973], [0.36142, -0.244825, 0.899687], [-0.007981, -0.965689, -0.259579]] and translation vector: [5.853946, 3.560033, 1.352092]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_96_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.688084, 0.423256, -0.589401], [0.725514, -0.415863, 0.54835], [-0.013017, -0.80493, -0.593227]] and translation vector: [3.968163, 0.8771, 1.421607], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.688048, 0.420794, -0.591205], [0.725576, -0.411726, 0.551381], [-0.011397, -0.80834, -0.588605]] and translation vector: [3.964529, 0.870938, 1.417962], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.665465, 0.44657, -0.598107], [0.746417, -0.402654, 
0.529841], [-0.004219, -0.799027, -0.60128]] and translation vector: [3.954065, 0.866652, 1.420457]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_97_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.45377, -0.425062, 0.783208], [-0.891046, 0.227634, -0.392708], [-0.01136, -0.876074, -0.482043]] and translation vector: [2.25004, 3.862298, 1.519108], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.453547, -0.422981, 0.784463], [-0.891155, 0.226808, -0.392938], [-0.011717, -0.877294, -0.47981]] and translation vector: [2.249275, 3.861866, 1.519019], please estimate the RGB image for 
the query camera pose, i.e., rotation matrix: [[-0.445149, -0.42745, 0.786847], [-0.895457, 0.212955, -0.390907], [-0.00047, -0.878599, -0.47756]] and translation vector: [2.244179, 3.86012, 1.517719]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_98_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.778266, 0.076502, -0.623257], [0.626532, 0.028295, -0.778882], [-0.041951, -0.996668, -0.069952]] and translation vector: [4.354075, 2.27787, 1.510689], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.774603, 0.078895, -0.627508], [0.631084, 0.031306, -0.775082], [-0.041505, -0.996391, 
-0.074039]] and translation vector: [4.353431, 2.276987, 1.507071], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.765589, 0.09814, -0.635801], [0.642341, 0.061836, -0.76392], [-0.035656, -0.99325, -0.110381]] and translation vector: [4.348542, 2.268086, 1.503072]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_99_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.997112, 0.02462, 0.071841], [-0.04661, 0.548461, -0.834876], [-0.059957, -0.835814, -0.545729]] and translation vector: [4.834615, 3.436689, 1.398379], and another pair of RGB and depth images with the corresponding camera pose, i.e., 
rotation matrix: [[-0.998397, 0.025746, 0.050402], [-0.028149, 0.546702, -0.836854], [-0.0491, -0.836932, -0.545101]] and translation vector: [4.839047, 3.434593, 1.400064], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.999077, -0.037699, 0.020609], [-0.036788, 0.502836, -0.863599], [0.022194, -0.863559, -0.503759]] and translation vector: [4.856574, 3.440762, 1.395837]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_100_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.924593, 0.219455, -0.311397], [0.371095, 0.334047, -0.86643], [-0.086121, -0.916653, -0.390296]] and translation vector: 
[7.650298, 2.745242, 1.444521], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.925403, 0.221817, -0.30729], [0.368562, 0.337876, -0.866026], [-0.088274, -0.914679, -0.394425]] and translation vector: [7.650829, 2.747432, 1.442508], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.931334, 0.218695, -0.291187], [0.355288, 0.37018, -0.858334], [-0.079922, -0.902851, -0.422461]] and translation vector: [7.652313, 2.75096, 1.431448]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_101_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.927869, 
-0.125596, 0.351119], [-0.372891, -0.32108, 0.870551], [0.003399, -0.938687, -0.344754]] and translation vector: [5.442723, 4.031985, 1.348893], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.928984, -0.124208, 0.348657], [-0.370086, -0.32475, 0.870387], [0.005117, -0.937609, -0.347654]] and translation vector: [5.438782, 4.038163, 1.363364], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.930142, -0.10574, 0.351647], [-0.366759, -0.314483, 0.87555], [0.018006, -0.943355, -0.331295]] and translation vector: [5.443505, 4.02862, 1.369591]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_102_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": 
"Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.32152, -0.4706, 0.821681], [-0.946681, 0.178549, -0.268172], [-0.020508, -0.864092, -0.502915]] and translation vector: [2.120097, 2.367636, 1.494245], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.324752, -0.471365, 0.819971], [-0.945715, 0.173395, -0.274877], [-0.012612, -0.864725, -0.502087]] and translation vector: [2.101204, 2.346659, 1.492081], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.35351, -0.420371, 0.835655], [-0.935423, 0.155099, -0.317693], [0.00394, -0.893998, -0.448054]] and translation vector: [2.068189, 2.338444, 1.524964]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_103_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.504428, 0.479717, -0.717931], [0.860003, -0.204862, 0.467362], [0.077124, -0.853173, -0.515896]] and translation vector: [4.973708, 0.412451, 1.573636], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.50991, 0.478461, -0.714889], [0.856537, -0.205494, 0.47341], [0.079603, -0.853725, -0.514603]] and translation vector: [4.974949, 0.42052, 1.588198], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.529349, 0.450337, -0.719018], [0.846093, -0.217693, 0.486556], [0.062589, -0.865914, -0.496262]] and translation vector: [4.987175, 0.423323, 1.59454]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_104_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.070416, -0.411804, 0.908548], [-0.99671, 0.065705, -0.047468], [-0.040148, -0.908901, -0.415075]] and translation vector: [2.214543, 1.806687, 1.391502], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.072195, -0.409813, 0.909308], [-0.996578, 0.066438, -0.049181], [-0.040258, -0.909747, -0.413207]] and translation vector: [2.216063, 1.808517, 1.395188], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.080916, -0.398975, 0.913384], 
[-0.996223, 0.061337, -0.061462], [-0.031503, -0.914908, -0.402432]] and translation vector: [2.214478, 1.812354, 1.396036]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_105_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.349467, 0.022881, -0.936669], [0.936944, -0.011774, 0.349282], [-0.003037, -0.999669, -0.025553]] and translation vector: [3.08553, 2.787215, 1.609269], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.348555, 0.021762, -0.937036], [0.937279, -0.012701, 0.34835], [-0.00432, -0.999682, -0.024824]] and translation vector: [3.086167, 2.787834, 1.610474], 
please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.369988, 0.031522, -0.928502], [0.929035, -0.010749, 0.369835], [0.001677, -0.999445, -0.033262]] and translation vector: [3.084904, 2.78765, 1.611416]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_106_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.986418, -0.051155, 0.156087], [-0.152905, 0.633099, -0.758819], [-0.060001, -0.772379, -0.632322]] and translation vector: [2.055195, 1.600374, 1.268236], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.986809, -0.050817, 0.15371], [-0.151071, 
0.630346, -0.761474], [-0.058194, -0.77465, -0.629707]] and translation vector: [2.054364, 1.600927, 1.26836], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.986971, -0.056701, 0.150577], [-0.152339, 0.630474, -0.761115], [-0.051779, -0.774137, -0.630897]] and translation vector: [2.055561, 1.60142, 1.26922]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_107_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.987126, 0.106622, -0.119219], [0.159938, -0.652529, 0.740693], [0.00118, -0.750225, -0.661181]] and translation vector: [4.64166, 4.052867, 1.404314], and another pair of RGB and depth 
images with the corresponding camera pose, i.e., rotation matrix: [[0.987387, 0.107853, -0.115912], [0.158278, -0.654013, 0.73974], [0.003975, -0.748756, -0.662834]] and translation vector: [4.649776, 4.051806, 1.400746], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.98973, 0.078153, -0.119695], [0.141622, -0.649931, 0.746681], [-0.019438, -0.755964, -0.654324]] and translation vector: [4.654046, 4.058671, 1.412681]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_108_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.994446, -0.078697, 0.06988], [-0.104992, -0.787844, 0.606859], [0.007297, 
-0.610826, -0.791731]] and translation vector: [1.305105, 0.510448, 1.183315], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.994112, -0.083607, 0.068931], [-0.10831, -0.785774, 0.608956], [0.003251, -0.612836, -0.790203]] and translation vector: [1.308194, 0.508844, 1.184721], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.994174, -0.088174, 0.061991], [-0.107635, -0.781912, 0.614026], [-0.00567, -0.617121, -0.786848]] and translation vector: [1.316761, 0.496028, 1.1951]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_109_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera 
pose, i.e., rotation matrix: [[-0.481759, -0.460793, 0.745371], [-0.875469, 0.290199, -0.386444], [-0.038235, -0.838722, -0.543216]] and translation vector: [3.08436, 2.075189, 1.468295], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.482142, -0.463533, 0.743422], [-0.87538, 0.289132, -0.387445], [-0.035354, -0.83758, -0.54517]] and translation vector: [3.085865, 2.079347, 1.468915], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.466183, -0.466331, 0.751804], [-0.884097, 0.276631, -0.376626], [-0.03234, -0.840244, -0.541243]] and translation vector: [3.069418, 2.081707, 1.467716]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_110_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: 
The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.975982, 0.033782, -0.215214], [0.215389, -0.297687, 0.930048], [-0.032648, -0.954066, -0.297814]] and translation vector: [2.838751, 1.414222, 1.664536], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.976127, 0.034525, -0.21444], [0.21483, -0.298963, 0.929769], [-0.03201, -0.95364, -0.299243]] and translation vector: [2.83798, 1.414721, 1.663024], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.977071, 0.035817, -0.209879], [0.210869, -0.299025, 0.930655], [-0.029426, -0.953573, -0.299721]] and translation vector: [2.830656, 1.415531, 1.663803]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_111_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.054781, -0.427281, 0.902458], [-0.998013, -0.051617, 0.036143], [0.031139, -0.902644, -0.429259]] and translation vector: [1.328526, 0.849821, 1.501181], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.086578, -0.407933, 0.908898], [-0.995883, -0.060028, 0.067922], [0.026852, -0.911036, -0.41145]] and translation vector: [1.314662, 0.836147, 1.492068], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.123316, -0.40327, 0.906734], [-0.991749, -0.082348, 0.098253], [0.035045, -0.911368, -0.410097]] and translation vector: [1.307532, 0.816785, 1.49678]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_112_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.140295, 0.625342, -0.767636], [0.990108, -0.090149, 0.107516], [-0.001967, -0.775126, -0.631804]] and translation vector: [3.410891, 3.073526, 1.198756], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.148525, 0.612201, -0.776627], [0.988818, -0.102561, 0.108258], [-0.013376, -0.784022, -0.620589]] and translation vector: [3.421496, 3.097678, 1.206193], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.180299, 0.582031, -0.792926], [0.982291, 
-0.148308, 0.114495], [-0.050958, -0.799528, -0.598463]] and translation vector: [3.423417, 3.182928, 1.218892]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_113_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.408988, -0.323891, 0.853126], [-0.912443, -0.158736, 0.37716], [0.013263, -0.932683, -0.360453]] and translation vector: [3.672612, 2.990265, 1.494339], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.403714, -0.307769, 0.861564], [-0.914697, -0.154884, 0.373283], [0.018558, -0.93877, -0.344045]] and translation vector: [3.67724, 2.998002, 1.501107], please estimate 
the RGB image for the query camera pose, i.e., rotation matrix: [[0.418114, -0.223767, 0.880403], [-0.907864, -0.136047, 0.396578], [0.031035, -0.965101, -0.260033]] and translation vector: [3.686426, 2.992862, 1.516855]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_114_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.804414, -0.195207, 0.561082], [-0.593456, -0.306943, 0.74404], [0.026978, -0.931494, -0.362756]] and translation vector: [4.397897, 1.805397, 1.263968], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.81043, -0.19082, 0.553888], [-0.585149, -0.309439, 
0.749566], [0.028363, -0.931577, -0.362436]] and translation vector: [4.406421, 1.797547, 1.276681], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.835561, -0.15907, 0.525866], [-0.54802, -0.309079, 0.777267], [0.038894, -0.937639, -0.345428]] and translation vector: [4.454782, 1.746297, 1.281162]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_115_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.454685, 0.144673, -0.878824], [0.890085, 0.109034, -0.442562], [0.031795, -0.983454, -0.178347]] and translation vector: [3.311996, 2.119304, 1.59409], and another pair of RGB and depth images with 
the corresponding camera pose, i.e., rotation matrix: [[-0.453171, 0.138778, -0.880555], [0.890847, 0.10604, -0.441756], [0.032068, -0.98463, -0.171684]] and translation vector: [3.314367, 2.120091, 1.591769], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.43605, 0.134523, -0.889811], [0.898328, 0.123911, -0.42149], [0.053558, -0.983133, -0.174877]] and translation vector: [3.332471, 2.052713, 1.580764]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_116_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.51864, -0.44867, 0.727811], [-0.853934, -0.229463, 0.467059], [-0.04255, -0.863738, 
-0.502143]] and translation vector: [1.002297, 1.98866, 1.344191], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.519607, -0.444592, 0.729621], [-0.853432, -0.229314, 0.468049], [-0.040778, -0.865883, -0.498582]] and translation vector: [1.000441, 1.985865, 1.344846], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.525099, -0.430062, 0.734383], [-0.8496, -0.214703, 0.48175], [-0.049508, -0.876898, -0.478121]] and translation vector: [0.994465, 1.977308, 1.35476]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_117_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, 
i.e., rotation matrix: [[0.68967, 0.288211, -0.664297], [0.724122, -0.27239, 0.633602], [0.001663, -0.918008, -0.396559]] and translation vector: [2.530043, 2.005069, 1.437417], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.68921, 0.288518, -0.66464], [0.724561, -0.273014, 0.632831], [0.001127, -0.917726, -0.397212]] and translation vector: [2.5334, 2.008455, 1.44069], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.695343, 0.287777, -0.658546], [0.718659, -0.271639, 0.640111], [0.005323, -0.918366, -0.395696]] and translation vector: [2.535345, 2.010031, 1.440264]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_118_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: 
The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.24604, -0.551346, 0.797171], [-0.968826, -0.115295, 0.219278], [-0.028988, -0.826271, -0.562526]] and translation vector: [1.704247, 2.057158, 1.361636], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.236706, -0.55071, 0.800431], [-0.971342, -0.115817, 0.207564], [-0.021604, -0.826623, -0.562342]] and translation vector: [1.70792, 2.062619, 1.364929], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.170375, -0.545117, 0.820866], [-0.98536, -0.099505, 0.138438], [0.006215, -0.832434, -0.554089]] and translation vector: [1.68849, 2.12587, 1.375528]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_119_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.711391, -0.463973, 0.527875], [-0.700286, 0.531398, -0.476672], [-0.059349, -0.708763, -0.702945]] and translation vector: [2.53321, 4.394931, 1.530427], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.710702, -0.465347, 0.527594], [-0.701175, 0.5294, -0.477586], [-0.057065, -0.709357, -0.702536]] and translation vector: [2.526067, 4.393322, 1.526345], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.710832, -0.469663, 0.523579], [-0.701381, 0.52914, -0.477573], [-0.052748, -0.706702, -0.705542]] and translation vector: [2.532494, 4.391185, 1.524071]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_120_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.506976, -0.449046, 0.735753], [-0.861802, 0.247713, -0.442646], [0.016513, -0.858485, -0.512574]] and translation vector: [1.568574, 4.423309, 1.333385], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.503836, -0.444181, 0.740846], [-0.863753, 0.25025, -0.437385], [0.008882, -0.860278, -0.509748]] and translation vector: [1.576928, 4.418399, 1.331934], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.476896, -0.475938, 0.738954], 
[-0.878865, 0.245876, -0.408828], [0.012886, -0.84441, -0.535543]] and translation vector: [1.618973, 4.377153, 1.328238]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_121_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.857694, 0.203115, -0.472341], [0.513544, 0.293426, -0.806333], [-0.025181, -0.934155, -0.355978]] and translation vector: [3.161674, 3.662206, 1.335287], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.856666, 0.203827, -0.473897], [0.515344, 0.296604, -0.804019], [-0.023321, -0.932995, -0.359132]] and translation vector: [3.164327, 3.659025, 1.330704], 
please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.851543, 0.201203, -0.48414], [0.523447, 0.274112, -0.806762], [-0.029614, -0.940415, -0.338738]] and translation vector: [3.169208, 3.645592, 1.345035]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_122_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.861262, 0.35211, -0.366398], [0.508128, 0.60504, -0.61297], [0.005853, -0.714105, -0.700014]] and translation vector: [3.145762, 3.637784, 1.437024], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.859655, 0.347273, -0.374693], [0.510745, 
0.600786, -0.614977], [0.011546, -0.720041, -0.693836]] and translation vector: [3.145171, 3.63531, 1.440385], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.904923, 0.242096, -0.350005], [0.423906, 0.585528, -0.690985], [0.037653, -0.773658, -0.632485]] and translation vector: [3.179198, 3.619442, 1.477378]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_123_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.624751, -0.31057, 0.716403], [-0.780527, -0.273701, 0.562018], [0.021534, -0.910293, -0.413403]] and translation vector: [-0.212106, 0.775797, 1.619325], and another pair of RGB and depth 
images with the corresponding camera pose, i.e., rotation matrix: [[0.624146, -0.312612, 0.716042], [-0.781019, -0.274551, 0.56092], [0.02124, -0.909338, -0.415515]] and translation vector: [-0.212874, 0.777223, 1.616059], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.642142, -0.354499, 0.679694], [-0.766394, -0.316707, 0.558871], [0.017145, -0.879788, -0.475057]] and translation vector: [-0.180935, 0.825968, 1.590205]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_124_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.984594, -0.069457, 0.160469], [-0.174127, -0.305795, 0.936039], [-0.015944, 
-0.949561, -0.313178]] and translation vector: [3.941113, 2.817773, 1.559826], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.984592, -0.069572, 0.160429], [-0.174152, -0.307406, 0.935507], [-0.015768, -0.949032, -0.314785]] and translation vector: [3.94407, 2.817183, 1.553188], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.986412, -0.069361, 0.14893], [-0.163547, -0.328462, 0.93025], [-0.015605, -0.941967, -0.335343]] and translation vector: [3.970874, 2.81883, 1.551708]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_125_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding 
camera pose, i.e., rotation matrix: [[-0.565317, -0.50256, 0.654103], [-0.824719, 0.328974, -0.460017], [0.016003, -0.799506, -0.600445]] and translation vector: [4.07549, 5.065369, 1.281872], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.538132, -0.502349, 0.676801], [-0.842747, 0.30749, -0.441846], [0.013851, -0.808143, -0.588824]] and translation vector: [4.054681, 5.042427, 1.283033], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.45677, -0.532015, 0.712967], [-0.889546, 0.265624, -0.371689], [0.008363, -0.803993, -0.594581]] and translation vector: [3.985017, 4.950093, 1.286783]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_126_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th 
image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.971613, -0.06682, 0.226943], [-0.235147, 0.378036, -0.89543], [-0.02596, -0.923376, -0.383017]] and translation vector: [2.775299, 4.618156, 1.427592], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.969099, -0.066923, 0.237421], [-0.244849, 0.377786, -0.892932], [-0.029937, -0.923471, -0.382498]] and translation vector: [2.770648, 4.620754, 1.418404], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.959375, -0.08118, 0.270203], [-0.280099, 0.388898, -0.877669], [-0.033832, -0.917697, -0.395838]] and translation vector: [2.756619, 4.594989, 1.414391]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_127_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.218501, -0.721835, 0.656667], [-0.97193, -0.10083, 0.212566], [-0.087226, -0.684681, -0.723605]] and translation vector: [2.10902, 2.428258, 1.386435], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.218569, -0.722397, 0.656026], [-0.971546, -0.098231, 0.215522], [-0.091251, -0.684466, -0.723312]] and translation vector: [2.107975, 2.430531, 1.385643], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.234983, -0.674252, 0.70012], [-0.966581, -0.086145, 0.241454], [-0.102489, -0.73346, -0.671961]] and translation vector: [2.089091, 2.418566, 1.400829]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_128_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.863619, -0.252896, 0.436126], [-0.502889, 0.371124, -0.780621], [0.03556, -0.893482, -0.447688]] and translation vector: [2.007098, 3.82416, 1.536992], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.862677, -0.255046, 0.436739], [-0.504412, 0.370978, -0.779707], [0.036841, -0.892932, -0.448682]] and translation vector: [2.007321, 3.81907, 1.542811], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.863059, -0.255804, 0.435538], [-0.503401, 
0.36489, -0.783226], [0.041429, -0.89522, -0.443694]] and translation vector: [2.011345, 3.815826, 1.540639]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_129_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.311411, -0.45253, 0.835607], [-0.948656, 0.199362, -0.245576], [-0.055457, -0.869179, -0.491379]] and translation vector: [2.299133, 2.388773, 1.459468], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.314195, -0.454542, 0.833471], [-0.947818, 0.20019, -0.248124], [-0.05407, -0.867937, -0.493722]] and translation vector: [2.299448, 2.389842, 1.45904], please estimate 
the RGB image for the query camera pose, i.e., rotation matrix: [[-0.319309, -0.479365, 0.817466], [-0.946515, 0.203543, -0.250358], [-0.046377, -0.853686, -0.518719]] and translation vector: [2.297309, 2.382683, 1.450072]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_130_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.922168, 0.178823, -0.342969], [0.38661, 0.453076, -0.803278], [0.011746, -0.873352, -0.486947]] and translation vector: [3.207336, 1.959871, 1.267555], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.914921, 0.180426, -0.361063], [0.403188, 0.450583, 
-0.796502], [0.018979, -0.874312, -0.484993]] and translation vector: [3.204391, 1.957541, 1.273759], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.899907, 0.183343, -0.395667], [0.435126, 0.437531, -0.786913], [0.028842, -0.880314, -0.473515]] and translation vector: [3.195998, 1.957617, 1.285169]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_131_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.037281, 0.595041, -0.80283], [0.998378, -0.012419, -0.055566], [-0.043034, -0.803599, -0.593613]] and translation vector: [3.95675, 2.244474, 1.442954], and another pair of RGB and depth images 
with the corresponding camera pose, i.e., rotation matrix: [[-0.038109, 0.594465, -0.803218], [0.998341, -0.012073, -0.056302], [-0.043167, -0.80403, -0.593019]] and translation vector: [3.957906, 2.244142, 1.441716], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.035792, 0.584102, -0.810891], [0.99863, -0.010099, -0.051354], [-0.038185, -0.811617, -0.58294]] and translation vector: [3.956708, 2.24149, 1.443636]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_132_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.941243, -0.209403, 0.264975], [-0.336113, 0.504116, -0.795548], [0.033012, 
-0.837865, -0.544878]] and translation vector: [4.828751, 9.008894, 1.463441], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.939528, -0.206646, 0.273103], [-0.341818, 0.516505, -0.785101], [0.021179, -0.830976, -0.555906]] and translation vector: [4.819307, 9.009376, 1.463735], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.929333, -0.218512, 0.297646], [-0.368979, 0.519063, -0.770992], [0.013974, -0.826333, -0.563008]] and translation vector: [4.802584, 9.04943, 1.458571]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_133_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding 
camera pose, i.e., rotation matrix: [[-0.341382, 0.594812, -0.727775], [0.932196, 0.11517, -0.343142], [-0.120287, -0.795572, -0.593798]] and translation vector: [7.151203, 3.587152, 1.581923], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.344041, 0.585523, -0.734029], [0.930897, 0.110501, -0.348168], [-0.122749, -0.803089, -0.583079]] and translation vector: [7.150104, 3.60012, 1.584136], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.381268, 0.567894, -0.729473], [0.913798, 0.111991, -0.390424], [-0.140025, -0.815448, -0.561639]] and translation vector: [7.153435, 3.678253, 1.582921]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_134_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th 
image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.631332, 0.312126, -0.709927], [0.775472, -0.26347, 0.573784], [-0.007951, -0.912776, -0.408382]] and translation vector: [1.600176, 0.624978, 1.327739], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.627277, 0.311053, -0.713982], [0.778666, -0.267257, 0.567673], [-0.014241, -0.912041, -0.409851]] and translation vector: [1.601099, 0.627571, 1.328079], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.610657, 0.317655, -0.725393], [0.791862, -0.253314, 0.555685], [-0.007236, -0.913744, -0.406226]] and translation vector: [1.603666, 0.628049, 1.323957]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_135_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.283698, -0.38675, 0.877463], [-0.95878, 0.129662, -0.252839], [-0.015988, -0.913024, -0.407593]] and translation vector: [3.69525, 3.551647, 1.352095], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.292652, -0.378333, 0.878191], [-0.956147, 0.127043, -0.2639], [-0.011726, -0.91691, -0.398922]] and translation vector: [3.694781, 3.553972, 1.346799], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.31632, -0.391232, 0.864222], [-0.948647, 0.127329, -0.28958], [0.003253, -0.911441, -0.411418]] and translation vector: [3.701458, 3.559184, 1.352364]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_136_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.831143, 0.312948, -0.459636], [0.555586, 0.43327, -0.709649], [-0.022937, -0.845187, -0.533978]] and translation vector: [2.360292, 3.05803, 1.315354], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.8108, 0.328121, -0.484706], [0.584922, 0.423558, -0.691711], [-0.021664, -0.844355, -0.535346]] and translation vector: [2.374215, 3.08026, 1.318953], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.76064, 0.373644, -0.530865], [0.648502, 
0.400127, -0.647568], [-0.029546, -0.836832, -0.546661]] and translation vector: [2.421989, 3.144455, 1.295588]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_137_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.963317, 0.154363, -0.219528], [0.260086, 0.335369, -0.905474], [-0.066149, -0.929355, -0.363214]] and translation vector: [5.972451, 2.818726, 1.468896], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.963149, 0.154275, -0.220326], [0.260736, 0.334417, -0.905639], [-0.066037, -0.929712, -0.362318]] and translation vector: [5.973901, 2.819783, 1.467855], please 
estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.966667, 0.155296, -0.203565], [0.245918, 0.341836, -0.907013], [-0.07127, -0.926839, -0.368632]] and translation vector: [5.982299, 2.822232, 1.456096]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_138_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.48142, 0.335029, -0.809933], [0.872625, 0.096524, -0.478757], [-0.08222, -0.937251, -0.338823]] and translation vector: [4.429162, 2.287411, 1.464776], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.484328, 0.331289, -0.809737], [0.871134, 0.09698, 
-0.481374], [-0.080946, -0.938532, -0.335568]] and translation vector: [4.432656, 2.285767, 1.465956], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.510728, 0.315618, -0.799714], [0.857732, 0.123483, -0.499047], [-0.058757, -0.940817, -0.333782]] and translation vector: [4.456876, 2.264055, 1.467574]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_139_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.233902, -0.58763, 0.774584], [-0.967246, -0.059828, 0.246692], [-0.098622, -0.806915, -0.582377]] and translation vector: [0.860343, 3.117731, 1.418568], and another pair of RGB and depth images 
with the corresponding camera pose, i.e., rotation matrix: [[0.233684, -0.587102, 0.775051], [-0.967496, -0.061159, 0.24538], [-0.096661, -0.8072, -0.58231]] and translation vector: [0.859973, 3.119137, 1.418853], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.249158, -0.592393, 0.766154], [-0.964448, -0.07981, 0.251935], [-0.088098, -0.801687, -0.591217]] and translation vector: [0.847042, 3.133789, 1.403155]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_140_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.645842, -0.099101, 0.757012], [-0.761541, -0.013148, 0.647984], [-0.054263, -0.994991, 
-0.083961]] and translation vector: [3.729951, 1.432448, 1.733539], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.649827, -0.099601, 0.753528], [-0.757797, -0.00807, 0.652441], [-0.058903, -0.994995, -0.080722]] and translation vector: [3.727943, 1.43259, 1.731865], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.662065, -0.092976, 0.743657], [-0.747389, -0.008433, 0.664333], [-0.055496, -0.995633, -0.075073]] and translation vector: [3.728372, 1.436196, 1.743771]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_141_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, 
i.e., rotation matrix: [[-0.924746, 0.145405, -0.351715], [0.379908, 0.407811, -0.830277], [0.022707, -0.901414, -0.432362]] and translation vector: [3.891577, 4.106122, 1.335216], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.925289, 0.144931, -0.350479], [0.378485, 0.412032, -0.828842], [0.024284, -0.899569, -0.436102]] and translation vector: [3.892777, 4.104329, 1.336806], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.936719, 0.138164, -0.321666], [0.349495, 0.42231, -0.836366], [0.020288, -0.89586, -0.443873]] and translation vector: [3.898582, 4.105442, 1.335634]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_142_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th 
image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.896132, -0.052356, 0.440688], [-0.436974, -0.277444, 0.855616], [0.07747, -0.959314, -0.271505]] and translation vector: [3.211431, 3.110947, 1.584554], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.889709, -0.065096, 0.451863], [-0.451099, -0.277541, 0.848222], [0.070195, -0.958506, -0.276295]] and translation vector: [3.215954, 3.116336, 1.570817], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.866761, -0.113538, 0.485628], [-0.495946, -0.298858, 0.815305], [0.052566, -0.94752, -0.315347]] and translation vector: [3.24594, 3.15503, 1.569742]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_143_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.052123, 0.492225, -0.868906], [0.996177, 0.08671, -0.010637], [0.070107, -0.866138, -0.494863]] and translation vector: [3.27549, 2.071379, 1.287401], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.035278, 0.492309, -0.869705], [0.997133, 0.075637, 0.002369], [0.066948, -0.867128, -0.493566]] and translation vector: [3.286684, 2.076202, 1.285681], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.002481, 0.481037, -0.876697], [0.99848, 0.047075, 0.028655], [0.055055, -0.875436, -0.480189]] and translation vector: [3.329912, 2.119781, 1.289403]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_144_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.935878, -0.161972, 0.312885], [-0.352322, 0.433116, -0.829627], [-0.001139, -0.886666, -0.46241]] and translation vector: [1.123681, 2.231354, 1.408983], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.935522, -0.159, 0.315466], [-0.353249, 0.430874, -0.830399], [-0.003893, -0.888294, -0.459258]] and translation vector: [1.123559, 2.231523, 1.408322], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.93225, -0.177625, 0.315214], [-0.361774, 
0.444334, -0.819565], [0.005515, -0.878076, -0.47849]] and translation vector: [1.117516, 2.230649, 1.39948]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_145_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.305635, -0.390507, 0.868385], [-0.952144, 0.122302, -0.280116], [0.003183, -0.91244, -0.409198]] and translation vector: [4.266061, 1.773856, 1.285079], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.300987, -0.399102, 0.866097], [-0.953628, 0.125052, -0.273781], [0.00096, -0.908339, -0.418234]] and translation vector: [4.263163, 1.772832, 1.291083], please estimate 
the RGB image for the query camera pose, i.e., rotation matrix: [[-0.290604, -0.37367, 0.880863], [-0.956686, 0.130175, -0.260397], [-0.017364, -0.918382, -0.395314]] and translation vector: [4.197608, 1.767915, 1.309526]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_146_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.485844, -0.617081, 0.619005], [-0.873216, -0.311825, 0.374512], [-0.038083, -0.722479, -0.690343]] and translation vector: [-0.164865, 3.073333, 1.323993], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.482952, -0.621872, 0.616468], [-0.874972, -0.315096, 
0.367612], [-0.034361, -0.716931, -0.696297]] and translation vector: [-0.16601, 3.069565, 1.320265], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.481893, -0.627462, 0.611613], [-0.875383, -0.314055, 0.367526], [-0.038529, -0.712503, -0.70061]] and translation vector: [-0.162661, 3.069695, 1.32373]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_147_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.82141, -0.124481, 0.556588], [-0.562763, -0.33543, 0.755503], [0.092651, -0.933805, -0.345579]] and translation vector: [1.795382, 2.457259, 1.379582], and another pair of RGB and depth images with 
the corresponding camera pose, i.e., rotation matrix: [[0.820332, -0.124179, 0.558243], [-0.564621, -0.330977, 0.75608], [0.090876, -0.935432, -0.341626]] and translation vector: [1.795684, 2.460531, 1.380001], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.815123, -0.112718, 0.568216], [-0.568956, -0.340207, 0.748698], [0.108919, -0.933571, -0.341442]] and translation vector: [1.795413, 2.484714, 1.377791]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_148_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.464707, 0.496079, -0.733453], [0.882598, 0.326106, -0.338639], [0.071191, -0.804711, 
-0.589382]] and translation vector: [2.864701, 0.868861, 1.204561], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.473617, 0.501904, -0.723726], [0.878064, 0.332992, -0.343688], [0.068496, -0.798254, -0.598414]] and translation vector: [2.869803, 0.866998, 1.20304], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.486908, 0.474562, -0.733288], [0.872245, 0.308313, -0.379646], [0.045917, -0.82446, -0.564055]] and translation vector: [2.890215, 0.843054, 1.203118]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_149_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, 
i.e., rotation matrix: [[0.199941, 0.263531, -0.943703], [0.979453, -0.027844, 0.19974], [0.026362, -0.964249, -0.263683]] and translation vector: [3.611549, 3.757055, 1.562045], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.20075, 0.267793, -0.94233], [0.97934, -0.030969, 0.199834], [0.024331, -0.962979, -0.268477]] and translation vector: [3.608934, 3.756757, 1.557843], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.195501, 0.302185, -0.932986], [0.980511, -0.041383, 0.192056], [0.019427, -0.95235, -0.304386]] and translation vector: [3.586484, 3.775929, 1.547968]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_150_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th 
image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.869565, 0.231948, -0.435955], [0.492522, 0.471291, -0.731647], [0.035758, -0.850932, -0.524058]] and translation vector: [2.750575, 3.154689, 1.290553], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.871211, 0.246607, -0.424472], [0.49036, 0.478017, -0.72873], [0.023195, -0.843022, -0.53738]] and translation vector: [2.712538, 3.137298, 1.287246], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.868111, 0.301221, -0.394523], [0.496051, 0.497976, -0.711305], [-0.017797, -0.813195, -0.581719]] and translation vector: [2.638672, 3.09301, 1.251808]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_151_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.606468, -0.360414, 0.70873], [-0.789578, -0.16805, 0.590192], [-0.093612, -0.91753, -0.386492]] and translation vector: [2.373669, 6.226582, 1.48631], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.603564, -0.356146, 0.713352], [-0.791899, -0.163667, 0.588311], [-0.092772, -0.919986, -0.380815]] and translation vector: [2.370215, 6.229294, 1.484576], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.585698, -0.348105, 0.731971], [-0.805739, -0.152014, 0.572431], [-0.087997, -0.925048, -0.369516]] and translation vector: [2.368074, 6.23172, 1.479712]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_152_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.414473, -0.491559, 0.765887], [-0.909569, 0.196057, -0.366396], [0.029948, -0.848488, -0.528367]] and translation vector: [0.955419, 3.497842, 1.497559], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.410009, -0.490704, 0.768832], [-0.911757, 0.198024, -0.359841], [0.024328, -0.848526, -0.528594]] and translation vector: [0.937857, 3.503192, 1.495427], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.398859, -0.490133, 0.775036], 
[-0.916862, 0.197836, -0.346736], [0.016617, -0.848899, -0.528293]] and translation vector: [0.908797, 3.515594, 1.497193]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_153_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.935902, 0.160482, -0.313582], [0.351212, -0.493772, 0.795512], [-0.027173, -0.854655, -0.518485]] and translation vector: [4.465, -0.226232, 1.550028], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.933656, 0.161027, -0.319933], [0.356818, -0.495752, 0.791777], [-0.03111, -0.853405, -0.520319]] and translation vector: [4.478531, -0.229773, 1.540292], 
please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.918867, 0.198209, -0.341168], [0.393883, -0.511652, 0.763589], [-0.023209, -0.836017, -0.548212]] and translation vector: [4.561479, -0.239772, 1.527731]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_154_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.079918, -0.690871, 0.718547], [-0.996802, 0.055321, -0.057677], [9.6e-05, -0.720858, -0.693082]] and translation vector: [1.142658, 0.968078, 1.385987], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.080635, -0.691404, 0.717954], 
[-0.996742, 0.054488, -0.059473], [0.002, -0.72041, -0.693545]] and translation vector: [1.144302, 0.967344, 1.387927], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.084359, -0.697761, 0.711347], [-0.996391, 0.05228, -0.066881], [0.009477, -0.714421, -0.699652]] and translation vector: [1.144001, 0.956717, 1.378471]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_155_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.799511, 0.533863, -0.275266], [0.600541, 0.71925, -0.349328], [0.011492, -0.4446, -0.895656]] and translation vector: [2.031323, 2.312379, 1.200993], and another pair of RGB and 
depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.794986, 0.540559, -0.275306], [0.606553, 0.715482, -0.346669], [0.009582, -0.442584, -0.896676]] and translation vector: [2.031011, 2.313572, 1.199732], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.773021, 0.563749, -0.290906], [0.633995, 0.702534, -0.323259], [0.022134, -0.434318, -0.900488]] and translation vector: [2.034953, 2.302037, 1.199248]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_156_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.573389, -0.355745, 0.738018], [-0.818965, 0.223754, -0.528424], 
[0.02285, -0.907403, -0.419641]] and translation vector: [2.061407, 3.857203, 1.382209], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.569689, -0.351701, 0.742806], [-0.821614, 0.221591, -0.525212], [0.020118, -0.909508, -0.4152]] and translation vector: [2.058259, 3.848013, 1.384733], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.577204, -0.345215, 0.740042], [-0.816391, 0.223437, -0.532524], [0.018482, -0.911539, -0.410799]] and translation vector: [2.052109, 3.841456, 1.390313]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_157_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the 
corresponding camera pose, i.e., rotation matrix: [[-0.998162, -0.007354, -0.06016], [0.055338, 0.294228, -0.954132], [0.024717, -0.955707, -0.293281]] and translation vector: [1.687981, 4.43329, 1.569003], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.998237, -0.004775, -0.059163], [0.055295, 0.287523, -0.956176], [0.021577, -0.957762, -0.286752]] and translation vector: [1.687716, 4.435163, 1.571974], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.998336, 0.001509, -0.057642], [0.055709, 0.283251, -0.957427], [0.014882, -0.959045, -0.282864]] and translation vector: [1.68694, 4.439428, 1.572118]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_158_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: 
The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.633294, -0.360819, 0.684652], [-0.773758, -0.312806, 0.550863], [0.015401, -0.878613, -0.477285]] and translation vector: [3.241882, 3.386626, 1.367882], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.618852, -0.359339, 0.698497], [-0.785116, -0.311057, 0.535572], [0.02482, -0.87984, -0.47462]] and translation vector: [3.234923, 3.400149, 1.365622], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.596077, -0.384708, 0.704764], [-0.800029, -0.359087, 0.480636], [0.068167, -0.850327, -0.521821]] and translation vector: [3.228332, 3.407161, 1.324573]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_159_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.977514, -0.102294, 0.184398], [-0.210796, -0.497303, 0.841578], [0.005613, -0.861525, -0.507684]] and translation vector: [3.555602, 1.207732, 1.356493], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.976582, -0.105336, 0.187593], [-0.215087, -0.498001, 0.840079], [0.00493, -0.860755, -0.508995]] and translation vector: [3.555365, 1.207812, 1.356155], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.974531, -0.107289, 0.196922], [-0.224038, -0.504207, 0.834016], [0.009809, -0.856892, -0.515402]] and translation vector: [3.552069, 1.20032, 1.350158]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_160_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.934222, -0.219071, 0.281493], [-0.356558, -0.595286, 0.72007], [0.009823, -0.773073, -0.634241]] and translation vector: [0.331108, 1.989283, 1.551545], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.93341, -0.222981, 0.281114], [-0.358788, -0.589093, 0.724045], [0.004154, -0.776691, -0.629868]] and translation vector: [0.338532, 1.98258, 1.554168], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.924209, -0.231475, 0.303738], [-0.381819, 
-0.575084, 0.723528], [0.007196, -0.784664, -0.619879]] and translation vector: [0.352139, 1.976578, 1.57555]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_161_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.928375, -0.17783, 0.326339], [-0.371449, 0.415395, -0.830345], [0.012101, -0.892089, -0.451697]] and translation vector: [2.096006, 1.919092, 1.36174], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.929206, -0.177937, 0.323905], [-0.369314, 0.414969, -0.83151], [0.013546, -0.892266, -0.451307]] and translation vector: [2.095672, 1.922099, 1.363168], please estimate 
the RGB image for the query camera pose, i.e., rotation matrix: [[-0.930649, -0.183615, 0.31651], [-0.365027, 0.405695, -0.837954], [0.025454, -0.895375, -0.444584]] and translation vector: [2.086709, 1.937528, 1.366332]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_162_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.699126, -0.324611, 0.637064], [-0.713802, 0.265353, -0.648131], [0.041344, -0.907863, -0.417224]] and translation vector: [0.050403, 3.78209, 1.506908], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.698648, -0.327666, 0.636024], [-0.713993, 0.262294, 
-0.649166], [0.045885, -0.907654, -0.417203]] and translation vector: [0.047406, 3.786517, 1.504266], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.671591, -0.353844, 0.650968], [-0.738623, 0.250587, -0.625813], [0.058316, -0.901111, -0.429649]] and translation vector: [0.057884, 3.801169, 1.498956]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_163_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.892065, -0.360019, 0.273141], [-0.443019, -0.577417, 0.685801], [-0.089185, -0.732786, -0.674589]] and translation vector: [2.898737, 2.45906, 1.649541], and another pair of RGB and depth images 
with the corresponding camera pose, i.e., rotation matrix: [[0.888376, -0.366176, 0.276954], [-0.450762, -0.581088, 0.677606], [-0.087189, -0.726809, -0.681283]] and translation vector: [2.873446, 2.440832, 1.651115], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.866588, -0.389647, 0.31177], [-0.495846, -0.601945, 0.625939], [-0.056227, -0.697021, -0.714843]] and translation vector: [2.802999, 2.373059, 1.651133]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_164_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.660671, 0.426343, -0.617856], [0.749322, -0.423957, 0.508701], [-0.045063, 
-0.799057, -0.599565]] and translation vector: [1.739014, 2.260029, 1.323145], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.661948, 0.412501, -0.625834], [0.748146, -0.41469, 0.517987], [-0.045857, -0.811095, -0.583114]] and translation vector: [1.741474, 2.257287, 1.327618], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.667808, 0.364392, -0.649039], [0.743671, -0.363436, 0.561132], [-0.031412, -0.857399, -0.513693]] and translation vector: [1.753926, 2.258369, 1.342793]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_165_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding 
camera pose, i.e., rotation matrix: [[-0.997074, 0.061747, -0.045056], [0.074474, 0.651998, -0.754554], [-0.017215, -0.755702, -0.654689]] and translation vector: [1.815792, 5.369752, 1.288561], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.994543, 0.080066, -0.066881], [0.102674, 0.63762, -0.763478], [-0.018484, -0.766179, -0.642361]] and translation vector: [1.819087, 5.36055, 1.286161], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.977666, 0.151417, -0.145745], [0.209051, 0.629394, -0.748438], [-0.021596, -0.762191, -0.646992]] and translation vector: [1.833647, 5.312907, 1.282765]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_166_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th 
image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.207785, -0.462455, 0.861952], [-0.977184, 0.13779, -0.161637], [-0.044019, -0.875871, -0.480534]] and translation vector: [2.720584, 1.654419, 1.522448], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.211008, -0.462778, 0.860995], [-0.976592, 0.137438, -0.165466], [-0.04176, -0.875755, -0.480946]] and translation vector: [2.717844, 1.649691, 1.521912], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.235215, -0.460817, 0.855758], [-0.971358, 0.142015, -0.190515], [-0.033738, -0.876059, -0.481022]] and translation vector: [2.714951, 1.646852, 1.521954]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_167_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.686341, -0.358824, 0.632599], [-0.727213, -0.35045, 0.590209], [0.009912, -0.865119, -0.50147]] and translation vector: [2.486494, 4.601647, 1.455454], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.681394, -0.352774, 0.64129], [-0.731846, -0.340576, 0.590263], [0.010179, -0.871527, -0.490243]] and translation vector: [2.480601, 4.595852, 1.449959], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.622935, -0.386366, 0.680202], [-0.78205, -0.328403, 0.52967], [0.018734, -0.861901, -0.50673]] and translation vector: [2.469727, 4.596006, 1.44499]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_168_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.482968, -0.397392, 0.78027], [-0.874514, 0.173759, -0.452807], [0.044362, -0.901048, -0.431445]] and translation vector: [8.974016, 2.795387, 1.945192], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.496352, -0.388832, 0.776173], [-0.867003, 0.176647, -0.465943], [0.044064, -0.904216, -0.424797]] and translation vector: [8.98292, 2.792107, 1.939625], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.528625, -0.374982, 0.76154], [-0.848241, 
0.199205, -0.490719], [0.032308, -0.905376, -0.42338]] and translation vector: [9.019628, 2.751405, 1.924251]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_169_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.891251, 0.378307, -0.25011], [0.443048, 0.608538, -0.658323], [-0.096846, -0.697542, -0.709969]] and translation vector: [4.935522, 3.588868, 1.45033], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.887006, 0.383874, -0.256633], [0.452131, 0.60913, -0.651566], [-0.093796, -0.693975, -0.713864]] and translation vector: [4.940225, 3.582454, 1.45688], please estimate 
the RGB image for the query camera pose, i.e., rotation matrix: [[-0.875452, 0.38739, -0.288987], [0.475285, 0.581583, -0.660201], [-0.087685, -0.715325, -0.693269]] and translation vector: [4.970656, 3.561422, 1.469218]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_170_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.530794, 0.426739, -0.732224], [0.841151, 0.159702, -0.516681], [-0.10355, -0.890162, -0.443721]] and translation vector: [5.418979, 4.373359, 1.385162], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.532043, 0.421439, -0.734384], [0.841755, 0.169492, 
-0.512564], [-0.091542, -0.890877, -0.444925]] and translation vector: [5.415919, 4.39552, 1.38299], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.539398, 0.40032, -0.740806], [0.839984, 0.194205, -0.506666], [-0.05896, -0.89556, -0.441017]] and translation vector: [5.414681, 4.463818, 1.378667]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_171_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.086843, 0.425015, -0.901011], [0.995696, 0.066429, -0.064634], [0.032383, -0.902745, -0.428955]] and translation vector: [4.261571, 5.85756, 1.66629], and another pair of RGB and depth images with the 
corresponding camera pose, i.e., rotation matrix: [[-0.086953, 0.422316, -0.902268], [0.995713, 0.06553, -0.065286], [0.031554, -0.904077, -0.426204]] and translation vector: [4.260677, 5.865657, 1.669414], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.081846, 0.421358, -0.903194], [0.995927, 0.068976, -0.058071], [0.03783, -0.904268, -0.425287]] and translation vector: [4.263237, 5.864869, 1.673574]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_172_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.725417, 0.297171, -0.620854], [0.687848, -0.279954, 0.669695], [0.025203, -0.912861, 
-0.407492]] and translation vector: [3.434752, 3.057745, 1.556519], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.722045, 0.303192, -0.621873], [0.691238, -0.278447, 0.666827], [0.029018, -0.911341, -0.410629]] and translation vector: [3.433538, 3.052318, 1.549734], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.693174, 0.307801, -0.651742], [0.720057, -0.255516, 0.645158], [0.032049, -0.916499, -0.398751]] and translation vector: [3.420418, 3.038936, 1.558387]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_173_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, 
i.e., rotation matrix: [[-0.032646, 0.194727, -0.980314], [0.998594, -0.034636, -0.040135], [-0.04177, -0.980246, -0.193322]] and translation vector: [3.506056, 2.493951, 1.706783], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.038857, 0.192835, -0.980462], [0.998032, -0.040846, -0.047587], [-0.049225, -0.980381, -0.190868]] and translation vector: [3.502031, 2.499079, 1.701362], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.064111, 0.226282, -0.97195], [0.996323, -0.040955, -0.075254], [-0.056835, -0.9732, -0.222824]] and translation vector: [3.459589, 2.490182, 1.701209]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_174_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 
7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.881415, -0.308012, 0.3581], [-0.47008, 0.646119, -0.601294], [-0.046169, -0.698325, -0.71429]] and translation vector: [3.147524, 1.689608, 1.273114], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.879224, -0.311908, 0.360109], [-0.474637, 0.638627, -0.605703], [-0.041052, -0.703469, -0.709539]] and translation vector: [3.141599, 1.689583, 1.27073], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.878218, -0.323901, 0.351882], [-0.476941, 0.647734, -0.594111], [-0.035492, -0.689586, -0.723334]] and translation vector: [3.127244, 1.682619, 1.264528]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_175_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.467192, 0.317292, -0.825262], [0.883302, -0.126478, 0.451421], [0.038855, -0.939856, -0.339354]] and translation vector: [2.723032, 3.168159, 1.438168], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.467636, 0.312306, -0.826911], [0.883318, -0.130557, 0.450227], [0.03265, -0.940968, -0.336919]] and translation vector: [2.722188, 3.168039, 1.441817], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.470302, 0.305008, -0.828122], [0.881834, -0.125828, 0.454462], [0.034414, -0.944001, -0.328143]] and translation vector: [2.718763, 3.171866, 1.451475]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_176_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.943065, -0.17817, 0.280864], [-0.332105, 0.550897, -0.765649], [-0.018311, -0.815333, -0.578703]] and translation vector: [2.74599, 1.673222, 1.294065], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.942639, -0.173012, 0.285478], [-0.332909, 0.550136, -0.765848], [-0.024551, -0.816957, -0.576177]] and translation vector: [2.737266, 1.663808, 1.300966], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.942881, -0.164787, 0.289518], 
[-0.331772, 0.54291, -0.771477], [-0.030053, -0.823465, -0.566571]] and translation vector: [2.712684, 1.645235, 1.301017]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_177_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.695296, -0.421579, 0.582095], [-0.717067, -0.351947, 0.601622], [-0.048765, -0.835707, -0.547007]] and translation vector: [2.470866, 0.652559, 1.473924], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.695871, -0.418819, 0.583399], [-0.716734, -0.353708, 0.600986], [-0.045352, -0.83635, -0.546317]] and translation vector: [2.469546, 0.651931, 1.473078], 
please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.693531, -0.42586, 0.581085], [-0.719633, -0.371637, 0.586528], [-0.033826, -0.824943, -0.564204]] and translation vector: [2.467637, 0.650008, 1.462326]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_178_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.748873, -0.374013, 0.547087], [-0.662404, -0.447673, 0.600675], [0.020256, -0.812221, -0.582998]] and translation vector: [3.709567, 4.406117, 1.261793], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.747082, -0.370975, 0.551585], [-0.664465, 
-0.440253, 0.603874], [0.018814, -0.817652, -0.575405]] and translation vector: [3.708719, 4.403161, 1.261416], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.743545, -0.377269, 0.552096], [-0.66849, -0.439378, 0.600057], [0.016196, -0.81524, -0.578898]] and translation vector: [3.708687, 4.402202, 1.259327]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_179_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.925351, 0.122106, -0.358909], [0.376741, 0.190476, -0.906524], [-0.042329, -0.974068, -0.222259]] and translation vector: [4.735593, 2.732706, 1.21643], and another pair of RGB and depth 
images with the corresponding camera pose, i.e., rotation matrix: [[-0.924788, 0.125024, -0.359357], [0.377675, 0.187086, -0.906841], [-0.046146, -0.974355, -0.220234]] and translation vector: [4.740286, 2.733964, 1.218072], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.925715, 0.103215, -0.363867], [0.37582, 0.142741, -0.915633], [-0.042569, -0.984363, -0.170928]] and translation vector: [4.730338, 2.742957, 1.247444]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_180_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.264492, -0.222038, 0.938479], [-0.962334, 0.002714, 0.271857], [-0.062909, 
-0.975034, -0.212957]] and translation vector: [0.925816, 4.784833, 1.497389], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.263009, -0.220134, 0.939344], [-0.962729, 0.003779, 0.270443], [-0.063084, -0.975462, -0.210935]] and translation vector: [0.925807, 4.784041, 1.498483], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.243124, -0.227834, 0.942858], [-0.968357, -0.000546, 0.249567], [-0.056345, -0.9737, -0.220758]] and translation vector: [0.931793, 4.784123, 1.4987]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_181_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera 
pose, i.e., rotation matrix: [[0.173351, 0.592298, -0.78685], [0.984858, -0.105806, 0.137329], [-0.001913, -0.798742, -0.601671]] and translation vector: [3.264189, 1.940071, 1.28435], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.172933, 0.589263, -0.789217], [0.98493, -0.105695, 0.136901], [-0.002745, -0.800998, -0.598661]] and translation vector: [3.267153, 1.942133, 1.284021], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.139436, 0.623012, -0.769684], [0.990166, -0.096604, 0.101183], [-0.011316, -0.776224, -0.630355]] and translation vector: [3.29114, 1.970334, 1.268272]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_182_7.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 
7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.442667, -0.46733, 0.765277], [-0.896368, 0.253361, -0.363776], [-0.023888, -0.847001, -0.531054]] and translation vector: [2.453469, 1.905797, 1.451684], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.441405, -0.472001, 0.763136], [-0.897015, 0.253848, -0.361837], [-0.022933, -0.844261, -0.535442]] and translation vector: [2.45238, 1.90449, 1.449179], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.442687, -0.461983, 0.768505], [-0.8965, 0.24504, -0.369112], [-0.017791, -0.852366, -0.522643]] and translation vector: [2.451253, 1.899634, 1.462124]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_183_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.643628, -0.362528, 0.674031], [-0.765241, -0.290748, 0.574345], [-0.012243, -0.88546, -0.464555]] and translation vector: [2.632762, 2.243425, 1.452714], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.642371, -0.361874, 0.675579], [-0.76623, -0.285016, 0.575898], [-0.015852, -0.887589, -0.460364]] and translation vector: [2.634792, 2.237319, 1.452971], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.637523, -0.35682, 0.682821], [-0.770314, -0.279737, 0.573031], [-0.013459, -0.891306, -0.453202]] and translation vector: [2.638724, 2.233015, 1.462981]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_184_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.731293, 0.384445, -0.563394], [0.682011, 0.401944, -0.610984], [-0.008437, -0.831049, -0.556135]] and translation vector: [5.176627, 2.209938, 1.427488], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.733453, 0.387758, -0.558292], [0.679719, 0.411882, -0.606907], [-0.005383, -0.82462, -0.565663]] and translation vector: [5.175584, 2.209993, 1.422561], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.739748, 0.384821, -0.551984], [0.672884, 
0.424134, -0.606084], [0.000881, -0.819771, -0.572692]] and translation vector: [5.164479, 2.208437, 1.426833]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_185_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.996822, -0.027813, -0.074656], [0.056495, -0.413943, 0.908548], [-0.056173, -0.909878, -0.411056]] and translation vector: [4.405487, 5.403347, 1.494535], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.996757, -0.027349, -0.075677], [0.057466, -0.416379, 0.907373], [-0.056327, -0.90878, -0.413457]] and translation vector: [4.408994, 5.403286, 1.494292], please 
estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.997265, -0.029561, -0.067745], [0.049832, -0.408017, 0.911613], [-0.05459, -0.912496, -0.405428]] and translation vector: [4.415172, 5.400004, 1.499593]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_186_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.789457, 0.162095, -0.592016], [0.613764, 0.197318, -0.764434], [-0.007096, -0.966846, -0.255262]] and translation vector: [5.114759, 3.17533, 1.386193], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.785271, 0.158609, -0.598492], [0.619131, 
0.193201, -0.761151], [-0.005096, -0.968255, -0.249915]] and translation vector: [5.11251, 3.170745, 1.383731], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.782732, 0.165019, -0.600083], [0.622288, 0.192888, -0.758652], [-0.009443, -0.967245, -0.253669]] and translation vector: [5.104394, 3.153102, 1.37449]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_187_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.95695, -0.100486, 0.272304], [-0.288986, 0.24231, -0.92616], [0.027085, -0.964981, -0.260918]] and translation vector: [1.227478, 4.879099, 1.55452], and another pair of RGB and depth 
images with the corresponding camera pose, i.e., rotation matrix: [[-0.957752, -0.097454, 0.27058], [-0.286469, 0.240112, -0.927514], [0.025421, -0.965841, -0.257885]] and translation vector: [1.221714, 4.885019, 1.554874], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.941817, -0.081741, 0.326036], [-0.336056, 0.20922, -0.91831], [0.00685, -0.974446, -0.224516]] and translation vector: [1.204022, 4.901892, 1.569033]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_188_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.982764, 0.054289, -0.17671], [0.184841, -0.27426, 0.943724], [0.002769, 
-0.960122, -0.279568]] and translation vector: [4.072058, 1.220293, 1.47625], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.982485, 0.057917, -0.177113], [0.186218, -0.270474, 0.944546], [0.0068, -0.960984, -0.276522]] and translation vector: [4.071517, 1.218265, 1.477941], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.980674, 0.05705, -0.187148], [0.195532, -0.252477, 0.947641], [0.006813, -0.96592, -0.258752]] and translation vector: [4.0711, 1.209071, 1.48705]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_189_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, 
i.e., rotation matrix: [[-0.693623, 0.392298, -0.604144], [0.720137, 0.397492, -0.568686], [0.017048, -0.82952, -0.558217]] and translation vector: [2.706242, 2.586761, 1.453005], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.690051, 0.396658, -0.605386], [0.723517, 0.399766, -0.56277], [0.018785, -0.826347, -0.562848]] and translation vector: [2.704536, 2.590014, 1.45316], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.674504, 0.428853, -0.600941], [0.737993, 0.414011, -0.53288], [0.020269, -0.80292, -0.595742]] and translation vector: [2.699649, 2.603579, 1.443268]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_190_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th 
image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.610102, 0.375008, -0.697958], [0.791763, 0.255448, -0.554849], [-0.029781, -0.891132, -0.452767]] and translation vector: [2.349929, 1.419923, 1.358478], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.607496, 0.374505, -0.700496], [0.793845, 0.255679, -0.551759], [-0.027534, -0.891277, -0.452623]] and translation vector: [2.354864, 1.421781, 1.358478], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.579764, 0.373065, -0.724359], [0.814546, 0.24389, -0.526338], [-0.019694, -0.895176, -0.445277]] and translation vector: [2.359462, 1.423068, 1.367348]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_191_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.081815, 0.638296, -0.765431], [0.996577, -0.061545, 0.055199], [-0.011875, -0.767327, -0.641146]] and translation vector: [3.004073, 1.570726, 1.431248], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.083332, 0.64082, -0.763155], [0.996457, -0.062303, 0.056492], [-0.011346, -0.765159, -0.643742]] and translation vector: [3.00242, 1.571458, 1.432065], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.083112, 0.654572, -0.751417], [0.996444, -0.065065, 0.053535], [-0.013848, -0.753195, -0.657652]] and translation vector: [3.01468, 1.572497, 1.43131]. 
The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_192_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.348231, 0.123124, -0.929288], [0.936413, -1.6e-05, 0.350899], [0.043189, -0.992391, -0.1153]] and translation vector: [2.712005, 2.075202, 1.464169], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.348319, 0.120186, -0.929639], [0.93641, 0.000395, 0.350907], [0.042542, -0.992751, -0.112406]] and translation vector: [2.712393, 2.076758, 1.463984], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.330226, 0.128954, -0.935052], [0.94318, -0.00633, 
0.332223], [0.036923, -0.99163, -0.123717]] and translation vector: [2.702959, 2.087481, 1.468829]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_193_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.524333, 0.441188, -0.728305], [0.848808, -0.202677, 0.488311], [0.067827, -0.874228, -0.480754]] and translation vector: [3.10696, 1.250425, 1.344077], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.531491, 0.437044, -0.72561], [0.844432, -0.205894, 0.494513], [0.066725, -0.875557, -0.478485]] and translation vector: [3.107462, 1.25329, 1.344278], please estimate the RGB image 
for the query camera pose, i.e., rotation matrix: [[0.56012, 0.431145, -0.707375], [0.826071, -0.226557, 0.516021], [0.062219, -0.873376, -0.483056]] and translation vector: [3.110022, 1.262991, 1.348097]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_194_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.000188, -0.47362, 0.88073], [-0.997828, 0.057931, 0.031365], [-0.065877, -0.878822, -0.47258]] and translation vector: [4.366519, 5.511691, 1.307889], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.002248, -0.465195, 0.885205], [-0.998254, 0.053289, 0.02547], [-0.05902, 
-0.883603, -0.464503]] and translation vector: [4.36891, 5.516212, 1.317108], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.024267, -0.440835, 0.89726], [-0.998159, 0.06059, 0.002773], [-0.055588, -0.895541, -0.441493]] and translation vector: [4.36929, 5.527184, 1.331889]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_5.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_195_7.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.819759, -0.274444, 0.502669], [-0.572709, 0.39303, -0.719397], [-0.00013, -0.877615, -0.479366]] and translation vector: [2.765326, 1.370172, 1.355227], and another pair of RGB and depth images with the corresponding camera 
pose, i.e., rotation matrix: [[-0.819555, -0.26888, 0.505998], [-0.572993, 0.389095, -0.721307], [-0.002936, -0.881084, -0.472951]] and translation vector: [2.765196, 1.369276, 1.358405], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.80543, -0.264338, 0.530479], [-0.592674, 0.365802, -0.717584], [-0.004366, -0.892365, -0.451294]] and translation vector: [2.783833, 1.382351, 1.368477]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_196_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.060487, 0.154719, -0.986105], [0.998165, 0.006603, -0.060191], [-0.002801, -0.987936, -0.154835]] and 
translation vector: [6.630666, 2.572317, 1.44523], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.062036, 0.175232, -0.982571], [0.998074, 0.011306, -0.060998], [0.00042, -0.984462, -0.175596]] and translation vector: [6.62843, 2.567178, 1.442285], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.077658, 0.209818, -0.974652], [0.996978, 0.01426, -0.076367], [-0.002124, -0.977636, -0.210291]] and translation vector: [6.626263, 2.56408, 1.439607]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_4.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_197_7.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation 
matrix: [[0.286652, 0.220257, -0.932372], [0.958024, -0.061246, 0.28007], [0.004584, -0.973517, -0.228568]] and translation vector: [3.76659, 1.676076, 1.452194], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[0.299829, 0.216367, -0.929133], [0.953977, -0.07366, 0.290693], [-0.005544, -0.973529, -0.228495]] and translation vector: [3.753121, 1.670498, 1.452776], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[0.332229, 0.205241, -0.920597], [0.943053, -0.089416, 0.320398], [-0.016558, -0.974618, -0.22326]] and translation vector: [3.692962, 1.621141, 1.4585]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_6.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_198_7.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_Scene_Reconstruction", "visual_input_component": "3d image", "source": "SCANNET_threed_scene_reconstruction", "options": "A: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th 
image", "question": "Given a pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.409087, -0.112571, 0.905525], [-0.910894, 0.109148, -0.397943], [-0.05404, -0.987631, -0.147191]] and translation vector: [4.421403, 3.579741, 1.526424], and another pair of RGB and depth images with the corresponding camera pose, i.e., rotation matrix: [[-0.417977, -0.10834, 0.901974], [-0.906895, 0.107978, -0.407287], [-0.053267, -0.988232, -0.143386]] and translation vector: [4.418822, 3.582731, 1.526625], please estimate the RGB image for the query camera pose, i.e., rotation matrix: [[-0.44932, -0.10036, 0.887716], [-0.891042, 0.12205, -0.437205], [-0.064468, -0.987437, -0.144264]] and translation vector: [4.403283, 3.625828, 1.518726]. The provided camera poses represent the the transformation from the camera coordinate system to the world coordinate system.", "context": "Your task is to reconstruct the 3D geometry of a scene. This is tested through the image retrieval for a specific camera pose. 
The input images are the first 4 images\nSelect from the following choices.\nA: The 5th image\nB: The 6th image\nC: The 7th image\nD: The 8th image", "input_image_path": ["./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_0.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_1.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_2.jpg", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_3.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_4.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_5.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_6.png", "./3D-spatial/threeD_Scene_Reconstruction/threeD_Scene_Reconstruction_199_7.jpg"], "output": "D", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/threeD_question_answering/qwen3-vl/metadata_info.json b/results/threeD_question_answering/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..fe8145a --- /dev/null +++ b/results/threeD_question_answering/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: uncertain\nC: no\nD: maybe", "question": "Are there any things?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: uncertain\nC: no\nD: maybe", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_0_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_0_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_0_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_0_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_0_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_0_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_0_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_0_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_0_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_0_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_0_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_0_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: walking\nB: lying down\nC: sitting\nD: standing", "question": "What is the status of the pedestrian to the back right of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: walking\nB: lying down\nC: sitting\nD: standing", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_1_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_1_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_1_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_1_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_1_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_1_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_1_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_1_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_1_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_1_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_1_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_1_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 5\nB: 2\nC: 3\nD: 0", "question": "How many cars are to the front left of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 2\nC: 3\nD: 0", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_2_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_2_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_2_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_2_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_2_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_2_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_2_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_2_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_2_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_2_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_2_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_2_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 5\nB: 12\nC: 3\nD: 8", "question": "How many moving things are there?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 12\nC: 3\nD: 8", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_3_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_3_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_3_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_3_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_3_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_3_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_3_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_3_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_3_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_3_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_3_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_3_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: moving\nB: idling\nC: broken down\nD: parked", "question": "The truck to the back right of the bus is in what status?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: idling\nC: broken down\nD: parked", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_4_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_4_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_4_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_4_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_4_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_4_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_4_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_4_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_4_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_4_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_4_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_4_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: not sure\nC: yes\nD: unknown", "question": "Are there any other pedestrians of the same status as the thing that is to the front of the bicycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: not sure\nC: yes\nD: unknown", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_5_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_5_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_5_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_5_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_5_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_5_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_5_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_5_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_5_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_5_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_5_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_5_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: new\nB: without rider\nC: for sale\nD: broken", "question": "There is a motorcycle; what status is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: new\nB: without rider\nC: for sale\nD: broken", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_6_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_6_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_6_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_6_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_6_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_6_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_6_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_6_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_6_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_6_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_6_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_6_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: sitting\nB: jumping\nC: stationary\nD: moving", "question": "What status is the pedestrian that is to the front of the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: sitting\nB: jumping\nC: stationary\nD: moving", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_7_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_7_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_7_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_7_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_7_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_7_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_7_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_7_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_7_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_7_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_7_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_7_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: on the road\nB: without rider\nC: being ridden by someone\nD: inside the truck", "question": "What is the status of the motorcycle to the front of the parked truck?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: on the road\nB: without rider\nC: being ridden by someone\nD: inside the truck", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_8_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_8_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_8_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_8_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_8_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_8_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_8_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_8_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_8_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_8_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_8_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_8_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 7\nB: 5\nC: 3\nD: 10", "question": "How many cars are to the back of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 7\nB: 5\nC: 3\nD: 10", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_9_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_9_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_9_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_9_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_9_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_9_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_9_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_9_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_9_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_9_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_9_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_9_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: with rider\nB: in repair\nC: being sold\nD: locked up", "question": "The bicycle is in what status?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: with rider\nB: in repair\nC: being sold\nD: locked up", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_10_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_10_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_10_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_10_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_10_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_10_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_10_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_10_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_10_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_10_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_10_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_10_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 2\nB: 4\nC: 10\nD: 8", "question": "How many other things are there of the same status as the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 2\nB: 4\nC: 10\nD: 8", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_11_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_11_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_11_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_11_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_11_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_11_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_11_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_11_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_11_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_11_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_11_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_11_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: I don\u2019t know\nC: no\nD: yes", "question": "Is the status of the thing that is to the front left of the construction vehicle the same as the car to the back of the motorcycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: I don\u2019t know\nC: no\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_12_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_12_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_12_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_12_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_12_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_12_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_12_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_12_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_12_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_12_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_12_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_12_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: waiting\nB: moving\nC: departing\nD: stopped", "question": "There is a bus; what status is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: waiting\nB: moving\nC: departing\nD: stopped", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_13_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_13_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_13_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_13_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_13_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_13_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_13_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_13_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_13_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_13_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_13_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_13_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 6\nB: 4\nC: 2\nD: 7", "question": "There is a bicycle; how many moving things are to the back right of it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 6\nB: 4\nC: 2\nD: 7", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_14_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_14_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_14_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_14_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_14_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_14_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_14_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_14_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_14_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_14_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_14_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_14_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: no\nC: possibly\nD: maybe", "question": "Are any trucks visible?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: no\nC: possibly\nD: maybe", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_15_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_15_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_15_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_15_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_15_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_15_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_15_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_15_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_15_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_15_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_15_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_15_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 5\nB: 12\nC: 3\nD: 9", "question": "What number of parked cars are to the front left of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 12\nC: 3\nD: 9", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_16_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_16_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_16_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_16_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_16_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_16_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_16_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_16_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_16_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_16_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_16_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_16_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: broken\nB: with rider\nC: new\nD: without rider", "question": "The bicycle to the front of me is in what status?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: broken\nB: with rider\nC: new\nD: without rider", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_17_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_17_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_17_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_17_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_17_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_17_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_17_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_17_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_17_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_17_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_17_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_17_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: no\nC: uncertain\nD: yes", "question": "There is a truck that is to the back right of the parked thing; is its status the same as the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: uncertain\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_18_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_18_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_18_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_18_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_18_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_18_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_18_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_18_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_18_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_18_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_18_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_18_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: unsure\nB: yes\nC: no\nD: maybe", "question": "There is a car that is to the front left of the motorcycle; is it the same status as the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: unsure\nB: yes\nC: no\nD: maybe", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_19_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_19_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_19_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_19_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_19_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_19_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_19_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_19_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_19_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_19_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_19_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_19_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no, but there is a stationary bus\nB: yes, the bus is moving\nC: yes, there is a bus in the frame\nD: no", "question": "There is a with rider motorcycle; are there any moving buss to the back right of it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no, but there is a stationary bus\nB: yes, the bus is moving\nC: yes, there is a bus in the frame\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_20_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_20_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_20_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_20_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_20_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_20_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_20_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_20_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_20_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_20_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_20_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_20_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: train\nB: trees\nC: bike\nD: car", "question": "The thing that is both to the back of the bus and the front left of me is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: train\nB: trees\nC: bike\nD: car", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_21_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_21_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_21_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_21_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_21_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_21_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_21_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_21_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_21_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_21_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_21_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_21_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: uncertain\nC: maybe\nD: yes", "question": "Are there any other things of the same status as the traffic cone to the front left of the with rider thing?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: uncertain\nC: maybe\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_22_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_22_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_22_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_22_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_22_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_22_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_22_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_22_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_22_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_22_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_22_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_22_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: uncertain\nB: no\nC: yes\nD: maybe", "question": "There is a truck that is to the back right of the bus; is it the same status as the car that is to the back right of the construction vehicle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: uncertain\nB: no\nC: yes\nD: maybe", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_23_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_23_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_23_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_23_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_23_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_23_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_23_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_23_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_23_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_23_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_23_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_23_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: sitting\nB: lying down\nC: moving\nD: standing", "question": "What is the status of the pedestrian to the front of the parked thing?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: sitting\nB: lying down\nC: moving\nD: standing", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_24_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_24_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_24_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_24_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_24_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_24_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_24_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_24_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_24_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_24_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_24_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_24_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: not sure\nB: yes\nC: maybe\nD: no", "question": "Are there any other things that in the same status as the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: not sure\nB: yes\nC: maybe\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_25_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_25_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_25_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_25_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_25_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_25_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_25_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_25_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_25_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_25_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_25_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_25_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: tree\nB: bicycle\nC: lamp post\nD: pedestrian", "question": "What is the thing that is to the front left of me and the back right of the moving bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: tree\nB: bicycle\nC: lamp post\nD: pedestrian", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_26_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_26_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_26_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_26_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_26_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_26_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_26_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_26_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_26_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_26_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_26_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_26_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: sold\nB: moving\nC: parked\nD: broken down", "question": "What is the status of the trailer?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: sold\nB: moving\nC: parked\nD: broken down", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_27_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_27_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_27_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_27_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_27_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_27_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_27_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_27_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_27_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_27_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_27_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_27_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: with rider\nB: being serviced\nC: missing\nD: on stand", "question": "What status is the motorcycle to the front left of the parked thing?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: with rider\nB: being serviced\nC: missing\nD: on stand", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_28_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_28_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_28_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_28_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_28_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_28_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_28_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_28_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_28_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_28_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_28_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_28_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: no\nC: sometimes\nD: yes", "question": "There is a bus; is its status the same as the bicycle to the back of the with rider motorcycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: sometimes\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_29_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_29_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_29_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_29_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_29_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_29_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_29_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_29_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_29_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_29_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_29_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_29_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: traffic light\nB: bus stop sign\nC: hydrant\nD: pedestrian", "question": "What is the standing pedestrian that is to the front left of the stopped bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: traffic light\nB: bus stop sign\nC: hydrant\nD: pedestrian", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_30_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_30_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_30_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_30_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_30_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_30_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_30_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_30_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_30_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_30_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_30_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_30_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: motorcycle\nB: truck\nC: bicycle\nD: car", "question": "What is the thing that is both to the back right of the stopped bus and the front left of the parked truck?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: motorcycle\nB: truck\nC: bicycle\nD: car", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_31_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_31_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_31_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_31_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_31_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_31_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_31_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_31_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_31_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_31_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_31_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_31_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: uncertain\nC: no\nD: yes", "question": "Are there any other construction vehicles of the same status as the car that is to the back of the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: uncertain\nC: no\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_32_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_32_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_32_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_32_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_32_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_32_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_32_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_32_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_32_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_32_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_32_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_32_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: airplane\nB: train\nC: car\nD: motorcycle", "question": "The with rider thing to the front left of the with rider bicycle is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: airplane\nB: train\nC: car\nD: motorcycle", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_33_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_33_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_33_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_33_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_33_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_33_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_33_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_33_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_33_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_33_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_33_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_33_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: pedestrian\nB: building\nC: bus\nD: car", "question": "What is the thing that is both to the back of the standing pedestrian and the front left of the parked thing?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: pedestrian\nB: building\nC: bus\nD: car", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_34_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_34_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_34_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_34_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_34_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_34_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_34_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_34_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_34_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_34_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_34_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_34_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: no\nC: unknown\nD: maybe", "question": "There is a car to the back right of the stopped bus; does it have the same status as the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: no\nC: unknown\nD: maybe", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_35_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_35_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_35_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_35_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_35_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_35_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_35_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_35_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_35_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_35_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_35_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_35_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: moving\nB: stopped\nC: under maintenance\nD: idle", "question": "What is the status of the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: stopped\nC: under maintenance\nD: idle", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_36_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_36_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_36_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_36_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_36_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_36_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_36_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_36_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_36_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_36_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_36_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_36_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: no\nC: maybe\nD: sometimes", "question": "Are there any trailers to the front right of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: no\nC: maybe\nD: sometimes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_37_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_37_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_37_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_37_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_37_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_37_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_37_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_37_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_37_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_37_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_37_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_37_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: maybe\nC: no\nD: uncertain", "question": "Is the status of the truck to the front left of the without rider motorcycle the same as the construction vehicle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: no\nD: uncertain", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_38_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_38_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_38_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_38_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_38_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_38_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_38_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_38_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_38_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_38_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_38_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_38_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: parked\nB: reversing\nC: broken down\nD: moving", "question": "What status is the car that is to the front of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: parked\nB: reversing\nC: broken down\nD: moving", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_39_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_39_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_39_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_39_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_39_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_39_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_39_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_39_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_39_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_39_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_39_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_39_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: maybe\nC: sometimes\nD: no", "question": "Are there any other pedestrians of the same status as the trailer?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: sometimes\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_40_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_40_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_40_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_40_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_40_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_40_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_40_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_40_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_40_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_40_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_40_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_40_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: no\nC: maybe\nD: not sure", "question": "Are there any other things that in the same status as the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: no\nC: maybe\nD: not sure", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_41_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_41_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_41_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_41_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_41_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_41_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_41_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_41_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_41_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_41_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_41_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_41_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 1\nB: 0\nC: 3\nD: 2", "question": "How many other motorcycles in the same status as the car that is to the back of the moving bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 1\nB: 0\nC: 3\nD: 2", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_42_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_42_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_42_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_42_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_42_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_42_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_42_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_42_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_42_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_42_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_42_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_42_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: no\nC: not sure\nD: yes", "question": "There is a parked truck; are there any moving pedestrians to the front left of it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: not sure\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_43_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_43_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_43_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_43_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_43_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_43_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_43_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_43_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_43_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_43_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_43_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_43_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: under maintenance\nB: moving\nC: parked\nD: stopped", "question": "There is a truck to the front left of the stopped construction vehicle; what status is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: under maintenance\nB: moving\nC: parked\nD: stopped", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_44_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_44_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_44_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_44_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_44_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_44_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_44_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_44_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_44_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_44_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_44_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_44_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: bicycle\nB: car\nC: motorcycle\nD: tricycle", "question": "There is a with rider thing that is to the front left of the bicycle; what is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: car\nC: motorcycle\nD: tricycle", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_45_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_45_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_45_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_45_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_45_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_45_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_45_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_45_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_45_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_45_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_45_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_45_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: yes\nC: no\nD: I can't tell", "question": "Are there any motorcycles to the front right of the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: yes\nC: no\nD: I can't tell", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_46_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_46_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_46_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_46_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_46_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_46_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_46_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_46_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_46_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_46_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_46_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_46_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: bus\nB: car\nC: bike\nD: train", "question": "What is the stopped thing?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bus\nB: car\nC: bike\nD: train", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_47_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_47_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_47_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_47_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_47_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_47_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_47_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_47_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_47_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_47_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_47_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_47_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: pedestrian\nB: car\nC: bike\nD: tree", "question": "The standing pedestrian that is to the front left of me is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: pedestrian\nB: car\nC: bike\nD: tree", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_48_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_48_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_48_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_48_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_48_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_48_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_48_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_48_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_48_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_48_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_48_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_48_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 2\nB: 5\nC: 3\nD: 1", "question": "How many other things are in the same status as the bus that is to the back right of the moving bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 2\nB: 5\nC: 3\nD: 1", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_49_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_49_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_49_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_49_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_49_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_49_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_49_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_49_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_49_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_49_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_49_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_49_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: standing\nB: moving\nC: lying down\nD: sitting", "question": "What is the status of the pedestrian that is to the front of the traffic cone?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: standing\nB: moving\nC: lying down\nD: sitting", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_50_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_50_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_50_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_50_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_50_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_50_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_50_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_50_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_50_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_50_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_50_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_50_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: no\nC: yes\nD: unknown", "question": "There is a motorcycle to the back right of the parked thing; does it have the same status as the bicycle that is to the back right of the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: unknown", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_51_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_51_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_51_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_51_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_51_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_51_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_51_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_51_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_51_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_51_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_51_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_51_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: possibly\nC: no\nD: yes", "question": "Are there any buss to the front right of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: possibly\nC: no\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_52_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_52_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_52_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_52_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_52_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_52_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_52_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_52_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_52_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_52_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_52_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_52_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: parked\nB: moving\nC: under maintenance\nD: stopping", "question": "The bus is in what status?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: parked\nB: moving\nC: under maintenance\nD: stopping", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_53_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_53_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_53_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_53_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_53_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_53_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_53_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_53_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_53_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_53_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_53_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_53_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: parked\nB: broken down\nC: moving\nD: stopped", "question": "The bus that is to the front of me is in what status?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: parked\nB: broken down\nC: moving\nD: stopped", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_54_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_54_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_54_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_54_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_54_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_54_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_54_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_54_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_54_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_54_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_54_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_54_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: bicycle\nB: car\nC: motorcycle\nD: pedestrian", "question": "The thing that is both to the back of the stopped bus and the back right of me is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: car\nC: motorcycle\nD: pedestrian", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_55_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_55_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_55_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_55_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_55_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_55_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_55_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_55_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_55_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_55_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_55_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_55_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: maybe\nC: uncertain\nD: no", "question": "There is a with rider thing; are there any parked cars to the back of it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: uncertain\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_56_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_56_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_56_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_56_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_56_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_56_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_56_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_56_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_56_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_56_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_56_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_56_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: sometimes\nB: no\nC: yes\nD: uncertain", "question": "Does the bicycle have the same status as the thing that is to the back right of the construction vehicle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: sometimes\nB: no\nC: yes\nD: uncertain", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_57_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_57_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_57_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_57_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_57_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_57_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_57_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_57_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_57_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_57_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_57_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_57_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: only on weekends\nB: no\nC: sometimes\nD: yes", "question": "Are any without rider things visible?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: only on weekends\nB: no\nC: sometimes\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_58_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_58_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_58_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_58_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_58_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_58_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_58_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_58_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_58_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_58_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_58_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_58_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: unknown\nC: no\nD: yes", "question": "Is the status of the bicycle the same as the truck that is to the back of the without rider motorcycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: unknown\nC: no\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_59_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_59_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_59_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_59_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_59_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_59_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_59_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_59_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_59_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_59_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_59_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_59_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: not applicable\nB: no\nC: uncertain\nD: yes", "question": "Are there any other buss that in the same status as the motorcycle to the back right of the moving bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: not applicable\nB: no\nC: uncertain\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_60_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_60_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_60_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_60_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_60_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_60_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_60_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_60_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_60_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_60_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_60_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_60_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 9\nB: 5\nC: 12\nD: 7", "question": "What number of cars are to the back right of the trailer?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 9\nB: 5\nC: 12\nD: 7", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_61_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_61_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_61_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_61_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_61_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_61_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_61_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_61_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_61_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_61_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_61_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_61_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 3\nB: 0\nC: 1\nD: 2", "question": "What number of other things are there of the same status as the bicycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 3\nB: 0\nC: 1\nD: 2", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_62_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_62_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_62_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_62_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_62_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_62_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_62_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_62_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_62_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_62_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_62_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_62_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: with rider\nB: on the ground\nC: in repair\nD: broken", "question": "What is the status of the bicycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: with rider\nB: on the ground\nC: in repair\nD: broken", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_63_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_63_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_63_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_63_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_63_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_63_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_63_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_63_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_63_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_63_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_63_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_63_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: unsure\nB: no\nC: maybe\nD: yes", "question": "Are there any other bicycles of the same status as the car to the front left of the parked trailer?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: unsure\nB: no\nC: maybe\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_64_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_64_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_64_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_64_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_64_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_64_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_64_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_64_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_64_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_64_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_64_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_64_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: being repaired\nB: moving\nC: stopped\nD: parked", "question": "The construction vehicle is in what status?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: being repaired\nB: moving\nC: stopped\nD: parked", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_65_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_65_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_65_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_65_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_65_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_65_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_65_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_65_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_65_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_65_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_65_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_65_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: building\nB: bicycle\nC: tree\nD: car", "question": "The thing that is both to the front left of the construction vehicle and the back of me is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: building\nB: bicycle\nC: tree\nD: car", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_66_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_66_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_66_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_66_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_66_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_66_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_66_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_66_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_66_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_66_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_66_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_66_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 4\nB: 2\nC: 0\nD: 1", "question": "What number of moving cars are to the front left of the moving bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 4\nB: 2\nC: 0\nD: 1", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_67_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_67_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_67_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_67_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_67_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_67_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_67_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_67_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_67_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_67_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_67_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_67_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 9\nB: 5\nC: 12\nD: 7", "question": "How many cars are to the back of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 9\nB: 5\nC: 12\nD: 7", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_68_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_68_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_68_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_68_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_68_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_68_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_68_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_68_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_68_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_68_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_68_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_68_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: sometimes\nB: no\nC: only during peak hours\nD: yes", "question": "Are there any moving trailers?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: sometimes\nB: no\nC: only during peak hours\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_69_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_69_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_69_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_69_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_69_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_69_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_69_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_69_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_69_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_69_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_69_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_69_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 6\nB: 12\nC: 9\nD: 3", "question": "What number of cars are there?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 6\nB: 12\nC: 9\nD: 3", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_70_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_70_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_70_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_70_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_70_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_70_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_70_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_70_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_70_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_70_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_70_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_70_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: unknown\nB: yes\nC: no\nD: maybe", "question": "Is the status of the car that is to the front of the trailer the same as the trailer?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: unknown\nB: yes\nC: no\nD: maybe", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_71_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_71_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_71_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_71_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_71_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_71_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_71_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_71_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_71_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_71_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_71_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_71_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: uncertain\nC: no\nD: maybe", "question": "There is a construction vehicle that is to the front left of the parked truck; does it have the same status as the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: uncertain\nC: no\nD: maybe", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_72_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_72_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_72_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_72_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_72_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_72_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_72_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_72_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_72_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_72_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_72_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_72_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: stopped\nB: moving\nC: broken down\nD: cancelled", "question": "What is the status of the bus to the back right of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: stopped\nB: moving\nC: broken down\nD: cancelled", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_73_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_73_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_73_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_73_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_73_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_73_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_73_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_73_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_73_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_73_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_73_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_73_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 4\nB: 9\nC: 7\nD: 12", "question": "How many other things in the same status as the thing that is to the front of the with rider bicycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 4\nB: 9\nC: 7\nD: 12", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_74_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_74_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_74_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_74_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_74_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_74_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_74_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_74_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_74_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_74_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_74_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_74_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: maybe\nC: yes\nD: unsure", "question": "Are there any other buss of the same status as the construction vehicle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: yes\nD: unsure", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_75_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_75_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_75_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_75_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_75_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_75_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_75_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_75_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_75_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_75_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_75_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_75_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: sometimes\nC: maybe\nD: no", "question": "There is a construction vehicle; is its status the same as the bus that is to the front of the truck?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: sometimes\nC: maybe\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_76_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_76_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_76_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_76_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_76_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_76_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_76_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_76_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_76_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_76_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_76_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_76_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: tree\nB: car\nC: bicycle\nD: bench", "question": "The thing that is both to the back right of the trailer and the back right of me is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: tree\nB: car\nC: bicycle\nD: bench", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_77_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_77_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_77_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_77_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_77_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_77_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_77_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_77_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_77_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_77_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_77_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_77_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 9\nB: 7\nC: 5\nD: 12", "question": "What number of other things in the same status as the car that is to the front left of the motorcycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 9\nB: 7\nC: 5\nD: 12", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_78_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_78_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_78_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_78_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_78_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_78_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_78_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_78_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_78_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_78_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_78_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_78_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 10\nB: 3\nC: 1\nD: 5", "question": "What number of other things are in the same status as the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 10\nB: 3\nC: 1\nD: 5", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_79_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_79_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_79_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_79_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_79_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_79_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_79_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_79_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_79_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_79_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_79_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_79_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: bench\nB: pedestrian\nC: tree\nD: bicycle", "question": "The thing that is both to the back right of the stopped bus and the back right of me is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bench\nB: pedestrian\nC: tree\nD: bicycle", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_80_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_80_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_80_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_80_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_80_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_80_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_80_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_80_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_80_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_80_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_80_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_80_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 5\nB: 3\nC: 7\nD: 2", "question": "What number of other things are in the same status as the construction vehicle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 3\nC: 7\nD: 2", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_81_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_81_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_81_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_81_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_81_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_81_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_81_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_81_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_81_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_81_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_81_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_81_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 3\nB: 5\nC: 2\nD: 0", "question": "What number of moving buss are to the front right of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 3\nB: 5\nC: 2\nD: 0", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_82_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_82_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_82_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_82_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_82_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_82_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_82_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_82_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_82_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_82_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_82_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_82_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: motorcycle\nB: bicycle\nC: trolley\nD: car", "question": "The without rider thing that is to the front left of me is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: motorcycle\nB: bicycle\nC: trolley\nD: car", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_83_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_83_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_83_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_83_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_83_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_83_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_83_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_83_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_83_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_83_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_83_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_83_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: with rider\nB: in front of the bus\nC: without rider\nD: parked", "question": "What status is the motorcycle to the back of the moving bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: with rider\nB: in front of the bus\nC: without rider\nD: parked", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_84_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_84_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_84_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_84_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_84_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_84_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_84_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_84_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_84_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_84_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_84_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_84_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: scooter\nB: rollerblades\nC: motorcycle\nD: bicycle", "question": "The with rider thing to the back right of the moving bus is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: scooter\nB: rollerblades\nC: motorcycle\nD: bicycle", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_85_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_85_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_85_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_85_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_85_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_85_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_85_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_85_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_85_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_85_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_85_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_85_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: bicycle\nB: dog\nC: pedestrian\nD: car", "question": "What is the moving thing that is both to the back right of the motorcycle and the front of the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: dog\nC: pedestrian\nD: car", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_86_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_86_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_86_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_86_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_86_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_86_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_86_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_86_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_86_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_86_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_86_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_86_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: uncertain\nB: maybe\nC: yes\nD: no", "question": "Is there another car that has the same status as the thing that is to the front left of the with rider thing?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: uncertain\nB: maybe\nC: yes\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_87_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_87_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_87_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_87_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_87_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_87_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_87_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_87_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_87_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_87_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_87_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_87_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: without rider\nB: parked\nC: with rider\nD: damaged", "question": "What status is the motorcycle to the back of the moving bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: without rider\nB: parked\nC: with rider\nD: damaged", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_88_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_88_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_88_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_88_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_88_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_88_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_88_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_88_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_88_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_88_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_88_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_88_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 7\nB: 3\nC: 6\nD: 4", "question": "There is a stopped bus; what number of moving things are to the front left of it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 7\nB: 3\nC: 6\nD: 4", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_89_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_89_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_89_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_89_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_89_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_89_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_89_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_89_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_89_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_89_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_89_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_89_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: turning\nB: stopped\nC: broken down\nD: moving", "question": "There is a truck to the back right of the moving truck; what status is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: turning\nB: stopped\nC: broken down\nD: moving", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_90_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_90_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_90_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_90_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_90_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_90_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_90_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_90_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_90_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_90_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_90_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_90_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: maybe\nC: sometimes\nD: yes", "question": "Are there any cars?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: sometimes\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_91_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_91_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_91_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_91_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_91_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_91_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_91_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_91_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_91_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_91_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_91_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_91_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: moving\nB: stationary\nC: parked\nD: broken down", "question": "There is a car to the back right of me; what status is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: stationary\nC: parked\nD: broken down", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_92_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_92_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_92_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_92_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_92_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_92_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_92_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_92_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_92_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_92_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_92_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_92_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: unknown\nB: yes\nC: maybe\nD: no", "question": "There is a construction vehicle; does it have the same status as the car to the back right of the construction vehicle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: unknown\nB: yes\nC: maybe\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_93_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_93_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_93_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_93_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_93_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_93_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_93_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_93_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_93_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_93_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_93_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_93_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: motorcycle\nB: bicycle\nC: truck\nD: car", "question": "What is the moving thing that is both to the back right of the bus and the front left of the with rider bicycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: motorcycle\nB: bicycle\nC: truck\nD: car", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_94_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_94_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_94_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_94_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_94_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_94_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_94_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_94_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_94_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_94_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_94_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_94_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: sometimes\nC: yes\nD: probably not", "question": "Are there any barriers?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: sometimes\nC: yes\nD: probably not", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_95_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_95_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_95_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_95_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_95_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_95_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_95_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_95_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_95_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_95_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_95_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_95_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 7\nB: 10\nC: 3\nD: 5", "question": "How many moving pedestrians are there?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 7\nB: 10\nC: 3\nD: 5", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_96_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_96_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_96_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_96_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_96_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_96_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_96_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_96_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_96_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_96_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_96_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_96_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: yes\nC: there is a rider without a car\nD: maybe", "question": "There is a with rider thing; are there any stopped cars to the back left of it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: yes\nC: there is a rider without a car\nD: maybe", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_97_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_97_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_97_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_97_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_97_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_97_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_97_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_97_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_97_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_97_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_97_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_97_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: not sure\nC: maybe\nD: no", "question": "Is there another car of the same status as the pedestrian to the front of the with rider thing?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: not sure\nC: maybe\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_98_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_98_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_98_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_98_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_98_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_98_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_98_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_98_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_98_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_98_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_98_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_98_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: not sure\nB: no\nC: maybe\nD: yes", "question": "Are there any moving buss?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: not sure\nB: no\nC: maybe\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_99_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_99_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_99_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_99_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_99_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_99_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_99_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_99_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_99_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_99_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_99_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_99_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: moving\nB: under maintenance\nC: being loaded\nD: parked", "question": "The construction vehicle is in what status?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: under maintenance\nC: being loaded\nD: parked", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_100_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_100_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_100_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_100_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_100_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_100_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_100_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_100_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_100_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_100_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_100_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_100_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: delayed\nB: stopped\nC: broken down\nD: moving", "question": "The bus is in what status?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: delayed\nB: stopped\nC: broken down\nD: moving", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_101_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_101_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_101_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_101_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_101_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_101_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_101_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_101_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_101_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_101_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_101_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_101_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: lost\nB: moving\nC: stopped\nD: waiting", "question": "There is a pedestrian that is to the front left of the stopped bus; what status is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: lost\nB: moving\nC: stopped\nD: waiting", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_102_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_102_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_102_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_102_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_102_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_102_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_102_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_102_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_102_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_102_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_102_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_102_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: maybe\nC: not sure\nD: no", "question": "There is a bus; is it the same status as the thing that is to the back of the moving bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: not sure\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_103_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_103_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_103_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_103_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_103_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_103_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_103_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_103_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_103_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_103_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_103_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_103_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: parked\nB: moving\nC: stopped\nD: overturned", "question": "What is the status of the construction vehicle to the front of the bicycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: parked\nB: moving\nC: stopped\nD: overturned", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_104_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_104_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_104_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_104_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_104_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_104_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_104_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_104_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_104_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_104_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_104_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_104_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: sometimes\nC: no\nD: maybe", "question": "Are any things visible?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: sometimes\nC: no\nD: maybe", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_105_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_105_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_105_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_105_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_105_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_105_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_105_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_105_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_105_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_105_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_105_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_105_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: I don't know\nC: maybe\nD: no", "question": "Are there any things to the front of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: I don't know\nC: maybe\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_106_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_106_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_106_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_106_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_106_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_106_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_106_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_106_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_106_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_106_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_106_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_106_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: bicycle\nB: bus\nC: train\nD: plane", "question": "There is a stopped thing that is to the front of me; what is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: bus\nC: train\nD: plane", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_107_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_107_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_107_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_107_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_107_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_107_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_107_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_107_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_107_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_107_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_107_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_107_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: uncertain\nB: no\nC: yes\nD: maybe", "question": "Are there any other things that in the same status as the car to the front left of the barrier?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: uncertain\nB: no\nC: yes\nD: maybe", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_108_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_108_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_108_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_108_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_108_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_108_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_108_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_108_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_108_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_108_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_108_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_108_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: no\nC: yes\nD: possibly", "question": "Does the car that is to the front left of the moving truck have the same status as the motorcycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: possibly", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_109_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_109_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_109_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_109_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_109_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_109_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_109_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_109_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_109_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_109_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_109_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_109_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: moving\nB: under maintenance\nC: accelerating\nD: stopped", "question": "The construction vehicle is in what status?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: under maintenance\nC: accelerating\nD: stopped", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_110_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_110_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_110_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_110_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_110_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_110_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_110_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_110_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_110_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_110_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_110_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_110_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: moving\nB: under maintenance\nC: delayed\nD: stopped", "question": "The bus is in what status?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: under maintenance\nC: delayed\nD: stopped", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_111_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_111_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_111_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_111_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_111_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_111_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_111_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_111_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_111_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_111_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_111_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_111_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: no\nC: yes\nD: uncertain", "question": "There is a car to the front left of the bicycle; is its status the same as the truck to the back right of the moving bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: uncertain", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_112_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_112_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_112_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_112_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_112_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_112_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_112_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_112_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_112_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_112_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_112_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_112_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: yes\nC: no\nD: uncertain", "question": "Is there another car that has the same status as the motorcycle to the back of the moving bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: yes\nC: no\nD: uncertain", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_113_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_113_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_113_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_113_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_113_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_113_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_113_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_113_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_113_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_113_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_113_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_113_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: maybe\nC: sometimes\nD: yes", "question": "There is a car to the front of the construction vehicle; is its status the same as the construction vehicle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: sometimes\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_114_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_114_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_114_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_114_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_114_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_114_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_114_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_114_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_114_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_114_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_114_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_114_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: yes\nC: only one other thing\nD: uncertain", "question": "Are there any other things of the same status as the motorcycle that is to the front left of the parked thing?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: yes\nC: only one other thing\nD: uncertain", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_115_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_115_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_115_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_115_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_115_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_115_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_115_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_115_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_115_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_115_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_115_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_115_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: no\nC: yes\nD: unknown", "question": "Are there any cars to the back left of the construction vehicle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: unknown", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_116_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_116_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_116_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_116_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_116_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_116_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_116_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_116_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_116_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_116_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_116_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_116_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: broken down\nB: moving backward\nC: without rider\nD: with rider", "question": "There is a motorcycle that is to the back of me; what status is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: broken down\nB: moving backward\nC: without rider\nD: with rider", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_117_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_117_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_117_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_117_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_117_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_117_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_117_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_117_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_117_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_117_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_117_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_117_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 10\nB: 15\nC: 3\nD: 5", "question": "What number of cars are to the front left of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 10\nB: 15\nC: 3\nD: 5", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_118_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_118_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_118_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_118_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_118_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_118_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_118_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_118_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_118_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_118_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_118_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_118_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: broken\nB: without rider\nC: missing\nD: with rider", "question": "There is a thing that is to the front left of me; what status is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: broken\nB: without rider\nC: missing\nD: with rider", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_119_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_119_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_119_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_119_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_119_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_119_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_119_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_119_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_119_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_119_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_119_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_119_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: sitting\nB: standing\nC: running\nD: walking", "question": "What status is the pedestrian that is to the back right of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: sitting\nB: standing\nC: running\nD: walking", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_120_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_120_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_120_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_120_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_120_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_120_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_120_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_120_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_120_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_120_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_120_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_120_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: not sure\nB: maybe\nC: yes\nD: no", "question": "Does the truck to the back of the bus have the same status as the construction vehicle that is to the back right of the with rider thing?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: not sure\nB: maybe\nC: yes\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_121_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_121_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_121_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_121_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_121_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_121_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_121_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_121_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_121_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_121_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_121_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_121_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: tree\nB: mailbox\nC: sidewalk\nD: traffic cone", "question": "The thing that is to the back of the moving car and the front left of me is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: tree\nB: mailbox\nC: sidewalk\nD: traffic cone", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_122_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_122_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_122_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_122_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_122_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_122_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_122_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_122_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_122_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_122_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_122_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_122_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: unknown\nB: no\nC: yes\nD: maybe", "question": "Do the thing that is to the front of the stopped car and the pedestrian that is to the front left of the stopped car have the same status?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: unknown\nB: no\nC: yes\nD: maybe", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_123_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_123_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_123_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_123_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_123_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_123_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_123_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_123_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_123_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_123_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_123_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_123_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: unknown\nB: no\nC: maybe\nD: yes", "question": "Are there any other pedestrians of the same status as the bus that is to the front left of the stopped bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: unknown\nB: no\nC: maybe\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_124_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_124_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_124_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_124_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_124_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_124_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_124_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_124_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_124_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_124_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_124_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_124_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: broken down\nB: being repaired\nC: without rider\nD: with rider", "question": "What is the status of the motorcycle that is to the front left of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: broken down\nB: being repaired\nC: without rider\nD: with rider", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_125_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_125_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_125_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_125_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_125_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_125_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_125_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_125_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_125_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_125_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_125_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_125_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: moving\nB: departing\nC: stopped\nD: arriving", "question": "What is the status of the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: departing\nC: stopped\nD: arriving", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_126_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_126_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_126_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_126_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_126_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_126_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_126_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_126_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_126_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_126_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_126_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_126_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: there are people\nB: yes\nC: a car\nD: no", "question": "Are there any moving things to the back left of the with rider bicycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: there are people\nB: yes\nC: a car\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_127_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_127_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_127_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_127_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_127_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_127_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_127_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_127_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_127_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_127_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_127_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_127_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: absent\nB: moving\nC: dangerous\nD: stationary", "question": "There is a pedestrian; what status is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: absent\nB: moving\nC: dangerous\nD: stationary", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_128_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_128_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_128_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_128_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_128_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_128_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_128_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_128_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_128_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_128_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_128_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_128_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: some\nB: yes\nC: maybe\nD: no", "question": "There is a moving truck; are there any trucks to the front of it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: some\nB: yes\nC: maybe\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_129_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_129_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_129_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_129_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_129_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_129_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_129_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_129_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_129_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_129_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_129_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_129_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: train\nB: bicycle\nC: car\nD: pedestrian", "question": "The stopped thing to the back of the stopped bus is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: train\nB: bicycle\nC: car\nD: pedestrian", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_130_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_130_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_130_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_130_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_130_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_130_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_130_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_130_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_130_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_130_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_130_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_130_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: not sure\nC: cannot tell\nD: yes", "question": "Are any with rider motorcycles visible?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: not sure\nC: cannot tell\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_131_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_131_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_131_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_131_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_131_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_131_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_131_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_131_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_131_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_131_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_131_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_131_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 6\nB: 4\nC: 3\nD: 2", "question": "How many things are to the front of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 6\nB: 4\nC: 3\nD: 2", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_132_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_132_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_132_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_132_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_132_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_132_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_132_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_132_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_132_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_132_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_132_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_132_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 5\nB: 2\nC: 4\nD: 7", "question": "What number of moving things are there?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 2\nC: 4\nD: 7", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_133_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_133_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_133_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_133_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_133_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_133_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_133_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_133_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_133_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_133_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_133_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_133_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: yes\nC: not sure\nD: maybe", "question": "Are there any barriers?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: yes\nC: not sure\nD: maybe", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_134_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_134_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_134_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_134_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_134_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_134_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_134_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_134_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_134_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_134_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_134_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_134_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: yes\nC: not sure\nD: no", "question": "Is the status of the truck that is to the front left of the moving car the same as the trailer?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: yes\nC: not sure\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_135_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_135_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_135_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_135_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_135_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_135_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_135_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_135_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_135_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_135_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_135_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_135_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: pedestrian\nB: tree\nC: bicycle\nD: traffic light", "question": "What is the thing that is both to the back right of the moving bus and the back right of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: pedestrian\nB: tree\nC: bicycle\nD: traffic light", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_136_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_136_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_136_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_136_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_136_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_136_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_136_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_136_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_136_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_136_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_136_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_136_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: pedestrian\nB: bicycle\nC: tree\nD: car", "question": "The moving thing that is both to the back of me and the front of the with rider motorcycle is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: pedestrian\nB: bicycle\nC: tree\nD: car", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_137_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_137_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_137_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_137_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_137_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_137_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_137_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_137_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_137_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_137_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_137_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_137_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: maybe\nC: not sure\nD: yes", "question": "There is a bus to the front of the parked construction vehicle; is it the same status as the thing that is to the back of the moving truck?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: not sure\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_138_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_138_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_138_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_138_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_138_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_138_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_138_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_138_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_138_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_138_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_138_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_138_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: truck\nB: car\nC: scooter\nD: bus", "question": "What is the stopped thing that is both to the front left of the with rider motorcycle and the back of the with rider bicycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: truck\nB: car\nC: scooter\nD: bus", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_139_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_139_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_139_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_139_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_139_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_139_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_139_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_139_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_139_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_139_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_139_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_139_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: I don\u2019t know\nC: no\nD: maybe", "question": "There is a construction vehicle to the back right of the bus; is it the same status as the motorcycle that is to the back of the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: I don\u2019t know\nC: no\nD: maybe", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_140_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_140_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_140_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_140_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_140_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_140_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_140_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_140_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_140_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_140_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_140_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_140_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: maybe\nC: yes\nD: not sure", "question": "Are there any other things that in the same status as the pedestrian to the back right of the stopped bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: yes\nD: not sure", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_141_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_141_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_141_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_141_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_141_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_141_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_141_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_141_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_141_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_141_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_141_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_141_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: barrier\nB: fire hydrant\nC: tree\nD: light pole", "question": "The thing that is to the back right of the moving bus is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: barrier\nB: fire hydrant\nC: tree\nD: light pole", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_142_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_142_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_142_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_142_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_142_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_142_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_142_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_142_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_142_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_142_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_142_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_142_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 3\nB: 5\nC: 9\nD: 12", "question": "How many moving things are there?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 3\nB: 5\nC: 9\nD: 12", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_143_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_143_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_143_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_143_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_143_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_143_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_143_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_143_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_143_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_143_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_143_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_143_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: car\nB: tree\nC: bench\nD: bicycle", "question": "What is the moving thing that is to the front left of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: car\nB: tree\nC: bench\nD: bicycle", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_144_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_144_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_144_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_144_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_144_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_144_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_144_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_144_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_144_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_144_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_144_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_144_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 7\nB: 12\nC: 5\nD: 9", "question": "What number of other things in the same status as the car that is to the back right of the parked thing?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 7\nB: 12\nC: 5\nD: 9", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_145_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_145_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_145_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_145_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_145_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_145_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_145_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_145_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_145_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_145_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_145_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_145_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: banana\nB: car\nC: running water\nD: flying bird", "question": "There is a stopped thing; what is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: banana\nB: car\nC: running water\nD: flying bird", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_146_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_146_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_146_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_146_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_146_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_146_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_146_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_146_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_146_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_146_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_146_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_146_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: maybe\nC: cannot determine\nD: no", "question": "There is a motorcycle; does it have the same status as the car that is to the front left of the with rider thing?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: cannot determine\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_147_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_147_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_147_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_147_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_147_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_147_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_147_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_147_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_147_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_147_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_147_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_147_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: bicycle\nB: pedestrian\nC: crosswalk\nD: traffic light", "question": "There is a standing pedestrian to the front left of me; what is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: pedestrian\nC: crosswalk\nD: traffic light", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_148_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_148_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_148_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_148_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_148_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_148_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_148_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_148_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_148_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_148_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_148_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_148_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 5\nB: 7\nC: 3\nD: 10", "question": "What number of motorcycles are there?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 7\nC: 3\nD: 10", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_149_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_149_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_149_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_149_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_149_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_149_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_149_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_149_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_149_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_149_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_149_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_149_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: yes\nC: uncertain\nD: probably", "question": "Is there another construction vehicle of the same status as the truck that is to the front of the moving truck?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: yes\nC: uncertain\nD: probably", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_150_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_150_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_150_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_150_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_150_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_150_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_150_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_150_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_150_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_150_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_150_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_150_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: maybe\nC: no\nD: sometimes", "question": "Are there any moving buss?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: maybe\nC: no\nD: sometimes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_151_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_151_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_151_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_151_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_151_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_151_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_151_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_151_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_151_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_151_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_151_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_151_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: lane divider\nB: barrier\nC: tree\nD: cone", "question": "The thing that is to the back right of me and the back right of the construction vehicle is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: lane divider\nB: barrier\nC: tree\nD: cone", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_152_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_152_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_152_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_152_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_152_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_152_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_152_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_152_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_152_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_152_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_152_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_152_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: stationary\nB: disappearing\nC: transforming\nD: moving", "question": "There is a thing that is to the front left of me; what status is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: stationary\nB: disappearing\nC: transforming\nD: moving", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_153_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_153_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_153_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_153_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_153_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_153_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_153_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_153_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_153_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_153_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_153_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_153_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: no\nC: not sure\nD: maybe", "question": "There is a bus to the front left of the stopped bus; is it the same status as the motorcycle to the back of the moving bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: no\nC: not sure\nD: maybe", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_154_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_154_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_154_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_154_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_154_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_154_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_154_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_154_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_154_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_154_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_154_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_154_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: no\nC: maybe\nD: uncertain", "question": "There is a truck; is it the same status as the car to the back right of the stopped truck?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: no\nC: maybe\nD: uncertain", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_155_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_155_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_155_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_155_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_155_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_155_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_155_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_155_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_155_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_155_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_155_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_155_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: traffic cone\nB: tree\nC: hydrant\nD: bench", "question": "What is the thing that is both to the back right of the parked construction vehicle and the front left of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: traffic cone\nB: tree\nC: hydrant\nD: bench", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_156_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_156_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_156_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_156_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_156_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_156_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_156_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_156_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_156_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_156_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_156_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_156_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: stopped\nB: departed\nC: moving\nD: full", "question": "What is the status of the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: stopped\nB: departed\nC: moving\nD: full", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_157_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_157_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_157_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_157_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_157_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_157_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_157_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_157_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_157_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_157_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_157_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_157_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: pedestrian\nB: trash can\nC: tree\nD: motorcycle", "question": "The moving thing that is to the front left of the moving bus and the back of me is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: pedestrian\nB: trash can\nC: tree\nD: motorcycle", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_158_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_158_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_158_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_158_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_158_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_158_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_158_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_158_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_158_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_158_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_158_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_158_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: bicycle\nB: bench\nC: car\nD: tree", "question": "The thing that is to the back of me and the front left of the parked construction vehicle is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: bench\nC: car\nD: tree", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_159_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_159_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_159_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_159_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_159_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_159_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_159_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_159_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_159_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_159_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_159_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_159_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: parked\nB: moving\nC: accelerating\nD: stopped", "question": "What is the status of the car that is to the back of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: parked\nB: moving\nC: accelerating\nD: stopped", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_160_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_160_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_160_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_160_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_160_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_160_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_160_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_160_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_160_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_160_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_160_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_160_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: turning\nB: disappearing\nC: stopped\nD: moving", "question": "There is a bus that is to the front of me; what status is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: turning\nB: disappearing\nC: stopped\nD: moving", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_161_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_161_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_161_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_161_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_161_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_161_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_161_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_161_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_161_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_161_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_161_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_161_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 5\nB: 3\nC: 8\nD: 0", "question": "There is a bus; how many things are to the front right of it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 3\nC: 8\nD: 0", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_162_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_162_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_162_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_162_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_162_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_162_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_162_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_162_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_162_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_162_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_162_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_162_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: stopped\nB: waiting for passengers\nC: moving\nD: broken down", "question": "What is the status of the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: stopped\nB: waiting for passengers\nC: moving\nD: broken down", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_163_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_163_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_163_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_163_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_163_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_163_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_163_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_163_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_163_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_163_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_163_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_163_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: sometimes\nC: no\nD: yes", "question": "Are there any not standing pedestrians?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: sometimes\nC: no\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_164_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_164_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_164_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_164_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_164_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_164_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_164_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_164_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_164_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_164_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_164_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_164_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: unable to determine\nC: no\nD: yes", "question": "Is the status of the truck that is to the front left of the moving car the same as the trailer?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: unable to determine\nC: no\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_165_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_165_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_165_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_165_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_165_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_165_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_165_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_165_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_165_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_165_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_165_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_165_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: car\nB: bicycle\nC: dog\nD: pedestrian", "question": "What is the moving thing to the back right of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: car\nB: bicycle\nC: dog\nD: pedestrian", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_166_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_166_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_166_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_166_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_166_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_166_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_166_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_166_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_166_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_166_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_166_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_166_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: no\nC: yes\nD: uncertain", "question": "Is there another bus that has the same status as the car that is to the front left of the stopped construction vehicle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: uncertain", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_167_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_167_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_167_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_167_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_167_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_167_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_167_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_167_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_167_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_167_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_167_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_167_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: moving swiftly\nB: stopped\nC: being repaired\nD: broken down", "question": "The bus that is to the back right of the moving bus is in what status?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving swiftly\nB: stopped\nC: being repaired\nD: broken down", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_168_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_168_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_168_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_168_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_168_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_168_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_168_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_168_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_168_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_168_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_168_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_168_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 5\nB: 10\nC: 7\nD: 3", "question": "What number of trucks are there?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 10\nC: 7\nD: 3", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_169_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_169_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_169_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_169_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_169_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_169_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_169_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_169_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_169_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_169_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_169_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_169_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: bicycle\nB: tree\nC: car\nD: building", "question": "The moving thing that is to the front of the construction vehicle and the front left of me is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: tree\nC: car\nD: building", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_170_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_170_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_170_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_170_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_170_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_170_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_170_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_170_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_170_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_170_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_170_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_170_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: car\nB: traffic light\nC: bicycle\nD: pedestrian", "question": "What is the thing that is both to the front left of the stopped bus and the back of the with rider thing?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: car\nB: traffic light\nC: bicycle\nD: pedestrian", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_171_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_171_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_171_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_171_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_171_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_171_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_171_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_171_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_171_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_171_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_171_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_171_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: moving\nB: missing\nC: broken down\nD: parked", "question": "What status is the truck to the back right of the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: missing\nC: broken down\nD: parked", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_172_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_172_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_172_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_172_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_172_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_172_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_172_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_172_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_172_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_172_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_172_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_172_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: locked up\nB: in transit\nC: damaged\nD: with rider", "question": "What is the status of the bicycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: locked up\nB: in transit\nC: damaged\nD: with rider", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_173_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_173_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_173_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_173_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_173_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_173_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_173_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_173_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_173_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_173_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_173_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_173_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 1\nB: 5\nC: 10\nD: 3", "question": "What number of stopped trucks are there?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 1\nB: 5\nC: 10\nD: 3", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_174_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_174_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_174_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_174_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_174_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_174_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_174_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_174_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_174_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_174_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_174_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_174_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: sometimes\nB: maybe\nC: yes\nD: no", "question": "Are there any moving buss to the back left of the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: sometimes\nB: maybe\nC: yes\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_175_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_175_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_175_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_175_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_175_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_175_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_175_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_175_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_175_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_175_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_175_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_175_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: moving\nB: broken down\nC: under maintenance\nD: parked", "question": "What is the status of the bus that is to the front left of the stopped bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: moving\nB: broken down\nC: under maintenance\nD: parked", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_176_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_176_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_176_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_176_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_176_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_176_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_176_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_176_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_176_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_176_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_176_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_176_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: bus\nB: bicycle\nC: car\nD: train", "question": "The moving thing that is both to the back right of the with rider motorcycle and the front of me is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bus\nB: bicycle\nC: car\nD: train", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_177_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_177_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_177_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_177_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_177_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_177_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_177_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_177_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_177_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_177_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_177_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_177_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: boat\nB: bicycle\nC: house\nD: car", "question": "There is a parked thing that is to the back of me; what is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: boat\nB: bicycle\nC: house\nD: car", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_178_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_178_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_178_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_178_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_178_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_178_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_178_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_178_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_178_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_178_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_178_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_178_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: possibly\nB: no\nC: maybe\nD: yes", "question": "Are there any not standing pedestrians?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: possibly\nB: no\nC: maybe\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_179_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_179_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_179_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_179_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_179_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_179_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_179_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_179_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_179_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_179_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_179_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_179_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: yes\nC: maybe\nD: uncertain", "question": "Are any stopped trucks visible?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: yes\nC: maybe\nD: uncertain", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_180_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_180_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_180_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_180_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_180_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_180_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_180_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_180_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_180_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_180_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_180_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_180_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 3\nB: 4\nC: 2\nD: 1", "question": "What number of things are to the back right of the motorcycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 3\nB: 4\nC: 2\nD: 1", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_181_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_181_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_181_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_181_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_181_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_181_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_181_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_181_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_181_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_181_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_181_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_181_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 7\nB: 5\nC: 2\nD: 3", "question": "How many other things in the same status as the thing to the front left of the pedestrian?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 7\nB: 5\nC: 2\nD: 3", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_182_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_182_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_182_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_182_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_182_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_182_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_182_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_182_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_182_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_182_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_182_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_182_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: stopped\nB: disappeared\nC: moving\nD: broken down", "question": "What status is the bus that is to the front of the parked thing?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: stopped\nB: disappeared\nC: moving\nD: broken down", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_183_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_183_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_183_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_183_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_183_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_183_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_183_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_183_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_183_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_183_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_183_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_183_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: bus\nB: bicycle\nC: pedestrian\nD: traffic light", "question": "The thing that is to the front left of me and the front of the with rider motorcycle is what?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bus\nB: bicycle\nC: pedestrian\nD: traffic light", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_184_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_184_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_184_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_184_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_184_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_184_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_184_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_184_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_184_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_184_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_184_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_184_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: maybe\nC: I do not know\nD: yes", "question": "Are there any other cars that in the same status as the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: maybe\nC: I do not know\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_185_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_185_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_185_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_185_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_185_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_185_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_185_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_185_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_185_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_185_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_185_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_185_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: car\nB: tree\nC: bicycle\nD: bus", "question": "There is a stopped thing that is to the front of me; what is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: car\nB: tree\nC: bicycle\nD: bus", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_186_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_186_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_186_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_186_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_186_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_186_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_186_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_186_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_186_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_186_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_186_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_186_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 5\nB: 3\nC: 9\nD: 7", "question": "What number of other things are there of the same status as the construction vehicle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 3\nC: 9\nD: 7", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_187_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_187_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_187_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_187_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_187_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_187_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_187_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_187_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_187_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_187_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_187_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_187_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: yes\nB: unknown\nC: maybe\nD: no", "question": "Are there any stopped things to the back right of the trailer?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: yes\nB: unknown\nC: maybe\nD: no", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_188_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_188_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_188_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_188_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_188_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_188_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_188_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_188_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_188_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_188_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_188_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_188_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: bus\nB: tree\nC: car\nD: bicycle", "question": "What is the stopped thing that is to the front left of the with rider motorcycle and the back right of me?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bus\nB: tree\nC: car\nD: bicycle", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_189_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_189_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_189_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_189_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_189_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_189_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_189_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_189_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_189_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_189_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_189_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_189_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 3\nB: 5\nC: 10\nD: 7", "question": "How many standing pedestrians are there?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 3\nB: 5\nC: 10\nD: 7", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_190_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_190_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_190_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_190_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_190_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_190_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_190_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_190_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_190_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_190_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_190_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_190_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: broken down\nB: stationary\nC: under repair\nD: moving", "question": "There is a bus; what status is it?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: broken down\nB: stationary\nC: under repair\nD: moving", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_191_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_191_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_191_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_191_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_191_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_191_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_191_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_191_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_191_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_191_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_191_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_191_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: possibly\nB: unknown\nC: no\nD: yes", "question": "Does the thing to the front left of the construction vehicle have the same status as the construction vehicle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: possibly\nB: unknown\nC: no\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_192_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_192_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_192_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_192_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_192_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_192_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_192_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_192_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_192_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_192_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_192_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_192_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: stopped\nB: broken down\nC: moving\nD: delayed", "question": "What is the status of the bus?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: stopped\nB: broken down\nC: moving\nD: delayed", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_193_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_193_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_193_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_193_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_193_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_193_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_193_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_193_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_193_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_193_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_193_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_193_11.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: bicycle\nB: car\nC: airplane\nD: bus", "question": "What is the stopped thing?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: bicycle\nB: car\nC: airplane\nD: bus", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_194_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_194_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_194_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_194_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_194_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_194_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_194_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_194_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_194_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_194_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_194_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_194_11.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: no\nB: only when moving\nC: sometimes\nD: yes", "question": "There is a truck to the front of the stopped construction vehicle; does it have the same status as the bicycle?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: no\nB: only when moving\nC: sometimes\nD: yes", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_195_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_195_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_195_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_195_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_195_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_195_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_195_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_195_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_195_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_195_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_195_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_195_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 5\nB: 2\nC: 10\nD: 8", "question": "How many moving cars are there?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 5\nB: 2\nC: 10\nD: 8", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_196_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_196_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_196_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_196_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_196_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_196_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_196_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_196_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_196_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_196_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_196_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_196_11.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 3\nB: 5\nC: 50\nD: 12", "question": "How many other things are in the same status as the truck?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 3\nB: 5\nC: 50\nD: 12", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_197_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_197_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_197_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_197_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_197_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_197_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_197_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_197_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_197_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_197_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_197_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_197_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: 7\nB: 1\nC: 3\nD: 5", "question": "What number of with rider things are there?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: 7\nB: 1\nC: 3\nD: 5", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_198_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_198_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_198_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_198_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_198_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_198_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_198_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_198_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_198_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_198_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_198_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_198_11.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "threeD_question_answering", "visual_input_component": "LiDAR image and natural image", "source": "NuScenes_threeD_question_answering", "options": "A: maybe\nB: no\nC: yes\nD: possibly", "question": "There is a car to the front of the parked construction vehicle; is its status the same as the construction vehicle to the front of the moving truck?", "context": "Your task is : Given inputs of the 3D information for a scene and a question about the 3D scene (real life), the model aims to output the correct answer. 
\nSelect from the following choices.\nA: maybe\nB: no\nC: yes\nD: possibly", "input_image_path": ["./3D-spatial/threeD_question_answering/threeD_question_answering_199_0.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_199_1.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_199_2.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_199_3.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_199_4.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_199_5.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_199_6.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_199_7.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_199_8.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_199_9.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_199_10.png", "./3D-spatial/threeD_question_answering/threeD_question_answering_199_11.png"], "output": "B", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/threed_cad_recognition/qwen3-vl/metadata_info.json b/results/threed_cad_recognition/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..3e20f83 --- /dev/null +++ b/results/threed_cad_recognition/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bottle\nB: lamp\nC: chair\nD: clock", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bottle\nB: lamp\nC: chair\nD: clock", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_0_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_0_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_0_2.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_0_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_0_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_0_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: wardrobe\nB: television stand\nC: radio\nD: xbox", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: wardrobe\nB: television stand\nC: radio\nD: xbox", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_1_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_1_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_1_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_1_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_1_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_1_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: tv stand\nB: sofa\nC: stool\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: tv stand\nB: sofa\nC: stool\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_2_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_2_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_2_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_2_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_2_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_2_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", 
"visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: radio\nB: loudspeaker\nC: guitar\nD: microphone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: radio\nB: loudspeaker\nC: guitar\nD: microphone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_3_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_3_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_3_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_3_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_3_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_3_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: cabinet\nB: bathtub\nC: glass box\nD: monitor", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bathtub\nC: glass box\nD: monitor", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_4_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_4_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_4_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_4_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_4_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_4_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: mantel\nB: bookshelf\nC: curtain\nD: stool", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: mantel\nB: 
bookshelf\nC: curtain\nD: stool", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_5_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_5_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_5_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_5_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_5_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_5_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: toilet\nB: sink\nC: bathtub\nD: stool", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: toilet\nB: sink\nC: bathtub\nD: stool", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_6_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_6_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_6_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_6_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_6_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_6_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bowl\nB: table\nC: stairs\nD: laptop", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bowl\nB: table\nC: stairs\nD: laptop", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_7_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_7_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_7_2.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_7_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_7_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_7_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: television stand\nB: radio\nC: vase\nD: lamp", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: television stand\nB: radio\nC: vase\nD: lamp", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_8_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_8_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_8_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_8_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_8_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_8_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bookshelf\nB: telephone\nC: chair\nD: desk", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bookshelf\nB: telephone\nC: chair\nD: desk", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_9_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_9_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_9_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_9_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_9_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_9_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", 
"visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: loudspeaker\nB: watercraft\nC: airplane\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: loudspeaker\nB: watercraft\nC: airplane\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_10_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_10_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_10_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_10_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_10_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_10_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: dresser\nC: night stand\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: dresser\nC: night stand\nD: bed", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_11_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_11_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_11_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_11_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_11_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_11_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bookshelf\nB: desk\nC: toilet\nD: lamp", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: 
bookshelf\nB: desk\nC: toilet\nD: lamp", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_12_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_12_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_12_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_12_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_12_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_12_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: tv stand\nB: telephone\nC: clock\nD: laptop", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: tv stand\nB: telephone\nC: clock\nD: laptop", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_13_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_13_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_13_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_13_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_13_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_13_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: cabinet\nB: lamp\nC: mantel\nD: table", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: lamp\nC: mantel\nD: table", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_14_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_14_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_14_2.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_14_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_14_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_14_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: clock\nB: vase\nC: bookshelf\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: clock\nB: vase\nC: bookshelf\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_15_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_15_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_15_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_15_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_15_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_15_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: guitar\nB: speaker\nC: table\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: guitar\nB: speaker\nC: table\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_16_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_16_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_16_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_16_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_16_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_16_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", 
"visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: stool\nB: piano\nC: microphone\nD: guitar", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: stool\nB: piano\nC: microphone\nD: guitar", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_17_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_17_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_17_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_17_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_17_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_17_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: desk\nB: chair\nC: sofa\nD: table", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: desk\nB: chair\nC: sofa\nD: table", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_18_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_18_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_18_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_18_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_18_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_18_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: night stand\nC: bed\nD: lamp", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: night stand\nC: bed\nD: lamp", 
"input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_19_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_19_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_19_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_19_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_19_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_19_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: sofa\nC: table\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: sofa\nC: table\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_20_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_20_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_20_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_20_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_20_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_20_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: range hood\nB: clock\nC: telephone\nD: vase", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: range hood\nB: clock\nC: telephone\nD: vase", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_21_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_21_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_21_2.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_21_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_21_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_21_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bathtub\nB: airplane\nC: watercraft\nD: car", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bathtub\nB: airplane\nC: watercraft\nD: car", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_22_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_22_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_22_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_22_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_22_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_22_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: airplane\nB: bicycle\nC: motorcycle\nD: car", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: airplane\nB: bicycle\nC: motorcycle\nD: car", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_23_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_23_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_23_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_23_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_23_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_23_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", 
"visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: table\nB: night stand\nC: lamp\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: table\nB: night stand\nC: lamp\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_24_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_24_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_24_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_24_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_24_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_24_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: car\nB: telephone\nC: toilet\nD: bottle", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: car\nB: telephone\nC: toilet\nD: bottle", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_25_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_25_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_25_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_25_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_25_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_25_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: flower pot\nB: lamp\nC: stairs\nD: plant", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: flower pot\nB: lamp\nC: 
stairs\nD: plant", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_26_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_26_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_26_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_26_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_26_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_26_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bookshelf\nB: desk\nC: table\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bookshelf\nB: desk\nC: table\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_27_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_27_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_27_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_27_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_27_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_27_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: table\nB: chair\nC: night stand\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: table\nB: chair\nC: night stand\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_28_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_28_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_28_2.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_28_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_28_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_28_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: faucet\nB: telephone\nC: clock\nD: vase", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: faucet\nB: telephone\nC: clock\nD: vase", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_29_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_29_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_29_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_29_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_29_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_29_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: night stand\nB: chair\nC: lamp\nD: dresser", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: night stand\nB: chair\nC: lamp\nD: dresser", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_30_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_30_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_30_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_30_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_30_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_30_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", 
"visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: desk\nB: cabinet\nC: table\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: desk\nB: cabinet\nC: table\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_31_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_31_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_31_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_31_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_31_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_31_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: guitar\nB: microphone\nC: piano\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: guitar\nB: microphone\nC: piano\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_32_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_32_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_32_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_32_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_32_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_32_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: mantel\nB: chair\nC: sofa\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: mantel\nB: chair\nC: sofa\nD: 
bed", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_33_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_33_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_33_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_33_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_33_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_33_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: desk\nB: sofa\nC: table\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: desk\nB: sofa\nC: table\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_34_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_34_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_34_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_34_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_34_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_34_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bike\nB: car\nC: airplane\nD: bus", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bike\nB: car\nC: airplane\nD: bus", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_35_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_35_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_35_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_35_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_35_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_35_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: lamp\nB: plant\nC: chair\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: lamp\nB: plant\nC: chair\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_36_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_36_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_36_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_36_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_36_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_36_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: mug\nB: bottle\nC: glass box\nD: faucet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: mug\nB: bottle\nC: glass box\nD: faucet", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_37_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_37_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_37_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_37_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_37_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_37_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: 
clock\nB: tv stand\nC: mantel\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: clock\nB: tv stand\nC: mantel\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_38_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_38_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_38_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_38_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_38_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_38_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: lamp\nB: sofa\nC: chair\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: lamp\nB: sofa\nC: chair\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_39_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_39_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_39_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_39_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_39_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_39_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: airplane\nB: boat\nC: sofa\nD: watercraft", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: airplane\nB: boat\nC: sofa\nD: watercraft", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_40_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_40_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_40_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_40_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_40_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_40_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: clock\nC: vase\nD: car", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: clock\nC: vase\nD: car", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_41_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_41_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_41_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_41_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_41_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_41_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: monitor\nB: keyboard\nC: television\nD: speaker", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: monitor\nB: keyboard\nC: television\nD: speaker", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_42_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_42_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_42_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_42_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_42_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_42_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: sink\nB: bathtub\nC: toilet\nD: faucet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bathtub\nC: toilet\nD: faucet", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_43_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_43_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_43_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_43_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_43_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_43_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: vase\nB: telephone\nC: clock\nD: guitar", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: vase\nB: telephone\nC: clock\nD: guitar", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_44_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_44_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_44_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_44_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_44_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_44_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: 
monitor\nB: loudspeaker\nC: guitar\nD: piano", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: monitor\nB: loudspeaker\nC: guitar\nD: piano", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_45_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_45_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_45_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_45_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_45_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_45_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: tv stand\nB: radio\nC: chair\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: tv stand\nB: radio\nC: chair\nD: cabinet", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_46_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_46_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_46_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_46_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_46_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_46_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: table\nC: sofa\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: table\nC: sofa\nD: telephone", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_47_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_47_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_47_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_47_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_47_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_47_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: lamp\nB: sofa\nC: bed\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: lamp\nB: sofa\nC: bed\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_48_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_48_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_48_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_48_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_48_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_48_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: cabinet\nB: chair\nC: telephone\nD: tv stand", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: chair\nC: telephone\nD: tv stand", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_49_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_49_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_49_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_49_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_49_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_49_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: toilet\nB: chair\nC: lamp\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: toilet\nB: chair\nC: lamp\nD: sink", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_50_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_50_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_50_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_50_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_50_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_50_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: monitor\nB: airplane\nC: car\nD: person", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: monitor\nB: airplane\nC: car\nD: person", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_51_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_51_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_51_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_51_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_51_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_51_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: mantel\nB: 
sofa\nC: telephone\nD: clock", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: mantel\nB: sofa\nC: telephone\nD: clock", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_52_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_52_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_52_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_52_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_52_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_52_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: glass box\nB: bottle\nC: mug\nD: watercraft", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: glass box\nB: bottle\nC: mug\nD: watercraft", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_53_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_53_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_53_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_53_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_53_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_53_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: guitar\nC: vase\nD: clock", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: guitar\nC: vase\nD: clock", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_54_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_54_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_54_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_54_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_54_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_54_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: lamp\nB: pistol\nC: rifle\nD: tv stand", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: lamp\nB: pistol\nC: rifle\nD: tv stand", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_55_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_55_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_55_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_55_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_55_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_55_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bookshelf\nB: sofa\nC: mantel\nD: television", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bookshelf\nB: sofa\nC: mantel\nD: television", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_56_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_56_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_56_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_56_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_56_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_56_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: lamp\nB: vase\nC: bookshelf\nD: curtain", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: lamp\nB: vase\nC: bookshelf\nD: curtain", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_57_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_57_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_57_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_57_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_57_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_57_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: dresser\nB: bookshelf\nC: stool\nD: night stand", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: dresser\nB: bookshelf\nC: stool\nD: night stand", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_58_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_58_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_58_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_58_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_58_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_58_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", 
"options": "A: vase\nB: clock\nC: tv stand\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: vase\nB: clock\nC: tv stand\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_59_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_59_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_59_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_59_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_59_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_59_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: desk\nB: laptop\nC: keyboard\nD: monitor", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: desk\nB: laptop\nC: keyboard\nD: monitor", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_60_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_60_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_60_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_60_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_60_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_60_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: sofa\nB: clock\nC: telephone\nD: guitar", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sofa\nB: clock\nC: telephone\nD: guitar", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_61_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_61_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_61_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_61_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_61_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_61_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: range hood\nB: telephone\nC: clock\nD: bathtub", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: range hood\nB: telephone\nC: clock\nD: bathtub", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_62_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_62_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_62_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_62_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_62_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_62_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: loudspeaker\nB: radio\nC: telephone\nD: tv stand", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: loudspeaker\nB: radio\nC: telephone\nD: tv stand", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_63_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_63_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_63_2.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_63_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_63_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_63_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: guitar\nB: microphone\nC: table\nD: piano", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: guitar\nB: microphone\nC: table\nD: piano", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_64_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_64_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_64_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_64_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_64_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_64_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: chair\nC: sofa\nD: table", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: chair\nC: sofa\nD: table", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_65_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_65_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_65_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_65_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_65_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_65_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", 
"visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: stair\nB: keyboard\nC: laptop\nD: cellphone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: stair\nB: keyboard\nC: laptop\nD: cellphone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_66_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_66_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_66_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_66_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_66_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_66_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: clock\nB: telephone\nC: vase\nD: monitor", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: clock\nB: telephone\nC: vase\nD: monitor", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_67_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_67_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_67_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_67_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_67_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_67_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: night stand\nB: dresser\nC: television\nD: bookshelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: night 
stand\nB: dresser\nC: television\nD: bookshelf", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_68_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_68_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_68_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_68_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_68_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_68_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: laptop\nB: monitor\nC: keyboard\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: laptop\nB: monitor\nC: keyboard\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_69_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_69_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_69_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_69_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_69_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_69_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: bed\nC: lamp\nD: night stand", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: bed\nC: lamp\nD: night stand", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_70_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_70_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_70_2.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_70_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_70_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_70_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: clock\nC: vase\nD: tv stand", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: clock\nC: vase\nD: tv stand", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_71_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_71_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_71_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_71_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_71_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_71_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bottle\nB: lamp\nC: glass box\nD: mug", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bottle\nB: lamp\nC: glass box\nD: mug", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_72_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_72_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_72_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_72_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_72_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_72_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", 
"visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: clock\nB: telephone\nC: tv stand\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: clock\nB: telephone\nC: tv stand\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_73_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_73_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_73_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_73_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_73_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_73_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: lamp\nB: piano\nC: dresser\nD: night stand", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: lamp\nB: piano\nC: dresser\nD: night stand", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_74_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_74_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_74_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_74_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_74_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_74_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: table\nB: sofa\nC: lamp\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: table\nB: sofa\nC: 
lamp\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_75_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_75_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_75_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_75_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_75_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_75_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: radio\nB: chair\nC: desk\nD: bench", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: radio\nB: chair\nC: desk\nD: bench", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_76_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_76_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_76_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_76_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_76_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_76_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: piano\nB: chair\nC: stool\nD: guitar", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: piano\nB: chair\nC: stool\nD: guitar", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_77_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_77_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_77_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_77_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_77_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_77_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: faucet\nB: telephone\nC: stool\nD: range hood", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: faucet\nB: telephone\nC: stool\nD: range hood", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_78_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_78_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_78_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_78_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_78_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_78_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: vase\nB: telephone\nC: bookshelf\nD: clock", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: vase\nB: telephone\nC: bookshelf\nD: clock", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_79_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_79_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_79_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_79_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_79_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_79_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", 
"options": "A: sink\nB: chair\nC: plant\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: chair\nC: plant\nD: cabinet", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_80_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_80_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_80_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_80_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_80_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_80_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: faucet\nB: telephone\nC: table\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: faucet\nB: telephone\nC: table\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_81_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_81_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_81_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_81_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_81_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_81_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: lamp\nC: bookshelf\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: lamp\nC: bookshelf\nD: sink", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_82_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_82_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_82_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_82_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_82_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_82_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: tv stand\nB: desk\nC: monitor\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: tv stand\nB: desk\nC: monitor\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_83_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_83_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_83_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_83_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_83_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_83_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: night stand\nB: vase\nC: clock\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: night stand\nB: vase\nC: clock\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_84_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_84_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_84_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_84_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_84_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_84_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: desk\nC: stool\nD: bookshelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: desk\nC: stool\nD: bookshelf", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_85_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_85_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_85_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_85_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_85_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_85_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: rifle\nB: telephone\nC: car\nD: airplane", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: rifle\nB: telephone\nC: car\nD: airplane", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_86_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_86_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_86_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_86_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_86_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_86_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: 
bottle\nB: sink\nC: toilet\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bottle\nB: sink\nC: toilet\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_87_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_87_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_87_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_87_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_87_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_87_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: vase\nB: television\nC: mirror\nD: decorative bowl", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: vase\nB: television\nC: mirror\nD: decorative bowl", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_88_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_88_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_88_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_88_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_88_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_88_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bed\nB: chair\nC: sofa\nD: table", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: chair\nC: sofa\nD: table", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_89_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_89_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_89_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_89_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_89_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_89_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: sink\nB: faucet\nC: toilet\nD: bathtub", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: faucet\nC: toilet\nD: bathtub", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_90_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_90_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_90_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_90_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_90_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_90_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: table\nB: lamp\nC: vase\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: table\nB: lamp\nC: vase\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_91_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_91_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_91_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_91_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_91_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_91_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: bottle\nC: glass box\nD: faucet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: bottle\nC: glass box\nD: faucet", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_92_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_92_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_92_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_92_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_92_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_92_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: faucet\nB: telephone\nC: airplane\nD: clock", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: faucet\nB: telephone\nC: airplane\nD: clock", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_93_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_93_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_93_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_93_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_93_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_93_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": 
"ModelNet40", "options": "A: faucet\nB: clock\nC: telephone\nD: range hood", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: faucet\nB: clock\nC: telephone\nD: range hood", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_94_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_94_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_94_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_94_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_94_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_94_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: rifle\nB: laptop\nC: clock\nD: pistol", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: rifle\nB: laptop\nC: clock\nD: pistol", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_95_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_95_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_95_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_95_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_95_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_95_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: cup\nB: bottle\nC: glass box\nD: mug", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cup\nB: bottle\nC: glass box\nD: mug", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_96_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_96_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_96_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_96_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_96_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_96_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: glass box\nB: television\nC: monitor\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: glass box\nB: television\nC: monitor\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_97_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_97_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_97_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_97_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_97_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_97_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: car\nB: telephone\nC: radio\nD: airplane", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: car\nB: telephone\nC: radio\nD: airplane", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_98_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_98_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_98_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_98_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_98_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_98_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: flower pot\nC: lamp\nD: vase", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: flower pot\nC: lamp\nD: vase", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_99_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_99_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_99_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_99_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_99_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_99_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: night stand\nC: bottle\nD: sofa", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: night stand\nC: bottle\nD: sofa", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_100_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_100_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_100_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_100_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_100_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_100_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": 
"A: chair\nB: table\nC: tv stand\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: table\nC: tv stand\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_101_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_101_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_101_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_101_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_101_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_101_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: glass box\nB: table\nC: lamp\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: glass box\nB: table\nC: lamp\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_102_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_102_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_102_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_102_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_102_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_102_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: table\nB: vase\nC: chair\nD: lamp", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: table\nB: vase\nC: chair\nD: lamp", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_103_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_103_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_103_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_103_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_103_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_103_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: lamp\nB: chair\nC: table\nD: desk", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: lamp\nB: chair\nC: table\nD: desk", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_104_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_104_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_104_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_104_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_104_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_104_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: vase\nB: lamp\nC: flower pot\nD: plant", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: vase\nB: lamp\nC: flower pot\nD: plant", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_105_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_105_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_105_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_105_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_105_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_105_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: table\nB: sofa\nC: chair\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: table\nB: sofa\nC: chair\nD: bed", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_106_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_106_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_106_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_106_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_106_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_106_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: guitar\nC: radio\nD: lamp", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: guitar\nC: radio\nD: lamp", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_107_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_107_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_107_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_107_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_107_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_107_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: 
telephone\nB: bench\nC: table\nD: lamp", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: bench\nC: table\nD: lamp", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_108_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_108_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_108_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_108_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_108_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_108_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: clock\nC: vase\nD: lamp", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: clock\nC: vase\nD: lamp", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_109_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_109_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_109_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_109_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_109_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_109_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: sofa\nB: table\nC: chair\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sofa\nB: table\nC: chair\nD: telephone", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_110_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_110_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_110_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_110_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_110_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_110_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: toilet\nC: table\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: toilet\nC: table\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_111_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_111_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_111_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_111_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_111_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_111_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: lamp\nB: toilet\nC: vase\nD: table", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: lamp\nB: toilet\nC: vase\nD: table", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_112_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_112_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_112_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_112_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_112_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_112_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: car\nB: vase\nC: telephone\nD: clock", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: car\nB: vase\nC: telephone\nD: clock", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_113_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_113_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_113_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_113_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_113_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_113_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: desk\nC: bookshelf\nD: wardrobe", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: desk\nC: bookshelf\nD: wardrobe", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_114_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_114_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_114_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_114_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_114_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_114_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", 
"options": "A: microphone\nB: table\nC: stool\nD: guitar", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: microphone\nB: table\nC: stool\nD: guitar", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_115_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_115_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_115_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_115_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_115_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_115_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: piano\nB: rifle\nC: guitar\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: piano\nB: rifle\nC: guitar\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_116_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_116_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_116_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_116_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_116_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_116_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: stairs\nC: laptop\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: stairs\nC: laptop\nD: bed", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_117_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_117_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_117_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_117_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_117_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_117_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: radio\nB: glass box\nC: monitor\nD: lamp", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: radio\nB: glass box\nC: monitor\nD: lamp", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_118_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_118_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_118_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_118_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_118_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_118_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: vase\nB: clock\nC: guitar\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: vase\nB: clock\nC: guitar\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_119_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_119_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_119_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_119_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_119_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_119_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: piano\nC: monitor\nD: laptop", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: piano\nC: monitor\nD: laptop", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_120_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_120_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_120_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_120_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_120_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_120_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: clock\nB: telephone\nC: guitar\nD: vase", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: clock\nB: telephone\nC: guitar\nD: vase", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_121_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_121_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_121_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_121_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_121_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_121_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", 
"options": "A: clock\nB: telephone\nC: vase\nD: plant", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: clock\nB: telephone\nC: vase\nD: plant", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_122_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_122_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_122_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_122_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_122_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_122_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: table\nB: lamp\nC: desk\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: table\nB: lamp\nC: desk\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_123_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_123_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_123_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_123_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_123_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_123_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: sink\nB: telephone\nC: chair\nD: toilet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: telephone\nC: chair\nD: toilet", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_124_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_124_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_124_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_124_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_124_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_124_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: night stand\nB: chair\nC: bed\nD: lamp", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: night stand\nB: chair\nC: bed\nD: lamp", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_125_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_125_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_125_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_125_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_125_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_125_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: table\nB: sofa\nC: chair\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: table\nB: sofa\nC: chair\nD: cabinet", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_126_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_126_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_126_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_126_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_126_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_126_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: toilet\nC: telephone\nD: vase", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: toilet\nC: telephone\nD: vase", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_127_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_127_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_127_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_127_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_127_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_127_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bottle\nB: watercraft\nC: airplane\nD: car", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bottle\nB: watercraft\nC: airplane\nD: car", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_128_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_128_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_128_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_128_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_128_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_128_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": 
"ModelNet40", "options": "A: chair\nB: desk\nC: table\nD: sofa", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: desk\nC: table\nD: sofa", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_129_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_129_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_129_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_129_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_129_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_129_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: clock\nB: vase\nC: stool\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: clock\nB: vase\nC: stool\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_130_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_130_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_130_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_130_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_130_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_130_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: piano\nB: telephone\nC: clock\nD: vase", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: piano\nB: telephone\nC: clock\nD: vase", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_131_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_131_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_131_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_131_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_131_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_131_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bottle\nB: faucet\nC: glass box\nD: radio", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bottle\nB: faucet\nC: glass box\nD: radio", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_132_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_132_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_132_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_132_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_132_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_132_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: vase\nB: faucet\nC: clock\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: vase\nB: faucet\nC: clock\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_133_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_133_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_133_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_133_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_133_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_133_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: radio\nB: tv stand\nC: lamp\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: radio\nB: tv stand\nC: lamp\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_134_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_134_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_134_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_134_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_134_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_134_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: night stand\nB: dresser\nC: bed\nD: lamp", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: night stand\nB: dresser\nC: bed\nD: lamp", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_135_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_135_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_135_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_135_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_135_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_135_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": 
"ModelNet40", "options": "A: wardrobe\nB: curtain\nC: bathtub\nD: desk", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: wardrobe\nB: curtain\nC: bathtub\nD: desk", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_136_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_136_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_136_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_136_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_136_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_136_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: stool\nB: chair\nC: desk\nD: table", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: stool\nB: chair\nC: desk\nD: table", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_137_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_137_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_137_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_137_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_137_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_137_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: vase\nB: lamp\nC: telephone\nD: toilet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: vase\nB: lamp\nC: telephone\nD: toilet", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_138_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_138_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_138_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_138_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_138_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_138_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: guitar\nB: telephone\nC: radio\nD: laptop", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: guitar\nB: telephone\nC: radio\nD: laptop", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_139_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_139_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_139_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_139_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_139_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_139_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: sink\nB: tv stand\nC: clock\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: tv stand\nC: clock\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_140_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_140_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_140_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_140_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_140_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_140_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: flower pot\nB: clock\nC: vase\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: flower pot\nB: clock\nC: vase\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_141_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_141_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_141_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_141_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_141_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_141_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: lamp\nB: vase\nC: bottle\nD: stool", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: lamp\nB: vase\nC: bottle\nD: stool", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_142_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_142_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_142_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_142_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_142_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_142_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", 
"options": "A: sofa\nB: chair\nC: bed\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sofa\nB: chair\nC: bed\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_143_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_143_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_143_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_143_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_143_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_143_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: chair\nC: sofa\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: chair\nC: sofa\nD: bed", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_144_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_144_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_144_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_144_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_144_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_144_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: dresser\nB: desk\nC: bathtub\nD: wardrobe", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: dresser\nB: desk\nC: bathtub\nD: wardrobe", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_145_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_145_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_145_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_145_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_145_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_145_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: stool\nB: chair\nC: desk\nD: bench", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: stool\nB: chair\nC: desk\nD: bench", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_146_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_146_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_146_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_146_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_146_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_146_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: rifle\nC: lamp\nD: plant", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: rifle\nC: lamp\nD: plant", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_147_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_147_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_147_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_147_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_147_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_147_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: bed\nC: dresser\nD: wardrobe", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: bed\nC: dresser\nD: wardrobe", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_148_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_148_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_148_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_148_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_148_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_148_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: curtain\nB: stool\nC: mantel\nD: bookshelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: curtain\nB: stool\nC: mantel\nD: bookshelf", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_149_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_149_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_149_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_149_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_149_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_149_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", 
"options": "A: vase\nB: tv stand\nC: clock\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: vase\nB: tv stand\nC: clock\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_150_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_150_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_150_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_150_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_150_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_150_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: desk\nB: bookshelf\nC: table\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: desk\nB: bookshelf\nC: table\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_151_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_151_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_151_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_151_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_151_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_151_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: lamp\nB: glass box\nC: mug\nD: bottle", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: lamp\nB: glass box\nC: mug\nD: bottle", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_152_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_152_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_152_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_152_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_152_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_152_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: cellphone\nC: watercraft\nD: laptop", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: cellphone\nC: watercraft\nD: laptop", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_153_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_153_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_153_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_153_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_153_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_153_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: sofa\nC: table\nD: lamp", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: sofa\nC: table\nD: lamp", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_154_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_154_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_154_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_154_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_154_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_154_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: microphone\nB: guitar\nC: piano\nD: stool", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: microphone\nB: guitar\nC: piano\nD: stool", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_155_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_155_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_155_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_155_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_155_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_155_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bookshelf\nB: chair\nC: telephone\nD: table", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bookshelf\nB: chair\nC: telephone\nD: table", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_156_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_156_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_156_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_156_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_156_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_156_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": 
"ModelNet40", "options": "A: bed\nB: chair\nC: sofa\nD: lamp", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: chair\nC: sofa\nD: lamp", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_157_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_157_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_157_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_157_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_157_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_157_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: bowl\nC: mug\nD: lamp", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: bowl\nC: mug\nD: lamp", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_158_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_158_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_158_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_158_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_158_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_158_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: faucet\nB: bathtub\nC: shower\nD: toilet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: faucet\nB: bathtub\nC: shower\nD: toilet", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_159_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_159_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_159_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_159_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_159_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_159_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: cabinet\nB: desk\nC: table\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: desk\nC: table\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_160_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_160_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_160_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_160_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_160_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_160_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: guitar\nB: stool\nC: telephone\nD: clock", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: guitar\nB: stool\nC: telephone\nD: clock", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_161_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_161_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_161_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_161_3.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_161_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_161_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bed\nB: stool\nC: night stand\nD: desk", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: stool\nC: night stand\nD: desk", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_162_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_162_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_162_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_162_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_162_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_162_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: airplane\nB: car\nC: motorcycle\nD: bicycle", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: airplane\nB: car\nC: motorcycle\nD: bicycle", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_163_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_163_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_163_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_163_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_163_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_163_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": 
"ModelNet40", "options": "A: tv stand\nB: sofa\nC: stool\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: tv stand\nB: sofa\nC: stool\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_164_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_164_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_164_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_164_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_164_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_164_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: piano\nB: clock\nC: guitar\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: piano\nB: clock\nC: guitar\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_165_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_165_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_165_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_165_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_165_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_165_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: laptop\nB: telephone\nC: stool\nD: airplane", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: laptop\nB: telephone\nC: stool\nD: airplane", "input_image_path": 
["./3D-spatial/threed_cad_recognition/threed_cad_recognition_166_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_166_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_166_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_166_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_166_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_166_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: clock\nC: vase\nD: pistol", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: clock\nC: vase\nD: pistol", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_167_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_167_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_167_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_167_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_167_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_167_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: plant\nB: television stand\nC: lamp\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: plant\nB: television stand\nC: lamp\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_168_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_168_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_168_2.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_168_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_168_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_168_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: airplane\nB: lamp\nC: radio\nD: tent", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: airplane\nB: lamp\nC: radio\nD: tent", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_169_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_169_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_169_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_169_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_169_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_169_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: lamp\nB: stairs\nC: piano\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: lamp\nB: stairs\nC: piano\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_170_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_170_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_170_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_170_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_170_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_170_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", 
"visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: sink\nB: faucet\nC: bottle\nD: tv stand", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: faucet\nC: bottle\nD: tv stand", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_171_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_171_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_171_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_171_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_171_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_171_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: table\nB: sofa\nC: desk\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: table\nB: sofa\nC: desk\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_172_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_172_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_172_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_172_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_172_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_172_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: vase\nC: chair\nD: toilet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: vase\nC: 
chair\nD: toilet", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_173_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_173_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_173_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_173_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_173_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_173_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: chair\nC: table\nD: sofa", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: chair\nC: table\nD: sofa", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_174_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_174_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_174_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_174_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_174_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_174_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: sink\nB: bathtub\nC: telephone\nD: toilet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bathtub\nC: telephone\nD: toilet", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_175_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_175_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_175_2.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_175_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_175_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_175_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: clock\nB: tv stand\nC: telephone\nD: vase", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: clock\nB: tv stand\nC: telephone\nD: vase", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_176_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_176_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_176_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_176_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_176_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_176_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: car\nB: bookshelf\nC: airplane\nD: stool", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: car\nB: bookshelf\nC: airplane\nD: stool", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_177_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_177_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_177_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_177_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_177_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_177_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": 
"threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: table\nC: sofa\nD: stool", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: table\nC: sofa\nD: stool", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_178_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_178_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_178_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_178_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_178_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_178_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: faucet\nB: telephone\nC: range hood\nD: stool", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: faucet\nB: telephone\nC: range hood\nD: stool", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_179_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_179_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_179_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_179_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_179_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_179_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: table\nC: chair\nD: bookshelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following 
choices.\nA: telephone\nB: table\nC: chair\nD: bookshelf", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_180_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_180_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_180_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_180_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_180_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_180_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: lamp\nB: desk\nC: bookshelf\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: lamp\nB: desk\nC: bookshelf\nD: chair", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_181_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_181_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_181_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_181_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_181_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_181_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: sink\nB: bathtub\nC: faucet\nD: toilet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bathtub\nC: faucet\nD: toilet", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_182_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_182_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_182_2.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_182_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_182_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_182_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: stool\nC: chair\nD: tv stand", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: stool\nC: chair\nD: tv stand", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_183_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_183_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_183_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_183_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_183_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_183_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: mantel\nB: stairs\nC: fireplace\nD: sofa", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: mantel\nB: stairs\nC: fireplace\nD: sofa", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_184_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_184_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_184_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_184_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_184_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_184_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": 
"threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: desktop\nB: lamp\nC: radio\nD: glass box", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: desktop\nB: lamp\nC: radio\nD: glass box", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_185_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_185_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_185_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_185_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_185_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_185_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: mantel\nB: plant\nC: radio\nD: tv stand", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: mantel\nB: plant\nC: radio\nD: tv stand", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_186_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_186_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_186_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_186_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_186_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_186_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: clock\nC: vase\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following 
choices.\nA: chair\nB: clock\nC: vase\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_187_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_187_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_187_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_187_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_187_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_187_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: clock\nB: car\nC: vase\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: clock\nB: car\nC: vase\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_188_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_188_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_188_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_188_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_188_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_188_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: telephone\nB: clock\nC: piano\nD: vase", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: telephone\nB: clock\nC: piano\nD: vase", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_189_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_189_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_189_2.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_189_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_189_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_189_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: television\nB: glass box\nC: chair\nD: bookshelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: television\nB: glass box\nC: chair\nD: bookshelf", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_190_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_190_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_190_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_190_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_190_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_190_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: sink\nB: shower curtain\nC: monitor\nD: toilet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: shower curtain\nC: monitor\nD: toilet", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_191_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_191_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_191_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_191_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_191_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_191_5.jpg"], "output": "D", "qwen3-vl": "image none"}, 
{"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: lamp\nB: telephone\nC: stool\nD: sofa", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: lamp\nB: telephone\nC: stool\nD: sofa", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_192_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_192_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_192_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_192_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_192_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_192_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: chair\nB: keyboard\nC: guitar\nD: telephone", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: keyboard\nC: guitar\nD: telephone", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_193_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_193_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_193_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_193_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_193_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_193_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bookshelf\nB: desk\nC: chair\nD: mantel", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the 
following choices.\nA: bookshelf\nB: desk\nC: chair\nD: mantel", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_194_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_194_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_194_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_194_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_194_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_194_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: night stand\nB: chair\nC: lamp\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: night stand\nB: chair\nC: lamp\nD: bed", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_195_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_195_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_195_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_195_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_195_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_195_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: sink\nB: bathtub\nC: stool\nD: toilet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bathtub\nC: stool\nD: toilet", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_196_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_196_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_196_2.jpg", 
"./3D-spatial/threed_cad_recognition/threed_cad_recognition_196_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_196_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_196_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: lamp\nB: plant\nC: flower pot\nD: bookshelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: lamp\nB: plant\nC: flower pot\nD: bookshelf", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_197_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_197_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_197_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_197_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_197_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_197_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", "visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: bottle\nB: mug\nC: keyboard\nD: cup", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bottle\nB: mug\nC: keyboard\nD: cup", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_198_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_198_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_198_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_198_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_198_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_198_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_cad_recognition", 
"visual_input_component": "Poine cloud image", "source": "ModelNet40", "options": "A: stool\nB: chair\nC: piano\nD: guitar", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: stool\nB: chair\nC: piano\nD: guitar", "input_image_path": ["./3D-spatial/threed_cad_recognition/threed_cad_recognition_199_0.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_199_1.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_199_2.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_199_3.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_199_4.jpg", "./3D-spatial/threed_cad_recognition/threed_cad_recognition_199_5.jpg"], "output": "C", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/threed_indoor_recognition/qwen3-vl/metadata_info.json b/results/threed_indoor_recognition/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..01e602f --- /dev/null +++ b/results/threed_indoor_recognition/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: cabinet\nC: bin\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: bin\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_0_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_0_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_0_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_0_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_0_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_0_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": 
"threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: box\nB: sink\nC: cabinet\nD: shelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: box\nB: sink\nC: cabinet\nD: shelf", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_1_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_1_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_1_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_1_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_1_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_1_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: cabinet\nC: bag\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: bag\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_2_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_2_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_2_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_2_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_2_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_2_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: box\nB: sink\nC: chair\nD: shelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": 
"Select from the following choices.\nA: box\nB: sink\nC: chair\nD: shelf", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_3_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_3_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_3_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_3_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_3_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_3_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: chair\nB: sofa\nC: bed\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: sofa\nC: bed\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_4_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_4_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_4_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_4_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_4_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_4_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bag\nB: cabinet\nC: desk\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bag\nB: cabinet\nC: desk\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_5_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_5_1.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_5_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_5_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_5_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_5_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: desk\nB: cabinet\nC: shelf\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: desk\nB: cabinet\nC: shelf\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_6_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_6_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_6_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_6_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_6_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_6_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: sink\nC: bag\nD: bin", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: sink\nC: bag\nD: bin", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_7_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_7_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_7_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_7_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_7_4.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_7_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bed\nC: bin\nD: display", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bed\nC: bin\nD: display", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_8_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_8_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_8_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_8_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_8_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_8_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: sofa\nC: sink\nD: bag", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: sofa\nC: sink\nD: bag", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_9_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_9_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_9_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_9_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_9_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_9_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: shelf\nC: 
door\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: shelf\nC: door\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_10_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_10_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_10_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_10_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_10_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_10_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: desk\nB: chair\nC: sofa\nD: toilet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: desk\nB: chair\nC: sofa\nD: toilet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_11_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_11_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_11_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_11_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_11_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_11_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bed\nC: shelf\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bed\nC: shelf\nD: cabinet", "input_image_path": 
["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_12_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_12_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_12_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_12_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_12_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_12_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: shelf\nB: cabinet\nC: desk\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: shelf\nB: cabinet\nC: desk\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_13_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_13_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_13_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_13_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_13_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_13_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bed\nC: shelf\nD: box", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: shelf\nD: box", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_14_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_14_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_14_2.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_14_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_14_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_14_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: chair\nC: cabinet\nD: sofa", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: chair\nC: cabinet\nD: sofa", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_15_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_15_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_15_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_15_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_15_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_15_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bag\nC: cabinet\nD: door", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bag\nC: cabinet\nD: door", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_16_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_16_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_16_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_16_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_16_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_16_5.jpg"], "output": "D", 
"qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: cabinet\nC: table\nD: pillow", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: table\nD: pillow", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_17_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_17_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_17_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_17_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_17_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_17_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: table\nB: sink\nC: bed\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: table\nB: sink\nC: bed\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_18_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_18_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_18_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_18_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_18_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_18_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bag\nC: sink\nD: box", "question": "What is the category of the point cloud 
based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bag\nC: sink\nD: box", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_19_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_19_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_19_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_19_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_19_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_19_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: sink\nC: shelf\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: sink\nC: shelf\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_20_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_20_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_20_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_20_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_20_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_20_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sofa\nB: cabinet\nC: chair\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sofa\nB: cabinet\nC: chair\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_21_0.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_21_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_21_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_21_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_21_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_21_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: cabinet\nC: bed\nD: display", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: bed\nD: display", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_22_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_22_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_22_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_22_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_22_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_22_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: display\nB: shelf\nC: bin\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: display\nB: shelf\nC: bin\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_23_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_23_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_23_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_23_3.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_23_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_23_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bed\nC: display\nD: shelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: display\nD: shelf", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_24_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_24_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_24_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_24_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_24_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_24_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: pillow\nC: chair\nD: desk", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: pillow\nC: chair\nD: desk", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_25_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_25_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_25_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_25_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_25_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_25_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", 
"visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: pillow\nB: shelf\nC: sink\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: pillow\nB: shelf\nC: sink\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_26_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_26_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_26_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_26_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_26_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_26_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bed\nC: shelf\nD: table", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: shelf\nD: table", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_27_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_27_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_27_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_27_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_27_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_27_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bed\nC: bag\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from 
the following choices.\nA: cabinet\nB: bed\nC: bag\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_28_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_28_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_28_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_28_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_28_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_28_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: shelf\nB: cabinet\nC: sofa\nD: desk", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: shelf\nB: cabinet\nC: sofa\nD: desk", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_29_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_29_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_29_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_29_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_29_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_29_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: chair\nC: door\nD: pillow", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: chair\nC: door\nD: pillow", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_30_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_30_1.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_30_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_30_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_30_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_30_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: chair\nB: display\nC: bed\nD: sofa", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: display\nC: bed\nD: sofa", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_31_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_31_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_31_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_31_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_31_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_31_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: pillow\nB: sofa\nC: cabinet\nD: toilet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: pillow\nB: sofa\nC: cabinet\nD: toilet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_32_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_32_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_32_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_32_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_32_4.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_32_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: cabinet\nC: table\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: table\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_33_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_33_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_33_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_33_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_33_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_33_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bag\nC: sink\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bag\nC: sink\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_34_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_34_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_34_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_34_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_34_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_34_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: 
bed\nB: cabinet\nC: bin\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: bin\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_35_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_35_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_35_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_35_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_35_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_35_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: cabinet\nC: door\nD: bag", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: door\nD: bag", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_36_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_36_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_36_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_36_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_36_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_36_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: display\nB: cabinet\nC: sink\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: display\nB: cabinet\nC: sink\nD: bed", "input_image_path": 
["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_37_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_37_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_37_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_37_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_37_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_37_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: sink\nC: cabinet\nD: bag", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: sink\nC: cabinet\nD: bag", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_38_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_38_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_38_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_38_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_38_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_38_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sofa\nB: cabinet\nC: bed\nD: shelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sofa\nB: cabinet\nC: bed\nD: shelf", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_39_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_39_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_39_2.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_39_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_39_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_39_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: desk\nC: shelf\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: desk\nC: shelf\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_40_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_40_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_40_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_40_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_40_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_40_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: cabinet\nC: bed\nD: table", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: bed\nD: table", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_41_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_41_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_41_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_41_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_41_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_41_5.jpg"], "output": 
"D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: display\nB: table\nC: bed\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: display\nB: table\nC: bed\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_42_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_42_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_42_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_42_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_42_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_42_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bed\nC: cabinet\nD: shelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bed\nC: cabinet\nD: shelf", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_43_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_43_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_43_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_43_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_43_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_43_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: shelf\nB: bed\nC: cabinet\nD: door", "question": "What is the category of the point 
cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: shelf\nB: bed\nC: cabinet\nD: door", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_44_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_44_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_44_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_44_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_44_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_44_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bag\nB: sink\nC: cabinet\nD: table", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bag\nB: sink\nC: cabinet\nD: table", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_45_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_45_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_45_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_45_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_45_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_45_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: display\nB: toilet\nC: chair\nD: table", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: display\nB: toilet\nC: chair\nD: table", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_46_0.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_46_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_46_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_46_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_46_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_46_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: sink\nC: cabinet\nD: bag", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: sink\nC: cabinet\nD: bag", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_47_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_47_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_47_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_47_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_47_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_47_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bag\nB: sink\nC: bed\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bag\nB: sink\nC: bed\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_48_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_48_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_48_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_48_3.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_48_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_48_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: desk\nB: shelf\nC: bed\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: desk\nB: shelf\nC: bed\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_49_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_49_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_49_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_49_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_49_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_49_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bag\nC: bed\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bag\nC: bed\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_50_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_50_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_50_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_50_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_50_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_50_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", 
"visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bag\nC: sink\nD: box", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bag\nC: sink\nD: box", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_51_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_51_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_51_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_51_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_51_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_51_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: box\nB: display\nC: bed\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: box\nB: display\nC: bed\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_52_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_52_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_52_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_52_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_52_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_52_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bin\nB: cabinet\nC: bed\nD: display", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from 
the following choices.\nA: bin\nB: cabinet\nC: bed\nD: display", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_53_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_53_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_53_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_53_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_53_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_53_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: sink\nC: bag\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: sink\nC: bag\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_54_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_54_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_54_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_54_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_54_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_54_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: box\nB: cabinet\nC: bed\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: box\nB: cabinet\nC: bed\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_55_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_55_1.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_55_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_55_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_55_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_55_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: door\nC: bed\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: door\nC: bed\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_56_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_56_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_56_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_56_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_56_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_56_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bed\nC: chair\nD: bin", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: chair\nD: bin", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_57_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_57_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_57_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_57_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_57_4.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_57_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: sink\nC: cabinet\nD: bag", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: sink\nC: cabinet\nD: bag", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_58_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_58_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_58_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_58_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_58_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_58_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bed\nC: box\nD: table", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: box\nD: table", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_59_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_59_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_59_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_59_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_59_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_59_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: 
sofa\nB: cabinet\nC: sink\nD: display", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sofa\nB: cabinet\nC: sink\nD: display", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_60_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_60_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_60_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_60_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_60_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_60_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bag\nC: sink\nD: display", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bag\nC: sink\nD: display", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_61_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_61_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_61_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_61_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_61_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_61_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: display\nB: cabinet\nC: sofa\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: display\nB: cabinet\nC: sofa\nD: bed", 
"input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_62_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_62_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_62_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_62_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_62_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_62_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: shelf\nB: box\nC: chair\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: shelf\nB: box\nC: chair\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_63_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_63_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_63_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_63_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_63_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_63_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: display\nC: chair\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: display\nC: chair\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_64_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_64_1.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_64_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_64_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_64_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_64_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bag\nC: bed\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bag\nC: bed\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_65_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_65_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_65_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_65_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_65_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_65_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: shelf\nB: cabinet\nC: display\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: shelf\nB: cabinet\nC: display\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_66_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_66_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_66_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_66_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_66_4.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_66_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: pillow\nC: cabinet\nD: toilet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: pillow\nC: cabinet\nD: toilet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_67_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_67_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_67_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_67_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_67_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_67_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: shelf\nB: bed\nC: door\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: shelf\nB: bed\nC: door\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_68_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_68_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_68_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_68_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_68_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_68_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": 
"A: sink\nB: bin\nC: cabinet\nD: display", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bin\nC: cabinet\nD: display", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_69_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_69_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_69_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_69_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_69_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_69_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: shelf\nC: chair\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: shelf\nC: chair\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_70_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_70_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_70_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_70_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_70_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_70_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: cabinet\nC: chair\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: chair\nD: sink", 
"input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_71_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_71_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_71_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_71_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_71_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_71_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bed\nC: shelf\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bed\nC: shelf\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_72_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_72_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_72_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_72_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_72_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_72_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bed\nC: shelf\nD: pillow", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: shelf\nD: pillow", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_73_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_73_1.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_73_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_73_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_73_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_73_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: toilet\nB: bag\nC: cabinet\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: toilet\nB: bag\nC: cabinet\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_74_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_74_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_74_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_74_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_74_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_74_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bin\nC: sink\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bin\nC: sink\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_75_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_75_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_75_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_75_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_75_4.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_75_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: cabinet\nC: bed\nD: display", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: bed\nD: display", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_76_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_76_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_76_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_76_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_76_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_76_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: chair\nB: cabinet\nC: bed\nD: sofa", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: cabinet\nC: bed\nD: sofa", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_77_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_77_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_77_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_77_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_77_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_77_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": 
"A: bed\nB: display\nC: cabinet\nD: bin", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: display\nC: cabinet\nD: bin", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_78_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_78_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_78_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_78_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_78_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_78_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bed\nC: bag\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: bag\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_79_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_79_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_79_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_79_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_79_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_79_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bag\nB: sink\nC: bed\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bag\nB: sink\nC: bed\nD: cabinet", "input_image_path": 
["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_80_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_80_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_80_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_80_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_80_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_80_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sofa\nB: display\nC: sink\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sofa\nB: display\nC: sink\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_81_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_81_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_81_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_81_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_81_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_81_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: display\nC: shelf\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: display\nC: shelf\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_82_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_82_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_82_2.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_82_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_82_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_82_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: sofa\nC: chair\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: sofa\nC: chair\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_83_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_83_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_83_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_83_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_83_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_83_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: table\nB: cabinet\nC: bed\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: table\nB: cabinet\nC: bed\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_84_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_84_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_84_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_84_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_84_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_84_5.jpg"], "output": "A", 
"qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bin\nB: box\nC: cabinet\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bin\nB: box\nC: cabinet\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_85_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_85_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_85_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_85_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_85_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_85_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: cabinet\nC: bin\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: bin\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_86_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_86_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_86_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_86_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_86_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_86_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: cabinet\nC: bag\nD: display", "question": "What is the category of the point cloud based on 
the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: bag\nD: display", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_87_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_87_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_87_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_87_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_87_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_87_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: chair\nC: door\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: chair\nC: door\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_88_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_88_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_88_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_88_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_88_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_88_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bed\nC: sink\nD: sofa", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: sink\nD: sofa", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_89_0.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_89_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_89_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_89_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_89_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_89_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: chair\nC: cabinet\nD: desk", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: chair\nC: cabinet\nD: desk", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_90_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_90_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_90_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_90_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_90_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_90_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: sink\nC: bag\nD: box", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: sink\nC: bag\nD: box", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_91_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_91_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_91_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_91_3.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_91_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_91_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: display\nC: sink\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: display\nC: sink\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_92_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_92_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_92_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_92_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_92_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_92_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bag\nB: sink\nC: cabinet\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bag\nB: sink\nC: cabinet\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_93_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_93_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_93_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_93_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_93_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_93_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", 
"visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: sink\nC: chair\nD: display", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: sink\nC: chair\nD: display", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_94_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_94_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_94_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_94_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_94_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_94_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: desk\nB: bed\nC: chair\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: desk\nB: bed\nC: chair\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_95_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_95_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_95_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_95_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_95_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_95_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: sofa\nC: shelf\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": 
"Select from the following choices.\nA: cabinet\nB: sofa\nC: shelf\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_96_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_96_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_96_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_96_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_96_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_96_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: bin\nC: cabinet\nD: sofa", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: bin\nC: cabinet\nD: sofa", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_97_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_97_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_97_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_97_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_97_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_97_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: sink\nC: bed\nD: bag", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: sink\nC: bed\nD: bag", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_98_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_98_1.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_98_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_98_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_98_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_98_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: toilet\nC: bag\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: toilet\nC: bag\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_99_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_99_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_99_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_99_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_99_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_99_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: shelf\nB: chair\nC: bed\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: shelf\nB: chair\nC: bed\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_100_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_100_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_100_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_100_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_100_4.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_100_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bin\nC: cabinet\nD: box", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bin\nC: cabinet\nD: box", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_101_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_101_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_101_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_101_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_101_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_101_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bin\nC: bag\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bin\nC: bag\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_102_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_102_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_102_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_102_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_102_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_102_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": 
"A: bed\nB: cabinet\nC: toilet\nD: display", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: toilet\nD: display", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_103_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_103_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_103_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_103_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_103_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_103_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bin\nB: bed\nC: table\nD: shelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bin\nB: bed\nC: table\nD: shelf", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_104_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_104_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_104_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_104_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_104_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_104_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: table\nC: sink\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: table\nC: sink\nD: bed", 
"input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_105_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_105_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_105_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_105_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_105_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_105_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bin\nC: bed\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bin\nC: bed\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_106_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_106_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_106_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_106_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_106_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_106_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: cabinet\nC: chair\nD: shelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: chair\nD: shelf", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_107_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_107_1.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_107_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_107_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_107_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_107_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: desk\nB: chair\nC: bin\nD: display", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: desk\nB: chair\nC: bin\nD: display", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_108_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_108_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_108_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_108_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_108_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_108_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: display\nC: sink\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: display\nC: sink\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_109_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_109_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_109_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_109_3.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_109_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_109_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: chair\nB: cabinet\nC: bin\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: cabinet\nC: bin\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_110_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_110_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_110_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_110_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_110_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_110_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: chair\nB: bed\nC: sofa\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: bed\nC: sofa\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_111_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_111_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_111_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_111_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_111_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_111_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", 
"visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: chair\nB: table\nC: bed\nD: toilet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: table\nC: bed\nD: toilet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_112_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_112_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_112_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_112_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_112_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_112_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: shelf\nC: table\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: shelf\nC: table\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_113_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_113_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_113_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_113_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_113_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_113_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: table\nC: chair\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", 
"context": "Select from the following choices.\nA: bed\nB: table\nC: chair\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_114_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_114_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_114_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_114_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_114_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_114_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bag\nB: box\nC: toilet\nD: pillow", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bag\nB: box\nC: toilet\nD: pillow", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_115_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_115_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_115_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_115_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_115_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_115_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: door\nC: bag\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: door\nC: bag\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_116_0.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_116_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_116_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_116_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_116_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_116_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: chair\nB: desk\nC: bed\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: desk\nC: bed\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_117_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_117_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_117_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_117_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_117_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_117_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: table\nB: sink\nC: cabinet\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: table\nB: sink\nC: cabinet\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_118_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_118_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_118_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_118_3.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_118_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_118_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bin\nB: bed\nC: bag\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bin\nB: bed\nC: bag\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_119_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_119_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_119_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_119_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_119_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_119_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: display\nB: cabinet\nC: sink\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: display\nB: cabinet\nC: sink\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_120_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_120_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_120_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_120_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_120_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_120_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", 
"visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: pillow\nB: bed\nC: cabinet\nD: background", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: pillow\nB: bed\nC: cabinet\nD: background", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_121_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_121_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_121_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_121_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_121_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_121_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: pillow\nB: sofa\nC: bed\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: pillow\nB: sofa\nC: bed\nD: chair", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_122_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_122_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_122_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_122_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_122_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_122_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: display\nB: bed\nC: sink\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point 
cloud?", "context": "Select from the following choices.\nA: display\nB: bed\nC: sink\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_123_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_123_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_123_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_123_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_123_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_123_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: cabinet\nC: shelf\nD: box", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: shelf\nD: box", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_124_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_124_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_124_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_124_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_124_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_124_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: cabinet\nC: shelf\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: shelf\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_125_0.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_125_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_125_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_125_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_125_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_125_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bed\nC: cabinet\nD: bag", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bed\nC: cabinet\nD: bag", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_126_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_126_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_126_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_126_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_126_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_126_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: door\nB: cabinet\nC: sink\nD: shelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: door\nB: cabinet\nC: sink\nD: shelf", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_127_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_127_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_127_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_127_3.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_127_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_127_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: cabinet\nC: toilet\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: toilet\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_128_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_128_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_128_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_128_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_128_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_128_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bed\nC: desk\nD: shelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bed\nC: desk\nD: shelf", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_129_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_129_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_129_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_129_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_129_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_129_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": 
"threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: sink\nC: bed\nD: bag", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: sink\nC: bed\nD: bag", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_130_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_130_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_130_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_130_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_130_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_130_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sofa\nB: cabinet\nC: toilet\nD: shelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sofa\nB: cabinet\nC: toilet\nD: shelf", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_131_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_131_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_131_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_131_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_131_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_131_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: cabinet\nC: bag\nD: sink", "question": "What is the category of the point cloud based on the multi-view of 
the point cloud?", "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: bag\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_132_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_132_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_132_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_132_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_132_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_132_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bin\nB: cabinet\nC: display\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bin\nB: cabinet\nC: display\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_133_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_133_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_133_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_133_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_133_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_133_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: cabinet\nC: bag\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: bag\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_134_0.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_134_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_134_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_134_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_134_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_134_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bed\nC: cabinet\nD: shelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bed\nC: cabinet\nD: shelf", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_135_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_135_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_135_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_135_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_135_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_135_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: toilet\nB: cabinet\nC: sink\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: toilet\nB: cabinet\nC: sink\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_136_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_136_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_136_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_136_3.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_136_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_136_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: pillow\nB: display\nC: chair\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: pillow\nB: display\nC: chair\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_137_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_137_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_137_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_137_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_137_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_137_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bed\nC: cabinet\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bed\nC: cabinet\nD: chair", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_138_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_138_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_138_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_138_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_138_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_138_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": 
"threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bed\nC: box\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bed\nC: box\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_139_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_139_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_139_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_139_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_139_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_139_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bin\nB: display\nC: sink\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bin\nB: display\nC: sink\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_140_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_140_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_140_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_140_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_140_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_140_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: cabinet\nC: table\nD: sink", "question": "What is the category of the point cloud based on the multi-view of 
the point cloud?", "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: table\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_141_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_141_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_141_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_141_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_141_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_141_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: shelf\nC: bed\nD: sofa", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: shelf\nC: bed\nD: sofa", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_142_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_142_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_142_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_142_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_142_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_142_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: box\nC: sink\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: box\nC: sink\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_143_0.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_143_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_143_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_143_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_143_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_143_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bin\nB: cabinet\nC: shelf\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bin\nB: cabinet\nC: shelf\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_144_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_144_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_144_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_144_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_144_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_144_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: shelf\nC: chair\nD: table", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: shelf\nC: chair\nD: table", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_145_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_145_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_145_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_145_3.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_145_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_145_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bed\nC: box\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bed\nC: box\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_146_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_146_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_146_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_146_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_146_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_146_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: desk\nB: bed\nC: cabinet\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: desk\nB: bed\nC: cabinet\nD: chair", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_147_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_147_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_147_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_147_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_147_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_147_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", 
"visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: chair\nB: bed\nC: cabinet\nD: sofa", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: bed\nC: cabinet\nD: sofa", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_148_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_148_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_148_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_148_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_148_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_148_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: cabinet\nC: display\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: display\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_149_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_149_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_149_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_149_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_149_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_149_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: sink\nC: bag\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", 
"context": "Select from the following choices.\nA: bed\nB: sink\nC: bag\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_150_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_150_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_150_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_150_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_150_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_150_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: display\nB: sink\nC: bag\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: display\nB: sink\nC: bag\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_151_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_151_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_151_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_151_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_151_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_151_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: door\nB: cabinet\nC: bed\nD: shelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: door\nB: cabinet\nC: bed\nD: shelf", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_152_0.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_152_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_152_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_152_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_152_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_152_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: bin\nC: shelf\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: bin\nC: shelf\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_153_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_153_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_153_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_153_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_153_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_153_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: chair\nB: bed\nC: pillow\nD: sofa", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: bed\nC: pillow\nD: sofa", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_154_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_154_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_154_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_154_3.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_154_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_154_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: cabinet\nC: bed\nD: display", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: bed\nD: display", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_155_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_155_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_155_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_155_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_155_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_155_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bag\nC: sink\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bag\nC: sink\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_156_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_156_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_156_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_156_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_156_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_156_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": 
"threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bin\nB: display\nC: toilet\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bin\nB: display\nC: toilet\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_157_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_157_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_157_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_157_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_157_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_157_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: chair\nB: sink\nC: cabinet\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: sink\nC: cabinet\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_158_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_158_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_158_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_158_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_158_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_158_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: shelf\nB: bed\nC: table\nD: cabinet", "question": "What is the category of the point cloud based on the 
multi-view of the point cloud?", "context": "Select from the following choices.\nA: shelf\nB: bed\nC: table\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_159_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_159_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_159_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_159_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_159_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_159_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: pillow\nC: bed\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: pillow\nC: bed\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_160_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_160_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_160_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_160_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_160_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_160_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bin\nC: cabinet\nD: bag", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bin\nC: cabinet\nD: bag", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_161_0.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_161_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_161_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_161_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_161_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_161_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: toilet\nC: cabinet\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: toilet\nC: cabinet\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_162_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_162_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_162_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_162_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_162_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_162_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bed\nC: display\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bed\nC: display\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_163_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_163_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_163_2.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_163_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_163_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_163_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: bin\nC: cabinet\nD: door", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: bin\nC: cabinet\nD: door", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_164_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_164_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_164_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_164_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_164_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_164_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bag\nB: sink\nC: door\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bag\nB: sink\nC: door\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_165_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_165_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_165_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_165_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_165_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_165_5.jpg"], 
"output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: desk\nB: cabinet\nC: shelf\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: desk\nB: cabinet\nC: shelf\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_166_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_166_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_166_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_166_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_166_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_166_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: chair\nB: cabinet\nC: desk\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: cabinet\nC: desk\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_167_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_167_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_167_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_167_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_167_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_167_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sofa\nB: chair\nC: desk\nD: bed", "question": "What is the category 
of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sofa\nB: chair\nC: desk\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_168_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_168_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_168_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_168_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_168_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_168_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: display\nB: bag\nC: sink\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: display\nB: bag\nC: sink\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_169_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_169_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_169_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_169_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_169_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_169_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bag\nB: cabinet\nC: bed\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bag\nB: cabinet\nC: bed\nD: sink", "input_image_path": 
["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_170_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_170_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_170_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_170_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_170_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_170_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: shelf\nB: cabinet\nC: sink\nD: door", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: shelf\nB: cabinet\nC: sink\nD: door", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_171_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_171_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_171_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_171_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_171_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_171_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: sink\nC: door\nD: shelf", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: sink\nC: door\nD: shelf", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_172_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_172_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_172_2.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_172_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_172_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_172_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: shelf\nC: sink\nD: door", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: shelf\nC: sink\nD: door", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_173_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_173_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_173_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_173_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_173_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_173_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: cabinet\nC: sink\nD: table", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: cabinet\nC: sink\nD: table", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_174_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_174_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_174_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_174_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_174_4.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_174_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: table\nB: cabinet\nC: sink\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: table\nB: cabinet\nC: sink\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_175_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_175_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_175_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_175_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_175_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_175_5.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: chair\nB: sink\nC: cabinet\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: sink\nC: cabinet\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_176_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_176_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_176_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_176_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_176_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_176_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", 
"options": "A: chair\nB: shelf\nC: cabinet\nD: display", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: shelf\nC: cabinet\nD: display", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_177_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_177_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_177_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_177_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_177_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_177_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bed\nC: bag\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bed\nC: bag\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_178_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_178_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_178_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_178_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_178_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_178_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: chair\nC: cabinet\nD: bag", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: chair\nC: 
cabinet\nD: bag", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_179_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_179_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_179_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_179_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_179_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_179_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bed\nC: table\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bed\nC: table\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_180_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_180_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_180_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_180_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_180_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_180_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bed\nB: door\nC: sink\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: door\nC: sink\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_181_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_181_1.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_181_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_181_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_181_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_181_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: sofa\nC: bed\nD: bin", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: sofa\nC: bed\nD: bin", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_182_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_182_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_182_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_182_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_182_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_182_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: shelf\nB: display\nC: bed\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: shelf\nB: display\nC: bed\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_183_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_183_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_183_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_183_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_183_4.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_183_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: display\nB: sofa\nC: pillow\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: display\nB: sofa\nC: pillow\nD: chair", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_184_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_184_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_184_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_184_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_184_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_184_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bin\nB: bed\nC: cabinet\nD: pillow", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bin\nB: bed\nC: cabinet\nD: pillow", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_185_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_185_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_185_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_185_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_185_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_185_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": 
"ScanObjectNN", "options": "A: bed\nB: chair\nC: sink\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: chair\nC: sink\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_186_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_186_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_186_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_186_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_186_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_186_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: display\nB: cabinet\nC: shelf\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: display\nB: cabinet\nC: shelf\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_187_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_187_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_187_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_187_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_187_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_187_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: cabinet\nC: door\nD: bag", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: 
cabinet\nC: door\nD: bag", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_188_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_188_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_188_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_188_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_188_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_188_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: cabinet\nC: table\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: cabinet\nC: table\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_189_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_189_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_189_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_189_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_189_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_189_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: shelf\nC: bed\nD: chair", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: shelf\nC: bed\nD: chair", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_190_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_190_1.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_190_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_190_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_190_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_190_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: display\nB: cabinet\nC: bed\nD: sofa", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: display\nB: cabinet\nC: bed\nD: sofa", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_191_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_191_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_191_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_191_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_191_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_191_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: door\nC: cabinet\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: door\nC: cabinet\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_192_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_192_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_192_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_192_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_192_4.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_192_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bag\nB: table\nC: sink\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bag\nB: table\nC: sink\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_193_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_193_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_193_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_193_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_193_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_193_5.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: shelf\nB: bed\nC: cabinet\nD: door", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: shelf\nB: bed\nC: cabinet\nD: door", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_194_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_194_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_194_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_194_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_194_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_194_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", 
"options": "A: bed\nB: sink\nC: toilet\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bed\nB: sink\nC: toilet\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_195_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_195_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_195_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_195_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_195_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_195_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: sink\nB: bed\nC: bin\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: sink\nB: bed\nC: bin\nD: cabinet", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_196_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_196_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_196_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_196_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_196_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_196_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: bag\nB: sink\nC: bed\nD: cabinet", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: bag\nB: sink\nC: bed\nD: cabinet", 
"input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_197_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_197_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_197_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_197_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_197_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_197_5.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: chair\nB: table\nC: sofa\nD: bed", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: chair\nB: table\nC: sofa\nD: bed", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_198_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_198_1.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_198_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_198_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_198_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_198_5.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "threed_indoor_recognition", "visual_input_component": "Poine cloud image", "source": "ScanObjectNN", "options": "A: cabinet\nB: bin\nC: bed\nD: sink", "question": "What is the category of the point cloud based on the multi-view of the point cloud?", "context": "Select from the following choices.\nA: cabinet\nB: bin\nC: bed\nD: sink", "input_image_path": ["./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_199_0.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_199_1.jpg", 
"./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_199_2.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_199_3.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_199_4.jpg", "./3D-spatial/threed_indoor_recognition/threed_indoor_recognition_199_5.jpg"], "output": "D", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/vehicle_retrieval/qwen3-vl/metadata_info.json b/results/vehicle_retrieval/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..5fd9be2 --- /dev/null +++ b/results/vehicle_retrieval/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_0_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_0_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_0_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_0_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_0_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_1_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_1_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_1_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_1_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_1_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_2_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_2_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_2_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_2_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_2_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_3_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_3_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_3_2.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_3_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_3_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_4_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_4_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_4_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_4_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_4_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_5_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_5_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_5_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_5_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_5_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", 
"visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_6_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_6_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_6_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_6_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_6_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_7_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_7_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_7_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_7_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_7_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the 
query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_8_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_8_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_8_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_8_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_8_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_9_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_9_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_9_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_9_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_9_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_10_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_10_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_10_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_10_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_10_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_11_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_11_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_11_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_11_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_11_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_12_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_12_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_12_2.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_12_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_12_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_13_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_13_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_13_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_13_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_13_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_14_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_14_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_14_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_14_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_14_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", 
"visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_15_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_15_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_15_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_15_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_15_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_16_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_16_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_16_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_16_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_16_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle 
to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_17_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_17_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_17_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_17_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_17_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_18_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_18_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_18_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_18_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_18_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_19_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_19_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_19_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_19_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_19_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_20_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_20_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_20_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_20_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_20_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_21_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_21_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_21_2.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_21_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_21_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_22_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_22_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_22_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_22_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_22_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_23_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_23_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_23_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_23_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_23_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", 
"visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_24_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_24_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_24_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_24_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_24_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_25_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_25_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_25_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_25_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_25_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle 
to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_26_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_26_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_26_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_26_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_26_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_27_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_27_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_27_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_27_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_27_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_28_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_28_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_28_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_28_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_28_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_29_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_29_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_29_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_29_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_29_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_30_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_30_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_30_2.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_30_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_30_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_31_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_31_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_31_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_31_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_31_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_32_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_32_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_32_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_32_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_32_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", 
"visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_33_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_33_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_33_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_33_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_33_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_34_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_34_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_34_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_34_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_34_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle 
to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_35_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_35_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_35_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_35_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_35_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_36_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_36_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_36_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_36_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_36_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_37_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_37_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_37_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_37_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_37_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_38_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_38_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_38_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_38_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_38_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_39_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_39_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_39_2.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_39_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_39_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_40_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_40_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_40_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_40_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_40_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_41_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_41_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_41_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_41_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_41_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", 
"visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_42_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_42_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_42_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_42_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_42_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_43_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_43_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_43_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_43_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_43_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle 
to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_44_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_44_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_44_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_44_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_44_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_45_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_45_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_45_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_45_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_45_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_46_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_46_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_46_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_46_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_46_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_47_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_47_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_47_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_47_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_47_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_48_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_48_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_48_2.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_48_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_48_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_49_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_49_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_49_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_49_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_49_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_50_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_50_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_50_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_50_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_50_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", 
"visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_51_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_51_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_51_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_51_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_51_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_52_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_52_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_52_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_52_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_52_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle 
to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_53_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_53_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_53_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_53_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_53_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_54_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_54_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_54_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_54_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_54_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_55_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_55_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_55_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_55_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_55_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_56_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_56_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_56_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_56_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_56_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_57_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_57_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_57_2.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_57_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_57_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_58_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_58_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_58_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_58_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_58_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_59_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_59_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_59_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_59_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_59_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", 
"visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_60_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_60_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_60_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_60_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_60_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_61_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_61_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_61_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_61_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_61_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle 
to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_62_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_62_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_62_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_62_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_62_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_63_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_63_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_63_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_63_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_63_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_64_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_64_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_64_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_64_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_64_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_65_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_65_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_65_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_65_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_65_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_66_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_66_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_66_2.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_66_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_66_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_67_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_67_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_67_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_67_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_67_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_68_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_68_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_68_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_68_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_68_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", 
"visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_69_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_69_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_69_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_69_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_69_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_70_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_70_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_70_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_70_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_70_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle 
to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_71_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_71_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_71_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_71_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_71_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_72_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_72_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_72_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_72_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_72_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_73_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_73_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_73_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_73_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_73_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_74_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_74_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_74_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_74_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_74_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_75_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_75_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_75_2.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_75_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_75_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_76_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_76_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_76_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_76_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_76_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_77_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_77_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_77_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_77_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_77_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", 
"visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_78_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_78_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_78_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_78_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_78_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_79_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_79_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_79_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_79_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_79_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle 
to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_80_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_80_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_80_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_80_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_80_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_81_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_81_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_81_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_81_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_81_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_82_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_82_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_82_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_82_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_82_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_83_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_83_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_83_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_83_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_83_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_84_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_84_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_84_2.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_84_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_84_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_85_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_85_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_85_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_85_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_85_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_86_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_86_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_86_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_86_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_86_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", 
"visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_87_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_87_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_87_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_87_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_87_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_88_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_88_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_88_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_88_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_88_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle 
to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_89_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_89_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_89_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_89_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_89_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_90_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_90_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_90_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_90_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_90_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_91_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_91_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_91_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_91_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_91_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_92_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_92_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_92_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_92_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_92_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_93_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_93_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_93_2.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_93_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_93_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_94_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_94_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_94_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_94_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_94_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_95_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_95_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_95_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_95_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_95_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", 
"visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_96_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_96_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_96_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_96_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_96_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_97_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_97_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_97_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_97_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_97_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle 
to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_98_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_98_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_98_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_98_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_98_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_99_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_99_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_99_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_99_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_99_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_100_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_100_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_100_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_100_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_100_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_101_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_101_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_101_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_101_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_101_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_102_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_102_1.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_102_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_102_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_102_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_103_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_103_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_103_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_103_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_103_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_104_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_104_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_104_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_104_3.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_104_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_105_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_105_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_105_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_105_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_105_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_106_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_106_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_106_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_106_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_106_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": 
"veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_107_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_107_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_107_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_107_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_107_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_108_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_108_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_108_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_108_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_108_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is 
the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_109_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_109_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_109_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_109_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_109_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_110_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_110_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_110_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_110_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_110_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_111_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_111_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_111_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_111_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_111_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_112_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_112_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_112_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_112_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_112_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_113_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_113_1.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_113_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_113_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_113_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_114_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_114_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_114_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_114_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_114_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_115_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_115_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_115_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_115_3.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_115_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_116_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_116_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_116_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_116_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_116_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_117_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_117_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_117_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_117_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_117_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": 
"veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_118_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_118_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_118_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_118_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_118_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_119_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_119_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_119_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_119_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_119_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is 
the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_120_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_120_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_120_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_120_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_120_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_121_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_121_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_121_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_121_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_121_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_122_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_122_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_122_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_122_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_122_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_123_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_123_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_123_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_123_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_123_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_124_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_124_1.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_124_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_124_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_124_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_125_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_125_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_125_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_125_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_125_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_126_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_126_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_126_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_126_3.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_126_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_127_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_127_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_127_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_127_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_127_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_128_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_128_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_128_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_128_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_128_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": 
"veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_129_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_129_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_129_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_129_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_129_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_130_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_130_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_130_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_130_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_130_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is 
the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_131_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_131_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_131_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_131_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_131_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_132_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_132_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_132_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_132_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_132_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_133_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_133_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_133_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_133_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_133_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_134_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_134_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_134_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_134_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_134_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_135_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_135_1.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_135_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_135_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_135_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_136_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_136_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_136_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_136_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_136_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_137_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_137_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_137_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_137_3.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_137_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_138_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_138_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_138_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_138_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_138_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_139_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_139_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_139_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_139_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_139_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": 
"veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_140_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_140_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_140_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_140_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_140_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_141_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_141_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_141_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_141_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_141_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is 
the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_142_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_142_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_142_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_142_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_142_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_143_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_143_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_143_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_143_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_143_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_144_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_144_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_144_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_144_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_144_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_145_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_145_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_145_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_145_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_145_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_146_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_146_1.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_146_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_146_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_146_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_147_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_147_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_147_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_147_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_147_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_148_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_148_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_148_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_148_3.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_148_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_149_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_149_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_149_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_149_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_149_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_150_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_150_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_150_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_150_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_150_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": 
"veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_151_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_151_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_151_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_151_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_151_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_152_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_152_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_152_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_152_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_152_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is 
the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_153_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_153_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_153_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_153_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_153_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_154_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_154_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_154_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_154_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_154_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_155_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_155_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_155_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_155_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_155_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_156_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_156_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_156_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_156_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_156_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_157_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_157_1.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_157_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_157_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_157_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_158_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_158_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_158_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_158_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_158_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_159_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_159_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_159_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_159_3.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_159_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_160_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_160_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_160_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_160_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_160_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_161_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_161_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_161_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_161_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_161_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": 
"veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_162_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_162_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_162_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_162_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_162_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_163_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_163_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_163_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_163_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_163_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is 
the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_164_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_164_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_164_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_164_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_164_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_165_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_165_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_165_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_165_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_165_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_166_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_166_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_166_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_166_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_166_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_167_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_167_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_167_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_167_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_167_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_168_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_168_1.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_168_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_168_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_168_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_169_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_169_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_169_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_169_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_169_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_170_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_170_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_170_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_170_3.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_170_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_171_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_171_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_171_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_171_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_171_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_172_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_172_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_172_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_172_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_172_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": 
"veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_173_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_173_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_173_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_173_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_173_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_174_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_174_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_174_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_174_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_174_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is 
the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_175_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_175_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_175_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_175_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_175_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_176_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_176_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_176_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_176_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_176_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_177_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_177_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_177_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_177_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_177_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_178_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_178_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_178_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_178_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_178_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_179_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_179_1.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_179_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_179_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_179_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_180_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_180_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_180_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_180_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_180_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_181_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_181_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_181_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_181_3.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_181_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_182_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_182_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_182_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_182_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_182_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_183_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_183_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_183_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_183_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_183_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": 
"veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_184_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_184_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_184_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_184_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_184_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_185_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_185_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_185_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_185_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_185_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is 
the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_186_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_186_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_186_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_186_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_186_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_187_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_187_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_187_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_187_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_187_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_188_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_188_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_188_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_188_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_188_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_189_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_189_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_189_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_189_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_189_4.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_190_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_190_1.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_190_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_190_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_190_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_191_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_191_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_191_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_191_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_191_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_192_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_192_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_192_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_192_3.jpg", 
"./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_192_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_193_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_193_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_193_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_193_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_193_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_194_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_194_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_194_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_194_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_194_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": 
"veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_195_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_195_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_195_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_195_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_195_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_196_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_196_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_196_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_196_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_196_4.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is 
the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_197_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_197_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_197_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_197_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_197_4.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": ["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_198_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_198_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_198_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_198_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_198_4.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "vehicle_retrieval", "visual_input_component": "['natural_image']", "source": "veri_retrieval", "options": "A: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "question": "Please retrieve the most similar vehicle to the query in the candidate.The query image is the first image.", "context": "Select from the following choices.\nA: The second image\nB: The third image\nC: The fourth image\nD: The fifth image", "input_image_path": 
["./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_199_0.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_199_1.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_199_2.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_199_3.jpg", "./High-level-obj-semantic/vehicle_retrieval/vehicle_retrieval_199_4.jpg"], "output": "D", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/video_captioning/qwen3-vl/metadata_info.json b/results/video_captioning/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..ca1211a --- /dev/null +++ b/results/video_captioning/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman with a guitar sings on a farm\nB: a man with a guitar plays in a park\nC: a man with a guitar sings on a farm\nD: a man with a guitar sings in a city", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman with a guitar sings on a farm\nB: a man with a guitar plays in a park\nC: a man with a guitar sings on a farm\nD: a man with a guitar sings in a city", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_0_0.png", "./Continuous-temporal/video_captioning/video_captioning_0_1.png", "./Continuous-temporal/video_captioning/video_captioning_0_2.png", "./Continuous-temporal/video_captioning/video_captioning_0_3.png", "./Continuous-temporal/video_captioning/video_captioning_0_4.png", "./Continuous-temporal/video_captioning/video_captioning_0_5.png", "./Continuous-temporal/video_captioning/video_captioning_0_6.png", "./Continuous-temporal/video_captioning/video_captioning_0_7.png", "./Continuous-temporal/video_captioning/video_captioning_0_8.png", "./Continuous-temporal/video_captioning/video_captioning_0_9.png", 
"./Continuous-temporal/video_captioning/video_captioning_0_10.png", "./Continuous-temporal/video_captioning/video_captioning_0_11.png", "./Continuous-temporal/video_captioning/video_captioning_0_12.png", "./Continuous-temporal/video_captioning/video_captioning_0_13.png", "./Continuous-temporal/video_captioning/video_captioning_0_14.png", "./Continuous-temporal/video_captioning/video_captioning_0_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman is performing on a talent show\nB: a girl is singing in a karaoke competition\nC: a man is auditioning for a role in a musical\nD: a boy is trying out for a part on the voice kids", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman is performing on a talent show\nB: a girl is singing in a karaoke competition\nC: a man is auditioning for a role in a musical\nD: a boy is trying out for a part on the voice kids", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_1_0.png", "./Continuous-temporal/video_captioning/video_captioning_1_1.png", "./Continuous-temporal/video_captioning/video_captioning_1_2.png", "./Continuous-temporal/video_captioning/video_captioning_1_3.png", "./Continuous-temporal/video_captioning/video_captioning_1_4.png", "./Continuous-temporal/video_captioning/video_captioning_1_5.png", "./Continuous-temporal/video_captioning/video_captioning_1_6.png", "./Continuous-temporal/video_captioning/video_captioning_1_7.png", "./Continuous-temporal/video_captioning/video_captioning_1_8.png", "./Continuous-temporal/video_captioning/video_captioning_1_9.png", "./Continuous-temporal/video_captioning/video_captioning_1_10.png", "./Continuous-temporal/video_captioning/video_captioning_1_11.png", "./Continuous-temporal/video_captioning/video_captioning_1_12.png", 
"./Continuous-temporal/video_captioning/video_captioning_1_13.png", "./Continuous-temporal/video_captioning/video_captioning_1_14.png", "./Continuous-temporal/video_captioning/video_captioning_1_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a girl is riding a bicycle\nB: a man is painting a house\nC: a woman is playing the piano\nD: a boy is eating pizza", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a girl is riding a bicycle\nB: a man is painting a house\nC: a woman is playing the piano\nD: a boy is eating pizza", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_2_0.png", "./Continuous-temporal/video_captioning/video_captioning_2_1.png", "./Continuous-temporal/video_captioning/video_captioning_2_2.png", "./Continuous-temporal/video_captioning/video_captioning_2_3.png", "./Continuous-temporal/video_captioning/video_captioning_2_4.png", "./Continuous-temporal/video_captioning/video_captioning_2_5.png", "./Continuous-temporal/video_captioning/video_captioning_2_6.png", "./Continuous-temporal/video_captioning/video_captioning_2_7.png", "./Continuous-temporal/video_captioning/video_captioning_2_8.png", "./Continuous-temporal/video_captioning/video_captioning_2_9.png", "./Continuous-temporal/video_captioning/video_captioning_2_10.png", "./Continuous-temporal/video_captioning/video_captioning_2_11.png", "./Continuous-temporal/video_captioning/video_captioning_2_12.png", "./Continuous-temporal/video_captioning/video_captioning_2_13.png", "./Continuous-temporal/video_captioning/video_captioning_2_14.png", "./Continuous-temporal/video_captioning/video_captioning_2_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": 
"A: a football match in progress\nB: a cat drinking milk from a bowl\nC: a chef preparing a gourmet meal\nD: the actor playing thor talking about the new movie", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a football match in progress\nB: a cat drinking milk from a bowl\nC: a chef preparing a gourmet meal\nD: the actor playing thor talking about the new movie", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_3_0.png", "./Continuous-temporal/video_captioning/video_captioning_3_1.png", "./Continuous-temporal/video_captioning/video_captioning_3_2.png", "./Continuous-temporal/video_captioning/video_captioning_3_3.png", "./Continuous-temporal/video_captioning/video_captioning_3_4.png", "./Continuous-temporal/video_captioning/video_captioning_3_5.png", "./Continuous-temporal/video_captioning/video_captioning_3_6.png", "./Continuous-temporal/video_captioning/video_captioning_3_7.png", "./Continuous-temporal/video_captioning/video_captioning_3_8.png", "./Continuous-temporal/video_captioning/video_captioning_3_9.png", "./Continuous-temporal/video_captioning/video_captioning_3_10.png", "./Continuous-temporal/video_captioning/video_captioning_3_11.png", "./Continuous-temporal/video_captioning/video_captioning_3_12.png", "./Continuous-temporal/video_captioning/video_captioning_3_13.png", "./Continuous-temporal/video_captioning/video_captioning_3_14.png", "./Continuous-temporal/video_captioning/video_captioning_3_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a female reporter is interviewing a man on the beach when she falls into a sand castle\nB: a group of people playing volleyball on the beach\nC: a male reporter is interviewing a woman in a restaurant when he spills his coffee\nD: a man building a sand castle while talking on the 
phone", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a female reporter is interviewing a man on the beach when she falls into a sand castle\nB: a group of people playing volleyball on the beach\nC: a male reporter is interviewing a woman in a restaurant when he spills his coffee\nD: a man building a sand castle while talking on the phone", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_4_0.png", "./Continuous-temporal/video_captioning/video_captioning_4_1.png", "./Continuous-temporal/video_captioning/video_captioning_4_2.png", "./Continuous-temporal/video_captioning/video_captioning_4_3.png", "./Continuous-temporal/video_captioning/video_captioning_4_4.png", "./Continuous-temporal/video_captioning/video_captioning_4_5.png", "./Continuous-temporal/video_captioning/video_captioning_4_6.png", "./Continuous-temporal/video_captioning/video_captioning_4_7.png", "./Continuous-temporal/video_captioning/video_captioning_4_8.png", "./Continuous-temporal/video_captioning/video_captioning_4_9.png", "./Continuous-temporal/video_captioning/video_captioning_4_10.png", "./Continuous-temporal/video_captioning/video_captioning_4_11.png", "./Continuous-temporal/video_captioning/video_captioning_4_12.png", "./Continuous-temporal/video_captioning/video_captioning_4_13.png", "./Continuous-temporal/video_captioning/video_captioning_4_14.png", "./Continuous-temporal/video_captioning/video_captioning_4_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a dog is jumping on a trampoline\nB: a cat is sleeping on a sofa\nC: a car is driving on the highway\nD: a bird is flying in the sky", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a dog is jumping on a 
trampoline\nB: a cat is sleeping on a sofa\nC: a car is driving on the highway\nD: a bird is flying in the sky", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_5_0.png", "./Continuous-temporal/video_captioning/video_captioning_5_1.png", "./Continuous-temporal/video_captioning/video_captioning_5_2.png", "./Continuous-temporal/video_captioning/video_captioning_5_3.png", "./Continuous-temporal/video_captioning/video_captioning_5_4.png", "./Continuous-temporal/video_captioning/video_captioning_5_5.png", "./Continuous-temporal/video_captioning/video_captioning_5_6.png", "./Continuous-temporal/video_captioning/video_captioning_5_7.png", "./Continuous-temporal/video_captioning/video_captioning_5_8.png", "./Continuous-temporal/video_captioning/video_captioning_5_9.png", "./Continuous-temporal/video_captioning/video_captioning_5_10.png", "./Continuous-temporal/video_captioning/video_captioning_5_11.png", "./Continuous-temporal/video_captioning/video_captioning_5_12.png", "./Continuous-temporal/video_captioning/video_captioning_5_13.png", "./Continuous-temporal/video_captioning/video_captioning_5_14.png", "./Continuous-temporal/video_captioning/video_captioning_5_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman is planting flowers in the garden\nB: a group of people is having a picnic in the park\nC: a man is opening a box and showing a machine gun\nD: a child is playing with a toy car", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman is planting flowers in the garden\nB: a group of people is having a picnic in the park\nC: a man is opening a box and showing a machine gun\nD: a child is playing with a toy car", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_6_0.png", 
"./Continuous-temporal/video_captioning/video_captioning_6_1.png", "./Continuous-temporal/video_captioning/video_captioning_6_2.png", "./Continuous-temporal/video_captioning/video_captioning_6_3.png", "./Continuous-temporal/video_captioning/video_captioning_6_4.png", "./Continuous-temporal/video_captioning/video_captioning_6_5.png", "./Continuous-temporal/video_captioning/video_captioning_6_6.png", "./Continuous-temporal/video_captioning/video_captioning_6_7.png", "./Continuous-temporal/video_captioning/video_captioning_6_8.png", "./Continuous-temporal/video_captioning/video_captioning_6_9.png", "./Continuous-temporal/video_captioning/video_captioning_6_10.png", "./Continuous-temporal/video_captioning/video_captioning_6_11.png", "./Continuous-temporal/video_captioning/video_captioning_6_12.png", "./Continuous-temporal/video_captioning/video_captioning_6_13.png", "./Continuous-temporal/video_captioning/video_captioning_6_14.png", "./Continuous-temporal/video_captioning/video_captioning_6_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman eating an orange substance\nB: a woman cooking an orange substance\nC: a woman peeling an orange substance\nD: a woman cutting an orange substance", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman eating an orange substance\nB: a woman cooking an orange substance\nC: a woman peeling an orange substance\nD: a woman cutting an orange substance", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_7_0.png", "./Continuous-temporal/video_captioning/video_captioning_7_1.png", "./Continuous-temporal/video_captioning/video_captioning_7_2.png", "./Continuous-temporal/video_captioning/video_captioning_7_3.png", "./Continuous-temporal/video_captioning/video_captioning_7_4.png", 
"./Continuous-temporal/video_captioning/video_captioning_7_5.png", "./Continuous-temporal/video_captioning/video_captioning_7_6.png", "./Continuous-temporal/video_captioning/video_captioning_7_7.png", "./Continuous-temporal/video_captioning/video_captioning_7_8.png", "./Continuous-temporal/video_captioning/video_captioning_7_9.png", "./Continuous-temporal/video_captioning/video_captioning_7_10.png", "./Continuous-temporal/video_captioning/video_captioning_7_11.png", "./Continuous-temporal/video_captioning/video_captioning_7_12.png", "./Continuous-temporal/video_captioning/video_captioning_7_13.png", "./Continuous-temporal/video_captioning/video_captioning_7_14.png", "./Continuous-temporal/video_captioning/video_captioning_7_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the chef is slicing tomatoes\nB: the woman is breading pork chop\nC: the man is grilling vegetables\nD: the girl is painting a landscape", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the chef is slicing tomatoes\nB: the woman is breading pork chop\nC: the man is grilling vegetables\nD: the girl is painting a landscape", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_8_0.png", "./Continuous-temporal/video_captioning/video_captioning_8_1.png", "./Continuous-temporal/video_captioning/video_captioning_8_2.png", "./Continuous-temporal/video_captioning/video_captioning_8_3.png", "./Continuous-temporal/video_captioning/video_captioning_8_4.png", "./Continuous-temporal/video_captioning/video_captioning_8_5.png", "./Continuous-temporal/video_captioning/video_captioning_8_6.png", "./Continuous-temporal/video_captioning/video_captioning_8_7.png", "./Continuous-temporal/video_captioning/video_captioning_8_8.png", 
"./Continuous-temporal/video_captioning/video_captioning_8_9.png", "./Continuous-temporal/video_captioning/video_captioning_8_10.png", "./Continuous-temporal/video_captioning/video_captioning_8_11.png", "./Continuous-temporal/video_captioning/video_captioning_8_12.png", "./Continuous-temporal/video_captioning/video_captioning_8_13.png", "./Continuous-temporal/video_captioning/video_captioning_8_14.png", "./Continuous-temporal/video_captioning/video_captioning_8_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the camera focuses on a flower blooming in slow motion\nB: the screen shows a person sleeping in bed\nC: there is no sound while the screen shows a person playing a computer game\nD: there is sound of birds chirping in the background", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the camera focuses on a flower blooming in slow motion\nB: the screen shows a person sleeping in bed\nC: there is no sound while the screen shows a person playing a computer game\nD: there is sound of birds chirping in the background", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_9_0.png", "./Continuous-temporal/video_captioning/video_captioning_9_1.png", "./Continuous-temporal/video_captioning/video_captioning_9_2.png", "./Continuous-temporal/video_captioning/video_captioning_9_3.png", "./Continuous-temporal/video_captioning/video_captioning_9_4.png", "./Continuous-temporal/video_captioning/video_captioning_9_5.png", "./Continuous-temporal/video_captioning/video_captioning_9_6.png", "./Continuous-temporal/video_captioning/video_captioning_9_7.png", "./Continuous-temporal/video_captioning/video_captioning_9_8.png", "./Continuous-temporal/video_captioning/video_captioning_9_9.png", 
"./Continuous-temporal/video_captioning/video_captioning_9_10.png", "./Continuous-temporal/video_captioning/video_captioning_9_11.png", "./Continuous-temporal/video_captioning/video_captioning_9_12.png", "./Continuous-temporal/video_captioning/video_captioning_9_13.png", "./Continuous-temporal/video_captioning/video_captioning_9_14.png", "./Continuous-temporal/video_captioning/video_captioning_9_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: someone stirring soup\nB: someone writing on a chalkboard\nC: someone watering plants\nD: someone slicing vegetable", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: someone stirring soup\nB: someone writing on a chalkboard\nC: someone watering plants\nD: someone slicing vegetable", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_10_0.png", "./Continuous-temporal/video_captioning/video_captioning_10_1.png", "./Continuous-temporal/video_captioning/video_captioning_10_2.png", "./Continuous-temporal/video_captioning/video_captioning_10_3.png", "./Continuous-temporal/video_captioning/video_captioning_10_4.png", "./Continuous-temporal/video_captioning/video_captioning_10_5.png", "./Continuous-temporal/video_captioning/video_captioning_10_6.png", "./Continuous-temporal/video_captioning/video_captioning_10_7.png", "./Continuous-temporal/video_captioning/video_captioning_10_8.png", "./Continuous-temporal/video_captioning/video_captioning_10_9.png", "./Continuous-temporal/video_captioning/video_captioning_10_10.png", "./Continuous-temporal/video_captioning/video_captioning_10_11.png", "./Continuous-temporal/video_captioning/video_captioning_10_12.png", "./Continuous-temporal/video_captioning/video_captioning_10_13.png", "./Continuous-temporal/video_captioning/video_captioning_10_14.png", 
"./Continuous-temporal/video_captioning/video_captioning_10_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a person in a red top is holding a yellow drink\nB: a woman in a yellow top is holding a red drink\nC: a woman in a blue top is holding a yellow drink\nD: a man in a green shirt is holding a blue drink", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a person in a red top is holding a yellow drink\nB: a woman in a yellow top is holding a red drink\nC: a woman in a blue top is holding a yellow drink\nD: a man in a green shirt is holding a blue drink", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_11_0.png", "./Continuous-temporal/video_captioning/video_captioning_11_1.png", "./Continuous-temporal/video_captioning/video_captioning_11_2.png", "./Continuous-temporal/video_captioning/video_captioning_11_3.png", "./Continuous-temporal/video_captioning/video_captioning_11_4.png", "./Continuous-temporal/video_captioning/video_captioning_11_5.png", "./Continuous-temporal/video_captioning/video_captioning_11_6.png", "./Continuous-temporal/video_captioning/video_captioning_11_7.png", "./Continuous-temporal/video_captioning/video_captioning_11_8.png", "./Continuous-temporal/video_captioning/video_captioning_11_9.png", "./Continuous-temporal/video_captioning/video_captioning_11_10.png", "./Continuous-temporal/video_captioning/video_captioning_11_11.png", "./Continuous-temporal/video_captioning/video_captioning_11_12.png", "./Continuous-temporal/video_captioning/video_captioning_11_13.png", "./Continuous-temporal/video_captioning/video_captioning_11_14.png", "./Continuous-temporal/video_captioning/video_captioning_11_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video 
image or Natural image", "source": "source", "options": "A: an artist is sketching the face of a cartoon woman on paper with her mouth wide open\nB: a doctor is performing surgery in the operating room\nC: a child is playing with a toy car on the floor\nD: a chef is cooking a meal in the kitchen", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: an artist is sketching the face of a cartoon woman on paper with her mouth wide open\nB: a doctor is performing surgery in the operating room\nC: a child is playing with a toy car on the floor\nD: a chef is cooking a meal in the kitchen", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_12_0.png", "./Continuous-temporal/video_captioning/video_captioning_12_1.png", "./Continuous-temporal/video_captioning/video_captioning_12_2.png", "./Continuous-temporal/video_captioning/video_captioning_12_3.png", "./Continuous-temporal/video_captioning/video_captioning_12_4.png", "./Continuous-temporal/video_captioning/video_captioning_12_5.png", "./Continuous-temporal/video_captioning/video_captioning_12_6.png", "./Continuous-temporal/video_captioning/video_captioning_12_7.png", "./Continuous-temporal/video_captioning/video_captioning_12_8.png", "./Continuous-temporal/video_captioning/video_captioning_12_9.png", "./Continuous-temporal/video_captioning/video_captioning_12_10.png", "./Continuous-temporal/video_captioning/video_captioning_12_11.png", "./Continuous-temporal/video_captioning/video_captioning_12_12.png", "./Continuous-temporal/video_captioning/video_captioning_12_13.png", "./Continuous-temporal/video_captioning/video_captioning_12_14.png", "./Continuous-temporal/video_captioning/video_captioning_12_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a warrior is cooking a meal\nB: a warrior is 
gardening\nC: a warrior is fighting a battle\nD: a warrior is reading a book", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a warrior is cooking a meal\nB: a warrior is gardening\nC: a warrior is fighting a battle\nD: a warrior is reading a book", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_13_0.png", "./Continuous-temporal/video_captioning/video_captioning_13_1.png", "./Continuous-temporal/video_captioning/video_captioning_13_2.png", "./Continuous-temporal/video_captioning/video_captioning_13_3.png", "./Continuous-temporal/video_captioning/video_captioning_13_4.png", "./Continuous-temporal/video_captioning/video_captioning_13_5.png", "./Continuous-temporal/video_captioning/video_captioning_13_6.png", "./Continuous-temporal/video_captioning/video_captioning_13_7.png", "./Continuous-temporal/video_captioning/video_captioning_13_8.png", "./Continuous-temporal/video_captioning/video_captioning_13_9.png", "./Continuous-temporal/video_captioning/video_captioning_13_10.png", "./Continuous-temporal/video_captioning/video_captioning_13_11.png", "./Continuous-temporal/video_captioning/video_captioning_13_12.png", "./Continuous-temporal/video_captioning/video_captioning_13_13.png", "./Continuous-temporal/video_captioning/video_captioning_13_14.png", "./Continuous-temporal/video_captioning/video_captioning_13_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: chefs are cooking in a restaurant kitchen\nB: spectators are watching a football match in a stadium\nC: dancers are performing on a stage\nD: models are walking the runway as part of a fashion show", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: chefs are cooking in a restaurant kitchen\nB: 
spectators are watching a football match in a stadium\nC: dancers are performing on a stage\nD: models are walking the runway as part of a fashion show", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_14_0.png", "./Continuous-temporal/video_captioning/video_captioning_14_1.png", "./Continuous-temporal/video_captioning/video_captioning_14_2.png", "./Continuous-temporal/video_captioning/video_captioning_14_3.png", "./Continuous-temporal/video_captioning/video_captioning_14_4.png", "./Continuous-temporal/video_captioning/video_captioning_14_5.png", "./Continuous-temporal/video_captioning/video_captioning_14_6.png", "./Continuous-temporal/video_captioning/video_captioning_14_7.png", "./Continuous-temporal/video_captioning/video_captioning_14_8.png", "./Continuous-temporal/video_captioning/video_captioning_14_9.png", "./Continuous-temporal/video_captioning/video_captioning_14_10.png", "./Continuous-temporal/video_captioning/video_captioning_14_11.png", "./Continuous-temporal/video_captioning/video_captioning_14_12.png", "./Continuous-temporal/video_captioning/video_captioning_14_13.png", "./Continuous-temporal/video_captioning/video_captioning_14_14.png", "./Continuous-temporal/video_captioning/video_captioning_14_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman works as a chef in a restaurant\nB: a man teaches piano lessons in his music studio\nC: a woman looks after abandoned children for free in her home\nD: a group of children play in a water park", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman works as a chef in a restaurant\nB: a man teaches piano lessons in his music studio\nC: a woman looks after abandoned children for free in her home\nD: a group of children play in a water park", "input_image_path": 
["./Continuous-temporal/video_captioning/video_captioning_15_0.png", "./Continuous-temporal/video_captioning/video_captioning_15_1.png", "./Continuous-temporal/video_captioning/video_captioning_15_2.png", "./Continuous-temporal/video_captioning/video_captioning_15_3.png", "./Continuous-temporal/video_captioning/video_captioning_15_4.png", "./Continuous-temporal/video_captioning/video_captioning_15_5.png", "./Continuous-temporal/video_captioning/video_captioning_15_6.png", "./Continuous-temporal/video_captioning/video_captioning_15_7.png", "./Continuous-temporal/video_captioning/video_captioning_15_8.png", "./Continuous-temporal/video_captioning/video_captioning_15_9.png", "./Continuous-temporal/video_captioning/video_captioning_15_10.png", "./Continuous-temporal/video_captioning/video_captioning_15_11.png", "./Continuous-temporal/video_captioning/video_captioning_15_12.png", "./Continuous-temporal/video_captioning/video_captioning_15_13.png", "./Continuous-temporal/video_captioning/video_captioning_15_14.png", "./Continuous-temporal/video_captioning/video_captioning_15_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: two players are playing chess\nB: two players are playing badminton\nC: two players are playing volleyball\nD: two players are playing table tennis", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: two players are playing chess\nB: two players are playing badminton\nC: two players are playing volleyball\nD: two players are playing table tennis", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_16_0.png", "./Continuous-temporal/video_captioning/video_captioning_16_1.png", "./Continuous-temporal/video_captioning/video_captioning_16_2.png", "./Continuous-temporal/video_captioning/video_captioning_16_3.png", 
"./Continuous-temporal/video_captioning/video_captioning_16_4.png", "./Continuous-temporal/video_captioning/video_captioning_16_5.png", "./Continuous-temporal/video_captioning/video_captioning_16_6.png", "./Continuous-temporal/video_captioning/video_captioning_16_7.png", "./Continuous-temporal/video_captioning/video_captioning_16_8.png", "./Continuous-temporal/video_captioning/video_captioning_16_9.png", "./Continuous-temporal/video_captioning/video_captioning_16_10.png", "./Continuous-temporal/video_captioning/video_captioning_16_11.png", "./Continuous-temporal/video_captioning/video_captioning_16_12.png", "./Continuous-temporal/video_captioning/video_captioning_16_13.png", "./Continuous-temporal/video_captioning/video_captioning_16_14.png", "./Continuous-temporal/video_captioning/video_captioning_16_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a person is cutting a pineapple\nB: a chef is grilling a steak\nC: someone is sorting vegetables\nD: someone is peeling a prawn", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a person is cutting a pineapple\nB: a chef is grilling a steak\nC: someone is sorting vegetables\nD: someone is peeling a prawn", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_17_0.png", "./Continuous-temporal/video_captioning/video_captioning_17_1.png", "./Continuous-temporal/video_captioning/video_captioning_17_2.png", "./Continuous-temporal/video_captioning/video_captioning_17_3.png", "./Continuous-temporal/video_captioning/video_captioning_17_4.png", "./Continuous-temporal/video_captioning/video_captioning_17_5.png", "./Continuous-temporal/video_captioning/video_captioning_17_6.png", "./Continuous-temporal/video_captioning/video_captioning_17_7.png", 
"./Continuous-temporal/video_captioning/video_captioning_17_8.png", "./Continuous-temporal/video_captioning/video_captioning_17_9.png", "./Continuous-temporal/video_captioning/video_captioning_17_10.png", "./Continuous-temporal/video_captioning/video_captioning_17_11.png", "./Continuous-temporal/video_captioning/video_captioning_17_12.png", "./Continuous-temporal/video_captioning/video_captioning_17_13.png", "./Continuous-temporal/video_captioning/video_captioning_17_14.png", "./Continuous-temporal/video_captioning/video_captioning_17_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a child is riding a bike\nB: a woman is watering plants\nC: a dog is fetching a ball\nD: a man is tilling a field", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a child is riding a bike\nB: a woman is watering plants\nC: a dog is fetching a ball\nD: a man is tilling a field", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_18_0.png", "./Continuous-temporal/video_captioning/video_captioning_18_1.png", "./Continuous-temporal/video_captioning/video_captioning_18_2.png", "./Continuous-temporal/video_captioning/video_captioning_18_3.png", "./Continuous-temporal/video_captioning/video_captioning_18_4.png", "./Continuous-temporal/video_captioning/video_captioning_18_5.png", "./Continuous-temporal/video_captioning/video_captioning_18_6.png", "./Continuous-temporal/video_captioning/video_captioning_18_7.png", "./Continuous-temporal/video_captioning/video_captioning_18_8.png", "./Continuous-temporal/video_captioning/video_captioning_18_9.png", "./Continuous-temporal/video_captioning/video_captioning_18_10.png", "./Continuous-temporal/video_captioning/video_captioning_18_11.png", "./Continuous-temporal/video_captioning/video_captioning_18_12.png", 
"./Continuous-temporal/video_captioning/video_captioning_18_13.png", "./Continuous-temporal/video_captioning/video_captioning_18_14.png", "./Continuous-temporal/video_captioning/video_captioning_18_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a hamster is eating broccoli\nB: a cat is sitting on a chair\nC: a bird is flying in the sky\nD: a dog is playing with a ball", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a hamster is eating broccoli\nB: a cat is sitting on a chair\nC: a bird is flying in the sky\nD: a dog is playing with a ball", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_19_0.png", "./Continuous-temporal/video_captioning/video_captioning_19_1.png", "./Continuous-temporal/video_captioning/video_captioning_19_2.png", "./Continuous-temporal/video_captioning/video_captioning_19_3.png", "./Continuous-temporal/video_captioning/video_captioning_19_4.png", "./Continuous-temporal/video_captioning/video_captioning_19_5.png", "./Continuous-temporal/video_captioning/video_captioning_19_6.png", "./Continuous-temporal/video_captioning/video_captioning_19_7.png", "./Continuous-temporal/video_captioning/video_captioning_19_8.png", "./Continuous-temporal/video_captioning/video_captioning_19_9.png", "./Continuous-temporal/video_captioning/video_captioning_19_10.png", "./Continuous-temporal/video_captioning/video_captioning_19_11.png", "./Continuous-temporal/video_captioning/video_captioning_19_12.png", "./Continuous-temporal/video_captioning/video_captioning_19_13.png", "./Continuous-temporal/video_captioning/video_captioning_19_14.png", "./Continuous-temporal/video_captioning/video_captioning_19_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural 
image", "source": "source", "options": "A: a man effortlessly jumps over a railing and continues walking without any problems\nB: a man slides down a railing and falls and hurts himself badly\nC: a man gracefully slides down a railing and lands safely at the bottom\nD: a man climbs up a railing and maintains perfect balance", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man effortlessly jumps over a railing and continues walking without any problems\nB: a man slides down a railing and falls and hurts himself badly\nC: a man gracefully slides down a railing and lands safely at the bottom\nD: a man climbs up a railing and maintains perfect balance", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_20_0.png", "./Continuous-temporal/video_captioning/video_captioning_20_1.png", "./Continuous-temporal/video_captioning/video_captioning_20_2.png", "./Continuous-temporal/video_captioning/video_captioning_20_3.png", "./Continuous-temporal/video_captioning/video_captioning_20_4.png", "./Continuous-temporal/video_captioning/video_captioning_20_5.png", "./Continuous-temporal/video_captioning/video_captioning_20_6.png", "./Continuous-temporal/video_captioning/video_captioning_20_7.png", "./Continuous-temporal/video_captioning/video_captioning_20_8.png", "./Continuous-temporal/video_captioning/video_captioning_20_9.png", "./Continuous-temporal/video_captioning/video_captioning_20_10.png", "./Continuous-temporal/video_captioning/video_captioning_20_11.png", "./Continuous-temporal/video_captioning/video_captioning_20_12.png", "./Continuous-temporal/video_captioning/video_captioning_20_13.png", "./Continuous-temporal/video_captioning/video_captioning_20_14.png", "./Continuous-temporal/video_captioning/video_captioning_20_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", 
"source": "source", "options": "A: the car is driving on the highway\nB: the car is parked in a parking lot\nC: the car is crashed in the road\nD: the car is parked in the garage", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the car is driving on the highway\nB: the car is parked in a parking lot\nC: the car is crashed in the road\nD: the car is parked in the garage", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_21_0.png", "./Continuous-temporal/video_captioning/video_captioning_21_1.png", "./Continuous-temporal/video_captioning/video_captioning_21_2.png", "./Continuous-temporal/video_captioning/video_captioning_21_3.png", "./Continuous-temporal/video_captioning/video_captioning_21_4.png", "./Continuous-temporal/video_captioning/video_captioning_21_5.png", "./Continuous-temporal/video_captioning/video_captioning_21_6.png", "./Continuous-temporal/video_captioning/video_captioning_21_7.png", "./Continuous-temporal/video_captioning/video_captioning_21_8.png", "./Continuous-temporal/video_captioning/video_captioning_21_9.png", "./Continuous-temporal/video_captioning/video_captioning_21_10.png", "./Continuous-temporal/video_captioning/video_captioning_21_11.png", "./Continuous-temporal/video_captioning/video_captioning_21_12.png", "./Continuous-temporal/video_captioning/video_captioning_21_13.png", "./Continuous-temporal/video_captioning/video_captioning_21_14.png", "./Continuous-temporal/video_captioning/video_captioning_21_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a person is dancing in a disco club\nB: a child is playing a drum set\nC: a woman is playing a guitar\nD: a man is playing an electric keyboard", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the 
following choices.\nA: a person is dancing in a disco club\nB: a child is playing a drum set\nC: a woman is playing a guitar\nD: a man is playing an electric keyboard", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_22_0.png", "./Continuous-temporal/video_captioning/video_captioning_22_1.png", "./Continuous-temporal/video_captioning/video_captioning_22_2.png", "./Continuous-temporal/video_captioning/video_captioning_22_3.png", "./Continuous-temporal/video_captioning/video_captioning_22_4.png", "./Continuous-temporal/video_captioning/video_captioning_22_5.png", "./Continuous-temporal/video_captioning/video_captioning_22_6.png", "./Continuous-temporal/video_captioning/video_captioning_22_7.png", "./Continuous-temporal/video_captioning/video_captioning_22_8.png", "./Continuous-temporal/video_captioning/video_captioning_22_9.png", "./Continuous-temporal/video_captioning/video_captioning_22_10.png", "./Continuous-temporal/video_captioning/video_captioning_22_11.png", "./Continuous-temporal/video_captioning/video_captioning_22_12.png", "./Continuous-temporal/video_captioning/video_captioning_22_13.png", "./Continuous-temporal/video_captioning/video_captioning_22_14.png", "./Continuous-temporal/video_captioning/video_captioning_22_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: advertisement for home decor\nB: display of sports shoes\nC: promotion of kitchen utensils\nD: advertisement of seat basket", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: advertisement for home decor\nB: display of sports shoes\nC: promotion of kitchen utensils\nD: advertisement of seat basket", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_23_0.png", "./Continuous-temporal/video_captioning/video_captioning_23_1.png", 
"./Continuous-temporal/video_captioning/video_captioning_23_2.png", "./Continuous-temporal/video_captioning/video_captioning_23_3.png", "./Continuous-temporal/video_captioning/video_captioning_23_4.png", "./Continuous-temporal/video_captioning/video_captioning_23_5.png", "./Continuous-temporal/video_captioning/video_captioning_23_6.png", "./Continuous-temporal/video_captioning/video_captioning_23_7.png", "./Continuous-temporal/video_captioning/video_captioning_23_8.png", "./Continuous-temporal/video_captioning/video_captioning_23_9.png", "./Continuous-temporal/video_captioning/video_captioning_23_10.png", "./Continuous-temporal/video_captioning/video_captioning_23_11.png", "./Continuous-temporal/video_captioning/video_captioning_23_12.png", "./Continuous-temporal/video_captioning/video_captioning_23_13.png", "./Continuous-temporal/video_captioning/video_captioning_23_14.png", "./Continuous-temporal/video_captioning/video_captioning_23_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a girl using her smartphone\nB: a boy playing with a toy car\nC: a man cooking in the kitchen\nD: a woman reading a book", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a girl using her smartphone\nB: a boy playing with a toy car\nC: a man cooking in the kitchen\nD: a woman reading a book", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_24_0.png", "./Continuous-temporal/video_captioning/video_captioning_24_1.png", "./Continuous-temporal/video_captioning/video_captioning_24_2.png", "./Continuous-temporal/video_captioning/video_captioning_24_3.png", "./Continuous-temporal/video_captioning/video_captioning_24_4.png", "./Continuous-temporal/video_captioning/video_captioning_24_5.png", 
"./Continuous-temporal/video_captioning/video_captioning_24_6.png", "./Continuous-temporal/video_captioning/video_captioning_24_7.png", "./Continuous-temporal/video_captioning/video_captioning_24_8.png", "./Continuous-temporal/video_captioning/video_captioning_24_9.png", "./Continuous-temporal/video_captioning/video_captioning_24_10.png", "./Continuous-temporal/video_captioning/video_captioning_24_11.png", "./Continuous-temporal/video_captioning/video_captioning_24_12.png", "./Continuous-temporal/video_captioning/video_captioning_24_13.png", "./Continuous-temporal/video_captioning/video_captioning_24_14.png", "./Continuous-temporal/video_captioning/video_captioning_24_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man is driving\nB: a man is running\nC: a man is standing\nD: a man is shooting", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man is driving\nB: a man is running\nC: a man is standing\nD: a man is shooting", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_25_0.png", "./Continuous-temporal/video_captioning/video_captioning_25_1.png", "./Continuous-temporal/video_captioning/video_captioning_25_2.png", "./Continuous-temporal/video_captioning/video_captioning_25_3.png", "./Continuous-temporal/video_captioning/video_captioning_25_4.png", "./Continuous-temporal/video_captioning/video_captioning_25_5.png", "./Continuous-temporal/video_captioning/video_captioning_25_6.png", "./Continuous-temporal/video_captioning/video_captioning_25_7.png", "./Continuous-temporal/video_captioning/video_captioning_25_8.png", "./Continuous-temporal/video_captioning/video_captioning_25_9.png", "./Continuous-temporal/video_captioning/video_captioning_25_10.png", "./Continuous-temporal/video_captioning/video_captioning_25_11.png", 
"./Continuous-temporal/video_captioning/video_captioning_25_12.png", "./Continuous-temporal/video_captioning/video_captioning_25_13.png", "./Continuous-temporal/video_captioning/video_captioning_25_14.png", "./Continuous-temporal/video_captioning/video_captioning_25_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man clean another man s shirt\nB: a man folding a shirt\nC: a man ironing a shirt\nD: a man drying a shirt", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man clean another man s shirt\nB: a man folding a shirt\nC: a man ironing a shirt\nD: a man drying a shirt", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_26_0.png", "./Continuous-temporal/video_captioning/video_captioning_26_1.png", "./Continuous-temporal/video_captioning/video_captioning_26_2.png", "./Continuous-temporal/video_captioning/video_captioning_26_3.png", "./Continuous-temporal/video_captioning/video_captioning_26_4.png", "./Continuous-temporal/video_captioning/video_captioning_26_5.png", "./Continuous-temporal/video_captioning/video_captioning_26_6.png", "./Continuous-temporal/video_captioning/video_captioning_26_7.png", "./Continuous-temporal/video_captioning/video_captioning_26_8.png", "./Continuous-temporal/video_captioning/video_captioning_26_9.png", "./Continuous-temporal/video_captioning/video_captioning_26_10.png", "./Continuous-temporal/video_captioning/video_captioning_26_11.png", "./Continuous-temporal/video_captioning/video_captioning_26_12.png", "./Continuous-temporal/video_captioning/video_captioning_26_13.png", "./Continuous-temporal/video_captioning/video_captioning_26_14.png", "./Continuous-temporal/video_captioning/video_captioning_26_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", 
"visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a chef cooking a meal in the kitchen\nB: a child playing with toys\nC: a woman exercising in the park\nD: a man describing how to do something in windows", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a chef cooking a meal in the kitchen\nB: a child playing with toys\nC: a woman exercising in the park\nD: a man describing how to do something in windows", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_27_0.png", "./Continuous-temporal/video_captioning/video_captioning_27_1.png", "./Continuous-temporal/video_captioning/video_captioning_27_2.png", "./Continuous-temporal/video_captioning/video_captioning_27_3.png", "./Continuous-temporal/video_captioning/video_captioning_27_4.png", "./Continuous-temporal/video_captioning/video_captioning_27_5.png", "./Continuous-temporal/video_captioning/video_captioning_27_6.png", "./Continuous-temporal/video_captioning/video_captioning_27_7.png", "./Continuous-temporal/video_captioning/video_captioning_27_8.png", "./Continuous-temporal/video_captioning/video_captioning_27_9.png", "./Continuous-temporal/video_captioning/video_captioning_27_10.png", "./Continuous-temporal/video_captioning/video_captioning_27_11.png", "./Continuous-temporal/video_captioning/video_captioning_27_12.png", "./Continuous-temporal/video_captioning/video_captioning_27_13.png", "./Continuous-temporal/video_captioning/video_captioning_27_14.png", "./Continuous-temporal/video_captioning/video_captioning_27_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man sings while playing the guitar\nB: a woman dances in a ballet performance\nC: a child rides a bicycle in the park\nD: a woman explains playing violin", "question": "Please 
generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man sings while playing the guitar\nB: a woman dances in a ballet performance\nC: a child rides a bicycle in the park\nD: a woman explains playing violin", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_28_0.png", "./Continuous-temporal/video_captioning/video_captioning_28_1.png", "./Continuous-temporal/video_captioning/video_captioning_28_2.png", "./Continuous-temporal/video_captioning/video_captioning_28_3.png", "./Continuous-temporal/video_captioning/video_captioning_28_4.png", "./Continuous-temporal/video_captioning/video_captioning_28_5.png", "./Continuous-temporal/video_captioning/video_captioning_28_6.png", "./Continuous-temporal/video_captioning/video_captioning_28_7.png", "./Continuous-temporal/video_captioning/video_captioning_28_8.png", "./Continuous-temporal/video_captioning/video_captioning_28_9.png", "./Continuous-temporal/video_captioning/video_captioning_28_10.png", "./Continuous-temporal/video_captioning/video_captioning_28_11.png", "./Continuous-temporal/video_captioning/video_captioning_28_12.png", "./Continuous-temporal/video_captioning/video_captioning_28_13.png", "./Continuous-temporal/video_captioning/video_captioning_28_14.png", "./Continuous-temporal/video_captioning/video_captioning_28_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: two little girls playing outside\nB: a man sitting on a chair\nC: a dog running in the park\nD: one little girl is sleeping on bed", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: two little girls playing outside\nB: a man sitting on a chair\nC: a dog running in the park\nD: one little girl is sleeping on bed", "input_image_path": 
["./Continuous-temporal/video_captioning/video_captioning_29_0.png", "./Continuous-temporal/video_captioning/video_captioning_29_1.png", "./Continuous-temporal/video_captioning/video_captioning_29_2.png", "./Continuous-temporal/video_captioning/video_captioning_29_3.png", "./Continuous-temporal/video_captioning/video_captioning_29_4.png", "./Continuous-temporal/video_captioning/video_captioning_29_5.png", "./Continuous-temporal/video_captioning/video_captioning_29_6.png", "./Continuous-temporal/video_captioning/video_captioning_29_7.png", "./Continuous-temporal/video_captioning/video_captioning_29_8.png", "./Continuous-temporal/video_captioning/video_captioning_29_9.png", "./Continuous-temporal/video_captioning/video_captioning_29_10.png", "./Continuous-temporal/video_captioning/video_captioning_29_11.png", "./Continuous-temporal/video_captioning/video_captioning_29_12.png", "./Continuous-temporal/video_captioning/video_captioning_29_13.png", "./Continuous-temporal/video_captioning/video_captioning_29_14.png", "./Continuous-temporal/video_captioning/video_captioning_29_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a tv channel named how to cook great foodcom is telling how to prepare a dish\nB: a cooking show featuring recipes from famous chefs\nC: a baking tutorial on a popular cooking website\nD: a video showing the steps to create a delicious meal", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a tv channel named how to cook great foodcom is telling how to prepare a dish\nB: a cooking show featuring recipes from famous chefs\nC: a baking tutorial on a popular cooking website\nD: a video showing the steps to create a delicious meal", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_30_0.png", 
"./Continuous-temporal/video_captioning/video_captioning_30_1.png", "./Continuous-temporal/video_captioning/video_captioning_30_2.png", "./Continuous-temporal/video_captioning/video_captioning_30_3.png", "./Continuous-temporal/video_captioning/video_captioning_30_4.png", "./Continuous-temporal/video_captioning/video_captioning_30_5.png", "./Continuous-temporal/video_captioning/video_captioning_30_6.png", "./Continuous-temporal/video_captioning/video_captioning_30_7.png", "./Continuous-temporal/video_captioning/video_captioning_30_8.png", "./Continuous-temporal/video_captioning/video_captioning_30_9.png", "./Continuous-temporal/video_captioning/video_captioning_30_10.png", "./Continuous-temporal/video_captioning/video_captioning_30_11.png", "./Continuous-temporal/video_captioning/video_captioning_30_12.png", "./Continuous-temporal/video_captioning/video_captioning_30_13.png", "./Continuous-temporal/video_captioning/video_captioning_30_14.png", "./Continuous-temporal/video_captioning/video_captioning_30_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a child plays with a toy robot\nB: a man talks about dna force\nC: an old man reads a newspaper\nD: a woman discusses environmental conservation", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a child plays with a toy robot\nB: a man talks about dna force\nC: an old man reads a newspaper\nD: a woman discusses environmental conservation", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_31_0.png", "./Continuous-temporal/video_captioning/video_captioning_31_1.png", "./Continuous-temporal/video_captioning/video_captioning_31_2.png", "./Continuous-temporal/video_captioning/video_captioning_31_3.png", "./Continuous-temporal/video_captioning/video_captioning_31_4.png", 
"./Continuous-temporal/video_captioning/video_captioning_31_5.png", "./Continuous-temporal/video_captioning/video_captioning_31_6.png", "./Continuous-temporal/video_captioning/video_captioning_31_7.png", "./Continuous-temporal/video_captioning/video_captioning_31_8.png", "./Continuous-temporal/video_captioning/video_captioning_31_9.png", "./Continuous-temporal/video_captioning/video_captioning_31_10.png", "./Continuous-temporal/video_captioning/video_captioning_31_11.png", "./Continuous-temporal/video_captioning/video_captioning_31_12.png", "./Continuous-temporal/video_captioning/video_captioning_31_13.png", "./Continuous-temporal/video_captioning/video_captioning_31_14.png", "./Continuous-temporal/video_captioning/video_captioning_31_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a boy is playing the violin\nB: a girl is painting a picture\nC: a man is cooking in the kitchen\nD: a woman is riding a bicycle", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a boy is playing the violin\nB: a girl is painting a picture\nC: a man is cooking in the kitchen\nD: a woman is riding a bicycle", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_32_0.png", "./Continuous-temporal/video_captioning/video_captioning_32_1.png", "./Continuous-temporal/video_captioning/video_captioning_32_2.png", "./Continuous-temporal/video_captioning/video_captioning_32_3.png", "./Continuous-temporal/video_captioning/video_captioning_32_4.png", "./Continuous-temporal/video_captioning/video_captioning_32_5.png", "./Continuous-temporal/video_captioning/video_captioning_32_6.png", "./Continuous-temporal/video_captioning/video_captioning_32_7.png", "./Continuous-temporal/video_captioning/video_captioning_32_8.png", 
"./Continuous-temporal/video_captioning/video_captioning_32_9.png", "./Continuous-temporal/video_captioning/video_captioning_32_10.png", "./Continuous-temporal/video_captioning/video_captioning_32_11.png", "./Continuous-temporal/video_captioning/video_captioning_32_12.png", "./Continuous-temporal/video_captioning/video_captioning_32_13.png", "./Continuous-temporal/video_captioning/video_captioning_32_14.png", "./Continuous-temporal/video_captioning/video_captioning_32_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man is riding the brown horse\nB: a person is taking pictures of the white horse\nC: the brown horse is playing with a child\nD: person is recording the brown horse which is having fun", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man is riding the brown horse\nB: a person is taking pictures of the white horse\nC: the brown horse is playing with a child\nD: person is recording the brown horse which is having fun", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_33_0.png", "./Continuous-temporal/video_captioning/video_captioning_33_1.png", "./Continuous-temporal/video_captioning/video_captioning_33_2.png", "./Continuous-temporal/video_captioning/video_captioning_33_3.png", "./Continuous-temporal/video_captioning/video_captioning_33_4.png", "./Continuous-temporal/video_captioning/video_captioning_33_5.png", "./Continuous-temporal/video_captioning/video_captioning_33_6.png", "./Continuous-temporal/video_captioning/video_captioning_33_7.png", "./Continuous-temporal/video_captioning/video_captioning_33_8.png", "./Continuous-temporal/video_captioning/video_captioning_33_9.png", "./Continuous-temporal/video_captioning/video_captioning_33_10.png", "./Continuous-temporal/video_captioning/video_captioning_33_11.png", 
"./Continuous-temporal/video_captioning/video_captioning_33_12.png", "./Continuous-temporal/video_captioning/video_captioning_33_13.png", "./Continuous-temporal/video_captioning/video_captioning_33_14.png", "./Continuous-temporal/video_captioning/video_captioning_33_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a boy in green jacket talking about sofa cover\nB: a man in blue shirt playing with the pillow\nC: a woman in red dress explaining about cushion seat\nD: a girl in yellow dress sitting on the couch", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a boy in green jacket talking about sofa cover\nB: a man in blue shirt playing with the pillow\nC: a woman in red dress explaining about cushion seat\nD: a girl in yellow dress sitting on the couch", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_34_0.png", "./Continuous-temporal/video_captioning/video_captioning_34_1.png", "./Continuous-temporal/video_captioning/video_captioning_34_2.png", "./Continuous-temporal/video_captioning/video_captioning_34_3.png", "./Continuous-temporal/video_captioning/video_captioning_34_4.png", "./Continuous-temporal/video_captioning/video_captioning_34_5.png", "./Continuous-temporal/video_captioning/video_captioning_34_6.png", "./Continuous-temporal/video_captioning/video_captioning_34_7.png", "./Continuous-temporal/video_captioning/video_captioning_34_8.png", "./Continuous-temporal/video_captioning/video_captioning_34_9.png", "./Continuous-temporal/video_captioning/video_captioning_34_10.png", "./Continuous-temporal/video_captioning/video_captioning_34_11.png", "./Continuous-temporal/video_captioning/video_captioning_34_12.png", "./Continuous-temporal/video_captioning/video_captioning_34_13.png", 
"./Continuous-temporal/video_captioning/video_captioning_34_14.png", "./Continuous-temporal/video_captioning/video_captioning_34_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a bird is chirping at the wall\nB: a dog is barking at the wall\nC: a cat is sleeping on the wall\nD: a cat is meowing at the wall", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a bird is chirping at the wall\nB: a dog is barking at the wall\nC: a cat is sleeping on the wall\nD: a cat is meowing at the wall", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_35_0.png", "./Continuous-temporal/video_captioning/video_captioning_35_1.png", "./Continuous-temporal/video_captioning/video_captioning_35_2.png", "./Continuous-temporal/video_captioning/video_captioning_35_3.png", "./Continuous-temporal/video_captioning/video_captioning_35_4.png", "./Continuous-temporal/video_captioning/video_captioning_35_5.png", "./Continuous-temporal/video_captioning/video_captioning_35_6.png", "./Continuous-temporal/video_captioning/video_captioning_35_7.png", "./Continuous-temporal/video_captioning/video_captioning_35_8.png", "./Continuous-temporal/video_captioning/video_captioning_35_9.png", "./Continuous-temporal/video_captioning/video_captioning_35_10.png", "./Continuous-temporal/video_captioning/video_captioning_35_11.png", "./Continuous-temporal/video_captioning/video_captioning_35_12.png", "./Continuous-temporal/video_captioning/video_captioning_35_13.png", "./Continuous-temporal/video_captioning/video_captioning_35_14.png", "./Continuous-temporal/video_captioning/video_captioning_35_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a group of 
people dancing in a park\nB: a chef cooking in a restaurant kitchen\nC: someone speaking about a violent act regarding the police\nD: a cat sleeping on a windowsill", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a group of people dancing in a park\nB: a chef cooking in a restaurant kitchen\nC: someone speaking about a violent act regarding the police\nD: a cat sleeping on a windowsill", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_36_0.png", "./Continuous-temporal/video_captioning/video_captioning_36_1.png", "./Continuous-temporal/video_captioning/video_captioning_36_2.png", "./Continuous-temporal/video_captioning/video_captioning_36_3.png", "./Continuous-temporal/video_captioning/video_captioning_36_4.png", "./Continuous-temporal/video_captioning/video_captioning_36_5.png", "./Continuous-temporal/video_captioning/video_captioning_36_6.png", "./Continuous-temporal/video_captioning/video_captioning_36_7.png", "./Continuous-temporal/video_captioning/video_captioning_36_8.png", "./Continuous-temporal/video_captioning/video_captioning_36_9.png", "./Continuous-temporal/video_captioning/video_captioning_36_10.png", "./Continuous-temporal/video_captioning/video_captioning_36_11.png", "./Continuous-temporal/video_captioning/video_captioning_36_12.png", "./Continuous-temporal/video_captioning/video_captioning_36_13.png", "./Continuous-temporal/video_captioning/video_captioning_36_14.png", "./Continuous-temporal/video_captioning/video_captioning_36_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man is playing a large guitar on the stage\nB: a woman is palying a small guitar or a ukulele on the street\nC: a person is sitting and holding a violin in the concert\nD: a girl is dancing with a small guitar in the park", "question": 
"Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man is playing a large guitar on the stage\nB: a woman is palying a small guitar or a ukulele on the street\nC: a person is sitting and holding a violin in the concert\nD: a girl is dancing with a small guitar in the park", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_37_0.png", "./Continuous-temporal/video_captioning/video_captioning_37_1.png", "./Continuous-temporal/video_captioning/video_captioning_37_2.png", "./Continuous-temporal/video_captioning/video_captioning_37_3.png", "./Continuous-temporal/video_captioning/video_captioning_37_4.png", "./Continuous-temporal/video_captioning/video_captioning_37_5.png", "./Continuous-temporal/video_captioning/video_captioning_37_6.png", "./Continuous-temporal/video_captioning/video_captioning_37_7.png", "./Continuous-temporal/video_captioning/video_captioning_37_8.png", "./Continuous-temporal/video_captioning/video_captioning_37_9.png", "./Continuous-temporal/video_captioning/video_captioning_37_10.png", "./Continuous-temporal/video_captioning/video_captioning_37_11.png", "./Continuous-temporal/video_captioning/video_captioning_37_12.png", "./Continuous-temporal/video_captioning/video_captioning_37_13.png", "./Continuous-temporal/video_captioning/video_captioning_37_14.png", "./Continuous-temporal/video_captioning/video_captioning_37_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a peaceful sunset at the beach\nB: a cute puppy playing in the park\nC: a fatality from mortal kombat is shown\nD: a group of friends laughing and having fun", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a peaceful sunset at the beach\nB: a cute puppy playing in the park\nC: a 
fatality from mortal kombat is shown\nD: a group of friends laughing and having fun", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_38_0.png", "./Continuous-temporal/video_captioning/video_captioning_38_1.png", "./Continuous-temporal/video_captioning/video_captioning_38_2.png", "./Continuous-temporal/video_captioning/video_captioning_38_3.png", "./Continuous-temporal/video_captioning/video_captioning_38_4.png", "./Continuous-temporal/video_captioning/video_captioning_38_5.png", "./Continuous-temporal/video_captioning/video_captioning_38_6.png", "./Continuous-temporal/video_captioning/video_captioning_38_7.png", "./Continuous-temporal/video_captioning/video_captioning_38_8.png", "./Continuous-temporal/video_captioning/video_captioning_38_9.png", "./Continuous-temporal/video_captioning/video_captioning_38_10.png", "./Continuous-temporal/video_captioning/video_captioning_38_11.png", "./Continuous-temporal/video_captioning/video_captioning_38_12.png", "./Continuous-temporal/video_captioning/video_captioning_38_13.png", "./Continuous-temporal/video_captioning/video_captioning_38_14.png", "./Continuous-temporal/video_captioning/video_captioning_38_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the boy is riding the cycle\nB: the boy is driving a car\nC: the boy is walking the dog\nD: the girl is riding the cycle", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the boy is riding the cycle\nB: the boy is driving a car\nC: the boy is walking the dog\nD: the girl is riding the cycle", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_39_0.png", "./Continuous-temporal/video_captioning/video_captioning_39_1.png", "./Continuous-temporal/video_captioning/video_captioning_39_2.png", 
"./Continuous-temporal/video_captioning/video_captioning_39_3.png", "./Continuous-temporal/video_captioning/video_captioning_39_4.png", "./Continuous-temporal/video_captioning/video_captioning_39_5.png", "./Continuous-temporal/video_captioning/video_captioning_39_6.png", "./Continuous-temporal/video_captioning/video_captioning_39_7.png", "./Continuous-temporal/video_captioning/video_captioning_39_8.png", "./Continuous-temporal/video_captioning/video_captioning_39_9.png", "./Continuous-temporal/video_captioning/video_captioning_39_10.png", "./Continuous-temporal/video_captioning/video_captioning_39_11.png", "./Continuous-temporal/video_captioning/video_captioning_39_12.png", "./Continuous-temporal/video_captioning/video_captioning_39_13.png", "./Continuous-temporal/video_captioning/video_captioning_39_14.png", "./Continuous-temporal/video_captioning/video_captioning_39_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: people talking about their trip and how they are taken care of\nB: people sitting silently and looking bored\nC: a person alone and lost in a foreign land\nD: a group of people arguing and fighting", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: people talking about their trip and how they are taken care of\nB: people sitting silently and looking bored\nC: a person alone and lost in a foreign land\nD: a group of people arguing and fighting", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_40_0.png", "./Continuous-temporal/video_captioning/video_captioning_40_1.png", "./Continuous-temporal/video_captioning/video_captioning_40_2.png", "./Continuous-temporal/video_captioning/video_captioning_40_3.png", "./Continuous-temporal/video_captioning/video_captioning_40_4.png", 
"./Continuous-temporal/video_captioning/video_captioning_40_5.png", "./Continuous-temporal/video_captioning/video_captioning_40_6.png", "./Continuous-temporal/video_captioning/video_captioning_40_7.png", "./Continuous-temporal/video_captioning/video_captioning_40_8.png", "./Continuous-temporal/video_captioning/video_captioning_40_9.png", "./Continuous-temporal/video_captioning/video_captioning_40_10.png", "./Continuous-temporal/video_captioning/video_captioning_40_11.png", "./Continuous-temporal/video_captioning/video_captioning_40_12.png", "./Continuous-temporal/video_captioning/video_captioning_40_13.png", "./Continuous-temporal/video_captioning/video_captioning_40_14.png", "./Continuous-temporal/video_captioning/video_captioning_40_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man and a woman painting a picture\nB: a man and a woman playing basketball\nC: a man and a woman dancing at a party\nD: a man a woman cooking on a cooking show", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man and a woman painting a picture\nB: a man and a woman playing basketball\nC: a man and a woman dancing at a party\nD: a man a woman cooking on a cooking show", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_41_0.png", "./Continuous-temporal/video_captioning/video_captioning_41_1.png", "./Continuous-temporal/video_captioning/video_captioning_41_2.png", "./Continuous-temporal/video_captioning/video_captioning_41_3.png", "./Continuous-temporal/video_captioning/video_captioning_41_4.png", "./Continuous-temporal/video_captioning/video_captioning_41_5.png", "./Continuous-temporal/video_captioning/video_captioning_41_6.png", "./Continuous-temporal/video_captioning/video_captioning_41_7.png", 
"./Continuous-temporal/video_captioning/video_captioning_41_8.png", "./Continuous-temporal/video_captioning/video_captioning_41_9.png", "./Continuous-temporal/video_captioning/video_captioning_41_10.png", "./Continuous-temporal/video_captioning/video_captioning_41_11.png", "./Continuous-temporal/video_captioning/video_captioning_41_12.png", "./Continuous-temporal/video_captioning/video_captioning_41_13.png", "./Continuous-temporal/video_captioning/video_captioning_41_14.png", "./Continuous-temporal/video_captioning/video_captioning_41_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman shopping for groceries in a supermarket\nB: a group of friends having a picnic in a park\nC: a man explains the condition of someone in the hospital to the press outside of a building\nD: a child playing in a playground", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman shopping for groceries in a supermarket\nB: a group of friends having a picnic in a park\nC: a man explains the condition of someone in the hospital to the press outside of a building\nD: a child playing in a playground", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_42_0.png", "./Continuous-temporal/video_captioning/video_captioning_42_1.png", "./Continuous-temporal/video_captioning/video_captioning_42_2.png", "./Continuous-temporal/video_captioning/video_captioning_42_3.png", "./Continuous-temporal/video_captioning/video_captioning_42_4.png", "./Continuous-temporal/video_captioning/video_captioning_42_5.png", "./Continuous-temporal/video_captioning/video_captioning_42_6.png", "./Continuous-temporal/video_captioning/video_captioning_42_7.png", "./Continuous-temporal/video_captioning/video_captioning_42_8.png", 
"./Continuous-temporal/video_captioning/video_captioning_42_9.png", "./Continuous-temporal/video_captioning/video_captioning_42_10.png", "./Continuous-temporal/video_captioning/video_captioning_42_11.png", "./Continuous-temporal/video_captioning/video_captioning_42_12.png", "./Continuous-temporal/video_captioning/video_captioning_42_13.png", "./Continuous-temporal/video_captioning/video_captioning_42_14.png", "./Continuous-temporal/video_captioning/video_captioning_42_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the beautiful scene on the screen\nB: an unattractive scene on the screen\nC: a dull landscape on the screen\nD: a boring view on the screen", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the beautiful scene on the screen\nB: an unattractive scene on the screen\nC: a dull landscape on the screen\nD: a boring view on the screen", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_43_0.png", "./Continuous-temporal/video_captioning/video_captioning_43_1.png", "./Continuous-temporal/video_captioning/video_captioning_43_2.png", "./Continuous-temporal/video_captioning/video_captioning_43_3.png", "./Continuous-temporal/video_captioning/video_captioning_43_4.png", "./Continuous-temporal/video_captioning/video_captioning_43_5.png", "./Continuous-temporal/video_captioning/video_captioning_43_6.png", "./Continuous-temporal/video_captioning/video_captioning_43_7.png", "./Continuous-temporal/video_captioning/video_captioning_43_8.png", "./Continuous-temporal/video_captioning/video_captioning_43_9.png", "./Continuous-temporal/video_captioning/video_captioning_43_10.png", "./Continuous-temporal/video_captioning/video_captioning_43_11.png", "./Continuous-temporal/video_captioning/video_captioning_43_12.png", 
"./Continuous-temporal/video_captioning/video_captioning_43_13.png", "./Continuous-temporal/video_captioning/video_captioning_43_14.png", "./Continuous-temporal/video_captioning/video_captioning_43_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a thin guy with a tie is looking at a woman\nB: a fat guy with a tie is looking at a man\nC: an overweight man with a bowtie is staring at a person\nD: a chubby man with a necktie is observing a gentleman", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a thin guy with a tie is looking at a woman\nB: a fat guy with a tie is looking at a man\nC: an overweight man with a bowtie is staring at a person\nD: a chubby man with a necktie is observing a gentleman", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_44_0.png", "./Continuous-temporal/video_captioning/video_captioning_44_1.png", "./Continuous-temporal/video_captioning/video_captioning_44_2.png", "./Continuous-temporal/video_captioning/video_captioning_44_3.png", "./Continuous-temporal/video_captioning/video_captioning_44_4.png", "./Continuous-temporal/video_captioning/video_captioning_44_5.png", "./Continuous-temporal/video_captioning/video_captioning_44_6.png", "./Continuous-temporal/video_captioning/video_captioning_44_7.png", "./Continuous-temporal/video_captioning/video_captioning_44_8.png", "./Continuous-temporal/video_captioning/video_captioning_44_9.png", "./Continuous-temporal/video_captioning/video_captioning_44_10.png", "./Continuous-temporal/video_captioning/video_captioning_44_11.png", "./Continuous-temporal/video_captioning/video_captioning_44_12.png", "./Continuous-temporal/video_captioning/video_captioning_44_13.png", "./Continuous-temporal/video_captioning/video_captioning_44_14.png", 
"./Continuous-temporal/video_captioning/video_captioning_44_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a cat is taking a nap\nB: a dog is fetching a ball\nC: a man is riding a bicycle\nD: a woman is showing nail polish", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a cat is taking a nap\nB: a dog is fetching a ball\nC: a man is riding a bicycle\nD: a woman is showing nail polish", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_45_0.png", "./Continuous-temporal/video_captioning/video_captioning_45_1.png", "./Continuous-temporal/video_captioning/video_captioning_45_2.png", "./Continuous-temporal/video_captioning/video_captioning_45_3.png", "./Continuous-temporal/video_captioning/video_captioning_45_4.png", "./Continuous-temporal/video_captioning/video_captioning_45_5.png", "./Continuous-temporal/video_captioning/video_captioning_45_6.png", "./Continuous-temporal/video_captioning/video_captioning_45_7.png", "./Continuous-temporal/video_captioning/video_captioning_45_8.png", "./Continuous-temporal/video_captioning/video_captioning_45_9.png", "./Continuous-temporal/video_captioning/video_captioning_45_10.png", "./Continuous-temporal/video_captioning/video_captioning_45_11.png", "./Continuous-temporal/video_captioning/video_captioning_45_12.png", "./Continuous-temporal/video_captioning/video_captioning_45_13.png", "./Continuous-temporal/video_captioning/video_captioning_45_14.png", "./Continuous-temporal/video_captioning/video_captioning_45_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a child wearing a yellow t-shirt running in the park\nB: a man in a blue shirt standing in front of the 
shelves\nC: a woman in a red dress sitting at the table\nD: the woman in the purple blouse talk as the shelves are behind her", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a child wearing a yellow t-shirt running in the park\nB: a man in a blue shirt standing in front of the shelves\nC: a woman in a red dress sitting at the table\nD: the woman in the purple blouse talk as the shelves are behind her", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_46_0.png", "./Continuous-temporal/video_captioning/video_captioning_46_1.png", "./Continuous-temporal/video_captioning/video_captioning_46_2.png", "./Continuous-temporal/video_captioning/video_captioning_46_3.png", "./Continuous-temporal/video_captioning/video_captioning_46_4.png", "./Continuous-temporal/video_captioning/video_captioning_46_5.png", "./Continuous-temporal/video_captioning/video_captioning_46_6.png", "./Continuous-temporal/video_captioning/video_captioning_46_7.png", "./Continuous-temporal/video_captioning/video_captioning_46_8.png", "./Continuous-temporal/video_captioning/video_captioning_46_9.png", "./Continuous-temporal/video_captioning/video_captioning_46_10.png", "./Continuous-temporal/video_captioning/video_captioning_46_11.png", "./Continuous-temporal/video_captioning/video_captioning_46_12.png", "./Continuous-temporal/video_captioning/video_captioning_46_13.png", "./Continuous-temporal/video_captioning/video_captioning_46_14.png", "./Continuous-temporal/video_captioning/video_captioning_46_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: an old man shakes hands with another man and then they hug each other\nB: a young woman dances alone in a dark room\nC: a person cooks food in a kitchen\nD: a group of children play soccer in a field", "question": "Please 
generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: an old man shakes hands with another man and then they hug each other\nB: a young woman dances alone in a dark room\nC: a person cooks food in a kitchen\nD: a group of children play soccer in a field", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_47_0.png", "./Continuous-temporal/video_captioning/video_captioning_47_1.png", "./Continuous-temporal/video_captioning/video_captioning_47_2.png", "./Continuous-temporal/video_captioning/video_captioning_47_3.png", "./Continuous-temporal/video_captioning/video_captioning_47_4.png", "./Continuous-temporal/video_captioning/video_captioning_47_5.png", "./Continuous-temporal/video_captioning/video_captioning_47_6.png", "./Continuous-temporal/video_captioning/video_captioning_47_7.png", "./Continuous-temporal/video_captioning/video_captioning_47_8.png", "./Continuous-temporal/video_captioning/video_captioning_47_9.png", "./Continuous-temporal/video_captioning/video_captioning_47_10.png", "./Continuous-temporal/video_captioning/video_captioning_47_11.png", "./Continuous-temporal/video_captioning/video_captioning_47_12.png", "./Continuous-temporal/video_captioning/video_captioning_47_13.png", "./Continuous-temporal/video_captioning/video_captioning_47_14.png", "./Continuous-temporal/video_captioning/video_captioning_47_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: two different men pretending to be one person\nB: woman pretends to be two different people\nC: man pretends to be two different people\nD: man changes his appearance", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: two different men pretending to be one person\nB: woman pretends to be two different people\nC: 
man pretends to be two different people\nD: man changes his appearance", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_48_0.png", "./Continuous-temporal/video_captioning/video_captioning_48_1.png", "./Continuous-temporal/video_captioning/video_captioning_48_2.png", "./Continuous-temporal/video_captioning/video_captioning_48_3.png", "./Continuous-temporal/video_captioning/video_captioning_48_4.png", "./Continuous-temporal/video_captioning/video_captioning_48_5.png", "./Continuous-temporal/video_captioning/video_captioning_48_6.png", "./Continuous-temporal/video_captioning/video_captioning_48_7.png", "./Continuous-temporal/video_captioning/video_captioning_48_8.png", "./Continuous-temporal/video_captioning/video_captioning_48_9.png", "./Continuous-temporal/video_captioning/video_captioning_48_10.png", "./Continuous-temporal/video_captioning/video_captioning_48_11.png", "./Continuous-temporal/video_captioning/video_captioning_48_12.png", "./Continuous-temporal/video_captioning/video_captioning_48_13.png", "./Continuous-temporal/video_captioning/video_captioning_48_14.png", "./Continuous-temporal/video_captioning/video_captioning_48_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: everyone can strive for mediocrity acting drums or planets\nB: some people may just be average playing pianos or moons\nC: nobody can reach excellence dancing violins or galaxies\nD: we can all be overachievers playing guitars or stars", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: everyone can strive for mediocrity acting drums or planets\nB: some people may just be average playing pianos or moons\nC: nobody can reach excellence dancing violins or galaxies\nD: we can all be overachievers playing guitars or stars", "input_image_path": 
["./Continuous-temporal/video_captioning/video_captioning_49_0.png", "./Continuous-temporal/video_captioning/video_captioning_49_1.png", "./Continuous-temporal/video_captioning/video_captioning_49_2.png", "./Continuous-temporal/video_captioning/video_captioning_49_3.png", "./Continuous-temporal/video_captioning/video_captioning_49_4.png", "./Continuous-temporal/video_captioning/video_captioning_49_5.png", "./Continuous-temporal/video_captioning/video_captioning_49_6.png", "./Continuous-temporal/video_captioning/video_captioning_49_7.png", "./Continuous-temporal/video_captioning/video_captioning_49_8.png", "./Continuous-temporal/video_captioning/video_captioning_49_9.png", "./Continuous-temporal/video_captioning/video_captioning_49_10.png", "./Continuous-temporal/video_captioning/video_captioning_49_11.png", "./Continuous-temporal/video_captioning/video_captioning_49_12.png", "./Continuous-temporal/video_captioning/video_captioning_49_13.png", "./Continuous-temporal/video_captioning/video_captioning_49_14.png", "./Continuous-temporal/video_captioning/video_captioning_49_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a group of cartoon faces are singing and dancing\nB: a bunch of cartoon faces are chomping their teeth and making eating gestures\nC: a collection of cartoon faces are laughing and clapping\nD: several cartoon faces are looking sad and crying", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a group of cartoon faces are singing and dancing\nB: a bunch of cartoon faces are chomping their teeth and making eating gestures\nC: a collection of cartoon faces are laughing and clapping\nD: several cartoon faces are looking sad and crying", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_50_0.png", 
"./Continuous-temporal/video_captioning/video_captioning_50_1.png", "./Continuous-temporal/video_captioning/video_captioning_50_2.png", "./Continuous-temporal/video_captioning/video_captioning_50_3.png", "./Continuous-temporal/video_captioning/video_captioning_50_4.png", "./Continuous-temporal/video_captioning/video_captioning_50_5.png", "./Continuous-temporal/video_captioning/video_captioning_50_6.png", "./Continuous-temporal/video_captioning/video_captioning_50_7.png", "./Continuous-temporal/video_captioning/video_captioning_50_8.png", "./Continuous-temporal/video_captioning/video_captioning_50_9.png", "./Continuous-temporal/video_captioning/video_captioning_50_10.png", "./Continuous-temporal/video_captioning/video_captioning_50_11.png", "./Continuous-temporal/video_captioning/video_captioning_50_12.png", "./Continuous-temporal/video_captioning/video_captioning_50_13.png", "./Continuous-temporal/video_captioning/video_captioning_50_14.png", "./Continuous-temporal/video_captioning/video_captioning_50_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: an elderly man is gardening\nB: a little girl is dancing\nC: a little boy is playing basketball\nD: two cats are fighting", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: an elderly man is gardening\nB: a little girl is dancing\nC: a little boy is playing basketball\nD: two cats are fighting", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_51_0.png", "./Continuous-temporal/video_captioning/video_captioning_51_1.png", "./Continuous-temporal/video_captioning/video_captioning_51_2.png", "./Continuous-temporal/video_captioning/video_captioning_51_3.png", "./Continuous-temporal/video_captioning/video_captioning_51_4.png", 
"./Continuous-temporal/video_captioning/video_captioning_51_5.png", "./Continuous-temporal/video_captioning/video_captioning_51_6.png", "./Continuous-temporal/video_captioning/video_captioning_51_7.png", "./Continuous-temporal/video_captioning/video_captioning_51_8.png", "./Continuous-temporal/video_captioning/video_captioning_51_9.png", "./Continuous-temporal/video_captioning/video_captioning_51_10.png", "./Continuous-temporal/video_captioning/video_captioning_51_11.png", "./Continuous-temporal/video_captioning/video_captioning_51_12.png", "./Continuous-temporal/video_captioning/video_captioning_51_13.png", "./Continuous-temporal/video_captioning/video_captioning_51_14.png", "./Continuous-temporal/video_captioning/video_captioning_51_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman and man are talking with each other\nB: a woman and man are laughing with each other\nC: a woman and man are staring at each other\nD: a woman and man are arguing with each other", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman and man are talking with each other\nB: a woman and man are laughing with each other\nC: a woman and man are staring at each other\nD: a woman and man are arguing with each other", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_52_0.png", "./Continuous-temporal/video_captioning/video_captioning_52_1.png", "./Continuous-temporal/video_captioning/video_captioning_52_2.png", "./Continuous-temporal/video_captioning/video_captioning_52_3.png", "./Continuous-temporal/video_captioning/video_captioning_52_4.png", "./Continuous-temporal/video_captioning/video_captioning_52_5.png", "./Continuous-temporal/video_captioning/video_captioning_52_6.png", "./Continuous-temporal/video_captioning/video_captioning_52_7.png", 
"./Continuous-temporal/video_captioning/video_captioning_52_8.png", "./Continuous-temporal/video_captioning/video_captioning_52_9.png", "./Continuous-temporal/video_captioning/video_captioning_52_10.png", "./Continuous-temporal/video_captioning/video_captioning_52_11.png", "./Continuous-temporal/video_captioning/video_captioning_52_12.png", "./Continuous-temporal/video_captioning/video_captioning_52_13.png", "./Continuous-temporal/video_captioning/video_captioning_52_14.png", "./Continuous-temporal/video_captioning/video_captioning_52_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a monkey is eating a banana\nB: a dog is sniffing a baby duck\nC: a bird is swimming in the water\nD: a cat is playing with a ball", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a monkey is eating a banana\nB: a dog is sniffing a baby duck\nC: a bird is swimming in the water\nD: a cat is playing with a ball", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_53_0.png", "./Continuous-temporal/video_captioning/video_captioning_53_1.png", "./Continuous-temporal/video_captioning/video_captioning_53_2.png", "./Continuous-temporal/video_captioning/video_captioning_53_3.png", "./Continuous-temporal/video_captioning/video_captioning_53_4.png", "./Continuous-temporal/video_captioning/video_captioning_53_5.png", "./Continuous-temporal/video_captioning/video_captioning_53_6.png", "./Continuous-temporal/video_captioning/video_captioning_53_7.png", "./Continuous-temporal/video_captioning/video_captioning_53_8.png", "./Continuous-temporal/video_captioning/video_captioning_53_9.png", "./Continuous-temporal/video_captioning/video_captioning_53_10.png", "./Continuous-temporal/video_captioning/video_captioning_53_11.png", 
"./Continuous-temporal/video_captioning/video_captioning_53_12.png", "./Continuous-temporal/video_captioning/video_captioning_53_13.png", "./Continuous-temporal/video_captioning/video_captioning_53_14.png", "./Continuous-temporal/video_captioning/video_captioning_53_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a person dancing at a party\nB: a person giving his opinion on how crowded the world is\nC: a person painting a landscape\nD: a person cooking in the kitchen", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a person dancing at a party\nB: a person giving his opinion on how crowded the world is\nC: a person painting a landscape\nD: a person cooking in the kitchen", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_54_0.png", "./Continuous-temporal/video_captioning/video_captioning_54_1.png", "./Continuous-temporal/video_captioning/video_captioning_54_2.png", "./Continuous-temporal/video_captioning/video_captioning_54_3.png", "./Continuous-temporal/video_captioning/video_captioning_54_4.png", "./Continuous-temporal/video_captioning/video_captioning_54_5.png", "./Continuous-temporal/video_captioning/video_captioning_54_6.png", "./Continuous-temporal/video_captioning/video_captioning_54_7.png", "./Continuous-temporal/video_captioning/video_captioning_54_8.png", "./Continuous-temporal/video_captioning/video_captioning_54_9.png", "./Continuous-temporal/video_captioning/video_captioning_54_10.png", "./Continuous-temporal/video_captioning/video_captioning_54_11.png", "./Continuous-temporal/video_captioning/video_captioning_54_12.png", "./Continuous-temporal/video_captioning/video_captioning_54_13.png", "./Continuous-temporal/video_captioning/video_captioning_54_14.png", 
"./Continuous-temporal/video_captioning/video_captioning_54_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: someone tore a triangular red paper\nB: someone unfolded a round blue paper\nC: someone folded a square yellow paper\nD: someone crumpled a rectangular green paper", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: someone tore a triangular red paper\nB: someone unfolded a round blue paper\nC: someone folded a square yellow paper\nD: someone crumpled a rectangular green paper", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_55_0.png", "./Continuous-temporal/video_captioning/video_captioning_55_1.png", "./Continuous-temporal/video_captioning/video_captioning_55_2.png", "./Continuous-temporal/video_captioning/video_captioning_55_3.png", "./Continuous-temporal/video_captioning/video_captioning_55_4.png", "./Continuous-temporal/video_captioning/video_captioning_55_5.png", "./Continuous-temporal/video_captioning/video_captioning_55_6.png", "./Continuous-temporal/video_captioning/video_captioning_55_7.png", "./Continuous-temporal/video_captioning/video_captioning_55_8.png", "./Continuous-temporal/video_captioning/video_captioning_55_9.png", "./Continuous-temporal/video_captioning/video_captioning_55_10.png", "./Continuous-temporal/video_captioning/video_captioning_55_11.png", "./Continuous-temporal/video_captioning/video_captioning_55_12.png", "./Continuous-temporal/video_captioning/video_captioning_55_13.png", "./Continuous-temporal/video_captioning/video_captioning_55_14.png", "./Continuous-temporal/video_captioning/video_captioning_55_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the man is playing 
the guitar\nB: a woman is dancing\nC: a child is riding a bicycle\nD: a cat is playing the piano", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the man is playing the guitar\nB: a woman is dancing\nC: a child is riding a bicycle\nD: a cat is playing the piano", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_56_0.png", "./Continuous-temporal/video_captioning/video_captioning_56_1.png", "./Continuous-temporal/video_captioning/video_captioning_56_2.png", "./Continuous-temporal/video_captioning/video_captioning_56_3.png", "./Continuous-temporal/video_captioning/video_captioning_56_4.png", "./Continuous-temporal/video_captioning/video_captioning_56_5.png", "./Continuous-temporal/video_captioning/video_captioning_56_6.png", "./Continuous-temporal/video_captioning/video_captioning_56_7.png", "./Continuous-temporal/video_captioning/video_captioning_56_8.png", "./Continuous-temporal/video_captioning/video_captioning_56_9.png", "./Continuous-temporal/video_captioning/video_captioning_56_10.png", "./Continuous-temporal/video_captioning/video_captioning_56_11.png", "./Continuous-temporal/video_captioning/video_captioning_56_12.png", "./Continuous-temporal/video_captioning/video_captioning_56_13.png", "./Continuous-temporal/video_captioning/video_captioning_56_14.png", "./Continuous-temporal/video_captioning/video_captioning_56_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: two women are embracing\nB: a man and a woman are dancing\nC: a woman is crying alone\nD: two men are fighting", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: two women are embracing\nB: a man and a woman are dancing\nC: a woman is crying alone\nD: two men are fighting", 
"input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_57_0.png", "./Continuous-temporal/video_captioning/video_captioning_57_1.png", "./Continuous-temporal/video_captioning/video_captioning_57_2.png", "./Continuous-temporal/video_captioning/video_captioning_57_3.png", "./Continuous-temporal/video_captioning/video_captioning_57_4.png", "./Continuous-temporal/video_captioning/video_captioning_57_5.png", "./Continuous-temporal/video_captioning/video_captioning_57_6.png", "./Continuous-temporal/video_captioning/video_captioning_57_7.png", "./Continuous-temporal/video_captioning/video_captioning_57_8.png", "./Continuous-temporal/video_captioning/video_captioning_57_9.png", "./Continuous-temporal/video_captioning/video_captioning_57_10.png", "./Continuous-temporal/video_captioning/video_captioning_57_11.png", "./Continuous-temporal/video_captioning/video_captioning_57_12.png", "./Continuous-temporal/video_captioning/video_captioning_57_13.png", "./Continuous-temporal/video_captioning/video_captioning_57_14.png", "./Continuous-temporal/video_captioning/video_captioning_57_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a bird is swimming in a lake\nB: a dog is driving a car\nC: a cat is sitting on a bicycle\nD: a monkey is riding a bus", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a bird is swimming in a lake\nB: a dog is driving a car\nC: a cat is sitting on a bicycle\nD: a monkey is riding a bus", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_58_0.png", "./Continuous-temporal/video_captioning/video_captioning_58_1.png", "./Continuous-temporal/video_captioning/video_captioning_58_2.png", "./Continuous-temporal/video_captioning/video_captioning_58_3.png", 
"./Continuous-temporal/video_captioning/video_captioning_58_4.png", "./Continuous-temporal/video_captioning/video_captioning_58_5.png", "./Continuous-temporal/video_captioning/video_captioning_58_6.png", "./Continuous-temporal/video_captioning/video_captioning_58_7.png", "./Continuous-temporal/video_captioning/video_captioning_58_8.png", "./Continuous-temporal/video_captioning/video_captioning_58_9.png", "./Continuous-temporal/video_captioning/video_captioning_58_10.png", "./Continuous-temporal/video_captioning/video_captioning_58_11.png", "./Continuous-temporal/video_captioning/video_captioning_58_12.png", "./Continuous-temporal/video_captioning/video_captioning_58_13.png", "./Continuous-temporal/video_captioning/video_captioning_58_14.png", "./Continuous-temporal/video_captioning/video_captioning_58_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man demonstrating how to assemble a bicycle\nB: a group of people doing yoga in a park\nC: two ladies in a cookery show explain how to marinate chicken already cleaned and ready with salt and cilantro sprigs\nD: a person painting a landscape with a brush and palette", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man demonstrating how to assemble a bicycle\nB: a group of people doing yoga in a park\nC: two ladies in a cookery show explain how to marinate chicken already cleaned and ready with salt and cilantro sprigs\nD: a person painting a landscape with a brush and palette", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_59_0.png", "./Continuous-temporal/video_captioning/video_captioning_59_1.png", "./Continuous-temporal/video_captioning/video_captioning_59_2.png", "./Continuous-temporal/video_captioning/video_captioning_59_3.png", 
"./Continuous-temporal/video_captioning/video_captioning_59_4.png", "./Continuous-temporal/video_captioning/video_captioning_59_5.png", "./Continuous-temporal/video_captioning/video_captioning_59_6.png", "./Continuous-temporal/video_captioning/video_captioning_59_7.png", "./Continuous-temporal/video_captioning/video_captioning_59_8.png", "./Continuous-temporal/video_captioning/video_captioning_59_9.png", "./Continuous-temporal/video_captioning/video_captioning_59_10.png", "./Continuous-temporal/video_captioning/video_captioning_59_11.png", "./Continuous-temporal/video_captioning/video_captioning_59_12.png", "./Continuous-temporal/video_captioning/video_captioning_59_13.png", "./Continuous-temporal/video_captioning/video_captioning_59_14.png", "./Continuous-temporal/video_captioning/video_captioning_59_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a car is driving along a busy street\nB: a computer is displaying a software code\nC: several people are talking in a room\nD: a flower and other natural scenes are displaying", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a car is driving along a busy street\nB: a computer is displaying a software code\nC: several people are talking in a room\nD: a flower and other natural scenes are displaying", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_60_0.png", "./Continuous-temporal/video_captioning/video_captioning_60_1.png", "./Continuous-temporal/video_captioning/video_captioning_60_2.png", "./Continuous-temporal/video_captioning/video_captioning_60_3.png", "./Continuous-temporal/video_captioning/video_captioning_60_4.png", "./Continuous-temporal/video_captioning/video_captioning_60_5.png", "./Continuous-temporal/video_captioning/video_captioning_60_6.png", 
"./Continuous-temporal/video_captioning/video_captioning_60_7.png", "./Continuous-temporal/video_captioning/video_captioning_60_8.png", "./Continuous-temporal/video_captioning/video_captioning_60_9.png", "./Continuous-temporal/video_captioning/video_captioning_60_10.png", "./Continuous-temporal/video_captioning/video_captioning_60_11.png", "./Continuous-temporal/video_captioning/video_captioning_60_12.png", "./Continuous-temporal/video_captioning/video_captioning_60_13.png", "./Continuous-temporal/video_captioning/video_captioning_60_14.png", "./Continuous-temporal/video_captioning/video_captioning_60_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a player is putting a basket ball into the basket from distance\nB: a player is shooting a soccer ball into the goal from close range\nC: a player is hitting a tennis ball across the net with a racket\nD: a player is throwing a football into the end zone from a distance", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a player is putting a basket ball into the basket from distance\nB: a player is shooting a soccer ball into the goal from close range\nC: a player is hitting a tennis ball across the net with a racket\nD: a player is throwing a football into the end zone from a distance", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_61_0.png", "./Continuous-temporal/video_captioning/video_captioning_61_1.png", "./Continuous-temporal/video_captioning/video_captioning_61_2.png", "./Continuous-temporal/video_captioning/video_captioning_61_3.png", "./Continuous-temporal/video_captioning/video_captioning_61_4.png", "./Continuous-temporal/video_captioning/video_captioning_61_5.png", "./Continuous-temporal/video_captioning/video_captioning_61_6.png", 
"./Continuous-temporal/video_captioning/video_captioning_61_7.png", "./Continuous-temporal/video_captioning/video_captioning_61_8.png", "./Continuous-temporal/video_captioning/video_captioning_61_9.png", "./Continuous-temporal/video_captioning/video_captioning_61_10.png", "./Continuous-temporal/video_captioning/video_captioning_61_11.png", "./Continuous-temporal/video_captioning/video_captioning_61_12.png", "./Continuous-temporal/video_captioning/video_captioning_61_13.png", "./Continuous-temporal/video_captioning/video_captioning_61_14.png", "./Continuous-temporal/video_captioning/video_captioning_61_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a video of animals running in a jungle\nB: a video is shown showing different cars\nC: a video of a chef cooking in a restaurant\nD: a video of people shopping in a mall", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a video of animals running in a jungle\nB: a video is shown showing different cars\nC: a video of a chef cooking in a restaurant\nD: a video of people shopping in a mall", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_62_0.png", "./Continuous-temporal/video_captioning/video_captioning_62_1.png", "./Continuous-temporal/video_captioning/video_captioning_62_2.png", "./Continuous-temporal/video_captioning/video_captioning_62_3.png", "./Continuous-temporal/video_captioning/video_captioning_62_4.png", "./Continuous-temporal/video_captioning/video_captioning_62_5.png", "./Continuous-temporal/video_captioning/video_captioning_62_6.png", "./Continuous-temporal/video_captioning/video_captioning_62_7.png", "./Continuous-temporal/video_captioning/video_captioning_62_8.png", "./Continuous-temporal/video_captioning/video_captioning_62_9.png", 
"./Continuous-temporal/video_captioning/video_captioning_62_10.png", "./Continuous-temporal/video_captioning/video_captioning_62_11.png", "./Continuous-temporal/video_captioning/video_captioning_62_12.png", "./Continuous-temporal/video_captioning/video_captioning_62_13.png", "./Continuous-temporal/video_captioning/video_captioning_62_14.png", "./Continuous-temporal/video_captioning/video_captioning_62_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a child is riding a bicycle in the park\nB: a man is swimming in a pool\nC: a man is playing baseball\nD: a woman is cooking in the kitchen", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a child is riding a bicycle in the park\nB: a man is swimming in a pool\nC: a man is playing baseball\nD: a woman is cooking in the kitchen", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_63_0.png", "./Continuous-temporal/video_captioning/video_captioning_63_1.png", "./Continuous-temporal/video_captioning/video_captioning_63_2.png", "./Continuous-temporal/video_captioning/video_captioning_63_3.png", "./Continuous-temporal/video_captioning/video_captioning_63_4.png", "./Continuous-temporal/video_captioning/video_captioning_63_5.png", "./Continuous-temporal/video_captioning/video_captioning_63_6.png", "./Continuous-temporal/video_captioning/video_captioning_63_7.png", "./Continuous-temporal/video_captioning/video_captioning_63_8.png", "./Continuous-temporal/video_captioning/video_captioning_63_9.png", "./Continuous-temporal/video_captioning/video_captioning_63_10.png", "./Continuous-temporal/video_captioning/video_captioning_63_11.png", "./Continuous-temporal/video_captioning/video_captioning_63_12.png", "./Continuous-temporal/video_captioning/video_captioning_63_13.png", 
"./Continuous-temporal/video_captioning/video_captioning_63_14.png", "./Continuous-temporal/video_captioning/video_captioning_63_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man throwing a ball and a man missing the catch\nB: a man kicking a ball and a man dropping the ball\nC: a man hitting a ball and man catching the ball in the field\nD: a man hitting a ball and the ball rolling away", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man throwing a ball and a man missing the catch\nB: a man kicking a ball and a man dropping the ball\nC: a man hitting a ball and man catching the ball in the field\nD: a man hitting a ball and the ball rolling away", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_64_0.png", "./Continuous-temporal/video_captioning/video_captioning_64_1.png", "./Continuous-temporal/video_captioning/video_captioning_64_2.png", "./Continuous-temporal/video_captioning/video_captioning_64_3.png", "./Continuous-temporal/video_captioning/video_captioning_64_4.png", "./Continuous-temporal/video_captioning/video_captioning_64_5.png", "./Continuous-temporal/video_captioning/video_captioning_64_6.png", "./Continuous-temporal/video_captioning/video_captioning_64_7.png", "./Continuous-temporal/video_captioning/video_captioning_64_8.png", "./Continuous-temporal/video_captioning/video_captioning_64_9.png", "./Continuous-temporal/video_captioning/video_captioning_64_10.png", "./Continuous-temporal/video_captioning/video_captioning_64_11.png", "./Continuous-temporal/video_captioning/video_captioning_64_12.png", "./Continuous-temporal/video_captioning/video_captioning_64_13.png", "./Continuous-temporal/video_captioning/video_captioning_64_14.png", "./Continuous-temporal/video_captioning/video_captioning_64_15.png"], 
"output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the chef is grilling chicken\nB: the man is chopping tomatoes\nC: the man is cooking onions\nD: the woman is frying eggs", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the chef is grilling chicken\nB: the man is chopping tomatoes\nC: the man is cooking onions\nD: the woman is frying eggs", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_65_0.png", "./Continuous-temporal/video_captioning/video_captioning_65_1.png", "./Continuous-temporal/video_captioning/video_captioning_65_2.png", "./Continuous-temporal/video_captioning/video_captioning_65_3.png", "./Continuous-temporal/video_captioning/video_captioning_65_4.png", "./Continuous-temporal/video_captioning/video_captioning_65_5.png", "./Continuous-temporal/video_captioning/video_captioning_65_6.png", "./Continuous-temporal/video_captioning/video_captioning_65_7.png", "./Continuous-temporal/video_captioning/video_captioning_65_8.png", "./Continuous-temporal/video_captioning/video_captioning_65_9.png", "./Continuous-temporal/video_captioning/video_captioning_65_10.png", "./Continuous-temporal/video_captioning/video_captioning_65_11.png", "./Continuous-temporal/video_captioning/video_captioning_65_12.png", "./Continuous-temporal/video_captioning/video_captioning_65_13.png", "./Continuous-temporal/video_captioning/video_captioning_65_14.png", "./Continuous-temporal/video_captioning/video_captioning_65_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: young man in a black jacket playing the guitar\nB: blonde woman in red dress dancing in the kitchen\nC: elderly woman in purple sweater knitting a scarf\nD: bearded guy in 
grey tshirt talking to the camera", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: young man in a black jacket playing the guitar\nB: blonde woman in red dress dancing in the kitchen\nC: elderly woman in purple sweater knitting a scarf\nD: bearded guy in grey tshirt talking to the camera", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_66_0.png", "./Continuous-temporal/video_captioning/video_captioning_66_1.png", "./Continuous-temporal/video_captioning/video_captioning_66_2.png", "./Continuous-temporal/video_captioning/video_captioning_66_3.png", "./Continuous-temporal/video_captioning/video_captioning_66_4.png", "./Continuous-temporal/video_captioning/video_captioning_66_5.png", "./Continuous-temporal/video_captioning/video_captioning_66_6.png", "./Continuous-temporal/video_captioning/video_captioning_66_7.png", "./Continuous-temporal/video_captioning/video_captioning_66_8.png", "./Continuous-temporal/video_captioning/video_captioning_66_9.png", "./Continuous-temporal/video_captioning/video_captioning_66_10.png", "./Continuous-temporal/video_captioning/video_captioning_66_11.png", "./Continuous-temporal/video_captioning/video_captioning_66_12.png", "./Continuous-temporal/video_captioning/video_captioning_66_13.png", "./Continuous-temporal/video_captioning/video_captioning_66_14.png", "./Continuous-temporal/video_captioning/video_captioning_66_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the couple walked arm in arm down the path\nB: two people ran together along the sidewalk\nC: a single person strolled casually along the road\nD: the group skipped happily through the field", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the couple 
walked arm in arm down the path\nB: two people ran together along the sidewalk\nC: a single person strolled casually along the road\nD: the group skipped happily through the field", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_67_0.png", "./Continuous-temporal/video_captioning/video_captioning_67_1.png", "./Continuous-temporal/video_captioning/video_captioning_67_2.png", "./Continuous-temporal/video_captioning/video_captioning_67_3.png", "./Continuous-temporal/video_captioning/video_captioning_67_4.png", "./Continuous-temporal/video_captioning/video_captioning_67_5.png", "./Continuous-temporal/video_captioning/video_captioning_67_6.png", "./Continuous-temporal/video_captioning/video_captioning_67_7.png", "./Continuous-temporal/video_captioning/video_captioning_67_8.png", "./Continuous-temporal/video_captioning/video_captioning_67_9.png", "./Continuous-temporal/video_captioning/video_captioning_67_10.png", "./Continuous-temporal/video_captioning/video_captioning_67_11.png", "./Continuous-temporal/video_captioning/video_captioning_67_12.png", "./Continuous-temporal/video_captioning/video_captioning_67_13.png", "./Continuous-temporal/video_captioning/video_captioning_67_14.png", "./Continuous-temporal/video_captioning/video_captioning_67_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a chef prepares a delicious meal\nB: a woman opens a door to find a surprise party\nC: a cat plays with a ball of yarn\nD: a man punches a faucet to show how much better bottled water is", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a chef prepares a delicious meal\nB: a woman opens a door to find a surprise party\nC: a cat plays with a ball of yarn\nD: a man punches a faucet to show how much better bottled water is", "input_image_path": 
["./Continuous-temporal/video_captioning/video_captioning_68_0.png", "./Continuous-temporal/video_captioning/video_captioning_68_1.png", "./Continuous-temporal/video_captioning/video_captioning_68_2.png", "./Continuous-temporal/video_captioning/video_captioning_68_3.png", "./Continuous-temporal/video_captioning/video_captioning_68_4.png", "./Continuous-temporal/video_captioning/video_captioning_68_5.png", "./Continuous-temporal/video_captioning/video_captioning_68_6.png", "./Continuous-temporal/video_captioning/video_captioning_68_7.png", "./Continuous-temporal/video_captioning/video_captioning_68_8.png", "./Continuous-temporal/video_captioning/video_captioning_68_9.png", "./Continuous-temporal/video_captioning/video_captioning_68_10.png", "./Continuous-temporal/video_captioning/video_captioning_68_11.png", "./Continuous-temporal/video_captioning/video_captioning_68_12.png", "./Continuous-temporal/video_captioning/video_captioning_68_13.png", "./Continuous-temporal/video_captioning/video_captioning_68_14.png", "./Continuous-temporal/video_captioning/video_captioning_68_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a yellow bird is swimming in the ocean\nB: a black poodle is giving a man a highfive\nC: a brown dog is playing with a ball\nD: a white cat is riding a bicycle", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a yellow bird is swimming in the ocean\nB: a black poodle is giving a man a highfive\nC: a brown dog is playing with a ball\nD: a white cat is riding a bicycle", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_69_0.png", "./Continuous-temporal/video_captioning/video_captioning_69_1.png", "./Continuous-temporal/video_captioning/video_captioning_69_2.png", 
"./Continuous-temporal/video_captioning/video_captioning_69_3.png", "./Continuous-temporal/video_captioning/video_captioning_69_4.png", "./Continuous-temporal/video_captioning/video_captioning_69_5.png", "./Continuous-temporal/video_captioning/video_captioning_69_6.png", "./Continuous-temporal/video_captioning/video_captioning_69_7.png", "./Continuous-temporal/video_captioning/video_captioning_69_8.png", "./Continuous-temporal/video_captioning/video_captioning_69_9.png", "./Continuous-temporal/video_captioning/video_captioning_69_10.png", "./Continuous-temporal/video_captioning/video_captioning_69_11.png", "./Continuous-temporal/video_captioning/video_captioning_69_12.png", "./Continuous-temporal/video_captioning/video_captioning_69_13.png", "./Continuous-temporal/video_captioning/video_captioning_69_14.png", "./Continuous-temporal/video_captioning/video_captioning_69_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a dog is chasing a squirrel\nB: a bird is building a nest\nC: someone is holding a skunk\nD: a cat is playing with a ball", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a dog is chasing a squirrel\nB: a bird is building a nest\nC: someone is holding a skunk\nD: a cat is playing with a ball", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_70_0.png", "./Continuous-temporal/video_captioning/video_captioning_70_1.png", "./Continuous-temporal/video_captioning/video_captioning_70_2.png", "./Continuous-temporal/video_captioning/video_captioning_70_3.png", "./Continuous-temporal/video_captioning/video_captioning_70_4.png", "./Continuous-temporal/video_captioning/video_captioning_70_5.png", "./Continuous-temporal/video_captioning/video_captioning_70_6.png", 
"./Continuous-temporal/video_captioning/video_captioning_70_7.png", "./Continuous-temporal/video_captioning/video_captioning_70_8.png", "./Continuous-temporal/video_captioning/video_captioning_70_9.png", "./Continuous-temporal/video_captioning/video_captioning_70_10.png", "./Continuous-temporal/video_captioning/video_captioning_70_11.png", "./Continuous-temporal/video_captioning/video_captioning_70_12.png", "./Continuous-temporal/video_captioning/video_captioning_70_13.png", "./Continuous-temporal/video_captioning/video_captioning_70_14.png", "./Continuous-temporal/video_captioning/video_captioning_70_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman wearing a red dress dances in front of a mirror\nB: two dogs playing in the park\nC: a chef cooking in the kitchen\nD: a guy wearing a black shirt talks and shows a chart on the tv screen", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman wearing a red dress dances in front of a mirror\nB: two dogs playing in the park\nC: a chef cooking in the kitchen\nD: a guy wearing a black shirt talks and shows a chart on the tv screen", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_71_0.png", "./Continuous-temporal/video_captioning/video_captioning_71_1.png", "./Continuous-temporal/video_captioning/video_captioning_71_2.png", "./Continuous-temporal/video_captioning/video_captioning_71_3.png", "./Continuous-temporal/video_captioning/video_captioning_71_4.png", "./Continuous-temporal/video_captioning/video_captioning_71_5.png", "./Continuous-temporal/video_captioning/video_captioning_71_6.png", "./Continuous-temporal/video_captioning/video_captioning_71_7.png", "./Continuous-temporal/video_captioning/video_captioning_71_8.png", 
"./Continuous-temporal/video_captioning/video_captioning_71_9.png", "./Continuous-temporal/video_captioning/video_captioning_71_10.png", "./Continuous-temporal/video_captioning/video_captioning_71_11.png", "./Continuous-temporal/video_captioning/video_captioning_71_12.png", "./Continuous-temporal/video_captioning/video_captioning_71_13.png", "./Continuous-temporal/video_captioning/video_captioning_71_14.png", "./Continuous-temporal/video_captioning/video_captioning_71_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a panda is searching something\nB: a tiger is hunting for prey\nC: a bear is swimming in the river\nD: a monkey is swinging from tree to tree", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a panda is searching something\nB: a tiger is hunting for prey\nC: a bear is swimming in the river\nD: a monkey is swinging from tree to tree", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_72_0.png", "./Continuous-temporal/video_captioning/video_captioning_72_1.png", "./Continuous-temporal/video_captioning/video_captioning_72_2.png", "./Continuous-temporal/video_captioning/video_captioning_72_3.png", "./Continuous-temporal/video_captioning/video_captioning_72_4.png", "./Continuous-temporal/video_captioning/video_captioning_72_5.png", "./Continuous-temporal/video_captioning/video_captioning_72_6.png", "./Continuous-temporal/video_captioning/video_captioning_72_7.png", "./Continuous-temporal/video_captioning/video_captioning_72_8.png", "./Continuous-temporal/video_captioning/video_captioning_72_9.png", "./Continuous-temporal/video_captioning/video_captioning_72_10.png", "./Continuous-temporal/video_captioning/video_captioning_72_11.png", "./Continuous-temporal/video_captioning/video_captioning_72_12.png", 
"./Continuous-temporal/video_captioning/video_captioning_72_13.png", "./Continuous-temporal/video_captioning/video_captioning_72_14.png", "./Continuous-temporal/video_captioning/video_captioning_72_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a chef prepares a meal\nB: an artist explains a sketch\nC: a teacher solves a math problem\nD: a construction worker operates heavy machinery", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a chef prepares a meal\nB: an artist explains a sketch\nC: a teacher solves a math problem\nD: a construction worker operates heavy machinery", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_73_0.png", "./Continuous-temporal/video_captioning/video_captioning_73_1.png", "./Continuous-temporal/video_captioning/video_captioning_73_2.png", "./Continuous-temporal/video_captioning/video_captioning_73_3.png", "./Continuous-temporal/video_captioning/video_captioning_73_4.png", "./Continuous-temporal/video_captioning/video_captioning_73_5.png", "./Continuous-temporal/video_captioning/video_captioning_73_6.png", "./Continuous-temporal/video_captioning/video_captioning_73_7.png", "./Continuous-temporal/video_captioning/video_captioning_73_8.png", "./Continuous-temporal/video_captioning/video_captioning_73_9.png", "./Continuous-temporal/video_captioning/video_captioning_73_10.png", "./Continuous-temporal/video_captioning/video_captioning_73_11.png", "./Continuous-temporal/video_captioning/video_captioning_73_12.png", "./Continuous-temporal/video_captioning/video_captioning_73_13.png", "./Continuous-temporal/video_captioning/video_captioning_73_14.png", "./Continuous-temporal/video_captioning/video_captioning_73_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", 
"visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman is standing on a table\nB: a man is sitting on a chair\nC: a cat is jumping on the bed\nD: a dog is lying on the floor", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman is standing on a table\nB: a man is sitting on a chair\nC: a cat is jumping on the bed\nD: a dog is lying on the floor", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_74_0.png", "./Continuous-temporal/video_captioning/video_captioning_74_1.png", "./Continuous-temporal/video_captioning/video_captioning_74_2.png", "./Continuous-temporal/video_captioning/video_captioning_74_3.png", "./Continuous-temporal/video_captioning/video_captioning_74_4.png", "./Continuous-temporal/video_captioning/video_captioning_74_5.png", "./Continuous-temporal/video_captioning/video_captioning_74_6.png", "./Continuous-temporal/video_captioning/video_captioning_74_7.png", "./Continuous-temporal/video_captioning/video_captioning_74_8.png", "./Continuous-temporal/video_captioning/video_captioning_74_9.png", "./Continuous-temporal/video_captioning/video_captioning_74_10.png", "./Continuous-temporal/video_captioning/video_captioning_74_11.png", "./Continuous-temporal/video_captioning/video_captioning_74_12.png", "./Continuous-temporal/video_captioning/video_captioning_74_13.png", "./Continuous-temporal/video_captioning/video_captioning_74_14.png", "./Continuous-temporal/video_captioning/video_captioning_74_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman discusses her favorite movie and its impact on her life\nB: a man talks about a new diet plan he follows every day\nC: a woman talks about a skin care treatment she takes with her everwhere\nD: a group of people participate 
in a cooking competition and share their recipes with the audience", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman discusses her favorite movie and its impact on her life\nB: a man talks about a new diet plan he follows every day\nC: a woman talks about a skin care treatment she takes with her everwhere\nD: a group of people participate in a cooking competition and share their recipes with the audience", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_75_0.png", "./Continuous-temporal/video_captioning/video_captioning_75_1.png", "./Continuous-temporal/video_captioning/video_captioning_75_2.png", "./Continuous-temporal/video_captioning/video_captioning_75_3.png", "./Continuous-temporal/video_captioning/video_captioning_75_4.png", "./Continuous-temporal/video_captioning/video_captioning_75_5.png", "./Continuous-temporal/video_captioning/video_captioning_75_6.png", "./Continuous-temporal/video_captioning/video_captioning_75_7.png", "./Continuous-temporal/video_captioning/video_captioning_75_8.png", "./Continuous-temporal/video_captioning/video_captioning_75_9.png", "./Continuous-temporal/video_captioning/video_captioning_75_10.png", "./Continuous-temporal/video_captioning/video_captioning_75_11.png", "./Continuous-temporal/video_captioning/video_captioning_75_12.png", "./Continuous-temporal/video_captioning/video_captioning_75_13.png", "./Continuous-temporal/video_captioning/video_captioning_75_14.png", "./Continuous-temporal/video_captioning/video_captioning_75_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: women are getting into the taxi\nB: men are walking away from the taxi\nC: girls are getting down from the taxi\nD: boys are getting down from the bus", "question": "Please generate textual descriptions for a 
sequence of video frames.", "context": "Select from the following choices.\nA: women are getting into the taxi\nB: men are walking away from the taxi\nC: girls are getting down from the taxi\nD: boys are getting down from the bus", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_76_0.png", "./Continuous-temporal/video_captioning/video_captioning_76_1.png", "./Continuous-temporal/video_captioning/video_captioning_76_2.png", "./Continuous-temporal/video_captioning/video_captioning_76_3.png", "./Continuous-temporal/video_captioning/video_captioning_76_4.png", "./Continuous-temporal/video_captioning/video_captioning_76_5.png", "./Continuous-temporal/video_captioning/video_captioning_76_6.png", "./Continuous-temporal/video_captioning/video_captioning_76_7.png", "./Continuous-temporal/video_captioning/video_captioning_76_8.png", "./Continuous-temporal/video_captioning/video_captioning_76_9.png", "./Continuous-temporal/video_captioning/video_captioning_76_10.png", "./Continuous-temporal/video_captioning/video_captioning_76_11.png", "./Continuous-temporal/video_captioning/video_captioning_76_12.png", "./Continuous-temporal/video_captioning/video_captioning_76_13.png", "./Continuous-temporal/video_captioning/video_captioning_76_14.png", "./Continuous-temporal/video_captioning/video_captioning_76_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a chef slices a tomato with a kitchen knife\nB: a man chops a chicken in two with an axe\nC: a woman cuts a watermelon with a knife\nD: a man cooks scrambled eggs in a pan", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a chef slices a tomato with a kitchen knife\nB: a man chops a chicken in two with an axe\nC: a woman cuts a watermelon with a knife\nD: a man cooks scrambled eggs in a pan", 
"input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_77_0.png", "./Continuous-temporal/video_captioning/video_captioning_77_1.png", "./Continuous-temporal/video_captioning/video_captioning_77_2.png", "./Continuous-temporal/video_captioning/video_captioning_77_3.png", "./Continuous-temporal/video_captioning/video_captioning_77_4.png", "./Continuous-temporal/video_captioning/video_captioning_77_5.png", "./Continuous-temporal/video_captioning/video_captioning_77_6.png", "./Continuous-temporal/video_captioning/video_captioning_77_7.png", "./Continuous-temporal/video_captioning/video_captioning_77_8.png", "./Continuous-temporal/video_captioning/video_captioning_77_9.png", "./Continuous-temporal/video_captioning/video_captioning_77_10.png", "./Continuous-temporal/video_captioning/video_captioning_77_11.png", "./Continuous-temporal/video_captioning/video_captioning_77_12.png", "./Continuous-temporal/video_captioning/video_captioning_77_13.png", "./Continuous-temporal/video_captioning/video_captioning_77_14.png", "./Continuous-temporal/video_captioning/video_captioning_77_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: film critics interpreting the meaning behind movie moments\nB: couples describing the logic behind movie scenes\nC: individuals analyzing the emotions in movie clips\nD: actors discussing their favorite movie scenes", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: film critics interpreting the meaning behind movie moments\nB: couples describing the logic behind movie scenes\nC: individuals analyzing the emotions in movie clips\nD: actors discussing their favorite movie scenes", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_78_0.png", 
"./Continuous-temporal/video_captioning/video_captioning_78_1.png", "./Continuous-temporal/video_captioning/video_captioning_78_2.png", "./Continuous-temporal/video_captioning/video_captioning_78_3.png", "./Continuous-temporal/video_captioning/video_captioning_78_4.png", "./Continuous-temporal/video_captioning/video_captioning_78_5.png", "./Continuous-temporal/video_captioning/video_captioning_78_6.png", "./Continuous-temporal/video_captioning/video_captioning_78_7.png", "./Continuous-temporal/video_captioning/video_captioning_78_8.png", "./Continuous-temporal/video_captioning/video_captioning_78_9.png", "./Continuous-temporal/video_captioning/video_captioning_78_10.png", "./Continuous-temporal/video_captioning/video_captioning_78_11.png", "./Continuous-temporal/video_captioning/video_captioning_78_12.png", "./Continuous-temporal/video_captioning/video_captioning_78_13.png", "./Continuous-temporal/video_captioning/video_captioning_78_14.png", "./Continuous-temporal/video_captioning/video_captioning_78_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a person rides a bike downhill\nB: a person drives a car on the highway\nC: a person walks a dog in the park\nD: a person flies a kite in the field", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a person rides a bike downhill\nB: a person drives a car on the highway\nC: a person walks a dog in the park\nD: a person flies a kite in the field", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_79_0.png", "./Continuous-temporal/video_captioning/video_captioning_79_1.png", "./Continuous-temporal/video_captioning/video_captioning_79_2.png", "./Continuous-temporal/video_captioning/video_captioning_79_3.png", "./Continuous-temporal/video_captioning/video_captioning_79_4.png", 
"./Continuous-temporal/video_captioning/video_captioning_79_5.png", "./Continuous-temporal/video_captioning/video_captioning_79_6.png", "./Continuous-temporal/video_captioning/video_captioning_79_7.png", "./Continuous-temporal/video_captioning/video_captioning_79_8.png", "./Continuous-temporal/video_captioning/video_captioning_79_9.png", "./Continuous-temporal/video_captioning/video_captioning_79_10.png", "./Continuous-temporal/video_captioning/video_captioning_79_11.png", "./Continuous-temporal/video_captioning/video_captioning_79_12.png", "./Continuous-temporal/video_captioning/video_captioning_79_13.png", "./Continuous-temporal/video_captioning/video_captioning_79_14.png", "./Continuous-temporal/video_captioning/video_captioning_79_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a child is playing with his toy\nB: a man is drinking his drink\nC: a person is sleeping on the bed\nD: a woman is eating her food", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a child is playing with his toy\nB: a man is drinking his drink\nC: a person is sleeping on the bed\nD: a woman is eating her food", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_80_0.png", "./Continuous-temporal/video_captioning/video_captioning_80_1.png", "./Continuous-temporal/video_captioning/video_captioning_80_2.png", "./Continuous-temporal/video_captioning/video_captioning_80_3.png", "./Continuous-temporal/video_captioning/video_captioning_80_4.png", "./Continuous-temporal/video_captioning/video_captioning_80_5.png", "./Continuous-temporal/video_captioning/video_captioning_80_6.png", "./Continuous-temporal/video_captioning/video_captioning_80_7.png", "./Continuous-temporal/video_captioning/video_captioning_80_8.png", 
"./Continuous-temporal/video_captioning/video_captioning_80_9.png", "./Continuous-temporal/video_captioning/video_captioning_80_10.png", "./Continuous-temporal/video_captioning/video_captioning_80_11.png", "./Continuous-temporal/video_captioning/video_captioning_80_12.png", "./Continuous-temporal/video_captioning/video_captioning_80_13.png", "./Continuous-temporal/video_captioning/video_captioning_80_14.png", "./Continuous-temporal/video_captioning/video_captioning_80_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a child is playing with toys\nB: a man is shaving\nC: a woman is putting on makeup\nD: a woman is cooking a meal", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a child is playing with toys\nB: a man is shaving\nC: a woman is putting on makeup\nD: a woman is cooking a meal", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_81_0.png", "./Continuous-temporal/video_captioning/video_captioning_81_1.png", "./Continuous-temporal/video_captioning/video_captioning_81_2.png", "./Continuous-temporal/video_captioning/video_captioning_81_3.png", "./Continuous-temporal/video_captioning/video_captioning_81_4.png", "./Continuous-temporal/video_captioning/video_captioning_81_5.png", "./Continuous-temporal/video_captioning/video_captioning_81_6.png", "./Continuous-temporal/video_captioning/video_captioning_81_7.png", "./Continuous-temporal/video_captioning/video_captioning_81_8.png", "./Continuous-temporal/video_captioning/video_captioning_81_9.png", "./Continuous-temporal/video_captioning/video_captioning_81_10.png", "./Continuous-temporal/video_captioning/video_captioning_81_11.png", "./Continuous-temporal/video_captioning/video_captioning_81_12.png", "./Continuous-temporal/video_captioning/video_captioning_81_13.png", 
"./Continuous-temporal/video_captioning/video_captioning_81_14.png", "./Continuous-temporal/video_captioning/video_captioning_81_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: two animals are fighting over a bowl of fruit\nB: a single animal is eating from a plate\nC: two animals are eating what appears to be apple slices from a pan within a cage\nD: two animals are playing with a ball inside a cage", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: two animals are fighting over a bowl of fruit\nB: a single animal is eating from a plate\nC: two animals are eating what appears to be apple slices from a pan within a cage\nD: two animals are playing with a ball inside a cage", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_82_0.png", "./Continuous-temporal/video_captioning/video_captioning_82_1.png", "./Continuous-temporal/video_captioning/video_captioning_82_2.png", "./Continuous-temporal/video_captioning/video_captioning_82_3.png", "./Continuous-temporal/video_captioning/video_captioning_82_4.png", "./Continuous-temporal/video_captioning/video_captioning_82_5.png", "./Continuous-temporal/video_captioning/video_captioning_82_6.png", "./Continuous-temporal/video_captioning/video_captioning_82_7.png", "./Continuous-temporal/video_captioning/video_captioning_82_8.png", "./Continuous-temporal/video_captioning/video_captioning_82_9.png", "./Continuous-temporal/video_captioning/video_captioning_82_10.png", "./Continuous-temporal/video_captioning/video_captioning_82_11.png", "./Continuous-temporal/video_captioning/video_captioning_82_12.png", "./Continuous-temporal/video_captioning/video_captioning_82_13.png", "./Continuous-temporal/video_captioning/video_captioning_82_14.png", 
"./Continuous-temporal/video_captioning/video_captioning_82_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a person is riding a bicycle on a mountain trail\nB: a person is swimming in a pool\nC: a person is being pushed on a stretcher\nD: a person is playing with a dog in a park", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a person is riding a bicycle on a mountain trail\nB: a person is swimming in a pool\nC: a person is being pushed on a stretcher\nD: a person is playing with a dog in a park", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_83_0.png", "./Continuous-temporal/video_captioning/video_captioning_83_1.png", "./Continuous-temporal/video_captioning/video_captioning_83_2.png", "./Continuous-temporal/video_captioning/video_captioning_83_3.png", "./Continuous-temporal/video_captioning/video_captioning_83_4.png", "./Continuous-temporal/video_captioning/video_captioning_83_5.png", "./Continuous-temporal/video_captioning/video_captioning_83_6.png", "./Continuous-temporal/video_captioning/video_captioning_83_7.png", "./Continuous-temporal/video_captioning/video_captioning_83_8.png", "./Continuous-temporal/video_captioning/video_captioning_83_9.png", "./Continuous-temporal/video_captioning/video_captioning_83_10.png", "./Continuous-temporal/video_captioning/video_captioning_83_11.png", "./Continuous-temporal/video_captioning/video_captioning_83_12.png", "./Continuous-temporal/video_captioning/video_captioning_83_13.png", "./Continuous-temporal/video_captioning/video_captioning_83_14.png", "./Continuous-temporal/video_captioning/video_captioning_83_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: 
a bird flies across the room\nB: a dog jumps over a table\nC: a toddler walks by pushing a chair\nD: a kitten runs by pulling a cart", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a bird flies across the room\nB: a dog jumps over a table\nC: a toddler walks by pushing a chair\nD: a kitten runs by pulling a cart", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_84_0.png", "./Continuous-temporal/video_captioning/video_captioning_84_1.png", "./Continuous-temporal/video_captioning/video_captioning_84_2.png", "./Continuous-temporal/video_captioning/video_captioning_84_3.png", "./Continuous-temporal/video_captioning/video_captioning_84_4.png", "./Continuous-temporal/video_captioning/video_captioning_84_5.png", "./Continuous-temporal/video_captioning/video_captioning_84_6.png", "./Continuous-temporal/video_captioning/video_captioning_84_7.png", "./Continuous-temporal/video_captioning/video_captioning_84_8.png", "./Continuous-temporal/video_captioning/video_captioning_84_9.png", "./Continuous-temporal/video_captioning/video_captioning_84_10.png", "./Continuous-temporal/video_captioning/video_captioning_84_11.png", "./Continuous-temporal/video_captioning/video_captioning_84_12.png", "./Continuous-temporal/video_captioning/video_captioning_84_13.png", "./Continuous-temporal/video_captioning/video_captioning_84_14.png", "./Continuous-temporal/video_captioning/video_captioning_84_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a bird is flying in the sky\nB: a dog is chasing a ball\nC: a person is reading a book\nD: a cat is eating food", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a bird is flying in the sky\nB: a dog is chasing a ball\nC: a 
person is reading a book\nD: a cat is eating food", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_85_0.png", "./Continuous-temporal/video_captioning/video_captioning_85_1.png", "./Continuous-temporal/video_captioning/video_captioning_85_2.png", "./Continuous-temporal/video_captioning/video_captioning_85_3.png", "./Continuous-temporal/video_captioning/video_captioning_85_4.png", "./Continuous-temporal/video_captioning/video_captioning_85_5.png", "./Continuous-temporal/video_captioning/video_captioning_85_6.png", "./Continuous-temporal/video_captioning/video_captioning_85_7.png", "./Continuous-temporal/video_captioning/video_captioning_85_8.png", "./Continuous-temporal/video_captioning/video_captioning_85_9.png", "./Continuous-temporal/video_captioning/video_captioning_85_10.png", "./Continuous-temporal/video_captioning/video_captioning_85_11.png", "./Continuous-temporal/video_captioning/video_captioning_85_12.png", "./Continuous-temporal/video_captioning/video_captioning_85_13.png", "./Continuous-temporal/video_captioning/video_captioning_85_14.png", "./Continuous-temporal/video_captioning/video_captioning_85_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man with a cap and walking\nB: a boy wearing a helmet and playing\nC: a woman with a scarf and standing still\nD: a girl with a hat on and dancing", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man with a cap and walking\nB: a boy wearing a helmet and playing\nC: a woman with a scarf and standing still\nD: a girl with a hat on and dancing", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_86_0.png", "./Continuous-temporal/video_captioning/video_captioning_86_1.png", "./Continuous-temporal/video_captioning/video_captioning_86_2.png", 
"./Continuous-temporal/video_captioning/video_captioning_86_3.png", "./Continuous-temporal/video_captioning/video_captioning_86_4.png", "./Continuous-temporal/video_captioning/video_captioning_86_5.png", "./Continuous-temporal/video_captioning/video_captioning_86_6.png", "./Continuous-temporal/video_captioning/video_captioning_86_7.png", "./Continuous-temporal/video_captioning/video_captioning_86_8.png", "./Continuous-temporal/video_captioning/video_captioning_86_9.png", "./Continuous-temporal/video_captioning/video_captioning_86_10.png", "./Continuous-temporal/video_captioning/video_captioning_86_11.png", "./Continuous-temporal/video_captioning/video_captioning_86_12.png", "./Continuous-temporal/video_captioning/video_captioning_86_13.png", "./Continuous-temporal/video_captioning/video_captioning_86_14.png", "./Continuous-temporal/video_captioning/video_captioning_86_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a female tennis player takes a break while the audience watches in silence\nB: a male basketball player receives a penalty while being booed by the crowd\nC: a female soccer player accepts a reward while being cheered on by the crowd\nD: a male football player scores a goal and celebrates alone on the field", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a female tennis player takes a break while the audience watches in silence\nB: a male basketball player receives a penalty while being booed by the crowd\nC: a female soccer player accepts a reward while being cheered on by the crowd\nD: a male football player scores a goal and celebrates alone on the field", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_87_0.png", "./Continuous-temporal/video_captioning/video_captioning_87_1.png", 
"./Continuous-temporal/video_captioning/video_captioning_87_2.png", "./Continuous-temporal/video_captioning/video_captioning_87_3.png", "./Continuous-temporal/video_captioning/video_captioning_87_4.png", "./Continuous-temporal/video_captioning/video_captioning_87_5.png", "./Continuous-temporal/video_captioning/video_captioning_87_6.png", "./Continuous-temporal/video_captioning/video_captioning_87_7.png", "./Continuous-temporal/video_captioning/video_captioning_87_8.png", "./Continuous-temporal/video_captioning/video_captioning_87_9.png", "./Continuous-temporal/video_captioning/video_captioning_87_10.png", "./Continuous-temporal/video_captioning/video_captioning_87_11.png", "./Continuous-temporal/video_captioning/video_captioning_87_12.png", "./Continuous-temporal/video_captioning/video_captioning_87_13.png", "./Continuous-temporal/video_captioning/video_captioning_87_14.png", "./Continuous-temporal/video_captioning/video_captioning_87_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a clip from fox news on the shelby north carolina shooting\nB: a clip from msnbc on the shelby north carolina shooting\nC: a clip from abc news on the shelby north carolina shooting\nD: a clip from cnn on the shelby north carolina shooting", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a clip from fox news on the shelby north carolina shooting\nB: a clip from msnbc on the shelby north carolina shooting\nC: a clip from abc news on the shelby north carolina shooting\nD: a clip from cnn on the shelby north carolina shooting", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_88_0.png", "./Continuous-temporal/video_captioning/video_captioning_88_1.png", "./Continuous-temporal/video_captioning/video_captioning_88_2.png", 
"./Continuous-temporal/video_captioning/video_captioning_88_3.png", "./Continuous-temporal/video_captioning/video_captioning_88_4.png", "./Continuous-temporal/video_captioning/video_captioning_88_5.png", "./Continuous-temporal/video_captioning/video_captioning_88_6.png", "./Continuous-temporal/video_captioning/video_captioning_88_7.png", "./Continuous-temporal/video_captioning/video_captioning_88_8.png", "./Continuous-temporal/video_captioning/video_captioning_88_9.png", "./Continuous-temporal/video_captioning/video_captioning_88_10.png", "./Continuous-temporal/video_captioning/video_captioning_88_11.png", "./Continuous-temporal/video_captioning/video_captioning_88_12.png", "./Continuous-temporal/video_captioning/video_captioning_88_13.png", "./Continuous-temporal/video_captioning/video_captioning_88_14.png", "./Continuous-temporal/video_captioning/video_captioning_88_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman cooking in the kitchen\nB: a man shows how a video game works\nC: a child playing with a pet in the park\nD: a group of students studying in a library", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman cooking in the kitchen\nB: a man shows how a video game works\nC: a child playing with a pet in the park\nD: a group of students studying in a library", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_89_0.png", "./Continuous-temporal/video_captioning/video_captioning_89_1.png", "./Continuous-temporal/video_captioning/video_captioning_89_2.png", "./Continuous-temporal/video_captioning/video_captioning_89_3.png", "./Continuous-temporal/video_captioning/video_captioning_89_4.png", "./Continuous-temporal/video_captioning/video_captioning_89_5.png", 
"./Continuous-temporal/video_captioning/video_captioning_89_6.png", "./Continuous-temporal/video_captioning/video_captioning_89_7.png", "./Continuous-temporal/video_captioning/video_captioning_89_8.png", "./Continuous-temporal/video_captioning/video_captioning_89_9.png", "./Continuous-temporal/video_captioning/video_captioning_89_10.png", "./Continuous-temporal/video_captioning/video_captioning_89_11.png", "./Continuous-temporal/video_captioning/video_captioning_89_12.png", "./Continuous-temporal/video_captioning/video_captioning_89_13.png", "./Continuous-temporal/video_captioning/video_captioning_89_14.png", "./Continuous-temporal/video_captioning/video_captioning_89_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: people are setting up for a cooking show\nB: a group of individuals are rehearsing for a theater performance\nC: men are getting ready for a music program\nD: women are preparing for a dance competition", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: people are setting up for a cooking show\nB: a group of individuals are rehearsing for a theater performance\nC: men are getting ready for a music program\nD: women are preparing for a dance competition", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_90_0.png", "./Continuous-temporal/video_captioning/video_captioning_90_1.png", "./Continuous-temporal/video_captioning/video_captioning_90_2.png", "./Continuous-temporal/video_captioning/video_captioning_90_3.png", "./Continuous-temporal/video_captioning/video_captioning_90_4.png", "./Continuous-temporal/video_captioning/video_captioning_90_5.png", "./Continuous-temporal/video_captioning/video_captioning_90_6.png", "./Continuous-temporal/video_captioning/video_captioning_90_7.png", 
"./Continuous-temporal/video_captioning/video_captioning_90_8.png", "./Continuous-temporal/video_captioning/video_captioning_90_9.png", "./Continuous-temporal/video_captioning/video_captioning_90_10.png", "./Continuous-temporal/video_captioning/video_captioning_90_11.png", "./Continuous-temporal/video_captioning/video_captioning_90_12.png", "./Continuous-temporal/video_captioning/video_captioning_90_13.png", "./Continuous-temporal/video_captioning/video_captioning_90_14.png", "./Continuous-temporal/video_captioning/video_captioning_90_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman is cutting a vegetable into slices with a knife\nB: a chef is grilling a steak on a barbecue\nC: a man is slicing bread with a fork\nD: a person is peeling an apple with a spoon", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman is cutting a vegetable into slices with a knife\nB: a chef is grilling a steak on a barbecue\nC: a man is slicing bread with a fork\nD: a person is peeling an apple with a spoon", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_91_0.png", "./Continuous-temporal/video_captioning/video_captioning_91_1.png", "./Continuous-temporal/video_captioning/video_captioning_91_2.png", "./Continuous-temporal/video_captioning/video_captioning_91_3.png", "./Continuous-temporal/video_captioning/video_captioning_91_4.png", "./Continuous-temporal/video_captioning/video_captioning_91_5.png", "./Continuous-temporal/video_captioning/video_captioning_91_6.png", "./Continuous-temporal/video_captioning/video_captioning_91_7.png", "./Continuous-temporal/video_captioning/video_captioning_91_8.png", "./Continuous-temporal/video_captioning/video_captioning_91_9.png", "./Continuous-temporal/video_captioning/video_captioning_91_10.png", 
"./Continuous-temporal/video_captioning/video_captioning_91_11.png", "./Continuous-temporal/video_captioning/video_captioning_91_12.png", "./Continuous-temporal/video_captioning/video_captioning_91_13.png", "./Continuous-temporal/video_captioning/video_captioning_91_14.png", "./Continuous-temporal/video_captioning/video_captioning_91_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the cat is sleeping on the sofa\nB: the man is cooking in the kitchen\nC: the child is playing with a toy\nD: the woman has a baby monitor", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the cat is sleeping on the sofa\nB: the man is cooking in the kitchen\nC: the child is playing with a toy\nD: the woman has a baby monitor", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_92_0.png", "./Continuous-temporal/video_captioning/video_captioning_92_1.png", "./Continuous-temporal/video_captioning/video_captioning_92_2.png", "./Continuous-temporal/video_captioning/video_captioning_92_3.png", "./Continuous-temporal/video_captioning/video_captioning_92_4.png", "./Continuous-temporal/video_captioning/video_captioning_92_5.png", "./Continuous-temporal/video_captioning/video_captioning_92_6.png", "./Continuous-temporal/video_captioning/video_captioning_92_7.png", "./Continuous-temporal/video_captioning/video_captioning_92_8.png", "./Continuous-temporal/video_captioning/video_captioning_92_9.png", "./Continuous-temporal/video_captioning/video_captioning_92_10.png", "./Continuous-temporal/video_captioning/video_captioning_92_11.png", "./Continuous-temporal/video_captioning/video_captioning_92_12.png", "./Continuous-temporal/video_captioning/video_captioning_92_13.png", "./Continuous-temporal/video_captioning/video_captioning_92_14.png", 
"./Continuous-temporal/video_captioning/video_captioning_92_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a person is cooking\nB: a person is sleeping\nC: a person is eating\nD: a person is swimming", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a person is cooking\nB: a person is sleeping\nC: a person is eating\nD: a person is swimming", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_93_0.png", "./Continuous-temporal/video_captioning/video_captioning_93_1.png", "./Continuous-temporal/video_captioning/video_captioning_93_2.png", "./Continuous-temporal/video_captioning/video_captioning_93_3.png", "./Continuous-temporal/video_captioning/video_captioning_93_4.png", "./Continuous-temporal/video_captioning/video_captioning_93_5.png", "./Continuous-temporal/video_captioning/video_captioning_93_6.png", "./Continuous-temporal/video_captioning/video_captioning_93_7.png", "./Continuous-temporal/video_captioning/video_captioning_93_8.png", "./Continuous-temporal/video_captioning/video_captioning_93_9.png", "./Continuous-temporal/video_captioning/video_captioning_93_10.png", "./Continuous-temporal/video_captioning/video_captioning_93_11.png", "./Continuous-temporal/video_captioning/video_captioning_93_12.png", "./Continuous-temporal/video_captioning/video_captioning_93_13.png", "./Continuous-temporal/video_captioning/video_captioning_93_14.png", "./Continuous-temporal/video_captioning/video_captioning_93_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man is doing stunts on a motorcycle\nB: a girl is swimming in the pool\nC: a boy is playing basketball\nD: a woman is riding a bicycle", "question": "Please 
generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man is doing stunts on a motorcycle\nB: a girl is swimming in the pool\nC: a boy is playing basketball\nD: a woman is riding a bicycle", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_94_0.png", "./Continuous-temporal/video_captioning/video_captioning_94_1.png", "./Continuous-temporal/video_captioning/video_captioning_94_2.png", "./Continuous-temporal/video_captioning/video_captioning_94_3.png", "./Continuous-temporal/video_captioning/video_captioning_94_4.png", "./Continuous-temporal/video_captioning/video_captioning_94_5.png", "./Continuous-temporal/video_captioning/video_captioning_94_6.png", "./Continuous-temporal/video_captioning/video_captioning_94_7.png", "./Continuous-temporal/video_captioning/video_captioning_94_8.png", "./Continuous-temporal/video_captioning/video_captioning_94_9.png", "./Continuous-temporal/video_captioning/video_captioning_94_10.png", "./Continuous-temporal/video_captioning/video_captioning_94_11.png", "./Continuous-temporal/video_captioning/video_captioning_94_12.png", "./Continuous-temporal/video_captioning/video_captioning_94_13.png", "./Continuous-temporal/video_captioning/video_captioning_94_14.png", "./Continuous-temporal/video_captioning/video_captioning_94_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: making coffee in the microwave for flavor\nB: preparing tea in the oven for taste\nC: boiling water on the stove for fragrance\nD: brewing hot chocolate in the kettle for aroma", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: making coffee in the microwave for flavor\nB: preparing tea in the oven for taste\nC: boiling water on the stove for fragrance\nD: brewing hot 
chocolate in the kettle for aroma", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_95_0.png", "./Continuous-temporal/video_captioning/video_captioning_95_1.png", "./Continuous-temporal/video_captioning/video_captioning_95_2.png", "./Continuous-temporal/video_captioning/video_captioning_95_3.png", "./Continuous-temporal/video_captioning/video_captioning_95_4.png", "./Continuous-temporal/video_captioning/video_captioning_95_5.png", "./Continuous-temporal/video_captioning/video_captioning_95_6.png", "./Continuous-temporal/video_captioning/video_captioning_95_7.png", "./Continuous-temporal/video_captioning/video_captioning_95_8.png", "./Continuous-temporal/video_captioning/video_captioning_95_9.png", "./Continuous-temporal/video_captioning/video_captioning_95_10.png", "./Continuous-temporal/video_captioning/video_captioning_95_11.png", "./Continuous-temporal/video_captioning/video_captioning_95_12.png", "./Continuous-temporal/video_captioning/video_captioning_95_13.png", "./Continuous-temporal/video_captioning/video_captioning_95_14.png", "./Continuous-temporal/video_captioning/video_captioning_95_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: lego stormtroppers are in a facility\nB: lego stormtroppers in a spaceship\nC: lego stormtroppers at a beach\nD: lego stormtroppers in a forest", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: lego stormtroppers are in a facility\nB: lego stormtroppers in a spaceship\nC: lego stormtroppers at a beach\nD: lego stormtroppers in a forest", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_96_0.png", "./Continuous-temporal/video_captioning/video_captioning_96_1.png", "./Continuous-temporal/video_captioning/video_captioning_96_2.png", 
"./Continuous-temporal/video_captioning/video_captioning_96_3.png", "./Continuous-temporal/video_captioning/video_captioning_96_4.png", "./Continuous-temporal/video_captioning/video_captioning_96_5.png", "./Continuous-temporal/video_captioning/video_captioning_96_6.png", "./Continuous-temporal/video_captioning/video_captioning_96_7.png", "./Continuous-temporal/video_captioning/video_captioning_96_8.png", "./Continuous-temporal/video_captioning/video_captioning_96_9.png", "./Continuous-temporal/video_captioning/video_captioning_96_10.png", "./Continuous-temporal/video_captioning/video_captioning_96_11.png", "./Continuous-temporal/video_captioning/video_captioning_96_12.png", "./Continuous-temporal/video_captioning/video_captioning_96_13.png", "./Continuous-temporal/video_captioning/video_captioning_96_14.png", "./Continuous-temporal/video_captioning/video_captioning_96_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a green SUV with a group of people admiring the SUV\nB: a red pickup truck with a woman speaking about the truck\nC: a blue sedan with a person walking around the sedan\nD: a yellow sports car with a guy speaking about the car", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a green SUV with a group of people admiring the SUV\nB: a red pickup truck with a woman speaking about the truck\nC: a blue sedan with a person walking around the sedan\nD: a yellow sports car with a guy speaking about the car", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_97_0.png", "./Continuous-temporal/video_captioning/video_captioning_97_1.png", "./Continuous-temporal/video_captioning/video_captioning_97_2.png", "./Continuous-temporal/video_captioning/video_captioning_97_3.png", 
"./Continuous-temporal/video_captioning/video_captioning_97_4.png", "./Continuous-temporal/video_captioning/video_captioning_97_5.png", "./Continuous-temporal/video_captioning/video_captioning_97_6.png", "./Continuous-temporal/video_captioning/video_captioning_97_7.png", "./Continuous-temporal/video_captioning/video_captioning_97_8.png", "./Continuous-temporal/video_captioning/video_captioning_97_9.png", "./Continuous-temporal/video_captioning/video_captioning_97_10.png", "./Continuous-temporal/video_captioning/video_captioning_97_11.png", "./Continuous-temporal/video_captioning/video_captioning_97_12.png", "./Continuous-temporal/video_captioning/video_captioning_97_13.png", "./Continuous-temporal/video_captioning/video_captioning_97_14.png", "./Continuous-temporal/video_captioning/video_captioning_97_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the children are walking\nB: the women are walking\nC: the men are walking\nD: the women are running", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the children are walking\nB: the women are walking\nC: the men are walking\nD: the women are running", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_98_0.png", "./Continuous-temporal/video_captioning/video_captioning_98_1.png", "./Continuous-temporal/video_captioning/video_captioning_98_2.png", "./Continuous-temporal/video_captioning/video_captioning_98_3.png", "./Continuous-temporal/video_captioning/video_captioning_98_4.png", "./Continuous-temporal/video_captioning/video_captioning_98_5.png", "./Continuous-temporal/video_captioning/video_captioning_98_6.png", "./Continuous-temporal/video_captioning/video_captioning_98_7.png", "./Continuous-temporal/video_captioning/video_captioning_98_8.png", 
"./Continuous-temporal/video_captioning/video_captioning_98_9.png", "./Continuous-temporal/video_captioning/video_captioning_98_10.png", "./Continuous-temporal/video_captioning/video_captioning_98_11.png", "./Continuous-temporal/video_captioning/video_captioning_98_12.png", "./Continuous-temporal/video_captioning/video_captioning_98_13.png", "./Continuous-temporal/video_captioning/video_captioning_98_14.png", "./Continuous-temporal/video_captioning/video_captioning_98_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a cat is sitting on a tree\nB: one micky mouse is talking to other\nC: three cars are racing on a track\nD: two dogs are playing with a ball", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a cat is sitting on a tree\nB: one micky mouse is talking to other\nC: three cars are racing on a track\nD: two dogs are playing with a ball", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_99_0.png", "./Continuous-temporal/video_captioning/video_captioning_99_1.png", "./Continuous-temporal/video_captioning/video_captioning_99_2.png", "./Continuous-temporal/video_captioning/video_captioning_99_3.png", "./Continuous-temporal/video_captioning/video_captioning_99_4.png", "./Continuous-temporal/video_captioning/video_captioning_99_5.png", "./Continuous-temporal/video_captioning/video_captioning_99_6.png", "./Continuous-temporal/video_captioning/video_captioning_99_7.png", "./Continuous-temporal/video_captioning/video_captioning_99_8.png", "./Continuous-temporal/video_captioning/video_captioning_99_9.png", "./Continuous-temporal/video_captioning/video_captioning_99_10.png", "./Continuous-temporal/video_captioning/video_captioning_99_11.png", "./Continuous-temporal/video_captioning/video_captioning_99_12.png", 
"./Continuous-temporal/video_captioning/video_captioning_99_13.png", "./Continuous-temporal/video_captioning/video_captioning_99_14.png", "./Continuous-temporal/video_captioning/video_captioning_99_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a child is playing in the sand\nB: a woman is pulled into the water\nC: a woman is walking on the beach\nD: a man is pulled into the water", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a child is playing in the sand\nB: a woman is pulled into the water\nC: a woman is walking on the beach\nD: a man is pulled into the water", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_100_0.png", "./Continuous-temporal/video_captioning/video_captioning_100_1.png", "./Continuous-temporal/video_captioning/video_captioning_100_2.png", "./Continuous-temporal/video_captioning/video_captioning_100_3.png", "./Continuous-temporal/video_captioning/video_captioning_100_4.png", "./Continuous-temporal/video_captioning/video_captioning_100_5.png", "./Continuous-temporal/video_captioning/video_captioning_100_6.png", "./Continuous-temporal/video_captioning/video_captioning_100_7.png", "./Continuous-temporal/video_captioning/video_captioning_100_8.png", "./Continuous-temporal/video_captioning/video_captioning_100_9.png", "./Continuous-temporal/video_captioning/video_captioning_100_10.png", "./Continuous-temporal/video_captioning/video_captioning_100_11.png", "./Continuous-temporal/video_captioning/video_captioning_100_12.png", "./Continuous-temporal/video_captioning/video_captioning_100_13.png", "./Continuous-temporal/video_captioning/video_captioning_100_14.png", "./Continuous-temporal/video_captioning/video_captioning_100_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", 
"visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a helicopter is flying over a peaceful city\nB: a helicopter is shown flying in what seems to be a war zone in syria\nC: a helicopter is delivering supplies to a humanitarian mission in a conflict zone\nD: a helicopter is performing aerial acrobatics in a deserted area", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a helicopter is flying over a peaceful city\nB: a helicopter is shown flying in what seems to be a war zone in syria\nC: a helicopter is delivering supplies to a humanitarian mission in a conflict zone\nD: a helicopter is performing aerial acrobatics in a deserted area", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_101_0.png", "./Continuous-temporal/video_captioning/video_captioning_101_1.png", "./Continuous-temporal/video_captioning/video_captioning_101_2.png", "./Continuous-temporal/video_captioning/video_captioning_101_3.png", "./Continuous-temporal/video_captioning/video_captioning_101_4.png", "./Continuous-temporal/video_captioning/video_captioning_101_5.png", "./Continuous-temporal/video_captioning/video_captioning_101_6.png", "./Continuous-temporal/video_captioning/video_captioning_101_7.png", "./Continuous-temporal/video_captioning/video_captioning_101_8.png", "./Continuous-temporal/video_captioning/video_captioning_101_9.png", "./Continuous-temporal/video_captioning/video_captioning_101_10.png", "./Continuous-temporal/video_captioning/video_captioning_101_11.png", "./Continuous-temporal/video_captioning/video_captioning_101_12.png", "./Continuous-temporal/video_captioning/video_captioning_101_13.png", "./Continuous-temporal/video_captioning/video_captioning_101_14.png", "./Continuous-temporal/video_captioning/video_captioning_101_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", 
"visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man rolls down a flight of stairs\nB: a man climbs up a flight of stairs\nC: a man rides a bicycle down a hill\nD: a man walks across a bridge", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man rolls down a flight of stairs\nB: a man climbs up a flight of stairs\nC: a man rides a bicycle down a hill\nD: a man walks across a bridge", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_102_0.png", "./Continuous-temporal/video_captioning/video_captioning_102_1.png", "./Continuous-temporal/video_captioning/video_captioning_102_2.png", "./Continuous-temporal/video_captioning/video_captioning_102_3.png", "./Continuous-temporal/video_captioning/video_captioning_102_4.png", "./Continuous-temporal/video_captioning/video_captioning_102_5.png", "./Continuous-temporal/video_captioning/video_captioning_102_6.png", "./Continuous-temporal/video_captioning/video_captioning_102_7.png", "./Continuous-temporal/video_captioning/video_captioning_102_8.png", "./Continuous-temporal/video_captioning/video_captioning_102_9.png", "./Continuous-temporal/video_captioning/video_captioning_102_10.png", "./Continuous-temporal/video_captioning/video_captioning_102_11.png", "./Continuous-temporal/video_captioning/video_captioning_102_12.png", "./Continuous-temporal/video_captioning/video_captioning_102_13.png", "./Continuous-temporal/video_captioning/video_captioning_102_14.png", "./Continuous-temporal/video_captioning/video_captioning_102_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the joggers walked slowly on the path\nB: the athletes dashed through the stadium\nC: the runners jogged leisurely on the course\nD: the racers sprinted down the track", "question": 
"Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the joggers walked slowly on the path\nB: the athletes dashed through the stadium\nC: the runners jogged leisurely on the course\nD: the racers sprinted down the track", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_103_0.png", "./Continuous-temporal/video_captioning/video_captioning_103_1.png", "./Continuous-temporal/video_captioning/video_captioning_103_2.png", "./Continuous-temporal/video_captioning/video_captioning_103_3.png", "./Continuous-temporal/video_captioning/video_captioning_103_4.png", "./Continuous-temporal/video_captioning/video_captioning_103_5.png", "./Continuous-temporal/video_captioning/video_captioning_103_6.png", "./Continuous-temporal/video_captioning/video_captioning_103_7.png", "./Continuous-temporal/video_captioning/video_captioning_103_8.png", "./Continuous-temporal/video_captioning/video_captioning_103_9.png", "./Continuous-temporal/video_captioning/video_captioning_103_10.png", "./Continuous-temporal/video_captioning/video_captioning_103_11.png", "./Continuous-temporal/video_captioning/video_captioning_103_12.png", "./Continuous-temporal/video_captioning/video_captioning_103_13.png", "./Continuous-temporal/video_captioning/video_captioning_103_14.png", "./Continuous-temporal/video_captioning/video_captioning_103_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman is demonstrating a nail painting technique\nB: a child is playing with a toy car\nC: a chef is preparing sushi\nD: a man is fixing a bicycle tire", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman is demonstrating a nail painting technique\nB: a child is playing with a toy car\nC: a chef is preparing 
sushi\nD: a man is fixing a bicycle tire", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_104_0.png", "./Continuous-temporal/video_captioning/video_captioning_104_1.png", "./Continuous-temporal/video_captioning/video_captioning_104_2.png", "./Continuous-temporal/video_captioning/video_captioning_104_3.png", "./Continuous-temporal/video_captioning/video_captioning_104_4.png", "./Continuous-temporal/video_captioning/video_captioning_104_5.png", "./Continuous-temporal/video_captioning/video_captioning_104_6.png", "./Continuous-temporal/video_captioning/video_captioning_104_7.png", "./Continuous-temporal/video_captioning/video_captioning_104_8.png", "./Continuous-temporal/video_captioning/video_captioning_104_9.png", "./Continuous-temporal/video_captioning/video_captioning_104_10.png", "./Continuous-temporal/video_captioning/video_captioning_104_11.png", "./Continuous-temporal/video_captioning/video_captioning_104_12.png", "./Continuous-temporal/video_captioning/video_captioning_104_13.png", "./Continuous-temporal/video_captioning/video_captioning_104_14.png", "./Continuous-temporal/video_captioning/video_captioning_104_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a group of people are dancing\nB: a child is playing with toys\nC: a man is cooking on the stove\nD: a woman is mixing ingrediants", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a group of people are dancing\nB: a child is playing with toys\nC: a man is cooking on the stove\nD: a woman is mixing ingrediants", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_105_0.png", "./Continuous-temporal/video_captioning/video_captioning_105_1.png", "./Continuous-temporal/video_captioning/video_captioning_105_2.png", 
"./Continuous-temporal/video_captioning/video_captioning_105_3.png", "./Continuous-temporal/video_captioning/video_captioning_105_4.png", "./Continuous-temporal/video_captioning/video_captioning_105_5.png", "./Continuous-temporal/video_captioning/video_captioning_105_6.png", "./Continuous-temporal/video_captioning/video_captioning_105_7.png", "./Continuous-temporal/video_captioning/video_captioning_105_8.png", "./Continuous-temporal/video_captioning/video_captioning_105_9.png", "./Continuous-temporal/video_captioning/video_captioning_105_10.png", "./Continuous-temporal/video_captioning/video_captioning_105_11.png", "./Continuous-temporal/video_captioning/video_captioning_105_12.png", "./Continuous-temporal/video_captioning/video_captioning_105_13.png", "./Continuous-temporal/video_captioning/video_captioning_105_14.png", "./Continuous-temporal/video_captioning/video_captioning_105_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: vest of sports vines\nB: collection of outdoor flowers\nC: jacket of workout branches\nD: shirt of athletic trees", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: vest of sports vines\nB: collection of outdoor flowers\nC: jacket of workout branches\nD: shirt of athletic trees", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_106_0.png", "./Continuous-temporal/video_captioning/video_captioning_106_1.png", "./Continuous-temporal/video_captioning/video_captioning_106_2.png", "./Continuous-temporal/video_captioning/video_captioning_106_3.png", "./Continuous-temporal/video_captioning/video_captioning_106_4.png", "./Continuous-temporal/video_captioning/video_captioning_106_5.png", "./Continuous-temporal/video_captioning/video_captioning_106_6.png", 
"./Continuous-temporal/video_captioning/video_captioning_106_7.png", "./Continuous-temporal/video_captioning/video_captioning_106_8.png", "./Continuous-temporal/video_captioning/video_captioning_106_9.png", "./Continuous-temporal/video_captioning/video_captioning_106_10.png", "./Continuous-temporal/video_captioning/video_captioning_106_11.png", "./Continuous-temporal/video_captioning/video_captioning_106_12.png", "./Continuous-temporal/video_captioning/video_captioning_106_13.png", "./Continuous-temporal/video_captioning/video_captioning_106_14.png", "./Continuous-temporal/video_captioning/video_captioning_106_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a bird is building a nest in a tree\nB: two guinea pigs are eating leaves\nC: a cat is sleeping on a chair\nD: two dogs are chasing a ball", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a bird is building a nest in a tree\nB: two guinea pigs are eating leaves\nC: a cat is sleeping on a chair\nD: two dogs are chasing a ball", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_107_0.png", "./Continuous-temporal/video_captioning/video_captioning_107_1.png", "./Continuous-temporal/video_captioning/video_captioning_107_2.png", "./Continuous-temporal/video_captioning/video_captioning_107_3.png", "./Continuous-temporal/video_captioning/video_captioning_107_4.png", "./Continuous-temporal/video_captioning/video_captioning_107_5.png", "./Continuous-temporal/video_captioning/video_captioning_107_6.png", "./Continuous-temporal/video_captioning/video_captioning_107_7.png", "./Continuous-temporal/video_captioning/video_captioning_107_8.png", "./Continuous-temporal/video_captioning/video_captioning_107_9.png", "./Continuous-temporal/video_captioning/video_captioning_107_10.png", 
"./Continuous-temporal/video_captioning/video_captioning_107_11.png", "./Continuous-temporal/video_captioning/video_captioning_107_12.png", "./Continuous-temporal/video_captioning/video_captioning_107_13.png", "./Continuous-temporal/video_captioning/video_captioning_107_14.png", "./Continuous-temporal/video_captioning/video_captioning_107_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: two lions are hunting for food in the jungle\nB: a group of monkeys are playing in the snow\nC: two elphants are cleaning themselves in some water\nD: a herd of zebras are grazing in the savanna", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: two lions are hunting for food in the jungle\nB: a group of monkeys are playing in the snow\nC: two elphants are cleaning themselves in some water\nD: a herd of zebras are grazing in the savanna", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_108_0.png", "./Continuous-temporal/video_captioning/video_captioning_108_1.png", "./Continuous-temporal/video_captioning/video_captioning_108_2.png", "./Continuous-temporal/video_captioning/video_captioning_108_3.png", "./Continuous-temporal/video_captioning/video_captioning_108_4.png", "./Continuous-temporal/video_captioning/video_captioning_108_5.png", "./Continuous-temporal/video_captioning/video_captioning_108_6.png", "./Continuous-temporal/video_captioning/video_captioning_108_7.png", "./Continuous-temporal/video_captioning/video_captioning_108_8.png", "./Continuous-temporal/video_captioning/video_captioning_108_9.png", "./Continuous-temporal/video_captioning/video_captioning_108_10.png", "./Continuous-temporal/video_captioning/video_captioning_108_11.png", "./Continuous-temporal/video_captioning/video_captioning_108_12.png", 
"./Continuous-temporal/video_captioning/video_captioning_108_13.png", "./Continuous-temporal/video_captioning/video_captioning_108_14.png", "./Continuous-temporal/video_captioning/video_captioning_108_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman is cooking in the kitchen\nB: a child is playing with toys\nC: a dog is barking at strangers\nD: a man is talking about appliances", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman is cooking in the kitchen\nB: a child is playing with toys\nC: a dog is barking at strangers\nD: a man is talking about appliances", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_109_0.png", "./Continuous-temporal/video_captioning/video_captioning_109_1.png", "./Continuous-temporal/video_captioning/video_captioning_109_2.png", "./Continuous-temporal/video_captioning/video_captioning_109_3.png", "./Continuous-temporal/video_captioning/video_captioning_109_4.png", "./Continuous-temporal/video_captioning/video_captioning_109_5.png", "./Continuous-temporal/video_captioning/video_captioning_109_6.png", "./Continuous-temporal/video_captioning/video_captioning_109_7.png", "./Continuous-temporal/video_captioning/video_captioning_109_8.png", "./Continuous-temporal/video_captioning/video_captioning_109_9.png", "./Continuous-temporal/video_captioning/video_captioning_109_10.png", "./Continuous-temporal/video_captioning/video_captioning_109_11.png", "./Continuous-temporal/video_captioning/video_captioning_109_12.png", "./Continuous-temporal/video_captioning/video_captioning_109_13.png", "./Continuous-temporal/video_captioning/video_captioning_109_14.png", "./Continuous-temporal/video_captioning/video_captioning_109_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", 
"visual_input_component": "Video image or Natural image", "source": "source", "options": "A: threee kids sing together on the voice\nB: two kids sing alone on the voice\nC: four kids dance together on the voice\nD: three kids talk together on the voice", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: threee kids sing together on the voice\nB: two kids sing alone on the voice\nC: four kids dance together on the voice\nD: three kids talk together on the voice", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_110_0.png", "./Continuous-temporal/video_captioning/video_captioning_110_1.png", "./Continuous-temporal/video_captioning/video_captioning_110_2.png", "./Continuous-temporal/video_captioning/video_captioning_110_3.png", "./Continuous-temporal/video_captioning/video_captioning_110_4.png", "./Continuous-temporal/video_captioning/video_captioning_110_5.png", "./Continuous-temporal/video_captioning/video_captioning_110_6.png", "./Continuous-temporal/video_captioning/video_captioning_110_7.png", "./Continuous-temporal/video_captioning/video_captioning_110_8.png", "./Continuous-temporal/video_captioning/video_captioning_110_9.png", "./Continuous-temporal/video_captioning/video_captioning_110_10.png", "./Continuous-temporal/video_captioning/video_captioning_110_11.png", "./Continuous-temporal/video_captioning/video_captioning_110_12.png", "./Continuous-temporal/video_captioning/video_captioning_110_13.png", "./Continuous-temporal/video_captioning/video_captioning_110_14.png", "./Continuous-temporal/video_captioning/video_captioning_110_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a boy is getting off a bicycle\nB: a guy is getting out of a plane\nC: a woman is getting out of a car\nD: a man is swimming in a pool", "question": 
"Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a boy is getting off a bicycle\nB: a guy is getting out of a plane\nC: a woman is getting out of a car\nD: a man is swimming in a pool", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_111_0.png", "./Continuous-temporal/video_captioning/video_captioning_111_1.png", "./Continuous-temporal/video_captioning/video_captioning_111_2.png", "./Continuous-temporal/video_captioning/video_captioning_111_3.png", "./Continuous-temporal/video_captioning/video_captioning_111_4.png", "./Continuous-temporal/video_captioning/video_captioning_111_5.png", "./Continuous-temporal/video_captioning/video_captioning_111_6.png", "./Continuous-temporal/video_captioning/video_captioning_111_7.png", "./Continuous-temporal/video_captioning/video_captioning_111_8.png", "./Continuous-temporal/video_captioning/video_captioning_111_9.png", "./Continuous-temporal/video_captioning/video_captioning_111_10.png", "./Continuous-temporal/video_captioning/video_captioning_111_11.png", "./Continuous-temporal/video_captioning/video_captioning_111_12.png", "./Continuous-temporal/video_captioning/video_captioning_111_13.png", "./Continuous-temporal/video_captioning/video_captioning_111_14.png", "./Continuous-temporal/video_captioning/video_captioning_111_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a child is riding a bike\nB: a man is playing guitar\nC: a dog is chasing a ball\nD: a lady is cutting onion", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a child is riding a bike\nB: a man is playing guitar\nC: a dog is chasing a ball\nD: a lady is cutting onion", "input_image_path": 
["./Continuous-temporal/video_captioning/video_captioning_112_0.png", "./Continuous-temporal/video_captioning/video_captioning_112_1.png", "./Continuous-temporal/video_captioning/video_captioning_112_2.png", "./Continuous-temporal/video_captioning/video_captioning_112_3.png", "./Continuous-temporal/video_captioning/video_captioning_112_4.png", "./Continuous-temporal/video_captioning/video_captioning_112_5.png", "./Continuous-temporal/video_captioning/video_captioning_112_6.png", "./Continuous-temporal/video_captioning/video_captioning_112_7.png", "./Continuous-temporal/video_captioning/video_captioning_112_8.png", "./Continuous-temporal/video_captioning/video_captioning_112_9.png", "./Continuous-temporal/video_captioning/video_captioning_112_10.png", "./Continuous-temporal/video_captioning/video_captioning_112_11.png", "./Continuous-temporal/video_captioning/video_captioning_112_12.png", "./Continuous-temporal/video_captioning/video_captioning_112_13.png", "./Continuous-temporal/video_captioning/video_captioning_112_14.png", "./Continuous-temporal/video_captioning/video_captioning_112_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a cat is licking a toy\nB: a dog is licking a baby\nC: a cat is licking a baby\nD: a baby is licking a cat", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a cat is licking a toy\nB: a dog is licking a baby\nC: a cat is licking a baby\nD: a baby is licking a cat", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_113_0.png", "./Continuous-temporal/video_captioning/video_captioning_113_1.png", "./Continuous-temporal/video_captioning/video_captioning_113_2.png", "./Continuous-temporal/video_captioning/video_captioning_113_3.png", "./Continuous-temporal/video_captioning/video_captioning_113_4.png", 
"./Continuous-temporal/video_captioning/video_captioning_113_5.png", "./Continuous-temporal/video_captioning/video_captioning_113_6.png", "./Continuous-temporal/video_captioning/video_captioning_113_7.png", "./Continuous-temporal/video_captioning/video_captioning_113_8.png", "./Continuous-temporal/video_captioning/video_captioning_113_9.png", "./Continuous-temporal/video_captioning/video_captioning_113_10.png", "./Continuous-temporal/video_captioning/video_captioning_113_11.png", "./Continuous-temporal/video_captioning/video_captioning_113_12.png", "./Continuous-temporal/video_captioning/video_captioning_113_13.png", "./Continuous-temporal/video_captioning/video_captioning_113_14.png", "./Continuous-temporal/video_captioning/video_captioning_113_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a group of babies are sitting in the stage\nB: a group of adults are standing in the stage\nC: a group of babies are standing in the stage\nD: a crowd of children are running on the stage", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a group of babies are sitting in the stage\nB: a group of adults are standing in the stage\nC: a group of babies are standing in the stage\nD: a crowd of children are running on the stage", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_114_0.png", "./Continuous-temporal/video_captioning/video_captioning_114_1.png", "./Continuous-temporal/video_captioning/video_captioning_114_2.png", "./Continuous-temporal/video_captioning/video_captioning_114_3.png", "./Continuous-temporal/video_captioning/video_captioning_114_4.png", "./Continuous-temporal/video_captioning/video_captioning_114_5.png", "./Continuous-temporal/video_captioning/video_captioning_114_6.png", 
"./Continuous-temporal/video_captioning/video_captioning_114_7.png", "./Continuous-temporal/video_captioning/video_captioning_114_8.png", "./Continuous-temporal/video_captioning/video_captioning_114_9.png", "./Continuous-temporal/video_captioning/video_captioning_114_10.png", "./Continuous-temporal/video_captioning/video_captioning_114_11.png", "./Continuous-temporal/video_captioning/video_captioning_114_12.png", "./Continuous-temporal/video_captioning/video_captioning_114_13.png", "./Continuous-temporal/video_captioning/video_captioning_114_14.png", "./Continuous-temporal/video_captioning/video_captioning_114_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man is washing a car with a hose\nB: a dog is chasing a ball in the park\nC: a woman is sitting in a chair reading a book\nD: a man is lifting the back end of a small pickup up off the ground", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man is washing a car with a hose\nB: a dog is chasing a ball in the park\nC: a woman is sitting in a chair reading a book\nD: a man is lifting the back end of a small pickup up off the ground", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_115_0.png", "./Continuous-temporal/video_captioning/video_captioning_115_1.png", "./Continuous-temporal/video_captioning/video_captioning_115_2.png", "./Continuous-temporal/video_captioning/video_captioning_115_3.png", "./Continuous-temporal/video_captioning/video_captioning_115_4.png", "./Continuous-temporal/video_captioning/video_captioning_115_5.png", "./Continuous-temporal/video_captioning/video_captioning_115_6.png", "./Continuous-temporal/video_captioning/video_captioning_115_7.png", "./Continuous-temporal/video_captioning/video_captioning_115_8.png", 
"./Continuous-temporal/video_captioning/video_captioning_115_9.png", "./Continuous-temporal/video_captioning/video_captioning_115_10.png", "./Continuous-temporal/video_captioning/video_captioning_115_11.png", "./Continuous-temporal/video_captioning/video_captioning_115_12.png", "./Continuous-temporal/video_captioning/video_captioning_115_13.png", "./Continuous-temporal/video_captioning/video_captioning_115_14.png", "./Continuous-temporal/video_captioning/video_captioning_115_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a person is driving a car\nB: a bird is flying in the sky\nC: a dog is playing with a ball in the yard\nD: a cat is washing its head under a tap", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a person is driving a car\nB: a bird is flying in the sky\nC: a dog is playing with a ball in the yard\nD: a cat is washing its head under a tap", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_116_0.png", "./Continuous-temporal/video_captioning/video_captioning_116_1.png", "./Continuous-temporal/video_captioning/video_captioning_116_2.png", "./Continuous-temporal/video_captioning/video_captioning_116_3.png", "./Continuous-temporal/video_captioning/video_captioning_116_4.png", "./Continuous-temporal/video_captioning/video_captioning_116_5.png", "./Continuous-temporal/video_captioning/video_captioning_116_6.png", "./Continuous-temporal/video_captioning/video_captioning_116_7.png", "./Continuous-temporal/video_captioning/video_captioning_116_8.png", "./Continuous-temporal/video_captioning/video_captioning_116_9.png", "./Continuous-temporal/video_captioning/video_captioning_116_10.png", "./Continuous-temporal/video_captioning/video_captioning_116_11.png", 
"./Continuous-temporal/video_captioning/video_captioning_116_12.png", "./Continuous-temporal/video_captioning/video_captioning_116_13.png", "./Continuous-temporal/video_captioning/video_captioning_116_14.png", "./Continuous-temporal/video_captioning/video_captioning_116_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: two fairy with green hair is sitting on a cloud\nB: a mermaid swimming with blue hair\nC: two mermaid with red hair is sitting on a rock\nD: a mermaid with purple hair sitting on a rock", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: two fairy with green hair is sitting on a cloud\nB: a mermaid swimming with blue hair\nC: two mermaid with red hair is sitting on a rock\nD: a mermaid with purple hair sitting on a rock", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_117_0.png", "./Continuous-temporal/video_captioning/video_captioning_117_1.png", "./Continuous-temporal/video_captioning/video_captioning_117_2.png", "./Continuous-temporal/video_captioning/video_captioning_117_3.png", "./Continuous-temporal/video_captioning/video_captioning_117_4.png", "./Continuous-temporal/video_captioning/video_captioning_117_5.png", "./Continuous-temporal/video_captioning/video_captioning_117_6.png", "./Continuous-temporal/video_captioning/video_captioning_117_7.png", "./Continuous-temporal/video_captioning/video_captioning_117_8.png", "./Continuous-temporal/video_captioning/video_captioning_117_9.png", "./Continuous-temporal/video_captioning/video_captioning_117_10.png", "./Continuous-temporal/video_captioning/video_captioning_117_11.png", "./Continuous-temporal/video_captioning/video_captioning_117_12.png", "./Continuous-temporal/video_captioning/video_captioning_117_13.png", 
"./Continuous-temporal/video_captioning/video_captioning_117_14.png", "./Continuous-temporal/video_captioning/video_captioning_117_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: characters from video games are dancing to old mc donald had a farm\nB: characters from video games are singing old mc donald had a farm\nC: real-life people playing old mc donald had a farm on musical instruments\nD: animated animals dancing to old mc donald had a farm", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: characters from video games are dancing to old mc donald had a farm\nB: characters from video games are singing old mc donald had a farm\nC: real-life people playing old mc donald had a farm on musical instruments\nD: animated animals dancing to old mc donald had a farm", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_118_0.png", "./Continuous-temporal/video_captioning/video_captioning_118_1.png", "./Continuous-temporal/video_captioning/video_captioning_118_2.png", "./Continuous-temporal/video_captioning/video_captioning_118_3.png", "./Continuous-temporal/video_captioning/video_captioning_118_4.png", "./Continuous-temporal/video_captioning/video_captioning_118_5.png", "./Continuous-temporal/video_captioning/video_captioning_118_6.png", "./Continuous-temporal/video_captioning/video_captioning_118_7.png", "./Continuous-temporal/video_captioning/video_captioning_118_8.png", "./Continuous-temporal/video_captioning/video_captioning_118_9.png", "./Continuous-temporal/video_captioning/video_captioning_118_10.png", "./Continuous-temporal/video_captioning/video_captioning_118_11.png", "./Continuous-temporal/video_captioning/video_captioning_118_12.png", "./Continuous-temporal/video_captioning/video_captioning_118_13.png", 
"./Continuous-temporal/video_captioning/video_captioning_118_14.png", "./Continuous-temporal/video_captioning/video_captioning_118_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the animals are fighting over food\nB: the animals are ignoring each other and not eating\nC: the animals are having nice time together and eating food\nD: the animals are alone and not interacting with each other", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the animals are fighting over food\nB: the animals are ignoring each other and not eating\nC: the animals are having nice time together and eating food\nD: the animals are alone and not interacting with each other", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_119_0.png", "./Continuous-temporal/video_captioning/video_captioning_119_1.png", "./Continuous-temporal/video_captioning/video_captioning_119_2.png", "./Continuous-temporal/video_captioning/video_captioning_119_3.png", "./Continuous-temporal/video_captioning/video_captioning_119_4.png", "./Continuous-temporal/video_captioning/video_captioning_119_5.png", "./Continuous-temporal/video_captioning/video_captioning_119_6.png", "./Continuous-temporal/video_captioning/video_captioning_119_7.png", "./Continuous-temporal/video_captioning/video_captioning_119_8.png", "./Continuous-temporal/video_captioning/video_captioning_119_9.png", "./Continuous-temporal/video_captioning/video_captioning_119_10.png", "./Continuous-temporal/video_captioning/video_captioning_119_11.png", "./Continuous-temporal/video_captioning/video_captioning_119_12.png", "./Continuous-temporal/video_captioning/video_captioning_119_13.png", "./Continuous-temporal/video_captioning/video_captioning_119_14.png", 
"./Continuous-temporal/video_captioning/video_captioning_119_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a group of people are running\nB: a man is singing while walking\nC: a woman is playing guitar\nD: a man is dancing alone", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a group of people are running\nB: a man is singing while walking\nC: a woman is playing guitar\nD: a man is dancing alone", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_120_0.png", "./Continuous-temporal/video_captioning/video_captioning_120_1.png", "./Continuous-temporal/video_captioning/video_captioning_120_2.png", "./Continuous-temporal/video_captioning/video_captioning_120_3.png", "./Continuous-temporal/video_captioning/video_captioning_120_4.png", "./Continuous-temporal/video_captioning/video_captioning_120_5.png", "./Continuous-temporal/video_captioning/video_captioning_120_6.png", "./Continuous-temporal/video_captioning/video_captioning_120_7.png", "./Continuous-temporal/video_captioning/video_captioning_120_8.png", "./Continuous-temporal/video_captioning/video_captioning_120_9.png", "./Continuous-temporal/video_captioning/video_captioning_120_10.png", "./Continuous-temporal/video_captioning/video_captioning_120_11.png", "./Continuous-temporal/video_captioning/video_captioning_120_12.png", "./Continuous-temporal/video_captioning/video_captioning_120_13.png", "./Continuous-temporal/video_captioning/video_captioning_120_14.png", "./Continuous-temporal/video_captioning/video_captioning_120_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man is playing a guitar\nB: a woman is watering plants in the garden\nC: a man is 
typing on a computer keyboard\nD: a child is riding a bicycle", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man is playing a guitar\nB: a woman is watering plants in the garden\nC: a man is typing on a computer keyboard\nD: a child is riding a bicycle", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_121_0.png", "./Continuous-temporal/video_captioning/video_captioning_121_1.png", "./Continuous-temporal/video_captioning/video_captioning_121_2.png", "./Continuous-temporal/video_captioning/video_captioning_121_3.png", "./Continuous-temporal/video_captioning/video_captioning_121_4.png", "./Continuous-temporal/video_captioning/video_captioning_121_5.png", "./Continuous-temporal/video_captioning/video_captioning_121_6.png", "./Continuous-temporal/video_captioning/video_captioning_121_7.png", "./Continuous-temporal/video_captioning/video_captioning_121_8.png", "./Continuous-temporal/video_captioning/video_captioning_121_9.png", "./Continuous-temporal/video_captioning/video_captioning_121_10.png", "./Continuous-temporal/video_captioning/video_captioning_121_11.png", "./Continuous-temporal/video_captioning/video_captioning_121_12.png", "./Continuous-temporal/video_captioning/video_captioning_121_13.png", "./Continuous-temporal/video_captioning/video_captioning_121_14.png", "./Continuous-temporal/video_captioning/video_captioning_121_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a girl and a man are talking to each other\nB: a boy and a girl are playing a game\nC: a group of people are having a picnic\nD: a woman and a man are arguing with each other", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a girl and a man are talking to each 
other\nB: a boy and a girl are playing a game\nC: a group of people are having a picnic\nD: a woman and a man are arguing with each other", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_122_0.png", "./Continuous-temporal/video_captioning/video_captioning_122_1.png", "./Continuous-temporal/video_captioning/video_captioning_122_2.png", "./Continuous-temporal/video_captioning/video_captioning_122_3.png", "./Continuous-temporal/video_captioning/video_captioning_122_4.png", "./Continuous-temporal/video_captioning/video_captioning_122_5.png", "./Continuous-temporal/video_captioning/video_captioning_122_6.png", "./Continuous-temporal/video_captioning/video_captioning_122_7.png", "./Continuous-temporal/video_captioning/video_captioning_122_8.png", "./Continuous-temporal/video_captioning/video_captioning_122_9.png", "./Continuous-temporal/video_captioning/video_captioning_122_10.png", "./Continuous-temporal/video_captioning/video_captioning_122_11.png", "./Continuous-temporal/video_captioning/video_captioning_122_12.png", "./Continuous-temporal/video_captioning/video_captioning_122_13.png", "./Continuous-temporal/video_captioning/video_captioning_122_14.png", "./Continuous-temporal/video_captioning/video_captioning_122_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man is fixing a car\nB: a child is playing with toys\nC: a woman is putting on makeup\nD: a chef is cooking in the kitchen", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man is fixing a car\nB: a child is playing with toys\nC: a woman is putting on makeup\nD: a chef is cooking in the kitchen", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_123_0.png", "./Continuous-temporal/video_captioning/video_captioning_123_1.png", 
"./Continuous-temporal/video_captioning/video_captioning_123_2.png", "./Continuous-temporal/video_captioning/video_captioning_123_3.png", "./Continuous-temporal/video_captioning/video_captioning_123_4.png", "./Continuous-temporal/video_captioning/video_captioning_123_5.png", "./Continuous-temporal/video_captioning/video_captioning_123_6.png", "./Continuous-temporal/video_captioning/video_captioning_123_7.png", "./Continuous-temporal/video_captioning/video_captioning_123_8.png", "./Continuous-temporal/video_captioning/video_captioning_123_9.png", "./Continuous-temporal/video_captioning/video_captioning_123_10.png", "./Continuous-temporal/video_captioning/video_captioning_123_11.png", "./Continuous-temporal/video_captioning/video_captioning_123_12.png", "./Continuous-temporal/video_captioning/video_captioning_123_13.png", "./Continuous-temporal/video_captioning/video_captioning_123_14.png", "./Continuous-temporal/video_captioning/video_captioning_123_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: couple walking in a park\nB: children playing in the snow\nC: people swimming in a pool\nD: boys and girls dancing and singing on beach", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: couple walking in a park\nB: children playing in the snow\nC: people swimming in a pool\nD: boys and girls dancing and singing on beach", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_124_0.png", "./Continuous-temporal/video_captioning/video_captioning_124_1.png", "./Continuous-temporal/video_captioning/video_captioning_124_2.png", "./Continuous-temporal/video_captioning/video_captioning_124_3.png", "./Continuous-temporal/video_captioning/video_captioning_124_4.png", "./Continuous-temporal/video_captioning/video_captioning_124_5.png", 
"./Continuous-temporal/video_captioning/video_captioning_124_6.png", "./Continuous-temporal/video_captioning/video_captioning_124_7.png", "./Continuous-temporal/video_captioning/video_captioning_124_8.png", "./Continuous-temporal/video_captioning/video_captioning_124_9.png", "./Continuous-temporal/video_captioning/video_captioning_124_10.png", "./Continuous-temporal/video_captioning/video_captioning_124_11.png", "./Continuous-temporal/video_captioning/video_captioning_124_12.png", "./Continuous-temporal/video_captioning/video_captioning_124_13.png", "./Continuous-temporal/video_captioning/video_captioning_124_14.png", "./Continuous-temporal/video_captioning/video_captioning_124_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: how to open a banana properly\nB: methods for slicing a cucumber\nC: techniques for cutting an apple\nD: ways to peel a potato", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: how to open a banana properly\nB: methods for slicing a cucumber\nC: techniques for cutting an apple\nD: ways to peel a potato", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_125_0.png", "./Continuous-temporal/video_captioning/video_captioning_125_1.png", "./Continuous-temporal/video_captioning/video_captioning_125_2.png", "./Continuous-temporal/video_captioning/video_captioning_125_3.png", "./Continuous-temporal/video_captioning/video_captioning_125_4.png", "./Continuous-temporal/video_captioning/video_captioning_125_5.png", "./Continuous-temporal/video_captioning/video_captioning_125_6.png", "./Continuous-temporal/video_captioning/video_captioning_125_7.png", "./Continuous-temporal/video_captioning/video_captioning_125_8.png", "./Continuous-temporal/video_captioning/video_captioning_125_9.png", 
"./Continuous-temporal/video_captioning/video_captioning_125_10.png", "./Continuous-temporal/video_captioning/video_captioning_125_11.png", "./Continuous-temporal/video_captioning/video_captioning_125_12.png", "./Continuous-temporal/video_captioning/video_captioning_125_13.png", "./Continuous-temporal/video_captioning/video_captioning_125_14.png", "./Continuous-temporal/video_captioning/video_captioning_125_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: two birds in a nest one white chick and one green adult\nB: two pigeons in a nest one white baby and one green parent\nC: three parrots in a bird cage one white chick and two green adults\nD: two parrots in a bird cage one white chick and on green adult", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: two birds in a nest one white chick and one green adult\nB: two pigeons in a nest one white baby and one green parent\nC: three parrots in a bird cage one white chick and two green adults\nD: two parrots in a bird cage one white chick and on green adult", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_126_0.png", "./Continuous-temporal/video_captioning/video_captioning_126_1.png", "./Continuous-temporal/video_captioning/video_captioning_126_2.png", "./Continuous-temporal/video_captioning/video_captioning_126_3.png", "./Continuous-temporal/video_captioning/video_captioning_126_4.png", "./Continuous-temporal/video_captioning/video_captioning_126_5.png", "./Continuous-temporal/video_captioning/video_captioning_126_6.png", "./Continuous-temporal/video_captioning/video_captioning_126_7.png", "./Continuous-temporal/video_captioning/video_captioning_126_8.png", "./Continuous-temporal/video_captioning/video_captioning_126_9.png", 
"./Continuous-temporal/video_captioning/video_captioning_126_10.png", "./Continuous-temporal/video_captioning/video_captioning_126_11.png", "./Continuous-temporal/video_captioning/video_captioning_126_12.png", "./Continuous-temporal/video_captioning/video_captioning_126_13.png", "./Continuous-temporal/video_captioning/video_captioning_126_14.png", "./Continuous-temporal/video_captioning/video_captioning_126_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a group of people in winter clothing skiing down a snowy slope\nB: a man in striped collared shirt discusses jobs in news room of bloomberg\nC: a woman in a floral dress gardening in her backyard\nD: a child in a superhero costume playing in a playground", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a group of people in winter clothing skiing down a snowy slope\nB: a man in striped collared shirt discusses jobs in news room of bloomberg\nC: a woman in a floral dress gardening in her backyard\nD: a child in a superhero costume playing in a playground", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_127_0.png", "./Continuous-temporal/video_captioning/video_captioning_127_1.png", "./Continuous-temporal/video_captioning/video_captioning_127_2.png", "./Continuous-temporal/video_captioning/video_captioning_127_3.png", "./Continuous-temporal/video_captioning/video_captioning_127_4.png", "./Continuous-temporal/video_captioning/video_captioning_127_5.png", "./Continuous-temporal/video_captioning/video_captioning_127_6.png", "./Continuous-temporal/video_captioning/video_captioning_127_7.png", "./Continuous-temporal/video_captioning/video_captioning_127_8.png", "./Continuous-temporal/video_captioning/video_captioning_127_9.png", 
"./Continuous-temporal/video_captioning/video_captioning_127_10.png", "./Continuous-temporal/video_captioning/video_captioning_127_11.png", "./Continuous-temporal/video_captioning/video_captioning_127_12.png", "./Continuous-temporal/video_captioning/video_captioning_127_13.png", "./Continuous-temporal/video_captioning/video_captioning_127_14.png", "./Continuous-temporal/video_captioning/video_captioning_127_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: people are dancing but there are no advertisements\nB: no music is playing and there are no advertisements\nC: the video is silent and there are no advertisements\nD: music is playing and advertisements was showing", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: people are dancing but there are no advertisements\nB: no music is playing and there are no advertisements\nC: the video is silent and there are no advertisements\nD: music is playing and advertisements was showing", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_128_0.png", "./Continuous-temporal/video_captioning/video_captioning_128_1.png", "./Continuous-temporal/video_captioning/video_captioning_128_2.png", "./Continuous-temporal/video_captioning/video_captioning_128_3.png", "./Continuous-temporal/video_captioning/video_captioning_128_4.png", "./Continuous-temporal/video_captioning/video_captioning_128_5.png", "./Continuous-temporal/video_captioning/video_captioning_128_6.png", "./Continuous-temporal/video_captioning/video_captioning_128_7.png", "./Continuous-temporal/video_captioning/video_captioning_128_8.png", "./Continuous-temporal/video_captioning/video_captioning_128_9.png", "./Continuous-temporal/video_captioning/video_captioning_128_10.png", 
"./Continuous-temporal/video_captioning/video_captioning_128_11.png", "./Continuous-temporal/video_captioning/video_captioning_128_12.png", "./Continuous-temporal/video_captioning/video_captioning_128_13.png", "./Continuous-temporal/video_captioning/video_captioning_128_14.png", "./Continuous-temporal/video_captioning/video_captioning_128_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: audience watches a magic show\nB: street performer entertains a small group\nC: boy band performs for a crowd\nD: solo artist sings in an empty room", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: audience watches a magic show\nB: street performer entertains a small group\nC: boy band performs for a crowd\nD: solo artist sings in an empty room", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_129_0.png", "./Continuous-temporal/video_captioning/video_captioning_129_1.png", "./Continuous-temporal/video_captioning/video_captioning_129_2.png", "./Continuous-temporal/video_captioning/video_captioning_129_3.png", "./Continuous-temporal/video_captioning/video_captioning_129_4.png", "./Continuous-temporal/video_captioning/video_captioning_129_5.png", "./Continuous-temporal/video_captioning/video_captioning_129_6.png", "./Continuous-temporal/video_captioning/video_captioning_129_7.png", "./Continuous-temporal/video_captioning/video_captioning_129_8.png", "./Continuous-temporal/video_captioning/video_captioning_129_9.png", "./Continuous-temporal/video_captioning/video_captioning_129_10.png", "./Continuous-temporal/video_captioning/video_captioning_129_11.png", "./Continuous-temporal/video_captioning/video_captioning_129_12.png", "./Continuous-temporal/video_captioning/video_captioning_129_13.png", 
"./Continuous-temporal/video_captioning/video_captioning_129_14.png", "./Continuous-temporal/video_captioning/video_captioning_129_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a person is knitting a sweater\nB: a person is brushing a cat\nC: a person is cutting a cake\nD: a person is washing a car", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a person is knitting a sweater\nB: a person is brushing a cat\nC: a person is cutting a cake\nD: a person is washing a car", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_130_0.png", "./Continuous-temporal/video_captioning/video_captioning_130_1.png", "./Continuous-temporal/video_captioning/video_captioning_130_2.png", "./Continuous-temporal/video_captioning/video_captioning_130_3.png", "./Continuous-temporal/video_captioning/video_captioning_130_4.png", "./Continuous-temporal/video_captioning/video_captioning_130_5.png", "./Continuous-temporal/video_captioning/video_captioning_130_6.png", "./Continuous-temporal/video_captioning/video_captioning_130_7.png", "./Continuous-temporal/video_captioning/video_captioning_130_8.png", "./Continuous-temporal/video_captioning/video_captioning_130_9.png", "./Continuous-temporal/video_captioning/video_captioning_130_10.png", "./Continuous-temporal/video_captioning/video_captioning_130_11.png", "./Continuous-temporal/video_captioning/video_captioning_130_12.png", "./Continuous-temporal/video_captioning/video_captioning_130_13.png", "./Continuous-temporal/video_captioning/video_captioning_130_14.png", "./Continuous-temporal/video_captioning/video_captioning_130_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a baby throws 
a phone receiver\nB: a baby plays with a phone receiver\nC: a baby puts down a phone receiver\nD: a baby picks up a phone receiver", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a baby throws a phone receiver\nB: a baby plays with a phone receiver\nC: a baby puts down a phone receiver\nD: a baby picks up a phone receiver", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_131_0.png", "./Continuous-temporal/video_captioning/video_captioning_131_1.png", "./Continuous-temporal/video_captioning/video_captioning_131_2.png", "./Continuous-temporal/video_captioning/video_captioning_131_3.png", "./Continuous-temporal/video_captioning/video_captioning_131_4.png", "./Continuous-temporal/video_captioning/video_captioning_131_5.png", "./Continuous-temporal/video_captioning/video_captioning_131_6.png", "./Continuous-temporal/video_captioning/video_captioning_131_7.png", "./Continuous-temporal/video_captioning/video_captioning_131_8.png", "./Continuous-temporal/video_captioning/video_captioning_131_9.png", "./Continuous-temporal/video_captioning/video_captioning_131_10.png", "./Continuous-temporal/video_captioning/video_captioning_131_11.png", "./Continuous-temporal/video_captioning/video_captioning_131_12.png", "./Continuous-temporal/video_captioning/video_captioning_131_13.png", "./Continuous-temporal/video_captioning/video_captioning_131_14.png", "./Continuous-temporal/video_captioning/video_captioning_131_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: many people are doing a native dance\nB: a group of people are singing a popular song\nC: a couple of people are playing a traditional game\nD: a few people are quietly watching", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from 
the following choices.\nA: many people are doing a native dance\nB: a group of people are singing a popular song\nC: a couple of people are playing a traditional game\nD: a few people are quietly watching", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_132_0.png", "./Continuous-temporal/video_captioning/video_captioning_132_1.png", "./Continuous-temporal/video_captioning/video_captioning_132_2.png", "./Continuous-temporal/video_captioning/video_captioning_132_3.png", "./Continuous-temporal/video_captioning/video_captioning_132_4.png", "./Continuous-temporal/video_captioning/video_captioning_132_5.png", "./Continuous-temporal/video_captioning/video_captioning_132_6.png", "./Continuous-temporal/video_captioning/video_captioning_132_7.png", "./Continuous-temporal/video_captioning/video_captioning_132_8.png", "./Continuous-temporal/video_captioning/video_captioning_132_9.png", "./Continuous-temporal/video_captioning/video_captioning_132_10.png", "./Continuous-temporal/video_captioning/video_captioning_132_11.png", "./Continuous-temporal/video_captioning/video_captioning_132_12.png", "./Continuous-temporal/video_captioning/video_captioning_132_13.png", "./Continuous-temporal/video_captioning/video_captioning_132_14.png", "./Continuous-temporal/video_captioning/video_captioning_132_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man is kicking and punching water filled jars\nB: a man is cooking a meal in the kitchen\nC: a man is swimming in a pool\nD: a man is gardening in the backyard", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man is kicking and punching water filled jars\nB: a man is cooking a meal in the kitchen\nC: a man is swimming in a pool\nD: a man is gardening in the backyard", "input_image_path": 
["./Continuous-temporal/video_captioning/video_captioning_133_0.png", "./Continuous-temporal/video_captioning/video_captioning_133_1.png", "./Continuous-temporal/video_captioning/video_captioning_133_2.png", "./Continuous-temporal/video_captioning/video_captioning_133_3.png", "./Continuous-temporal/video_captioning/video_captioning_133_4.png", "./Continuous-temporal/video_captioning/video_captioning_133_5.png", "./Continuous-temporal/video_captioning/video_captioning_133_6.png", "./Continuous-temporal/video_captioning/video_captioning_133_7.png", "./Continuous-temporal/video_captioning/video_captioning_133_8.png", "./Continuous-temporal/video_captioning/video_captioning_133_9.png", "./Continuous-temporal/video_captioning/video_captioning_133_10.png", "./Continuous-temporal/video_captioning/video_captioning_133_11.png", "./Continuous-temporal/video_captioning/video_captioning_133_12.png", "./Continuous-temporal/video_captioning/video_captioning_133_13.png", "./Continuous-temporal/video_captioning/video_captioning_133_14.png", "./Continuous-temporal/video_captioning/video_captioning_133_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man is discussing his favorite movies and the top one is Jurassic Park\nB: a young girl is talking about her favorite books and the top one is Harry Potter\nC: a woman is describing her preferred TV shows and the second one is Game of Thrones\nD: a women is talking about the books she likes and the second favourite one is the amc the walking dead", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man is discussing his favorite movies and the top one is Jurassic Park\nB: a young girl is talking about her favorite books and the top one is Harry Potter\nC: a woman is describing her preferred TV shows and the second one is Game 
of Thrones\nD: a women is talking about the books she likes and the second favourite one is the amc the walking dead", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_134_0.png", "./Continuous-temporal/video_captioning/video_captioning_134_1.png", "./Continuous-temporal/video_captioning/video_captioning_134_2.png", "./Continuous-temporal/video_captioning/video_captioning_134_3.png", "./Continuous-temporal/video_captioning/video_captioning_134_4.png", "./Continuous-temporal/video_captioning/video_captioning_134_5.png", "./Continuous-temporal/video_captioning/video_captioning_134_6.png", "./Continuous-temporal/video_captioning/video_captioning_134_7.png", "./Continuous-temporal/video_captioning/video_captioning_134_8.png", "./Continuous-temporal/video_captioning/video_captioning_134_9.png", "./Continuous-temporal/video_captioning/video_captioning_134_10.png", "./Continuous-temporal/video_captioning/video_captioning_134_11.png", "./Continuous-temporal/video_captioning/video_captioning_134_12.png", "./Continuous-temporal/video_captioning/video_captioning_134_13.png", "./Continuous-temporal/video_captioning/video_captioning_134_14.png", "./Continuous-temporal/video_captioning/video_captioning_134_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: there is a man walking alone in a park\nB: there is a man shooting other people in a corridor\nC: a woman shopping for groceries in a supermarket\nD: two kids playing with a ball in a playground", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: there is a man walking alone in a park\nB: there is a man shooting other people in a corridor\nC: a woman shopping for groceries in a supermarket\nD: two kids playing with a ball in a playground", "input_image_path": 
["./Continuous-temporal/video_captioning/video_captioning_135_0.png", "./Continuous-temporal/video_captioning/video_captioning_135_1.png", "./Continuous-temporal/video_captioning/video_captioning_135_2.png", "./Continuous-temporal/video_captioning/video_captioning_135_3.png", "./Continuous-temporal/video_captioning/video_captioning_135_4.png", "./Continuous-temporal/video_captioning/video_captioning_135_5.png", "./Continuous-temporal/video_captioning/video_captioning_135_6.png", "./Continuous-temporal/video_captioning/video_captioning_135_7.png", "./Continuous-temporal/video_captioning/video_captioning_135_8.png", "./Continuous-temporal/video_captioning/video_captioning_135_9.png", "./Continuous-temporal/video_captioning/video_captioning_135_10.png", "./Continuous-temporal/video_captioning/video_captioning_135_11.png", "./Continuous-temporal/video_captioning/video_captioning_135_12.png", "./Continuous-temporal/video_captioning/video_captioning_135_13.png", "./Continuous-temporal/video_captioning/video_captioning_135_14.png", "./Continuous-temporal/video_captioning/video_captioning_135_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: cartoon of a dolphin on a scooter looking up at a treehouse\nB: cartoon of a squid on a bike looking up at a treehouse\nC: drawing of a cat on a skateboard looking up at a treehouse\nD: animated picture of a monkey on a motorcycle looking up at a treehouse", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: cartoon of a dolphin on a scooter looking up at a treehouse\nB: cartoon of a squid on a bike looking up at a treehouse\nC: drawing of a cat on a skateboard looking up at a treehouse\nD: animated picture of a monkey on a motorcycle looking up at a treehouse", "input_image_path": 
["./Continuous-temporal/video_captioning/video_captioning_136_0.png", "./Continuous-temporal/video_captioning/video_captioning_136_1.png", "./Continuous-temporal/video_captioning/video_captioning_136_2.png", "./Continuous-temporal/video_captioning/video_captioning_136_3.png", "./Continuous-temporal/video_captioning/video_captioning_136_4.png", "./Continuous-temporal/video_captioning/video_captioning_136_5.png", "./Continuous-temporal/video_captioning/video_captioning_136_6.png", "./Continuous-temporal/video_captioning/video_captioning_136_7.png", "./Continuous-temporal/video_captioning/video_captioning_136_8.png", "./Continuous-temporal/video_captioning/video_captioning_136_9.png", "./Continuous-temporal/video_captioning/video_captioning_136_10.png", "./Continuous-temporal/video_captioning/video_captioning_136_11.png", "./Continuous-temporal/video_captioning/video_captioning_136_12.png", "./Continuous-temporal/video_captioning/video_captioning_136_13.png", "./Continuous-temporal/video_captioning/video_captioning_136_14.png", "./Continuous-temporal/video_captioning/video_captioning_136_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a dog sleeping on the couch\nB: a child playing in the park\nC: a woman cleaning the bathroom\nD: a man cooking his kichen", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a dog sleeping on the couch\nB: a child playing in the park\nC: a woman cleaning the bathroom\nD: a man cooking his kichen", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_137_0.png", "./Continuous-temporal/video_captioning/video_captioning_137_1.png", "./Continuous-temporal/video_captioning/video_captioning_137_2.png", "./Continuous-temporal/video_captioning/video_captioning_137_3.png", 
"./Continuous-temporal/video_captioning/video_captioning_137_4.png", "./Continuous-temporal/video_captioning/video_captioning_137_5.png", "./Continuous-temporal/video_captioning/video_captioning_137_6.png", "./Continuous-temporal/video_captioning/video_captioning_137_7.png", "./Continuous-temporal/video_captioning/video_captioning_137_8.png", "./Continuous-temporal/video_captioning/video_captioning_137_9.png", "./Continuous-temporal/video_captioning/video_captioning_137_10.png", "./Continuous-temporal/video_captioning/video_captioning_137_11.png", "./Continuous-temporal/video_captioning/video_captioning_137_12.png", "./Continuous-temporal/video_captioning/video_captioning_137_13.png", "./Continuous-temporal/video_captioning/video_captioning_137_14.png", "./Continuous-temporal/video_captioning/video_captioning_137_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a black cat sleeps on a sunny window sill\nB: two children play in a park with a ball\nC: a brunette woman stands in the kitchen cooking dinner\nD: a blonde man lies on a bed with a little baby", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a black cat sleeps on a sunny window sill\nB: two children play in a park with a ball\nC: a brunette woman stands in the kitchen cooking dinner\nD: a blonde man lies on a bed with a little baby", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_138_0.png", "./Continuous-temporal/video_captioning/video_captioning_138_1.png", "./Continuous-temporal/video_captioning/video_captioning_138_2.png", "./Continuous-temporal/video_captioning/video_captioning_138_3.png", "./Continuous-temporal/video_captioning/video_captioning_138_4.png", "./Continuous-temporal/video_captioning/video_captioning_138_5.png", 
"./Continuous-temporal/video_captioning/video_captioning_138_6.png", "./Continuous-temporal/video_captioning/video_captioning_138_7.png", "./Continuous-temporal/video_captioning/video_captioning_138_8.png", "./Continuous-temporal/video_captioning/video_captioning_138_9.png", "./Continuous-temporal/video_captioning/video_captioning_138_10.png", "./Continuous-temporal/video_captioning/video_captioning_138_11.png", "./Continuous-temporal/video_captioning/video_captioning_138_12.png", "./Continuous-temporal/video_captioning/video_captioning_138_13.png", "./Continuous-temporal/video_captioning/video_captioning_138_14.png", "./Continuous-temporal/video_captioning/video_captioning_138_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the man rode a mechanical bull\nB: the man rode a real bull\nC: the man rode a bicycle\nD: the man rode a horse", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the man rode a mechanical bull\nB: the man rode a real bull\nC: the man rode a bicycle\nD: the man rode a horse", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_139_0.png", "./Continuous-temporal/video_captioning/video_captioning_139_1.png", "./Continuous-temporal/video_captioning/video_captioning_139_2.png", "./Continuous-temporal/video_captioning/video_captioning_139_3.png", "./Continuous-temporal/video_captioning/video_captioning_139_4.png", "./Continuous-temporal/video_captioning/video_captioning_139_5.png", "./Continuous-temporal/video_captioning/video_captioning_139_6.png", "./Continuous-temporal/video_captioning/video_captioning_139_7.png", "./Continuous-temporal/video_captioning/video_captioning_139_8.png", "./Continuous-temporal/video_captioning/video_captioning_139_9.png", 
"./Continuous-temporal/video_captioning/video_captioning_139_10.png", "./Continuous-temporal/video_captioning/video_captioning_139_11.png", "./Continuous-temporal/video_captioning/video_captioning_139_12.png", "./Continuous-temporal/video_captioning/video_captioning_139_13.png", "./Continuous-temporal/video_captioning/video_captioning_139_14.png", "./Continuous-temporal/video_captioning/video_captioning_139_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a group of people in formal attire sitting at a table and having a business meeting\nB: two men in casual attire sitting in a car and discussing about a project\nC: a woman in traditional dress dancing on stage with a group of musicians playing instruments\nD: two girls in design dress wearing cloth standing holding mic in hand on street and person walking beside discusing on topic", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a group of people in formal attire sitting at a table and having a business meeting\nB: two men in casual attire sitting in a car and discussing about a project\nC: a woman in traditional dress dancing on stage with a group of musicians playing instruments\nD: two girls in design dress wearing cloth standing holding mic in hand on street and person walking beside discusing on topic", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_140_0.png", "./Continuous-temporal/video_captioning/video_captioning_140_1.png", "./Continuous-temporal/video_captioning/video_captioning_140_2.png", "./Continuous-temporal/video_captioning/video_captioning_140_3.png", "./Continuous-temporal/video_captioning/video_captioning_140_4.png", "./Continuous-temporal/video_captioning/video_captioning_140_5.png", "./Continuous-temporal/video_captioning/video_captioning_140_6.png", 
"./Continuous-temporal/video_captioning/video_captioning_140_7.png", "./Continuous-temporal/video_captioning/video_captioning_140_8.png", "./Continuous-temporal/video_captioning/video_captioning_140_9.png", "./Continuous-temporal/video_captioning/video_captioning_140_10.png", "./Continuous-temporal/video_captioning/video_captioning_140_11.png", "./Continuous-temporal/video_captioning/video_captioning_140_12.png", "./Continuous-temporal/video_captioning/video_captioning_140_13.png", "./Continuous-temporal/video_captioning/video_captioning_140_14.png", "./Continuous-temporal/video_captioning/video_captioning_140_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man cutting carrots with a knife in a slow manner\nB: a person chopping onions with a fork slowly\nC: a woman peeling potatoes with a peeler quickly\nD: a man slicing tomatoes with a spoon in a fast manner", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man cutting carrots with a knife in a slow manner\nB: a person chopping onions with a fork slowly\nC: a woman peeling potatoes with a peeler quickly\nD: a man slicing tomatoes with a spoon in a fast manner", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_141_0.png", "./Continuous-temporal/video_captioning/video_captioning_141_1.png", "./Continuous-temporal/video_captioning/video_captioning_141_2.png", "./Continuous-temporal/video_captioning/video_captioning_141_3.png", "./Continuous-temporal/video_captioning/video_captioning_141_4.png", "./Continuous-temporal/video_captioning/video_captioning_141_5.png", "./Continuous-temporal/video_captioning/video_captioning_141_6.png", "./Continuous-temporal/video_captioning/video_captioning_141_7.png", "./Continuous-temporal/video_captioning/video_captioning_141_8.png", 
"./Continuous-temporal/video_captioning/video_captioning_141_9.png", "./Continuous-temporal/video_captioning/video_captioning_141_10.png", "./Continuous-temporal/video_captioning/video_captioning_141_11.png", "./Continuous-temporal/video_captioning/video_captioning_141_12.png", "./Continuous-temporal/video_captioning/video_captioning_141_13.png", "./Continuous-temporal/video_captioning/video_captioning_141_14.png", "./Continuous-temporal/video_captioning/video_captioning_141_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a cat is sleeping on the windowsill\nB: an indian woman is applying makeup between her hair\nC: a chef is cooking in the kitchen\nD: a young man is playing guitar on the beach", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a cat is sleeping on the windowsill\nB: an indian woman is applying makeup between her hair\nC: a chef is cooking in the kitchen\nD: a young man is playing guitar on the beach", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_142_0.png", "./Continuous-temporal/video_captioning/video_captioning_142_1.png", "./Continuous-temporal/video_captioning/video_captioning_142_2.png", "./Continuous-temporal/video_captioning/video_captioning_142_3.png", "./Continuous-temporal/video_captioning/video_captioning_142_4.png", "./Continuous-temporal/video_captioning/video_captioning_142_5.png", "./Continuous-temporal/video_captioning/video_captioning_142_6.png", "./Continuous-temporal/video_captioning/video_captioning_142_7.png", "./Continuous-temporal/video_captioning/video_captioning_142_8.png", "./Continuous-temporal/video_captioning/video_captioning_142_9.png", "./Continuous-temporal/video_captioning/video_captioning_142_10.png", "./Continuous-temporal/video_captioning/video_captioning_142_11.png", 
"./Continuous-temporal/video_captioning/video_captioning_142_12.png", "./Continuous-temporal/video_captioning/video_captioning_142_13.png", "./Continuous-temporal/video_captioning/video_captioning_142_14.png", "./Continuous-temporal/video_captioning/video_captioning_142_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man is sitting on a bench\nB: a man is asking for lift\nC: a man is talking on the phone\nD: a man is walking in the park", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man is sitting on a bench\nB: a man is asking for lift\nC: a man is talking on the phone\nD: a man is walking in the park", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_143_0.png", "./Continuous-temporal/video_captioning/video_captioning_143_1.png", "./Continuous-temporal/video_captioning/video_captioning_143_2.png", "./Continuous-temporal/video_captioning/video_captioning_143_3.png", "./Continuous-temporal/video_captioning/video_captioning_143_4.png", "./Continuous-temporal/video_captioning/video_captioning_143_5.png", "./Continuous-temporal/video_captioning/video_captioning_143_6.png", "./Continuous-temporal/video_captioning/video_captioning_143_7.png", "./Continuous-temporal/video_captioning/video_captioning_143_8.png", "./Continuous-temporal/video_captioning/video_captioning_143_9.png", "./Continuous-temporal/video_captioning/video_captioning_143_10.png", "./Continuous-temporal/video_captioning/video_captioning_143_11.png", "./Continuous-temporal/video_captioning/video_captioning_143_12.png", "./Continuous-temporal/video_captioning/video_captioning_143_13.png", "./Continuous-temporal/video_captioning/video_captioning_143_14.png", "./Continuous-temporal/video_captioning/video_captioning_143_15.png"], "output": "B", "qwen3-vl": "image 
none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman is speaking into a microphone\nB: a man is playing guitar on stage\nC: a man is speaking into a microphone\nD: a man is typing on a computer keyboard", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman is speaking into a microphone\nB: a man is playing guitar on stage\nC: a man is speaking into a microphone\nD: a man is typing on a computer keyboard", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_144_0.png", "./Continuous-temporal/video_captioning/video_captioning_144_1.png", "./Continuous-temporal/video_captioning/video_captioning_144_2.png", "./Continuous-temporal/video_captioning/video_captioning_144_3.png", "./Continuous-temporal/video_captioning/video_captioning_144_4.png", "./Continuous-temporal/video_captioning/video_captioning_144_5.png", "./Continuous-temporal/video_captioning/video_captioning_144_6.png", "./Continuous-temporal/video_captioning/video_captioning_144_7.png", "./Continuous-temporal/video_captioning/video_captioning_144_8.png", "./Continuous-temporal/video_captioning/video_captioning_144_9.png", "./Continuous-temporal/video_captioning/video_captioning_144_10.png", "./Continuous-temporal/video_captioning/video_captioning_144_11.png", "./Continuous-temporal/video_captioning/video_captioning_144_12.png", "./Continuous-temporal/video_captioning/video_captioning_144_13.png", "./Continuous-temporal/video_captioning/video_captioning_144_14.png", "./Continuous-temporal/video_captioning/video_captioning_144_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: some kittens are eating\nB: a cat is sleeping\nC: dogs are playing\nD: birds are flying", "question": "Please generate 
textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: some kittens are eating\nB: a cat is sleeping\nC: dogs are playing\nD: birds are flying", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_145_0.png", "./Continuous-temporal/video_captioning/video_captioning_145_1.png", "./Continuous-temporal/video_captioning/video_captioning_145_2.png", "./Continuous-temporal/video_captioning/video_captioning_145_3.png", "./Continuous-temporal/video_captioning/video_captioning_145_4.png", "./Continuous-temporal/video_captioning/video_captioning_145_5.png", "./Continuous-temporal/video_captioning/video_captioning_145_6.png", "./Continuous-temporal/video_captioning/video_captioning_145_7.png", "./Continuous-temporal/video_captioning/video_captioning_145_8.png", "./Continuous-temporal/video_captioning/video_captioning_145_9.png", "./Continuous-temporal/video_captioning/video_captioning_145_10.png", "./Continuous-temporal/video_captioning/video_captioning_145_11.png", "./Continuous-temporal/video_captioning/video_captioning_145_12.png", "./Continuous-temporal/video_captioning/video_captioning_145_13.png", "./Continuous-temporal/video_captioning/video_captioning_145_14.png", "./Continuous-temporal/video_captioning/video_captioning_145_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the football players practicing on the field in the evening\nB: two people running on the track in the morning\nC: the tennis players wearing blue and red t shirts and play the tennis in the tennis court at the night time\nD: a group of people playing basketball in the park during daytime", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the football players practicing on the field in the evening\nB: two people 
running on the track in the morning\nC: the tennis players wearing blue and red t shirts and play the tennis in the tennis court at the night time\nD: a group of people playing basketball in the park during daytime", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_146_0.png", "./Continuous-temporal/video_captioning/video_captioning_146_1.png", "./Continuous-temporal/video_captioning/video_captioning_146_2.png", "./Continuous-temporal/video_captioning/video_captioning_146_3.png", "./Continuous-temporal/video_captioning/video_captioning_146_4.png", "./Continuous-temporal/video_captioning/video_captioning_146_5.png", "./Continuous-temporal/video_captioning/video_captioning_146_6.png", "./Continuous-temporal/video_captioning/video_captioning_146_7.png", "./Continuous-temporal/video_captioning/video_captioning_146_8.png", "./Continuous-temporal/video_captioning/video_captioning_146_9.png", "./Continuous-temporal/video_captioning/video_captioning_146_10.png", "./Continuous-temporal/video_captioning/video_captioning_146_11.png", "./Continuous-temporal/video_captioning/video_captioning_146_12.png", "./Continuous-temporal/video_captioning/video_captioning_146_13.png", "./Continuous-temporal/video_captioning/video_captioning_146_14.png", "./Continuous-temporal/video_captioning/video_captioning_146_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman walked by\nB: a child sat on a swing\nC: a man jumped high\nD: a man fell down", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman walked by\nB: a child sat on a swing\nC: a man jumped high\nD: a man fell down", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_147_0.png", "./Continuous-temporal/video_captioning/video_captioning_147_1.png", 
"./Continuous-temporal/video_captioning/video_captioning_147_2.png", "./Continuous-temporal/video_captioning/video_captioning_147_3.png", "./Continuous-temporal/video_captioning/video_captioning_147_4.png", "./Continuous-temporal/video_captioning/video_captioning_147_5.png", "./Continuous-temporal/video_captioning/video_captioning_147_6.png", "./Continuous-temporal/video_captioning/video_captioning_147_7.png", "./Continuous-temporal/video_captioning/video_captioning_147_8.png", "./Continuous-temporal/video_captioning/video_captioning_147_9.png", "./Continuous-temporal/video_captioning/video_captioning_147_10.png", "./Continuous-temporal/video_captioning/video_captioning_147_11.png", "./Continuous-temporal/video_captioning/video_captioning_147_12.png", "./Continuous-temporal/video_captioning/video_captioning_147_13.png", "./Continuous-temporal/video_captioning/video_captioning_147_14.png", "./Continuous-temporal/video_captioning/video_captioning_147_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a group of people are riding on a raft in a body of water\nB: a person is standing alone on a boat in a lake\nC: a canoe with people rowing in a river\nD: a group of people are swimming in a pool", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a group of people are riding on a raft in a body of water\nB: a person is standing alone on a boat in a lake\nC: a canoe with people rowing in a river\nD: a group of people are swimming in a pool", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_148_0.png", "./Continuous-temporal/video_captioning/video_captioning_148_1.png", "./Continuous-temporal/video_captioning/video_captioning_148_2.png", "./Continuous-temporal/video_captioning/video_captioning_148_3.png", 
"./Continuous-temporal/video_captioning/video_captioning_148_4.png", "./Continuous-temporal/video_captioning/video_captioning_148_5.png", "./Continuous-temporal/video_captioning/video_captioning_148_6.png", "./Continuous-temporal/video_captioning/video_captioning_148_7.png", "./Continuous-temporal/video_captioning/video_captioning_148_8.png", "./Continuous-temporal/video_captioning/video_captioning_148_9.png", "./Continuous-temporal/video_captioning/video_captioning_148_10.png", "./Continuous-temporal/video_captioning/video_captioning_148_11.png", "./Continuous-temporal/video_captioning/video_captioning_148_12.png", "./Continuous-temporal/video_captioning/video_captioning_148_13.png", "./Continuous-temporal/video_captioning/video_captioning_148_14.png", "./Continuous-temporal/video_captioning/video_captioning_148_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: someone pours a pot of milk into a larger pot\nB: someone sprinkles salt and pepper onto a plate of food\nC: someone pours a pot of tomato sauce into a larger pot\nD: someone mixes flour and water in a bowl", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: someone pours a pot of milk into a larger pot\nB: someone sprinkles salt and pepper onto a plate of food\nC: someone pours a pot of tomato sauce into a larger pot\nD: someone mixes flour and water in a bowl", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_149_0.png", "./Continuous-temporal/video_captioning/video_captioning_149_1.png", "./Continuous-temporal/video_captioning/video_captioning_149_2.png", "./Continuous-temporal/video_captioning/video_captioning_149_3.png", "./Continuous-temporal/video_captioning/video_captioning_149_4.png", "./Continuous-temporal/video_captioning/video_captioning_149_5.png", 
"./Continuous-temporal/video_captioning/video_captioning_149_6.png", "./Continuous-temporal/video_captioning/video_captioning_149_7.png", "./Continuous-temporal/video_captioning/video_captioning_149_8.png", "./Continuous-temporal/video_captioning/video_captioning_149_9.png", "./Continuous-temporal/video_captioning/video_captioning_149_10.png", "./Continuous-temporal/video_captioning/video_captioning_149_11.png", "./Continuous-temporal/video_captioning/video_captioning_149_12.png", "./Continuous-temporal/video_captioning/video_captioning_149_13.png", "./Continuous-temporal/video_captioning/video_captioning_149_14.png", "./Continuous-temporal/video_captioning/video_captioning_149_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a person is singing\nB: a person is running\nC: a person is sleeping\nD: a person is cooking", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a person is singing\nB: a person is running\nC: a person is sleeping\nD: a person is cooking", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_150_0.png", "./Continuous-temporal/video_captioning/video_captioning_150_1.png", "./Continuous-temporal/video_captioning/video_captioning_150_2.png", "./Continuous-temporal/video_captioning/video_captioning_150_3.png", "./Continuous-temporal/video_captioning/video_captioning_150_4.png", "./Continuous-temporal/video_captioning/video_captioning_150_5.png", "./Continuous-temporal/video_captioning/video_captioning_150_6.png", "./Continuous-temporal/video_captioning/video_captioning_150_7.png", "./Continuous-temporal/video_captioning/video_captioning_150_8.png", "./Continuous-temporal/video_captioning/video_captioning_150_9.png", "./Continuous-temporal/video_captioning/video_captioning_150_10.png", 
"./Continuous-temporal/video_captioning/video_captioning_150_11.png", "./Continuous-temporal/video_captioning/video_captioning_150_12.png", "./Continuous-temporal/video_captioning/video_captioning_150_13.png", "./Continuous-temporal/video_captioning/video_captioning_150_14.png", "./Continuous-temporal/video_captioning/video_captioning_150_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: three baseballs are being hit by a bat\nB: three basketballs are bouncing on a court\nC: three soccer balls are laying in a field and then three men in black athletic cloths attempt to shoot a goal\nD: three tennis balls are being served by players", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: three baseballs are being hit by a bat\nB: three basketballs are bouncing on a court\nC: three soccer balls are laying in a field and then three men in black athletic cloths attempt to shoot a goal\nD: three tennis balls are being served by players", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_151_0.png", "./Continuous-temporal/video_captioning/video_captioning_151_1.png", "./Continuous-temporal/video_captioning/video_captioning_151_2.png", "./Continuous-temporal/video_captioning/video_captioning_151_3.png", "./Continuous-temporal/video_captioning/video_captioning_151_4.png", "./Continuous-temporal/video_captioning/video_captioning_151_5.png", "./Continuous-temporal/video_captioning/video_captioning_151_6.png", "./Continuous-temporal/video_captioning/video_captioning_151_7.png", "./Continuous-temporal/video_captioning/video_captioning_151_8.png", "./Continuous-temporal/video_captioning/video_captioning_151_9.png", "./Continuous-temporal/video_captioning/video_captioning_151_10.png", 
"./Continuous-temporal/video_captioning/video_captioning_151_11.png", "./Continuous-temporal/video_captioning/video_captioning_151_12.png", "./Continuous-temporal/video_captioning/video_captioning_151_13.png", "./Continuous-temporal/video_captioning/video_captioning_151_14.png", "./Continuous-temporal/video_captioning/video_captioning_151_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: some people making a human pyramid\nB: a cat sleeping on a chair\nC: a chef cooking in a restaurant kitchen\nD: a basketball player shooting a three-pointer", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: some people making a human pyramid\nB: a cat sleeping on a chair\nC: a chef cooking in a restaurant kitchen\nD: a basketball player shooting a three-pointer", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_152_0.png", "./Continuous-temporal/video_captioning/video_captioning_152_1.png", "./Continuous-temporal/video_captioning/video_captioning_152_2.png", "./Continuous-temporal/video_captioning/video_captioning_152_3.png", "./Continuous-temporal/video_captioning/video_captioning_152_4.png", "./Continuous-temporal/video_captioning/video_captioning_152_5.png", "./Continuous-temporal/video_captioning/video_captioning_152_6.png", "./Continuous-temporal/video_captioning/video_captioning_152_7.png", "./Continuous-temporal/video_captioning/video_captioning_152_8.png", "./Continuous-temporal/video_captioning/video_captioning_152_9.png", "./Continuous-temporal/video_captioning/video_captioning_152_10.png", "./Continuous-temporal/video_captioning/video_captioning_152_11.png", "./Continuous-temporal/video_captioning/video_captioning_152_12.png", "./Continuous-temporal/video_captioning/video_captioning_152_13.png", 
"./Continuous-temporal/video_captioning/video_captioning_152_14.png", "./Continuous-temporal/video_captioning/video_captioning_152_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: depicting a group of people sitting\nB: featuring some persons running\nC: showing people walking slowly\nD: displaying individuals standing still", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: depicting a group of people sitting\nB: featuring some persons running\nC: showing people walking slowly\nD: displaying individuals standing still", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_153_0.png", "./Continuous-temporal/video_captioning/video_captioning_153_1.png", "./Continuous-temporal/video_captioning/video_captioning_153_2.png", "./Continuous-temporal/video_captioning/video_captioning_153_3.png", "./Continuous-temporal/video_captioning/video_captioning_153_4.png", "./Continuous-temporal/video_captioning/video_captioning_153_5.png", "./Continuous-temporal/video_captioning/video_captioning_153_6.png", "./Continuous-temporal/video_captioning/video_captioning_153_7.png", "./Continuous-temporal/video_captioning/video_captioning_153_8.png", "./Continuous-temporal/video_captioning/video_captioning_153_9.png", "./Continuous-temporal/video_captioning/video_captioning_153_10.png", "./Continuous-temporal/video_captioning/video_captioning_153_11.png", "./Continuous-temporal/video_captioning/video_captioning_153_12.png", "./Continuous-temporal/video_captioning/video_captioning_153_13.png", "./Continuous-temporal/video_captioning/video_captioning_153_14.png", "./Continuous-temporal/video_captioning/video_captioning_153_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", 
"source": "source", "options": "A: two women are wrestling\nB: two men playing tennis\nC: a man and a woman dancing\nD: a group of people doing yoga", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: two women are wrestling\nB: two men playing tennis\nC: a man and a woman dancing\nD: a group of people doing yoga", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_154_0.png", "./Continuous-temporal/video_captioning/video_captioning_154_1.png", "./Continuous-temporal/video_captioning/video_captioning_154_2.png", "./Continuous-temporal/video_captioning/video_captioning_154_3.png", "./Continuous-temporal/video_captioning/video_captioning_154_4.png", "./Continuous-temporal/video_captioning/video_captioning_154_5.png", "./Continuous-temporal/video_captioning/video_captioning_154_6.png", "./Continuous-temporal/video_captioning/video_captioning_154_7.png", "./Continuous-temporal/video_captioning/video_captioning_154_8.png", "./Continuous-temporal/video_captioning/video_captioning_154_9.png", "./Continuous-temporal/video_captioning/video_captioning_154_10.png", "./Continuous-temporal/video_captioning/video_captioning_154_11.png", "./Continuous-temporal/video_captioning/video_captioning_154_12.png", "./Continuous-temporal/video_captioning/video_captioning_154_13.png", "./Continuous-temporal/video_captioning/video_captioning_154_14.png", "./Continuous-temporal/video_captioning/video_captioning_154_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a transvestite shows what she bought for her dog including shampoo and conditioner made by martha stuart\nB: a woman presents her latest DIY home renovation tools and supplies\nC: a man showcases his new car detailing products from a popular brand\nD: a pet owner displays a range of organic treats 
and toys for their cat", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a transvestite shows what she bought for her dog including shampoo and conditioner made by martha stuart\nB: a woman presents her latest DIY home renovation tools and supplies\nC: a man showcases his new car detailing products from a popular brand\nD: a pet owner displays a range of organic treats and toys for their cat", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_155_0.png", "./Continuous-temporal/video_captioning/video_captioning_155_1.png", "./Continuous-temporal/video_captioning/video_captioning_155_2.png", "./Continuous-temporal/video_captioning/video_captioning_155_3.png", "./Continuous-temporal/video_captioning/video_captioning_155_4.png", "./Continuous-temporal/video_captioning/video_captioning_155_5.png", "./Continuous-temporal/video_captioning/video_captioning_155_6.png", "./Continuous-temporal/video_captioning/video_captioning_155_7.png", "./Continuous-temporal/video_captioning/video_captioning_155_8.png", "./Continuous-temporal/video_captioning/video_captioning_155_9.png", "./Continuous-temporal/video_captioning/video_captioning_155_10.png", "./Continuous-temporal/video_captioning/video_captioning_155_11.png", "./Continuous-temporal/video_captioning/video_captioning_155_12.png", "./Continuous-temporal/video_captioning/video_captioning_155_13.png", "./Continuous-temporal/video_captioning/video_captioning_155_14.png", "./Continuous-temporal/video_captioning/video_captioning_155_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a dog is sitting on a couch\nB: a fish is swimming in the ocean\nC: a cat is drinking water from a faucet\nD: a bird is flying in the sky", "question": "Please generate textual descriptions for a sequence of video 
frames.", "context": "Select from the following choices.\nA: a dog is sitting on a couch\nB: a fish is swimming in the ocean\nC: a cat is drinking water from a faucet\nD: a bird is flying in the sky", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_156_0.png", "./Continuous-temporal/video_captioning/video_captioning_156_1.png", "./Continuous-temporal/video_captioning/video_captioning_156_2.png", "./Continuous-temporal/video_captioning/video_captioning_156_3.png", "./Continuous-temporal/video_captioning/video_captioning_156_4.png", "./Continuous-temporal/video_captioning/video_captioning_156_5.png", "./Continuous-temporal/video_captioning/video_captioning_156_6.png", "./Continuous-temporal/video_captioning/video_captioning_156_7.png", "./Continuous-temporal/video_captioning/video_captioning_156_8.png", "./Continuous-temporal/video_captioning/video_captioning_156_9.png", "./Continuous-temporal/video_captioning/video_captioning_156_10.png", "./Continuous-temporal/video_captioning/video_captioning_156_11.png", "./Continuous-temporal/video_captioning/video_captioning_156_12.png", "./Continuous-temporal/video_captioning/video_captioning_156_13.png", "./Continuous-temporal/video_captioning/video_captioning_156_14.png", "./Continuous-temporal/video_captioning/video_captioning_156_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the rabbit played with a pink stuffed rabbit\nB: a bird perched on a tree branch\nC: the dog barked at a squirrel\nD: a cat chased a red ball", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the rabbit played with a pink stuffed rabbit\nB: a bird perched on a tree branch\nC: the dog barked at a squirrel\nD: a cat chased a red ball", "input_image_path": 
["./Continuous-temporal/video_captioning/video_captioning_157_0.png", "./Continuous-temporal/video_captioning/video_captioning_157_1.png", "./Continuous-temporal/video_captioning/video_captioning_157_2.png", "./Continuous-temporal/video_captioning/video_captioning_157_3.png", "./Continuous-temporal/video_captioning/video_captioning_157_4.png", "./Continuous-temporal/video_captioning/video_captioning_157_5.png", "./Continuous-temporal/video_captioning/video_captioning_157_6.png", "./Continuous-temporal/video_captioning/video_captioning_157_7.png", "./Continuous-temporal/video_captioning/video_captioning_157_8.png", "./Continuous-temporal/video_captioning/video_captioning_157_9.png", "./Continuous-temporal/video_captioning/video_captioning_157_10.png", "./Continuous-temporal/video_captioning/video_captioning_157_11.png", "./Continuous-temporal/video_captioning/video_captioning_157_12.png", "./Continuous-temporal/video_captioning/video_captioning_157_13.png", "./Continuous-temporal/video_captioning/video_captioning_157_14.png", "./Continuous-temporal/video_captioning/video_captioning_157_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the man is sitting on the something\nB: the woman is cooking the something\nC: the girl is playing with the something\nD: the boy is eating the something", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the man is sitting on the something\nB: the woman is cooking the something\nC: the girl is playing with the something\nD: the boy is eating the something", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_158_0.png", "./Continuous-temporal/video_captioning/video_captioning_158_1.png", "./Continuous-temporal/video_captioning/video_captioning_158_2.png", 
"./Continuous-temporal/video_captioning/video_captioning_158_3.png", "./Continuous-temporal/video_captioning/video_captioning_158_4.png", "./Continuous-temporal/video_captioning/video_captioning_158_5.png", "./Continuous-temporal/video_captioning/video_captioning_158_6.png", "./Continuous-temporal/video_captioning/video_captioning_158_7.png", "./Continuous-temporal/video_captioning/video_captioning_158_8.png", "./Continuous-temporal/video_captioning/video_captioning_158_9.png", "./Continuous-temporal/video_captioning/video_captioning_158_10.png", "./Continuous-temporal/video_captioning/video_captioning_158_11.png", "./Continuous-temporal/video_captioning/video_captioning_158_12.png", "./Continuous-temporal/video_captioning/video_captioning_158_13.png", "./Continuous-temporal/video_captioning/video_captioning_158_14.png", "./Continuous-temporal/video_captioning/video_captioning_158_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: sleeping and watching TV with 4 boys and 1 girl\nB: standing and staring at 3 kids and 1 adult\nC: sitting and converstion 2 lady and 2 gents\nD: walking and talking to 2 men and 3 women", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: sleeping and watching TV with 4 boys and 1 girl\nB: standing and staring at 3 kids and 1 adult\nC: sitting and converstion 2 lady and 2 gents\nD: walking and talking to 2 men and 3 women", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_159_0.png", "./Continuous-temporal/video_captioning/video_captioning_159_1.png", "./Continuous-temporal/video_captioning/video_captioning_159_2.png", "./Continuous-temporal/video_captioning/video_captioning_159_3.png", "./Continuous-temporal/video_captioning/video_captioning_159_4.png", 
"./Continuous-temporal/video_captioning/video_captioning_159_5.png", "./Continuous-temporal/video_captioning/video_captioning_159_6.png", "./Continuous-temporal/video_captioning/video_captioning_159_7.png", "./Continuous-temporal/video_captioning/video_captioning_159_8.png", "./Continuous-temporal/video_captioning/video_captioning_159_9.png", "./Continuous-temporal/video_captioning/video_captioning_159_10.png", "./Continuous-temporal/video_captioning/video_captioning_159_11.png", "./Continuous-temporal/video_captioning/video_captioning_159_12.png", "./Continuous-temporal/video_captioning/video_captioning_159_13.png", "./Continuous-temporal/video_captioning/video_captioning_159_14.png", "./Continuous-temporal/video_captioning/video_captioning_159_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman is painting on a canvas\nB: a man is cooking in the kitchen\nC: a girl is riding a bicycle\nD: a boy is playing a piano", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman is painting on a canvas\nB: a man is cooking in the kitchen\nC: a girl is riding a bicycle\nD: a boy is playing a piano", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_160_0.png", "./Continuous-temporal/video_captioning/video_captioning_160_1.png", "./Continuous-temporal/video_captioning/video_captioning_160_2.png", "./Continuous-temporal/video_captioning/video_captioning_160_3.png", "./Continuous-temporal/video_captioning/video_captioning_160_4.png", "./Continuous-temporal/video_captioning/video_captioning_160_5.png", "./Continuous-temporal/video_captioning/video_captioning_160_6.png", "./Continuous-temporal/video_captioning/video_captioning_160_7.png", "./Continuous-temporal/video_captioning/video_captioning_160_8.png", 
"./Continuous-temporal/video_captioning/video_captioning_160_9.png", "./Continuous-temporal/video_captioning/video_captioning_160_10.png", "./Continuous-temporal/video_captioning/video_captioning_160_11.png", "./Continuous-temporal/video_captioning/video_captioning_160_12.png", "./Continuous-temporal/video_captioning/video_captioning_160_13.png", "./Continuous-temporal/video_captioning/video_captioning_160_14.png", "./Continuous-temporal/video_captioning/video_captioning_160_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: stunning acting is too good\nB: mohan acting is not impressive\nC: vintage acting is too good\nD: mohan acting is too good", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: stunning acting is too good\nB: mohan acting is not impressive\nC: vintage acting is too good\nD: mohan acting is too good", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_161_0.png", "./Continuous-temporal/video_captioning/video_captioning_161_1.png", "./Continuous-temporal/video_captioning/video_captioning_161_2.png", "./Continuous-temporal/video_captioning/video_captioning_161_3.png", "./Continuous-temporal/video_captioning/video_captioning_161_4.png", "./Continuous-temporal/video_captioning/video_captioning_161_5.png", "./Continuous-temporal/video_captioning/video_captioning_161_6.png", "./Continuous-temporal/video_captioning/video_captioning_161_7.png", "./Continuous-temporal/video_captioning/video_captioning_161_8.png", "./Continuous-temporal/video_captioning/video_captioning_161_9.png", "./Continuous-temporal/video_captioning/video_captioning_161_10.png", "./Continuous-temporal/video_captioning/video_captioning_161_11.png", "./Continuous-temporal/video_captioning/video_captioning_161_12.png", 
"./Continuous-temporal/video_captioning/video_captioning_161_13.png", "./Continuous-temporal/video_captioning/video_captioning_161_14.png", "./Continuous-temporal/video_captioning/video_captioning_161_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man calmly watches the snakes slither around the room\nB: the man dances with the snakes in a friendly manner\nC: the snakes attack the man and force him to leave the room\nD: a man grabs at snakes and throws them around the room", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man calmly watches the snakes slither around the room\nB: the man dances with the snakes in a friendly manner\nC: the snakes attack the man and force him to leave the room\nD: a man grabs at snakes and throws them around the room", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_162_0.png", "./Continuous-temporal/video_captioning/video_captioning_162_1.png", "./Continuous-temporal/video_captioning/video_captioning_162_2.png", "./Continuous-temporal/video_captioning/video_captioning_162_3.png", "./Continuous-temporal/video_captioning/video_captioning_162_4.png", "./Continuous-temporal/video_captioning/video_captioning_162_5.png", "./Continuous-temporal/video_captioning/video_captioning_162_6.png", "./Continuous-temporal/video_captioning/video_captioning_162_7.png", "./Continuous-temporal/video_captioning/video_captioning_162_8.png", "./Continuous-temporal/video_captioning/video_captioning_162_9.png", "./Continuous-temporal/video_captioning/video_captioning_162_10.png", "./Continuous-temporal/video_captioning/video_captioning_162_11.png", "./Continuous-temporal/video_captioning/video_captioning_162_12.png", "./Continuous-temporal/video_captioning/video_captioning_162_13.png", 
"./Continuous-temporal/video_captioning/video_captioning_162_14.png", "./Continuous-temporal/video_captioning/video_captioning_162_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a person plays a popular song on a musical instrument\nB: a person dances to a popular song\nC: a person covers a popular song\nD: a person sings an original song", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a person plays a popular song on a musical instrument\nB: a person dances to a popular song\nC: a person covers a popular song\nD: a person sings an original song", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_163_0.png", "./Continuous-temporal/video_captioning/video_captioning_163_1.png", "./Continuous-temporal/video_captioning/video_captioning_163_2.png", "./Continuous-temporal/video_captioning/video_captioning_163_3.png", "./Continuous-temporal/video_captioning/video_captioning_163_4.png", "./Continuous-temporal/video_captioning/video_captioning_163_5.png", "./Continuous-temporal/video_captioning/video_captioning_163_6.png", "./Continuous-temporal/video_captioning/video_captioning_163_7.png", "./Continuous-temporal/video_captioning/video_captioning_163_8.png", "./Continuous-temporal/video_captioning/video_captioning_163_9.png", "./Continuous-temporal/video_captioning/video_captioning_163_10.png", "./Continuous-temporal/video_captioning/video_captioning_163_11.png", "./Continuous-temporal/video_captioning/video_captioning_163_12.png", "./Continuous-temporal/video_captioning/video_captioning_163_13.png", "./Continuous-temporal/video_captioning/video_captioning_163_14.png", "./Continuous-temporal/video_captioning/video_captioning_163_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": 
"Video image or Natural image", "source": "source", "options": "A: a baby is playing with a bull dog\nB: a baby is playing with a kitten\nC: a baby is playing with a teddy bear\nD: a dog is playing with a baby", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a baby is playing with a bull dog\nB: a baby is playing with a kitten\nC: a baby is playing with a teddy bear\nD: a dog is playing with a baby", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_164_0.png", "./Continuous-temporal/video_captioning/video_captioning_164_1.png", "./Continuous-temporal/video_captioning/video_captioning_164_2.png", "./Continuous-temporal/video_captioning/video_captioning_164_3.png", "./Continuous-temporal/video_captioning/video_captioning_164_4.png", "./Continuous-temporal/video_captioning/video_captioning_164_5.png", "./Continuous-temporal/video_captioning/video_captioning_164_6.png", "./Continuous-temporal/video_captioning/video_captioning_164_7.png", "./Continuous-temporal/video_captioning/video_captioning_164_8.png", "./Continuous-temporal/video_captioning/video_captioning_164_9.png", "./Continuous-temporal/video_captioning/video_captioning_164_10.png", "./Continuous-temporal/video_captioning/video_captioning_164_11.png", "./Continuous-temporal/video_captioning/video_captioning_164_12.png", "./Continuous-temporal/video_captioning/video_captioning_164_13.png", "./Continuous-temporal/video_captioning/video_captioning_164_14.png", "./Continuous-temporal/video_captioning/video_captioning_164_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: waiters walking calmly in a professional kitchen\nB: behind the scenes in a professional kitchen as the chefs work and the waiters run food can be a very noisy experience\nC: a calm and quiet atmosphere in a 
professional kitchen\nD: chefs leisurely preparing food in a professional kitchen", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: waiters walking calmly in a professional kitchen\nB: behind the scenes in a professional kitchen as the chefs work and the waiters run food can be a very noisy experience\nC: a calm and quiet atmosphere in a professional kitchen\nD: chefs leisurely preparing food in a professional kitchen", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_165_0.png", "./Continuous-temporal/video_captioning/video_captioning_165_1.png", "./Continuous-temporal/video_captioning/video_captioning_165_2.png", "./Continuous-temporal/video_captioning/video_captioning_165_3.png", "./Continuous-temporal/video_captioning/video_captioning_165_4.png", "./Continuous-temporal/video_captioning/video_captioning_165_5.png", "./Continuous-temporal/video_captioning/video_captioning_165_6.png", "./Continuous-temporal/video_captioning/video_captioning_165_7.png", "./Continuous-temporal/video_captioning/video_captioning_165_8.png", "./Continuous-temporal/video_captioning/video_captioning_165_9.png", "./Continuous-temporal/video_captioning/video_captioning_165_10.png", "./Continuous-temporal/video_captioning/video_captioning_165_11.png", "./Continuous-temporal/video_captioning/video_captioning_165_12.png", "./Continuous-temporal/video_captioning/video_captioning_165_13.png", "./Continuous-temporal/video_captioning/video_captioning_165_14.png", "./Continuous-temporal/video_captioning/video_captioning_165_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man is washing his car\nB: a child is playing with a toy\nC: a woman is making an eyeshadow\nD: a chef is cooking a dessert", "question": "Please generate textual descriptions for a sequence 
of video frames.", "context": "Select from the following choices.\nA: a man is washing his car\nB: a child is playing with a toy\nC: a woman is making an eyeshadow\nD: a chef is cooking a dessert", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_166_0.png", "./Continuous-temporal/video_captioning/video_captioning_166_1.png", "./Continuous-temporal/video_captioning/video_captioning_166_2.png", "./Continuous-temporal/video_captioning/video_captioning_166_3.png", "./Continuous-temporal/video_captioning/video_captioning_166_4.png", "./Continuous-temporal/video_captioning/video_captioning_166_5.png", "./Continuous-temporal/video_captioning/video_captioning_166_6.png", "./Continuous-temporal/video_captioning/video_captioning_166_7.png", "./Continuous-temporal/video_captioning/video_captioning_166_8.png", "./Continuous-temporal/video_captioning/video_captioning_166_9.png", "./Continuous-temporal/video_captioning/video_captioning_166_10.png", "./Continuous-temporal/video_captioning/video_captioning_166_11.png", "./Continuous-temporal/video_captioning/video_captioning_166_12.png", "./Continuous-temporal/video_captioning/video_captioning_166_13.png", "./Continuous-temporal/video_captioning/video_captioning_166_14.png", "./Continuous-temporal/video_captioning/video_captioning_166_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a tv shows review program hosts discuss about the performance and staying on air of star trek\nB: a group of friends are shown enjoying a beach vacation in Thailand\nC: a cooking show features the preparation of traditional Italian cuisine\nD: a documentary film depicts the history and significance of ancient Egyptian pyramids", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a tv shows review program hosts discuss 
about the performance and staying on air of star trek\nB: a group of friends are shown enjoying a beach vacation in Thailand\nC: a cooking show features the preparation of traditional Italian cuisine\nD: a documentary film depicts the history and significance of ancient Egyptian pyramids", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_167_0.png", "./Continuous-temporal/video_captioning/video_captioning_167_1.png", "./Continuous-temporal/video_captioning/video_captioning_167_2.png", "./Continuous-temporal/video_captioning/video_captioning_167_3.png", "./Continuous-temporal/video_captioning/video_captioning_167_4.png", "./Continuous-temporal/video_captioning/video_captioning_167_5.png", "./Continuous-temporal/video_captioning/video_captioning_167_6.png", "./Continuous-temporal/video_captioning/video_captioning_167_7.png", "./Continuous-temporal/video_captioning/video_captioning_167_8.png", "./Continuous-temporal/video_captioning/video_captioning_167_9.png", "./Continuous-temporal/video_captioning/video_captioning_167_10.png", "./Continuous-temporal/video_captioning/video_captioning_167_11.png", "./Continuous-temporal/video_captioning/video_captioning_167_12.png", "./Continuous-temporal/video_captioning/video_captioning_167_13.png", "./Continuous-temporal/video_captioning/video_captioning_167_14.png", "./Continuous-temporal/video_captioning/video_captioning_167_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: people are waiting for a bus at a bus stop\nB: passengers are boarding a train at a station\nC: passersby are crossing a road at a traffic signal\nD: peoples are disembarking from a train in a station", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: people are waiting for a bus at a bus stop\nB: passengers are boarding a 
train at a station\nC: passersby are crossing a road at a traffic signal\nD: peoples are disembarking from a train in a station", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_168_0.png", "./Continuous-temporal/video_captioning/video_captioning_168_1.png", "./Continuous-temporal/video_captioning/video_captioning_168_2.png", "./Continuous-temporal/video_captioning/video_captioning_168_3.png", "./Continuous-temporal/video_captioning/video_captioning_168_4.png", "./Continuous-temporal/video_captioning/video_captioning_168_5.png", "./Continuous-temporal/video_captioning/video_captioning_168_6.png", "./Continuous-temporal/video_captioning/video_captioning_168_7.png", "./Continuous-temporal/video_captioning/video_captioning_168_8.png", "./Continuous-temporal/video_captioning/video_captioning_168_9.png", "./Continuous-temporal/video_captioning/video_captioning_168_10.png", "./Continuous-temporal/video_captioning/video_captioning_168_11.png", "./Continuous-temporal/video_captioning/video_captioning_168_12.png", "./Continuous-temporal/video_captioning/video_captioning_168_13.png", "./Continuous-temporal/video_captioning/video_captioning_168_14.png", "./Continuous-temporal/video_captioning/video_captioning_168_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a child is playing with a toy car\nB: a chef is cutting vegetables in the kitchen\nC: a woman is mixing food in a mixing bowl\nD: a man is painting a wall with a brush", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a child is playing with a toy car\nB: a chef is cutting vegetables in the kitchen\nC: a woman is mixing food in a mixing bowl\nD: a man is painting a wall with a brush", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_169_0.png", 
"./Continuous-temporal/video_captioning/video_captioning_169_1.png", "./Continuous-temporal/video_captioning/video_captioning_169_2.png", "./Continuous-temporal/video_captioning/video_captioning_169_3.png", "./Continuous-temporal/video_captioning/video_captioning_169_4.png", "./Continuous-temporal/video_captioning/video_captioning_169_5.png", "./Continuous-temporal/video_captioning/video_captioning_169_6.png", "./Continuous-temporal/video_captioning/video_captioning_169_7.png", "./Continuous-temporal/video_captioning/video_captioning_169_8.png", "./Continuous-temporal/video_captioning/video_captioning_169_9.png", "./Continuous-temporal/video_captioning/video_captioning_169_10.png", "./Continuous-temporal/video_captioning/video_captioning_169_11.png", "./Continuous-temporal/video_captioning/video_captioning_169_12.png", "./Continuous-temporal/video_captioning/video_captioning_169_13.png", "./Continuous-temporal/video_captioning/video_captioning_169_14.png", "./Continuous-temporal/video_captioning/video_captioning_169_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a dense forest with a winding dirt path\nB: cabins on a sandy beach have walkways going up to their porches\nC: modern skyscrapers overlooking a busy city street\nD: colorful umbrellas on a rocky beach with no buildings in sight", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a dense forest with a winding dirt path\nB: cabins on a sandy beach have walkways going up to their porches\nC: modern skyscrapers overlooking a busy city street\nD: colorful umbrellas on a rocky beach with no buildings in sight", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_170_0.png", "./Continuous-temporal/video_captioning/video_captioning_170_1.png", 
"./Continuous-temporal/video_captioning/video_captioning_170_2.png", "./Continuous-temporal/video_captioning/video_captioning_170_3.png", "./Continuous-temporal/video_captioning/video_captioning_170_4.png", "./Continuous-temporal/video_captioning/video_captioning_170_5.png", "./Continuous-temporal/video_captioning/video_captioning_170_6.png", "./Continuous-temporal/video_captioning/video_captioning_170_7.png", "./Continuous-temporal/video_captioning/video_captioning_170_8.png", "./Continuous-temporal/video_captioning/video_captioning_170_9.png", "./Continuous-temporal/video_captioning/video_captioning_170_10.png", "./Continuous-temporal/video_captioning/video_captioning_170_11.png", "./Continuous-temporal/video_captioning/video_captioning_170_12.png", "./Continuous-temporal/video_captioning/video_captioning_170_13.png", "./Continuous-temporal/video_captioning/video_captioning_170_14.png", "./Continuous-temporal/video_captioning/video_captioning_170_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a panda bear climbs on a tree trunk\nB: a koala climbs on a tree trunk\nC: a raccoon climbs on a tree trunk\nD: a bear sits on a tree trunk", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a panda bear climbs on a tree trunk\nB: a koala climbs on a tree trunk\nC: a raccoon climbs on a tree trunk\nD: a bear sits on a tree trunk", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_171_0.png", "./Continuous-temporal/video_captioning/video_captioning_171_1.png", "./Continuous-temporal/video_captioning/video_captioning_171_2.png", "./Continuous-temporal/video_captioning/video_captioning_171_3.png", "./Continuous-temporal/video_captioning/video_captioning_171_4.png", "./Continuous-temporal/video_captioning/video_captioning_171_5.png", 
"./Continuous-temporal/video_captioning/video_captioning_171_6.png", "./Continuous-temporal/video_captioning/video_captioning_171_7.png", "./Continuous-temporal/video_captioning/video_captioning_171_8.png", "./Continuous-temporal/video_captioning/video_captioning_171_9.png", "./Continuous-temporal/video_captioning/video_captioning_171_10.png", "./Continuous-temporal/video_captioning/video_captioning_171_11.png", "./Continuous-temporal/video_captioning/video_captioning_171_12.png", "./Continuous-temporal/video_captioning/video_captioning_171_13.png", "./Continuous-temporal/video_captioning/video_captioning_171_14.png", "./Continuous-temporal/video_captioning/video_captioning_171_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man is mowing the lawn\nB: a dog is barking at the mailman\nC: a woman is chopping herbs\nD: a child is playing with a toy", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man is mowing the lawn\nB: a dog is barking at the mailman\nC: a woman is chopping herbs\nD: a child is playing with a toy", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_172_0.png", "./Continuous-temporal/video_captioning/video_captioning_172_1.png", "./Continuous-temporal/video_captioning/video_captioning_172_2.png", "./Continuous-temporal/video_captioning/video_captioning_172_3.png", "./Continuous-temporal/video_captioning/video_captioning_172_4.png", "./Continuous-temporal/video_captioning/video_captioning_172_5.png", "./Continuous-temporal/video_captioning/video_captioning_172_6.png", "./Continuous-temporal/video_captioning/video_captioning_172_7.png", "./Continuous-temporal/video_captioning/video_captioning_172_8.png", "./Continuous-temporal/video_captioning/video_captioning_172_9.png", 
"./Continuous-temporal/video_captioning/video_captioning_172_10.png", "./Continuous-temporal/video_captioning/video_captioning_172_11.png", "./Continuous-temporal/video_captioning/video_captioning_172_12.png", "./Continuous-temporal/video_captioning/video_captioning_172_13.png", "./Continuous-temporal/video_captioning/video_captioning_172_14.png", "./Continuous-temporal/video_captioning/video_captioning_172_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a woman discusses a peace treaty between two countries\nB: a teacher gives a lecture on ancient civilizations\nC: a man talks about a war between two generals one of which became king\nD: a child narrates a story about a magical kingdom", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a woman discusses a peace treaty between two countries\nB: a teacher gives a lecture on ancient civilizations\nC: a man talks about a war between two generals one of which became king\nD: a child narrates a story about a magical kingdom", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_173_0.png", "./Continuous-temporal/video_captioning/video_captioning_173_1.png", "./Continuous-temporal/video_captioning/video_captioning_173_2.png", "./Continuous-temporal/video_captioning/video_captioning_173_3.png", "./Continuous-temporal/video_captioning/video_captioning_173_4.png", "./Continuous-temporal/video_captioning/video_captioning_173_5.png", "./Continuous-temporal/video_captioning/video_captioning_173_6.png", "./Continuous-temporal/video_captioning/video_captioning_173_7.png", "./Continuous-temporal/video_captioning/video_captioning_173_8.png", "./Continuous-temporal/video_captioning/video_captioning_173_9.png", "./Continuous-temporal/video_captioning/video_captioning_173_10.png", 
"./Continuous-temporal/video_captioning/video_captioning_173_11.png", "./Continuous-temporal/video_captioning/video_captioning_173_12.png", "./Continuous-temporal/video_captioning/video_captioning_173_13.png", "./Continuous-temporal/video_captioning/video_captioning_173_14.png", "./Continuous-temporal/video_captioning/video_captioning_173_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a hockey player scores a goal during a hockey game\nB: a basketball player dunks a basketball during a basketball game\nC: a baseball player hits a home run during a baseball game\nD: a soccer player shoots a goal during a soccer game", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a hockey player scores a goal during a hockey game\nB: a basketball player dunks a basketball during a basketball game\nC: a baseball player hits a home run during a baseball game\nD: a soccer player shoots a goal during a soccer game", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_174_0.png", "./Continuous-temporal/video_captioning/video_captioning_174_1.png", "./Continuous-temporal/video_captioning/video_captioning_174_2.png", "./Continuous-temporal/video_captioning/video_captioning_174_3.png", "./Continuous-temporal/video_captioning/video_captioning_174_4.png", "./Continuous-temporal/video_captioning/video_captioning_174_5.png", "./Continuous-temporal/video_captioning/video_captioning_174_6.png", "./Continuous-temporal/video_captioning/video_captioning_174_7.png", "./Continuous-temporal/video_captioning/video_captioning_174_8.png", "./Continuous-temporal/video_captioning/video_captioning_174_9.png", "./Continuous-temporal/video_captioning/video_captioning_174_10.png", "./Continuous-temporal/video_captioning/video_captioning_174_11.png", 
"./Continuous-temporal/video_captioning/video_captioning_174_12.png", "./Continuous-temporal/video_captioning/video_captioning_174_13.png", "./Continuous-temporal/video_captioning/video_captioning_174_14.png", "./Continuous-temporal/video_captioning/video_captioning_174_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a child is playing in the park\nB: a woman is cooking in the kitchen\nC: a dog is barking at the door\nD: a man is reading a book on a couch", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a child is playing in the park\nB: a woman is cooking in the kitchen\nC: a dog is barking at the door\nD: a man is reading a book on a couch", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_175_0.png", "./Continuous-temporal/video_captioning/video_captioning_175_1.png", "./Continuous-temporal/video_captioning/video_captioning_175_2.png", "./Continuous-temporal/video_captioning/video_captioning_175_3.png", "./Continuous-temporal/video_captioning/video_captioning_175_4.png", "./Continuous-temporal/video_captioning/video_captioning_175_5.png", "./Continuous-temporal/video_captioning/video_captioning_175_6.png", "./Continuous-temporal/video_captioning/video_captioning_175_7.png", "./Continuous-temporal/video_captioning/video_captioning_175_8.png", "./Continuous-temporal/video_captioning/video_captioning_175_9.png", "./Continuous-temporal/video_captioning/video_captioning_175_10.png", "./Continuous-temporal/video_captioning/video_captioning_175_11.png", "./Continuous-temporal/video_captioning/video_captioning_175_12.png", "./Continuous-temporal/video_captioning/video_captioning_175_13.png", "./Continuous-temporal/video_captioning/video_captioning_175_14.png", "./Continuous-temporal/video_captioning/video_captioning_175_15.png"], 
"output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: men pushing a car up assembly line\nB: men pushing a car down assembly line\nC: men pulling a car down assembly line\nD: women pushing a car down assembly line", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: men pushing a car up assembly line\nB: men pushing a car down assembly line\nC: men pulling a car down assembly line\nD: women pushing a car down assembly line", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_176_0.png", "./Continuous-temporal/video_captioning/video_captioning_176_1.png", "./Continuous-temporal/video_captioning/video_captioning_176_2.png", "./Continuous-temporal/video_captioning/video_captioning_176_3.png", "./Continuous-temporal/video_captioning/video_captioning_176_4.png", "./Continuous-temporal/video_captioning/video_captioning_176_5.png", "./Continuous-temporal/video_captioning/video_captioning_176_6.png", "./Continuous-temporal/video_captioning/video_captioning_176_7.png", "./Continuous-temporal/video_captioning/video_captioning_176_8.png", "./Continuous-temporal/video_captioning/video_captioning_176_9.png", "./Continuous-temporal/video_captioning/video_captioning_176_10.png", "./Continuous-temporal/video_captioning/video_captioning_176_11.png", "./Continuous-temporal/video_captioning/video_captioning_176_12.png", "./Continuous-temporal/video_captioning/video_captioning_176_13.png", "./Continuous-temporal/video_captioning/video_captioning_176_14.png", "./Continuous-temporal/video_captioning/video_captioning_176_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: two people are dancing\nB: a man and woman is eating\nC: a man is cooking 
alone\nD: a woman is eating alone", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: two people are dancing\nB: a man and woman is eating\nC: a man is cooking alone\nD: a woman is eating alone", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_177_0.png", "./Continuous-temporal/video_captioning/video_captioning_177_1.png", "./Continuous-temporal/video_captioning/video_captioning_177_2.png", "./Continuous-temporal/video_captioning/video_captioning_177_3.png", "./Continuous-temporal/video_captioning/video_captioning_177_4.png", "./Continuous-temporal/video_captioning/video_captioning_177_5.png", "./Continuous-temporal/video_captioning/video_captioning_177_6.png", "./Continuous-temporal/video_captioning/video_captioning_177_7.png", "./Continuous-temporal/video_captioning/video_captioning_177_8.png", "./Continuous-temporal/video_captioning/video_captioning_177_9.png", "./Continuous-temporal/video_captioning/video_captioning_177_10.png", "./Continuous-temporal/video_captioning/video_captioning_177_11.png", "./Continuous-temporal/video_captioning/video_captioning_177_12.png", "./Continuous-temporal/video_captioning/video_captioning_177_13.png", "./Continuous-temporal/video_captioning/video_captioning_177_14.png", "./Continuous-temporal/video_captioning/video_captioning_177_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: butter melting in a frying pan\nB: soup being boiled in a pot\nC: chocolate melting in hot water\nD: cheese melting in hot sauce", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: butter melting in a frying pan\nB: soup being boiled in a pot\nC: chocolate melting in hot water\nD: cheese melting in hot sauce", "input_image_path": 
["./Continuous-temporal/video_captioning/video_captioning_178_0.png", "./Continuous-temporal/video_captioning/video_captioning_178_1.png", "./Continuous-temporal/video_captioning/video_captioning_178_2.png", "./Continuous-temporal/video_captioning/video_captioning_178_3.png", "./Continuous-temporal/video_captioning/video_captioning_178_4.png", "./Continuous-temporal/video_captioning/video_captioning_178_5.png", "./Continuous-temporal/video_captioning/video_captioning_178_6.png", "./Continuous-temporal/video_captioning/video_captioning_178_7.png", "./Continuous-temporal/video_captioning/video_captioning_178_8.png", "./Continuous-temporal/video_captioning/video_captioning_178_9.png", "./Continuous-temporal/video_captioning/video_captioning_178_10.png", "./Continuous-temporal/video_captioning/video_captioning_178_11.png", "./Continuous-temporal/video_captioning/video_captioning_178_12.png", "./Continuous-temporal/video_captioning/video_captioning_178_13.png", "./Continuous-temporal/video_captioning/video_captioning_178_14.png", "./Continuous-temporal/video_captioning/video_captioning_178_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: two boys are playing in the park with a dog and laughing\nB: two girls are sitting in the bed with a cat and talking\nC: a man is cooking in the kitchen with a dog and dancing\nD: two girls are sitting on the bench with a cat and reading", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: two boys are playing in the park with a dog and laughing\nB: two girls are sitting in the bed with a cat and talking\nC: a man is cooking in the kitchen with a dog and dancing\nD: two girls are sitting on the bench with a cat and reading", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_179_0.png", 
"./Continuous-temporal/video_captioning/video_captioning_179_1.png", "./Continuous-temporal/video_captioning/video_captioning_179_2.png", "./Continuous-temporal/video_captioning/video_captioning_179_3.png", "./Continuous-temporal/video_captioning/video_captioning_179_4.png", "./Continuous-temporal/video_captioning/video_captioning_179_5.png", "./Continuous-temporal/video_captioning/video_captioning_179_6.png", "./Continuous-temporal/video_captioning/video_captioning_179_7.png", "./Continuous-temporal/video_captioning/video_captioning_179_8.png", "./Continuous-temporal/video_captioning/video_captioning_179_9.png", "./Continuous-temporal/video_captioning/video_captioning_179_10.png", "./Continuous-temporal/video_captioning/video_captioning_179_11.png", "./Continuous-temporal/video_captioning/video_captioning_179_12.png", "./Continuous-temporal/video_captioning/video_captioning_179_13.png", "./Continuous-temporal/video_captioning/video_captioning_179_14.png", "./Continuous-temporal/video_captioning/video_captioning_179_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: employees are working in an office\nB: players are playing a basketball match\nC: spectators are watching a tennis match\nD: dancers are performing on a stage", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: employees are working in an office\nB: players are playing a basketball match\nC: spectators are watching a tennis match\nD: dancers are performing on a stage", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_180_0.png", "./Continuous-temporal/video_captioning/video_captioning_180_1.png", "./Continuous-temporal/video_captioning/video_captioning_180_2.png", "./Continuous-temporal/video_captioning/video_captioning_180_3.png", 
"./Continuous-temporal/video_captioning/video_captioning_180_4.png", "./Continuous-temporal/video_captioning/video_captioning_180_5.png", "./Continuous-temporal/video_captioning/video_captioning_180_6.png", "./Continuous-temporal/video_captioning/video_captioning_180_7.png", "./Continuous-temporal/video_captioning/video_captioning_180_8.png", "./Continuous-temporal/video_captioning/video_captioning_180_9.png", "./Continuous-temporal/video_captioning/video_captioning_180_10.png", "./Continuous-temporal/video_captioning/video_captioning_180_11.png", "./Continuous-temporal/video_captioning/video_captioning_180_12.png", "./Continuous-temporal/video_captioning/video_captioning_180_13.png", "./Continuous-temporal/video_captioning/video_captioning_180_14.png", "./Continuous-temporal/video_captioning/video_captioning_180_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a girl getting dressed\nB: a boy playing basketball\nC: a woman reading a book\nD: a man cooking in the kitchen", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a girl getting dressed\nB: a boy playing basketball\nC: a woman reading a book\nD: a man cooking in the kitchen", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_181_0.png", "./Continuous-temporal/video_captioning/video_captioning_181_1.png", "./Continuous-temporal/video_captioning/video_captioning_181_2.png", "./Continuous-temporal/video_captioning/video_captioning_181_3.png", "./Continuous-temporal/video_captioning/video_captioning_181_4.png", "./Continuous-temporal/video_captioning/video_captioning_181_5.png", "./Continuous-temporal/video_captioning/video_captioning_181_6.png", "./Continuous-temporal/video_captioning/video_captioning_181_7.png", 
"./Continuous-temporal/video_captioning/video_captioning_181_8.png", "./Continuous-temporal/video_captioning/video_captioning_181_9.png", "./Continuous-temporal/video_captioning/video_captioning_181_10.png", "./Continuous-temporal/video_captioning/video_captioning_181_11.png", "./Continuous-temporal/video_captioning/video_captioning_181_12.png", "./Continuous-temporal/video_captioning/video_captioning_181_13.png", "./Continuous-temporal/video_captioning/video_captioning_181_14.png", "./Continuous-temporal/video_captioning/video_captioning_181_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a group of people are eating together\nB: women are dancing in silence\nC: men are singing a song\nD: children are playing a game", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a group of people are eating together\nB: women are dancing in silence\nC: men are singing a song\nD: children are playing a game", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_182_0.png", "./Continuous-temporal/video_captioning/video_captioning_182_1.png", "./Continuous-temporal/video_captioning/video_captioning_182_2.png", "./Continuous-temporal/video_captioning/video_captioning_182_3.png", "./Continuous-temporal/video_captioning/video_captioning_182_4.png", "./Continuous-temporal/video_captioning/video_captioning_182_5.png", "./Continuous-temporal/video_captioning/video_captioning_182_6.png", "./Continuous-temporal/video_captioning/video_captioning_182_7.png", "./Continuous-temporal/video_captioning/video_captioning_182_8.png", "./Continuous-temporal/video_captioning/video_captioning_182_9.png", "./Continuous-temporal/video_captioning/video_captioning_182_10.png", "./Continuous-temporal/video_captioning/video_captioning_182_11.png", 
"./Continuous-temporal/video_captioning/video_captioning_182_12.png", "./Continuous-temporal/video_captioning/video_captioning_182_13.png", "./Continuous-temporal/video_captioning/video_captioning_182_14.png", "./Continuous-temporal/video_captioning/video_captioning_182_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: an elderly man cooking in the kitchen\nB: a group of children riding bicycles in the park\nC: a middle aged woman giving another woman a message\nD: a young girl playing with a dog", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: an elderly man cooking in the kitchen\nB: a group of children riding bicycles in the park\nC: a middle aged woman giving another woman a message\nD: a young girl playing with a dog", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_183_0.png", "./Continuous-temporal/video_captioning/video_captioning_183_1.png", "./Continuous-temporal/video_captioning/video_captioning_183_2.png", "./Continuous-temporal/video_captioning/video_captioning_183_3.png", "./Continuous-temporal/video_captioning/video_captioning_183_4.png", "./Continuous-temporal/video_captioning/video_captioning_183_5.png", "./Continuous-temporal/video_captioning/video_captioning_183_6.png", "./Continuous-temporal/video_captioning/video_captioning_183_7.png", "./Continuous-temporal/video_captioning/video_captioning_183_8.png", "./Continuous-temporal/video_captioning/video_captioning_183_9.png", "./Continuous-temporal/video_captioning/video_captioning_183_10.png", "./Continuous-temporal/video_captioning/video_captioning_183_11.png", "./Continuous-temporal/video_captioning/video_captioning_183_12.png", "./Continuous-temporal/video_captioning/video_captioning_183_13.png", 
"./Continuous-temporal/video_captioning/video_captioning_183_14.png", "./Continuous-temporal/video_captioning/video_captioning_183_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man explains the details of a historical event\nB: a man talks about the publication of a nasa technical report\nC: a person describes the process of creating a new recipe\nD: a woman discusses the release of a new music album", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man explains the details of a historical event\nB: a man talks about the publication of a nasa technical report\nC: a person describes the process of creating a new recipe\nD: a woman discusses the release of a new music album", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_184_0.png", "./Continuous-temporal/video_captioning/video_captioning_184_1.png", "./Continuous-temporal/video_captioning/video_captioning_184_2.png", "./Continuous-temporal/video_captioning/video_captioning_184_3.png", "./Continuous-temporal/video_captioning/video_captioning_184_4.png", "./Continuous-temporal/video_captioning/video_captioning_184_5.png", "./Continuous-temporal/video_captioning/video_captioning_184_6.png", "./Continuous-temporal/video_captioning/video_captioning_184_7.png", "./Continuous-temporal/video_captioning/video_captioning_184_8.png", "./Continuous-temporal/video_captioning/video_captioning_184_9.png", "./Continuous-temporal/video_captioning/video_captioning_184_10.png", "./Continuous-temporal/video_captioning/video_captioning_184_11.png", "./Continuous-temporal/video_captioning/video_captioning_184_12.png", "./Continuous-temporal/video_captioning/video_captioning_184_13.png", "./Continuous-temporal/video_captioning/video_captioning_184_14.png", 
"./Continuous-temporal/video_captioning/video_captioning_184_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: the girl danced in the street\nB: the girl played with her hair\nC: the girl applied stickers to her face\nD: the girl put on a hat", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: the girl danced in the street\nB: the girl played with her hair\nC: the girl applied stickers to her face\nD: the girl put on a hat", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_185_0.png", "./Continuous-temporal/video_captioning/video_captioning_185_1.png", "./Continuous-temporal/video_captioning/video_captioning_185_2.png", "./Continuous-temporal/video_captioning/video_captioning_185_3.png", "./Continuous-temporal/video_captioning/video_captioning_185_4.png", "./Continuous-temporal/video_captioning/video_captioning_185_5.png", "./Continuous-temporal/video_captioning/video_captioning_185_6.png", "./Continuous-temporal/video_captioning/video_captioning_185_7.png", "./Continuous-temporal/video_captioning/video_captioning_185_8.png", "./Continuous-temporal/video_captioning/video_captioning_185_9.png", "./Continuous-temporal/video_captioning/video_captioning_185_10.png", "./Continuous-temporal/video_captioning/video_captioning_185_11.png", "./Continuous-temporal/video_captioning/video_captioning_185_12.png", "./Continuous-temporal/video_captioning/video_captioning_185_13.png", "./Continuous-temporal/video_captioning/video_captioning_185_14.png", "./Continuous-temporal/video_captioning/video_captioning_185_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a slideshow of landscape photographs\nB: a video showing footage 
from sporting events\nC: a documentary about wildlife conservation efforts\nD: a tutorial on cooking techniques", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a slideshow of landscape photographs\nB: a video showing footage from sporting events\nC: a documentary about wildlife conservation efforts\nD: a tutorial on cooking techniques", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_186_0.png", "./Continuous-temporal/video_captioning/video_captioning_186_1.png", "./Continuous-temporal/video_captioning/video_captioning_186_2.png", "./Continuous-temporal/video_captioning/video_captioning_186_3.png", "./Continuous-temporal/video_captioning/video_captioning_186_4.png", "./Continuous-temporal/video_captioning/video_captioning_186_5.png", "./Continuous-temporal/video_captioning/video_captioning_186_6.png", "./Continuous-temporal/video_captioning/video_captioning_186_7.png", "./Continuous-temporal/video_captioning/video_captioning_186_8.png", "./Continuous-temporal/video_captioning/video_captioning_186_9.png", "./Continuous-temporal/video_captioning/video_captioning_186_10.png", "./Continuous-temporal/video_captioning/video_captioning_186_11.png", "./Continuous-temporal/video_captioning/video_captioning_186_12.png", "./Continuous-temporal/video_captioning/video_captioning_186_13.png", "./Continuous-temporal/video_captioning/video_captioning_186_14.png", "./Continuous-temporal/video_captioning/video_captioning_186_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man drives his car down the road\nB: a group of people having a picnic in the garden\nC: a child riding a bicycle on the sidewalk\nD: a woman walking her dog in the park", "question": "Please generate textual descriptions for a sequence of video frames.", "context": 
"Select from the following choices.\nA: a man drives his car down the road\nB: a group of people having a picnic in the garden\nC: a child riding a bicycle on the sidewalk\nD: a woman walking her dog in the park", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_187_0.png", "./Continuous-temporal/video_captioning/video_captioning_187_1.png", "./Continuous-temporal/video_captioning/video_captioning_187_2.png", "./Continuous-temporal/video_captioning/video_captioning_187_3.png", "./Continuous-temporal/video_captioning/video_captioning_187_4.png", "./Continuous-temporal/video_captioning/video_captioning_187_5.png", "./Continuous-temporal/video_captioning/video_captioning_187_6.png", "./Continuous-temporal/video_captioning/video_captioning_187_7.png", "./Continuous-temporal/video_captioning/video_captioning_187_8.png", "./Continuous-temporal/video_captioning/video_captioning_187_9.png", "./Continuous-temporal/video_captioning/video_captioning_187_10.png", "./Continuous-temporal/video_captioning/video_captioning_187_11.png", "./Continuous-temporal/video_captioning/video_captioning_187_12.png", "./Continuous-temporal/video_captioning/video_captioning_187_13.png", "./Continuous-temporal/video_captioning/video_captioning_187_14.png", "./Continuous-temporal/video_captioning/video_captioning_187_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man cries uncontrollably\nB: a woman laughs until she chokes\nC: a child laughs with joy\nD: a group of people stare blankly", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man cries uncontrollably\nB: a woman laughs until she chokes\nC: a child laughs with joy\nD: a group of people stare blankly", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_188_0.png", 
"./Continuous-temporal/video_captioning/video_captioning_188_1.png", "./Continuous-temporal/video_captioning/video_captioning_188_2.png", "./Continuous-temporal/video_captioning/video_captioning_188_3.png", "./Continuous-temporal/video_captioning/video_captioning_188_4.png", "./Continuous-temporal/video_captioning/video_captioning_188_5.png", "./Continuous-temporal/video_captioning/video_captioning_188_6.png", "./Continuous-temporal/video_captioning/video_captioning_188_7.png", "./Continuous-temporal/video_captioning/video_captioning_188_8.png", "./Continuous-temporal/video_captioning/video_captioning_188_9.png", "./Continuous-temporal/video_captioning/video_captioning_188_10.png", "./Continuous-temporal/video_captioning/video_captioning_188_11.png", "./Continuous-temporal/video_captioning/video_captioning_188_12.png", "./Continuous-temporal/video_captioning/video_captioning_188_13.png", "./Continuous-temporal/video_captioning/video_captioning_188_14.png", "./Continuous-temporal/video_captioning/video_captioning_188_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a man jumps off a diving board\nB: a woman swims in a pool\nC: a cat walks on a treadmill\nD: a child plays on a swing", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a man jumps off a diving board\nB: a woman swims in a pool\nC: a cat walks on a treadmill\nD: a child plays on a swing", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_189_0.png", "./Continuous-temporal/video_captioning/video_captioning_189_1.png", "./Continuous-temporal/video_captioning/video_captioning_189_2.png", "./Continuous-temporal/video_captioning/video_captioning_189_3.png", "./Continuous-temporal/video_captioning/video_captioning_189_4.png", 
"./Continuous-temporal/video_captioning/video_captioning_189_5.png", "./Continuous-temporal/video_captioning/video_captioning_189_6.png", "./Continuous-temporal/video_captioning/video_captioning_189_7.png", "./Continuous-temporal/video_captioning/video_captioning_189_8.png", "./Continuous-temporal/video_captioning/video_captioning_189_9.png", "./Continuous-temporal/video_captioning/video_captioning_189_10.png", "./Continuous-temporal/video_captioning/video_captioning_189_11.png", "./Continuous-temporal/video_captioning/video_captioning_189_12.png", "./Continuous-temporal/video_captioning/video_captioning_189_13.png", "./Continuous-temporal/video_captioning/video_captioning_189_14.png", "./Continuous-temporal/video_captioning/video_captioning_189_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a female inside a white themed bathroom while someone else makes her makeup\nB: a female outside a white themed bathroom while making her own makeup\nC: a female inside a colorful themed bathroom while someone else makes her makeup\nD: a male inside a white themed bathroom while someone else makes his makeup", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a female inside a white themed bathroom while someone else makes her makeup\nB: a female outside a white themed bathroom while making her own makeup\nC: a female inside a colorful themed bathroom while someone else makes her makeup\nD: a male inside a white themed bathroom while someone else makes his makeup", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_190_0.png", "./Continuous-temporal/video_captioning/video_captioning_190_1.png", "./Continuous-temporal/video_captioning/video_captioning_190_2.png", "./Continuous-temporal/video_captioning/video_captioning_190_3.png", 
"./Continuous-temporal/video_captioning/video_captioning_190_4.png", "./Continuous-temporal/video_captioning/video_captioning_190_5.png", "./Continuous-temporal/video_captioning/video_captioning_190_6.png", "./Continuous-temporal/video_captioning/video_captioning_190_7.png", "./Continuous-temporal/video_captioning/video_captioning_190_8.png", "./Continuous-temporal/video_captioning/video_captioning_190_9.png", "./Continuous-temporal/video_captioning/video_captioning_190_10.png", "./Continuous-temporal/video_captioning/video_captioning_190_11.png", "./Continuous-temporal/video_captioning/video_captioning_190_12.png", "./Continuous-temporal/video_captioning/video_captioning_190_13.png", "./Continuous-temporal/video_captioning/video_captioning_190_14.png", "./Continuous-temporal/video_captioning/video_captioning_190_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a documentary about lions in the wild\nB: a commercial for the website called eharmony\nC: a tutorial on how to bake a cake\nD: an advertisement for a new mobile phone", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a documentary about lions in the wild\nB: a commercial for the website called eharmony\nC: a tutorial on how to bake a cake\nD: an advertisement for a new mobile phone", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_191_0.png", "./Continuous-temporal/video_captioning/video_captioning_191_1.png", "./Continuous-temporal/video_captioning/video_captioning_191_2.png", "./Continuous-temporal/video_captioning/video_captioning_191_3.png", "./Continuous-temporal/video_captioning/video_captioning_191_4.png", "./Continuous-temporal/video_captioning/video_captioning_191_5.png", "./Continuous-temporal/video_captioning/video_captioning_191_6.png", 
"./Continuous-temporal/video_captioning/video_captioning_191_7.png", "./Continuous-temporal/video_captioning/video_captioning_191_8.png", "./Continuous-temporal/video_captioning/video_captioning_191_9.png", "./Continuous-temporal/video_captioning/video_captioning_191_10.png", "./Continuous-temporal/video_captioning/video_captioning_191_11.png", "./Continuous-temporal/video_captioning/video_captioning_191_12.png", "./Continuous-temporal/video_captioning/video_captioning_191_13.png", "./Continuous-temporal/video_captioning/video_captioning_191_14.png", "./Continuous-temporal/video_captioning/video_captioning_191_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: smart poultry cop\nB: astute fowl officer\nC: intelligent chicken law enforcement\nD: very cleaver chicken police", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: smart poultry cop\nB: astute fowl officer\nC: intelligent chicken law enforcement\nD: very cleaver chicken police", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_192_0.png", "./Continuous-temporal/video_captioning/video_captioning_192_1.png", "./Continuous-temporal/video_captioning/video_captioning_192_2.png", "./Continuous-temporal/video_captioning/video_captioning_192_3.png", "./Continuous-temporal/video_captioning/video_captioning_192_4.png", "./Continuous-temporal/video_captioning/video_captioning_192_5.png", "./Continuous-temporal/video_captioning/video_captioning_192_6.png", "./Continuous-temporal/video_captioning/video_captioning_192_7.png", "./Continuous-temporal/video_captioning/video_captioning_192_8.png", "./Continuous-temporal/video_captioning/video_captioning_192_9.png", "./Continuous-temporal/video_captioning/video_captioning_192_10.png", 
"./Continuous-temporal/video_captioning/video_captioning_192_11.png", "./Continuous-temporal/video_captioning/video_captioning_192_12.png", "./Continuous-temporal/video_captioning/video_captioning_192_13.png", "./Continuous-temporal/video_captioning/video_captioning_192_14.png", "./Continuous-temporal/video_captioning/video_captioning_192_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a scene from spongebob squarepants where the townspeople are dancing in a parade\nB: a scene from spongebob squarepants where the townspeople are carrying torches and chasing a giant squidward\nC: a scene from spongebob squarepants where the townspeople are peacefully enjoying a picnic\nD: a scene from spongebob squarepants where the townspeople are having a friendly conversation with a giant squidward", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a scene from spongebob squarepants where the townspeople are dancing in a parade\nB: a scene from spongebob squarepants where the townspeople are carrying torches and chasing a giant squidward\nC: a scene from spongebob squarepants where the townspeople are peacefully enjoying a picnic\nD: a scene from spongebob squarepants where the townspeople are having a friendly conversation with a giant squidward", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_193_0.png", "./Continuous-temporal/video_captioning/video_captioning_193_1.png", "./Continuous-temporal/video_captioning/video_captioning_193_2.png", "./Continuous-temporal/video_captioning/video_captioning_193_3.png", "./Continuous-temporal/video_captioning/video_captioning_193_4.png", "./Continuous-temporal/video_captioning/video_captioning_193_5.png", "./Continuous-temporal/video_captioning/video_captioning_193_6.png", 
"./Continuous-temporal/video_captioning/video_captioning_193_7.png", "./Continuous-temporal/video_captioning/video_captioning_193_8.png", "./Continuous-temporal/video_captioning/video_captioning_193_9.png", "./Continuous-temporal/video_captioning/video_captioning_193_10.png", "./Continuous-temporal/video_captioning/video_captioning_193_11.png", "./Continuous-temporal/video_captioning/video_captioning_193_12.png", "./Continuous-temporal/video_captioning/video_captioning_193_13.png", "./Continuous-temporal/video_captioning/video_captioning_193_14.png", "./Continuous-temporal/video_captioning/video_captioning_193_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: patrick is destroying memories of him with mr\nB: sandy is hiding memories of her with mr\nC: squidward is ignoring memories of him with mr\nD: spongebob is showing memories of him with mr", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: patrick is destroying memories of him with mr\nB: sandy is hiding memories of her with mr\nC: squidward is ignoring memories of him with mr\nD: spongebob is showing memories of him with mr", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_194_0.png", "./Continuous-temporal/video_captioning/video_captioning_194_1.png", "./Continuous-temporal/video_captioning/video_captioning_194_2.png", "./Continuous-temporal/video_captioning/video_captioning_194_3.png", "./Continuous-temporal/video_captioning/video_captioning_194_4.png", "./Continuous-temporal/video_captioning/video_captioning_194_5.png", "./Continuous-temporal/video_captioning/video_captioning_194_6.png", "./Continuous-temporal/video_captioning/video_captioning_194_7.png", "./Continuous-temporal/video_captioning/video_captioning_194_8.png", 
"./Continuous-temporal/video_captioning/video_captioning_194_9.png", "./Continuous-temporal/video_captioning/video_captioning_194_10.png", "./Continuous-temporal/video_captioning/video_captioning_194_11.png", "./Continuous-temporal/video_captioning/video_captioning_194_12.png", "./Continuous-temporal/video_captioning/video_captioning_194_13.png", "./Continuous-temporal/video_captioning/video_captioning_194_14.png", "./Continuous-temporal/video_captioning/video_captioning_194_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: valencia and hokit participating in a boxing match\nB: valencia and hokit competing in a tennis match\nC: valencia and hokit facing off in a chess tournament\nD: valencia vesus hokit in a wrestling match", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: valencia and hokit participating in a boxing match\nB: valencia and hokit competing in a tennis match\nC: valencia and hokit facing off in a chess tournament\nD: valencia vesus hokit in a wrestling match", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_195_0.png", "./Continuous-temporal/video_captioning/video_captioning_195_1.png", "./Continuous-temporal/video_captioning/video_captioning_195_2.png", "./Continuous-temporal/video_captioning/video_captioning_195_3.png", "./Continuous-temporal/video_captioning/video_captioning_195_4.png", "./Continuous-temporal/video_captioning/video_captioning_195_5.png", "./Continuous-temporal/video_captioning/video_captioning_195_6.png", "./Continuous-temporal/video_captioning/video_captioning_195_7.png", "./Continuous-temporal/video_captioning/video_captioning_195_8.png", "./Continuous-temporal/video_captioning/video_captioning_195_9.png", "./Continuous-temporal/video_captioning/video_captioning_195_10.png", 
"./Continuous-temporal/video_captioning/video_captioning_195_11.png", "./Continuous-temporal/video_captioning/video_captioning_195_12.png", "./Continuous-temporal/video_captioning/video_captioning_195_13.png", "./Continuous-temporal/video_captioning/video_captioning_195_14.png", "./Continuous-temporal/video_captioning/video_captioning_195_15.png"], "output": "D", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a child is playing with a toy car\nB: a man is driving a motorcycle\nC: a woman is riding a bicycle\nD: a person is walking a dog", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a child is playing with a toy car\nB: a man is driving a motorcycle\nC: a woman is riding a bicycle\nD: a person is walking a dog", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_196_0.png", "./Continuous-temporal/video_captioning/video_captioning_196_1.png", "./Continuous-temporal/video_captioning/video_captioning_196_2.png", "./Continuous-temporal/video_captioning/video_captioning_196_3.png", "./Continuous-temporal/video_captioning/video_captioning_196_4.png", "./Continuous-temporal/video_captioning/video_captioning_196_5.png", "./Continuous-temporal/video_captioning/video_captioning_196_6.png", "./Continuous-temporal/video_captioning/video_captioning_196_7.png", "./Continuous-temporal/video_captioning/video_captioning_196_8.png", "./Continuous-temporal/video_captioning/video_captioning_196_9.png", "./Continuous-temporal/video_captioning/video_captioning_196_10.png", "./Continuous-temporal/video_captioning/video_captioning_196_11.png", "./Continuous-temporal/video_captioning/video_captioning_196_12.png", "./Continuous-temporal/video_captioning/video_captioning_196_13.png", "./Continuous-temporal/video_captioning/video_captioning_196_14.png", 
"./Continuous-temporal/video_captioning/video_captioning_196_15.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a child plays with a ball in the park\nB: a man peels a potato with a spoon\nC: a woman cuts an onion in half with a knife\nD: a chef grills a steak on a barbecue", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a child plays with a ball in the park\nB: a man peels a potato with a spoon\nC: a woman cuts an onion in half with a knife\nD: a chef grills a steak on a barbecue", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_197_0.png", "./Continuous-temporal/video_captioning/video_captioning_197_1.png", "./Continuous-temporal/video_captioning/video_captioning_197_2.png", "./Continuous-temporal/video_captioning/video_captioning_197_3.png", "./Continuous-temporal/video_captioning/video_captioning_197_4.png", "./Continuous-temporal/video_captioning/video_captioning_197_5.png", "./Continuous-temporal/video_captioning/video_captioning_197_6.png", "./Continuous-temporal/video_captioning/video_captioning_197_7.png", "./Continuous-temporal/video_captioning/video_captioning_197_8.png", "./Continuous-temporal/video_captioning/video_captioning_197_9.png", "./Continuous-temporal/video_captioning/video_captioning_197_10.png", "./Continuous-temporal/video_captioning/video_captioning_197_11.png", "./Continuous-temporal/video_captioning/video_captioning_197_12.png", "./Continuous-temporal/video_captioning/video_captioning_197_13.png", "./Continuous-temporal/video_captioning/video_captioning_197_14.png", "./Continuous-temporal/video_captioning/video_captioning_197_15.png"], "output": "C", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: a 
cartoon shows two dogs talking to a bird\nB: an animated film depicting a family of rabbits playing with a turtle\nC: a sci-fi movie featuring robots communicating with aliens\nD: a documentary about cats hunting in the wild", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: a cartoon shows two dogs talking to a bird\nB: an animated film depicting a family of rabbits playing with a turtle\nC: a sci-fi movie featuring robots communicating with aliens\nD: a documentary about cats hunting in the wild", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_198_0.png", "./Continuous-temporal/video_captioning/video_captioning_198_1.png", "./Continuous-temporal/video_captioning/video_captioning_198_2.png", "./Continuous-temporal/video_captioning/video_captioning_198_3.png", "./Continuous-temporal/video_captioning/video_captioning_198_4.png", "./Continuous-temporal/video_captioning/video_captioning_198_5.png", "./Continuous-temporal/video_captioning/video_captioning_198_6.png", "./Continuous-temporal/video_captioning/video_captioning_198_7.png", "./Continuous-temporal/video_captioning/video_captioning_198_8.png", "./Continuous-temporal/video_captioning/video_captioning_198_9.png", "./Continuous-temporal/video_captioning/video_captioning_198_10.png", "./Continuous-temporal/video_captioning/video_captioning_198_11.png", "./Continuous-temporal/video_captioning/video_captioning_198_12.png", "./Continuous-temporal/video_captioning/video_captioning_198_13.png", "./Continuous-temporal/video_captioning/video_captioning_198_14.png", "./Continuous-temporal/video_captioning/video_captioning_198_15.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "video_captioning", "visual_input_component": "Video image or Natural image", "source": "source", "options": "A: there is a vehicle riding dangerously through forest\nB: a boat sailing calmly in a lake\nC: a person walking 
peacefully in a garden\nD: a group of animals playing in a zoo", "question": "Please generate textual descriptions for a sequence of video frames.", "context": "Select from the following choices.\nA: there is a vehicle riding dangerously through forest\nB: a boat sailing calmly in a lake\nC: a person walking peacefully in a garden\nD: a group of animals playing in a zoo", "input_image_path": ["./Continuous-temporal/video_captioning/video_captioning_199_0.png", "./Continuous-temporal/video_captioning/video_captioning_199_1.png", "./Continuous-temporal/video_captioning/video_captioning_199_2.png", "./Continuous-temporal/video_captioning/video_captioning_199_3.png", "./Continuous-temporal/video_captioning/video_captioning_199_4.png", "./Continuous-temporal/video_captioning/video_captioning_199_5.png", "./Continuous-temporal/video_captioning/video_captioning_199_6.png", "./Continuous-temporal/video_captioning/video_captioning_199_7.png", "./Continuous-temporal/video_captioning/video_captioning_199_8.png", "./Continuous-temporal/video_captioning/video_captioning_199_9.png", "./Continuous-temporal/video_captioning/video_captioning_199_10.png", "./Continuous-temporal/video_captioning/video_captioning_199_11.png", "./Continuous-temporal/video_captioning/video_captioning_199_12.png", "./Continuous-temporal/video_captioning/video_captioning_199_13.png", "./Continuous-temporal/video_captioning/video_captioning_199_14.png", "./Continuous-temporal/video_captioning/video_captioning_199_15.png"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/visual_cloze/qwen3-vl/metadata_info.json b/results/visual_cloze/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..95e8b46 --- /dev/null +++ b/results/visual_cloze/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second 
image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. \nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_0_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_0_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_0_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_0_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_0_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_0_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_0_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_1_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_1_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_1_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_1_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_1_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_1_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_1_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_2_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_2_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_2_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_2_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_2_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_2_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_2_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_3_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_3_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_3_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_3_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_3_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_3_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_3_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_4_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_4_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_4_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_4_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_4_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_4_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_4_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_5_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_5_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_5_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_5_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_5_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_5_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_5_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_6_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_6_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_6_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_6_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_6_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_6_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_6_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_7_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_7_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_7_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_7_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_7_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_7_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_7_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_8_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_8_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_8_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_8_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_8_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_8_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_8_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_9_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_9_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_9_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_9_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_9_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_9_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_9_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_10_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_10_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_10_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_10_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_10_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_10_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_10_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_11_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_11_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_11_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_11_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_11_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_11_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_11_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_12_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_12_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_12_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_12_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_12_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_12_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_12_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_13_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_13_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_13_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_13_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_13_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_13_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_13_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_14_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_14_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_14_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_14_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_14_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_14_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_14_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_15_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_15_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_15_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_15_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_15_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_15_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_15_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_16_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_16_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_16_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_16_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_16_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_16_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_16_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_17_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_17_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_17_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_17_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_17_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_17_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_17_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_18_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_18_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_18_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_18_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_18_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_18_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_18_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_19_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_19_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_19_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_19_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_19_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_19_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_19_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_20_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_20_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_20_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_20_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_20_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_20_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_20_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_21_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_21_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_21_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_21_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_21_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_21_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_21_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_22_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_22_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_22_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_22_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_22_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_22_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_22_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_23_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_23_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_23_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_23_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_23_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_23_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_23_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_24_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_24_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_24_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_24_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_24_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_24_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_24_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_25_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_25_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_25_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_25_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_25_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_25_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_25_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_26_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_26_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_26_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_26_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_26_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_26_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_26_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_27_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_27_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_27_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_27_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_27_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_27_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_27_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_28_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_28_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_28_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_28_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_28_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_28_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_28_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_29_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_29_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_29_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_29_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_29_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_29_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_29_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_30_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_30_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_30_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_30_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_30_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_30_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_30_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_31_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_31_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_31_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_31_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_31_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_31_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_31_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_32_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_32_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_32_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_32_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_32_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_32_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_32_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_33_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_33_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_33_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_33_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_33_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_33_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_33_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_34_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_34_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_34_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_34_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_34_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_34_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_34_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_35_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_35_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_35_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_35_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_35_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_35_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_35_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_36_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_36_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_36_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_36_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_36_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_36_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_36_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_37_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_37_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_37_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_37_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_37_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_37_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_37_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_38_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_38_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_38_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_38_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_38_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_38_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_38_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_39_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_39_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_39_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_39_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_39_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_39_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_39_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_40_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_40_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_40_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_40_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_40_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_40_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_40_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_41_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_41_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_41_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_41_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_41_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_41_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_41_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_42_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_42_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_42_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_42_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_42_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_42_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_42_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_43_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_43_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_43_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_43_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_43_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_43_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_43_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_44_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_44_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_44_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_44_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_44_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_44_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_44_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_45_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_45_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_45_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_45_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_45_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_45_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_45_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_46_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_46_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_46_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_46_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_46_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_46_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_46_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_47_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_47_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_47_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_47_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_47_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_47_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_47_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_48_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_48_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_48_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_48_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_48_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_48_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_48_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_49_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_49_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_49_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_49_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_49_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_49_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_49_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_50_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_50_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_50_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_50_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_50_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_50_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_50_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_51_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_51_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_51_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_51_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_51_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_51_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_51_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_52_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_52_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_52_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_52_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_52_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_52_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_52_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_53_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_53_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_53_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_53_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_53_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_53_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_53_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_54_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_54_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_54_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_54_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_54_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_54_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_54_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_55_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_55_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_55_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_55_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_55_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_55_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_55_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_56_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_56_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_56_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_56_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_56_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_56_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_56_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_57_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_57_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_57_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_57_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_57_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_57_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_57_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_58_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_58_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_58_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_58_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_58_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_58_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_58_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_59_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_59_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_59_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_59_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_59_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_59_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_59_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_60_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_60_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_60_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_60_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_60_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_60_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_60_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_61_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_61_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_61_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_61_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_61_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_61_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_61_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_62_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_62_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_62_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_62_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_62_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_62_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_62_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_63_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_63_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_63_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_63_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_63_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_63_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_63_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_64_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_64_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_64_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_64_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_64_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_64_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_64_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_65_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_65_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_65_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_65_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_65_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_65_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_65_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_66_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_66_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_66_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_66_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_66_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_66_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_66_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_67_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_67_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_67_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_67_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_67_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_67_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_67_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_68_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_68_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_68_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_68_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_68_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_68_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_68_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_69_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_69_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_69_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_69_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_69_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_69_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_69_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_70_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_70_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_70_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_70_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_70_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_70_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_70_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_71_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_71_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_71_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_71_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_71_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_71_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_71_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_72_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_72_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_72_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_72_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_72_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_72_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_72_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_73_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_73_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_73_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_73_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_73_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_73_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_73_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_74_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_74_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_74_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_74_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_74_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_74_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_74_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_75_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_75_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_75_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_75_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_75_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_75_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_75_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_76_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_76_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_76_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_76_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_76_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_76_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_76_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_77_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_77_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_77_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_77_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_77_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_77_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_77_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_78_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_78_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_78_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_78_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_78_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_78_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_78_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_79_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_79_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_79_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_79_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_79_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_79_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_79_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_80_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_80_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_80_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_80_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_80_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_80_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_80_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_81_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_81_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_81_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_81_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_81_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_81_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_81_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_82_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_82_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_82_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_82_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_82_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_82_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_82_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_83_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_83_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_83_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_83_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_83_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_83_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_83_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_84_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_84_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_84_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_84_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_84_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_84_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_84_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_85_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_85_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_85_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_85_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_85_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_85_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_85_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_86_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_86_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_86_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_86_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_86_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_86_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_86_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_87_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_87_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_87_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_87_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_87_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_87_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_87_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_88_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_88_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_88_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_88_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_88_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_88_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_88_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_89_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_89_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_89_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_89_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_89_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_89_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_89_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_90_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_90_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_90_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_90_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_90_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_90_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_90_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_91_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_91_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_91_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_91_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_91_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_91_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_91_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_92_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_92_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_92_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_92_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_92_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_92_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_92_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_93_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_93_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_93_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_93_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_93_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_93_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_93_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_94_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_94_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_94_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_94_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_94_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_94_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_94_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_95_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_95_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_95_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_95_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_95_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_95_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_95_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_96_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_96_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_96_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_96_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_96_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_96_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_96_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_97_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_97_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_97_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_97_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_97_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_97_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_97_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_98_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_98_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_98_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_98_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_98_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_98_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_98_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_99_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_99_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_99_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_99_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_99_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_99_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_99_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_100_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_100_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_100_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_100_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_100_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_100_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_100_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_101_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_101_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_101_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_101_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_101_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_101_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_101_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_102_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_102_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_102_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_102_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_102_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_102_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_102_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_103_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_103_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_103_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_103_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_103_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_103_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_103_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_104_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_104_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_104_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_104_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_104_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_104_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_104_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_105_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_105_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_105_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_105_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_105_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_105_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_105_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_106_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_106_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_106_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_106_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_106_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_106_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_106_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_107_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_107_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_107_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_107_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_107_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_107_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_107_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_108_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_108_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_108_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_108_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_108_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_108_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_108_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_109_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_109_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_109_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_109_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_109_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_109_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_109_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_110_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_110_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_110_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_110_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_110_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_110_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_110_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_111_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_111_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_111_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_111_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_111_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_111_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_111_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_112_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_112_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_112_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_112_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_112_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_112_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_112_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_113_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_113_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_113_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_113_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_113_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_113_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_113_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_114_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_114_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_114_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_114_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_114_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_114_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_114_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_115_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_115_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_115_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_115_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_115_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_115_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_115_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_116_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_116_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_116_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_116_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_116_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_116_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_116_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_117_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_117_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_117_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_117_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_117_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_117_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_117_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_118_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_118_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_118_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_118_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_118_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_118_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_118_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_119_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_119_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_119_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_119_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_119_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_119_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_119_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_120_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_120_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_120_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_120_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_120_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_120_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_120_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_121_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_121_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_121_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_121_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_121_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_121_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_121_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_122_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_122_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_122_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_122_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_122_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_122_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_122_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_123_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_123_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_123_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_123_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_123_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_123_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_123_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_124_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_124_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_124_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_124_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_124_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_124_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_124_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_125_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_125_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_125_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_125_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_125_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_125_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_125_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_126_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_126_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_126_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_126_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_126_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_126_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_126_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_127_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_127_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_127_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_127_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_127_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_127_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_127_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_128_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_128_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_128_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_128_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_128_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_128_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_128_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_129_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_129_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_129_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_129_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_129_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_129_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_129_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_130_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_130_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_130_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_130_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_130_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_130_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_130_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_131_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_131_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_131_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_131_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_131_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_131_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_131_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_132_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_132_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_132_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_132_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_132_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_132_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_132_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_133_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_133_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_133_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_133_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_133_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_133_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_133_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_134_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_134_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_134_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_134_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_134_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_134_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_134_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_135_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_135_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_135_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_135_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_135_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_135_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_135_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_136_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_136_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_136_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_136_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_136_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_136_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_136_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_137_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_137_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_137_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_137_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_137_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_137_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_137_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_138_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_138_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_138_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_138_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_138_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_138_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_138_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_139_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_139_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_139_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_139_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_139_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_139_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_139_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_140_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_140_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_140_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_140_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_140_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_140_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_140_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_141_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_141_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_141_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_141_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_141_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_141_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_141_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_142_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_142_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_142_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_142_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_142_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_142_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_142_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_143_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_143_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_143_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_143_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_143_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_143_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_143_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_144_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_144_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_144_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_144_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_144_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_144_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_144_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_145_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_145_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_145_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_145_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_145_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_145_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_145_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_146_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_146_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_146_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_146_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_146_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_146_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_146_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_147_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_147_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_147_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_147_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_147_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_147_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_147_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_148_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_148_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_148_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_148_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_148_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_148_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_148_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_149_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_149_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_149_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_149_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_149_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_149_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_149_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_150_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_150_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_150_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_150_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_150_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_150_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_150_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_151_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_151_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_151_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_151_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_151_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_151_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_151_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_152_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_152_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_152_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_152_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_152_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_152_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_152_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_153_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_153_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_153_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_153_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_153_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_153_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_153_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_154_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_154_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_154_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_154_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_154_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_154_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_154_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_155_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_155_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_155_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_155_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_155_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_155_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_155_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_156_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_156_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_156_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_156_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_156_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_156_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_156_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_157_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_157_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_157_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_157_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_157_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_157_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_157_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_158_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_158_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_158_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_158_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_158_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_158_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_158_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_159_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_159_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_159_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_159_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_159_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_159_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_159_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_160_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_160_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_160_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_160_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_160_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_160_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_160_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_161_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_161_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_161_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_161_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_161_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_161_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_161_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_162_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_162_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_162_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_162_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_162_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_162_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_162_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_163_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_163_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_163_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_163_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_163_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_163_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_163_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_164_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_164_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_164_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_164_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_164_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_164_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_164_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_165_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_165_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_165_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_165_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_165_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_165_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_165_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_166_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_166_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_166_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_166_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_166_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_166_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_166_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_167_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_167_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_167_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_167_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_167_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_167_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_167_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_168_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_168_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_168_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_168_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_168_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_168_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_168_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_169_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_169_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_169_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_169_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_169_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_169_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_169_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_170_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_170_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_170_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_170_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_170_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_170_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_170_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_171_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_171_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_171_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_171_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_171_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_171_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_171_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_172_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_172_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_172_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_172_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_172_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_172_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_172_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_173_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_173_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_173_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_173_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_173_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_173_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_173_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_174_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_174_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_174_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_174_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_174_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_174_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_174_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_175_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_175_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_175_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_175_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_175_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_175_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_175_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_176_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_176_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_176_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_176_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_176_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_176_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_176_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_177_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_177_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_177_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_177_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_177_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_177_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_177_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_178_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_178_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_178_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_178_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_178_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_178_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_178_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_179_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_179_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_179_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_179_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_179_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_179_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_179_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_180_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_180_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_180_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_180_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_180_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_180_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_180_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_181_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_181_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_181_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_181_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_181_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_181_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_181_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_182_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_182_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_182_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_182_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_182_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_182_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_182_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_183_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_183_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_183_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_183_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_183_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_183_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_183_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_184_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_184_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_184_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_184_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_184_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_184_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_184_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_185_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_185_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_185_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_185_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_185_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_185_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_185_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_186_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_186_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_186_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_186_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_186_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_186_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_186_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_187_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_187_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_187_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_187_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_187_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_187_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_187_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_188_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_188_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_188_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_188_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_188_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_188_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_188_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_189_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_189_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_189_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_189_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_189_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_189_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_189_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_190_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_190_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_190_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_190_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_190_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_190_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_190_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_191_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_191_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_191_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_191_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_191_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_191_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_191_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_192_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_192_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_192_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_192_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_192_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_192_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_192_6.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_193_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_193_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_193_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_193_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_193_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_193_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_193_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_194_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_194_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_194_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_194_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_194_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_194_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_194_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_195_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_195_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_195_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_195_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_195_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_195_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_195_6.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_196_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_196_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_196_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_196_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_196_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_196_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_196_6.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_197_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_197_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_197_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_197_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_197_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_197_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_197_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_198_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_198_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_198_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_198_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_198_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_198_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_198_6.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_cloze", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "question": "Choose the best image for the missing blank to correctly complete the recipe.", "context": "The input images are the first 3 images and the other images are the candidate answer images. The missing image is just after 3th image. 
\nRead the question below and select from the following choices.\nA:The fourth image from the end\nB:The third image from the end\nC:The second image from the end\nD:The first image from the end", "input_image_path": ["./Discrete-temporal/visual_cloze/visual_cloze_199_0.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_199_1.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_199_2.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_199_3.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_199_4.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_199_5.jpg", "./Discrete-temporal/visual_cloze/visual_cloze_199_6.jpg"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/visual_coherence/qwen3-vl/metadata_info.json b/results/visual_coherence/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..d5823d9 --- /dev/null +++ b/results/visual_coherence/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. \n Fundamentally all you need is:EggplantTomatoesMozzarellaOlive OilSaltPepperYou can also use something to season the olive oil. I had some basil from the farmer's market I needed to use up, so I decided to to add basil and garlic. Italian seasoning would also work very well.Pay no mind to the chicken in the picture - I used it in my meal but not in this dish, so I don't know why I put it in the picture.. You will be basting your eggplant and tomatoes with olive oil. You can use plain olive oil if you want, but it'll be more flavorful with herbs mixed in. 
If you're gonna mix things in, do it now so the flavors can mingle awhile while you prep the rest of the ingredients.\nI had some basil left over from the farmer's market, so I used that and some minced garlic. This would also be good with your favorite blend of Italian seasoning.. Cut the ends off of your eggplant and slice it the long way. Make sure that you cut all of the slices the same thickness so that they all cook evenly. Mine are about 1/4\" thick, and I wish I had a mandolin slicer so they'd be perfectly even.\nI think it would make a nicer presentation cut into rounds, so you can do that if you want, but it's easier to grill when it's sliced the long way\nEggplant (especially large ones like this one) can be bitter, so to prevent that you need to draw out the juices. Place your eggplant slices on paper towels and salt them generously on both sides. Then, place another layer or two of paper towel on top. Let them sit, and if the paper towels are too saturated, switch them out. . Slice your mozzarella into thin peices (about 1/8\" of an inch thick). If you are using a simple grilled tomato like I did, slice it into hearty pieces. I had two tomatoes, and cut them into 5 big slices with a little bit extra.If you are feeling more ambitious of having company over I would deffinately reccomend roasting your tomatoes using Canida's instructable. It would also work well with a fresh tomato sauce.. You've got extra tomato, plenty of sliced mozzarella, and some flavored olive oil. Make yourself a little appetizer!. Preheat your grill.\nAdd some salt and pepper to your tomatoes, and some pepper to your eggplant (it's already salted). Baste one side with oil.\nPut your veggies oil side down on the grill and cook for three minutes with the cover on. \nNote: you only see eggplant in the photo of this step, but I should have cooked my tomato longer so I would put it on at the same time as the eggplant in the future.. 
Baste the other side of the veggies and flip them over, then cook them another 3 minutes.\nFeel free to re-baste if you wish. When I flipped the eggplant I added my tomatoes to the grill. I cooked them 1 1/2 minutes on each side, but I think that I should have cooked it a little longer.. Flip the eggplant over again. Place the tomato on top (or sauce if you prefer), and the mozzarella on top of that. Cook it for 1-2 minutes with the cheese to get a little melty.. Your eggplant is ready to serve! I served this as a side dish with grilled chicken, but it would also be delicious with a lightly sauced pasta or a smaller grain like couscous. It would also be yummy in a sandwich.\nEnjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_0_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_0_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_0_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_0_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Make sure you have everything you need before you start. 
There is nothing worse that having to stop in the middle of the process to get something you forgot.Ingredients:\n1 cup unsalted butter, softened\n1 cup brown sugar\n1 cup white sugar\n2 teaspoons vanilla extract\n2 tablespoons milk\n2 eggs, lightly beaten\n2 cups flour\n1 teaspoon salt\n1 teaspoon baking soda\n1 teaspoon baking powder\n2-1/2 cups old-fashioned oats (not instant)\n12 ounces semisweet chocolate chips \n1-1/2 cups chopped walnuts Tools:\nmeasuring cups\nspoon\nspatula\nlarge mixing bowl\nsmaller mixing bowl\nstand mixer (if you have one). Put the butter out on the counter about an hour before you start. The butter is ready when you can push a dent into it with a finger. \nCutting the butter into half inch pieces can speed up the thawing\nIf you forget of can not wait the butter can be placed into the microwave. \nStart with 10 seconds until the butter is soft.. Pour 1 cup brown sugar and 1 cup white sugar. Mix the sugar together before adding other ingredients. \nOnce the sugar is mixed add 1 teaspoon salt, 1 teaspoon baking soda, and 1 teaspoon baking powder. \nMix the ingredients again and then add 2 cups of flour. Mix the dry ingredients one last time and set them aside.\u00a0 . Crack open and lightly whisk two eggs. Then add 2 teaspoons vanilla extract\n2 tablespoons milk. Lightly whisk again and set the wet ingredients. . When mixing all of the ingredients together pour 1/4 of a cup of the dry mix into the bowl. Next put the butter in the bowl. Pour the rest of the dry mix into the bowl followed by the wet ingredients. Then mix the ingredients together until they form a dough.\u00a0 \nMake sure to scrape the side of the bowl making sure to mix all of the Ingredients into the dough.. Measure out 2 and 1/2 cups old-fashioned oats. Do not use instant oats. Pour the oats into the center of the dough. Then mix the oats into the dough.. Measure out 1 and 1/2 cups of chopped walnuts. Pour the walnuts onto a chopping board. 
Chop walnuts into smaller pieces. Pour the walnuts into the center of the dough. Then mix the walnuts into the dough.\nA good way to tell if the walnuts are chopped enough is to pass them through the cutting board handle. If the walnuts do not fit then they are not chopped enough.\u00a0 \n\u00a0 \u00a0. The last ingredient to mix into the dough is the chocolate chips. Measure out 12 ounces semisweet chocolate chips. Pour the chocolate chips into the center of the dough. Mix the dough until the chips are incorporated into the dough.\nOne bag of chocolate chips in a supermarket is usually 12 ounces so you can just pour the whole bag in. . Once the all of the ingredients have been mixed into the dough scrape the dough off the sides to make sure everything has been mixed. Then place the dough into the refrigerator for 1 to 24 hours. \n24 hours is ideal but the even after 1 hour the dough should be firm enough to put on the cooking trays.. Preheat the oven to 350 degrees. Well the oven is heating up take the dough out of the refrigerator. Take a table spoon and begin scooping out the dough and placing the dough on baking trays. \nFor best results place dough balls at least 1 inch apart from each other. \u00a0 . Once the oven reaches 350 degrees place the baking trays into the ovens. Set a timer for 10 to 12 minutes. \nAfter placing the baking trays into the oven set up the cooling trays to place the cookies on when they are done baking.. When the timer goes off remove the trays from the oven and allow them to sit for about 2 minutes. After 2 minutes remove the cookies from the baking trays and place the cookies on the cooling rack.. 
When the cookies cool it's time to eat.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_1_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_1_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_1_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_1_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Tools:Small hammerDrill1/8\" nail3/8 drill bit (wood bit works best)Round fileVice (or other flat hammering surface)Jar supplies:500ml mason jars with 70mm size opening Colored drinking straws Approximate Costs:A dozen 500ml/70mm jars are about $8Straws are about $2Four Handled 500ml/70mm jars are about $8*Always wear eye protection when using tools. Drill a hole in each lid. Slightly off-center is best. Use the drill at high speed but push softly for the cleanest hole. Pushing too hard and fast will cause the hole to be jagged and mangled.. Use the round file to file down any rough edges or burrs. Insert the file and at an angle, run the file around the hole, rotating the file as you do so.. On the vice/hammering surface, hammer the hole a few times to further smooth the hole edges. You should be able to run your finger over the hole and it should be free of any edges.. Repeat this for all your jars.. If you mess up any if the lids, you can buy replacement tops. Also be sure to empty out the jars of any drill shavings. Please wash every jar for safety and sanitary reasons.. 
For a tropical theme, add an umbrella hole!Use a 1/8\" diameter nail to add an umbrella hole opposite the straw hole. Be sure to also flatten the back side of the lid with a hammer to eliminate any sharp edges.. After the jars are all complete, insert your straws and they're ready to go!You can decorate the jars with clear labels that you can print on with your inkjet printer at home. Or use stickers or even spray paint them with a stencil!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_2_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_2_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_2_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_2_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Fresh asparagus Seasonings (I use garlic salt)Extra virgin olive oilZiplock bag. Cut the base off of each asparagus stalk at a 45 degree angle. This part of the stalk is hard and not good to eat. It's usually white or brownish purple.Once cut, place them in a large ziplock bag. I use the gallon size. After they're all in the bag, pour some extra virgin olive oil. I don't measure the oil. Pour enough to coat all the asparagus. Seal the bag and shake/mix until all stalks are coated. Let it sit a bit. Every now and then shake up again. When I grill, prepping the asparagus is the first thing I do. That way it can absorb the oil and get nicely coated while grilling the rest of the food.. When grilling, asparagus is the last thing you should grill. 
You want to eat it as soon as you can after removed from the grill. It's best hot. Tonight I grilled my pork chops first. Then I grilled the asparagus and other veggies.Put the grill on medium heat and use tongs to place the asparagus on the grill. Be careful because the oil is flammable. Make sure to place the asparagus at an angle so it doesn't fall through the grate. The object is to get each stalk browned but not burnt.I close the lid and let them cook for five minutes. After five or so minutes, move the asparagus around so it cooks evenly. It easily rolls around so there is no need to actually flip it. At this point I usually let it go for another few minutes. If you use higher heat then make sure to watch and tend the asparagus to prevent burning. When it's browned/seared to your preference remove from the grill.. Eat as soon as you can for optimum deliciousness.Thanks for reading. Roll Tide!!!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_3_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_3_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_3_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_3_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Ingredients:\n\t\tFlour\n\t\tCanned biscuits\u00a0\n\t\t2 servings Fruit roll ups\nPlease note: \u00a0You will need 2 biscuits per serving to figure out how many cans you will be baking. \u00a0. 
Tools:\n\t\tDough board or cutting board\u00a0optional\u00a0\n\t\tKitchen scissors\n\t\tAltoids can for the Tic-tac-to\n\t\tSkewer\n\t\tSeveral different sizes and shapes of cookie cutters\n\t\tBaking sheet\n\t\tKnife. Instructions:\n\t\tPre-heat oven following the instructions on the can.\n\t\tFlour dough board.\nNote: If making these for the first time I would lower the baking temp and baking time because the smaller shapes bake quicker and you might want to remove them sooner.\n . Cutting the shapes for the\u00a0puzzle:\n\t\tLay the biscuit on the floured surface and pat flat or use a rolling pin.\n\t\tCut out the shapes with the cookie cutter.\n\t\tBefore removing the cookie cutter move it gently back and forth and side to side to make the opening a little larger but still keeping the shape.\nPlease note: I baked the shapes separate from the biscuit, However I think it might be best to keep the shapes in the biscuit after they are cut and wiggled. This might make it work slightly better. I noticed some of the shapes did not fit as nicely but they still worked.\u00a0\nIf they cook together for some reason you could use a knife, \u00a0and gently trim them to remove from the biscuit. The next time I make these I will try baking half with the shapes in and the remaining half with the shapes out.\u00a0\n\u00a0\n . Cutting the shapes:\n\t\tYou will need to flatten 1 biscuit for the board and leave it uncut.\n\t\tUse\u00a0another biscuit for the stars and hearts.\u00a0\n\t\tYou will need 2 biscuits per serving.\u00a0\n\t\tLay the biscuit on the floured surface and pat flat or use a rolling pin.\n\t\tCut out the shapes with the cookie cutter. 
\u00a0You will need 2 different shaped mini cutters.\n\t\tPlace the entire biscuit with the shape intact on a cookie sheet.\n\t\tBake the biscuits according to the instructions in step 3.\n\t\tI would bake \u00a0them on a slightly lower heat and also shorten the baking time, so you can keep an eye on them and remove them from the oven as soon as they are done.\n\t\tRemove from the oven.\n\t\tAllow to cool just enough not to burn your fingers.\n\t\tCarefully using the skewer push the shape through the biscuit.\u00a0 \u00a0\nBake the biscuits according to the instructions in step 3.\n\u00a0. Cutting the fruit roll ups:\n\t\tTrim 5 thin strips from the fruit roll up.\n\t\tPut one strip on the wax paper the fruit roll up came on.\n\t\tRoll it on the paper until all of the strip is covered with the paper.\n\t\tRepeat this with the rest of the strips.\n\t\tRoll it into a long small roll. \u00a0\n\t\tFold it over and place it in the Altoids can.\n\t\tPlace the shapes in the can.\n\t\tThe kids set up the board and then play the game!. The kids will enjoy decorating the board and playing the game. \u00a0The younger kids will have fun putting the puzzle together. \u00a0The cool thing about these are they are not messy at all, until they are ready to eat. \u00a0I hope this tutorial has motivated you to dig out the biscuits and start baking. \u00a0I had a lot of fun baking them. \u00a0\nThank you for stopping by and do visit again! \u00a0I am always tinkering here at Instructable's \u00a0and will have new tutorials in the near future.\nTake care and have a\u00a0stupendous\u00a0day! 
\u00a0\nSunshiine\u00a0\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_4_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_4_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_4_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_4_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. As mentioned, some are optional. See notes on each.The Dry300g (2.7cups) White Self Raising flour. We have also had success with standard wholemeal flour, but if you don't use self raising add some baking powder (1-2 teaspoons) so it rises.80g (0.7cups) Coco/carob powder. This can be 100% coco powder (let's call that be beginner level cake). We like the additional complex complex flavour of carob so use about 60g coco to 20g carob powder. 125-175g (0.6cups) Sugar. So we have been trying to reduce this (you know sugar is the lord of pain right?). Our experiments put a bit of a limit on it at 100g, after which people don't tend to identify it as cake and there are structural issues (though I still like it). So yeah 140g or more of sugar for the beginners cake, we went with 100g.2 tsp bi-carbonate of soda and a pinch of salt. To help with raising. 30g Dessicated coconut. Optional - make for interesting textural addition. They act like binding fibres.Small handful of cocoa nibs. Optional, can also use nuts or just leave it out.The Wet100ml (0.4cups) Whole organic milk. Lets not start the pasteurisation debate now.3 free range eggs.350ml (1.4cups) melted coconut oil. 
You can also use vegetable oil (but not sunflower oil, it tastes funny) but it's different A dash (1tsp) of vinila. Optional, not necessary.The Ambiguously Wet-dryOne average sized courgette. About 400g in weight. One fiery chilli pepper, or some chilli powder to taste - this is optional if you hate spicy food leave it out, no probs. Approximately 30 ripe blackberries. Another optional ingredient, You could also use blueberries (though I don't see the point as blueberries always seem to get lost in cakes and muffins, why are they such a popular choice???) or my favourite choice raspberries, or some other zingy fruit. Raisins work well too. . Two mixing vesselsA scales - or some cups or whatever witchcraft unit measuring system you go by.Measuring jug for the milkA cake tin (or muffin cases - the same mix can make muffins). Oven Whisk. Concrete is easier to mix when you mix the ballast and cement dry before adding the water (well unless it is all very dry, then the dust is unmanageable). Anyway, same principal, mixing the dry ingredients first means no clumpying so you can get them together with ease. Weigh them out. Put them in your big mixing bowl. If you are being pro you can sieve them in. Keen enthusiasts just shove it in there, it's just the same ;) . Melt the coconut oil (put the jar in hot water for a while) if it is solid, and mix it up with the milk, 3 eggs and vanilla.A whisk can be used, as can a wide variety of food processors and diy drill powered devices. A lowly fork will do fine too. . Time to get your grating freak on! We use the course side of the greater to great the courgette on top of the dry mix. Then the fine side to do the fresh chilli (you could also use chilli powder - which also works great for baking).Be sure to scrape the inside of the greater to get it all out, most of the chilli will cling there. Then rinse your hands before you touch eyes or nose with chilli hand.Add your chosen fruit - in our case blackberries. . 
We are going to mix all the mixes by adding the wet mix to the dry and stirring it up. It shouldn't take long, and a sturdy spoon is all you need. Now check your consistency. It is a fairly runny batter for a cake, think slightly too sloppy brick and block mortar. a viscosity similar to thick pouring cream, or quite stiff vomit.Taste a little bit like a pro, and smile! (assuming your not the ocd hygiene type - in that case spit that back out!) . Grease proof paper and grease your cake tin. Observe illustrations. Avoid using petrochemical derived grease - coconut oil works well. . Time to reap the rewards of your mixing. Pour the mix into the tin! Lick the mixing bowl clean...Put the cake in the pre heated oven at 180C, 357F, gas mark 4.Set an alarm for one hour (or less - see next step). . If you oven is like ours, and the chances are it's not, check the cake after one hour and follow testing procedure. If its a fan assisted oven, turn the fan off. If you can't turn it off, check the cake after 45 minutes instead of an hour. If you went for the muffin option, they will be done a lot quicker, check after 30 mins.Testing procedure:Don ppe (oven gloves), stand back and open oven. Hold face away from initial vigorous gas exhausting. After initial off-gassing peek inside. Shake the cake tin with with oven glove. If it wibbles like jelly, leave it in the oven and re-start the checking procedure in 20mins.If it isn't over wibbly remove and place on heat resistant surface and continue with prod test.Prod the centre of the cake with wooden prodder (or sharp knife), if it comes away clean, it is done, leave to cool. If uncooked cake mix remains on your prodder replace cake in oven and repeat prod test in 10mins.If the surface shows signs of burning, remove cake from oven and consult a qualified professional. Steaming is normal, but if on fire and visible flames can be observed then call for assistance and raise the alarm. . 
Leave to cool, The longer you leave the more likely the cake will retain a fine and sturdy structure when you remove the tin. That said, if you are getting desperate and this whole affair is taking longer that expected, it is edible as soon as you can bear the temperature.Once cooled, test a portion of the cake to check it for taste and yummyness. Be sure to have a representative cross section. If deemed suitable, serve delicious chocolate cake to friends and family.I recommend telling them it is courgette chocolate cake after they have tried some as imprecise expectations of the taste can colour participants enjoyment.See photos for some serving suggestions, and please do add your own pics and modifications below. We'd love to hear about your twists on this one. \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_5_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_5_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_5_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_5_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. You will need: - A dog whose birthday it is, or a human who likes dog food - A dish the size of the desired cake - A banana - Some kibble - Dry oats - Mayonnaise - Food colouring - Wet dog food - Various kitchen tools. Grind up your kibbles. The amount will differ depending on the size of your cake, aim for about 1/3 of the final cake-volume. Here I used a magic bullet. But, any blender will work. 
If you don't have a blender, try a mortar and pestle, or use something heavy to smash them into powder. Get them as fine as you can - we are trying to mimic wheat flour.. Add some wet ingredients to hold it together. I used a banana because my dog likes bananas. If yours doesn't experiment with a few sticky things until you find something they like. Honey, peanut butter, molasses perhaps. Add just enough to hold everything together. You will play with the amount later. Add the wet dog food. Use about as much by volume as your dog-food flour.. You will need to balance the texture to make sure it stays up. Add a little milk or water if it is too thick. Add your oats, or more kibble flour if it is too runny. In the end, your batter should be about the consistency of play-dough. It should be firm, and hold its shape without assistance.\u00a0. Oil the inside of your cake pan and press your batter into it. Place the pan upside down on a plate, and tap the cake out.. Mix mayonnaise with your choice of food colouring to create designs on your cake. Use white or coloured icing as a base frosting.. Grab your mutt and celebrate!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_6_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_6_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_6_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_6_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 
Here's the basic recipe.\u00a0 Start by making this:\n(this is essentially a double batch...it will yield about 70-75 cookies...depending on variety)\nYou will bake 375* F for 8-10 minutes...so go ahead and preheat the oven!\n3 sticks of Butter (1 1/2 Cups)\n1 1/2 Cup Brown Sugar\n1 Cup granulated Sugar\nMix and cream\nThen add:\n3 eggs\n2 tsp Vanilla\nmix together\nThen add:\n1 tsp Baking Soda\n1 tsp Salt\n2 Cups Oats\n4 Cups Flour\nMix together. Here's what we're going to do.\nAdd 2 full bags of chocolate chips.\nI used the top 2...dark chocolate and mini's...the melts were used later.\nThese are loaded.\u00a0 But essentially it is a double batch...so 2 bags is just right!. Split up the dough into 5 bowls.\nEach bowl will make a different variety of chocolate Overlord cookies!\n(yes, these aren't just overload cookies...they are Overlord Cookies). First:CHOCOLATE CHIP COOKIES!\nRoll out balls of dough about the size of a ping pong ball.\nWe did 15 per tray. Tray is lightly greased...but I don't think it needs it.\nBake 8-10 minutes at 375 F.\u00a0 I do 8 minutes\nThen remove them and allow them at least 5 minutes before touching them!\nThey need to completely finish cooking...they will be gooey until they cool.\n(don't judge my pans...if you know how to clean them up perfectly...\ncome over and clean them, I will give you cookies!). Next we add some fun!. SMORE'S COOKIES!\nMake a tray of regular cookies.\u00a0 Bake 8 minutes\nPull out of oven and while gooey, place 3 marshmallows on\ntop with one baking melt chocolate disk for looks!\nThen pop them under the BROILER for just a minute or\ntwo until the marshmallows are toasted!\nGolden Perfection!. COOKIES AND CREAM\nStart with your cookie dough and oreo cookies...\nwrap an oreo completely in a \"thin\" layer of cookie dough, covering it completely!\nThese turn out quite large!\u00a0 We fit 8 on one pan.\nThey bake up perfectly with all that oreo goodness inside!\nThese were way better and bigger than I expected!. 
SWEET AND SALTY\nTake the Chocolate Chip cookie dough and add broken up pretzel sticks to it!\nMakes a sweet and salty awesome flavor!\nRoll out and bake the same as the regular cookies!. TURTLES\nBake a batch of regular cookies, like the smore's ones.\nPull out after 8-10 minutes and lightly press a pecan or two on top.\nThen drizzle with caramel topping!\nLet cool at least 5 minutes before plating!. Then plate up all your gourmet cookies!\nAdd some little name sticks so your guests know what they are getting into!\nOkay, so yes...you did the math right.\n15 cookies of each variety except the Cookies 'n Cream...only 8 of them\nGrand total: 68 cookies!\nAwesome spread for 1 simple batch of cookies!\nIn a blind taste test...the 8 year old and 10 year old loved the\nCookies and Cream the best!\nFollowed closely by the Smore's!\u00a0 :). Best part about these cookies is they FREEZE!\nThe dough freezes, the cookies freeze...you don't have to eat them all in one night!\u00a0 And they taste good frozen!\nNow you can have a party spread with only the time spent making a batch of cookies!!!\nThanks for viewing, which one do you think you'd like the most???\nVote for me in the Cookie Contest...I'll make you some cookies!\u00a0 :)\nCheck out my blog for more silliness!\u00a0\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_7_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_7_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_7_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_7_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the 
following sequence of images.", "context": "Here is the context:\n. 8 Hardboiled eggs- peeled ( you can buy peeled hard boiled egg from the store)Tomatoes - 2 Nos medium sizeFrozen or fresh Peas- 1.5 cupsOnion- 1large Ginger- 2 small pieces about one inch eachGarlic- 6/8 large onesCorriender Leaf- 1/4 cup to garnish at the endSpicesPaprika Or Kashmir Red chili( For color) or adobe - 1 table spoon Cayenne - Depends on taste Salt - on taste clove-4-6 Cinnamon-1 half inch or 1/4th teaspoon Cumin seeds- 1 table spoon Corriender Powder - 1and 1/2 table spoon. Garam Masala- 1 teaspoon or according to taste Cardemom - 2 pod or one pinch of you are using powder.(optional) You can get these spices at any Indian Grocery store or in the international food isle of big grocery stores carry it . . First think first remember to enjoy your cooking and there is no rules for cooking. You improve as you keep trying. Keep experimenting. If you don\u2019t like corrender add parsley, if that is what you fancy. In case you dont like peas add edmame. The most important lesson is to enjoy the journey rather than worring about the end destination. Feel like a chef, put on some music, open a bottle of wine and cook by yourself or with your friend. Furthermore, cooking is like meditation, a form of mindfulness. It will relax you and take you away form the daily stress.. 1. First Hard boil the egg. - One can do that by boiling eggs in a large pot of hot water for 15 to 20 minutes. Let it cool and then peel the skin off. Store bought hard boiled are easy and quick as they are even peeled.2. Now take these egg and slit them gently along the side from top to bottom in the center about one inch slit. do it on all 4 side. Don\u2019t worry of it is tough and you egg broke into half. Just get a fork and make whole all along the egg.3. Now heat oil and add all the eggs and REMEMBER to cover it with the lid. With the lid closed hold the handle and gently shake the pan so that the egg rotate. 
Do this for couple of minute till the eggs turn light brown about 3 to 5 minutes. Keep the eggs in medium heat also u will hear lot of spluttering sound which is ok. Switch off the heat and keep the lid on for few minutes.4. Now remove the lid and take the eggs out in a plate. Save the oil.Alternatively, u can skip frying the egg if you find it dificult. just take the egg poke holes on it with fork and keep it aside and follow steps to make the gravy . 1. In any blender puree all the onion, ginger, garlic, and all the spices except salt into a nice paste. Add little water if needed to make a fine paste.2. Now add the puree in the pan with the remaining oil. Turn the heat on and fry the paste for about 10 minutes in medium heat. At this point you will notice that the paste is leaving slight oil. 3. Puree the tomato and add it to the paste along with the salt.. 4. Fry it for another 5 minutes. 5. Now add 1.5 cups of water to the paste and in the gravy(if needed add more water). Then add peas and eggs. 6. Bring it to a boil. Then lower the heat, cover the lid stirring every so often to check the gravy. If the gravy is too watery increase the heat and remove the lid, let the water evaporate. 7. Once the gravy comes to a thick consistency switch off the heat and add finely chopped coriander leaves. Serve this with Naan bread or pittas. It tastes very nice over a bed of rice. 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_8_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_8_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_8_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_8_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Plug the blender in. Get a banana and peel it and then cut it up. Put it in the blender. Get an apple and cut it then put it in the blender. Secure the lid. Start mixing. Get a spoon stir it and put it in the cup. Leave it for a bit so it is not puffy. Put it in the fridge for about 30mins. Mmmmm lovely\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_9_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_9_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_9_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_9_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Chocolate sandwich cookies like Oreo or HydroxVanilla Ice Cream 2 pintsWhipped ToppingChocolate Syrup. 
Put sandwich cookies into a plastic bagCrush themSave 3/4 cup of mixture for toppingPour crushed cookies into the bottom of container. Let the ice cream softenScoop it out onto the cookie mixture crustSpread evenly over the top. Add the whipped topping to the top of the ice cream. Add the reserved cookie crumbs distributed evenly over the whipped topping.Drizzle chocolate syrup over the top. Freeze for at least one hour.It will be difficult to cut but will soften quickly in the heat.Running a knife through hot water before cutting will ease the process as well.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_10_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_10_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_10_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_10_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Only a few tools of the trade needed for this delicious dish.PlateButter KnifeCutting board or paper towel to put underneath the wrap (optional). The ingredients you will need include the following:A soft Burrito tortilla NutellaHoney. With the butter knife, spread Nutella all over a flat, round tortilla. Get out your honey and drizzle it out all over the nutella. How much do you say? All depends on your preference.. Roll your masterpiece like a Burrito!. Serve It and Eat It!I hope you enjoyed this tutorial on how to make a Nutella and Honey Wrap Burrito! 
This process might get a little messy so I would advise you to put a cutting board, a paper towel, or a napkin underneath before you make it. One Nutella and Honey Wrap Burrito will serve on person each, so if you are making them for a party I suggest making a few of them. Stay tuned for more Instructables by the DIS DemonsBe sure to follow the DIS Demons @ MYWLAKETECH.Made by DIS DemonGa349140\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_11_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_11_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_11_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_11_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 1) 1/2 lb of bacon2) 2 white onions3) 2 tomatoes4) 2 Serrano peppers5) 2 quarts cooked pinto beans (see link to previous Instructable -> Pinto Bean Instructable )6) 4 cups water7) 1/2 cup of cilantro leaves8) 1 tbsp ground cumin9) Salt as needed. Cut the bacon into 1/2 inch strips. Cut the white onions into 1/2 in cubes. Cut the tomatoes into 1/2 inch cubes.Cut the Serrano peppers into 1/8 semi-circles. Brown the bacon in the pot. After the bacon is browned, add the onions and peppers into the pot. Cook until onions are translucent.. Add the tomatoes to the mixture and cook for another minute.. Add the 2 quarts of cooked pinto beans to the pot and 4 cups of water.. Add the 1 tbsp of ground cumin to the mixture.. Add 1/2 cup of fresh cilantro leaves to the mixture.. 
Even though the beans and the bacon have salt. I always check the salt of the beans at this point. The onion and tomatoes may dilute the seasoning of the entire mixture. I checked this batch and I had to add 1/2 tbsp of salt to get it to my preference.. Let the mixture simmer on low heat for 30 minutes.. Frijoles Rancheros are served as a side item to a main dish in a nice deep bowl.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_12_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_12_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_12_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_12_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. To make this scrumptious snack, you are going to need the following foods:Some kind of crackers: I highly recommend wheat Thins, as those have been my favorite out of all the crackers that I've tried so farCheese: I use cheddar cheese, and the only type of cheddar that I like with this is sharp cheddar cheese, so that's what I use and recommendPickled Jalapenos: They can be in a jar or in a can, I prefer them in a jar, but the ones I use were in a canAs a side note, I used a couple of mason jars to put the jalapenos in after I opened the can, so you might want a couple of those handy.. Now you just need to open up your can of jalapenos. Once it's open, then you can transfer the jalapenos into a mason jar for storing. This step is quite self explanatory... 
Once in the mason jar, you can refrigerate your jalapenos if you like.*This is unnecessary if you got jarred jalapenos to start off with... Its actually really completely optional, but recommended. Once again, another self explanatory step. All you really need to do is get your block of cheese, open it, and then cut a few slices from it. If you want, you can cut it into the size of the crackers. Once done cutting, you can easily store your cheese by getting a plastic sandwich bag, and just putting it over the open end of the block, then refrigerate your cheese.Oh... and uh, try not to cut yourself... it hurts. Now, for the good part... The construction of this delicacy. Get a cracker, and lay it downAdd a slice of cheese that is cut about the size of the cracker, and lay it on top of the crackerThe best Part!!!! ADD THE JALAPENOS!!!!You can also make a nice plate of this, just add some cheese, crackers, and a bowl of jalapenos to a plate and set it out for you and your friends, or... maybe just for you, either way, it's always good.... Now eat your amazing snack. In case you aren't too accustomed to a lot of spiciness, then you might want to have a glass of milk handy nearby... just saying. 
And if spicy isn't really your thing, well, umm, I recommend not eating this, you can go find another snack to munch on.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_13_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_13_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_13_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_13_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Ingredients for the Mushrooms' Body and Top\n\u00a0\u00a0\u00a0 1 cup butter (two sticks...yes, two sticks!)\n\u00a0\u00a0\u00a0 1/4 cup powdered sugar\n\u00a0\u00a0\u00a0 1/2 cup pecans (walnuts or hazelnuts work as well)\n\u00a0\u00a0\u00a0 1 3/4 cups all-purpose flour\n\u00a0\u00a0\u00a0 1 teaspoon pure vanilla extract\n\u00a0\u00a0\u00a0 1/4 teaspoon salt\n\u00a0\u00a0\u00a0 Pound cake (a frozen loaf is fine, that's what I used)\n\u00a0\u00a0\u00a0 Black decorator\u2019s gel\n\u00a0\u00a0\u00a0 Red and green food coloring\n\u00a0\u00a0\u00a0 White frosting or white fondantIngredients for Royal Icing (Using Egg Whites)\n\u00a0\u00a0\u00a0 1 3/4 cup powdered sugar\n\u00a0\u00a0\u00a0 1 teaspoon lemon juice\n\u00a0\u00a0\u00a0 1 egg white\nI used the above recipe, but if you have young 'uns in your household, you may want to try the below recipe for royal icing instead:Ingredients for Royal Icing (Using Meringue Powder)\n\u00a0 2 cups powdered sugar\n\u00a0 1 1/2 tablespoons meringue powder\n\u00a0 1 teaspoon lemon juice\n\u00a0 1/2 warm waterHot Tip: As with all things involving 
royal icing, there will be long drying times. If you\u2019d rather do these recipe in two parts (which is what I did), the top part of the cookie can be made the day before and stored in an air tight container. Then you can do all the frosting fun the following day.. To start off the cookies, beat together the softened butter and sugar until the mixture is fluffy and looks like mayo.\nOnce you have your bowl o'mayo, add the vanilla extract.\nNext, mix in the salt and flour until everything is well combined.\nAdd the pecans (or whatever nut makes your skirt fly up) and stir.\nWrap up the dough in plastic wrap and pop in the fridge for a hour or so for it to firm up. Cooling the dough isn\u2019t required, but it does make the ball forming a tiny bit easier. And isn't life hard enough to not have to fight with sticky dough?. Once you\u2019re ready to bake, preheat oven to 350\u00b0F.\nForm into one inch balls and place on a cookie sheet.\nWhile baking, they\u2019ll flatten out a tiny bit to give the mushroom top shape.\nBake for about 15 minutes, until the cookies are firm and the bottoms are golden brown. Allow the cookies to cool.. Creating the bottom part of the mushroom is easy!\nGrab a circular cookie cutter that\u2019s a little smaller than your baked cookies.\nTake a slice of pound cake that\u2019s about 1/2\u2033 thick. You'll get about a dozen and half slices out of your pound cake.\nIf you feel your energy lagging, feel free to test out one of the cookies you just baked! Delicious!. We have the shapes, now we need to decorate!\nWhen you\u2019re ready to make the royal icing, beat the egg white and the lemon juice together for a few minutes. Add the powdered sugar to the mixture until you get a thick icing. If it\u2019s too thick, add a few drops of water.\nLay down some tin foil and coat the sides of the pound cake pieces with the icing. 
If your icing is too thin, add a bit more powdered sugar to the icing mix.\nAllow the freshly iced pieces to dry for about an hour.. This next part gets quite fun and messy, so you\u2019ll want a lot of paper towels and a sink nearby!\nDivide the reminding icing into two bowls. Dye one green and one red.\nDip the cookies into the icing, making sure to get the tops fully colored. Place the iced cookies on a tin foil sheet. Don\u2019t worry about the icing run-off from the cookies, we\u2019ll take care of that next.\nAllow the cookies to sit for an hour.. Once the royal icing has set, move the cookies from their rather-messy location over to a clean sheet of tin foil. This way the icing run-off won\u2019t harden to the cookie. If the extra edge icing doesn\u2019t fall off the cookie when you pick it up, knock the edge of the cookie on the counter-top.\nThe cookies should sit for another 30 minutes to finish hardening, as the icing around the edge of the cookie will still be soft.\nTo finish the top of the mushroom, draw circles with white frosting or use fondant. I used tiny balls of fondant and pushed them onto the cookie.. To finish the bottom part of the mushroom, draw on eyes with black decorator\u2019s gel.\nWhew! Now to finally put them together!\nTake a bottom part of the mushroom, add a little frosting to glue the top part on, and then push on the cookie.. 
Makes about dozen and half delicious Super Mario Mushroom Cookies!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_14_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_14_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_14_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_14_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Here the ingredients:\nMilk\nChocolate Flavored Powder\nVanilla\nHere and the materials:\nCoffee cup\nSpoon (2)\nI c. measuring cup\nOnce you have that, you ready to move on!. Pour in 1 cup of milk into the measuring cup.\nThen, put it in the microwave for about 1 min. and 30 sec. You can put it in for longer if you choose.. Once you're done heat the milk, put about two drops of the 100% Pure Vanilla extract. Stir. Then put in two table spoons of the Chocolate Flavored Powder, stir. \nYou can add more Choclate powder if youwant but try not to add a lot of Vanilla, it can over power the entire drink.. The Pour the whole mix into the coffee cup and you're ready to relax, with your new favorite drink. \nYou can toy around with other ingredients. You can add cinnemon to get give a another different taste, but try to keep to to basics of the original recipe. 
\nEnjoy.\n~D\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_15_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_15_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_15_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_15_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. To properly roll the cookies into their ice cream cone shapes, you'll need a conic form that can withstand some time in a 400F oven.\nThe cookbook suggests a #35 4 1/2\" coronet mold, but since this was a one-off for a French Laundry themed party we decided to make our own out of paper.\nAfter some rummaging, I found a 4\" diameter circular object for tracing (the base of a pitcher) and made some circles on a manila folder. I also made one on a sheet of glossy paper, the thick stock used as the cover of an expensive yuppie magazine we magically get for free. Note that I'm NOT putting the glossy stuff into the oven for fear of toxic bleeding or outgassing; that's what the manila folder is for.\nDraw another circle on the glossy paper ~1/2\" outside the original circle, and add a tab. Now cut around the outside circle and inside of the 4\" circle to make a 4\" diameter stencil.\nCut out the manila circles; I used 5. These need to be shaped into cones for use as your forms, so you've got to get them nice and tight. I wanted to staple them into position, but they're too small to successfully staple. We also nixed glue, tape, and rubber bands as unable to stand up to oven conditions. 
Pinning sounded good in theory, but probably would have ended in tears. I finally ended up sewing them in place, which was surprisingly fast. The key is to pass the thread directly THROUGH the cone, then wrap around the flap as you prepare for your next pass. After three or so stabs across the cone, exit next to the original knot (you should have made a BIG knot, and left an inch or so of tail) and tie off with the tail. These worked beautifully, and looked sort of spooky. . Ingredients:\n1/4c + 3T all-purpose flou\n1T + 1t sugar\n1t kosher salt\n8T (1 stick) unsalted butter, soft but still cool\n2 large egg whites, cold\n2T black sesame seeds\nMix flour, sugar, and salt together. Separately, whisk butter until it's completely smooth; I used my Kitchenaid with the whisk attachment. Add egg whites to the dry ingredients, and mix thoroughly with a stiff spatula. Dump the egg mixture into the butter, and whisk until batter is creamy and without lumps.\nI don't have a picture of the bowl of pasty goo, so here's some of it in the stencil.. Get out your Silpat. If you don't have one, head to any kitchen store and shell out $15. Once you have a Silpat you'll find a million uses for it.\nPlace the stencil on the Silpat, and scoop some batter into the center. Use the sharp-edged spatula of your choice to spread the batter in an even layer over the stencil; scoop off any extra. If it's grossly uneven you'll get localized browning/burning. Don't leave any holes. Lift stencil and repeat. I did five coronets per sheet, which seemed like plenty. Also, I only had the patience to sew five molds- don't lay down more coronets than you have molds.\nSprinkle black sesame seeds over the top of each coronet.. Put the Silpat on a baking sheet, and transfer to your preheated 400F oven. Cook for 4-6 minutes, until the batter is just set and you can see the batter ripple a bit. 
They'll start sliding around on little melted-butter trails if your baking sheet isn't entirely flat, but this is easily fixable.\nPull the sheet out and sit it on the open oven door to keep warm while you work. Hold the top of your paper mold with your off hand, and use a tool to manipulate the coronet with your dominant hand. Be careful- the coronet is hot and greasy; you REALLY don't want to touch it directly. Roll the coronet around the mold as tightly as you can, and finish with the seam side down. Roll the other coronets and place them up against each other to prevent unrolling.\nPop the sheet of rolled coronets back into the oven for 3-4 minutes to set the seams and let them color up a bit. The French Laundry seems to make coronets that are entirely golden-brown, but I took mine out earlier for fear of burning. This worked just fine.\nLet the coronets cool/solidify on paper towels for a few minutes before removing the paper forms.. Ingredients:\n1T finely minced red onions\n1/2c creme fraiche\n1/4t kosher salt, or to taste\nfreshly ground white pepper to taste\nRinse red onions in a sieve under cold water, then dry on paper towels. Whisk creme fraiche in a small metal bowl for 30sec-1minute, or until it holds soft peaks when you lift the whisk. Fold in onions, then season with salt and pepper. Refrigerate until ready to serve, up to 6 hours.\nI never got the creme fraiche to reach soft peaks, so shoved it in the fridge and hoped for the best. It gets a bit more solid as it chills, but... not a lot. Also, wash more than 1T onions as some get lost in the sieve; measure the 1T off of the paper towels.. 
Ingredients:\n4oz sashimi-grade salmon fillet (belly preferred), skin and any pin bones removed and very finely minced\n3/4t extra virgin olive oil\n3/4t lemon oil (zest is a potential substitute)\n1 1/2t finely minced chives\n1 1/2t finely minced shallots\n1/2t kosher salt, or to taste\npinch freshly ground white pepper, or to taste\nFind a nice big SHARP knife to mince the heck out of the salmon fillet. They claim a food processor would ruin the texture; it would certainly be less fun. Mix in remaining ingredients, then chill for 30 min to 12 hours.. Assembly is easy: a dollop of each ingredient, presented like an ice cream cone. They recommend serving them in a lucite holder, but I got lazy and it wouldn't have worked anyway (see below). If you can't get at a laser cutter or machine tools, you could wedge the cones in rock salt, peppercorns, or the like for a snazzy presentation.\nFirst, scoop a bit of the creme fraiche into the top of the coronet. Pipe it in with a pastry bag for bonus points. Apparently if you prepared it properly, it will be thick enough to stick at the top of the cone; mine chose to be too runny for this to work. Thus, the horizontal cone trick: I poured the creme fraiche in, then kept it as close to level as possible while adding the salmon, and served it lying on a plate.\nYou can use a melonballer to create cute little salmon scoops, or just do it quickly with a small spoon and/or clean fingers. Stick a chive tip out the top of the salmon ball to look extra classy, or possibly more like a Teletubby. Eat immediately if not sooner.\nEither way, they were fantastically tasty. If I do this again, I'd probably skip the cones and just plop the half-baked coronet rounds into mini-muffin pans to make non-leaky shells to hold my ingredients. I'd probably substitute a mix of cream cheese with either sour cream or yogurt for the creme fraiche, as it's a lot cheaper, and it mainly provides a fatty foil for the salmon. 
Could be made lower-fat if you care about these things.\nCertainly worthy of a repeat, though.\nThis made approximately 20 coronets.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_16_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_16_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_16_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_16_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. What you see in the first picture here is some left-over cooked rice after dinner which is kept in a bowl.Add enough water to the bowl to cover the entire rice and leave it overnight.Next day, drain out the water from the rice and keep it aside. the rice is little fermented and becomes soft. Ingredients requiredTwo teaspoons of red chili powderTwo teaspoons of Cumin seedTwo teaspoons of Fennel seedSalt to tasteTwo medium sized onionsPreparationAdd all dry ingredients in a mixer grinder and make powderChop the onions and make a rough paste in the mixer grinder. Mash the fermented rice lightly with a spoonAdd the spice powder prepared earlier to the riceAdd the onion paste and mix wellNow our rice mix to prepare the crispies is ready. Now we need to make small rough shaped balls with the rice mixUse a large sized sifter or any such material. 
I have used a sifter made of bambooCover it with a clean cloth and keep it in the sunTake small amount of rice mix in your hand and make rough-shaped balls like this over the clothLeave this in the sun till all the crispies are completely dried . Once the crispies are fully dried, you can collect them and store in an air-tight container.These crispies can be deep fried in oil and served as side with Rice\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_17_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_17_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_17_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_17_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. When I offered to make a Marvel Superheroes cake I found myself wanting something cartoonish and something on the simple side. I didn't want to mimic designs of the actual characters. That would just be too hard. I searched google images and finally thought about looking at toys and came up with these Marvel plush toys. I figured it would be easiest to model my characters from these toys. **This image is not mine nor do I support the website I found the picture on. Just used it to make my characters. Here is the link to the site I found them on http://www.lostateminor.com/2011/01/15/marvel-superhero-plush-dolls/. The only ingredients in these characters are fondant and food coloring. I have made my own fondant in the past but prefer store bought. 
I always buy white and color it myself with Wilton Icing Colors. For Iron Man you will need the following colors of fondant: -red - or as red as you can get it! -yellow -black -light gray Okay, let's get to work on Iron Man. Of the three characters I highlight in this instructable Iron Man was the least difficult. A good place to start!Tips before we get started: -Working on waxed paper keeps pieces from sticking to your work surface -Adhere fondant pieces together with very small dabs of water -When ever a cut is made with a knife round and smooth the cut edges for a more polished lookIron Man Head - Approximately two inches tall 1. Roll a red circle of fondant in your hands and press it down to about a 1/2 inch thickness about two inches high and two inches wide with rounded edges. 2. Using your fingers stretch out a bit of fondant on either side of the circle to make small bumps. Not sure if they are ears or part of the suit. 3. Press (or roll) a piece of yellow fondant flat to about 1/16 of an inch. Cut it into a circle(ish) shape almost as large as the head. Keep the bottom rounded and cut the top like a heart. Stick to the head piece. 4. Roll two small black circles for eyes and make one small rectangle for the mouth. Stick in appropriate places and set aside!Iron Man Body - Approximately three inches tall 1. Roll an egg shape of red fondant in your palms and flatten it to about 1/2 inch. 2. Using a table knife cut out a shape like in picture 2. You are making the upper body and the arm stubs. Round off the cut edges with your fingers to give the piece dimension. 3. Make a line with your table knife for the waist without cutting all the way through.Iron Man Arms 1. Roll out two thick red snakes about one inch long with bulbous ends for the arms. Slightly press for a flat bottom. 2. Cut a thumb shape on each end of each arm in opposite directions (to make a right and left hand) and make dents for the fingers - as in picture three. 
We are making two arms at the same time! 3. Make two small yellow triangles about 1/16 inch thick and adhere them to the upper part of each arm where the stub meets the arm. 4. Roll two small thin red snakes and stick them on each edge of the yellow triangles. Repeat with other arm. 5. Using a small dab of water stick the arms on the arm stubs of the body piece.Iron Man Details 1. Roll a small ball of red fondant and press as flat as possible for the monobeam. 2. Roll a small ball of gray fondant slightly smaller than the red piece and stick to the top of the red piece. 3. Adhere to the center of his chest. 4. Add a belt consisting of a long red snake pressed flat with two indents to make the buckle part. 5. Roll two very tiny round balls of red fondant. Stick them on the upper part of the chest on either side.Iron Man Legs 1. Roll two yellow snakes about 1/2 inch thick and press slightly to flatten. 2. Cut two boot shapes out of red fondant. One for the right foot and one for the left foot. 3. Stick the boots to the leg pieces and edge the seam with a small snake of red fondant pressed flat. 4. Adhere the legs to the body piece with a small dab of water. 5. Use a toothpick to secure the head to the body if necessary. My head pieces were as heavy as the body pieces and since I wanted the characters standing on the cake I knew I would need a toothpick to make them more sturdy. Set Iron Man aside to dry and harden slightly. I recommend making characters in advance to avoid any damage to the characters once they are placed on the cake.. For Spider-Man you will need the following colors of fondant: -red -black -blue -white Spider-Man is by far the most time consuming character. Not necessarily as difficult as the other just time consuming! He is made in the same was as Iron Man just with different details. Same techniques and tips apply to all characters.Spider-Man Head - Approximately two inches tall 1. 
Roll a red circle of fondant in your hands and press it down to about a 1/2 inch thickness about two inches high and two inches wide with rounded edges. 2. Press (or roll) two pieces of black fondant flat to about 1/16 of an inch. Cut out two teardrop shapes for the eyes. Stick to the head piece. 3. Roll two small white circles for eyes. Stick on top of the teardrop shapes. 4. Roll very thin long snakes of black fondant. You will need several pieces to make the webbing on the face. I found it was easiest to roll the snakes in the palm of my hand. 5. Start by making a small circle of the fondant in the center of the face just lower than the eyes. 6. Layer on the webbing with pieces of the thin black fondant as in pictures 3-5 adhering all pieces with a bit of water. Set head aside.Spider-Man Body - Approximately three inches tall 1. Roll an egg shape of red fondant in your palms and flatten it to about 1/2 inch. 2. Using a table knife cut out the body shape and arms like in picture 6. You are making the upper body and the arms. Round off the cut edges with your fingers to give the piece dimension. 3. Line the armpits and inner sides of the body with blue fondant. This will hold the arms onto the body piece securely. 4. Cut an 'M' shape 1/2 inch thick out of blue fondant for the pants rounding the cut edges. 5. Make two boot shapes for the feet - one for left and one for right. 6. Stick pieces together in appropriate places.Spider-Man Details Now that the main body is finished it will need webbing like the face and a spider of course! 1. Pictures 8-11 show the route I took for applying the body webbing and the spider. The photos show better than words can explain! 2. Use a toothpick to secure the head to the body if necessary. My head pieces were as heavy as the body pieces and since I wanted the characters standing on the cake I knew I would need a toothpick to make them more sturdy. Set Spider-Man aside to dry and harden slightly. 
I recommend making characters in advance to avoid any damage to the characters once they are placed on the cake.. Wolverine was very fun to make! You will need a small piece of card stock or chipboard along with the following fondant colors for Wolverine: -black -yellow -blue -light gray -white -flesh color -red I am starting to simplify the instructions and photos by now since we are on the third character. Use that brain! Challenge yourself!Wolverine Head - Approximately two inches tall 1. Roll a yellow circle of fondant in your hands and press it down to about a 1/2 inch thickness about two inches high and two inches wide with rounded edges. 2. Press (or roll) a piece of flesh colored fondant flat to about 1/16 of an inch. Cut out a half circle the same width as the head and cut a slight 'V' in the top. Stick to the head piece. 3. Roll two small black thin strips for the sideburns. Stick on the very sides of the head from where the eyes will be down to the neck. 4. Cut card stock or chipboard into the eye shapes and cover with a thin layer of black fondant. I found if I didn't use the chipboard the eyes just sort of melted around the head piece. Make sure your guests or child knows not to eat that part. We aren't allowed to eat fondant at my house so it wasn't an issue. 5. Roll two small white circles for the eyes and place on the black fondant chipboard pieces. Adhere eye pieces to the head piece and set aside.Wolverine Body - Approximately three inches tall 1. Roll an egg shape of yellow fondant in your palms and flatten it to about 1/2 inch. 2. Using a table knife cut out the body shape and arms like in picture 2. You are making the upper body and the arm stubs. Round off the cut edges with your fingers to give the piece dimension. 3. Cut out the pelvic piece in blue fondant and stick to the yellow body piece. 4. Line the seam with a strip of red fondant.Wolverine Arms 1. Roll out two thick flesh colored snakes about 1/2 inch long. 
Slightly press for a flat bottom. 2. Cut two mitten shapes out of blue fondant in opposite directions (to make a right and left hand). We are making two arms at the same time! 3. Line the seams with a strip of blue fondant. 4. Stick arms to the arm stubs of the body piece.Wolverine Legs 1. Roll two yellow snakes about 1/2 inch thick and 1/2 inch long and press slightly to flatten. 2. Cut two boot shapes out of black fondant. One for the right foot and one for the left foot. 3. Stick the boots to the leg pieces with a small dab of water. 4. Top the fronts of the boots with thin pieces of blue fondant. The shape should be the same as the boot only slightly smaller.Wolverine Details 1. Make a small red rectangle with a black 'X' shape for the belt buckle. Adhere to the red belt. 2. Roll out a piece of black fondant about 1 inch square about 1/16 inch thick. 3. Cut out six small triangle shapes. 4. Stick them on the body piece as in picture 4. 5. Roll out a long thin snake with gray fondant. Cut it into six equal pieces about 1/2 inch long. 6. Make three indents on each hand using a toothpick. 7. With a dab of water stick on the claws! 8. Roll out a piece of blue fondant about 1/16 inch thick and cut two triangles about 3/4 inch long. 9. Adhere them to the seam where the arms meet the body. 10. Use a toothpick to secure the head to the body if necessary. My head pieces were as heavy as the body pieces and since I wanted the characters standing on the cake I knew I would need a toothpick to make them more sturdy. Set Wolverine aside to dry and harden slightly. I recommend making characters in advance to avoid any damage to the characters once they are placed on the cake.. Once you have the fondant characters made you will need a good home for them. A cake preferably!! I made a two tiered dark chocolate fudge cake (maybe I will post that recipe someday), covered it with colored buttercream, added some fun and very simple fondant details. 
I also added graphic signs drawn with sharpie markers and taped to skewers and stuck on my characters. I used a toothpick in each foot of each character to stand them upright. Except The Hulk. He is just leaning slightly on the cake. Watch your birthday boy smile and laugh when he sees the cake and eats it up with delight!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_18_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_18_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_18_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_18_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Always the same basic ingredients :\n- 17.65oz / 500g Bread Flour\n- 11oz / 312g Water\n- 1\u00bd tsp (0.21oz / 6g) Yeast\n- 1\u00bd tsp 0.31oz / 9g) Salt\nMake 2 1lb loaves(or 4 mini loaves in my case)\nRemember to use lukewarm water!. You need some basic tool. If you're lucky to have a stand mixer or bread machine (dough mode), use it!\nBut this can be made by hand\nBy Hand :\n- Large bowl\n- Large wooden spoon\n- Kitchen scale\n- Your hands\nWith Mixer:\n- Stand mixer or Bread machine\n- Dough hook or bread machine paddle\n- Kitchen scale . Mix the yeast and water.\nLet rest for 5 min.. Add the flour and salt to the bowl all at once\nMix until the dough holds its mass together.. 
When the dough is holding a \"shape\", start to knead.\nBy Hand:\n- Remove from the bowl and place on a oiled surface.\n- Oil your hands and start to knead.\n-\u00a0Stretch, fold then push down with your palm.\n- Turn 90 degree and repeat until the dough is elastic and smooth.\nWith Mixer:\n- Let the mixer knead until the dough is smooth.\u00a0The bowl should be almost clean when it's done.. Bread needs some time to rest and develop all it's goodness. This is called proofing.\n- Form in a round ball and place in a oiled bowl.\n- Cover with a cloth or plastic wrap.\n- Let rest for 1h or doubled in size. As useless as it sounds, your bread will be far better with this second proofing. You really don't want to skip this step.\n- Uncover.\n- Punch down with your fist.\n- Form into a ball (doesn't need to be perfect)\n- Cover with a cloth\n- Let rest for 1h (again!). Finally, it's almost done with all this waiting time!\n- Take the dough out of the bowl (try to keep the air bubbles!)\n- Separate in 2 (or 4) equal parts.\n- Shape into a log as big as your loaf pan\n- Put into a lightly oiled loaf pan\n- Cover\n- Let rest for 1h\n30 min prior to baking, preheat your oven to 425F. As with any other food, baking is the most important part of it.\n- Uncover.\n- Score the loaves as you like or leave them plain.\n- Put into your preheated oven on the middle rack.\n- Bake for about 25-30 min. (or 20-25 min. for mini loaf)\n- The crust should be golden brown.. Finaly! We're done...almost!\n- Remove from oven\n- Remove from loaf pan\n- Let cool on a cooling rack for an 1h\n- Cut into slices and eat!. Don't stop here, try new things!\nFor me, bread is a canvas for creating\u00a0wonderful\u00a0mixes. Think of it as a solid bloc pizza!\n- Add nuts, fruits, chocolate... whatever !\n- Try other flour types\n- Try other liquid (milk, beer, tea? 
why not)\nI hope this instructable will spark the bread baker in of you.\nHave fun!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_19_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_19_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_19_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_19_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Measure out 2 cups of heavy whipped cream and pour it into a bowl. Mix the whipped cream with your electric mixer on medium speed. If you do not have an electric mixer you may whisk by hand but I should warn you that this will take quite some time. I do not have a hand mixer so I taped a fork (I changed it to a whisk later) to a drill. You will need to mix this for quite some time, about 7 minutes, before it gets creamy. If your heavy cream is creamy and fluffy and keeps the indentations of your stirring utensil then you've got it! Do not over do the mixing or the mixture will fall apart and you will ruin the consistency.. Ok, now that your heavy cream is the right consistency pop open your can of condensed milk and add it into the heavy cream mixture. Now you can also add 1 large table spoon of vanilla extract. Mix away on medium speed, this will take approximately 5-10 minutes. Again, your ice cream mix should be fluffy and creamy. Once your ice cream is perfecto then add in your stir ins, if you so desire, and just mix them into the ice cream. Next you can transfer your ice cream into a container that has a lid. 
A lid is necessary to prevent the ice cream from getting icy and to preserve its creamy goodness. Place your ice cream in the freezer for 8-12 hours. . The ice cream should be firm after you take it out of the freezer- you do not want the ice cream to be too creamy or your sandwiches will fall apart. Take one cookie and place ice cream on top. The amount of ice cream you use is up to you; I used 3 tablespoons for each sandwich. Cover your ice cream cookie with another cookie and give them a gentle squeeze. You'll need to freeze these bad boys for another 6 hours. This time do not cover the cookies, you want the sandwiches to be nice and firm so that when you bite into them ice cream does not squirt out. . Now your cookies n' ice cream sandwiches are all ready. Sink your teeth into this heavenly goodness and enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_20_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_20_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_20_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_20_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. For this recipe you will need:12-15 slices bread8 ounces heavy cream3/4 cup packed brown sugar1 stick butter or margerine4 eggs1/2 tsp salt1 tsp cinnamon1/4 cup granulated sugar1 can pumpkin puree1/4 cup rum. Tear up each slice of bread into small pieces, approximately 1\" square. Place pieces into a large mixing bowl.. Melt the stick of margarine or butter and cool. 
In a separate mixing bowl, blend together the eggs, pumpkin, cream, sugars, salt, rum, cinnamon.Slowly add the melted margarine and stir well.. Pour mixture over bread and mix with large spoon until well coated. Pour into a greased large loaf pan.. Bake at 325 degrees for 1 hour & 15 minutes or until knife comes out clean.. Serve warm or refrigerate.. Mix 2 cups powdered sugar with 2 tablespoons rum and 2 tablespoons heavy cream. mix well, add more powdered sugar until you get the consistency you want. Pour over each slice of bread pudding just prior to serving. Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_21_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_21_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_21_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_21_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Who has time to make stuff from scratch? Gas up that Hummer of yours and get to the mega-mart to shop for your wholey-processed convenience foods. Be prepared to pay more if you want it certified organic. If you really want to do things right, you of course can make your pie dough from scratch and even make fresh pie fruit filling if desired. 
Just search for the instructables on those topics.\nYou will need:\nPre-made pie crusts\ncan of apple pie filling\nan egg to use as a glaze\nexperiment with fruit jelly glaze or food coloring to decorate the display\nsome aluminum foil to make a ring mold\nbaking tray or cookie sheet to bake the pie\nCAUTION: Know how to operate your kitchen appliances, be careful with using sharp implements, know how to work with gas, clean up after yourself. If you can't stand the fn heat, get out of the fn kitchen.. Get a piece of aluminum foil to form a rectangular mold the shape of our ipod.\nGrease up a baking pan or cookie sheet with butter.\nPlace the ring mold in the center.\nDrape in a pie crust to fill the bottom and the sides.\nTrim away the excess at the top.. Fill with pie filling.\nDrape a piece of pie dough over the top.\nTrim away the excess.\nCrimp the edges all the way around.\nCut out pieces that make the display screen and control rings.\nYou may need to brush the dough with some water to get it to stick.\nUsing a fork, pierce some vent holes for excess steam to escape when baking.\nYou can carefully pick it up and do a custom inscription on the back. \u00a0Make note of the serial number for all warranty work.. At this point you should glaze your ipod. A final egg wash goes over everything to give it that baked gloss.\nBake in an oven at 350 degrees F for about 30 minutes, depending on the size of the storage space. Keep an eye on it so it doesn't burn. The pie filling may ooze over and make a mess so use a cookie tray as a drip pan. 
It makes it easier to get the pie out of the oven too.\nTake out of the oven to cool.\nEnjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_22_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_22_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_22_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_22_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. The BBQ Pizza Dome uses a hooded 2 burner gas BBQ as the heat source. I found mine on Ebay for A$60 (minus a few non-essential parts such as the warming tray )\n4 x refractory oven tiles (300 x 300 mm) used in the floors of commercial pizza ovens\n1 bag of High temperature mortar mix.(they only seem come in really big bags)\n4 lengths of Hollow Square Section steel. (1 Metre)\n2 lengths of 12mm Solid Rod. (1 Metre)\n2 lengths of 6 mm solid steel rod. (1 Metre)\nElectric drill and Drill bits to match. 12mm and 6mm drill bits.\nTie wire.\nTape measure\nFelt tip pen (permanent)\nWater spray bottle\nRubber gloves\nMetal mesh (Chicken Wire)\nYou will need to cut 3 of the the tiles into strips so that they are square. For example my ties are 40mm thick, so I cut the tiles in to 40mm strips. \nA 300mm tile will give you 7 strips of 40mm. I needed 15 strips for my dome, plus a few spares.\nThe remaining tile is used as a pizza stone, which will sit under the dome.\n. 
In order to get the best results it is important to check the BBQ you intend to use for the conversion for the following features.A hood which offers good coverage of the cooking area. An inner lip inside the BBQ & this is where the hotplate and grill plate usually sit.I recommend that you scour sites like EBay and go to a few garage sales, as it\u0019s likely you might pick up what you need cheaply.Things like missing grill plates and side tables are often the reason other people choose not to buy, as most are looking for a BBQ to use in the conventional way.You could user a number of materials to accomplish this, I just happen to have some spare HSS and some steel rod left over from another project.The steps to construct the dome support are as follows.Measure up the grill area.Measure the grill area, and the inner support lip, taking into account any bolts or welded joints that may impede the placement of the steel support frame and furnace bricksMeasure and cut the steel to match the grill area.I used Hollow Square Section and some 12 mm solid round section (both mild steel). The square section was cut to length (less 2 mm for expansion) and then drilled to take the 12mm round section. Note: If you have access to an electric welder, you could produce a more rigid frame using HSS and reinforcing mesh.. Assemble the frame.\nThe frame consists of 4 pieces of Hollow Square Section (HSS) and 2 lengths of 12 mm solid round section.\n2 of the HSS pieces have 12 mm holes drilled completely through both the inner and outer faces and the remaining 2 pieces have holes drilled only on the inner face) \n2 lengths of 12 mm solid round section were inserted through the 2 HSS with and then the HSS with the single hole was placed on the end of the rod.\nWhen properly fitted together, the frame drops into the lip of the BBQ at roughly the same height as the original grill plate and hotplate.. 
Once the frame is installed you will need to confirm the maximum height the dome can be with the hood down. This will depend entirely on your BBQ and the type of hood that is fitted.Take care to look for any bolts or fittings which may impede a tight fit against the back wall of the hood.You also need to take into account the thickness of the refractory material you are using.Create a dome template (Paper)Based on your measurements in taken earlier, use a computer based drawing program to assist with the template creation and ensure that your scale is 1:1Draw a rectangle to represent the maximum height and width of the space in the BBQ.The rectangle should have a vertical centre line drawn through it to assist with alignment.The straight side of the dome should be twice the thickness of the refractory material. In my case the refractory tile was 40mm thick. So the straight sides need to be 80 mm.Using a tool draw a curve from the top of the inside edge of the wall to the centreline of your rectangle where it intersects with the top. Once done, copy and mirror the shape and align it on the other side.I have included a visio drawing with dimension to assist you and an enhanced metafile document for those who don't have visio.If you have a printer which only can print half of the template, you will need to ensure that you include the centre line in both prints so that you can align them using this line.Print 2 copies of the dome design and cut off the excess paper as neatly as possible.Align the two prints together using the centreline as a guide. Use sticky tape to hold them together.TIP : If the paper you are using is thick, and you cannot see the centreline on the bottom sheet through top sheet, use hairspray or oily nail polish remover to make the top sheet transparent while you align it with the bottom one.To create a wooden former for the dome using the paper template you can use apply the template to the ply using watered down PVA glue. 
Coat the surface of the plywood before applying the paper template. You will need to work quickly before the glue dries.In order to make two identical templates, screw a second piece of ply to the first so that you can cut out both pieces in the one operation.It is important to remember that all adjustments to the ply template must be made to both, otherwise the fit of the refractory materials will not be neat.My dome consist of 3 main sections, the left hand side (5 pieces), the right hand side (5 pieces) and the top dome (5 pieces)The taller ones (80mm) are the two straight pieces and are made up from two 40mm tile stripsLay the tiles (dry fit) on the edges of the template to see if they fit correctly when laid on the template.Mark the point where the last of the 5 left tiles ends on the temple, repeat for the 5 tiles on the right hand side. When compled you should have something that looks like this.Use spacers to build up each side with strips of tile until you have filled in the whole template.Once completed, dismantle the dome and re-assemble in the BBQ and ensure that the hood can be closed. Resolve any issues with fit by trimming the template and adjusting the placement of the tiles. Once you are happy with the fit label the end of each tile so you know where it goes.. Once the end tiles fit correctly, you can mortar them together.\nNote: You must use a minimum of mortar on the inside edge to ensure a strong bond. The outer surface is less critical and can be filled with mortar once you have completed the initial construction steps.\nSoak all the tile pieces in water for 30 minutes to ensure they are thoroughly wet. 
If the tiles strips are dry, it will instantly dry up the mortar as you apply it and the bond will not be strong.\nI made my dome in 3 main sections, the left hand side (5 pieces), the right hand side (5 pieces) and the top dome (5 pieces)\nIf you construct the dome in this way you can move the pieces easily and it allows you to fine tune their placement in the BBQ.\n. I decided that I might need to reuse the ply templates should the whole dome collapse or I decide to build a second dome.\nI made up two metal dome supports to help during the final rendering and firing processes, and it also saved me from having to move the whole dome to remove the wooden template.\nI used 6 mm steel rod bent into the shape of the underside of the dome. The rod is fitted into holes drilled in each support section where the steel and the tile meet.. Tie the steel supports and tiles together to help support the 3 major sections.\nIf at this point you want to test the dome (like I did) you can fire up the BBQ and take it on a test drive.\nLeave the BBQ on medium heat for 2 hours to heat through and remove any oil or reside which might taint the pizza.\nWhen you are happy with the performance of the oven, you can move to the final steps.\nOnce the BBQ is cold. Apply the mesh to the outside of the dome using the steel rod as a tie-down.. Apply mortar to the joints of the 3 segments and apply the finish coat of mortar to the outside of the dome ensuring that no mesh is visible. \nYou can apply an oxide to the mortar to match the BBQ if you wish.\nKeep a spray bottle with water handy to keep the mortar wet while you are applying and smoothing.\nThe mortar has a grey finish which is a bit boring, I decided to add a final render coat with some oxide as a colouring agent.\n. Once completely dry you should fire up the BBQ and slowly heat it. 
\nLeave the BBQ on medium heat for 1 hour and then check the dome for cracks and splits.\nThe one remaining refractory tile should be placed on the supports under the dome, this is used to cook the pizza on.\n. Now you are ready to cook pizza.\nYou may need to experiment with the stone height placement to ensure that the bottom stone is not too hot compared with the inner dome temperature. \nThe ideal temperature for me seems to be around 360 C which cooks a pizza between 3 and 4 minutes depending on the toppings and the dough hydration.\nI always cook with the hood closed, I check every 45 seconds and turn the pizza around halfway through the cooking about 2 minutes.\nHere are my first pizzas from the dome, cooling before serving.\nCooking with the dome is a definite improvement, I now have to re-learn how to cook them to get the best results.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_23_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_23_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_23_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_23_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Add one table spoon of minced garlic. Add more or less depending on how much garlic can handle.Mix the garlic into the butter.. Give it another mix before seasoning with salt.Once it\u2019s been mixed, set to one side.. A long loaf like this one works best.Slice across the loaf on the diagonal, leaving about an inch between each cut. 
Then slice the opposite direction, take your take because you don\u2019t want to rip a part the bread.. Make sure you really get it in there. You want every bite to get a beautiful taste of that garlic butter.Spread some of the butter across the top of the loaf.. When the cheese melts it will not only add an amazing texture but all help to hold the loaf together until it\u2019s ready.This is ready to hit the BBQ.. Bring the BBQ up to a medium to high temp.. Cover with the lid and let it cook for 15 to 20 minutes.. Take it off the heat, be careful as the bread is really hot.All that\u2019s left to do is serve it up.. It the perfect finger food for a party and quick and easy to make.The best part of making this way is being able to pull out squares of bread with the cheese and garlic butter.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_24_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_24_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_24_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_24_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Software\nMacaron shells\u00a0consist of four ingredients. Yes, four. That's it. What makes these tricky is the process, not the ingredients.\nFor best results, you should really get a kitchen scale. 
These things are magnificent and I promise that you'll be able to use them for many things other than these macarons (my mother uses ours to weigh her postal packages).\nWhen measuring ingredients, weigh your egg white (since you can't control exactly how much the chicken put in each egg) and scale this formula appropriately. In case you don't have a scale, I have included volumetric approximations below.Formula:\nEgg white\nAlmonds (whole, slivered, or ground): 1.2 X weight of egg whites\nPowdered sugar (aka icing sugar or confectioner's sugar): 2.25 X weight of egg whites\nCastor sugar (aka superfine sugar): 0.25 X weight of egg whitesVolume Conversion:\nOne large egg white (30 g)\nAlmonds (slivered): 1/4 cup\nPowdered sugar: 1/2 cup\nCastor sugar: 1/2 tablespoon. Hardware\nThis is what I'll be using, but you can use whatever works for you. I've included some alternative suggestions.\nFood processor\n\u00a0\u00a0\u00a0\u00a0 (Alternatives: ground almonds, see Step 3)\nStand mixer\n\u00a0\u00a0\u00a0\u00a0 (Alternatives: hand mixer or whisk)\nSpatula\n\u00a0\u00a0\u00a0\u00a0 (Preferably silicon)\nPiping bag and round tip\n\u00a0\u00a0\u00a0\u00a0 (Alternatives: disposable plastic bag with hole cut in the corner or parchment paper cone)\nBaking tray\n\u00a0\u00a0\u00a0\u00a0 (Alternatives: cookie sheet)\nSilicon baking mat\n\u00a0\u00a0\u00a0\u00a0 (Alternatives: parchment paper). What does \"tant pour tant\" mean? I can't translate it exactly, but roughly, it means \"equal amounts,\" referring to the almond meal and powdered sugar. In layman's terms, it's the dry ingredients.\nPlace the almonds and powdered sugar in the food processor. 
Pulse until the almonds are finely ground and the ingredients are completely combined.Tips for Success:Almonds :\u00a0If you don't want to use a food processor, simply buy pre-ground almonds and sift those together with the powdered sugar.Powdered sugar : Powdered sugar contains corn starch.\u00a0A little corn starch is helpful to the texture of the macaron, however, cheap brands bulk up on corn starch, too much of which will be bad for the macaron.. This recipe uses a French meringue, which means that uncooked egg whites are simply beaten with a little sugar.\nPlace the egg white in the bowl of your stand mixer. Start to whisk the egg whites. Once the whites have\u00a0soft peaks, gradually\u00a0add the sugar. Once the sugar is incorporated, increase the speed and whip the whites until they have almost firm peaks and the meringue is glossy and smooth. If the whites look watery and lumpy, you've gone too far.Tips for Success:Egg whites : To attain the most stable meringue possible you need aged egg whites. Leave the whites out at room temperature overnight (what? ew...). Egg whites have natural antimicrobial properties and baking these cookies will kill any bactera. Some recipes actually recommend a multi-day aging. Mine usually lasts about 8-12 hours. The purpose is to cause some water to evaporate, leaving a higher concentration of egg proteins. Alternately, keep the separated whites in the refrigerator for 2-3 days or add a pinch of dried egg whites to them.Whipping process : Make sure your bowl is clean! Any traces of fat will cause massive problems. Do not use plastic bowls. If you'd like, add 1/8 teaspoon of cream of tartar (not tartar sauce!), a drop of lemon juice, or a pinch of salt to the egg whites before whipping them. Also, copper bowls have been shown to produce better meingues due to chemistry I don't understand. If you have one, great. 
If you don't, I wouldn't suggest buying one ($$$).Superfine sugar: The sugar going into the meringue\u00a0should be as fine as possible so that it can dissolve quickly and completely. All supermarkets carry superfine sugar, but if you don't have any, simply put granulated sugar in a food processor.Coloring : If you want to create other colors, make sure not to use liquid coloring because they will loosen the meringue. Use either powders or gels and add them at the very end of the meringue-beating process.. Unlike \"tant pour tant,\" I can translate \"macaronage\": the act of creating macaron batter. Yes, the zany French have created a word specifically for this dessert. This is the most integral step of the whole process and the easiest one to mess up. The purpose of macaronage is two-fold. It combines the wet and dry ingredients and deflates the meringue just so. This is what transforms this recipe from an almond meringue to a macaron.\nBegin by adding 1/2 of the tant pour tant into the meringue. Fold until the powder is completely incorporated and add the other half. Once the second addition is fully incorporated, check the consistency of the batter by dabbing a bit on a plate. It should settle and leave no beak. If it isn't ready, continue folding and check the batter every couple of folds. Alternately, there should be a point at which the batter ceases to be completely solid. The batter is the right consistency if it sloshes slightly when you tilt the bowl. As soon as the batter appears ready, stop. It is better to undermix than overmix.\nIf you are using a stand mixer to do your macaronage, switch to the paddle attachment. Dump all the almond and sugar mixture into the mixing bowl and turn the mixer to its lowest speed. Mix until the powder is completely incorporated then check for consistency. You shouldn't need to mix for more than 10 seconds. A stand mixer can be faster, but make sure you know what consistency you're looking for. 
I wouldn't use one the first time through.Tips for Success:Macaronage : Recipes on the internet are littered with descriptions of how much to fold. Some say that the batter should \"flow like magma,\" but I've never been to a volcano. Others try to count folds, but the amount of batter and the differences in folding technique vary greatly. Really, a visual is necessary. Here is a helpful video (oui, c'est en francais) which shows the proper consistency:http://www.youtube.com/watch?v=yDo0SgDKLVw. \n\tOnce your macaron batter has been formed, the most difficult steps are over. Now, you just have to pipe the circular cookies. First, line your baking tray with either parchment or a silicon mat. Fit your piping bag with a relatively large round tip.\u00a0\n\tIf this is the first time you're making these (or you have a bad case of OCD) you can trace circles on the underside of parchment paper to guide the size of your macaron. I usually go for 1.5 inches in diameter, though you could easily make larger ones. Pipe as evenly as possible to prevent uneven cooking.\n\tOnce you've filled your tray, rap it a few times on the counter to settle the batter and get rid of big air bubbles.Tips for Success:Technique\u00e2\u0080\u008b: When piping, make sure that the piping bag is perfectly vertical and\u00a0perpendicular\u00a0to the baking sheet. Hold it about a centimeter above the sheet. To ensure evenness, I always squeeze with the same hand pressure and count out how long it takes to pipe each round (usually around 2 seconds).\n\t\u00a0Parchment paper: Some people claim that using parchment paper will yield straighter feet, though I have yet to prove\u00a0this. Parchment paper sold in flat sheets is preferable, because the rolled kind wants to, you know, roll up. 
If you only have the rolls, pipe a dab of macaron batter in each corner of the sheet to glue it to the tray.Silicon baking mat: I have observed several advantages.\u00a0They're reusable, prevent sticking extremely well, provide extra insulation to prevent the bottoms from burning, and are guaranteed to be level, so you won't end up with leaning cookies.Resting: After you pipe the macaron shells, let them rest for a few minutes before you put them in the oven. That way, they have a chance to start developing shells before you even bake them. My rule is to only preheat my oven after the shells have been piped. When my oven is ready, so are the macarons.. \n\tThere is much debate as to the best way to bake macarons. Professional suggest starting at a high temperature to dry the shells, then gently cooking the insides. However, this is too complicated for me (and my poor oven). I chose the low and slow method, with a decidedly middle-of-the-road temperature. There is no perfect temperature, rather a range with varying baking times, of course.\n\tPreheat your oven to 300 F (150 C). If your macarons are 1.5 inches in diameter, bake them for 12-13 minutes on the center rack of your oven. If they are 3 inches, you may need to go up to 15 minutes. They should rise vertically on straight feet and should not brown on top. Baking time depends on your oven. Your first time making them, I would check on the macarons (use the oven light and don't open the door!) after 10 minutes. They should be fully formed, but don't let them brown on top. When they are done, let the macarons cool on the tray. 
Then, pair them up by size.Tips for Success:\n\t\u00a0Baking Trays: If you find that your oven burns the bottom of your cookies, stack two baking trays on top of each other to emulate professional-quality equipment.Browned tops: If your oven makes the tops of your macarons brown before the insides are cooked, place another baking tray or some aluminium foil on the top rack of the oven to protect the macarons.. Strangely enough, the original macaron filling was nothing. they were just welded together and presented as a double-sided cookie. Many speculate that the first filling for macarons was fruit jam, which makes sense. There are a great variety of jams, and all of them blend nicely with plain macaron shells.\nHowever, we're here to talk about chocolate. For the chocoholic, there can and will only ever be one filling: chocolate ganache. Obviously, there is no way to get the amount of ganache perfect for the number of macaron shells. It really depends on how much you put\u00a0in each\u00a0cookie.\u00a0I generally plan on\u00a01 oz. of chocolate per egg white. Of course, if there's any left over, it's the chef's to eat!\n2\u00a0oz. bittersweet or semisweet chocolate, chopped\n1/4 cup milk, half-and-half, or cream\n1/2 tbsp butter\nScale this recipe as needed. Heat the milk or cream in the microwave or on the stove\u00a0until nearly boiling. Pour over the chocolate. Stir until the chocolate melts, then add the butter and incorporate.\nTo fill the macarons, simply take a butter knife and slather it on one macaron of each pair. If you want to be neater about it, you may certainly pipe the ganache. Lid them and your macarons are assembled!Tips for Success:Fillings : This isn't so much a tip as a suggestion. You can spice up your chocolate ganache with a number of different things. You could add a small pinch of salt for depth of flavor, some instant espresso powder for a mocha twist, a dash of vanilla extract for a nice undertone, or even some chili powder (!) 
for a Mexican-style kick. The fillings are where you get to experiment!. Once they are assembled, DO NOT eat the macarons right away! You will be very disappointed. Instead, let them sit in the refrigerator for at least several hours to mature, preferably overnight. This fixes all the potential macawrongs . Overly stiff shells loosen up, cracks become less apparent, and most importantly, the flavors meld. When you're ready to serve, let them sit at room temperature for a few minutes to take the chill off and enjoy!. The macaron-making community is quite a strong one. Those who know about macarons are usually obsessed, so there is a wealth of information to be had on the internet. Here are some of the websites that have given me the most help.\nIf you understand French, this is a very helpful video demonstration: http://www.youtube.com/watch?v=VsxzeehcI60\nMs. Humble has an incredibly detailed tutorial on her blog: http://notsohumblepie.blogspot.com/2010/04/macarons-101-french-meringue.html\nTartelette is an amazingly creative macaron baker and considered by many the \"queen of macarons\": http://www.tarteletteblog.com/2010/02/recipe-raspberry-mascarpone-macarons.html\nAnnie has created a very straightforward post for chocolate macarons: http://annies-eats.net/2010/06/04/chocolate-macarohns/\nVeronica at Kitchen Musings has a whole series of macaron articles: http://kitchenmusings.com/macaron-chronicles\nDuncan at Syrup & Tang has collected a wealth of macaron tips: http://www.syrupandtang.com/200712/la-macaronicite-1-an-introduction-to-the-macaron/\nDavid Lebovitz (who I mentioned earlier) includes authentic Parisian tips: http://www.davidlebovitz.com/2005/10/french-chocolat\nI hope this has been helpful! Don't be discouraged if your macarons fail the first time around. They may not be pretty, but they'll still taste good. It wasn't until my third batch that I finally got some decent ones. 
These make a unique and gourmet gift and you can experiment endlessly with flavor and color combinations. Bon appetit!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_25_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_25_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_25_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_25_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Chocolate truffles are a decadent candy that melt as soon as they touch your tongue. For truffles it's very important that you use the highest quality chocolate or they won't set up properly. I used Lindt and Ghirardelli chocolate but Guittard is better if you can find it. I definitely preferred the Lindt to the Ghirardelli. For my truffles I only used vanilla extract for added flavoring but you can infuse the cream with vanilla bean, spices, citrus zest, coffee, or a tablespoon of alcohol such as rum to get the flavor combination you want. 
.White Chocolate Truffles:4-oz white chocolate, chopped1/8 cup heavy cream1/4 tsp vanilla extract.White Chocolate Coconut Truffles:4-oz white chocolate, chopped1/8 cup heavy cream1/2 tbsp unsalted butter1/4 tsp vanilla extract1/4 cup coconut flakes (sweetened or unsweetened).Dark Chocolate Truffles:4-oz bittersweet chocolate, chopped; or 1/2 cup heaping bittersweet chocolate chips1/4 cup heavy cream1/2 tbsp unsalted butter1/4 tsp vanilla extract.Truffle Coatings:I usedChopped coconut flakes and powdered food coloringChopped coconut flakes and cocoa powderChopped milk chocolate chipsChopped candy meltsBut truffles can be coated in anything you want really! Melted Chocolate, Nuts, Bacon, Sprinkles, Powdered Sugar.Flower Petals:Made using Wilton Candy Melts. Candy melts are candy flavored like chocolate that you can easily melt in the microwave or in a double-boiler. . Place 4-oz of chopped white chocolate in bowl.Place 1/8 cup heavy cream in small saucepan over medium-high heat. As soon as the cream boils, add 1/4 tsp vanilla, and pour over chopped chocolateLet the mixture sit for about 30 seconds then stir until smooth.If the chocolate mixture still has lumps you can put it in the microwave for 10 second intervals (just make sure you stir in-between). Or heat in double-boiler, stirring until smooth.Cover the bowl and place it in the refrigerator until truffles are firm (about an hour). If it's not firm put it in the freezer for an hour. With a spoon, scoop out some of the truffle mixture. Roll it into a ball with your fingers and hands. This is very messy!Now you have a white chocolate truffle! We'll coat the truffles later. Truffles keep in the refrigerator for about 2 weeks or in the freezer for a few months. . Place 4-oz of chopped white chocolate in bowl.Chop 1/4 cup coconut flakes (sweetened or unsweetened) in a food processor or blender. Place 1/8 cup heavy cream and 1/2 tbsp butter in small saucepan over medium-high heat. 
As soon as the cream boils, add 1/4 tsp vanilla, and pour over chopped chocolate. Let the mixture sit for about 30 seconds then stir until smooth. If the chocolate mixture still has lumps you can put it in the microwave for 10 second intervals (just make sure you stir in-between). Or heat in double-boiler, stirring until smooth.Add the chopped coconut and stir until combined. Cover the bowl and place it in the refrigerator until truffles are firm (about an hour). If it's not firm put it in the freezer for an hour. With a spoon, scoop out some of the truffle mixture. Roll it into a ball with your fingers and hands. The mixture will stick to your hands. Now you have a white chocolate coconut truffle! We'll coat the truffles later. Truffles keep in the refrigerator for about 2 weeks or in the freezer for a few months.. Place 4-oz bittersweet chocolate, chopped; or 1/2 cup heaping bittersweet chocolate chips in bowl.Place 1/4 cup heavy cream and 1/2 tbsp butter in small saucepan over medium-high heat. As soon as the cream boils, add 1/4 tsp vanilla, and pour over chopped chocolate. Let the mixture sit for about 30 seconds then stir until smooth. If the chocolate mixture still has lumps you can put it in the microwave for 10 second intervals (just make sure you stir in-between). Or heat in double-boiler, stirring until smooth. Cover the bowl and place it in the refrigerator until truffles are firm (about an hour). If it's not firm put it in the freezer for an hour. With a spoon, scoop out some of the truffle mixture. Roll it into a ball with your fingers and hands. This step is extremely messy!Now you have a dark chocolate truffle! We'll coat the truffles later. Truffles keep in the refrigerator for about 2 weeks or in the freezer for a few months.. A food processor or blender really comes in handy for this step. .Candy Melt Coating: place some candy melts (as many as you want) in a food processor and blend until they are in tiny pieces. 
Place chopped candy melts on a plate or bowl and roll truffle in coating until coated. .Colored Coconut Flakes: place some shredded coconut flakes (sweetened or unsweetened) in food processor and add a dab of food coloring. Blend until the color is well distributed. Roll truffle in coconut until coated. I was not at all happy with the yellowish-green I created, so be careful!.Cocoa Powder Coconut Flakes: place some shredded coconut flakes (sweetened or unsweetened) in food processor and add cocoa powder (1/2 tsp at a time). Blend until the color is well distributed. Add more cocoa powder if the coconut doesn't look chocolaty enough. Roll truffle in coconut until coated. .Milk Chocolate Coating: place some chocolate chips (as many as you want) in a food processor and blend until they are in tiny pieces. Roll truffle in chocolate bits until it's coated..Remember truffles can be coated in anything you want: nuts, bacon, sprinkles, cocoa powder, or powdered sugar! They are also fantastic dipped in chocolate. . Making the chocolate flower petals is the hardest part. The truffles are delicious with or without the chocolate flowers so this step is entirely optional. You can also melt the chocolate out of the bag: in a double-boiler or microwave it in a bowl, and then add it to a piping bag. .Figure out how many flowers you want to make (each flower gets a truffle) and get out that many baking cups plus a few extra. .Melting the Chocolate:Place candy melts in a plastic piping bag (one that hasn't been cut) or a freezer bag. Each flower requires 2-3 candy melts. Twist the bag and wrap a rubber band around the twist. 
Heat the bag of candy melts in the microwave on 50% power for 30 seconds; squeeze bag; put back in microwave on 50% power for 30 seconds; squeeze bag; repeat until chocolate is completely melted (be careful not to overheat).You can also melt the chocolate out of the bag: in a double-boiler or microwave it in a bowl, and then add it to a piping bag.Snip off the tip of the bag..Piping the Flower Petals:Cover the bottom of a mini baking cup with a thin layer of chocolate.Then, starting at the bottom of the baking cup, pipe a line of chocolate up the side of the cup that tapers off towards the top.Continue to pipe lines up the side of the baking cup, leaving a space between each line. Repeat with all baking cups..Place the baking cups with flowers on a plate or tray and place in the freezer for 10 minutes or until hardened. . To finish the flower truffles, simply place a coated truffle in each chocolate flower. You can leave the flower truffles in the mini baking cups or peel them off. Truffles keep in the refrigerator for about 2 weeks or in the freezer for a few months. Truffles are better served at room temperature. .Unwrapping the Chocolate Petals: Gently pull down on the top edge of the baking cup paper (only pull down about a 1/4 of an inch)! Then, going around the top edge, continue to carefully pull down the paper a 1/4 of an inch. Finally, pull the paper the rest of the way down and pull off the flower. If one of the petals break off, just stick it in the truffle (see picture). 
.Be sure to enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_26_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_26_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_26_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_26_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. . Cook the rice according to directions on the box. \nFor the brand I used, which is quick cook rice but not instant, it too about 12 mins to cook in the microwave (tightly covered with lid or if you don't have lid, use plastic wrap) then let stand for 5 mins.\n. 1 pkg. 12 oz frozen vegetables or 1 12 oz (approx.) can of mixed vegetables\nCook the vegetables according to the pkg. (For this type it is cooked in the pkg. and takes about 7 mins. be sure to set the bag on a plate in the microwave as the bag tends to leak a bit when cooked).. 3 eggs\n1 tablespoon vegetable oil\nDash of Salt and pepper\nLightly beat the eggs with the salt and pepper (I prefer seasoned salt, but you can use regular salt). \nHeat frying pan and add 2 tablespoons oil. When the oil is hot, add the eggs then scrambled them, but don't over cook them. . . 
2 tablespoons vegetable oil\n1 tablespoon ground ginger \n5 teaspoons soy sauce or teriyaki sauce\nI use a dutch oven pan, but if you have a large wok that would work better.\nAdd 2 tablespoons oil, ginger and soy or teriyaki sauce then add the cooked rice the stir-fry it for a few minutes and then add the scrambled eggs.\nOptional: Most fried rice has chopped in onion in (I don't as my son hates onions) it and you could chop up 1 small onion and add it to the pan and brown it in the 2 tablespoons of oil before adding the rice if desired. \n. Add the cooked mixed vegetables and stir for a few more mins then serve hot.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_27_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_27_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_27_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_27_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Chocolate (milk or dark), 90 gButter, 30 gEgg (regular size), 1Sugar, 1 tbspCocoa powder, 1 tbsp. Prepare the materials and weigh them.I used stainless bowl to melt it on hot water bath.. Above the boiled water (not boiling water), put the stainless bowl.Or use other method to properly melt them.If some water goes in, it will not melt properly! Be careful.. Egg + Sugar + Cocoa powder.Mix them with whisk.. Mix them evenly~. Don't fully fill the container.Fill 60~70% of it.It inflates while cooking.Put this in the microwave and cook it for20 secs + 20 secs + 20 secs Separately.. 
Great dessert with coffee.The name I know is pondant au chocolatbut I would rather call it hot moist chocolate pudding :)Especially in this cold weather this dessert gives a great relief.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_28_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_28_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_28_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_28_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Take the cup of dissolved yeast add sugar and leave covered in a warm place for a few minutes. . Place flour in a standing mixer add the yeast mixed with water.. Attach the kneading hook to the mixer.Turn the mixer on and start to add the water a little at a time. The mixture will appear crumbly and then it will start to leave the sides of the mixing bowl. Add salt, Keep mixing till it comes together in the form of a soft dough.. Leave to rise covered in a warm place till double in size this may take 3-4 hours.. Divide the dough into equal sections. Make a ball out of each section and again set aside covered for 30 minutes.. Roll out one ball at a time with your hand, spread to desired size. If you can't manage with your hand use a rolling pin but usually a pizza is rolled out wth the help of a hand.. I keep my work simple by adding oregano salt and black pepper to tomato paste, but you can make pizza sauce yourself or buy it if you like.Spread the sauce over the rolled dough.. 
Cover with mozzarella, I use grated mozzarella because I find it better to handle.. Use the baked or grilled zucchini to make the curly hair.. Cut an aubergine slice in half and use it for the ears.. Cut an olive into half use each part for the eyes. Place a piece of corn inside the hole of each olive.. For the mouth use bell pepper as shown in the picture.Use a piece of olive for the nose.Place the prepared pizza in a preheated oven at 400 degrees. Bake till the cheese melts and is golden brown in colour.This is the best recipe that you can enjoy making with your kids. Involve them with you while making the face parts and they will surely have fun.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_29_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_29_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_29_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_29_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 2 cups of any kind of low carb milk. For me, the best ones are soy milk or cashew milk. These are creamy but not high calorie. If you're not worried about the calories, you could also try half and half.Optional splash of heavy cream.Half a cup of Chia seeds.Half a cup of unsweetened cocoa powder.Six packets of Splenda or other non-sugar sweetener. (You can adjust the sweetness to know how you like it.)Quarter teaspoon of vanilla extract.Dash of salt.. You'll need a container with a leak proof lid. I use glass Rubbermaid containers. 
It will need to hold 4 cups.Put everything your container, starting with the wet ingredients.. . Put the lid on and make sure it's tight. . Then shake vigorously until you see that the cocoa powder is well blended in.. Put the container in the refrigerator for at least a half an hour. Try to take it out within an hour, but if you have to leave it in longer, it's OK.. After 30 to 60 minutes, take your pudding out of the refrigerator. It will have separated. You can either shake it or stir briskly with a spoon or whisk. It should be well blended at this point.Then return the covered container to the refrigerator. It will take another several hours for the liquid to be completely absorbed by the seeds and cocoa.. Give the pudding a final stir before serving. It will stay thick and puddingy at this point. You can add berries and cream for breakfast if you want or sugar free whipped cream for dessert. Or eat it just like it is - delicious!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_30_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_30_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_30_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_30_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Since this was the first time ive ever used fondant, I was not too sure how this cake would turn out, but I\u2019m happy with it and glad to share it with everyone \u263aYou\u2019ll Need: *6 cups rice cereal *6 cups miniature marshmallows *3 tbls. 
Butter or margerineDirections: 1. Melt butter over low heat in a large saucepan. Add marshmallows and stir until completely melted. Remove from heat. 2. Add cereal, one cup at a time, stirring until well coated.NOTE: When I made this for the first time, one batch wasn\u2019t enough so I highly recommend making two batches as that will turn out the same size as mine. Steps: 1. When you are done making your rice crispies, place in a greased/buttered round or square round medium glass bowl. 2. Since we want to achieve a cauldron look, begin by molding your shape, starting from the bottom and with the palms of your hands pack down the rice crispies forming a bowl shaped (or something like a bird\u2019s nest). It can get very sticky so if you would like to grease your hands before that might be a good idea to do so. 3. Leave about a 2 inch indent on the top of the cauldron to place the fondant props inside. *Set desired mold into freezer for at least an hour (or overnight) to set.. To make the green gooey bubbles, you\u2019ll want to make buttercream in order to get that special effect. I made my own, but you can buy premade in a can. This is how I made mine (homemade is always better ;)What you\u2019ll need: *3 cups powdered sugar *2 sticks butter, softened *1 tsp. vanilla extract (or whatever extract you desire) *2-3 tbs. milk *10 inch cake board\u00a0Recipe: 1. In a mixing bowl, whip butter until you get a nice fluffy consistency 2. Add powdered sugar, one cup at a time 3. One tablespoon at a time add the milk 4. Add vanilla and mix until creamy *Once you are done making the fondant, with a butter knife, spread a thin layer of buttercream onto the cauldron, making sure it\u2019s thin and not too thick. Place on cake board. Set rest of buttercream aside.OOPS! Moment: the first time I made this buttercream I used margarine and for some reason it got extremely runny so I would recommend using real butter as that will give you a firmer and workable material.. 
*When working with fondant, the key is to make sure it is soft enough to work with, but not too soft. I used Duff\u2019s brand white fondant because it is easy to work with and it tastes scrumptious \u263a I bought white because since I needed numerous colors, I decided to buy food coloring separately.What you\u2019ll need: *Duff\u2019s fondant (white) *10 inch cake board *Black food coloring *Rolling pin *powdered sugar for dustingDirections: 1. Lay your cauldron out on a flat surface. 2. Microwave 1/3 duff\u2019s fondant for about 5 seconds. 3. With black food coloring, work into fondant, making sure its covered with black (you may want to use gloves, it can get messy) 4. Sprinkle powdered sugar onto a flat surface 5. Roll out fondant about 1/8 inch thick rolling out enough to cover cake 6. Place 4 small black balls on each corner of cake board with excess fondant (about half inch thick) 7. Cover cake with fondant, making sure sides are fully coated.NOTE: when putting the fondant over my cake, it got very bumpy (which is I guess okay for a witches\u2019 cauldron cake), but I would suggest making it thick enough so it doesn\u2019t turn out too ridged. I ended up recovering it and trying a second time, it wasn't as bad. Plus, if you smooth out the butter cream beforehand it shouldn\u2019t get bumpy.. *This is the best part! Now you get to be creative and design whatever shapes/creepy molds you want! I made a bat wing, eyeballs, pigs foot, poison bottle, witches broomstick, and a bone. *Before making the fondant props, color the rest of the buttercream with lime green food coloring. *in a pastry bag, dollop around sides of cake, and inside so the props will stick to the top portion. *Color the additional fondant with whatever color you desire! *You can use a serrated knife for the edges or decorating tools to shape them. *I made fire at the bottom of the cake with my etra fondant as well, don\u2019t be afraid to get creative! 
*Place fondant props inside of cake, putting in whatever directions you wantVOILA!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_31_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_31_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_31_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_31_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Turn oven on to 350 degrees.. Melt 1 stick of butter in the microwave for 1min.. In a regular size cake pan crack 1egg (well beaten). Add entire box of yellow cake mix. Pour melted butter over the egg and yellow cake mix. Mix ingredient. Press mixed ingredients into the bottom of the cake pan.. In a separate bowl add 2 eggs and beat them. Add the 8oz cream cheese to the bowl. Add 1/2 the bag of powder sugar. Then mix all with a mixer until it is smooth .. Pour ingredients from the bowl into the cake pan.. Put the cake pan in the oven. Bake it for 45-50min. Cake will be a golden brown color.. 
Enjoy your cake\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_32_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_32_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_32_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_32_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Your going to need 2 apples one a little smaller than the other. And if you need it maybe a little peanut butter.. Cut some little notches about 4 on each side for the feathers. if you want them to stick out more add some peanut butter underneath them.. Cut 2 small circles on the smaller half and then take 2 little circles of pealing and put them in the circles for the eyes.. 
Now put the smaller apple on top of the bigger apple and if you need to put some peanut butter to help them stay.And thats it hope you enjoy.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_33_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_33_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_33_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_33_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. You will need: \u00a0\u00a0 -Ground meat \u00a0\u00a0 -Bacon strips \u00a0\u00a0 -Eggs \u00a0\u00a0 -Salt \u00a0\u00a0 -Pepper \u00a0\u00a0 -Nutmeg You need toothpicks and if you use leftovers as I did, you also need a food processor.. If you buy ground meat, you will probably not need eggs, but if you ground your own meat (in the food processor)\u00a0you will need eggs. Add an egg or two until the meet is \"solid\", by other words until you can mold it (like play-doh..). Season with salt, pepper, nutmeg and other seasoning that you want. Now you can start the shaping process.. Form little balls and then roll them\u00a0until they look like chuby sausages. Wrap\u00a0\u00a0them with a strip of bacon and insert a toothpick, so that the bacon does not escape, because nobody wants that (hmmm bacon!!). Aren't they gorgeous?! Yes they are! And\u00a0 even more tasty. \u00a0 (ignore, I'm hungry). Pre-heat the oven to 180\u00baC ( 350\u00baF). Put the mini rolls in a baking tray covered with aluminium foil and insert them in the oven. 
Count some painfull 20-30 minutes and remove them from the oven. Serve immediatly and eat like it's the end of the world! \u00a0This is a very\u00a0simple recipe that I made up to my lunch and\u00a0it came out pretty good! \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Thank you for watching and have a good meal!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_34_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_34_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_34_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_34_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. I decided to make pretty much everything from scratch for this pie, including the crust. If you wish to cut time by buying pre-made crust, then by all means GO AHEAD. This is 2014. But just keep in mind it may not be as tasty. TIME: 20 MIN total for crust Crust is not too painful to make. You will need for crust 1 1/2 cups of Gram Cracker crumbs or 12 Gram Crackers made to crumbs. 3 tbsp (tablespoon) of sugar 3/4 cup of non salted butter melted 1 tsp (teaspoon) of cinnamon Springform pan 9\" or 10\" pie plate Turn your oven on to 375 degrees Whisk or with a fork mix together the graham cracker crumbs, sugar and cinnamon. (The cinnamon will give it some nice flavor.) 
Melt the unsalted butter in the microwave and use a fork to mix the butter with the crumb mixture until they are all are moistened. It will look clumpy and much darker and that's a good thing. Spread the crumbs into a 9-inch spring-form pan OR a 10-inch pie plate and press them firmly into an even layer over the bottom and half of the way up the sides of the pan. Bake for 6 to 8 minutes, or until golden brown. Let it sit for 8 minutes to cool, or just stick it in the fridge to save time. By baking the crust you will get a more crunchy crust. Which will go beautifully with the crunchy top I have planned for this pie =). The secret is toasting the nuts!! Forgive me Grandma!! Haha, I'm just kidding. No, but seriously. Any time you have a dish with nuts, the secret to ultimate flavor is to toast them. It only takes 5 minutes, and enhances the flavors so much! TIME for Sauce: 8 MINWhat you will need for the Special Caramel sauce. 1 packet of apple-cinnamon granola from Quaker Oats. 3/4 cup of chopped pecans 1 cup salted caramel sauce. I used sugar-free in order to not go overboard with the sugar. small cooking sheet for toasting the nut mixture in the oven. Open the packet of granola and pour in a nut chopper as well as the pecans. You could also break them up yourself by putting them on a cooking sheet and breaking with a spoon, but it may get messy. Since the oven is already going because the crust was just made toss the nuts in! After 5 min of toasting pull them out. They should smell amazing. Take the crust out of the fridge. It should be cooled by now. Pour the caramel on top of the crust and sprinkle the toasted nut mixture on top of the caramel. Place the springform pan into the fridge to chill out. MAKE SURE YOU SAVE SOME TOASTED NUTS FOR LATER. ;) You will use them as a garnish. The Infinity pie is based off an apple cheesecake pie. So making the apple pie part is very much like making a regular apple pie as you would have guessed. 
You can either BUY (it's 2014) your apple pie filling OR you can make it. I chose to make it because I want a delicious pie this time. Dedicating something to my hero only deserves the best! ;) NOTE: if you are using a can of apple pie filling you only need to use half!! TIME for Pie filling 40-50 min (depending of if you have an apple corer)What you will need for Apple pie filling 5 small granny smith apples. They must be peeled and cored and cut them thinly (slave work) 3/4 cup of unsalted butter 1/2 cup of light brown sugar 2 tsp of cinnamon a couple dashes of salt a large pan for cooking on the stove I DON'T have an apple corer. So this part took extra long.... my boyfriend wasn't too thrilled. But it's only 5 little apples. While you are peeling apples, put the butter on the stove and begin melting it. It will only take a few minutes. When it's melted add the brown sugar and cinnamon to the butter and mix until gently bubbling. Again it only takes a few minutes so you probably won't be done with your apples. The\" brown apple syndrome\" will happen and it's alright. These apples are destined to go into a brown sugar liquid and cooked extremely soft. No harm so don't stress! ;) when you're finished with the apples slide them in the large cooking pan and coat them well with the liquid. Put a lid on the pan and stir occasionally for 10 min. Remove the lid and up the temperature to med-hi to boil off most of the remaining liquid. Throw a few dashes of salt in. After another 15 min the apples should be very very soft and that's what you're looking for. LET SIT FOR 20 min to cool before adding to your pie crust. Getting tired yet??. You can turn the stove off if you want to save electricity for 20 min while the apple pie filling cools.... But OK, you have 20 min to make a design to top your Infinity pie. Me, because I didn't want to have to make a batch just for crust I broke down and bought my pre-made crust. FORGIVE ME GRANDMA. 
;) haha Pre-made crust is very easy to work with. You just unroll and cut out whatever design you want. I see pie design tops (much like pumpkins today) as a big fad soon. It is taking off but not like I think it will soon. But anyways, cut out whatever your heart desires! If you mess up, crust is easy to erase... just flatten out and try again. For stencils, I just found shooting stars online, printed them out, and laid them over the dough and cut it. Easy as pie. My shooting star is dedicated to Carl Sagan and the infinite universe. =). Exactly as the title says... pour the cooled apples on top of the cooled caramel mixture that's been chilling in the fridge. This is the easiest step! ;). I love cheesecake. If it were me I'd put cheesecake in everything. But I probably wouldn't live long. Anyways, again, this is only technically half a cheesecake so the ingredients aren't as heavy. Turn that stove back on to 350 degreesWhat you will need for cheesecake topping: 8 ounces of soft at room temperature cream cheese 1/2 cup of granulated sugar 1 egg medium sized 1 tsp of vanilla extract 1tbsp of lemon juice lemon wedge for lemon zest electric mixer and a medium sized bowl First you will need to beat the cream cheese and sugar together on medium speed for about a minute. They must be well mixed. Then add the egg and beat it in until it is combined for about a minute. Then add the lemon juice and vanilla extract and beat for another minute. Zest the lemon wedge in. Just a few times is all it needs. Pour the cheesecake batter over the apples in the pan, smoothing it into an even layer as much as you can. Bake until the cheesecake is set about 25-30 minutes. While this is happening, as you will see in the next step, coat your design you made with the pie crust with egg whites and bake at the same time in the oven with the pie.. Because pie crust is usually not belonging on cheesecake, I decided to bake it separately on a cooking sheet. 
I coated it with an egg white to give it shine and baked it next to the cheesecake for 5-8 min. When it was done, I pulled it out and sprinkled it with sugar while it was still hot to give it some sweetness. The cheesecake should be done within 30 min. Transfer the cheesecake pan to a wire rack to cool, the cheesecake must refrigerate for at least 4 hours or overnight. (For me, since it was already midnight when we were finished... lol, we ended up just chilling over night.). Before you put your infinity pie in the fridge...., ta dahhhh, the toasted crust design goes on top of the cheesecake like a crowning jewel! Then, add some of the remaining crunchy toasted nuts on top and along the outsides to bring it to life. Then, put that sucker in the fridge overnight. I know it will be REAL HARD. But trust me, it needs to cool for at least 4 hours. When serving your Infinity pie, put some caramel on the plate along with the special crunchy nut mixture. It will definitely knock someone's socks off! Pair with vanilla bean ice cream for a real desert! Be sure to refrigerate any leftovers.\"The sky calls to us; if we do not destroy ourselves. We will one day, venture to the stars\" -Carl Sagan This one's for you Carl! 
Enjoy your Infinity pie everyone =) PLEASE let me know if you make it!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_35_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_35_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_35_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_35_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Materials:\nSolo Cups\nCoffee Filters\nFunnel\n5-6 Flasks/Bottles - I used 10oz Orangina bottles (great to drink while making this stuff)\n5 plastic bottles - I used Gatorade bottles because of the wide mouth and screw top (also great to drink)\nIngredients:\n2 pounds of Skittles\n1.75L of Vodka\nLove. Sort out the five different skittle colors from all 2lbs, and put them in your 5 plastic bottles. \nAdd 12oz (1/5 of the 1.75L bottle) of vodka to each skittles container. Seal the bottles and shake each bottle vigorously every few hours. Takes about 24 hours to dissolve the skittles completely. \nAlso I would recommend wearing gloves when handling so many skittles and the mixtures as it stains your fingers and took four days to get out. \u00a0 \u00a0\u00a0. Once the skittles have dissolved, a foam is left on the mixtures and must be removed. Use your coffee filters and solo cups to accomplish this as quickly as possible. Wear those gloves!\nOriginally I tried using paper towels to filter the foam away but it soaked up too much of the vodka and took a really long time. 
Therefore I switched to the coffee filters, but these still took some time to filter the drink so I had multiple setups going at once. Make sure you only add a little vodka onto the filters at one time because the foam will build up and stop the drink from moving through. Scrap off the foam once it builds up and squeeze any vodka left in the filters into the cup below.\nThis process took a few hours and requires your attention.\u00a0. Depending on how the process has gone, you will be left with anywhere from 60-70oz of Love Potions.\u00a0\nFill up your flasks in any manner you choose, with my leftover liquid that didn't fit in the five 10oz bottles I combined them into a sixth 10oz bottle. Of course you could just have bigger bottles.\nImportant Note: Putting the drinks in the freezer will cause the drinks to condense so fill the bottles up more than necessary. \u00a0\nHopefully one of these potions will cause your Valentine to fall madly in love with you!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_36_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_36_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_36_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_36_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 
For one drink2 1/4 cups of frozen, cubed seedless watermelon 2 tbsp of light rum2 tbsp of lime juice2 tbsp mint syrup (how-to = step 6) - fresh mint - white sugar4 fresh mint leaves*NOTE: the watermelon needs time to freeze, so it's best to prep the watermelon a day ahead so it can freeze overnight.Shopping List:3 small seedless watermelons2 limes1 bunch of fresh mint1 bottle light rumsmall bag of sugar*This amount of fruit will make approx. 4 generous drinks.Tools:blendercutting boardsharp chef's knifemeasuring cup1-2 lrg freezer bagscitrus juicersmall saucepanstirring spoonsmall strainersmall bowlserving glassesfun straws!. There are many ways to go about removing the rind from a watermelon. If you have a tried and true method, go ahead and use it. This is my favorite way:Cut a nice flat slice off of the bottom of each watermelon, so that it can sit upright with lots of stability.. Using a sharp chef's knife follow the curve of the watermelon's surface and remove the rind in strips, until the whole melon is rind free.. Slice and cube each watermelon.. Place the cubes into large freezer bags and place in the freezer overnight.. Take 1/3 of your bunch of washed, fresh mint and remove the leaves from their stems.Place the leaves in one hand, and clap down on the leaves with the other several times. This is a simple and quick way to release the flavor. Place the 'clapped' leaves along with 1/2 cup of sugar and 1/2 cup of water in a small saucepan.Bring to a boil, stirring occasionally.Remove from heat, strain, and let sit to cool to room temperature. Place in the fridge to cool it further.. Juice enough limes for as many drinks as you'd like to make. Each drink needs 2 tablespoons of juice.. Place all ingredients into a blender.. Blend.You are now one step closer to deliciousness.. 
Place your blended drink into a serving glass, add a sprig of mint and a fun straw.Now put your feet up, lean back in the last few rays of seasonal sunshine and enjoy!!NOTE: This is also scientifically formulated to give you even more to look forward to for next watermelon season...\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_37_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_37_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_37_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_37_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Tools\n-utility knife (Scissors work fine)\n-Oven or source of heat that can get up to 250 degrees\n-Oven mit's\n-Bowl, Knife, measuring cups, etc.\n-acetone\nMaterials\n-strawberries\n-rhubarb\n-salt\n-pie crust\n-can\n-sugar. Cut the can to how deep you want your pie to be with an exacto knife or a pair of scissors.\nIf you plan to bake inside use steel wool or a steel bristle brush first to remove the paint.. To make the pie you will need 1/2 cups of chopped rhubarb (Making sure to remove the leaves which are poisonous) into about half an inch pieces, 1/4 cup of chopped strawberries, 1/4 cup of sugar, and pie crust.First you layer the inside of your can with the pie crust, then you fill it with the ingredients listed above and then you put a the rest of the pie crust on top of your pie and cut a slot to let out steam. 
then you put it in the oven and/or fire and wait about 10 minutes.If you want to make you're own crust the link below will show you how i used a oil pie crust recipe because i didn't have shortining. Oil crust:http://busycooks.about.com/od/piecrustrecipes/r/oilpiecrust.htmRegular crust:http://allrecipes.com/HowTo/Making-a-Pie-Crust-Step-by-Step/Detail.aspx. Grab a spoon/fork/knife? and dig in!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_38_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_38_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_38_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_38_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. All of the 3d printed parts1 wiper motor (wiring instructions wiring instructions. I recommend the higher speed.) 1 power supply1 gearbox kit1 ring burr1 cone burr1 fastener kitOptional: 4x sorbothane feet. Insert the two bearings into the 3d printed part named \"bottom.stl\". Take the axle that comes from the gearbox kit, and thread it into the cone burr. Add the white plastic piece that comes from the gearbox kit to the assembly, making sure the plastic tabs fit into the metal holes, and the piece seats properly. Insert the conical burr/axle into the piece from the previous step. Add the printed part named shaft_coupler_top.stl to the assembly as shown, making sure to face the rounded corners up and into the part.Then add the internal tooth lock washer that came with the gearbox kit. 
Then add and screw on the nut that came with the gearbox kit. . You will need to remove the white plastic the ring burr comes with in order to perform this step. Insert the ring burr into the top as shown, making sure to align the flats of the ring burr with the nubs in the printed part. Turn the ring burr 90 degrees inside the printed part. Insert the worm/grub screws into the holes on the sides to lock the ring burr in the part. . Using template.stl, assemble and drill as shown. I recommend using a drill press and to center punch the holes to keep the drill from wandering. . Add the motor adapter you just prepared to the motor shaft. Add the 5/16\" SAE washer Add the m6 nut and tighten until the motor turns a little bit. Add the shaft_coupler_bottom.stl Add m3 washers and bolts Attach to bottom with either m3 lock nuts or with lock washers and nuts. . Put the bottom assembly on to the motor assembly. If the bottom assembly does not seat all the way, you will need to rotate the conical burr to make sure the shaft couplers engage. The bottom of the bottom assembly should touch the top of the motor assembly. . Glue the feet on to the bottom of the motor. The grinder can get pretty noisy, and it's a good idea to use vibration dampening feet like the sorbothane feet seen here. . Screw on the adjuster (shown in blue) to the top piece (shown in yellow) as shown, taking care to line up the adjusters tabs with the nubs on the top piece. Insert the top assembly into the bottom assembly so that it is seated fully. Rotate the adjuster to adjust the grind. . 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_39_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_39_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_39_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_39_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. For this recipe you will need:1 c blueberries2 c all-purpose flour2 1/4 c buttermilk4 tbsp melted, unsalted butter3 tbsp sugar1 tsp salt1/2 tsp vanilla extract1 tsp baking soda2 tsp baking powder2 eggs. Using an electric mixer, beat the eggs until they become frothy.. Add in all of the other ingredients except the blueberries to the eggs and stir with a spoon until they are combined, but do not over mix.. Now pour 1/3 c batter on a griddle set to medium high heat. Top the pancakes with blueberries.. Cook the pancakes for 2 minutes, or until golden brown, then flip and cook the other side for 2-3 minutes until it is golden brown also.. Keep the pancakes warm while cooking the rest by placing them on a baking sheet in an oven preheated to 200\u00b0.. 3 c blueberries1 c water1/2 c sugar1 1/2 tbsp cornstarch that's been dissolved in 3 tbsp water1/2 tsp vanilla extract. In a small saucepan, add 1 1/2 c blueberries, water, sugar and vanilla and cook on medium high heat until it starts to boil.. Stir in the cornstarch and lower the heat. 
Cook for an additional 2-3 minutes, or until you get the desired consistency of the syrup that you want.Remove the saucepan from the heat and stir in the rest of the blueberries.. Top the pancakes with the syrup and enjoy!If you make these, let me know what you think!!!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_40_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_40_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_40_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_40_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Ingredients: \u2022 1lb box of Duncan Hines Red Velvet Cake Mix \u2022 1 \u00be Cups of water \u2022 \u00be vegetable or canola oil \u2022 \u00be Cups of apple sauceUtensils: \u2022 Mixing bowel \u2022 Measuring cups \u2022 Non-stick pans (preferably round) \u2022 Mixer (optional). Once completely poured into the bowel, stir the mix to get out thick chunks before adding in other ingredients. Otherwise, they will be much harder to get out later on. Also, set your oven on at 350 for time to preheat. . First, add 1 \u00be cups of water. Second, add \u00be cups of\u00a0 vegetable or canola oil. Finally, add \u00be cups of apple sauce. Mix all ingredients thoroughly until it is a smooth batter. . If a non-stick pan for baking is unavailable, use grease or butter to coat another pan. If your pan is too small for all of the batter, use another one (should fill a 3 inch pan about half way). Allow for around 30min of baking time. 
Check on the cake once 30min is up.. Check to see if the cake is thoroughly baked by sticking a fork into it. The cake should be moist, but it shouldn't crumble or be smashed by the fork.. This cake will be much more fragile than a normal cake, so be very gentle when applying the frosting if keeping the cake intact is important. \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_41_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_41_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_41_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_41_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Typically any old espresso grind will do, even Turkish grind can be okay if you don't mind the solids coming through.If you want to achieve superior quality though I HIGHLY recommend a nice burr grinder.I recently purchased the Hario Skerton, A few people on the internets believe its supposed to be Skeleton but the asian producers messed up. Either way its a fantastic product!Here's how I adjust mine;Tighten the grind setting until the mechanism will not turn freely (be gentle here)Now find the first notch that the lock will fit into, I call this position 0Now move the nut up a position this is position 1...Continue to position 3,This is the ideal grind for me while making espresso! I wouldn't exceed position 5.I use a 1:1 proportion when I make espresso. 
In a typical espresso double shot there will be 3 oz of finished espresso, Therefore I will start with 3 oz of beans or ground coffee.. Depending what drink you are making you may want different flavors in your brew. The flavor profile of your beans can be altered using different temperature water. Higher temperatures will bring out roasty/burnt tones and lower temperatures may only pull the fruity/acidic flavors. Finding a nice balance can be tricky and depends on the roast of coffee you use. I like to buy light roast coffee and brew around 150-160 degrees F to make a fruity/citrus flavor and add sugar to take off the acidic edge.. I line up the center of the (4) with the surface of the rubber plunger, this lets you reference shots (1.5 oz) on each progressive circle. Using the press upside down seals it from leaks and lets you fully saturate the coffee before steeping and pressing through.. Just add a tiny bit of water at first, this lets the coffee absorb the water fully before being saturated and helps later with frothing. stir it up and let it breathe for a few seconds.. Add the rest of your brew water and get the coffee suspended evenly. I typically fill a bit past the 2 while upside down for espresso. If you want drip style, perhaps a bit stronger, Fill to the top at this step. Cover the press with your filter and cap assembly, this will help retain some heat.Wait at least 2 minutes, or longer. The flavor may improve but this really depends on the roast your using. If you just want the caffeine boost steep for a long time! Like 10 minutes! Woo!. After waiting patiently Press that liquid gold through and watch the oils seep through in the foamy goodness! Enjoy a nice dark espresso or mix it into hot milk for a cappuccino or more milk and sugar for a latte. 
Makes a nice americano with hot water as well!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_42_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_42_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_42_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_42_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. You will need:Bacon - 1/4 lb or 4-5 strips/slicesBroccoli - approximately 4 cups of florets or 2 bunchesRed Onion - 1/2 large onion or approximately 1 & 1/2 cups when choppedPomegranate - 1/8 to 1/4 cup arils or approximately 1/4 of one pomegranateCheddar Cheese - 1/2 c shredded or 50 gramsMayonnaise- 1/3 cupWhite Vinegar- 1/2 cupWhite Sugar - 1/2 cupBlack Pepper- 1/4 tspSalt - 1/2 tsp Cutting Board &\u00a0Knife Cheese Grater Large Mixing Bowl & Salad Tongs Two Medium Bowls & a Spoon / Whisk Smaller Serving Bowls & Utensils. In a medium bowl, combine: 1/3 c Mayonnaise 1/4 tsp Black Pepper 1/2 tsp Salt Mix with a spoon or whisk until smoothly blended. Add 1/2 c Sugar and mix again. Add 1/2 c White Vinegar and mix once more. Refrigerate and shake or re-mix before serving on salad.. Cut your strips of bacon in half and pan fry. Drain on paper towels and let cool before breaking strips into smaller bite-sized pieces.. Seed your pomegranate ahead of time and set arils aside.https://www.instructables.com/id/How-To-Seed-A-Pomegranate/. Chop or dice broccoli & onion and set aside together in the large mixing bowl.. 
Grate 50 grams of cheese to make approximately 1/2 cup once shredded and add to large mixing bowl.. Add bacon bits and pomegranate arils to the large mixing bowl (which should already contain broccoli, onion and cheese).\u00a0Toss salad and serve with dressing on the side (or add dressing and toss again before serving). You can also opt to chop the ingredients more finely as shown in the third image of this step.This recipe makes enough to Serve: 5-6 people as a side dish 3-4 people as a lunch 2-3 people as an entree For a vegetarian meal, remove the bacon. For a vegan meal, remove the bacon and cheese, use vegan mayonnaise and add chopped walnuts. This recipe is gluten-free.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_43_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_43_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_43_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_43_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 500 g blackberries100 g sugarAbout 500 ml vodkaStrip of lemon peel. Put the fruit and sugar in layers in a preserving jar. Add vodka to cover. Shake every so often on the first day to dissolve the sugar.Store in a dark place for about a month.. Strain the liquid into a clean dry bottle through a layer of paper towel. Seal and label.. 
Serve in small glasses - it isn't that high in alcohol but it is full of flavour and quite sweet.It is delicious mixed with sparkling wine.It is a lovely addition to desserts such as trifle.You can make similar liqueurs from any kind of berry. Raspberries are particularly nice, and the product is a beautiful ruby red colour. Black currants need more sugar.It keeps for ages - I have kept it a year, when I lost the bottle in the back of the cupboard!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_44_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_44_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_44_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_44_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Note: I usually make a few of these while cooking up other fried foods, so normally I would use a\n\u00a0 \u00a0 \u00a0 \u00a0 \u00a0 Bulb or two of garlic and whatever batter or breading That is left over. 
But for this, I will post a\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 mini version of one of my batter recipes , just in case you want to make the garlic by itself.\n5 bulbs of garlic\noil for fryingTHE BATTER\n1 cups flour\n1Tbsp Corn Starch\nA Pinch of salt\n1/4 tsp pepper\n1/4 tsp garlic powder\n1/4 tsp onion powder\n1/4 tbs cayenne\n1/2 tsp chili pepper\n1/2 tsp cumin\n1/4 tsp adobo seasoning\n1 beaten egg\n1/2 cup beer\n1/3 cup milk\n3 Tbsp melted Butter ( i use coconut oil)The Buffalo Sauce (optional):\n1/2 cup Louisiana hot sauce\n2 TBSP\u00a0 butter\n1 zip lock bag or small, air-tight container\n\u00a0Note: you can make half batches of the sauce for smaller amounts or garlic, if you are\u00a0 frying other treats.. 1.Mix flour, corn starch, salt, pepper, and all other dry seasonings together in large bowl\n2.In another bowl, Mix in eggs, Beer, milk, and melted butter\n3.Pour wet mixture into dry ingredients bowl and stir until well blended. It should be just a little runny\n4. Let sit in the refrigerator for 30 to 45 minutes.. 1. Peal off the garlic skin and cut off the hard bottom pieces.\n2. Heat oil in pan.\n2. Place cleaned garlic cloves into batter and coat thoroughly.\n3. Place coated garlic cloves into hot oil and cook until golden brown. (about 5-6 Minutes, flip half way\n\u00a0\u00a0\u00a0\u00a0 through)\n4. Remove from oil and drain on paper plate or towel.. 1. Combine hot sauce and butter in a small Glass or bowl.\n2. Microwave sauce on high until the butter is melted, About 45 to 60 seconds.\n3. Mix until well blended, then Pour sauce into a zip lock bag or container.\n4. Place cooked garlic cloves into zip lock bag (or container) and seat tight.\n5. Gently shake bag until all garlic cloves are completely coated.\n6. 
Remove garlic from bag and serve with blue cheese or ranch dressing.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_45_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_45_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_45_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_45_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. use plasterstrips to get an imprint of your chest - you want the bra the right size.\nI'm not showing that step, due I don't want to see my chest on the internet ;)\n- i'm sure you'll figure out how to do that.\nInstructions are found on the package of the plasterstrips.\nWhen the imprint is dry cover it with aluminiumfoil.. you can use all cookie doughs that can be used with cookiecutters.\nthe recipe I used:\n0,5 egg\n125g flour\n62g sugar\n62g butter\n1Tablespoon Cocoapowder\n1 teaspoon vanillasugar\n1 little bit bakingpowder\nand for flavour 2 tablespoons of instant cappuccino. Form the cups of the bra on your covered mold.\nmake sure to make it evenly thick - about 0,5 cm\nbake it on 200\u00b0C for about 10minutes ( may vary with another recipe). at this point you can get as creative as you want :)\nHere's what I did:\nmelt some white choclate in the still warm oven\nspread it with a clean brush on the warm bra.\nmix some white chocolate with cacoa-powder\nand paint whatever you like :)\nbrush some chocolate on the edge of the bra and sprinkle it with chocolate-pieces\nlet everything cool down.. 
carefully peel the foil of the mold\ntake a corkskrew and make holes to join the two cups in the middle - be very careful!\ntie the cups together with a nice lace or string.\nYour done!. Now surprise your beloved one and have a nice cup of tea!\n- Or whatever you like\u00a0 :D\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_46_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_46_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_46_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_46_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. You'll need the following ingredients:\n-5 ounces good quality dark chocolate, melted\n-1 pint of vanilla ice cream (there's 1.5 quarts in the picture because I always buy what's on sale--I won't tell if you do, too!)\n-2 (4 inch) cherry pies, chopped into small pieces\n-12 ounces chocolate cookies (can be plain chocolate cookies or chocolate animal crackers or chocolate graham crackers)\n-8 tablespoons unsalted butter, melted\nYou'll need the following equipment:\n-9.5 inch pie plate\n-parchment paper\n-pencil\n-food processor. This is the sketching stage. Aka the part of the project where I actually attempted to use pi. My pie plate is 9.5 inches in diameter. 
If you think back to high school math you might remember the following equation:\u00a0\nC=\u03c0D\ncircumference =\u00a0\u03c0 * diameter\nIn this case the diameter of my pie-plate is 9.5 inches so the circumference is approximately 29.85 inches knowing that I wanted to express\u00a0\u03c0 to 32 digits with room for ellipses, I estimated that I'd have to make each of my numerals less than an inch to fit it in. First I took a large sheet of parchment paper and I traced my pie plate. With my handy circumference estimate in mind, \u00a0in mind I drew the attached pattern free hand attempting to center the pi symbol in the middle.* Because I have no idea how food safe my charcoal drawing pencils are (and the idea of eating something that had been piped on top of charcoal was kind of gross) I affixed another blank piece of parchment on top so the pattern shows through but I can pipe on a clean surface.\n*When it came time to put my pie together everything fit, but it was a tight squeeze. I realize with the benefit of hindsight that I forgot to factor in the crust around the edges. To be more precise, you'll want to leave a 1/4 crust allowance, so in this case I'd draw my circle as 9 inches to reflect the layout with that allowance in mind.. Next fill a pastry bag with your melted chocolate and fit it with a small round tip (I used the #4 Ateco tip that came in my piping set). Carefully trace over the pattern that you created. Once you've piped all the numbers and the large pi symbol, you can pipe as many small pi symbols as you wish--I suggest at least 50 so you get crunchy chocolate bits in each bite. Place patterns with the piped chocolate in the refrigerator to harden for about 10 minutes.\u00a0. Add chocolate cookies to bowl of a food processor and pulse until crushed into tiny pieces. Pour in melted butter and pulse until thoroughly combined. Press mixture into a 9.5 inch pie plate.. 
In a large bowl combine 1 pint (16 ounces) worth of slightly softened vanilla ice cream with 2 (4 inch) cherry pies chopped into small pieces. Use a spatula to fold the pie pieces into the ice cream, working quickly to avoid a giant melty mess. Once the vanilla ice cream and pie have been thoroughly combined, gently fold in the small pi symbols and transfer the entire mixture into the prepared pie plate, using your spatula to spread and distribute the filling so the surface is smooth. Remove the piped chocolate pieces from the refrigerator and using the pattern as your guide place them on the top of the pie. Freeze for at least an hour to up to overnight to firm the pie up before cutting and serving. Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_47_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_47_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_47_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_47_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Turn cake upside down so you have a nice flat top, cut a small vertical line in cake. Into three , put knife in the side and other hand on top of cake and use turn table to cut across evenly , make two evenly spaced cuts. Fill with buttercream and jam , be careful to make sure the cut edges on cake match up , otherwise you may get a wonky cake . Use a large knife or spatula to cover cake in buttercream , use smoother and turn table to turn cake round whilst smoothing sides. 
Colour and kneed fondant until soft , using a little corn flour or icing sugar to prevent sticking on work surface. Roll fondant to shape of cake , approx 2-3 ml thick , keep turning fondant round to keep even width , use rolling pin to pick up fondant and place on top if cake . . Smooth with your hands and then use smoothers to smooth to fit cake, cut off spare icing , and continue to smooth using turntable to help \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_48_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_48_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_48_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_48_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Pull the head/tentacles away from the body. Scoop any remaining guts out of the body, and remove the thin plasticy quill. Rinse the body inside and out, then slice into approximately 3/4-1 inch thick rings.\nSqueeze behind the head to extrude the beak, and remove it from the center of the tentacles. Cut just below the eyes to free the tentacles, then add them to the bowl with the body rings.\nTentacles are the best part. No, really- they're fantastic.. Bring a pot of water to a boil. Add a bit of salt and a bit (1-2 Tablespoons) of wine or citrus juice. Drop the squid into the water in batches, removing it just as it turns opaque. This should take less than a minute, so be ready with a slotted spoon.Deposit the cooked squid on a paper towel to cool and dry.. 
Combine:\njuice of 2 limes\n~1 Tablespoon hot chili/garlic sauce (sriracha)\n~1 teaspoon sesame oil\n~1/2 teaspoon fish sauce (or to taste)\n~1 teaspoon rice vinegar\n1 kaffir lime leaf, finely minced (zest from those limes makes a fine substitute)\n3 finely minced shallots\n2 Tablespoons brown or palm sugar (honey or agave nectar are good substitutes)\nhandful fresh mint, finely minced\nhandful fresh cilantro, finely minced\nsalt and pepper to taste\nStir it up and taste. It should be aromatic, spicy, and acidic with a touch of sweet. Adjust the seasonings as necessary to make the sauce taste good to you.\nNote that I resisted the temptation to add a grated garlic clove to the mix- there's already garlic in the sriracha, and I didn't want to overpower the squid.. Add squid and give it a stir. Let it sit in the marinade for a bit, preferably in the refrigerator for about half an hour. More marination certainly won't hurt; you can leave it overnight if you like.. Serve cold. The longer the squid marinates the better the flavors will penetrate. This will keep for a day or two, but like any seafood it shouldn't be left to moulder in the refrigerator. We've never had any problems of this type, as this salad disappears quickly.\nGarnish with any of the herbs used in the salad and serve on funny-looking plates. 
For best results, make sure all the tentacles are showing.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_49_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_49_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_49_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_49_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Ingredients for the cupcakes:-113g Unsalted butter (softened at room temperature)- 1 3/4 Cup plain flour - 2 Teaspoons baking powder - 1/4 Teaspoon salt - 1 Egg (room temperature) - 3/4 Cup milk - 2 Teaspoon vanilla bean paste - 2 Tablespoons Canola / vegetable oil Butter Cream Frosting- 3 Cups icing sugar- 113g Unsalted butter, soft - 3 Tablespoons Milk - 1 1/2 Teaspoons vanilla bean paste- Yellow food colouringDecoration/ Toppings- 1 Punnet of Fresh Strawberries (250g)- 1 Teaspoon of sugar- Some leftover dark chocolate cake (about 2 cups worth)- Grated white chocolate. Equipment- Electric beater- Large bowl- 12 hole muffin pan- 12 Paper muffin liners- 1/4 cup measurement- Whisk- Piping bag with multiple hole tip- Small sauce pan- Small sieve- Grater. The first thing to do is to make one large batch of cupcake batter by creaming the unsalted butter (113g) in the kitchen aid mixer until the butter becomes light and fluffy. Then add the 1 cup of caster sugar and continue to beat until light and fluffy. 
Then add the vanilla bean paste and egg to the butter and cream and continue to beat them.Add the baking powder to the plain flour and whisk it together to get rid of any lumps then add 1/3 of the flour mixture to the butter mixture and beat it together. Then add 1/3 of the milk and continue to beat the mixture. repeat adding 1/3 of the flour and milk mixture alternatively until all incorporated. Finally add the 2 tablespoons of oil to the mixture and stir until combined.. Fill the muffin tins with the paper liners. Take the 1/4 cup measuring cup and fill each lined muffin hole with 1/4 cup of cake batter. Then bake the cupcakes in a preheated fan-forced oven at 180 degrees Celsius for 17-20 minutes until the cupcakes are golden brown and springy to the touch. Let them cool for about 10 minutes then transfer them to a wire rack to cool completely.. While the cupcakes are cooling make the strawberry 'tomato pasta' sauce as this will need to be cold when put on the cupcakes. Wash the fresh strawberries and cut them into small quarters. Place them in a small saucepan and let the cook for about 3-5 minutes on medium low. Then add 1 teaspoon (or more) of caster sugar to the sauce and let it cook until it gets nice and thick. This will slowly break down the strawberries into a thick sauce. To make the sauce smooth you can also break up the berries even more with a spoon and/ or sieve it to remove seeds. Then place this in the fridge until it has cooled completely.. Take the 2 cups of left over mud cake (with the frosting on it) and squeeze it in your hands to roll out small chocolate 'meatballs'. I prefer to have 3 small 'meatballs' on top of the cupcakes so you will need to make about 36 meatballs for 12 cupcakes. You can also make large ones as well but it looks more cute with smaller ones as it is more true to actual spaghetti and meatball size. Then place the meatballs in the fridge to firm up.. 
Once the cupcakes and strawberry sauce have cooled completely make the butter cream frosting. Place the unsalted butter in the electric mixer and beat until it is light and fluffy. Then add 1 cup of icing sugar to the butter and beat until incorporated, then add 1 tablespoon of milk and mix, adding alternatively until all milk and icing sugar has been added. Then add 1 1/2 teaspoons of vanilla bean paste and a few drops of yellow food colouring to the icing and beat it until the colour spread equally through the icing.. One all the separate parts of the spaghetti have been made they can be assembled. First using a small round tipped piping bag place some of the butter cream icing in it. Then in random or circular motions pipe the icing loosely on the cupcake to resemble spaghetti noodles. Then add your cold strawberry 'pasta sauce' on top of the 'noodles'. On top of the sauce place your chocolate cake 'meatballs' (3 small ones looks the best). Add a small amount of sauce on top of the cupcakes then to finish it add some grated white chocolate 'parmesan cheese'.If your friends are vegetarian do not add the 'meatballs' and if they are lactose intolerant do not add the 'cheese' as I have done for some of them.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_50_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_50_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_50_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_50_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the 
context:\n. Consumables(2) half-cup sticks of unsalted butter(4) grams of vaporizer leftovers*(1) gram of ground fresh cannabis(3/4) cup of watercheeseclothKitchen suppliesstovetop range with a low heat setting 1 quart saucepanwire mesh strainer 2-cup round dishrefrigerator*Vaporizer leftovers vary in potency. A potent strain will yield more potent leftovers. The amount of heat applied from the vaporizer will determine how dark your roast looks: lighter color generally indicates that you'll get more potency whereas a darker color indicates that you extracted the potency with the vaporizer and may need to supplement with additional leftovers or the addition of fresh herb.. Add the butter, ground herb, and water to the saucepan. Set the burner to the lowest heat setting. Set a timer for five or six hours, then wait. Stir occasionally, making sure to scrape any errant bits of material from the sides of the pan.The low heat setting ensures that you don't inadvertently overheat the THC, thereby compromising its potency. Because THC is not water-soluble and the dried herbs float, the addition of water to the mixture is solely to float the ground herb away from the heat source.Low and slow is the way to go to ensure that the THC bind with the lipids in the butter. . After your mixture has simmered for several hours, it's time to strain it to remove the solids from the mixture.To do so, simply lay a square of cheesecloth over your mesh strainer. Secure the sides with one hand while pouring the mixture over the cheesecloth. The warm butter and water will pass through the cloth and mesh, leaving behind a buttery pile of damp solids. Use a spoon to squeeze out the remaining liquids, and you should end up with a dish of greenish-brown butterwater. There may be some fine material in your cooling dish, and that's okay. 
It'll rinse off after the mixture separates and cools.Let the mixture cool at room temperature, then cover it and pop the dish into the fridge for several hours to help the fats solidify. I recommend leaving it overnight to solidify. If you must hurry, do not use the freezer. Instead, fill a larger dish with cold water and place your smaller buttery dish into it to speed up the chilling process.. Behold the power of specific gravity!You'll notice that the fats all rose to the top of the dish, where they solidified into a puck of butter. The water settled to the bottom, and any tiny bits of floating herb floated above the water but below the butter. To drain the water, I use a chopstick to poke a hole along one edge of the butter puck and another on the opposite side. Then simply pour the water out and discard it.My butter puck has little brown bits of floating herb stuck to it. A quick rinse under cool water washes that right off, and you're left with a clean batch of cannabutter. . You should be left with just under 2 cups of clean, green cannabis butter. Use it when baking, as a topping on toast, mixed into tea, or eat it plain. For the first few attempts at this recipe, pay close attention to the potency and use more or less vaporizer leftovers as needed. Ingested marijuana takes longer to reach your bloodstream than vaporized marijuana. It can take anywhere from 30-90 minutes to feel the effects. If you have just eaten a heavy meal, it will take longer for the THC to reach your bloodstream. Conversely, if you haven't eaten for several hours, the THC will hit your bloodstream faster. The psychoactive effect lasts longer when THC is ingested (up to eight hours) so don't drive, operate heavy machinery, or hang out with your in-laws in the hours immediately following ingestion. 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_51_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_51_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_51_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_51_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. -Take little ghee in vessel and add chopped nuts and raisins of your choice.Saute them till they change little brown color.Remove and transfer them to plate. To same vessel add teaspoon ghee and add half cup semolina.Saute till it changes little color and you get nice aroma.. Add 4 cups milk and stir continuosly on medium flame.. Add 3/4 cup sugar and previously sauted nuts and raisins and mix. Cook for about 20 minutes by mixing in intervals till semolina is cooked and milk is reduced in volume. 
Finally garnish with nuts and raisins and serve .\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_52_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_52_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_52_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_52_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. We used a slight adaptation on Martha Stewart's Fresh Beet Pasta recipe (see link)Ingredients:8 ounces red or golden beets (about 2 medium)2 large eggs1 large egg yolk2 1/2 cups all-purpose flour (white or stone ground), plus more for dusting1 teaspoon salt. Preheat oven to 375F Rinse beets to remove dirtPlace beets in casserole dish and fill halfway with water We found that this took approximately an hour and a half with our large beets - would recommend an alternate roasting method (wrap in foil) or if you are also dealing with large beets, cutting them in quarters will helpCook beets until softenedBeets are done when skin rubs off easilyPuncture with a fork for an early read - should easily pierceCool beets and remove skin. Cut skinned beets to fit your food processor Puree thoroughly - may still look a little chunky In food processor, mix the following: 1/2 cup pureed beets2 whole eggs 1 egg yolkPuree until lightened in color. 
Mix in a bowl 2 1/2 cups of flour (all purpose white or all purpose stone ground for a chewier texture) 1 teaspoon of saltAdd beet-egg mixture and mix until combined (should look slightly dry) Turn dough onto clean, lightly floured counter top and knead until firm Dough should maintain form but not be crumbly. Separate dough into 3 parts Roll dough with rolling pin until very thin (thickness around a penny is ideal) Maintain a well floured surface Frequently flip/move dough to reduce stickingNote: Divide into more pieces if you are working with less counter space. The thinness of the dough is critical to the final mouthfeel of the bowties. We used a small glass from a brewery tour as a make-shift cutting toolNote: You can buy a special tool for the more traditional crimped edge (link here) and cut pasta into wide strips, and then smaller squares and proceed . Place finger in center of circle Pinch outer edges towards center Remove finger and pinch middle (hard) to form a seal Admire your work. After forming bowties, set aside for 10-15 minutes to dry a little. This will help them maintain their form during and after cooking. . Drop bowties into simmering salted water in small batches You know the pasta is done when it floats to the top Strain pasta and drain of waterPlace cooked pasta aside, add olive oil to prevent stickingAdd any additional olive oil to tasteSprinkle goat cheese crumbles on top after plating. Rosemary Chicken (Roasted in the oven at 350F until cooked through) Chicken tendersFresh rosemaryOlive oilGarlicSalt & PepperBaby Squash Medallions (pan fried)Chopped baby squash Olive oilGarlicChopped onionsLemon juiceTruffle oilSalt & PepperShaved Parmesan3. Side saladRoasted beetsLettuceGoat cheese (crumbled on top). 
One serving shown here with Grapefruit Gin Fizz Cocktails\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_53_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_53_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_53_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_53_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. INGREDIENTS:\n1 lb. of thawed ground beef or deer burger\n1 tsp. of salt\n2 dashes of pepper\n1 dash of worchestershire sauce\n1 dash of liquid smoke\nA liberal covering of papaya enzyme or meat tenderizer\nA handful of chopped dried fruit (cherries, blueberries, and strawberries work well)\n. Chop the dried fruit and mix all of the ingredients by hand thoroughly.\n. Roll the meat into the size of a golf ball. Place it on aluminum foil and press the meat flat so it dries evenly. . Place it in your food dehydrator or the oven with the door open at 200 degrees for 4-6 hours. . The jerky is done when it firmly bends. If it has drops of oil on it, just dab with a paper towel. 
\nTo store, just place in a ziplock and put it in the fridge.\nDelicious!\n\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_54_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_54_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_54_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_54_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. To match the colors of candy corn (yellow, orange and white), the following ingredients were used:Pineapple slices Mandarin OrangesYogurtWhipped Cream (optional)I didn't provide the amount of ingredients since it really depends on the size of your container. To make these two small glasses, 3 pineapple slices, 12 pieces of oranges, 6 oz of yogurt and a little bit of whip cream were used. . Dice the pineapple slices. Add to the bottom of the container.. Add a layer of mandarin oranges over the pineapple. . Add a layer of yogurt over the oranges. Top with whip cream, if desired.. Now, go eat!. 
Added a few props, including the wood version of candy corn from last Halloween :).\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_55_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_55_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_55_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_55_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Here's how to make the fondant. (There's a bunch of instructables on this...so check them out if this isn't clear enough) It's fun to mold...and great for cupcake toppers! You'll need: Powdered sugar, marshmallows and water. Put 4 oz. of marshmallows in a microwave bowl with 1 tablespoon of water. Heat them up in the microwave for about a minute until they are puffy! Stir them up. Add powdered sugar until smooth and able to handle. (I didn't measure...you just have to keep adding it until it's clay like enough). Sprinkle powdered sugar on a silpat or counter. Put a little bit of the fondant on it.\u00a0 Roll it thin with a rolling pin. about 1/8 inch. Get your sprinkles and gel caps ready. Use scissors to cut off the ends of the gel caps.. On the rolled out fondant... Place a pearl or sprinkle... Place the gel cap over the top... And press the gel cap onto the fondant... Pick it up and it should stay in place! It's perfect and wiggly! Make some matching sets. Think of all the occasions you could use these... Add them to fondant figures for cupcake toppers! Great for Halloween!. Now for the cute pumpkins! 
We added some orange food coloring to some of the white fondant... And some green to a little bit of the white... Rolled the orange into balls...pumpkins! Used a toothpick to create lines... Add a green stem... And, of course, add the googly eyes! Then create a little scene of friendly pumpkins!. Great toppers for cakes or cupcakes...brownies or other fun Halloween dessert! Hope you liked this instructable--we had so much fun! Check out my blog Doodlecraft for more fun! And don't forget to vote!\u00a0 :)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_56_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_56_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_56_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_56_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Heat your oven to 356f or 180c. Cook for 45 minutes to an hour.The potatoes are ready when they are folk tender.. . Place the sliced potatoes into a large mixing bowl.In a small bowl add two table spoons of whole egg mayonnaise, followed by one table spoon of sour cream.Mix together.. Then a cup of grated cheddar cheese.Next add our mixture of mayonnaise and sour cream.Season with salt and pepper.. To add some colour, add \u00bc of a cup of chopped chive. I like to use scissors for chives, it creates less mess and you only use what you need.Mix the chives into the salad.. 
To finish it off, top with the remaining cheese, bacon and chives.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_57_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_57_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_57_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_57_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. \n Here is what you need for this tasty pie!Ingredients:\n\n\t\tFlour Tortillas (I cut up the small to make the mini pies, and use a whole large one for the pie)\n\t\tVanilla Ice Cream\n\t\tCorn Flakes\n\t\tCinnamon Sugar\n\t\tButter\n\t\tHoney (So important, but I forgot to put it in the picture!)Special\u00a0Supplies:\n\n\t\tCupcake Pan\n\t\tPie Pan\n\t\tPastry Brush (if you have it, if not, I found a spatula\u00a0worked fine)\n\t\tRamekin\u00a0(or something of similar size for cutting out mini pies)\n\t\tCutting Board and Knife\n\t\tPlates\n\t\tOven and Microwave. \n Time to get the crusts ready. \u00a0I knew what I wanted to do, but not really how to do it. \u00a0Luckily, I found a nice tutorial online on how to bake cinnamon sugar tortillas.Mini Pies:\n\n\t\tTake your small tortilla, cutting board, knife and ramekin.\n\t\tTake your ramekin and lay it on the tortilla against one edge. 
\u00a0I planned it out so I could get two from each tortilla.\n\t\tCut around the ramekin.\n\t\tRepeat on other side of tortilla, they might not be perfectly round, but that doesn't matter.\n\t\tOnce you have how many you want, melt some butter in a bowl and get your pastry brush and cinnamon sugar.\n\t\tBrush butter on the tortilla and coat in cinnamon sugar. \u00a0Do this on a plage, it get's messy.\n\t\tFlip the\u00a0tortilla\u00a0and coat the other side with butter and cinnamon sugar.\n\t\tPut it in the cupcake pan, be careful not to tear it. \u00a0The sides will fold and ruffle.\n\t\tRepeat for all tortillas till the pan is full or until you have how many you want.Full Pie:\n\n\t\tGet a plate, your tortilla, melted butter, pastry brush and cinnamon sugar.\n\t\tCoat the tortilla, as you did with the minis, with butter and cinnamon sugar on both sides.\n\t\tPlace it in the pan, molding it to its shape.\nYou will be preheating your oven to 350 degrees\u00a0Fahrenheit. \u00a0I cooked both the mini and full for about 13.5 minutes. \u00a0I tried a full 15 like the linked tutorial says, but that was too much for mine. \u00a0You want it to start to get hard, but don't want them getting burned or too brown. \u00a0Let these cool. \u00a0If you use them right away, your ice cream will just melt in them. \u00a0You can make them ahead of time and store them in an airtight container until you are ready to use them.\n . Might as well prepare your pie topping while the crusts are cooking. \u00a0Take some Corn Flakes (there is no exact amount) and crunch it up. \u00a0You can leave the corn flakes and cinnamon sugar\u00a0separate\u00a0and add them on top of the pie\u00a0individually\u00a0or you can mix them now. \u00a0If you mix them, make sure when you are putting it on the pie that you get both. \u00a0The cinnamon sugar will want to sink to below the corn flakes.. \n Now it's time to put it all together. 
\u00a0So simple, isn't it!\n\t\tTake your pie crusts and put some ice cream inside. \u00a0Don't put too much in the full pie, a nice layer of up to an inch should do it. \u00a0\n\t\tFor the big pie, I put ice cream in a bowl first and smushed it up so it would be easier to put in the pie. \u00a0While the crust seems pretty strong, I didn't want to risk smashing it to pieces already. \u00a0I also used a knife to spread it in the pie pan after I put it in.\n\t\tSprinkle your pie topping on top of the ice cream.\n\t\tLastly, drizzle on some honey, it makes all the difference! \u00a0(It's kind of hard to see the honey in the pictures, but it's there! \u00a0Don't put too much now, a little goes a long way. \u00a0Best way to handle this, is put some on one of the minis and eat it and decide if you need more or less.). Eat your pie!\u00a0\nI originally wanted to do just a full pie, but didn't think it would work. \u00a0I thought that the crust (tortilla) would just crumble if you tried to cut pie slices. \u00a0Scoochmaroo suggested making minis and I thought that would be perfect!\u00a0 I love bite sized. \u00a0\nI decided to do a full sized pie at the same time, because I figured if it didn't work, I could still eat it! \u00a0In the end, it worked fine. 
\u00a0It cut pretty nicely, staying in tact.\u00a0\nYou can store the pies in the freezer and they seem to keep okay, but the sooner you can eat it, the better!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_58_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_58_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_58_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_58_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. What you need per serving for 2:100grams flour50grams Lievito Madre2 eggssaltYou will also need a filling, either take a pesto or mix 100grams spinach with 150grams ricotta. You can either make the Lievito Madre yourself which takes 25 days or buy a dried version.Tools:pasta makersilicon kneading matkitchen scale. This is actually easier than usually with pasta dough, because you already have a dough mixture and only add more to it. Keep in mind the Lievito Madre usually consists of flour and water in a ratio of 2 to 1. You probably use a discard of it after refreshing, keep the discard in a glass container in the fridge and when you want to prepare the pasta dough, add 100grams of flour and 2 beaten eggs. Now comes a little chemistry, the microorganisms in the Lievtio Madre are still alive, that is why you add a full teaspoon of salt. The microorganisms will die from plasmolysis, their enzymes are still active for some time. This will enrich the dough with a lot of flavour. 
Don't worry about the amount of salt, when you cook them in the water any excessive salt will diffuse out. Wrap the dough in cling film and store it in the fridge for 2 hours. If you think the dough is too wet then knead it on your silicon mat and gradually add more flour.. Take the dough out of the fridge and roll out the pasta on your kneading mat into thin sheets using the pasta machine. Then take a tumbler with a wide opening and press it into the dough to gain circular dough sheets.. Like I mentioned in the beginning, for the explanation I will use mashed Anko to achieve better pictures. Originally it is a mixture of herbs with fresh cheese or meat like in picture 2.. Lay one of the circular sheet between your thumb and index finger of your non-dominant hand. Put in a spoon of your filling. Push in a little fold with the index finger of your dominant hand. Then with the middle finger and index finger you make a fold from the right into the mid like in picture 3. With your thumb and middle finger you make a fold from the left like in the fourth picture.. Continue with the folding technique until you reached the end and it becomes too hard. Then just press the 2 sides together to form a tail (la coda).. Store the pasta on a plate with flour or they will stick to it. Do not dry them in the sun or the filling will leak on the bottom.. You need a big cooking pot with boiling salted water. If you need more information to improve your cooking procedure, have a look at my Nerikomi Pasta instructable. The pasta is big and it might make a splash if you just throw them in. If you have a small kitchen sieve, put the pasta in there and then gracefully lay them into the water. The pasta should be done after 5 minutes when they all swim on the surface. Drain them in your sink and saut\u00e9 them into a hot pan with butter. Add sage and then serve.. For an extraordinary effect when you serve guests, try black dough with a red filling. 
Don't be worried the white dough for storage could make it look grey, it will mostly wash off during cooking.Enjoy your pasta!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_59_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_59_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_59_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_59_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. What you will need:\nRice Krispies or similar\nSugar\n1 jar of smooth peanut butter, smallish size, probably 1lb but don't have any right now so can't check!\n1 1lb (454g) can of golden syrup\nChocolate for coating, 8oz (225g) or so.\n1 large saucepan\nSpoon for mixing, easier with a metal one\nLarge tray, this one is about 12\" by 18\" (31 by 36cm)\nSpatula, rolling pin or anything to smooth out a stiff mixture.\nGreaseproof paper (baking parchment)\nSome strong arms, either yours or borrow a pair from somewhere.. 
Put all the peanut butter and syrup into the pan, scraping it all out, then fill up the syrup can with sugar and pour that in too.\nHeat it while stirring over a low flame until the sugar has mostly dissolved and it looks as runny as it's likely to get.\u00a0 Don't heat too fast or leave it to stand for too long because it will start burning on the bottom easily - you're not making toffee, just melting it all.\nBeware, this gets very hot and will stick to you and burn if you try putting your finger in!\u00a0 Don't be tempted to try any without cooling it first.\nThe nutty smell may attract raiders from the garden, have a peanut or two on hand to fend them off!. Using the syrup can again, fill it up with Rice Krispies and tip them in.\u00a0 Do this five times, but not all at once because it'll be pretty hard to mix without spilling rice all over the place.\nNow mix it up!\u00a0 It's a very tough job so this is why you'll need the strong arms... hopefully you managed to borrow a pair.\u00a0 Keep going until all the rice is coated and it's an even mixture.\nWhen it's done, line the tray with baking paper and tip the mix onto it.\nSquish it roughly flat with a spatula or spoon and then use a rolling pin (or just the spatula) to get it as smooth as possible and covering the whole tray.\nSet it aside to cool for a bit while you melt the chocolate.\u00a0 Maybe take a short break if you think it (or you!) needs it.. Melt the chocolate the normal way over hot water, then pour it over the chew and make it even.\u00a0 It might not look like you have enough but this amount is just right.\nDone!\u00a0 Leave it for a while to cool until it's ready to cut.. Now another hard part.\u00a0 Score lines down the chocolate with a sharp knife and then slice it into squares.\nThis gets tiring quickly... 
I had a blister after doing several trays of this.\nAnd that's it!\u00a0 Except for the washing up.\u00a0 This may look like a nightmare but it actually comes off very easily with just warm soapy water apart from any burnt toffee on the bottom of the pan, which will need chipping off with something.\nThese freeze well, and even taste good when frozen so don't count on them being safe in the freezer.\u00a0 I've thought of other stuff like maybe adding small marshmallows with the rice but haven't tried it yet...\nEnjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_60_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_60_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_60_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_60_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. So you take the chips and put them on the plate. Put as much as u want . Now put the first layer of cheese on top of the chips again put as much as u want remember it's only the first layer. Now it's time to put the second layer of chips so just put chips on top of the first layer . It's time to up the second layer of cheese on top of the second layer of chips and u can put as much cheese because this is ur last layer. Now heat up the nachos for 30 seconds in the microwave . 
Now you get to eat it!!!You can add different toppings if u want it's totally up to you \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_61_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_61_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_61_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_61_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Cut ends off of the rutabaga and scrub it well under running water since its a root vegetable.\nPeel the root with a sharp knife, cutting away the outer skin all the way around the vegetable. Be careful to cut away from your body on a cutting board, the skin can be tough to cut.. Cut the root into matchstick sized pieces. Try to make the pieces as uniform as possible so they will cook in the same amount of time.. Boil a pot of salted water.\nWhen the water is simmering, add the matchsticks and let them cook for 3-4 minutes until they are just able to be pierced by a fork.. Drain the fries and shake out the water.\nPrepare a dish with olive oil and your choice of seasoning, I used salt and freshly ground pepper.\u00a0\nLay out the matchstick sized pieces on a cooking sheet in a single layer and baste the fries with the olive oil mixture.\nBake for 15 minutes. When they are ready they should be easily pierced by a fork but crispy on the surface and a little browned but not black. Be careful not to overcook the fries because they will be dry and chalky.. 
I prepared fries from the rutabaga (the orange colored ones) and also yucca fries (the white colored ones). Yuca fries are prepared in the same way. Boil them separately though so the flavors don't mix as rutabaga can be a strongly flavored vegetable.\nEnjoy! They are best served piping hot!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_62_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_62_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_62_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_62_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Dough\n1 package active dry yeast\n1 1/4 cup warm water\n1 tablespoon sugar\n2\u00a0 tablespoons olive oil\n1/2 teaspoon salt\n2 3/4 cups whole wheat flourFilling\n2 cups shredded mozerella\n1/2 cup tomato sauce\n2 cups of your favorite pizza toppings. 1. Mix warm water, yeast, sugar and oil. Let stand for about 10 min. Mixture should be bubbly.\n2. Add salt and flour. Mix until the dough sticks together. Knead dough until it becomes smooth, about 10 min. Add water as needed.\n3. Put the dough in a bowl, cover with and cloth and allow to rise for about an hour, or until it doubles in size. Placing the dough in a warm spot will help the dough rise faster. Prepare the filling while the dough is rising.\n4. Divide the dough into 12 equal pieces and form balls.. 1. Chop your favorite toppings into small pieces, this will be the bun filling.\n2. Stir shredded cheese and tomato sauce into chopped topping mixture.. 1. 
Roll each piece out until it is 4-5 inches in diameter.\n2. Place 2 tablespoons of topping in the middle of the dough.\n3. Pull the side up, twisting to seal the dough.. 1. Place the buns, seam down on small wax paper squares in the preheated rice cooker.\n2. Steam for 15-20 min. Add water to rice cooker as needed. These buns are easy to pack for lunches. They freeze and reheat in the micowave beautifully!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_63_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_63_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_63_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_63_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. \n\tThe cookie ingredients:\n\t\t2 sticks of unsalted\u00a0butter\n\t\t1 cup sugar\n\t\t1 cup brown sugar\n\t\t2 eggs\n\t\t1 tsp vanilla extract\n\t\t1 1/2 cups all-purpose flour\n\t\t1 tsp baking soda\n\t\t1 tsp salt\n\t\t1 tsp cinnamon\n\t\t2 cups oatmeal\n\t\t1\u00a0cup\u00a0sweetened shredded coconut\nAnd the additives:\n\n\t\tFritos (I crushed these up)\n\t\tMini Reese's cups\n\t\tMini chocolate chips\n\t\tToasted almond slivers (which I toasted and\u00a0crushed)\n\t\tPeanut butter chips (not in the photo since it was a late addition)\nI had many of these ingredients in my kitchen already. \u00a0So I suggest going through your cabinet, taking any delicious ingredients and adding them into the mix. \u00a0Yes, fritos are a bit outrageous, but I love them. 
\u00a0If that is too much, try pretzels or potato chips because crunchy and salty are a necessity folks.. Preheat the oven to 325\u00b0F. \u00a0\nNow for the pan I only had a 13\" x 9\" available so I chose to use it. \u00a0Unfortunately this made the bars a bit thick and somewhat overwhelming. \u00a0I suggest going following RecipeCarr's recommendation and using a 16\" x 12\" x 1\" pan for thinner bars. \u00a0Whichever pan you go with, make sure to grease it well.\nCream the butter and sugar. \u00a0Then add the eggs and vanilla to the mixture.. In a separate bowl whisk together flour, baking soda, salt, and cinnamon.. With the mixer on low, gradually add the dry to the wet ingredients. Then stir in the oatmeal and coconut. Make sure they are well mixed.\nAt this point I mixed in the fritos, Reese's, chocolate chips, almonds, and peanut butter chips until just combined. To make this less daunting, I added one at a time, but try not to overmix.. Press the dough into your pan. \u00a0Try to spread it evenly.\nBake for 40-45 minutes in a 13\" x 9\" pan. If you use a 16\" x 12\" pan then the time is reduced to 20-25 minutes. Make sure to check up on them and use your discretion. I waited until everything was evenly browned and the middle was no longer jiggly.\nAllow the bars to cool before cutting them into squares. \u00a0My pan yielded 24 bars which were a bit too large for some of the guests...\nEnjoy and remember to share! 
(Especially because eating more than one will increase your cholesterol drastically.)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_64_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_64_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_64_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_64_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 24 \u2013 28 - 1 quart/litre size Mason jars. Any brand will work nicely.\nLids and rings - make sure you buy/have the right size for your jars\nLarge bowls (for the peeled tomatoes)\nLarge pots for boiling water to scald the tomatoes\nSmaller pot of boiling water to simmer the seals & rings in (I like to rings to be warm when placed on the seals)\nA paring knife and a chef's knife\nMeasuring spoons\nSalt (I use sea salt - but any good table salt will work)\nClean towels\nJar lifter for grabbing those hot jars\nCanning funnel for filling the jars\nMagnetic lid lifter \u2013 for taking the hot seals and rings out of the water\nCanning Jar Wrench \u2013 I use this to empty the hot water out of the waiting jars (saves your fingers)\nPressure Canner - the one I use is a Mirro 22 Quart \u00a0weighted Gauge. 
Wash jars \u00a0in hot soapy water, rinse well.\nThe tomatoes will be cold pack, raw, so it is not necessary to keep the jars hot.\nI keep the jars warm,\u00a0so they don't crack when you add the water to the canner.\nFilling them 1/3 full with boiling water will keep them warm.\nNote:\u00a0 while you are washing\u00a0jars,\u00a0 put two larges pots of water on the stove to boil - these\u00a0will be used to scald the tomatoes.\n\u00a0. \u00a0Wash the tomatoes and set aside.\nTransfer the tomatoes in small batches into the boiling water for 30 to 60 seconds - until skin shows a crack.\nOnce the skin shows a crack, remove from boiling water and plunge into cold water.\nI use my kitchen sink filled with cold water, that way when the water gets warm from the tomatoes - I can release some of the warm and add more cold.\nHint:\u00a0 too many tomatoes in the pot cools the water right down - works best with 6 to 8 tomatoes in the pot.. The skin, should pull right off with a paring knife.\nAfter all the tomatoes are skinned it is time to chop and fill the jars.\u00a0 To keep the rim clean I insert the canning funnel into the jar before I fill them.\nWhile you are chopping warm the seals in rings in a pot of boiled water.\nAdd 1/2 cup white vinegar to the canner (keeps hard water stains off the jars and the inside of the canner), and 2 to 3 inches of hot water before adding the jars.\nUser the jar wrench to pick up the jar and empty the water, just before filling the jar.\nAdd 1/2 to 1tsp of salt to the jar - depending on your taste.\u00a0 I only use 1/2 tsp - just enough to flavour the tomatoes.\nRemove the top off\u00a0 the tomatoes chop into quarters and fill the jar.\u00a0\nUsing a wooden or plastic spoon press down on the tomatoes in the jar.\u00a0 You want to get as many tomatoes in the jar as possible.\u00a0\nI found that it to 8 to 10 tomatoes for each quart jar.\nOnce jar is full place a hot seal and ring on the jar, tighten down and place in pressure canner.\nThe 
canner will hold 7 - 1 quart jars on.\nOnce all the jars are in the canner the water should be 2 to 3 inches from the bottom of the jar.\u00a0 As you can see from the picture\u00a0 the jars raise the level of the water when they are added.\nIf the water is too high remove water with a measuring cup.. \nPut the lid on the canner\u00a0 secure tight, weighted petcock should be\u00a010 lbs pressures.\n0-1000 ft sealevel - 10lbs pressure for 15 minutes\nabove 1000 ft\u00a0 seal level - 15lbs pressure for 15 minutes\nVideo\u00a0shows 15lbs pressure.\nPicture shows 10 lbs pressure.\nTurn the burner on high, heat until the petcock is furiously dancing,\u00a0then turn the heat down to medium-high - the petcock should be doing a gentle dance at this time.\u00a0\nTime for 15 minutes.\nAfter 15 minutes remove the canner from the burner.\u00a0\nThe canner will take 20 to 30 minutes to cool and release all the pressure.\u00a0 You will know when it is safe to open the canner when\u00a0 you don't hear escaping pressure anymore.\nOpen the canner, remove the jars with the jar lifter, dry with a towel and place on a towel to cool.\u00a0\nIf you have more jars to process you can put them in the canner, make sure the water level is correct, seal the canner, turn on the heat and start again.. 
\nHere they are - finished and ready to store in a cool place, and enjoy until next summer!!!\nCanning the tomatoes this way gives you more versatility \u2013 you can throw them on pasta with olive oil and spices, make pasta sauce out of them, add them to stews and soups or any recipe you can dream up.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_65_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_65_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_65_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_65_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. For this cake I used a 4 inch 375ml spring form pan to make my cake, so it is pretty small because that is the way that I like it. However, you may also use a more common 9 inch 2250ml spring form pan by just adding more crust, ice cream and bananas (I worked it out to be about 6.x this recipe). To make this recipe for the 4 inch 375ml spring form pan you will need the following ingredients and materials:1 tablespoon butter1/4 cup Oreo baking crumbs1/2 cup strawberry ice cream1/2 cup vanilla ice cream1/2 cup chocolate ice cream1 banana1 maraschino cherrywhipped cream4 inch 375ml spring form panMeasuring spoonsIce cream scoopMeasuring cups. To make the crust, pour a half cup of Oreo baking crumbs into a small bowl. Combine with 1 tablespoon of melted butter. Now press the mixture firmly into the bottom of your pan and put it into the freezer for 10-15 minutes. . 
To add the ice cream, first scoop a half cup (or enough to fill up a third of the space left in the pan) of chocolate ice cream into the pan, making sure to flatten it out using either the back of the ice cream scoop or a knife. If you would like to have crisper borders between ice cream flavours you can clean the ice cream that has made its way onto the side of the pan with a cloth. Put this into the freezer for about 20 minutes or until the ice cream feels hard to the touch. Now, repeat the steps used in making the chocolate layer with the vanilla and then strawberry layer. Make sure that when you do the strawberry layer that you make it especially smooth because that is what you will see at the top of your cake. Once you are done the strawberry layer, let your cake stay in the freezer for about 2 hours or until completely solid. This way your cake will freeze through completely so you can get the best results! To make any of the layers more smooth just use slightly more melted ice cream and freeze it for a bit longer. After you have finished all the layers, take the cake out of the spring form pan and put it onto a plate. . Peel the banana. Slice both ends off of the banana, and then from there, slice the banana into two pieces long enough that they would be able to fit from the bottom to the top of the ice cream cake (if you sliced the banana in half that would probably work about right, too). Now cut both pieces lengthwise. To put these on the cake, first make teeny notches (each of these represents where one banana will go) in the cake that divide it into quarters. This is so that you are not ruining your entire cake once you realize that you have done a lopsided job of putting the bananas on. When sticking the bananas to the cake have them facing outside out and inside in. Also, make sure that you are not moving them around too much as you put them on to avoid smudging the cake. 
At this point if you are not going to serve the cake right away you should put it in the freezer until you are ready to decorate and serve it.. I think that this is the most fun of all the steps. First, you put one small squirt of whipped cream onto the top of each of the bananas. Now dust a small amount of leftover Oreo baking crumbs fairly centrally on the cake. Now to put a cherry on top, we will literally put a cherry on top. First squirt a dollop of whipped cream into the center of the cake and then put your maraschino cherry on top of that.And there you have it: A Banana Split Ice Cream Cake! Thank you so much for taking the time to read through my intractable I really hope you enjoyed it and will enjoy eating your cake! :)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_66_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_66_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_66_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_66_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 
\n Here is what you need for this tasty pie!Ingredients:\n\n\t\tFlour Tortillas (I cut up the small to make the mini pies, and use a whole large one for the pie)\n\t\tVanilla Ice Cream\n\t\tCorn Flakes\n\t\tCinnamon Sugar\n\t\tButter\n\t\tHoney (So important, but I forgot to put it in the picture!)Special\u00a0Supplies:\n\n\t\tCupcake Pan\n\t\tPie Pan\n\t\tPastry Brush (if you have it, if not, I found a spatula\u00a0worked fine)\n\t\tRamekin\u00a0(or something of similar size for cutting out mini pies)\n\t\tCutting Board and Knife\n\t\tPlates\n\t\tOven and Microwave. \n Time to get the crusts ready. \u00a0I knew what I wanted to do, but not really how to do it. \u00a0Luckily, I found a nice tutorial online on how to bake cinnamon sugar tortillas.Mini Pies:\n\n\t\tTake your small tortilla, cutting board, knife and ramekin.\n\t\tTake your ramekin and lay it on the tortilla against one edge. \u00a0I planned it out so I could get two from each tortilla.\n\t\tCut around the ramekin.\n\t\tRepeat on other side of tortilla, they might not be perfectly round, but that doesn't matter.\n\t\tOnce you have how many you want, melt some butter in a bowl and get your pastry brush and cinnamon sugar.\n\t\tBrush butter on the tortilla and coat in cinnamon sugar. \u00a0Do this on a plage, it get's messy.\n\t\tFlip the\u00a0tortilla\u00a0and coat the other side with butter and cinnamon sugar.\n\t\tPut it in the cupcake pan, be careful not to tear it. \u00a0The sides will fold and ruffle.\n\t\tRepeat for all tortillas till the pan is full or until you have how many you want.Full Pie:\n\n\t\tGet a plate, your tortilla, melted butter, pastry brush and cinnamon sugar.\n\t\tCoat the tortilla, as you did with the minis, with butter and cinnamon sugar on both sides.\n\t\tPlace it in the pan, molding it to its shape.\nYou will be preheating your oven to 350 degrees\u00a0Fahrenheit. \u00a0I cooked both the mini and full for about 13.5 minutes. 
\u00a0I tried a full 15 like the linked tutorial says, but that was too much for mine. \u00a0You want it to start to get hard, but don't want them getting burned or too brown. \u00a0Let these cool. \u00a0If you use them right away, your ice cream will just melt in them. \u00a0You can make them ahead of time and store them in an airtight container until you are ready to use them.\n . Might as well prepare your pie topping while the crusts are cooking. \u00a0Take some Corn Flakes (there is no exact amount) and crunch it up. \u00a0You can leave the corn flakes and cinnamon sugar\u00a0separate\u00a0and add them on top of the pie\u00a0individually\u00a0or you can mix them now. \u00a0If you mix them, make sure when you are putting it on the pie that you get both. \u00a0The cinnamon sugar will want to sink to below the corn flakes.. \n Now it's time to put it all together. \u00a0So simple, isn't it!\n\t\tTake your pie crusts and put some ice cream inside. \u00a0Don't put too much in the full pie, a nice layer of up to an inch should do it. \u00a0\n\t\tFor the big pie, I put ice cream in a bowl first and smushed it up so it would be easier to put in the pie. \u00a0While the crust seems pretty strong, I didn't want to risk smashing it to pieces already. \u00a0I also used a knife to spread it in the pie pan after I put it in.\n\t\tSprinkle your pie topping on top of the ice cream.\n\t\tLastly, drizzle on some honey, it makes all the difference! \u00a0(It's kind of hard to see the honey in the pictures, but it's there! \u00a0Don't put too much now, a little goes a long way. \u00a0Best way to handle this, is put some on one of the minis and eat it and decide if you need more or less.). Eat your pie!\u00a0\nI originally wanted to do just a full pie, but didn't think it would work. \u00a0I thought that the crust (tortilla) would just crumble if you tried to cut pie slices. \u00a0Scoochmaroo suggested making minis and I thought that would be perfect!\u00a0 I love bite sized. 
\u00a0\nI decided to do a full sized pie at the same time, because I figured if it didn't work, I could still eat it! \u00a0In the end, it worked fine. \u00a0It cut pretty nicely, staying in tact.\u00a0\nYou can store the pies in the freezer and they seem to keep okay, but the sooner you can eat it, the better!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_67_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_67_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_67_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_67_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. After washing them, place them on the cutting board.. Make sure they're facing the same way.. Roll all the way from bottom to top\u00a0. Hold the roll down and get ready to cut.\nWatch your fingers.. Make sure to cut all the way through.. Now you have fancy garnish for your food !\u00a0. 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_68_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_68_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_68_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_68_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Place almonds on a baking sheet and bake at 375F.\nCheck after 4-5 min, then every few minutes after that.\nShake the pan once in a while for even browning\nPull them out when you see just a hint of toasty brown.\nThe almonds will continue to bake as they cool.\nCool your almonds.\nIf you want to store them before making your hearts, you can protect them from humidity in a sealed jar.. Select nicely matched pairs of almonds, and lay them out in pairs\nUsing a mandolin or use a grater, shave or grate your almonds to make a half-heart shape.\nTrim to suit. .. Use a double boiler, or devise one.\nFit a bowl over a pan of simmering water,\nAdd your favorite dark chocolate.\nCover it to keep steam at bay.\nWait.\nCheck (stir- {er, taste}) every few minutes.\nWhen you have a smooth, easily poured substance, kill the burner, but let the chocolate sit in the hot water.. In this exercise, I just took a spatula out of my melted chocolate.\nWiping a bit of chocolate on an almond, I assembled the two pieces, using chocolate as a cement,\nWait for these to set up, before continuing on to the next step.. 
\nThis is pretty self explanatory.\nLet the excess chocolate drip off, before placing these on waxed paper.\nChopsticks work well as an implement. . I sprinkled my mandolin shavings onto these hearts, while the chocolate was still melty.\nCareful who you give your heart to.\nI heard a sad story of a lady who ribbon-wrapped a tin of these for her hopeful-who snarfed them down like so many candy bars on Halloween.\nI passed mine out at work, in the tissue-lined paper clip box you see here.\nNo one noticed the box.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_69_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_69_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_69_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_69_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Cake Ingredients: 1 cup butter-softened 1 1/2\u00a0cups sugar 4 large eggs 3 cups all-purpose white flour 3 tsp. baking powder 1 tsp. salt 1 cup milk 2 tsp. vanilla extract Paper baking cupcake liners Vegetable cooking sprayYield: 24 cupcakes or 2 dozenDirections: 1. Preheat oven to 350F. Prepare cupcake tins with liners, set aside. 2. Beat butter and sugar at medium speed with an electric mixer until creamy and smooth. Then add eggs, 1 at a time, mixing until well blended after each egg. 3. Combine flour, baking powder, and salt together in a small bowl. Add to butter mixture alternating with milk. You should begin and end with the flour mixture. Mix at low speed until bleneded. 
Add in vanilla extract. 4. Spoon cake mix into cups, filling 2/3 full. 5. Bake at 350 for 12-15 minutes or until toothpick inserted comes out clean. 6. Cool in pans on wire rack for 10 minutes, remove cupcakes from pans and set on wire racks to completely cool.. Filling Ingredients: 1 8oz cream cheese-softened 1/3 cup powdered sugar 2-3 Tbsp. coffee liqueur(Kahlua) or 1 Tbsp. coffee extract 1/8 tsp. saltYield 2 CupsDirections: 1. Combine all ingredients in a medium bowl, mixing until well blended. Store any remaining filling in container in refrigerator-up to 2 weeks.. Once cupcakes are completely cooled, cut tops off of the cupcakes using a serrated knife. Then spread 1 Tbsp. of Tiramisu Filling on the bottom part of the cupcake, gently place the top back on. . Frosting Ingredients: 1/2 cup butter-softened 1 8oz cream cheese-softened 2 16oz packages powdered sugar 1/4 tsp. saltYield 5 cupsDirections: 1. Beat butter and cream cheese at medium speed until creamy and smooth. 2.\u00a0Gradually add in the powdered sugar and salt, mixing at low speed. Beat at high speed for 2 minutes until creamy and thick. 3. Frost each cupcake by using a spatula, knife or piping bag and tip. . For the finishing touch dust/sprinkle with Hersheys Cocoa Powder. . After all your hard work, you can now enjoy your Tiramisu Cupcakes! 
Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_70_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_70_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_70_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_70_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Unsalted butter is recommended, but not required. For availability's sake I used salted butter the last time I made it. From what Baba said, you could also use fresh cream. . The ghee is essentially heated butter. You'll want to melt the entire stick at high heat. When the whole stick is melted, reduce to a low heat. Remember to keep the pot uncovered while heating. One of the ideas is to remove moisture from the butter. You should see steam and bubbles escaping throughout the boil.. It takes about 10-15 minutes for the entire process. You'll know you're done when at low heat bubbles are slower and steam can't be seen. At this point there is little or no moisture left. \nThe next step will be to separate the ghee and the remaining milk solids. I used 2 bowls and a fine mesh strainer for this. Repeat the process of moving ghee from one bowl to the other until solids can't be seen. There will be solids left if you're using salted butter. Use oven mitts as ghee gets very hot. . The most confusing part of ghee is the colour. It's usually an olive oil colour. The first image is the hot ghee from this tutorial. The second is the ghee I made a year ago. 
The third is the first image several hours later. Ghee has a semisolid texture. \nThe cool thing is that by removing moisture and lactose sugars, bacteria have little to feed on. This gives ghee an unlimited shelf life. Refrigeration is not necessary. \nWhen finished, this oil can be used in many recipes. I would not recommend using a lot (1 Tbs is loads) since saturated fat and cholesterol are present. That means baking recipes may not be ideal. \nThis is my entry in the fried food contest. I thought that showing the creation of one of the ingredients would be clever. If you found this useful for your entries, please like or comment.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_71_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_71_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_71_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_71_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Makes 5 to 6, 250 ml jars.4 cups or 1 liter unsweetened apple sauce.\u00bd teaspoon vanilla extract.1 sachet of pectin powder.1 teaspoons of ground cinnamon.\u00bc tsp ground cloves.Wild apples are tart so adding a sweetener is necessary; you can substitute honey, maple syrup, or corn syrup for Splenda, just match the recipe cup for cup. If you do not worry about sugar intake just use 2 cups sugar. You can go sugar free with 2 cups Splenda or some other artificial sweetener, or 1 cup sugar and 1 cup Splenda for low sugar. 
I used No Cook Pectin, if you do not process the jam you will need to refrigerate it, refrigerated the jam will keep six months. Processing allows you to store the jam at room temperature in a dark place and it doesn\u2019t affect the pectin negatively.. I make 2 liters of apple sauce at a time. Dice 5 liters of apples and place them in an 8 liter pot or large saucepan, add 1 liter water and bring to a boil and reduce heat. Simmer and stir until apples are soft, approximately 30 minutes to an hour depending on the size you diced the apples.Using a large sieve mash the soft diced apples removing all the unwanted parts, you should have about 2 liters apple sauce.. Clean and sterilize and prepare 6 bottles.Measure 4 cups or 1 liter apple sauce and place the apple sauce into a clean saucepan and add the sweetener, cinnamon, cloves, and vanilla extract.Bring to a boil stirring often after 5 minutes reduce heat and simmer, still stirring, until puree thickens and holds its shape on a spoon.Add the no cook pectin and stir it in.. Fill the jars with jam to within \u00bc of aninch (5 mm) of the lip of the Jar.Remove air bubbles and add more jam if necessaryPlace the seals on the jars and screw on the rings loosely. Place in a pot of water covering the lids by one inch (25 mm) cover and bring to a boil for 10 to 15 minutes.. 
Once the Jam has processed for 10 to 15 minutes remove from the heat and let stand for 5 minutes.Remove the jars of jam from the processing pot wipe off the outsides and tighten the lids.Place the jars of jam on a rack to cool.Once cool label and place the jars in a dark place for storage.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_72_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_72_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_72_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_72_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. For the zombie eyeball martini: Gin (6 parts) Vermouth (1 part) Radishes Pimento-stuffed olives For the bloody eyeball cocktail: Vodka (2 parts) Triple sec (or any orange liquer, e.g. Cointreau, Grand Marnier; 1 part) Lime juice (1 part) Cranberry juice (2 parts) Lychees, canned and peeled in a can Cranberries (fresh or dried)Dry ice is optional for both, but makes for a great creepy effect - and of course very effectively chills the drink.. Wash a radish, taking care to retain the root (the \"optic nerve\" of the eyeball). Trim the crown off so as to leave an exposed white area the same diameter as your olives. Using a small sharp knife, carve out an olive-sized hole. Partially peel the radish, going for a venous and broken-capillary look. Pop an olive in the hole, pimento-stuffing poking out, and use the radish to garnish a martini (6:1 gin:vermouth). 
If you have no dry ice, shake the ingredients over ice in a cocktail mixer, and add to a chilled martini glasss. If you DO have dry ice, simply pop a chunk in the drink and serve (caution the recipient not to imbibe until the drink has stopped smoking). The original (?) recipe calls for making the eyeballs the day before, chopping off the root and freezing them into ice cubes overnight, but I imagine a frozen eyeball is harder to snack on. I recommend just making them fresh - their structural integrity is good, and it's easy to pull the eyeball out of the glass by the optic nerve and munch noisily on it.. This drink is a kamikaze and cranberry juice garnished with a fuming, rotting eyeball.\nStuff the cavity of a peeled lychee (from a can) with cranberries, as many as it takes to have them slightly protruding. Pop in a martini glass,and add triple sec (or any orange liqueur - we used Grand Marnier), then lime juice, and vodka (1:1:2) and top up with cranberry juice. Add a chunk of dry ice and serve (again, make sure the drinker knows to wait until the smoking stops before consuming). If you have no dry ice, mix the ingredients (but not the eyeball!) in a cocktail shaker with ice, shake then strain into the glass over the eyeball.\u00a0. Your guests should be encouraged to eat the eyeballs after knocking back the cocktails - both types are tasty and not just for decoration. 
The orange liqueur-soaked lychee+cranberry decomposing bloody eyeball is delicious, and the stuffed radish zombie eyeball is positively healthy.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_73_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_73_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_73_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_73_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Ingredients:Beef, top round or any other cheaper non-fatty type works best 1 cup Soy sauce 1 tbsp Molasses 2 tsp Liquid smoke 2 tsp Black pepper 2 tsp Garlic powder 2 tsp Onion powder 2 tsp Red pepper flakes 1 tsp Ghost pepper salt 1 tsp Cayenne pepperThis recipe makes enough marinade for about 2.5 pounds or a little over 1 kg.Equipment:A food dehydrator like this one from NescoA sharp knife A large glass or ceramic bowl A cutting board Paper towels. Start the process the day before you want your finished jerky. Throw your beef in the freezer for a couple hours or, if frozen, remove from the freezer for about an hour (this will all depend on how much you have). Since thin slices of beef are ideal for jerky, having the beef partially frozen makes it easier to cut consistently thin pieces.Once the beef is thawed on the outside but still slightly frozen on the inside, put it on a well-washed cutting board and pat it dry with a paper towel. Trim as much of the fat off as possible then slice the beef into \u215b\" to \u00bc\" (3-6mm) slices. 
Cutting with the grain with a really sharp (not serrated) knife works best. Here I'm using a top round steak, you may use any cut of meat you like but remember that meat with a high fat content will become rancid faster, which makes this company's filet mignon jerky practical yet decadent!. In this instructable I'm using a marinade (wet method) to flavor the jerky. There are other methods you can chose, such as a dry rub, however I enjoy the flavor the marinade brings to the beef.Wash your hands and bowl well then start by adding all of your ingredients (minus the beef) in your large bowl. Separate the beef slices well, since they tend to re-freeze together when in a pile, and add the beef to the bowl a few slices at a time followed by mixing by hand. Ensure all of your beef is coated well.If you have more meat than marinade, simply prepare another bowl with marinade and repeat the steps above. It's easier to work in smaller batches than a large unmanageable pile that might risk an uneven marination of the beef.Cover and put the bowl in the refrigerator overnight or for at least 12 hours. For best results, mix the contents once or twice during this period.. The next day (anywhere from 12-24 hours later) remove the bowl from the refrigerator and wash and dry your dehydrator racks as the manufacturer recommends. If you do not have a dehydrator, wash the metal grates of your oven well and line the bottom of the oven with foil.Remove the strips of beef from the marinade and arrange on the racks in one layer without overlapping, allowing for a little bit of air flow around each piece. When removing the strips of beef from the marinade, allow them to drip-dry, you want some marinade to coat the beef strip but not too much. Assemble your dehydrator and set at 160\u00b0F (~70\u00b0C).Revisit your dehydrator every hour to check the progress and to dab away any fat that is collecting on the top of your strips. 
With my dehydrator, the process took about 5 hours, this will vary depending upon how thick your strips are and the model of your dehydrator.If you do not have a dehydrator, this can be done in your oven by setting it as close to 160\u00b0F as possible and laying the beefs strips across the oven's metal grates. Prop the door of the oven open slightly with a wooden spoon to allow for the warm, moist air to circulate out. Please be aware that gas ovens pose the risk of carbon monoxide/dioxide poisoning when propped open, so if you go this route make sure you have plenty of ventilation.. Your jerky is ready when you are able to tear the strips along the grain, they should be pliable but not soft and fairly stiff but not brittle. At this point, turn your dehydrator off and store your jerky in a clean and dry container lined with a paper towel and a loose fitting lid. Jerky is shelf stable for about 2 weeks at room temperature and one month in the refrigerator.Congratulations, you have now made some super simple, spicy and delicious jerky at home! I encourage you to try tweaking the recipe to your liking. Substitute in dried peppers, hot sauce, smoked salts, different herbs... the combinations are endless. 
Just remember to keep any added fats to an absolute minimum and if you decide to use anything but beef, cook the meat to the USDA recommended internal temperatures first before dehydrating (including game meats).\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_74_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_74_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_74_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_74_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. First, we need ingredients. Duh. A cake without ingredients is just, well, air. For the cake, you'll need...2 Tbsp Butter 2 Lg Eggs 1 \u00bc Cup Granulated Sugar 1 \u00bc Cup Flour \u00bd Cup Cocoa Powder 1 Tsp Baking Soda \u00bd Tsp Baking Powder \u00bd Tsp Salt \u00bc Cup Heavy Cream \u00bc Cup Whole Milk \u00bc Cup Water 1/8 Cup Vegetable Oil 1 Tsp Vanilla ExtractFor the buttercream, you'll need...1 Bar Butter 1 \u00be Cup Powdered Sugar \u00bc Cup Cocoa Powder 1 Tsp Vanilla Extract 1 Tbsp Heavy CreamFor the ganache, you'll need...125g Chocolate Chips \u00bd Cup Heavy CreamAnd to top your cupcakes, you can use chocolate sprinkles, Oreos, chocolate chips, chocolate chunks, cocoa powder dust, or anything else chocolate.. First, pour all your dry ingredients into a bowl. The dry ingredients are...Flour Cocoa Powder Baking Soda Baking Powder SaltMix it all together with a spoon, fork, or whisk. . 
The wet ingredients include...Heavy Cream Whole Milk Water Vegetable Oil Vanilla ExtractAnd whisk with a whisk (keep in mind that the oil will separate, so it doesn't have to be perfectly blended). . Beat the 2 Tbsp of butter in a mixer (or use a whisk) until smooth and creamy. Then slowly add in the granulated sugar until the mixture becomes fluffy and crumbly. Next, add in your two large eggs, one at a time. Be sure the first egg has mixed in completely before adding in the second.. Now, with the mixer on low speed, alternate adding in the dry ingredients and wet ingredients, ending with the dry.. This is super simple. Just line a cupcake/muffin tin with cupcake liners, and scoop two big tablespoons of the batter into each liner. This should come up to about 2/3 to 3/4 of the way up the cupcake liner. Give the tin a good shake to even out the tops of the tins. . If you haven't already, preheat your oven to 350 degrees F, and bake for 18 minutes. . Cream your butter but mixing it on medium speed until light and fluffy. Then sift in half of the powdered sugar along with all of the cocoa powder, and continue mixing, now on low to medium low speed. Once all the dry stuff has been well incorporated into the butter, sift in the second half of sugar and continue mixing until well incorporated. Add in the vanilla extract and mix until well incorporated. Then finally, if necessary, add in up to one tablespoon of heavy cream to lighten up your frosting. Mix until combined. . Heat the heavy cream until a low simmer, then pour over the chocolate chips and stir until completely melted (feel free to use a microwave if your chocolate still is not completely melted. Heat 15 seconds at a time).. Put your buttercream into a piping bag fitted with a large star tip. Start at the center of your cupcake, and pipe in an outward spiral. As you approach the edge of the cupcake, decrease the pressure to create a tapered end. 
Then, using a bottle with a squeeze tip or a piping bag, drizzle some ganache over the top, and sprinkle with some of your toppings. . And that's it! Surprising how easy that was, right? Now take that first bite. Mmmmmmm. So good. If you enjoyed this recipe, please check me out on other social media sites, including facebook, instagram, tumblr, twitter, and, of course, YouTube. You can find me as \"joshpancooking.\" Thanks for all your support, love you all, and goodbye!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_75_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_75_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_75_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_75_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Ingredients you will need:\n1\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Package of Oreos\n1/3\u00a0\u00a0\u00a0\u00a0 Cup of Butter or Margarine\n3\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Packages of Cream Cheese (8 oz. each)\n3/4\u00a0\u00a0\u00a0\u00a0 Cup of Sugar\n1\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Tsp of Vanilla\n1\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Cup of Sour Cream\n4\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Eggs\nTools and Equipment needed:\n9 in spring form pan\nFood Processor\nMixing bowls\nElectric Mixer (Optional)\nMixing spoon\nDry measuring cups\nKnife. 1. 
Preheat the oven to 350 degrees Fahrenheit\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 If your pan is a dark color preheat oven to 325 degrees instead2. Finely crush 28 cookies in a Food Processor\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Tip:Add only a few cookies in at a time. I find this the fastest and easiest way to crush the cookies. Also if you dont own a food\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 processor you can crush the cookies up in a large Ziploc bag with a mallet or any tool you can find. Just make sure it is a\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 fine powder before you mix it with the melted butter3. Melt the butter or margarine\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a04. Mix the crushed cookies and the melted butter\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Tip:\u00a0 Make sure all of the mixture is moist and that it is mixed well with no dry cookie powder\n\u00a0\u00a0\u00a05. Press the mixture onto the bottom of your spring form pan and 2 inches up the side of the pan\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Tip: I find it the easiest to press the mixture down with a 1/4 cup measuring cup. Also adding the mixture slowly at a scoop at a\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 time, making sure you are pressing the mixture down firmly before adding more of the mixture.. Tip: Try not to mix the batter to heavily since the more you mix the more air you add into the batter, which can lead to the cheesecake cracking when it is baked.Tip: Do not add all of the batter ingredients at once. You want to add the wet ingredients slowly so the risk of cracking your desert is as minimal as possible.Tip: Mixing the batter by hand with a wooden spoon instead of using an electric mixer your cheesecake is less likely to crack when it is baked. 
If you choose to mix the batter with an electric mixer, mix on the lowest speed possible.1. Warm the cream cheese to room temperature\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Warming the cream cheese should only take about 30 minutes.2. Chop the rest of the Oreo cookies into small pieces\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 WARNING: Using a sharp knife can lead to physical injury always be careful when handling sharp tools.\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Tip: I find it easiest to break the cookies up with my hands.3. Beat cream cheese and sugar in a large mixing bowl\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Make sure you are not mixing it to much. Your mixture does not have to be completely mixed before going on to the next step.\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 You will have time to make sure your mixture is smooth after all the ingredients are together.4. Add sour cream and vanilla to the mixture5. Add eggs\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Only add one egg at a time. Make sure that each egg is blended in thoroughly before adding the next egg.6. Mix in the chopped cookies\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Try as hard as you can to make sure that there as little chunks in your batter before you pour your batter into the crust.7. Pour the batter into the crust\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 . 1. Place the cheesecake in the oven\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Set the cheesecake in the center of the oven. If your cheesecake is not in the center of the oven you may burn the top of your\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 desert.2. Bake for 55 min to 1 hour\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Remove the cheesecake from the oven when you see that the center of the cheesecake is just about set.3. 
Run a knife lightly around the rim of the cheesecake\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 You want to make sure that the crust is not attached to the side of the pan while cooling. You only need to run the knife down\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 a few centimeters of the crust. There is no need to run the knife all the way down the crust. If you do this you may run into the\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 possibility of breaking your crust.4. Let cool at room temperature for 1 hour5. Remove Spring Form Pan's rim6. Refrigerate for 4 hours\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Your desert is not ready to eat until the cake has completely set. You can test this by slicing through the cake and if it pass\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 through with very little resistance like it is still has moisture in it refrigerate until it is firm. . Tip: If your cheesecake has large cracks in it you can smooth out the cracks by doing three easy steps.\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1) Wet a rubber spatula\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 2) Run the spatula over the cracks to smooth out the cracks\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 3) Let dry in refrigerator\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\nNow your delicious desert is complete and you can enjoy your wonderful creation. You can add your own decorations to your cheesecake that you would like. I find drizzling chocolate syrup over the cake adds style and a great chocolate accent to your desert. 
Now show all your friends at your next party how great of a cheesecake you can bake!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_76_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_76_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_76_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_76_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 1- INGREDIENTS:a) 1 box of JELL-O powder\nb) 1 cup of boiling water\nc) 1 cup of ice cold alcohol (the colder, the better)\n Note: For the demonstration, I used plain 40% alcohol, but it works just as fine with vodka or rum.2- TOOLS:a) A big bowl\nb) Any kind of container you want to serve your shots in. (I recommend plastic shooters or plactic cups).\nc) A measuring cup that can measure at least 1 cup.. 1- Empty the Jell-O pouch into the big bowl.2- Add 1 cup of boiling water.3- Stir vigorously until the powder is completely dissolved.Note: Step 3 is crucial if you want to avoid the awful skin that you find sometimes at the bottom of your JELL-O... I heard stories about people that actually LIKE this skin.... I think it's gross.... we used to call it \"JELL-O placenta\"4- Add 1 cup of ice cold alcohol and mix well again.Note: The colder your alcohol is, the better the result. I think this has to do with the activation of the gelatin. Personally, when possible, I keep the bottle of alcohol in the freezer for several days before making my shots. Don't believe the skeptics: the bottle won't blow up!!. 
Once the Jell-O shot mix is ready, pour it in the containers you want to serve the shots in. Sometimes I just leave it in the bowl and then I eat it all at once... I guess you can call this a JELL-O shotgun!!!Put it all in the refrigerator for a couple of hours. Here you will have to check once in a while if the jelly has set because the exact time needed depens on the containers you use and the \"weather conditions\" inside your fridge.. It is now time to amaze your guests whit this very special treats!!!!To get the maximum effect out of your shots, I suggest you throw it as a chunk in the back of your throat. The chance of suffocation is non-existant and the less time it spends near your tongue, the more you can trunk in without being annoyed by the alcohol's taste.\nSo chug it down and feel the party heat come up from your belly!!!!!As you may or may not know, a generous amount of alcohol ingested in a short amout of time can be bad for you. In some instances, it can even kill you.Therefore, if you want to have fun with JELL-O shots for years to come, please use it responsibly.Thank you and have great parties,BILOU\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_77_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_77_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_77_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_77_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 
You need a Non-ripen Guava fruit , choose it well.A knife to chop it.Chilli Powder for adding spiciness.Salt if you are looking for a better taste.. Take the Guava fruit into your hand can chop off its head as it wont be good.Now slice it vertically from the top till the bottom.Perpendicular to the sliced line again slice it in the same pattern mentioned above .. Now take all the slices separately , you will have four slices.Take a slice of Guava and cut it into pieces as shown in the images.Repeat the same process for the rest of the slices.You will have this at the end of the process.. As you can see some nice slices of Guava don't be in any hurry and eat them ,there are still few things to be added to them.Its time to add chilli powder.Take chilli powder powder preferably into your figures and gently sprinkle the powder on the slices.It adds a beautiful look to the slices.. Now add a bit of salt to the chilli powdered slices if you don't want them to be very spicy.Add salt in such a way that it should be 1/4 th of the chilli powder that u added lately.Adding more of salt also ruins the taste of the Guava.Now just see how amazing it looks so that your mouth can't resist to have it.What are you waiting for, Have a Slice Baby:)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_78_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_78_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_78_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_78_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", 
"context": "Here is the context:\n. \n You may make this with or without Cream Of Mushroom Soup. If you don't add the Cream Of Mushroom Soup; use 1 can of Tomato Soup with a mixture using the empty can \u00a0( 1/2 water to 1/2 cup milk or powdered milk. ) The powdered milk will mix better. Fresh herbs would be super with this if you have them available.\u00a0\n\t1 Can Tomato Soup\n\t1 Can Cream Of Mushroom Soup\n\t1 Can Water\n\t1 Can milk or powdered milk\n\tPinch of Baking Soda if using regular milk\n\t1/2 Teaspoon Italian seasoning using 1/4 t in the tomato soup before cooking and 1/4t \u00a0in the serving bowls.\u00a0\n\tPepper as desired at time of serving\n\tParmesan cheese as desired at time of serving. \n\t1 sauce pan\n\t1 Large spoon\n\t1 Whisk\n\t1\u00a0Ladle not shown\n\tCan opener\u00a0. \n\tPour the tomato soup into the\u00a0sauce\u00a0pan.\n\tAdd 1 Can of milk. If using regular milk add a pinch of Baking soda to help it mix together. If using powdered milk; it is not necessary.\n\tMix well.\n\tHeat on medium heat.. \n\tAdd 1 can of Cream Of Mushroom Soup.\n\tAdd 1 can water\n\tMix well.. \n\tAdd 1/4 teaspoon Italian Seasonings.\n\tMix well.\n\tCook until hot.. \n\t\u00e2\u0080\u008bPour into serving bowls.\n\tSprinkle the \u00a0 Italian seasoning , Parmesan, and Pepper to each bowl of soup if desired.\u00a0\n\tServe with crackers or . . .\n\tThis soup goes well with a grilled cheese\u00a0sandwiches!\u00a0. \n\tMy girlfriend has taught me a few tricks about adding flavor to dull foods. She always adds a little fresh ingredient to any prepared foods to give it a homemade flavor. It does not taste as wonderful as cooking from scratch but it certainly taste better! 
If you would like a great recipe for a grilled cheese sandwich here is my recipe:\u00a0https://www.instructables.com/id/Grilled-Cheese-And-Bacon-Sandwich/\n\tIn closing I would like to thank our instructables company, sponsors, authors, readers, and members; for making this community a great success! Many hours and hard work has been put into making this place the best DIY on the Internet. Have fun and thanks for stopping by!\n\tSunshiine\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_79_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_79_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_79_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_79_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 1 (20 oz) pkg Oreo cookies4 Tbsp butter or margarine8 oz pkg cream cheese8 oz frozen whipped topping1 cup powdered sugar2 pkg instant vanilla pudding3 1/2 cups milk. I break the cookies up by hand, one at a time. It doesn't take very long. I divide the cookies in half and break half of them into the bottom of a 9x13 pan and the other half into the cookie container. (They will go on top at the end.). Cream together cream cheese, margarine or butter and powdered sugar.I use my stand mixer for this. You'll need to mix another part of the filling separately, so you can choose to use two separate bowls (and maybe a hand mixer for this one). I chose to scrape out the mixture with a spatula into a separate bowl. 
The spatula works well, and it doesn't matter if the bowl isn't completely clean because it all eventually gets mixed together.. In a separate bowl (or in empty stand mixer bowl), combine vanilla pudding and milk. I pour the milk into the bowl, and slowly pour in pudding mix as it is mixing.. Stop the mixer, dump in the whipped topping and blend well.. Add the cream cheese mixture to the filling and mix until well blended and smooth. (From my experience, some small lumps don't affect the end taste or texture. Larger lumps can sometimes affect texture. Just keep mixing!). Pour filling on top of bottom layer of crushed cookies. Sprinkle the remaining half of crushed cookies on top. (It will look like the cover picture.) Refrigerate for several hours before serving.Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_80_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_80_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_80_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_80_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Prepare both Cake Mixes according to instructions.\nMeasure 2 c Red Velvet Cake Mix\nMeasure two 1 cup portions of White Cake Mix\nIn one of the White Cake Mix portions add food coloring to create a pink. I used the Neon Food Coloring Set and added 16 drops of Pink and 2 drops of Blue to come up with a Dusty Rose shade.. First pour in the two cups of Red Velvet Cake Mix, spoon the White Cake Mix over it. 
Don\u2019t worry about covering the entire area.\nLast add the Pink batter and use the back of a spoon to gently spread it over the surface. Bake according to Cake Mix Directions.. If you are using Fondant to cover your cake, you will need a 20 inch rolling pin with rings to make your fondant smooth have a uniform thickness all over.\nLay Fondant on a large cutting board.\nPlace measuring rings on the end of your rolling pin and start by working the Fondant from the center outward in both directions.\nAs Fondant thins out use a pizza cutter to trim away uneven ends while you create your 'sheet.'.\nNote: The beauty of Fondant is that it provides a super smooth surface on your cake to work on and you don't get all sticky or mess up your icing.. When ready to place Fondant over your cake roll the Fondant onto the rolling pin part way,\nLift it gently so as not to stretch your Fondant and have someone slide your cake underneath onto the cutting board and gently drape the fondant back over your cake.\nNote: It is preferable to spread a thin layer of butter cream frosting over the cake to help the Fondant stick to the cake, however the Fondant recipe I use has marshmallow in it and it shapes very well. Also the marshmallow gives your Fondant a pure white color.\nStarting from the center outward smooth the Fondant over the edge of the cake and shape it to the cakes outer edge down to the cutting board.\nUse a pizza cutter to trim away most of the excess Fondant to make it more workable and reduce weight as you lift and adjust. Leave about 1 inch excess until you have your entire cake covered.\nWhen you reach the top of the heart make a straight cut to the inverted point for easier fitting.. Once you are satisfied with the fit of your Fondant use the pizza cutter to trim away all excess.\nPrepare a cake board by laying a Grease Proof Doily designed for food use and cut around the outside edges.\nCenter Fondant Covered Cake on Doily and Cake Board.. 
For this Cake I used a Gel Writing Food Pen to write 'Be Mine' (always start in the center and work your way outward). \nI used a Red Icing Tube with a writing tip to create the flourishes then dabbed a small amount on Licorice Whips to hold them in place as the outline and put stripes down the sides. \nThen I switched to a star tip and put small star shapes at the top and bottom of each licorice stripe to better secure it in place. \nFinally I laid two foil wrapped chocolate cream hearts on top with a dab of icing to keep them secure.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_81_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_81_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_81_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_81_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Here is what I used to make my marinated frozen grapesWhite grapesSugarWhite WineA pitcher . I started by mixing the wine and sugar into a pitcherAdd 1/8 cup of sugar and half bottle of wine. This made 6 servings for me.After adding the ingredients I stirred the sugar and wine together until the sugar was completely dissolved in the wineOnce the mixture was complete I dumped about 1 1/4 cup of grapes into the pitcher. Enough grapes to completely submerge into the wine without being too cluttered together.. Next put your grape pitcher in the fridge to let the grapes marinate in the wine and sugar. Keep the grapes marinated for 12 to 24 hours.. 
Once the grapes are fully marinated I pulled the marinated grapes out and strained the grapes out then patted the grapes dry.I then added a sugar coating to the grapes. I started by putting a small coating of sugar in a bowl then i put a small handful of grapes into the sugar and rolled them around. Then I put the grapes into a plastic container.NOTE: I did not coat all of my grapes with sugar because I wanted some a little tarter.. After all the grapes wanted are coated in sugar I place the container of grapes in the freezer for about 4 hours before serving to your friends and family. You can either have these as a snack which makes a great summer snack. You can also choose to use them as ice cubes for your wine. Especially if you choose not to coat some of them with sugar. Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_82_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_82_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_82_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_82_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. To make this dish for 2 you will need the following:2 fillets of sea bass, your fishmonger can do this for youSalt and pepper1 knob of butter1 yellow bell pepper, thickly sliced and the seeds removedAround 10 cherry or vine tomatoes, roughly sliced in half or left whole1 bunch of asparagus with the tough ends broken off, this will be anywhere between 10 and 15 pieces, depends on the thickness of the asparagus. 
If they are a very thick then give them a little longer or slice them length ways2 Tbsp of spicy pesto, or any pesto you like, simply from a jar100ml dry white wine1 handful of flat leaf parsley, you will only need the leaves which you can just pick offoptional, a little water in case you want to loosen the saucesome mixed leaf to your liking, you could use rocket, watercress, spinach or what ever you like in any combination you like. The vegetables with the exception of the asparagus will be cooked in a griddle pan, you want charing so this needs to be as HOT as possible, put this on full wack.NOTE: do not put oil on the griddle otherwise you will be frantically opening windows and hitting the smoke detector. You wont need oil and it will only burn which will ruin lunch and no one wants that!Put another pan on a medium heat and allow to heat up while seasoning the fish, season both sides with salt and pepper. Salt on the skin will help to give you that infamous crispy skin. You could score the skin if it is a thick fillet which will actually help the heat to permeate allowing the fish to cook quicker. But there really isn't any need.TIP: Never put put your fish into a cold pan, you want it to be up to temperature first for a nice crispy skin.. Put your peppers on the griddle, these are going to char and blacken which is just what we want, they will go soft and sweeten as the natural sugars cook.In the fish pan, put a good knob of butter, let this melt down for a few seconds and move the butter around the pan. Then, gently lay the fish in the pan skin side down - do not touch the fish or the pan now, it can be tempting to mess around with the fish but you want the skin the crisp up and the meat to gently cook.TIP: Don't be tempted to move the pan around and mess with the fish, just let it cook.. 
Keep an eye on your peppers, move them around.After 4 - 5 minutes you will see the edge of the fish at the thinnest points start to turn a light white colour, when this happens it is time to turn the fish. Take a fish slice and very carefully turn the fish over, keep close to the pan so not to splash butter everywhere and keep the delicate fish in one piece. Cook the fish for 2 - 3 minutes more, keep checking it to make sure it doesn't overcook/ burn.Get some foil or a plate ready for when the fish is cooked to put it to one side.Check the fish by gently lifting it with the fish slice and peaking underneath, it should be just brown, remove from the pan and put to one side.TIP: Fish is considered tricky and many people over cook it but if you keep an eye on it then it is really easy, as soon as the fish looses it raw colour and the flakes of meat just start to come away from each other it is ready. Just be patient and as soon as it is done, get it out of the pan.. Now we are coming to the end and the last of the ingredients cook super fast.Turn the peppers again and throw the wine in the fish pan, you want to save all the delicious flavour from the pan so don't wash it first. This is called deglazing the pan.Put the asparagus in the wine and put a lid on top, the asparagus will take around 2 minutes to become tender and steaming them in wine and the fish butter will make them shinny and delicious.At the same time, put your tomatoes on the griddle, they will cook fast because of the sweet sugars and the soft flesh. They will be ready around the same time as the asparagus.. Asparagus really doesn't take very long, as soon as the stems are tender use some tongs and get them out of the pan, put to one side for plating up later.Don't throw the wine away from the fish pan, this is going to be the base for the super simple sauce - the flavours of the fish and asparagus are too good to waste.. 
When it comes to sauces there is nothing more rewarding than making your own from scratch but sometimes you want something quick and easy so there is no shame in using a nifty cheat here and there.For this one the secret is pesto (you could even make your own pesto), here we used a spicy tomato pesto. Add your pesto to the wine in the pan and mix in. You may need to add a splash of water to loosen the sauce. Add the flat leaf parsley at the end and stir in.Take the vegetables off the heat and put in a bowl, set to one side. It is best to get the veg out of the pan, the griddle is a big heavy chunk of metal and will hold the heat for a while, consequently continuing to cook the food in it.TIP: When you are making sauces, a splash of water in many cases can do wonders. If you take a sauce too far or kept it warm a little too long, reduced a little too much then a dash of water can be your saving grace.. Bring your dish together and serve with a glass of white wine, spoon the sauce on and around your perfectly cooked fish.Add a light garnish of green leaf, peppery rocket works a treat here. Enjoy as a great quick lunch, alfresco if you can :) \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_83_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_83_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_83_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_83_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 
\nIngredients for the pie crust:\n2 cups all-purpose flour\n1 cup vegetable oil or butter\n1/2 cup cold water\nIngredients for the filling:\n3/4 cup sugar\n1/4 cup cornstarch (can be substituted with 1/2 cup flour)\n6 cups berries, frozen or fresh. I used 3 cups raspberries, 2 cups blackberries, and 1 cup blueberries, all fresh.\n1/2 or 1 lemon depending on preference\nExtra sugar to taste\nMaterials:\nMeasuring cups\nMix master or stirring spoon/fork\nRolling pin\nWax paper (or extra flour)\n2 large mixing bowls\nPlastic wrap\nPizza cutter (or normal kitchen knife)\nLemon juicer\n9-inch pie dish. \nAdd 2 cups flour in a bowl. Mix in 1 cup of oil in 1/4 cups. You can mix it either by hand with a fork or with a mix master. Mix until the flour and oil are a thick liquid that is almost solid (image 1).\nSlowly mix in 1/2 cup of cold water (it is important the water is cold!). The mixture will solidify into an oily dough (image 2).\nCut the dough into two slightly uneven \"halves\" (you want a bigger \"half\" and a smaller \"half\"). Wrap them in plastic wrap and stick them in the fridge for at least an hour, up to a day (image 3).\nNote: if you are in a time crunch, you can put the dough in the freezer for 30 minutes, but be sure to flip them over after 15 minutes. The side touching the metal grate cools faster.. \nMix 3/4 cups sugar and 1/4 cup cornstarch (or 1/2 cup flour) in a large bowl.\nAdd 3 cups of berries to a different bowl. The blueberries are there, just on the bottom (image 1).\nPour the sugar-cornstarch/flour mix over the berries (image 2).\nCoat the berries evenly using your hand. It is okay if the berries get crushed. It may be helpful to pour all the ingredients back and forth between the two different bowls (image 3).\nWhen thoroughly mixed, juice half a lemon and pour the lemon juice and the pulp into the mixture. Mix the lemon in. Let the berries stand for at least 10 minutes. 
You should at this point have a fruity syrup with a lot of solid fruit in it (image 4).. \nPreheat the oven to 400 degrees Fahrenheit. Use convection bake if your oven has that option.\nPut a sheet of wax paper on the counter (if you don't have wax paper, a floured surface works). Put your larger \"half\" of the dough on the wax paper, and put another piece of wax paper over it. Roll the dough to an approximately 12 inch round (image 1).\nTake off the top piece of wax paper. Oil or butter your pie dish (make it non-stick) and flip it upside down. Put the dish on top of the dough, and flip the whole thing back over. The pie crust should \"fall\" into the dish, with the piece of wax paper on top (image 2).\nPeel the wax paper off (carefully!). Fix any tears that you have. If your pie crust isn't large enough, you can take a some of the 2nd batch of dough and fill up the edges. The pie crust should hang over the dish very slightly (image 3).\nNote: you want your pie crust on the bottom to be relatively thick so it can hold the filling. Fill in gaps with extra dough, don't just pinch tears shut.\nSpoon/pour your filling into the pie crust. Be careful not to poke through the bottom of the crust. Make sure the filling is relatively flat (image 4).. \nRoll out the 2nd dough in the same way as you did the first. There are multiple options of how to top a pie. You can cover the entire thing, make a basket weave, corn rows, etc. I went with a basket weave.\nUse a pizza cutter or a knife to cut the dough into even-thickness strips (image 1).\nStart laying down strips in one of the corners. I chose top left because I'm right handed. This means that I pressed the left side of horizontal strips into the pie crust and pressed the top end of vertical strips into the pie crust. 
The right ends and bottom ends were not stuck down because I needed to lift them to weave an over-under pattern (image 2).\nKeep adding strips, alternating between columns and rows, weaving over-under until you have covered the top of the pie. Use the leftover strips to circle the rim of the pie dish to hold the basket weave in place, and to look prettier (image 3).\nNote: you can \"recycle\" scrap pieces of dough that are too short, that tear, etc. Just ball them up, re-roll, and re-cut.. \nAt this point, your oven should be at the correct temperature, 400 degrees F.\nSprinkle a little sugar on top of the pie to give it a nice, sweet crust. If you like your pies to be really tart, you can squeeze another 1/2 of a lemon on top of the entire thing. Try not to leave pulp on top of the dough because it tends to blacken the basket weave (like I did here).\nPut the pie in the oven, middle rack, for approximately 45 minutes or until the crust is brown and the filling settles. You can check by sticking a skewer into the middle of the pie - if it comes out clean, the pie is all set.\nTake the pie out when it's done and let it cool for 20-30 minutes at least. You can sprinkle more sugar on the top if you want. Serve it warm or cool, with or without ice cream, however you want.\nEnjoy! 
(image 1).\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_84_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_84_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_84_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_84_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. you will need oil - 2 table spoon cumin seeds - 2 tea spoons chopped garlic clove - 2 to 3 chopped green chilly - to taste ( i took 2 small once) chopped onion - a big one chopped tomato - about 2 plus 1 for garnishing boiled chopped potato - 1 or 2 salt - to taste garam masala - 2 tea spoon. heat oil in the pan. add the cumin seeds to it and stir for half a minute. then add chopped garlic and green chilly and stir for a minute. add the chopped onion and let it cook till it softens. keep stirring in between. you can see the color change from images. once done add tomatoes, chopped boiled potato, salt, garam masala powder and stir well. add some water and put the lid on and cook for 3 to 5 minutes on medium heat till everything is cooked well.take the pan off heat and let it cool.. once the mixture cools, grind it on blender. and the puree is ready.. poach the eggs. there are several instructables available on how to poach the egg and i am not going to repeat. in fact i learnt to poach egg from those instructables! . i served mine with fresh bread. and garnished with tomato. let your creativity fly. 
do something new....\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_85_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_85_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_85_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_85_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. You will need:For the buns7g dried yeast240ml whole milk60g caster sugar380g plain flour30g cocoa powder1 tsp cinnamon60g chilled butter (cubed)1 egg (beaten)100g chocolate chipsFor the glaze2 Tbsp of marmaladeFor the chocolate cross100g dark chocolate (70% cocoa)100ml double creamTo bakebake at 180 degree celsius for 10 minsthen, reduce temperature to 160 degrees celsius for 15 mins. Gently warm the milk so it is about body temperature, put your finger in the milk and when it feels just warm it is ready (be careful not to heat the milk too much and burn yourself). It will only take a short time to come up to temperature so keep an eye on it.Add the dried yeast and one Tbsp of the sugar and stir, the sugar and the warm milk will start to activate the yeast, you will see the mixture start to bubble. If it doesn't then it is possible the milk was too hot and you have killed the yeast, in which case you will need to start again.. Pass all the flour, cocoa powder and the cinnamon powder (optional) through a sieve, incorporate the dry ingredients and to this add chilled cubed butter. 
With your finger tips crumble the butter and dry ingredients together until the mixture resembles bread crumbs. You can use a food processor but using your hands is much more rewarding :). Lightly beat the egg and put to one side, make a well in the middle of the dry ingredients. Add the yeast mixture, egg and sugar to the well.. With your hands bring the wet and dry ingredients together, it will be sticky to start with but knead a little in the bowl. Then, lightly flour the surface with some plain flour and turn the formed dough out, knead the dough for around 10 minutes until it is elastic when you pinch it.. To prove the dough find a warm place and put the dough in the bowl, under the tap wet a cloth and squeeze the excess water out of it so it is damp. Cover the bowl with the cloth and leave the dough to rise for about 1 hour or until the dough has doubled in size.When the dough has proved, with your hand know it back and take out the bowl, returning to the work surface. flatten the dough.. When the dough has been flattened, pour the chocolate chips on top of the dough. You can use dark, white or milk chocolate chips for this.knead the dough for a further 3 - 5 minutes until the chocolate chips have been incorporated into the dough.. Roll the dough out into a sausage shape and cut it into 16 pieces, shape each of these pieces into a ball in your hands and place in a greased and lined baking tin. Cover the dough with the same damp cloth used earlier and leave to prove for 30 mins or until doubled in size again.. Pre heat the oven to 180 degrees celsius (fan) and bake the buns for 10 minutes then, reduce the temperature to 160 degrees celsius (fan) for a further 15 minutes. While baking continue with preparing the glaze and chocolate ganache.. Pour the cream into a cold pan and bring up to the boil, when the cream just starts bubbling, take off the heat and add the dark chocolate, stir in gently, but don't over stir and set to one side (5 mins).. 
In a cold pan add the marmalade and bring up to heat gently, melting the marmalade into a syrup. Stir the chocolate and leave to one side to cool slightly.Take the buns out of the oven and immediately brush on the melted marmalade to give a beautiful glossy sweet finish.. If you have a piping bag then you can use that but if you don't a food bag can work just as well.Take a food bag and put in a cup or glass to make it easier to fill. Spoon the ganache into the bag and twist the end pushing all the chocolate down to the bottom.Take some scissors and snip a small hole in the bottom of the food bag, pipe the chocolate ganache in single lines across the buns, then turn the tin and pipe in the perpendicular direction.Serve your hot cross buns with a generous amount of butter and some marmalade.If you like you can subscribe to the YouTube Channel and like the Facebook page.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_86_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_86_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_86_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_86_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. You will need water that is nonclorinated. I set some tap water out in a jug with the lid off afew days before to let the clorine evaporate. You can use bottled spring, drinking, or distilled water if you would like. It just cannot have clorine in it or it will kill the beneficial bacteria. 
Vegetables: \u2022Cabbages\u20221 head Bok choy\u20221 head Napa\u20222 Leeks\u20223-4Carrots \u20221 Daikon radishSpices:\u2022Garlic\u2022Red chili peppers\u2022Ginger root\u2022Himilayan sea salt\u2022Sriracha hot sauceTools:\u2022Knife\u2022Cutting board\u2022Large jars\u2022Large pot\u2022Plate (needs to fit inside pot)\u2022Plastic zipper bags\u2022Baking tray or large plate. Wash all veggies with cold water. Leeks tend to hide dirt near their base as well as the bokchoy so these need some more attention than some of the other veggies. I wash these as I cut because of this.Tips: \u2022 Don't cut peices too large for your mouth. I like cutting them large but keep in mind as you cut that the peices still need to fit comfortably in your face while you chew.\u2022Leeks: Cut the bottoms off near the roots. Cut any wilt from the tips. Cut in half lengthwise lining up with the base of the \"V\". Cut the white and yellow sections in large peices. Fan the cut sections kind of like a deck of cards as you throw them in the pot. This will separate the layers easily. I tried to get a picture of this but it is near impossible to do with one hand. Looking at the picture imagine the cutting board as your other hand. Dirt likes to hide just below the green. Set aside the outside green peices to be washed individualy as you cut up the stalk. You don't have to use the green parts but I try not to waste and don't like to set aside for additional recipes/work. The green parts take longer to soften so I cut these into thinner strips.\u2022Bokchoy: I prefer to break each leaf off and wash them. The base usually has some dirt that has collected between stalks. after I have done this I will cut into about 1 inch peices.\u2022Napa: Wash outside and remove wilt. Cut into quarters legthwise. Cut quarters into about 1 inch sections.\u2022Carrots: Scrub outside. I slice on an angle to make them longer than just having sliced coins. 
I have also shredded them with a potato peeler for very thin long peices. \u2022Daikon radish: Cut end off. Slice into coins. Any that seem too big can be cut in half as well. It may be easier for you to cut in half first but I personally I just cut stacks in half after slicing. Put everything into your large pot or other container. There are no specific veg measurements that I use when making this. But it does make enough that I usually have enough for a month or more.. This step starts the fermentation process off on the right foot.With all your veggies in a nonreactive container sprinkle 1Tbs salt (any salt as long as there is no iodine) per 2 cups (nonclorinated) water over your veggies. I don't add salt to many foods so my pallet doesnt require it for flavor. I add because of necessity for safe fermentation. I would recommend if you often use salt consider doubling the salt content. It is easier and best to add extra now than try to later. Using less salt will make it ferment faster as well which means spoilage if there is too little. I sprinkle one Tbs then use the 2 cups water to \"wash\" it in, then stir with my hand grabbing from the bottom and bringing to the top. Do this until the brine is about half the depth of the veggies. Compress and cover veggies with a plate, put some weight on it to push them into the brine (I use a gallon jug of water). As the salt pulls water from the veggies they will settle more and eventually submerge fully. Leave like this for 6-8 hours to let this happen. . This is where your pallet plays a large roll and will require some experimentation. For my recipe I use about 6 garlic peices, some dried red pepper, a piece of ginger about the size of my thumb and anywhere from 1/4 cup to 1/2 cup Sriracha.Remove garlic skins. Remove pepper stems. Put all spices into blender or food processor and blend until it has become a paste. . Taste. I don't rinse my Kimchi as long as it doesn't taste too salty for me. 
Leaving the plate in place hold the veggies in the bottom of the pot and pour brine through a strainer (no pic, used both hands). This strainer is just in place to catch whatever gets by. If yours is too salty after this step then add water (nonclorinated) to the veggies, stir and strain until you get the salt taste you like. You can save the brine if you would like for something else as it already has some culture started in it. I usually just strain into the sink. Put whatever your strainer caught back into the pot.. Mix spice paste into the veggies. use some to \"squeegee\" the spice paste out of the container blending container. Warning: If you have sensitive skin this could effect you. I would not consider my skin sensitive but my hand does feel warm after this part and touching eyes will burn. Wash hands thoroughly after.. Pack the kimchi into your jars. Push it down so that liquid covers as much of the veggies as possible. leave some space at the top of the jar. Put some plastic zipper bags on top filled with water (oops I ran out of bags). The bags are used to let gasses escape from the kimchi and keep nasties from getting in as well as keep it submerged. Place a baking tray or large plate under the jars. As they off gas the liquid may spill over. Using a clean utensil press the kimchi back down at least once a day to let trapped gasses out. (This is a great time to dish some to try its flavor development)Let it ferment at room temp for 4ish days before sealing with a lid and refrigerating. 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_87_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_87_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_87_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_87_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Gather the following ingredients:1 Cup White Vinegar1 Cup Water4 Avocados1 Tablespoon Red Pepper1 Tablespoon Rainbow Peppercorns2 Teaspoons Pickling SaltGather the following materials:Mason Jar for Storage*Note: You could also add other spices or flavors like lemon, juniper berries, or bay leaf to your pickle brine if you'd like. I chose to keep it simple for this first avocado pickling.. For this Instructable you'll want ripe avocados on the firm side so that they hold their consistency when you pickle them. I have found that you can identify a ripe avocado easily by simply checking under the stem. To do this, hold the avocado in your hand and place your thumb on the stem. Roll your thumb against the stem until it rolls out of the avocado. If the stem separates from the avocado easily and leaves a bright green patch of flesh underneath, you've found the perfect avocado. Conversely, if your stem does not roll away from your fruit, the avocado is not ready. If it rolls away and the avocado flesh that is exposed is brown, your avocado is over ripe, mushy and not ideal for pickling. . Let's start by making your brine. This way, your brine can cool while you prepare your avocados for pickling. 
Place 1 cup of vinegar with 1 cup of water in a nonreactive pot on your stove. Add 2 teaspoons of salt and bring to a boil, stirring to dissolve the salt. Once boiling, reduce to a simmer until all of the salt has dissolved. Remove your pot from the heat and allow to cool. . Using a sharp knife, cut your avocados in half lengthwise by rotating the knife around the avocado. Once halved, separate the avocado halves by placing a hand on each half and rotate, twisting your hands in opposite directions. . To remove the pit of the avocado, hold the avocado half in your non-dominant hand. With your dominant hand, carefully whack your knife into the pit until it sticks firmly. Keeping the knife firmly stuck into the pit, twist the knife until the pit rotates and comes loosely away from the fruit. The avocado pit should now be stuck to the blade of your knife. To safely remove the knife from the pit, place your fingers against the pit from the back of the blade. This way when you apply force to remove the pit from the blade you are working in a direction with the blade, not against it. Apply force to the back of the pit, until it releases from the knife blade. . Gently pry away the skin from your avocado fruit using your fingers. The skin will come off in pieces but that is okay, keeping the fruit whole is more important. You can also use a spoon along the inside of the avocado fruit to loosen it from the skin, but I have found that a little patience and peeling leads to a better end result. . Place your avocado halves, cut side down on your cutting board. Cut slivers lengthwise and then cut your avocado into cubes by slicing your slivers crosswise. You should finish with a bunch of avocado cubes. . Start your jarring process with a clean mason jar. If it is not clean and clear of impurities, boil your jar in water for a minimum of 10 minutes and allow to dry. With your clean mason jar, add your tablespoon of red pepper flakes as well as your tablespoon of peppercorns. 
Add your sliced/cubed avocado until it's about an inch below the jar opening. . By now, your brine should have cooled and it is safe to add to your avocados. Pour carefully and slowly, taking care to completely cover your avocados with the brine. This will start the pickling process. Place your lid on your jar and store in the refrigerator for at least 6 hours. The longer you store your avocados, the more pickle like they will become, but this is a quick pickle, and your avocados will be done after 6 hours. Continue to store your pickled avocados in the refrigerator when you aren't currently enjoying them. . You're done! Your pickles have a shelf life of roughly a week--thats the longest they've lasted with me. Enjoy your pickled avocados. I enjoy putting them on top of kale salads, over rice, or with cheese and crackers. \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_88_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_88_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_88_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_88_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Pan(s), bowls, utensils needed: You will need one 8.5 ( 8 1/2 ) inch spring form panone measuring cup of each of the following: 1 full cup 1/2 cup 1/3 cup 1/4 cupA food processor, and an electric mixer help tremendously. Along with various things as bowl spatulas, 2 mixing bowls, one double boiler and a sauce pan. 
Ingredients: ....For the torte itself1/2 cup blanched almonds, lightly toasted2 oz. unsweetened chocolate - dark2 tbl unsalted butter2 large eggs1 cup sugar1 tbl framboise or other raspberry brandy3/4 cup all purpose flour1 tsp. double-acting baking powder1/2 tsp salt1 cup raspberries plus additional for garnish and accompaniment....For the glaze1/3 cup raspberry jam (seedless is more convenient)1 tbl sugar....For the Ganache1/4 cup heavy cream6 oz. fine quality bittersweet chocolate / choppedI always advise that one reads the recipe through once, gather their stuff, including utensils, then read down through the recipe again to make sure nothing was missed. . Start by grinding the almonds in the food processor for about 5 minutes or until they are of the consistency of a nut butter, and set it aside. Using the double boiler to melt the chocolate & butter, stirring occasionally. Once melted, remove the bowl from the top pan. In the large bowl of an electric mixer beat the eggs until they are pale, adding the sugar gradually, and beat the mixture until it is very thick and pale. Beat in the chocolate mixture, the framboise, and the reserved almond butter & beat the mixture until it is combined well. Into the bowl sift together the flour, the baking powder, and the salt, beat the mixture until it is combined well, then fold in 1 cup of the raspberries gently. Turn the mixture into a well buttered 8 1/2 in spring form pan, spreading it evenly and smoothing the top, then bake the torte in the middle of a preheated 3500 F oven for 40-45 minutes or until the tester comes out clean.. Let the torte cool in the pan. . Make the Glaze: In a small heavy saucepan combine the jam and the sugar, bring the mixture to a boil, stirring it for 3 minutes. If you have the seeded kind, you will need to force the mixture through a fine sieve into a small bowl, pressing hard on the seeds. 
Invert the torte onto the rack, set over wax paper, remove the bottom of the pan, and spread the glaze on the top and sides of the torte. Let the torte stand at room temp. for 2 hours or chill it for 30 minutes, or until the glaze is set. The torte may be prepared up to this point , up to 1 day in advance and kept on the rack (cover with an inverted bowl). . Make the Ganauche: In a small heavy saucepan bring the cream to a boil and remove the pan from the heat. Stir in the chocolate, stirring until the the mixture is smooth, and let the ganauche cool for 3 minutes. Pour the ganauche over the torte, smoothing it with a spatula and letting the excess drip down the sides, and let the torte stand for 1 hour, or until the ganauche is set. Transfer the torte crefully to a serving plate, garnish it with some of the additional raspberries. If you are expecting this to be an overly sweet treat, you may be disappointed. But if you like Raspberries and chocolate, this a a great little torte ( I am not used to torting my own horn :-) . I have had my torte now since I made it, and although I have been told that it was good (by someone else in my household), they haven't really had much of it. So half of it is left ( no, I am not suggesting shipping a piece to the first 8 that reply :-), and it was placed on a plate and covered with a mixing bowl, and is refrigerated. Well, I am sure you know what is happening to it. It is drying out. \nSo, if this happens, especially to Chocolate cake of any kind, what I like to do is cut off a piece. Put it in a bowl (or a sturdy plate if you wish), and slather it with unsweetened applesauce. Umm um. The dryness doesn't seem to matter anymore and you get to enjoy the rest of your cake (and the applesauce is a healthy addition too). You win on all counts. 
\n\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_89_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_89_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_89_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_89_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. To make carbonated fruit you only need to gather a few things:FruitWhen making carbonated fruit it's best to use firm fruits, like oranges, apples and pears. I tried doing it with softer fruits like kiwis, strawberries and bananas and it just doesn't work as well. Apples in particular seem to work the best.Bottle or ContainerYou will need a plastic bottle or a container to put the fruit into. I have found that a wide mouth Nalgene works best. You can use an empty 2 liter soda bottle however, just be careful not to add in too much dry ice, more on that later. DO NOT use a glass jar. The bottle will be under pressure and broken plastic is safer than broken glass. If you have a vessel that is designed to take pressure, like a beer keg for example, than by all means try using that.Dry IceThe final thing you will need is a block of dry ice. You will only need a tiny tiny amount of dry ice to make the carbonated fruit, but its hard to buy less than a large block of the stuff. Now, chances are that you have never seen dry ice for sale. 
You can't make it on your own and you might not be able to find so easily.I used the Dry Ice Directory to find out where it was being sold locally - they have listings for all over the world. I live in the east bay of California. I was surprised that In all of Oakland there was only one distributor - the AM PM Gas Station on Market and Grand in West Oakland. They oddly enough had a ton of the stuff for sale, and they are open 24/7! I was very impressed that I could buy dry ice anytime I wanted even if it was only for sale at that one place.**Before you go to buy the dry ice please refer to thisDry Ice Safety Info website. I am not going to go through all of the safety precautions that should be taken in this instructable, so take a minute to familiarize yourself with its possible safety hazards.**. The first step is to cut up the fruit and put it into the bottle(s). Cut the fruit as if you were making fruit salad - no seeds or orange peels are wanted here. \nI cut smaller pieces to fit through the narrow neck of the soda bottle and bigger ones for the wide mouth of the nalgene. I highly recommend using a nalgene to make carbonated fruit.. The next step is to cut off a small chunk of dry ice from the block. You only need about 2 grams, or a piece about half the size of your thumb. There is no harm to putting in too little dry ice - you will simply end up with only slightly fizzy fruit. However, putting in too much dry ice IS dangerous and could make a really big mess. Dry ice is constantly sublimating (not melting) from its solid form of CO2 to CO2 gas. Unlike regular ice made from water, it goes directly from its solid phase to its gaseous phase - no liquid phase in between. That is why it sublimates, rather than melts.As a result, the dry ice block will produce gaseous CO2 until there is nothing left of the solid block. 
The bottles are going to be sealed tightly with their caps, so if too much CO2 gas is built up inside of the bottle they might explode (the soda bottle bursts at around 115 psi). We are looking for only a little bit of pressure (30 psi) and so there is no need to add in a big hunk of dry ice.The dry ice in the picture below was enough for both of my bottles of fruit, so each one got about half of the small chips you see below.. As soon as I put the dry ice into the bottles and sealed the top I could see it turning into its gaseous phase. Most of the dry ice will sublimate in an hour, so thats all the time it will take for the bottles to become fully pressurized. Waiting overnight is a good idea to let the CO2 gas work its way into the fruit.\nI put the bottles into an empty drawer and closed it for the first hour - I have to be honest, it was the first time I was doing this and I didn't know what would happen. After an hour I could see that the bottles were under pressure, but not in any danger of exploding, and so I transfered them to the refrigerator for the night.\nYou can only carbonate things that have water in them. I thought about doing fizzy meat, but I don't think there is enough water in it to dissolve the CO2 into.\nI went to bed and brought the bottles with me to Instructables HQ the next morning.. Once the bottles have sat overnight you are ready to open, eat and burp.\nBleed the pressure from the bottle buy opening the cap like you would open a soda bottle that had been shaken. \nI cut the top of the plastic soda bottle off with a sharp knife and poured it out into a bowl. You can simply pour the fruit out of the nalgene bottle through the wide mouth of the bottle. \nNow that the fruit is out of the bottles it's ready to eat! It loses its fizzyness pretty quickly, so make sure you chow down in the first 15 minutes after opening the bottles. \nCarbonated fruit tastes like regular fruit, but it tingles on your tongue. 
It's a totally unique experience to eat, and makes you burp a whole lot if you have done it right.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_90_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_90_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_90_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_90_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. First make a hole in the side of the marshmallow by twisting\u00a0a pretzel into it.\u00a0 This way you will have an easier time sticking your sword in it later on.\u00a0\nDip your fork or spoon into the red food coloring and run it down the marshmallow. This will give the red blood vessel look to the \"eye.\" . Get your almond bark ready by melting it in the microwave.\u00a0Heat the almond bark in intervals so that you do not burn it.\u00a0Also, pull out a piece of wax paper\u00a0to put your pretzels on for drying. Once your almond bark is ready, dip the pretzels in\u00a0it.\u00a0 Because of the small amount that I melted, I found it easier to cover the pretzels by using my finger to spread the almond bark on the pretzel.\u00a0Place the pretzel on the wax paper and let it dry.\u00a0 Dip your pretzels one more time if they look bare. Place it on wax paper and let it dry again.\u00a0. To make the sword handles, break apart pretzels into smaller pieces. 
It may be easier to break a piece of the pretzel off and shave it down.\u00a0Then attach the smaller pretzels to the sword by using the almond bark as glue. Also, cover the handles with almond bark and let it dry.\nTo get the brown handle, take a block or two of your almond bark and melt it (you may have some left though). Then add in equal amounts \u00a0of red, blue, and yellow food color till you are happy with the color.\u00a0 Because I used liquid food color the almond bark turned to a gritty texture. Then apply your colored almond bark to the sword for the handle\nNow your sword is done!. Place the Sword in the fake eyeball where you made the hole earlier.\u00a0 Then add red food coloring to the sword where it sticks in the marshmallow.\u00a0\nTo make the iris and pupil, take an m&m and place a little bit of chocolate on it .\u00a0 Then dip the chocolate on the m&m in the food coloring.\u00a0 Attach the iris to the marshmallow using the almond bark as glue.\u00a0\nEnjoy and Be Careful :)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_91_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_91_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_91_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_91_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 1/4 Cup Extra Light Olive Oil 4 Cloves Garlic 2 or 3 slices raw Red Chili Pepper with seeds 3 Anchovy Fillets 1 Can (14.5 Fluid oz.) Petite Cut Diced No Salt Added Diced Tomatoes 1 Can (2.25 Fluid oz.) 
sliced Black Olives 1/2 Cup good Dry White Wine (I used Barefoot's California Riesling) 1 TBS dried crushed Oregano 1/2 teaspoon Coarse Ground Black Pepper A sprig of Fresh Basil ,( plus 1/4 cup shredded Basil Leaves for Garnish) 1/2 lb. Fresh Raw Medium or Large Shrimp, peeled and deveined 1 Can (6.5 fluid oz.) Whole Shucked Cherrystone Clams (I used Trader Joe's' Maine Whole Cherrystone Clams) 1/2 oz. Shredded Sharp Provolone Cheese (BelGiosio's Sharp Provolone is excellent) 1/2 package (8 oz.) Fettuccine (or pasta of your choice). *Smash, peel, and finely chop the Garlic Slice the Red Chili Pepper Rinse and Drain the Canned Clams and set aside Drain the Black Olives and set aside Have your spices and the Anchovy Fillets ready to use when needed.*NOTE: When preparing garlic, I smash the cloves with the flat side of a big knife or cleaver because this makes it easy to remove the skin (peel). then I slice off that little brown piece (the stem end) from the garlic clove and discard it (it can add some bitterness to the garlic). Then I finely chop or dice the clove.. *Add 3 quarts of water and a teaspoon of salt to a large pot and begin to bring it to a boil preparatory to cooking the Pasta.Heat the 1/4 cup of Oil in a heavy pan over medium heat.When the oil is hot, add the garlic, red chili pepper, and anchovy fillets. 
Simmer, while mashing the Anchovy fillets with the back of a wooden spoon, just until the anchovies melt into the oil (about 2 minutes; you don't want to burn the garlic).Add the tomatoes, black olives, wine, Basil Leaves, black pepper and oregano; bring mixture to a boil, reduce heat to low or medium low (depends on your stove; you want the sauce to \"bubble\" a little as it cooks), cover and simmer for 15 minutes.Add the shrimp; re-cover pan and continue to simmer for 4 or 5 minutes more, or just until the shrimp begin to turn pink.Add the clams; turn off the burner but leave the pot on the stove for a couple of minutes to allow the clams to heat through; stir in 1 ladle of pasta water (about 3 fluid oz.), taste sauce and add salt or any seasonings needed, and remove pot from burner.*NOTE: It will take approximately 10-12 minutes to cook the pasta AL DENTE (follow the instructions on the package). Try to time it so that the pasta water is boiling and ready to receive the Fettuccine when you have about 15 minutes before the sauce is ready to serve. Plate the pasta, spread some shredded cheese over the it, top with a plenty of the Shrimp & Clam sauce, and scatter some chopped fresh Basil Leaves over it. Since I had some of that good Riesling Wine left in the bottle, I thought I might as well enjoy a glass of it with my dinner!Mangiare!. Nutrition for this recipe was calculated by me using the MyFitnessPal Recipe Analyzer. I serving of the Shrimp & Clam Sauce has an estimated 393 calories, but only 13 carbohydrates, and it is full of protein (an estimated 21 grams per serving).However, the Fettuccine (according to Ronzoni) will add about 200 calories and 42 grams of carbs to the dish.per 2 oz. serving. 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_92_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_92_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_92_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_92_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Grind your coffee. There are several options here and really this is the only time I'm going to recommend a coffee-making-oriented solution. This solution is GET A COFFEE GRINDER. I bought a bur grinder, one with a hopper at the top (the glass funnel thing). If you have one, great, if you don't, get on Gumtree, Craigslist, eBay, whatever. I got a Delonghi one for $20 and it beats the pants off any bullet grinder. Bullet grinders are difficult to use, burn the coffee beans and are really inconsistent. You can use a bullet/blade/blender grinder, but be warned, it won't be an enjoyable experience.If you don't want to grind coffee, get your coffee provider to do it or buy it from the supermarket pre-ground. I'll give you an interesting statistic: a coffee barista told me that coffee grounds have an expected life of 15 minutes. Pre-ground from the supermarket? You're buying stuff that was dead and buried a long time ago. It'll do if you want one step up from instant coffee, but I'm thinking if you're reading this, you want something a bit more fancy.. Grind heaps of coffee or use heaps of pre-ground stuff. If you have a 1.5 Litre (imperial units be damned) jar, put 100 grams of coffee in. 
Honestly I never really measure it, but that feels about right. You could put half or double in, it would just make it stronger or weaker, but there's a point of saturation so don't overdo it or you're just wasting coffee beans.Pour room-temperature water into the jar and fill it up nearly to the top. Mix it up with a spatula or a spoon.You could also use a whisk if you want to be pointless and inefficient.. Put the jar in the corner and feel really cool about yourself. Leave it there a couple of days. Ambient temperature might play a factor so don't leave it in direct sunlight. Here's a guide - if you feel hot in the same room, it's too hot.Once a couple of days has gone by, put the jar in the fridge. My fridge door started to crack from the size of this jar so I now put this jar in the vegetable tray at the bottom. If you put it on it's side make sure it doesn't leak - coffee tastes great but looks horrible as a brown stain on your fridge shelf!. Take the jar out of the fridge once it is chilled. I leave the method to you but I prefer to use my hands.Get the things you need to filter the coffee.The filter can be:- a paper filter for a percolator- a filter for a drip-filter- or what I use, a generic paper towel. The thinner the better!Get a funnel. I have an Aeropress which comes with a weird hexagonal funnel. It fits the need perfectly.. Put the funnel into the top of the second jar. Put the paper towel into the top of the funnel and tuck the edges of the 'filter' inside the funnel. If you don't, the coffee will drip outside the jar. If you use a proper filter you should be fine to just sit it in the top.Pour the coffee into the funnel slowly, taking care you don't mix up the coffee grounds - it's best if the coffee grounds stay at the bottom of the jar. It will filter through slowly so don't overfill it.When it gets low in the funnel, top it up. The filtering should get slower and slower. Once it is too slow, take the paper towel out and wash it. 
Yes, you can reuse a paper towel! Put it back in, or if you're lazy just throw the paper towel out and put a new one in.. Keep going until you're finished, and then take the paper towel out and gently wring it out; carefully though, it could easily burst open and put grounds into your beautifully filtered brew. You should get another half a drink's worth if you squeeze it out well.. See this picture. It glows golden brown.That's because I shone a torch behind it. Well it looked cool.Drink it up good. It will taste light and almost tea-like, but it is very strong in caffeine.That's it!By the time I have filtered one lot I wash the first jar out and start filtering again. I never really run out that way. \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_93_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_93_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_93_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_93_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. What you need for this is nothing out of the ordinary. Simple and quite frankly, fun! Most of this is simply to eye-ball what you need so don't be afraid of being precise with the amounts of the ingredients.you will need... A slow cookeramaz-n smoker ( to smoke the cheese. although we just modified and made our own. 
We will show you how we did it) smoked wood pellets (any flavor, but I recommend a hickory)------Recipe----------------------------------------------------------------------------pulled pork1 -2lb pork butt/shoulder1/2 cup of chopped onion 1 tablespoon smoked paprika4-5 chopped garlic clovessalt and pepper to taste12 ounces of a dark beer (or your favorite) for me I choose Sam Adams Winter Lagerhalf a jar of your favorite BBQ sauce plus additional for drizzling grilled cheese sandwiches1 tablespoon unsalted butter1 tablespoon flour1/3 cup of your favorite beer 1/4 cup milk3-4 ounces freshly grated smoked white cheddar cheese 4 slices slices white bread softened butter for spreading. Cheese is one of my most favorite things to eat! Even being lactose intolerant, I find myself gorging on delicious pizza or anything to satisfy my cheese cravings. It's even scientifically proven that cheese can be addicting!! Curse you cheese!For this recipe, we are using Vermont White cheddar from Boars Head.A great tasting cheese can always be elevated by something called cold smoking. It's called cold smoking because you do not use fire or heat to cook the cheese, you just use smoke to infuse it with flavor over a few hours.To start, all you need is a grill, some smoking wood pellets and something called amaz-n- smoker. However for us, we simply bought some thin grill grates and bent them to match one to save some cash. You can see how we bent the grate in the pictures provided. The grill plates cost $4.95 and it came with a pack of 3. So no brainer there, we made our own amazin-ly cheap smoker. When ready to bend them, essentially you are making a square arch to put the pellets in. Worked great for us and we saved some $. Always a win.Light the smoked wood pellets, place the cheese on the other side of the grill (away from the smoldering pellets) and simply close the grill and let it smoke for 2 hours. Keep an eye on it and make sure the smoke is still rising from the vents.. 
Like your grill is having a secret party without you... After 2 hours wrap it up with plastic wrap and store it in the fridge for up to 2 weeks. The longer the cheese rests, the better the smokey flavor. So you can smoke the cheese right before you decide to cook the pork or you can do it days ahead (recommended). In case you were wondering, No, it's not hamster food. It's smoking pellets!!~~a fun tip~~~ Did you know, cheddar is a cheese you can eat if you are lactose intolerant? Aged cheese in general most people with lactose intolerance can tolerate as most of the lactose is drained with the whey when making it. However, I wouldn't recommend gorging on them as the small traces of lactose can add up quick and cause havoc on your tummies. Trust me I know.... After you have smoked your cheddar and it is safely wrapped in its cool home in the fridge, you can now cook the pork. Sear the pork on the stove to get a nice flavor. Do this about 2-3 minutes per side.( this step is optional but adds more flavor!!) chop up your garlic and onions and place them in the slow cooker.Then place the pork on top of the chopped onions and garlic. Add the smoked paprika, dark beer, salt/pepper and BBQ sauce and then set the timer. The cook time will vary depending on your size of pork roast. It will be around 5-6 hours per pound on the low setting. I've seen recipes take 8 hours though. Low is slow is the way to go!! I'd save some BBQ sauce to use as extra toppings for the grilled cheese sandwich. Flip the pork every couple hours or so while cooking. Half way through the cook time make sure to shred the pork. . What you are making here is a sort of cheese gravy for your grilled cheese. It'll be the cheese that holds your grilled cheese sandwich together and It's gonna be super delicious!! Your cheese cravings WILL be answered! What makes this grilled cheese sandwich extra good is that the cheese has been infused with a delicious smokey flavor and then sauteed with beer.. 
What more could you ask! (more cheese maybe??) yeah ok I'll give ya that one. To make the smoked beer cheese sauceHeat a small saute' pan on medium heat and add 1 tbs of unsalted butter. When the butter starts bubbling, add about a tablespoon of flour and whisk it in for a minute or 2. Time to pop another bottle o' beer. I ain't complaining... This time I am using a lighter beer as to not cover the flavor of the cheddar (although I'm sure any type beer will suffice) Add about 1/3 cup of the beer to the flour mixture continuing to whisk. Then, whisk a quarter cup of milk in. reduce heatAdd the grated smoked cheese in and continue whisking until cheese comes to a smooth consistency. Stir every few minutes while continuing to cook. tip: Remember that these measurements depend on how much you are using. If your cheese mixture is too clumpy, add a bit more milk, if too runny, turn up the heat and add a bit more flour. Now time to assemble your sandwiches. You're getting close to magic!. This is your basic, MAKE A GRILLED CHEESE sandwich part... but with more savory ingredients! The makings should be pretty straight forward, but in case you don't know...Heat another saute' pan on medium heat. Butter the outside of 4 pieces of the white bread (or more, depending on how much you cooked). I bought my own loaf of white bread to make extra big slices. Place each slice butter-side down on the skillet, then top with a few handfuls of pulled pork. Cover in the warm smoked beer-cheese sauce, then top with the other slices or bread, butter-side up. Turn the sandwich over in about 2 minutes per side. Add any more BBQ sauce you may want at any time assembling. Remember that the cheese is already melted and pork is already done, so it won't take as long to cook. Cook until each side is deeply golden and crispy and your desired effect of cheese heaven.. We found we poured more beer than was required into the cheese and it tasted amazing! 
This cheese recipe can also go very well with a nachos or a dip in a football party perhaps. I truly hope you enjoy your sandwiches. Let me know if you make it!! May the god of cheese be with you!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_94_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_94_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_94_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_94_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Tip #1 Start early for slow cooker recipes. Take your pork shoulder and completely coat with dry rub spices.\u00a0 If you would like a sweet taste add more brown sugar, if you prefer savory add less.\u00a0 After coating pork shoulder (5 lbs) place into crock pot with 2 cups of water on high for 6-8 hours.\u00a0 When ready remove from crock pot and using forks begin to tear pork away from bone.. Tip #2 Prepare your salsas before the meal and refrigerate for at least 30 minutes to get optimal flavors! Now time to make your salsas\u2026we have corn and tomato salsas that we are going to prepare.\u00a0 Open your can of corn and drain out the liquid.\u00a0 Place in small bowl and mix in chopped basil and\u00a0 diced red onions.\u00a0 Set aside.\u00a0 To make your tomato salsa take the diced tomatoes, mix in red onions, cilantro, and cumin\u2026that\u2019s all there is to it. Set it aside.. Now time to prep the vegetables for meat.\u00a0 Dice up red onions, green peppers, jalape\u00f1os, and garlic.. 
Tip #3\u00a0 Let garlic simmer in hot oil to get the most flavor! Heat olive oil in a deep skillet on medium heat. Once hot add garlic and let it it simmer until golden brown. Once lightly browned add in the rest of the vegetables, stir, and let them cook through for about 5 minutes. Add in about 2 pounds of the pulled pork, about 1 1/5 cups of water, and the spices listed above (use the rest for sandwiches, more tacos). When adding spices taste and add more as you need them.\u00a0 I found that I use a lot of cumin and paprika in comparison to the other spices.\u00a0 Also if you have a lime handy squeeze a bit in instead of using salt.\u00a0 Add in some chopped cilantro, stir, cover and let it simmer for about 30 minutes.. After 30 minutes taste the pork and adjust flavor if necessary.\u00a0 Now if pork mixture is to thin drain about 1/3 cup of the fluid into a small bowl, and mix in about 2 tablespoons of flour until smooth, and return back into the pork and stir.\u00a0 Repeat this until it is a desired thickness.. Tip #4\u00a0 Always pan fry your soft tortillas, it really makes a difference. Place a skillet on high heat and lightly grease with cooking spray.\u00a0 Add on your tortillas and flip until golden brown and bubbling on each side.. Once soft tortillas are browned you can begin to assemble!\u00a0 Place tortillas flat and add pork, followed by cheese, both salsas, and light sour cream. Now fold em\u2019 up and begin to feast!. Suggestions \u2022 Serve with a side of tortilla chips and even add them into your taco for a bit of crunch. \u2022 If you like your tacos extra spicy add on Sriracha or your favorite hot sauce. \u2022 Serve this pulled pork recipe in any sort of style; nachos, burritos, or maybe a tostada. nom nom nom. 
For this recipe and more please visit my food blog at everythingbutfish.tumblr.com\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_95_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_95_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_95_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_95_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. - 250g caster sugar- 70mL water- 1/2-1tsp salt- 200g cold cubed butter- 120mL thickened cream (36% fat content this is important for a nice thick caramel)- 1tsp vanilla bean paste. - 1 Small saucepan- Mixing spoons- Candy thermometer- Whisk- Glass container for storage. Place the caster sugar and water in a small saucepan and cook the syrup on a medium high heat until the syrup turns a light yellow colour. (This should take 5-8 minutes).Now carefully watch the syrup until it turns a golden amber colour because this will change very suddenly and if not watched it may burn (be careful!). Once the syrup is an amber colour remove from the heat and add the thickened cream and stir in with a whisk. Be very careful when whisking this as there will be a lot of steam coming out of the pot. When the cream is mixed in place the pot back onto a high heat and allow the caramel to come up to 108 degrees Celcius while stirring (use a candy thermometer to read the temperature). Then remove the pot from the heat and allow it to cool until the caramel stops bubbling. 
Then add the vanilla bean paste and salt (the amount of salt depends on how salty you like your salted caramel).When the caramel has thickened (stir for about 1 minute) then add the cold cubed butter and continue to whisk the caramel together. (If the mixture splits then you have added the butter when the caramel is too hot.). Serve the caramel while hot over ice cream or cookies. *Filling for baking/ macarons* use the caramel when cold out of the fridge.Place the caramel in a clean glass jar or container and store in the fridge for up to a week. If you need it at a runny consistency just heat in the micrwave until you have the desired consistency.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_96_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_96_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_96_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_96_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Plastic Champagne Glasses1 bag of M&Ms Valentines Colors1 bag of Chocolate HeartsHeart Shaped SuckersScissorsAdditional Items You may want to use:Some ribbon, stickers or any other item you may want to decorate your cupsGift Tags (Optional)Cling Wrap, clear or colored. Heat Sealer\nHeat Gun or Hand Held Hairdryer\nShrink Wrap Bags\n*Note: you can purchase shrink wrap bags that work with a hair dryer and not need a Heat Sealer or Gun.\n. Fill cups with M. Set a few Candy Hearts on top.. Cover Cup with desired wrap.\n*Go to Step 8 for shrink wrapping.. 
Poke a hole in Cling wrap and push suckers through. \nAdd a gift tag, ribbons, bows, etc.. Wah Lah!\nYour gift is finished and ready to go for a fraction of what you would pay to buy a similar item already made from any store. About $1.20 each.\n. Prepare your bag to fit over your gift. \nMy Bags were 6 X 6 inches, I folded over one side at the 41/4 inch mark.and creased the bag.. Lay your bag across the sealer and seal on the fold. Move the bag over slightly and seal again next to the first seal so your bag won't break open when shrinking.. Fold the bag in half and cut a notch in the top large enough to allow the suckers to pass through.. Place the bag over your cup and align the seam so it isn't running down the center of your cup.. Insert suckers through the hole. \nUse a Heat Gun or blow dryer to shrink the wrap to fit your cup. Because you are working with Chocolate this is a delicate operation. Make short, swift passes pausing between each one to keep your cup from becoming hot and melting your Chocolate, rotate the cup from the bottom as you work. \nAdd your final decorations.. Here is another variation of your project with curled ribbons.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_97_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_97_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_97_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_97_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 
Ingredients:\nHot Dogs - preferably grilled (for flavor), but since hot dogs are pre-cooked, you could use them straight from the package. I used about 2\u00bdPizza DoughCheddar Cheese (about 1 cup)Corn Meal and/or pan sprayChili (maybe around 1 cup)Onions (not pictured)Any other chili-dog-type toppings you'd like to add!Supplies:Pizza Pan (or you could use a pizza stone)Pizza Cutter\n\t\tCutting Board\n\t\tKnife\n\t\tGrill (optional)\n\t\tGrilling Utensils (optional)\n\t\tSpoon for spreading chili (not pictured). \n\t\tFirst, spray the pan with pan spray and sprinkle on some corn meal.\n\t\tThen stretch out the dough in your hands (I just do it in the air - let it hang until it's wide enough for the pan). I wish I knew how to spin it around ;)\n\t\tPut the dough in the pan and spread it to the edges.Preheat the oven to 450 degrees Fahrenheit. . Grill the hot dogs (if you want to). Hot dogs are fully cooked, so you don't need to grill them if you want to save a step. I grilled them to add some flavor.\nSlice the hot dogs into 1/4\" - 1/2\" thick pieces. I wound up using about 2.5 hot dogs. That was about perfect for 1 piece in each 1-2 bites of pizza. You could do more or less to taste.\nHeat up the chili if you want. Mine was frozen so I had to heat it. If it's fully cooked and not too cold, you could probably skip heating it. If it was in the fridge, I'd probably heat it a little.. \n\t\tMake sure the dough is still stretched out to the edges. Poke holes in the dough with your nails. (this prevents crazy bubbles but still allows for some little ones)\n\t\tSpread the chili out on the pizza. I tried to get some meat in each bite. I wound up using somewhere around 1 cup, maybe a little more.\n\t\tSprinkle on the cheese. I always do about 1 cup when I make a pizza.\n\t\tSprinkle on onions and any other extra toppings you're adding.\n\t\tEvenly space out the hot dog slices on the pizza. 
Mine have about 1 slice worth of space between them (or that's at least what I was going for). This turned out pretty good but you can tweak it to your taste.. The oven should be preheated to 450 now. Put the pizza in.\nBake the pizza for about 15-20 minutes. Normally when I make pizza, it takes about 20 minutes but it was faster this time. I think it was because the hot dogs and chili were warm to begin with. Turn the pizza 1/2 way through cooking to ensure even baking.\nOnce the crust is golden brown and the cheese is nice and melty, take it out of the oven and cut into slices. The hot dogs started to brown/burn on the top, but they were tasty.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_98_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_98_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_98_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_98_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 
You'll need:FOR THE CUPCAKES1 cup all-purpose flour1/2 teaspoon baking soda1/4 teaspoon salt1/2 cup boiling water1/3 cup cocoa powder1/3 cup semisweet chocolate chips1 tablespoon instant espresso3/4 cup sugar1/2 cup sour cream1/2 cup vegetable oil2 large eggs1 teaspoon vanilla extractFOR THE FILLING:3 tablespoons water3/4 teaspoon unflavored gelatin4 tablespoon (1/2 stick) unsalted butter, softenedPinch salt1 teaspoon vanilla extract1 1/4 cups marshmallow cr\u00e8me (don't substitute marshmallow sauce)FOR THE GLAZE:1/2 cup semisweet chocolate chips3 tablespoons unsalted butter. Adjust oven rack to middle position and heat oven to 325 degrees F. Grease and flour 12-cup muffin tin. Combine flour, baking soda, and salt in bowl. Whisk water, cocoa, chocolate chips, and espresso in large bowl until smooth. Add sugar, sour cream, oil, eggs, and vanilla and mix until combined. Whisk in flour mixture until incorporated. Divide batter evenly among muffin cups. Bake until toothpick inserted into cupcake comes out with few dry crumbs attached, 18 to 22 minutes. Cool cupcakes in tin 10 minutes, then turn out onto wire rack and cool completely.. While the cupcakes bake, combine water and gelatin in large bowl and let sit until gelatin softens, about 5 minutes. Microwave until mixture is bubbling around edges and gelatin dissolves, about 30 seconds. Stir in butter, vanilla, and salt until combined. Let mixture cool until just warm to touch, about 5 minutes, then whisk in marshmallow creme until smooth (takes a bit of effort, just keep whisking); refrigerate until set, about 30 minutes. It will seem thin, but will set up quite thick. Transfer 1/3 cup marshmallow mixture to pastry bag fitted with small plain tip (I don't have small pastry bags, so I used a snack size Ziploc); reserve remaining mixture for filling cupcakes.. Microwave chocolate and butter in small bowl, stirring occasionally, until smooth, about 30 seconds. 
Cool glaze to room temperature, about 10 minutes.Cut cones from the cupcakes by inserting the tip of a paring knife at a 45-degree angle about 1/4 inch from the edge of the cupcake. Cut out and remove the cake cone. Since each cupcake cone is a little different, I placed the cones on a piece of wax paper in the same spot their cupcake was on the rack, so I could keep each cone with it's original cupcake. For example, if I cut the cone from the top left cupcake, I placed the cone on the top left corner of wax paper. Cut off all but the top 1/4 inch of the cone, leaving a circular disc of cake. Discard\u2014better yet, eat!\u2014the bottom of the cone.Using a spoon, fill each cupcake with one tablespoon marshmallow mixture and then top with the reserved, trimmed, cake cone. If the tops of your cupcakes aren't uniform, or perfectly smooth, don't fret, the glaze does a great job of coating the tops. . Frost each cupcake with 2 teaspoons cooled glaze, spreading it over the top as best as you can, and let sit 10 minutes. Using pastry bag with reserved filling, pipe curlicues across glazed cupcakes. (As you can tell, I tried to get a bit fancy with the writing, I did my best to pipe my mom's initials because I made these for her birthday.) Serve. 
Cupcakes can be stored in airtight container at room temperature for 2 days.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_99_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_99_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_99_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_99_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Pitted datesUnsweetened coconutVitamix or other strong blender/food processorWax paper. Add 1-2 cups of shredded coconut to your blender to make it finer. The smaller the pieces the better it will stick. Remove from blender and set aside. . Add all of your dates to your blender and slowly increase the speed, you want a pasty result. 10-15 seconds should be enough. Scoop the date puree and move to a small bowl. . Use a spoon or your hands to make small balls. Roll the balls in the coconut and then set aside on wax paper. . Freeze your date balls for a few hours to harden, then they can either be left in the freezer or moved to the fridge. Should stay good in the fridge for a couple of weeks, longer in the freezer. 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_100_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_100_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_100_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_100_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. The materials you will need will depend on how many you want to make. Each one requires:\n3\" x 7\" piece of cardboard (preferably decorated)\n2.5\" x 2.5\" card for the front (you can make this or buy one, I'll describe the former)\n12\" ribbon\nDouble-sided tape or glue\nHolepunch (anything that will make a hole in cardboard will do)\nYou can make three of the cardboard pieces from one standard (8.5\" x 11\") piece of cardstock.\n. Once you have you 3\" x 7\" peice(s) of cardboard you should fold it in half and cut a half-circle out of the non-folded end. Just below the half-circle cut punch a hole with your holepunch. I used a square holepunch for that extra touch of fancy.\nPlace the double-sided tape along the edges (see picture below). Press firmly.\n. Cut two pieces of paper about 2.5\" x 2.5\". Make the outer paper (the nicer one) a bit smaller than the backing. Place two hole punches in the top corner. Then insert some ribbon around these two holes and tie a bow. The space inbetween the cards is where you write your note (sonnet, love poem, gum related joke, etc.).\nUse the double-sided tape or glue to stick this to one side of the larger cardboard piece we made in step 2.\n. 
Cut a piece of ribbon to about 12\". Thread the ribbon through your punchhole. Make sure that the ribbon is even on both sides of the container. Now take your gum and use it to push down the ribbon all the way to the bottom. Tie the ribbon in a knot(or bow) about an inch from the top of the container. Cut off any excess ribbon above the knot.\nWhen you pull on the ribbon the gum will move up the cardboard container and out of the package. To finish this make sure there are no ugly lines or cuts, exposed tape or poorly shortened ribbon.\nThese take almost no time to make and add that special 'hand-made' touch to your Valentine's Day. Thanks for reading.\n\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_101_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_101_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_101_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_101_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. A couple of months back I was given 50lbs of ground beef, yes FIFTY pounds! Its definately not premium meat something along the lines of 60/40 but it was free so... I broke it up into approx 2lb packs and froze it, thats where I started and while I was digging out the ground meat I came upon some hot italian sausage also in the freezer, popped them in a bowl of water to thaw, a couple of almost past use green peppers and a red onion, a couple eggs, some oatmeal and bread, various spices, sauces and milk. 
\u00a0There's a cooking term fro what I'm making in this step but I can't recall what it is, it lookis like a glass full of...welll... yea.. but I'll call it yuck, in a glass, bowl or other container put about 1 and a half cups of oatmeal, rip up a piece of bread and add about a half cup of milk and 2 eggs, mix it up and allow to soak while you do the next steps.. sorry about the 3rd pic, I guess my camera thought it looked like yuck as well. dice up your peppers and onions, add in some garlic , I had some left over spinach, you can add carrots, celery or any veggies you like. I often use a package of spicey breakfast sausage , this time I have hot italian sausage links, slice the links and peel the meat from its casing, get it in the bowl with the veggies, then add your ground meat, your oatmeal and eggs mix, any seasonings your going to use and mix completely. Ok remember when I said this wasn't the best ground beef? Well we need a way to get the grease away while cooking AND not make a mess of my oven, luckily I have a perforated baking tray (YES its clean! or as clean as it gets without a wirewheel) I covered the tray with tinfoil and using my repurposed awl/icepick (icepicks run around $5, the awl was .75) poked some holes .\nAlso remember this is about a meatloaf for SANDWICHS not your average small loaf for dinner that takes 2 slices to fill a piece of bread. Start forming your loaf and if you like create a well in the top to add some cheese, then cover with more meat, top it all off with bbq sauce. Your probably thinking.. \"He's crazy how can poking holes in the bottom to let grease out keep the oven clean\"\u00a0 ah grasshopper I may indeed be crazy but Im not stupid !\u00a0 Place a large baking dish under your meatloaf pan and fill it half way with water, okay one not so good side effect of the steam was the bottom of the loaf never browned but thats a small price to pay. 
Place in a preheated\u00a0350f oven for 90 minutes or so, I know thats longer than a usual meatloaf but this thing is HUGE\u00a0 and because of the steam bath its not going to dy out.\u00a0 The eagle eyed may see my cast iron frying pans that I left in the oven, it doesn't hurt the pans and helps provide regular heat with a flucuating electric stove. Once the meatloaf is done to your liking, take it out and allow to sit for 10-15 minutes before slicing or unless you cant wait like me, let it cool then place in the refrigerator for later, even after 10minutes the cheese in the middle was still molten so be careful ! MAke your sandwich and enjoy or of course you could add a baked potato and make it a meal\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_102_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_102_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_102_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_102_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. I winged the hot sauces and didn't really know what I was going to make until it was underway. You know how the creative process goes.\u00a0I put on a respirator because the air gets pretty thick when all those hot peppers are cooking. 
I made three hot sauces, bourbon ginger, Ardbeg thai chili and scotch jalape\u00f1o.\u00a0Bourbon Ginger Hot Sauce 6 serrano peppers 3\" of fresh ginger 2 oz bourbon 2-3 Tbs of pomegranate seeds 2 cloves garlic 2 tsp salt 1/2 lime 1/2 cup water 1 Tbs vinegar 1 Tbs olive oil Dash of pepper Split the serranos in half and put them in a cast iron skillet on high until they start to blister and turn black. Toss in the garlic for another couple of minutes. Lastly add the ginger and pomegranate and 1 oz of the bourbon. Toss it all around until the liquid has evaporated. Put all of the ingredients, including what was in the skillet, into the food processor. I left the skin on the lime and just tossed it in. Let it go for 3-5 minutes. Strain the mixture, bottle it and keep it in the fridge.Scotch Thai Chili Hot Sauce 1 cup of thai chilis 2 oz of Ardbeg scotch 1 tsp brown sugar 1 Clove garlic 1 Tbs salt 1 Tbs lime juice 1 Tbs olive oil Dash of pepper Do the same as above, but heating only the chilis and garlic.\u00a0Scotch Jalepeno Hot Sauce 4 Jalapenos 2 oz of Ardbeg scotch 1 Clove garlic 1 Tbs salt 1/2 lime whole 1/2 c. fresh cilantro 1/4 of a small onion 1 Tbs olive oil 1 tsp Cayenne 1 tsp Chipotle 1 tsp Pasilla\u00a0 Do the same as above, but heating only the Jalapenos, garlic and onion.. First off, you're going to make an 8.5x11 piece of paper from a brown paper bag. Just cut the two largest panels, iron them, and then use a piece of paper as a template. It needs to be fairly exact or your printer do its 'I hate you' noises.\u00a0. I pulled images of the ingredients off the interweb and arranged them into a nice grid. I turned the image black and white, pumped the contrast and lowered the brightness to get the mid-tones to show up on the brown paper.. For this bottle size (it's an old Soy Sauce bottle) one sheet of paper was perfect for two labels. Do a test run with a white sheet of paper first. 
When you print, you may need to tell your printer that you're using a heavier weight paper. Once it's printed, cut it to size, put some glue on the edges and voila. Lovely brown paper textured label.\u00a0. Another alternative is to create a bottle sized bag from the 8.5x11 print out. Since I had to fold and handle this page more, the ink started to crack and come off a bit. I used a laser printer, so you might have better luck with inkjet. I had a box of crackers about the right size and folded the paper around one side like a present. Then I creased the two sides inward. The whole project turned out pretty well and definitely got the \"I can't believe you made this!\" response.\u00a0\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_103_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_103_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_103_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_103_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Coffee:\nabout 2 cups of strong coffee\n1/4 tsp allspice\n1/2 tsp cinnamon\u00a0\nabout 1/8 of ginger\nPinch of nutmeg\n1 cinnamon stick or an additional 1/2 tsp of cinnamon\n1/2 c sugar to taste\n1/2 c milkSupplies:\nIce Tray (if making chilled)\nSomething to mix spices in\nA big pot. Make a strong cup of coffee, too watery and it'll be too sweet at the end. You want a good dark pot, it will get weaker by the end.. 
1/4 tsp allspice\n1/2 tsp cinnamon\nabout 1/8 of ginger\nPinch of nutmeg\nand if not using the cinnamon stick add 1/2 tsp of cinnamon\nDo not mix the sugar yet.. Combine Coffee, cinnamon stick and spices in big pot over med-high heat. Add sugar slowly to taste.\nRemember, chai is supposed to be sweet!\nNow slowly add the milk and stir slowly. As soon as it starts to boil remove it from the heat.\nDrink up!\nIf you want to try it iced, I suggest adding a scoop or so of vanilla ice cream and blending up some frozen cubes of this blend. The cubes don't even need to freeze all the way through, I let mine freeze for about an hour and a half and the centers were a little soft but it turned out fine.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_104_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_104_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_104_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_104_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Oil Spray\nPotatoes (used Russet this time)\nAny type of seasoning you'd like, curry powder and salt/pepper was used this time around.. Preheat the oven to 400 F.\nWash your potato. If you feel like you dislike skin or find it too thick, use a peeler to get rid of the potato skin. If not, then just start slicing. I found that I enjoy not burnt chips, so I've been safeguarding myself from it by slicing them rather thickly. 
Harder to burn them that way.\nTip from multisync: Wash the slices in water in order to remove a bit of the starch and make the chip crispier.. Get some paper towels, line the potato slices up and pat dry of extra moisture.. Spray your baking pan or cookie sheet with the oil spray and start lining up the chips in. If you'd like, you can spray again on top of the potatoes for crispier chips.. Place the chips in the oven and wait for twenty minutes or so, depending on the thickness of your slices. The chips should be a golden brown by the time they are done. When they are baked to your approval, take them out of the oven and start seasoning them immediately, as the oil on the chips will still absorb the seasoning.\n**NOTE: You can also flavor your chips before baking, however I found that the smaller chips are most likely to burn and also burn the spices on it, making for an untasty chip. Flavoring afterwards is merely a precaution.**\nAnd you're done! Enjoy! Thanks for reading/looking. :)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_105_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_105_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_105_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_105_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 
Ingredients:\n1 cup of butter (2 sticks)\n2 eggs\n1 tsp vanilla\n1 1/2 cups of sugar\n3 cups of flour\njelly or jam (flavor of choice)non-food materials: cookie press - use star attachmentMix butter, eggs, sugar, and vanilla together. Once smooth add in flour 1/2 a cup at a time.When all the ingredients are mixed let the dough sit out for 15 minutes. In my experience this makes the dough firmer and the cookies chewier.. You can buy a cookie press at most kitchen supply stores and online. Many of the newer models operate and look like a caulk gun - this is what I use currently. My mom has one of the really old school cookie presses.To fill the cookie press take a handful of dough and roll it in the palm of your hands so it looks like a snake. This will help it easily slide into the cookie press tube. Once the dough is in (and not hanging out the end) put the star attachment on the top and secure it with the front ring.. Put cookies on a non-stick pan (don't grease).\u00a0 Position the cookie press as seen in the image and press down handle of cookie press and dough will come out.\u00a0 You want to fill the gap between the cookie sheet and the cookie press with dough.\u00a0 Don't press too hard, you don't want dough squeezing out the sides.\u00a0 Once you have pressed out enough dough lift the cookie press up and you will see your cookie!\nYou will get the hang of this over time.\u00a0 I get very particular about the size of my cookies.. To make room for the jam use your knuckle to press a small indentation in the center of each cookie.\nPut a little jam on the tip of a knife and use the knife to put the jam in indents of the cookies.. 
Bake at 350 degrees for 8-10 minutes.\u00a0 You will know they're done when the bottoms of the cookies are slightly browning.\nLet cool for 15 minutes and enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_106_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_106_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_106_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_106_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. You will need: Chocolate chess pieces mold (available online or in specialty cake supply shops) 12 oz White candy melts (they come in 12 oz bag) 12 oz Black candy melts (they come in 10 oz bags) Plastic piping bags or plastic squeeze bottles Small paring knife (not pictured) Pair of white cotton gloves--you'll thank me later on this one! You will NOT need: Anything to grease the mold--the chocolate will pop easily out of a clean, *dry* mold. (Water can make chocolate seize, so be sure to have everything dry.). Put a large handful of candy melts into the plastic piping bag or the plastic sqeeze bottle. \u00a0(I like to place mine on a kitchen towel to keep the plastic bag from melting on the hot glass turntable after a few rounds of microwaving). \u00a0Only microwave for 30 seconds at a time and remove after each 30 second session and squeeze bag/container with your hand until thoroughly mixed. \u00a0You don't want to overheat the chocolate and have it seize in the containers. 
\u00a0The chocolate will scorch if heated too hot and turn into a hard lump of chocolate that is unusable. \u00a0Squeezing the bag after each interval will mix up the hot spots with the cool spots and keep it more evenly melted. \u00a0Keep microwaving until chocolate is completely fluid, maybe even cutting down the time in the microwave to 15 seconds for the last lump of chocolate bits and then massaging the bag really well.. Fill the chess mold with your chocolate. \u00a0Do not fill past the edges of the reservoirs or you will give yourself more work later cleaning up the seams when joining the two halves together. \u00a0Air bubbles will have formed inside the chocolate piece so you will need to lightly tap the mold on the table several times until you see the bubbles rise to the surface. \u00a0You can then pop the bubbles with a toothpick. \u00a0Tapping the mold on the table also helps to level the liquid chocolate in the pieces, so it is a good thing to do it after filling each reservoir before the chocolate hardens at room temperature. \u00a0After all the reservoirs are filled, you can place the mold in the refrigerator for a few minutes to harden up the pieces. \u00a0Since the melted chocolate isn't too hot to begin with, it will harden rather quickly--maybe even hardening in the piping bag before you are done filling the mold. \u00a0When that happens, pop the bag or bottle back into the microwave to remelt for 30 seconds again. \u00a0. After the pieces are hard, you can remove them from the refrigerator and unmold them. \u00a0I would recommend using your hand as a brace until you can gently place the mold upside down on the table, or the fragile pieces will fall to their untimely doom. \u00a0For the more stubborn pieces, gentle pressure from your fingers should pop them from their plastic form. \u00a0Cotton gloves are recommend at this point since the warm body temperature of your hands will easily melt fingerprints into the sides of the pieces. 
There are two ways to attach the halves together to make a whole chess piece. 1) Use melted chocolate as a \"glue\" to attach the two halves together. \u00a0Hold halves together until pieces are stuck together. or\u00a0 2) Take a cold harden half and place it on top of the still liquid match in the mold before you place it in the refrigerator to cool. I've done both ways and prefer the sandwiching the two hardened halves together with melted chocolate. \u00a0I tend to have less to clean up on the seams later than when I have to perfectly float a piece of hard chocolate on the liquid chocolate and hope it doesn't slide off before I get it to the refrigerator. \u00a0But you may find that it works better for you. \u00a0. Using a sharp paring or exacto knife, cut the hardened chocolate seams flush with the piece. \u00a0Sometimes I will even \"buff\" out a seam with my finger if it is not too big or to fill in a small gap between the edges. \u00a0. You'll need to mold the pieces the following number of times for a complete chess set. Rooks: \u00a04 white halves for two complete pieces and 4 black halves for two complete pieces Knights: \u00a04 white halves for two complete pieces and 4 black halves for two complete pieces Bishops: \u00a04 white halves for two complete pieces and 4 black halves for two complete pieces Queen: \u00a02 white halves for one complete piece and 2 black halves for one complete piece King: \u00a02 white halves for one complete pieces and 2 black halves for one complete piece Pawns: \u00a016 white halves for eight complete pieces and 16 black halves for eight complete pieces But of course it is always wise to mold a few extra halves to account for breakage that might occur when unmolding the pieces or squeezing the two halves together too tight. 
\u00a0You can always throw the broken pieces back into the bag/bottle to remelt the chocolate.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_107_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_107_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_107_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_107_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Peel onion .. Find plastic lid that the onion will fit into .. (This onion is already sliced as i started the photo shoot after i was midway through the dicing process ). . Slice a checker board pattern into the onion. ( Cut into the onion starting at the right edge . Press the blade into the onion until the blade is impeded by the plastic lid .. Remove the blade from the onion and repeat the slicing process in parallel cuts across the rest of the onion to the left. Rotate the onion 90 degrees and repeat the cutting process perpendicular to the first cuts. ) ( I used a radial cut method in the photos. I describe the checker board cut pattern because I like it better.) . Turn the onion onto its side... Slice across the onion and the previous cuts creating nicely diced onions. 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_108_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_108_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_108_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_108_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Always wash your hands before making food.. \u2665bread \u2665jelly \u2665butter \u2665paper \u2665knife \u2665toothpick \u2665pen \u2665tape \u2665plate\u2665scissors. lay out two peices of bread, side by side, on the plate.. open the butter. using the knife spread the butter on the peice of bread to your right. Put the lid back on the butter.. open the jelly. using the knife spread the jelly on the peice of bread to the right. Close the lid on the jelly.. lay the peice of bread to the left on top of the peice of bread to the right.. cut the bread into four/three lines using the knife.. fold the paper in half.. on the fold draw flags and then cut the flags out.. slide the toothpicks inside the fold in the flag. tape th flag around the bottom to hold the toothpicks to the flags.. roll up the bread and stick your toothpick flag and stick it through the center of the roll.. . the rolls do not always roll perfectly like the one in the pictures. (Remove toothpick before eating.) Enjoy! 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_109_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_109_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_109_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_109_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Wash, peel, and coarsely chop the carrots.\u00a0 I often weigh the carrots to help me better estimate how much salt and garlic to add.\u00a0 In the images, I'm using around half a kilogram.. \n Use enough oil to cover the carrots by half-an-inch or so, and heat up both the oil and carrots.\u00a0 There's enough water in the carrots that under medium heat, the mixture will stay near 212\u00b0F.\u00a0 Unlike some frying recipes, like Pimientos de Padr\u00f3n where not much oil is absorbed, the carrots will soak up some oil.\u00a0 So, I\u00a0like to use exclusively high-quality olive oil.\u00a0 Steve McCulley of Apollo Olive Oil -- my favorite producer -- suggests that olive oil's polyphenols survive up to 320\u00b0F.\u00a0 Here's an email from Steve:The polyphenols are still intact at 210 F, In fact polyphenols protect the oil when the heat rises up to around 320 where they may begin to break down. I cook with extra virign olive oil but others think it is waste. I cook with it because I notice a difference in taste and I know polyphenols are still retained. Others do not notice a big difference in taste so choose to cook with a lesser grade oil. 
Some people cook with a high quality wine and others do not. Like wine you can taste the oil much better when uncooked and drizzled fresh over cooked food. I, however, still cook with real extra virgin olive oils because other oils are not only not really extra virign they are frequently defective. If budget is a consideration I would consider cooking in a lesser oil and dressing with ApolloOlive Oil. When the carrots are cooked, you can filter and reuse the oil.\u00a0 I\u00a0keep my oil in the refrigerator: in the images, it's still cold, which is why it's an opaque\u00a0yellow.. Cook the carrots under oil until they've lost most of their water.\u00a0 The objective is to concentrate the carrots' flavor without exposing them to oxygen or temperatures much above boiling.\u00a0 You can tell when this happens because the carrots' volume will be significantly reduced, and the temperature of the oil will start creeping upwards -- I\u00a0usually call it done at around 230\u00b0F.\u00a0 Depending on your heat-level and amount of carrots, this will take 30-60 minutes.. Drain the carrots.\u00a0 I\u00a0use a kitchen sieve, and just let the oil drip back into the pot.I\u00a0then use the sieve to strain the oil for later reuse.. Add salt and garlic to taste.\u00a0 For 500 g of peeled, uncooked carrots, I\u00a0use 3-5 g of sea salt and 3 cloves of raw, micro-grated garlic.\u00a0 Note that if you want to reuse the oil, you don't want to add the salt and garlic to the oil.. 
Serve while still warm.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_110_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_110_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_110_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_110_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 1 Single Pie Crust Rolled Flat (I made mine using this recipe, A Healthier Flaky Double Pie Crust w/ Coconut Oil) 4 Ripe Peaches - Pared and Sliced1/4 Cup Granulated Sugar1/4 Cup Brown Sugar 1/2 Tablespoon Lemon Juice1 Table spoon Cornstarch 1/16 Teaspoon Salt (literally a pinch)* For the pie crust, because I only needed a single crust, so I divided the ingredients by 50% and it worked like a charm. . I highly recommend making your own crust, for this recipe. It's super easy and 100% worth it in the end, plus there are tons of great recipes out there. I use A Healthier Flaky Double Pie Crust w/ Coconut Oil for all of my pies and tarts. It takes 5 minutes to pull together and only an hour to chill. The final results are melt in your mouth flaky.While the pie crust is chilling, combine the sliced peaches, granulated sugar, and brown sugar in a medium sized bowl. Cover with plastic wrap and set it in the refrigerator for 1 hour.* I call this prep work, because both of these items will need to be done, well in advance and will need to sit for at least an hour. ** Every 20 minutes or so, I give the peaches a toss in the sugar mixture as they macerate.. 
Once the pie crust has chilled, remove it and let the dough disk rest on a floured surface for about 10 minutes.Gently, roll out the dough evenly in all directions to form a 14\"x14\" sheet. Now cut 6 - 6\" rounds out of the dough sheet (I used the lid of a pot, which worked great, however any round object will do as a template). You may need to reshape the dough disk and roll it out a second time, to get all 6 rounds.Next, place each of the dough rounds in to the muffin pan, carefully working each round into the shape of the cup (if any small holes develop, you can easily patch them with a small piece of the the scrap dough).Finally, to top the tarts, using a small cookie cutter, stamp out 6 shapes (be creative, there are a million cool cookie cutters out there...I used a star shape) and transfer them to a foil lined baking sheet.Cover both the baking sheet and the muffin pan with plastic wrap and put them back into the refrigerator to chill.. Remove the macerated peach slices from the refrigerator and drain well, reserving the liquid in a medium sized pot and returning the peach slices to the bowl. Next, add lemon juice, cornstarch, and salt to the pot with the reserved peach juice. Bring to a rolling boil over medium heat, stirring constantly until the mixture begins to thicken (5-6 minutes).Once thickened to your desired consistency (I stir for about 10-12 minutes) , pour it back into the peach slices and stir until combined.. Preheat the oven to 425 degrees and move the oven rack to the lowest position.Next, spoon the peach tart filling into the prepared crusts and top with your decoration of choice.Finally, bake at 425 degrees until the edge of the crusts are a light golden brown. Reduce the temperature to 375 degrees and continue baking until the edge crusts are golden brown. . Remove the tarts from the oven and allow to cool in the pan for 4-5 minutes, until set. Then remove the tarts from the muffin pan and cool on a wire rack for 1 hour. 
Now it's time to sit back, relax, and enjoy. I promise these tarts will not disappoint. Plus, they are the perfect size to hold 1 scoop of vanilla ice cream on top, for a prefect peaches and cream experience. Warm or cold, they are delectable...In fact, I wish I had one right now (seriously). I really hope you've enjoyed this Instructable! Happy Baking!!!Just a note: Please, when ever possible, support your local growers. Not everyone is lucky enough to have access to locally grown produce, if you do, it's important to help keep it alive. Thanks! \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_111_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_111_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_111_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_111_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 5 large Jalape\u00f1os 1 Pack of Bacon 1 container of cream cheeseBBQ RubBBQ Sauce (optional). First you will need to prepare your ABT's using all the ingredients above. Start by slicing the Jalapeno in half and with a spoon cleaning out the seeds and membrane. Next you will fill the half Jalapeno with cream cheese. Make sure to fill it full. Next you are going to want to apply some of your bbq rub onto the cream cheese. Lastly you will need to wrap your stuffed jalapenos with one full slice of bacon. TIP: Make sure to wrap it firm, this will help it cook together and you won't have to use toothpicks.. 
One your Atomic Buffalo Turds have been put together you will them place them onto your grill using indirect cooking with a tempreture of around 300-325 degrees. Place a small chunk of hardwood in for smoking (optional) and then close the lid and begin cooking for 1 hour 15 minutes.. After 1hr 15 mins, your bacon wrapped jalape\u00f1os should be done. If you like your bacon more cooked feel free to leave them on for a few more minutes or until your preferred doneness. Let them cool for a few minutes to allow the cream cheese to cool a bit. Serve it up with your favourite BBQ Sauce, Ranch Dressing or blue cheese sauce and enjoy.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_112_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_112_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_112_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_112_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. For this recipe you will need:Small package vanilla instant pudding1 cup eggnog (with or without alchohol)8 ounces cream cheese8 ounces Cool WhipGraham cracker crust. Mix the instant pudding with 1 cup eggnog. Mix on medium speed for 2 minutes.. Add softened cream cheese and Cool Whip. Mix well on medium speed for 3 minutes.. Spoon filling in graham cracker pie crust. Cover and place in refrigerator for at least 1 hour.. 
Cut into 8 slices, garnish with chocolate candy (Optional) and serve.Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_113_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_113_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_113_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_113_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. I know it\u2019s hard, but do your best to refrain from eating all of the cookies rightthisverysecond. See? The lite version. Looks exactly like the regular, but gives me an excuse to eat more piebecause it\u2019s lite! You may or may not need to taste the whipped topping. It depends on whether you want to save your family from potentially poisonous foods. I do, of course, so I always taste the things I bake for them. Several times. It\u2019s about their safety!. Mixing is hard work. Try to contract out to your kids if possible. Convince them that if they taste this before it\u2019s done they could die.. Beware of razor-sharp pie crust tin edges.Scrape all the pie filling into the pie dish. Taste again, because the crust could also be contaminated. You will have a big tower of deliciousness, as shown above.Who\u2019s boss by forcing it to look like a pie. Take a scoop of the filling on your finger and eat it to make an example of what you\u2019ll do if it doesn\u2019t obey.. Freeze for 4-5 hours or until relatively solid before serving. 
Keep frozen if you have leftovers (HA!)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_114_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_114_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_114_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_114_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Warm the milk in a small saucepan until it\u2019s about 110 degrees. While the milk is heating up go the to next step.. Chop up 2 tablespoons butter and let it soften in a warm place. We have found that on top of the refrigerator is handy for this.. Pour the now warm milk into a medium bowl and then sprinkle in the yeast. If you do not have yeast in an individual packet you can simply use a tablespoon of yeast. Let the yeast start propagating and growing for about 3 minutes. Once the yeast has had a chance to get going mix in the brown sugar and 1 cup of flour.Next add the softened butter that you prepped and then slowly mix in the remaining 1 1/4 cups flour and the fine salt to make a sticky dough.. Prep a clean place to knead the dough and lightly dust the surface with flour. Knead while adding more flour if needed until it is smooth but still slightly tacky. This should take about 5 minutes. Next, shape the dough into a ball and place it in a lightly greased bowl and cover it with plastic wrap. Finally let it rise in a warm spot (AGAIN, THE TOP OF THE FRIDGE IS USUALLY A GOOD SPOT). Give it a about an hour to rise; it should doubled in size.. 
Once the dough has risen you will want to \"punch\" it. This is exactly what it sounds like. You just punch it to deflate it some. Next preheat your oven to 450 degrees fahrenheit and then grease the cookie sheets where you will later place your finished pretzel creations. Get ready to get your hands on some dough!. Pretzel dough is a pretty easy medium to work with. If you have ever made anything with play-dough you should be well-equipped to let your imagination run wild. Our kids had a lot of fun getting creative with their creations. You can stick to more traditional shapes or go completely crazy and make everything from a butterfly to a baby like my kids did.. This is the secret ingredient that makes these pretzels taste so yummy and chewy. Don't skip it!You are going to want to dissolve the baking soda in 3 cups of warm water in a shallow baking dish. Gently dip each pretzel in the soda solution, then arrange them on the prepared cookie sheet.Finally, sprinkle them with the coarse salt before putting them in the oven. (WE USED FINE SALT AND IT WAS JUST FINE BUT WE WANT TO TRY IT WITH COURSE SALT NEXT TIME.). Place the pretzels in the preheated oven and bake until golden for 10 to 12 minutes. Then melt the remaining 8 tablespoons of butter in a shallow bowl or dish. Dip the hot pretzels in the butter or use a basting brush. Be sure to coat the entire pretzel before placing them on a wire cooling rack to let excess butter drip off.. Serve warm and enjoy! These were truly delicious and turned out really well. I hope you enjoy them as much as we did! Again, if you want to see these steps in action or just get some inspiration from our journey take a quick look at the video above.If you want to see the original foodnetwork.com recipe you can find it here. ____________________________________Thanks for joining us in this adventure! 
If you enjoy spending time with your kids and would appreciate fresh inspiration and instructions weekly, consider subscribing to my youtube channel where I do a weekly project with my own kids. I would love to have you join our family in the making! Blessings, Caleb\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_115_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_115_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_115_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_115_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Ingredients:\n1 bag mozzarella cheese.\n1 pack bologna.\n1 large can pasta sauce\n1 package of crackers (unsalted or lightly salted work best)\nOther ingredients to taste.\nTools\n1 microwave\n1 deep dish 8x8 that can be used in a microwave.\n1 spoon to spread sauce.\n1 microwave cover (optional). Spoon some pasta sauce in the bottom of the dish. Spread evenly. this will prevent any sticking as we are told.. Now cover the sauce with crackers. I like to put a lot on the bottom layer to make sure the lasagna will come out easily and absorb all the juices of the meats cooking. . Now it is time to lay down the Bologna (pronounced baloney in America) one level thick.. Add the cheese generously.Make sure it is spread evenly though.. Repeat the last four steps until the dish is full or you run out of ingredients. Ours was two layers thick. Be sure to remove the spoon before cooking!!!!. All microwaves are different. 
I would recommend cooking in more than one step so that it is not overdone or you have a mess in the microwave.Using a loose cover is a good idea.\u00a0 I think we did it in two steps for a total of three minutes. Your mileage way very.. Once the cooking is done, you might wait a minute till it cools down a bit before removing from the microwave. A touch of cilatro or italian parsley\u00a0 would give it more appeal.\u00a0 Then go ahead and eat!!!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_116_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_116_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_116_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_116_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Find your favorite vegan cupcake recipe, vegan brownie recipe, and vegan buttercream frosting recipe and get all the ingredients that you need. In addition to these ingredients, you also need food coloring (I used red for ketchup, yellow for cheese, and green for lettuce) and sesame seeds.\nI used this cupcake recipe ,\u00a0this brownie recipe , and this buttercream frosting recipe . 
The cupcake recipe asks for coconut oil, but if you don't have any (or your grocery store doesn't have any) you can substitute it with vegetable or olive oil.\nIn a nutshell, you need all of this:\n- apple cider vinegar\n- almond or soy milk\u00a0\n- all-purpose flour\n- white sugar\n- baking powder\n- baking soda\n- salt\n- coconut oil (or vegetable or olive oil)\n- vanilla extract\n- unsweetened cocoa powder\n- water\n- nonhydrogenated shortening\n- nonhydrogenated margarine (I used Earth Balance)\n- powdered sugar\n- food coloring (red, yellow, green)\n- sesame seeds\nAlso, if your margarine and/or shortening was\u00a0refrigerated, I would take it out of the fridge so that it can soften by the time you get to making the frosting.. Mix the batter for your cupcakes. Again, I used this recipe as a basis, but I've copied/pasted it below and tweaked it a bit.INGREDIENTS\n1 tablespoon apple cider vinegar\n1 1/2 cups almond milk\n2 cups all-purpose flour\n1 cup white sugar\n2 teaspoons baking powder\n1/2 teaspoon baking soda\n1/2 teaspoon salt\n1/2 cup coconut oil, warmed until liquid (or vegetable or olive oil)\n1 1/4 teaspoons vanilla extractDIRECTIONS\n1. Preheat oven to 350 degrees F (175 degrees C). Instead of lining your cupcake pans, grease them with a cooking spray.\n2. Measure the apple cider vinegar into a 2 cup measuring cup. Fill with almond milk to make 1 1/2 cups. Let stand until curdled, about 5 minutes. In a large bowl, Whisk together the flour, sugar, baking powder, baking soda and salt. In a separate bowl, whisk together the almond milk mixture, coconut oil and vanilla. Pour the wet ingredients into the dry ingredients and stir just until blended. Spoon the batter into the prepared cups, dividing evenly.\n3.\u00a0Make one batch fill up to half the depth of the tray (these are the top buns) and another batch that fill up to a quarter of the tray (these are the bottom buns).\n4. 
Bake until you can poke them with a toothpick and it comes back out clean (about 10 mins).\n5. Take them out of the oven and allow them to cool in the pan. Then carefully remove them. I used a fork to scoop them out, but be careful not to scratch the pan! If you try removing the cupcakes before they're cooled enough, they'll break apart.\nVegan cupcakes are different from regular cupcakes that use eggs and butter. For one, they don't rise as much as regular cupcakes. Secondly, they don't cut very well. Other cheeseburger cupcake recipes call for you to bake normal cupcakes and then cut through them. This doesn't work with vegan cupcakes. They tend to fall apart and crumble when you try to slice through them. Lastly, they tend to be a little stickier than regular cupcakes. If you don't grease the pan well enough, the cupcakes will burn.. Mix the batter for the brownie. You only need half the amount of the brownie mix, otherwise you'll end up with a lot of extra brownie mix.\nThis is the recipe I used with appropriate measurements (original recipe here ):INGREDIENTS\n1 cup unbleached all-purpose flour\n1 cup white sugar\n3/8 cup unsweetened cocoa powder\n1/2 teaspoon baking powder\n1/2 teaspoon salt\n1/2 cup water\n1/2 cup vegetable oil\n1/2 teaspoon vanilla extractDIRECTIONS\n1. Make sure your oven is still at 350 degrees F (175 degrees C).\n2. In a large bowl, stir together the flour, sugar, cocoa powder, baking powder and salt. Pour in water, vegetable oil and vanilla; mix until well blended.\n3.\u00a0Again, using the cupcake pans, only fill the pans half-way (or less if you want thinner paddies).\n4. Bake until a toothpick comes clean after poking the center of the brownie (about 15 mins).\n5. Allow to cool. Be careful when removing the brownies. They sometimes can break easily.\nYou can also just make chocolate cake batter if you don't like brownies. However, I liked the combination of cupcake and brownie. . Make your frosting. 
I used this recipe\u00a0as a basis and tweaked it.INGREDIENTS\n1/2 cup nonhydrogenated shortening (softened)\n1/2 cup nonhydrogenated margarine (softened) (a.k.a. Earth Balance)\n3 1/2 - 5 cups powdered sugar, sifted if clumpy\n1 1/2 teaspoons vanilla extract\n1/4 cup plain soy milk or soy creamerDIRECTIONS\n1. Beat the shortening and margarine together until well combined and fluffy. This is important. The consistency of your frosting depends on how fluffy you make your shortening and margarine combination.\n2. Add the vanilla and soy milk.\n3. Add the sugar and beat until fluffy.\u00a0Try to make your frosting thicker. The original recipe only calls for 3 1/2 cups of powdered sugar. To thicken the frosting, add more powdered sugar. I'd recommend using around 5 cups total of powdered sugar.\n4. Separate a decent amount of frosting per color food coloring that you'd like to make. Mix in enough food coloring until you've reached the desired color. I added in 5 drops at a time.\nIf you don't have frosting bags, you can use ziploc bags. Cut off the tip of one corner and secure in your frosting tips. Then, scoop in your frosting, making sure not to have the frosting leak through. Then, close the bags, making sure to get rid of as much air as possible.. Add frosting on the bottom bun. I put the \"lettuce\" on this layer. I put more of the green frosting on the edges so that it would stick out and can be seen once you put the brownie burger paddy on top of it.\nPlace the brownie burger paddy in the center.\nAdd more frosting on top of the burger paddy. I put the \"cheese\" on this layer. I drew a square with the yellow frosting to imitate a slice of cheese.\nTake the cupcake for the top bun and place it upside-down on top of the brownie burger paddy. This keeps the color of the top bun consistent with the bottom bun.. 
You don't really need to do this, but it helps make the slider look more convincing.\nSome recipes use a lemon or an orange mixture to make the sesame seeds stick. This one uses a simple sugar mixture (1 part water, 2 parts sugar). I like this better because it doesn't add any additional taste to the cupcake.\nBrush some of the mixture on the top of the cupcake and sprinkle some sesame seeds on top.. Stuff your face! YOU'RE DONE!!! :)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_117_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_117_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_117_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_117_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. For the Bars:1/2 cup Brown Rice Syrup (you can substitute corn syrup - but i'm not sure the bars could still be considered \"nice\") 1/2 cup Maple Syrup 1/2 cup Dark Chocolate Chips 2 TBS Coconut Oil Pinch of Sea Salt For the Chocolate Topping:3/4 cup Dark Chocolate Chips 2 TBS Coconut OilAdditional Toppings (optional - but tasty)1/4 cup toasted nuts - chopped (almond, macadamia, hazelnut, etc.) Sprinkling of cocoa nibs Pinch of flaky salt. Over medium heat, bring the Brown Rice Syrup and Maple Syrup to a gentle boil. Boil for 1 minute; continuously stirring with a heat proof spatula.. Add the almond butter, 1/2 cup Dark Chocolate Chips, 2 TBS Coconut Oil, and a pinch of sea salt. Stir, stir, stir ... Until chocolate is melted and mixture is smooth.. . 
Hint: Dampen your fingers with water or coconut oil before beginning this step.. In the original saucepan, over medium heat, melt 3/4 cup Dark Chocolate Chips with 2 TBS coconut oil. When the chocolate chips are melted, pour over the puffed cereal. Smooth topping with spatula. Add toppings of choice. Refrigerate for 1 hour - or until topping is firm.. Use parchment to lift mixture out of the pan, cut into 16 2-inch squares.Enjoy!Note... Best eaten within 48 hours (trust - this is not a problem).\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_118_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_118_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_118_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_118_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. you'll need:for the soup- 3 big beef tomatoes (about 400 - 500 g)- a bit more watermelon than tomato (about 600 g)- 2 cloves of garlic- a lime- olive oil- hot sauce or fresh chilli- some black olives- about 200 g of creamy sheep cheese- and of course some fresh chervil on a side note i have to say that chervil really deserves the title \"challenge ingredient\" in the watermelon contest, since it was quite the challenge to even obtain it. 
i think i went to five shops whitout success, when i wanted to bike to the market to see if i could find some there the mother of all rainstorms broke loose and so i finally decided to go to the supermarket to buy frozen chervil as a last resort and there it was: one last package of fresh chervil sitting amidst a bunch of basil...i hope you'll have an easier time finding it, otherwise substitute with frozen chervil or a smaller amount of tarragon which has a similar but much stronger taste.for the crispy bread- one small pita bread- a bit of fresh parsley, some herbs and spices- a bit of olive oilthis will be enough for 2 persons as a main course or 4 as an appetizer. remove the stem from the tomatoes and make a few shallow cuts into the skin.put the tomatoes in boiling water for one minute, then remove the skin. cut the skinned tomatoes into wedges and remove all the seeds. this is a quite messy affair...put the tomatoes in a blender jar or in a bowl if you don't have a blender.now cut the watermelon into chunks and remove the seeds, put the pieces into the blender jar with the tomato pieces.yeah! even more sticky mess all over the kitchen table. this would have been much faster and easier if i had bought seedless melon.... . now put the jar on the blender and blend (or puree with a handheld mixer). skin the garlic and add it to the mix, as well as some olive oil (about 50 ml) and blend some more until everything is smooth. if you use fresh chilli, add the chilli (seeds removed) to the blender as well.strain through a sieve to get rid of all the tomato and melon seeds that sneaked ther way in and any garlic that wasn't properly pureed.season your soup with salt, lime juice (i used half a lime, but it held a lot of juice) and hot sauce if you didn't add fresh chili.chill the soup in the fridge for several hours.. 
chop most of the chervil finely (keep some of the nicest leaves for decoration), put in a bowl and add olive oil until the consitency is like a runny pesto. let it stand for some time in a cool place.cut the olives into small pieces, cut or crumble the cheese into cubes.. preheat the oven to 200 degrees.cut the pita bread into slices about 1 cm thick and put on a baking tray next to each other. sprinkle each slice with some fresh parsley, other herbs an spices (i used paprika and some freshly ground pepper), then carefully trickle a bit of olive oil on the slices.bake the slices in the oven for about 15 minutes until crispy and golden brown.you can make the bread some hours in advance, but if you leave them sitting for too long, the bread will draw moisture and be less crispy.this bread works great with other soups and meals an is a perfect way to prevent leftover pita bread from going stale and unedible. although the crispiness will fade over time, it will stay good even for a few days if you store it in a cool and dry place.. 
put the cold soup into bowls and add some cheese cubes, olive pieces and a bit of chervil oil on top of each bowl.decorate with some fresh chervil and enjoy your fruity refreshing soup with the crispy pita bread.dr_peru and me enjoyed it very much!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_119_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_119_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_119_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_119_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Use a vector drawing tool to make your drawing. This can be anything from Inkscape to Illustrator.. Seperate the egg-white form the egg yolk. Put the egg-white in a small bowl and position it under the lasercutter.. Just play around with the speed and power of the laser to make the perfect white coloured egg (or black if you like). I used speed 50%, power 50% for my chicken on a LaserPro machine.. . 
The egg-white sculptures are perfect for animation.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_120_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_120_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_120_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_120_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Recipe ingredients 2 1/2 quart popped corn 2 cups corn chips, slightly broken 1/4 cup butter 1/4 cup Louisiana style hot sauce 1 teaspoon celery seed 1/4 teaspoon salt, optional Hardware Popcorn popper large bowl Gallon zip lock bag measuring cups, spoons large spoon or mixing thing. As a new trend for cooking ibbles lets have just the recipe on one page so i don't have use up all my ink printing pictures of your kitchen. Plus they fit better in the cookbook. Recipe ingredient 2 1/2 quart popped corn 2 cups corn chips, slightly broken 1/4 cup butter 1/4 cup Louisiana style hot sauce 1 teaspoon celery seed 1/4 teaspoon salt, optional How to make Buffalo style hot popcorn Pop the corn I used half a cup which makes about a gallon of popcorn. Place the chips in a zip lock bag and lightly brake them up. Mix the chips and pop corn save the zip lock bag for storing the popcorn In small saucepan, melt butter with hot sauce, celery seed and salt. Pour over popcorn-corn chips mixture, tossing gently to coat. This next step I skip. I think it makes the popcorn stale but it might be important for some reason. Spread on 15 x 10 inch baking sheet. 
(could also be left in the metal bowl) Bake at 350 degrees F for 10 minutes. Remove from baking sheet to large serving bowl. Serve immediately or store in airtight container.. Pop the corn I used half a cup which makes about a gallon of popcorn. Place the chips in a zip lock bag and lightly brake them up. Mix the chips and pop corn save the zip lock bag for storing the popcorn In small saucepan, melt butter with hot sauce, celery seed and salt. Pour over popcorn-corn chips mixture, tossing gently to coat.. In small saucepan, melt butter with hot sauce, celery seed and salt. Pour over popcorn-corn chips mixture, tossing gently to coat. Note: do not substitute celery salt their is already enough salt in this. That being said i like the salt. In fact my brothers would get me to make popcorn (because i was the only one who would not burn it) and they would hide the salt shaker so i would not over salt it.. In small saucepan, melt butter with hot sauce, celery seed and salt. Pour over popcorn-corn chips mixture, tossing gently to coat. This next step I skip. I think it makes the popcorn stale but it might be important for some reason. Spread on 15 x 10 inch baking sheet. (could also be left in the metal bowl) Bake at 350 degrees F for 10 minutes. Remove from baking sheet to large serving bowl. 
Serve immediately or store in airtight container (like the bag you crushed the chips in).\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_121_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_121_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_121_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_121_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Ingredients:1 1\\2 Cups water 1 Cup sugar 3 Tablespoons gelatin 3/4 Cups icing sugar 3/8 Cornflour Pinch of cream of tartar Teaspoon lemon essenceYou'll also need...Large microwave safe bowl Bowl Tablespoon Teaspoon Measuring cups Cooking spray Glass tray (I used a 21 x 21 cm tray that was about 5 cm deep.) KnifeYou might also need...Beater. Combine water, sugar and gelatin in a large microwave safe bowl. Mix lightly. Next microwave the mixture on high for 3:30 minutes. After microwaving it, stir it well till all the sugar is dissolved, then microwave it again till it starts boiling.. Combine icing sugar, cornflour and cream of tartar in a bowl. Stir it into the sugar syrup till it's thoroughly dissolved.Note: You might need to use a beater to dissolve it completely.. After completing the previous step, microwave your mixture on high for about 3:00 minutes.Note: Your mixture should be thick when you've done microwaving it.. After microwaving your mixture, blend in the lemon essence.Note: You could also swap the lemon essence for rose water and add some pink food coloring. . 
Pour mixture into a lightly oiled glass tray and spread evenly. Refrigerate till firm.. You're almost there... Cut it into cubes and dust it with some more icing sugar.. Enjoy your divine Turkish delight either by eating it as a delicious treat, or by giving it as a marvelous gift!If you wanted to make your own gift box, check out Showcase Creative's awesome Instructable!Thanks for reading through my instructable and if you liked it, please consider voting for it in the upper right corner!Feel also free to comment and ask questions in the comment section below!...feel also free to watch some of out homemade short films.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_122_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_122_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_122_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_122_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Ingredients:Beef, top round or any other cheaper non-fatty type works best 1 cup Soy sauce 1 tbsp Molasses 2 tsp Liquid smoke 2 tsp Black pepper 2 tsp Garlic powder 2 tsp Onion powder 2 tsp Red pepper flakes 1 tsp Ghost pepper salt 1 tsp Cayenne pepperThis recipe makes enough marinade for about 2.5 pounds or a little over 1 kg.Equipment:A food dehydrator like this one from NescoA sharp knife A large glass or ceramic bowl A cutting board Paper towels. Start the process the day before you want your finished jerky. 
Throw your beef in the freezer for a couple hours or, if frozen, remove from the freezer for about an hour (this will all depend on how much you have). Since thin slices of beef are ideal for jerky, having the beef partially frozen makes it easier to cut consistently thin pieces.Once the beef is thawed on the outside but still slightly frozen on the inside, put it on a well-washed cutting board and pat it dry with a paper towel. Trim as much of the fat off as possible then slice the beef into \u215b\" to \u00bc\" (3-6mm) slices. Cutting with the grain with a really sharp (not serrated) knife works best. Here I'm using a top round steak, you may use any cut of meat you like but remember that meat with a high fat content will become rancid faster, which makes this company's filet mignon jerky practical yet decadent!. In this instructable I'm using a marinade (wet method) to flavor the jerky. There are other methods you can chose, such as a dry rub, however I enjoy the flavor the marinade brings to the beef.Wash your hands and bowl well then start by adding all of your ingredients (minus the beef) in your large bowl. Separate the beef slices well, since they tend to re-freeze together when in a pile, and add the beef to the bowl a few slices at a time followed by mixing by hand. Ensure all of your beef is coated well.If you have more meat than marinade, simply prepare another bowl with marinade and repeat the steps above. It's easier to work in smaller batches than a large unmanageable pile that might risk an uneven marination of the beef.Cover and put the bowl in the refrigerator overnight or for at least 12 hours. For best results, mix the contents once or twice during this period.. The next day (anywhere from 12-24 hours later) remove the bowl from the refrigerator and wash and dry your dehydrator racks as the manufacturer recommends. 
If you do not have a dehydrator, wash the metal grates of your oven well and line the bottom of the oven with foil.Remove the strips of beef from the marinade and arrange on the racks in one layer without overlapping, allowing for a little bit of air flow around each piece. When removing the strips of beef from the marinade, allow them to drip-dry, you want some marinade to coat the beef strip but not too much. Assemble your dehydrator and set at 160\u00b0F (~70\u00b0C).Revisit your dehydrator every hour to check the progress and to dab away any fat that is collecting on the top of your strips. With my dehydrator, the process took about 5 hours, this will vary depending upon how thick your strips are and the model of your dehydrator.If you do not have a dehydrator, this can be done in your oven by setting it as close to 160\u00b0F as possible and laying the beefs strips across the oven's metal grates. Prop the door of the oven open slightly with a wooden spoon to allow for the warm, moist air to circulate out. Please be aware that gas ovens pose the risk of carbon monoxide/dioxide poisoning when propped open, so if you go this route make sure you have plenty of ventilation.. Your jerky is ready when you are able to tear the strips along the grain, they should be pliable but not soft and fairly stiff but not brittle. At this point, turn your dehydrator off and store your jerky in a clean and dry container lined with a paper towel and a loose fitting lid. Jerky is shelf stable for about 2 weeks at room temperature and one month in the refrigerator.Congratulations, you have now made some super simple, spicy and delicious jerky at home! I encourage you to try tweaking the recipe to your liking. Substitute in dried peppers, hot sauce, smoked salts, different herbs... the combinations are endless. 
Just remember to keep any added fats to an absolute minimum and if you decide to use anything but beef, cook the meat to the USDA recommended internal temperatures first before dehydrating (including game meats).\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_123_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_123_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_123_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_123_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Get your oven preheated and your friends ready to help! Here is what you need to get started:1. At least 100 cookies. They don't need to be large, make them 2\" diameter or less. I used cookie mix to expedite the process and had 8 packages of cookie mix based on how many cookies the package estimated making.If you are feeling ambitious, feel free to make your own cookies!2. A lot of frosting (I used 2 tubs)3. A ziplock bag, a piping bag, or a frosting pen4. A trifold poster board5. Lots of pens and markers for decorating the board6. A ruler or other straight edge7. Scrabble tilesIf you do not own scrabble and want to play, use print out pictures of the scrabble tiles.. Like I said, you need to make at least 100 cookies. I used different varieties of cookies because as much as I would love to each oatmeal raisin cookies for days, it's nice to have some variety.To make the cookies, follow the instructions on the package or follow your own recipe. 
As the cookies came out of the oven, we let them cool before getting to work.If your cookies are going to be bigger than 2\", you might need to trim them down to size. Our cookies came out rather larger so we created a little template out of a napkin of a 2\"x2\" square (our plan for the board), and cut the edges off the cookies so they fit specification. So much math all for cookies. But the plus side of trimming your cookies is you have a plate of cookie edges for snacking! . Fill your ziplock bag/piping bag/frosting pen with your frosting. If you are wanting to color your frosting, put a few drops of food dye in your frosting and mix it around a bit. I used a frosting pen which was not any easier than a ziplock bag. This process is just a test of patience, but you can do it!A list of how many tiles of each letter can be found here--. Take your trifold and cut off out fold, you don't need it. On a clean side, draw out a grid that is 30\"x30\". Divide this grid up into squares of 2\"x2\".Now using a scrabble board image for reference, mark where the special tiles on the board are. It may look like a lot of things, but it just repeats the same patterns for scores. The board and cookies are done and it's game time!We didn't have the room to write numbers at the bottom of the cookies so this is where the real scrabble tiles come in handy. You can't reach into a bag and pull out frosted cookies too easily, so we grabbed normal tiles to see our letters and when we were ready to play a word, we changed out our tiles for cookies and noted out score for that word on the side of the board.Try your hardest not to eat the cookies along the way, and if you do snack on the \"S\" tile, just put down the physical tile down instead! And when you're done with the game, enjoy the cookies with your friends!I hope you had fun playing and let me know if you had fun with this project! 
Drop a comment if you have any questions or other cookie based games!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_124_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_124_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_124_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_124_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Pork ShoulderDr Pepper BBQ Sauce White Onion Garlic Powder Salt & Pepper Fresh Garlic Crushed Red Pepper. Add the onions to your slow cooker on high heat.Season the meat with salt, pepper, and garlic powder on both sides. Add the 2 cloves of diced fresh garlic to the slow cooker. Add the meat to the slow cooker. Pour in 16oz of Dr Pepper. Cook for 3-4 hours. Drain off 3/4 of the juice. Shred the pork. Add a cup of BBQ Sauce and 1tsp of Crushed Red Pepper. Cook for another 30 mins.. Serve on a bun and top with onion and pickles or your favorite condiments!Recipe makes about 8 sandwiches.Pork Shoulder (2.7lbs @ 1.77/lb) ......... $4.76 Dr. Pepper (16oz) ......... $1.69 BBQ Sauce (1 cup) ...... $0.92 Buns .............................. $0.88 Onion ............................ $0.47 Garlic ............................ $0.15 Total .............................. $8.87Only $1.11 per delicious sandwich!** Dr Pepper logo and name are trademarks of the Dr Pepper Bottling Company and are in no way associated with this video other then being a very yummy ingredient.. 
If you enjoyed this instructable, please visit our YouTube Channel and subscribe! We have several other recipe videos available and bring you two new ones each week.Cooking With Chrissy Channel Follow us on Social Media! TwitterFacebook Google+ Instructables Tumblr Don't forget to share :D\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_125_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_125_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_125_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_125_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Obviously, you're going to need some meat. I picked pork tenderloin for time constraints. It is fairly lean, very tender, and cooks quickly. With reunion plans comes a tight schedule and very little time to tend the smoke. So, pork tenderloin, it is.I like to trim any extra fat and connective tissue from the outside of the loin. In particular, you want to look for any silvery-looking tissue, since that is particularly chewy. As for the fat, this will take approximately 3 hours to come up to temperature, so there won't be enough time for the fat and such to break down. Don't worry - this dude will be tender and juicy.. If the meat is the foundation, this is the structure. (Landscaping and curb appeal come later...)We're making a dry rub. So, how do you make a rub with coffee? One parts rub, one parts coffee? Not exactly, but close. Like cooking with wine, you want to use something you would actually drink. 
(If you rub your tenderloin with instant coffee, you may deserve to be hit by a bus, but I digress...) The beans I used were roasted within a week and ground for espresso - for the average Joe, espresso grind = dust. You are about to eat ground coffee beans, and when you take a bite of juicy, mouth-watering pork, you don't want to crunch on big boulders of bean. By the way, this is where things start to smell amazing. Here is your secret weapon, i.e. the spectacular Seattle Rub:4 Tablespoons coffee (espresso grind)2 Tablespoons unsweetened cocoa powder1 Tablespoon kosher salt1 Tablespoon raw sugar2 teaspoons cumin1 teaspoon garlic powder1 teaspoon chipotle (smoked) chili powder1/2 teaspoon celery seed1/2 teaspoon paprika1/2 teaspoon cinnamon2-3 grinds of black pepperMix it like you're a DJ. Feel free to up the quantities of salt or sugar to taste.. Chillax - a hybrid of chill and relax, AKA what the meat does now and what you do next.Pork tenderloin has it rough, so it deserves a good massage. Give it a liberal coating of the Seattle rub, knead it in, and throw it under foil at room temperature for a couple hours. Then what? I don't know... make some coffee? Take a nap? Go to the movies? Do what you feel. You have time.. If you have a shred of ambition and a charcoal grill, you can BBQ like a champ. Don't fear the charcoal. Embrace it with gusto. Just don't actually embrace it, or you might have to go to the hospital.Now, let's get the coals going. Make sure the bottom of the grill is empty and not clogged with ashes. Open the air vents in the top and bottom of the grill and clean the grate. Fill up your charcoal chimney (easy to find at any hardware store), crumple some news paper in the bottom, and light it. When flames are licking out the top and the upper coals are starting to turn white around the edges, pour the chimney to one side of your grill up against the wall. 
If you were doing burgers or something, you could spread them out, but for barbecue, we want them off to the side in one pile.. My coals are ready - now what? Two words to remember: Indirect. Heat. We're going for oven-like temperatures and not the low heat and long time required to soften something like a brisket or pork shoulder - Tenderloin just doesn't need it. And remember, you've got hungry friends coming soon.While you are at it, drop a couple chunks of pecan, cherry, or apple wood onto the coals. They will smolder while it cooks and add a touch of flavor to the meat that you definitely want. You don't want white smoke churning out of the grill. We're looking for thin wisps, at most. You want to smell it but not see it. Why? If you over-smoke this bad boy, it's going to taste like a camp fire. You might like eating a camp fire, but I've got other plans. Balance, champ. We want balance.As to the \"Indirect Heat\" bit - slap the meat on the grill opposite your pile of coals. That's it. The heat isn't underneath the meat, constantly searing it. Instead, you have a nice clean fire, a delicious piece of rubbed meat, and the aroma of nut/fruit wood soaking into your dinner. Punch a meat thermometer (hardware store) into the center of the loin and set it for 185. Now, put the lid on and walk away. When the thermometer beeps, smile, because you are about to win the prize.Remember: until your thermometer says so - hands off! If you are looking, you aren't cooking. In a few hours, you'll be here.... Your meat is done, but you aren't. Pull it off, set it on a plate, and cover it with foil. Let it rest! It's going to be about 1/2 an hour until you can cut it. But when you do, you're going to see this.... Mix up some pea salad and sweet-potato fries. Make some coffee ice cream. Enjoy the food and the conversation. That's it. That is the way to get people to beg you to move back home. 
Don't worry - If you ever get tired of hearing \"this is the best thing I've ever eaten,\" you can always take the low road. Just hang your head and plod back to the masses, where gas grilling and hot-dogs reign. But if you're me, you live in a one bedroom apartment with limited resources, a beautiful wife, a charcoal grill and a deep love for all things food. You have long-lost friends coming into town for the weekend, and a reputation as a foodie and grill-master to uphold. It's time to make some people happy. Get cooking!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_126_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_126_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_126_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_126_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Below ingredients are for all the variations ice cream of your choice - vanilla was mine :-) jello packs - red , yellow, green soft chocolate fudge banana sponge cake Push Pops - buy them....don't have them?? check next step. Well....we in Asia are sometimes not so lucky in having all the latest things...but if we don't have...we make do! I went out and got the larges syringes I can get from the drug store. I then carefully discarded the needle and cut off the end. The picture should say it all :-). First you need to make your sponge cake you could buy it ...or make it. 5 eggs 250g\u00a0 sugar 1/2 cup water and 1 1/2 tsp. 
vanilla - mix together 150g\u00a0 cake flour + 1 1/2 tsp. baking powder + 1/2 tsp. salt - sift together Beat eggs till foamy.....add the sugar... and beat till well blended...about 5 minutes...now alternate the liquid and flour and beat. till smooth. Place in a greased tray and bake . \u00a0you could also check this instructable to make your cakeChocolate\u00a0 Fudge 400 g sweet condensed milk \u00a0 50 g sugar \u00a0 25 g cocoa powder \u00a0 50 g butter Mix melted butter and sugar and coco powder in pan till well blended...add the condensed milk and bring to boil in low heat...take off fire and pour in dish and let it cool.Jelly Buy colors of traffic light and mix according to packagingBanana slice circles and then cut with syringeOptional!color ice cream Now you may not have the required flavoring and colors in ice cream...or your child...like mine only likes vanilla!! well then...what do you do for colors??? color vanilla! yes...so I had some flavors and colors...but to satisfy one child I colored a bit of vanilla ice cream in the colors of the traffic light so he would be happy. Assembling once you have all your ingredients ready is pretty easy...have all your push pops close so it would be quicker...while one push pop is freezing you can move on to the next! Pull the syringe back each time you finish a layer...making room for the next layer The layers for the Ice Traffic Jello Push Pops goes like this - cake circle, green ice cream, freeze, green jelly and cake circle, orange ice cream, freeze, orange jelly , cake, red ice cream freeze, red jelly , cake and finally\u00a0 white ice cream or whip cream.. You've all had frozen banana's dipped in chocolate right? wasn't it delicious? well...how about in push pop form?? For Fuggy Ice Banana Push Pops the layers are Banana slice, vanilla ice cream, freeze, fudge! you could also go with Banana slice, fudge,freeze and then ice cream! both combinations are wonderful! Top up your layer with fudge!. 
Fudge Ice Cream Cake is one of my favorites...so I thought why not in Push Pops?? And\u00a0 so I layer!! The layers for this is - cake, ice cream, freeze, fudge, cake ice cream freeze , fudge! top it up ending with an Ice cream layer! adding meringue would be great...but obviously I cant stick the pops in the oven! \u00a0 . I enjoyed making and having my push pops...it really cooled my family down ;-) Let me know how you like it...and if you like it....please vote for the frozen food contest! Thank you!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_127_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_127_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_127_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_127_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. You will need Milk, Sugar and Cocoa powder. . In the bottom of a mug mix 1tbsp of Milk, Sugar and Cocoa powder. . Measure out 1 cup of milk in a glass measuring cup and heat in the microwave for 60 seconds.. Pour your heated milk into the mug until all the cocoa mix has dissolved.. I hope you enjoyed this post. 
please like and comment if you did.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_128_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_128_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_128_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_128_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Cake:- 225g Unsalted butter at room temperature (not melted)- 2 Cups of caster sugar- 2 Eggs- 3 1/2 Cups of Plain flour- 4 Teaspoons of baking powder- 1/2 Teaspoon of salt- 2-3 Teaspoons of vanilla bean paste- 4 Tablespoons of vegetable oil- 2 Teaspoons of strawberry essence (optional)Butter Cream Frosting- 225g unsalted butter, at room temperature- 6 Cups of icing sugar- 5 Tablespoons of milk- 3 Teaspoons of vanilla bean paste- Pink Food colouring. - 1 Giant cupcake baking tin (you can get these on-line I think from Wilton)- Regular muffin tin- Mini muffin tin- Whisk- Mixing bowls- Spoons- Measuring cups- A cake palete knife/ a smooth cooking tool to smooth out the icing on the cake- A star piping tip- Piping bag and different sized circle piping tips- Muffin paper liners. To make the cake batter you will need a kitchen mixer or a hand mixer. The bowl you will use needs to be large as you will make more tat 6 cups of batter. The first thing to do is to beat the unsalted butter (225g) in the kitchen mixer until the butter becomes light and fluffy. Then add the 2 cups of caster sugar and continue to beat until light and fluffy. 
Then add the vanilla bean paste, strawberry essence and eggs to the butter and continue beat them.Add the baking powder to the plain flour and whisk it together to get rid of any lumps then add 1/3 of the flour mixture to the butter mixture and beat it together. Then add 1/3 of the milk and continue to beat the mixture. repeat adding 1/3 of the flour and milk mixture alternatively until all incorporated. Finally add the 4 tablespoons of oil to the mixture and stir until combined.. For the giant cake tin if it is NOT stick free coat the tin with butter or oil on all sides then dust with plain flour until it is completely coated.In the base of the giant cupcake place 3 1/2 cups of the cake batter and smooth it out. Then for the top of the cupcake (swirly pyramid shape) place 2 1/2 cups of cake batter and smooth the batter on the top. For the regular and mini cupcakes line 1 or 2 of them with muffin paper liners.For the regular cupcakes place 1/4 cup of batter in the muffin pans. and place 1-2 teaspoons of batter in the mini cupcake pan.. Giant cupcake: Pre heat the oven to 160 degrees Celsius fan-forced and bake the cake for about 1 hour until you can tough the cake and insert a skewer or knife until it comes out slightly clean, Then let the cake rest for 20 minutes in the pan. Let the cake cool completely on a wire cooling rack for about an hour. While this is cooling make the butter cream frosting.Normal cupcake: Bake the cupcake for about 16-18 minutes until golden brown and spongy to the touch. Mini cupcake: Bake the mini cupcake for about 8-11 minutes until golden brown and spongy to the touch.. Place the unsalted butter in the electric mixer and beat until it is light and fluffy. Then add 1 cup of icing sugar to the butter and beat until incorporated, then add 1 tablespoon of milk and mix, adding alternatively until all milk and icing sugar has been added. Then add 3 teaspoons of vanilla bean paste and beat it all together. 
You will need about 4 shades of pink, each slightly lighter than the next and white icing to finish it off. . When the cake is completely cooled and the butter cream frosting is done place about a tablespoon of icing onto a plate and place the cupcake 'base' on top of it. Then put about 1-2 tablespoons on top of the base and place the 'top' of the cupcake. Then with a think layer of butter cream frosting and your palette knife (or flat edged butter knife) go around the cake creating a crumb layer of frosting. This is just a layer that covers the cake to catch the crumbs so that they don't show up on your final layer of icing. Then over the crumb layer put another coat of white icing.To make the cake seem more like a cupcake use the knife to make upward strokes of the icing to resemble the 'crinkles' of a cupcake paper liner.. To make the darkest shade of pink icing Take about 1/2 -3/4 cups of icing and add a lot of food colouring (i added about 1/2 a teaspoon of food colouring gel which is very potent). this will make the bottom petals of the op of the cake and the pretty mini flowers at the base of the cake. First place 3/4 of the dark icing into a piping bag with a large round tip. Then pipe a circle or oval of icing onto the cake and with your flat knife lightly scrape it away from the middle of the circle so it looks like a petal. Then place another circle on top of the streaked petal from before and continue until you go around the whole cake. Reserve the rest of the dark icing for the base of the cake.Continue this process for each lighter shade of icing until you reach the top of the cake then top it of with a small dollop of white icing. Remember to save some of each coloured icing as you will need it for the mini cupcakes.. Spread the white butter cream icing on top of the regular and mini cupcake. Then repeat the process in the previous step for the small cakes if you are making them but with smaller piping tip holes. . 
With the left over dark icing place it into a piping bag with a star shaped tip and place the tip about 1/2 a centimetre away from the cake base, push on the piping bag until icing comes out then pull the bag away quickly so that it resembles a small flower. Then continue this action around the base of the cake and you are done!Then it is very important to share this with your friends I would recommend refrigerating it if it is a hot day otherwise it can stay at room temperature.. The most important part about the finished cake is to give it or share it with your friends. Trust me it makes the cake taste so much better!!! I gave mine to my friend Flavia for her birthday. She loved the cupcake family!!!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_129_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_129_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_129_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_129_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. You will need a few unusual things for this bubbly beverage.-bannana extract-seltzer-sugar (liquid sugar is easier to use)-cup. Pour a generous spoonful of the bannana extract into the cup so it forms a thin layer on the bottom.. Take your sugar (or liquid sugar), and pour several tablespoon of sugar into the cup. Then stir well. If there is anyone hesitant about that much sugar, remember, it's SODA and it has a bum load of sugar.. 
This step is easy because you just need to pour the seltzer into the cup without any measurements! When the liquids meet the top of the cup, stir it again.. This drink probably smells strong of bannana, but it's really great! Enjoy your bannana soda, and check out my fan fictions every once in a while! (My fan fictions appear to be more humorous than serious, so the characters I use will probably not act... Well... Normal...)P.S. If you make the bannana soda, have comments, questions, or suggestions, just write in the comments!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_130_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_130_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_130_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_130_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Eggs (as required)2 green chilies1/2 Tsp red chili powder1/4 Tsp turmeric powderA few curry leavesCoriander leaves for garnishingSalt to taste (as required). Boil the eggs.Remove the eggs shells.. Make a slit (like across) on all eggs in order to absorb the spice taste.Chopchilies, and coriander leaves.Add 1/2 Tsp turmeric.Add 2 Tsp salt.Add 3 Tsp chili powder.Add 1 Tsp oil.Mix it well.. Heat oil in a pan.Finally, add the cooked eggs.Add the mixed masala, on the eggs.Wait till the base of the egg turns golden brown.Flip the eggs after the base is turned golden brown.Stir gently.. 
So enjoy your meal with hot Boiled Egg Masala.That's it, Foodies.Thank you all.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_131_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_131_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_131_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_131_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. FruitA handful of blueberries1 banana but you can use more if you wantA bunch of grapesA portion of strawberriesTo serveA drizzle of agave nectar. Wash your berries under cold water and leave them to dry.Peel and evenly chop your banana.Slice the tops of your English strawberries.Then assemble into your bowl.. When you put your fruit in the bowl it will taste fantastic. I find that taking a few seconds to present your fruit enhances not only the presentation but the satisfaction of the end consumer. Good looking food really does taste better.I presented my strawberries cut side up and evenly spaces the fruit throughout the bowl, it went from being a bowl of fruit to an art piece.. Gentlly run your nectar over the fruit so that it glistens and looks good. 
It is ready to serve.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_132_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_132_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_132_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_132_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. One bunch of Green Amaranth leaves (About 250 grams)One medium sized Onion2 Green Chillies (Adjust to your own taste)About 100 grams of Chickpea flour3 teaspoons of Rice FlourOne teaspoon of Cumin seed powderOne teaspoon of Fennel seed powderOne teaspoon of Red Chilli powder (Adjust to your own taste)One teaspoon of Turmeric PowderHalf a teaspoon of Asafoetida powderSalt to taste. First you have to remove the hard stems from the green amaranth leaves. You can plant the stems in your garden or in a pot and It will grow and give you more green leavesRoughly chop the green leaves and keep aside in a wide pan. Finely chop the onion and green chillies and add them to the chopped Amaranth leaves. Add the rice flour and other spice powders and salt with Chickpea flour and mix well.Do not add any water. Add the chickpea flour with other ingredients to the Amaranth leaves in the panMix everything together in the pan. If required add little bit of water so that the mix can be shaped by hand without breakingIf the mix seems very dry and separates, add little amount of water. 
If it is watery and did not hold shape, add little bit of chickpea flourTake a lemon sized mix in your hand and shape into flattened balls as in the picture. It is very easyYou can make few flattened ball shaped pieces beforehand and keep them in a plate. Place a frying pan over medium flame and add sufficient cooking oil for deep fryingWhen the oil is heated, add one piece and check. If the temperature of oil seems ok, then you can add 4 to 5 pieces at a time to the oil.Turn over the fritters few times in oil so that both sides are evenly cooked . Once the outer shell of the fritters are deep fried to a dark brown and no longer bubbles in the oil, you can take them out with a strainer.Drain out excess oil and transfer over absorbent food-grade paperServe hot as evening snack with coffee or tea. Everybody will love it and ask for more\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_133_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_133_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_133_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_133_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Preheat your oven to 350 degree and then assemble your ingredients.Baking dish - I used a glass one1 box Gluten Free Yellow Cake Mix - Betty Crocker's is among the most reasonably priced, at about $4/box1 jar or can of fruit of your choice - peaches are wonderful, but so is pineapple or apple1/2 cup (1 stick) of butter1/3 cup Brown SugarCinnamon. 
Open the jar or can and dump it in. I know, right?! Don't bother draining your canned fruit - you'll want the juice in there.. That's it - dump your cake mix over the fruit, as evening as possible. Then spread more evenly with a fork or knife, tamping your mix down firmly into your fruit layer.. Slice pats off your stick of butter and layer evenly along the top of your cake mix layer.. Use about a 1/3 cup of brown sugar and sprinkle evenly over the top of your butter layer. Sprinkle cinnamon as desired over the whole thing.. Bake and wait.. Remove your dump cake from the oven - you can test with toothpick or knife if you'd like. It should come out clean. Cool for about 10 -15 min... You've essentially got a cobbler here, and a darn good one! Top with a little bit of ice cream and you've got a lovely dessert, gluten free or otherwise!Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_134_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_134_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_134_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_134_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Lacto vegetarian version: clean thoroughly and steam one whole small potato for 30 minutes. This will produce a drier lighter texture perfect for adding butter and cream. Vegan version: pierce potato several times with a fork, wrap tightly in aluminum foil and bake at 400 for 1 hour. 
This produces a stickier wetter texture that does not require the addition of dairy liquids.. Split open with a spoon and scoop out all of the steamed/baked flesh, discarding the skin, while stopping to marvel at one of the most amazing colors found in nature! *The potato pictured here has been steamed, notice the dry fluffy texture.. Place potato flesh in a mixing bowl, add approximately 1 tbs each, brown sugar and pumpkin pie spice (vegetarians who prefer a lighter creamier texture and flavor may also enjoy butter and/or cream to taste) beat with electric mixer until smooth.*The potato pictured here has been baked, notice the wet, sticky texture.. I like to add some whipped cream to the vegetarian version, and sprinkle with nutmeg for extra yumminess, enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_135_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_135_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_135_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_135_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Ingredients:\n1 -\u00a0 gallon milk\u00a0 (whole mile generally generates more curds.\n1 -\u00a0 rennet tablet (available at better grocery stores0\n1/2 cup\u00a0 - vinegar (cider vinegar tastes better to me)\nTools:\nStove\nLarge pot about 2 gallon size.\nSpider or equivalent (a large spoon will work)\nThin mesh pasta colander.\nPaper towels\nSink\nMeasuring cup\nInstant read thermometer.. 
Pour the milk in a large pot.\nCrush the rennet tablet into 1/4 cup water and let it dissolve well.\nAdd the rennet solution and vinegar to the milk then stir well..\nHeat the milk until it is about 100 degrees Fahrenheit. WATCH CAREFULLY!!!!\nAt 100 - 110 degrees Fahrenheit immediately TURN OFF Heat.\nLet sit till the curds have separated from the whey (greenish milky stuff).. Put some paper towels in the colander.\nSpoon out the curds into a find mesh colander that is sitting in a bowl the catch the excess whey.\nAdd salt and mix with the curds when all the curds have been retrieved.\nPut on a cover over the curds and then put a weight over the cheese curds.\nRefrigerate over night. Get rid of more unneeded liquid.\nYou can reheat the whey (second picture) to about 180 degrees and get more curds. (aka ricotta or recooked). Again Watch carefully.. You may want to add a bit more salt for preservation purposes.\u00a0 Anyway, you now have your own honest cheese. We could do a variation to make mozzarella.\u00a0 You now have cheese for quesadillas (aka piadini), pizza, spaghetti/lasagna, bagels, salads,\u00a0 cheesecake, cheese soup, and a host of other food items. A basic recipe that should be in all kitchens.. How to Make Swiss Cheese This recipe will teach you to make traditional Swiss cheese in the comforts of your own home. Difficulty: \u00a0\u00a0\u00a0 Challenging Instructions things you'll need: \u00a0\u00a0\u00a0 * 1 gallon of whole milk, 1/2 packet of direct-set thermophilic starter or 2 ounces of prepared thermophilic starter, 1/2 teaspoon of propionic shermanii powder, 1/4 teaspoon of liquid rennet or a 1/4 renbet tablet, 1 pound of cheese salt, for brine, plus a pinch of cheese salt, 1/2 gallon of cold water, for brine. curd knife, stainless steel whisk, cheesecloth. 
ladle -------------------------------------------------------------------------------------------------------------------------- \u00a0\u00a0\u00a0\u00a0\u00a0 Swiss Cheese \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 1 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Heat the milk to 90 degrees Fahrenheit. Add the starter and mix well. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 2 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Remove 1/4 cup of milk from the pot and add the propionic shermanii to it. Mix thoroughly to dissolve the powder. Add the mixture to the milk and stir. Cover and allow the milk to ripen for approximately 10 minutes. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 3 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Make sure that the milk's temperature ALWAYS remains at 90 degrees. Add the diluted rennet and stir gently with an up-and-down motion for approximately 1 minute. If you are wanting to use farm fresh cow's milk, top stir for several minutes longer. Cover and let the milk set at 90 degrees for approximately 30 miutes. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 4 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Using a curd knife and a stainless-steel whisk, cut the curd into 1/4 inch cubes. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 5 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Keeping the curd temperatures at 90 degrees, gently stir the curds for approximately 40 minutes. This is called fore-working and helps expel whey from the curds before they are heated. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 6 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Heat the curds by one degree every minute until the temperature is 120 degrees Fahrenheit. This will take approximately 30 minutes. 
Maintain the temperature at 120 degrees Fahrenheit for another 30 minutes, stirring often. The curds must be cooked until they reach a stage called the \"proper break.\" To test for this, wad together a handful of curds and rub it gently between your palms. It the ball readily breaks apart into individual particles, the curds are sufficiently cooked. If they are not sufficiently cooked, they will be too soft to hold the cheese together. Let the curds set for approximately 5 minutes. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 7 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Pour off the whey and reserve it for other recipes. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 8 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Line a 1 pound mold with cheesecloth and place it in the sink or over a large pot. Quickly ladle the curds into the mold. You do not want the curds to cool. Press at 8-10 pounds of pressure for approximately 15 minutes. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 9 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Remove the cheese from the mold and gently peel away the cheesecloth. Turn over the cheese, re-dress it, and press at 14 pounds of pressure for 30 minutes. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 10 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Repeat the process but press at the same pressure of 14 pounds for 2 hours. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 11 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Repeat the process but press at 15 pounds of pressure for 12 hours. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 12 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Make a saturated brine bath by combining the salt and water in a noncorrosive pot; stir well. 
Remove the cheese from the mold, peel away the cheesecloth, and soak the cheese in the brine. Sprinkle the remaining pinch of salt on the surface of the floating cheese. Refrigerate the brine and let the cheese soak for 12 hours. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 13 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Remove the cheese from the brine and pat dry. You can reserve the brine for other recipe uses if you so desire. Place the cheese on a clean cheese board and store between 50 to 55 degrees Fahrenheit and at 85 percent humidity. Turn the cheese daily for one week, wiping it with a clean cheesecloth dampened in salt water. Do not wet the cheese. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 14 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Place the cheese in a warm, humid room, such as the kitchen, with the temperature between 68 and 74 degrees fahrenheit. Turn it daily and wipe it with a cheesecloth dampened in salt water. Do not wet the surface of the cheese. Let the cheese set for 2-3 weeks, until eye formation is noticeable. The cheese will swell somewhat and become slightly rounded. \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 * 15 \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Age the cheese at 45 degrees Fahrenheit. and at 80 percent humidity for at least 3 months. Turn the cheese several times a week. Remove any surface mold with cheesecloth dampened in salt water. A reddish coloration on the surface of the cheese is normal and should not be removed.This recipe will make about 1 pound of cheese. If you an additional pound, just double the recipe.. Ingredients1 Gallon Fresh Milk 1 oz. Mesophilic Starter Culture 1/4 tab Rennet 1 Tablespoon Salt InstructionsUsing a double boiler, warm the milk to 90 F (32.25 C). Add 1 oz of mesophilic starter culture and mix thoroughly with a whisk, the culture must be uniform throughout the milk. 
Allow the milk to ripen for one hour. Dissolve 1/4 tab rennet into 3-4 tablespoons COOL water. Hot water will DESTROY the rennet enzymes. Slowly pour the rennet into the milk stirring constantly with a whisk. Stir for at least 5 minutes. Allow the milk to set for 1-2 hours until a firm curd is set and a clean break can be obtained when the curd is cut. With a long knife, cut the curds into 1/4 inch cubes. Allow the curds to sit for 15 minutes to firm up. Slowly raise the temperature of the milk to 102 F (39 C). It should take as long as 45 minutes to reach this temperature. During this time, gently stir the curds every few minutes so they don't mat together. Cook the curds at 102 F (39 C) for another 45 minutes. During this time, gently stir the curds every few minutes so they don't mat together. Drain the whey by pouring through a cheesecloth lined colander. Do this quickly and do not allow the curds to mat. Place the curds back into the double boiler at 102 F (39 C). Stir the curds to separate any particles that have matted. Add the tablespoon of salt and mix thoroughly. Cook the curds at 102 F (39 C) for one hour, stirring every few minutes. Carefully place the curds into your cheesecloth lined mold. Press the cheese at about 20 lbs. (9 kg) for 45 minutes. Remove the cheese from the press and flip it. Press the cheese at about 40 lbs. (18 kg) for 3 hours. Remove the cheese from the press and flip it. Press the cheese at about 50 lbs. (22.75 kg) for 24 hours. Remove the cheese from the press. Place the cheese on a cheese board and dry at room temperature for 3-5 days, until the cheese is dry to the touch. Wax the cheese and age it in your refrigerator for 3-24 months. The longer the cheese is aged the sharper the flavor it will develop. Be sure to flip the cheese every few days. . Get a second use out of the whey you made from the cheese to be polenta.. 
Add some cornmeal into the boiling whey and stir until thick enough for the stirring tool to stand up straight./ Polenta\u00a0 makes an alternative to mached potatoes. Instead of the red sauce, you can use buttter garlic and or a little olive oil.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_136_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_136_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_136_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_136_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. MaterialMetric scale A timer A bowl at least 1 gal capacity (4 to 5 litres) and a smaller one A large basket around 10 in (25 cm) inside diameter (I use the basket of my salad spinner) 2 kitchen towels (in linen or cotton) A pizza peel or a metal tray Parchment paper A razor blade Some plastic jar or bowl Masson jars (for the levain) A baking stone or a large cast iron skillet I use the skillet) A roasting panIngredients All purpose flour Whole wheat flour Rye flour Salt Water (room temperature). 
First and the most difficult is to start the \u00ab levain \u00bb Use organic flour, it will be easier Pour in a jar (half pint or half liter masson or equivalent) :25 g of all purpose flour 25 g of whole wheat flour 33 g of room temperature water (not directly from the faucet because there is some chlorine, so let it rest some hours before using it)Mix together, cover the jar with it\u2019s lid and keep in a warm place arround 25 \u00b0C (77 \u00b0F) (on the top of water heater for example).Next 2 days, once a day: Take 25 g of the previous mix (discard the rest), and add:25 g of all purpose flour 25 g of whole wheat flour 33 g of room temperature waterIf the culture has started, you may see that the mass rises (with a lot of bubbles inside) and falls after a while, then repeat the feeding preocess every 12 hours.The day after and indefinitely:Then if the culture stays very active replace the whole wheat by all purpose flour and do it once a day.To keep the culture active feed it once every 1 or 2 days and keep it at room temperature.Don\u2019t go under 15 Celcius otherwise you will have an acidic fermentation and you will have to restart it one or 2 days before using it.Then the levain (sour dough) should look like the two last pictures: before fermentation and 4 hours later. You will need 24 hours to do the complete process (to have a baked bread) but in fact it is around 20 min of work.You will also have to wait for a night before tasting the bread...You will need to have:All-purpose flour: 50 g + 150 g + 405 g + 250 g = 855 gWhole wheat flour: 200 gRye flour: 200 gSalt: 22 gWater: 35 g + 100 g + 740 g = 875 gUse organic flour, the results will worth it.. 
10 PM: Wakeup the levain (made the Rafraichi): In a bowl larger than the one used to keep the levain add:40 g levain 50 g all-purpose flour 35 g room-temperature waterMix, cover and store at 24,5 to 26,5 \u00b0C (76 \u00b0F to 78 \u00b0F) Prepare the water for next step and store it at the same place as the levain so it will be at the good temperature: 100 g room-temperature water6 AM the next day: the levain build add to the Rafraichi: 150 g all-purpose flour and the previously 100 g room-temperature water.So you have a levain build of 375 g Mix, cover and store at 24,5 to 26,5 \u00b0C (76 \u00b0F to 78 \u00b0F) Prepare the water for next step and store it at the same place as the levain so it will be at the good temperature: 740 g room-temperature water. Arround 1:45 PM: The final mix of the doughPrepare 22 g of salt 405 g of all purpose flour in a bowladd the salt to the 740 gr of room-temperature water and mix well In the big bowl pour: 200 g rye flour 200 g whole wheat flour 250 g all purpose flourand mix the flour together Then add 310 g of levain built to the flour (keep the leftover to continue your levain for next time) Add the salted water to the flour and doughMix with one hand (stirring and grasping) while the other keeps the bowl. The mix should become more homogeneous after 1 to 2 minutes. Finally add the remaining flour (405 g) and continue to mix untill all the flour is absorbed (3 to 4 minutes maximum).The total weight of the dough is: 2127 gWait 5 minutes and fold the dough 8 to 10 times. Fold technique: Hold the bowl with one hand, slip your fingers between the dough and the inside of the bowl and grasp the dough, pull it and and lay it on the top of the dough. Turn the bowl and repeat the movement. (on the pictures you see only one hand because the other one was holding the camera)Wait another 5 minutes and repeat 8 to 10 times. It should begin to look like a real dough and begin to detach from the bowl. 
Cover with a towel and keep in a warm place for 1 hour: 24,5 to 26,5 \u00b0C (76 \u00b0F to 78 \u00b0F) 15 PM: Fold the dough 8 to 10 times Cover and keep in a warm place for 1 hour: 24,5 to 26,5 \u00b0C (76 \u00b0F to 78 \u00b0F) 16 PM: Fold the dough 8 to 10 times Cover and keep in a warm place for 1 hour: 24,5 to 26,5 \u00b0C (76 \u00b0F to 78 \u00b0F) 17 PM: Fold the dough 5 to 6 times Cover and keep in a warm place for 1 hour: 24,5 to 26,5 \u00b0C (76 \u00b0F to 78 \u00b0F). Loaf formFold the dough 5 to 6 times in its bowl.Then verse the dough on a floured surface, fold it 2 to 3 times and give it a round shape.Line the basket with a towel, sprinkle flour on the towel (not too much, just enough to avoid the dough sticking to the towel). Place the dough in the basket bottom up (you will see the future base of your bread (la clef), the upside will be in the bottom of the basket)Cover and keep in a warm place and wait 1.5 to 3 hours. The dough will be ready when the total volume will be 3.5 times the initial volume. After 1 hour, check every 30 minutes: To verify if it is ready, enter a finger in the dough, when the dough is ready, the hole must close itself in 2 to 3 seconds. 1/2 hour before the end of the rising:Prepare the ovenon the bottom rack place a roasting pan (empty) in the middle of the oven place a pizza stone or a cast iron skillet preheat the oven to 210 \u00b0C (in convection), 230 \u00b0C in conventional (450 \u00b0F).Prepare the material necessary to put the loaf in the ovenA pizza peel or a metal tray lined with parchement paper A razor blade 2 cups of hot water. When the oven has reached the good temperaturePlace the paper on the top of the loaf and turn it upside down on the metal plate. Quickly incise a losange pattern on the top of the loaf, not too deap (arround 1/8 inch) Open the oven and quickly slide the loaf on the stone (or in the skillet). Close the oven door. 
Pour 1 cup of water in the roasting pan as quickly as possibleClose the door.The steam will help to obtain a better crust.So we are around 8 PM. Note carefully the hour.After half an hour (8:30 PM) Turn the loaf in the oven Decrease the temperature to 175 \u00b0C (350 \u00b0F)After 50 to 60 minutes (8:50 PM - 9:00 PM)The loaf should be brown, and look cooked. But it is too early to stop the baking procedure. Even if it looks baked wait 5 to 10 minutes.To know if the bread is ready it should have an hollow sound when knocked on the bottom.9:10 - 9:15 PM remove the bread from the oven and put it on a rackIt should weight around 1890 gDon't cut it now, you have to wait at least 6 hours before tasting it. It is very difficult because it smells so good in the house.. The next morningHave a good french breakfast with some honey, marmelade and butter and a cafe latte.The first days the bread is better not toasted. You will be able to keep it 1 week (if you don't eat it) ...Enjoy\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_137_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_137_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_137_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_137_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 
\n Our ingredients will be...\n\t\t12-18\u00a0jalapeno\u00a0peppers (if they're huge, you need less)\n\t\t2 x\u00a08oz cream cheese\n\t\t2 x packets of bacon (NOT thick cut!!)\n\t\t1/4 cup shredded cheese (preferably cheddar)\n\t\t1 bunch of green onions/chives\n\t\t1/3 cup of BBQ sauce. First things first...\nGrab your iPad and put on an episode of Bones, because the prep will take you 42 minutes...\nDice your chives a little thin, then put your cream cheese, chives, and shredded cheese in a mixing bowl and mix.\nYou can do this by hand if you're used to baking and have the unnatural super strength in your forearms. \u00a0I don't. \u00a0I used a mixer.\nJust make sure it's blended well.. You may want to wear gloves for this. \u00a0If you don't, just be sure to not rub your eyes for a day or so. \u00a0Jalape\u00f1o oil is unforgiving...\nSlice your jalape\u00f1os length-wise. \u00a0You can leave the stem on, or take it off. \u00a0Let me give you the benefits of each:Stem on = something to hold on to, looks nicer, easier to tell when it's about to get hot (see next step)Stem off = can pop the whole stinking popper in your mouth for sheer delight. Before we go further, let me clarify...\nIt's a common misconception that the seeds have all the heat. \u00a0The membranes have just as much!\nWhat you're going to do here is grab a table spoon and scoop out the insides. \u00a0This is where you can control the amount of your heat. \u00a0For spicier poppers, leave more membrane. \u00a0For a more tame snack, scoop out all of the white. \u00a0Save a big pile of seeds for a future step.... Now you'll want to get your spacing to make sure you have enough pans, because after this we're going to get messy.\nLay out your jalape\u00f1o halves on the pan. \u00a0If they're curvy (hubba hubba), then you'll want to alternate directions so they fit better. 
\u00a0Something that you're looking for here is enough space so the bacon isn't touching from one popper to the next, otherwise it won't cook properly.\nI put foil on my sheets on my cookie sheets to make clean up easier. \u00a0But then again, I'm lazy.. Now it's time to pre-heat the oven. \u00a0Set it to 275 degrees - any more and the cream cheese gets hard.\nAnd now grab your filling and start filling (the English language just fails here) your pepper halves. \u00a0\nHere's a little aside...\nSince we're dealing with jalape\u00f1os, let's talk about mexican tradition. \u00a0There's this cake with a porcelain/plastic baby inside. \u00a0The tradition is whoever gets the baby has good luck for the year. \u00a0I think this is completely dumb, as I don't want to be the poor fool that is lucky just because I didn't choke on the foreign object inside my dessert. \u00a0BUT...that did inspire me to create a game of my own.\nHere, you can take all of those seeds you set aside, but them in one of the peppers, then cover with the filling and prepare it like the rest of them (you see where I'm going with this). \u00a0Then the\u00a0victim guest who gets the hot one can be the lucky person! \u00a0This is even more effectual when the rest of your guests have been saying over the last few minutes \"they're not even hot like I expected!\". Slice the bacon in half length-wise to create 2 long strips out of each piece. \u00a0Each half will wrap around a pepper. \u00a0Try not to layer the bacon too thick, or it won't cook through. \u00a0You can hold the bacon in place with a toothpick.\nAlso, when doing this step make sure not to touch other things. \u00a0Uncooked pork (and poultry if using turkey bacon) has a lot of bacteria that can cross-contaminate your kitchen. 
\u00a0So count your toothpicks first and lay them out, so your bacon-fingers aren' reaching into your toothpick box.\nA reminder again - make sure the peppers aren't touching, so the bacon cooks properly.\nGood news - we're almost done!. Grab your BBQ sauce, and you're going to brush/drizzle/fling/spoon your BBQ sauce on. \u00a0Sometimes if I'm in a hurry, I even just squeeze it straight out of the bottle (a lot of sauce does get wasted if you do it this way. \u00a0The most efficient way I've found is to brush it on.\nAt this point, your oven just beeped that it's ready and Bones and Booth should have just captured the killer, and you're waiting for the fluffy wrap up at the end with Hodgins and Zack.\nDo a dance, though, because you're done!. Put your poppers in the oven for 1 hour. \u00a0Then serve to your salivating guests!\nIf you made more than you need, stick them in the fridge - they taste great hot or cold!\nCongratulations! \u00a0You're done and everyone in the house is happy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_138_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_138_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_138_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_138_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. This step will vary depending on what type of pit your dealing with and whether or not you have a fire rack. My fire pit is REALLY wide so I had to build stone supports for my cookie sheet. 
I did this by taking old patio blocks and leaning them together perpendicularly against the walls of my pit to make a space just big enough for my rack. Most camp sites have a relatively low and not-as wide pit and often come with a fire rack. If so, you will not require this extra step. Also, if your cookie sheet itself fits over your pit, you're good to go!Note that you do not need a rack over your pit. A rack is only necessary if your pit is too wide for your cookie sheet and you do not have the means to build a support structure inside the pit. Even though I COULD have placed a rack over my patio blocks, I didn't because the cookie sheet fit perfectly and it was unnecessary.**See step 4 regarding lowering/raising the rack/cookie sheet as necessary**. The first step in a successful fire is having all your starter materials ready before you begin: Tinder, Kindling, and Fuelwood. I made 5 piles of supplies:Leaves and tiny twigs (Tinder)Small twigs (Kindling)Medium twigs (Kindling)Large twigs (fulewood)Wood (Fulewood)Take a pile of leaves and light the underside. Once it starts going, add your smallest twigs. You want enough to catch flames, but not too many to smother the fire. Here I usually dump a bunch that are lightly packed and give the leaves a good blow with my mouth. The extra O2 helps the fire grow and catches the twigs.Once some of the small twigs have lit, immediately start adding medium twigs in the shape of a teepee over the flames. Do not worry if only a little part of it has caught, you can build on it. Add more sticks and blow as necessary. Once the medium sticks have caught, start adding larger sticks and wood in the same fashion. At this point you should have a steady flame and do not need to blow anymore.You may need to reposition your base fire to the center of the pit, or reposition sticks to allow the adding of larger wood. 
Make sure to leave space between the sticks and wood for your fire to \"breathe.\" If it doesn't get enough O2, it will go out!Now, sit back, relax, and enjoy your fire for a bit!. Take an OLD cookie sheet and line it with foil. Make sure it's an old one, or one you don't mind getting a little messy. Fires will tend to junk up your nice new cookie sheet! Spray the sheet with cooking spray and place your can in the center. Now you need cookie dough. I cooked my cookies in my backyard so I just followed the recipe on the back of the chocolate chip bag and made dough from scratch. After all, that's usually the kid-fun part of making cookies. If you want a little more convenience or are going camping, you might want to consider pre-made refrigerated dough. I would not recommend using frozen dough unless you thaw it beforehand. Either dough will work so it's totally up to you.Drop your cookie dough on the sheet making sure to leave room between the cookies, and around the can. Using 2 large sheets of foil, cover the top of the cookies and seal the sides. The can in the center will make a tent-like shape. This will allow air to properly circulate and cook your cookies just like a regular oven!. A burnt-out and flame-less fire is usually the worst part of making one because it means all the fun is over. However, that's where our fun begins! We want to cook with only super hot coals, not a pretty and flame-filled fire. Flames will unevenly heat the bottom of the pan and cause the cookie bottoms to burn. And well, nobody likes a burnt cookie!When the fire is mostly flameless, evenly distribute the hot, glowing coals. Place your mini cookie oven on your rack or support system and let them bake for about 8 minutes. Remove the \"oven\" using pot holders and check for doneness. At this point assess your cookie situation and determine how much longer the cookies should cook (if at all), or if you need to raise/lower your cookie oven. 
You need to make a judgement call here because you really don't want to keep removing the oven and letting all the heat escape by continuously checking the cookies. My cookies were REALLY underdone at the 8 minute mark (picture above) so I cooked them for another 10 minutes. After checking a second time when the 10 minutes were up, I again found rather under-cooked cookies. I quickly substituted smaller patio blocks to lower my oven closer to the heat source to help the cookies really get coking. I also rotated the cookie sheet because one side was getting a lot more heat than the other. If your cookies are browning on the bottom too quickly, you may need to raise you cookie sheet. Refer to the 1st step and rebuild/raise the rack with stones/blocks. You can also remove some coals to lower the heat level.My cookies baked for about 28 minutes total, but it could have been a lot less if I had lowered the sheet at the 8 minute check in (live and learn I guess!) Total baking time will vary based on how many coals you have, how close the oven is to the coals, and what type of cookie your cooking. . Like any cookie, you want to let it cool for a bit before removing them from the cookie sheet. I simply slid the foil off the cookie sheet heat and allowed them to completely cool right on the foil. Note how beautiful the bottoms of the cookies came out! If I had flames while cooking, they sure wouldn't look that nice!!Once cool, ENJOY your cookies!! I sure did!! Surprisingly, even though I had a small amount of smoke throughout my entire baking process (which is normal), the cookies didn't taste like camp fire at all! They looked perfect and tasted perfect! I call that success! I had so much fun making these cookies, I can't wait to make them again! 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_139_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_139_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_139_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_139_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Combine the yeast and honey in the lukewarm water and \"melt\" the yeast in the warm water. . To give the bread a nice even texture and look, its recommended that you grate your beetroots with a grater. But if you're lazy like me, you chop them up and use the blender. However, this does make it more chunky, and it does effect both the looks and to a certain degree also the taste of the bread (it taste slightly more like beetroot).To help blend better, add the yogurt (I used 'Cultura', the flavour doesn't really matter with such small amounts) and the yeast/honey mixture and blend some more.. Add salt, nutmeg and grate the entire skin of your organic orange. Mix.Add your flour and melted butter, and get your hands dirty (speaking off, probably better you wash your hands first!) and knead that dough!If its crumbly and hard to get to stick together, you can add a bit of orange juice, pressed from the orange you grated earlier.I used ryebread flour, which is why it looks a bit dark. I think wheat flour would have been better though, but I ran out... Onced kneaded into a smooth and even dough, roll it into a sausage shape to fit your tin, place it into your tin and let it rise for 30 min.. 
Once it has risen to almost twice the size, slice a cross with a knife, and bake in a preheated oven at 225\u00b0 for 30 minutes.Let it cool on a cooling rack.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_140_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_140_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_140_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_140_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. -Apples (you can use a variety of apples but i prefer red delicious with gala)-sugar-water-spices (i used cinnamon and nutmeg)-tools-blender/ food processor-an undershirt that you out grew or ripped-knife-pan-cutting board-spoon. wash the applesremove the apple cores and cut them into smaller pieces. toss the apples into a pan and add wateras the apples start to softenpour them out into a separate bowlthe apples should be soft enough for you to stick your finger into the slicesPS: I made some jello while the apples cooked. toss them into your blender or processorand wreak havoc!after the apples turn into apple sauce, dump them out onto your extra shirtand SQUEEZE and twistring out as much apple cider as you canthen after you have a good amount add spices to tasteyou can also add other juices at this time. 
now take those mushy apple fibers and using the same water you used to boil the applesmix them togetherand cook themwhile its cookingadd sugaras most of the sugar from the apples has already been squeezed outnow just as in the last stepring out all the juice you can get from your new mixture. Refine it if you want tomine has some mushy stuff still in itso i poured it back into the shirtand ringed it out againbut not as tightly as the first timeand then i re-spiced the ciderand added a little cream on the top\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_141_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_141_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_141_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_141_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 4-8 sheets of nori (an edible seaweed, usually sold in sheets of 10 or 50) 8 kale leaves a variety of julienned vegetables of your choice: red and yellow pepper, carrots, cucumbers, daikon, asparagus, or avocado optional: tofu or tempeh 2 tablespoons sesame seeds (white, black or a gomazio mix) soy sauce or Braggs aminos for dipping (optional). Lay the nori roll on a cutting board or chopping block. Line the end of one nori sheet with 1 kale leaf, stem removed. Top with a row of vegetables, a few of each kind.Hint: Sometimes people like to add a layer of saran wrap between the cutting board and nori roll, but this isn't necessary if you're careful during the rolling process.. 
Using both hands, roll firmly (like a skinny burrito). Allow some veggies to stick out of the ends, about 1 cm.. Dip your fingers or a pastry brush in water.Spread water on the underside of the loose end, end to end. Continue spreading with water (or soy) until the seam is completely sealed.. Using a sharp knife, cut on the diagonal into 8 pieces. The easiest approach is to cut the roll in half, then in half, and half again.. Assemble pieces on a platter or woodblock, with small dishes of soy or Braggs for dipping. Garnish with fresh flowers, chopped parsley, or a sprinkle of sesame seeds.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_142_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_142_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_142_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_142_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 1 cup of vodka or clear rum per 1.25oz of raw, unshelled almonds. . Put the almonds in a bowl and bring a cup of water to a boil. Pour the boiling water over the almonds and let them sit for about a minute. . Quickly put the almonds in a strainer and run under cold water for a few seconds. After this the skin should be easy to remove. . Give the almonds a rough chop and then put them in your bottle. Pour your alcohol over the the almonds. . Store the future extract in an airtight bottle in a cupboard or somewhere out of the way. Shake the bottle daily. 
In about 1-2 months the almond flavor starts to appear, it gets better with age. After a couple of months you can start using the extract, either by pouring off what you need when you need it or straining the almonds and keeping the extract. \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_143_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_143_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_143_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_143_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Large jar of salad olives\nTwo cans of black, ripe olives\nTwo jars of black kalamata olives\n2-3 cloves of garlic\n1/4 of a medium onion\nThis is all you really need, but you can certainly include plenty of other ingredients. Good ones to try are capers, peperoncini, roasted bell peppers, dried tomatoes, or red pepper flakes.\nNOTE: I'm informed that capers are NOT optional. If you don't use capers, it will still be fabulous, but it won't really be tapenade.. A very sharp knife and a cutting board.\nYou can use a food processor, but your tapenade can get soupy and overworked. Also, I don't have a food processor. If I did, I might recommend it over the knife. Ya never know.\nIf you make this recipe with a food processor, add a comment and let us know how it worked.. These are the manufacturer's rejects. Which is fine, because they're cheaper and you're going to chop them up anyway.\n. Love these olives because they're pitted (yay!) 
and they're cheap (yay!).\nIn addition to these, get two 12-ounce cans of pitted ripe olives. I always get small olives, rather than large or jumbo, on the theory that more small ones will fit in the can than larges ones, and so I'm getting more total olive mass. Ya think?. Drain all your olives before beginning to chop. Be sure your knife is sharp.. Actually, I see some big hunks in there. You want a nice, fine mince.. These look kinda big, too. Who cut this stuff up?!\nEverything you put in your tapenade should be minced very fine.\nFor this quanitity of tapenade, use about 1/4 of a medium onion, or less, depending on how much you like onions. Raw onion is powerful.. Okay, this is is minced nicely. It needs to be nice and fine.\nFor this quantity of tapenade, I use two or three large cloves of garlic, but I really like garlic. You can use less. Or more!. Add hot pepper of any kind that you like, if you like foods spicy.\nLemon is a very good addition, too. Lime would also be delicious.\n(I don't have to tell you not to use salt, do I?). A zester is a very handy item. It gives you strings of the zest which you can use as is to garnish dishes and desserts, or chop fine to put in your tapenade.\nUse a light touch and take off just the yellow zest. Avoid the white, bitter pith.. Zest your lemon before you juice it.. Mince your zest and put it into the tapenade. With this quantity of tapenade, two or three lemons would not be too many.\nBefore cutting the de-zested lemon in half to juice it, roll it on the counter with your palm to break down the juice-containing cels inside the fruit.. This wooden reamer is a handy tool for juicing lemons.\nMix the lemon juice into your tapenade, cover and refrigerate for a few hours to develop flavors.\nIt will keep beautifully in the refrigerator for a couple of weeks, if covered tightly.. Tapenade makes an elegant appetizer for a party. 
Serve with crackers and raw vegetables such as celery and bell pepper.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_144_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_144_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_144_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_144_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. You'll need: Couscous: 1 Tbsp. olive oil 8 oz. (about 1 1/3 cups) Israeli Couscous\u00a0 14 oz. chicken broth (1 3/4 cups), if you want to keep this vegetarian, use vegetable broth or water 2 Tbsp. parsley, finely chopped 1 tsp. rosemary, finely chopped 1 tsp. thyme, finely chopped 1 medium green apple, diced (I love a Granny Smith) 1 cup dried cranberries 1/2 cup slivered almonds, toasted Vinaigrette: 1/4 cup apple cider vinegar 2 Tbsp. maple syrup 1 tsp. salt (I use kosher) 1/2 tsp. ground black pepper 1/4 tsp. cinnamon, optional 2 Tbsp. olive oil. For the couscous: In a medium saucepan, heat the olive oil over medium-high heat. Add the couscous and cook, stirring occasionally until lightly browned, about 3 to 5 minutes. \u00a0Add the chicken broth (I usually add a decent sized pinch of salt too) and bring to a boil. Cover, lower heat to medium-low and simmer for 10 to 12 minutes or until the liquid has evaporated. Set aside to cool. For the almonds: Preheat the oven to 350 degrees F. Arrange the almonds in a single layer on a foil-lined baking sheet. Bake for 8 to 10 minutes or until golden brown, stirring occasionally. 
Set aside to cool. (I toast mine in the toaster oven and it only takes about 5 minutes, you could also dry toast them in a skillet over medium heat until golden brown.). Finely chop the herbs, dice the apple. (I usually peel the apple and slice the apple off the core in quarters, this way I have a flat side for dicing. Slice an apple quarter into 1/8-inch slices, stack them, then make 1/8-inch horizontal cuts, then 1/8-inch lengthwise cuts\u00a0so you have 1/8-inch dice.)\u00a0 For the vinaigrette: In a large bowl, combine the vinegar, maple syrup, cinnamon (if using), salt, and pepper. Whisk in the olive oil until smooth. Add the cranberries and let them sit about 5 minutes so the cranberries soften.. Add the apple and herbs to the vinaigrette, mix well. Then add the Israeli couscous and almonds and stir everything until combined.\u00a0 As I said, this is wonderful at room temperature, and any leftovers can be covered, refrigerated, and served cold up to 2 days.\u00a0 Mangia and enjoy!\u00a0\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_145_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_145_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_145_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_145_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. You will need:1 package bread yeast2.5 cups, all-purpose flour1 tsp. salt2 Tbsp. warm milk1 Tbsp. 
sugar1 cup warm waterAdditionally, you may want some sesame seeds and a little melted butter to finish right before baking.. Combine the yeast, milk, sugar and water. Mix gently and set aside for 15 minutes. This allow the yeast to activate.When it looks frothy, it's ready.. In a larger container, combine 2 cups of flour and the salt. When your yeast is activated, mix the dry and wet ingredients together. When your ingredients are mixed well, sprinkle the remaining flour over the dough ball to prevent sticking. Cover and allow to rise for 1 hour.Optional, This dough can be very sticky. You can spray cooking spray onto whatever bowl you are mixing and rising in to further prevent sticking.. After an hour, punch down the dough and knead for 5 minutes. Add a small amount of extra flour to prevent sticking while kneading. Cover again and allow to rise for an additional 25 minutes. Preheat your oven to 375 degrees F.After the second rising, knead lightly for 5 minutes and divide in three even parts. Roll the parts into snakes and pinch the ends together. Place on baking sheet and braid the pieces together carefully. Allow to rise for an additional 15 minutes. Brush lightly with butter and sprinkle with sesame seeds.Place your loaf in your 375 degree F oven and bake for 30 minutes. Allow to cool, if you can wait, and enjoy! 
Remember, there are no preservatives in this bread, so it will get stale after 3 of 4 days.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_146_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_146_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_146_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_146_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 100g oats 2 bananas 150g pumpkin puree cinnamonPreparation time: 10-15 minfor 2 servings. Cook oats in the water until tender.. Add some cinnamon, 2 bananas, pumpkin puree and mix it all together.. Decorate the top with some fruit. I love to use grapefruit, grapes, banana and peanut butter for this meal.. . 
YOUTUBE: https://www.youtube.com/channel/UC_GmntyQbCokHFwy...WEBSITE: http://www.spineat.comINSTAGRAM: https://www.instagram.com/spineatcook/TWITTER: https://twitter.com/SpinEatCookFACEBOOK: https://www.instagram.com/spineatcook/TWITTER: https://www.pinterest.com/spineat/\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_147_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_147_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_147_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_147_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Avocados are properly ripe for this dish when they're slightly soft, giving a bit under pressure. A few spots are fine. Wash your avocado well, then slice in half, and remove the big pit.. You'll need:1 avocado half2-4 tablespoons apple cider vinegar (to taste)1-2 tablespoons olive oilItalian seasoning - a \"sprinkling\"Salt Pepper. Combine your oil and vinegar to taste inside the well of your avocado half.. Sprinkle with Italian seasoning and salt and pepper - to taste. . You may want to use balsamic vinegar instead of apple cider vinegar, for a sweeter flavor, or use Italian dressing in place of your own oil and vinegar concoction. . Serve alone, as a side dish or with a side of corn chips. It's a wonderful lunch, with only about 150 calories, and high in potassium and fiber. Enjoy! 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_148_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_148_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_148_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_148_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Ingredients: (makes 4 burgers) 8 pieces of bacon (2 per burger) 1 lb. ground beef 2 large beefsteak/other large tomatoes 4 thick leaves of Romaine lettuce 4 thin slices cheddar cheese 2 tablespoons mustard 3 cups minced Mushrooms Salt/pepper for taste FOR BURGER SAUCE: \u00bc cup mayonnaise \u215b cup mustard 1 tablespoon sweet relish 1 teaspoon lemon juice. Prepare your lettuce, cheese, and tomatoes. Cut the tomatoes in thin slices. You should have about 10 - 12 slices of tomatoes. Place the lettuce, tomatoes, and cheese on a tray until further use.. Shape ground beef into patties. First, with your hands, divide the 1 lb. of beef into 4 equal balls. Break the meat up, with your hands, and then push each of the 4 balls down so that they are proper flatness.. Prepare burger sauce by combining the mayonnaise, \u215b cup mustard, relish, and lemon juice in a small bowl.. Mince the mushrooms finely. Put them in a medium bowl for future use.Place a cast iron griddle over a grill and heat it up. Or, alternatively, you can use a flat metal skillet/griddle over your stove top. Once sufficiently heated, cut your bacon pieces in half, so that you have 16 halves, and place on heated skillet. 
Sizzle until just cooked through. Remove bacon from griddle, and place in a bowl, and insulate it with aluminum foil. Do not turn off griddle, and leave the bacon grease hot. Get your mushrooms ready. Dump them on top of hot bacon grease, and saute until mushrooms are very brown. Place them back in their original bowl, and insulate with aluminum foil. Do not turn off the griddle.. Now for the patties. Squirt the 1 tablespoon of mustard on the patties, and preferably smear around with a brush. Then place the thick burger patties on the hot griddle--yes the same uncleaned one you used to cook the bacon and mushrooms on.Sprinkle patties with salt and more mustard, and let cook on one side for two minutes or so.After the two minutes are up, flip the patties over, sprinkle the cooked sides with salt, and place a piece of cheese on each patty. Finish cooking on this side for another couple minutes. After the minute is up, place patties and their cheese on top on a large heat-proof plate, and cover with aluminum foil.. Now it\u2019s time to put together the burger. This part is easy. First, place the bottom burger bun on a serving plate. Then smear the mayo burger sauce over the top. Then place on 1-2 tomato slices. Then 2 halves of bacon. Then one of the cooked patties goes on top, and on top of the patty goes the additional 2 halves of bacon. On top of the bacon goes the sauteed mushrooms. Then on goes the piece of Romaine lettuce, and finally, on goes the last burger bun! 
Repeat 3x for a total of 4 delicious Mushroom Cheddar Bacon Burgers!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_149_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_149_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_149_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_149_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Ingredients and UtilitiesTwo eggs (If you want more eggs on the sandwich feel free to do so) Mayonnaise Two bread slices (I advise whole grain bread, but rye bread is also acceptable) Any kind of cheese (I make mine with Swiss) Deli Ham Optional:Any other meats or cheeses you would like to add (if you do add meat i suggest turkey)ButterFor step 4: A medium sized frying pan For step 5: A panini press. I suggest frying the egg over hard, because when using the panini press to sear the hag, the egg yolk might splatter over the panini press. Put butter in the medium sized frying pan and fry it till the yolk is hard (this is how you cook a over hard egg).. I usually do this as I am frying the egg, but if you do not have very much cooking experience I would prep the sandwich before or after frying the egg. Pick two bread slices of your choice and spread the desired amount of mayo on them. I would put the mayo on both sides for a better cooked sandwich experience. After spreading the mayo, put half slice of cheese on both sides; if you are using a different cheese or you just would like more Swiss, add it. 
Then put two slices of ham, evened out between the two bread slices, in the sandwich.Before I explain the next steps, there are two ways to cook the Hag. Step 4 explains how to pan fry the sandwich. Step 5 explains how to use a panini press to cook the sandwich. Skip to the step that you prefer the Hag, or try both!. Take your frying pan and place it over your fire. Next spread butter around the pan. Take your prepared Hag Sandwich (with the fried egg inside)and leave it on the pan until the bread is brownish-blackish, which adds a crispier experience (or leave it on however long you want it). After frying the Hag, put it on a plate and voil\u03ac the hag sandwich is complete! If you are using this step to cook the sandwich skip to step 6.. Before you start cooking the sandwich with the panini press, heat up the panini press. After the panini press is heated up , add the fried egg to the prepared sandwich and place the sandwich in the panini press, placing the top on after putting the sandwich in the panini press. Leave it for about two minutes (or leave it however long you want), and then flip the sandwich over; be careful though, because the panini press is very hot. Finally, put the sandwich on a plate and you are good to go. By the way, I use a cast iron panini press, but the instructions should be the same if you are using a electric panini press.. After creating the master sandwich, the next step is to finally give your mouth the privilege of feasting on a hag. Your mouth will rejoice in the glorious taste of the sandwich, and your stomach will crave the the hag. I usually leave the sandwich uncut when eating it, but if you want to you may cut the finished product to get a more formal look. 
I don\u2019t normally drink anything with the sandwich when eating, but if you were to drink something with it i suggest a tall, cold glass of milk.Before I leave you with the thought of making this sandwich, I would like to thank my father for teaching me my cooking skills, and the owner of this (https://www.instructables.com/id/The-Best-Turkey-Sandwich-Ever/) instructable for inspiring me to share my recipe to the world.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_150_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_150_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_150_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_150_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. for cookies:\n3/4 cup brown sugar\n1 stick softened butter\n1 egg\n1 teaspoon vanilla\n1 cup flour\n1/3 cup cocoa\n1/2 teaspoon baking powder\n1 cup semi or bitter sweet chocolate chipsglaze:\n1/2 cup powdered sugar\n2 tablespoons boiling water\n1 tablespoon cocoa powder\n1/2 teaspoon peppermint extract\nYou can use white sugar for these, but they won't taste as amazing. The brown sugar really gives them extra oomph. :D\nYou'll also need an oven preheated to 350 F and a baking sheet with parchment paper lining it.. Cream the butter and sugar together. Add one egg and a teaspoon of vanilla and mix that in.\nOnce everything's nicely combined, mix in the flour, cocoa and baking powder. 
I got lazy and just poured them into a sieve instead of combining them in a separate bowl before. :)\nNow add one cup of chocolate chips and mix them in.. Place twelve rounded tablespoonfuls of dough on a baking sheet and bake for 12 minutes at 350 F. You'll do this twice. :)\nPut the dough in the fridge between batches - or if you're fancy and forward thinking, do two baking sheets at once!\nOnce they're done, let them cool on the baking sheet until they're easy to handle, and then transfer them to a baking rack.. Put the cocoa powder and powdered sugar through a sieve to eliminate any clumps. Add in 1/2 teaspoon of peppermint extract and 2 tablespoons of boiling water. Stir well and then pop in the microwave for 30 seconds and stir again. It should be well combined now.\nAt first, the oil will float on the top, but as it cools it will mix in. As soon as it stops floating to the top, you know it's cool enough to glaze the cookies with.\nMake sure your cookies are cool before glazing!\nYou can either dip the cookies into the glaze or drizzle it on top. I like to do a few coats to make them extra pepperminty. 
:D Make sure you glaze them over a baking rack with wax paper below it - it's messy!\nThe glaze will be sticky for a little bit, but within a couple hours it'll harden up and look nice and glossy.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_151_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_151_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_151_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_151_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. INGREDIENTSQuick Pickled Cucumbers Sauce1/2 cup Rice vinegar\n1/2 cup Filtered water\n3 1/2 Tablespoons sugar\n2 Persian large Cucumbers, thinly sliced\n1 small Shallot\n1/8 teaspoon dried Chilli flakes\n1 whole fresh Thai chilli, chopped or whole depending on your spice preferenceTod Mon 1lb Chicken breast or thigh, cut into two inch pieces for easier food processing 1 lb Shrimp, De-shelled and de-veined 3-4 Tablespoons Thai green curry, home made or store bought3/4 teaspoon Sugar1/2 Teaspoon Fish sauce1 large egg1/2 large Onion, small dice, 125 grams12/ bundle of long green beans, 140 grams1/2 bundle Cilantro rough chop, stems and leaves, 45 grams25 Kaffir lime leaves, De-veined and thinly sliced, 9 grams 2 Tablespoons High heat oil for cooking. This Sauce is a wonderful compliment to the flavors of Tod Mon. You can have a few slices of cucumber with each bite of Tod Mon, which is how I like to do it, or you can simply dip it in the vinegar juice.. 
I like to use a mandolin to slice the cucumbers and shallots because you can set it to cut super thin, which works best for the quick pickle process. Slice and set aside. Bring your vinegar and water to a heavy simmer. Add sugar and stir until sugar has melted. Place cucumbers, shallots, dried chilli flakes and fresh Thai chilli in a mason jar. Pour the heated liquids over veggies and cover. Set aside for later. . Onions - chop your onion small. Small to medium dice is best.Green beans- cut the tips off. line them up together and cut them into 3/4 inch pieces. I find 3/4 inch pieces to be the best size for this recipe because if they are too big they tend to fall out of the balls and if cut too small, they get lost in the ball and you loose their great texture.Cilantro- I use the whole herb. Stems and leaves. The flavor is the same and they have a nice texture. I'm not one to waste, so unless you're saving the stems for a stock, use them.Kaffir Lime Leaf- you can either pull the leaves off by grasping the stem between your fingers and pulling down on the leaves or just take a sharp knife and cut along the stem. Gather a pile of half leaves and stack them the same direction. Cut thin slices along the shortest distance. Set ingredients aside. In a food processor add the chicken, shrimp, sugar, egg, fish sauce and curry. Process until well chopped but with some small chunks here and there. This part is really preference. I like mine a little chunkier. The kind you find in restaurants are generally well processed without chunks.. Empty the contents of the food processor into a large bowl. Add onions, green beans, cilantro and the chopped Kaffir lime leaf. Mix by hand until all ingredients are well incorporated and evenly distributed. . The best way to make the balls is by using two spoons. Take a spoon full with one spoon and use the other to shape it. when done shaping use that second spoon to pull your newly made ball into the deep fryer or frying pan. . 
You can cook it any way you like but the two best methods are stove top and deep fryerStove top- use a high heat oil like coconut or grape-seed oil. Over medium heat drop your spoonful onto a heated and oiled pan. Cook about a minute and a half or until browned then flip over and use a spatula to squish it down a little. Cook until browned on the other side. Set aside on paper towels or a paper bag.Deep fryer- Set it at 350 degrees. When oil is ready, drop spoonfuls into the raised basket. Lower into the oil and fry for about a minute and a half or until browned, moving them as not to stick to the bottom. Set on paper towel or grocery bag.. Serve with Cucumber sauce, sliced cucumbers and green beans on the side.Enjoy and thank you...\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_152_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_152_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_152_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_152_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 
This project calls for adding a very small amount of dry ice to a closed container.\u00a0\u00a0\u00a0 This is only safe-ish because we are adding such a tiny (and well measured) amount.\nAdding too much dry ice - or using the wrong container may result in an explosion.\u00a0 In the worst case - this could seriously injure or possibly even kill you.---MANDATORY READING---\nHOW TO AVOID BLOWING YOURSELF UP:Never use a container other than a 2-liter plastic soda pop bottle-Do not use a glass container!!!!\u00a0 You really might kill yourself.\n-Do not use a smaller 16 or 20oz plastic pop container - these are not as robust - and may explode (trust me)\n-Do not use a bottle that seems damaged / stretched from pressureNEVER place the 2-liter soda pop bottle in the freezer\n- This may cause it to shatter / blow up in your face (trust me)Pay close attention to the measurements of dry ice\n- The provided measurements will be in teaspoons / grams\n- Do not confuse this with tablespoons / ounces\n- Do not add more dry ice than is specified\nWatch this video of too much dry ice in a 2-liter bottle blowing up a cinder block:. A Bag of Loose Yerba Mate\nAny brand is fine\nCheck your local Latin American grocery store\u00a0 - \u00a0they commonly have mate about 75% cheaper than other places\nAlso available at many grocery / health food stores or where loose tea is soldDry Ice\nOften available from the meat department of your grocery storeDo not handle directly - use gloves! 
\u00a0It can burn you quickly!\nWe'll only be using about 6-8 grams per bottle - but since it evaporates - get at least a pound or two2-Liter Plastic Pop Bottle\nNo substitutions allowed!\nBottle should be undamaged / in general good condition\nIf you want to end up with individual servings - carbonate in the 2-litter bottle - then transfer to other containers1/2 Teaspoon Measuring Spoon\nYou'll need other measuring cups / spoons - but this one is important!Hammer\nTo crush dry ice (may want to wash it if it's been sitting in your toolbox)Sturdy Metal or Plastic Bowl\nTo crush the dry ice in -\u00a0I use a small plastic measuring cup\nIf it's plastic - be prepared that it might break.Do not use glass / ceramic!French Press\nIf you don't have one - you can probably figure out an alternativeSugarLemon Juice\nLime juice also works. Dry ice is frozen CO2.\nBeverages are carbonated by dissolving CO2 into them under pressure.\u00a0One source suggests a 2-liter bottle of Pepsi contains roughly 10 grams CO2.\nTo allow a little extra safety - we're going to target 6-8 grams per bottle. \u00a0This also seems to be a nice, light carbonation level for mate.\nHowever - the 2-liter bottle isn't designed to handle the pressure required to carbonate - only to store carbonation.\nIf you added 8 grams of dry ice at once to the bottle - much of the resulting CO2 gas would not immediately dissolve into the liquid. \u00a0This would create more pressure than the same amount of CO2 would if it was already dissolved into the liquid.This level of pressure could cause an explosion!\nInstead - we will be adding a small amount (2 grams) of CO2 (dry ice) in several small doses. \u00a0Each time assuring the CO2 has fully dissolved into the liquid before adding more.\u00a0\nYou won't need a precision scale to measure the dry ice. \u00a0We'll be crushing it to a snow-like powder which reliably weighs about 2 grams per 1/2 level teaspoon.\nLet's stop and note that measurement now. 
\u00a0That's 1/2 level teaspoon. \u00a0Do not confuse this with tablespoon. \u00a0Do not use a \"rounded\" teaspoon.\nThe powder also provides a large surface area - and dissolves into the liquid quickly.\nCO2 dissolves more readily into cold liquids. \u00a0We don't need our mate ice-cold - but it needs to be at least cooler than room temperature to get good carbonation results.\nImage credit:\u00a0Arie Melamed-Katz. This page includes several references as to the caffeine content of yerba mate:http://www.erowid.org/plants/yerba_mate/yerba_mate_chemistry2.shtml\nIn short - using hot water to extract caffeine from mate typically yields 0.5% to 2.0% caffeine by weight. \u00a0However - most test results came in right around 1%. \u00a0We'll use that number to guesstimate.\nSo - we're making 2-liters (67.6oz) of mate in this recipe.\nLet's call that 4 generous 16.9oz servings.\nMy own measurements put 1 cup mate (volume) at 72 grams (weight).\nLet's say we put 1/2 cup (36 grams) of mate into our 2-liter batch:\n36 grams * 1% = 360 milligrams total caffeine\n360 milligrams / 4 servings = 90 milligrams per 16.9oz serving.\nFor reference Club Mate comes in at 100mg caffeine for the same size serving! \u00a0That's certainly within our margin of error.\nHowever - to get this level of caffeine - Club Mate adds extracted caffeine to their recipe. \u00a0So - using this amount of mate in our recipe yields a little \"richer\" flavor.\nMate varies in strength and density -\u00a0 be aware that it's possible these guesstimates are off by a factor of 2x in either direction.\u00a0 If you are caffeine sensitive - avoid mate.\nFor further reference - drip coffee comes in around 145mg caffeine for an 8oz serving.. 
Here are a few recipes to try - each one makes 2-liters mate (additional instructions in next steps)\nThe amount of sugar and citrus are based on a combination of research and my own taste.Mate Light\nSimilar in flavor to Club Mate - little less caffeine (60mg per serving)\n1/3 cup mate\n1/3 cup sugar\n1 teaspoon lemon juiceMate Extra\nA bit richer tasting than Club Mate - with similar caffeine (90mg per serving)\n1/2 cup mate\n1/3 cup + 1 tablespoon sugar\n1 1/4 teaspoon lemon juiceImperial Mate\nMore Flavor! More Caffeine! (135mg per serviing!)\n3/4 cup mate\n1/2 cup sugar\n1 1/2 teaspoon lemon juiceInsert Your Recipe Here\nTweak the variables a little - or do your own thing.\nAdd ginger, raspberry, chile powder!Note: Large amounts of strong mate can make your tummy unhappy.\u00a0 You've been warned.. Fill the 2-liter pop bottle half full of cold water (about 4 cups).\nRun the faucet a while to get the water as cold as possible.\nWhen done - put the bottle in your\u00a0refrigerator\u00a0to further chill.DO NOT PUT THE BOTTLE IN THE FREEZER\nThis will cause it to become brittle - and may shatter / explode when carbonating.\nNote - I don't actually recommend taking the bottle's label off as shown in the photo. \u00a0It leaves a sticky residue.. Pick which recipe you want to use from the prior step - then put all the ingredients into your French Press.. Add roughly 3 1/2 cups not-quite-boiling water to the French Press.\nIf you boil water - then let it sit a few minutes afterwards - you'll get the temperature about right.\nDon't worry about the exact temperature or amount of water (just not more than 4 cups - or you might run out of room in the bottle later).\nUse a spoon to stir everything (make sure the sugar isn't sticking to the bottom).\nNote: I have a giant French Press (thanks Beth!) - yours will probably look more full with the same amount of liquid.. 
Plunge the French Press after 5-15 minutes of steeping.\nCaffeine extracts pretty easily - you'll probably get most of it within a few minutes.\nI commonly plunge the French Press after about 7 minutes.\nBut - going a bit longer may get you a tad more caffeine - and more mate flavor.\nIt's up to you!. In order to effectively carbonate - you'll need to get the temperature of the mate in the French Press down to around room temperature.\nTo speed this process up - fill your sink with cold water - then let your French Press sit in it.\nYou should find it close to room temperature within about 15 minutes.\nWhen it's no longer warm to the touch - you're ready for the next step.. Pour the contents of the French Press into the 2-liter pop bottle.. Fill any remaining space in the 2-liter bottle with cold water. \u00a0Leave as little air in the top of bottle as possible.\nYou may optionally further chill the mate in your\u00a0refrigerator\u00a0(not freezer!) at this point. \u00a0Doing so may provide a bit better carbonation results. \u00a0The colder the liquid - the more easily the CO2 will dissolve into it.. Place a few small chunks of dry ice into your crushing container.Use gloves - the dry ice can burn you!\nLightly crush the dry ice using the hammer (you don't need to use a lot of force).\nContinue crushing until the powder resembles course snow.. Do this step over the sink! \u00a0If you don't get the cap back on quickly enough - the mate may overflow. \u00a0It may also overflow when opening the bottle. \u00a0The colder the mate - the less likely things will overflow on you.Do not add all the dry ice at once!\u00a0 It will explode.\nWe are going to be adding 1/2 level teaspoon (2 grams)\u00a0of dry ice at a time.One last time - that's 1/2 level teaspoon - \u00a0not tablespoon. \u00a0Tablespoons are bigger - and may in this case cause an explosion.And again - 1/2 level teaspoon. \u00a0Not a rounded 1/2 teaspoon.\nPerform the following steps quickly:\n1. 
Measure 1/2 level teaspoon of the dry ice powder\n2. Dump the 1/2 level teaspoon of dry ice into the 2-liter bottle3. Immediately place the cap back on the bottle4. Immediately shake the bottle\u00a0vigorously!\n5. Place thumb on bottle - note slight pressure building in bottle (keep shaking!)\n6. Once the pressure has peaked, then subsided - you may stop shaking\n7. Slowly remove cap from bottle (don't be alarmed if it fizzes over a bit)\nWhen done - repeat this process twice more. \u00a0Powder more dry ice if needed.\nIf you want more carbonation - you can repeat one additional time.. At this point the mate is probably a bit cooler than room temperature - drinkable - although would probably benefit from some time in the fridge.\nEnjoy your \"Tub Mate\" - but once you're comfortable with the process (or before) - hack up your own modified version!\nI (and others) have found lime and ginger make great additions to the recipe. \u00a0You probably have your own ideas.\u00a0There's lots of people making mate - check out their recipes and get inspired!https://www.noisebridge.net/wiki/Sudo_pophttps://github.com/jdukes/ShadowMatehttp://www.hackpittsburgh.org/brewing-open-mate-sodahttps://gist.github.com/1054447Blatant Self Promotion\nIf you like projects involving radioactivity, lasers or robots you should check out\u00a0http://nothinglabs.com\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_153_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_153_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_153_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_153_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third 
image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 1 cup of wheat gluten1 cup of water1 spoon of flour1 spoon of beer yeast1 garlic cloveshoyu (soy sauce)saltpepperherbs, seeds and spices as you prefer (try parsley, poppy seed and nutmeg). Put all the dry ingredients into a bowl and mix them with a spoon. Also add the spices and herbs in this step.. Now you are ready to add a little bit of shoyu, the more you add the more brown will look the seitan, but depends on your taste. I like with lots of shoyu.After you'll add the water. Add without stir, and don't add all at the same time. Wait till the wheat gluten \"absorb\" all the water. If you see any solid put more water and mix kindly.The result needs to look something like the \"thing\" on the photo below.. Now you need to cook the seitan! Put water boiling and add some herbs if you like and a little bit more of salt. Add the pieces of seitan. If you have a big piece of seitan separate them into 2 pieces.Let the seitan cook at least half an hour.. When the seitan is cooked, put it into a slide with a pot full of water above, to make some pressure over the seitan and that way the excess water will go apart!After that you can store your seitan in the fridge (in a container full of water, shoyu and garlic) or in the freezer. 
Normally I put some on the fridge to use next days in some wonderful recipe, and I store other piece on the freezer to use later on.And that's it!I hope you like!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_154_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_154_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_154_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_154_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Ingredients:Four Bananas One heaped tablespoon of Cocoa One teaspoon of Vanilla Essence You'll also need:Knife Handheld Blender ( I would actually suggest using a normal blender) Spoon Container with lid JugIf you only have bananas, you can also make this... just follow the recipe, but leave out the vanilla essence and the cocoa. The frozen bananas are the base... what you combine then with it can be actually anything of your choice.. Cut the bananas into smaller pieces, because when your going to mash it, you don't want to break your blender.. Add your chopped bananas to your container and freeze for about 6-8 hours.. After freezing the process is quick. Blend the frozen bananas,cocoa and vanilla essence all together till smooth and creamy (like Ice cream). When blending, take it slowly, you don't want to break your blender! Tip: I would actually suggest using a normal blender.. Enjoy your healthy choc-banana alternative to ice-cream! For a more sweeter version, replace the cocoa with some chocolate spread! 
That won't be so healthy, but it will be much more sweeter! You can also just leave out the cocoa. Feel free to leave a comment, thanks for taking your time to look at my Instructable and Please Vote!...and visit our profile!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_155_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_155_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_155_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_155_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. For this salad, I used one package of quinoa, one bunch of kale, a package of cranberries and about a cup of feta. You can use different amounts of each, depending on what you like. I also used a few teaspoons of olive oil.I used this brown rice and quinoa mixture because it's all they had at Safeway, but you can use regular quinoa if you can find it.. The directions on my package of quinoa say to bring the quinoa, 1 and 3/4 cups of water and a teaspoon of olive oil to a boil, then turn heat to medium, add the packet of spices and cover for about 15 minutes. All the water should be absorbed when it's done.. Rinse your kale, cut off any brown ends you find and cut the rest of the kale into smaller pieces.. While the quinoa is still warm add it to the kale and mix together. Add in the cranberries and feta and toss. . 
Dress the salad with olive oil and lemon juice and enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_156_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_156_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_156_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_156_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Use five pieces of cabbage.. The cabbage need to be diced.. Prepare a white leek.. The leek need to be diced, too.. Prepare ground pork.. Season with soy sauce, sesame oil.. Mix it well.. Like this!!. Wrap meat in dumpling skins.. Wrap meat in dumpling skins.. Finished preparing the Chinese dumpling for cooking.. Fry it!!. 
Please subscribe to my channel!!BALISAGE Cooking\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_157_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_157_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_157_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_157_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. \n\t1 1/2 pounds of ground beef\n\t1 cup dried bread crumbs or corn flakes\n\t4\u00a0 Tablespoons brown sugar\n\t2 Tablespoons mustard\n\t1/2 Cup ketchup\n\t1 cup milk\n\t1 beaten egg\n\t1 Diced onion\n\t1/2 Cup diced carrots\n\t1 Cup diced celery\n\t1 Diced bell pepper\n\tSalt and pepper to tasteUtensils:\n\tBowls, measuring cups, loaf pan, knife, cutting board, spoon, whisk or fork, and oven.. \n\t1 1/2 pounds of ground beef\n\t1 cup dried bread crumbs or corn flakes\n\t4\u00a0 Tablespoons brown sugar\u00a0\n\t2 Tablespoons mustard\n\t1/2 Cup ketchup\n\t1 cup milk\n\t1 beaten egg\n\t1 Diced onion\n\t1/2 Cup diced carrots\n\t1 Cup diced celery\n\t1 Diced bell pepper\n\tSalt and pepper to taste. \n\tPreheat oven to 350 degrees.\n\tIn large bowl combine the beef, beaten egg, onion, Milk, and the bread crumbs or crushed cornflakes.\n\tSeason with salt and pepper to taste. \u00a0\n\tCombine the brown sugar, mustard and ketchup in a bowl.\n\t\u00a0. \n\tPlace the meat loaf into a lightly greased loaf pan.\n\tShape the edges.\u00a0\n\tSpread the brown sugar mixture over the top.\n\tPlace it in the oven for approx 1 hour. 
\u00a0We are at 3500 feet and it takes mine longer than an hour to cook until well done. \u00a0\n\tSave the left over brown sugar mixture and use as ketchup at the table for those who love the flavor.. \n\tI have used the base of this meat loaf recipe using different vegetables from left overs and garlic or other seasonings for a change.We always go back to my original, for the wonderful flavor and texture; as our favorite meat loaf recipe. \u00a0I do not speak for my daughter though. She makes it for her husband \"The Condiment King,\" who likes a little hamburger with his condiments! \u00a0She also uses corn flakes instead of bread crumbs. The texture is not as firm as mine. It is fun to experiment though. Meat loaf could be sliced ahead and taken to a\u00a0picnic for a quick slap on the grill.\u00a0\u00a0\u00a0\n\tIn closing I would like to thank our instructables company, sponsors, authors, readers, and members; for making this community a great success! Many hours and hard work has been put into making this place the best DIY on the Internet. Have fun and thanks for stopping by!\n\tSunshiine\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_158_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_158_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_158_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_158_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 
For this instructable you will need only minimal, readily available ingredients:-1 jar baby fruit puree, around 100-120g-1 tsp gelatin powder-50ml boiling water-spoon to mix ingredients-jar to set and serve jelly. This is crazy easy, and simple to adapt. -Dissolve the gelatin in boiling water in your serving jar. Spoon out any stubborn chunks of gelatin that don't want to dissolve, they're just a pain and too small to make a noticeable difference. If you want a firmer jelly add another 1/2 tsp gelatin or use 1/2 teaspoon less for a very soft jelly. -Add in your puree and stir well.If it doesn't all come out pour some of the jelly mix back into the jar and give it a shake. This will loosen the puree and it should all pour it and mix evenly now. -Leave it to set in the fridge. Four hours is usually a good minimum setting time. . There it is, it's that simple! Use any puree you want, fruit or veg. It's a nice change for your little one, but still comforting to them as it is the flavours they know and love. I've made this with my own purees too. My favourite is to stew finely chopped apples and pears with a hint of cinnamon and a splash of vanilla, sugar isn't needed as the fruit is sweet enough. I cook until soft and chuck it in the blender until it reaches the right consistency then add in the gelatin mixture. . Full credit for this instructable goes to Rafferty's baby products. 
The original recipe can be found here https://www.raffertysgarden.com/snacks-and-treats I hope you and your little one enjoy and please leave questions, suggestions and experiences in the comments :)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_159_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_159_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_159_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_159_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. \n for the cake:\n\t\t2 eggs\n\t\t2/3 cup white sugar\n\t\t4 tablespooons pineapple juice (I used orange juice since I used fresh pineapple!)\n\t\t2/3 cup all purpose flour\n\t\t1 teaspoon baking powder\n\t\t1/4 teaspoon salt\nfor the brown sugar topping:\n\t\t2 tablespoons of butter/margarine (I used Earth Balance)\n\t\t1/3 cup brown sugar (packed)\n\t\tcan pineapple rings (or fresh pineapple!)\n\t\tmaraschino cherries\nThis recipe makes 8 cupcakes!\nYou'll also need a muffin tin, sifter, nonstick cooking spray, and an oven set to 350 F. :D\n . Melt the butter and brown sugar in the microwave - 30 seconds should do it. Once melted, mix it well!\nSift the dry ingredients.\nMix the wet ingredients thoroughly!\nAnd preheat the oven to 350 F. . Once the wet ingredients are mixed well and the dry ingredients are sifted, you can mix them together.\nPour the dry into the wet, and mix until everything is combined and you can't see any clumps of dry ingredients. :). 
Spray your muffin tin with nonstick cooking spray. :)Spoon in a little bit of the butter/brown sugar mixture into the bottom of each cup - it ends up being a little less than a tablespoon each. Then place a maraschino cherry in the middle and surround it with bit of pineapple.Now you'll spoon the cake batter over the pineapple - fill the tin almost to the top!. Bake in a preheated 350 F oven for 20 minutes. Test their doneness with a toothpick - it should come out of the middle of a cupcake nice and clean!\nOnce they're done, you need to let them cool for at least 15-20 minutes, preferably on a wire rack. :) If you try to take them out of the pan right away, you'll probably lose your toppings because they'll be too hot.\nGo around the edge of each cupcake with a butter knife to help loosen them. Then cover the muffin pan with a large plate or a cutting board and flip it over! This should let them come out nice and easy, and in one piece! You can see one of mine did a flip. :P That's okay - just reassemble it carefully. :D. Because of the topping, I'd recommend keeping these in the fridge if it's warm where you are! They keep well for a couple days, but they have a tendency to dry out a little after that since the batter doesn't have any fat in it. 
:)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_160_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_160_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_160_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_160_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Edible Supplies:\t\tCupcake Mix (any flavor)\t\tFrosting (any flavor, but you'll need it to be green)Mini Oreos (I bought 2 $1 packs from Target and that was plenty)\t\tNormal Sized Oreos Brown Candy Melts (you'll need very few) Green Candy Melts (you'll need very few, way less than pictured) Orange Candy Melts (I used a whole bag to dunk all of the mini oreos shown and three big oreos) Brown SprinklesSupplies You Shouldn't Eat:\t\tCupcake Pan\t\tCupcake Liners\t\tContainer to melt melts\t\tCling Wrap\t\tZip Lock Bag\t\tWilton Piping Tips, I used the #2 round and #5 round\t\tWax Paper. You'll need to make your cupcakes. \u00a0There is nothing special there, just make them and give them time to cool.\nTo get ready to dunk your Oreos, get a piece of wax paper out. \u00a0You will need a pretty big piece if you do as many as I did. \u00a0Then, in a container,\u00a0carefully\u00a0melt your candy melts. \u00a0Gather your Oreos and you are ready to go!\nMy initial idea for dunking these Oreos, was to stick toothpicks in the cream and then dunk them like that so I could easily remove the toothpick after they were dunked. \u00a0This does not work. 
\u00a0Because the candy melts are so dense, they cause the toothpick to act as a lever and instead of dunking it in and taking it out all nicely, it pries your Oreo apart and causes a mess.\nSo, to dunk my oreos, I just threw them in the candy melts and used a fork to get them out. \u00a0Sometimes they had too much coating and when they did I would lightly press them against the side of the bowl to get off some of the excess. \u00a0Once they were dunked, I carefully set them on the wax paper. \u00a0I put them so they were standing on their side if I could.. Use the same method for melting candy melts and getting them ready with the wilton tips as I did with the Skeleton Cupcakes\u00a0(Step 3). \u00a0You will not need many candy melts of green or brown at all. \u00a0You are only doing small details and it goes a long way. \u00a0I had extra after I did everything and so I drew out chocolate bats and did green vines, which I did use later.Stems:\nHeat up your chocolate candy melts first. \u00a0Prepare a ziplock bag and you will be using a #4 round tip. \u00a0I show in the pictures above how I did the stems. \u00a0It's fairly simple. \u00a0All I really tried to make sure I did was got a nice thick stem that sort of stuck up. \u00a0Their stems aren't always that long, so you just need a little stubby one on top.Vines:\nHeat up your green candy melts for your stems. \u00a0I used 12 pieces and it was\u00a0definitely\u00a0enough. \u00a0Now just draw some vines on your pumpkins. \u00a0I did a couple leaves using the same method as the stems, except, in stead of pulling up and away from the pumpkin, I kinda of went along the pumpkin. \u00a0You can see a little leaf in Photo 5. \u00a0With your extra green, draw some vines on your wax paper. \u00a0I put these on some of the cupcakes later, just for a little extra something, something.\n*Tip: Since you don't really get the zip lock dirty because the candy melts are wrapped in cling wrap, you can use both corners of the bag. 
\u00a0Then you only need one bag to do the stems and the vines.\n**Another Tip: Make sure when you put the melts in the cling wrap, that you really twist the ends and get the candy melts all grouped in the middle. \u00a0Otherwise they will spread out in the cling wrap as they melt and as you smush them.. Now all you need to do is frost up your cupcakes. \u00a0Throw on some sprinkles and put on a pumpkin or too. \u00a0Do not press the pumpkin in like you did with the bones in the Skeleton Cupcakes. \u00a0This won't push them in the cupcake because the pumpkins are too fat. \u00a0This will just make a mess of the frosting. \u00a0Just set them on top. \u00a0They should stay fairly well. \u00a0The more frosting you use the better,\u00a0because\u00a0while they won't push into the cupcake, you can bury them in the frosting. \u00a0I put some more sprinkles around the base of the pumpkin once it was on the cupcake.\nFor the Great Pumpkin, you are going to need to cut a slice out of the pumpkin. \u00a0See photos 8 - 10. \u00a0Once you cut out the slice and frost it, make sure you remember where it is because it is hard to tell once the cupcake is frosted :)\nNow you can put your pumpkins on all of your cupcakes and throw some vines in as well. \u00a0I tried to make it look like the vines were coming from under the pumpkins (though, I know the vines would be around the stems).. I always take so many pictures of my\u00a0finished\u00a0projects to get just the right one. 
\u00a0So I am sharing a bunch with you here :)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_161_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_161_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_161_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_161_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. This popcorn ball recipe produces soft and chewy Jello flavored popcorn balls. There are relatively few ingredients and should take less than 20 minutes to make. Popcorn Ball Ingredients:+ 6 Tbsp. butter or Margarine + 3 c. tiny marshmallows+ 1/2 package of 3oz. (3 Tbsp.) Jello mix+3 quarts unsalted popcorn (about 1/2 cup of popcorn kernel)For Ornaments:+ saran wrap+ Ornament hooks (usually sold in the 100 packs for 99 cents)Misc. things you will also need:+ popcorn popper or large pan and oil to make popcorn+ 2-3 large mixing bowls+ Spatula and mixing spoon+ Microwave+ wax paper to set treats onOriginal Recipe from Chris Tanaka. First Wash Your Hands Good, because nobody likes food borne illnesses or diarrhea. Pop about 1/2 cup of popcorn kernels. I used a popcorn popper but you could also use a large pot and some cooking oil if you do not have one. Now use your hands to sift through the popcorn to make sure you there are no stray un-popped popcorn kernel. Transfer the sifted popcorn to a separate bowl. Measure out at least 3 quarts of popcorn, a little extra is fine. 
'WARNING!!!'Un-popped popcorn kernel that find themselves in the in the popcorn balls can crack teeth. So it a good idea to hand sift the popcorn from one container to another to avoid a painful and costly accident. . Add 6 Tbsp. butter or Margarine, 3 c. tiny marshmallows, 1/2 package of 3oz. (3 Tbsp.) Jello mix in a bowl. You can use any jello flavor or color you like. I made a double recipe of lime and a single recipe of cherry jello. Put mixture in the microwave for 40 seconds. Mix the ingredients with a spatula and microwave again for about another 40 seconds. Make sure all ingredients are mixed together thoroughly and the butter and marshmallows are melted. You may need to put it in the microwave another 20 seconds or so but try not to over microwave or they may not stick together. It should have a thick syrupy texture.You will have to work quickly. Pour the mixture over popcorn and mix well with a large spoon.Once the popcorn and mixture is thoroughly mixed, butter your hands and start forming popcorn balls. The butter prevents the sticky mixture from sticking to your hands and adds a hint of buttery goodness. I think one recipe would make about 10-15 balls, but it really depends on how big you make them. WARNING!!!'Un-popped popcorn kernel that find themselves in the in the popcorn balls can crack teeth. So it a good idea to hand sift the popcorn from one container to another to avoid a painful and costly accident. . Of course you could just eat these they way they are, but they would also make great Christmas ornaments. You will need some saran wrap and ornament hooks to complete this project.Double wrap each popcorn ball with a small piece of saran. You could single wrap them but the problem is that saran wrap can come off easily and then your popcorn balls will be on the floor. 
Now stick the hook through the saran wrap and hang them on the tree.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_162_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_162_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_162_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_162_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Recipe: For 2 personsIngredients: For the grilled courgettes & aubergines:2 medium courgettes, cleaned, thinly sliced 2 medium aubergines, cleaned, thinly sliced 2 tablespoons chopped lemon thyme, no big stalks, please! 3 fat cloves of garlic, peeled finely chopped a fruity extra virgin olive oil black pepper pink salt in a grinder For the oven roasted potato fries: 4 white potatoes, peeled & cut up into thicker fries black pepper pink salt in a grinder that same fruity evoo, as used above For the feta: marinated feta chunks, drained For the lemon thyme dressing: 6 tablespoons of that same fruity extra virgin olive oil 2 tablespoons fresh lemon juice 1 tablespoon chopped lemon thyme black pepper pink salt in a grinder. Marinate the sliced courgettes & aubergines for about 2 hours with the marinade ingredients. Take a small bowl a add lemon juice, chopped lemon thyme, black pepper, pink salt, chopped garlic pieces & that fruity evoo. Mix well with a small whisk & pour this all over the sliced courgettes & aubergine slices. Brush it all on with a silicon brush.. 
About 40 minutes before dinner, preheat your oven to 200\u00b0C ( 400 F) for 10 minutes. In a fitted oven dish, arrange your potato fries. Scatter 5 grins of black pepper & 5 grins of pink salt over it. Drizzle about 6 drizzles of that fruity oil over them. with clean hands, mingle everything together & roast for about 25-30 minutes into the hot oven.. Heat up onto high heat your grill pan. When hot, brush your slices with the same marinade on one side & place them onto the hot grill. With a fork, push the slices onto the hot grill. Grill them for a few minutes & then brush this upper side with the marinade & turn them over to grill the other side. This side only takes a minute or 2. Place them onto a big plate layered with double kitchen paper to dip off the excess of oil. Place another big plate over it to keep warm.. While the potato fries are in the oven & your grilling is nearly ending, make your easy dressing! Take a jam pot with fitted lid & add all ingredients for your dressing in it. Screw lid on & shake your jam pot up & down for a few times. This way, your dressing is well mixed. Taste! You must taste the lemon & the chopped thyme! Yum yum!Plate up, like photo above & just before serving, drizzle a bit of the yummy dressing all over your dish, in a circle! MMMMM! This is lovely enjoyed with a crisp white wine, like this one 2009 Jacob\u2019s Creek Riesling Reserve, South Australia. A perfect partner! Enjoy, sweeties! 
xxxYou can also find this tasty recipe here on my blog: http://sophiesfoodiefiles.wordpress.com/2016/10/16/vegetarian-grilled-veggies-dinner/\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_163_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_163_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_163_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_163_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 4 boxes of lasagna noodles\n4 1 lb. packages of firm tofu\n4 1 lb. packages of soft tofu\n1 box or large bag of spinach leaves washed, dried and julienne\nolive oil\njuice from two lemons or 1/3 cup lemon juice\ngranulated garlic\nsalt and pepper to taste\n1/2 cup of soy milk\nand some homemade lasagna sauce\u00a0 https://www.instructables.com/id/Spaghetti-Lasagna-Sauce/\nfor the veggie wash you will also need some baking soda\na food processor is great but if you don't have one a sieve or potato masher will work as well. Drain the tofu. Don't discard it. I put it with my water that I use for my garden. Chop it up and set aside.\nIn the food processor you are going to add:\ntofu\nlemon juice\nmilk\ngranulated garlic\nsalt and pepper\nI did it in little batches. Add more milk or lemon juice if you need to. If you can only find extra firm tofu, you will have to add more lemon juice and milk.\nAfter it is all blended and smooth add the spinach. Side note: I wash my veggies with a bowl of water and add a couple or more tablespoons of baking soda. 
I let the veggies sit in the water for about fifteen minutes and carefully remove them so that the yucky stuff stays at the bottom. For the spinach I just laid it out on a kitchen towel to dry before I cut it up.. \nCook the noodles in small batches to prevent them from breaking. But if they do don't worry about it. I cook the noodles last because if they sit too long they tend to stick together. Even if you add oil to the pot of water.\u00a0 If your water is boiling nicely it should only take from 8-12 minutes for them to cook.. \nNow for the fun part!\nYou are going to layer the ingredients in the following order till it is all gone!\nolive oil\nsauce\nnoodles\n\"cheese\"\nrepeat\nBake at 350 degrees till it is all heated through. Probably about thirty minutes. I top mine with equal parts bread crumbs and parmigiana cheese before baking.Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_164_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_164_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_164_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_164_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. -CAJUN SPICE MIX-1 tsp Cumin2 tsp Salt2 tsp Garlic Powder2.5 tsp Paprika2 tsp Black Pepper1 tsp Onion Powder1 tsp Cayenne2 tsp Oregano-FISH FILETS-4 Medium Tilapia FiletsOlive OilCajun Spice Mix-CAJUN RICE-2 TBSP Butter2 tsp Cajun Spice Mix1 Cup White Rice1 Celery Stalk2 TBSP Fresh White Onion-CREAM SAUCE-4 oz. 
Crab Meat2 TBSP Butter1 Cup Heavy Cream2 tsp Cajun Spices. Combine your spices in a bowl. This will yield about 1/4 cup. Keep the remainder in an air tight containerThis is my take on a Cajun spice mix. It is a mildly hot mix that can easily be changed. Feel free to modify the spice levels to your liking.. Tilapia is a fairly mild fish, so it works quite well with this recipe.Preheat the oven to 450 degrees Fahrenheit (230 C).Line a baking pan with aluminum foil.Pat the fillets dry and brush on a light coating of olive oil. Liberally apply the spice mix to both sides of the fish and place on the pan.Place the pan in your oven and bake for 12-15 minutes (thickness of fish will vary cooking times). The fish is done when it flakes easily with a fork.. Finely chop the celery and onion.Place your skillet over medium to high heat. Add 2 TBSP of butter to the skillet and saute the celery and onion.Add the rice and brown very slightly. Add the chicken stock and Cajun spice then simmer until done (app. 15 minutes). Place the butter and cream in a saucepan over medium heat, stirring constantly. Drain the crab and put it in the pot. Add 2 tsp. Cajun seasoning and continue to stir. Watch it closely because it will boil over if you do not constantly stir it! Simmer for 5-6 minutes. Remove from heat and allow to cool slightly. -note- If you want a thicker sauce, add 1 tsp of corn starch to water, then add to the sauce while cooking. . Place the fish fillets on a bed of rice. 
Add the sauce and enjoy!Once again, thanks for reading!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_165_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_165_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_165_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_165_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. So this is a pretty basic bread except for the extra stuff. I have changed it some from the original one I tried. Among other things I substituted a cup of whole wheat flour because I like it. You start with beer. Now the original said 10 oz of beer but I found that the final mix was to dry with that amount. I think that is because my flour is dry and I always end up adding extra water to almost any recipe. Why would my flour be dry? Well, the inside humidity is about 15 % at the moment and it sucks the moisture out of everything. My cats fur lights up with all the static charge they develop. So it needs the extra water, so why not use more beer?The type of beer? Well I tried one of those dark strong flavored ones but to me the bread had a bitter taste from it. Other people didn't think so, so basically go with what you like. For a mild flavor use a mild beer. if you like the strong beer then go with that. The beer needs to be flat and warm. In other words you need to ruin it. I found the best way is to first take a sip, and then pour the rest into a bowel or cup and use a spoon to stir it until a lot of the bubbles leave. 
Then microwave it for a little bit until its warm. Stick your finger in it and taste it and if you lament about having ruined it then its just right. Yum, warm flat beer. It should be a little less than 12 oz, unless your sip was more like a glug. Now just leave it alone and put the rest together.. You need 3 cups of flour. I used 2 of a bread flour and one of whole wheat. I think it has a little more body that way.Interestingly I had just put some wood in the wood stove when I started this loaf and I had some sawdust stuck to my shirt. When I plunked the bread pan down on the table some of the saw dust fell into the empty pan. And I had a moment of inspiration --- I wonder if I add a tablespoon of sawdust to it can I enter it into the wood contest? I of course had to look up information about sawdust and bread. It has happened and been done. In fact around the turn of the century, well the one before this one, it was not uncommon for it to be put in prisoners bread. It is undigestible and costs almost nothing but makes people feel like they are getting food. You might want to look up information on it, its very interesting. And now its being added to bread for dieters, refined cellulose, because it is undigestable but makes you feel like your eating something. But now its all labeled and such so you know what your getting.OK, no sawdust in this one.3 cups of flour1 tablespoon of sugar (I often use honey instead of or even with) 1 1/2 teaspoons of salt1 tablespoon of butter, or margarine12 oz of flat warm beer1 package of yeast, you can use bread machine yeast or quick rise yeast, whatever works the best for you.8 oz of cheese - yeah its a lot of cheese. I found the type of cheese has a big impact on the flavor of the bread. In one of my other tries, along with the dark beer I also used 4 oz of extra sharp cheddar cheese and 4 oz of Monterrey jack. Wow did that Cheddar flavor come through. If you want strong flavor go for it. 
For this one I used mild Cheddar and Monterrey Jack, half and half.Cut up the cheese into bit size blocks. . I have read bread machine instructions where they say to put in ingredients in a certain order. I never saw the point since it all gets mixed up in the first few seconds anyway. So I start with one cup of flour and then melt the butter and add that along with the other dry stuff, add some cheese and then add some of the beer. Add another cup of flour on top of that and add the rest of the cheese and some more beer and then the rest of the flour and beer. As long as it all gets in there, that is what matters. . This is the easy part, compliments of the bread machine. It does all the work.Set the program for a basic bread. I always set the crust to light, maybe that is just because of my machine. Do it the way you like it. Mine takes 3 hours to run the program.Push start.This is the one important part ----After a few minutes when it is all mixed check to see what the ball looks like. If it is to dry there will still be a lot of flour left that is not formed into the ball. You need to add water a little at a time until everything is picked up into the ball. If it is too wet it will have stuff sticking to the sides of the pan and looking a little gooey. If it does this you need to add a little flour a spoon at a time until nothing is sticking to the sides of the pan. You should have a nice solid ball that doesn't stick to the sides. You will notice that the cheese gets chopped up as the mixing progress. Pretty soon you wont even be able to see any pieces of cheese at all. It all gets blended into the ball. . After 3 hours, the machine beeps and you get to take out your finished loaf. All that is left is to cut it up, put a little butter on it and enjoy it. Wow, that is good stuff. Make a sandwich by adding a few slices of bacon and you have a complete snack. Or if your into hot stuff you can dip pieces into hot sauce. However you like it. 
By the way, in case your wondering, all the alcohol is evaporated out of this in the baking process so its fine for anybody to eat including children. \nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_166_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_166_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_166_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_166_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. First off we want to mix together the cinnamon, allspice, flour, and brown sugar.. Cutting the apples into the pi shape proved to be pretty tricky. The best way I found was to first core the apple, peel it, cut the apple into thin circular slices. Stack the slices on top of each other to resemble the original apple shape and begin carving. The spare scrap pieces of the apple will be used for the base. Toss the apple slices in with some lemon juice so as to prevent oxidation. Ginger ale will also do the trick.. Preheat the oven to 400F\nPour the dry ingredients in with the apples and evenly cover. Place the apples(only the scraps) into a pan and cook on low heat until mixture is soft and sticky.\nLet cool to room temperature.\nIn a Pie pan, unroll one of the pie pastries. Pour the cooked scrap filling into the pie shell. Top with the Pi shaped apples.\nTake the second pie pastry and slice into even lines, this will be the top of our pie. 
It will be woven in such a way to resemble a lattice.\nPlace in oven and bake until a golden brown crust has been achieved (about 15-20 minutes).. Happy Pi Day!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_167_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_167_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_167_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_167_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Take the water boiler and pour in your water. Be careful not to spill any of it!. Plug in your water boiler and wait for the water to boil. This could take a moment, just look and listen very carefully!\u00a0. Take your cup and insert a teabag to your choice. Fold the string around the edge of the cup, so the little cardboard at the end lies on the surface.. Now it is time to add the hot water! Pour it in slowly and carefully, so there is no risk of burning yourself or somebody else near you!. The color of the water you just poured in, will change color gradually. The long you wait, the stronger the flavour of your tea will be. Precise timing is essential here!. This step is completely optional. Take a spoon and stir the tea-in-the-making. This will help to:\n-Cool the tea\n-Mix in the flavour very well.. Listen up! Here comes the hardest part: be sure not to add sugar while stirring the tea. This could take some effort, but it will assure you an original and unaffected flavour sensation you have never tasted before.. And voila! 
You now achieved a nice warm cup of tea, without any sugar!\nEnjoy and be sure to blow well before drinking. This will cool the tea efficiently.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_168_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_168_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_168_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_168_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. You Will Need:\n1 cup flour\n1/2 cup butter\n3/4 cup romano cheese\n3 tsp paprika\n2 tsp poppy seeds\n1/2 tsp cinnamon\npinch salt\nOven preheated to 350\u00b0F. In a large mixing bowl, cream together 1/2 cup butter and 3/4 Romano cheese until blended as smoothly as possible.. Add 2 tsp paprika, 1/2 tsp cinnamon and a pinch of salt to the butter-cheese mixture and blend until smooth.. Add 1 cup flour and mix - the dough will be rather dry and slightly tough.. Separate the dough into 18 equal parts and roll into balls, then press and flatten slightly.. Using a sharp paring knife, cut an X into each ball. Only cut about halfway down to avoid splitting the dough balls into quarters.. Press a small hollow into the centre of each ball using a finger or other utensil. Dust the dough balls with approximately 3/4 tsp paprika.. Carefully sprinkle a pinch of poppy seeds into the centre of each ball and brush any strays into the hollow you made with your finger.. Bake in an over at 350\u00b0F for 15-20 minutes. 
Dust baked puffs with approximately 1/4 tsp paprika to give them a brighter colour (optional). The resulting puffs will be richly cheesy and spicy. Very decadent and filling, they make a great appetizer. Serve during autumn events between poppy season in late summer and Remembrance day in November.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_169_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_169_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_169_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_169_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Ingredients: 4 Tbsp Whole Oat Groats 2 tsp Whole Chia Seeds1/3 cup Water2 Tbsp Milk1 tsp sugar (optional)Frozen Fruit for topping Appliances: Coffee bean grinderMicrowave safe bowlPlateSpoonKnife . Place chia seeds and groats into coffee grinder until mixture is of a powder consistency with minimal chunks. . Place mixture into bowl and stir in water. Microwave for one minute on high. Remove and stir again. It should currently have a dense consistency with a thin layer of excess water. Cover with a towel and wait 15 minutes for the water to soak in fully. . Remove towel and stir. Microwave for another minute on high. . Add sugar and fold until dissolved. Chop up fruit and fold in to add flavor and color. I prefer apples, coconut, and black raspberries. Guide substance to the middle of the bowl leaving room to make a moat of milk. Lastly....enjoy! 
\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_170_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_170_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_170_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_170_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 10 (or however many can quench your insatiable hunger) Regular Oreos\n1 Bottle of Vegetable Oil\n1* Box of Pancake Mix\n1 Mixing bowl\n1 Saucepan\n* Instead of pancake mix, you can just substitute your own pancake batter or even waffle batter.. Whip out your favorite dry pancake mix (or wet pancake mix, but I doubt that exists) and use your eyeballs to read the instructions on it. Use everything it tells you. Don't assume that \"Oh nobody will know if I skimp out on using two eggs, and instead use this fake egg solution I found at the grocery store!\" They will know.\nWhen you are making your pancake batter, use more flour than it calls for, or less milk. You want the batter to be thicker than normal that way it clings to the oreos. I also threw in some vanilla extract and cinnamon for taste.. Pour in just enough oil to totally cover the Oreos and to give them room to move around without hitting bottom. Set your stove-top to medium-high (ours went to 10, and I had it on 7). Using a candy thermometer (or any thermometer, I won't tell anyone) check the temperature every few minutes. When that bad boy reaches 280 Fahrenheit (138 Celsius), your ready to fry!. 
Grab two or three Oreos and toss them in the pancake batter. Flip them about and make sure that no part of the Oreo is left uncovered. Make sure that you don't leave them in the batter for too long. Say the phone rings and you step away for a few seconds, that's okay. However, if your oil catches fire and you waste time trying to put it out, shame on you. Those Oreos could get soggy!. Now, this is where it gets difficult. Throw your battered Oreos into the molten oil and they should start floating around like little ghosts. Try not to put more than four or five in at a time because it gets very difficult to keep up with each one, and they may start plotting to destroy you.\nContinuously splash hot oil on the top sides of the Oreos while they are cooking. This makes sure that they cook throughout (at least, in theory)\nWatch the bottom of the Oreos. When they reach a rich, golden brown color, flip them over with your tool of choice.\nContinue to watch your Oreos while still splashing them with oil. When the entire oreo is a dark golden brown, (emphasis on GOLDEN, not a dark brown) they are done. Pull them out and place them on a paper towel-topped plate. Give yourself a pat on the back, 'cause you just fried an oreo.. Let all your Oreos cool down before indulging. I say cool down, I just mean don't dig into them five seconds after taking them out of the hot oil. Shove as many in your face as physically possible. These things are like ambrosia. Call your friends, call your neighbors, call your lawyer, call your doctor, call the emergency room. 
Share these artery-clogging monsters with anyone you can, because I can promise you, you cannot eat them all.\n**A note from the editor-\nWe tried frying other things while making this instructable:\nGolden Oreos - As good as their chocolate counterparts.\nSnickers - The chocolate melted but they are extremely tasty\nReeses Cups - The chocolate and penut butter melted and mixed together to make a delicious concoction.\nButterfingers - Terrible idea. The insides of the Butterfinger caramelized to create a sticky, hard mess.\nRaw Pancake Batter - Turned out sort of like a funnel cake. Very good.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_171_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_171_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_171_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_171_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. You will need to order a big ole bucket of chicken.\nAny fine dining establishment will do, as long as the product is served out of a paper bucket.\nYou will need an oven mitt. Get one that you can cut up. Do not use mom's or you will be forever banished to your room.. Figure out how you normally hold the bucket with the oven mitt.\nYou can do a side or top hold but the easiest is with the oven mitt under the bucket.\nPlace the bucket down over the mitt.\nCut a slit where your hand would pop out of the oven mitt. The cut should be small enough to be covered by the bucket. 
It is best to make the hole just big enough for your hand to squeeze through.\n. Turn the bucket over.\nOutline a hole that will be as big as a clenched fist.\nCut the hole out with a utility knife or just poke a starting hole with a pair of scissors and cut around.\nDiscard the cut out piece of cardboard.. Stuff the front of the oven mitt with paper towels, tissues, etc. to fill out the thumb and finger parts of the oven mitt.\nStick your hand through the hole.\nYou may need to keep your index finger in the oven mitt to support the oven mitt under the bucket while the rest of the fingers go through, kinda like a baseball glove.\nPlace the bucket on top of the oven mitt and place your hand through the hole in the bucket.. Place the Original greasy contents back into bucket, Extra Crispy if that's what you got.\nTo do this \"trick\":\nWalk into the crowded room holding the bucket of chicken.\nPull out a piece of chicken and exclaim \"Dibs on the leg\" or something. And say \"Who wants the last piece?\"\nHolding the bucket tilted slightly away from the intended victim or whoever is hungry so that they cannot see inside, let them reach in deep to get the last piece of chicken. \nWhen they reach in, grab their hand and listen to them try to break away and scream.\nIt may help to add a bit of fake fur fabric or real fur to the bottom of the bucket if you have it. That adds to the wonder of what kind of animal was in there.\nHave fun! 
Muhahhahahahahahhahah!!!!!!!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_172_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_172_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_172_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_172_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Serves: 4Prep time: 10 minutes Chilling time: 30 minutes2 ripe bananas1 cup chocolate flavoured coconut milk1/2 cup coconut oil, melted. Gently melt the coconut oil by placing it in a heat proof bowl set over a pan of steaming water with the heat turned off.This method melts the coconut oil but should keep it below 42\u00baC which in turn keeps it in it's raw form and the nutrients in the oil intact. Keeping the temperature low will also help the mousse set faster.note: you can also melt the coconut oil in the standard way and the mousse will still work.. Place the chocolate flavoured coconut milk and the banana into a high speed blender and blend until smooth. Slowly pour the melted coconut oil in through the top while the blender is running. The mousse should whip up and turn a lighter chocolate colour.. Pour the mousse into serving bowls and place in the fridge to set for 30 minutes. . Serve the mousse chilled, topped with banana slices, coconut flakes and cacao nibs. 
The mousse will keep in the fridge for up to 5 days, but is best eaten fresh on the day it is made.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_173_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_173_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_173_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_173_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 15-20 red potatoes1 bunch green onions1 strip of bacon for every two potatoes, I used 8.sour creamshredded cheddar cheesefresh garlicolive oilmustard powder (optional)smoked paprika (optional)salt and pepper!\nPlus, a sheet pan for roasting the taters, a cutting board + a knife and a big bowl to mix it all together. :). \u00a0Turn the heat up to 400 degrees F.\u00a0\nI normally just scrub mine and then leave them on a baking rack to dry most of the way. :). \u00a0Cut the smaller potatoes into eighths. Cut the larger ones to match the size of those pieces.\u00a0\nThe most important part here is getting them all close to the same size!\nOnce they're cut, put them on the sheet pan and season generously with salt and pepper. Then, drizzle on a bit of olive oil and toss them so they're all a wee bit shiny.\u00a0\nThese will go in the oven for 15 minutes!. Trim off the tops and bottoms.\nThen cut them into 1/2 inch wide slices. Put them aside in the big mixing bowl. :)\u00a0. \u00a0Cut the bacon into 1/2-3/4 inch strips.\nMake sure all the fat has been cut through all the way. 
:)\nYour potatoes should be done with their first 15 minutes now.... \u00a0They should be getting slightly colored and wrinkly now. Just give a good toss and put them back in the oven.. \u00a0Get a large skillet and put it over medium/high heat.\u00a0\nCook the bacon until the meat turns dark red and the fat goes a lovely orange color. Keep in mind it'll cook a bit after you pull it out of the pan so don't overdo it!\nYou may need to drain the fat a couple times while you're cooking - just pour it into a bowl or mug and let harden. You can either cover this and keep it in the fridge or chuck the solid fat into the trash. Your choice. :D\nDrain the bacon on paper towels.\u00a0. Empty all of the bacon fat from the pan except for a little less than a tablespoon or so.\u00a0\nChop up a few cloves of garlic and saute them over medium heat until they get golden brown on the edges - be careful not to burn them! Keep the garlic moving!\nOnce the garlic is done, scoop it onto the paper towels. Notice you've also\u00a0accumulated lots of yummy bacon fat and bits. NOM.\u00a0. \u00a0This time they should be even more brown and wrinkly and smelling pretty fancy.\nToss them around and put them back in for the last 15 minutes. :D. \u00a0The bacon and garlic will still be warm but not hot and everyone will mingle in the bowl and get delicious.\u00a0\nNow we just have to wait on the potatoes.. \u00a0let them cool for a bit (5-10) minutes or so and add them to the bowl. Mix it all together. :). I added a couple of handfuls of cheddar cheese, about 8 oz. (a small container's worth) of sour cream, a good shake of paprika and a couple pinches of mustard powder.\u00a0\nThese amounts are really up to you. Just mess with it and go with your\u00a0taste buds!\u00a0\nMix this all until everybody's happy and coated with deliciousness.\u00a0. \u00a0Or chill! It is also good cold.\u00a0\nI just made sure mine had enough salt and pepper at this point. Not much else to do but eat now!\nEnjoy! 
^_^\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_174_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_174_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_174_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_174_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Preheat the oven to 325 degrees Fahrenheit .. Take out all the supplies and put them on the counter. . Put the 2\u00a0sticks\u00a0of\u00a0Butter in a bowl to soften.. Put the Butter (once it is softened) into the mixer.. Plug Mixer into the Electrical Outlet, then measure out 1 Cup of Sugar. Pour the Sugar into the Mixer.. Add 1 Egg to the mix.. Turn the mixer on 2 until well mixed.\u00a0. Once mixed flip the power to 1.. \u00a0Measure and add 3 Cups of Flour to the mixture .. Once the Flour is added, put in1 Teaspoon of Baking Soda into the mixture .. The last part of mixing is to add1' Teaspoon of Vanilla. Then mix until one solid mixture , not separate ingredients in a bowl.. Freezing is an optional thing that lets you freeze the dough until it is needed to be rolled and baked.. Grab a hand full of Flour and spread it out on the counter in about a1x3 foot space(use more Flour if needed).. If you decide not to freeze than you can go ahead start rolling the dough out, in a pushing/ pulling motion all the while pressing down.\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Tip: If the dough is sticking to the rolling pin then you can rub Flour over the rolling pin.. 
Roll the dough until about 1/2 centimeter thick.. Get out the cookie cutters and cut out the cookies by placing the cutters over the dough and applying pressure.\nTip: Make sure the sharp side of the cookie cutters are pointed towards the dough.. Once the cookies are cut use a small spatula to transfer them out of the dough and onto the stone tray.. Place the tray into the oven. Set the oven timer to 10-12 minutes and hit start(or let it start on its own). . After the timer has gone off, put on a oven mitt and take out the tray.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_175_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_175_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_175_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_175_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. \n For the cup and saucer:\n\t\tAbout 100g dark chocolate (per cup)\n\t\tThin cardboard\n\t\tAcetate\n\t\tClingfilm\n\t\tSaucer\n\t\tTeaspoon (made from one piece of metal or plastic)\n\t\tScissors\n\t\tSellotape\n\t\tPaintbrush\nFor the mousse filling (this will make enough to fill about 4 cups):\n\t\t100 g dark chocolate\n\t\t1 egg yolk\n\t\t2 egg whites\n\t\t20 g caster sugar\n\t\t2 tbsp strongly made instant coffee\n\t\t(optional 1 tbsp of Kahlua)\n\t\tWhipped cream to finish. Start by making the central piece, the cup. To make the cup, first make the mold. Cut a strip of cardboard, roll it into a short, squat tube and secure it with a couple of pieces of tape. 
Test the size by sitting it on the saucer. I used a saucer 12 cm in diameter, and piece of card 18 x 5 cm, with about a centimetre of extra length to overlap for a secure fix.\nCut two pieces of acetate the same size as your piece of card. Wrap one of these around the tube, and secure it with some tape. This will provide a nonstick surface to mold the chocolate onto.. Now you have your mold, here comes the messy part. Melt some chocolate. Take the second piece of acetate, lay it on a surface you don't mind getting a bit splattered and spoon some chocolate on to it. Brush out the chocolate until you have an even and fairly thick layer across all of the acetate.\nNow pick up the acetate and wrap it around your mold. Make sure the bottom edge all lines up, as this will help seal in the mousse, and fill in any gaps where both ends meet by brushing on a little more chocolate.\nLeave this to set. Once this has done so very gently peel away the acetate from the outside, and add another layer of chocolate in exactly the same way.. Next make the saucer the cup is going to sit on. To do this, first cover the underside of the saucer in clingfilm. Smooth it out as much as possible, to create a nice surface for molding. If the clingfilm slips, secure it by first brushing the surface of the saucer with a little oil.\nNow, melt some chocolate and pour it onto the saucer. Spread the chocolate across the saucer with a paintbrush until it covers the whole thing in an even, fairly thick layer. Run your finger around the edge to even out any drips.\nLeave this to set. Once it has apply a second coat of chocolate in exactly the same way, and leave that to set.. Once everything has set, this is the scary part, unmolding all of this chocolate.\nFor the saucer, unwrap the clingfilm from underneath and lift it and the chocolate very gently from the saucer. Peel the clingfilm away from the upper side of the chocolate saucer. 
Place the saucer on a serving plate.\nTo free the cup, first peel off the outer layer of acetate. Now carefully cut the sellotape securing the card and twist the card in on itself, rolling it into a tighter tube. This should make it small enough to remove without cracking the chocolate. Finally, peel away the inner layer of chocolate.\nTo prepare the cup for the mousse, spread or pipe a little melted chocolate onto the saucer and place your cup on top. Fill in any obvious gaps with more chocolate and let the whole thing set while you make the mousse.. To make the chocolate mousse, melt 100 g dark chocolate and stir in the coffee (and kahlua if you are using it). When the mixture has cooled a little, stir in the egg yolk.\nPut the two egg whites in a clean bowl and, using an electric whisk or a lot of elbow grease, whisk the eggs until they form soft peaks. At this stage pour in the sugar. Continue whisking until stiff peaks form and the mixture is glossy.\nVery gently fold the egg whites through the chocolate mixture until fully combined. Spoon the mousse into the chocolate cups, wiping off any spillage.\nLeave this in the fridge for a couple of hours to set.. While the mousse is setting, make the spoon and the handle.\nTo make the spoon, cover the back of the teaspoon and its handle in clingfilm, in the same way as the saucer was done earlier. Brush chocolate onto the clingfilm in a thick layer and leave to set. This will need two or three layers to make sure it's strong enough.\nOnce all the layers have set, very carefully peel the clingfilm off the spoon, and then off the back of the chocolate, Put it somewhere cool to store until the dessert is served.. For the handle pour a little melted chocolate into a piping bag and pipe a thick question mark shape onto some acetate or a similar non stick material. 
Once that has hardened, pipe another on top of the first layer to build up a little thickness.\nLeave this somewhere cool and safe to store until the dessert is served.. Once the mousse has set the assembly can be finished.\nFirst, carefully secure the handle to the cup with a little melted chocolate. Place the spoon on the saucer.\nTake some whipped cream, and spoon it on top of the mousse.\nFinish off with some grated chocolate, or chocolate shards.\nNoe all that is left is to gather some guests, to impress them with your chocolate confection. Then dig in and enjoy.. \n\t\tIf not being used immediately, the cups can be stored in the fridge until they are needed.\n\t\tI haven't given quantities for how much chocolate makes each part of the cup, since it will depend on the size of your molds. I just melted a little of the chocolate at a time, since not much is needed for each layer. Any leftover chocolate can be remelted for next time. Of course, if you are tempering the chocolate this could make the whole thing a time consuming process.\n\t\tIt's best to make a few spoons and handles spare, as they are the most fragile components of the whole thing.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_176_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_176_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_176_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_176_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 
You will need the following materials:\n-Pop Rocks (I used two packets of strawberry Pop Rocks, priced at $0.99 each.)\n-Chocolate (I started out with two $0.99 chocolate bars, which ended not being enough, so I added several cups of chocolate chips.)\nYou will also need these tools:\n-Something to melt the chocolate in (You can use a microwavable bowl in the microwave or a double boiler. I used the microwave.)\n-A mold (I used plastic containers, you could also use clean soap molds, ramekins, or a sheet pan.)\n-Something to line the mold with (use parchment paper for sheet pans, and cooking oil or cooking spray for anything else.)\n-A whisk\n-A spatula. Like I said in the previous step, spray the molds with cooking spray or brush it with oil. Line your mold with parchment paper if you are using a sheet pan.. Microwave the chocolate in a\u00a0microwavable bowl in thirty second\u00a0intervals and mix it around to get it a consistent texture. I started with two\u00a0chocolate\u00a0bars and quickly realized that this wasn't enough and added in a lot of chocolate chips.. After the chocolate is thoroughly melted, I poured in one packet of Pop Rocks, and had a minor panic attack: the Pop Rocks were popping in the chocolate! I suppose it inevitable, but I wasn't sure if the chocolate bars would end up being any good with most of the Pop Rocks having been already popped.. Pour the chocolate into the molds and use the spatula to make sure the mold is evenly coated.\nAfter pouring the chocolate, open up your second packet of Pop Rocks and pour them all over the top of the\u00a0chocolate\u00a0in the molds.. Grab your molds and stick them in the freezer. I'm sure a fridge would work fine, but a freezer is probably faster. Now wait a few hours.\nBe patient, young grasshopper.. The results? A good chocolate bar with a slight strawberry taste and a small pop. It doesn't have as much pop as I would have liked, but it's still there. 
I had some trouble getting the bars out of their molds (I ended up using a paring knife to pry them out), so next time I would go with a sheet pan and\u00a0parchment\u00a0paper. If you do try this, let me know what you think.\nEnjoy,\nNoahh\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_177_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_177_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_177_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_177_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Cinnamon SnickerPoodles2 3/4 cups flour1 tsp baking powder1 1/2 cups sugar1 cup room temperature butter1/4 tsp salt2 large eggs2 tsps vanilla1 tsp cinnamon1/4 cup sugar for dustingMakes 32 poodles. Print out 4-8 sheets of the attached SnickerPoodle template. (8 if you have 4 baking sheets, 4 if you have 2 and will be making two rounds on the same ones.). I like to measure out all my ingredients before I get started mixing so there's less chance that I'll forget to add something!Make sure there's a rack in the middle of your oven and preheat it to 350 degrees F.. In a medium sized bowl, blend the butter and sugar with a spoon or hand mixer until well mixed. . Add the vanilla and eggs to the bowl and continue to mix until smooth.. Add the baking powder, salt, and cinnamon and stir well.. Slowly sift in the flour, mixing thoroughly until it has all been added. Be sure to stir all the way to the bottom of the bowl, getting any hidden flour bits. 
The dough will look similar to the last picture above when properly mixed.. On a large baking sheet(s), place two print outs of the poodle templates.Cut a piece of parchment paper to fit just inside the edges of the baking sheet(s).. Set yourself up with a little dough ball making assembly line. Place your dough bowl next to a prepped baking sheet and have a small bowl with the 1/4 sugar for dough ball dipping nearby.. You'll be making small balls of dough that are about the size of the thinner inner circles on the templates. The dough will expand as it cooks to grow to the size of the thicker outer lines, connecting ('gluing') the poodle ball sections together.Make the balls by rolling small pieces of dough in between your hands, checking the sizes against the template as you go. Once you've achieved the correct size, dip JUST THE TOP of the balls in sugar and place side sugar up on top of that particular ball on the template. Flatten each a bit with the heal of your hand. Your pieces should be about 1/4\" thick.NOTE: The tiny balls that connect the tail and make up the feet don't need to be dipped or flattened.. Make sure to fill in all the circles with dough balls for each SnickerPoodle!. To shape the heads, use a sharp paring knife to follow the cut (dashed) lines in order to create the poodle nose profiles.. Once you've filled in all the circles and cut the faces out, this is what they should look like!. Before baking, please gently remove the print outs by holding the parchment on the opposite side and slowly sliding them out from underneath.. Bake one sheet at a time on the middle rack of the oven for 8-10 minutes. You want them to be only lightly browned on the bottom. Overcooking them will result in less inner chewiness. (read: less awesome)Once removed from the oven, let cool on the baking sheet to allow the cookies to crisp up. Trying to remove them before they've cooled can result in the poodle ball connections breaking apart. 
(= broken Snickerpoodles)Repeat with the remaining sheets of poodles. . These would be great for a French themed engagement party or for just any old reason. Have fun! and if you try making them, please post photos to the comments section below and click the 'I Made It' button!Bon Appetit.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_178_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_178_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_178_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_178_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. INGREDIENTS: \n5 cups of flour (unbleached all purpose works well)\n2 cups of warm water (100-110 degrees F)\n2 teaspoons yeast (instant or dry)\n2 teaspoons salt. \nIn a mixing bowl, mix 1 cup warm water, 1 teaspoon yeast and 2 cups of flour. If you are using dry yeast, let the yeast bloom (soak) in the water for a few minutes before adding the flour. Cover the mixture, and let in sit overnight. (For a stronger sour dough-like flavor, let the starter sit for a couple of days).Note: I have found that covering the starter with a cling wrap and then a tea towel helps keep the starter from drying out. You will have to scrape a little of the starter off of the wrap, but it makes further mixing easier.. Add 1 cup warm water, 1 teaspoon yeast and 2 teaspoons salt to the starter. Mix well. Add remaining flour a cup at a time and mix in the flour. 
I have found it useful to hand mix (with clean, bare hands) to finish mixing, wither using a mixer or not.\nNow knead the dough. If using a mixer, follow the machine's instructions for kneading. 2-5 minutes should do the job (I like going 4 to 5 minutes). If kneading by hand, knead the dough for 5-10 minutes.. Form the dough into a ball, and cover (I reuse the cling wrap and the towel). Now let the dough rise for 3 hours.Note: The dough will rise best if the room temperature is over 70 F.. \nNow, beat down the dough, and knead it once again (with a mixer, knead of a minute or so. By hand, knead for a couple of minutes). Shape and roll the dough into a ball. For a thicker, harder crust, roll the ball in flour (My wife prefers the bread with a thinner crust).\nCover the ball with a tea towel, and let rise for 45 minutes. (I preheat the oven to 400 F after 30 minutes).. Score the top of the bread. I like to make a cross on the top (like a shepherds' bread), but you could score parallel lines in the top or skip the scoring (the crust will rip open any way).\nNow place the bread into an oven preheated to 400F, and bake for 30-35 minutes.\nRemove the bread from the oven, wrap it up in a tea towel, and let it rest for 10 to 15 minutes.NOTE: I bake the bread on a pizza stone, but before we had one, we used a cookie sheet.. For a whole/multi grain bread use 2 cups whole wheat flour for the starter and add 2 cup of the wheat flour and 1 cup rolled oats (dry) when mixing. Rolling the dough in oats instead of flour for step #6 really bumps up the presentation.\n2 or 3 tablespoons of fresh rosemary (chopped) makes a nice addition.\nBrush the bread with an egg wash before baking for a shiny, deep brown crust. For the egg wash, beat together 1 egg and 2 tablespoons of water.\nLastly, have fun with the size and shape of the loaf. Long french bread like loaves work well. Also, the dough can be divided into 2-4 balls and used as bread bowls for soup. 
Note: You will need to adjust the bake time.. Like almost all breads, this one is most tasty while it is still fresh and warm. Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_179_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_179_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_179_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_179_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. A dremel or other rotary toolA Pointed drill bit (see picture above)A cleaned out eggshell (this can be done by drilling two holes into opposite sides of the egg and blowing through one)Led's/Edible Slime(instructions to make can be found online) (optional)and thats it.... Hold the egg at an angle to the rotary tool and press slightly to make a hole, move it sideways and make a line... thats it. Light it up, if you want by just putting an LED inside it.OR.. fill it up with edible slime for a disgusting looking but delicious desert.(it takes some practice to make slime of the right consistency so that it drips out). 
Don't get frustrated if you don't get it the first time, it took me a few tries too.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_180_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_180_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_180_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_180_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 200gms whole Black lentils (urad dal) Kidney Beans (handful) 3 Onion (medium size) 5 Tomatoes (medium size) 3 tbsp Ginger and Garlic paste Chili powder Coriander powder Garam masala 5 tbsp Ghee Salt Coriander leaves 4-5 tbsp CreamFor the tadka50gms unsalted Butter 2 tbsp chili powder 1/2 tbsp Asafoetida 1 tbsp Cumin seeds 2 tbsp Garam masala. Soak the black lentils and kidney beans in water overnight. Boil the black lentils and kidney beans in a pressure cooker at high flame. After 5 mins turn the heat to low and allow the lentils to cook for 20 mins. After 25 mins turn off the heat and let the lentils rest in the pressure cooker.. Finely dice the onions and tomatoes. In a heavy bottomed dish add ghee (turn the heat to low). After the ghee is heated add onions and saut\u00e9. After the onions turn brown add ginger garlic paste and allow to cook for 5 min. After the raw smell of the paste disappears add the tomatoes. Add chili powder, coriander powder, salt. Let this masala cook for 7-8 mins (till the tomatoes have cooked and the oil separates).Now its time to add the lentils.. 
After all the pressure from the pressure cooker has been released mash the lentils lightly with a masher. Next add the lentils to the masala. After add the lentils, mix well. If your Dal is very thick add hot water to get the required consistency. Add salt and let the Dal come to a boil. Boil at high heat for 5 mins and then turn down the heat to low and cook the Dal for 30 mins. Every 5 mins make sure you stir the Dal to avoid formation of clumps. After your Dal has achieved a thick consistency turn off the heat.. In a small pan heat butter (at low heat). Add cumin seeds to the heated butter. Add chili powder, garam masala, asafoeida, coriander powder. Saute some fresh coriander leaves. After cooking the tadka, add this to the Dal.. After adding the takda, mix well into the Dal. Finally add cream, mix well and garnish with fresh coriander leaves.And thus your Dal Makhni is ready to serve. Serve Dal Makhni with bread, naan or rice.Enjoy !\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_181_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_181_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_181_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_181_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Add all the ingredients (except butter) to a mixing bowl and whisk away until well blended.(This first step is also great for the budding sous chefs to get involved in).. 1. Add about a teaspoon of butter to the pan and let it heat up (on medium heat).2. 
Pour a big serving spoon dollop of the batter onto the pan and spread it around the pan. If you find it difficult to spread it using a spoon, just lift the pan and move it back and forth to so that the batter forms somewhat of a round shape.3.And then let it cook for about a minute or so, or until you start noticing little bubble like holes starting to form . Carefully (using a wide spatula) flip the crepe to the other side and cook for another minute or so. The crepe should have a nice golden brown color to it.. 1. And then the best part - simply spread the nutella onto the crepe. (Spread away and make sure you almost get every part of the crepe covered.). Then, simply roll up the crepe and sprinkle with some powdered sugar if desired.ENJOY!!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_182_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_182_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_182_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_182_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. To make these 3 snacks you will need:For the dough:All purpose flour- 4 cups,Cooking oil- Half cup,Salt- half tea spoon, just for taste,Baking powder- half cup.To make the crispy snacks you will need- half tea spoon of nigella seeds,For the twisted-sweet snacks you will need- 1 tbsp of sugar,For the crunchy coconut snacks you will need- a cup of shredded coconut and 1 cup of sugar.. 
To make the dough you'll need- all purpose flour, cooking oil (I used soybean oil), a pinch of salt (for taste) and baking powder (check the ingredients step for amount).Mix all the ingredients together. Add water to make dough. Knead the dough very well. Keep kneading until the dough is smooth.Divide the dough into 3 equal portions (since we're making 3 different snacks).. Take any one of the divided doughs and add half tea spoon of nigella seeds to it.Knead the dough very well until the nigella seeds are mixed well.Make a smooth ball and roll it into a roti bread.Cut the roti bread into thin (1 inch width) slices and then again cut the slices diagonally into small pieces as shown in the picture of this step.Add oil to a frying pan and heat oil in high heat. After the oil heats bring the heat down to medium. Fry the pieces evenly on all sides and until they're brown. Use a slotted spoon to take them out of the oil.. You don't have to mix anything with the dough for this one. Make a smooth ball and roll it into a roti bread.Cut the roti into small pieces as shown in the picture of this step. Take any of the pieces and twist it, press the ends to secure the twist. Similarly twist the other pieces.In a frying pan heat oil in high heat and once the oil is heated brig the heat down to medium. Fry the twisted pieces evenly on all sides and until they're light brown. Keep them in a dry place while preparing the sugar syrup.In a saucepan add 1 tbsp of sugar and 2 tbsp of water. Stir until the sugar is dissolved and bring the mixture to boil.Add the fried twisted snacks to the thick sugar syrup and stir them until they are covered with sugar syrup evenly on all sides. Take them out of the sugar syrup and serve.. Take a bowl and add the last dough, 1 cup of shredded coconut and 1 cup sugar. Mix them really well. You might need to sprinkle a little flour while kneading. 
Keep kneading until the dough is smooth and not sticky.Divide the dough into 2 equal portions and make 2 smooth balls. Roll them into thick roti breads.Cut the roti bread into small pieces as shown in the picture of this step.In a frying pan heat oil in high heat and once the oil is heated brig the heat down to low. Fry the pieces until they turn light brown. Fry evenly on all sides. Use a slotted spoon to take them out of the oil. Done! Serve them or store them to serve later! Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_183_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_183_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_183_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_183_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Leftover \u2022turkey\u2022stuffing\u2022cranberry sauce\u2022bread. Put turkey on bread. Get your sauce!. Put your stuffing in the microwave for 30 seconds. MAY BE HOT . Put stuffing on sandwich. Done Enjoy Have it for Breakfast LunchDinnerOr midnight snack. 
Follow like and comment\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_184_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_184_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_184_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_184_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. This should warm up the oranges, you can put it on a longer time, I just stopped because my smoke detector was on lol XD Make sure you don't cut your hand or get burned!. Mine is an automatic boiler so it would stop boiling by itself. However, you can use a microwave to heat up water, microwave the water for around 3-5 minutes.. Make sure don't overfill two teaspoons, if you do, you won't even notice a scent of orange in your tea. Also, you need to use a cup save from heat!!! If you use the plastic ones, it will melt by the boiling water. Make sure you don't get burned, if you need to wash the peels again, go ahead, I washed mine because it had grease everywhere lol. Make sure the plate is not too big or small or you will have a mess XD. Now decorate your tea using lemons, limes, oranges, or even small umbrellas lol XD. 
And as a bonus for over 3000 views total on my account the calories if this recipe isOnly around 15-20 calories, depending if you need the Vegan Recipe or notAlso, HAPPY HAPPY NEW YEAR!!!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_185_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_185_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_185_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_185_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. To plant an Earthbox, you need:\n1. An earthbox - My mother gave me one as a gift, but you can order one online at www.earthbox.com/.\n2. Plants - Look for your local nursery and spends some time wandering around looking for things you'll either want to see and smell or eat frequently. I had plants forced on to me last summer and ended up really enjoying growing my own herbs and vegetables in particular so I looked for edibles at one of the many independent plant stores in my neighborhood. I bought lemon verbana, sweet basil, purple basil, tomatilla, arugula and tarragon from a nursery a few blocks from my home. I believe that you can plant up to eight items in the box, and you're encouraged to plant edible items. I have a large basket on my bike so I biked all of the seedling plants home, which I have to admit greatly added to the fun of the project.\nAs I discovered in the process, you'll probably also need:\n3. Measuring cup\n4. Ruler\n5. Scissors or knife\n6. Plastic bags\n7. A big tub or bucket\n8. 
A small plastic dish with walls\n9. A tea kettle or watering can. These steps are very self-explanatory.\nSet up your box in a place that will get as much sun as possible.\nWhat you don't get in the instructions, though, is a reminder to look around to see if you're likely to drip mud, water and mud-water onto your neighbor's property. My balcony, which is just a bunch of wooden slats, is right above my neighbor's balcony. So I dripped potting soil water all over the table and chairs she has on her balcony. I recommend avoiding having to apologize for this by laying out some plastic bags beneath your whole set up.. Open the large bag of potting soil that came in the Earthbox and pack it in the two holes on either end of the box. Note that the potting soil comes dry but you need to pack these holes with moist soil, so be prepared to mix it elsewhere.\nThen pour water through the tube until it runs out of the hole at the bottom of the box. (Note: this is another reason to have a plastic bag or some other device for catching water from the box on the ground before you start.) It takes a lot of water to fill the reservoire, so be prepared to make lots of trips to the sink. I used my tea kettle since I don't have a proper watering can, and it's really more or less the same thing.. These are the official steps paraphrased from the instructions:1. Add moist potting soil on top of the screen until the box is half full. Sprinkle that layer with water and pat it down, especially above the areas with the holes (which you filled in a previous step).2. Then fill the rest of the box -- completely to the top -- with dry potting soil. Sprinkle well with water and smooth out the soil.3. If you're growing tomatoes, mix in two cups of the dolomite that comes with the box into the top 3-4 inches of the box and re-smooth the dirt.4. The box also comes with fertilizer, which you should use to create a two-inch wide strip in the location that you want to put the fertilizer. 
The instructions have a handy diagram for where to put the fertilizer and seedlings based on how many and what types of plants you want to grow.Here are my notes on how to make sure this section doesn't take forever:This was the part that took the longest by far, mostly because I didn't have a big tub in which to mix water and the dry potting soil to make the moist soil the directions call for. I ended up using this dinky little pot with built in plastic to hand mix the water and soil. I recommend getting a big bucket or tub to do this in one big batch.. Cover the box with the black plastic thing that looks like a shower cap.\nYou should have already chosen a plant placement pattern in the last step; cut 3-inch Xs where you want to put your seedlings and plant them in the holes. Make sure you firm up the dirt around the seedling once it's in the ground.\nThey don't tell you in the instructions, but it's not surprising that you will need scissors or a knife and a ruler or tape measure for this step. . This is the fun part.\nOne time and one time only, pull back the black plastic around the seedlings and water the plants directly. Then put the flaps back and don't ever do that again.\nGoing forward, you will always water your box through the tube. You water until the water starts coming out of the hole at the bottom of the box. If you live somewhere where it wouldn't be a good idea to have this liquid draining onto someone else's property, I strongly recommend that you find a little plastic container to put below the hole to catch the run off. I took the cheap plastic bottom of a planting pot and reassigned it to the Earthbox.\nI only water it once a day and so far that seems to be enough in the Chicago climate.\nI can't wait to eat some of these herbs. Earthbox sells stakes for you to use to prop up any plants that need vertical support. 
I think I've reached that point with my tomatilla plant, but I'll probably create my own structure.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_186_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_186_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_186_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_186_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Step one: The mango should be half-ripe or ripe.\n Use a knife to cut it all around, lengthwise, like an avocado.. Step 2: Place a hand on each 2 sides of the mango (Photo shows only 1 hand; needed the\n other hand to click the camera).\n Twist in opposite directions, like an avocado. Don't squeeze the mango too hard.. Step 3: Wow! you've got 2 halves.\n Use a spoon to dig out the seed.. Step 4: Use that same spoon to dig in-enjoy !\n You can share the other half with someone you love, or....\n. Step 5: Cut each half once again, so you have 4 pieces.\n Pare off the skin. 
You can cut the pieces into smaller pieces-put in a bowl to share.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_187_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_187_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_187_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_187_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 1 \u00bc\u00a0 cup all-purpose flour\n\u00bc teaspoon salt\n\u00bd cup (1 stick) unsalted butter, softened\n\u00e2\u0085\u0093\u00a0 cup brown sugar\n1 teaspoon vanilla extract. Position rack in the center of the oven and preheat to 350. Line a 9X13 baking pan with parchment paper or grease bottom and sides very well.\u00a0 In a small bowl, whisk together flour and salt; set aside.\nIn the bowl of an electric mixer using the paddle attachment (or a bowl that you use your hand mixer with), beat the butter and brown sugar at medium speed until combined, about 1 minute. Beat in the vanilla extract.\u00a0 At low speed, add the flour mixture and mix just until the mixture is crumbly, 10 to 15 seconds.\nPat the dough evenly into the bottom of the prepared pan. Prick the dough well with a fork. Bake for 15 to 18 minutes, until golden brown around the edges. Allow to cool slightly as you prepare topping.. 1 \u00bd sticks unsalted butter\n1 \u00bc\u00a0 cups light brown sugar\n\u00bc honey\n\u00bc cup maple syrup\n\u00bd\u00a0 teaspoon salt\n1 cup heavy cream\n6 \u00bd cups small pretzel twists, lightly crushed. 
In a large saucepan, combine the butter, brown sugar, honey, syrup, and salt and cook over moderate heat, stirring, until foamy and slightly thickened, about 10 minutes. Add cream and cook, stirring occasionally, until a candy thermometer inserted in the caramel registers 240\u00b0F (soft ball stage), about 11 minutes longer. Add the pretzels, quickly incorporating it into the caramel.\u00a0 Pour the filling over the crust, spreading it evenly. Bake for about 15 minutes, until the topping is bubbling. Let cool completely.. Remove onto a cutting board. Cut into bars and serve.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_188_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_188_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_188_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_188_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Ingredients: 1 pound ground turkey 1 Onion 1 med to large jalapeno pepper 2 cloves garlic 2 cans of beans (I'm using six bean mix and red kidney) 2 cans diced tomatoes 1 tsp. turmeric 1/2 tsp. chili powder Pinch of tarragon 1 tbsp. Worcestershire sauce 2 tbsp. Sriracha sauce 2 tbsp. BBQ sauce About 1/8 cup of Franks hot sauce 1/4 quinoa (optional)Tools: BBQ or Smoker Cast Iron Pot Cutting board Knife Stirring spoon Strainer Can opener About 2 cups smoking chips (I'm using hickory with Maple lump charcoal)Fun Fact: If you or someone in your family is anemic, cooking in a cast iron pan can add extra iron to your meals. 
The beans will also help.. Soak your wood chips 15 minutes or more. I actually like to soak mine overnight so that they're ready when I need them. I'm using hickory, but I've also used oak, and maple. Maple gives a sweeter flavour (if you're interested). The hickory is a little nutty and has more smell to it.. OnionChop up your onion in course chunks.JalapenoFinely chop your jalapeno. Tip: Don't put your fingers in your eye after.GarlicSlice off the ends of the garlic to make it easier to peel. Once it's peeled, crush and dice your garlic.. Cook turkey until it's brown on medium heat. Don't forget to wash your hands after.You're able to do this step on your bbq as well. I typically get everything up to temperature on the stove because it's faster and easier.. OnionsClear out space in the middle of your pot. Add your chopped onions and cook until semi-transparent. This usually takes about 5 min. Once it's cooked, stir the onions into the meat.GarlicDo the same for the garlic. The garlic won't take as long to brown, maybe 30 seconds. Once it's done, mix it into the meat and onions.. Add spices and stir. 1 tsp. turmeric1/2 tsp. chili powder pinch of tarragonI added the seasoning salt and club house vegetable spice to taste and smell. I didn't actually add any garlic powder, this time, the garlic I used was strong enough to do the trick.Fun Garlic Fact: It's thought to have the ability to reduce cholesterol. This is probably only is true when the garlic is fresh. . Add both cans of tomatoes and both cans of strained beans. I like to add a little extra Franks to give it a little bit of a sweeter kick. Add your pre-cut jalapenos and stir it in.I like to add about 1/4 cup of quinoa. It thickens up your chili while also giving it a nice texture. I add it in its raw state and let it cook while the chili simmers.. Light your charcoal. I build a small fire underneath the charcoal until it catches fire. 
Once your charcoal is lit, transfer it to the bottom tray.Close the lid and bring your BBQ up to temperature. I smoke at around 275 - 300 degrees F.Safety Tip: Use a charcoal chimney. It's much safer than my approach. . Once your BBQ is up to temperature, bring out your chili. Strain your wood chips and add about 1/2 cup on top of the coals. Once you've added your chips, slide your chili over top. I leave the lid of my chili open so it can really absorb the smoke flavour. An hour of smoking will do, but I like to smoke for 3-4 hours.Close your lid and open your BBQ's bottom vent.You'll want to add more wood chips every 20-30 minutes. When you add your chips give your chili a stir.. It's hot coming off the BBQ so use caution. Top with cheese and enjoy. It will definitely taste better the next day, so bring some to work and make everyone else jealous.It may not look like much, but this smoked chili packs a huge punch. The nice thing about smoking your chili is that you don't need to make it very spicy for it to have a lot of flavour.If you make it, let me know how it turns out!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_189_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_189_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_189_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_189_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Tip #1 Use a deep fryer, it is unlike anything else!First slice tomatoes into equal slices, about 1/3 inch thick. 
Fill deep fryer up to max line with oil and heat to 360 F.. Next set up you breading station. Place flour on a plate and season with salt and pepper. Next place eggs and a dash of milk into a bowl and beat together. Finally take the panko breadcrumbs and season with sea salt, black pepper, garlic powder, onion powder, and parsley; place onto a plate.. Tip #2 Make sure that each tomato is completely coated.Now begin to bread. First season the tomatoes with salt and pepper. Now dredge tomato in flour, dip into egg mixture, let it drip, and coat with panko breadcrumbs. If it is not coated completely dip into egg mixture then back into panko. Repeat this process for all the tomato slices and set them aside. Next cook the bacon until crispy and set aside in warm place.. Tip #3 Brown paper bags work best when draining.Now begin the aioli. Place garlic and parsley into the food processor and pulse until well minced. Next add in mayonnaise, dijon mustard, drizzle of olive oil, squeeze of Sriracha, and season with salt and garlic powder. Blend together on high until ingredients are all incorporated together.. Now begin to fry tomatoes. Carefully place a few at a time into fryer and fry until golden brown. Remove from fryer and set on paper bag until oil has drained off.Place onto plate and garnish with a dollop of aioli topped with bacon crumbles.Suggestions:Serve as an appetizer at any occasion. Great way to use up those unripe tomatoes. 
Serve with a white wine, or a cold beer.***For this recipe and more like it check out my food blog at everythingbutfish.wix.com/etbf\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_190_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_190_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_190_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_190_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Major Pieces Server CabinetGeorge Forman Grill (good will or some similar heating device) Commercial Kitchen Rack and trays 5 volt Power Supply (I used one from an old PC like this) qty(3) Server Cabinet Fans 2\" of rigid insulation around the cabinetControls Arduino Pro by Sparkfun (although any Arduino will work)\u00a0 LCD Button Shield by Adafruit or\u00a0Sparkfun Dallas Temperature Sensors, qty(3) Sparfkun, Adafruit 5Volt relay by Sparkfun or ebay Notes: The relay has to handle 120volts and 10 amps depending on the george foreman grill you get. Just read the specs.. The back of the unit has a piece of wood that creates a 3\" plenum all the way to the bottom. Hot air is warmed by the heater, rises to the top of the cabinet, and is blown by qty(3) fans into the plenum. \u00a0Air is blown down the backside of the unit and exits next to the heater. (see picture). In order to help moderate the radiant heat transfer we added tiles above the grill. (see picture). The front door does not seal to well. 
This allows quite a bit of fresh air to circulate into the cabinet which is key as moisture evaporating from the food needs to escape.. Wiring: 1) LCD Shield and Arduino need 5 volts. 2) wire 5 volts, ground, and digital control pin to George Forman Grill relay 3) Wire Server Fans always ON 4) Wire Dallas Temperature Sensors. One towards the top of the unit, one in the middle, and one at the bottom to get an average temperature across the cabinet.\u00a0. The program is attached below. \u00a0It is a pretty simple program using the Dallas Temperature one-wire library and the LCD Button Shield library. Dallas Temperature one-wire tutorial is here. LCD Shield tutorial is here.\u00a0 The three temperature sensors are averaged and the grill is turned ON and OFF based on the temperature setpoint.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_191_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_191_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_191_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_191_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. About 1 kg tomatoes1 small onion2 sticks celery4 - 5 tablespoons brown sugar2 tablespoons tomato puree1 tablespoon paprika1 tablespoon mustard Up to 2 teaspoons chilli powder2 teaspoons salt0.5 teaspoon ground allspice1 teaspoon tamarind paste 100 ml white wine vinegar. Finely chop the onion and celery. Roughly chop the tomatoes. Mix everything in the slow- cooker. 
Cook on high for 30 minutes, then stir and cook for 8 - 10 hours on low. Cool.. Use a hand blender to puree the sauce. Pass through a sieve to remove the seeds and skins. Re-cook on high for 2 hours to make sure the sauce is sterile before bottling.. Wash the bottles in hot soapy water, then dry in a low oven.Use a ladle and funnel to transfer the sauce to the bottles. Cover immediately. Label when cold.If you want to store the sauce for a long time, it should be sterilised - I have explained a way to do this in another instructable! https://www.instructables.com/id/Sterilise-Preserves-With-A-Sous-vide-Water-Bath/\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_192_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_192_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_192_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_192_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. \nI used a fair amount of golden pear tomatoes, and small plum tomatoes. They are pretty teeny, so I used a bunch... but they are also very sweet tasting tomatoes, which taste fantastic in this sauce. You can use regular tomatoes, though!\nI also used shallots, lots of garlic, olive oil, kosher salt, pepper, and a good handful of a bunch of fresh herbs... basil, oregano, thyme, and rosemary, to be exact. You can use dried, but the fresh ones taste amazing if you happen to have them growing.. 
I cooked everything on the grill, since it's about 1,000 degrees outside and I don't want my kitchen any hotter than it already is.\nI used part of my broiler pan from my oven to cook my tomatoes on... I just wrapped it in foil, since I don't want any of my drippings to escape (I don't want to lose any bit of the yummy!). You can use a foil tray, or a grill pan wrapped in foil, I'd just recommend that you use something with a bit of an edge to prevent oil leaking off the sides and causing flare ups.\nThen I start preheating my grill... I turn it on high and let it get super hot. This isn't really a very particular science. If you're using the oven, I usually roast tomatoes at about 450, and tomatoes this size take about 15-20 minutes. On the grill it takes about 10 tops.. This is pretty simple. You basically want everything about the same size so that it'll all roast at the same rate... so I sliced my plum tomatoes in half lengthwise, and left the pear tomatoes be (minus cutting off nasty spots on a few of them)\nLay all of your tomatoes out on your tray, and load it up with minced garlic, thinly sliced shallots, salt, pepper, and lots of olive oil. (Hold off on the herbs, for now) Toss it all with your hands so that everything is pretty evenly coated.\nThen it goes on the grill!. \nyour tomatoes are ready. And pretty amazing... I'm hungry just looking at this picture.. \nTake it inside and scrape everything into a pot, even the little bits that may stick to the foil. Take a potato masher and smush everything up really well, and use some tongs to pull out the tomato skins (or not, depends on how lazy you are... if you don't I don't think it'd be the end of the world, I know I certainly missed some)\nOnce you have everything mashed up pretty good, add more olive oil. I'm not much for measuring, but I probably added another 1/4 cup.\nMince up your herbs and toss them in, and let everything simmer on low for a good 20 minutes or so, stirring often so nothing burns.. 
\nAnd that easy, your sauce is totally ready to top your pizza! Or even toss in some pasta, that would be pretty good too. Enjoy!\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_193_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_193_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_193_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_193_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Hotdogs - Beef, chicken, pork, mystery meat. Ballpark, fancy, plain, Polish. There are so many styles and any will work. Skewers - Wooden or metal it doesn't matter as long as they are longer than the dogs. KnifeCutting BoardHotdog Fixings. Grab a dog and a skewer. Slowly push the skewer through the center as best you can. They tend to be a bit wobbly so you may have to back it out a little and recenter a couple of times. When you have the meat skewered, place it on the cutting board and grab your knife. Starting with the blade resting on the skewer, just above the top of the hotdog, begin making an angled cut all the way through to the skewer. Continue down the length, turning the dog as you go, cutting to the skewer. Finish the cut completely out of the end. Remove the skewer* and set aside the hotdog. You can use the same skewer to cut all of the dogs. *See Step 4 for an alternate method.. You should cook the hotdogs normally, just be aware that they will probably take less time since there is more exposed surface area. 
Also they are weaker due to the cuts so take care while turning them. Also watch for flames to spring up due to grease and juices from the cuts dripping onto the grill. Not necessarily a bad thing but it will significantly decrease the cook time. To help get the insides crispier, try to carefully expand the coils to expose the cuts. I used a funny little hotdog holder we found somewhere. It actually worked pretty well, although the ends cooked faster then the middle, probably due as much to my grill as the holder.. This is essentially the same except when you finish cutting each hotdog leave it on the skewer. Then, simply grill them on the skewers, remembering to stretch the coils at least a little. This is a good method if you are trying to cook a number of dogs at once and don't have a little holder contraption as they are easier to turn and won't fall apart as easily compared to putting them straight on the grill... One other bonus of the spiral cuts is that they hold condiments and toppings well so load 'em up, or just go simple and really enjoy the crispiness and caramelization. Also because of lost grease drippings a spiral cut dog will be marginally healthier. P.S. It may or may not be worth it to post a picture(s) of your favorite way to eat spiral dogs. 
;)\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_194_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_194_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_194_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_194_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 1/2 pound of bacon2 slices of grain toast1/2 cup crunchy peanut butter1/2 cup raspberry preserve1/4 cup butter1/4 cup blue cheese. Cut bacon into small pieces and cook on medium for 12 minutes. The cook on the bacon is really up to you. If you like chewier bacon then give it less cooking time. If you like crunchier bacon then cook longer.Drain away excess bacon grease. Preserve a small amount of the grease and mix into the butter.. Butter your bread and put butter side down over medium heat. While you wait for the bread to toast, spread peanut butter and raspberry preserve on one side. On the other side, put crumbled blue cheese and bacon pieces.You will notice the peanut butter and preserve side softening and becoming runny. The blue cheese side will become soft but will not melt. Cook until you begin to get a nice crust and golden color on your toast.. Combine both halves into a sandwich. Continue to cook over medium until you have a nice golden crust. Do the same for both sides adding more butter as needed. . Take sandwich off heat. Cut diagonally and serve. Enjoy with your favorite beer of coarse. 
I poured a nice brown ale with mine to cut through some of the richness of this ultra heavy dish.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_195_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_195_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_195_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_195_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. Clean the Lotus Root, peeling with a carrot peeler.\u00a0 Rinse, cut into 2\" pieces and submerge in water with a drop of water or some citrus added. Slice the pieces into thin slices, and cut again lengthwise to make slivers.\u00a0 Put back into the water. As you prepare the rest of the meal, change the Lotus Root water as it becomes darker in color.\u00a0 Expect to change the Lotus Root water two or three times before cooking.. Prepare the carrot in the same manner.. Heat a skillet or wok with 2 tbsps olive oil, 1 tbsp low sodium soy sauce, 1 tbsp brown sugar. *The brown sugar will add a lovely caramelized flavor to parts of the stir fry dish.. Cook the lotus root a few minutes.\u00a0 The texture of the root is a bit tough, and it needs extra time to cook.. After a few minutes, add in the carrot and stir.. Add in about an ounce of Sake and a tablespoon of sesame seeds.\u00a0 Stir.. Cover the pot for a few minutes until parts of the vegetables are caramelized.. If you've got traditional Japanese bowls, use them!\u00a0 It already tastes great, but this will make it taste so much better! 
Here I have the dish pictured with the rest of the meal I cooked!\u00a0 It was absolutely delicious!\u00a0 The lotus root is still crunchy, even after all that cooking! Here's the video:https://www.youtube.com/watch?v=SCghPTWGL-g\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_196_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_196_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_196_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_196_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. I bought my gourd in New Mexico at a charity event and they got all their gourds at an\u00a0Indian reservation. \u00a0\nIngredients\u00a0are for 1 serving. \u00a0\n1 Warty looking gourd.\n1 Package of Raman noodles.\n1 Green \u00a0food coloring.\n1 Battery\u00a0operated\u00a0\u00a0pumpkin carving knife or small hack saw.\n1 Red and 1 Black licorice for eyes and mouth.\n1 sheet of black foam sheet for hat. (Optional. )\u00a0\n1 Plastic\u00a0zip-lock\u00a0\u00a0bag depending on the size of gourd you will be using for inside the gourd.\n1 Small cup of chopped nuts as a garnish.\n1 Custard cup.\n1 Pair of scissors.\n1 marker to mark gourd and craft foam.\nKayle opitional\nI did not glue the embellishments to the gourd because I am using the gourd for a different purpose later. \u00a0\nI don't feel a person would really need to glue them on, \u00a0unless they are making this for children. \u00a0. Isn't this the most beautiful gourd you have ever seen? 
\u00a0\nI plan to dry mine and use it for a decorative bowl for many years. \u00a0\nIf you plan on making this dish, \u00a0I would go ahead and get your gourd and cut it and let it dry out for 3 days.\u00a0\nLike pumpkins it has a smell and allowing it to air out will\u00a0eliminate the smell totally! \u00a0\u00a0\nGourds are very hard! \u00a0Be prepared to work a little for your efforts. \u00a0\nI used a pumpkin carver but it was still very hard. \u00a0\nI did great but was rather surprised even a battery operated carver was hard to cut through this baby.\nThe good news is all you need to carve is the top so the lid will come off. \u00a0\nCut the top of the gourd off, saving the lid.\nScrape out the inside until it is smooth.\nDiscard inside scrapings or . . . .\u00a0\nIf you have a garden area you might save the seeds for next year and plant them and have your own beautiful gourds! \u00a0\n\u00a0After carving a gourd, I have decided unless I am making pies, I will be using gourds for Halloween decorations. \u00a0\nGourds are great about not getting spoiled and you can use them later as a decoration for many years. \u00a0\nIt was difficult to choose which one I liked best because they had bright\u00a0reddish\u00a0orange, striped yellow , very very warty looking ones, and green ones. \u00a0They also came in many different sizes! \u00a0. You will have a few days to work on the embellishments, \u00a0so you won't be rushed at the last minute. \u00a0\n\u00a0I liked Gourdila without her hat the best. \u00a0\nCut out a triangle to fit the gourds face for a hat. \u00a0\nCut 2 small circles from the black licorice for eyes.\nCut 2 eye brows. \u00a0\nI made some hair to go under the hat ( see picture) \u00a0optional. \u00a0Or you can use cabbage or\u00a0Kayle. \u00a0\nCut one small mouth shape \u00a0from the red\u00a0licorice for the mouth. \u00a0The shape and size depends on where the warts fall and how it will look. \u00a0\n\u00a0\u00a0. 
Insert the \u00a0a\u00a0zip-lock\u00a0bag into the gourd bottom.\nPlace nuts in a small custard dish. \u00a0\nHave your embellishments ready.\nMake the Ramen noodles according to the directions \u00a0but add the food coloring at the beginning before adding the Ramen.\nScoop out the Ramen and place it in the gourd.\nPlace the witches hat across the top and put the lid on. \u00a0( Add the Kayle if using it for the hair. )\nEmbellish the lid with the eyes, mouth, hair, and eye brows. \u00a0\nI did not add the juice to mine but you can if you like. \u00a0. When my grandson was younger he could eat Ramen noodles and cereal for every meal. \u00a0\nI wished I had thought of this years ago, it might have been just the thing to make him not want to eat Ramen noodles again. \u00a0\nNo, probably not, he would be just the kid that would think it was totally awesome! \u00a0\nNever the less I hope you have a great time grossing people out \u00a0for this Halloween! \u00a0\nHAPPY HALLOWEEN!\n\u00a0\u00a0\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_197_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_197_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_197_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_197_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. 
Use dry measuring cups for the Flour and liquid measuring cups for the Water4 Cups sifted white all purpose flour1 Tablespoon iodized table salt1 + 1/2 Cups water1 Large Egg Yes I know it seems crazy simple but read on. Mix the Salt and Flour. Add the Egg to the Water and mix them. Now add the Liquid to the Dry ingredients.. Start with a fork but you're gonna have to get your hands into it. I use gloves because it is very sticky and I don't want anyone finding knuckle hair in their food. Just make sure you wash your gloved hands with soap and water before you start mixing to get rid of any rubbery tasting talcum powder. While kneading you will add 1/2 to 3/4 cups more flour to this! This will vary depending on your surroundings, the size of your egg and the moisture content of your flour. Kneed it till it doesn't stick to you and feels firm. About ten minutes. Cover with a damp cloth or plastic wrap and set it aside for at least an hour. I have tried using it early but it makes a tough perogie. Firm is what we are after here not tough. This needs time for the gluten to relax. Go make some filling.. \"A pound? Really?\"\"Yes.\"I know it seems like a lot but most of that is fat that will cook off.Remember to save the bacon fat to cook the onions, sauerkraut and perogies in later. Told you this would kill you young!. Drained Bick's Wine Sauerkraut fried in Safflo Sunflower oil, salted butter and bacon fat (this is Ukrainian vegetarian) add salt and pepper to taste. fry them till they start to dry out and begin to become brown.Fry the yellow onions the same way. You will need at least three large onions. These are added to the potato filling and used as a garnish. I recommend swim goggles cutting onions. These things are brutal. Fry them till golden brown and caramelized. Drain some cottage cheese. You just add some salt and pepper to taste.By the way you also need to peel, cube and boil in salted water at least three large red potatoes. 
Not the brown ones those are too mealy and crumbly, use red potatoes. Mash them with a little milk when done.Right about now everyone who lives near you will be thinking up excuses to drop by to see what you are cooking.. No point dieting here. Go for fresh and high fat.. Mix up half of the cooked bacon, half the cooked onion and the mashed red potatoes. Taste it and add salt and pepper as needed. Fold it. Do not stir.. At this point your dough has been resting about an hour on the counter. It should feel like a nice firm boob. You can throw everything in the fridge and take a rest at this point or carry on. You can even put it off till tomorrow if you like.. Fire up the TV and get cozy. This is gonna take a while. Punch down your dough and get a cutting board. Flour up the board. Start boiling a pot of salted water now too.. I cut chunks of dough the size of a walnut. I've also tried making huge perogies but I prefer normal ones.. I use an old coffee cup from my Grandfather's house. It just seems right.. Add a walnut sized chunk of whatever. You can use the cottage cheese, the potato and bacon or potato and cheese, the sauerkraut. Whatever. I have even seen them with cherries and sugar inside as a desert.. Pinch the edges together. Keep the filling off the edges that will meet so they stick.. Throw them in. Tip first so they don't splash you. Stir the pot so they don't stick to the pot or their brethren.. After they float they are done. You could eat them at this point but really? We already went down the rabbit hole we might as well stick around for tea!. Safflo sunflower oil, bacon fat and salted butter. Honestly it just doesn't get better than this.. The first one is always mine!. Just make a whole bunch! I usually double the recipe but it's a lot of work. If you're just starting only make one type of filling. You can freeze the extra but I really doubt there will be leftovers. 
I put mine in a roaster with the last of the fried bacon, onions, some oil and some butter. My family ends up going thru them all in about a day and a half. Serve with sour cream. I hope you find this useful.\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_198_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_198_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_198_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_198_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_coherence", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: The first image\nB: The second image\nC: The third image\nD: The fourth image", "question": "Select the incoherent image in the following sequence of images.", "context": "Here is the context:\n. \n Gather the following:\n\n\t\t2 Ripe** Avocados\n\t\tVegetable Oil, or preferred frying oil\n\t\tBread Crumbs\n\t\t2 Eggs\n\t\tSalt to taste\n\t\tPepper to taste\n\t\tKitchen Knife\n\t\tSpoon\n\t\tPan\n\t\tTongs (utensil for flipping during Fry step)\n\t\tPaper Towels**NOTE: You want your avocados to be on the firm side of ripe as they will be easier to handle during the breading and egg dipping process.\u00a0. First things first, cut your avocados in half and remove the seeds. Scoop the avocado fruit out of it's leathery shell with a spoon, but make sure to keep the avocado fruit intact.\nPlace the avocado fruit upside down on your cutting board. With a kitchen knife, slice your avocado lengthwise into 3/4 inch (1.905 cm) wide strips.\u00a0. Beat two eggs in a bowl until uniform. These eggs will be used to create the outer coating on the avocados later.\u00a0. 
\n Now repeat the following with each of your avocado slices.\u00a0\n\n\t\tCoat each slice in bread crumbs.\u00a0\n\t\tDip your coated avocado slice into your egg batter.\n\t\tCoat your slice again, making sure that all of the egg batter on your slice is covered.\u00a0I've found that it's easiest to do step 1 (above) with all of your slices, then step 2 (above) with all of your slices, and then finally step 3 (above) with all of your slices. I wasn't following this method at first and ended up with gummy fingers covered in bread crumbs and egg--which gets annoying after a couple slices.\u00a0\nAlternative Bread Crumb Idea: If you are GF or trying to avoid bread and gluten, try with a gluten free substitute or coconut flour and flakes. I haven't tried the coconut but it seems like it could be good!. Pour your fry oil of choice into your frying pan, I chose vegetable oil. Turn on the stove, and let the oil heat up.\u00a0\nOnce your oil is hot, place your breaded avocado slices into the pan. Fry the slices until they are brown on both sides. Flip the slices if necessary with kitchen tongs.\u00a0Alternative to Frying: Bake on a lightly greased cookie sheet in a preheated oven (400F-425F) until browned and crispy for 15-20 minutes.\u00a0. When your avocado slices are nicely browned on both sides, remove them from your pan with kitchen tongs. Set them on a plate lined with paper towels. The paper towels will absorb any oil residue on your slices and keep them from getting soggy later.\u00a0\nAllow your slices to dry on the paper towel, but don't let them get too cold. Fries are good hot!. Serve immediately! These tasty little snacks are great at parties, alongside sandwiches, and as an afternoon snack. They go really nicely with dipping sauces as well. 
They're pictured here with a chipotle dipping sauce (sour cream, lemon, and chipotle chile powder) but I would also try it with the following: cilantro lime, honey mustard, garlic lime, maybe even a soy-sesame, although I have yet to try them all.\u00a0\nFun Avocado Fact: They are also called Alligator Pears\nRead the question below and select from the following choices.\nA: The first image\nB: The second image\nC: The third image\nD: The fourth image", "input_image_path": ["./Discrete-temporal/visual_coherence/visual_coherence_199_0.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_199_1.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_199_2.jpg", "./Discrete-temporal/visual_coherence/visual_coherence_199_3.jpg"], "output": "D", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/visual_correspondence_blink/qwen3-vl/metadata_info.json b/results/visual_correspondence_blink/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..b7743c8 --- /dev/null +++ b/results/visual_correspondence_blink/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_0_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_0_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_1_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_1_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_2_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_2_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_3_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_3_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_4_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_4_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_5_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_5_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_6_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_6_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_7_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_7_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_8_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_8_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_9_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_9_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_10_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_10_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_11_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_11_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_12_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_12_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_13_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_13_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_14_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_14_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_15_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_15_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_16_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_16_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_17_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_17_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_18_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_18_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_19_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_19_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_20_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_20_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_21_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_21_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_22_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_22_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_23_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_23_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_24_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_24_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_25_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_25_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_26_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_26_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_27_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_27_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_28_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_28_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_29_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_29_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_30_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_30_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_31_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_31_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_32_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_32_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_33_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_33_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_34_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_34_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_35_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_35_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_36_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_36_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_37_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_37_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_38_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_38_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_39_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_39_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_40_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_40_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_41_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_41_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_42_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_42_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_43_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_43_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_44_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_44_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_45_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_45_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_46_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_46_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_47_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_47_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_48_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_48_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_49_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_49_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_50_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_50_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_51_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_51_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_52_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_52_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_53_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_53_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_54_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_54_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_55_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_55_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_56_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_56_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_57_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_57_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_58_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_58_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_59_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_59_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_60_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_60_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_61_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_61_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_62_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_62_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_63_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_63_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_64_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_64_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_65_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_65_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_66_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_66_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_67_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_67_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_68_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_68_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_69_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_69_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_70_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_70_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_71_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_71_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_72_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_72_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_73_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_73_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_74_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_74_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_75_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_75_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_76_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_76_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_77_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_77_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_78_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_78_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_79_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_79_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_80_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_80_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_81_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_81_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_82_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_82_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_83_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_83_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_84_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_84_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_85_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_85_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_86_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_86_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_87_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_87_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_88_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_88_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_89_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_89_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_90_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_90_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_91_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_91_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_92_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_92_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_93_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_93_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_94_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_94_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_95_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_95_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_96_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_96_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_97_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_97_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_98_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_98_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_99_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_99_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_100_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_100_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_101_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_101_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_102_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_102_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_103_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_103_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_104_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_104_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_105_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_105_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_106_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_106_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_107_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_107_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_108_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_108_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_109_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_109_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_110_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_110_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_111_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_111_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_112_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_112_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_113_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_113_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_114_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_114_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_115_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_115_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_116_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_116_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_117_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_117_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_118_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_118_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_119_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_119_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_120_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_120_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_121_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_121_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_122_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_122_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_123_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_123_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_124_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_124_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_125_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_125_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_126_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_126_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_127_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_127_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_128_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_128_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_129_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_129_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_130_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_130_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_131_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_131_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_132_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_132_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_133_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_133_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_134_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_134_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_135_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_135_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_136_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_136_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_137_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_137_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_138_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_138_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_139_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_139_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_140_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_140_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_141_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_141_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_142_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_142_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_143_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_143_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_144_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_144_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_145_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_145_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_146_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_146_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_147_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_147_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_148_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_148_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_149_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_149_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_150_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_150_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_151_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_151_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_152_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_152_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_153_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_153_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_154_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_154_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_155_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_155_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_156_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_156_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_157_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_157_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_158_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_158_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_159_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_159_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_160_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_160_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_161_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_161_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_162_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_162_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_163_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_163_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_164_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_164_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_165_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_165_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_166_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_166_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_167_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_167_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_168_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_168_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_169_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_169_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_170_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_170_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_blink", "visual_input_component": "2 natural images", "source": "blink", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_171_0.jpg", "./High-level-obj-semantic/visual_correspondence_blink/visual_correspondence_blink_171_1.jpg"], "output": "B", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/visual_correspondence_scannet/qwen3-vl/metadata_info.json b/results/visual_correspondence_scannet/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..f6cdee8 --- /dev/null +++ b/results/visual_correspondence_scannet/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. 
You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_0_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_0_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_1_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_1_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. 
Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_2_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_2_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_3_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_3_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_4_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_4_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_5_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_5_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_6_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_6_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_7_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_7_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_8_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_8_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_9_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_9_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_10_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_10_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_11_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_11_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_12_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_12_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_13_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_13_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_14_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_14_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_15_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_15_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_16_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_16_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_17_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_17_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_18_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_18_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_19_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_19_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_20_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_20_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_21_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_21_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_22_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_22_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_23_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_23_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_24_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_24_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_25_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_25_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_26_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_26_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_27_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_27_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_28_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_28_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_29_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_29_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_30_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_30_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_31_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_31_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_32_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_32_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_33_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_33_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_34_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_34_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_35_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_35_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_36_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_36_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_37_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_37_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_38_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_38_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_39_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_39_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_40_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_40_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_41_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_41_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_42_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_42_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_43_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_43_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_44_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_44_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_45_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_45_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_46_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_46_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_47_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_47_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_48_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_48_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_49_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_49_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_50_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_50_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_51_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_51_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_52_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_52_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_53_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_53_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_54_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_54_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_55_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_55_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_56_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_56_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_57_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_57_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_58_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_58_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_59_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_59_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_60_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_60_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_61_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_61_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_62_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_62_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_63_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_63_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_64_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_64_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_65_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_65_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_66_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_66_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_67_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_67_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_68_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_68_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_69_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_69_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_70_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_70_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_71_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_71_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_72_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_72_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_73_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_73_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_74_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_74_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_75_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_75_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_76_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_76_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_77_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_77_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_78_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_78_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_79_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_79_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_80_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_80_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_81_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_81_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_82_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_82_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_83_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_83_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_84_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_84_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_85_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_85_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_86_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_86_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_87_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_87_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_88_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_88_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_89_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_89_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_90_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_90_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_91_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_91_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_92_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_92_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_93_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_93_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_94_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_94_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_95_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_95_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_96_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_96_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_97_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_97_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_98_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_98_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_99_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_99_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_100_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_100_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_101_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_101_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_102_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_102_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_103_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_103_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_104_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_104_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_105_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_105_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_106_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_106_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_107_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_107_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_108_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_108_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_109_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_109_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_110_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_110_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_111_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_111_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_112_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_112_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_113_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_113_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_114_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_114_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_115_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_115_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_116_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_116_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_117_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_117_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_118_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_118_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_119_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_119_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_120_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_120_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_121_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_121_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_122_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_122_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_123_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_123_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_124_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_124_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_125_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_125_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_126_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_126_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_127_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_127_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_128_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_128_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_129_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_129_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_130_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_130_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_131_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_131_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_132_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_132_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_133_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_133_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_134_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_134_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_135_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_135_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_136_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_136_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_137_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_137_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_138_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_138_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_139_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_139_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_140_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_140_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_141_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_141_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_142_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_142_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_143_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_143_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_144_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_144_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_145_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_145_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_146_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_146_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_147_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_147_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_148_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_148_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_149_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_149_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_150_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_150_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_151_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_151_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_152_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_152_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_153_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_153_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_154_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_154_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_155_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_155_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_156_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_156_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_157_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_157_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_158_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_158_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_159_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_159_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_160_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_160_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_161_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_161_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_162_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_162_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_163_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_163_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_164_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_164_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_165_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_165_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_166_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_166_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_167_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_167_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_168_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_168_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_169_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_169_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_170_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_170_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_171_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_171_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_172_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_172_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_173_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_173_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_174_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_174_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_175_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_175_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_176_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_176_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_177_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_177_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_178_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_178_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_179_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_179_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_180_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_180_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_181_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_181_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_182_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_182_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_183_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_183_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_184_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_184_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_185_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_185_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_186_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_186_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_187_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_187_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_188_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_188_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_189_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_189_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_190_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_190_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_191_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_191_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_192_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_192_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_193_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_193_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_194_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_194_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_195_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_195_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_196_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_196_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_197_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_197_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? 
Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_198_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_198_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_correspondence_scannet", "visual_input_component": "2 natural images", "source": "scannet", "options": "A: Point A\nB: Point B\nC: Point C\nD: Point D", "question": "Which point is corresponding to the reference point?", "context": "A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\nA: Point A\nB: Point B\nC: Point C\nD: Point D\n", "input_image_path": ["./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_199_0.jpg", "./High-level-obj-semantic/visual_correspondence_scannet/visual_correspondence_scannet_199_1.jpg"], "output": "D", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/visual_ordering/qwen3-vl/metadata_info.json b/results/visual_ordering/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..eb6c2a3 --- /dev/null +++ b/results/visual_ordering/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [3, 2, 1, 0]\nD: [1, 3, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
- Three potatoes- One onion- Three tomatoes- Three chilies- Two pounds of basmati rice (NOTE: other types of rice not recommended)- One whole chicken- One tomato paste- One tahini - One lemon. - One tablespoon of salt- Three pay leaves- Three cloves- Three teaspoons of cumin- One tablespoon of chili powder- Two teaspoon of grounded Coriander- One-tablespoon ground ginger- Five teaspoon of cayenne paper. - Two teaspoons of black paper- Two Magie cubes for better taste.. - Wash all the ingredients with water and salt to take out the dust. Clean the chicken by adding rough sea salt while scratching it to eliminate the grass.- Cut the potatoes into cubes of medium sizes. Cut the onions in very small sizes to save you time when cooking. - Cut the three tomatoes in relatively large cubes. Cut two of the green chilies into very small pieces and leave the third for the representation. - Place the two-pound rice in a big bowl and wash it with cold water by shuffling the rice. Leave the rice aside to absorb the water. - Cut the chicken in quarters and take out the skin. Add species to the chicken for flavor.. - Turn up heat to medium temperature, add have cup of vegetable oil in a deep large pot. Stir oil till it worms up- Add onions cubes into pot and stir fry till they become golden. Add chicken flavored bouillon. - Place chicken slices into pot and stir frequently until they are half-cooked and place them on a plate to cool down on the side.. - Put potatoes cubes into the pot and add boiling water until potatoes cubes are completely submerged.- Close the pot with lid and let boil until potatoes are cooked for half an hour. - Put chicken back into the pot after potatoes are cooked; add tomatoes slices, tomato paste into the pot to be cooked with potatoes.- Add bay leaf (3) into pot.- Add 2 chilies strips into pot. - Add seasoning (coriander, chills, grounded ginger, 4 cloves)- Pour grain into pot.- Add water until rice is submerged at a depth approximately 1 inch.. 
- Add 5 tablespoons of tahini into bowl.- Squeeze 3 lemons into the same bowl.- Add sea salt and black peppers and some paprika (1 teaspoon each) into the bowl.- Add hot boiling water gradually into the bowl while stirring till it become smooth.- Place tahini into freezer while preparing kabsa for best taste and result.- Serve cold tahini sauce as a side with the kabsa.. -Transfer rice to plate with chicken on top. Serve with tahini sauce.. bon appetit\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [3, 2, 1, 0]\nD: [1, 3, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_0_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_0_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_0_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_0_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [3, 0, 2, 1]\nD: [1, 3, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Cake Ingredients: 1 cup butter-softened 1 1/2\u00a0cups sugar 4 large eggs 3 cups all-purpose white flour 3 tsp. baking powder 1 tsp. salt 1 cup milk 2 tsp. vanilla extract Paper baking cupcake liners Vegetable cooking sprayYield: 24 cupcakes or 2 dozenDirections: 1. Preheat oven to 350F. Prepare cupcake tins with liners, set aside. 2. Beat butter and sugar at medium speed with an electric mixer until creamy and smooth. Then add eggs, 1 at a time, mixing until well blended after each egg. 3. Combine flour, baking powder, and salt together in a small bowl. Add to butter mixture alternating with milk. You should begin and end with the flour mixture. Mix at low speed until bleneded. Add in vanilla extract. 4. 
Spoon cake mix into cups, filling 2/3 full. 5. Bake at 350 for 12-15 minutes or until toothpick inserted comes out clean. 6. Cool in pans on wire rack for 10 minutes, remove cupcakes from pans and set on wire racks to completely cool.. Filling Ingredients: 1 8oz cream cheese-softened 1/3 cup powdered sugar 2-3 Tbsp. coffee liqueur(Kahlua) or 1 Tbsp. coffee extract 1/8 tsp. saltYield 2 CupsDirections: 1. Combine all ingredients in a medium bowl, mixing until well blended. Store any remaining filling in container in refrigerator-up to 2 weeks.. Once cupcakes are completely cooled, cut tops off of the cupcakes using a serrated knife. Then spread 1 Tbsp. of Tiramisu Filling on the bottom part of the cupcake, gently place the top back on. . Frosting Ingredients: 1/2 cup butter-softened 1 8oz cream cheese-softened 2 16oz packages powdered sugar 1/4 tsp. saltYield 5 cupsDirections: 1. Beat butter and cream cheese at medium speed until creamy and smooth. 2.\u00a0Gradually add in the powdered sugar and salt, mixing at low speed. Beat at high speed for 2 minutes until creamy and thick. 3. Frost each cupcake by using a spatula, knife or piping bag and tip. . For the finishing touch dust/sprinkle with Hersheys Cocoa Powder. . After all your hard work, you can now enjoy your Tiramisu Cupcakes! 
Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [3, 0, 2, 1]\nD: [1, 3, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_1_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_1_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_1_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_1_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 2, 3, 1]\nD: [2, 0, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This recipe makes 2 dozen small cookies.Butter - 1/2 CupBrown Sugar - 1/2 CupAll Purpose Flour - 3/4 CupMilk - 1 1/2 TablespoonChocolate Chunks - 1/4 Cup or moreVanilla Extract - 1 TeaspoonSea Salt - 1/2 Teaspoon + extra for sprinklingYou could use chocolate chips instead of chunks. For the chunks I just chopped up a bar of dark chocolate. Any chocolate would work, but i recommend not to use chocolate that has a cocoa solid content higher than 65%. These would be amazing with hazelnuts in them too!. 1. Place the butter in a pan and melt it over medium-high heat. Cook the butter till it becomes golden brown in colour. Immediately take off the heat. Let cool till it solidifies. The butter is ready to use once it solidifies.. Cream the brown butter till it is light and pale. Add the sugar and cream it till the mixture is light and creamy and the sugar has mostly dissolved. Mix in vanilla essence, 1/2 teaspoon salt and the milk. Combine well.Add the flour and 3/4 of the chocolate chunks and mix this till it comes together in a smooth ball. Refrigerate for 15 minutes.. Take 1/2 tablespoon of the dough and roll into balls. 
Place onto baking paper and flatten them lightly with the back of a spoon. Push the remaining chocolate chunks into the dough and sprinkle with the sea salt.. Bake these cookies in an oven that has been preheated to 350F (180C) for 15-20 minutes until the edges are light golden. Cool them slightly before moving them to a cooling rack and devouring them. Nom nom!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 2, 3, 1]\nD: [2, 0, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_2_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_2_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_2_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_2_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 1, 2, 3]\nD: [2, 1, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Here's the basic recipe.\u00a0 Start by making this:\n(this is essentially a double batch...it will yield about 70-75 cookies...depending on variety)\nYou will bake 375* F for 8-10 minutes...so go ahead and preheat the oven!\n3 sticks of Butter (1 1/2 Cups)\n1 1/2 Cup Brown Sugar\n1 Cup granulated Sugar\nMix and cream\nThen add:\n3 eggs\n2 tsp Vanilla\nmix together\nThen add:\n1 tsp Baking Soda\n1 tsp Salt\n2 Cups Oats\n4 Cups Flour\nMix together. Here's what we're going to do.\nAdd 2 full bags of chocolate chips.\nI used the top 2...dark chocolate and mini's...the melts were used later.\nThese are loaded.\u00a0 But essentially it is a double batch...so 2 bags is just right!. 
Split up the dough into 5 bowls.\nEach bowl will make a different variety of chocolate Overlord cookies!\n(yes, these aren't just overload cookies...they are Overlord Cookies). First:CHOCOLATE CHIP COOKIES!\nRoll out balls of dough about the size of a ping pong ball.\nWe did 15 per tray. Tray is lightly greased...but I don't think it needs it.\nBake 8-10 minutes at 375 F.\u00a0 I do 8 minutes\nThen remove them and allow them at least 5 minutes before touching them!\nThey need to completely finish cooking...they will be gooey until they cool.\n(don't judge my pans...if you know how to clean them up perfectly...\ncome over and clean them, I will give you cookies!). Next we add some fun!. SMORE'S COOKIES!\nMake a tray of regular cookies.\u00a0 Bake 8 minutes\nPull out of oven and while gooey, place 3 marshmallows on\ntop with one baking melt chocolate disk for looks!\nThen pop them under the BROILER for just a minute or\ntwo until the marshmallows are toasted!\nGolden Perfection!. COOKIES AND CREAM\nStart with your cookie dough and oreo cookies...\nwrap an oreo completely in a \"thin\" layer of cookie dough, covering it completely!\nThese turn out quite large!\u00a0 We fit 8 on one pan.\nThey bake up perfectly with all that oreo goodness inside!\nThese were way better and bigger than I expected!. SWEET AND SALTY\nTake the Chocolate Chip cookie dough and add broken up pretzel sticks to it!\nMakes a sweet and salty awesome flavor!\nRoll out and bake the same as the regular cookies!. TURTLES\nBake a batch of regular cookies, like the smore's ones.\nPull out after 8-10 minutes and lightly press a pecan or two on top.\nThen drizzle with caramel topping!\nLet cool at least 5 minutes before plating!. 
Then plate up all your gourmet cookies!\nAdd some little name sticks so your guests know what they are getting into!\nOkay, so yes...you did the math right.\n15 cookies of each variety except the Cookies 'n Cream...only 8 of them\nGrand total: 68 cookies!\nAwesome spread for 1 simple batch of cookies!\nIn a blind taste test...the 8 year old and 10 year old loved the\nCookies and Cream the best!\nFollowed closely by the Smore's!\u00a0 :). Best part about these cookies is they FREEZE!\nThe dough freezes, the cookies freeze...you don't have to eat them all in one night!\u00a0 And they taste good frozen!\nNow you can have a party spread with only the time spent making a batch of cookies!!!\nThanks for viewing, which one do you think you'd like the most???\nVote for me in the Cookie Contest...I'll make you some cookies!\u00a0 :)\nCheck out my blog for more silliness!\u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 1, 2, 3]\nD: [2, 1, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_3_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_3_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_3_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_3_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [1, 2, 3, 0]\nD: [3, 1, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
-2 TOMATILLOS\n-4 LARGE RED TOMATOES\n-1 LARGE ONION\n-2 LB BEEF ROAST ( I USE CHUCK ROAST )\n-1 BIG BAG DEHYDRATED RED PEPPERS (THIS CAN BE ANY DEHYDRATED PEPPERS YOU PREFER, I LIKE THE SMALL RED ONES, SPICY!)\n-10 FLOUR OR CORN TORTILLAS\n-1 BUNCH OF CILANTRO\n-24OZ WHITE CHEESE OF YOUR CHOICE ( I USE BABY ANEJO ENCHILADO MEXICAN STYLE FARMER'S CHEESE)\n-2 TBS GARLIC SALT\n-2 TBS OIL\n-2 TBS CUMIN. SIMMER THE BEEF ON LOW/MEDIUM HEAT FOR 3 HRS.\nADD A HANDFUL OF THE RED PEPPERS ONE HOUR IN.\nPOUR AS MANY OF THE REMAINING PEPPERS AS YOU CAN HANDLE INTO A SEPERATE PAN OF BOILING WATER AND BOIL UNTIL SOFT. (ABOUT 30 MINS)THE AMOUNT OF PEPPERS WILL DETERMINE HOW SPICY THE SALSA IS. I USE ABOUT 6 BOILED SMALL RED PEPPERS.\nREMOVE THE BEEF WHEN FINISHED AND PULL APART WITH A FORK.\nSTIR IN CHOPPED ONION.. TO A BLENDER, ADD TOMATILLOS, TOMATOES, ONION, CILANTRO, CUMIN, AND A LITTLE GARLIC SALT. ADD BOILED RED PEPPERS AND LIQUIFY.\nPOUR SALSA INTO A PAN AND SIMMER FOR 15-20 MINUTES.\n. HEAT OIL IN A PAN, AND ADD TORTILLAS ONE BY ONE FOR ABOUT 10 SECONDS EACH SIDE.\nDO THIS UNTIL ALL TORTILLAS ARE FRIED, A LITTLE CRISPY, LIGHT BROWN.\n. POUR ABOUT 1/4 CUP OF THE SALSA INTO THE BOTTOM OF A BAKING PAN, SET ASIDE.\nDIP EACH FRIED TORTILLA INTO THE SIMMERING SALSA AND MOISTEN EACH SIDE CAREFULLY. 
PLACE ON A SEPERATE PLATE.\nADD BEEF AND CHEESE TO MOISTENED TORTILLA AND ROLL UP.\nARRANGE IN BAKING DISH.\nTOP THE ENCHILADAS WITH REMAINING SALSA AND SPRINKLE WITH CHEESE.\nBAKE 350 FOR 20 TO 30 MINUTES, UNTIL BUBBLY AND CHEESE IS MELTED.\nEAT AND TRY NOT TO BURN YOUR TONGUE TASTELESS!!!\n\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [1, 2, 3, 0]\nD: [3, 1, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_4_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_4_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_4_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_4_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [2, 1, 0, 3]\nD: [0, 1, 3, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \nAbout 3 dozen Cherry Tomatoes\n1 pound Bacon\n1 small/medium Onion\n1 stalk Celery\n20 leaves of Spinach\n3/4 cup Miracle Whip (or what ever salad dressing you prefer)\n1 teaspoon Olive Oil\n1/2 teaspoon fresh Parsley\n3 tablespoons Grated Parmesan Cheese. 
First, start out by cooking the bacon.\u00a0 After bacon is cooked, put it on a plate covered with paper towels.\u00a0 Take some extra paper towel and pat the top of bacon to try to get off as much grease as you can.\u00a0 Then, chop up the bacon, put into a bowl and set aside.\nNext, dice the onion and celery.\u00a0 Put into a small skillet and saute' in 1 tablespoon of olive oil for about 10 minutes on medium heat.\nThen, dice the leaves of spinach.\nFinally, to prepare the tomatoes the first step is to cut off the very top.\u00a0 Be sure not to cut off too much.\u00a0 then take a small spoon and carefully take out the seeds and insides.\u00a0 Discard them when done.\u00a0 Once that is done then place the tomato upside down on some paper towel to let them drain.. \nFirst, add the sauteed celery and onions in the bowl with bacon and mix together.\u00a0 Then, add the parsley, parmesan cheese and spinach leaves.\u00a0 Mix well.\u00a0 Next, add the mayo and mix it all together.. The final step is to stuff the tomatoes.\u00a0 All you do is take a half spoonful of bacon mixture and spoon into tomatoes.\u00a0 Careful not to over stuff the tomatoes.\u00a0 Then arrange them on a plate to display for company, as appetizers, or with a favorite side.\nPlace in refrigerator for 1 to 2 hours before eating.\nHope you enjoy them.\u00a0 They are so yummy.\u00a0 Thank you.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [2, 1, 0, 3]\nD: [0, 1, 3, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_5_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_5_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_5_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_5_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [0, 3, 1, 2]\nD: [2, 0, 1, 
3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. The biscuit (cookie):2 2/3 cup flour1 cup sugar1/2 teaspoon salt1/4 teaspoon baking powder1 cup butter, room temperature1 egg yolk2 teaspoons vanilla extractThe filling:1 cup shortening5 cups confectioner\u2019s sugar1 teaspoon vanilla extract1 packet unflavored gelatin1/4 cup water1 jar red jam (any flavor)Special supplies:cookie stampblack food coloringedible gold luster dustvodkapaintbrushbutcher\u2019s/parchment paperblack sealing waxwax stamp. Using a beater attachment, blend dry ingredients in a mixer (the dough will become too stiff for a whisk; believe me I tried, as you can see in my image).Then, mix in butter until ingredients are crumbly.Add egg yolk and vanilla.When stiff dough forms, gradually add black food coloring.Once fully blended, divide dough in half, wrapping both halves in plastic wrap, and chill for at least one hour.Tip: If you would prefer not to use an artificial food coloring due to allergies or preference, you can achieve a similar darkness with cocoa noir, which will add a delicious chocolaty taste. Simply add your preferred amount to the dough while it\u2019s still in the mixer. If the dough becomes too dry in doing so, add milk half a teaspoon at a time until cocoa is evenly blended but dough is still stiff.. On a floured surface, roll out one of your two balls of chilled dough to no thicker than 1/4 inch. 
The thinner the better; I rolled mine to as close as 1/8 inch as I dared.At this point, I used my cookie stamp to impress my skull and crossbones design all over the dough.To actually cut the dough, I used the lightly floured rim of a jam jar.Transfer cut biscuits to a cookie sheet lined in parchment, and bake in a 350 degree (F) preheated oven for 8\u201310 minutes.Let cool for two minutes on cookie sheet and then transfer to cooling rack or parchment to cool completely.Tip: Try keeping as clean a surface as possible on the dough while rolling. White flour will sometimes still show up on the black cookie even after baking.. Repeat the previous step with the second chilled ball of dough, though this time, you don\u2019t need to use the stamp, as these will be the bottoms of the sandwiches. Simply roll the dough and cut the circles with the same jam jar as before.Transfer cut biscuits to a cookie sheet lined in parchment, and bake in a 350 degree (F) preheated oven for 8\u201310 minutes.Let cool for two minutes on cookie sheet and then transfer to cooling rack or parchment to cool completely.. This step can be skipped if you decide to not stamp the biscuits or if you don\u2019t want to add the gold color to the stamped relief.After the biscuits have cooled completely, prepare the gold luster dust for painting onto the stamped design by adding a very small amount (I used the end of a butter knife to dip out perhaps 1/8 teaspoon from its vial) into a separate dish.Gently add vodka one drop at a time to make a paint (to control the addition of the vodka, I poured some into a shot glass, then dipped a paintbrush into the shot glass to retrieve a drop of liquid at a time to add to the luster dust).Once you have made a paint with the gold luster dust and vodka, you can use the paintbrush to gild the design on the biscuits. 
You may need to add a few drops of vodka to the luster dust while you work, as the vodka will evaporate quickly.Once you have finished gilding all the biscuits, allow them to dry completely while you complete the following step.Tip: You can use any clear grain alcohol for this step, but it must be alcohol\u2014not water. The alcohol acts as a carrying medium for the luster dust and will evaporate quickly once you\u2019ve painted the biscuit, ensuring that the biscuits do not become soggy.Important: The gold you\u2019ve painted onto the biscuits will be somewhat delicate, so in the following steps, handle the painted biscuits carefully so as not to dull or rub the gold off completely.. Dissolve unflavored gelatin in cold water.Heat in a cup until clear (or as clear as you can get it), then let cool.Using a beater attachment (the filling will be too stiff for a whisk), cream together shortening, confectioner\u2019s sugar, and vanilla extract.Add the gelatin, and beat until smooth. (This may take up to ten minutes.)Once filling is smooth, spoon it into a pastry bag (I used a resealable plastic bag that I cut the corner from).Add jam to a separate pastry bag (or resealable plastic bag).. You will be building the sandwich from the bottom up. With a bottom biscuit (the undecorated ones) in one hand and the pastry bag of filling in your other, apply the filling in a generous ring to the biscuit, leaving the center free of filling.Then, exchange the filling for the jam and fill the center of the ring with the jam. This will be the \u201cbloody\u201d surprise.For each biscuit bottom you do this to, gently press on one of the decorated top biscuits. Set aside and allow to set 30\u201460 minutes before wrapping.. The Victorians often printed obituaries, comforting poems, and biblical verses onto the wrappers of their burial biscuits in order to comfort those taking refreshment in them. 
While I did not do this to mine, the sky really is the limit on the possibilities this custom affords. Perhaps you could include spooky fortunes or the fictional death dates of your guests brave enough to try these elegant sandwiches. In any case, wrapping the biscuit is relatively straightforward.I created a simple envelope from paper (I recommend butcher\u2019s paper or parchment because regular paper will likely show oil stains after some time from contact with the butter in the biscuits).Secure the lip of your envelope by melting the black sealing wax onto its edge.Then gently impress the wax stamp for approximately five seconds or until the wax is cool.Gently remove the stamp and put the wrapped sandwich aside. Repeat with all the sandwiches.Tip: The paper envelope needn\u2019t be any fancier than you want it to be. You could even tear the edges to give the presentation a rougher look. In order to make my envelope, I used my cookie stamp in place of a real sandwich to create a template envelope, which I then used as a template to trace and cut the rest of the paper envelopes. This way I didn\u2019t ruin a sandwich in the process.. These burial biscuits are perfect for setting out at gatherings, as they needn\u2019t be refrigerated to stay together. However, if you make these ahead of time, I suggest keeping them in an airtight container in the fridge until you need to set up for the gathering. If you have any leftover, they will keep for 2\u20133 days this way in the fridge.Because these burial biscuits were at one time solely provided at funerals, don\u2019t be afraid to give them prominence amid your other snacks. I staged mine with a photograph of the deceased to honor his memory, a memorial cabinet card, and other somber, spooky d\u00e9cor.Best of luck! 
\u2013Brandon\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [0, 3, 1, 2]\nD: [2, 0, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_6_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_6_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_6_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_6_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [1, 2, 3, 0]\nD: [2, 0, 1, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Cupcakes:2 1/2 cups of all purpose flour 2 teaspoons of baking powder 1/2 teaspoon of baking soda 1/2 cup of softened butter 3 eggs 1/2 cup of milk 1/2 teaspoon of salt 1 cup of sugar 1 teaspoon of vanilla extract 1/2 cup of vegetable oilFrosting:3 tablespoons of softened butter 1 egg white 2 cups of powdered sugar 1/2 teaspoon of vanilla extract 1/2 tablespoon of waterGummy Decorations:A little bit of frosting (set aside from the frosting recipe) Graham crackers Green and orange fruit slices Green and orange gumdrops Peach rings ToothpicksEquipment:Cooling rack Mixing bowls of all sizes A non serrated knife or clean food safe scissors Hand or stationary mixer Cupcake pan Cupcake liners Rubber spatula Storage container Plastic sandwich bag Plate Hot pads. Preparation:Preheat the oven to 350 degrees. Set out cupcake pans and line them with the cupcake liners. Set aside the readied pan for later.. Dry Ingredients:Stir the flour, baking powder, baking soda, and salt together in a medium bowl.Wet Ingredients:Mix together the milk, vegetable oil, and vanilla extract in a small bowl. Set aside for later use. 
In a separate large bowl, use the mixer on a low to medium speed to beat the butter and sugar together. Tip: It should look light and fluffy.Mix in the eggs one at a time in the same bowl.Tip: Make sure to fully incorporate each egg before adding the next.Combining the Ingredients:Use the same large mixing bowl from before to alternate adding small parts of the dry mixture and the wet mixture that was previously combined. Tip: Again make sure you full incorporate each addition to the batter before you add more. Tip: The consistency of the batter will vary between thick and thin as you alternate the dry and wet ingredients respectively.. Pour the Batter and Bake:Scrape the sides of the bowl with the rubber spatula to make sure everything is mixed well together. Pour or scoop the batter into the lined cupcake pans. Tip: Fill each cupcake until it is no more than 3/4 full. Start with half full then go back and fill them again until you run out of batter making sure they are evenly filled.Place Cupcakes in the center rack of the oven Warning: Chance of burns when putting cupcakes in and out of the oven, use oven mitts.Bake for 15 to 20 minutes (depending on your oven). Tip: A good way to check if cupcakes are done is to stick a toothpick in the middle and then take it out to see if it comes away clean then its good, if it has batter on it then they are not done.Place cupcakes aside on a cooling rack to cool before decorating. . While your cupcakes are baking and cooling you can make the frosting.Frosting:Get a medium sized bowl. Put the sugar, butter, water, vanilla, and egg white in the bowl. Slowly mix together the ingredients until the powdered sugar doesn't fly everywhere. Mix the ingredients at higher speeds until it is light and fluffy. Set aside a small amount of frosting to act as glue for the eyes of the animals.Color and Texture:Put graham crackers into a bag and crush them up until they are at a sand like consistency. 
Tip: Start off with around three graham and add more if needed.Pour the crushed graham crackers onto a plate so they are ready for the decorating steps. Set aside some frosting to dye blue to have an ocean colored frosting to go on the cupcakes.. Sand:Frost the top of the cupcake. Roll the frosted cupcake in the plate of crushed graham crackers. Turtles:To make one turtle you need a peach ring, two gumdrops, and a fruit slice.Put one of the gumdrops into the center of the peach ring for the body of the turtle. Place the other gumdrop on the outside of the peach ring for the head of the turtle. Cut the fruit slice in half. Take one half and cut along the middle the long way (see image above).Cut those two new half's in half again but this time at an angle along the short way (see image above).Cut three smaller triangles one equilateral and two isosceles these will make the tail and the two back fins.Place the pieces of the turtle onto the frosted cupcake.Crabs:To make one crab you need two gumdrops, one fruit slice, a half of peach ring, and two toothpicks.Cut the tooth picks in half so you have four pieces of toothpicks. Stick the two gumdrops on two of the toothpicks, one gumdrop per toothpick piece. These will be the eyes of the crab. Stick the other ends of those same toothpicks into the top of the fruit slice (the flat side). Place two drops of frosting on the gumdrops, one on each eye.Stick the candy eyes onto the now frosted gumdrops.Cut a peach ring in half.Cut the half of peach ring in half again making two claws for the crab. Stick the two claws onto the ends of the remaining two toothpicks. Stick the toothpicks into the sides of the fruit slice. Place the curved side of the fruit slice on to the frosted cupcake.Tip: If the crab is having trouble staying on the cupcake then use another toothpick in the bottom to stick into the cupcake for stability.. 
Now that you're finished, you should have fun and cute beach/ocean themed cupcakes like the ones above for any event! Have fun sharing this delicious tasting desert while impressing your friends and family.Optional:To save on time try using store bought cupcake box mix and frosting, or invite others to bake with you!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [1, 2, 3, 0]\nD: [2, 0, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_7_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_7_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_7_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_7_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [0, 3, 1, 2]\nD: [2, 1, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients & Tools Martini glass Malibu (1.0 oz) Cake vodka (1.0 oz) Blue curacao (Just a splash!) Slice of Lemon Ice Cocktail shaker Mixing Mix your cake vodka, Malibu, and splash of blue curacao into the cocktail shaker with ice. Shake and strain into your martini glass and garnish with a slice of lemon.Cutting Your Lemon To make your presentation match the image, cut a thin slice of lemon in half and then place the blade of your knife between the knife and the pulp of the fruit, separating the fruit from the rind just barely. Do this up to the halfway point and then wedge your new lemon slice onto the lip of the cocktail glass.\n . 
Ingredients and Tools Jaegermeister (1.0 oz) Black Spiced Rum (2 oz) Ginger ale or ginger beer (top off remaining glass) Ice Lime wedge Low ball glass/tumbler Mixing Pour your Jaeger and rum into a low ball glass filled with ice, stir, then top off with ginger ale/ginger beer. Garnish with a lime wedge.\n . Ingredients and Tools White rum (1.5 oz) Peach Schnapps (0.5 oz) Pink Lemonade (top off remainder of glass) Ice Orange slice Maraschino Cherry High ball glass Tooth pick Mixing Pour your rum, pink lemonade, and peach schnapps into a high ball glass with ice and stir. Garnish with a pinwheel. Best enjoyed through a straw.What is a pinwheel?! It's that thing in the picture, silly! Just take your tooth pick, one maraschino cherry, and an orange slice and follow along! Wrap your orange slice around the maraschino cherry as far as you can. Place your tooth pick through your orange wrapped cherry so that it looks like the picture! Simple, no? There are many variations on the pinwheel. This is just the one I was taught.. Ingredients and Tools Black cherry juice (3.5 oz) Gin (1.0 oz) Blackberry Brandy (0.5 oz) Tonic water (2.5 oz) Cherry pop rocks Conical flask (250 ml) Prepping your Flask -Wet the lip of your flask thoroughly with tap water. -Pour your pop rocks onto a small plate -Rub your wet flask into the pop rocks until the rim is evenly coated.Mixing First line the rim of your flask with pop rocks, then pour in all of your liquid ingredients. Give it a good stir, drink, and save Townsville!I know that tonic water glows under black light, but I'm not sure what it would look like with all of these ingredients mixed in. 
If anyone has the means to take a picture of Chemical X under black light, please post in the comments!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [0, 3, 1, 2]\nD: [2, 1, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_8_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_8_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_8_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_8_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 3, 2]\nD: [1, 2, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. First things first: Go Wash Your Hands! Cleanliness is always appreciated when cooking.Ingredients for the Cake\n1 1/2 Cups Butter (3 sticks) - room temperature\n1 Cup White Sugar\n1 Cup Brown Sugar\n3 Large Eggs - room temperature\n2 1/2 Cups Self Rising Flour\n4 Medium (or 3 large) Apples\n1 Cup Nuts (chopped) - walnuts or pecans are my favorite\n2 teaspoons Vanilla Extract\n2 teaspoons Cinnamon\n3/4 teaspoon Ground Cloves\n3/4 teaspoon Ground NutmegIngredients for the Caramel Glaze:\n1/2 Cup Butter (1 stick)\n2 teaspoons milk\n1/2 Cup Brown Sugar. You are going to need:\nA mixer, or a mixing bowl and a strong arm\nMeasuring Spoons\nMeasuring Cups\nKnife (not shown)\nPeeler (I love mine since it cores, peels, and slices all at once)\nCake Pan of some sort\nSome spray oil\nZiploc bag (or something like it)\nOptional\nA friend to do the dishes. Go ahead and preheat your oven to 350 degrees Fahrenheit .\nPeel, core, and cut your apples into small pieces. Place those in the Ziploc bag and set them aside for now.\nIf your nuts aren't chopped, chop them up. 
Place those in the Ziploc bag too.\nIn your mixer, cream 1 1/2 Cups of room temperature Butter (3 sticks), 1 Cup of white granulated Sugar, and 1 Cup of Brown Sugar.\nFirst the butter and sugar will lump up into a ball. Keep mixing until the butter and sugar are completely incorporated and usually they'll stick to the bowl. The finished product should be like grainy peanut butter.. Next you will need to add your spices and eggs to the mixer.\nAdd 3 large eggs, 2 tsp of vanilla, 2 tsp of cinnamon, 3/4 tsp ground cloves, 3/4 tsp nutmeg.Mix Mix Mix\nStart mixing slowly so that the powdered spices don't just fly out. Speed it up to get everything incorporated.. Get your bag of apples and nuts out.\nPut a little flour in the bag, about 1/8 of a cup or just enough to lightly coat everything.\nThe flour will help keep the apples and nuts from sinking into the batter while it cooks.\nZip the bag up and\nSHAKE SHAKE SHAKE\nSHAKE SHAKE SHAKE\nShake your baggy\nPut the floured apples and nuts into the mixer and give that a good mix to get everything incorporated.. Get your measuring cups and your self rising flour out.\nAdd 1/2 to 1 Cup of flour at a time and mix it into the batter.\nKeep adding until you have a total of 2 1/2 Cups of Self Rising Flour added to the batter.\nTurn your mixer up and mix well.\nYour batter will change from dark lumpy goop to light, slightly fluffy goop. Your batter will also be rather thick, so don't get flustered by it. \u00a0It will be easy to spoon out, not runny like a typical yellow cake mix.. If you haven't already, get your cake pan out.\nSpray some oil into the pan and dust it with flour. You could alternatively use shortening or even more butter.\nShake out the excess flour since you don't want flour lumps on the outside of your cake.\nSpoon your cake batter into the pan and spread it evenly.\nMake sure you also bang your cake pan down a few times to release any trapped air bubbles.. 
Put your cake pan on the middle rack of the oven and close the oven door.\nYou'll need to cook this cake for approximately 1 hour and 10 minutes, so set your timer.\nI like to put this particular cake pan on a cookie sheet so that the bottom doesn't overcook/burn.. While you wait for your cake to bake find that optional friend.\nAsk him or her if he or she would like a big piece of apple cake.\nIf he or she says yes, tell them they'll have to work for it by washing all those dishes you just made.\nIf you aren't lucky and/or don't have an optional friend to wash dishes, then you just found something to do for the next hour.DO NOT skip ahead and make the Caramel Glaze while the cake cooks. It sets quickly and by the time the cake is done all you'll have is a pot of really thick and almost unusable caramel goop. It will taste good but you'll ruin your dinner and you'll have to make it again anyways.Other things to read about:\nThis cake recipe can be made with all white granulated sugar. I like the added taste of the brown sugar.\nThis cake recipe can be made with cooking oil instead of butter. I like the taste of butter.\nThe Caramel Glaze still needs to be made with butter and brown sugar.Substitutions:\nThe nuts are optional, they just taste good.\nIf you do not have brown sugar you can make your own:\nTo make one cup of packed light brown sugar\u00a0mix together 1 cup of white granulated sugar and 1 1/2 Tablespoons of molasses. For dark brown sugar use up to 4 Tablespoons of molasses.. Your timer should be beeping/ringing/whatever it does and that means you need to check your cake.\nOpen up your oven and poke the cake with a broom straw, or really long toothpick, or if you're lucky you have one of those metal cake testers. I personally used an ice pick.\nThere should be no cake on the pick when you take it out. Don't be alarmed if you poke an apple. The top of the cake should be browned and cracked. 
The edges of the cake should have pulled away from the sides of the cake pan.\nIf your cake does not meet these criteria, use your judgement and put the cake back carefully. Watch it closely until you deem it done.\nI like to turn off the oven and leave the cake in for another 10 minutes so that a nice crust forms.\nOnce the time is up take your cake out and put it on a cooling rack.. While your cake is resting get your pan, a spoon of some sort, 1 stick of butter, 2 teaspoons of milk, and 1/2 Cup of packed brown sugar.\nPut the ingredients in the pan, turn your burner on medium high, and begin to stir.\nKeep stirring.\nBring the mixture to a low rolling boil and keep stirring for about 2-3 minutes. You want the sugar crystals to completely dissolve and the Caramel Glaze to slightly thicken.\nTake the glaze off of the heat and go back to your cake.. For the first glazing, you will want to poke holes in your cake (This is another reason I used the ice pick). This is optional but highly recommended.\nSlather some of the hot Caramel Glaze onto the top of the hot cake and let it sink into the holes.\nYou can also paint the sides of the cake at this time.\nDon't use all of the glaze up though. Save some of the glaze and set it aside to cool and thicken.. After a few minutes the rest of the Caramel Glaze should be cooler and thicker.\nApply the glaze to the top of the cake.Congratulations\nYou've just finished making a delicious Fresh Apple Cake with Caramel Glaze.\nI find that this cake is best after it has cooled and settled in the\u00a0refrigerator\u00a0overnight. It also makes it easier to cut.\nIf you can't wait that long, at least wait an hour so that you don't burn your mouth on the molten lava apples inside. 
Trust me, not being able to taste anything for a week isn't worth the one bite.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 3, 2]\nD: [1, 2, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_9_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_9_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_9_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_9_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 0, 3]\nD: [0, 1, 3, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Bacon\nPotatoes\nSea Salt - Optional\nKnife/Cutting Board\nFrying Pan\nPaper Towels\nTongs/Other Standard Kitchen Supplies. Cook up the bacon. \u00a0You won't need it, so discard it (by which, of course, I mean eat it!) \u00a0The idea here is to keep as much grease in the pan as possible.. You're looking for a slice that's a bit thicker than a sheet of paper. \u00a0Not the thinnest slice you can make, but close to it. \u00a0Peel them first if that's your preference, and by all means use your mandoline slicer if you have one. \u00a0One pound of bacon made more than enough grease to fry 3 potatoes.. Place one layer of potatoes in the pan and fry them. \u00a0You want them lightly browned. \u00a0The darker they got the more likely they were to go from tasting bacony to tasting burned.. Drain them on a paper towel, and sprinkle lightly with sea salt if you would like. \u00a0Do this while they're fresh from the pan. \u00a0Keep frying and draining, pretty much until you decide to stop. 
\u00a0Serve them in a bowl, or individual small bowls if you're fancy like that!\nBonus flavor options:\nBacon Onion\n- before frying the potatoes fry up a diced onion in the grease then scoop it out - the onion infuses the bacon grease with flavor\nBacon Parmesan\n- as soon as you remove the chips from the bacon grease sprinkle them with grated parmesan (the dry kind from a round can)\nFlavored Bacon\n- different types of bacon come with different flavor profiles - applewood, hickory, etc. will all give you a different chip flavors\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 0, 3]\nD: [0, 1, 3, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_10_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_10_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_10_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_10_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [3, 1, 0, 2]\nD: [1, 2, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \n For the dough:\n\n\t\t3 tsp of sugar\n\t\t2 packs of yeast or 1 1/2 tbsp\n\t\t1 1/4c warm water\n\t\t3 1/4 cup of bread flour\n\t\t3 tsp of salt\n\t\t4 tbsp Olive oil\nFor the sauce:\nTwo 28oz\u00a0cans of crushed tomatoes\n\t\t2 tbsp of olive oil\n\t\t1 tsp of dried oregano\n\t\t2 tbsp of\u00a0Italian\u00a0seasoning\n\t\t1 tsp of salt\n\t\t1/4 cup of fresh grated parmasan cheese\n\t\t2 cloves of\u00a0roasted\u00a0garlic crushed\nFor the toppings: whatever you would like.\n . 
\n Good crust and sauce are what make the pizza first we will start with the dough since it needs time to proof.\n\t\tDissolve\u00a0the\u00a0sugar\u00a0in the warm water then add the yeast, let it sit for 5 minutes.\n\t\tWhile the yeast is becoming a\u00a0slurry\u00a0mix together your dry\u00a0ingredients\u00a0and form a well in the middle.\n\t\tNow pour in the yeast slurry and mix\u00a0thoroughly, after it is mixed kneed for 5 minutes.\n\t\tNow just form it into a ball set it back into the mixing bowl for proof between 90 and 120 minutes. Make sure to set it in a warm area.\nWhile it is\u00a0proofing\u00a0we will work on the\u00a0roasted\u00a0garlic and pizza sauce.\n . \n To make\u00a0roasted\u00a0garlic you only need two ingredients:\u00a0Olive\u00a0oil and bulbs of garlic.\n\t\tPre-heat the over to 400\u00baF\n\t\tWith a sharp knife slice through about the first 1/2 inch of the garlic\u00a0bulb.\n\t\tNext peel away most of the outer layers of the bulb and trim the outside cloves down about 1/2 an inch as well.\n\t\tNow place the clove in either a foil cupcake liner, a foil bowl, or even a cupcake pan.\n\t\tPour about 2 tbsp of olive oil over the bulb and cover it in foil.\n\t\tNow bake it for about 45\u00a0minutes,\u00a0after 30 minutes remove the foil and bake for another 10-15 minutes.\n\t\tRemove from the oven and let cool, after it has cooled crush the garlic with a fork.. For the sauce, just put all of the sauce ingredients in a large saucepan and let it simmer and boil down while the dough is proofing and you are building the pizza. Just make sure to stir the sauce occasionally so does not burn and stick to the bottom of the saucepan. You are going to want to have a thicker sauce because if the sauce is to watery, it could easily leak through the crust sides or the bottom and just go everywhere when the pizza is cut into.. \n Now that dough has proofed your going to want to remove it from the bowl and knead it for an additional minute. 
Now that dough has been\u00a0kneaded\u00a0again\u00a0you might have noticed the textures changed and its more stretchy.\n\n\t\tDivide the dough ball into two dough balls one little bigger than the other.\n\t\tTake the bigger dough ball and begin stretching and pulling and flattening it out so that it will fit into the 10 inch\u00a0spring form\u00a0pan. Make sure to bring it a little bit up over the sides of the pan so that we will be able to tuck the top crust under the sides of the bottom crust.\n\t\tNow just bake the\u00a0crust in the oven set to 415\u00baF for about 5 minutes. \u00a0\n\t\tWhile the crust is baking begin preparing the ingredients will go inside the pizza pie.\u00a0For mine I used Italian sausage, pepperoni, red onions, and peppers.\n\t\tOnce the crust is done remove from the oven and let cool for about 10 minutes.\n\t\t\u00a0Once it has cooled, begin layering your ingredients. The order I went it was: pepperoni, sausage, sauce, onions, pepperoni, sausage, peppers, sauce, cheese. \u00a0\n\t\tNow were ready to add the top layers of crust. Do the same as you did for the bottom layer but this time just drape it over the cheese and pinch together the two layers of crust.\n\t\tNow just tuck the top player into the sides of the Spring form pan so every thing is sealed in.. 
\n\t\tTo bake the pizza put it in the preheated oven of 415\u00b0F for 45 minutes with foil over the top of the spring form pan.\n\t\tAfter 45 minutes remove foil and bake for an additional 15 minutes.\n\t\tOnce this is done\u00a0baking\u00a0remove from the oven and let sit for 10 minutes.\u00a0\nNow you can remove the outermost layer of the spring form pan, and now it is ready to cut and enjoy.\n \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [3, 1, 0, 2]\nD: [1, 2, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_11_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_11_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_11_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_11_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [0, 1, 3, 2]\nD: [3, 1, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. A Basic Set of Canning Tools with:Wide-Mouth Canning FunnelJar LifterMagnetic Lid LifterBubble Remover/Headspace Tool.* I say this is optional, because I canned for years prior to buying a basic set of canning tools. They are helpful, but not 100% required. . I have used the same brine pretty much since I started making relish. The base is very very versatile and can be flavored with a number of spices or herbs to your own personal taste. In fact, for several years, it was all I used for my pickles. It's based on a 1 cup batch, so it's easy to adjust depending on how much you intend to make. For this, I made a 4 cup batch (which I split between relish and pickled tomatoes, 2 cups each). 
Ingredients: 1 Cup of White Vinegar 1/2 Tablespoon of Granulated Sugar 1/2 Tablespoon of Brown Sugar 1/2 Teaspoon of Pickling Salt The last few years, I've also added the following Spices (based on a 1 cup batch): 1/8 Teaspoon Dill Seed 1/8 Teaspoon Mustard Seed 1/4 Teaspoon Black Pepper Corn 1/8 Teaspoon Celery Seed 1/4 Teaspoon Chopped Garlic Once everything's measured out, combine all of the above ingredients in a medium sized pot. Bring to a rolling boil, and reduce the heat to medium. Strain the brine through a fine mesh strainer and return the liquid to heat. Now the brine is ready for pickling. * You can wrap your spice mix in cheesecloth to avoid having to strain your brine or you can leave it loose. I do both, depending on the batch.. My relish recipe is really very simple and consist of only hot peppers that I have on hand, nothing else. This years batch has Jalapeno's, Serrano's, Thai Chili's, Red Habanero's, and Cayenne (I'm still waiting to see if the jolokias are going to fruit).Take approximately 2-3 cups of your favorite peppers (seeds and all) and in a food processor, pulse until you have a nice evenly chunky, peppery mix. * You can also finely chop the peppers if you don't have a food processor handy.. Hot Water Bath Canning vs Refrigerator Pickling is a tough decisions and should be based on your goals. Either of these methods will render you with a fantastic batch of \"Hellish\" relish. The biggest difference comes down to how much you are making and storage. If you are planning on long term storage, you can follow the directions here, \"Hot Water Bath\" Canning - The Basics. It's a very straight forward method of canning and allows you to store the relish for 1-2 years. It's is a bit more time consuming and does require a large canning pot, but I find it to be well worth it, especially for large batches. 
If you are going with the Refrigerator Pickle method, simply wash the lids and jars thoroughly with hot soapy water, dry, and set them aside until needed. . Now that the peppers are processed (or chopped) and the jars/lids are prepped, it's time to fill them.Using a Wide-Mouth Canning Funnel, spoon the hot pepper mixture into each of the jars, leaving at least a 1/2\" of room at the top. Ladle in the pickling brine, being sure to leave at least a 1/2\" of headspace. Insert the Bubble Remover/Headspace Tool and gently move it around the inside of the jar, to release any bubbles.Finally, wipe the rim of each jar, to make sure they are clean prior to sealing the jars. . Apply the top to the jar and lastly, apply the ring and tighten to seal.If you are going to go with the Refrigerator Pickle Method, You're Done!!!! Allow the jars to cool on the counter and then store in the refrigerator for 1-3 weeks.. If you've decided to go with the \"Hot Water Bath\" method to seal your \"Hellish\" Relish, using the \"Jar Lifters\" place the jars back into in the hot water bath and put the lid back on the canning pot. Process for 15 minutes, then remove and allow to cool on the counter over night (12-24 hours).When you hear the lids start to pop, you'll know you have a solid seal!Store in a cool, dry place until you're ready to taste test and devour!I hope you've enjoyed this Instructable, on how I make \"Hellish\" hot pepper relish!!! 
Happy Pickling!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [0, 1, 3, 2]\nD: [3, 1, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_12_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_12_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_12_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_12_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [0, 2, 1, 3]\nD: [0, 3, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Bake your angelfood cake in a loaf pan for easy slicing. Line the pan with parchment for ease of release after baking and cooling. You can use a cake mix or bake from scratch using any recipe. Baking from scratch is quick and easy if you use powdered eggs or liquid egg whites from the store. No need to find another recipe for all those yolks. (I've gone and made 1 Litre of lemon curd in the past, it's a bit much!). When your cake has cooled, Peel off parchment paper. it can now be sliced into even servings. (I like to serve 2 slices per person). Use an angelfood cutter, forks or serrated knife to gently cut the cake.. While your cake is cooling, get the toppings you want to use.I used:70% dark chocolate,Toasted slivered almonds,Fresh raspberries.You can use any variety of cake, chocolate, nuts and berries. You could even add soft cheeses like cream cheese or mascarpone mixed with some vanilla sugar and lemon zest to create something else grand.. Toast your cake slices in a dry griddle or frying pan. Use a Medium Low temperature. The sugars in the cake will burn easily on higher heats. . When the underside starts to brown, it is time to turn the cakes over. 
Put your chocolate on the warmed cake. Cover with a lid to speed the melting, before the other side burns. This should only take a minute. . Transfer the cakes to a plate. Use a knife, or the back of a spoon to spread the now melted chocolate.Top with the chopped nuts and fruit.Serve and enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [0, 2, 1, 3]\nD: [0, 3, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_13_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_13_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_13_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_13_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [0, 1, 2, 3]\nD: [2, 0, 1, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. The first step is the hardest part for me, but the easiest for all of my friends.You should take your parsley and wash it. Than, gently wipe extra water from parsley, and cut it.Take a middle sized bowl and mix the butter and parsley in it.Wrap the butter in a plastic food wrap and put into the freezer for 20-30 minutes.. This step is simple and easy.Take the chicken breast and \"open\" it with the sharp knife the way you will get a rectangular form of chicken.Gently beat your chicken with a food hammer from one side.Take the butter our of the refrigerator and cut it in pieces.Then, put the butter on the chicken breasts.Wrap the breast, so that the butter is inside.Now you are almost done.. Slightly beat eggs with mixer eggs and add salt and pepper.Dip the chicken in the egg mixture.Roll the breasts in the breadcrumbs. Repeat this process twice.. 
In a deep frying pan or saucepan, heat the oil and fry the breasts. Turn it around occasionally, until golden brown colour would appear from all of the sides.The oil should cover the breasts at least to the half.Your breasts are ready! I recommend to serve the breasts with fried or mashed potatoes. I hope you and your family will enjoy this dish. Bon Appetit!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [0, 1, 2, 3]\nD: [2, 0, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_14_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_14_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_14_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_14_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 3, 0]\nD: [2, 0, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \nIngredients:\n\n\t\t1 14 oz. can OR 2 cups beans of your choice (I'm using chickpeas & black beans)\n\t\t1 medium onion, cut into large chunks\n\t\t1/2 cup rolled oats (not instant)\n\t\t1 tbsp spice of your choice (I'm using chili powder in the black beans, cumin in the chickpeas)\n\t\tsalt and pepper to taste\n\t\t1 egg\n\t\toil for cooking (olive or canola)\nHardware:\n\n\t\tlarge pan for frying\n\t\tfood processor\n\t\tspatulas, forks, etc.\n\t\tcutting boards. \n(Get out your food processor - if you have one that's 4+ cups, feel free to dump everything in. If you're working with a little tiny one like me, first mix everything together in a bowl and do it in two batches.)\nOpen your can of beans and drain it - reserving the liquid, just in case. (I've not had to use it, but who knows!) 
Cut your onion into chunks.\nCombine the beans, 1/2 cup of oats, spices, the egg, onion, salt, and pepper in your bowl and mix together.\nThe first picture is chickpeas with cumin, the second picture is black beans with chili powder.. \nOnce you get a reasonable amount of the mixture into your food processor, you'll start to pulse. Pulse a few times, take the top off, scrape down the sides, repeat.\nYou only want to do this until things start to break down - you don't want things to get pureed. It still needs to be slightly chunky and have good body. :)\nAlso - don't add any liquid at all if you can help it. The mixture is wet enough as-is and should mix up on its own... but if, for some odd reason, your food processor starts smoking and screaming, add a tablespoon or so of liquid.\nThe last picture shows you what you should end up with - a good, firm, chunky consistency.\nWhat do do if it gets too thin, or you got too excited while pulsing:\nPut a tablespoon or two of extra oats in your second batch if you have a small food processor, or just empty everything but a small amount of the mixture and add in the oats.. Wet your hands and shake off any excess.\nWet hands are especially important for this bit - otherwise you just get gloppy bean hands, and no one wants that.\nI had the best luck making four patties out of the mixture, but you can make up to six. Divide the mixture into equal parts and roll into balls and then flatten carefully into patties.\nI found it was easiest to do this on a big flexible cutting board that I could carry around with me - I'd just make them into balls and then flatten them while they were on the board.\nDon't make them too thin or they become very hard to pick back up and they'll start to break. And don't press too hard into the surface you're working on either, or they'll stick like crazy!. \nNow the best part!\nHeat a pan over medium heat with your oil of choice. 
Once the oil is nice and hot, coax the patties off your work station (keep in mind you can reshape them a little in the pan if you need to) and into the pan. Don't overcrowd them, though - 2-3 patties is the maximum - if they're touching too much they will not get crispy, they'll just steam.\nCook for five minutes on one side, flip, and cook for an additional five minutes.\nMake sure to move them around a little during this time - stove tops and pans can be finicky, and you don't want one to get burnt while the other stays golden brown. ;)\nAnd if a patty breaks during flipping, no worries, just craft yourself a falafel-esque bite and consider that your tip for cooking.. \nTop these as you would any other burger - I suspect almost anything would be good on them!\nThe black bean ones were my favorite, but the chickpea ones are also quite good. I can't wait to try additional combinations and I'm looking forward to trying more veggies and fresh herbs in there!\nThe original recipe came from one of the kings of cooking, Mark Bittman, and you can check out more ideas on his bean burger recipe page - he gives a lot of great substitution ideas!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 3, 0]\nD: [2, 0, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_15_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_15_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_15_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_15_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [3, 0, 2, 1]\nD: [0, 2, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Materials:\nWhite Fondant\nPink or red food coloring\nToothpick\nSmall Circle Cookie Cutter\nFondant work mat\nRolling pin\nGloves (optional)\nSorry for the blurry picture!. Dip the toothpick in the food coloring and then rub the food coloring onto the white fondant. Put on gloves and start to work the fondant until it turns the desired color.. Take a little piece of the fondant that you just colored and roll it into a ball. Put the ball on the fondant work mat and start shaping so that it ends up looking like a hershey's kiss.. Put the slab of fondant on the Fondant Work Mat and roll it very flat. Then use the circle cookie cutter to cut as many circles from the fondant as you can. If you don't have a circle cookie cutter, you can also use a water bottle cap like I did.\u00a0. Fill a little cup with water and keep it with you the whole time you are putting together the roses. Take \u00a0a base and a circle and wet one of your fingers. Rub your wet finger on the bottom of the base and then stick the circle onto the wet part of the base. Keep repeating until the rose has as many petals as you would like it to have. 
To finish the roses, gently twist the very bottom and then use scissors to cut it off.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [3, 0, 2, 1]\nD: [0, 2, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_16_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_16_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_16_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_16_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 1, 0]\nD: [2, 0, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \nI make two batches of granola every Sunday. One I make with the nuts and one without.\n12 cups of\u00a0 quick oats\n1 cup of Olive oil (or your favorite light oil, canola, grapeseed)\n1 cup of Maple syrup if you don't like the flavor of the maple syrup I have also made it with light corn syrup\n1 cup of packed dark brown sugar. If you prefer the light or golden feel free to substitute\n3 cups of shredded Coconut- again this is optional, if your allergic or just don't like the consistency don't put it in.\n3 cups of sliced almonds- I just buy the whole ones and chop mine up these are optional\n1 cup of dried fruit-again optional. In the one I make without nuts I also do not add the dried fruit. The kids add fresh strawberries, bananas to their own bowl.\n1 pan large enough to hold everything. I use my roasting pan.\n3 cups of chopped cashews- again optional\nmeasuring cup\nknife for chopping nuts. Pre-heat the oven to 250 degrees Fahrenheit.\nNow is when you will chop your nuts up if you bought them whole.\nNo need to chop them too small. 
I like the way they clump together while they are baking. No need for them to all be the same size either.. Ok, now your going to add all the ingredients except for the oil, syrup and fruit. Mix well.\nNow add your oil , mix.\nThen add the syrup. mix well.. \nPlace your pan into the oven.\u00a0 You are going to stir the granola every fifteen minutes for one hour and fifteen minutes. This way it gets a nice even color.\nLet cool. This is when you add your dried fruit , chocolate chips if you want.\nWe eat it over soy yogurt, with soy milk, also if you heat up your milk then pour it over the granola yummy!!\nI hope you have enjoyed my first instruct-able.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 1, 0]\nD: [2, 0, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_17_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_17_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_17_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_17_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [2, 1, 3, 0]\nD: [3, 2, 1, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. So the general plan is to suspend your brewing pot/kettle in a big container and surround it with expanding foam. I found the main challenge to be getting a suitable container. For my 15 litre stock pot I found a big plant pot that would do the job (in the UK you can get one of these from B&Q or Wilkinsons, for about \u00a37). 
This allowed about three inches dead space around the whole pot, except at the bottom edge of the pot where the taper reduced this to two inches or so.Requirements for the container You're looking for something a few inches (say ~3\") larger than your pot on every 'side' (including underneath) The smaller the space, the less expanding foam you will need The larger the space, the better the insulation, but the returns diminish since mash times are only an hour or so You could make your own container, but be careful about materials - cardboard might go soggy Something like a garden dustbin might be a good choice for large pots/kettles So, here's everything you'll need Your brewing pot/kettle (duh) A large container as discussed above A can or two of expanding foam (one 750 ml can was enough for me - I got it from Screwfix)* Something to raise the pot off the bottom of the container (see next step) Aluminium foil to wrap around the pot * apparently you should steer clear of 'easy cleanup' or 'water soluble' varieties of this foam because they're too weak - thanks sharpstick for the tip. We want insulation under the pot as well as around it, so it needs to be lifted up off the bottom of the big container. This is how I did it: just a piece of plywood with three nylon screws. Advantages of doing it like this: Leaves a nice air gap under the pot for filling with foam You can easily adjust the height of the screws if the pot doesn't sit level Three-point contact means the pot won't rock (kinematic mount) Nylon screws rather than metal to avoid scratching the pot Whatever you use, it doesn't need to be fixed to the bottom of the big container because the foam will bond everything together. You might want to stick it down with something light duty though, to avoid it shifting for the time being.. VERY IMPORTANT! This is what makes your insulation a removable sleeve rather than a permanent addition to your brew pot! 
The idea here is to wrap the pot in foil so that, once the foam has set, the pot can be removed. Hence you can mash inside the insulation, then pull out the pot and get it straight on the heat for boiling. This is quite an easy task if you take your time and do things in a sensible order: Start with the part of the pot below the handles, where's it's a constant diameter, and work towards the base from there. The foil doesn't have to be pulled tight, but you don't want it baggy either. Try to not crinkle the foil too much. Secure it where necessary with some tape (I used electrical tape). Gently fold the foil around the base of the pot and tape it again. The foam won't want to invade between tight folds of foil, so there's no need to make all 'airtight'. Continue the foil wrap past the top of the pot and fold it inside. Carefully scrunch around the handles, avoiding any tears. If the foil tears then patch it up with tape. Tape over any edges where one sheet of foil ends and another begins. The thing to remember is that this doesn't have to be pretty, but the foil does need to cover all of the outside of the pot. Any gaps and the foam will make direct contact with your pot and stick to it, and this will only make your life harder when you eventually try to get the pot out.. Adding the foam has to be done in layers, so you'll need to allow enough time. My experience was that each layer added three or four inches of height to the overall level of foam, with at least five minutes for each layer to finish most of its expansion. It took me about half an hour to add all the foam.Lessons learned You'll see in the photos that initially I thought a bag of rice would be heavy enough to hold the pot in place. WRONG! In the end I threw in as many heavy things as I could find, so seek out some ballast before you start. The base layer of foam can really push up! Also it's easy to miss just how much the foam expands. 
If you watch it it barely seems to move, but in reality it will puff up to four or five times its initial volume over five or 10 minutes. Observe carefully and you'll get a feel for the expansion.Protection WEAR GLOVES AND SCRAPPY CLOTHES. If the foam dries on your clothes it will never come out. Get it on your skin and you'll end up removing it with sandpaper, not soap :|Preparation Gloves as already mentioned. Plastic sheet on the floor. The foam will drip out the end of the can between uses, and it's best to just let this fall onto your plastic sheet and let it dry there. Wiping this stuff is almost always a bad idea. The foam sets in the presence of moisture, so I slightly wet the sides of my pot and container to help it along. Probably not essential though.Procedure Ok, here goes: First the base layer needs to be added, so remove the pot from the container (not the spacer from step 2 though) and squirt an even layer of foam across the entire base of the container. Avoid the temptation to fill it all - you only need a layer around a quarter of the height of the gap and magic will do the rest. Replace the pot, fill it with as much mass as you can find*\u00a0and ensure that the pot is centred in the container. We don't want to move it from this point onwards if we can help it. Wait for the base foam layer to expand and creep around the edge of the pot. Only about five minutes in my case. Add another layer of foam directly on top of the last layer, working evenly around the pot. Aim for an initial layer height of between half an inch and an inch - it will expand four or five times this. Wait again for expansion to slow down, but not so long that it starts to harden. Five to 10 minutes again, perhaps. Repeat steps 4 and 5 until your latest (unexpanded) layer sits just below the rim of the pot. Now relax and have a homebrew! The foam will take longer to completely set than the can advises, because it's such a deep gap. Leave it at least overnight to fully harden. 
*\u00a0you could fill the pot with water to weigh it down - thanks woolleyy for that idea. The beauty of this foam is how easy it is to shape once it's dry. Using a sharp knife, cut around the rim of the pot so the foam is flush. Also cut or tear the protruding foil and remove the excess to expose your pot.Something unexpected At this point I expected the pot to simply lift out of the foam-filled container, but not so. Actually there was barely any adhesion of the foam to the big plant pot, so I actually removed the brew pot + insulation as one! See photo. In fact, the fit was so snug that it took a little while to get the pot out of its new jacket - I now realise it's because there was no way for air to enter the cavity beneath the pot. You could solve this by poking a hole trough the insulation at the bottom, but after I removed the pot the first time I found the foam had flexed enough to allow air to enter/exit down the sides of the pot. Something to think about though if you have this problem.UPDATE 2014-03-19: See new step 6 for, possibly, an even better solution to this.Voila! Anyway, you should end up with a perfectly-fitting, custom made cozy jacket for your pot during mashing. You could even make a foam lid, although I just use towels.Performance My temperature drops during a mash are now consistently less than 1 deg C per hour, and that's including removal of the lid to do pH checks etc. \u00a0Also I tend to do ~8 litre mashes, so a \"true\" BIAB (full volume) mash would hold even better. \u00a0TIP: If you're doing BIAB, then you're probably adding the grain to the water rather than the other way around. If this is the case then consider heating your mash water a few degrees higher than usual, then insert the pot in the insulation with the lid off and wait for it to drop to strike temp. This way you pre-heat the insulated jacket a little and avoid any initial losses when inserting the pot into a cold jacket.. 
This step is a later addition to the instructable, because it suddenly hit me that you can save yourself the hassle of lifting (especially big pots) if you cut the finished jacket in two. The idea is simple: slice the set foam in half, then you can bring the halves together around the pot and secure them with a strap, tape etc. Lifting isn't eliminated entirely, since you'll probably be lifting the pot off a burner, but life is certainly made much easier, especially for removal after the mash has finished.I recently did this for a 32 litre pot with great results (see photo).Spigotry If your pot has a spigot (tap) then there's hope for you yet... assuming you can remove the spigot for the 'moulding' phase when the foam is setting, then you can cut the foam in two, as above, but also take a little extra off so there's a gap for the spigot to poke through. This foam is so easy to shape that I can't see this being a problem. It could even work for some electric boilers.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [2, 1, 3, 0]\nD: [3, 2, 1, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_18_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_18_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_18_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_18_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 1, 2, 3]\nC: [0, 1, 3, 2]\nD: [3, 0, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. The ingredients in this wonderful and OFFICIAL Peet's recipe are as follows:1/4 lb. of Hibiscus C tea12 oz. of boiling water12 oz. lemonade1/2 cup of white sugar4 oz. 
cold waterIce cubes to coolY'all will also need a tea-kettle, an iced-tea pitcher, and a tea pot of some sort. Boil 12 oz. of water, DO NOT REMOVE KETTLE UNTIL THE WHISTLE BLOWS.. Pour the boiling water over the Hibiscus C tea into a teapot (or other heat-safe container, in our case we used a coffee presspot) and let steep for 5 minutes. (If you prefer a stronger tea taste, feel free to let it steep a bit longer). After the tea has steeped for 5 minutes or so, use a strainer to separate the hot liquid from the loose tea into an iced-tea pitcher, and immediately afterward add the 1/2 cup of sugar. This is critical to do directly after the tea has steeped so the sugar can dissolve in the hot liquid. Gently stir to ensure that all sugar is dissolved.. ... pour lemonade into the mix y'all!After the sugar is dissolved into the concentrated tea, pour 12 oz. of cold lemonade into the pitch.Continue to stir the mixture. This step is simple y'all, while stirring, pour 4 oz. of cold fresh water into the pitcher. yep, that's all for this step.. So what would an iced tea cooler be without the ice, right? Once the mixture is completely stirred together, add a few handfuls of ice cubes to chill the drink. If you really want to get festive, you can use fun ice cube shapes...we used puzzle and triangle ice-cube molds. Special ice shapes are the perfect mundane detail to dazzle your friends and show up Martha!!!. Add some more flare to this fabulous drink by pouring it into your favorite cocktail glass and adding a colorful garnish like a slice of lime or a lemon twist.Your friends and dog will love it!!. This drink is best served chilled on a hot day. 
Add some banana loungers, Girl Talk's \"Feed the Animals\" album and a friend or two and you have an instant party!!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 2, 3]\nC: [0, 1, 3, 2]\nD: [3, 0, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_19_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_19_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_19_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_19_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [0, 1, 3, 2]\nD: [1, 2, 3, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. First off is the materials: - Heat Lamp Bulb (Any wattage will do, I had different types for different heat levels) - Hotel Style Plate Cover- Heat Lamp - Pigtail Light Bulb Socket- Outlet socket wire adapter Optional: - Extension Cord- In-Line Dimmer . The first place to start is cutting the hole. Since the bulb needs to be exposed to the food, you will need a hole that is wider than the diameter of the bulb. I first used a Dremel to get the cut started, continued with some pruning scissors (not recommended), and finished with an angle grinder. When I had gotten the hole cut out, I went around the edge with the angle grinder to smooth out the jagged edges. Since this hole will not be seen when assembled, it does not have to be perfect. Make sure to be safe and wear the appropriate gear when cutting the whole as the edges are sharp and the tools throw sparks. . Next, you take apart the heat lamp. If you chose the parts correctly, the pigtail socket should fit into the hole of the previous socket. 
For your lamp this may not be necessary, but for my lamp I wanted to be able to use bulbs of up to 300 watts, which no pre made lamp that I could find had. . This step is fairly self explanatory, but you need to put the wires from the pig tail socket into the outlet to wire converter. The tip for wiring the right way this is that white is right. . At this point, the hole is made and everything is ready to be put together. I decided to glue the shade to the cover before I glued the pig tail in order to have less in the way, but either way works. I put a small line of glue all around the base of the lamp shade and lined it up with the cover and made sure to make a good seal. Then, I did the same with the pigtail socket and the top of the shade. It is important that you make sure that the shade is well in place before use to be sure of no fire hazard if parts were to fall apart. At this point, you are almost done! . I chose to add an extension chord and a dimmer that both plug into the wall. This helps to adjust the heat that the light can be set on and increase the range it can go from the socket. . By now, you should just have to screw in the bulb and be off to the races! I hope that you enjoyed this simple and helpful guide on how to create your very own and to keep a warm meal for anyone who happens to miss it. Thanks! 
\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [0, 1, 3, 2]\nD: [1, 2, 3, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_20_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_20_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_20_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_20_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 1, 3, 2]\nD: [1, 2, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. I started with around 1.5kg of silverside beef. Using a very sharp knife I cut the beef along the grain. When cutting down for normal steaks usually you are encouraged to cut across the grain. This stops the beef from being chewy, however that is sort of the effect we want from the biltong. I cut them into slices of around 1cm thick. The thicker you make the longer it will take to dry, and the thinner obviously it will be quicker. . I transferred the beef into a plastic container. Then I poured some red wine vinegar into the container and worked the beef into it all to make sure there was vinegar on all the beef. I let this marinate for around half an hour. This helps with the preservation, I made sure not to leave it in too long or else it will start to effect the final flavour too much.. This section you can mix and match as much as you like. The only one you really need to include is the salt. The salt is what kickstarts the drying and preservation. I started with some sea salt flakes, I crushed them up with my hands. Next I used some black peppercorns and crushed them in my pestle and mortar. 
I left them fairly course, you can of obviously crush them as fine as you like.Then I added a small handful of chilli flakes. This will add a kick to the biltong. Finally I added some crushed coriander seeds. This is the what gives it that real traditional South African biltong flavour.. I then rubbed all the salt and spices into the beef and made sure it was fully covered. Using small metal hooks I hung each bit of beef into my Biltong Box. If you don't have any hooks you can use paper clips to bend into a hook shape. It needs to dry for around 4-5 days. The longer you leave it, the drier it will get. I like mine a little wet so it is a little bit rare in the middle. . Once you're ready just take it down and slice the Biltong as thin or thick as you like it.It should last a good few weeks if you keep it covered in a sealable bag or tub. I hope you enjoyed this instructables. And remember if you make it make your own, share some photos here! \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 1, 3, 2]\nD: [1, 2, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_21_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_21_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_21_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_21_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [3, 2, 1, 0]\nD: [2, 1, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \n\tDirections\n\t\t\tPreheat oven to 350 degrees. Sift flour, cinnamon, baking soda, baking powder, and 1/4 teaspoon salt into a bowl. Beat 1 stick butter and the sugars until pale and fluffy. 
Beat in egg and vanilla.\u00a0 (I sometime skip this step and add all these ingredients at the same time... the result is absolutely the same). \n\t2.\u00a0Mix in zucchini, oats, and walnuts. Refrigerate until firm, about 1 hour.. \n\t3. Depending on the size you want your cookies, Drop spoonfuls of dough onto parchment-lined baking sheets, spacing about 2 inches apart. Bake until edges are golden, about 17 minutes. Let cool on a wire rack.. \n\t4. Beat together remaining 1 1/2 stick butter, crisco, vanilla, milk and confectioners' sugar until smooth. (Buy cream cheese frosting in the tub for an easy alternative) Spread 1 heaping tablespoon filling onto the flat side of 1 cookie, and sandwich with another cookie. Repeat with remaining filling and cookies.. Scrumptious Veggie Cookies!!!!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [3, 2, 1, 0]\nD: [2, 1, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_22_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_22_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_22_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_22_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [0, 2, 3, 1]\nD: [3, 0, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. To make this delicous Ajam Ketjap you will need:500 grams of chicken breast (Ajam)a splash of oil 2 large onions 6 cloves garlic 1 small can of tomato puree 1 theespoon of salt 6 tablespoons of Sweet soy sauce (Ketjap manis)You also need a cutting board and knife a stirrer and a large pan.. 
Cut the onion into pieces, put a little bit of oil in your pan and add the sliced onion and tomato puree together in the pan and fry until the onions are translucent. (it is very importent to put them in together, for the taste of the end product). Whille you fry the unions an tomato puree, Cut the chicken breasts in dices, when the unions are translucent add the chicken and fry it until the chicken is brown.. crush the garlic and put it in the pan stir and fry for 1 or 2 minutes. (Some times people say that 6 cloves is to much and there breath will be terible afterwards. But you do not have to be afraid this wont hapen.). Now add the Theespoon of salt and 6 tablesppoons of Sweet soy sauce also called Ketjap manis, stir it and add about 1 cup of water ( the chicken has to be covered with the sauce you made.. Put the lid on youre pan and let it simmer for about 15 minutes occasionaly stir it, this is a good time to get yourself a nice cup of coffee.. After about 15 minutes get the lid off of your pan and let it simer for another 5 to 10 minutes depending on the amount of watehr that was added in step 5, this has to be done for 2 very important reasons, first of all the excess liquid wil vaporize and second every body in the house will come towards the kitchen wondering what it is that smells so good.You can eat this with bread or rice, both is delicious.Enjoy your meal!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [0, 2, 3, 1]\nD: [3, 0, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_23_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_23_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_23_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_23_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [3, 1, 0, 
2]\nD: [3, 1, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For pie cones1 cup All purpose flour 1/2 cup Cold milk ( add little by little) 1/4 cup vegetable oil SaltFor Sweet filling16 Oz extra firm Tofu 1 tsp Cocoa powder 1/2 cup Melted chocolate or Nutella 1/3 cup Maple syrup 1/2 tsp Vanilla essenceFor Savory Filling1 cup minced turkey meat 1 medium size onion, finely chopped 4 garlic cloves, finely chopped 1 cup mixed vegetables 4 sweet mini peppers 1 tsp salt 1/2 tsp fresh ground pepper 1/4 tsp dried oregano 1 egg 1 tbsp oil cilantro for garnishingSupplies for making moldParchment paper Aluminim foil sheet. Take A4 sheet size of both parchment paper and aluminium foil sheet. Make a fold in the middle of parchment paper and crush it on one end.Take aluminium foil sheet and wrap it over the parchment paper to attain the cone shape.. Take a mixing bowl, add in flour and salt, whisk until all combine. Add oil and mix it all well.Add a tablespoon of milk at a time, mix well using hand.Keep adding milk and knead until the dough becomes right consistency.. Preheat oven to 400 F Divide the dough into 3 equal portion. Take one portion, dust the floor with some flour and roll the dough using rolling pin. Roll it according to the size of the mold u have made. Once you rolled it, keep the cone mold on the dough and start rolling the dough over the mold. seal the edges using water. One cone is ready now. Repeat the same process for rest of the dough. Once you have made all the 3 cones, it's time for baking. For better result, pie cones should stand upright.Keep the cones carefully in the baking dish and bake it for about 25 to 30 minutes. Take it out from the oven and let it cool down completely. Remove the cones carefully from mold.. Wash tofu in fresh running water and cut into small cubes. 
Take all the ingredients in blender and blend it until you get smooth paste consistency. Transfer it to air tight container and refrigerate it for an hour.. Heat oil over medium flame, once oil heated, add garlic and saute for few seconds. Add onions, saute until it becomes translucent. Add in vegetables, season them with salt and pepper, cook for 3 minutes. Add minced turkey meat & cilantro, mix all well and cook uncovered and stir often until the meat is cooked well. Remove it from flame and let it cool down for sometime.. Take all your ingredients Pie cones Sweet filling Savory fillingFor sweet pie cone: Take a cone, scoop some chocolate tofu mousse into the cone and top with fresh strawberry. For savory pie cone: Take a cone, add some turkey meat fillling into the cone and top with shredder cheese.Serving suggestion: Refrigerate the sweet pie cone for an hour and serve chill. For savory pie, preheat oven to 200C, bake for 5 minutes and serve hot.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [3, 1, 0, 2]\nD: [3, 1, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_24_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_24_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_24_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_24_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 0, 1, 2]\nD: [1, 3, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
I shall be making several sandwich suggestions in this Instructable for you to try yourselves.Seek out and buy some of many types and makes, to discover the ones that you like the best.The point of this Instructable is to inspire you to try the thins and to maybe try a few sandwich suggestions that you have not tried before. Eat them on a camping trip, in the summer house or just lay outdoors in the sun. They are delicious.I hope that you already know how to make a sandwich, but for completeness I have gone through the steps with a few personal notes that may be of interest.Spread a thin layer of your favourite spread such as butter or an olive oil based one, as used here.This stage obviously applies to all sandwich ideas and so will not be repeated in this 'ible.. I love the way apple and cheese go together to create great fresh summer sandwiches.Of course you can try all sorts of combinations of different cheeses and applesYou may also like to try mixing cheeses as I have done in this example.. I like the apple sliced and chunky but some like it grated or chopped.Peel and slice the apple then layer it onto the thins.. Grate your cheese or cheeses and spread over the apple.Mix cheeses for extra flavour (or flavor).Here I used English Cheddar and Red Leicester.. You may leave them just as they are for a good sized handful, or cut them.In a recent survey of the Gregarious family it was agreed that diagonal slices are more 'posh' and taste best.. This is a mouth watering delight.Fresh prawns, fresh shredded lettuce and a light spread of your favourite dressing (e.g.Thousand Island, makes a gorgeous sandwich.. Rough chop the boiled eggs in a bowl. Then mix in some fresh cress. Spread liberally over the thins.Perhaps the cheapest and simplest sandwich shown in this 'ible but surely one of the tastiest and satisfying.. 
Thins are good toasted but we have found that it is best to lightly cover with spread first before popping them into the toaster.We find it best to set the toaster for just a short period, enough to warm and melt the spread.Do experiment with different settings to perfect your ideal toasted thin.. Get a good fresh Hummus and spread it over one thin.(You may choose to not have another spread but I prefer it that way)Then place a nice slice of corned beef on the other slice and put them together.(Note how the beef slice neatly fits onto the thin)These may also be combined very nicely with the warmed thin toast as described above.. Good back bacon (local terms may vary) grilled (broiled) until deep red and trimmed to remove all fat.. Skinned or with skins, grilled or broiled to a warm glow.Neatly arranged to complement the lovely bacon.Brought together in possibly one of the worlds most delicious sandwiches.(Especially on thins). Ok... I admit it...I could go on forever detailing all the yummy treats that can be made with sandwich thins.Before I stop, here are just a few more serving suggestions:The good old 'Chip Butty' as we call it here in the UK.Lovely thick gammon steaks.Simple thin sliced ham (maybe with tomato).I was going to include a nice steak too...but I ate it before I could take a picture.Mmmmm, I do hope that you have been inspired by these suggestions, which have certainly made me hungry just typing about them.If you make any please do upload a few pictures.EnjoyEnjoyEnjoy\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 0, 1, 2]\nD: [1, 3, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_25_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_25_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_25_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_25_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", 
"visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [0, 2, 1, 3]\nD: [1, 2, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need:\n-Cake Crumbles\n-Frosting\n-White Chocolate\n-Caramel Candies\n-Orange Food Dye *non water based*\n-Candy Decorator\nI used pumpkin cake and cream cheese frosting.. Although any flavor of cake and frosting works for cake balls, I decided to go with pumpkin cake.\nFor the cake, I mixed the following:\n-3 eggs\n-1 Cup sugar\n-1 teaspoon baking soda\n-3/4 Cup canned pumpkin\n-1/2 teaspoon cinnamon, ground\n-1/2 teaspoon nutmeg, ground\n-3/4 Cup flour\nThen I spread out the batter on a sheet pan and baked it at 350 for ~12 minutes.\u00a0\u00a0 Cooking time varies though, so be sure to check the cake while it is cooking.\u00a0\u00a0\nAfter the cake is cooked and cooled, crumble it.\u00a0 If it is still pretty moist, break it into small pieces.. Cream Cheese frosting pairs nicely with pumpkin cake.\u00a0\u00a0 However, just like the cake, any flavor of frosting will work.\u00a0\u00a0\nI mixed together:\n-8 oz cream cheese\n-1 Cup powdered sugar\n-2 Tablespoons butter\n-1 teaspoon cake vodka\nMake sure to taste the frosting to make sure it is sweet enough.\u00a0\u00a0 If the cream cheese flavor is too strong, add in a bit more powdered sugar.\u00a0\u00a0\nAdd some orange food coloring to the frosting so the inside color of the pumpkins is also pumpkin-like.\u00a0\u00a0\nI tried out cake vodka because we were out of vanilla.\u00a0\u00a0 It added an excellent flavor layer and I will probably use it in future frostings too.. 
After the cake crumbles and frosting are ready, mix them together.\u00a0\u00a0 The consistency should be a bit heavy and sticky.\u00a0\u00a0 Try not to over mix the frosting with the cake, or the texture will be mushy.\u00a0\u00a0\nRefrigerate the batter until it is chilled, so it is easier to roll into balls.\nRoll the batter into balls.\u00a0\u00a0 Keep in mind they need to be big enough for jack-o-lantern decorations, yet small enough to still be easily eatable.\u00a0\u00a0\nIf the batter is too sticky, even after chilling it, add more cake.\u00a0\u00a0 If the batter is not sticking together, add more frosting.\u00a0\u00a0\nAfter the balls are rolled, freeze them for at least an hour.\u00a0\u00a0 They need to be nice and firm for dipping.. Cut your caramel squares into stem shapes for the pumpkins.\u00a0\u00a0 I got 6 stems out of 1 square of caramel.\u00a0\u00a0\nCut your chocolate into slivers for melting.\u00a0\u00a0\n-I used almond bark for these pumpkins.\u00a0\u00a0 You can also find chocolate / white chocolate chips, big blocks of bakers chocolate, or pre-colored candy melts.\u00a0\u00a0. Melt the chocolate over a double boiler.\u00a0\u00a0 This is important because if it melts directly on the heat of the stove, it can easily overheat.\u00a0\nAdd in your food coloring as the chocolate melts.\u00a0\u00a0 Do Not use a wooden spoon as shown in the picture- it can add moisture to the chocolate which breaks it.\u00a0\u00a0 Also, make sure your dye is not water based, or the chocolate will break.\u00a0\u00a0\nOnce the chocolate is melted and dye added, it should be a nice smooth texture.\u00a0\u00a0 Turn down / off the heat for dipping.\u00a0 . 
When the chocolate is ready, dip the cake balls one at a time.\u00a0\u00a0\u00a0 They should be completely coated in chocolate.\u00a0\u00a0 After coated, place them on wax paper to cool.\u00a0\u00a0\nI find it is easiest to use a tooth pick for dipping, and have a second toothpick ready to help detach the pumpkin ball.\u00a0\u00a0 In the hole left from the toothpick, add a caramel \"stem\" before the chocolate is completely cooled.\u00a0\u00a0\nIf the cake balls are turning too mushy in the chocolate, put them back in the freezer until completely firm.\u00a0\nIf it is too hard to dip the balls with a toothpick, a fork can also be used.. When the pumpkins have cooled completely, they are ready to decorate!\nIf any of them have cracked on the outside, they can be patched very carefully with any remaining chocolate.\u00a0\u00a0\nAdd any features that a jack-o-lantern might have.\nI used a candy-decorator that needed to be warmed up in water to be useable.\u00a0\u00a0 After I decorated my jack-o-lanterns the candy turned solid again.\u00a0\u00a0\nEnjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [0, 2, 1, 3]\nD: [1, 2, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_26_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_26_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_26_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_26_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [1, 0, 2, 3]\nD: [2, 1, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. I used this recipe from Food and Wine magazine, but I left out the almonds in order to make the recipe easier for a beginner. 
You can also add in a tablespoon of ground flax seeds for added health benefits, or swap the dried cranberries for dried cherries. Cranberry Almond Biscotti (recipe will make about 30 biscotti)1 3/4 cups all-purpose flour1 1/2 teaspoons baking powder1/4 teaspoon salt2 large eggs3/4 cup sugar3 tablespoons unsalted butter, melted and cooled1/4 teaspoon almond extract1 cup dried cranberriesoptional, 1 tbsp. ground flax seedsBeat the eggs and sugar together until thoroughly mixed and creamy yellow. Stir in the almond extract and butter. Add in the dry ingredients (flour, baking powder, salt, and optional ground flax seeds) and mix again. (I used a stand mixer, but you can do it by hand as well). Stir in the dried cranberries last. . Wet your hands, and shape the dough into two flat logs, on a parchment covered baking tray. The dough will be very sticky, so wetting your hands (and re-wetting!) is a crucial step in shaping the dough.Bake in a 350 degree oven, for about 25 minutes, until the top of the logs are golden. Remove and let cool. . Biscotti means twice baked, and that's what we do in this recipe!Once your logs have cooled, carefully slice them on the diagonal, using a serrated bread knife. Go slowly. Using a serrated knife is critical. . Lie the slices of cookie on their sides, back on the parchment paper lined cookie tray. Bake for about 10 minutes, at 350 degrees, until the cookies are crispy and golden on their sides. The goal is to end up with a nice dry and crunchy cookie, that stores well, and is great for dipping!Let cool and enjoy with a cup of coffee or tea! If you're adventurous, you can drizzle with some melted chocolate, but they're delicious without the embellishment! Like my instructable? Vote for me in the Baking Contest! And check out my blog: www.approachingfood.com. 
Thanks!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [1, 0, 2, 3]\nD: [2, 1, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_27_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_27_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_27_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_27_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [2, 3, 1, 0]\nD: [3, 1, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Materials1. All-Purpose Flour (from Costco)2. Baking Yeast (from Costco)3. Water4. Vegetable Oil (from Costco)5. Salt (from Walmart)6. Flower Pepper Powder (Sold in local Asian supermarket)7. Green Onion8. One Chopstick9. Rolling Pin10. Knife11. Cutting boardKitchen Appliances1. Bread Machine2. Steamer. a. Add two cups of all-purpose flour, one teaspoon of baking yeast, and three cups of water into the bread machine.b. The mixture is stirred in the bread machine for 30 minutes.c. Let the dough stay in the bread machine for 50 minutes after stirring.d. The whole internal container of the bread machine will be filled with the dough as the picture shows below.NOTE: MUST KEEP THE DOUGH IN THE BREAD MACHINE FOR ENOUGH TIME (50 MIN). Spread the dough on the cutting board with a rolling pin to form a 10 inch \u00d7 10 inch dough sheet.. Add 1 teaspoon of salt, 1 teaspoon of flower pepper powder, 2 teaspoon of vegetable oil, and prepared green onion chips onto the dough sheet homogeneously.. Roll the sheet as shown in the following picture.. Cut the roll into 16 pieces, and put one piece right on the other one.. 
Using a chopstick, press down at the middle of the overlapped two pieces. The edges of the balls will be opened up gradually.. a. Put the flower buns on the upper layer of a steamer, stay for 30 minutes.b. Add 10 cups of water into the lower layer of the steamer.c. Heat the steamer for 15 minutes after boiling of lower layer water.NOTE: THE WELL-COOKED FLOWER BUNS COULD BE STOCKED IN A FREEZER FOR 3 MONTHS. BEFORE EAT, THE FROZED FLOWER BUNS SHOULD BE RE-COOKED AS THE ABOVE METHOD.SAFETY ISSUE: DO NOT LET THE WATER ALL BOIL AWAY.. Turn off heat and let it stay for 5 minutes, and enjoy it.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [2, 3, 1, 0]\nD: [3, 1, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_28_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_28_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_28_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_28_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [1, 2, 3, 0]\nD: [3, 1, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Shopping list is as follows:\nSalmon, Fresh (1 - However many will fit in your BBQ)\nSugar, Brown (I use about a 1lb of Sugar per fillet)\nSalt, Kosher (About 2 Cups per fillet)\nWood - (Your choice on wood, I used Mesquite woodchips).\n. So there are many types of brines, and for my first foray, I wanted to keep it simple. A simple ratio of 2:1 Sugar to Salt. The intent here is to draw moisture out of the Salmon, and add a touch of flavor. I had to make more often, so make a large batch and you can save your leftovers; saves from \"stop and going\" with your steps as I had to.. 
With the brine mixture created, you need to sandwich your Salmon with brine. You can use any type of container, but steer away from metal, as it'll interact with the process, and mess with your Salmon. Be sure to firmly press the mixture into the Salmon, and toss uncovered into fridge for 3 - 5 hours. Both the before and after pictures are here.. Wash off the brine from the Salmon, and pat dry with paper towels. Put on a rack (may wish to use the smoking rack so you don't need to move it), and put it in to the fridge again for 1 - 3 hours. Essentially, you want the Salmon to become tacky to the touch (from the salt/sugar combo drying) so that smoke can attach to it.. This would be the time to prep your wood. Many different ways to go about it, I've decided to use chips in a bread pan. From past experiences, and watching my Father, soaking your chips allows for a longer smoke, rather than dry chips.. This is the time to get excited, smoking is almost ready to begin. Heat your BBQ up, and wait for the coals to go grey (or if you have propane, turn it as low as it'll go). Typically, when I want to do this, I'll have my charcoal on the right side (where the wood chips will be), and on the far left side ( no fire) my Salmon. If you can get you BBQ below 100 degrees Fahrenheit and maintain it for the duration, you will have excellently smoked salmon. . I smoked for about 4 hours, stocking the coals as needed, and replacing the wood chips every 45 - 60 minutes.\nOn my first journey I was a little overzealous, and smoked hot.... Because salmon is so delicate, I had excellently cooked salmon, not smoked salmon. After a day long process, it was a little disappointing, so take my advice Colder is Better.... \nEnjoy!!\nIf you think, 150 grams of Smoked Salmon is $8\nFor 3lbs of Salmon, it's $10, add some inexpensive ingredients and you have some excellently inexpensive Smoked Salmon!!! 
\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [1, 2, 3, 0]\nD: [3, 1, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_29_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_29_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_29_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_29_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [2, 3, 0, 1]\nD: [3, 0, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. INGREDIENTS - 150g roasted cashews,250g Nutella hazelnut chocolate spread,20 no.s chocolate wafers.METHOD - 1) Chop the cashews finely.2) Chop the wafers.3) Combine 3/4th of the cashews with the Nutella. Refrigerate.4) Combine the remaining cashews with the wafers.5) When the Nutella mixture is firm, scoop out small balls of it.6) Roll the balls in the cashew and wafer mixture.7) Freeze until firm.. 1) Wrap the chocolates in foil.2) Cut out petals, from red crepe paper; for the roses(6-7 petals for each rose).3) Glue the petals onto the wrapped chocolates in an overlapping manner(the petals slightly overlapping each other).. 1) Glue together pieces of sponge to form the base of the boat.2) Cover the upper and lower surfaces of the boat with brown coloured card-stock and the sides of the boat with yellow card-stock.3) Line the edges of the boat with golden glitter.4) To make the mast, cover a stick in golden glitter glue and leave it to dry.5) For the sails, cut out triangles from white cotton fabric. Glue golden ribbon onto the sides of the triangles.6) Attach the sails to the stick(mast). 
Insert the lower end of the stick(mast), into the base of the boat(sponge); by making a hole in the card-stock covering the upper surface of the boat.. 1) Cut out leaves from green crepe paper and stick them onto the upper surface of the boat.2) Using double-sided tape, stick the roses onto the boat.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [2, 3, 0, 1]\nD: [3, 0, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_30_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_30_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_30_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_30_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [1, 3, 0, 2]\nD: [2, 1, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. I decided to make two separate sodas. Insert 10 two different colored skittles into two separate cups.. In my instructable I used Sierra Mist for my lemon lime soda. You want to pour the soda into each cup.. I set a timer to 10 minutes and then went back over to the drink and took out the flavorless skittles.. Take out each skittle. In my case a few of the green and red skittles dissolved into the soda.. I grabbed two ice cubes from my freezer and dropped them into both skittle sodas.. 
Lastly enjoy the skittle soda of your choosing.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [1, 3, 0, 2]\nD: [2, 1, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_31_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_31_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_31_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_31_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [0, 3, 1, 2]\nD: [3, 1, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Before you start preparing your cookies, place your oven rack in the middle of the oven and preheat your oven to 375\u00b0FIn a bowl, place 1 cup of softened margarine or butter, as well as 1 cup of both granulated and brown sugar. Blend the ingredients until smooth, so there aren't streaks of margarine or butter.. The next step is to beat in 2 eggs and 1 tsp vanilla extract into the creamed butter and sugar. Keep mixing the ingredients until smooth.. Now, measure and mix your dry ingredients into your bowl. This is 2 cups all purpose flour, 1 1/2 tsp cinnamon, 1 tsp baking soda, and 1 tsp salt.*You may choose to measure these ingredients out into a separate bowl.*You may also choose to add the optional ingredients of nuts and raisins. For this batch, I chose to add sunflower seeds and raisins.Blend everything together until the mixture starts to look uniform.. For this step, I chose to line my baking sheets with parchment paper. Not only does it makes getting the cookies off of the pan easier, it also allows for easier clean up.Place approximately 10-12 1 1/2 inch balls of dough onto the baking sheets. 
Slightly flatten with a spoon before baking (picture for reference).. Once the cookies are in the oven, set a timer for 8 minutes. Depending on the size of cookies, they may need more than 8 minutes to cook. You'll want to take the cookies out before they look completely done, as they will continue to cook and harden after coming out of the oven. Don't forget to take the cookies off of the baking sheet, or you will end up with hard bottoms.. \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [0, 3, 1, 2]\nD: [3, 1, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_32_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_32_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_32_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_32_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [1, 2, 3, 0]\nD: [3, 0, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need :BaconOreoOil (fryer)Toothpick. Split a bacon in half.Roll the first part of the bacon around the oreo.Roll the second part on the oreo (on different ''axis''). To hold the bacon around the Oreo simply use toothpick. It will be useful when you will use the fryer.. Sorry for the lack of image in this step...So when you are about to serve it, fry the Oreo Bacon.For a better taste, fry at highest temperature for about 20 seconds.. 
The taste is simply surprising...Good luck, hope you will enjoy it!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [1, 2, 3, 0]\nD: [3, 0, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_33_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_33_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_33_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_33_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 1, 0, 2]\nD: [1, 3, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. (A)Dumplings-200g glutinous rice flour50-100ml waterFood coloring (optional)(B)Syrup-5 - 6 pandan leaves (tied in a knot)1 thumb size ginger (cut to thin slices)150g rock sugar. Add flour into a large mixing bowl.. The amount of flour & water is very adaptable. Add water gradually and knead the dough until soft, smooth and easily kneadable.If the dough are:Too wet: sticky to the fingers (Add a bit more flour).Too dry: crumbling (Add a bit more water).. Repeat above step by adding fruit or vegetable juice.. Let the dough rest for 15 minutes.. Divide the dough (depending on the number of different colours you intend to make).Add food colouring. One drop at a time to each portion. Knead until the colour is distributed evenly.P/S:I skip this step because I am using natural food coloring in STEP 2.. Dip your fingers in water before shaping them if feels the dough dry.Shape the dough to balls or any others cute forms. (i.e. Panda, cat paw, alphabet, dice, flower etc).. The uncooked tang yuan in various shapes and colours.. 
Bring a pot of water (which enough to submerge them completely) to boil.Add the tang yuan into the boiling water and cook until they float to the surface.. Transfer tang yuan immediately to a bowl of (room temperature) water to cool down.This prevents them from sticking to one another or discolour the soup.. Add pandan leaves, gingers and sugar for sweet soup in a pot. Reduce heat to a simmer until the sugar is fully melted.. Add cooked dumplings to a serving bowl and ladle the sweet soup over.. Natural dyes from fruits and vegetables can be used to create a rainbow of dumplings.Eg:Orange = 2 tablespoon carrot juiceGreen = 2 tablespoon spinach juiceBlue = 3 tablespoons blueberry juicePurple = 2 tablespoons black berry juice + 1 teaspoon beet juiceBrown = 4 teaspoon coca powder\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 1, 0, 2]\nD: [1, 3, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_34_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_34_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_34_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_34_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [1, 0, 2, 3]\nD: [0, 1, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. First, take the ground beef and place it in a large sauce pan at medium heat with a little bit of oil. Brown the meat until all of it has crumbled and pinkness is gone.. Turn off the stove and drain the liquid from the pan. Return to stove, turn your burner back on to medium, and add the garlic powder and your diced green bell pepper. 
At this point, you may also add salt, black pepper or any other spices you like!. Now add in the tomato paste and ketchup. Set the burner on low. Stir the ketchup and tomato paste into the beef/pepper mixture until combined.. With the ketchup and paste mixed it, it's time to add some BBQ sauce! There aren't any real measurements for the sauce, just start with a little first. At this point, you may also add more ketchup, spices, or BBQ sauce to your liking.. Finally, assemble your sandwich! You can toast your buns, or not. You can add cheese, or not. The choice is yours! (However, the hot sloppy joe mixture will melt the cheese and make it even more sloppy; not a bad choice!) Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [1, 0, 2, 3]\nD: [0, 1, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_35_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_35_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_35_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_35_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [1, 0, 3, 2]\nD: [3, 1, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Measure 450 grams of chocolate chips. Put the chocolate chips on a double boiler on LOW temperature, and let it soften.. Mix the Graham and Chocolate crumbs together, once mixed add melted butter, and keep mixing. Spray your sprinform pan. Raise the crust onto the edges of the pan, and scrape off the extra to make it flat. Next put the rest onto the bottom. Chill in fridge.. Seperate 5 of the eggs, so that you have a bowl of yolks and one of whites. Add the last egg to the bowl with the yolks. 
Add 1/4 cup icing sugar to the egg whites. Put in mixer on High speed until stiff peaks form.. Mix 2 cups whipping cream with 2 tablespoons of icing sugar. Blend until stiff peaks begin to form.. Poor the melted chocolate into a bowl, next add the egg yolks and mix with mixer, then add one third of the stiff egg yolks and one third whipping cream and mix until light brown and smooth. Next add the rest of the egg yolks and whipping cream, instead of mixing, fold over and stir with spatula. Now pour the mousse (everything you just mixed) into the crust, and put in fridge over night.. Once you're finished you can decorate however you want.\u00a0\nEnjoy!\nThese chocolate mousse are great fundraisers, we sold over 240 and had to stop the orders.\nThanks for Reading!\nIf you have any questions just ask.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [1, 0, 3, 2]\nD: [3, 1, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_36_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_36_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_36_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_36_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [0, 3, 2, 1]\nD: [0, 2, 1, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
SoftwareCrust\n\t\tabout 1 cup of mini pretzels\n\t\t1 tablespoon butter, melted\n\t\t1 tablespoon sugarPeanut Butter Layer\n\t\t1 cup peanut butter\n\t\t1/4 cup butter\n\t\t1/4 cup brown sugar\n\t\t1 1/4 cup powdered sugar\n\t\tsplash of vanillaMicrowave Caramel (makes about twice as much as you'll need, but it's fantastic over ice cream (or off of a spoon!))\n\t\t1 cup sugar\n\t\t2 tablespoons water\n\t\t2 tablespoons corn syrup\n\t\tlarge pinch of salt\n\t\t1/2 cup heavy creamChocolate Layer\n\t\tabout 8 ounces of chocolate (milk or semi-sweet work well)HardwareFor All\nmicrowave\n8\" x 8\" baking dish\nmeasuring cups and spoons\nknifeCrust\nfood processorPeanut Butter Layer\nmedium mixing bowl\nmixing spoonMicrowave Caramel\nmedium mixing bowl\nsmall mixing bowlChocolate Layer\nsmall bowl\n . \n\t\tIn a food processor, grind the pretzels to a fine powder\n\t\tAdd melted butter and sugar\n\t\tProcess until well combined\n\t\tSpread the mixture over the bottom of the baking dish. \n\t\tCombine sugar, water, and corn syrup in a medium-sized mixing bowl\n\t\tMicrowave on high for 5 1/2 minutes or until the sugar is melted, and the mixture is a very light amber color\n\t\tAllow the mixture to sit at room temperature for about 3 minutes\n\t\tWhile it's sitting, heat the cream in a small bowl for 45 seconds in the microwave\n\t\tAdd the heated cream and a pinch of salt to the sugar mixture and mix well\n\t\tTransfer to the fridge until ready to use. \n\t\tIn a medium mixing bowl, combine peanut butter, brown sugar, butter, and powdered sugar\n\t\tMicrowave for 1 1/2 minutes\n\t\tMix to combine\n\t\tAdd vanilla and mix well\n\t\tPour the mixture over the pretzel crust and spread evenly. 
\n\t\tIn a small bowl, melt the chocolate in the microwave by heating for 35 seconds\n\t\tPour the melted chocolate over the peanut butter mixture and spread evenly\n\t\tPour thin stripes of caramel over the chocolate\n\t\tGently pull the tip of a knife through the caramel and chocolate to combine them slightly\n\t\tPlace dish in the fridge for about a half hour to allow the chocolate and caramel to set up\n\t\tSlice into bars and enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [0, 3, 2, 1]\nD: [0, 2, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_37_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_37_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_37_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_37_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [2, 1, 3, 0]\nD: [1, 2, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. To make these, you will need:\nSodium Alginate\nCalcium Lactate\nEspresso (this can be instant espresso powder + water)\nChocolate chips (ignore the fancy block of chocolate in the picture--I'll explain later)\n[Heavy cream\nSugar\nVanilla\nGelatin powder] - for panna cotta\nor\nWhipped cream (from a can or homemade)\nAnd also:\nA cylinder (paper towel roll or similar)\nWax paper\nTape\nScissors\nA scale (ideally)\nSome kind of dropper (pipette, syringe, straw, etc.)\nLots of bowls and silverware\nA fancy plate (for displaying). \n Sushi connoisseurs know that the key to good sushi is good rice. 
That might not entirely be true in this case (it's hard to beat molecular gastronomy), but hey, those liquid-filled espresso spheres need to sit on top of something fluffy and white. This can be any sort of sturdy no-bake pudding (or whipped cream), but I went with panna cotta, the Italian custard set entirely with the power of gelatin.\nIf you decide to use panna cotta, you'll want to start about four hours in advance so that it'll have time to set. Here's how to make enough for 8-ish pieces of sushi, depending on how big they are.\nINGREDIENTS:\n1 1/2 cups heavy cream\n1 package gelatin\n1/4 cup sugar\n1/2 teaspoon vanilla extract\nDIRECTIONS:\n\t\tPut about half the cream in a saucepan and sprinkle the gelatin over it\n\t\tLet it sit for about 5 minutes, and then turn the heat to low and stir gently until the gelatin dissolves\n\t\tAdd the rest of the cream, the sugar, and the vanilla\n\t\tTurn heat to medium and stir until it starts steaming\n\t\tPut a cover on it and let it sit for ~15 minutes\n\t\tLet cool, then put in the refrigerator and let it sit until it sets (3 or 4 hours)\nOr just buy a can of whipped cream. Up to you.. \n The next thing to make is the \"nori,\" a thin cylinder of chocolate that will contain all your fillings. You can use any reasonably sturdy, cylindrical object to form these--an empty paper towel roll is a good bet, but I used the cardboard roll from an empty box of aluminum foil, which was a little smaller and sturdier. You'll also need wax paper, tape, and scissors.\n\t\tThe first thing you want to do is to prep your cylinder with markings to show how tall the rolls should be. This will make it easier to make them all the same height. 
Take some tape and mark off a sushi-looking length on your cylinder.\n\t\tTear off a piece of wax paper big enough to wrap around your cylinder, and cut it in half.\n\t\tWrap one of the halves around the cylinder, and tape down the seam.\n\t\tWrap two pieces of tape around the cylinder, lining them up above and below the guide on the cylinder. Fold the tape under at the ends so you'll be able to pull it off easily.\nYou now have a form on which to mold your chocolate cylinder.\n . \n Now it's time for a hard lesson learned from the pretty block of fancy chocolate shown at the beginning, which is that tempering chocolate is actually kind of hard. If you trust yourself to do it, then go for it, but I ended up switching to a non-fancy package of mini semi-sweet chocolate chips, which I melted the old-fashioned way: in the microwave.\n\t\tPut the chocolate chips in a microwave-safe bowl.\n\t\tMicrowave at 20-second intervals until the chocolate chips are just starting to get shiny--they should still be shaped like chocolate chips.\n\t\tStir like crazy. You should have chocolate that is spreadable but not liquid.\n\t\tUsing a butter knife, spread the chocolate between the two strips of tape around the circumference of your cylinder, going lengthwise.\n\t\tGet a little more chocolate on your knife and go around the cylinder with the flat of the blade, smoothing it out. Make sure not to let the chocolate get too thin.\n\t\tLet it air-dry for a few seconds, then pull off the two horizontal strips of tape, leaving a perfect cylinder of chocolate\n\t\tCarefully pull the wax paper cylinder away from the cylinder form and stand it up in your freezer for a few minutes. The refrigerator should also work, if you leave it there longer.\n\t\tRepeat until you have as many chocolate cylinders as you want to make.\n\t\tGently push the wax paper away from the chocolate, pushing on either side of the chocolate cylinder at once. 
This is important, and why you can't just use the edge of the wax paper in place of one of the pieces of tape. When the paper is almost completely detached, twist it to fully disengage the paper.\n\t\tIf you didn't spread your chocolate too thin, you should now have a chocolate cylinder, ready to be filled. Put it on a plate and admire it.. So you've gotten the easy part out of the way. Any fool can make custard and make chocolate into a cylinder, right? Now it's time for what this dessert is really all about: Molecular Gastronomy.\nLet's let the phrase marinate a bit and get over some common misconceptions. Molecular Gastronomy approaches food and the art of cooking with a rigorous, scientific zeal. We know egg whites solidify at a certain temperature, but what is going on within the egg white itself? Does altitude, freshness, chicken diet, or water salinity have anything to do with it? These types of questions, seeking to isolate and pinpoint the how and why of food, are what spurred techniques in molecular gastronomy. If you'd like to learn more about the history of molecular gastronomy, as well as details about the science behind the how and why of your food, don't hesitate to check out On Food and Cooking by Harold McGee.\nMolecular gastronomy may be best known for its use in in stuffy and expensive small plate establishments (I may be bitter over the cost and wait for Alinea...), but the techniques used have been a part of our culinary language for years. Check the stabilizers and additives in any food you buy (the pimento pieces found in green olives are produced exactly the same way as this espresso roe!). If anyone raises a \"chemicals in our food\" line with you, gently remind them the food you are eating is a vast array of chemical and biological reactions cooperating and competing against each other. 
These ingredients have passed rigorous and lengthy health standards with flying colors, and make an amazing addition to your culinary tool-belt.\nMolecular Gastronomy has a relatively low cost of entry, as well.\nSodium Alginate-\nhttp://www.thespicehouse.com/spices/Algin#content\n(Carbohydrate derived from brown algae that forms a gel upon contact with calcium ions)\nCalcium Lactate-\nhttp://www.thespicehouse.com/spices/calcium-salt\n(Calcium ion source that dissolves in water- may taste bitter, but harmless)\nThe two ingredients used in this recipe go for less than $7.50 (+ tax and/or shipping) and are used in small quantities (on the order of grams). Inexpensive, and long lasting. If you don't have a food scale that is accurate to the gram, you may want to invest in one later- after all, food is science. But this technique is pretty resilient to estimation if you know what to look for. . \nI used a powdered espresso for this recipe, and prepared it a bit stronger than what the instructions dictate. Remember that these espresso caviar spheres will be small- you'll want them to pack a punch of flavor that you'll notice in just a few caviar \"drops\". In my case, the espresso was a bit bitter, so I added 3 tsp of sugar. You can substitute fresh espresso, or even strongly flavored fruit juices- just be careful with the acidity, as anything too acidic will throw off the reaction.\u00a0\n- three teaspoons espresso\n- three teaspoons sugar\n- 9 oz water\n__________\n158 g of espresso liquid\n\t\tAdd enough Sodium Alginate to make your espresso a 1% sodium alginate solution. Divide the grams of liquid you have by 100 to figure out how much Sodium Alginate you need to use. In my case, it was about 1.58 g. You'll be able to modify the solution after the fact if things aren't working out right or your drops aren't forming properly.\n\t\t\u00a0\n\t\tMix thoroughly. I used a latte foamer, but you can use anything you like- a hand mixer or a whisk will do just fine. 
The solution will be quite foamy- don't worry.\n\t\t\u00a0\n\t\tPass through a fine mesh strainer a few times, or let sit in the fridge until the foam has dissipated. Discard any lumps that may have formed- this may be due to hard water, or cross contamination. Sodium Alginate is sensitive to calcium.\n\t\t\u00a0\n\t\tSet aside and go on to prepare the calcium bath.. \nYou'll need a 1% calcium solution to serve as a bath for your spheres. As you drip the espresso into this bath, the outside of the droplet will form a carbohydrate shell, forming your caviar. The longer you let your caviar sit, the firmer the shell will be until your drop eventually solidifies.\nTime for some simple chemistry mathemagic for those of you without a kitchen scale. You may want to consult the internet/ wikipedia for densities if you have a different ingredient.\n3 cups water---> 709ml water\nDensity of water ~ 1g/ml\n709ml x 1g/ml = 709g\n709 g of water means you'll need 7.09 g of Calcium Lactate (1% solution)\nDensity of Calcium Lactate ~1.5 g/ml\n7.09 g \u00a0/ 1.5 g/ml = 4.7ml\nYou need about 4.7 ml of Calcium Lactate, or about 1 teaspoon. Easy. Mix this into your three cups of water until it dissolves.\nTime to Spherify.. For my caviar, I used what I lovingly refer to as a sawed-off pipette. Experiment with what you have available, and see what kind of sizes and shapes you can produce. Syringes (without needles, check your drugstore) of different sizes are commonly used, as well as pipettes and other types of droppers. You can even use teaspoons to produce a ravioli effect.\nTake your drop-making implement of choice and drip the espresso / Sodium Alginate mixture into the calcium bath, one drop at a time. Release the drops as close to the bath as possible so that they don't solidify in funny shapes due to the impact.\nWhen you're out of espresso-mixture or have as much caviar as you want, pour the contents of the bath through a strainer over the sink. 
Make sure to gently rinse your caviar to wash off the bitter calcium water solution.\nNote: if your caviar doesn't form and just dissolves, you may need to have a stronger Calcium bath. Experiment with the bath first in small quantities, followed by your Alginate solution. If the drops form too quickly, your bath may be too strong. Try adding more water and trying again. They may be fragile, depending on how long you've let them soak, so be careful.\nDue to this particular method, the caviar will continue to solidify slowly after formation- eat them quickly if you want that pop or prepare them last. There is also another method called reverse spherification, in which your liquid contains calcium and is dripped into an Alginate bath (opposite of what we've done here). This method has a thicker, but non-progressing shell for your liquid. Stay tuned in the comments for my experiment with that.\nTime for assembly.. \n Now for the moment of truth: putting it all together.\n\t\tSpoon your panna cotta into the chocolate cylinder (or if you went the lazy route, \u00a0squirt some whipped cream in there) until the cylinder is almost full.\n\t\tCarefully spoon the espresso roe on top of your filling until the cylinder is covered.\n\t\tIf you want some plate decor, you can use your cardboard cylinder form to punch little discs of panna cotta, and then scatter some extra roe around. Or drizzle some chocolate around. Make it look nice.\n\t\tTake artsy photographs of your masterpiece. You've earned it.\nRemember, sushi is in fact traditionally picked up with the hands and eaten in one bite. If you want to be a bit more civilized with this dessert, though, a fork is recommended (or chopsticks if you make them a bit smaller).\nServe this as the dessert for a sushi dinner party, and all your friends will forever think you're 100% classier than you are in real life. 
Guaranteed.\n \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [2, 1, 3, 0]\nD: [1, 2, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_38_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_38_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_38_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_38_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [1, 3, 0, 2]\nD: [2, 3, 1, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need: 1 1/2 cups heavy cream 7 bags of chai tea, (or whatever flavor of tea you would like! On two different occasions I have added a whole chili for a spicy truffle, and one time I used three cinnamon sticks!) 1/4 teaspoon salt 9 ounces dark chocolate (70 percent or higher!) 2/3 cups Dutch process cocoa powder Gold dust, for garnish (I have also sprinkled cinnamon sugar mixture and finely chopped nuts) ----> here is the link for gold leaf! (.20 a sheet!). Chop the dark chocolate finely, it melts faster. Combine the heavy cream and tea bags (or cinnamon) in a small saucepan. Place the pan over medium-low heat and warm gently, stirring occasionally, until bubbles just start to form around the edges of the cream, about 5 minutes. Simmer another 3 minutes before turning off the heat. *Note you can add vanilla or allspice too if you'd like a more intense flavor!I did this when my little took a nap!. Place the chopped chocolate and salt in a medium bowl. Strain the hot cream mixture through a fine-mesh strainer over the chocolate and let sit for 3 minutes. 
Slowly whisk the now-melted chocolate into the cream, starting with small circles in the center of the bowl and moving into larger circles as the mixture begins to come together; stop when smooth and completely blended. . Press a piece of plastic wrap directly on top of the ganache and refrigerate for about 3 hours, or until just set and cold but still pliable. (I didn't have plastic wrap on hand but I do recommend it, I feel like it didn't chill as fast with the foil!). Place the cocoa powder in a shallow bowl. With a 1-ounce scoop, scoop level rounds of the ganache into your palm and gently roll into a ball. Roll the truffles in the cocoa powder to coat, then place in an airtight container and refrigerate until ready to serve. Just before serving, sift a touch of gold dust, if using, over the top of the truffles for sparkle. ----> I small spoon works as well it will just be messier!Recipe courtesy of Giada De Laurentiis!. Take these little gems to your next Christmas party or as gifts for the ones you love! With love, Mama Mere Bear Enjoy! \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [1, 3, 0, 2]\nD: [2, 3, 1, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_39_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_39_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_39_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_39_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [2, 3, 0, 1]\nD: [1, 2, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ready made chocolate frosting\nHostess (or other mass produced) chocolate cupcakes - Buy as many cupcakes as you want to have Daleks. 
Adjust quantities of other ingredients as necessary.\nChocolate chip morsels\nChocolate sandwich cookies (like Oreos)\nMini chocolate frosted donuts (approx. size of cookies)\nFudge covered marshmallow cookies\u00a0\nSmall pretzel sticks, Chocolate covered Pocky\nCheerios (colored, if you can get them)\nGood & Plenty candy (red bits only)\nA drinking straw to use as a tool\nCookie sheets for transport & storage\nBase to set your work up on (helps with frosting process). I used styrofoam bowls, turned upside down. Set a cupcake upside down on the little card stock base that comes with it in package. This helps you move the piece later.\nPlease both on top of inverted bowl. This makes frosting and deco process easier.. In order,\non top of the inverted cupcake.\nstack\na mini donut,\nan Oreo\n& a marshmallow fudge cookie,\n(with some frosting in between each for 'glue').. Make sure your stack is nice & straight.\u00a0\nCover the donut & cupcake with chocolate frosting.\u00a0\nLeave the cookies showing.. At this stage, move on to the next Dalek, and the next, and the next......etc..\nBringing them all up to this stage of finish together.\nThe rest of the process is detail work, and I find it easiest/most satisfying to get the structural work done on all before moving on to finish.\nHave a couple of cookie sheets on hand to set them on. You can easily pick them up by the little cardboard bases to transfer them there. Then, once room is made, put them - en masse - into the refrigerator to chill until you're ready for finish.\u00a0\nOR - watch them pile up on the stove top and counter.\nThey don't really require chilling, and you may want to keep an eye on them as they multiply ; ). 
Add chocolate morsels around the frosted base, spacing them as they would appear on the real thing.\nI got 3 in a row top to bottom, and about 6 rows all around.\nI do this step to all Daleks before moving on.\nHave Cheerios, Pocky, pretzels, the straw and red Good & Plenty bits on hand for next steps.. You will be using the pretzel rods for arms. Sticking them into the donut layer.\nPick a grape colored Cheerio out. Dip one of the pretzel rods' end into the frosting and place the grape Cheerio on that end.\nThis is the plunger arm, it goes on the left.\nRegular, plain pretzel rod goes on the right.Chocolate covered pocky is the eye piece rod. Dip one end into frosting, same as for arm, and place a lighter colored Cheerio onto it's end. Insert completed part into front of marshmallow cookie top, centered over arms.The Good & Plenty eyes will go on top into the marshmallow cookie.\nYou need the straw tool to pre-drill the hole for the candy, so you have a clean insert, without cracking up the fudge coating on the cookie.. 
With all your Dalek's finished, place them back onto their cookie tray transports and seal them in the refrigerator until you are ready to release them.\u00a0\nThis last step keeps them from melting in warm weather, and stops them from escaping or signaling home.\n(This is what you tell the kids, to keep them out of the fridge until party time ; )\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [2, 3, 0, 1]\nD: [1, 2, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_40_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_40_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_40_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_40_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [1, 0, 2, 3]\nD: [2, 3, 1, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For this salty-sweet ice cream, we need:\u2022 2 eggs\u2022 2 cups of milk\u2022 1/3 cup milk\u2022 1 teaspoon vanilla extract\u2022 1 cup heavy whipping cream\u2022 Sea salt (to taste)\u2022 Blue food coloring (optional)\u2022 Popsicle molds (optional). Got all your ingredients? Great! Start by separating the egg yolks and the egg whites into separate bowls. Now beat the egg whites until they're stiff. Then mix the egg yolks and sugar until thick (don't combine the two bowls like I did the first time).. Eggs all happy and separated? Good! Now slowly bring the milk to a boil, stirring occasionally. Once boiled, slowly mix it into the egg yolk-sugar mixture, mixing well as you do. Once it's mixed, put it back into the pot, heating it up to make a slightly thick custard. DO NOT BOIL THIS! Mix this in with the egg whites. 
Add sea salt until it's salty-sweet. Cool this mixture in the fridge. . Is it cooled? Perfect! Add in the cream, vanilla extract, and food coloring. You may need to add a little more salt because of the newly introduced ingredients. Now, time to make ice cream! Do this step in pint-sized (2 cup) quantities. I know, long and stuff. But work with it! Grab a large bowl, fill it about halfway with ice, and then stir in 3/4 cup rock salt. Now put the smaller bowl in there, almost all the way if possible. Now, put in your pint of ice cream, and mix it for ten minutes. All mixed? Put the whole set up (nested bowls and all) covered with a towel into the freezer for 45 minutes. Kill some time. Build a keyblade. Or beat that one kingdom hearts boss *coughsephirothcough* that's impossible. Time killed? Okay, now take it out of the freezer, and stir it for five more minutes, then put the ice cream in a container and freeze for 2 hours-overnight. Repeat with the remaining ice cream. . Boy, that was time consuming! But congratulations! You're all done now! Share and enjoy it with your friends on the top of a really tall clock tower!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [1, 0, 2, 3]\nD: [2, 3, 1, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_41_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_41_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_41_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_41_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 3, 0, 2]\nD: [1, 0, 3, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Melt the butter in a saucepan over medium heat.. 
Add the garlic and cook until golden.. In a bowl, combine the half and half, cornstarch, Italian seasoning, oregano, salt, and pepper.. Add the mixture to the cooked garlic and stir well. Using a whisk, stir until the mixture thickens, about 5-10 minutes.. Add the cheese and stir well.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 3, 0, 2]\nD: [1, 0, 3, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_42_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_42_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_42_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_42_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [1, 2, 0, 3]\nD: [1, 0, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 2 tbsp Olive Oil\n2 cloves of Garlic\n1 Onion sliced\n2 pinches of Chilli Flakes\n2 pinches of Ground Ginger\n400g of Sweet Potato\n400ml of chicken or vegetable stock\n200g Creamed Coconut\n50g Fresh Coriander leaves chopped (optional)\nSalt and Pepper to taste\nCoriander or Spinach to garnish. Heat the olive oil in a large saute pan, add the finely chopped onion and the crushed garlic. Gently fry off for 3-4 minutes until soft and golden brown.. Once the onion and garlic are cooked through add the chilli flakes, ground ginger frying for a further 2-3 minutes.\nThen add the peeled and chopped sweet potato cooking for a further 2-3 minutes.. Now add the stock (hot) \u00a0and coconut. Allow to cook for about 10 minutes.\nTake care when adding hot liquid to a hot pan.\nAt this stage if you wish to add the optional chopped coriander leaves then do so.. 
Remove from the heat and allow to cool before pouring into a blender or food processor.\nBeware of adding hot liquids to a blender. Hot liquids can scald!\nBlend until a smooth and even consistency.\nYou might find that you need to add a little warm water to thin but this is entirely up to you.. Reheat or store in freezer.\nIf you are going to freeze the soup then it is essential to let the soup reach room temperature before putting it in suitable containers and freezing.\nSeason to taste and garnish with Coriander (or anything green), serve with home made crusty bread. Its really delightful.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [1, 2, 0, 3]\nD: [1, 0, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_43_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_43_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_43_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_43_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [2, 0, 1, 3]\nD: [1, 0, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. One 14-16oz package of Eckrich Smoked Sausage 1.5 cups milk 2.5 Tablespoons of coconut oil (or use butter if you don't like coconut oil) One 8 oz block of mild or sharp cheddar cheese A 16-17oz package of Gemelli pasta (feel free to try other styles of pasta, but I love this one! The sauce soaks into the folds in the noodles and it's fantastic!) Panko bread crumbs (optional - but if you like an added crunch to your bite, it's delicious!) Small bag of your favorite shredded cheese blend (optional) . 1. A saut\u00e9 pan 2. A large enough Sauce Pan to boil noodles in 3. Noodle drainer 4. 
Cutting board & Knife5. Spatula6. Measuring cup to measure the milk 7. Glass baking pan (optional - if you would like to bake your mac n cheese with bread crumbs after it's cooked)8. A Timer9. A spoon. Dice up your sausage into bite size pieces and put it in the saut\u00e9 pan.Then, dice up your 8 oz block of cheese and leave it off to the side on the cutting board, we will need it soon. Don't add it to the dish yet. . Fill your sauce pan more than half full of water and turn the stove on medium/high heat to get the water boiling. Once the water is boiling, drop in a little over half of the bag of 17 oz gemelli noodles, then set the timer for 12 minutes.While your waiting for the water to boil, start frying the sausage in the saut\u00e9 pan on medium heat, browning the sausage. This will help add that nice smokey flavor to the mac and cheese! . While your noodles are still cooking and you have about 5-7 minutes left until the noodles are done, It's time to start making the cheese sauce. Add 2.5 Tablespoons of coconut oil to the sausage in the saut\u00e9 pan. . Once the coconut oil is fully melted, mix the 1.5 cups of milk in with the sausage and coconut oil! Stir and mix it all together. Turn the heat to medium low or about 3-4 depending on your stove at home. If your milk begins boiling, stir frequently. . After you add the milk in and stir it all up, drop in the cubed up cheese and mix it all together. Feel free to add a small handful of shredded cheese too! Turn the heat to medium and stir frequently until the cheese and milk is melted and blended together. Once the cheese is melted, turn the heat to low. The milk, cheese, and coconut oil will make a nice cheese sauce. At this point, your noodles should almost be done! If they are already finished, wait until the cheese is fully melted to add the noodles to the saut\u00e9 pan. . Once the cheese sauce is melted and blended together, drain the noodles and mix them in with the sausage and cheese sauce. 
Stir them all together, serve, and enjoy! The next step is optional unless you would like to bake yours with bread crumbs to add an added crunch on top! . Scoop your fabulous mac and cheese into a glass baking dish, sprinkle a handful of shredded cheese on top, then cover it with a thin layer of panko bread crumbs. Bake at 325 for 10-15 minutes. . Remove from the oven and enjoy! \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [2, 0, 1, 3]\nD: [1, 0, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_44_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_44_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_44_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_44_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [1, 0, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You can, of course, use your own preferred pastry and filling, either homemade or bought, but here's what I did: Lydia's Dia de los Muertos savoury pie, feeds 3-4, for more add a hearty side of veg! For the pastry:\u00a0 225g (8oz) plain white flour, plus extra for dusting 50g (2oz) unsalted butter, diced, plus extra for construction 50g (2oz) lard or veg shortening (or just make the whole 100g up with butter) 2 eggs splash of water - I used about 2tbsp For the filling: - adjust this to your liking! 
1 medium brown onion, chopped 2 cloves garlic, chopped 4 carrots, peeled and cut into 1.5cm discs 13 mushrooms (I used white button mushrooms), cut into quarters half a small pumpkin, chopped to 1.5cm chunks 2 large sprigs of rosemary 1 tbsp olive oil 1 tsp mustard seeds 2 tbsp tomato puree 1/4 tsp celery salt 1 tbsp gravy granules salt and pepper to taste splash of worcester sauce 1/4 tsp dried oregano 1/4 tsp dried sage 1/4 tsp dried mixed herbs 1 tbsp wholegrain mustard Tools: Chopping knife and board Wooden spoon / stirrer Pastry brush (or your fingers!) Tin foil Food wrap / Food bag Baking paper Rolling pin Pie dish about 7.5 - 9 inches in diameter Large frying pan / cooking pot with lid Fridge Oven Kettle About 2 hours.. To make the pastry, rub the fats into the flour (either with your fingertips or by cutting through the mixture with a knife) until you get a bread-crumb like texture. To this add 1 lightly beaten egg (leave the other egg for an egg wash later) and add the water a bit at a time, while stirring / working with your fingers, until a dough is formed. You won't need very much water - about 2 tbsp was all I needed. Cut off one third of the dough and flatten both dough balls into discs. Wrap in food wrap / place inside two food bags and refrigerate for 30 minutes. Meanwhile, start preparing the filling.\u00a0. Heat the oil in your pan and throw in half of the fresh rosemary (chopped, stalk removed), and your herbs. Add onions and cook on medium heat for about 5 minutes, stirring. Next, add the tomato puree, then the garlic, followed by the rest of the veg. Stir well and cook, covered, on a high heat for 5 minutes. Boil about 250ml water and add to the pan - only enough to reach halfway up the veg in the pot. Reduce heat to medium. Add the gravy granules, worcester sauce and season to taste with salt and pepper.\u00a0 Stir well, pushing the veg down into the water, place the remaining rosemary sprig on top and continue to cook, covered. 
After about 7-10 minutes remove the rosemary and continue to cook covered until the carrots and pumpkin are almost done - about 10 more minutes. Give it a taste and adjust the herbs as necessary. Finally, uncover and cook on low to medium to allow the filling to reduce (liquid to evaporate a fair bit).\u00a0 Preheat the oven to 180\u00b0C (350\u00b0F, gas mark 4).. Shape two pieces of foil (each about 50cm long) to form the cheek indentations (see photo) and fix in place on the pie dish with a piece of butter. Roll out the large disc of pastry on a floured surface, to fit the diameter and depth of the pie dish. Lift the pastry onto the rolling pin and gently place into the pie dish, easing it into the nooks and crannies with your knuckles. Put the dish back in the fridge. Roll out the\u00a0large disc of pastry on a floured surface, to fit the diameter of the pie dish. Cut out eyes (in the shape of aviator style sunglasses) and the nasal cavity (upside-down thin heart). Roll out the cut-outs of pastry into a long thin rectangle, and cut rectangles out for the teeth. Make about 20 that are almost square and 4 that are slightly thinner, for canines. Lightly beat the remaining egg and brush onto the pastry lid in a smile shape, and arrange your pastry teeth onto it. Transfer your pastry onto a baking sheet and refrigerate until needed.. Once your filling is ready, there'll be very little liquid left. This took about 45 minutes from the moment the oil was heated right at the start. You can either leave the filling to cool to room temperature, which will prevent the filling from melting your pastry and result in a slightly better structure, or you can work quickly! Ready? GO! Place the filling in the base and distribute evenly. Brush the pastry edges with beaten egg. Place the skull-shaped top, erm... on top, and seal the edges (push the pastry top and bottom together) using your fingers. Trim the excess from the edges. 
Brush the top with beaten egg - be careful around the teeth as vigorous brushing may dislodge them! Bake in your preheated oven for 45-50 minutes until nicely golden. Remove from oven and leave to cool for 5-10 minutes before removing from base with a slotted spoon and digging in!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [1, 0, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_45_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_45_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_45_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_45_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [2, 0, 3, 1]\nD: [0, 3, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 2-3 pounds boneless pork shoulder (aka boston butt) 1 tsp Oregano 1.5 tsp Cumin 2 tsp Salt 1 tsp Pepper 10 cloves of Garlic 1 medium Orange (or about 1/3 cup of orange juice) Onion (optional)(Note: I have coconut oil in the picture, but I didn't actually use it... ooops!). If you are using onions, then cut it into quarters and place it in your crock pot. Then, place the pork on top of the onions. If you're not using onions, then place the pork directly in the crock pot.If you don't have a fancy citrus press or a citrus reamer, then do what I did to juice the orange over the pork:Cut the orange in half. Stick a fork in the middle of an orange half as a makeshift reamer. Squeeze the orange shell in half while moving the fork around to break up the orange pulp. It's okay to get some pulpy bits on your pork, but you will want to remove any seeds that fall in. . 
I have to admit that I love garlic, so I use a lot! If you have some really large garlic cloves, then you can cut it in half like I did. Then using a paring knife, cut slits into each side of the pork and insert the garlic into the slits.. Mix the spices together in a small dish and sprinkle it evenly onto the pork. You can also pat the spices into the pork gently. . Set the crock pot to low and cook for 6-8 hours. Go to work, run some errands, hit the gym, and come home to the delicious aroma of perfectly cooked carnitas! Use two forks to shred it up and it's ready to be paired with some beans and yellow rice, stuffed into purple corn tacos, or anything! Note: If you want it crispier, you can put the shredded pork on a cookie sheet and broil for 3-5 minutes. I never get to this step because it smells so good, I have to dig in! ;-) \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [2, 0, 3, 1]\nD: [0, 3, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_46_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_46_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_46_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_46_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 1, 0, 2]\nD: [0, 1, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Take your almonds and put them into a food processor and pulse them until they become crumbs.Now drain your Dates (and save the soak water for later on) and process the Dates with the Almonds until they are well combined and very sticky.Press into a lined springform pan and set aside.. I don't know about you, but I love Ice Cream! 
So if you want to make this part a thicker layer on the cake...just put in more frozen Bananas!Put the frozen Bananas and the Vanilla into the blender and whiz away until it's thick, creamy, and smooth. Keep an eye on it as it doesn't take a long time to change!Spread this on top of the pie crust and set aside.. Place the Dates, a 1/4 cup of the date soakwater and the Almond Butter into the food processor and process until it's the smoothest that you can get it.Spread your Caramel on top of the Banana Ice Cream layer.. Place all the ingredients into the (cleaned) food processor. If you aren't using the Agave Nectar, just use 1/4 cup of the date soakwater and 1/2 cup of Dates.Again, just process everything together until it is smooth!Layer this on top of the Caramel, being careful not to mix the Caramel and Chocolate layers together!Put the Cake into the freezer and freeze overnight.How easy can it get?!?. When the time has come and you are ready to indulge or impress your friends...Take it out of the springform pan and let it sit out for about 5-10 minutes, then cut it into the sizes you wish to serve, but just before you serve it, sprinkle a little bit of pink Himalayan salt on top. 
This will bring out the chocolate flavor even more!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 1, 0, 2]\nD: [0, 1, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_47_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_47_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_47_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_47_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 1, 0]\nD: [3, 2, 0, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need: Food Processor or Strong Blender 2 Cups Spinach (Yup that's right, just stay with me!) 1 Cup Raw Almonds 10 Large Dates (Remove pits) 3/4 Cup Peanut Butter 1/2 Cup Honey 1/2 Cup Almond Meal 1/2 Cup Oats 1 Tbs. Flax Seed 1 Tbs. Green Superfood Powder 1 Tbs. Coconut Oil 1/2 Cup Coconut Flakes 1/4 Cup Coconut Flakes (To roll cookies in after). Place all ingredients in the food processor EXCEPT: Flax seeds Oats Almond Meal Pulse in food processor until everything is a small texture, should be very sticky at this point.\u00a0. Pulse in with the wet ingredients: Flax Seeds Oats Almond Meal Pulse everything together until it starts to ball up.. Make desired size cookie balls and roll in coconut flakes.\u00a0. Enjoy your Popeye Power Cookie immediately (and watch your muscles bulge!) or save them for later. 
If you want to save them, wrap each cookie in plastic wrap and put in refrigerator.\u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 1, 0]\nD: [3, 2, 0, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_48_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_48_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_48_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_48_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [0, 3, 2, 1]\nD: [3, 1, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \u2022 1/2 pound sliced bacon (diced)\u2022 1/2 medium sweet onion (chopped)\u2022 3 eggs (lightly beaten)\u20222 cups frozen shredded hash brown potatoes(thawed)\u2022 1 cup shredded cheddar cheese\u2022 3/4 cups 4% cottage cheese\u2022 5/8 cups shredded Swiss cheese6 servings. Preheat oven to 350 degrees. . Cut up the bacon and onion. Dice the bacon and chop the onion.. In a large skillet cook the bacon and onion on medium heat until the bacon is crisp. If you need to put the bacon in the microwave start with 30 seconds and add any additional time needed. When it is cooked drain the bacon and onion.. Lightly beat the eggs and put them in a large bowl.. Shred the potatoes or just buy shredded hash browns and put them in the large bowl.. Add the remaining ingredients into the large bowl. (Shredded cheddar cheese, cottage cheese, shredded Swiss cheese, bacon and onions). Next transfer the ingredients to a 9 inch round or square dish. Put the dish in the oven for 35-40 minutes. 
When done let stand for 10 minutes and enjoy your \"Go To Omish Egg Caserole\".\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [0, 3, 2, 1]\nD: [3, 1, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_49_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_49_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_49_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_49_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 1, 2, 0]\nD: [2, 1, 3, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1 \u00bd cups pureed white part of a watermelon \u00be cup of wheat flour \u00bc cup rice flour1 teaspoon Red chili powder1 small onion2 green chilies15 chervil leavesSalt to tasteOil for frying. finely chop onion, green chilies and chervil.Mix rice flour and wheat flour together. Add red chili powder. Add salt, chopped onion, green chilies and chervil. And the pureed white part of a watermelon.Adjust consistency with water as required and make a smooth batter.Cover and keep aside for an hour. While it sits there let\u2019s make the Watermelon chili chutney. you will need:Chili sauce 2 tablespoonsWatermelon juice 2 tablespoonsSalt to tasteButter (not shown in the picture)Ginger and garlic paste (not shown in the picture)method:Add some butter to a small wok and add ginger garlic paste and saut\u00e9. Then add the chili sauce and watermelon juice and let it come to a boil and chutney is ready.. Heat a griddle with oil, when hot drop a ladleful of batter and spread into a circle with the back of your spoon. Place a chervil leaf in the centre. 
Let the pancake cook and flip over and cook the other side as well. Serve with hot watermelon chili chutney. :)Vote for me please. I'm aiming for one of the blenders :( I'm moving to NYC next month, I can\u2019t afford to buy furniture\u2019s and kitchen utensils all at once. Your vote can take me near to winning it. Thank you soo much xo\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 1, 2, 0]\nD: [2, 1, 3, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_50_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_50_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_50_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_50_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [3, 1, 0, 2]\nD: [0, 3, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You need the following ingredients to make these delicious pizza pancakes:\n- 400 gr. flour\n- 3 eggs\n- 900 dl. milk\n- tricolor peppers\n- 2 onions\n- 250 gr. mushrooms\n- 2 packs\u00a0 of bacon\n- oil\n- a pan. \nTake a bowl and add the flower and half of the milk.\nMix the dough with a spoon or whisk untill it is smooth and firm.\nAdd the rest of the milk, two eggs and a little salt in the dough and mix it all together.\nMake the dough smooth without lumbs.. Take a cutting board and knife.\nCut the peppers in pices, the mushrooms in slices and the onions in half rings.\nHeat up the pan with some oil and bake the veggies in separate pieces.\nMake sure you have everything ready for baking and easy to reach.. 
\nHeat up the pan and poor a little oil in.\u00a0Make sure the pan is really hot.\nPut in 2 slices of bacon (or less/more) and quickly poor the dough in the pan.\nFill up the pan nicely, not too thick not to thin.\nWhen the dough is still wet put in some of the unions, paprika and mushrooms.\nFlip the pizza when the top is dry and the bottom brown.\nWait till the other side gets a little brown and warmed up.\nTake the pan of the stove and put the pancake on a plate\nThe pancake is ready to eat! Poor some syrup on the Pizza Pancake if you like and you can enjoy your lovely Pizza Pancake!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [3, 1, 0, 2]\nD: [0, 3, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_51_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_51_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_51_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_51_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 1, 0, 3]\nD: [1, 2, 3, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Dough Ball (store purchase) - oneShredded Mozzarella Cheese - 1 cupRicotta Cheese - 1/2 cup* Although I used a different brand this time, Polly-O Ricotta Cheese (http://www.kraftbrands.com/pollyo/) is my favorite because of creaminess.Extra Virgin Olive Oil - 2 table spoonsFlour - 1/2 cupPeaches - one or twoRaspberries - 1/4 cupBlueberries - 1/4 cupFigs - a fewArugula - 1/2 cupBasil - a few leavesHoney - 1/4 cup* Sliced Almonds (option)* Cocoa Powder (option). Cut the dough ball into 3 pieces by using knife.. 
When you stretch the dough, you have to apply some flour so that they don't stick to your hands.. I preheated the grill really high, and then turned it down to low, so the dough doesn't get burned.However, some people like it crispier. If so, you can keep the flame high and grill it for 30 seconds or so.It's totally up to you!When you started to see dough rising, then flip it over to grill the other side.. I applied some olive oil on the top side for some flavor.. Slice peaches and figs. Drizzle some olive oil over arugula and toss them, so they stay fresh.Tear some basil leaves into smaller pieces by hand.Start with adding Mozzarella cheese on the crust. And add some Ricotta cheese just for peach and berry pizzas.#1) Arugula and figs#2) Peaches#3) Raspberries and blueberriesDrizzle some honey over the pizzas!. Put the pizzas back on the grill and cover the lid.It only takes a few minutes since I am just warming up the toppings and melt the cheese.. This step is an option. I added basil and sliced almonds on the peach pizza. And I sprinkled some cocoa powder on the berry pizza.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 1, 0, 3]\nD: [1, 2, 3, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_52_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_52_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_52_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_52_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [0, 2, 1, 3]\nD: [0, 3, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Fire MeadIngredients\n1 kg blackberry honey\n3 black cardamom pods\n6 cloves\n2 dried ancho chili peppers\n1 cup plain black tea\n3 small blood oranges\n1 pkg Lalvin EC-1118Equipment\n1 sanitizing vessel\nlots of sanitizer\n1 - 2 x 1 gallon (3.78 L) glass carboy or other fermenting vessel\n1 funnel\n1 big metal spoon\n1 coffee mug\n1 brewing pot\n1 pair of scissors\n1 airlock setup\n1 siphon hose, at least 4 ft in length\n11 beer bottles and caps (or 5 pressure-capable 750 ml wine bottles and corks)Notes on Ingredients and EquipmentBlackberry Honey: Any sweet honey with a nice fruity flavour will do, but blackberry is just sooo good.*Black Cardamom Pods: Other cardamoms can be used as a substitute but they won't give it a nice smoky flavour.*Blood Oranges: Try to get blood oranges because they have a certain intensity and tang, but any other sweet and sour orange will work great. *Sanitizing Vessel: This should be solid and large enough to hold all the equipment you need for each step of the brewing process. I use a big Rubbermaid container.Sanitizer: Make sure you read the instructions for your sanitizer and get something food-safe. You can use unscented household bleach diluted to 4ml per Liter of water (1 tbsp per gallon) but make sure you rinse it really well or the mead will taste off.Carboy: Any fermentation will do as long as it's nonreactive, BPA free and foodsafe with some kind of one-way air release valve or a hole that will fit an airlock setup.Coffee Mug: Doesn't have to be a coffee mug but it should be big enough to hold yeast and a cup of warm water. This will be your yeast starter vessel.Airlock Setup: Usually a 3-piece thing that fills with water only lets the CO2 escape and no contaminated air into the brew. The important thing is that no contaminated air gets into the mead. 
A simple balloon with a pinhole in it covering the top of the fermentation vessel works well.Beer or Wine Bottles: These must be capable of withstanding the pressure of a carbonated beverage, no bottle-splosions here! If you choose to use wine bottles get ones with swing tops or champagne corks with cages.\n*Don't listen to me about the ingredients, do whatever you want with your own mead and then post awesome recipes in the comments!\nImage: My first brewed batch of Fire Mead.. \n Clean your kitchen and then sanitize everything. I mean everything that you need for the brewing step, that is.\nFor making the must you need to\u00a0sanitize\n\n\t\tthe fermenting vessel\n\t\tairlock parts\n\t\tfunnel\n\t\tcup or other yeast starting container\n\t\tspoon\n\t\tscissors\n\t\tyeast packet\n\t\tsealed honey container\n\t\tbrew pot and lid\nFollow the directions that come with your sanitizer.\u00a0\nIf you're wondering, must is a solution that you feed the yeast to make wine or mead. It can be honey and water or grapes and water... it could even be tomatoes, sugar and water!\nOnce everything is sanitized and ready you can boil some water for the yeast starter. Dissolve a teaspoon of honey with the recommended amount of water on the package. Once the water has cooled to the recommended temperature scatter the yeast over the top. Give them a gentle stir so that most of them fall to the bottom of the dish and they can get nice and hydrated. Let the yeast bloom and come back to life. They'll start making bubbles and floating to the top of the water, it takes about 10-15 minutes.\nMeanwhile, get the brewing pot on the stove and add half a gallon of water. Put it on the stove and turn it on low. The must should never boil, or even simmer. Honey is a very delicate flavour and applying too much heat can destroy the flavour so make sure it only steams, and doesn't bubble. Chop the ancho chili up with the scissors and add it to the pot with the cloves, cardamom and honey. 
Give it a good stir until the honey is dissolved and zest those oranges. Segment the oranges so that there is no pith in the must. Add the orange fruit and zest and let steam for 20 minutes. Skim off anything that rises to the surface.\nLet the must cool, putting the pot with it's lid on it in a sink of cold water.\u00a0Once\u00a0the must has cooled to the same temperature as the yeast starter pour the must into the sanitized fermenting vessel and then add the yeast starter. Top it up with some warm water and seal with an airlock. Put it somewhere dark and cool and wait for 1 -2 weeks until there is about a 2.5 cm yeast layer on the bottom of the carboy. If you don't have somewhere dark cover it with a towel.\nImages: Beer bottles with sani-brew sanitizer, sanitizing a 1 gallon carboy, the mead pitches and topped up in a 1 gallon carboy.. Sanitize another 1 gallon carboy or fermenting vessel, a siphon hose, and the airlock setup. Cover the top of the carboy with aluminum foil while the equipment sanitizes. If you don't have a second carboy then sanitize a container that can hold the mead while you clean up the original fermenting container.\nPut the mead somewhere a couple feet off the ground and siphon the mead into a new carboy. Leave the spices and chili behind with a little bit of mead and the lees. Lees are the dead yeast cells that pile up on the bottom of the fermenting container and are a normal part of the process. They're pretty good for the garden if you water it down before you use it.\nOnce most of the mead is in the new fermenting vessel, top it up with a bit of water, if necessary and pop the airlock on. Wait for another month or so, giving the mead a soft kick every week or so to loosen up the CO2.\u00a0\nImages: Lees and leftovers from racking fire mead, Siphoning out the mead.. To bottle we need to sanitize the bottles, the siphon hose, and the bottle caps. 
Give the kitchen or wherever you'll be bottling a good clean and clean the bottle capper too.\nIf you have a bottle filler you should sanitize it but if you don't making a kink in the siphon hose works well too.\nOnce everything is clean and sanitized add 3.5 ml (3/4 tsp) sugar to each bottle and siphon the mead in. Cap the bottles and get ready for\u00a0the hard part. Store it somewhere cool and dark and wait for 3-6 months for the mead to become carbonated, and age a while.\u00a0\nWhen you can't wait any more, pop one open and enjoy!\nImages: Siphoning the mead into bottles, siphoning the last bits of the carboy, the final product.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [0, 2, 1, 3]\nD: [0, 3, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_53_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_53_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_53_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_53_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [1, 2, 3, 0]\nD: [0, 2, 1, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 2 lb. or 907 grams Clementine Oranges 2 lb. or 907 grams GrapefruitLemon zest and juice from 1 lemon6 cups water6 cups sugarAdding a sweetener is must with marmalade; it is very sour and rich in vitamin C, you can go with Splenda or some other sweetener. Just match the recipe cup for cup, or \u00be cup sugar and \u00be cup Splenda for low sugar. 
For low sugar or no sugar added you will need to add pectin to get the marmalade to thicken.1 sachet of pectin powder.I used No Cook Pectin, if you use no cook pectin or regular pectin you will still need to process the marmalade. Processing allows you to store the jam at room temperature in a dark place and it doesn\u2019t affect the pectin negatively. Once opened you will need to refrigerate the marmalade, refrigerated the marmalade will keep six months.Equipment: 10 (8-ounce) canning jars with rings and lids, funnel, tongs, ladle, and 12-quart pot and a cooking thermometer.. Wash the oranges, grapefruit, and lemon thoroughly.Cut the oranges and grapefruit into thin slices, removing the seeds as you go. Stack the orange and grapefruit slices, and then cut them into quarters or more if that is your preference. Place the oranges and grapefruit into an 8-quart stainless steel pot. Add the lemon zest and juice and the water to the pot, set over high heat and bring to a boil, approximately 10 minutes. Once boiling, reduce the heat to maintain a rapid simmer and cook, stirring frequently, for 40 minutes or until the fruit is very soft.While the fruit is cooking, fill a large pot (at least 12-quart) 3/4 full with water, set over high heat and bring to a boil. Place 10 (8-ounce) jars and rings, canning funnel, ladle, and tongs into the boiling water and make sure the water covers the jars by at least an inch. Boil for 10 minutes. Turn off the heat, add the lids and leave everything in the pot until the marmalade is ready.. Increase the heat under the orange mixture to return to full boil. Add the sugar or sweetener and pectin, and stir the mixture continually, until it reaches about 222\u2070 F or 105\u2070 C on a deep fry or candy thermometer. Cook until marmalade darkens in color, approximately 15 to 20 minutes, the marmalade may not darken if you make no sugar added marmalade. You may need to adjust the heat in order to prevent boil over. 
Test the readiness of the marmalade by placing a teaspoon of the mixture onto the chilled plate and allowing it to sit for 30 seconds. Tilt the plate. The mixture should be a soft gel that moves slightly. If mixture is thin and runs easily, it is not ready.. Place your jars and lids in a pot of water and bring the water to a boil. Remove jars from the water and drain on a clean towel. Place a canning funnel onto the top of 1 of the jars and ladle in the marmalade just to below the bottom of the threads of the jar. Repeat until all of the mixture has been used. The amount of marmalade may vary by 1 to 2 jars. Wipe the rims and threads of the jars with a moist paper towel and top each with a lid. Place a ring on each jar and tighten.Return the jars to the pot with boiling water, being certain that they don't touch the bottom of the pot or each other. (If you don't have a jar rack, try a round cake rack, or metal mesh basket. Even a folded kitchen towel on the pot bottom will do in a pinch.) Add additional water if necessary to cover the jars by at least an inch. Boil for 10 minutes. Using canning tongs, carefully remove the jars from the water, place in a cool dry place and allow to sit at room temperature for at least 24 hours before opening. 
Once open, store in the refrigerator.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [1, 2, 3, 0]\nD: [0, 2, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_54_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_54_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_54_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_54_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [2, 3, 1, 0]\nD: [1, 2, 3, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients:1 package of puff pastry sheets. These are located int he frozen dessert section of the grocery store.1 apple. I used granny smith, but I but these would be great with honey crisp too.1 TBSP caramel topping. Any brand will do. This Smuckers stuff was so good I wanted to drink it.1 Tsp honeyA sprinkling of Cinnamon and Sugar.Tools:Keep some flour handy in case you re-roll your dough, or find it gets too sticky.Knife, rolling pin, apple corer, baking sheets, spoon, microwaveable bowl. Your very first question is probably going to be \"WHERE DID YOU GET THAT AWESOME COOKIE CUTTER?\"I have a globe trotting friend who visited the Mutter Museum in Philadelphia, which is apparently a museum of medical oddities. If you're not conveniently located near the museum, you can get yours here: http://www.muttermuseumstore.com/merchant2/mercha... If you're feeling really ambitious, you could probably also splice together your own cookie cutter from existing gingerbread men, or freehand a design with a knife. I recommend the official conjoined twins for consistency.Allow the frozen puff pastry to thaw. 
Ideal shape cutting happens when the dough is soft enough that it no longer fights to be unwrapped, but not so warm that it is sticky. If you're using the Mutter twins cutter, you'll get about 4 sets of twins per pastry sheet.You can gather your excess dough and re-roll it to cut more shapes. Just remember to dust the dough with flour to keep it from sticking to your tools.*Puff pastry dough is VERY elastic. If you re-roll and cut more twins, expect those twins to contract and become smaller and thicker than your originals. This is a freak show, so just embrace the anomalies.. At this time, go ahead and pre-heat your oven to 400 degrees. It'll be ready to go by the time you're done prepping apples.Use your apple corer to slice the apple into sections. Now take those sections and use your knife to slice them thin. It doesn't have to be paper thin, but avoid large chunky pieces for a slick, finished look. Always slice away from your hand, unless you have vampire guests who like extra sauce.. Place your apple slices onto the siamese twin pastry shapes. 2 or 3 slices seems to work well, depending on how you want to arrange them. I have 2 slices going down the legs because it reminded me of little pants.Pop these fancy little freaks in the oven for 10-12 minutes. Take a look at them as you approach 10, to make sure the edges aren't getting too brown for you. I take mine out on the early side because our oven seems to run hot.. In a small microwaveable bowl, mix 1 hearty TBSP caramel topping with 1 tsp honey. Microwave no longer than 10-15 seconds on normal power. They will blend together smoothly when stirred. Try not to eat the whole cup before the tarts emerge from the oven.I've added the honey because I think it helps make the caramel easy to drizzle, and it gives an additional layer to the flavor. This recipe is also excellent with JUST honey, if you prefer.. Remove your twins from the oven when the edges are golden brown. 
You will notice that the pastry has puffed up and some areas may be raised above your apple slices. That is to be expected, and won't interfere with the readability of your shape.While the twins are still warm, use a spoon or other utensil to drizzle your caramel/ honey mix on top. No need to try doing fancy designs, because the sauce will warm and just do whatever it wants anyway. Sprinkle on sugar and cinnamon to taste.. Your Siamese Twin Tarts are ready to be unleashed upon your unsuspecting guests! Keep napkins nearby, because that delicious caramel sauce tends to remain pretty gooey. Your finished tarts should look something like this. The little guy on the left is an example of a \"re-roll\", where the dough contracted and made him smaller.. This recipe can work with a variety of creepy shapes. For example, my other favorite cookie cutter, the Fetus. I made Fetus tarts last x-mas (because.... baby Jesus?) and it was definitely something my guests had never seen before. This guy was also a gift from a friend and is available at http://hogmalion.com for those who dare.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [2, 3, 1, 0]\nD: [1, 2, 3, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_55_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_55_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_55_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_55_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [0, 1, 2, 3]\nD: [2, 0, 1, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Take the fish and clean and wash,remove the innards, if you do not have cleaned the fish in the store.. 
Take a pot of water, bring the water to a boil and put in it tea bags. On low heat let them simmer for 5 minutes. Cool water to room temperature, put it in salt and sugar. In my case, I cut the fish head,no one will eat them.:))Put them to glass container. add pepper and bay leavespour the brine to cover the fish. put the container in the refrigerator.Two days later, took the container and turned the fish in the container. Put the container in the refrigerator for two more days. Take the fish out of the brine and hang it for 1-2 hours. Allow to drain the brine.\u0411rush the fish with olive oil.You can cut into meal or can food plastic wrap and put in the refrigerator.. \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [0, 1, 2, 3]\nD: [2, 0, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_56_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_56_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_56_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_56_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [2, 3, 1, 0]\nD: [2, 0, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For this recipe you will need:6 to 6 1/2 cups of all-purpose flour 3 tablespoons sugar 2 envelopes of quick-acting yeast (or 4 1/2 teaspoons) 2 teaspoons salt 1/4 teaspoon baking soda 1 cup buttermilk NOTE: You can substitute 4 tablespoons of dry buttermilk powder instead and add it to the dry ingredients. Increase water to 2 cups 1 cup water 1/3 cup butter, cut into pieces. 
Add 3 cups of the flour, the sugar, yeast, salt and baking soda (and buttermilk powder, if using) to the bowl of a KitchenAid mixer with a dough hook.Using your KitchenAid mixer with the dough hook, mix the dry ingredients on speed 2 (low speed on a hand mixer) for one minute to combine. NOTE: If you do not have a mixer, just use a large bowl and a wooden spoon, or a hand mixer.. Heat buttermilk, water and butter until very warm (120 -130 degrees F); butter does not need to melt. To gauge the temperature, use a food-grade thermometer. If you do not have a thermometer, make sure the water is warm to the touch, not hot. If your water is too hot, the yeast will not activate.Add to flour mixture and mix on speed 2 (low speed on a hand mixer) for one minute. While KitchenAid mixer is still on speed 2, add enough of the remaining flour, 1/2 cup at a time, to make a soft dough. NOTE: I usually use all 6 1/2 cups of flour, but weather conditions can cause this to change. Only use what you need in order to allow the dough to \"clean\" the sides of the bowl as in picture 2.. Continue with the KitchenAid mixer on speed 2 for two minutes in order to knead the dough. NOTE: if you do not have a mixer, instead knead with your hands for 6 - 8 minutes until dough is smooth and elastic. If you are unfamiliar with how to knead dough, check out this excellent tutorial from Allrecipes.com: Kneading TutorialKneading the dough is important in order to make sure the gluten in the dough is well developed. Do not skip this step!. Remove dough from bowl and shape it into a ball on a floured surface. Let it rest on the floured surface, covered with a dry towel, for 10 minutes. You will notice the dough has risen slightly and has a smoother appearance.. On a lightly floured surface, cut the dough in half. Roll out one half of the dough into a 7\" x 12\" rectangle If necessary, use your hands to gently stretch and shape the dough to create as even a rectangle as possible.. 
Starting with the short end, roll the dough up tight into a log. Pinch the seams together. Seal the ends by flattening with the side of your hand to create a flap. Fold the flap underneath the log. Place the log, seam-side down, in a greased 8 1/2\" x 4 1/2\" loaf pan. Repeat with the remaining dough.. Place the pans in a warm, draft-free location and cover with a dry cloth until double in size, about 30 - 45 minutes.. Bake at 375 degrees F for 30 - 35 minutes or until done. Remove from pans; if desired, brush on melted butter with a pastry brushLet cool on wire racks.Slice and Enjoy!!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [2, 3, 1, 0]\nD: [2, 0, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_57_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_57_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_57_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_57_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [0, 2, 3, 1]\nD: [3, 1, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Once your wooden applique has been cleaned, let it dry and place it on a non stick surface. We used a piece of glass. Melt the ComposiMold-FC and pour a thin layer over the wooden piece. Be sure it covers the entire surface, including the edges. Once this layer has cooled for approximately 20 minutes, pour another layer of ComposiMold-FC over the first. The second layer will add strength to the mold. The first layer picked up all the details. Use a toothpick to move any bubbles up and away from the surface of the wooden applique. 
You do not have to pop them or completely remove them, just be sure they are not touching the surface.. Once the second layer of ComposiMold-FC has cooled (another 30 mins or so) you can flip the entire mold over and bend it away from the wooden applique. . Roll your favorite fondant out to 1/8\" and lay this sheet into the mold. Press it into all the details of the mold being sure to get it into all the edges too. . Use your thumb to press and rub the excess edges of fondant away from the final piece. This will tear the extra away. Then you can gently roll the torn edge in towards the final piece to create a nice sharp edge once it's flipped back over. . Carefully flip the mold over again with the fondant in place. Then you can gently lift an edge of the mold and begin to bend it away from the fondant. You might have to hold an edge of the fondant down to get started, Then the mold will easily peel away from the fondant. Check out how the mold picked up every detail from the wooden applique. . Using edible spray paint and sugar dust we were able to paint the eagle to look like more realistic. To transfer the fondant to the side of a cake, simply roll the eagle onto a rolling pin and un-roll it onto the side of the cake. Remember to re-melt to re-use the ComposiMold-FC for all your cake decorating projects. 
\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [0, 2, 3, 1]\nD: [3, 1, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_58_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_58_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_58_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_58_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [3, 0, 1, 2]\nD: [0, 2, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Select tender ridge gourds which are not affected by any pests.. The skin of matured ridge gourds may be fibrous and will not taste good. Also make sure that the gourds are not sprayed with pesticides.Wash and cut the ridge gourds to manageable size.Peel the skin using a vegetable peeler and collect them in a platter.. Ingredients requiredHandful of shallots. You can use one or two big onions also.Three to four Garlic clovesFour Green chilies. If you want the chutney to be spicy, you can add more.About half a cup of raw coconutSmall piece of tamarindTwo teaspoons of split black lentilsSalt to tasteAbout a tablespoon of cooking oilPeel skins of shallots and garlic. Cut all ingredients into small pieces, so that all of them can be cooked evenly.. Heat a frying pan over medium fire and add a tablespoon of cooking oilAdd the split black lentils to the heated oilAdd the piece of tamarind and saute for a minuteAdd sliced shallots and garlic.When the shallots and garlic pieces turn to a light brown color, add sliced coconut piecesSaute for one more minute. 
Add the ridge gourd skin to the sauteed ingredients in the panStir fry till the raw smell from the ridge gourd skin disappearsAdd salt, mix well and remove from heat.Allow the ingredients to cool to room temperature.. Once the ridge gourd skin mix cools down, transfer the ingredients to a mixer / grinder jarGrind the ingredients to make a rough paste.From time to time turn over and mix the ingredients in the jar so that all ingredients are ground evenly.Add little amount of water if required.Once the ingredients are ground to required consistency, transfer to a bowl.Two or three teaspoons of this chutney is served as a side-dish along with rice. You can also add little more water and a little bit of salt to taste and make it thin so that this can be served with Idly and Dosa.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [3, 0, 1, 2]\nD: [0, 2, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_59_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_59_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_59_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_59_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [2, 0, 3, 1]\nD: [2, 0, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. To make your campfire peach pies you will need:Ingredients:-Brown sugar-Butter-Pie crusts (I used the Pillsbury brand ones)-PeachOther stuff:-Aluminum foil-Bowl-Rolling pin-Roasting stick-Firepit-Firewood. To peal a peach you need to fist boil some water and then put it into a bowl/container and submerge your peach in that water for about 1-2 minutes. 
Next submerge the peach into cold water for 1-2 minutes. Then take your peach out and it should be easy to slip the peel of the peach with your hands. . The next step is giving your peach a tasty coating. For this you will need to make a mixture that is 2 parts brown sugar to 1 part butter. Slather this all over your peach. . Lay out your pie crust across a plate, and place your peach in the center. Wrap and fold your crust up and around your peach so that it will have some excess on the top. Remove the extra and roll that out again. Wrap your peach until you've used all the dough from one circular pie crust.. Wrap your pie up in some tin foil with the shiny side facing in. I found that it was best to do 3 layers so that the tin foil wouldn't burn off or rip if you moved it around (this happened the first time I tied this when I only USD one layer). Wait until your fire has gone to coals before you put in your pie. I found that the best way to get your pie into you fire was by puncturing a hole in the tin foil at the top with a roasting stick and then sticking a prong of the roasting stick into that hole and guiding it into the firepit. I would stick it in again to move the pie around as needed if there was a better area for cooking in the coals, or if it was getting too hot. . Remove your pie form the fire pit using the roasting stick and place it on some pieces of wood to cool (5 minutes). One it is cooled remove the tin foil and you should have a lovely pie. You can eat it by just biting into it or cutting it up or serving it with ice cream. 
Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [2, 0, 3, 1]\nD: [2, 0, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_60_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_60_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_60_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_60_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [1, 2, 0, 3]\nD: [0, 3, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. One bread knife, one or more round rolls (buns), a number of eggs equivalent to the number of rolls.. Cut off the top of the roll.. Take out the inside of the roll. Do not take too much as the egg will leak. There has to be enough room to accommodate one egg and some spices and top ups.. Load the egg into the roll and top it up with your favorite spices. I have chosen a bit of salt, loads of white pepper and loads of red paprika powder. U can use chilli peppers, I would go for Scotch Bonnet if I had some, (u have to remember that they r very hot!!!) cut into small pieces, without any problem making the dish very Mexican ;). I added a bit of low temperature melting fat chase as well. You egg roll is ready for the oven.. Load the roll into the oven for about 20-25 min @180 centigrades thats 356 Fahrenheit. The time will vary depending on the oven type, forced air circulation etc. so you will have to experiment with your oven go get the desired effect. I usually try to have the egg roll very soft inside with the liquid yoke and the white barely done. 
You simply have to observe the time required for the desired effect in your oven so next time you will be able to set the timer up.. You egg roll is nice and ready. Because no fat is used for the egg processing the white is going to be very delicate and hot, so you will have to wait a bit longer than with the regular fried egg as the bun works like a thermo flask keeping the egg nice and warm for longer. Load your favorite ketchups and garlic mayonnaises or mustard and enjoy your extremely easy but tasty and good looking appetizer.That's it hope you will like it.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [1, 2, 0, 3]\nD: [0, 3, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_61_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_61_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_61_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_61_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [2, 0, 3, 1]\nD: [0, 3, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Start by gathering the ingredients together.Ingredients1. 2 large bananas mashed 2. 4 eggs 3. 1 cup vegetable oil 4. 2/3 cup water 5. 3 1/2 cups all-purpose flour 6. 2 teaspoons baking soda 7. 1 1/2 teaspoons salt 8. 2 teaspoons vanilla 9. Olive oil10. 3 cups Splenda or some other artificial sweetener, if you are not concerned about sugar content you can use white sugar.SuppliesLarge BowlPotato MasherMeasuring Cups & SpoonsLadleWhiskBrush for greasing pans.Disposable pans with lids.. 
The prep time to make the banana bread is 15 minutes so I put the oven on at 350\u2070 F or 175\u2070 C to preheat the oven.In a large bowl I peel and mash the bananas; mix in the eggs, oil, water, sugar, and vanilla, then whisk until well blended.When the wet ingredients are well blended whisk in the flour, baking soda, and salt.. These aluminum pans are ideal for gifting; prepare the pans by coating the pans with olive oil using a brush, I find the bread falls out of the pans with ease when you coat them this way.Pour the batter into the prepared pans; I like to use a ladle and count the ladles of batter in each of the pans. This batch should fill 4 pans to about half full, then check the oven to be sure it is up to heat.Place the pans on the middle rack of the oven and bake for about 50 minutes to one hour in the preheated oven. The loaves are done when you can insert a toothpick in the center of the loaf and it comes out clean and dry.. Once baked let the loves cool on a rack, when the loves are cool you can put the clear plastic covers on the pans or you can glaze the loves and serve.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [2, 0, 3, 1]\nD: [0, 3, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_62_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_62_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_62_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_62_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 0, 1, 2]\nD: [2, 1, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Ingredients:500ml of half and half cream250ml of heavy creamSugarVanillaSaltNutellaPeanut butterChocolate chipsOther things you will need:Ice cream makerMeasuring cupsMeasuring spoonsBig bowlOther container to store ice cream in (not in picture). First we need to make the mixture that will later turn into ice cream!Here's the recipe:2 cups (or just a 500ml carton) half and half cream1 cup (or just a 250ml carton) heavy cream1/3 cup white sugar1 teaspoon vanillaTiny pinch of saltOnce it's all added, mix until everything is dissolved.In case you're wondering, the peanut butter, nutella and chocolate chips will be added later.. Once your mixture is ready, pour it into your ice cream maker. Make sure that your frozen canister is completely frozen. I even froze mine a couple days before, just to be sure. Now turn on the machine and set a timer for 10 minutes. If your mixture doesn't look like the last two pictures after 10 minutes, then do it for another 10 but watch it so that it doesn't freeze too much.. After the ice cream mixture is the right consistency, it's time to add the yummy stuff! You will need to add:1/3 cup of peanut butter1/3 cup of nutella (yes, I know it's not a full 1/3 cup, I didn't realize that we had so little of it)1/3 cup of chocolate chipsAdd all of that into the mixture and stir with a long spoon to mix it in thoroughly, then turn the machine back on and set the timer for another 10 minutes.. Once your 10 minutes are up, turn the machine off and scrape the ice cream off the mixer part . Then carefully transfer the ice cream into a separate container and place it in your freezer for a couple hours, just to really harden it, unless you want it soft.. Once it has been in the freezer for a while, scoop it out into bowls and serve! I really hope you enjoyed my instructable. If you did, please vote for me in the \"Frozen Treats\" contest, also favorite and comment, I'd really like to hear your feedback! 
Thanks!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 0, 1, 2]\nD: [2, 1, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_63_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_63_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_63_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_63_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [1, 0, 3, 2]\nD: [2, 0, 1, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1000grams apricots 500grams gelling sugar 2:1 1 tablespoon lemon juice 1 vanilla pod 2 tablespoons Marillenschnaps (apricot liquor)Tools:kitchen scale kitchen knife mason jars canning funnelhand held blender. First you have to wash and towel the fruits. Cut them in half, remove the kernel and then mince the rest of the fruit pulp. Put them in a big cooking pot.. Cut the vanilla pod with a longitudal cut, scrape out the pulp with the knife and add the pulp to the apricots.. Apricots are rich in sweetness but low in pectin. Therefor we need a gelling sugar. There are many types of gelling sugars, I picked the 2:1 version for a better taste and less sweetness. The 2:1 simply means you add twice as many fruits in weight than you have sugar. Stir everything and let it simmer. I added the rest of the vanilla pod to the mix for more flavour, it will be removed before the blending.. Time to pick a huge bowl, fill it with boiling water and lay the canning jars inside for desinfection.. Time to take out the vanilla pod and the hand held blender. . When everything is blended smoothly, increase the heat and boil for 7 minutes while stirring the mixture.. 
Meanwhile spread a kitchen towel on a table, take the canning jars out of the water and let them dry with the mouth down on the towel.. Now its time for the Marillenschnaps or whatever the liquor you picked. Add 2 tablespoons to the mix and stir.. Time to fill the prepared canning jars. Use a funnel to prevent spills. Then close the jars and flip them over. Let them rest upside down and cover them with a kitchen towel to keep in the heat.Have fun trying this one out!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [1, 0, 3, 2]\nD: [2, 0, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_64_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_64_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_64_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_64_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 1, 3, 2]\nD: [1, 0, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. I use chickens eggs. You can buy them from the store if you like, although I get mine for free, well sort of. For those who believe eggs come from a supermarket please see the photos above. They actually come from Chickens originally. I like to know where my food comes from and I also can directly control the quality and treatment some of it receives. If I do ever have to buy them I try and ensure they are free range. I keep a few chickens as pets. In return for my feeding them, cleaning their coop and generally keeping them happy they provide me with eggs. I generally end up with quite a glut of them as, during the summer with all three laying, I just can't use them quickly enough.. 
Okay, so we're going to need a few things. What you use can vary according to your taste and what you have available. I've used Beetroot to give the pickles a pink colour and also provide a crunchy pickle themselves. Distilled Vinegar, enough to fill your chosen container (this is the clear vinegar) Eggs, I used 13 for this recipe 2 medium sized raw beetroots 2-3 carrots, sliced about 5mm thick on the diagonal 1-2 reg peppers, sliced Pickling spices (optional) A little sugar if wanted. I put the beetroots, still with their skins on, into a pan of cold water and bring to the boil. These need to cook for 30 minutes After 20 minutes add the sliced vegetables to the beetroots and cook for the remaining 10 minutes. Then drain them a and allow to cool Once cooled you should be able to peel the skin from the beetroot quite easily by rubbing a knife across it. (When handling the beetroot it's advisable to wear gloves or your hands will be stained pink too) Chop the beetroot into 1cm dice, or small chunks Add this in with the other vegetables and stir so they're all mixed together . \n I like to add some spicy flavour to the vinegar, but you can leave this out and just use it as is. Some people alo advocate adding up to 50% water to the vinegar to stop the eggs being too rubbery and the flavour being too strong. I like strong flavours, but feel free to experiment and dilute to your taste. Before you start it may be advisable to open a window, the toasting spices can tickle your throat and make you cough. The heated vinegar is also quite a pungent smell and can be a little unpleasant. Put a pan on the heat and add some spices to help release their flavours and natural oils, I used the following, but you might want to create your own spice mix: 1tsp Mustard Seeds 1tsp Coriander Seeds 1tsp Crushed Dried Chilli 1tsp Black Peppercorns 2-3 bay leaves 1tbsp Sugar Once the spices have started releasing their aroma add your vinegar. 
Bring this close to the boil, add the sugar and stir until it dissolves Remove from the heat and allow the flavours to infuse as the vinegar cools If you have made more than you need simply store in a bottle and use for the next batch. . Put your eggs into a large pan and add plenty of cold water. Fresh eggs will lie on the bottom, slightly older ones will turn upright, bad ones will float to the surface. You should dispose of any floaters, the ones that turn up are at your discretion, though I don't tend to have any. Now put the pan of good eggs on the heat and bring to the boil Once on a rolling boil you need to cook them for 7 minutes, use a timer When 7 minutes are up I then drain the eggs and fill their pan with cold water. This stops them cooking any longer and prevents that dreaded 'black ring' around the yolk that overcooking them causes. Now you need to peel them. I do this in the pan as the water helps to wash the loose shell away. You'll notice some are easier to peel than others. I think the older eggs are easier to peel. A couple of my eggs look half chewed where the shell was difficult to remove Remove the eggs from the pan and allow to dry whilst you prepare the other components You can see why the floating test works for the eggs once you've peeled them. As eggs get older the airspace at the rounded end gets larger. This is so that a developing chick has space to move into before escaping from the shell. Once the shell is removed from the boiled eggs you can see how large the airspace is, most of mine barely have any as they're only about a week old. I keep the egg shells and add it to my chickens feed, it helps them to digest their food and it also gets re-absorbed and helps them to create strong shells on the eggs.\n . \n It is very important that the container you are going to use for preserve is sterile. Any bacteria will ruin your food and may adversely affect your health. 
There are a number of ways to sterilise jars, I won't go into too much detail as it's fairly straightforward, but I tend to do one of the following: Wash the jar then dry in the oven. I start with the oven cold so as not to shatter the glass. Put a number of open jars into the dishwasher on a high heat setting. I tend to do this when I'm making large batches of preserves Once sterilised your jar is ready to use. Be careful and use gloves when handling hot jars.\n . \n Now, you want to add the ingredients into the jar in layers so it all mixes evenly. Start with some of the vegetables Add a layer of Eggs Now add more vegetables Add more eggs Keep doing so until you've filled up the the neck of the jar I then use a jug to pour in the spiced vinegar. Ensure that everything is covered by the vinegar before sealing\n . The eggs are best left to soak in the vinegar for at least a month. They can stay in the vinegar for a very long time and will just get better. As I say, I like my eggs in a bag of crisps. You can eat the vegetables too and all of these go nicely as part of a salad or ploughmans lunch. 
The eggs will have taken on a lovely pink colour, and all the flavours form the vinegar.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 1, 3, 2]\nD: [1, 0, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_65_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_65_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_65_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_65_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 3, 0]\nD: [3, 0, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ice Mio\u00a0Caffeinated water enhancer, black cherry flavor True Lime crystalized lime Splenda Lime Maraschino cherries Club sodahttp://truelemonstore.com/products/true_lime&trk_src_ss=TLMPAYPCWEBMACSS\u00a0http://www.makeitmio.com/mio-original.aspx\u00a0. Fill glass with ice. Add 1 packet Splenda and one packet True Lime or the juice of 1/2 lime.. One squirt of Mio Caffeinated cherry flavor. Of course, you can use the non-caffeinated version\u00a0instead.\u00a0. Fill glass with club soda.. Add a Maraschino cherry and a slice of lime. 
Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 3, 0]\nD: [3, 0, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_66_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_66_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_66_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_66_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [2, 3, 1, 0]\nD: [1, 3, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. First off, start by twisting all the Golden Oreos apart.Then break the chocolate into a microwavable bowl. Reduce your microwave\u2019s power to half then heat the chocolate in 20 second intervals. Stir well between each time and repeat until your chocolate is melted and smooth. Now dunk a lollipop stick into the melted white chocolate & press it into the Oreo cream filling. Place the other Oreo half on top (with the white chocolate acting as a glue) then place on a baking tray/chopping board.. Repeat the process until all of the Oreos have been \u2018glued\u2019 back together with their new lollipop sticks.Pop the tray/board into the fridge for 15-20 minutes to allow the chocolate to set completely before we move onto the next step.. When the white chocolate \u2018glue\u2019 has set, you are ready to decorate! Get a large baking tray lined with baking paper ready & set to one side.If you need to, pop the white chocolate back into the microwave and heat on half power in 20 second intervals until smooth & melted again.Now all you need to do is dunk each Oreo Pop into the white chocolate and shake of any excess. 
Then dunk a side into the sprinkles and place onto your prepared baking tray to set.. Repeat until all the Oreo Pops have been covered in chocolate/sprinkles.Allow them to set at room temperature. Once set, peel off the baking paper and store at room temperature in an airtight container. Consume within 2 weeks.. These would look gorgeous wrapped in a cellophane bag & tied with a bow! It\u2019s also a lovely craft to get the kids involved with. You could even get a polystyrene block/flower oasis, place it in the bottom of a small box or vase & turn them into a bouquet of Valentines Day Oreo Pops flowers with some green crepe paper. I\u2019m certain that would keep Mum very happy!For more awesome step by step recipes check out www.kitchenmason.com\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [2, 3, 1, 0]\nD: [1, 3, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_67_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_67_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_67_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_67_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 1, 0, 3]\nD: [2, 1, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Syrup: 1 cup water. 2 cups sugar (we use brown sugar) Optional: Vanilla and maple extract. Optional some people add a bit of buttermilk. Sometimes we substitute apple sauce or ground fruit and berries for the syrup. Pancake batter: 1 cup buttermilk (substitution: one cup milk with 1 tablespoon juice or vinegar made the night before and refrigerated) 1/3 to 1/2 cup cooking oil or melted butter. 1 room temperature egg. 1 teaspoon salt. 
1 cup all-purpose flour (up to 1/2 cup of whole wheat flour can be substituted). 1 teaspoon baking powder 1 teaspoon or less of baking soda.. With a sauce pan bring the water to a boil. Add the sugar and karo syrup if any, Boil for about one minute. Turn off the heat. When the mixture is still slightly warm, add the vanilla and maple extract. Put in a sealed container and refrigerate for later usage.. Egg needs to be at room temperature. So let it sit a bit while making the syrup. In a bowl combine the flour, baking powder, salt, and baking soda. Stir well. In a separate container, crack the egg into it and whipped lightly till the yolk and the egg are combined. Into the bowl of flout add the cooking oil, buttermilk, and egg mixture Stir about 10 times. DO NOT OVER STIR.\u00a0 Lumps are okay. Let sit while you go on with the pan. Heat up a frying pan at just more than medium heat. Add oil or butter to coat the bottom. Pan needs to be warmed up (i.e. hot as cold skillet = bad hotcakes.) Ladle in the pancake mix one large spoon full at a time. Repeat till the pan is full say 3 or 4 depending on the size of the skillet. (each spoonful must not be touching each other). Let the cakes puff up. You can use a spatula to make sure they do not burn underneath. When you see a healthy amount of bubbles the cakes are ready to turn or flip. Let cook till they are like a piece of bread. (no liquid). Set aside and repeat the process till all the batter used. You can use a kitchen towel to keep the pancakes warm if they are not served immediately. (They are gone at our house as soon as they come out of the pan).. Note: pancakes do not have to be perfect in shape as they still taste just as good.. Note: you could also cook bacon and or eggs to go along with this dish. Then you need less pancakes per plate. Put two to three pancakes per plate. Add syrup and or butter. (we sometimes substitute apple sauce for the syrup). Serve.. 
How To Make Brown Sugar 1 Cup White Sugar 1 Tablespoon MolassesPour the sugar in a food processor and then drizzle the molasses over the top. Process until the two are thoroughly mixed (about 2 minutes) stopping to scrape the sides occasionally as needed.---Make Your Own Baking Powder 1-1/2 teaspoons baking soda 3 teaspoons cream of tartar (egg shells) 2 teaspoons cornstarchCombine the baking soda, cream of tartar, and cornstarch and use in the recipe as you would the baking powder.And here\u2019s one last thing to remember. Moisture will make the baking powder loose its potency so never dip a damp measuring spoon into your container of baking powder.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 1, 0, 3]\nD: [2, 1, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_68_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_68_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_68_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_68_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [3, 1, 0, 2]\nD: [0, 3, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \nFor this you will need\n5 cups of water\n1/2 c. of dried hibiscus flowers, look in mexican food stores if you can't find them at your local store.\n1/2 c. Agave Nectar or similar sweetener\n1/2 Lemon. Take half of the amount of water you will be using put it in a tea kettle to boil. This should take about 5 minutes.. While waiting for your water to boil, take the other half of the water and put it in the pitcher that you want to be using for the tea. Add sweetener and lemon and stir around.. 
Once the water comes to boil you can put the flowers in a glass bowl and pour the hot water over them. Let them steep for a good 10 minutes until the water gets a dark red.. Next add ice to the sugar water to get it nice and cold.. Mix the tea you made with the flowers in to the sweet water, combining them. If the mixture is not cold enough add more ice.. At this point the drink is done. Feel free to put it in the fridge to cool some more or to save for later.\nThe hibiscus tea is very refreshing especially for those hot days and now you can make your own!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [3, 1, 0, 2]\nD: [0, 3, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_69_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_69_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_69_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_69_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 2, 0, 3]\nD: [2, 0, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Ingredients (for 4 servings):30 quail eggs 2 scallions (20g) - 0.7 oz 1 celery stalk (30g) - 1 oz 3 pickled cornichon cucumbers (40g) - 1.4 oz 1 Tbsp chopped parsley 1 tsp Dijon mustard 1 tsp lemon zest 1 Tbsp lemon juice 1/8 \u2013 \u00bc tsp chili powder 3 \u2013 4 Tbsp mayonnaise (preferably homemade) salt and pepper to tastefor the homemade mayonnaise:1 large chicken egg - at room temperature about 1 cup of any mild - flavored vegetable oil (I used sunflower oil)Tools:a medium saucepan a colander a large bowl (filled with water) 10 ice cubes a cutting board a sharp knife for the homemade mayonnaise: a bowl (I usually use a soup plate) and a whisk (or a wooden spoon)Abbreviations:tsp = teaspoon Tbsp = tablespoon. I make it the same way my grandmother used to (but it can be made using a blender, too).For the mayonnaise you need a fresh yolk. The yolk will be used raw, so it is very important for the egg to be as fresh as possible.Clean the egg (with water and soap) and pat it dry. Separate the yolk from the white and place the yolk in a bowl. Save the egg white for another recipe (pavlova, meringue or egg white omelet).Start whisking the yolk for about 1 minute. It doesn't matter if you whisk it clockwise or counter-clockwise as long as you don't shift direction. Use the direction that suits you best.Now you can start adding the oil. At the beginning add just a few drops of oil, whisking vigorously. Continue adding the oil, few drops at a time (whisking continuously, of course), until the emulsion seems to thicken. Now you can increase the oil volume, to about 1 tsp at a time. Whisk continuously until all the remaining oil is incorporated.Note: For this quail egg recipe you will need only about 3 - 4 Tbsp of mayonnaise. Store the remaining mayonnaise in an airtight container, refrigerate it and use it in other recipes (salads, dressings, sandwiches, fish cakes).. Carefully wash the quail eggs, place them in a pot and cover with cold water. 
Place the pot on the stove and wait until the water starts to boil. Reduce heat to minimum and let eggs boil for 3 minutes. Meanwhile prepare the ice bath. Fill 2/3 of a large bowl with cold water and add in about 10 ice cubes.Drain eggs using a colander, let them cool in cold water for 5 minutes and carefully peel them.. Finely chop the scallions, celery stalk, pickled cucumbers and parsley.Pat dry the eggs, roughly chop them and place them in a bowl. Add the scallions, celery stalk, cucumbers, parsley, mustard, lemon juice and 3 Tbsp of mayonnaise. Stir to combine. If the salad doesn't look creamy enough, add the remaining tablespoon of mayonnaise.. Season with lemon zest,1/8 tsp of chili powder, salt and pepper. Stir to combine and taste it. If the salad isn't spicy enough, fell free to add the remaining 1/8 tsp of chili powder. . Refrigerate for at least 15 minutes before serving. I noticed that the longer you refrigerate the salad, the better it will taste. Serve it on a slice of whole-wheat toast, garnished with arugula and radishes.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 2, 0, 3]\nD: [2, 0, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_70_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_70_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_70_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_70_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [0, 3, 2, 1]\nD: [2, 0, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Here's what you'll need: -pasta (I typically use dried, although fresh is even better!) 
-grated parmesan or other hard cheese -butter -egg -pancetta (or bacon, or\u00a0prosciutto, or guanciale, whatever you can get your hands on really. \u00a0Alternatively, you could leave this out entirely for a vegetarian version.) -black pepper -salt -cream (optional) I don't usually measure my ingredients, which is why I haven't given any quantities. \u00a0As a rough guide, I usually use about 100g of pasta, one egg, a bit less than a tablespoon of butter, and one thick slice of bacon if I'm making this for myself.. Dice up your bacon or pancetta, and toss it in a frying pan over medium-high heat. As soon as that's going, start boiling your pasta in water with a bit of salt added. \u00a0(If you're using fresh pasta, you may want to wait until you've done the next step before starting it, as it will cook much more quickly.). While those are cooking, put a knob of butter and a bit of your grated cheese in a large bowl - you want something big enough to mix your pasta in. When the bacon is finished cooking, put it in a small bowl (or teacup, as the case may be), and set it aside for now. (Warning: There might be a bit of thumb-twiddling at this stage. \u00a0This is a really easy recipe. \u00a0I had time to wash a sink full of dishes while waiting for the pasta. \u00a0Just try not to eat all the bacon cubes just yet.). When your pasta has finished cooking, drain it, and immediately put it into the bowl with your butter and cheese. \u00a0It's important that it's still hot at this point. As soon as the pasta is in the bowl, crack your egg into it. \u00a0That's right, directly into the pasta. \u00a0Then mix it! \u00a0You want to get the egg distributed as evenly as possible, since if you let it sit it will curdle. \u00a0Keep stirring until the egg, butter, and cheese are mixed, and the sauce starts to thicken. The heat of the pasta cooks the raw egg, but does so slowly enough that you don't need to worry about it scrambling and making your sauce all lumpy. 
\u00a0The mixing distributes the egg so it cooks completely, and forms an emulsion with the butter and cheese that makes it nice and creamy. This sauce won't look quite as thick as the stuff you buy in the store - they bulk it out with cornflour - but trust me, it's tastier. \u00a0Adding cream in the next step will get you a bit closer to that store-bought taste.. Now, you can mix in your bacon, cream (if you're using it), and pepper. \u00a0Maybe sprinkle a bit of extra cheese on top.. That's it! \u00a0You're done! \u00a0Go enjoy your delicious pasta. Once you're happy with this, you can try all sorts of interesting variants. \u00a0Maybe try adding vegetables to the basic sauce, or different kinds of spices. \u00a0I had a particularly tasty batch last week while cleaning out my fridge that used smoky bacon in place of pancetta, and some stilton cheese instead of cream. \u00a0Be creative!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [0, 3, 2, 1]\nD: [2, 0, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_71_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_71_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_71_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_71_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 2, 0, 1]\nD: [3, 0, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
BACON - 12 strips should be enough to flavor your vodka, but if you are like me...you may need more as not all of the bacon will survive to make it into the projectVODKA - 1 liter should suffice - and with all cooking...the quality of the ingredients will effect the final product.\u00a0 I'm not saying you need to go crazy at the liquor store, but I wouldn't recommend anything that comes in a plastic bottleCONTAINER WITH LID\nCOFFEE FILTERS (OR SIMILAR)\nFREEZER. Everyone has their own preference when it comes to the crispiness of their bacon.\nFor the infusion I used bacon that was cooked to where it was still flexible and wouldn't crumble.\nJust place all of the bacon in the container with the vodka, close lid and let sit for 4-5 days at room temperature (infusions work better at room temperature and the alcohol will keep you safe). After the vodka has had time to soak up all of the bacon flavor, its time to remove the bacon.\nI used a mesh colander to catch the bacon and the larger pieces of grease, but you could just remove the bacon by hand.\nNext put the vodka in the freezer.\u00a0 This will cause the oils to solidify which will make them easier to remove.. \nAfter the oils have solidified, you'll want to strain the vodka through a coffee filter to remove the oils.\u00a0 The oils will clog the filter, so I'd do small amounts at a time and change the filter when the vodka stops dripping through.\nTake the filtered vodka and put it back in the freezer and repeat this step.\u00a0 The more times you repeat this step, the less oily the final product will be.\u00a0 I filtered three times.. 
Once you satisfied with the filtered vodka, its time to enjoy the fruits (or meats) of your labor.\u00a0 Here are some recommended drinks to use your bacon vodka with:\nChocolate Bacon Martini:\n2 oz of Bacon Vodka\n3/4 oz of Godiva Chocolate\u00a0 Liquor\nSplash of Half & Half\nShake all with ice and strain into a martini glass\nBacon Bloody:\n2 oz of Bacon Vodka and your favorite Bloody Mary mix.\u00a0 If you normally like your Bloodies spicy...I'd lay off the spice this time because the bacon is the star of this show\nAnd then there is just the bacon vodka chilled by itself...but the possibilities are endless\nEnjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 2, 0, 1]\nD: [3, 0, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_72_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_72_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_72_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_72_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [0, 2, 1, 3]\nD: [1, 0, 3, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients: (makes 1 sandwich) 3+ slices sandwich meat (I use pastrami) 4 eggs, boiled Handful of lettuce 1 slice monterey jack cheese Mayonnaise 2 slices whole wheat bread. Boil the eggs. Place eggs in a saucepan with water covering them, and place over medium-high heat. Once it reaches a boil, turn heat off. Drain out the hot water, and dump eggs into a bowl of ice water. Once cooled, immediately peel the shells off, and rinse the eggs. Transfer shelled eggs to a cutting board and slice into medium slices.. 
Prep the rest of your ingredients: sandwich meat, lettuce, cheese, mayonnaise, and bread.. For a nice crunch, you can toast your sandwich bread. Place cheese on the top slice.. Build your sandwich: Spread a generous dab of mayonnaise over the bottom slice. Layer with lettuce, then arrange the boiled egg slices neatly on top. Then top the eggs with the meat, then place the slice with the cheese over the top of the sandwich.. Consume\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [0, 2, 1, 3]\nD: [1, 0, 3, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_73_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_73_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_73_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_73_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [3, 0, 2, 1]\nD: [0, 1, 3, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. What you will need for this are all actually pretty common items.2 or 5 lb CO2 tank (any size will do, really, but a larger tank means fewer trips to the machine shop or welding supply store)Dual-gauge regulatorGas line with ball lock quick disconnect20z ball lock converter capRepurposed 20oz or 2L bottlesThere is some debate over whether CO2 tanks from a machine shop or welding supply shops. The truth is, these tanks and CO2 is no different than what you will find underneath the counter in bars and restaurants.Also, I purchase the CO2 tank online, thinking I might come out a little better if I could just have a 5lb tank filled on the spot. 
Turns out, no local places fill tanks on the spot and I ended up surrendering my pretty tank for another at the machine shop. But it wasn't a big deal, really, since it came out to the same price anyway, around $90 for a filled 5lb tank \u2013 $70 for the tank deposit (which I didn't pay, since I turned one in) and $20 for the CO2.The dual-gauge regulator I purchased works perfectly fine, but two things are worth noting. It has a safety release valve that engages at 45psi. If you want extra bubbly water, you may want to look for a different regulator. Also, you don't necessarily need a dual-gauge regulator \u2013 it just serves as a visual aid for how much CO2 is remaining in the tank.Save for the ball lock disconnect, you can easily find the hose clamps, gas line, and other connectors at your local hardware store. However, it's almost positively easier and cheaper to just order this pre-made assembly online. It's difficult to find a rubber gas line under 20' long and for less than $20. This entire assembly plus the quick disconnect is about $15 on Amazon.. You really won't need a lot of tools for this. Just some scissors, a pipe wrench or slot and groove pliers, and a screwdriver. And you should definitely consider some thread seal tape.. Begin by attaching the gas line assembly to the regulator. Slide a hose clamp over the open end of the gas line, then slide the hose over the barb on the bottom of the regulator.If you have trouble fitting this hose over the barb, simply soak the end of the hose in warm water for a minute or two, then try again.Slide the hose clamp to about 1/8\u201d from the end of the hose and use the screw driver to tighten the hose clamp over the connection.. Next, wrap some thread seal tape around the threads of the CO2 tank valve (in the direction you will be screwing the nut on, unlike what I'm doing in the above photo, because I goofed).Make sure the included nylon washer is in place, and screw the regulator onto the tank valve. 
Use pliers or a pipe wrench to snug the nut.. And that\u2019s it! Seriously, you\u2019re ready to carbonate some water.Turn the valve on the CO2 tank and adjust the regulator pressure to approximately 45psi, and twist the pressure valve on the regulator to the on position.Remove the cap from the bottle of water, squeeze out as much air as possible, and screw on the the ball lock converter cap \u2013 or carbonator. Then connect the bottle to the ball lock disconnect. When you do this, the bottle will immediately inflate and harden. Shake the bottle for 60 to 120 seconds and remove from the ball lock disconnect.Turn off the valve on the CO2 tank, pull the manual pressure release valve to release the remaining pressure in the gas line, and switch the regulator pressure valve back off.Twist off the cap on the bottle, pour into a glass, and enjoy some refreshing homemade sparkling water!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [3, 0, 2, 1]\nD: [0, 1, 3, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_74_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_74_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_74_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_74_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [1, 3, 0, 2]\nD: [3, 0, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This recipe serves two people or one extremely hungry student :)Ingredientsone chicken breast ($1.50)one ramen noodle packet with seasoning ($0.25)one egg ($0.20)Total Cost: $1.95That's all the ingredients that there is to it! 
Feel free to double or even triple the recipe if there are more people.. 1. Start off buy putting your ramen in a plastic bag. Gallon bags work best but are not necessary. 2. Pour as much seasoning as you like. I usually use 3/4 the packet but add more or less to taste. Seal the plastic bag so that you don't make a big mess.3. Put your textbooks to work by using them to crush the ramen into small bits. Make the pieces bigger for crunchier nuggets or extremely fine for a softer nugget.4. Pour the ramen breadcrumbs into a bowl. If the plastic baggie you used to crush the noodles hasn't broken, you don't have to do this step, but most likely there will be a few small tears from where the textbook has stabbed it.. 1. Take your chicken breast and use a knife (ones that have broken and have been repaired by duct tape are fine) to cut it into bite sized pieces like in the second photo.2. Crack one egg into a bowl and whisk it using a fork/chopsticks/whatever.. 1. Start off by making sure you have a plate close by to put the breaded chicken on. Then place some chicken bits into the egg mixture making sure that every bit is coated.2. Place the eggy chicken into your crushed ramen noodles and use your hand to make sure every part of the chicken is coated and that there are no bare spots.3. Put the nuggets onto a place and get ready to cook!Tip: Have one hand do the wet stuff (coating the chicken with egg) and your other hand do the dry stuff (coating the chicken with ramen noodle, placing nuggets onto a plate). . 1. Pour some oil onto a pan. Doesn't really matter what kind of oil or what kind of pan, whatever you have. I used olive oil for this demonstration. Also, the more oil you use, the more tender and generally tastier the nuggets will be. Heat the pan on medium until the oil is hot.2. Place all of the nuggets in an even layer on the pan. Don't worry if some of the ramen noodle coating falls off, you can pick those up later.3. 
Cook until the bottom of the nuggets are a golden brown. The nugget in the third photo isn't done yet, it needs to cook for longer.4. Once the nuggets are golden brown like in the last photo, turn them over so that the other side can cook. Once you can see that the other side is also golden, remove the nuggets from the pan and transfer them to a plate (or just eat them out of the pan, less dishes amirite?)Tip: Don't put the cover on the pan! Condensation will form and drip onto your chicken nuggets making them soggy and wet.. Eat your nuggets when they are still warm and enjoy your delicious meal! If you liked this Instructible, please take a second to vote for me in the DIY University Contest! It would mean the world to me!Have a fantastic day,thederpyninja\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [1, 3, 0, 2]\nD: [3, 0, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_75_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_75_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_75_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_75_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 3, 1, 2]\nD: [0, 2, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Before you begin, acquire the materials that you will need to tap into your faucet supply line. Most faucets in the US are 3/8\" compression, so that is what I'm working with below. Bill of Materials3/8\" x 3/8\" x 3/8\" Compression Add-a-tee Adapter - $6.58 at Home Depot1 ft. 
long braided tube with 3/8\" Compression Fitting (Faucet supply line) - $4.98 at Lowes3/8\" Compression Straight Quarter-Turn Valve - $8.99 at Lowes1/2\" NPT to Male GHT Adapter - $4.39 at LowesTotal: $24.94Of these, the hardest for me to find was the Add-a-tee adapter (which, incidentally, the staff at both hardware stores I visited insisted didn't exist). For tools, you will need a small adjustable crescent wrench.. It's probably pretty cramped under your sink, so it helps to pre-assemble the parts you can to reduce how long you will need to spend bent over. Pre-assemble the adapter. The proper hookup is:Water supply --> Tee --> FaucetTee --> Valve --> Faucet supply line --> Garden Hose Adapter(Tip! Look at the diagram above, as well as the picture of the pre-assembled mechanism. In the diagram, the section shaded in purple is what you are assembling, and what corresponds to the picture.). It doesn't make sense to run hot water through our wort chiller, so we need to figure out which supply line is which. Thankfully, there are only two choices, so this is easy. Shut off the valve supplying one of the lines, and turn on your faucet. If only hot water comes out, then congratulations! That valve is hooked to the cold water line (and is the one you want to tap into). If only cold water comes out, reopen the valve and try the other one. Once you've identified the correct valve, turn off the water to both lines and test your faucet to make sure nothing comes out. NOTE - This is important! If you don't do this, at best you'll end up with a mess, at worst you could get hurt or destroy something. Be careful, be safe, and make sure to check that the water is shut off before proceeding. To attach the adapter you just built, simply unscrew the existing cold water supply line running from the water valve where it connects to the line running to the faucet, and reattach both lines to the sides of the add-a-tee. 
Use a small crescent wrench to tighten both lines to the sides of the add-a-tee.Once you have attached & tightened your adapter, turn the supply valves back on and check for leaks.. Congratulations! Your adapter is installed, and ready for use. Hook up your wort chiller, turn on the water, and check for leaks. These are compression fittings, so if you see a leak, try tightening the nut nearest the leak. Now, go brew some beer and test it out! You're one step closer to better homebrew!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 3, 1, 2]\nD: [0, 2, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_76_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_76_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_76_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_76_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [0, 1, 2, 3]\nD: [0, 3, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \n To make the spaghetti yeti, the only ingredients you'll need are:\n\t\tA brick of super firm tofu\n\t\tSpaghetti\n\t\tWater\nYou'll probably also want to serve it with some sort of sauce, but I'll leave that up to you. The spaghetti yeti is abominably bland, so goes well in any pasta dish,\n . Your yeti is going to be made by skewering a tofu body with dozens of strands of raw spaghetti, then boiling the whole lot together.\nThe first step is to carve your yeti's body. Try to carve it all out of a single brick of tofu, being careful not to make it too thin and flimsy at any point. There's no need for fine detail here; just go for the main features. 
Be sure to give it a wide, sturdy base so that it can stand up. That is to say, give it bigfeet.\nAs you can see, nearly all the structure of my yeti's body was concealed by its hair later on. I also decided that my yeti was too short, so added a separate head.. Carefully push a strand of raw spaghetti all the way through your yeti, then break it off at the desired hair length.\nRepeat in varying directions and lengths until the tofu body is riddled with spaghetti spikes. At this point it should look more like a sea urchin than a yeti.. Find a pot big enough to contain your startled-looking raw yeti. Be careful not to break any of the brittle spaghetti while you're handling your monster.. Boil your yeti until the spaghetti and the tofu are both cooked through. Try to do this at a gentle simmer, as a hard boil will send your yeti tumbling dangerously. I know that yetis are rugged enough to survive most avalanches, but they're naturally found in cold climates; at higher temperatures they become much more fragile.\nIf your yeti's hair is sticking out of the water, you may need to cover it with an upturned pot so that the steam will cook the dry spaghetti enough for it to turn limp and flop into the water.\nOnce your yeti is cooked, carefully remove it from the water and drain it in a sieve.. You may decided that you want to make and cook the head separately. This was my yeti's uncooked head, made from leftover pieces of its tofu body.. Use a pair of scissors to give your yeti a haircut, if necessary. How shaggy you choose to leave your yeti will depend upon how formal an event it will be attending.\nTry to choose appropriately sized scissors that won't shear your yeti in half.. Give your yeti a final going-over before you serve it up. I just added a pair of peppercorns as eyes, but I'm sure you'll be able to think of other ways to customise your own creation.. 
Pour pasta sauce, soup or whatever else you'd normally serve with spaghetti around your yeti and serve it up to some unsuspecting diners.\nBe sure to upload pictures of your own spaghetti yetis to the comments!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [0, 1, 2, 3]\nD: [0, 3, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_77_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_77_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_77_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_77_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 2, 1, 0]\nD: [1, 2, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Dr. Pepper Glaze\nI did not have any pictures of this step as I already had some glaze made. Depending on how many chickens you have you can decrease or increase this recipe. You use about \u00bd cup of glaze per chicken\nIngredients:\n2L Dr.Pepper\nIn a pot bring Dr.Pepper to a boil. Once boiling reduce heat and simmer at medium heat until the Dr.Pepper. Once the Dr. Pepper has reached a consistency of corn syrup remove from heat and set aside till ready to use.. Brining the chicken helps add flavour and keep the meat moist when its cooked.\nRecipe\n2Gallons of Water\n2 Cups Kosher Salt\n\u00bd cup Vinegar\n3 cups Brown Sugar\n1 cup pickling spice(I bought this premade)\nIn a pot heat a \u00bc of the water, add all of your ingredients and let dissolve. 
Once dissolved simmer for 5 minutes then add to the\u00a0remainder of the cold water.\nOnce the water has cooled add your chickens and soak for 6-8 hours in the fridge.\nOnce chickens are done remove from brine and dry off.. I use a charcoal bbq for smoking and buy my wood from the bbq store in my city. If you have a gas bbq most bbq stores sell things you can put in your bbq to create the smoke.\nItems Needed\nBBQ/ Smoker\nWood(I used apple wood)\nWater\nChickens\nStart up your bbq/smoker .\nSoak your wood in water as this will create more smoke when the wood is added to your charcoal and help the wood take longer to burn.\nOnce your bbq is at temp(200 degrees fahrenhite)\u00a0add the wood(about three pieces to start) then add your chickens.\nGlaze your chickens with your Dr. Pepper every 15 minutes. When you notice there is very little smoke coming out of the bbq add more wood to the coals.\nKeep smoker at 200 degrees Fahrenheit\nSmoke chicken for 4 hours or till the chickens reach an internal temperature of 164 degrees Fahrenheit.. 
Remove Chicken from the BBQ you will now have\u00a0a nice sweet glaze on your chicken\u00a0with a nice smoky taste.\nOnce cut open the juice should run clear if the meat looks slightly pink do not worry the smoke has a tendency to do that to the meat.\nEnjoy.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 2, 1, 0]\nD: [1, 2, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_78_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_78_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_78_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_78_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [2, 3, 0, 1]\nD: [3, 1, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. -First step is to get ready with all ingredients. cut onion slices thin and long. Even clean mushrooms and cut them in to slices .(I used tin mushrooms which are already cooked). If using fresh mushrooms clean them properly and cut in to slices. -Take tomatoes in blender and blend to puree.Take a vessel and add butter to it. You can add even few tsp of oil.-When better melts add cumin seeds.when they splutter add crushed cardamom to it.. -Next add sliced onions add saute until they slightly change color.-Add ginger garlic paste to it and cook until you get rid of raw smell of ginger.. - After onions are cooked.Next add blended tomato puree and give a mix. -Then add all spices coriander powder,red chilli powder,garam masala,turmeric powder, salt and mix properly. -Add sliced mushrooms and saute for few seconds. 
Then close the lid and cook for few minutes until mushrooms are properly cooked(if using fresh mushrooms instead of tin mushrooms close the lid and cook them for few minutes). -Last step is to add fresh cream or 2 tsp milk.-Finally garnish with coriander leaves and serve hot with rice or naan\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [2, 3, 0, 1]\nD: [3, 1, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_79_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_79_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_79_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_79_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 3, 2, 0]\nD: [3, 0, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. First turn stove on to medium or around that. I just didn't want to burn the chocolate.Next: add chocolate pieces to sauce pan. ( I broke mine into pieces to make it easier to melt). The next step is to add the cherries and the Chili.I added about 1/3 of the bag of cherries to make sure that I had cherry flavor in each bite.Then I added 1 tablespoon of the chili powder to start off with. You can add as much or as little as you want, but I wouldn't add too much because then it would take over the flavor of the chocolate and cherries.Cook the chocolate and remember to continually stir to prevent burning. Cook and stir until it is completely melted and thoroughly mixed.. 
The next step is to take the chocolate and place it in a form to harden.I had a stainless steel mixing bowl that I put my chocolate in, so that it would be easy to remove once cooled.Once in mold, it is optional to place it in the freezer to quickly set up the chocolate. I did this and it worked great because it made the chocolate very hard and easy to remove.To remove chocolate, I placed the bowl upside down on a towel and then hit the bottom with a solid object and the chocolate just popped off the bowl and into the towel.. Place chocolate in plastic bag to keep it fresh.The last step is to enjoy!!! omm nom nom\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 3, 2, 0]\nD: [3, 0, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_80_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_80_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_80_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_80_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [0, 1, 3, 2]\nD: [0, 2, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need:Two pieces of bread.Peanut butterYour favorite jam (jelly,preserve,w/e)MargarineYou will need the following:3 Spoons1 Fry pan1 Stove top or similar device1 Flipping device (optional). Place your fry pan over the burner. Set the burner to high. We want the pan nice and hot when we start to cook as to seer the sandwich (crispy outside juicy inside).Using the back side of one of your three spoons extract a generous amount of peanut butter from the jar. Evenly spread the peanut butter against one piece of bread. 
The reason we use a spoon verses a butter knife is because of the slight curve to the spoon. It allows for easy application of the peanut butter. And as an added bonus you can scoop a mouthful when no one is looking(Yum!). Using the same procedure as with the peanut butter apply an even layer of jam to the other piece of bread. Having completed this step you will have two pieced of bread with something smeared on them. If what you have doesn't resemble Figure (1) you may need to go back to step one.The next thing is to combine the two pieces. Holding one of the pieces flip it over onto the second.. Using your third and final spoon put a thing layer of margarine to the top piece of bread.Grab you sandwich and place it butter side down on the fry pan. Hear that nice sizzle? From here one out it's a time challenge. Work fast or risk burning your food. Using your third spoon again apply a thin layer of margarine to the top piece of bread.. It takes a certain amount of skill to know when it's time to flip the sandwich. You can use a flipping device as shown in figure (2) to see when it is time to flip. Personally however I use a slight shaking motion to the pan. When the sandwich starts to move around easily I listen for a crispy sounding motion to come from the sandwich. When I hear that magical sound I use a G shaped motion to flip the sandwich. Do the same to the other side. Wait for a nice browning, or the magical crisp noise.. When the sandwich is done slide it onto a plate. Using your flipping device cut it corner to corner. This is a very important step, don't forget it. I don't know what will happen if you don't cut it corner to corner I have always remembered. If you choose to skip that step I take no responsibility as to what may happen. 
Proceed with caution.Slowly bring the sandwich up to your mouth bite down and enjoy the little piece of heaven you just found in your kitchen.Personal thoughts on eating this:This is one of the tasteyiest snacks I have ever had. And as an added bonus if you are ever feeling slightly ill, the warm peanut butter seems to coat your stomach and sooth it. I just ate the Mach 17 version of this sandwich 30 min ago and my belly is still warm and comfy.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [0, 1, 3, 2]\nD: [0, 2, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_81_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_81_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_81_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_81_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [0, 3, 1, 2]\nD: [1, 2, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Make chocolate cakes. Big ones! Put them on a cake board and cover them in buttercream icing. Let the buttercream set in the fridge for 30 minutes.\nDon't have cake boards? Neither did I. Cut a circle from a pizza box and cover it in tinfoil.. Cover the whole cake in a layer of fondant, tinted black. It takes a whole lot of icing colouring to get black, so dump a lot in at once. 
If you can find chocolate fondant, it will become black with less colouring.\nTo get the fondant onto the cake, you first need to roll it out onto a sheet of plastic.\nThen drape the fondant, still attached to the plastic, over your rolling pin.\nGently slide the fondant and plastic over the cake.\nPeel off the plastic.\nSmooth out the fondant to adhere to the buttercream.\nTrim around the bottom. I use a pizza cutter.\nNow cut out another round of fondant, this time leave it white.\nPlace it on top of the black fondant.. Mix some gum paste with fondant, leave it white. The gum paste will stiffen the fondant a bit, making it sturdier.\nPlace a strip of white gum paste+fondant (gumdant?, fonpaste?) along the top and bottom.\nTo get it to stick, dissolve a pea sized piece of gum paste in a teaspoon of water, or whatever you feel like. Once it is dissolved, you will have a sort of glue to use. Brush some on with a little clean paintbrush.\nAdd a few more accent pieces.\nNow, the silver part.\nTo make it shimmer, you will need some silver dust you can find in cake shops. Mix a bit with vodka, not water, to make a paste. Water dissolves sugar, remember? The vodka will evaporate, leaving the dust.\nPaint this paste onto your white pieces.. To make these I used just gum paste, as these can be flat and don't need to taste great. (Gum paste dries hard)\nTo cut out I painstakingly printed out the letters on a piece of paper, cut them out, traced them onto fondant. I then carefully cut the fondant with a series of different tools.\nThen I painted them with the vodka silver paste.\nThese items can be cut and painted on a flat surface, on parchment paper or something. Let them dry and they will stiffen up for handling.\nGlue them on with the sugar and water stuff you made before.\nI painted the 'J' with red icing colour.. 
\nEnjoy your finished product while you can.\nThen give it to who you made it for and pretend it's not being destroyed, eaten, and digested.\nBecause that was a lot of work!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [0, 3, 1, 2]\nD: [1, 2, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_82_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_82_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_82_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_82_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [2, 3, 0, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. The ingredients for this recipe are...A dollop or two of whipped cream (from heavy cream)1 0.75 oz packet of hot cocoa mix2 standard candy canes2 shots of espresso (or about 4-6 oz of strongly brewed coffee)1/2 cup of cold milk (any type, including almond or soy). Pour out a bit heavy cream (just a few spoonfuls is fine, enough to have some room for whisking), and use a hand mixer with a whisk attachment to whip your cream to stiff peaks. Put your whipped cream in a refrigerator until ready to use. . Unwrap one candy cane and stick it in a plastic seal-able bag. Push out all the air from the bag, and fold it a couple times. On a non-damageable surface (ie. ground outside, hard stone surface, carpet, etc.) use a heavy object like a hammer or meat tenderizer to crush your candy cane. Make sure the candy is crushed as finely as possible. Then, run the crushed candy through a sieve to remove large particles (the particles should be separated about half and half).. Pull out a large cup. 
The size doesn't have to be very exact. Pour in your finely crushed candy cane (about half of the cane) along with your packet of instant cocoa powder (0.75 oz). Mix the two together, and set aside. . Now, go ahead and brew your two shots of espresso (if you don't have espresso, use strongly brewed coffee). Immediately pour your two shots into your cup, and mix to dissolve the cocoa and candy cane. . Pour out a bit of milk (about 1/2 cup) into a frothing cup, and use a milk steamer to steam the milk. You can use any sort of milk. If you don't have a milk steamer, simply heat it up on a stove. Immediately pour your milk into the cup, and scoop the foam on top of the mixture. . Pull out your whipped cream and scoop a few spoonfuls onto your mocha (however much you want). Sprinkle the top with a bit of your large bits of candy cane, and stick in your other candy cane (unwrapped) for decoration. . And that's it!!If this video/Instructable was helpful, please <3 it, and subscribe. Also, find me on Facebook, Twitter, Tumblr, Instagram, and YouTube @joshpancooking. The greatest gift to me would be if everyone subscribed to my YouTube Channel. Well, thanks for your time, and I shall see you all soon. 
Goodbye!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [2, 3, 0, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_83_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_83_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_83_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_83_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [3, 2, 1, 0]\nD: [1, 2, 3, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. ToolsCake board to suit Large knife Small knife Rolling pin Small decorating paint brush Silicon fondant mat (or baking paper works just as well)Bamboo skewerCakeReady made, or make your own cake, in this instance I used a ready made Sponges.2 x 20cm x 10cm x 3cm slabs and a 15cm x 8cm x 8cm slab.DecoratingAs with the cakes, you can make your own fondant, butter and piping gel icing, but I chose to pre-purchase my fondant and butter icing as mixing black icing is time consuming and messy. I made the piping gel icing as it was unavailable where I live and it was simple enough to make (see later steps).Butter icing Black Fondant icing White Fondant icing Red Piping Icing Brown food colouring Red food colouring. Prepare the boardClean the board and place a little butter icing where the cake will be placed to stop the cake from sliding off.Shaping the cakeWith the larger sponge cake(s) if needed remove any parts of the cake that need to be joined together so there are no cooked edges are on the internal parts of the cake. Level off the top of the cake with a large knife. 
Create the shape of the you want, in this case I went for an oval shape as I could only get a round cake board. Clean any cake crumbs from the board and cake.Butter icingUse a small spatula or in my case a butter knife to apply the butter icing. Note: be careful with butter icing on freshly cut sponge cake as it is very crumbly, and edges are prone to breaking off.Cover the entire cake in butter icing, then smooth the icing on the cake. Note: this can by made very smooth using a cup of warm water to dip the knife in and then smoothing the icing.Add the cake mask base and final butter icingWith the smaller sponge cake cut the cake into the desired oval shape and size. Place the cake mask base on the already iced cake. Butter Ice the cake mask base cake and smooth. Clean the cake board.. Note: Keep any fondant icing not in use wrapped in food wrap or a resealable plastic bag. Fondant icing is very pliable when warm, you can warm it up easily by kneading it in your hands.Prepare the fondant icingI'm too lazy to make fondant icing myself so I buy it in prepared colours from a range of different stores. To use this icing well as it has been made some time ago so it needs to be worked a little to make it warm and pliable again, I do this just by kneading it in my hands and on the bench, this is also a great time to add any colouring to change colours as desired.Roll the fondant icingAs I need the icing to cover a cake it needs to be rolled out, doing this I flatten the icing out a little before starting to roll it out on the fondant mat. Begin rolling the icing out, once have reached a thickness of about 5mm, add a second fondant mat to the top of the rolled out icing and continue to roll. Every so often flip the icing over, remove and replace the icing mat to keep icing being rolled out level and to stop it sticking to the fondant mat. 
Once a thickness of around 2-3mm has been reached make sure your icing is big enough to cover all of the cake, including the curve of the mask base, the top and the sides of the cake.Placing the fondant icingRemove the top fondant mat. Place the rolling pin at one end of the fondant icing Roll the fondant icing and the bottom fondant mat around the rolling pin Pick up the rolled up fondant icing in the fondant mat Place the loose end of the fondant icing at one end of the cake, with enough fondant icing to cover the side of the cake. Unroll the fondant icing so it sits on the cake, with the fondant mat on the top. When all the fondant icing is unrolled, you should be able to remove the fondant mat. Smooth the fondant icing onto the cake, carefully working the icing into edges and grooves with the sides of your hands. Any creases or bubbles should be able to be smoothed out by gently lifting the edges of the icing to the crease and gently replacing. You can also use tools to press the icing gently into any edges if you wish.Removing excess fondant icingOnce happy the fondant icing is fully in place, with a butter knife or similar cut the excess fondant off the cake board leaving around 3-5mm of icing from each edge of the cake. Remove the excess fondant icing. Check the removed fondant icing and keep any that does not have any cake crumbs.. Use a reference picture to create your icing mask, I was lucky enough to have my printed reference picture just the right size to use as a template to create the icing mask.Roll the fondant icingSame as the previous steps, although this time use white icing and make it around 8-10mm thick.Cut the icing mask shapeCut the mask shape out from the template using a small knife. 
Cut the eye holes Mark all the air holes in the mask by piercing with a skewer Mark the painted areas on the mask by tracing the are with a skewer without piercing the paper Remove all excess paperCreate the mask icing mask air holesEither use a sharp small tubular device (I used a small syringe tube with the end cut off and then slightly sharpened) to shove into the icing at each marked place and then remove the excess piece, or bore a hole with a skewer and make it larger (this can create issues with irregular sizes and pushes icing around).Add fondant icing nose and eye definitionsUsing a small piece of white fondant icing, create a small nose shape. Using small piece of white fondant icing , create a thin roll of icing and cut it into two pieces. Turn the fondant mask over Using a brush add a thin layer of water onto the areas where the nose and eyebrows are the mask. Attach the created nose in position. Attach the two rolls of icing around the top part of each eye hole. Allow water to dry for a couple of minutes Turn the fondant mask back over.Place fondant icing mask on the iced cakeThis should be simple enough by sliding a spatula or a piece of cardboard under the mask. When in the required position, lower the mask close the the cake and slide the mask into position onto the cake.Finishing touchesRound the edges of the outside, eye holes and air holes of the icing mask. Paint the marked areas with a small brush and undiluted red food colouring. Using a small knife create marks on the icing mask to look light damage to the mask. Using a piece of paper towel lightly brush around the edges of the eye holes and sides of the mask to create areas that look dirty, and brush into the gouges created earlier.. I modeled the knife after a hunting style knife I thought seemed appropriate to match the cake.Knife Blade and GuardMake some grey icing by taking some of the premixed white icing and add a little of the premixed black icing and thoroughly knead them together. 
Add the black icing sparingly as a little goes a long way, but remember you can always add more white to get the colour you want.Side step... You can skip the above colouring of the icing and leave the icing white and just use some edible silver paint for a metallic or chrome effect to paint the icing once shaped.Roll the grey icing until it is around 5mm thick. Cut the icing into a knife shape, bit of an elongated triangle really.Shove a bamboo skewer through the middle of the knife, protruding through each end to allow the pointed end to be stuck into the cake, and the other end to have the Guard and handle attached.Roll what is to be the 'bevel' edge of the knife so the edge is around 3mm angling up to the middle of the knife where it is to remain at the 5mm thickness making it look like a blade.Once happy with the 'bevel' of the blade finish shaping into a knife. Finally add a line down the middle of the knife along the start of the 'bevel' to the 'sharp' edge of the knife.If you like you can add the wire cutters to the back of the knife blade by removing small triangular pieces from the back of the blade.With a remaining piece of the 5mm rolled grey icing cut a rectangular piece that will be the knife guard.Finish shaping the guard by rounding the corners.Place the Guard onto the bamboo skewer on the blunt end of the knife to act as the guard.Knife HandleTake some black icing and roll into a cylinder to match the size of your knife blade you have made out of icing.When you have the required diameter of your handle trim each edge of the handle to the required length with a real knife by putting the real knife sharp edge lightly on the icing handle and rolling it back and forth adding more pressure to the cutting edge of the real knife slowly working through the icing handle until you have cut all the way through.Repeat for the other end of the icing handle.Add the grip to the icing handle by repeating the above steps to make slight indentations along the icing 
handle in regular intervals about 0.5mm deep.Place the icing handle on the bamboo skewer completing your icing knife.Insert the Icing Knife into the CakePick up the icing knife and gently insert it into one of the eye sockets sinking the knife slightly into the cake. WARNING: Any attempts to vigorously stab or throw the icing knife may result in damage to the icing knife.Note: As you may notice from the pictures my end product knife seems quite a bit shorter than the step by step photos, Yes I did redo the knife, but not the photos, also this is in no way related to the final warning.. Thanks to Google and McGreevy Cakes I was able to locate a suitable recipe for making my edible blood, which was made from Piping GelIngredients\u2153 cup granulated sugar1 Tablespoon Corn Starch\u00bc cup lemon juice (but I\u2019ve used just a dash of lemon extract in a pinch and it works fine)\u00bc cup waterRed food colouringInstructionsAdd all the ingredients except for the food colouring into a pot (saucepan)Mix over high heat until boiling, and then cool.Tint as desired!Make a piping bagGrab a mug or a cupGrab a plastic freezer or sandwich bag and place it into the cup, roll the edges of the bag over the edges of the cupWhen cool, pour the Piping Gel Mix into the bag inserted in the cupSeal the bagPick up the piping bag and when ready cut a small section off of one corner and then let the piping gel pour into the cake, icing knife and partially on the cutting board to look like blood.. 
Unfortunately I didn't get to see the kids reaction to the cake but I was informed they were suitably impressed and disgusted at the sight, but loved the taste.Thanks for readingDale\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [3, 2, 1, 0]\nD: [1, 2, 3, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_84_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_84_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_84_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_84_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [1, 2, 3, 0]\nD: [0, 2, 1, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. One bread knife, one or more round rolls (buns), a number of eggs equivalent to the number of rolls.. Cut off the top of the roll.. Take out the inside of the roll. Do not take too much as the egg will leak. There has to be enough room to accommodate one egg and some spices and top ups.. Load the egg into the roll and top it up with your favorite spices. I have chosen a bit of salt, loads of white pepper and loads of red paprika powder. U can use chilli peppers, I would go for Scotch Bonnet if I had some, (u have to remember that they r very hot!!!) cut into small pieces, without any problem making the dish very Mexican ;). I added a bit of low temperature melting fat chase as well. You egg roll is ready for the oven.. Load the roll into the oven for about 20-25 min @180 centigrades thats 356 Fahrenheit. The time will vary depending on the oven type, forced air circulation etc. so you will have to experiment with your oven go get the desired effect. 
I usually try to have the egg roll very soft inside with the liquid yoke and the white barely done. You simply have to observe the time required for the desired effect in your oven so next time you will be able to set the timer up.. You egg roll is nice and ready. Because no fat is used for the egg processing the white is going to be very delicate and hot, so you will have to wait a bit longer than with the regular fried egg as the bun works like a thermo flask keeping the egg nice and warm for longer. Load your favorite ketchups and garlic mayonnaises or mustard and enjoy your extremely easy but tasty and good looking appetizer.That's it hope you will like it.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [1, 2, 3, 0]\nD: [0, 2, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_85_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_85_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_85_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_85_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [3, 2, 1, 0]\nD: [1, 3, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Boil a chicken in a large stock pot with water and a quartered onion. (or your favorite chicken stock ingredients). Use a food processor to cut carrots, broccoli stalks, onions, celery and broccoli florets. Items should be cut fairly finely. Pulse each item for best results and set aside in a bowl. Keep broccoli florets separate.. In a large skillet melt a stick of butter. Place, carrots, celery, broccoli stalks, and celery and saut\u00e9 until soft. Do not add the broccoli florets yet.. 
Add veggie mixture to a soup pot and pour in broth. If your chicken stock isn\u2019t enough liquid you can add stock from cans as well to supplement. Add black beans.Add white wine. I use cheap Trader Joes wine (thanks almost 2 buck chuck!) Add salt and pepper and a little hot sauce, or if you only have siracha that will do to. Add a pinch or dried oregano and I had some thyme in the garden, you can also use dried thyme too. Remember, with seasoning best to start with less and always add more. Let this simmer for at least 30 minutes.. Take the chicken from the stock (it should be fully cooked) and tear off the meat and cut/tear into little pieces.. Add the chicken and the broccoli to the soup pot. Add liquid smoke, which is like a cheat for not actually having to smoke a chicken but still getting that awesome flavor! Add Worcestershire sauce and heavy cream. Stir and serve hot with bread!Please subscribe and check out my other Youtube recipes and fun DIY projects :) Including this awesome kitchen remodel I did!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [3, 2, 1, 0]\nD: [1, 3, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_86_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_86_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_86_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_86_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [2, 1, 0, 3]\nD: [0, 2, 1, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Liquid Nitrogen- 10-15 liters1/4 cup Torani Bacon Syrup\n1 cup heavy cream\n3 cups half & half\n8 egg yolks\n1 cup sugar\n1/8 tsp saltSafety Gloves\nSafety Goggles- with splash guard\nSqueeze Bottle or Marinade Injector\nIf using marinade injector, do not attach needle to tip. To make the bacon ice cream base, combine the heavy cream and half & half in a large pot. Bring mixture to a simmer, stirring occasionally.\u00a0 Once mixture begins to simmer, turn heat to low.\nNext, whisk eggs, sugar & salt in a bowl. Then gently whisk in 1/4 cup bacon syrup. Whisk 1/2 cup of the hot cream mixture into the bacon/egg yolk mixture.Repeat three times, whisking in 1/2 cup of the hot cream mixture each time. Next, return mixture to the pot with the remaining hot cream and raise the heat to medium low. Stir frequently for 5 minutes or until the hot cream mixture coats the back of a spoon. Strain mixture into a bowl and set aside for 20 minutes.. Fill marinade injectors or squeeze bottles with the bacon ice cream base.\n**Remove the needle tip before squeezing into the liquid nitrogen. Removing the needle tip will result in more uniform droplets**.. Put on your safety goggles and gloves. Carefully pour the liquid nitrogen into a large saute pan.\nYou can remove your safety gloves now. The technique requires that you move quickly, but also methodically.\u00a0 Using your marinade injector or squeeze bottles, hold the tip very close to the surface of the nitrogen and carefully squeeze out a drop a little smaller than a juniper berry. Each drop should be about the same size. Move about 1/2'' each time your squeeze a new droplet so that the drops don't land on top of each other.\u00a0. Time to indulge! Kitchen Science has transformed a classic savory flavor into a whimsical satisfying savory/sweet dessert. These little dots of bacon perfection will leave your taste buds begging for seconds, thirds..and soon they'll be all gone.. 
I originally tried to make the Dippin Dots in small stainless steel bowls. This resulted in the dots clumping together. I also originally used the marinade injector with the needle tip attached which also resulted in the dots clumping together.\nThe large saute pan I used to make the final Dippin Dots was the perfect vehicle for this project. It allowed adequate space for the droplets to form. Removing the needle tip allowed for more control over the injector and uniformly shaped dots.\nIn the future it would be interesting to try pouring the ice cream base through a colander into the nitrogen.\u00a0\nAny feedback or suggestions welcome!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [2, 1, 0, 3]\nD: [0, 2, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_87_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_87_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_87_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_87_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [3, 2, 0, 1]\nD: [3, 1, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Homemade soup served by the hands of a loving wife or mother has been used as a home remedy with remarkable results. It\u00a0soothes\u00a0the throat and warms the body. \u00a0Canning large batches will serve you well when a family member comes home sick. \u00a0. \u00a0How many of you have ever tasted home canned food? The truth is home canned foods \u00a0have more nutritional value than store bought foods. You control what goes into the foods you can. 
It is very beneficial to can what you grow yourself, \u00a0because most farmers use harmful chemicals on their fields. If you can't grow it yourself consider buying produce that is grown organically. The flavor of home grown and canned produce is amazing!\u00a0 I grew up in a time when many people were still growing and canning their own produce. I know what a real dill pickle taste like and what you buy in the stores today don't even come close!\u00a0 Canning\u00a0 takes \u00a0time but if your time is limited consider growing your own garden and freezing what you grow. The benefits are worth the extra effort.\nIn this guide I have canned Grannie's soup recipe the lazy way. I canned the soup but have frozen it instead of using the pressure canner or pressure cooker method. This is an inexpensive way to get started and see if it is something you might be interested in doing. From there you will gain confidence and may decide to go for the real deal. I personally have canned fruits and jellies but have never attempted canning meats. Canning some foods require education because of the dangers involved if you don't do it properly.. \n\tThis is what you will need to make the soup:\n\t1 Boiled whole chicken adding only salt when cooking it.\n\tSave all the chicken broth.\n\tRemove the meat using a strainer if you have one, save and freeze the skins and bones if you have dogs or cats. I will show what to do with them later.\u00a0\n\tCut chicken in small bite size pieces.\n\t1 cup peeled chopped carrots.\u00a0\n\t1 Cup chopped celery.\n\t1 Cup chopped onion.\n\t1 Chopped jalapeno.\n\t4 garlic cloves.\n\t1 Lemon juiced. This is to add to the soup after it is cooked.\n\t2 Cups of fresh chopped tomatoes.\n\t1 Cup chives I used the tops of 6 green onions because I did not have chives.\n\t2 Chicken Bouillon cubes.\n\tI used curly noodles but you can add egg noodles as well.\u00a0\n\tThe secret to this recipe is use as many green spices as you can. 
I use what I have on hand.\u00a0You can add just about any kind of vegetable to this recipe and receive benefits from it.\u00a0 This is the recipe we have used for a very long time.\u00a0 I often use what ever I have at the time.\u00a0 Nothing is in stone.\u00a0 You can add parsnips, sweet potato and turnips for even better results.\u00a0 I did not have any on hand.\u00a0\n\tSpices:\u00a0 I adjusted my recipe for a larger group of taste buds.\u00a0 I like mine more seasoned and with more pepper.\u00a0Taste it after you add everything and adjust it for your taste buds.\u00a0 The more spices the better it works.\u00a0\n\t1/8Th Teaspoon of each of the following as desired:\n\tBasil\n\tParsley\n\tOregano\n\tPaprika\n\tChili Powder\n\tBay Leaves\n\tSage\n\tCumin\n\tRed pepper\n\tCilantro\n\tItalian seasoning\n\tDill weed\n\tCinnamon\n\tNutmeg\n\tSea salt\n\tPepper if desired\n\tYou may omit the peppers if your family is sensitive to it. Peppers help clean out the sinuses.\n\tUtensils:\n\t1 Large stock pot\n\t1 Large spoon\n\t1 Medium funnel with large opening\n\t1 Sharp knife\n\t1 Cutting board\n\tMixing bowls\n\tFood strainer if you have one.\n\tClean canning jars or heavy jars and lids with wide mouths. If this is your first time freezing in a jar just can/freeze a few to get the feel of it.\u00a0\n\tPlastic bags the number of jars you will be freezing.\n\tPlease note:\u00a0 If you are a\u00a0vegetarian you may substitute the chicken broth for a vegetarian broth and add rice and beans to make a complete protein.\u00a0\n\t\u00a0. Place the broth in the stock pot or cook it in a crock pot.\u00a0\nAdd all the spices.\nAdd the chicken.\nAdd all the vegetables reserving\u00a01 cup of\u00a0the tomatoes and a few green onion tops or chives for garnish.\nStir well.\nTurn on the burner and cook until the carrots are done but not over cooked.\nAdd the lemon juice to the cooked mixture.. 
Add the remaining tomatoes and chives to the jars.\nDo not fill the jars above the neck line. Leave at least 1 inch at the top for small jars and 2 inches for larger jars to allow for expansion. If you don't allow enough the jars could break. As it turned out my jars did not expand that much but it is best to be safe than sorry.\nLadle the soup into the jars.\nAllow to cool completely to ovoid breakage.\nWhen they are cooled completely carefully place them in the freezer with the lids off!\u00a0 As a safety measure: Place the jars into the plastic bags to prevent any glass from getting on other foods if the jar breaks.\nAfter they are completely frozen place the lids on the jars and screw down the lids.\nPut back in the freezer. There is no need to place them back into the plastic bags because they are frozen and there is no danger in them breaking.\nThat is all there is to it!\nWhen you thaw out the soup allow it to thaw in a bowl with cool water if you will be around to start cooking it when it is thawed.\u00a0 I personally feel safer defrosting it in the fridge. Avoid rapid thawing to prevent breakage.. I\u00a0promised\u00a0that I would add the link to my chicken soup bones recipe. \u00a0I made a completely different tutorial about how to cook the chicken bones to feed you dog/cat. \u00a0I had been visiting my sister and she was feeding her dogs chicken bones. \u00a0I never knew you could actually safely give them dog bones and they are very good for them. This tutorial also gives tips on how to potty train your dog and useful grooming tips on\u00a0\u00a0friendly products. Step 4 is about the dog food. \u00a0 \u00a0Here is the link on how to safely do that: \u00a0https://www.instructables.com/id/Potty-Training-Grooming-Nutrition-And-Choosing-/. I have pictures here of ways you can package the soup for gift ideas. You can begin to make the soup now and avoid that last minute holiday rush. 
It is important to place a large note on the package and tell them that the jar must be placed in the freezer or fridge asap or eaten within a few days. I know this is a repeat but it is very important and you would sure hate to find out that someone got sick on the soup you canned. The jars are not sealed so they need to be frozen until they will be used. Do not let them sit on the counter all day because bacteria can make you very ill. Thaw them in a bowl of cool water if you are going to be around to check on it often. Otherwise thaw in the fridge. Cook frozen soup as soon as you can remove it safely from the jar.\nFor a care package\u00a0 just add stuff one would take for a cold along with the soup. You can add a little or add a lot. You could make a family package because a lot of times everyone in the family gets sick. You can make the soup in a crock pot and take the entire pot to a sick family. Many different options you could do for this type of gift. Add bath salts recipe here: https://www.instructables.com/id/How-To-Make-Bath-Bombs/\u00a0\u00a0\u00a0 Lip balm: https://www.instructables.com/id/Delicious-Chocolate-Chapstick-Honey-Balm/, \u00a0candle, cough drops how to here:\u00a0https://www.instructables.com/id/Cough-Drops/ , Vapor rub\u00a0\u00a0, Orange juice, Vitamin C, Tea, Get well rock, Throat spray, or footie's just to name a few.\nThere are people who have concerns of storing foods in plastic containers or bags and this is a good alternative for them.\u00a0 You can use plastic to store them in and that is an option you might consider.\u00a0 This is a great way to get you comfortable putting up your own food.\u00a0To freeze broth simply place the broth in the fridge until the fat settles to the top.\u00a0 Skim off the fat and pour\u00a0the broth into a freezer bag and work out the air.\u00a0 Lay flat single layered on the freezer shelf.\u00a0 After it is completely frozen you may stack it to make more room in the freezer.\u00a0\nI am currently 
working on an Instructable using chicken bones for cat/dog treats. \u00a0When it is finished I will add a link here.\u00a0\nThank you for stopping by and have a super day!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [3, 2, 0, 1]\nD: [3, 1, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_88_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_88_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_88_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_88_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [0, 1, 3, 2]\nD: [2, 3, 0, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need:One bowl of flattened rice (poha)One chopped tomato One chopped onion One grated carrot A few peanuts, cashews and sesame seedsA teaspoon red chili powder A teaspoon coriander powderA pinch of chat masala Salt depending on your taste Half a teaspoon of lime juice A few coriander leaves. Take a pan and add a teaspoon of oilLightly saut\u00e9 the flattened rice for few minutes.Remove the flattened rice and add the peanuts, cashews and sesame seeds to same pan.Lightly roast them till they change color.. Now we need to add all the ingredients to the flattened rice and mix everything. First add the chopped tomatoes, grated carrot and chopped onions.. Add a pinch of chat masala, coriander powder, red chili powder and as much salt as you like.. Add the roasted nuts and coriander leaves and mix. 
Finally squeeze a little lime juice and serve immediately\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [0, 1, 3, 2]\nD: [2, 3, 0, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_89_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_89_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_89_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_89_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [0, 2, 1, 3]\nD: [3, 1, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This tree stump is going to be a great place for one of our cute little birds to perch. It's also going to add height to the cake making it appear larger and more complicated than it is! You will need:light brown fondantdark brown fondanttoothpickSCULPTING THE TREE STUMP 1. Cut a 1 1/4 ounce piece each of dark brown fondant and light brown fondant and place on top of each other.2. In a twisting or pulling motion mix the two colors of brown fondant together to marble them slightly. You can see I only have one distinct vein. More would be great!3. Roughly form the fondant into an egg shape and flatten the top by pressing with your fingers.4. Squeeze the sides of the stump to give the stump ridges and to elongate it.5. Press or pinch the bottom of the stump with one finger in random places to give the effect of roots.6. Pinch the top edges to sharpen the 'cut' edge of the stump.7. Using a toothpick, score or scrape lines in the sides of the stump to resemble bark and circles on the top of the stump to create rings.. 
We are going to stain toothpicks with food coloring to make stick legs for our birds!You will need: four toothpickspaper towel or napkinbrown food coloringMAKING THE LEGS1. Dip the toothpicks into the food coloring.2. Remove and place on the paper towel without rubbing off the excess food coloring.3. After about 10 minutes wipe off excess food coloring with the paper towel. The toothpicks should be stained a medium brown color.4. Set aside to dry.. Oftentimes the simplest of shapes make the cutest characters. Sometimes leaving it simple is best! These birds are definitely simple! You will need: light pink fondant - 1 ouncedark pink fondant - 1 ounceorange - pea sized amountbrown - teeny, tiny piece!turquoise - enough for two very small eyesclear vanilla extractfood safe paint brushrolling pinrolling matMAKING THE BIRDS1. Remove a small piece (large pea size) of light pink fondant and set aside for the wings.2. Between two hands, roll the larger piece of fondant into a ball.3. Place on your rolling mat and with your hand roll the piece back and forth with gentle pressure until you start to get a cone shape (without a point).4. Using your fingers, press or pinch the smaller end flat to create the tail and curve up by pressing gently with your fingers.5. Divide the remaining piece of fondant in half and roll one half into a ball. Press to flatten slightly.6. Slightly pinch one side, and curve the pinched/pointed side up slightly. You should have a paisley shape wing. Repeat for the other wing.7. Pour a small amount of clear vanilla extract or vodka into the cap or a small bowl and apply a very small amount of extract (using a food safe paint brush) where you want the wing to be located on the bird body. Gently press the wing onto the body. It should stick immediately! Repeat on the other side of the bird. (Extract will leave a shiny or glossy appearance on fondant so use sparingly and try to be as exact with it as possible.)8. 
Divide the orange fondant in half and set one half aside reserving for the next bird and roll into a ball and form into a cone with your fingertips. This is the beak.10. Apply a dab of extract and adhere the beak to the front or head of the bird.11. Roll a very small amount of turquoise fondant into a ball and flatten. This will be the iris. Apply to bird with a small amount of extract.12. Roll an even smaller amount of brown fondant into a ball and gently press it into the turquoise fondant eye creating a pupil. Since it's so small of a piece it should stick without fondant, but if it doesn't, secure it with extract. Repeat steps 11 and 12 to make another eye for the back side of the bird.13. Repeat the entire process to make one more bird in your choice of color!. There are tons of ways to make fondant roses. I will cover three uncomplicated ways to make cute roses to top our cake with!You will need:light yellow fondant (1 ounce)dark yellow fondant (1 ounce)small round cutters - 1 inch and 1 1/4 inch (you can also use a shot glass, biscuit cutter, or make a circle pattern from thick paper and trace around it with a knife)paring kniferolling matrolling pinpetal foam (or any craft foam)ball head sculpting tool (you can also use a rounded end drink stirrer, a melon baller or a dry finger)FLOWER #11. Roll out a small piece of light yellow fondant and cut out six small circles.2. Using your finger, press the edges of each circle to thin them.3. Line the circles up and gently press them together at each overlap to secure.4. Flip the line of circles over so the wrong side is facing up.5. Roll the circles up from one end to the other.6. Cut the roll in half to create two roses.7. Pinch the bottom of the roses to open them slightly.8. Peel back a few of the petals to make the rose look more natural. Set aside to harden. Repeat to make more roses.FLOWER #21. Roll out a long piece of light yellow fondant and cut a 1/2 inch by 6 inch strip.2. 
Remove excess fondant and set aside.3. Press ridges in one side of the fondant with your finger. Continue along the entire edge of the strip.4. Pick the fondant up and roll the strip until you reach the desired size. If 6 inches is too long or makes it too thick of a flower you can cut it short anywhere you would like.5. Pinch or squeeze the bottom of the rose to make a stem and open it slightly.6. Cut off the stem so the flower will sit upright on the top of the cake. Set aside to harden. Repeat to make more roses.FLOWER #31. Roll out a piece of dark yellow fondant.2. Cut ten to fourteen, 1 1/4 inch circles and remove excess fondant.3. Roll a small sized piece of fondant into a ball. About the size of a large marble.4. Roll one end making it into a cone shape. The same way we made the bird! This will be the center of the rose. Set aside.5. Place one circle on the petal foam and in a back and forth motion with the ball head tool flatten and ruffle the edges. Repeat with remaining circles.6. Wrap one circle around the center piece of the rose.7. Repeat with remaining circles placing them randomly so the seams don't match up. You can make the rose as small or as large as you want. Fold the petals back slightly to give the flower an open look.8. Pinch the bottom edge when finished to create a stem and cut the bottom so the rose will sit flat. Repeat the process to make more roses. . If you haven't already done so, mix the colors of fondant you wish to use in the pattern of the cake. I used eight different colors (nine shown) but more or less would be fine.This is where you get to be creative and think about repeating patterns. What shapes and sizes do you want for your cake? I chose a superellipse shape and used all the same size but you could experiment with various sizes of the same shape for a unique effect. 
Examples of other shapes that would make fun repeating patterns are: rhombus, stars, heart, circle, octagon, crescent, parallelogram, etc.You can also\n find a lot of cutter options in the clay modeling section of the craft \nstore. Let's get started!You will need:geometric cutter (I got my superellipse from a small set of inexpensive cutters from the Duff Goldman collection that I found at Micheal's.)rolling pinrolling matparchment or waxed paper lined sheet pan (You can also just use your countertop or a table!)MAKING THE PATTERN1. Roll out one color of fondant very thin on your rolling mat using a rolling pin. When I say 'thin' it should basically be as thin as possible without tearing.2. Using the wrong side of the cutter cut out 15-20 pieces or shapes. If you use the wrong side (folded metal side) of the cutter you will get an exact shape with sharp corners. If you use the correct or sharp side of the cutter you will get rounded edges. This drives me nuts, but some people don't mind at all!!! You can see the difference in the picture (left cut-out = wrong side of cutter; right cut-out = right side of cutter). If you are using a cutter with rounded shapes like a cloud or a flower it won't make much of a difference which side you use.3. Smooth the sides if any rough edges are present by folding or gently squeezing the frays to the back side of the shape. 4. Transfer pieces to a baking sheet lined with parchment (or waxed) paper. Make sure they are not touching so they don't stick together! If you don't have a baking sheet you can place the parchment directly on your table or countertop in a place that won't be disturbed.5. Repeat with remaining colors!. Here we will follow simple steps to apply a repeating pattern to our fondant cake.You will need:fondant covered cakeparing knifefondant shape cut-outsclear extractsugar pearlscake stand or base1. Cut one of the fondant pieces in half from point to point using a sharp knife. 
Do this with one of each of the remaining colors equaling 18 halved pieces total. You may need more later but this is a good starting point for now.2. Apply a small dab of clear vanilla extract to the back of the fondant piece and place the top flat edge in line with the top edge of the cake.3. In a random pattern of colors apply pieces straight down in the same manner cutting the bottom most piece as necessary. I only needed to cut a tiny little piece off to make it fit at the bottom edge of the cake.4. Continue around the entire cake trying to keep the pieces straight and the colors in a random pattern.5. Once you have finished applying the pattern press a sugar pearl into each corner where the pieces meet. If the pearls don't stick by pressing into the fondant you can apply a small amount of extract to get them to stick. Sugar pearls will acquire a strange thick gooey film where they come into contact with extract so use only if necessary!!We are ready to decorate and complete the cake!. Arrange flowers, birds, and tree stump in any manner you wish! I ended up using two birds, four roses, and six small flower cut-outs with pearl centers (same flower technique as from the drip cake) on the top of the cake. Here's an idea of how to arrange your fondant decorations:1. Decide which side you want to be the front of the cake.2. Place the tree stump toward the back of the cake just off center to the right and place a large rose on the edge of the cake near the front right. Add some small roses and flowers around in groupings or any arrangement you like. Secure with extract.3. Push toothpick legs into the birds. I left off the back wings so the bird could harden slightly when I was working on other things.4. Stick one bird so it is standing on the stump and one toward the front left. Adhere the back wings on each bird with extract.Congratulations!! You should be very proud of yourself! You have completed a beautiful decorated cake that is ready to serve! 
If you don't plan on serving the cake right away, leave it out at room temperature for up to 3 days until you want to slice it and serve it. Do not refrigerate!If this cake is for a birthday party and you choose to put candles in it they will easily poke through the fondant and stand securely.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [0, 2, 1, 3]\nD: [3, 1, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_90_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_90_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_90_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_90_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [0, 2, 3, 1]\nD: [1, 0, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1 (10 oz.) bag mini salted pretzels (~3 c. of mini pretzels)\n2 c. toasted oat cereal (such as Cheerios)\n2 c. crispy corn cereal squares (such as Chex)\n2 c. mini marshmallows\n1 (8.2 oz. bag) candy coated chocolate candies (such as M&Ms - I used peanut M&Ms for this version)\n1/2 c. salted peanuts\n1 (11 oz.) bag white chocolate chips\n2.5 tsp. vegetable oil. Line 2 baking sheets with wax paper/parchment paper and set aside. This is where the snack mix will go when you are done mixing it up so it can cool and set.. Combine all ingredients in a large bowl EXCEPT for the white chocolate chips and vegetable oil. Make sure that the bowl is large enough!\nIn separate microwave-safe bowl, heat white chocolate chips and vegetable oil on medium-high heat for 1 minute, stirring once. 
Heat for 10 second intervals, stirring after each, until chocolate is smooth and melted.\nPour chocolate over cereal mixture and stir until evenly coated (don't forget to make sure all of the goodies at the bottom get coated!).. Spread mixture onto prepared baking sheets and let cool. Break apart once mixture is cool and put into serving bowl, cellophane bags with ribbon, etc. Store leftovers in airtight container (not like there will be any left after everyone tries it!).\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [0, 2, 3, 1]\nD: [1, 0, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_91_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_91_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_91_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_91_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 3, 1, 2]\nD: [2, 3, 1, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients:1. One pack mutton knuckles2. One can whole kernel corn3. One chopped onion4. Four small chopped potatoes5. Olive oil6. Your favorite spices and herbs7. Half a cup rice8. Salt. 1. Wooden spoon2. Can opener3. One pot. Add to your pot a little bit olive oil, and then fry your chopped onion in it. You can add spices or herbs.. Add the mutton knuckles to your chopped onions and fry for a while.. Do Not stir!. After your meat has cooked for a while add your chopped potatoes and rice. Add also your choice of some salt,spices and herbs. Add a bit of water. Don't over do it.. Let it cook for about an Hour. You can add water if needed. Don't add to much, but add enough for the rice to cook. 
Do Not stir!. Have fun, just don't let your towers fall over!. After waiting, open the can and add corn to pot. Note: Dispose of all the liquid before adding. Stir in gently.. Enjoy your one pot meal! Eat it with anything you like, or enjoy it alone.Please vote for this Instructable!Thanks for reading through my Instructable. Please try it out and tell me what you think about it in the comment section.Check out our other Instructables:Easy OmeletteLichen GardenEasy Lemon Peel JuiceReal Chocolate BrowniesYou can also check out my forum topic:The Powerpoint Game Community\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 3, 1, 2]\nD: [2, 3, 1, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_92_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_92_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_92_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_92_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [3, 0, 1, 2]\nD: [2, 3, 1, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. I dropped the rock crab into boiling water and cooked them for 10 minutes. I strained the water, and cleaned them. The claws are where most of the meat is found. So tap these with a hammer or something hard to crack them, then clean out the crab meat. The legs are pretty small, so these I tossed whole into the gumbo. The bodies were also a little small to clean and a lot of meat was left in the claw shells, so I put these into a pan with five cups of water and let it simmer. The kitchen began to smell swampy.... Brown your sausage-I had about 1 pound. 
Remove from pan and then add 1 onion, 2 green peppers, and 4 stalks of celery. Sautee these. . To your pork stock, add 1 can tomato sauce, or canned diced tomatoes and 1 can tomato paste. Combine pork stock and crab stock (after you strain out the crab shells) in one large pan, then add the sausage vegetables and spices. (2 bay leaves, 2 tbsp. paprika, 2 tbsp. red chili flakes, 1 tbsp. salt, 2 tbsp. black pepper.) . Put 1 cup of pork lard in a frying pan and melt on low heat. Slowly stir in 1 3/4 cups of flour. Continue to stir and cook for 20 minutes. . Add the crab meat, 1 pound of slice okra, and 1 pound of collard greens cut into ribbons and simmer for 45 minutes until flavors blend. If you live somewhere where they sell fil\u00e9, or (sassafras leaves), then add this spice towards the end. I live in San Francisco and couldn't find it anywhere, so I added spices to taste. . The gumbo lets you stretch the crab far! I was able to serve about 25 people gumbo over white rice. \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [3, 0, 1, 2]\nD: [2, 3, 1, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_93_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_93_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_93_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_93_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [1, 0, 3, 2]\nD: [2, 3, 1, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. We start with the ingredients for the pastry cream: 4 egg yolks, 1/2 C sugar, 1 1/2 C whole milk, 1/4 C corn starch, 1 vanilla bean, and 2 Tbs unsalted butter.. 
Whisk together the yolks, sugar, and corn starch. Whisk vigorously until the stiff mixture loosens up and turns pale yellow. Then simmer the milk, and very slowly whisk the hot milk into the bowl with the egg mixture. Stir constantly until well combined.. Pour the mixture into a medium sauce pan and place on the stovetop over medium heat. Whisk this mixture vigorously and constantly. As it comes to a boil, it will start to thicken. Remove it from the heat as soon as you feel it tighten. Slice open the vanilla bean, remove the caviar, and add it to the cream along with the butter and whisk to combine. Place the cream into a bowl and cover it with plastic wrap, placing the wrap so it touches the cream. This will prevent a skin from forming.. Take one thawed piece of puff pastry dough (one sheet from a standard package of frozen dough). Cut eight ten centimeter circles with a cookie cutter. Take a smaller cookie cutter and cut the insides from four of the eight circles, to form four rings.. Place the four circles on a greased sheet pan. Brush them with egg wash (1 egg mixed with one Tbs of water). Place the rings on top of each circle and brush those with egg wash too. Let these rest on the counter for about 15 minutes while you preheat the oven to 400 degrees. Bake them for 20 minutes on the center rack.. While the Vol au Vents bake, make the coulis. A coulis is just a fruit puree that's used as a sauce. To make it, heat a pint of fresh raspberries (reserve four berries for garnish) in a small saucepan. Add 1/3 cup sugar, the zest of one lemon, and the juice of half of a lemon. Use a fork to smash the berries. Add a sprig of thyme, leaving it whole. Remove the sauce from the heat and let the thyme infuse it as is cools. Then remove the thyme and discard. Place the sauce in the refrigerator until you need it.. Pull a bit of pastry gently from the middle of each vol au vent so you have space for the cream. 
Spoon the cream into a freezer bag and snip a 1/2 inch opening into one of the tips with scissors. Pipe the filling into the pastries. Garnish each of four plates with some of the coulis. Add a raspberry and some fresh thyme leaves. Serve and enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [1, 0, 3, 2]\nD: [2, 3, 1, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_94_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_94_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_94_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_94_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [0, 2, 1, 3]\nD: [1, 3, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. First let's preheat the over to 350 degrees F. Then let's start by creaming the melted butter with the sugar. . Next we add our eggs and mix it together. . Now we add the vanilla extract and mix it in. . Next we combine the mixture we just made with our applesauce and mix it together. Just until incorporated, we don't want to go crazy with the mixing. :). Now we add our cinnamon and baking soda to the flour and whisk it together. Or if you like, you can use a sifter to mix the dry ingredients. Then add the liquid mixture to the flour and mix it together until you get a nice batter/dough. . Now we fill up the muffin cups about 2/3 to 3/4 of the way full depending on how large you want your muffins. Then put them in the oven to bake. 350 degrees F for 20 to 25 minutes. They will be nice and golden brown on top. 
Poke a tooth pick in and if it comes out clean and dry they are done, if they have a bit of batter on them, bake it for a few more minutes. . Now all you have to do now is enjoy them!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [0, 2, 1, 3]\nD: [1, 3, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_95_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_95_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_95_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_95_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [2, 1, 0, 3]\nD: [1, 2, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \n List of groceries:\n\n\t\t1 Can whole peaches if you can find them. \u00a0I used peach halves.\n\t\t1 Small bag\u00a0spaghetti.\n\t\tButter to taste.\n\t\tSeveral Goldfish Colors.\n\t\tSeveral\u00a0raisins.\n\t\tSeveral dried cranberries.\n\t\tYour choice of mixed vegetables for sea plants. \u00a0( I used carrots, onions, purple cabbage and parsley because that is what I had. )\n\t\tParmesan\u00a0cheese ( shredded.)\n\t\tParmesan cheese grated.\nPlease note: \u00a0I have\u00a0included a couple of hot dog versions. \u00a0You can see from the pictures I added a hot dog instead of a peach. \u00a0I also added different vegetables,\u00a0Ramon noodles\u00a0and a\u00a0bento\u00a0box.\u00a0It is basically the same idea though. \u00a0. 
For the octopus in a bowl you will need:\n\t\tKnife.\n\t\tCutting board.\n\t\tPan.\n\t\tColander.\n\t\tClear bowl.\n\t\tFork or spoon to stir.\n\t\tTooth Pick for poking the holes for eyes and mouth.If you want it in a\u00a0pedestal you will need this in addition:\n\t\tClear Pedestal bowl.\n\t\t2 Gallon zip lock bags.\n\t\tBlue food coloring.\n\t\tLED light.For the jar specimen you will need:\n\t\tEverything for making the clear bowl octopus plus:\n\t\t1 Small jar with lid.\n\t\tScissors.\n\t\tTape.\n\t\tBlue construction paper.\n\t\tStraight edge.\n\t\tLed light ( optional ). Spaghetti:\n\t\tCook the spaghetti according to the directions.\n\t\tStrain off the water.\n\t\tPlace in a bowl not the container it will be displayed in.\n\t\tAdd butter.\n\t\tCheese.\n\t\tSeasonings.\n\t\tStir well.\n\t\tPlace the\u00a0spaghetti carefully into the display bowl. \u00a0\u00a0\nIf you decide to add cooked vegetables instead of the raw go ahead and cook them at the same time you make the\u00a0spaghetti.. Center the octopus in the bowl:\n\t\tWith a tooth pick make the eye and mouth holes.\n\t\tPlace\u00a0raisins\u00a0and cranberries in the eyes and mouth. \u00a0\n\t\tYou might need to cut them smaller.\n\t\tPlace the octopus where you want.\n\t\tArrange the vegetables to look like a ocean scene.\n\t\tArrange the fish crackers in the front.. Jar:\n\t\tIf using hot dogs, cooked vegetables, or \u00a0noodles go ahead and cook them. 
\u00a0It is best to boil the hot dogs.\n\t\tWhile they are cooking start making the back drop for the jar..\n\t\tCut the construction paper half way around the jar so the octopus can be seen.\n\t\tTape it to the outside of the jar with the best side of the jar to the front.\n\t\tPlace the spaghetti or noodles in the jar carefully not to smudge the sides with oil.\n\t\tCut the legs for the dogs by looking at the pictures.\u00a0\n\t\tPlace the raisins and cranberries in the eyes and mouth of the peach or hot dog\n\t\tSet the octopus in the jar as desired.\n\t\tPlace the vegetables around the octopus.\n\t\tSet the jar on top of a LED light (\u00a0optional ).\u00a0\n\t\tAdd lid.\nI used\u00a0Vienna\u00a0sausages for the jar but\u00a0they were harder to work with.. \n Pedestal arrangement:\n\n\t\tFollow the directions for making the octopus in the previous steps.\n\t\tPlace the spaghetti in the\u00a0pedestal\u00a0bowl.\n\t\tArrange the vegetables around the octopus.\n\t\tPlace the Led light in the bottom of the stand.\nShown are 2 different arrangements. \u00a0 I filled 2 zip lock bags with water and blue food coloring.\nI double bagged it to be safe. \u00a0I really liked the fish crackers. \u00a0I think it brought a lot of life to the arrangement. \u00a0. Here is the Bento style:\nI arranged it in the same way as I did the other arrangements only I used a Wendy's box. \u00a0I thought it turned out real cute. \u00a0I only cut 6 legs because the\u00a0Vienna's\u00a0\u00a0were too brittle. \u00a0. These were a lot of fun to make. \u00a0I liked all the different variations. \u00a0The\u00a0Vienna's were harder to work with though. \u00a0You might have noticed that I braided some of the spaghetti. I think it was hard to braid because I cooked them a little too long. \u00a0I wanted to use the onion to tie ribbons on the ends of the braid but they did not work very well. \u00a0I tried celery but they did not work either. \u00a0If you have any suggestion please share them. 
\u00a0I think it would be awesome to have made them with braids and bows. \u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [2, 1, 0, 3]\nD: [1, 2, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_96_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_96_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_96_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_96_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 3, 0]\nD: [0, 2, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Before you start, it is important to get an assortment of different kinds of apples. Throwing golden delicious into the mix is always a good choice.. For the filling:6 Assorted apples1 Tablespoon of flour1 teaspoon of cinnamon\n1 Tablespoon (real) maple syrup\n(not pictured)1 Tablespoon of cold butter\n(chopped)2 teaspoons of fresh lemon juice\n(1/2 teaspoon of grated lemon peel - optional)For the crust (top and bottom):4 cups flours2 sticks cold butter (1 cup)1 teaspoon salt1 cup ice water. When making this pie, I made two separate balls of dough. One for the top, and one for the bottom.Alternately, I could have made one large ball of dough, and then separated it into two smaller balls.For the the first ball, we will mix together 2 cups of flour and 1/2 a teaspoon of salt. . Once the salt and flour are sifted, the next step is to slice a stick of cold butter into small pieces and mix them in with a pastry blender. Set aside the butter wrapper.. Mix ice cold water into the dough one tablespoon at a time.Start pressing the dough together with a fork. 
With each subsequent tablespoon of water, as it begins to bind, mix it together with your hands. Continue adding water until the dough can be formed into a ball. . Wipe the butter wrapper around on the inside of the pan to grease it. . Place the ball of dough on a well floured surface. With a floured rolling pin, spread out the dough into a large circle. . Transfer the dough to the pan, and press it into the bottom.Trim away any dough that extends past the edge of the pan. . For best results, lightly tap the dough on the bottom of the pan with a fork. . Cover the dough with plastic wrap and leave the pie crust in the refrigerator for at least 2 hours.. Repeating the process you have just followed to make the bottom pie crust, make another one for the top. Stick this in the refrigerator as well when you are done.. Peel and slice the apples. Place the slices in the pie crust. . Mix together all of the filling ingredients with the apples. . Place the dough topping over the top of the pie, and trim it to the edge of the pan. . Cut slits into the dough topping with a sharp knife so that the pie can breathe in the oven.With a pastry brush, coat the top of the dough with milk. This will give it a nice golden crust.. Bake the pie at 425 degrees for about fifty minutes with the top covered in aluminum foil. Remove the foil and continue baking until the top turns a nice golden brown.Once it is the right color, remove it and leave it on the stove top to cool. . After about an hour (or so), your pie should be cool enough to serve and eat. You made a pie. 
Hooray!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 1, 3, 0]\nD: [0, 2, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_97_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_97_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_97_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_97_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 1, 0, 3]\nD: [1, 2, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Here's the complete shopping list: \u00a0\n\t\t4 large eggs\n\t\t1 cup half and half\n\t\t1/2 cup milk\n\t\t8 oz. Mexican Chorizo sausage\n\t\t8 Jalapeno Peppers- medium\n\t\t1/4 cup Red Peppers- any variety\n\t\t6 oz Chihuahua cheese- (may substitute Monterey Jack)\n\t\t8 oz Colby-Jack Cheese\n\t\t1/2 plus 1/3 cup flour\n\t\t1/2 cup yellow onion- chopped\n\t\t3 Tablespoons Butter\n\t\tGarlic Salt\n\t\tBlack Pepper\n\t\tTabasco\u00a0\n\t\tGround Cumin\nNot to worry... I'll break this list of ingredients down in each Step as needed and used. ;-) \u00a0Important Tools: \n\t\tThis pie is made with a\u00a0savory\u00a0Dutch Baby \"pie crust\". \u00a0You'll need\u00a0to use a #5\u00a0cast iron skillet -OR-\u00a0a capacity-equivalent\u00a0ironstone baking dish for peak results.\n\t\tAn electric hand mixer. \u00a0\n\t\tlatex gloves to handle the jalapeno peppers.\n\t\tPotholders that withstand extreme heat.. Great news! \u00a0This prep work can be done up to 2-3 days in advance. \u00a0\nThe only thing you'll need to remember is to remove the foods from the refrigerator at least\u00a0a 1/2 hour before bake time so they'll be at room temperature. \u00a0No worries, though... 
I'll remind you. ;-) \u00a0\nLet's get started.Begin with the jalapenos:\nChoose jalapenos that will fit comfortably around the inside of your skillet.\nRoast and sweat the jalapenos before cleaning them. \u00a0If you're unfamiliar with the process, here's an easy, how-to technique to get you started. \u00a0Don't be tempted to forego roasting and sweating the peppers.\u00a0They need\u00a0to be par-cooked before going into the quiche.\nPut on the latex gloves. \u00a0Cut a small pie-shaped wedge out of each jalapeno. \u00a0Dice the wedges, put in a bowl and set aside. \u00a0\nUse your gloved \u00a0thumb (or a grapefruit spoon) to scrape out the seeds and membrane. \u00a0Discard them.\nCut off a 1/2' thick slice from the block of chihuahua cheese. \u00a0Trim it into a wedge that will fit snugly inside the hollow jalapeno.\nRepeat until all of the jalapenos are stuffed. \u00a0Cover and refrigerate if you aren't using right away. \u00a0\nSet the leftover cheese aside to grate later.\nThinly slice approximately 1/4 cup red peppers. \u00a0Put them in the bowl with the diced jalapenos. \u00a0Cover and refrigerate if you aren't using right away.Pre-cook the Chorizo:\nBefore cooking, remove the chorizo from it's casing. \u00a0Put it in a medium skillet over medium high heat. Add 1/4 cup of water. Bring to a boil, reduce heat and simmer/stir for 7-8 minutes or until the water has evaporated. \u00a0Cover and refrigerate if you aren't using right away.Saute 1/2 cup chopped yellow onion\u00a0in 1 tablespoon of butter until just limp. Sprinkle with garlic salt. Cover and refrigerate if you aren't using right away.Grate the extra chihuahua cheese pieces and the colby jack cheese. \u00a0You'll need about 1 1/2 cups of cheese for the quiche filling and 1/2 cup for garnish. \u00a0Hint: You can find chihuahua cheese in any Mexican Market. \u00a0Save the grated chihuahua for the garnish. 
It melts beautifully.Work Station Management:\u00a0\nJalapeno Popper Pie is a bing-bang-boom-BREEZE\u00a0to prepare if you're organized. \u00a0\nMeasure and set out all of the ingredients (except the grated cheese and the butter) 1/2 hour ahead of time.\nFor the Dutch Baby crust to rise successfully, it's imperative that the skillet is blistering hot and the eggs/milk are at warm room temperature. \u00a0Take them out of the frig at least an hour\u00a0before baking. \u00a0I'll reiterate that one more time, \u00a0just to make sure we understand each other. ;-). Place skillet inside the oven and preheat to\u00a0475\u00b0 for 10 minutes.Crust Ingredients:\n\t\t2 Eggs-\u00a0Room\u00a0temperature\n\t\t1/2 cup Milk- Room\u00a0temperature\n\t\t1/2 cup flour- sifted\n\t\t1/2\u00a0teaspoon Garlic\u00a0Salt\n\t\t1 generous pinch\u00a0ground Cumin\n\t\t4-6\u00a0dashes of Tabasco Sauce (Great flavor enhancer with minimal heat)\n\t\t2 Tablespoons COLD ButterPreparation:\u00a0\nIn a medium bowl, beat eggs with an electric mixer until thoroughly combined. \u00a0\nAdd milk, garlic salt, cumin and Tabasco sauce. \u00a0Mix well. \u00a0\nGradually whisk in flour until smooth.\u00a0\nRemove skillet from oven. \u00a0Add the cold butter to the skillet. \u00a0Use a fork to move it around so the bottom and sides of the skillet are well coated. \u00a0\nPour all the batter into the center of the skillet, then return it to the oven immediately.\nBake until puffed and browned, about \u00a010-12 minutes.\u00a0\nWhile the crust is baking, prepare the filling so it can be immediately added when the dutch baby crust is finished.\n\u00a0. \n \u00a0Egg filling ingredients:\u00a0\n\t\t2 large eggs\n\t\t1 cup half and half\n\t\t1/3 cup flour\n\t\t1/2 teaspoon Garlic seasoning saltAdditional ingredients- prepped in advance:\n\t\t1/2 cup sauteed yellow onion\n\t\t8 oz. 
Chorizo- pre-cooked\u00a0\n\t\t8 Jalapeno peppers-\u00a0stuffed with Chihuahua cheese\n\t\t1/4 cup red chili pepper- \u00a0thinly sliced\n\t\t2 cups Colby-Jack cheese- grated\n\t\tChihuahua cheese- grated for garnishPreparing the Quiche filling: \u00a0\nPut the eggs in a medium bowl. \u00a0Using the electric mixer, beat the eggs then add the half and half and garlic salt. \u00a0Beat on high speed until thoroughly mixed. \u00a0Slowly add the flour and beat until smooth.\nRemove the Dutch Baby from the oven.\nLayer it with onions, 3/4 cup colby-jack cheese and half of the diced peppers.\nUsing a measuring cup, pour half of the egg filling on top. Layer with 1/2 of the chorizo.\nArrange the stuffed jalapenos around the skillet. \u00a0Top with another 3/4 cup grated cheese. \u00a0Add the remaining chorizo and diced peppers.\nPour the remaining egg mixture around the jalapenos. Scatter the sliced red pepper on top.\nMove the skillet very carefully back into the oven.\nReduce the temperature to 350\u00b0. \u00a0Bake for 30 minutes, uncovered.\nCover loosely with aluminum foil, bake another 30 minutes.\nRemove from the oven, leave covered and allow the quiche to rest for 10 minutes.. 
Sprinkle the quiche with the extra cheese.\nServe the slices with sides of salsa, sour cream and warm flour tortillas .\nEnjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 1, 0, 3]\nD: [1, 2, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_98_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_98_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_98_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_98_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [2, 1, 0, 3]\nD: [1, 0, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need the following - 1 box of your favorite pasta - 1 can of Campbell's\u00a0Cheddar Cheese - 1 can of\u00a0Campbell's\u00a0Cream of Mushroom Soup - Breadcrumbs - 4 Tbsp salt - 18 oz. of extra sharp cheddar chese - Milk - Cutting Board - Knife - Large pot - Colander\u00a0 - Large Casserole Dish with lid - Wooden Spoon Preheat you oven to 375. - Fill pot with water about 3/4 of the way full - Set on stove to boil on high. - Add 4 Tbsp of salt to water - Stir until dissolved. - Cut 18 oz. of the sharp cheddar cheese into 1inch cubes. - Add the pasta to the boiling water - Follow the directions on the pasta box tho see how long to biol the pasta. - Cook to Al Dente. Pour cooked pasta and water into a colander in your sink to drain.. Add the cream of mushroom and the cream of cheddar to the pot. (If you are not a big fan of mushrooms then use two cans of the cheddar). Fill your now empty cans with milk. Pour your two cans of milk into the pot and set to medium-low heat.. - Add the cubed cheddar cheese to the pot - Melt, stirring frequently. 
- Add the pasta back to the pot. - Stir until well coated. Pour Macaroni and Cheese into the casserole dish. Sprinkle breadcrumbs on top and place in oven for 30-45 minutes or until top is golden brown and bubbling.. Carefully take out of the oven and enjoy.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [2, 1, 0, 3]\nD: [1, 0, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_99_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_99_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_99_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_99_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [2, 0, 3, 1]\nD: [0, 2, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. As a base for the fillings, you can use wraps, tortillas or lavash bread. Trader Joe's has a good soft lavash bread. You may run into a cracker-type that needs to be moistened with water, but I wouldn't mess with that on your first attempt. I used wraps in these examples, partly because they come in a variety of colors & flavors, and they work well for the heart shape. Fold the circular wrap or tortilla in half, making a crease that will be the bottom of the heart.For fillings, choose one or two spreads, plus at least 6 other items from the list below, or use what you have! It's good to have everything prepared, open and ready to go before you starting building your lavash.Ingredients:Wraps, tortillas, or lavash bread Spreads: hummus, cream cheese, goat cheese, pesto, refried beans, creme fraiche, sour cream, etc. 
Sliced cooked meats, and sliced or shredded cheese Shredded vegetables: carrot, cabbage, jicama, radish Fresh spinach or other greens Sugar snap peas (whole) Avocado Thinly sliced tomato (drain on paper towel), bell pepper, green onion, red onion, cucumber Grilled eggplant (thinly sliced) Olives, pickles (gherkins or cornichons are good), capers Condiments: salsa, salt & pepper, fresh basil, parsley, cilantro, chives, garlicEquipment:Knives and spatulas for dipping and spreading Grater or food processor Plastic wrap Sharp knife for slicing. You can build as many as will fit your workspace, but two at a time works for me.Use one or more spreads for each. You don't need a lot, but you do need to spread it all the way to the edge. The spreads have the advantage of being the \"glue\" that holds the whole thing together, as well as adding flavor and moisture.Examples: Hummus and cream cheese, pesto and creme fraiche, refried beans and sour cream.. Next add your toppings in rows. Choose compatible flavors, and contrasting colors and textures. You want it to be full, but still be able to roll it up. Keep in mind that when you slice the lavash, you will be seeing everything in cross section, so align spreads (if multiple) and rows of topping with the crease, not perpendicular to it.Some possible combos:Hummus, cream cheese, thinly sliced red pepper, spinach, thinly sliced tomato, shredded carrot, green onions, black olives Refried beans (may need to thin with water for easier spreading), sour cream, shredded pepper Jack cheese, salsa, cilantro, thinly sliced red onion, thinly sliced tomato, avocado Pesto sauce, creme fraiche, sliced ham, thinly sliced tomato, spinach, sugar snap peas, thinly sliced provolone cheese, thinly sliced yellow bell pepper.Salt & pepper or other condiments to taste!. For heart-shaped lavash, you roll each side only half way, meeting near the middle. 
Fold gently along the crease.For a traditional lavash, you start at one end and roll it all the way. A little spread on the end flap helps hold it together.. Wrap each lavash snugly in plastic. Once you have done that you may need to slightly flatten or pinch the creased side again to create a heart shape.Chill for at least an hour before slicing; longer if possible.. When you're ready to serve, unwrap and cut the lavash into slices 1.25\u20131.5\" thick. Use a sharp knife or serrated knife for best results. You can gently reshape the pieces for the best look if needed. Arrange on a platter in a single layer or stack in several layers.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [2, 0, 3, 1]\nD: [0, 2, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_100_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_100_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_100_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_100_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [1, 0, 2, 3]\nD: [0, 3, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This tree stump is going to be a great place for one of our cute little birds to perch. It's also going to add height to the cake making it appear larger and more complicated than it is! You will need:light brown fondantdark brown fondanttoothpickSCULPTING THE TREE STUMP 1. Cut a 1 1/4 ounce piece each of dark brown fondant and light brown fondant and place on top of each other.2. In a twisting or pulling motion mix the two colors of brown fondant together to marble them slightly. You can see I only have one distinct vein. 
More would be great!3. Roughly form the fondant into an egg shape and flatten the top by pressing with your fingers.4. Squeeze the sides of the stump to give the stump ridges and to elongate it.5. Press or pinch the bottom of the stump with one finger in random places to give the effect of roots.6. Pinch the top edges to sharpen the 'cut' edge of the stump.7. Using a toothpick, score or scrape lines in the sides of the stump to resemble bark and circles on the top of the stump to create rings.. We are going to stain toothpicks with food coloring to make stick legs for our birds!You will need: four toothpickspaper towel or napkinbrown food coloringMAKING THE LEGS1. Dip the toothpicks into the food coloring.2. Remove and place on the paper towel without rubbing off the excess food coloring.3. After about 10 minutes wipe off excess food coloring with the paper towel. The toothpicks should be stained a medium brown color.4. Set aside to dry.. Oftentimes the simplest of shapes make the cutest characters. Sometimes leaving it simple is best! These birds are definitely simple! You will need: light pink fondant - 1 ouncedark pink fondant - 1 ounceorange - pea sized amountbrown - teeny, tiny piece!turquoise - enough for two very small eyesclear vanilla extractfood safe paint brushrolling pinrolling matMAKING THE BIRDS1. Remove a small piece (large pea size) of light pink fondant and set aside for the wings.2. Between two hands, roll the larger piece of fondant into a ball.3. Place on your rolling mat and with your hand roll the piece back and forth with gentle pressure until you start to get a cone shape (without a point).4. Using your fingers, press or pinch the smaller end flat to create the tail and curve up by pressing gently with your fingers.5. Divide the remaining piece of fondant in half and roll one half into a ball. Press to flatten slightly.6. Slightly pinch one side, and curve the pinched/pointed side up slightly. You should have a paisley shape wing. 
Repeat for the other wing.7. Pour a small amount of clear vanilla extract or vodka into the cap or a small bowl and apply a very small amount of extract (using a food safe paint brush) where you want the wing to be located on the bird body. Gently press the wing onto the body. It should stick immediately! Repeat on the other side of the bird. (Extract will leave a shiny or glossy appearance on fondant so use sparingly and try to be as exact with it as possible.)8. Divide the orange fondant in half and set one half aside reserving for the next bird and roll into a ball and form into a cone with your fingertips. This is the beak.10. Apply a dab of extract and adhere the beak to the front or head of the bird.11. Roll a very small amount of turquoise fondant into a ball and flatten. This will be the iris. Apply to bird with a small amount of extract.12. Roll an even smaller amount of brown fondant into a ball and gently press it into the turquoise fondant eye creating a pupil. Since it's so small of a piece it should stick without fondant, but if it doesn't, secure it with extract. Repeat steps 11 and 12 to make another eye for the back side of the bird.13. Repeat the entire process to make one more bird in your choice of color!. There are tons of ways to make fondant roses. I will cover three uncomplicated ways to make cute roses to top our cake with!You will need:light yellow fondant (1 ounce)dark yellow fondant (1 ounce)small round cutters - 1 inch and 1 1/4 inch (you can also use a shot glass, biscuit cutter, or make a circle pattern from thick paper and trace around it with a knife)paring kniferolling matrolling pinpetal foam (or any craft foam)ball head sculpting tool (you can also use a rounded end drink stirrer, a melon baller or a dry finger)FLOWER #11. Roll out a small piece of light yellow fondant and cut out six small circles.2. Using your finger, press the edges of each circle to thin them.3. 
Line the circles up and gently press them together at each overlap to secure.4. Flip the line of circles over so the wrong side is facing up.5. Roll the circles up from one end to the other.6. Cut the roll in half to create two roses.7. Pinch the bottom of the roses to open them slightly.8. Peel back a few of the petals to make the rose look more natural. Set aside to harden. Repeat to make more roses.FLOWER #21. Roll out a long piece of light yellow fondant and cut a 1/2 inch by 6 inch strip.2. Remove excess fondant and set aside.3. Press ridges in one side of the fondant with your finger. Continue along the entire edge of the strip.4. Pick the fondant up and roll the strip until you reach the desired size. If 6 inches is too long or makes it too thick of a flower you can cut it short anywhere you would like.5. Pinch or squeeze the bottom of the rose to make a stem and open it slightly.6. Cut off the stem so the flower will sit upright on the top of the cake. Set aside to harden. Repeat to make more roses.FLOWER #31. Roll out a piece of dark yellow fondant.2. Cut ten to fourteen, 1 1/4 inch circles and remove excess fondant.3. Roll a small sized piece of fondant into a ball. About the size of a large marble.4. Roll one end making it into a cone shape. The same way we made the bird! This will be the center of the rose. Set aside.5. Place one circle on the petal foam and in a back and forth motion with the ball head tool flatten and ruffle the edges. Repeat with remaining circles.6. Wrap one circle around the center piece of the rose.7. Repeat with remaining circles placing them randomly so the seams don't match up. You can make the rose as small or as large as you want. Fold the petals back slightly to give the flower an open look.8. Pinch the bottom edge when finished to create a stem and cut the bottom so the rose will sit flat. Repeat the process to make more roses. . 
If you haven't already done so, mix the colors of fondant you wish to use in the pattern of the cake. I used eight different colors (nine shown) but more or less would be fine.This is where you get to be creative and think about repeating patterns. What shapes and sizes do you want for your cake? I chose a superellipse shape and used all the same size but you could experiment with various sizes of the same shape for a unique effect. Examples of other shapes that would make fun repeating patterns are: rhombus, stars, heart, circle, octagon, crescent, parallelogram, etc.You can also\n find a lot of cutter options in the clay modeling section of the craft \nstore. Let's get started!You will need:geometric cutter (I got my superellipse from a small set of inexpensive cutters from the Duff Goldman collection that I found at Micheal's.)rolling pinrolling matparchment or waxed paper lined sheet pan (You can also just use your countertop or a table!)MAKING THE PATTERN1. Roll out one color of fondant very thin on your rolling mat using a rolling pin. When I say 'thin' it should basically be as thin as possible without tearing.2. Using the wrong side of the cutter cut out 15-20 pieces or shapes. If you use the wrong side (folded metal side) of the cutter you will get an exact shape with sharp corners. If you use the correct or sharp side of the cutter you will get rounded edges. This drives me nuts, but some people don't mind at all!!! You can see the difference in the picture (left cut-out = wrong side of cutter; right cut-out = right side of cutter). If you are using a cutter with rounded shapes like a cloud or a flower it won't make much of a difference which side you use.3. Smooth the sides if any rough edges are present by folding or gently squeezing the frays to the back side of the shape. 4. Transfer pieces to a baking sheet lined with parchment (or waxed) paper. Make sure they are not touching so they don't stick together! 
If you don't have a baking sheet you can place the parchment directly on your table or countertop in a place that won't be disturbed.5. Repeat with remaining colors!. Here we will follow simple steps to apply a repeating pattern to our fondant cake.You will need:fondant covered cakeparing knifefondant shape cut-outsclear extractsugar pearlscake stand or base1. Cut one of the fondant pieces in half from point to point using a sharp knife. Do this with one of each of the remaining colors equaling 18 halved pieces total. You may need more later but this is a good starting point for now.2. Apply a small dab of clear vanilla extract to the back of the fondant piece and place the top flat edge in line with the top edge of the cake.3. In a random pattern of colors apply pieces straight down in the same manner cutting the bottom most piece as necessary. I only needed to cut a tiny little piece off to make it fit at the bottom edge of the cake.4. Continue around the entire cake trying to keep the pieces straight and the colors in a random pattern.5. Once you have finished applying the pattern press a sugar pearl into each corner where the pieces meet. If the pearls don't stick by pressing into the fondant you can apply a small amount of extract to get them to stick. Sugar pearls will acquire a strange thick gooey film where they come into contact with extract so use only if necessary!!We are ready to decorate and complete the cake!. Arrange flowers, birds, and tree stump in any manner you wish! I ended up using two birds, four roses, and six small flower cut-outs with pearl centers (same flower technique as from the drip cake) on the top of the cake. Here's an idea of how to arrange your fondant decorations:1. Decide which side you want to be the front of the cake.2. Place the tree stump toward the back of the cake just off center to the right and place a large rose on the edge of the cake near the front right. 
Add some small roses and flowers around in groupings or any arrangement you like. Secure with extract.3. Push toothpick legs into the birds. I left off the back wings so the bird could harden slightly when I was working on other things.4. Stick one bird so it is standing on the stump and one toward the front left. Adhere the back wings on each bird with extract.Congratulations!! You should be very proud of yourself! You have completed a beautiful decorated cake that is ready to serve! If you don't plan on serving the cake right away, leave it out at room temperature for up to 3 days until you want to slice it and serve it. Do not refrigerate!If this cake is for a birthday party and you choose to put candles in it they will easily poke through the fondant and stand securely.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [1, 0, 2, 3]\nD: [0, 3, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_101_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_101_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_101_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_101_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 3, 2]\nD: [0, 3, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 2 oz. Tequila - Use a good silver tequila1 1/2 oz. Orange Liqueur (i,e. Cointreau or Grand Marnier)2 oz. Margarita mix (opt. Use for a sweeter finish, otherwise omit)1 oz. Fresh Lime Juice1 oz. Orange Juice1 splash SpriteIceJalapeno stuffed olives!. Into a cocktail shaker, measure tequila and orange liqueur.. 
Roll lime firmly under your palm, or microwave it for 30 seconds to release more juice.Squeeze lime into shaker.. Add orange juice and ice, and shake.Top off shaker with a splash of Sprite (or other lemon-lime soda).. Pour into a chilled martini glass and garnish with three olives.Serve with shaker.Enjoy responsibly!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 3, 2]\nD: [0, 3, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_102_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_102_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_102_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_102_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [1, 3, 0, 2]\nD: [1, 0, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
\nrectange cake board\ncircular cake boards\nspatula\ncake mix or your favorite scratch recipe (see my\u00a0Old fashioned sour cream fudge recipe\u00a0below)\nheart shaped cake\u00a0pan\ncake release\nrolling pin\nsaran wrap\nred or pink pearl dust\nclean (new) make- up brush\npliers\ndowel\nscissors\nsharp knife\nblack marker\nroller cutter (optional)\nred gel paste food coloring (if using white fondant)\nfondant ( you can use white and color or purchase red fondant)\ncandy cups\nchocolates (at least\u00a024\u00a0 )\nfood wrap and tin foil to cover\u00a0cake board\u00a0(optional)\nyour favorite buttercream icing (see my favorite below)\ngumtex or tylose or use\u00a0gumpaste insteadOld Fashioned Sour cream fudge cake\u00a0\u00a0Ingredients:AmountIngredient2 \u00bc cupscake and pastry flour2 tsp.Baking soda\u00bd cupbutter, softened2 \u00bc cupsfirmly packed brown sugar1/2 tspsalt3eggs1 1/2 tspvanilla1 cupboiling water3 ouncesbakers unsweetened chocolate (melted01 cupsour cream (cooled)\u00a0 \u00a0 Directions:Sift together flour, baking soda and salt; set aside. Cream butter. If you use salted butter (skip the salt). Gradually add brown sugar and continue beating for 5 minutes. Add eggs one at a time, beating well after each addition. Add vanilla and chocolate. Alternately blend in flour mixture and sour cream, one third at a time, on low speed of electric mixer. Add boiling water; blend well. (Batter will be thin.) Pour into one greased and floured, waxed paper lined 9 \u00bd inches layer pan. Bake at 350 degrees for 35 to 40 minutes, or until cake tester inserted into center comes out clean. Cool in pans for 10 minutes. Remove and finish cooling on racks.Optional Filling: Kirsh Cream with Strawberries\n\t250 ml. Heavy cream 250 g. chopped strawberries (about 1 \u00bd cups)\n\t1 to 1 \u00bd tbsp. Kirsh cream or any other\n\tfruit liquer.\n\tBeat cream until whipped. Fold in strawberries and liquer and fill cake. 
\n\t\u00a0\u00a0Frosting: 5 squares Unsweetened Chocolate \u00bd cup butter, softened 1/3 cup water 3 cups icing sugar 1 egg Melt chocolate with butter and water over low heat; cool. (Mixture may appear curdled.) Add icing sugar and egg. Blend; then beat on low speed of electric mixer for 2 minutes. Chill until of spreading consistency.\u00a0Alternative Frosting (Bittersweet Chocolate Frosting): Amount is for a wedding cake therefore cut in half. 1 lb. Bittersweet chocolate, chopped \u00be cup heavy cream 3 tbsp. Unsalted butter In medium saucepan, boil water. In medium steel bowl combine approximately 2/3 of the chocolate and cream. Place bowl over saucepan and sir frequently until melted and smooth. Remove from heat and stir in remaining chocolate until smooth. Gradually beat in butter, 1 tablespoon at a time. Let stand until cooled to room temperature. \u00a0Bittersweet Chocolate Whipped Cream Buttercream IcingIngredientsPart One 1 lb. powdered sugar (sifted) 2 1/2 cups Crisco, 4 oz melted bittersweet chocolatePart Two 3/4 cup granulated sugar 1/2 tsp. salt 2 TBSP. Meringue powder (add 1 additional TBSP for slight crusting) 1/2 cup BOILING water (less 2 TBSP) 1 TBSP Vanilla (or flavor of your choice)InstructionsPart one... put crisco in bowl and gradually add powdered sugar. Beat about 5 minutes until mixture is very creamy and fluffy then add melted chocolate squares.\u00a0Set this aside.Part two... In a very clean bowl mix dry ingredients. Add BOILING water and immediately mix on high speed. Beat until stiff peaks form, about 8 minutes. When mixture begins to get stiff add flavoring.NOW combine both mixtures\u00a0and beat together for another 8 minutes. When finished, use a rubber spatula to down beat a little to remove some of the air bubbles. Frosting will be very light and creamy. Cover. DO NOT REFRIGERATE.The frosting may be kept at room temperature for 3 months. Whip with a spoon each time you use it to restore fluffiness.. 
Optional: Line the cake board with tin foil and food safe plastic wrap (this is not necessary but makes it easier to wipe messes off the board) I usually use neutral gold or silver gift wrap I purchase at Michael's , but I had run out. . Bake 2 heart shaped cakes. I always use a generous amount of Cake Release to prevent the cake from sticking. Level the cake, but cut \u00a0the one for the top (lid) of the cake a little shorter\u00a0than the bottom and place it on a circular cake board. Put it aside.\nPut the bottom cake on the main rectangular cake board. Fill the bottom cake \u00a0with filling of your choice (this is optional). Ice the cake, being sure to fill in the area where the cake was cut to fill, if you filled it. This doesn't have to be a thick layer covering everything, only a crumb coat. If a few crumbs mix in, it's not a big deal. Smooth as best as you can.\nRepeat for the top of the cake. It is important to get the top of the cake very smooth, as you will be placing fondant on top of it.\nTip: Take a metal spatula, soak it in boiling water (I use a pot on the stove)\u00a0 and wipe the\u00a0water off on a clean tea towel, then\u00a0smooth the icing with the dry hot spatula over the surface of the cake.\u00a0\u00a0Then remove excess icing off spatula. \u00a0Keep repeating until your cake is smooth. Add\u00a0about 2 Tblsp of \u00a0tylose or gumtex to\u00a0your\u00a0fondant, roll out and cut into a long strip. Alternatively, you can use gumpaste that can be purchase at Michael's craft store or any cake decorating store. But you will still have to color it.\nMake sure the strip is wide enough to go about a 3/4 of an inch above the cake (measure with chocolate on top) and let\u00a0the strip\u00a0dry for about 15 minutes. It needs to be dry enough so it won't sag or droop.\nCarefully\u00a0place the\u00a0strip (you will\u00a0likely need 2) \u00a0around the cake,\u00a0and close the seam at the back with a little water.\u00a0. 
If you have a\u00a0sugarcraft gun,\u00a0 then use the rope attachment to make the rope border.\nIf you do not, then roll out 3 narrow strips with the flat of your\u00a0hands and twist the pieces\u00a0\u00a0together. Don't worry if it doesn't go all the way around. You can do it in pieces and use a little water to 'glue' it together - it won't be noticeable.\nThen 'glue' the strips on the cake with a\u00a0 little water. Do a little strip of rope\u00a0for the seam at the back. And you will also do this for the top of the cake when the time comes. . Fill the surface of the cake with chocolates in candy cups (you can buy at Michaels;) . You will need at least a 24 chocolates. . Outline the circular cakeboard and cut to fit under the top\u00a0cake. You will need this to support the cake. . Roll out colored fondant (1/8\" thick ) and cover the top of the cake. I usually just guage how much I need by looking at it. But you can tell approximately how much you'll need be measuring the cake across and adding a couple inches all around. You can cut off the excess and reuse. If you have no idea how to smooth fondant on a cake, google it - there are lots of tutorials. Some prefer a smoother, but I use my hands (wedding ring off!)\nPlace on cake, smooth and trim.\nTip #1: Stick\u00a0the top of the cake\u00a0in the freezer for 10 minutes while you roll out your fondant - this makes it easier to cover with fondant. Don't leave it longer than 10 minutes!\nTip#2: To transfer the fondant, I roll it up a little with my rolling pin and gently unroll over the cake. . \nRoll out the remaining \u00a0fondant with gumtex or tylose (or gumpaste) \u00a0as thinly as possible (as least half as thin\u00a0as you rolled it to cover the cake)\n\u00a0Cut two lengths of the\u00a0fondant (or gumpaste) \u00a0the same length and width. These will form the loops. I generally cut mine around 7.5 cm/3 inches wide and about 15 cm/6 inches long. 
The length of these loops will determine the size of you bow, so If you want a bow about 10cms/4 inches long the loops will need to be a little more than double that length when you cut them. Its a little bit of trial and error, but the length can be adjusted after they' ve been cut quite easily.\nTurn one loop piece over and pinch the ends together, then do the same with the other end, and pinch the two ends together. Put some saran wrap in the bow pieces to set it in place.\nRoll out the tails of the bow in the same manner as the loops but make them a little thinner, maybe \u00be of the width of the loop pieces. Cut the ends at a 45 degree angle. Pinch them as you did the loop piece.\nMake the centre knot with another piece of fondant, rolled and marked in the same manner as the other pieces, but only make it about \u00bd the length of the tail pieces. The knot is just a short strip (maybe 1' by 1\") and it is just wrapped around all the other scrunched up ends so that there arent any rough edges showing. It doesnt need to go all the way around the back of the bow, just tuck the edges under so they dont show.\n\"Glue\" the pieces together with a little water on a paint brush\nCut a long, narrow strip and put directly on the cake\nDry brush on the red or pink pearl dust (I use a never used new make-up brush).\nThen place the bow on the cake on top of the narrow strip.. Take a wooden dowel (I use the wooden ones you can buy in art section at Michaels and boil it) , cut to size with pliers and sharpen with pencil sharpener. It should stick out about 1- 1 1/2 \"\u00a0above chocolates. Carefully place the top of the cake onto the sharpened dowel. You may need to poke a little hole in it from behind first (through the back and into the cake board.) 
You want it resting just above the rim of the bottom cake, so it doesn't put weight on the rim and wreck it.\nServe and enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [1, 3, 0, 2]\nD: [1, 0, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_103_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_103_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_103_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_103_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 1, 3, 2]\nD: [0, 2, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Take large bowl and mix Suji, 1/2 tea spoon salt, baking powder. Take enough warm water to knead the flour into a hard lump. Then add basan and maida and knead it again.Then let it sit for 20 minutes in a thin wet cotton cloth.Then make small balls of all the kneaded flour as in the picture.. Now press the balls created into flat circular cakes. Make sure that the flat cakes are thin but not too thin. Keep the pressed balls on wet cotton cloth so that the flour does not dry out.(You may want to experiment some thicknesses in the next step).. Preheat vegetable oil on high.Take a sieve and a large bowl - to collect the extra oil from the deep fried gol gappasTake a plate and cover with paper towel to further collect the extra oil on the gol gappas.. Now take the pressed flat cakes and put into the hot oil one by one. Make sure that the oil is hot enough that the cakes become fluffy and round right away.Once golden brown turn all the gappas on the other side. 
Make sure the heat is now on low to cook the gappas well and not burn them.Take the gappas out once deep golden brown and put them on the sieve using a strainer. Be careful the oil is very hot!. Boil 3 potatoes and peel them.Then cut into small pieces and add roasted cumin seeds along with cut cilantro as in the picture. Mix it a bit but not too much (don't make it a paste).. Mix in a mixer:Mint, cilantro, mango, lemon, cumin seeds(roasted), salt, tamarind paste, black salt, sugar, waterTaste when fully mixed. You should be able to taste it as a tangy (and sweet) mix. Adjust the ingredients to suit your taste.For taste you can add boondi as well.. Now take a gol gappa and make a hole on top (on the softer side).Add a little potato mix created into the hole and add gol gappa water into it as well. If the gol gappas are made right they will not leak!Enjoy the full gol gappa in one bite. Eat as many as you have or like...\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 1, 3, 2]\nD: [0, 2, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_104_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_104_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_104_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_104_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [1, 0, 3, 2]\nD: [2, 0, 1, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
For the crust:\n2.8 oz cold butter\n3.75 oz flour (pastry flour, if you have it)\n1/3 tsp salt\n1/2 tsp sugar\n1/2 tsp vanilla\nSome ice water (more on that later)\nFor the filling:\n3 eggs\n6 oz sugar\n1 tsp vanilla\n2 bananas, pureed\n1/2 cup milk\nPinch salt\nPinch cinnamon\nPinch nutmeg\nFor the meringue:\n6 egg whites\n10 oz sugar. First, mix your dry ingredients (not counting butter) until everything's well dispersed. A note before we get into things: From here on out, the colder you can keep things, the better. It's perfectly okay, and preferable if you have the time and patience, to chill your ingredients and utensils before every step. And definitely let the dough rest between steps, in the fridge or freezer. Back to the directions; Next, using a pastry cutter or two knives, blend the butter into the dry mixture until your butter is about the size of rice. Then slowly add your vanilla and then your ice water, until the dough comes together. It shouldn't be wet, however, so just take it slow. Once that happens, let the dough rest a bit, then give it just a few kneads on a lightly floured surface, just so that it doesn't break too easily. Then chill it, roll it out to about 1/4 inch, and place it in your pie pan, an 8 inch one is preferable. Be careful not to break or stretch it. Trim the edges about 1/2 inch over the edge of the pan, and then roll that excess under and crimp however you like. Freeze it until it gets cold.. This filling is actually a modified pumpkin pie filling (a type of custard pie), just made with pureed bananas instead of pumpkin puree.\nFirst, mix everything except the eggs in a food processor until well blended. Then add the eggs, pulsing the processor until they're incorporated, but not over mixing. We don't want any air to get into the eggs, if possible. Next, chill this mixture. Pour it into your pie dough, egg wash the edges, and bake it at 350 for 20-40 minutes, until the center of the pie sets. 
Then cool the pie so that it doesn't melt the meringue.. Note: This is a method for a Swiss Meringue. You could also make an Italian Meringue if you had the know how and the inclination. I know a French Meringue (in which one just whips up sugar and egg whites) is easier, but this method is preferable if you want to avoid things like salmonella.\nBegin by heating the egg whites and sugar, whisking gently, in a double boiler. Heat this mixture until it reaches 100 degrees, whisking moderately all the while. When the mixture reaches 140 degrees, remove it from the heat and whisk (or put into a mixer and put on medium speed) until medium-stiff peaks for. Transfer this to a piping bag (or just spread it out with a spatula) ASAP and pipe on to the top of your cold pie. To finish it, use a torch to brown the tops. Then, most importantly, cut and enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [1, 0, 3, 2]\nD: [2, 0, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_105_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_105_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_105_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_105_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [2, 0, 3, 1]\nD: [1, 3, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Leftover mash1 eggFlour, a few tbspsbutterFor the purpose of this instructable I boiled and mashed my potatoes rather than using leftovers. I used 6 large potatoes, so, at a guess, I have enough for at least 20 potato cakes.. 
Peel, quarter and boil your potatoes until soft and mashable.Drain.Mash with a good knob of butter and leave to cool.If using cold, leftover mash then melt your butter first and then mash into the potato.Add 1/2 the egg and a good scoop of flour. Mix and mash until you have a soft, dough-like consistency.You may need more egg or flour depending on the type of potato and how much you have.For my vatful of spuds, I used the whole egg and about 4 tbsps of flour.. Use your hands to bring the potato together into a dough.Turn out onto a well floured board and knead a little, adding more flour as necessary.Pat out to a large circle about 1/2 inch thick. Or thicker if you like.Cut into rounds or triangles.I started off using a pastry cutter to make rounds but now find it easier to just pat out circles by hand.Make sure each cake has plenty of flour top and bottom.. Add a knob of butter to a heavy frying pan. (It just has to be butter I'm afraid!)Add a little oil to stop the butter from burning.Heat up the butter and carefully add the cakes.Don't cook too many at once otherwise they will become difficult to turn.Leave for about 5 minutes before turning.Thicker cakes will take longer, up to 8-10 per side.The butter in the pan will begin to darken so if you're cooking in batches you may need to wipe out the pan and use fresh butter. Enjoy for breakfast, lunch, dinner, whenever! 
Allow to cool before wrapping and freezing.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [2, 0, 3, 1]\nD: [1, 3, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_106_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_106_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_106_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_106_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [3, 2, 1, 0]\nD: [3, 1, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Marshmallow Fondant:2 cups mini marshmallows 3 cups powdered sugar 1 tbsp waterCake:1 box carrot cake mix 1 cup shredded carrotsFrosting:1/4 cup butter (softened) 2 cups powdered sugar 1/8 tsp vanilla extract 1/8 cup milk Plus: 1/2 cup shredded carrots to cover the sides of the cake.You will also need to print a rabbit template. Cut out the individual pieces of the rabbit's body and set the pieces aside for later. These will be used to shape cake and the fondant.. Marshmallow Fondant:2 cups mini marshmallows 3 cups powdered sugar 1 tbsp waterPut the marshmallows and water into a microwave safe bowl. Microwave 30 seconds. Remove from the microwave. The marshmallows should be very soft, fluffy and mostly melted. Stir with a spoon. Microwave again for 10 more seconds and mix until smooth. Then, fold in the sugar one cup at a time until it turns into a dough.The next step is kneading. To prevent the mixture from sticking to everything, sprinkle powdered sugar on your hands as well as on the surface you will be kneading on (you can also use butter or shortening to preventing sticking). 
Remove everything from the bowl and knead it with your hands for about 5 minutes until it is smooth.. Roll the fondant out into a thin sheet big enough to cover the area of the template. Lay the paper pieces on top of the fondant so you will know exactly where to cut. Use a knife to carve the pieces out.. Carrot Cake: 1 Box Carrot Cake Mix (plus whatever additional ingredients the mix calls for like eggs, water, oil, etc.) 1 cup shredded carrotsPrepare your mix by following the directions on the back of the box. Then, mix in 1 cup of freshly shredded carrots. Pour the batter into a greased 11\" x 7\" baking dish and bake as directed (Approximately 40 minutes at 325 degrees).. Once the cake has cooled, remove it from the pan so that you can level it. To level the cake, cut off the rounded top by slicing straight across with a large knife. Now that you have a level cake, you're ready make it look like a rabbit. Lay the paper template pieces on top of the cake, line your knife up with edges of the paper and cut straight down.. Frosting: 1/4 cup butter (softened)2 cups powdered sugar1/8 tsp vanilla extract1/8 cup milk In a bowl, use an electric mixer to cream the butter. Then, gradually add in the powdered sugar. It shouldn't look like much has happened because the mixture will still look like powder. Beat in the milk and vanilla extract- this will make a creamy frosting.. Cover the sides of each piece with frosting and roll the pieces in shredded carrots. Then, spread frosting across the top and attach the fondant pieces. 
Put the pieces of the rabbit together and you'll have a rabbit cake!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [3, 2, 1, 0]\nD: [3, 1, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_107_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_107_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_107_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_107_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [3, 1, 0, 2]\nD: [0, 2, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. IMPORTANT:The icing recipe below is only enough to have a very thin layer on each waffle, including the top. It is not enough to ice the sides of the cake, as you can see in the picture. So if you want to have more, and thicker icing, I definitely suggest you double the icing recipe. (That is, if youmake 4 large Belgian waffles as the instructable says). If not, you have a minimally-iced cake, like I did, which is fine.Ingredients;For the waffles: (makes 4-5 large Belgian waffles) 3 eggs 1 cup coconut milk \u00be cup warm coffee \u00bd cup chocolate chips, melted \u00bd cup vegetable oil 4 tablespoons sugar 1 \u00bd cups white whole wheat flour 1 cup all-purpose flour \u00bd cup cocoa powder 4 teaspoons baking powder \u00bc teaspoon salt \u00bc teaspoon vanilla For Frosting: 1 cup chocolate chips (= 8oz baking chocolate) \u2154 cup heavy cream 4 tablespoons butter 1 teaspoon vanilla extract \u00bc cup sugar 1 teaspoon corn syrup. Prepare a large Belgian waffle iron according to manufacturer\u2019s instructions.This is the one I used: Cuisinart Belgian Waffle Maker. 
In a separate medium bowl, combine flours, 4 tablespoons sugar, cocoa, baking powder, and salt.. Dump eggs, coconut milk, oil, and vanilla, into a large bowl. NOTE:I tried using a blender, which is why I have pictures of it; but it was a bad idea: the batter was too thick, and I ended up using a bowl instead. So don't be confused by the pictures ;). Dump the dry ingredients into the large bowl with wet ingredients, and mix until smooth and incorporated.. Heat the coffee up in a microwave-proof dish.Dump in 1/2 cup chocolate chips.Stir until melted and combined with coffee.Pour chocolate/coffee mixture into batter.Mix until combined.. Cook batter in waffle iron, according to instructions. (With my waffle iron I used the no. 3 setting)You should turn out with 4-5 large waffles. Optionally, you can use a small portion of left over batter: pour the small portion of leftover batter into one corner of the waffle maker. You can use this for decoration on the top of the cake later.. Heat heavy cream and butter in saucepan on medium heat until it just begins to boil. Make sure butter is melted. Return heat to low, and stir in corn syrup, and sugar, until sugar is dissolved. Add vanilla.. Pour hot mixture over chocolate chips in a medium metal bowl, and stir until chocolate is melted.. Place chocolate mixture in bowl in a ice bath, and stir until frosting hardens a bit. Take out of ice bath and whip with a beater until it is light and fluffy.. Don't ice the waffles until they are completely cooled.Place one waffle on a serving dish. Spread frosting thinly over it. Place the second waffle down over, and spread frosting over that. Do this with the rest of the waffles, spreading over the final part in a thin layer, not the sides.Here's where that extra optional waffle corner comes in: You can place it how you want to on the top of the cake on the icing. See picture. 
Place in fridge for 15 minutes before serving.NOTE:There is not much icing for the waffles, which is why you should only do thin layers, and not over the sides. But if you want to, you can double the icing, and make it look more like a standard cake.However, this look still looks good as you can see the sides of the waffles, and reduces sugar.. Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [3, 1, 0, 2]\nD: [0, 2, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_108_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_108_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_108_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_108_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [0, 3, 2, 1]\nD: [1, 0, 3, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. When I originally received my seeds from a friend, he'd not germinated any. \u00a0My first go was a failure. \u00a0Out of 20 seeds planted, I had one germinate. \u00a0It produced one edible fruit (the rest rotted before obtaining a reasonable size). I rescued the seeds and planted again this year. \u00a0Again I put it in the same special watering system pot. \u00a0I fed regularly and this year was rewarded with a monster crop from one plant. The picture shows pepperdew (left) and twilight (right - quite hot!). \u00a0The twilight is very interesting - it grows purple and has no heat. \u00a0Later on as it starts to change it gets intensely hot!. 
I used: 1lb of pepperdew or sweet peppers 2/3 cup of sugar 1.5 cups of vinegar 1.5 cups of water Pinch of salt You'll also need; Ice slotted spoon Saucepans knife teaspoon Sterilised jar Chopping board\u00a0. I always give my raw peppers a wash in the sink. \u00a0They're grown organically, they're not sprayed, but they may be dusty. \u00a0Also removes surface level insects. I chopped the end of the peppers off with a knife, then used the end of the spoon to remove the seeds and innards. Keep the seeds - you can grow more peppers! Once hulled, you plunge into boiling water for two minutes. \u00a0Once done, use the slotted spoon to dunk them in ice cold water. \u00a0This stops the cooking and keeps the peppers crunchy. \u00a0It'll also kill any bacteria on the peppers so when you can/bottle them they won't spoil so quickly. Put them into the sterlised jar - if you've not sterlised them before, I put them in the oven at 160'C for about 10 minutes. \u00a0The lids I boil.. Add the water, sugar, salt and vinegar into a pan. \u00a0Bring to the boil and keep it there for two minutes. Add to the peppers making sure the last 1 cm is left clear. Make sure if you've not sliced the peppers that they don't have pockets of air in them. \u00a0I poked them with a sterilised spoon until bubbles stopped coming up. Add the lid and you're done.. Leave in the fridge for 24hrs before opening. They won't last long. \u00a0Well, theoretically they'll last 2-3 weeks. \u00a0 But they're so tasty expect them to be gone in a couple of days. Stuff with soft cheese, slice up and put on salads/pizzas etc. Don't forget that those seeds are a start of new life. \u00a0Leave them on a plate with a bit of tissue to dry over the next few days. 
\u00a0Next year, about April time, get them in some compost and see what happens!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [0, 3, 2, 1]\nD: [1, 0, 3, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_109_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_109_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_109_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_109_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 0, 1, 2]\nD: [3, 2, 1, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1. You need 1 blow up balloon 2. 1 bag of Chocolate chips 3. 1 bag of White Chocolate chips 4. Bowl and Microwave 5. Any fruits like Strawberries,Oranges,Apples...etc slices. Now take out the milk chocolates out and put it in a bowl to microwave it until it melts or melt it in a stove pot than do the same thing to the white chocolate.. Spill the chocolate separate around the blow up balloon any pattern u want.Now let it dry into a freezer until freeze.Than take it out and pop the balloon and u get a chocolate bowl. 
Now put any fruits u want into the bowl and READY SET EAT!!!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 0, 1, 2]\nD: [3, 2, 1, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_110_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_110_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_110_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_110_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [0, 3, 2, 1]\nD: [2, 0, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will notice, that some of the measurements are rather thematic and then some of those you are used to seeing in volume units are in mass units. The latter is due to my cooking practices - putting a dish on a scale and adding needed amounts of ingredients seems way more convenient than working by volume. In some of the places I've added the usual measurement units as well since my excessive use of \u03c0 may \u03c0ss some of you off. :)Tools:Blender or food processor Various bowls Cake tin or something else to bake in (rhyme wasn't intended) \u00d826cm or around 10\" in my caseKitchen scale (I prefer this to measuring spoons) Measuring spoons (maybe)Ingridients for crust:420\u03c0 cm\u00b3 popcorn (5.5 cups) 40g butter 190g brown sugarIngridients for chocolate cake filling:100g flour 130g sugar 35g cocoa powder 1\u03c0 cm\u00b3 baking powder (a bit shy of 1 tsp) Pinch of baking soda (can you pinch this?) Pinch of salt Some coffee or espresso powder(?) 
80g milk 35g vegetable oil 1 small egg Some vanilla extract/bean/sugar (a matter of taste) 25\u03c0 cm\u00b3 boiling water or strong coffee (1/3 cup)Ingridients for cranberry top:105\u03c0 cm\u00b3 not too sweet cranberry jam (approx 1.5 cup) 10\u03c0 cm\u00b3 gelatine (2 tbsp) A random amount of fresh or frozen cranberries for sournessA handy guide for pi day pies:1 cup (236 ml) = 75\u03c0 cm\u00b31 tbsp (15 ml) = 5\u03c0 cm\u00b31 tsp (5 ml) = 1.5\u03c0 cm\u00b31 ml = 1 cm\u00b3. This was made after a quick search online. The source was this, but I found that the amount of butter suggested is insane and used a lot less of it for better or for worse. Since I couldn't be bothered with making actual caramel for use with popcorn (although I would, if I had to make this again), I opted for simple sticky combination of brown sugar and butter with ratio of around 4 to 1 (sugar : butter).I made around 230 grams of this butter - sugar mixture in total, although more would've been even better. When you make this and mix with popcorn, go for the amount which makes the popcorn stick together enough to form the vertical sides of pie crust.The popcorn itself was simply popped in the microwave and I used a little shy of 100g unpopped popcorn for the 420\u03c0 (5.5 cups) of popped popcorn I used for the crust. A little bit of chopping is required for it, but don't overdo that! See pictures for what it looked like.Mix the processed popcorn with sugar-butter mixture and let cool a bit. Use this cooled mixture in cake tin or whatever else you use as your pie dish and form a crust. Bake this formed crust at around 180\u00b0C (350\u00b0F) for around 10 minutes so it sticks together a little bit better as the sugar melt the popcorn together.Take it out and let cool, or, if you planned well ahead pour the cake like filling (from next step) in it and bake further.. This again wasn't exactly a fruit of my imagination. 
After using some google-fu I found a chocolate cake recipe with great reviews and decided to go with that. It seemed easy to make as well.As far as my limited kitchen skills go, I know one thing which helps when making stuff like that: mixing the dry ingredients first and adding the liquid ones later. So this was the tactic I used here turned out well.Since I didn't have espresso powder mentioned in the linked tutorial, I opted for a natural way of brewing very strong coffee and adding that instead. Not sure if that made any difference and how strong taste one does get from the espresso powder. Couldn't complain about the taste though and that's the most important part.The mixed batter minus the boiling coffee was withheld until the crust was ready so that when I added the boiling coffee to the mix, I was able to whisk it like I have never whisked before, apparently to make it more airy.After all of this and given that you have the crust ready - just pour it in and let put back in to bake. The centuries (probably) old toothpick technique for testing doneness (put toothpick in centre, if it comes out clean - it's done), while not very scientific, works really well. It took around 20 minutes total for it to cook. As it was done I took it out and away to cool. It was the time to make the top cranberry layer.. I wanted a sweet and sour sensation while eating this and my grandma's cranberry jam with some extra frozen cranberries added seemed like the right candidate for that.Making the top layer is at the same time simple and complex. It's a simple combination, but it's somewhat hard to get the consistency right. My aim was to make it not as runny as the jam was, yet not gummy. Basically, it should feel like eating jam with the convenience of it not oozing everywhere while you eat it. 
The ratio in my case was around 10 to 1 jam to gelatine, although it was on the gummy side, so a little less would've been even better.You will definitely need to test this yourself with the jam you're going to use. A good way to do that is by taking a small dish, putting in some 3 teaspoons or so of jam and adding a bit of gelatine (remember how much though). Heat this up in microwave oven and let cool. See what the consistency is after it is around room temperature and decide if that suits you.The gelatine won't melt in a cold jam, so you will need to heat all of it up. No need to overdo that though. While it is possible to do it on the stove or in the microwave oven, I simply mixed the cold jam with gelatine and threw the bowl in the oven which was still warm from baking. Since the cake was still cooling down and I didn't need to hurry anywhere this was very convenient and also saved energy I would've otherwise used to heat it!As the pie was cool, I took out the jam with already molten gelatine, mixed it a little bit just in case and poured some frozen cranberries in there. I would suggest defrosting them before this though since they cool down the jam rapidly and it will gummy up or at least become harder to pour over pie.As you have the jam and cranberries mixed, pour that over the pie still in tin and leave to cool so the gelat-jam isn't runny anymore. As it cools - remove from tin and eat or add decor.. The pi day decor was pretty simple. I made a vector image on my computer, made sure it is the right size for my pie and cut it out of cardboard. The file is attached so you can print and cut or simply cut in the size it is given that your dish is around the same size (the image is 20cm or 8\" wide). One thing to keep in mind is that I forgot to add stencil like connections for the 4 (or P) so you may want to do a little bit of editing to fix that. Not a big deal at all, I solved this issue with some bits of masking tape which held the islands in place. 
You should probably also mark the middle point in some way. I remembered that too late already and just eyeballed it.The lettering was made using powdered sugar, yet that is probably not the best way to do it, since it soaks up from the top layer of jam eventually, and the text becomes a lot less visible. So if you do this - do that right before serving for best effects.The cardboard stencil itself was not put directly on jam because it would stick awfully and I'm not sure that it's the kind of paper I want to get in contact with my food. I used some broken toothpicks as stencil supports so it is right above the pie, yet doesn't touch it.The powdered sugar was applied through a fine net to assure somewhat even coating.. All that's left now is to serve and enjoy and I certainly hope you do so!If some questions arise - leave a comment, I try to answer all of them. If there is something where you see bad form - definitely leave a comment, as I said, this is a new medium for me and it would be great to get into good habits early on.Speaking of the end result - I liked the taste, and everyone else I gave the opportunity to taste it enjoyed it as well. Some stuff was pointed out which I already had in mind as having the potential for being fixed. To put it simply, if I had to make this taste combination again, I would make a thicker, sweet, decently caramelized popcorn crust with chocolate fudge as a not too sweet filling and leave the top cranberry layer as is.The odd pictures you see in this step are from tests I ran for taste compatibility and the feel of things in general.If you think this instructable is worthy of a robot apron or something - leave your vote on top of the page. 
If you like what I make in general, follow me on instagram.Until next time!Raitis\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [0, 3, 2, 1]\nD: [2, 0, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_111_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_111_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_111_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_111_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 3, 1, 2]\nD: [2, 3, 0, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients:\n\t\t32 elderflower heads\n\t\t1kg sugar\n\t\t55g citric acid\n\t\t4 washed, unwaxed lemons\n\t\t1.5 litres boiling waterTools & Materials:\n\t\t1 large pan\n\t\t1 jelly bag (or scrap of fine muslin)\n\t\tseveral sealable bottles. You'll need 32 elderflower heads.\nOf course, the best method is to go out and pick them. Pick any within reach from the elder tree until you have all that you need.\nOur elder tree at work however had to come down. It was starting to push a fence down and we wanted to heighten the wall as well. So I cut it down then found a small boy to forage through it picking off the heads while I finished off the tree.\nIf you leave picking the flowers too late like I almost did, try heading somewhere cooler (like up a hill or out of the town) to find some elder trees still in flower.. Pour 1.5L of boiling water over the kilo of sugar in a large pan.\nStir until it has dissolved.\nAllow the sugar syrup to cool before adding anything else.. Zest all four lemons then cut into thick slices.\nAdd to the cooled sugar syrup.. 
Measure out 55g of citric acid granules and add to the pan. Stir until they've dissolved.\nThe citric acid lowers the pH of the cordial and keeps bugs at bay. It also enhances the lemonie flavour.. Toss in the elderflower heads and smoosh around until they're under the water line.\nCover with a clean tea towel and store in a cool place to steep for 24-48 hours.. Sterilise the bottles in a hot oven. If reusing them, rinse well first. Mine were new so I just ovened them for 10 minutes at 150\u00baC.\nTo sterilise the lids, boil them for a few minutes.. Having steeped for 48 hours the cordial should now be ready. Remove the bottles from the oven and allow to cool enough to handle them. Fill the bottles any way you please, lab glassware not essential.\nCap immediately.. Serve in a glass with ice and lemon, diluted with sparkling water to taste.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 3, 1, 2]\nD: [2, 3, 0, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_112_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_112_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_112_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_112_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [2, 3, 1, 0]\nD: [1, 2, 3, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Serves six:6 tortillas (burrito-sized) 24 oz. salsa 2 cups sharp cheddar cheese, divided 1 bell pepper1 package rice (I used Spanish Rice a Roni) 2 cans pinto beans (drained & rinsed)Preheat your oven to 350 degrees. Cook your rice according to package directions. While it's cooking, dice up your bell pepper.. 
After rice is done cooking, mix in your diced bell pepper, rinsed pinto beans, 1 cup cheese and a third of your salsa.. Pour another third of the salsa on the bottom a 9x13 baking pan.. Lay out six tortillas and fill with mixture evenly. Spread more cheese on top if desired.Fold burrito style and fit all six tortillas next to each other in the pan.. Cover with remaining rice mix, remaining salsa, and cup of cheese.. Bake in oven for 15-20 minutes, until cheese is melted.. \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [2, 3, 1, 0]\nD: [1, 2, 3, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_113_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_113_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_113_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_113_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [2, 0, 1, 3]\nD: [3, 1, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Dissolve yeast in warm water with a teaspoon of sugar; let stand until frothy.\u00a0 Combine yeast mixture, milk, 1/2 cup sugar and enough flour to make a medium batter (about the consistency of pancake batter).\u00a0Beat thoroughly Add melted butter, eggs and salt.\u00a0 Beat well. Add enough flour to form a soft dough Knead dough until smooth or use a mixer with the hook attachment for about 4-5 min Rub the dough with a thin layer of oil Cover and let rise in a warm place until doubled in size Punch down. 
Cut dough into individual portions Let rest 10 minutes Shape dough into desired forms (I have included a video of how I learned years ago when I worked at a pizza place, but do whatever works for you) Place on greased or lined baking sheets Let rise until doubled Bake at 350 degrees for 10-13 minutes. As soon as the rolls come out of the oven baste with butter And there you have it, delicious rolls similar to Texas Roadhouse's. For the cinnamon butter combine one softened stick of butter with approximately 1/4 tsp cinnamon, 1/4 teaspoon vanilla, 1 teaspoon powdered sugar, and 2 Tablespoons honey. Adjust the measurements as needed to get the sweetness level you would like.\u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [2, 0, 1, 3]\nD: [3, 1, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_114_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_114_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_114_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_114_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 1, 0, 2]\nD: [1, 3, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients : 1) 1/2 kg Dates 2) 1 tablespoon Ghee/Butter 3) 100gms (10-12 pieces) Marie Biscuit Supplies: 1) Non Stick Utensil 2) Spatula 3) Aluminium foil 4) A Dish ( for mixing) 5) A Knife 6) A thick plastic wrap ( i recycled and used the bag in which i purchased sugar) Preparation: 1) Remove the seed from all the dates and keep the cleaned dates aside.. 1) In a non-stick pan,\u00a0Heat Ghee/Butter and cook on medium flame. 
2) When the Ghee/Butter starts to melt, add the cleaned dates and saute for 8-10 min till the dates soften to form one huge shiny lump. 3) Remove from the flame and transfer it in a separate plate. Let it cool down for 5 min.. 1) Randomly break the biscuits into medium pieces( or just break the biscuit into 4)\u00a0and add it into the date mixture. Be careful from now on as you will be handeling the mixture with your bare hands and it will be quite hot. 2) Lay a square piece of aluminium foil and cut and place the thick plastic sheet on it in such a way that atleat a 2 inch border of the foil is seen. 3) Roll the entire mixture into one tight & thick cylinder.I used the plastic sheet\u00a0 to make this process easier. 4) Make sure that your roll is longer than the plastic sheet and reaches onto the foil. 5) Twist the foil on the edges in opposite direction for it to look somthing like the pictures. 6) Place it in the freezer for atleast 2 hours.. Take\u00a0the mix out\u00a0from the freezer and remove the cover. Cut it into thin slices. Thinner the better. Serve immediately and enjoy. Note: This can be made well in advance and can be stored for a very long time. It is a healthy and nutritive preparation which you can have as a snack or as a dessert. 
I\u00a0tend to usually have it in my freezer for my unexpected guest at home.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 1, 0, 2]\nD: [1, 3, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_115_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_115_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_115_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_115_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 2, 3, 1]\nD: [0, 3, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Equipment\nYou'll need a basic canning kit. This includes:\n- Wide mouth funnel\n- Jar Lifter\n- Magnetic Lid Lifter\n- Bubble remover/ Head Space Gauge\nThe first two are the most important however, I purchased all of them for around $7.\nYou will need a canning rack with the lifting attachments.\nAdditionally you will need:\n- A small sauce pot\n- A large canning pot (or regular pot permitting it fits your canning rack)\n- A large sauce pot\n- A wooden spoon\n- Ladle\nYou will need 6 - 250 mL jars with snap lids and screw bands. If you are purchasing jars new they should come with the snap lids and the screw bands. If you are reusing jars be sure to purchase new snap lids as these should not be reused.\nIngredients\n\u00a0- approx. 3 pints berries (4 cups, crushed)*\n- 1 package Pectin\n- Agave Nectar (3 cups)**\n- butter (1/2 tsp)\n- lemon juice (2 tbsp)\n\u00a0 \u00a0\nNote: This recipe only uses the equivalent of 3 and 3/4 cup sugar. This the \"light\" version of the recipe. 
If you choose to make the regular recipe it would require 5 cups sugar.\n\u00a0\n\u00a0* Last summer I used a pint each raspberries, blueberries and blackberries. Today I used 2 pints blackberries and 1 pint blueberries as our black berry bush was churning out more berries than we could eat.\u00a0\n\u00a0** If you do not have agave nectar readily available you can substitute sugar cup for cup or a ratio of 3:4.. Ingredients \u00a01. Wash* your berries. Lay your berries onto a baking sheet and use a potato masher to crush them. Measure 4 cups and place into a large sauce pot. \u00a0 2. Combine berries, butter, pectin and lemon juice. Measure agave nectar or sugar and set aside.\u00a0 Equipment 1. Place your canning rack in the bottom of your pot. Place your jars into your pot and cover with water. Bring to a simmer (180 F/ 82 C). 2. Set screw bands aside. Heat snap lids in hot water (not boiling). Keep both hot until ready to use. \u00a0 * I washed my berries with a small about of lemon juice to help remove any impurities. I additionally washed them twice as they came from my backyard and had some ants hanging about.\u00a0 . Ingredients 1. Over high heat bring fruit to a full boil.\u00a0 2. Add all the sugar stirring constantly and return to a full boil that cannot be stirred down.\u00a0 3. Boil hard for one minute*. Remove from heat and skim off any foam.\u00a0 4. Cool a few minutes. ** *My first time making jam I learnt the hard way that the pectin will not activate if you do not allow your jam to boil long enough.\u00a0 ** Some people recommend to ladle the jam into your jars while it is still hot. However if you have larger fruit chunks allow the jam to cool a few minutes to prevent all the fruit to floating to the top of your jars.\u00a0 . 1. Carefully remove a jar from the canning pot. I suggest handling the jar with a dishtowel to prevent burning yourself.\n\u00a02. 
Place the large mouthed funnel into the jar and quickly ladle the hot jam into the jar withing 1/4 inch from the top (this is where the head space gauge comes in handy). Add or remove jam as necessary.\n\u00a03.Using a non-metallic utensil (The head space gauge/bubble remover) remove any air bubbles from the jar.*\n4. Wipe the rim to removed any food residue and center the hot snap lid on the clean jar rim (use the magnetic lid tool to remove the snap lid from the pot of simmering water.\n5. Screw the band down until resistance is met, then increase to finger tip tight.\n6. Return to canner and repeat with remaining jars.\n\u00a07. If you have any jam left over just spoon them into a small ramekin or bowl and enjoy!\n*This jam was fairly liquid so I did not need to remove any bubbles.\nNote: The recipe should make 6 250 mL jars when using sugar. Because the agave nectar is liquid and I had an extra quarter cup of crushed berries I ended up with more, around 3- 500 mL jars and one 250 mL jars with a bit \u00a0jam left over in a ramekin.\u00a0 . 1. When your canner is full ensure that there is at least an inch of water covering the jars. Cover the pot and bring to a full roiling boil before you count processing time.\n2. For altitudes up to 1000 ft process for 10 minutes. After processing is complete remove lid from your canner and and wait five minutes.\n3. Remove the jars without tilting and place on a protected surface. Cool undisturbed for 24 hours.\n4. After cooling is completed check the seals on the jars. Sealed jar lids will curve downward and will not move when pressed.\n5. Remove screw bands and wipe jars clean.. \nAfter you have wiped down the jars you may replace the screw bands or cover the snap lids with a decorative swatch of fabric fastened with a piece of ribbon or twine. Like here:\n\u00a0Finally label your jars. There are plenty of websites that offer label templates for free. 
Here is a site that offers lots free printables; just choose one of your liking! \u00a0These two sites (here \u00a0& here) both offer printables exclusively for canning. You can either print on sticker paper (available at most craft stores) or an card stock and fasten with ribbon.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 2, 3, 1]\nD: [0, 3, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_116_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_116_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_116_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_116_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [2, 0, 3, 1]\nD: [1, 0, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Whisky Glass. 1/3 ice. 1/3 Black Malibu. 1/3 Squirt Soda. Garnish with XXX Vitamin Water for color or taste preference. 
Enjoy Responsibly\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [2, 0, 3, 1]\nD: [1, 0, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_117_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_117_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_117_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_117_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 3, 2, 1]\nD: [3, 2, 0, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Pulled Pork\n\t\t1 3-4 lb pork shoulder or Boston Butt (which is a shoulder cut)\n\t\t12 oz. (1 1/2 c.) hard ciderDry Rub for Pulled Pork\n\t\t2 Tbs brown sugar\n\t\t1 Tbs kosher or other coarse salt\n\t\t1 Tbs Paprika\n\t\t1Tbs Black Pepper\n\t\t1 1/2 Tsp garlic powder\n\t\t1 1/2 tsp dry mustardPie Pastry\n\t\t3 c. flour\n\t\t2 Tbs sugar\n\t\t1 tsp salt\n\t\t1/2 c. pork lard (from pulled pork)\n\t\t1/2 c. + 2 Tbs Butter, cold\n\t\t1/2 c. ice cold water\n\t\t1 tsp cider vinegar\nPie Decorating\n\n\t\tBlack food dye\n\t\tGreen food dye\n\t\tRed food dye\n\t\t2 egg yolksTools\n\t\tRoasting Pan\n\t\tPlastic Wrap\n\t\tSpring Form Pan\n\t\tAluminum foil\n\t\tparchment paper\n\t\t3-4 c. uncooked rice\n\t\tPaper\n\t\tPencil\n\t\tScissors\n\t\tfood processor\n\t\trolling pin\n\t\tpastry brushes. 
Dry Rub\n\t\t2 Tbs brown sugar\n\t\t1 Tbs kosher or other coarse salt\n\t\t1 Tbs Paprika\n\t\t1Tbs Black Pepper\n\t\t1 1/2 Tsp garlic powder\n\t\t1 1/2 tsp dry mustard\nMix all dry ingredients in a bowl.\u00a0 Rub dry ingredients all over pork roast.\u00a0 Rub it deep into all cracks and folds.\u00a0 Wrap roast in plastic wrap and chill 4-12 hours.\u00a0 Preferably overnight.\u00a0. Preheat oven to 275-300 degrees F.\u00a0 Remove plastic wrap and place pork in a roaster.\u00a0\u00a0 Place roaster with pork in oven and roast for 3-4 hours, until pork begins to easily come apart with a fork.\u00a0 Crack the crusted surface of the pork and pour 12 oz. of hard cider over roast.\u00a0 Cover with aluminum foil and return to oven for 30 more minutes.. A classic British meat pie has a very dense filling, so once the pork is cooked and pulled apart, we want to compress it so it will make a nice compact portion for the inside of our pie.\u00a0\nAfter the 30 minutes of roasting with the cider, remove pork from oven and cool to room temp.\u00a0 Remove pork from roasting pan and place in bowl.\u00a0 Using a fork, pull pork apart.\u00a0 Be careful, pork will still be hot on the inside.\u00a0 The pork should be very moist and supple.\u00a0 Squeeze the pork together with your fingers and form into a ball.\u00a0\nUsing a sheet of plastic wrap, wrap the pork ball as tight as you can to further compress.\u00a0 Place under some weights, a few plates work nicely and chill for a few hours, or overnight.\nUsing a spatula, gently spoon the cider au jus into a container and chill.\u00a0 Once the liquid is chilled, the lard will solidify at the surface.\u00a0 We will reserve the pork lard for the pastry and the au jus as a serving sauce.. 
While the pork is roasting and compressing, we'll create our eye design and mold.\u00a0 Outline base of a spring form pan on a sheet of paper.\u00a0 Card stock is nice, as its thicker.\u00a0 I used an old manila folder.\u00a0 Using a stock image of an eye drawing, draw an eye within the circle of the spring form pan.\u00a0 This will be a guide for your eye pie.\u00a0\nNow its time to make the walls of our eye mold.\u00a0 Place the eye drawing at the base of your spring form pan.\u00a0 Fold sheets of aluminum foil and using your eye guide, place aluminum around edges of eye.\u00a0 Fold foil together until you have an eye shaped mold.\nOnce we have the eye mold walls made, we can assemble the mold.\u00a0 Begin by wrapping the circular base and the walls with aluminum foil.\u00a0 Wrap each individually.\u00a0 Place spring form base into pan walls and clip spring form base in place.\u00a0 Place the aluminum mold walls into the center of the pan.\u00a0 Fill the edges between the mold and the walls of the pan with a few cups of rice.\u00a0 The rice will give stability to the pie while it bakes.\u00a0\nPlace a sheet of parchment over your original eye design and outline outside of eye.\u00a0 Cut and place at the bottom of the foil mold.\u00a0 Cut sheets of parchment and line the walls.\u00a0 The parchment will keep the pie from sticking to any of the aluminum foil.\n\u00a0. Pie Dough:\n3 c. flour\n2 Tbs sugar\n1 tsp salt\n1/2 c. pork lard (from pulled pork)\n1/2 c. + 2 Tbs Butter, cold\n1/2 c. 
(8 Tbs)\u00a0 ice cold water\n1 tsp cider vinegar\nUsually I will make this recipe with just butter, but since we have the lard from the pork, we should use it!\u00a0 Lard makes an even flakier crust then butter and it adds a nice taste to the crust as well!\nOnce your au jus is chilled, the lard should have risen to the top and solidified.\u00a0 You can now easily remove the lard.\u00a0 Scrape off and pat away any au jus still on the lard using paper towel.\u00a0 The dark au jus could stain our pastry.\u00a0\nIn a food processor, mix all dry ingredients.\u00a0 Add butter and lard, pulse until well mixed.\u00a0\nIce down 1/2 c. cold water and add cider vinegar.\u00a0 While pulsing, slowly pour in icy liquid mixture until dough is chunky.\nRemove from food processor and pat into a dough ball.\u00a0 Wrap in plastic wrap and chill for at least an hour.. After dough has chilled, remove from the refrigerator.\u00a0 Cut 2/3 of dough.\u00a0 Place 1/3 of dough back in the fridge.\u00a0 On a floured surface, roll out dough in a rectangle big enough for the base and walls of the mold.\u00a0 In order for this pie to be self standing, after its baked, the dough has to be rolled out thick, about 1/4\" thick.\nPlace the dough in the base of the mold and pat it down into all the edges and crannies.\u00a0 You want a bit of an overhang on top so you can secure the top dough cover when you get it into place.\u00a0\nRemove your compressed pulled pork ball from the refrigerator.\u00a0 Remove plastic wrap.\u00a0 Place meat into pie.\u00a0 Form meat ball to fit the pie opening.\u00a0 Pour 1/4 c. of au jus over the pork.\nRemove remaining 1/3 of dough from fridge.\u00a0 Roll out 1/4\" thick.\u00a0 Using your original eye design template, cut out dough topper. Reserve dough scraps for decorating eye.\u00a0 Place dough topper into pie mold.\u00a0 Brush egg yolk over edges of pie topper and roll edges of eye dough into place.\u00a0. 
\nCut apart your original eye template into the parts, eye lashes, tear duct, iris, pupil, etc..\u00a0 Roll out remaining dough scraps, and cut out eye parts.\u00a0 Since this is a Pi day celebration, we're going to make the pupil Pi shapped!\u00a0 How clever.\nDilute a few drops of food coloring into a few drops of water.\u00a0 Using a pastry brush, brush appropriate colors onto eye parts, i.e. black onto eye lashes and pupil, green (or whatever eye color you like) for the iris and red for the tear duct.\nBrush undersides of eye parts with egg yolk and set into place on your eye.\u00a0. \nPreheat oven to 325.\nBefore putting pie into the oven, drip a 1 tsp of water into each of the rice wells and mix.\u00a0 This will keep any kernels from popping.\u00a0\nPlace pie into oven and bake for 30 minutes.\u00a0 After 30 minutes, remove and brush top of eye with egg wash.\u00a0 Bake for another 20 minutes.\u00a0\nNow its the moment of truth.\u00a0 Its time to remove the mold and see if this pie will stand on its own.\u00a0 Remove pie from the oven.\u00a0 Place the spring form pan into a large bowl or pan.\u00a0 Remove spring form sides and remove rice.\u00a0 Cut off aluminum foil mold and peel off parchment sides. Turn oven up to 400 degrees F.\u00a0 Brush sides and top with egg yolk.\u00a0 Put the pie on the spring form base onto a baking tray and put back into oven.\u00a0 Bake until sides are nicely golden, another 30 minutes or so.. Remove from oven and allow to sit for 15-20 minutes.\u00a0 Remove from spring form pan base and parchment bottom.\u00a0 Serve warm.\u00a0 Garnish with drizzled cider au jus. 
Enjoy the glorious eye pie for a little while, but not too long.\u00a0 Eat this up before it gets cold!\nServe with Hard Cider of course and get pie-eyed!\u00a0 Now that's a hell-of-a-pie.\u00a0 Happy Pi Day!\u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 3, 2, 1]\nD: [3, 2, 0, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_118_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_118_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_118_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_118_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [2, 0, 1, 3]\nD: [1, 0, 3, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For the dog cupcakes, I frosted the whole cupcake with chocolate frosting, and added a dollop of chocolate frosting toward the bottom for the snout.. I then used Wilton tip #233 (the grass/hair tip) to do brown fur originating from the snout and going outward. I didn't do it on the snout, or in the center part of the head.. Next I added ears toward the top using the same tip.. I then switched to white frosting colored with \"Ivory\" food coloring until it was a light tan color. I used the same tip (Wilton #233) to do the center of the head, and then the snout. I always made sure to start each strand of fur in the center of the top of the snout.. I added a chocolate covered raisin for the nose.. I added eyes using the same chocolate frosting, and a pink bow.. For the cat, I frosted the cupcake white and added two balls for the cheeks using Wilton tip 12.. I added eyes using chocolate frosting and piped in a pink nose as well.. 
For the ears, I cut a miniature marshmallow in half using my kitchen scissors, and then pinched one end to form the triangle.. I then cut 6 strands of black licorice about 2 or 2 1/2 inches long, and poked them into the cupcake until they were the desirable length.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [2, 0, 1, 3]\nD: [1, 0, 3, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_119_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_119_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_119_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_119_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [1, 2, 3, 0]\nD: [0, 2, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients you'll need:\nStore-bought Corn Tortilla Chips (cooked)\n8 - 12 oz mini Chocolate chips\n2 Tablespoons Fleur de Sel or coarse Sea Salt\nCandied Jalapenos- See Step 2 for tools, ingredients and preparation\nChili-infused Toffee- See Step 4 for tools, ingredients and preparation.. Tools:\n1 cookie sheet lined with parchment paper.\n1 small saucepan\n1 small paring knife\n2 small bowls\nIngredients:\n4 to 6 Fresh green Jalapenos and red Fresno chiles. If you can't find red Fresno chiles, just stick with jalapenos.\nSugar- 1 cup total\nGreen food coloring for Jalapenos and Red food coloring for Red Peppers.\nWater\nPreheat your oven to 200 degrees. Line a cookie sheet with parchment paper or foil. Set aside.\nIn separate bowls, combine 1/4 cup sugar with a few drops of green food coloring and 1/4 cup sugar with a few drops of red food coloring. 
Stir them up until you're happy with the color of each sugar. Set aside.\nSlice the jalapenos in 1/4\" rounds. Use a small paring knife to cut away the excess fiber inside. Remove seeds, too. Set aside.\nIn a small sauce pan, combine 1/4 cup water with 1/2 cup sugar. Stir over medium-high heat until the sugar water/syrup boils. Reduce heat to medium-low and add the jalapeno slices. If you're using red and green jalapenos, do them in separate batches but use the same syrup.\nSlow boil for 1 minute. Remove the pan from the heat and 1 by 1, drop the sliced chiles into the coordinated colored sugar. Shake to coat then remove each sugar-coated slice onto the parchment lined baking sheet.\u00a0\nWhen all of the slices have been sugar-coated, place the baking sheet in the 200 degree oven for 15 minutes.\nAfter 15 minutes, remove them from the oven and leave them uncovered for an hour or until the sugar has dried and is set.\nIf the humidity is high in your area, you may want to bake them again at 200 degrees for 15 minutes.\nThe candied jalapenos can be made the day before and left to air-dry overnight.\nOnce dry, use small scissors to cut them in tiny pieces. Set aside.\n\u00a0. Tools and food stuff:\n2 cookie sheets covered with parchment paper.\nHand select the flattest tortilla chips from the bag and lay them on the lined cookie sheet.\nBowl of mini chocolate chips\nChopped candied jalapenos\nYou'll be building the Toffee Tortilla Chips right next to your stovetop so make room if you haven't already.\u00a0. Tools:\nA candy thermometer, a saucepan, a long wooden spoon and 2 metal spoons.\nIngredients:\n1 cup Sugar\n1 cup Butter\n1 Tablespoon Light corn syrup (optional)\n1 teaspoon Chili Powder (Use your favorite)\nClip the candy thermometer to your sauce pan, then add all of the ingredients above.\nOver medium heat, cook and stir until the sugar is melted. \u00a0\nTurn the heat to high and stir constantly until the thermometer reads 350 degrees. 
At this temperature you'll begin to smell the chili powder and see a little smoke.\nRemove boiling hot toffee to a cool burner.\nOne at a time, hold each tortilla chip (by a corner) over the saucepan, angled downward.\nCarefully spoon the hot toffee syrup over the top of the chip. (Only coat one side, not both sides.)\nShake gently, allowing the excess toffee syrup to drip back into the saucepan. Be careful. The toffee is HOT!\nPlace each toffee-coated chip on the lined cookie sheet.\n\u00a0\nSprinkle mini chocolate chips over the toffee while it's still \u00a0fairly hot. (This the time when an extra pair of hands will be helpful, if you can find them.) \u00a0\nGive the mini-chips a few minutes to soften, then use the back of a clean spoon to smooth the chocolate over the toffee.\nSprinkle a few bits of candied Jalapeno peppers on top of the chocolate.\nPlace the Toffee Chips in the freezer for 15-20 minutes to set.\nRemove the Toffee Chips from the freezer and pinch a tiny amount of fleur de sel (or sea salt) on top as the final garnish. \u00a0If this is done while the chocolate is hot, the salt might melt.\u00a0. One at a time, hold each tortilla chip (by a corner) over the saucepan, angled downward.\nCarefully spoon the hot toffee syrup over the top of the chip.\nShake gently, allowing the excess toffee syrup to drip back into the saucepan.*\nPlace each toffee-coated chip on the lined cookie sheet.\nSprinkle mini chocolate chips over the toffee while it's still fairly hot. (This the time when an extra pair of hands will be helpful, if you can find them.)\nGive the mini-chips a few minutes to soften, then use the back of a clean spoon to smooth the chocolate over the toffee.\nSprinkle a few bits of candied Jalapeno peppers on top of the chocolate.\nPlace the Toffee Chips in the freezer for 10-15 minutes for the chocolate to set.\nRemove the Toffee Chips from the freezer and pinch a tiny amount of fleur de sel (or sea salt) on top as the final garnish. 
If this is done while the chocolate is hot, the salt might melt.\n*Be really careful! The toffee is HOT and can burn you!!!. Toffee Tortilla Chips are like crunchy little candy bars!\nYou can serve them alone.... or with a bowl of Ice cream... or?\nThey taste best (crunchiest) when served the same day they're made. They are still yummy the next day and even the day after that.... just not as crunchy.\nIf you have leftover toffee just put it in a ziplock baggie and use the flat end of your meat hammer \u00a0to break it up into small pieces. They taste great by themselves, but can be used in cookie recipes and sprinkled over Ice cream or Cheesecake.\nEnjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [1, 2, 3, 0]\nD: [0, 2, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_120_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_120_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_120_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_120_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 1, 2, 0]\nD: [3, 0, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For this recipe you will need:2 c chopped walnuts1 1/2 c powdered sugar2 c all-purpose flour1/4 tsp salt2 tsp vanilla extract5 tbsp sugar1 c unsalted butter. Melt the butter in a microwave and then cream together the butter and sugar using an electric mixer.. Mix in the salt and vanilla until combined.. Add the flour to the batter a little at a time, mixing well between additions.. Mix in the walnuts.. Put the batter in the refrigerator for an hour.. 
Roll 1 tbsp of the cookie dough into a ball shape and place on a baking sheet lined with parchment paper.. Bake in an oven that has been preheated to 350\u00b0 for 13 minutes.. Remove from the oven and place the whole baking sheet on top of a wire rack to cool for 2 minutes.. Now coat the cookies with powdered sugar. . After all of the cookies have been coated with the powdered sugar, coat them again.. I hope you enjoyed this instructable and if you decide to make these, let me know what you think about them!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 1, 2, 0]\nD: [3, 0, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_121_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_121_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_121_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_121_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [0, 2, 1, 3]\nD: [3, 0, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Heavy kitchen pot2 tbsp coconut oil1/2 cup white popcorn kernels1 tbsp sugar1/2 tsp salt(Remember \"two, and a half, and one, and a half\"). Yes! Put it all in the 1/2 cup container. It will fit.. Depending on the ambient temperature, the coconut oil may either be solid or liquid. If solid, heat it up just enough in the pot to melt it.. Leave a little bit of the cover open to vent.. They should all pop within a few seconds of each other. Have your vent on, as at this time the oil may smoke a bit. If it's a lot, turn down the heat slightly.. Shake immediately once the ingredients are in. 
You want to keep the kernels and sugar in motion so that the sugar does not burn on the bottom of the pan (which is very difficult to remove).. Nothing much will happen for about a minute, but the popcorn is quickly coming up to temperature.Once you get an inch or so of popped corn, vent the top to let out steam, and continue shaking. The popped corn helps keep the remaining kernels in the pot but be careful! Occasional drops of hot oil may escape. Protective eyewear is recommended.. If done correctly, there will be very few if any residual unpopped kernels.. As much as you will be tempted to, don't start eating it right away! That sugar may still be nuclear hot!Wait a minute or two for all steam to evaporate and the sugar to crystallize. The popcorn will be then be unbelievably crunchy, slightly sweet, and slightly salty. And all with only two tablespoons of oil!Store in a sealed container, or eat it all immediately.. I actually use a heavy steel (not aluminum) Back to Basics whirly popper. It's a bit more expensive but worth it. You are also much less likely to burn the sugar with this device (except the first time).\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [0, 2, 1, 3]\nD: [3, 0, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_122_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_122_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_122_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_122_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 2, 1, 0]\nD: [0, 2, 1, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This is the second best part. 
There are only 5 ingredients.1. 1lb unsalted saltine crackers. 1lbs is a box of 4 sleeves.Take my word for it and get unsalted. The flavors of this whole \"dish\" are much better when you don't feel like you just licked a salt brick.2. 1 cup canola oil3. 1 packet powdered ranch dressing mix. 1 oz is a packet in case you have a vat of ranch dressing mix.4. 2 tblsp crushed red pepper flakesYes, 2 tablespoons, not a typo. This is not a dish for the weak. Add more if you dare.5. 1/2 tsp garlic powder. Ok here is the hardest step (at least for me...), you have to find a suitable plastic container. Start searching all the food storage containers, Tupperware, Rubbermaid and zip lock tubs you have in your kitchen. You are looking for a short container that can hold as many crackers as possible, in a sealed location. A good lid/seal is important since it will get flipped over. Ideally this container will hold all 4 sleeves, but I have used 2 containers, so don't worry if you have to split it up. The plan here is to line up the crackers on edge so they are loosely siting like they would in the sleeve and when the container has the lid on and is flipped over, all the crackers stay in place.After selecting the correct vessel, take the crackers out of the sleeves and line them up in the container(s). See the picture for more help. Don't stack up, only 1 level.. Take out a bowl, measuring cup, large mug or whatever you have and pour all the ingredience (except the crackers) into the liquid holding device and stir.1 cup canola oil1 packet (1 oz) powdered ranch dressing mix. 2 tblsp crushed red pepper flakes1/2 tsp garlic powder. Keep stirring. When you stop the ingredients settle in the bottom and it won't pour as nicely. So as you stir, pour the mixture over the crackers. Make sure to get the crackers in the corners and on the edges. 
If you were not able to get all the crackers into one container, than with a little math you can tell how much of the mixture to pour in each container. 4 sleeves = all the mixture3 sleeves = 3/4 the cup2 sleeves = half the mixture1 sleeve = 1/4 the cup.After you have run out of mixture, put the lid on the container(s) tight. Than flip the container(s) over. Every 5 minutes or so, flip the container over again. Again, and again for around 20 minutes. When your patience has run out, and you don't see liquid running down the sides of the container when you flip it over, you're done.. Supposedly, you can put leftovers in a zip lock bag, seal air tight, and they last about a week or so. I have never experienced this phenomena. At best, I made some in the evening and there were left overs the next morning, but not when I got off work that afternoon. They're surprisingly addictive...A word to the wise and a disclaimer from a friend: \"As temping as this is, do not, I repeat DO NOT, eat all 4 sleeves yourself in one sitting\". No matter how good the movie is, all you will remember is the stomach pain in the morning. \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [3, 2, 1, 0]\nD: [0, 2, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_123_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_123_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_123_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_123_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [3, 2, 0, 1]\nD: [2, 1, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
1 LB GROUND BEEF\n1 ROLL OF SAUSAGE\n1/2 LARGE ONION\n2 EGGS\n1 CUP COOKED RICE\n1 LARGE GREEN CABBAGE\n1 JAR FAVORITE PASTA SAUCE\n1 PACKAGE ITALIAN CHEESE\nSALT, PEPPER, GARLIC POWDER TO YOUR TASTE.. core the cabbage(remove the hard middle) with a large knife, being careful not to cut the leaves. \nboil the cabbage for 20 to 30 minutes, or until the leaves are pliable, but not soft.\nnote: it took me some time to get the time right on this, if you make them too soft, they will be impossible to roll.\nremember that they will cook a little bit later on in the oven.. brown the ground beef, sausage, onions and seasonings in a skillet.\nadd salt and pepper to taste.\nyou MUST cool the meat in the freezer after it is fully cooked for at least 15 minutes. while the meat is cooling, cook the rice.\nadd two eggs and cooked rice to cooled meat and mix, this will be your filling.. peel one leaf at a time from the cabbage, being careful not to tear.\ncut the hard center piece of the cabbage off. (see picture for illustration)\nlay on a separate plate and fill with about 3 spoons of the meat mixture.\nadd a sprinkle of cheese, and roll up until tight.\ncut off remaining cabbage, and cabbage on end that does not contain filling.\narrange in a row in large glass baking dish.\npour your favorite spaghetti sauce over the rolls.\nfill empty sauce jar 1/2 full with water, cover, and shake.\npour water over rolls, making sure to cover every edge with water.\nadd remaining cheese over the top.\ncover the dish with foil.\nbake 350 for 1 hour 30 minutes.\nturn oven off and let rolls cool for 45 minutes.\nremove from oven and enjoy!\n\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [3, 2, 0, 1]\nD: [2, 1, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_124_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_124_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_124_2.jpg", 
"./Discrete-temporal/visual_ordering/visual_ordering_124_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [2, 0, 1, 3]\nD: [1, 0, 3, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \u2022 Onions \u2022 2 separate cups of all-purpose flour \u2022 salt, pepper \u2022 two cloves of crushed garlic \u2022 12 ounces beer/ 1 bottle \u2022 Cold water or Buttermilk \u2022 Oil (for frying). Cut onions half an inch thick, remove dark exterior cover of onions and separate them into rings. Soak them in Cold water (or buttermilk) for 1 hour. This helps in breaking down the onions and removing their bitterness. In a pan/dutch oven heat up oil for frying them rings.. In a bowl, combine one cup of flour,salt, pepper and crushed garlic.Mix them thoroughly.In another bowl combine beer and one cup of flour and mix thoroughly.adding beer to this recipe makes the coating much more crunchy!. Remove onion rings from buttermilk, shake off excess. Dip the rings in flour,coat properly and shake off excess, then dip in beer batter.Drop the onion rings in oil and be sure not to crowd rings as they will stick together. When golden, remove and drain on paper napkins/towels.The onion rings can be served with plain mayonnaise like I did or you can use this recipe to make a spicy dip -combine sour cream, mayonnaise, sun-dried tomatoes, lime juice, 1/2 teaspoon pepper, 1 teaspoon salt, white wine and minced garlic and little chipotle sauce in a mixer or food processor. you can even add chopped coriander or mint. 
cover and refrigerate it.Hope you all liked!suggestions and comments are welcome.Also, this is my twist on an onion rings recipe I saw on instructables- https://www.instructables.com/id/Goooulish-Onion-O-...\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [2, 0, 1, 3]\nD: [1, 0, 3, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_125_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_125_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_125_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_125_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 2, 1, 3]\nD: [1, 3, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. layout some strip of bacon on your clean surface.. slap some sausage on your bacon. roll the bacon goodness making sure to tuck the sides in.. 
I cooked mine at 380f for about an hour.garnish with avocado to make yourself fell less guilty!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 2, 1, 3]\nD: [1, 3, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_126_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_126_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_126_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_126_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 2, 0, 1]\nD: [1, 3, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You have two options for getting chips. Yay, options!\nThe first is just to buy the chips. Normal tortilla chips will work OK, but I think a slightly sweetened chip is a better fit. The good folks from \"Food Should Taste Good\" have some particularly nice options: sweet potato and chocolate are both yummy.\nThe second approach is to make your chips. Since I've tagged this snack as healthy, I'm avoiding the deep fryer, and recommend you bake tortilla chips. (If you live for deep frying, well, you can go that route too.)Ingredients\nTortillas (corn, wheat, or flour... whatever floats your boat)\nCooking spray OR vegetable oil (e.g. peanut or corn; if you're feeling crazy, try coconut oil.)\nWhite OR brown sugarDirections\nPreheat oven to 400F. If you're using oil, brush the tortillas lightly on both sides with some oil to help the chips crisp up and avoid sticking to the pan. Cut tortillas into strips or triangles or whatever shape you want your final chips to turn out. Unicorns? Why not! 
If you didn't use oil, spray a flat pan with cooking spray and lay out your tortilla pieces, and then spray the tortillas again with cooking spray. Otherwise, just spread out the tortillas. Sprinkle sugar lightly over the chips, and bake in the oven until crispy and a pretty shade of brown (~10 minutes).. Next step is to make your salsa. There is a lot of room for customization here, but here is something to start with:IngredientsNote: These quantities are rough... don't stress, you can't go wrong.\n1 pint of strawberries\n1 small jicama\n2 Tablespoons lime juice\n2 kiwisOptional: fresh gingerInstructions\nChop up the strawberries, kiwis, and jicama into chunks. Martha has some instructions on how to chop jicama if that's a foreign food to you. Pour in the lime juice, and if you're a ginger fan, add some (1 teaspoon) fresh grated ginger. Pop it in the refrigerator until you're ready to serve.Feel free to add other or additional fruit. Apple would make a fine jicama replacement, cherries are great, green grapes might be nice. Go wild.. Next, we make our peach dip, which is supposed to look like nacho cheese... but you already knew that.Ingredients\nGreek yogurt (plain or vanilla)\n2 peachesOptional: honeyDirections\nRemove the skin from the peaches. Sound like a pain? I did it by blanching, which means adding a small slit in the skin, boiling for a few minutes, and than shocking in ice water. The skin comes right off. (Here's a video demonstration of blanching.) Then cut up the peach into chunks and toss it into a blender until it's properly pureed. Add greek yogurt to your blender in small batches until you get a nice nacho cheese color and texture. If your peaches are sweet and you like the tangy taste of yogurt, you can stop here. Otherwise, add and blend in some honey to taste.\nIf peaches ain't your thing, try this with a different orange-colored fruit puree. Mangoes would definitely work, and apricots might be good too.. 
I never considered using avocado in a sweet dish until I went to Indonesia, where avocados are commonly used in fruit smoothies and milkshakes. Making a sweet version of guacamole was the motivation behind these nachos, and in my opinion, it's the best of the dips. (P.S. You should also try to make an avocado milkshake. Here is a good recipe.)Ingredients\n2 Avocados\n2-3 teaspoons condensed milk\nCherries. I used bing cherries because they've got some yellow and red in them, but whatevs.Optional: jicamaDirections\nClean and cube the avocados. De-pit* the cherries and cut into small pieces. Pour condensed milk and mix. (If you have some extra jicama, feel free to chop that up and throw it in too.)\n* If you don't have one of those fancy cherry de-pitter gizmos, a poor man's version is to put a cherry on a beer bottle and poke the pit out with a chop stick.. You know what to do...\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 2, 0, 1]\nD: [1, 3, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_127_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_127_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_127_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_127_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 2, 0, 1]\nD: [0, 2, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1) Beetroot - 1/3 Cup2) Carrot - 1/3 Cup3) Spinach - 1/3 Cup4) Red Cabbage - 1/3 Cup5) Turmeric - 1/8 Tsp6) Confectioner's Sugar (Powdered Sugar) - 1/2 Cup7) Corn Starch - 1 1/2 Tsp 8) Vanilla Extract (Any extract can be used) - 1/2 Tsp9) Water - required to blend . 
Beets/ Carrots/ Spinach:1) Puree vegetables in a juicer or a blender. (I used Blender)2) When using a blender, add little water to blend the vegetables. 3) Then filter the vegetable puree in order to remove the crushed vegetable pieces. Red Cabbage :1) Boil the chopped red cabbage in water (covering the cabbage) till the water turns dark purple. Then strain the purple water. Keep it aside.2) Take some of the purple water and add little by little of Baking soda to obtain Blue color.Turmeric :1) It can be added directly to the batter in the powdered form in a very little quantity(If added in excess overpowers the taste) . 1) Sift the powdered sugar along with cornstarch in order to remove any lumps present.2) Take a portion of sifted powdered sugar and cornstarch, add vanilla extract and the desired vegetable puree little by little.3) Mix well to form a batter with smooth and little thick paste consistency (not very thick. should be able to pipe it using a piping bag)4) Similarly make the batter for all the vegetable puree separately.(different extracts like almond, lemon can be used)5) Transfer the mixture into disposable piping bags (I used individual Ziploc bags for each color) and cut the edge making a small hole or use a very small round tip 6) Line a cookie sheet with wax paper or parchment paper.7) Pipe out long lines across the parchment paper. 8) Let dry undisturbed in a cool place for about 24 hours or until dry to touch9) Gently break the lines into small pieces and store the sprinkles in the airtight container for up to 3 months.Naturally Made Rainbow Sprinkles are Ready ! . Boiling Method: [Boiling the vegetables in water and using the boiled water]1) The color obtained from boiling the vegetables is lighter as it is diluted. 2) water content is more. 3) As the color obtained is lighter it is needed in higher quantity. 
4) The taste is diluted when compared to juice method.Juice Method : [Blending the vegetables]1) The color obtained from the vegetable juice is more vibrant.2) less water content.3) As the color obtained is vibrant it is needed in less quantity.4) It is likey to taste a little stronger than the boiling method.I tried both the method and used the Juice method for making sprinkles since I need the color with less water content.. 1) The vegetable puree can be frozen and stored for future use.(I used ice cube tray)2) Vegetable puree can be used as water color paint when it is frozen.3) The vegetable puree can be used to naturally color cupcake frosting, Play-Doh and homemade finger paints etc..\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 2, 0, 1]\nD: [0, 2, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_128_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_128_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_128_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_128_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [2, 3, 0, 1]\nD: [1, 0, 3, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \n\t\tTable\n\t\tChopping Block\n\t\tBig Knife (serrated)\n\t\tPineapple\n\t\tTea Towel\n\t\tNewspaper (for composting waste). Whip off both ends of your fruit and tip them onto the waiting newspaper. The bottom end can be a bit tough so make sure you remove enough.\u00a0. Stand your pineapple on end and begin to shave away thin slices of the rough skin. 
Don't go too deep and ignore the small round brown holes that are left, you will deal with these in the next step.\u00a0\nTwist your pineapple as you go and the cut edge will be a guide to your next slice.\u00a0. This speckled fruit must now be flipped onto it's side ready to take out those small brown holes. You will notice the holes form two spirals going in opposite directions, either of these spirals can be followed with this technique, I prefer the longer spiral in the first photo working from left to right.\u00a0\nWith the pineapple on it's side grip hard with one hand and cut out a shallow groove in a spiral pattern using the holes as your guide. This is best done with several small cuts matching up.\u00a0\nRepeat this until all the small brown holes are gone.\u00a0. You will always be left with a few little flecks of skin, you can nip these out with the knife depending on how much effort you are willing to put in. Remember it is all roughage.. Your pristine pineapple is now ready to serve. You can slice it as thick or thin as you like, it is all down to taste. 
\u00a0 \u00a0\nWith it on it's side grip it firmly without too much downward pressure, this could blunt the spikes and slice away.\nAs was mentioned before the bottom end can be a bit tough so when presenting leave this bit off the dish and eat it yourself.\nThe slices can be arranged in a line or any other pattern you want.\nIf done right this simple technique leaves a dish that will catch anyones eye.\u00a0\n\u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [2, 3, 0, 1]\nD: [1, 0, 3, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_129_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_129_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_129_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_129_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 2, 0, 1]\nD: [3, 1, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1/2 lb of your favorite cheese (I'm using white cheddar)\n\t\t4 cooked potatoes (russet works well - mine were TINY so I used more)\n\n\t\t3-4 cups of cream (sure, you could use milk, but it won't be nearly as delicious)\n\n\t\ta tablespoon of butter\n\n\t\tgenerous pinch of red pepper\n\n\t\tcouple pinches Italian seasoning\n\n\t\tsalt and pepper to taste\n\n\t\toptional: green onions and bacon for topping/mixing in\nYou'll want to wash and then cook the potatoes before doing anything else.\nIn the microwave:\n\nPrick them all over with a fork, and put them on a microwave safe plate.\n\n\n\nDepending on your microwave, four potatoes should take around 20 \nminutes. 
I normally do ten minutes, flip them and check their doneness \nwith a knife, and then do an additional ten minutes if they're still \npretty hard, and a little less if they're beginning to soften. :)\n\n\n\n\n\nIn the oven:\n\n\nBake them at 350 F for an hour.\n\n\n\n\n\nOnce they're done, peel them and slice them in half to speed up cooling - and then leave them to cool a bit.\n . Make sure all potatoes are peeled and cut into chunks.\n\n\nIn\n a saucepan over medium heat, drop in the tablespoon of butter, the red \npepper flakes and Italian seasoning. Let the butter melt and stir the \nseasonings around until they start smelling nice. :). Add the potatoes to the pan and then add 1/3 of the cream. Use a whisk or a potato masher to break down the potatoes until you're happy with \nthem. I like my potato soup to be a little chunky. :)\n\n\n\nOnce the potatoes are mashed enough for you, add in the rest of the cream.\n\n\n\nBring\n this up to a slow bubble, stirring every minute or so. You'll see tiny \nbubbles start to form around the edges of the pot and the soup will \nbegin to thicken a little once it's nice and hot.. Grate your cheese into the hot soup. Stir it often to make sure the cheese melts evenly.\n\n\nOnce\n the cheese is in, the soup will be nice and thick. At this point, add \nsalt and pepper to taste, as well as extra red pepper flakes and Italian\n seasoning if you want them. You might find that the cheese overpowers what you put in before. :D\n\n\n\nGarnish with bacon and green onions if you're feeling up to it or just dig right in. 
Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 2, 0, 1]\nD: [3, 1, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_130_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_130_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_130_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_130_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [1, 2, 3, 0]\nD: [2, 1, 3, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Recipe ingredients:Pomegranate-Butter GlazeWe juiced our own but you can purchase these:2/3 cup fresh pomegranates 1/4 cup fresh cranberries.1/2 cup brown sugar1/2 cup honey1/3 stick of butter1 Tablespoon minced garlic cloves1-1/2 teaspoon Fresh Rosemary mincedTurkey rub recipe:4 Tablespoons fresh Rosemary minced3 Tablespoons fresh Oregano minced1/2 teaspoon Rubbed Sage1 teaspoon ThymeInfused butter and basting:1 stick of butter ( cut about 20 slivers from the stick for the infusing and use the rest of the stick whole for the basting. Freeze until needed for recipe. You will also need a container of chicken broth or make your own, 1 onion, 3-4 celery stalks, 3-4 Carrots Utensils:Roasting pan, juicer (optional), tongs, long fork, slicing/paring knife, carving knife, chef knife, cutting board, bowls, saucepans, measuring cups and spoons, turkey baster or ladle and a basting brush. . Wash and Prep:For even cooking of the bird set the turkey out and allow it to reach room temperature. 
Cut the butter into small slivers (approximately twenty 1/2 cm slivers and leave the remaining butter stick whole for basting and place them in the freezer until hard; this will make the cubes easier to insert into the bird and for easier basting later).Wash the berries, vegetables and herbs.Quarter the onionCube the carrots and celeryMince 1 Tablespoon garlic. Method for dry rub:Mince 3 Tablespoons fresh Rosemary and 4 Tablespoons fresh Oregano.. Method: Measure the dry herbs and mix thoroughly:1/2 teaspoon rubbed Sage 1 teaspoon Thyme Add Sage and Thyme, to the minced Oregano and Rosemary. Add salt and pepper to taste.. Method:Remove the neck and giblets. You can use these for the dressing or soups later.Cut slits in the turkey breast as shown using a slicing or paring knife: about 20 incisions.Insert the frozen slivered butter into the cut turkey flesh.Repeat this method across the turkey breast.Rub some butter in the turkey cavity. . Method:Insert and rub a small handful of the herb mixture into the cavity of the turkey. Using your hands rub the herb mixture all over the turkey as shown.. Tucking the wings helps prevent the wing tips from burning. This method works most of the time.Position the bird breast side up as shown.Lay the wing against the breast of the bird to its natural position.Taking note of where the wing tip is make a small incision about an inch and a half lower than the wing's tip taking care to only separate the skin from the flesh creating a small pocket for the wing tip to rest in.Tuck the wing into this pocket.. Make the stuffing according to the box instructions.Stuff the cavity of the turkey.Note: you can skip this step for a faster cooking turkey.. Method:Pre-heat oven to 350 F (for a faster cooking time; use 325 if you're willing to wait about an hour longer)Pour the turkey broth into the roasting pan and add the carrots, celery, and onion. Place the turkey breast side down into the roasting pan. 
This method will increase the moisture in the white meat as the fat from the dark meat renders and drains down. Midway through the baking process (approximately 1.5 to 2 hours for a 15 lbs bird) you will flip the bird breast side up. The full cooking time will depend on your birds size. Our 15 lbs / fully stuffed bird took about 3.75 hrs to fully cook.Baste the the turkey every twenty to thirty minutes for duration of cooking. We will glaze the turkey during the last 15 minutes of baking time.. Pomegranate Butter Glaze Recipe and method:We juiced our own but you can purchase these:2/3 cup fresh pomegranate1/4 cup Fresh cranberries1/2 cup brown sugar 1/2 cup honey 1 Tablespoon minced garlic cloves 1-1/2 teaspoon Fresh Rosemary1/3 stick of butterAdd ingredients to a saucepan starting with the herbs, seasonings, and butter. For a bit more rosemary flare add an additional 1 sprig of rosemary. Pour in the juice and bring to a simmer on a medium heat while stirring. About 2-4 minutes. Stir in the brown sugar and honey to make a syrup.Note: the syrup will tend to boil over the pans rim so do pay attention and stir continuously.Remove saucepan from the heat.. Using a turkey baster or ladle; baste the turkey using the pan drippings and then apply the frozen butter stick throughout the basting process every 20 to 30 minutes until midway through the baking time. Cover the butter bowl and freeze in-be-tween each basting process.. Remove the turkey from the oven midway through the baking time.Baste the turkey with the pan drippings and butter as shown. Follow the instructions in the next step before placing the turkey back into the oven to continue cooking. . 
Flip the Turkey using the tongs and a large fork.Baste the breast with the pan drippings and finish off with the butter.Return the turkey to the oven and finish baking time making sure to baste using the pan drippings and the butter stick every 20 to 30 minutes until the last fifteen minutes of baking time; at this point you will glaze the turkey with the pomegranate glaze.. Using a basting brush glaze the turkey with the pomegranate glaze as shown.Continue baking the turkey until done (approximately 15 more minutes)Glaze the turkey again after you remove it from the oven.Allow the turkey to rest for twenty minutes before carving.. As with all turkey recipes your guests' palate will be enticed by the savory aroma of the roast; tempting them from the kitchen. However, once the pomegranate-butter glaze hits the skin of the hot bird from the oven; the entire experience changes to a wonderful sweet and savory captivation of the imagination. The visual appeal of the deep mahogany feast yet to begin will certainly be a welcome table piece for your holiday.This will be a Thanksgiving experience my son and I will cherish because we enjoyed creating a culinary masterpiece from both of our recipes. One year we made a complete Thanksgiving meal using a toaster oven, grill, and a fire pit because we were camped out at my son's property with no working kitchen. My son made the turkey using a power drill to turn the pole! I wish to thank contributors for making Instructables such a delightful place to share! Have a very safe and happy holiday. Thanks for stopping by and I almost forgot to mention . . . this is an entry for the butter contest November 2014 and if you like this instructable your vote will be much appreciated! 
Thanks again.sunshiine~\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [1, 2, 3, 0]\nD: [2, 1, 3, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_131_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_131_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_131_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_131_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [0, 2, 1, 3]\nD: [2, 3, 1, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Don't buy store bought crackers, make your own! They are really easy to make and people will be shocked that you made them. These crackers are based off a recipe from a food blog that I LOVE.\u00a0 www.foodinjars.com\u00a0The possibilities are endless when it comes to making a cracker flavour. \u00a0I'm going to be showing you how to make one of my favourite combo's,\u00a0ground pepper & Thyme.\u00a0Pre-Heat oven to 450 degreesYou'll Need:\nParchment paper- for baking crackersMixer with dough attachment( no mixer- do it the old fashion way *HANDS*)Ground Pepper & Thyme crackers\n1 1/2 Cups all purpose flour\n1/2 Cup Cake Flour\n3/4 Cup warm water\n3/4 tsp. Salt\n1/2 tsp. black pepper\n1 tsp. fresh thyme ( you can substitute dried)\nAdd the flours, salt, pepper and thyme to mixer. Stir to evenly distribute seasonings. Slowly add the warm water and oil.\u00a0 Mix on medium until ball dough forms.\nIsn't this easy!. Remove the ball of dough from the mixer and bring over to your floured surface\nStep 1:\n-Knead the dough ball on a floured surface until it doesn't stick to your hands\n-Let dough set for 15 minutes\n-Cut dough in half. 
Step 2:\n-Roll out the dough with a rolling pin until 1/4 thick.\n-Cut out your shapes and lay them on your pan thats\u00a0covered with parchment paper and bake\u00a0in preheated\u00a0oven(450 degrees) for 8-12 minutes*Since this was for a Alice In Wonderland Theme Party, I used a ridged edged heart to represent the Queen of Hearts. I also made some little rabbits because they are great for kids and who doesn't LOVE the White Rabbit from the story.You Can also just use a knife or pizza cutter and cut out odd shapes if you don't want to use cookie cutters. (I'll be showing pictures of all 3 types this way you can see what each looks like)*If you want to add extra thyme on some of the crackers, lightly moisten the top of the crackers with water(NOT TOO MUCH)\u00a0and add more thyme. Press down on the thyme to make it stick.. What you'll need:\nHomemade Pepper & Thyme Crackers\nA tangy Jelly- I used a homemade Plum Wine JellyParmesan\nCheese cutter\nWalnuts - These are from my tree but\u00a0bagged store bought\u00a0are fine tooWalnut Cracker ( if needed)Step 3:\n- Spoon a tiny bit of jelly onto heart shaped cracker\n- Add a pieces of parmesan & walnut\n- Top with a sprig of fresh thyme.\nThe red jelly and white cheese goes perfectly for Queen of Hearts theme***NOTE****\nMake sure NO ONE is allergic to nuts at your party. Make some without nuts and serve on a separate tray making sure to label both trays. Also, make some with just Jelly,\u00a0 or\u00a0sliced turkey\u00a0& provolone or Swiss Cheese. Step 4:\n- Arrange crackers on a fancy tray and serve. Don't over load the tray as it could look messy and unattractive. Think less is more.\n- Have some serve the guests. Alice served ours...**The last picture is some of the appetizers without any nuts. 
These just had Jelly\u00a0 or Jelly with slcie Turkey and Provolone or swiss cheese.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [0, 2, 1, 3]\nD: [2, 3, 1, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_132_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_132_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_132_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_132_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [3, 1, 2, 0]\nD: [0, 3, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \n\tNeeded ingredients:\n\t8-10 peppers(Green pepper or red pepper are both ok\u00a0, but medium spicy is better)\n\tSoybean sauce,salt,MSG(optional),water,cooking oil\n\tCooking tools:\n\tA frying pan with a Turner\n\tA\u00a0plate\n\tStoves\n\tStart:\n\t\u00a0\u00a0 First,clean the peppers and cut off the stalk.Use clean towel to wipe the water on the surface of peppers, to keep the surface dry.\n\tTips:Remember you must clean up the mild water on the pepper!Because we will then\u00a0put them in the boiling oil\u00a0and fry them, if there is even a little bit water, hot oil will splash around, it's very dangerous!So please keep them dry!If you are the first time to fry,you'd better wear\u00a0gloves or long sleeves,to make you safe.:). 
\u00a0\u00a0\u00a0 Now we have prepared the ingredients.Then we will start to cook the peppers.Put the saucepan\u00a0on the stove,\u00a0turn on the heat.Put moderate oil into the pan,wait for seconds until the oil are 80 percent hot.(Picture 2)\n\u00a0\u00a0\u00a0 Then turn down the heat,put peppers slowly in the pan,and make them heat evenly.After all the peppers are appropriately placed,cover the pan with the lid,fry them for a short time,about 2 minutes.\nTips:At this step,as the oil is too hot,there will be some oil spots keep spilling out.Take care of yourself and don't worry,put the lid on.:P ). \n\t\tAfter about two minutes,one side of the peppers has already well-fried.Open the lid,use the turner to turn all the peppers to the other side,put on the lid and wait another 2 minutes,to make the other side well done,too.\n\t\tWhen another 2 minutes has past,open the lid,we will start to put sauces in the pan.Put the right amount of salt(Maybe more than half a teaspoon,I forgot XD,you can test it) and soybean sauce(about 20ml).Put a little extra water into the pan,and turn the peppers for several times,to make\u00a0the sauses\u00a0well mixed.\n\t\tTurn up the heat and put on the lid again,When the juice boils,turn down to low heat,heat gently until the juice was absorbed.\n\t\tPut some MSG in the pan.(optional)\n\t\tNow we have finished all the cooking steps!Let's put them on the plate and get ready to eat!:D. 
Put the dish out on the plate,and now we can start eating!Look at the finished plate,do you fell like\u00a0 to bite a pepper?=)To tell the truth,this dish is a perfect match to rice and porriage!The tempting smell can spread for several meters!The spicy peppers can also open your appetite for other dishes!\nSo if you have enough time and enthusiasm,especially you are a spicy loves,try to cook a\"Braised pepper\"dish!I'm waiting for the news of your success!\u00a0 \\(^o^)/\nauthor:yuhuaabc\u00a0\u00a0cook:my mum:)\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [3, 1, 2, 0]\nD: [0, 3, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_133_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_133_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_133_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_133_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [0, 2, 1, 3]\nD: [2, 1, 3, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. The ingredients for the pretzel pies is fairly easy to get, nothing weird or odd.You will need the following for a batchDough3 cups flour* ( might change a little depending on the flour*)4 tablespoons brown sugar1 tablespoon yeast2 teaspoons salt1 cup water (luke warm)Glaze1 egg1 teaspoon waterand pie filling! I used a can, but if you really want to, you can make your own (my way is easier). 
For this you will need some cooking weaponrymeasuring cups and spoons (duh)A heavy duty rolling pinA large cutting board( or something you can roll the dough out on)a big bowl for mixinga forka spoonOptionalA bread or dough machine, trust me, this makes it a lot easier to make. The dough is quite simple to make,\nadd the flour, water, yeast, sugar and salt together, and MIX! the dough gets really tough. The dough should end up fairly thick. If it is sticky, add a little more flour.\nWhen done, the blob of dough may not seem very large, but it will do a decent size batch of pretzel pie.. To being your pretzel pie making, cut off a golf ball sized chunk of dough.\nRoll the dough out to be longer and wider than your hand. It should be fairly thin, but still staying together well.. Before shaping, first take a spoon full of pie filling and put it in the middle of the flat dough. Make sure that it doesn't go close to the edges, otherwise it will come out the sides and get all messy. About a spoon full and a little bit usually does it.. The shaping is quite easy, take one side and fold it over the pie filling, press all around the edges to make sure the filling wont come out. Do the same for the other side. Press the side edges down and roll them up a little bit to make sure nothing will escape. pinch all the edges and putt it onto a pan, making sure that the long edge in on the bottom.. To make the glaze, combine one egg and one teaspoon water, and beat. Then brush this onto the tops of the pies. This makes it more pretzelish. If you dont have a brush then gently smear it on.. Before baking, make some quick slits on the tops of the pies, make sure the aren't deep, and dont press into the pie, this will make them turn out better. too deep and it might ruin it, and if you press down too hard, the sides might open up, leading to a gooey mess. Now comes the baking. Put them into the oven at 425 F. Set the timer for about 12-15 minutes. Be sure to check on the regularly. 
Once they start to brown, remove from oven, or else they might burn on the bottom.. Remove them and enjoy. They take a few minutes too cool down, and be careful the filling will be really hot!\nSome might not turn out as well as the others. but they will all taste just as good!. If you don't pie you can do many other things. You can fill them with cheese and tomato sauce, you can fill them with meat, you can fill them with just about anything. Or if you are really extreme, you could make pretzels!! Enjoy and share! \nSince pretzel pie is such a boring name, I am looking for a new name. I need a really good professional name for them, or a really funny name!\n\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [0, 2, 1, 3]\nD: [2, 1, 3, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_134_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_134_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_134_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_134_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [3, 0, 2, 1]\nD: [1, 0, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Only 2 basic ingredients for this simple recipe:handful of chia seedssweetened soy milk[OPTIONAL] any type of fruit!You'll also need a sealable container to cool the pudding - in the picture, I reused an old jam jar. . For one serving of pudding, pour a handful of chia seeds into your container. Right now, they're dry, but once you add in the liquid, the volume will double, sometimes almost triple!. Add in about double or triple the amount of soy milk as you did the chia seeds and stir. 
This is when the magic happens! After a couple minutes, the seeds will start to absorb the liquid, providing a gel-like \"pudding\" consistency. Now is a good idea to sample the pudding. If you decide it isn't sweet enough, add in some sort of sweetner (preferably honey or syrup, but sugar is fine if you make sure it dissolves). After you're satisfied with the taste, seal the container and put it in the fridge for a couple hours. An interesting fact - chia seeds can hold up to 12 times their weight in water! . When you take the container out of the fridge 2-4 hours later, the chia seeds will have absorbed enough of the liquid in the soy milk so that the pudding has a more solid, viscous, texture, like that shown in the first picture. To serve, scoop out the pudding into a small bowl, top with fruit and nuts, and Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [3, 0, 2, 1]\nD: [1, 0, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_135_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_135_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_135_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_135_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 2, 1, 3]\nD: [1, 3, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \nFor one 12 inch crust\n1 teaspoon corn meal\n1 3/4 to 2 \u00bc cups all-purpose flour\n1 envelope Fleischmann\u2019s Pizza Crust Yeast\n1 \u00bd teaspoon sugar\n\u00be teaspoon salt\n2/3 cup 120\u00b0 tap water\n3 tablespoons oil\n1 tablespoon chopped rosemary\u00a0\nPre heated 425\u00b0F oven. 1. Sprinkle the cornmeal over a 12 inch pizza pan.. 2. 
Follow the Fleischmann\u2019s Pizza Crust Yeast directions found on the package by combining 1 cup flour, the yeast, sugar and salt in a large bowl.\u00a0 Add the water and oil and with a spoon mix together with the spring onions for 1 minute until well blended.\u00a0 Gradually add \u00bd cup of the remaining flour until a soft dough ball is formed\u00a0 which will be sticky.\u00a0 Add additional flour if necessary to form the ball.\n3. Knead for about 4 minutes on a floured surface scattered with rosemary until smooth and elastic.. 4. Wet your fingers with tap water and press the dough into the pan mounding it slightly at the edges.\u00a0\u00a0 With your index fingers press the edges together to form a rim then prick the dough with a fork 15 to 20 times.. 5. Bake for five minutes on the bottom shelf of the oven.\u00a0 With a fork, pierce any bubbles that have formed.\u00a0 Allow the crust to cool long enough to handle \u2013about 5 minutes.. For the topping\n1 cup canned pumpkin (not spiced pumpkin pie filling)\n1 teaspoon Sriracha sauce\n\u00bd cup grated Parmesan cheese\n\u00bd teaspoon garlic powder\n\u00bd teaspoon onion powder\n\u00bd teaspoon salt\nchives\n6. While the pizza is par baking, make the topping.\u00a0 In a medium bowl, combine the pumpkin, Sriracha sauce, grated Parmesan cheese, garlic\u00a0 powder, onion powder and salt until thoroughly blended.. 
Spread the pumpkin mixture evenly over the crust then smooth it with an offset spatula.\u00a0 Using the tip of a paring knife, draw the outline of a basketball on the surface of the pizza.\u00a0 Press the chives into the outline making sure that there is a little overhang at the edges, since the chives will shrink while baking.\u00a0 Bake for 15-20 minutes until the crust is lightly browned.\u00a0 Give everyone a chance to ooh and ah over your work then cut into wedges and serve immediately.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 2, 1, 3]\nD: [1, 3, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_136_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_136_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_136_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_136_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [0, 1, 2, 3]\nD: [1, 3, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Mileage may vary, according to your fridge scraps, but we'd say the following make for an exceptionally good soup:One carton vegetable broth1/2 roast chicken - pulled/shredded or chopped1/4 cup chopped bacon1/2 white onion, diced1 glove garlic, minced1 cup frozen southern hash brown potatoes (or diced potato)1/8 cup dried lentils1/8 cup dried split peas1/2 cup frozen baby lima beans1/2 cup frozen corn1 tsp Italian SeasoningSalt & Pepper to taste3 tbsp olive oil. Add about 3 tablespoons olive oil to dutch oven and saute onions and garlic till tender and translucent.. Pretty straightforward. You can shred or chop the chicken as desired, then add to sauteed onion and garlic.. 
We never really would have thought to add bacon to chicken soup, but there it was - so we nuked our pre-cooked bacon a bit, then chopped it up and dumped it in! . Pour in your broth, stir and bring the whole thing to a light roil.. While your broth base is heating up, wash your lentils and peas, and then add those to the pot.. Add in your potatoes and limas, season as desired, and stir. We like to leave the corn till the end, so it doesn't overcook, and has that nice sweet firmer texture to it.. Set the pot over a back burner on low, and cover. Let the whole thing simmer 45 min. to an hour, or until lentils and split peas are tender.. Add the corn during the last 10 to 15 min. of cooking, stir and add a little more water if desired. . Enjoy your Scrap Soup as is or over some rice, and celebrate the tasty fruits of your refrigerator emptying labor! It's a delicious soup and the bacon really makes it! Add a little hot sauce for an extra kick.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [0, 1, 2, 3]\nD: [1, 3, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_137_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_137_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_137_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_137_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 2, 3]\nD: [0, 1, 3, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Ingredients:\n3 medium onions, chopped\n1 tablespoon butter or olive oil\n~10 tomatoes (big juicy heirloom varieties* are best!)\n~10 cloves garlic\n~1 teaspoon salt\nfreshly ground pepper\nhandful fresh basil, chopped\n1/4 - 1/2 cup heavy creamTools:\nlarge heavy-bottomed pot\ncutting board\nsharp chef's knife\nwooden spoon\n* I get my awesome heirloom tomatoes from Wild Boar Farms at my local farmers' market.\u00a0 They sell seeds online if you want to grow your own!\u00a0 Highly recommended.. Heat pot to medium-low heat, add butter or olive oil, add onions and salt, and saute until onions are soft and just starting to brown.. While onions are cooking, coarsely chop tomatoes.\nAdd them to the pot, and stir gently to mix.\u00a0 Use tomato juice to deglaze the bottom of the pot if necessary.. Mince garlic, and add immediately after tomatoes.\u00a0 Stir to incorporate.. Bring the soup to a simmer and maintain on low heat, stirring occasionally, for about 20 minutes.\u00a0 Tomatoes will soften, and the garlic will cook down.\u00a0 You're ready for the next step when it looks like this.. Coarsely chop and add the basil.\u00a0 Stir to incorporate.\u00a0 Turn off the stove.\nYou want to heat it just enough to wilt the basil, but no more, so be sure the other ingredients are sufficiently cooked before you add the basil.. \nIf you want to add cream to your soup, do so now.\nStart with 1/4 cup of heavy cream, stir it in, then taste your soup.\u00a0 Does it need more cream?\u00a0 Then add more to your taste!\u00a0\nThe first picture below has cream added; the second picture does not.\u00a0 I used roughly 1/4 cup cream, as I like mine very lightly creamy.. 
Sample your soup, and add more salt and pepper to taste.\u00a0 Add more cream if desired.\nIf you want an additional umami kick a bit of Worchestershire sauce can help, but it's not necessary if you use great tomatoes.\nServe warm if you've used cream, warm or cold if you haven't.\u00a0\n- Great with a garnish of grated hard cheese or a grilled cheese sandwich.\n- Fantastic in a bread bowl.\n- Tastes even better the next day!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 2, 3]\nD: [0, 1, 3, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_138_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_138_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_138_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_138_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [2, 3, 0, 1]\nD: [3, 0, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1. Three medium sized potatoes 2. Cooking oil ( I prefer grape seed oil) 3. Seasoned salt or your own spices 4. Onion, bell pepper, or other vegetables (all these are optional) 5. Utensils needed: cooking pot, tongs, fork, paring knife, cheese grater, oven mitts. Gather three medium size potatoes and wash them under running tap water. Place all of the potatoes in a medium sized cooking pan. Heat the potatoes at a low boil for approximately 10 to 15 minutes. When potatoes are done, you should be able to pierce them all the way through using a fork. Caution: Use tongs and oven mitts to handle hot potatoes.. \u00a0After boiling them, carefully remove the potatoes from the cooking pan using tongs. 
Put the potatoes in a medium bowl and place them in the refrigerator for twenty minutes to allow them to cool.. Once the potatoes have cooled, remove the potato skins with a paring knife.. Shred the potatoes over a container using a cheese grater to do so.. Once you have all the potatoes shredded, place them back into the refrigerator.. Place\u00a0 skillet upon stove and pour 1/3 cup of grape seed oil into the skillet. Heat the skillet at a medium heat until ripples appear on the surface of the oil. If bell peppers or onions are desired, using tongs, place them in skillet and cook them to desired tenderness. Use spatula to turn vegetables occasionally to avoid sticking. Use tongs to add shredded potatoes to skillet. Turn ingredients of skillet occasionally to avoid sticking. Caution: Be careful to avoid splashing hot oil on oneself. If seasonings are desired, add seasoning to skillet. Once all potatoes are a crispy looking brown color, carefully remove the potatoes with a spatula and place them on a paper towel covered plate. (the towel will absorb excess oil from food) Allow hash browns to cool until they are ready to serve. . 
Enjoy your hashbrowns without the need to tip or the restaurant price.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [2, 3, 0, 1]\nD: [3, 0, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_139_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_139_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_139_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_139_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [2, 3, 1, 0]\nD: [1, 3, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 3oz caster sugar4 oz pudding rice2 pints milk30g butter1/2 tsp ground nutmeg1/2 tsp ground cinnamon1/2 vanilla pod (or extract). Heat the oven to 140 degrees celsius.Melt the butter in a large casserole dish, add the rice and stir.Add the sugar and stir until dissolved. Keep stirring for a couple more minutes.. Stir in the milk.Add the nutmeg and cinnamon and stir.Slice the vanilla pod in half length ways and scrape out the seeds.Add to the casserole dish.Bring up to a simmer, then transfer to the oven.. The pudding will take about 90 minutes to cook.Use a teaspoon to try a little of the rice to make sure it's soft. You want it to be soft and for most of the milk to be absorbed.I find this is lovely with a few sultanas sprinkled on top. 
Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [2, 3, 1, 0]\nD: [1, 3, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_140_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_140_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_140_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_140_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 0, 1]\nD: [0, 3, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Things you need are:\n- 1 cup flour (plus extra for flouring surfaces)\n- 1/2 cup warm water\n- 1/2 packet active dry yeast\n- 1 tablespoon olive oil\n- 1/2 teaspoon salt\n- 1 teaspoon sugar\n- 1-2 stalks of basil\n- 4-8 cherry tomatoes\n- shredded mozzarella cheese (as much as needed). In a small bowl, dissolve the yeast in the warm water. Let sit for 10 minutes or so, or until it's very creamy.\nIn a larger bowl, add flour, olive oil, salt, sugar, and the yeast mixture. Stir well with a spoon.\nCover with a dishcloth for 30 minutes. It should have risen and almost doubled in size.\nPreheat oven to 350 degrees F.. Wash your tomatoes well and cut off the tops. Then slice the tomatoes into rounds.\nRemove the seeds (the goopy stuff in the center :)) and chop them.\nAlso chop 1-2 leaves of basil fairly fine. Then cut a few more leaves into larger pieces.\nNOTE: It's easier to get the seeds out of the tomatoes when they're riper.. Now here's the fun part! :)\nFlour your hands and the surface you're working on. (I used aluminum foil because I could easily transfer it onto a baking sheet.)\u00a0\nTake a piece of dough small enough to fit in your palm. 
Spread it out a bit on the surface and place one or two finely sliced basil leaves in the middle. Layer tomatoes and cheese on top, but leave room to fold the dough up!\nFold the dough into a round, dumpling like shape. See my pictures for help.\nWe're ready to bake!\nNOTE: Don't put too much basil IN the calzone, because when basil bakes for a long time, it becomes brown and not as pleasant to eat. We will put the rest on later.. Your oven should probably be done preheating by now. Place the calzones on a baking sheet and let them cook for 20 minutes.\nAfter this,\u00a0don't\u00a0turn the oven off! \u00a0Quickly (but carefully!) take the pan out of the oven and put a piece of basil on top of each calzone. Sprinkle cheese on them and put back in the oven for 2 minutes.\nDone!. I suggest plating them like in the picture below. That way, with the cut calzone people can add any fillings if they want. (Obviously you would have to put them on the table. :) )\nEnjoy your culinary creation!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 3, 0]\nC: [2, 3, 0, 1]\nD: [0, 3, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_141_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_141_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_141_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_141_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 2, 1, 0]\nD: [0, 1, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
For two people (easily multiplied):Leftover roast lamb in bite size piecesBasmati rice, 150 ml by volume, washed well and soaked in 300 ml water for half an hour, then drained250 ml stock (or water and half a stock cube)1 small onion, thinly slicedI clove garlic, grated1 teaspoon grated gingerChopped red chilli to taste (we like lots)Quarter teaspoon garam masalaHalf teaspoon salt (less if stock is very salty)Handful cashew nutsHandful coriander leaves (if you like)1 tablespoon rapeseed oilBlack pepper. Heat the oil in a heavy pan over a medium heat. Fry the cashew nuts until golden, then scoop out into kitchen paper. Fry the onion in the same oil for about 5 minutes, lowering the heat when it starts to brown. Stir in the garlic, ginger, chilli, garam masala, salt and drained rice. Fry for a couple of minutes, stirring to coat the rice with the oil.. Add the stock to the pan and cook gently for five minutes, stirring. Cover with foil and a lid and cook in the oven at 170 deg/150 deg fan/ gas Mark 4 for 10 minutes. . Add the lamb on top of the rice, recover and return to the oven for 10 minutes more. Stir the lamb in gently and taste to check the rice is cooked. Recover and leave to stand for 5 to 10 minutes.. Stir in pepper and coriander to taste, check the seasoning and turn into a serving dish. Garnish with more coriander and the cashew nuts. Enjoy! 
Yoghurt mixed with mint and some salt and pepper is a good accompaniment.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [3, 2, 1, 0]\nD: [0, 1, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_142_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_142_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_142_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_142_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [2, 1, 3, 0]\nD: [2, 0, 1, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. A cardboard coffee cup is your best bet.Do not use a ceramic mug or cup.And avoid using a wax paper cold cup or a styrofoam coffee cup.Lightly coat the cup with cooking spray, or wipe the inside of the cup with olive oil, salad oil, butter or margarine on a paper towel.. For a large egg, a seven ounce (200 cc) paper cup is about the right size.Crack the egg and pour into the oiled paper cup.Did I mention that the cup is paper?. Yah, I know yer mom always added a little milk.But wait until you try your egg with a splash of water.A trick I learned from a pro chef.Milk makes eggs a bit rubbery.Water makes them fluffy.Start with about a teaspoon (5 cc).You do not need bottled water -- the bottle is just to make the photo more clear.. Gently stir the egg and water together with a fork, spoon, or chopsticks.Your goal is to break the yolk, and stir it and the water into the white.But you want to feel a little springy body in the mixture.Do not whip into a foam.At this point you can add optional ingredients like a little grated cheese, chopped peppers, onions.. 
Fold over the top edge of the paper cup.Fold the corners back.Place in your home microwave and start the process.Watch out for office, dorm, or other high power professional style microwaves. You may be scraping your eggs off the oven ceiling.Nuke for 1 minute at 30 percent.This is also called power level 3, or defrost mode on some units.. Time for some geek stuff.The microwave uses bang-bang control. This just means that the cooking power is either on or off. No such thing as half power.Different microwaves use different schemes, but one of the most popular is to use short bursts of full power.You can test this yourself with a cup of cool water and a pencil and paper.Set the microwave for 3 minutes at power 3.The oven light will dim and the fan sound may change when the cooking power is on.Watch the seconds on the display to measure the time.Record the timing of the power bursts - it is easier to have a helper write them down as you call them out..Try different power settings and cook times and note the results.Always start each test run with cool water in a microwave-safe container. Ya don't want scalding water all over the place now do you? Likewise, never operate the microwave empty or you'll be buying a new one pretty soon.It is good, geeky fun to reverse engineer your microwave oven's power scheme.. So back to cooking the egg -- after the first heating wait ten seconds or so.Open the door.Do not open the cup.Pinch the folded edge shut and swirl the cup to distribute the heat.It should feel like half liquid with some cooked eggs floating in the center.Repeat the 1 minute at power 3 cooking process.Wait a few seconds after -- the egg is still cooking even after the power is off.Now open the cup, and peer anxiously over the edge.If it is still too liquid for your taste, close the cup, swirl it and heat it for 15 to 20 seconds on high.. In a rush?Scarf it right out of the cup with chopsticks or a fork or spoon. 
Watch out, parts of the egg may be scalding hot!Or pour it out onto a fancy plate and serve with toast, jam, spices, Tabasco, ketchup - whatever makes your taste buds tingle!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [2, 1, 3, 0]\nD: [2, 0, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_143_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_143_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_143_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_143_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [1, 0, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For this project you will need the following supplies... 2 Large cooking potsCooking stove (preferably outside in case you make a mess)Isopropyl alcohol Small glassForkStrainerPotato masher5 Gal. bucketMeasuring spoonAnd most importantly, Crab-apples!. One of the great things about crab-apples is that they contain a high concentration on pectin. By tapping into this natural goldmine of pectin, we are taking a tree/ fruit that is typically only used as an ornamental tree and using it to reduce preserve making costs while using all natural ingredients. In this step, you will need to pick a rather large amount of apples. You don't need to worry about removing the stems from the fruit as they can be added to the mash. Try to avoid leaves if possible. We spent about an hour collecting around 3-4 gallons which ended up making around 16 quarts of pectin.This amount will allow you to make about 350 oz or jelly or jam. . In this step you want to thoroughly rinse off the apples you picked in the last step. 
Be sure to remove any leaves from the mash as they won't add anything to your pectin. You can leave the stems on the crab-apples tough, there is trace amounts of pectin in the stems that we will be extracting. Once all of the dirt, bugs, leaves, and everything else is rinsed off of the fruit transfer it into your cooking pot for the next step. . This step we will begin cooking the pectin out of the fruit. So just put your filled with the apples and topped off with water on the stove and cook to a boil. You will want to leave the mash boiling for some time, around 45 minutes. The longer you cook, the more pectin you will be able to extract. Be careful not to let it burn though as the burnt flavor can carry into your jams/jellies. Once the fruit has been cooked, you can use the potato masher to smash the softened fruit to release the pectin. After smashing, let the pot cook some more. . Once you feel confident that the mash is thoroughly cooked, use the strainer to remove the liquid from the mash. This liquid is the remaining water with the pectin inside of it. You are now ready to test your homemade pectin. Be careful not to splash any on yourself or burn yourself on the steam. Both feel quite unpleasant :( . To test the pectin, pour a small amount of isopropyl alcohol into your glass. We used a shot glass as it is a prefect size for testing. Use the measuring spoon to transfer a small amount of the liquid pectin into the glass with the isopropyl. Use the fork to gently mix the two, and slowly remove the fork, pulling directly upward. The pectin should have gelled in the glass and stuck to the fork as a opaque slime like substance. If it does not, you need to return the liquid to the stove and continue to cook, checking again after another 10 minutes or so until the pectin has been cooked out. At this point, you are all done and can either add the pectin to your jams and jellies to be or you can jar it for the next time you make your preserves. 
Thanks for reading and I hope this guide has helped!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 1, 3, 2]\nD: [1, 0, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_144_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_144_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_144_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_144_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 0, 1, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. soda cancoat hangersmall hinge4 small screws and nuts ( 2 not shown)1 medium screw and nut (not shown)4 long bolts and nuts. Make a line about 2/3 through so that the bottom half is biggerThe edges are rough so cut some offdecide which is the front and bend over a bitIn the end where the iron will be placed (soda can bottom) make cuts and bend in for the iron to rest on. Make 4 holes for the bolts supporting it to go throughput the bolts through and screw on the nut. mark where the screws will go (on the top and bottom)put them in, and screw on the nuts (on the top and bottom)add Medium screw for handle. bend the coat hanger into a similar shape to this to fit on your grill (The king of random has a template for his Bitty Q in his mike hacks). 
Put your soldering iron in, turn it on, and start GrillingEnjoy!Please vote for me\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 0, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_145_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_145_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_145_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_145_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 2, 1, 0]\nD: [3, 1, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \nFirst you will want to chop up your onions to the desired\u00a0consistency\u00a0for your\u00a0lasagna. \u00a0I chopped mine into different sizes.\nSaute your onions and ground beef over medium high heat, seasoning with just a dash or two of garlic salt, until the beef is well browned.. \nUsing a bit of olive oil, generously grease each well of your cupcake pan.\nLay a\u00a0won ton\u00a0wrapper into each well and press down so that it covers the bottom and sides of the tin.. \nAdd a spoonful of your\u00a0sauteed beef and onions to each well on top of the won ton wrapper, being sure that it is all contained in the wrapper and none of it touches the edges of the pan.. \nAdd a spoonful of spaghetti sauce on top of each pile of ground beef, once again making sure that it is all contained in the wrapper and none touches the edges of the pan.\nSprinkle each cupcake with\u00a0Parmesan\u00a0cheese, garlic salt, and Italian seasoning. \u00a0(It is okay to get a little messy sprinkling these on!). \nCover each pile of meat, sauce, and seasoning with a fresh won ton wrapper. 
\u00a0Make sure to press down and seal all of the filling inside so that only the won ton wrapper touches the edge.\nRepeat your fillings, adding your meat , Parmesan cheese, garlic salt and Italian seasonings.. \nTop off each cupcake with a generous sprinkle of shredded mozzarella and a dash of garlic salt.\nMy favorite part of these is the extra-cheesy top, so don't skimp on the mozzarella!. \nBake these in the oven at 350 degrees for about 20-30 minutes or until the cheese is the desired level of browned. \u00a0\nMake sure to keep an eye on them, once they start browning the tops can burn fast!. \nLet the cupcakes cool for about 5-10 minutes before removing from the pan. \u00a0Serve hot immediately or store in the fridge for up to 3 days for easy-to-reheat dinners and tasty packed luches.\nWe enjoyed ours with a delicious fresh salad!\nFor more awesome recipes and fun projects, visit my blog, The Procrastibaker!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [3, 2, 1, 0]\nD: [3, 1, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_146_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_146_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_146_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_146_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 2, 0, 1]\nD: [1, 3, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Setup either a mincer or food processor with the cutting blades.I\u2019m using chuck beef; I find it has great flavour for a burger. Mince the chuck beef. I like my burgers to have a fat ratio of 70% meat to 30% fat, this will give you juicy, tender burgers.. 
Slice some cheese into strips. I\u2019m using cheddar but feel free to experiment with different types of cheese.Take a hand full of the mince and mould the first patty. Then layer out the cheese in two layers.Mould the second patty but make this one 10% largerThis will allow you to fold the edge down and seal the cheese inside.Place them in the fridge for half an hour just to firm up.. Setup the BBQ for direct grilling.. Make sure to rotate the patty minute.Cook for 2 minutes before flipping.Toast your bun now if you like.Cover with the lid for 2 minutes for beautiful melted cheese in the centre.Now it\u2019s time to make this burger.. Spoon of a layer of caramelized onions.Then add some mustard to the top half of the bun before placing it on top.. \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 2, 0, 1]\nD: [1, 3, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_147_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_147_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_147_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_147_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [3, 2, 0, 1]\nD: [1, 3, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Here you will find all the ingredients needed for our descent into the heart of dark chocolatey deliciousness. 
The dry goods1 tbsp(8g) cinnamon(not pictured)2 cups(200g) all purpose flower1 1/2(300g) cups sugar1 cup cocoa powder(I used the special dark variety)1/4 tsp(1.5g) salt3/4 tsp(3g) baking powder1 1/2 cups of cocoa nibsThe wet goods2 large eggs1/2 cup(118ml) milk1 cup(227g) of butter softened1 tbsp(15ml) vanilla extractToolsOvenCookie sheetMixing bowlElectric mixer or a lot of commitment and strong armsWhiskCooling rackSilicon baking mats or parchment paper.. Now is the time that was foretold of a time of much darkness. As prophesied preheat your oven to 350 degrees Fahrenheit(177c). The mixening is upon us.Set aside the Cocoa nibs. Then combine the remaining dry goods in a mixing bowl using a whisk to incorporate them. Your mix should be a pretty dull grey once everything is thoroughly combined. Once you have the dry goods mixed. Dump all of the wet goods into the bowl of your stand mixer or a separate mixing bowl and cream them together.Now that the wet goods are looking creamy slowly incorporate the mixed dry goods. By now you should have an almost black sticky dough. You will want to add a half cup of the cocoa nibs to this mix setting aside the remainder for the next step.. Using your hands take about a ping pong or golf ball sized hunk of black dough from the mixer. Then dip what will become the top of the cookie in cocoa nibs. Finally place it along with its' friends on to your baking sheet leaving some room between them non nibbed side down and get ready to put them in the oven.(Now if you are like me your oven has realized it is ides of March and it is time to betray you so there is now the optional step of fixing your oven handle so you can open the oven door. Yay!)Once you have repaired your oven go ahead and bake the cookies for 15 minutes. When the timer goes off remove your cookies from the oven and place them on a cooling rack.. 
Now that the cookies have cooled to the point where you won't burn your mouth the time has come to enjoy the fruits of your labor.Thank you for reading this instructable. I hope you enjoy the cookies as much as I do.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [3, 2, 0, 1]\nD: [1, 3, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_148_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_148_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_148_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_148_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [0, 3, 2, 1]\nD: [1, 2, 3, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For the Pie Dough:\n2 1/2 Cups of Flour\n1 Tablespoon Sugar\n1 Teaspoon Salt\n1 Cup of Unsalted Butter, Cold\n1 Cup Ice Cold WaterFor the Topping:\n1/4 Cup Melted Unsalted Butter\n1/2 Cup Cinnamon SugarFor the Whip Cream:\n1 Cup Heavy Whipping Cream\n2 Tablespoon Powdered Sugar\n1 Teaspoon VanillaFor the Strawberry Filling:\n1 Pound of Fresh, Cleaned Strawberries\n1 Cup Sugar\n1 Cup Water\n3 Tablespoons Cornstarch\n1-2 Tablespoons Corn SyrupNote: You can use any filling you want with these, I just happened to use a red filling to mimic ketchup.. First, sift together the flour, sugar, and salt into a large bowl and cut the cold butter into small cubes. . Combine the cut butter with the sifted flour, salt, and sugar.\nDo this by using either a pastry blender or a fork, cutting the butter into the flour until it is evenly distributed.. 
After the butter is combined with the flour, add 1/2 a cup of cold water and mix.\nAdd another 1/4 a cup of cold water and continue mixing until dough begins to form and hold it's shape.\nThis may take another 1/4 cup of cold water depending on factors such as where you live or how much moisture is in the air. . When the dough has started forming, start kneading it lightly to ensure the ingredients are fully combined.\nWrap the dough in saran and refrigerate for a half hour.. In the meantime you can make the whip cream and filling of your choice.\nTo make the whip cream, combine the powdered sugar, vanilla, and heavy whipping cream in a small bowl and use a beater or whisk to mix it until stiff peaks form.\nStore in a container and refrigerate until needed. . To make the strawberry filling, cut the strawberries into the desired size, though smaller cuts work better for dipping.\nIn a small sauce pan, combine your cut strawberries with your sugar, half the cup of water, and corn syrup.\nSet on medium to medium-high heat and bring to a simmer.\nIn a separate bowl, whisk together the corn starch and the other half cup of water until combined.\nAdd the corn starch to the strawberry mixture and keep on simmer for about 10 minutes or until you mixture begins to thicken.\nTake off the heat and put in a container to chill in the fridge. . After the dough has chilled, unwrap it and divide it into 2 equal pieces to make it easier to roll out. This will help to keep the pie crust tender. The more times pie dough is rolled out, the tougher it gets.\nRoll out one of the pieces. . Cut out various sizes of fries from the pie crust dough and place them on a greased pan.. Take the butter set aside for the topping and melt it.\nBrush the pie crust fries with butter and sprinkle with cinnamon sugar.\nThen place them in the oven preheated to 375 degrees to bake for 10-15 minutes until golden brown.. 
Let the pie crust fries cool and then serve them up on a plate with your favorite pie filling and some whip cream!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [0, 3, 2, 1]\nD: [1, 2, 3, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_149_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_149_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_149_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_149_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [3, 2, 0, 1]\nD: [1, 0, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Guests were coming for dinner. When we tried to open a bottle of red wine the cork crumbled and what you see here remained in the neck of the bottle outside the reach of this wine bottle opener and other similar corkscrew openers. Even if the openers we had could have reached the cork, crumbles of cork would have fallen down into the wine and we would have needed to strain the contents of the bottle. We considered pushing this piece of cork down into the bottle, but were concerned that it would tumble into the neck of the bottle and block the flow of the wine. Total removal was the goal.. I connected an air gun to an air compressor and attached a longer inflation needle to the air gun. This compressor does not have an attached tank. I set the air pressure for 100 psi. The cork came out quickly, but so did some wine, as you can see from the stains on the floor. Nearly the same amount of wine found the front of my shirt. You can see what was left of the cork on the floor. (Because my air compressor is tankless, far less than 100 psi. 
accumulated before the remainder of the cork was expelled.). Shown is the neck of an unopened wine bottle. The cork is one of the longer corks we have removed from a wine bottle. The longer needle is what I used to remove the remainder of the rotted and crumbling cork from the wine bottle we needed to open. Its threaded fitting is larger than a standard tire valve. The shorter needle is a standard needle for inflating a basketball using a bicycle hand pump or a small air compressor. Its screw fitting is the same as any Shrader tire valve, but it is too short to reach through a wine bottle cork. Also shown is a Presta to Shrader tire stem adapter.. The photo shows an old Presta to Shrader tire stem adapter I have for my bicycle. This adapter contains an \"O\" ring for a seal. I removed it with a pick. The tire stem adapter will be the right size for a bicycle pump when finished.. My tire stem adapter is corroded, but I need a bright surface for soldering parts together. I used a drill to clean the inside of the adapter.. I used some thin stranded copper wire to wrap around a piece of thin brass hobby tubing. The wire wrap will fill the space between the hobby tubing and the inside of the tire adapter.\u00a0. I used a small screwdriver to push the wire wrap into the cleaned opening in the tire stem adapter. I held the tire stem adapter in wooden vise jaws. I used a soldering gun at its higher heat to make the tire stem adapter hot enough for the solder to flow well and make a good seal. When cooled, the thin piece of hobby tubing is firmly sealed in the tire stem adapter. See the second photo. I used 100 grit sandpaper on a countertop to sand an oblique point onto the end of the needle so it pierces the cork more easily. Then I used a straight pin to make certain the hole in the tubing is fully open.. The thin brass tubing bends easily. Handle with care. I inserted it into the cork in a wine bottle. 
(This bottle has already been opened with an electric opener that made another hole all of the way through the cork. Air pressure did not remove this cork from this wine bottle because air escaped through the hole from the electric opener.). Attach the air hose from the pump or compressor. Hold the hose fitting so the brass tube does not bend or break. Secure the wine bottle so it does not tip over while opening it. Pump air into the bottle. My pump has a pressure gauge and it went to about 80 psi. before the cork began to move. When the cork moved, it moved quickly and came out of the bottle immediately. (Plastic 2 liter soda bottles have been tested and burst at about 120 psi. A glass wine bottle is stronger than a plastic soft drink bottle. People have been using commercial versions of air pressure wine bottle openers safely for years.)\u00a0 A long inflation needle like this makes it easy to reach down into the neck of a wine bottle in which the cork has begun to crumble. You may or may not want to open your bottles this way regularly, but it sure helps remove a cork that broke apart before it was fully removed.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 1, 2]\nC: [3, 2, 0, 1]\nD: [1, 0, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_150_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_150_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_150_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_150_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 0, 2, 1]\nD: [0, 1, 3, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
-Take 2 cups All purpose flour in bowl.Dissolve sugar in half cup warm water.-Add sugar dissolved warm water,oil and milk to flour and knead to soft dough. BASIC SHAPE NYAPSHA:-Flatten the dough using rolling pin.Cut longstrips,then make criss cross cuts to form diamond shapes as shown in my images.-Make a small cut at the center of each diamond. -Now insert the tip of diamond in to middle hole and make a twist.-You can also insert base of diamond in to middle hole and make atwist as shown in my images.. -Make similar process with all diamond shapes.. -Flatten the dough and make thin long strips-Join the tip of strips. -Make braid shape similar to how we braid our hair by swirling one strip above other.. -Here is the final braid shape. -Cut long wide strips and make cut in the centre. Insert the base of strip in to middle hole and make a twist. -Here is the final shape.. -Make long rope and join 2 ends-Then make swirls from one end and join the tip. -Fry all khapse in oil till they turn brown in color. 
-Enjoy crispy khapse with teaMy tip-Adjust sugar according to your sweetness.If sugar quantity is less sprinkle sugar powder after frying-Rolling should be thin and not thick for crispy khapse\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [3, 0, 2, 1]\nD: [0, 1, 3, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_151_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_151_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_151_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_151_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [2, 3, 1, 0]\nD: [1, 0, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. To make this drink you will need:1 cup rice (uncooked)5 cups water1 1/4 cups milk2/3 cup sugar1 teaspoon vanilla extract2/3 teaspoon ground cinnamon. Put the rice in a blender and blend for 1-2min. Add water and let it sit overnight, and then strain the rice out of the water. You can throw the rice away, but you need to keep the water.. Add the rest of the ingredients in any order and stir until thoroughly mixed. Then put it in the fridge and let it chill.. Your done! 
Have fun sharing this delicious drink with your friends!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [2, 3, 1, 0]\nD: [1, 0, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_152_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_152_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_152_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_152_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [3, 2, 0, 1]\nD: [1, 2, 3, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. The recipe I used is as follows.\n1 cup sugar\n1/2 cup butter\n2 eggs\n2tsp vanilla\n1 1/2 flour\n1 3/4 tsp baking powder\n1/2 cup of milk\nYou'll also need\nIcing sugar\nRaspberry Jam (some with seeds, some without\nDark and white chocolate\nRed food coloring\nA mold for the brains\nYou could also use any other recipe for the base, up to you. Cream sugar and butter. Add eggs and vanilla.\nCombine flour and baking soda\nAdd to mixture\nStir in Milk\nAdd desired amount of red food coloring\nCook 175c for 30 minutes . Let cupcakes cool for about an hour. I cored the cupcakes out and filled them with raspberry jam.\nI then capped them with the left over cupcake.. I added a small layer of white icing to help keep the brains in place and add a little contrast. I made the brains the night before as they required some time to set\nI melted some chocolate in a pyrex and added to the mold.\nI then place the mold in the fridge for about 3 hours to set\nI experimented with different shades of chocolate, and even painted on some red chocolate to help make the brains stand out more. 
Now, just place the brains in the middle of the cupcake.\nI heated up some seedless reapberry jam to pour over the brains to give the bloody look.. And that's it. now you can devour some zombie brains without the fear of being infected.\nI hope you have enjoyed this instructable,\nThank you\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [3, 2, 0, 1]\nD: [1, 2, 3, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_153_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_153_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_153_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_153_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [3, 0, 1, 2]\nD: [0, 3, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients:16 OZ Heavy cream (Cold)Material:Large mixing bowl Silicon Spatula Electric beater. Pour the cream in a large bowl and beat it using an electric beater till you can see white liquid in the bowl. It took me 20 minutes to get to this stage. Note : The stages to get to this stage is, first the cream got soft peaks, then stiff peaks, then the cream started turning a little yellow and then it gave out white liquid. This white liquid is buttermilk and can be used to prepare bread etc. Pour the buttermilk into another bowl This is our butter. But the butter might have some buttermilk left and if we do not remove all the buttermilk from the butter, the butter will go bad if not used within 2-3 days. 
In order to remove the buttermilk from the butter completely, add ice cubes in water to get ice cold water and pour 4 tbsp of this water on the butter and beat it again for 3-4 minutes and we will get white liquid. Discard the water. Repeat this process till we get clear water. Place the butter on a plastic wrap and roll to get a tight tube. Place in fridge for 1-2 hours and can use as required :). Ingredients Required:Softened Butter - 4 tbsp Parsley - 1/4 cup Garlic - 1 tbsp Lime zest - 1 tsp Salt - 1/4 tspSteps Required:Mix together all the above ingredients well Place the butter on a plastic wrap and roll to get a tight tube Place in fridge till required. Ingredients Required:Softened Butter - 4 tbsp Coriander Leaves - 1/4 cup Garlic - 1 tbsp Chilli Flakes - 1/2 tsp Smoked Paprika - 1/2 tsp Salt - 1/4 tspSteps Required:Mix together all the above ingredients well Place the butter on a plastic wrap and roll to get a tight tube Place in fridge till required. Ingredients Required:Softened Butter - 4 tbsp Chopped Pecans - 1/4 cup Maple syrup - 1 to 2 tbspSteps Required:Mix together all the above ingredients well Place the butter on a plastic wrap and roll to get a tight tube Place in fridge till required. Ingredients Required:Softened Butter - 4 tbsp Orange Marmalade - 2 to 3 tbspSteps Required:Mix together all the above ingredients well Place the butter on a plastic wrap and roll to get a tight tube Place in fridge till required. 
Ingredients Required:Softened Butter - 4 tbsp Chai Spice - 1/2 tsp (Ingredients required to prepare Chai Spice is below) Vanilla essence - 1 tspChai Spice :4 parts ground cinnamon 2 parts ground ginger 2 parts ground cardamom 1 part ground cloves 1 part ground coriander 1 part ground white pepperSteps Required:Mix together all the above ingredients well Place the butter on a plastic wrap and roll to get a tight tube Place in fridge till required\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [3, 0, 1, 2]\nD: [0, 3, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_154_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_154_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_154_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_154_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 2, 1, 3]\nD: [3, 0, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients\n1 1/2 cup of Sugar\n1/2 cup of Butter, softened\n1 teaspoon of Vanilla extract\n2 Eggs\n2 3/4 cup of Flour\n1 teaspoon of Baking soda\n1/2 teaspoon of Cream of Tartar\n1/4 teaspoon of Salt\n1 teaspoon of Cinnamon\na pinch of Cayenne pepper\n2 bars of Chili infused chocolate (I used Lindt, but if you want to make this from scratch it is roughly 7 ounces)\n2 teaspoons of Cinnamon\n2 Tablespoons of Sugar\na pinch of Cayenne pepperEquipment\nAn Electric mixer, either a stand or hand mixer would work fine\nA Chopping board\nA Large knife\nA Small extra bowl\nParchment paper or cooking spray. 
There are plenty of great ways to go about doing this, probably some easier than the way I ended up going about it, but this way seemed to work pretty well for me. If you know an easier way, please feel free to leave a comment on how you go about \"chipping\" chocolate.\nLie the chocolate bar on the cutting board. Use a long knife to press into chocolate, holding onto the handle and applying pressure on the blade. By using the knife like a see-saw, rock the pressure back and forth to cut the chocolate into strips. Turn the cutting board and using the same method, cut in the opposite direction until the chocolate is roughly chip sized. I like to vary the size of the chunks from very small to the size of a dime. I also like a lot of chocolate chips in each cookie, but feel free to vary the amount to your preference.\nIf you have a stand mixer or a second set of helping hands, you can do this while the dough is mixing.. Preheat the oven to 400 degrees F\nCombine the Sugar (1 1/2 cup), Butter (1/2 cup), Vanilla extract (1 tsp), and eggs (2). Mix well.\nSift the dry ingredients: Flour (2 3/4 cups), Baking soda (1 tsp), Cream of Tartar (1/2 tsp), Salt (1/4 tsp), Cinnamon (1 tsp), and a pinch of Cayenne pepper. Mix into the sugar mixture.\nAdd the chocolate and mix until integrated.. Mix the remaining ingredients in a small bowl: Sugar (2 Tbsp), Cinnamon (2 tsp), and a pinch of cayenne pepper. This will be the powdered coating to the cookies.\nThe dough isn't that sticky and can easily be handled. Form the dough into balls roughly the size of a ping pong ball. Roll each ball in the cinnamon sugar mixture.\nArrange balls on a prepared cookie sheet (spray it with cooking spray or use parchment paper) roughly an 1 1/2 to 2 inches apart.\nThis recipe should make 20-22 cookies.\nBake cookies for 8-10 min.\nWhen they are done, immediately move them to a cooling rack.\nLet them cool for a bit, then eat and enjoy. 
Cookie are the best warm out of the oven with a tall glass of milk.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 2, 1, 3]\nD: [3, 0, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_155_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_155_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_155_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_155_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [0, 2, 3, 1]\nD: [1, 3, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Pink Mambo Monsters (I swear this name will make sense in a few paragraphs)\n3/4 cup plus two tablespoons (or just under a cup) pink champagne\n1/8 cup soymilk\n1 teaspoon apple cider vinegar\n1 1/4 cups flour\n2 tablespoons cornstarch\n3/4 teaspoon baking powder\n1/2 teaspoon baking soda\n1/3 cup canola oil\n3/4 cup sugar\n2 teaspoons strawberry extract\nCombine all ingredients and mix well. The batter will get slightly frothy at first due to the carbonation in the champagne.\nPour into greased or lined muffin pans and bake at 350 degrees for 12 minutes (give or take). Remove and let cool. The cupcakes will be very very fluffy.\nA Word of Caution: These cupcakes were extremely boozy when I made them. If you want to cut down on the champagne, try 1/2 cup champagne and 1/2 cup soymilk.\nAfter you bake them, you may notice a slightly green tint. I\u2019ll be honest, I don\u2019t have the slightest idea why this happens. I have a theory about the baking soda and alcohol combining to form a weird chemical reaction, but maybe not. It\u2019s a mystery I guess. 
(see\u2026 you get why they are Pink Mambo Monsters now right? Pink Mambo because of the champagne, and Monster because they are now green tinted).. While your muffins are in the oven, cut six small strawberries in half. Make the chocolate ganache recipe below:Chocolate Ganache\n1 cup vegan chocolate chips\n1/4 cup soymilk\nsplash of maple syrup\nPlace all three ingredients in a microwave bowl and nuke it for about 30 seconds then stir. Heat in additional 10 second increments, while stirring in between, until melted.\nDip each strawberry half in the ganache then set on wax paper to harden. \u00a0. Finally, while your cupcakes are cooling and your chocolate covered strawberries are setting, make your frosting. I used strawberry buttercream which was really good. It wasn\u2019t super sweet and complimented the champagne flavor of the cupcakes well.Strawberry Buttercream Frosting\n1/2 cup shortening\n1/2 cup margarine (Earth Balance)\n3 1/2 cups powdered sugar\n1/4 cup soymilk\n1 1/2 teaspoons strawberry extract\nCream shortening and margarine together with hand mixer. Slowly add the powdered sugar a 1/2 cup at a time. Once combined, add in soymilk and strawberry extract. Blend on slow/medium speed for 5 to 7 minutes (trust me, you don\u2019t want to skimp on the 5 to 7). I added about a half a cup of fresh strawberries and folded them into the frosting as well.\nIf you want to add the strawberries, reduce the amount of soymilk just a tad. The juice from the strawberries will add too much extra liquid to the frosting and make it runny, but decreasing the soymilk will leave the frosting a bit stiff in prep for extra juice.. Once you have your cupcakes, frosting and chocolate covered strawberries set, you have to assemble them. Because I had chunky frosting from bits of strawberry, I found it easier to cut a hole in the top of my cupcake and fill it (think of it as cutting off the top of the pumpkin). 
Cut the hole in a slightly angled fashion so you have a cone shaped hole (this will prevent you from having to try and detach the top for the bottom and mashing your cupcakes)\nFill the hole with frosting then put the top of the cupcake back on. I did shave a little extra cake off the top piece as to not squish the frosting out the sides. Add a dab of frosting to the underside of a strawberry half and stick it on top of the cupcake.\nAnd there you have it - booze cakes.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [0, 2, 3, 1]\nD: [1, 3, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_156_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_156_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_156_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_156_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [0, 3, 2, 1]\nD: [1, 2, 3, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. MIX INGREDIENT TOGETHER (butter should be at room-temperature and not melted, cut in little pieces).\nTry to touch the dough less possible and do it fast, don't make it warm, or the crust will lost his crunchiness (if you have a marble counter use it).\nMAKE A BALL and put it in transparent film, so it don't dry. Put it in fridge for at list 2 hours (or it will shrink in baking pan), better over a night.. BIT BUTTER AND SUGAR TOGETHER, until they become a smooth paste.\nADD ALL THE OTHER INGREDIENT and steer really well, until everything is smooth.. 
take out of the fridge the ball of dough half an hour before so it soften.\nPUT IT FLAT AND ON THE SIDE IN a PAN.\nMAKE LITTLE HOLE WITH FORK everywhere in the the bottom of dough (so it wont make air boubble when it cook).\nPUT FILLING IN.\nCOOK FOR\u00a0\u00a0 MINUTE AT\u00a0\u00a0 DEGREE.\ntake it out and let it cool down.. PUT RASPBERRIES ALL OVER YOUR CAKE.\ndo a glass of gelatin with water and sugar (read instructions on gelatin packaging) and wait few minuts for it to become a tiny bith thick, but not all the way.\nPUT GELATIN ALL OVER THE RASPBERRIES.. refrigerate cake for 5 hours to a night.\nthis cake is delicious also the day after.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [0, 3, 2, 1]\nD: [1, 2, 3, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_157_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_157_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_157_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_157_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [0, 2, 3, 1]\nD: [3, 0, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Cut the layer in half. Attach it to 1/2 of an 8\" cake board with white buttercream. Stack the 2 half layers and refrigerate for 15 minutes.. Stack 2 10\" cake layers with white buttercream on a 14X14\" cake board. We are going to add our half 8\" cake layer in the back of the 10\" cake. Mark where the cake will sit. Use milkshake straws to add support in that area. Cut them off at cake level.Add buttercream in that area.. Stack the cake layers. Then give the entire cake a crumb coat.. You need 4 sugar cones. 
Take one of the sugar cones and using a serrated knife cut the end off the cone. Then using buttercream attach a sugar cone to the cone you just cut. This will make a taller mountain. Cover all the sugar cones with buttercream and attach them to the cake. Put them in the refrigerator for 15 mins. Then you can add more buttercream to make them look more like mountains. Then frost the cake completely with white buttercream. . Using an offset spatula, mark out a stream running from behind the far right mountain. Make it wider as it approaches the cliff to make a waterfall. Then make a rounded lake at the bottom of the waterfall. Leave the buttercream rough. Then use blue and white piping gel, frosting in an up and down motion to create the waterfall. . Frost the cake board. I made snowy evergreen trees for the cake. I have a video for making them very easily. Bring some of them down onto the cake board. I spread some sugar crystals around the cake and board fro some sparkle. For Elsa's Ice Castle, I made some light blue hard candy and broke it into pieces. Then add your Disney Frozen characters. 
You could also make this cake for other themes as well by switching out the cake toppers.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [0, 2, 3, 1]\nD: [3, 0, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_158_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_158_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_158_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_158_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [0, 3, 1, 2]\nD: [1, 0, 3, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. -6 small red apples (with stems)-3/4c sugar-1/3c light corn syrup-2-3 drops red food gel-edible red glitter-candy thermometer-flavoring *optional*. Put your glitter into a bowl and set near your cooking area, if you have a friend, extra hands definitely help with this recipe. . Put all of the ingredients, except for the red gel and any flavorings, into a pot and bring to a boil. It's a lot easier if you use a candy thermometer to keep an eye on the temperature. You want the sugar to hit the \"hard crack\" stage, usually 300-310 degrees. Just before it does, add your coloring and flavors. . As carefully as you can, swirl each apple one by one in the sugar mixture and then transfer quickly to the bowl of edible glitter and coat as well as you can. Set aside on parchment and let cool. 
\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [0, 3, 1, 2]\nD: [1, 0, 3, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_159_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_159_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_159_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_159_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [1, 2, 3, 0]\nD: [0, 2, 1, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Complete ingredient list:\n1 pound sole or other white fish, cut into medium-small chunks or slices\n15\u00a0 jalape\u00f1o peppers (3 for the marinade, 12 for the sauce)\n1 habanero pepper\n1 cup cider vinegar\n1/2 cup Chardonnay\n3 Tb. soy sauce\n5 or 6 bamboo skewers\n1 cup all-purpose flour\n2 tsp. coriander\n2 tsp. cumin\n1 tsp. ground black pepper\n1 cup rice vinegar\n1/2 cup chopped onion\nJuice of 1 large lemon\n1 Tb. olive oil\n1 tsp. salt\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Jalape\u00f1o\u00a0 Marinade:\n3 jalape\u00f1o peppers\n1 habanero pepper\n1 cup cider vinegar\n1/2 cup chardonnay\n3 Tb. soy sauce\nWater (optional, if the marinade doesn't quite cover your fish). Chop and seed peppers, and discard stems.\u00a0 Using a food processor or blender, blend peppers, cider vinegar, Chardonnay, soy sauce, and black pepper until smooth.. Slice the fish into medium-small slices, and submerge in the marinade (that's the green stuff in the bowl).\u00a0 Leave it in the fridge several hours, or overnight.\u00a0 If you don't give the fish enough soaking time, it won't pick up very much flavor.\nMeasure out and mix together:\n1 cup flour\n2 tsp. 
coriander\n2 tsp. cumin\n1 tsp. ground black pepper\nBreak 3 eggs into a bowl with 1 Tb. water and whisk until smooth.. Rinse skewers and set aside (they don't need to be soaked).\nCoat each piece of fish in the flour-spice mixture and transfer to a plate\nFor the second coating, completely cover each piece in the beaten egg, and then roll in breadcrumbs.\u00a0 (The breadcrumbs in the second photo are Kikkoman brand panko crumbs.). Preheat oven to 375 degrees Fahrenheit,.\nThread the pieces onto skewers, about 4 per each.\nBake fish on a large baking sheet for approximately 30 minutes.\u00a0 Fish should be moist and slightly flaky when done.\u00a0 Be careful to not over-bake, especially if your fish is thinly sliced.. If you're speedy, you can put this together while the fish is baking.\n1 T. olive oil (at least)\n12 jalape\u00f1o peppers, seeded and chopped\n1/2 cup chopped onion\n1 tsp. salt\n1 cup rice vinegar\nJuice of 1 large lemon\nSaut\u00e9 onions and peppers in the olive oil over medium heat until they just start go go limp (a little bit of brown is okay).\u00a0 Pur\u00e9e them together using a blender or food processor with the salt, rice vinegar and lemon juice.. Serve your Tortured Sole by drowning it in copious amounts of jalape\u00f1o sauce. 
and dousing with sour cream.\u00a0 A side of strangely-colored rice spiked with cilantro leaves\u00a0 is an ideal accompaniment.\u00a0 \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [1, 2, 3, 0]\nD: [0, 2, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_160_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_160_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_160_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_160_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [1, 0, 3, 2]\nD: [3, 0, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients:1) 3-4 cups of glutinous short-grained rice. Note: basmati, jasmine or other longer-grain and non-sticky rice will not work! 2) 1 packet of Hainan Chicken Rice seasoning - you can find it in most Asian grocers or easily make it from scratch using garlic, ginger, onion, sugar, oil and chicken stock cubes)Preparation:Wash and rinse the rice in a metal bowl until the water inside the bowl is clear Fill up the rice cooker with water, about 300 ml or so. Put a little more to be on the safe side if you're unsure what is the right amount. . Steaming:1) Place the metal bowl of rice inside the rice cooker, ensure the rice itself isn't immersed in water, cover the lid and let it cook for 25 mins. Normally, rice is cooked by pouring water into the rice and letting it boil. 
However, this will make the rice too wet even after extensive drying, leading to uneven clumps of fried and not-so-fried crispies as you can see in the latter picture.2) After 25 mins, open the lid and thoroughly mix in the full packet of Hainan Chicken Rice seasoning into the bowl of rice. Close the lid and let it steam for another 25 mins. The rice should be sticky and slightly hard. . Drying:There are 3 methods to dry the rice, via mother nature, an oven or using a dehydrator. If you live in a climate with dry and hot temperatures, drying rice in the sun is fast, effective and easy. Layout circular clumps of rice on a tray and let it sit in a full day or two of sun and it will turn into hard and dense clumps. Just beware that freshly cooked rice is decently attractive prey for all forms of wildlife including birds, ants, cats, dogs, hungry family members, you name it and it will pose a threat. A mesh screen and water moat will help keep away hungry intruders. For every other type of climate, a cheap dehydrator or oven works just as well. Put the oven on low to low-medium heat for at least 4-6 hours. I used a dehydrator as it lets me control the exact temperature of 60-65 Deg Celsius and I leave it on overnight for 12 hours. Feel free to experiment with shapes and sizes but note that the bigger or thicker the shape, the longer and more uneven the drying will be. Here are a couple of close-ups of pre and post dried rice crispies. . Frying:1) Fill a pot with some vegetable oil and turn on high heat to bring the oil to a boil2) Fry one or two crispies at a time for about 15 seconds, it should puff up and turn golden brown pretty quickly. Take care to avoid frying crispies to touch each other as they stick to each other very easily and transform into mega crispies. Smaller shaped crispies or a wider pot helps. 3) Place the fried crispies on paper towels to soak up excess oil and let the crispies cool down to room temperature. 
They should get more crispy as it cools down and let it dry thoroughly before serving or storing in an air-tight container. 4) When serving, it's great to pair the rice crispies with Hainan Chicken Rice chili, which is easily found at Asian grocers, online or make your own recipe. Bonus points if you serve it with sweet dark soy sauce (a mixture of water, thick dark soy sauce and rock sugar) and minced garlic/ginger for the complete chicken rice condiment combo. If the crispies are well prepared - it can keep for at least a month in a container. I currently have 2 containers with crispies made 6 weeks ago and they are still in good shape. Enjoy making and eating a new Singapore-style snack! \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [1, 0, 3, 2]\nD: [3, 0, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_161_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_161_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_161_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_161_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [3, 0, 1, 2]\nD: [1, 3, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Apples! Lots of them. Fill up your crock pot! (These should be sweet apples; or otherwise you'll have to add sugar later.)\nCrock pot\nWooden spoon\nSpices- I used ginger, cinnamon, and clove\nBrown sugar, if you want to sweeten it\nWater. 
Wash them core them, slice them.\nYou don't have to cut them any smaller than slices; they'll cook down.\n(Cut them smaller if you're in a hurry; they'll cook faster.)\nIf you want, peel the apples.\nI didn't, and I haven't found the skins to be obnoxious; rather, it adds texture.\nHowever, if you want to take the time, go ahead and peel the skins off.. Put all of the cut up apples in the crock pot.\nAdd a little bit of water. This is just to keep the apples from sticking to the bottom; we'll try to evaporate it out later.\nCover crock pot and cook on high for 3 hours, stirring occasionally (once an hour or so).. At the end of three hours, your apples should look something like the picture- they're starting to disintegrate.\nFeel free to help them along with the wooden spoon.\nYou can also add spices at this juncture. I used lots of ginger, a fair amount of cinnamon, and some clove.\nIt's to taste; add whatever spices you want. You can also add sugar if you want a sweeter, more caramel-gooey apple butter.\nTurn your crock pot down to Low and keep cooking, stirring intermittently, until you have first applesauce (lighter colored disintegrated apples) then apple butter (darker colored from caramelization).. Once you have a satisfactorily caramelized apple butter, you could be done!\nIf you want to condense it so that it's thicker, you could leave it on for a few more hours without the lid until enough of the water has evaporated.. The serving suggestion pictured here:\nApple butter spread on buttered multigrain toast, topped with honey Greek yogurt and nutmeg. 
The plate is drizzled with organic molasses.\nYou should also try apple butter on vanilla ice cream.\nIf you made a lot, apple butter is traditionally preserved in jars.\nIn case you get sick of eating it, apple butter makes a nice gift for friends and family!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [3, 0, 1, 2]\nD: [1, 3, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_162_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_162_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_162_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_162_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [0, 1, 2, 3]\nD: [3, 2, 0, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. *NOTE* to save time you can use all frozen vegetables or omit some vegetables. This recipe is very versatile and you can change it as much as you like!! *NOTE*- 4 Cups COOKED short grain rice (2 cups of uncooked rice = 4 cups cooked rice)- 2-3 Tablespoons butter (you can use oil but butter gives it a lovely flavour)- 3 Eggs, whisked-1 large onion, diced- 2 sticks of celery, chopped- 2 Carrots, diced- 2 Garlic cloves (or 1 large), crushed- 1/4 Green cabbage, sliced thinly- 1/2 Cup frozen peas- 1/2 Cup frozen corn- 2 Teaspoons oyster sauce- 3-4 Tablespoons of soy sauce- Spring onion, to serve- Sweet chilli sauce, to serve. - Chopping board- Cutting knife- 1 Large frying pan/ wok- 1 Small frying pan (or you can use the same pan to cook the scrambled eggs in)- Frying spatula/ spoon- Garlic Crusher- Fork- Small bowls- Serving bowls. 
The first step is to prepare the vegetables, it is simple and can be done in advance to save time, if not it only takes a few minutes to chop and cut all the vegetables.Firstly wash the vegetables under cold water to remove any dirt, then finely dice the onion to your preferred size, the smaller they are the less noticeable they are in the rice. Then dice the carrots finely, chop the celery and finally thinly slice the cabbage, if you prefer large chunks of cabbage then slice it thicker. Then finally crush the garlic in a garlic press.*NOTE* if you are serving this to children you will want to chop the vegetables as fine as possible so that it is harder to notice. *. To make the scrambled eggs crack the 3 eggs into a small bowl and whisk them together until they are uniform in colour. Then heat a small pan to medium high and place the eggs into it and cook them for about 2 minutes while stirring them to form scrambled eggs. Then place them on a side plate and break it up into small pieces, leave them to cool while you cook the vegetables.. Heat the wok onto a medium high heat then once the pan is hot add the butter and allow it to melt completely. Then add the diced onion to the pan and cook until the onions become transparent and just start to brown. Then add the garlic and let it cook for about 1 minute.*NOTE* do not add the garlic while cooking the onions as the garlic can burn and give the dish a bitter taste. Then add the diced carrot, celery and cabbage, stirring for about 2 more minutes until the vegetables slightly soften.. Once the fresh vegetables are cooking in the pan add the pre-cooked rice and stir it for about 1 minute to break it apart. Then add the frozen peas and corn, scrambled eggs, oyster sauce and soy sauce and stir the rice and vegetables until they are evenly coated with all the different sauces. Try a small amount of the fried rice and season it accordingly (ie. ass more or less sauce until it tastes yummy!!). 
Also make sure that the rice is heated evenly, when it is take it off the heat and get the serving bowls!. Once the rice is cooked place it in the serving bowls and add any toppings that you like. My suggestions are freshly chopped spring onions with sweet chilli sauce on the side. This is a great dish as it can be a meal on its own or a side dish for meat. It is delicious, quick and you can use any vegetables in your fridge!Food pairing suggestions:- Serve with Teriyaki chicken- Serve with steak- Serve with Tofu- Serve as a side dish for a group dinner\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [0, 1, 2, 3]\nD: [3, 2, 0, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_163_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_163_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_163_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_163_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [2, 0, 3, 1]\nD: [1, 0, 2, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 400 gr spaghetti500 gr ground beefTomato sauceSalt and pepper. Cook the spaghetti in boiling water with some salt. Stir every 2 minutes and drain when it's al dente (8-10 minutes).. Put the ground beef in a saucepan in medium heat. Cook it and cut it with the scoop to make it in little pieces. When it changes its color let it cook for five more minutes in low heat.. Pour the sauce in the cooked beef and stir. Add some salt and pepper and let it boil for 3 minutes.. Put spaghetti on a dish and use a spoon to put some bolognese on top of it. You can use some parmesan cheese to give it an extra special flavor. 
This pasta goes great with garlic bread and salad. \nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [2, 0, 3, 1]\nD: [1, 0, 2, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_164_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_164_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_164_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_164_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [2, 0, 3, 1]\nD: [3, 0, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Clean the Habanero Peppers by washing them under running water in a Colander and\u00a0Drain.Pick off any stems.Roughly Chop Peppers and Garlic just to make it easier to blend. Habaneros are ridiculously hot (100,000 to 350,000 on the Scoville Scale), so removing some of the seeds and the white membrane inside the habaneros can kick back some of the heat. For my sauce, I just removed half the seeds from the peppers by running the inside of the peppers with cold water.\u00a0. Put the chopped peppers, and garlic into the blender.Add A Pinch of SaltPour 1/2 Cup of Water and Apple Cider Vinegar\u00a0into the blender as well.\u00a0 I like using Apple Cider Vinegar, because it adds more of a fruity flavor to the sauce that complements the Habanero. It also helps darken the sauce a bit to make it similar to regular store bought Sriracha Sauce, but you can always use White Distilled Vinegar if you want.\u00a0Blend for 5 minutes, until everything is a smooth puree.\u00a0. 
Pour the mixture into a saucepan on medium heat.Wait till the mixture comes to a low simmer.\u00a0Slowly stir in a 1/4 cup of sugar till it dissolves.\u00a0 I used Sugar in the Raw, because that's what I prefer. Most people use brown sugar to deepen the flavor in their Sriracha sauces, but any kind of sugar here is fine.\u00a0Simmer for 20 minutes or till the mixture reduces by 1/3.\u00a0Skim off any foam that collects on the top, and discard foam.\u00a0. Pour reduced sauce into the blender again, and let it cool down for a few minutes.\u00a0Blend for another 5 minutes to further break down the warmed peppers.\u00a0. Pour the mixture through a fine mesh strainer.\u00a0Push the mixture through with a spoon to drain all that hot sauce till all that's left is dry pulp and seeds. If the mixture at this stage is still not the consistency \u00a0you want. You can add more water and vinegar to thin it out, or reduce further on a saucepan on low heat to thicken it up.\u00a0. Most recipes call for Sriracha to be fermented for a few days, but I didn't see any need for it. You're welcome to try though. This homemade Habanero Sriracha sauce is way more hotter than usual store bought Sriracha, but it's still really good. It has the same texture, the garlic-y, sweet and sour notes of regular Sriracha sauce, but has a brighter, more vibrant, and fresher color and taste. The habaneros add a slightly more fruity, and citrus notes, but use it like regular Sriracha sauce, and you will not be disappointed.\u00a0 That is, If you still have your tastebuds left after trying some. 
\u00a0 This should hold for about 5-6 months.\u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [2, 0, 3, 1]\nD: [3, 0, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_165_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_165_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_165_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_165_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 3, 0, 1]\nD: [1, 3, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \n As for Ingredients, this is as SIMPLE as it gets... It's only *REALLY* got three essential ones... the rest are just pomp and glory... delicious pomp and glory\nWatcha gunna need:\n\t\tIcing sugar (300g, BUT I would have extra, you'll see why later)\n\t\t1x Egg white\n\t\tPeppermint flavouring (The natural stuff is best, but go with what you can get)\nPomp and Glory:\n\n\t\tOrange food colouring, or Orange sweets for his nose\n\t\tSilver Catering balls for his eyes\nUtensils:\n\t\tScales for weighing out the Icing sugar\n\t\tSpatula, or spoon\n\t\tBig bowl\n\t\tPlate\n\t\tCocktail stick. See this REALLY annoys me, when someone makes something for you to CONSUME and they don't bother to wash their hands before hand... So I have decided to make a point of a few things that TV Chefs or whatnot don't seem to be able to do\n# Remove any rings/ Jewellery that may come in contact with the food\n# Wash your hands before you cook\n# After handling Eggs, its good practice to wash your hands (Stops the spread of Salmonella and such)\n# DO NOT Lick or taste off a spoon THEN put it back into the bowl you took the sample from... 
EWW... If I wanted your germs I would kiss you!\nBefore I start sounding like a rambling germ freak, I'm sure you'll agree this is all common sense... But some people just don't ...get... that its not hygienic. \n Firstly, measure out your 300g of Icing sugar and sieve it into your large bowl... I didn't sieve it, because I'm an idiot... I spent AAAAGES trying to mix the lumps out *sigh* seriously... sieve... It will save you loads of hastle.\nNow, crack your egg, and seperate your white from the yolk...personally I like to crack my egg in half, and pass the yolk from one half to another so that the white lands into my mix... there are other ways,\n\n\t\tlike cracking the egg onto a plate and manually lifting out the yolk with a spoon...\n\t\tOr... Pouring the egg into a sieve so that the white runs through the holes and the yolk stays in the sieve\u00a0\n\t\tBuy a carton of eggwhite\nWhichever you decide to use... pop your eggwhite into your Icing sugar.. It is at this time, that you need to add your flavouring... Add 2-3 cap fuls of your flavouring, but if you think it doesn't taste strong enough... add a dash more in until your happy!\nNow, using your spatula, or whatever stirring implement you have on hand give the Goop a good old mix... And if your feeling particularily inclined... dive in with your hands it's quite cathartic!. Now in step 1 I mentioned about having extra Icing sugar....\u00a0 Well...this is where it comes in (maybe)... Now not all eggs are a standard size and weight... so your mix (or hereafter refered to as Goop) may be a little bit runny, and may not hold together like it should... My Goop was abit of a runny mess, so I added little ammounts of icing sugar until it became a firm white ball. that doesn't feel wet to the touch.\nIF you add too much icing sugar... add a tiny bit of liquid (egg white, water, or more flavouring) but I mean TINY... we don't want to be constantly adding more of one thing then another. 
I found at this point that since I had been kneeding the Goop with my hands It was a little too warm to model, So I bunged it in the refridgerator for about 5minutes to firm up.\u00a0 You should be able to roll small balls of the mix and have it hold together. Damnation, part of my I'ble didn't save... right lets try and remember what whitty comment I made here\nNow, I'm not going to teach you how to suck eggs, as I'm assuming that most of you at some point in your lives will have made an ACCTUAL Snowman. What you need do, is to roll out a seris of balls...\nthe one on the bottom must me slightly larger than the one which will form the head... Stick one to the other and repete until your bored ;)\nIf you intend to get creative and make your decorations with the Goop, remember to leave some to one side for that task.. Now you have a little army of blind Snowmen, don't you think it's time we gave him some eyes, and a nose?\nOr well... Now's the time to go mad with your decoration... I was getting sleey at this point so I stuck on some little silver balls for eyes, and cut up some orange sweets for a nose... BUT you could draw it on with icing, or eddible ink... Or make him a scarf... I don't know, do what makes you feel happy ;)\nTo make the eyes and nose stick, I made a little hole with a toothpick and jammed them in the hole.\nIf you are making your own nose, take your left over Goop, and mix in a little orange food colouring, roll into a carrot shape and stick it to the front of your snowmans face.\nYou will have to work quick as I found that my Goop went hard very quickly... but then again I was working quite slowly. ;). \nThis is a last minute thought.\nPossible Variations:\n\t\tMake round disks of your goop, add a small ammount of green food colouring and half dip with milk chocolate.\n\t\tCut out into festive shapes and hand out as peppermint thins\n\t\tCompletly dip in chocolate and serve as a cheep alternative to After Eight mints\n\t\tGet gorey... 
Make into an eyeball shape (make two balls, one white, one red... or fill the inside with red jelly or cornstarch blood) Paint on an Iris and a few veins and hand out at Halloween!\nGo nuts and enjoy people :). Well, now, It's best to leave your Snowmen to set over night, either in the Refrigerator or somewhere equally as cool...\nThen when theyre set, hand them out to your nearest and dearest... or just gobble them yourself. My collegues loved them, and I'm sure your mates will love them too\nFor other Ideas go and look at my other I'bles. If the I'ble needs explaining or altering let me know... Vote, Reply, Rate and Enjoy\nCheerio\nBiggsy\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 3, 0, 1]\nD: [1, 3, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_166_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_166_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_166_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_166_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [1, 3, 0, 2]\nD: [2, 0, 3, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. So, how do you make this wonderful stuff? The ingredients are as follows:1 750 mL bottle of grain alcohol ( Everclear or similar, also known as rectified spirit--as long as it's potable, strong, and unflavored you'll be fine)Zest of 8 lemonsSugarWaterSimple, yes? Oh, you'll also need a glass jar in which to keep the stuff. Be sure you have lots of spare room, as you'll add more liquid later. Mine is two liters, and works great.You want to get the strongest alcohol you can get your hands on. 
Vodka, even the 100 proof stuff, isn't sufficient. In some states, such as Nevada, you can get 190-proof Everclear, which is 95% ethyl alcohol. Alas, California isn't one of them, so I'll make do with 151 proof (75.5% alcohol, which is still pretty stiff). You can as well, but go with the high-test if you get it. You'll dilute it down to something drinkable later; right now we need a strong but potable nonpolar solvent, and high-proof alcohol fits the bill. I understand an old catalog came with a disclaimer that Everclear was to be used \"for the production of homemade cordials,\" or some such, which is exactly what you're doing here.. First, wash the lemons thoroughly. A produce brush helps a lot with this. Some folks use a special-purpose fruit and vegetable wash solution to get them super-clean, but I've never been one for such luxuries.Next, zest the lemons. For those of you who aren't familiar with the process, lemon peel consists of two layers: zest and pith. The pith is the inner, white part, and the zest is the outer, yellow part. You only want the zest, because the pith is bitter and will impart that bitterness to your limoncello. Therefore, be careful that you don't get any bits of white in your zest.There are a lot of ways to zest lemons. Going from low-tech to high, they're as follows:A knife. You can zest lemons with a knife, but it needs to be small and very sharp, and you need to be careful with it. Blood in your limoncello is not cool, no matter how much of a goth you are.A potato peeler. Some people like these, but they probably have sharper potato peelers than I do. The first time I made this stuff, I tried this but then switched to a (just-sharpened) knife. Then I bought . . .A lemon zester. Mine's a knock-off of a nice ergonomic model from Zyliss and also includes a channel knife so you can make twists too.A Microplane or similar fine grater. 
This might be the ultimate zesting tool--I've heard people say they make it much easier, and they certainly look like they would, but I don't have enough use for one to justify dropping $15 or $20 on it. (Edit: On the recommendation of nattles, below, I have purchased a Microplane grating rasp, and it is everything a grater should be. Strongly recommended.)Keep in mind that smaller bits of zest will give you more surface area, and therefore more chance for the lemon oils to dissolve into the alcohol. Knives and potato peelers will each give you little chips of zest, whereas the zester will give you thin strips, and the Microplane very tiny shreds. I'd go for the lemon zester if you didn't have anything more specialized; it should only cost five bucks or thereabouts. Or if you want to splash out a bit more, get a Microplane rasp.. Next, pour the alcohol over the zest and wait a month or so. Keep the jar in a cool, dark place, and shake it every so often to mix the lemon zest around. In the meantime, maybe you could make lemonade or lemon chicken or something with all the lemons you have. Be advised that they'll spoil much sooner without their zest, so you'd better get to juicing pretty quickly.. OK! It's been a month or so, and the alcohol has taken on a very bright yellow color. This is just what we want--it shows us that the lemon oils have left the zest and entered the liquid. Now it's time to take out the lemon zest. If it's done, the booze should be lemony and the zest very pale and somewhat more brittle. This is about right.. Remember when I said we'd dilute it down to something more reasonable? Now's the time. I used 4 cups of water and 2-1/2 of sugar, which is a decent starting point. You may want to add a bit more sugar-water if you used the high-test Nevada Everclear instead of the weak stuff we get here in the California Republic, but it's easy enough to adjust the strength later. 
(Edit: I have a batch in now that I'm planning on preparing according to Alain80's recommendation below of a 1:1:1: ratio of alcohol to water to sugar (one gram of sugar per one milliliter of water/alcohol). I'll post my results here once it's done.)Anyway, heat the water on the stove and stir in the sugar. You don't need to boil the water, but you do need to get it hot enough so the sugar dissolves. Stir it frequently until it turns clear. The sugar-water will be markedly more refractive than plain water, because of all the dissolved sugar, but you should be able to see the bottom of the pan clearly.There's an argument that I should have taken pictures of making the syrup for completeness, but dissolving white powder in clear liquid to make another clear liquid is the sort of thing even the dimmest Photo 102 student would recognize as \"not visually interesting.\" My pedantic side demanded one, though, so it's in this batch as well.In any event, that's it! You can drink it as it is, but it'll improve with a month or so of sitting. It won't freeze unless you added a lot of water, so feel free to keep it in the freezer. Good luck!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [1, 3, 0, 2]\nD: [2, 0, 3, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_167_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_167_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_167_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_167_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [3, 2, 0, 1]\nD: [3, 1, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Most of these ingredients can be found around the house without much trouble.Ingredients:-Milk (best with 2% or whole)-Sugar-Cocoa Powder-Pure Vanilla Extract-Small Pot-Mug-Tablespoon-Stove . First off fill your mug about 3/4 full with milk. I'd say about 1 1/2 cups worth. From here pour from the mug to the pot.. Get about a tablespoon or so or cocoa and put in the pot with the milk. You can add more or less depending on what you like.. Get your pure vanilla extract and put in a drop, and only a drop. You can fill up the cap to get the exact right amount. You only need a tiny bit. . Get about 2 tablespoons of sugar and add it to the mix.. Turn your stove on high and start to stir. You want to keep it from getting to a boil. To get the cocoa to mix in, mash it up against the side of the pot with the back of your spoon and rub it in.. When it seems hot enough to you, (usually when it starts to steam) pour it back into your mug. Add whatever you want, like marshmallows, although i just like mine plain. . Drink up and Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 3, 0]\nC: [3, 2, 0, 1]\nD: [3, 1, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_168_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_168_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_168_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_168_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [1, 2, 3, 0]\nD: [3, 1, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
You will need the following: *Jello (light-colored Jello such as Lemon or Lime will work best for this project) *16oz of tonic water *Stove *Liquid measuring cup *Small pot for boiling water *Mixing bowl *Mixing spoon *Refrigerator *Small Table Lamp with fluorescent blacklight (The blacklight must be fluorescent and not simply a colored bulb. It can be purchased at most Wal-Mart locations for around $4.00). Measure out 8oz (1 cup) of tonic water in a liquid measuring cup.Safety: Use caution when removing the lid from the tonic water. If the water has been shaken, the lid will shoot off and overflow. Allow for the water to settle if the bottle is under pressure. . Pour tonic water into pot. . Put pot on burner and turn on high. . While water is boiling, pour Jello packet into mixing bowl. . Once water has boiled, pour the boiling tonic water into mixing bowl.Safety: Pot will be hot! If the pot is too hot to grab, use a potholder to remove the water from the burner. . Stir together Jello and boiling tonic water, making sure that the Jello powder fully dissolves. . Add one cup of cold tap water to the mixing bowl.Alternate method: If you would like your Jello to glow more brightly, add one cup of cold tonic water instead of tap water. However, doing this will make the Jello taste more bitter than normal. . Place the mixing bowl in the fridge and chill for four hours. . After four hours, remove from fridge. Turn out all lights and turn on the lamp with the blacklight in it. 
You should have glowing Jello!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [1, 2, 3, 0]\nD: [3, 1, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_169_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_169_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_169_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_169_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [3, 0, 2, 1]\nD: [3, 1, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Here's what you'll need to create these delectable appys and be assured of an invitation to all future parties:\n20 to 25 jalapenos peppers\n2 packages of cream cheese (softened)\n1 package of powdered ranch dressing mix\n1 cup of shredded cheddar cheese\n1 pound of bacon (you need one slice for each pepper)\nThe ingredients in the photo don't reflect the amounts needed for a full recipe as I was only making partial batches for testing purposes.. Put the cream cheese in a bowl and add the cheddar cheese. Sprinkle the ranch dressing mix over the cheeses and mix it all up with a fork. You can also use a food processor for this step but if you leave the cream cheese out until its room temperature its easy enough to do by hand. I like the texture of the cheese better if you don't chop it all up with the food processor.\nSet the cheese mixture aside while you do the next step.. Ok the first thing is to remove the stems. You don't want to cut them off because you need to keep the end intact to keep the cheese from melting out when you cook them. 
So just pop them off with a twisting motion.\nThen you need to cut the peppers in half lengthwise like in the picture. If you hold the pepper with the curved ends facing up you'll have a better looking end result and they'll lie down better.\nNow you've got to get the seeds out. I have found that a grapefruit spoon with the serrated end works really well for this and is a lot safer than using your fingers. Nevertheless, remember to wash your hands often. Those little alcohol pads will work the best if you can get your doctor's office to donate some. Trust me, if you have to go to the bathroom during this process you will want to be very careful about washing first. Enough said.\nNotice in the pictures I have them all cleaned out and the ends of the peppers are still intact. You have to leave a little bit of the white pithy stuff there and some of the seeds will want to hide up in there. That's ok, a couple of seeds will just make them interesting.\nNow you have a choice to make. Wild or mild? Mild will mean almost everybody will enjoy them and compliment you on what a great cook you are. Wild will mean most people will eat one and gasp and you might end up with leftovers.\nIf you want your peppers to be kind of mild, go ahead and scoop the cheese mixture into the pepper halves. Don't overdo it or the excess will just melt out onto the broiler pan which is just a waste of the good stuff.\nIf you want people to know these are jalapenos then you need to get some of those seeds you just scraped out and mix them up with the cheese. I can't tell you how much to use but a little goes a long way. I've tried tasting the mix but it changes after its cooked and also some peppers are hotter than others. If you use all the seeds I can assure you that only the strongest will survive the experience. 
Not really, they are still really good but most people will steer clear of them because they will be very hot.\nI got tired of the comments from some people about they aren't hot enough. So I started making both. Truth is I agreed with them. The mild ones can be disappointing if you like spicy stuff. So I do both.\nStuff half of the peppers with the seedless mixture and then put a few scoops of seeds into the remaining cheese and stuff the rest. I keep the hot ones separate from the mild ones so I can sprinkle paprika on them to mark them. The paprika doesn't change the flavor much but it works to warn the meek.. Now let's wrap this up. With bacon.\nYou need as many slices of bacon as you have peppers. We're going to cut the bacon in half like in the picture and use the half slices to wrap each one.\nHow you wrap them is important. I always start with the leading edge of the bacon just under the bottom of the pepper and hold it there while you wrap the rest around ending up usually on the top. It depends on how fat the peppers are. You can stretch it some to make sure it lays across the top so it doesn't uncurl when you broil them.\nGo ahead and wrap all the peppers and put them on a broiler pan. You can use a baking tray also but the broiler pan works best to drain away some of the bacon grease.\nLeave a little space between them or the bacon won't get done. The ones on the top row here are too close together.\nIf you made some wild ones now is when you sprinkle the paprika on them. You can see from the pictures it marks them pretty well.. Set the oven rack onto the second level down. You don't want the peppers real close to the heat or the tops will be really crispy before the rest of the bacon gets done.\nPut the peppers in and turn the oven to broil and close the door. If you're freaked out about how done your bacon needs to be then you might even put the shelf farther down and just bake them in a really hot oven. 
I prefer the peppers to still have some firm texture to them so I like to broil them quickly.\nTen minutes seems to work the best in my oven, yours may vary. The tops of the bacon are a little crispy, the sides are well done.\nTake them out of the even and let them sit a couple of minutes to let the cheese set.\nPut them on a tray and have at it. You are now the MVP of the super bowl party. (Unless somebody else brings a keg or something.)\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [3, 0, 2, 1]\nD: [3, 1, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_170_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_170_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_170_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_170_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 3, 1, 2]\nD: [3, 0, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For this project, you don't need that much, just some basic cooking ingrediants and tools.\nThe ingrediants* for the cupcakes are:\n- 2 1/4 cups Flour\n- 1 1/3 cups Sugar\n- 1 cup Milk\n- 2 Eggs\n- 1 teaspoon Vanilla\n- 3 teaspoons Baking Powder\n- 1/2 cup Shortening\n- 1/2 teaspoon Salt\n- 2 Lemons\n- 3 Tea Bags\n- Food Coloring\n* You will need to split the ingredients in half to make two different batches. 
With the exception of the lemons and teabags, all of the ingredients are going to be split to be used for two different batches.\nThe Ingredients for the Frosting are:\n- 3 - 4 cups Powdered Sugar\n- 1/8 cup Milk\n- Teaspoon of Vanilla\n- 1/2 stick of Butter\n- Food Coloring (green)\nThe Materials are:\n- Assorted Measuring Utensils\u00a0\n- Mixer\n- Bowls\n- Cupcake Pan\n- Paper Liners\n- Spoon\n- Knife\n- Juicer\n- Oven. The first step is to put 1 1/8 cups of\u00a0flour, 2/3 cup of sugar, 1 1/2 teaspoon of baking powder, and 1/4 teaspoon of salt in a mixing bowl. Mix together. Then, add 1/4 cup of shortening, 1/2 cup of milk, and 1/2 teaspoon of vanilla. Beat all of the ingrediants together for about one minute. The next step is to add one egg to the mixture. Beat with mixer for around a minute. Then, mix on the high speed for a minute. The next step is to take two lemons. First, use a grater to grate the lemon peel to get lemon zest. Add that to the mixture. Then, take the two lemons and cut them both in half. Collect juice from them using a juicer. Add to cupcake mix, and beat with the mixer for around 30 seconds. The final step is to take yellow food dye and add that to the mixture. It depends on what shade of yellow you want, but I added around 20 or so drops to get the color I wanted.. The first step is to put 1 1/8 cups of flour, 2/3 cup of sugar, 1 1/2 teaspoon of baking powder, and 1/4 teaspoon of salt in a mixing bowl. Mix together. Then, add 1/4 cup of shortening, 1/2 cup of milk*, and 1/2 teaspoon of vanilla. Beat all of the ingrediants together for about one minute. The next step is to add one egg to the mixture. Beat with mixer for around a minute. Then, mix on the high speed for a minute. The next step is to add food coloring. I added 1 drop of blue, 1 drop of green, 2 drops of red, and 5 drops of yellow to get a pinkish-brownish color.*Before you do anything, heat the 1/2 cup milk until hot to touch. 
Then, put the three tea bags of your choice in the milk and let them steep for ten to fifteen minutes. Remove tea bags before putting the milk in the cupcake mix.\u00a0. Once you have both of your cup cake mixtures, it's time to start baking them! Preheat the oven to 350 degrees (Fahrenheit). Next, grab a cupcake pan and line it with the paper liners. Then, fill it one third of the way with one of the mixes. I choose the iced tea mix to go on the bottom, but it doesn't really matter. Then, fill it up another third with the other mix, so for me, the lemonade mix. Leave about one third left, because the cupcake rises. When done filling them up, throw them in the oven for about 20 minutes. \u00a0A good way to tell if they are ready or not is to poke the center with a toothpick and if it comes out clean without anything sticking to it, the cupcakes are ready!\u00a0. After the cupcakes have cooled, it is time to frost them. The recipe is pretty simple. First, melt half a stick of butter in the microwave so it becomes soft. Then, add 1/8 cup of milk, 3-4 cups of confectioners sugar (depends on how dense you want the frosting), and one teaspoon vanilla. Blend together with mixer. Once done, add several (10-ish) drops of green food coloring. Mix that in as well so the frosting becomes a nice grass color. Once the frosting is completed, spread generously over the cupcakes. Wait for frosting to dry once done.. Once the frosting has cooled, you can now decorate your cupcakes!\n-To make a sandtrap, simply take brown sugar and put some on.\n-For a pond or lake, take a little of the left over green frosting and mix it with some blue food coloring.\n-For the rough, take a toothpick, put some green frosting on it, and put it on the cupcake. Then make movements up.\n-For the golfball and the hole, take a mini marshmallow, knead it until it becomes sticky, and coat it in confectioners sugar. For the hole, take a toothpick and dig a little one. 
Then, put white sprinlles in it.\n-For the flag, take a piece of paper and cut a triangle. Tape or glue the triangle on a toothpick.\u00a0\n-For the Happy Father's Day, I took the leftover blue frosting used to make the ponds/lakes and added more blue food coloring and some red food coloring to get a dark purple. Then, I took a toothpick, dipped it in the frosting, and used that to write the words.. Once the cupcakes are done, bring them over to your dad and thank him for all he's done. Enjoy this treat together for a memory that will last forever! Enjoy!\nThis is a great project that the whole family can enjoy making for their Dad. Kids love to decorate cupcakes, especially of they get to eat them too! Take the family out golfing or mini-golfing and bring these too for a perfect Father's Day!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 1, 3]\nC: [0, 3, 1, 2]\nD: [3, 0, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_171_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_171_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_171_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_171_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 3, 1, 0]\nD: [2, 3, 0, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Clockwise around Rim:2 dried red peppers, stemmed 6 white cardamom pods 6 green cardamom pods 3 black cardamom pods 1/2 teaspoon ajowan seeds 1/2 teaspoon charnushka seeds (also known as nigella or kalonji) 1 teaspoon cumin seed 1/2 teaspoon fennel seed 1/2 teaspoon fenugreek seeds 1/2 teaspoon blade mace 1 clove 1/2 teaspoon ground nutmeg (I used about half of the nutmeg piece in the picture) 1/2 inch Ceylon cinnamon stick 1/2 inch Sumatran cinnamon stickCenter:1/2 teaspoon each pink and black peppercorns. Preheat a pan on medium. Once hot, add all of the spices *except* the blade mace and the nutmeg. Toast for 90 seconds or so, then dump the mix into a bowl to cool. Immediately after dumping, return the chilies, cinnamon, and cardamom pods from the bowl to the stove. Toast for another 3-4 minutes, until the chilies start to blacken a little. Open the cardamom pods, and add the seeds inside to the spice mix along with the chilies and cinnamon.Let it cool down completely before grinding.. Open a 16 oz can of chickpeas, and rinse thoroughly under cold water, agitating by hand. As you mix it up, the skins will begin to come off.Put down some paper towels in a rimmed sheet tray, and pour out the chickpeas. With more paper towels on top, begin rolling the chickpeas around. This will simultaneously dry them a bit and help remove the skins.. Put the cooled spices into a spice grinder, plus the grated nutmeg and blade mace. Grind until fine.In a bowl combine:Chickpeas 1 tablespoon olive oil 1 tablespoon sweet smoked paprika 2 tablespoons garam marsala Healthy pinch of sea salt.Mix well.. Have a rimmed sheet pan preheated inside a 400 degree F oven. Add the chickpeas, avoiding clumps and multiple layers.Cook for 20-30 minutes, shaking the pan occasionally. 
When you pull them out, sprinkle with kosher salt.They are good finger food, or mixed in with some Greek yogurt.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 1, 2]\nC: [2, 3, 1, 0]\nD: [2, 3, 0, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_172_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_172_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_172_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_172_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [1, 2, 0, 3]\nD: [2, 0, 1, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Recipe: For 6 smaller fluffy vegan pancakes!Ingredients: Dry ingredients:1 cup + 1/4 cup (131 gr) sprouted whole spelt flour (I used Rude Health) 2 teaspoons baking powder Wet ingredients: 1/2 cup (137 gr) apple sauce 1/2 cup unsweetened soy milk 1/2 teaspoon home-made vanilla extract 1 tablespoon agave Passionfruit glaze:3 ripe passionfruit, each cut open, seeds & flesh spooned out 1 tablespoon agave. Place dry ingredients & wet ingredients in this order into your Vitamix container. Place fitted lid & tamper in. Blend on high-speed until fully mixed. This took me about 10 seconds. Remove lid & tamper. Your pancake mix will be thicker, like this, see photo above!. Place ripe passionfruit seeds & flesh into fitted cooking pot & add agave. Stir & heat up. Simmer for about 3-4 minutes until it forms like a gel aka jam like consistency aka glaze. Taste. It is a bit sweet & you really can taste the full passionfruit flavour & that is what you want. Turn heat off & keep warm while you fry your pancakes. it will look like this:. Shape & fry your pancakes. 
Take a small pancake pan & smear it in with a fruity oil. Heat up on medium-high. Spoon 2 big small spoonfuls of the dough into your pan & flatten it all out with the back of your spoon to form quickly a roundish shape for your pancake. Wait until bubbles appear into the surface of the pancake & carefully flip your pancake over with help of a pancake spatula & fork. With your pancake flipper, push the upside of the pancake down for an equal fry. When done, place onto a plate & repeat the process. I oiled my pan again & again. When all the pancake batter has been used up, serve at once, like photo above & enjoy! 2 pancakes will be enough because they really fill you up & that is what you want!The pancake itself is a bit sweet, thick & fluffy & the passionfruit glaze on top is also a bit sweet & so addictive as well.You can also read this tasty post here on my blog: http://sophiesfoodiefiles.wordpress.com/2016/07/12/vegan-pancakes-passionfruit-glaze/\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 2, 0]\nC: [1, 2, 0, 3]\nD: [2, 0, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_173_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_173_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_173_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_173_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [2, 3, 0, 1]\nD: [0, 1, 3, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Ingredients for Orange Cake:All purpose flour - 1 1/2 cupOrange juice - 1 1/4 cupOrange zest - 1 tbspBaking soda - 1 tspBaking powder - 1/4 tspSugar - 1/2 cupSalt - 1/8 tspCanola oil - 1/3 cuplemon juice - 1 tbspVanilla extract - 1 tspIngredients for Orange Icing:Icing sugar - 1 cupOrange juice - 1/4 cupOrange zest - 1 tspFor baking9 inch springform Pan. Grease the pan with butter and add some all purpose flour in it. Swirl the pan to flour coat the bottom and sides of the pan.. preheat oven to 375 in bake mode.Take all the dry ingredients in a large mixing bowl. whisk until all incorporated.In a separate bowl, take all the wet ingredients, zest of Orange and mix all well. Now add wet mixture into the dry mixture. Mix all together without lumps.Now pour the cake batter into the prepared cake pan. Tap on the counter top, to release the air bubbles.Bake the cake for about 20 to 25 mins. Check the cake after 20 mins of baking. Insert a toothpick in the centre of cake and it should comes out clean. If it's sticky in the toothpick, bake for 2 to 3 more mins. Once it baked, remove carefully from the oven and let it cool down for sometime.. Take a cup of icing sugar in a bowl, add a tbsp of Orange juice at a time. Mix and check for the right icing pouring consistency.Add orange zest to it and mix well.Orange Icing is ready! It is soooo yumm!!. Once the cake is completely cooled down, take out the cake from cake pan and place it in cake stand/plate. [ I inverted the cake for smooth flat base]Pour the icing in middle of the cake and use butter knife or back side of knife to spread the icing all over the cake. I left the sides of cake as it is. [you can apply icing if you need]Decorate the cake with sliced oranges and orange zest.Before you cut the cake, make sure the icing sets well. 
Happy Baking :)\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 2, 3]\nC: [2, 3, 0, 1]\nD: [0, 1, 3, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_174_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_174_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_174_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_174_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [3, 1, 0, 2]\nD: [1, 2, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Roughly chop one onion.The reason for sitting the pork belly on a bed of veg is to lift it off the pan and it adds a little bit of flavour to the meat. Next roughly chop a carrot. Spread the veg across the tray.. Gather together a few metal skewer and tape them together. Using the skewers begin to poke holes into the skin, this will help the skin crackle evenly.Once the skin has been completely poked, cut the pork belly in half.. Then sprinkle over a pinch of salt.Next place the pork belly on the tray lined with veg. Spread salt across the skin, this will also help the skin to crackle.. Add a few pieces of wood to the charcoal to add some smoke.. Cook for a total of 4 hours.Two hours into the cook, make sure to turn the pork belly so it evenly cooks.. Take it off the bbq and let it rest uncover for 20 to 30 minutes.. You'll need lettuce, cherry tomatoes, sliced avocado and kimchi which is salted and fermented vegs. The kimchi adds an amazing flavor which cuts through the richness of the pork belly. You can find it in most Asian supermarkets.. 
Lay out the pork as you would a sandwich.First, add a layer of lettuce.Then a layer of cherry tomatoes.Follow with a layer of kimchi. Finally, a layer of avocado slices. To top it off spread over some garlic aioli.. There you have it! A Pork Belly Burger, the twist is the pork belly is the bun!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [3, 1, 0, 2]\nD: [1, 2, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_175_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_175_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_175_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_175_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [0, 3, 2, 1]\nD: [1, 3, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You'll need the following:4 Cups of Froot Loops* (or cereal of your choice...Trix? Lucky Charms?)1 Stick of Butter1 bag of Mini MarshmallowsTools:SaucepanSpoonMeasuring CupWax PaperBaking Dish*NOTE: I picked Froot Loops for the color aspect. Try other cereals too. Trix or Lucky Charms could be fun ones to experiment with. Ideally, when you are picking your cereal, you'd like to pick a crunchy corn or rice based cereal. This will allow for you to have a krispies treat with some crunch. . The whole process of making these delicious treats happens relatively fast, so you'll want to prep your pan first so that when the marshmallow coating on the cereal is still gooey you can transfer it easily. Butter your baking dish thoroughly. Make sure to butter the sides as well as the bottom. 
This will make it so your krispie treats will slide out when they are done instead of sticking to the pan. . Place your stick of butter in your sauce pan on your stove. Melt the butter over low heat so that you do not burn your butter. . Once the butter has melted you can add your marshmallows. I used the entire bag of mini marshmallows for this Instructable. Stir constantly, insuring even mixing of your melting marshmallows and the butter. Keep heating until the marshmallows have melted completely and you can no longer distinguish single marshmallows. . Once you have a uniform mixture of butter and marshmallows, add your Froot Loops. Mix gently with a wooden spoon until the cereal is coated in the marshmallow mixture. Since Froot Loops are much bigger than regular Rice Krispies, you'll want to be careful when mixing so that you don't break the loops up. . Once adequately mixed, transfer your cereal marshmallow mixture to your buttered baking dish. Then, using a piece of wax paper, press your krispies down so that they have a uniform shape and top. . Let your krispies treats cool for at least 10 minutes, allowing the marshmallow to harden. After they have cooled you can remove them from a pan (they should slide right out with the buttering you did earlier). Cut with a sharp knife and serve!If you plan on storing them, place them in an air tight container. They will keep for a few days. . Enjoy your new twist on the classic rice krispies treat! Take them to potlucks, parties, and wherever else you need a little rainbow marshmallow goodness. 
\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 1, 0, 3]\nC: [0, 3, 2, 1]\nD: [1, 3, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_176_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_176_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_176_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_176_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [0, 1, 2, 3]\nD: [3, 2, 1, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. As with almost all of the cookbooks I've collected, this recipe came from a book found in a second hand store. So often, I find great books tossed to the curb simply because they aren't the latest, greatest, or hottest thing off the printing press from the next celebrity cook. I'm proud to say I have quite an extensive library of cookbooks, but the most expensive of all only cost a few dollars at most. This recipe produces a pickle that is reminiscent of a bread-and-butter pickle, though not as sweet. Of course, that may be resolved by simply adding a bit more sugar to the recipe, your choice. From the Cooking Light Annual Cookbook from 2008, (it is common courtesy, and often a matter of copyright, to credit your recipe source) I present to you: EASY REFRIGERATOR PICKLES 6 cups of pickling cucumbers, sliced thin (or according to your preference) (This is usually about two pounds) 2 cups of thinly sliced onions 1 1/2 cups of white vinegar 3/4 cup of white sugar 3/4 teaspoon of salt (Kosher, if you have it) 1/2 teaspoon of mustard seeds 1/2 teaspoon of celery seeds 1/2 teaspoon of ground Turmeric (adds great color!) 
1/2 teaspoon of crushed red pepper (yes, that stuff from the pizza place) 1/4 teaspoon of freshly ground black pepper 4 cloves of garlic, sliced very thin, or pushed through a garlic press. To reduce clutter, and eliminate the potential to forget an ingredient, or even to prevent dropping a salt shaker into a bowl of batter below a cabinet, I prefer to gather all of my ingredients before beginning any recipe. Many fast-food restaurants offer nifty little plastic cups for carryout condiments. With a slight touch of inner hoarder, I've collected and saved many of these cups for just such an occasion as this. Grab a few lids while you're at it, and you can easily prepare in advance if you are not quite ready to cook. After measuring out all the spices, grab a few onions, a head of garlic and a bottle of white vinegar. We're going to make brine!. If you don't have a garden full of vegetables, consider visiting your local farmer's market in search of pickling cucumbers. Generally, pickling cukes are are shorter, smaller, and often knobby. Crisp, bright green and white skin is not mandatory, but typical of this snappy little veggie! No one is going to judge you for buying cucumbers at the grocery store. Sometimes it happens. . There are a few gadgets in my kitchen (pffft, that is an understatement)\u00a0 that I use rather frequently, a mandoline being one of them. MANDOLINE (note the letter 'e' on the end) - not to be confused with a mandolin, which is a musical instrument. These are very valuable kitchen tools when you have a lot of thin slices to make, but are slightly fond of your fingertips. Typically, a mandoline has an adjustable dial for various thicknesses. This gadget makes slicing vegetables an absolute breeze!. Cut the ends from each onion, remove the skin, and slice very thin. If you desire super-thin slices of onion, consider using a mandoline, also known as a slicer. 
It is not necessary, or even suggested, that you peel the cucumbers, though you might consider removing a bit of each end. Though some people don't mind the blossom or end nubs, I'm not one of them. Using caution, carefully cut the cucumbers into thin slices. Yes, you may cut them slighter thicker if you wish. It is entirely up to you. Another option is to use a mandoline if you have one. See step 4 for more detail about mandolines. After slicing all of the cucumbers and onions, combine them in a large glass bowl in layers of half the cucumbers (three cups), half the onions, (one cup) and repeat. Remember, you'll need to have enough room in the bowl for the brine. Cut the garlic into tiny little slices, though you may also simply send the cloves through a press if you have one. Set the garlic aside to be added to the brine process in step 6. By all means, feel free to add other veggies! Only because I did not have any on hand did I not include various colors of super-thin sliced pimentos, jalapenos, carrots, etc. They only add to the beauty of your pickles. . In a small saucepan, combine the vinegar and all of the following, (and remaining) ingredients: 3/4 cup of white sugar 3/4 teaspoon of salt 1/2 teaspoon of mustard seeds 1/2 teaspoon of celery seeds 1/2 teaspoon of ground Turmeric (adds great color!) 1/2 teaspoon of crushed red pepper (yes, that stuff from the pizza place) 1/4 teaspoon of freshly ground black pepper 4 cloves of garlic, sliced very thin, or pushed through a garlic press Stir the brine well, and bring to a boil. Allow to cook for one minute.. After you have removed the brine from the stovetop, pour it over the onions and cucumbers. Be sure to mix it well. If your brine doesn't quite cover the cucumbers, you can always put another glass bowl on top, press down, and wrap the bowls tightly with plastic wrap to keep the top bowl forcing the cucumbers to be submerged in the brine. Allow the mixture to cool, then cover and refrigerate for four days. 
Approximate yield is seven cups of pickles. These pickles may be stored in the refrigerator for up to one month. If your large glass bowl is taking up as much room as mine did in the refrigerator, after it has completely cooled, you may transfer the pickles and brine to a plastic container if you desire. . And what does an image of lettuce have to do with pickles? Well, nothing, actually, but I needed a picture for the health tab. Lettuce is green. And green is usually healthy, right? For label-obsessed foodies, this is yet another wonderful feature of the Cooking Light books, they tell you what you are eating! Here is a partial low-down on the pickles you've just made, assuming you stick to the 1/4 cup serving size: Calories: 28 (10% from fat) Fat: 0.1 grams (polyunsaturated) Protein: 0.3 grams Carbohydrates: 7 grams Fiber: 0.3 grams Cholesterol: 0 milligrams (yeah!) Iron: 0.1 milligrams Sodium: 64 milligrams Calcium: 7 milligrams Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 3, 2, 1]\nC: [0, 1, 2, 3]\nD: [3, 2, 1, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_177_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_177_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_177_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_177_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [3, 1, 0, 2]\nD: [2, 1, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
You'll need:\n- one potato per person\n- 1/4 onion per potato\n- Two largish mushrooms per potato\n- Pepper (we use a four pepper blend)\n- Spices (1)\n- Soft butter\n- Aluminum foil\n- A barbecue (duh!)\n- Knife of slicer (2)\n(1) We use a spice blend from a popular spice company who shall remain nameless (but whose initials are Victorian Epicure).\u00a0You can make your own with chives, basil, sage, oregano and dehydrated garlic).\n(2) We use a slicer from another well known kitchen products company (who shall also remain nameless but is an anagram of\u00a0Prefaced Hemp).. Cut your potatoes, onions and mushrooms with the (unnamed) slicer. If you're not using a slicer, cut with a very sharp knife. All veggies should be sliced to approxiamtely 1/4\" thickness, but it;s not too critical to pull out the tape measure to check.. Cut a two-foot long piece of\u00a0aluminum foil. Butter it with about a tablespoon of soft butter, spreading with a fork as shown below.. Lay out your ingredients in layers on the buttered foil. As you can see from the image, lay out the potatoes first, then the onions, then the mushrooms. The photos show us making the recipe for two people. If you are making for more diners (or for fewer REALLY hungry people), add additional layers.\nNote that two layers should be your maximum. If you're making this dish for more than\u00a0four people, make a second foil container. Also remember that the ingredients that wind up in contact with the foil will brown much more than the other ingredients. In our case, the potatoes and the mushrooms will brown. If you're adding extra layers, consider which ingredient you want browned when figuring out the layers.. Drop another tablespoon of butter in nut sized clumps on top of it all. Obviously, this is not a cholesterol-free recipe...\nAdd pepper and spices to taste. Don;t feel limited by the spices we selected. Experiment! Paprika may be good here, or dehydrated jalapeno if you're into that kind of thing.. 
Close\u00a0up the foil by matching\u00a0the front and back\u00a0edges together and rolling the matched edges. Roll the entire package so that the resulting seam is on the side of the package. The melting butter will liquefy and putting the seam on the side helps prevent leakage.\nTwist up the ends of the foil.. Heat the barbecue on medium (about 350-400 degrees F\u00a0or 175-200 Celsius for those with a thermometer). Cook for 45-60 minutes.\nThe time variance is based on whether you cook at a lower or higher temp, and whether you cook on the bottom or top grill.. As soon as you open the package at the end of cooking, you'll know you're in for something special. This stuff smells divine and tastes even better!\nEnjoy!!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [3, 1, 0, 2]\nD: [2, 1, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_178_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_178_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_178_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_178_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [1, 3, 0, 2]\nD: [0, 1, 3, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Filling:\n500 grams Bramley apples (apples for apple pie or apple sause)\n70 grams golden caster sugar\n\u00bd tsp cinnamon\nPastry:\n112 grams butter , room temperature (if you can\u2019t be bothered microwave it for abit)\n25g golden caster sugar , plus extra\n2 eggs\n175g plain flour\nTools:\nOven\nBaking sheet\nRolling pin\nMuffin tin\nCookie cutter/glass/cup (one the seize of the muffin hole and one twice that seize\nPi cookie cutter/knife (you can cut them out yourself with a knife like i did.. Peel, core and dice the apples. I recommend not making the pieces any bigger that 1cm cubed. Preferably smaller (not like i did. It makes them neater)\nLay them out on the cookie sheet with paper towels on it.. Beat the butter and sugar together until just mixed. Add one egg and one egg yolk (safe the white for glazing) beat until it\u2019s like thick pancake mix.\nAdd flour. I recommend doing it in parts as that\u2019s easier and less likely to get lumps. The last bit you\u2019ll have to do with your hands. Wrap the pastry in cling film and chill for 40-45 minutes. While that is chilling make finish the filling by mixing the rest of the sugar (of the filling part) with the cinnamon (add more or less to taste). Then mix in the apples.. Roll out the pastry thinly and cut out the shapes as pictured. (on the size of the muffing hole, one twice that size and one the shape of pi.\nPut the one twice as big into a (greased, optional) muffin hole and push in and around the sides. Put the apple filling in it. seal with the other pastry and add the pi-shaped piece on top. Put a few holes in the top (around pi) with a knife.\nBrush with the egg white\nRepeat with the rest of the pastry (I got exactly 7 out of this recipe)\nBake for 25 minutes or until slightly golden.\nLet them sit in the tin for about 5 minutes (to cool a bit) and then move to a wire rack.. 
eat them!\u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 1, 3]\nC: [1, 3, 0, 2]\nD: [0, 1, 3, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_179_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_179_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_179_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_179_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 2, 1, 3]\nD: [2, 3, 0, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. use plasterstrips to get an imprint of your chest - you want the bra the right size.\nI'm not showing that step, due I don't want to see my chest on the internet ;)\n- i'm sure you'll figure out how to do that.\nInstructions are found on the package of the plasterstrips.\nWhen the imprint is dry cover it with aluminiumfoil.. you can use all cookie doughs that can be used with cookiecutters.\nthe recipe I used:\n0,5 egg\n125g flour\n62g sugar\n62g butter\n1Tablespoon Cocoapowder\n1 teaspoon vanillasugar\n1 little bit bakingpowder\nand for flavour 2 tablespoons of instant cappuccino. Form the cups of the bra on your covered mold.\nmake sure to make it evenly thick - about 0,5 cm\nbake it on 200\u00b0C for about 10minutes ( may vary with another recipe). at this point you can get as creative as you want :)\nHere's what I did:\nmelt some white choclate in the still warm oven\nspread it with a clean brush on the warm bra.\nmix some white chocolate with cacoa-powder\nand paint whatever you like :)\nbrush some chocolate on the edge of the bra and sprinkle it with chocolate-pieces\nlet everything cool down.. 
carefully peel the foil of the mold\ntake a corkskrew and make holes to join the two cups in the middle - be very careful!\ntie the cups together with a nice lace or string.\nYour done!. Now surprise your beloved one and have a nice cup of tea!\n- Or whatever you like\u00a0 :D\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 0, 3, 1]\nC: [0, 2, 1, 3]\nD: [2, 3, 0, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_180_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_180_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_180_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_180_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [0, 1, 3, 2]\nD: [3, 0, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. ingredients: \t\t\t\t\t\t\t6 rashers of bacon \t\t\t\t\t\t\t2 large potatoes \t\t\t\t\t\t\t1 medium onion \t\t\t\t\t\t\t2 cups water \t\t\t\t\t\t\t3 cups corn \t\t\t\t\t\t\tsalt + pepper \t\t\t\t\t\t\t1 cup half-and-half \t\t\t\t\t\t\t1/2 cup cheddar cheese \t\t\t\t\t\t\tcornmeal (optional)equipment: \t\t\t\t\t\t\tsharp knife \t\t\t\t\t\t\tcutting board \t\t\t\t\t\t\tlarge pot \t\t\t\t\t\t\tladle \t\t\t\t\t\t\tblender \t\t\t\t\t\t\tstove \t\t\t\t\t.. Bacon: \t\tHeat large pot on stove and add bacon. \t\tCook bacon until crispy (about 5 minutes). \t\tRemove bacon and crumble into a separate bowl. \t\tKeep bacon drippings for cooking onions.Onion + potato: \t\tPeel onion and potatoes. \t\tdice onion and cook in bacon drippings for about a minute. \t\tCut potatoes into uniform cubes and add to pot with onion.Combine: \t\tIn pot with onion and potato add water, corn, spices and bacon bits. 
\t\tCover and let simmer for about 20 minutes. After simmering for a while the potatoes should be cooked through. Carefully scoop a few ladles of the chowder into a blender, make sure you get a good mix of broth and chunky-bits. Blend on high until smooth, then add blended chowder back into pot and mix.. \n The chowder could be served now, but to really take it over the top dairy is added for that extra-smooth and amazing taste. Slowly stir cream into the blended mixture, this will lighten the colour of your chowder and give it a very creamy consistency. Then add grated cheese, I used an aged cheddar. Let chowder simmer on low for about 10 minutes. Keep that temperature on low, otherwise you risk burning the dairy. The picture here shows the marked difference between the stages of the chowder: chunky stock > blended stock > blended stock with cream. After a few minutes of the cream and cheese getting to know the rest of the ingredients it's time to serve. Scoop heaping portions into a bowl, then top with more bacon bits and cheese. A nice hot bowl of corn chowder for those chilly fall days, perfect! Did you make your own corn chowder? 
Post a picture of your results in the comments below and get a free Pro Membership to Instructables!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [0, 1, 3, 2]\nD: [3, 0, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_181_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_181_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_181_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_181_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 3, 1, 2]\nD: [1, 2, 3, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. For this you will need:- some frozen french fries (aka Freedom Fries, you know, pommes frites)- some hot dogs, preferably the ones with skins or casing, but any type frankfurter will do. A bit harder to find, but they do sell the monster size Polish-sausage type red-hots. I have not tried this with Italian sausage, kielbasa, chorizo, or bratwurst but it should also give the same effect.- some frozen prepared buffalo-style chicken wings or fried chicken wings- a bit of butcher's twine or cotton string or clean thread- a bit of aluminum foil- Optional: a can of vegetarian beans in tomato sauce or whatever variety/flavor if you want a full entree. A can of beans with that slab of pork fat works also.- blood-like condiments, catsup, ketchup or tomatoe sauce . Take a part of a frozen french fry. You can shape it like a fingernail or just find a small piece that is pointy on one end and blunt at the other. Cut a slit or pocket into the end of the hot dog where a fingernail would be. 
\nTake the point of the knife and create a pilot hole to insert or stuff the french fry into the hot dog. Be careful not to bulge the hot dog too much so that you end up ripping the skin. Dig out some more meat if you need room to fit the french fry.\n. Take a hot dog and break it in half. You want a ragged edge. You can also break it 1/3 of the way down. One part can be a toe and the other part can be a part of the finger. \nTo create a finger, wrap a small piece of aluminum foil around the hot dog. Tie a string around where the joints of the finger should be. The foil will ensure you do not cut into the hot dog and break the skin. \nCompress the hot dog slightly as you tie the string. When it cooks, the hot dog will take a more natural shape and give you a ridge at the finger joint. We will cook the dog with the string on and cut them after they are cooked.\nFor hors' d'oeuvres, you can just bake them till heated through and browned in a toaster oven. You can also take a big pot and bring your batch of beans to a simmer. Gently place the \"fingers and toes\" into the pot and let them heat through with the beans. Baste with liquid and beans in the pot. \n. Cut off the strings and remove foil before serving. You can mash the french fry down to form a better fingernail shape. \nThe chicken wings should be frozen prepared kind where they require short cooking or just reheating. Have the chicken wings defrosted so you can work with them easier. Don't do this with fresh chicken parts unless you intend to fully cook them on the side or with the beans later. \nIt probably is best to just use the drummette portion of the wing, the one with a single bone. \nAfter cooking, peel or rip off most of the meat surrounding the bone, just be sure to remove any signs of chicken skin. This will allow you to skewer or insert the bone into the end of the hot dog. \nFor the fingers, jam in a chicken bone in the ragged end of the hot dog.. 
Garnish with your blood-like condiments, Serve with a big helping of Muuuuuuhhahhahhahahhhahahh!You can also bake with pastry dough strips wrapped around body parts and apply red sauce to look like bandages. Or have fingers sticking out of your mom's pumpkin pie. Hand pie, anybody?Serve grilled with grill marks for those especially deviant. Enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [0, 3, 1, 2]\nD: [1, 2, 3, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_182_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_182_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_182_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_182_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 0, 2, 3]\nD: [2, 1, 3, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Ingredients\n3 medium beets\n1/2 onion\n12 oz spinach\n3 cloves garlic\n2 packages active dry yeast\n2 teaspoon sugar\n3 cups flour\n4 tablespoons olive oil\n2 teaspoons fine grain salt\n2 Tbs black lava saltTools\nJuicer\nBaking sheet\nMixing bowls (2)\nMeasuring cups/spoons\nParchment paper\nMortar and pestle. \n\t\tRun spinach and garlic through juicer and set aside\n\t\tJuice beets and onion into a separate bowel. 
\nFollow steps below for both the spinach and the beet dough...\n\t\tYou'll need 1 cup warm liquid, so pour veggie juice into mixing cup and fill to 1 cup mark with warm water.\n\t\tPour liquid into mixing bowl\n\t\tAdd 1 packet yeast, 1 tsp sugar and 1/2 cup flour\n\t\tStir a couple times, cover bowl with plastic wrap and set in cool, dark place for 10 minutes (until bubbly)\n\t\tMix in 1 tsp salt (I used mortar and pestle to grind lava salt, but any fine grain salt is fine)\n\t\tMix 1 cup flour into mix\n\t\tMove dough to floured surface and knead for 3 - 5 minutes\n\t\tRinse mixing bowl, dry and lightly coat inside with olive oil\n\t\tRoll dough into ball and place in a bowl\n\t\tRecover and let sit for around an hour (until doubled in size). \n\t\tPreheat oven to 400oF\n\t\tPunch down dough, knead 2 minutes and divide into balls 1/2\" in diameter\n\t\tRoll into thin sticks - about 1/4\" thick and 8\" long\n\t\tBrush bottom 1/2\" with oil and cover with lava salt.\n\t\tBake for 8 minutes\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 1, 0]\nC: [1, 0, 2, 3]\nD: [2, 1, 3, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_183_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_183_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_183_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_183_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [3, 2, 1, 0]\nD: [3, 0, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This step is about what you HAVE. \u00a0Not what you need. 
\u00a0(Mine are usually leftovers.)\nBasically, my opinion on\u00a0Bento\u00a0is that you may have lots of cool things in stock to use, play with, shape things...but the food comes down to what you have to work with. \u00a0There are no rules. \u00a0Just try to make something cute out of what you have. \u00a0That's my opinion. \u00a0No reason to stress out about what you need to have to make something. \u00a0Just use your imagination and put things\u00a0together\u00a0that are already in your kitchen. \u00a0If you want more color, maybe it's a good idea to go shopping. \u00a0:) \u00a0It's up to you.\nOne other thing that I love - cake decorating supplies. \u00a0I use them as props when I do this stuff. \u00a0I will try to find some photos to attach to this step, to show you. \u00a0. I needed a base for the\u00a0Bento\u00a0because I like build the meal upwards. \u00a0If you start at the bottom...I don't know, it just doesn't make sense to me. \u00a0I don't know too much about\u00a0Bento, it's been less than a year and I've only done it a few times. \u00a0Generally, the food is at an even level when finished and it's separated strategically. \u00a0\nSo, for a base, I put some salad at the bottom, with a\u00a0ziploc\u00a0bag of dressing. \u00a0That should go well with the left over rib eye. \u00a0Plus...I think it's cool to call it a secret salad because no one knows its there! \u00a0Until they eat their way to it.\nAfter the salad, I put a thin layer of seaweed/nori, teriyaki flavored. \u00a0It works well because it's kinda sticky.. I cut his steak up with a sharp knife and tried to shape it as best as I could. \u00a0I can't visualize these things before I do them, they just unfold. \u00a0So I can't give you any advice on how to shape your food. \u00a0Play around with it until you're happy. \u00a0I am horrible at perspective and if I ever make anything, it's flat. \u00a0Plus, my lines suck. haha.\nI used string cheese to line the center console of the Tie Fighter. 
\u00a0Cut the .. string cheese horizontally and wrap it around your circle. \u00a0Pin it with a couple toothpicks and put it in the microwave for around 20 seconds. \u00a0Once it's starting to melt, take it out and press it into shape. \u00a0Set it face down and the melted gravity will help form your cheese circle. \u00a0You can then add your windows and melt them on, but I didn't want to take any chances so I left them for the end, unmelted.\nI did the same cheese melting technique with the patterns on the...edge wing things? \u00a0It took forever and it was the hardest part. \u00a0My tip for you - don't do what I did. \u00a0Use a knife or an exacto instead. \u00a0By hand, it takes decades to make the right size cheese. \u00a0Then you can melt these pieces together and shape them as needed.. My favorite part about this thing is the stars! \u00a0I love all stars of all kinds.\nOkay, situate your Tie Fighter in some ritualistic Star Wars way...I just guessed. \u00a0Then you can put some stars around your ship. \u00a0I used a tiny cookie cutter, a slice of provolone cheese, and spaced them out nicely. \u00a0For the TINY stars, specks, I just sprinkled some sesame seeds around. \u00a0\nFor the lasers I used some sour punch kinda candy. \u00a0I had it leftover from the cupcakes I was making, so I figured that I would put it to use. \u00a0(Pretty awesome to have all these random things in the house and not have to plan this out or buy things.) \u00a0Back to the green stuff...I rinsed off the sour stuff and let them dry on a plate, stuck a hard spaghetti stick into the tube and left an un-filled part of the straw to puncture as a stilt against the rest of the\u00a0Bento, to hold it up in the design.\nThen I added my windows and I was done!. \n It's back in the fridge, waiting for my husband to wake up and take it to work. \u00a0I will probably add a sign that says to keep it upright because I don't want him to crash his Tie Fighter before he even sees it. 
\u00a0\nThanks for checking out my attempt at a Tie Fighter! \u00a0Hope you guys have a nice day at work! \u00a0:) \u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 2, 0, 1]\nC: [3, 2, 1, 0]\nD: [3, 0, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_184_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_184_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_184_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_184_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [0, 2, 1, 3]\nD: [2, 3, 1, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Start by sterilizing your jars, lids, rings, and tools by submerging them in boiling water for several minutes.. Add to a large pan one and a half cups water and two cups apple cider vinegar. Then add six cinnamon sticks and a teaspoon of cloves. Add five cups sugar. Mix all together and bring to a boil. Let simmer at least ten minutes. . I like the taste of the pickles with mint, but my wife doesn't like the mint ones on hamburgers. So depending on your preferences, you decide.. You'll need to peel, core, and slice your apples. Try to get your slices about as thick as a hamburger chip pickle would be. This is a great way to get rid of small apples. . Fish one of the cinnamon sticks out of the syrup. Place it in the jar with the apple slices. Ladle the syrup over, leaving about a third of an inch head space. A few of the cloves in the jar won't hurt, but a lot can get too strong. . Put a lid on the clean rim of your jar. Snug down a ring. Then submerge in the hot water bath. . After the water has resumed a rolling boil, process for thirty minutes. 
Then remove the jars.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 2, 3, 1]\nC: [0, 2, 1, 3]\nD: [2, 3, 1, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_185_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_185_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_185_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_185_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [1, 0, 2, 3]\nD: [3, 1, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. \nA head of cabbage makes an excellent katamari and radishes are the perfect size to use as the ends...\n\t\tWash and cut radishes in half\n\t\tSlice off the very end off each half\n\t\tPush toothpicks into cabbage, leaving about 1/2\" exposed\n\t\tPress radish halves onto toothpicks\n\t\tEvenly distribute radishes around entire cabbage. 
\nThe Prince is constructed from 2 cucumbers, a baby carrot and 4 green beans...Head\n\t\tCut both ends off\u00a0 two small cucumbers\n\t\tCut one of the cucumbers in half\n\t\tTake one of the halves and carve out a rectangle from the outer peel\n\t\tRemove small band of peel from both ends\n\t\tTake two of the end pieces and attach one to each side of the head with toothpicks\n\t\tStick a baby carrot in the top of the head to make the antennaBody\n\t\tTake the other half of the cucumber and press one or two toothpick in one end\n\t\tAttach head to body using the aforementioned toothpicksLegs/Feet\n\t\tRun toothpicks through two green beans, leaving a bit of toothpick exposed on one end\n\t\tPress legs into body at toothpick end\n\t\tTake the ends cut off the second cucumber (these will be the feet)\n\t\tCut a small circle out of the middle of each foot (approximately green bean in diameter)Arms\n\t\tRun toothpicks through two green beans, leaving a bit of toothpick exposed on one end\n\t\tPress toothpick ends into body at a reasonable arm position. \nSet The Prince up next to the cabbage katamari in a rolling stance.\nNow, The Prince did remain standing for the duration of the display, but i won't lie, it was precarious.\u00a0 I recommend setting up the veggies in the same place it will remain throughout the event.\u00a0 Also, make sure that the cabbage is stable, as it provides most of the support for The Prince.. \nToothpicks and/or skewers of fruits, veggies and cheeses can now be added...\n...along with turnip flowers : ). 
\nAdd Brussels sprout bushes, mixed green grass and weird fruit trees of strawberry and melon atop artichokes (or whatever weird fruit trees you can imagine).\nAnd don't discard those rinds!\u00a0 They can be filled with dips or salsa.\u00a0 The lemon pictured here is happily holding a yogurt fruit dip.\nThe example here, while a little out of control, is a very simple example of what can be done with the Katamari theme.\u00a0 It could be applied to a wide variety of foods and/or represent different levels of the game.\u00a0 A skilled garnish maker could do an amazing representation of the flower level...\n...and yes, that is a request.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 2, 0, 3]\nC: [1, 0, 2, 3]\nD: [3, 1, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_186_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_186_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_186_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_186_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [3, 0, 1, 2]\nD: [3, 1, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. One package Nutter Butter cookies\n2/3 cup Nutella\n2 oz cream cheese, softened\n2 cups chocolate chips\n(optional) White chocolate for decoration. Using a food processor, grind up the Nutter Butter cookies into crumbs -- it\u2019s best to do this in two or three batches, depending on the size of your food processor.\nMix the cookie crumbs with \u00e2\u0085\u0094 cup Nutella and 2 oz cream cheese. The mixture will still be crumbly, but should hold together when pressed -- if need be, add a little more Nutella or cream cheese.. 
To get the balls all the same size, I use a mashed potato scooper. Pack the crumb mixture firmly into the scoop, then eject onto a piece of waxed paper. Because the mixture is so crumbly, I just left them as half-balls -- if you were making cake-balls, you could roll them into balls.\nPut the balls into the refrigerator for at least \u00bd hour to firm.. Melt two cups of chocolate chips in a double boiler. Depending on your chocolate chips, adding a little vegetable shortening can make it easier to coat the cookie balls. I usually use about 1 tsp of shortening per cup of chocolate chips. Warning -- adding too much shortening will make the chocolate too soft.\nI use a homemade dipping tool that I formed from a piece of stainless steel wire (a bicycle spoke).\nRoll each ball in the melted chocolate, and place on waxed paper.\nThe dipping tool makes it easier to scoop out the balls and shake off excess chocolate.. Optional - decorate the cookie balls\nAn easy way to decorate the cookie balls is with white-chocolate piping.\nUsing a microwave, melt some white chocolate chips in a small plastic bag. Cut the corner off the bag and drizzle the melted chocolate in a thin stream over the top of the cookie balls.\nPlace the balls in the refrigerator till the chocolate hardens.. 
Remove from the waxed paper & enjoy!\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [3, 0, 1, 2]\nD: [3, 1, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_187_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_187_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_187_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_187_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [0, 2, 3, 1]\nD: [3, 0, 2, 1]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This step is about what you HAVE. \u00a0Not what you need. \u00a0(Mine are usually leftovers.)\nBasically, my opinion on\u00a0Bento\u00a0is that you may have lots of cool things in stock to use, play with, shape things...but the food comes down to what you have to work with. \u00a0There are no rules. \u00a0Just try to make something cute out of what you have. \u00a0That's my opinion. \u00a0No reason to stress out about what you need to have to make something. \u00a0Just use your imagination and put things\u00a0together\u00a0that are already in your kitchen. \u00a0If you want more color, maybe it's a good idea to go shopping. \u00a0:) \u00a0It's up to you.\nOne other thing that I love - cake decorating supplies. \u00a0I use them as props when I do this stuff. \u00a0I will try to find some photos to attach to this step, to show you. \u00a0. I needed a base for the\u00a0Bento\u00a0because I like build the meal upwards. \u00a0If you start at the bottom...I don't know, it just doesn't make sense to me. \u00a0I don't know too much about\u00a0Bento, it's been less than a year and I've only done it a few times. 
\u00a0Generally, the food is at an even level when finished and it's separated strategically. \u00a0\nSo, for a base, I put some salad at the bottom, with a\u00a0ziploc\u00a0bag of dressing. \u00a0That should go well with the left over rib eye. \u00a0Plus...I think it's cool to call it a secret salad because no one knows its there! \u00a0Until they eat their way to it.\nAfter the salad, I put a thin layer of seaweed/nori, teriyaki flavored. \u00a0It works well because it's kinda sticky.. I cut his steak up with a sharp knife and tried to shape it as best as I could. \u00a0I can't visualize these things before I do them, they just unfold. \u00a0So I can't give you any advice on how to shape your food. \u00a0Play around with it until you're happy. \u00a0I am horrible at perspective and if I ever make anything, it's flat. \u00a0Plus, my lines suck. haha.\nI used string cheese to line the center console of the Tie Fighter. \u00a0Cut the .. string cheese horizontally and wrap it around your circle. \u00a0Pin it with a couple toothpicks and put it in the microwave for around 20 seconds. \u00a0Once it's starting to melt, take it out and press it into shape. \u00a0Set it face down and the melted gravity will help form your cheese circle. \u00a0You can then add your windows and melt them on, but I didn't want to take any chances so I left them for the end, unmelted.\nI did the same cheese melting technique with the patterns on the...edge wing things? \u00a0It took forever and it was the hardest part. \u00a0My tip for you - don't do what I did. \u00a0Use a knife or an exacto instead. \u00a0By hand, it takes decades to make the right size cheese. \u00a0Then you can melt these pieces together and shape them as needed.. My favorite part about this thing is the stars! \u00a0I love all stars of all kinds.\nOkay, situate your Tie Fighter in some ritualistic Star Wars way...I just guessed. \u00a0Then you can put some stars around your ship. 
\u00a0I used a tiny cookie cutter, a slice of provolone cheese, and spaced them out nicely. \u00a0For the TINY stars, specks, I just sprinkled some sesame seeds around. \u00a0\nFor the lasers I used some sour punch kinda candy. \u00a0I had it leftover from the cupcakes I was making, so I figured that I would put it to use. \u00a0(Pretty awesome to have all these random things in the house and not have to plan this out or buy things.) \u00a0Back to the green stuff...I rinsed off the sour stuff and let them dry on a plate, stuck a hard spaghetti stick into the tube and left an un-filled part of the straw to puncture as a stilt against the rest of the\u00a0Bento, to hold it up in the design.\nThen I added my windows and I was done!. \n It's back in the fridge, waiting for my husband to wake up and take it to work. \u00a0I will probably add a sign that says to keep it upright because I don't want him to crash his Tie Fighter before he even sees it. \u00a0\nThanks for checking out my attempt at a Tie Fighter! \u00a0Hope you guys have a nice day at work! \u00a0:) \u00a0\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 3, 2]\nC: [0, 2, 3, 1]\nD: [3, 0, 2, 1]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_188_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_188_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_188_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_188_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [1, 2, 0, 3]\nD: [2, 3, 1, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
Makes 10 muffin sized cakes.\nIngredients:\na) 1 tablespoon icing sugar\nb) 150ml milk\nc) 175g self raising flower\nd) 150ml corn oil\ne) 150g caster sugar\nf) 2.5 tablespoons of cocoa powder\ng) 1 teaspoon bicarbonate of soda\nh) 100g dark chocolate (70% cocoa solids minimum)\ni) 397g can of Carnation Caramel\n2 eggs (i forgot to take a picture 0_0 )\nFor the heart Decorations:\nFlat red sweets. I\u00a0used a roll of sour red candy\nWhite icing. I\u00a0used \"Queen\"\u00a0white chocolate fudge writing icing because the nozzle was small)Tools:\na)\u00a0Baking sheet or cupcake tray\nb)\u00a0Spoon\nc)\u00a0Fork\nd)\u00a0Measuring spoons\ne)\u00a0Seive\nf)\u00a0Mixing bowl\ng)\u00a0Measuring jug\nh)\u00a0cake cases - i used muffin cases for extra large cupcakes, nom!\ni)\u00a0Timer\nj)\u00a0Scales\nk)\u00a0Spatula\u00a0 or wooden spoon\nl)\u00a0Small spatula (useful for icing, you could use a butter knife)\nScissors (not pictured). 1)\u00a0Pre heat your oven to 180\u00ba C2) Prepare the dry ingredients: Sieve the flour and cocoa powder into your mixing bowl3) Add the bicarbonate of soda and caster sugar4)\u00a0Use the spatula to mix to an even colour5) Prepare the wet ingredients:\u00a0 In your mixing jug measure the corn oil, and add the milk6)\u00a0Add in the eggs, and 2 tablespoons of the Carnation Caramel7) Give the oil/milk/egg mixture a good mix with a fork to combine it with the caramel. And pour the wet ingredients into the bowl of dry ingredients.8)\u00a0Mix all the ingredients together thoroughly, scrape the bottom of the bowl a couple of times to make sure there isn't a sneaky pocket of dry ingredients . 9) Lay out your paper muffin cases onto baking sheet, Spoon the mixture into the cases leaving 1cm clear at the top to give them room to rise.10) Once it's up to temperature pop them in the oven for 20 minutes. Don't open the oven while they're coking or they make not rise so well. 
\nTip:\u00a0To check that they're cooked insert a clean sharp knife or knitting needle into the centre of one, if it comes out clean they're done. If it comes out sticky give them 3 minutes more. \nPlace on a wire rack to cool (Use the wire rack from your grill pan (give it a bloody good clean first!) if you don't have a cake rack) . 11) Break the dark chocolate into small pieces and melt in an appropriate bowl/jug in the microwave. Inbetween 30 second bursts give it a mix with a fork to see how its melting.\n12)\u00a0Combine the remaining Carnation Caramel with the chocolate. Leave this to cool a little, 10 or 15 minutes13) While the icing is setting a little (if you spread it straight awy it'll be more inclined to run over the edge of the cupcakes)\u00a0cut the tops of the cakes off flat with a sharp knife. This gives you a nice flat surface to ice onto.14) Start by spooning a blog of icing into the centre of the cake, and use a small spatula to drag it down and around the edge of the cake. Add a little more with the spatula as you go along if you find it's not enough. Finally, drag the spatula around the top of each cake to smooth over where you've added mroe icing. . Making the hearts is better described with pictures so have a look at them before you start, You want to use sharp scissors to get nice sharp edges.15) This is a diagram of the shape you're trying to make out of the sour candy. The white dotted line indicates where i've used two pieces of candy to make each heart.16) Take your roll of sour candy and cut off a section about 6cm long, Trim it to look like the top half of diagram 1517) Cut a 'T' shape from another piece of sour candy to make the bottom part of diagram 15. This is what your two-part heart should look like. 18)\u00a0Make four hearts, and cut the fourth one in half. This will be the unfilled heart at the end. Press a heart onto each cupcake. 
Leave them to set for at least half an hour, this way the surface will set slightly so if you make a mistake with the icing it can be easily corrected. 19)\u00a0This diagram show where the white line goes.20) Use your white icing to pipe the thins white line around the heart. Hold it as close to the cupcake as you can without dragging it over the surface. \nTip:\u00a0Practice on a piece or paper first (or the back of your hand, yum!) If you make a mistake on the cupcakes dip your finger in water and wipe it off.21)\u00a0For the half filled heart pipe a line where the sour candy comes up to on the other hearts, and fill in with horizontal lines of icing. Pipe the white line around this like you did on the other cakes. . Thats it!\u00a0Decorate the rest of the cakes with sweets, pipe patterns with the white icing, or dust with icing sugar. They'll keep in an airtight tin for a few days, you could give them in a clean white tupperware tub lined with coloured tissue, or lay them out on a pretty plate. \nA new tutorial every Monday on www.alittlestranger.com\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 0, 3, 2]\nC: [1, 2, 0, 3]\nD: [2, 3, 1, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_189_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_189_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_189_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_189_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [0, 1, 2, 3]\nC: [0, 2, 3, 1]\nD: [2, 0, 1, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 
I would suggest you to use complete quantity and not to half them like i did else you would end up with half semi circles...2 eggs1cup all purpose flour 3/4 cup sugar powdered3/4 cup oil1/2 tsp baking powderFew drops vanilla ( optional)Mix egg and sugar until incorporated. Add oil and mix well. Sift flour and baking powder together. Add. Add your vanilla.I cooked my cake in microwave oven Preheat oven at 180 Celsius. Bake for 20-25min. Once cooked let it cool and cut circles ( i used three as my cake thickness was dense.). I made this sauce only for this cake amount. It can be stored easily.4 teaspoon milk2 tablespoon nutellaWarm your milk in microwave. Add in your nutella. Mix it well. It sould be in runny constancy. Serve hot for this cake.. Warm 2/3 part of chocolate at intervals of 10 seconds. Making sure you stir nicely every time. Once your chocolate has just melted. Add in your 1/3 of remaining chocolate and mix well and quickly. If you feel that your chocolate is too cold that lumps of solid chocolate are visible abd are tough to dissolve, microwave for another 5 seconds. Your chocolate should not be too warm or it will not temper correctly. Spread your chocolate over aluminium foil and pat it to remove air bubbles.Let it stay at room temperature for atleast 30 min to an hour. Once u can feel that chocolate is not sticking to your fingers but it not firm yet, score circles and place your chocolate sheet in refrigerator. Only take it out before serving.. Place your disc of top of your cold cake and pour warm nutella sauce on top of it . 
Your dessert is ready\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [0, 1, 2, 3]\nC: [0, 2, 3, 1]\nD: [2, 0, 1, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_190_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_190_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_190_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_190_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 1, 0, 3]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. There's quit some internal parts you will need. I found all the parts on the internet, but you might check your local diystore if you can find some parts there.These are the internal parts you will need:- Funnel (top diameter around 100mm)- Aluminum plate (for small parts, see next steps)- Thermostat (KSD301 105\u00b0C)- 4x M3 bolt length 6mm with Phillips or slotted head- 4x M3 nut- Temperature fuse- 2x Parallel connector- 1m flexible silicon wire 1,5mm- Ring connector for 1,5mm wire (M3 hole)- 1m Silicone hose 10mm inner diameter- 1m Silicone hose 6mm inner diameter- Heat resistant tube- 7x M4 bolt length 12mm for embossing with Phillips or slotted head- 2x M4 bolt length 6mm for embossing with Phillips or slotted head- 2x M4 bolt length 30mm with Phillips or slotted head- 13x M4 nut- Rocker switch (minimum 6A)- 4x Hose clamp 13,7mm to 15,3mm- 2x Hose clamp 10.8mm to 12,3mm- Grounded chord and plug for 220-250V- 3x Faston angle connector 4.8 width- 2x Faston connector straight (6mm width, depending on the connectors on your thermostat)- Thermal paste (1 gram)- Heat-shrink tubing 6mm to 2mm- Heating element- One way valve for water which fits the 
10mm silicon hose- Plastic cup (from paint gun, so make sure it's new and clean)I ordered my parts from a German company called 'Conrad'. They supply many countries in Europe, so you can find the order number of most of the parts on this list:https://www.conrad.be/ce/nl/ShoppingList.html?view...On Conrad you can find everything except these:-Heating element:You will have to order this from a supplier for replacement components. I ordered mine from Servilux (order nr 141709)If your order somewhere else you have to make sure it's similar to the one on the picture in order to complete the coffee maker.- Plastic cup 600ccThis will serve as the water tank. Depending on where you order them they might have a different thread. So if possible order one with a thread where the hose fits around, otherwise you also need an adapter. This so you can make a transition from the thread of the cup to an outer diameter between 10mm and 13mm.I ordered mine from nonpaintstore.nl (part nr 4213505). It's not cheap but it has the right fitting on it for the silicon hose.. Here you can find the link to all the 3D-printed parts. Because of the size of the object, they take quit some time to print. I printed all of them with standard print resolution from Makerware.http://www.thingiverse.com/thing:348199NOTE: These parts were made according to the dimensions of the components which I used. Different components might give problems with the dimensions of the 3Dprinted casing. In the future I might try to learn Openscad and make a parametric model out of it for dimension adjustments.I ordered filabot filament, which is made out of recycled sources, to try out of it's possible to print with it. I first ordered a blue roll and when I tested it, it came out quit well. Only difference with normal filament was the inconsistency of the color. 
But that's what they also mention on their website and is because of the use of recycled sources that this might occur.So I ordered two more colours: red & black. Red turned out to be more like salmon pink instead of red. Black was normal black. Both spools seemed to be pretty consistent in color. But printing with the black spool gave some problem, which is why I printed all parts in 'red'.. Making the mold was not easy and I had to try some stuff out before making the final shape. Basically you fill up the drag with sand, then place the 3D-printed model on top and fill in the gaps on the side. I then cut down the sand, so that the model can be taken out afterwards. After applying talk powder you fill up the cope.When the cope is filled with sand I open op the mold and get the model out. I also make a hole where the aluminum will be casted through. I know it's not according to the best aluminum casting technique, but it works. If there are any advices on how I can adjust the sand mold for a better casting with this shape, let me know. Or ideas for making a better cast-able shape are also welcome.You need to cast part 1 two times as a left and right side. It's better if you sand the 3D printed model before making the mold, so the model loosens better from the sand. One of the 2 sides needs a 'bump' where we will fix the heating element later.I made the bottom of the coffee maker out of aluminum so it has a heavy and stable base. It's also stronger for fixing the heating element. But there are ways to attach the heating element in a plastic 3D-printed model without having to use aluminum. The reason I chose to use aluminum was because this project was about searching for techniques for local producing and recycling. Casting aluminum & 3D printing with plastic from recycled sources came out as useful techniques, also because of the possibilities to share digital models. 
I used oil based sand because I'm still learning how to cast and thought this was the safest method, instead of mixing my own sand with water like you can find on other tutorials.. I made a furnace for melting aluminum out of an old fire extinguisher by checking out other instructables: https://www.instructables.com/id/Oven/I used scrap aluminum from the Fablab, old aluminum cans and trash aluminum foil to melt and cast into the mold. Remember safety: read and learn enough about this before trying it out and use enough safety gear to protect you from any mistakes.. Saw off the not wanted aluminum from the part and smooth down the model. The sides need to be straight because there are 3D-printed parts which will be attached onto them.. The heating element is the essential part of the coffee maker. It warms up the water so it starts to boil. This is the way it works: The water goes through the heating element until it's leveled (communicating vessels). When the coffee maker is turned on the heating element starts to heat until the water starts to boil. Therefore the water wants to expand and will push upwards. In the side of the water reservoir there is a valve which makes sure the water doesn't go up on that side, therefore pushing it through the other side upwards.Here is a more visual explanation video on youtube by Crazy Builders:https://www.youtube.com/watch?v=COKlObhGt50So, to make sure the element heats up to the right temperature you have to use a thermostat. This will interrupt the electric circuit when it reaches the stop temperature of the thermostat. I used a thermostat of 105\u00b0C, just above boiling point but I'm thinking a lower one (90\u00b0C for instance) might work as well, if not better.The heating element I ordered has a small piece welded to it where you can fix the thermostat. I cut a piece of aluminum for this. On this piece it will later also be possible to connect the ground to for safety. 
To make sure the heat gets transferred well to the thermostat I used thermal paste between the connection of the aluminum parts, the heating element & the thermostat.. Check the thickness of the 'bump' from the casted part. Take (at least) 1 mm less and tape off your drill to make sure you don't drill through the whole part. Tap an M4 thread inside the holes.To fix the heating element to the part of the casing we need to cut some strips which we will use to clamp. By tightening the nuts, the heating element will get fixed. Make sure you put a piece heat resistant plastic between the aluminum strip and the heating element. This will make sure there is no way electricity can flow to the outside casing.Since I don't have pictures from the fixing of the heating element, I made an exploded view to try to explain how it's fastened.. Drill a hole to the size of the switch. I used the biggest size drill I could find and then rasped to the exact size. Make sure this is the side where the heating element is NOT fixed.. Connect all the parts in the right place. I made a scheme where you can see how everything is connected. Use as much silicon wires as possible since they can handle the heat better than normal wires. The wires from the chord can be protected with the silicon tube.Make sure the valve is in the right direction otherwise the water can't get to the heating element.I suggest leaving the coffee maker for a few hours before trying it out. Then make sure there isn't any water leaking through the sleeves in the bottom. Also check again to make sure the casing can't conduct any electricity with a multimeter.. Choose what flavor of coffee you like, how strong you want it to be, and how much you want. Press the button and start brewing. Enjoy!Edit: I now also casted the dripping plate in aluminum. This because the 3D-printed part is not completely water sealing. 
\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 1, 0, 3]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_191_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_191_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_191_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_191_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [0, 3, 2, 1]\nD: [2, 1, 3, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Sure, you can just show up and buy stuff without any special gear. But these specialty urban shopping items will make your life a lot easier.These three gear options can work by themselves or with each other. 1. REUSABLE BAGSThere are quite a few possibilities for bags beyond the world of Paper and Plastic. Pick one that works best for carrying your groceries and matching your good style. Keep one in your office/gym bag you have with you every day for those spur-of-the-moment shopping trips on your way home.2. GRANNY CARTThe Granny Cart is for when you and your roommates start saying things like, \"Plain pasta for breakfast AGAIN?\" or \"You can just scrape mold off and it's ok, right?\" They're best for the BIG trips. They are also great for shopping with kids since they provide easy schlepping and lots of entertainment.Something important to keep in mind about Granny Carts is how you pack them. You don't want your tomatoes on the bottom where they will be squashed by things stacked on top and subsequently pureed for sidewalk gaspatcho by the metal squares of the cart. 
Put heavy, boxed or canned items on the bottom (any pre-made frozen stuff, canned soups and beans etc), followed by lighter containers (pasta, cereal, tea), and lastly, crushable produce. One last tip: the key to looking cool with your Granny Cart is KNOWING you look cool with your Granny Cart. So stand tall and STRUT IT.3. BICYCLE BASKETClassic, functional and an easy way to express your style. A very sensible shopping option for those who bike to work, or for trips just beyond walking distance. (But beware of hills!)There are a million bike baskets out there, from the insubstantial wicker or plastic ones to the standard metal to the super high end wood and metal. You can put a basket on the front of your bike (attached to your handlebars), on top of your rear rack or on the sides of your rear rack. Some baskets fold and some detach.There are also a variety of saddle bags (technically called \"panniers\") you can attach to the back of your bike for bigger, heavier loads. Much like with the Granny Cart, keep a stacking strategy in mind while packing it.. Take a stroll through your neighborhood and take note of all the food shops. You might be surprised by how many small grocers, fruit stands, bodegas and specialized food stores there are. Venture inside them to see just what they offer. These places can provide a lot more than bread or beer at the last minute, particularly in climate-blessed San Francisco. (See our Food & Liquor project.)Not only can these places provide equal (if not better) goods, they often provide more personal service, which can be a huge help when you have all of one minute to find an obscure item before you need catch the bus to make your meeting. Get to know your shop owners, or at least the regular check-out folk, since they can keep you updated on sales or new goods, and can even put in a request for things they don't normally carry. All just for you! And keep in mind that you don't have to shop for everything in one store. 
The first place might do produce better than the second, while the third place has the best butcher counter and the fourth has great baked goods and the cheapest beer. Of course it's easiest if you find a group of these stores that are near each other. San Francisco has lots of \"market streets\": from parts of Irving Street in the Sunset to stretches of Geary Avenue in the Richmond to Church Street between Duboce and Market in Duboce Triangle, and many more. . Once you've found your local stores, START SHOPPING. \nNow that you have the right gear, it'll be easy for you to stop by the stores on your way home from work or after you pick up your kids from school. Keep your big shopping trip on the weekend for staples, but try stopping by stores on your way home for produce and other spoilables during the week. This can keep your meals fresher and your weekend shopping less gargantuan. You weekend-shoppers will be surprised by how wonderfully empty a market can feel the rest of the week.\nMaking a list before you go can help ensure you get everything you need. That way you won't be that nutter crying out \"Butter!\" on the way home because you forgot to get some. Be flexible though-- you might arrive at a store to find that the pears are much more delicious looking than the apples that were on your list.\nAnd keep your mode of transportation in mind while shopping. Don't buy that case of Hansen's soda just because on it's on sale if you're walking home up hill with your groceries in hand. Thar's a job for a Granny Cart or bike basket.\nWhen you're checking out, just say, \"I have my own bags.\" Some stores will give you a discount, or maybe just a smile. This is also a good time to ask about a new product you'd like to see available or who that cute kid is in the photo taped to the cash register.\n. 
Now that you are fully loaded with hot urban shopping gear strategically packed with delicious things from your local stores, go home and eat!\nOf course, don't be afraid to make a few other stops along the way. Your good packing job should ensure that socializing won't be a problem. (As long as you stayed away from the frozen food aisle.) You just might be surprised by who your super hot shopping gear attracts.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [0, 3, 2, 1]\nD: [2, 1, 3, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_192_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_192_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_192_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_192_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 3, 1, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Roasting the cocoa beans is essential to develop the \"chocolate\" flavor. Its also pretty easy to do (there is a lot of science to roasting cocoa, but we won't get into that here)\n1. Adjust an oven rack to the middle position and pre-heat your oven to 300F.\n2. Spread your unroasted beans out on your baking sheet so they are in a single layer.\n3. Once the oven is preheated, place the pan in and start your timer.\u00a0 Roast the beans for 30 minutes. For your first time trying this, pay close attention to the smell. When your beans first start heating, you might notice an acidic smell coming off--this is normal. What we want to do is cook that off and wait until they start smelling like brownies. 
Because there are endless variations on how to roast and beans vary in how much they should be roasted, I've suggested a very \"average\" roast. You can experiment with future batches.\n4. At 30 mins, pull the beans out and place the pan in front of a fan to cool.\u00a0 If you don't have a fan, don't worry, just let them cool until they are cool enough to handle.. Now comes the fun part--removing all the shells from the roasted cocoa beans!\u00a0 Remember that friend I mentioned?\u00a0 Now is their time to shine.\u00a0 Each bean needs to have its shell removed.\u00a0 After roasting, the shells will be brittle and should crack off easily (some will be harder than others).\u00a0 The nibs inside will also break apart, this is ok.\u00a0 This is tedious and can get tiring and will ruin a manicure, but the alternative is to spend lots of money and buy an industrial machine to do it for you. On second thought, you might want to get two friends to help...\n\u00a0\nAs you de-shell, keep the beans/nibs in one bowl, and the shells in another (the shell can be thrown away or composted, but keep in mind cocoa shells are still just as bad for dogs as chocolate is, so don't let Fido find them!).\u00a0 And don't worry if there are a few bits of shell in with the cocoa nibs, they will get filtered out by the juicer.. \nNow is the messy part, and that friend will come in handy here, too.\u00a0\nThe de-shelled cocoa beans need to be run through the juicer.\u00a0 If you aren't sure if your juicer can handle this, check the brochure--any juicer that can make nut butters should be suitable.\u00a0 What you want is for the juicer to grind AND heat the cocoa so that the cocoa butter present in the beans melts** .\u00a0\nWith the filter screen in the juicer, and a bowl under the \"juice\" port and one under the \"pulp\" port on the juicer, slowly start adding the nibs. Don't rush, you can overload the juicer.\u00a0 At first almost all of what you add will come out the \"pulp\" port. 
Once you have run it all through, do it again.\u00a0 Each subsequent pass will heat the mass and more and more will melt through the filter screen and come out the juice port, while less comes out the pulp end. Each time, run what comes out the pulp end through, you will be collecting what is called cocoa liquor--partially refined, liquified cocoa mass--flowing out the juice port. At a certain point the only thing coming out the pulp end should be cocoa shell, since you have de-shelled by hand, almost nothing should come out. You are done when nothing or very little comes out the pulp end. You should have a nice bowl of melted cocoa liquor.**If your juicer doesn't generate enough heat (i.e., nothing comes through the juice port), have that friend point a hair drier at the auger end of the juicer until everything starts melting and flowing smoothly.\u00a0 Be careful not to blow all your cocoa away with the hair drier as it comes out of the juicer (I've learned this from experience).. You will need to weigh your cocoa liquor in order to formulate your final chocolate's percentage.\u00a0 I like a dark chocolate so I never go lower than 70%.\nHere's how it works:\nif you want your final chocolate to be a 70% dark chocolate, take the weight of the cocoa liquor (in grams), and divide that number by 70. Take the resulting number and multiply that by 30--that is the grams of sugar you will need to add to make the final batch a 70% dark chocolate. See, math is fun when it makes chocolate!\nNow, weigh out the amount of sugar you calculated for your custom chocolate. Pre-grind the sugar in small batches in your coffee grinder. You only need to grind for about 30 seconds to get a nice powdered sugar. 
With a rubber spatula, mix the freshly powdered sugar into the cocoa liquor, making sure there are no clumps.\u00a0 While mixing, have a friend lay out a piece of parchment or foil on a baking sheet (aren't friend's great?)\nOk, i'm sure the suspense is killing you---once the sugar is mixed in, have a taste! (I'll pretend like I didn't see you tasting the cocoa liquor :)\u00a0 Once you have a few batches under your belt, you can add spices and other dehydrated goodies at this step, so long as they don't contain water or moisture. Water will ruin chocolate (even just a drop or two).. \nYou are almost done---All that's left to do is to dispense your chocolate into portions onto the lined cookie sheet.**\u00a0 If you have chocolate molds, use them.\u00a0 With a spoon, portion out the liquid chocolate onto the cookie sheet.\u00a0 When done, pop it in the fridge for 15 minutes.\u00a0 Once they are solid, they are ready to eat.\u00a0 They will melt in your hands because they are untempered, but if you keep them in a tupperware in the fridge or freezer, it will slow their melting (and slow the formation of fat and sugar \"bloom\"---the swirls and speckles in the picture). 
\u00a0\nPat yourself on the back, because you just made handmade chocolate from scratch!\u00a0 Now go out and brag to your friends about your accomplishment, but be prepared to share!**I'm deliberately skipping tempering the final chocolate because that is a whole science by itself, but there are plenty of websites that explain how to do it.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [2, 3, 0, 1]\nD: [2, 3, 1, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_193_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_193_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_193_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_193_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [1, 0, 2, 3]\nD: [1, 3, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. 1 package of dry yeast1/4 cup very warm water (about 100-105 degrees F.)1/2 tsp white sugar1/2 cup pure pumpkin puree (not pumpkin pie filling)1/4 cup heavy cream or milk 1 tsp fine salt1/4 cup melted butter1/2 tsp vanilla extract3/4 tsp pumpkin pie spice (or 1/2 tsp ground ginger and 1/4 tsp together grounded cloves,cinnamon,nutmeg)1 large egg1/4 cup granulated sugar3 to 4 cups all purpose flour (divided), as needed (add enough flour so that dough just barely pulls away from sides, and a very soft, slightly sticky dough is formed). Warm together water, milk and sugar to reach about 100-105 degrees F. and add yeast and mix till incorporated and leave keep aside for 10 minutes and if the yeast mixture bubbles or increases in size, that means the yeast is active and our rolls will turn out soft and fluffy. 
If the yeast mixture remains the same after 10 minutes, that means either the yeast is not good or water milk mixture was not the right temperature, so need to repeat this step using fresh ingredientsMelt the butter and keep asideIn a large mixing bowl + Sugar + melted butter and mix well + egg + yeast mixture and mix well + pumpkin puree + pumpkin pie spice + Salt + Vanilla essence and mix well till incorporatedAdd in 1 cup of flour at a time so that dough just barely pulls away from sides, and a very soft,slightly sticky dough is formedSpread all sides of dough with olive oil, wrap the bowl with cling wrap and let it sit on a warm place for 1-2 hours or till doubled in size. Ingredients:3/4 cup packed brown sugar1/4 cup of granulated sugar2 tbsp ground cinnamonSteps:Mix all the above ingredients well till incorporated and keep aside until required. Place the doubled dough on a surface smeared with flour and with the help of rolling pin, roll the dough till it reaches 1/8th inchMelt 5 tbsp butter and brush on the rolled dough Spread the cinnamon sugar on the rolled doughTightly roll the dough and cut into 16 rollsPlace the cut rolls on a greased baking tray about 1/2 inch apart. I used 2 trays because i did not have a big enough trayCover with cling wrap and keep aside for 45 minutes in a warm placePreheat oven to 350F and bake for 30 minutes. 
Ingredients:1/4 cup room temperature cream cheese1 cup powdered sugar1/4 cup milk, or as needed1/4 tsp vanilla extract, optionalSteps:Mix all the above ingredients very well till you get smooth mixturePour on top of warm baked rolls and serveDone :)\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 0, 2, 1]\nC: [1, 0, 2, 3]\nD: [1, 3, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_194_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_194_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_194_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_194_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [1, 0, 3, 2]\nD: [1, 3, 2, 0]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. This list cover 2 cups/glasses of dessert, but if you want more you only have to double this amount! :)100ml (3.38oz) of raw-fat milk100ml (3.38oz) of Italian style coffee (the best one you can find)100gr (3.52oz) of sweet white yoghurt (don't go for the sour one, this time)Muller makes an excellent 0,1% fats sweet yoghurt, so you can have a good choice with the illusion of a less fatty recipe XD200gr (7.05oz) of Stracciatella ice-creamStracciatella is like regular white ice-cream but with a lot of crunchy chocolate chips inside. This will give a great texture to the mix. If you can't find Stracciatella, you can always go for the classical white one and add some chocolate chips after. If possibile, buy homemade ice-cream. Here in Italy is very easy... 1 out 5 shops are gelato's ones :DAbout 12 ice cubes4 teaspoon of sugar, I suggest brown sugarFresh liquid cream (to whip later!) 
or homemade whipped creamSome gelato's shop also prepares a wonderful whipped cream that you can take away. If you prefer, you can buy it there, already whipped! But please, don't use spray whipped cream. It's awful. XDI used whipped cream directly from gelato shop! :)The tools you will need are as follow:Blender Digital scale Whip for the liquid cream (if needed) Spoons and cups Sac \u00e0 poche or syringe for food decoration (to apply whipped cream)The recipe is pretty easy, in my style! :DSo let's move on, you will have your Frozen Cappuccino in minutes.. As some of my followers may notice, I love this kind of recipes: put everything in the blender and then eat! XDThey are super easy, fast, and satisfying!So, let's start by making a good coffee with yuor moka. Use the best quality coffee you can find in your market. It may sounds unnecessary, but coffee quality REALLY makes the difference.Pour 100ml (3.38oz) of coffee in a cup with 4 teaspoons of brown sugar, and mix it up.The heat of coffee will help you to do this quickly!Now, add 100ml (3.38oz) of raw-fat milk to the mixture. I suggest to use fridge-cool milk, so you will have it at room temperature without waiting.. In the meantime, take 200gr (7.05oz) of ice-cream out of the freezer, to let it soft a bit.Did you know that super-hard frozen ice-cream can easily burn the spin motor of your blender? I had some bad experiences about it... so trust me, let it soft a bit XDPut ice-cream, 100gr (3.52oz) of sweet yoghurt and 12 ice cubes in the blender, along with the sweet milk and coffee mix prepared in the previous step.If you like the taste, you can also add a portion of vanilla extract before mixing.Now blend everything until you have a smooth cream.You can change the \"texture\" of your mix by varying blending time: less time means a more grainy mixture. Your choice! ;)Now, pour all the mixture in two cups/glasses (or more if you have doubled the ingredients).We are almost done, it's time for whipped cream!. 
This part is very easy.Simply pour the fresh liquid cream in a large cup and whip it until it is at the right density.My Kenwood blender (I love it) has an extra accessory that can be applied on the bottom of the mixing cup, and it works like a big whip. It's also perfect for whipping white eggs. :PYou can add some powdered sugar or vanilla sugar before you whip it, to add some extra sweeteness!If you have bought fresh whipped cream, skip this part. Like me! :DNow you only have to load your sac \u00e0 poche or your decoration syringe (I use the last one) with a good amount of whipped cream.Take the cups/glasses with the coffee mixture that we have prepared in the previous step and add a whirly, magical cream top on your dessert! Time for om nom nom nom! :D. You can finish everything by adding a sprinkle of brown sugar onto the creamy top.Now it's time to taste this little sin of gluttony!Yum!Hope you'll find it very tasty and satisfying, and remeber to not overdo with this!!!My little gluttons! ;)From Italy, this is Filippo.Have sweet days.Ciao a tutti! :D\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 0, 1]\nC: [1, 0, 3, 2]\nD: [1, 3, 2, 0]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_195_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_195_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_195_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_195_3.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 2, 3]\nD: [1, 3, 0, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. First combine the dry ingredients and mix them well. 
- add the flour - add the baking powder - add the salt - mix well Then cut off the appropriate amount of butter, and chop it into small pieces before adding it to the dry ingredients. Mix until the dough forms small clumps. Add milk slowly until the dough forms a ball. Dump out all of the dough onto the counter and roll out into a sheet. Use a cookie cutter to cut out biscuits. Bake for 6 minutes on an ungreased cookie sheet in a 450 degree Fahrenheit oven. I carefully rolled out each batch to 0.5 cm thickness and measured each product after baking to compare the three oils.. I baked each batch on an ungreased cookie sheet for 6 minutes in a 450 degree Fahrenheit oven on the middle rack.. After baking, I sliced one of the butter batch in half and measured the cross sectional thickness of the biscuit. The biscuit had been rolled out to 0.5 cm in thickness before baking. After baking the biscuit was 1.0 cm in thickness. The biscuit tasted fluffy and moist.. For a second batch, I replicated the same recipe with the same procedure and other ingredients but substituted olive oil for the butter. I found it easier to mix this batch since the oil was a liquid rather than a hard solid like the butter. The dough was drier and crumblier than the butter dough.. After baking, I sliced one of the olive oil batch in half and measured its thickness. Before baking these biscuits were 0.5 cm in thickness, after baking they were 0.75 cm in thickness.. Again I repeated the same procedure except this time I used coconut oil instead of butter. The coconut oil is a soft solid at room temperature so it was really easy to mix into a batter.. The biscuits made with coconut oil were flakier in texture, had a nice nutty flavor, and also had large gas pockets within the biscuit that increased their overall volume. This batch definitely looked fluffy when they came out of the oven. Before baking these biscuits were 0.5 cm in thickness. After baking they were 1.0 cm in thickness.. 
All three biscuits were tasty, but the texture and appearance of the coconut oil biscuit was the smoothest, fluffiest, and sweetest. In the photo of the cross sections of the biscuits, they are from left to right: butter, olive oil, and then coconut oil. Now I wonder what will happen if I switch to another type of flour? or add a sweetener to make desert biscuits? or spices to make them savory?\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [2, 3, 1, 0]\nC: [1, 0, 2, 3]\nD: [1, 3, 0, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_196_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_196_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_196_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_196_3.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 1, 3, 2]\nD: [3, 0, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. You will need:Plain flour an egg or two Milk Salt Oil - sunflower or canola (rapeseed) Curry powder (controversial, I know)...yes, not really many ingredients at all...which begs the question: what do they put in Yorkshire pudding packet mix if you still need to add an egg and milk???. You will need:dessert spoon to measure the flour with (...you will see scales in one image, but these are not necessary, I was calibrating my spoon!!!) tea spoon (5ml) fork, to beat out the lumps whisk, to whip air in mixing bowl or jug (I've tried both and the bowl wins hands-down!) sieve for the flour Yorkshire pudding/cup-cake/bun tin to cook them in (I'm making individual ones here). 
Place the sieve over your mixing bowl.For 12 good-sized individual puddings you will need 4 heaped tablespoons of flour (slightly over 4oz, 125g if you want the security of using scales!)Then add about 1 teaspoon of curry powder - if you have it - if not you can use paprika, garlic granules/powder, onion salt, or whatever you fancy.Add a pinch of salt ...well no more that 1/4 tea-spoon (if you are using onion or garlic salt you may not need any further added salt!)Now sift this through the sieve into the bowl - use the tablespoon to break up any lumps of spice or flavouring.Make a well in the centre and crack a large egg into this (or two small ones). Then add about 1/4 pint (5 fl oz, 150ml) milk and mix it together with the fork and work it well to get all the lumps out. Then beat well to get some air into the mix. Then put it into the refrigerator for about half an hour. So... you will have worked out that if you want your puddings ready with the rest of the meal you want to make them about 1 hour in advance.... First, prepare your tin ... place a small amount of oil in each cup then use something to thoroughly coat the inside of each compartment, I use a pastry brush, but grease proof paper could also be used. Make sure there is a small puddle in the bottom of each cup.Wind the temperature of the oven up to 200C (400F) and put the tin in to heat up.... but not for too long ...you don't want to spoil your meal, you just need that tin to be really hot!Get your batter out of the fridge and whip in about another 1/4 pint of milk and a tea spoon of oil. Now whip it again to get lots of air into it ... loads of small bubbles will cover the top like a foam, stir them into the batter then whip again.Now quickly take the hot tin from the oven, pour the batter into the cups and get it back into the oven. Give it a minute or two to warm up then turn the oven back down to the cooking temperature of your other dishes.The puddings will want between 25 and 35 minutes... 
please see the next step for more details.... So... how do you like your puddings? Do you like them the traditional pudding texture (soft and creamy) or do you like them crisp and brittle? Or crisp, but a bit chewy? Well, as you can see from the images, I've been calibrating the process. The more batter you put in each cup, the more moist/soft each pudding will be. The longer you cook them the more moisture you drive out so the crisper they will be.In the first two images you can see I have filled the left-most row to the brim, the next row are half full then the third row were about 1/4 full. In the oven at 185C (362F) each row rose up, the left-most over the top of the cups. I gave that tin 25 minutes only. The result was that the left-most row were soft and melting, the middle crisp and chewy the third row were really crisp, but not brittle-crisp. The fifth image shows the result of filling the cups completely and giving them 35 minutes at 190C (375F), again they rise up, however do not collapse when the tin comes out of the oven. So, to get really crisp and crunchy \"puddings\" like my kids love you need to only fill each cup 1/4 full, then cook them at 190C (375F) for 35 minutes... but beware you don't want to be removing the joint from the oven to stand too soon in the cooking cycle... so what you could do is take the meat from the oven, cover it and keep it warm to let it stand and go juicy, while you cook your Yorkshire Puddings to perfection in a hot oven.Enjoy!Later Note: A contributor from Yorkshire has kindly pointed out that traditional Yorkshire Puddings rise to 3 times their volume and are light and fluffy... unfortunately an aspiration a lot of restaurants fail to meet in my experience! To get them to rise that much you need to be whipping plenty of air into the batter, and not removing them from the oven until they have risen and set. One egg can absorb up to 1 pint of milk (just look a quiche!!!) 
so do not worry if you add \"too much\" milk; your puddings will be more fluffy than if you stick to just 1/2 pint for the crispy version.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 2, 0]\nC: [0, 1, 3, 2]\nD: [3, 0, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_197_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_197_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_197_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_197_3.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [1, 3, 0, 2]\nD: [3, 0, 1, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Add enough curd to soak the sago for at least 6 hours.( Approximately 1/3 cup of sago requires 1/4 cup curd)After 6 hours the sago pearls have absorbed the curd. It becomes soft and almost doubles in size.(Soaking the sago for about 6 hours is very important or else they will burst in the oil while cooking). Take a mixing bowl and add the soaked sago to it.Add three cups of rice flour to sago.Add as much red chili powder and salt as you like, depending on your taste peference.. After adding the spices, mix everything. Add two tablespoons of hot oil and a small amount of water, to make a nonstick dough.The dough consistency should be such that it can easily pass through a murukku press.. Now take a murukku press or any other type of press and grease it with a little oil. Place a star shaped disc inside it.Take a small portion of dough and fill the murukku press with it.. 
Fry the dough slowly in batches by pressing it through the murukku press, till they turn a little brown.Enjoy the sago murukkus with some tea as an evening snack.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [3, 1, 0, 2]\nC: [1, 3, 0, 2]\nD: [3, 0, 1, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_198_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_198_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_198_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_198_3.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_ordering", "visual_input_component": "Natural image", "source": "RecipeQA", "options": "A: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [2, 3, 0, 1]\nD: [0, 1, 3, 2]", "question": "What is the correct order of the images?", "context": "Your task is to give the correct order to the input images according to the context.\nHere is the context:\n. Cake\n2 lbs 80/20 Ground Chuck\n2 eggs\n1 cup breadcrumbs\n1 cup chopped onions\n1 carrot shredded\n4 grape tomatoes\n1/2 cup cheddar cheese\n2 tablespoons\u00a0Worcestershire\u00a0sauce\n1 Teaspoon Salt\n1 Teaspoon Pepper\nFrosting\n1 packet Brown Gravy\n1/8 teaspoon corn starch\n3 large potatoes (although 2 would have been enough)\n1/4 cup milk\n6 grape tomatoes. First things first. You want to get the potatoes cooking, those will take the longest\u00a0guaranteed.\nStart by washing them off. Give them a rinse, maybe a little scrub with a brush if you have one. We're going to be pealing them but there is no reason to risk getting anything in your food. \"Cleanliness is next to godliness\", is more true than ever in the kitchen.\nOnce they're washed go ahead and peel them, then chop them up. Try to keep them all about the same size so they cook evenly. Dump the chopped potatoes in a pot, fill with water till their covered then put on the lid and set them on the stove. Medium should be enough. 
Now don't forget about these, we'll use them later. And that pot might boil over depending how big it is. I\u00a0accidentally\u00a0grabbed one a size too small if you ask me.. Meatloaf is actually a very straightforward process.\nTake all the ingredients and put them in a bowl, mix by hand.\nNow to be more specific. I started by opening up the ground chuck and putting it in the bowl. On top of that I shredded the carrot, added the onions, and half of the breadcrumbs. I took some of the grape tomatoes and chopped them up, cleaning out as much of the seeds as i could. Just give them a little squeeze and pinch off the pulp and seed. I added the tomatoes for texture, but I wasn't after crunchy. I gave it a little mix, added the tomatoes, the rest of the breadcrumbs, the cheese, salt, pepper, and\u00a0Worcestershire\u00a0sauce. I just broke it up a little to make sure it was a bit more even all the way through.\u00a0. Now we're getting to the second long stage of the process. I took two cake pans and buttered the bottom and the sides. After that I poured in some of the breadcrumbs and shook them around until the pan was covered. They stick to the butter. This gives the meatloaf a shape and surface thats a little more uniform and cake like. Divide the meat from the bowl in half and press half into each pan, shaking some more breadcrumbs on top.\nThese two go into a 350 degree oven for half an hour. I know regular meatloaf tend to cook a bit longer, but because the surface to mass ratio is much larger than an average loaf, meaning they're much thinner so the heat doesn't need to penetrate as far, I cut the time.. Remember those potatoes? This is the perfect time to start fiddling with them again. If they are fork tender, meaning in the pot, you can stab them with a fork and not encounter much resistance, go ahead and take them off the heat. Drain out the water and then mash the potatoes. Once they're broken down fairly well add a little bit of milk. 
The idea is to have a very creamy texture. If you have an immersion blender give them a buzz. Keep in mind we're aiming for creamy, not soupy, so keep the moisture level down as much as possible. Add just enough milk to get them soft and smooth. \u00a0\nNow if you still have some time before your giant hamburgers come out of the oven it'd be a good time to get your gravy done. Just follow the directions on the brown gravy package although once its finished stir in an extra 1/8 of a teaspoon of corn starch. This is a thickening agent and with how we need the gravy to behave we're gonna need it.. This part things went really fast. I misjudged my time slightly, and was also at a complete and utter loss of what the heck I was doing. I have never decorated cakes before, but I am an avid fan of the Food Network. Here's how it went for me\nWhen teh patties are done pull them out of the oven, giving them a little time to cool. Take one and place it on the plate or platter you're going to use. Then take a scoop of the mashed potatoes and spread it out on top of the patty, like a layer of frosting in a double decker cake. Place the other patty on top of the mashed potatoes. Even with everything centered and made in circular pans they probably don't match up perfectly. Take a knife and go around the edge, trimming them down so they're even. After that pour the gravy on top, and using a rubber spatula to spread it out best you can. It will adhere fairly well but its not an exact science in my experience.\u00a0\nOnce the gravy is on the meatloaf, put the mashed potatoes in a zip top bag. Compress them down into a corner and give the top a twist. With a pair of scissors snip off the corner and you have a make shift piping bag. Put a few dollops of the potatoes on the cake, and then take some more grape tomatoes and put them on top of the potatoes for a frosting and cherry look. 
Make sure you have that one candle sitting in the middle, and when you light it up remember your favorite companion cube.\nEnjoy fooling your friends with this unconventional cake.\nRead the question below and select from the following choices.\nA: [0, 1, 2, 3]\nB: [1, 3, 0, 2]\nC: [2, 3, 0, 1]\nD: [0, 1, 3, 2]", "input_image_path": ["./Discrete-temporal/visual_ordering/visual_ordering_199_0.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_199_1.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_199_2.jpg", "./Discrete-temporal/visual_ordering/visual_ordering_199_3.jpg"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/visual_quality_assessment_q_bench+/qwen3-vl/metadata_info.json b/results/visual_quality_assessment_q_bench+/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..b900395 --- /dev/null +++ b/results/visual_quality_assessment_q_bench+/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Are both of these images relatively realistic?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_0_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_0_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the first image sharper than the second image?", "context": "Candidates: A. Yes B. 
No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_1_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_1_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: overexposure\nB: low light\nC: noise\nD: blur\n", "question": "Which distortion is missing in the second image compared to the first image?", "context": "Candidates: A. overexposure B. low light C. noise D. blur", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_2_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_2_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The sky in the first image\nB: The figure's back in the second image\nC: The building in the center of the first image\nD: The shop window in the second image\n", "question": "Which part of the two images is more affected by underexposure?", "context": "Candidates: A. The sky in the first image B. The figure's back in the second image C. The building in the center of the first image D. 
The shop window in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_3_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_3_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The ground in the first image\nB: The dog in the first image\nC: The baby in the second image\n", "question": "Which part below is affected by motion blur?", "context": "Candidates: A. The ground in the first image B. The dog in the first image C. The baby in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_4_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_4_1.JPG"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Much weaker\nB: About the same\nC: Much stronger\n", "question": "Compared to the second image, how is the lighting in the first image?", "context": "Candidates: A. Much weaker B. About the same C. Much stronger", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_5_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_5_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Similar\nB: Slightly better\nC: Slightly worse\n", "question": "Compared to the second image, how is the lighting situation in the first image?", "context": "Candidates: A. Similar B. Slightly better C. 
Slightly worse", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_6_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_6_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the illumination of the second image stronger than the first image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_7_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_7_1.bmp"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the first image clearer than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_8_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_8_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Does the first image have more overexposure distortion than the second image?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_9_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_9_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The person in the first image\nB: The telephone booth in the first image\nC: The background in the second image\n", "question": "Which part is most affected by motion blur?", "context": "Candidates: A. The person in the first image B. The telephone booth in the first image C. The background in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_10_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_10_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the first image more realistic than the second image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_11_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_11_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the lighting of the first image stronger than the second image?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_12_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_12_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Motion blur\nB: Overexposure\nC: Out of focus\n", "question": "What kind of distortion issue do these two images not have?", "context": "Candidates: A. Motion blur B. Overexposure C. Out of focus", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_13_0.bmp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_13_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the first image more blurry than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_14_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_14_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Underexposure\nB: Blur\nC: Motion blur\nD: Overexposure\n", "question": "What problem is not present in the two images?", "context": "Candidates: A. Underexposure B. Blur C. Motion blur D. 
Overexposure", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_15_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_15_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Richer\nB: About the same\nC: Less rich\n", "question": "Compared to the first image, how does the texture detail level of the second image look like?", "context": "Candidates: A. Richer B. About the same C. Less rich", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_16_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_16_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The woman's face in the second image\nB: The blanket in the second image\nC: The grassland background in the first image\nD: The dog's fur in the first image\n", "question": "Which area has clearer details and textures?", "context": "Candidates: A. The woman's face in the second image B. The blanket in the second image C. The grassland background in the first image D. The dog's fur in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_17_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_17_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Are the colors of these two images both monotonous?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_18_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_18_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The background of the first image\nB: The apple in the first image\nC: The black and white wall of the second image\n", "question": "Which part is most seriously affected by overexposure?", "context": "Candidates: A. The background of the first image B. The apple in the first image C. The black and white wall of the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_19_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_19_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the composition of the first image better than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_20_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_20_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: worse\nB: similar\nC: better\n", "question": "Compared to the second image, how is the composition of the first image?", "context": "Candidates: A. worse B. similar C. 
better", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_21_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_21_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Blurrier\nB: Clearer\nC: About the same\n", "question": "Relative to the first image, how clear is the second image?", "context": "Candidates: A. Blurrier B. Clearer C. About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_22_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_22_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Less realistic\nB: More realistic\nC: About the same\n", "question": "Compared to the first image, how would you rate the realism of the second image?", "context": "Candidates: A. Less realistic B. More realistic C. About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_23_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_23_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The left side of the second image\nB: The dog in the second image\nC: The figures in the first image\n", "question": "Which part below is most severely affected by overexposure?", "context": "Candidates: A. The left side of the second image B. The dog in the second image C. 
The figures in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_24_0.webp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_24_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the color of the first image more vivid than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_25_0.JPG", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_25_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: More realistic\nB: Less realistic\nC: About the same\n", "question": "Compared to the first image, how real is the second image?", "context": "Candidates: A. More realistic B. Less realistic C. About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_26_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_26_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the second image sharper than the first image?", "context": "Candidates: A. Yes B. 
No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_27_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_27_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the first image blurrier than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_28_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_28_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Much worse\nB: About the same\nC: Much better\n", "question": "Compared to the first image, how is the composition of the second image?", "context": "Candidates: A. Much worse B. About the same C. Much better", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_29_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_29_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Street lamp in the first image\nB: Pedestrian in the second image\nC: Ground in the first image\n", "question": "Which part below is most severely affected by overexposure?", "context": "Candidates: A. Street lamp in the first image B. Pedestrian in the second image C. 
Ground in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_30_0.bmp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_30_1.bmp"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Snowflake\nB: Strong light\nC: Low light\nD: Overexposure\n", "question": "In the problem of which is more severe between the first image and the second image, which of the following is not present?", "context": "Candidates: A. Snowflake B. Strong light C. Low light D. Overexposure", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_31_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_31_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: much worse\nB: almost the same\nC: much worse\nD: much better\n", "question": "Compared to the first image, how is the aesthetic composition of the second image?", "context": "Candidates: A. much worse B. almost the same C. much worse D. much better", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_32_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_32_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the first image sharper than the second image?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_33_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_33_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Second image\nB: First image\n", "question": "Which image is affected more by overexposure?", "context": "Candidates: A. Second image B. First image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_34_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_34_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Less rich\nB: About the same\nC: Richer\n", "question": "Compared to the first image, how rich are the texture details in the second image?", "context": "Candidates: A. Less rich B. About the same C. Richer", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_35_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_35_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: About the same\nB: Blurrier\nC: Clearer\n", "question": "Compared to the first image, how is the clarity of the second image?", "context": "Candidates: A. About the same B. Blurrier C. 
Clearer", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_36_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_36_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Clearer\nB: Blurrier\nC: About the same\n", "question": "Compared to the first image, how is the clarity of the second image?", "context": "Candidates: A. Clearer B. Blurrier C. About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_37_0.JPG", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_37_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Ground in the first image\nB: Dog in the first image\nC: Person in the second image\n", "question": "Which part below is most affected by motion blur?", "context": "Candidates: A. Ground in the first image B. Dog in the first image C. Person in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_38_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_38_1.bmp"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The sky in the second image\nB: The person in the second image\nC: The strawberry in the first image\n", "question": "Which part below is most severely affected by overexposure?", "context": "Candidates: A. The sky in the second image B. The person in the second image C. 
The strawberry in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_39_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_39_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Sharper\nB: Blurrier\nC: About the same\n", "question": "Compared to the first image, how is the sharpness of the second image?", "context": "Candidates: A. Sharper B. Blurrier C. About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_40_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_40_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The focused red flowers in the second image\nB: The flower bush background in the second image\nC: The background in the first image\nD: The man's silhouette in the first image\n", "question": "Which area is more affected by blurring?", "context": "Candidates: A. The focused red flowers in the second image B. The flower bush background in the second image C. The background in the first image D. 
The man's silhouette in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_41_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_41_1.JPG"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Second image\nB: First image\n", "question": "Which of the following images has a more serious overexposure issue?", "context": "Candidates: A. Second image B. First image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_42_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_42_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the composition of the first image better than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_43_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_43_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the second image more realistic than the first image?", "context": "Candidates: A. Yes B. 
No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_44_0.webp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_44_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the texture detail of the first image richer than the second image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_45_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_45_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: About the same\nB: Slightly sharper\nC: Slightly more blurry\n", "question": "Compared to the first image, how is the sharpness of the second image?", "context": "Candidates: A. About the same B. Slightly sharper C. Slightly more blurry", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_46_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_46_1.webp"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Checkerboard ground in the first image\nB: Horse in the second image\nC: Background in the second image\n", "question": "Which part has the most severe issue of losing texture details?", "context": "Candidates: A. Checkerboard ground in the first image B. Horse in the second image C. 
Background in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_47_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_47_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The ground in the second image\nB: The waves in the first image\nC: The plants in the second image\n", "question": "Which part below is most severely affected by snowflakes?", "context": "Candidates: A. The ground in the second image B. The waves in the first image C. The plants in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_48_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_48_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Are both of these images very clear?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_49_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_49_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: House windows in the second image\nB: Banana in the first image\nC: Facial features of the person in the first image\n", "question": "Which part below suffers the most severe underexposure problem?", "context": "Candidates: A. House windows in the second image B. Banana in the first image C. 
Facial features of the person in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_50_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_50_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Much less\nB: About the same\nC: Much more\n", "question": "How is the noise situation in the second image compared to the first image?", "context": "Candidates: A. Much less B. About the same C. Much more", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_51_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_51_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: the road in the second image\nB: the background of the first image\nC: the ground of the first image\n", "question": "Which part below is most severely affected by snowflake-like distortion?", "context": "Candidates: A. the road in the second image B. the background of the first image C. the ground of the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_52_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_52_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The floor in the first image\nB: The ground in the second image\nC: The hand holding a gun in the second image\n", "question": "Which part has the richest detail texture?", "context": "Candidates: A. 
The floor in the first image B. The ground in the second image C. The hand holding a gun in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_53_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_53_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Similar\nB: More sufficient\nC: Less sufficient\n", "question": "Compared to the first image, how is the illumination of the second image?", "context": "Candidates: A. Similar B. More sufficient C. Less sufficient", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_54_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_54_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Window in the second image\nB: Aircraft in the first image\nC: Person in the second image\n", "question": "Which part below is most severely affected by overexposure?", "context": "Candidates: A. Window in the second image B. Aircraft in the first image C. Person in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_55_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_55_1.JPG"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Are there severe motion blur in both images?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_56_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_56_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The person in red in the second image\nB: The facial part of the person in the first image\nC: The sunglasses in the first image\nD: The top of the tent in the second image\n", "question": "Which part below is most severely affected by overexposure?", "context": "Candidates: A. The person in red in the second image B. The facial part of the person in the first image C. The sunglasses in the first image D. The top of the tent in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_57_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_57_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Background of the first image\nB: Character in the second image\nC: Character in the first image\n", "question": "Which part is most severely affected by noise?", "context": "Candidates: A. Background of the first image B. Character in the second image C. 
Character in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_58_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_58_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Similar\nB: More Adequate\nC: Less Adequate\n", "question": "Compared to the first image, how is the illumination of the second image?", "context": "Candidates: A. Similar B. More Adequate C. Less Adequate", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_59_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_59_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Similar\nB: Less realistic\nC: More realistic\n", "question": "Compared with the first image, how does the authenticity of the second image differ?", "context": "Candidates: A. Similar B. Less realistic C. More realistic", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_60_0.bmp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_60_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the first image more realistic than the second image?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_61_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_61_1.bmp"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: the top of the first image\nB: bird in the second image\nC: ground in the second image\n", "question": "Which part below is most affected by overexposure?", "context": "Candidates: A. the top of the first image B. bird in the second image C. ground in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_62_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_62_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Similar\nB: More fake\nC: More authentic\n", "question": "Compared to the second image, how does the first image's authenticity compare?", "context": "Candidates: A. Similar B. More fake C. More authentic", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_63_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_63_1.webp"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the first image clearer than the second image?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_64_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_64_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Are the details and textures in the first image clearer than those in the second image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_65_0.webp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_65_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: More blurry\nB: Clearer\nC: About the same\n", "question": "Compared to the first image, how is the sharpness of the second image?", "context": "Candidates: A. More blurry B. Clearer C. About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_66_0.bmp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_66_1.JPG"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Are the two images both quite clear?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_67_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_67_1.JPG"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the first image sharper than the second image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_68_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_68_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Are there noise issues in both of these images?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_69_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_69_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Much more severe\nB: Similar\nC: Much slighter\n", "question": "How does the noise situation in the second image compare to the first image?", "context": "Candidates: A. Much more severe B. Similar C. 
Much slighter", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_70_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_70_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Much worse\nB: Much better\nC: About the same\n", "question": "Compared to the first image, how is the focusing situation of the second image?", "context": "Candidates: A. Much worse B. Much better C. About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_71_0.JPG", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_71_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: More authentic\nB: About the same\nC: Less authentic\n", "question": "Compared to the first image, how does the authenticity of the second image compare?", "context": "Candidates: A. More authentic B. About the same C. Less authentic", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_72_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_72_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Compared to the second image, is the detail texture of the first image clearer?", "context": "Candidates: A. Yes B. 
No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_73_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_73_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Similar\nB: Much worse\nC: Much better\n", "question": "How is the focus of the second image relative to the first image?", "context": "Candidates: A. Similar B. Much worse C. Much better", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_74_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_74_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Underexposure\nB: Motion blur\nC: Overexposure\nD: Blur\n", "question": "Which kind of distortion is not present in the two images?", "context": "Candidates: A. Underexposure B. Motion blur C. Overexposure D. Blur", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_75_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_75_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Overexposure\nB: Focus problem\nC: Noise\n", "question": "What is the distortion that does not appear in the two images?", "context": "Candidates: A. Overexposure B. Focus problem C. 
Noise", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_76_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_76_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: First image\nB: Second image\n", "question": "Which image does not have overexposure distortion issue?", "context": "Candidates: A. First image B. Second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_77_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_77_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the focus of the first image not as good as the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_78_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_78_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: About the same\nB: Clearer\nC: Blurrier\n", "question": "Compared to the first image, how clear are the texture details of the subject in the second image?", "context": "Candidates: A. About the same B. Clearer C. 
Blurrier", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_79_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_79_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: About the same\nB: Much clearer\nC: Much blurrier\n", "question": "Compared to the first image, how is the clarity of texture details in the second image?", "context": "Candidates: A. About the same B. Much clearer C. Much blurrier", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_80_0.bmp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_80_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The wall in the first image\nB: The large tree on the right side in the second image\nC: The street light in the second image\nD: The clothes hanger in the first image\n", "question": "Which part below is most affected by overexposure?", "context": "Candidates: A. The wall in the first image B. The large tree on the right side in the second image C. The street light in the second image D. 
The clothes hanger in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_81_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_81_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: About the same\nB: More blurry\nC: Clearer\n", "question": "Compared to the first image, how is the clarity of the second image?", "context": "Candidates: A. About the same B. More blurry C. Clearer", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_82_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_82_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Less authentic\nB: About the same\nC: More authentic\n", "question": "Compared to the first image, how is the authenticity of the second image?", "context": "Candidates: A. Less authentic B. About the same C. More authentic", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_83_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_83_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the first image sharper than the second image?", "context": "Candidates: A. Yes B. 
No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_84_0.JPG", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_84_1.bmp"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Less real\nB: About the same\nC: More real\n", "question": "Compared to the first image, how does the reality of the second image compare?", "context": "Candidates: A. Less real B. About the same C. More real", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_85_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_85_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the noise in the first image much more obvious than in the second image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_86_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_86_1.bmp"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Similar\nB: More realistic\nC: Less realistic\n", "question": "How does the realism of the second image compare to the first image?", "context": "Candidates: A. Similar B. More realistic C. 
Less realistic", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_87_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_87_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Are both images not genuine?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_88_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_88_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the first image more realistic than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_89_0.webp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_89_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: More blurry\nB: Clearer\nC: About the same\n", "question": "Compared to the first image, how is the clarity of the second image?", "context": "Candidates: A. More blurry B. Clearer C. 
About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_90_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_90_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Much higher\nB: About the same\nC: Much lower\n", "question": "Compared to the second image, how is the pixel quality of the first image?", "context": "Candidates: A. Much higher B. About the same C. Much lower", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_91_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_91_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Slightly more\nB: More severe\nC: About the same\n", "question": "Compared to the first image, how much is the second image affected by motion blur?", "context": "Candidates: A. Slightly more B. More severe C. About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_92_0.webp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_92_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the first image clearer than the second image?", "context": "Candidates: A. Yes B. 
No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_93_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_93_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Similar\nB: More realistic\nC: Less realistic\n", "question": "How does the realism of the second image compare to the first image?", "context": "Candidates: A. Similar B. More realistic C. Less realistic", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_94_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_94_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the first image sharper than the second image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_95_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_95_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the color of the first image more rich and vivid than the second image?", "context": "Candidates: A. Yes B. 
No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_96_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_96_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the first image significantly less clear than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_97_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_97_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The sky in the upper right corner of the second image\nB: The buildings in the second image\nC: The lake surface in the first image\n", "question": "Which part below has the most severe overexposure?", "context": "Candidates: A. The sky in the upper right corner of the second image B. The buildings in the second image C. The lake surface in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_98_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_98_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: More abundant\nB: Less abundant\nC: About the same\n", "question": "Compared to the first image, how is the texture detail in the second image?", "context": "Candidates: A. More abundant B. Less abundant C. 
About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_99_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_99_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Background of the first image\nB: Stamen of the second image\nC: Person in the first image\n", "question": "Which part below is most severely affected by out-of-focus?", "context": "Candidates: A. Background of the first image B. Stamen of the second image C. Person in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_100_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_100_1.JPG"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Underexposed\nB: Blurry\nC: Motion blur\nD: Overexposed\n", "question": "Compared to the second image, what kind of distortion does the first image not have?", "context": "Candidates: A. Underexposed B. Blurry C. Motion blur D. Overexposed", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_101_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_101_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: More blurry\nB: About the same\nC: Clearer\n", "question": "Compared to the first image, how is the clarity of the subject's details and textures in the second image?", "context": "Candidates: A. More blurry B. About the same C. 
Clearer", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_102_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_102_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Lens flare\nB: Motion blur\nC: Overexposure\nD: Noise\n", "question": "What kind of distortion do not appear in these two images?", "context": "Candidates: A. Lens flare B. Motion blur C. Overexposure D. Noise", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_103_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_103_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Blur\nB: Motion blur\nC: Underexposure\nD: Overexposure\n", "question": "What kind of distortion is not present in the two images?", "context": "Candidates: A. Blur B. Motion blur C. Underexposure D. Overexposure", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_104_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_104_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: richer\nB: about the same\nC: more monotonous\n", "question": "Compared to the first image, how rich is the color in the second image?", "context": "Candidates: A. richer B. about the same C. 
more monotonous", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_105_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_105_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the color of the first image richer than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_106_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_106_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the sharpness of the first image lower than the second image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_107_0.bmp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_107_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the color of the first image richer than the second image?", "context": "Candidates: A. Yes B. 
No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_108_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_108_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Similar\nB: Worse\nC: Better\n", "question": "How does the composition of the second image compare to the first image?", "context": "Candidates: A. Similar B. Worse C. Better", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_109_0.bmp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_109_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The sky in the first image\nB: The person in the first image\nC: The bus in the second image\n", "question": "Which part below is most affected by noise?", "context": "Candidates: A. The sky in the first image B. The person in the first image C. The bus in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_110_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_110_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Underexposure\nB: Low light\nC: Out of focus\nD: Noise\n", "question": "Which type of distortion is more severe in the second image compared to the first image?", "context": "Candidates: A. Underexposure B. Low light C. Out of focus D. 
Noise", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_111_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_111_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: More severe\nB: Slightly more\nC: About the same\n", "question": "Compared to the second image, how is the first image affected by underexposure?", "context": "Candidates: A. More severe B. Slightly more C. About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_112_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_112_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Are both of these images very realistic?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_113_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_113_1.webp"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the illumination sufficient in both of these images?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_114_0.JPG", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_114_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the first image sharper than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_115_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_115_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: similar\nB: less rich\nC: richer\n", "question": "Compared to the first image, how is the texture detail of the second image?", "context": "Candidates: A. similar B. less rich C. richer", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_116_0.JPG", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_116_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the color of the first image more vivid than the second image?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_117_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_117_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the first image clearer than the second image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_118_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_118_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Front building in the second image\nB: Aircraft in the first image\nC: Left sky in the second image\n", "question": "Which part below is most affected by overexposure?", "context": "Candidates: A. Front building in the second image B. Aircraft in the first image C. Left sky in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_119_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_119_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the noise in the first image significantly more severe than in the second image?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_120_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_120_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the first image clearer than the second image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_121_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_121_1.webp"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Characters in the first image\nB: Top right corner of the second image\nC: Ground in the first image\n", "question": "Which part below is most affected by overexposure?", "context": "Candidates: A. Characters in the first image B. Top right corner of the second image C. Ground in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_122_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_122_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: First image\nB: Second image\n", "question": "Which of the following images is most affected by motion blur?", "context": "Candidates: A. First image B. 
Second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_123_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_123_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the illumination sufficient in these two images?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_124_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_124_1.bmp"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the fidelity of the first image lower than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_125_0.webp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_125_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Are both of these images clear?", "context": "Candidates: A. Yes B. 
No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_126_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_126_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Background of the first image\nB: Background of the second image\nC: Person in the first image\n", "question": "Which part below is most severely affected by motion blur?", "context": "Candidates: A. Background of the first image B. Background of the second image C. Person in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_127_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_127_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Similar\nB: More blurry\nC: Clearer\n", "question": "Compared to the first image, how is the clarity of the second image?", "context": "Candidates: A. Similar B. More blurry C. Clearer", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_128_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_128_1.bmp"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Similar\nB: Clearer\nC: Blurrier\n", "question": "How does the clarity of the second image compare to the first image?", "context": "Candidates: A. Similar B. Clearer C. 
Blurrier", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_129_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_129_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Are both images rich in color?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_130_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_130_1.webp"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Second image\nB: First image\n", "question": "Which of the following images has a serious overexposure issue?", "context": "Candidates: A. Second image B. First image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_131_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_131_1.webp"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Are the colors of these two images not very vivid?", "context": "Candidates: A. Yes B. 
No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_132_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_132_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Much better\nB: Much worse\nC: About the same\n", "question": "Compared to the first image, how rich are the colors in the second image?", "context": "Candidates: A. Much better B. Much worse C. About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_133_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_133_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Slightly worse\nB: Slightly better\nC: Much worse\nD: About the same\n", "question": "Compared to the lighting of the second image, how is the lighting of the first image?", "context": "Candidates: A. Slightly worse B. Slightly better C. Much worse D. About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_134_0.JPG", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_134_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Are both of these images clear?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_135_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_135_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Ground in the first image\nB: Car in the first image\nC: Plane in the second image\nD: Background in the second image\n", "question": "Which area is more affected by motion blur?", "context": "Candidates: A. Ground in the first image B. Car in the first image C. Plane in the second image D. Background in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_136_0.bmp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_136_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the lighting of the first image more sufficient than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_137_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_137_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Are both images very real?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_138_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_138_1.webp"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Are both of these images relatively blurry?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_139_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_139_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The player in the first image\nB: The horse in the second image\nC: The audience in the background of the first image\nD: The background in the second image\n", "question": "In which area of the two images is more affected by motion blur?", "context": "Candidates: A. The player in the first image B. The horse in the second image C. The audience in the background of the first image D. The background in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_140_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_140_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Ground of the second image\nB: Sky of the second image\nC: Ground of the first image\n", "question": "Which part has the most severe overexposure issue?", "context": "Candidates: A. Ground of the second image B. Sky of the second image C. 
Ground of the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_141_0.JPG", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_141_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the first image more realistic than the second image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_142_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_142_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the first image sharper than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_143_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_143_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: About the same\nB: Much blurrier\nC: Much clearer\n", "question": "Compared to the first image, how is the clarity of the second image?", "context": "Candidates: A. About the same B. Much blurrier C. 
Much clearer", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_144_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_144_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Background of the second image\nB: Table in front of the second image\nC: Grass in the first image\nD: Snowy mountain in the first image\n", "question": "Which area is more severely affected by blurring?", "context": "Candidates: A. Background of the second image B. Table in front of the second image C. Grass in the first image D. Snowy mountain in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_145_0.JPG", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_145_1.webp"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The ground in the second image\nB: The person in the second image\nC: The person in the first image\n", "question": "Which part below is most affected by motion blur?", "context": "Candidates: A. The ground in the second image B. The person in the second image C. 
The person in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_146_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_146_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Similar\nB: Slightly smaller\nC: Significantly larger\n", "question": "Compared to the first image, how is the second image affected by overexposure?", "context": "Candidates: A. Similar B. Slightly smaller C. Significantly larger", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_147_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_147_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the first image sharper than the second image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_148_0.png", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_148_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Less rich\nB: About the same\nC: Richer\n", "question": "Compared to the first image, how is the color richness of the second image?", "context": "Candidates: A. Less rich B. About the same C. 
Richer", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_149_0.JPG", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_149_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Much poorer\nB: Much richer\nC: About the same\n", "question": "Compared to the first image, how is the richness of colors in the second image?", "context": "Candidates: A. Much poorer B. Much richer C. About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_150_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_150_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Similar\nB: Better\nC: Worse\n", "question": "Compared to the first photo, how is the focus of the second photo?", "context": "Candidates: A. Similar B. Better C. Worse", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_151_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_151_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the composition of the first image better than the second image?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_152_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_152_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: More monotonous\nB: About the same\nC: More rich\n", "question": "Compared to the first image, how is the color richness of the second image?", "context": "Candidates: A. More monotonous B. About the same C. More rich", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_153_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_153_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Similar\nB: Clearer\nC: Blurrier\n", "question": "Compared to the first image, how is the clarity of the second image?", "context": "Candidates: A. Similar B. Clearer C. Blurrier", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_154_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_154_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Ground in the first image\nB: Sky in the second image\nC: Lion in the first image\n", "question": "Which part below is most severely affected by overexposure?", "context": "Candidates: A. Ground in the first image B. Sky in the second image C. 
Lion in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_155_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_155_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: More authentic\nB: About the same\nC: Less authentic\n", "question": "Compared to the first image, how would you rate the authenticity of the second image?", "context": "Candidates: A. More authentic B. About the same C. Less authentic", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_156_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_156_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Are both of these images very clear?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_157_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_157_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Have both figures in these two images been overexposed?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_158_0.bmp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_158_1.JPG"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Area in the first image\nB: Roof of the building in the second image\nC: Athlete in the first image\nD: Sky in the second image\n", "question": "Which area is most affected by overexposure?", "context": "Candidates: A. Area in the first image B. Roof of the building in the second image C. Athlete in the first image D. Sky in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_159_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_159_1.jpg"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The moon in the second image\nB: The person in the bottom right corner of the first image\nC: The left sky in the first image\n", "question": "Which part below is most affected by overexposure?", "context": "Candidates: A. The moon in the second image B. The person in the bottom right corner of the first image C. 
The left sky in the first image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_160_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_160_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the texture detail of the first image less rich than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_161_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_161_1.bmp"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Less real\nB: About the same\nC: More real\n", "question": "Compared to the first image, how real is the second image?", "context": "Candidates: A. Less real B. About the same C. More real", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_162_0.webp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_162_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: About the same\nB: More blurry\nC: Clearer\n", "question": "Compared to the first image, how clear is the second image?", "context": "Candidates: A. About the same B. More blurry C. 
Clearer", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_163_0.webp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_163_1.webp"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Low light\nB: Vignetting\nC: Noise\nD: Motion blur\n", "question": "Which type of distortion does not appear in the two images?", "context": "Candidates: A. Low light B. Vignetting C. Noise D. Motion blur", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_164_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_164_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Out of focus\nB: Noise\nC: Overexposure\n", "question": "What kind of distortion did not appear in these two images?", "context": "Candidates: A. Out of focus B. Noise C. Overexposure", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_165_0.bmp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_165_1.JPG"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: overexposure\nB: motion blur\nC: out of focus\n", "question": "Are there any distortion issues in these two images?", "context": "Candidates: A. overexposure B. motion blur C. 
out of focus", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_166_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_166_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the noise in the first image larger than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_167_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_167_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the sky in the second image more affected by overexposure than the sky in the first image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_168_0.bmp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_168_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Less sufficient\nB: About the same\nC: More sufficient\n", "question": "Compared to the first image, how is the lighting in the second image?", "context": "Candidates: A. Less sufficient B. About the same C. 
More sufficient", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_169_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_169_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the first image blurrier than the second image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_170_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_170_1.JPG"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the first image sharper than the second image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_171_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_171_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Much clearer\nB: About the same\nC: Much blurrier\n", "question": "Compared to the second image, how is the fine texture of the first image?", "context": "Candidates: A. Much clearer B. About the same C. 
Much blurrier", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_172_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_172_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Less sufficient\nB: More sufficient\nC: About the same\n", "question": "How does the illumination of the second image compare to the first image?", "context": "Candidates: A. Less sufficient B. More sufficient C. About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_173_0.bmp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_173_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Low light\nB: Blur\nC: Motion blur\n", "question": "What problems are not present in the two images?", "context": "Candidates: A. Low light B. Blur C. Motion blur", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_174_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_174_1.JPG"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Motion blur\nB: Underexposure\nC: Overexposure\nD: Weak light\n", "question": "Which of the following distortions does not appear in the two images?", "context": "Candidates: A. Motion blur B. Underexposure C. Overexposure D. 
Weak light", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_175_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_175_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the illumination sufficient in both of these images?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_176_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_176_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the first image sharper than the second image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_177_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_177_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the second image more realistic than the first image?", "context": "Candidates: A. Yes B. 
No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_178_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_178_1.webp"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Blur\nB: Overexposure\nC: Underexposure\nD: Noise\n", "question": "Please identify what kind of distortion is not present in these two images?", "context": "Candidates: A. Blur B. Overexposure C. Underexposure D. Noise", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_179_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_179_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: similar\nB: more realistic\nC: less realistic\n", "question": "Compared to the first image, how is the realism of the second image?", "context": "Candidates: A. similar B. more realistic C. less realistic", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_180_0.webp", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_180_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the first image more realistic?", "context": "Candidates: A. Yes B. 
No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_181_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_181_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Second image\nB: First image\n", "question": "Which image below is more severely affected by overexposure?", "context": "Candidates: A. Second image B. First image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_182_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_182_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Similar\nB: Worse\nC: Better\n", "question": "How does the composition of the second image compare to the first image?", "context": "Candidates: A. Similar B. Worse C. Better", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_183_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_183_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Very dark\nB: Much darker\nC: Much brighter\nD: About the same\n", "question": "Compared to the first image, how is the lighting in the second image?", "context": "Candidates: A. Very dark B. Much darker C. Much brighter D. 
About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_184_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_184_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The man in front of the lens in the first picture\nB: The bus in the first picture\nC: The fish in the second picture\nD: The leaves in the background of the second picture\n", "question": "Which area is more affected by low light?", "context": "Candidates: A. The man in front of the lens in the first picture B. The bus in the first picture C. The fish in the second picture D. The leaves in the background of the second picture", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_185_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_185_1.bmp"], "output": "D", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Much better\nB: Much worse\nC: About the same\n", "question": "Compared to the first image, how is the sharpness of the second image?", "context": "Candidates: A. Much better B. Much worse C. About the same", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_186_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_186_1.jpg"], "output": "C", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Are both of these images very blurry?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_187_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_187_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Are both of these images not very clear?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_188_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_188_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: The person riding a bike in the first image\nB: The background of the first image\nC: The plant in the second image\n", "question": "Which part below is most severely affected by motion blur?", "context": "Candidates: A. The person riding a bike in the first image B. The background of the first image C. The plant in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_189_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_189_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Compared to the second image, is the first image more affected by motion blur?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_190_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_190_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: More monotonous\nB: About the same\nC: More rich\n", "question": "Compared to the first image, what is the color vividness of the second image?", "context": "Candidates: A. More monotonous B. About the same C. More rich", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_191_0.JPG", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_191_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the first image sharper than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_192_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_192_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: the yellow doll in the first image\nB: the street lamp in the second image\nC: the wall in the first image\nD: the vehicle in the second image\n", "question": "Which part below is most affected by overexposure?", "context": "Candidates: A. the yellow doll in the first image B. the street lamp in the second image C. the wall in the first image D. 
the vehicle in the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_193_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_193_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the detail texture of the second image clearer than the first image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_194_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_194_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the first image more realistic than the second image?", "context": "Candidates: A. No B. Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_195_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_195_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the color of the first image richer than the second image?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_196_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_196_1.JPG"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: Yes\nB: No\n", "question": "Is the first image more authentic than the second image?", "context": "Candidates: A. Yes B. No", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_197_0.JPG", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_197_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: worse\nB: better\nC: similar\n", "question": "How does the lighting of the second image compare to the first image?", "context": "Candidates: A. worse B. better C. similar", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_198_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_198_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_q_bench+", "visual_input_component": "natural image", "source": "q bench+", "options": "A: No\nB: Yes\n", "question": "Is the focus of the first image better than the second image?", "context": "Candidates: A. No B. 
Yes", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_199_0.jpg", "./Low-level-semantic/visual_quality_assessment_q_bench+/visual_quality_assessment_q_bench+_199_1.jpg"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/visual_quality_assessment_ve_lol_l/qwen3-vl/metadata_info.json b/results/visual_quality_assessment_ve_lol_l/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..2898e05 --- /dev/null +++ b/results/visual_quality_assessment_ve_lol_l/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_0_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_0_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_1_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_1_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_2_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_2_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_3_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_3_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_4_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_4_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_5_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_5_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_6_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_6_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_7_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_7_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_8_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_8_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_9_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_9_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_10_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_10_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_11_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_11_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_12_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_12_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_13_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_13_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_14_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_14_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_15_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_15_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_16_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_16_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_17_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_17_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_18_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_18_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_19_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_19_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_20_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_20_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_21_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_21_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_22_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_22_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_23_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_23_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_24_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_24_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_25_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_25_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_26_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_26_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_27_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_27_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_28_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_28_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_29_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_29_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_30_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_30_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_31_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_31_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_32_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_32_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_33_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_33_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_34_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_34_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_35_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_35_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_36_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_36_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_37_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_37_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_38_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_38_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_39_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_39_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_40_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_40_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_41_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_41_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_42_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_42_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_43_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_43_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_44_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_44_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_45_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_45_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_46_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_46_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_47_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_47_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_48_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_48_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_49_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_49_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_50_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_50_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_51_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_51_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_52_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_52_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_53_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_53_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_54_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_54_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_55_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_55_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_56_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_56_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_57_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_57_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_58_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_58_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_59_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_59_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_60_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_60_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_61_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_61_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_62_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_62_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_63_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_63_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_64_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_64_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_65_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_65_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_66_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_66_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_67_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_67_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_68_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_68_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_69_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_69_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_70_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_70_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_71_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_71_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_72_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_72_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_73_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_73_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_74_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_74_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_75_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_75_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_76_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_76_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_77_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_77_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_78_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_78_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_79_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_79_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_80_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_80_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_81_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_81_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_82_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_82_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_83_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_83_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_84_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_84_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_85_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_85_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_86_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_86_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_87_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_87_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_88_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_88_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_89_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_89_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_90_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_90_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_91_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_91_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_92_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_92_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_93_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_93_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_94_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_94_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_95_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_95_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_96_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_96_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_97_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_97_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_98_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_98_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_99_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_99_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_100_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_100_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_101_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_101_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_102_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_102_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_103_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_103_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_104_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_104_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_105_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_105_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_106_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_106_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_107_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_107_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_108_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_108_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_109_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_109_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_110_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_110_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_111_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_111_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_112_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_112_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_113_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_113_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_114_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_114_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_115_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_115_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_116_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_116_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_117_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_117_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_118_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_118_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_119_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_119_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_120_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_120_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_121_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_121_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_122_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_122_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_123_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_123_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_124_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_124_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_125_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_125_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_126_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_126_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_127_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_127_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_128_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_128_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_129_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_129_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_130_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_130_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_131_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_131_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_132_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_132_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_133_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_133_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_134_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_134_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_135_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_135_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_136_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_136_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_137_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_137_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_138_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_138_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_139_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_139_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_140_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_140_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_141_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_141_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_142_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_142_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_143_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_143_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_144_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_144_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_145_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_145_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_146_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_146_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_147_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_147_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_148_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_148_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_149_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_149_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_150_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_150_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_151_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_151_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_152_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_152_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_153_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_153_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_154_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_154_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_155_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_155_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_156_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_156_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_157_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_157_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_158_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_158_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_159_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_159_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_160_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_160_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_161_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_161_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_162_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_162_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_163_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_163_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_164_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_164_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_165_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_165_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_166_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_166_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_167_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_167_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_168_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_168_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_169_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_169_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_170_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_170_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_171_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_171_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_172_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_172_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_173_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_173_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_174_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_174_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_175_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_175_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_176_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_176_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_177_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_177_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_178_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_178_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_179_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_179_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_180_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_180_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_181_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_181_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_182_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_182_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_183_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_183_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_184_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_184_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_185_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_185_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_186_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_186_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_187_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_187_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_188_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_188_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_189_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_189_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_190_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_190_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_191_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_191_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_192_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_192_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_193_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_193_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_194_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_194_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_195_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_195_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_196_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_196_1.png"], "output": "B", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_197_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_197_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a lower brightness?", "context": "Candidates: A. the first image B. the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_198_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_198_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visual_quality_assessment_ve_lol_l", "visual_input_component": "natural image", "source": "ve_lol_l", "options": "A: the first image\nB: the second image\n", "question": "Which image has a higher brightness?", "context": "Candidates: A. the first image B. 
the second image", "input_image_path": ["./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_199_0.png", "./Low-level-semantic/visual_quality_assessment_ve_lol_l/visual_quality_assessment_ve_lol_l_199_1.png"], "output": "B", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/visually_grounded_reasoning_marvl/qwen3-vl/metadata_info.json b/results/visually_grounded_reasoning_marvl/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..a20eaf9 --- /dev/null +++ b/results/visually_grounded_reasoning_marvl/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The spoon on the left is made of porcelain, the spoon on the right is made of stainless steel. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_0_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_0_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "At least one picture has fireworks in it. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_1_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_1_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture has exactly three spoons, while the left picture has no more than two. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_2_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_2_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The picture on the left shows potatoes that have not yet been cooked, while the potatoes on the right have already been fried into fries. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_3_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_3_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures only has dark green broccoli, while the other picture has both white and dark green broccoli. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_4_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_4_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one of the pictures, there is exactly one mouse, while in another picture, there are at least three or more. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_5_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_5_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both pictures contain longan, and there are people in the right picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_6_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_6_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, there is exactly one prominent pavilion, and on the pavilion in the left picture, there is a red plaque. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_7_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_7_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is no one in the living room in the left picture, while in the right picture, someone is watching TV. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_8_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_8_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture has exactly one cow, while the left picture has more than one cow and they are light-colored. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_9_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_9_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is only one egret in each of the two pictures. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_10_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_10_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, there is exactly one person using a plow to work with a cow. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_11_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_11_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, there is at least one sickle. In one of the pictures, the sickle is placed on a backpack or on the grass. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_12_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_12_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures has exactly two Peking Opera actors, while the other picture has at least four actors. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_13_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_13_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has Buddhist-related statues, while the right picture has people conducting ceremonies. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_14_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_14_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has hanging paper cuttings, while the right picture has red paper cuttings. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_15_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_15_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has hanging paper cuttings, while the right picture has red paper cuttings. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_16_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_16_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, there is exactly one panda, and neither of them is moving. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_17_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_17_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The lotus flowers in both pictures are blooming, and there is exactly one purplish-red lotus flower in the left picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_18_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_18_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture has at least two dark-colored cows, while the left picture has one or more cows. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_19_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_19_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The porridge in the right picture has added fruit, and there is no spoon in the porridge in the left picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_20_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_20_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, there is exactly one sparrow, while in the right picture, there are exactly two. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_21_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_21_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has fried rice, while the right picture does not have fried rice or similar dishes. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_22_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_22_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The T-shirt in the right picture is pure white, while the T-shirt in the left picture has Chinese characters on it. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_23_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_23_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both pictures contain cut open cantaloupes. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_24_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_24_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "At least one picture has milk placed in the refrigerator. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_25_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_25_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, there is exactly one woman playing the guzheng, while in the right picture, there are some guzhengs but no one. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_26_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_26_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, there are chicks that have not yet grown up, and in the right picture, there are at least three or more chickens. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_27_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_27_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture includes not only willow trees but also a lake, and the willow leaves in the left picture are a bit dark in color. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_28_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_28_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has a lot of dumplings, the right picture is some pan-fried dumplings. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_29_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_29_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, there is exactly one hummingbird, and they are both flying in the same direction. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_30_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_30_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, there is only one person wearing a suit, either a man or a woman, and the one in the right picture is a man. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_31_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_31_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has several blooming chrysanthemums, the chrysanthemum petals in the right picture are yellowish or orange. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_32_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_32_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The crows in both pictures are not flying with their wings spread, and at least one of the pictures has five crows. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_33_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_33_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture only has one obvious fish, while the right picture has many swimming fish. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_34_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_34_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, there is exactly one hummingbird, and they are both flying in the same direction. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_35_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_35_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The noodles in both pictures are placed in plates or bowls, and there are also chopsticks in the left picture. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_36_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_36_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one of the pictures, there is a ginkgo tree and a deep green lawn, while in the other picture, there is a ginkgo tree but the ground is not visible. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_37_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_37_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "On the left, there are several lilies planted together, and on the other side, there are two lilies planted together. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_38_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_38_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The lilies in the right picture are not just white, the lilies in the left picture are blooming. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_39_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_39_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures features both fish and non-fish animals together. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_40_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_40_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture contains lychees and an adult, while the right picture shows lychees and a price tag. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_41_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_41_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures has exactly one pair of chopsticks placed on the bowl, while the other picture has more than one pair of light-colored chopsticks. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_42_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_42_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are two adult men in suits in the left picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_43_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_43_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture shows a hand holding a ballpoint pen, while the left picture only has a ballpoint pen. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_44_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_44_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right picture, there is exactly one coffee cup, and in the left picture, the coffee cup is placed on a saucer. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_45_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_45_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right picture, besides broccoli, there are also other red fruits and vegetables, but the left picture only has broccoli. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_46_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_46_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has Buddhist-related statues, while the right picture has people conducting ceremonies. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_47_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_47_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is exactly one football in each of the two pictures, and there are exactly two football players in one of the pictures. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_48_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_48_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both pictures contain villas, but no cars or people. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_49_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_49_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has exactly two Peking Opera actors, while the right picture has at least five. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_50_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_50_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The person in the left picture is wearing a suit and holding a computer or bag, while the person in the right picture is also wearing a suit but is not holding any noticeable items. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_51_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_51_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Among the two pictures of dumplings, only one is definitely placed on a square plate. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_52_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_52_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one of the pictures, there are two people in the farmland, while in another picture, there is exactly one person working with a hoe. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_53_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_53_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture only has a blackboard with no people, while the left picture has both a blackboard and people. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_54_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_54_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The peony flower in the left picture occupies at least half of the area, while the peony flower in the right picture does not. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_55_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_55_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one picture, you can only see one person playing table tennis, while in another picture, you can see more than one person. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_56_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_56_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture precisely has a child with calligraphy, while the left picture is of a person writing calligraphy. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_57_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_57_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right picture, you can see a prominent main pine tree, while in the left picture, the pine tree has the sky as its background. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_58_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_58_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "On the left is a separate bowl of porridge, and in the porridge on the right there is a spoon. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_59_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_59_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures shows someone interacting with a hummingbird, while another picture shows a hummingbird trying to eat something red. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_60_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_60_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has people and paper cuttings, while the right picture contains paper cuttings of the Chinese character for \"spring\". Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_61_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_61_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are at least two or more people in the living room in both pictures. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_62_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_62_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left image, you can see drums with eight or more sides, while the drum surfaces in the right image are gold or brown. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_63_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_63_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right picture, there is only one blooming rose-colored orchid, while in the left picture, there are several blooming orchids. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_64_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_64_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right picture, there is exactly one hummingbird facing right, and in the left picture, there is exactly one hummingbird facing left. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_65_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_65_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, you can see many obvious coffee beans, while the right picture has a coffee cup and pot. 
Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_66_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_66_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one of the pictures, there is a clear and complete rose, while the other picture does not have one. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_67_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_67_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one picture, the porridge is yellow, while in another picture, it is either white porridge or purple rice porridge. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_68_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_68_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture has green apples, while the left picture has red apples. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_69_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_69_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures contains an image of firecrackers, while the other picture has wine or a wine glass. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_70_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_70_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures has a deep blue sky behind the pine tree, while the other picture only has the pine tree. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_71_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_71_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, there are a farmer, a plow, and a cow, while the right picture only has a plow. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_72_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_72_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right picture, you can see decorations with the Chinese character for \"luck\", while the left picture shows a street during Chinese New Year. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_73_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_73_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has Terracotta Warriors and people, while the right picture only has Terracotta Warriors. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_74_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_74_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "On the left, a boy is playing the erhu, and on the right, a girl is playing the erhu. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_75_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_75_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the cups of coffee is regular coffee without any distinct pattern, while the other one is coffee with latte art visible. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_76_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_76_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, you can see tea in the teacup, but in the left picture, there is only a teacup and no teapot. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_77_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_77_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures presents a courtyard house from a bird's-eye view, while the other picture features a courtyard house and green trees. 
Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_78_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_78_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures is of a newborn puppy, and another picture features a spotted dog. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_79_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_79_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, there is a statue of Buddha, while in the right picture, there are some people wearing Buddhist attire. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_80_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_80_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both bubble teas in the two pictures have pearls added, and there is exactly one cup in the left picture. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_81_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_81_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The total number of cows in the two pictures exceeds five. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_82_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_82_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, there are people wearing red cheongsams, and in one picture, there is more than one person. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_83_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_83_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures contains an image of firecrackers, while the other picture has wine or a wine glass. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_84_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_84_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has Terracotta Warriors and people, while the right picture only has Terracotta Warriors. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_85_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_85_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both pictures contain Fujian Tulou, and one of them happens to have only one dome-shaped Fujian Tulou. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_86_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_86_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture has green apples, while the left picture has red apples. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_87_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_87_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is no one in the living room in the left picture, while in the right picture, there are people watching TV. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_88_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_88_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one picture, the birch tree is clearly sawed off, while in the other it is not. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_89_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_89_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one picture, a yellow note appeared and another one is about the Qingming Festival activities. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_90_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_90_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture is a shadow puppet master, and the right picture is a shadow puppet stage. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_91_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_91_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right picture, you can see at least one blue or yellow orchid, while the color of the orchid in the left picture is lighter. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_92_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_92_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right picture, you can see a prominent pine tree, and in the left picture, the pine tree has the sky as a background. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_93_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_93_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The lotus flowers in both pictures are blooming, and there is exactly one purplish-red lotus flower in the left picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_94_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_94_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture has exactly one very obvious light-colored lotus, while the left picture has two obvious purple lotuses. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_95_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_95_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures shows fish and non-fish animals together. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_96_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_96_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is no one in the dining room in both pictures. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_97_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_97_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The picture on the left is a single guzheng, and the picture on the right is a guzheng sold in a music store. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_98_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_98_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, there are women wearing cheongsams, and there are at least three of them. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_99_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_99_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one picture, the birch tree is clearly sawed off, while in the other it is not. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_100_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_100_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures only has at most two prominent chrysanthemums, while the other picture has a lot of chrysanthemums. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_101_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_101_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both pictures contain stilt houses, and the left picture includes red lantern decorations. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_102_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_102_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures only has dark green broccoli, while the other picture has both white and dark green broccoli. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_103_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_103_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "On the left is a separate bowl of porridge, and in the porridge on the right, there is a spoon. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_104_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_104_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures only has at most two prominent chrysanthemums, while the other picture has a lot of chrysanthemums. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_105_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_105_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "On the left, a boy is playing the erhu, and on the right, a girl is playing the erhu. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_106_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_106_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is exactly one cup of milk tea in each of the two pictures. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_107_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_107_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both pictures contain a rake, and there are absolutely no parts of people or feet. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_108_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_108_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The picture on the left contains lanterns used during the Mid-Autumn Festival, and the picture on the right is of mooncakes eaten during the Mid-Autumn Festival. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_109_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_109_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, there is exactly one little egret, and the one in the right picture is not flying. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_110_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_110_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, there are exactly three people wearing similar Tang suits, while in the right picture, there are only one or two people. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_111_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_111_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The bok choy in one picture has already been stir-fried and served, while the one in the other picture has not. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_112_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_112_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the bedroom of the left picture, there are two people, while in the bedroom of the right picture, there is no one. 
Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_113_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_113_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right picture, you can see a prominent pine tree, and in the left picture, the pine tree has the sky as its background. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_114_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_114_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The noodles in both pictures are placed in plates or bowls, and there are also chopsticks in the left picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_115_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_115_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one picture, the raw meat slices occupy a large area, while in the other picture, there are no obvious raw meat slices. 
Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_116_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_116_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures shows someone interacting with a hummingbird, while the other picture is of a hummingbird attempting to eat something red. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_117_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_117_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right picture, there are many heart-shaped decorations, while the left picture has bear toys or lights. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_118_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_118_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "At least one of the two pictures contains a picture of a Mandarin Duck Pot. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_119_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_119_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both pictures are related to the bathroom, and there is a child in the right picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_120_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_120_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both of the two pictures contain at least five whole carrots, and they have not been cooked or juiced yet. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_121_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_121_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, there is exactly one woman playing the guzheng, while in the right picture, there are some guzhengs but no one. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_122_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_122_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both pictures contain stilt houses, and the picture on the left includes red lantern decorations. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_123_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_123_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has people and paper cuttings, while the right picture contains paper cuttings of the Chinese character for \"spring\". Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_124_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_124_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are whole, uncut cantaloupes in both pictures. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_125_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_125_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The picture on the left is of exactly one person training to swim, while the other one is of multiple people swimming in a swimming pool. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_126_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_126_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, there are exactly three people wearing Tang suits, while the people in the right picture are wearing blue or red Tang suits. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_127_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_127_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, there is only one brush, while in the right picture, there are several. 
Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_128_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_128_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture has a hand holding a ballpoint pen, while the left picture only has a ballpoint pen. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_129_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_129_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture has green apples, while the left picture has red apples. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_130_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_130_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures shows a lion dance performance during the Spring Festival, and another picture shows the character \"Fu\" hanging. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_131_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_131_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures is of a market street during the Spring Festival, and the other picture is of a Spring Festival couplet with the character \"Fu\" written on it. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_132_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_132_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right picture, there is exactly one woman playing the erhu, while in the left picture, the person playing the erhu seems to be performing. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_133_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_133_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, there is exactly one woman playing the pipa, while in the right picture, there are two or more children playing the pipa. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_134_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_134_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "At least one picture has a chick; at least one picture shows an adult chicken. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_135_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_135_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one of the pictures, there is a clear and complete rose, while the other picture does not have one. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_136_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_136_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both of the two pictures contain at least five whole carrots, and they have not been cooked or juiced yet. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_137_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_137_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right picture, there is exactly one person running, while in the left picture, there are at least two people. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_138_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_138_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The lilies in the right picture are not just white, the lilies in the left picture are blooming. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_139_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_139_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, you can see at least one real person (not a sculpture) playing the suona. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_140_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_140_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is no one in the bedroom in the left picture, while there is someone in the bedroom in the right picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_141_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_141_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Adding the two pictures together, there are at least five bottles of cola, and most of them are stored in the refrigerator or in cardboard packaging boxes. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_142_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_142_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has exactly one dog, while the right picture has exactly two. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_143_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_143_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both pictures clearly show green leaves and the trunk of the plane tree, not just the trunk or leaves. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_144_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_144_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one of the pictures, there is only an empty bowl, while in the other picture, the bowl is filled with things. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_145_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_145_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one picture, there is a blackboard but no people, while in the other picture, there is both a blackboard and people. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_146_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_146_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, there is a statue of Buddha, while in the right picture, there are some people wearing Buddhist robes. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_147_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_147_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both bubble teas in the two pictures have pearls added, and there is exactly one cup in the left picture. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_148_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_148_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right picture, you can see at least one blue or yellow orchid, while the color of the orchid in the left picture is lighter. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_149_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_149_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one picture, there is a blackboard but no people, while in the other picture, there is both a blackboard and people. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_150_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_150_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right picture, you can see decorations with the Chinese character for \"luck\", while the left picture shows a street during Chinese New Year. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_151_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_151_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The crows in both pictures are not flying with their wings spread, and at least one of the pictures has at least five crows. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_152_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_152_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one picture, the porridge is yellow, while in another picture, it is either white porridge or purple rice porridge. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_153_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_153_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has exactly one brush, while the right picture has several brushes. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_154_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_154_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures shows a large soup spoon serving something or placed in the soup, while the other picture does not. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_155_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_155_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, there are exactly two people in the dining room, while in the right picture, the dining room is empty. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_156_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_156_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one of the pictures, there is a person playing the guzheng, while in another picture, there is at least one guzheng, but no one is playing it. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_157_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_157_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right picture, besides the willow trees, you can also clearly see the greenery. 
However, there is no greenery in the left picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_158_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_158_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one picture, there is a hand holding scissors, while the other picture does not have this. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_159_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_159_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both pictures contain chalk but there are absolutely no people. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_160_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_160_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, you can see tea in the teacup, but in the left picture, there is only a teacup and no teapot. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_161_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_161_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, there is exactly one giant panda, and the giant panda in the right picture is not active. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_162_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_162_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both bubble teas in the two pictures have pearls added, and there is exactly one cup in the left picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_163_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_163_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both pictures feature the scene of a quadrangle courtyard, and one of them is an overhead view. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_164_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_164_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "At least one picture contains a chick; at least one picture shows an adult chicken. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_165_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_165_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, there is one or two crows with their wings folded, and they are not spreading their wings or flying. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_166_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_166_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one of the pictures, there are two people playing football together, and in another picture, there is one person playing football. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_167_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_167_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures is of a market street during the Spring Festival, and the other picture is of a Spring Festival couplet with the character \"Fu\" written on it. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_168_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_168_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has exactly two Peking Opera actors, while the right picture has at least five. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_169_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_169_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one of the pictures, there is a ginkgo tree and a deep green lawn, while in the other picture, there is a ginkgo tree but the ground is not visible. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_170_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_170_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the picture on the right, there are chopsticks, and another picture is of noodles in a white bowl. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_171_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_171_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, there is more than one chicken, and the chickens in the left picture have different feather colors. 
Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_172_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_172_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture includes not only willow trees but also a lake, and the leaves of the willow tree in the left picture are a darker shade. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_173_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_173_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one picture, there is exactly one pair of chopsticks placed on the bowl, while in the other picture, there is more than one pair of chopsticks. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_174_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_174_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The person in one of the pictures is serving the ball. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_175_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_175_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, there is at least one sickle. In one of the pictures, the sickle is placed on a backpack or on the grass. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_176_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_176_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both pictures contain apples, and there is no one in the right picture. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_177_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_177_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, there are several fish placed in a pile of ice cubes, while the right picture has many long, slender fish. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_178_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_178_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, you can see many obvious coffee beans, while the right picture has a coffee cup and pot. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_179_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_179_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture is a shadow puppet master, and the right picture is a shadow puppet stage. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_180_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_180_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one of the pictures, the white rice is placed on a plate without any side dishes, while the rice in the other picture comes with side dishes and green vegetables. 
Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_181_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_181_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left picture, there is a statue of Buddha, while in the right picture, there are some people wearing Buddhist attire. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_182_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_182_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the two pictures, only one picture has a knife on the cutting board, while the other picture has food. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_183_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_183_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has exactly one dog, while the right picture has exactly two. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_184_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_184_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The picture on the right has exactly three spoons, while the one on the left has no more than two. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_185_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_185_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "At least one picture has milk placed in the refrigerator. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_186_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_186_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One picture shows a teapot, while another shows tea being poured into a cup. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_187_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_187_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, there is only one person blowing a suona, and the person in the right picture is facing to the left. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_188_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_188_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture has exactly two white lilies, while the left picture has exactly one obvious lily. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_189_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_189_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture shows exactly one child with calligraphy, while the left picture is of a person writing calligraphy. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_190_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_190_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture shows exactly one egret with its wings spread, while the egret in the left picture is not flying. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_191_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_191_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left picture has a Coca-Cola can, while the Coca-Cola in the right picture is bottled. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_192_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_192_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both pictures contain a rake, and both pictures are completely devoid of people. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_193_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_193_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The ginkgo tree in the right picture is dark green, while the one in the left picture is brilliant colored. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_194_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_194_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both pictures, you can see at least one real person (not a sculpture) playing the suona. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_195_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_195_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture has at least two dark-colored cows, while the left picture has one or more cows. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_196_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_196_1.jpg"], "output": "B", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right picture has five crows, while the left picture only has two. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_197_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_197_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The scissors in the right picture are being used. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_198_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_198_1.jpg"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_marvl", "visual_input_component": "natural image", "source": "marvl", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The bowl in the left picture has chopsticks on it, while the bowl in the right picture does not have chopsticks or a spoon. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_199_0.jpg", "./High-level-obj-semantic/visually_grounded_reasoning_marvl/visually_grounded_reasoning_marvl_199_1.jpg"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file diff --git a/results/visually_grounded_reasoning_nlvr2/qwen3-vl/metadata_info.json b/results/visually_grounded_reasoning_nlvr2/qwen3-vl/metadata_info.json new file mode 100644 index 0000000..b6d2956 --- /dev/null +++ b/results/visually_grounded_reasoning_nlvr2/qwen3-vl/metadata_info.json @@ -0,0 +1 @@ +[{"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are three dogs. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_0_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_0_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "An image contains a sitting baboon who is holding a roundish, yellowish fruit. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_1_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_1_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are two jellyfish and they both appear to have long tails trailing below them. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_2_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_2_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "All the cars are convertible and red. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_3_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_3_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The carts have only single riders on them. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_4_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_4_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image features a multi-door scene with one tree and floating pink petal shapes, and the other image features a multi-door scene with a tree on each side. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_5_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_5_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "A man is riding between two animals. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_6_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_6_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The two dogs' bodies are pointing in opposite directions from each other. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_7_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_7_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the images contains at least three graduates with gold-colored sashes around their necks. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_8_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_8_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are at least 2 animals in the left image. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_9_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_9_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "An image contains two dingoes. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_10_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_10_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "An image shows a baby chimp and baby gorilla sitting side by side and interacting. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_11_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_11_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "An arched doorway sits under the stairway in one of the images. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_12_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_12_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are two dogs that are staring straight ahead. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_13_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_13_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Each image contains an old-fashioned TV with controls on the right of its screen, and no TV has a lit screen or picture displayed on the screen. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_14_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_14_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "An image contains a single chimp, which is eating something nut-like and holding more food in its hand. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_15_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_15_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "A dog is sitting on the grass in the image on the left. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_16_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_16_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is a dog on a pool raft in the left image. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_17_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_17_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are two ibex in the left image. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_18_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_18_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One set of lips is not glossy. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_19_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_19_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the dogs is on wood. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_20_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_20_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both staircases have vertical post designed railings. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_21_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_21_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The dumbbells in the image on the right are shown in a variety of colors. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_22_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_22_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One pizza is in a box. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_23_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_23_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is a single dog in the right image and it is wearing a red collar. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_24_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_24_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right image contains exactly three dogs. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_25_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_25_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are at least two rodents in the right image. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_26_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_26_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the left image, a person is lifting a free weight. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_27_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_27_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In at least one image there is a saxophone with keys that are a different color from the base instrument. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_28_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_28_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "All the instruments are standing on their ends. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_29_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_29_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are fewer than twenty golf balls. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_30_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_30_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Multiple people are riding in a two wheeled cart pulled along a dirt path by one water buffalo. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_31_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_31_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left image contains exactly two dispensers. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_32_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_32_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right image contains two dogs wearing life vests. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_33_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_33_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image includes closed multi-compartment zipper cases shown in six solid-color options. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_34_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_34_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are two dogs on the left image, and three dogs on the right. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_35_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_35_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "All the jellyfish have long tentacles. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_36_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_36_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One elephant has long tusks. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_37_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_37_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "A man is sitting. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_38_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_38_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "All of the cheetahs are eating. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_39_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_39_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the crashed buses has at least two tires up in the air. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_40_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_40_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are more desserts in the image on the left. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_41_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_41_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are exactly two beakers in the right image. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_42_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_42_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Each image shows one vase with an open top, a short base, a tear-drop shape, and no handles. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_43_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_43_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "A cartoon cat appears once in each image, and the left image features a cartoon cat posed sitting with face forward and leg to the right. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_44_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_44_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are only two adult skunks. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_45_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_45_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is a stairway with an arched doorway under the stairs Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_46_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_46_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In at least one image there is a flag and flagpole in front of a monastery Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_47_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_47_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image features exactly two side-by-side black-and-white dogs, and the other features one reclining tri-colored dog with both front paws extended forward. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_48_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_48_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In at least one image there is a brown rectangle tv with two silver knobs on it. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_49_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_49_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "An image shows a group of safety pins arranged in the shape of a flower. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_50_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_50_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right image contains a vertical stack of two pillows. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_51_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_51_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "All kneepads are modeled by a human body, and at least one image shows only one kneepad and one bare knee. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_52_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_52_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In at least one image someone is using a kitchen utensil. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_53_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_53_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image features a squarish light-colored building with a tiered green roof and columns in front of an arch doorway. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_54_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_54_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "All dogs are posed on some outdoor structure made of wood and are gazing generally toward the camera. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_55_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_55_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the images shows two guinea pigs diving into a pool. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_56_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_56_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image shows no more than five zebras running and kicking up dust, and the other image shows a large herd of zebras running and splashing across a wet green field. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_57_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_57_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "An image shows a group of three pet rodents in a container, and all share the same fur coloration Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_58_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_58_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are exactly four birds perched on a branch in the pair of images. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_59_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_59_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left and right image contains the same number of horses pulling a cart. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_60_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_60_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Each image appears to contain only zebra-type animals, and in at least one image, the zebras are massed together so its hard to distinguish individual animals. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_61_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_61_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right image shows several dinosaur shaped balloons hung in a room with a beige sofa and a TV hanging on the wall. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_62_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_62_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one of the images you can see something that is not a towel. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_63_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_63_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In at least one image there is only a single zebra with a closed mouth look straight forward. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_64_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_64_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image contains no more than two short-haired guinea pigs posed on a blue surface, and the other image shows a single long-haired brown-and-white guinea pig. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_65_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_65_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In at least one image there is a total of three women in bikinis with at least one holding a drink. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_66_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_66_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "there are 7 pencil puches in the image pair Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_67_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_67_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "An image features a model in a pink bikini standing with her arms over her head. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_68_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_68_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left and right image contains the same number of black weights. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_69_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_69_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Two identical dining tables, each with chairs arranged for seating at least four, are placed side by side and are empty except for a centerpiece on each. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_70_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_70_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "A person can be seen holding more than one puppy. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_71_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_71_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The combined images include at least one two-wheeled cart with a wagon bed on it, exactly one man, and exactly one horse. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_72_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_72_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "An image shows a hamster clutching a snack while posed with its hind legs raised off the ground and at least one pink sole showing. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_73_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_73_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both trains are headed diagonally down towards the left. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_74_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_74_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "All of the dogs are standing. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_75_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_75_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "At least one of the bottles has a kind of loop on the lid, and the bottles on the left and right are different styles. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_76_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_76_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures includes a patch of brown rock. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_77_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_77_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Each image contains a wolf, and one image shows a black wolf and a dark doberman in a face-to-face confrontation. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_78_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_78_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "At least one cheetah is near a pool of water, and two cheetahs have their heads lowered. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_79_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_79_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right image shows one panda posed on its back with at least one front paw raised and mouth open. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_80_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_80_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are two glass becker. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_81_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_81_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image shows three side-by-side gray-and-white husky puppies in upright sitting poses, and all dogs in both images are puppies. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_82_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_82_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right image contains two black beetles. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_83_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_83_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are over a dozen pictures of women with lipstick. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_84_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_84_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the image on the right there are exactly 5 pillows. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_85_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_85_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Each image contains a dog with black spots on white fur, and the large spotted dog is in a reclining pose near a french bulldog in one image. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_86_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_86_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The carts have only single riders on them. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_87_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_87_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Both beds have round top drapes. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_88_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_88_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The image on the left shows a single white dog being fed something. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_89_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_89_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image shows a stack of two pillows with pointed corners, and the other image shows flat-edged pillows, with one pillow leaning against a pillow that is lying flat. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_90_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_90_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "A tiny skunk with a thin white stripe down its forehead is sleeping on the side of its head. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_91_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_91_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are six or more vending machines that have food or drinks. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_92_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_92_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "All horned animal figures are standing facing rightward, and each image contains just one horned animal form. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_93_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_93_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In at least one image there is a dark brown staircase end facing left with dark brown railing. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_94_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_94_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image contains a trio of black pugs out of costume, and the other image includes no uncostumed dogs and includes at least one dog wearing a fur hood. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_95_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_95_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left image shows a stack of at least four round patties topped with a dollop of white cream and sprinkled with green ring shapes, all on a white dish. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_96_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_96_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Each image shows a corgi dog in a sitting pose, and the dog on the right has an open mouth while the dog on the left does not. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_97_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_97_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left and right image contains the same number empty clear glass soap dispenser. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_98_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_98_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "At least one case is pinkish and depicts the Eiffel tower on its front. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_99_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_99_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is an ibex in a wooded area with trees behind it Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_100_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_100_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "A skin product in one image is in a short fat beige jar with a brown lid the same width as the jar. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_101_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_101_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "An image shows at least six faces modeling lipstick, with eyes visible and all images displayed in side-by-side pairs. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_102_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_102_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are exactly two beakers in the right image. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_103_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_103_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Some penguins are swimming in water. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_104_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_104_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Each image shows one white dog with an open mouth, but the dog depicted in the left image also has its eyes shut. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_105_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_105_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "An image shows a workout with only women holding a weight in each hand raised in front of their bodies. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_106_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_106_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The dog in the image on the right has its mouth open. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_107_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_107_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are exactly two empty containers. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_108_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_108_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Each image includes three zebras posed in a row with their bodies parallel to one another. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_109_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_109_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is only one guinea pig in each of the images. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_110_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_110_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Each image includes a bus with a non-flat front in the foreground, and multiple buses are visible in each image. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_111_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_111_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one of the images, a dog is sleeping on their back in a belly up position. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_112_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_112_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The girl in the left image is blonde. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_113_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_113_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is fruit salad in a white bowl. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_114_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_114_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "People are posed and visible, including torsos and some legs. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_115_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_115_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In exactly one image there are sliced kiwis in a dessert bowl. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_116_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_116_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the image to the right, you can see the person's fingers. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_117_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_117_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "At least 2 giant safety pins are hanging next to a sign that has the word Laundry on it. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_118_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_118_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "A dog is laying on its back Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_119_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_119_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Two models are standing. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_120_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_120_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left and right image contains the same number of brown bookshelves. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_121_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_121_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "At least some of the zebras in the image on the left are standing on dirt. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_122_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_122_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are no more than five penguins. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_123_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_123_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are four birds in the pair of images. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_124_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_124_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image contains only whole, unpeeled lemons, while the other image contains one lemon cut in half, and at least as many unpeeled lemons as the image with only unpeeled lemons. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_125_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_125_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the nets is pink. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_126_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_126_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The weights in the right image are in use by a man, unlike the weights in the left image. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_127_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_127_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image shows multiple pandas on a structure made of wooden logs, and the other shows two pandas by something that extends from the bottom of the image. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_128_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_128_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "At least one perfume bottle cap has pink flowers. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_129_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_129_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is a bottle of pepper sauce with a gold-colored sealing band. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_130_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_130_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "white painted stripes are painted horizontally on the train engine Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_131_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_131_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "A person has a hand around the neck of a camera-facing pug in the left image, and the right image contains exactly two dogs. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_132_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_132_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are 3 phones on the left and two phones on the right. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_133_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_133_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Human legs model kneepads in both images, and at least one image contains a single kneepad. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_134_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_134_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One dog is black. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_135_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_135_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "A roll of paper towel is in a stainless steel holder. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_136_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_136_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one image, a dog has its mouth open. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_137_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_137_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "At least one image shows a room with clusters of lights suspended from an exposed beam ceiling over rectangular tables and bright orange chairs. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_138_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_138_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image in the pair shows a single pig swimming and the other shows at least two pigs swimming. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_139_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_139_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The image on the right has no more than three jellyfish. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_140_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_140_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Each image contains just one container used for drinking, and the front of at least one of the containers depicts a city skyline. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_141_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_141_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right image contains one chimpanzee that is exposing its teeth. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_142_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_142_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is a dog on a pure white background. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_143_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_143_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The combined images include at least one standing adult wild pig and at least one standing baby piglet with distinctive brown and beige striped fur. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_144_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_144_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In both images a plant is sprouting out of a vase. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_145_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_145_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left and right image contains the same number of mugs. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_146_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_146_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Two pandas are eating. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_147_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_147_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is only one pillow in the right image. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_148_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_148_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is one stand that is both glass top and wider than the TV it is holding. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_149_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_149_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the image to the right, you can see the person's fingers. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_150_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_150_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is only one pillow in the right image. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_151_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_151_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Every safety pin in the images is closed. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_152_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_152_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One flower arrangement is not in a vase. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_153_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_153_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In one of the images you can see something that is not a towel. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_154_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_154_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Loose rolls are sitting on a package of toilet paper. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_155_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_155_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "At least five light-colored dogs are running forward over a field of grass in the left image. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_156_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_156_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The animal in the left image has an open mouth, and the skunk of the right is standing on all fours with its body in profile. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_157_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_157_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Exactly two bowls of mashed potatoes are in round containers, only one with a spoon. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_158_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_158_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are no more than 3 monkeys. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_159_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_159_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are both gray and white section of fur on a single wolf whose body is facing right with their head tilted left forward. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_160_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_160_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In two of the images, the unbaked puffed pastry dough is folded and has finger poke marks on top. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_161_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_161_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image includes closed multi-compartment zipper cases shown in six solid-color options. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_162_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_162_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "there is at least one tube of lipstick with a silver base and gold accents Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_163_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_163_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the images shows a spider-like creature next to a blush or beige colored urchin, while the other image shows a pink urchin without the spider creature. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_164_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_164_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One of the pictures shows a doctor holding a syringe on the right. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_165_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_165_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The golf balls in one of the pictures are arranged in three rows and four columns. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_166_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_166_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "the right side has bananas as dolphins Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_167_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_167_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Each image shows one vase with an open top, a short base, a tear-drop shape, and no handles. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_168_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_168_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The dog on the left has a smiling face, and the dog on the right is baring its teeth. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_169_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_169_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "All of the horses are light brown Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_170_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_170_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left image features a blond girl in a pink tank top that covers her midriff standing in front of at least one person and posing with a red dumbbell in each hand, lifted with the elbow forming a right angle. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_171_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_171_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left and right image contains the same number of horses pulling a cart. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_172_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_172_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left and right image contains a circle and square canopy. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_173_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_173_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Each image shows the bare framework of a yurt-type building under construction, and the right image shows a ladder under the framework. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_174_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_174_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is a total of eight drink bottles. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_175_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_175_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "An entire bracelet is visible in the left image. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_176_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_176_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is at least one dog that has its mouth open. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_177_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_177_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image shows a pair of finger-exposing gloves with a panda face on each glove front, and the other image shows one pair of fir-trimmed hand coverings. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_178_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_178_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The right and left images each show a two-wheeled cart with one female passenger, and each cart is pulled by one horse and headed in the same direction. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_179_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_179_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are only two dogs and both of them have their tails curled over their backs. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_180_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_180_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "A slice is separated. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_181_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_181_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image contains a single whole orange and one half orange. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_182_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_182_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There is a machine in the image on the right near a trash can. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_183_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_183_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "One image features a clear glass with a flat bottom holding water and one variety of flowers in it, and the other image includes at least one pink flower in something with a roundish shape. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_184_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_184_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In each image, a black ring binder notebook is standing on end with open edges to the back, and the binder end visible with a label attached. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_185_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_185_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "A roll of paper towel is in a stainless steel holder. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_186_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_186_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are at most four shoes. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_187_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_187_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In at least one image tehr is a brown mother dog looking after at least one puppy. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_188_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_188_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "In the right image, people in purple attire are lined up in front of a temple. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_189_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_189_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Three blue birds are perched outside. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_190_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_190_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "a pair of warthogs are facing each other with noses touching Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_191_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_191_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The left and right image contains a total of four women in bikinis. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_192_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_192_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are books stacked flat on the floor next to the bookshelves. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_193_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_193_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "The wagons in both pictures are parked in a grassy area. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_194_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_194_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Each image shows exactly one girl, who is wearing matching knitted mittens and cap, her hands pointing up towards her face, and a large pompom on her hat. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_195_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_195_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "There are no more than 3 people in the image on the right. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_196_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_196_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "All of the pictures have at least one dog with a baby. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_197_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_197_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "At least one of the dogs in the image on the right is wearing a Santa hat. Is it true or false? Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_198_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_198_1.png"], "output": "A", "qwen3-vl": "image none"}, {"task": "visually_grounded_reasoning_nlvr2", "visual_input_component": "natural image", "source": "nlvr2", "options": "A: true\nB: false", "question": "Is it true or false?", "context": "Each image contains one dark gray puppy with upright ears sitting on a fabric surface and facing forward with open eyes. Is it true or false? 
Select from the following options.\nA: true\nB: false\n", "input_image_path": ["./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_199_0.png", "./High-level-obj-semantic/visually_grounded_reasoning_nlvr2/visually_grounded_reasoning_nlvr2_199_1.png"], "output": "A", "qwen3-vl": "image none"}] \ No newline at end of file